bookloupe/bookloupe.c
author ali <ali@juiblex.co.uk>
Thu May 30 07:31:24 2013 +0100 (2013-05-30)
changeset 70 aa916da2e452
parent 69 1016349e619f
child 71 82d3cc398b54
permissions -rw-r--r--
Switch to using UTF-8 internally
     1 /*************************************************************************/
     2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
     3 /*									 */
     4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>			 */
     5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>			 */
     6 /*									 */
     7 /* This program is free software; you can redistribute it and/or modify  */
     8 /* it under the terms of the GNU General Public License as published by  */
     9 /* the Free Software Foundation; either version 2 of the License, or     */
    10 /* (at your option) any later version.					 */
    11 /*									 */
    12 /* This program is distributed in the hope that it will be useful,       */
    13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of	 */
    14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the		 */
    15 /* GNU General Public License for more details.				 */
    16 /*									 */
    17 /* You should have received a copy of the GNU General Public License	 */
    18 /* along with this program. If not, see <http://www.gnu.org/licenses/>.	 */
    19 /*************************************************************************/
    20 
    21 #include <stdio.h>
    22 #include <stdlib.h>
    23 #include <string.h>
    24 #include <ctype.h>
    25 #include <glib.h>
    26 #include <bl/bl.h>
    27 
/*
 * NOTE(review): presumably holds the text of the line processed just before
 * the current one; its users are outside this part of the file -- confirm.
 */
gchar *prevline;
    29 
    30 /* Common typos. */
    31 char *typo[] = {
    32     "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
    33     "nad", "te", "ig", "acn",  "ahve", "alot", "anbd", "andt", "awya", "aywa",
    34     "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
    35     "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
    36     "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
    37     "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
    38     "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
    39     "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
    40     "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
    41     "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
    42     "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
    43     "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
    44     "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
    45     "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
    46     "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
    47     "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
    48     "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
    49     "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
    50     "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
    51     "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
    52     "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
    53     "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
    54     "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
    55     "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
    56     "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
    57     "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
    58     "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
    59     "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
    60     "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
    61     "se", ""
    62 };
    63 
/*
 * User-defined typos, keyed by word (the stored value is just a dummy 1).
 * Built by read_user_scannos(); stays NULL when no user typo file has been
 * loaded. main() unrefs it on exit if it is non-NULL.
 */
GTree *usertypo;
    65 
    66 /* Common abbreviations and other OK words not to query as typos. */
    67 char *okword[] = {
    68     "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
    69     "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
    70     "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
    71     "outbid", "outbids", "frostbite", "frostbitten", ""
    72 };
    73 
    74 /* Common abbreviations that cause otherwise unexplained periods. */
    75 char *abbrev[] = {
    76     "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
    77     "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
    78 };
    79 
    80 /*
    81  * Two-Letter combinations that rarely if ever start words,
    82  * but are common scannos or otherwise common letter combinations.
    83  */
    84 char *nostart[] = {
    85     "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
    86 };
    87 
    88 /*
    89  * Two-Letter combinations that rarely if ever end words,
    90  * but are common scannos or otherwise common letter combinations.
    91  */
    92 char *noend[] = {
    93     "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
    94     "sw", "gr", "sl", "cl", "iy", ""
    95 };
    96 
    97 char *markup[] = {
    98     "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
    99     "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
   100     "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
   101     "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
   102 };
   103 
   104 char *DPmarkup[] = {
   105     "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
   106 };
   107 
   108 char *nocomma[] = {
   109     "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
   110     "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
   111     "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
   112     "during", "let", "toward", "among", ""
   113 };
   114 
   115 char *noperiod[] = {
   116     "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
   117     "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
   118     "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
   119     "among", "those", "into", "whom", "having", "thence", ""
   120 }; 
   121 
   122 struct {
   123     char *htmlent;
   124     char *htmlnum;
   125     char *textent;
   126 } entities[] = {
   127     "&amp;",	"&#38;",     "&", 
   128     "&lt;",	"&#60;",     "<",
   129     "&gt;",	"&#62;",     ">",
   130     "&deg;",	"&#176;",    " degrees",
   131     "&pound;",	"&#163;",    "L",
   132     "&quot;",	"&#34;",     "\"", /* quotation mark = APL quote */
   133     "&OElig;",	"&#338;",    "OE", /* latin capital ligature OE */
   134     "&oelig;",	"&#339;",    "oe", /* latin small ligature oe */
   135     "&Scaron;",	"&#352;",    "S", /* latin capital letter S with caron */
   136     "&scaron;",	"&#353;",    "s", /* latin small letter s with caron */
   137     "&Yuml;",	"&#376;",    "Y", /* latin capital letter Y with diaeresis */
   138     "&circ;",	"&#710;",    "",  /* modifier letter circumflex accent */
   139     "&tilde;",	"&#732;",    "~", /* small tilde, U+02DC ISOdia */
   140     "&ensp;",	"&#8194;",   " ", /* en space, U+2002 ISOpub */
   141     "&emsp;",	"&#8195;",   " ", /* em space, U+2003 ISOpub */
   142     "&thinsp;",	"&#8201;",   " ", /* thin space, U+2009 ISOpub */
   143     "&ndash;",	"&#8211;",   "-", /* en dash, U+2013 ISOpub */
   144     "&mdash;",	"&#8212;",   "--", /* em dash, U+2014 ISOpub */
   145     "&rsquo;",	"&#8217;",   "'", /* right single quotation mark */
   146     "&sbquo;",	"&#8218;",   "'", /* single low-9 quotation mark */
   147     "&ldquo;",	"&#8220;",   "\"", /* left double quotation mark */
   148     "&rdquo;",	"&#8221;",   "\"", /* right double quotation mark */
   149     "&bdquo;",	"&#8222;",   "\"", /* double low-9 quotation mark */
   150     "&lsaquo;",	"&#8249;",   "\"", /* single left-pointing angle quotation mark */
   151     "&rsaquo;",	"&#8250;",   "\"", /* single right-pointing angle quotation mark */
   152     "&nbsp;",	"&#160;",    " ", /* no-break space = non-breaking space, */
   153     "&iexcl;",	"&#161;",    "!", /* inverted exclamation mark */
   154     "&cent;",	"&#162;",    "c", /* cent sign */
   155     "&pound;",	"&#163;",    "L", /* pound sign */
   156     "&curren;",	"&#164;",    "$", /* currency sign */
   157     "&yen;",	"&#165;",    "Y", /* yen sign = yuan sign */
   158     "&sect;",	"&#167;",    "--", /* section sign */
   159     "&uml;",	"&#168;",    " ", /* diaeresis = spacing diaeresis */
   160     "&copy;",	"&#169;",    "(C) ", /* copyright sign */
   161     "&ordf;",	"&#170;",    " ", /* feminine ordinal indicator */
   162     "&laquo;",	"&#171;",    "\"", /* left-pointing double angle quotation mark */
   163     "&shy;",	"&#173;",    "-", /* soft hyphen = discretionary hyphen */
   164     "&reg;",	"&#174;",    "(R) ", /* registered sign = registered trade mark sign */
   165     "&macr;",	"&#175;",    " ", /* macron = spacing macron = overline */
   166     "&deg;",	"&#176;",    " degrees", /* degree sign */
   167     "&plusmn;",	"&#177;",    "+-", /* plus-minus sign = plus-or-minus sign */
   168     "&sup2;",	"&#178;",    "2", /* superscript two = superscript digit two */
   169     "&sup3;",	"&#179;",    "3", /* superscript three = superscript digit three */
   170     "&acute;",	"&#180;",    " ", /* acute accent = spacing acute */
   171     "&micro;",	"&#181;",    "m", /* micro sign */
   172     "&para;",	"&#182;",    "--", /* pilcrow sign = paragraph sign */
   173     "&cedil;",	"&#184;",    " ", /* cedilla = spacing cedilla */
   174     "&sup1;",	"&#185;",    "1", /* superscript one = superscript digit one */
   175     "&ordm;",	"&#186;",    " ", /* masculine ordinal indicator */
   176     "&raquo;",	"&#187;",    "\"", /* right-pointing double angle quotation mark */
   177     "&frac14;",	"&#188;",    "1/4", /* vulgar fraction one quarter */
   178     "&frac12;",	"&#189;",    "1/2", /* vulgar fraction one half */
   179     "&frac34;",	"&#190;",    "3/4", /* vulgar fraction three quarters */
   180     "&iquest;",	"&#191;",    "?", /* inverted question mark */
   181     "&Agrave;",	"&#192;",    "A", /* latin capital letter A with grave */
   182     "&Aacute;",	"&#193;",    "A", /* latin capital letter A with acute */
   183     "&Acirc;",	"&#194;",    "A", /* latin capital letter A with circumflex */
   184     "&Atilde;",	"&#195;",    "A", /* latin capital letter A with tilde */
   185     "&Auml;",	"&#196;",    "A", /* latin capital letter A with diaeresis */
   186     "&Aring;",	"&#197;",    "A", /* latin capital letter A with ring above */
   187     "&AElig;",	"&#198;",    "AE", /* latin capital letter AE */
   188     "&Ccedil;",	"&#199;",    "C", /* latin capital letter C with cedilla */
   189     "&Egrave;",	"&#200;",    "E", /* latin capital letter E with grave */
   190     "&Eacute;",	"&#201;",    "E", /* latin capital letter E with acute */
   191     "&Ecirc;",	"&#202;",    "E", /* latin capital letter E with circumflex */
   192     "&Euml;",	"&#203;",    "E", /* latin capital letter E with diaeresis */
   193     "&Igrave;",	"&#204;",    "I", /* latin capital letter I with grave */
   194     "&Iacute;",	"&#205;",    "I", /* latin capital letter I with acute */
   195     "&Icirc;",	"&#206;",    "I", /* latin capital letter I with circumflex */
   196     "&Iuml;",	"&#207;",    "I", /* latin capital letter I with diaeresis */
   197     "&ETH;",	"&#208;",    "E", /* latin capital letter ETH */
   198     "&Ntilde;",	"&#209;",    "N", /* latin capital letter N with tilde */
   199     "&Ograve;",	"&#210;",    "O", /* latin capital letter O with grave */
   200     "&Oacute;",	"&#211;",    "O", /* latin capital letter O with acute */
   201     "&Ocirc;",	"&#212;",    "O", /* latin capital letter O with circumflex */
   202     "&Otilde;",	"&#213;",    "O", /* latin capital letter O with tilde */
   203     "&Ouml;",	"&#214;",    "O", /* latin capital letter O with diaeresis */
   204     "&times;",	"&#215;",    "*", /* multiplication sign */
   205     "&Oslash;",	"&#216;",    "O", /* latin capital letter O with stroke */
   206     "&Ugrave;",	"&#217;",    "U", /* latin capital letter U with grave */
   207     "&Uacute;",	"&#218;",    "U", /* latin capital letter U with acute */
   208     "&Ucirc;",	"&#219;",    "U", /* latin capital letter U with circumflex */
   209     "&Uuml;",	"&#220;",    "U", /* latin capital letter U with diaeresis */
   210     "&Yacute;",	"&#221;",    "Y", /* latin capital letter Y with acute */
   211     "&THORN;",	"&#222;",    "TH", /* latin capital letter THORN */
   212     "&szlig;",	"&#223;",    "sz", /* latin small letter sharp s = ess-zed */
   213     "&agrave;",	"&#224;",    "a", /* latin small letter a with grave */
   214     "&aacute;",	"&#225;",    "a", /* latin small letter a with acute */
   215     "&acirc;",	"&#226;",    "a", /* latin small letter a with circumflex */
   216     "&atilde;",	"&#227;",    "a", /* latin small letter a with tilde */
   217     "&auml;",	"&#228;",    "a", /* latin small letter a with diaeresis */
   218     "&aring;",	"&#229;",    "a", /* latin small letter a with ring above */
   219     "&aelig;",	"&#230;",    "ae", /* latin small letter ae */
   220     "&ccedil;",	"&#231;",    "c", /* latin small letter c with cedilla */
   221     "&egrave;",	"&#232;",    "e", /* latin small letter e with grave */
   222     "&eacute;",	"&#233;",    "e", /* latin small letter e with acute */
   223     "&ecirc;",	"&#234;",    "e", /* latin small letter e with circumflex */
   224     "&euml;",	"&#235;",    "e", /* latin small letter e with diaeresis */
   225     "&igrave;",	"&#236;",    "i", /* latin small letter i with grave */
   226     "&iacute;",	"&#237;",    "i", /* latin small letter i with acute */
   227     "&icirc;",	"&#238;",    "i", /* latin small letter i with circumflex */
   228     "&iuml;",	"&#239;",    "i", /* latin small letter i with diaeresis */
   229     "&eth;",	"&#240;",    "eth", /* latin small letter eth */
   230     "&ntilde;",	"&#241;",    "n", /* latin small letter n with tilde */
   231     "&ograve;",	"&#242;",    "o", /* latin small letter o with grave */
   232     "&oacute;",	"&#243;",    "o", /* latin small letter o with acute */
   233     "&ocirc;",	"&#244;",    "o", /* latin small letter o with circumflex */
   234     "&otilde;",	"&#245;",    "o", /* latin small letter o with tilde */
   235     "&ouml;",	"&#246;",    "o", /* latin small letter o with diaeresis */
   236     "&divide;",	"&#247;",    "/", /* division sign */
   237     "&oslash;",	"&#248;",    "o", /* latin small letter o with stroke */
   238     "&ugrave;",	"&#249;",    "u", /* latin small letter u with grave */
   239     "&uacute;",	"&#250;",    "u", /* latin small letter u with acute */
   240     "&ucirc;",	"&#251;",    "u", /* latin small letter u with circumflex */
   241     "&uuml;",	"&#252;",    "u", /* latin small letter u with diaeresis */
   242     "&yacute;",	"&#253;",    "y", /* latin small letter y with acute */
   243     "&thorn;",	"&#254;",    "th", /* latin small letter thorn */
   244     "&yuml;",	"&#255;",    "y", /* latin small letter y with diaeresis */
   245     "", ""
   246 };
   247 
   248 /* special characters */
   249 #define CHAR_SPACE	  32
   250 #define CHAR_TAB	   9
   251 #define CHAR_LF		  10
   252 #define CHAR_CR		  13
   253 #define CHAR_DQUOTE	  34
   254 #define CHAR_SQUOTE	  39
   255 #define CHAR_OPEN_SQUOTE  96
   256 #define CHAR_TILDE	 126
   257 #define CHAR_ASTERISK	  42
   258 #define CHAR_FORESLASH	  47
   259 #define CHAR_CARAT	  94
   260 
   261 #define CHAR_UNDERSCORE    '_'
   262 #define CHAR_OPEN_CBRACK   '{'
   263 #define CHAR_CLOSE_CBRACK  '}'
   264 #define CHAR_OPEN_RBRACK   '('
   265 #define CHAR_CLOSE_RBRACK  ')'
   266 #define CHAR_OPEN_SBRACK   '['
   267 #define CHAR_CLOSE_SBRACK  ']'
   268 
   269 /* longest and shortest normal PG line lengths */
   270 #define LONGEST_PG_LINE   75
   271 #define WAY_TOO_LONG      80
   272 #define SHORTEST_PG_LINE  55
   273 
   274 enum {
   275     ECHO_SWITCH,
   276     SQUOTE_SWITCH,
   277     TYPO_SWITCH,
   278     QPARA_SWITCH,
   279     PARANOID_SWITCH,
   280     LINE_END_SWITCH,
   281     OVERVIEW_SWITCH,
   282     STDOUT_SWITCH,
   283     HEADER_SWITCH,
   284     WEB_SWITCH,
   285     VERBOSE_SWITCH,
   286     MARKUP_SWITCH,
   287     USERTYPO_SWITCH,
   288     DP_SWITCH,
   289     SWITNO
   290 };
   291 
   292 gboolean pswit[SWITNO];  /* program switches */
   293 
   294 static GOptionEntry options[]={
   295     { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
   296       "Ignore DP-specific markup", NULL },
   297     { "noecho", 'e', 0, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
   298       "Don't echo queried line", NULL },
   299     { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
   300       "Check single quotes", NULL },
   301     { "typo", 't', 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
   302       "Check common typos", NULL },
   303     { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
   304       "Require closure of quotes on every paragraph", NULL },
   305     { "relaxed", 'x', 0, G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
   306       "Disable paranoid querying of everything", NULL },
   307     { "line-end", 'l', 0, G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
   308       "Disable line end checking", NULL },
   309     { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
   310       "Overview: just show counts", NULL },
   311     { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
   312       "Output errors to stdout instead of stderr", NULL },
   313     { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
   314       "Echo header fields", NULL },
   315     { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
   316       "Ignore markup in < >", NULL },
   317     { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
   318       "Use file of user-defined typos", NULL },
   319     { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
   320       "Defaults for use on www upload", NULL },
   321     { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
   322       "Verbose - list everything", NULL },
   323     { NULL }
   324 };
   325 
   326 long cnt_dquot;		/* for overview mode, count of doublequote queries */
   327 long cnt_squot;		/* for overview mode, count of singlequote queries */
   328 long cnt_brack;		/* for overview mode, count of brackets queries */
   329 long cnt_bin;		/* for overview mode, count of non-ASCII queries */
   330 long cnt_odd;		/* for overview mode, count of odd character queries */
   331 long cnt_long;		/* for overview mode, count of long line errors */
   332 long cnt_short;		/* for overview mode, count of short line queries */
   333 long cnt_punct;		/* for overview mode,
   334 			   count of punctuation and spacing queries */
   335 long cnt_dash;		/* for overview mode, count of dash-related queries */
   336 long cnt_word;		/* for overview mode, count of word queries */
   337 long cnt_html;		/* for overview mode, count of html queries */
   338 long cnt_lineend;	/* for overview mode, count of line-end queries */
   339 long cnt_spacend;	/* count of lines with space at end */
   340 long linecnt;		/* count of total lines in the file */
   341 long checked_linecnt;	/* count of lines actually checked */
   342 
   343 void proghelp(GOptionContext *context);
   344 void procfile(const char *);
   345 
   346 gchar *running_from;
   347 
   348 gboolean mixdigit(const char *);
   349 gchar *getaword(const char **);
   350 char *flgets(char **,long);
   351 void postprocess_for_HTML(char *);
   352 char *linehasmarkup(char *);
   353 char *losemarkup(char *);
   354 gboolean tagcomp(const char *,const char *);
   355 char *loseentities(char *);
   356 gboolean isroman(const char *);
   357 void postprocess_for_DP(char *);
   358 
   359 GTree *qword,*qperiod;
   360 
   361 struct first_pass_results {
   362     long firstline,astline;
   363     long footerline,totlen,binlen,alphalen,endquote_count,shortline,dotcomma;
   364     long fslashline,hyphens,longline,verylongline,htmcount,standalone_digit;
   365     long spacedash,emdash,space_emdash,non_PG_space_emdash,PG_space_emdash;
   366     int Dutchcount,Frenchcount;
   367 };
   368 
   369 struct warnings {
   370     int shortline,longline,bin,dash,dotcomma,ast,fslash,digit,hyphen;
   371     int endquote;
   372     gboolean isDutch,isFrench;
   373 };
   374 
   375 struct counters {
   376     long quot;
   377     int c_unders,c_brack,s_brack,r_brack;
   378     int open_single_quote,close_single_quote;
   379 };
   380 
   381 struct line_properties {
   382     unsigned int len,blen;
   383     gunichar start;
   384 };
   385 
   386 struct parities {
   387     int dquote,squote;
   388 };
   389 
   390 struct pending {
   391     char *dquote,*squote,*rbrack,*sbrack,*cbrack,*unders;
   392     long squot;
   393 };
   394 
   395 void parse_options(int *argc,char ***argv)
   396 {
   397     GError *err=NULL;
   398     GOptionContext *context;
   399     context=g_option_context_new(
   400       "file - looks for errors in Project Gutenberg(TM) etexts");
   401     g_option_context_add_main_entries(context,options,NULL);
   402     if (!g_option_context_parse(context,argc,argv,&err))
   403     {
   404 	g_printerr("Bookloupe: %s\n",err->message);
   405 	g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
   406 	exit(1);
   407     }
   408     /* Paranoid checking is turned OFF, not on, by its switch */
   409     pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
   410     if (pswit[PARANOID_SWITCH])
   411 	/* if running in paranoid mode, typo checks default to enabled */
   412 	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
   413     /* Line-end checking is turned OFF, not on, by its switch */
   414     pswit[LINE_END_SWITCH]=!pswit[LINE_END_SWITCH];
   415     /* Echoing is turned OFF, not on, by its switch */
   416     pswit[ECHO_SWITCH]=!pswit[ECHO_SWITCH];
   417     if (pswit[OVERVIEW_SWITCH])
   418 	/* just print summary; don't echo */
   419 	pswit[ECHO_SWITCH]=FALSE;
   420     /*
   421      * Web uploads - for the moment, this is really just a placeholder
   422      * until we decide what processing we really want to do on web uploads
   423      */
   424     if (pswit[WEB_SWITCH])
   425     {
   426 	/* specific override for web uploads */
   427 	pswit[ECHO_SWITCH]=TRUE;
   428 	pswit[SQUOTE_SWITCH]=FALSE;
   429 	pswit[TYPO_SWITCH]=TRUE;
   430 	pswit[QPARA_SWITCH]=FALSE;
   431 	pswit[PARANOID_SWITCH]=TRUE;
   432 	pswit[LINE_END_SWITCH]=FALSE;
   433 	pswit[OVERVIEW_SWITCH]=FALSE;
   434 	pswit[STDOUT_SWITCH]=FALSE;
   435 	pswit[HEADER_SWITCH]=TRUE;
   436 	pswit[VERBOSE_SWITCH]=FALSE;
   437 	pswit[MARKUP_SWITCH]=FALSE;
   438 	pswit[USERTYPO_SWITCH]=FALSE;
   439 	pswit[DP_SWITCH]=FALSE;
   440     }
   441     if (*argc<2)
   442     {
   443 	proghelp(context);
   444 	exit(1);
   445     }
   446     g_option_context_free(context);
   447 }
   448 
   449 /*
   450  * read_user_scannos:
   451  *
   452  * Read in the user-defined stealth scanno list.
   453  */
   454 void read_user_scannos(void)
   455 {
   456     GError *err=NULL;
   457     gchar *usertypo_file;
   458     gboolean okay;
   459     int i;
   460     gsize len,nb;
   461     gchar *contents,*utf8,**lines;
   462     usertypo_file=g_strdup("bookloupe.typ");
   463     okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   464     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   465     {
   466 	g_clear_error(&err);
   467 	g_free(usertypo_file);
   468 	usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
   469 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   470     }
   471     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   472     {
   473 	g_clear_error(&err);
   474 	g_free(usertypo_file);
   475 	usertypo_file=g_strdup("gutcheck.typ");
   476 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   477     }
   478     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   479     {
   480 	g_clear_error(&err);
   481 	g_free(usertypo_file);
   482 	usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
   483 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   484     }
   485     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   486     {
   487 	g_free(usertypo_file);
   488 	g_print("   --> I couldn't find bookloupe.typ "
   489 	  "-- proceeding without user typos.\n");
   490 	return;
   491     }
   492     else if (!okay)
   493     {
   494 	fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
   495 	g_free(usertypo_file);
   496 	g_clear_error(&err);
   497 	exit(1);
   498     }
   499     utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
   500     g_free(contents);
   501     lines=g_strsplit_set(utf8,"\r\n",0);
   502     g_free(utf8);
   503     usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
   504     for (i=0;lines[i];i++)
   505 	if (*(unsigned char *)lines[i]>'!')
   506 	    g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
   507 	else
   508 	    g_free(lines[i]);
   509     g_free(lines);
   510 }
   511 
   512 /*
   513  * read_etext:
   514  *
   515  * Read an etext returning a newly allocated string containing the file
   516  * contents or NULL on error.
   517  */
   518 gchar *read_etext(const char *filename,GError **err)
   519 {
   520     gchar *contents,*utf8;
   521     gsize len,nb;
   522     if (!g_file_get_contents(filename,&contents,&len,err))
   523 	return NULL;
   524     utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
   525     g_free(contents);
   526     return utf8;
   527 }
   528 
   529 int main(int argc,char **argv)
   530 {
   531     running_from=g_path_get_dirname(argv[0]);
   532     parse_options(&argc,&argv);
   533     if (pswit[USERTYPO_SWITCH])
   534 	read_user_scannos();
   535     fprintf(stderr,"bookloupe: Check and report on an e-text\n");
   536     procfile(argv[1]);
   537     if (pswit[OVERVIEW_SWITCH])
   538     {
   539 	g_print("    Checked %ld lines of %ld (head+foot = %ld)\n\n",
   540 	  checked_linecnt,linecnt,linecnt-checked_linecnt);
   541 	g_print("    --------------- Queries found --------------\n");
   542 	if (cnt_long)
   543 	    g_print("    Long lines:		    %14ld\n",cnt_long);
   544 	if (cnt_short)
   545 	    g_print("    Short lines:		   %14ld\n",cnt_short);
   546 	if (cnt_lineend)
   547 	    g_print("    Line-end problems:	     %14ld\n",cnt_lineend);
   548 	if (cnt_word)
   549 	    g_print("    Common typos:		  %14ld\n",cnt_word);
   550 	if (cnt_dquot)
   551 	    g_print("    Unmatched quotes:	      %14ld\n",cnt_dquot);
   552 	if (cnt_squot)
   553 	    g_print("    Unmatched SingleQuotes:	%14ld\n",cnt_squot);
   554 	if (cnt_brack)
   555 	    g_print("    Unmatched brackets:	    %14ld\n",cnt_brack);
   556 	if (cnt_bin)
   557 	    g_print("    Non-ASCII characters:	  %14ld\n",cnt_bin);
   558 	if (cnt_odd)
   559 	    g_print("    Proofing characters:	   %14ld\n",cnt_odd);
   560 	if (cnt_punct)
   561 	    g_print("    Punctuation & spacing queries: %14ld\n",cnt_punct);
   562 	if (cnt_dash)
   563 	    g_print("    Non-standard dashes:	   %14ld\n",cnt_dash);
   564 	if (cnt_html)
   565 	    g_print("    Possible HTML tags:	    %14ld\n",cnt_html);
   566 	g_print("\n");
   567 	g_print("    TOTAL QUERIES		  %14ld\n",
   568 	  cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+
   569 	  cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend);
   570     }
   571     g_free(running_from);
   572     if (usertypo)
   573 	g_tree_unref(usertypo);
   574     return 0;
   575 }
   576 
   577 /*
   578  * first_pass:
   579  *
   580  * Run a first pass - verify that it's a valid PG
   581  * file, decide whether to report some things that
   582  * occur many times in the text like long or short
   583  * lines, non-standard dashes, etc.
   584  */
   585 struct first_pass_results *first_pass(const char *etext)
   586 {
   587     gunichar laststart=CHAR_SPACE;
   588     const char *s;
   589     gchar *lc_line;
   590     int i,j,lbytes,llen;
   591     gchar **lines;
   592     unsigned int lastlen=0,lastblen=0;
   593     long spline=0,nspline=0;
   594     static struct first_pass_results results={0};
   595     gchar *inword;
   596     lines=g_strsplit(etext,"\n",0);
   597     for (j=0;lines[j];j++)
   598     {
   599 	lbytes=strlen(lines[j]);
   600 	while (lines[j][lbytes-1]=='\r')
   601 	    lines[j][--lbytes]='\0';
   602 	llen=g_utf8_strlen(lines[j],lbytes);
   603 	linecnt++;
   604 	if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
   605 	  (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
   606 	{
   607 	    if (spline)
   608 		g_print("   --> Duplicate header?\n");
   609 	    spline=linecnt+1;   /* first line of non-header text, that is */
   610 	}
   611 	if (!strncmp(lines[j],"*** START",9) &&
   612 	  strstr(lines[j],"PROJECT GUTENBERG"))
   613 	{
   614 	    if (nspline)
   615 		g_print("   --> Duplicate header?\n");
   616 	    nspline=linecnt+1;   /* first line of non-header text, that is */
   617 	}
   618 	if (spline || nspline)
   619 	{
   620 	    lc_line=g_utf8_strdown(lines[j],lbytes);
   621 	    if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
   622 	    {
   623 		if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
   624 		{
   625 		    if (results.footerline)
   626 		    {
   627 			/* it's an old-form header - we can detect duplicates */
   628 			if (!nspline)
   629 			    g_print("   --> Duplicate footer?\n");
   630 		    }
   631 		    else
   632 			results.footerline=linecnt;
   633 		}
   634 	    }
   635 	    g_free(lc_line);
   636 	}
   637 	if (spline)
   638 	    results.firstline=spline;
   639 	if (nspline)
   640 	    results.firstline=nspline;  /* override with new */
   641 	if (results.footerline)
   642 	    continue;    /* don't count the boilerplate in the footer */
   643 	results.totlen+=llen;
   644 	for (s=lines[j];*s;s=g_utf8_next_char(s))
   645 	{
   646 	    if (g_utf8_get_char(s)>127)
   647 		results.binlen++;
   648 	    if (g_unichar_isalpha(g_utf8_get_char(s)))
   649 		results.alphalen++;
   650 	    if (s>lines[j] && g_utf8_get_char(s)==CHAR_DQUOTE &&
   651 	      isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
   652 		results.endquote_count++;
   653 	}
   654 	if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
   655 	  lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
   656 	    results.shortline++;
   657 	if (lbytes>0 &&
   658 	  g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
   659 	    cnt_spacend++;
   660 	if (strstr(lines[j],".,"))
   661 	    results.dotcomma++;
   662 	/* only count ast lines for ignoring purposes where there is */
   663 	/* locase text on the line */
   664 	if (strchr(lines[j],'*'))
   665 	{
   666 	    for (s=lines[j];*s;s=g_utf8_next_char(s))
   667 		if (g_unichar_islower(g_utf8_get_char(s)))
   668 		    break;
   669 	    if (*s)
   670 		results.astline++;
   671 	}
   672 	if (strchr(lines[j],'/'))
   673 	    results.fslashline++;
   674 	for (s=g_utf8_prev_char(lines[j]+lbytes);
   675 	  s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
   676 	    ;
   677 	if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
   678 	  g_utf8_get_char(g_utf8_prev_char(s))!='-')
   679 	    results.hyphens++;
   680 	if (llen>LONGEST_PG_LINE)
   681 	    results.longline++;
   682 	if (llen>WAY_TOO_LONG)
   683 	    results.verylongline++;
   684 	if (strchr(lines[j],'<') && strchr(lines[j],'>'))
   685 	{
   686 	    i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
   687 	    if (i>0)
   688 		results.htmcount++;
   689 	    if (strstr(lines[j],"<i>"))
   690 		results.htmcount+=4; /* bonus marks! */
   691 	}
   692 	/* Check for spaced em-dashes */
   693 	if (lines[j][0] && (s=strstr(g_utf8_next_char(lines[j]),"--")))
   694 	{
   695 	    results.emdash++;
   696 	    if (s[-1]==CHAR_SPACE || s[2]==CHAR_SPACE)
   697 		results.space_emdash++;
   698 	    if (s[-1]==CHAR_SPACE && s[2]==CHAR_SPACE)
   699 		/* count of em-dashes with spaces both sides */
   700 		results.non_PG_space_emdash++;
   701 	    if (s[-1]!=CHAR_SPACE && s[2]!=CHAR_SPACE)
   702 		/* count of PG-type em-dashes with no spaces */
   703 		results.PG_space_emdash++;
   704 	}
   705 	for (s=lines[j];*s;)
   706 	{
   707 	    inword=getaword(&s);
   708 	    if (!strcmp(inword,"hij") || !strcmp(inword,"niet")) 
   709 		results.Dutchcount++;
   710 	    if (!strcmp(inword,"dans") || !strcmp(inword,"avec")) 
   711 		results.Frenchcount++;
   712 	    if (!strcmp(inword,"0") || !strcmp(inword,"1")) 
   713 		results.standalone_digit++;
   714 	    g_free(inword);
   715 	}
   716 	/* Check for spaced dashes */
   717 	if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
   718 	    results.spacedash++;
   719 	lastblen=lastlen;
   720 	lastlen=llen;
   721 	laststart=lines[j][0];
   722     }
   723     g_strfreev(lines);
   724     return &results;
   725 }
   726 
   727 /*
   728  * report_first_pass:
   729  *
   730  * Make some snap decisions based on the first pass results.
   731  */
   732 struct warnings *report_first_pass(struct first_pass_results *results)
   733 {
   734     static struct warnings warnings={0};
   735     if (cnt_spacend>0)
   736 	g_print("   --> %ld lines in this file have white space at end\n",
   737 	  cnt_spacend);
   738     warnings.dotcomma=1;
   739     if (results->dotcomma>5)
   740     {
   741 	warnings.dotcomma=0;
   742 	g_print("   --> %ld lines in this file contain '.,'. "
   743 	  "Not reporting them.\n",results->dotcomma);
   744     }
   745     /*
   746      * If more than 50 lines, or one-tenth, are short,
   747      * don't bother reporting them.
   748      */
   749     warnings.shortline=1;
   750     if (results->shortline>50 || results->shortline*10>linecnt)
   751     {
   752 	warnings.shortline=0;
   753 	g_print("   --> %ld lines in this file are short. "
   754 	  "Not reporting short lines.\n",results->shortline);
   755     }
   756     /*
   757      * If more than 50 lines, or one-tenth, are long,
   758      * don't bother reporting them.
   759      */
   760     warnings.longline=1;
   761     if (results->longline>50 || results->longline*10>linecnt)
   762     {
   763 	warnings.longline=0;
   764 	g_print("   --> %ld lines in this file are long. "
   765 	  "Not reporting long lines.\n",results->longline);
   766     }
   767     /* If more than 10 lines contain asterisks, don't bother reporting them. */
   768     warnings.ast=1;
   769     if (results->astline>10)
   770     {
   771 	warnings.ast=0;
   772 	g_print("   --> %ld lines in this file contain asterisks. "
   773 	  "Not reporting them.\n",results->astline);
   774     }
   775     /*
   776      * If more than 10 lines contain forward slashes,
   777      * don't bother reporting them.
   778      */
   779     warnings.fslash=1;
   780     if (results->fslashline>10)
   781     {
   782 	warnings.fslash=0;
   783 	g_print("   --> %ld lines in this file contain forward slashes. "
   784 	  "Not reporting them.\n",results->fslashline);
   785     }
   786     /*
   787      * If more than 20 lines contain unpunctuated endquotes,
   788      * don't bother reporting them.
   789      */
   790     warnings.endquote=1;
   791     if (results->endquote_count>20)
   792     {
   793 	warnings.endquote=0;
   794 	g_print("   --> %ld lines in this file contain unpunctuated endquotes. "
   795 	  "Not reporting them.\n",results->endquote_count);
   796     }
   797     /*
   798      * If more than 15 lines contain standalone digits,
   799      * don't bother reporting them.
   800      */
   801     warnings.digit=1;
   802     if (results->standalone_digit>10)
   803     {
   804 	warnings.digit=0;
   805 	g_print("   --> %ld lines in this file contain standalone 0s and 1s. "
   806 	  "Not reporting them.\n",results->standalone_digit);
   807     }
   808     /*
   809      * If more than 20 lines contain hyphens at end,
   810      * don't bother reporting them.
   811      */
   812     warnings.hyphen=1;
   813     if (results->hyphens>20)
   814     {
   815 	warnings.hyphen=0;
   816 	g_print("   --> %ld lines in this file have hyphens at end. "
   817 	  "Not reporting them.\n",results->hyphens);
   818     }
   819     if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
   820     {
   821 	g_print("   --> Looks like this is HTML. Switching HTML mode ON.\n");
   822 	pswit[MARKUP_SWITCH]=1;
   823     }
   824     if (results->verylongline>0)
   825 	g_print("   --> %ld lines in this file are VERY long!\n",
   826 	  results->verylongline);
   827     /*
   828      * If there are more non-PG spaced dashes than PG em-dashes,
   829      * assume it's deliberate.
   830      * Current PG guidelines say don't use them, but older texts do,
   831      * and some people insist on them whatever the guidelines say.
   832      */
   833     warnings.dash=1;
   834     if (results->spacedash+results->non_PG_space_emdash>
   835       results->PG_space_emdash)
   836     {
   837 	warnings.dash=0;
   838 	g_print("   --> There are %ld spaced dashes and em-dashes. "
   839 	  "Not reporting them.\n",
   840 	  results->spacedash+results->non_PG_space_emdash);
   841     }
   842     /* If more than a quarter of characters are hi-bit, bug out. */
   843     warnings.bin=1;
   844     if (results->binlen*4>results->totlen)
   845     {
   846 	g_print("   --> This file does not appear to be ASCII. "
   847 	  "Terminating. Best of luck with it!\n");
   848 	exit(1);
   849     }
   850     if (results->alphalen*4<results->totlen)
   851     {
   852 	g_print("   --> This file does not appear to be text. "
   853 	  "Terminating. Best of luck with it!\n");
   854 	exit(1);
   855     }
   856     if (results->binlen*100>results->totlen || results->binlen>100)
   857     {
   858 	g_print("   --> There are a lot of foreign letters here. "
   859 	  "Not reporting them.\n");
   860 	warnings.bin=0;
   861     }
   862     warnings.isDutch=FALSE;
   863     if (results->Dutchcount>50)
   864     {
   865 	warnings.isDutch=TRUE;
   866 	g_print("   --> This looks like Dutch - "
   867 	  "switching off dashes and warnings for 's Middags case.\n");
   868     }
   869     warnings.isFrench=FALSE;
   870     if (results->Frenchcount>50)
   871     {
   872 	warnings.isFrench=TRUE;
   873 	g_print("   --> This looks like French - "
   874 	  "switching off some doublepunct.\n");
   875     }
   876     if (results->firstline && results->footerline)
   877 	g_print("    The PG header and footer appear to be already on.\n");
   878     else
   879     {
   880 	if (results->firstline)
   881 	    g_print("    The PG header is on - no footer.\n");
   882 	if (results->footerline)
   883 	    g_print("    The PG footer is on - no header.\n");
   884     }
   885     g_print("\n");
   886     if (pswit[VERBOSE_SWITCH])
   887     {
   888 	warnings.bin=1;
   889 	warnings.shortline=1;
   890 	warnings.dotcomma=1;
   891 	warnings.longline=1;
   892 	warnings.dash=1;
   893 	warnings.digit=1;
   894 	warnings.ast=1;
   895 	warnings.fslash=1;
   896 	warnings.hyphen=1;
   897 	warnings.endquote=1;
   898 	g_print("   *** Verbose output is ON -- you asked for it! ***\n");
   899     }
   900     if (warnings.isDutch)
   901 	warnings.dash=0;
   902     if (results->footerline>0 && results->firstline>0 &&
   903       results->footerline>results->firstline &&
   904       results->footerline-results->firstline<100)
   905     {
   906 	g_print("   --> I don't really know where this text starts. \n");
   907 	g_print("       There are no reference points.\n");
   908 	g_print("       I'm going to have to report the header and footer "
   909 	  "as well.\n");
   910 	results->firstline=0;
   911     }
   912     return &warnings;
   913 }
   914 
   915 /*
   916  * analyse_quotes:
   917  *
   918  * Look along the line, accumulate the count of quotes, and see
   919  * if this is an empty line - i.e. a line with nothing on it
   920  * but spaces.
   921  * If line has just spaces, period, * and/or - on it, don't
   922  * count it, since empty lines with asterisks or dashes to
   923  * separate sections are common.
   924  *
   925  * Returns: TRUE if the line is empty.
   926  */
   927 gboolean analyse_quotes(const char *aline,struct counters *counters)
   928 {
   929     int guessquote=0;
   930     /* assume the line is empty until proven otherwise */
   931     gboolean isemptyline=TRUE;
   932     const char *s=aline,*sprev,*snext;
   933     gunichar c;
   934     sprev=NULL;
   935     while (*s)
   936     {
   937 	snext=g_utf8_next_char(s);
   938 	c=g_utf8_get_char(s);
   939 	if (c==CHAR_DQUOTE)
   940 	    counters->quot++;
   941 	if (c==CHAR_SQUOTE || c==CHAR_OPEN_SQUOTE)
   942 	{
   943 	    if (s==aline)
   944 	    {
   945 		/*
   946 		 * At start of line, it can only be an openquote.
   947 		 * Hardcode a very common exception!
   948 		 */
   949 		if (!g_str_has_prefix(snext,"tis") &&
   950 		  !g_str_has_prefix(snext,"Tis"))
   951 		    counters->open_single_quote++;
   952 	    }
   953 	    else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
   954 	      g_unichar_isalpha(g_utf8_get_char(snext)))
   955 		/* Do nothing! it's definitely an apostrophe, not a quote */
   956 		;
   957 	    /* it's outside a word - let's check it out */
   958 	    else if (c==CHAR_OPEN_SQUOTE ||
   959 	      g_unichar_isalpha(g_utf8_get_char(snext)))
   960 	    {
   961 		/* it damwell better BE an openquote */
   962 		if (!g_str_has_prefix(snext,"tis") &&
   963 		  !g_str_has_prefix(snext,"Tis"))
   964 		    /* hardcode a very common exception! */
   965 		    counters->open_single_quote++;
   966 	    }
   967 	    else
   968 	    {
   969 		/* now - is it a closequote? */
   970 		guessquote=0;   /* accumulate clues */
   971 		if (g_unichar_isalpha(g_utf8_get_char(sprev)))
   972 		{
   973 		    /* it follows a letter - could be either */
   974 		    guessquote++;
   975 		    if (g_utf8_get_char(sprev)=='s')
   976 		    {
   977 			/* looks like a plural apostrophe */
   978 			guessquote-=3;
   979 			if (g_utf8_get_char(snext)==CHAR_SPACE)
   980 			    /* bonus marks! */
   981 			    guessquote-=2;
   982 		    }
   983 		}
   984 		/* it doesn't have a letter either side */
   985 		else if (strchr(".?!,;:",g_utf8_get_char(sprev)) &&
   986 		  strchr(".?!,;: ",g_utf8_get_char(snext)))
   987 		    guessquote+=8; /* looks like a closequote */
   988 		else
   989 		    guessquote++;
   990 		if (counters->open_single_quote>counters->close_single_quote)
   991 		    /*
   992 		     * Give it the benefit of some doubt,
   993 		     * if a squote is already open.
   994 		     */
   995 		    guessquote++;
   996 		else
   997 		    guessquote--;
   998 		if (guessquote>=0)
   999 		    counters->close_single_quote++;
  1000 	    }
  1001 	}
  1002 	if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
  1003 	  c!='\r' && c!='\n')
  1004 	    isemptyline=FALSE;  /* ignore lines like  *  *  *  as spacers */
  1005 	if (c==CHAR_UNDERSCORE)
  1006 	    counters->c_unders++;
  1007 	if (c==CHAR_OPEN_CBRACK)
  1008 	    counters->c_brack++;
  1009 	if (c==CHAR_CLOSE_CBRACK)
  1010 	    counters->c_brack--;
  1011 	if (c==CHAR_OPEN_RBRACK)
  1012 	    counters->r_brack++;
  1013 	if (c==CHAR_CLOSE_RBRACK)
  1014 	    counters->r_brack--;
  1015 	if (c==CHAR_OPEN_SBRACK)
  1016 	    counters->s_brack++;
  1017 	if (c==CHAR_CLOSE_SBRACK)
  1018 	    counters->s_brack--;
  1019 	sprev=s;
  1020 	s=snext;
  1021     }
  1022     return isemptyline;
  1023 }
  1024 
  1025 /*
  1026  * check_for_control_characters:
  1027  *
  1028  * Check for invalid or questionable characters in the line
  1029  * Anything above 127 is invalid for plain ASCII, and
  1030  * non-printable control characters should also be flagged.
  1031  * Tabs should generally not be there.
  1032  */
  1033 void check_for_control_characters(const char *aline)
  1034 {
  1035     gunichar c;
  1036     const char *s;
  1037     for (s=aline;*s;s=g_utf8_next_char(s))
  1038     {
  1039 	c=g_utf8_get_char(s);
  1040 	if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
  1041 	{
  1042 	    if (pswit[ECHO_SWITCH])
  1043 		g_print("\n%s\n",aline);
  1044 	    if (!pswit[OVERVIEW_SWITCH])
  1045 		g_print("    Line %ld column %ld - Control character %u\n",
  1046 		  linecnt,g_utf8_pointer_to_offset(s,aline)+1,c);
  1047 	    else
  1048 		cnt_bin++;
  1049 	}
  1050     }
  1051 }
  1052 
  1053 /*
  1054  * check_for_odd_characters:
  1055  *
  1056  * Check for binary and other odd characters.
  1057  */
  1058 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
  1059   gboolean isemptyline)
  1060 {
  1061     /* Don't repeat multiple warnings on one line. */
  1062     gboolean eNon_A=FALSE,eTab=FALSE,eTilde=FALSE;
  1063     gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
  1064     const char *s;
  1065     gunichar c;
  1066     for (s=aline;*s;s=g_utf8_next_char(s))
  1067     {
  1068 	c=g_utf8_get_char(s);
  1069 	if (!eNon_A && (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
  1070 	{
  1071 	    if (pswit[ECHO_SWITCH])
  1072 		g_print("\n%s\n",aline);
  1073 	    if (!pswit[OVERVIEW_SWITCH])
  1074 		if (c>127 && c<160 || c>255)
  1075 		    g_print("    Line %ld column %ld - "
  1076 		      "Non-ISO-8859 character %u\n",
  1077 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1078 		else
  1079 		    g_print("    Line %ld column %ld - "
  1080 		      "Non-ASCII character %u\n",
  1081 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1082 	    else
  1083 		cnt_bin++;
  1084 	    eNon_A=TRUE;
  1085 	}
  1086 	if (!eTab && c==CHAR_TAB)
  1087 	{
  1088 	    if (pswit[ECHO_SWITCH])
  1089 		g_print("\n%s\n",aline);
  1090 	    if (!pswit[OVERVIEW_SWITCH])
  1091 		g_print("    Line %ld column %ld - Tab character?\n",
  1092 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1093 	    else
  1094 		cnt_odd++;
  1095 	    eTab=TRUE;
  1096 	}
  1097 	if (!eTilde && c==CHAR_TILDE)
  1098 	{
  1099 	    /*
  1100 	     * Often used by OCR software to indicate an
  1101 	     * unrecognizable character.
  1102 	     */
  1103 	    if (pswit[ECHO_SWITCH])
  1104 		g_print("\n%s\n",aline);
  1105 	    if (!pswit[OVERVIEW_SWITCH])
  1106 		g_print("    Line %ld column %ld - Tilde character?\n",
  1107 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1108 	    else
  1109 		cnt_odd++;
  1110 	    eTilde=TRUE;
  1111 	}
  1112 	if (!eCarat && c==CHAR_CARAT)
  1113 	{  
  1114 	    if (pswit[ECHO_SWITCH])
  1115 		g_print("\n%s\n",aline);
  1116 	    if (!pswit[OVERVIEW_SWITCH])
  1117 		g_print("    Line %ld column %ld - Carat character?\n",
  1118 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1119 	    else
  1120 		cnt_odd++;
  1121 	    eCarat=TRUE;
  1122 	}
  1123 	if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
  1124 	{  
  1125 	    if (pswit[ECHO_SWITCH])
  1126 		g_print("\n%s\n",aline);
  1127 	    if (!pswit[OVERVIEW_SWITCH])
  1128 		g_print("    Line %ld column %ld - Forward slash?\n",
  1129 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1130 	    else
  1131 		cnt_odd++;
  1132 	    eFSlash=TRUE;
  1133 	}
  1134 	/*
  1135 	 * Report asterisks only in paranoid mode,
  1136 	 * since they're often deliberate.
  1137 	 */
  1138 	if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
  1139 	  c==CHAR_ASTERISK)
  1140 	{
  1141 	    if (pswit[ECHO_SWITCH])
  1142 		g_print("\n%s\n",aline);
  1143 	    if (!pswit[OVERVIEW_SWITCH])
  1144 		g_print("    Line %ld column %ld - Asterisk?\n",
  1145 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1146 	    else
  1147 		cnt_odd++;
  1148 	    eAst=TRUE;
  1149 	}
  1150     }
  1151 }
  1152 
  1153 /*
  1154  * check_for_long_line:
  1155  *
  1156  * Check for line too long.
  1157  */
  1158 void check_for_long_line(const char *aline)
  1159 {
  1160     if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
  1161     {
  1162 	if (pswit[ECHO_SWITCH])
  1163 	    g_print("\n%s\n",aline);
  1164 	if (!pswit[OVERVIEW_SWITCH])
  1165 	    g_print("    Line %ld column %ld - Long line %ld\n",
  1166 	      linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
  1167 	else
  1168 	    cnt_long++;
  1169     }
  1170 }
  1171 
  1172 /*
  1173  * check_for_short_line:
  1174  *
  1175  * Check for line too short.
  1176  *
  1177  * This one is a bit trickier to implement: we don't want to
  1178  * flag the last line of a paragraph for being short, so we
  1179  * have to wait until we know that our current line is a
  1180  * "normal" line, then report the _previous_ line if it was too
  1181  * short. We also don't want to report indented lines like
  1182  * chapter heads or formatted quotations. We therefore keep
  1183  * last->len as the length of the last line examined, and
  1184  * last->blen as the length of the last but one, and try to
  1185  * suppress unnecessary warnings by checking that both were of
  1186  * "normal" length. We keep the first character of the last
  1187  * line in last->start, and if it was a space, we assume that
  1188  * the formatting is deliberate. I can't figure out a way to
  1189  * distinguish something like a quoted verse left-aligned or
  1190  * the header or footer of a letter from a paragraph of short
  1191  * lines - maybe if I examined the whole paragraph, and if the
  1192  * para has less than, say, 8 lines and if all lines are short,
  1193  * then just assume it's OK? Need to look at some texts to see
  1194  * how often a formula like this would get the right result.
  1195  */
  1196 void check_for_short_line(const char *aline,const struct line_properties *last)
  1197 {
  1198     if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
  1199       last->len<SHORTEST_PG_LINE && last->blen>1 &&
  1200       last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
  1201     {
  1202 	if (pswit[ECHO_SWITCH])
  1203 	    g_print("\n%s\n",prevline);
  1204 	if (!pswit[OVERVIEW_SWITCH])
  1205 	    g_print("    Line %ld column %ld - Short line %ld?\n",
  1206 	      linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
  1207 	else
  1208 	    cnt_short++;
  1209     }
  1210 }
  1211 
  1212 /*
  1213  * check_for_starting_punctuation:
  1214  *
  1215  * Look for punctuation other than full ellipses at start of line.
  1216  */
  1217 void check_for_starting_punctuation(const char *aline)
  1218 {
  1219     if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
  1220       !g_str_has_prefix(aline,". . ."))
  1221     {
  1222 	if (pswit[ECHO_SWITCH])
  1223 	    g_print("\n%s\n",aline);
  1224 	if (!pswit[OVERVIEW_SWITCH])
  1225 	    g_print("    Line %ld column 1 - Begins with punctuation?\n",
  1226 	      linecnt);
  1227 	else
  1228 	    cnt_punct++;
  1229     }
  1230 }
  1231 
  1232 /*
  1233  * check_for_spaced_emdash:
  1234  *
  1235  * Check for spaced em-dashes.
  1236  *
  1237  * We must check _all_ occurrences of "--" on the line
  1238  * hence the loop - even if the first double-dash is OK
  1239  * there may be another that's wrong later on.
  1240  */
  1241 void check_for_spaced_emdash(const char *aline)
  1242 {
  1243     const char *s,*t,*next;
  1244     for (s=aline;t=strstr(s,"--");s=next)
  1245     {
  1246 	next=g_utf8_next_char(g_utf8_next_char(t));
  1247 	if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
  1248 	  g_utf8_get_char(next)==CHAR_SPACE)
  1249 	{
  1250 	    if (pswit[ECHO_SWITCH])
  1251 		g_print("\n%s\n",aline);
  1252 	    if (!pswit[OVERVIEW_SWITCH])
  1253 		g_print("    Line %ld column %ld - Spaced em-dash?\n",
  1254 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  1255 	    else
  1256 		cnt_dash++;
  1257 	}
  1258     }
  1259 }
  1260 
  1261 /*
  1262  * check_for_spaced_dash:
  1263  *
  1264  * Check for spaced dashes.
  1265  */
  1266 void check_for_spaced_dash(const char *aline)
  1267 {
  1268     const char *s;
  1269     if ((s=strstr(aline," -")))
  1270     {
  1271 	if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
  1272 	{
  1273 	    if (pswit[ECHO_SWITCH])
  1274 		g_print("\n%s\n",aline);
  1275 	    if (!pswit[OVERVIEW_SWITCH])
  1276 		g_print("    Line %ld column %ld - Spaced dash?\n",
  1277 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1278 	    else
  1279 		cnt_dash++;
  1280 	}
  1281     }
  1282     else if ((s=strstr(aline,"- ")))
  1283     {
  1284 	if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
  1285 	{
  1286 	    if (pswit[ECHO_SWITCH])
  1287 		g_print("\n%s\n",aline);
  1288 	    if (!pswit[OVERVIEW_SWITCH])
  1289 		g_print("    Line %ld column %ld - Spaced dash?\n",
  1290 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1291 	    else
  1292 		cnt_dash++;
  1293 	}
  1294     }
  1295 }
  1296 
  1297 /*
  1298  * check_for_unmarked_paragraphs:
  1299  *
  1300  * Check for unmarked paragraphs indicated by separate speakers.
  1301  *
  1302  * May well be false positive:
  1303  * "Bravo!" "Wonderful!" called the crowd.
  1304  * but useful all the same.
  1305  */
  1306 void check_for_unmarked_paragraphs(const char *aline)
  1307 {
  1308     const char *s;
  1309     s=strstr(aline,"\"  \"");
  1310     if (!s)
  1311 	s=strstr(aline,"\" \"");
  1312     if (s)
  1313     {
  1314 	if (pswit[ECHO_SWITCH])
  1315 	    g_print("\n%s\n",aline);
  1316 	if (!pswit[OVERVIEW_SWITCH])
  1317 	    g_print("    Line %ld column %ld - "
  1318 	      "Query missing paragraph break?\n",
  1319 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1320 	else
  1321 	    cnt_punct++;
  1322     }
  1323 }
  1324 
  1325 /*
  1326  * check_for_jeebies:
  1327  *
  1328  * Check for "to he" and other easy h/b errors.
  1329  *
  1330  * This is a very inadequate effort on the h/b problem,
  1331  * but the phrase "to he" is always an error, whereas "to
  1332  * be" is quite common.
  1333  * Similarly, '"Quiet!", be said.' is a non-be error
  1334  * "to he" is _not_ always an error!:
  1335  *       "Where they went to he couldn't say."
  1336  * Another false positive:
  1337  *       What would "Cinderella" be without the . . .
  1338  * and another: "If he wants to he can see for himself."
  1339  */
  1340 void check_for_jeebies(const char *aline)
  1341 {
  1342     const char *s;
  1343     s=strstr(aline," be could ");
  1344     if (!s)
  1345 	s=strstr(aline," be would ");
  1346     if (!s)
  1347 	s=strstr(aline," was be ");
  1348     if (!s)
  1349 	s=strstr(aline," be is ");
  1350     if (!s)
  1351 	s=strstr(aline," is be ");
  1352     if (!s)
  1353 	s=strstr(aline,"\", be ");
  1354     if (!s)
  1355 	s=strstr(aline,"\" be ");
  1356     if (!s)
  1357 	s=strstr(aline,"\" be ");
  1358     if (!s)
  1359 	s=strstr(aline," to he ");
  1360     if (s)
  1361     {
  1362 	if (pswit[ECHO_SWITCH])
  1363 	    g_print("\n%s\n",aline);
  1364 	if (!pswit[OVERVIEW_SWITCH])
  1365 	    g_print("    Line %ld column %ld - Query he/be error?\n",
  1366 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1367 	else
  1368 	    cnt_word++;
  1369     }
  1370     s=strstr(aline," the had ");
  1371     if (!s)
  1372 	s=strstr(aline," a had ");
  1373     if (!s)
  1374 	s=strstr(aline," they bad ");
  1375     if (!s)
  1376 	s=strstr(aline," she bad ");
  1377     if (!s)
  1378 	s=strstr(aline," he bad ");
  1379     if (!s)
  1380 	s=strstr(aline," you bad ");
  1381     if (!s)
  1382 	s=strstr(aline," i bad ");
  1383     if (s)
  1384     {
  1385 	if (pswit[ECHO_SWITCH])
  1386 	    g_print("\n%s\n",aline);
  1387 	if (!pswit[OVERVIEW_SWITCH])
  1388 	    g_print("    Line %ld column %ld - Query had/bad error?\n",
  1389 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1390 	else
  1391 	    cnt_word++;
  1392     }
  1393     s=strstr(aline,"; hut ");
  1394     if (!s)
  1395 	s=strstr(aline,", hut ");
  1396     if (s)
  1397     {
  1398 	if (pswit[ECHO_SWITCH])
  1399 	    g_print("\n%s\n",aline);
  1400 	if (!pswit[OVERVIEW_SWITCH])
  1401 	    g_print("    Line %ld column %ld - Query hut/but error?\n",
  1402 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1403 	else
  1404 	    cnt_word++;
  1405     }
  1406 }
  1407 
  1408 /*
  1409  * check_for_mta_from:
  1410  *
  1411  * Special case - angled bracket in front of "From" placed there by an
  1412  * MTA when sending an e-mail.
  1413  */
  1414 void check_for_mta_from(const char *aline)
  1415 {
  1416     const char *s;
  1417     s=strstr(aline,">From");
  1418     if (s)
  1419     {
  1420 	if (pswit[ECHO_SWITCH])
  1421 	    g_print("\n%s\n",aline);
  1422 	if (!pswit[OVERVIEW_SWITCH])
  1423 	    g_print("    Line %ld column %ld - "
  1424 	      "Query angled bracket with From\n",
  1425 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1426 	else
  1427 	    cnt_punct++;
  1428     }
  1429 }
  1430 
  1431 /*
  1432  * check_for_orphan_character:
  1433  *
  1434  * Check for a single character line -
  1435  * often an overflow from bad wrapping.
  1436  */
  1437 void check_for_orphan_character(const char *aline)
  1438 {
  1439     gunichar c;
  1440     c=g_utf8_get_char(aline);
  1441     if (c && !*g_utf8_next_char(aline))
  1442     {
  1443 	if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
  1444 	    ; /* Nothing - ignore numerals alone on a line. */
  1445 	else
  1446 	{
  1447 	    if (pswit[ECHO_SWITCH])
  1448 		g_print("\n%s\n",aline);
  1449 	    if (!pswit[OVERVIEW_SWITCH])
  1450 		g_print("    Line %ld column 1 - Query single character line\n",
  1451 		  linecnt);
  1452 	    else
  1453 		cnt_punct++;
  1454 	}
  1455     }
  1456 }
  1457 
  1458 /*
  1459  * check_for_pling_scanno:
  1460  *
  1461  * Check for I" - often should be !
  1462  */
  1463 void check_for_pling_scanno(const char *aline)
  1464 {
  1465     const char *s;
  1466     s=strstr(aline," I\"");
  1467     if (s)
  1468     {
  1469 	if (pswit[ECHO_SWITCH])
  1470 	    g_print("\n%s\n",aline);
  1471 	if (!pswit[OVERVIEW_SWITCH])
  1472 	    g_print("    Line %ld column %ld - Query I=exclamation mark?\n",
  1473 	      linecnt,g_utf8_pointer_to_offset(aline,s));
  1474 	else
  1475 	    cnt_punct++;
  1476     }
  1477 }
  1478 
  1479 /*
  1480  * check_for_extra_period:
  1481  *
  1482  * Check for period without a capital letter. Cut-down from gutspell.
  1483  * Only works when it happens on a single line.
  1484  */
  1485 void check_for_extra_period(const char *aline,const struct warnings *warnings)
  1486 {
  1487     const char *s,*t,*s1;
  1488     int i;
  1489     gsize len;
  1490     gboolean istypo;
  1491     gchar *testword;
  1492     gunichar *decomposition;
  1493     if (pswit[PARANOID_SWITCH])
  1494     {
  1495 	for (t=aline;t=strstr(t,". ");)
  1496 	{
  1497 	    if (t==aline)
  1498 	    {
  1499 		t=g_utf8_next_char(t);
  1500 		/* start of line punctuation is handled elsewhere */
  1501 		continue;
  1502 	    }
  1503 	    if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
  1504 	    {
  1505 		t=g_utf8_next_char(t);
  1506 		continue;
  1507 	    }
  1508 	    if (warnings->isDutch)
  1509 	    {
  1510 		/* For Frank & Jeroen -- 's Middags case */
  1511 		gunichar c2,c3,c4,c5;
  1512 		c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
  1513 		c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
  1514 		c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
  1515 		c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
  1516 		if (c2==CHAR_SQUOTE && g_unichar_islower(c3) &&
  1517 		  c4==CHAR_SPACE && g_unichar_isupper(c5))
  1518 		{
  1519 		    t=g_utf8_next_char(t);
  1520 		    continue;
  1521 		}
  1522 	    }
  1523 	    s1=g_utf8_next_char(g_utf8_next_char(t));
  1524 	    while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
  1525 	      !isdigit(g_utf8_get_char(s1)))
  1526 		s1=g_utf8_next_char(s1);
  1527 	    if (g_unichar_islower(g_utf8_get_char(s1)))
  1528 	    {
  1529 		/* we have something to investigate */
  1530 		istypo=TRUE;
  1531 		/* so let's go back and find out */
  1532 		for (s1=g_utf8_prev_char(t);s1>=aline &&
  1533 		  (g_unichar_isalpha(g_utf8_get_char(s1)) ||
  1534 		  g_unichar_isdigit(g_utf8_get_char(s1)) ||
  1535 		  g_utf8_get_char(s1)==CHAR_SQUOTE &&
  1536 		  g_unichar_isalpha(g_utf8_get_char(g_utf8_next_char(s1))) &&
  1537 		  g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s1))));
  1538 		  s1=g_utf8_prev_char(s1))
  1539 		    ;
  1540 		s1=g_utf8_next_char(s1);
  1541 		s=strchr(s1,'.');
  1542 		if (s)
  1543 		    testword=g_strndup(s1,s-s1);
  1544 		else
  1545 		    testword=g_strdup(s1);
  1546 		for (i=0;*abbrev[i];i++)
  1547 		    if (!strcmp(testword,abbrev[i]))
  1548 			istypo=FALSE;
  1549 		if (g_unichar_isdigit(g_utf8_get_char(testword)))
  1550 		    istypo=FALSE;
  1551 		if (!*g_utf8_next_char(testword))
  1552 		    istypo=FALSE;
  1553 		if (isroman(testword))
  1554 		    istypo=FALSE;
  1555 		if (istypo)
  1556 		{
  1557 		    istypo=FALSE;
  1558 		    for (s=testword;*s;s=g_utf8_next_char(s))
  1559 		    {
  1560 			decomposition=g_unicode_canonical_decomposition(
  1561 			  g_utf8_get_char(s),&len);
  1562 			if (g_utf8_strchr("aeiou",-1,decomposition[0]))
  1563 			    istypo=TRUE;
  1564 			g_free(decomposition);
  1565 		    }
  1566 		}
  1567 		if (istypo &&
  1568 		  (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
  1569 		{
  1570 		    g_tree_insert(qperiod,g_strdup(testword),
  1571 		      GINT_TO_POINTER(1));
  1572 		    if (pswit[ECHO_SWITCH])
  1573 			g_print("\n%s\n",aline);
  1574 		    if (!pswit[OVERVIEW_SWITCH])
  1575 			g_print("    Line %ld column %ld - Extra period?\n",
  1576 			  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  1577 		    else
  1578 			cnt_punct++;
  1579 		}
  1580 		g_free(testword);
  1581 	    }
  1582 	    t=g_utf8_next_char(t);
  1583 	}
  1584     }
  1585 }
  1586 
  1587 /*
  1588  * check_for_following_punctuation:
  1589  *
  1590  * Check for words usually not followed by punctuation.
  1591  */
  1592 void check_for_following_punctuation(const char *aline)
  1593 {
  1594     int i;
  1595     const char *s,*wordstart;
  1596     gunichar c;
  1597     gchar *inword,*t;
  1598     if (pswit[TYPO_SWITCH])
  1599     {
  1600 	for (s=aline;*s;)
  1601 	{
  1602 	    wordstart=s;
  1603 	    t=getaword(&s);
  1604 	    if (!*t)
  1605 	    {
  1606 		g_free(t);
  1607 		continue;
  1608 	    }
  1609 	    inword=g_utf8_strdown(t,-1);
  1610 	    g_free(t);
  1611 	    for (i=0;*nocomma[i];i++)
  1612 		if (!strcmp(inword,nocomma[i]))
  1613 		{
  1614 		    c=g_utf8_get_char(s);
  1615 		    if (c==',' || c==';' || c==':')
  1616 		    {
  1617 			if (pswit[ECHO_SWITCH])
  1618 			    g_print("\n%s\n",aline);
  1619 			if (!pswit[OVERVIEW_SWITCH])
  1620 			    g_print("    Line %ld column %ld - "
  1621 			      "Query punctuation after %s?\n",
  1622 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,
  1623 			      inword);
  1624 			else
  1625 			    cnt_punct++;
  1626 		    }
  1627 		}
  1628 	    for (i=0;*noperiod[i];i++)
  1629 		if (!strcmp(inword,noperiod[i]))
  1630 		{
  1631 		    c=g_utf8_get_char(s);
  1632 		    if (c=='.' || c=='!')
  1633 		    {
  1634 			if (pswit[ECHO_SWITCH])
  1635 			    g_print("\n%s\n",aline);
  1636 			if (!pswit[OVERVIEW_SWITCH])
  1637 			    g_print("    Line %ld column %ld - "
  1638 			      "Query punctuation after %s?\n",
  1639 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,
  1640 			      inword);
  1641 			else
  1642 			    cnt_punct++;
  1643 		    }
  1644 		}
  1645 	    g_free(inword);
  1646 	}
  1647     }
  1648 }
  1649 
  1650 /*
  1651  * check_for_typos:
  1652  *
  1653  * Check for commonly mistyped words,
  1654  * and digits like 0 for O in a word.
  1655  */
  1656 void check_for_typos(const char *aline,struct warnings *warnings)
  1657 {
  1658     const char *s,*t,*nt,*wordstart;
  1659     gchar *inword;
  1660     gunichar *decomposition;
  1661     gchar *testword;
  1662     int i,vowel,consonant,*dupcnt;
  1663     gboolean isdup,istypo,alower;
  1664     gunichar c;
  1665     long offset,len;
  1666     gsize decomposition_len;
  1667     for (s=aline;*s;)
  1668     {
  1669 	wordstart=s;
  1670 	inword=getaword(&s);
  1671 	if (!*inword)
  1672 	{
  1673 	    g_free(inword);
  1674 	    continue; /* don't bother with empty lines */
  1675 	}
  1676 	if (mixdigit(inword))
  1677 	{
  1678 	    if (pswit[ECHO_SWITCH])
  1679 		g_print("\n%s\n",aline);
  1680 	    if (!pswit[OVERVIEW_SWITCH])
  1681 		g_print("    Line %ld column %ld - Query digit in %s\n",
  1682 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
  1683 	    else
  1684 		cnt_word++;
  1685 	}
  1686 	/*
  1687 	 * Put the word through a series of tests for likely typos and OCR
  1688 	 * errors.
  1689 	 */
  1690 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
  1691 	{
  1692 	    istypo=FALSE;
  1693 	    alower=FALSE;
  1694 	    for (t=inword;*t;t=g_utf8_next_char(t))
  1695 	    {
  1696 		c=g_utf8_get_char(t);
  1697 		nt=g_utf8_next_char(t);
  1698 		/* lowercase for testing */
  1699 		if (g_unichar_islower(c))
  1700 		    alower=TRUE;
  1701 		if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
  1702 		{
  1703 		    /*
  1704 		     * We have an uppercase mid-word. However, there are
  1705 		     * common cases:
  1706 		     *   Mac and Mc like McGill
  1707 		     *   French contractions like l'Abbe
  1708 		     */
  1709 		    offset=g_utf8_pointer_to_offset(inword,t);
  1710 		    if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
  1711 		      offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
  1712 		      g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
  1713 		      offset>0 &&
  1714 		      g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SQUOTE)
  1715 			; /* do nothing! */
  1716 		    else
  1717 			istypo=TRUE;
  1718 		}
  1719 	    }
  1720 	    testword=g_utf8_casefold(inword,-1);
  1721 	}
  1722 	if (pswit[TYPO_SWITCH])
  1723 	{
  1724 	    /*
  1725 	     * Check for certain unlikely two-letter combinations at word
  1726 	     * start and end.
  1727 	     */
  1728 	    len=g_utf8_strlen(testword,-1);
  1729 	    if (len>1)
  1730 	    {
  1731 		for (i=0;*nostart[i];i++)
  1732 		    if (g_str_has_prefix(testword,nostart[i]))
  1733 			istypo=TRUE;
  1734 		for (i=0;*noend[i];i++)
  1735 		    if (g_str_has_suffix(testword,noend[i]))
  1736 			istypo=TRUE;
  1737 	    }
  1738 	    /* ght is common, gbt never. Like that. */
  1739 	    if (strstr(testword,"cb"))
  1740 		istypo=TRUE;
  1741 	    if (strstr(testword,"gbt"))
  1742 		istypo=TRUE;
  1743 	    if (strstr(testword,"pbt"))
  1744 		istypo=TRUE;
  1745 	    if (strstr(testword,"tbs"))
  1746 		istypo=TRUE;
  1747 	    if (strstr(testword,"mrn"))
  1748 		istypo=TRUE;
  1749 	    if (strstr(testword,"ahle"))
  1750 		istypo=TRUE;
  1751 	    if (strstr(testword,"ihle"))
  1752 		istypo=TRUE;
  1753 	    /*
  1754 	     * "TBE" does happen - like HEARTBEAT - but uncommon.
  1755 	     * Also "TBI" - frostbite, outbid - but uncommon.
  1756 	     * Similarly "ii" like Hawaii, or Pompeii, and in Roman
  1757 	     * numerals, but "ii" is a common scanno.
  1758 	     */
  1759 	    if (strstr(testword,"tbi"))
  1760 		istypo=TRUE;
  1761 	    if (strstr(testword,"tbe"))
  1762 		istypo=TRUE;
  1763 	    if (strstr(testword,"ii"))
  1764 		istypo=TRUE;
  1765 	    /*
  1766 	     * Check for no vowels or no consonants.
  1767 	     * If none, flag a typo.
  1768 	     */
  1769 	    if (!istypo && len>1)
  1770 	    {
  1771 		vowel=consonant=0;
  1772 		for (t=testword;*t;t=g_utf8_next_char(t))
  1773 		{
  1774 		    c=g_utf8_get_char(t);
  1775 		    decomposition=
  1776 		      g_unicode_canonical_decomposition(c,&decomposition_len);
  1777 		    if (c=='y' || g_unichar_isdigit(c))
  1778 		    {
  1779 			/* Yah, this is loose. */
  1780 			vowel++;
  1781 			consonant++;
  1782 		    }
  1783 		    else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
  1784 			vowel++;
  1785 		    else
  1786 			consonant++;
  1787 		    g_free(decomposition);
  1788 		}
  1789 		if (!vowel || !consonant)
  1790 		    istypo=TRUE;
  1791 	    }
  1792 	    /*
  1793 	     * Now exclude the word from being reported if it's in
  1794 	     * the okword list.
  1795 	     */
  1796 	    for (i=0;*okword[i];i++)
  1797 		if (!strcmp(testword,okword[i]))
  1798 		    istypo=FALSE;
  1799 	    /*
  1800 	     * What looks like a typo may be a Roman numeral.
  1801 	     * Exclude these.
  1802 	     */
  1803 	    if (istypo && isroman(testword))
  1804 		istypo=FALSE;
  1805 	    /* Check the manual list of typos. */
  1806 	    if (!istypo)
  1807 		for (i=0;*typo[i];i++)
  1808 		    if (!strcmp(testword,typo[i]))
  1809 			istypo=TRUE;
  1810 	    /*
  1811 	     * Check lowercase s, l, i and m - special cases.
  1812 	     *   "j" - often a semi-colon gone wrong.
  1813 	     *   "d" for a missing apostrophe - he d
  1814 	     *   "n" for "in"
  1815 	     */
  1816 	    if (!istypo && len==1 &&
  1817 	      g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
  1818 		istypo=TRUE;
  1819 	    if (istypo)
  1820 	    {
  1821 		dupcnt=g_tree_lookup(qword,testword);
  1822 		if (dupcnt)
  1823 		{
  1824 		    (*dupcnt)++;
  1825 		    isdup=!pswit[VERBOSE_SWITCH];
  1826 		}
  1827 		else
  1828 		{
  1829 		    dupcnt=g_new0(int,1);
  1830 		    g_tree_insert(qword,g_strdup(testword),dupcnt);
  1831 		    isdup=FALSE;
  1832 		}
  1833 		if (!isdup)
  1834 		{
  1835 		    if (pswit[ECHO_SWITCH])
  1836 			g_print("\n%s\n",aline);
  1837 		    if (!pswit[OVERVIEW_SWITCH])
  1838 		    {
  1839 			g_print("    Line %ld column %ld - Query word %s",
  1840 			  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
  1841 			  inword);
  1842 			if (!pswit[VERBOSE_SWITCH])
  1843 			    g_print(" - not reporting duplicates");
  1844 			g_print("\n");
  1845 		    }
  1846 		    else
  1847 			cnt_word++;
  1848 		}
  1849 	    }
  1850 	}
  1851 	/* check the user's list of typos */
  1852 	if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
  1853 	{
  1854 	    if (pswit[ECHO_SWITCH])
  1855 		g_print("\n%s\n",aline);
  1856 	    if (!pswit[OVERVIEW_SWITCH])  
  1857 		g_print("    Line %ld column %ld - Query possible scanno %s\n",
  1858 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
  1859 	}
  1860 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
  1861 	    g_free(testword);
  1862 	if (pswit[PARANOID_SWITCH] && warnings->digit)
  1863 	{
  1864 	    /* In paranoid mode, query all 0 and 1 standing alone. */
  1865 	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))
  1866 	    {
  1867 		if (pswit[ECHO_SWITCH])
  1868 		    g_print("\n%s\n",aline);
  1869 		if (!pswit[OVERVIEW_SWITCH])
  1870 		    g_print("    Line %ld column %ld - Query standalone %s\n",
  1871 		      linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
  1872 		      inword);
  1873 		else
  1874 		    cnt_word++;
  1875 	    }
  1876 	}
  1877 	g_free(inword);
  1878     }
  1879 }
  1880 
  1881 /*
  1882  * check_for_misspaced_punctuation:
  1883  *
  1884  * Look for added or missing spaces around punctuation and quotes.
  1885  * If there is a punctuation character like ! with no space on
  1886  * either side, suspect a missing!space. If there are spaces on
  1887  * both sides , assume a typo. If we see a double quote with no
  1888  * space or punctuation on either side of it, assume unspaced
  1889  * quotes "like"this.
  1890  */
  1891 void check_for_misspaced_punctuation(const char *aline,
  1892   struct parities *parities,gboolean isemptyline)
  1893 {
  1894     gboolean isacro,isellipsis;
  1895     const char *s;
  1896     gunichar c,nc,pc,n2c;
  1897     c=g_utf8_get_char(aline);
  1898     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  1899     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  1900     {
  1901 	pc=c;
  1902 	c=nc;
  1903 	nc=g_utf8_get_char(g_utf8_next_char(s));
  1904 	/* For each character in the line after the first. */
  1905 	if (g_utf8_strchr(".?!,;:_",-1,c))  /* if it's punctuation */
  1906 	{
  1907 	    /* we need to suppress warnings for acronyms like M.D. */
  1908 	    isacro=FALSE;
  1909 	    /* we need to suppress warnings for ellipsis . . . */
  1910 	    isellipsis=FALSE;
  1911 	    /*
  1912 	     * If there are letters on both sides of it or
  1913 	     * if it's strict punctuation followed by an alpha.
  1914 	     */
  1915 	    if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
  1916 	      g_utf8_strchr("?!,;:",-1,c)))
  1917 	    {
  1918 		if (c=='.')
  1919 		{
  1920 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&
  1921 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
  1922 			isacro=TRUE;
  1923 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
  1924 		    if (nc && n2c=='.')
  1925 			isacro=TRUE;
  1926 		}
  1927 		if (!isacro)
  1928 		{
  1929 		    if (pswit[ECHO_SWITCH])
  1930 			g_print("\n%s\n",aline);
  1931 		    if (!pswit[OVERVIEW_SWITCH])
  1932 			g_print("    Line %ld column %ld - Missing space?\n",
  1933 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1934 		    else
  1935 			cnt_punct++;
  1936 		}
  1937 	    }
  1938 	    if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
  1939 	    {
  1940 		/*
  1941 		 * If there are spaces on both sides,
  1942 		 * or space before and end of line.
  1943 		 */
  1944 		if (c=='.')
  1945 		{
  1946 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&
  1947 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
  1948 			isellipsis=TRUE;
  1949 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
  1950 		    if (nc && n2c=='.')
  1951 			isellipsis=TRUE;
  1952 		}
  1953 		if (!isemptyline && !isellipsis)
  1954 		{
  1955 		    if (pswit[ECHO_SWITCH])
  1956 			g_print("\n%s\n",aline);
  1957 		    if (!pswit[OVERVIEW_SWITCH])
  1958 			g_print("    Line %ld column %ld - "
  1959 			  "Spaced punctuation?\n",linecnt,
  1960 			  g_utf8_pointer_to_offset(aline,s)+1);
  1961 		    else
  1962 			cnt_punct++;
  1963 		}
  1964 	    }
  1965 	}
  1966     }
  1967     /* Split out the characters that CANNOT be preceded by space. */
  1968     c=g_utf8_get_char(aline);
  1969     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  1970     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  1971     {
  1972 	pc=c;
  1973 	c=nc;
  1974 	nc=g_utf8_get_char(g_utf8_next_char(s));
  1975 	/* for each character in the line after the first */
  1976 	if (g_utf8_strchr("?!,;:",-1,c))
  1977 	{
  1978 	    /* if it's punctuation that _cannot_ have a space before it */
  1979 	    if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
  1980 	    {
  1981 		/*
  1982 		 * If nc DOES == space,
  1983 		 * it was already reported just above.
  1984 		 */
  1985 		if (pswit[ECHO_SWITCH])
  1986 		    g_print("\n%s\n",aline);
  1987 		if (!pswit[OVERVIEW_SWITCH])
  1988 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",
  1989 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1990 		else
  1991 		    cnt_punct++;
  1992 	    }
  1993 	}
  1994     }
  1995     /*
  1996      * Special case " .X" where X is any alpha.
  1997      * This plugs a hole in the acronym code above.
  1998      * Inelegant, but maintainable.
  1999      */
  2000     c=g_utf8_get_char(aline);
  2001     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2002     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2003     {
  2004 	pc=c;
  2005 	c=nc;
  2006 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2007 	/* for each character in the line after the first */
  2008 	if (c=='.')
  2009 	{
  2010 	    /* if it's a period */
  2011 	    if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
  2012 	    {
  2013 		/*
  2014 		 * If the period follows a space and
  2015 		 * is followed by a letter.
  2016 		 */
  2017 		if (pswit[ECHO_SWITCH])
  2018 		    g_print("\n%s\n",aline);
  2019 		if (!pswit[OVERVIEW_SWITCH])
  2020 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",
  2021 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2022 		else
  2023 		    cnt_punct++;
  2024 	    }
  2025 	}
  2026     }
  2027     c=g_utf8_get_char(aline);
  2028     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2029     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2030     {
  2031 	pc=c;
  2032 	c=nc;
  2033 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2034 	/* for each character in the line after the first */
  2035 	if (c==CHAR_DQUOTE)
  2036 	{
  2037 	    if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
  2038 	      !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
  2039 	      !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
  2040 	    {
  2041 		if (pswit[ECHO_SWITCH])
  2042 		    g_print("\n%s\n",aline);
  2043 		if (!pswit[OVERVIEW_SWITCH])
  2044 		    g_print("    Line %ld column %ld - Unspaced quotes?\n",
  2045 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2046 		else
  2047 		    cnt_punct++;
  2048 	    }
  2049 	}
  2050     }
  2051     /* Check parity of quotes. */
  2052     nc=g_utf8_get_char(aline);
  2053     for (s=aline;*s;s=g_utf8_next_char(s))
  2054     {
  2055 	c=nc;
  2056 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2057 	if (c==CHAR_DQUOTE)
  2058 	{
  2059 	    parities->dquote=!parities->dquote;
  2060 	    if (!parities->dquote)
  2061 	    {
  2062 		/* parity even */
  2063 		if (!g_utf8_strchr("_-.'`/,;:!?)]} ",-1,nc))
  2064 		{
  2065 		    if (pswit[ECHO_SWITCH])
  2066 			g_print("\n%s\n",aline);
  2067 		    if (!pswit[OVERVIEW_SWITCH])
  2068 			g_print("    Line %ld column %ld - "
  2069 			  "Wrongspaced quotes?\n",
  2070 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2071 		    else
  2072 			cnt_punct++;
  2073 		}
  2074 	    }
  2075 	    else
  2076 	    {
  2077 		/* parity odd */
  2078 		if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
  2079 		  !g_utf8_strchr("_-/.'`([{$",-1,nc) || !nc)
  2080 		{
  2081 		    if (pswit[ECHO_SWITCH])
  2082 			g_print("\n%s\n",aline);
  2083 		    if (!pswit[OVERVIEW_SWITCH])
  2084 			g_print("    Line %ld column %ld - "
  2085 			  "Wrongspaced quotes?\n",
  2086 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2087 		    else
  2088 			cnt_punct++;
  2089 		}
  2090 	    }
  2091 	}
  2092     }
  2093     if (g_utf8_get_char(aline)==CHAR_DQUOTE)
  2094     {
  2095 	if (g_utf8_strchr(",;:!?)]} ",-1,
  2096 	  g_utf8_get_char(g_utf8_next_char(aline))))
  2097 	{
  2098 	    if (pswit[ECHO_SWITCH])
  2099 		g_print("\n%s\n",aline);
  2100 	    if (!pswit[OVERVIEW_SWITCH])
  2101 		g_print("    Line %ld column 1 - Wrongspaced quotes?\n",
  2102 		  linecnt);
  2103 	    else
  2104 		cnt_punct++;
  2105 	}
  2106     }
  2107     if (pswit[SQUOTE_SWITCH])
  2108     {
  2109 	nc=g_utf8_get_char(aline);
  2110 	for (s=aline;*s;s=g_utf8_next_char(s))
  2111 	{
  2112 	    c=nc;
  2113 	    nc=g_utf8_get_char(g_utf8_next_char(s));
  2114 	    if ((c==CHAR_SQUOTE || c==CHAR_OPEN_SQUOTE) && (s==aline ||
  2115 	      s>aline &&
  2116 	      !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
  2117 	      !g_unichar_isalpha(nc)))
  2118 	    {
  2119 		parities->squote=!parities->squote;
  2120 		if (!parities->squote)
  2121 		{
  2122 		    /* parity even */
  2123 		    if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
  2124 		    {
  2125 			if (pswit[ECHO_SWITCH])
  2126 			    g_print("\n%s\n",aline);
  2127 			if (!pswit[OVERVIEW_SWITCH])
  2128 			    g_print("    Line %ld column %ld - "
  2129 			      "Wrongspaced singlequotes?\n",
  2130 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2131 			else
  2132 			    cnt_punct++;
  2133 		    }
  2134 		}
  2135 		else
  2136 		{
  2137 		    /* parity odd */
  2138 		    if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
  2139 		      !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
  2140 		    {
  2141 			if (pswit[ECHO_SWITCH])
  2142 			    g_print("\n%s\n",aline);
  2143 			if (!pswit[OVERVIEW_SWITCH])
  2144 			    g_print("    Line %ld column %ld - "
  2145 			      "Wrongspaced singlequotes?\n",
  2146 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2147 			else
  2148 			    cnt_punct++;
  2149 		    }
  2150 		}
  2151 	    }
  2152 	}
  2153     }
  2154 }
  2155 
  2156 /*
  2157  * check_for_double_punctuation:
  2158  *
  2159  * Look for double punctuation like ,. or ,,
  2160  * Thanks to DW for the suggestion!
  2161  * In books with references, ".," and ".;" are common
  2162  * e.g. "etc., etc.," and vol. 1.; vol 3.;
  2163  * OTOH, from my initial tests, there are also fairly
  2164  * common errors. What to do? Make these cases paranoid?
  2165  * ".," is the most common, so warnings->dotcomma is used
  2166  * to suppress detailed reporting if it occurs often.
  2167  */
  2168 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
  2169 {
  2170     const char *s;
  2171     gunichar c,nc;
  2172     nc=g_utf8_get_char(aline);
  2173     for (s=aline;*s;s=g_utf8_next_char(s))
  2174     {
  2175 	c=nc;
  2176 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2177 	/* for each punctuation character in the line */
  2178 	if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
  2179 	  g_utf8_strchr(".?!,;:",-1,nc))
  2180 	{
  2181 	    /* followed by punctuation, it's a query, unless . . . */
  2182 	    if (c==nc && (c=='.' || c=='?' || c=='!') ||
  2183 	      !warnings->dotcomma && c=='.' && nc==',' ||
  2184 	      warnings->isFrench && g_str_has_prefix(s,",...") ||
  2185 	      warnings->isFrench && g_str_has_prefix(s,"...,") ||
  2186 	      warnings->isFrench && g_str_has_prefix(s,";...") ||
  2187 	      warnings->isFrench && g_str_has_prefix(s,"...;") ||
  2188 	      warnings->isFrench && g_str_has_prefix(s,":...") ||
  2189 	      warnings->isFrench && g_str_has_prefix(s,"...:") ||
  2190 	      warnings->isFrench && g_str_has_prefix(s,"!...") ||
  2191 	      warnings->isFrench && g_str_has_prefix(s,"...!") ||
  2192 	      warnings->isFrench && g_str_has_prefix(s,"?...") ||
  2193 	      warnings->isFrench && g_str_has_prefix(s,"...?"))
  2194 	    {
  2195 		if (warnings->isFrench && g_str_has_prefix(s,",...") ||
  2196 		  warnings->isFrench && g_str_has_prefix(s,"...,") ||
  2197 		  warnings->isFrench && g_str_has_prefix(s,";...") ||
  2198 		  warnings->isFrench && g_str_has_prefix(s,"...;") ||
  2199 		  warnings->isFrench && g_str_has_prefix(s,":...") ||
  2200 		  warnings->isFrench && g_str_has_prefix(s,"...:") ||
  2201 		  warnings->isFrench && g_str_has_prefix(s,"!...") ||
  2202 		  warnings->isFrench && g_str_has_prefix(s,"...!") ||
  2203 		  warnings->isFrench && g_str_has_prefix(s,"?...") ||
  2204 		  warnings->isFrench && g_str_has_prefix(s,"...?"))
  2205 		{
  2206 		    s+=4;
  2207 		    nc=g_utf8_get_char(g_utf8_next_char(s));
  2208 		}
  2209 		; /* do nothing for .. !! and ?? which can be legit */
  2210 	    }
  2211 	    else
  2212 	    {
  2213 		if (pswit[ECHO_SWITCH])
  2214 		    g_print("\n%s\n",aline);
  2215 		if (!pswit[OVERVIEW_SWITCH])
  2216 		    g_print("    Line %ld column %ld - Double punctuation?\n",
  2217 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2218 		else
  2219 		    cnt_punct++;
  2220 	    }
  2221 	}
  2222     }
  2223 }
  2224 
  2225 /*
  2226  * check_for_spaced_quotes:
  2227  */
  2228 void check_for_spaced_quotes(const char *aline)
  2229 {
  2230     const char *s,*t;
  2231     s=aline;
  2232     while ((t=strstr(s," \" ")))
  2233     {
  2234 	if (pswit[ECHO_SWITCH])
  2235 	    g_print("\n%s\n",aline);
  2236 	if (!pswit[OVERVIEW_SWITCH])
  2237 	    g_print("    Line %ld column %ld - Spaced doublequote?\n",
  2238 	      linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  2239 	else
  2240 	    cnt_punct++;
  2241 	s=g_utf8_next_char(g_utf8_next_char(t));
  2242     }
  2243     s=aline;
  2244     while ((t=strstr(s," ' ")))
  2245     {
  2246 	if (pswit[ECHO_SWITCH])
  2247 	    g_print("\n%s\n",aline);
  2248 	if (!pswit[OVERVIEW_SWITCH])
  2249 	    g_print("    Line %ld column %ld - Spaced singlequote?\n",
  2250 	      linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  2251 	else
  2252 	    cnt_punct++;
  2253 	s=g_utf8_next_char(g_utf8_next_char(t));
  2254     }
  2255     s=aline;
  2256     while ((t=strstr(s," ` ")))
  2257     {
  2258 	if (pswit[ECHO_SWITCH])
  2259 	    g_print("\n%s\n",aline);
  2260 	if (!pswit[OVERVIEW_SWITCH])
  2261 	    g_print("    Line %ld column %ld - Spaced singlequote?\n",
  2262 	      linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  2263 	else
  2264 	    cnt_punct++;
  2265 	s=g_utf8_next_char(g_utf8_next_char(t));
  2266     }
  2267 }
  2268 
  2269 /*
  2270  * check_for_miscased_genative:
  2271  *
  2272  * Check special case of 'S instead of 's at end of word.
  2273  */
  2274 void check_for_miscased_genative(const char *aline)
  2275 {
  2276     const char *s;
  2277     gunichar c,nc,pc;
  2278     if (!*aline)
  2279 	return;
  2280     c=g_utf8_get_char(aline);
  2281     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2282     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2283     {
  2284 	pc=c;
  2285 	c=nc;
  2286 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2287 	if (c==CHAR_SQUOTE && nc=='S' && g_unichar_islower(pc))
  2288 	{
  2289 	    if (pswit[ECHO_SWITCH])
  2290 		g_print("\n%s\n",aline);
  2291 	    if (!pswit[OVERVIEW_SWITCH])
  2292 		g_print("    Line %ld column %ld - Capital \"S\"?\n",
  2293 		  linecnt,g_utf8_pointer_to_offset(aline,s)+2);
  2294 	    else
  2295 		cnt_punct++;
  2296 	}
  2297     }
  2298 }
  2299 
  2300 /*
  2301  * check_end_of_line:
  2302  *
  2303  * Now check special cases - start and end of line -
  2304  * for single and double quotes. Start is sometimes [sic]
  2305  * but better to query it anyway.
  2306  * While we're here, check for dash at end of line.
  2307  */
  2308 void check_end_of_line(const char *aline,struct warnings *warnings)
  2309 {
  2310     int lbytes;
  2311     const char *s;
  2312     gunichar c1,c2;
  2313     lbytes=strlen(aline);
  2314     if (g_utf8_strlen(aline,lbytes)>1)
  2315     {
  2316 	s=g_utf8_prev_char(aline+lbytes);
  2317 	c1=g_utf8_get_char(s);
  2318 	c2=g_utf8_get_char(g_utf8_prev_char(s));
  2319 	if ((c1==CHAR_DQUOTE || c1==CHAR_SQUOTE || c1==CHAR_OPEN_SQUOTE) &&
  2320 	  c2==CHAR_SPACE)
  2321 	{
  2322 	    if (pswit[ECHO_SWITCH])
  2323 		g_print("\n%s\n",aline);
  2324 	    if (!pswit[OVERVIEW_SWITCH])
  2325 		g_print("    Line %ld column %ld - Spaced quote?\n",linecnt,
  2326 		  g_utf8_strlen(aline,lbytes));
  2327 	    else
  2328 		cnt_punct++;
  2329 	}
  2330 	c1=g_utf8_get_char(aline);
  2331 	c2=g_utf8_get_char(g_utf8_next_char(aline));
  2332 	if ((c1==CHAR_SQUOTE || c1==CHAR_OPEN_SQUOTE) && c2==CHAR_SPACE)
  2333 	{
  2334 	    if (pswit[ECHO_SWITCH])
  2335 		g_print("\n%s\n",aline);
  2336 	    if (!pswit[OVERVIEW_SWITCH])
  2337 		g_print("    Line %ld column 1 - Spaced quote?\n",linecnt);
  2338 	    else
  2339 		cnt_punct++;
  2340 	}
  2341 	/*
  2342 	 * Dash at end of line may well be legit - paranoid mode only
  2343 	 * and don't report em-dash at line-end.
  2344 	 */
  2345 	if (pswit[PARANOID_SWITCH] && warnings->hyphen)
  2346 	{
  2347 	    for (s=g_utf8_prev_char(aline+lbytes);
  2348 	      s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
  2349 		;
  2350 	    if (g_utf8_get_char(s)=='-' &&
  2351 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')
  2352 	    {
  2353 		if (pswit[ECHO_SWITCH])
  2354 		    g_print("\n%s\n",aline);
  2355 		if (!pswit[OVERVIEW_SWITCH])
  2356 		    g_print("    Line %ld column %ld - "
  2357 		      "Hyphen at end of line?\n",
  2358 		      linecnt,g_utf8_pointer_to_offset(aline,s));
  2359 	    }
  2360 	}
  2361     }
  2362 }
  2363 
  2364 /*
  2365  * check_for_unspaced_bracket:
  2366  *
  2367  * Brackets are often unspaced, but shouldn't be surrounded by alpha.
  2368  * If so, suspect a scanno like "a]most".
  2369  */
  2370 void check_for_unspaced_bracket(const char *aline)
  2371 {
  2372     const char *s;
  2373     gunichar c,nc,pc;
  2374     c=g_utf8_get_char(aline);
  2375     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2376     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2377     {
  2378 	pc=c;
  2379 	c=nc;
  2380 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2381 	if (!nc)
  2382 	    break;
  2383 	/* for each bracket character in the line except 1st & last */
  2384 	if (g_utf8_strchr("{[()]}",-1,c) &&
  2385 	  g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
  2386 	{
  2387 	    if (pswit[ECHO_SWITCH])
  2388 		g_print("\n%s\n",aline);
  2389 	    if (!pswit[OVERVIEW_SWITCH])
  2390 		g_print("    Line %ld column %ld - Unspaced bracket?\n",
  2391 		  linecnt,g_utf8_pointer_to_offset(aline,s));
  2392 	    else
  2393 		cnt_punct++;
  2394 	}
  2395     }
  2396 }
  2397 
  2398 /*
  2399  * check_for_unpunctuated_endquote:
  2400  */
  2401 void check_for_unpunctuated_endquote(const char *aline)
  2402 {
  2403     const char *s;
  2404     gunichar c,nc,pc;
  2405     c=g_utf8_get_char(aline);
  2406     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2407     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2408     {
  2409 	pc=c;
  2410 	c=nc;
  2411 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2412 	/* for each character in the line except 1st */
  2413 	if (c==CHAR_DQUOTE && isalpha(pc))
  2414 	{
  2415 	    if (pswit[ECHO_SWITCH])
  2416 		g_print("\n%s\n",aline);
  2417 	    if (!pswit[OVERVIEW_SWITCH])
  2418 		g_print("    Line %ld column %ld - "
  2419 		  "endquote missing punctuation?\n",
  2420 		  linecnt,g_utf8_pointer_to_offset(aline,s));
  2421 	    else
  2422 		cnt_punct++;
  2423 	}
  2424     }
  2425 }
  2426 
  2427 /*
  2428  * check_for_html_tag:
  2429  *
  2430  * Check for <HTML TAG>.
  2431  *
  2432  * If there is a < in the line, followed at some point
  2433  * by a > then we suspect HTML.
  2434  */
  2435 void check_for_html_tag(const char *aline)
  2436 {
  2437     const char *open,*close;
  2438     gchar *tag;
  2439     open=strchr(aline,'<');
  2440     if (open)
  2441     {
  2442 	close=strchr(g_utf8_next_char(open),'>');
  2443 	if (close)
  2444 	{
  2445 	    if (pswit[ECHO_SWITCH])
  2446 		g_print("\n%s\n",aline);
  2447 	    if (!pswit[OVERVIEW_SWITCH])
  2448 	    {
  2449 		tag=g_strndup(open,close-open+1);
  2450 		g_print("    Line %ld column %ld - HTML Tag? %s \n",
  2451 		  linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
  2452 		g_free(tag);
  2453 	    }
  2454 	    else
  2455 		cnt_html++;
  2456 	}
  2457     }
  2458 }
  2459 
  2460 /*
  2461  * check_for_html_entity:
  2462  *
  2463  * Check for &symbol; HTML.
  2464  *
  2465  * If there is a & in the line, followed at
  2466  * some point by a ; then we suspect HTML.
  2467  */
  2468 void check_for_html_entity(const char *aline)
  2469 {
  2470     const char *s,*amp,*scolon;
  2471     gchar *entity;
  2472     amp=strchr(aline,'&');
  2473     if (amp)
  2474     {
  2475 	scolon=strchr(amp,';');
  2476 	if (scolon)
  2477 	{
  2478 	    for (s=amp;s<scolon;s=g_utf8_next_char(s))   
  2479 		if (g_utf8_get_char(s)==CHAR_SPACE)
  2480 		    break;		/* Don't report "Jones & Son;" */
  2481 	    if (s>=scolon)
  2482 	    {
  2483 		if (pswit[ECHO_SWITCH])
  2484 		    g_print("\n%s\n",aline);
  2485 		if (!pswit[OVERVIEW_SWITCH])
  2486 		{
  2487 		    entity=g_strndup(amp,scolon-amp+1);
  2488 		    g_print("    Line %ld column %d - HTML symbol? %s \n",
  2489 		      linecnt,(int)(amp-aline)+1,entity);
  2490 		    g_free(entity);
  2491 		}
  2492 		else
  2493 		    cnt_html++;
  2494 	    }
  2495 	}
  2496     }
  2497 }
  2498 
  2499 /*
  2500  * print_pending:
  2501  *
  2502  * If we are in a state of unbalanced quotes, and this line
  2503  * doesn't begin with a quote, output the stored error message.
  2504  * If the -P switch was used, print the warning even if the
  2505  * new para starts with quotes.
  2506  */
  2507 void print_pending(const char *aline,const char *parastart,
  2508   struct pending *pending)
  2509 {
  2510     const char *s;
  2511     gunichar c;
  2512     s=aline;
  2513     while (*s==' ')
  2514 	s++;
  2515     c=g_utf8_get_char(s);
  2516     if (pending->dquote)
  2517     {
  2518 	if (c!=CHAR_DQUOTE || pswit[QPARA_SWITCH])
  2519 	{
  2520 	    if (!pswit[OVERVIEW_SWITCH])
  2521 	    {
  2522 		if (pswit[ECHO_SWITCH])
  2523 		    g_print("\n%s\n",parastart);
  2524 		g_print("%s\n",pending->dquote);
  2525 	    }
  2526 	    else
  2527 		cnt_dquot++;
  2528 	}
  2529 	g_free(pending->dquote);
  2530 	pending->dquote=NULL;
  2531     }
  2532     if (pending->squote)
  2533     {
  2534 	if (c!=CHAR_SQUOTE && c!=CHAR_OPEN_SQUOTE || pswit[QPARA_SWITCH] ||
  2535 	  pending->squot)
  2536 	{
  2537 	    if (!pswit[OVERVIEW_SWITCH])
  2538 	    {
  2539 		if (pswit[ECHO_SWITCH])
  2540 		    g_print("\n%s\n",parastart);
  2541 		g_print("%s\n",pending->squote);
  2542 	    }
  2543 	    else
  2544 		cnt_squot++;
  2545 	}
  2546 	g_free(pending->squote);
  2547 	pending->squote=NULL;
  2548     }
  2549     if (pending->rbrack)
  2550     {
  2551 	if (!pswit[OVERVIEW_SWITCH])
  2552 	{
  2553 	    if (pswit[ECHO_SWITCH])
  2554 		g_print("\n%s\n",parastart);
  2555 	    g_print("%s\n",pending->rbrack);
  2556 	}
  2557 	else
  2558 	    cnt_brack++;
  2559 	g_free(pending->rbrack);
  2560 	pending->rbrack=NULL;
  2561     }
  2562     if (pending->sbrack)
  2563     {
  2564 	if (!pswit[OVERVIEW_SWITCH])
  2565 	{
  2566 	    if (pswit[ECHO_SWITCH])
  2567 		g_print("\n%s\n",parastart);
  2568 	    g_print("%s\n",pending->sbrack);
  2569 	}
  2570 	else
  2571 	    cnt_brack++;
  2572 	g_free(pending->sbrack);
  2573 	pending->sbrack=NULL;
  2574     }
  2575     if (pending->cbrack)
  2576     {
  2577 	if (!pswit[OVERVIEW_SWITCH])
  2578 	{
  2579 	    if (pswit[ECHO_SWITCH])
  2580 		g_print("\n%s\n",parastart);
  2581 	    g_print("%s\n",pending->cbrack);
  2582 	}
  2583 	else
  2584 	    cnt_brack++;
  2585 	g_free(pending->cbrack);
  2586 	pending->cbrack=NULL;
  2587     }
  2588     if (pending->unders)
  2589     {
  2590 	if (!pswit[OVERVIEW_SWITCH])
  2591 	{
  2592 	    if (pswit[ECHO_SWITCH])
  2593 		g_print("\n%s\n",parastart);
  2594 	    g_print("%s\n",pending->unders);
  2595 	}
  2596 	else
  2597 	    cnt_brack++;
  2598 	g_free(pending->unders);
  2599 	pending->unders=NULL;
  2600     }
  2601 }
  2602 
  2603 /*
  2604  * check_for_mismatched_quotes:
  2605  *
  2606  * At end of paragraph, check for mismatched quotes.
  2607  *
  2608  * We don't want to report an error immediately, since it is a
  2609  * common convention to omit the quotes at end of paragraph if
  2610  * the next paragraph is a continuation of the same speaker.
  2611  * Where this is the case, the next para should begin with a
  2612  * quote, so we store the warning message and only display it
  2613  * at the top of the next iteration if the new para doesn't
  2614  * start with a quote.
  2615  * The -p switch overrides this default, and warns of unclosed
  2616  * quotes on _every_ paragraph, whether the next begins with a
  2617  * quote or not.
  2618  */
  2619 void check_for_mismatched_quotes(const struct counters *counters,
  2620   struct pending *pending)
  2621 {
  2622     if (counters->quot%2)
  2623 	pending->dquote=
  2624 	  g_strdup_printf("    Line %ld - Mismatched quotes",linecnt);
  2625     if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&
  2626       counters->open_single_quote!=counters->close_single_quote)
  2627 	pending->squote=
  2628 	  g_strdup_printf("    Line %ld - Mismatched singlequotes?",linecnt);
  2629     if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&
  2630       counters->open_single_quote!=counters->close_single_quote &&
  2631       counters->open_single_quote!=counters->close_single_quote+1)
  2632 	/*
  2633 	 * Flag it to be noted regardless of the
  2634 	 * first char of the next para.
  2635 	 */
  2636 	pending->squot=1;
  2637     if (counters->r_brack)
  2638 	pending->rbrack=
  2639 	  g_strdup_printf("    Line %ld - Mismatched round brackets?",linecnt);
  2640     if (counters->s_brack)
  2641 	pending->sbrack=
  2642 	  g_strdup_printf("    Line %ld - Mismatched square brackets?",linecnt);
  2643     if (counters->c_brack)
  2644 	pending->cbrack=
  2645 	  g_strdup_printf("    Line %ld - Mismatched curly brackets?",linecnt);
  2646     if (counters->c_unders%2)
  2647 	pending->unders=
  2648 	  g_strdup_printf("    Line %ld - Mismatched underscores?",linecnt);
  2649 }
  2650 
  2651 /*
  2652  * check_for_omitted_punctuation:
  2653  *
  2654  * Check for omitted punctuation at end of paragraph by working back
  2655  * through prevline. DW.
  2656  * Need to check this only for "normal" paras.
  2657  * So what is a "normal" para?
  2658  *    Not normal if one-liner (chapter headings, etc.)
  2659  *    Not normal if doesn't contain at least one locase letter
  2660  *    Not normal if starts with space
  2661  */
  2662 void check_for_omitted_punctuation(const char *prevline,
  2663   struct line_properties *last,int start_para_line)
  2664 {
  2665     gboolean letter_on_line=FALSE;
  2666     const char *s;
  2667     for (s=prevline;*s;s=g_utf8_next_char(s))
  2668 	if (g_unichar_isalpha(g_utf8_get_char(s)))
  2669 	{
  2670 	    letter_on_line=TRUE;
  2671 	    break;
  2672 	}
  2673     /*
  2674      * This next "if" is a problem.
  2675      * If we say "start_para_line <= linecnt - 1", that includes
  2676      * one-line "paragraphs" like chapter heads. Lotsa false positives.
  2677      * If we say "start_para_line < linecnt - 1" it doesn't, but then it
  2678      * misses genuine one-line paragraphs.
  2679      */
  2680     if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
  2681       g_utf8_get_char(prevline)>CHAR_SPACE)
  2682     {
  2683 	for (s=g_utf8_prev_char(prevline+strlen(prevline));
  2684 	  (g_utf8_get_char(s)==CHAR_DQUOTE ||
  2685 	  g_utf8_get_char(s)==CHAR_SQUOTE) &&
  2686 	  g_utf8_get_char(s)>CHAR_SPACE && s>prevline;
  2687 	  s=g_utf8_prev_char(s))
  2688 	    ;
  2689 	for (;s>prevline;s=g_utf8_prev_char(s))
  2690 	{
  2691 	    if (g_unichar_isalpha(g_utf8_get_char(s)))
  2692 	    {
  2693 		if (pswit[ECHO_SWITCH])
  2694 		    g_print("\n%s\n",prevline);
  2695 		if (!pswit[OVERVIEW_SWITCH])
  2696 		    g_print("    Line %ld column %ld - "
  2697 		      "No punctuation at para end?\n",
  2698 		      linecnt-1,g_utf8_strlen(prevline,-1));
  2699 		else
  2700 		    cnt_punct++;
  2701 		break;
  2702 	    }
  2703 	    if (g_utf8_strchr("-.:!([{?}])",-1,g_utf8_get_char(s)))
  2704 		break;
  2705 	}
  2706     }
  2707 }
  2708 
  2709 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
  2710 {
  2711     const char *word=key;
  2712     int *dupcnt=value;
  2713     if (*dupcnt)
  2714 	g_print("\nNote: Queried word %s was duplicated %d times\n",
  2715 	  word,*dupcnt);
  2716     return FALSE;
  2717 }
  2718 
  2719 void print_as_windows_1252(const char *string)
  2720 {
  2721     gsize inbytes,outbytes;
  2722     gchar *buf,*bp;
  2723     GIConv converter=(GIConv)-1;
  2724     if (!string)
  2725     {
  2726 	if (converter!=(GIConv)-1)
  2727 	    g_iconv_close(converter);
  2728 	converter=(GIConv)-1;
  2729 	return;
  2730     }
  2731     if (converter=(GIConv)-1)
  2732 	converter=g_iconv_open("WINDOWS-1252","UTF-8");
  2733     if (converter!=(GIConv)-1)
  2734     {
  2735 	inbytes=outbytes=strlen(string);
  2736 	bp=buf=g_malloc(outbytes+1);
  2737 	g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
  2738 	*bp='\0';
  2739 	fputs(buf,stdout);
  2740 	g_free(buf);
  2741     }
  2742     else
  2743 	fputs(string,stdout);
  2744 }
  2745 
  2746 /*
  2747  * procfile:
  2748  *
  2749  * Process one file.
  2750  */
  2751 void procfile(const char *filename)
  2752 {
  2753     const char *s;
  2754     gchar *parastart=NULL;	/* first line of current para */
  2755     gchar *etext,*aline;
  2756     gchar *etext_ptr;
  2757     GError *err=NULL;
  2758     struct first_pass_results *first_pass_results;
  2759     struct warnings *warnings;
  2760     struct counters counters={0};
  2761     struct line_properties last={0};
  2762     struct parities parities={0};
  2763     struct pending pending={0};
  2764     gboolean isemptyline;
  2765     long start_para_line=0;
  2766     gboolean isnewpara=FALSE,enddash=FALSE;
  2767     last.start=CHAR_SPACE;
  2768     linecnt=checked_linecnt=0;
  2769     etext=read_etext(filename,&err);
  2770     if (!etext)
  2771     {
  2772 	if (pswit[STDOUT_SWITCH])
  2773 	    fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
  2774 	else
  2775 	    fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
  2776 	exit(1);
  2777     }
  2778     g_set_print_handler(print_as_windows_1252);
  2779     g_print("\n\nFile: %s\n\n",filename);
  2780     first_pass_results=first_pass(etext);
  2781     warnings=report_first_pass(first_pass_results);
  2782     qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
  2783     qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
  2784     /*
  2785      * Here we go with the main pass. Hold onto yer hat!
  2786      */
  2787     linecnt=0;
  2788     etext_ptr=etext;
  2789     while ((aline=flgets(&etext_ptr,linecnt+1)))
  2790     {
  2791 	linecnt++;
  2792 	if (linecnt==1)
  2793 	    isnewpara=TRUE;
  2794 	if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
  2795 	    continue;    // skip DP page separators completely
  2796 	if (linecnt<first_pass_results->firstline ||
  2797 	  (first_pass_results->footerline>0 &&
  2798 	  linecnt>first_pass_results->footerline))
  2799 	{
  2800 	    if (pswit[HEADER_SWITCH])
  2801 	    {
  2802 		if (g_str_has_prefix(aline,"Title:"))
  2803 		    g_print("    %s\n",aline);
  2804 		if (g_str_has_prefix(aline,"Author:"))
  2805 		    g_print("    %s\n",aline);
  2806 		if (g_str_has_prefix(aline,"Release Date:"))
  2807 		    g_print("    %s\n",aline);
  2808 		if (g_str_has_prefix(aline,"Edition:"))
  2809 		    g_print("    %s\n\n",aline);
  2810 	    }
  2811 	    continue;		/* skip through the header */
  2812 	}
  2813 	checked_linecnt++;
  2814 	print_pending(aline,parastart,&pending);
  2815 	memset(&pending,0,sizeof(pending));
  2816 	isemptyline=analyse_quotes(aline,&counters);
  2817 	if (isnewpara && !isemptyline)
  2818 	{
  2819 	    /* This line is the start of a new paragraph. */
  2820 	    start_para_line=linecnt;
  2821 	    /* Capture its first line in case we want to report it later. */
  2822 	    g_free(parastart);
  2823 	    parastart=g_strdup(aline);
  2824 	    memset(&parities,0,sizeof(parities));  /* restart the quote count */
  2825 	    s=aline;
  2826 	    while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
  2827 	      !g_unichar_isdigit(g_utf8_get_char(s)))
  2828 		s=g_utf8_next_char(s);
  2829 	    if (g_unichar_islower(g_utf8_get_char(s)))
  2830 	    {
  2831 		/* and its first letter is lowercase */
  2832 		if (pswit[ECHO_SWITCH])
  2833 		    g_print("\n%s\n",aline);
  2834 		if (!pswit[OVERVIEW_SWITCH])
  2835 		    g_print("    Line %ld column %ld - "
  2836 		      "Paragraph starts with lower-case\n",
  2837 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2838 		else
  2839 		    cnt_punct++;
  2840 	    }
  2841 	    isnewpara=FALSE; /* Signal the end of new para processing. */
  2842 	}
  2843 	/* Check for an em-dash broken at line end. */
  2844 	if (enddash && g_utf8_get_char(aline)=='-')
  2845 	{
  2846 	    if (pswit[ECHO_SWITCH])
  2847 		g_print("\n%s\n",aline);
  2848 	    if (!pswit[OVERVIEW_SWITCH])
  2849 		g_print("    Line %ld column 1 - Broken em-dash?\n",linecnt);
  2850 	    else
  2851 		cnt_punct++;
  2852 	}
  2853 	enddash=FALSE;
  2854 	for (s=g_utf8_prev_char(aline+strlen(aline));
  2855 	  g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
  2856 	    ;
  2857 	if (s>=aline && g_utf8_get_char(s)=='-')
  2858 	    enddash=TRUE;
  2859 	check_for_control_characters(aline);
  2860 	if (warnings->bin)
  2861 	    check_for_odd_characters(aline,warnings,isemptyline);
  2862 	if (warnings->longline)
  2863 	    check_for_long_line(aline);
  2864 	if (warnings->shortline)
  2865 	    check_for_short_line(aline,&last);
  2866 	last.blen=last.len;
  2867 	last.len=g_utf8_strlen(aline,-1);
  2868 	last.start=g_utf8_get_char(aline);
  2869 	check_for_starting_punctuation(aline);
  2870 	if (warnings->dash)
  2871 	{
  2872 	    check_for_spaced_emdash(aline);
  2873 	    check_for_spaced_dash(aline);
  2874 	}
  2875 	check_for_unmarked_paragraphs(aline);
  2876 	check_for_jeebies(aline);
  2877 	check_for_mta_from(aline);
  2878 	check_for_orphan_character(aline);
  2879 	check_for_pling_scanno(aline);
  2880 	check_for_extra_period(aline,warnings);
  2881 	check_for_following_punctuation(aline);
  2882 	check_for_typos(aline,warnings);
  2883 	check_for_misspaced_punctuation(aline,&parities,isemptyline);
  2884 	check_for_double_punctuation(aline,warnings);
  2885 	check_for_spaced_quotes(aline);
  2886 	check_for_miscased_genative(aline);
  2887 	check_end_of_line(aline,warnings);
  2888 	check_for_unspaced_bracket(aline);
  2889 	if (warnings->endquote)
  2890 	    check_for_unpunctuated_endquote(aline);
  2891 	check_for_html_tag(aline);
  2892 	check_for_html_entity(aline);
  2893 	if (isemptyline)
  2894 	{
  2895 	    check_for_mismatched_quotes(&counters,&pending);
  2896 	    memset(&counters,0,sizeof(counters));
  2897 	    /* let the next iteration know that it's starting a new para */
  2898 	    isnewpara=TRUE;
  2899 	    if (prevline)
  2900 		check_for_omitted_punctuation(prevline,&last,start_para_line);
  2901 	}
  2902 	g_free(prevline);
  2903 	prevline=g_strdup(aline);
  2904     }
  2905     if (prevline)
  2906     {
  2907 	g_free(prevline);
  2908 	prevline=NULL;
  2909     }
  2910     g_free(parastart);
  2911     g_free(prevline);
  2912     g_free(etext);
  2913     if (!pswit[OVERVIEW_SWITCH])
  2914 	g_tree_foreach(qword,report_duplicate_queries,NULL);
  2915     g_tree_unref(qword);
  2916     g_tree_unref(qperiod);
  2917     g_set_print_handler(NULL);
  2918     print_as_windows_1252(NULL);
  2919 }
  2920 
  2921 /*
  2922  * flgets:
  2923  *
  2924  * Get one line from the input text, checking for
  2925  * the existence of exactly one CR/LF line-end per line.
  2926  *
  2927  * Returns: a pointer to the line.
  2928  */
  2929 char *flgets(char **etext,long lcnt)
  2930 {
  2931     gunichar c;
  2932     gboolean isCR=FALSE;
  2933     char *theline=*etext;
  2934     char *eos=theline;
  2935     gchar *s;
  2936     for (;;)
  2937     {
  2938 	c=g_utf8_get_char(*etext);
  2939 	*etext=g_utf8_next_char(*etext);
  2940 	if (!c)
  2941 	    return NULL;
  2942 	/* either way, it's end of line */
  2943 	if (c=='\n')
  2944 	{
  2945 	    if (isCR)
  2946 		break;
  2947 	    else
  2948 	    {
  2949 		/* Error - a LF without a preceding CR */
  2950 		if (pswit[LINE_END_SWITCH])
  2951 		{
  2952 		    if (pswit[ECHO_SWITCH])
  2953 		    {
  2954 			s=g_strndup(theline,eos-theline);
  2955 			g_print("\n%s\n",s);
  2956 			g_free(s);
  2957 		    }
  2958 		    if (!pswit[OVERVIEW_SWITCH])
  2959 			g_print("    Line %ld - No CR?\n",lcnt);
  2960 		    else
  2961 			cnt_lineend++;
  2962 		}
  2963 		break;
  2964 	    }
  2965 	}
  2966 	if (c=='\r')
  2967 	{
  2968 	    if (isCR)
  2969 	    {
  2970 		/* Error - two successive CRs */
  2971 		if (pswit[LINE_END_SWITCH])
  2972 		{
  2973 		    if (pswit[ECHO_SWITCH])
  2974 		    {
  2975 			s=g_strndup(theline,eos-theline);
  2976 			g_print("\n%s\n",s);
  2977 			g_free(s);
  2978 		    }
  2979 		    if (!pswit[OVERVIEW_SWITCH])
  2980 			g_print("    Line %ld - Two successive CRs?\n",lcnt);
  2981 		    else
  2982 			cnt_lineend++;
  2983 		}
  2984 	    }
  2985 	    isCR=TRUE;
  2986 	}
  2987 	else
  2988 	{
  2989 	    if (pswit[LINE_END_SWITCH] && isCR)
  2990 	    {
  2991 		if (pswit[ECHO_SWITCH])
  2992 		{
  2993 		    s=g_strndup(theline,eos-theline);
  2994 		    g_print("\n%s\n",s);
  2995 		    g_free(s);
  2996 		}
  2997 		if (!pswit[OVERVIEW_SWITCH])
  2998 		    g_print("    Line %ld column %ld - CR without LF?\n",
  2999 		      lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
  3000 		else
  3001 		    cnt_lineend++;
  3002 		*eos=' ';
  3003 	    }
  3004 	    isCR=FALSE;
  3005 	    eos=g_utf8_next_char(eos);
  3006 	}
  3007     }
  3008     *eos='\0';
  3009     if (pswit[MARKUP_SWITCH])  
  3010 	postprocess_for_HTML(theline);
  3011     if (pswit[DP_SWITCH])  
  3012 	postprocess_for_DP(theline);
  3013     return theline;
  3014 }
  3015 
  3016 /*
  3017  * mixdigit:
  3018  *
  3019  * Takes a "word" as a parameter, and checks whether it
  3020  * contains a mixture of alpha and digits. Generally, this is an
  3021  * error, but may not be for cases like 4th or L5 12s. 3d.
  3022  *
  3023  * Returns: TRUE iff an is error found.
  3024  */
  3025 gboolean mixdigit(const char *checkword)
  3026 {
  3027     gboolean wehaveadigit,wehavealetter,query;
  3028     const char *s,*nondigit;
  3029     wehaveadigit=wehavealetter=query=FALSE;
  3030     for (s=checkword;*s;s=g_utf8_next_char(s))
  3031 	if (g_unichar_isalpha(g_utf8_get_char(s)))
  3032 	    wehavealetter=TRUE;
  3033 	else if (g_unichar_isdigit(g_utf8_get_char(s)))
  3034 	    wehaveadigit=TRUE;
  3035     if (wehaveadigit && wehavealetter)
  3036     {
  3037 	/* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
  3038 	query=TRUE;
  3039 	for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
  3040 	  nondigit=g_utf8_next_char(nondigit))
  3041 	    ;
  3042 	/* digits, ending in st, rd, nd, th of either case */
  3043 	if (!g_ascii_strcasecmp(nondigit,"st") ||
  3044 	  !g_ascii_strcasecmp(nondigit,"rd") ||
  3045 	  !g_ascii_strcasecmp(nondigit,"nd") ||
  3046 	  !g_ascii_strcasecmp(nondigit,"th"))
  3047 	    query=FALSE;
  3048 	if (!g_ascii_strcasecmp(nondigit,"sts") ||
  3049 	  !g_ascii_strcasecmp(nondigit,"rds") ||
  3050 	  !g_ascii_strcasecmp(nondigit,"nds") ||
  3051 	  !g_ascii_strcasecmp(nondigit,"ths"))
  3052 	    query=FALSE;
  3053 	if (!g_ascii_strcasecmp(nondigit,"stly") ||
  3054 	  !g_ascii_strcasecmp(nondigit,"rdly") ||
  3055 	  !g_ascii_strcasecmp(nondigit,"ndly") ||
  3056 	  !g_ascii_strcasecmp(nondigit,"thly"))
  3057 	    query=FALSE;
  3058 	/* digits, ending in l, L, s or d */
  3059 	if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
  3060 	  !strcmp(nondigit,"d"))
  3061 	    query=FALSE;
  3062 	/*
  3063 	 * L at the start of a number, representing Britsh pounds, like L500.
  3064 	 * This is cute. We know the current word is mixed digit. If the first
  3065 	 * letter is L, there must be at least one digit following. If both
  3066 	 * digits and letters follow, we have a genuine error, else we have a
  3067 	 * capital L followed by digits, and we accept that as a non-error.
  3068 	 */
  3069 	if (g_utf8_get_char(checkword)=='L' &&
  3070 	  !mixdigit(g_utf8_next_char(checkword)))
  3071 	    query=FALSE;
  3072     }
  3073     return query;
  3074 }
  3075 
  3076 /*
  3077  * getaword:
  3078  *
  3079  * Extracts the first/next "word" from the line, and returns it.
  3080  * A word is defined as one English word unit--or at least that's the aim.
  3081  * "ptr" is advanced to the position in the line where we will start
  3082  * looking for the next word.
  3083  *
  3084  * Returns: A newly-allocated string.
  3085  */
  3086 gchar *getaword(const char **ptr)
  3087 {
  3088     const char *s,*t;
  3089     GString *word;
  3090     gunichar c,pc;
  3091     word=g_string_new(NULL);
  3092     for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
  3093       !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
  3094       **ptr;*ptr=g_utf8_next_char(*ptr))
  3095 	;
  3096     /*
  3097      * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
  3098      * Especially yucky is the case of L1,000
  3099      * This section looks for a pattern of characters including a digit
  3100      * followed by a comma or period followed by one or more digits.
  3101      * If found, it returns this whole pattern as a word; otherwise we discard
  3102      * the results and resume our normal programming.
  3103      */
  3104     s=*ptr;
  3105     for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
  3106       g_unichar_isalpha(g_utf8_get_char(s)) ||
  3107       g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
  3108 	g_string_append_unichar(word,g_utf8_get_char(s));
  3109     for (t=g_utf8_next_char(word->str);*g_utf8_next_char(t);
  3110       t=g_utf8_next_char(t))
  3111     {
  3112 	c=g_utf8_get_char(t);
  3113 	pc=g_utf8_get_char(g_utf8_prev_char(t));
  3114 	if ((c=='.' || c==',') && g_unichar_isdigit(pc))
  3115 	{
  3116 	    *ptr=s;
  3117 	    return g_string_free(word,FALSE);
  3118 	}
  3119     }
  3120     /* we didn't find a punctuated number - do the regular getword thing */
  3121     g_string_truncate(word,0);
  3122     for (;g_unichar_isdigit(g_utf8_get_char(*ptr)) ||
  3123       g_unichar_isalpha(g_utf8_get_char(*ptr)) ||
  3124       g_utf8_get_char(*ptr)=='\'';*ptr=g_utf8_next_char(*ptr))
  3125 	g_string_append_unichar(word,g_utf8_get_char(*ptr));
  3126     return g_string_free(word,FALSE);
  3127 }
  3128 
  3129 /*
  3130  * isroman:
  3131  *
  3132  * Is this word a Roman Numeral?
  3133  *
  3134  * It doesn't actually validate that the number is a valid Roman Numeral--for
  3135  * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
  3136  * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
  3137  * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
  3138  * expressions thereof, except when it came to taxes. Allow any number of M,
  3139  * an optional D, an optional CM or CD, any number of optional Cs, an optional
  3140  * XL or an optional XC, an optional IX or IV, an optional V and any number
  3141  * of optional Is.
  3142  */
  3143 gboolean isroman(const char *t)
  3144 {
  3145     const char *s;
  3146     if (!t || !*t)
  3147 	return FALSE;
  3148     s=t;
  3149     while (g_utf8_get_char(t)=='m' && *t)
  3150 	t++;
  3151     if (g_utf8_get_char(t)=='d')
  3152 	t++;
  3153     if (g_str_has_prefix(t,"cm"))
  3154 	t+=2;
  3155     if (g_str_has_prefix(t,"cd"))
  3156 	t+=2;
  3157     while (g_utf8_get_char(t)=='c' && *t)
  3158 	t++;
  3159     if (g_str_has_prefix(t,"xl"))
  3160 	t+=2;
  3161     if (g_str_has_prefix(t,"xc"))
  3162 	t+=2;
  3163     if (g_utf8_get_char(t)=='l')
  3164 	t++;
  3165     while (g_utf8_get_char(t)=='x' && *t)
  3166 	t++;
  3167     if (g_str_has_prefix(t,"ix"))
  3168 	t+=2;
  3169     if (g_str_has_prefix(t,"iv"))
  3170 	t+=2;
  3171     if (g_utf8_get_char(t)=='v')
  3172 	t++;
  3173     while (g_utf8_get_char(t)=='i' && *t)
  3174 	t++;
  3175     return !*t;
  3176 }
  3177 
  3178 /*
  3179  * postprocess_for_DP:
  3180  *
  3181  * Invoked with the -d switch from flgets().
  3182  * It simply "removes" from the line a hard-coded set of common
  3183  * DP-specific tags, so that the line passed to the main routine has
  3184  * been pre-cleaned of DP markup.
  3185  */
  3186 void postprocess_for_DP(char *theline)
  3187 {
  3188     char *s,*t;
  3189     int i;
  3190     if (!*theline) 
  3191 	return;
  3192     for (i=0;*DPmarkup[i];i++)
  3193 	while ((s=strstr(theline,DPmarkup[i])))
  3194 	{
  3195 	    t=s+strlen(DPmarkup[i]);
  3196 	    memmove(s,t,strlen(t)+1);
  3197 	}
  3198 }
  3199 
  3200 /*
  3201  * postprocess_for_HTML:
  3202  *
  3203  * Invoked with the -m switch from flgets().
  3204  * It simply "removes" from the line a hard-coded set of common
  3205  * HTML tags and "replaces" a hard-coded set of common HTML
  3206  * entities, so that the line passed to the main routine has
  3207  * been pre-cleaned of HTML.
  3208  */
  3209 void postprocess_for_HTML(char *theline)
  3210 {
  3211     while (losemarkup(theline))
  3212 	;
  3213     while (loseentities(theline))
  3214 	;
  3215 }
  3216 
  3217 char *losemarkup(char *theline)
  3218 {
  3219     char *s,*t;
  3220     int i;
  3221     s=strchr(theline,'<');
  3222     t=s?strchr(s,'>'):NULL;
  3223     if (!s || !t)
  3224 	return NULL;
  3225     for (i=0;*markup[i];i++)
  3226 	if (tagcomp(g_utf8_next_char(s),markup[i]))
  3227 	{
  3228 	    t=g_utf8_next_char(t);
  3229 	    memmove(s,t,strlen(t)+1);
  3230 	    return s;
  3231 	}
  3232     /* It's an unrecognized <xxx>. */
  3233     return NULL;
  3234 }
  3235 
  3236 char *loseentities(char *theline)
  3237 {
  3238     int i;
  3239     char *s,*t;
  3240     if (!*theline) 
  3241 	return NULL;
  3242     for (i=0;*entities[i].htmlent;i++)
  3243     {
  3244 	s=strstr(theline,entities[i].htmlent);
  3245 	if (s)
  3246 	{
  3247 	    t=g_strdup(s+strlen(entities[i].htmlent));
  3248 	    strcpy(s,entities[i].textent);
  3249 	    strcat(s,t);
  3250 	    g_free(t);
  3251 	    return theline;
  3252 	}
  3253     }
  3254     for (i=0;*entities[i].htmlnum;i++)
  3255     {
  3256 	s=strstr(theline,entities[i].htmlnum);
  3257 	if (s)
  3258 	{
  3259 	    t=g_strdup(s+strlen(entities[i].htmlnum));
  3260 	    strcpy(s,entities[i].textent);
  3261 	    strcat(s,t);
  3262 	    g_free(t);
  3263 	    return theline;
  3264 	}
  3265     }
  3266     return NULL;
  3267 }
  3268 
  3269 gboolean tagcomp(const char *strin,const char *basetag)
  3270 {
  3271     gboolean retval;
  3272     gchar *s,*t;
  3273     if (g_utf8_get_char(strin)=='/')
  3274 	t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
  3275     else
  3276 	t=g_utf8_casefold(strin,-1);
  3277     s=g_utf8_casefold(basetag,-1);
  3278     retval=g_str_has_prefix(t,s);
  3279     g_free(s);
  3280     g_free(t);
  3281     return retval;
  3282 }
  3283 
  3284 void proghelp(GOptionContext *context)
  3285 {
  3286     gchar *help;
  3287     fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
  3288     fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
  3289     fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
  3290     fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
  3291       "For details, read the file COPYING.\n",stderr);
  3292     fputs("This is Free Software; "
  3293       "you may redistribute it under certain conditions (GPL);\n",stderr);
  3294     fputs("read the file COPYING for details.\n\n",stderr);
  3295     help=g_option_context_get_help(context,TRUE,NULL);
  3296     fputs(help,stderr);
  3297     g_free(help);
  3298     fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
  3299     fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
  3300       "non-ASCII\n",stderr);
  3301     fputs("characters like accented letters, "
  3302       "lines longer than 75 or shorter than 55,\n",stderr);
  3303     fputs("unbalanced quotes or brackets, "
  3304       "a variety of badly formatted punctuation, \n",stderr);
  3305     fputs("HTML tags, some likely typos. "
  3306       "It is NOT a substitute for human judgement.\n",stderr);
  3307     fputs("\n",stderr);
  3308 }