bookloupe/bookloupe.c
author ali <ali@juiblex.co.uk>
Sat May 25 19:35:44 2013 +0100 (2013-05-25)
changeset 46 aa45307a6328
parent 45 d48f66b0ad0d
child 47 7522c36859d0
permissions -rw-r--r--
Break check_for_starting_punctuation() out
     1 /*************************************************************************/
     2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
     3 /*                                                                       */
     4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>                  */
     5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>                     */
     6 /*                                                                       */
     7 /* This program is free software; you can redistribute it and/or modify  */
     8 /* it under the terms of the GNU General Public License as published by  */
     9 /* the Free Software Foundation; either version 2 of the License, or     */
    10 /* (at your option) any later version.                                   */
    11 /*                                                                       */
    12 /* This program is distributed in the hope that it will be useful,       */
    13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of        */
    14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the          */
    15 /* GNU General Public License for more details.                          */
    16 /*                                                                       */
    17 /* You should have received a copy of the GNU General Public License     */
    18 /* along with this program. If not, see <http://www.gnu.org/licenses/>.  */
    19 /*************************************************************************/
    20 
    21 #include <stdio.h>
    22 #include <stdlib.h>
    23 #include <string.h>
    24 #include <ctype.h>
    25 
    26 #define MAXWORDLEN    80    /* max length of one word             */
    27 #define LINEBUFSIZE 2048    /* buffer size for an input line      */
    28 
    29 #define MAX_USER_TYPOS 1000           /* capacity of usertypo[]                  */
    30 #define USERTYPO_FILE "gutcheck.typ"  /* file name of the user-defined typo list */
    31 
    32 #ifndef MAX_PATH
    33 #define MAX_PATH 16384                /* fallback when nothing else defines it   */
    34 #endif
    35 
    36 char aline[LINEBUFSIZE];    /* buffer that each input line is read into */
    37 char prevline[LINEBUFSIZE]; /* NOTE(review): presumably the preceding line -- confirm against its users */
    38 
    39 /* Common typos and scannos, all lower case; the empty string ends the list. */
    40 char *typo[] = {
    41     "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
    42     "nad", "te", "ig", "acn",  "ahve", "alot", "anbd", "andt", "awya", "aywa",
    43     "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
    44     "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
    45     "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
    46     "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
    47     "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
    48     "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
    49     "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
    50     "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
    51     "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
    52     "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
    53     "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
    54     "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
    55     "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
    56     "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
    57     "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
    58     "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
    59     "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
    60     "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
    61     "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
    62     "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
    63     "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
    64     "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
    65     "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
    66     "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
    67     "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
    68     "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
    69     "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
    70     "se", ""
    71 };
    72 
    73 char *usertypo[MAX_USER_TYPOS]; /* user-defined typos read by main(); the first usertypo_count slots are filled */
    74 
    75 /* Common abbreviations and other OK words not to query as typos ("" ends the list). */
    76 char *okword[] = {
    77     "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
    78     "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
    79     "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
    80     "outbid", "outbids", "frostbite", "frostbitten", ""
    81 };
    82 
    83 /* Common abbreviations that cause otherwise unexplained periods ("" ends the list). */
    84 char *abbrev[] = {
    85     "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
    86     "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
    87 };
    88 
    89 /*
    90  * Two-Letter combinations that rarely if ever start words,
    91  * but are common scannos or otherwise common letter combinations.
    92  * The empty string ends the list. */
    93 char *nostart[] = {
    94     "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
    95 };
    96 
    97 /*
    98  * Two-Letter combinations that rarely if ever end words,
    99  * but are common scannos or otherwise common letter combinations.
   100  * The empty string ends the list. */
   101 char *noend[] = {
   102     "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
   103     "sw", "gr", "sl", "cl", "iy", ""
   104 };
   105 
   106 char *markup[] = {   /* lower-case HTML element names; "" ends the list */
   107     "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
   108     "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
   109     "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
   110     "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
   111 };
   112 
   113 char *DPmarkup[] = { /* DP-specific markup strings (see the D switch); "" ends the list */
   114     "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
   115 };
   116 
   117 char *nocomma[] = {  /* NOTE(review): presumably words not expected directly before a comma -- confirm; "" ends the list */
   118     "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
   119     "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
   120     "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
   121     "during", "let", "toward", "among", ""
   122 };
   123 
   124 char *noperiod[] = { /* NOTE(review): presumably words not expected directly before a period -- confirm; "" ends the list */
   125     "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
   126     "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
   127     "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
   128     "among", "those", "into", "whom", "having", "thence", ""
   129 }; 
   130 
   131 char vowels[] = "aeiouàáâãäæèéêëìíîïòóôõöùúûü"; /* lower-case vowels, plain and accented; NOTE(review): the accented bytes depend on this file's encoding */
   132 
   133 struct {
   134     char *htmlent;
   135     char *htmlnum;
   136     char *textent;
   137 } entities[] = {
   138     "&amp;",	"&#38;",     "&", 
   139     "&lt;",	"&#60;",     "<",
   140     "&gt;",	"&#62;",     ">",
   141     "&deg;",	"&#176;",    " degrees",
   142     "&pound;",	"&#163;",    "L",
   143     "&quot;",	"&#34;",     "\"", /* quotation mark = APL quote */
   144     "&OElig;",	"&#338;",    "OE", /* latin capital ligature OE */
   145     "&oelig;",	"&#339;",    "oe", /* latin small ligature oe */
   146     "&Scaron;",	"&#352;",    "S", /* latin capital letter S with caron */
   147     "&scaron;",	"&#353;",    "s", /* latin small letter s with caron */
   148     "&Yuml;",	"&#376;",    "Y", /* latin capital letter Y with diaeresis */
   149     "&circ;",	"&#710;",    "",  /* modifier letter circumflex accent */
   150     "&tilde;",	"&#732;",    "~", /* small tilde, U+02DC ISOdia */
   151     "&ensp;",	"&#8194;",   " ", /* en space, U+2002 ISOpub */
   152     "&emsp;",	"&#8195;",   " ", /* em space, U+2003 ISOpub */
   153     "&thinsp;",	"&#8201;",   " ", /* thin space, U+2009 ISOpub */
   154     "&ndash;",	"&#8211;",   "-", /* en dash, U+2013 ISOpub */
   155     "&mdash;",	"&#8212;",   "--", /* em dash, U+2014 ISOpub */
   156     "&rsquo;",	"&#8217;",   "'", /* right single quotation mark */
   157     "&sbquo;",	"&#8218;",   "'", /* single low-9 quotation mark */
   158     "&ldquo;",	"&#8220;",   "\"", /* left double quotation mark */
   159     "&rdquo;",	"&#8221;",   "\"", /* right double quotation mark */
   160     "&bdquo;",	"&#8222;",   "\"", /* double low-9 quotation mark */
   161     "&lsaquo;",	"&#8249;",   "\"", /* single left-pointing angle quotation mark */
   162     "&rsaquo;",	"&#8250;",   "\"", /* single right-pointing angle quotation mark */
   163     "&nbsp;",	"&#160;",    " ", /* no-break space = non-breaking space, */
   164     "&iexcl;",	"&#161;",    "!", /* inverted exclamation mark */
   165     "&cent;",	"&#162;",    "c", /* cent sign */
   166     "&pound;",	"&#163;",    "L", /* pound sign */
   167     "&curren;",	"&#164;",    "$", /* currency sign */
   168     "&yen;",	"&#165;",    "Y", /* yen sign = yuan sign */
   169     "&sect;",	"&#167;",    "--", /* section sign */
   170     "&uml;",	"&#168;",    " ", /* diaeresis = spacing diaeresis */
   171     "&copy;",	"&#169;",    "(C) ", /* copyright sign */
   172     "&ordf;",	"&#170;",    " ", /* feminine ordinal indicator */
   173     "&laquo;",	"&#171;",    "\"", /* left-pointing double angle quotation mark */
   174     "&shy;",	"&#173;",    "-", /* soft hyphen = discretionary hyphen */
   175     "&reg;",	"&#174;",    "(R) ", /* registered sign = registered trade mark sign */
   176     "&macr;",	"&#175;",    " ", /* macron = spacing macron = overline */
   177     "&deg;",	"&#176;",    " degrees", /* degree sign */
   178     "&plusmn;",	"&#177;",    "+-", /* plus-minus sign = plus-or-minus sign */
   179     "&sup2;",	"&#178;",    "2", /* superscript two = superscript digit two */
   180     "&sup3;",	"&#179;",    "3", /* superscript three = superscript digit three */
   181     "&acute;",	"&#180;",    " ", /* acute accent = spacing acute */
   182     "&micro;",	"&#181;",    "m", /* micro sign */
   183     "&para;",	"&#182;",    "--", /* pilcrow sign = paragraph sign */
   184     "&cedil;",	"&#184;",    " ", /* cedilla = spacing cedilla */
   185     "&sup1;",	"&#185;",    "1", /* superscript one = superscript digit one */
   186     "&ordm;",	"&#186;",    " ", /* masculine ordinal indicator */
   187     "&raquo;",	"&#187;",    "\"", /* right-pointing double angle quotation mark */
   188     "&frac14;",	"&#188;",    "1/4", /* vulgar fraction one quarter */
   189     "&frac12;",	"&#189;",    "1/2", /* vulgar fraction one half */
   190     "&frac34;",	"&#190;",    "3/4", /* vulgar fraction three quarters */
   191     "&iquest;",	"&#191;",    "?", /* inverted question mark */
   192     "&Agrave;",	"&#192;",    "A", /* latin capital letter A with grave */
   193     "&Aacute;",	"&#193;",    "A", /* latin capital letter A with acute */
   194     "&Acirc;",	"&#194;",    "A", /* latin capital letter A with circumflex */
   195     "&Atilde;",	"&#195;",    "A", /* latin capital letter A with tilde */
   196     "&Auml;",	"&#196;",    "A", /* latin capital letter A with diaeresis */
   197     "&Aring;",	"&#197;",    "A", /* latin capital letter A with ring above */
   198     "&AElig;",	"&#198;",    "AE", /* latin capital letter AE */
   199     "&Ccedil;",	"&#199;",    "C", /* latin capital letter C with cedilla */
   200     "&Egrave;",	"&#200;",    "E", /* latin capital letter E with grave */
   201     "&Eacute;",	"&#201;",    "E", /* latin capital letter E with acute */
   202     "&Ecirc;",	"&#202;",    "E", /* latin capital letter E with circumflex */
   203     "&Euml;",	"&#203;",    "E", /* latin capital letter E with diaeresis */
   204     "&Igrave;",	"&#204;",    "I", /* latin capital letter I with grave */
   205     "&Iacute;",	"&#205;",    "I", /* latin capital letter I with acute */
   206     "&Icirc;",	"&#206;",    "I", /* latin capital letter I with circumflex */
   207     "&Iuml;",	"&#207;",    "I", /* latin capital letter I with diaeresis */
   208     "&ETH;",	"&#208;",    "E", /* latin capital letter ETH */
   209     "&Ntilde;",	"&#209;",    "N", /* latin capital letter N with tilde */
   210     "&Ograve;",	"&#210;",    "O", /* latin capital letter O with grave */
   211     "&Oacute;",	"&#211;",    "O", /* latin capital letter O with acute */
   212     "&Ocirc;",	"&#212;",    "O", /* latin capital letter O with circumflex */
   213     "&Otilde;",	"&#213;",    "O", /* latin capital letter O with tilde */
   214     "&Ouml;",	"&#214;",    "O", /* latin capital letter O with diaeresis */
   215     "&times;",	"&#215;",    "*", /* multiplication sign */
   216     "&Oslash;",	"&#216;",    "O", /* latin capital letter O with stroke */
   217     "&Ugrave;",	"&#217;",    "U", /* latin capital letter U with grave */
   218     "&Uacute;",	"&#218;",    "U", /* latin capital letter U with acute */
   219     "&Ucirc;",	"&#219;",    "U", /* latin capital letter U with circumflex */
   220     "&Uuml;",	"&#220;",    "U", /* latin capital letter U with diaeresis */
   221     "&Yacute;",	"&#221;",    "Y", /* latin capital letter Y with acute */
   222     "&THORN;",	"&#222;",    "TH", /* latin capital letter THORN */
   223     "&szlig;",	"&#223;",    "sz", /* latin small letter sharp s = ess-zed */
   224     "&agrave;",	"&#224;",    "a", /* latin small letter a with grave */
   225     "&aacute;",	"&#225;",    "a", /* latin small letter a with acute */
   226     "&acirc;",	"&#226;",    "a", /* latin small letter a with circumflex */
   227     "&atilde;",	"&#227;",    "a", /* latin small letter a with tilde */
   228     "&auml;",	"&#228;",    "a", /* latin small letter a with diaeresis */
   229     "&aring;",	"&#229;",    "a", /* latin small letter a with ring above */
   230     "&aelig;",	"&#230;",    "ae", /* latin small letter ae */
   231     "&ccedil;",	"&#231;",    "c", /* latin small letter c with cedilla */
   232     "&egrave;",	"&#232;",    "e", /* latin small letter e with grave */
   233     "&eacute;",	"&#233;",    "e", /* latin small letter e with acute */
   234     "&ecirc;",	"&#234;",    "e", /* latin small letter e with circumflex */
   235     "&euml;",	"&#235;",    "e", /* latin small letter e with diaeresis */
   236     "&igrave;",	"&#236;",    "i", /* latin small letter i with grave */
   237     "&iacute;",	"&#237;",    "i", /* latin small letter i with acute */
   238     "&icirc;",	"&#238;",    "i", /* latin small letter i with circumflex */
   239     "&iuml;",	"&#239;",    "i", /* latin small letter i with diaeresis */
   240     "&eth;",	"&#240;",    "eth", /* latin small letter eth */
   241     "&ntilde;",	"&#241;",    "n", /* latin small letter n with tilde */
   242     "&ograve;",	"&#242;",    "o", /* latin small letter o with grave */
   243     "&oacute;",	"&#243;",    "o", /* latin small letter o with acute */
   244     "&ocirc;",	"&#244;",    "o", /* latin small letter o with circumflex */
   245     "&otilde;",	"&#245;",    "o", /* latin small letter o with tilde */
   246     "&ouml;",	"&#246;",    "o", /* latin small letter o with diaeresis */
   247     "&divide;",	"&#247;",    "/", /* division sign */
   248     "&oslash;",	"&#248;",    "o", /* latin small letter o with stroke */
   249     "&ugrave;",	"&#249;",    "u", /* latin small letter u with grave */
   250     "&uacute;",	"&#250;",    "u", /* latin small letter u with acute */
   251     "&ucirc;",	"&#251;",    "u", /* latin small letter u with circumflex */
   252     "&uuml;",	"&#252;",    "u", /* latin small letter u with diaeresis */
   253     "&yacute;",	"&#253;",    "y", /* latin small letter y with acute */
   254     "&thorn;",	"&#254;",    "th", /* latin small letter thorn */
   255     "&yuml;",	"&#255;",    "y", /* latin small letter y with diaeresis */
   256     "", ""
   257 };
   258 
   259 /* special characters, the first group given as ASCII codes */
   260 #define CHAR_SPACE        32    /* ' '  */
   261 #define CHAR_TAB           9    /* '\t' */
   262 #define CHAR_LF           10    /* '\n' */
   263 #define CHAR_CR           13    /* '\r' */
   264 #define CHAR_DQUOTE       34    /* '"'  */
   265 #define CHAR_SQUOTE       39    /* apostrophe */
   266 #define CHAR_OPEN_SQUOTE  96    /* backquote  */
   267 #define CHAR_TILDE       126    /* '~'  */
   268 #define CHAR_ASTERISK     42    /* '*'  */
   269 #define CHAR_FORESLASH    47    /* '/'  */
   270 #define CHAR_CARAT        94    /* '^'  */
   271 
   272 #define CHAR_UNDERSCORE    '_'
   273 #define CHAR_OPEN_CBRACK   '{'
   274 #define CHAR_CLOSE_CBRACK  '}'
   275 #define CHAR_OPEN_RBRACK   '('
   276 #define CHAR_CLOSE_RBRACK  ')'
   277 #define CHAR_OPEN_SBRACK   '['
   278 #define CHAR_CLOSE_SBRACK  ']'
   279 
   280 /* longest and shortest normal PG line lengths */
   281 #define LONGEST_PG_LINE   75    /* lines longer than this are tallied as long      */
   282 #define WAY_TOO_LONG      80    /* lines longer than this are tallied as very long */
   283 #define SHORTEST_PG_LINE  55    /* threshold used when tallying short lines        */
   284 
   285 #define SWITCHES "ESTPXLOYHWVMUD" /* switches:-                            */
   286                                   /*     D - ignore DP-specific markup     */
   287                                   /*     E - turn off queried-line echo    */
   288                                   /*     S - check single quotes           */
   289                                   /*     T - toggle common typo checks     */
                                         /*         (on by default, off with -X)  */
   290                                   /*     P - require closure of quotes on  */
   291                                   /*         every paragraph               */
   292                                   /*     X - turn off paranoid mode; when  */
   293                                   /*         on it queries everything      */
   294                                   /*     L - line end checking defaults on */
   295                                   /*         -L turns it off               */
   296                                   /*     O - overview. Just shows counts.  */
   297                                   /*     Y - puts errors to stdout         */
   298                                   /*         instead of stderr             */
   299                                   /*     H - Echoes header fields          */
   300                                   /*     M - Ignore markup in < >          */
   301                                   /*     U - Use file of User-defined Typos*/
   302                                   /*     W - Defaults for use on Web upload*/
   303                                   /*     V - Verbose - list EVERYTHING!    */
   304 #define SWITNO 14                 /* max number of switch parms            */
   305                                   /*        - used for defining array-size */
                                         /*        - must equal strlen(SWITCHES)  */
   306 #define MINARGS   1               /* minimum no of args excl switches      */
   307 #define MAXARGS   1               /* maximum no of args excl switches      */
   308 
   309 int pswit[SWITNO];                /* program switches set by SWITCHES      */
   310 
       /* Indices into pswit[]: each is the position of its letter in SWITCHES. */
   311 #define ECHO_SWITCH      0        /* E */
   312 #define SQUOTE_SWITCH    1        /* S */
   313 #define TYPO_SWITCH      2        /* T */
   314 #define QPARA_SWITCH     3        /* P */
   315 #define PARANOID_SWITCH  4        /* X */
   316 #define LINE_END_SWITCH  5        /* L */
   317 #define OVERVIEW_SWITCH  6        /* O */
   318 #define STDOUT_SWITCH    7        /* Y */
   319 #define HEADER_SWITCH    8        /* H */
   320 #define WEB_SWITCH       9        /* W */
   321 #define VERBOSE_SWITCH   10       /* V */
   322 #define MARKUP_SWITCH    11       /* M */
   323 #define USERTYPO_SWITCH  12       /* U */
   324 #define DP_SWITCH        13       /* D */
   325 
   326 long cnt_dquot;       /* for overview mode, count of doublequote queries */
   327 long cnt_squot;       /* for overview mode, count of singlequote queries */
   328 long cnt_brack;       /* for overview mode, count of brackets queries */
   329 long cnt_bin;         /* for overview mode, count of non-ASCII queries */
   330 long cnt_odd;         /* for overview mode, count of odd character queries */
   331 long cnt_long;        /* for overview mode, count of long line errors */
   332 long cnt_short;       /* for overview mode, count of short line queries */
   333 long cnt_punct;       /* for overview mode, count of punctuation and spacing queries */
   334 long cnt_dash;        /* for overview mode, count of dash-related queries */
   335 long cnt_word;        /* for overview mode, count of word queries */
   336 long cnt_html;        /* for overview mode, count of html queries */
   337 long cnt_lineend;     /* for overview mode, count of line-end queries */
   338 long cnt_spacend;     /* count of lines with space at end; not included in the overview total */
   339 long linecnt;         /* count of total lines in the file */
   340 long checked_linecnt; /* count of lines actually checked; the overview reports linecnt minus this as head+foot */
   341 
   342 void proghelp(void);     /* called by main() when the argument count is wrong */
   343 void procfile(char *);   /* called by main() with the single non-switch argument */
   344 
   345 #define LOW_THRESHOLD    0
   346 #define HIGH_THRESHOLD   1
   347 
   348 #define START 0
   349 #define END 1
   350 #define PREV 0
   351 #define NEXT 1
   352 #define FIRST_OF_PAIR 0
   353 #define SECOND_OF_PAIR 1
   354 
   355 #define MAX_WORDPAIR 1000
   356 
   357 char running_from[MAX_PATH];   /* directory part of argv[0], filled in by main() */
   358 
       /* Helpers defined outside this part of the file. */
   359 int mixdigit(char *);
   360 char *getaword(char *,char *);
   361 int matchword(char *,char *);
   362 char *flgets(char *,int,FILE *,long);   /* line reader; main() passes a buffer, a size, a stream and a running count */
   363 void lowerit(char *);
   364 int gcisalpha(unsigned char);
   365 int gcisdigit(unsigned char);
   366 int gcisletter(unsigned char);
   367 char *gcstrchr(char *s,char c);
   368 void postprocess_for_HTML(char *);
   369 char *linehasmarkup(char *);
   370 char *losemarkup(char *);
   371 int tagcomp(char *,char *);
   372 char *loseentities(char *);
   373 int isroman(char *);
   374 int usertypo_count;            /* number of entries stored in usertypo[] */
   375 void postprocess_for_DP(char *);
   376 
   377 char wrk[LINEBUFSIZE];         /* NOTE(review): presumably a scratch line buffer -- confirm against its users */
   378 
   379 #define MAX_QWORD 50
   380 #define MAX_QWORD_LENGTH 40
   381 char qword[MAX_QWORD][MAX_QWORD_LENGTH];    /* up to MAX_QWORD strings of MAX_QWORD_LENGTH bytes each */
   382 char qperiod[MAX_QWORD][MAX_QWORD_LENGTH];  /* same shape as qword[] */
   383 signed int dupcnt[MAX_QWORD];               /* one counter per qword[] slot -- TODO confirm what is counted */
   384 
   385 int main(int argc,char **argv)
   386 {
   387     char *argsw,*s;
   388     int i,switno,invarg;
   389     char usertypo_file[MAX_PATH];
   390     FILE *usertypofile;
   391     if (strlen(argv[0])<sizeof(running_from))
   392 	/* save the path to the executable */
   393         strcpy(running_from,argv[0]);
   394     /* find out what directory we're running from */
   395     s=running_from+strlen(running_from);
   396     for (;*s!='/' && *s!='\\' && s>=running_from;s--)
   397         *s=0;
   398     switno=strlen(SWITCHES);
   399     for (i=switno;--i>0;)
   400         pswit[i]=0;           /* initialise switches */
   401     /*
   402      * Standard loop to extract switches.
   403      * When we come out of this loop, the arguments will be
   404      * in argv[0] upwards and the switches used will be
   405      * represented by their equivalent elements in pswit[]
   406      */
   407     while (--argc>0 && **++argv=='-')
   408         for (argsw=argv[0]+1;*argsw!='\0';argsw++)
   409             for (i=switno,invarg=1;(--i>=0) && invarg==1;)
   410                 if ((toupper(*argsw))==SWITCHES[i])
   411 		{
   412                     invarg=0;
   413                     pswit[i]=1;
   414 		}
   415     /* Paranoid checking is turned OFF, not on, by its switch */
   416     pswit[PARANOID_SWITCH]^=1;
   417     if (pswit[PARANOID_SWITCH])
   418 	/* if running in paranoid mode force typo checks as well   */
   419         pswit[TYPO_SWITCH]=pswit[TYPO_SWITCH]^1;
   420     /* Line-end checking is turned OFF, not on, by its switch */
   421     pswit[LINE_END_SWITCH]^=1;
   422     /* Echoing is turned OFF, not on, by its switch */
   423     pswit[ECHO_SWITCH]^=1;
   424     if (pswit[OVERVIEW_SWITCH])
   425 	/* just print summary; don't echo */
   426         pswit[ECHO_SWITCH]=0;
   427     /*
   428      * Web uploads - for the moment, this is really just a placeholder
   429      * until we decide what processing we really want to do on web uploads
   430      */
   431     if (pswit[WEB_SWITCH])
   432     {
   433 	/* specific override for web uploads */
   434         pswit[ECHO_SWITCH]=1;
   435         pswit[SQUOTE_SWITCH]=0;
   436         pswit[TYPO_SWITCH]=1;
   437         pswit[QPARA_SWITCH]=0;
   438         pswit[PARANOID_SWITCH]=1;
   439         pswit[LINE_END_SWITCH]=0;
   440         pswit[OVERVIEW_SWITCH]=0;
   441         pswit[STDOUT_SWITCH]=0;
   442         pswit[HEADER_SWITCH]=1;
   443         pswit[VERBOSE_SWITCH]=0;
   444         pswit[MARKUP_SWITCH]=0;
   445         pswit[USERTYPO_SWITCH]=0;
   446         pswit[DP_SWITCH]=0;
   447     }
   448     if (argc<MINARGS || argc>MAXARGS)
   449     {
   450 	/* check number of args */
   451         proghelp();
   452         return 1;
   453     }
   454     /* read in the user-defined stealth scanno list */
   455     if (pswit[USERTYPO_SWITCH])
   456     {
   457 	/* ... we were told we had one! */
   458         usertypofile=fopen(USERTYPO_FILE,"rb");
   459         if (!usertypofile)
   460 	{
   461 	    /* not in cwd. try excuteable directory. */
   462             strcpy(usertypo_file,running_from);
   463             strcat(usertypo_file,USERTYPO_FILE);
   464             usertypofile=fopen(usertypo_file,"rb");
   465             if (!usertypofile) {
   466 		/* we ain't got no user typo file! */
   467                 printf("   --> I couldn't find gutcheck.typ "
   468 		  "-- proceeding without user typos.\n");
   469 	    }
   470 	}
   471         usertypo_count=0;
   472         if (usertypofile)
   473 	{
   474 	    /* we managed to open a User Typo File! */
   475             if (pswit[USERTYPO_SWITCH])
   476 	    {
   477                 while (flgets(aline,LINEBUFSIZE-1,usertypofile,
   478 		  (long)usertypo_count))
   479 		{
   480                     if (strlen(aline)>1)
   481 		    {
   482                         if ((int)*aline>33)
   483 			{
   484                             s=malloc(strlen(aline)+1);
   485                             if (!s)
   486 			    {
   487                                 fprintf(stderr,"bookloupe: cannot get enough "
   488 				  "memory for user typo file!\n");
   489                                 exit(1);
   490 			    }
   491                             strcpy(s,aline);
   492                             usertypo[usertypo_count]=s;
   493                             usertypo_count++;
   494                             if (usertypo_count>=MAX_USER_TYPOS)
   495 			    {
   496                                 printf("   --> Only %d user-defined typos "
   497 				  "allowed: ignoring the rest\n",
   498 				  MAX_USER_TYPOS);
   499                                 break;
   500 			    }
   501 			}
   502 		    }
   503 		}
   504 	    }
   505             fclose(usertypofile);
   506 	}
   507     }
   508     fprintf(stderr,"bookloupe: Check and report on an e-text\n");
   509     cnt_dquot=cnt_squot=cnt_brack=cnt_bin=cnt_odd=cnt_long=
   510     cnt_short=cnt_punct=cnt_dash=cnt_word=cnt_html=cnt_lineend=
   511     cnt_spacend=0;
   512     procfile(argv[0]);
   513     if (pswit[OVERVIEW_SWITCH])
   514     {
   515 	printf("    Checked %ld lines of %ld (head+foot = %ld)\n\n",
   516 	  checked_linecnt,linecnt,linecnt-checked_linecnt);
   517         printf("    --------------- Queries found --------------\n");
   518         if (cnt_long)
   519 	    printf("    Long lines:                    %14ld\n",cnt_long);
   520         if (cnt_short)
   521 	    printf("    Short lines:                   %14ld\n",cnt_short);
   522         if (cnt_lineend)
   523 	    printf("    Line-end problems:             %14ld\n",cnt_lineend);
   524         if (cnt_word)
   525 	    printf("    Common typos:                  %14ld\n",cnt_word);
   526         if (cnt_dquot)
   527 	    printf("    Unmatched quotes:              %14ld\n",cnt_dquot);
   528         if (cnt_squot)
   529 	    printf("    Unmatched SingleQuotes:        %14ld\n",cnt_squot);
   530         if (cnt_brack)
   531 	    printf("    Unmatched brackets:            %14ld\n",cnt_brack);
   532         if (cnt_bin)
   533 	    printf("    Non-ASCII characters:          %14ld\n",cnt_bin);
   534         if (cnt_odd)
   535 	    printf("    Proofing characters:           %14ld\n",cnt_odd);
   536         if (cnt_punct)
   537 	    printf("    Punctuation & spacing queries: %14ld\n",cnt_punct);
   538         if (cnt_dash)
   539 	    printf("    Non-standard dashes:           %14ld\n",cnt_dash);
   540         if (cnt_html)
   541 	    printf("    Possible HTML tags:            %14ld\n",cnt_html);
   542         printf("\n");
   543         printf("    TOTAL QUERIES                  %14ld\n",
   544           cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+
   545           cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend);
   546     }
   547     return 0;
   548 }
   549 
   550 struct first_pass_results {   /* tallies gathered by first_pass() over the whole input */
   551     long firstline,astline;   /* line number just after the header marker; lines with '*' and a lower-case a-z letter */
   552     long footerline,totlen,binlen,alphalen,endquote_count,shortline,dotcomma;   /* footer line number (0 if none); total line length; bytes above 127; gcisalpha() bytes; '"' right after a letter; short lines; lines containing ".," */
   553     long fslashline,hyphens,longline,verylongline,htmcount,standalone_digit;   /* lines containing '/'; lines ending in a single '-'; lines over LONGEST_PG_LINE; lines over WAY_TOO_LONG; score for lines with '<' and '>'; standalone_digit: TODO confirm, updated outside the visible code */
   554     long spacedash,emdash,space_emdash,non_PG_space_emdash,PG_space_emdash;   /* NOTE(review): presumably dash statistics -- updated outside the visible code, confirm */
   555     signed int Dutchcount,Frenchcount;   /* NOTE(review): presumably language-hint counters -- confirm */
   556 };
   557 
   558 /*
   559  * first_pass:
   560  *
   561  * Run a first pass - verify that it's a valid PG
   562  * file, decide whether to report some things that
   563  * occur many times in the text like long or short
   564  * lines, non-standard dashes, etc.
   565  */
   566 struct first_pass_results *first_pass(FILE *infile)
   567 {
   568     char laststart=CHAR_SPACE,*s;
   569     signed int i,llen;
   570     unsigned int lastlen=0,lastblen=0;
   571     long spline=0,nspline=0;
   572     static struct first_pass_results results={0};
   573     char inword[MAXWORDLEN]="";
   574     while (fgets(aline,LINEBUFSIZE-1,infile))
   575     {
   576         while (aline[strlen(aline)-1]==10 || aline[strlen(aline)-1]==13)
   577 	    aline[strlen(aline)-1]=0;
   578         linecnt++;
   579         if (strstr(aline,"*END") && strstr(aline,"SMALL PRINT") &&
   580 	  (strstr(aline,"PUBLIC DOMAIN") || strstr(aline,"COPYRIGHT")))
   581 	{
   582             if (spline)
   583                 printf("   --> Duplicate header?\n");
   584             spline=linecnt+1;   /* first line of non-header text, that is */
   585 	}
   586         if (!strncmp(aline,"*** START",9) && strstr(aline,"PROJECT GUTENBERG"))
   587 	{
   588             if (nspline)
   589                 printf("   --> Duplicate header?\n");
   590             nspline=linecnt+1;   /* first line of non-header text, that is */
   591 	}
   592         if (spline || nspline)
   593 	{
   594             lowerit(aline);
   595             if (strstr(aline,"end") && strstr(aline,"project gutenberg"))
   596 	    {
   597                 if (strstr(aline,"end")<strstr(aline,"project gutenberg"))
   598 		{
   599                     if (results.footerline)
   600 		    {
   601 			/* it's an old-form header - we can detect duplicates */
   602                         if (!nspline)
   603                             printf("   --> Duplicate footer?\n");
   604 		    }
   605                     else
   606                         results.footerline=linecnt;
   607 		}
   608 	    }
   609 	}
   610         if (spline)
   611 	    results.firstline=spline;
   612         if (nspline)
   613 	    results.firstline=nspline;  /* override with new */
   614         if (results.footerline)
   615 	    continue;    /* don't count the boilerplate in the footer */
   616         llen=strlen(aline);
   617         results.totlen+=llen;
   618         for (i=0;i<llen;i++)
   619 	{
   620             if ((unsigned char)aline[i]>127)
   621 		results.binlen++;
   622             if (gcisalpha(aline[i]))
   623 		results.alphalen++;
   624             if (i>0 && aline[i]==CHAR_DQUOTE && isalpha(aline[i-1]))
   625 		results.endquote_count++;
   626 	}
   627         if (strlen(aline)>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE &&
   628 	  lastblen>2 && lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
   629 	    results.shortline++;
   630         if (*aline && (unsigned char)aline[strlen(aline)-1]<=CHAR_SPACE)
   631 	    cnt_spacend++;
   632         if (strstr(aline,".,"))
   633 	    results.dotcomma++;
   634         /* only count ast lines for ignoring purposes where there is */
   635         /* locase text on the line */
   636         if (strstr(aline,"*"))
   637 	{
   638             for (s=aline;*s;s++)
   639                 if (*s>='a' && *s<='z')
   640                     break;
   641              if (*s)
   642 		results.astline++;
   643 	}
   644         if (strstr(aline,"/"))
   645             results.fslashline++;
   646         for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--)
   647 	    ;
   648         if (aline[i]=='-' && aline[i-1]!='-')
   649 	    results.hyphens++;
   650         if (llen>LONGEST_PG_LINE)
   651 	    results.longline++;
   652         if (llen>WAY_TOO_LONG)
   653 	    results.verylongline++;
   654         if (strstr(aline,"<") && strstr(aline,">"))
   655 	{
   656             i=(signed int)(strstr(aline,">")-strstr(aline,"<")+1);
   657             if (i>0)
   658                 results.htmcount++;
   659             if (strstr(aline,"<i>"))
   660 		results.htmcount+=4; /* bonus marks! */
   661 	}
   662         /* Check for spaced em-dashes */
   663         if (strstr(aline,"--"))
   664 	{
   665             results.emdash++;
   666             if (*(strstr(aline,"--")-1)==CHAR_SPACE ||
   667                (*(strstr(aline,"--")+2)==CHAR_SPACE))
   668 		results.space_emdash++;
   669             if (*(strstr(aline,"--")-1)==CHAR_SPACE &&
   670                (*(strstr(aline,"--")+2)==CHAR_SPACE))
   671 		/* count of em-dashes with spaces both sides */
   672 		results.non_PG_space_emdash++;
   673             if (*(strstr(aline,"--")-1)!=CHAR_SPACE &&
   674                (*(strstr(aline,"--")+2)!=CHAR_SPACE))
   675 		/* count of PG-type em-dashes with no spaces */
   676 		results.PG_space_emdash++;
   677 	}
   678         for (s=aline;*s;)
   679 	{
   680             s=getaword(s,inword);
   681             if (!strcmp(inword,"hij") || !strcmp(inword,"niet")) 
   682                 results.Dutchcount++;
   683             if (!strcmp(inword,"dans") || !strcmp(inword,"avec")) 
   684                 results.Frenchcount++;
   685             if (!strcmp(inword,"0") || !strcmp(inword,"1")) 
   686                 results.standalone_digit++;
   687 	}
   688         /* Check for spaced dashes */
   689         if (strstr(aline," -") && *(strstr(aline," -")+2)!='-')
   690 	    results.spacedash++;
   691         lastblen=lastlen;
   692         lastlen=strlen(aline);
   693         laststart=aline[0];
   694     }
   695     return &results;
   696 }
   697 
   698 struct warnings {
   699     signed int shortline,longline,bin,dash,dotcomma,ast,fslash,digit,hyphen;
   700     signed int endquote,isDutch,isFrench;
   701 };
   702 
   703 /*
   704  * report_first_pass:
   705  *
   706  * Make some snap decisions based on the first pass results.
   707  */
   708 struct warnings *report_first_pass(struct first_pass_results *results)
   709 {
   710     static struct warnings warnings={0};
   711     if (cnt_spacend>0)
   712         printf("   --> %ld lines in this file have white space at end\n",
   713 	  cnt_spacend);
   714     warnings.dotcomma=1;
   715     if (results->dotcomma>5)
   716     {
   717         warnings.dotcomma=0;
   718         printf("   --> %ld lines in this file contain '.,'. "
   719 	  "Not reporting them.\n",results->dotcomma);
   720     }
   721     /*
   722      * If more than 50 lines, or one-tenth, are short,
   723      * don't bother reporting them.
   724      */
   725     warnings.shortline=1;
   726     if (results->shortline>50 || results->shortline*10>linecnt)
   727     {
   728         warnings.shortline=0;
   729         printf("   --> %ld lines in this file are short. "
   730 	  "Not reporting short lines.\n",results->shortline);
   731     }
   732     /*
   733      * If more than 50 lines, or one-tenth, are long,
   734      * don't bother reporting them.
   735      */
   736     warnings.longline=1;
   737     if (results->longline>50 || results->longline*10>linecnt)
   738     {
   739         warnings.longline=0;
   740         printf("   --> %ld lines in this file are long. "
   741 	  "Not reporting long lines.\n",results->longline);
   742     }
   743     /* If more than 10 lines contain asterisks, don't bother reporting them. */
   744     warnings.ast=1;
   745     if (results->astline>10)
   746     {
   747         warnings.ast=0;
   748         printf("   --> %ld lines in this file contain asterisks. "
   749 	  "Not reporting them.\n",results->astline);
   750     }
   751     /*
   752      * If more than 10 lines contain forward slashes,
   753      * don't bother reporting them.
   754      */
   755     warnings.fslash=1;
   756     if (results->fslashline>10)
   757     {
   758         warnings.fslash=0;
   759         printf("   --> %ld lines in this file contain forward slashes. "
   760 	  "Not reporting them.\n",results->fslashline);
   761     }
   762     /*
   763      * If more than 20 lines contain unpunctuated endquotes,
   764      * don't bother reporting them.
   765      */
   766     warnings.endquote=1;
   767     if (results->endquote_count>20)
   768     {
   769         warnings.endquote=0;
   770         printf("   --> %ld lines in this file contain unpunctuated endquotes. "
   771 	  "Not reporting them.\n",results->endquote_count);
   772     }
   773     /*
   774      * If more than 15 lines contain standalone digits,
   775      * don't bother reporting them.
   776      */
   777     warnings.digit=1;
   778     if (results->standalone_digit>10)
   779     {
   780         warnings.digit=0;
   781         printf("   --> %ld lines in this file contain standalone 0s and 1s. "
   782 	  "Not reporting them.\n",results->standalone_digit);
   783     }
   784     /*
   785      * If more than 20 lines contain hyphens at end,
   786      * don't bother reporting them.
   787      */
   788     warnings.hyphen=1;
   789     if (results->hyphens>20)
   790     {
   791         warnings.hyphen=0;
   792         printf("   --> %ld lines in this file have hyphens at end. "
   793 	  "Not reporting them.\n",results->hyphens);
   794     }
   795     if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
   796     {
   797         printf("   --> Looks like this is HTML. Switching HTML mode ON.\n");
   798         pswit[MARKUP_SWITCH]=1;
   799     }
   800     if (results->verylongline>0)
   801         printf("   --> %ld lines in this file are VERY long!\n",
   802 	  results->verylongline);
   803     /*
   804      * If there are more non-PG spaced dashes than PG em-dashes,
   805      * assume it's deliberate.
   806      * Current PG guidelines say don't use them, but older texts do,
   807      * and some people insist on them whatever the guidelines say.
   808      */
   809     warnings.dash=1;
   810     if (results->spacedash+results->non_PG_space_emdash>
   811       results->PG_space_emdash)
   812     {
   813         warnings.dash=0;
   814         printf("   --> There are %ld spaced dashes and em-dashes. "
   815 	  "Not reporting them.\n",
   816 	  results->spacedash+results->non_PG_space_emdash);
   817     }
   818     /* If more than a quarter of characters are hi-bit, bug out. */
   819     warnings.bin=1;
   820     if (results->binlen*4>results->totlen)
   821     {
   822         printf("   --> This file does not appear to be ASCII. "
   823 	  "Terminating. Best of luck with it!\n");
   824         exit(1);
   825     }
   826     if (results->alphalen*4<results->totlen)
   827     {
   828         printf("   --> This file does not appear to be text. "
   829 	  "Terminating. Best of luck with it!\n");
   830         exit(1);
   831     }
   832     if (results->binlen*100>results->totlen || results->binlen>100)
   833     {
   834         printf("   --> There are a lot of foreign letters here. "
   835 	  "Not reporting them.\n");
   836         warnings.bin=0;
   837     }
   838     warnings.isDutch=0;
   839     if (results->Dutchcount>50)
   840     {
   841         warnings.isDutch=1;
   842         printf("   --> This looks like Dutch - "
   843 	  "switching off dashes and warnings for 's Middags case.\n");
   844     }
   845     warnings.isFrench=0;
   846     if (results->Frenchcount>50)
   847     {
   848         warnings.isFrench=1;
   849         printf("   --> This looks like French - "
   850 	  "switching off some doublepunct.\n");
   851     }
   852     if (results->firstline && results->footerline)
   853         printf("    The PG header and footer appear to be already on.\n");
   854     else
   855     {
   856         if (results->firstline)
   857             printf("    The PG header is on - no footer.\n");
   858         if (results->footerline)
   859             printf("    The PG footer is on - no header.\n");
   860     }
   861     printf("\n");
   862     if (pswit[VERBOSE_SWITCH])
   863     {
   864         warnings.bin=1;
   865         warnings.shortline=1;
   866         warnings.dotcomma=1;
   867         warnings.longline=1;
   868         warnings.dash=1;
   869         warnings.digit=1;
   870         warnings.ast=1;
   871         warnings.fslash=1;
   872         warnings.hyphen=1;
   873         warnings.endquote=1;
   874         printf("   *** Verbose output is ON -- you asked for it! ***\n");
   875     }
   876     if (warnings.isDutch)
   877         warnings.dash=0;
   878     if (results->footerline>0 && results->firstline>0 &&
   879       results->footerline>results->firstline &&
   880       results->footerline-results->firstline<100)
   881     {
   882         printf("   --> I don't really know where this text starts. \n");
   883         printf("       There are no reference points.\n");
   884         printf("       I'm going to have to report the header and footer "
   885 	  "as well.\n");
   886         results->firstline=0;
   887     }
   888     return &warnings;
   889 }
   890 
   891 struct counters {
   892     long quot;
   893     signed int c_unders,c_brack,s_brack,r_brack;
   894     signed int open_single_quote,close_single_quote;
   895 };
   896 
   897 /*
   898  * analyse_quotes:
   899  *
   900  * Look along the line, accumulate the count of quotes, and see
   901  * if this is an empty line - i.e. a line with nothing on it
   902  * but spaces.
   903  * If line has just spaces, period, * and/or - on it, don't
   904  * count it, since empty lines with asterisks or dashes to
   905  * separate sections are common.
   906  *
   907  * Returns: Non-zero if the line is empty.
   908  */
   909 int analyse_quotes(const char *s,struct counters *counters)
   910 {
   911     signed int guessquote=0;
   912     int isemptyline=1;    /* assume the line is empty until proven otherwise */
   913     while (*s)
   914     {
   915 	if (*s==CHAR_DQUOTE)
   916 	    counters->quot++;
   917 	if (*s==CHAR_SQUOTE || *s==CHAR_OPEN_SQUOTE)
   918 	{
   919 	    if (s==aline)
   920 	    {
   921 		/*
   922 		 * At start of line, it can only be an openquote.
   923 		 * Hardcode a very common exception!
   924 		 */
   925 		if (strncmp(s+2,"tis",3) && strncmp(s+2,"Tis",3))
   926 		    counters->open_single_quote++;
   927 	    }
   928 	    else if (gcisalpha(s[-1]) && gcisalpha(s[1]))
   929 		/* Do nothing! it's definitely an apostrophe, not a quote */
   930 		;
   931 	    /* it's outside a word - let's check it out */
   932 	    else if (*s==CHAR_OPEN_SQUOTE || gcisalpha(s[1]))
   933 	    {
   934 		/* it damwell better BE an openquote */
   935 		if (strncmp(s+1,"tis",3) && strncmp(s+1,"Tis",3))
   936 		    /* hardcode a very common exception! */
   937 		    counters->open_single_quote++;
   938 	    }
   939 	    else
   940 	    {
   941 		/* now - is it a closequote? */
   942 		guessquote=0;   /* accumulate clues */
   943 		if (gcisalpha(s[-1]))
   944 		{
   945 		    /* it follows a letter - could be either */
   946 		    guessquote++;
   947 		    if (s[-1]=='s')
   948 		    {
   949 			/* looks like a plural apostrophe */
   950 			guessquote-=3;
   951 			if (s[1]==CHAR_SPACE)  /* bonus marks! */
   952 			    guessquote-=2;
   953 		    }
   954 		}
   955 		/* it doesn't have a letter either side */
   956 		else if (strchr(".?!,;:",s[-1]) && strchr(".?!,;: ",s[1]))
   957 		    guessquote+=8; /* looks like a closequote */
   958 		else
   959 		    guessquote++;
   960 		if (counters->open_single_quote>counters->close_single_quote)
   961 		    /*
   962 		     * Give it the benefit of some doubt,
   963 		     * if a squote is already open.
   964 		     */
   965 		    guessquote++;
   966 		else
   967 		    guessquote--;
   968 		if (guessquote>=0)
   969 		    counters->close_single_quote++;
   970 	    }
   971 	}
   972 	if (*s!=CHAR_SPACE && *s!='-' && *s!='.' && *s!=CHAR_ASTERISK &&
   973 	  *s!=13 && *s!=10)
   974 	    isemptyline=0;  /* ignore lines like  *  *  *  as spacers */
   975 	if (*s==CHAR_UNDERSCORE)
   976 	    counters->c_unders++;
   977 	if (*s==CHAR_OPEN_CBRACK)
   978 	    counters->c_brack++;
   979 	if (*s==CHAR_CLOSE_CBRACK)
   980 	    counters->c_brack--;
   981 	if (*s==CHAR_OPEN_RBRACK)
   982 	    counters->r_brack++;
   983 	if (*s==CHAR_CLOSE_RBRACK)
   984 	    counters->r_brack--;
   985 	if (*s==CHAR_OPEN_SBRACK)
   986 	    counters->s_brack++;
   987 	if (*s==CHAR_CLOSE_SBRACK)
   988 	    counters->s_brack--;
   989 	s++;
   990     }
   991     return isemptyline;
   992 }
   993 
   994 /*
   995  * check_for_odd_characters:
   996  *
   997  * Check for binary and other odd characters.
   998  */
   999 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
  1000   int isemptyline)
  1001 {
  1002     /* Don't repeat multiple warnings on one line. */
  1003     signed int eNon_A=0,eTab=0,eTilde=0,eCarat=0,eFSlash=0,eAst=0;
  1004     const char *s;
  1005     unsigned char c;
  1006     for (s=aline;*s;s++)
  1007     {
  1008 	c=*(unsigned char *)s;
  1009 	if (!eNon_A && (*s<CHAR_SPACE && *s!=9 && *s!='\n' || c>127))
  1010 	{
  1011 	    if (pswit[ECHO_SWITCH])
  1012 		printf("\n%s\n",aline);
  1013 	    if (!pswit[OVERVIEW_SWITCH])
  1014 		if (c>127 && c<160)
  1015 		    printf("    Line %ld column %d - "
  1016 		      "Non-ISO-8859 character %d\n",linecnt,(int)(s-aline)+1,c);
  1017 		else
  1018 		    printf("    Line %ld column %d - Non-ASCII character %d\n",
  1019 		      linecnt,(int)(s-aline)+1,c);
  1020 	    else
  1021 		cnt_bin++;
  1022 	    eNon_A=1;
  1023 	}
  1024 	if (!eTab && *s==CHAR_TAB)
  1025 	{
  1026 	    if (pswit[ECHO_SWITCH])
  1027 		printf("\n%s\n",aline);
  1028 	    if (!pswit[OVERVIEW_SWITCH])
  1029 		printf("    Line %ld column %d - Tab character?\n",
  1030 		  linecnt,(int)(s-aline)+1);
  1031 	    else
  1032 		cnt_odd++;
  1033 	    eTab=1;
  1034 	}
  1035 	if (!eTilde && *s==CHAR_TILDE)
  1036 	{
  1037 	    /*
  1038 	     * Often used by OCR software to indicate an
  1039 	     * unrecognizable character.
  1040 	     */
  1041 	    if (pswit[ECHO_SWITCH])
  1042 		printf("\n%s\n",aline);
  1043 	    if (!pswit[OVERVIEW_SWITCH])
  1044 		printf("    Line %ld column %d - Tilde character?\n",
  1045 		  linecnt,(int)(s-aline)+1);
  1046 	    else
  1047 		cnt_odd++;
  1048 	    eTilde=1;
  1049 	}
  1050 	if (!eCarat && *s==CHAR_CARAT)
  1051 	{  
  1052 	    if (pswit[ECHO_SWITCH])
  1053 		printf("\n%s\n",aline);
  1054 	    if (!pswit[OVERVIEW_SWITCH])
  1055 		printf("    Line %ld column %d - Carat character?\n",
  1056 		  linecnt,(int)(s-aline)+1);
  1057 	    else
  1058 		cnt_odd++;
  1059 	    eCarat=1;
  1060 	}
  1061 	if (!eFSlash && *s==CHAR_FORESLASH && warnings->fslash)
  1062 	{  
  1063 	    if (pswit[ECHO_SWITCH])
  1064 		printf("\n%s\n",aline);
  1065 	    if (!pswit[OVERVIEW_SWITCH])
  1066 		printf("    Line %ld column %d - Forward slash?\n",
  1067 		  linecnt,(int)(s-aline)+1);
  1068 	    else
  1069 		cnt_odd++;
  1070 	    eFSlash=1;
  1071 	}
  1072 	/*
  1073 	 * Report asterisks only in paranoid mode,
  1074 	 * since they're often deliberate.
  1075 	 */
  1076 	if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
  1077 	  *s==CHAR_ASTERISK)
  1078 	{
  1079 	    if (pswit[ECHO_SWITCH])
  1080 		printf("\n%s\n",aline);
  1081 	    if (!pswit[OVERVIEW_SWITCH])
  1082 		printf("    Line %ld column %d - Asterisk?\n",
  1083 		  linecnt,(int)(s-aline)+1);
  1084 	    else
  1085 		cnt_odd++;
  1086 	    eAst=1;
  1087 	}
  1088     }
  1089 }
  1090 
  1091 /*
  1092  * check_for_long_line:
  1093  *
  1094  * Check for line too long.
  1095  */
  1096 void check_for_long_line(const char *aline)
  1097 {
  1098     if (strlen(aline)>LONGEST_PG_LINE)
  1099     {
  1100 	if (pswit[ECHO_SWITCH])
  1101 	    printf("\n%s\n",aline);
  1102 	if (!pswit[OVERVIEW_SWITCH])
  1103 	    printf("    Line %ld column %d - Long line %d\n",
  1104 	      linecnt,strlen(aline),strlen(aline));
  1105 	else
  1106 	    cnt_long++;
  1107     }
  1108 }
  1109 
  1110 struct line_properties {
  1111     unsigned int len,blen;
  1112     char start;
  1113 };
  1114 
  1115 /*
  1116  * check_for_short_line:
  1117  *
  1118  * Check for line too short.
  1119  *
  1120  * This one is a bit trickier to implement: we don't want to
  1121  * flag the last line of a paragraph for being short, so we
  1122  * have to wait until we know that our current line is a
  1123  * "normal" line, then report the _previous_ line if it was too
  1124  * short. We also don't want to report indented lines like
  1125  * chapter heads or formatted quotations. We therefore keep
  1126  * last->len as the length of the last line examined, and
  1127  * last->blen as the length of the last but one, and try to
  1128  * suppress unnecessary warnings by checking that both were of
  1129  * "normal" length. We keep the first character of the last
  1130  * line in last->start, and if it was a space, we assume that
  1131  * the formatting is deliberate. I can't figure out a way to
  1132  * distinguish something like a quoted verse left-aligned or
  1133  * the header or footer of a letter from a paragraph of short
  1134  * lines - maybe if I examined the whole paragraph, and if the
  1135  * para has less than, say, 8 lines and if all lines are short,
  1136  * then just assume it's OK? Need to look at some texts to see
  1137  * how often a formula like this would get the right result.
  1138  */
  1139 void check_for_short_line(const char *aline,const struct line_properties *last)
  1140 {
  1141     if (strlen(aline)>1 && last->len>1 && last->len<SHORTEST_PG_LINE &&
  1142       last->blen>1 && last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
  1143     {
  1144 	if (pswit[ECHO_SWITCH])
  1145 	    printf("\n%s\n",prevline);
  1146 	if (!pswit[OVERVIEW_SWITCH])
  1147 	    printf("    Line %ld column %d - Short line %d?\n",
  1148 	      linecnt-1,strlen(prevline),strlen(prevline));
  1149 	else
  1150 	    cnt_short++;
  1151     }
  1152 }
  1153 
  1154 /*
  1155  * check_for_starting_punctuation:
  1156  *
  1157  * Look for punctuation other than full ellipses at start of line.
  1158  */
  1159 void check_for_starting_punctuation(const char *aline)
  1160 {
  1161     if (*aline && strchr(".?!,;:",aline[0]) && strncmp(". . .",aline,5))
  1162     {
  1163 	if (pswit[ECHO_SWITCH])
  1164 	    printf("\n%s\n",aline);
  1165 	if (!pswit[OVERVIEW_SWITCH])
  1166 	    printf("    Line %ld column 1 - Begins with punctuation?\n",
  1167 	      linecnt);
  1168 	else
  1169 	    cnt_punct++;
  1170     }
  1171 }
  1172 
  1173 /*
  1174  * procfile:
  1175  *
  1176  * Process one file.
  1177  */
  1178 void procfile(char *filename)
  1179 {
  1180     char *s,*t,*s1,*wordstart;
  1181     char inword[MAXWORDLEN],testword[MAXWORDLEN];
  1182     char parastart[81];     /* first line of current para */
  1183     FILE *infile;
  1184     struct first_pass_results *first_pass_results;
  1185     struct warnings *warnings;
  1186     struct counters counters={0};
  1187     struct line_properties last={0};
  1188     int isemptyline;
  1189     long squot,start_para_line;
  1190     signed int i,j,llen,isacro,isellipsis,istypo,alower;
  1191     signed int dquotepar,squotepar;
  1192     signed int isnewpara,vowel,consonant;
  1193     char dquote_err[80],squote_err[80],rbrack_err[80],sbrack_err[80],
  1194       cbrack_err[80],unders_err[80];
  1195     signed int qword_index,qperiod_index,isdup;
  1196     signed int enddash;
  1197     last.start=CHAR_SPACE;
  1198     *dquote_err=*squote_err=*rbrack_err=*cbrack_err=*sbrack_err=
  1199       *unders_err=*prevline=0;
  1200     linecnt=checked_linecnt=start_para_line=0;
  1201     squot=0;
  1202     i=llen=isacro=isellipsis=istypo=0;
  1203     isnewpara=vowel=consonant=enddash=0;
  1204     qword_index=qperiod_index=isdup=0;
  1205     *inword=*testword=0;
  1206     dquotepar=squotepar=0;
  1207     for (j=0;j<MAX_QWORD;j++)
  1208     {
  1209         dupcnt[j]=0;
  1210         for (i=0;i<MAX_QWORD_LENGTH;i++)
  1211 	{
  1212             qword[i][j]=0;
  1213             qperiod[i][j]=0;
  1214 	}
  1215     }
  1216     infile=fopen(filename,"rb");
  1217     if (!infile)
  1218     {
  1219         if (pswit[STDOUT_SWITCH])
  1220             fprintf(stdout,"bookloupe: cannot open %s\n",filename);
  1221         else
  1222             fprintf(stderr,"bookloupe: cannot open %s\n",filename);
  1223 	exit(1);
  1224     }
  1225     fprintf(stdout,"\n\nFile: %s\n\n",filename);
  1226     first_pass_results=first_pass(infile);
  1227     warnings=report_first_pass(first_pass_results);
  1228     rewind(infile);
  1229     /*
  1230      * Here we go with the main pass. Hold onto yer hat!
  1231      * Re-init some variables we've dirtied.
  1232      */
  1233     squot=linecnt=0;
  1234     while (flgets(aline,LINEBUFSIZE-1,infile,linecnt+1))
  1235     {
  1236         linecnt++;
  1237         if (linecnt==1)
  1238 	    isnewpara=1;
  1239         if (pswit[DP_SWITCH] && !strncmp(aline,"-----File: ",11))
  1240 	    continue;    // skip DP page separators completely
  1241         if (linecnt<first_pass_results->firstline ||
  1242 	  (first_pass_results->footerline>0 &&
  1243 	  linecnt>first_pass_results->footerline))
  1244 	{
  1245             if (pswit[HEADER_SWITCH])
  1246 	    {
  1247                 if (!strncmp(aline,"Title:",6))
  1248                     printf("    %s\n",aline);
  1249                 if (!strncmp(aline,"Author:",7))
  1250                     printf("    %s\n",aline);
  1251                 if (!strncmp(aline,"Release Date:",13))
  1252                     printf("    %s\n",aline);
  1253                 if (!strncmp(aline,"Edition:",8))
  1254                     printf("    %s\n\n",aline);
  1255 	    }
  1256             continue;                /* skip through the header */
  1257 	}
  1258         checked_linecnt++;
  1259         s=aline;
  1260         /*
  1261 	 * If we are in a state of unbalanced quotes, and this line
  1262          * doesn't begin with a quote, output the stored error message.
  1263          * If the -P switch was used, print the warning even if the
  1264          * new para starts with quotes.
  1265 	 */
  1266         t=s;
  1267         while (*t==' ')
  1268 	    t++;
  1269         if (*dquote_err)
  1270             if (*t!=CHAR_DQUOTE || pswit[QPARA_SWITCH])
  1271 	    {
  1272                 if (!pswit[OVERVIEW_SWITCH])
  1273 		{
  1274                     if (pswit[ECHO_SWITCH])
  1275 			printf("\n%s\n",parastart);
  1276                     printf(dquote_err);
  1277 		}
  1278                 else
  1279                     cnt_dquot++;
  1280             }
  1281         if (*squote_err)
  1282 	{
  1283             if (*t!=CHAR_SQUOTE && *t!=CHAR_OPEN_SQUOTE ||
  1284 	      pswit[QPARA_SWITCH] || squot)
  1285 	    {
  1286                 if (!pswit[OVERVIEW_SWITCH])
  1287 		{
  1288                     if (pswit[ECHO_SWITCH])
  1289 			printf("\n%s\n",parastart);
  1290                     printf(squote_err);
  1291 		}
  1292                 else
  1293                     cnt_squot++;
  1294 	    }
  1295             squot=0;
  1296 	}
  1297         if (*rbrack_err)
  1298 	{
  1299             if (!pswit[OVERVIEW_SWITCH])
  1300 	    {
  1301                 if (pswit[ECHO_SWITCH])
  1302 		    printf("\n%s\n",parastart);
  1303                 printf(rbrack_err);
  1304 	    }
  1305             else
  1306                 cnt_brack++;
  1307 	}
  1308         if (*sbrack_err)
  1309 	{
  1310             if (!pswit[OVERVIEW_SWITCH])
  1311 	    {
  1312                 if (pswit[ECHO_SWITCH])
  1313 		    printf("\n%s\n",parastart);
  1314                 printf(sbrack_err);
  1315 	    }
  1316             else
  1317                 cnt_brack++;
  1318 	}
  1319         if (*cbrack_err)
  1320 	{
  1321             if (!pswit[OVERVIEW_SWITCH])
  1322 	    {
  1323                 if (pswit[ECHO_SWITCH])
  1324 		    printf("\n%s\n",parastart);
  1325                 printf(cbrack_err);
  1326 	    }
  1327             else
  1328                 cnt_brack++;
  1329 	}
  1330         if (*unders_err)
  1331 	{
  1332             if (!pswit[OVERVIEW_SWITCH])
  1333 	    {
  1334                 if (pswit[ECHO_SWITCH])
  1335 		    printf("\n%s\n",parastart);
  1336                 printf(unders_err);
  1337 	    }
  1338             else
  1339                 cnt_brack++;
  1340 	}
  1341         *dquote_err=*squote_err=*rbrack_err=*cbrack_err= 
  1342 	  *sbrack_err=*unders_err=0;
  1343 	isemptyline=analyse_quotes(aline,&counters);
  1344         if (isnewpara && !isemptyline)
  1345 	{
  1346 	    /* This line is the start of a new paragraph. */
  1347             start_para_line=linecnt;
  1348 	    /* Capture its first line in case we want to report it later. */
  1349             strncpy(parastart,aline,80);
  1350             parastart[79]=0;
  1351             dquotepar=squotepar=0; /* restart the quote count */
  1352             s=aline;
  1353             while (!gcisalpha(*s) && !gcisdigit(*s) && *s)
  1354 		s++;
  1355             if (*s>='a' && *s<='z')
  1356 	    {
  1357 		/* and its first letter is lowercase */
  1358                 if (pswit[ECHO_SWITCH])
  1359 		    printf("\n%s\n",aline);
  1360                 if (!pswit[OVERVIEW_SWITCH])
  1361                     printf("    Line %ld column %d - "
  1362 		      "Paragraph starts with lower-case\n",
  1363 		      linecnt,(int)(s-aline)+1);
  1364                 else
  1365                     cnt_punct++;
  1366 	    }
  1367             isnewpara=0; /* Signal the end of new para processing. */
  1368 	}
  1369         /* Check for an em-dash broken at line end. */
  1370         if (enddash && *aline=='-')
  1371 	{
  1372             if (pswit[ECHO_SWITCH])
  1373 		printf("\n%s\n",aline);
  1374             if (!pswit[OVERVIEW_SWITCH])
  1375                 printf("    Line %ld column 1 - Broken em-dash?\n",linecnt);
  1376             else
  1377                 cnt_punct++;
  1378 	}
  1379         enddash=0;
  1380         for (s=aline+strlen(aline)-1;*s==' ' && s>aline;s--)
  1381 	    ;
  1382         if (s>=aline && *s=='-')
  1383             enddash=1;
  1384 	/*
  1385          * Check for invalid or questionable characters in the line
  1386          * Anything above 127 is invalid for plain ASCII, and
  1387          * non-printable control characters should also be flagged.
  1388          * Tabs should generally not be there.
  1389 	 */
  1390         for (s=aline;*s;s++)
  1391 	{
  1392             i=(unsigned char)*s;
  1393             if (i<CHAR_SPACE && i!=CHAR_LF && i!=CHAR_CR && i!=CHAR_TAB)
  1394 	    {
  1395                 if (pswit[ECHO_SWITCH])
  1396 		    printf("\n%s\n",aline);
  1397                 if (!pswit[OVERVIEW_SWITCH])
  1398                     printf("    Line %ld column %d - Control character %d\n",
  1399 		      linecnt,(int)(s-aline)+1,i);
  1400                 else
  1401                     cnt_bin++;
  1402 	    }
  1403 	}
  1404         if (warnings->bin)
  1405 	    check_for_odd_characters(aline,warnings,isemptyline);
  1406         if (warnings->longline)
  1407 	    check_for_long_line(aline);
  1408         if (warnings->shortline)
  1409 	    check_for_short_line(aline,&last);
  1410         last.blen=last.len;
  1411         last.len=strlen(aline);
  1412         last.start=aline[0];
  1413 	check_for_starting_punctuation(aline);
  1414         /*
  1415 	 * Check for spaced em-dashes.
  1416          * We must check _all_ occurrences of "--" on the line
  1417          * hence the loop - even if the first double-dash is OK
  1418          * there may be another that's wrong later on.
  1419 	 */
  1420         if (warnings->dash)
  1421 	{
  1422             s=aline;
  1423             while (strstr(s,"--"))
  1424 	    {
  1425                 if (*(strstr(s,"--")-1)==CHAR_SPACE ||
  1426                    (*(strstr(s,"--")+2)==CHAR_SPACE))
  1427 		{
  1428                     if (pswit[ECHO_SWITCH])
  1429 			printf("\n%s\n",aline);
  1430                     if (!pswit[OVERVIEW_SWITCH])
  1431                         printf("    Line %ld column %d - Spaced em-dash?\n",
  1432 			  linecnt,(int)(strstr(s,"--")-aline)+1);
  1433                     else
  1434                         cnt_dash++;
  1435 		}
  1436                 s=strstr(s,"--")+2;
  1437 	    }
  1438 	}
  1439         /* Check for spaced dashes. */
  1440         if (warnings->dash)
  1441 	{
  1442             if (strstr(aline," -"))
  1443 	    {
  1444                 if (*(strstr(aline," -")+2)!='-')
  1445 		{
  1446                     if (pswit[ECHO_SWITCH])
  1447 			printf("\n%s\n",aline);
  1448                     if (!pswit[OVERVIEW_SWITCH])
  1449                         printf("    Line %ld column %d - Spaced dash?\n",
  1450 			  linecnt,(int)(strstr(aline," -")-aline)+1);
  1451                     else
  1452                         cnt_dash++;
  1453 		}
  1454 	    }
  1455             else if (strstr(aline,"- "))
  1456 	    {
  1457 		if (*(strstr(aline,"- ")-1)!='-')
  1458 		{
  1459 		    if (pswit[ECHO_SWITCH])
  1460 			printf("\n%s\n",aline);
  1461 		    if (!pswit[OVERVIEW_SWITCH])
  1462 			printf("    Line %ld column %d - Spaced dash?\n",
  1463 			  linecnt,(int)(strstr(aline,"- ")-aline)+1);
  1464 		    else
  1465 			cnt_dash++;
  1466 		}
  1467 	    }
  1468 	}
  1469         /*
  1470 	 * Check for unmarked paragraphs indicated by separate speakers.
  1471          * May well be false positive:
  1472          * "Bravo!" "Wonderful!" called the crowd.
  1473          * but useful all the same.
  1474 	 */
  1475         s=wrk;
  1476         *s=0;
  1477         if (strstr(aline,"\" \""))
  1478 	    s=strstr(aline,"\" \"");
  1479         if (strstr(aline,"\"  \""))
  1480 	    s=strstr(aline,"\"  \"");
  1481         if (*s)
  1482 	{
  1483             if (pswit[ECHO_SWITCH])
  1484 		printf("\n%s\n",aline);
  1485             if (!pswit[OVERVIEW_SWITCH])
  1486                 printf("    Line %ld column %d - "
  1487 		  "Query missing paragraph break?\n",
  1488 		  linecnt,(int)(s-aline)+1);
  1489             else
  1490                 cnt_punct++;
  1491 	}
  1492         /*
  1493 	 * Check for "to he" and other easy he/be errors.
  1494          * This is a very inadequate effort on the he/be problem,
  1495          * but the phrase "to he" is always an error, whereas "to
  1496          * be" is quite common.
  1497          * Similarly, '"Quiet!", be said.' is a non-be error
  1498          * "to he" is _not_ always an error!:
  1499          *       "Where they went to he couldn't say."
  1500          * Another false positive:
  1501          *       What would "Cinderella" be without the . . .
  1502          * and another: "If he wants to he can see for himself."
  1503 	 */
  1504         s=wrk;
  1505         *s=0;
  1506         if (strstr(aline," to he "))
  1507 	    s=strstr(aline," to he ");
  1508         if (strstr(aline,"\" be "))
  1509 	    s=strstr(aline,"\" be ");
  1510         if (strstr(aline,"\", be "))
  1511 	    s=strstr(aline,"\", be ");
  1512         if (strstr(aline," is be "))
  1513 	    s=strstr(aline," is be ");
  1514         if (strstr(aline," be is "))
  1515 	    s=strstr(aline," be is ");
  1516         if (strstr(aline," was be "))
  1517 	    s=strstr(aline," was be ");
  1518         if (strstr(aline," be would "))
  1519 	    s=strstr(aline," be would ");
  1520         if (strstr(aline," be could "))
  1521 	    s=strstr(aline," be could ");
  1522         if (*s)
  1523 	{
  1524             if (pswit[ECHO_SWITCH])
  1525 		printf("\n%s\n",aline);
  1526             if (!pswit[OVERVIEW_SWITCH])
  1527                 printf("    Line %ld column %d - Query he/be error?\n",
  1528 		  linecnt,(int)(s-aline)+1);
  1529             else
  1530                 cnt_word++;
  1531 	}
  1532         s=wrk;
  1533         *s=0;
  1534         if (strstr(aline," i bad "))
  1535 	    s=strstr(aline," i bad ");
  1536         if (strstr(aline," you bad "))
  1537 	    s=strstr(aline," you bad ");
  1538         if (strstr(aline," he bad "))
  1539 	    s=strstr(aline," he bad ");
  1540         if (strstr(aline," she bad "))
  1541 	    s=strstr(aline," she bad ");
  1542         if (strstr(aline," they bad "))
  1543 	    s=strstr(aline," they bad ");
  1544         if (strstr(aline," a had "))
  1545 	    s=strstr(aline," a had ");
  1546         if (strstr(aline," the had "))
  1547 	    s=strstr(aline," the had ");
  1548         if (*s)
  1549 	{
  1550             if (pswit[ECHO_SWITCH])
  1551 		printf("\n%s\n",aline);
  1552             if (!pswit[OVERVIEW_SWITCH])
  1553                 printf("    Line %ld column %d - Query had/bad error?\n",
  1554 		  linecnt,(int)(s-aline)+1);
  1555             else
  1556                 cnt_word++;
  1557 	}
  1558         s=wrk;
  1559         *s=0;
  1560         if (strstr(aline,", hut "))
  1561 	    s=strstr(aline,", hut ");
  1562         if (strstr(aline,"; hut "))
  1563 	    s=strstr(aline,"; hut ");
  1564         if (*s)
  1565 	{
  1566             if (pswit[ECHO_SWITCH])
  1567 		printf("\n%s\n",aline);
  1568             if (!pswit[OVERVIEW_SWITCH])
  1569                 printf("    Line %ld column %d - Query hut/but error?\n",
  1570 		  linecnt,(int)(s-aline)+1);
  1571             else
  1572                 cnt_word++;
  1573 	}
  1574         /*
  1575 	 * Special case - angled bracket in front of "From" placed there by an
  1576 	 * MTA when sending an e-mail.
  1577 	 */
  1578         if (strstr(aline,">From"))
  1579 	{
  1580             if (pswit[ECHO_SWITCH])
  1581 		printf("\n%s\n",aline);
  1582             if (!pswit[OVERVIEW_SWITCH])
  1583                 printf("    Line %ld column %d - "
  1584 		  "Query angled bracket with From\n",
  1585 		  linecnt,(int)(strstr(aline,">From")-aline)+1);
  1586             else
  1587                 cnt_punct++;
  1588 	}
  1589         /*
  1590 	 * Check for a single character line -
  1591 	 * often an overflow from bad wrapping.
  1592 	 */
  1593         if (*aline && !aline[1])
  1594 	{
  1595             if (*aline=='I' || *aline=='V' || *aline=='X' || *aline=='L' ||
  1596 	      gcisdigit(*aline))
  1597                 ; /* Nothing - ignore numerals alone on a line. */
  1598             else
  1599 	    {
  1600                 if (pswit[ECHO_SWITCH])
  1601 		    printf("\n%s\n",aline);
  1602                 if (!pswit[OVERVIEW_SWITCH])
  1603                     printf("    Line %ld column 1 - "
  1604 		      "Query single character line\n",linecnt);
  1605                 else
  1606                     cnt_punct++;
  1607 	    }
  1608 	}
  1609         /* Check for I" - often should be ! */
  1610         if (strstr(aline," I\""))
  1611 	{
  1612             if (pswit[ECHO_SWITCH])
  1613 		printf("\n%s\n",aline);
  1614             if (!pswit[OVERVIEW_SWITCH])
  1615                 printf("    Line %ld column %ld - Query I=exclamation mark?\n",
  1616 		  linecnt,strstr(aline," I\"")-aline);
  1617             else
  1618                 cnt_punct++;
  1619 	}
  1620         /*
  1621 	 * Check for period without a capital letter. Cut-down from gutspell.
  1622          * Only works when it happens on a single line.
  1623 	 */
  1624         if (pswit[PARANOID_SWITCH])
  1625 	{
  1626             for (t=s=aline;strstr(t,". ");)
  1627 	    {
  1628                 t=strstr(t,". ");
  1629                 if (t==s)
  1630 		{
  1631                     t++;
  1632 		    /* start of line punctuation is handled elsewhere */
  1633                     continue;
  1634 		}
  1635                 if (!gcisalpha(t[-1]))
  1636 		{
  1637                     t++;
  1638                     continue;
  1639 		}
  1640                 if (warnings->isDutch)
  1641 		{
  1642 		    /* For Frank & Jeroen -- 's Middags case */
  1643                     if (t[2]==CHAR_SQUOTE && t[3]>='a' && t[3]<='z' &&
  1644 		      t[4]==CHAR_SPACE && t[5]>='A' && t[5]<='Z')
  1645 		    {
  1646                         t++;
  1647                         continue;
  1648 		    }
  1649 		}
  1650                 s1=t+2;
  1651                 while (*s1 && !gcisalpha(*s1) && !isdigit(*s1))
  1652                     s1++;
  1653                 if (*s1>='a' && *s1<='z')
  1654 		{
  1655 		    /* we have something to investigate */
  1656                     istypo=1;
  1657 		    /* so let's go back and find out */
  1658                     for (s1=t-1;s1>=s &&
  1659 		      (gcisalpha(*s1) || gcisdigit(*s1) || *s1==CHAR_SQUOTE &&
  1660 		      gcisalpha(s1[1]) && gcisalpha(s1[-1]));s1--)
  1661 			;
  1662                     s1++;
  1663                     for (i=0;*s1 && *s1!='.';s1++,i++)
  1664                         testword[i]=*s1;
  1665                     testword[i]=0;
  1666                     for (i=0;*abbrev[i];i++)
  1667                         if (!strcmp(testword,abbrev[i]))
  1668                             istypo=0;
  1669                     if (gcisdigit(*testword))
  1670 			istypo=0;
  1671                     if (!testword[1])
  1672 			istypo=0;
  1673                     if (isroman(testword))
  1674 			istypo=0;
  1675                     if (istypo)
  1676 		    {
  1677                         istypo=0;
  1678                         for (i=0;testword[i];i++)
  1679                             if (strchr(vowels,testword[i]))
  1680                                 istypo=1;
  1681 		    }
  1682                     if (istypo)
  1683 		    {
  1684                         isdup=0;
  1685                         if (strlen(testword)<MAX_QWORD_LENGTH &&
  1686 			  !pswit[VERBOSE_SWITCH])
  1687                             for (i=0;i<qperiod_index;i++)
  1688                                 if (!strcmp(testword,qperiod[i]))
  1689                                     isdup=1;
  1690                         if (!isdup)
  1691 			{
  1692                             if (qperiod_index<MAX_QWORD &&
  1693 			      strlen(testword)<MAX_QWORD_LENGTH)
  1694 			    {
  1695                                 strcpy(qperiod[qperiod_index],testword);
  1696                                 qperiod_index++;
  1697 			    }
  1698                             if (pswit[ECHO_SWITCH])
  1699 				printf("\n%s\n",aline);
  1700                             if (!pswit[OVERVIEW_SWITCH])
  1701                                 printf("    Line %ld column %d - "
  1702 				  "Extra period?\n",linecnt,(int)(t-aline)+1);
  1703                             else
  1704                                 cnt_punct++;
  1705 			}
  1706 		    }
  1707 		}
  1708 	    t++;
  1709 	    }
  1710 	}
  1711         if (pswit[TYPO_SWITCH])
  1712 	{
  1713             /* Check for words usually not followed by punctuation. */
  1714             for (s=aline;*s;)
  1715 	    {
  1716                 wordstart=s;
  1717                 s=getaword(s,inword);
  1718                 if (!*inword)
  1719 		    continue;
  1720                 lowerit(inword);
  1721                 for (i=0;*nocomma[i];i++)
  1722                     if (!strcmp(inword,nocomma[i]))
  1723 		    {
  1724                         if (*s==',' || *s==';' || *s==':')
  1725 			{
  1726                             if (pswit[ECHO_SWITCH])
  1727 				printf("\n%s\n",aline);
  1728                             if (!pswit[OVERVIEW_SWITCH])
  1729                                 printf("    Line %ld column %d - "
  1730 				  "Query punctuation after %s?\n",
  1731 				  linecnt,(int)(s-aline)+1,inword);
  1732                             else
  1733                                 cnt_punct++;
  1734 			}
  1735 		    }
  1736 		for (i=0;*noperiod[i];i++)
  1737                     if (!strcmp(inword,noperiod[i]))
  1738 		    {
  1739                         if (*s=='.' || *s=='!')
  1740 			{
  1741                             if (pswit[ECHO_SWITCH])
  1742 				printf("\n%s\n",aline);
  1743                             if (!pswit[OVERVIEW_SWITCH])
  1744                                 printf("    Line %ld column %d - "
  1745 				  "Query punctuation after %s?\n",
  1746 				  linecnt,(int)(s-aline)+1,inword);
  1747                             else
  1748                                 cnt_punct++;
  1749 			}
  1750 		    }
  1751 	    }
  1752 	}
  1753         /*
  1754 	 * Check for commonly mistyped words,
  1755 	 * and digits like 0 for O in a word.
  1756 	 */
  1757         for (s=aline;*s;)
  1758 	{
  1759             wordstart=s;
  1760             s=getaword(s,inword);
  1761             if (!*inword)
  1762 		continue; /* don't bother with empty lines */
  1763             if (mixdigit(inword))
  1764 	    {
  1765                 if (pswit[ECHO_SWITCH])
  1766 		    printf("\n%s\n",aline);
  1767                 if (!pswit[OVERVIEW_SWITCH])
  1768                     printf("    Line %ld column %d - Query digit in %s\n",
  1769 		      linecnt,(int)(wordstart-aline)+1,inword);
  1770                 else
  1771                     cnt_word++;
  1772 	    }
  1773             /*
  1774 	     * Put the word through a series of tests for likely typos and OCR
  1775 	     * errors.
  1776 	     */
  1777             if (pswit[TYPO_SWITCH])
  1778 	    {
  1779                 istypo=0;
  1780                 strcpy(testword,inword);
  1781                 alower=0;
  1782                 for (i=0;i<(signed int)strlen(testword);i++)
  1783 		{
  1784 		    /* lowercase for testing */
  1785                     if (testword[i]>='a' && testword[i]<='z')
  1786 			alower=1;
  1787                     if (alower && testword[i]>='A' && testword[i]<='Z')
  1788 		    {
  1789                         /*
  1790 			 * We have an uppercase mid-word. However, there are
  1791 			 * common cases:
  1792                          *   Mac and Mc like McGill
  1793                          *   French contractions like l'Abbe
  1794 			 */
  1795                         if (i==2 && testword[0]=='m' && testword[1]=='c' ||
  1796                           i==3 && testword[0]=='m' && testword[1]=='a' &&
  1797 			  testword[2]=='c' || i>0 && testword[i-1]==CHAR_SQUOTE)
  1798 			    ; /* do nothing! */
  1799                         else
  1800                             istypo=1;
  1801 		    }
  1802                     testword[i]=(char)tolower(testword[i]);
  1803 		}
  1804                 /*
  1805 		 * Check for certain unlikely two-letter combinations at word
  1806 		 * start and end.
  1807 		 */
  1808                 if (strlen(testword)>1)
  1809 		{
  1810                     for (i=0;*nostart[i];i++)
  1811                         if (!strncmp(testword,nostart[i],2))
  1812                             istypo=1;
  1813                     for (i=0;*noend[i];i++)
  1814                         if (!strncmp(testword+strlen(testword)-2,noend[i],2))
  1815                             istypo=1;
  1816 		}
  1817                 /* ght is common, gbt never. Like that. */
  1818                 if (strstr(testword,"cb"))
  1819 		    istypo=1;
  1820                 if (strstr(testword,"gbt"))
  1821 		    istypo=1;
  1822                 if (strstr(testword,"pbt"))
  1823 		    istypo=1;
  1824                 if (strstr(testword,"tbs"))
  1825 		    istypo=1;
  1826                 if (strstr(testword,"mrn"))
  1827 		    istypo=1;
  1828                 if (strstr(testword,"ahle"))
  1829 		    istypo=1;
  1830                 if (strstr(testword,"ihle"))
  1831 		    istypo=1;
  1832                 /*
  1833 		 * "TBE" does happen - like HEARTBEAT - but uncommon.
  1834                  * Also "TBI" - frostbite, outbid - but uncommon.
  1835                  * Similarly "ii" like Hawaii, or Pompeii, and in Roman
  1836 		 * numerals, but "ii" is a common scanno.
  1837 		 */
  1838                 if (strstr(testword,"tbi"))
  1839 		    istypo=1;
  1840                 if (strstr(testword,"tbe"))
  1841 		    istypo=1;
  1842                 if (strstr(testword,"ii"))
  1843 		    istypo=1;
  1844                 /*
  1845 		 * Check for no vowels or no consonants.
  1846                  * If none, flag a typo.
  1847 		 */
  1848                 if (!istypo && strlen(testword)>1)
  1849 		{
  1850                     vowel=consonant=0;
  1851                     for (i=0;testword[i];i++)
  1852 		    {
  1853                         if (testword[i]=='y' || gcisdigit(testword[i]))
  1854 			{
  1855 			    /* Yah, this is loose. */
  1856                             vowel++;
  1857                             consonant++;
  1858 			}
  1859                         else if (strchr(vowels,testword[i]))
  1860 			    vowel++;
  1861 			else
  1862 			    consonant++;
  1863 		    }
  1864                     if (!vowel || !consonant)
  1865                         istypo=1;
  1866 		}
  1867                 /*
  1868 		 * Now exclude the word from being reported if it's in
  1869                  * the okword list.
  1870 		 */
  1871                 for (i=0;*okword[i];i++)
  1872                     if (!strcmp(testword,okword[i]))
  1873                         istypo=0;
  1874                 /*
  1875 		 * What looks like a typo may be a Roman numeral.
  1876 		 * Exclude these.
  1877 		 */
  1878                 if (istypo && isroman(testword))
  1879 		    istypo=0;
  1880                 /* Check the manual list of typos. */
  1881                 if (!istypo)
  1882                     for (i=0;*typo[i];i++)
  1883                         if (!strcmp(testword,typo[i]))
  1884                             istypo=1;
  1885                 /*
  1886 		 * Check lowercase s, l, i and m - special cases.
  1887                  *   "j" - often a semi-colon gone wrong.
  1888                  *   "d" for a missing apostrophe - he d
  1889                  *   "n" for "in"
  1890 		 */
  1891                 if (!istypo && strlen(testword)==1 && strchr("slmijdn",*inword))
  1892 		    istypo=1;
  1893                 if (istypo)
  1894 		{
  1895                     isdup=0;
  1896                     if (strlen(testword)<MAX_QWORD_LENGTH &&
  1897 		      !pswit[VERBOSE_SWITCH])
  1898                         for (i=0;i<qword_index;i++)
  1899                             if (!strcmp(testword,qword[i]))
  1900 			    {
  1901                                 isdup=1;
  1902                                 ++dupcnt[i];
  1903 			    }
  1904                     if (!isdup)
  1905 		    {
  1906                         if (qword_index<MAX_QWORD &&
  1907 			  strlen(testword)<MAX_QWORD_LENGTH)
  1908 			{
  1909                             strcpy(qword[qword_index],testword);
  1910                             qword_index++;
  1911 			}
  1912                         if (pswit[ECHO_SWITCH])
  1913 			    printf("\n%s\n",aline);
  1914                         if (!pswit[OVERVIEW_SWITCH])
  1915 			{
  1916                             printf("    Line %ld column %d - Query word %s",
  1917 			      linecnt,(int)(wordstart-aline)+1,inword);
  1918                             if (strlen(testword)<MAX_QWORD_LENGTH &&
  1919 			      !pswit[VERBOSE_SWITCH])
  1920                                 printf(" - not reporting duplicates");
  1921                             printf("\n");
  1922 			}
  1923                         else
  1924                             cnt_word++;
  1925 		    }
  1926 		}
  1927 	    }
  1928 	    /* check the user's list of typos */
  1929 	    if (!istypo && usertypo_count)
  1930 		for (i=0;i<usertypo_count;i++)
  1931 		    if (!strcmp(testword,usertypo[i]))
  1932 		    {
  1933 			if (pswit[ECHO_SWITCH])
  1934 			    printf("\n%s\n",aline);
  1935 			if (!pswit[OVERVIEW_SWITCH])  
  1936 			    printf("    Line %ld column %d - "
  1937 			      "Query possible scanno %s\n",
  1938 			      linecnt,(int)(wordstart-aline)+2,inword);
  1939 		    }
  1940             if (pswit[PARANOID_SWITCH] && warnings->digit)
  1941 	    {
  1942 		/* In paranoid mode, query all 0 and 1 standing alone. */
  1943                 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
  1944 		{
  1945                     if (pswit[ECHO_SWITCH])
  1946 			printf("\n%s\n",aline);
  1947                     if (!pswit[OVERVIEW_SWITCH])
  1948                         printf("    Line %ld column %d - Query standalone %s\n",
  1949 			  linecnt,(int)(wordstart-aline)+2,inword);
  1950                     else
  1951                         cnt_word++;
  1952 		}
  1953 	    }
  1954 	}
  1955 	/*
  1956          * Look for added or missing spaces around punctuation and quotes.
  1957          * If there is a punctuation character like ! with no space on
  1958          * either side, suspect a missing!space. If there are spaces on
  1959          * both sides , assume a typo. If we see a double quote with no
  1960          * space or punctuation on either side of it, assume unspaced
  1961          * quotes "like"this.
  1962 	 */
  1963         llen=strlen(aline);
  1964         for (i=1;i<llen;i++)
  1965 	{
  1966 	    /* For each character in the line after the first. */
  1967             if (strchr(".?!,;:_",aline[i]))  /* if it's punctuation */
  1968 	    {
  1969 		/* we need to suppress warnings for acronyms like M.D. */
  1970                 isacro=0;
  1971 		/* we need to suppress warnings for ellipsis . . . */
  1972                 isellipsis=0;
  1973 		/* if there are letters on both sides of it or ... */
  1974                 if (gcisalpha(aline[i-1]) && gcisalpha(aline[i+1]) ||
  1975                    gcisalpha(aline[i+1]) && strchr("?!,;:",aline[i]))
  1976 		{
  1977 		    /* ...if it's strict punctuation followed by an alpha */
  1978                     if (aline[i]=='.')
  1979 		    {
  1980                         if (i>2 && aline[i-2]=='.')
  1981 			    isacro=1;
  1982                         if (i+2<llen && aline[i+2]=='.')
  1983 			    isacro=1;
  1984 		    }
  1985                     if (!isacro)
  1986 		    {
  1987                         if (pswit[ECHO_SWITCH])
  1988 			    printf("\n%s\n",aline);
  1989                         if (!pswit[OVERVIEW_SWITCH])
  1990                             printf("    Line %ld column %d - Missing space?\n",
  1991 			      linecnt,i+1);
  1992                         else
  1993                             cnt_punct++;
  1994 		    }
  1995 		}
  1996                 if (aline[i-1]==CHAR_SPACE &&
  1997 		  (aline[i+1]==CHAR_SPACE || aline[i+1]==0))
  1998 		{
  1999 		    /*
  2000 		     * If there are spaces on both sides,
  2001 		     * or space before and end of line.
  2002 		     */
  2003                     if (aline[i]=='.')
  2004 		    {
  2005                         if (i>2 && aline[i-2]=='.')
  2006 			    isellipsis=1;
  2007                         if (i+2<llen && aline[i+2]=='.')
  2008 			    isellipsis=1;
  2009 		    }
  2010                     if (!isemptyline && !isellipsis)
  2011 		    {
  2012                         if (pswit[ECHO_SWITCH])
  2013 			    printf("\n%s\n",aline);
  2014                         if (!pswit[OVERVIEW_SWITCH])
  2015                             printf("    Line %ld column %d - "
  2016 			      "Spaced punctuation?\n",linecnt,i+1);
  2017                         else
  2018                             cnt_punct++;
  2019 		    }
  2020 		}
  2021 	    }
  2022 	}
  2023         /* Split out the characters that CANNOT be preceded by space. */
  2024         llen=strlen(aline);
  2025         for (i=1;i<llen;i++)
  2026 	{
  2027 	    /* for each character in the line after the first */
  2028             if (strchr("?!,;:",aline[i]))
  2029 	    {
  2030 		/* if it's punctuation that _cannot_ have a space before it */
  2031                 if (aline[i-1]==CHAR_SPACE && !isemptyline &&
  2032 		  aline[i+1]!=CHAR_SPACE)
  2033 		{
  2034 		    /*
  2035 		     * If aline[i+1] DOES == space,
  2036 		     * it was already reported just above.
  2037 		     */
  2038                     if (pswit[ECHO_SWITCH])
  2039 			printf("\n%s\n",aline);
  2040                     if (!pswit[OVERVIEW_SWITCH])
  2041                         printf("    Line %ld column %d - Spaced punctuation?\n",
  2042 			  linecnt,i+1);
  2043                     else
  2044                         cnt_punct++;
  2045 		}
  2046 	    }
  2047 	}
  2048         /*
  2049 	 * Special case " .X" where X is any alpha.
  2050          * This plugs a hole in the acronym code above.
  2051 	 * Inelegant, but maintainable.
  2052 	 */
  2053         llen=strlen(aline);
  2054         for (i=1;i<llen;i++)
  2055 	{
  2056 	    /* for each character in the line after the first */
  2057             if (aline[i]=='.')
  2058 	    {
  2059 		/* if it's a period */
  2060                 if (aline[i-1]==CHAR_SPACE && gcisalpha(aline[i+1]))
  2061 		{
  2062 		    /*
  2063 		     * If the period follows a space and
  2064 		     * is followed by a letter.
  2065 		     */
  2066                     if (pswit[ECHO_SWITCH])
  2067 			printf("\n%s\n",aline);
  2068                     if (!pswit[OVERVIEW_SWITCH])
  2069                         printf("    Line %ld column %d - Spaced punctuation?\n",
  2070 			  linecnt,i+1);
  2071                     else
  2072                         cnt_punct++;
  2073 		}
  2074 	    }
  2075 	}
  2076         for (i=1;i<llen;i++)
  2077 	{
  2078 	    /* for each character in the line after the first */
  2079             if (aline[i]==CHAR_DQUOTE)
  2080 	    {
  2081                 if (!strchr(" _-.'`,;:!/([{?}])",aline[i-1]) &&
  2082 		  !strchr(" _-.'`,;:!/([{?}])",aline[i+1]) && aline[i+1] ||
  2083 		  !strchr(" _-([{'`",aline[i-1]) && gcisalpha(aline[i+1]))
  2084 		{
  2085 		    if (pswit[ECHO_SWITCH])
  2086 			printf("\n%s\n",aline);
  2087 		    if (!pswit[OVERVIEW_SWITCH])
  2088 			printf("    Line %ld column %d - Unspaced quotes?\n",
  2089 			  linecnt,i+1);
  2090 		    else
  2091 			cnt_punct++;
  2092 		}
  2093 	    }
  2094 	}
  2095         /* Check parity of quotes. */
  2096         for (s=aline;*s;s++)
  2097 	{
  2098             if (*s==CHAR_DQUOTE)
  2099 	    {
  2100                 if (!(dquotepar=!dquotepar))
  2101 		{
  2102 		    /* parity even */
  2103                     if (!strchr("_-.'`/,;:!?)]} ",s[1]))
  2104 		    {
  2105                         if (pswit[ECHO_SWITCH])
  2106 			    printf("\n%s\n",aline);
  2107                         if (!pswit[OVERVIEW_SWITCH])
  2108                             printf("    Line %ld column %d - "
  2109 			      "Wrongspaced quotes?\n",linecnt,(int)(s-aline)+1);
  2110                         else
  2111                             cnt_punct++;
  2112 		    }
  2113 		}
  2114                 else
  2115 		{
  2116 		    /* parity odd */
  2117                     if (!gcisalpha(s[1]) && !isdigit(s[1]) &&
  2118 		      !strchr("_-/.'`([{$",s[1]) || !s[1])
  2119 		    {
  2120                         if (pswit[ECHO_SWITCH])
  2121 			    printf("\n%s\n",aline);
  2122                         if (!pswit[OVERVIEW_SWITCH])
  2123                             printf("    Line %ld column %d - "
  2124 			      "Wrongspaced quotes?\n",linecnt,(int)(s-aline)+1);
  2125                         else
  2126                             cnt_punct++;
  2127 		    }
  2128 		}
  2129 	    }
  2130 	}
  2131 	if (*aline==CHAR_DQUOTE)
  2132 	{
  2133 	    if (strchr(",;:!?)]} ",aline[1]))
  2134 	    {
  2135 		if (pswit[ECHO_SWITCH])
  2136 		    printf("\n%s\n",aline);
  2137 		if (!pswit[OVERVIEW_SWITCH])
  2138 		    printf("    Line %ld column 1 - Wrongspaced quotes?\n",
  2139 		      linecnt);
  2140 		else
  2141 		    cnt_punct++;
  2142 	    }
  2143 	}
  2144         if (pswit[SQUOTE_SWITCH])
  2145 	{
  2146             for (s=aline;*s;s++)
  2147 	    {
  2148                 if ((*s==CHAR_SQUOTE || *s==CHAR_OPEN_SQUOTE) &&
  2149 		  (s==aline || s>aline && !gcisalpha(s[-1]) ||
  2150 		  !gcisalpha(s[1])))
  2151 		{
  2152                     if (!(squotepar=!squotepar))
  2153 		    {
  2154 			/* parity even */
  2155                         if (!strchr("_-.'`/\",;:!?)]} ",s[1]))
  2156 			{
  2157                             if (pswit[ECHO_SWITCH])
  2158 				printf("\n%s\n",aline);
  2159                             if (!pswit[OVERVIEW_SWITCH])
  2160                                 printf("    Line %ld column %d - "
  2161 				  "Wrongspaced singlequotes?\n",
  2162 				  linecnt,(int)(s-aline)+1);
  2163                             else
  2164                                 cnt_punct++;
  2165 			}
  2166 		    }
  2167                     else
  2168 		    {
  2169 			/* parity odd */
  2170                         if (!gcisalpha(s[1]) && !isdigit(s[1]) &&
  2171 			  !strchr("_-/\".'`",s[1]) || !s[1])
  2172 			{
  2173                             if (pswit[ECHO_SWITCH])
  2174 				printf("\n%s\n",aline);
  2175                             if (!pswit[OVERVIEW_SWITCH])
  2176                                 printf("    Line %ld column %d - "
  2177 				  "Wrongspaced singlequotes?\n",
  2178 				  linecnt,(int)(s-aline)+1);
  2179                             else
  2180                                 cnt_punct++;
  2181 			}
  2182 		    }
  2183 		}
  2184 	    }
  2185 	}
  2186         /*
  2187 	 * Look for double punctuation like ,. or ,,
  2188          * Thanks to DW for the suggestion!
  2189          * In books with references, ".," and ".;" are common
  2190          * e.g. "etc., etc.," and vol. 1.; vol 3.;
  2191          * OTOH, from my initial tests, there are also fairly
  2192          * common errors. What to do? Make these cases paranoid?
  2193          * ".," is the most common, so warnings->dotcomma is used
  2194          * to suppress detailed reporting if it occurs often.
  2195 	 */
  2196         llen=strlen(aline);
  2197         for (i=0;i<llen;i++)
  2198 	{
  2199 	    /* for each punctuation character in the line */
  2200             if (strchr(".?!,;:",aline[i]) && (strchr(".?!,;:",aline[i+1])) &&
  2201 	      aline[i] && aline[i+1])
  2202 	    {
  2203 		/* followed by punctuation, it's a query, unless . . . */
  2204                 if (aline[i]==aline[i+1] && (aline[i]=='.' || aline[i]=='?' ||
  2205 		  aline[i]=='!') ||
  2206 		  !warnings->dotcomma && aline[i]=='.' && aline[i+1]==',' ||
  2207 		  warnings->isFrench && !strncmp(aline+i,",...",4) ||
  2208 		  warnings->isFrench && !strncmp(aline+i,"...,",4) ||
  2209 		  warnings->isFrench && !strncmp(aline+i,";...",4) ||
  2210 		  warnings->isFrench && !strncmp(aline+i,"...;",4) ||
  2211 		  warnings->isFrench && !strncmp(aline+i,":...",4) ||
  2212 		  warnings->isFrench && !strncmp(aline+i,"...:",4) ||
  2213 		  warnings->isFrench && !strncmp(aline+i,"!...",4) ||
  2214 		  warnings->isFrench && !strncmp(aline+i,"...!",4) ||
  2215 		  warnings->isFrench && !strncmp(aline+i,"?...",4) ||
  2216 		  warnings->isFrench && !strncmp(aline+i,"...?",4))
  2217 		{
  2218 		    if (warnings->isFrench && !strncmp(aline+i,",...",4) ||
  2219 		      warnings->isFrench && !strncmp(aline+i,"...,",4) ||
  2220 		      warnings->isFrench && !strncmp(aline+i,";...",4) ||
  2221 		      warnings->isFrench && !strncmp(aline+i,"...;",4) ||
  2222 		      warnings->isFrench && !strncmp(aline+i,":...",4) ||
  2223 		      warnings->isFrench && !strncmp(aline+i,"...:",4) ||
  2224 		      warnings->isFrench && !strncmp(aline+i,"!...",4) ||
  2225 		      warnings->isFrench && !strncmp(aline+i,"...!",4) ||
  2226 		      warnings->isFrench && !strncmp(aline+i,"?...",4) ||
  2227 		      warnings->isFrench && !strncmp(aline+i,"...?",4))
  2228 			i+=4;
  2229 		    ; /* do nothing for .. !! and ?? which can be legit */
  2230 		}
  2231                 else
  2232 		{
  2233                     if (pswit[ECHO_SWITCH])
  2234 			printf("\n%s\n",aline);
  2235                     if (!pswit[OVERVIEW_SWITCH])
  2236                         printf("    Line %ld column %d - Double punctuation?\n",
  2237 			  linecnt,i+1);
  2238                     else
  2239                         cnt_punct++;
  2240 		}
  2241 	    }
  2242 	}
  2243         s=aline;
  2244         while (strstr(s," \" "))
  2245 	{
  2246             if (pswit[ECHO_SWITCH])
  2247 		printf("\n%s\n",aline);
  2248             if (!pswit[OVERVIEW_SWITCH])
  2249                 printf("    Line %ld column %d - Spaced doublequote?\n",
  2250 		  linecnt,(int)(strstr(s," \" ")-aline+1));
  2251             else
  2252                 cnt_punct++;
  2253             s=strstr(s," \" ")+2;
  2254 	}
  2255         s=aline;
  2256         while (strstr(s," ' "))
  2257 	{
  2258             if (pswit[ECHO_SWITCH])
  2259 		printf("\n%s\n",aline);
  2260             if (!pswit[OVERVIEW_SWITCH])
  2261                 printf("    Line %ld column %d - Spaced singlequote?\n",
  2262 		  linecnt,(int)(strstr(s," ' ")-aline+1));
  2263             else
  2264                 cnt_punct++;
  2265             s=strstr(s," ' ")+2;
  2266 	}
  2267         s=aline;
  2268         while (strstr(s," ` "))
  2269 	{
  2270             if (pswit[ECHO_SWITCH])
  2271 		printf("\n%s\n",aline);
  2272             if (!pswit[OVERVIEW_SWITCH])
  2273                 printf("    Line %ld column %d - Spaced singlequote?\n",
  2274 		  linecnt,(int)(strstr(s," ` ")-aline+1));
  2275             else
  2276                 cnt_punct++;
  2277             s=strstr(s," ` ")+2;
  2278 	}
  2279         /* check special case of 'S instead of 's at end of word */
  2280         s=aline+1;
  2281         while (*s)
  2282 	{
  2283             if (*s==CHAR_SQUOTE && s[1]=='S' && s[-1]>='a' && s[-1]<='z')
  2284 	    {
  2285                 if (pswit[ECHO_SWITCH])
  2286 		    printf("\n%s\n",aline);
  2287                 if (!pswit[OVERVIEW_SWITCH])
  2288                     printf("    Line %ld column %d - Capital \"S\"?\n",
  2289 		      linecnt,(int)(s-aline+2));
  2290                 else
  2291                     cnt_punct++;
  2292 	    }
  2293             s++;
  2294 	}
  2295         /*
  2296 	 * Now check special cases - start and end of line -
  2297          * for single and double quotes. Start is sometimes [sic]
  2298          * but better to query it anyway.
  2299          * While we're here, check for dash at end of line.
  2300 	 */
  2301         llen=strlen(aline);
  2302         if (llen>1)
  2303 	{
  2304             if (aline[llen-1]==CHAR_DQUOTE || aline[llen-1]==CHAR_SQUOTE ||
  2305 	      aline[llen-1]==CHAR_OPEN_SQUOTE)
  2306                 if (aline[llen-2]==CHAR_SPACE)
  2307 		{
  2308                     if (pswit[ECHO_SWITCH])
  2309 			printf("\n%s\n",aline);
  2310                     if (!pswit[OVERVIEW_SWITCH])
  2311                         printf("    Line %ld column %d - Spaced quote?\n",
  2312 			  linecnt,llen);
  2313                     else
  2314                         cnt_punct++;
  2315 		}
  2316             if ((aline[0]==CHAR_SQUOTE || aline[0]==CHAR_OPEN_SQUOTE) &&
  2317 	      aline[1]==CHAR_SPACE)
  2318 	    {
  2319 		if (pswit[ECHO_SWITCH])
  2320 		    printf("\n%s\n",aline);
  2321 		if (!pswit[OVERVIEW_SWITCH])
  2322 		    printf("    Line %ld column 1 - Spaced quote?\n",linecnt);
  2323 		else
  2324 		    cnt_punct++;
  2325 	    }
  2326             /*
  2327 	     * Dash at end of line may well be legit - paranoid mode only
  2328              * and don't report em-dash at line-end.
  2329 	     */
  2330             if (pswit[PARANOID_SWITCH] && warnings->hyphen)
  2331 	    {
  2332                 for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--)
  2333 		    ;
  2334                 if (aline[i]=='-' && aline[i-1]!='-')
  2335 		{
  2336                     if (pswit[ECHO_SWITCH])
  2337 			printf("\n%s\n",aline);
  2338                     if (!pswit[OVERVIEW_SWITCH])
  2339                         printf("    Line %ld column %d - "
  2340 			  "Hyphen at end of line?\n",linecnt,i);
  2341 		}
  2342 	    }
  2343 	}
  2344         /*
  2345 	 * Brackets are often unspaced, but shouldn't be surrounded by alpha.
  2346          * If so, suspect a scanno like "a]most".
  2347 	 */
  2348         llen=strlen(aline);
  2349         for (i=1;i<llen-1;i++)
  2350 	{
  2351 	    /* for each bracket character in the line except 1st & last */
  2352             if (strchr("{[()]}",aline[i]) && gcisalpha(aline[i-1]) &&
  2353 	      gcisalpha(aline[i+1]))
  2354 	    {
  2355                 if (pswit[ECHO_SWITCH])
  2356 		    printf("\n%s\n",aline);
  2357                 if (!pswit[OVERVIEW_SWITCH])
  2358                     printf("    Line %ld column %d - Unspaced bracket?\n",
  2359 		      linecnt,i);
  2360                 else
  2361                     cnt_punct++;
  2362 	    }
  2363 	}
  2364         llen=strlen(aline);
  2365         if (warnings->endquote)
  2366 	{
  2367             for (i=1;i<llen;i++)
  2368 	    {
  2369 		/* for each character in the line except 1st */
  2370                 if (aline[i]==CHAR_DQUOTE && isalpha(aline[i-1]))
  2371 		{
  2372 		    if (pswit[ECHO_SWITCH])
  2373 			printf("\n%s\n",aline);
  2374 		    if (!pswit[OVERVIEW_SWITCH])
  2375 			printf("    Line %ld column %d - "
  2376 			  "endquote missing punctuation?\n",linecnt,i);
  2377 		    else
  2378 			cnt_punct++;
  2379 		}
  2380 	    }
  2381 	}
  2382 	/*
  2383          * Check for <HTML TAG>.
  2384          * If there is a < in the line, followed at some point
  2385          * by a > then we suspect HTML.
  2386 	 */
  2387         if (strstr(aline,"<") && strstr(aline,">"))
  2388 	{
  2389             i=(signed int)(strstr(aline,">")-strstr(aline,"<")+1);
  2390             if (i>0)
  2391 	    {
  2392                 strncpy(wrk,strstr(aline,"<"),i);
  2393                 wrk[i]=0;
  2394                 if (pswit[ECHO_SWITCH])
  2395 		    printf("\n%s\n",aline);
  2396                 if (!pswit[OVERVIEW_SWITCH])
  2397                     printf("    Line %ld column %d - HTML Tag? %s \n",
  2398 		      linecnt,(int)(strstr(aline,"<")-aline)+1,wrk);
  2399                 else
  2400                     cnt_html++;
  2401 	    }
  2402 	}
  2403         /*
  2404 	 * Check for &symbol; HTML.
  2405          * If there is a & in the line, followed at
  2406          * some point by a ; then we suspect HTML.
  2407 	 */
  2408         if (strstr(aline,"&") && strstr(aline,";"))
  2409 	{
  2410             i=(int)(strstr(aline,";")-strstr(aline,"&")+1);
  2411             for (s=strstr(aline,"&");s<strstr(aline,";");s++)   
  2412                 if (*s==CHAR_SPACE)
  2413 		    i=0;                /* Don't report "Jones & Son;" */
  2414             if (i>0)
  2415 	    {
  2416                 strncpy(wrk,strstr(aline,"&"),i);
  2417                 wrk[i]=0;
  2418                 if (pswit[ECHO_SWITCH])
  2419 		    printf("\n%s\n",aline);
  2420                 if (!pswit[OVERVIEW_SWITCH])
  2421                     printf("    Line %ld column %d - HTML symbol? %s \n",
  2422 		      linecnt,(int)(strstr(aline,"&")-aline)+1,wrk);
  2423                 else
  2424                     cnt_html++;
  2425 	    }
  2426 	}
  2427         /*
  2428 	 * At end of paragraph, check for mismatched quotes.
  2429          * We don't want to report an error immediately, since it is a
  2430          * common convention to omit the quotes at end of paragraph if
  2431          * the next paragraph is a continuation of the same speaker.
  2432          * Where this is the case, the next para should begin with a
  2433          * quote, so we store the warning message and only display it
  2434          * at the top of the next iteration if the new para doesn't
  2435          * start with a quote.
  2436          * The -p switch overrides this default, and warns of unclosed
  2437          * quotes on _every_ paragraph, whether the next begins with a
  2438          * quote or not.
  2439 	 */
  2440         if (isemptyline)
  2441 	{
  2442 	    /* end of para - add up the totals */
  2443             if (counters.quot%2)
  2444                 sprintf(dquote_err,"    Line %ld - Mismatched quotes\n",
  2445 		  linecnt);
  2446             if (pswit[SQUOTE_SWITCH] && counters.open_single_quote &&
  2447 	      counters.open_single_quote!=counters.close_single_quote)
  2448                 sprintf(squote_err,"    Line %ld - Mismatched singlequotes?\n",
  2449 		  linecnt);
  2450             if (pswit[SQUOTE_SWITCH] && counters.open_single_quote &&
  2451 	      counters.open_single_quote!=counters.close_single_quote &&
  2452 	      counters.open_single_quote!=counters.close_single_quote+1)
  2453 		/*
  2454 		 * Flag it to be noted regardless of the
  2455 		 * first char of the next para.
  2456 		 */
  2457                 squot=1;
  2458             if (counters.r_brack)
  2459                 sprintf(rbrack_err,"    Line %ld - "
  2460 		  "Mismatched round brackets?\n",linecnt);
  2461             if (counters.s_brack)
  2462                 sprintf(sbrack_err,"    Line %ld - "
  2463 		  "Mismatched square brackets?\n",linecnt);
  2464             if (counters.c_brack)
  2465                 sprintf(cbrack_err,"    Line %ld - "
  2466 		  "Mismatched curly brackets?\n",linecnt);
  2467             if (counters.c_unders%2)
  2468                 sprintf(unders_err,"    Line %ld - Mismatched underscores?\n",
  2469 		  linecnt);
  2470 	    memset(&counters,0,sizeof(counters));
  2471 	    /* let the next iteration know that it's starting a new para */
  2472             isnewpara=1;
  2473 	}
  2474         /*
  2475 	 * Check for omitted punctuation at end of paragraph by working back
  2476 	 * through prevline. DW.
  2477          * Need to check this only for "normal" paras.
  2478          * So what is a "normal" para?
  2479          *    Not normal if one-liner (chapter headings, etc.)
  2480          *    Not normal if doesn't contain at least one locase letter
  2481          *    Not normal if starts with space
  2482 	 */
  2483         if (isemptyline)
  2484 	{
  2485 	    /* end of para */
  2486             for (s=prevline,i=0;*s && !i;s++)
  2487                 if (gcisletter(*s))
  2488 		    /* use i to indicate the presence of a letter on the line */
  2489                     i=1;
  2490             /*
  2491 	     * This next "if" is a problem.
  2492              * If we say "start_para_line <= linecnt - 1", that includes
  2493 	     * one-line "paragraphs" like chapter heads. Lotsa false positives.
  2494              * If we say "start_para_line < linecnt - 1" it doesn't, but then it
  2495              * misses genuine one-line paragraphs.
  2496 	     */
  2497             if (i && last.blen>2 && start_para_line<linecnt-1 &&
  2498 	      *prevline>CHAR_SPACE)
  2499 	    {
  2500                 for (i=strlen(prevline)-1;
  2501 		  (prevline[i]==CHAR_DQUOTE || prevline[i]==CHAR_SQUOTE) &&
  2502 		  prevline[i]>CHAR_SPACE && i>0;
  2503 		  i--)
  2504 		    ;
  2505                 for (;i>0;i--)
  2506 		{
  2507                     if (gcisalpha(prevline[i]))
  2508 		    {
  2509                         if (pswit[ECHO_SWITCH])
  2510 			    printf("\n%s\n",prevline);
  2511                         if (!pswit[OVERVIEW_SWITCH])
  2512                             printf("    Line %ld column %d - "
  2513 			      "No punctuation at para end?\n",
  2514 			      linecnt-1,strlen(prevline));
  2515                         else
  2516                             cnt_punct++;
  2517                         break;
  2518 		    }
  2519                     if (strchr("-.:!([{?}])",prevline[i]))
  2520                         break;
  2521 		}
  2522 	    }
  2523 	}
  2524         strcpy(prevline,aline);
  2525     }
  2526     fclose(infile);
  2527     if (!pswit[OVERVIEW_SWITCH])
  2528         for (i=0;i<MAX_QWORD;i++)
  2529             if (dupcnt[i])
  2530                 printf("\nNote: Queried word %s was duplicated %d time%s\n",
  2531 		  qword[i],dupcnt[i],"s");
  2532 }
  2533 
  2534 /*
  2535  * flgets:
  2536  *
  2537  * Get one line from the input stream, checking for
  2538  * the existence of exactly one CR/LF line-end per line.
  2539  *
  2540  * Returns: a pointer to the line.
  2541  */
  2542 char *flgets(char *theline,int maxlen,FILE *thefile,long lcnt)
  2543 {
  2544     char c;
  2545     int len,isCR,cint;
  2546     *theline=0;
  2547     len=isCR=0;
  2548     c=cint=fgetc(thefile);
  2549     do
  2550     {
  2551         if (cint==EOF)
  2552             return NULL;
  2553 	/* either way, it's end of line */
  2554         if (c==10)
  2555 	{
  2556             if (isCR)
  2557                 break;
  2558             else
  2559 	    {
  2560 		/* Error - a LF without a preceding CR */
  2561                 if (pswit[LINE_END_SWITCH])
  2562 		{
  2563                     if (pswit[ECHO_SWITCH])
  2564 			printf("\n%s\n",theline);
  2565                     if (!pswit[OVERVIEW_SWITCH])
  2566                         printf("    Line %ld - No CR?\n",lcnt);
  2567                     else
  2568                         cnt_lineend++;
  2569 		}
  2570                 break;
  2571 	    }
  2572 	}
  2573         if (c==13)
  2574 	{
  2575             if (isCR)
  2576 	    {
  2577 		/* Error - two successive CRs */
  2578                 if (pswit[LINE_END_SWITCH])
  2579 		{
  2580                     if (pswit[ECHO_SWITCH])
  2581 			printf("\n%s\n",theline);
  2582                     if (!pswit[OVERVIEW_SWITCH])
  2583                         printf("    Line %ld - Two successive CRs?\n",lcnt);
  2584                     else
  2585                         cnt_lineend++;
  2586 		}
  2587 	    }
  2588             isCR=1;
  2589 	}
  2590         else
  2591 	{
  2592             if (pswit[LINE_END_SWITCH] && isCR)
  2593 	    {
  2594                 if (pswit[ECHO_SWITCH])
  2595 		    printf("\n%s\n",theline);
  2596                 if (!pswit[OVERVIEW_SWITCH])
  2597                     printf("    Line %ld column %d - CR without LF?\n",
  2598 		      lcnt,len+1);
  2599                 else
  2600                     cnt_lineend++;
  2601 	    }
  2602             theline[len]=c;
  2603             len++;
  2604             theline[len]=0;
  2605             isCR=0;
  2606 	}
  2607         c=cint=fgetc(thefile);
  2608     } while(len<maxlen);
  2609     if (pswit[MARKUP_SWITCH])  
  2610         postprocess_for_HTML(theline);
  2611     if (pswit[DP_SWITCH])  
  2612         postprocess_for_DP(theline);
  2613     return theline;
  2614 }
  2615 
  2616 /*
  2617  * mixdigit:
  2618  *
  2619  * Takes a "word" as a parameter, and checks whether it
  2620  * contains a mixture of alpha and digits. Generally, this is an
  2621  * error, but may not be for cases like 4th or L5 12s. 3d.
  2622  *
  2623  * Returns: 0 if no error found, 1 if error.
  2624  */
  2625 int mixdigit(char *checkword)
  2626 {
  2627     int wehaveadigit,wehavealetter,firstdigits,query,wl;
  2628     char *s;
  2629     wehaveadigit=wehavealetter=query=0;
  2630     for (s=checkword;*s;s++)
  2631         if (gcisalpha(*s))
  2632             wehavealetter=1;
  2633         else
  2634             if (gcisdigit(*s))
  2635                 wehaveadigit=1;
  2636     if (wehaveadigit && wehavealetter)
  2637     {
  2638 	/* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
  2639         query=1;
  2640         wl=strlen(checkword);
  2641         for (firstdigits=0;gcisdigit(checkword[firstdigits]);firstdigits++)
  2642             ;
  2643         /* digits, ending in st, rd, nd, th of either case */
  2644         if (firstdigits+2==wl && (matchword(checkword+wl-2,"st") ||
  2645 	  matchword(checkword+wl-2,"rd") || matchword(checkword+wl-2,"nd") ||
  2646 	  matchword(checkword+wl-2,"th")))
  2647 	    query=0;
  2648         if (firstdigits+3==wl && (matchword(checkword+wl-3,"sts") ||
  2649 	  matchword(checkword+wl-3,"rds") || matchword(checkword+wl-3,"nds") ||
  2650 	  matchword(checkword+wl-3,"ths")))
  2651 	    query=0;
  2652         if (firstdigits+3==wl && (matchword(checkword+wl-4,"stly") ||
  2653 	  matchword(checkword+wl-4,"rdly") ||
  2654 	  matchword(checkword+wl-4,"ndly") || matchword(checkword+wl-4,"thly")))
  2655 	    query=0;
  2656         /* digits, ending in l, L, s or d */
  2657         if (firstdigits+1==wl && (checkword[wl-1]=='l' ||
  2658 	  checkword[wl-1]=='L' || checkword[wl-1]=='s' || checkword[wl-1]=='d'))
  2659 	    query=0;
  2660         /*
  2661 	 * L at the start of a number, representing Britsh pounds, like L500.
  2662          * This is cute. We know the current word is mixeddigit. If the first
  2663          * letter is L, there must be at least one digit following. If both
  2664          * digits and letters follow, we have a genuine error, else we have a
  2665          * capital L followed by digits, and we accept that as a non-error.
  2666 	 */
  2667         if (checkword[0]=='L' && !mixdigit(checkword+1))
  2668 	    query=0;
  2669     }
  2670     return query;
  2671 }
  2672 
  2673 /*
  2674  * getaword:
  2675  *
  2676  * Extracts the first/next "word" from the line, and puts
  2677  * it into "thisword". A word is defined as one English word unit--or
  2678  * at least that's the aim.
  2679  *
  2680  * Returns: a pointer to the position in the line where we will start
  2681  *          looking for the next word.
  2682  */
  2683 char *getaword(char *fromline,char *thisword)
  2684 {
  2685     int i,wordlen;
  2686     char *s;
  2687     wordlen=0;
  2688     for (;!gcisdigit(*fromline) && !gcisalpha(*fromline) && *fromline;
  2689       fromline++)
  2690 	;
  2691     /*
  2692      * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
  2693      * Especially yucky is the case of L1,000
  2694      * This section looks for a pattern of characters including a digit
  2695      * followed by a comma or period followed by one or more digits.
  2696      * If found, it returns this whole pattern as a word; otherwise we discard
  2697      * the results and resume our normal programming.
  2698      */
  2699     s=fromline;
  2700     for (;(gcisdigit(*s) || gcisalpha(*s) || *s==',' || *s=='.') &&
  2701       wordlen<MAXWORDLEN;s++)
  2702     {
  2703 	thisword[wordlen]=*s;
  2704         wordlen++;
  2705     }
  2706     thisword[wordlen]=0;
  2707     for (i=1;i<wordlen-1;i++)
  2708     {
  2709         if (thisword[i]=='.' || thisword[i]==',')
  2710 	{
  2711             if (gcisdigit(thisword[i-1]) && gcisdigit(thisword[i-1]))
  2712 	    {
  2713                 fromline=s;
  2714                 return fromline;
  2715 	    }
  2716 	}
  2717     }
  2718     /* we didn't find a punctuated number - do the regular getword thing */
  2719     wordlen=0;
  2720     for (;(gcisdigit(*fromline) || gcisalpha(*fromline) || *fromline=='\'') &&
  2721       wordlen<MAXWORDLEN;fromline++)
  2722     {
  2723         thisword[wordlen]=*fromline;
  2724         wordlen++;
  2725     }
  2726     thisword[wordlen]=0;
  2727     return fromline;
  2728 }
  2729 
  2730 /*
  2731  * matchword:
  2732  *
  2733  * A case-insensitive string matcher.
  2734  */
  2735 int matchword(char *checkfor,char *thisword)
  2736 {
  2737     unsigned int ismatch,i;
  2738     if (strlen(checkfor)!=strlen(thisword))
  2739 	return 0;
  2740     ismatch=1;     /* assume a match until we find a difference */
  2741     for (i=0;i<strlen(checkfor);i++)
  2742         if (toupper(checkfor[i])!=toupper(thisword[i]))
  2743             ismatch=0;
  2744     return ismatch;
  2745 }
  2746 
  2747 /*
  2748  * lowerit:
  2749  *
  2750  * Lowercase the line.
  2751  */
  2752 
  2753 void lowerit(char *theline)
  2754 {
  2755     for (;*theline;theline++)
  2756         if (*theline>='A' && *theline<='Z')
  2757             *theline+=32;
  2758 }
  2759 
  2760 /*
  2761  * isroman:
  2762  *
  2763  * Is this word a Roman Numeral?
  2764  *
  2765  * It doesn't actually validate that the number is a valid Roman Numeral--for
  2766  * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
  2767  * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
  2768  * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
  2769  * expressions thereof, except when it came to taxes. Allow any number of M,
  2770  * an optional D, an optional CM or CD, any number of optional Cs, an optional
  2771  * XL or an optional XC, an optional IX or IV, an optional V and any number
  2772  * of optional Is.
  2773  */
  2774 int isroman(char *t)
  2775 {
  2776     char *s;
  2777     if (!t || !*t)
  2778 	return 0;
  2779     s=t;
  2780     while (*t=='m' && *t)
  2781 	t++;
  2782     if (*t=='d')
  2783 	t++;
  2784     if (*t=='c' && t[1]=='m')
  2785 	t+=2;
  2786     if (*t=='c' && t[1]=='d')
  2787 	t+=2;
  2788     while (*t=='c' && *t)
  2789 	t++;
  2790     if (*t=='x' && t[1]=='l')
  2791 	t+=2;
  2792     if (*t=='x' && t[1]=='c')
  2793 	t+=2;
  2794     if (*t=='l')
  2795 	t++;
  2796     while (*t=='x' && *t)
  2797 	t++;
  2798     if (*t=='i' && t[1]=='x')
  2799 	t+=2;
  2800     if (*t=='i' && t[1]=='v')
  2801 	t+=2;
  2802     if (*t=='v')
  2803 	t++;
  2804     while (*t=='i' && *t)
  2805 	t++;
  2806     return !*t;
  2807 }
  2808 
  2809 /*
  2810  * gcisalpha:
  2811  *
  2812  * A version of isalpha() that is somewhat lenient on 8-bit texts.
  2813  * If we use the standard function, 8-bit accented characters break
  2814  * words, so that tete with accented characters appears to be two words, "t"
  2815  * and "t", with 8-bit characters between them. This causes over-reporting of
  2816  * errors. gcisalpha() recognizes accented letters from the CP1252 (Windows)
  2817  * and ISO-8859-1 character sets, which are the most common PG 8-bit types.
  2818  */
  2819 int gcisalpha(unsigned char c)
  2820 {
  2821     if (c>='a' && c<='z')
  2822 	return 1;
  2823     if (c>='A' && c<='Z')
  2824 	return 1;
  2825     if (c<140)
  2826 	return 0;
  2827     if (c>=192 && c!=208 && c!=215 && c!=222 && c!=240 && c!=247 && c!=254)
  2828 	return 1;
  2829     if (c==140 || c==142 || c==156 || c==158 || c==159)
  2830 	return 1;
  2831     return 0;
  2832 }
  2833 
  2834 /*
  2835  * gcisdigit:
  2836  *
  2837  * A version of isdigit() that doesn't get confused in 8-bit texts.
  2838  */
  2839 int gcisdigit(unsigned char c)
  2840 {   
  2841     return c>='0' && c<='9';
  2842 }
  2843 
  2844 /*
  2845  * gcisletter:
  2846  *
  2847  * A version of isletter() that doesn't get confused in 8-bit texts.
  2848  * NB: this is ISO-8891-1-specific.
  2849  */
  2850 int gcisletter(unsigned char c)
  2851 {   
  2852     return c>='A' && c<='Z' || c>='a' && c<='z' || c>=192;
  2853 }
  2854 
  2855 /*
  2856  * gcstrchr:
  2857  *
  2858  * Wraps strchr to return NULL if the character being searched for is zero.
  2859  */
  2860 char *gcstrchr(char *s,char c)
  2861 {
  2862     if (!c)
  2863 	return NULL;
  2864     return strchr(s,c);
  2865 }
  2866 
  2867 /*
  2868  * postprocess_for_DP:
  2869  *
  2870  * Invoked with the -d switch from flgets().
  2871  * It simply "removes" from the line a hard-coded set of common
  2872  * DP-specific tags, so that the line passed to the main routine has
  2873  * been pre-cleaned of DP markup.
  2874  */
  2875 void postprocess_for_DP(char *theline)
  2876 {
  2877     char *s,*t;
  2878     int i;
  2879     if (!*theline) 
  2880         return;
  2881     for (i=0;*DPmarkup[i];i++)
  2882     {
  2883         s=strstr(theline,DPmarkup[i]);
  2884         while (s)
  2885 	{
  2886             t=s+strlen(DPmarkup[i]);
  2887             while (*t)
  2888 	    {
  2889                 *s=*t;
  2890                 t++;
  2891 		s++;
  2892 	    }
  2893             *s=0;
  2894             s=strstr(theline,DPmarkup[i]);
  2895 	}
  2896     }
  2897 }
  2898 
  2899 /*
  2900  * postprocess_for_HTML:
  2901  *
  2902  * Invoked with the -m switch from flgets().
  2903  * It simply "removes" from the line a hard-coded set of common
  2904  * HTML tags and "replaces" a hard-coded set of common HTML
  2905  * entities, so that the line passed to the main routine has
  2906  * been pre-cleaned of HTML.
  2907  */
  2908 void postprocess_for_HTML(char *theline)
  2909 {
  2910     if (strstr(theline,"<") && strstr(theline,">"))
  2911         while (losemarkup(theline))
  2912             ;
  2913     while (loseentities(theline))
  2914         ;
  2915 }
  2916 
  2917 char *losemarkup(char *theline)
  2918 {
  2919     char *s,*t;
  2920     int i;
  2921     if (!*theline) 
  2922         return NULL;
  2923     s=strstr(theline,"<");
  2924     t=strstr(theline,">");
  2925     if (!s || !t)
  2926 	return NULL;
  2927     for (i=0;*markup[i];i++)
  2928         if (!tagcomp(s+1,markup[i]))
  2929 	{
  2930             if (!t[1])
  2931 	    {
  2932                 *s=0;
  2933                 return s;
  2934 	    }
  2935             else if (t>s)
  2936 	    {
  2937 		strcpy(s,t+1);
  2938 		return s;
  2939 	    }
  2940         }
  2941     /* It's an unrecognized <xxx>. */
  2942     return NULL;
  2943 }
  2944 
  2945 char *loseentities(char *theline)
  2946 {
  2947     int i;
  2948     char *s,*t;
  2949     if (!*theline) 
  2950         return NULL;
  2951     for (i=0;*entities[i].htmlent;i++)
  2952     {
  2953         s=strstr(theline,entities[i].htmlent);
  2954         if (s)
  2955 	{
  2956             t=malloc((size_t)strlen(s));
  2957             if (!t)
  2958 		return NULL;
  2959             strcpy(t,s+strlen(entities[i].htmlent));
  2960             strcpy(s,entities[i].textent);
  2961             strcat(s,t);
  2962             free(t);
  2963             return theline;
  2964 	}
  2965     }
  2966     for (i=0;*entities[i].htmlnum;i++)
  2967     {
  2968         s=strstr(theline,entities[i].htmlnum);
  2969         if (s)
  2970 	{
  2971             t=malloc((size_t)strlen(s));
  2972             if (!t)
  2973 		return NULL;
  2974             strcpy(t,s+strlen(entities[i].htmlnum));
  2975             strcpy(s,entities[i].textent);
  2976             strcat(s,t);
  2977             free(t);
  2978             return theline;
  2979 	}
  2980     }
  2981     return NULL;
  2982 }
  2983 
  2984 int tagcomp(char *strin,char *basetag)
  2985 {
  2986     char *s,*t;
  2987     s=basetag;
  2988     t=strin;
  2989     if (*t=='/')
  2990 	t++; /* ignore a slash */
  2991     while (*s && *t)
  2992     {
  2993         if (tolower(*s)!=tolower(*t))
  2994 	    return 1;
  2995         s++;
  2996 	t++;
  2997     }
  2998     return 0;
  2999 }
  3000 
  3001 void proghelp()
  3002 {
  3003     fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
  3004     fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
  3005     fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
  3006     fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
  3007       "For details, read the file COPYING.\n",stderr);
  3008     fputs("This is Free Software; "
  3009       "you may redistribute it under certain conditions (GPL);\n",stderr);
  3010     fputs("read the file COPYING for details.\n\n",stderr);
  3011     fputs("Usage is: bookloupe [-setpxloyhud] filename\n",stderr);
  3012     fputs("  where -s checks single quotes, -e suppresses echoing lines, "
  3013       "-t checks typos\n",stderr);
  3014     fputs("  -x (paranoid) switches OFF -t and extra checks, "
  3015       "-l turns OFF line-end checks\n",stderr);
  3016     fputs("  -o just displays overview without detail, "
  3017       "-h echoes header fields\n",stderr);
  3018     fputs("  -v (verbose) unsuppresses duplicate reporting, "
  3019       "-m suppresses markup\n",stderr);
  3020     fputs("  -d ignores DP-specific markup,\n",stderr);
  3021     fputs("  -u uses a file gutcheck.typ to query user-defined "
  3022       "possible typos\n",stderr);
  3023     fputs("Sample usage: bookloupe warpeace.txt \n",stderr);
  3024     fputs("\n",stderr);
  3025     fputs("Bookloupe looks for errors in Project Gutenberg(TM) etexts.\n",
  3026       stderr);
  3027     fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
  3028       "non-ASCII\n",stderr);
  3029     fputs("characters like accented letters, "
  3030       "lines longer than 75 or shorter than 55,\n",stderr);
  3031     fputs("unbalanced quotes or brackets, "
  3032       "a variety of badly formatted punctuation, \n",stderr);
  3033     fputs("HTML tags, some likely typos. "
  3034       "It is NOT a substitute for human judgement.\n",stderr);
  3035     fputs("\n",stderr);
  3036 }