bookloupe/bookloupe.c
author ali <ali@juiblex.co.uk>
Sat May 25 23:43:48 2013 +0100 (2013-05-25)
changeset 50 1b646720d4a7
parent 49 8a53979c0d65
child 51 0d08cd5055d5
permissions -rw-r--r--
Break check_for_mta_from() out
     1 /*************************************************************************/
     2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
     3 /*                                                                       */
     4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>                  */
     5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>                     */
     6 /*                                                                       */
     7 /* This program is free software; you can redistribute it and/or modify  */
     8 /* it under the terms of the GNU General Public License as published by  */
     9 /* the Free Software Foundation; either version 2 of the License, or     */
    10 /* (at your option) any later version.                                   */
    11 /*                                                                       */
    12 /* This program is distributed in the hope that it will be useful,       */
    13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of        */
    14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the          */
    15 /* GNU General Public License for more details.                          */
    16 /*                                                                       */
    17 /* You should have received a copy of the GNU General Public License     */
    18 /* along with this program. If not, see <http://www.gnu.org/licenses/>.  */
    19 /*************************************************************************/
    20 
    21 #include <stdio.h>
    22 #include <stdlib.h>
    23 #include <string.h>
    24 #include <ctype.h>
    25 
#define MAXWORDLEN    80    /* max length of one word             */
#define LINEBUFSIZE 2048    /* buffer size for an input line      */

/* Upper bound on the number of entries main() loads into usertypo[]. */
#define MAX_USER_TYPOS 1000
/*
 * Name of the user typo file. main() tries it relative to the current
 * directory first and then relative to the executable's directory.
 */
#define USERTYPO_FILE "gutcheck.typ"

/* Size used for path buffers when nothing has defined MAX_PATH already. */
#ifndef MAX_PATH
#define MAX_PATH 16384
#endif

/*
 * Shared line buffer: main() reads user-typo lines into it and
 * first_pass() reads the lines of the input file into it.
 */
char aline[LINEBUFSIZE];
/* NOTE(review): presumably the previously read line; its use is not visible here - confirm. */
char prevline[LINEBUFSIZE];
    38 
/*
 * Common typos.
 *
 * Every entry is lower case and the list is terminated by an empty string.
 * NOTE(review): some entries ("modem", "bas", "han", ...) are real words;
 * presumably they are listed because they are frequent scannos for other
 * words - confirm before pruning the list.
 */
char *typo[] = {
    "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
    "nad", "te", "ig", "acn",  "ahve", "alot", "anbd", "andt", "awya", "aywa",
    "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
    "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
    "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
    "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
    "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
    "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
    "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
    "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
    "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
    "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
    "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
    "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
    "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
    "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
    "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
    "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
    "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
    "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
    "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
    "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
    "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
    "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
    "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
    "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
    "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
    "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
    "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
    "se", ""
};
    72 
    73 char *usertypo[MAX_USER_TYPOS];
    74 
/*
 * Common abbreviations and other OK words not to query as typos.
 * Entries are lower case; the list is terminated by an empty string.
 */
char *okword[] = {
    "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
    "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
    "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
    "outbid", "outbids", "frostbite", "frostbitten", ""
};
    82 
/*
 * Common abbreviations that cause otherwise unexplained periods.
 * Entries are lower case; the list is terminated by an empty string.
 */
char *abbrev[] = {
    "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
    "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
};
    88 
/*
 * Two-Letter combinations that rarely if ever start words,
 * but are common scannos or otherwise common letter combinations.
 * The list is terminated by an empty string.
 */
char *nostart[] = {
    "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
};
    96 
/*
 * Two-Letter combinations that rarely if ever end words,
 * but are common scannos or otherwise common letter combinations.
 * The list is terminated by an empty string.
 */
char *noend[] = {
    "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
    "sw", "gr", "sl", "cl", "iy", ""
};
   105 
/*
 * HTML element names, lower case, without angle brackets; the list is
 * terminated by an empty string.
 * NOTE(review): presumably consulted when deciding whether text in < >
 * is markup (see the M switch) - confirm against the code that uses it.
 */
char *markup[] = {
    "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
    "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
    "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
    "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
};
   112 
/*
 * Literal markup strings, terminated by an empty string.
 * NOTE(review): presumably the "DP-specific markup" that the D switch
 * asks to have ignored - confirm against postprocess_for_DP().
 */
char *DPmarkup[] = {
    "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
};
   116 
/*
 * Lower-case word list, terminated by an empty string.
 * NOTE(review): going by the name, presumably words that are not expected
 * to be followed directly by a comma - confirm against the code that uses
 * it. "mrs" is listed twice; the duplicate looks accidental.
 */
char *nocomma[] = {
    "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
    "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
    "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
    "during", "let", "toward", "among", ""
};
   123 
/*
 * Lower-case word list, terminated by an empty string.
 * NOTE(review): going by the name, presumably words that are not expected
 * to be followed directly by a period - confirm against the code that
 * uses it.
 */
char *noperiod[] = {
    "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
    "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
    "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
    "among", "those", "into", "whom", "having", "thence", ""
};
   130 
/*
 * Lower-case vowels, plain and accented.
 * NOTE(review): the accented letters are stored in whatever encoding this
 * source file uses; presumably a single-byte encoding is intended so that
 * each letter is one char - confirm before re-encoding the file.
 */
char vowels[] = "aeiouàáâãäæèéêëìíîïòóôõöùúûü";
   132 
   133 struct {
   134     char *htmlent;
   135     char *htmlnum;
   136     char *textent;
   137 } entities[] = {
   138     "&amp;",	"&#38;",     "&", 
   139     "&lt;",	"&#60;",     "<",
   140     "&gt;",	"&#62;",     ">",
   141     "&deg;",	"&#176;",    " degrees",
   142     "&pound;",	"&#163;",    "L",
   143     "&quot;",	"&#34;",     "\"", /* quotation mark = APL quote */
   144     "&OElig;",	"&#338;",    "OE", /* latin capital ligature OE */
   145     "&oelig;",	"&#339;",    "oe", /* latin small ligature oe */
   146     "&Scaron;",	"&#352;",    "S", /* latin capital letter S with caron */
   147     "&scaron;",	"&#353;",    "s", /* latin small letter s with caron */
   148     "&Yuml;",	"&#376;",    "Y", /* latin capital letter Y with diaeresis */
   149     "&circ;",	"&#710;",    "",  /* modifier letter circumflex accent */
   150     "&tilde;",	"&#732;",    "~", /* small tilde, U+02DC ISOdia */
   151     "&ensp;",	"&#8194;",   " ", /* en space, U+2002 ISOpub */
   152     "&emsp;",	"&#8195;",   " ", /* em space, U+2003 ISOpub */
   153     "&thinsp;",	"&#8201;",   " ", /* thin space, U+2009 ISOpub */
   154     "&ndash;",	"&#8211;",   "-", /* en dash, U+2013 ISOpub */
   155     "&mdash;",	"&#8212;",   "--", /* em dash, U+2014 ISOpub */
   156     "&rsquo;",	"&#8217;",   "'", /* right single quotation mark */
   157     "&sbquo;",	"&#8218;",   "'", /* single low-9 quotation mark */
   158     "&ldquo;",	"&#8220;",   "\"", /* left double quotation mark */
   159     "&rdquo;",	"&#8221;",   "\"", /* right double quotation mark */
   160     "&bdquo;",	"&#8222;",   "\"", /* double low-9 quotation mark */
   161     "&lsaquo;",	"&#8249;",   "\"", /* single left-pointing angle quotation mark */
   162     "&rsaquo;",	"&#8250;",   "\"", /* single right-pointing angle quotation mark */
   163     "&nbsp;",	"&#160;",    " ", /* no-break space = non-breaking space, */
   164     "&iexcl;",	"&#161;",    "!", /* inverted exclamation mark */
   165     "&cent;",	"&#162;",    "c", /* cent sign */
   166     "&pound;",	"&#163;",    "L", /* pound sign */
   167     "&curren;",	"&#164;",    "$", /* currency sign */
   168     "&yen;",	"&#165;",    "Y", /* yen sign = yuan sign */
   169     "&sect;",	"&#167;",    "--", /* section sign */
   170     "&uml;",	"&#168;",    " ", /* diaeresis = spacing diaeresis */
   171     "&copy;",	"&#169;",    "(C) ", /* copyright sign */
   172     "&ordf;",	"&#170;",    " ", /* feminine ordinal indicator */
   173     "&laquo;",	"&#171;",    "\"", /* left-pointing double angle quotation mark */
   174     "&shy;",	"&#173;",    "-", /* soft hyphen = discretionary hyphen */
   175     "&reg;",	"&#174;",    "(R) ", /* registered sign = registered trade mark sign */
   176     "&macr;",	"&#175;",    " ", /* macron = spacing macron = overline */
   177     "&deg;",	"&#176;",    " degrees", /* degree sign */
   178     "&plusmn;",	"&#177;",    "+-", /* plus-minus sign = plus-or-minus sign */
   179     "&sup2;",	"&#178;",    "2", /* superscript two = superscript digit two */
   180     "&sup3;",	"&#179;",    "3", /* superscript three = superscript digit three */
   181     "&acute;",	"&#180;",    " ", /* acute accent = spacing acute */
   182     "&micro;",	"&#181;",    "m", /* micro sign */
   183     "&para;",	"&#182;",    "--", /* pilcrow sign = paragraph sign */
   184     "&cedil;",	"&#184;",    " ", /* cedilla = spacing cedilla */
   185     "&sup1;",	"&#185;",    "1", /* superscript one = superscript digit one */
   186     "&ordm;",	"&#186;",    " ", /* masculine ordinal indicator */
   187     "&raquo;",	"&#187;",    "\"", /* right-pointing double angle quotation mark */
   188     "&frac14;",	"&#188;",    "1/4", /* vulgar fraction one quarter */
   189     "&frac12;",	"&#189;",    "1/2", /* vulgar fraction one half */
   190     "&frac34;",	"&#190;",    "3/4", /* vulgar fraction three quarters */
   191     "&iquest;",	"&#191;",    "?", /* inverted question mark */
   192     "&Agrave;",	"&#192;",    "A", /* latin capital letter A with grave */
   193     "&Aacute;",	"&#193;",    "A", /* latin capital letter A with acute */
   194     "&Acirc;",	"&#194;",    "A", /* latin capital letter A with circumflex */
   195     "&Atilde;",	"&#195;",    "A", /* latin capital letter A with tilde */
   196     "&Auml;",	"&#196;",    "A", /* latin capital letter A with diaeresis */
   197     "&Aring;",	"&#197;",    "A", /* latin capital letter A with ring above */
   198     "&AElig;",	"&#198;",    "AE", /* latin capital letter AE */
   199     "&Ccedil;",	"&#199;",    "C", /* latin capital letter C with cedilla */
   200     "&Egrave;",	"&#200;",    "E", /* latin capital letter E with grave */
   201     "&Eacute;",	"&#201;",    "E", /* latin capital letter E with acute */
   202     "&Ecirc;",	"&#202;",    "E", /* latin capital letter E with circumflex */
   203     "&Euml;",	"&#203;",    "E", /* latin capital letter E with diaeresis */
   204     "&Igrave;",	"&#204;",    "I", /* latin capital letter I with grave */
   205     "&Iacute;",	"&#205;",    "I", /* latin capital letter I with acute */
   206     "&Icirc;",	"&#206;",    "I", /* latin capital letter I with circumflex */
   207     "&Iuml;",	"&#207;",    "I", /* latin capital letter I with diaeresis */
   208     "&ETH;",	"&#208;",    "E", /* latin capital letter ETH */
   209     "&Ntilde;",	"&#209;",    "N", /* latin capital letter N with tilde */
   210     "&Ograve;",	"&#210;",    "O", /* latin capital letter O with grave */
   211     "&Oacute;",	"&#211;",    "O", /* latin capital letter O with acute */
   212     "&Ocirc;",	"&#212;",    "O", /* latin capital letter O with circumflex */
   213     "&Otilde;",	"&#213;",    "O", /* latin capital letter O with tilde */
   214     "&Ouml;",	"&#214;",    "O", /* latin capital letter O with diaeresis */
   215     "&times;",	"&#215;",    "*", /* multiplication sign */
   216     "&Oslash;",	"&#216;",    "O", /* latin capital letter O with stroke */
   217     "&Ugrave;",	"&#217;",    "U", /* latin capital letter U with grave */
   218     "&Uacute;",	"&#218;",    "U", /* latin capital letter U with acute */
   219     "&Ucirc;",	"&#219;",    "U", /* latin capital letter U with circumflex */
   220     "&Uuml;",	"&#220;",    "U", /* latin capital letter U with diaeresis */
   221     "&Yacute;",	"&#221;",    "Y", /* latin capital letter Y with acute */
   222     "&THORN;",	"&#222;",    "TH", /* latin capital letter THORN */
   223     "&szlig;",	"&#223;",    "sz", /* latin small letter sharp s = ess-zed */
   224     "&agrave;",	"&#224;",    "a", /* latin small letter a with grave */
   225     "&aacute;",	"&#225;",    "a", /* latin small letter a with acute */
   226     "&acirc;",	"&#226;",    "a", /* latin small letter a with circumflex */
   227     "&atilde;",	"&#227;",    "a", /* latin small letter a with tilde */
   228     "&auml;",	"&#228;",    "a", /* latin small letter a with diaeresis */
   229     "&aring;",	"&#229;",    "a", /* latin small letter a with ring above */
   230     "&aelig;",	"&#230;",    "ae", /* latin small letter ae */
   231     "&ccedil;",	"&#231;",    "c", /* latin small letter c with cedilla */
   232     "&egrave;",	"&#232;",    "e", /* latin small letter e with grave */
   233     "&eacute;",	"&#233;",    "e", /* latin small letter e with acute */
   234     "&ecirc;",	"&#234;",    "e", /* latin small letter e with circumflex */
   235     "&euml;",	"&#235;",    "e", /* latin small letter e with diaeresis */
   236     "&igrave;",	"&#236;",    "i", /* latin small letter i with grave */
   237     "&iacute;",	"&#237;",    "i", /* latin small letter i with acute */
   238     "&icirc;",	"&#238;",    "i", /* latin small letter i with circumflex */
   239     "&iuml;",	"&#239;",    "i", /* latin small letter i with diaeresis */
   240     "&eth;",	"&#240;",    "eth", /* latin small letter eth */
   241     "&ntilde;",	"&#241;",    "n", /* latin small letter n with tilde */
   242     "&ograve;",	"&#242;",    "o", /* latin small letter o with grave */
   243     "&oacute;",	"&#243;",    "o", /* latin small letter o with acute */
   244     "&ocirc;",	"&#244;",    "o", /* latin small letter o with circumflex */
   245     "&otilde;",	"&#245;",    "o", /* latin small letter o with tilde */
   246     "&ouml;",	"&#246;",    "o", /* latin small letter o with diaeresis */
   247     "&divide;",	"&#247;",    "/", /* division sign */
   248     "&oslash;",	"&#248;",    "o", /* latin small letter o with stroke */
   249     "&ugrave;",	"&#249;",    "u", /* latin small letter u with grave */
   250     "&uacute;",	"&#250;",    "u", /* latin small letter u with acute */
   251     "&ucirc;",	"&#251;",    "u", /* latin small letter u with circumflex */
   252     "&uuml;",	"&#252;",    "u", /* latin small letter u with diaeresis */
   253     "&yacute;",	"&#253;",    "y", /* latin small letter y with acute */
   254     "&thorn;",	"&#254;",    "th", /* latin small letter thorn */
   255     "&yuml;",	"&#255;",    "y", /* latin small letter y with diaeresis */
   256     "", ""
   257 };
   258 
/* special characters (ASCII codes) */
#define CHAR_SPACE        32    /* ' '  */
#define CHAR_TAB           9    /* '\t' */
#define CHAR_LF           10    /* '\n' */
#define CHAR_CR           13    /* '\r' */
#define CHAR_DQUOTE       34    /* '"'  */
#define CHAR_SQUOTE       39    /* '\'' */
#define CHAR_OPEN_SQUOTE  96    /* '`'  */
#define CHAR_TILDE       126    /* '~'  */
#define CHAR_ASTERISK     42    /* '*'  */
#define CHAR_FORESLASH    47    /* '/'  */
#define CHAR_CARAT        94    /* '^'  */

/* brackets and underscore, given as character constants */
#define CHAR_UNDERSCORE    '_'
#define CHAR_OPEN_CBRACK   '{'
#define CHAR_CLOSE_CBRACK  '}'
#define CHAR_OPEN_RBRACK   '('
#define CHAR_CLOSE_RBRACK  ')'
#define CHAR_OPEN_SBRACK   '['
#define CHAR_CLOSE_SBRACK  ']'

/* longest and shortest normal PG line lengths */
#define LONGEST_PG_LINE   75    /* first_pass() counts longer lines as long */
#define WAY_TOO_LONG      80    /* first_pass() counts longer lines as very long */
#define SHORTEST_PG_LINE  55    /* threshold used by first_pass()'s short-line count */
   284 
/*
 * main() matches each command-line switch letter, upper-cased, against
 * SWITCHES and sets pswit[] at the index of the matching letter. So the
 * order of the letters here, the value of SWITNO and the *_SWITCH indices
 * below must all be kept in step.
 */
#define SWITCHES "ESTPXLOYHWVMUD" /* switches:-                            */
                                  /*     D - ignore DP-specific markup     */
                                  /*     E - echo queried line             */
                                  /*     S - check single quotes           */
                                  /*     T - check common typos            */
                                  /*     P - require closure of quotes on  */
                                  /*         every paragraph               */
                                  /*     X - "Trust no one" :-) Paranoid!  */
                                  /*         Queries everything            */
                                  /*     L - line end checking defaults on */
                                  /*         -L turns it off               */
                                  /*     O - overview. Just shows counts.  */
                                  /*     Y - puts errors to stdout         */
                                  /*         instead of stderr             */
                                  /*     H - Echoes header fields          */
                                  /*     M - Ignore markup in < >          */
                                  /*     U - Use file of User-defined Typos*/
                                  /*     W - Defaults for use on Web upload*/
                                  /*     V - Verbose - list EVERYTHING!    */
#define SWITNO 14                 /* max number of switch parms            */
                                  /*        - used for defining array-size */
                                  /*        - equals strlen(SWITCHES)      */
#define MINARGS   1               /* minimum no of args excl switches      */
#define MAXARGS   1               /* maximum no of args excl switches      */

int pswit[SWITNO];                /* program switches set by SWITCHES      */

/*
 * Indices into pswit[]; each equals the position of its letter in
 * SWITCHES. After parsing, main() inverts ECHO, PARANOID and LINE_END,
 * so for those three a set entry means the switch was NOT given.
 */
#define ECHO_SWITCH      0
#define SQUOTE_SWITCH    1
#define TYPO_SWITCH      2
#define QPARA_SWITCH     3
#define PARANOID_SWITCH  4
#define LINE_END_SWITCH  5
#define OVERVIEW_SWITCH  6
#define STDOUT_SWITCH    7
#define HEADER_SWITCH    8
#define WEB_SWITCH       9
#define VERBOSE_SWITCH   10
#define MARKUP_SWITCH    11
#define USERTYPO_SWITCH  12
#define DP_SWITCH        13
   325 
/*
 * Query counters. main() zeroes the cnt_* counters before processing the
 * file and prints them in overview mode. cnt_spacend is zeroed with the
 * rest but is neither printed nor included in the TOTAL QUERIES figure.
 * linecnt and checked_linecnt are not reset by main(); they rely on
 * static zero-initialisation.
 */
long cnt_dquot;       /* for overview mode, count of doublequote queries */
long cnt_squot;       /* for overview mode, count of singlequote queries */
long cnt_brack;       /* for overview mode, count of brackets queries */
long cnt_bin;         /* for overview mode, count of non-ASCII queries */
long cnt_odd;         /* for overview mode, count of odd character queries */
long cnt_long;        /* for overview mode, count of long line errors */
long cnt_short;       /* for overview mode, count of short line queries */
long cnt_punct;       /* for overview mode, count of punctuation and spacing queries */
long cnt_dash;        /* for overview mode, count of dash-related queries */
long cnt_word;        /* for overview mode, count of word queries */
long cnt_html;        /* for overview mode, count of html queries */
long cnt_lineend;     /* for overview mode, count of line-end queries */
long cnt_spacend;     /* count of lines with space at end */
long linecnt;         /* count of total lines in the file */
long checked_linecnt; /* count of lines actually checked */
   341 
void proghelp(void);      /* called by main() when the argument count is wrong */
void procfile(char *);    /* called by main() with the file-name argument */

/* NOTE(review): the users of the selector constants below are not visible here. */
#define LOW_THRESHOLD    0
#define HIGH_THRESHOLD   1

#define START 0
#define END 1
#define PREV 0
#define NEXT 1
#define FIRST_OF_PAIR 0
#define SECOND_OF_PAIR 1

#define MAX_WORDPAIR 1000

/*
 * Directory part of argv[0], including the trailing path separator, or an
 * empty string when argv[0] has no separator or is too long to store.
 * Filled in by main().
 */
char running_from[MAX_PATH];

int mixdigit(char *);
char *getaword(char *,char *);
int matchword(char *,char *);
/*
 * Line reader: main() passes it a buffer, a size, a stream and a running
 * count, and keeps reading for as long as the result is non-null.
 */
char *flgets(char *,int,FILE *,long);
void lowerit(char *);             /* first_pass() applies it to aline in place */
int gcisalpha(unsigned char);     /* first_pass() uses it to count alphabetic characters */
int gcisdigit(unsigned char);
int gcisletter(unsigned char);
char *gcstrchr(char *s,char c);
void postprocess_for_HTML(char *);
char *linehasmarkup(char *);
char *losemarkup(char *);
int tagcomp(char *,char *);
char *loseentities(char *);
int isroman(char *);
int usertypo_count;               /* number of entries main() has stored in usertypo[] */
void postprocess_for_DP(char *);

/* NOTE(review): presumably a scratch line buffer; its users are not visible here. */
char wrk[LINEBUFSIZE];

/* NOTE(review): the code that fills qword, qperiod and dupcnt is not visible here. */
#define MAX_QWORD 50
#define MAX_QWORD_LENGTH 40
char qword[MAX_QWORD][MAX_QWORD_LENGTH];
char qperiod[MAX_QWORD][MAX_QWORD_LENGTH];
signed int dupcnt[MAX_QWORD];
   384 
   385 int main(int argc,char **argv)
   386 {
   387     char *argsw,*s;
   388     int i,switno,invarg;
   389     char usertypo_file[MAX_PATH];
   390     FILE *usertypofile;
   391     if (strlen(argv[0])<sizeof(running_from))
   392 	/* save the path to the executable */
   393         strcpy(running_from,argv[0]);
   394     /* find out what directory we're running from */
   395     s=running_from+strlen(running_from);
   396     for (;*s!='/' && *s!='\\' && s>=running_from;s--)
   397         *s=0;
   398     switno=strlen(SWITCHES);
   399     for (i=switno;--i>0;)
   400         pswit[i]=0;           /* initialise switches */
   401     /*
   402      * Standard loop to extract switches.
   403      * When we come out of this loop, the arguments will be
   404      * in argv[0] upwards and the switches used will be
   405      * represented by their equivalent elements in pswit[]
   406      */
   407     while (--argc>0 && **++argv=='-')
   408         for (argsw=argv[0]+1;*argsw!='\0';argsw++)
   409             for (i=switno,invarg=1;(--i>=0) && invarg==1;)
   410                 if ((toupper(*argsw))==SWITCHES[i])
   411 		{
   412                     invarg=0;
   413                     pswit[i]=1;
   414 		}
   415     /* Paranoid checking is turned OFF, not on, by its switch */
   416     pswit[PARANOID_SWITCH]^=1;
   417     if (pswit[PARANOID_SWITCH])
   418 	/* if running in paranoid mode force typo checks as well   */
   419         pswit[TYPO_SWITCH]=pswit[TYPO_SWITCH]^1;
   420     /* Line-end checking is turned OFF, not on, by its switch */
   421     pswit[LINE_END_SWITCH]^=1;
   422     /* Echoing is turned OFF, not on, by its switch */
   423     pswit[ECHO_SWITCH]^=1;
   424     if (pswit[OVERVIEW_SWITCH])
   425 	/* just print summary; don't echo */
   426         pswit[ECHO_SWITCH]=0;
   427     /*
   428      * Web uploads - for the moment, this is really just a placeholder
   429      * until we decide what processing we really want to do on web uploads
   430      */
   431     if (pswit[WEB_SWITCH])
   432     {
   433 	/* specific override for web uploads */
   434         pswit[ECHO_SWITCH]=1;
   435         pswit[SQUOTE_SWITCH]=0;
   436         pswit[TYPO_SWITCH]=1;
   437         pswit[QPARA_SWITCH]=0;
   438         pswit[PARANOID_SWITCH]=1;
   439         pswit[LINE_END_SWITCH]=0;
   440         pswit[OVERVIEW_SWITCH]=0;
   441         pswit[STDOUT_SWITCH]=0;
   442         pswit[HEADER_SWITCH]=1;
   443         pswit[VERBOSE_SWITCH]=0;
   444         pswit[MARKUP_SWITCH]=0;
   445         pswit[USERTYPO_SWITCH]=0;
   446         pswit[DP_SWITCH]=0;
   447     }
   448     if (argc<MINARGS || argc>MAXARGS)
   449     {
   450 	/* check number of args */
   451         proghelp();
   452         return 1;
   453     }
   454     /* read in the user-defined stealth scanno list */
   455     if (pswit[USERTYPO_SWITCH])
   456     {
   457 	/* ... we were told we had one! */
   458         usertypofile=fopen(USERTYPO_FILE,"rb");
   459         if (!usertypofile)
   460 	{
   461 	    /* not in cwd. try excuteable directory. */
   462             strcpy(usertypo_file,running_from);
   463             strcat(usertypo_file,USERTYPO_FILE);
   464             usertypofile=fopen(usertypo_file,"rb");
   465             if (!usertypofile) {
   466 		/* we ain't got no user typo file! */
   467                 printf("   --> I couldn't find gutcheck.typ "
   468 		  "-- proceeding without user typos.\n");
   469 	    }
   470 	}
   471         usertypo_count=0;
   472         if (usertypofile)
   473 	{
   474 	    /* we managed to open a User Typo File! */
   475             if (pswit[USERTYPO_SWITCH])
   476 	    {
   477                 while (flgets(aline,LINEBUFSIZE-1,usertypofile,
   478 		  (long)usertypo_count))
   479 		{
   480                     if (strlen(aline)>1)
   481 		    {
   482                         if ((int)*aline>33)
   483 			{
   484                             s=malloc(strlen(aline)+1);
   485                             if (!s)
   486 			    {
   487                                 fprintf(stderr,"bookloupe: cannot get enough "
   488 				  "memory for user typo file!\n");
   489                                 exit(1);
   490 			    }
   491                             strcpy(s,aline);
   492                             usertypo[usertypo_count]=s;
   493                             usertypo_count++;
   494                             if (usertypo_count>=MAX_USER_TYPOS)
   495 			    {
   496                                 printf("   --> Only %d user-defined typos "
   497 				  "allowed: ignoring the rest\n",
   498 				  MAX_USER_TYPOS);
   499                                 break;
   500 			    }
   501 			}
   502 		    }
   503 		}
   504 	    }
   505             fclose(usertypofile);
   506 	}
   507     }
   508     fprintf(stderr,"bookloupe: Check and report on an e-text\n");
   509     cnt_dquot=cnt_squot=cnt_brack=cnt_bin=cnt_odd=cnt_long=
   510     cnt_short=cnt_punct=cnt_dash=cnt_word=cnt_html=cnt_lineend=
   511     cnt_spacend=0;
   512     procfile(argv[0]);
   513     if (pswit[OVERVIEW_SWITCH])
   514     {
   515 	printf("    Checked %ld lines of %ld (head+foot = %ld)\n\n",
   516 	  checked_linecnt,linecnt,linecnt-checked_linecnt);
   517         printf("    --------------- Queries found --------------\n");
   518         if (cnt_long)
   519 	    printf("    Long lines:                    %14ld\n",cnt_long);
   520         if (cnt_short)
   521 	    printf("    Short lines:                   %14ld\n",cnt_short);
   522         if (cnt_lineend)
   523 	    printf("    Line-end problems:             %14ld\n",cnt_lineend);
   524         if (cnt_word)
   525 	    printf("    Common typos:                  %14ld\n",cnt_word);
   526         if (cnt_dquot)
   527 	    printf("    Unmatched quotes:              %14ld\n",cnt_dquot);
   528         if (cnt_squot)
   529 	    printf("    Unmatched SingleQuotes:        %14ld\n",cnt_squot);
   530         if (cnt_brack)
   531 	    printf("    Unmatched brackets:            %14ld\n",cnt_brack);
   532         if (cnt_bin)
   533 	    printf("    Non-ASCII characters:          %14ld\n",cnt_bin);
   534         if (cnt_odd)
   535 	    printf("    Proofing characters:           %14ld\n",cnt_odd);
   536         if (cnt_punct)
   537 	    printf("    Punctuation & spacing queries: %14ld\n",cnt_punct);
   538         if (cnt_dash)
   539 	    printf("    Non-standard dashes:           %14ld\n",cnt_dash);
   540         if (cnt_html)
   541 	    printf("    Possible HTML tags:            %14ld\n",cnt_html);
   542         printf("\n");
   543         printf("    TOTAL QUERIES                  %14ld\n",
   544           cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+
   545           cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend);
   546     }
   547     return 0;
   548 }
   549 
   550 struct first_pass_results {
   551     long firstline,astline;
   552     long footerline,totlen,binlen,alphalen,endquote_count,shortline,dotcomma;
   553     long fslashline,hyphens,longline,verylongline,htmcount,standalone_digit;
   554     long spacedash,emdash,space_emdash,non_PG_space_emdash,PG_space_emdash;
   555     signed int Dutchcount,Frenchcount;
   556 };
   557 
   558 /*
   559  * first_pass:
   560  *
   561  * Run a first pass - verify that it's a valid PG
   562  * file, decide whether to report some things that
   563  * occur many times in the text like long or short
   564  * lines, non-standard dashes, etc.
   565  */
   566 struct first_pass_results *first_pass(FILE *infile)
   567 {
   568     char laststart=CHAR_SPACE,*s;
   569     signed int i,llen;
   570     unsigned int lastlen=0,lastblen=0;
   571     long spline=0,nspline=0;
   572     static struct first_pass_results results={0};
   573     char inword[MAXWORDLEN]="";
   574     while (fgets(aline,LINEBUFSIZE-1,infile))
   575     {
   576         while (aline[strlen(aline)-1]==10 || aline[strlen(aline)-1]==13)
   577 	    aline[strlen(aline)-1]=0;
   578         linecnt++;
   579         if (strstr(aline,"*END") && strstr(aline,"SMALL PRINT") &&
   580 	  (strstr(aline,"PUBLIC DOMAIN") || strstr(aline,"COPYRIGHT")))
   581 	{
   582             if (spline)
   583                 printf("   --> Duplicate header?\n");
   584             spline=linecnt+1;   /* first line of non-header text, that is */
   585 	}
   586         if (!strncmp(aline,"*** START",9) && strstr(aline,"PROJECT GUTENBERG"))
   587 	{
   588             if (nspline)
   589                 printf("   --> Duplicate header?\n");
   590             nspline=linecnt+1;   /* first line of non-header text, that is */
   591 	}
   592         if (spline || nspline)
   593 	{
   594             lowerit(aline);
   595             if (strstr(aline,"end") && strstr(aline,"project gutenberg"))
   596 	    {
   597                 if (strstr(aline,"end")<strstr(aline,"project gutenberg"))
   598 		{
   599                     if (results.footerline)
   600 		    {
   601 			/* it's an old-form header - we can detect duplicates */
   602                         if (!nspline)
   603                             printf("   --> Duplicate footer?\n");
   604 		    }
   605                     else
   606                         results.footerline=linecnt;
   607 		}
   608 	    }
   609 	}
   610         if (spline)
   611 	    results.firstline=spline;
   612         if (nspline)
   613 	    results.firstline=nspline;  /* override with new */
   614         if (results.footerline)
   615 	    continue;    /* don't count the boilerplate in the footer */
   616         llen=strlen(aline);
   617         results.totlen+=llen;
   618         for (i=0;i<llen;i++)
   619 	{
   620             if ((unsigned char)aline[i]>127)
   621 		results.binlen++;
   622             if (gcisalpha(aline[i]))
   623 		results.alphalen++;
   624             if (i>0 && aline[i]==CHAR_DQUOTE && isalpha(aline[i-1]))
   625 		results.endquote_count++;
   626 	}
   627         if (strlen(aline)>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE &&
   628 	  lastblen>2 && lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
   629 	    results.shortline++;
   630         if (*aline && (unsigned char)aline[strlen(aline)-1]<=CHAR_SPACE)
   631 	    cnt_spacend++;
   632         if (strstr(aline,".,"))
   633 	    results.dotcomma++;
   634         /* only count ast lines for ignoring purposes where there is */
   635         /* locase text on the line */
   636         if (strstr(aline,"*"))
   637 	{
   638             for (s=aline;*s;s++)
   639                 if (*s>='a' && *s<='z')
   640                     break;
   641              if (*s)
   642 		results.astline++;
   643 	}
   644         if (strstr(aline,"/"))
   645             results.fslashline++;
   646         for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--)
   647 	    ;
   648         if (aline[i]=='-' && aline[i-1]!='-')
   649 	    results.hyphens++;
   650         if (llen>LONGEST_PG_LINE)
   651 	    results.longline++;
   652         if (llen>WAY_TOO_LONG)
   653 	    results.verylongline++;
   654         if (strstr(aline,"<") && strstr(aline,">"))
   655 	{
   656             i=(signed int)(strstr(aline,">")-strstr(aline,"<")+1);
   657             if (i>0)
   658                 results.htmcount++;
   659             if (strstr(aline,"<i>"))
   660 		results.htmcount+=4; /* bonus marks! */
   661 	}
   662         /* Check for spaced em-dashes */
   663         if (strstr(aline,"--"))
   664 	{
   665             results.emdash++;
   666             if (*(strstr(aline,"--")-1)==CHAR_SPACE ||
   667                (*(strstr(aline,"--")+2)==CHAR_SPACE))
   668 		results.space_emdash++;
   669             if (*(strstr(aline,"--")-1)==CHAR_SPACE &&
   670                (*(strstr(aline,"--")+2)==CHAR_SPACE))
   671 		/* count of em-dashes with spaces both sides */
   672 		results.non_PG_space_emdash++;
   673             if (*(strstr(aline,"--")-1)!=CHAR_SPACE &&
   674                (*(strstr(aline,"--")+2)!=CHAR_SPACE))
   675 		/* count of PG-type em-dashes with no spaces */
   676 		results.PG_space_emdash++;
   677 	}
   678         for (s=aline;*s;)
   679 	{
   680             s=getaword(s,inword);
   681             if (!strcmp(inword,"hij") || !strcmp(inword,"niet")) 
   682                 results.Dutchcount++;
   683             if (!strcmp(inword,"dans") || !strcmp(inword,"avec")) 
   684                 results.Frenchcount++;
   685             if (!strcmp(inword,"0") || !strcmp(inword,"1")) 
   686                 results.standalone_digit++;
   687 	}
   688         /* Check for spaced dashes */
   689         if (strstr(aline," -") && *(strstr(aline," -")+2)!='-')
   690 	    results.spacedash++;
   691         lastblen=lastlen;
   692         lastlen=strlen(aline);
   693         laststart=aline[0];
   694     }
   695     return &results;
   696 }
   697 
   698 struct warnings {
   699     signed int shortline,longline,bin,dash,dotcomma,ast,fslash,digit,hyphen;
   700     signed int endquote,isDutch,isFrench;
   701 };
   702 
   703 /*
   704  * report_first_pass:
   705  *
   706  * Make some snap decisions based on the first pass results.
   707  */
   708 struct warnings *report_first_pass(struct first_pass_results *results)
   709 {
   710     static struct warnings warnings={0};
   711     if (cnt_spacend>0)
   712         printf("   --> %ld lines in this file have white space at end\n",
   713 	  cnt_spacend);
   714     warnings.dotcomma=1;
   715     if (results->dotcomma>5)
   716     {
   717         warnings.dotcomma=0;
   718         printf("   --> %ld lines in this file contain '.,'. "
   719 	  "Not reporting them.\n",results->dotcomma);
   720     }
   721     /*
   722      * If more than 50 lines, or one-tenth, are short,
   723      * don't bother reporting them.
   724      */
   725     warnings.shortline=1;
   726     if (results->shortline>50 || results->shortline*10>linecnt)
   727     {
   728         warnings.shortline=0;
   729         printf("   --> %ld lines in this file are short. "
   730 	  "Not reporting short lines.\n",results->shortline);
   731     }
   732     /*
   733      * If more than 50 lines, or one-tenth, are long,
   734      * don't bother reporting them.
   735      */
   736     warnings.longline=1;
   737     if (results->longline>50 || results->longline*10>linecnt)
   738     {
   739         warnings.longline=0;
   740         printf("   --> %ld lines in this file are long. "
   741 	  "Not reporting long lines.\n",results->longline);
   742     }
   743     /* If more than 10 lines contain asterisks, don't bother reporting them. */
   744     warnings.ast=1;
   745     if (results->astline>10)
   746     {
   747         warnings.ast=0;
   748         printf("   --> %ld lines in this file contain asterisks. "
   749 	  "Not reporting them.\n",results->astline);
   750     }
   751     /*
   752      * If more than 10 lines contain forward slashes,
   753      * don't bother reporting them.
   754      */
   755     warnings.fslash=1;
   756     if (results->fslashline>10)
   757     {
   758         warnings.fslash=0;
   759         printf("   --> %ld lines in this file contain forward slashes. "
   760 	  "Not reporting them.\n",results->fslashline);
   761     }
   762     /*
   763      * If more than 20 lines contain unpunctuated endquotes,
   764      * don't bother reporting them.
   765      */
   766     warnings.endquote=1;
   767     if (results->endquote_count>20)
   768     {
   769         warnings.endquote=0;
   770         printf("   --> %ld lines in this file contain unpunctuated endquotes. "
   771 	  "Not reporting them.\n",results->endquote_count);
   772     }
   773     /*
   774      * If more than 15 lines contain standalone digits,
   775      * don't bother reporting them.
   776      */
   777     warnings.digit=1;
   778     if (results->standalone_digit>10)
   779     {
   780         warnings.digit=0;
   781         printf("   --> %ld lines in this file contain standalone 0s and 1s. "
   782 	  "Not reporting them.\n",results->standalone_digit);
   783     }
   784     /*
   785      * If more than 20 lines contain hyphens at end,
   786      * don't bother reporting them.
   787      */
   788     warnings.hyphen=1;
   789     if (results->hyphens>20)
   790     {
   791         warnings.hyphen=0;
   792         printf("   --> %ld lines in this file have hyphens at end. "
   793 	  "Not reporting them.\n",results->hyphens);
   794     }
   795     if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
   796     {
   797         printf("   --> Looks like this is HTML. Switching HTML mode ON.\n");
   798         pswit[MARKUP_SWITCH]=1;
   799     }
   800     if (results->verylongline>0)
   801         printf("   --> %ld lines in this file are VERY long!\n",
   802 	  results->verylongline);
   803     /*
   804      * If there are more non-PG spaced dashes than PG em-dashes,
   805      * assume it's deliberate.
   806      * Current PG guidelines say don't use them, but older texts do,
   807      * and some people insist on them whatever the guidelines say.
   808      */
   809     warnings.dash=1;
   810     if (results->spacedash+results->non_PG_space_emdash>
   811       results->PG_space_emdash)
   812     {
   813         warnings.dash=0;
   814         printf("   --> There are %ld spaced dashes and em-dashes. "
   815 	  "Not reporting them.\n",
   816 	  results->spacedash+results->non_PG_space_emdash);
   817     }
   818     /* If more than a quarter of characters are hi-bit, bug out. */
   819     warnings.bin=1;
   820     if (results->binlen*4>results->totlen)
   821     {
   822         printf("   --> This file does not appear to be ASCII. "
   823 	  "Terminating. Best of luck with it!\n");
   824         exit(1);
   825     }
   826     if (results->alphalen*4<results->totlen)
   827     {
   828         printf("   --> This file does not appear to be text. "
   829 	  "Terminating. Best of luck with it!\n");
   830         exit(1);
   831     }
   832     if (results->binlen*100>results->totlen || results->binlen>100)
   833     {
   834         printf("   --> There are a lot of foreign letters here. "
   835 	  "Not reporting them.\n");
   836         warnings.bin=0;
   837     }
   838     warnings.isDutch=0;
   839     if (results->Dutchcount>50)
   840     {
   841         warnings.isDutch=1;
   842         printf("   --> This looks like Dutch - "
   843 	  "switching off dashes and warnings for 's Middags case.\n");
   844     }
   845     warnings.isFrench=0;
   846     if (results->Frenchcount>50)
   847     {
   848         warnings.isFrench=1;
   849         printf("   --> This looks like French - "
   850 	  "switching off some doublepunct.\n");
   851     }
   852     if (results->firstline && results->footerline)
   853         printf("    The PG header and footer appear to be already on.\n");
   854     else
   855     {
   856         if (results->firstline)
   857             printf("    The PG header is on - no footer.\n");
   858         if (results->footerline)
   859             printf("    The PG footer is on - no header.\n");
   860     }
   861     printf("\n");
   862     if (pswit[VERBOSE_SWITCH])
   863     {
   864         warnings.bin=1;
   865         warnings.shortline=1;
   866         warnings.dotcomma=1;
   867         warnings.longline=1;
   868         warnings.dash=1;
   869         warnings.digit=1;
   870         warnings.ast=1;
   871         warnings.fslash=1;
   872         warnings.hyphen=1;
   873         warnings.endquote=1;
   874         printf("   *** Verbose output is ON -- you asked for it! ***\n");
   875     }
   876     if (warnings.isDutch)
   877         warnings.dash=0;
   878     if (results->footerline>0 && results->firstline>0 &&
   879       results->footerline>results->firstline &&
   880       results->footerline-results->firstline<100)
   881     {
   882         printf("   --> I don't really know where this text starts. \n");
   883         printf("       There are no reference points.\n");
   884         printf("       I'm going to have to report the header and footer "
   885 	  "as well.\n");
   886         results->firstline=0;
   887     }
   888     return &warnings;
   889 }
   890 
   891 struct counters {
   892     long quot;
   893     signed int c_unders,c_brack,s_brack,r_brack;
   894     signed int open_single_quote,close_single_quote;
   895 };
   896 
   897 /*
   898  * analyse_quotes:
   899  *
   900  * Look along the line, accumulate the count of quotes, and see
   901  * if this is an empty line - i.e. a line with nothing on it
   902  * but spaces.
   903  * If line has just spaces, period, * and/or - on it, don't
   904  * count it, since empty lines with asterisks or dashes to
   905  * separate sections are common.
   906  *
   907  * Returns: Non-zero if the line is empty.
   908  */
   909 int analyse_quotes(const char *s,struct counters *counters)
   910 {
   911     signed int guessquote=0;
   912     int isemptyline=1;    /* assume the line is empty until proven otherwise */
   913     while (*s)
   914     {
   915 	if (*s==CHAR_DQUOTE)
   916 	    counters->quot++;
   917 	if (*s==CHAR_SQUOTE || *s==CHAR_OPEN_SQUOTE)
   918 	{
   919 	    if (s==aline)
   920 	    {
   921 		/*
   922 		 * At start of line, it can only be an openquote.
   923 		 * Hardcode a very common exception!
   924 		 */
   925 		if (strncmp(s+2,"tis",3) && strncmp(s+2,"Tis",3))
   926 		    counters->open_single_quote++;
   927 	    }
   928 	    else if (gcisalpha(s[-1]) && gcisalpha(s[1]))
   929 		/* Do nothing! it's definitely an apostrophe, not a quote */
   930 		;
   931 	    /* it's outside a word - let's check it out */
   932 	    else if (*s==CHAR_OPEN_SQUOTE || gcisalpha(s[1]))
   933 	    {
   934 		/* it damwell better BE an openquote */
   935 		if (strncmp(s+1,"tis",3) && strncmp(s+1,"Tis",3))
   936 		    /* hardcode a very common exception! */
   937 		    counters->open_single_quote++;
   938 	    }
   939 	    else
   940 	    {
   941 		/* now - is it a closequote? */
   942 		guessquote=0;   /* accumulate clues */
   943 		if (gcisalpha(s[-1]))
   944 		{
   945 		    /* it follows a letter - could be either */
   946 		    guessquote++;
   947 		    if (s[-1]=='s')
   948 		    {
   949 			/* looks like a plural apostrophe */
   950 			guessquote-=3;
   951 			if (s[1]==CHAR_SPACE)  /* bonus marks! */
   952 			    guessquote-=2;
   953 		    }
   954 		}
   955 		/* it doesn't have a letter either side */
   956 		else if (strchr(".?!,;:",s[-1]) && strchr(".?!,;: ",s[1]))
   957 		    guessquote+=8; /* looks like a closequote */
   958 		else
   959 		    guessquote++;
   960 		if (counters->open_single_quote>counters->close_single_quote)
   961 		    /*
   962 		     * Give it the benefit of some doubt,
   963 		     * if a squote is already open.
   964 		     */
   965 		    guessquote++;
   966 		else
   967 		    guessquote--;
   968 		if (guessquote>=0)
   969 		    counters->close_single_quote++;
   970 	    }
   971 	}
   972 	if (*s!=CHAR_SPACE && *s!='-' && *s!='.' && *s!=CHAR_ASTERISK &&
   973 	  *s!=13 && *s!=10)
   974 	    isemptyline=0;  /* ignore lines like  *  *  *  as spacers */
   975 	if (*s==CHAR_UNDERSCORE)
   976 	    counters->c_unders++;
   977 	if (*s==CHAR_OPEN_CBRACK)
   978 	    counters->c_brack++;
   979 	if (*s==CHAR_CLOSE_CBRACK)
   980 	    counters->c_brack--;
   981 	if (*s==CHAR_OPEN_RBRACK)
   982 	    counters->r_brack++;
   983 	if (*s==CHAR_CLOSE_RBRACK)
   984 	    counters->r_brack--;
   985 	if (*s==CHAR_OPEN_SBRACK)
   986 	    counters->s_brack++;
   987 	if (*s==CHAR_CLOSE_SBRACK)
   988 	    counters->s_brack--;
   989 	s++;
   990     }
   991     return isemptyline;
   992 }
   993 
   994 /*
   995  * check_for_odd_characters:
   996  *
   997  * Check for binary and other odd characters.
   998  */
   999 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
  1000   int isemptyline)
  1001 {
  1002     /* Don't repeat multiple warnings on one line. */
  1003     signed int eNon_A=0,eTab=0,eTilde=0,eCarat=0,eFSlash=0,eAst=0;
  1004     const char *s;
  1005     unsigned char c;
  1006     for (s=aline;*s;s++)
  1007     {
  1008 	c=*(unsigned char *)s;
  1009 	if (!eNon_A && (*s<CHAR_SPACE && *s!=9 && *s!='\n' || c>127))
  1010 	{
  1011 	    if (pswit[ECHO_SWITCH])
  1012 		printf("\n%s\n",aline);
  1013 	    if (!pswit[OVERVIEW_SWITCH])
  1014 		if (c>127 && c<160)
  1015 		    printf("    Line %ld column %d - "
  1016 		      "Non-ISO-8859 character %d\n",linecnt,(int)(s-aline)+1,c);
  1017 		else
  1018 		    printf("    Line %ld column %d - Non-ASCII character %d\n",
  1019 		      linecnt,(int)(s-aline)+1,c);
  1020 	    else
  1021 		cnt_bin++;
  1022 	    eNon_A=1;
  1023 	}
  1024 	if (!eTab && *s==CHAR_TAB)
  1025 	{
  1026 	    if (pswit[ECHO_SWITCH])
  1027 		printf("\n%s\n",aline);
  1028 	    if (!pswit[OVERVIEW_SWITCH])
  1029 		printf("    Line %ld column %d - Tab character?\n",
  1030 		  linecnt,(int)(s-aline)+1);
  1031 	    else
  1032 		cnt_odd++;
  1033 	    eTab=1;
  1034 	}
  1035 	if (!eTilde && *s==CHAR_TILDE)
  1036 	{
  1037 	    /*
  1038 	     * Often used by OCR software to indicate an
  1039 	     * unrecognizable character.
  1040 	     */
  1041 	    if (pswit[ECHO_SWITCH])
  1042 		printf("\n%s\n",aline);
  1043 	    if (!pswit[OVERVIEW_SWITCH])
  1044 		printf("    Line %ld column %d - Tilde character?\n",
  1045 		  linecnt,(int)(s-aline)+1);
  1046 	    else
  1047 		cnt_odd++;
  1048 	    eTilde=1;
  1049 	}
  1050 	if (!eCarat && *s==CHAR_CARAT)
  1051 	{  
  1052 	    if (pswit[ECHO_SWITCH])
  1053 		printf("\n%s\n",aline);
  1054 	    if (!pswit[OVERVIEW_SWITCH])
  1055 		printf("    Line %ld column %d - Carat character?\n",
  1056 		  linecnt,(int)(s-aline)+1);
  1057 	    else
  1058 		cnt_odd++;
  1059 	    eCarat=1;
  1060 	}
  1061 	if (!eFSlash && *s==CHAR_FORESLASH && warnings->fslash)
  1062 	{  
  1063 	    if (pswit[ECHO_SWITCH])
  1064 		printf("\n%s\n",aline);
  1065 	    if (!pswit[OVERVIEW_SWITCH])
  1066 		printf("    Line %ld column %d - Forward slash?\n",
  1067 		  linecnt,(int)(s-aline)+1);
  1068 	    else
  1069 		cnt_odd++;
  1070 	    eFSlash=1;
  1071 	}
  1072 	/*
  1073 	 * Report asterisks only in paranoid mode,
  1074 	 * since they're often deliberate.
  1075 	 */
  1076 	if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
  1077 	  *s==CHAR_ASTERISK)
  1078 	{
  1079 	    if (pswit[ECHO_SWITCH])
  1080 		printf("\n%s\n",aline);
  1081 	    if (!pswit[OVERVIEW_SWITCH])
  1082 		printf("    Line %ld column %d - Asterisk?\n",
  1083 		  linecnt,(int)(s-aline)+1);
  1084 	    else
  1085 		cnt_odd++;
  1086 	    eAst=1;
  1087 	}
  1088     }
  1089 }
  1090 
  1091 /*
  1092  * check_for_long_line:
  1093  *
  1094  * Check for line too long.
  1095  */
  1096 void check_for_long_line(const char *aline)
  1097 {
  1098     if (strlen(aline)>LONGEST_PG_LINE)
  1099     {
  1100 	if (pswit[ECHO_SWITCH])
  1101 	    printf("\n%s\n",aline);
  1102 	if (!pswit[OVERVIEW_SWITCH])
  1103 	    printf("    Line %ld column %d - Long line %d\n",
  1104 	      linecnt,strlen(aline),strlen(aline));
  1105 	else
  1106 	    cnt_long++;
  1107     }
  1108 }
  1109 
  1110 struct line_properties {
  1111     unsigned int len,blen;
  1112     char start;
  1113 };
  1114 
  1115 /*
  1116  * check_for_short_line:
  1117  *
  1118  * Check for line too short.
  1119  *
  1120  * This one is a bit trickier to implement: we don't want to
  1121  * flag the last line of a paragraph for being short, so we
  1122  * have to wait until we know that our current line is a
  1123  * "normal" line, then report the _previous_ line if it was too
  1124  * short. We also don't want to report indented lines like
  1125  * chapter heads or formatted quotations. We therefore keep
  1126  * last->len as the length of the last line examined, and
  1127  * last->blen as the length of the last but one, and try to
  1128  * suppress unnecessary warnings by checking that both were of
  1129  * "normal" length. We keep the first character of the last
  1130  * line in last->start, and if it was a space, we assume that
  1131  * the formatting is deliberate. I can't figure out a way to
  1132  * distinguish something like a quoted verse left-aligned or
  1133  * the header or footer of a letter from a paragraph of short
  1134  * lines - maybe if I examined the whole paragraph, and if the
  1135  * para has less than, say, 8 lines and if all lines are short,
  1136  * then just assume it's OK? Need to look at some texts to see
  1137  * how often a formula like this would get the right result.
  1138  */
  1139 void check_for_short_line(const char *aline,const struct line_properties *last)
  1140 {
  1141     if (strlen(aline)>1 && last->len>1 && last->len<SHORTEST_PG_LINE &&
  1142       last->blen>1 && last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
  1143     {
  1144 	if (pswit[ECHO_SWITCH])
  1145 	    printf("\n%s\n",prevline);
  1146 	if (!pswit[OVERVIEW_SWITCH])
  1147 	    printf("    Line %ld column %d - Short line %d?\n",
  1148 	      linecnt-1,strlen(prevline),strlen(prevline));
  1149 	else
  1150 	    cnt_short++;
  1151     }
  1152 }
  1153 
  1154 /*
  1155  * check_for_starting_punctuation:
  1156  *
  1157  * Look for punctuation other than full ellipses at start of line.
  1158  */
  1159 void check_for_starting_punctuation(const char *aline)
  1160 {
  1161     if (*aline && strchr(".?!,;:",aline[0]) && strncmp(". . .",aline,5))
  1162     {
  1163 	if (pswit[ECHO_SWITCH])
  1164 	    printf("\n%s\n",aline);
  1165 	if (!pswit[OVERVIEW_SWITCH])
  1166 	    printf("    Line %ld column 1 - Begins with punctuation?\n",
  1167 	      linecnt);
  1168 	else
  1169 	    cnt_punct++;
  1170     }
  1171 }
  1172 
  1173 /*
  1174  * check_for_spaced_emdash:
  1175  *
  1176  * Check for spaced em-dashes.
  1177  *
  1178  * We must check _all_ occurrences of "--" on the line
  1179  * hence the loop - even if the first double-dash is OK
  1180  * there may be another that's wrong later on.
  1181  */
  1182 void check_for_spaced_emdash(const char *aline)
  1183 {
  1184     const char *s,*t;
  1185     s=aline;
  1186     while ((t=strstr(s,"--")))
  1187     {
  1188 	if (t>aline && t[-1]==CHAR_SPACE || t[2]==CHAR_SPACE)
  1189 	{
  1190 	    if (pswit[ECHO_SWITCH])
  1191 		printf("\n%s\n",aline);
  1192 	    if (!pswit[OVERVIEW_SWITCH])
  1193 		printf("    Line %ld column %d - Spaced em-dash?\n",
  1194 		  linecnt,(int)(t-aline)+1);
  1195 	    else
  1196 		cnt_dash++;
  1197 	}
  1198 	s=t+2;
  1199     }
  1200 }
  1201 
  1202 /*
  1203  * check_for_spaced_dash:
  1204  *
  1205  * Check for spaced dashes.
  1206  */
  1207 void check_for_spaced_dash(const char *aline)
  1208 {
  1209     const char *s;
  1210     if ((s=strstr(aline," -")))
  1211     {
  1212 	if (s[2]!='-')
  1213 	{
  1214 	    if (pswit[ECHO_SWITCH])
  1215 		printf("\n%s\n",aline);
  1216 	    if (!pswit[OVERVIEW_SWITCH])
  1217 		printf("    Line %ld column %d - Spaced dash?\n",
  1218 		  linecnt,(int)(s-aline)+1);
  1219 	    else
  1220 		cnt_dash++;
  1221 	}
  1222     }
  1223     else if ((s=strstr(aline,"- ")))
  1224     {
  1225 	if (s==aline || s[-1]!='-')
  1226 	{
  1227 	    if (pswit[ECHO_SWITCH])
  1228 		printf("\n%s\n",aline);
  1229 	    if (!pswit[OVERVIEW_SWITCH])
  1230 		printf("    Line %ld column %d - Spaced dash?\n",
  1231 		  linecnt,(int)(s-aline)+1);
  1232 	    else
  1233 		cnt_dash++;
  1234 	}
  1235     }
  1236 }
  1237 
  1238 /*
  1239  * check_for_unmarked_paragraphs:
  1240  *
  1241  * Check for unmarked paragraphs indicated by separate speakers.
  1242  *
  1243  * May well be false positive:
  1244  * "Bravo!" "Wonderful!" called the crowd.
  1245  * but useful all the same.
  1246  */
  1247 void check_for_unmarked_paragraphs(const char *aline)
  1248 {
  1249     const char *s;
  1250     s=strstr(aline,"\"  \"");
  1251     if (!s)
  1252 	s=strstr(aline,"\" \"");
  1253     if (s)
  1254     {
  1255 	if (pswit[ECHO_SWITCH])
  1256 	    printf("\n%s\n",aline);
  1257 	if (!pswit[OVERVIEW_SWITCH])
  1258 	    printf("    Line %ld column %d - Query missing paragraph break?\n",
  1259 	      linecnt,(int)(s-aline)+1);
  1260 	else
  1261 	    cnt_punct++;
  1262     }
  1263 }
  1264 
  1265 /*
  1266  * check_for_jeebies:
  1267  *
  1268  * Check for "to he" and other easy h/b errors.
  1269  *
  1270  * This is a very inadequate effort on the h/b problem,
  1271  * but the phrase "to he" is always an error, whereas "to
  1272  * be" is quite common.
  1273  * Similarly, '"Quiet!", be said.' is a non-be error
  1274  * "to he" is _not_ always an error!:
  1275  *       "Where they went to he couldn't say."
  1276  * Another false positive:
  1277  *       What would "Cinderella" be without the . . .
  1278  * and another: "If he wants to he can see for himself."
  1279  */
  1280 void check_for_jeebies(const char *aline)
  1281 {
  1282     const char *s;
  1283     s=strstr(aline," be could ");
  1284     if (!s)
  1285 	s=strstr(aline," be would ");
  1286     if (!s)
  1287 	s=strstr(aline," was be ");
  1288     if (!s)
  1289 	s=strstr(aline," be is ");
  1290     if (!s)
  1291 	s=strstr(aline," is be ");
  1292     if (!s)
  1293 	s=strstr(aline,"\", be ");
  1294     if (!s)
  1295 	s=strstr(aline,"\" be ");
  1296     if (!s)
  1297 	s=strstr(aline,"\" be ");
  1298     if (!s)
  1299 	s=strstr(aline," to he ");
  1300     if (s)
  1301     {
  1302 	if (pswit[ECHO_SWITCH])
  1303 	    printf("\n%s\n",aline);
  1304 	if (!pswit[OVERVIEW_SWITCH])
  1305 	    printf("    Line %ld column %d - Query he/be error?\n",
  1306 	      linecnt,(int)(s-aline)+1);
  1307 	else
  1308 	    cnt_word++;
  1309     }
  1310     s=strstr(aline," the had ");
  1311     if (!s)
  1312 	s=strstr(aline," a had ");
  1313     if (!s)
  1314 	s=strstr(aline," they bad ");
  1315     if (!s)
  1316 	s=strstr(aline," she bad ");
  1317     if (!s)
  1318 	s=strstr(aline," he bad ");
  1319     if (!s)
  1320 	s=strstr(aline," you bad ");
  1321     if (!s)
  1322 	s=strstr(aline," i bad ");
  1323     if (s)
  1324     {
  1325 	if (pswit[ECHO_SWITCH])
  1326 	    printf("\n%s\n",aline);
  1327 	if (!pswit[OVERVIEW_SWITCH])
  1328 	    printf("    Line %ld column %d - Query had/bad error?\n",
  1329 	      linecnt,(int)(s-aline)+1);
  1330 	else
  1331 	    cnt_word++;
  1332     }
  1333     s=strstr(aline,"; hut ");
  1334     if (!s)
  1335 	s=strstr(aline,", hut ");
  1336     if (s)
  1337     {
  1338 	if (pswit[ECHO_SWITCH])
  1339 	    printf("\n%s\n",aline);
  1340 	if (!pswit[OVERVIEW_SWITCH])
  1341 	    printf("    Line %ld column %d - Query hut/but error?\n",
  1342 	      linecnt,(int)(s-aline)+1);
  1343 	else
  1344 	    cnt_word++;
  1345     }
  1346 }
  1347 
  1348 /*
  1349  * check_for_mta_from:
  1350  *
  1351  * Special case - angled bracket in front of "From" placed there by an
  1352  * MTA when sending an e-mail.
  1353  */
  1354 void check_for_mta_from(const char *aline)
  1355 {
  1356     const char *s;
  1357     s=strstr(aline,">From");
  1358     if (s)
  1359     {
  1360 	if (pswit[ECHO_SWITCH])
  1361 	    printf("\n%s\n",aline);
  1362 	if (!pswit[OVERVIEW_SWITCH])
  1363 	    printf("    Line %ld column %d - Query angled bracket with From\n",
  1364 	      linecnt,(int)(s-aline)+1);
  1365 	else
  1366 	    cnt_punct++;
  1367     }
  1368 }
  1369 
  1370 /*
  1371  * procfile:
  1372  *
  1373  * Process one file.
  1374  */
  1375 void procfile(char *filename)
  1376 {
  1377     char *s,*t,*s1,*wordstart;
  1378     char inword[MAXWORDLEN],testword[MAXWORDLEN];
  1379     char parastart[81];     /* first line of current para */
  1380     FILE *infile;
  1381     struct first_pass_results *first_pass_results;
  1382     struct warnings *warnings;
  1383     struct counters counters={0};
  1384     struct line_properties last={0};
  1385     int isemptyline;
  1386     long squot,start_para_line;
  1387     signed int i,j,llen,isacro,isellipsis,istypo,alower;
  1388     signed int dquotepar,squotepar;
  1389     signed int isnewpara,vowel,consonant;
  1390     char dquote_err[80],squote_err[80],rbrack_err[80],sbrack_err[80],
  1391       cbrack_err[80],unders_err[80];
  1392     signed int qword_index,qperiod_index,isdup;
  1393     signed int enddash;
  1394     last.start=CHAR_SPACE;
  1395     *dquote_err=*squote_err=*rbrack_err=*cbrack_err=*sbrack_err=
  1396       *unders_err=*prevline=0;
  1397     linecnt=checked_linecnt=start_para_line=0;
  1398     squot=0;
  1399     i=llen=isacro=isellipsis=istypo=0;
  1400     isnewpara=vowel=consonant=enddash=0;
  1401     qword_index=qperiod_index=isdup=0;
  1402     *inword=*testword=0;
  1403     dquotepar=squotepar=0;
  1404     for (j=0;j<MAX_QWORD;j++)
  1405     {
  1406         dupcnt[j]=0;
  1407         for (i=0;i<MAX_QWORD_LENGTH;i++)
  1408 	{
  1409             qword[i][j]=0;
  1410             qperiod[i][j]=0;
  1411 	}
  1412     }
  1413     infile=fopen(filename,"rb");
  1414     if (!infile)
  1415     {
  1416         if (pswit[STDOUT_SWITCH])
  1417             fprintf(stdout,"bookloupe: cannot open %s\n",filename);
  1418         else
  1419             fprintf(stderr,"bookloupe: cannot open %s\n",filename);
  1420 	exit(1);
  1421     }
  1422     fprintf(stdout,"\n\nFile: %s\n\n",filename);
  1423     first_pass_results=first_pass(infile);
  1424     warnings=report_first_pass(first_pass_results);
  1425     rewind(infile);
  1426     /*
  1427      * Here we go with the main pass. Hold onto yer hat!
  1428      * Re-init some variables we've dirtied.
  1429      */
  1430     squot=linecnt=0;
  1431     while (flgets(aline,LINEBUFSIZE-1,infile,linecnt+1))
  1432     {
  1433         linecnt++;
  1434         if (linecnt==1)
  1435 	    isnewpara=1;
  1436         if (pswit[DP_SWITCH] && !strncmp(aline,"-----File: ",11))
  1437 	    continue;    // skip DP page separators completely
  1438         if (linecnt<first_pass_results->firstline ||
  1439 	  (first_pass_results->footerline>0 &&
  1440 	  linecnt>first_pass_results->footerline))
  1441 	{
  1442             if (pswit[HEADER_SWITCH])
  1443 	    {
  1444                 if (!strncmp(aline,"Title:",6))
  1445                     printf("    %s\n",aline);
  1446                 if (!strncmp(aline,"Author:",7))
  1447                     printf("    %s\n",aline);
  1448                 if (!strncmp(aline,"Release Date:",13))
  1449                     printf("    %s\n",aline);
  1450                 if (!strncmp(aline,"Edition:",8))
  1451                     printf("    %s\n\n",aline);
  1452 	    }
  1453             continue;                /* skip through the header */
  1454 	}
  1455         checked_linecnt++;
  1456         s=aline;
  1457         /*
  1458 	 * If we are in a state of unbalanced quotes, and this line
  1459          * doesn't begin with a quote, output the stored error message.
  1460          * If the -P switch was used, print the warning even if the
  1461          * new para starts with quotes.
  1462 	 */
  1463         t=s;
  1464         while (*t==' ')
  1465 	    t++;
  1466         if (*dquote_err)
  1467             if (*t!=CHAR_DQUOTE || pswit[QPARA_SWITCH])
  1468 	    {
  1469                 if (!pswit[OVERVIEW_SWITCH])
  1470 		{
  1471                     if (pswit[ECHO_SWITCH])
  1472 			printf("\n%s\n",parastart);
  1473                     printf(dquote_err);
  1474 		}
  1475                 else
  1476                     cnt_dquot++;
  1477             }
  1478         if (*squote_err)
  1479 	{
  1480             if (*t!=CHAR_SQUOTE && *t!=CHAR_OPEN_SQUOTE ||
  1481 	      pswit[QPARA_SWITCH] || squot)
  1482 	    {
  1483                 if (!pswit[OVERVIEW_SWITCH])
  1484 		{
  1485                     if (pswit[ECHO_SWITCH])
  1486 			printf("\n%s\n",parastart);
  1487                     printf(squote_err);
  1488 		}
  1489                 else
  1490                     cnt_squot++;
  1491 	    }
  1492             squot=0;
  1493 	}
  1494         if (*rbrack_err)
  1495 	{
  1496             if (!pswit[OVERVIEW_SWITCH])
  1497 	    {
  1498                 if (pswit[ECHO_SWITCH])
  1499 		    printf("\n%s\n",parastart);
  1500                 printf(rbrack_err);
  1501 	    }
  1502             else
  1503                 cnt_brack++;
  1504 	}
  1505         if (*sbrack_err)
  1506 	{
  1507             if (!pswit[OVERVIEW_SWITCH])
  1508 	    {
  1509                 if (pswit[ECHO_SWITCH])
  1510 		    printf("\n%s\n",parastart);
  1511                 printf(sbrack_err);
  1512 	    }
  1513             else
  1514                 cnt_brack++;
  1515 	}
  1516         if (*cbrack_err)
  1517 	{
  1518             if (!pswit[OVERVIEW_SWITCH])
  1519 	    {
  1520                 if (pswit[ECHO_SWITCH])
  1521 		    printf("\n%s\n",parastart);
  1522                 printf(cbrack_err);
  1523 	    }
  1524             else
  1525                 cnt_brack++;
  1526 	}
  1527         if (*unders_err)
  1528 	{
  1529             if (!pswit[OVERVIEW_SWITCH])
  1530 	    {
  1531                 if (pswit[ECHO_SWITCH])
  1532 		    printf("\n%s\n",parastart);
  1533                 printf(unders_err);
  1534 	    }
  1535             else
  1536                 cnt_brack++;
  1537 	}
  1538         *dquote_err=*squote_err=*rbrack_err=*cbrack_err= 
  1539 	  *sbrack_err=*unders_err=0;
  1540 	isemptyline=analyse_quotes(aline,&counters);
  1541         if (isnewpara && !isemptyline)
  1542 	{
  1543 	    /* This line is the start of a new paragraph. */
  1544             start_para_line=linecnt;
  1545 	    /* Capture its first line in case we want to report it later. */
  1546             strncpy(parastart,aline,80);
  1547             parastart[79]=0;
  1548             dquotepar=squotepar=0; /* restart the quote count */
  1549             s=aline;
  1550             while (!gcisalpha(*s) && !gcisdigit(*s) && *s)
  1551 		s++;
  1552             if (*s>='a' && *s<='z')
  1553 	    {
  1554 		/* and its first letter is lowercase */
  1555                 if (pswit[ECHO_SWITCH])
  1556 		    printf("\n%s\n",aline);
  1557                 if (!pswit[OVERVIEW_SWITCH])
  1558                     printf("    Line %ld column %d - "
  1559 		      "Paragraph starts with lower-case\n",
  1560 		      linecnt,(int)(s-aline)+1);
  1561                 else
  1562                     cnt_punct++;
  1563 	    }
  1564             isnewpara=0; /* Signal the end of new para processing. */
  1565 	}
  1566         /* Check for an em-dash broken at line end. */
  1567         if (enddash && *aline=='-')
  1568 	{
  1569             if (pswit[ECHO_SWITCH])
  1570 		printf("\n%s\n",aline);
  1571             if (!pswit[OVERVIEW_SWITCH])
  1572                 printf("    Line %ld column 1 - Broken em-dash?\n",linecnt);
  1573             else
  1574                 cnt_punct++;
  1575 	}
  1576         enddash=0;
  1577         for (s=aline+strlen(aline)-1;*s==' ' && s>aline;s--)
  1578 	    ;
  1579         if (s>=aline && *s=='-')
  1580             enddash=1;
  1581 	/*
  1582          * Check for invalid or questionable characters in the line
  1583          * Anything above 127 is invalid for plain ASCII, and
  1584          * non-printable control characters should also be flagged.
  1585          * Tabs should generally not be there.
  1586 	 */
  1587         for (s=aline;*s;s++)
  1588 	{
  1589             i=(unsigned char)*s;
  1590             if (i<CHAR_SPACE && i!=CHAR_LF && i!=CHAR_CR && i!=CHAR_TAB)
  1591 	    {
  1592                 if (pswit[ECHO_SWITCH])
  1593 		    printf("\n%s\n",aline);
  1594                 if (!pswit[OVERVIEW_SWITCH])
  1595                     printf("    Line %ld column %d - Control character %d\n",
  1596 		      linecnt,(int)(s-aline)+1,i);
  1597                 else
  1598                     cnt_bin++;
  1599 	    }
  1600 	}
  1601         if (warnings->bin)
  1602 	    check_for_odd_characters(aline,warnings,isemptyline);
  1603         if (warnings->longline)
  1604 	    check_for_long_line(aline);
  1605         if (warnings->shortline)
  1606 	    check_for_short_line(aline,&last);
  1607         last.blen=last.len;
  1608         last.len=strlen(aline);
  1609         last.start=aline[0];
  1610 	check_for_starting_punctuation(aline);
  1611         if (warnings->dash)
  1612 	{
  1613 	    check_for_spaced_emdash(aline);
  1614 	    check_for_spaced_dash(aline);
  1615 	}
  1616 	check_for_unmarked_paragraphs(aline);
  1617 	check_for_jeebies(aline);
  1618 	check_for_mta_from(aline);
  1619         /*
  1620 	 * Check for a single character line -
  1621 	 * often an overflow from bad wrapping.
  1622 	 */
  1623         if (*aline && !aline[1])
  1624 	{
  1625             if (*aline=='I' || *aline=='V' || *aline=='X' || *aline=='L' ||
  1626 	      gcisdigit(*aline))
  1627                 ; /* Nothing - ignore numerals alone on a line. */
  1628             else
  1629 	    {
  1630                 if (pswit[ECHO_SWITCH])
  1631 		    printf("\n%s\n",aline);
  1632                 if (!pswit[OVERVIEW_SWITCH])
  1633                     printf("    Line %ld column 1 - "
  1634 		      "Query single character line\n",linecnt);
  1635                 else
  1636                     cnt_punct++;
  1637 	    }
  1638 	}
  1639         /* Check for I" - often should be ! */
  1640         if (strstr(aline," I\""))
  1641 	{
  1642             if (pswit[ECHO_SWITCH])
  1643 		printf("\n%s\n",aline);
  1644             if (!pswit[OVERVIEW_SWITCH])
  1645                 printf("    Line %ld column %ld - Query I=exclamation mark?\n",
  1646 		  linecnt,strstr(aline," I\"")-aline);
  1647             else
  1648                 cnt_punct++;
  1649 	}
  1650         /*
  1651 	 * Check for period without a capital letter. Cut-down from gutspell.
  1652          * Only works when it happens on a single line.
  1653 	 */
  1654         if (pswit[PARANOID_SWITCH])
  1655 	{
  1656             for (t=s=aline;strstr(t,". ");)
  1657 	    {
  1658                 t=strstr(t,". ");
  1659                 if (t==s)
  1660 		{
  1661                     t++;
  1662 		    /* start of line punctuation is handled elsewhere */
  1663                     continue;
  1664 		}
  1665                 if (!gcisalpha(t[-1]))
  1666 		{
  1667                     t++;
  1668                     continue;
  1669 		}
  1670                 if (warnings->isDutch)
  1671 		{
  1672 		    /* For Frank & Jeroen -- 's Middags case */
  1673                     if (t[2]==CHAR_SQUOTE && t[3]>='a' && t[3]<='z' &&
  1674 		      t[4]==CHAR_SPACE && t[5]>='A' && t[5]<='Z')
  1675 		    {
  1676                         t++;
  1677                         continue;
  1678 		    }
  1679 		}
  1680                 s1=t+2;
  1681                 while (*s1 && !gcisalpha(*s1) && !isdigit(*s1))
  1682                     s1++;
  1683                 if (*s1>='a' && *s1<='z')
  1684 		{
  1685 		    /* we have something to investigate */
  1686                     istypo=1;
  1687 		    /* so let's go back and find out */
  1688                     for (s1=t-1;s1>=s &&
  1689 		      (gcisalpha(*s1) || gcisdigit(*s1) || *s1==CHAR_SQUOTE &&
  1690 		      gcisalpha(s1[1]) && gcisalpha(s1[-1]));s1--)
  1691 			;
  1692                     s1++;
  1693                     for (i=0;*s1 && *s1!='.';s1++,i++)
  1694                         testword[i]=*s1;
  1695                     testword[i]=0;
  1696                     for (i=0;*abbrev[i];i++)
  1697                         if (!strcmp(testword,abbrev[i]))
  1698                             istypo=0;
  1699                     if (gcisdigit(*testword))
  1700 			istypo=0;
  1701                     if (!testword[1])
  1702 			istypo=0;
  1703                     if (isroman(testword))
  1704 			istypo=0;
  1705                     if (istypo)
  1706 		    {
  1707                         istypo=0;
  1708                         for (i=0;testword[i];i++)
  1709                             if (strchr(vowels,testword[i]))
  1710                                 istypo=1;
  1711 		    }
  1712                     if (istypo)
  1713 		    {
  1714                         isdup=0;
  1715                         if (strlen(testword)<MAX_QWORD_LENGTH &&
  1716 			  !pswit[VERBOSE_SWITCH])
  1717                             for (i=0;i<qperiod_index;i++)
  1718                                 if (!strcmp(testword,qperiod[i]))
  1719                                     isdup=1;
  1720                         if (!isdup)
  1721 			{
  1722                             if (qperiod_index<MAX_QWORD &&
  1723 			      strlen(testword)<MAX_QWORD_LENGTH)
  1724 			    {
  1725                                 strcpy(qperiod[qperiod_index],testword);
  1726                                 qperiod_index++;
  1727 			    }
  1728                             if (pswit[ECHO_SWITCH])
  1729 				printf("\n%s\n",aline);
  1730                             if (!pswit[OVERVIEW_SWITCH])
  1731                                 printf("    Line %ld column %d - "
  1732 				  "Extra period?\n",linecnt,(int)(t-aline)+1);
  1733                             else
  1734                                 cnt_punct++;
  1735 			}
  1736 		    }
  1737 		}
  1738 	    t++;
  1739 	    }
  1740 	}
  1741         if (pswit[TYPO_SWITCH])
  1742 	{
  1743             /* Check for words usually not followed by punctuation. */
  1744             for (s=aline;*s;)
  1745 	    {
  1746                 wordstart=s;
  1747                 s=getaword(s,inword);
  1748                 if (!*inword)
  1749 		    continue;
  1750                 lowerit(inword);
  1751                 for (i=0;*nocomma[i];i++)
  1752                     if (!strcmp(inword,nocomma[i]))
  1753 		    {
  1754                         if (*s==',' || *s==';' || *s==':')
  1755 			{
  1756                             if (pswit[ECHO_SWITCH])
  1757 				printf("\n%s\n",aline);
  1758                             if (!pswit[OVERVIEW_SWITCH])
  1759                                 printf("    Line %ld column %d - "
  1760 				  "Query punctuation after %s?\n",
  1761 				  linecnt,(int)(s-aline)+1,inword);
  1762                             else
  1763                                 cnt_punct++;
  1764 			}
  1765 		    }
  1766 		for (i=0;*noperiod[i];i++)
  1767                     if (!strcmp(inword,noperiod[i]))
  1768 		    {
  1769                         if (*s=='.' || *s=='!')
  1770 			{
  1771                             if (pswit[ECHO_SWITCH])
  1772 				printf("\n%s\n",aline);
  1773                             if (!pswit[OVERVIEW_SWITCH])
  1774                                 printf("    Line %ld column %d - "
  1775 				  "Query punctuation after %s?\n",
  1776 				  linecnt,(int)(s-aline)+1,inword);
  1777                             else
  1778                                 cnt_punct++;
  1779 			}
  1780 		    }
  1781 	    }
  1782 	}
  1783         /*
  1784 	 * Check for commonly mistyped words,
  1785 	 * and digits like 0 for O in a word.
  1786 	 */
  1787         for (s=aline;*s;)
  1788 	{
  1789             wordstart=s;
  1790             s=getaword(s,inword);
  1791             if (!*inword)
  1792 		continue; /* don't bother with empty lines */
  1793             if (mixdigit(inword))
  1794 	    {
  1795                 if (pswit[ECHO_SWITCH])
  1796 		    printf("\n%s\n",aline);
  1797                 if (!pswit[OVERVIEW_SWITCH])
  1798                     printf("    Line %ld column %d - Query digit in %s\n",
  1799 		      linecnt,(int)(wordstart-aline)+1,inword);
  1800                 else
  1801                     cnt_word++;
  1802 	    }
  1803             /*
  1804 	     * Put the word through a series of tests for likely typos and OCR
  1805 	     * errors.
  1806 	     */
  1807             if (pswit[TYPO_SWITCH])
  1808 	    {
  1809                 istypo=0;
  1810                 strcpy(testword,inword);
  1811                 alower=0;
  1812                 for (i=0;i<(signed int)strlen(testword);i++)
  1813 		{
  1814 		    /* lowercase for testing */
  1815                     if (testword[i]>='a' && testword[i]<='z')
  1816 			alower=1;
  1817                     if (alower && testword[i]>='A' && testword[i]<='Z')
  1818 		    {
  1819                         /*
  1820 			 * We have an uppercase mid-word. However, there are
  1821 			 * common cases:
  1822                          *   Mac and Mc like McGill
  1823                          *   French contractions like l'Abbe
  1824 			 */
  1825                         if (i==2 && testword[0]=='m' && testword[1]=='c' ||
  1826                           i==3 && testword[0]=='m' && testword[1]=='a' &&
  1827 			  testword[2]=='c' || i>0 && testword[i-1]==CHAR_SQUOTE)
  1828 			    ; /* do nothing! */
  1829                         else
  1830                             istypo=1;
  1831 		    }
  1832                     testword[i]=(char)tolower(testword[i]);
  1833 		}
  1834                 /*
  1835 		 * Check for certain unlikely two-letter combinations at word
  1836 		 * start and end.
  1837 		 */
  1838                 if (strlen(testword)>1)
  1839 		{
  1840                     for (i=0;*nostart[i];i++)
  1841                         if (!strncmp(testword,nostart[i],2))
  1842                             istypo=1;
  1843                     for (i=0;*noend[i];i++)
  1844                         if (!strncmp(testword+strlen(testword)-2,noend[i],2))
  1845                             istypo=1;
  1846 		}
  1847                 /* ght is common, gbt never. Like that. */
  1848                 if (strstr(testword,"cb"))
  1849 		    istypo=1;
  1850                 if (strstr(testword,"gbt"))
  1851 		    istypo=1;
  1852                 if (strstr(testword,"pbt"))
  1853 		    istypo=1;
  1854                 if (strstr(testword,"tbs"))
  1855 		    istypo=1;
  1856                 if (strstr(testword,"mrn"))
  1857 		    istypo=1;
  1858                 if (strstr(testword,"ahle"))
  1859 		    istypo=1;
  1860                 if (strstr(testword,"ihle"))
  1861 		    istypo=1;
  1862                 /*
  1863 		 * "TBE" does happen - like HEARTBEAT - but uncommon.
  1864                  * Also "TBI" - frostbite, outbid - but uncommon.
  1865                  * Similarly "ii" like Hawaii, or Pompeii, and in Roman
  1866 		 * numerals, but "ii" is a common scanno.
  1867 		 */
  1868                 if (strstr(testword,"tbi"))
  1869 		    istypo=1;
  1870                 if (strstr(testword,"tbe"))
  1871 		    istypo=1;
  1872                 if (strstr(testword,"ii"))
  1873 		    istypo=1;
  1874                 /*
  1875 		 * Check for no vowels or no consonants.
  1876                  * If none, flag a typo.
  1877 		 */
  1878                 if (!istypo && strlen(testword)>1)
  1879 		{
  1880                     vowel=consonant=0;
  1881                     for (i=0;testword[i];i++)
  1882 		    {
  1883                         if (testword[i]=='y' || gcisdigit(testword[i]))
  1884 			{
  1885 			    /* Yah, this is loose. */
  1886                             vowel++;
  1887                             consonant++;
  1888 			}
  1889                         else if (strchr(vowels,testword[i]))
  1890 			    vowel++;
  1891 			else
  1892 			    consonant++;
  1893 		    }
  1894                     if (!vowel || !consonant)
  1895                         istypo=1;
  1896 		}
  1897                 /*
  1898 		 * Now exclude the word from being reported if it's in
  1899                  * the okword list.
  1900 		 */
  1901                 for (i=0;*okword[i];i++)
  1902                     if (!strcmp(testword,okword[i]))
  1903                         istypo=0;
  1904                 /*
  1905 		 * What looks like a typo may be a Roman numeral.
  1906 		 * Exclude these.
  1907 		 */
  1908                 if (istypo && isroman(testword))
  1909 		    istypo=0;
  1910                 /* Check the manual list of typos. */
  1911                 if (!istypo)
  1912                     for (i=0;*typo[i];i++)
  1913                         if (!strcmp(testword,typo[i]))
  1914                             istypo=1;
  1915                 /*
  1916 		 * Check lowercase s, l, i and m - special cases.
  1917                  *   "j" - often a semi-colon gone wrong.
  1918                  *   "d" for a missing apostrophe - he d
  1919                  *   "n" for "in"
  1920 		 */
  1921                 if (!istypo && strlen(testword)==1 && strchr("slmijdn",*inword))
  1922 		    istypo=1;
  1923                 if (istypo)
  1924 		{
  1925                     isdup=0;
  1926                     if (strlen(testword)<MAX_QWORD_LENGTH &&
  1927 		      !pswit[VERBOSE_SWITCH])
  1928                         for (i=0;i<qword_index;i++)
  1929                             if (!strcmp(testword,qword[i]))
  1930 			    {
  1931                                 isdup=1;
  1932                                 ++dupcnt[i];
  1933 			    }
  1934                     if (!isdup)
  1935 		    {
  1936                         if (qword_index<MAX_QWORD &&
  1937 			  strlen(testword)<MAX_QWORD_LENGTH)
  1938 			{
  1939                             strcpy(qword[qword_index],testword);
  1940                             qword_index++;
  1941 			}
  1942                         if (pswit[ECHO_SWITCH])
  1943 			    printf("\n%s\n",aline);
  1944                         if (!pswit[OVERVIEW_SWITCH])
  1945 			{
  1946                             printf("    Line %ld column %d - Query word %s",
  1947 			      linecnt,(int)(wordstart-aline)+1,inword);
  1948                             if (strlen(testword)<MAX_QWORD_LENGTH &&
  1949 			      !pswit[VERBOSE_SWITCH])
  1950                                 printf(" - not reporting duplicates");
  1951                             printf("\n");
  1952 			}
  1953                         else
  1954                             cnt_word++;
  1955 		    }
  1956 		}
  1957 	    }
  1958 	    /* check the user's list of typos */
  1959 	    if (!istypo && usertypo_count)
  1960 		for (i=0;i<usertypo_count;i++)
  1961 		    if (!strcmp(testword,usertypo[i]))
  1962 		    {
  1963 			if (pswit[ECHO_SWITCH])
  1964 			    printf("\n%s\n",aline);
  1965 			if (!pswit[OVERVIEW_SWITCH])  
  1966 			    printf("    Line %ld column %d - "
  1967 			      "Query possible scanno %s\n",
  1968 			      linecnt,(int)(wordstart-aline)+2,inword);
  1969 		    }
  1970             if (pswit[PARANOID_SWITCH] && warnings->digit)
  1971 	    {
  1972 		/* In paranoid mode, query all 0 and 1 standing alone. */
  1973                 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
  1974 		{
  1975                     if (pswit[ECHO_SWITCH])
  1976 			printf("\n%s\n",aline);
  1977                     if (!pswit[OVERVIEW_SWITCH])
  1978                         printf("    Line %ld column %d - Query standalone %s\n",
  1979 			  linecnt,(int)(wordstart-aline)+2,inword);
  1980                     else
  1981                         cnt_word++;
  1982 		}
  1983 	    }
  1984 	}
  1985 	/*
  1986          * Look for added or missing spaces around punctuation and quotes.
  1987          * If there is a punctuation character like ! with no space on
  1988          * either side, suspect a missing!space. If there are spaces on
  1989          * both sides , assume a typo. If we see a double quote with no
  1990          * space or punctuation on either side of it, assume unspaced
  1991          * quotes "like"this.
  1992 	 */
  1993         llen=strlen(aline);
  1994         for (i=1;i<llen;i++)
  1995 	{
  1996 	    /* For each character in the line after the first. */
  1997             if (strchr(".?!,;:_",aline[i]))  /* if it's punctuation */
  1998 	    {
  1999 		/* we need to suppress warnings for acronyms like M.D. */
  2000                 isacro=0;
  2001 		/* we need to suppress warnings for ellipsis . . . */
  2002                 isellipsis=0;
  2003 		/* if there are letters on both sides of it or ... */
  2004                 if (gcisalpha(aline[i-1]) && gcisalpha(aline[i+1]) ||
  2005                    gcisalpha(aline[i+1]) && strchr("?!,;:",aline[i]))
  2006 		{
  2007 		    /* ...if it's strict punctuation followed by an alpha */
  2008                     if (aline[i]=='.')
  2009 		    {
  2010                         if (i>2 && aline[i-2]=='.')
  2011 			    isacro=1;
  2012                         if (i+2<llen && aline[i+2]=='.')
  2013 			    isacro=1;
  2014 		    }
  2015                     if (!isacro)
  2016 		    {
  2017                         if (pswit[ECHO_SWITCH])
  2018 			    printf("\n%s\n",aline);
  2019                         if (!pswit[OVERVIEW_SWITCH])
  2020                             printf("    Line %ld column %d - Missing space?\n",
  2021 			      linecnt,i+1);
  2022                         else
  2023                             cnt_punct++;
  2024 		    }
  2025 		}
  2026                 if (aline[i-1]==CHAR_SPACE &&
  2027 		  (aline[i+1]==CHAR_SPACE || aline[i+1]==0))
  2028 		{
  2029 		    /*
  2030 		     * If there are spaces on both sides,
  2031 		     * or space before and end of line.
  2032 		     */
  2033                     if (aline[i]=='.')
  2034 		    {
  2035                         if (i>2 && aline[i-2]=='.')
  2036 			    isellipsis=1;
  2037                         if (i+2<llen && aline[i+2]=='.')
  2038 			    isellipsis=1;
  2039 		    }
  2040                     if (!isemptyline && !isellipsis)
  2041 		    {
  2042                         if (pswit[ECHO_SWITCH])
  2043 			    printf("\n%s\n",aline);
  2044                         if (!pswit[OVERVIEW_SWITCH])
  2045                             printf("    Line %ld column %d - "
  2046 			      "Spaced punctuation?\n",linecnt,i+1);
  2047                         else
  2048                             cnt_punct++;
  2049 		    }
  2050 		}
  2051 	    }
  2052 	}
  2053         /* Split out the characters that CANNOT be preceded by space. */
  2054         llen=strlen(aline);
  2055         for (i=1;i<llen;i++)
  2056 	{
  2057 	    /* for each character in the line after the first */
  2058             if (strchr("?!,;:",aline[i]))
  2059 	    {
  2060 		/* if it's punctuation that _cannot_ have a space before it */
  2061                 if (aline[i-1]==CHAR_SPACE && !isemptyline &&
  2062 		  aline[i+1]!=CHAR_SPACE)
  2063 		{
  2064 		    /*
  2065 		     * If aline[i+1] DOES == space,
  2066 		     * it was already reported just above.
  2067 		     */
  2068                     if (pswit[ECHO_SWITCH])
  2069 			printf("\n%s\n",aline);
  2070                     if (!pswit[OVERVIEW_SWITCH])
  2071                         printf("    Line %ld column %d - Spaced punctuation?\n",
  2072 			  linecnt,i+1);
  2073                     else
  2074                         cnt_punct++;
  2075 		}
  2076 	    }
  2077 	}
  2078         /*
  2079 	 * Special case " .X" where X is any alpha.
  2080          * This plugs a hole in the acronym code above.
  2081 	 * Inelegant, but maintainable.
  2082 	 */
  2083         llen=strlen(aline);
  2084         for (i=1;i<llen;i++)
  2085 	{
  2086 	    /* for each character in the line after the first */
  2087             if (aline[i]=='.')
  2088 	    {
  2089 		/* if it's a period */
  2090                 if (aline[i-1]==CHAR_SPACE && gcisalpha(aline[i+1]))
  2091 		{
  2092 		    /*
  2093 		     * If the period follows a space and
  2094 		     * is followed by a letter.
  2095 		     */
  2096                     if (pswit[ECHO_SWITCH])
  2097 			printf("\n%s\n",aline);
  2098                     if (!pswit[OVERVIEW_SWITCH])
  2099                         printf("    Line %ld column %d - Spaced punctuation?\n",
  2100 			  linecnt,i+1);
  2101                     else
  2102                         cnt_punct++;
  2103 		}
  2104 	    }
  2105 	}
  2106         for (i=1;i<llen;i++)
  2107 	{
  2108 	    /* for each character in the line after the first */
  2109             if (aline[i]==CHAR_DQUOTE)
  2110 	    {
  2111                 if (!strchr(" _-.'`,;:!/([{?}])",aline[i-1]) &&
  2112 		  !strchr(" _-.'`,;:!/([{?}])",aline[i+1]) && aline[i+1] ||
  2113 		  !strchr(" _-([{'`",aline[i-1]) && gcisalpha(aline[i+1]))
  2114 		{
  2115 		    if (pswit[ECHO_SWITCH])
  2116 			printf("\n%s\n",aline);
  2117 		    if (!pswit[OVERVIEW_SWITCH])
  2118 			printf("    Line %ld column %d - Unspaced quotes?\n",
  2119 			  linecnt,i+1);
  2120 		    else
  2121 			cnt_punct++;
  2122 		}
  2123 	    }
  2124 	}
  2125         /* Check parity of quotes. */
  2126         for (s=aline;*s;s++)
  2127 	{
  2128             if (*s==CHAR_DQUOTE)
  2129 	    {
  2130                 if (!(dquotepar=!dquotepar))
  2131 		{
  2132 		    /* parity even */
  2133                     if (!strchr("_-.'`/,;:!?)]} ",s[1]))
  2134 		    {
  2135                         if (pswit[ECHO_SWITCH])
  2136 			    printf("\n%s\n",aline);
  2137                         if (!pswit[OVERVIEW_SWITCH])
  2138                             printf("    Line %ld column %d - "
  2139 			      "Wrongspaced quotes?\n",linecnt,(int)(s-aline)+1);
  2140                         else
  2141                             cnt_punct++;
  2142 		    }
  2143 		}
  2144                 else
  2145 		{
  2146 		    /* parity odd */
  2147                     if (!gcisalpha(s[1]) && !isdigit(s[1]) &&
  2148 		      !strchr("_-/.'`([{$",s[1]) || !s[1])
  2149 		    {
  2150                         if (pswit[ECHO_SWITCH])
  2151 			    printf("\n%s\n",aline);
  2152                         if (!pswit[OVERVIEW_SWITCH])
  2153                             printf("    Line %ld column %d - "
  2154 			      "Wrongspaced quotes?\n",linecnt,(int)(s-aline)+1);
  2155                         else
  2156                             cnt_punct++;
  2157 		    }
  2158 		}
  2159 	    }
  2160 	}
  2161 	if (*aline==CHAR_DQUOTE)
  2162 	{
  2163 	    if (strchr(",;:!?)]} ",aline[1]))
  2164 	    {
  2165 		if (pswit[ECHO_SWITCH])
  2166 		    printf("\n%s\n",aline);
  2167 		if (!pswit[OVERVIEW_SWITCH])
  2168 		    printf("    Line %ld column 1 - Wrongspaced quotes?\n",
  2169 		      linecnt);
  2170 		else
  2171 		    cnt_punct++;
  2172 	    }
  2173 	}
  2174         if (pswit[SQUOTE_SWITCH])
  2175 	{
  2176             for (s=aline;*s;s++)
  2177 	    {
  2178                 if ((*s==CHAR_SQUOTE || *s==CHAR_OPEN_SQUOTE) &&
  2179 		  (s==aline || s>aline && !gcisalpha(s[-1]) ||
  2180 		  !gcisalpha(s[1])))
  2181 		{
  2182                     if (!(squotepar=!squotepar))
  2183 		    {
  2184 			/* parity even */
  2185                         if (!strchr("_-.'`/\",;:!?)]} ",s[1]))
  2186 			{
  2187                             if (pswit[ECHO_SWITCH])
  2188 				printf("\n%s\n",aline);
  2189                             if (!pswit[OVERVIEW_SWITCH])
  2190                                 printf("    Line %ld column %d - "
  2191 				  "Wrongspaced singlequotes?\n",
  2192 				  linecnt,(int)(s-aline)+1);
  2193                             else
  2194                                 cnt_punct++;
  2195 			}
  2196 		    }
  2197                     else
  2198 		    {
  2199 			/* parity odd */
  2200                         if (!gcisalpha(s[1]) && !isdigit(s[1]) &&
  2201 			  !strchr("_-/\".'`",s[1]) || !s[1])
  2202 			{
  2203                             if (pswit[ECHO_SWITCH])
  2204 				printf("\n%s\n",aline);
  2205                             if (!pswit[OVERVIEW_SWITCH])
  2206                                 printf("    Line %ld column %d - "
  2207 				  "Wrongspaced singlequotes?\n",
  2208 				  linecnt,(int)(s-aline)+1);
  2209                             else
  2210                                 cnt_punct++;
  2211 			}
  2212 		    }
  2213 		}
  2214 	    }
  2215 	}
  2216         /*
  2217 	 * Look for double punctuation like ,. or ,,
  2218          * Thanks to DW for the suggestion!
  2219          * In books with references, ".," and ".;" are common
  2220          * e.g. "etc., etc.," and vol. 1.; vol 3.;
  2221          * OTOH, from my initial tests, there are also fairly
  2222          * common errors. What to do? Make these cases paranoid?
  2223          * ".," is the most common, so warnings->dotcomma is used
  2224          * to suppress detailed reporting if it occurs often.
  2225 	 */
  2226         llen=strlen(aline);
  2227         for (i=0;i<llen;i++)
  2228 	{
  2229 	    /* for each punctuation character in the line */
  2230             if (strchr(".?!,;:",aline[i]) && (strchr(".?!,;:",aline[i+1])) &&
  2231 	      aline[i] && aline[i+1])
  2232 	    {
  2233 		/* followed by punctuation, it's a query, unless . . . */
  2234                 if (aline[i]==aline[i+1] && (aline[i]=='.' || aline[i]=='?' ||
  2235 		  aline[i]=='!') ||
  2236 		  !warnings->dotcomma && aline[i]=='.' && aline[i+1]==',' ||
  2237 		  warnings->isFrench && !strncmp(aline+i,",...",4) ||
  2238 		  warnings->isFrench && !strncmp(aline+i,"...,",4) ||
  2239 		  warnings->isFrench && !strncmp(aline+i,";...",4) ||
  2240 		  warnings->isFrench && !strncmp(aline+i,"...;",4) ||
  2241 		  warnings->isFrench && !strncmp(aline+i,":...",4) ||
  2242 		  warnings->isFrench && !strncmp(aline+i,"...:",4) ||
  2243 		  warnings->isFrench && !strncmp(aline+i,"!...",4) ||
  2244 		  warnings->isFrench && !strncmp(aline+i,"...!",4) ||
  2245 		  warnings->isFrench && !strncmp(aline+i,"?...",4) ||
  2246 		  warnings->isFrench && !strncmp(aline+i,"...?",4))
  2247 		{
  2248 		    if (warnings->isFrench && !strncmp(aline+i,",...",4) ||
  2249 		      warnings->isFrench && !strncmp(aline+i,"...,",4) ||
  2250 		      warnings->isFrench && !strncmp(aline+i,";...",4) ||
  2251 		      warnings->isFrench && !strncmp(aline+i,"...;",4) ||
  2252 		      warnings->isFrench && !strncmp(aline+i,":...",4) ||
  2253 		      warnings->isFrench && !strncmp(aline+i,"...:",4) ||
  2254 		      warnings->isFrench && !strncmp(aline+i,"!...",4) ||
  2255 		      warnings->isFrench && !strncmp(aline+i,"...!",4) ||
  2256 		      warnings->isFrench && !strncmp(aline+i,"?...",4) ||
  2257 		      warnings->isFrench && !strncmp(aline+i,"...?",4))
  2258 			i+=4;
  2259 		    ; /* do nothing for .. !! and ?? which can be legit */
  2260 		}
  2261                 else
  2262 		{
  2263                     if (pswit[ECHO_SWITCH])
  2264 			printf("\n%s\n",aline);
  2265                     if (!pswit[OVERVIEW_SWITCH])
  2266                         printf("    Line %ld column %d - Double punctuation?\n",
  2267 			  linecnt,i+1);
  2268                     else
  2269                         cnt_punct++;
  2270 		}
  2271 	    }
  2272 	}
  2273         s=aline;
  2274         while (strstr(s," \" "))
  2275 	{
  2276             if (pswit[ECHO_SWITCH])
  2277 		printf("\n%s\n",aline);
  2278             if (!pswit[OVERVIEW_SWITCH])
  2279                 printf("    Line %ld column %d - Spaced doublequote?\n",
  2280 		  linecnt,(int)(strstr(s," \" ")-aline+1));
  2281             else
  2282                 cnt_punct++;
  2283             s=strstr(s," \" ")+2;
  2284 	}
  2285         s=aline;
  2286         while (strstr(s," ' "))
  2287 	{
  2288             if (pswit[ECHO_SWITCH])
  2289 		printf("\n%s\n",aline);
  2290             if (!pswit[OVERVIEW_SWITCH])
  2291                 printf("    Line %ld column %d - Spaced singlequote?\n",
  2292 		  linecnt,(int)(strstr(s," ' ")-aline+1));
  2293             else
  2294                 cnt_punct++;
  2295             s=strstr(s," ' ")+2;
  2296 	}
  2297         s=aline;
  2298         while (strstr(s," ` "))
  2299 	{
  2300             if (pswit[ECHO_SWITCH])
  2301 		printf("\n%s\n",aline);
  2302             if (!pswit[OVERVIEW_SWITCH])
  2303                 printf("    Line %ld column %d - Spaced singlequote?\n",
  2304 		  linecnt,(int)(strstr(s," ` ")-aline+1));
  2305             else
  2306                 cnt_punct++;
  2307             s=strstr(s," ` ")+2;
  2308 	}
  2309         /* check special case of 'S instead of 's at end of word */
  2310         s=aline+1;
  2311         while (*s)
  2312 	{
  2313             if (*s==CHAR_SQUOTE && s[1]=='S' && s[-1]>='a' && s[-1]<='z')
  2314 	    {
  2315                 if (pswit[ECHO_SWITCH])
  2316 		    printf("\n%s\n",aline);
  2317                 if (!pswit[OVERVIEW_SWITCH])
  2318                     printf("    Line %ld column %d - Capital \"S\"?\n",
  2319 		      linecnt,(int)(s-aline+2));
  2320                 else
  2321                     cnt_punct++;
  2322 	    }
  2323             s++;
  2324 	}
  2325         /*
  2326 	 * Now check special cases - start and end of line -
  2327          * for single and double quotes. Start is sometimes [sic]
  2328          * but better to query it anyway.
  2329          * While we're here, check for dash at end of line.
  2330 	 */
  2331         llen=strlen(aline);
  2332         if (llen>1)
  2333 	{
  2334             if (aline[llen-1]==CHAR_DQUOTE || aline[llen-1]==CHAR_SQUOTE ||
  2335 	      aline[llen-1]==CHAR_OPEN_SQUOTE)
  2336                 if (aline[llen-2]==CHAR_SPACE)
  2337 		{
  2338                     if (pswit[ECHO_SWITCH])
  2339 			printf("\n%s\n",aline);
  2340                     if (!pswit[OVERVIEW_SWITCH])
  2341                         printf("    Line %ld column %d - Spaced quote?\n",
  2342 			  linecnt,llen);
  2343                     else
  2344                         cnt_punct++;
  2345 		}
  2346             if ((aline[0]==CHAR_SQUOTE || aline[0]==CHAR_OPEN_SQUOTE) &&
  2347 	      aline[1]==CHAR_SPACE)
  2348 	    {
  2349 		if (pswit[ECHO_SWITCH])
  2350 		    printf("\n%s\n",aline);
  2351 		if (!pswit[OVERVIEW_SWITCH])
  2352 		    printf("    Line %ld column 1 - Spaced quote?\n",linecnt);
  2353 		else
  2354 		    cnt_punct++;
  2355 	    }
  2356             /*
  2357 	     * Dash at end of line may well be legit - paranoid mode only
  2358              * and don't report em-dash at line-end.
  2359 	     */
  2360             if (pswit[PARANOID_SWITCH] && warnings->hyphen)
  2361 	    {
  2362                 for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--)
  2363 		    ;
  2364                 if (aline[i]=='-' && aline[i-1]!='-')
  2365 		{
  2366                     if (pswit[ECHO_SWITCH])
  2367 			printf("\n%s\n",aline);
  2368                     if (!pswit[OVERVIEW_SWITCH])
  2369                         printf("    Line %ld column %d - "
  2370 			  "Hyphen at end of line?\n",linecnt,i);
  2371 		}
  2372 	    }
  2373 	}
  2374         /*
  2375 	 * Brackets are often unspaced, but shouldn't be surrounded by alpha.
  2376          * If so, suspect a scanno like "a]most".
  2377 	 */
  2378         llen=strlen(aline);
  2379         for (i=1;i<llen-1;i++)
  2380 	{
  2381 	    /* for each bracket character in the line except 1st & last */
  2382             if (strchr("{[()]}",aline[i]) && gcisalpha(aline[i-1]) &&
  2383 	      gcisalpha(aline[i+1]))
  2384 	    {
  2385                 if (pswit[ECHO_SWITCH])
  2386 		    printf("\n%s\n",aline);
  2387                 if (!pswit[OVERVIEW_SWITCH])
  2388                     printf("    Line %ld column %d - Unspaced bracket?\n",
  2389 		      linecnt,i);
  2390                 else
  2391                     cnt_punct++;
  2392 	    }
  2393 	}
  2394         llen=strlen(aline);
  2395         if (warnings->endquote)
  2396 	{
  2397             for (i=1;i<llen;i++)
  2398 	    {
  2399 		/* for each character in the line except 1st */
  2400                 if (aline[i]==CHAR_DQUOTE && isalpha(aline[i-1]))
  2401 		{
  2402 		    if (pswit[ECHO_SWITCH])
  2403 			printf("\n%s\n",aline);
  2404 		    if (!pswit[OVERVIEW_SWITCH])
  2405 			printf("    Line %ld column %d - "
  2406 			  "endquote missing punctuation?\n",linecnt,i);
  2407 		    else
  2408 			cnt_punct++;
  2409 		}
  2410 	    }
  2411 	}
  2412 	/*
  2413          * Check for <HTML TAG>.
  2414          * If there is a < in the line, followed at some point
  2415          * by a > then we suspect HTML.
  2416 	 */
  2417         if (strstr(aline,"<") && strstr(aline,">"))
  2418 	{
  2419             i=(signed int)(strstr(aline,">")-strstr(aline,"<")+1);
  2420             if (i>0)
  2421 	    {
  2422                 strncpy(wrk,strstr(aline,"<"),i);
  2423                 wrk[i]=0;
  2424                 if (pswit[ECHO_SWITCH])
  2425 		    printf("\n%s\n",aline);
  2426                 if (!pswit[OVERVIEW_SWITCH])
  2427                     printf("    Line %ld column %d - HTML Tag? %s \n",
  2428 		      linecnt,(int)(strstr(aline,"<")-aline)+1,wrk);
  2429                 else
  2430                     cnt_html++;
  2431 	    }
  2432 	}
  2433         /*
  2434 	 * Check for &symbol; HTML.
  2435          * If there is a & in the line, followed at
  2436          * some point by a ; then we suspect HTML.
  2437 	 */
  2438         if (strstr(aline,"&") && strstr(aline,";"))
  2439 	{
  2440             i=(int)(strstr(aline,";")-strstr(aline,"&")+1);
  2441             for (s=strstr(aline,"&");s<strstr(aline,";");s++)   
  2442                 if (*s==CHAR_SPACE)
  2443 		    i=0;                /* Don't report "Jones & Son;" */
  2444             if (i>0)
  2445 	    {
  2446                 strncpy(wrk,strstr(aline,"&"),i);
  2447                 wrk[i]=0;
  2448                 if (pswit[ECHO_SWITCH])
  2449 		    printf("\n%s\n",aline);
  2450                 if (!pswit[OVERVIEW_SWITCH])
  2451                     printf("    Line %ld column %d - HTML symbol? %s \n",
  2452 		      linecnt,(int)(strstr(aline,"&")-aline)+1,wrk);
  2453                 else
  2454                     cnt_html++;
  2455 	    }
  2456 	}
  2457         /*
  2458 	 * At end of paragraph, check for mismatched quotes.
  2459          * We don't want to report an error immediately, since it is a
  2460          * common convention to omit the quotes at end of paragraph if
  2461          * the next paragraph is a continuation of the same speaker.
  2462          * Where this is the case, the next para should begin with a
  2463          * quote, so we store the warning message and only display it
  2464          * at the top of the next iteration if the new para doesn't
  2465          * start with a quote.
  2466          * The -p switch overrides this default, and warns of unclosed
  2467          * quotes on _every_ paragraph, whether the next begins with a
  2468          * quote or not.
  2469 	 */
  2470         if (isemptyline)
  2471 	{
  2472 	    /* end of para - add up the totals */
  2473             if (counters.quot%2)
  2474                 sprintf(dquote_err,"    Line %ld - Mismatched quotes\n",
  2475 		  linecnt);
  2476             if (pswit[SQUOTE_SWITCH] && counters.open_single_quote &&
  2477 	      counters.open_single_quote!=counters.close_single_quote)
  2478                 sprintf(squote_err,"    Line %ld - Mismatched singlequotes?\n",
  2479 		  linecnt);
  2480             if (pswit[SQUOTE_SWITCH] && counters.open_single_quote &&
  2481 	      counters.open_single_quote!=counters.close_single_quote &&
  2482 	      counters.open_single_quote!=counters.close_single_quote+1)
  2483 		/*
  2484 		 * Flag it to be noted regardless of the
  2485 		 * first char of the next para.
  2486 		 */
  2487                 squot=1;
  2488             if (counters.r_brack)
  2489                 sprintf(rbrack_err,"    Line %ld - "
  2490 		  "Mismatched round brackets?\n",linecnt);
  2491             if (counters.s_brack)
  2492                 sprintf(sbrack_err,"    Line %ld - "
  2493 		  "Mismatched square brackets?\n",linecnt);
  2494             if (counters.c_brack)
  2495                 sprintf(cbrack_err,"    Line %ld - "
  2496 		  "Mismatched curly brackets?\n",linecnt);
  2497             if (counters.c_unders%2)
  2498                 sprintf(unders_err,"    Line %ld - Mismatched underscores?\n",
  2499 		  linecnt);
  2500 	    memset(&counters,0,sizeof(counters));
  2501 	    /* let the next iteration know that it's starting a new para */
  2502             isnewpara=1;
  2503 	}
  2504         /*
  2505 	 * Check for omitted punctuation at end of paragraph by working back
  2506 	 * through prevline. DW.
  2507          * Need to check this only for "normal" paras.
  2508          * So what is a "normal" para?
  2509          *    Not normal if one-liner (chapter headings, etc.)
  2510          *    Not normal if doesn't contain at least one locase letter
  2511          *    Not normal if starts with space
  2512 	 */
  2513         if (isemptyline)
  2514 	{
  2515 	    /* end of para */
  2516             for (s=prevline,i=0;*s && !i;s++)
  2517                 if (gcisletter(*s))
  2518 		    /* use i to indicate the presence of a letter on the line */
  2519                     i=1;
  2520             /*
  2521 	     * This next "if" is a problem.
  2522              * If we say "start_para_line <= linecnt - 1", that includes
  2523 	     * one-line "paragraphs" like chapter heads. Lotsa false positives.
  2524              * If we say "start_para_line < linecnt - 1" it doesn't, but then it
  2525              * misses genuine one-line paragraphs.
  2526 	     */
  2527             if (i && last.blen>2 && start_para_line<linecnt-1 &&
  2528 	      *prevline>CHAR_SPACE)
  2529 	    {
  2530                 for (i=strlen(prevline)-1;
  2531 		  (prevline[i]==CHAR_DQUOTE || prevline[i]==CHAR_SQUOTE) &&
  2532 		  prevline[i]>CHAR_SPACE && i>0;
  2533 		  i--)
  2534 		    ;
  2535                 for (;i>0;i--)
  2536 		{
  2537                     if (gcisalpha(prevline[i]))
  2538 		    {
  2539                         if (pswit[ECHO_SWITCH])
  2540 			    printf("\n%s\n",prevline);
  2541                         if (!pswit[OVERVIEW_SWITCH])
  2542                             printf("    Line %ld column %d - "
  2543 			      "No punctuation at para end?\n",
  2544 			      linecnt-1,strlen(prevline));
  2545                         else
  2546                             cnt_punct++;
  2547                         break;
  2548 		    }
  2549                     if (strchr("-.:!([{?}])",prevline[i]))
  2550                         break;
  2551 		}
  2552 	    }
  2553 	}
  2554         strcpy(prevline,aline);
  2555     }
  2556     fclose(infile);
  2557     if (!pswit[OVERVIEW_SWITCH])
  2558         for (i=0;i<MAX_QWORD;i++)
  2559             if (dupcnt[i])
  2560                 printf("\nNote: Queried word %s was duplicated %d time%s\n",
  2561 		  qword[i],dupcnt[i],"s");
  2562 }
  2563 
  2564 /*
  2565  * flgets:
  2566  *
  2567  * Get one line from the input stream, checking for
  2568  * the existence of exactly one CR/LF line-end per line.
  2569  *
  2570  * Returns: a pointer to the line.
  2571  */
  2572 char *flgets(char *theline,int maxlen,FILE *thefile,long lcnt)
  2573 {
  2574     char c;
  2575     int len,isCR,cint;
  2576     *theline=0;
  2577     len=isCR=0;
  2578     c=cint=fgetc(thefile);
  2579     do
  2580     {
  2581         if (cint==EOF)
  2582             return NULL;
  2583 	/* either way, it's end of line */
  2584         if (c==10)
  2585 	{
  2586             if (isCR)
  2587                 break;
  2588             else
  2589 	    {
  2590 		/* Error - a LF without a preceding CR */
  2591                 if (pswit[LINE_END_SWITCH])
  2592 		{
  2593                     if (pswit[ECHO_SWITCH])
  2594 			printf("\n%s\n",theline);
  2595                     if (!pswit[OVERVIEW_SWITCH])
  2596                         printf("    Line %ld - No CR?\n",lcnt);
  2597                     else
  2598                         cnt_lineend++;
  2599 		}
  2600                 break;
  2601 	    }
  2602 	}
  2603         if (c==13)
  2604 	{
  2605             if (isCR)
  2606 	    {
  2607 		/* Error - two successive CRs */
  2608                 if (pswit[LINE_END_SWITCH])
  2609 		{
  2610                     if (pswit[ECHO_SWITCH])
  2611 			printf("\n%s\n",theline);
  2612                     if (!pswit[OVERVIEW_SWITCH])
  2613                         printf("    Line %ld - Two successive CRs?\n",lcnt);
  2614                     else
  2615                         cnt_lineend++;
  2616 		}
  2617 	    }
  2618             isCR=1;
  2619 	}
  2620         else
  2621 	{
  2622             if (pswit[LINE_END_SWITCH] && isCR)
  2623 	    {
  2624                 if (pswit[ECHO_SWITCH])
  2625 		    printf("\n%s\n",theline);
  2626                 if (!pswit[OVERVIEW_SWITCH])
  2627                     printf("    Line %ld column %d - CR without LF?\n",
  2628 		      lcnt,len+1);
  2629                 else
  2630                     cnt_lineend++;
  2631 	    }
  2632             theline[len]=c;
  2633             len++;
  2634             theline[len]=0;
  2635             isCR=0;
  2636 	}
  2637         c=cint=fgetc(thefile);
  2638     } while(len<maxlen);
  2639     if (pswit[MARKUP_SWITCH])  
  2640         postprocess_for_HTML(theline);
  2641     if (pswit[DP_SWITCH])  
  2642         postprocess_for_DP(theline);
  2643     return theline;
  2644 }
  2645 
  2646 /*
  2647  * mixdigit:
  2648  *
  2649  * Takes a "word" as a parameter, and checks whether it
  2650  * contains a mixture of alpha and digits. Generally, this is an
  2651  * error, but may not be for cases like 4th or L5 12s. 3d.
  2652  *
  2653  * Returns: 0 if no error found, 1 if error.
  2654  */
  2655 int mixdigit(char *checkword)
  2656 {
  2657     int wehaveadigit,wehavealetter,firstdigits,query,wl;
  2658     char *s;
  2659     wehaveadigit=wehavealetter=query=0;
  2660     for (s=checkword;*s;s++)
  2661         if (gcisalpha(*s))
  2662             wehavealetter=1;
  2663         else
  2664             if (gcisdigit(*s))
  2665                 wehaveadigit=1;
  2666     if (wehaveadigit && wehavealetter)
  2667     {
  2668 	/* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
  2669         query=1;
  2670         wl=strlen(checkword);
  2671         for (firstdigits=0;gcisdigit(checkword[firstdigits]);firstdigits++)
  2672             ;
  2673         /* digits, ending in st, rd, nd, th of either case */
  2674         if (firstdigits+2==wl && (matchword(checkword+wl-2,"st") ||
  2675 	  matchword(checkword+wl-2,"rd") || matchword(checkword+wl-2,"nd") ||
  2676 	  matchword(checkword+wl-2,"th")))
  2677 	    query=0;
  2678         if (firstdigits+3==wl && (matchword(checkword+wl-3,"sts") ||
  2679 	  matchword(checkword+wl-3,"rds") || matchword(checkword+wl-3,"nds") ||
  2680 	  matchword(checkword+wl-3,"ths")))
  2681 	    query=0;
  2682         if (firstdigits+3==wl && (matchword(checkword+wl-4,"stly") ||
  2683 	  matchword(checkword+wl-4,"rdly") ||
  2684 	  matchword(checkword+wl-4,"ndly") || matchword(checkword+wl-4,"thly")))
  2685 	    query=0;
  2686         /* digits, ending in l, L, s or d */
  2687         if (firstdigits+1==wl && (checkword[wl-1]=='l' ||
  2688 	  checkword[wl-1]=='L' || checkword[wl-1]=='s' || checkword[wl-1]=='d'))
  2689 	    query=0;
  2690         /*
  2691 	 * L at the start of a number, representing Britsh pounds, like L500.
  2692          * This is cute. We know the current word is mixeddigit. If the first
  2693          * letter is L, there must be at least one digit following. If both
  2694          * digits and letters follow, we have a genuine error, else we have a
  2695          * capital L followed by digits, and we accept that as a non-error.
  2696 	 */
  2697         if (checkword[0]=='L' && !mixdigit(checkword+1))
  2698 	    query=0;
  2699     }
  2700     return query;
  2701 }
  2702 
  2703 /*
  2704  * getaword:
  2705  *
  2706  * Extracts the first/next "word" from the line, and puts
  2707  * it into "thisword". A word is defined as one English word unit--or
  2708  * at least that's the aim.
  2709  *
  2710  * Returns: a pointer to the position in the line where we will start
  2711  *          looking for the next word.
  2712  */
  2713 char *getaword(char *fromline,char *thisword)
  2714 {
  2715     int i,wordlen;
  2716     char *s;
  2717     wordlen=0;
  2718     for (;!gcisdigit(*fromline) && !gcisalpha(*fromline) && *fromline;
  2719       fromline++)
  2720 	;
  2721     /*
  2722      * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
  2723      * Especially yucky is the case of L1,000
  2724      * This section looks for a pattern of characters including a digit
  2725      * followed by a comma or period followed by one or more digits.
  2726      * If found, it returns this whole pattern as a word; otherwise we discard
  2727      * the results and resume our normal programming.
  2728      */
  2729     s=fromline;
  2730     for (;(gcisdigit(*s) || gcisalpha(*s) || *s==',' || *s=='.') &&
  2731       wordlen<MAXWORDLEN;s++)
  2732     {
  2733 	thisword[wordlen]=*s;
  2734         wordlen++;
  2735     }
  2736     thisword[wordlen]=0;
  2737     for (i=1;i<wordlen-1;i++)
  2738     {
  2739         if (thisword[i]=='.' || thisword[i]==',')
  2740 	{
  2741             if (gcisdigit(thisword[i-1]) && gcisdigit(thisword[i-1]))
  2742 	    {
  2743                 fromline=s;
  2744                 return fromline;
  2745 	    }
  2746 	}
  2747     }
  2748     /* we didn't find a punctuated number - do the regular getword thing */
  2749     wordlen=0;
  2750     for (;(gcisdigit(*fromline) || gcisalpha(*fromline) || *fromline=='\'') &&
  2751       wordlen<MAXWORDLEN;fromline++)
  2752     {
  2753         thisword[wordlen]=*fromline;
  2754         wordlen++;
  2755     }
  2756     thisword[wordlen]=0;
  2757     return fromline;
  2758 }
  2759 
  2760 /*
  2761  * matchword:
  2762  *
  2763  * A case-insensitive string matcher.
  2764  */
  2765 int matchword(char *checkfor,char *thisword)
  2766 {
  2767     unsigned int ismatch,i;
  2768     if (strlen(checkfor)!=strlen(thisword))
  2769 	return 0;
  2770     ismatch=1;     /* assume a match until we find a difference */
  2771     for (i=0;i<strlen(checkfor);i++)
  2772         if (toupper(checkfor[i])!=toupper(thisword[i]))
  2773             ismatch=0;
  2774     return ismatch;
  2775 }
  2776 
  2777 /*
  2778  * lowerit:
  2779  *
  2780  * Lowercase the line.
  2781  */
  2782 
  2783 void lowerit(char *theline)
  2784 {
  2785     for (;*theline;theline++)
  2786         if (*theline>='A' && *theline<='Z')
  2787             *theline+=32;
  2788 }
  2789 
  2790 /*
  2791  * isroman:
  2792  *
  2793  * Is this word a Roman Numeral?
  2794  *
  2795  * It doesn't actually validate that the number is a valid Roman Numeral--for
  2796  * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
  2797  * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
  2798  * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
  2799  * expressions thereof, except when it came to taxes. Allow any number of M,
  2800  * an optional D, an optional CM or CD, any number of optional Cs, an optional
  2801  * XL or an optional XC, an optional IX or IV, an optional V and any number
  2802  * of optional Is.
  2803  */
  2804 int isroman(char *t)
  2805 {
  2806     char *s;
  2807     if (!t || !*t)
  2808 	return 0;
  2809     s=t;
  2810     while (*t=='m' && *t)
  2811 	t++;
  2812     if (*t=='d')
  2813 	t++;
  2814     if (*t=='c' && t[1]=='m')
  2815 	t+=2;
  2816     if (*t=='c' && t[1]=='d')
  2817 	t+=2;
  2818     while (*t=='c' && *t)
  2819 	t++;
  2820     if (*t=='x' && t[1]=='l')
  2821 	t+=2;
  2822     if (*t=='x' && t[1]=='c')
  2823 	t+=2;
  2824     if (*t=='l')
  2825 	t++;
  2826     while (*t=='x' && *t)
  2827 	t++;
  2828     if (*t=='i' && t[1]=='x')
  2829 	t+=2;
  2830     if (*t=='i' && t[1]=='v')
  2831 	t+=2;
  2832     if (*t=='v')
  2833 	t++;
  2834     while (*t=='i' && *t)
  2835 	t++;
  2836     return !*t;
  2837 }
  2838 
  2839 /*
  2840  * gcisalpha:
  2841  *
  2842  * A version of isalpha() that is somewhat lenient on 8-bit texts.
  2843  * If we use the standard function, 8-bit accented characters break
  2844  * words, so that tete with accented characters appears to be two words, "t"
  2845  * and "t", with 8-bit characters between them. This causes over-reporting of
  2846  * errors. gcisalpha() recognizes accented letters from the CP1252 (Windows)
  2847  * and ISO-8859-1 character sets, which are the most common PG 8-bit types.
  2848  */
  2849 int gcisalpha(unsigned char c)
  2850 {
  2851     if (c>='a' && c<='z')
  2852 	return 1;
  2853     if (c>='A' && c<='Z')
  2854 	return 1;
  2855     if (c<140)
  2856 	return 0;
  2857     if (c>=192 && c!=208 && c!=215 && c!=222 && c!=240 && c!=247 && c!=254)
  2858 	return 1;
  2859     if (c==140 || c==142 || c==156 || c==158 || c==159)
  2860 	return 1;
  2861     return 0;
  2862 }
  2863 
  2864 /*
  2865  * gcisdigit:
  2866  *
  2867  * A version of isdigit() that doesn't get confused in 8-bit texts.
  2868  */
  2869 int gcisdigit(unsigned char c)
  2870 {   
  2871     return c>='0' && c<='9';
  2872 }
  2873 
  2874 /*
  2875  * gcisletter:
  2876  *
  2877  * A version of isletter() that doesn't get confused in 8-bit texts.
  2878  * NB: this is ISO-8891-1-specific.
  2879  */
  2880 int gcisletter(unsigned char c)
  2881 {   
  2882     return c>='A' && c<='Z' || c>='a' && c<='z' || c>=192;
  2883 }
  2884 
  2885 /*
  2886  * gcstrchr:
  2887  *
  2888  * Wraps strchr to return NULL if the character being searched for is zero.
  2889  */
  2890 char *gcstrchr(char *s,char c)
  2891 {
  2892     if (!c)
  2893 	return NULL;
  2894     return strchr(s,c);
  2895 }
  2896 
  2897 /*
  2898  * postprocess_for_DP:
  2899  *
  2900  * Invoked with the -d switch from flgets().
  2901  * It simply "removes" from the line a hard-coded set of common
  2902  * DP-specific tags, so that the line passed to the main routine has
  2903  * been pre-cleaned of DP markup.
  2904  */
  2905 void postprocess_for_DP(char *theline)
  2906 {
  2907     char *s,*t;
  2908     int i;
  2909     if (!*theline) 
  2910         return;
  2911     for (i=0;*DPmarkup[i];i++)
  2912     {
  2913         s=strstr(theline,DPmarkup[i]);
  2914         while (s)
  2915 	{
  2916             t=s+strlen(DPmarkup[i]);
  2917             while (*t)
  2918 	    {
  2919                 *s=*t;
  2920                 t++;
  2921 		s++;
  2922 	    }
  2923             *s=0;
  2924             s=strstr(theline,DPmarkup[i]);
  2925 	}
  2926     }
  2927 }
  2928 
  2929 /*
  2930  * postprocess_for_HTML:
  2931  *
  2932  * Invoked with the -m switch from flgets().
  2933  * It simply "removes" from the line a hard-coded set of common
  2934  * HTML tags and "replaces" a hard-coded set of common HTML
  2935  * entities, so that the line passed to the main routine has
  2936  * been pre-cleaned of HTML.
  2937  */
  2938 void postprocess_for_HTML(char *theline)
  2939 {
  2940     if (strstr(theline,"<") && strstr(theline,">"))
  2941         while (losemarkup(theline))
  2942             ;
  2943     while (loseentities(theline))
  2944         ;
  2945 }
  2946 
  2947 char *losemarkup(char *theline)
  2948 {
  2949     char *s,*t;
  2950     int i;
  2951     if (!*theline) 
  2952         return NULL;
  2953     s=strstr(theline,"<");
  2954     t=strstr(theline,">");
  2955     if (!s || !t)
  2956 	return NULL;
  2957     for (i=0;*markup[i];i++)
  2958         if (!tagcomp(s+1,markup[i]))
  2959 	{
  2960             if (!t[1])
  2961 	    {
  2962                 *s=0;
  2963                 return s;
  2964 	    }
  2965             else if (t>s)
  2966 	    {
  2967 		strcpy(s,t+1);
  2968 		return s;
  2969 	    }
  2970         }
  2971     /* It's an unrecognized <xxx>. */
  2972     return NULL;
  2973 }
  2974 
  2975 char *loseentities(char *theline)
  2976 {
  2977     int i;
  2978     char *s,*t;
  2979     if (!*theline) 
  2980         return NULL;
  2981     for (i=0;*entities[i].htmlent;i++)
  2982     {
  2983         s=strstr(theline,entities[i].htmlent);
  2984         if (s)
  2985 	{
  2986             t=malloc((size_t)strlen(s));
  2987             if (!t)
  2988 		return NULL;
  2989             strcpy(t,s+strlen(entities[i].htmlent));
  2990             strcpy(s,entities[i].textent);
  2991             strcat(s,t);
  2992             free(t);
  2993             return theline;
  2994 	}
  2995     }
  2996     for (i=0;*entities[i].htmlnum;i++)
  2997     {
  2998         s=strstr(theline,entities[i].htmlnum);
  2999         if (s)
  3000 	{
  3001             t=malloc((size_t)strlen(s));
  3002             if (!t)
  3003 		return NULL;
  3004             strcpy(t,s+strlen(entities[i].htmlnum));
  3005             strcpy(s,entities[i].textent);
  3006             strcat(s,t);
  3007             free(t);
  3008             return theline;
  3009 	}
  3010     }
  3011     return NULL;
  3012 }
  3013 
  3014 int tagcomp(char *strin,char *basetag)
  3015 {
  3016     char *s,*t;
  3017     s=basetag;
  3018     t=strin;
  3019     if (*t=='/')
  3020 	t++; /* ignore a slash */
  3021     while (*s && *t)
  3022     {
  3023         if (tolower(*s)!=tolower(*t))
  3024 	    return 1;
  3025         s++;
  3026 	t++;
  3027     }
  3028     return 0;
  3029 }
  3030 
  3031 void proghelp()
  3032 {
  3033     fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
  3034     fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
  3035     fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
  3036     fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
  3037       "For details, read the file COPYING.\n",stderr);
  3038     fputs("This is Free Software; "
  3039       "you may redistribute it under certain conditions (GPL);\n",stderr);
  3040     fputs("read the file COPYING for details.\n\n",stderr);
  3041     fputs("Usage is: bookloupe [-setpxloyhud] filename\n",stderr);
  3042     fputs("  where -s checks single quotes, -e suppresses echoing lines, "
  3043       "-t checks typos\n",stderr);
  3044     fputs("  -x (paranoid) switches OFF -t and extra checks, "
  3045       "-l turns OFF line-end checks\n",stderr);
  3046     fputs("  -o just displays overview without detail, "
  3047       "-h echoes header fields\n",stderr);
  3048     fputs("  -v (verbose) unsuppresses duplicate reporting, "
  3049       "-m suppresses markup\n",stderr);
  3050     fputs("  -d ignores DP-specific markup,\n",stderr);
  3051     fputs("  -u uses a file gutcheck.typ to query user-defined "
  3052       "possible typos\n",stderr);
  3053     fputs("Sample usage: bookloupe warpeace.txt \n",stderr);
  3054     fputs("\n",stderr);
  3055     fputs("Bookloupe looks for errors in Project Gutenberg(TM) etexts.\n",
  3056       stderr);
  3057     fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
  3058       "non-ASCII\n",stderr);
  3059     fputs("characters like accented letters, "
  3060       "lines longer than 75 or shorter than 55,\n",stderr);
  3061     fputs("unbalanced quotes or brackets, "
  3062       "a variety of badly formatted punctuation, \n",stderr);
  3063     fputs("HTML tags, some likely typos. "
  3064       "It is NOT a substitute for human judgement.\n",stderr);
  3065     fputs("\n",stderr);
  3066 }