bookloupe/bookloupe.c
author ali <ali@juiblex.co.uk>
Sat May 25 19:14:21 2013 +0100 (2013-05-25)
changeset 44 66483ebc9b56
parent 43 e4042a067753
child 45 d48f66b0ad0d
permissions -rw-r--r--
Break check_for_odd_characters() out
     1 /*************************************************************************/
     2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
     3 /*                                                                       */
     4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>                  */
     5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>                     */
     6 /*                                                                       */
     7 /* This program is free software; you can redistribute it and/or modify  */
     8 /* it under the terms of the GNU General Public License as published by  */
     9 /* the Free Software Foundation; either version 2 of the License, or     */
    10 /* (at your option) any later version.                                   */
    11 /*                                                                       */
    12 /* This program is distributed in the hope that it will be useful,       */
    13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of        */
    14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the          */
    15 /* GNU General Public License for more details.                          */
    16 /*                                                                       */
    17 /* You should have received a copy of the GNU General Public License     */
    18 /* along with this program. If not, see <http://www.gnu.org/licenses/>.  */
    19 /*************************************************************************/
    20 
    21 #include <stdio.h>
    22 #include <stdlib.h>
    23 #include <string.h>
    24 #include <ctype.h>
    25 
#define MAXWORDLEN    80    /* max length of one word             */
#define LINEBUFSIZE 2048    /* buffer size for an input line      */

/*
 * Capacity of the usertypo[] table and the name of the file main()
 * tries to load it from (current directory first, then the directory
 * the executable was started from).
 */
#define MAX_USER_TYPOS 1000
#define USERTYPO_FILE "gutcheck.typ"

/* Fallback size for path buffers when nothing else has defined MAX_PATH. */
#ifndef MAX_PATH
#define MAX_PATH 16384
#endif

/* Shared line buffer: main() and first_pass() both read file lines into it. */
char aline[LINEBUFSIZE];
/* NOTE(review): presumably the previously read line; its users are not in this part of the file. */
char prevline[LINEBUFSIZE];
    38 
/*
 * Common typos.
 *
 * All entries are lower case and the list is terminated by an empty
 * string. Several entries look like letter-shape confusions rather
 * than typing slips (for example "bave", "hecause", "modem").
 */
char *typo[] = {
    "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
    "nad", "te", "ig", "acn",  "ahve", "alot", "anbd", "andt", "awya", "aywa",
    "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
    "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
    "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
    "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
    "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
    "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
    "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
    "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
    "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
    "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
    "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
    "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
    "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
    "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
    "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
    "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
    "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
    "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
    "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
    "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
    "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
    "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
    "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
    "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
    "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
    "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
    "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
    "se", ""
};
    72 
/*
 * User-defined typos: main() fills this with malloc'd copies of the
 * lines read from USERTYPO_FILE; usertypo_count holds the number stored.
 */
char *usertypo[MAX_USER_TYPOS];
    74 
/*
 * Common abbreviations and other OK words not to query as typos.
 * Entries are lower case; an empty string terminates the list.
 */
char *okword[] = {
    "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
    "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
    "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
    "outbid", "outbids", "frostbite", "frostbitten", ""
};
    82 
/*
 * Common abbreviations that cause otherwise unexplained periods.
 * Entries are lower case; an empty string terminates the list.
 */
char *abbrev[] = {
    "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
    "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
};
    88 
/*
 * Two-letter combinations that rarely if ever start words,
 * but are common scannos or otherwise common letter combinations.
 * An empty string terminates the list.
 */
char *nostart[] = {
    "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
};
    96 
/*
 * Two-letter combinations that rarely if ever end words,
 * but are common scannos or otherwise common letter combinations.
 * An empty string terminates the list.
 */
char *noend[] = {
    "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
    "sw", "gr", "sl", "cl", "iy", ""
};
   105 
/*
 * HTML element names, lower case and without angle brackets.
 * An empty string terminates the list.
 * NOTE(review): presumably consulted when deciding whether "<...>" in the
 * text is real markup; the code that reads this table is not shown here.
 */
char *markup[] = {
    "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
    "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
    "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
    "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
};
   112 
/*
 * Literal markup sequences, listed as complete strings including their
 * delimiters. An empty string terminates the list.
 * NOTE(review): presumably the "DP-specific markup" that the D switch
 * ignores; confirm against postprocess_for_DP().
 */
char *DPmarkup[] = {
    "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
};
   116 
/*
 * Lower-case word list terminated by an empty string.
 * NOTE(review): going by the name, these are words that should not be
 * followed by a comma; the checking code is not shown here.
 * "mrs" is listed twice.
 */
char *nocomma[] = {
    "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
    "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
    "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
    "during", "let", "toward", "among", ""
};
   123 
/*
 * Lower-case word list terminated by an empty string.
 * NOTE(review): going by the name, these are words that should not be
 * followed by a period; the checking code is not shown here.
 */
char *noperiod[] = {
    "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
    "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
    "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
    "among", "those", "into", "whom", "having", "thence", ""
}; 
   130 
/*
 * Lower-case vowels, plain and accented, plus the ae ligature.
 * NOTE(review): assumes the file is stored in a single-byte encoding so
 * that each accented letter is one char -- confirm the source encoding.
 */
char vowels[] = "aeiouàáâãäæèéêëìíîïòóôõöùúûü";
   132 
   133 struct {
   134     char *htmlent;
   135     char *htmlnum;
   136     char *textent;
   137 } entities[] = {
   138     "&amp;",	"&#38;",     "&", 
   139     "&lt;",	"&#60;",     "<",
   140     "&gt;",	"&#62;",     ">",
   141     "&deg;",	"&#176;",    " degrees",
   142     "&pound;",	"&#163;",    "L",
   143     "&quot;",	"&#34;",     "\"", /* quotation mark = APL quote */
   144     "&OElig;",	"&#338;",    "OE", /* latin capital ligature OE */
   145     "&oelig;",	"&#339;",    "oe", /* latin small ligature oe */
   146     "&Scaron;",	"&#352;",    "S", /* latin capital letter S with caron */
   147     "&scaron;",	"&#353;",    "s", /* latin small letter s with caron */
   148     "&Yuml;",	"&#376;",    "Y", /* latin capital letter Y with diaeresis */
   149     "&circ;",	"&#710;",    "",  /* modifier letter circumflex accent */
   150     "&tilde;",	"&#732;",    "~", /* small tilde, U+02DC ISOdia */
   151     "&ensp;",	"&#8194;",   " ", /* en space, U+2002 ISOpub */
   152     "&emsp;",	"&#8195;",   " ", /* em space, U+2003 ISOpub */
   153     "&thinsp;",	"&#8201;",   " ", /* thin space, U+2009 ISOpub */
   154     "&ndash;",	"&#8211;",   "-", /* en dash, U+2013 ISOpub */
   155     "&mdash;",	"&#8212;",   "--", /* em dash, U+2014 ISOpub */
   156     "&rsquo;",	"&#8217;",   "'", /* right single quotation mark */
   157     "&sbquo;",	"&#8218;",   "'", /* single low-9 quotation mark */
   158     "&ldquo;",	"&#8220;",   "\"", /* left double quotation mark */
   159     "&rdquo;",	"&#8221;",   "\"", /* right double quotation mark */
   160     "&bdquo;",	"&#8222;",   "\"", /* double low-9 quotation mark */
   161     "&lsaquo;",	"&#8249;",   "\"", /* single left-pointing angle quotation mark */
   162     "&rsaquo;",	"&#8250;",   "\"", /* single right-pointing angle quotation mark */
   163     "&nbsp;",	"&#160;",    " ", /* no-break space = non-breaking space, */
   164     "&iexcl;",	"&#161;",    "!", /* inverted exclamation mark */
   165     "&cent;",	"&#162;",    "c", /* cent sign */
   166     "&pound;",	"&#163;",    "L", /* pound sign */
   167     "&curren;",	"&#164;",    "$", /* currency sign */
   168     "&yen;",	"&#165;",    "Y", /* yen sign = yuan sign */
   169     "&sect;",	"&#167;",    "--", /* section sign */
   170     "&uml;",	"&#168;",    " ", /* diaeresis = spacing diaeresis */
   171     "&copy;",	"&#169;",    "(C) ", /* copyright sign */
   172     "&ordf;",	"&#170;",    " ", /* feminine ordinal indicator */
   173     "&laquo;",	"&#171;",    "\"", /* left-pointing double angle quotation mark */
   174     "&shy;",	"&#173;",    "-", /* soft hyphen = discretionary hyphen */
   175     "&reg;",	"&#174;",    "(R) ", /* registered sign = registered trade mark sign */
   176     "&macr;",	"&#175;",    " ", /* macron = spacing macron = overline */
   177     "&deg;",	"&#176;",    " degrees", /* degree sign */
   178     "&plusmn;",	"&#177;",    "+-", /* plus-minus sign = plus-or-minus sign */
   179     "&sup2;",	"&#178;",    "2", /* superscript two = superscript digit two */
   180     "&sup3;",	"&#179;",    "3", /* superscript three = superscript digit three */
   181     "&acute;",	"&#180;",    " ", /* acute accent = spacing acute */
   182     "&micro;",	"&#181;",    "m", /* micro sign */
   183     "&para;",	"&#182;",    "--", /* pilcrow sign = paragraph sign */
   184     "&cedil;",	"&#184;",    " ", /* cedilla = spacing cedilla */
   185     "&sup1;",	"&#185;",    "1", /* superscript one = superscript digit one */
   186     "&ordm;",	"&#186;",    " ", /* masculine ordinal indicator */
   187     "&raquo;",	"&#187;",    "\"", /* right-pointing double angle quotation mark */
   188     "&frac14;",	"&#188;",    "1/4", /* vulgar fraction one quarter */
   189     "&frac12;",	"&#189;",    "1/2", /* vulgar fraction one half */
   190     "&frac34;",	"&#190;",    "3/4", /* vulgar fraction three quarters */
   191     "&iquest;",	"&#191;",    "?", /* inverted question mark */
   192     "&Agrave;",	"&#192;",    "A", /* latin capital letter A with grave */
   193     "&Aacute;",	"&#193;",    "A", /* latin capital letter A with acute */
   194     "&Acirc;",	"&#194;",    "A", /* latin capital letter A with circumflex */
   195     "&Atilde;",	"&#195;",    "A", /* latin capital letter A with tilde */
   196     "&Auml;",	"&#196;",    "A", /* latin capital letter A with diaeresis */
   197     "&Aring;",	"&#197;",    "A", /* latin capital letter A with ring above */
   198     "&AElig;",	"&#198;",    "AE", /* latin capital letter AE */
   199     "&Ccedil;",	"&#199;",    "C", /* latin capital letter C with cedilla */
   200     "&Egrave;",	"&#200;",    "E", /* latin capital letter E with grave */
   201     "&Eacute;",	"&#201;",    "E", /* latin capital letter E with acute */
   202     "&Ecirc;",	"&#202;",    "E", /* latin capital letter E with circumflex */
   203     "&Euml;",	"&#203;",    "E", /* latin capital letter E with diaeresis */
   204     "&Igrave;",	"&#204;",    "I", /* latin capital letter I with grave */
   205     "&Iacute;",	"&#205;",    "I", /* latin capital letter I with acute */
   206     "&Icirc;",	"&#206;",    "I", /* latin capital letter I with circumflex */
   207     "&Iuml;",	"&#207;",    "I", /* latin capital letter I with diaeresis */
   208     "&ETH;",	"&#208;",    "E", /* latin capital letter ETH */
   209     "&Ntilde;",	"&#209;",    "N", /* latin capital letter N with tilde */
   210     "&Ograve;",	"&#210;",    "O", /* latin capital letter O with grave */
   211     "&Oacute;",	"&#211;",    "O", /* latin capital letter O with acute */
   212     "&Ocirc;",	"&#212;",    "O", /* latin capital letter O with circumflex */
   213     "&Otilde;",	"&#213;",    "O", /* latin capital letter O with tilde */
   214     "&Ouml;",	"&#214;",    "O", /* latin capital letter O with diaeresis */
   215     "&times;",	"&#215;",    "*", /* multiplication sign */
   216     "&Oslash;",	"&#216;",    "O", /* latin capital letter O with stroke */
   217     "&Ugrave;",	"&#217;",    "U", /* latin capital letter U with grave */
   218     "&Uacute;",	"&#218;",    "U", /* latin capital letter U with acute */
   219     "&Ucirc;",	"&#219;",    "U", /* latin capital letter U with circumflex */
   220     "&Uuml;",	"&#220;",    "U", /* latin capital letter U with diaeresis */
   221     "&Yacute;",	"&#221;",    "Y", /* latin capital letter Y with acute */
   222     "&THORN;",	"&#222;",    "TH", /* latin capital letter THORN */
   223     "&szlig;",	"&#223;",    "sz", /* latin small letter sharp s = ess-zed */
   224     "&agrave;",	"&#224;",    "a", /* latin small letter a with grave */
   225     "&aacute;",	"&#225;",    "a", /* latin small letter a with acute */
   226     "&acirc;",	"&#226;",    "a", /* latin small letter a with circumflex */
   227     "&atilde;",	"&#227;",    "a", /* latin small letter a with tilde */
   228     "&auml;",	"&#228;",    "a", /* latin small letter a with diaeresis */
   229     "&aring;",	"&#229;",    "a", /* latin small letter a with ring above */
   230     "&aelig;",	"&#230;",    "ae", /* latin small letter ae */
   231     "&ccedil;",	"&#231;",    "c", /* latin small letter c with cedilla */
   232     "&egrave;",	"&#232;",    "e", /* latin small letter e with grave */
   233     "&eacute;",	"&#233;",    "e", /* latin small letter e with acute */
   234     "&ecirc;",	"&#234;",    "e", /* latin small letter e with circumflex */
   235     "&euml;",	"&#235;",    "e", /* latin small letter e with diaeresis */
   236     "&igrave;",	"&#236;",    "i", /* latin small letter i with grave */
   237     "&iacute;",	"&#237;",    "i", /* latin small letter i with acute */
   238     "&icirc;",	"&#238;",    "i", /* latin small letter i with circumflex */
   239     "&iuml;",	"&#239;",    "i", /* latin small letter i with diaeresis */
   240     "&eth;",	"&#240;",    "eth", /* latin small letter eth */
   241     "&ntilde;",	"&#241;",    "n", /* latin small letter n with tilde */
   242     "&ograve;",	"&#242;",    "o", /* latin small letter o with grave */
   243     "&oacute;",	"&#243;",    "o", /* latin small letter o with acute */
   244     "&ocirc;",	"&#244;",    "o", /* latin small letter o with circumflex */
   245     "&otilde;",	"&#245;",    "o", /* latin small letter o with tilde */
   246     "&ouml;",	"&#246;",    "o", /* latin small letter o with diaeresis */
   247     "&divide;",	"&#247;",    "/", /* division sign */
   248     "&oslash;",	"&#248;",    "o", /* latin small letter o with stroke */
   249     "&ugrave;",	"&#249;",    "u", /* latin small letter u with grave */
   250     "&uacute;",	"&#250;",    "u", /* latin small letter u with acute */
   251     "&ucirc;",	"&#251;",    "u", /* latin small letter u with circumflex */
   252     "&uuml;",	"&#252;",    "u", /* latin small letter u with diaeresis */
   253     "&yacute;",	"&#253;",    "y", /* latin small letter y with acute */
   254     "&thorn;",	"&#254;",    "th", /* latin small letter thorn */
   255     "&yuml;",	"&#255;",    "y", /* latin small letter y with diaeresis */
   256     "", ""
   257 };
   258 
/* special characters, given as ASCII code values */
#define CHAR_SPACE        32   /* ' '  */
#define CHAR_TAB           9   /* '\t' */
#define CHAR_LF           10   /* '\n' */
#define CHAR_CR           13   /* '\r' */
#define CHAR_DQUOTE       34   /* '"'  */
#define CHAR_SQUOTE       39   /* '\'' */
#define CHAR_OPEN_SQUOTE  96   /* '`'  */
#define CHAR_TILDE       126   /* '~'  */
#define CHAR_ASTERISK     42   /* '*'  */
#define CHAR_FORESLASH    47   /* '/'  */
#define CHAR_CARAT        94   /* '^' (caret) */

/* underscore and bracket characters, given as character literals */
#define CHAR_UNDERSCORE    '_'
#define CHAR_OPEN_CBRACK   '{'
#define CHAR_CLOSE_CBRACK  '}'
#define CHAR_OPEN_RBRACK   '('
#define CHAR_CLOSE_RBRACK  ')'
#define CHAR_OPEN_SBRACK   '['
#define CHAR_CLOSE_SBRACK  ']'

/*
 * longest and shortest normal PG line lengths
 * first_pass() counts lines longer than LONGEST_PG_LINE as long and
 * lines longer than WAY_TOO_LONG as very long.
 */
#define LONGEST_PG_LINE   75
#define WAY_TOO_LONG      80
#define SHORTEST_PG_LINE  55
   284 
/*
 * Command-line switch letters. The position of a letter in SWITCHES is
 * the index of its flag in pswit[]; the *_SWITCH macros below name
 * those positions. main() matches switch letters case-insensitively.
 *
 * After parsing, main() inverts the E, X and L flags, so for those
 * three giving the switch turns the feature OFF.
 */
#define SWITCHES "ESTPXLOYHWVMUD" /* switches:-                            */
                                  /*     D - ignore DP-specific markup     */
                                  /*     E - echo queried line             */
                                  /*     S - check single quotes           */
                                  /*     T - check common typos            */
                                  /*     P - require closure of quotes on  */
                                  /*         every paragraph               */
                                  /*     X - "Trust no one" :-) Paranoid!  */
                                  /*         Queries everything            */
                                  /*     L - line end checking defaults on */
                                  /*         -L turns it off               */
                                  /*     O - overview. Just shows counts.  */
                                  /*     Y - puts errors to stdout         */
                                  /*         instead of stderr             */
                                  /*     H - Echoes header fields          */
                                  /*     M - Ignore markup in < >          */
                                  /*     U - Use file of User-defined Typos*/
                                  /*     W - Defaults for use on Web upload*/
                                  /*     V - Verbose - list EVERYTHING!    */
#define SWITNO 14                 /* max number of switch parms            */
                                  /*        - used for defining array-size */
                                  /* must equal the length of SWITCHES     */
#define MINARGS   1               /* minimum no of args excl switches      */
#define MAXARGS   1               /* maximum no of args excl switches      */

int pswit[SWITNO];                /* program switches set by SWITCHES      */

/* Indices into pswit[], in the same order as the letters of SWITCHES. */
#define ECHO_SWITCH      0        /* E */
#define SQUOTE_SWITCH    1        /* S */
#define TYPO_SWITCH      2        /* T */
#define QPARA_SWITCH     3        /* P */
#define PARANOID_SWITCH  4        /* X */
#define LINE_END_SWITCH  5        /* L */
#define OVERVIEW_SWITCH  6        /* O */
#define STDOUT_SWITCH    7        /* Y */
#define HEADER_SWITCH    8        /* H */
#define WEB_SWITCH       9        /* W */
#define VERBOSE_SWITCH   10       /* V */
#define MARKUP_SWITCH    11       /* M */
#define USERTYPO_SWITCH  12       /* U */
#define DP_SWITCH        13       /* D */
   325 
/*
 * Query counters. main() resets the cnt_* values before processing the
 * file and, in overview mode, prints each non-zero one plus a total.
 * cnt_spacend is reset with the others but is neither printed nor
 * included in that total.
 */
long cnt_dquot;       /* for overview mode, count of doublequote queries */
long cnt_squot;       /* for overview mode, count of singlequote queries */
long cnt_brack;       /* for overview mode, count of brackets queries */
long cnt_bin;         /* for overview mode, count of non-ASCII queries */
long cnt_odd;         /* for overview mode, count of odd character queries */
long cnt_long;        /* for overview mode, count of long line errors */
long cnt_short;       /* for overview mode, count of short line queries */
long cnt_punct;       /* for overview mode, count of punctuation and spacing queries */
long cnt_dash;        /* for overview mode, count of dash-related queries */
long cnt_word;        /* for overview mode, count of word queries */
long cnt_html;        /* for overview mode, count of html queries */
long cnt_lineend;     /* for overview mode, count of line-end queries */
long cnt_spacend;     /* count of lines with space at end */
long linecnt;         /* count of total lines in the file */
long checked_linecnt; /* count of lines actually checked */
   341 
/*
 * Both are called from main(): proghelp() when the argument count is
 * wrong, procfile() with the single non-switch argument.
 */
void proghelp(void);
void procfile(char *);

/* NOTE(review): selector values; the code that uses them is not in this part of the file. */
#define LOW_THRESHOLD    0
#define HIGH_THRESHOLD   1

/* NOTE(review): paired 0/1 selector values; their users are not shown here. */
#define START 0
#define END 1
#define PREV 0
#define NEXT 1
#define FIRST_OF_PAIR 0
#define SECOND_OF_PAIR 1

/* NOTE(review): presumably an upper bound on stored word pairs -- confirm at its point of use. */
#define MAX_WORDPAIR 1000
   356 
/*
 * main() copies argv[0] here and then trims it back to the directory
 * part (keeping the trailing '/' or '\\'), or to an empty string when
 * argv[0] has no directory part.
 */
char running_from[MAX_PATH];

/*
 * Forward declarations for helpers defined later in the file.
 * flgets() is used by main() as a line reader: it is passed a buffer,
 * a size, a stream and a line count, and its result is tested for truth.
 */
int mixdigit(char *);
char *getaword(char *,char *);
int matchword(char *,char *);
char *flgets(char *,int,FILE *,long);
void lowerit(char *);
int gcisalpha(unsigned char);
int gcisdigit(unsigned char);
int gcisletter(unsigned char);
char *gcstrchr(char *s,char c);
void postprocess_for_HTML(char *);
char *linehasmarkup(char *);
char *losemarkup(char *);
int tagcomp(char *,char *);
char *loseentities(char *);
int isroman(char *);
int usertypo_count;               /* number of entries stored in usertypo[] */
void postprocess_for_DP(char *);

/* NOTE(review): line-sized scratch buffer; its users are not in this part of the file. */
char wrk[LINEBUFSIZE];

/*
 * Fixed-size tables of MAX_QWORD strings, each up to MAX_QWORD_LENGTH
 * bytes, with a parallel array of signed counters.
 * NOTE(review): presumably used to avoid repeating queries for the same
 * word; confirm where they are filled.
 */
#define MAX_QWORD 50
#define MAX_QWORD_LENGTH 40
char qword[MAX_QWORD][MAX_QWORD_LENGTH];
char qperiod[MAX_QWORD][MAX_QWORD_LENGTH];
signed int dupcnt[MAX_QWORD];
   384 
   385 int main(int argc,char **argv)
   386 {
   387     char *argsw,*s;
   388     int i,switno,invarg;
   389     char usertypo_file[MAX_PATH];
   390     FILE *usertypofile;
   391     if (strlen(argv[0])<sizeof(running_from))
   392 	/* save the path to the executable */
   393         strcpy(running_from,argv[0]);
   394     /* find out what directory we're running from */
   395     s=running_from+strlen(running_from);
   396     for (;*s!='/' && *s!='\\' && s>=running_from;s--)
   397         *s=0;
   398     switno=strlen(SWITCHES);
   399     for (i=switno;--i>0;)
   400         pswit[i]=0;           /* initialise switches */
   401     /*
   402      * Standard loop to extract switches.
   403      * When we come out of this loop, the arguments will be
   404      * in argv[0] upwards and the switches used will be
   405      * represented by their equivalent elements in pswit[]
   406      */
   407     while (--argc>0 && **++argv=='-')
   408         for (argsw=argv[0]+1;*argsw!='\0';argsw++)
   409             for (i=switno,invarg=1;(--i>=0) && invarg==1;)
   410                 if ((toupper(*argsw))==SWITCHES[i])
   411 		{
   412                     invarg=0;
   413                     pswit[i]=1;
   414 		}
   415     /* Paranoid checking is turned OFF, not on, by its switch */
   416     pswit[PARANOID_SWITCH]^=1;
   417     if (pswit[PARANOID_SWITCH])
   418 	/* if running in paranoid mode force typo checks as well   */
   419         pswit[TYPO_SWITCH]=pswit[TYPO_SWITCH]^1;
   420     /* Line-end checking is turned OFF, not on, by its switch */
   421     pswit[LINE_END_SWITCH]^=1;
   422     /* Echoing is turned OFF, not on, by its switch */
   423     pswit[ECHO_SWITCH]^=1;
   424     if (pswit[OVERVIEW_SWITCH])
   425 	/* just print summary; don't echo */
   426         pswit[ECHO_SWITCH]=0;
   427     /*
   428      * Web uploads - for the moment, this is really just a placeholder
   429      * until we decide what processing we really want to do on web uploads
   430      */
   431     if (pswit[WEB_SWITCH])
   432     {
   433 	/* specific override for web uploads */
   434         pswit[ECHO_SWITCH]=1;
   435         pswit[SQUOTE_SWITCH]=0;
   436         pswit[TYPO_SWITCH]=1;
   437         pswit[QPARA_SWITCH]=0;
   438         pswit[PARANOID_SWITCH]=1;
   439         pswit[LINE_END_SWITCH]=0;
   440         pswit[OVERVIEW_SWITCH]=0;
   441         pswit[STDOUT_SWITCH]=0;
   442         pswit[HEADER_SWITCH]=1;
   443         pswit[VERBOSE_SWITCH]=0;
   444         pswit[MARKUP_SWITCH]=0;
   445         pswit[USERTYPO_SWITCH]=0;
   446         pswit[DP_SWITCH]=0;
   447     }
   448     if (argc<MINARGS || argc>MAXARGS)
   449     {
   450 	/* check number of args */
   451         proghelp();
   452         return 1;
   453     }
   454     /* read in the user-defined stealth scanno list */
   455     if (pswit[USERTYPO_SWITCH])
   456     {
   457 	/* ... we were told we had one! */
   458         usertypofile=fopen(USERTYPO_FILE,"rb");
   459         if (!usertypofile)
   460 	{
   461 	    /* not in cwd. try excuteable directory. */
   462             strcpy(usertypo_file,running_from);
   463             strcat(usertypo_file,USERTYPO_FILE);
   464             usertypofile=fopen(usertypo_file,"rb");
   465             if (!usertypofile) {
   466 		/* we ain't got no user typo file! */
   467                 printf("   --> I couldn't find gutcheck.typ "
   468 		  "-- proceeding without user typos.\n");
   469 	    }
   470 	}
   471         usertypo_count=0;
   472         if (usertypofile)
   473 	{
   474 	    /* we managed to open a User Typo File! */
   475             if (pswit[USERTYPO_SWITCH])
   476 	    {
   477                 while (flgets(aline,LINEBUFSIZE-1,usertypofile,
   478 		  (long)usertypo_count))
   479 		{
   480                     if (strlen(aline)>1)
   481 		    {
   482                         if ((int)*aline>33)
   483 			{
   484                             s=malloc(strlen(aline)+1);
   485                             if (!s)
   486 			    {
   487                                 fprintf(stderr,"bookloupe: cannot get enough "
   488 				  "memory for user typo file!\n");
   489                                 exit(1);
   490 			    }
   491                             strcpy(s,aline);
   492                             usertypo[usertypo_count]=s;
   493                             usertypo_count++;
   494                             if (usertypo_count>=MAX_USER_TYPOS)
   495 			    {
   496                                 printf("   --> Only %d user-defined typos "
   497 				  "allowed: ignoring the rest\n",
   498 				  MAX_USER_TYPOS);
   499                                 break;
   500 			    }
   501 			}
   502 		    }
   503 		}
   504 	    }
   505             fclose(usertypofile);
   506 	}
   507     }
   508     fprintf(stderr,"bookloupe: Check and report on an e-text\n");
   509     cnt_dquot=cnt_squot=cnt_brack=cnt_bin=cnt_odd=cnt_long=
   510     cnt_short=cnt_punct=cnt_dash=cnt_word=cnt_html=cnt_lineend=
   511     cnt_spacend=0;
   512     procfile(argv[0]);
   513     if (pswit[OVERVIEW_SWITCH])
   514     {
   515 	printf("    Checked %ld lines of %ld (head+foot = %ld)\n\n",
   516 	  checked_linecnt,linecnt,linecnt-checked_linecnt);
   517         printf("    --------------- Queries found --------------\n");
   518         if (cnt_long)
   519 	    printf("    Long lines:                    %14ld\n",cnt_long);
   520         if (cnt_short)
   521 	    printf("    Short lines:                   %14ld\n",cnt_short);
   522         if (cnt_lineend)
   523 	    printf("    Line-end problems:             %14ld\n",cnt_lineend);
   524         if (cnt_word)
   525 	    printf("    Common typos:                  %14ld\n",cnt_word);
   526         if (cnt_dquot)
   527 	    printf("    Unmatched quotes:              %14ld\n",cnt_dquot);
   528         if (cnt_squot)
   529 	    printf("    Unmatched SingleQuotes:        %14ld\n",cnt_squot);
   530         if (cnt_brack)
   531 	    printf("    Unmatched brackets:            %14ld\n",cnt_brack);
   532         if (cnt_bin)
   533 	    printf("    Non-ASCII characters:          %14ld\n",cnt_bin);
   534         if (cnt_odd)
   535 	    printf("    Proofing characters:           %14ld\n",cnt_odd);
   536         if (cnt_punct)
   537 	    printf("    Punctuation & spacing queries: %14ld\n",cnt_punct);
   538         if (cnt_dash)
   539 	    printf("    Non-standard dashes:           %14ld\n",cnt_dash);
   540         if (cnt_html)
   541 	    printf("    Possible HTML tags:            %14ld\n",cnt_html);
   542         printf("\n");
   543         printf("    TOTAL QUERIES                  %14ld\n",
   544           cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+
   545           cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend);
   546     }
   547     return 0;
   548 }
   549 
   550 struct first_pass_results {
   551     long firstline,astline;
   552     long footerline,totlen,binlen,alphalen,endquote_count,shortline,dotcomma;
   553     long fslashline,hyphens,longline,verylongline,htmcount,standalone_digit;
   554     long spacedash,emdash,space_emdash,non_PG_space_emdash,PG_space_emdash;
   555     signed int Dutchcount,Frenchcount;
   556 };
   557 
   558 /*
   559  * first_pass:
   560  *
   561  * Run a first pass - verify that it's a valid PG
   562  * file, decide whether to report some things that
   563  * occur many times in the text like long or short
   564  * lines, non-standard dashes, etc.
   565  */
   566 struct first_pass_results *first_pass(FILE *infile)
   567 {
   568     char laststart=CHAR_SPACE,*s;
   569     signed int i,llen;
   570     unsigned int lastlen=0,lastblen=0;
   571     long spline=0,nspline=0;
   572     static struct first_pass_results results={0};
   573     char inword[MAXWORDLEN]="";
   574     while (fgets(aline,LINEBUFSIZE-1,infile))
   575     {
   576         while (aline[strlen(aline)-1]==10 || aline[strlen(aline)-1]==13)
   577 	    aline[strlen(aline)-1]=0;
   578         linecnt++;
   579         if (strstr(aline,"*END") && strstr(aline,"SMALL PRINT") &&
   580 	  (strstr(aline,"PUBLIC DOMAIN") || strstr(aline,"COPYRIGHT")))
   581 	{
   582             if (spline)
   583                 printf("   --> Duplicate header?\n");
   584             spline=linecnt+1;   /* first line of non-header text, that is */
   585 	}
   586         if (!strncmp(aline,"*** START",9) && strstr(aline,"PROJECT GUTENBERG"))
   587 	{
   588             if (nspline)
   589                 printf("   --> Duplicate header?\n");
   590             nspline=linecnt+1;   /* first line of non-header text, that is */
   591 	}
   592         if (spline || nspline)
   593 	{
   594             lowerit(aline);
   595             if (strstr(aline,"end") && strstr(aline,"project gutenberg"))
   596 	    {
   597                 if (strstr(aline,"end")<strstr(aline,"project gutenberg"))
   598 		{
   599                     if (results.footerline)
   600 		    {
   601 			/* it's an old-form header - we can detect duplicates */
   602                         if (!nspline)
   603                             printf("   --> Duplicate footer?\n");
   604 		    }
   605                     else
   606                         results.footerline=linecnt;
   607 		}
   608 	    }
   609 	}
   610         if (spline)
   611 	    results.firstline=spline;
   612         if (nspline)
   613 	    results.firstline=nspline;  /* override with new */
   614         if (results.footerline)
   615 	    continue;    /* don't count the boilerplate in the footer */
   616         llen=strlen(aline);
   617         results.totlen+=llen;
   618         for (i=0;i<llen;i++)
   619 	{
   620             if ((unsigned char)aline[i]>127)
   621 		results.binlen++;
   622             if (gcisalpha(aline[i]))
   623 		results.alphalen++;
   624             if (i>0 && aline[i]==CHAR_DQUOTE && isalpha(aline[i-1]))
   625 		results.endquote_count++;
   626 	}
   627         if (strlen(aline)>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE &&
   628 	  lastblen>2 && lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
   629 	    results.shortline++;
   630         if (*aline && (unsigned char)aline[strlen(aline)-1]<=CHAR_SPACE)
   631 	    cnt_spacend++;
   632         if (strstr(aline,".,"))
   633 	    results.dotcomma++;
   634         /* only count ast lines for ignoring purposes where there is */
   635         /* locase text on the line */
   636         if (strstr(aline,"*"))
   637 	{
   638             for (s=aline;*s;s++)
   639                 if (*s>='a' && *s<='z')
   640                     break;
   641              if (*s)
   642 		results.astline++;
   643 	}
   644         if (strstr(aline,"/"))
   645             results.fslashline++;
   646         for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--)
   647 	    ;
   648         if (aline[i]=='-' && aline[i-1]!='-')
   649 	    results.hyphens++;
   650         if (llen>LONGEST_PG_LINE)
   651 	    results.longline++;
   652         if (llen>WAY_TOO_LONG)
   653 	    results.verylongline++;
   654         if (strstr(aline,"<") && strstr(aline,">"))
   655 	{
   656             i=(signed int)(strstr(aline,">")-strstr(aline,"<")+1);
   657             if (i>0)
   658                 results.htmcount++;
   659             if (strstr(aline,"<i>"))
   660 		results.htmcount+=4; /* bonus marks! */
   661 	}
   662         /* Check for spaced em-dashes */
   663         if (strstr(aline,"--"))
   664 	{
   665             results.emdash++;
   666             if (*(strstr(aline,"--")-1)==CHAR_SPACE ||
   667                (*(strstr(aline,"--")+2)==CHAR_SPACE))
   668 		results.space_emdash++;
   669             if (*(strstr(aline,"--")-1)==CHAR_SPACE &&
   670                (*(strstr(aline,"--")+2)==CHAR_SPACE))
   671 		/* count of em-dashes with spaces both sides */
   672 		results.non_PG_space_emdash++;
   673             if (*(strstr(aline,"--")-1)!=CHAR_SPACE &&
   674                (*(strstr(aline,"--")+2)!=CHAR_SPACE))
   675 		/* count of PG-type em-dashes with no spaces */
   676 		results.PG_space_emdash++;
   677 	}
   678         for (s=aline;*s;)
   679 	{
   680             s=getaword(s,inword);
   681             if (!strcmp(inword,"hij") || !strcmp(inword,"niet")) 
   682                 results.Dutchcount++;
   683             if (!strcmp(inword,"dans") || !strcmp(inword,"avec")) 
   684                 results.Frenchcount++;
   685             if (!strcmp(inword,"0") || !strcmp(inword,"1")) 
   686                 results.standalone_digit++;
   687 	}
   688         /* Check for spaced dashes */
   689         if (strstr(aline," -") && *(strstr(aline," -")+2)!='-')
   690 	    results.spacedash++;
   691         lastblen=lastlen;
   692         lastlen=strlen(aline);
   693         laststart=aline[0];
   694     }
   695     return &results;
   696 }
   697 
   698 struct warnings {
   699     signed int shortline,longline,bin,dash,dotcomma,ast,fslash,digit,hyphen;
   700     signed int endquote,isDutch,isFrench;
   701 };
   702 
   703 /*
   704  * report_first_pass:
   705  *
   706  * Make some snap decisions based on the first pass results.
   707  */
   708 struct warnings *report_first_pass(struct first_pass_results *results)
   709 {
   710     static struct warnings warnings={0};
   711     if (cnt_spacend>0)
   712         printf("   --> %ld lines in this file have white space at end\n",
   713 	  cnt_spacend);
   714     warnings.dotcomma=1;
   715     if (results->dotcomma>5)
   716     {
   717         warnings.dotcomma=0;
   718         printf("   --> %ld lines in this file contain '.,'. "
   719 	  "Not reporting them.\n",results->dotcomma);
   720     }
   721     /*
   722      * If more than 50 lines, or one-tenth, are short,
   723      * don't bother reporting them.
   724      */
   725     warnings.shortline=1;
   726     if (results->shortline>50 || results->shortline*10>linecnt)
   727     {
   728         warnings.shortline=0;
   729         printf("   --> %ld lines in this file are short. "
   730 	  "Not reporting short lines.\n",results->shortline);
   731     }
   732     /*
   733      * If more than 50 lines, or one-tenth, are long,
   734      * don't bother reporting them.
   735      */
   736     warnings.longline=1;
   737     if (results->longline>50 || results->longline*10>linecnt)
   738     {
   739         warnings.longline=0;
   740         printf("   --> %ld lines in this file are long. "
   741 	  "Not reporting long lines.\n",results->longline);
   742     }
   743     /* If more than 10 lines contain asterisks, don't bother reporting them. */
   744     warnings.ast=1;
   745     if (results->astline>10)
   746     {
   747         warnings.ast=0;
   748         printf("   --> %ld lines in this file contain asterisks. "
   749 	  "Not reporting them.\n",results->astline);
   750     }
   751     /*
   752      * If more than 10 lines contain forward slashes,
   753      * don't bother reporting them.
   754      */
   755     warnings.fslash=1;
   756     if (results->fslashline>10)
   757     {
   758         warnings.fslash=0;
   759         printf("   --> %ld lines in this file contain forward slashes. "
   760 	  "Not reporting them.\n",results->fslashline);
   761     }
   762     /*
   763      * If more than 20 lines contain unpunctuated endquotes,
   764      * don't bother reporting them.
   765      */
   766     warnings.endquote=1;
   767     if (results->endquote_count>20)
   768     {
   769         warnings.endquote=0;
   770         printf("   --> %ld lines in this file contain unpunctuated endquotes. "
   771 	  "Not reporting them.\n",results->endquote_count);
   772     }
   773     /*
   774      * If more than 15 lines contain standalone digits,
   775      * don't bother reporting them.
   776      */
   777     warnings.digit=1;
   778     if (results->standalone_digit>10)
   779     {
   780         warnings.digit=0;
   781         printf("   --> %ld lines in this file contain standalone 0s and 1s. "
   782 	  "Not reporting them.\n",results->standalone_digit);
   783     }
   784     /*
   785      * If more than 20 lines contain hyphens at end,
   786      * don't bother reporting them.
   787      */
   788     warnings.hyphen=1;
   789     if (results->hyphens>20)
   790     {
   791         warnings.hyphen=0;
   792         printf("   --> %ld lines in this file have hyphens at end. "
   793 	  "Not reporting them.\n",results->hyphens);
   794     }
   795     if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
   796     {
   797         printf("   --> Looks like this is HTML. Switching HTML mode ON.\n");
   798         pswit[MARKUP_SWITCH]=1;
   799     }
   800     if (results->verylongline>0)
   801         printf("   --> %ld lines in this file are VERY long!\n",
   802 	  results->verylongline);
   803     /*
   804      * If there are more non-PG spaced dashes than PG em-dashes,
   805      * assume it's deliberate.
   806      * Current PG guidelines say don't use them, but older texts do,
   807      * and some people insist on them whatever the guidelines say.
   808      */
   809     warnings.dash=1;
   810     if (results->spacedash+results->non_PG_space_emdash>
   811       results->PG_space_emdash)
   812     {
   813         warnings.dash=0;
   814         printf("   --> There are %ld spaced dashes and em-dashes. "
   815 	  "Not reporting them.\n",
   816 	  results->spacedash+results->non_PG_space_emdash);
   817     }
   818     /* If more than a quarter of characters are hi-bit, bug out. */
   819     warnings.bin=1;
   820     if (results->binlen*4>results->totlen)
   821     {
   822         printf("   --> This file does not appear to be ASCII. "
   823 	  "Terminating. Best of luck with it!\n");
   824         exit(1);
   825     }
   826     if (results->alphalen*4<results->totlen)
   827     {
   828         printf("   --> This file does not appear to be text. "
   829 	  "Terminating. Best of luck with it!\n");
   830         exit(1);
   831     }
   832     if (results->binlen*100>results->totlen || results->binlen>100)
   833     {
   834         printf("   --> There are a lot of foreign letters here. "
   835 	  "Not reporting them.\n");
   836         warnings.bin=0;
   837     }
   838     warnings.isDutch=0;
   839     if (results->Dutchcount>50)
   840     {
   841         warnings.isDutch=1;
   842         printf("   --> This looks like Dutch - "
   843 	  "switching off dashes and warnings for 's Middags case.\n");
   844     }
   845     warnings.isFrench=0;
   846     if (results->Frenchcount>50)
   847     {
   848         warnings.isFrench=1;
   849         printf("   --> This looks like French - "
   850 	  "switching off some doublepunct.\n");
   851     }
   852     if (results->firstline && results->footerline)
   853         printf("    The PG header and footer appear to be already on.\n");
   854     else
   855     {
   856         if (results->firstline)
   857             printf("    The PG header is on - no footer.\n");
   858         if (results->footerline)
   859             printf("    The PG footer is on - no header.\n");
   860     }
   861     printf("\n");
   862     if (pswit[VERBOSE_SWITCH])
   863     {
   864         warnings.bin=1;
   865         warnings.shortline=1;
   866         warnings.dotcomma=1;
   867         warnings.longline=1;
   868         warnings.dash=1;
   869         warnings.digit=1;
   870         warnings.ast=1;
   871         warnings.fslash=1;
   872         warnings.hyphen=1;
   873         warnings.endquote=1;
   874         printf("   *** Verbose output is ON -- you asked for it! ***\n");
   875     }
   876     if (warnings.isDutch)
   877         warnings.dash=0;
   878     if (results->footerline>0 && results->firstline>0 &&
   879       results->footerline>results->firstline &&
   880       results->footerline-results->firstline<100)
   881     {
   882         printf("   --> I don't really know where this text starts. \n");
   883         printf("       There are no reference points.\n");
   884         printf("       I'm going to have to report the header and footer "
   885 	  "as well.\n");
   886         results->firstline=0;
   887     }
   888     return &warnings;
   889 }
   890 
   891 struct counters {
   892     long quot;
   893     signed int c_unders,c_brack,s_brack,r_brack;
   894     signed int open_single_quote,close_single_quote;
   895 };
   896 
   897 /*
   898  * analyse_quotes:
   899  *
   900  * Look along the line, accumulate the count of quotes, and see
   901  * if this is an empty line - i.e. a line with nothing on it
   902  * but spaces.
   903  * If line has just spaces, period, * and/or - on it, don't
   904  * count it, since empty lines with asterisks or dashes to
   905  * separate sections are common.
   906  *
   907  * Returns: Non-zero if the line is empty.
   908  */
   909 int analyse_quotes(const char *s,struct counters *counters)
   910 {
   911     signed int guessquote=0;
   912     int isemptyline=1;    /* assume the line is empty until proven otherwise */
   913     while (*s)
   914     {
   915 	if (*s==CHAR_DQUOTE)
   916 	    counters->quot++;
   917 	if (*s==CHAR_SQUOTE || *s==CHAR_OPEN_SQUOTE)
   918 	{
   919 	    if (s==aline)
   920 	    {
   921 		/*
   922 		 * At start of line, it can only be an openquote.
   923 		 * Hardcode a very common exception!
   924 		 */
   925 		if (strncmp(s+2,"tis",3) && strncmp(s+2,"Tis",3))
   926 		    counters->open_single_quote++;
   927 	    }
   928 	    else if (gcisalpha(s[-1]) && gcisalpha(s[1]))
   929 		/* Do nothing! it's definitely an apostrophe, not a quote */
   930 		;
   931 	    /* it's outside a word - let's check it out */
   932 	    else if (*s==CHAR_OPEN_SQUOTE || gcisalpha(s[1]))
   933 	    {
   934 		/* it damwell better BE an openquote */
   935 		if (strncmp(s+1,"tis",3) && strncmp(s+1,"Tis",3))
   936 		    /* hardcode a very common exception! */
   937 		    counters->open_single_quote++;
   938 	    }
   939 	    else
   940 	    {
   941 		/* now - is it a closequote? */
   942 		guessquote=0;   /* accumulate clues */
   943 		if (gcisalpha(s[-1]))
   944 		{
   945 		    /* it follows a letter - could be either */
   946 		    guessquote++;
   947 		    if (s[-1]=='s')
   948 		    {
   949 			/* looks like a plural apostrophe */
   950 			guessquote-=3;
   951 			if (s[1]==CHAR_SPACE)  /* bonus marks! */
   952 			    guessquote-=2;
   953 		    }
   954 		}
   955 		/* it doesn't have a letter either side */
   956 		else if (strchr(".?!,;:",s[-1]) && strchr(".?!,;: ",s[1]))
   957 		    guessquote+=8; /* looks like a closequote */
   958 		else
   959 		    guessquote++;
   960 		if (counters->open_single_quote>counters->close_single_quote)
   961 		    /*
   962 		     * Give it the benefit of some doubt,
   963 		     * if a squote is already open.
   964 		     */
   965 		    guessquote++;
   966 		else
   967 		    guessquote--;
   968 		if (guessquote>=0)
   969 		    counters->close_single_quote++;
   970 	    }
   971 	}
   972 	if (*s!=CHAR_SPACE && *s!='-' && *s!='.' && *s!=CHAR_ASTERISK &&
   973 	  *s!=13 && *s!=10)
   974 	    isemptyline=0;  /* ignore lines like  *  *  *  as spacers */
   975 	if (*s==CHAR_UNDERSCORE)
   976 	    counters->c_unders++;
   977 	if (*s==CHAR_OPEN_CBRACK)
   978 	    counters->c_brack++;
   979 	if (*s==CHAR_CLOSE_CBRACK)
   980 	    counters->c_brack--;
   981 	if (*s==CHAR_OPEN_RBRACK)
   982 	    counters->r_brack++;
   983 	if (*s==CHAR_CLOSE_RBRACK)
   984 	    counters->r_brack--;
   985 	if (*s==CHAR_OPEN_SBRACK)
   986 	    counters->s_brack++;
   987 	if (*s==CHAR_CLOSE_SBRACK)
   988 	    counters->s_brack--;
   989 	s++;
   990     }
   991     return isemptyline;
   992 }
   993 
   994 /*
   995  * check_for_odd_characters:
   996  *
   997  * Check for binary and other odd characters.
   998  */
   999 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
  1000   int isemptyline)
  1001 {
  1002     /* Don't repeat multiple warnings on one line. */
  1003     signed int eNon_A=0,eTab=0,eTilde=0,eCarat=0,eFSlash=0,eAst=0;
  1004     const char *s;
  1005     unsigned char c;
  1006     for (s=aline;*s;s++)
  1007     {
  1008 	c=*(unsigned char *)s;
  1009 	if (!eNon_A && (*s<CHAR_SPACE && *s!=9 && *s!='\n' || c>127))
  1010 	{
  1011 	    if (pswit[ECHO_SWITCH])
  1012 		printf("\n%s\n",aline);
  1013 	    if (!pswit[OVERVIEW_SWITCH])
  1014 		if (c>127 && c<160)
  1015 		    printf("    Line %ld column %d - "
  1016 		      "Non-ISO-8859 character %d\n",linecnt,(int)(s-aline)+1,c);
  1017 		else
  1018 		    printf("    Line %ld column %d - Non-ASCII character %d\n",
  1019 		      linecnt,(int)(s-aline)+1,c);
  1020 	    else
  1021 		cnt_bin++;
  1022 	    eNon_A=1;
  1023 	}
  1024 	if (!eTab && *s==CHAR_TAB)
  1025 	{
  1026 	    if (pswit[ECHO_SWITCH])
  1027 		printf("\n%s\n",aline);
  1028 	    if (!pswit[OVERVIEW_SWITCH])
  1029 		printf("    Line %ld column %d - Tab character?\n",
  1030 		  linecnt,(int)(s-aline)+1);
  1031 	    else
  1032 		cnt_odd++;
  1033 	    eTab=1;
  1034 	}
  1035 	if (!eTilde && *s==CHAR_TILDE)
  1036 	{
  1037 	    /*
  1038 	     * Often used by OCR software to indicate an
  1039 	     * unrecognizable character.
  1040 	     */
  1041 	    if (pswit[ECHO_SWITCH])
  1042 		printf("\n%s\n",aline);
  1043 	    if (!pswit[OVERVIEW_SWITCH])
  1044 		printf("    Line %ld column %d - Tilde character?\n",
  1045 		  linecnt,(int)(s-aline)+1);
  1046 	    else
  1047 		cnt_odd++;
  1048 	    eTilde=1;
  1049 	}
  1050 	if (!eCarat && *s==CHAR_CARAT)
  1051 	{  
  1052 	    if (pswit[ECHO_SWITCH])
  1053 		printf("\n%s\n",aline);
  1054 	    if (!pswit[OVERVIEW_SWITCH])
  1055 		printf("    Line %ld column %d - Carat character?\n",
  1056 		  linecnt,(int)(s-aline)+1);
  1057 	    else
  1058 		cnt_odd++;
  1059 	    eCarat=1;
  1060 	}
  1061 	if (!eFSlash && *s==CHAR_FORESLASH && warnings->fslash)
  1062 	{  
  1063 	    if (pswit[ECHO_SWITCH])
  1064 		printf("\n%s\n",aline);
  1065 	    if (!pswit[OVERVIEW_SWITCH])
  1066 		printf("    Line %ld column %d - Forward slash?\n",
  1067 		  linecnt,(int)(s-aline)+1);
  1068 	    else
  1069 		cnt_odd++;
  1070 	    eFSlash=1;
  1071 	}
  1072 	/*
  1073 	 * Report asterisks only in paranoid mode,
  1074 	 * since they're often deliberate.
  1075 	 */
  1076 	if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
  1077 	  *s==CHAR_ASTERISK)
  1078 	{
  1079 	    if (pswit[ECHO_SWITCH])
  1080 		printf("\n%s\n",aline);
  1081 	    if (!pswit[OVERVIEW_SWITCH])
  1082 		printf("    Line %ld column %d - Asterisk?\n",
  1083 		  linecnt,(int)(s-aline)+1);
  1084 	    else
  1085 		cnt_odd++;
  1086 	    eAst=1;
  1087 	}
  1088     }
  1089 }
  1090 
  1091 /*
  1092  * procfile:
  1093  *
  1094  * Process one file.
  1095  */
  1096 void procfile(char *filename)
  1097 {
  1098     char *s,*t,*s1,laststart,*wordstart;
  1099     char inword[MAXWORDLEN],testword[MAXWORDLEN];
  1100     char parastart[81];     /* first line of current para */
  1101     FILE *infile;
  1102     struct first_pass_results *first_pass_results;
  1103     struct warnings *warnings;
  1104     struct counters counters={0};
  1105     int isemptyline;
  1106     long squot,start_para_line;
  1107     signed int i,j,llen,isacro,isellipsis,istypo,alower;
  1108     unsigned int lastlen,lastblen;
  1109     signed int dquotepar,squotepar;
  1110     signed int isnewpara,vowel,consonant;
  1111     char dquote_err[80],squote_err[80],rbrack_err[80],sbrack_err[80],
  1112       cbrack_err[80],unders_err[80];
  1113     signed int qword_index,qperiod_index,isdup;
  1114     signed int enddash;
  1115     laststart=CHAR_SPACE;
  1116     lastlen=lastblen=0;
  1117     *dquote_err=*squote_err=*rbrack_err=*cbrack_err=*sbrack_err=
  1118       *unders_err=*prevline=0;
  1119     linecnt=checked_linecnt=start_para_line=0;
  1120     squot=0;
  1121     i=llen=isacro=isellipsis=istypo=0;
  1122     isnewpara=vowel=consonant=enddash=0;
  1123     qword_index=qperiod_index=isdup=0;
  1124     *inword=*testword=0;
  1125     dquotepar=squotepar=0;
  1126     for (j=0;j<MAX_QWORD;j++)
  1127     {
  1128         dupcnt[j]=0;
  1129         for (i=0;i<MAX_QWORD_LENGTH;i++)
  1130 	{
  1131             qword[i][j]=0;
  1132             qperiod[i][j]=0;
  1133 	}
  1134     }
  1135     infile=fopen(filename,"rb");
  1136     if (!infile)
  1137     {
  1138         if (pswit[STDOUT_SWITCH])
  1139             fprintf(stdout,"bookloupe: cannot open %s\n",filename);
  1140         else
  1141             fprintf(stderr,"bookloupe: cannot open %s\n",filename);
  1142 	exit(1);
  1143     }
  1144     fprintf(stdout,"\n\nFile: %s\n\n",filename);
  1145     first_pass_results=first_pass(infile);
  1146     warnings=report_first_pass(first_pass_results);
  1147     rewind(infile);
  1148     /*
  1149      * Here we go with the main pass. Hold onto yer hat!
  1150      * Re-init some variables we've dirtied.
  1151      */
  1152     squot=linecnt=0;
  1153     laststart=CHAR_SPACE;
  1154     lastlen=lastblen=0;
  1155     while (flgets(aline,LINEBUFSIZE-1,infile,linecnt+1))
  1156     {
  1157         linecnt++;
  1158         if (linecnt==1)
  1159 	    isnewpara=1;
  1160         if (pswit[DP_SWITCH] && !strncmp(aline,"-----File: ",11))
  1161 	    continue;    // skip DP page separators completely
  1162         if (linecnt<first_pass_results->firstline ||
  1163 	  (first_pass_results->footerline>0 &&
  1164 	  linecnt>first_pass_results->footerline))
  1165 	{
  1166             if (pswit[HEADER_SWITCH])
  1167 	    {
  1168                 if (!strncmp(aline,"Title:",6))
  1169                     printf("    %s\n",aline);
  1170                 if (!strncmp(aline,"Author:",7))
  1171                     printf("    %s\n",aline);
  1172                 if (!strncmp(aline,"Release Date:",13))
  1173                     printf("    %s\n",aline);
  1174                 if (!strncmp(aline,"Edition:",8))
  1175                     printf("    %s\n\n",aline);
  1176 	    }
  1177             continue;                /* skip through the header */
  1178 	}
  1179         checked_linecnt++;
  1180         s=aline;
  1181         /*
  1182 	 * If we are in a state of unbalanced quotes, and this line
  1183          * doesn't begin with a quote, output the stored error message.
  1184          * If the -P switch was used, print the warning even if the
  1185          * new para starts with quotes.
  1186 	 */
  1187         t=s;
  1188         while (*t==' ')
  1189 	    t++;
  1190         if (*dquote_err)
  1191             if (*t!=CHAR_DQUOTE || pswit[QPARA_SWITCH])
  1192 	    {
  1193                 if (!pswit[OVERVIEW_SWITCH])
  1194 		{
  1195                     if (pswit[ECHO_SWITCH])
  1196 			printf("\n%s\n",parastart);
  1197                     printf(dquote_err);
  1198 		}
  1199                 else
  1200                     cnt_dquot++;
  1201             }
  1202         if (*squote_err)
  1203 	{
  1204             if (*t!=CHAR_SQUOTE && *t!=CHAR_OPEN_SQUOTE ||
  1205 	      pswit[QPARA_SWITCH] || squot)
  1206 	    {
  1207                 if (!pswit[OVERVIEW_SWITCH])
  1208 		{
  1209                     if (pswit[ECHO_SWITCH])
  1210 			printf("\n%s\n",parastart);
  1211                     printf(squote_err);
  1212 		}
  1213                 else
  1214                     cnt_squot++;
  1215 	    }
  1216             squot=0;
  1217 	}
  1218         if (*rbrack_err)
  1219 	{
  1220             if (!pswit[OVERVIEW_SWITCH])
  1221 	    {
  1222                 if (pswit[ECHO_SWITCH])
  1223 		    printf("\n%s\n",parastart);
  1224                 printf(rbrack_err);
  1225 	    }
  1226             else
  1227                 cnt_brack++;
  1228 	}
  1229         if (*sbrack_err)
  1230 	{
  1231             if (!pswit[OVERVIEW_SWITCH])
  1232 	    {
  1233                 if (pswit[ECHO_SWITCH])
  1234 		    printf("\n%s\n",parastart);
  1235                 printf(sbrack_err);
  1236 	    }
  1237             else
  1238                 cnt_brack++;
  1239 	}
  1240         if (*cbrack_err)
  1241 	{
  1242             if (!pswit[OVERVIEW_SWITCH])
  1243 	    {
  1244                 if (pswit[ECHO_SWITCH])
  1245 		    printf("\n%s\n",parastart);
  1246                 printf(cbrack_err);
  1247 	    }
  1248             else
  1249                 cnt_brack++;
  1250 	}
  1251         if (*unders_err)
  1252 	{
  1253             if (!pswit[OVERVIEW_SWITCH])
  1254 	    {
  1255                 if (pswit[ECHO_SWITCH])
  1256 		    printf("\n%s\n",parastart);
  1257                 printf(unders_err);
  1258 	    }
  1259             else
  1260                 cnt_brack++;
  1261 	}
  1262         *dquote_err=*squote_err=*rbrack_err=*cbrack_err= 
  1263 	  *sbrack_err=*unders_err=0;
  1264 	isemptyline=analyse_quotes(aline,&counters);
  1265         if (isnewpara && !isemptyline)
  1266 	{
  1267 	    /* This line is the start of a new paragraph. */
  1268             start_para_line=linecnt;
  1269 	    /* Capture its first line in case we want to report it later. */
  1270             strncpy(parastart,aline,80);
  1271             parastart[79]=0;
  1272             dquotepar=squotepar=0; /* restart the quote count */
  1273             s=aline;
  1274             while (!gcisalpha(*s) && !gcisdigit(*s) && *s)
  1275 		s++;
  1276             if (*s>='a' && *s<='z')
  1277 	    {
  1278 		/* and its first letter is lowercase */
  1279                 if (pswit[ECHO_SWITCH])
  1280 		    printf("\n%s\n",aline);
  1281                 if (!pswit[OVERVIEW_SWITCH])
  1282                     printf("    Line %ld column %d - "
  1283 		      "Paragraph starts with lower-case\n",
  1284 		      linecnt,(int)(s-aline)+1);
  1285                 else
  1286                     cnt_punct++;
  1287 	    }
  1288             isnewpara=0; /* Signal the end of new para processing. */
  1289 	}
  1290         /* Check for an em-dash broken at line end. */
  1291         if (enddash && *aline=='-')
  1292 	{
  1293             if (pswit[ECHO_SWITCH])
  1294 		printf("\n%s\n",aline);
  1295             if (!pswit[OVERVIEW_SWITCH])
  1296                 printf("    Line %ld column 1 - Broken em-dash?\n",linecnt);
  1297             else
  1298                 cnt_punct++;
  1299 	}
  1300         enddash=0;
  1301         for (s=aline+strlen(aline)-1;*s==' ' && s>aline;s--)
  1302 	    ;
  1303         if (s>=aline && *s=='-')
  1304             enddash=1;
  1305 	/*
  1306          * Check for invalid or questionable characters in the line
  1307          * Anything above 127 is invalid for plain ASCII, and
  1308          * non-printable control characters should also be flagged.
  1309          * Tabs should generally not be there.
  1310 	 */
  1311         for (s=aline;*s;s++)
  1312 	{
  1313             i=(unsigned char)*s;
  1314             if (i<CHAR_SPACE && i!=CHAR_LF && i!=CHAR_CR && i!=CHAR_TAB)
  1315 	    {
  1316                 if (pswit[ECHO_SWITCH])
  1317 		    printf("\n%s\n",aline);
  1318                 if (!pswit[OVERVIEW_SWITCH])
  1319                     printf("    Line %ld column %d - Control character %d\n",
  1320 		      linecnt,(int)(s-aline)+1,i);
  1321                 else
  1322                     cnt_bin++;
  1323 	    }
  1324 	}
  1325         if (warnings->bin)
  1326 	    check_for_odd_characters(aline,warnings,isemptyline);
  1327         /* Check for line too long. */
  1328         if (warnings->longline)
  1329 	{
  1330             if (strlen(aline)>LONGEST_PG_LINE)
  1331 	    {
  1332                 if (pswit[ECHO_SWITCH])
  1333 		    printf("\n%s\n",aline);
  1334                 if (!pswit[OVERVIEW_SWITCH])
  1335                     printf("    Line %ld column %d - Long line %d\n",
  1336 		      linecnt,strlen(aline),strlen(aline));
  1337                 else
  1338                     cnt_long++;
  1339 	    }
  1340 	}
  1341         /*
  1342 	 * Check for line too short.
  1343          * This one is a bit trickier to implement: we don't want to
  1344          * flag the last line of a paragraph for being short, so we
  1345          * have to wait until we know that our current line is a
  1346          * "normal" line, then report the _previous_ line if it was too
  1347          * short. We also don't want to report indented lines like
  1348          * chapter heads or formatted quotations. We therefore keep
  1349          * lastlen as the length of the last line examined, and
  1350          * lastblen as the length of the last but one, and try to
  1351          * suppress unnecessary warnings by checking that both were of
  1352          * "normal" length. We keep the first character of the last
  1353          * line in laststart, and if it was a space, we assume that the
  1354          * formatting is deliberate. I can't figure out a way to
  1355          * distinguish something like a quoted verse left-aligned or
  1356          * the header or footer of a letter from a paragraph of short
  1357          * lines - maybe if I examined the whole paragraph, and if the
  1358          * para has less than, say, 8 lines and if all lines are short,
  1359          * then just assume it's OK? Need to look at some texts to see
  1360          * how often a formula like this would get the right result.
  1361 	 */
  1362         if (warnings->shortline && strlen(aline)>1 && lastlen>1 &&
  1363 	  lastlen<SHORTEST_PG_LINE && lastblen>1 && lastblen>SHORTEST_PG_LINE &&
  1364 	  laststart!=CHAR_SPACE)
  1365 	{
  1366 	    if (pswit[ECHO_SWITCH])
  1367 		printf("\n%s\n",prevline);
  1368 	    if (!pswit[OVERVIEW_SWITCH])
  1369 		printf("    Line %ld column %d - Short line %d?\n",
  1370 		  linecnt-1,strlen(prevline),strlen(prevline));
  1371 	    else
  1372 		cnt_short++;
  1373 	}
  1374         lastblen=lastlen;
  1375         lastlen=strlen(aline);
  1376         laststart=aline[0];
  1377         /* Look for punctuation other than full ellipses at start of line. */
  1378         if (*aline && strchr(".?!,;:",aline[0]) && strncmp(". . .",aline,5))
  1379 	{
  1380 	    if (pswit[ECHO_SWITCH])
  1381 		printf("\n%s\n",aline);
  1382 	    if (!pswit[OVERVIEW_SWITCH])
  1383 		printf("    Line %ld column 1 - Begins with punctuation?\n",
  1384 		  linecnt);
  1385 	    else
  1386 		cnt_punct++;
  1387 	}
  1388         /*
  1389 	 * Check for spaced em-dashes.
  1390          * We must check _all_ occurrences of "--" on the line
  1391          * hence the loop - even if the first double-dash is OK
  1392          * there may be another that's wrong later on.
  1393 	 */
  1394         if (warnings->dash)
  1395 	{
  1396             s=aline;
  1397             while (strstr(s,"--"))
  1398 	    {
  1399                 if (*(strstr(s,"--")-1)==CHAR_SPACE ||
  1400                    (*(strstr(s,"--")+2)==CHAR_SPACE))
  1401 		{
  1402                     if (pswit[ECHO_SWITCH])
  1403 			printf("\n%s\n",aline);
  1404                     if (!pswit[OVERVIEW_SWITCH])
  1405                         printf("    Line %ld column %d - Spaced em-dash?\n",
  1406 			  linecnt,(int)(strstr(s,"--")-aline)+1);
  1407                     else
  1408                         cnt_dash++;
  1409 		}
  1410                 s=strstr(s,"--")+2;
  1411 	    }
  1412 	}
  1413         /* Check for spaced dashes. */
  1414         if (warnings->dash)
  1415 	{
  1416             if (strstr(aline," -"))
  1417 	    {
  1418                 if (*(strstr(aline," -")+2)!='-')
  1419 		{
  1420                     if (pswit[ECHO_SWITCH])
  1421 			printf("\n%s\n",aline);
  1422                     if (!pswit[OVERVIEW_SWITCH])
  1423                         printf("    Line %ld column %d - Spaced dash?\n",
  1424 			  linecnt,(int)(strstr(aline," -")-aline)+1);
  1425                     else
  1426                         cnt_dash++;
  1427 		}
  1428 	    }
  1429             else if (strstr(aline,"- "))
  1430 	    {
  1431 		if (*(strstr(aline,"- ")-1)!='-')
  1432 		{
  1433 		    if (pswit[ECHO_SWITCH])
  1434 			printf("\n%s\n",aline);
  1435 		    if (!pswit[OVERVIEW_SWITCH])
  1436 			printf("    Line %ld column %d - Spaced dash?\n",
  1437 			  linecnt,(int)(strstr(aline,"- ")-aline)+1);
  1438 		    else
  1439 			cnt_dash++;
  1440 		}
  1441 	    }
  1442 	}
  1443         /*
  1444 	 * Check for unmarked paragraphs indicated by separate speakers.
  1445          * May well be false positive:
  1446          * "Bravo!" "Wonderful!" called the crowd.
  1447          * but useful all the same.
  1448 	 */
  1449         s=wrk;
  1450         *s=0;
  1451         if (strstr(aline,"\" \""))
  1452 	    s=strstr(aline,"\" \"");
  1453         if (strstr(aline,"\"  \""))
  1454 	    s=strstr(aline,"\"  \"");
  1455         if (*s)
  1456 	{
  1457             if (pswit[ECHO_SWITCH])
  1458 		printf("\n%s\n",aline);
  1459             if (!pswit[OVERVIEW_SWITCH])
  1460                 printf("    Line %ld column %d - "
  1461 		  "Query missing paragraph break?\n",
  1462 		  linecnt,(int)(s-aline)+1);
  1463             else
  1464                 cnt_punct++;
  1465 	}
  1466         /*
  1467 	 * Check for "to he" and other easy he/be errors.
  1468          * This is a very inadequate effort on the he/be problem,
  1469          * but the phrase "to he" is usually an error, whereas "to
  1470          * be" is quite common.
  1471          * Similarly, '"Quiet!", be said.' is a non-be error
  1472          * "to he" is _not_ always an error!:
  1473          *       "Where they went to he couldn't say."
  1474          * Another false positive:
  1475          *       What would "Cinderella" be without the . . .
  1476          * and another: "If he wants to he can see for himself."
  1477 	 */
  1478         s=wrk;
  1479         *s=0;
  1480         if (strstr(aline," to he "))
  1481 	    s=strstr(aline," to he ");
  1482         if (strstr(aline,"\" be "))
  1483 	    s=strstr(aline,"\" be ");
  1484         if (strstr(aline,"\", be "))
  1485 	    s=strstr(aline,"\", be ");
  1486         if (strstr(aline," is be "))
  1487 	    s=strstr(aline," is be ");
  1488         if (strstr(aline," be is "))
  1489 	    s=strstr(aline," be is ");
  1490         if (strstr(aline," was be "))
  1491 	    s=strstr(aline," was be ");
  1492         if (strstr(aline," be would "))
  1493 	    s=strstr(aline," be would ");
  1494         if (strstr(aline," be could "))
  1495 	    s=strstr(aline," be could ");
  1496         if (*s)
  1497 	{
  1498             if (pswit[ECHO_SWITCH])
  1499 		printf("\n%s\n",aline);
  1500             if (!pswit[OVERVIEW_SWITCH])
  1501                 printf("    Line %ld column %d - Query he/be error?\n",
  1502 		  linecnt,(int)(s-aline)+1);
  1503             else
  1504                 cnt_word++;
  1505 	}
  1506         s=wrk;
  1507         *s=0;
  1508         if (strstr(aline," i bad "))
  1509 	    s=strstr(aline," i bad ");
  1510         if (strstr(aline," you bad "))
  1511 	    s=strstr(aline," you bad ");
  1512         if (strstr(aline," he bad "))
  1513 	    s=strstr(aline," he bad ");
  1514         if (strstr(aline," she bad "))
  1515 	    s=strstr(aline," she bad ");
  1516         if (strstr(aline," they bad "))
  1517 	    s=strstr(aline," they bad ");
  1518         if (strstr(aline," a had "))
  1519 	    s=strstr(aline," a had ");
  1520         if (strstr(aline," the had "))
  1521 	    s=strstr(aline," the had ");
  1522         if (*s)
  1523 	{
  1524             if (pswit[ECHO_SWITCH])
  1525 		printf("\n%s\n",aline);
  1526             if (!pswit[OVERVIEW_SWITCH])
  1527                 printf("    Line %ld column %d - Query had/bad error?\n",
  1528 		  linecnt,(int)(s-aline)+1);
  1529             else
  1530                 cnt_word++;
  1531 	}
  1532         s=wrk;
  1533         *s=0;
  1534         if (strstr(aline,", hut "))
  1535 	    s=strstr(aline,", hut ");
  1536         if (strstr(aline,"; hut "))
  1537 	    s=strstr(aline,"; hut ");
  1538         if (*s)
  1539 	{
  1540             if (pswit[ECHO_SWITCH])
  1541 		printf("\n%s\n",aline);
  1542             if (!pswit[OVERVIEW_SWITCH])
  1543                 printf("    Line %ld column %d - Query hut/but error?\n",
  1544 		  linecnt,(int)(s-aline)+1);
  1545             else
  1546                 cnt_word++;
  1547 	}
  1548         /*
  1549 	 * Special case - angled bracket in front of "From" placed there by an
  1550 	 * MTA when sending an e-mail.
  1551 	 */
  1552         if (strstr(aline,">From"))
  1553 	{
  1554             if (pswit[ECHO_SWITCH])
  1555 		printf("\n%s\n",aline);
  1556             if (!pswit[OVERVIEW_SWITCH])
  1557                 printf("    Line %ld column %d - "
  1558 		  "Query angled bracket with From\n",
  1559 		  linecnt,(int)(strstr(aline,">From")-aline)+1);
  1560             else
  1561                 cnt_punct++;
  1562 	}
  1563         /*
  1564 	 * Check for a single character line -
  1565 	 * often an overflow from bad wrapping.
  1566 	 */
  1567         if (*aline && !aline[1])
  1568 	{
  1569             if (*aline=='I' || *aline=='V' || *aline=='X' || *aline=='L' ||
  1570 	      gcisdigit(*aline))
  1571                 ; /* Nothing - ignore numerals alone on a line. */
  1572             else
  1573 	    {
  1574                 if (pswit[ECHO_SWITCH])
  1575 		    printf("\n%s\n",aline);
  1576                 if (!pswit[OVERVIEW_SWITCH])
  1577                     printf("    Line %ld column 1 - "
  1578 		      "Query single character line\n",linecnt);
  1579                 else
  1580                     cnt_punct++;
  1581 	    }
  1582 	}
  1583         /* Check for I" - often should be ! */
  1584         if (strstr(aline," I\""))
  1585 	{
  1586             if (pswit[ECHO_SWITCH])
  1587 		printf("\n%s\n",aline);
  1588             if (!pswit[OVERVIEW_SWITCH])
  1589                 printf("    Line %ld column %ld - Query I=exclamation mark?\n",
  1590 		  linecnt,strstr(aline," I\"")-aline);
  1591             else
  1592                 cnt_punct++;
  1593 	}
  1594         /*
  1595 	 * Check for period without a capital letter. Cut-down from gutspell.
  1596          * Only works when it happens on a single line.
  1597 	 */
  1598         if (pswit[PARANOID_SWITCH])
  1599 	{
  1600             for (t=s=aline;strstr(t,". ");)
  1601 	    {
  1602                 t=strstr(t,". ");
  1603                 if (t==s)
  1604 		{
  1605                     t++;
  1606 		    /* start of line punctuation is handled elsewhere */
  1607                     continue;
  1608 		}
  1609                 if (!gcisalpha(t[-1]))
  1610 		{
  1611                     t++;
  1612                     continue;
  1613 		}
  1614                 if (warnings->isDutch)
  1615 		{
  1616 		    /* For Frank & Jeroen -- 's Middags case */
  1617                     if (t[2]==CHAR_SQUOTE && t[3]>='a' && t[3]<='z' &&
  1618 		      t[4]==CHAR_SPACE && t[5]>='A' && t[5]<='Z')
  1619 		    {
  1620                         t++;
  1621                         continue;
  1622 		    }
  1623 		}
  1624                 s1=t+2;
  1625                 while (*s1 && !gcisalpha(*s1) && !isdigit(*s1))
  1626                     s1++;
  1627                 if (*s1>='a' && *s1<='z')
  1628 		{
  1629 		    /* we have something to investigate */
  1630                     istypo=1;
  1631 		    /* so let's go back and find out */
  1632                     for (s1=t-1;s1>=s &&
  1633 		      (gcisalpha(*s1) || gcisdigit(*s1) || *s1==CHAR_SQUOTE &&
  1634 		      gcisalpha(s1[1]) && gcisalpha(s1[-1]));s1--)
  1635 			;
  1636                     s1++;
  1637                     for (i=0;*s1 && *s1!='.';s1++,i++)
  1638                         testword[i]=*s1;
  1639                     testword[i]=0;
  1640                     for (i=0;*abbrev[i];i++)
  1641                         if (!strcmp(testword,abbrev[i]))
  1642                             istypo=0;
  1643                     if (gcisdigit(*testword))
  1644 			istypo=0;
  1645                     if (!testword[1])
  1646 			istypo=0;
  1647                     if (isroman(testword))
  1648 			istypo=0;
  1649                     if (istypo)
  1650 		    {
  1651                         istypo=0;
  1652                         for (i=0;testword[i];i++)
  1653                             if (strchr(vowels,testword[i]))
  1654                                 istypo=1;
  1655 		    }
  1656                     if (istypo)
  1657 		    {
  1658                         isdup=0;
  1659                         if (strlen(testword)<MAX_QWORD_LENGTH &&
  1660 			  !pswit[VERBOSE_SWITCH])
  1661                             for (i=0;i<qperiod_index;i++)
  1662                                 if (!strcmp(testword,qperiod[i]))
  1663                                     isdup=1;
  1664                         if (!isdup)
  1665 			{
  1666                             if (qperiod_index<MAX_QWORD &&
  1667 			      strlen(testword)<MAX_QWORD_LENGTH)
  1668 			    {
  1669                                 strcpy(qperiod[qperiod_index],testword);
  1670                                 qperiod_index++;
  1671 			    }
  1672                             if (pswit[ECHO_SWITCH])
  1673 				printf("\n%s\n",aline);
  1674                             if (!pswit[OVERVIEW_SWITCH])
  1675                                 printf("    Line %ld column %d - "
  1676 				  "Extra period?\n",linecnt,(int)(t-aline)+1);
  1677                             else
  1678                                 cnt_punct++;
  1679 			}
  1680 		    }
  1681 		}
  1682 	    t++;
  1683 	    }
  1684 	}
  1685         if (pswit[TYPO_SWITCH])
  1686 	{
  1687             /* Check for words usually not followed by punctuation. */
  1688             for (s=aline;*s;)
  1689 	    {
  1690                 wordstart=s;
  1691                 s=getaword(s,inword);
  1692                 if (!*inword)
  1693 		    continue;
  1694                 lowerit(inword);
  1695                 for (i=0;*nocomma[i];i++)
  1696                     if (!strcmp(inword,nocomma[i]))
  1697 		    {
  1698                         if (*s==',' || *s==';' || *s==':')
  1699 			{
  1700                             if (pswit[ECHO_SWITCH])
  1701 				printf("\n%s\n",aline);
  1702                             if (!pswit[OVERVIEW_SWITCH])
  1703                                 printf("    Line %ld column %d - "
  1704 				  "Query punctuation after %s?\n",
  1705 				  linecnt,(int)(s-aline)+1,inword);
  1706                             else
  1707                                 cnt_punct++;
  1708 			}
  1709 		    }
  1710 		for (i=0;*noperiod[i];i++)
  1711                     if (!strcmp(inword,noperiod[i]))
  1712 		    {
  1713                         if (*s=='.' || *s=='!')
  1714 			{
  1715                             if (pswit[ECHO_SWITCH])
  1716 				printf("\n%s\n",aline);
  1717                             if (!pswit[OVERVIEW_SWITCH])
  1718                                 printf("    Line %ld column %d - "
  1719 				  "Query punctuation after %s?\n",
  1720 				  linecnt,(int)(s-aline)+1,inword);
  1721                             else
  1722                                 cnt_punct++;
  1723 			}
  1724 		    }
  1725 	    }
  1726 	}
  1727         /*
  1728 	 * Check for commonly mistyped words,
  1729 	 * and digits like 0 for O in a word.
  1730 	 */
  1731         for (s=aline;*s;)
  1732 	{
  1733             wordstart=s;
  1734             s=getaword(s,inword);
  1735             if (!*inword)
  1736 		continue; /* don't bother with empty lines */
  1737             if (mixdigit(inword))
  1738 	    {
  1739                 if (pswit[ECHO_SWITCH])
  1740 		    printf("\n%s\n",aline);
  1741                 if (!pswit[OVERVIEW_SWITCH])
  1742                     printf("    Line %ld column %d - Query digit in %s\n",
  1743 		      linecnt,(int)(wordstart-aline)+1,inword);
  1744                 else
  1745                     cnt_word++;
  1746 	    }
  1747             /*
  1748 	     * Put the word through a series of tests for likely typos and OCR
  1749 	     * errors.
  1750 	     */
  1751             if (pswit[TYPO_SWITCH])
  1752 	    {
  1753                 istypo=0;
  1754                 strcpy(testword,inword);
  1755                 alower=0;
  1756                 for (i=0;i<(signed int)strlen(testword);i++)
  1757 		{
  1758 		    /* lowercase for testing */
  1759                     if (testword[i]>='a' && testword[i]<='z')
  1760 			alower=1;
  1761                     if (alower && testword[i]>='A' && testword[i]<='Z')
  1762 		    {
  1763                         /*
  1764 			 * We have an uppercase mid-word. However, there are
  1765 			 * common cases:
  1766                          *   Mac and Mc like McGill
  1767                          *   French contractions like l'Abbe
  1768 			 */
  1769                         if (i==2 && testword[0]=='m' && testword[1]=='c' ||
  1770                           i==3 && testword[0]=='m' && testword[1]=='a' &&
  1771 			  testword[2]=='c' || i>0 && testword[i-1]==CHAR_SQUOTE)
  1772 			    ; /* do nothing! */
  1773                         else
  1774                             istypo=1;
  1775 		    }
  1776                     testword[i]=(char)tolower(testword[i]);
  1777 		}
  1778                 /*
  1779 		 * Check for certain unlikely two-letter combinations at word
  1780 		 * start and end.
  1781 		 */
  1782                 if (strlen(testword)>1)
  1783 		{
  1784                     for (i=0;*nostart[i];i++)
  1785                         if (!strncmp(testword,nostart[i],2))
  1786                             istypo=1;
  1787                     for (i=0;*noend[i];i++)
  1788                         if (!strncmp(testword+strlen(testword)-2,noend[i],2))
  1789                             istypo=1;
  1790 		}
  1791                 /* ght is common, gbt never. Like that. */
  1792                 if (strstr(testword,"cb"))
  1793 		    istypo=1;
  1794                 if (strstr(testword,"gbt"))
  1795 		    istypo=1;
  1796                 if (strstr(testword,"pbt"))
  1797 		    istypo=1;
  1798                 if (strstr(testword,"tbs"))
  1799 		    istypo=1;
  1800                 if (strstr(testword,"mrn"))
  1801 		    istypo=1;
  1802                 if (strstr(testword,"ahle"))
  1803 		    istypo=1;
  1804                 if (strstr(testword,"ihle"))
  1805 		    istypo=1;
  1806                 /*
  1807 		 * "TBE" does happen - like HEARTBEAT - but uncommon.
  1808                  * Also "TBI" - frostbite, outbid - but uncommon.
  1809                  * Similarly "ii" like Hawaii, or Pompeii, and in Roman
  1810 		 * numerals, but "ii" is a common scanno.
  1811 		 */
  1812                 if (strstr(testword,"tbi"))
  1813 		    istypo=1;
  1814                 if (strstr(testword,"tbe"))
  1815 		    istypo=1;
  1816                 if (strstr(testword,"ii"))
  1817 		    istypo=1;
  1818                 /*
  1819 		 * Check for no vowels or no consonants.
  1820                  * If none, flag a typo.
  1821 		 */
  1822                 if (!istypo && strlen(testword)>1)
  1823 		{
  1824                     vowel=consonant=0;
  1825                     for (i=0;testword[i];i++)
  1826 		    {
  1827                         if (testword[i]=='y' || gcisdigit(testword[i]))
  1828 			{
  1829 			    /* Yah, this is loose. */
  1830                             vowel++;
  1831                             consonant++;
  1832 			}
  1833                         else if (strchr(vowels,testword[i]))
  1834 			    vowel++;
  1835 			else
  1836 			    consonant++;
  1837 		    }
  1838                     if (!vowel || !consonant)
  1839                         istypo=1;
  1840 		}
  1841                 /*
  1842 		 * Now exclude the word from being reported if it's in
  1843                  * the okword list.
  1844 		 */
  1845                 for (i=0;*okword[i];i++)
  1846                     if (!strcmp(testword,okword[i]))
  1847                         istypo=0;
  1848                 /*
  1849 		 * What looks like a typo may be a Roman numeral.
  1850 		 * Exclude these.
  1851 		 */
  1852                 if (istypo && isroman(testword))
  1853 		    istypo=0;
  1854                 /* Check the manual list of typos. */
  1855                 if (!istypo)
  1856                     for (i=0;*typo[i];i++)
  1857                         if (!strcmp(testword,typo[i]))
  1858                             istypo=1;
  1859                 /*
  1860 		 * Check lowercase s, l, i and m - special cases.
  1861                  *   "j" - often a semi-colon gone wrong.
  1862                  *   "d" for a missing apostrophe - he d
  1863                  *   "n" for "in"
  1864 		 */
  1865                 if (!istypo && strlen(testword)==1 && strchr("slmijdn",*inword))
  1866 		    istypo=1;
  1867                 if (istypo)
  1868 		{
  1869                     isdup=0;
  1870                     if (strlen(testword)<MAX_QWORD_LENGTH &&
  1871 		      !pswit[VERBOSE_SWITCH])
  1872                         for (i=0;i<qword_index;i++)
  1873                             if (!strcmp(testword,qword[i]))
  1874 			    {
  1875                                 isdup=1;
  1876                                 ++dupcnt[i];
  1877 			    }
  1878                     if (!isdup)
  1879 		    {
  1880                         if (qword_index<MAX_QWORD &&
  1881 			  strlen(testword)<MAX_QWORD_LENGTH)
  1882 			{
  1883                             strcpy(qword[qword_index],testword);
  1884                             qword_index++;
  1885 			}
  1886                         if (pswit[ECHO_SWITCH])
  1887 			    printf("\n%s\n",aline);
  1888                         if (!pswit[OVERVIEW_SWITCH])
  1889 			{
  1890                             printf("    Line %ld column %d - Query word %s",
  1891 			      linecnt,(int)(wordstart-aline)+1,inword);
  1892                             if (strlen(testword)<MAX_QWORD_LENGTH &&
  1893 			      !pswit[VERBOSE_SWITCH])
  1894                                 printf(" - not reporting duplicates");
  1895                             printf("\n");
  1896 			}
  1897                         else
  1898                             cnt_word++;
  1899 		    }
  1900 		}
  1901 	    }
  1902 	    /* check the user's list of typos */
  1903 	    if (!istypo && usertypo_count)
  1904 		for (i=0;i<usertypo_count;i++)
  1905 		    if (!strcmp(testword,usertypo[i]))
  1906 		    {
  1907 			if (pswit[ECHO_SWITCH])
  1908 			    printf("\n%s\n",aline);
  1909 			if (!pswit[OVERVIEW_SWITCH])  
  1910 			    printf("    Line %ld column %d - "
  1911 			      "Query possible scanno %s\n",
  1912 			      linecnt,(int)(wordstart-aline)+2,inword);
  1913 		    }
  1914             if (pswit[PARANOID_SWITCH] && warnings->digit)
  1915 	    {
  1916 		/* In paranoid mode, query all 0 and 1 standing alone. */
  1917                 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
  1918 		{
  1919                     if (pswit[ECHO_SWITCH])
  1920 			printf("\n%s\n",aline);
  1921                     if (!pswit[OVERVIEW_SWITCH])
  1922                         printf("    Line %ld column %d - Query standalone %s\n",
  1923 			  linecnt,(int)(wordstart-aline)+2,inword);
  1924                     else
  1925                         cnt_word++;
  1926 		}
  1927 	    }
  1928 	}
  1929 	/*
  1930          * Look for added or missing spaces around punctuation and quotes.
  1931          * If there is a punctuation character like ! with no space on
  1932          * either side, suspect a missing!space. If there are spaces on
  1933          * both sides , assume a typo. If we see a double quote with no
  1934          * space or punctuation on either side of it, assume unspaced
  1935          * quotes "like"this.
  1936 	 */
  1937         llen=strlen(aline);
  1938         for (i=1;i<llen;i++)
  1939 	{
  1940 	    /* For each character in the line after the first. */
  1941             if (strchr(".?!,;:_",aline[i]))  /* if it's punctuation */
  1942 	    {
  1943 		/* we need to suppress warnings for acronyms like M.D. */
  1944                 isacro=0;
  1945 		/* we need to suppress warnings for ellipsis . . . */
  1946                 isellipsis=0;
  1947 		/* if there are letters on both sides of it or ... */
  1948                 if (gcisalpha(aline[i-1]) && gcisalpha(aline[i+1]) ||
  1949                    gcisalpha(aline[i+1]) && strchr("?!,;:",aline[i]))
  1950 		{
  1951 		    /* ...if it's strict punctuation followed by an alpha */
  1952                     if (aline[i]=='.')
  1953 		    {
  1954                         if (i>2 && aline[i-2]=='.')
  1955 			    isacro=1;
  1956                         if (i+2<llen && aline[i+2]=='.')
  1957 			    isacro=1;
  1958 		    }
  1959                     if (!isacro)
  1960 		    {
  1961                         if (pswit[ECHO_SWITCH])
  1962 			    printf("\n%s\n",aline);
  1963                         if (!pswit[OVERVIEW_SWITCH])
  1964                             printf("    Line %ld column %d - Missing space?\n",
  1965 			      linecnt,i+1);
  1966                         else
  1967                             cnt_punct++;
  1968 		    }
  1969 		}
  1970                 if (aline[i-1]==CHAR_SPACE &&
  1971 		  (aline[i+1]==CHAR_SPACE || aline[i+1]==0))
  1972 		{
  1973 		    /*
  1974 		     * If there are spaces on both sides,
  1975 		     * or space before and end of line.
  1976 		     */
  1977                     if (aline[i]=='.')
  1978 		    {
  1979                         if (i>2 && aline[i-2]=='.')
  1980 			    isellipsis=1;
  1981                         if (i+2<llen && aline[i+2]=='.')
  1982 			    isellipsis=1;
  1983 		    }
  1984                     if (!isemptyline && !isellipsis)
  1985 		    {
  1986                         if (pswit[ECHO_SWITCH])
  1987 			    printf("\n%s\n",aline);
  1988                         if (!pswit[OVERVIEW_SWITCH])
  1989                             printf("    Line %ld column %d - "
  1990 			      "Spaced punctuation?\n",linecnt,i+1);
  1991                         else
  1992                             cnt_punct++;
  1993 		    }
  1994 		}
  1995 	    }
  1996 	}
  1997         /* Split out the characters that CANNOT be preceded by space. */
  1998         llen=strlen(aline);
  1999         for (i=1;i<llen;i++)
  2000 	{
  2001 	    /* for each character in the line after the first */
  2002             if (strchr("?!,;:",aline[i]))
  2003 	    {
  2004 		/* if it's punctuation that _cannot_ have a space before it */
  2005                 if (aline[i-1]==CHAR_SPACE && !isemptyline &&
  2006 		  aline[i+1]!=CHAR_SPACE)
  2007 		{
  2008 		    /*
  2009 		     * If aline[i+1] DOES == space,
  2010 		     * it was already reported just above.
  2011 		     */
  2012                     if (pswit[ECHO_SWITCH])
  2013 			printf("\n%s\n",aline);
  2014                     if (!pswit[OVERVIEW_SWITCH])
  2015                         printf("    Line %ld column %d - Spaced punctuation?\n",
  2016 			  linecnt,i+1);
  2017                     else
  2018                         cnt_punct++;
  2019 		}
  2020 	    }
  2021 	}
  2022         /*
  2023 	 * Special case " .X" where X is any alpha.
  2024          * This plugs a hole in the acronym code above.
  2025 	 * Inelegant, but maintainable.
  2026 	 */
  2027         llen=strlen(aline);
  2028         for (i=1;i<llen;i++)
  2029 	{
  2030 	    /* for each character in the line after the first */
  2031             if (aline[i]=='.')
  2032 	    {
  2033 		/* if it's a period */
  2034                 if (aline[i-1]==CHAR_SPACE && gcisalpha(aline[i+1]))
  2035 		{
  2036 		    /*
  2037 		     * If the period follows a space and
  2038 		     * is followed by a letter.
  2039 		     */
  2040                     if (pswit[ECHO_SWITCH])
  2041 			printf("\n%s\n",aline);
  2042                     if (!pswit[OVERVIEW_SWITCH])
  2043                         printf("    Line %ld column %d - Spaced punctuation?\n",
  2044 			  linecnt,i+1);
  2045                     else
  2046                         cnt_punct++;
  2047 		}
  2048 	    }
  2049 	}
  2050         for (i=1;i<llen;i++)
  2051 	{
  2052 	    /* for each character in the line after the first */
  2053             if (aline[i]==CHAR_DQUOTE)
  2054 	    {
  2055                 if (!strchr(" _-.'`,;:!/([{?}])",aline[i-1]) &&
  2056 		  !strchr(" _-.'`,;:!/([{?}])",aline[i+1]) && aline[i+1] ||
  2057 		  !strchr(" _-([{'`",aline[i-1]) && gcisalpha(aline[i+1]))
  2058 		{
  2059 		    if (pswit[ECHO_SWITCH])
  2060 			printf("\n%s\n",aline);
  2061 		    if (!pswit[OVERVIEW_SWITCH])
  2062 			printf("    Line %ld column %d - Unspaced quotes?\n",
  2063 			  linecnt,i+1);
  2064 		    else
  2065 			cnt_punct++;
  2066 		}
  2067 	    }
  2068 	}
  2069         /* Check parity of quotes. */
  2070         for (s=aline;*s;s++)
  2071 	{
  2072             if (*s==CHAR_DQUOTE)
  2073 	    {
  2074                 if (!(dquotepar=!dquotepar))
  2075 		{
  2076 		    /* parity even */
  2077                     if (!strchr("_-.'`/,;:!?)]} ",s[1]))
  2078 		    {
  2079                         if (pswit[ECHO_SWITCH])
  2080 			    printf("\n%s\n",aline);
  2081                         if (!pswit[OVERVIEW_SWITCH])
  2082                             printf("    Line %ld column %d - "
  2083 			      "Wrongspaced quotes?\n",linecnt,(int)(s-aline)+1);
  2084                         else
  2085                             cnt_punct++;
  2086 		    }
  2087 		}
  2088                 else
  2089 		{
  2090 		    /* parity odd */
  2091                     if (!gcisalpha(s[1]) && !isdigit(s[1]) &&
  2092 		      !strchr("_-/.'`([{$",s[1]) || !s[1])
  2093 		    {
  2094                         if (pswit[ECHO_SWITCH])
  2095 			    printf("\n%s\n",aline);
  2096                         if (!pswit[OVERVIEW_SWITCH])
  2097                             printf("    Line %ld column %d - "
  2098 			      "Wrongspaced quotes?\n",linecnt,(int)(s-aline)+1);
  2099                         else
  2100                             cnt_punct++;
  2101 		    }
  2102 		}
  2103 	    }
  2104 	}
  2105 	if (*aline==CHAR_DQUOTE)
  2106 	{
  2107 	    if (strchr(",;:!?)]} ",aline[1]))
  2108 	    {
  2109 		if (pswit[ECHO_SWITCH])
  2110 		    printf("\n%s\n",aline);
  2111 		if (!pswit[OVERVIEW_SWITCH])
  2112 		    printf("    Line %ld column 1 - Wrongspaced quotes?\n",
  2113 		      linecnt);
  2114 		else
  2115 		    cnt_punct++;
  2116 	    }
  2117 	}
  2118         if (pswit[SQUOTE_SWITCH])
  2119 	{
  2120             for (s=aline;*s;s++)
  2121 	    {
  2122                 if ((*s==CHAR_SQUOTE || *s==CHAR_OPEN_SQUOTE) &&
  2123 		  (s==aline || s>aline && !gcisalpha(s[-1]) ||
  2124 		  !gcisalpha(s[1])))
  2125 		{
  2126                     if (!(squotepar=!squotepar))
  2127 		    {
  2128 			/* parity even */
  2129                         if (!strchr("_-.'`/\",;:!?)]} ",s[1]))
  2130 			{
  2131                             if (pswit[ECHO_SWITCH])
  2132 				printf("\n%s\n",aline);
  2133                             if (!pswit[OVERVIEW_SWITCH])
  2134                                 printf("    Line %ld column %d - "
  2135 				  "Wrongspaced singlequotes?\n",
  2136 				  linecnt,(int)(s-aline)+1);
  2137                             else
  2138                                 cnt_punct++;
  2139 			}
  2140 		    }
  2141                     else
  2142 		    {
  2143 			/* parity odd */
  2144                         if (!gcisalpha(s[1]) && !isdigit(s[1]) &&
  2145 			  !strchr("_-/\".'`",s[1]) || !s[1])
  2146 			{
  2147                             if (pswit[ECHO_SWITCH])
  2148 				printf("\n%s\n",aline);
  2149                             if (!pswit[OVERVIEW_SWITCH])
  2150                                 printf("    Line %ld column %d - "
  2151 				  "Wrongspaced singlequotes?\n",
  2152 				  linecnt,(int)(s-aline)+1);
  2153                             else
  2154                                 cnt_punct++;
  2155 			}
  2156 		    }
  2157 		}
  2158 	    }
  2159 	}
  2160         /*
  2161 	 * Look for double punctuation like ,. or ,,
  2162          * Thanks to DW for the suggestion!
  2163          * In books with references, ".," and ".;" are common
  2164          * e.g. "etc., etc.," and vol. 1.; vol 3.;
  2165          * OTOH, from my initial tests, there are also fairly
  2166          * common errors. What to do? Make these cases paranoid?
  2167          * ".," is the most common, so warnings->dotcomma is used
  2168          * to suppress detailed reporting if it occurs often.
  2169 	 */
  2170         llen=strlen(aline);
  2171         for (i=0;i<llen;i++)
  2172 	{
  2173 	    /* for each punctuation character in the line */
  2174             if (strchr(".?!,;:",aline[i]) && (strchr(".?!,;:",aline[i+1])) &&
  2175 	      aline[i] && aline[i+1])
  2176 	    {
  2177 		/* followed by punctuation, it's a query, unless . . . */
  2178                 if (aline[i]==aline[i+1] && (aline[i]=='.' || aline[i]=='?' ||
  2179 		  aline[i]=='!') ||
  2180 		  !warnings->dotcomma && aline[i]=='.' && aline[i+1]==',' ||
  2181 		  warnings->isFrench && !strncmp(aline+i,",...",4) ||
  2182 		  warnings->isFrench && !strncmp(aline+i,"...,",4) ||
  2183 		  warnings->isFrench && !strncmp(aline+i,";...",4) ||
  2184 		  warnings->isFrench && !strncmp(aline+i,"...;",4) ||
  2185 		  warnings->isFrench && !strncmp(aline+i,":...",4) ||
  2186 		  warnings->isFrench && !strncmp(aline+i,"...:",4) ||
  2187 		  warnings->isFrench && !strncmp(aline+i,"!...",4) ||
  2188 		  warnings->isFrench && !strncmp(aline+i,"...!",4) ||
  2189 		  warnings->isFrench && !strncmp(aline+i,"?...",4) ||
  2190 		  warnings->isFrench && !strncmp(aline+i,"...?",4))
  2191 		{
  2192 		    if (warnings->isFrench && !strncmp(aline+i,",...",4) ||
  2193 		      warnings->isFrench && !strncmp(aline+i,"...,",4) ||
  2194 		      warnings->isFrench && !strncmp(aline+i,";...",4) ||
  2195 		      warnings->isFrench && !strncmp(aline+i,"...;",4) ||
  2196 		      warnings->isFrench && !strncmp(aline+i,":...",4) ||
  2197 		      warnings->isFrench && !strncmp(aline+i,"...:",4) ||
  2198 		      warnings->isFrench && !strncmp(aline+i,"!...",4) ||
  2199 		      warnings->isFrench && !strncmp(aline+i,"...!",4) ||
  2200 		      warnings->isFrench && !strncmp(aline+i,"?...",4) ||
  2201 		      warnings->isFrench && !strncmp(aline+i,"...?",4))
  2202 			i+=4;
  2203 		    ; /* do nothing for .. !! and ?? which can be legit */
  2204 		}
  2205                 else
  2206 		{
  2207                     if (pswit[ECHO_SWITCH])
  2208 			printf("\n%s\n",aline);
  2209                     if (!pswit[OVERVIEW_SWITCH])
  2210                         printf("    Line %ld column %d - Double punctuation?\n",
  2211 			  linecnt,i+1);
  2212                     else
  2213                         cnt_punct++;
  2214 		}
  2215 	    }
  2216 	}
  2217         s=aline;
  2218         while (strstr(s," \" "))
  2219 	{
  2220             if (pswit[ECHO_SWITCH])
  2221 		printf("\n%s\n",aline);
  2222             if (!pswit[OVERVIEW_SWITCH])
  2223                 printf("    Line %ld column %d - Spaced doublequote?\n",
  2224 		  linecnt,(int)(strstr(s," \" ")-aline+1));
  2225             else
  2226                 cnt_punct++;
  2227             s=strstr(s," \" ")+2;
  2228 	}
  2229         s=aline;
  2230         while (strstr(s," ' "))
  2231 	{
  2232             if (pswit[ECHO_SWITCH])
  2233 		printf("\n%s\n",aline);
  2234             if (!pswit[OVERVIEW_SWITCH])
  2235                 printf("    Line %ld column %d - Spaced singlequote?\n",
  2236 		  linecnt,(int)(strstr(s," ' ")-aline+1));
  2237             else
  2238                 cnt_punct++;
  2239             s=strstr(s," ' ")+2;
  2240 	}
  2241         s=aline;
  2242         while (strstr(s," ` "))
  2243 	{
  2244             if (pswit[ECHO_SWITCH])
  2245 		printf("\n%s\n",aline);
  2246             if (!pswit[OVERVIEW_SWITCH])
  2247                 printf("    Line %ld column %d - Spaced singlequote?\n",
  2248 		  linecnt,(int)(strstr(s," ` ")-aline+1));
  2249             else
  2250                 cnt_punct++;
  2251             s=strstr(s," ` ")+2;
  2252 	}
  2253         /* check special case of 'S instead of 's at end of word */
  2254         s=aline+1;
  2255         while (*s)
  2256 	{
  2257             if (*s==CHAR_SQUOTE && s[1]=='S' && s[-1]>='a' && s[-1]<='z')
  2258 	    {
  2259                 if (pswit[ECHO_SWITCH])
  2260 		    printf("\n%s\n",aline);
  2261                 if (!pswit[OVERVIEW_SWITCH])
  2262                     printf("    Line %ld column %d - Capital \"S\"?\n",
  2263 		      linecnt,(int)(s-aline+2));
  2264                 else
  2265                     cnt_punct++;
  2266 	    }
  2267             s++;
  2268 	}
  2269         /*
  2270 	 * Now check special cases - start and end of line -
  2271          * for single and double quotes. Start is sometimes [sic]
  2272          * but better to query it anyway.
  2273          * While we're here, check for dash at end of line.
  2274 	 */
  2275         llen=strlen(aline);
  2276         if (llen>1)
  2277 	{
  2278             if (aline[llen-1]==CHAR_DQUOTE || aline[llen-1]==CHAR_SQUOTE ||
  2279 	      aline[llen-1]==CHAR_OPEN_SQUOTE)
  2280                 if (aline[llen-2]==CHAR_SPACE)
  2281 		{
  2282                     if (pswit[ECHO_SWITCH])
  2283 			printf("\n%s\n",aline);
  2284                     if (!pswit[OVERVIEW_SWITCH])
  2285                         printf("    Line %ld column %d - Spaced quote?\n",
  2286 			  linecnt,llen);
  2287                     else
  2288                         cnt_punct++;
  2289 		}
  2290             if ((aline[0]==CHAR_SQUOTE || aline[0]==CHAR_OPEN_SQUOTE) &&
  2291 	      aline[1]==CHAR_SPACE)
  2292 	    {
  2293 		if (pswit[ECHO_SWITCH])
  2294 		    printf("\n%s\n",aline);
  2295 		if (!pswit[OVERVIEW_SWITCH])
  2296 		    printf("    Line %ld column 1 - Spaced quote?\n",linecnt);
  2297 		else
  2298 		    cnt_punct++;
  2299 	    }
  2300             /*
  2301 	     * Dash at end of line may well be legit - paranoid mode only
  2302              * and don't report em-dash at line-end.
  2303 	     */
  2304             if (pswit[PARANOID_SWITCH] && warnings->hyphen)
  2305 	    {
  2306                 for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--)
  2307 		    ;
  2308                 if (aline[i]=='-' && aline[i-1]!='-')
  2309 		{
  2310                     if (pswit[ECHO_SWITCH])
  2311 			printf("\n%s\n",aline);
  2312                     if (!pswit[OVERVIEW_SWITCH])
  2313                         printf("    Line %ld column %d - "
  2314 			  "Hyphen at end of line?\n",linecnt,i);
  2315 		}
  2316 	    }
  2317 	}
  2318         /*
  2319 	 * Brackets are often unspaced, but shouldn't be surrounded by alpha.
  2320          * If so, suspect a scanno like "a]most".
  2321 	 */
  2322         llen=strlen(aline);
  2323         for (i=1;i<llen-1;i++)
  2324 	{
  2325 	    /* for each bracket character in the line except 1st & last */
  2326             if (strchr("{[()]}",aline[i]) && gcisalpha(aline[i-1]) &&
  2327 	      gcisalpha(aline[i+1]))
  2328 	    {
  2329                 if (pswit[ECHO_SWITCH])
  2330 		    printf("\n%s\n",aline);
  2331                 if (!pswit[OVERVIEW_SWITCH])
  2332                     printf("    Line %ld column %d - Unspaced bracket?\n",
  2333 		      linecnt,i);
  2334                 else
  2335                     cnt_punct++;
  2336 	    }
  2337 	}
  2338         llen=strlen(aline);
  2339         if (warnings->endquote)
  2340 	{
  2341             for (i=1;i<llen;i++)
  2342 	    {
  2343 		/* for each character in the line except 1st */
  2344                 if (aline[i]==CHAR_DQUOTE && isalpha(aline[i-1]))
  2345 		{
  2346 		    if (pswit[ECHO_SWITCH])
  2347 			printf("\n%s\n",aline);
  2348 		    if (!pswit[OVERVIEW_SWITCH])
  2349 			printf("    Line %ld column %d - "
  2350 			  "endquote missing punctuation?\n",linecnt,i);
  2351 		    else
  2352 			cnt_punct++;
  2353 		}
  2354 	    }
  2355 	}
  2356 	/*
  2357          * Check for <HTML TAG>.
  2358          * If there is a < in the line, followed at some point
  2359          * by a > then we suspect HTML.
  2360 	 */
  2361         if (strstr(aline,"<") && strstr(aline,">"))
  2362 	{
  2363             i=(signed int)(strstr(aline,">")-strstr(aline,"<")+1);
  2364             if (i>0)
  2365 	    {
  2366                 strncpy(wrk,strstr(aline,"<"),i);
  2367                 wrk[i]=0;
  2368                 if (pswit[ECHO_SWITCH])
  2369 		    printf("\n%s\n",aline);
  2370                 if (!pswit[OVERVIEW_SWITCH])
  2371                     printf("    Line %ld column %d - HTML Tag? %s \n",
  2372 		      linecnt,(int)(strstr(aline,"<")-aline)+1,wrk);
  2373                 else
  2374                     cnt_html++;
  2375 	    }
  2376 	}
  2377         /*
  2378 	 * Check for &symbol; HTML.
  2379          * If there is a & in the line, followed at
  2380          * some point by a ; then we suspect HTML.
  2381 	 */
  2382         if (strstr(aline,"&") && strstr(aline,";"))
  2383 	{
  2384             i=(int)(strstr(aline,";")-strstr(aline,"&")+1);
  2385             for (s=strstr(aline,"&");s<strstr(aline,";");s++)   
  2386                 if (*s==CHAR_SPACE)
  2387 		    i=0;                /* Don't report "Jones & Son;" */
  2388             if (i>0)
  2389 	    {
  2390                 strncpy(wrk,strstr(aline,"&"),i);
  2391                 wrk[i]=0;
  2392                 if (pswit[ECHO_SWITCH])
  2393 		    printf("\n%s\n",aline);
  2394                 if (!pswit[OVERVIEW_SWITCH])
  2395                     printf("    Line %ld column %d - HTML symbol? %s \n",
  2396 		      linecnt,(int)(strstr(aline,"&")-aline)+1,wrk);
  2397                 else
  2398                     cnt_html++;
  2399 	    }
  2400 	}
  2401         /*
  2402 	 * At end of paragraph, check for mismatched quotes.
  2403          * We don't want to report an error immediately, since it is a
  2404          * common convention to omit the quotes at end of paragraph if
  2405          * the next paragraph is a continuation of the same speaker.
  2406          * Where this is the case, the next para should begin with a
  2407          * quote, so we store the warning message and only display it
  2408          * at the top of the next iteration if the new para doesn't
  2409          * start with a quote.
  2410          * The -p switch overrides this default, and warns of unclosed
  2411          * quotes on _every_ paragraph, whether the next begins with a
  2412          * quote or not.
  2413 	 */
  2414         if (isemptyline)
  2415 	{
  2416 	    /* end of para - add up the totals */
  2417             if (counters.quot%2)
  2418                 sprintf(dquote_err,"    Line %ld - Mismatched quotes\n",
  2419 		  linecnt);
  2420             if (pswit[SQUOTE_SWITCH] && counters.open_single_quote &&
  2421 	      counters.open_single_quote!=counters.close_single_quote)
  2422                 sprintf(squote_err,"    Line %ld - Mismatched singlequotes?\n",
  2423 		  linecnt);
  2424             if (pswit[SQUOTE_SWITCH] && counters.open_single_quote &&
  2425 	      counters.open_single_quote!=counters.close_single_quote &&
  2426 	      counters.open_single_quote!=counters.close_single_quote+1)
  2427 		/*
  2428 		 * Flag it to be noted regardless of the
  2429 		 * first char of the next para.
  2430 		 */
  2431                 squot=1;
  2432             if (counters.r_brack)
  2433                 sprintf(rbrack_err,"    Line %ld - "
  2434 		  "Mismatched round brackets?\n",linecnt);
  2435             if (counters.s_brack)
  2436                 sprintf(sbrack_err,"    Line %ld - "
  2437 		  "Mismatched square brackets?\n",linecnt);
  2438             if (counters.c_brack)
  2439                 sprintf(cbrack_err,"    Line %ld - "
  2440 		  "Mismatched curly brackets?\n",linecnt);
  2441             if (counters.c_unders%2)
  2442                 sprintf(unders_err,"    Line %ld - Mismatched underscores?\n",
  2443 		  linecnt);
  2444 	    memset(&counters,0,sizeof(counters));
  2445 	    /* let the next iteration know that it's starting a new para */
  2446             isnewpara=1;
  2447 	}
  2448         /*
  2449 	 * Check for omitted punctuation at end of paragraph by working back
  2450 	 * through prevline. DW.
  2451          * Need to check this only for "normal" paras.
  2452          * So what is a "normal" para?
  2453          *    Not normal if one-liner (chapter headings, etc.)
  2454          *    Not normal if doesn't contain at least one locase letter
  2455          *    Not normal if starts with space
  2456 	 */
  2457         if (isemptyline)
  2458 	{
  2459 	    /* end of para */
  2460             for (s=prevline,i=0;*s && !i;s++)
  2461                 if (gcisletter(*s))
  2462 		    /* use i to indicate the presence of a letter on the line */
  2463                     i=1;
  2464             /*
  2465 	     * This next "if" is a problem.
  2466              * If we say "start_para_line <= linecnt - 1", that includes
  2467 	     * one-line "paragraphs" like chapter heads. Lotsa false positives.
  2468              * If we say "start_para_line < linecnt - 1" it doesn't, but then it
  2469              * misses genuine one-line paragraphs.
  2470 	     */
  2471             if (i && lastblen>2 && start_para_line<linecnt-1 &&
  2472 	      *prevline>CHAR_SPACE)
  2473 	    {
  2474                 for (i=strlen(prevline)-1;
  2475 		  (prevline[i]==CHAR_DQUOTE || prevline[i]==CHAR_SQUOTE) &&
  2476 		  prevline[i]>CHAR_SPACE && i>0;
  2477 		  i--)
  2478 		    ;
  2479                 for (;i>0;i--)
  2480 		{
  2481                     if (gcisalpha(prevline[i]))
  2482 		    {
  2483                         if (pswit[ECHO_SWITCH])
  2484 			    printf("\n%s\n",prevline);
  2485                         if (!pswit[OVERVIEW_SWITCH])
  2486                             printf("    Line %ld column %d - "
  2487 			      "No punctuation at para end?\n",
  2488 			      linecnt-1,strlen(prevline));
  2489                         else
  2490                             cnt_punct++;
  2491                         break;
  2492 		    }
  2493                     if (strchr("-.:!([{?}])",prevline[i]))
  2494                         break;
  2495 		}
  2496 	    }
  2497 	}
  2498         strcpy(prevline,aline);
  2499     }
  2500     fclose(infile);
  2501     if (!pswit[OVERVIEW_SWITCH])
  2502         for (i=0;i<MAX_QWORD;i++)
  2503             if (dupcnt[i])
  2504                 printf("\nNote: Queried word %s was duplicated %d time%s\n",
  2505 		  qword[i],dupcnt[i],"s");
  2506 }
  2507 
  2508 /*
  2509  * flgets:
  2510  *
  2511  * Get one line from the input stream, checking for
  2512  * the existence of exactly one CR/LF line-end per line.
  2513  *
  2514  * Returns: a pointer to the line.
  2515  */
  2516 char *flgets(char *theline,int maxlen,FILE *thefile,long lcnt)
  2517 {
  2518     char c;
  2519     int len,isCR,cint;
  2520     *theline=0;
  2521     len=isCR=0;
  2522     c=cint=fgetc(thefile);
  2523     do
  2524     {
  2525         if (cint==EOF)
  2526             return NULL;
  2527 	/* either way, it's end of line */
  2528         if (c==10)
  2529 	{
  2530             if (isCR)
  2531                 break;
  2532             else
  2533 	    {
  2534 		/* Error - a LF without a preceding CR */
  2535                 if (pswit[LINE_END_SWITCH])
  2536 		{
  2537                     if (pswit[ECHO_SWITCH])
  2538 			printf("\n%s\n",theline);
  2539                     if (!pswit[OVERVIEW_SWITCH])
  2540                         printf("    Line %ld - No CR?\n",lcnt);
  2541                     else
  2542                         cnt_lineend++;
  2543 		}
  2544                 break;
  2545 	    }
  2546 	}
  2547         if (c==13)
  2548 	{
  2549             if (isCR)
  2550 	    {
  2551 		/* Error - two successive CRs */
  2552                 if (pswit[LINE_END_SWITCH])
  2553 		{
  2554                     if (pswit[ECHO_SWITCH])
  2555 			printf("\n%s\n",theline);
  2556                     if (!pswit[OVERVIEW_SWITCH])
  2557                         printf("    Line %ld - Two successive CRs?\n",lcnt);
  2558                     else
  2559                         cnt_lineend++;
  2560 		}
  2561 	    }
  2562             isCR=1;
  2563 	}
  2564         else
  2565 	{
  2566             if (pswit[LINE_END_SWITCH] && isCR)
  2567 	    {
  2568                 if (pswit[ECHO_SWITCH])
  2569 		    printf("\n%s\n",theline);
  2570                 if (!pswit[OVERVIEW_SWITCH])
  2571                     printf("    Line %ld column %d - CR without LF?\n",
  2572 		      lcnt,len+1);
  2573                 else
  2574                     cnt_lineend++;
  2575 	    }
  2576             theline[len]=c;
  2577             len++;
  2578             theline[len]=0;
  2579             isCR=0;
  2580 	}
  2581         c=cint=fgetc(thefile);
  2582     } while(len<maxlen);
  2583     if (pswit[MARKUP_SWITCH])  
  2584         postprocess_for_HTML(theline);
  2585     if (pswit[DP_SWITCH])  
  2586         postprocess_for_DP(theline);
  2587     return theline;
  2588 }
  2589 
  2590 /*
  2591  * mixdigit:
  2592  *
  2593  * Takes a "word" as a parameter, and checks whether it
  2594  * contains a mixture of alpha and digits. Generally, this is an
  2595  * error, but may not be for cases like 4th or L5 12s. 3d.
  2596  *
  2597  * Returns: 0 if no error found, 1 if error.
  2598  */
  2599 int mixdigit(char *checkword)
  2600 {
  2601     int wehaveadigit,wehavealetter,firstdigits,query,wl;
  2602     char *s;
  2603     wehaveadigit=wehavealetter=query=0;
  2604     for (s=checkword;*s;s++)
  2605         if (gcisalpha(*s))
  2606             wehavealetter=1;
  2607         else
  2608             if (gcisdigit(*s))
  2609                 wehaveadigit=1;
  2610     if (wehaveadigit && wehavealetter)
  2611     {
  2612 	/* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
  2613         query=1;
  2614         wl=strlen(checkword);
  2615         for (firstdigits=0;gcisdigit(checkword[firstdigits]);firstdigits++)
  2616             ;
  2617         /* digits, ending in st, rd, nd, th of either case */
  2618         if (firstdigits+2==wl && (matchword(checkword+wl-2,"st") ||
  2619 	  matchword(checkword+wl-2,"rd") || matchword(checkword+wl-2,"nd") ||
  2620 	  matchword(checkword+wl-2,"th")))
  2621 	    query=0;
  2622         if (firstdigits+3==wl && (matchword(checkword+wl-3,"sts") ||
  2623 	  matchword(checkword+wl-3,"rds") || matchword(checkword+wl-3,"nds") ||
  2624 	  matchword(checkword+wl-3,"ths")))
  2625 	    query=0;
  2626         if (firstdigits+3==wl && (matchword(checkword+wl-4,"stly") ||
  2627 	  matchword(checkword+wl-4,"rdly") ||
  2628 	  matchword(checkword+wl-4,"ndly") || matchword(checkword+wl-4,"thly")))
  2629 	    query=0;
  2630         /* digits, ending in l, L, s or d */
  2631         if (firstdigits+1==wl && (checkword[wl-1]=='l' ||
  2632 	  checkword[wl-1]=='L' || checkword[wl-1]=='s' || checkword[wl-1]=='d'))
  2633 	    query=0;
  2634         /*
  2635 	 * L at the start of a number, representing Britsh pounds, like L500.
  2636          * This is cute. We know the current word is mixeddigit. If the first
  2637          * letter is L, there must be at least one digit following. If both
  2638          * digits and letters follow, we have a genuine error, else we have a
  2639          * capital L followed by digits, and we accept that as a non-error.
  2640 	 */
  2641         if (checkword[0]=='L' && !mixdigit(checkword+1))
  2642 	    query=0;
  2643     }
  2644     return query;
  2645 }
  2646 
  2647 /*
  2648  * getaword:
  2649  *
  2650  * Extracts the first/next "word" from the line, and puts
  2651  * it into "thisword". A word is defined as one English word unit--or
  2652  * at least that's the aim.
  2653  *
  2654  * Returns: a pointer to the position in the line where we will start
  2655  *          looking for the next word.
  2656  */
  2657 char *getaword(char *fromline,char *thisword)
  2658 {
  2659     int i,wordlen;
  2660     char *s;
  2661     wordlen=0;
  2662     for (;!gcisdigit(*fromline) && !gcisalpha(*fromline) && *fromline;
  2663       fromline++)
  2664 	;
  2665     /*
  2666      * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
  2667      * Especially yucky is the case of L1,000
  2668      * This section looks for a pattern of characters including a digit
  2669      * followed by a comma or period followed by one or more digits.
  2670      * If found, it returns this whole pattern as a word; otherwise we discard
  2671      * the results and resume our normal programming.
  2672      */
  2673     s=fromline;
  2674     for (;(gcisdigit(*s) || gcisalpha(*s) || *s==',' || *s=='.') &&
  2675       wordlen<MAXWORDLEN;s++)
  2676     {
  2677 	thisword[wordlen]=*s;
  2678         wordlen++;
  2679     }
  2680     thisword[wordlen]=0;
  2681     for (i=1;i<wordlen-1;i++)
  2682     {
  2683         if (thisword[i]=='.' || thisword[i]==',')
  2684 	{
  2685             if (gcisdigit(thisword[i-1]) && gcisdigit(thisword[i-1]))
  2686 	    {
  2687                 fromline=s;
  2688                 return fromline;
  2689 	    }
  2690 	}
  2691     }
  2692     /* we didn't find a punctuated number - do the regular getword thing */
  2693     wordlen=0;
  2694     for (;(gcisdigit(*fromline) || gcisalpha(*fromline) || *fromline=='\'') &&
  2695       wordlen<MAXWORDLEN;fromline++)
  2696     {
  2697         thisword[wordlen]=*fromline;
  2698         wordlen++;
  2699     }
  2700     thisword[wordlen]=0;
  2701     return fromline;
  2702 }
  2703 
  2704 /*
  2705  * matchword:
  2706  *
  2707  * A case-insensitive string matcher.
  2708  */
  2709 int matchword(char *checkfor,char *thisword)
  2710 {
  2711     unsigned int ismatch,i;
  2712     if (strlen(checkfor)!=strlen(thisword))
  2713 	return 0;
  2714     ismatch=1;     /* assume a match until we find a difference */
  2715     for (i=0;i<strlen(checkfor);i++)
  2716         if (toupper(checkfor[i])!=toupper(thisword[i]))
  2717             ismatch=0;
  2718     return ismatch;
  2719 }
  2720 
  2721 /*
  2722  * lowerit:
  2723  *
  2724  * Lowercase the line.
  2725  */
  2726 
  2727 void lowerit(char *theline)
  2728 {
  2729     for (;*theline;theline++)
  2730         if (*theline>='A' && *theline<='Z')
  2731             *theline+=32;
  2732 }
  2733 
  2734 /*
  2735  * isroman:
  2736  *
  2737  * Is this word a Roman Numeral?
  2738  *
  2739  * It doesn't actually validate that the number is a valid Roman Numeral--for
  2740  * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
  2741  * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
  2742  * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
  2743  * expressions thereof, except when it came to taxes. Allow any number of M,
  2744  * an optional D, an optional CM or CD, any number of optional Cs, an optional
  2745  * XL or an optional XC, an optional IX or IV, an optional V and any number
  2746  * of optional Is.
  2747  */
  2748 int isroman(char *t)
  2749 {
  2750     char *s;
  2751     if (!t || !*t)
  2752 	return 0;
  2753     s=t;
  2754     while (*t=='m' && *t)
  2755 	t++;
  2756     if (*t=='d')
  2757 	t++;
  2758     if (*t=='c' && t[1]=='m')
  2759 	t+=2;
  2760     if (*t=='c' && t[1]=='d')
  2761 	t+=2;
  2762     while (*t=='c' && *t)
  2763 	t++;
  2764     if (*t=='x' && t[1]=='l')
  2765 	t+=2;
  2766     if (*t=='x' && t[1]=='c')
  2767 	t+=2;
  2768     if (*t=='l')
  2769 	t++;
  2770     while (*t=='x' && *t)
  2771 	t++;
  2772     if (*t=='i' && t[1]=='x')
  2773 	t+=2;
  2774     if (*t=='i' && t[1]=='v')
  2775 	t+=2;
  2776     if (*t=='v')
  2777 	t++;
  2778     while (*t=='i' && *t)
  2779 	t++;
  2780     return !*t;
  2781 }
  2782 
  2783 /*
  2784  * gcisalpha:
  2785  *
  2786  * A version of isalpha() that is somewhat lenient on 8-bit texts.
  2787  * If we use the standard function, 8-bit accented characters break
  2788  * words, so that tete with accented characters appears to be two words, "t"
  2789  * and "t", with 8-bit characters between them. This causes over-reporting of
  2790  * errors. gcisalpha() recognizes accented letters from the CP1252 (Windows)
  2791  * and ISO-8859-1 character sets, which are the most common PG 8-bit types.
  2792  */
  2793 int gcisalpha(unsigned char c)
  2794 {
  2795     if (c>='a' && c<='z')
  2796 	return 1;
  2797     if (c>='A' && c<='Z')
  2798 	return 1;
  2799     if (c<140)
  2800 	return 0;
  2801     if (c>=192 && c!=208 && c!=215 && c!=222 && c!=240 && c!=247 && c!=254)
  2802 	return 1;
  2803     if (c==140 || c==142 || c==156 || c==158 || c==159)
  2804 	return 1;
  2805     return 0;
  2806 }
  2807 
  2808 /*
  2809  * gcisdigit:
  2810  *
  2811  * A version of isdigit() that doesn't get confused in 8-bit texts.
  2812  */
  2813 int gcisdigit(unsigned char c)
  2814 {   
  2815     return c>='0' && c<='9';
  2816 }
  2817 
  2818 /*
  2819  * gcisletter:
  2820  *
  2821  * A version of isletter() that doesn't get confused in 8-bit texts.
  2822  * NB: this is ISO-8891-1-specific.
  2823  */
  2824 int gcisletter(unsigned char c)
  2825 {   
  2826     return c>='A' && c<='Z' || c>='a' && c<='z' || c>=192;
  2827 }
  2828 
  2829 /*
  2830  * gcstrchr:
  2831  *
  2832  * Wraps strchr to return NULL if the character being searched for is zero.
  2833  */
  2834 char *gcstrchr(char *s,char c)
  2835 {
  2836     if (!c)
  2837 	return NULL;
  2838     return strchr(s,c);
  2839 }
  2840 
  2841 /*
  2842  * postprocess_for_DP:
  2843  *
  2844  * Invoked with the -d switch from flgets().
  2845  * It simply "removes" from the line a hard-coded set of common
  2846  * DP-specific tags, so that the line passed to the main routine has
  2847  * been pre-cleaned of DP markup.
  2848  */
  2849 void postprocess_for_DP(char *theline)
  2850 {
  2851     char *s,*t;
  2852     int i;
  2853     if (!*theline) 
  2854         return;
  2855     for (i=0;*DPmarkup[i];i++)
  2856     {
  2857         s=strstr(theline,DPmarkup[i]);
  2858         while (s)
  2859 	{
  2860             t=s+strlen(DPmarkup[i]);
  2861             while (*t)
  2862 	    {
  2863                 *s=*t;
  2864                 t++;
  2865 		s++;
  2866 	    }
  2867             *s=0;
  2868             s=strstr(theline,DPmarkup[i]);
  2869 	}
  2870     }
  2871 }
  2872 
  2873 /*
  2874  * postprocess_for_HTML:
  2875  *
  2876  * Invoked with the -m switch from flgets().
  2877  * It simply "removes" from the line a hard-coded set of common
  2878  * HTML tags and "replaces" a hard-coded set of common HTML
  2879  * entities, so that the line passed to the main routine has
  2880  * been pre-cleaned of HTML.
  2881  */
  2882 void postprocess_for_HTML(char *theline)
  2883 {
  2884     if (strstr(theline,"<") && strstr(theline,">"))
  2885         while (losemarkup(theline))
  2886             ;
  2887     while (loseentities(theline))
  2888         ;
  2889 }
  2890 
  2891 char *losemarkup(char *theline)
  2892 {
  2893     char *s,*t;
  2894     int i;
  2895     if (!*theline) 
  2896         return NULL;
  2897     s=strstr(theline,"<");
  2898     t=strstr(theline,">");
  2899     if (!s || !t)
  2900 	return NULL;
  2901     for (i=0;*markup[i];i++)
  2902         if (!tagcomp(s+1,markup[i]))
  2903 	{
  2904             if (!t[1])
  2905 	    {
  2906                 *s=0;
  2907                 return s;
  2908 	    }
  2909             else if (t>s)
  2910 	    {
  2911 		strcpy(s,t+1);
  2912 		return s;
  2913 	    }
  2914         }
  2915     /* It's an unrecognized <xxx>. */
  2916     return NULL;
  2917 }
  2918 
  2919 char *loseentities(char *theline)
  2920 {
  2921     int i;
  2922     char *s,*t;
  2923     if (!*theline) 
  2924         return NULL;
  2925     for (i=0;*entities[i].htmlent;i++)
  2926     {
  2927         s=strstr(theline,entities[i].htmlent);
  2928         if (s)
  2929 	{
  2930             t=malloc((size_t)strlen(s));
  2931             if (!t)
  2932 		return NULL;
  2933             strcpy(t,s+strlen(entities[i].htmlent));
  2934             strcpy(s,entities[i].textent);
  2935             strcat(s,t);
  2936             free(t);
  2937             return theline;
  2938 	}
  2939     }
  2940     for (i=0;*entities[i].htmlnum;i++)
  2941     {
  2942         s=strstr(theline,entities[i].htmlnum);
  2943         if (s)
  2944 	{
  2945             t=malloc((size_t)strlen(s));
  2946             if (!t)
  2947 		return NULL;
  2948             strcpy(t,s+strlen(entities[i].htmlnum));
  2949             strcpy(s,entities[i].textent);
  2950             strcat(s,t);
  2951             free(t);
  2952             return theline;
  2953 	}
  2954     }
  2955     return NULL;
  2956 }
  2957 
  2958 int tagcomp(char *strin,char *basetag)
  2959 {
  2960     char *s,*t;
  2961     s=basetag;
  2962     t=strin;
  2963     if (*t=='/')
  2964 	t++; /* ignore a slash */
  2965     while (*s && *t)
  2966     {
  2967         if (tolower(*s)!=tolower(*t))
  2968 	    return 1;
  2969         s++;
  2970 	t++;
  2971     }
  2972     return 0;
  2973 }
  2974 
  2975 void proghelp()
  2976 {
  2977     fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
  2978     fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
  2979     fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
  2980     fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
  2981       "For details, read the file COPYING.\n",stderr);
  2982     fputs("This is Free Software; "
  2983       "you may redistribute it under certain conditions (GPL);\n",stderr);
  2984     fputs("read the file COPYING for details.\n\n",stderr);
  2985     fputs("Usage is: bookloupe [-setpxloyhud] filename\n",stderr);
  2986     fputs("  where -s checks single quotes, -e suppresses echoing lines, "
  2987       "-t checks typos\n",stderr);
  2988     fputs("  -x (paranoid) switches OFF -t and extra checks, "
  2989       "-l turns OFF line-end checks\n",stderr);
  2990     fputs("  -o just displays overview without detail, "
  2991       "-h echoes header fields\n",stderr);
  2992     fputs("  -v (verbose) unsuppresses duplicate reporting, "
  2993       "-m suppresses markup\n",stderr);
  2994     fputs("  -d ignores DP-specific markup,\n",stderr);
  2995     fputs("  -u uses a file gutcheck.typ to query user-defined "
  2996       "possible typos\n",stderr);
  2997     fputs("Sample usage: bookloupe warpeace.txt \n",stderr);
  2998     fputs("\n",stderr);
  2999     fputs("Bookloupe looks for errors in Project Gutenberg(TM) etexts.\n",
  3000       stderr);
  3001     fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
  3002       "non-ASCII\n",stderr);
  3003     fputs("characters like accented letters, "
  3004       "lines longer than 75 or shorter than 55,\n",stderr);
  3005     fputs("unbalanced quotes or brackets, "
  3006       "a variety of badly formatted punctuation, \n",stderr);
  3007     fputs("HTML tags, some likely typos. "
  3008       "It is NOT a substitute for human judgement.\n",stderr);
  3009     fputs("\n",stderr);
  3010 }