bookloupe-testing: bookloupe/bookloupe.c@865063352146

     1 /*************************************************************************/

     2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */

     3 /*                                                                       */

     4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>                  */

     5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>                     */

     6 /*                                                                       */

     7 /* This program is free software; you can redistribute it and/or modify  */

     8 /* it under the terms of the GNU General Public License as published by  */

     9 /* the Free Software Foundation; either version 2 of the License, or     */

    10 /* (at your option) any later version.                                   */

    11 /*                                                                       */

    12 /* This program is distributed in the hope that it will be useful,       */

    13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of        */

    14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the          */

    15 /* GNU General Public License for more details.                          */

    16 /*                                                                       */

    17 /* You should have received a copy of the GNU General Public License     */

    18 /* along with this program. If not, see <http://www.gnu.org/licenses/>.  */

    19 /*************************************************************************/

    21 #include <stdio.h>

    22 #include <stdlib.h>

    23 #include <string.h>

    24 #include <ctype.h>

    26 #define MAXWORDLEN    80    /* max length of one word             */

    27 #define LINEBUFSIZE 2048    /* buffer size for an input line      */

    29 #define MAX_USER_TYPOS 1000

    30 #define USERTYPO_FILE "gutcheck.typ"

    32 #ifndef MAX_PATH

    33 #define MAX_PATH 16384

    34 #endif

    36 char aline[LINEBUFSIZE];

    37 char prevline[LINEBUFSIZE];

    39 /* Common typos. */

    40 char *typo[] = {

    41     "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",

    42     "nad", "te", "ig", "acn",  "ahve", "alot", "anbd", "andt", "awya", "aywa",

    43     "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",

    44     "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",

    45     "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",

    46     "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",

    47     "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",

    48     "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",

    49     "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",

    50     "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",

    51     "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",

    52     "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",

    53     "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",

    54     "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",

    55     "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",

    56     "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",

    57     "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",

    58     "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",

    59     "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",

    60     "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",

    61     "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",

    62     "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",

    63     "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",

    64     "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",

    65     "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",

    66     "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",

    67     "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",

    68     "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",

    69     "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",

    70     "se", ""

    71 };

    73 char *usertypo[MAX_USER_TYPOS];

    75 /* Common abbreviations and other OK words not to query as typos. */

    76 char *okword[] = {

    77     "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",

    78     "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",

    79     "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",

    80     "outbid", "outbids", "frostbite", "frostbitten", ""

    81 };

    83 /* Common abbreviations that cause otherwise unexplained periods. */

    84 char *abbrev[] = {

    85     "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",

    86     "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""

    87 };

    89 /*

    90  * Two-Letter combinations that rarely if ever start words,

    91  * but are common scannos or otherwise common letter combinations.

    92  */

    93 char *nostart[] = {

    94     "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""

    95 };

    97 /*

    98  * Two-Letter combinations that rarely if ever end words,

    99  * but are common scannos or otherwise common letter combinations.

   100  */

   101 char *noend[] = {

   102     "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",

   103     "sw", "gr", "sl", "cl", "iy", ""

   104 };

   106 char *markup[] = {    /* lower-case HTML element names; the list ends with an empty string */

   107     "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",

   108     "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",

   109     "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",

   110     "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""

   111 };    /* NOTE(review): presumably consulted by the HTML-tag checks (linehasmarkup/tagcomp) -- confirm */

   113 char *DPmarkup[] = {    /* markup strings specific to DP texts (see the D switch); ends with an empty string */

   114     "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""

   115 };    /* NOTE(review): presumably DP means Distributed Proofreaders and postprocess_for_DP() uses this list -- confirm */

   117 char *nocomma[] = {    /* lower-case words, list ends with an empty string; "mrs" appears twice */

   118     "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",

   119     "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",

   120     "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",

   121     "during", "let", "toward", "among", ""

   122 };    /* NOTE(review): going by the name, words not expected to be followed by a comma -- confirm against the user of this table */

   124 char *noperiod[] = {    /* lower-case words, list ends with an empty string */

   125     "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",

   126     "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",

   127     "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",

   128     "among", "those", "into", "whom", "having", "thence", ""

   129 };    /* NOTE(review): going by the name, words not expected to be followed by a period -- confirm against the user of this table */

   131 char vowels[] = "aeiouàáâãäæèéêëìíîïòóôõöùúûü";    /* lower-case vowels incl. accented forms; NOTE(review): the bytes stored depend on this file's encoding (presumably ISO-8859-1, one byte per letter) -- confirm */

   /*
    * HTML entity table.
    *
    * Each row gives the named form of an entity (htmlent), its numeric
    * form (htmlnum) and a plain-text stand-in (textent). The last row is
    * a sentinel whose htmlent and htmlnum are empty strings and whose
    * textent is left as a null pointer, so scan for the end on htmlent
    * or htmlnum, not on textent (the "&circ;" row has an empty textent).
    *
    * "&deg;" and "&pound;" each appear twice with identical values.
    */
   struct {
       char *htmlent;      /* named entity, e.g. "&amp;"        */
       char *htmlnum;      /* numeric entity, e.g. "&#38;"      */
       char *textent;      /* plain-text replacement, e.g. "&"  */
   } entities[] = {
       { "&amp;",    "&#38;",   "&" },
       { "&lt;",     "&#60;",   "<" },
       { "&gt;",     "&#62;",   ">" },
       { "&deg;",    "&#176;",  " degrees" },
       { "&pound;",  "&#163;",  "L" },
       { "&quot;",   "&#34;",   "\"" },   /* quotation mark = APL quote */
       { "&OElig;",  "&#338;",  "OE" },   /* latin capital ligature OE */
       { "&oelig;",  "&#339;",  "oe" },   /* latin small ligature oe */
       { "&Scaron;", "&#352;",  "S" },    /* latin capital letter S with caron */
       { "&scaron;", "&#353;",  "s" },    /* latin small letter s with caron */
       { "&Yuml;",   "&#376;",  "Y" },    /* latin capital letter Y with diaeresis */
       { "&circ;",   "&#710;",  "" },     /* modifier letter circumflex accent */
       { "&tilde;",  "&#732;",  "~" },    /* small tilde, U+02DC ISOdia */
       { "&ensp;",   "&#8194;", " " },    /* en space, U+2002 ISOpub */
       { "&emsp;",   "&#8195;", " " },    /* em space, U+2003 ISOpub */
       { "&thinsp;", "&#8201;", " " },    /* thin space, U+2009 ISOpub */
       { "&ndash;",  "&#8211;", "-" },    /* en dash, U+2013 ISOpub */
       { "&mdash;",  "&#8212;", "--" },   /* em dash, U+2014 ISOpub */
       { "&rsquo;",  "&#8217;", "'" },    /* right single quotation mark */
       { "&sbquo;",  "&#8218;", "'" },    /* single low-9 quotation mark */
       { "&ldquo;",  "&#8220;", "\"" },   /* left double quotation mark */
       { "&rdquo;",  "&#8221;", "\"" },   /* right double quotation mark */
       { "&bdquo;",  "&#8222;", "\"" },   /* double low-9 quotation mark */
       { "&lsaquo;", "&#8249;", "\"" },   /* single left-pointing angle quotation mark */
       { "&rsaquo;", "&#8250;", "\"" },   /* single right-pointing angle quotation mark */
       { "&nbsp;",   "&#160;",  " " },    /* no-break space = non-breaking space, */
       { "&iexcl;",  "&#161;",  "!" },    /* inverted exclamation mark */
       { "&cent;",   "&#162;",  "c" },    /* cent sign */
       { "&pound;",  "&#163;",  "L" },    /* pound sign */
       { "&curren;", "&#164;",  "$" },    /* currency sign */
       { "&yen;",    "&#165;",  "Y" },    /* yen sign = yuan sign */
       { "&sect;",   "&#167;",  "--" },   /* section sign */
       { "&uml;",    "&#168;",  " " },    /* diaeresis = spacing diaeresis */
       { "&copy;",   "&#169;",  "(C) " }, /* copyright sign */
       { "&ordf;",   "&#170;",  " " },    /* feminine ordinal indicator */
       { "&laquo;",  "&#171;",  "\"" },   /* left-pointing double angle quotation mark */
       { "&shy;",    "&#173;",  "-" },    /* soft hyphen = discretionary hyphen */
       { "&reg;",    "&#174;",  "(R) " }, /* registered sign = registered trade mark sign */
       { "&macr;",   "&#175;",  " " },    /* macron = spacing macron = overline */
       { "&deg;",    "&#176;",  " degrees" }, /* degree sign */
       { "&plusmn;", "&#177;",  "+-" },   /* plus-minus sign = plus-or-minus sign */
       { "&sup2;",   "&#178;",  "2" },    /* superscript two = superscript digit two */
       { "&sup3;",   "&#179;",  "3" },    /* superscript three = superscript digit three */
       { "&acute;",  "&#180;",  " " },    /* acute accent = spacing acute */
       { "&micro;",  "&#181;",  "m" },    /* micro sign */
       { "&para;",   "&#182;",  "--" },   /* pilcrow sign = paragraph sign */
       { "&cedil;",  "&#184;",  " " },    /* cedilla = spacing cedilla */
       { "&sup1;",   "&#185;",  "1" },    /* superscript one = superscript digit one */
       { "&ordm;",   "&#186;",  " " },    /* masculine ordinal indicator */
       { "&raquo;",  "&#187;",  "\"" },   /* right-pointing double angle quotation mark */
       { "&frac14;", "&#188;",  "1/4" },  /* vulgar fraction one quarter */
       { "&frac12;", "&#189;",  "1/2" },  /* vulgar fraction one half */
       { "&frac34;", "&#190;",  "3/4" },  /* vulgar fraction three quarters */
       { "&iquest;", "&#191;",  "?" },    /* inverted question mark */
       { "&Agrave;", "&#192;",  "A" },    /* latin capital letter A with grave */
       { "&Aacute;", "&#193;",  "A" },    /* latin capital letter A with acute */
       { "&Acirc;",  "&#194;",  "A" },    /* latin capital letter A with circumflex */
       { "&Atilde;", "&#195;",  "A" },    /* latin capital letter A with tilde */
       { "&Auml;",   "&#196;",  "A" },    /* latin capital letter A with diaeresis */
       { "&Aring;",  "&#197;",  "A" },    /* latin capital letter A with ring above */
       { "&AElig;",  "&#198;",  "AE" },   /* latin capital letter AE */
       { "&Ccedil;", "&#199;",  "C" },    /* latin capital letter C with cedilla */
       { "&Egrave;", "&#200;",  "E" },    /* latin capital letter E with grave */
       { "&Eacute;", "&#201;",  "E" },    /* latin capital letter E with acute */
       { "&Ecirc;",  "&#202;",  "E" },    /* latin capital letter E with circumflex */
       { "&Euml;",   "&#203;",  "E" },    /* latin capital letter E with diaeresis */
       { "&Igrave;", "&#204;",  "I" },    /* latin capital letter I with grave */
       { "&Iacute;", "&#205;",  "I" },    /* latin capital letter I with acute */
       { "&Icirc;",  "&#206;",  "I" },    /* latin capital letter I with circumflex */
       { "&Iuml;",   "&#207;",  "I" },    /* latin capital letter I with diaeresis */
       { "&ETH;",    "&#208;",  "E" },    /* latin capital letter ETH */
       { "&Ntilde;", "&#209;",  "N" },    /* latin capital letter N with tilde */
       { "&Ograve;", "&#210;",  "O" },    /* latin capital letter O with grave */
       { "&Oacute;", "&#211;",  "O" },    /* latin capital letter O with acute */
       { "&Ocirc;",  "&#212;",  "O" },    /* latin capital letter O with circumflex */
       { "&Otilde;", "&#213;",  "O" },    /* latin capital letter O with tilde */
       { "&Ouml;",   "&#214;",  "O" },    /* latin capital letter O with diaeresis */
       { "&times;",  "&#215;",  "*" },    /* multiplication sign */
       { "&Oslash;", "&#216;",  "O" },    /* latin capital letter O with stroke */
       { "&Ugrave;", "&#217;",  "U" },    /* latin capital letter U with grave */
       { "&Uacute;", "&#218;",  "U" },    /* latin capital letter U with acute */
       { "&Ucirc;",  "&#219;",  "U" },    /* latin capital letter U with circumflex */
       { "&Uuml;",   "&#220;",  "U" },    /* latin capital letter U with diaeresis */
       { "&Yacute;", "&#221;",  "Y" },    /* latin capital letter Y with acute */
       { "&THORN;",  "&#222;",  "TH" },   /* latin capital letter THORN */
       { "&szlig;",  "&#223;",  "sz" },   /* latin small letter sharp s = ess-zed */
       { "&agrave;", "&#224;",  "a" },    /* latin small letter a with grave */
       { "&aacute;", "&#225;",  "a" },    /* latin small letter a with acute */
       { "&acirc;",  "&#226;",  "a" },    /* latin small letter a with circumflex */
       { "&atilde;", "&#227;",  "a" },    /* latin small letter a with tilde */
       { "&auml;",   "&#228;",  "a" },    /* latin small letter a with diaeresis */
       { "&aring;",  "&#229;",  "a" },    /* latin small letter a with ring above */
       { "&aelig;",  "&#230;",  "ae" },   /* latin small letter ae */
       { "&ccedil;", "&#231;",  "c" },    /* latin small letter c with cedilla */
       { "&egrave;", "&#232;",  "e" },    /* latin small letter e with grave */
       { "&eacute;", "&#233;",  "e" },    /* latin small letter e with acute */
       { "&ecirc;",  "&#234;",  "e" },    /* latin small letter e with circumflex */
       { "&euml;",   "&#235;",  "e" },    /* latin small letter e with diaeresis */
       { "&igrave;", "&#236;",  "i" },    /* latin small letter i with grave */
       { "&iacute;", "&#237;",  "i" },    /* latin small letter i with acute */
       { "&icirc;",  "&#238;",  "i" },    /* latin small letter i with circumflex */
       { "&iuml;",   "&#239;",  "i" },    /* latin small letter i with diaeresis */
       { "&eth;",    "&#240;",  "eth" },  /* latin small letter eth */
       { "&ntilde;", "&#241;",  "n" },    /* latin small letter n with tilde */
       { "&ograve;", "&#242;",  "o" },    /* latin small letter o with grave */
       { "&oacute;", "&#243;",  "o" },    /* latin small letter o with acute */
       { "&ocirc;",  "&#244;",  "o" },    /* latin small letter o with circumflex */
       { "&otilde;", "&#245;",  "o" },    /* latin small letter o with tilde */
       { "&ouml;",   "&#246;",  "o" },    /* latin small letter o with diaeresis */
       { "&divide;", "&#247;",  "/" },    /* division sign */
       { "&oslash;", "&#248;",  "o" },    /* latin small letter o with stroke */
       { "&ugrave;", "&#249;",  "u" },    /* latin small letter u with grave */
       { "&uacute;", "&#250;",  "u" },    /* latin small letter u with acute */
       { "&ucirc;",  "&#251;",  "u" },    /* latin small letter u with circumflex */
       { "&uuml;",   "&#252;",  "u" },    /* latin small letter u with diaeresis */
       { "&yacute;", "&#253;",  "y" },    /* latin small letter y with acute */
       { "&thorn;",  "&#254;",  "th" },   /* latin small letter thorn */
       { "&yuml;",   "&#255;",  "y" },    /* latin small letter y with diaeresis */
       { "", "" }                         /* sentinel; textent stays null */
   };

   259 /* special characters */

   260 #define CHAR_SPACE        32

   261 #define CHAR_TAB           9

   262 #define CHAR_LF           10

   263 #define CHAR_CR           13

   264 #define CHAR_DQUOTE       34

   265 #define CHAR_SQUOTE       39

   266 #define CHAR_OPEN_SQUOTE  96

   267 #define CHAR_TILDE       126

   268 #define CHAR_ASTERISK     42

   269 #define CHAR_FORESLASH    47

   270 #define CHAR_CARAT        94

   272 #define CHAR_UNDERSCORE    '_'

   273 #define CHAR_OPEN_CBRACK   '{'

   274 #define CHAR_CLOSE_CBRACK  '}'

   275 #define CHAR_OPEN_RBRACK   '('

   276 #define CHAR_CLOSE_RBRACK  ')'

   277 #define CHAR_OPEN_SBRACK   '['

   278 #define CHAR_CLOSE_SBRACK  ']'

   280 /* longest and shortest normal PG line lengths */

   281 #define LONGEST_PG_LINE   75

   282 #define WAY_TOO_LONG      80

   283 #define SHORTEST_PG_LINE  55

   285 #define SWITCHES "ESTPXLOYHWVMUD" /* switches:-                            */

   286                                   /*     D - ignore DP-specific markup     */

   287                                   /*     E - echo queried line             */

   288                                   /*     S - check single quotes           */

   289                                   /*     T - check common typos            */

   290                                   /*     P - require closure of quotes on  */

   291                                   /*         every paragraph               */

   292                                   /*     X - "Trust no one" :-) Paranoid!  */

   293                                   /*         Queries everything            */

   294                                   /*     L - line end checking defaults on */

   295                                   /*         -L turns it off               */

   296                                   /*     O - overview. Just shows counts.  */

   297                                   /*     Y - puts errors to stdout         */

   298                                   /*         instead of stderr             */

   299                                   /*     H - Echoes header fields          */

   300                                   /*     M - Ignore markup in < >          */

   301                                   /*     U - Use file of User-defined Typos*/

   302                                   /*     W - Defaults for use on Web upload*/

   303                                   /*     V - Verbose - list EVERYTHING!    */

   304 #define SWITNO 14                 /* max number of switch parms            */

   305                                   /*        - used for defining array-size */

   306 #define MINARGS   1               /* minimum no of args excl switches      */

   307 #define MAXARGS   1               /* maximum no of args excl switches      */

   309 int pswit[SWITNO];                /* program switches set by SWITCHES; pswit[i] belongs to the letter SWITCHES[i] */

   311 #define ECHO_SWITCH      0    /* 'E' - echo queried line (main() inverts it: echo is on unless -e is given) */

   312 #define SQUOTE_SWITCH    1    /* 'S' - check single quotes */

   313 #define TYPO_SWITCH      2    /* 'T' - check common typos (main() inverts it when paranoid mode is on) */

   314 #define QPARA_SWITCH     3    /* 'P' - require closure of quotes on every paragraph */

   315 #define PARANOID_SWITCH  4    /* 'X' - paranoid querying (main() inverts it: on unless -x is given) */

   316 #define LINE_END_SWITCH  5    /* 'L' - line end checking (main() inverts it: on unless -l is given) */

   317 #define OVERVIEW_SWITCH  6    /* 'O' - overview: just show counts */

   318 #define STDOUT_SWITCH    7    /* 'Y' - put errors to stdout instead of stderr */

   319 #define HEADER_SWITCH    8    /* 'H' - echo header fields */

   320 #define WEB_SWITCH       9    /* 'W' - defaults for use on Web upload */

   321 #define VERBOSE_SWITCH   10    /* 'V' - verbose: list everything */

   322 #define MARKUP_SWITCH    11    /* 'M' - ignore markup in < > */

   323 #define USERTYPO_SWITCH  12    /* 'U' - use file of user-defined typos */

   324 #define DP_SWITCH        13    /* 'D' - ignore DP-specific markup */

   326 long cnt_dquot;       /* for overview mode, count of doublequote queries */

   327 long cnt_squot;       /* for overview mode, count of singlequote queries */

   328 long cnt_brack;       /* for overview mode, count of brackets queries */

   329 long cnt_bin;         /* for overview mode, count of non-ASCII queries */

   330 long cnt_odd;         /* for overview mode, count of odd character queries */

   331 long cnt_long;        /* for overview mode, count of long line errors */

   332 long cnt_short;       /* for overview mode, count of short line queries */

   333 long cnt_punct;       /* for overview mode, count of punctuation and spacing queries */

   334 long cnt_dash;        /* for overview mode, count of dash-related queries */

   335 long cnt_word;        /* for overview mode, count of word queries */

   336 long cnt_html;        /* for overview mode, count of html queries */

   337 long cnt_lineend;     /* for overview mode, count of line-end queries */

   338 long cnt_spacend;     /* count of lines with space at end */

   339 long linecnt;         /* count of total lines in the file */

   340 long checked_linecnt; /* count of lines actually checked */

   342 void proghelp(void);

   343 void procfile(char *);

   345 #define LOW_THRESHOLD    0

   346 #define HIGH_THRESHOLD   1

   348 #define START 0

   349 #define END 1

   350 #define PREV 0

   351 #define NEXT 1

   352 #define FIRST_OF_PAIR 0

   353 #define SECOND_OF_PAIR 1

   355 #define MAX_WORDPAIR 1000

   357 char running_from[MAX_PATH];

   359 int mixdigit(char *);

   360 const char *getaword(const char *,char *);

   361 int matchword(char *,char *);

   362 char *flgets(char *,int,FILE *,long);

   363 void lowerit(char *);

   364 int gcisalpha(unsigned char);

   365 int gcisdigit(unsigned char);

   366 int gcisletter(unsigned char);

   367 char *gcstrchr(char *s,char c);

   368 void postprocess_for_HTML(char *);

   369 char *linehasmarkup(char *);

   370 char *losemarkup(char *);

   371 int tagcomp(char *,char *);

   372 char *loseentities(char *);

   373 int isroman(char *);

   374 int usertypo_count;

   375 void postprocess_for_DP(char *);

   377 char wrk[LINEBUFSIZE];

   379 #define MAX_QWORD 50

   380 #define MAX_QWORD_LENGTH 40

   381 char qword[MAX_QWORD][MAX_QWORD_LENGTH];

   382 signed int dupcnt[MAX_QWORD];

   384 int main(int argc,char **argv)

   385 {

   386     char *argsw,*s;

   387     int i,switno,invarg;

   388     char usertypo_file[MAX_PATH];

   389     FILE *usertypofile;

   390     if (strlen(argv[0])<sizeof(running_from))

   391 	/* save the path to the executable */

   392         strcpy(running_from,argv[0]);

   393     /* find out what directory we're running from */

   394     s=running_from+strlen(running_from);

   395     for (;*s!='/' && *s!='\\' && s>=running_from;s--)

   396         *s=0;

   397     switno=strlen(SWITCHES);

   398     for (i=switno;--i>0;)

   399         pswit[i]=0;           /* initialise switches */

   400     /*

   401      * Standard loop to extract switches.

   402      * When we come out of this loop, the arguments will be

   403      * in argv[0] upwards and the switches used will be

   404      * represented by their equivalent elements in pswit[]

   405      */

   406     while (--argc>0 && **++argv=='-')

   407         for (argsw=argv[0]+1;*argsw!='\0';argsw++)

   408             for (i=switno,invarg=1;(--i>=0) && invarg==1;)

   409                 if ((toupper(*argsw))==SWITCHES[i])

   410 		{

   411                     invarg=0;

   412                     pswit[i]=1;

   413 		}

   414     /* Paranoid checking is turned OFF, not on, by its switch */

   415     pswit[PARANOID_SWITCH]^=1;

   416     if (pswit[PARANOID_SWITCH])

   417 	/* if running in paranoid mode force typo checks as well   */

   418         pswit[TYPO_SWITCH]=pswit[TYPO_SWITCH]^1;

   419     /* Line-end checking is turned OFF, not on, by its switch */

   420     pswit[LINE_END_SWITCH]^=1;

   421     /* Echoing is turned OFF, not on, by its switch */

   422     pswit[ECHO_SWITCH]^=1;

   423     if (pswit[OVERVIEW_SWITCH])

   424 	/* just print summary; don't echo */

   425         pswit[ECHO_SWITCH]=0;

   426     /*

   427      * Web uploads - for the moment, this is really just a placeholder

   428      * until we decide what processing we really want to do on web uploads

   429      */

   430     if (pswit[WEB_SWITCH])

   431     {

   432 	/* specific override for web uploads */

   433         pswit[ECHO_SWITCH]=1;

   434         pswit[SQUOTE_SWITCH]=0;

   435         pswit[TYPO_SWITCH]=1;

   436         pswit[QPARA_SWITCH]=0;

   437         pswit[PARANOID_SWITCH]=1;

   438         pswit[LINE_END_SWITCH]=0;

   439         pswit[OVERVIEW_SWITCH]=0;

   440         pswit[STDOUT_SWITCH]=0;

   441         pswit[HEADER_SWITCH]=1;

   442         pswit[VERBOSE_SWITCH]=0;

   443         pswit[MARKUP_SWITCH]=0;

   444         pswit[USERTYPO_SWITCH]=0;

   445         pswit[DP_SWITCH]=0;

   446     }

   447     if (argc<MINARGS || argc>MAXARGS)

   448     {

   449 	/* check number of args */

   450         proghelp();

   451         return 1;

   452     }

   453     /* read in the user-defined stealth scanno list */

   454     if (pswit[USERTYPO_SWITCH])

   455     {

   456 	/* ... we were told we had one! */

   457         usertypofile=fopen(USERTYPO_FILE,"rb");

   458         if (!usertypofile)

   459 	{

   460 	    /* not in cwd. try excuteable directory. */

   461             strcpy(usertypo_file,running_from);

   462             strcat(usertypo_file,USERTYPO_FILE);

   463             usertypofile=fopen(usertypo_file,"rb");

   464             if (!usertypofile) {

   465 		/* we ain't got no user typo file! */

   466                 printf("   --> I couldn't find gutcheck.typ "

   467 		  "-- proceeding without user typos.\n");

   468 	    }

   469 	}

   470         usertypo_count=0;

   471         if (usertypofile)

   472 	{

   473 	    /* we managed to open a User Typo File! */

   474             if (pswit[USERTYPO_SWITCH])

   475 	    {

   476                 while (flgets(aline,LINEBUFSIZE-1,usertypofile,

   477 		  (long)usertypo_count))

   478 		{

   479                     if (strlen(aline)>1)

   480 		    {

   481                         if ((int)*aline>33)

   482 			{

   483                             s=malloc(strlen(aline)+1);

   484                             if (!s)

   485 			    {

   486                                 fprintf(stderr,"bookloupe: cannot get enough "

   487 				  "memory for user typo file!\n");

   488                                 exit(1);

   489 			    }

   490                             strcpy(s,aline);

   491                             usertypo[usertypo_count]=s;

   492                             usertypo_count++;

   493                             if (usertypo_count>=MAX_USER_TYPOS)

   494 			    {

   495                                 printf("   --> Only %d user-defined typos "

   496 				  "allowed: ignoring the rest\n",

   497 				  MAX_USER_TYPOS);

   498                                 break;

   499 			    }

   500 			}

   501 		    }

   502 		}

   503 	    }

   504             fclose(usertypofile);

   505 	}

   506     }

   507     fprintf(stderr,"bookloupe: Check and report on an e-text\n");

   508     cnt_dquot=cnt_squot=cnt_brack=cnt_bin=cnt_odd=cnt_long=

   509     cnt_short=cnt_punct=cnt_dash=cnt_word=cnt_html=cnt_lineend=

   510     cnt_spacend=0;

   511     procfile(argv[0]);

   512     if (pswit[OVERVIEW_SWITCH])

   513     {

   514 	printf("    Checked %ld lines of %ld (head+foot = %ld)\n\n",

   515 	  checked_linecnt,linecnt,linecnt-checked_linecnt);

   516         printf("    --------------- Queries found --------------\n");

   517         if (cnt_long)

   518 	    printf("    Long lines:                    %14ld\n",cnt_long);

   519         if (cnt_short)

   520 	    printf("    Short lines:                   %14ld\n",cnt_short);

   521         if (cnt_lineend)

   522 	    printf("    Line-end problems:             %14ld\n",cnt_lineend);

   523         if (cnt_word)

   524 	    printf("    Common typos:                  %14ld\n",cnt_word);

   525         if (cnt_dquot)

   526 	    printf("    Unmatched quotes:              %14ld\n",cnt_dquot);

   527         if (cnt_squot)

   528 	    printf("    Unmatched SingleQuotes:        %14ld\n",cnt_squot);

   529         if (cnt_brack)

   530 	    printf("    Unmatched brackets:            %14ld\n",cnt_brack);

   531         if (cnt_bin)

   532 	    printf("    Non-ASCII characters:          %14ld\n",cnt_bin);

   533         if (cnt_odd)

   534 	    printf("    Proofing characters:           %14ld\n",cnt_odd);

   535         if (cnt_punct)

   536 	    printf("    Punctuation & spacing queries: %14ld\n",cnt_punct);

   537         if (cnt_dash)

   538 	    printf("    Non-standard dashes:           %14ld\n",cnt_dash);

   539         if (cnt_html)

   540 	    printf("    Possible HTML tags:            %14ld\n",cnt_html);

   541         printf("\n");

   542         printf("    TOTAL QUERIES                  %14ld\n",

   543           cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+

   544           cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend);

   545     }

   546     return 0;

   547 }

   /*
    * Tallies collected by first_pass() while it reads the whole file.
    * Member notes describe the counting visible at the start of
    * first_pass(); members marked "not shown" are filled in further on.
    */
   struct first_pass_results {
       long firstline;            /* line after the PG header marker line */
       long astline;              /* lines with a '*' and a letter a-z */
       long footerline;           /* line where "end" precedes "project gutenberg" */
       long totlen;               /* sum of the lengths of lines before the footer */
       long binlen;               /* characters with a code above 127 */
       long alphalen;             /* characters accepted by gcisalpha() */
       long endquote_count;       /* double quotes directly after a letter */
       long shortline;            /* short lines that follow a long line */
       long dotcomma;             /* lines containing ".," */
       long fslashline;           /* lines containing '/' */
       long hyphens;              /* lines ending in a single '-' */
       long longline;             /* lines longer than LONGEST_PG_LINE */
       long verylongline;         /* lines longer than WAY_TOO_LONG */
       long htmcount;             /* lines where '<' is followed by '>' (maybe more) */
       long standalone_digit;     /* not shown -- TODO confirm meaning */
       long spacedash;            /* not shown -- TODO confirm meaning */
       long emdash;               /* not shown -- TODO confirm meaning */
       long space_emdash;         /* not shown -- TODO confirm meaning */
       long non_PG_space_emdash;  /* not shown -- TODO confirm meaning */
       long PG_space_emdash;      /* not shown -- TODO confirm meaning */
       int Dutchcount;            /* not shown -- TODO confirm meaning */
       int Frenchcount;           /* not shown -- TODO confirm meaning */
   };

   557 /*

   558  * first_pass:

   559  *

   560  * Run a first pass - verify that it's a valid PG

   561  * file, decide whether to report some things that

   562  * occur many times in the text like long or short

   563  * lines, non-standard dashes, etc.

   564  */

   565 struct first_pass_results *first_pass(FILE *infile)

   566 {

   567     char laststart=CHAR_SPACE;

   568     const char *s;

   569     signed int i,llen;

   570     unsigned int lastlen=0,lastblen=0;

   571     long spline=0,nspline=0;

   572     static struct first_pass_results results={0};

   573     char inword[MAXWORDLEN]="";

   574     while (fgets(aline,LINEBUFSIZE-1,infile))

   575     {

   576         while (aline[strlen(aline)-1]==10 || aline[strlen(aline)-1]==13)

   577 	    aline[strlen(aline)-1]=0;

   578         linecnt++;

   579         if (strstr(aline,"*END") && strstr(aline,"SMALL PRINT") &&

   580 	  (strstr(aline,"PUBLIC DOMAIN") || strstr(aline,"COPYRIGHT")))

   581 	{

   582             if (spline)

   583                 printf("   --> Duplicate header?\n");

   584             spline=linecnt+1;   /* first line of non-header text, that is */

   585 	}

   586         if (!strncmp(aline,"*** START",9) && strstr(aline,"PROJECT GUTENBERG"))

   587 	{

   588             if (nspline)

   589                 printf("   --> Duplicate header?\n");

   590             nspline=linecnt+1;   /* first line of non-header text, that is */

   591 	}

   592         if (spline || nspline)

   593 	{

   594             lowerit(aline);

   595             if (strstr(aline,"end") && strstr(aline,"project gutenberg"))

   596 	    {

   597                 if (strstr(aline,"end")<strstr(aline,"project gutenberg"))

   598 		{

   599                     if (results.footerline)

   600 		    {

   601 			/* it's an old-form header - we can detect duplicates */

   602                         if (!nspline)

   603                             printf("   --> Duplicate footer?\n");

   604 		    }

   605                     else

   606                         results.footerline=linecnt;

   607 		}

   608 	    }

   609 	}

   610         if (spline)

   611 	    results.firstline=spline;

   612         if (nspline)

   613 	    results.firstline=nspline;  /* override with new */

   614         if (results.footerline)

   615 	    continue;    /* don't count the boilerplate in the footer */

   616         llen=strlen(aline);

   617         results.totlen+=llen;

   618         for (i=0;i<llen;i++)

   619 	{

   620             if ((unsigned char)aline[i]>127)

   621 		results.binlen++;

   622             if (gcisalpha(aline[i]))

   623 		results.alphalen++;

   624             if (i>0 && aline[i]==CHAR_DQUOTE && isalpha(aline[i-1]))

   625 		results.endquote_count++;

   626 	}

   627         if (strlen(aline)>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE &&

   628 	  lastblen>2 && lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)

   629 	    results.shortline++;

   630         if (*aline && (unsigned char)aline[strlen(aline)-1]<=CHAR_SPACE)

   631 	    cnt_spacend++;

   632         if (strstr(aline,".,"))

   633 	    results.dotcomma++;

   634         /* only count ast lines for ignoring purposes where there is */

   635         /* locase text on the line */

   636         if (strstr(aline,"*"))

   637 	{

   638             for (s=aline;*s;s++)

   639                 if (*s>='a' && *s<='z')

   640                     break;

   641              if (*s)

   642 		results.astline++;

   643 	}

   644         if (strstr(aline,"/"))

   645             results.fslashline++;

   646         for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--)

   647 	    ;

   648         if (aline[i]=='-' && aline[i-1]!='-')

   649 	    results.hyphens++;

   650         if (llen>LONGEST_PG_LINE)

   651 	    results.longline++;

   652         if (llen>WAY_TOO_LONG)

   653 	    results.verylongline++;

   654         if (strstr(aline,"<") && strstr(aline,">"))

   655 	{

   656             i=(signed int)(strstr(aline,">")-strstr(aline,"<")+1);

   657             if (i>0)

   658                 results.htmcount++;

   659             if (strstr(aline,"<i>"))

   660 		results.htmcount+=4; /* bonus marks! */

   661 	}

   662         /* Check for spaced em-dashes */

   663         if (strstr(aline,"--"))

   664 	{

   665             results.emdash++;

   666             if (*(strstr(aline,"--")-1)==CHAR_SPACE ||

   667                (*(strstr(aline,"--")+2)==CHAR_SPACE))

   668 		results.space_emdash++;

   669             if (*(strstr(aline,"--")-1)==CHAR_SPACE &&

   670                (*(strstr(aline,"--")+2)==CHAR_SPACE))

   671 		/* count of em-dashes with spaces both sides */

   672 		results.non_PG_space_emdash++;

   673             if (*(strstr(aline,"--")-1)!=CHAR_SPACE &&

   674                (*(strstr(aline,"--")+2)!=CHAR_SPACE))

   675 		/* count of PG-type em-dashes with no spaces */

   676 		results.PG_space_emdash++;

   677 	}

   678         for (s=aline;*s;)

   679 	{

   680             s=getaword(s,inword);

   681             if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))

   682                 results.Dutchcount++;

   683             if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))

   684                 results.Frenchcount++;

   685             if (!strcmp(inword,"0") || !strcmp(inword,"1"))

   686                 results.standalone_digit++;

   687 	}

   688         /* Check for spaced dashes */

   689         if (strstr(aline," -") && *(strstr(aline," -")+2)!='-')

   690 	    results.spacedash++;

   691         lastblen=lastlen;

   692         lastlen=strlen(aline);

   693         laststart=aline[0];

   694     }

   695     return &results;

   696 }

   698 struct warnings {

   699     signed int shortline,longline,bin,dash,dotcomma,ast,fslash,digit,hyphen;

   700     signed int endquote,isDutch,isFrench;

   701 };

   703 /*

   704  * report_first_pass:

   705  *

   706  * Make some snap decisions based on the first pass results.

   707  */

   708 struct warnings *report_first_pass(struct first_pass_results *results)

   709 {

   710     static struct warnings warnings={0};

   711     if (cnt_spacend>0)

   712         printf("   --> %ld lines in this file have white space at end\n",

   713 	  cnt_spacend);

   714     warnings.dotcomma=1;

   715     if (results->dotcomma>5)

   716     {

   717         warnings.dotcomma=0;

   718         printf("   --> %ld lines in this file contain '.,'. "

   719 	  "Not reporting them.\n",results->dotcomma);

   720     }

   721     /*

   722      * If more than 50 lines, or one-tenth, are short,

   723      * don't bother reporting them.

   724      */

   725     warnings.shortline=1;

   726     if (results->shortline>50 || results->shortline*10>linecnt)

   727     {

   728         warnings.shortline=0;

   729         printf("   --> %ld lines in this file are short. "

   730 	  "Not reporting short lines.\n",results->shortline);

   731     }

   732     /*

   733      * If more than 50 lines, or one-tenth, are long,

   734      * don't bother reporting them.

   735      */

   736     warnings.longline=1;

   737     if (results->longline>50 || results->longline*10>linecnt)

   738     {

   739         warnings.longline=0;

   740         printf("   --> %ld lines in this file are long. "

   741 	  "Not reporting long lines.\n",results->longline);

   742     }

   743     /* If more than 10 lines contain asterisks, don't bother reporting them. */

   744     warnings.ast=1;

   745     if (results->astline>10)

   746     {

   747         warnings.ast=0;

   748         printf("   --> %ld lines in this file contain asterisks. "

   749 	  "Not reporting them.\n",results->astline);

   750     }

   751     /*

   752      * If more than 10 lines contain forward slashes,

   753      * don't bother reporting them.

   754      */

   755     warnings.fslash=1;

   756     if (results->fslashline>10)

   757     {

   758         warnings.fslash=0;

   759         printf("   --> %ld lines in this file contain forward slashes. "

   760 	  "Not reporting them.\n",results->fslashline);

   761     }

   762     /*

   763      * If more than 20 lines contain unpunctuated endquotes,

   764      * don't bother reporting them.

   765      */

   766     warnings.endquote=1;

   767     if (results->endquote_count>20)

   768     {

   769         warnings.endquote=0;

   770         printf("   --> %ld lines in this file contain unpunctuated endquotes. "

   771 	  "Not reporting them.\n",results->endquote_count);

   772     }

   773     /*

   774      * If more than 15 lines contain standalone digits,

   775      * don't bother reporting them.

   776      */

   777     warnings.digit=1;

   778     if (results->standalone_digit>10)

   779     {

   780         warnings.digit=0;

   781         printf("   --> %ld lines in this file contain standalone 0s and 1s. "

   782 	  "Not reporting them.\n",results->standalone_digit);

   783     }

   784     /*

   785      * If more than 20 lines contain hyphens at end,

   786      * don't bother reporting them.

   787      */

   788     warnings.hyphen=1;

   789     if (results->hyphens>20)

   790     {

   791         warnings.hyphen=0;

   792         printf("   --> %ld lines in this file have hyphens at end. "

   793 	  "Not reporting them.\n",results->hyphens);

   794     }

   795     if (results->htmcount>20 && !pswit[MARKUP_SWITCH])

   796     {

   797         printf("   --> Looks like this is HTML. Switching HTML mode ON.\n");

   798         pswit[MARKUP_SWITCH]=1;

   799     }

   800     if (results->verylongline>0)

   801         printf("   --> %ld lines in this file are VERY long!\n",

   802 	  results->verylongline);

   803     /*

   804      * If there are more non-PG spaced dashes than PG em-dashes,

   805      * assume it's deliberate.

   806      * Current PG guidelines say don't use them, but older texts do,

   807      * and some people insist on them whatever the guidelines say.

   808      */

   809     warnings.dash=1;

   810     if (results->spacedash+results->non_PG_space_emdash>

   811       results->PG_space_emdash)

   812     {

   813         warnings.dash=0;

   814         printf("   --> There are %ld spaced dashes and em-dashes. "

   815 	  "Not reporting them.\n",

   816 	  results->spacedash+results->non_PG_space_emdash);

   817     }

   818     /* If more than a quarter of characters are hi-bit, bug out. */

   819     warnings.bin=1;

   820     if (results->binlen*4>results->totlen)

   821     {

   822         printf("   --> This file does not appear to be ASCII. "

   823 	  "Terminating. Best of luck with it!\n");

   824         exit(1);

   825     }

   826     if (results->alphalen*4<results->totlen)

   827     {

   828         printf("   --> This file does not appear to be text. "

   829 	  "Terminating. Best of luck with it!\n");

   830         exit(1);

   831     }

   832     if (results->binlen*100>results->totlen || results->binlen>100)

   833     {

   834         printf("   --> There are a lot of foreign letters here. "

   835 	  "Not reporting them.\n");

   836         warnings.bin=0;

   837     }

   838     warnings.isDutch=0;

   839     if (results->Dutchcount>50)

   840     {

   841         warnings.isDutch=1;

   842         printf("   --> This looks like Dutch - "

   843 	  "switching off dashes and warnings for 's Middags case.\n");

   844     }

   845     warnings.isFrench=0;

   846     if (results->Frenchcount>50)

   847     {

   848         warnings.isFrench=1;

   849         printf("   --> This looks like French - "

   850 	  "switching off some doublepunct.\n");

   851     }

   852     if (results->firstline && results->footerline)

   853         printf("    The PG header and footer appear to be already on.\n");

   854     else

   855     {

   856         if (results->firstline)

   857             printf("    The PG header is on - no footer.\n");

   858         if (results->footerline)

   859             printf("    The PG footer is on - no header.\n");

   860     }

   861     printf("\n");

   862     if (pswit[VERBOSE_SWITCH])

   863     {

   864         warnings.bin=1;

   865         warnings.shortline=1;

   866         warnings.dotcomma=1;

   867         warnings.longline=1;

   868         warnings.dash=1;

   869         warnings.digit=1;

   870         warnings.ast=1;

   871         warnings.fslash=1;

   872         warnings.hyphen=1;

   873         warnings.endquote=1;

   874         printf("   *** Verbose output is ON -- you asked for it! ***\n");

   875     }

   876     if (warnings.isDutch)

   877         warnings.dash=0;

   878     if (results->footerline>0 && results->firstline>0 &&

   879       results->footerline>results->firstline &&

   880       results->footerline-results->firstline<100)

   881     {

   882         printf("   --> I don't really know where this text starts. \n");

   883         printf("       There are no reference points.\n");

   884         printf("       I'm going to have to report the header and footer "

   885 	  "as well.\n");

   886         results->firstline=0;

   887     }

   888     return &warnings;

   889 }

   891 struct counters {

   892     long quot;

   893     signed int c_unders,c_brack,s_brack,r_brack;

   894     signed int open_single_quote,close_single_quote;

   895 };

   897 /*

   898  * analyse_quotes:

   899  *

   900  * Look along the line, accumulate the count of quotes, and see

   901  * if this is an empty line - i.e. a line with nothing on it

   902  * but spaces.

   903  * If line has just spaces, period, * and/or - on it, don't

   904  * count it, since empty lines with asterisks or dashes to

   905  * separate sections are common.

   906  *

   907  * Returns: Non-zero if the line is empty.

   908  */

   909 int analyse_quotes(const char *s,struct counters *counters)

   910 {

   911     signed int guessquote=0;

   912     int isemptyline=1;    /* assume the line is empty until proven otherwise */

   913     while (*s)

   914     {

   915 	if (*s==CHAR_DQUOTE)

   916 	    counters->quot++;

   917 	if (*s==CHAR_SQUOTE || *s==CHAR_OPEN_SQUOTE)

   918 	{

   919 	    if (s==aline)

   920 	    {

   921 		/*

   922 		 * At start of line, it can only be an openquote.

   923 		 * Hardcode a very common exception!

   924 		 */

   925 		if (strncmp(s+2,"tis",3) && strncmp(s+2,"Tis",3))

   926 		    counters->open_single_quote++;

   927 	    }

   928 	    else if (gcisalpha(s[-1]) && gcisalpha(s[1]))

   929 		/* Do nothing! it's definitely an apostrophe, not a quote */

   930 		;

   931 	    /* it's outside a word - let's check it out */

   932 	    else if (*s==CHAR_OPEN_SQUOTE || gcisalpha(s[1]))

   933 	    {

   934 		/* it damwell better BE an openquote */

   935 		if (strncmp(s+1,"tis",3) && strncmp(s+1,"Tis",3))

   936 		    /* hardcode a very common exception! */

   937 		    counters->open_single_quote++;

   938 	    }

   939 	    else

   940 	    {

   941 		/* now - is it a closequote? */

   942 		guessquote=0;   /* accumulate clues */

   943 		if (gcisalpha(s[-1]))

   944 		{

   945 		    /* it follows a letter - could be either */

   946 		    guessquote++;

   947 		    if (s[-1]=='s')

   948 		    {

   949 			/* looks like a plural apostrophe */

   950 			guessquote-=3;

   951 			if (s[1]==CHAR_SPACE)  /* bonus marks! */

   952 			    guessquote-=2;

   953 		    }

   954 		}

   955 		/* it doesn't have a letter either side */

   956 		else if (strchr(".?!,;:",s[-1]) && strchr(".?!,;: ",s[1]))

   957 		    guessquote+=8; /* looks like a closequote */

   958 		else

   959 		    guessquote++;

   960 		if (counters->open_single_quote>counters->close_single_quote)

   961 		    /*

   962 		     * Give it the benefit of some doubt,

   963 		     * if a squote is already open.

   964 		     */

   965 		    guessquote++;

   966 		else

   967 		    guessquote--;

   968 		if (guessquote>=0)

   969 		    counters->close_single_quote++;

   970 	    }

   971 	}

   972 	if (*s!=CHAR_SPACE && *s!='-' && *s!='.' && *s!=CHAR_ASTERISK &&

   973 	  *s!=13 && *s!=10)

   974 	    isemptyline=0;  /* ignore lines like  *  *  *  as spacers */

   975 	if (*s==CHAR_UNDERSCORE)

   976 	    counters->c_unders++;

   977 	if (*s==CHAR_OPEN_CBRACK)

   978 	    counters->c_brack++;

   979 	if (*s==CHAR_CLOSE_CBRACK)

   980 	    counters->c_brack--;

   981 	if (*s==CHAR_OPEN_RBRACK)

   982 	    counters->r_brack++;

   983 	if (*s==CHAR_CLOSE_RBRACK)

   984 	    counters->r_brack--;

   985 	if (*s==CHAR_OPEN_SBRACK)

   986 	    counters->s_brack++;

   987 	if (*s==CHAR_CLOSE_SBRACK)

   988 	    counters->s_brack--;

   989 	s++;

   990     }

   991     return isemptyline;

   992 }

   994 /*

   995  * check_for_control_characters:

   996  *

   997  * Check for invalid or questionable characters in the line

   998  * Anything above 127 is invalid for plain ASCII, and

   999  * non-printable control characters should also be flagged.

  1000  * Tabs should generally not be there.

  1001  */

  1002 void check_for_control_characters(const char *aline)

  1003 {

  1004     unsigned char c;

  1005     const char *s;

  1006     for (s=aline;*s;s++)

  1007     {

  1008 	c=*(unsigned char *)s;

  1009 	if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)

  1010 	{

  1011 	    if (pswit[ECHO_SWITCH])

  1012 		printf("\n%s\n",aline);

  1013 	    if (!pswit[OVERVIEW_SWITCH])

  1014 		printf("    Line %ld column %d - Control character %d\n",

  1015 		  linecnt,(int)(s-aline)+1,c);

  1016 	    else

  1017 		cnt_bin++;

  1018 	}

  1019     }

  1020 }

  1022 /*

  1023  * check_for_odd_characters:

  1024  *

  1025  * Check for binary and other odd characters.

  1026  */

  1027 void check_for_odd_characters(const char *aline,const struct warnings *warnings,

  1028   int isemptyline)

  1029 {

  1030     /* Don't repeat multiple warnings on one line. */

  1031     signed int eNon_A=0,eTab=0,eTilde=0,eCarat=0,eFSlash=0,eAst=0;

  1032     const char *s;

  1033     unsigned char c;

  1034     for (s=aline;*s;s++)

  1035     {

  1036 	c=*(unsigned char *)s;

  1037 	if (!eNon_A && (*s<CHAR_SPACE && *s!=9 && *s!='\n' || c>127))

  1038 	{

  1039 	    if (pswit[ECHO_SWITCH])

  1040 		printf("\n%s\n",aline);

  1041 	    if (!pswit[OVERVIEW_SWITCH])

  1042 		if (c>127 && c<160)

  1043 		    printf("    Line %ld column %d - "

  1044 		      "Non-ISO-8859 character %d\n",linecnt,(int)(s-aline)+1,c);

  1045 		else

  1046 		    printf("    Line %ld column %d - Non-ASCII character %d\n",

  1047 		      linecnt,(int)(s-aline)+1,c);

  1048 	    else

  1049 		cnt_bin++;

  1050 	    eNon_A=1;

  1051 	}

  1052 	if (!eTab && *s==CHAR_TAB)

  1053 	{

  1054 	    if (pswit[ECHO_SWITCH])

  1055 		printf("\n%s\n",aline);

  1056 	    if (!pswit[OVERVIEW_SWITCH])

  1057 		printf("    Line %ld column %d - Tab character?\n",

  1058 		  linecnt,(int)(s-aline)+1);

  1059 	    else

  1060 		cnt_odd++;

  1061 	    eTab=1;

  1062 	}

  1063 	if (!eTilde && *s==CHAR_TILDE)

  1064 	{

  1065 	    /*

  1066 	     * Often used by OCR software to indicate an

  1067 	     * unrecognizable character.

  1068 	     */

  1069 	    if (pswit[ECHO_SWITCH])

  1070 		printf("\n%s\n",aline);

  1071 	    if (!pswit[OVERVIEW_SWITCH])

  1072 		printf("    Line %ld column %d - Tilde character?\n",

  1073 		  linecnt,(int)(s-aline)+1);

  1074 	    else

  1075 		cnt_odd++;

  1076 	    eTilde=1;

  1077 	}

  1078 	if (!eCarat && *s==CHAR_CARAT)

  1079 	{

  1080 	    if (pswit[ECHO_SWITCH])

  1081 		printf("\n%s\n",aline);

  1082 	    if (!pswit[OVERVIEW_SWITCH])

  1083 		printf("    Line %ld column %d - Carat character?\n",

  1084 		  linecnt,(int)(s-aline)+1);

  1085 	    else

  1086 		cnt_odd++;

  1087 	    eCarat=1;

  1088 	}

  1089 	if (!eFSlash && *s==CHAR_FORESLASH && warnings->fslash)

  1090 	{

  1091 	    if (pswit[ECHO_SWITCH])

  1092 		printf("\n%s\n",aline);

  1093 	    if (!pswit[OVERVIEW_SWITCH])

  1094 		printf("    Line %ld column %d - Forward slash?\n",

  1095 		  linecnt,(int)(s-aline)+1);

  1096 	    else

  1097 		cnt_odd++;

  1098 	    eFSlash=1;

  1099 	}

  1100 	/*

  1101 	 * Report asterisks only in paranoid mode,

  1102 	 * since they're often deliberate.

  1103 	 */

  1104 	if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&

  1105 	  *s==CHAR_ASTERISK)

  1106 	{

  1107 	    if (pswit[ECHO_SWITCH])

  1108 		printf("\n%s\n",aline);

  1109 	    if (!pswit[OVERVIEW_SWITCH])

  1110 		printf("    Line %ld column %d - Asterisk?\n",

  1111 		  linecnt,(int)(s-aline)+1);

  1112 	    else

  1113 		cnt_odd++;

  1114 	    eAst=1;

  1115 	}

  1116     }

  1117 }

  1119 /*

  1120  * check_for_long_line:

  1121  *

  1122  * Check for line too long.

  1123  */

  1124 void check_for_long_line(const char *aline)

  1125 {

  1126     if (strlen(aline)>LONGEST_PG_LINE)

  1127     {

  1128 	if (pswit[ECHO_SWITCH])

  1129 	    printf("\n%s\n",aline);

  1130 	if (!pswit[OVERVIEW_SWITCH])

  1131 	    printf("    Line %ld column %d - Long line %d\n",

  1132 	      linecnt,strlen(aline),strlen(aline));

  1133 	else

  1134 	    cnt_long++;

  1135     }

  1136 }

  1138 struct line_properties {

  1139     unsigned int len,blen;

  1140     char start;

  1141 };

  1143 /*

  1144  * check_for_short_line:

  1145  *

  1146  * Check for line too short.

  1147  *

  1148  * This one is a bit trickier to implement: we don't want to

  1149  * flag the last line of a paragraph for being short, so we

  1150  * have to wait until we know that our current line is a

  1151  * "normal" line, then report the _previous_ line if it was too

  1152  * short. We also don't want to report indented lines like

  1153  * chapter heads or formatted quotations. We therefore keep

  1154  * last->len as the length of the last line examined, and

  1155  * last->blen as the length of the last but one, and try to

  1156  * suppress unnecessary warnings by checking that both were of

  1157  * "normal" length. We keep the first character of the last

  1158  * line in last->start, and if it was a space, we assume that

  1159  * the formatting is deliberate. I can't figure out a way to

  1160  * distinguish something like a quoted verse left-aligned or

  1161  * the header or footer of a letter from a paragraph of short

  1162  * lines - maybe if I examined the whole paragraph, and if the

  1163  * para has less than, say, 8 lines and if all lines are short,

  1164  * then just assume it's OK? Need to look at some texts to see

  1165  * how often a formula like this would get the right result.

  1166  */

  1167 void check_for_short_line(const char *aline,const struct line_properties *last)

  1168 {

  1169     if (strlen(aline)>1 && last->len>1 && last->len<SHORTEST_PG_LINE &&

  1170       last->blen>1 && last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)

  1171     {

  1172 	if (pswit[ECHO_SWITCH])

  1173 	    printf("\n%s\n",prevline);

  1174 	if (!pswit[OVERVIEW_SWITCH])

  1175 	    printf("    Line %ld column %d - Short line %d?\n",

  1176 	      linecnt-1,strlen(prevline),strlen(prevline));

  1177 	else

  1178 	    cnt_short++;

  1179     }

  1180 }

  1182 /*

  1183  * check_for_starting_punctuation:

  1184  *

  1185  * Look for punctuation other than full ellipses at start of line.

  1186  */

  1187 void check_for_starting_punctuation(const char *aline)

  1188 {

  1189     if (*aline && strchr(".?!,;:",aline[0]) && strncmp(". . .",aline,5))

  1190     {

  1191 	if (pswit[ECHO_SWITCH])

  1192 	    printf("\n%s\n",aline);

  1193 	if (!pswit[OVERVIEW_SWITCH])

  1194 	    printf("    Line %ld column 1 - Begins with punctuation?\n",

  1195 	      linecnt);

  1196 	else

  1197 	    cnt_punct++;

  1198     }

  1199 }

  1201 /*

  1202  * check_for_spaced_emdash:

  1203  *

  1204  * Check for spaced em-dashes.

  1205  *

  1206  * We must check _all_ occurrences of "--" on the line

  1207  * hence the loop - even if the first double-dash is OK

  1208  * there may be another that's wrong later on.

  1209  */

  1210 void check_for_spaced_emdash(const char *aline)

  1211 {

  1212     const char *s,*t;

  1213     s=aline;

  1214     while ((t=strstr(s,"--")))

  1215     {

  1216 	if (t>aline && t[-1]==CHAR_SPACE || t[2]==CHAR_SPACE)

  1217 	{

  1218 	    if (pswit[ECHO_SWITCH])

  1219 		printf("\n%s\n",aline);

  1220 	    if (!pswit[OVERVIEW_SWITCH])

  1221 		printf("    Line %ld column %d - Spaced em-dash?\n",

  1222 		  linecnt,(int)(t-aline)+1);

  1223 	    else

  1224 		cnt_dash++;

  1225 	}

  1226 	s=t+2;

  1227     }

  1228 }

  1230 /*

  1231  * check_for_spaced_dash:

  1232  *

  1233  * Check for spaced dashes.

  1234  */

  1235 void check_for_spaced_dash(const char *aline)

  1236 {

  1237     const char *s;

  1238     if ((s=strstr(aline," -")))

  1239     {

  1240 	if (s[2]!='-')

  1241 	{

  1242 	    if (pswit[ECHO_SWITCH])

  1243 		printf("\n%s\n",aline);

  1244 	    if (!pswit[OVERVIEW_SWITCH])

  1245 		printf("    Line %ld column %d - Spaced dash?\n",

  1246 		  linecnt,(int)(s-aline)+1);

  1247 	    else

  1248 		cnt_dash++;

  1249 	}

  1250     }

  1251     else if ((s=strstr(aline,"- ")))

  1252     {

  1253 	if (s==aline || s[-1]!='-')

  1254 	{

  1255 	    if (pswit[ECHO_SWITCH])

  1256 		printf("\n%s\n",aline);

  1257 	    if (!pswit[OVERVIEW_SWITCH])

  1258 		printf("    Line %ld column %d - Spaced dash?\n",

  1259 		  linecnt,(int)(s-aline)+1);

  1260 	    else

  1261 		cnt_dash++;

  1262 	}

  1263     }

  1264 }

  1266 /*

  1267  * check_for_unmarked_paragraphs:

  1268  *

  1269  * Check for unmarked paragraphs indicated by separate speakers.

  1270  *

  1271  * May well be false positive:

  1272  * "Bravo!" "Wonderful!" called the crowd.

  1273  * but useful all the same.

  1274  */

  1275 void check_for_unmarked_paragraphs(const char *aline)

  1276 {

  1277     const char *s;

  1278     s=strstr(aline,"\"  \"");

  1279     if (!s)

  1280 	s=strstr(aline,"\" \"");

  1281     if (s)

  1282     {

  1283 	if (pswit[ECHO_SWITCH])

  1284 	    printf("\n%s\n",aline);

  1285 	if (!pswit[OVERVIEW_SWITCH])

  1286 	    printf("    Line %ld column %d - Query missing paragraph break?\n",

  1287 	      linecnt,(int)(s-aline)+1);

  1288 	else

  1289 	    cnt_punct++;

  1290     }

  1291 }

  1293 /*

  1294  * check_for_jeebies:

  1295  *

  1296  * Check for "to he" and other easy h/b errors.

  1297  *

  1298  * This is a very inadequate effort on the h/b problem,

  1299  * but the phrase "to he" is always an error, whereas "to

  1300  * be" is quite common.

  1301  * Similarly, '"Quiet!", be said.' is a non-be error

  1302  * "to he" is _not_ always an error!:

  1303  *       "Where they went to he couldn't say."

  1304  * Another false positive:

  1305  *       What would "Cinderella" be without the . . .

  1306  * and another: "If he wants to he can see for himself."

  1307  */

  1308 void check_for_jeebies(const char *aline)

  1309 {

  1310     const char *s;

  1311     s=strstr(aline," be could ");

  1312     if (!s)

  1313 	s=strstr(aline," be would ");

  1314     if (!s)

  1315 	s=strstr(aline," was be ");

  1316     if (!s)

  1317 	s=strstr(aline," be is ");

  1318     if (!s)

  1319 	s=strstr(aline," is be ");

  1320     if (!s)

  1321 	s=strstr(aline,"\", be ");

  1322     if (!s)

  1323 	s=strstr(aline,"\" be ");

  1324     if (!s)

  1325 	s=strstr(aline,"\" be ");

  1326     if (!s)

  1327 	s=strstr(aline," to he ");

  1328     if (s)

  1329     {

  1330 	if (pswit[ECHO_SWITCH])

  1331 	    printf("\n%s\n",aline);

  1332 	if (!pswit[OVERVIEW_SWITCH])

  1333 	    printf("    Line %ld column %d - Query he/be error?\n",

  1334 	      linecnt,(int)(s-aline)+1);

  1335 	else

  1336 	    cnt_word++;

  1337     }

  1338     s=strstr(aline," the had ");

  1339     if (!s)

  1340 	s=strstr(aline," a had ");

  1341     if (!s)

  1342 	s=strstr(aline," they bad ");

  1343     if (!s)

  1344 	s=strstr(aline," she bad ");

  1345     if (!s)

  1346 	s=strstr(aline," he bad ");

  1347     if (!s)

  1348 	s=strstr(aline," you bad ");

  1349     if (!s)

  1350 	s=strstr(aline," i bad ");

  1351     if (s)

  1352     {

  1353 	if (pswit[ECHO_SWITCH])

  1354 	    printf("\n%s\n",aline);

  1355 	if (!pswit[OVERVIEW_SWITCH])

  1356 	    printf("    Line %ld column %d - Query had/bad error?\n",

  1357 	      linecnt,(int)(s-aline)+1);

  1358 	else

  1359 	    cnt_word++;

  1360     }

  1361     s=strstr(aline,"; hut ");

  1362     if (!s)

  1363 	s=strstr(aline,", hut ");

  1364     if (s)

  1365     {

  1366 	if (pswit[ECHO_SWITCH])

  1367 	    printf("\n%s\n",aline);

  1368 	if (!pswit[OVERVIEW_SWITCH])

  1369 	    printf("    Line %ld column %d - Query hut/but error?\n",

  1370 	      linecnt,(int)(s-aline)+1);

  1371 	else

  1372 	    cnt_word++;

  1373     }

  1374 }

  1376 /*

  1377  * check_for_mta_from:

  1378  *

  1379  * Special case - angled bracket in front of "From" placed there by an

  1380  * MTA when sending an e-mail.

  1381  */

  1382 void check_for_mta_from(const char *aline)

  1383 {

  1384     const char *s;

  1385     s=strstr(aline,">From");

  1386     if (s)

  1387     {

  1388 	if (pswit[ECHO_SWITCH])

  1389 	    printf("\n%s\n",aline);

  1390 	if (!pswit[OVERVIEW_SWITCH])

  1391 	    printf("    Line %ld column %d - Query angled bracket with From\n",

  1392 	      linecnt,(int)(s-aline)+1);

  1393 	else

  1394 	    cnt_punct++;

  1395     }

  1396 }

  1398 /*

  1399  * check_for_orphan_character:

  1400  *

  1401  * Check for a single character line -

  1402  * often an overflow from bad wrapping.

  1403  */

  1404 void check_for_orphan_character(const char *aline)

  1405 {

  1406     if (*aline && !aline[1])

  1407     {

  1408 	if (*aline=='I' || *aline=='V' || *aline=='X' || *aline=='L' ||

  1409 	  gcisdigit(*aline))

  1410 	    ; /* Nothing - ignore numerals alone on a line. */

  1411 	else

  1412 	{

  1413 	    if (pswit[ECHO_SWITCH])

  1414 		printf("\n%s\n",aline);

  1415 	    if (!pswit[OVERVIEW_SWITCH])

  1416 		printf("    Line %ld column 1 - Query single character line\n",

  1417 		  linecnt);

  1418 	    else

  1419 		cnt_punct++;

  1420 	}

  1421     }

  1422 }

  1424 /*

  1425  * check_for_pling_scanno:

  1426  *

  1427  * Check for I" - often should be !

  1428  */

  1429 void check_for_pling_scanno(const char *aline)

  1430 {

  1431     const char *s;

  1432     s=strstr(aline," I\"");

  1433     if (s)

  1434     {

  1435 	if (pswit[ECHO_SWITCH])

  1436 	    printf("\n%s\n",aline);

  1437 	if (!pswit[OVERVIEW_SWITCH])

  1438 	    printf("    Line %ld column %ld - Query I=exclamation mark?\n",

  1439 	      linecnt,s-aline);

  1440 	else

  1441 	    cnt_punct++;

  1442     }

  1443 }

  1445 /*

  1446  * check_for_extra_period:

  1447  *

  1448  * Check for period without a capital letter. Cut-down from gutspell.

  1449  * Only works when it happens on a single line.

  1450  */

  1451 void check_for_extra_period(const char *aline,const struct warnings *warnings)

  1452 {

  1453     const char *s,*t,*s1;

  1454     signed int i,istypo,isdup;

  1455     static char qperiod[MAX_QWORD][MAX_QWORD_LENGTH];

  1456     static int qperiod_index=0;

  1457     char testword[MAXWORDLEN]="";

  1458     if (pswit[PARANOID_SWITCH])

  1459     {

  1460 	for (t=s=aline;strstr(t,". ");)

  1461 	{

  1462 	    t=strstr(t,". ");

  1463 	    if (t==s)

  1464 	    {

  1465 		t++;

  1466 		/* start of line punctuation is handled elsewhere */

  1467 		continue;

  1468 	    }

  1469 	    if (!gcisalpha(t[-1]))

  1470 	    {

  1471 		t++;

  1472 		continue;

  1473 	    }

  1474 	    if (warnings->isDutch)

  1475 	    {

  1476 		/* For Frank & Jeroen -- 's Middags case */

  1477 		if (t[2]==CHAR_SQUOTE && t[3]>='a' && t[3]<='z' &&

  1478 		  t[4]==CHAR_SPACE && t[5]>='A' && t[5]<='Z')

  1479 		{

  1480 		    t++;

  1481 		    continue;

  1482 		}

  1483 	    }

  1484 	    s1=t+2;

  1485 	    while (*s1 && !gcisalpha(*s1) && !isdigit(*s1))

  1486 		s1++;

  1487 	    if (*s1>='a' && *s1<='z')

  1488 	    {

  1489 		/* we have something to investigate */

  1490 		istypo=1;

  1491 		/* so let's go back and find out */

  1492 		for (s1=t-1;s1>=s &&

  1493 		  (gcisalpha(*s1) || gcisdigit(*s1) || *s1==CHAR_SQUOTE &&

  1494 		  gcisalpha(s1[1]) && gcisalpha(s1[-1]));s1--)

  1495 		    ;

  1496 		s1++;

  1497 		for (i=0;*s1 && *s1!='.';s1++,i++)

  1498 		    testword[i]=*s1;

  1499 		testword[i]=0;

  1500 		for (i=0;*abbrev[i];i++)

  1501 		    if (!strcmp(testword,abbrev[i]))

  1502 			istypo=0;

  1503 		if (gcisdigit(*testword))

  1504 		    istypo=0;

  1505 		if (!testword[1])

  1506 		    istypo=0;

  1507 		if (isroman(testword))

  1508 		    istypo=0;

  1509 		if (istypo)

  1510 		{

  1511 		    istypo=0;

  1512 		    for (i=0;testword[i];i++)

  1513 			if (strchr(vowels,testword[i]))

  1514 			    istypo=1;

  1515 		}

  1516 		if (istypo)

  1517 		{

  1518 		    isdup=0;

  1519 		    if (strlen(testword)<MAX_QWORD_LENGTH &&

  1520 		      !pswit[VERBOSE_SWITCH])

  1521 			for (i=0;i<qperiod_index;i++)

  1522 			    if (!strcmp(testword,qperiod[i]))

  1523 				isdup=1;

  1524 		    if (!isdup)

  1525 		    {

  1526 			if (qperiod_index<MAX_QWORD &&

  1527 			  strlen(testword)<MAX_QWORD_LENGTH)

  1528 			{

  1529 			    strcpy(qperiod[qperiod_index],testword);

  1530 			    qperiod_index++;

  1531 			}

  1532 			if (pswit[ECHO_SWITCH])

  1533 			    printf("\n%s\n",aline);

  1534 			if (!pswit[OVERVIEW_SWITCH])

  1535 			    printf("    Line %ld column %d - Extra period?\n",

  1536 			      linecnt,(int)(t-aline)+1);

  1537 			else

  1538 			    cnt_punct++;

  1539 		    }

  1540 		}

  1541 	    }

  1542 	    t++;

  1543 	}

  1544     }

  1545 }

  1547 /*

  1548  * check_for_following_punctuation:

  1549  *

  1550  * Check for words usually not followed by punctuation.

  1551  */

  1552 void check_for_following_punctuation(const char *aline)

  1553 {

  1554     int i;

  1555     const char *s,*wordstart;

  1556     char inword[MAXWORDLEN];

  1557     if (pswit[TYPO_SWITCH])

  1558     {

  1559 	for (s=aline;*s;)

  1560 	{

  1561 	    wordstart=s;

  1562 	    s=getaword(s,inword);

  1563 	    if (!*inword)

  1564 		continue;

  1565 	    lowerit(inword);

  1566 	    for (i=0;*nocomma[i];i++)

  1567 		if (!strcmp(inword,nocomma[i]))

  1568 		{

  1569 		    if (*s==',' || *s==';' || *s==':')

  1570 		    {

  1571 			if (pswit[ECHO_SWITCH])

  1572 			    printf("\n%s\n",aline);

  1573 			if (!pswit[OVERVIEW_SWITCH])

  1574 			    printf("    Line %ld column %d - "

  1575 			      "Query punctuation after %s?\n",

  1576 			      linecnt,(int)(s-aline)+1,inword);

  1577 			else

  1578 			    cnt_punct++;

  1579 		    }

  1580 		}

  1581 	    for (i=0;*noperiod[i];i++)

  1582 		if (!strcmp(inword,noperiod[i]))

  1583 		{

  1584 		    if (*s=='.' || *s=='!')

  1585 		    {

  1586 			if (pswit[ECHO_SWITCH])

  1587 			    printf("\n%s\n",aline);

  1588 			if (!pswit[OVERVIEW_SWITCH])

  1589 			    printf("    Line %ld column %d - "

  1590 			      "Query punctuation after %s?\n",

  1591 			      linecnt,(int)(s-aline)+1,inword);

  1592 			else

  1593 			    cnt_punct++;

  1594 		    }

  1595 		}

  1596 	}

  1597     }

  1598 }

  1600 /*

  1601  * check_for_typos:

  1602  *

  1603  * Check for commonly mistyped words,

  1604  * and digits like 0 for O in a word.

  1605  */

  1606 void check_for_typos(const char *aline,struct warnings *warnings)

  1607 {

  1608     const char *s,*wordstart;

  1609     char inword[MAXWORDLEN],testword[MAXWORDLEN];

  1610     int i,istypo,isdup,alower,vowel,consonant;

  1611     static int qword_index=0;

  1612     for (s=aline;*s;)

  1613     {

  1614 	wordstart=s;

  1615 	s=getaword(s,inword);

  1616 	if (!*inword)

  1617 	    continue; /* don't bother with empty lines */

  1618 	if (mixdigit(inword))

  1619 	{

  1620 	    if (pswit[ECHO_SWITCH])

  1621 		printf("\n%s\n",aline);

  1622 	    if (!pswit[OVERVIEW_SWITCH])

  1623 		printf("    Line %ld column %d - Query digit in %s\n",

  1624 		  linecnt,(int)(wordstart-aline)+1,inword);

  1625 	    else

  1626 		cnt_word++;

  1627 	}

  1628 	/*

  1629 	 * Put the word through a series of tests for likely typos and OCR

  1630 	 * errors.

  1631 	 */

  1632 	if (pswit[TYPO_SWITCH])

  1633 	{

  1634 	    istypo=0;

  1635 	    strcpy(testword,inword);

  1636 	    alower=0;

  1637 	    for (i=0;i<(signed int)strlen(testword);i++)

  1638 	    {

  1639 		/* lowercase for testing */

  1640 		if (testword[i]>='a' && testword[i]<='z')

  1641 		    alower=1;

  1642 		if (alower && testword[i]>='A' && testword[i]<='Z')

  1643 		{

  1644 		    /*

  1645 		     * We have an uppercase mid-word. However, there are

  1646 		     * common cases:

  1647 		     *   Mac and Mc like McGill

  1648 		     *   French contractions like l'Abbe

  1649 		     */

  1650 		    if (i==2 && testword[0]=='m' && testword[1]=='c' ||

  1651 		      i==3 && testword[0]=='m' && testword[1]=='a' &&

  1652 		      testword[2]=='c' || i>0 && testword[i-1]==CHAR_SQUOTE)

  1653 			; /* do nothing! */

  1654 		    else

  1655 			istypo=1;

  1656 		}

  1657 		testword[i]=(char)tolower(testword[i]);

  1658 	    }

  1659 	    /*

  1660 	     * Check for certain unlikely two-letter combinations at word

  1661 	     * start and end.

  1662 	     */

  1663 	    if (strlen(testword)>1)

  1664 	    {

  1665 		for (i=0;*nostart[i];i++)

  1666 		    if (!strncmp(testword,nostart[i],2))

  1667 			istypo=1;

  1668 		for (i=0;*noend[i];i++)

  1669 		    if (!strncmp(testword+strlen(testword)-2,noend[i],2))

  1670 			istypo=1;

  1671 	    }

  1672 	    /* ght is common, gbt never. Like that. */

  1673 	    if (strstr(testword,"cb"))

  1674 		istypo=1;

  1675 	    if (strstr(testword,"gbt"))

  1676 		istypo=1;

  1677 	    if (strstr(testword,"pbt"))

  1678 		istypo=1;

  1679 	    if (strstr(testword,"tbs"))

  1680 		istypo=1;

  1681 	    if (strstr(testword,"mrn"))

  1682 		istypo=1;

  1683 	    if (strstr(testword,"ahle"))

  1684 		istypo=1;

  1685 	    if (strstr(testword,"ihle"))

  1686 		istypo=1;

  1687 	    /*

  1688 	     * "TBE" does happen - like HEARTBEAT - but uncommon.

  1689 	     * Also "TBI" - frostbite, outbid - but uncommon.

  1690 	     * Similarly "ii" like Hawaii, or Pompeii, and in Roman

  1691 	     * numerals, but "ii" is a common scanno.

  1692 	     */

  1693 	    if (strstr(testword,"tbi"))

  1694 		istypo=1;

  1695 	    if (strstr(testword,"tbe"))

  1696 		istypo=1;

  1697 	    if (strstr(testword,"ii"))

  1698 		istypo=1;

  1699 	    /*

  1700 	     * Check for no vowels or no consonants.

  1701 	     * If none, flag a typo.

  1702 	     */

  1703 	    if (!istypo && strlen(testword)>1)

  1704 	    {

  1705 		vowel=consonant=0;

  1706 		for (i=0;testword[i];i++)

  1707 		{

  1708 		    if (testword[i]=='y' || gcisdigit(testword[i]))

  1709 		    {

  1710 			/* Yah, this is loose. */

  1711 			vowel++;

  1712 			consonant++;

  1713 		    }

  1714 		    else if (strchr(vowels,testword[i]))

  1715 			vowel++;

  1716 		    else

  1717 			consonant++;

  1718 		}

  1719 		if (!vowel || !consonant)

  1720 		    istypo=1;

  1721 	    }

  1722 	    /*

  1723 	     * Now exclude the word from being reported if it's in

  1724 	     * the okword list.

  1725 	     */

  1726 	    for (i=0;*okword[i];i++)

  1727 		if (!strcmp(testword,okword[i]))

  1728 		    istypo=0;

  1729 	    /*

  1730 	     * What looks like a typo may be a Roman numeral.

  1731 	     * Exclude these.

  1732 	     */

  1733 	    if (istypo && isroman(testword))

  1734 		istypo=0;

  1735 	    /* Check the manual list of typos. */

  1736 	    if (!istypo)

  1737 		for (i=0;*typo[i];i++)

  1738 		    if (!strcmp(testword,typo[i]))

  1739 			istypo=1;

  1740 	    /*

  1741 	     * Check lowercase s, l, i and m - special cases.

  1742 	     *   "j" - often a semi-colon gone wrong.

  1743 	     *   "d" for a missing apostrophe - he d

  1744 	     *   "n" for "in"

  1745 	     */

  1746 	    if (!istypo && strlen(testword)==1 && strchr("slmijdn",*inword))

  1747 		istypo=1;

  1748 	    if (istypo)

  1749 	    {

  1750 		isdup=0;

  1751 		if (strlen(testword)<MAX_QWORD_LENGTH &&

  1752 		  !pswit[VERBOSE_SWITCH])

  1753 		    for (i=0;i<qword_index;i++)

  1754 			if (!strcmp(testword,qword[i]))

  1755 			{

  1756 			    isdup=1;

  1757 			    ++dupcnt[i];

  1758 			}

  1759 		if (!isdup)

  1760 		{

  1761 		    if (qword_index<MAX_QWORD &&

  1762 		      strlen(testword)<MAX_QWORD_LENGTH)

  1763 		    {

  1764 			strcpy(qword[qword_index],testword);

  1765 			qword_index++;

  1766 		    }

  1767 		    if (pswit[ECHO_SWITCH])

  1768 			printf("\n%s\n",aline);

  1769 		    if (!pswit[OVERVIEW_SWITCH])

  1770 		    {

  1771 			printf("    Line %ld column %d - Query word %s",

  1772 			  linecnt,(int)(wordstart-aline)+1,inword);

  1773 			if (strlen(testword)<MAX_QWORD_LENGTH &&

  1774 			  !pswit[VERBOSE_SWITCH])

  1775 			    printf(" - not reporting duplicates");

  1776 			printf("\n");

  1777 		    }

  1778 		    else

  1779 			cnt_word++;

  1780 		}

  1781 	    }

  1782 	}

  1783 	/* check the user's list of typos */

  1784 	if (!istypo && usertypo_count)

  1785 	    for (i=0;i<usertypo_count;i++)

  1786 		if (!strcmp(testword,usertypo[i]))

  1787 		{

  1788 		    if (pswit[ECHO_SWITCH])

  1789 			printf("\n%s\n",aline);

  1790 		    if (!pswit[OVERVIEW_SWITCH])

  1791 			printf("    Line %ld column %d - "

  1792 			  "Query possible scanno %s\n",

  1793 			  linecnt,(int)(wordstart-aline)+2,inword);

  1794 		}

  1795 	if (pswit[PARANOID_SWITCH] && warnings->digit)

  1796 	{

  1797 	    /* In paranoid mode, query all 0 and 1 standing alone. */

  1798 	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))

  1799 	    {

  1800 		if (pswit[ECHO_SWITCH])

  1801 		    printf("\n%s\n",aline);

  1802 		if (!pswit[OVERVIEW_SWITCH])

  1803 		    printf("    Line %ld column %d - Query standalone %s\n",

  1804 		      linecnt,(int)(wordstart-aline)+2,inword);

  1805 		else

  1806 		    cnt_word++;

  1807 	    }

  1808 	}

  1809     }

  1810 }

  1812 struct parities {

  1813     int dquote,squote;

  1814 };

  1816 /*

  1817  * check_for_misspaced_punctuation:

  1818  *

  1819  * Look for added or missing spaces around punctuation and quotes.

  1820  * If there is a punctuation character like ! with no space on

  1821  * either side, suspect a missing!space. If there are spaces on

  1822  * both sides , assume a typo. If we see a double quote with no

  1823  * space or punctuation on either side of it, assume unspaced

  1824  * quotes "like"this.

  1825  */

  1826 void check_for_misspaced_punctuation(const char *aline,

  1827   struct parities *parities,int isemptyline)

  1828 {

  1829     int i,llen,isacro,isellipsis;

  1830     const char *s;

  1831     llen=strlen(aline);

  1832     for (i=1;i<llen;i++)

  1833     {

  1834 	/* For each character in the line after the first. */

  1835 	if (strchr(".?!,;:_",aline[i]))  /* if it's punctuation */

  1836 	{

  1837 	    /* we need to suppress warnings for acronyms like M.D. */

  1838 	    isacro=0;

  1839 	    /* we need to suppress warnings for ellipsis . . . */

  1840 	    isellipsis=0;

  1841 	    /* if there are letters on both sides of it or ... */

  1842 	    if (gcisalpha(aline[i-1]) && gcisalpha(aline[i+1]) ||

  1843 	       gcisalpha(aline[i+1]) && strchr("?!,;:",aline[i]))

  1844 	    {

  1845 		/* ...if it's strict punctuation followed by an alpha */

  1846 		if (aline[i]=='.')

  1847 		{

  1848 		    if (i>2 && aline[i-2]=='.')

  1849 			isacro=1;

  1850 		    if (i+2<llen && aline[i+2]=='.')

  1851 			isacro=1;

  1852 		}

  1853 		if (!isacro)

  1854 		{

  1855 		    if (pswit[ECHO_SWITCH])

  1856 			printf("\n%s\n",aline);

  1857 		    if (!pswit[OVERVIEW_SWITCH])

  1858 			printf("    Line %ld column %d - Missing space?\n",

  1859 			  linecnt,i+1);

  1860 		    else

  1861 			cnt_punct++;

  1862 		}

  1863 	    }

  1864 	    if (aline[i-1]==CHAR_SPACE &&

  1865 	      (aline[i+1]==CHAR_SPACE || aline[i+1]==0))

  1866 	    {

  1867 		/*

  1868 		 * If there are spaces on both sides,

  1869 		 * or space before and end of line.

  1870 		 */

  1871 		if (aline[i]=='.')

  1872 		{

  1873 		    if (i>2 && aline[i-2]=='.')

  1874 			isellipsis=1;

  1875 		    if (i+2<llen && aline[i+2]=='.')

  1876 			isellipsis=1;

  1877 		}

  1878 		if (!isemptyline && !isellipsis)

  1879 		{

  1880 		    if (pswit[ECHO_SWITCH])

  1881 			printf("\n%s\n",aline);

  1882 		    if (!pswit[OVERVIEW_SWITCH])

  1883 			printf("    Line %ld column %d - "

  1884 			  "Spaced punctuation?\n",linecnt,i+1);

  1885 		    else

  1886 			cnt_punct++;

  1887 		}

  1888 	    }

  1889 	}

  1890     }

  1891     /* Split out the characters that CANNOT be preceded by space. */

  1892     llen=strlen(aline);

  1893     for (i=1;i<llen;i++)

  1894     {

  1895 	/* for each character in the line after the first */

  1896 	if (strchr("?!,;:",aline[i]))

  1897 	{

  1898 	    /* if it's punctuation that _cannot_ have a space before it */

  1899 	    if (aline[i-1]==CHAR_SPACE && !isemptyline &&

  1900 	      aline[i+1]!=CHAR_SPACE)

  1901 	    {

  1902 		/*

  1903 		 * If aline[i+1) DOES == space,

  1904 		 * it was already reported just above.

  1905 		 */

  1906 		if (pswit[ECHO_SWITCH])

  1907 		    printf("\n%s\n",aline);

  1908 		if (!pswit[OVERVIEW_SWITCH])

  1909 		    printf("    Line %ld column %d - Spaced punctuation?\n",

  1910 		      linecnt,i+1);

  1911 		else

  1912 		    cnt_punct++;

  1913 	    }

  1914 	}

  1915     }

  1916     /*

  1917      * Special case " .X" where X is any alpha.

  1918      * This plugs a hole in the acronym code above.

  1919      * Inelegant, but maintainable.

  1920      */

  1921     llen=strlen(aline);

  1922     for (i=1;i<llen;i++)

  1923     {

  1924 	/* for each character in the line after the first */

  1925 	if (aline[i]=='.')

  1926 	{

  1927 	    /* if it's a period */

  1928 	    if (aline[i-1]==CHAR_SPACE && gcisalpha(aline[i+1]))

  1929 	    {

  1930 		/*

  1931 		 * If the period follows a space and

  1932 		 * is followed by a letter.

  1933 		 */

  1934 		if (pswit[ECHO_SWITCH])

  1935 		    printf("\n%s\n",aline);

  1936 		if (!pswit[OVERVIEW_SWITCH])

  1937 		    printf("    Line %ld column %d - Spaced punctuation?\n",

  1938 		      linecnt,i+1);

  1939 		else

  1940 		    cnt_punct++;

  1941 	    }

  1942 	}

  1943     }

  1944     for (i=1;i<llen;i++)

  1945     {

  1946 	/* for each character in the line after the first */

  1947 	if (aline[i]==CHAR_DQUOTE)

  1948 	{

  1949 	    if (!strchr(" _-.'`,;:!/([{?}])",aline[i-1]) &&

  1950 	      !strchr(" _-.'`,;:!/([{?}])",aline[i+1]) && aline[i+1] ||

  1951 	      !strchr(" _-([{'`",aline[i-1]) && gcisalpha(aline[i+1]))

  1952 	    {

  1953 		if (pswit[ECHO_SWITCH])

  1954 		    printf("\n%s\n",aline);

  1955 		if (!pswit[OVERVIEW_SWITCH])

  1956 		    printf("    Line %ld column %d - Unspaced quotes?\n",

  1957 		      linecnt,i+1);

  1958 		else

  1959 		    cnt_punct++;

  1960 	    }

  1961 	}

  1962     }

  1963     /* Check parity of quotes. */

  1964     for (s=aline;*s;s++)

  1965     {

  1966 	if (*s==CHAR_DQUOTE)

  1967 	{

  1968 	    parities->dquote=!parities->dquote;

  1969 	    if (!parities->dquote)

  1970 	    {

  1971 		/* parity even */

  1972 		if (!strchr("_-.'`/,;:!?)]} ",s[1]))

  1973 		{

  1974 		    if (pswit[ECHO_SWITCH])

  1975 			printf("\n%s\n",aline);

  1976 		    if (!pswit[OVERVIEW_SWITCH])

  1977 			printf("    Line %ld column %d - "

  1978 			  "Wrongspaced quotes?\n",linecnt,(int)(s-aline)+1);

  1979 		    else

  1980 			cnt_punct++;

  1981 		}

  1982 	    }

  1983 	    else

  1984 	    {

  1985 		/* parity odd */

  1986 		if (!gcisalpha(s[1]) && !isdigit(s[1]) &&

  1987 		  !strchr("_-/.'`([{$",s[1]) || !s[1])

  1988 		{

  1989 		    if (pswit[ECHO_SWITCH])

  1990 			printf("\n%s\n",aline);

  1991 		    if (!pswit[OVERVIEW_SWITCH])

  1992 			printf("    Line %ld column %d - "

  1993 			  "Wrongspaced quotes?\n",linecnt,(int)(s-aline)+1);

  1994 		    else

  1995 			cnt_punct++;

  1996 		}

  1997 	    }

  1998 	}

  1999     }

  2000     if (*aline==CHAR_DQUOTE)

  2001     {

  2002 	if (strchr(",;:!?)]} ",aline[1]))

  2003 	{

  2004 	    if (pswit[ECHO_SWITCH])

  2005 		printf("\n%s\n",aline);

  2006 	    if (!pswit[OVERVIEW_SWITCH])

  2007 		printf("    Line %ld column 1 - Wrongspaced quotes?\n",

  2008 		  linecnt);

  2009 	    else

  2010 		cnt_punct++;

  2011 	}

  2012     }

  2013     if (pswit[SQUOTE_SWITCH])

  2014     {

  2015 	for (s=aline;*s;s++)

  2016 	{

  2017 	    if ((*s==CHAR_SQUOTE || *s==CHAR_OPEN_SQUOTE) &&

  2018 	      (s==aline || s>aline && !gcisalpha(s[-1]) ||

  2019 	      !gcisalpha(s[1])))

  2020 	    {

  2021 		parities->squote=!parities->squote;

  2022 		if (!parities->squote)

  2023 		{

  2024 		    /* parity even */

  2025 		    if (!strchr("_-.'`/\",;:!?)]} ",s[1]))

  2026 		    {

  2027 			if (pswit[ECHO_SWITCH])

  2028 			    printf("\n%s\n",aline);

  2029 			if (!pswit[OVERVIEW_SWITCH])

  2030 			    printf("    Line %ld column %d - "

  2031 			      "Wrongspaced singlequotes?\n",

  2032 			      linecnt,(int)(s-aline)+1);

  2033 			else

  2034 			    cnt_punct++;

  2035 		    }

  2036 		}

  2037 		else

  2038 		{

  2039 		    /* parity odd */

  2040 		    if (!gcisalpha(s[1]) && !isdigit(s[1]) &&

  2041 		      !strchr("_-/\".'`",s[1]) || !s[1])

  2042 		    {

  2043 			if (pswit[ECHO_SWITCH])

  2044 			    printf("\n%s\n",aline);

  2045 			if (!pswit[OVERVIEW_SWITCH])

  2046 			    printf("    Line %ld column %d - "

  2047 			      "Wrongspaced singlequotes?\n",

  2048 			      linecnt,(int)(s-aline)+1);

  2049 			else

  2050 			    cnt_punct++;

  2051 		    }

  2052 		}

  2053 	    }

  2054 	}

  2055     }

  2056 }

  2058 /*

  2059  * check_for_double_punctuation:

  2060  *

  2061  * Look for double punctuation like ,. or ,,

  2062  * Thanks to DW for the suggestion!

  2063  * In books with references, ".," and ".;" are common

  2064  * e.g. "etc., etc.," and vol. 1.; vol 3.;

  2065  * OTOH, from my initial tests, there are also fairly

  2066  * common errors. What to do? Make these cases paranoid?

  2067  * ".," is the most common, so warnings->dotcomma is used

  2068  * to suppress detailed reporting if it occurs often.

  2069  */

  2070 void check_for_double_punctuation(const char *aline,struct warnings *warnings)

  2071 {

  2072     int i,llen;

  2073     llen=strlen(aline);

  2074     for (i=0;i<llen;i++)

  2075     {

  2076 	/* for each punctuation character in the line */

  2077 	if (strchr(".?!,;:",aline[i]) && strchr(".?!,;:",aline[i+1]) &&

  2078 	  aline[i] && aline[i+1])

  2079 	{

  2080 	    /* followed by punctuation, it's a query, unless . . . */

  2081 	    if (aline[i]==aline[i+1] && (aline[i]=='.' || aline[i]=='?' ||

  2082 	      aline[i]=='!') ||

  2083 	      !warnings->dotcomma && aline[i]=='.' && aline[i+1]==',' ||

  2084 	      warnings->isFrench && !strncmp(aline+i,",...",4) ||

  2085 	      warnings->isFrench && !strncmp(aline+i,"...,",4) ||

  2086 	      warnings->isFrench && !strncmp(aline+i,";...",4) ||

  2087 	      warnings->isFrench && !strncmp(aline+i,"...;",4) ||

  2088 	      warnings->isFrench && !strncmp(aline+i,":...",4) ||

  2089 	      warnings->isFrench && !strncmp(aline+i,"...:",4) ||

  2090 	      warnings->isFrench && !strncmp(aline+i,"!...",4) ||

  2091 	      warnings->isFrench && !strncmp(aline+i,"...!",4) ||

  2092 	      warnings->isFrench && !strncmp(aline+i,"?...",4) ||

  2093 	      warnings->isFrench && !strncmp(aline+i,"...?",4))

  2094 	    {

  2095 		if (warnings->isFrench && !strncmp(aline+i,",...",4) ||

  2096 		  warnings->isFrench && !strncmp(aline+i,"...,",4) ||

  2097 		  warnings->isFrench && !strncmp(aline+i,";...",4) ||

  2098 		  warnings->isFrench && !strncmp(aline+i,"...;",4) ||

  2099 		  warnings->isFrench && !strncmp(aline+i,":...",4) ||

  2100 		  warnings->isFrench && !strncmp(aline+i,"...:",4) ||

  2101 		  warnings->isFrench && !strncmp(aline+i,"!...",4) ||

  2102 		  warnings->isFrench && !strncmp(aline+i,"...!",4) ||

  2103 		  warnings->isFrench && !strncmp(aline+i,"?...",4) ||

  2104 		  warnings->isFrench && !strncmp(aline+i,"...?",4))

  2105 		    i+=4;

  2106 		; /* do nothing for .. !! and ?? which can be legit */

  2107 	    }

  2108 	    else

  2109 	    {

  2110 		if (pswit[ECHO_SWITCH])

  2111 		    printf("\n%s\n",aline);

  2112 		if (!pswit[OVERVIEW_SWITCH])

  2113 		    printf("    Line %ld column %d - Double punctuation?\n",

  2114 		      linecnt,i+1);

  2115 		else

  2116 		    cnt_punct++;

  2117 	    }

  2118 	}

  2119     }

  2120 }

  2122 /*

  2123  * check_for_spaced_quotes:

  2124  */

  2125 void check_for_spaced_quotes(const char *aline)

  2126 {

  2127     const char *s,*t;

  2128     s=aline;

  2129     while ((t=strstr(s," \" ")))

  2130     {

  2131 	if (pswit[ECHO_SWITCH])

  2132 	    printf("\n%s\n",aline);

  2133 	if (!pswit[OVERVIEW_SWITCH])

  2134 	    printf("    Line %ld column %d - Spaced doublequote?\n",

  2135 	      linecnt,(int)(t-aline+1));

  2136 	else

  2137 	    cnt_punct++;

  2138 	s=t+2;

  2139     }

  2140     s=aline;

  2141     while ((t=strstr(s," ' ")))

  2142     {

  2143 	if (pswit[ECHO_SWITCH])

  2144 	    printf("\n%s\n",aline);

  2145 	if (!pswit[OVERVIEW_SWITCH])

  2146 	    printf("    Line %ld column %d - Spaced singlequote?\n",

  2147 	      linecnt,(int)(t-aline+1));

  2148 	else

  2149 	    cnt_punct++;

  2150 	s=t+2;

  2151     }

  2152     s=aline;

  2153     while ((t=strstr(s," ` ")))

  2154     {

  2155 	if (pswit[ECHO_SWITCH])

  2156 	    printf("\n%s\n",aline);

  2157 	if (!pswit[OVERVIEW_SWITCH])

  2158 	    printf("    Line %ld column %d - Spaced singlequote?\n",

  2159 	      linecnt,(int)(t-aline+1));

  2160 	else

  2161 	    cnt_punct++;

  2162 	s=t+2;

  2163     }

  2164 }

  2166 /*

  2167  * check_for_miscased_genative:

  2168  *

  2169  * Check special case of 'S instead of 's at end of word.

  2170  */

  2171 void check_for_miscased_genative(const char *aline)

  2172 {

  2173     const char *s;

  2174     s=aline+1;

  2175     while (*s)

  2176     {

  2177 	if (*s==CHAR_SQUOTE && s[1]=='S' && s[-1]>='a' && s[-1]<='z')

  2178 	{

  2179 	    if (pswit[ECHO_SWITCH])

  2180 		printf("\n%s\n",aline);

  2181 	    if (!pswit[OVERVIEW_SWITCH])

  2182 		printf("    Line %ld column %d - Capital \"S\"?\n",

  2183 		  linecnt,(int)(s-aline+2));

  2184 	    else

  2185 		cnt_punct++;

  2186 	}

  2187 	s++;

  2188     }

  2189 }

  2191 /*

  2192  * check_end_of_line:

  2193  *

  2194  * Now check special cases - start and end of line -

  2195  * for single and double quotes. Start is sometimes [sic]

  2196  * but better to query it anyway.

  2197  * While we're here, check for dash at end of line.

  2198  */

  2199 void check_end_of_line(const char *aline,struct warnings *warnings)

  2200 {

  2201     int i,llen;

  2202     llen=strlen(aline);

  2203     if (llen>1)

  2204     {

  2205 	if (aline[llen-1]==CHAR_DQUOTE || aline[llen-1]==CHAR_SQUOTE ||

  2206 	  aline[llen-1]==CHAR_OPEN_SQUOTE)

  2207 	    if (aline[llen-2]==CHAR_SPACE)

  2208 	    {

  2209 		if (pswit[ECHO_SWITCH])

  2210 		    printf("\n%s\n",aline);

  2211 		if (!pswit[OVERVIEW_SWITCH])

  2212 		    printf("    Line %ld column %d - Spaced quote?\n",

  2213 		      linecnt,llen);

  2214 		else

  2215 		    cnt_punct++;

  2216 	    }

  2217 	if ((aline[0]==CHAR_SQUOTE || aline[0]==CHAR_OPEN_SQUOTE) &&

  2218 	  aline[1]==CHAR_SPACE)

  2219 	{

  2220 	    if (pswit[ECHO_SWITCH])

  2221 		printf("\n%s\n",aline);

  2222 	    if (!pswit[OVERVIEW_SWITCH])

  2223 		printf("    Line %ld column 1 - Spaced quote?\n",linecnt);

  2224 	    else

  2225 		cnt_punct++;

  2226 	}

  2227 	/*

  2228 	 * Dash at end of line may well be legit - paranoid mode only

  2229 	 * and don't report em-dash at line-end.

  2230 	 */

  2231 	if (pswit[PARANOID_SWITCH] && warnings->hyphen)

  2232 	{

  2233 	    for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--)

  2234 		;

  2235 	    if (aline[i]=='-' && aline[i-1]!='-')

  2236 	    {

  2237 		if (pswit[ECHO_SWITCH])

  2238 		    printf("\n%s\n",aline);

  2239 		if (!pswit[OVERVIEW_SWITCH])

  2240 		    printf("    Line %ld column %d - Hyphen at end of line?\n",

  2241 		      linecnt,i);

  2242 	    }

  2243 	}

  2244     }

  2245 }

  2247 /*

  2248  * check_for_unspaced_bracket:

  2249  *

  2250  * Brackets are often unspaced, but shouldn't be surrounded by alpha.

  2251  * If so, suspect a scanno like "a]most".

  2252  */

  2253 void check_for_unspaced_bracket(const char *aline)

  2254 {

  2255     int i,llen;

  2256     llen=strlen(aline);

  2257     for (i=1;i<llen-1;i++)

  2258     {

  2259 	/* for each bracket character in the line except 1st & last */

  2260 	if (strchr("{[()]}",aline[i]) && gcisalpha(aline[i-1]) &&

  2261 	  gcisalpha(aline[i+1]))

  2262 	{

  2263 	    if (pswit[ECHO_SWITCH])

  2264 		printf("\n%s\n",aline);

  2265 	    if (!pswit[OVERVIEW_SWITCH])

  2266 		printf("    Line %ld column %d - Unspaced bracket?\n",

  2267 		  linecnt,i);

  2268 	    else

  2269 		cnt_punct++;

  2270 	}

  2271     }

  2272 }

  2274 /*

  2275  * check_for_unpunctuated_endquote:

  2276  */

  2277 void check_for_unpunctuated_endquote(const char *aline)

  2278 {

  2279     int i,llen;

  2280     llen=strlen(aline);

  2281     for (i=1;i<llen;i++)

  2282     {

  2283 	/* for each character in the line except 1st */

  2284 	if (aline[i]==CHAR_DQUOTE && isalpha(aline[i-1]))

  2285 	{

  2286 	    if (pswit[ECHO_SWITCH])

  2287 		printf("\n%s\n",aline);

  2288 	    if (!pswit[OVERVIEW_SWITCH])

  2289 		printf("    Line %ld column %d - "

  2290 		  "endquote missing punctuation?\n",linecnt,i);

  2291 	    else

  2292 		cnt_punct++;

  2293 	}

  2294     }

  2295 }

  2297 /*

  2298  * check_for_html_tag:

  2299  *

  2300  * Check for <HTML TAG>.

  2301  *

  2302  * If there is a < in the line, followed at some point

  2303  * by a > then we suspect HTML.

  2304  */

  2305 void check_for_html_tag(const char *aline)

  2306 {

  2307     int i;

  2308     const char *open,*close;

  2309     open=strstr(aline,"<");

  2310     if (open)

  2311     {

  2312 	close=strstr(aline,">");

  2313 	if (close)

  2314 	{

  2315 	    i=(signed int)(close-open+1);

  2316 	    if (i>0)

  2317 	    {

  2318 		strncpy(wrk,open,i);

  2319 		wrk[i]=0;

  2320 		if (pswit[ECHO_SWITCH])

  2321 		    printf("\n%s\n",aline);

  2322 		if (!pswit[OVERVIEW_SWITCH])

  2323 		    printf("    Line %ld column %d - HTML Tag? %s \n",

  2324 		      linecnt,(int)(open-aline)+1,wrk);

  2325 		else

  2326 		    cnt_html++;

  2327 	    }

  2328 	}

  2329     }

  2330 }

  2332 /*

  2333  * check_for_html_entity:

  2334  *

  2335  * Check for &symbol; HTML.

  2336  *

  2337  * If there is a & in the line, followed at

  2338  * some point by a ; then we suspect HTML.

  2339  */

  2340 void check_for_html_entity(const char *aline)

  2341 {

  2342     int i;

  2343     const char *s,*amp,*scolon;

  2344     amp=strstr(aline,"&");

  2345     if (amp)

  2346     {

  2347 	scolon=strstr(aline,";");

  2348 	if (scolon)

  2349 	{

  2350 	    i=(int)(scolon-amp+1);

  2351 	    for (s=amp;s<scolon;s++)

  2352 		if (*s==CHAR_SPACE)

  2353 		    i=0;                /* Don't report "Jones & Son;" */

  2354 	    if (i>0)

  2355 	    {

  2356 		strncpy(wrk,amp,i);

  2357 		wrk[i]=0;

  2358 		if (pswit[ECHO_SWITCH])

  2359 		    printf("\n%s\n",aline);

  2360 		if (!pswit[OVERVIEW_SWITCH])

  2361 		    printf("    Line %ld column %d - HTML symbol? %s \n",

  2362 		      linecnt,(int)(amp-aline)+1,wrk);

  2363 		else

  2364 		    cnt_html++;

  2365 	    }

  2366 	}

  2367     }

  2368 }

  2370 struct pending {

  2371     char dquote[80],squote[80],rbrack[80],sbrack[80],cbrack[80],unders[80];

  2372     long squot;

  2373 };

  2375 /*

  2376  * print_pending:

  2377  *

  2378  * If we are in a state of unbalanced quotes, and this line

  2379  * doesn't begin with a quote, output the stored error message.

  2380  * If the -P switch was used, print the warning even if the

  2381  * new para starts with quotes.

  2382  */

  2383 void print_pending(const char *aline,const char *parastart,

  2384   struct pending *pending)

  2385 {

  2386     const char *s;

  2387     s=aline;

  2388     while (*s==' ')

  2389 	s++;

  2390     if (*pending->dquote)

  2391 	if (*s!=CHAR_DQUOTE || pswit[QPARA_SWITCH])

  2392 	{

  2393 	    if (!pswit[OVERVIEW_SWITCH])

  2394 	    {

  2395 		if (pswit[ECHO_SWITCH])

  2396 		    printf("\n%s\n",parastart);

  2397 		puts(pending->dquote);

  2398 	    }

  2399 	    else

  2400 		cnt_dquot++;

  2401 	}

  2402     if (*pending->squote)

  2403     {

  2404 	if (*s!=CHAR_SQUOTE && *s!=CHAR_OPEN_SQUOTE || pswit[QPARA_SWITCH] ||

  2405 	  pending->squot)

  2406 	{

  2407 	    if (!pswit[OVERVIEW_SWITCH])

  2408 	    {

  2409 		if (pswit[ECHO_SWITCH])

  2410 		    printf("\n%s\n",parastart);

  2411 		puts(pending->squote);

  2412 	    }

  2413 	    else

  2414 		cnt_squot++;

  2415 	}

  2416     }

  2417     if (*pending->rbrack)

  2418     {

  2419 	if (!pswit[OVERVIEW_SWITCH])

  2420 	{

  2421 	    if (pswit[ECHO_SWITCH])

  2422 		printf("\n%s\n",parastart);

  2423 	    puts(pending->rbrack);

  2424 	}

  2425 	else

  2426 	    cnt_brack++;

  2427     }

  2428     if (*pending->sbrack)

  2429     {

  2430 	if (!pswit[OVERVIEW_SWITCH])

  2431 	{

  2432 	    if (pswit[ECHO_SWITCH])

  2433 		printf("\n%s\n",parastart);

  2434 	    puts(pending->sbrack);

  2435 	}

  2436 	else

  2437 	    cnt_brack++;

  2438     }

  2439     if (*pending->cbrack)

  2440     {

  2441 	if (!pswit[OVERVIEW_SWITCH])

  2442 	{

  2443 	    if (pswit[ECHO_SWITCH])

  2444 		printf("\n%s\n",parastart);

  2445 	    puts(pending->cbrack);

  2446 	}

  2447 	else

  2448 	    cnt_brack++;

  2449     }

  2450     if (*pending->unders)

  2451     {

  2452 	if (!pswit[OVERVIEW_SWITCH])

  2453 	{

  2454 	    if (pswit[ECHO_SWITCH])

  2455 		printf("\n%s\n",parastart);

  2456 	    puts(pending->unders);

  2457 	}

  2458 	else

  2459 	    cnt_brack++;

  2460     }

  2461 }

  2463 /*

  2464  * check_for_mismatched_quotes:

  2465  *

  2466  * At end of paragraph, check for mismatched quotes.

  2467  *

  2468  * We don't want to report an error immediately, since it is a

  2469  * common convention to omit the quotes at end of paragraph if

  2470  * the next paragraph is a continuation of the same speaker.

  2471  * Where this is the case, the next para should begin with a

  2472  * quote, so we store the warning message and only display it

  2473  * at the top of the next iteration if the new para doesn't

  2474  * start with a quote.

  2475  * The -p switch overrides this default, and warns of unclosed

  2476  * quotes on _every_ paragraph, whether the next begins with a

  2477  * quote or not.

  2478  */

  2479 void check_for_mismatched_quotes(const struct counters *counters,

  2480   struct pending *pending)

  2481 {

  2482     if (counters->quot%2)

  2483 	sprintf(pending->dquote,"    Line %ld - Mismatched quotes",

  2484 	  linecnt);

  2485     if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&

  2486       counters->open_single_quote!=counters->close_single_quote)

  2487 	sprintf(pending->squote,"    Line %ld - Mismatched singlequotes?",

  2488 	  linecnt);

  2489     if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&

  2490       counters->open_single_quote!=counters->close_single_quote &&

  2491       counters->open_single_quote!=counters->close_single_quote+1)

  2492 	/*

  2493 	 * Flag it to be noted regardless of the

  2494 	 * first char of the next para.

  2495 	 */

  2496 	pending->squot=1;

  2497     if (counters->r_brack)

  2498 	sprintf(pending->rbrack,"    Line %ld - Mismatched round brackets?",

  2499 	  linecnt);

  2500     if (counters->s_brack)

  2501 	sprintf(pending->sbrack,"    Line %ld - Mismatched square brackets?",

  2502 	  linecnt);

  2503     if (counters->c_brack)

  2504 	sprintf(pending->cbrack,"    Line %ld - Mismatched curly brackets?",

  2505 	  linecnt);

  2506     if (counters->c_unders%2)

  2507 	sprintf(pending->unders,"    Line %ld - Mismatched underscores?",

  2508 	  linecnt);

  2509 }

  2511 /*

  2512  * check_for_omitted_punctuation:

  2513  *

  2514  * Check for omitted punctuation at end of paragraph by working back

  2515  * through prevline. DW.

  2516  * Need to check this only for "normal" paras.

  2517  * So what is a "normal" para?

  2518  *    Not normal if one-liner (chapter headings, etc.)

  2519  *    Not normal if doesn't contain at least one locase letter

  2520  *    Not normal if starts with space

  2521  */

  2522 void check_for_omitted_punctuation(const char *prevline,

  2523   struct line_properties *last,int start_para_line)

  2524 {

  2525     int i;

  2526     const char *s;

  2527     for (s=prevline,i=0;*s && !i;s++)

  2528 	if (gcisletter(*s))

  2529 	    /* use i to indicate the presence of a letter on the line */

  2530 	    i=1;

  2531     /*

  2532      * This next "if" is a problem.

  2533      * If we say "start_para_line <= linecnt - 1", that includes

  2534      * one-line "paragraphs" like chapter heads. Lotsa false positives.

  2535      * If we say "start_para_line < linecnt - 1" it doesn't, but then it

  2536      * misses genuine one-line paragraphs.

  2537      */

  2538     if (i && last->blen>2 && start_para_line<linecnt-1 && *prevline>CHAR_SPACE)

  2539     {

  2540 	for (i=strlen(prevline)-1;

  2541 	  (prevline[i]==CHAR_DQUOTE || prevline[i]==CHAR_SQUOTE) &&

  2542 	  prevline[i]>CHAR_SPACE && i>0;

  2543 	  i--)

  2544 	    ;

  2545 	for (;i>0;i--)

  2546 	{

  2547 	    if (gcisalpha(prevline[i]))

  2548 	    {

  2549 		if (pswit[ECHO_SWITCH])

  2550 		    printf("\n%s\n",prevline);

  2551 		if (!pswit[OVERVIEW_SWITCH])

  2552 		    printf("    Line %ld column %d - "

  2553 		      "No punctuation at para end?\n",

  2554 		      linecnt-1,strlen(prevline));

  2555 		else

  2556 		    cnt_punct++;

  2557 		break;

  2558 	    }

  2559 	    if (strchr("-.:!([{?}])",prevline[i]))

  2560 		break;

  2561 	}

  2562     }

  2563 }

  2565 /*

  2566  * procfile:

  2567  *

  2568  * Process one file.

  2569  */

  2570 void procfile(char *filename)

  2571 {

  2572     const char *s;

  2573     char parastart[81];     /* first line of current para */

  2574     FILE *infile;

  2575     struct first_pass_results *first_pass_results;

  2576     struct warnings *warnings;

  2577     struct counters counters={0};

  2578     struct line_properties last={0};

  2579     struct parities parities={0};

  2580     struct pending pending={{0},};

  2581     int isemptyline;

  2582     long start_para_line;

  2583     signed int i,llen,isacro,isellipsis;

  2584     signed int isnewpara;

  2585     signed int enddash;

  2586     last.start=CHAR_SPACE;

  2587     *prevline=0;

  2588     linecnt=checked_linecnt=start_para_line=0;

  2589     i=llen=isacro=isellipsis=0;

  2590     isnewpara=enddash=0;

  2591     infile=fopen(filename,"rb");

  2592     if (!infile)

  2593     {

  2594         if (pswit[STDOUT_SWITCH])

  2595             fprintf(stdout,"bookloupe: cannot open %s\n",filename);

  2596         else

  2597             fprintf(stderr,"bookloupe: cannot open %s\n",filename);

  2598 	exit(1);

  2599     }

  2600     fprintf(stdout,"\n\nFile: %s\n\n",filename);

  2601     first_pass_results=first_pass(infile);

  2602     warnings=report_first_pass(first_pass_results);

  2603     /*

  2604      * Here we go with the main pass. Hold onto yer hat!

  2605      */

  2606     rewind(infile);

  2607     linecnt=0;

  2608     while (flgets(aline,LINEBUFSIZE-1,infile,linecnt+1))

  2609     {

  2610         linecnt++;

  2611         if (linecnt==1)

  2612 	    isnewpara=1;

  2613         if (pswit[DP_SWITCH] && !strncmp(aline,"-----File: ",11))

  2614 	    continue;    // skip DP page separators completely

  2615         if (linecnt<first_pass_results->firstline ||

  2616 	  (first_pass_results->footerline>0 &&

  2617 	  linecnt>first_pass_results->footerline))

  2618 	{

  2619             if (pswit[HEADER_SWITCH])

  2620 	    {

  2621                 if (!strncmp(aline,"Title:",6))

  2622                     printf("    %s\n",aline);

  2623                 if (!strncmp(aline,"Author:",7))

  2624                     printf("    %s\n",aline);

  2625                 if (!strncmp(aline,"Release Date:",13))

  2626                     printf("    %s\n",aline);

  2627                 if (!strncmp(aline,"Edition:",8))

  2628                     printf("    %s\n\n",aline);

  2629 	    }

  2630             continue;                /* skip through the header */

  2631 	}

  2632         checked_linecnt++;

  2633 	print_pending(aline,parastart,&pending);

  2634 	memset(&pending,0,sizeof(pending));

  2635 	isemptyline=analyse_quotes(aline,&counters);

  2636         if (isnewpara && !isemptyline)

  2637 	{

  2638 	    /* This line is the start of a new paragraph. */

  2639             start_para_line=linecnt;

  2640 	    /* Capture its first line in case we want to report it later. */

  2641             strncpy(parastart,aline,80);

  2642             parastart[79]=0;

  2643 	    memset(&parities,0,sizeof(parities));  /* restart the quote count */

  2644             s=aline;

  2645             while (!gcisalpha(*s) && !gcisdigit(*s) && *s)

  2646 		s++;

  2647             if (*s>='a' && *s<='z')

  2648 	    {

  2649 		/* and its first letter is lowercase */

  2650                 if (pswit[ECHO_SWITCH])

  2651 		    printf("\n%s\n",aline);

  2652                 if (!pswit[OVERVIEW_SWITCH])

  2653                     printf("    Line %ld column %d - "

  2654 		      "Paragraph starts with lower-case\n",

  2655 		      linecnt,(int)(s-aline)+1);

  2656                 else

  2657                     cnt_punct++;

  2658 	    }

  2659             isnewpara=0; /* Signal the end of new para processing. */

  2660 	}

  2661         /* Check for an em-dash broken at line end. */

  2662         if (enddash && *aline=='-')

  2663 	{

  2664             if (pswit[ECHO_SWITCH])

  2665 		printf("\n%s\n",aline);

  2666             if (!pswit[OVERVIEW_SWITCH])

  2667                 printf("    Line %ld column 1 - Broken em-dash?\n",linecnt);

  2668             else

  2669                 cnt_punct++;

  2670 	}

  2671         enddash=0;

  2672         for (s=aline+strlen(aline)-1;*s==' ' && s>aline;s--)

  2673 	    ;

  2674         if (s>=aline && *s=='-')

  2675             enddash=1;

  2676 	check_for_control_characters(aline);

  2677         if (warnings->bin)

  2678 	    check_for_odd_characters(aline,warnings,isemptyline);

  2679         if (warnings->longline)

  2680 	    check_for_long_line(aline);

  2681         if (warnings->shortline)

  2682 	    check_for_short_line(aline,&last);

  2683         last.blen=last.len;

  2684         last.len=strlen(aline);

  2685         last.start=aline[0];

  2686 	check_for_starting_punctuation(aline);

  2687         if (warnings->dash)

  2688 	{

  2689 	    check_for_spaced_emdash(aline);

  2690 	    check_for_spaced_dash(aline);

  2691 	}

  2692 	check_for_unmarked_paragraphs(aline);

  2693 	check_for_jeebies(aline);

  2694 	check_for_mta_from(aline);

  2695 	check_for_orphan_character(aline);

  2696 	check_for_pling_scanno(aline);

  2697 	check_for_extra_period(aline,warnings);

  2698 	check_for_following_punctuation(aline);

  2699 	check_for_typos(aline,warnings);

  2700 	check_for_misspaced_punctuation(aline,&parities,isemptyline);

  2701 	check_for_double_punctuation(aline,warnings);

  2702 	check_for_spaced_quotes(aline);

  2703 	check_for_miscased_genative(aline);

  2704 	check_end_of_line(aline,warnings);

  2705 	check_for_unspaced_bracket(aline);

  2706         if (warnings->endquote)

  2707 	    check_for_unpunctuated_endquote(aline);

  2708 	check_for_html_tag(aline);

  2709 	check_for_html_entity(aline);

  2710         if (isemptyline)

  2711 	{

  2712 	    check_for_mismatched_quotes(&counters,&pending);

  2713 	    memset(&counters,0,sizeof(counters));

  2714 	    /* let the next iteration know that it's starting a new para */

  2715             isnewpara=1;

  2716 	    check_for_omitted_punctuation(prevline,&last,start_para_line);

  2717 	}

  2718         strcpy(prevline,aline);

  2719     }

  2720     fclose(infile);

  2721     if (!pswit[OVERVIEW_SWITCH])

  2722         for (i=0;i<MAX_QWORD;i++)

  2723             if (dupcnt[i])

  2724                 printf("\nNote: Queried word %s was duplicated %d time%s\n",

  2725 		  qword[i],dupcnt[i],"s");

  2726 }

  2728 /*

  2729  * flgets:

  2730  *

  2731  * Get one line from the input stream, checking for

  2732  * the existence of exactly one CR/LF line-end per line.

  2733  *

  2734  * Returns: a pointer to the line.

  2735  */

  2736 char *flgets(char *theline,int maxlen,FILE *thefile,long lcnt)

  2737 {

  2738     char c;

  2739     int len,isCR,cint;

  2740     *theline=0;

  2741     len=isCR=0;

  2742     c=cint=fgetc(thefile);

  2743     do

  2744     {

  2745         if (cint==EOF)

  2746             return NULL;

  2747 	/* either way, it's end of line */

  2748         if (c==10)

  2749 	{

  2750             if (isCR)

  2751                 break;

  2752             else

  2753 	    {

  2754 		/* Error - a LF without a preceding CR */

  2755                 if (pswit[LINE_END_SWITCH])

  2756 		{

  2757                     if (pswit[ECHO_SWITCH])

  2758 			printf("\n%s\n",theline);

  2759                     if (!pswit[OVERVIEW_SWITCH])

  2760                         printf("    Line %ld - No CR?\n",lcnt);

  2761                     else

  2762                         cnt_lineend++;

  2763 		}

  2764                 break;

  2765 	    }

  2766 	}

  2767         if (c==13)

  2768 	{

  2769             if (isCR)

  2770 	    {

  2771 		/* Error - two successive CRs */

  2772                 if (pswit[LINE_END_SWITCH])

  2773 		{

  2774                     if (pswit[ECHO_SWITCH])

  2775 			printf("\n%s\n",theline);

  2776                     if (!pswit[OVERVIEW_SWITCH])

  2777                         printf("    Line %ld - Two successive CRs?\n",lcnt);

  2778                     else

  2779                         cnt_lineend++;

  2780 		}

  2781 	    }

  2782             isCR=1;

  2783 	}

  2784         else

  2785 	{

  2786             if (pswit[LINE_END_SWITCH] && isCR)

  2787 	    {

  2788                 if (pswit[ECHO_SWITCH])

  2789 		    printf("\n%s\n",theline);

  2790                 if (!pswit[OVERVIEW_SWITCH])

  2791                     printf("    Line %ld column %d - CR without LF?\n",

  2792 		      lcnt,len+1);

  2793                 else

  2794                     cnt_lineend++;

  2795 	    }

  2796             theline[len]=c;

  2797             len++;

  2798             theline[len]=0;

  2799             isCR=0;

  2800 	}

  2801         c=cint=fgetc(thefile);

  2802     } while(len<maxlen);

  2803     if (pswit[MARKUP_SWITCH])

  2804         postprocess_for_HTML(theline);

  2805     if (pswit[DP_SWITCH])

  2806         postprocess_for_DP(theline);

  2807     return theline;

  2808 }

  2810 /*

  2811  * mixdigit:

  2812  *

  2813  * Takes a "word" as a parameter, and checks whether it

  2814  * contains a mixture of alpha and digits. Generally, this is an

  2815  * error, but may not be for cases like 4th or L5 12s. 3d.

  2816  *

  2817  * Returns: 0 if no error found, 1 if error.

  2818  */

  2819 int mixdigit(char *checkword)

  2820 {

  2821     int wehaveadigit,wehavealetter,firstdigits,query,wl;

  2822     char *s;

  2823     wehaveadigit=wehavealetter=query=0;

  2824     for (s=checkword;*s;s++)

  2825         if (gcisalpha(*s))

  2826             wehavealetter=1;

  2827         else

  2828             if (gcisdigit(*s))

  2829                 wehaveadigit=1;

  2830     if (wehaveadigit && wehavealetter)

  2831     {

  2832 	/* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */

  2833         query=1;

  2834         wl=strlen(checkword);

  2835         for (firstdigits=0;gcisdigit(checkword[firstdigits]);firstdigits++)

  2836             ;

  2837         /* digits, ending in st, rd, nd, th of either case */

  2838         if (firstdigits+2==wl && (matchword(checkword+wl-2,"st") ||

  2839 	  matchword(checkword+wl-2,"rd") || matchword(checkword+wl-2,"nd") ||

  2840 	  matchword(checkword+wl-2,"th")))

  2841 	    query=0;

  2842         if (firstdigits+3==wl && (matchword(checkword+wl-3,"sts") ||

  2843 	  matchword(checkword+wl-3,"rds") || matchword(checkword+wl-3,"nds") ||

  2844 	  matchword(checkword+wl-3,"ths")))

  2845 	    query=0;

  2846         if (firstdigits+3==wl && (matchword(checkword+wl-4,"stly") ||

  2847 	  matchword(checkword+wl-4,"rdly") ||

  2848 	  matchword(checkword+wl-4,"ndly") || matchword(checkword+wl-4,"thly")))

  2849 	    query=0;

  2850         /* digits, ending in l, L, s or d */

  2851         if (firstdigits+1==wl && (checkword[wl-1]=='l' ||

  2852 	  checkword[wl-1]=='L' || checkword[wl-1]=='s' || checkword[wl-1]=='d'))

  2853 	    query=0;

  2854         /*

  2855 	 * L at the start of a number, representing Britsh pounds, like L500.

  2856          * This is cute. We know the current word is mixeddigit. If the first

  2857          * letter is L, there must be at least one digit following. If both

  2858          * digits and letters follow, we have a genuine error, else we have a

  2859          * capital L followed by digits, and we accept that as a non-error.

  2860 	 */

  2861         if (checkword[0]=='L' && !mixdigit(checkword+1))

  2862 	    query=0;

  2863     }

  2864     return query;

  2865 }

  2867 /*

  2868  * getaword:

  2869  *

  2870  * Extracts the first/next "word" from the line, and puts

  2871  * it into "thisword". A word is defined as one English word unit--or

  2872  * at least that's the aim.

  2873  *

  2874  * Returns: a pointer to the position in the line where we will start

  2875  *          looking for the next word.

  2876  */

  2877 const char *getaword(const char *fromline,char *thisword)

  2878 {

  2879     int i,wordlen;

  2880     const char *s;

  2881     wordlen=0;

  2882     for (;!gcisdigit(*fromline) && !gcisalpha(*fromline) && *fromline;

  2883       fromline++)

  2884 	;

  2885     /*

  2886      * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.

  2887      * Especially yucky is the case of L1,000

  2888      * This section looks for a pattern of characters including a digit

  2889      * followed by a comma or period followed by one or more digits.

  2890      * If found, it returns this whole pattern as a word; otherwise we discard

  2891      * the results and resume our normal programming.

  2892      */

  2893     s=fromline;

  2894     for (;(gcisdigit(*s) || gcisalpha(*s) || *s==',' || *s=='.') &&

  2895       wordlen<MAXWORDLEN;s++)

  2896     {

  2897 	thisword[wordlen]=*s;

  2898         wordlen++;

  2899     }

  2900     thisword[wordlen]=0;

  2901     for (i=1;i<wordlen-1;i++)

  2902     {

  2903         if (thisword[i]=='.' || thisword[i]==',')

  2904 	{

  2905             if (gcisdigit(thisword[i-1]) && gcisdigit(thisword[i-1]))

  2906 	    {

  2907                 fromline=s;

  2908                 return fromline;

  2909 	    }

  2910 	}

  2911     }

  2912     /* we didn't find a punctuated number - do the regular getword thing */

  2913     wordlen=0;

  2914     for (;(gcisdigit(*fromline) || gcisalpha(*fromline) || *fromline=='\'') &&

  2915       wordlen<MAXWORDLEN;fromline++)

  2916     {

  2917         thisword[wordlen]=*fromline;

  2918         wordlen++;

  2919     }

  2920     thisword[wordlen]=0;

  2921     return fromline;

  2922 }

  2924 /*

  2925  * matchword:

  2926  *

  2927  * A case-insensitive string matcher.

  2928  */

  2929 int matchword(char *checkfor,char *thisword)

  2930 {

  2931     unsigned int ismatch,i;

  2932     if (strlen(checkfor)!=strlen(thisword))

  2933 	return 0;

  2934     ismatch=1;     /* assume a match until we find a difference */

  2935     for (i=0;i<strlen(checkfor);i++)

  2936         if (toupper(checkfor[i])!=toupper(thisword[i]))

  2937             ismatch=0;

  2938     return ismatch;

  2939 }

  2941 /*

  2942  * lowerit:

  2943  *

  2944  * Lowercase the line.

  2945  */

  2947 void lowerit(char *theline)

  2948 {

  2949     for (;*theline;theline++)

  2950         if (*theline>='A' && *theline<='Z')

  2951             *theline+=32;

  2952 }

  2954 /*

  2955  * isroman:

  2956  *

  2957  * Is this word a Roman Numeral?

  2958  *

  2959  * It doesn't actually validate that the number is a valid Roman Numeral--for

  2960  * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not

  2961  * what we're here to do. If it passes this, it LOOKS like a Roman numeral.

  2962  * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or

  2963  * expressions thereof, except when it came to taxes. Allow any number of M,

  2964  * an optional D, an optional CM or CD, any number of optional Cs, an optional

  2965  * XL or an optional XC, an optional IX or IV, an optional V and any number

  2966  * of optional Is.

  2967  */

  2968 int isroman(char *t)

  2969 {

  2970     char *s;

  2971     if (!t || !*t)

  2972 	return 0;

  2973     s=t;

  2974     while (*t=='m' && *t)

  2975 	t++;

  2976     if (*t=='d')

  2977 	t++;

  2978     if (*t=='c' && t[1]=='m')

  2979 	t+=2;

  2980     if (*t=='c' && t[1]=='d')

  2981 	t+=2;

  2982     while (*t=='c' && *t)

  2983 	t++;

  2984     if (*t=='x' && t[1]=='l')

  2985 	t+=2;

  2986     if (*t=='x' && t[1]=='c')

  2987 	t+=2;

  2988     if (*t=='l')

  2989 	t++;

  2990     while (*t=='x' && *t)

  2991 	t++;

  2992     if (*t=='i' && t[1]=='x')

  2993 	t+=2;

  2994     if (*t=='i' && t[1]=='v')

  2995 	t+=2;

  2996     if (*t=='v')

  2997 	t++;

  2998     while (*t=='i' && *t)

  2999 	t++;

  3000     return !*t;

  3001 }

  3003 /*

  3004  * gcisalpha:

  3005  *

  3006  * A version of isalpha() that is somewhat lenient on 8-bit texts.

  3007  * If we use the standard function, 8-bit accented characters break

  3008  * words, so that tete with accented characters appears to be two words, "t"

  3009  * and "t", with 8-bit characters between them. This causes over-reporting of

  3010  * errors. gcisalpha() recognizes accented letters from the CP1252 (Windows)

  3011  * and ISO-8859-1 character sets, which are the most common PG 8-bit types.

  3012  */

  3013 int gcisalpha(unsigned char c)

  3014 {

  3015     if (c>='a' && c<='z')

  3016 	return 1;

  3017     if (c>='A' && c<='Z')

  3018 	return 1;

  3019     if (c<140)

  3020 	return 0;

  3021     if (c>=192 && c!=208 && c!=215 && c!=222 && c!=240 && c!=247 && c!=254)

  3022 	return 1;

  3023     if (c==140 || c==142 || c==156 || c==158 || c==159)

  3024 	return 1;

  3025     return 0;

  3026 }

  3028 /*

  3029  * gcisdigit:

  3030  *

  3031  * A version of isdigit() that doesn't get confused in 8-bit texts.

  3032  */

  3033 int gcisdigit(unsigned char c)

  3034 {

  3035     return c>='0' && c<='9';

  3036 }

  3038 /*

  3039  * gcisletter:

  3040  *

  3041  * A version of isletter() that doesn't get confused in 8-bit texts.

  3042  * NB: this is ISO-8891-1-specific.

  3043  */

  3044 int gcisletter(unsigned char c)

  3045 {

  3046     return c>='A' && c<='Z' || c>='a' && c<='z' || c>=192;

  3047 }

  3049 /*

  3050  * gcstrchr:

  3051  *

  3052  * Wraps strchr to return NULL if the character being searched for is zero.

  3053  */

  3054 char *gcstrchr(char *s,char c)

  3055 {

  3056     if (!c)

  3057 	return NULL;

  3058     return strchr(s,c);

  3059 }

  3061 /*

  3062  * postprocess_for_DP:

  3063  *

  3064  * Invoked with the -d switch from flgets().

  3065  * It simply "removes" from the line a hard-coded set of common

  3066  * DP-specific tags, so that the line passed to the main routine has

  3067  * been pre-cleaned of DP markup.

  3068  */

  3069 void postprocess_for_DP(char *theline)

  3070 {

  3071     char *s,*t;

  3072     int i;

  3073     if (!*theline)

  3074         return;

  3075     for (i=0;*DPmarkup[i];i++)

  3076     {

  3077         s=strstr(theline,DPmarkup[i]);

  3078         while (s)

  3079 	{

  3080             t=s+strlen(DPmarkup[i]);

  3081             while (*t)

  3082 	    {

  3083                 *s=*t;

  3084                 t++;

  3085 		s++;

  3086 	    }

  3087             *s=0;

  3088             s=strstr(theline,DPmarkup[i]);

  3089 	}

  3090     }

  3091 }

  3093 /*

  3094  * postprocess_for_HTML:

  3095  *

  3096  * Invoked with the -m switch from flgets().

  3097  * It simply "removes" from the line a hard-coded set of common

  3098  * HTML tags and "replaces" a hard-coded set of common HTML

  3099  * entities, so that the line passed to the main routine has

  3100  * been pre-cleaned of HTML.

  3101  */

  3102 void postprocess_for_HTML(char *theline)

  3103 {

  3104     if (strstr(theline,"<") && strstr(theline,">"))

  3105         while (losemarkup(theline))

  3106             ;

  3107     while (loseentities(theline))

  3108         ;

  3109 }

  3111 char *losemarkup(char *theline)

  3112 {

  3113     char *s,*t;

  3114     int i;

  3115     if (!*theline)

  3116         return NULL;

  3117     s=strstr(theline,"<");

  3118     t=strstr(theline,">");

  3119     if (!s || !t)

  3120 	return NULL;

  3121     for (i=0;*markup[i];i++)

  3122         if (!tagcomp(s+1,markup[i]))

  3123 	{

  3124             if (!t[1])

  3125 	    {

  3126                 *s=0;

  3127                 return s;

  3128 	    }

  3129             else if (t>s)

  3130 	    {

  3131 		strcpy(s,t+1);

  3132 		return s;

  3133 	    }

  3134         }

  3135     /* It's an unrecognized <xxx>. */

  3136     return NULL;

  3137 }

  3139 char *loseentities(char *theline)

  3140 {

  3141     int i;

  3142     char *s,*t;

  3143     if (!*theline)

  3144         return NULL;

  3145     for (i=0;*entities[i].htmlent;i++)

  3146     {

  3147         s=strstr(theline,entities[i].htmlent);

  3148         if (s)

  3149 	{

  3150             t=malloc((size_t)strlen(s));

  3151             if (!t)

  3152 		return NULL;

  3153             strcpy(t,s+strlen(entities[i].htmlent));

  3154             strcpy(s,entities[i].textent);

  3155             strcat(s,t);

  3156             free(t);

  3157             return theline;

  3158 	}

  3159     }

  3160     for (i=0;*entities[i].htmlnum;i++)

  3161     {

  3162         s=strstr(theline,entities[i].htmlnum);

  3163         if (s)

  3164 	{

  3165             t=malloc((size_t)strlen(s));

  3166             if (!t)

  3167 		return NULL;

  3168             strcpy(t,s+strlen(entities[i].htmlnum));

  3169             strcpy(s,entities[i].textent);

  3170             strcat(s,t);

  3171             free(t);

  3172             return theline;

  3173 	}

  3174     }

  3175     return NULL;

  3176 }

  3178 int tagcomp(char *strin,char *basetag)

  3179 {

  3180     char *s,*t;

  3181     s=basetag;

  3182     t=strin;

  3183     if (*t=='/')

  3184 	t++; /* ignore a slash */

  3185     while (*s && *t)

  3186     {

  3187         if (tolower(*s)!=tolower(*t))

  3188 	    return 1;

  3189         s++;

  3190 	t++;

  3191     }

  3192     return 0;

  3193 }

  3195 void proghelp()

  3196 {

  3197     fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);

  3198     fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);

  3199     fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);

  3200     fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "

  3201       "For details, read the file COPYING.\n",stderr);

  3202     fputs("This is Free Software; "

  3203       "you may redistribute it under certain conditions (GPL);\n",stderr);

  3204     fputs("read the file COPYING for details.\n\n",stderr);

  3205     fputs("Usage is: bookloupe [-setpxloyhud] filename\n",stderr);

  3206     fputs("  where -s checks single quotes, -e suppresses echoing lines, "

  3207       "-t checks typos\n",stderr);

  3208     fputs("  -x (paranoid) switches OFF -t and extra checks, "

  3209       "-l turns OFF line-end checks\n",stderr);

  3210     fputs("  -o just displays overview without detail, "

  3211       "-h echoes header fields\n",stderr);

  3212     fputs("  -v (verbose) unsuppresses duplicate reporting, "

  3213       "-m suppresses markup\n",stderr);

  3214     fputs("  -d ignores DP-specific markup,\n",stderr);

  3215     fputs("  -u uses a file gutcheck.typ to query user-defined "

  3216       "possible typos\n",stderr);

  3217     fputs("Sample usage: bookloupe warpeace.txt \n",stderr);

  3218     fputs("\n",stderr);

  3219     fputs("Bookloupe looks for errors in Project Gutenberg(TM) etexts.\n",

  3220       stderr);

  3221     fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "

  3222       "non-ASCII\n",stderr);

  3223     fputs("characters like accented letters, "

  3224       "lines longer than 75 or shorter than 55,\n",stderr);

  3225     fputs("unbalanced quotes or brackets, "

  3226       "a variety of badly formatted punctuation, \n",stderr);

  3227     fputs("HTML tags, some likely typos. "

  3228       "It is NOT a substitute for human judgement.\n",stderr);

  3229     fputs("\n",stderr);

  3230 }

author	ali <ali@juiblex.co.uk>
	Sun May 26 22:43:45 2013 +0100 (2013-05-26)
changeset 67	865063352146
parent 66	a5ef278feb34
child 68	adb087007d08
permissions	-rw-r--r--