bookloupe-testing: bookloupe/bookloupe.c@a5ef278feb34

     1 /*************************************************************************/

     2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */

     3 /*                                                                       */

     4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>                  */

     5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>                     */

     6 /*                                                                       */

     7 /* This program is free software; you can redistribute it and/or modify  */

     8 /* it under the terms of the GNU General Public License as published by  */

     9 /* the Free Software Foundation; either version 2 of the License, or     */

    10 /* (at your option) any later version.                                   */

    11 /*                                                                       */

    12 /* This program is distributed in the hope that it will be useful,       */

    13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of        */

    14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the          */

    15 /* GNU General Public License for more details.                          */

    16 /*                                                                       */

    17 /* You should have received a copy of the GNU General Public License     */

    18 /* along with this program. If not, see <http://www.gnu.org/licenses/>.  */

    19 /*************************************************************************/

    21 #include <stdio.h>

    22 #include <stdlib.h>

    23 #include <string.h>

    24 #include <ctype.h>

    26 #define MAXWORDLEN    80    /* max length of one word             */

    27 #define LINEBUFSIZE 2048    /* buffer size for an input line      */

    29 #define MAX_USER_TYPOS 1000

    30 #define USERTYPO_FILE "gutcheck.typ"

    32 #ifndef MAX_PATH

    33 #define MAX_PATH 16384

    34 #endif

    36 char aline[LINEBUFSIZE];

    37 char prevline[LINEBUFSIZE];

    39 /* Common typos. */

    40 char *typo[] = {

    41     "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",

    42     "nad", "te", "ig", "acn",  "ahve", "alot", "anbd", "andt", "awya", "aywa",

    43     "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",

    44     "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",

    45     "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",

    46     "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",

    47     "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",

    48     "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",

    49     "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",

    50     "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",

    51     "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",

    52     "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",

    53     "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",

    54     "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",

    55     "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",

    56     "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",

    57     "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",

    58     "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",

    59     "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",

    60     "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",

    61     "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",

    62     "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",

    63     "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",

    64     "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",

    65     "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",

    66     "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",

    67     "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",

    68     "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",

    69     "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",

    70     "se", ""

    71 };

    73 char *usertypo[MAX_USER_TYPOS];

    75 /* Common abbreviations and other OK words not to query as typos. */

    76 char *okword[] = {

    77     "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",

    78     "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",

    79     "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",

    80     "outbid", "outbids", "frostbite", "frostbitten", ""

    81 };

    83 /* Common abbreviations that cause otherwise unexplained periods. */

    84 char *abbrev[] = {

    85     "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",

    86     "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""

    87 };

    89 /*

    90  * Two-Letter combinations that rarely if ever start words,

    91  * but are common scannos or otherwise common letter combinations.

    92  */

    93 char *nostart[] = {

    94     "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""

    95 };

    97 /*

    98  * Two-Letter combinations that rarely if ever end words,

    99  * but are common scannos or otherwise common letter combinations.

   100  */

   101 char *noend[] = {

   102     "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",

   103     "sw", "gr", "sl", "cl", "iy", ""

   104 };

   106 char *markup[] = {

   107     "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",

   108     "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",

   109     "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",

   110     "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""

   111 };

   113 char *DPmarkup[] = {

   114     "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""

   115 };

   117 char *nocomma[] = {

   118     "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",

   119     "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",

   120     "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",

   121     "during", "let", "toward", "among", ""

   122 };

   124 char *noperiod[] = {

   125     "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",

   126     "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",

   127     "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",

   128     "among", "those", "into", "whom", "having", "thence", ""

   129 };

   131 char vowels[] = "aeiouàáâãäæèéêëìíîïòóôõöùúûü";

   133 struct {

   134     char *htmlent;

   135     char *htmlnum;

   136     char *textent;

   137 } entities[] = {

   138     "&amp;",	"&#38;",     "&",

   139     "&lt;",	"&#60;",     "<",

   140     "&gt;",	"&#62;",     ">",

   141     "&deg;",	"&#176;",    " degrees",

   142     "&pound;",	"&#163;",    "L",

   143     "&quot;",	"&#34;",     "\"", /* quotation mark = APL quote */

   144     "&OElig;",	"&#338;",    "OE", /* latin capital ligature OE */

   145     "&oelig;",	"&#339;",    "oe", /* latin small ligature oe */

   146     "&Scaron;",	"&#352;",    "S", /* latin capital letter S with caron */

   147     "&scaron;",	"&#353;",    "s", /* latin small letter s with caron */

   148     "&Yuml;",	"&#376;",    "Y", /* latin capital letter Y with diaeresis */

   149     "&circ;",	"&#710;",    "",  /* modifier letter circumflex accent */

   150     "&tilde;",	"&#732;",    "~", /* small tilde, U+02DC ISOdia */

   151     "&ensp;",	"&#8194;",   " ", /* en space, U+2002 ISOpub */

   152     "&emsp;",	"&#8195;",   " ", /* em space, U+2003 ISOpub */

   153     "&thinsp;",	"&#8201;",   " ", /* thin space, U+2009 ISOpub */

   154     "&ndash;",	"&#8211;",   "-", /* en dash, U+2013 ISOpub */

   155     "&mdash;",	"&#8212;",   "--", /* em dash, U+2014 ISOpub */

   156     "&rsquo;",	"&#8217;",   "'", /* right single quotation mark */

   157     "&sbquo;",	"&#8218;",   "'", /* single low-9 quotation mark */

   158     "&ldquo;",	"&#8220;",   "\"", /* left double quotation mark */

   159     "&rdquo;",	"&#8221;",   "\"", /* right double quotation mark */

   160     "&bdquo;",	"&#8222;",   "\"", /* double low-9 quotation mark */

   161     "&lsaquo;",	"&#8249;",   "\"", /* single left-pointing angle quotation mark */

   162     "&rsaquo;",	"&#8250;",   "\"", /* single right-pointing angle quotation mark */

   163     "&nbsp;",	"&#160;",    " ", /* no-break space = non-breaking space, */

   164     "&iexcl;",	"&#161;",    "!", /* inverted exclamation mark */

   165     "&cent;",	"&#162;",    "c", /* cent sign */

   166     "&pound;",	"&#163;",    "L", /* pound sign */

   167     "&curren;",	"&#164;",    "$", /* currency sign */

   168     "&yen;",	"&#165;",    "Y", /* yen sign = yuan sign */

   169     "&sect;",	"&#167;",    "--", /* section sign */

   170     "&uml;",	"&#168;",    " ", /* diaeresis = spacing diaeresis */

   171     "&copy;",	"&#169;",    "(C) ", /* copyright sign */

   172     "&ordf;",	"&#170;",    " ", /* feminine ordinal indicator */

   173     "&laquo;",	"&#171;",    "\"", /* left-pointing double angle quotation mark */

   174     "&shy;",	"&#173;",    "-", /* soft hyphen = discretionary hyphen */

   175     "&reg;",	"&#174;",    "(R) ", /* registered sign = registered trade mark sign */

   176     "&macr;",	"&#175;",    " ", /* macron = spacing macron = overline */

   177     "&deg;",	"&#176;",    " degrees", /* degree sign */

   178     "&plusmn;",	"&#177;",    "+-", /* plus-minus sign = plus-or-minus sign */

   179     "&sup2;",	"&#178;",    "2", /* superscript two = superscript digit two */

   180     "&sup3;",	"&#179;",    "3", /* superscript three = superscript digit three */

   181     "&acute;",	"&#180;",    " ", /* acute accent = spacing acute */

   182     "&micro;",	"&#181;",    "m", /* micro sign */

   183     "&para;",	"&#182;",    "--", /* pilcrow sign = paragraph sign */

   184     "&cedil;",	"&#184;",    " ", /* cedilla = spacing cedilla */

   185     "&sup1;",	"&#185;",    "1", /* superscript one = superscript digit one */

   186     "&ordm;",	"&#186;",    " ", /* masculine ordinal indicator */

   187     "&raquo;",	"&#187;",    "\"", /* right-pointing double angle quotation mark */

   188     "&frac14;",	"&#188;",    "1/4", /* vulgar fraction one quarter */

   189     "&frac12;",	"&#189;",    "1/2", /* vulgar fraction one half */

   190     "&frac34;",	"&#190;",    "3/4", /* vulgar fraction three quarters */

   191     "&iquest;",	"&#191;",    "?", /* inverted question mark */

   192     "&Agrave;",	"&#192;",    "A", /* latin capital letter A with grave */

   193     "&Aacute;",	"&#193;",    "A", /* latin capital letter A with acute */

   194     "&Acirc;",	"&#194;",    "A", /* latin capital letter A with circumflex */

   195     "&Atilde;",	"&#195;",    "A", /* latin capital letter A with tilde */

   196     "&Auml;",	"&#196;",    "A", /* latin capital letter A with diaeresis */

   197     "&Aring;",	"&#197;",    "A", /* latin capital letter A with ring above */

   198     "&AElig;",	"&#198;",    "AE", /* latin capital letter AE */

   199     "&Ccedil;",	"&#199;",    "C", /* latin capital letter C with cedilla */

   200     "&Egrave;",	"&#200;",    "E", /* latin capital letter E with grave */

   201     "&Eacute;",	"&#201;",    "E", /* latin capital letter E with acute */

   202     "&Ecirc;",	"&#202;",    "E", /* latin capital letter E with circumflex */

   203     "&Euml;",	"&#203;",    "E", /* latin capital letter E with diaeresis */

   204     "&Igrave;",	"&#204;",    "I", /* latin capital letter I with grave */

   205     "&Iacute;",	"&#205;",    "I", /* latin capital letter I with acute */

   206     "&Icirc;",	"&#206;",    "I", /* latin capital letter I with circumflex */

   207     "&Iuml;",	"&#207;",    "I", /* latin capital letter I with diaeresis */

   208     "&ETH;",	"&#208;",    "E", /* latin capital letter ETH */

   209     "&Ntilde;",	"&#209;",    "N", /* latin capital letter N with tilde */

   210     "&Ograve;",	"&#210;",    "O", /* latin capital letter O with grave */

   211     "&Oacute;",	"&#211;",    "O", /* latin capital letter O with acute */

   212     "&Ocirc;",	"&#212;",    "O", /* latin capital letter O with circumflex */

   213     "&Otilde;",	"&#213;",    "O", /* latin capital letter O with tilde */

   214     "&Ouml;",	"&#214;",    "O", /* latin capital letter O with diaeresis */

   215     "&times;",	"&#215;",    "*", /* multiplication sign */

   216     "&Oslash;",	"&#216;",    "O", /* latin capital letter O with stroke */

   217     "&Ugrave;",	"&#217;",    "U", /* latin capital letter U with grave */

   218     "&Uacute;",	"&#218;",    "U", /* latin capital letter U with acute */

   219     "&Ucirc;",	"&#219;",    "U", /* latin capital letter U with circumflex */

   220     "&Uuml;",	"&#220;",    "U", /* latin capital letter U with diaeresis */

   221     "&Yacute;",	"&#221;",    "Y", /* latin capital letter Y with acute */

   222     "&THORN;",	"&#222;",    "TH", /* latin capital letter THORN */

   223     "&szlig;",	"&#223;",    "sz", /* latin small letter sharp s = ess-zed */

   224     "&agrave;",	"&#224;",    "a", /* latin small letter a with grave */

   225     "&aacute;",	"&#225;",    "a", /* latin small letter a with acute */

   226     "&acirc;",	"&#226;",    "a", /* latin small letter a with circumflex */

   227     "&atilde;",	"&#227;",    "a", /* latin small letter a with tilde */

   228     "&auml;",	"&#228;",    "a", /* latin small letter a with diaeresis */

   229     "&aring;",	"&#229;",    "a", /* latin small letter a with ring above */

   230     "&aelig;",	"&#230;",    "ae", /* latin small letter ae */

   231     "&ccedil;",	"&#231;",    "c", /* latin small letter c with cedilla */

   232     "&egrave;",	"&#232;",    "e", /* latin small letter e with grave */

   233     "&eacute;",	"&#233;",    "e", /* latin small letter e with acute */

   234     "&ecirc;",	"&#234;",    "e", /* latin small letter e with circumflex */

   235     "&euml;",	"&#235;",    "e", /* latin small letter e with diaeresis */

   236     "&igrave;",	"&#236;",    "i", /* latin small letter i with grave */

   237     "&iacute;",	"&#237;",    "i", /* latin small letter i with acute */

   238     "&icirc;",	"&#238;",    "i", /* latin small letter i with circumflex */

   239     "&iuml;",	"&#239;",    "i", /* latin small letter i with diaeresis */

   240     "&eth;",	"&#240;",    "eth", /* latin small letter eth */

   241     "&ntilde;",	"&#241;",    "n", /* latin small letter n with tilde */

   242     "&ograve;",	"&#242;",    "o", /* latin small letter o with grave */

   243     "&oacute;",	"&#243;",    "o", /* latin small letter o with acute */

   244     "&ocirc;",	"&#244;",    "o", /* latin small letter o with circumflex */

   245     "&otilde;",	"&#245;",    "o", /* latin small letter o with tilde */

   246     "&ouml;",	"&#246;",    "o", /* latin small letter o with diaeresis */

   247     "&divide;",	"&#247;",    "/", /* division sign */

   248     "&oslash;",	"&#248;",    "o", /* latin small letter o with stroke */

   249     "&ugrave;",	"&#249;",    "u", /* latin small letter u with grave */

   250     "&uacute;",	"&#250;",    "u", /* latin small letter u with acute */

   251     "&ucirc;",	"&#251;",    "u", /* latin small letter u with circumflex */

   252     "&uuml;",	"&#252;",    "u", /* latin small letter u with diaeresis */

   253     "&yacute;",	"&#253;",    "y", /* latin small letter y with acute */

   254     "&thorn;",	"&#254;",    "th", /* latin small letter thorn */

   255     "&yuml;",	"&#255;",    "y", /* latin small letter y with diaeresis */

   256     "", ""

   257 };

   259 /* special characters */

   260 #define CHAR_SPACE        32

   261 #define CHAR_TAB           9

   262 #define CHAR_LF           10

   263 #define CHAR_CR           13

   264 #define CHAR_DQUOTE       34

   265 #define CHAR_SQUOTE       39

   266 #define CHAR_OPEN_SQUOTE  96

   267 #define CHAR_TILDE       126

   268 #define CHAR_ASTERISK     42

   269 #define CHAR_FORESLASH    47

   270 #define CHAR_CARAT        94

   272 #define CHAR_UNDERSCORE    '_'

   273 #define CHAR_OPEN_CBRACK   '{'

   274 #define CHAR_CLOSE_CBRACK  '}'

   275 #define CHAR_OPEN_RBRACK   '('

   276 #define CHAR_CLOSE_RBRACK  ')'

   277 #define CHAR_OPEN_SBRACK   '['

   278 #define CHAR_CLOSE_SBRACK  ']'

   280 /* longest and shortest normal PG line lengths */

   281 #define LONGEST_PG_LINE   75

   282 #define WAY_TOO_LONG      80

   283 #define SHORTEST_PG_LINE  55

   285 #define SWITCHES "ESTPXLOYHWVMUD" /* switches:-                            */

   286                                   /*     D - ignore DP-specific markup     */

   287                                   /*     E - echo queried line             */

   288                                   /*     S - check single quotes           */

   289                                   /*     T - check common typos            */

   290                                   /*     P - require closure of quotes on  */

   291                                   /*         every paragraph               */

   292                                   /*     X - "Trust no one" :-) Paranoid!  */

   293                                   /*         Queries everything            */

   294                                   /*     L - line end checking defaults on */

   295                                   /*         -L turns it off               */

   296                                   /*     O - overview. Just shows counts.  */

   297                                   /*     Y - puts errors to stdout         */

   298                                   /*         instead of stderr             */

   299                                   /*     H - Echoes header fields          */

   300                                   /*     M - Ignore markup in < >          */

   301                                   /*     U - Use file of User-defined Typos*/

   302                                   /*     W - Defaults for use on Web upload*/

   303                                   /*     V - Verbose - list EVERYTHING!    */

   304 #define SWITNO 14                 /* max number of switch parms            */

   305                                   /*        - used for defining array-size */

   306 #define MINARGS   1               /* minimum no of args excl switches      */

   307 #define MAXARGS   1               /* maximum no of args excl switches      */

   309 int pswit[SWITNO];                /* program switches set by SWITCHES      */

   311 #define ECHO_SWITCH      0

   312 #define SQUOTE_SWITCH    1

   313 #define TYPO_SWITCH      2

   314 #define QPARA_SWITCH     3

   315 #define PARANOID_SWITCH  4

   316 #define LINE_END_SWITCH  5

   317 #define OVERVIEW_SWITCH  6

   318 #define STDOUT_SWITCH    7

   319 #define HEADER_SWITCH    8

   320 #define WEB_SWITCH       9

   321 #define VERBOSE_SWITCH   10

   322 #define MARKUP_SWITCH    11

   323 #define USERTYPO_SWITCH  12

   324 #define DP_SWITCH        13

   326 long cnt_dquot;       /* for overview mode, count of doublequote queries */

   327 long cnt_squot;       /* for overview mode, count of singlequote queries */

   328 long cnt_brack;       /* for overview mode, count of brackets queries */

   329 long cnt_bin;         /* for overview mode, count of non-ASCII queries */

   330 long cnt_odd;         /* for overview mode, count of odd character queries */

   331 long cnt_long;        /* for overview mode, count of long line errors */

   332 long cnt_short;       /* for overview mode, count of short line queries */

   333 long cnt_punct;       /* for overview mode, count of punctuation and spacing queries */

   334 long cnt_dash;        /* for overview mode, count of dash-related queries */

   335 long cnt_word;        /* for overview mode, count of word queries */

   336 long cnt_html;        /* for overview mode, count of html queries */

   337 long cnt_lineend;     /* for overview mode, count of line-end queries */

   338 long cnt_spacend;     /* count of lines with space at end */

   339 long linecnt;         /* count of total lines in the file */

   340 long checked_linecnt; /* count of lines actually checked */

   342 void proghelp(void);

   343 void procfile(char *);

   345 #define LOW_THRESHOLD    0

   346 #define HIGH_THRESHOLD   1

   348 #define START 0

   349 #define END 1

   350 #define PREV 0

   351 #define NEXT 1

   352 #define FIRST_OF_PAIR 0

   353 #define SECOND_OF_PAIR 1

   355 #define MAX_WORDPAIR 1000

   357 char running_from[MAX_PATH];

   359 int mixdigit(char *);

   360 const char *getaword(const char *,char *);

   361 int matchword(char *,char *);

   362 char *flgets(char *,int,FILE *,long);

   363 void lowerit(char *);

   364 int gcisalpha(unsigned char);

   365 int gcisdigit(unsigned char);

   366 int gcisletter(unsigned char);

   367 char *gcstrchr(char *s,char c);

   368 void postprocess_for_HTML(char *);

   369 char *linehasmarkup(char *);

   370 char *losemarkup(char *);

   371 int tagcomp(char *,char *);

   372 char *loseentities(char *);

   373 int isroman(char *);

   374 int usertypo_count;

   375 void postprocess_for_DP(char *);

   377 char wrk[LINEBUFSIZE];

   379 #define MAX_QWORD 50

   380 #define MAX_QWORD_LENGTH 40

   381 char qword[MAX_QWORD][MAX_QWORD_LENGTH];

   382 signed int dupcnt[MAX_QWORD];

   384 int main(int argc,char **argv)

   385 {

   386     char *argsw,*s;

   387     int i,switno,invarg;

   388     char usertypo_file[MAX_PATH];

   389     FILE *usertypofile;

   390     if (strlen(argv[0])<sizeof(running_from))

   391 	/* save the path to the executable */

   392         strcpy(running_from,argv[0]);

   393     /* find out what directory we're running from */

   394     s=running_from+strlen(running_from);

   395     for (;*s!='/' && *s!='\\' && s>=running_from;s--)

   396         *s=0;

   397     switno=strlen(SWITCHES);

   398     for (i=switno;--i>0;)

   399         pswit[i]=0;           /* initialise switches */

   400     /*

   401      * Standard loop to extract switches.

   402      * When we come out of this loop, the arguments will be

   403      * in argv[0] upwards and the switches used will be

   404      * represented by their equivalent elements in pswit[]

   405      */

   406     while (--argc>0 && **++argv=='-')

   407         for (argsw=argv[0]+1;*argsw!='\0';argsw++)

   408             for (i=switno,invarg=1;(--i>=0) && invarg==1;)

   409                 if ((toupper(*argsw))==SWITCHES[i])

   410 		{

   411                     invarg=0;

   412                     pswit[i]=1;

   413 		}

   414     /* Paranoid checking is turned OFF, not on, by its switch */

   415     pswit[PARANOID_SWITCH]^=1;

   416     if (pswit[PARANOID_SWITCH])

   417 	/* if running in paranoid mode force typo checks as well   */

   418         pswit[TYPO_SWITCH]=pswit[TYPO_SWITCH]^1;

   419     /* Line-end checking is turned OFF, not on, by its switch */

   420     pswit[LINE_END_SWITCH]^=1;

   421     /* Echoing is turned OFF, not on, by its switch */

   422     pswit[ECHO_SWITCH]^=1;

   423     if (pswit[OVERVIEW_SWITCH])

   424 	/* just print summary; don't echo */

   425         pswit[ECHO_SWITCH]=0;

   426     /*

   427      * Web uploads - for the moment, this is really just a placeholder

   428      * until we decide what processing we really want to do on web uploads

   429      */

   430     if (pswit[WEB_SWITCH])

   431     {

   432 	/* specific override for web uploads */

   433         pswit[ECHO_SWITCH]=1;

   434         pswit[SQUOTE_SWITCH]=0;

   435         pswit[TYPO_SWITCH]=1;

   436         pswit[QPARA_SWITCH]=0;

   437         pswit[PARANOID_SWITCH]=1;

   438         pswit[LINE_END_SWITCH]=0;

   439         pswit[OVERVIEW_SWITCH]=0;

   440         pswit[STDOUT_SWITCH]=0;

   441         pswit[HEADER_SWITCH]=1;

   442         pswit[VERBOSE_SWITCH]=0;

   443         pswit[MARKUP_SWITCH]=0;

   444         pswit[USERTYPO_SWITCH]=0;

   445         pswit[DP_SWITCH]=0;

   446     }

   447     if (argc<MINARGS || argc>MAXARGS)

   448     {

   449 	/* check number of args */

   450         proghelp();

   451         return 1;

   452     }

   453     /* read in the user-defined stealth scanno list */

   454     if (pswit[USERTYPO_SWITCH])

   455     {

   456 	/* ... we were told we had one! */

   457         usertypofile=fopen(USERTYPO_FILE,"rb");

   458         if (!usertypofile)

   459 	{

   460 	    /* not in cwd. try excuteable directory. */

   461             strcpy(usertypo_file,running_from);

   462             strcat(usertypo_file,USERTYPO_FILE);

   463             usertypofile=fopen(usertypo_file,"rb");

   464             if (!usertypofile) {

   465 		/* we ain't got no user typo file! */

   466                 printf("   --> I couldn't find gutcheck.typ "

   467 		  "-- proceeding without user typos.\n");

   468 	    }

   469 	}

   470         usertypo_count=0;

   471         if (usertypofile)

   472 	{

   473 	    /* we managed to open a User Typo File! */

   474             if (pswit[USERTYPO_SWITCH])

   475 	    {

   476                 while (flgets(aline,LINEBUFSIZE-1,usertypofile,

   477 		  (long)usertypo_count))

   478 		{

   479                     if (strlen(aline)>1)

   480 		    {

   481                         if ((int)*aline>33)

   482 			{

   483                             s=malloc(strlen(aline)+1);

   484                             if (!s)

   485 			    {

   486                                 fprintf(stderr,"bookloupe: cannot get enough "

   487 				  "memory for user typo file!\n");

   488                                 exit(1);

   489 			    }

   490                             strcpy(s,aline);

   491                             usertypo[usertypo_count]=s;

   492                             usertypo_count++;

   493                             if (usertypo_count>=MAX_USER_TYPOS)

   494 			    {

   495                                 printf("   --> Only %d user-defined typos "

   496 				  "allowed: ignoring the rest\n",

   497 				  MAX_USER_TYPOS);

   498                                 break;

   499 			    }

   500 			}

   501 		    }

   502 		}

   503 	    }

   504             fclose(usertypofile);

   505 	}

   506     }

   507     fprintf(stderr,"bookloupe: Check and report on an e-text\n");

   508     cnt_dquot=cnt_squot=cnt_brack=cnt_bin=cnt_odd=cnt_long=

   509     cnt_short=cnt_punct=cnt_dash=cnt_word=cnt_html=cnt_lineend=

   510     cnt_spacend=0;

   511     procfile(argv[0]);

   512     if (pswit[OVERVIEW_SWITCH])

   513     {

   514 	printf("    Checked %ld lines of %ld (head+foot = %ld)\n\n",

   515 	  checked_linecnt,linecnt,linecnt-checked_linecnt);

   516         printf("    --------------- Queries found --------------\n");

   517         if (cnt_long)

   518 	    printf("    Long lines:                    %14ld\n",cnt_long);

   519         if (cnt_short)

   520 	    printf("    Short lines:                   %14ld\n",cnt_short);

   521         if (cnt_lineend)

   522 	    printf("    Line-end problems:             %14ld\n",cnt_lineend);

   523         if (cnt_word)

   524 	    printf("    Common typos:                  %14ld\n",cnt_word);

   525         if (cnt_dquot)

   526 	    printf("    Unmatched quotes:              %14ld\n",cnt_dquot);

   527         if (cnt_squot)

   528 	    printf("    Unmatched SingleQuotes:        %14ld\n",cnt_squot);

   529         if (cnt_brack)

   530 	    printf("    Unmatched brackets:            %14ld\n",cnt_brack);

   531         if (cnt_bin)

   532 	    printf("    Non-ASCII characters:          %14ld\n",cnt_bin);

   533         if (cnt_odd)

   534 	    printf("    Proofing characters:           %14ld\n",cnt_odd);

   535         if (cnt_punct)

   536 	    printf("    Punctuation & spacing queries: %14ld\n",cnt_punct);

   537         if (cnt_dash)

   538 	    printf("    Non-standard dashes:           %14ld\n",cnt_dash);

   539         if (cnt_html)

   540 	    printf("    Possible HTML tags:            %14ld\n",cnt_html);

   541         printf("\n");

   542         printf("    TOTAL QUERIES                  %14ld\n",

   543           cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+

   544           cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend);

   545     }

   546     return 0;

   547 }

   549 struct first_pass_results {

   550     long firstline,astline;

   551     long footerline,totlen,binlen,alphalen,endquote_count,shortline,dotcomma;

   552     long fslashline,hyphens,longline,verylongline,htmcount,standalone_digit;

   553     long spacedash,emdash,space_emdash,non_PG_space_emdash,PG_space_emdash;

   554     signed int Dutchcount,Frenchcount;

   555 };

   557 /*

   558  * first_pass:

   559  *

   560  * Run a first pass - verify that it's a valid PG

   561  * file, decide whether to report some things that

   562  * occur many times in the text like long or short

   563  * lines, non-standard dashes, etc.

   564  */

   565 struct first_pass_results *first_pass(FILE *infile)

   566 {

   567     char laststart=CHAR_SPACE;

   568     const char *s;

   569     signed int i,llen;

   570     unsigned int lastlen=0,lastblen=0;

   571     long spline=0,nspline=0;

   572     static struct first_pass_results results={0};

   573     char inword[MAXWORDLEN]="";

   574     while (fgets(aline,LINEBUFSIZE-1,infile))

   575     {

   576         while (aline[strlen(aline)-1]==10 || aline[strlen(aline)-1]==13)

   577 	    aline[strlen(aline)-1]=0;

   578         linecnt++;

   579         if (strstr(aline,"*END") && strstr(aline,"SMALL PRINT") &&

   580 	  (strstr(aline,"PUBLIC DOMAIN") || strstr(aline,"COPYRIGHT")))

   581 	{

   582             if (spline)

   583                 printf("   --> Duplicate header?\n");

   584             spline=linecnt+1;   /* first line of non-header text, that is */

   585 	}

   586         if (!strncmp(aline,"*** START",9) && strstr(aline,"PROJECT GUTENBERG"))

   587 	{

   588             if (nspline)

   589                 printf("   --> Duplicate header?\n");

   590             nspline=linecnt+1;   /* first line of non-header text, that is */

   591 	}

   592         if (spline || nspline)

   593 	{

   594             lowerit(aline);

   595             if (strstr(aline,"end") && strstr(aline,"project gutenberg"))

   596 	    {

   597                 if (strstr(aline,"end")<strstr(aline,"project gutenberg"))

   598 		{

   599                     if (results.footerline)

   600 		    {

   601 			/* it's an old-form header - we can detect duplicates */

   602                         if (!nspline)

   603                             printf("   --> Duplicate footer?\n");

   604 		    }

   605                     else

   606                         results.footerline=linecnt;

   607 		}

   608 	    }

   609 	}

   610         if (spline)

   611 	    results.firstline=spline;

   612         if (nspline)

   613 	    results.firstline=nspline;  /* override with new */

   614         if (results.footerline)

   615 	    continue;    /* don't count the boilerplate in the footer */

   616         llen=strlen(aline);

   617         results.totlen+=llen;

   618         for (i=0;i<llen;i++)

   619 	{

   620             if ((unsigned char)aline[i]>127)

   621 		results.binlen++;

   622             if (gcisalpha(aline[i]))

   623 		results.alphalen++;

   624             if (i>0 && aline[i]==CHAR_DQUOTE && isalpha(aline[i-1]))

   625 		results.endquote_count++;

   626 	}

   627         if (strlen(aline)>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE &&

   628 	  lastblen>2 && lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)

   629 	    results.shortline++;

   630         if (*aline && (unsigned char)aline[strlen(aline)-1]<=CHAR_SPACE)

   631 	    cnt_spacend++;

   632         if (strstr(aline,".,"))

   633 	    results.dotcomma++;

   634         /* only count ast lines for ignoring purposes where there is */

   635         /* locase text on the line */

   636         if (strstr(aline,"*"))

   637 	{

   638             for (s=aline;*s;s++)

   639                 if (*s>='a' && *s<='z')

   640                     break;

   641              if (*s)

   642 		results.astline++;

   643 	}

   644         if (strstr(aline,"/"))

   645             results.fslashline++;

   646         for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--)

   647 	    ;

   648         if (aline[i]=='-' && aline[i-1]!='-')

   649 	    results.hyphens++;

   650         if (llen>LONGEST_PG_LINE)

   651 	    results.longline++;

   652         if (llen>WAY_TOO_LONG)

   653 	    results.verylongline++;

   654         if (strstr(aline,"<") && strstr(aline,">"))

   655 	{

   656             i=(signed int)(strstr(aline,">")-strstr(aline,"<")+1);

   657             if (i>0)

   658                 results.htmcount++;

   659             if (strstr(aline,"<i>"))

   660 		results.htmcount+=4; /* bonus marks! */

   661 	}

   662         /* Check for spaced em-dashes */

   663         if (strstr(aline,"--"))

   664 	{

   665             results.emdash++;

   666             if (*(strstr(aline,"--")-1)==CHAR_SPACE ||

   667                (*(strstr(aline,"--")+2)==CHAR_SPACE))

   668 		results.space_emdash++;

   669             if (*(strstr(aline,"--")-1)==CHAR_SPACE &&

   670                (*(strstr(aline,"--")+2)==CHAR_SPACE))

   671 		/* count of em-dashes with spaces both sides */

   672 		results.non_PG_space_emdash++;

   673             if (*(strstr(aline,"--")-1)!=CHAR_SPACE &&

   674                (*(strstr(aline,"--")+2)!=CHAR_SPACE))

   675 		/* count of PG-type em-dashes with no spaces */

   676 		results.PG_space_emdash++;

   677 	}

   678         for (s=aline;*s;)

   679 	{

   680             s=getaword(s,inword);

   681             if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))

   682                 results.Dutchcount++;

   683             if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))

   684                 results.Frenchcount++;

   685             if (!strcmp(inword,"0") || !strcmp(inword,"1"))

   686                 results.standalone_digit++;

   687 	}

   688         /* Check for spaced dashes */

   689         if (strstr(aline," -") && *(strstr(aline," -")+2)!='-')

   690 	    results.spacedash++;

   691         lastblen=lastlen;

   692         lastlen=strlen(aline);

   693         laststart=aline[0];

   694     }

   695     return &results;

   696 }

   698 struct warnings {

   699     signed int shortline,longline,bin,dash,dotcomma,ast,fslash,digit,hyphen;

   700     signed int endquote,isDutch,isFrench;

   701 };

   703 /*

   704  * report_first_pass:

   705  *

   706  * Make some snap decisions based on the first pass results.

   707  */

   708 struct warnings *report_first_pass(struct first_pass_results *results)

   709 {

   710     static struct warnings warnings={0};

   711     if (cnt_spacend>0)

   712         printf("   --> %ld lines in this file have white space at end\n",

   713 	  cnt_spacend);

   714     warnings.dotcomma=1;

   715     if (results->dotcomma>5)

   716     {

   717         warnings.dotcomma=0;

   718         printf("   --> %ld lines in this file contain '.,'. "

   719 	  "Not reporting them.\n",results->dotcomma);

   720     }

   721     /*

   722      * If more than 50 lines, or one-tenth, are short,

   723      * don't bother reporting them.

   724      */

   725     warnings.shortline=1;

   726     if (results->shortline>50 || results->shortline*10>linecnt)

   727     {

   728         warnings.shortline=0;

   729         printf("   --> %ld lines in this file are short. "

   730 	  "Not reporting short lines.\n",results->shortline);

   731     }

   732     /*

   733      * If more than 50 lines, or one-tenth, are long,

   734      * don't bother reporting them.

   735      */

   736     warnings.longline=1;

   737     if (results->longline>50 || results->longline*10>linecnt)

   738     {

   739         warnings.longline=0;

   740         printf("   --> %ld lines in this file are long. "

   741 	  "Not reporting long lines.\n",results->longline);

   742     }

   743     /* If more than 10 lines contain asterisks, don't bother reporting them. */

   744     warnings.ast=1;

   745     if (results->astline>10)

   746     {

   747         warnings.ast=0;

   748         printf("   --> %ld lines in this file contain asterisks. "

   749 	  "Not reporting them.\n",results->astline);

   750     }

   751     /*

   752      * If more than 10 lines contain forward slashes,

   753      * don't bother reporting them.

   754      */

   755     warnings.fslash=1;

   756     if (results->fslashline>10)

   757     {

   758         warnings.fslash=0;

   759         printf("   --> %ld lines in this file contain forward slashes. "

   760 	  "Not reporting them.\n",results->fslashline);

   761     }

   762     /*

   763      * If more than 20 lines contain unpunctuated endquotes,

   764      * don't bother reporting them.

   765      */

   766     warnings.endquote=1;

   767     if (results->endquote_count>20)

   768     {

   769         warnings.endquote=0;

   770         printf("   --> %ld lines in this file contain unpunctuated endquotes. "

   771 	  "Not reporting them.\n",results->endquote_count);

   772     }

   773     /*

   774      * If more than 15 lines contain standalone digits,

   775      * don't bother reporting them.

   776      */

   777     warnings.digit=1;

   778     if (results->standalone_digit>10)

   779     {

   780         warnings.digit=0;

   781         printf("   --> %ld lines in this file contain standalone 0s and 1s. "

   782 	  "Not reporting them.\n",results->standalone_digit);

   783     }

   784     /*

   785      * If more than 20 lines contain hyphens at end,

   786      * don't bother reporting them.

   787      */

   788     warnings.hyphen=1;

   789     if (results->hyphens>20)

   790     {

   791         warnings.hyphen=0;

   792         printf("   --> %ld lines in this file have hyphens at end. "

   793 	  "Not reporting them.\n",results->hyphens);

   794     }

   795     if (results->htmcount>20 && !pswit[MARKUP_SWITCH])

   796     {

   797         printf("   --> Looks like this is HTML. Switching HTML mode ON.\n");

   798         pswit[MARKUP_SWITCH]=1;

   799     }

   800     if (results->verylongline>0)

   801         printf("   --> %ld lines in this file are VERY long!\n",

   802 	  results->verylongline);

   803     /*

   804      * If there are more non-PG spaced dashes than PG em-dashes,

   805      * assume it's deliberate.

   806      * Current PG guidelines say don't use them, but older texts do,

   807      * and some people insist on them whatever the guidelines say.

   808      */

   809     warnings.dash=1;

   810     if (results->spacedash+results->non_PG_space_emdash>

   811       results->PG_space_emdash)

   812     {

   813         warnings.dash=0;

   814         printf("   --> There are %ld spaced dashes and em-dashes. "

   815 	  "Not reporting them.\n",

   816 	  results->spacedash+results->non_PG_space_emdash);

   817     }

   818     /* If more than a quarter of characters are hi-bit, bug out. */

   819     warnings.bin=1;

   820     if (results->binlen*4>results->totlen)

   821     {

   822         printf("   --> This file does not appear to be ASCII. "

   823 	  "Terminating. Best of luck with it!\n");

   824         exit(1);

   825     }

   826     if (results->alphalen*4<results->totlen)

   827     {

   828         printf("   --> This file does not appear to be text. "

   829 	  "Terminating. Best of luck with it!\n");

   830         exit(1);

   831     }

   832     if (results->binlen*100>results->totlen || results->binlen>100)

   833     {

   834         printf("   --> There are a lot of foreign letters here. "

   835 	  "Not reporting them.\n");

   836         warnings.bin=0;

   837     }

   838     warnings.isDutch=0;

   839     if (results->Dutchcount>50)

   840     {

   841         warnings.isDutch=1;

   842         printf("   --> This looks like Dutch - "

   843 	  "switching off dashes and warnings for 's Middags case.\n");

   844     }

   845     warnings.isFrench=0;

   846     if (results->Frenchcount>50)

   847     {

   848         warnings.isFrench=1;

   849         printf("   --> This looks like French - "

   850 	  "switching off some doublepunct.\n");

   851     }

   852     if (results->firstline && results->footerline)

   853         printf("    The PG header and footer appear to be already on.\n");

   854     else

   855     {

   856         if (results->firstline)

   857             printf("    The PG header is on - no footer.\n");

   858         if (results->footerline)

   859             printf("    The PG footer is on - no header.\n");

   860     }

   861     printf("\n");

   862     if (pswit[VERBOSE_SWITCH])

   863     {

   864         warnings.bin=1;

   865         warnings.shortline=1;

   866         warnings.dotcomma=1;

   867         warnings.longline=1;

   868         warnings.dash=1;

   869         warnings.digit=1;

   870         warnings.ast=1;

   871         warnings.fslash=1;

   872         warnings.hyphen=1;

   873         warnings.endquote=1;

   874         printf("   *** Verbose output is ON -- you asked for it! ***\n");

   875     }

   876     if (warnings.isDutch)

   877         warnings.dash=0;

   878     if (results->footerline>0 && results->firstline>0 &&

   879       results->footerline>results->firstline &&

   880       results->footerline-results->firstline<100)

   881     {

   882         printf("   --> I don't really know where this text starts. \n");

   883         printf("       There are no reference points.\n");

   884         printf("       I'm going to have to report the header and footer "

   885 	  "as well.\n");

   886         results->firstline=0;

   887     }

   888     return &warnings;

   889 }

   891 struct counters {

   892     long quot;

   893     signed int c_unders,c_brack,s_brack,r_brack;

   894     signed int open_single_quote,close_single_quote;

   895 };

   897 /*

   898  * analyse_quotes:

   899  *

   900  * Look along the line, accumulate the count of quotes, and see

   901  * if this is an empty line - i.e. a line with nothing on it

   902  * but spaces.

   903  * If line has just spaces, period, * and/or - on it, don't

   904  * count it, since empty lines with asterisks or dashes to

   905  * separate sections are common.

   906  *

   907  * Returns: Non-zero if the line is empty.

   908  */

   909 int analyse_quotes(const char *s,struct counters *counters)

   910 {

   911     signed int guessquote=0;

   912     int isemptyline=1;    /* assume the line is empty until proven otherwise */

   913     while (*s)

   914     {

   915 	if (*s==CHAR_DQUOTE)

   916 	    counters->quot++;

   917 	if (*s==CHAR_SQUOTE || *s==CHAR_OPEN_SQUOTE)

   918 	{

   919 	    if (s==aline)

   920 	    {

   921 		/*

   922 		 * At start of line, it can only be an openquote.

   923 		 * Hardcode a very common exception!

   924 		 */

   925 		if (strncmp(s+2,"tis",3) && strncmp(s+2,"Tis",3))

   926 		    counters->open_single_quote++;

   927 	    }

   928 	    else if (gcisalpha(s[-1]) && gcisalpha(s[1]))

   929 		/* Do nothing! it's definitely an apostrophe, not a quote */

   930 		;

   931 	    /* it's outside a word - let's check it out */

   932 	    else if (*s==CHAR_OPEN_SQUOTE || gcisalpha(s[1]))

   933 	    {

   934 		/* it damwell better BE an openquote */

   935 		if (strncmp(s+1,"tis",3) && strncmp(s+1,"Tis",3))

   936 		    /* hardcode a very common exception! */

   937 		    counters->open_single_quote++;

   938 	    }

   939 	    else

   940 	    {

   941 		/* now - is it a closequote? */

   942 		guessquote=0;   /* accumulate clues */

   943 		if (gcisalpha(s[-1]))

   944 		{

   945 		    /* it follows a letter - could be either */

   946 		    guessquote++;

   947 		    if (s[-1]=='s')

   948 		    {

   949 			/* looks like a plural apostrophe */

   950 			guessquote-=3;

   951 			if (s[1]==CHAR_SPACE)  /* bonus marks! */

   952 			    guessquote-=2;

   953 		    }

   954 		}

   955 		/* it doesn't have a letter either side */

   956 		else if (strchr(".?!,;:",s[-1]) && strchr(".?!,;: ",s[1]))

   957 		    guessquote+=8; /* looks like a closequote */

   958 		else

   959 		    guessquote++;

   960 		if (counters->open_single_quote>counters->close_single_quote)

   961 		    /*

   962 		     * Give it the benefit of some doubt,

   963 		     * if a squote is already open.

   964 		     */

   965 		    guessquote++;

   966 		else

   967 		    guessquote--;

   968 		if (guessquote>=0)

   969 		    counters->close_single_quote++;

   970 	    }

   971 	}

   972 	if (*s!=CHAR_SPACE && *s!='-' && *s!='.' && *s!=CHAR_ASTERISK &&

   973 	  *s!=13 && *s!=10)

   974 	    isemptyline=0;  /* ignore lines like  *  *  *  as spacers */

   975 	if (*s==CHAR_UNDERSCORE)

   976 	    counters->c_unders++;

   977 	if (*s==CHAR_OPEN_CBRACK)

   978 	    counters->c_brack++;

   979 	if (*s==CHAR_CLOSE_CBRACK)

   980 	    counters->c_brack--;

   981 	if (*s==CHAR_OPEN_RBRACK)

   982 	    counters->r_brack++;

   983 	if (*s==CHAR_CLOSE_RBRACK)

   984 	    counters->r_brack--;

   985 	if (*s==CHAR_OPEN_SBRACK)

   986 	    counters->s_brack++;

   987 	if (*s==CHAR_CLOSE_SBRACK)

   988 	    counters->s_brack--;

   989 	s++;

   990     }

   991     return isemptyline;

   992 }

   994 /*

   995  * check_for_odd_characters:

   996  *

   997  * Check for binary and other odd characters.

   998  */

   999 void check_for_odd_characters(const char *aline,const struct warnings *warnings,

  1000   int isemptyline)

  1001 {

  1002     /* Don't repeat multiple warnings on one line. */

  1003     signed int eNon_A=0,eTab=0,eTilde=0,eCarat=0,eFSlash=0,eAst=0;

  1004     const char *s;

  1005     unsigned char c;

  1006     for (s=aline;*s;s++)

  1007     {

  1008 	c=*(unsigned char *)s;

  1009 	if (!eNon_A && (*s<CHAR_SPACE && *s!=9 && *s!='\n' || c>127))

  1010 	{

  1011 	    if (pswit[ECHO_SWITCH])

  1012 		printf("\n%s\n",aline);

  1013 	    if (!pswit[OVERVIEW_SWITCH])

  1014 		if (c>127 && c<160)

  1015 		    printf("    Line %ld column %d - "

  1016 		      "Non-ISO-8859 character %d\n",linecnt,(int)(s-aline)+1,c);

  1017 		else

  1018 		    printf("    Line %ld column %d - Non-ASCII character %d\n",

  1019 		      linecnt,(int)(s-aline)+1,c);

  1020 	    else

  1021 		cnt_bin++;

  1022 	    eNon_A=1;

  1023 	}

  1024 	if (!eTab && *s==CHAR_TAB)

  1025 	{

  1026 	    if (pswit[ECHO_SWITCH])

  1027 		printf("\n%s\n",aline);

  1028 	    if (!pswit[OVERVIEW_SWITCH])

  1029 		printf("    Line %ld column %d - Tab character?\n",

  1030 		  linecnt,(int)(s-aline)+1);

  1031 	    else

  1032 		cnt_odd++;

  1033 	    eTab=1;

  1034 	}

  1035 	if (!eTilde && *s==CHAR_TILDE)

  1036 	{

  1037 	    /*

  1038 	     * Often used by OCR software to indicate an

  1039 	     * unrecognizable character.

  1040 	     */

  1041 	    if (pswit[ECHO_SWITCH])

  1042 		printf("\n%s\n",aline);

  1043 	    if (!pswit[OVERVIEW_SWITCH])

  1044 		printf("    Line %ld column %d - Tilde character?\n",

  1045 		  linecnt,(int)(s-aline)+1);

  1046 	    else

  1047 		cnt_odd++;

  1048 	    eTilde=1;

  1049 	}

  1050 	if (!eCarat && *s==CHAR_CARAT)

  1051 	{

  1052 	    if (pswit[ECHO_SWITCH])

  1053 		printf("\n%s\n",aline);

  1054 	    if (!pswit[OVERVIEW_SWITCH])

  1055 		printf("    Line %ld column %d - Carat character?\n",

  1056 		  linecnt,(int)(s-aline)+1);

  1057 	    else

  1058 		cnt_odd++;

  1059 	    eCarat=1;

  1060 	}

  1061 	if (!eFSlash && *s==CHAR_FORESLASH && warnings->fslash)

  1062 	{

  1063 	    if (pswit[ECHO_SWITCH])

  1064 		printf("\n%s\n",aline);

  1065 	    if (!pswit[OVERVIEW_SWITCH])

  1066 		printf("    Line %ld column %d - Forward slash?\n",

  1067 		  linecnt,(int)(s-aline)+1);

  1068 	    else

  1069 		cnt_odd++;

  1070 	    eFSlash=1;

  1071 	}

  1072 	/*

  1073 	 * Report asterisks only in paranoid mode,

  1074 	 * since they're often deliberate.

  1075 	 */

  1076 	if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&

  1077 	  *s==CHAR_ASTERISK)

  1078 	{

  1079 	    if (pswit[ECHO_SWITCH])

  1080 		printf("\n%s\n",aline);

  1081 	    if (!pswit[OVERVIEW_SWITCH])

  1082 		printf("    Line %ld column %d - Asterisk?\n",

  1083 		  linecnt,(int)(s-aline)+1);

  1084 	    else

  1085 		cnt_odd++;

  1086 	    eAst=1;

  1087 	}

  1088     }

  1089 }

  1091 /*

  1092  * check_for_long_line:

  1093  *

  1094  * Check for line too long.

  1095  */

  1096 void check_for_long_line(const char *aline)

  1097 {

  1098     if (strlen(aline)>LONGEST_PG_LINE)

  1099     {

  1100 	if (pswit[ECHO_SWITCH])

  1101 	    printf("\n%s\n",aline);

  1102 	if (!pswit[OVERVIEW_SWITCH])

  1103 	    printf("    Line %ld column %d - Long line %d\n",

  1104 	      linecnt,strlen(aline),strlen(aline));

  1105 	else

  1106 	    cnt_long++;

  1107     }

  1108 }

  1110 struct line_properties {

  1111     unsigned int len,blen;

  1112     char start;

  1113 };

  1115 /*

  1116  * check_for_short_line:

  1117  *

  1118  * Check for line too short.

  1119  *

  1120  * This one is a bit trickier to implement: we don't want to

  1121  * flag the last line of a paragraph for being short, so we

  1122  * have to wait until we know that our current line is a

  1123  * "normal" line, then report the _previous_ line if it was too

  1124  * short. We also don't want to report indented lines like

  1125  * chapter heads or formatted quotations. We therefore keep

  1126  * last->len as the length of the last line examined, and

  1127  * last->blen as the length of the last but one, and try to

  1128  * suppress unnecessary warnings by checking that both were of

  1129  * "normal" length. We keep the first character of the last

  1130  * line in last->start, and if it was a space, we assume that

  1131  * the formatting is deliberate. I can't figure out a way to

  1132  * distinguish something like a quoted verse left-aligned or

  1133  * the header or footer of a letter from a paragraph of short

  1134  * lines - maybe if I examined the whole paragraph, and if the

  1135  * para has less than, say, 8 lines and if all lines are short,

  1136  * then just assume it's OK? Need to look at some texts to see

  1137  * how often a formula like this would get the right result.

  1138  */

  1139 void check_for_short_line(const char *aline,const struct line_properties *last)

  1140 {

  1141     if (strlen(aline)>1 && last->len>1 && last->len<SHORTEST_PG_LINE &&

  1142       last->blen>1 && last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)

  1143     {

  1144 	if (pswit[ECHO_SWITCH])

  1145 	    printf("\n%s\n",prevline);

  1146 	if (!pswit[OVERVIEW_SWITCH])

  1147 	    printf("    Line %ld column %d - Short line %d?\n",

  1148 	      linecnt-1,strlen(prevline),strlen(prevline));

  1149 	else

  1150 	    cnt_short++;

  1151     }

  1152 }

  1154 /*

  1155  * check_for_starting_punctuation:

  1156  *

  1157  * Look for punctuation other than full ellipses at start of line.

  1158  */

  1159 void check_for_starting_punctuation(const char *aline)

  1160 {

  1161     if (*aline && strchr(".?!,;:",aline[0]) && strncmp(". . .",aline,5))

  1162     {

  1163 	if (pswit[ECHO_SWITCH])

  1164 	    printf("\n%s\n",aline);

  1165 	if (!pswit[OVERVIEW_SWITCH])

  1166 	    printf("    Line %ld column 1 - Begins with punctuation?\n",

  1167 	      linecnt);

  1168 	else

  1169 	    cnt_punct++;

  1170     }

  1171 }

  1173 /*

  1174  * check_for_spaced_emdash:

  1175  *

  1176  * Check for spaced em-dashes.

  1177  *

  1178  * We must check _all_ occurrences of "--" on the line

  1179  * hence the loop - even if the first double-dash is OK

  1180  * there may be another that's wrong later on.

  1181  */

  1182 void check_for_spaced_emdash(const char *aline)

  1183 {

  1184     const char *s,*t;

  1185     s=aline;

  1186     while ((t=strstr(s,"--")))

  1187     {

  1188 	if (t>aline && t[-1]==CHAR_SPACE || t[2]==CHAR_SPACE)

  1189 	{

  1190 	    if (pswit[ECHO_SWITCH])

  1191 		printf("\n%s\n",aline);

  1192 	    if (!pswit[OVERVIEW_SWITCH])

  1193 		printf("    Line %ld column %d - Spaced em-dash?\n",

  1194 		  linecnt,(int)(t-aline)+1);

  1195 	    else

  1196 		cnt_dash++;

  1197 	}

  1198 	s=t+2;

  1199     }

  1200 }

  1202 /*

  1203  * check_for_spaced_dash:

  1204  *

  1205  * Check for spaced dashes.

  1206  */

  1207 void check_for_spaced_dash(const char *aline)

  1208 {

  1209     const char *s;

  1210     if ((s=strstr(aline," -")))

  1211     {

  1212 	if (s[2]!='-')

  1213 	{

  1214 	    if (pswit[ECHO_SWITCH])

  1215 		printf("\n%s\n",aline);

  1216 	    if (!pswit[OVERVIEW_SWITCH])

  1217 		printf("    Line %ld column %d - Spaced dash?\n",

  1218 		  linecnt,(int)(s-aline)+1);

  1219 	    else

  1220 		cnt_dash++;

  1221 	}

  1222     }

  1223     else if ((s=strstr(aline,"- ")))

  1224     {

  1225 	if (s==aline || s[-1]!='-')

  1226 	{

  1227 	    if (pswit[ECHO_SWITCH])

  1228 		printf("\n%s\n",aline);

  1229 	    if (!pswit[OVERVIEW_SWITCH])

  1230 		printf("    Line %ld column %d - Spaced dash?\n",

  1231 		  linecnt,(int)(s-aline)+1);

  1232 	    else

  1233 		cnt_dash++;

  1234 	}

  1235     }

  1236 }

  1238 /*

  1239  * check_for_unmarked_paragraphs:

  1240  *

  1241  * Check for unmarked paragraphs indicated by separate speakers.

  1242  *

  1243  * May well be false positive:

  1244  * "Bravo!" "Wonderful!" called the crowd.

  1245  * but useful all the same.

  1246  */

  1247 void check_for_unmarked_paragraphs(const char *aline)

  1248 {

  1249     const char *s;

  1250     s=strstr(aline,"\"  \"");

  1251     if (!s)

  1252 	s=strstr(aline,"\" \"");

  1253     if (s)

  1254     {

  1255 	if (pswit[ECHO_SWITCH])

  1256 	    printf("\n%s\n",aline);

  1257 	if (!pswit[OVERVIEW_SWITCH])

  1258 	    printf("    Line %ld column %d - Query missing paragraph break?\n",

  1259 	      linecnt,(int)(s-aline)+1);

  1260 	else

  1261 	    cnt_punct++;

  1262     }

  1263 }

  1265 /*

  1266  * check_for_jeebies:

  1267  *

  1268  * Check for "to he" and other easy h/b errors.

  1269  *

  1270  * This is a very inadequate effort on the h/b problem,

  1271  * but the phrase "to he" is always an error, whereas "to

  1272  * be" is quite common.

  1273  * Similarly, '"Quiet!", be said.' is a non-be error

  1274  * "to he" is _not_ always an error!:

  1275  *       "Where they went to he couldn't say."

  1276  * Another false positive:

  1277  *       What would "Cinderella" be without the . . .

  1278  * and another: "If he wants to he can see for himself."

  1279  */

  1280 void check_for_jeebies(const char *aline)

  1281 {

  1282     const char *s;

  1283     s=strstr(aline," be could ");

  1284     if (!s)

  1285 	s=strstr(aline," be would ");

  1286     if (!s)

  1287 	s=strstr(aline," was be ");

  1288     if (!s)

  1289 	s=strstr(aline," be is ");

  1290     if (!s)

  1291 	s=strstr(aline," is be ");

  1292     if (!s)

  1293 	s=strstr(aline,"\", be ");

  1294     if (!s)

  1295 	s=strstr(aline,"\" be ");

  1296     if (!s)

  1297 	s=strstr(aline,"\" be ");

  1298     if (!s)

  1299 	s=strstr(aline," to he ");

  1300     if (s)

  1301     {

  1302 	if (pswit[ECHO_SWITCH])

  1303 	    printf("\n%s\n",aline);

  1304 	if (!pswit[OVERVIEW_SWITCH])

  1305 	    printf("    Line %ld column %d - Query he/be error?\n",

  1306 	      linecnt,(int)(s-aline)+1);

  1307 	else

  1308 	    cnt_word++;

  1309     }

  1310     s=strstr(aline," the had ");

  1311     if (!s)

  1312 	s=strstr(aline," a had ");

  1313     if (!s)

  1314 	s=strstr(aline," they bad ");

  1315     if (!s)

  1316 	s=strstr(aline," she bad ");

  1317     if (!s)

  1318 	s=strstr(aline," he bad ");

  1319     if (!s)

  1320 	s=strstr(aline," you bad ");

  1321     if (!s)

  1322 	s=strstr(aline," i bad ");

  1323     if (s)

  1324     {

  1325 	if (pswit[ECHO_SWITCH])

  1326 	    printf("\n%s\n",aline);

  1327 	if (!pswit[OVERVIEW_SWITCH])

  1328 	    printf("    Line %ld column %d - Query had/bad error?\n",

  1329 	      linecnt,(int)(s-aline)+1);

  1330 	else

  1331 	    cnt_word++;

  1332     }

  1333     s=strstr(aline,"; hut ");

  1334     if (!s)

  1335 	s=strstr(aline,", hut ");

  1336     if (s)

  1337     {

  1338 	if (pswit[ECHO_SWITCH])

  1339 	    printf("\n%s\n",aline);

  1340 	if (!pswit[OVERVIEW_SWITCH])

  1341 	    printf("    Line %ld column %d - Query hut/but error?\n",

  1342 	      linecnt,(int)(s-aline)+1);

  1343 	else

  1344 	    cnt_word++;

  1345     }

  1346 }

  1348 /*

  1349  * check_for_mta_from:

  1350  *

  1351  * Special case - angled bracket in front of "From" placed there by an

  1352  * MTA when sending an e-mail.

  1353  */

  1354 void check_for_mta_from(const char *aline)

  1355 {

  1356     const char *s;

  1357     s=strstr(aline,">From");

  1358     if (s)

  1359     {

  1360 	if (pswit[ECHO_SWITCH])

  1361 	    printf("\n%s\n",aline);

  1362 	if (!pswit[OVERVIEW_SWITCH])

  1363 	    printf("    Line %ld column %d - Query angled bracket with From\n",

  1364 	      linecnt,(int)(s-aline)+1);

  1365 	else

  1366 	    cnt_punct++;

  1367     }

  1368 }

  1370 /*

  1371  * check_for_orphan_character:

  1372  *

  1373  * Check for a single character line -

  1374  * often an overflow from bad wrapping.

  1375  */

  1376 void check_for_orphan_character(const char *aline)

  1377 {

  1378     if (*aline && !aline[1])

  1379     {

  1380 	if (*aline=='I' || *aline=='V' || *aline=='X' || *aline=='L' ||

  1381 	  gcisdigit(*aline))

  1382 	    ; /* Nothing - ignore numerals alone on a line. */

  1383 	else

  1384 	{

  1385 	    if (pswit[ECHO_SWITCH])

  1386 		printf("\n%s\n",aline);

  1387 	    if (!pswit[OVERVIEW_SWITCH])

  1388 		printf("    Line %ld column 1 - Query single character line\n",

  1389 		  linecnt);

  1390 	    else

  1391 		cnt_punct++;

  1392 	}

  1393     }

  1394 }

  1396 /*

  1397  * check_for_pling_scanno:

  1398  *

  1399  * Check for I" - often should be !

  1400  */

  1401 void check_for_pling_scanno(const char *aline)

  1402 {

  1403     const char *s;

  1404     s=strstr(aline," I\"");

  1405     if (s)

  1406     {

  1407 	if (pswit[ECHO_SWITCH])

  1408 	    printf("\n%s\n",aline);

  1409 	if (!pswit[OVERVIEW_SWITCH])

  1410 	    printf("    Line %ld column %ld - Query I=exclamation mark?\n",

  1411 	      linecnt,s-aline);

  1412 	else

  1413 	    cnt_punct++;

  1414     }

  1415 }

  1417 /*

  1418  * check_for_extra_period:

  1419  *

  1420  * Check for period without a capital letter. Cut-down from gutspell.

  1421  * Only works when it happens on a single line.

  1422  */

  1423 void check_for_extra_period(const char *aline,const struct warnings *warnings)

  1424 {

  1425     const char *s,*t,*s1;

  1426     signed int i,istypo,isdup;

  1427     static char qperiod[MAX_QWORD][MAX_QWORD_LENGTH];

  1428     static int qperiod_index=0;

  1429     char testword[MAXWORDLEN]="";

  1430     if (pswit[PARANOID_SWITCH])

  1431     {

  1432 	for (t=s=aline;strstr(t,". ");)

  1433 	{

  1434 	    t=strstr(t,". ");

  1435 	    if (t==s)

  1436 	    {

  1437 		t++;

  1438 		/* start of line punctuation is handled elsewhere */

  1439 		continue;

  1440 	    }

  1441 	    if (!gcisalpha(t[-1]))

  1442 	    {

  1443 		t++;

  1444 		continue;

  1445 	    }

  1446 	    if (warnings->isDutch)

  1447 	    {

  1448 		/* For Frank & Jeroen -- 's Middags case */

  1449 		if (t[2]==CHAR_SQUOTE && t[3]>='a' && t[3]<='z' &&

  1450 		  t[4]==CHAR_SPACE && t[5]>='A' && t[5]<='Z')

  1451 		{

  1452 		    t++;

  1453 		    continue;

  1454 		}

  1455 	    }

  1456 	    s1=t+2;

  1457 	    while (*s1 && !gcisalpha(*s1) && !isdigit(*s1))

  1458 		s1++;

  1459 	    if (*s1>='a' && *s1<='z')

  1460 	    {

  1461 		/* we have something to investigate */

  1462 		istypo=1;

  1463 		/* so let's go back and find out */

  1464 		for (s1=t-1;s1>=s &&

  1465 		  (gcisalpha(*s1) || gcisdigit(*s1) || *s1==CHAR_SQUOTE &&

  1466 		  gcisalpha(s1[1]) && gcisalpha(s1[-1]));s1--)

  1467 		    ;

  1468 		s1++;

  1469 		for (i=0;*s1 && *s1!='.';s1++,i++)

  1470 		    testword[i]=*s1;

  1471 		testword[i]=0;

  1472 		for (i=0;*abbrev[i];i++)

  1473 		    if (!strcmp(testword,abbrev[i]))

  1474 			istypo=0;

  1475 		if (gcisdigit(*testword))

  1476 		    istypo=0;

  1477 		if (!testword[1])

  1478 		    istypo=0;

  1479 		if (isroman(testword))

  1480 		    istypo=0;

  1481 		if (istypo)

  1482 		{

  1483 		    istypo=0;

  1484 		    for (i=0;testword[i];i++)

  1485 			if (strchr(vowels,testword[i]))

  1486 			    istypo=1;

  1487 		}

  1488 		if (istypo)

  1489 		{

  1490 		    isdup=0;

  1491 		    if (strlen(testword)<MAX_QWORD_LENGTH &&

  1492 		      !pswit[VERBOSE_SWITCH])

  1493 			for (i=0;i<qperiod_index;i++)

  1494 			    if (!strcmp(testword,qperiod[i]))

  1495 				isdup=1;

  1496 		    if (!isdup)

  1497 		    {

  1498 			if (qperiod_index<MAX_QWORD &&

  1499 			  strlen(testword)<MAX_QWORD_LENGTH)

  1500 			{

  1501 			    strcpy(qperiod[qperiod_index],testword);

  1502 			    qperiod_index++;

  1503 			}

  1504 			if (pswit[ECHO_SWITCH])

  1505 			    printf("\n%s\n",aline);

  1506 			if (!pswit[OVERVIEW_SWITCH])

  1507 			    printf("    Line %ld column %d - Extra period?\n",

  1508 			      linecnt,(int)(t-aline)+1);

  1509 			else

  1510 			    cnt_punct++;

  1511 		    }

  1512 		}

  1513 	    }

  1514 	    t++;

  1515 	}

  1516     }

  1517 }

  1519 /*

  1520  * check_for_following_punctuation:

  1521  *

  1522  * Check for words usually not followed by punctuation.

  1523  */

  1524 void check_for_following_punctuation(const char *aline)

  1525 {

  1526     int i;

  1527     const char *s,*wordstart;

  1528     char inword[MAXWORDLEN];

  1529     if (pswit[TYPO_SWITCH])

  1530     {

  1531 	for (s=aline;*s;)

  1532 	{

  1533 	    wordstart=s;

  1534 	    s=getaword(s,inword);

  1535 	    if (!*inword)

  1536 		continue;

  1537 	    lowerit(inword);

  1538 	    for (i=0;*nocomma[i];i++)

  1539 		if (!strcmp(inword,nocomma[i]))

  1540 		{

  1541 		    if (*s==',' || *s==';' || *s==':')

  1542 		    {

  1543 			if (pswit[ECHO_SWITCH])

  1544 			    printf("\n%s\n",aline);

  1545 			if (!pswit[OVERVIEW_SWITCH])

  1546 			    printf("    Line %ld column %d - "

  1547 			      "Query punctuation after %s?\n",

  1548 			      linecnt,(int)(s-aline)+1,inword);

  1549 			else

  1550 			    cnt_punct++;

  1551 		    }

  1552 		}

  1553 	    for (i=0;*noperiod[i];i++)

  1554 		if (!strcmp(inword,noperiod[i]))

  1555 		{

  1556 		    if (*s=='.' || *s=='!')

  1557 		    {

  1558 			if (pswit[ECHO_SWITCH])

  1559 			    printf("\n%s\n",aline);

  1560 			if (!pswit[OVERVIEW_SWITCH])

  1561 			    printf("    Line %ld column %d - "

  1562 			      "Query punctuation after %s?\n",

  1563 			      linecnt,(int)(s-aline)+1,inword);

  1564 			else

  1565 			    cnt_punct++;

  1566 		    }

  1567 		}

  1568 	}

  1569     }

  1570 }

  1572 /*

  1573  * check_for_typos:

  1574  *

  1575  * Check for commonly mistyped words,

  1576  * and digits like 0 for O in a word.

  1577  */

  1578 void check_for_typos(const char *aline,struct warnings *warnings)

  1579 {

  1580     const char *s,*wordstart;

  1581     char inword[MAXWORDLEN],testword[MAXWORDLEN];

  1582     int i,istypo,isdup,alower,vowel,consonant;

  1583     static int qword_index=0;

  1584     for (s=aline;*s;)

  1585     {

  1586 	wordstart=s;

  1587 	s=getaword(s,inword);

  1588 	if (!*inword)

  1589 	    continue; /* don't bother with empty lines */

  1590 	if (mixdigit(inword))

  1591 	{

  1592 	    if (pswit[ECHO_SWITCH])

  1593 		printf("\n%s\n",aline);

  1594 	    if (!pswit[OVERVIEW_SWITCH])

  1595 		printf("    Line %ld column %d - Query digit in %s\n",

  1596 		  linecnt,(int)(wordstart-aline)+1,inword);

  1597 	    else

  1598 		cnt_word++;

  1599 	}

  1600 	/*

  1601 	 * Put the word through a series of tests for likely typos and OCR

  1602 	 * errors.

  1603 	 */

  1604 	if (pswit[TYPO_SWITCH])

  1605 	{

  1606 	    istypo=0;

  1607 	    strcpy(testword,inword);

  1608 	    alower=0;

  1609 	    for (i=0;i<(signed int)strlen(testword);i++)

  1610 	    {

  1611 		/* lowercase for testing */

  1612 		if (testword[i]>='a' && testword[i]<='z')

  1613 		    alower=1;

  1614 		if (alower && testword[i]>='A' && testword[i]<='Z')

  1615 		{

  1616 		    /*

  1617 		     * We have an uppercase mid-word. However, there are

  1618 		     * common cases:

  1619 		     *   Mac and Mc like McGill

  1620 		     *   French contractions like l'Abbe

  1621 		     */

  1622 		    if (i==2 && testword[0]=='m' && testword[1]=='c' ||

  1623 		      i==3 && testword[0]=='m' && testword[1]=='a' &&

  1624 		      testword[2]=='c' || i>0 && testword[i-1]==CHAR_SQUOTE)

  1625 			; /* do nothing! */

  1626 		    else

  1627 			istypo=1;

  1628 		}

  1629 		testword[i]=(char)tolower(testword[i]);

  1630 	    }

  1631 	    /*

  1632 	     * Check for certain unlikely two-letter combinations at word

  1633 	     * start and end.

  1634 	     */

  1635 	    if (strlen(testword)>1)

  1636 	    {

  1637 		for (i=0;*nostart[i];i++)

  1638 		    if (!strncmp(testword,nostart[i],2))

  1639 			istypo=1;

  1640 		for (i=0;*noend[i];i++)

  1641 		    if (!strncmp(testword+strlen(testword)-2,noend[i],2))

  1642 			istypo=1;

  1643 	    }

  1644 	    /* ght is common, gbt never. Like that. */

  1645 	    if (strstr(testword,"cb"))

  1646 		istypo=1;

  1647 	    if (strstr(testword,"gbt"))

  1648 		istypo=1;

  1649 	    if (strstr(testword,"pbt"))

  1650 		istypo=1;

  1651 	    if (strstr(testword,"tbs"))

  1652 		istypo=1;

  1653 	    if (strstr(testword,"mrn"))

  1654 		istypo=1;

  1655 	    if (strstr(testword,"ahle"))

  1656 		istypo=1;

  1657 	    if (strstr(testword,"ihle"))

  1658 		istypo=1;

  1659 	    /*

  1660 	     * "TBE" does happen - like HEARTBEAT - but uncommon.

  1661 	     * Also "TBI" - frostbite, outbid - but uncommon.

  1662 	     * Similarly "ii" like Hawaii, or Pompeii, and in Roman

  1663 	     * numerals, but "ii" is a common scanno.

  1664 	     */

  1665 	    if (strstr(testword,"tbi"))

  1666 		istypo=1;

  1667 	    if (strstr(testword,"tbe"))

  1668 		istypo=1;

  1669 	    if (strstr(testword,"ii"))

  1670 		istypo=1;

  1671 	    /*

  1672 	     * Check for no vowels or no consonants.

  1673 	     * If none, flag a typo.

  1674 	     */

  1675 	    if (!istypo && strlen(testword)>1)

  1676 	    {

  1677 		vowel=consonant=0;

  1678 		for (i=0;testword[i];i++)

  1679 		{

  1680 		    if (testword[i]=='y' || gcisdigit(testword[i]))

  1681 		    {

  1682 			/* Yah, this is loose. */

  1683 			vowel++;

  1684 			consonant++;

  1685 		    }

  1686 		    else if (strchr(vowels,testword[i]))

  1687 			vowel++;

  1688 		    else

  1689 			consonant++;

  1690 		}

  1691 		if (!vowel || !consonant)

  1692 		    istypo=1;

  1693 	    }

  1694 	    /*

  1695 	     * Now exclude the word from being reported if it's in

  1696 	     * the okword list.

  1697 	     */

  1698 	    for (i=0;*okword[i];i++)

  1699 		if (!strcmp(testword,okword[i]))

  1700 		    istypo=0;

  1701 	    /*

  1702 	     * What looks like a typo may be a Roman numeral.

  1703 	     * Exclude these.

  1704 	     */

  1705 	    if (istypo && isroman(testword))

  1706 		istypo=0;

  1707 	    /* Check the manual list of typos. */

  1708 	    if (!istypo)

  1709 		for (i=0;*typo[i];i++)

  1710 		    if (!strcmp(testword,typo[i]))

  1711 			istypo=1;

  1712 	    /*

  1713 	     * Check lowercase s, l, i and m - special cases.

  1714 	     *   "j" - often a semi-colon gone wrong.

  1715 	     *   "d" for a missing apostrophe - he d

  1716 	     *   "n" for "in"

  1717 	     */

  1718 	    if (!istypo && strlen(testword)==1 && strchr("slmijdn",*inword))

  1719 		istypo=1;

  1720 	    if (istypo)

  1721 	    {

  1722 		isdup=0;

  1723 		if (strlen(testword)<MAX_QWORD_LENGTH &&

  1724 		  !pswit[VERBOSE_SWITCH])

  1725 		    for (i=0;i<qword_index;i++)

  1726 			if (!strcmp(testword,qword[i]))

  1727 			{

  1728 			    isdup=1;

  1729 			    ++dupcnt[i];

  1730 			}

  1731 		if (!isdup)

  1732 		{

  1733 		    if (qword_index<MAX_QWORD &&

  1734 		      strlen(testword)<MAX_QWORD_LENGTH)

  1735 		    {

  1736 			strcpy(qword[qword_index],testword);

  1737 			qword_index++;

  1738 		    }

  1739 		    if (pswit[ECHO_SWITCH])

  1740 			printf("\n%s\n",aline);

  1741 		    if (!pswit[OVERVIEW_SWITCH])

  1742 		    {

  1743 			printf("    Line %ld column %d - Query word %s",

  1744 			  linecnt,(int)(wordstart-aline)+1,inword);

  1745 			if (strlen(testword)<MAX_QWORD_LENGTH &&

  1746 			  !pswit[VERBOSE_SWITCH])

  1747 			    printf(" - not reporting duplicates");

  1748 			printf("\n");

  1749 		    }

  1750 		    else

  1751 			cnt_word++;

  1752 		}

  1753 	    }

  1754 	}

  1755 	/* check the user's list of typos */

  1756 	if (!istypo && usertypo_count)

  1757 	    for (i=0;i<usertypo_count;i++)

  1758 		if (!strcmp(testword,usertypo[i]))

  1759 		{

  1760 		    if (pswit[ECHO_SWITCH])

  1761 			printf("\n%s\n",aline);

  1762 		    if (!pswit[OVERVIEW_SWITCH])

  1763 			printf("    Line %ld column %d - "

  1764 			  "Query possible scanno %s\n",

  1765 			  linecnt,(int)(wordstart-aline)+2,inword);

  1766 		}

  1767 	if (pswit[PARANOID_SWITCH] && warnings->digit)

  1768 	{

  1769 	    /* In paranoid mode, query all 0 and 1 standing alone. */

  1770 	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))

  1771 	    {

  1772 		if (pswit[ECHO_SWITCH])

  1773 		    printf("\n%s\n",aline);

  1774 		if (!pswit[OVERVIEW_SWITCH])

  1775 		    printf("    Line %ld column %d - Query standalone %s\n",

  1776 		      linecnt,(int)(wordstart-aline)+2,inword);

  1777 		else

  1778 		    cnt_word++;

  1779 	    }

  1780 	}

  1781     }

  1782 }

  1784 struct parities {

  1785     int dquote,squote;

  1786 };

  1788 /*

  1789  * check_for_misspaced_punctuation:

  1790  *

  1791  * Look for added or missing spaces around punctuation and quotes.

  1792  * If there is a punctuation character like ! with no space on

  1793  * either side, suspect a missing!space. If there are spaces on

  1794  * both sides , assume a typo. If we see a double quote with no

  1795  * space or punctuation on either side of it, assume unspaced

  1796  * quotes "like"this.

  1797  */

  1798 void check_for_misspaced_punctuation(const char *aline,

  1799   struct parities *parities,int isemptyline)

  1800 {

  1801     int i,llen,isacro,isellipsis;

  1802     const char *s;

  1803     llen=strlen(aline);

  1804     for (i=1;i<llen;i++)

  1805     {

  1806 	/* For each character in the line after the first. */

  1807 	if (strchr(".?!,;:_",aline[i]))  /* if it's punctuation */

  1808 	{

  1809 	    /* we need to suppress warnings for acronyms like M.D. */

  1810 	    isacro=0;

  1811 	    /* we need to suppress warnings for ellipsis . . . */

  1812 	    isellipsis=0;

  1813 	    /* if there are letters on both sides of it or ... */

  1814 	    if (gcisalpha(aline[i-1]) && gcisalpha(aline[i+1]) ||

  1815 	       gcisalpha(aline[i+1]) && strchr("?!,;:",aline[i]))

  1816 	    {

  1817 		/* ...if it's strict punctuation followed by an alpha */

  1818 		if (aline[i]=='.')

  1819 		{

  1820 		    if (i>2 && aline[i-2]=='.')

  1821 			isacro=1;

  1822 		    if (i+2<llen && aline[i+2]=='.')

  1823 			isacro=1;

  1824 		}

  1825 		if (!isacro)

  1826 		{

  1827 		    if (pswit[ECHO_SWITCH])

  1828 			printf("\n%s\n",aline);

  1829 		    if (!pswit[OVERVIEW_SWITCH])

  1830 			printf("    Line %ld column %d - Missing space?\n",

  1831 			  linecnt,i+1);

  1832 		    else

  1833 			cnt_punct++;

  1834 		}

  1835 	    }

  1836 	    if (aline[i-1]==CHAR_SPACE &&

  1837 	      (aline[i+1]==CHAR_SPACE || aline[i+1]==0))

  1838 	    {

  1839 		/*

  1840 		 * If there are spaces on both sides,

  1841 		 * or space before and end of line.

  1842 		 */

  1843 		if (aline[i]=='.')

  1844 		{

  1845 		    if (i>2 && aline[i-2]=='.')

  1846 			isellipsis=1;

  1847 		    if (i+2<llen && aline[i+2]=='.')

  1848 			isellipsis=1;

  1849 		}

  1850 		if (!isemptyline && !isellipsis)

  1851 		{

  1852 		    if (pswit[ECHO_SWITCH])

  1853 			printf("\n%s\n",aline);

  1854 		    if (!pswit[OVERVIEW_SWITCH])

  1855 			printf("    Line %ld column %d - "

  1856 			  "Spaced punctuation?\n",linecnt,i+1);

  1857 		    else

  1858 			cnt_punct++;

  1859 		}

  1860 	    }

  1861 	}

  1862     }

  1863     /* Split out the characters that CANNOT be preceded by space. */

  1864     llen=strlen(aline);

  1865     for (i=1;i<llen;i++)

  1866     {

  1867 	/* for each character in the line after the first */

  1868 	if (strchr("?!,;:",aline[i]))

  1869 	{

  1870 	    /* if it's punctuation that _cannot_ have a space before it */

  1871 	    if (aline[i-1]==CHAR_SPACE && !isemptyline &&

  1872 	      aline[i+1]!=CHAR_SPACE)

  1873 	    {

  1874 		/*

  1875 		 * If aline[i+1) DOES == space,

  1876 		 * it was already reported just above.

  1877 		 */

  1878 		if (pswit[ECHO_SWITCH])

  1879 		    printf("\n%s\n",aline);

  1880 		if (!pswit[OVERVIEW_SWITCH])

  1881 		    printf("    Line %ld column %d - Spaced punctuation?\n",

  1882 		      linecnt,i+1);

  1883 		else

  1884 		    cnt_punct++;

  1885 	    }

  1886 	}

  1887     }

  1888     /*

  1889      * Special case " .X" where X is any alpha.

  1890      * This plugs a hole in the acronym code above.

  1891      * Inelegant, but maintainable.

  1892      */

  1893     llen=strlen(aline);

  1894     for (i=1;i<llen;i++)

  1895     {

  1896 	/* for each character in the line after the first */

  1897 	if (aline[i]=='.')

  1898 	{

  1899 	    /* if it's a period */

  1900 	    if (aline[i-1]==CHAR_SPACE && gcisalpha(aline[i+1]))

  1901 	    {

  1902 		/*

  1903 		 * If the period follows a space and

  1904 		 * is followed by a letter.

  1905 		 */

  1906 		if (pswit[ECHO_SWITCH])

  1907 		    printf("\n%s\n",aline);

  1908 		if (!pswit[OVERVIEW_SWITCH])

  1909 		    printf("    Line %ld column %d - Spaced punctuation?\n",

  1910 		      linecnt,i+1);

  1911 		else

  1912 		    cnt_punct++;

  1913 	    }

  1914 	}

  1915     }

  1916     for (i=1;i<llen;i++)

  1917     {

  1918 	/* for each character in the line after the first */

  1919 	if (aline[i]==CHAR_DQUOTE)

  1920 	{

  1921 	    if (!strchr(" _-.'`,;:!/([{?}])",aline[i-1]) &&

  1922 	      !strchr(" _-.'`,;:!/([{?}])",aline[i+1]) && aline[i+1] ||

  1923 	      !strchr(" _-([{'`",aline[i-1]) && gcisalpha(aline[i+1]))

  1924 	    {

  1925 		if (pswit[ECHO_SWITCH])

  1926 		    printf("\n%s\n",aline);

  1927 		if (!pswit[OVERVIEW_SWITCH])

  1928 		    printf("    Line %ld column %d - Unspaced quotes?\n",

  1929 		      linecnt,i+1);

  1930 		else

  1931 		    cnt_punct++;

  1932 	    }

  1933 	}

  1934     }

  1935     /* Check parity of quotes. */

  1936     for (s=aline;*s;s++)

  1937     {

  1938 	if (*s==CHAR_DQUOTE)

  1939 	{

  1940 	    parities->dquote=!parities->dquote;

  1941 	    if (!parities->dquote)

  1942 	    {

  1943 		/* parity even */

  1944 		if (!strchr("_-.'`/,;:!?)]} ",s[1]))

  1945 		{

  1946 		    if (pswit[ECHO_SWITCH])

  1947 			printf("\n%s\n",aline);

  1948 		    if (!pswit[OVERVIEW_SWITCH])

  1949 			printf("    Line %ld column %d - "

  1950 			  "Wrongspaced quotes?\n",linecnt,(int)(s-aline)+1);

  1951 		    else

  1952 			cnt_punct++;

  1953 		}

  1954 	    }

  1955 	    else

  1956 	    {

  1957 		/* parity odd */

  1958 		if (!gcisalpha(s[1]) && !isdigit(s[1]) &&

  1959 		  !strchr("_-/.'`([{$",s[1]) || !s[1])

  1960 		{

  1961 		    if (pswit[ECHO_SWITCH])

  1962 			printf("\n%s\n",aline);

  1963 		    if (!pswit[OVERVIEW_SWITCH])

  1964 			printf("    Line %ld column %d - "

  1965 			  "Wrongspaced quotes?\n",linecnt,(int)(s-aline)+1);

  1966 		    else

  1967 			cnt_punct++;

  1968 		}

  1969 	    }

  1970 	}

  1971     }

  1972     if (*aline==CHAR_DQUOTE)

  1973     {

  1974 	if (strchr(",;:!?)]} ",aline[1]))

  1975 	{

  1976 	    if (pswit[ECHO_SWITCH])

  1977 		printf("\n%s\n",aline);

  1978 	    if (!pswit[OVERVIEW_SWITCH])

  1979 		printf("    Line %ld column 1 - Wrongspaced quotes?\n",

  1980 		  linecnt);

  1981 	    else

  1982 		cnt_punct++;

  1983 	}

  1984     }

  1985     if (pswit[SQUOTE_SWITCH])

  1986     {

  1987 	for (s=aline;*s;s++)

  1988 	{

  1989 	    if ((*s==CHAR_SQUOTE || *s==CHAR_OPEN_SQUOTE) &&

  1990 	      (s==aline || s>aline && !gcisalpha(s[-1]) ||

  1991 	      !gcisalpha(s[1])))

  1992 	    {

  1993 		parities->squote=!parities->squote;

  1994 		if (!parities->squote)

  1995 		{

  1996 		    /* parity even */

  1997 		    if (!strchr("_-.'`/\",;:!?)]} ",s[1]))

  1998 		    {

  1999 			if (pswit[ECHO_SWITCH])

  2000 			    printf("\n%s\n",aline);

  2001 			if (!pswit[OVERVIEW_SWITCH])

  2002 			    printf("    Line %ld column %d - "

  2003 			      "Wrongspaced singlequotes?\n",

  2004 			      linecnt,(int)(s-aline)+1);

  2005 			else

  2006 			    cnt_punct++;

  2007 		    }

  2008 		}

  2009 		else

  2010 		{

  2011 		    /* parity odd */

  2012 		    if (!gcisalpha(s[1]) && !isdigit(s[1]) &&

  2013 		      !strchr("_-/\".'`",s[1]) || !s[1])

  2014 		    {

  2015 			if (pswit[ECHO_SWITCH])

  2016 			    printf("\n%s\n",aline);

  2017 			if (!pswit[OVERVIEW_SWITCH])

  2018 			    printf("    Line %ld column %d - "

  2019 			      "Wrongspaced singlequotes?\n",

  2020 			      linecnt,(int)(s-aline)+1);

  2021 			else

  2022 			    cnt_punct++;

  2023 		    }

  2024 		}

  2025 	    }

  2026 	}

  2027     }

  2028 }

  2030 /*

  2031  * check_for_double_punctuation:

  2032  *

  2033  * Look for double punctuation like ,. or ,,

  2034  * Thanks to DW for the suggestion!

  2035  * In books with references, ".," and ".;" are common

  2036  * e.g. "etc., etc.," and vol. 1.; vol 3.;

  2037  * OTOH, from my initial tests, there are also fairly

  2038  * common errors. What to do? Make these cases paranoid?

  2039  * ".," is the most common, so warnings->dotcomma is used

  2040  * to suppress detailed reporting if it occurs often.

  2041  */

  2042 void check_for_double_punctuation(const char *aline,struct warnings *warnings)

  2043 {

  2044     int i,llen;

  2045     llen=strlen(aline);

  2046     for (i=0;i<llen;i++)

  2047     {

  2048 	/* for each punctuation character in the line */

  2049 	if (strchr(".?!,;:",aline[i]) && strchr(".?!,;:",aline[i+1]) &&

  2050 	  aline[i] && aline[i+1])

  2051 	{

  2052 	    /* followed by punctuation, it's a query, unless . . . */

  2053 	    if (aline[i]==aline[i+1] && (aline[i]=='.' || aline[i]=='?' ||

  2054 	      aline[i]=='!') ||

  2055 	      !warnings->dotcomma && aline[i]=='.' && aline[i+1]==',' ||

  2056 	      warnings->isFrench && !strncmp(aline+i,",...",4) ||

  2057 	      warnings->isFrench && !strncmp(aline+i,"...,",4) ||

  2058 	      warnings->isFrench && !strncmp(aline+i,";...",4) ||

  2059 	      warnings->isFrench && !strncmp(aline+i,"...;",4) ||

  2060 	      warnings->isFrench && !strncmp(aline+i,":...",4) ||

  2061 	      warnings->isFrench && !strncmp(aline+i,"...:",4) ||

  2062 	      warnings->isFrench && !strncmp(aline+i,"!...",4) ||

  2063 	      warnings->isFrench && !strncmp(aline+i,"...!",4) ||

  2064 	      warnings->isFrench && !strncmp(aline+i,"?...",4) ||

  2065 	      warnings->isFrench && !strncmp(aline+i,"...?",4))

  2066 	    {

  2067 		if (warnings->isFrench && !strncmp(aline+i,",...",4) ||

  2068 		  warnings->isFrench && !strncmp(aline+i,"...,",4) ||

  2069 		  warnings->isFrench && !strncmp(aline+i,";...",4) ||

  2070 		  warnings->isFrench && !strncmp(aline+i,"...;",4) ||

  2071 		  warnings->isFrench && !strncmp(aline+i,":...",4) ||

  2072 		  warnings->isFrench && !strncmp(aline+i,"...:",4) ||

  2073 		  warnings->isFrench && !strncmp(aline+i,"!...",4) ||

  2074 		  warnings->isFrench && !strncmp(aline+i,"...!",4) ||

  2075 		  warnings->isFrench && !strncmp(aline+i,"?...",4) ||

  2076 		  warnings->isFrench && !strncmp(aline+i,"...?",4))

  2077 		    i+=4;

  2078 		; /* do nothing for .. !! and ?? which can be legit */

  2079 	    }

  2080 	    else

  2081 	    {

  2082 		if (pswit[ECHO_SWITCH])

  2083 		    printf("\n%s\n",aline);

  2084 		if (!pswit[OVERVIEW_SWITCH])

  2085 		    printf("    Line %ld column %d - Double punctuation?\n",

  2086 		      linecnt,i+1);

  2087 		else

  2088 		    cnt_punct++;

  2089 	    }

  2090 	}

  2091     }

  2092 }

  2094 /*

  2095  * check_for_spaced_quotes:

  2096  */

  2097 void check_for_spaced_quotes(const char *aline)

  2098 {

  2099     const char *s,*t;

  2100     s=aline;

  2101     while ((t=strstr(s," \" ")))

  2102     {

  2103 	if (pswit[ECHO_SWITCH])

  2104 	    printf("\n%s\n",aline);

  2105 	if (!pswit[OVERVIEW_SWITCH])

  2106 	    printf("    Line %ld column %d - Spaced doublequote?\n",

  2107 	      linecnt,(int)(t-aline+1));

  2108 	else

  2109 	    cnt_punct++;

  2110 	s=t+2;

  2111     }

  2112     s=aline;

  2113     while ((t=strstr(s," ' ")))

  2114     {

  2115 	if (pswit[ECHO_SWITCH])

  2116 	    printf("\n%s\n",aline);

  2117 	if (!pswit[OVERVIEW_SWITCH])

  2118 	    printf("    Line %ld column %d - Spaced singlequote?\n",

  2119 	      linecnt,(int)(t-aline+1));

  2120 	else

  2121 	    cnt_punct++;

  2122 	s=t+2;

  2123     }

  2124     s=aline;

  2125     while ((t=strstr(s," ` ")))

  2126     {

  2127 	if (pswit[ECHO_SWITCH])

  2128 	    printf("\n%s\n",aline);

  2129 	if (!pswit[OVERVIEW_SWITCH])

  2130 	    printf("    Line %ld column %d - Spaced singlequote?\n",

  2131 	      linecnt,(int)(t-aline+1));

  2132 	else

  2133 	    cnt_punct++;

  2134 	s=t+2;

  2135     }

  2136 }

  2138 /*

  2139  * check_for_miscased_genative:

  2140  *

  2141  * Check special case of 'S instead of 's at end of word.

  2142  */

  2143 void check_for_miscased_genative(const char *aline)

  2144 {

  2145     const char *s;

  2146     s=aline+1;

  2147     while (*s)

  2148     {

  2149 	if (*s==CHAR_SQUOTE && s[1]=='S' && s[-1]>='a' && s[-1]<='z')

  2150 	{

  2151 	    if (pswit[ECHO_SWITCH])

  2152 		printf("\n%s\n",aline);

  2153 	    if (!pswit[OVERVIEW_SWITCH])

  2154 		printf("    Line %ld column %d - Capital \"S\"?\n",

  2155 		  linecnt,(int)(s-aline+2));

  2156 	    else

  2157 		cnt_punct++;

  2158 	}

  2159 	s++;

  2160     }

  2161 }

  2163 /*

  2164  * check_end_of_line:

  2165  *

  2166  * Now check special cases - start and end of line -

  2167  * for single and double quotes. Start is sometimes [sic]

  2168  * but better to query it anyway.

  2169  * While we're here, check for dash at end of line.

  2170  */

  2171 void check_end_of_line(const char *aline,struct warnings *warnings)

  2172 {

  2173     int i,llen;

  2174     llen=strlen(aline);

  2175     if (llen>1)

  2176     {

  2177 	if (aline[llen-1]==CHAR_DQUOTE || aline[llen-1]==CHAR_SQUOTE ||

  2178 	  aline[llen-1]==CHAR_OPEN_SQUOTE)

  2179 	    if (aline[llen-2]==CHAR_SPACE)

  2180 	    {

  2181 		if (pswit[ECHO_SWITCH])

  2182 		    printf("\n%s\n",aline);

  2183 		if (!pswit[OVERVIEW_SWITCH])

  2184 		    printf("    Line %ld column %d - Spaced quote?\n",

  2185 		      linecnt,llen);

  2186 		else

  2187 		    cnt_punct++;

  2188 	    }

  2189 	if ((aline[0]==CHAR_SQUOTE || aline[0]==CHAR_OPEN_SQUOTE) &&

  2190 	  aline[1]==CHAR_SPACE)

  2191 	{

  2192 	    if (pswit[ECHO_SWITCH])

  2193 		printf("\n%s\n",aline);

  2194 	    if (!pswit[OVERVIEW_SWITCH])

  2195 		printf("    Line %ld column 1 - Spaced quote?\n",linecnt);

  2196 	    else

  2197 		cnt_punct++;

  2198 	}

  2199 	/*

  2200 	 * Dash at end of line may well be legit - paranoid mode only

  2201 	 * and don't report em-dash at line-end.

  2202 	 */

  2203 	if (pswit[PARANOID_SWITCH] && warnings->hyphen)

  2204 	{

  2205 	    for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--)

  2206 		;

  2207 	    if (aline[i]=='-' && aline[i-1]!='-')

  2208 	    {

  2209 		if (pswit[ECHO_SWITCH])

  2210 		    printf("\n%s\n",aline);

  2211 		if (!pswit[OVERVIEW_SWITCH])

  2212 		    printf("    Line %ld column %d - Hyphen at end of line?\n",

  2213 		      linecnt,i);

  2214 	    }

  2215 	}

  2216     }

  2217 }

  2219 /*

  2220  * check_for_unspaced_bracket:

  2221  *

  2222  * Brackets are often unspaced, but shouldn't be surrounded by alpha.

  2223  * If so, suspect a scanno like "a]most".

  2224  */

  2225 void check_for_unspaced_bracket(const char *aline)

  2226 {

  2227     int i,llen;

  2228     llen=strlen(aline);

  2229     for (i=1;i<llen-1;i++)

  2230     {

  2231 	/* for each bracket character in the line except 1st & last */

  2232 	if (strchr("{[()]}",aline[i]) && gcisalpha(aline[i-1]) &&

  2233 	  gcisalpha(aline[i+1]))

  2234 	{

  2235 	    if (pswit[ECHO_SWITCH])

  2236 		printf("\n%s\n",aline);

  2237 	    if (!pswit[OVERVIEW_SWITCH])

  2238 		printf("    Line %ld column %d - Unspaced bracket?\n",

  2239 		  linecnt,i);

  2240 	    else

  2241 		cnt_punct++;

  2242 	}

  2243     }

  2244 }

  2246 /*

  2247  * check_for_unpunctuated_endquote:

  2248  */

  2249 void check_for_unpunctuated_endquote(const char *aline)

  2250 {

  2251     int i,llen;

  2252     llen=strlen(aline);

  2253     for (i=1;i<llen;i++)

  2254     {

  2255 	/* for each character in the line except 1st */

  2256 	if (aline[i]==CHAR_DQUOTE && isalpha(aline[i-1]))

  2257 	{

  2258 	    if (pswit[ECHO_SWITCH])

  2259 		printf("\n%s\n",aline);

  2260 	    if (!pswit[OVERVIEW_SWITCH])

  2261 		printf("    Line %ld column %d - "

  2262 		  "endquote missing punctuation?\n",linecnt,i);

  2263 	    else

  2264 		cnt_punct++;

  2265 	}

  2266     }

  2267 }

  2269 /*

  2270  * check_for_html_tag:

  2271  *

  2272  * Check for <HTML TAG>.

  2273  *

  2274  * If there is a < in the line, followed at some point

  2275  * by a > then we suspect HTML.

  2276  */

  2277 void check_for_html_tag(const char *aline)

  2278 {

  2279     int i;

  2280     const char *open,*close;

  2281     open=strstr(aline,"<");

  2282     if (open)

  2283     {

  2284 	close=strstr(aline,">");

  2285 	if (close)

  2286 	{

  2287 	    i=(signed int)(close-open+1);

  2288 	    if (i>0)

  2289 	    {

  2290 		strncpy(wrk,open,i);

  2291 		wrk[i]=0;

  2292 		if (pswit[ECHO_SWITCH])

  2293 		    printf("\n%s\n",aline);

  2294 		if (!pswit[OVERVIEW_SWITCH])

  2295 		    printf("    Line %ld column %d - HTML Tag? %s \n",

  2296 		      linecnt,(int)(open-aline)+1,wrk);

  2297 		else

  2298 		    cnt_html++;

  2299 	    }

  2300 	}

  2301     }

  2302 }

  2304 /*

  2305  * check_for_html_entity:

  2306  *

  2307  * Check for &symbol; HTML.

  2308  *

  2309  * If there is a & in the line, followed at

  2310  * some point by a ; then we suspect HTML.

  2311  */

  2312 void check_for_html_entity(const char *aline)

  2313 {

  2314     int i;

  2315     const char *s,*amp,*scolon;

  2316     amp=strstr(aline,"&");

  2317     if (amp)

  2318     {

  2319 	scolon=strstr(aline,";");

  2320 	if (scolon)

  2321 	{

  2322 	    i=(int)(scolon-amp+1);

  2323 	    for (s=amp;s<scolon;s++)

  2324 		if (*s==CHAR_SPACE)

  2325 		    i=0;                /* Don't report "Jones & Son;" */

  2326 	    if (i>0)

  2327 	    {

  2328 		strncpy(wrk,amp,i);

  2329 		wrk[i]=0;

  2330 		if (pswit[ECHO_SWITCH])

  2331 		    printf("\n%s\n",aline);

  2332 		if (!pswit[OVERVIEW_SWITCH])

  2333 		    printf("    Line %ld column %d - HTML symbol? %s \n",

  2334 		      linecnt,(int)(amp-aline)+1,wrk);

  2335 		else

  2336 		    cnt_html++;

  2337 	    }

  2338 	}

  2339     }

  2340 }

  2342 struct pending {

  2343     char dquote[80],squote[80],rbrack[80],sbrack[80],cbrack[80],unders[80];

  2344     long squot;

  2345 };

  2347 /*

  2348  * print_pending:

  2349  *

  2350  * If we are in a state of unbalanced quotes, and this line

  2351  * doesn't begin with a quote, output the stored error message.

  2352  * If the -P switch was used, print the warning even if the

  2353  * new para starts with quotes.

  2354  */

  2355 void print_pending(const char *aline,const char *parastart,

  2356   struct pending *pending)

  2357 {

  2358     const char *s;

  2359     s=aline;

  2360     while (*s==' ')

  2361 	s++;

  2362     if (*pending->dquote)

  2363 	if (*s!=CHAR_DQUOTE || pswit[QPARA_SWITCH])

  2364 	{

  2365 	    if (!pswit[OVERVIEW_SWITCH])

  2366 	    {

  2367 		if (pswit[ECHO_SWITCH])

  2368 		    printf("\n%s\n",parastart);

  2369 		puts(pending->dquote);

  2370 	    }

  2371 	    else

  2372 		cnt_dquot++;

  2373 	}

  2374     if (*pending->squote)

  2375     {

  2376 	if (*s!=CHAR_SQUOTE && *s!=CHAR_OPEN_SQUOTE || pswit[QPARA_SWITCH] ||

  2377 	  pending->squot)

  2378 	{

  2379 	    if (!pswit[OVERVIEW_SWITCH])

  2380 	    {

  2381 		if (pswit[ECHO_SWITCH])

  2382 		    printf("\n%s\n",parastart);

  2383 		puts(pending->squote);

  2384 	    }

  2385 	    else

  2386 		cnt_squot++;

  2387 	}

  2388     }

  2389     if (*pending->rbrack)

  2390     {

  2391 	if (!pswit[OVERVIEW_SWITCH])

  2392 	{

  2393 	    if (pswit[ECHO_SWITCH])

  2394 		printf("\n%s\n",parastart);

  2395 	    puts(pending->rbrack);

  2396 	}

  2397 	else

  2398 	    cnt_brack++;

  2399     }

  2400     if (*pending->sbrack)

  2401     {

  2402 	if (!pswit[OVERVIEW_SWITCH])

  2403 	{

  2404 	    if (pswit[ECHO_SWITCH])

  2405 		printf("\n%s\n",parastart);

  2406 	    puts(pending->sbrack);

  2407 	}

  2408 	else

  2409 	    cnt_brack++;

  2410     }

  2411     if (*pending->cbrack)

  2412     {

  2413 	if (!pswit[OVERVIEW_SWITCH])

  2414 	{

  2415 	    if (pswit[ECHO_SWITCH])

  2416 		printf("\n%s\n",parastart);

  2417 	    puts(pending->cbrack);

  2418 	}

  2419 	else

  2420 	    cnt_brack++;

  2421     }

  2422     if (*pending->unders)

  2423     {

  2424 	if (!pswit[OVERVIEW_SWITCH])

  2425 	{

  2426 	    if (pswit[ECHO_SWITCH])

  2427 		printf("\n%s\n",parastart);

  2428 	    puts(pending->unders);

  2429 	}

  2430 	else

  2431 	    cnt_brack++;

  2432     }

  2433 }

  2435 /*

  2436  * check_for_mismatched_quotes:

  2437  *

  2438  * At end of paragraph, check for mismatched quotes.

  2439  *

  2440  * We don't want to report an error immediately, since it is a

  2441  * common convention to omit the quotes at end of paragraph if

  2442  * the next paragraph is a continuation of the same speaker.

  2443  * Where this is the case, the next para should begin with a

  2444  * quote, so we store the warning message and only display it

  2445  * at the top of the next iteration if the new para doesn't

  2446  * start with a quote.

  2447  * The -p switch overrides this default, and warns of unclosed

  2448  * quotes on _every_ paragraph, whether the next begins with a

  2449  * quote or not.

  2450  */

  2451 void check_for_mismatched_quotes(const struct counters *counters,

  2452   struct pending *pending)

  2453 {

  2454     if (counters->quot%2)

  2455 	sprintf(pending->dquote,"    Line %ld - Mismatched quotes",

  2456 	  linecnt);

  2457     if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&

  2458       counters->open_single_quote!=counters->close_single_quote)

  2459 	sprintf(pending->squote,"    Line %ld - Mismatched singlequotes?",

  2460 	  linecnt);

  2461     if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&

  2462       counters->open_single_quote!=counters->close_single_quote &&

  2463       counters->open_single_quote!=counters->close_single_quote+1)

  2464 	/*

  2465 	 * Flag it to be noted regardless of the

  2466 	 * first char of the next para.

  2467 	 */

  2468 	pending->squot=1;

  2469     if (counters->r_brack)

  2470 	sprintf(pending->rbrack,"    Line %ld - Mismatched round brackets?",

  2471 	  linecnt);

  2472     if (counters->s_brack)

  2473 	sprintf(pending->sbrack,"    Line %ld - Mismatched square brackets?",

  2474 	  linecnt);

  2475     if (counters->c_brack)

  2476 	sprintf(pending->cbrack,"    Line %ld - Mismatched curly brackets?",

  2477 	  linecnt);

  2478     if (counters->c_unders%2)

  2479 	sprintf(pending->unders,"    Line %ld - Mismatched underscores?",

  2480 	  linecnt);

  2481 }

  2483 /*

  2484  * check_for_omitted_punctuation:

  2485  *

  2486  * Check for omitted punctuation at end of paragraph by working back

  2487  * through prevline. DW.

  2488  * Need to check this only for "normal" paras.

  2489  * So what is a "normal" para?

  2490  *    Not normal if one-liner (chapter headings, etc.)

  2491  *    Not normal if doesn't contain at least one locase letter

  2492  *    Not normal if starts with space

  2493  */

  2494 void check_for_omitted_punctuation(const char *prevline,

  2495   struct line_properties *last,int start_para_line)

  2496 {

  2497     int i;

  2498     const char *s;

  2499     for (s=prevline,i=0;*s && !i;s++)

  2500 	if (gcisletter(*s))

  2501 	    /* use i to indicate the presence of a letter on the line */

  2502 	    i=1;

  2503     /*

  2504      * This next "if" is a problem.

  2505      * If we say "start_para_line <= linecnt - 1", that includes

  2506      * one-line "paragraphs" like chapter heads. Lotsa false positives.

  2507      * If we say "start_para_line < linecnt - 1" it doesn't, but then it

  2508      * misses genuine one-line paragraphs.

  2509      */

  2510     if (i && last->blen>2 && start_para_line<linecnt-1 && *prevline>CHAR_SPACE)

  2511     {

  2512 	for (i=strlen(prevline)-1;

  2513 	  (prevline[i]==CHAR_DQUOTE || prevline[i]==CHAR_SQUOTE) &&

  2514 	  prevline[i]>CHAR_SPACE && i>0;

  2515 	  i--)

  2516 	    ;

  2517 	for (;i>0;i--)

  2518 	{

  2519 	    if (gcisalpha(prevline[i]))

  2520 	    {

  2521 		if (pswit[ECHO_SWITCH])

  2522 		    printf("\n%s\n",prevline);

  2523 		if (!pswit[OVERVIEW_SWITCH])

  2524 		    printf("    Line %ld column %d - "

  2525 		      "No punctuation at para end?\n",

  2526 		      linecnt-1,strlen(prevline));

  2527 		else

  2528 		    cnt_punct++;

  2529 		break;

  2530 	    }

  2531 	    if (strchr("-.:!([{?}])",prevline[i]))

  2532 		break;

  2533 	}

  2534     }

  2535 }

  2537 /*

  2538  * procfile:

  2539  *

  2540  * Process one file.

  2541  */

  2542 void procfile(char *filename)

  2543 {

  2544     const char *s;

  2545     char parastart[81];     /* first line of current para */

  2546     FILE *infile;

  2547     struct first_pass_results *first_pass_results;

  2548     struct warnings *warnings;

  2549     struct counters counters={0};

  2550     struct line_properties last={0};

  2551     struct parities parities={0};

  2552     struct pending pending={{0},};

  2553     int isemptyline;

  2554     long start_para_line;

  2555     signed int i,llen,isacro,isellipsis;

  2556     signed int isnewpara;

  2557     signed int enddash;

  2558     last.start=CHAR_SPACE;

  2559     *prevline=0;

  2560     linecnt=checked_linecnt=start_para_line=0;

  2561     i=llen=isacro=isellipsis=0;

  2562     isnewpara=enddash=0;

  2563     infile=fopen(filename,"rb");

  2564     if (!infile)

  2565     {

  2566         if (pswit[STDOUT_SWITCH])

  2567             fprintf(stdout,"bookloupe: cannot open %s\n",filename);

  2568         else

  2569             fprintf(stderr,"bookloupe: cannot open %s\n",filename);

  2570 	exit(1);

  2571     }

  2572     fprintf(stdout,"\n\nFile: %s\n\n",filename);

  2573     first_pass_results=first_pass(infile);

  2574     warnings=report_first_pass(first_pass_results);

  2575     /*

  2576      * Here we go with the main pass. Hold onto yer hat!

  2577      */

  2578     rewind(infile);

  2579     linecnt=0;

  2580     while (flgets(aline,LINEBUFSIZE-1,infile,linecnt+1))

  2581     {

  2582         linecnt++;

  2583         if (linecnt==1)

  2584 	    isnewpara=1;

  2585         if (pswit[DP_SWITCH] && !strncmp(aline,"-----File: ",11))

  2586 	    continue;    // skip DP page separators completely

  2587         if (linecnt<first_pass_results->firstline ||

  2588 	  (first_pass_results->footerline>0 &&

  2589 	  linecnt>first_pass_results->footerline))

  2590 	{

  2591             if (pswit[HEADER_SWITCH])

  2592 	    {

  2593                 if (!strncmp(aline,"Title:",6))

  2594                     printf("    %s\n",aline);

  2595                 if (!strncmp(aline,"Author:",7))

  2596                     printf("    %s\n",aline);

  2597                 if (!strncmp(aline,"Release Date:",13))

  2598                     printf("    %s\n",aline);

  2599                 if (!strncmp(aline,"Edition:",8))

  2600                     printf("    %s\n\n",aline);

  2601 	    }

  2602             continue;                /* skip through the header */

  2603 	}

  2604         checked_linecnt++;

  2605 	print_pending(aline,parastart,&pending);

  2606 	memset(&pending,0,sizeof(pending));

  2607 	isemptyline=analyse_quotes(aline,&counters);

  2608         if (isnewpara && !isemptyline)

  2609 	{

  2610 	    /* This line is the start of a new paragraph. */

  2611             start_para_line=linecnt;

  2612 	    /* Capture its first line in case we want to report it later. */

  2613             strncpy(parastart,aline,80);

  2614             parastart[79]=0;

  2615 	    memset(&parities,0,sizeof(parities));  /* restart the quote count */

  2616             s=aline;

  2617             while (!gcisalpha(*s) && !gcisdigit(*s) && *s)

  2618 		s++;

  2619             if (*s>='a' && *s<='z')

  2620 	    {

  2621 		/* and its first letter is lowercase */

  2622                 if (pswit[ECHO_SWITCH])

  2623 		    printf("\n%s\n",aline);

  2624                 if (!pswit[OVERVIEW_SWITCH])

  2625                     printf("    Line %ld column %d - "

  2626 		      "Paragraph starts with lower-case\n",

  2627 		      linecnt,(int)(s-aline)+1);

  2628                 else

  2629                     cnt_punct++;

  2630 	    }

  2631             isnewpara=0; /* Signal the end of new para processing. */

  2632 	}

  2633         /* Check for an em-dash broken at line end. */

  2634         if (enddash && *aline=='-')

  2635 	{

  2636             if (pswit[ECHO_SWITCH])

  2637 		printf("\n%s\n",aline);

  2638             if (!pswit[OVERVIEW_SWITCH])

  2639                 printf("    Line %ld column 1 - Broken em-dash?\n",linecnt);

  2640             else

  2641                 cnt_punct++;

  2642 	}

  2643         enddash=0;

  2644         for (s=aline+strlen(aline)-1;*s==' ' && s>aline;s--)

  2645 	    ;

  2646         if (s>=aline && *s=='-')

  2647             enddash=1;

  2648 	/*

  2649          * Check for invalid or questionable characters in the line

  2650          * Anything above 127 is invalid for plain ASCII, and

  2651          * non-printable control characters should also be flagged.

  2652          * Tabs should generally not be there.

  2653 	 */

  2654         for (s=aline;*s;s++)

  2655 	{

  2656             i=(unsigned char)*s;

  2657             if (i<CHAR_SPACE && i!=CHAR_LF && i!=CHAR_CR && i!=CHAR_TAB)

  2658 	    {

  2659                 if (pswit[ECHO_SWITCH])

  2660 		    printf("\n%s\n",aline);

  2661                 if (!pswit[OVERVIEW_SWITCH])

  2662                     printf("    Line %ld column %d - Control character %d\n",

  2663 		      linecnt,(int)(s-aline)+1,i);

  2664                 else

  2665                     cnt_bin++;

  2666 	    }

  2667 	}

  2668         if (warnings->bin)

  2669 	    check_for_odd_characters(aline,warnings,isemptyline);

  2670         if (warnings->longline)

  2671 	    check_for_long_line(aline);

  2672         if (warnings->shortline)

  2673 	    check_for_short_line(aline,&last);

  2674         last.blen=last.len;

  2675         last.len=strlen(aline);

  2676         last.start=aline[0];

  2677 	check_for_starting_punctuation(aline);

  2678         if (warnings->dash)

  2679 	{

  2680 	    check_for_spaced_emdash(aline);

  2681 	    check_for_spaced_dash(aline);

  2682 	}

  2683 	check_for_unmarked_paragraphs(aline);

  2684 	check_for_jeebies(aline);

  2685 	check_for_mta_from(aline);

  2686 	check_for_orphan_character(aline);

  2687 	check_for_pling_scanno(aline);

  2688 	check_for_extra_period(aline,warnings);

  2689 	check_for_following_punctuation(aline);

  2690 	check_for_typos(aline,warnings);

  2691 	check_for_misspaced_punctuation(aline,&parities,isemptyline);

  2692 	check_for_double_punctuation(aline,warnings);

  2693 	check_for_spaced_quotes(aline);

  2694 	check_for_miscased_genative(aline);

  2695 	check_end_of_line(aline,warnings);

  2696 	check_for_unspaced_bracket(aline);

  2697         if (warnings->endquote)

  2698 	    check_for_unpunctuated_endquote(aline);

  2699 	check_for_html_tag(aline);

  2700 	check_for_html_entity(aline);

  2701         if (isemptyline)

  2702 	{

  2703 	    check_for_mismatched_quotes(&counters,&pending);

  2704 	    memset(&counters,0,sizeof(counters));

  2705 	    /* let the next iteration know that it's starting a new para */

  2706             isnewpara=1;

  2707 	    check_for_omitted_punctuation(prevline,&last,start_para_line);

  2708 	}

  2709         strcpy(prevline,aline);

  2710     }

  2711     fclose(infile);

  2712     if (!pswit[OVERVIEW_SWITCH])

  2713         for (i=0;i<MAX_QWORD;i++)

  2714             if (dupcnt[i])

  2715                 printf("\nNote: Queried word %s was duplicated %d time%s\n",

  2716 		  qword[i],dupcnt[i],"s");

  2717 }

  2719 /*

  2720  * flgets:

  2721  *

  2722  * Get one line from the input stream, checking for

  2723  * the existence of exactly one CR/LF line-end per line.

  2724  *

  2725  * Returns: a pointer to the line.

  2726  */

  2727 char *flgets(char *theline,int maxlen,FILE *thefile,long lcnt)

  2728 {

  2729     char c;

  2730     int len,isCR,cint;

  2731     *theline=0;

  2732     len=isCR=0;

  2733     c=cint=fgetc(thefile);

  2734     do

  2735     {

  2736         if (cint==EOF)

  2737             return NULL;

  2738 	/* either way, it's end of line */

  2739         if (c==10)

  2740 	{

  2741             if (isCR)

  2742                 break;

  2743             else

  2744 	    {

  2745 		/* Error - a LF without a preceding CR */

  2746                 if (pswit[LINE_END_SWITCH])

  2747 		{

  2748                     if (pswit[ECHO_SWITCH])

  2749 			printf("\n%s\n",theline);

  2750                     if (!pswit[OVERVIEW_SWITCH])

  2751                         printf("    Line %ld - No CR?\n",lcnt);

  2752                     else

  2753                         cnt_lineend++;

  2754 		}

  2755                 break;

  2756 	    }

  2757 	}

  2758         if (c==13)

  2759 	{

  2760             if (isCR)

  2761 	    {

  2762 		/* Error - two successive CRs */

  2763                 if (pswit[LINE_END_SWITCH])

  2764 		{

  2765                     if (pswit[ECHO_SWITCH])

  2766 			printf("\n%s\n",theline);

  2767                     if (!pswit[OVERVIEW_SWITCH])

  2768                         printf("    Line %ld - Two successive CRs?\n",lcnt);

  2769                     else

  2770                         cnt_lineend++;

  2771 		}

  2772 	    }

  2773             isCR=1;

  2774 	}

  2775         else

  2776 	{

  2777             if (pswit[LINE_END_SWITCH] && isCR)

  2778 	    {

  2779                 if (pswit[ECHO_SWITCH])

  2780 		    printf("\n%s\n",theline);

  2781                 if (!pswit[OVERVIEW_SWITCH])

  2782                     printf("    Line %ld column %d - CR without LF?\n",

  2783 		      lcnt,len+1);

  2784                 else

  2785                     cnt_lineend++;

  2786 	    }

  2787             theline[len]=c;

  2788             len++;

  2789             theline[len]=0;

  2790             isCR=0;

  2791 	}

  2792         c=cint=fgetc(thefile);

  2793     } while(len<maxlen);

  2794     if (pswit[MARKUP_SWITCH])

  2795         postprocess_for_HTML(theline);

  2796     if (pswit[DP_SWITCH])

  2797         postprocess_for_DP(theline);

  2798     return theline;

  2799 }

  2801 /*

  2802  * mixdigit:

  2803  *

  2804  * Takes a "word" as a parameter, and checks whether it

  2805  * contains a mixture of alpha and digits. Generally, this is an

  2806  * error, but may not be for cases like 4th or L5 12s. 3d.

  2807  *

  2808  * Returns: 0 if no error found, 1 if error.

  2809  */

  2810 int mixdigit(char *checkword)

  2811 {

  2812     int wehaveadigit,wehavealetter,firstdigits,query,wl;

  2813     char *s;

  2814     wehaveadigit=wehavealetter=query=0;

  2815     for (s=checkword;*s;s++)

  2816         if (gcisalpha(*s))

  2817             wehavealetter=1;

  2818         else

  2819             if (gcisdigit(*s))

  2820                 wehaveadigit=1;

  2821     if (wehaveadigit && wehavealetter)

  2822     {

  2823 	/* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */

  2824         query=1;

  2825         wl=strlen(checkword);

  2826         for (firstdigits=0;gcisdigit(checkword[firstdigits]);firstdigits++)

  2827             ;

  2828         /* digits, ending in st, rd, nd, th of either case */

  2829         if (firstdigits+2==wl && (matchword(checkword+wl-2,"st") ||

  2830 	  matchword(checkword+wl-2,"rd") || matchword(checkword+wl-2,"nd") ||

  2831 	  matchword(checkword+wl-2,"th")))

  2832 	    query=0;

  2833         if (firstdigits+3==wl && (matchword(checkword+wl-3,"sts") ||

  2834 	  matchword(checkword+wl-3,"rds") || matchword(checkword+wl-3,"nds") ||

  2835 	  matchword(checkword+wl-3,"ths")))

  2836 	    query=0;

  2837         if (firstdigits+3==wl && (matchword(checkword+wl-4,"stly") ||

  2838 	  matchword(checkword+wl-4,"rdly") ||

  2839 	  matchword(checkword+wl-4,"ndly") || matchword(checkword+wl-4,"thly")))

  2840 	    query=0;

  2841         /* digits, ending in l, L, s or d */

  2842         if (firstdigits+1==wl && (checkword[wl-1]=='l' ||

  2843 	  checkword[wl-1]=='L' || checkword[wl-1]=='s' || checkword[wl-1]=='d'))

  2844 	    query=0;

  2845         /*

  2846 	 * L at the start of a number, representing Britsh pounds, like L500.

  2847          * This is cute. We know the current word is mixeddigit. If the first

  2848          * letter is L, there must be at least one digit following. If both

  2849          * digits and letters follow, we have a genuine error, else we have a

  2850          * capital L followed by digits, and we accept that as a non-error.

  2851 	 */

  2852         if (checkword[0]=='L' && !mixdigit(checkword+1))

  2853 	    query=0;

  2854     }

  2855     return query;

  2856 }

  2858 /*

  2859  * getaword:

  2860  *

  2861  * Extracts the first/next "word" from the line, and puts

  2862  * it into "thisword". A word is defined as one English word unit--or

  2863  * at least that's the aim.

  2864  *

  2865  * Returns: a pointer to the position in the line where we will start

  2866  *          looking for the next word.

  2867  */

  2868 const char *getaword(const char *fromline,char *thisword)

  2869 {

  2870     int i,wordlen;

  2871     const char *s;

  2872     wordlen=0;

  2873     for (;!gcisdigit(*fromline) && !gcisalpha(*fromline) && *fromline;

  2874       fromline++)

  2875 	;

  2876     /*

  2877      * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.

  2878      * Especially yucky is the case of L1,000

  2879      * This section looks for a pattern of characters including a digit

  2880      * followed by a comma or period followed by one or more digits.

  2881      * If found, it returns this whole pattern as a word; otherwise we discard

  2882      * the results and resume our normal programming.

  2883      */

  2884     s=fromline;

  2885     for (;(gcisdigit(*s) || gcisalpha(*s) || *s==',' || *s=='.') &&

  2886       wordlen<MAXWORDLEN;s++)

  2887     {

  2888 	thisword[wordlen]=*s;

  2889         wordlen++;

  2890     }

  2891     thisword[wordlen]=0;

  2892     for (i=1;i<wordlen-1;i++)

  2893     {

  2894         if (thisword[i]=='.' || thisword[i]==',')

  2895 	{

  2896             if (gcisdigit(thisword[i-1]) && gcisdigit(thisword[i-1]))

  2897 	    {

  2898                 fromline=s;

  2899                 return fromline;

  2900 	    }

  2901 	}

  2902     }

  2903     /* we didn't find a punctuated number - do the regular getword thing */

  2904     wordlen=0;

  2905     for (;(gcisdigit(*fromline) || gcisalpha(*fromline) || *fromline=='\'') &&

  2906       wordlen<MAXWORDLEN;fromline++)

  2907     {

  2908         thisword[wordlen]=*fromline;

  2909         wordlen++;

  2910     }

  2911     thisword[wordlen]=0;

  2912     return fromline;

  2913 }

  2915 /*

  2916  * matchword:

  2917  *

  2918  * A case-insensitive string matcher.

  2919  */

  2920 int matchword(char *checkfor,char *thisword)

  2921 {

  2922     unsigned int ismatch,i;

  2923     if (strlen(checkfor)!=strlen(thisword))

  2924 	return 0;

  2925     ismatch=1;     /* assume a match until we find a difference */

  2926     for (i=0;i<strlen(checkfor);i++)

  2927         if (toupper(checkfor[i])!=toupper(thisword[i]))

  2928             ismatch=0;

  2929     return ismatch;

  2930 }

  2932 /*

  2933  * lowerit:

  2934  *

  2935  * Lowercase the line.

  2936  */

  2938 void lowerit(char *theline)

  2939 {

  2940     for (;*theline;theline++)

  2941         if (*theline>='A' && *theline<='Z')

  2942             *theline+=32;

  2943 }

  2945 /*

  2946  * isroman:

  2947  *

  2948  * Is this word a Roman Numeral?

  2949  *

  2950  * It doesn't actually validate that the number is a valid Roman Numeral--for

  2951  * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not

  2952  * what we're here to do. If it passes this, it LOOKS like a Roman numeral.

  2953  * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or

  2954  * expressions thereof, except when it came to taxes. Allow any number of M,

  2955  * an optional D, an optional CM or CD, any number of optional Cs, an optional

  2956  * XL or an optional XC, an optional IX or IV, an optional V and any number

  2957  * of optional Is.

  2958  */

  2959 int isroman(char *t)

  2960 {

  2961     char *s;

  2962     if (!t || !*t)

  2963 	return 0;

  2964     s=t;

  2965     while (*t=='m' && *t)

  2966 	t++;

  2967     if (*t=='d')

  2968 	t++;

  2969     if (*t=='c' && t[1]=='m')

  2970 	t+=2;

  2971     if (*t=='c' && t[1]=='d')

  2972 	t+=2;

  2973     while (*t=='c' && *t)

  2974 	t++;

  2975     if (*t=='x' && t[1]=='l')

  2976 	t+=2;

  2977     if (*t=='x' && t[1]=='c')

  2978 	t+=2;

  2979     if (*t=='l')

  2980 	t++;

  2981     while (*t=='x' && *t)

  2982 	t++;

  2983     if (*t=='i' && t[1]=='x')

  2984 	t+=2;

  2985     if (*t=='i' && t[1]=='v')

  2986 	t+=2;

  2987     if (*t=='v')

  2988 	t++;

  2989     while (*t=='i' && *t)

  2990 	t++;

  2991     return !*t;

  2992 }

  2994 /*

  2995  * gcisalpha:

  2996  *

  2997  * A version of isalpha() that is somewhat lenient on 8-bit texts.

  2998  * If we use the standard function, 8-bit accented characters break

  2999  * words, so that tete with accented characters appears to be two words, "t"

  3000  * and "t", with 8-bit characters between them. This causes over-reporting of

  3001  * errors. gcisalpha() recognizes accented letters from the CP1252 (Windows)

  3002  * and ISO-8859-1 character sets, which are the most common PG 8-bit types.

  3003  */

  3004 int gcisalpha(unsigned char c)

  3005 {

  3006     if (c>='a' && c<='z')

  3007 	return 1;

  3008     if (c>='A' && c<='Z')

  3009 	return 1;

  3010     if (c<140)

  3011 	return 0;

  3012     if (c>=192 && c!=208 && c!=215 && c!=222 && c!=240 && c!=247 && c!=254)

  3013 	return 1;

  3014     if (c==140 || c==142 || c==156 || c==158 || c==159)

  3015 	return 1;

  3016     return 0;

  3017 }

  3019 /*

  3020  * gcisdigit:

  3021  *

  3022  * A version of isdigit() that doesn't get confused in 8-bit texts.

  3023  */

  3024 int gcisdigit(unsigned char c)

  3025 {

  3026     return c>='0' && c<='9';

  3027 }

  3029 /*

  3030  * gcisletter:

  3031  *

  3032  * A version of isletter() that doesn't get confused in 8-bit texts.

  3033  * NB: this is ISO-8891-1-specific.

  3034  */

  3035 int gcisletter(unsigned char c)

  3036 {

  3037     return c>='A' && c<='Z' || c>='a' && c<='z' || c>=192;

  3038 }

  3040 /*

  3041  * gcstrchr:

  3042  *

  3043  * Wraps strchr to return NULL if the character being searched for is zero.

  3044  */

  3045 char *gcstrchr(char *s,char c)

  3046 {

  3047     if (!c)

  3048 	return NULL;

  3049     return strchr(s,c);

  3050 }

  3052 /*

  3053  * postprocess_for_DP:

  3054  *

  3055  * Invoked with the -d switch from flgets().

  3056  * It simply "removes" from the line a hard-coded set of common

  3057  * DP-specific tags, so that the line passed to the main routine has

  3058  * been pre-cleaned of DP markup.

  3059  */

  3060 void postprocess_for_DP(char *theline)

  3061 {

  3062     char *s,*t;

  3063     int i;

  3064     if (!*theline)

  3065         return;

  3066     for (i=0;*DPmarkup[i];i++)

  3067     {

  3068         s=strstr(theline,DPmarkup[i]);

  3069         while (s)

  3070 	{

  3071             t=s+strlen(DPmarkup[i]);

  3072             while (*t)

  3073 	    {

  3074                 *s=*t;

  3075                 t++;

  3076 		s++;

  3077 	    }

  3078             *s=0;

  3079             s=strstr(theline,DPmarkup[i]);

  3080 	}

  3081     }

  3082 }

  3084 /*

  3085  * postprocess_for_HTML:

  3086  *

  3087  * Invoked with the -m switch from flgets().

  3088  * It simply "removes" from the line a hard-coded set of common

  3089  * HTML tags and "replaces" a hard-coded set of common HTML

  3090  * entities, so that the line passed to the main routine has

  3091  * been pre-cleaned of HTML.

  3092  */

  3093 void postprocess_for_HTML(char *theline)

  3094 {

  3095     if (strstr(theline,"<") && strstr(theline,">"))

  3096         while (losemarkup(theline))

  3097             ;

  3098     while (loseentities(theline))

  3099         ;

  3100 }

  3102 char *losemarkup(char *theline)

  3103 {

  3104     char *s,*t;

  3105     int i;

  3106     if (!*theline)

  3107         return NULL;

  3108     s=strstr(theline,"<");

  3109     t=strstr(theline,">");

  3110     if (!s || !t)

  3111 	return NULL;

  3112     for (i=0;*markup[i];i++)

  3113         if (!tagcomp(s+1,markup[i]))

  3114 	{

  3115             if (!t[1])

  3116 	    {

  3117                 *s=0;

  3118                 return s;

  3119 	    }

  3120             else if (t>s)

  3121 	    {

  3122 		strcpy(s,t+1);

  3123 		return s;

  3124 	    }

  3125         }

  3126     /* It's an unrecognized <xxx>. */

  3127     return NULL;

  3128 }

  3130 char *loseentities(char *theline)

  3131 {

  3132     int i;

  3133     char *s,*t;

  3134     if (!*theline)

  3135         return NULL;

  3136     for (i=0;*entities[i].htmlent;i++)

  3137     {

  3138         s=strstr(theline,entities[i].htmlent);

  3139         if (s)

  3140 	{

  3141             t=malloc((size_t)strlen(s));

  3142             if (!t)

  3143 		return NULL;

  3144             strcpy(t,s+strlen(entities[i].htmlent));

  3145             strcpy(s,entities[i].textent);

  3146             strcat(s,t);

  3147             free(t);

  3148             return theline;

  3149 	}

  3150     }

  3151     for (i=0;*entities[i].htmlnum;i++)

  3152     {

  3153         s=strstr(theline,entities[i].htmlnum);

  3154         if (s)

  3155 	{

  3156             t=malloc((size_t)strlen(s));

  3157             if (!t)

  3158 		return NULL;

  3159             strcpy(t,s+strlen(entities[i].htmlnum));

  3160             strcpy(s,entities[i].textent);

  3161             strcat(s,t);

  3162             free(t);

  3163             return theline;

  3164 	}

  3165     }

  3166     return NULL;

  3167 }

  3169 int tagcomp(char *strin,char *basetag)

  3170 {

  3171     char *s,*t;

  3172     s=basetag;

  3173     t=strin;

  3174     if (*t=='/')

  3175 	t++; /* ignore a slash */

  3176     while (*s && *t)

  3177     {

  3178         if (tolower(*s)!=tolower(*t))

  3179 	    return 1;

  3180         s++;

  3181 	t++;

  3182     }

  3183     return 0;

  3184 }

  3186 void proghelp()

  3187 {

  3188     fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);

  3189     fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);

  3190     fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);

  3191     fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "

  3192       "For details, read the file COPYING.\n",stderr);

  3193     fputs("This is Free Software; "

  3194       "you may redistribute it under certain conditions (GPL);\n",stderr);

  3195     fputs("read the file COPYING for details.\n\n",stderr);

  3196     fputs("Usage is: bookloupe [-setpxloyhud] filename\n",stderr);

  3197     fputs("  where -s checks single quotes, -e suppresses echoing lines, "

  3198       "-t checks typos\n",stderr);

  3199     fputs("  -x (paranoid) switches OFF -t and extra checks, "

  3200       "-l turns OFF line-end checks\n",stderr);

  3201     fputs("  -o just displays overview without detail, "

  3202       "-h echoes header fields\n",stderr);

  3203     fputs("  -v (verbose) unsuppresses duplicate reporting, "

  3204       "-m suppresses markup\n",stderr);

  3205     fputs("  -d ignores DP-specific markup,\n",stderr);

  3206     fputs("  -u uses a file gutcheck.typ to query user-defined "

  3207       "possible typos\n",stderr);

  3208     fputs("Sample usage: bookloupe warpeace.txt \n",stderr);

  3209     fputs("\n",stderr);

  3210     fputs("Bookloupe looks for errors in Project Gutenberg(TM) etexts.\n",

  3211       stderr);

  3212     fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "

  3213       "non-ASCII\n",stderr);

  3214     fputs("characters like accented letters, "

  3215       "lines longer than 75 or shorter than 55,\n",stderr);

  3216     fputs("unbalanced quotes or brackets, "

  3217       "a variety of badly formatted punctuation, \n",stderr);

  3218     fputs("HTML tags, some likely typos. "

  3219       "It is NOT a substitute for human judgement.\n",stderr);

  3220     fputs("\n",stderr);

  3221 }

author	ali <ali@juiblex.co.uk>
	Sun May 26 22:37:16 2013 +0100 (2013-05-26)
changeset 66	a5ef278feb34
parent 65	1b0e1aaf1800
child 67	865063352146
permissions	-rw-r--r--