bookloupe-testing: bookloupe/bookloupe.c@4c8606eb60c1

     1 /*************************************************************************/

     2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */

     3 /*                                                                       */

     4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>                  */

     5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>                     */

     6 /*                                                                       */

     7 /* This program is free software; you can redistribute it and/or modify  */

     8 /* it under the terms of the GNU General Public License as published by  */

     9 /* the Free Software Foundation; either version 2 of the License, or     */

    10 /* (at your option) any later version.                                   */

    11 /*                                                                       */

    12 /* This program is distributed in the hope that it will be useful,       */

    13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of        */

    14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the          */

    15 /* GNU General Public License for more details.                          */

    16 /*                                                                       */

    17 /* You should have received a copy of the GNU General Public License     */

    18 /* along with this program. If not, see <http://www.gnu.org/licenses/>.  */

    19 /*************************************************************************/

    21 #include <stdio.h>

    22 #include <stdlib.h>

    23 #include <string.h>

    24 #include <ctype.h>

    26 #define MAXWORDLEN    80    /* max length of one word             */

    27 #define LINEBUFSIZE 2048    /* buffer size for an input line      */

    29 #define MAX_USER_TYPOS 1000

    30 #define USERTYPO_FILE "gutcheck.typ"

    32 #ifndef MAX_PATH

    33 #define MAX_PATH 16384

    34 #endif

    36 char aline[LINEBUFSIZE];

    37 char prevline[LINEBUFSIZE];

    39 /* Common typos. */

    40 char *typo[] = {

    41     "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",

    42     "nad", "te", "ig", "acn",  "ahve", "alot", "anbd", "andt", "awya", "aywa",

    43     "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",

    44     "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",

    45     "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",

    46     "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",

    47     "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",

    48     "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",

    49     "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",

    50     "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",

    51     "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",

    52     "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",

    53     "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",

    54     "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",

    55     "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",

    56     "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",

    57     "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",

    58     "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",

    59     "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",

    60     "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",

    61     "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",

    62     "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",

    63     "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",

    64     "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",

    65     "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",

    66     "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",

    67     "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",

    68     "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",

    69     "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",

    70     "se", ""

    71 };

    73 char *usertypo[MAX_USER_TYPOS];

    75 /* Common abbreviations and other OK words not to query as typos. */

    76 char *okword[] = {

    77     "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",

    78     "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",

    79     "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",

    80     "outbid", "outbids", "frostbite", "frostbitten", ""

    81 };

    83 /* Common abbreviations that cause otherwise unexplained periods. */

    84 char *abbrev[] = {

    85     "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",

    86     "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""

    87 };

    89 /*

    90  * Two-Letter combinations that rarely if ever start words,

    91  * but are common scannos or otherwise common letter combinations.

    92  */

    93 char *nostart[] = {

    94     "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""

    95 };

    97 /*

    98  * Two-Letter combinations that rarely if ever end words,

    99  * but are common scannos or otherwise common letter combinations.

   100  */

   101 char *noend[] = {

   102     "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",

   103     "sw", "gr", "sl", "cl", "iy", ""

   104 };

   106 char *markup[] = {

   107     "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",

   108     "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",

   109     "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",

   110     "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""

   111 };

   113 char *DPmarkup[] = {

   114     "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""

   115 };

   117 char *nocomma[] = {

   118     "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",

   119     "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",

   120     "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",

   121     "during", "let", "toward", "among", ""

   122 };

   124 char *noperiod[] = {

   125     "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",

   126     "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",

   127     "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",

   128     "among", "those", "into", "whom", "having", "thence", ""

   129 };

   131 char vowels[] = "aeiouàáâãäæèéêëìíîïòóôõöùúûü";

   133 struct {

   134     char *htmlent;

   135     char *htmlnum;

   136     char *textent;

   137 } entities[] = {

   138     "&amp;",	"&#38;",     "&",

   139     "&lt;",	"&#60;",     "<",

   140     "&gt;",	"&#62;",     ">",

   141     "&deg;",	"&#176;",    " degrees",

   142     "&pound;",	"&#163;",    "L",

   143     "&quot;",	"&#34;",     "\"", /* quotation mark = APL quote */

   144     "&OElig;",	"&#338;",    "OE", /* latin capital ligature OE */

   145     "&oelig;",	"&#339;",    "oe", /* latin small ligature oe */

   146     "&Scaron;",	"&#352;",    "S", /* latin capital letter S with caron */

   147     "&scaron;",	"&#353;",    "s", /* latin small letter s with caron */

   148     "&Yuml;",	"&#376;",    "Y", /* latin capital letter Y with diaeresis */

   149     "&circ;",	"&#710;",    "",  /* modifier letter circumflex accent */

   150     "&tilde;",	"&#732;",    "~", /* small tilde, U+02DC ISOdia */

   151     "&ensp;",	"&#8194;",   " ", /* en space, U+2002 ISOpub */

   152     "&emsp;",	"&#8195;",   " ", /* em space, U+2003 ISOpub */

   153     "&thinsp;",	"&#8201;",   " ", /* thin space, U+2009 ISOpub */

   154     "&ndash;",	"&#8211;",   "-", /* en dash, U+2013 ISOpub */

   155     "&mdash;",	"&#8212;",   "--", /* em dash, U+2014 ISOpub */

   156     "&rsquo;",	"&#8217;",   "'", /* right single quotation mark */

   157     "&sbquo;",	"&#8218;",   "'", /* single low-9 quotation mark */

   158     "&ldquo;",	"&#8220;",   "\"", /* left double quotation mark */

   159     "&rdquo;",	"&#8221;",   "\"", /* right double quotation mark */

   160     "&bdquo;",	"&#8222;",   "\"", /* double low-9 quotation mark */

   161     "&lsaquo;",	"&#8249;",   "\"", /* single left-pointing angle quotation mark */

   162     "&rsaquo;",	"&#8250;",   "\"", /* single right-pointing angle quotation mark */

   163     "&nbsp;",	"&#160;",    " ", /* no-break space = non-breaking space, */

   164     "&iexcl;",	"&#161;",    "!", /* inverted exclamation mark */

   165     "&cent;",	"&#162;",    "c", /* cent sign */

   166     "&pound;",	"&#163;",    "L", /* pound sign */

   167     "&curren;",	"&#164;",    "$", /* currency sign */

   168     "&yen;",	"&#165;",    "Y", /* yen sign = yuan sign */

   169     "&sect;",	"&#167;",    "--", /* section sign */

   170     "&uml;",	"&#168;",    " ", /* diaeresis = spacing diaeresis */

   171     "&copy;",	"&#169;",    "(C) ", /* copyright sign */

   172     "&ordf;",	"&#170;",    " ", /* feminine ordinal indicator */

   173     "&laquo;",	"&#171;",    "\"", /* left-pointing double angle quotation mark */

   174     "&shy;",	"&#173;",    "-", /* soft hyphen = discretionary hyphen */

   175     "&reg;",	"&#174;",    "(R) ", /* registered sign = registered trade mark sign */

   176     "&macr;",	"&#175;",    " ", /* macron = spacing macron = overline */

   177     "&deg;",	"&#176;",    " degrees", /* degree sign */

   178     "&plusmn;",	"&#177;",    "+-", /* plus-minus sign = plus-or-minus sign */

   179     "&sup2;",	"&#178;",    "2", /* superscript two = superscript digit two */

   180     "&sup3;",	"&#179;",    "3", /* superscript three = superscript digit three */

   181     "&acute;",	"&#180;",    " ", /* acute accent = spacing acute */

   182     "&micro;",	"&#181;",    "m", /* micro sign */

   183     "&para;",	"&#182;",    "--", /* pilcrow sign = paragraph sign */

   184     "&cedil;",	"&#184;",    " ", /* cedilla = spacing cedilla */

   185     "&sup1;",	"&#185;",    "1", /* superscript one = superscript digit one */

   186     "&ordm;",	"&#186;",    " ", /* masculine ordinal indicator */

   187     "&raquo;",	"&#187;",    "\"", /* right-pointing double angle quotation mark */

   188     "&frac14;",	"&#188;",    "1/4", /* vulgar fraction one quarter */

   189     "&frac12;",	"&#189;",    "1/2", /* vulgar fraction one half */

   190     "&frac34;",	"&#190;",    "3/4", /* vulgar fraction three quarters */

   191     "&iquest;",	"&#191;",    "?", /* inverted question mark */

   192     "&Agrave;",	"&#192;",    "A", /* latin capital letter A with grave */

   193     "&Aacute;",	"&#193;",    "A", /* latin capital letter A with acute */

   194     "&Acirc;",	"&#194;",    "A", /* latin capital letter A with circumflex */

   195     "&Atilde;",	"&#195;",    "A", /* latin capital letter A with tilde */

   196     "&Auml;",	"&#196;",    "A", /* latin capital letter A with diaeresis */

   197     "&Aring;",	"&#197;",    "A", /* latin capital letter A with ring above */

   198     "&AElig;",	"&#198;",    "AE", /* latin capital letter AE */

   199     "&Ccedil;",	"&#199;",    "C", /* latin capital letter C with cedilla */

   200     "&Egrave;",	"&#200;",    "E", /* latin capital letter E with grave */

   201     "&Eacute;",	"&#201;",    "E", /* latin capital letter E with acute */

   202     "&Ecirc;",	"&#202;",    "E", /* latin capital letter E with circumflex */

   203     "&Euml;",	"&#203;",    "E", /* latin capital letter E with diaeresis */

   204     "&Igrave;",	"&#204;",    "I", /* latin capital letter I with grave */

   205     "&Iacute;",	"&#205;",    "I", /* latin capital letter I with acute */

   206     "&Icirc;",	"&#206;",    "I", /* latin capital letter I with circumflex */

   207     "&Iuml;",	"&#207;",    "I", /* latin capital letter I with diaeresis */

   208     "&ETH;",	"&#208;",    "E", /* latin capital letter ETH */

   209     "&Ntilde;",	"&#209;",    "N", /* latin capital letter N with tilde */

   210     "&Ograve;",	"&#210;",    "O", /* latin capital letter O with grave */

   211     "&Oacute;",	"&#211;",    "O", /* latin capital letter O with acute */

   212     "&Ocirc;",	"&#212;",    "O", /* latin capital letter O with circumflex */

   213     "&Otilde;",	"&#213;",    "O", /* latin capital letter O with tilde */

   214     "&Ouml;",	"&#214;",    "O", /* latin capital letter O with diaeresis */

   215     "&times;",	"&#215;",    "*", /* multiplication sign */

   216     "&Oslash;",	"&#216;",    "O", /* latin capital letter O with stroke */

   217     "&Ugrave;",	"&#217;",    "U", /* latin capital letter U with grave */

   218     "&Uacute;",	"&#218;",    "U", /* latin capital letter U with acute */

   219     "&Ucirc;",	"&#219;",    "U", /* latin capital letter U with circumflex */

   220     "&Uuml;",	"&#220;",    "U", /* latin capital letter U with diaeresis */

   221     "&Yacute;",	"&#221;",    "Y", /* latin capital letter Y with acute */

   222     "&THORN;",	"&#222;",    "TH", /* latin capital letter THORN */

   223     "&szlig;",	"&#223;",    "sz", /* latin small letter sharp s = ess-zed */

   224     "&agrave;",	"&#224;",    "a", /* latin small letter a with grave */

   225     "&aacute;",	"&#225;",    "a", /* latin small letter a with acute */

   226     "&acirc;",	"&#226;",    "a", /* latin small letter a with circumflex */

   227     "&atilde;",	"&#227;",    "a", /* latin small letter a with tilde */

   228     "&auml;",	"&#228;",    "a", /* latin small letter a with diaeresis */

   229     "&aring;",	"&#229;",    "a", /* latin small letter a with ring above */

   230     "&aelig;",	"&#230;",    "ae", /* latin small letter ae */

   231     "&ccedil;",	"&#231;",    "c", /* latin small letter c with cedilla */

   232     "&egrave;",	"&#232;",    "e", /* latin small letter e with grave */

   233     "&eacute;",	"&#233;",    "e", /* latin small letter e with acute */

   234     "&ecirc;",	"&#234;",    "e", /* latin small letter e with circumflex */

   235     "&euml;",	"&#235;",    "e", /* latin small letter e with diaeresis */

   236     "&igrave;",	"&#236;",    "i", /* latin small letter i with grave */

   237     "&iacute;",	"&#237;",    "i", /* latin small letter i with acute */

   238     "&icirc;",	"&#238;",    "i", /* latin small letter i with circumflex */

   239     "&iuml;",	"&#239;",    "i", /* latin small letter i with diaeresis */

   240     "&eth;",	"&#240;",    "eth", /* latin small letter eth */

   241     "&ntilde;",	"&#241;",    "n", /* latin small letter n with tilde */

   242     "&ograve;",	"&#242;",    "o", /* latin small letter o with grave */

   243     "&oacute;",	"&#243;",    "o", /* latin small letter o with acute */

   244     "&ocirc;",	"&#244;",    "o", /* latin small letter o with circumflex */

   245     "&otilde;",	"&#245;",    "o", /* latin small letter o with tilde */

   246     "&ouml;",	"&#246;",    "o", /* latin small letter o with diaeresis */

   247     "&divide;",	"&#247;",    "/", /* division sign */

   248     "&oslash;",	"&#248;",    "o", /* latin small letter o with stroke */

   249     "&ugrave;",	"&#249;",    "u", /* latin small letter u with grave */

   250     "&uacute;",	"&#250;",    "u", /* latin small letter u with acute */

   251     "&ucirc;",	"&#251;",    "u", /* latin small letter u with circumflex */

   252     "&uuml;",	"&#252;",    "u", /* latin small letter u with diaeresis */

   253     "&yacute;",	"&#253;",    "y", /* latin small letter y with acute */

   254     "&thorn;",	"&#254;",    "th", /* latin small letter thorn */

   255     "&yuml;",	"&#255;",    "y", /* latin small letter y with diaeresis */

   256     "", ""

   257 };

   259 /* special characters */

   260 #define CHAR_SPACE        32

   261 #define CHAR_TAB           9

   262 #define CHAR_LF           10

   263 #define CHAR_CR           13

   264 #define CHAR_DQUOTE       34

   265 #define CHAR_SQUOTE       39

   266 #define CHAR_OPEN_SQUOTE  96

   267 #define CHAR_TILDE       126

   268 #define CHAR_ASTERISK     42

   269 #define CHAR_FORESLASH    47

   270 #define CHAR_CARAT        94

   272 #define CHAR_UNDERSCORE    '_'

   273 #define CHAR_OPEN_CBRACK   '{'

   274 #define CHAR_CLOSE_CBRACK  '}'

   275 #define CHAR_OPEN_RBRACK   '('

   276 #define CHAR_CLOSE_RBRACK  ')'

   277 #define CHAR_OPEN_SBRACK   '['

   278 #define CHAR_CLOSE_SBRACK  ']'

   280 /* longest and shortest normal PG line lengths */

   281 #define LONGEST_PG_LINE   75

   282 #define WAY_TOO_LONG      80

   283 #define SHORTEST_PG_LINE  55

   285 #define SWITCHES "ESTPXLOYHWVMUD" /* switches:-                            */

   286                                   /*     D - ignore DP-specific markup     */

   287                                   /*     E - echo queried line             */

   288                                   /*     S - check single quotes           */

   289                                   /*     T - check common typos            */

   290                                   /*     P - require closure of quotes on  */

   291                                   /*         every paragraph               */

   292                                   /*     X - "Trust no one" :-) Paranoid!  */

   293                                   /*         Queries everything            */

   294                                   /*     L - line end checking defaults on */

   295                                   /*         -L turns it off               */

   296                                   /*     O - overview. Just shows counts.  */

   297                                   /*     Y - puts errors to stdout         */

   298                                   /*         instead of stderr             */

   299                                   /*     H - Echoes header fields          */

   300                                   /*     M - Ignore markup in < >          */

   301                                   /*     U - Use file of User-defined Typos*/

   302                                   /*     W - Defaults for use on Web upload*/

   303                                   /*     V - Verbose - list EVERYTHING!    */

   304 #define SWITNO 14                 /* max number of switch parms            */

   305                                   /*        - used for defining array-size */

   306 #define MINARGS   1               /* minimum no of args excl switches      */

   307 #define MAXARGS   1               /* maximum no of args excl switches      */

   309 int pswit[SWITNO];                /* program switches set by SWITCHES      */

   311 #define ECHO_SWITCH      0

   312 #define SQUOTE_SWITCH    1

   313 #define TYPO_SWITCH      2

   314 #define QPARA_SWITCH     3

   315 #define PARANOID_SWITCH  4

   316 #define LINE_END_SWITCH  5

   317 #define OVERVIEW_SWITCH  6

   318 #define STDOUT_SWITCH    7

   319 #define HEADER_SWITCH    8

   320 #define WEB_SWITCH       9

   321 #define VERBOSE_SWITCH   10

   322 #define MARKUP_SWITCH    11

   323 #define USERTYPO_SWITCH  12

   324 #define DP_SWITCH        13

   326 long cnt_dquot;       /* for overview mode, count of doublequote queries */

   327 long cnt_squot;       /* for overview mode, count of singlequote queries */

   328 long cnt_brack;       /* for overview mode, count of brackets queries */

   329 long cnt_bin;         /* for overview mode, count of non-ASCII queries */

   330 long cnt_odd;         /* for overview mode, count of odd character queries */

   331 long cnt_long;        /* for overview mode, count of long line errors */

   332 long cnt_short;       /* for overview mode, count of short line queries */

   333 long cnt_punct;       /* for overview mode, count of punctuation and spacing queries */

   334 long cnt_dash;        /* for overview mode, count of dash-related queries */

   335 long cnt_word;        /* for overview mode, count of word queries */

   336 long cnt_html;        /* for overview mode, count of html queries */

   337 long cnt_lineend;     /* for overview mode, count of line-end queries */

   338 long cnt_spacend;     /* count of lines with space at end */

   339 long linecnt;         /* count of total lines in the file */

   340 long checked_linecnt; /* count of lines actually checked */

   342 void proghelp(void);

   343 void procfile(char *);

   345 #define LOW_THRESHOLD    0

   346 #define HIGH_THRESHOLD   1

   348 #define START 0

   349 #define END 1

   350 #define PREV 0

   351 #define NEXT 1

   352 #define FIRST_OF_PAIR 0

   353 #define SECOND_OF_PAIR 1

   355 #define MAX_WORDPAIR 1000

   357 char running_from[MAX_PATH];

   359 int mixdigit(char *);

   360 char *getaword(char *,char *);

   361 int matchword(char *,char *);

   362 char *flgets(char *,int,FILE *,long);

   363 void lowerit(char *);

   364 int gcisalpha(unsigned char);

   365 int gcisdigit(unsigned char);

   366 int gcisletter(unsigned char);

   367 char *gcstrchr(char *s,char c);

   368 void postprocess_for_HTML(char *);

   369 char *linehasmarkup(char *);

   370 char *losemarkup(char *);

   371 int tagcomp(char *,char *);

   372 char *loseentities(char *);

   373 int isroman(char *);

   374 int usertypo_count;

   375 void postprocess_for_DP(char *);

   377 char wrk[LINEBUFSIZE];

   379 #define MAX_QWORD 50

   380 #define MAX_QWORD_LENGTH 40

   381 char qword[MAX_QWORD][MAX_QWORD_LENGTH];

   382 signed int dupcnt[MAX_QWORD];

   384 int main(int argc,char **argv)

   385 {

   386     char *argsw,*s;

   387     int i,switno,invarg;

   388     char usertypo_file[MAX_PATH];

   389     FILE *usertypofile;

   390     if (strlen(argv[0])<sizeof(running_from))

   391 	/* save the path to the executable */

   392         strcpy(running_from,argv[0]);

   393     /* find out what directory we're running from */

   394     s=running_from+strlen(running_from);

   395     for (;*s!='/' && *s!='\\' && s>=running_from;s--)

   396         *s=0;

   397     switno=strlen(SWITCHES);

   398     for (i=switno;--i>0;)

   399         pswit[i]=0;           /* initialise switches */

   400     /*

   401      * Standard loop to extract switches.

   402      * When we come out of this loop, the arguments will be

   403      * in argv[0] upwards and the switches used will be

   404      * represented by their equivalent elements in pswit[]

   405      */

   406     while (--argc>0 && **++argv=='-')

   407         for (argsw=argv[0]+1;*argsw!='\0';argsw++)

   408             for (i=switno,invarg=1;(--i>=0) && invarg==1;)

   409                 if ((toupper(*argsw))==SWITCHES[i])

   410 		{

   411                     invarg=0;

   412                     pswit[i]=1;

   413 		}

   414     /* Paranoid checking is turned OFF, not on, by its switch */

   415     pswit[PARANOID_SWITCH]^=1;

   416     if (pswit[PARANOID_SWITCH])

   417 	/* if running in paranoid mode force typo checks as well   */

   418         pswit[TYPO_SWITCH]=pswit[TYPO_SWITCH]^1;

   419     /* Line-end checking is turned OFF, not on, by its switch */

   420     pswit[LINE_END_SWITCH]^=1;

   421     /* Echoing is turned OFF, not on, by its switch */

   422     pswit[ECHO_SWITCH]^=1;

   423     if (pswit[OVERVIEW_SWITCH])

   424 	/* just print summary; don't echo */

   425         pswit[ECHO_SWITCH]=0;

   426     /*

   427      * Web uploads - for the moment, this is really just a placeholder

   428      * until we decide what processing we really want to do on web uploads

   429      */

   430     if (pswit[WEB_SWITCH])

   431     {

   432 	/* specific override for web uploads */

   433         pswit[ECHO_SWITCH]=1;

   434         pswit[SQUOTE_SWITCH]=0;

   435         pswit[TYPO_SWITCH]=1;

   436         pswit[QPARA_SWITCH]=0;

   437         pswit[PARANOID_SWITCH]=1;

   438         pswit[LINE_END_SWITCH]=0;

   439         pswit[OVERVIEW_SWITCH]=0;

   440         pswit[STDOUT_SWITCH]=0;

   441         pswit[HEADER_SWITCH]=1;

   442         pswit[VERBOSE_SWITCH]=0;

   443         pswit[MARKUP_SWITCH]=0;

   444         pswit[USERTYPO_SWITCH]=0;

   445         pswit[DP_SWITCH]=0;

   446     }

   447     if (argc<MINARGS || argc>MAXARGS)

   448     {

   449 	/* check number of args */

   450         proghelp();

   451         return 1;

   452     }

   453     /* read in the user-defined stealth scanno list */

   454     if (pswit[USERTYPO_SWITCH])

   455     {

   456 	/* ... we were told we had one! */

   457         usertypofile=fopen(USERTYPO_FILE,"rb");

   458         if (!usertypofile)

   459 	{

   460 	    /* not in cwd. try excuteable directory. */

   461             strcpy(usertypo_file,running_from);

   462             strcat(usertypo_file,USERTYPO_FILE);

   463             usertypofile=fopen(usertypo_file,"rb");

   464             if (!usertypofile) {

   465 		/* we ain't got no user typo file! */

   466                 printf("   --> I couldn't find gutcheck.typ "

   467 		  "-- proceeding without user typos.\n");

   468 	    }

   469 	}

   470         usertypo_count=0;

   471         if (usertypofile)

   472 	{

   473 	    /* we managed to open a User Typo File! */

   474             if (pswit[USERTYPO_SWITCH])

   475 	    {

   476                 while (flgets(aline,LINEBUFSIZE-1,usertypofile,

   477 		  (long)usertypo_count))

   478 		{

   479                     if (strlen(aline)>1)

   480 		    {

   481                         if ((int)*aline>33)

   482 			{

   483                             s=malloc(strlen(aline)+1);

   484                             if (!s)

   485 			    {

   486                                 fprintf(stderr,"bookloupe: cannot get enough "

   487 				  "memory for user typo file!\n");

   488                                 exit(1);

   489 			    }

   490                             strcpy(s,aline);

   491                             usertypo[usertypo_count]=s;

   492                             usertypo_count++;

   493                             if (usertypo_count>=MAX_USER_TYPOS)

   494 			    {

   495                                 printf("   --> Only %d user-defined typos "

   496 				  "allowed: ignoring the rest\n",

   497 				  MAX_USER_TYPOS);

   498                                 break;

   499 			    }

   500 			}

   501 		    }

   502 		}

   503 	    }

   504             fclose(usertypofile);

   505 	}

   506     }

   507     fprintf(stderr,"bookloupe: Check and report on an e-text\n");

   508     cnt_dquot=cnt_squot=cnt_brack=cnt_bin=cnt_odd=cnt_long=

   509     cnt_short=cnt_punct=cnt_dash=cnt_word=cnt_html=cnt_lineend=

   510     cnt_spacend=0;

   511     procfile(argv[0]);

   512     if (pswit[OVERVIEW_SWITCH])

   513     {

   514 	printf("    Checked %ld lines of %ld (head+foot = %ld)\n\n",

   515 	  checked_linecnt,linecnt,linecnt-checked_linecnt);

   516         printf("    --------------- Queries found --------------\n");

   517         if (cnt_long)

   518 	    printf("    Long lines:                    %14ld\n",cnt_long);

   519         if (cnt_short)

   520 	    printf("    Short lines:                   %14ld\n",cnt_short);

   521         if (cnt_lineend)

   522 	    printf("    Line-end problems:             %14ld\n",cnt_lineend);

   523         if (cnt_word)

   524 	    printf("    Common typos:                  %14ld\n",cnt_word);

   525         if (cnt_dquot)

   526 	    printf("    Unmatched quotes:              %14ld\n",cnt_dquot);

   527         if (cnt_squot)

   528 	    printf("    Unmatched SingleQuotes:        %14ld\n",cnt_squot);

   529         if (cnt_brack)

   530 	    printf("    Unmatched brackets:            %14ld\n",cnt_brack);

   531         if (cnt_bin)

   532 	    printf("    Non-ASCII characters:          %14ld\n",cnt_bin);

   533         if (cnt_odd)

   534 	    printf("    Proofing characters:           %14ld\n",cnt_odd);

   535         if (cnt_punct)

   536 	    printf("    Punctuation & spacing queries: %14ld\n",cnt_punct);

   537         if (cnt_dash)

   538 	    printf("    Non-standard dashes:           %14ld\n",cnt_dash);

   539         if (cnt_html)

   540 	    printf("    Possible HTML tags:            %14ld\n",cnt_html);

   541         printf("\n");

   542         printf("    TOTAL QUERIES                  %14ld\n",

   543           cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+

   544           cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend);

   545     }

   546     return 0;

   547 }

   549 struct first_pass_results {

   550     long firstline,astline;

   551     long footerline,totlen,binlen,alphalen,endquote_count,shortline,dotcomma;

   552     long fslashline,hyphens,longline,verylongline,htmcount,standalone_digit;

   553     long spacedash,emdash,space_emdash,non_PG_space_emdash,PG_space_emdash;

   554     signed int Dutchcount,Frenchcount;

   555 };

   557 /*

   558  * first_pass:

   559  *

   560  * Run a first pass - verify that it's a valid PG

   561  * file, decide whether to report some things that

   562  * occur many times in the text like long or short

   563  * lines, non-standard dashes, etc.

   564  */

   565 struct first_pass_results *first_pass(FILE *infile)

   566 {

   567     char laststart=CHAR_SPACE,*s;

   568     signed int i,llen;

   569     unsigned int lastlen=0,lastblen=0;

   570     long spline=0,nspline=0;

   571     static struct first_pass_results results={0};

   572     char inword[MAXWORDLEN]="";

   573     while (fgets(aline,LINEBUFSIZE-1,infile))

   574     {

   575         while (aline[strlen(aline)-1]==10 || aline[strlen(aline)-1]==13)

   576 	    aline[strlen(aline)-1]=0;

   577         linecnt++;

   578         if (strstr(aline,"*END") && strstr(aline,"SMALL PRINT") &&

   579 	  (strstr(aline,"PUBLIC DOMAIN") || strstr(aline,"COPYRIGHT")))

   580 	{

   581             if (spline)

   582                 printf("   --> Duplicate header?\n");

   583             spline=linecnt+1;   /* first line of non-header text, that is */

   584 	}

   585         if (!strncmp(aline,"*** START",9) && strstr(aline,"PROJECT GUTENBERG"))

   586 	{

   587             if (nspline)

   588                 printf("   --> Duplicate header?\n");

   589             nspline=linecnt+1;   /* first line of non-header text, that is */

   590 	}

   591         if (spline || nspline)

   592 	{

   593             lowerit(aline);

   594             if (strstr(aline,"end") && strstr(aline,"project gutenberg"))

   595 	    {

   596                 if (strstr(aline,"end")<strstr(aline,"project gutenberg"))

   597 		{

   598                     if (results.footerline)

   599 		    {

   600 			/* it's an old-form header - we can detect duplicates */

   601                         if (!nspline)

   602                             printf("   --> Duplicate footer?\n");

   603 		    }

   604                     else

   605                         results.footerline=linecnt;

   606 		}

   607 	    }

   608 	}

   609         if (spline)

   610 	    results.firstline=spline;

   611         if (nspline)

   612 	    results.firstline=nspline;  /* override with new */

   613         if (results.footerline)

   614 	    continue;    /* don't count the boilerplate in the footer */

   615         llen=strlen(aline);

   616         results.totlen+=llen;

   617         for (i=0;i<llen;i++)

   618 	{

   619             if ((unsigned char)aline[i]>127)

   620 		results.binlen++;

   621             if (gcisalpha(aline[i]))

   622 		results.alphalen++;

   623             if (i>0 && aline[i]==CHAR_DQUOTE && isalpha(aline[i-1]))

   624 		results.endquote_count++;

   625 	}

   626         if (strlen(aline)>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE &&

   627 	  lastblen>2 && lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)

   628 	    results.shortline++;

   629         if (*aline && (unsigned char)aline[strlen(aline)-1]<=CHAR_SPACE)

   630 	    cnt_spacend++;

   631         if (strstr(aline,".,"))

   632 	    results.dotcomma++;

   633         /* only count ast lines for ignoring purposes where there is */

   634         /* locase text on the line */

   635         if (strstr(aline,"*"))

   636 	{

   637             for (s=aline;*s;s++)

   638                 if (*s>='a' && *s<='z')

   639                     break;

   640              if (*s)

   641 		results.astline++;

   642 	}

   643         if (strstr(aline,"/"))

   644             results.fslashline++;

   645         for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--)

   646 	    ;

   647         if (aline[i]=='-' && aline[i-1]!='-')

   648 	    results.hyphens++;

   649         if (llen>LONGEST_PG_LINE)

   650 	    results.longline++;

   651         if (llen>WAY_TOO_LONG)

   652 	    results.verylongline++;

   653         if (strstr(aline,"<") && strstr(aline,">"))

   654 	{

   655             i=(signed int)(strstr(aline,">")-strstr(aline,"<")+1);

   656             if (i>0)

   657                 results.htmcount++;

   658             if (strstr(aline,"<i>"))

   659 		results.htmcount+=4; /* bonus marks! */

   660 	}

   661         /* Check for spaced em-dashes */

   662         if (strstr(aline,"--"))

   663 	{

   664             results.emdash++;

   665             if (*(strstr(aline,"--")-1)==CHAR_SPACE ||

   666                (*(strstr(aline,"--")+2)==CHAR_SPACE))

   667 		results.space_emdash++;

   668             if (*(strstr(aline,"--")-1)==CHAR_SPACE &&

   669                (*(strstr(aline,"--")+2)==CHAR_SPACE))

   670 		/* count of em-dashes with spaces both sides */

   671 		results.non_PG_space_emdash++;

   672             if (*(strstr(aline,"--")-1)!=CHAR_SPACE &&

   673                (*(strstr(aline,"--")+2)!=CHAR_SPACE))

   674 		/* count of PG-type em-dashes with no spaces */

   675 		results.PG_space_emdash++;

   676 	}

   677         for (s=aline;*s;)

   678 	{

   679             s=getaword(s,inword);

   680             if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))

   681                 results.Dutchcount++;

   682             if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))

   683                 results.Frenchcount++;

   684             if (!strcmp(inword,"0") || !strcmp(inword,"1"))

   685                 results.standalone_digit++;

   686 	}

   687         /* Check for spaced dashes */

   688         if (strstr(aline," -") && *(strstr(aline," -")+2)!='-')

   689 	    results.spacedash++;

   690         lastblen=lastlen;

   691         lastlen=strlen(aline);

   692         laststart=aline[0];

   693     }

   694     return &results;

   695 }

   697 struct warnings {

   698     signed int shortline,longline,bin,dash,dotcomma,ast,fslash,digit,hyphen;

   699     signed int endquote,isDutch,isFrench;

   700 };

   702 /*

   703  * report_first_pass:

   704  *

   705  * Make some snap decisions based on the first pass results.

   706  */

   707 struct warnings *report_first_pass(struct first_pass_results *results)

   708 {

   709     static struct warnings warnings={0};

   710     if (cnt_spacend>0)

   711         printf("   --> %ld lines in this file have white space at end\n",

   712 	  cnt_spacend);

   713     warnings.dotcomma=1;

   714     if (results->dotcomma>5)

   715     {

   716         warnings.dotcomma=0;

   717         printf("   --> %ld lines in this file contain '.,'. "

   718 	  "Not reporting them.\n",results->dotcomma);

   719     }

   720     /*

   721      * If more than 50 lines, or one-tenth, are short,

   722      * don't bother reporting them.

   723      */

   724     warnings.shortline=1;

   725     if (results->shortline>50 || results->shortline*10>linecnt)

   726     {

   727         warnings.shortline=0;

   728         printf("   --> %ld lines in this file are short. "

   729 	  "Not reporting short lines.\n",results->shortline);

   730     }

   731     /*

   732      * If more than 50 lines, or one-tenth, are long,

   733      * don't bother reporting them.

   734      */

   735     warnings.longline=1;

   736     if (results->longline>50 || results->longline*10>linecnt)

   737     {

   738         warnings.longline=0;

   739         printf("   --> %ld lines in this file are long. "

   740 	  "Not reporting long lines.\n",results->longline);

   741     }

   742     /* If more than 10 lines contain asterisks, don't bother reporting them. */

   743     warnings.ast=1;

   744     if (results->astline>10)

   745     {

   746         warnings.ast=0;

   747         printf("   --> %ld lines in this file contain asterisks. "

   748 	  "Not reporting them.\n",results->astline);

   749     }

   750     /*

   751      * If more than 10 lines contain forward slashes,

   752      * don't bother reporting them.

   753      */

   754     warnings.fslash=1;

   755     if (results->fslashline>10)

   756     {

   757         warnings.fslash=0;

   758         printf("   --> %ld lines in this file contain forward slashes. "

   759 	  "Not reporting them.\n",results->fslashline);

   760     }

   761     /*

   762      * If more than 20 lines contain unpunctuated endquotes,

   763      * don't bother reporting them.

   764      */

   765     warnings.endquote=1;

   766     if (results->endquote_count>20)

   767     {

   768         warnings.endquote=0;

   769         printf("   --> %ld lines in this file contain unpunctuated endquotes. "

   770 	  "Not reporting them.\n",results->endquote_count);

   771     }

   772     /*

   773      * If more than 15 lines contain standalone digits,

   774      * don't bother reporting them.

   775      */

   776     warnings.digit=1;

   777     if (results->standalone_digit>10)

   778     {

   779         warnings.digit=0;

   780         printf("   --> %ld lines in this file contain standalone 0s and 1s. "

   781 	  "Not reporting them.\n",results->standalone_digit);

   782     }

   783     /*

   784      * If more than 20 lines contain hyphens at end,

   785      * don't bother reporting them.

   786      */

   787     warnings.hyphen=1;

   788     if (results->hyphens>20)

   789     {

   790         warnings.hyphen=0;

   791         printf("   --> %ld lines in this file have hyphens at end. "

   792 	  "Not reporting them.\n",results->hyphens);

   793     }

   794     if (results->htmcount>20 && !pswit[MARKUP_SWITCH])

   795     {

   796         printf("   --> Looks like this is HTML. Switching HTML mode ON.\n");

   797         pswit[MARKUP_SWITCH]=1;

   798     }

   799     if (results->verylongline>0)

   800         printf("   --> %ld lines in this file are VERY long!\n",

   801 	  results->verylongline);

   802     /*

   803      * If there are more non-PG spaced dashes than PG em-dashes,

   804      * assume it's deliberate.

   805      * Current PG guidelines say don't use them, but older texts do,

   806      * and some people insist on them whatever the guidelines say.

   807      */

   808     warnings.dash=1;

   809     if (results->spacedash+results->non_PG_space_emdash>

   810       results->PG_space_emdash)

   811     {

   812         warnings.dash=0;

   813         printf("   --> There are %ld spaced dashes and em-dashes. "

   814 	  "Not reporting them.\n",

   815 	  results->spacedash+results->non_PG_space_emdash);

   816     }

   817     /* If more than a quarter of characters are hi-bit, bug out. */

   818     warnings.bin=1;

   819     if (results->binlen*4>results->totlen)

   820     {

   821         printf("   --> This file does not appear to be ASCII. "

   822 	  "Terminating. Best of luck with it!\n");

   823         exit(1);

   824     }

   825     if (results->alphalen*4<results->totlen)

   826     {

   827         printf("   --> This file does not appear to be text. "

   828 	  "Terminating. Best of luck with it!\n");

   829         exit(1);

   830     }

   831     if (results->binlen*100>results->totlen || results->binlen>100)

   832     {

   833         printf("   --> There are a lot of foreign letters here. "

   834 	  "Not reporting them.\n");

   835         warnings.bin=0;

   836     }

   837     warnings.isDutch=0;

   838     if (results->Dutchcount>50)

   839     {

   840         warnings.isDutch=1;

   841         printf("   --> This looks like Dutch - "

   842 	  "switching off dashes and warnings for 's Middags case.\n");

   843     }

   844     warnings.isFrench=0;

   845     if (results->Frenchcount>50)

   846     {

   847         warnings.isFrench=1;

   848         printf("   --> This looks like French - "

   849 	  "switching off some doublepunct.\n");

   850     }

   851     if (results->firstline && results->footerline)

   852         printf("    The PG header and footer appear to be already on.\n");

   853     else

   854     {

   855         if (results->firstline)

   856             printf("    The PG header is on - no footer.\n");

   857         if (results->footerline)

   858             printf("    The PG footer is on - no header.\n");

   859     }

   860     printf("\n");

   861     if (pswit[VERBOSE_SWITCH])

   862     {

   863         warnings.bin=1;

   864         warnings.shortline=1;

   865         warnings.dotcomma=1;

   866         warnings.longline=1;

   867         warnings.dash=1;

   868         warnings.digit=1;

   869         warnings.ast=1;

   870         warnings.fslash=1;

   871         warnings.hyphen=1;

   872         warnings.endquote=1;

   873         printf("   *** Verbose output is ON -- you asked for it! ***\n");

   874     }

   875     if (warnings.isDutch)

   876         warnings.dash=0;

   877     if (results->footerline>0 && results->firstline>0 &&

   878       results->footerline>results->firstline &&

   879       results->footerline-results->firstline<100)

   880     {

   881         printf("   --> I don't really know where this text starts. \n");

   882         printf("       There are no reference points.\n");

   883         printf("       I'm going to have to report the header and footer "

   884 	  "as well.\n");

   885         results->firstline=0;

   886     }

   887     return &warnings;

   888 }

   890 struct counters {

   891     long quot;

   892     signed int c_unders,c_brack,s_brack,r_brack;

   893     signed int open_single_quote,close_single_quote;

   894 };

   896 /*

   897  * analyse_quotes:

   898  *

   899  * Look along the line, accumulate the count of quotes, and see

   900  * if this is an empty line - i.e. a line with nothing on it

   901  * but spaces.

   902  * If line has just spaces, period, * and/or - on it, don't

   903  * count it, since empty lines with asterisks or dashes to

   904  * separate sections are common.

   905  *

   906  * Returns: Non-zero if the line is empty.

   907  */

   908 int analyse_quotes(const char *s,struct counters *counters)

   909 {

   910     signed int guessquote=0;

   911     int isemptyline=1;    /* assume the line is empty until proven otherwise */

   912     while (*s)

   913     {

   914 	if (*s==CHAR_DQUOTE)

   915 	    counters->quot++;

   916 	if (*s==CHAR_SQUOTE || *s==CHAR_OPEN_SQUOTE)

   917 	{

   918 	    if (s==aline)

   919 	    {

   920 		/*

   921 		 * At start of line, it can only be an openquote.

   922 		 * Hardcode a very common exception!

   923 		 */

   924 		if (strncmp(s+2,"tis",3) && strncmp(s+2,"Tis",3))

   925 		    counters->open_single_quote++;

   926 	    }

   927 	    else if (gcisalpha(s[-1]) && gcisalpha(s[1]))

   928 		/* Do nothing! it's definitely an apostrophe, not a quote */

   929 		;

   930 	    /* it's outside a word - let's check it out */

   931 	    else if (*s==CHAR_OPEN_SQUOTE || gcisalpha(s[1]))

   932 	    {

   933 		/* it damwell better BE an openquote */

   934 		if (strncmp(s+1,"tis",3) && strncmp(s+1,"Tis",3))

   935 		    /* hardcode a very common exception! */

   936 		    counters->open_single_quote++;

   937 	    }

   938 	    else

   939 	    {

   940 		/* now - is it a closequote? */

   941 		guessquote=0;   /* accumulate clues */

   942 		if (gcisalpha(s[-1]))

   943 		{

   944 		    /* it follows a letter - could be either */

   945 		    guessquote++;

   946 		    if (s[-1]=='s')

   947 		    {

   948 			/* looks like a plural apostrophe */

   949 			guessquote-=3;

   950 			if (s[1]==CHAR_SPACE)  /* bonus marks! */

   951 			    guessquote-=2;

   952 		    }

   953 		}

   954 		/* it doesn't have a letter either side */

   955 		else if (strchr(".?!,;:",s[-1]) && strchr(".?!,;: ",s[1]))

   956 		    guessquote+=8; /* looks like a closequote */

   957 		else

   958 		    guessquote++;

   959 		if (counters->open_single_quote>counters->close_single_quote)

   960 		    /*

   961 		     * Give it the benefit of some doubt,

   962 		     * if a squote is already open.

   963 		     */

   964 		    guessquote++;

   965 		else

   966 		    guessquote--;

   967 		if (guessquote>=0)

   968 		    counters->close_single_quote++;

   969 	    }

   970 	}

   971 	if (*s!=CHAR_SPACE && *s!='-' && *s!='.' && *s!=CHAR_ASTERISK &&

   972 	  *s!=13 && *s!=10)

   973 	    isemptyline=0;  /* ignore lines like  *  *  *  as spacers */

   974 	if (*s==CHAR_UNDERSCORE)

   975 	    counters->c_unders++;

   976 	if (*s==CHAR_OPEN_CBRACK)

   977 	    counters->c_brack++;

   978 	if (*s==CHAR_CLOSE_CBRACK)

   979 	    counters->c_brack--;

   980 	if (*s==CHAR_OPEN_RBRACK)

   981 	    counters->r_brack++;

   982 	if (*s==CHAR_CLOSE_RBRACK)

   983 	    counters->r_brack--;

   984 	if (*s==CHAR_OPEN_SBRACK)

   985 	    counters->s_brack++;

   986 	if (*s==CHAR_CLOSE_SBRACK)

   987 	    counters->s_brack--;

   988 	s++;

   989     }

   990     return isemptyline;

   991 }

   993 /*

   994  * check_for_odd_characters:

   995  *

   996  * Check for binary and other odd characters.

   997  */

   998 void check_for_odd_characters(const char *aline,const struct warnings *warnings,

   999   int isemptyline)

  1000 {

  1001     /* Don't repeat multiple warnings on one line. */

  1002     signed int eNon_A=0,eTab=0,eTilde=0,eCarat=0,eFSlash=0,eAst=0;

  1003     const char *s;

  1004     unsigned char c;

  1005     for (s=aline;*s;s++)

  1006     {

  1007 	c=*(unsigned char *)s;

  1008 	if (!eNon_A && (*s<CHAR_SPACE && *s!=9 && *s!='\n' || c>127))

  1009 	{

  1010 	    if (pswit[ECHO_SWITCH])

  1011 		printf("\n%s\n",aline);

  1012 	    if (!pswit[OVERVIEW_SWITCH])

  1013 		if (c>127 && c<160)

  1014 		    printf("    Line %ld column %d - "

  1015 		      "Non-ISO-8859 character %d\n",linecnt,(int)(s-aline)+1,c);

  1016 		else

  1017 		    printf("    Line %ld column %d - Non-ASCII character %d\n",

  1018 		      linecnt,(int)(s-aline)+1,c);

  1019 	    else

  1020 		cnt_bin++;

  1021 	    eNon_A=1;

  1022 	}

  1023 	if (!eTab && *s==CHAR_TAB)

  1024 	{

  1025 	    if (pswit[ECHO_SWITCH])

  1026 		printf("\n%s\n",aline);

  1027 	    if (!pswit[OVERVIEW_SWITCH])

  1028 		printf("    Line %ld column %d - Tab character?\n",

  1029 		  linecnt,(int)(s-aline)+1);

  1030 	    else

  1031 		cnt_odd++;

  1032 	    eTab=1;

  1033 	}

  1034 	if (!eTilde && *s==CHAR_TILDE)

  1035 	{

  1036 	    /*

  1037 	     * Often used by OCR software to indicate an

  1038 	     * unrecognizable character.

  1039 	     */

  1040 	    if (pswit[ECHO_SWITCH])

  1041 		printf("\n%s\n",aline);

  1042 	    if (!pswit[OVERVIEW_SWITCH])

  1043 		printf("    Line %ld column %d - Tilde character?\n",

  1044 		  linecnt,(int)(s-aline)+1);

  1045 	    else

  1046 		cnt_odd++;

  1047 	    eTilde=1;

  1048 	}

  1049 	if (!eCarat && *s==CHAR_CARAT)

  1050 	{

  1051 	    if (pswit[ECHO_SWITCH])

  1052 		printf("\n%s\n",aline);

  1053 	    if (!pswit[OVERVIEW_SWITCH])

  1054 		printf("    Line %ld column %d - Carat character?\n",

  1055 		  linecnt,(int)(s-aline)+1);

  1056 	    else

  1057 		cnt_odd++;

  1058 	    eCarat=1;

  1059 	}

  1060 	if (!eFSlash && *s==CHAR_FORESLASH && warnings->fslash)

  1061 	{

  1062 	    if (pswit[ECHO_SWITCH])

  1063 		printf("\n%s\n",aline);

  1064 	    if (!pswit[OVERVIEW_SWITCH])

  1065 		printf("    Line %ld column %d - Forward slash?\n",

  1066 		  linecnt,(int)(s-aline)+1);

  1067 	    else

  1068 		cnt_odd++;

  1069 	    eFSlash=1;

  1070 	}

  1071 	/*

  1072 	 * Report asterisks only in paranoid mode,

  1073 	 * since they're often deliberate.

  1074 	 */

  1075 	if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&

  1076 	  *s==CHAR_ASTERISK)

  1077 	{

  1078 	    if (pswit[ECHO_SWITCH])

  1079 		printf("\n%s\n",aline);

  1080 	    if (!pswit[OVERVIEW_SWITCH])

  1081 		printf("    Line %ld column %d - Asterisk?\n",

  1082 		  linecnt,(int)(s-aline)+1);

  1083 	    else

  1084 		cnt_odd++;

  1085 	    eAst=1;

  1086 	}

  1087     }

  1088 }

  1090 /*

  1091  * check_for_long_line:

  1092  *

  1093  * Check for line too long.

  1094  */

  1095 void check_for_long_line(const char *aline)

  1096 {

  1097     if (strlen(aline)>LONGEST_PG_LINE)

  1098     {

  1099 	if (pswit[ECHO_SWITCH])

  1100 	    printf("\n%s\n",aline);

  1101 	if (!pswit[OVERVIEW_SWITCH])

  1102 	    printf("    Line %ld column %d - Long line %d\n",

  1103 	      linecnt,strlen(aline),strlen(aline));

  1104 	else

  1105 	    cnt_long++;

  1106     }

  1107 }

  1109 struct line_properties {

  1110     unsigned int len,blen;

  1111     char start;

  1112 };

  1114 /*

  1115  * check_for_short_line:

  1116  *

  1117  * Check for line too short.

  1118  *

  1119  * This one is a bit trickier to implement: we don't want to

  1120  * flag the last line of a paragraph for being short, so we

  1121  * have to wait until we know that our current line is a

  1122  * "normal" line, then report the _previous_ line if it was too

  1123  * short. We also don't want to report indented lines like

  1124  * chapter heads or formatted quotations. We therefore keep

  1125  * last->len as the length of the last line examined, and

  1126  * last->blen as the length of the last but one, and try to

  1127  * suppress unnecessary warnings by checking that both were of

  1128  * "normal" length. We keep the first character of the last

  1129  * line in last->start, and if it was a space, we assume that

  1130  * the formatting is deliberate. I can't figure out a way to

  1131  * distinguish something like a quoted verse left-aligned or

  1132  * the header or footer of a letter from a paragraph of short

  1133  * lines - maybe if I examined the whole paragraph, and if the

  1134  * para has less than, say, 8 lines and if all lines are short,

  1135  * then just assume it's OK? Need to look at some texts to see

  1136  * how often a formula like this would get the right result.

  1137  */

  1138 void check_for_short_line(const char *aline,const struct line_properties *last)

  1139 {

  1140     if (strlen(aline)>1 && last->len>1 && last->len<SHORTEST_PG_LINE &&

  1141       last->blen>1 && last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)

  1142     {

  1143 	if (pswit[ECHO_SWITCH])

  1144 	    printf("\n%s\n",prevline);

  1145 	if (!pswit[OVERVIEW_SWITCH])

  1146 	    printf("    Line %ld column %d - Short line %d?\n",

  1147 	      linecnt-1,strlen(prevline),strlen(prevline));

  1148 	else

  1149 	    cnt_short++;

  1150     }

  1151 }

  1153 /*

  1154  * check_for_starting_punctuation:

  1155  *

  1156  * Look for punctuation other than full ellipses at start of line.

  1157  */

  1158 void check_for_starting_punctuation(const char *aline)

  1159 {

  1160     if (*aline && strchr(".?!,;:",aline[0]) && strncmp(". . .",aline,5))

  1161     {

  1162 	if (pswit[ECHO_SWITCH])

  1163 	    printf("\n%s\n",aline);

  1164 	if (!pswit[OVERVIEW_SWITCH])

  1165 	    printf("    Line %ld column 1 - Begins with punctuation?\n",

  1166 	      linecnt);

  1167 	else

  1168 	    cnt_punct++;

  1169     }

  1170 }

  1172 /*

  1173  * check_for_spaced_emdash:

  1174  *

  1175  * Check for spaced em-dashes.

  1176  *

  1177  * We must check _all_ occurrences of "--" on the line

  1178  * hence the loop - even if the first double-dash is OK

  1179  * there may be another that's wrong later on.

  1180  */

  1181 void check_for_spaced_emdash(const char *aline)

  1182 {

  1183     const char *s,*t;

  1184     s=aline;

  1185     while ((t=strstr(s,"--")))

  1186     {

  1187 	if (t>aline && t[-1]==CHAR_SPACE || t[2]==CHAR_SPACE)

  1188 	{

  1189 	    if (pswit[ECHO_SWITCH])

  1190 		printf("\n%s\n",aline);

  1191 	    if (!pswit[OVERVIEW_SWITCH])

  1192 		printf("    Line %ld column %d - Spaced em-dash?\n",

  1193 		  linecnt,(int)(t-aline)+1);

  1194 	    else

  1195 		cnt_dash++;

  1196 	}

  1197 	s=t+2;

  1198     }

  1199 }

  1201 /*

  1202  * check_for_spaced_dash:

  1203  *

  1204  * Check for spaced dashes.

  1205  */

  1206 void check_for_spaced_dash(const char *aline)

  1207 {

  1208     const char *s;

  1209     if ((s=strstr(aline," -")))

  1210     {

  1211 	if (s[2]!='-')

  1212 	{

  1213 	    if (pswit[ECHO_SWITCH])

  1214 		printf("\n%s\n",aline);

  1215 	    if (!pswit[OVERVIEW_SWITCH])

  1216 		printf("    Line %ld column %d - Spaced dash?\n",

  1217 		  linecnt,(int)(s-aline)+1);

  1218 	    else

  1219 		cnt_dash++;

  1220 	}

  1221     }

  1222     else if ((s=strstr(aline,"- ")))

  1223     {

  1224 	if (s==aline || s[-1]!='-')

  1225 	{

  1226 	    if (pswit[ECHO_SWITCH])

  1227 		printf("\n%s\n",aline);

  1228 	    if (!pswit[OVERVIEW_SWITCH])

  1229 		printf("    Line %ld column %d - Spaced dash?\n",

  1230 		  linecnt,(int)(s-aline)+1);

  1231 	    else

  1232 		cnt_dash++;

  1233 	}

  1234     }

  1235 }

  1237 /*

  1238  * check_for_unmarked_paragraphs:

  1239  *

  1240  * Check for unmarked paragraphs indicated by separate speakers.

  1241  *

  1242  * May well be false positive:

  1243  * "Bravo!" "Wonderful!" called the crowd.

  1244  * but useful all the same.

  1245  */

  1246 void check_for_unmarked_paragraphs(const char *aline)

  1247 {

  1248     const char *s;

  1249     s=strstr(aline,"\"  \"");

  1250     if (!s)

  1251 	s=strstr(aline,"\" \"");

  1252     if (s)

  1253     {

  1254 	if (pswit[ECHO_SWITCH])

  1255 	    printf("\n%s\n",aline);

  1256 	if (!pswit[OVERVIEW_SWITCH])

  1257 	    printf("    Line %ld column %d - Query missing paragraph break?\n",

  1258 	      linecnt,(int)(s-aline)+1);

  1259 	else

  1260 	    cnt_punct++;

  1261     }

  1262 }

  1264 /*

  1265  * check_for_jeebies:

  1266  *

  1267  * Check for "to he" and other easy h/b errors.

  1268  *

  1269  * This is a very inadequate effort on the h/b problem,

  1270  * but the phrase "to he" is always an error, whereas "to

  1271  * be" is quite common.

  1272  * Similarly, '"Quiet!", be said.' is a non-be error

  1273  * "to he" is _not_ always an error!:

  1274  *       "Where they went to he couldn't say."

  1275  * Another false positive:

  1276  *       What would "Cinderella" be without the . . .

  1277  * and another: "If he wants to he can see for himself."

  1278  */

  1279 void check_for_jeebies(const char *aline)

  1280 {

  1281     const char *s;

  1282     s=strstr(aline," be could ");

  1283     if (!s)

  1284 	s=strstr(aline," be would ");

  1285     if (!s)

  1286 	s=strstr(aline," was be ");

  1287     if (!s)

  1288 	s=strstr(aline," be is ");

  1289     if (!s)

  1290 	s=strstr(aline," is be ");

  1291     if (!s)

  1292 	s=strstr(aline,"\", be ");

  1293     if (!s)

  1294 	s=strstr(aline,"\" be ");

  1295     if (!s)

  1296 	s=strstr(aline,"\" be ");

  1297     if (!s)

  1298 	s=strstr(aline," to he ");

  1299     if (s)

  1300     {

  1301 	if (pswit[ECHO_SWITCH])

  1302 	    printf("\n%s\n",aline);

  1303 	if (!pswit[OVERVIEW_SWITCH])

  1304 	    printf("    Line %ld column %d - Query he/be error?\n",

  1305 	      linecnt,(int)(s-aline)+1);

  1306 	else

  1307 	    cnt_word++;

  1308     }

  1309     s=strstr(aline," the had ");

  1310     if (!s)

  1311 	s=strstr(aline," a had ");

  1312     if (!s)

  1313 	s=strstr(aline," they bad ");

  1314     if (!s)

  1315 	s=strstr(aline," she bad ");

  1316     if (!s)

  1317 	s=strstr(aline," he bad ");

  1318     if (!s)

  1319 	s=strstr(aline," you bad ");

  1320     if (!s)

  1321 	s=strstr(aline," i bad ");

  1322     if (s)

  1323     {

  1324 	if (pswit[ECHO_SWITCH])

  1325 	    printf("\n%s\n",aline);

  1326 	if (!pswit[OVERVIEW_SWITCH])

  1327 	    printf("    Line %ld column %d - Query had/bad error?\n",

  1328 	      linecnt,(int)(s-aline)+1);

  1329 	else

  1330 	    cnt_word++;

  1331     }

  1332     s=strstr(aline,"; hut ");

  1333     if (!s)

  1334 	s=strstr(aline,", hut ");

  1335     if (s)

  1336     {

  1337 	if (pswit[ECHO_SWITCH])

  1338 	    printf("\n%s\n",aline);

  1339 	if (!pswit[OVERVIEW_SWITCH])

  1340 	    printf("    Line %ld column %d - Query hut/but error?\n",

  1341 	      linecnt,(int)(s-aline)+1);

  1342 	else

  1343 	    cnt_word++;

  1344     }

  1345 }

  1347 /*

  1348  * check_for_mta_from:

  1349  *

  1350  * Special case - angled bracket in front of "From" placed there by an

  1351  * MTA when sending an e-mail.

  1352  */

  1353 void check_for_mta_from(const char *aline)

  1354 {

  1355     const char *s;

  1356     s=strstr(aline,">From");

  1357     if (s)

  1358     {

  1359 	if (pswit[ECHO_SWITCH])

  1360 	    printf("\n%s\n",aline);

  1361 	if (!pswit[OVERVIEW_SWITCH])

  1362 	    printf("    Line %ld column %d - Query angled bracket with From\n",

  1363 	      linecnt,(int)(s-aline)+1);

  1364 	else

  1365 	    cnt_punct++;

  1366     }

  1367 }

  1369 /*

  1370  * check_for_orphan_character:

  1371  *

  1372  * Check for a single character line -

  1373  * often an overflow from bad wrapping.

  1374  */

  1375 void check_for_orphan_character(const char *aline)

  1376 {

  1377     if (*aline && !aline[1])

  1378     {

  1379 	if (*aline=='I' || *aline=='V' || *aline=='X' || *aline=='L' ||

  1380 	  gcisdigit(*aline))

  1381 	    ; /* Nothing - ignore numerals alone on a line. */

  1382 	else

  1383 	{

  1384 	    if (pswit[ECHO_SWITCH])

  1385 		printf("\n%s\n",aline);

  1386 	    if (!pswit[OVERVIEW_SWITCH])

  1387 		printf("    Line %ld column 1 - Query single character line\n",

  1388 		  linecnt);

  1389 	    else

  1390 		cnt_punct++;

  1391 	}

  1392     }

  1393 }

  1395 /*

  1396  * check_for_pling_scanno:

  1397  *

  1398  * Check for I" - often should be !

  1399  */

  1400 void check_for_pling_scanno(const char *aline)

  1401 {

  1402     const char *s;

  1403     s=strstr(aline," I\"");

  1404     if (s)

  1405     {

  1406 	if (pswit[ECHO_SWITCH])

  1407 	    printf("\n%s\n",aline);

  1408 	if (!pswit[OVERVIEW_SWITCH])

  1409 	    printf("    Line %ld column %ld - Query I=exclamation mark?\n",

  1410 	      linecnt,s-aline);

  1411 	else

  1412 	    cnt_punct++;

  1413     }

  1414 }

  1416 /*

  1417  * check_for_extra_period:

  1418  *

  1419  * Check for period without a capital letter. Cut-down from gutspell.

  1420  * Only works when it happens on a single line.

  1421  */

  1422 void check_for_extra_period(const char *aline,const struct warnings *warnings)

  1423 {

  1424     const char *s,*t,*s1;

  1425     signed int i,istypo,isdup;

  1426     static char qperiod[MAX_QWORD][MAX_QWORD_LENGTH];

  1427     static int qperiod_index=0;

  1428     char testword[MAXWORDLEN]="";

  1429     if (pswit[PARANOID_SWITCH])

  1430     {

  1431 	for (t=s=aline;strstr(t,". ");)

  1432 	{

  1433 	    t=strstr(t,". ");

  1434 	    if (t==s)

  1435 	    {

  1436 		t++;

  1437 		/* start of line punctuation is handled elsewhere */

  1438 		continue;

  1439 	    }

  1440 	    if (!gcisalpha(t[-1]))

  1441 	    {

  1442 		t++;

  1443 		continue;

  1444 	    }

  1445 	    if (warnings->isDutch)

  1446 	    {

  1447 		/* For Frank & Jeroen -- 's Middags case */

  1448 		if (t[2]==CHAR_SQUOTE && t[3]>='a' && t[3]<='z' &&

  1449 		  t[4]==CHAR_SPACE && t[5]>='A' && t[5]<='Z')

  1450 		{

  1451 		    t++;

  1452 		    continue;

  1453 		}

  1454 	    }

  1455 	    s1=t+2;

  1456 	    while (*s1 && !gcisalpha(*s1) && !isdigit(*s1))

  1457 		s1++;

  1458 	    if (*s1>='a' && *s1<='z')

  1459 	    {

  1460 		/* we have something to investigate */

  1461 		istypo=1;

  1462 		/* so let's go back and find out */

  1463 		for (s1=t-1;s1>=s &&

  1464 		  (gcisalpha(*s1) || gcisdigit(*s1) || *s1==CHAR_SQUOTE &&

  1465 		  gcisalpha(s1[1]) && gcisalpha(s1[-1]));s1--)

  1466 		    ;

  1467 		s1++;

  1468 		for (i=0;*s1 && *s1!='.';s1++,i++)

  1469 		    testword[i]=*s1;

  1470 		testword[i]=0;

  1471 		for (i=0;*abbrev[i];i++)

  1472 		    if (!strcmp(testword,abbrev[i]))

  1473 			istypo=0;

  1474 		if (gcisdigit(*testword))

  1475 		    istypo=0;

  1476 		if (!testword[1])

  1477 		    istypo=0;

  1478 		if (isroman(testword))

  1479 		    istypo=0;

  1480 		if (istypo)

  1481 		{

  1482 		    istypo=0;

  1483 		    for (i=0;testword[i];i++)

  1484 			if (strchr(vowels,testword[i]))

  1485 			    istypo=1;

  1486 		}

  1487 		if (istypo)

  1488 		{

  1489 		    isdup=0;

  1490 		    if (strlen(testword)<MAX_QWORD_LENGTH &&

  1491 		      !pswit[VERBOSE_SWITCH])

  1492 			for (i=0;i<qperiod_index;i++)

  1493 			    if (!strcmp(testword,qperiod[i]))

  1494 				isdup=1;

  1495 		    if (!isdup)

  1496 		    {

  1497 			if (qperiod_index<MAX_QWORD &&

  1498 			  strlen(testword)<MAX_QWORD_LENGTH)

  1499 			{

  1500 			    strcpy(qperiod[qperiod_index],testword);

  1501 			    qperiod_index++;

  1502 			}

  1503 			if (pswit[ECHO_SWITCH])

  1504 			    printf("\n%s\n",aline);

  1505 			if (!pswit[OVERVIEW_SWITCH])

  1506 			    printf("    Line %ld column %d - Extra period?\n",

  1507 			      linecnt,(int)(t-aline)+1);

  1508 			else

  1509 			    cnt_punct++;

  1510 		    }

  1511 		}

  1512 	    }

  1513 	    t++;

  1514 	}

  1515     }

  1516 }

  1518 /*

  1519  * procfile:

  1520  *

  1521  * Process one file.

  1522  */

  1523 void procfile(char *filename)

  1524 {

  1525     char *s,*t,*wordstart;

  1526     char inword[MAXWORDLEN],testword[MAXWORDLEN];

  1527     char parastart[81];     /* first line of current para */

  1528     FILE *infile;

  1529     struct first_pass_results *first_pass_results;

  1530     struct warnings *warnings;

  1531     struct counters counters={0};

  1532     struct line_properties last={0};

  1533     int isemptyline;

  1534     long squot,start_para_line;

  1535     signed int i,llen,isacro,isellipsis,istypo,alower;

  1536     signed int dquotepar,squotepar;

  1537     signed int isnewpara,vowel,consonant;

  1538     char dquote_err[80],squote_err[80],rbrack_err[80],sbrack_err[80],

  1539       cbrack_err[80],unders_err[80];

  1540     signed int qword_index,isdup;

  1541     signed int enddash;

  1542     last.start=CHAR_SPACE;

  1543     *dquote_err=*squote_err=*rbrack_err=*cbrack_err=*sbrack_err=

  1544       *unders_err=*prevline=0;

  1545     linecnt=checked_linecnt=start_para_line=0;

  1546     squot=0;

  1547     i=llen=isacro=isellipsis=0;

  1548     isnewpara=vowel=consonant=enddash=0;

  1549     qword_index=0;

  1550     *inword=*testword=0;

  1551     dquotepar=squotepar=0;

  1552     infile=fopen(filename,"rb");

  1553     if (!infile)

  1554     {

  1555         if (pswit[STDOUT_SWITCH])

  1556             fprintf(stdout,"bookloupe: cannot open %s\n",filename);

  1557         else

  1558             fprintf(stderr,"bookloupe: cannot open %s\n",filename);

  1559 	exit(1);

  1560     }

  1561     fprintf(stdout,"\n\nFile: %s\n\n",filename);

  1562     first_pass_results=first_pass(infile);

  1563     warnings=report_first_pass(first_pass_results);

  1564     rewind(infile);

  1565     /*

  1566      * Here we go with the main pass. Hold onto yer hat!

  1567      * Re-init some variables we've dirtied.

  1568      */

  1569     squot=linecnt=0;

  1570     while (flgets(aline,LINEBUFSIZE-1,infile,linecnt+1))

  1571     {

  1572         linecnt++;

  1573         if (linecnt==1)

  1574 	    isnewpara=1;

  1575         if (pswit[DP_SWITCH] && !strncmp(aline,"-----File: ",11))

  1576 	    continue;    // skip DP page separators completely

  1577         if (linecnt<first_pass_results->firstline ||

  1578 	  (first_pass_results->footerline>0 &&

  1579 	  linecnt>first_pass_results->footerline))

  1580 	{

  1581             if (pswit[HEADER_SWITCH])

  1582 	    {

  1583                 if (!strncmp(aline,"Title:",6))

  1584                     printf("    %s\n",aline);

  1585                 if (!strncmp(aline,"Author:",7))

  1586                     printf("    %s\n",aline);

  1587                 if (!strncmp(aline,"Release Date:",13))

  1588                     printf("    %s\n",aline);

  1589                 if (!strncmp(aline,"Edition:",8))

  1590                     printf("    %s\n\n",aline);

  1591 	    }

  1592             continue;                /* skip through the header */

  1593 	}

  1594         checked_linecnt++;

  1595         s=aline;

  1596         /*

  1597 	 * If we are in a state of unbalanced quotes, and this line

  1598          * doesn't begin with a quote, output the stored error message.

  1599          * If the -P switch was used, print the warning even if the

  1600          * new para starts with quotes.

  1601 	 */

  1602         t=s;

  1603         while (*t==' ')

  1604 	    t++;

  1605         if (*dquote_err)

  1606             if (*t!=CHAR_DQUOTE || pswit[QPARA_SWITCH])

  1607 	    {

  1608                 if (!pswit[OVERVIEW_SWITCH])

  1609 		{

  1610                     if (pswit[ECHO_SWITCH])

  1611 			printf("\n%s\n",parastart);

  1612                     printf(dquote_err);

  1613 		}

  1614                 else

  1615                     cnt_dquot++;

  1616             }

  1617         if (*squote_err)

  1618 	{

  1619             if (*t!=CHAR_SQUOTE && *t!=CHAR_OPEN_SQUOTE ||

  1620 	      pswit[QPARA_SWITCH] || squot)

  1621 	    {

  1622                 if (!pswit[OVERVIEW_SWITCH])

  1623 		{

  1624                     if (pswit[ECHO_SWITCH])

  1625 			printf("\n%s\n",parastart);

  1626                     printf(squote_err);

  1627 		}

  1628                 else

  1629                     cnt_squot++;

  1630 	    }

  1631             squot=0;

  1632 	}

  1633         if (*rbrack_err)

  1634 	{

  1635             if (!pswit[OVERVIEW_SWITCH])

  1636 	    {

  1637                 if (pswit[ECHO_SWITCH])

  1638 		    printf("\n%s\n",parastart);

  1639                 printf(rbrack_err);

  1640 	    }

  1641             else

  1642                 cnt_brack++;

  1643 	}

  1644         if (*sbrack_err)

  1645 	{

  1646             if (!pswit[OVERVIEW_SWITCH])

  1647 	    {

  1648                 if (pswit[ECHO_SWITCH])

  1649 		    printf("\n%s\n",parastart);

  1650                 printf(sbrack_err);

  1651 	    }

  1652             else

  1653                 cnt_brack++;

  1654 	}

  1655         if (*cbrack_err)

  1656 	{

  1657             if (!pswit[OVERVIEW_SWITCH])

  1658 	    {

  1659                 if (pswit[ECHO_SWITCH])

  1660 		    printf("\n%s\n",parastart);

  1661                 printf(cbrack_err);

  1662 	    }

  1663             else

  1664                 cnt_brack++;

  1665 	}

  1666         if (*unders_err)

  1667 	{

  1668             if (!pswit[OVERVIEW_SWITCH])

  1669 	    {

  1670                 if (pswit[ECHO_SWITCH])

  1671 		    printf("\n%s\n",parastart);

  1672                 printf(unders_err);

  1673 	    }

  1674             else

  1675                 cnt_brack++;

  1676 	}

  1677         *dquote_err=*squote_err=*rbrack_err=*cbrack_err=

  1678 	  *sbrack_err=*unders_err=0;

  1679 	isemptyline=analyse_quotes(aline,&counters);

  1680         if (isnewpara && !isemptyline)

  1681 	{

  1682 	    /* This line is the start of a new paragraph. */

  1683             start_para_line=linecnt;

  1684 	    /* Capture its first line in case we want to report it later. */

  1685             strncpy(parastart,aline,80);

  1686             parastart[79]=0;

  1687             dquotepar=squotepar=0; /* restart the quote count */

  1688             s=aline;

  1689             while (!gcisalpha(*s) && !gcisdigit(*s) && *s)

  1690 		s++;

  1691             if (*s>='a' && *s<='z')

  1692 	    {

  1693 		/* and its first letter is lowercase */

  1694                 if (pswit[ECHO_SWITCH])

  1695 		    printf("\n%s\n",aline);

  1696                 if (!pswit[OVERVIEW_SWITCH])

  1697                     printf("    Line %ld column %d - "

  1698 		      "Paragraph starts with lower-case\n",

  1699 		      linecnt,(int)(s-aline)+1);

  1700                 else

  1701                     cnt_punct++;

  1702 	    }

  1703             isnewpara=0; /* Signal the end of new para processing. */

  1704 	}

  1705         /* Check for an em-dash broken at line end. */

  1706         if (enddash && *aline=='-')

  1707 	{

  1708             if (pswit[ECHO_SWITCH])

  1709 		printf("\n%s\n",aline);

  1710             if (!pswit[OVERVIEW_SWITCH])

  1711                 printf("    Line %ld column 1 - Broken em-dash?\n",linecnt);

  1712             else

  1713                 cnt_punct++;

  1714 	}

  1715         enddash=0;

  1716         for (s=aline+strlen(aline)-1;*s==' ' && s>aline;s--)

  1717 	    ;

  1718         if (s>=aline && *s=='-')

  1719             enddash=1;

  1720 	/*

  1721          * Check for invalid or questionable characters in the line

  1722          * Anything above 127 is invalid for plain ASCII, and

  1723          * non-printable control characters should also be flagged.

  1724          * Tabs should generally not be there.

  1725 	 */

  1726         for (s=aline;*s;s++)

  1727 	{

  1728             i=(unsigned char)*s;

  1729             if (i<CHAR_SPACE && i!=CHAR_LF && i!=CHAR_CR && i!=CHAR_TAB)

  1730 	    {

  1731                 if (pswit[ECHO_SWITCH])

  1732 		    printf("\n%s\n",aline);

  1733                 if (!pswit[OVERVIEW_SWITCH])

  1734                     printf("    Line %ld column %d - Control character %d\n",

  1735 		      linecnt,(int)(s-aline)+1,i);

  1736                 else

  1737                     cnt_bin++;

  1738 	    }

  1739 	}

  1740         if (warnings->bin)

  1741 	    check_for_odd_characters(aline,warnings,isemptyline);

  1742         if (warnings->longline)

  1743 	    check_for_long_line(aline);

  1744         if (warnings->shortline)

  1745 	    check_for_short_line(aline,&last);

  1746         last.blen=last.len;

  1747         last.len=strlen(aline);

  1748         last.start=aline[0];

  1749 	check_for_starting_punctuation(aline);

  1750         if (warnings->dash)

  1751 	{

  1752 	    check_for_spaced_emdash(aline);

  1753 	    check_for_spaced_dash(aline);

  1754 	}

  1755 	check_for_unmarked_paragraphs(aline);

  1756 	check_for_jeebies(aline);

  1757 	check_for_mta_from(aline);

  1758 	check_for_orphan_character(aline);

  1759 	check_for_pling_scanno(aline);

  1760 	check_for_extra_period(aline,warnings);

  1761         if (pswit[TYPO_SWITCH])

  1762 	{

  1763             /* Check for words usually not followed by punctuation. */

  1764             for (s=aline;*s;)

  1765 	    {

  1766                 wordstart=s;

  1767                 s=getaword(s,inword);

  1768                 if (!*inword)

  1769 		    continue;

  1770                 lowerit(inword);

  1771                 for (i=0;*nocomma[i];i++)

  1772                     if (!strcmp(inword,nocomma[i]))

  1773 		    {

  1774                         if (*s==',' || *s==';' || *s==':')

  1775 			{

  1776                             if (pswit[ECHO_SWITCH])

  1777 				printf("\n%s\n",aline);

  1778                             if (!pswit[OVERVIEW_SWITCH])

  1779                                 printf("    Line %ld column %d - "

  1780 				  "Query punctuation after %s?\n",

  1781 				  linecnt,(int)(s-aline)+1,inword);

  1782                             else

  1783                                 cnt_punct++;

  1784 			}

  1785 		    }

  1786 		for (i=0;*noperiod[i];i++)

  1787                     if (!strcmp(inword,noperiod[i]))

  1788 		    {

  1789                         if (*s=='.' || *s=='!')

  1790 			{

  1791                             if (pswit[ECHO_SWITCH])

  1792 				printf("\n%s\n",aline);

  1793                             if (!pswit[OVERVIEW_SWITCH])

  1794                                 printf("    Line %ld column %d - "

  1795 				  "Query punctuation after %s?\n",

  1796 				  linecnt,(int)(s-aline)+1,inword);

  1797                             else

  1798                                 cnt_punct++;

  1799 			}

  1800 		    }

  1801 	    }

  1802 	}

  1803         /*

  1804 	 * Check for commonly mistyped words,

  1805 	 * and digits like 0 for O in a word.

  1806 	 */

  1807         for (s=aline;*s;)

  1808 	{

  1809             wordstart=s;

  1810             s=getaword(s,inword);

  1811             if (!*inword)

  1812 		continue; /* don't bother with empty lines */

  1813             if (mixdigit(inword))

  1814 	    {

  1815                 if (pswit[ECHO_SWITCH])

  1816 		    printf("\n%s\n",aline);

  1817                 if (!pswit[OVERVIEW_SWITCH])

  1818                     printf("    Line %ld column %d - Query digit in %s\n",

  1819 		      linecnt,(int)(wordstart-aline)+1,inword);

  1820                 else

  1821                     cnt_word++;

  1822 	    }

  1823             /*

  1824 	     * Put the word through a series of tests for likely typos and OCR

  1825 	     * errors.

  1826 	     */

  1827             if (pswit[TYPO_SWITCH])

  1828 	    {

  1829                 istypo=0;

  1830                 strcpy(testword,inword);

  1831                 alower=0;

  1832                 for (i=0;i<(signed int)strlen(testword);i++)

  1833 		{

  1834 		    /* lowercase for testing */

  1835                     if (testword[i]>='a' && testword[i]<='z')

  1836 			alower=1;

  1837                     if (alower && testword[i]>='A' && testword[i]<='Z')

  1838 		    {

  1839                         /*

  1840 			 * We have an uppercase mid-word. However, there are

  1841 			 * common cases:

  1842                          *   Mac and Mc like McGill

  1843                          *   French contractions like l'Abbe

  1844 			 */

  1845                         if (i==2 && testword[0]=='m' && testword[1]=='c' ||

  1846                           i==3 && testword[0]=='m' && testword[1]=='a' &&

  1847 			  testword[2]=='c' || i>0 && testword[i-1]==CHAR_SQUOTE)

  1848 			    ; /* do nothing! */

  1849                         else

  1850                             istypo=1;

  1851 		    }

  1852                     testword[i]=(char)tolower(testword[i]);

  1853 		}

  1854                 /*

  1855 		 * Check for certain unlikely two-letter combinations at word

  1856 		 * start and end.

  1857 		 */

  1858                 if (strlen(testword)>1)

  1859 		{

  1860                     for (i=0;*nostart[i];i++)

  1861                         if (!strncmp(testword,nostart[i],2))

  1862                             istypo=1;

  1863                     for (i=0;*noend[i];i++)

  1864                         if (!strncmp(testword+strlen(testword)-2,noend[i],2))

  1865                             istypo=1;

  1866 		}

  1867                 /* ght is common, gbt never. Like that. */

  1868                 if (strstr(testword,"cb"))

  1869 		    istypo=1;

  1870                 if (strstr(testword,"gbt"))

  1871 		    istypo=1;

  1872                 if (strstr(testword,"pbt"))

  1873 		    istypo=1;

  1874                 if (strstr(testword,"tbs"))

  1875 		    istypo=1;

  1876                 if (strstr(testword,"mrn"))

  1877 		    istypo=1;

  1878                 if (strstr(testword,"ahle"))

  1879 		    istypo=1;

  1880                 if (strstr(testword,"ihle"))

  1881 		    istypo=1;

  1882                 /*

  1883 		 * "TBE" does happen - like HEARTBEAT - but uncommon.

  1884                  * Also "TBI" - frostbite, outbid - but uncommon.

  1885                  * Similarly "ii" like Hawaii, or Pompeii, and in Roman

  1886 		 * numerals, but "ii" is a common scanno.

  1887 		 */

  1888                 if (strstr(testword,"tbi"))

  1889 		    istypo=1;

  1890                 if (strstr(testword,"tbe"))

  1891 		    istypo=1;

  1892                 if (strstr(testword,"ii"))

  1893 		    istypo=1;

  1894                 /*

  1895 		 * Check for no vowels or no consonants.

  1896                  * If none, flag a typo.

  1897 		 */

  1898                 if (!istypo && strlen(testword)>1)

  1899 		{

  1900                     vowel=consonant=0;

  1901                     for (i=0;testword[i];i++)

  1902 		    {

  1903                         if (testword[i]=='y' || gcisdigit(testword[i]))

  1904 			{

  1905 			    /* Yah, this is loose. */

  1906                             vowel++;

  1907                             consonant++;

  1908 			}

  1909                         else if (strchr(vowels,testword[i]))

  1910 			    vowel++;

  1911 			else

  1912 			    consonant++;

  1913 		    }

  1914                     if (!vowel || !consonant)

  1915                         istypo=1;

  1916 		}

  1917                 /*

  1918 		 * Now exclude the word from being reported if it's in

  1919                  * the okword list.

  1920 		 */

  1921                 for (i=0;*okword[i];i++)

  1922                     if (!strcmp(testword,okword[i]))

  1923                         istypo=0;

  1924                 /*

  1925 		 * What looks like a typo may be a Roman numeral.

  1926 		 * Exclude these.

  1927 		 */

  1928                 if (istypo && isroman(testword))

  1929 		    istypo=0;

  1930                 /* Check the manual list of typos. */

  1931                 if (!istypo)

  1932                     for (i=0;*typo[i];i++)

  1933                         if (!strcmp(testword,typo[i]))

  1934                             istypo=1;

  1935                 /*

  1936 		 * Check lowercase s, l, i and m - special cases.

  1937                  *   "j" - often a semi-colon gone wrong.

  1938                  *   "d" for a missing apostrophe - he d

  1939                  *   "n" for "in"

  1940 		 */

  1941                 if (!istypo && strlen(testword)==1 && strchr("slmijdn",*inword))

  1942 		    istypo=1;

  1943                 if (istypo)

  1944 		{

  1945                     isdup=0;

  1946                     if (strlen(testword)<MAX_QWORD_LENGTH &&

  1947 		      !pswit[VERBOSE_SWITCH])

  1948                         for (i=0;i<qword_index;i++)

  1949                             if (!strcmp(testword,qword[i]))

  1950 			    {

  1951                                 isdup=1;

  1952                                 ++dupcnt[i];

  1953 			    }

  1954                     if (!isdup)

  1955 		    {

  1956                         if (qword_index<MAX_QWORD &&

  1957 			  strlen(testword)<MAX_QWORD_LENGTH)

  1958 			{

  1959                             strcpy(qword[qword_index],testword);

  1960                             qword_index++;

  1961 			}

  1962                         if (pswit[ECHO_SWITCH])

  1963 			    printf("\n%s\n",aline);

  1964                         if (!pswit[OVERVIEW_SWITCH])

  1965 			{

  1966                             printf("    Line %ld column %d - Query word %s",

  1967 			      linecnt,(int)(wordstart-aline)+1,inword);

  1968                             if (strlen(testword)<MAX_QWORD_LENGTH &&

  1969 			      !pswit[VERBOSE_SWITCH])

  1970                                 printf(" - not reporting duplicates");

  1971                             printf("\n");

  1972 			}

  1973                         else

  1974                             cnt_word++;

  1975 		    }

  1976 		}

  1977 	    }

  1978 	    /* check the user's list of typos */

  1979 	    if (!istypo && usertypo_count)

  1980 		for (i=0;i<usertypo_count;i++)

  1981 		    if (!strcmp(testword,usertypo[i]))

  1982 		    {

  1983 			if (pswit[ECHO_SWITCH])

  1984 			    printf("\n%s\n",aline);

  1985 			if (!pswit[OVERVIEW_SWITCH])

  1986 			    printf("    Line %ld column %d - "

  1987 			      "Query possible scanno %s\n",

  1988 			      linecnt,(int)(wordstart-aline)+2,inword);

  1989 		    }

  1990             if (pswit[PARANOID_SWITCH] && warnings->digit)

  1991 	    {

  1992 		/* In paranoid mode, query all 0 and 1 standing alone. */

  1993                 if (!strcmp(inword,"0") || !strcmp(inword,"1"))

  1994 		{

  1995                     if (pswit[ECHO_SWITCH])

  1996 			printf("\n%s\n",aline);

  1997                     if (!pswit[OVERVIEW_SWITCH])

  1998                         printf("    Line %ld column %d - Query standalone %s\n",

  1999 			  linecnt,(int)(wordstart-aline)+2,inword);

  2000                     else

  2001                         cnt_word++;

  2002 		}

  2003 	    }

  2004 	}

  2005 	/*

  2006          * Look for added or missing spaces around punctuation and quotes.

  2007          * If there is a punctuation character like ! with no space on

  2008          * either side, suspect a missing!space. If there are spaces on

  2009          * both sides , assume a typo. If we see a double quote with no

  2010          * space or punctuation on either side of it, assume unspaced

  2011          * quotes "like"this.

  2012 	 */

  2013         llen=strlen(aline);

  2014         for (i=1;i<llen;i++)

  2015 	{

  2016 	    /* For each character in the line after the first. */

  2017             if (strchr(".?!,;:_",aline[i]))  /* if it's punctuation */

  2018 	    {

  2019 		/* we need to suppress warnings for acronyms like M.D. */

  2020                 isacro=0;

  2021 		/* we need to suppress warnings for ellipsis . . . */

  2022                 isellipsis=0;

  2023 		/* if there are letters on both sides of it or ... */

  2024                 if (gcisalpha(aline[i-1]) && gcisalpha(aline[i+1]) ||

  2025                    gcisalpha(aline[i+1]) && strchr("?!,;:",aline[i]))

  2026 		{

  2027 		    /* ...if it's strict punctuation followed by an alpha */

  2028                     if (aline[i]=='.')

  2029 		    {

  2030                         if (i>2 && aline[i-2]=='.')

  2031 			    isacro=1;

  2032                         if (i+2<llen && aline[i+2]=='.')

  2033 			    isacro=1;

  2034 		    }

  2035                     if (!isacro)

  2036 		    {

  2037                         if (pswit[ECHO_SWITCH])

  2038 			    printf("\n%s\n",aline);

  2039                         if (!pswit[OVERVIEW_SWITCH])

  2040                             printf("    Line %ld column %d - Missing space?\n",

  2041 			      linecnt,i+1);

  2042                         else

  2043                             cnt_punct++;

  2044 		    }

  2045 		}

  2046                 if (aline[i-1]==CHAR_SPACE &&

  2047 		  (aline[i+1]==CHAR_SPACE || aline[i+1]==0))

  2048 		{

  2049 		    /*

  2050 		     * If there are spaces on both sides,

  2051 		     * or space before and end of line.

  2052 		     */

  2053                     if (aline[i]=='.')

  2054 		    {

  2055                         if (i>2 && aline[i-2]=='.')

  2056 			    isellipsis=1;

  2057                         if (i+2<llen && aline[i+2]=='.')

  2058 			    isellipsis=1;

  2059 		    }

  2060                     if (!isemptyline && !isellipsis)

  2061 		    {

  2062                         if (pswit[ECHO_SWITCH])

  2063 			    printf("\n%s\n",aline);

  2064                         if (!pswit[OVERVIEW_SWITCH])

  2065                             printf("    Line %ld column %d - "

  2066 			      "Spaced punctuation?\n",linecnt,i+1);

  2067                         else

  2068                             cnt_punct++;

  2069 		    }

  2070 		}

  2071 	    }

  2072 	}

  2073         /* Split out the characters that CANNOT be preceded by space. */

  2074         llen=strlen(aline);

  2075         for (i=1;i<llen;i++)

  2076 	{

  2077 	    /* for each character in the line after the first */

  2078             if (strchr("?!,;:",aline[i]))

  2079 	    {

  2080 		/* if it's punctuation that _cannot_ have a space before it */

  2081                 if (aline[i-1]==CHAR_SPACE && !isemptyline &&

  2082 		  aline[i+1]!=CHAR_SPACE)

  2083 		{

  2084 		    /*

  2085 		     * If aline[i+1] DOES == space,

  2086 		     * it was already reported just above.

  2087 		     */

  2088                     if (pswit[ECHO_SWITCH])

  2089 			printf("\n%s\n",aline);

  2090                     if (!pswit[OVERVIEW_SWITCH])

  2091                         printf("    Line %ld column %d - Spaced punctuation?\n",

  2092 			  linecnt,i+1);

  2093                     else

  2094                         cnt_punct++;

  2095 		}

  2096 	    }

  2097 	}

  2098         /*

  2099 	 * Special case " .X" where X is any alpha.

  2100          * This plugs a hole in the acronym code above.

  2101 	 * Inelegant, but maintainable.

  2102 	 */

  2103         llen=strlen(aline);

  2104         for (i=1;i<llen;i++)

  2105 	{

  2106 	    /* for each character in the line after the first */

  2107             if (aline[i]=='.')

  2108 	    {

  2109 		/* if it's a period */

  2110                 if (aline[i-1]==CHAR_SPACE && gcisalpha(aline[i+1]))

  2111 		{

  2112 		    /*

  2113 		     * If the period follows a space and

  2114 		     * is followed by a letter.

  2115 		     */

  2116                     if (pswit[ECHO_SWITCH])

  2117 			printf("\n%s\n",aline);

  2118                     if (!pswit[OVERVIEW_SWITCH])

  2119                         printf("    Line %ld column %d - Spaced punctuation?\n",

  2120 			  linecnt,i+1);

  2121                     else

  2122                         cnt_punct++;

  2123 		}

  2124 	    }

  2125 	}

  2126         for (i=1;i<llen;i++)

  2127 	{

  2128 	    /* for each character in the line after the first */

  2129             if (aline[i]==CHAR_DQUOTE)

  2130 	    {

  2131                 if (!strchr(" _-.'`,;:!/([{?}])",aline[i-1]) &&

  2132 		  !strchr(" _-.'`,;:!/([{?}])",aline[i+1]) && aline[i+1] ||

  2133 		  !strchr(" _-([{'`",aline[i-1]) && gcisalpha(aline[i+1]))

  2134 		{

  2135 		    if (pswit[ECHO_SWITCH])

  2136 			printf("\n%s\n",aline);

  2137 		    if (!pswit[OVERVIEW_SWITCH])

  2138 			printf("    Line %ld column %d - Unspaced quotes?\n",

  2139 			  linecnt,i+1);

  2140 		    else

  2141 			cnt_punct++;

  2142 		}

  2143 	    }

  2144 	}

  2145         /* Check parity of quotes. */

  2146         for (s=aline;*s;s++)

  2147 	{

  2148             if (*s==CHAR_DQUOTE)

  2149 	    {

  2150                 if (!(dquotepar=!dquotepar))

  2151 		{

  2152 		    /* parity even */

  2153                     if (!strchr("_-.'`/,;:!?)]} ",s[1]))

  2154 		    {

  2155                         if (pswit[ECHO_SWITCH])

  2156 			    printf("\n%s\n",aline);

  2157                         if (!pswit[OVERVIEW_SWITCH])

  2158                             printf("    Line %ld column %d - "

  2159 			      "Wrongspaced quotes?\n",linecnt,(int)(s-aline)+1);

  2160                         else

  2161                             cnt_punct++;

  2162 		    }

  2163 		}

  2164                 else

  2165 		{

  2166 		    /* parity odd */

  2167                     if (!gcisalpha(s[1]) && !isdigit(s[1]) &&

  2168 		      !strchr("_-/.'`([{$",s[1]) || !s[1])

  2169 		    {

  2170                         if (pswit[ECHO_SWITCH])

  2171 			    printf("\n%s\n",aline);

  2172                         if (!pswit[OVERVIEW_SWITCH])

  2173                             printf("    Line %ld column %d - "

  2174 			      "Wrongspaced quotes?\n",linecnt,(int)(s-aline)+1);

  2175                         else

  2176                             cnt_punct++;

  2177 		    }

  2178 		}

  2179 	    }

  2180 	}

  2181 	if (*aline==CHAR_DQUOTE)

  2182 	{

  2183 	    if (strchr(",;:!?)]} ",aline[1]))

  2184 	    {

  2185 		if (pswit[ECHO_SWITCH])

  2186 		    printf("\n%s\n",aline);

  2187 		if (!pswit[OVERVIEW_SWITCH])

  2188 		    printf("    Line %ld column 1 - Wrongspaced quotes?\n",

  2189 		      linecnt);

  2190 		else

  2191 		    cnt_punct++;

  2192 	    }

  2193 	}

  2194         if (pswit[SQUOTE_SWITCH])

  2195 	{

  2196             for (s=aline;*s;s++)

  2197 	    {

  2198                 if ((*s==CHAR_SQUOTE || *s==CHAR_OPEN_SQUOTE) &&

  2199 		  (s==aline || s>aline && !gcisalpha(s[-1]) ||

  2200 		  !gcisalpha(s[1])))

  2201 		{

  2202                     if (!(squotepar=!squotepar))

  2203 		    {

  2204 			/* parity even */

  2205                         if (!strchr("_-.'`/\",;:!?)]} ",s[1]))

  2206 			{

  2207                             if (pswit[ECHO_SWITCH])

  2208 				printf("\n%s\n",aline);

  2209                             if (!pswit[OVERVIEW_SWITCH])

  2210                                 printf("    Line %ld column %d - "

  2211 				  "Wrongspaced singlequotes?\n",

  2212 				  linecnt,(int)(s-aline)+1);

  2213                             else

  2214                                 cnt_punct++;

  2215 			}

  2216 		    }

  2217                     else

  2218 		    {

  2219 			/* parity odd */

  2220                         if (!gcisalpha(s[1]) && !isdigit(s[1]) &&

  2221 			  !strchr("_-/\".'`",s[1]) || !s[1])

  2222 			{

  2223                             if (pswit[ECHO_SWITCH])

  2224 				printf("\n%s\n",aline);

  2225                             if (!pswit[OVERVIEW_SWITCH])

  2226                                 printf("    Line %ld column %d - "

  2227 				  "Wrongspaced singlequotes?\n",

  2228 				  linecnt,(int)(s-aline)+1);

  2229                             else

  2230                                 cnt_punct++;

  2231 			}

  2232 		    }

  2233 		}

  2234 	    }

  2235 	}

  2236         /*

  2237 	 * Look for double punctuation like ,. or ,,

  2238          * Thanks to DW for the suggestion!

  2239          * In books with references, ".," and ".;" are common

  2240          * e.g. "etc., etc.," and vol. 1.; vol 3.;

  2241          * OTOH, from my initial tests, there are also fairly

  2242          * common errors. What to do? Make these cases paranoid?

  2243          * ".," is the most common, so warnings->dotcomma is used

  2244          * to suppress detailed reporting if it occurs often.

  2245 	 */

  2246         llen=strlen(aline);

  2247         for (i=0;i<llen;i++)

  2248 	{

  2249 	    /* for each punctuation character in the line */

  2250             if (strchr(".?!,;:",aline[i]) && (strchr(".?!,;:",aline[i+1])) &&

  2251 	      aline[i] && aline[i+1])

  2252 	    {

  2253 		/* followed by punctuation, it's a query, unless . . . */

  2254                 if (aline[i]==aline[i+1] && (aline[i]=='.' || aline[i]=='?' ||

  2255 		  aline[i]=='!') ||

  2256 		  !warnings->dotcomma && aline[i]=='.' && aline[i+1]==',' ||

  2257 		  warnings->isFrench && !strncmp(aline+i,",...",4) ||

  2258 		  warnings->isFrench && !strncmp(aline+i,"...,",4) ||

  2259 		  warnings->isFrench && !strncmp(aline+i,";...",4) ||

  2260 		  warnings->isFrench && !strncmp(aline+i,"...;",4) ||

  2261 		  warnings->isFrench && !strncmp(aline+i,":...",4) ||

  2262 		  warnings->isFrench && !strncmp(aline+i,"...:",4) ||

  2263 		  warnings->isFrench && !strncmp(aline+i,"!...",4) ||

  2264 		  warnings->isFrench && !strncmp(aline+i,"...!",4) ||

  2265 		  warnings->isFrench && !strncmp(aline+i,"?...",4) ||

  2266 		  warnings->isFrench && !strncmp(aline+i,"...?",4))

  2267 		{

  2268 		    if (warnings->isFrench && !strncmp(aline+i,",...",4) ||

  2269 		      warnings->isFrench && !strncmp(aline+i,"...,",4) ||

  2270 		      warnings->isFrench && !strncmp(aline+i,";...",4) ||

  2271 		      warnings->isFrench && !strncmp(aline+i,"...;",4) ||

  2272 		      warnings->isFrench && !strncmp(aline+i,":...",4) ||

  2273 		      warnings->isFrench && !strncmp(aline+i,"...:",4) ||

  2274 		      warnings->isFrench && !strncmp(aline+i,"!...",4) ||

  2275 		      warnings->isFrench && !strncmp(aline+i,"...!",4) ||

  2276 		      warnings->isFrench && !strncmp(aline+i,"?...",4) ||

  2277 		      warnings->isFrench && !strncmp(aline+i,"...?",4))

  2278 			i+=4;

  2279 		    ; /* do nothing for .. !! and ?? which can be legit */

  2280 		}

  2281                 else

  2282 		{

  2283                     if (pswit[ECHO_SWITCH])

  2284 			printf("\n%s\n",aline);

  2285                     if (!pswit[OVERVIEW_SWITCH])

  2286                         printf("    Line %ld column %d - Double punctuation?\n",

  2287 			  linecnt,i+1);

  2288                     else

  2289                         cnt_punct++;

  2290 		}

  2291 	    }

  2292 	}

  2293         s=aline;

  2294         while (strstr(s," \" "))

  2295 	{

  2296             if (pswit[ECHO_SWITCH])

  2297 		printf("\n%s\n",aline);

  2298             if (!pswit[OVERVIEW_SWITCH])

  2299                 printf("    Line %ld column %d - Spaced doublequote?\n",

  2300 		  linecnt,(int)(strstr(s," \" ")-aline+1));

  2301             else

  2302                 cnt_punct++;

  2303             s=strstr(s," \" ")+2;

  2304 	}

  2305         s=aline;

  2306         while (strstr(s," ' "))

  2307 	{

  2308             if (pswit[ECHO_SWITCH])

  2309 		printf("\n%s\n",aline);

  2310             if (!pswit[OVERVIEW_SWITCH])

  2311                 printf("    Line %ld column %d - Spaced singlequote?\n",

  2312 		  linecnt,(int)(strstr(s," ' ")-aline+1));

  2313             else

  2314                 cnt_punct++;

  2315             s=strstr(s," ' ")+2;

  2316 	}

  2317         s=aline;

  2318         while (strstr(s," ` "))

  2319 	{

  2320             if (pswit[ECHO_SWITCH])

  2321 		printf("\n%s\n",aline);

  2322             if (!pswit[OVERVIEW_SWITCH])

  2323                 printf("    Line %ld column %d - Spaced singlequote?\n",

  2324 		  linecnt,(int)(strstr(s," ` ")-aline+1));

  2325             else

  2326                 cnt_punct++;

  2327             s=strstr(s," ` ")+2;

  2328 	}

  2329         /* check special case of 'S instead of 's at end of word */

  2330         s=aline+1;

  2331         while (*s)

  2332 	{

  2333             if (*s==CHAR_SQUOTE && s[1]=='S' && s[-1]>='a' && s[-1]<='z')

  2334 	    {

  2335                 if (pswit[ECHO_SWITCH])

  2336 		    printf("\n%s\n",aline);

  2337                 if (!pswit[OVERVIEW_SWITCH])

  2338                     printf("    Line %ld column %d - Capital \"S\"?\n",

  2339 		      linecnt,(int)(s-aline+2));

  2340                 else

  2341                     cnt_punct++;

  2342 	    }

  2343             s++;

  2344 	}

  2345         /*

  2346 	 * Now check special cases - start and end of line -

  2347          * for single and double quotes. Start is sometimes [sic]

  2348          * but better to query it anyway.

  2349          * While we're here, check for dash at end of line.

  2350 	 */

  2351         llen=strlen(aline);

  2352         if (llen>1)

  2353 	{

  2354             if (aline[llen-1]==CHAR_DQUOTE || aline[llen-1]==CHAR_SQUOTE ||

  2355 	      aline[llen-1]==CHAR_OPEN_SQUOTE)

  2356                 if (aline[llen-2]==CHAR_SPACE)

  2357 		{

  2358                     if (pswit[ECHO_SWITCH])

  2359 			printf("\n%s\n",aline);

  2360                     if (!pswit[OVERVIEW_SWITCH])

  2361                         printf("    Line %ld column %d - Spaced quote?\n",

  2362 			  linecnt,llen);

  2363                     else

  2364                         cnt_punct++;

  2365 		}

  2366             if ((aline[0]==CHAR_SQUOTE || aline[0]==CHAR_OPEN_SQUOTE) &&

  2367 	      aline[1]==CHAR_SPACE)

  2368 	    {

  2369 		if (pswit[ECHO_SWITCH])

  2370 		    printf("\n%s\n",aline);

  2371 		if (!pswit[OVERVIEW_SWITCH])

  2372 		    printf("    Line %ld column 1 - Spaced quote?\n",linecnt);

  2373 		else

  2374 		    cnt_punct++;

  2375 	    }

  2376             /*

  2377 	     * Dash at end of line may well be legit - paranoid mode only

  2378              * and don't report em-dash at line-end.

  2379 	     */

  2380             if (pswit[PARANOID_SWITCH] && warnings->hyphen)

  2381 	    {

  2382                 for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--)

  2383 		    ;

  2384                 if (aline[i]=='-' && aline[i-1]!='-')

  2385 		{

  2386                     if (pswit[ECHO_SWITCH])

  2387 			printf("\n%s\n",aline);

  2388                     if (!pswit[OVERVIEW_SWITCH])

  2389                         printf("    Line %ld column %d - "

  2390 			  "Hyphen at end of line?\n",linecnt,i);

  2391 		}

  2392 	    }

  2393 	}

  2394         /*

  2395 	 * Brackets are often unspaced, but shouldn't be surrounded by alpha.

  2396          * If so, suspect a scanno like "a]most".

  2397 	 */

  2398         llen=strlen(aline);

  2399         for (i=1;i<llen-1;i++)

  2400 	{

  2401 	    /* for each bracket character in the line except 1st & last */

  2402             if (strchr("{[()]}",aline[i]) && gcisalpha(aline[i-1]) &&

  2403 	      gcisalpha(aline[i+1]))

  2404 	    {

  2405                 if (pswit[ECHO_SWITCH])

  2406 		    printf("\n%s\n",aline);

  2407                 if (!pswit[OVERVIEW_SWITCH])

  2408                     printf("    Line %ld column %d - Unspaced bracket?\n",

  2409 		      linecnt,i);

  2410                 else

  2411                     cnt_punct++;

  2412 	    }

  2413 	}

  2414         llen=strlen(aline);

  2415         if (warnings->endquote)

  2416 	{

  2417             for (i=1;i<llen;i++)

  2418 	    {

  2419 		/* for each character in the line except 1st */

  2420                 if (aline[i]==CHAR_DQUOTE && isalpha(aline[i-1]))

  2421 		{

  2422 		    if (pswit[ECHO_SWITCH])

  2423 			printf("\n%s\n",aline);

  2424 		    if (!pswit[OVERVIEW_SWITCH])

  2425 			printf("    Line %ld column %d - "

  2426 			  "endquote missing punctuation?\n",linecnt,i);

  2427 		    else

  2428 			cnt_punct++;

  2429 		}

  2430 	    }

  2431 	}

  2432 	/*

  2433          * Check for <HTML TAG>.

  2434          * If there is a < in the line, followed at some point

  2435          * by a > then we suspect HTML.

  2436 	 */

  2437         if (strstr(aline,"<") && strstr(aline,">"))

  2438 	{

  2439             i=(signed int)(strstr(aline,">")-strstr(aline,"<")+1);

  2440             if (i>0)

  2441 	    {

  2442                 strncpy(wrk,strstr(aline,"<"),i);

  2443                 wrk[i]=0;

  2444                 if (pswit[ECHO_SWITCH])

  2445 		    printf("\n%s\n",aline);

  2446                 if (!pswit[OVERVIEW_SWITCH])

  2447                     printf("    Line %ld column %d - HTML Tag? %s \n",

  2448 		      linecnt,(int)(strstr(aline,"<")-aline)+1,wrk);

  2449                 else

  2450                     cnt_html++;

  2451 	    }

  2452 	}

  2453         /*

  2454 	 * Check for &symbol; HTML.

  2455          * If there is a & in the line, followed at

  2456          * some point by a ; then we suspect HTML.

  2457 	 */

  2458         if (strstr(aline,"&") && strstr(aline,";"))

  2459 	{

  2460             i=(int)(strstr(aline,";")-strstr(aline,"&")+1);

  2461             for (s=strstr(aline,"&");s<strstr(aline,";");s++)

  2462                 if (*s==CHAR_SPACE)

  2463 		    i=0;                /* Don't report "Jones & Son;" */

  2464             if (i>0)

  2465 	    {

  2466                 strncpy(wrk,strstr(aline,"&"),i);

  2467                 wrk[i]=0;

  2468                 if (pswit[ECHO_SWITCH])

  2469 		    printf("\n%s\n",aline);

  2470                 if (!pswit[OVERVIEW_SWITCH])

  2471                     printf("    Line %ld column %d - HTML symbol? %s \n",

  2472 		      linecnt,(int)(strstr(aline,"&")-aline)+1,wrk);

  2473                 else

  2474                     cnt_html++;

  2475 	    }

  2476 	}

  2477         /*

  2478 	 * At end of paragraph, check for mismatched quotes.

  2479          * We don't want to report an error immediately, since it is a

  2480          * common convention to omit the quotes at end of paragraph if

  2481          * the next paragraph is a continuation of the same speaker.

  2482          * Where this is the case, the next para should begin with a

  2483          * quote, so we store the warning message and only display it

  2484          * at the top of the next iteration if the new para doesn't

  2485          * start with a quote.

  2486          * The -p switch overrides this default, and warns of unclosed

  2487          * quotes on _every_ paragraph, whether the next begins with a

  2488          * quote or not.

  2489 	 */

  2490         if (isemptyline)

  2491 	{

  2492 	    /* end of para - add up the totals */

  2493             if (counters.quot%2)

  2494                 sprintf(dquote_err,"    Line %ld - Mismatched quotes\n",

  2495 		  linecnt);

  2496             if (pswit[SQUOTE_SWITCH] && counters.open_single_quote &&

  2497 	      counters.open_single_quote!=counters.close_single_quote)

  2498                 sprintf(squote_err,"    Line %ld - Mismatched singlequotes?\n",

  2499 		  linecnt);

  2500             if (pswit[SQUOTE_SWITCH] && counters.open_single_quote &&

  2501 	      counters.open_single_quote!=counters.close_single_quote &&

  2502 	      counters.open_single_quote!=counters.close_single_quote+1)

  2503 		/*

  2504 		 * Flag it to be noted regardless of the

  2505 		 * first char of the next para.

  2506 		 */

  2507                 squot=1;

  2508             if (counters.r_brack)

  2509                 sprintf(rbrack_err,"    Line %ld - "

  2510 		  "Mismatched round brackets?\n",linecnt);

  2511             if (counters.s_brack)

  2512                 sprintf(sbrack_err,"    Line %ld - "

  2513 		  "Mismatched square brackets?\n",linecnt);

  2514             if (counters.c_brack)

  2515                 sprintf(cbrack_err,"    Line %ld - "

  2516 		  "Mismatched curly brackets?\n",linecnt);

  2517             if (counters.c_unders%2)

  2518                 sprintf(unders_err,"    Line %ld - Mismatched underscores?\n",

  2519 		  linecnt);

  2520 	    memset(&counters,0,sizeof(counters));

  2521 	    /* let the next iteration know that it's starting a new para */

  2522             isnewpara=1;

  2523 	}

  2524         /*

  2525 	 * Check for omitted punctuation at end of paragraph by working back

  2526 	 * through prevline. DW.

  2527          * Need to check this only for "normal" paras.

  2528          * So what is a "normal" para?

  2529          *    Not normal if one-liner (chapter headings, etc.)

  2530          *    Not normal if doesn't contain at least one locase letter

  2531          *    Not normal if starts with space

  2532 	 */

  2533         if (isemptyline)

  2534 	{

  2535 	    /* end of para */

  2536             for (s=prevline,i=0;*s && !i;s++)

  2537                 if (gcisletter(*s))

  2538 		    /* use i to indicate the presence of a letter on the line */

  2539                     i=1;

  2540             /*

  2541 	     * This next "if" is a problem.

  2542              * If we say "start_para_line <= linecnt - 1", that includes

  2543 	     * one-line "paragraphs" like chapter heads. Lotsa false positives.

  2544              * If we say "start_para_line < linecnt - 1" it doesn't, but then it

  2545              * misses genuine one-line paragraphs.

  2546 	     */

  2547             if (i && last.blen>2 && start_para_line<linecnt-1 &&

  2548 	      *prevline>CHAR_SPACE)

  2549 	    {

  2550                 for (i=strlen(prevline)-1;

  2551 		  (prevline[i]==CHAR_DQUOTE || prevline[i]==CHAR_SQUOTE) &&

  2552 		  prevline[i]>CHAR_SPACE && i>0;

  2553 		  i--)

  2554 		    ;

  2555                 for (;i>0;i--)

  2556 		{

  2557                     if (gcisalpha(prevline[i]))

  2558 		    {

  2559                         if (pswit[ECHO_SWITCH])

  2560 			    printf("\n%s\n",prevline);

  2561                         if (!pswit[OVERVIEW_SWITCH])

  2562                             printf("    Line %ld column %d - "

  2563 			      "No punctuation at para end?\n",

  2564 			      linecnt-1,strlen(prevline));

  2565                         else

  2566                             cnt_punct++;

  2567                         break;

  2568 		    }

  2569                     if (strchr("-.:!([{?}])",prevline[i]))

  2570                         break;

  2571 		}

  2572 	    }

  2573 	}

  2574         strcpy(prevline,aline);

  2575     }

  2576     fclose(infile);

  2577     if (!pswit[OVERVIEW_SWITCH])

  2578         for (i=0;i<MAX_QWORD;i++)

  2579             if (dupcnt[i])

  2580                 printf("\nNote: Queried word %s was duplicated %d time%s\n",

  2581 		  qword[i],dupcnt[i],"s");

  2582 }

  2584 /*

  2585  * flgets:

  2586  *

  2587  * Get one line from the input stream, checking for

  2588  * the existence of exactly one CR/LF line-end per line.

  2589  *

  2590  * Returns: a pointer to the line.

  2591  */

  2592 char *flgets(char *theline,int maxlen,FILE *thefile,long lcnt)

  2593 {

  2594     char c;

  2595     int len,isCR,cint;

  2596     *theline=0;

  2597     len=isCR=0;

  2598     c=cint=fgetc(thefile);

  2599     do

  2600     {

  2601         if (cint==EOF)

  2602             return NULL;

  2603 	/* either way, it's end of line */

  2604         if (c==10)

  2605 	{

  2606             if (isCR)

  2607                 break;

  2608             else

  2609 	    {

  2610 		/* Error - a LF without a preceding CR */

  2611                 if (pswit[LINE_END_SWITCH])

  2612 		{

  2613                     if (pswit[ECHO_SWITCH])

  2614 			printf("\n%s\n",theline);

  2615                     if (!pswit[OVERVIEW_SWITCH])

  2616                         printf("    Line %ld - No CR?\n",lcnt);

  2617                     else

  2618                         cnt_lineend++;

  2619 		}

  2620                 break;

  2621 	    }

  2622 	}

  2623         if (c==13)

  2624 	{

  2625             if (isCR)

  2626 	    {

  2627 		/* Error - two successive CRs */

  2628                 if (pswit[LINE_END_SWITCH])

  2629 		{

  2630                     if (pswit[ECHO_SWITCH])

  2631 			printf("\n%s\n",theline);

  2632                     if (!pswit[OVERVIEW_SWITCH])

  2633                         printf("    Line %ld - Two successive CRs?\n",lcnt);

  2634                     else

  2635                         cnt_lineend++;

  2636 		}

  2637 	    }

  2638             isCR=1;

  2639 	}

  2640         else

  2641 	{

  2642             if (pswit[LINE_END_SWITCH] && isCR)

  2643 	    {

  2644                 if (pswit[ECHO_SWITCH])

  2645 		    printf("\n%s\n",theline);

  2646                 if (!pswit[OVERVIEW_SWITCH])

  2647                     printf("    Line %ld column %d - CR without LF?\n",

  2648 		      lcnt,len+1);

  2649                 else

  2650                     cnt_lineend++;

  2651 	    }

  2652             theline[len]=c;

  2653             len++;

  2654             theline[len]=0;

  2655             isCR=0;

  2656 	}

  2657         c=cint=fgetc(thefile);

  2658     } while(len<maxlen);

  2659     if (pswit[MARKUP_SWITCH])

  2660         postprocess_for_HTML(theline);

  2661     if (pswit[DP_SWITCH])

  2662         postprocess_for_DP(theline);

  2663     return theline;

  2664 }

  2666 /*

  2667  * mixdigit:

  2668  *

  2669  * Takes a "word" as a parameter, and checks whether it

  2670  * contains a mixture of alpha and digits. Generally, this is an

  2671  * error, but may not be for cases like 4th or L5 12s. 3d.

  2672  *

  2673  * Returns: 0 if no error found, 1 if error.

  2674  */

  2675 int mixdigit(char *checkword)

  2676 {

  2677     int wehaveadigit,wehavealetter,firstdigits,query,wl;

  2678     char *s;

  2679     wehaveadigit=wehavealetter=query=0;

  2680     for (s=checkword;*s;s++)

  2681         if (gcisalpha(*s))

  2682             wehavealetter=1;

  2683         else

  2684             if (gcisdigit(*s))

  2685                 wehaveadigit=1;

  2686     if (wehaveadigit && wehavealetter)

  2687     {

  2688 	/* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */

  2689         query=1;

  2690         wl=strlen(checkword);

  2691         for (firstdigits=0;gcisdigit(checkword[firstdigits]);firstdigits++)

  2692             ;

  2693         /* digits, ending in st, rd, nd, th of either case */

  2694         if (firstdigits+2==wl && (matchword(checkword+wl-2,"st") ||

  2695 	  matchword(checkword+wl-2,"rd") || matchword(checkword+wl-2,"nd") ||

  2696 	  matchword(checkword+wl-2,"th")))

  2697 	    query=0;

  2698         if (firstdigits+3==wl && (matchword(checkword+wl-3,"sts") ||

  2699 	  matchword(checkword+wl-3,"rds") || matchword(checkword+wl-3,"nds") ||

  2700 	  matchword(checkword+wl-3,"ths")))

  2701 	    query=0;

  2702         if (firstdigits+3==wl && (matchword(checkword+wl-4,"stly") ||

  2703 	  matchword(checkword+wl-4,"rdly") ||

  2704 	  matchword(checkword+wl-4,"ndly") || matchword(checkword+wl-4,"thly")))

  2705 	    query=0;

  2706         /* digits, ending in l, L, s or d */

  2707         if (firstdigits+1==wl && (checkword[wl-1]=='l' ||

  2708 	  checkword[wl-1]=='L' || checkword[wl-1]=='s' || checkword[wl-1]=='d'))

  2709 	    query=0;

  2710         /*

  2711 	 * L at the start of a number, representing Britsh pounds, like L500.

  2712          * This is cute. We know the current word is mixeddigit. If the first

  2713          * letter is L, there must be at least one digit following. If both

  2714          * digits and letters follow, we have a genuine error, else we have a

  2715          * capital L followed by digits, and we accept that as a non-error.

  2716 	 */

  2717         if (checkword[0]=='L' && !mixdigit(checkword+1))

  2718 	    query=0;

  2719     }

  2720     return query;

  2721 }

  2723 /*

  2724  * getaword:

  2725  *

  2726  * Extracts the first/next "word" from the line, and puts

  2727  * it into "thisword". A word is defined as one English word unit--or

  2728  * at least that's the aim.

  2729  *

  2730  * Returns: a pointer to the position in the line where we will start

  2731  *          looking for the next word.

  2732  */

  2733 char *getaword(char *fromline,char *thisword)

  2734 {

  2735     int i,wordlen;

  2736     char *s;

  2737     wordlen=0;

  2738     for (;!gcisdigit(*fromline) && !gcisalpha(*fromline) && *fromline;

  2739       fromline++)

  2740 	;

  2741     /*

  2742      * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.

  2743      * Especially yucky is the case of L1,000

  2744      * This section looks for a pattern of characters including a digit

  2745      * followed by a comma or period followed by one or more digits.

  2746      * If found, it returns this whole pattern as a word; otherwise we discard

  2747      * the results and resume our normal programming.

  2748      */

  2749     s=fromline;

  2750     for (;(gcisdigit(*s) || gcisalpha(*s) || *s==',' || *s=='.') &&

  2751       wordlen<MAXWORDLEN;s++)

  2752     {

  2753 	thisword[wordlen]=*s;

  2754         wordlen++;

  2755     }

  2756     thisword[wordlen]=0;

  2757     for (i=1;i<wordlen-1;i++)

  2758     {

  2759         if (thisword[i]=='.' || thisword[i]==',')

  2760 	{

  2761             if (gcisdigit(thisword[i-1]) && gcisdigit(thisword[i-1]))

  2762 	    {

  2763                 fromline=s;

  2764                 return fromline;

  2765 	    }

  2766 	}

  2767     }

  2768     /* we didn't find a punctuated number - do the regular getword thing */

  2769     wordlen=0;

  2770     for (;(gcisdigit(*fromline) || gcisalpha(*fromline) || *fromline=='\'') &&

  2771       wordlen<MAXWORDLEN;fromline++)

  2772     {

  2773         thisword[wordlen]=*fromline;

  2774         wordlen++;

  2775     }

  2776     thisword[wordlen]=0;

  2777     return fromline;

  2778 }

  2780 /*

  2781  * matchword:

  2782  *

  2783  * A case-insensitive string matcher.

  2784  */

  2785 int matchword(char *checkfor,char *thisword)

  2786 {

  2787     unsigned int ismatch,i;

  2788     if (strlen(checkfor)!=strlen(thisword))

  2789 	return 0;

  2790     ismatch=1;     /* assume a match until we find a difference */

  2791     for (i=0;i<strlen(checkfor);i++)

  2792         if (toupper(checkfor[i])!=toupper(thisword[i]))

  2793             ismatch=0;

  2794     return ismatch;

  2795 }

  2797 /*

  2798  * lowerit:

  2799  *

  2800  * Lowercase the line.

  2801  */

  2803 void lowerit(char *theline)

  2804 {

  2805     for (;*theline;theline++)

  2806         if (*theline>='A' && *theline<='Z')

  2807             *theline+=32;

  2808 }

  2810 /*

  2811  * isroman:

  2812  *

  2813  * Is this word a Roman Numeral?

  2814  *

  2815  * It doesn't actually validate that the number is a valid Roman Numeral--for

  2816  * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not

  2817  * what we're here to do. If it passes this, it LOOKS like a Roman numeral.

  2818  * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or

  2819  * expressions thereof, except when it came to taxes. Allow any number of M,

  2820  * an optional D, an optional CM or CD, any number of optional Cs, an optional

  2821  * XL or an optional XC, an optional IX or IV, an optional V and any number

  2822  * of optional Is.

  2823  */

  2824 int isroman(char *t)

  2825 {

  2826     char *s;

  2827     if (!t || !*t)

  2828 	return 0;

  2829     s=t;

  2830     while (*t=='m' && *t)

  2831 	t++;

  2832     if (*t=='d')

  2833 	t++;

  2834     if (*t=='c' && t[1]=='m')

  2835 	t+=2;

  2836     if (*t=='c' && t[1]=='d')

  2837 	t+=2;

  2838     while (*t=='c' && *t)

  2839 	t++;

  2840     if (*t=='x' && t[1]=='l')

  2841 	t+=2;

  2842     if (*t=='x' && t[1]=='c')

  2843 	t+=2;

  2844     if (*t=='l')

  2845 	t++;

  2846     while (*t=='x' && *t)

  2847 	t++;

  2848     if (*t=='i' && t[1]=='x')

  2849 	t+=2;

  2850     if (*t=='i' && t[1]=='v')

  2851 	t+=2;

  2852     if (*t=='v')

  2853 	t++;

  2854     while (*t=='i' && *t)

  2855 	t++;

  2856     return !*t;

  2857 }

  2859 /*

  2860  * gcisalpha:

  2861  *

  2862  * A version of isalpha() that is somewhat lenient on 8-bit texts.

  2863  * If we use the standard function, 8-bit accented characters break

  2864  * words, so that tete with accented characters appears to be two words, "t"

  2865  * and "t", with 8-bit characters between them. This causes over-reporting of

  2866  * errors. gcisalpha() recognizes accented letters from the CP1252 (Windows)

  2867  * and ISO-8859-1 character sets, which are the most common PG 8-bit types.

  2868  */

  2869 int gcisalpha(unsigned char c)

  2870 {

  2871     if (c>='a' && c<='z')

  2872 	return 1;

  2873     if (c>='A' && c<='Z')

  2874 	return 1;

  2875     if (c<140)

  2876 	return 0;

  2877     if (c>=192 && c!=208 && c!=215 && c!=222 && c!=240 && c!=247 && c!=254)

  2878 	return 1;

  2879     if (c==140 || c==142 || c==156 || c==158 || c==159)

  2880 	return 1;

  2881     return 0;

  2882 }

  2884 /*

  2885  * gcisdigit:

  2886  *

  2887  * A version of isdigit() that doesn't get confused in 8-bit texts.

  2888  */

  2889 int gcisdigit(unsigned char c)

  2890 {

  2891     return c>='0' && c<='9';

  2892 }

  2894 /*

  2895  * gcisletter:

  2896  *

  2897  * A version of isletter() that doesn't get confused in 8-bit texts.

  2898  * NB: this is ISO-8891-1-specific.

  2899  */

  2900 int gcisletter(unsigned char c)

  2901 {

  2902     return c>='A' && c<='Z' || c>='a' && c<='z' || c>=192;

  2903 }

  2905 /*

  2906  * gcstrchr:

  2907  *

  2908  * Wraps strchr to return NULL if the character being searched for is zero.

  2909  */

  2910 char *gcstrchr(char *s,char c)

  2911 {

  2912     if (!c)

  2913 	return NULL;

  2914     return strchr(s,c);

  2915 }

  2917 /*

  2918  * postprocess_for_DP:

  2919  *

  2920  * Invoked with the -d switch from flgets().

  2921  * It simply "removes" from the line a hard-coded set of common

  2922  * DP-specific tags, so that the line passed to the main routine has

  2923  * been pre-cleaned of DP markup.

  2924  */

  2925 void postprocess_for_DP(char *theline)

  2926 {

  2927     char *s,*t;

  2928     int i;

  2929     if (!*theline)

  2930         return;

  2931     for (i=0;*DPmarkup[i];i++)

  2932     {

  2933         s=strstr(theline,DPmarkup[i]);

  2934         while (s)

  2935 	{

  2936             t=s+strlen(DPmarkup[i]);

  2937             while (*t)

  2938 	    {

  2939                 *s=*t;

  2940                 t++;

  2941 		s++;

  2942 	    }

  2943             *s=0;

  2944             s=strstr(theline,DPmarkup[i]);

  2945 	}

  2946     }

  2947 }

  2949 /*

  2950  * postprocess_for_HTML:

  2951  *

  2952  * Invoked with the -m switch from flgets().

  2953  * It simply "removes" from the line a hard-coded set of common

  2954  * HTML tags and "replaces" a hard-coded set of common HTML

  2955  * entities, so that the line passed to the main routine has

  2956  * been pre-cleaned of HTML.

  2957  */

  2958 void postprocess_for_HTML(char *theline)

  2959 {

  2960     if (strstr(theline,"<") && strstr(theline,">"))

  2961         while (losemarkup(theline))

  2962             ;

  2963     while (loseentities(theline))

  2964         ;

  2965 }

  2967 char *losemarkup(char *theline)

  2968 {

  2969     char *s,*t;

  2970     int i;

  2971     if (!*theline)

  2972         return NULL;

  2973     s=strstr(theline,"<");

  2974     t=strstr(theline,">");

  2975     if (!s || !t)

  2976 	return NULL;

  2977     for (i=0;*markup[i];i++)

  2978         if (!tagcomp(s+1,markup[i]))

  2979 	{

  2980             if (!t[1])

  2981 	    {

  2982                 *s=0;

  2983                 return s;

  2984 	    }

  2985             else if (t>s)

  2986 	    {

  2987 		strcpy(s,t+1);

  2988 		return s;

  2989 	    }

  2990         }

  2991     /* It's an unrecognized <xxx>. */

  2992     return NULL;

  2993 }

  2995 char *loseentities(char *theline)

  2996 {

  2997     int i;

  2998     char *s,*t;

  2999     if (!*theline)

  3000         return NULL;

  3001     for (i=0;*entities[i].htmlent;i++)

  3002     {

  3003         s=strstr(theline,entities[i].htmlent);

  3004         if (s)

  3005 	{

  3006             t=malloc((size_t)strlen(s));

  3007             if (!t)

  3008 		return NULL;

  3009             strcpy(t,s+strlen(entities[i].htmlent));

  3010             strcpy(s,entities[i].textent);

  3011             strcat(s,t);

  3012             free(t);

  3013             return theline;

  3014 	}

  3015     }

  3016     for (i=0;*entities[i].htmlnum;i++)

  3017     {

  3018         s=strstr(theline,entities[i].htmlnum);

  3019         if (s)

  3020 	{

  3021             t=malloc((size_t)strlen(s));

  3022             if (!t)

  3023 		return NULL;

  3024             strcpy(t,s+strlen(entities[i].htmlnum));

  3025             strcpy(s,entities[i].textent);

  3026             strcat(s,t);

  3027             free(t);

  3028             return theline;

  3029 	}

  3030     }

  3031     return NULL;

  3032 }

  3034 int tagcomp(char *strin,char *basetag)

  3035 {

  3036     char *s,*t;

  3037     s=basetag;

  3038     t=strin;

  3039     if (*t=='/')

  3040 	t++; /* ignore a slash */

  3041     while (*s && *t)

  3042     {

  3043         if (tolower(*s)!=tolower(*t))

  3044 	    return 1;

  3045         s++;

  3046 	t++;

  3047     }

  3048     return 0;

  3049 }

  3051 void proghelp()

  3052 {

  3053     fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);

  3054     fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);

  3055     fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);

  3056     fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "

  3057       "For details, read the file COPYING.\n",stderr);

  3058     fputs("This is Free Software; "

  3059       "you may redistribute it under certain conditions (GPL);\n",stderr);

  3060     fputs("read the file COPYING for details.\n\n",stderr);

  3061     fputs("Usage is: bookloupe [-setpxloyhud] filename\n",stderr);

  3062     fputs("  where -s checks single quotes, -e suppresses echoing lines, "

  3063       "-t checks typos\n",stderr);

  3064     fputs("  -x (paranoid) switches OFF -t and extra checks, "

  3065       "-l turns OFF line-end checks\n",stderr);

  3066     fputs("  -o just displays overview without detail, "

  3067       "-h echoes header fields\n",stderr);

  3068     fputs("  -v (verbose) unsuppresses duplicate reporting, "

  3069       "-m suppresses markup\n",stderr);

  3070     fputs("  -d ignores DP-specific markup,\n",stderr);

  3071     fputs("  -u uses a file gutcheck.typ to query user-defined "

  3072       "possible typos\n",stderr);

  3073     fputs("Sample usage: bookloupe warpeace.txt \n",stderr);

  3074     fputs("\n",stderr);

  3075     fputs("Bookloupe looks for errors in Project Gutenberg(TM) etexts.\n",

  3076       stderr);

  3077     fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "

  3078       "non-ASCII\n",stderr);

  3079     fputs("characters like accented letters, "

  3080       "lines longer than 75 or shorter than 55,\n",stderr);

  3081     fputs("unbalanced quotes or brackets, "

  3082       "a variety of badly formatted punctuation, \n",stderr);

  3083     fputs("HTML tags, some likely typos. "

  3084       "It is NOT a substitute for human judgement.\n",stderr);

  3085     fputs("\n",stderr);

  3086 }

author	ali <ali@juiblex.co.uk>
	Sun May 26 01:46:33 2013 +0100 (2013-05-26)
changeset 53	4c8606eb60c1
parent 52	a1fd8d3f0940
child 54	23b2ea51b029
permissions	-rw-r--r--