bookloupe-testing: bookloupe/bookloupe.c@e4042a067753

     1 /*************************************************************************/

     2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */

     3 /*                                                                       */

     4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>                  */

     5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>                     */

     6 /*                                                                       */

     7 /* This program is free software; you can redistribute it and/or modify  */

     8 /* it under the terms of the GNU General Public License as published by  */

     9 /* the Free Software Foundation; either version 2 of the License, or     */

    10 /* (at your option) any later version.                                   */

    11 /*                                                                       */

    12 /* This program is distributed in the hope that it will be useful,       */

    13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of        */

    14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the          */

    15 /* GNU General Public License for more details.                          */

    16 /*                                                                       */

    17 /* You should have received a copy of the GNU General Public License     */

    18 /* along with this program. If not, see <http://www.gnu.org/licenses/>.  */

    19 /*************************************************************************/

    21 #include <stdio.h>

    22 #include <stdlib.h>

    23 #include <string.h>

    24 #include <ctype.h>

    26 #define MAXWORDLEN    80    /* max length of one word             */

    27 #define LINEBUFSIZE 2048    /* buffer size for an input line      */

    29 #define MAX_USER_TYPOS 1000

    30 #define USERTYPO_FILE "gutcheck.typ"

    32 #ifndef MAX_PATH

    33 #define MAX_PATH 16384

    34 #endif

    36 char aline[LINEBUFSIZE];

    37 char prevline[LINEBUFSIZE];

    39 /* Common typos. */

    40 char *typo[] = {

    41     "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",

    42     "nad", "te", "ig", "acn",  "ahve", "alot", "anbd", "andt", "awya", "aywa",

    43     "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",

    44     "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",

    45     "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",

    46     "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",

    47     "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",

    48     "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",

    49     "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",

    50     "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",

    51     "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",

    52     "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",

    53     "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",

    54     "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",

    55     "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",

    56     "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",

    57     "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",

    58     "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",

    59     "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",

    60     "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",

    61     "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",

    62     "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",

    63     "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",

    64     "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",

    65     "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",

    66     "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",

    67     "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",

    68     "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",

    69     "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",

    70     "se", ""

    71 };

    73 char *usertypo[MAX_USER_TYPOS];

    75 /* Common abbreviations and other OK words not to query as typos. */

    76 char *okword[] = {

    77     "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",

    78     "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",

    79     "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",

    80     "outbid", "outbids", "frostbite", "frostbitten", ""

    81 };

    83 /* Common abbreviations that cause otherwise unexplained periods. */

    84 char *abbrev[] = {

    85     "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",

    86     "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""

    87 };

    89 /*

    90  * Two-Letter combinations that rarely if ever start words,

    91  * but are common scannos or otherwise common letter combinations.

    92  */

    93 char *nostart[] = {

    94     "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""

    95 };

    97 /*

    98  * Two-Letter combinations that rarely if ever end words,

    99  * but are common scannos or otherwise common letter combinations.

   100  */

   101 char *noend[] = {

   102     "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",

   103     "sw", "gr", "sl", "cl", "iy", ""

   104 };

   106 char *markup[] = {

   107     "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",

   108     "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",

   109     "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",

   110     "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""

   111 };

   113 char *DPmarkup[] = {

   114     "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""

   115 };

   117 char *nocomma[] = {

   118     "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",

   119     "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",

   120     "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",

   121     "during", "let", "toward", "among", ""

   122 };

   124 char *noperiod[] = {

   125     "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",

   126     "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",

   127     "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",

   128     "among", "those", "into", "whom", "having", "thence", ""

   129 };

   131 char vowels[] = "aeiouàáâãäæèéêëìíîïòóôõöùúûü";

   133 struct {

   134     char *htmlent;

   135     char *htmlnum;

   136     char *textent;

   137 } entities[] = {

   138     "&amp;",	"&#38;",     "&",

   139     "&lt;",	"&#60;",     "<",

   140     "&gt;",	"&#62;",     ">",

   141     "&deg;",	"&#176;",    " degrees",

   142     "&pound;",	"&#163;",    "L",

   143     "&quot;",	"&#34;",     "\"", /* quotation mark = APL quote */

   144     "&OElig;",	"&#338;",    "OE", /* latin capital ligature OE */

   145     "&oelig;",	"&#339;",    "oe", /* latin small ligature oe */

   146     "&Scaron;",	"&#352;",    "S", /* latin capital letter S with caron */

   147     "&scaron;",	"&#353;",    "s", /* latin small letter s with caron */

   148     "&Yuml;",	"&#376;",    "Y", /* latin capital letter Y with diaeresis */

   149     "&circ;",	"&#710;",    "",  /* modifier letter circumflex accent */

   150     "&tilde;",	"&#732;",    "~", /* small tilde, U+02DC ISOdia */

   151     "&ensp;",	"&#8194;",   " ", /* en space, U+2002 ISOpub */

   152     "&emsp;",	"&#8195;",   " ", /* em space, U+2003 ISOpub */

   153     "&thinsp;",	"&#8201;",   " ", /* thin space, U+2009 ISOpub */

   154     "&ndash;",	"&#8211;",   "-", /* en dash, U+2013 ISOpub */

   155     "&mdash;",	"&#8212;",   "--", /* em dash, U+2014 ISOpub */

   156     "&rsquo;",	"&#8217;",   "'", /* right single quotation mark */

   157     "&sbquo;",	"&#8218;",   "'", /* single low-9 quotation mark */

   158     "&ldquo;",	"&#8220;",   "\"", /* left double quotation mark */

   159     "&rdquo;",	"&#8221;",   "\"", /* right double quotation mark */

   160     "&bdquo;",	"&#8222;",   "\"", /* double low-9 quotation mark */

   161     "&lsaquo;",	"&#8249;",   "\"", /* single left-pointing angle quotation mark */

   162     "&rsaquo;",	"&#8250;",   "\"", /* single right-pointing angle quotation mark */

   163     "&nbsp;",	"&#160;",    " ", /* no-break space = non-breaking space, */

   164     "&iexcl;",	"&#161;",    "!", /* inverted exclamation mark */

   165     "&cent;",	"&#162;",    "c", /* cent sign */

   166     "&pound;",	"&#163;",    "L", /* pound sign */

   167     "&curren;",	"&#164;",    "$", /* currency sign */

   168     "&yen;",	"&#165;",    "Y", /* yen sign = yuan sign */

   169     "&sect;",	"&#167;",    "--", /* section sign */

   170     "&uml;",	"&#168;",    " ", /* diaeresis = spacing diaeresis */

   171     "&copy;",	"&#169;",    "(C) ", /* copyright sign */

   172     "&ordf;",	"&#170;",    " ", /* feminine ordinal indicator */

   173     "&laquo;",	"&#171;",    "\"", /* left-pointing double angle quotation mark */

   174     "&shy;",	"&#173;",    "-", /* soft hyphen = discretionary hyphen */

   175     "&reg;",	"&#174;",    "(R) ", /* registered sign = registered trade mark sign */

   176     "&macr;",	"&#175;",    " ", /* macron = spacing macron = overline */

   177     "&deg;",	"&#176;",    " degrees", /* degree sign */

   178     "&plusmn;",	"&#177;",    "+-", /* plus-minus sign = plus-or-minus sign */

   179     "&sup2;",	"&#178;",    "2", /* superscript two = superscript digit two */

   180     "&sup3;",	"&#179;",    "3", /* superscript three = superscript digit three */

   181     "&acute;",	"&#180;",    " ", /* acute accent = spacing acute */

   182     "&micro;",	"&#181;",    "m", /* micro sign */

   183     "&para;",	"&#182;",    "--", /* pilcrow sign = paragraph sign */

   184     "&cedil;",	"&#184;",    " ", /* cedilla = spacing cedilla */

   185     "&sup1;",	"&#185;",    "1", /* superscript one = superscript digit one */

   186     "&ordm;",	"&#186;",    " ", /* masculine ordinal indicator */

   187     "&raquo;",	"&#187;",    "\"", /* right-pointing double angle quotation mark */

   188     "&frac14;",	"&#188;",    "1/4", /* vulgar fraction one quarter */

   189     "&frac12;",	"&#189;",    "1/2", /* vulgar fraction one half */

   190     "&frac34;",	"&#190;",    "3/4", /* vulgar fraction three quarters */

   191     "&iquest;",	"&#191;",    "?", /* inverted question mark */

   192     "&Agrave;",	"&#192;",    "A", /* latin capital letter A with grave */

   193     "&Aacute;",	"&#193;",    "A", /* latin capital letter A with acute */

   194     "&Acirc;",	"&#194;",    "A", /* latin capital letter A with circumflex */

   195     "&Atilde;",	"&#195;",    "A", /* latin capital letter A with tilde */

   196     "&Auml;",	"&#196;",    "A", /* latin capital letter A with diaeresis */

   197     "&Aring;",	"&#197;",    "A", /* latin capital letter A with ring above */

   198     "&AElig;",	"&#198;",    "AE", /* latin capital letter AE */

   199     "&Ccedil;",	"&#199;",    "C", /* latin capital letter C with cedilla */

   200     "&Egrave;",	"&#200;",    "E", /* latin capital letter E with grave */

   201     "&Eacute;",	"&#201;",    "E", /* latin capital letter E with acute */

   202     "&Ecirc;",	"&#202;",    "E", /* latin capital letter E with circumflex */

   203     "&Euml;",	"&#203;",    "E", /* latin capital letter E with diaeresis */

   204     "&Igrave;",	"&#204;",    "I", /* latin capital letter I with grave */

   205     "&Iacute;",	"&#205;",    "I", /* latin capital letter I with acute */

   206     "&Icirc;",	"&#206;",    "I", /* latin capital letter I with circumflex */

   207     "&Iuml;",	"&#207;",    "I", /* latin capital letter I with diaeresis */

   208     "&ETH;",	"&#208;",    "E", /* latin capital letter ETH */

   209     "&Ntilde;",	"&#209;",    "N", /* latin capital letter N with tilde */

   210     "&Ograve;",	"&#210;",    "O", /* latin capital letter O with grave */

   211     "&Oacute;",	"&#211;",    "O", /* latin capital letter O with acute */

   212     "&Ocirc;",	"&#212;",    "O", /* latin capital letter O with circumflex */

   213     "&Otilde;",	"&#213;",    "O", /* latin capital letter O with tilde */

   214     "&Ouml;",	"&#214;",    "O", /* latin capital letter O with diaeresis */

   215     "&times;",	"&#215;",    "*", /* multiplication sign */

   216     "&Oslash;",	"&#216;",    "O", /* latin capital letter O with stroke */

   217     "&Ugrave;",	"&#217;",    "U", /* latin capital letter U with grave */

   218     "&Uacute;",	"&#218;",    "U", /* latin capital letter U with acute */

   219     "&Ucirc;",	"&#219;",    "U", /* latin capital letter U with circumflex */

   220     "&Uuml;",	"&#220;",    "U", /* latin capital letter U with diaeresis */

   221     "&Yacute;",	"&#221;",    "Y", /* latin capital letter Y with acute */

   222     "&THORN;",	"&#222;",    "TH", /* latin capital letter THORN */

   223     "&szlig;",	"&#223;",    "sz", /* latin small letter sharp s = ess-zed */

   224     "&agrave;",	"&#224;",    "a", /* latin small letter a with grave */

   225     "&aacute;",	"&#225;",    "a", /* latin small letter a with acute */

   226     "&acirc;",	"&#226;",    "a", /* latin small letter a with circumflex */

   227     "&atilde;",	"&#227;",    "a", /* latin small letter a with tilde */

   228     "&auml;",	"&#228;",    "a", /* latin small letter a with diaeresis */

   229     "&aring;",	"&#229;",    "a", /* latin small letter a with ring above */

   230     "&aelig;",	"&#230;",    "ae", /* latin small letter ae */

   231     "&ccedil;",	"&#231;",    "c", /* latin small letter c with cedilla */

   232     "&egrave;",	"&#232;",    "e", /* latin small letter e with grave */

   233     "&eacute;",	"&#233;",    "e", /* latin small letter e with acute */

   234     "&ecirc;",	"&#234;",    "e", /* latin small letter e with circumflex */

   235     "&euml;",	"&#235;",    "e", /* latin small letter e with diaeresis */

   236     "&igrave;",	"&#236;",    "i", /* latin small letter i with grave */

   237     "&iacute;",	"&#237;",    "i", /* latin small letter i with acute */

   238     "&icirc;",	"&#238;",    "i", /* latin small letter i with circumflex */

   239     "&iuml;",	"&#239;",    "i", /* latin small letter i with diaeresis */

   240     "&eth;",	"&#240;",    "eth", /* latin small letter eth */

   241     "&ntilde;",	"&#241;",    "n", /* latin small letter n with tilde */

   242     "&ograve;",	"&#242;",    "o", /* latin small letter o with grave */

   243     "&oacute;",	"&#243;",    "o", /* latin small letter o with acute */

   244     "&ocirc;",	"&#244;",    "o", /* latin small letter o with circumflex */

   245     "&otilde;",	"&#245;",    "o", /* latin small letter o with tilde */

   246     "&ouml;",	"&#246;",    "o", /* latin small letter o with diaeresis */

   247     "&divide;",	"&#247;",    "/", /* division sign */

   248     "&oslash;",	"&#248;",    "o", /* latin small letter o with stroke */

   249     "&ugrave;",	"&#249;",    "u", /* latin small letter u with grave */

   250     "&uacute;",	"&#250;",    "u", /* latin small letter u with acute */

   251     "&ucirc;",	"&#251;",    "u", /* latin small letter u with circumflex */

   252     "&uuml;",	"&#252;",    "u", /* latin small letter u with diaeresis */

   253     "&yacute;",	"&#253;",    "y", /* latin small letter y with acute */

   254     "&thorn;",	"&#254;",    "th", /* latin small letter thorn */

   255     "&yuml;",	"&#255;",    "y", /* latin small letter y with diaeresis */

   256     "", ""

   257 };

   259 /* special characters */

   260 #define CHAR_SPACE        32

   261 #define CHAR_TAB           9

   262 #define CHAR_LF           10

   263 #define CHAR_CR           13

   264 #define CHAR_DQUOTE       34

   265 #define CHAR_SQUOTE       39

   266 #define CHAR_OPEN_SQUOTE  96

   267 #define CHAR_TILDE       126

   268 #define CHAR_ASTERISK     42

   269 #define CHAR_FORESLASH    47

   270 #define CHAR_CARAT        94

   272 #define CHAR_UNDERSCORE    '_'

   273 #define CHAR_OPEN_CBRACK   '{'

   274 #define CHAR_CLOSE_CBRACK  '}'

   275 #define CHAR_OPEN_RBRACK   '('

   276 #define CHAR_CLOSE_RBRACK  ')'

   277 #define CHAR_OPEN_SBRACK   '['

   278 #define CHAR_CLOSE_SBRACK  ']'

   280 /* longest and shortest normal PG line lengths */

   281 #define LONGEST_PG_LINE   75

   282 #define WAY_TOO_LONG      80

   283 #define SHORTEST_PG_LINE  55

   285 #define SWITCHES "ESTPXLOYHWVMUD" /* switches:-                            */

   286                                   /*     D - ignore DP-specific markup     */

   287                                   /*     E - echo queried line             */

   288                                   /*     S - check single quotes           */

   289                                   /*     T - check common typos            */

   290                                   /*     P - require closure of quotes on  */

   291                                   /*         every paragraph               */

   292                                   /*     X - "Trust no one" :-) Paranoid!  */

   293                                   /*         Queries everything            */

   294                                   /*     L - line end checking defaults on */

   295                                   /*         -L turns it off               */

   296                                   /*     O - overview. Just shows counts.  */

   297                                   /*     Y - puts errors to stdout         */

   298                                   /*         instead of stderr             */

   299                                   /*     H - Echoes header fields          */

   300                                   /*     M - Ignore markup in < >          */

   301                                   /*     U - Use file of User-defined Typos*/

   302                                   /*     W - Defaults for use on Web upload*/

   303                                   /*     V - Verbose - list EVERYTHING!    */

   304 #define SWITNO 14                 /* max number of switch parms            */

   305                                   /*        - used for defining array-size */

   306 #define MINARGS   1               /* minimum no of args excl switches      */

   307 #define MAXARGS   1               /* maximum no of args excl switches      */

   309 int pswit[SWITNO];                /* program switches set by SWITCHES      */

   311 #define ECHO_SWITCH      0

   312 #define SQUOTE_SWITCH    1

   313 #define TYPO_SWITCH      2

   314 #define QPARA_SWITCH     3

   315 #define PARANOID_SWITCH  4

   316 #define LINE_END_SWITCH  5

   317 #define OVERVIEW_SWITCH  6

   318 #define STDOUT_SWITCH    7

   319 #define HEADER_SWITCH    8

   320 #define WEB_SWITCH       9

   321 #define VERBOSE_SWITCH   10

   322 #define MARKUP_SWITCH    11

   323 #define USERTYPO_SWITCH  12

   324 #define DP_SWITCH        13

   326 long cnt_dquot;       /* for overview mode, count of doublequote queries */

   327 long cnt_squot;       /* for overview mode, count of singlequote queries */

   328 long cnt_brack;       /* for overview mode, count of brackets queries */

   329 long cnt_bin;         /* for overview mode, count of non-ASCII queries */

   330 long cnt_odd;         /* for overview mode, count of odd character queries */

   331 long cnt_long;        /* for overview mode, count of long line errors */

   332 long cnt_short;       /* for overview mode, count of short line queries */

   333 long cnt_punct;       /* for overview mode, count of punctuation and spacing queries */

   334 long cnt_dash;        /* for overview mode, count of dash-related queries */

   335 long cnt_word;        /* for overview mode, count of word queries */

   336 long cnt_html;        /* for overview mode, count of html queries */

   337 long cnt_lineend;     /* for overview mode, count of line-end queries */

   338 long cnt_spacend;     /* count of lines with space at end */

   339 long linecnt;         /* count of total lines in the file */

   340 long checked_linecnt; /* count of lines actually checked */

   342 void proghelp(void);

   343 void procfile(char *);

   345 #define LOW_THRESHOLD    0

   346 #define HIGH_THRESHOLD   1

   348 #define START 0

   349 #define END 1

   350 #define PREV 0

   351 #define NEXT 1

   352 #define FIRST_OF_PAIR 0

   353 #define SECOND_OF_PAIR 1

   355 #define MAX_WORDPAIR 1000

   357 char running_from[MAX_PATH];

   359 int mixdigit(char *);

   360 char *getaword(char *,char *);

   361 int matchword(char *,char *);

   362 char *flgets(char *,int,FILE *,long);

   363 void lowerit(char *);

   364 int gcisalpha(unsigned char);

   365 int gcisdigit(unsigned char);

   366 int gcisletter(unsigned char);

   367 char *gcstrchr(char *s,char c);

   368 void postprocess_for_HTML(char *);

   369 char *linehasmarkup(char *);

   370 char *losemarkup(char *);

   371 int tagcomp(char *,char *);

   372 char *loseentities(char *);

   373 int isroman(char *);

   374 int usertypo_count;

   375 void postprocess_for_DP(char *);

   377 char wrk[LINEBUFSIZE];

   379 #define MAX_QWORD 50

   380 #define MAX_QWORD_LENGTH 40

   381 char qword[MAX_QWORD][MAX_QWORD_LENGTH];

   382 char qperiod[MAX_QWORD][MAX_QWORD_LENGTH];

   383 signed int dupcnt[MAX_QWORD];

   385 int main(int argc,char **argv)

   386 {

   387     char *argsw,*s;

   388     int i,switno,invarg;

   389     char usertypo_file[MAX_PATH];

   390     FILE *usertypofile;

   391     if (strlen(argv[0])<sizeof(running_from))

   392 	/* save the path to the executable */

   393         strcpy(running_from,argv[0]);

   394     /* find out what directory we're running from */

   395     s=running_from+strlen(running_from);

   396     for (;*s!='/' && *s!='\\' && s>=running_from;s--)

   397         *s=0;

   398     switno=strlen(SWITCHES);

   399     for (i=switno;--i>0;)

   400         pswit[i]=0;           /* initialise switches */

   401     /*

   402      * Standard loop to extract switches.

   403      * When we come out of this loop, the arguments will be

   404      * in argv[0] upwards and the switches used will be

   405      * represented by their equivalent elements in pswit[]

   406      */

   407     while (--argc>0 && **++argv=='-')

   408         for (argsw=argv[0]+1;*argsw!='\0';argsw++)

   409             for (i=switno,invarg=1;(--i>=0) && invarg==1;)

   410                 if ((toupper(*argsw))==SWITCHES[i])

   411 		{

   412                     invarg=0;

   413                     pswit[i]=1;

   414 		}

   415     /* Paranoid checking is turned OFF, not on, by its switch */

   416     pswit[PARANOID_SWITCH]^=1;

   417     if (pswit[PARANOID_SWITCH])

   418 	/* if running in paranoid mode force typo checks as well   */

   419         pswit[TYPO_SWITCH]=pswit[TYPO_SWITCH]^1;

   420     /* Line-end checking is turned OFF, not on, by its switch */

   421     pswit[LINE_END_SWITCH]^=1;

   422     /* Echoing is turned OFF, not on, by its switch */

   423     pswit[ECHO_SWITCH]^=1;

   424     if (pswit[OVERVIEW_SWITCH])

   425 	/* just print summary; don't echo */

   426         pswit[ECHO_SWITCH]=0;

   427     /*

   428      * Web uploads - for the moment, this is really just a placeholder

   429      * until we decide what processing we really want to do on web uploads

   430      */

   431     if (pswit[WEB_SWITCH])

   432     {

   433 	/* specific override for web uploads */

   434         pswit[ECHO_SWITCH]=1;

   435         pswit[SQUOTE_SWITCH]=0;

   436         pswit[TYPO_SWITCH]=1;

   437         pswit[QPARA_SWITCH]=0;

   438         pswit[PARANOID_SWITCH]=1;

   439         pswit[LINE_END_SWITCH]=0;

   440         pswit[OVERVIEW_SWITCH]=0;

   441         pswit[STDOUT_SWITCH]=0;

   442         pswit[HEADER_SWITCH]=1;

   443         pswit[VERBOSE_SWITCH]=0;

   444         pswit[MARKUP_SWITCH]=0;

   445         pswit[USERTYPO_SWITCH]=0;

   446         pswit[DP_SWITCH]=0;

   447     }

   448     if (argc<MINARGS || argc>MAXARGS)

   449     {

   450 	/* check number of args */

   451         proghelp();

   452         return 1;

   453     }

   454     /* read in the user-defined stealth scanno list */

   455     if (pswit[USERTYPO_SWITCH])

   456     {

   457 	/* ... we were told we had one! */

   458         usertypofile=fopen(USERTYPO_FILE,"rb");

   459         if (!usertypofile)

   460 	{

   461 	    /* not in cwd. try excuteable directory. */

   462             strcpy(usertypo_file,running_from);

   463             strcat(usertypo_file,USERTYPO_FILE);

   464             usertypofile=fopen(usertypo_file,"rb");

   465             if (!usertypofile) {

   466 		/* we ain't got no user typo file! */

   467                 printf("   --> I couldn't find gutcheck.typ "

   468 		  "-- proceeding without user typos.\n");

   469 	    }

   470 	}

   471         usertypo_count=0;

   472         if (usertypofile)

   473 	{

   474 	    /* we managed to open a User Typo File! */

   475             if (pswit[USERTYPO_SWITCH])

   476 	    {

   477                 while (flgets(aline,LINEBUFSIZE-1,usertypofile,

   478 		  (long)usertypo_count))

   479 		{

   480                     if (strlen(aline)>1)

   481 		    {

   482                         if ((int)*aline>33)

   483 			{

   484                             s=malloc(strlen(aline)+1);

   485                             if (!s)

   486 			    {

   487                                 fprintf(stderr,"bookloupe: cannot get enough "

   488 				  "memory for user typo file!\n");

   489                                 exit(1);

   490 			    }

   491                             strcpy(s,aline);

   492                             usertypo[usertypo_count]=s;

   493                             usertypo_count++;

   494                             if (usertypo_count>=MAX_USER_TYPOS)

   495 			    {

   496                                 printf("   --> Only %d user-defined typos "

   497 				  "allowed: ignoring the rest\n",

   498 				  MAX_USER_TYPOS);

   499                                 break;

   500 			    }

   501 			}

   502 		    }

   503 		}

   504 	    }

   505             fclose(usertypofile);

   506 	}

   507     }

   508     fprintf(stderr,"bookloupe: Check and report on an e-text\n");

   509     cnt_dquot=cnt_squot=cnt_brack=cnt_bin=cnt_odd=cnt_long=

   510     cnt_short=cnt_punct=cnt_dash=cnt_word=cnt_html=cnt_lineend=

   511     cnt_spacend=0;

   512     procfile(argv[0]);

   513     if (pswit[OVERVIEW_SWITCH])

   514     {

   515 	printf("    Checked %ld lines of %ld (head+foot = %ld)\n\n",

   516 	  checked_linecnt,linecnt,linecnt-checked_linecnt);

   517         printf("    --------------- Queries found --------------\n");

   518         if (cnt_long)

   519 	    printf("    Long lines:                    %14ld\n",cnt_long);

   520         if (cnt_short)

   521 	    printf("    Short lines:                   %14ld\n",cnt_short);

   522         if (cnt_lineend)

   523 	    printf("    Line-end problems:             %14ld\n",cnt_lineend);

   524         if (cnt_word)

   525 	    printf("    Common typos:                  %14ld\n",cnt_word);

   526         if (cnt_dquot)

   527 	    printf("    Unmatched quotes:              %14ld\n",cnt_dquot);

   528         if (cnt_squot)

   529 	    printf("    Unmatched SingleQuotes:        %14ld\n",cnt_squot);

   530         if (cnt_brack)

   531 	    printf("    Unmatched brackets:            %14ld\n",cnt_brack);

   532         if (cnt_bin)

   533 	    printf("    Non-ASCII characters:          %14ld\n",cnt_bin);

   534         if (cnt_odd)

   535 	    printf("    Proofing characters:           %14ld\n",cnt_odd);

   536         if (cnt_punct)

   537 	    printf("    Punctuation & spacing queries: %14ld\n",cnt_punct);

   538         if (cnt_dash)

   539 	    printf("    Non-standard dashes:           %14ld\n",cnt_dash);

   540         if (cnt_html)

   541 	    printf("    Possible HTML tags:            %14ld\n",cnt_html);

   542         printf("\n");

   543         printf("    TOTAL QUERIES                  %14ld\n",

   544           cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+

   545           cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend);

   546     }

   547     return 0;

   548 }

   550 struct first_pass_results {

   551     long firstline,astline;

   552     long footerline,totlen,binlen,alphalen,endquote_count,shortline,dotcomma;

   553     long fslashline,hyphens,longline,verylongline,htmcount,standalone_digit;

   554     long spacedash,emdash,space_emdash,non_PG_space_emdash,PG_space_emdash;

   555     signed int Dutchcount,Frenchcount;

   556 };

   558 /*

   559  * first_pass:

   560  *

   561  * Run a first pass - verify that it's a valid PG

   562  * file, decide whether to report some things that

   563  * occur many times in the text like long or short

   564  * lines, non-standard dashes, etc.

   565  */

   566 struct first_pass_results *first_pass(FILE *infile)

   567 {

   568     char laststart=CHAR_SPACE,*s;

   569     signed int i,llen;

   570     unsigned int lastlen=0,lastblen=0;

   571     long spline=0,nspline=0;

   572     static struct first_pass_results results={0};

   573     char inword[MAXWORDLEN]="";

   574     while (fgets(aline,LINEBUFSIZE-1,infile))

   575     {

   576         while (aline[strlen(aline)-1]==10 || aline[strlen(aline)-1]==13)

   577 	    aline[strlen(aline)-1]=0;

   578         linecnt++;

   579         if (strstr(aline,"*END") && strstr(aline,"SMALL PRINT") &&

   580 	  (strstr(aline,"PUBLIC DOMAIN") || strstr(aline,"COPYRIGHT")))

   581 	{

   582             if (spline)

   583                 printf("   --> Duplicate header?\n");

   584             spline=linecnt+1;   /* first line of non-header text, that is */

   585 	}

   586         if (!strncmp(aline,"*** START",9) && strstr(aline,"PROJECT GUTENBERG"))

   587 	{

   588             if (nspline)

   589                 printf("   --> Duplicate header?\n");

   590             nspline=linecnt+1;   /* first line of non-header text, that is */

   591 	}

   592         if (spline || nspline)

   593 	{

   594             lowerit(aline);

   595             if (strstr(aline,"end") && strstr(aline,"project gutenberg"))

   596 	    {

   597                 if (strstr(aline,"end")<strstr(aline,"project gutenberg"))

   598 		{

   599                     if (results.footerline)

   600 		    {

   601 			/* it's an old-form header - we can detect duplicates */

   602                         if (!nspline)

   603                             printf("   --> Duplicate footer?\n");

   604 		    }

   605                     else

   606                         results.footerline=linecnt;

   607 		}

   608 	    }

   609 	}

   610         if (spline)

   611 	    results.firstline=spline;

   612         if (nspline)

   613 	    results.firstline=nspline;  /* override with new */

   614         if (results.footerline)

   615 	    continue;    /* don't count the boilerplate in the footer */

   616         llen=strlen(aline);

   617         results.totlen+=llen;

   618         for (i=0;i<llen;i++)

   619 	{

   620             if ((unsigned char)aline[i]>127)

   621 		results.binlen++;

   622             if (gcisalpha(aline[i]))

   623 		results.alphalen++;

   624             if (i>0 && aline[i]==CHAR_DQUOTE && isalpha(aline[i-1]))

   625 		results.endquote_count++;

   626 	}

   627         if (strlen(aline)>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE &&

   628 	  lastblen>2 && lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)

   629 	    results.shortline++;

   630         if (*aline && (unsigned char)aline[strlen(aline)-1]<=CHAR_SPACE)

   631 	    cnt_spacend++;

   632         if (strstr(aline,".,"))

   633 	    results.dotcomma++;

   634         /* only count ast lines for ignoring purposes where there is */

   635         /* locase text on the line */

   636         if (strstr(aline,"*"))

   637 	{

   638             for (s=aline;*s;s++)

   639                 if (*s>='a' && *s<='z')

   640                     break;

   641              if (*s)

   642 		results.astline++;

   643 	}

   644         if (strstr(aline,"/"))

   645             results.fslashline++;

   646         for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--)

   647 	    ;

   648         if (aline[i]=='-' && aline[i-1]!='-')

   649 	    results.hyphens++;

   650         if (llen>LONGEST_PG_LINE)

   651 	    results.longline++;

   652         if (llen>WAY_TOO_LONG)

   653 	    results.verylongline++;

   654         if (strstr(aline,"<") && strstr(aline,">"))

   655 	{

   656             i=(signed int)(strstr(aline,">")-strstr(aline,"<")+1);

   657             if (i>0)

   658                 results.htmcount++;

   659             if (strstr(aline,"<i>"))

   660 		results.htmcount+=4; /* bonus marks! */

   661 	}

   662         /* Check for spaced em-dashes */

   663         if (strstr(aline,"--"))

   664 	{

   665             results.emdash++;

   666             if (*(strstr(aline,"--")-1)==CHAR_SPACE ||

   667                (*(strstr(aline,"--")+2)==CHAR_SPACE))

   668 		results.space_emdash++;

   669             if (*(strstr(aline,"--")-1)==CHAR_SPACE &&

   670                (*(strstr(aline,"--")+2)==CHAR_SPACE))

   671 		/* count of em-dashes with spaces both sides */

   672 		results.non_PG_space_emdash++;

   673             if (*(strstr(aline,"--")-1)!=CHAR_SPACE &&

   674                (*(strstr(aline,"--")+2)!=CHAR_SPACE))

   675 		/* count of PG-type em-dashes with no spaces */

   676 		results.PG_space_emdash++;

   677 	}

   678         for (s=aline;*s;)

   679 	{

   680             s=getaword(s,inword);

   681             if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))

   682                 results.Dutchcount++;

   683             if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))

   684                 results.Frenchcount++;

   685             if (!strcmp(inword,"0") || !strcmp(inword,"1"))

   686                 results.standalone_digit++;

   687 	}

   688         /* Check for spaced dashes */

   689         if (strstr(aline," -") && *(strstr(aline," -")+2)!='-')

   690 	    results.spacedash++;

   691         lastblen=lastlen;

   692         lastlen=strlen(aline);

   693         laststart=aline[0];

   694     }

   695     return &results;

   696 }

   698 struct warnings {

   699     signed int shortline,longline,bin,dash,dotcomma,ast,fslash,digit,hyphen;

   700     signed int endquote,isDutch,isFrench;

   701 };

   703 /*

   704  * report_first_pass:

   705  *

   706  * Make some snap decisions based on the first pass results.

   707  */

   708 struct warnings *report_first_pass(struct first_pass_results *results)

   709 {

   710     static struct warnings warnings={0};

   711     if (cnt_spacend>0)

   712         printf("   --> %ld lines in this file have white space at end\n",

   713 	  cnt_spacend);

   714     warnings.dotcomma=1;

   715     if (results->dotcomma>5)

   716     {

   717         warnings.dotcomma=0;

   718         printf("   --> %ld lines in this file contain '.,'. "

   719 	  "Not reporting them.\n",results->dotcomma);

   720     }

   721     /*

   722      * If more than 50 lines, or one-tenth, are short,

   723      * don't bother reporting them.

   724      */

   725     warnings.shortline=1;

   726     if (results->shortline>50 || results->shortline*10>linecnt)

   727     {

   728         warnings.shortline=0;

   729         printf("   --> %ld lines in this file are short. "

   730 	  "Not reporting short lines.\n",results->shortline);

   731     }

   732     /*

   733      * If more than 50 lines, or one-tenth, are long,

   734      * don't bother reporting them.

   735      */

   736     warnings.longline=1;

   737     if (results->longline>50 || results->longline*10>linecnt)

   738     {

   739         warnings.longline=0;

   740         printf("   --> %ld lines in this file are long. "

   741 	  "Not reporting long lines.\n",results->longline);

   742     }

   743     /* If more than 10 lines contain asterisks, don't bother reporting them. */

   744     warnings.ast=1;

   745     if (results->astline>10)

   746     {

   747         warnings.ast=0;

   748         printf("   --> %ld lines in this file contain asterisks. "

   749 	  "Not reporting them.\n",results->astline);

   750     }

   751     /*

   752      * If more than 10 lines contain forward slashes,

   753      * don't bother reporting them.

   754      */

   755     warnings.fslash=1;

   756     if (results->fslashline>10)

   757     {

   758         warnings.fslash=0;

   759         printf("   --> %ld lines in this file contain forward slashes. "

   760 	  "Not reporting them.\n",results->fslashline);

   761     }

   762     /*

   763      * If more than 20 lines contain unpunctuated endquotes,

   764      * don't bother reporting them.

   765      */

   766     warnings.endquote=1;

   767     if (results->endquote_count>20)

   768     {

   769         warnings.endquote=0;

   770         printf("   --> %ld lines in this file contain unpunctuated endquotes. "

   771 	  "Not reporting them.\n",results->endquote_count);

   772     }

   773     /*

   774      * If more than 15 lines contain standalone digits,

   775      * don't bother reporting them.

   776      */

   777     warnings.digit=1;

   778     if (results->standalone_digit>10)

   779     {

   780         warnings.digit=0;

   781         printf("   --> %ld lines in this file contain standalone 0s and 1s. "

   782 	  "Not reporting them.\n",results->standalone_digit);

   783     }

   784     /*

   785      * If more than 20 lines contain hyphens at end,

   786      * don't bother reporting them.

   787      */

   788     warnings.hyphen=1;

   789     if (results->hyphens>20)

   790     {

   791         warnings.hyphen=0;

   792         printf("   --> %ld lines in this file have hyphens at end. "

   793 	  "Not reporting them.\n",results->hyphens);

   794     }

   795     if (results->htmcount>20 && !pswit[MARKUP_SWITCH])

   796     {

   797         printf("   --> Looks like this is HTML. Switching HTML mode ON.\n");

   798         pswit[MARKUP_SWITCH]=1;

   799     }

   800     if (results->verylongline>0)

   801         printf("   --> %ld lines in this file are VERY long!\n",

   802 	  results->verylongline);

   803     /*

   804      * If there are more non-PG spaced dashes than PG em-dashes,

   805      * assume it's deliberate.

   806      * Current PG guidelines say don't use them, but older texts do,

   807      * and some people insist on them whatever the guidelines say.

   808      */

   809     warnings.dash=1;

   810     if (results->spacedash+results->non_PG_space_emdash>

   811       results->PG_space_emdash)

   812     {

   813         warnings.dash=0;

   814         printf("   --> There are %ld spaced dashes and em-dashes. "

   815 	  "Not reporting them.\n",

   816 	  results->spacedash+results->non_PG_space_emdash);

   817     }

   818     /* If more than a quarter of characters are hi-bit, bug out. */

   819     warnings.bin=1;

   820     if (results->binlen*4>results->totlen)

   821     {

   822         printf("   --> This file does not appear to be ASCII. "

   823 	  "Terminating. Best of luck with it!\n");

   824         exit(1);

   825     }

   826     if (results->alphalen*4<results->totlen)

   827     {

   828         printf("   --> This file does not appear to be text. "

   829 	  "Terminating. Best of luck with it!\n");

   830         exit(1);

   831     }

   832     if (results->binlen*100>results->totlen || results->binlen>100)

   833     {

   834         printf("   --> There are a lot of foreign letters here. "

   835 	  "Not reporting them.\n");

   836         warnings.bin=0;

   837     }

   838     warnings.isDutch=0;

   839     if (results->Dutchcount>50)

   840     {

   841         warnings.isDutch=1;

   842         printf("   --> This looks like Dutch - "

   843 	  "switching off dashes and warnings for 's Middags case.\n");

   844     }

   845     warnings.isFrench=0;

   846     if (results->Frenchcount>50)

   847     {

   848         warnings.isFrench=1;

   849         printf("   --> This looks like French - "

   850 	  "switching off some doublepunct.\n");

   851     }

   852     if (results->firstline && results->footerline)

   853         printf("    The PG header and footer appear to be already on.\n");

   854     else

   855     {

   856         if (results->firstline)

   857             printf("    The PG header is on - no footer.\n");

   858         if (results->footerline)

   859             printf("    The PG footer is on - no header.\n");

   860     }

   861     printf("\n");

   862     if (pswit[VERBOSE_SWITCH])

   863     {

   864         warnings.bin=1;

   865         warnings.shortline=1;

   866         warnings.dotcomma=1;

   867         warnings.longline=1;

   868         warnings.dash=1;

   869         warnings.digit=1;

   870         warnings.ast=1;

   871         warnings.fslash=1;

   872         warnings.hyphen=1;

   873         warnings.endquote=1;

   874         printf("   *** Verbose output is ON -- you asked for it! ***\n");

   875     }

   876     if (warnings.isDutch)

   877         warnings.dash=0;

   878     if (results->footerline>0 && results->firstline>0 &&

   879       results->footerline>results->firstline &&

   880       results->footerline-results->firstline<100)

   881     {

   882         printf("   --> I don't really know where this text starts. \n");

   883         printf("       There are no reference points.\n");

   884         printf("       I'm going to have to report the header and footer "

   885 	  "as well.\n");

   886         results->firstline=0;

   887     }

   888     return &warnings;

   889 }

   891 struct counters {

   892     long quot;

   893     signed int c_unders,c_brack,s_brack,r_brack;

   894     signed int open_single_quote,close_single_quote;

   895 };

   897 /*

   898  * analyse_quotes:

   899  *

   900  * Look along the line, accumulate the count of quotes, and see

   901  * if this is an empty line - i.e. a line with nothing on it

   902  * but spaces.

   903  * If line has just spaces, period, * and/or - on it, don't

   904  * count it, since empty lines with asterisks or dashes to

   905  * separate sections are common.

   906  *

   907  * Returns: Non-zero if the line is empty.

   908  */

   909 int analyse_quotes(const char *s,struct counters *counters)

   910 {

   911     signed int guessquote=0;

   912     int isemptyline=1;    /* assume the line is empty until proven otherwise */

   913     while (*s)

   914     {

   915 	if (*s==CHAR_DQUOTE)

   916 	    counters->quot++;

   917 	if (*s==CHAR_SQUOTE || *s==CHAR_OPEN_SQUOTE)

   918 	{

   919 	    if (s==aline)

   920 	    {

   921 		/*

   922 		 * At start of line, it can only be an openquote.

   923 		 * Hardcode a very common exception!

   924 		 */

   925 		if (strncmp(s+2,"tis",3) && strncmp(s+2,"Tis",3))

   926 		    counters->open_single_quote++;

   927 	    }

   928 	    else if (gcisalpha(s[-1]) && gcisalpha(s[1]))

   929 		/* Do nothing! it's definitely an apostrophe, not a quote */

   930 		;

   931 	    /* it's outside a word - let's check it out */

   932 	    else if (*s==CHAR_OPEN_SQUOTE || gcisalpha(s[1]))

   933 	    {

   934 		/* it damwell better BE an openquote */

   935 		if (strncmp(s+1,"tis",3) && strncmp(s+1,"Tis",3))

   936 		    /* hardcode a very common exception! */

   937 		    counters->open_single_quote++;

   938 	    }

   939 	    else

   940 	    {

   941 		/* now - is it a closequote? */

   942 		guessquote=0;   /* accumulate clues */

   943 		if (gcisalpha(s[-1]))

   944 		{

   945 		    /* it follows a letter - could be either */

   946 		    guessquote++;

   947 		    if (s[-1]=='s')

   948 		    {

   949 			/* looks like a plural apostrophe */

   950 			guessquote-=3;

   951 			if (s[1]==CHAR_SPACE)  /* bonus marks! */

   952 			    guessquote-=2;

   953 		    }

   954 		}

   955 		/* it doesn't have a letter either side */

   956 		else if (strchr(".?!,;:",s[-1]) && strchr(".?!,;: ",s[1]))

   957 		    guessquote+=8; /* looks like a closequote */

   958 		else

   959 		    guessquote++;

   960 		if (counters->open_single_quote>counters->close_single_quote)

   961 		    /*

   962 		     * Give it the benefit of some doubt,

   963 		     * if a squote is already open.

   964 		     */

   965 		    guessquote++;

   966 		else

   967 		    guessquote--;

   968 		if (guessquote>=0)

   969 		    counters->close_single_quote++;

   970 	    }

   971 	}

   972 	if (*s!=CHAR_SPACE && *s!='-' && *s!='.' && *s!=CHAR_ASTERISK &&

   973 	  *s!=13 && *s!=10)

   974 	    isemptyline=0;  /* ignore lines like  *  *  *  as spacers */

   975 	if (*s==CHAR_UNDERSCORE)

   976 	    counters->c_unders++;

   977 	if (*s==CHAR_OPEN_CBRACK)

   978 	    counters->c_brack++;

   979 	if (*s==CHAR_CLOSE_CBRACK)

   980 	    counters->c_brack--;

   981 	if (*s==CHAR_OPEN_RBRACK)

   982 	    counters->r_brack++;

   983 	if (*s==CHAR_CLOSE_RBRACK)

   984 	    counters->r_brack--;

   985 	if (*s==CHAR_OPEN_SBRACK)

   986 	    counters->s_brack++;

   987 	if (*s==CHAR_CLOSE_SBRACK)

   988 	    counters->s_brack--;

   989 	s++;

   990     }

   991     return isemptyline;

   992 }

   994 /*

   995  * procfile:

   996  *

   997  * Process one file.

   998  */

   999 void procfile(char *filename)

  1000 {

  1001     char *s,*t,*s1,laststart,*wordstart;

  1002     char inword[MAXWORDLEN],testword[MAXWORDLEN];

  1003     char parastart[81];     /* first line of current para */

  1004     FILE *infile;

  1005     struct first_pass_results *first_pass_results;

  1006     struct warnings *warnings;

  1007     struct counters counters={0};

  1008     int isemptyline;

  1009     long squot,start_para_line;

  1010     signed int i,j,llen,isacro,isellipsis,istypo,alower,

  1011       eNon_A,eTab,eTilde,eAst,eFSlash,eCarat;

  1012     unsigned int lastlen,lastblen;

  1013     signed int dquotepar,squotepar;

  1014     signed int isnewpara,vowel,consonant;

  1015     char dquote_err[80],squote_err[80],rbrack_err[80],sbrack_err[80],

  1016       cbrack_err[80],unders_err[80];

  1017     signed int qword_index,qperiod_index,isdup;

  1018     signed int enddash;

  1019     laststart=CHAR_SPACE;

  1020     lastlen=lastblen=0;

  1021     *dquote_err=*squote_err=*rbrack_err=*cbrack_err=*sbrack_err=

  1022       *unders_err=*prevline=0;

  1023     linecnt=checked_linecnt=start_para_line=0;

  1024     squot=0;

  1025     i=llen=isacro=isellipsis=istypo=0;

  1026     isnewpara=vowel=consonant=enddash=0;

  1027     qword_index=qperiod_index=isdup=0;

  1028     *inword=*testword=0;

  1029     dquotepar=squotepar=0;

  1030     for (j=0;j<MAX_QWORD;j++)

  1031     {

  1032         dupcnt[j]=0;

  1033         for (i=0;i<MAX_QWORD_LENGTH;i++)

  1034 	{

  1035             qword[i][j]=0;

  1036             qperiod[i][j]=0;

  1037 	}

  1038     }

  1039     infile=fopen(filename,"rb");

  1040     if (!infile)

  1041     {

  1042         if (pswit[STDOUT_SWITCH])

  1043             fprintf(stdout,"bookloupe: cannot open %s\n",filename);

  1044         else

  1045             fprintf(stderr,"bookloupe: cannot open %s\n",filename);

  1046 	exit(1);

  1047     }

  1048     fprintf(stdout,"\n\nFile: %s\n\n",filename);

  1049     first_pass_results=first_pass(infile);

  1050     warnings=report_first_pass(first_pass_results);

  1051     rewind(infile);

  1052     /*

  1053      * Here we go with the main pass. Hold onto yer hat!

  1054      * Re-init some variables we've dirtied.

  1055      */

  1056     squot=linecnt=0;

  1057     laststart=CHAR_SPACE;

  1058     lastlen=lastblen=0;

  1059     while (flgets(aline,LINEBUFSIZE-1,infile,linecnt+1))

  1060     {

  1061         linecnt++;

  1062         if (linecnt==1)

  1063 	    isnewpara=1;

  1064         if (pswit[DP_SWITCH] && !strncmp(aline,"-----File: ",11))

  1065 	    continue;    // skip DP page separators completely

  1066         if (linecnt<first_pass_results->firstline ||

  1067 	  (first_pass_results->footerline>0 &&

  1068 	  linecnt>first_pass_results->footerline))

  1069 	{

  1070             if (pswit[HEADER_SWITCH])

  1071 	    {

  1072                 if (!strncmp(aline,"Title:",6))

  1073                     printf("    %s\n",aline);

  1074                 if (!strncmp(aline,"Author:",7))

  1075                     printf("    %s\n",aline);

  1076                 if (!strncmp(aline,"Release Date:",13))

  1077                     printf("    %s\n",aline);

  1078                 if (!strncmp(aline,"Edition:",8))

  1079                     printf("    %s\n\n",aline);

  1080 	    }

  1081             continue;                /* skip through the header */

  1082 	}

  1083         checked_linecnt++;

  1084         s=aline;

  1085         /*

  1086 	 * If we are in a state of unbalanced quotes, and this line

  1087          * doesn't begin with a quote, output the stored error message.

  1088          * If the -P switch was used, print the warning even if the

  1089          * new para starts with quotes.

  1090 	 */

  1091         t=s;

  1092         while (*t==' ')

  1093 	    t++;

  1094         if (*dquote_err)

  1095             if (*t!=CHAR_DQUOTE || pswit[QPARA_SWITCH])

  1096 	    {

  1097                 if (!pswit[OVERVIEW_SWITCH])

  1098 		{

  1099                     if (pswit[ECHO_SWITCH])

  1100 			printf("\n%s\n",parastart);

  1101                     printf(dquote_err);

  1102 		}

  1103                 else

  1104                     cnt_dquot++;

  1105             }

  1106         if (*squote_err)

  1107 	{

  1108             if (*t!=CHAR_SQUOTE && *t!=CHAR_OPEN_SQUOTE ||

  1109 	      pswit[QPARA_SWITCH] || squot)

  1110 	    {

  1111                 if (!pswit[OVERVIEW_SWITCH])

  1112 		{

  1113                     if (pswit[ECHO_SWITCH])

  1114 			printf("\n%s\n",parastart);

  1115                     printf(squote_err);

  1116 		}

  1117                 else

  1118                     cnt_squot++;

  1119 	    }

  1120             squot=0;

  1121 	}

  1122         if (*rbrack_err)

  1123 	{

  1124             if (!pswit[OVERVIEW_SWITCH])

  1125 	    {

  1126                 if (pswit[ECHO_SWITCH])

  1127 		    printf("\n%s\n",parastart);

  1128                 printf(rbrack_err);

  1129 	    }

  1130             else

  1131                 cnt_brack++;

  1132 	}

  1133         if (*sbrack_err)

  1134 	{

  1135             if (!pswit[OVERVIEW_SWITCH])

  1136 	    {

  1137                 if (pswit[ECHO_SWITCH])

  1138 		    printf("\n%s\n",parastart);

  1139                 printf(sbrack_err);

  1140 	    }

  1141             else

  1142                 cnt_brack++;

  1143 	}

  1144         if (*cbrack_err)

  1145 	{

  1146             if (!pswit[OVERVIEW_SWITCH])

  1147 	    {

  1148                 if (pswit[ECHO_SWITCH])

  1149 		    printf("\n%s\n",parastart);

  1150                 printf(cbrack_err);

  1151 	    }

  1152             else

  1153                 cnt_brack++;

  1154 	}

  1155         if (*unders_err)

  1156 	{

  1157             if (!pswit[OVERVIEW_SWITCH])

  1158 	    {

  1159                 if (pswit[ECHO_SWITCH])

  1160 		    printf("\n%s\n",parastart);

  1161                 printf(unders_err);

  1162 	    }

  1163             else

  1164                 cnt_brack++;

  1165 	}

  1166         *dquote_err=*squote_err=*rbrack_err=*cbrack_err=

  1167 	  *sbrack_err=*unders_err=0;

  1168 	isemptyline=analyse_quotes(aline,&counters);

  1169         if (isnewpara && !isemptyline)

  1170 	{

  1171 	    /* This line is the start of a new paragraph. */

  1172             start_para_line=linecnt;

  1173 	    /* Capture its first line in case we want to report it later. */

  1174             strncpy(parastart,aline,80);

  1175             parastart[79]=0;

  1176             dquotepar=squotepar=0; /* restart the quote count */

  1177             s=aline;

  1178             while (!gcisalpha(*s) && !gcisdigit(*s) && *s)

  1179 		s++;

  1180             if (*s>='a' && *s<='z')

  1181 	    {

  1182 		/* and its first letter is lowercase */

  1183                 if (pswit[ECHO_SWITCH])

  1184 		    printf("\n%s\n",aline);

  1185                 if (!pswit[OVERVIEW_SWITCH])

  1186                     printf("    Line %ld column %d - "

  1187 		      "Paragraph starts with lower-case\n",

  1188 		      linecnt,(int)(s-aline)+1);

  1189                 else

  1190                     cnt_punct++;

  1191 	    }

  1192             isnewpara=0; /* Signal the end of new para processing. */

  1193 	}

  1194         /* Check for an em-dash broken at line end. */

  1195         if (enddash && *aline=='-')

  1196 	{

  1197             if (pswit[ECHO_SWITCH])

  1198 		printf("\n%s\n",aline);

  1199             if (!pswit[OVERVIEW_SWITCH])

  1200                 printf("    Line %ld column 1 - Broken em-dash?\n",linecnt);

  1201             else

  1202                 cnt_punct++;

  1203 	}

  1204         enddash=0;

  1205         for (s=aline+strlen(aline)-1;*s==' ' && s>aline;s--)

  1206 	    ;

  1207         if (s>=aline && *s=='-')

  1208             enddash=1;

  1209 	/*

  1210          * Check for invalid or questionable characters in the line

  1211          * Anything above 127 is invalid for plain ASCII, and

  1212          * non-printable control characters should also be flagged.

  1213          * Tabs should generally not be there.

  1214 	 */

  1215         for (s=aline;*s;s++)

  1216 	{

  1217             i=(unsigned char)*s;

  1218             if (i<CHAR_SPACE && i!=CHAR_LF && i!=CHAR_CR && i!=CHAR_TAB)

  1219 	    {

  1220                 if (pswit[ECHO_SWITCH])

  1221 		    printf("\n%s\n",aline);

  1222                 if (!pswit[OVERVIEW_SWITCH])

  1223                     printf("    Line %ld column %d - Control character %d\n",

  1224 		      linecnt,(int)(s-aline)+1,i);

  1225                 else

  1226                     cnt_bin++;

  1227 	    }

  1228 	}

  1229         if (warnings->bin)

  1230 	{

  1231 	    /* Don't repeat multiple warnings on one line. */

  1232             eNon_A=eTab=eTilde=eCarat=eFSlash=eAst=0;

  1233             for (s=aline;*s;s++)

  1234 	    {

  1235                 if (!eNon_A &&

  1236 		  (*s<CHAR_SPACE && *s!=9 && *s!='\n' || (unsigned char)*s>127))

  1237 		{

  1238                     i=*s;  /* annoying kludge for signed chars */

  1239                     if (i<0)

  1240 			i+=256;

  1241                     if (pswit[ECHO_SWITCH])

  1242 			printf("\n%s\n",aline);

  1243                     if (!pswit[OVERVIEW_SWITCH])

  1244                         if (i>127 && i<160)

  1245                             printf("    Line %ld column %d - "

  1246 			      "Non-ISO-8859 character %d\n",

  1247 			      linecnt,(int)(s-aline)+1,i);

  1248                         else

  1249                             printf("    Line %ld column %d - "

  1250 			      "Non-ASCII character %d\n",

  1251 			      linecnt,(int)(s-aline)+1,i);

  1252                     else

  1253                         cnt_bin++;

  1254                     eNon_A=1;

  1255 		}

  1256                 if (!eTab && *s==CHAR_TAB)

  1257 		{

  1258                     if (pswit[ECHO_SWITCH])

  1259 			printf("\n%s\n",aline);

  1260                     if (!pswit[OVERVIEW_SWITCH])

  1261                         printf("    Line %ld column %d - Tab character?\n",

  1262 			  linecnt,(int)(s-aline)+1);

  1263                     else

  1264                         cnt_odd++;

  1265                     eTab=1;

  1266 		}

  1267                 if (!eTilde && *s==CHAR_TILDE)

  1268 		{

  1269 		    /*

  1270 		     * Often used by OCR software to indicate an

  1271 		     * unrecognizable character.

  1272 		     */

  1273                     if (pswit[ECHO_SWITCH])

  1274 			printf("\n%s\n",aline);

  1275                     if (!pswit[OVERVIEW_SWITCH])

  1276                         printf("    Line %ld column %d - Tilde character?\n",

  1277 			  linecnt,(int)(s-aline)+1);

  1278                     else

  1279                         cnt_odd++;

  1280                     eTilde=1;

  1281 		}

  1282                 if (!eCarat && *s==CHAR_CARAT)

  1283 		{

  1284                     if (pswit[ECHO_SWITCH])

  1285 			printf("\n%s\n",aline);

  1286                     if (!pswit[OVERVIEW_SWITCH])

  1287                         printf("    Line %ld column %d - Carat character?\n",

  1288 			  linecnt,(int)(s-aline)+1);

  1289                     else

  1290                         cnt_odd++;

  1291                     eCarat=1;

  1292 		}

  1293                 if (!eFSlash && *s==CHAR_FORESLASH && warnings->fslash)

  1294 		{

  1295                     if (pswit[ECHO_SWITCH])

  1296 			printf("\n%s\n",aline);

  1297                     if (!pswit[OVERVIEW_SWITCH])

  1298                         printf("    Line %ld column %d - Forward slash?\n",

  1299 			  linecnt,(int)(s-aline)+1);

  1300                     else

  1301                         cnt_odd++;

  1302                     eFSlash=1;

  1303 		}

  1304                 /*

  1305 		 * Report asterisks only in paranoid mode,

  1306 		 * since they're often deliberate.

  1307 		 */

  1308                 if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast &&

  1309 		  !isemptyline && *s==CHAR_ASTERISK)

  1310 		{

  1311                     if (pswit[ECHO_SWITCH])

  1312 			printf("\n%s\n",aline);

  1313                     if (!pswit[OVERVIEW_SWITCH])

  1314                         printf("    Line %ld column %d - Asterisk?\n",

  1315 			  linecnt,(int)(s-aline)+1);

  1316                     else

  1317                         cnt_odd++;

  1318                     eAst=1;

  1319 		}

  1320 	    }

  1321 	}

  1322         /* Check for line too long. */

  1323         if (warnings->longline)

  1324 	{

  1325             if (strlen(aline)>LONGEST_PG_LINE)

  1326 	    {

  1327                 if (pswit[ECHO_SWITCH])

  1328 		    printf("\n%s\n",aline);

  1329                 if (!pswit[OVERVIEW_SWITCH])

  1330                     printf("    Line %ld column %d - Long line %d\n",

  1331 		      linecnt,strlen(aline),strlen(aline));

  1332                 else

  1333                     cnt_long++;

  1334 	    }

  1335 	}

  1336         /*

  1337 	 * Check for line too short.

  1338          * This one is a bit trickier to implement: we don't want to

  1339          * flag the last line of a paragraph for being short, so we

  1340          * have to wait until we know that our current line is a

  1341          * "normal" line, then report the _previous_ line if it was too

  1342          * short. We also don't want to report indented lines like

  1343          * chapter heads or formatted quotations. We therefore keep

  1344          * lastlen as the length of the last line examined, and

  1345          * lastblen as the length of the last but one, and try to

  1346          * suppress unnecessary warnings by checking that both were of

  1347          * "normal" length. We keep the first character of the last

  1348          * line in laststart, and if it was a space, we assume that the

  1349          * formatting is deliberate. I can't figure out a way to

  1350          * distinguish something like a quoted verse left-aligned or

  1351          * the header or footer of a letter from a paragraph of short

  1352          * lines - maybe if I examined the whole paragraph, and if the

  1353          * para has less than, say, 8 lines and if all lines are short,

  1354          * then just assume it's OK? Need to look at some texts to see

  1355          * how often a formula like this would get the right result.

  1356 	 */

  1357         if (warnings->shortline && strlen(aline)>1 && lastlen>1 &&

  1358 	  lastlen<SHORTEST_PG_LINE && lastblen>1 && lastblen>SHORTEST_PG_LINE &&

  1359 	  laststart!=CHAR_SPACE)

  1360 	{

  1361 	    if (pswit[ECHO_SWITCH])

  1362 		printf("\n%s\n",prevline);

  1363 	    if (!pswit[OVERVIEW_SWITCH])

  1364 		printf("    Line %ld column %d - Short line %d?\n",

  1365 		  linecnt-1,strlen(prevline),strlen(prevline));

  1366 	    else

  1367 		cnt_short++;

  1368 	}

  1369         lastblen=lastlen;

  1370         lastlen=strlen(aline);

  1371         laststart=aline[0];

  1372         /* Look for punctuation other than full ellipses at start of line. */

  1373         if (*aline && strchr(".?!,;:",aline[0]) && strncmp(". . .",aline,5))

  1374 	{

  1375 	    if (pswit[ECHO_SWITCH])

  1376 		printf("\n%s\n",aline);

  1377 	    if (!pswit[OVERVIEW_SWITCH])

  1378 		printf("    Line %ld column 1 - Begins with punctuation?\n",

  1379 		  linecnt);

  1380 	    else

  1381 		cnt_punct++;

  1382 	}

  1383         /*

  1384 	 * Check for spaced em-dashes.

  1385          * We must check _all_ occurrences of "--" on the line

  1386          * hence the loop - even if the first double-dash is OK

  1387          * there may be another that's wrong later on.

  1388 	 */

  1389         if (warnings->dash)

  1390 	{

  1391             s=aline;

  1392             while (strstr(s,"--"))

  1393 	    {

  1394                 if (*(strstr(s,"--")-1)==CHAR_SPACE ||

  1395                    (*(strstr(s,"--")+2)==CHAR_SPACE))

  1396 		{

  1397                     if (pswit[ECHO_SWITCH])

  1398 			printf("\n%s\n",aline);

  1399                     if (!pswit[OVERVIEW_SWITCH])

  1400                         printf("    Line %ld column %d - Spaced em-dash?\n",

  1401 			  linecnt,(int)(strstr(s,"--")-aline)+1);

  1402                     else

  1403                         cnt_dash++;

  1404 		}

  1405                 s=strstr(s,"--")+2;

  1406 	    }

  1407 	}

  1408         /* Check for spaced dashes. */

  1409         if (warnings->dash)

  1410 	{

  1411             if (strstr(aline," -"))

  1412 	    {

  1413                 if (*(strstr(aline," -")+2)!='-')

  1414 		{

  1415                     if (pswit[ECHO_SWITCH])

  1416 			printf("\n%s\n",aline);

  1417                     if (!pswit[OVERVIEW_SWITCH])

  1418                         printf("    Line %ld column %d - Spaced dash?\n",

  1419 			  linecnt,(int)(strstr(aline," -")-aline)+1);

  1420                     else

  1421                         cnt_dash++;

  1422 		}

  1423 	    }

  1424             else if (strstr(aline,"- "))

  1425 	    {

  1426 		if (*(strstr(aline,"- ")-1)!='-')

  1427 		{

  1428 		    if (pswit[ECHO_SWITCH])

  1429 			printf("\n%s\n",aline);

  1430 		    if (!pswit[OVERVIEW_SWITCH])

  1431 			printf("    Line %ld column %d - Spaced dash?\n",

  1432 			  linecnt,(int)(strstr(aline,"- ")-aline)+1);

  1433 		    else

  1434 			cnt_dash++;

  1435 		}

  1436 	    }

  1437 	}

  1438         /*

  1439 	 * Check for unmarked paragraphs indicated by separate speakers.

  1440          * May well be false positive:

  1441          * "Bravo!" "Wonderful!" called the crowd.

  1442          * but useful all the same.

  1443 	 */

  1444         s=wrk;

  1445         *s=0;

  1446         if (strstr(aline,"\" \""))

  1447 	    s=strstr(aline,"\" \"");

  1448         if (strstr(aline,"\"  \""))

  1449 	    s=strstr(aline,"\"  \"");

  1450         if (*s)

  1451 	{

  1452             if (pswit[ECHO_SWITCH])

  1453 		printf("\n%s\n",aline);

  1454             if (!pswit[OVERVIEW_SWITCH])

  1455                 printf("    Line %ld column %d - "

  1456 		  "Query missing paragraph break?\n",

  1457 		  linecnt,(int)(s-aline)+1);

  1458             else

  1459                 cnt_punct++;

  1460 	}

  1461         /*

  1462 	 * Check for "to he" and other easy he/be errors.

  1463          * This is a very inadequate effort on the he/be problem,

  1464          * but the phrase "to he" is almost always an error, whereas "to

  1465          * be" is quite common.

  1466          * Similarly, '"Quiet!", be said.' is a non-be error

  1467          * "to he" is _not_ always an error!:

  1468          *       "Where they went to he couldn't say."

  1469          * Another false positive:

  1470          *       What would "Cinderella" be without the . . .

  1471          * and another: "If he wants to he can see for himself."

  1472 	 */

  1473         s=wrk;

  1474         *s=0;

  1475         if (strstr(aline," to he "))

  1476 	    s=strstr(aline," to he ");

  1477         if (strstr(aline,"\" be "))

  1478 	    s=strstr(aline,"\" be ");

  1479         if (strstr(aline,"\", be "))

  1480 	    s=strstr(aline,"\", be ");

  1481         if (strstr(aline," is be "))

  1482 	    s=strstr(aline," is be ");

  1483         if (strstr(aline," be is "))

  1484 	    s=strstr(aline," be is ");

  1485         if (strstr(aline," was be "))

  1486 	    s=strstr(aline," was be ");

  1487         if (strstr(aline," be would "))

  1488 	    s=strstr(aline," be would ");

  1489         if (strstr(aline," be could "))

  1490 	    s=strstr(aline," be could ");

  1491         if (*s)

  1492 	{

  1493             if (pswit[ECHO_SWITCH])

  1494 		printf("\n%s\n",aline);

  1495             if (!pswit[OVERVIEW_SWITCH])

  1496                 printf("    Line %ld column %d - Query he/be error?\n",

  1497 		  linecnt,(int)(s-aline)+1);

  1498             else

  1499                 cnt_word++;

  1500 	}

  1501         s=wrk;

  1502         *s=0;

  1503         if (strstr(aline," i bad "))

  1504 	    s=strstr(aline," i bad ");

  1505         if (strstr(aline," you bad "))

  1506 	    s=strstr(aline," you bad ");

  1507         if (strstr(aline," he bad "))

  1508 	    s=strstr(aline," he bad ");

  1509         if (strstr(aline," she bad "))

  1510 	    s=strstr(aline," she bad ");

  1511         if (strstr(aline," they bad "))

  1512 	    s=strstr(aline," they bad ");

  1513         if (strstr(aline," a had "))

  1514 	    s=strstr(aline," a had ");

  1515         if (strstr(aline," the had "))

  1516 	    s=strstr(aline," the had ");

  1517         if (*s)

  1518 	{

  1519             if (pswit[ECHO_SWITCH])

  1520 		printf("\n%s\n",aline);

  1521             if (!pswit[OVERVIEW_SWITCH])

  1522                 printf("    Line %ld column %d - Query had/bad error?\n",

  1523 		  linecnt,(int)(s-aline)+1);

  1524             else

  1525                 cnt_word++;

  1526 	}

  1527         s=wrk;

  1528         *s=0;

  1529         if (strstr(aline,", hut "))

  1530 	    s=strstr(aline,", hut ");

  1531         if (strstr(aline,"; hut "))

  1532 	    s=strstr(aline,"; hut ");

  1533         if (*s)

  1534 	{

  1535             if (pswit[ECHO_SWITCH])

  1536 		printf("\n%s\n",aline);

  1537             if (!pswit[OVERVIEW_SWITCH])

  1538                 printf("    Line %ld column %d - Query hut/but error?\n",

  1539 		  linecnt,(int)(s-aline)+1);

  1540             else

  1541                 cnt_word++;

  1542 	}

  1543         /*

  1544 	 * Special case - angled bracket in front of "From" placed there by an

  1545 	 * MTA when sending an e-mail.

  1546 	 */

  1547         if (strstr(aline,">From"))

  1548 	{

  1549             if (pswit[ECHO_SWITCH])

  1550 		printf("\n%s\n",aline);

  1551             if (!pswit[OVERVIEW_SWITCH])

  1552                 printf("    Line %ld column %d - "

  1553 		  "Query angled bracket with From\n",

  1554 		  linecnt,(int)(strstr(aline,">From")-aline)+1);

  1555             else

  1556                 cnt_punct++;

  1557 	}

  1558         /*

  1559 	 * Check for a single character line -

  1560 	 * often an overflow from bad wrapping.

  1561 	 */

  1562         if (*aline && !aline[1])

  1563 	{

  1564             if (*aline=='I' || *aline=='V' || *aline=='X' || *aline=='L' ||

  1565 	      gcisdigit(*aline))

  1566                 ; /* Nothing - ignore numerals alone on a line. */

  1567             else

  1568 	    {

  1569                 if (pswit[ECHO_SWITCH])

  1570 		    printf("\n%s\n",aline);

  1571                 if (!pswit[OVERVIEW_SWITCH])

  1572                     printf("    Line %ld column 1 - "

  1573 		      "Query single character line\n",linecnt);

  1574                 else

  1575                     cnt_punct++;

  1576 	    }

  1577 	}

  1578         /* Check for I" - often should be ! */

  1579         if (strstr(aline," I\""))

  1580 	{

  1581             if (pswit[ECHO_SWITCH])

  1582 		printf("\n%s\n",aline);

  1583             if (!pswit[OVERVIEW_SWITCH])

  1584                 printf("    Line %ld column %ld - Query I=exclamation mark?\n",

  1585 		  linecnt,strstr(aline," I\"")-aline);

  1586             else

  1587                 cnt_punct++;

  1588 	}

  1589         /*

  1590 	 * Check for period without a capital letter. Cut-down from gutspell.

  1591          * Only works when it happens on a single line.

  1592 	 */

  1593         if (pswit[PARANOID_SWITCH])

  1594 	{

  1595             for (t=s=aline;strstr(t,". ");)

  1596 	    {

  1597                 t=strstr(t,". ");

  1598                 if (t==s)

  1599 		{

  1600                     t++;

  1601 		    /* start of line punctuation is handled elsewhere */

  1602                     continue;

  1603 		}

  1604                 if (!gcisalpha(t[-1]))

  1605 		{

  1606                     t++;

  1607                     continue;

  1608 		}

  1609                 if (warnings->isDutch)

  1610 		{

  1611 		    /* For Frank & Jeroen -- 's Middags case */

  1612                     if (t[2]==CHAR_SQUOTE && t[3]>='a' && t[3]<='z' &&

  1613 		      t[4]==CHAR_SPACE && t[5]>='A' && t[5]<='Z')

  1614 		    {

  1615                         t++;

  1616                         continue;

  1617 		    }

  1618 		}

  1619                 s1=t+2;

  1620                 while (*s1 && !gcisalpha(*s1) && !isdigit(*s1))

  1621                     s1++;

  1622                 if (*s1>='a' && *s1<='z')

  1623 		{

  1624 		    /* we have something to investigate */

  1625                     istypo=1;

  1626 		    /* so let's go back and find out */

  1627                     for (s1=t-1;s1>=s &&

  1628 		      (gcisalpha(*s1) || gcisdigit(*s1) || *s1==CHAR_SQUOTE &&

  1629 		      gcisalpha(s1[1]) && gcisalpha(s1[-1]));s1--)

  1630 			;

  1631                     s1++;

  1632                     for (i=0;*s1 && *s1!='.';s1++,i++)

  1633                         testword[i]=*s1;

  1634                     testword[i]=0;

  1635                     for (i=0;*abbrev[i];i++)

  1636                         if (!strcmp(testword,abbrev[i]))

  1637                             istypo=0;

  1638                     if (gcisdigit(*testword))

  1639 			istypo=0;

  1640                     if (!testword[1])

  1641 			istypo=0;

  1642                     if (isroman(testword))

  1643 			istypo=0;

  1644                     if (istypo)

  1645 		    {

  1646                         istypo=0;

  1647                         for (i=0;testword[i];i++)

  1648                             if (strchr(vowels,testword[i]))

  1649                                 istypo=1;

  1650 		    }

  1651                     if (istypo)

  1652 		    {

  1653                         isdup=0;

  1654                         if (strlen(testword)<MAX_QWORD_LENGTH &&

  1655 			  !pswit[VERBOSE_SWITCH])

  1656                             for (i=0;i<qperiod_index;i++)

  1657                                 if (!strcmp(testword,qperiod[i]))

  1658                                     isdup=1;

  1659                         if (!isdup)

  1660 			{

  1661                             if (qperiod_index<MAX_QWORD &&

  1662 			      strlen(testword)<MAX_QWORD_LENGTH)

  1663 			    {

  1664                                 strcpy(qperiod[qperiod_index],testword);

  1665                                 qperiod_index++;

  1666 			    }

  1667                             if (pswit[ECHO_SWITCH])

  1668 				printf("\n%s\n",aline);

  1669                             if (!pswit[OVERVIEW_SWITCH])

  1670                                 printf("    Line %ld column %d - "

  1671 				  "Extra period?\n",linecnt,(int)(t-aline)+1);

  1672                             else

  1673                                 cnt_punct++;

  1674 			}

  1675 		    }

  1676 		}

  1677 	    t++;

  1678 	    }

  1679 	}

  1680         if (pswit[TYPO_SWITCH])

  1681 	{

  1682             /* Check for words usually not followed by punctuation. */

  1683             for (s=aline;*s;)

  1684 	    {

  1685                 wordstart=s;

  1686                 s=getaword(s,inword);

  1687                 if (!*inword)

  1688 		    continue;

  1689                 lowerit(inword);

  1690                 for (i=0;*nocomma[i];i++)

  1691                     if (!strcmp(inword,nocomma[i]))

  1692 		    {

  1693                         if (*s==',' || *s==';' || *s==':')

  1694 			{

  1695                             if (pswit[ECHO_SWITCH])

  1696 				printf("\n%s\n",aline);

  1697                             if (!pswit[OVERVIEW_SWITCH])

  1698                                 printf("    Line %ld column %d - "

  1699 				  "Query punctuation after %s?\n",

  1700 				  linecnt,(int)(s-aline)+1,inword);

  1701                             else

  1702                                 cnt_punct++;

  1703 			}

  1704 		    }

  1705 		for (i=0;*noperiod[i];i++)

  1706                     if (!strcmp(inword,noperiod[i]))

  1707 		    {

  1708                         if (*s=='.' || *s=='!')

  1709 			{

  1710                             if (pswit[ECHO_SWITCH])

  1711 				printf("\n%s\n",aline);

  1712                             if (!pswit[OVERVIEW_SWITCH])

  1713                                 printf("    Line %ld column %d - "

  1714 				  "Query punctuation after %s?\n",

  1715 				  linecnt,(int)(s-aline)+1,inword);

  1716                             else

  1717                                 cnt_punct++;

  1718 			}

  1719 		    }

  1720 	    }

  1721 	}

  1722         /*

  1723 	 * Check for commonly mistyped words,

  1724 	 * and digits like 0 for O in a word.

  1725 	 */

  1726         for (s=aline;*s;)

  1727 	{

  1728             wordstart=s;

  1729             s=getaword(s,inword);

  1730             if (!*inword)

  1731 		continue; /* don't bother with empty lines */

  1732             if (mixdigit(inword))

  1733 	    {

  1734                 if (pswit[ECHO_SWITCH])

  1735 		    printf("\n%s\n",aline);

  1736                 if (!pswit[OVERVIEW_SWITCH])

  1737                     printf("    Line %ld column %d - Query digit in %s\n",

  1738 		      linecnt,(int)(wordstart-aline)+1,inword);

  1739                 else

  1740                     cnt_word++;

  1741 	    }

  1742             /*

  1743 	     * Put the word through a series of tests for likely typos and OCR

  1744 	     * errors.

  1745 	     */

  1746             if (pswit[TYPO_SWITCH])

  1747 	    {

  1748                 istypo=0;

  1749                 strcpy(testword,inword);

  1750                 alower=0;

  1751                 for (i=0;i<(signed int)strlen(testword);i++)

  1752 		{

  1753 		    /* lowercase for testing */

  1754                     if (testword[i]>='a' && testword[i]<='z')

  1755 			alower=1;

  1756                     if (alower && testword[i]>='A' && testword[i]<='Z')

  1757 		    {

  1758                         /*

  1759 			 * We have an uppercase mid-word. However, there are

  1760 			 * common cases:

  1761                          *   Mac and Mc like McGill

  1762                          *   French contractions like l'Abbe

  1763 			 */

  1764                         if (i==2 && testword[0]=='m' && testword[1]=='c' ||

  1765                           i==3 && testword[0]=='m' && testword[1]=='a' &&

  1766 			  testword[2]=='c' || i>0 && testword[i-1]==CHAR_SQUOTE)

  1767 			    ; /* do nothing! */

  1768                         else

  1769                             istypo=1;

  1770 		    }

  1771                     testword[i]=(char)tolower(testword[i]);

  1772 		}

  1773                 /*

  1774 		 * Check for certain unlikely two-letter combinations at word

  1775 		 * start and end.

  1776 		 */

  1777                 if (strlen(testword)>1)

  1778 		{

  1779                     for (i=0;*nostart[i];i++)

  1780                         if (!strncmp(testword,nostart[i],2))

  1781                             istypo=1;

  1782                     for (i=0;*noend[i];i++)

  1783                         if (!strncmp(testword+strlen(testword)-2,noend[i],2))

  1784                             istypo=1;

  1785 		}

  1786                 /* ght is common, gbt never. Like that. */

  1787                 if (strstr(testword,"cb"))

  1788 		    istypo=1;

  1789                 if (strstr(testword,"gbt"))

  1790 		    istypo=1;

  1791                 if (strstr(testword,"pbt"))

  1792 		    istypo=1;

  1793                 if (strstr(testword,"tbs"))

  1794 		    istypo=1;

  1795                 if (strstr(testword,"mrn"))

  1796 		    istypo=1;

  1797                 if (strstr(testword,"ahle"))

  1798 		    istypo=1;

  1799                 if (strstr(testword,"ihle"))

  1800 		    istypo=1;

  1801                 /*

  1802 		 * "TBE" does happen - like HEARTBEAT - but uncommon.

  1803                  * Also "TBI" - frostbite, outbid - but uncommon.

  1804                  * Similarly "ii" like Hawaii, or Pompeii, and in Roman

  1805 		 * numerals, but "ii" is a common scanno.

  1806 		 */

  1807                 if (strstr(testword,"tbi"))

  1808 		    istypo=1;

  1809                 if (strstr(testword,"tbe"))

  1810 		    istypo=1;

  1811                 if (strstr(testword,"ii"))

  1812 		    istypo=1;

  1813                 /*

  1814 		 * Check for no vowels or no consonants.

  1815                  * If none, flag a typo.

  1816 		 */

  1817                 if (!istypo && strlen(testword)>1)

  1818 		{

  1819                     vowel=consonant=0;

  1820                     for (i=0;testword[i];i++)

  1821 		    {

  1822                         if (testword[i]=='y' || gcisdigit(testword[i]))

  1823 			{

  1824 			    /* Yah, this is loose. */

  1825                             vowel++;

  1826                             consonant++;

  1827 			}

  1828                         else if (strchr(vowels,testword[i]))

  1829 			    vowel++;

  1830 			else

  1831 			    consonant++;

  1832 		    }

  1833                     if (!vowel || !consonant)

  1834                         istypo=1;

  1835 		}

  1836                 /*

  1837 		 * Now exclude the word from being reported if it's in

  1838                  * the okword list.

  1839 		 */

  1840                 for (i=0;*okword[i];i++)

  1841                     if (!strcmp(testword,okword[i]))

  1842                         istypo=0;

  1843                 /*

  1844 		 * What looks like a typo may be a Roman numeral.

  1845 		 * Exclude these.

  1846 		 */

  1847                 if (istypo && isroman(testword))

  1848 		    istypo=0;

  1849                 /* Check the manual list of typos. */

  1850                 if (!istypo)

  1851                     for (i=0;*typo[i];i++)

  1852                         if (!strcmp(testword,typo[i]))

  1853                             istypo=1;

  1854                 /*

  1855 		 * Check lowercase s, l, i and m - special cases.

  1856                  *   "j" - often a semi-colon gone wrong.

  1857                  *   "d" for a missing apostrophe - he d

  1858                  *   "n" for "in"

  1859 		 */

  1860                 if (!istypo && strlen(testword)==1 && strchr("slmijdn",*inword))

  1861 		    istypo=1;

  1862                 if (istypo)

  1863 		{

  1864                     isdup=0;

  1865                     if (strlen(testword)<MAX_QWORD_LENGTH &&

  1866 		      !pswit[VERBOSE_SWITCH])

  1867                         for (i=0;i<qword_index;i++)

  1868                             if (!strcmp(testword,qword[i]))

  1869 			    {

  1870                                 isdup=1;

  1871                                 ++dupcnt[i];

  1872 			    }

  1873                     if (!isdup)

  1874 		    {

  1875                         if (qword_index<MAX_QWORD &&

  1876 			  strlen(testword)<MAX_QWORD_LENGTH)

  1877 			{

  1878                             strcpy(qword[qword_index],testword);

  1879                             qword_index++;

  1880 			}

  1881                         if (pswit[ECHO_SWITCH])

  1882 			    printf("\n%s\n",aline);

  1883                         if (!pswit[OVERVIEW_SWITCH])

  1884 			{

  1885                             printf("    Line %ld column %d - Query word %s",

  1886 			      linecnt,(int)(wordstart-aline)+1,inword);

  1887                             if (strlen(testword)<MAX_QWORD_LENGTH &&

  1888 			      !pswit[VERBOSE_SWITCH])

  1889                                 printf(" - not reporting duplicates");

  1890                             printf("\n");

  1891 			}

  1892                         else

  1893                             cnt_word++;

  1894 		    }

  1895 		}

  1896 	    }

  1897 	    /* check the user's list of typos */

  1898 	    if (!istypo && usertypo_count)

  1899 		for (i=0;i<usertypo_count;i++)

  1900 		    if (!strcmp(testword,usertypo[i]))

  1901 		    {

  1902 			if (pswit[ECHO_SWITCH])

  1903 			    printf("\n%s\n",aline);

  1904 			if (!pswit[OVERVIEW_SWITCH])

  1905 			    printf("    Line %ld column %d - "

  1906 			      "Query possible scanno %s\n",

  1907 			      linecnt,(int)(wordstart-aline)+2,inword);

  1908 		    }

  1909             if (pswit[PARANOID_SWITCH] && warnings->digit)

  1910 	    {

  1911 		/* In paranoid mode, query all 0 and 1 standing alone. */

  1912                 if (!strcmp(inword,"0") || !strcmp(inword,"1"))

  1913 		{

  1914                     if (pswit[ECHO_SWITCH])

  1915 			printf("\n%s\n",aline);

  1916                     if (!pswit[OVERVIEW_SWITCH])

  1917                         printf("    Line %ld column %d - Query standalone %s\n",

  1918 			  linecnt,(int)(wordstart-aline)+2,inword);

  1919                     else

  1920                         cnt_word++;

  1921 		}

  1922 	    }

  1923 	}

  1924 	/*

  1925          * Look for added or missing spaces around punctuation and quotes.

  1926          * If there is a punctuation character like ! with no space on

  1927          * either side, suspect a missing!space. If there are spaces on

  1928          * both sides , assume a typo. If we see a double quote with no

  1929          * space or punctuation on either side of it, assume unspaced

  1930          * quotes "like"this.

  1931 	 */

  1932         llen=strlen(aline);

  1933         for (i=1;i<llen;i++)

  1934 	{

  1935 	    /* For each character in the line after the first. */

  1936             if (strchr(".?!,;:_",aline[i]))  /* if it's punctuation */

  1937 	    {

  1938 		/* we need to suppress warnings for acronyms like M.D. */

  1939                 isacro=0;

  1940 		/* we need to suppress warnings for ellipsis . . . */

  1941                 isellipsis=0;

  1942 		/* if there are letters on both sides of it or ... */

  1943                 if (gcisalpha(aline[i-1]) && gcisalpha(aline[i+1]) ||

  1944                    gcisalpha(aline[i+1]) && strchr("?!,;:",aline[i]))

  1945 		{

  1946 		    /* ...if it's strict punctuation followed by an alpha */

  1947                     if (aline[i]=='.')

  1948 		    {

  1949                         if (i>2 && aline[i-2]=='.')

  1950 			    isacro=1;

  1951                         if (i+2<llen && aline[i+2]=='.')

  1952 			    isacro=1;

  1953 		    }

  1954                     if (!isacro)

  1955 		    {

  1956                         if (pswit[ECHO_SWITCH])

  1957 			    printf("\n%s\n",aline);

  1958                         if (!pswit[OVERVIEW_SWITCH])

  1959                             printf("    Line %ld column %d - Missing space?\n",

  1960 			      linecnt,i+1);

  1961                         else

  1962                             cnt_punct++;

  1963 		    }

  1964 		}

  1965                 if (aline[i-1]==CHAR_SPACE &&

  1966 		  (aline[i+1]==CHAR_SPACE || aline[i+1]==0))

  1967 		{

  1968 		    /*

  1969 		     * If there are spaces on both sides,

  1970 		     * or space before and end of line.

  1971 		     */

  1972                     if (aline[i]=='.')

  1973 		    {

  1974                         if (i>2 && aline[i-2]=='.')

  1975 			    isellipsis=1;

  1976                         if (i+2<llen && aline[i+2]=='.')

  1977 			    isellipsis=1;

  1978 		    }

  1979                     if (!isemptyline && !isellipsis)

  1980 		    {

  1981                         if (pswit[ECHO_SWITCH])

  1982 			    printf("\n%s\n",aline);

  1983                         if (!pswit[OVERVIEW_SWITCH])

  1984                             printf("    Line %ld column %d - "

  1985 			      "Spaced punctuation?\n",linecnt,i+1);

  1986                         else

  1987                             cnt_punct++;

  1988 		    }

  1989 		}

  1990 	    }

  1991 	}

  1992         /* Split out the characters that CANNOT be preceded by space. */

  1993         llen=strlen(aline);

  1994         for (i=1;i<llen;i++)

  1995 	{

  1996 	    /* for each character in the line after the first */

  1997             if (strchr("?!,;:",aline[i]))

  1998 	    {

  1999 		/* if it's punctuation that _cannot_ have a space before it */

  2000                 if (aline[i-1]==CHAR_SPACE && !isemptyline &&

  2001 		  aline[i+1]!=CHAR_SPACE)

  2002 		{

  2003 		    /*

  2004 		     * If aline[i+1] DOES == space,

  2005 		     * it was already reported just above.

  2006 		     */

  2007                     if (pswit[ECHO_SWITCH])

  2008 			printf("\n%s\n",aline);

  2009                     if (!pswit[OVERVIEW_SWITCH])

  2010                         printf("    Line %ld column %d - Spaced punctuation?\n",

  2011 			  linecnt,i+1);

  2012                     else

  2013                         cnt_punct++;

  2014 		}

  2015 	    }

  2016 	}

  2017         /*

  2018 	 * Special case " .X" where X is any alpha.

  2019          * This plugs a hole in the acronym code above.

  2020 	 * Inelegant, but maintainable.

  2021 	 */

  2022         llen=strlen(aline);

  2023         for (i=1;i<llen;i++)

  2024 	{

  2025 	    /* for each character in the line after the first */

  2026             if (aline[i]=='.')

  2027 	    {

  2028 		/* if it's a period */

  2029                 if (aline[i-1]==CHAR_SPACE && gcisalpha(aline[i+1]))

  2030 		{

  2031 		    /*

  2032 		     * If the period follows a space and

  2033 		     * is followed by a letter.

  2034 		     */

  2035                     if (pswit[ECHO_SWITCH])

  2036 			printf("\n%s\n",aline);

  2037                     if (!pswit[OVERVIEW_SWITCH])

  2038                         printf("    Line %ld column %d - Spaced punctuation?\n",

  2039 			  linecnt,i+1);

  2040                     else

  2041                         cnt_punct++;

  2042 		}

  2043 	    }

  2044 	}

  2045         for (i=1;i<llen;i++)

  2046 	{

  2047 	    /* for each character in the line after the first */

  2048             if (aline[i]==CHAR_DQUOTE)

  2049 	    {

  2050                 if (!strchr(" _-.'`,;:!/([{?}])",aline[i-1]) &&

  2051 		  !strchr(" _-.'`,;:!/([{?}])",aline[i+1]) && aline[i+1] ||

  2052 		  !strchr(" _-([{'`",aline[i-1]) && gcisalpha(aline[i+1]))

  2053 		{

  2054 		    if (pswit[ECHO_SWITCH])

  2055 			printf("\n%s\n",aline);

  2056 		    if (!pswit[OVERVIEW_SWITCH])

  2057 			printf("    Line %ld column %d - Unspaced quotes?\n",

  2058 			  linecnt,i+1);

  2059 		    else

  2060 			cnt_punct++;

  2061 		}

  2062 	    }

  2063 	}

  2064         /* Check parity of quotes. */

  2065         for (s=aline;*s;s++)

  2066 	{

  2067             if (*s==CHAR_DQUOTE)

  2068 	    {

  2069                 if (!(dquotepar=!dquotepar))

  2070 		{

  2071 		    /* parity even */

  2072                     if (!strchr("_-.'`/,;:!?)]} ",s[1]))

  2073 		    {

  2074                         if (pswit[ECHO_SWITCH])

  2075 			    printf("\n%s\n",aline);

  2076                         if (!pswit[OVERVIEW_SWITCH])

  2077                             printf("    Line %ld column %d - "

  2078 			      "Wrongspaced quotes?\n",linecnt,(int)(s-aline)+1);

  2079                         else

  2080                             cnt_punct++;

  2081 		    }

  2082 		}

  2083                 else

  2084 		{

  2085 		    /* parity odd */

  2086                     if (!gcisalpha(s[1]) && !isdigit(s[1]) &&

  2087 		      !strchr("_-/.'`([{$",s[1]) || !s[1])

  2088 		    {

  2089                         if (pswit[ECHO_SWITCH])

  2090 			    printf("\n%s\n",aline);

  2091                         if (!pswit[OVERVIEW_SWITCH])

  2092                             printf("    Line %ld column %d - "

  2093 			      "Wrongspaced quotes?\n",linecnt,(int)(s-aline)+1);

  2094                         else

  2095                             cnt_punct++;

  2096 		    }

  2097 		}

  2098 	    }

  2099 	}

  2100 	if (*aline==CHAR_DQUOTE)

  2101 	{

  2102 	    if (strchr(",;:!?)]} ",aline[1]))

  2103 	    {

  2104 		if (pswit[ECHO_SWITCH])

  2105 		    printf("\n%s\n",aline);

  2106 		if (!pswit[OVERVIEW_SWITCH])

  2107 		    printf("    Line %ld column 1 - Wrongspaced quotes?\n",

  2108 		      linecnt);

  2109 		else

  2110 		    cnt_punct++;

  2111 	    }

  2112 	}

  2113         if (pswit[SQUOTE_SWITCH])

  2114 	{

  2115             for (s=aline;*s;s++)

  2116 	    {

  2117                 if ((*s==CHAR_SQUOTE || *s==CHAR_OPEN_SQUOTE) &&

  2118 		  (s==aline || s>aline && !gcisalpha(s[-1]) ||

  2119 		  !gcisalpha(s[1])))

  2120 		{

  2121                     if (!(squotepar=!squotepar))

  2122 		    {

  2123 			/* parity even */

  2124                         if (!strchr("_-.'`/\",;:!?)]} ",s[1]))

  2125 			{

  2126                             if (pswit[ECHO_SWITCH])

  2127 				printf("\n%s\n",aline);

  2128                             if (!pswit[OVERVIEW_SWITCH])

  2129                                 printf("    Line %ld column %d - "

  2130 				  "Wrongspaced singlequotes?\n",

  2131 				  linecnt,(int)(s-aline)+1);

  2132                             else

  2133                                 cnt_punct++;

  2134 			}

  2135 		    }

  2136                     else

  2137 		    {

  2138 			/* parity odd */

  2139                         if (!gcisalpha(s[1]) && !isdigit(s[1]) &&

  2140 			  !strchr("_-/\".'`",s[1]) || !s[1])

  2141 			{

  2142                             if (pswit[ECHO_SWITCH])

  2143 				printf("\n%s\n",aline);

  2144                             if (!pswit[OVERVIEW_SWITCH])

  2145                                 printf("    Line %ld column %d - "

  2146 				  "Wrongspaced singlequotes?\n",

  2147 				  linecnt,(int)(s-aline)+1);

  2148                             else

  2149                                 cnt_punct++;

  2150 			}

  2151 		    }

  2152 		}

  2153 	    }

  2154 	}

  2155         /*

  2156 	 * Look for double punctuation like ,. or ,,

  2157          * Thanks to DW for the suggestion!

  2158          * In books with references, ".," and ".;" are common

  2159          * e.g. "etc., etc.," and vol. 1.; vol 3.;

  2160          * OTOH, from my initial tests, there are also fairly

  2161          * common errors. What to do? Make these cases paranoid?

  2162          * ".," is the most common, so warnings->dotcomma is used

  2163          * to suppress detailed reporting if it occurs often.

  2164 	 */

  2165         llen=strlen(aline);

  2166         for (i=0;i<llen;i++)

  2167 	{

  2168 	    /* for each punctuation character in the line */

  2169             if (strchr(".?!,;:",aline[i]) && (strchr(".?!,;:",aline[i+1])) &&

  2170 	      aline[i] && aline[i+1])

  2171 	    {

  2172 		/* followed by punctuation, it's a query, unless . . . */

  2173                 if (aline[i]==aline[i+1] && (aline[i]=='.' || aline[i]=='?' ||

  2174 		  aline[i]=='!') ||

  2175 		  !warnings->dotcomma && aline[i]=='.' && aline[i+1]==',' ||

  2176 		  warnings->isFrench && !strncmp(aline+i,",...",4) ||

  2177 		  warnings->isFrench && !strncmp(aline+i,"...,",4) ||

  2178 		  warnings->isFrench && !strncmp(aline+i,";...",4) ||

  2179 		  warnings->isFrench && !strncmp(aline+i,"...;",4) ||

  2180 		  warnings->isFrench && !strncmp(aline+i,":...",4) ||

  2181 		  warnings->isFrench && !strncmp(aline+i,"...:",4) ||

  2182 		  warnings->isFrench && !strncmp(aline+i,"!...",4) ||

  2183 		  warnings->isFrench && !strncmp(aline+i,"...!",4) ||

  2184 		  warnings->isFrench && !strncmp(aline+i,"?...",4) ||

  2185 		  warnings->isFrench && !strncmp(aline+i,"...?",4))

  2186 		{

  2187 		    if (warnings->isFrench && !strncmp(aline+i,",...",4) ||

  2188 		      warnings->isFrench && !strncmp(aline+i,"...,",4) ||

  2189 		      warnings->isFrench && !strncmp(aline+i,";...",4) ||

  2190 		      warnings->isFrench && !strncmp(aline+i,"...;",4) ||

  2191 		      warnings->isFrench && !strncmp(aline+i,":...",4) ||

  2192 		      warnings->isFrench && !strncmp(aline+i,"...:",4) ||

  2193 		      warnings->isFrench && !strncmp(aline+i,"!...",4) ||

  2194 		      warnings->isFrench && !strncmp(aline+i,"...!",4) ||

  2195 		      warnings->isFrench && !strncmp(aline+i,"?...",4) ||

  2196 		      warnings->isFrench && !strncmp(aline+i,"...?",4))

  2197 			i+=4;

  2198 		    ; /* do nothing for .. !! and ?? which can be legit */

  2199 		}

  2200                 else

  2201 		{

  2202                     if (pswit[ECHO_SWITCH])

  2203 			printf("\n%s\n",aline);

  2204                     if (!pswit[OVERVIEW_SWITCH])

  2205                         printf("    Line %ld column %d - Double punctuation?\n",

  2206 			  linecnt,i+1);

  2207                     else

  2208                         cnt_punct++;

  2209 		}

  2210 	    }

  2211 	}

  2212         s=aline;

  2213         while (strstr(s," \" "))

  2214 	{

  2215             if (pswit[ECHO_SWITCH])

  2216 		printf("\n%s\n",aline);

  2217             if (!pswit[OVERVIEW_SWITCH])

  2218                 printf("    Line %ld column %d - Spaced doublequote?\n",

  2219 		  linecnt,(int)(strstr(s," \" ")-aline+1));

  2220             else

  2221                 cnt_punct++;

  2222             s=strstr(s," \" ")+2;

  2223 	}

  2224         s=aline;

  2225         while (strstr(s," ' "))

  2226 	{

  2227             if (pswit[ECHO_SWITCH])

  2228 		printf("\n%s\n",aline);

  2229             if (!pswit[OVERVIEW_SWITCH])

  2230                 printf("    Line %ld column %d - Spaced singlequote?\n",

  2231 		  linecnt,(int)(strstr(s," ' ")-aline+1));

  2232             else

  2233                 cnt_punct++;

  2234             s=strstr(s," ' ")+2;

  2235 	}

  2236         s=aline;

  2237         while (strstr(s," ` "))

  2238 	{

  2239             if (pswit[ECHO_SWITCH])

  2240 		printf("\n%s\n",aline);

  2241             if (!pswit[OVERVIEW_SWITCH])

  2242                 printf("    Line %ld column %d - Spaced singlequote?\n",

  2243 		  linecnt,(int)(strstr(s," ` ")-aline+1));

  2244             else

  2245                 cnt_punct++;

  2246             s=strstr(s," ` ")+2;

  2247 	}

  2248         /* check special case of 'S instead of 's at end of word */

  2249         s=aline+1;

  2250         while (*s)

  2251 	{

  2252             if (*s==CHAR_SQUOTE && s[1]=='S' && s[-1]>='a' && s[-1]<='z')

  2253 	    {

  2254                 if (pswit[ECHO_SWITCH])

  2255 		    printf("\n%s\n",aline);

  2256                 if (!pswit[OVERVIEW_SWITCH])

  2257                     printf("    Line %ld column %d - Capital \"S\"?\n",

  2258 		      linecnt,(int)(s-aline+2));

  2259                 else

  2260                     cnt_punct++;

  2261 	    }

  2262             s++;

  2263 	}

  2264         /*

  2265 	 * Now check special cases - start and end of line -

  2266          * for single and double quotes. Start is sometimes [sic]

  2267          * but better to query it anyway.

  2268          * While we're here, check for dash at end of line.

  2269 	 */

  2270         llen=strlen(aline);

  2271         if (llen>1)

  2272 	{

  2273             if (aline[llen-1]==CHAR_DQUOTE || aline[llen-1]==CHAR_SQUOTE ||

  2274 	      aline[llen-1]==CHAR_OPEN_SQUOTE)

  2275                 if (aline[llen-2]==CHAR_SPACE)

  2276 		{

  2277                     if (pswit[ECHO_SWITCH])

  2278 			printf("\n%s\n",aline);

  2279                     if (!pswit[OVERVIEW_SWITCH])

  2280                         printf("    Line %ld column %d - Spaced quote?\n",

  2281 			  linecnt,llen);

  2282                     else

  2283                         cnt_punct++;

  2284 		}

  2285             if ((aline[0]==CHAR_SQUOTE || aline[0]==CHAR_OPEN_SQUOTE) &&

  2286 	      aline[1]==CHAR_SPACE)

  2287 	    {

  2288 		if (pswit[ECHO_SWITCH])

  2289 		    printf("\n%s\n",aline);

  2290 		if (!pswit[OVERVIEW_SWITCH])

  2291 		    printf("    Line %ld column 1 - Spaced quote?\n",linecnt);

  2292 		else

  2293 		    cnt_punct++;

  2294 	    }

  2295             /*

  2296 	     * Dash at end of line may well be legit - paranoid mode only

  2297              * and don't report em-dash at line-end.

  2298 	     */

  2299             if (pswit[PARANOID_SWITCH] && warnings->hyphen)

  2300 	    {

  2301                 for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--)

  2302 		    ;

  2303                 if (aline[i]=='-' && aline[i-1]!='-')

  2304 		{

  2305                     if (pswit[ECHO_SWITCH])

  2306 			printf("\n%s\n",aline);

  2307                     if (!pswit[OVERVIEW_SWITCH])

  2308                         printf("    Line %ld column %d - "

  2309 			  "Hyphen at end of line?\n",linecnt,i);

  2310 		}

  2311 	    }

  2312 	}

  2313         /*

  2314 	 * Brackets are often unspaced, but shouldn't be surrounded by alpha.

  2315          * If so, suspect a scanno like "a]most".

  2316 	 */

  2317         llen=strlen(aline);

  2318         for (i=1;i<llen-1;i++)

  2319 	{

  2320 	    /* for each bracket character in the line except 1st & last */

  2321             if (strchr("{[()]}",aline[i]) && gcisalpha(aline[i-1]) &&

  2322 	      gcisalpha(aline[i+1]))

  2323 	    {

  2324                 if (pswit[ECHO_SWITCH])

  2325 		    printf("\n%s\n",aline);

  2326                 if (!pswit[OVERVIEW_SWITCH])

  2327                     printf("    Line %ld column %d - Unspaced bracket?\n",

  2328 		      linecnt,i);

  2329                 else

  2330                     cnt_punct++;

  2331 	    }

  2332 	}

  2333         llen=strlen(aline);

  2334         if (warnings->endquote)

  2335 	{

  2336             for (i=1;i<llen;i++)

  2337 	    {

  2338 		/* for each character in the line except 1st */

  2339                 if (aline[i]==CHAR_DQUOTE && isalpha(aline[i-1]))

  2340 		{

  2341 		    if (pswit[ECHO_SWITCH])

  2342 			printf("\n%s\n",aline);

  2343 		    if (!pswit[OVERVIEW_SWITCH])

  2344 			printf("    Line %ld column %d - "

  2345 			  "endquote missing punctuation?\n",linecnt,i);

  2346 		    else

  2347 			cnt_punct++;

  2348 		}

  2349 	    }

  2350 	}

  2351 	/*

  2352          * Check for <HTML TAG>.

  2353          * If there is a < in the line, followed at some point

  2354          * by a > then we suspect HTML.

  2355 	 */

  2356         if (strstr(aline,"<") && strstr(aline,">"))

  2357 	{

  2358             i=(signed int)(strstr(aline,">")-strstr(aline,"<")+1);

  2359             if (i>0)

  2360 	    {

  2361                 strncpy(wrk,strstr(aline,"<"),i);

  2362                 wrk[i]=0;

  2363                 if (pswit[ECHO_SWITCH])

  2364 		    printf("\n%s\n",aline);

  2365                 if (!pswit[OVERVIEW_SWITCH])

  2366                     printf("    Line %ld column %d - HTML Tag? %s \n",

  2367 		      linecnt,(int)(strstr(aline,"<")-aline)+1,wrk);

  2368                 else

  2369                     cnt_html++;

  2370 	    }

  2371 	}

  2372         /*

  2373 	 * Check for &symbol; HTML.

  2374          * If there is a & in the line, followed at

  2375          * some point by a ; then we suspect HTML.

  2376 	 */

  2377         if (strstr(aline,"&") && strstr(aline,";"))

  2378 	{

  2379             i=(int)(strstr(aline,";")-strstr(aline,"&")+1);

  2380             for (s=strstr(aline,"&");s<strstr(aline,";");s++)

  2381                 if (*s==CHAR_SPACE)

  2382 		    i=0;                /* Don't report "Jones & Son;" */

  2383             if (i>0)

  2384 	    {

  2385                 strncpy(wrk,strstr(aline,"&"),i);

  2386                 wrk[i]=0;

  2387                 if (pswit[ECHO_SWITCH])

  2388 		    printf("\n%s\n",aline);

  2389                 if (!pswit[OVERVIEW_SWITCH])

  2390                     printf("    Line %ld column %d - HTML symbol? %s \n",

  2391 		      linecnt,(int)(strstr(aline,"&")-aline)+1,wrk);

  2392                 else

  2393                     cnt_html++;

  2394 	    }

  2395 	}

  2396         /*

  2397 	 * At end of paragraph, check for mismatched quotes.

  2398          * We don't want to report an error immediately, since it is a

  2399          * common convention to omit the quotes at end of paragraph if

  2400          * the next paragraph is a continuation of the same speaker.

  2401          * Where this is the case, the next para should begin with a

  2402          * quote, so we store the warning message and only display it

  2403          * at the top of the next iteration if the new para doesn't

  2404          * start with a quote.

  2405          * The -p switch overrides this default, and warns of unclosed

  2406          * quotes on _every_ paragraph, whether the next begins with a

  2407          * quote or not.

  2408 	 */

  2409         if (isemptyline)

  2410 	{

  2411 	    /* end of para - add up the totals */

  2412             if (counters.quot%2)

  2413                 sprintf(dquote_err,"    Line %ld - Mismatched quotes\n",

  2414 		  linecnt);

  2415             if (pswit[SQUOTE_SWITCH] && counters.open_single_quote &&

  2416 	      counters.open_single_quote!=counters.close_single_quote)

  2417                 sprintf(squote_err,"    Line %ld - Mismatched singlequotes?\n",

  2418 		  linecnt);

  2419             if (pswit[SQUOTE_SWITCH] && counters.open_single_quote &&

  2420 	      counters.open_single_quote!=counters.close_single_quote &&

  2421 	      counters.open_single_quote!=counters.close_single_quote+1)

  2422 		/*

  2423 		 * Flag it to be noted regardless of the

  2424 		 * first char of the next para.

  2425 		 */

  2426                 squot=1;

  2427             if (counters.r_brack)

  2428                 sprintf(rbrack_err,"    Line %ld - "

  2429 		  "Mismatched round brackets?\n",linecnt);

  2430             if (counters.s_brack)

  2431                 sprintf(sbrack_err,"    Line %ld - "

  2432 		  "Mismatched square brackets?\n",linecnt);

  2433             if (counters.c_brack)

  2434                 sprintf(cbrack_err,"    Line %ld - "

  2435 		  "Mismatched curly brackets?\n",linecnt);

  2436             if (counters.c_unders%2)

  2437                 sprintf(unders_err,"    Line %ld - Mismatched underscores?\n",

  2438 		  linecnt);

  2439 	    memset(&counters,0,sizeof(counters));

  2440 	    /* let the next iteration know that it's starting a new para */

  2441             isnewpara=1;

  2442 	}

  2443         /*

  2444 	 * Check for omitted punctuation at end of paragraph by working back

  2445 	 * through prevline. DW.

  2446          * Need to check this only for "normal" paras.

  2447          * So what is a "normal" para?

  2448          *    Not normal if one-liner (chapter headings, etc.)

  2449          *    Not normal if doesn't contain at least one locase letter

  2450          *    Not normal if starts with space

  2451 	 */

  2452         if (isemptyline)

  2453 	{

  2454 	    /* end of para */

  2455             for (s=prevline,i=0;*s && !i;s++)

  2456                 if (gcisletter(*s))

  2457 		    /* use i to indicate the presence of a letter on the line */

  2458                     i=1;

  2459             /*

  2460 	     * This next "if" is a problem.

  2461              * If we say "start_para_line <= linecnt - 1", that includes

  2462 	     * one-line "paragraphs" like chapter heads. Lotsa false positives.

  2463              * If we say "start_para_line < linecnt - 1" it doesn't, but then it

  2464              * misses genuine one-line paragraphs.

  2465 	     */

  2466             if (i && lastblen>2 && start_para_line<linecnt-1 &&

  2467 	      *prevline>CHAR_SPACE)

  2468 	    {

  2469                 for (i=strlen(prevline)-1;

  2470 		  (prevline[i]==CHAR_DQUOTE || prevline[i]==CHAR_SQUOTE) &&

  2471 		  prevline[i]>CHAR_SPACE && i>0;

  2472 		  i--)

  2473 		    ;

  2474                 for (;i>0;i--)

  2475 		{

  2476                     if (gcisalpha(prevline[i]))

  2477 		    {

  2478                         if (pswit[ECHO_SWITCH])

  2479 			    printf("\n%s\n",prevline);

  2480                         if (!pswit[OVERVIEW_SWITCH])

  2481                             printf("    Line %ld column %d - "

  2482 			      "No punctuation at para end?\n",

  2483 			      linecnt-1,strlen(prevline));

  2484                         else

  2485                             cnt_punct++;

  2486                         break;

  2487 		    }

  2488                     if (strchr("-.:!([{?}])",prevline[i]))

  2489                         break;

  2490 		}

  2491 	    }

  2492 	}

  2493         strcpy(prevline,aline);

  2494     }

  2495     fclose(infile);

  2496     if (!pswit[OVERVIEW_SWITCH])

  2497         for (i=0;i<MAX_QWORD;i++)

  2498             if (dupcnt[i])

  2499                 printf("\nNote: Queried word %s was duplicated %d time%s\n",

  2500 		  qword[i],dupcnt[i],"s");

  2501 }

  2503 /*

  2504  * flgets:

  2505  *

  2506  * Get one line from the input stream, checking for

  2507  * the existence of exactly one CR/LF line-end per line.

  2508  *

  2509  * Returns: a pointer to the line.

  2510  */

  2511 char *flgets(char *theline,int maxlen,FILE *thefile,long lcnt)

  2512 {

  2513     char c;

  2514     int len,isCR,cint;

  2515     *theline=0;

  2516     len=isCR=0;

  2517     c=cint=fgetc(thefile);

  2518     do

  2519     {

  2520         if (cint==EOF)

  2521             return NULL;

  2522 	/* either way, it's end of line */

  2523         if (c==10)

  2524 	{

  2525             if (isCR)

  2526                 break;

  2527             else

  2528 	    {

  2529 		/* Error - a LF without a preceding CR */

  2530                 if (pswit[LINE_END_SWITCH])

  2531 		{

  2532                     if (pswit[ECHO_SWITCH])

  2533 			printf("\n%s\n",theline);

  2534                     if (!pswit[OVERVIEW_SWITCH])

  2535                         printf("    Line %ld - No CR?\n",lcnt);

  2536                     else

  2537                         cnt_lineend++;

  2538 		}

  2539                 break;

  2540 	    }

  2541 	}

  2542         if (c==13)

  2543 	{

  2544             if (isCR)

  2545 	    {

  2546 		/* Error - two successive CRs */

  2547                 if (pswit[LINE_END_SWITCH])

  2548 		{

  2549                     if (pswit[ECHO_SWITCH])

  2550 			printf("\n%s\n",theline);

  2551                     if (!pswit[OVERVIEW_SWITCH])

  2552                         printf("    Line %ld - Two successive CRs?\n",lcnt);

  2553                     else

  2554                         cnt_lineend++;

  2555 		}

  2556 	    }

  2557             isCR=1;

  2558 	}

  2559         else

  2560 	{

  2561             if (pswit[LINE_END_SWITCH] && isCR)

  2562 	    {

  2563                 if (pswit[ECHO_SWITCH])

  2564 		    printf("\n%s\n",theline);

  2565                 if (!pswit[OVERVIEW_SWITCH])

  2566                     printf("    Line %ld column %d - CR without LF?\n",

  2567 		      lcnt,len+1);

  2568                 else

  2569                     cnt_lineend++;

  2570 	    }

  2571             theline[len]=c;

  2572             len++;

  2573             theline[len]=0;

  2574             isCR=0;

  2575 	}

  2576         c=cint=fgetc(thefile);

  2577     } while(len<maxlen);

  2578     if (pswit[MARKUP_SWITCH])

  2579         postprocess_for_HTML(theline);

  2580     if (pswit[DP_SWITCH])

  2581         postprocess_for_DP(theline);

  2582     return theline;

  2583 }

  /*
   * mixdigit:
   *
   * Takes a "word" as a parameter, and checks whether it
   * contains a mixture of alpha and digits. Generally, this is an
   * error, but may not be for cases like 4th or L5 12s. 3d.
   *
   * The following mixed forms are accepted as legitimate:
   *   - leading digits followed by st, rd, nd or th (either case),
   *     optionally with a further "s" or "ly" (21st, 4ths, 2ndly);
   *   - leading digits followed by a single l, L, s or d (12l. 3s. 11d.);
   *   - a capital L followed by a remainder that is itself not queried
   *     by mixdigit(), such as the pounds notation L500.
   *
   * Returns: 0 if no error found, 1 if error.
   */
  int mixdigit(char *checkword)
  {
      int wehaveadigit=0,wehavealetter=0,firstdigits,wl;
      char *s,*suffix;
      for (s=checkword;*s;s++)
      {
          if (gcisalpha(*s))
              wehavealetter=1;
          else if (gcisdigit(*s))
              wehaveadigit=1;
      }
      if (!wehaveadigit || !wehavealetter)
          return 0;
      /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
      wl=strlen(checkword);
      for (firstdigits=0;gcisdigit(checkword[firstdigits]);firstdigits++)
          ;
      /* Everything after the leading run of digits. */
      suffix=checkword+firstdigits;
      switch (wl-firstdigits)
      {
          case 1:
              /* digits, ending in l, L, s or d */
              if (*suffix=='l' || *suffix=='L' || *suffix=='s' ||
                *suffix=='d')
                  return 0;
              break;
          case 2:
              /* digits, ending in st, rd, nd, th of either case */
              if (matchword(suffix,"st") || matchword(suffix,"rd") ||
                matchword(suffix,"nd") || matchword(suffix,"th"))
                  return 0;
              break;
          case 3:
              if (matchword(suffix,"sts") || matchword(suffix,"rds") ||
                matchword(suffix,"nds") || matchword(suffix,"ths"))
                  return 0;
              break;
          case 4:
              /*
               * The suffix must be exactly four characters long here;
               * comparing from any other offset can never match and may
               * start before the beginning of the word.
               */
              if (matchword(suffix,"stly") || matchword(suffix,"rdly") ||
                matchword(suffix,"ndly") || matchword(suffix,"thly"))
                  return 0;
              break;
          default:
              break;
      }
      /*
       * L at the start of a number, representing British pounds, like L500.
       * We know the current word is mixed. If the first letter is L, there
       * must be at least one digit following. If both digits and letters
       * follow in a way that is itself queried, we have a genuine error;
       * otherwise we accept it as a non-error.
       */
      if (checkword[0]=='L' && !mixdigit(checkword+1))
          return 0;
      return 1;
  }

  2642 /*

  2643  * getaword:

  2644  *

  2645  * Extracts the first/next "word" from the line, and puts

  2646  * it into "thisword". A word is defined as one English word unit--or

  2647  * at least that's the aim.

  2648  *

  2649  * Returns: a pointer to the position in the line where we will start

  2650  *          looking for the next word.

  2651  */

  2652 char *getaword(char *fromline,char *thisword)

  2653 {

  2654     int i,wordlen;

  2655     char *s;

  2656     wordlen=0;

  2657     for (;!gcisdigit(*fromline) && !gcisalpha(*fromline) && *fromline;

  2658       fromline++)

  2659 	;

  2660     /*

  2661      * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.

  2662      * Especially yucky is the case of L1,000

  2663      * This section looks for a pattern of characters including a digit

  2664      * followed by a comma or period followed by one or more digits.

  2665      * If found, it returns this whole pattern as a word; otherwise we discard

  2666      * the results and resume our normal programming.

  2667      */

  2668     s=fromline;

  2669     for (;(gcisdigit(*s) || gcisalpha(*s) || *s==',' || *s=='.') &&

  2670       wordlen<MAXWORDLEN;s++)

  2671     {

  2672 	thisword[wordlen]=*s;

  2673         wordlen++;

  2674     }

  2675     thisword[wordlen]=0;

  2676     for (i=1;i<wordlen-1;i++)

  2677     {

  2678         if (thisword[i]=='.' || thisword[i]==',')

  2679 	{

  2680             if (gcisdigit(thisword[i-1]) && gcisdigit(thisword[i-1]))

  2681 	    {

  2682                 fromline=s;

  2683                 return fromline;

  2684 	    }

  2685 	}

  2686     }

  2687     /* we didn't find a punctuated number - do the regular getword thing */

  2688     wordlen=0;

  2689     for (;(gcisdigit(*fromline) || gcisalpha(*fromline) || *fromline=='\'') &&

  2690       wordlen<MAXWORDLEN;fromline++)

  2691     {

  2692         thisword[wordlen]=*fromline;

  2693         wordlen++;

  2694     }

  2695     thisword[wordlen]=0;

  2696     return fromline;

  2697 }

  /*
   * matchword:
   *
   * A case-insensitive string matcher.
   *
   * Returns: 1 if the two strings have the same length and are equal
   * when compared character by character through toupper(), 0 otherwise.
   */
  int matchword(char *checkfor,char *thisword)
  {
      size_t i,len;
      len=strlen(checkfor);
      if (len!=strlen(thisword))
          return 0;
      for (i=0;i<len;i++)
      {
          /*
           * The <ctype.h> functions are only defined for values that fit
           * in an unsigned char, so convert before calling them; plain
           * char may be negative for 8-bit characters.
           */
          if (toupper((unsigned char)checkfor[i])!=
            toupper((unsigned char)thisword[i]))
              return 0;
      }
      return 1;
  }

  /*
   * lowerit:
   *
   * Convert the line to lower case in place. Only the unaccented
   * letters 'A' to 'Z' are changed; every other character, including
   * 8-bit ones, is left alone.
   */
  void lowerit(char *theline)
  {
      char *p=theline;
      while (*p)
      {
          if ('A'<=*p && *p<='Z')
              *p=(char)(*p+('a'-'A'));
          p++;
      }
  }

  /*
   * isroman:
   *
   * Does this (lower-case) word LOOK like a Roman numeral?
   *
   * No attempt is made to check that the numeral is well formed; for
   * example MXXXXXXXXXX is accepted. The word is matched against a fixed
   * sequence of pieces, each of which may be absent: any number of m,
   * one d, one cm, one cd, any number of c, one xl, one xc, one l,
   * any number of x, one ix, one iv, one v and any number of i.
   * Only lower-case letters are recognised.
   *
   * Returns: 1 if the whole word is consumed by that sequence, 0 if
   * anything is left over or if the word is NULL or empty.
   */
  int isroman(char *t)
  {
      /* The pieces, in the order in which they may appear. */
      static const struct
      {
          const char *piece;
          int repeats;        /* non-zero if the piece may occur many times */
      } sequence[]={
          {"m",1},{"d",0},{"cm",0},{"cd",0},{"c",1},
          {"xl",0},{"xc",0},{"l",0},{"x",1},
          {"ix",0},{"iv",0},{"v",0},{"i",1}
      };
      size_t k,piecelen;
      if (!t || !*t)
          return 0;
      for (k=0;k<sizeof sequence/sizeof sequence[0];k++)
      {
          piecelen=strlen(sequence[k].piece);
          do
          {
              if (strncmp(t,sequence[k].piece,piecelen))
                  break;
              t+=piecelen;
          } while (sequence[k].repeats);
      }
      return !*t;
  }

  /*
   * gcisalpha:
   *
   * A version of isalpha() that is somewhat lenient on 8-bit texts.
   * With the standard function, 8-bit accented characters break words, so
   * that a word containing them appears to be several short words with
   * 8-bit characters in between, which causes over-reporting of errors.
   * gcisalpha() recognizes accented letters from the CP1252 (Windows)
   * and ISO-8859-1 character sets, which are the most common PG 8-bit types.
   *
   * Returns: 1 for a-z, A-Z, the codes 140, 142, 156, 158 and 159, and
   * every code from 192 upwards except 208, 215, 222, 240, 247 and 254;
   * 0 for anything else.
   */
  int gcisalpha(unsigned char c)
  {
      if ((c>='a' && c<='z') || (c>='A' && c<='Z'))
          return 1;
      switch (c)
      {
          /* accepted codes below the 192+ range */
          case 140: case 142: case 156: case 158: case 159:
              return 1;
          /* codes in the 192+ range that are not accepted */
          case 208: case 215: case 222: case 240: case 247: case 254:
              return 0;
          default:
              return c>=192;
      }
  }

  /*
   * gcisdigit:
   *
   * A version of isdigit() that doesn't get confused in 8-bit texts.
   *
   * Returns: 1 for the characters '0' to '9', 0 for anything else.
   */
  int gcisdigit(unsigned char c)
  {
      if (c<'0')
          return 0;
      return c<='9';
  }

  /*
   * gcisletter:
   *
   * A letter test that doesn't get confused in 8-bit texts.
   * NB: this is ISO-8859-1-specific.
   *
   * Returns: 1 for A-Z, a-z and every code from 192 upwards,
   * 0 for anything else.
   */
  int gcisletter(unsigned char c)
  {
      if (c>=192)
          return 1;
      if (c>='a')
          return c<='z';
      return c>='A' && c<='Z';
  }

  /*
   * gcstrchr:
   *
   * Like strchr(), except that searching for the NUL character yields
   * NULL instead of a pointer to the string's terminator.
   */
  char *gcstrchr(char *s,char c)
  {
      return c ? strchr(s,c) : NULL;
  }

  2836 /*

  2837  * postprocess_for_DP:

  2838  *

  2839  * Invoked with the -d switch from flgets().

  2840  * It simply "removes" from the line a hard-coded set of common

  2841  * DP-specific tags, so that the line passed to the main routine has

  2842  * been pre-cleaned of DP markup.

  2843  */

  2844 void postprocess_for_DP(char *theline)

  2845 {

  2846     char *s,*t;

  2847     int i;

  2848     if (!*theline)

  2849         return;

  2850     for (i=0;*DPmarkup[i];i++)

  2851     {

  2852         s=strstr(theline,DPmarkup[i]);

  2853         while (s)

  2854 	{

  2855             t=s+strlen(DPmarkup[i]);

  2856             while (*t)

  2857 	    {

  2858                 *s=*t;

  2859                 t++;

  2860 		s++;

  2861 	    }

  2862             *s=0;

  2863             s=strstr(theline,DPmarkup[i]);

  2864 	}

  2865     }

  2866 }

  /*
   * postprocess_for_HTML:
   *
   * Invoked with the -m switch from flgets().
   * Cleans the line of HTML in place: if the line contains both a '<'
   * and a '>', losemarkup() is called until it reports nothing more to
   * remove; then loseentities() is called until it reports nothing more
   * to replace.
   */
  void postprocess_for_HTML(char *theline)
  {
      int maybe_tagged=strstr(theline,"<")!=NULL &&
        strstr(theline,">")!=NULL;
      if (maybe_tagged)
      {
          for (;;)
          {
              if (!losemarkup(theline))
                  break;
          }
      }
      for (;;)
      {
          if (!loseentities(theline))
              break;
      }
  }

  2886 char *losemarkup(char *theline)

  2887 {

  2888     char *s,*t;

  2889     int i;

  2890     if (!*theline)

  2891         return NULL;

  2892     s=strstr(theline,"<");

  2893     t=strstr(theline,">");

  2894     if (!s || !t)

  2895 	return NULL;

  2896     for (i=0;*markup[i];i++)

  2897         if (!tagcomp(s+1,markup[i]))

  2898 	{

  2899             if (!t[1])

  2900 	    {

  2901                 *s=0;

  2902                 return s;

  2903 	    }

  2904             else if (t>s)

  2905 	    {

  2906 		strcpy(s,t+1);

  2907 		return s;

  2908 	    }

  2909         }

  2910     /* It's an unrecognized <xxx>. */

  2911     return NULL;

  2912 }

  2914 char *loseentities(char *theline)

  2915 {

  2916     int i;

  2917     char *s,*t;

  2918     if (!*theline)

  2919         return NULL;

  2920     for (i=0;*entities[i].htmlent;i++)

  2921     {

  2922         s=strstr(theline,entities[i].htmlent);

  2923         if (s)

  2924 	{

  2925             t=malloc((size_t)strlen(s));

  2926             if (!t)

  2927 		return NULL;

  2928             strcpy(t,s+strlen(entities[i].htmlent));

  2929             strcpy(s,entities[i].textent);

  2930             strcat(s,t);

  2931             free(t);

  2932             return theline;

  2933 	}

  2934     }

  2935     for (i=0;*entities[i].htmlnum;i++)

  2936     {

  2937         s=strstr(theline,entities[i].htmlnum);

  2938         if (s)

  2939 	{

  2940             t=malloc((size_t)strlen(s));

  2941             if (!t)

  2942 		return NULL;

  2943             strcpy(t,s+strlen(entities[i].htmlnum));

  2944             strcpy(s,entities[i].textent);

  2945             strcat(s,t);

  2946             free(t);

  2947             return theline;

  2948 	}

  2949     }

  2950     return NULL;

  2951 }

  /*
   * tagcomp:
   *
   * Compares the start of strin with basetag, ignoring case and ignoring
   * one leading slash in strin (so that closing tags match too).
   *
   * The comparison stops at the end of whichever string is shorter, so
   * this is a prefix test: it succeeds if every character compared up to
   * that point is the same.
   *
   * Returns: 0 on a match, 1 on a mismatch (like strcmp(), zero means
   * "equal").
   */
  int tagcomp(char *strin,char *basetag)
  {
      char *s,*t;
      s=basetag;
      t=strin;
      if (*t=='/')
          t++; /* ignore a slash */
      for (;*s && *t;s++,t++)
      {
          /*
           * Convert to unsigned char first: the <ctype.h> functions are
           * not defined for the negative values that plain char can take
           * on 8-bit characters.
           */
          if (tolower((unsigned char)*s)!=tolower((unsigned char)*t))
              return 1;
      }
      return 0;
  }

  2970 void proghelp()

  2971 {

  2972     fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);

  2973     fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);

  2974     fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);

  2975     fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "

  2976       "For details, read the file COPYING.\n",stderr);

  2977     fputs("This is Free Software; "

  2978       "you may redistribute it under certain conditions (GPL);\n",stderr);

  2979     fputs("read the file COPYING for details.\n\n",stderr);

  2980     fputs("Usage is: bookloupe [-setpxloyhud] filename\n",stderr);

  2981     fputs("  where -s checks single quotes, -e suppresses echoing lines, "

  2982       "-t checks typos\n",stderr);

  2983     fputs("  -x (paranoid) switches OFF -t and extra checks, "

  2984       "-l turns OFF line-end checks\n",stderr);

  2985     fputs("  -o just displays overview without detail, "

  2986       "-h echoes header fields\n",stderr);

  2987     fputs("  -v (verbose) unsuppresses duplicate reporting, "

  2988       "-m suppresses markup\n",stderr);

  2989     fputs("  -d ignores DP-specific markup,\n",stderr);

  2990     fputs("  -u uses a file gutcheck.typ to query user-defined "

  2991       "possible typos\n",stderr);

  2992     fputs("Sample usage: bookloupe warpeace.txt \n",stderr);

  2993     fputs("\n",stderr);

  2994     fputs("Bookloupe looks for errors in Project Gutenberg(TM) etexts.\n",

  2995       stderr);

  2996     fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "

  2997       "non-ASCII\n",stderr);

  2998     fputs("characters like accented letters, "

  2999       "lines longer than 75 or shorter than 55,\n",stderr);

  3000     fputs("unbalanced quotes or brackets, "

  3001       "a variety of badly formatted punctuation, \n",stderr);

  3002     fputs("HTML tags, some likely typos. "

  3003       "It is NOT a substitute for human judgement.\n",stderr);

  3004     fputs("\n",stderr);

  3005 }

author	ali <ali@juiblex.co.uk>
	Sat May 25 17:01:36 2013 +0100 (2013-05-25)
changeset 43	e4042a067753
parent 42	20d51419e077
child 44	66483ebc9b56
permissions	-rw-r--r--