bookloupe-testing: bookloupe/bookloupe.c@0d08cd5055d5

     1 /*************************************************************************/

     2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */

     3 /*                                                                       */

     4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>                  */

     5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>                     */

     6 /*                                                                       */

     7 /* This program is free software; you can redistribute it and/or modify  */

     8 /* it under the terms of the GNU General Public License as published by  */

     9 /* the Free Software Foundation; either version 2 of the License, or     */

    10 /* (at your option) any later version.                                   */

    11 /*                                                                       */

    12 /* This program is distributed in the hope that it will be useful,       */

    13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of        */

    14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the          */

    15 /* GNU General Public License for more details.                          */

    16 /*                                                                       */

    17 /* You should have received a copy of the GNU General Public License     */

    18 /* along with this program. If not, see <http://www.gnu.org/licenses/>.  */

    19 /*************************************************************************/

    21 #include <stdio.h>

    22 #include <stdlib.h>

    23 #include <string.h>

    24 #include <ctype.h>

    26 #define MAXWORDLEN    80    /* max length of one word             */

    27 #define LINEBUFSIZE 2048    /* buffer size for an input line      */

    29 #define MAX_USER_TYPOS 1000

    30 #define USERTYPO_FILE "gutcheck.typ"

    32 #ifndef MAX_PATH

    33 #define MAX_PATH 16384

    34 #endif

    36 char aline[LINEBUFSIZE];

    37 char prevline[LINEBUFSIZE];

    39 /* Common typos. */

    40 char *typo[] = {

    41     "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",

    42     "nad", "te", "ig", "acn",  "ahve", "alot", "anbd", "andt", "awya", "aywa",

    43     "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",

    44     "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",

    45     "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",

    46     "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",

    47     "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",

    48     "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",

    49     "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",

    50     "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",

    51     "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",

    52     "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",

    53     "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",

    54     "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",

    55     "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",

    56     "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",

    57     "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",

    58     "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",

    59     "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",

    60     "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",

    61     "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",

    62     "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",

    63     "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",

    64     "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",

    65     "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",

    66     "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",

    67     "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",

    68     "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",

    69     "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",

    70     "se", ""

    71 };

    73 char *usertypo[MAX_USER_TYPOS];

    75 /* Common abbreviations and other OK words not to query as typos. */

    76 char *okword[] = {

    77     "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",

    78     "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",

    79     "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",

    80     "outbid", "outbids", "frostbite", "frostbitten", ""

    81 };

    83 /* Common abbreviations that cause otherwise unexplained periods. */

    84 char *abbrev[] = {

    85     "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",

    86     "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""

    87 };

    89 /*

    90  * Two-Letter combinations that rarely if ever start words,

    91  * but are common scannos or otherwise common letter combinations.

    92  */

    93 char *nostart[] = {

    94     "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""

    95 };

    97 /*

    98  * Two-Letter combinations that rarely if ever end words,

    99  * but are common scannos or otherwise common letter combinations.

   100  */

   101 char *noend[] = {

   102     "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",

   103     "sw", "gr", "sl", "cl", "iy", ""

   104 };

   106 char *markup[] = {

   107     "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",

   108     "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",

   109     "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",

   110     "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""

   111 };

   113 char *DPmarkup[] = {

   114     "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""

   115 };

   117 char *nocomma[] = {

   118     "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",

   119     "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",

   120     "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",

   121     "during", "let", "toward", "among", ""

   122 };

   124 char *noperiod[] = {

   125     "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",

   126     "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",

   127     "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",

   128     "among", "those", "into", "whom", "having", "thence", ""

   129 };

   131 char vowels[] = "aeiouàáâãäæèéêëìíîïòóôõöùúûü";

   133 struct {

   134     char *htmlent;

   135     char *htmlnum;

   136     char *textent;

   137 } entities[] = {

   138     "&amp;",	"&#38;",     "&",

   139     "&lt;",	"&#60;",     "<",

   140     "&gt;",	"&#62;",     ">",

   141     "&deg;",	"&#176;",    " degrees",

   142     "&pound;",	"&#163;",    "L",

   143     "&quot;",	"&#34;",     "\"", /* quotation mark = APL quote */

   144     "&OElig;",	"&#338;",    "OE", /* latin capital ligature OE */

   145     "&oelig;",	"&#339;",    "oe", /* latin small ligature oe */

   146     "&Scaron;",	"&#352;",    "S", /* latin capital letter S with caron */

   147     "&scaron;",	"&#353;",    "s", /* latin small letter s with caron */

   148     "&Yuml;",	"&#376;",    "Y", /* latin capital letter Y with diaeresis */

   149     "&circ;",	"&#710;",    "",  /* modifier letter circumflex accent */

   150     "&tilde;",	"&#732;",    "~", /* small tilde, U+02DC ISOdia */

   151     "&ensp;",	"&#8194;",   " ", /* en space, U+2002 ISOpub */

   152     "&emsp;",	"&#8195;",   " ", /* em space, U+2003 ISOpub */

   153     "&thinsp;",	"&#8201;",   " ", /* thin space, U+2009 ISOpub */

   154     "&ndash;",	"&#8211;",   "-", /* en dash, U+2013 ISOpub */

   155     "&mdash;",	"&#8212;",   "--", /* em dash, U+2014 ISOpub */

   156     "&rsquo;",	"&#8217;",   "'", /* right single quotation mark */

   157     "&sbquo;",	"&#8218;",   "'", /* single low-9 quotation mark */

   158     "&ldquo;",	"&#8220;",   "\"", /* left double quotation mark */

   159     "&rdquo;",	"&#8221;",   "\"", /* right double quotation mark */

   160     "&bdquo;",	"&#8222;",   "\"", /* double low-9 quotation mark */

   161     "&lsaquo;",	"&#8249;",   "\"", /* single left-pointing angle quotation mark */

   162     "&rsaquo;",	"&#8250;",   "\"", /* single right-pointing angle quotation mark */

   163     "&nbsp;",	"&#160;",    " ", /* no-break space = non-breaking space, */

   164     "&iexcl;",	"&#161;",    "!", /* inverted exclamation mark */

   165     "&cent;",	"&#162;",    "c", /* cent sign */

   166     "&pound;",	"&#163;",    "L", /* pound sign */

   167     "&curren;",	"&#164;",    "$", /* currency sign */

   168     "&yen;",	"&#165;",    "Y", /* yen sign = yuan sign */

   169     "&sect;",	"&#167;",    "--", /* section sign */

   170     "&uml;",	"&#168;",    " ", /* diaeresis = spacing diaeresis */

   171     "&copy;",	"&#169;",    "(C) ", /* copyright sign */

   172     "&ordf;",	"&#170;",    " ", /* feminine ordinal indicator */

   173     "&laquo;",	"&#171;",    "\"", /* left-pointing double angle quotation mark */

   174     "&shy;",	"&#173;",    "-", /* soft hyphen = discretionary hyphen */

   175     "&reg;",	"&#174;",    "(R) ", /* registered sign = registered trade mark sign */

   176     "&macr;",	"&#175;",    " ", /* macron = spacing macron = overline */

   177     "&deg;",	"&#176;",    " degrees", /* degree sign */

   178     "&plusmn;",	"&#177;",    "+-", /* plus-minus sign = plus-or-minus sign */

   179     "&sup2;",	"&#178;",    "2", /* superscript two = superscript digit two */

   180     "&sup3;",	"&#179;",    "3", /* superscript three = superscript digit three */

   181     "&acute;",	"&#180;",    " ", /* acute accent = spacing acute */

   182     "&micro;",	"&#181;",    "m", /* micro sign */

   183     "&para;",	"&#182;",    "--", /* pilcrow sign = paragraph sign */

   184     "&cedil;",	"&#184;",    " ", /* cedilla = spacing cedilla */

   185     "&sup1;",	"&#185;",    "1", /* superscript one = superscript digit one */

   186     "&ordm;",	"&#186;",    " ", /* masculine ordinal indicator */

   187     "&raquo;",	"&#187;",    "\"", /* right-pointing double angle quotation mark */

   188     "&frac14;",	"&#188;",    "1/4", /* vulgar fraction one quarter */

   189     "&frac12;",	"&#189;",    "1/2", /* vulgar fraction one half */

   190     "&frac34;",	"&#190;",    "3/4", /* vulgar fraction three quarters */

   191     "&iquest;",	"&#191;",    "?", /* inverted question mark */

   192     "&Agrave;",	"&#192;",    "A", /* latin capital letter A with grave */

   193     "&Aacute;",	"&#193;",    "A", /* latin capital letter A with acute */

   194     "&Acirc;",	"&#194;",    "A", /* latin capital letter A with circumflex */

   195     "&Atilde;",	"&#195;",    "A", /* latin capital letter A with tilde */

   196     "&Auml;",	"&#196;",    "A", /* latin capital letter A with diaeresis */

   197     "&Aring;",	"&#197;",    "A", /* latin capital letter A with ring above */

   198     "&AElig;",	"&#198;",    "AE", /* latin capital letter AE */

   199     "&Ccedil;",	"&#199;",    "C", /* latin capital letter C with cedilla */

   200     "&Egrave;",	"&#200;",    "E", /* latin capital letter E with grave */

   201     "&Eacute;",	"&#201;",    "E", /* latin capital letter E with acute */

   202     "&Ecirc;",	"&#202;",    "E", /* latin capital letter E with circumflex */

   203     "&Euml;",	"&#203;",    "E", /* latin capital letter E with diaeresis */

   204     "&Igrave;",	"&#204;",    "I", /* latin capital letter I with grave */

   205     "&Iacute;",	"&#205;",    "I", /* latin capital letter I with acute */

   206     "&Icirc;",	"&#206;",    "I", /* latin capital letter I with circumflex */

   207     "&Iuml;",	"&#207;",    "I", /* latin capital letter I with diaeresis */

   208     "&ETH;",	"&#208;",    "E", /* latin capital letter ETH */

   209     "&Ntilde;",	"&#209;",    "N", /* latin capital letter N with tilde */

   210     "&Ograve;",	"&#210;",    "O", /* latin capital letter O with grave */

   211     "&Oacute;",	"&#211;",    "O", /* latin capital letter O with acute */

   212     "&Ocirc;",	"&#212;",    "O", /* latin capital letter O with circumflex */

   213     "&Otilde;",	"&#213;",    "O", /* latin capital letter O with tilde */

   214     "&Ouml;",	"&#214;",    "O", /* latin capital letter O with diaeresis */

   215     "&times;",	"&#215;",    "*", /* multiplication sign */

   216     "&Oslash;",	"&#216;",    "O", /* latin capital letter O with stroke */

   217     "&Ugrave;",	"&#217;",    "U", /* latin capital letter U with grave */

   218     "&Uacute;",	"&#218;",    "U", /* latin capital letter U with acute */

   219     "&Ucirc;",	"&#219;",    "U", /* latin capital letter U with circumflex */

   220     "&Uuml;",	"&#220;",    "U", /* latin capital letter U with diaeresis */

   221     "&Yacute;",	"&#221;",    "Y", /* latin capital letter Y with acute */

   222     "&THORN;",	"&#222;",    "TH", /* latin capital letter THORN */

   223     "&szlig;",	"&#223;",    "sz", /* latin small letter sharp s = ess-zed */

   224     "&agrave;",	"&#224;",    "a", /* latin small letter a with grave */

   225     "&aacute;",	"&#225;",    "a", /* latin small letter a with acute */

   226     "&acirc;",	"&#226;",    "a", /* latin small letter a with circumflex */

   227     "&atilde;",	"&#227;",    "a", /* latin small letter a with tilde */

   228     "&auml;",	"&#228;",    "a", /* latin small letter a with diaeresis */

   229     "&aring;",	"&#229;",    "a", /* latin small letter a with ring above */

   230     "&aelig;",	"&#230;",    "ae", /* latin small letter ae */

   231     "&ccedil;",	"&#231;",    "c", /* latin small letter c with cedilla */

   232     "&egrave;",	"&#232;",    "e", /* latin small letter e with grave */

   233     "&eacute;",	"&#233;",    "e", /* latin small letter e with acute */

   234     "&ecirc;",	"&#234;",    "e", /* latin small letter e with circumflex */

   235     "&euml;",	"&#235;",    "e", /* latin small letter e with diaeresis */

   236     "&igrave;",	"&#236;",    "i", /* latin small letter i with grave */

   237     "&iacute;",	"&#237;",    "i", /* latin small letter i with acute */

   238     "&icirc;",	"&#238;",    "i", /* latin small letter i with circumflex */

   239     "&iuml;",	"&#239;",    "i", /* latin small letter i with diaeresis */

   240     "&eth;",	"&#240;",    "eth", /* latin small letter eth */

   241     "&ntilde;",	"&#241;",    "n", /* latin small letter n with tilde */

   242     "&ograve;",	"&#242;",    "o", /* latin small letter o with grave */

   243     "&oacute;",	"&#243;",    "o", /* latin small letter o with acute */

   244     "&ocirc;",	"&#244;",    "o", /* latin small letter o with circumflex */

   245     "&otilde;",	"&#245;",    "o", /* latin small letter o with tilde */

   246     "&ouml;",	"&#246;",    "o", /* latin small letter o with diaeresis */

   247     "&divide;",	"&#247;",    "/", /* division sign */

   248     "&oslash;",	"&#248;",    "o", /* latin small letter o with stroke */

   249     "&ugrave;",	"&#249;",    "u", /* latin small letter u with grave */

   250     "&uacute;",	"&#250;",    "u", /* latin small letter u with acute */

   251     "&ucirc;",	"&#251;",    "u", /* latin small letter u with circumflex */

   252     "&uuml;",	"&#252;",    "u", /* latin small letter u with diaeresis */

   253     "&yacute;",	"&#253;",    "y", /* latin small letter y with acute */

   254     "&thorn;",	"&#254;",    "th", /* latin small letter thorn */

   255     "&yuml;",	"&#255;",    "y", /* latin small letter y with diaeresis */

   256     "", ""

   257 };

   259 /* special characters */

   260 #define CHAR_SPACE        32

   261 #define CHAR_TAB           9

   262 #define CHAR_LF           10

   263 #define CHAR_CR           13

   264 #define CHAR_DQUOTE       34

   265 #define CHAR_SQUOTE       39

   266 #define CHAR_OPEN_SQUOTE  96

   267 #define CHAR_TILDE       126

   268 #define CHAR_ASTERISK     42

   269 #define CHAR_FORESLASH    47

   270 #define CHAR_CARAT        94

   272 #define CHAR_UNDERSCORE    '_'

   273 #define CHAR_OPEN_CBRACK   '{'

   274 #define CHAR_CLOSE_CBRACK  '}'

   275 #define CHAR_OPEN_RBRACK   '('

   276 #define CHAR_CLOSE_RBRACK  ')'

   277 #define CHAR_OPEN_SBRACK   '['

   278 #define CHAR_CLOSE_SBRACK  ']'

   280 /* longest and shortest normal PG line lengths */

   281 #define LONGEST_PG_LINE   75

   282 #define WAY_TOO_LONG      80

   283 #define SHORTEST_PG_LINE  55

   285 #define SWITCHES "ESTPXLOYHWVMUD" /* switches:-                            */

   286                                   /*     D - ignore DP-specific markup     */

   287                                   /*     E - echo queried line             */

   288                                   /*     S - check single quotes           */

   289                                   /*     T - check common typos            */

   290                                   /*     P - require closure of quotes on  */

   291                                   /*         every paragraph               */

   292                                   /*     X - "Trust no one" :-) Paranoid!  */

   293                                   /*         Queries everything            */

   294                                   /*     L - line end checking defaults on */

   295                                   /*         -L turns it off               */

   296                                   /*     O - overview. Just shows counts.  */

   297                                   /*     Y - puts errors to stdout         */

   298                                   /*         instead of stderr             */

   299                                   /*     H - Echoes header fields          */

   300                                   /*     M - Ignore markup in < >          */

   301                                   /*     U - Use file of User-defined Typos*/

   302                                   /*     W - Defaults for use on Web upload*/

   303                                   /*     V - Verbose - list EVERYTHING!    */

   304 #define SWITNO 14                 /* max number of switch parms            */

   305                                   /*        - used for defining array-size */

   306 #define MINARGS   1               /* minimum no of args excl switches      */

   307 #define MAXARGS   1               /* maximum no of args excl switches      */

   309 int pswit[SWITNO];                /* program switches set by SWITCHES      */

   311 #define ECHO_SWITCH      0

   312 #define SQUOTE_SWITCH    1

   313 #define TYPO_SWITCH      2

   314 #define QPARA_SWITCH     3

   315 #define PARANOID_SWITCH  4

   316 #define LINE_END_SWITCH  5

   317 #define OVERVIEW_SWITCH  6

   318 #define STDOUT_SWITCH    7

   319 #define HEADER_SWITCH    8

   320 #define WEB_SWITCH       9

   321 #define VERBOSE_SWITCH   10

   322 #define MARKUP_SWITCH    11

   323 #define USERTYPO_SWITCH  12

   324 #define DP_SWITCH        13

   326 long cnt_dquot;       /* for overview mode, count of doublequote queries */

   327 long cnt_squot;       /* for overview mode, count of singlequote queries */

   328 long cnt_brack;       /* for overview mode, count of brackets queries */

   329 long cnt_bin;         /* for overview mode, count of non-ASCII queries */

   330 long cnt_odd;         /* for overview mode, count of odd character queries */

   331 long cnt_long;        /* for overview mode, count of long line errors */

   332 long cnt_short;       /* for overview mode, count of short line queries */

   333 long cnt_punct;       /* for overview mode, count of punctuation and spacing queries */

   334 long cnt_dash;        /* for overview mode, count of dash-related queries */

   335 long cnt_word;        /* for overview mode, count of word queries */

   336 long cnt_html;        /* for overview mode, count of html queries */

   337 long cnt_lineend;     /* for overview mode, count of line-end queries */

   338 long cnt_spacend;     /* count of lines with space at end */

   339 long linecnt;         /* count of total lines in the file */

   340 long checked_linecnt; /* count of lines actually checked */

   342 void proghelp(void);

   343 void procfile(char *);

   345 #define LOW_THRESHOLD    0

   346 #define HIGH_THRESHOLD   1

   348 #define START 0

   349 #define END 1

   350 #define PREV 0

   351 #define NEXT 1

   352 #define FIRST_OF_PAIR 0

   353 #define SECOND_OF_PAIR 1

   355 #define MAX_WORDPAIR 1000

   357 char running_from[MAX_PATH];

   359 int mixdigit(char *);

   360 char *getaword(char *,char *);

   361 int matchword(char *,char *);

   362 char *flgets(char *,int,FILE *,long);

   363 void lowerit(char *);

   364 int gcisalpha(unsigned char);

   365 int gcisdigit(unsigned char);

   366 int gcisletter(unsigned char);

   367 char *gcstrchr(char *s,char c);

   368 void postprocess_for_HTML(char *);

   369 char *linehasmarkup(char *);

   370 char *losemarkup(char *);

   371 int tagcomp(char *,char *);

   372 char *loseentities(char *);

   373 int isroman(char *);

   374 int usertypo_count;

   375 void postprocess_for_DP(char *);

   377 char wrk[LINEBUFSIZE];

   379 #define MAX_QWORD 50

   380 #define MAX_QWORD_LENGTH 40

   381 char qword[MAX_QWORD][MAX_QWORD_LENGTH];

   382 char qperiod[MAX_QWORD][MAX_QWORD_LENGTH];

   383 signed int dupcnt[MAX_QWORD];

   385 int main(int argc,char **argv)

   386 {

   387     char *argsw,*s;

   388     int i,switno,invarg;

   389     char usertypo_file[MAX_PATH];

   390     FILE *usertypofile;

   391     if (strlen(argv[0])<sizeof(running_from))

   392 	/* save the path to the executable */

   393         strcpy(running_from,argv[0]);

   394     /* find out what directory we're running from */

   395     s=running_from+strlen(running_from);

   396     for (;*s!='/' && *s!='\\' && s>=running_from;s--)

   397         *s=0;

   398     switno=strlen(SWITCHES);

   399     for (i=switno;--i>0;)

   400         pswit[i]=0;           /* initialise switches */

   401     /*

   402      * Standard loop to extract switches.

   403      * When we come out of this loop, the arguments will be

   404      * in argv[0] upwards and the switches used will be

   405      * represented by their equivalent elements in pswit[]

   406      */

   407     while (--argc>0 && **++argv=='-')

   408         for (argsw=argv[0]+1;*argsw!='\0';argsw++)

   409             for (i=switno,invarg=1;(--i>=0) && invarg==1;)

   410                 if ((toupper(*argsw))==SWITCHES[i])

   411 		{

   412                     invarg=0;

   413                     pswit[i]=1;

   414 		}

   415     /* Paranoid checking is turned OFF, not on, by its switch */

   416     pswit[PARANOID_SWITCH]^=1;

   417     if (pswit[PARANOID_SWITCH])

   418 	/* if running in paranoid mode force typo checks as well   */

   419         pswit[TYPO_SWITCH]=pswit[TYPO_SWITCH]^1;

   420     /* Line-end checking is turned OFF, not on, by its switch */

   421     pswit[LINE_END_SWITCH]^=1;

   422     /* Echoing is turned OFF, not on, by its switch */

   423     pswit[ECHO_SWITCH]^=1;

   424     if (pswit[OVERVIEW_SWITCH])

   425 	/* just print summary; don't echo */

   426         pswit[ECHO_SWITCH]=0;

   427     /*

   428      * Web uploads - for the moment, this is really just a placeholder

   429      * until we decide what processing we really want to do on web uploads

   430      */

   431     if (pswit[WEB_SWITCH])

   432     {

   433 	/* specific override for web uploads */

   434         pswit[ECHO_SWITCH]=1;

   435         pswit[SQUOTE_SWITCH]=0;

   436         pswit[TYPO_SWITCH]=1;

   437         pswit[QPARA_SWITCH]=0;

   438         pswit[PARANOID_SWITCH]=1;

   439         pswit[LINE_END_SWITCH]=0;

   440         pswit[OVERVIEW_SWITCH]=0;

   441         pswit[STDOUT_SWITCH]=0;

   442         pswit[HEADER_SWITCH]=1;

   443         pswit[VERBOSE_SWITCH]=0;

   444         pswit[MARKUP_SWITCH]=0;

   445         pswit[USERTYPO_SWITCH]=0;

   446         pswit[DP_SWITCH]=0;

   447     }

   448     if (argc<MINARGS || argc>MAXARGS)

   449     {

   450 	/* check number of args */

   451         proghelp();

   452         return 1;

   453     }

   454     /* read in the user-defined stealth scanno list */

   455     if (pswit[USERTYPO_SWITCH])

   456     {

   457 	/* ... we were told we had one! */

   458         usertypofile=fopen(USERTYPO_FILE,"rb");

   459         if (!usertypofile)

   460 	{

   461 	    /* not in cwd. try excuteable directory. */

   462             strcpy(usertypo_file,running_from);

   463             strcat(usertypo_file,USERTYPO_FILE);

   464             usertypofile=fopen(usertypo_file,"rb");

   465             if (!usertypofile) {

   466 		/* we ain't got no user typo file! */

   467                 printf("   --> I couldn't find gutcheck.typ "

   468 		  "-- proceeding without user typos.\n");

   469 	    }

   470 	}

   471         usertypo_count=0;

   472         if (usertypofile)

   473 	{

   474 	    /* we managed to open a User Typo File! */

   475             if (pswit[USERTYPO_SWITCH])

   476 	    {

   477                 while (flgets(aline,LINEBUFSIZE-1,usertypofile,

   478 		  (long)usertypo_count))

   479 		{

   480                     if (strlen(aline)>1)

   481 		    {

   482                         if ((int)*aline>33)

   483 			{

   484                             s=malloc(strlen(aline)+1);

   485                             if (!s)

   486 			    {

   487                                 fprintf(stderr,"bookloupe: cannot get enough "

   488 				  "memory for user typo file!\n");

   489                                 exit(1);

   490 			    }

   491                             strcpy(s,aline);

   492                             usertypo[usertypo_count]=s;

   493                             usertypo_count++;

   494                             if (usertypo_count>=MAX_USER_TYPOS)

   495 			    {

   496                                 printf("   --> Only %d user-defined typos "

   497 				  "allowed: ignoring the rest\n",

   498 				  MAX_USER_TYPOS);

   499                                 break;

   500 			    }

   501 			}

   502 		    }

   503 		}

   504 	    }

   505             fclose(usertypofile);

   506 	}

   507     }

   508     fprintf(stderr,"bookloupe: Check and report on an e-text\n");

   509     cnt_dquot=cnt_squot=cnt_brack=cnt_bin=cnt_odd=cnt_long=

   510     cnt_short=cnt_punct=cnt_dash=cnt_word=cnt_html=cnt_lineend=

   511     cnt_spacend=0;

   512     procfile(argv[0]);

   513     if (pswit[OVERVIEW_SWITCH])

   514     {

   515 	printf("    Checked %ld lines of %ld (head+foot = %ld)\n\n",

   516 	  checked_linecnt,linecnt,linecnt-checked_linecnt);

   517         printf("    --------------- Queries found --------------\n");

   518         if (cnt_long)

   519 	    printf("    Long lines:                    %14ld\n",cnt_long);

   520         if (cnt_short)

   521 	    printf("    Short lines:                   %14ld\n",cnt_short);

   522         if (cnt_lineend)

   523 	    printf("    Line-end problems:             %14ld\n",cnt_lineend);

   524         if (cnt_word)

   525 	    printf("    Common typos:                  %14ld\n",cnt_word);

   526         if (cnt_dquot)

   527 	    printf("    Unmatched quotes:              %14ld\n",cnt_dquot);

   528         if (cnt_squot)

   529 	    printf("    Unmatched SingleQuotes:        %14ld\n",cnt_squot);

   530         if (cnt_brack)

   531 	    printf("    Unmatched brackets:            %14ld\n",cnt_brack);

   532         if (cnt_bin)

   533 	    printf("    Non-ASCII characters:          %14ld\n",cnt_bin);

   534         if (cnt_odd)

   535 	    printf("    Proofing characters:           %14ld\n",cnt_odd);

   536         if (cnt_punct)

   537 	    printf("    Punctuation & spacing queries: %14ld\n",cnt_punct);

   538         if (cnt_dash)

   539 	    printf("    Non-standard dashes:           %14ld\n",cnt_dash);

   540         if (cnt_html)

   541 	    printf("    Possible HTML tags:            %14ld\n",cnt_html);

   542         printf("\n");

   543         printf("    TOTAL QUERIES                  %14ld\n",

   544           cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+

   545           cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend);

   546     }

   547     return 0;

   548 }

   550 struct first_pass_results {

   551     long firstline,astline;

   552     long footerline,totlen,binlen,alphalen,endquote_count,shortline,dotcomma;

   553     long fslashline,hyphens,longline,verylongline,htmcount,standalone_digit;

   554     long spacedash,emdash,space_emdash,non_PG_space_emdash,PG_space_emdash;

   555     signed int Dutchcount,Frenchcount;

   556 };

   558 /*

   559  * first_pass:

   560  *

   561  * Run a first pass - verify that it's a valid PG

   562  * file, decide whether to report some things that

   563  * occur many times in the text like long or short

   564  * lines, non-standard dashes, etc.

   565  */

   566 struct first_pass_results *first_pass(FILE *infile)

   567 {

   568     char laststart=CHAR_SPACE,*s;

   569     signed int i,llen;

   570     unsigned int lastlen=0,lastblen=0;

   571     long spline=0,nspline=0;

   572     static struct first_pass_results results={0};

   573     char inword[MAXWORDLEN]="";

   574     while (fgets(aline,LINEBUFSIZE-1,infile))

   575     {

   576         while (aline[strlen(aline)-1]==10 || aline[strlen(aline)-1]==13)

   577 	    aline[strlen(aline)-1]=0;

   578         linecnt++;

   579         if (strstr(aline,"*END") && strstr(aline,"SMALL PRINT") &&

   580 	  (strstr(aline,"PUBLIC DOMAIN") || strstr(aline,"COPYRIGHT")))

   581 	{

   582             if (spline)

   583                 printf("   --> Duplicate header?\n");

   584             spline=linecnt+1;   /* first line of non-header text, that is */

   585 	}

   586         if (!strncmp(aline,"*** START",9) && strstr(aline,"PROJECT GUTENBERG"))

   587 	{

   588             if (nspline)

   589                 printf("   --> Duplicate header?\n");

   590             nspline=linecnt+1;   /* first line of non-header text, that is */

   591 	}

   592         if (spline || nspline)

   593 	{

   594             lowerit(aline);

   595             if (strstr(aline,"end") && strstr(aline,"project gutenberg"))

   596 	    {

   597                 if (strstr(aline,"end")<strstr(aline,"project gutenberg"))

   598 		{

   599                     if (results.footerline)

   600 		    {

   601 			/* it's an old-form header - we can detect duplicates */

   602                         if (!nspline)

   603                             printf("   --> Duplicate footer?\n");

   604 		    }

   605                     else

   606                         results.footerline=linecnt;

   607 		}

   608 	    }

   609 	}

   610         if (spline)

   611 	    results.firstline=spline;

   612         if (nspline)

   613 	    results.firstline=nspline;  /* override with new */

   614         if (results.footerline)

   615 	    continue;    /* don't count the boilerplate in the footer */

   616         llen=strlen(aline);

   617         results.totlen+=llen;

   618         for (i=0;i<llen;i++)

   619 	{

   620             if ((unsigned char)aline[i]>127)

   621 		results.binlen++;

   622             if (gcisalpha(aline[i]))

   623 		results.alphalen++;

   624             if (i>0 && aline[i]==CHAR_DQUOTE && isalpha(aline[i-1]))

   625 		results.endquote_count++;

   626 	}

   627         if (strlen(aline)>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE &&

   628 	  lastblen>2 && lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)

   629 	    results.shortline++;

   630         if (*aline && (unsigned char)aline[strlen(aline)-1]<=CHAR_SPACE)

   631 	    cnt_spacend++;

   632         if (strstr(aline,".,"))

   633 	    results.dotcomma++;

   634         /* only count ast lines for ignoring purposes where there is */

   635         /* locase text on the line */

   636         if (strstr(aline,"*"))

   637 	{

   638             for (s=aline;*s;s++)

   639                 if (*s>='a' && *s<='z')

   640                     break;

   641              if (*s)

   642 		results.astline++;

   643 	}

   644         if (strstr(aline,"/"))

   645             results.fslashline++;

   646         for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--)

   647 	    ;

   648         if (aline[i]=='-' && aline[i-1]!='-')

   649 	    results.hyphens++;

   650         if (llen>LONGEST_PG_LINE)

   651 	    results.longline++;

   652         if (llen>WAY_TOO_LONG)

   653 	    results.verylongline++;

   654         if (strstr(aline,"<") && strstr(aline,">"))

   655 	{

   656             i=(signed int)(strstr(aline,">")-strstr(aline,"<")+1);

   657             if (i>0)

   658                 results.htmcount++;

   659             if (strstr(aline,"<i>"))

   660 		results.htmcount+=4; /* bonus marks! */

   661 	}

   662         /* Check for spaced em-dashes */

   663         if (strstr(aline,"--"))

   664 	{

   665             results.emdash++;

   666             if (*(strstr(aline,"--")-1)==CHAR_SPACE ||

   667                (*(strstr(aline,"--")+2)==CHAR_SPACE))

   668 		results.space_emdash++;

   669             if (*(strstr(aline,"--")-1)==CHAR_SPACE &&

   670                (*(strstr(aline,"--")+2)==CHAR_SPACE))

   671 		/* count of em-dashes with spaces both sides */

   672 		results.non_PG_space_emdash++;

   673             if (*(strstr(aline,"--")-1)!=CHAR_SPACE &&

   674                (*(strstr(aline,"--")+2)!=CHAR_SPACE))

   675 		/* count of PG-type em-dashes with no spaces */

   676 		results.PG_space_emdash++;

   677 	}

   678         for (s=aline;*s;)

   679 	{

   680             s=getaword(s,inword);

   681             if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))

   682                 results.Dutchcount++;

   683             if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))

   684                 results.Frenchcount++;

   685             if (!strcmp(inword,"0") || !strcmp(inword,"1"))

   686                 results.standalone_digit++;

   687 	}

   688         /* Check for spaced dashes */

   689         if (strstr(aline," -") && *(strstr(aline," -")+2)!='-')

   690 	    results.spacedash++;

   691         lastblen=lastlen;

   692         lastlen=strlen(aline);

   693         laststart=aline[0];

   694     }

   695     return &results;

   696 }

   698 struct warnings {

   699     signed int shortline,longline,bin,dash,dotcomma,ast,fslash,digit,hyphen;

   700     signed int endquote,isDutch,isFrench;

   701 };

   703 /*

   704  * report_first_pass:

   705  *

   706  * Make some snap decisions based on the first pass results.

   707  */

   708 struct warnings *report_first_pass(struct first_pass_results *results)

   709 {

   710     static struct warnings warnings={0};

   711     if (cnt_spacend>0)

   712         printf("   --> %ld lines in this file have white space at end\n",

   713 	  cnt_spacend);

   714     warnings.dotcomma=1;

   715     if (results->dotcomma>5)

   716     {

   717         warnings.dotcomma=0;

   718         printf("   --> %ld lines in this file contain '.,'. "

   719 	  "Not reporting them.\n",results->dotcomma);

   720     }

   721     /*

   722      * If more than 50 lines, or one-tenth, are short,

   723      * don't bother reporting them.

   724      */

   725     warnings.shortline=1;

   726     if (results->shortline>50 || results->shortline*10>linecnt)

   727     {

   728         warnings.shortline=0;

   729         printf("   --> %ld lines in this file are short. "

   730 	  "Not reporting short lines.\n",results->shortline);

   731     }

   732     /*

   733      * If more than 50 lines, or one-tenth, are long,

   734      * don't bother reporting them.

   735      */

   736     warnings.longline=1;

   737     if (results->longline>50 || results->longline*10>linecnt)

   738     {

   739         warnings.longline=0;

   740         printf("   --> %ld lines in this file are long. "

   741 	  "Not reporting long lines.\n",results->longline);

   742     }

   743     /* If more than 10 lines contain asterisks, don't bother reporting them. */

   744     warnings.ast=1;

   745     if (results->astline>10)

   746     {

   747         warnings.ast=0;

   748         printf("   --> %ld lines in this file contain asterisks. "

   749 	  "Not reporting them.\n",results->astline);

   750     }

   751     /*

   752      * If more than 10 lines contain forward slashes,

   753      * don't bother reporting them.

   754      */

   755     warnings.fslash=1;

   756     if (results->fslashline>10)

   757     {

   758         warnings.fslash=0;

   759         printf("   --> %ld lines in this file contain forward slashes. "

   760 	  "Not reporting them.\n",results->fslashline);

   761     }

   762     /*

   763      * If more than 20 lines contain unpunctuated endquotes,

   764      * don't bother reporting them.

   765      */

   766     warnings.endquote=1;

   767     if (results->endquote_count>20)

   768     {

   769         warnings.endquote=0;

   770         printf("   --> %ld lines in this file contain unpunctuated endquotes. "

   771 	  "Not reporting them.\n",results->endquote_count);

   772     }

   773     /*

   774      * If more than 15 lines contain standalone digits,

   775      * don't bother reporting them.

   776      */

   777     warnings.digit=1;

   778     if (results->standalone_digit>10)

   779     {

   780         warnings.digit=0;

   781         printf("   --> %ld lines in this file contain standalone 0s and 1s. "

   782 	  "Not reporting them.\n",results->standalone_digit);

   783     }

   784     /*

   785      * If more than 20 lines contain hyphens at end,

   786      * don't bother reporting them.

   787      */

   788     warnings.hyphen=1;

   789     if (results->hyphens>20)

   790     {

   791         warnings.hyphen=0;

   792         printf("   --> %ld lines in this file have hyphens at end. "

   793 	  "Not reporting them.\n",results->hyphens);

   794     }

   795     if (results->htmcount>20 && !pswit[MARKUP_SWITCH])

   796     {

   797         printf("   --> Looks like this is HTML. Switching HTML mode ON.\n");

   798         pswit[MARKUP_SWITCH]=1;

   799     }

   800     if (results->verylongline>0)

   801         printf("   --> %ld lines in this file are VERY long!\n",

   802 	  results->verylongline);

   803     /*

   804      * If there are more non-PG spaced dashes than PG em-dashes,

   805      * assume it's deliberate.

   806      * Current PG guidelines say don't use them, but older texts do,

   807      * and some people insist on them whatever the guidelines say.

   808      */

   809     warnings.dash=1;

   810     if (results->spacedash+results->non_PG_space_emdash>

   811       results->PG_space_emdash)

   812     {

   813         warnings.dash=0;

   814         printf("   --> There are %ld spaced dashes and em-dashes. "

   815 	  "Not reporting them.\n",

   816 	  results->spacedash+results->non_PG_space_emdash);

   817     }

   818     /* If more than a quarter of characters are hi-bit, bug out. */

   819     warnings.bin=1;

   820     if (results->binlen*4>results->totlen)

   821     {

   822         printf("   --> This file does not appear to be ASCII. "

   823 	  "Terminating. Best of luck with it!\n");

   824         exit(1);

   825     }

   826     if (results->alphalen*4<results->totlen)

   827     {

   828         printf("   --> This file does not appear to be text. "

   829 	  "Terminating. Best of luck with it!\n");

   830         exit(1);

   831     }

   832     if (results->binlen*100>results->totlen || results->binlen>100)

   833     {

   834         printf("   --> There are a lot of foreign letters here. "

   835 	  "Not reporting them.\n");

   836         warnings.bin=0;

   837     }

   838     warnings.isDutch=0;

   839     if (results->Dutchcount>50)

   840     {

   841         warnings.isDutch=1;

   842         printf("   --> This looks like Dutch - "

   843 	  "switching off dashes and warnings for 's Middags case.\n");

   844     }

   845     warnings.isFrench=0;

   846     if (results->Frenchcount>50)

   847     {

   848         warnings.isFrench=1;

   849         printf("   --> This looks like French - "

   850 	  "switching off some doublepunct.\n");

   851     }

   852     if (results->firstline && results->footerline)

   853         printf("    The PG header and footer appear to be already on.\n");

   854     else

   855     {

   856         if (results->firstline)

   857             printf("    The PG header is on - no footer.\n");

   858         if (results->footerline)

   859             printf("    The PG footer is on - no header.\n");

   860     }

   861     printf("\n");

   862     if (pswit[VERBOSE_SWITCH])

   863     {

   864         warnings.bin=1;

   865         warnings.shortline=1;

   866         warnings.dotcomma=1;

   867         warnings.longline=1;

   868         warnings.dash=1;

   869         warnings.digit=1;

   870         warnings.ast=1;

   871         warnings.fslash=1;

   872         warnings.hyphen=1;

   873         warnings.endquote=1;

   874         printf("   *** Verbose output is ON -- you asked for it! ***\n");

   875     }

   876     if (warnings.isDutch)

   877         warnings.dash=0;

   878     if (results->footerline>0 && results->firstline>0 &&

   879       results->footerline>results->firstline &&

   880       results->footerline-results->firstline<100)

   881     {

   882         printf("   --> I don't really know where this text starts. \n");

   883         printf("       There are no reference points.\n");

   884         printf("       I'm going to have to report the header and footer "

   885 	  "as well.\n");

   886         results->firstline=0;

   887     }

   888     return &warnings;

   889 }

   891 struct counters {

   892     long quot;

   893     signed int c_unders,c_brack,s_brack,r_brack;

   894     signed int open_single_quote,close_single_quote;

   895 };

   897 /*

   898  * analyse_quotes:

   899  *

   900  * Look along the line, accumulate the count of quotes, and see

   901  * if this is an empty line - i.e. a line with nothing on it

   902  * but spaces.

   903  * If line has just spaces, period, * and/or - on it, don't

   904  * count it, since empty lines with asterisks or dashes to

   905  * separate sections are common.

   906  *

   907  * Returns: Non-zero if the line is empty.

   908  */

   909 int analyse_quotes(const char *s,struct counters *counters)

   910 {

   911     signed int guessquote=0;

   912     int isemptyline=1;    /* assume the line is empty until proven otherwise */

   913     while (*s)

   914     {

   915 	if (*s==CHAR_DQUOTE)

   916 	    counters->quot++;

   917 	if (*s==CHAR_SQUOTE || *s==CHAR_OPEN_SQUOTE)

   918 	{

   919 	    if (s==aline)

   920 	    {

   921 		/*

   922 		 * At start of line, it can only be an openquote.

   923 		 * Hardcode a very common exception!

   924 		 */

   925 		if (strncmp(s+2,"tis",3) && strncmp(s+2,"Tis",3))

   926 		    counters->open_single_quote++;

   927 	    }

   928 	    else if (gcisalpha(s[-1]) && gcisalpha(s[1]))

   929 		/* Do nothing! it's definitely an apostrophe, not a quote */

   930 		;

   931 	    /* it's outside a word - let's check it out */

   932 	    else if (*s==CHAR_OPEN_SQUOTE || gcisalpha(s[1]))

   933 	    {

   934 		/* it damwell better BE an openquote */

   935 		if (strncmp(s+1,"tis",3) && strncmp(s+1,"Tis",3))

   936 		    /* hardcode a very common exception! */

   937 		    counters->open_single_quote++;

   938 	    }

   939 	    else

   940 	    {

   941 		/* now - is it a closequote? */

   942 		guessquote=0;   /* accumulate clues */

   943 		if (gcisalpha(s[-1]))

   944 		{

   945 		    /* it follows a letter - could be either */

   946 		    guessquote++;

   947 		    if (s[-1]=='s')

   948 		    {

   949 			/* looks like a plural apostrophe */

   950 			guessquote-=3;

   951 			if (s[1]==CHAR_SPACE)  /* bonus marks! */

   952 			    guessquote-=2;

   953 		    }

   954 		}

   955 		/* it doesn't have a letter either side */

   956 		else if (strchr(".?!,;:",s[-1]) && strchr(".?!,;: ",s[1]))

   957 		    guessquote+=8; /* looks like a closequote */

   958 		else

   959 		    guessquote++;

   960 		if (counters->open_single_quote>counters->close_single_quote)

   961 		    /*

   962 		     * Give it the benefit of some doubt,

   963 		     * if a squote is already open.

   964 		     */

   965 		    guessquote++;

   966 		else

   967 		    guessquote--;

   968 		if (guessquote>=0)

   969 		    counters->close_single_quote++;

   970 	    }

   971 	}

   972 	if (*s!=CHAR_SPACE && *s!='-' && *s!='.' && *s!=CHAR_ASTERISK &&

   973 	  *s!=13 && *s!=10)

   974 	    isemptyline=0;  /* ignore lines like  *  *  *  as spacers */

   975 	if (*s==CHAR_UNDERSCORE)

   976 	    counters->c_unders++;

   977 	if (*s==CHAR_OPEN_CBRACK)

   978 	    counters->c_brack++;

   979 	if (*s==CHAR_CLOSE_CBRACK)

   980 	    counters->c_brack--;

   981 	if (*s==CHAR_OPEN_RBRACK)

   982 	    counters->r_brack++;

   983 	if (*s==CHAR_CLOSE_RBRACK)

   984 	    counters->r_brack--;

   985 	if (*s==CHAR_OPEN_SBRACK)

   986 	    counters->s_brack++;

   987 	if (*s==CHAR_CLOSE_SBRACK)

   988 	    counters->s_brack--;

   989 	s++;

   990     }

   991     return isemptyline;

   992 }

   994 /*

   995  * check_for_odd_characters:

   996  *

   997  * Check for binary and other odd characters.

   998  */

   999 void check_for_odd_characters(const char *aline,const struct warnings *warnings,

  1000   int isemptyline)

  1001 {

  1002     /* Don't repeat multiple warnings on one line. */

  1003     signed int eNon_A=0,eTab=0,eTilde=0,eCarat=0,eFSlash=0,eAst=0;

  1004     const char *s;

  1005     unsigned char c;

  1006     for (s=aline;*s;s++)

  1007     {

  1008 	c=*(unsigned char *)s;

  1009 	if (!eNon_A && (*s<CHAR_SPACE && *s!=9 && *s!='\n' || c>127))

  1010 	{

  1011 	    if (pswit[ECHO_SWITCH])

  1012 		printf("\n%s\n",aline);

  1013 	    if (!pswit[OVERVIEW_SWITCH])

  1014 		if (c>127 && c<160)

  1015 		    printf("    Line %ld column %d - "

  1016 		      "Non-ISO-8859 character %d\n",linecnt,(int)(s-aline)+1,c);

  1017 		else

  1018 		    printf("    Line %ld column %d - Non-ASCII character %d\n",

  1019 		      linecnt,(int)(s-aline)+1,c);

  1020 	    else

  1021 		cnt_bin++;

  1022 	    eNon_A=1;

  1023 	}

  1024 	if (!eTab && *s==CHAR_TAB)

  1025 	{

  1026 	    if (pswit[ECHO_SWITCH])

  1027 		printf("\n%s\n",aline);

  1028 	    if (!pswit[OVERVIEW_SWITCH])

  1029 		printf("    Line %ld column %d - Tab character?\n",

  1030 		  linecnt,(int)(s-aline)+1);

  1031 	    else

  1032 		cnt_odd++;

  1033 	    eTab=1;

  1034 	}

  1035 	if (!eTilde && *s==CHAR_TILDE)

  1036 	{

  1037 	    /*

  1038 	     * Often used by OCR software to indicate an

  1039 	     * unrecognizable character.

  1040 	     */

  1041 	    if (pswit[ECHO_SWITCH])

  1042 		printf("\n%s\n",aline);

  1043 	    if (!pswit[OVERVIEW_SWITCH])

  1044 		printf("    Line %ld column %d - Tilde character?\n",

  1045 		  linecnt,(int)(s-aline)+1);

  1046 	    else

  1047 		cnt_odd++;

  1048 	    eTilde=1;

  1049 	}

  1050 	if (!eCarat && *s==CHAR_CARAT)

  1051 	{

  1052 	    if (pswit[ECHO_SWITCH])

  1053 		printf("\n%s\n",aline);

  1054 	    if (!pswit[OVERVIEW_SWITCH])

  1055 		printf("    Line %ld column %d - Carat character?\n",

  1056 		  linecnt,(int)(s-aline)+1);

  1057 	    else

  1058 		cnt_odd++;

  1059 	    eCarat=1;

  1060 	}

  1061 	if (!eFSlash && *s==CHAR_FORESLASH && warnings->fslash)

  1062 	{

  1063 	    if (pswit[ECHO_SWITCH])

  1064 		printf("\n%s\n",aline);

  1065 	    if (!pswit[OVERVIEW_SWITCH])

  1066 		printf("    Line %ld column %d - Forward slash?\n",

  1067 		  linecnt,(int)(s-aline)+1);

  1068 	    else

  1069 		cnt_odd++;

  1070 	    eFSlash=1;

  1071 	}

  1072 	/*

  1073 	 * Report asterisks only in paranoid mode,

  1074 	 * since they're often deliberate.

  1075 	 */

  1076 	if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&

  1077 	  *s==CHAR_ASTERISK)

  1078 	{

  1079 	    if (pswit[ECHO_SWITCH])

  1080 		printf("\n%s\n",aline);

  1081 	    if (!pswit[OVERVIEW_SWITCH])

  1082 		printf("    Line %ld column %d - Asterisk?\n",

  1083 		  linecnt,(int)(s-aline)+1);

  1084 	    else

  1085 		cnt_odd++;

  1086 	    eAst=1;

  1087 	}

  1088     }

  1089 }

  1091 /*

  1092  * check_for_long_line:

  1093  *

  1094  * Check for line too long.

  1095  */

  1096 void check_for_long_line(const char *aline)

  1097 {

  1098     if (strlen(aline)>LONGEST_PG_LINE)

  1099     {

  1100 	if (pswit[ECHO_SWITCH])

  1101 	    printf("\n%s\n",aline);

  1102 	if (!pswit[OVERVIEW_SWITCH])

  1103 	    printf("    Line %ld column %d - Long line %d\n",

  1104 	      linecnt,strlen(aline),strlen(aline));

  1105 	else

  1106 	    cnt_long++;

  1107     }

  1108 }

  1110 struct line_properties {

  1111     unsigned int len,blen;

  1112     char start;

  1113 };

  1115 /*

  1116  * check_for_short_line:

  1117  *

  1118  * Check for line too short.

  1119  *

  1120  * This one is a bit trickier to implement: we don't want to

  1121  * flag the last line of a paragraph for being short, so we

  1122  * have to wait until we know that our current line is a

  1123  * "normal" line, then report the _previous_ line if it was too

  1124  * short. We also don't want to report indented lines like

  1125  * chapter heads or formatted quotations. We therefore keep

  1126  * last->len as the length of the last line examined, and

  1127  * last->blen as the length of the last but one, and try to

  1128  * suppress unnecessary warnings by checking that both were of

  1129  * "normal" length. We keep the first character of the last

  1130  * line in last->start, and if it was a space, we assume that

  1131  * the formatting is deliberate. I can't figure out a way to

  1132  * distinguish something like a quoted verse left-aligned or

  1133  * the header or footer of a letter from a paragraph of short

  1134  * lines - maybe if I examined the whole paragraph, and if the

  1135  * para has less than, say, 8 lines and if all lines are short,

  1136  * then just assume it's OK? Need to look at some texts to see

  1137  * how often a formula like this would get the right result.

  1138  */

  1139 void check_for_short_line(const char *aline,const struct line_properties *last)

  1140 {

  1141     if (strlen(aline)>1 && last->len>1 && last->len<SHORTEST_PG_LINE &&

  1142       last->blen>1 && last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)

  1143     {

  1144 	if (pswit[ECHO_SWITCH])

  1145 	    printf("\n%s\n",prevline);

  1146 	if (!pswit[OVERVIEW_SWITCH])

  1147 	    printf("    Line %ld column %d - Short line %d?\n",

  1148 	      linecnt-1,strlen(prevline),strlen(prevline));

  1149 	else

  1150 	    cnt_short++;

  1151     }

  1152 }

  1154 /*

  1155  * check_for_starting_punctuation:

  1156  *

  1157  * Look for punctuation other than full ellipses at start of line.

  1158  */

  1159 void check_for_starting_punctuation(const char *aline)

  1160 {

  1161     if (*aline && strchr(".?!,;:",aline[0]) && strncmp(". . .",aline,5))

  1162     {

  1163 	if (pswit[ECHO_SWITCH])

  1164 	    printf("\n%s\n",aline);

  1165 	if (!pswit[OVERVIEW_SWITCH])

  1166 	    printf("    Line %ld column 1 - Begins with punctuation?\n",

  1167 	      linecnt);

  1168 	else

  1169 	    cnt_punct++;

  1170     }

  1171 }

  1173 /*

  1174  * check_for_spaced_emdash:

  1175  *

  1176  * Check for spaced em-dashes.

  1177  *

  1178  * We must check _all_ occurrences of "--" on the line

  1179  * hence the loop - even if the first double-dash is OK

  1180  * there may be another that's wrong later on.

  1181  */

  1182 void check_for_spaced_emdash(const char *aline)

  1183 {

  1184     const char *s,*t;

  1185     s=aline;

  1186     while ((t=strstr(s,"--")))

  1187     {

  1188 	if (t>aline && t[-1]==CHAR_SPACE || t[2]==CHAR_SPACE)

  1189 	{

  1190 	    if (pswit[ECHO_SWITCH])

  1191 		printf("\n%s\n",aline);

  1192 	    if (!pswit[OVERVIEW_SWITCH])

  1193 		printf("    Line %ld column %d - Spaced em-dash?\n",

  1194 		  linecnt,(int)(t-aline)+1);

  1195 	    else

  1196 		cnt_dash++;

  1197 	}

  1198 	s=t+2;

  1199     }

  1200 }

  1202 /*

  1203  * check_for_spaced_dash:

  1204  *

  1205  * Check for spaced dashes.

  1206  */

  1207 void check_for_spaced_dash(const char *aline)

  1208 {

  1209     const char *s;

  1210     if ((s=strstr(aline," -")))

  1211     {

  1212 	if (s[2]!='-')

  1213 	{

  1214 	    if (pswit[ECHO_SWITCH])

  1215 		printf("\n%s\n",aline);

  1216 	    if (!pswit[OVERVIEW_SWITCH])

  1217 		printf("    Line %ld column %d - Spaced dash?\n",

  1218 		  linecnt,(int)(s-aline)+1);

  1219 	    else

  1220 		cnt_dash++;

  1221 	}

  1222     }

  1223     else if ((s=strstr(aline,"- ")))

  1224     {

  1225 	if (s==aline || s[-1]!='-')

  1226 	{

  1227 	    if (pswit[ECHO_SWITCH])

  1228 		printf("\n%s\n",aline);

  1229 	    if (!pswit[OVERVIEW_SWITCH])

  1230 		printf("    Line %ld column %d - Spaced dash?\n",

  1231 		  linecnt,(int)(s-aline)+1);

  1232 	    else

  1233 		cnt_dash++;

  1234 	}

  1235     }

  1236 }

  1238 /*

  1239  * check_for_unmarked_paragraphs:

  1240  *

  1241  * Check for unmarked paragraphs indicated by separate speakers.

  1242  *

  1243  * May well be false positive:

  1244  * "Bravo!" "Wonderful!" called the crowd.

  1245  * but useful all the same.

  1246  */

  1247 void check_for_unmarked_paragraphs(const char *aline)

  1248 {

  1249     const char *s;

  1250     s=strstr(aline,"\"  \"");

  1251     if (!s)

  1252 	s=strstr(aline,"\" \"");

  1253     if (s)

  1254     {

  1255 	if (pswit[ECHO_SWITCH])

  1256 	    printf("\n%s\n",aline);

  1257 	if (!pswit[OVERVIEW_SWITCH])

  1258 	    printf("    Line %ld column %d - Query missing paragraph break?\n",

  1259 	      linecnt,(int)(s-aline)+1);

  1260 	else

  1261 	    cnt_punct++;

  1262     }

  1263 }

  1265 /*

  1266  * check_for_jeebies:

  1267  *

  1268  * Check for "to he" and other easy h/b errors.

  1269  *

  1270  * This is a very inadequate effort on the h/b problem,

  1271  * but the phrase "to he" is always an error, whereas "to

  1272  * be" is quite common.

  1273  * Similarly, '"Quiet!", be said.' is a non-be error

  1274  * "to he" is _not_ always an error!:

  1275  *       "Where they went to he couldn't say."

  1276  * Another false positive:

  1277  *       What would "Cinderella" be without the . . .

  1278  * and another: "If he wants to he can see for himself."

  1279  */

  1280 void check_for_jeebies(const char *aline)

  1281 {

  1282     const char *s;

  1283     s=strstr(aline," be could ");

  1284     if (!s)

  1285 	s=strstr(aline," be would ");

  1286     if (!s)

  1287 	s=strstr(aline," was be ");

  1288     if (!s)

  1289 	s=strstr(aline," be is ");

  1290     if (!s)

  1291 	s=strstr(aline," is be ");

  1292     if (!s)

  1293 	s=strstr(aline,"\", be ");

  1294     if (!s)

  1295 	s=strstr(aline,"\" be ");

  1296     if (!s)

  1297 	s=strstr(aline,"\" be ");

  1298     if (!s)

  1299 	s=strstr(aline," to he ");

  1300     if (s)

  1301     {

  1302 	if (pswit[ECHO_SWITCH])

  1303 	    printf("\n%s\n",aline);

  1304 	if (!pswit[OVERVIEW_SWITCH])

  1305 	    printf("    Line %ld column %d - Query he/be error?\n",

  1306 	      linecnt,(int)(s-aline)+1);

  1307 	else

  1308 	    cnt_word++;

  1309     }

  1310     s=strstr(aline," the had ");

  1311     if (!s)

  1312 	s=strstr(aline," a had ");

  1313     if (!s)

  1314 	s=strstr(aline," they bad ");

  1315     if (!s)

  1316 	s=strstr(aline," she bad ");

  1317     if (!s)

  1318 	s=strstr(aline," he bad ");

  1319     if (!s)

  1320 	s=strstr(aline," you bad ");

  1321     if (!s)

  1322 	s=strstr(aline," i bad ");

  1323     if (s)

  1324     {

  1325 	if (pswit[ECHO_SWITCH])

  1326 	    printf("\n%s\n",aline);

  1327 	if (!pswit[OVERVIEW_SWITCH])

  1328 	    printf("    Line %ld column %d - Query had/bad error?\n",

  1329 	      linecnt,(int)(s-aline)+1);

  1330 	else

  1331 	    cnt_word++;

  1332     }

  1333     s=strstr(aline,"; hut ");

  1334     if (!s)

  1335 	s=strstr(aline,", hut ");

  1336     if (s)

  1337     {

  1338 	if (pswit[ECHO_SWITCH])

  1339 	    printf("\n%s\n",aline);

  1340 	if (!pswit[OVERVIEW_SWITCH])

  1341 	    printf("    Line %ld column %d - Query hut/but error?\n",

  1342 	      linecnt,(int)(s-aline)+1);

  1343 	else

  1344 	    cnt_word++;

  1345     }

  1346 }

  1348 /*

  1349  * check_for_mta_from:

  1350  *

  1351  * Special case - angled bracket in front of "From" placed there by an

  1352  * MTA when sending an e-mail.

  1353  */

  1354 void check_for_mta_from(const char *aline)

  1355 {

  1356     const char *s;

  1357     s=strstr(aline,">From");

  1358     if (s)

  1359     {

  1360 	if (pswit[ECHO_SWITCH])

  1361 	    printf("\n%s\n",aline);

  1362 	if (!pswit[OVERVIEW_SWITCH])

  1363 	    printf("    Line %ld column %d - Query angled bracket with From\n",

  1364 	      linecnt,(int)(s-aline)+1);

  1365 	else

  1366 	    cnt_punct++;

  1367     }

  1368 }

  1370 /*

  1371  * check_for_orphan_character:

  1372  *

  1373  * Check for a single character line -

  1374  * often an overflow from bad wrapping.

  1375  */

  1376 void check_for_orphan_character(const char *aline)

  1377 {

  1378     if (*aline && !aline[1])

  1379     {

  1380 	if (*aline=='I' || *aline=='V' || *aline=='X' || *aline=='L' ||

  1381 	  gcisdigit(*aline))

  1382 	    ; /* Nothing - ignore numerals alone on a line. */

  1383 	else

  1384 	{

  1385 	    if (pswit[ECHO_SWITCH])

  1386 		printf("\n%s\n",aline);

  1387 	    if (!pswit[OVERVIEW_SWITCH])

  1388 		printf("    Line %ld column 1 - Query single character line\n",

  1389 		  linecnt);

  1390 	    else

  1391 		cnt_punct++;

  1392 	}

  1393     }

  1394 }

  1396 /*

  1397  * procfile:

  1398  *

  1399  * Process one file.

  1400  */

  1401 void procfile(char *filename)

  1402 {

  1403     char *s,*t,*s1,*wordstart;

  1404     char inword[MAXWORDLEN],testword[MAXWORDLEN];

  1405     char parastart[81];     /* first line of current para */

  1406     FILE *infile;

  1407     struct first_pass_results *first_pass_results;

  1408     struct warnings *warnings;

  1409     struct counters counters={0};

  1410     struct line_properties last={0};

  1411     int isemptyline;

  1412     long squot,start_para_line;

  1413     signed int i,j,llen,isacro,isellipsis,istypo,alower;

  1414     signed int dquotepar,squotepar;

  1415     signed int isnewpara,vowel,consonant;

  1416     char dquote_err[80],squote_err[80],rbrack_err[80],sbrack_err[80],

  1417       cbrack_err[80],unders_err[80];

  1418     signed int qword_index,qperiod_index,isdup;

  1419     signed int enddash;

  1420     last.start=CHAR_SPACE;

  1421     *dquote_err=*squote_err=*rbrack_err=*cbrack_err=*sbrack_err=

  1422       *unders_err=*prevline=0;

  1423     linecnt=checked_linecnt=start_para_line=0;

  1424     squot=0;

  1425     i=llen=isacro=isellipsis=istypo=0;

  1426     isnewpara=vowel=consonant=enddash=0;

  1427     qword_index=qperiod_index=isdup=0;

  1428     *inword=*testword=0;

  1429     dquotepar=squotepar=0;

  1430     for (j=0;j<MAX_QWORD;j++)

  1431     {

  1432         dupcnt[j]=0;

  1433         for (i=0;i<MAX_QWORD_LENGTH;i++)

  1434 	{

  1435             qword[i][j]=0;

  1436             qperiod[i][j]=0;

  1437 	}

  1438     }

  1439     infile=fopen(filename,"rb");

  1440     if (!infile)

  1441     {

  1442         if (pswit[STDOUT_SWITCH])

  1443             fprintf(stdout,"bookloupe: cannot open %s\n",filename);

  1444         else

  1445             fprintf(stderr,"bookloupe: cannot open %s\n",filename);

  1446 	exit(1);

  1447     }

  1448     fprintf(stdout,"\n\nFile: %s\n\n",filename);

  1449     first_pass_results=first_pass(infile);

  1450     warnings=report_first_pass(first_pass_results);

  1451     rewind(infile);

  1452     /*

  1453      * Here we go with the main pass. Hold onto yer hat!

  1454      * Re-init some variables we've dirtied.

  1455      */

  1456     squot=linecnt=0;

  1457     while (flgets(aline,LINEBUFSIZE-1,infile,linecnt+1))

  1458     {

  1459         linecnt++;

  1460         if (linecnt==1)

  1461 	    isnewpara=1;

  1462         if (pswit[DP_SWITCH] && !strncmp(aline,"-----File: ",11))

  1463 	    continue;    // skip DP page separators completely

  1464         if (linecnt<first_pass_results->firstline ||

  1465 	  (first_pass_results->footerline>0 &&

  1466 	  linecnt>first_pass_results->footerline))

  1467 	{

  1468             if (pswit[HEADER_SWITCH])

  1469 	    {

  1470                 if (!strncmp(aline,"Title:",6))

  1471                     printf("    %s\n",aline);

  1472                 if (!strncmp(aline,"Author:",7))

  1473                     printf("    %s\n",aline);

  1474                 if (!strncmp(aline,"Release Date:",13))

  1475                     printf("    %s\n",aline);

  1476                 if (!strncmp(aline,"Edition:",8))

  1477                     printf("    %s\n\n",aline);

  1478 	    }

  1479             continue;                /* skip through the header */

  1480 	}

  1481         checked_linecnt++;

  1482         s=aline;

  1483         /*

  1484 	 * If we are in a state of unbalanced quotes, and this line

  1485          * doesn't begin with a quote, output the stored error message.

  1486          * If the -P switch was used, print the warning even if the

  1487          * new para starts with quotes.

  1488 	 */

  1489         t=s;

  1490         while (*t==' ')

  1491 	    t++;

  1492         if (*dquote_err)

  1493             if (*t!=CHAR_DQUOTE || pswit[QPARA_SWITCH])

  1494 	    {

  1495                 if (!pswit[OVERVIEW_SWITCH])

  1496 		{

  1497                     if (pswit[ECHO_SWITCH])

  1498 			printf("\n%s\n",parastart);

  1499                     printf(dquote_err);

  1500 		}

  1501                 else

  1502                     cnt_dquot++;

  1503             }

  1504         if (*squote_err)

  1505 	{

  1506             if (*t!=CHAR_SQUOTE && *t!=CHAR_OPEN_SQUOTE ||

  1507 	      pswit[QPARA_SWITCH] || squot)

  1508 	    {

  1509                 if (!pswit[OVERVIEW_SWITCH])

  1510 		{

  1511                     if (pswit[ECHO_SWITCH])

  1512 			printf("\n%s\n",parastart);

  1513                     printf(squote_err);

  1514 		}

  1515                 else

  1516                     cnt_squot++;

  1517 	    }

  1518             squot=0;

  1519 	}

  1520         if (*rbrack_err)

  1521 	{

  1522             if (!pswit[OVERVIEW_SWITCH])

  1523 	    {

  1524                 if (pswit[ECHO_SWITCH])

  1525 		    printf("\n%s\n",parastart);

  1526                 printf(rbrack_err);

  1527 	    }

  1528             else

  1529                 cnt_brack++;

  1530 	}

  1531         if (*sbrack_err)

  1532 	{

  1533             if (!pswit[OVERVIEW_SWITCH])

  1534 	    {

  1535                 if (pswit[ECHO_SWITCH])

  1536 		    printf("\n%s\n",parastart);

  1537                 printf(sbrack_err);

  1538 	    }

  1539             else

  1540                 cnt_brack++;

  1541 	}

  1542         if (*cbrack_err)

  1543 	{

  1544             if (!pswit[OVERVIEW_SWITCH])

  1545 	    {

  1546                 if (pswit[ECHO_SWITCH])

  1547 		    printf("\n%s\n",parastart);

  1548                 printf(cbrack_err);

  1549 	    }

  1550             else

  1551                 cnt_brack++;

  1552 	}

  1553         if (*unders_err)

  1554 	{

  1555             if (!pswit[OVERVIEW_SWITCH])

  1556 	    {

  1557                 if (pswit[ECHO_SWITCH])

  1558 		    printf("\n%s\n",parastart);

  1559                 printf(unders_err);

  1560 	    }

  1561             else

  1562                 cnt_brack++;

  1563 	}

  1564         *dquote_err=*squote_err=*rbrack_err=*cbrack_err=

  1565 	  *sbrack_err=*unders_err=0;

  1566 	isemptyline=analyse_quotes(aline,&counters);

  1567         if (isnewpara && !isemptyline)

  1568 	{

  1569 	    /* This line is the start of a new paragraph. */

  1570             start_para_line=linecnt;

  1571 	    /* Capture its first line in case we want to report it later. */

  1572             strncpy(parastart,aline,80);

  1573             parastart[79]=0;

  1574             dquotepar=squotepar=0; /* restart the quote count */

  1575             s=aline;

  1576             while (!gcisalpha(*s) && !gcisdigit(*s) && *s)

  1577 		s++;

  1578             if (*s>='a' && *s<='z')

  1579 	    {

  1580 		/* and its first letter is lowercase */

  1581                 if (pswit[ECHO_SWITCH])

  1582 		    printf("\n%s\n",aline);

  1583                 if (!pswit[OVERVIEW_SWITCH])

  1584                     printf("    Line %ld column %d - "

  1585 		      "Paragraph starts with lower-case\n",

  1586 		      linecnt,(int)(s-aline)+1);

  1587                 else

  1588                     cnt_punct++;

  1589 	    }

  1590             isnewpara=0; /* Signal the end of new para processing. */

  1591 	}

  1592         /* Check for an em-dash broken at line end. */

  1593         if (enddash && *aline=='-')

  1594 	{

  1595             if (pswit[ECHO_SWITCH])

  1596 		printf("\n%s\n",aline);

  1597             if (!pswit[OVERVIEW_SWITCH])

  1598                 printf("    Line %ld column 1 - Broken em-dash?\n",linecnt);

  1599             else

  1600                 cnt_punct++;

  1601 	}

  1602         enddash=0;

  1603         for (s=aline+strlen(aline)-1;*s==' ' && s>aline;s--)

  1604 	    ;

  1605         if (s>=aline && *s=='-')

  1606             enddash=1;

  1607 	/*

  1608          * Check for invalid or questionable characters in the line

  1609          * Anything above 127 is invalid for plain ASCII, and

  1610          * non-printable control characters should also be flagged.

  1611          * Tabs should generally not be there.

  1612 	 */

  1613         for (s=aline;*s;s++)

  1614 	{

  1615             i=(unsigned char)*s;

  1616             if (i<CHAR_SPACE && i!=CHAR_LF && i!=CHAR_CR && i!=CHAR_TAB)

  1617 	    {

  1618                 if (pswit[ECHO_SWITCH])

  1619 		    printf("\n%s\n",aline);

  1620                 if (!pswit[OVERVIEW_SWITCH])

  1621                     printf("    Line %ld column %d - Control character %d\n",

  1622 		      linecnt,(int)(s-aline)+1,i);

  1623                 else

  1624                     cnt_bin++;

  1625 	    }

  1626 	}

  1627         if (warnings->bin)

  1628 	    check_for_odd_characters(aline,warnings,isemptyline);

  1629         if (warnings->longline)

  1630 	    check_for_long_line(aline);

  1631         if (warnings->shortline)

  1632 	    check_for_short_line(aline,&last);

  1633         last.blen=last.len;

  1634         last.len=strlen(aline);

  1635         last.start=aline[0];

  1636 	check_for_starting_punctuation(aline);

  1637         if (warnings->dash)

  1638 	{

  1639 	    check_for_spaced_emdash(aline);

  1640 	    check_for_spaced_dash(aline);

  1641 	}

  1642 	check_for_unmarked_paragraphs(aline);

  1643 	check_for_jeebies(aline);

  1644 	check_for_mta_from(aline);

  1645 	check_for_orphan_character(aline);

  1646         /* Check for I" - often should be ! */

  1647         if (strstr(aline," I\""))

  1648 	{

  1649             if (pswit[ECHO_SWITCH])

  1650 		printf("\n%s\n",aline);

  1651             if (!pswit[OVERVIEW_SWITCH])

  1652                 printf("    Line %ld column %ld - Query I=exclamation mark?\n",

  1653 		  linecnt,strstr(aline," I\"")-aline);

  1654             else

  1655                 cnt_punct++;

  1656 	}

  1657         /*

  1658 	 * Check for period without a capital letter. Cut-down from gutspell.

  1659          * Only works when it happens on a single line.

  1660 	 */

  1661         if (pswit[PARANOID_SWITCH])

  1662 	{

  1663             for (t=s=aline;strstr(t,". ");)

  1664 	    {

  1665                 t=strstr(t,". ");

  1666                 if (t==s)

  1667 		{

  1668                     t++;

  1669 		    /* start of line punctuation is handled elsewhere */

  1670                     continue;

  1671 		}

  1672                 if (!gcisalpha(t[-1]))

  1673 		{

  1674                     t++;

  1675                     continue;

  1676 		}

  1677                 if (warnings->isDutch)

  1678 		{

  1679 		    /* For Frank & Jeroen -- 's Middags case */

  1680                     if (t[2]==CHAR_SQUOTE && t[3]>='a' && t[3]<='z' &&

  1681 		      t[4]==CHAR_SPACE && t[5]>='A' && t[5]<='Z')

  1682 		    {

  1683                         t++;

  1684                         continue;

  1685 		    }

  1686 		}

  1687                 s1=t+2;

  1688                 while (*s1 && !gcisalpha(*s1) && !isdigit(*s1))

  1689                     s1++;

  1690                 if (*s1>='a' && *s1<='z')

  1691 		{

  1692 		    /* we have something to investigate */

  1693                     istypo=1;

  1694 		    /* so let's go back and find out */

  1695                     for (s1=t-1;s1>=s &&

  1696 		      (gcisalpha(*s1) || gcisdigit(*s1) || *s1==CHAR_SQUOTE &&

  1697 		      gcisalpha(s1[1]) && gcisalpha(s1[-1]));s1--)

  1698 			;

  1699                     s1++;

  1700                     for (i=0;*s1 && *s1!='.';s1++,i++)

  1701                         testword[i]=*s1;

  1702                     testword[i]=0;

  1703                     for (i=0;*abbrev[i];i++)

  1704                         if (!strcmp(testword,abbrev[i]))

  1705                             istypo=0;

  1706                     if (gcisdigit(*testword))

  1707 			istypo=0;

  1708                     if (!testword[1])

  1709 			istypo=0;

  1710                     if (isroman(testword))

  1711 			istypo=0;

  1712                     if (istypo)

  1713 		    {

  1714                         istypo=0;

  1715                         for (i=0;testword[i];i++)

  1716                             if (strchr(vowels,testword[i]))

  1717                                 istypo=1;

  1718 		    }

  1719                     if (istypo)

  1720 		    {

  1721                         isdup=0;

  1722                         if (strlen(testword)<MAX_QWORD_LENGTH &&

  1723 			  !pswit[VERBOSE_SWITCH])

  1724                             for (i=0;i<qperiod_index;i++)

  1725                                 if (!strcmp(testword,qperiod[i]))

  1726                                     isdup=1;

  1727                         if (!isdup)

  1728 			{

  1729                             if (qperiod_index<MAX_QWORD &&

  1730 			      strlen(testword)<MAX_QWORD_LENGTH)

  1731 			    {

  1732                                 strcpy(qperiod[qperiod_index],testword);

  1733                                 qperiod_index++;

  1734 			    }

  1735                             if (pswit[ECHO_SWITCH])

  1736 				printf("\n%s\n",aline);

  1737                             if (!pswit[OVERVIEW_SWITCH])

  1738                                 printf("    Line %ld column %d - "

  1739 				  "Extra period?\n",linecnt,(int)(t-aline)+1);

  1740                             else

  1741                                 cnt_punct++;

  1742 			}

  1743 		    }

  1744 		}

  1745 	    t++;

  1746 	    }

  1747 	}

  1748         if (pswit[TYPO_SWITCH])

  1749 	{

  1750             /* Check for words usually not followed by punctuation. */

  1751             for (s=aline;*s;)

  1752 	    {

  1753                 wordstart=s;

  1754                 s=getaword(s,inword);

  1755                 if (!*inword)

  1756 		    continue;

  1757                 lowerit(inword);

  1758                 for (i=0;*nocomma[i];i++)

  1759                     if (!strcmp(inword,nocomma[i]))

  1760 		    {

  1761                         if (*s==',' || *s==';' || *s==':')

  1762 			{

  1763                             if (pswit[ECHO_SWITCH])

  1764 				printf("\n%s\n",aline);

  1765                             if (!pswit[OVERVIEW_SWITCH])

  1766                                 printf("    Line %ld column %d - "

  1767 				  "Query punctuation after %s?\n",

  1768 				  linecnt,(int)(s-aline)+1,inword);

  1769                             else

  1770                                 cnt_punct++;

  1771 			}

  1772 		    }

  1773 		for (i=0;*noperiod[i];i++)

  1774                     if (!strcmp(inword,noperiod[i]))

  1775 		    {

  1776                         if (*s=='.' || *s=='!')

  1777 			{

  1778                             if (pswit[ECHO_SWITCH])

  1779 				printf("\n%s\n",aline);

  1780                             if (!pswit[OVERVIEW_SWITCH])

  1781                                 printf("    Line %ld column %d - "

  1782 				  "Query punctuation after %s?\n",

  1783 				  linecnt,(int)(s-aline)+1,inword);

  1784                             else

  1785                                 cnt_punct++;

  1786 			}

  1787 		    }

  1788 	    }

  1789 	}

  1790         /*

  1791 	 * Check for commonly mistyped words,

  1792 	 * and digits like 0 for O in a word.

  1793 	 */

  1794         for (s=aline;*s;)

  1795 	{

  1796             wordstart=s;

  1797             s=getaword(s,inword);

  1798             if (!*inword)

  1799 		continue; /* don't bother with empty lines */

  1800             if (mixdigit(inword))

  1801 	    {

  1802                 if (pswit[ECHO_SWITCH])

  1803 		    printf("\n%s\n",aline);

  1804                 if (!pswit[OVERVIEW_SWITCH])

  1805                     printf("    Line %ld column %d - Query digit in %s\n",

  1806 		      linecnt,(int)(wordstart-aline)+1,inword);

  1807                 else

  1808                     cnt_word++;

  1809 	    }

  1810             /*

  1811 	     * Put the word through a series of tests for likely typos and OCR

  1812 	     * errors.

  1813 	     */

  1814             if (pswit[TYPO_SWITCH])

  1815 	    {

  1816                 istypo=0;

  1817                 strcpy(testword,inword);

  1818                 alower=0;

  1819                 for (i=0;i<(signed int)strlen(testword);i++)

  1820 		{

  1821 		    /* lowercase for testing */

  1822                     if (testword[i]>='a' && testword[i]<='z')

  1823 			alower=1;

  1824                     if (alower && testword[i]>='A' && testword[i]<='Z')

  1825 		    {

  1826                         /*

  1827 			 * We have an uppercase mid-word. However, there are

  1828 			 * common cases:

  1829                          *   Mac and Mc like McGill

  1830                          *   French contractions like l'Abbe

  1831 			 */

  1832                         if (i==2 && testword[0]=='m' && testword[1]=='c' ||

  1833                           i==3 && testword[0]=='m' && testword[1]=='a' &&

  1834 			  testword[2]=='c' || i>0 && testword[i-1]==CHAR_SQUOTE)

  1835 			    ; /* do nothing! */

  1836                         else

  1837                             istypo=1;

  1838 		    }

  1839                     testword[i]=(char)tolower(testword[i]);

  1840 		}

  1841                 /*

  1842 		 * Check for certain unlikely two-letter combinations at word

  1843 		 * start and end.

  1844 		 */

  1845                 if (strlen(testword)>1)

  1846 		{

  1847                     for (i=0;*nostart[i];i++)

  1848                         if (!strncmp(testword,nostart[i],2))

  1849                             istypo=1;

  1850                     for (i=0;*noend[i];i++)

  1851                         if (!strncmp(testword+strlen(testword)-2,noend[i],2))

  1852                             istypo=1;

  1853 		}

  1854                 /* ght is common, gbt never. Like that. */

  1855                 if (strstr(testword,"cb"))

  1856 		    istypo=1;

  1857                 if (strstr(testword,"gbt"))

  1858 		    istypo=1;

  1859                 if (strstr(testword,"pbt"))

  1860 		    istypo=1;

  1861                 if (strstr(testword,"tbs"))

  1862 		    istypo=1;

  1863                 if (strstr(testword,"mrn"))

  1864 		    istypo=1;

  1865                 if (strstr(testword,"ahle"))

  1866 		    istypo=1;

  1867                 if (strstr(testword,"ihle"))

  1868 		    istypo=1;

  1869                 /*

  1870 		 * "TBE" does happen - like HEARTBEAT - but uncommon.

  1871                  * Also "TBI" - frostbite, outbid - but uncommon.

  1872                  * Similarly "ii" like Hawaii, or Pompeii, and in Roman

  1873 		 * numerals, but "ii" is a common scanno.

  1874 		 */

  1875                 if (strstr(testword,"tbi"))

  1876 		    istypo=1;

  1877                 if (strstr(testword,"tbe"))

  1878 		    istypo=1;

  1879                 if (strstr(testword,"ii"))

  1880 		    istypo=1;

  1881                 /*

  1882 		 * Check for no vowels or no consonants.

  1883                  * If none, flag a typo.

  1884 		 */

  1885                 if (!istypo && strlen(testword)>1)

  1886 		{

  1887                     vowel=consonant=0;

  1888                     for (i=0;testword[i];i++)

  1889 		    {

  1890                         if (testword[i]=='y' || gcisdigit(testword[i]))

  1891 			{

  1892 			    /* Yah, this is loose. */

  1893                             vowel++;

  1894                             consonant++;

  1895 			}

  1896                         else if (strchr(vowels,testword[i]))

  1897 			    vowel++;

  1898 			else

  1899 			    consonant++;

  1900 		    }

  1901                     if (!vowel || !consonant)

  1902                         istypo=1;

  1903 		}

  1904                 /*

  1905 		 * Now exclude the word from being reported if it's in

  1906                  * the okword list.

  1907 		 */

  1908                 for (i=0;*okword[i];i++)

  1909                     if (!strcmp(testword,okword[i]))

  1910                         istypo=0;

  1911                 /*

  1912 		 * What looks like a typo may be a Roman numeral.

  1913 		 * Exclude these.

  1914 		 */

  1915                 if (istypo && isroman(testword))

  1916 		    istypo=0;

  1917                 /* Check the manual list of typos. */

  1918                 if (!istypo)

  1919                     for (i=0;*typo[i];i++)

  1920                         if (!strcmp(testword,typo[i]))

  1921                             istypo=1;

  1922                 /*

  1923 		 * Check lowercase s, l, i and m - special cases.

  1924                  *   "j" - often a semi-colon gone wrong.

  1925                  *   "d" for a missing apostrophe - he d

  1926                  *   "n" for "in"

  1927 		 */

  1928                 if (!istypo && strlen(testword)==1 && strchr("slmijdn",*inword))

  1929 		    istypo=1;

  1930                 if (istypo)

  1931 		{

  1932                     isdup=0;

  1933                     if (strlen(testword)<MAX_QWORD_LENGTH &&

  1934 		      !pswit[VERBOSE_SWITCH])

  1935                         for (i=0;i<qword_index;i++)

  1936                             if (!strcmp(testword,qword[i]))

  1937 			    {

  1938                                 isdup=1;

  1939                                 ++dupcnt[i];

  1940 			    }

  1941                     if (!isdup)

  1942 		    {

  1943                         if (qword_index<MAX_QWORD &&

  1944 			  strlen(testword)<MAX_QWORD_LENGTH)

  1945 			{

  1946                             strcpy(qword[qword_index],testword);

  1947                             qword_index++;

  1948 			}

  1949                         if (pswit[ECHO_SWITCH])

  1950 			    printf("\n%s\n",aline);

  1951                         if (!pswit[OVERVIEW_SWITCH])

  1952 			{

  1953                             printf("    Line %ld column %d - Query word %s",

  1954 			      linecnt,(int)(wordstart-aline)+1,inword);

  1955                             if (strlen(testword)<MAX_QWORD_LENGTH &&

  1956 			      !pswit[VERBOSE_SWITCH])

  1957                                 printf(" - not reporting duplicates");

  1958                             printf("\n");

  1959 			}

  1960                         else

  1961                             cnt_word++;

  1962 		    }

  1963 		}

  1964 	    }

  1965 	    /* check the user's list of typos */

  1966 	    if (!istypo && usertypo_count)

  1967 		for (i=0;i<usertypo_count;i++)

  1968 		    if (!strcmp(testword,usertypo[i]))

  1969 		    {

  1970 			if (pswit[ECHO_SWITCH])

  1971 			    printf("\n%s\n",aline);

  1972 			if (!pswit[OVERVIEW_SWITCH])

  1973 			    printf("    Line %ld column %d - "

  1974 			      "Query possible scanno %s\n",

  1975 			      linecnt,(int)(wordstart-aline)+2,inword);

  1976 		    }

  1977             if (pswit[PARANOID_SWITCH] && warnings->digit)

  1978 	    {

  1979 		/* In paranoid mode, query all 0 and 1 standing alone. */

  1980                 if (!strcmp(inword,"0") || !strcmp(inword,"1"))

  1981 		{

  1982                     if (pswit[ECHO_SWITCH])

  1983 			printf("\n%s\n",aline);

  1984                     if (!pswit[OVERVIEW_SWITCH])

  1985                         printf("    Line %ld column %d - Query standalone %s\n",

  1986 			  linecnt,(int)(wordstart-aline)+2,inword);

  1987                     else

  1988                         cnt_word++;

  1989 		}

  1990 	    }

  1991 	}

  1992 	/*

  1993          * Look for added or missing spaces around punctuation and quotes.

  1994          * If there is a punctuation character like ! with no space on

  1995          * either side, suspect a missing!space. If there are spaces on

  1996          * both sides , assume a typo. If we see a double quote with no

  1997          * space or punctuation on either side of it, assume unspaced

  1998          * quotes "like"this.

  1999 	 */

  2000         llen=strlen(aline);

  2001         for (i=1;i<llen;i++)

  2002 	{

  2003 	    /* For each character in the line after the first. */

  2004             if (strchr(".?!,;:_",aline[i]))  /* if it's punctuation */

  2005 	    {

  2006 		/* we need to suppress warnings for acronyms like M.D. */

  2007                 isacro=0;

  2008 		/* we need to suppress warnings for ellipsis . . . */

  2009                 isellipsis=0;

  2010 		/* if there are letters on both sides of it or ... */

  2011                 if (gcisalpha(aline[i-1]) && gcisalpha(aline[i+1]) ||

  2012                    gcisalpha(aline[i+1]) && strchr("?!,;:",aline[i]))

  2013 		{

  2014 		    /* ...if it's strict punctuation followed by an alpha */

  2015                     if (aline[i]=='.')

  2016 		    {

  2017                         if (i>2 && aline[i-2]=='.')

  2018 			    isacro=1;

  2019                         if (i+2<llen && aline[i+2]=='.')

  2020 			    isacro=1;

  2021 		    }

  2022                     if (!isacro)

  2023 		    {

  2024                         if (pswit[ECHO_SWITCH])

  2025 			    printf("\n%s\n",aline);

  2026                         if (!pswit[OVERVIEW_SWITCH])

  2027                             printf("    Line %ld column %d - Missing space?\n",

  2028 			      linecnt,i+1);

  2029                         else

  2030                             cnt_punct++;

  2031 		    }

  2032 		}

  2033                 if (aline[i-1]==CHAR_SPACE &&

  2034 		  (aline[i+1]==CHAR_SPACE || aline[i+1]==0))

  2035 		{

  2036 		    /*

  2037 		     * If there are spaces on both sides,

  2038 		     * or space before and end of line.

  2039 		     */

  2040                     if (aline[i]=='.')

  2041 		    {

  2042                         if (i>2 && aline[i-2]=='.')

  2043 			    isellipsis=1;

  2044                         if (i+2<llen && aline[i+2]=='.')

  2045 			    isellipsis=1;

  2046 		    }

  2047                     if (!isemptyline && !isellipsis)

  2048 		    {

  2049                         if (pswit[ECHO_SWITCH])

  2050 			    printf("\n%s\n",aline);

  2051                         if (!pswit[OVERVIEW_SWITCH])

  2052                             printf("    Line %ld column %d - "

  2053 			      "Spaced punctuation?\n",linecnt,i+1);

  2054                         else

  2055                             cnt_punct++;

  2056 		    }

  2057 		}

  2058 	    }

  2059 	}

  2060         /* Split out the characters that CANNOT be preceded by space. */

  2061         llen=strlen(aline);

  2062         for (i=1;i<llen;i++)

  2063 	{

  2064 	    /* for each character in the line after the first */

  2065             if (strchr("?!,;:",aline[i]))

  2066 	    {

  2067 		/* if it's punctuation that _cannot_ have a space before it */

  2068                 if (aline[i-1]==CHAR_SPACE && !isemptyline &&

  2069 		  aline[i+1]!=CHAR_SPACE)

  2070 		{

  2071 		    /*

  2072 		     * If aline[i+1] DOES == space,

  2073 		     * it was already reported just above.

  2074 		     */

  2075                     if (pswit[ECHO_SWITCH])

  2076 			printf("\n%s\n",aline);

  2077                     if (!pswit[OVERVIEW_SWITCH])

  2078                         printf("    Line %ld column %d - Spaced punctuation?\n",

  2079 			  linecnt,i+1);

  2080                     else

  2081                         cnt_punct++;

  2082 		}

  2083 	    }

  2084 	}

  2085         /*

  2086 	 * Special case " .X" where X is any alpha.

  2087          * This plugs a hole in the acronym code above.

  2088 	 * Inelegant, but maintainable.

  2089 	 */

  2090         llen=strlen(aline);

  2091         for (i=1;i<llen;i++)

  2092 	{

  2093 	    /* for each character in the line after the first */

  2094             if (aline[i]=='.')

  2095 	    {

  2096 		/* if it's a period */

  2097                 if (aline[i-1]==CHAR_SPACE && gcisalpha(aline[i+1]))

  2098 		{

  2099 		    /*

  2100 		     * If the period follows a space and

  2101 		     * is followed by a letter.

  2102 		     */

  2103                     if (pswit[ECHO_SWITCH])

  2104 			printf("\n%s\n",aline);

  2105                     if (!pswit[OVERVIEW_SWITCH])

  2106                         printf("    Line %ld column %d - Spaced punctuation?\n",

  2107 			  linecnt,i+1);

  2108                     else

  2109                         cnt_punct++;

  2110 		}

  2111 	    }

  2112 	}

  2113         for (i=1;i<llen;i++)

  2114 	{

  2115 	    /* for each character in the line after the first */

  2116             if (aline[i]==CHAR_DQUOTE)

  2117 	    {

  2118                 if (!strchr(" _-.'`,;:!/([{?}])",aline[i-1]) &&

  2119 		  !strchr(" _-.'`,;:!/([{?}])",aline[i+1]) && aline[i+1] ||

  2120 		  !strchr(" _-([{'`",aline[i-1]) && gcisalpha(aline[i+1]))

  2121 		{

  2122 		    if (pswit[ECHO_SWITCH])

  2123 			printf("\n%s\n",aline);

  2124 		    if (!pswit[OVERVIEW_SWITCH])

  2125 			printf("    Line %ld column %d - Unspaced quotes?\n",

  2126 			  linecnt,i+1);

  2127 		    else

  2128 			cnt_punct++;

  2129 		}

  2130 	    }

  2131 	}

  2132         /* Check parity of quotes. */

  2133         for (s=aline;*s;s++)

  2134 	{

  2135             if (*s==CHAR_DQUOTE)

  2136 	    {

  2137                 if (!(dquotepar=!dquotepar))

  2138 		{

  2139 		    /* parity even */

  2140                     if (!strchr("_-.'`/,;:!?)]} ",s[1]))

  2141 		    {

  2142                         if (pswit[ECHO_SWITCH])

  2143 			    printf("\n%s\n",aline);

  2144                         if (!pswit[OVERVIEW_SWITCH])

  2145                             printf("    Line %ld column %d - "

  2146 			      "Wrongspaced quotes?\n",linecnt,(int)(s-aline)+1);

  2147                         else

  2148                             cnt_punct++;

  2149 		    }

  2150 		}

  2151                 else

  2152 		{

  2153 		    /* parity odd */

  2154                     if (!gcisalpha(s[1]) && !isdigit(s[1]) &&

  2155 		      !strchr("_-/.'`([{$",s[1]) || !s[1])

  2156 		    {

  2157                         if (pswit[ECHO_SWITCH])

  2158 			    printf("\n%s\n",aline);

  2159                         if (!pswit[OVERVIEW_SWITCH])

  2160                             printf("    Line %ld column %d - "

  2161 			      "Wrongspaced quotes?\n",linecnt,(int)(s-aline)+1);

  2162                         else

  2163                             cnt_punct++;

  2164 		    }

  2165 		}

  2166 	    }

  2167 	}

  2168 	if (*aline==CHAR_DQUOTE)

  2169 	{

  2170 	    if (strchr(",;:!?)]} ",aline[1]))

  2171 	    {

  2172 		if (pswit[ECHO_SWITCH])

  2173 		    printf("\n%s\n",aline);

  2174 		if (!pswit[OVERVIEW_SWITCH])

  2175 		    printf("    Line %ld column 1 - Wrongspaced quotes?\n",

  2176 		      linecnt);

  2177 		else

  2178 		    cnt_punct++;

  2179 	    }

  2180 	}

  2181         if (pswit[SQUOTE_SWITCH])

  2182 	{

  2183             for (s=aline;*s;s++)

  2184 	    {

  2185                 if ((*s==CHAR_SQUOTE || *s==CHAR_OPEN_SQUOTE) &&

  2186 		  (s==aline || s>aline && !gcisalpha(s[-1]) ||

  2187 		  !gcisalpha(s[1])))

  2188 		{

  2189                     if (!(squotepar=!squotepar))

  2190 		    {

  2191 			/* parity even */

  2192                         if (!strchr("_-.'`/\",;:!?)]} ",s[1]))

  2193 			{

  2194                             if (pswit[ECHO_SWITCH])

  2195 				printf("\n%s\n",aline);

  2196                             if (!pswit[OVERVIEW_SWITCH])

  2197                                 printf("    Line %ld column %d - "

  2198 				  "Wrongspaced singlequotes?\n",

  2199 				  linecnt,(int)(s-aline)+1);

  2200                             else

  2201                                 cnt_punct++;

  2202 			}

  2203 		    }

  2204                     else

  2205 		    {

  2206 			/* parity odd */

  2207                         if (!gcisalpha(s[1]) && !isdigit(s[1]) &&

  2208 			  !strchr("_-/\".'`",s[1]) || !s[1])

  2209 			{

  2210                             if (pswit[ECHO_SWITCH])

  2211 				printf("\n%s\n",aline);

  2212                             if (!pswit[OVERVIEW_SWITCH])

  2213                                 printf("    Line %ld column %d - "

  2214 				  "Wrongspaced singlequotes?\n",

  2215 				  linecnt,(int)(s-aline)+1);

  2216                             else

  2217                                 cnt_punct++;

  2218 			}

  2219 		    }

  2220 		}

  2221 	    }

  2222 	}

  2223         /*

  2224 	 * Look for double punctuation like ,. or ,,

  2225          * Thanks to DW for the suggestion!

  2226          * In books with references, ".," and ".;" are common

  2227          * e.g. "etc., etc.," and vol. 1.; vol 3.;

  2228          * OTOH, from my initial tests, there are also fairly

  2229          * common errors. What to do? Make these cases paranoid?

  2230          * ".," is the most common, so warnings->dotcomma is used

  2231          * to suppress detailed reporting if it occurs often.

  2232 	 */

  2233         llen=strlen(aline);

  2234         for (i=0;i<llen;i++)

  2235 	{

  2236 	    /* for each punctuation character in the line */

  2237             if (strchr(".?!,;:",aline[i]) && (strchr(".?!,;:",aline[i+1])) &&

  2238 	      aline[i] && aline[i+1])

  2239 	    {

  2240 		/* followed by punctuation, it's a query, unless . . . */

  2241                 if (aline[i]==aline[i+1] && (aline[i]=='.' || aline[i]=='?' ||

  2242 		  aline[i]=='!') ||

  2243 		  !warnings->dotcomma && aline[i]=='.' && aline[i+1]==',' ||

  2244 		  warnings->isFrench && !strncmp(aline+i,",...",4) ||

  2245 		  warnings->isFrench && !strncmp(aline+i,"...,",4) ||

  2246 		  warnings->isFrench && !strncmp(aline+i,";...",4) ||

  2247 		  warnings->isFrench && !strncmp(aline+i,"...;",4) ||

  2248 		  warnings->isFrench && !strncmp(aline+i,":...",4) ||

  2249 		  warnings->isFrench && !strncmp(aline+i,"...:",4) ||

  2250 		  warnings->isFrench && !strncmp(aline+i,"!...",4) ||

  2251 		  warnings->isFrench && !strncmp(aline+i,"...!",4) ||

  2252 		  warnings->isFrench && !strncmp(aline+i,"?...",4) ||

  2253 		  warnings->isFrench && !strncmp(aline+i,"...?",4))

  2254 		{

  2255 		    if (warnings->isFrench && !strncmp(aline+i,",...",4) ||

  2256 		      warnings->isFrench && !strncmp(aline+i,"...,",4) ||

  2257 		      warnings->isFrench && !strncmp(aline+i,";...",4) ||

  2258 		      warnings->isFrench && !strncmp(aline+i,"...;",4) ||

  2259 		      warnings->isFrench && !strncmp(aline+i,":...",4) ||

  2260 		      warnings->isFrench && !strncmp(aline+i,"...:",4) ||

  2261 		      warnings->isFrench && !strncmp(aline+i,"!...",4) ||

  2262 		      warnings->isFrench && !strncmp(aline+i,"...!",4) ||

  2263 		      warnings->isFrench && !strncmp(aline+i,"?...",4) ||

  2264 		      warnings->isFrench && !strncmp(aline+i,"...?",4))

  2265 			i+=4;

  2266 		    ; /* do nothing for .. !! and ?? which can be legit */

  2267 		}

  2268                 else

  2269 		{

  2270                     if (pswit[ECHO_SWITCH])

  2271 			printf("\n%s\n",aline);

  2272                     if (!pswit[OVERVIEW_SWITCH])

  2273                         printf("    Line %ld column %d - Double punctuation?\n",

  2274 			  linecnt,i+1);

  2275                     else

  2276                         cnt_punct++;

  2277 		}

  2278 	    }

  2279 	}

  2280         s=aline;

  2281         while (strstr(s," \" "))

  2282 	{

  2283             if (pswit[ECHO_SWITCH])

  2284 		printf("\n%s\n",aline);

  2285             if (!pswit[OVERVIEW_SWITCH])

  2286                 printf("    Line %ld column %d - Spaced doublequote?\n",

  2287 		  linecnt,(int)(strstr(s," \" ")-aline+1));

  2288             else

  2289                 cnt_punct++;

  2290             s=strstr(s," \" ")+2;

  2291 	}

  2292         s=aline;

  2293         while (strstr(s," ' "))

  2294 	{

  2295             if (pswit[ECHO_SWITCH])

  2296 		printf("\n%s\n",aline);

  2297             if (!pswit[OVERVIEW_SWITCH])

  2298                 printf("    Line %ld column %d - Spaced singlequote?\n",

  2299 		  linecnt,(int)(strstr(s," ' ")-aline+1));

  2300             else

  2301                 cnt_punct++;

  2302             s=strstr(s," ' ")+2;

  2303 	}

  2304         s=aline;

  2305         while (strstr(s," ` "))

  2306 	{

  2307             if (pswit[ECHO_SWITCH])

  2308 		printf("\n%s\n",aline);

  2309             if (!pswit[OVERVIEW_SWITCH])

  2310                 printf("    Line %ld column %d - Spaced singlequote?\n",

  2311 		  linecnt,(int)(strstr(s," ` ")-aline+1));

  2312             else

  2313                 cnt_punct++;

  2314             s=strstr(s," ` ")+2;

  2315 	}

  2316         /* check special case of 'S instead of 's at end of word */

  2317         s=aline+1;

  2318         while (*s)

  2319 	{

  2320             if (*s==CHAR_SQUOTE && s[1]=='S' && s[-1]>='a' && s[-1]<='z')

  2321 	    {

  2322                 if (pswit[ECHO_SWITCH])

  2323 		    printf("\n%s\n",aline);

  2324                 if (!pswit[OVERVIEW_SWITCH])

  2325                     printf("    Line %ld column %d - Capital \"S\"?\n",

  2326 		      linecnt,(int)(s-aline+2));

  2327                 else

  2328                     cnt_punct++;

  2329 	    }

  2330             s++;

  2331 	}

  2332         /*

  2333 	 * Now check special cases - start and end of line -

  2334          * for single and double quotes. Start is sometimes [sic]

  2335          * but better to query it anyway.

  2336          * While we're here, check for dash at end of line.

  2337 	 */

  2338         llen=strlen(aline);

  2339         if (llen>1)

  2340 	{

  2341             if (aline[llen-1]==CHAR_DQUOTE || aline[llen-1]==CHAR_SQUOTE ||

  2342 	      aline[llen-1]==CHAR_OPEN_SQUOTE)

  2343                 if (aline[llen-2]==CHAR_SPACE)

  2344 		{

  2345                     if (pswit[ECHO_SWITCH])

  2346 			printf("\n%s\n",aline);

  2347                     if (!pswit[OVERVIEW_SWITCH])

  2348                         printf("    Line %ld column %d - Spaced quote?\n",

  2349 			  linecnt,llen);

  2350                     else

  2351                         cnt_punct++;

  2352 		}

  2353             if ((aline[0]==CHAR_SQUOTE || aline[0]==CHAR_OPEN_SQUOTE) &&

  2354 	      aline[1]==CHAR_SPACE)

  2355 	    {

  2356 		if (pswit[ECHO_SWITCH])

  2357 		    printf("\n%s\n",aline);

  2358 		if (!pswit[OVERVIEW_SWITCH])

  2359 		    printf("    Line %ld column 1 - Spaced quote?\n",linecnt);

  2360 		else

  2361 		    cnt_punct++;

  2362 	    }

  2363             /*

  2364 	     * Dash at end of line may well be legit - paranoid mode only

  2365              * and don't report em-dash at line-end.

  2366 	     */

  2367             if (pswit[PARANOID_SWITCH] && warnings->hyphen)

  2368 	    {

  2369                 for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--)

  2370 		    ;

  2371                 if (aline[i]=='-' && aline[i-1]!='-')

  2372 		{

  2373                     if (pswit[ECHO_SWITCH])

  2374 			printf("\n%s\n",aline);

  2375                     if (!pswit[OVERVIEW_SWITCH])

  2376                         printf("    Line %ld column %d - "

  2377 			  "Hyphen at end of line?\n",linecnt,i);

  2378 		}

  2379 	    }

  2380 	}

  2381         /*

  2382 	 * Brackets are often unspaced, but shouldn't be surrounded by alpha.

  2383          * If so, suspect a scanno like "a]most".

  2384 	 */

  2385         llen=strlen(aline);

  2386         for (i=1;i<llen-1;i++)

  2387 	{

  2388 	    /* for each bracket character in the line except 1st & last */

  2389             if (strchr("{[()]}",aline[i]) && gcisalpha(aline[i-1]) &&

  2390 	      gcisalpha(aline[i+1]))

  2391 	    {

  2392                 if (pswit[ECHO_SWITCH])

  2393 		    printf("\n%s\n",aline);

  2394                 if (!pswit[OVERVIEW_SWITCH])

  2395                     printf("    Line %ld column %d - Unspaced bracket?\n",

  2396 		      linecnt,i);

  2397                 else

  2398                     cnt_punct++;

  2399 	    }

  2400 	}

  2401         llen=strlen(aline);

  2402         if (warnings->endquote)

  2403 	{

  2404             for (i=1;i<llen;i++)

  2405 	    {

  2406 		/* for each character in the line except 1st */

  2407                 if (aline[i]==CHAR_DQUOTE && isalpha(aline[i-1]))

  2408 		{

  2409 		    if (pswit[ECHO_SWITCH])

  2410 			printf("\n%s\n",aline);

  2411 		    if (!pswit[OVERVIEW_SWITCH])

  2412 			printf("    Line %ld column %d - "

  2413 			  "endquote missing punctuation?\n",linecnt,i);

  2414 		    else

  2415 			cnt_punct++;

  2416 		}

  2417 	    }

  2418 	}

  2419 	/*

  2420          * Check for <HTML TAG>.

  2421          * If there is a < in the line, followed at some point

  2422          * by a > then we suspect HTML.

  2423 	 */

  2424         if (strstr(aline,"<") && strstr(aline,">"))

  2425 	{

  2426             i=(signed int)(strstr(aline,">")-strstr(aline,"<")+1);

  2427             if (i>0)

  2428 	    {

  2429                 strncpy(wrk,strstr(aline,"<"),i);

  2430                 wrk[i]=0;

  2431                 if (pswit[ECHO_SWITCH])

  2432 		    printf("\n%s\n",aline);

  2433                 if (!pswit[OVERVIEW_SWITCH])

  2434                     printf("    Line %ld column %d - HTML Tag? %s \n",

  2435 		      linecnt,(int)(strstr(aline,"<")-aline)+1,wrk);

  2436                 else

  2437                     cnt_html++;

  2438 	    }

  2439 	}

  2440         /*

  2441 	 * Check for &symbol; HTML.

  2442          * If there is a & in the line, followed at

  2443          * some point by a ; then we suspect HTML.

  2444 	 */

  2445         if (strstr(aline,"&") && strstr(aline,";"))

  2446 	{

  2447             i=(int)(strstr(aline,";")-strstr(aline,"&")+1);

  2448             for (s=strstr(aline,"&");s<strstr(aline,";");s++)

  2449                 if (*s==CHAR_SPACE)

  2450 		    i=0;                /* Don't report "Jones & Son;" */

  2451             if (i>0)

  2452 	    {

  2453                 strncpy(wrk,strstr(aline,"&"),i);

  2454                 wrk[i]=0;

  2455                 if (pswit[ECHO_SWITCH])

  2456 		    printf("\n%s\n",aline);

  2457                 if (!pswit[OVERVIEW_SWITCH])

  2458                     printf("    Line %ld column %d - HTML symbol? %s \n",

  2459 		      linecnt,(int)(strstr(aline,"&")-aline)+1,wrk);

  2460                 else

  2461                     cnt_html++;

  2462 	    }

  2463 	}

  2464         /*

  2465 	 * At end of paragraph, check for mismatched quotes.

  2466          * We don't want to report an error immediately, since it is a

  2467          * common convention to omit the quotes at end of paragraph if

  2468          * the next paragraph is a continuation of the same speaker.

  2469          * Where this is the case, the next para should begin with a

  2470          * quote, so we store the warning message and only display it

  2471          * at the top of the next iteration if the new para doesn't

  2472          * start with a quote.

  2473          * The -p switch overrides this default, and warns of unclosed

  2474          * quotes on _every_ paragraph, whether the next begins with a

  2475          * quote or not.

  2476 	 */

  2477         if (isemptyline)

  2478 	{

  2479 	    /* end of para - add up the totals */

  2480             if (counters.quot%2)

  2481                 sprintf(dquote_err,"    Line %ld - Mismatched quotes\n",

  2482 		  linecnt);

  2483             if (pswit[SQUOTE_SWITCH] && counters.open_single_quote &&

  2484 	      counters.open_single_quote!=counters.close_single_quote)

  2485                 sprintf(squote_err,"    Line %ld - Mismatched singlequotes?\n",

  2486 		  linecnt);

  2487             if (pswit[SQUOTE_SWITCH] && counters.open_single_quote &&

  2488 	      counters.open_single_quote!=counters.close_single_quote &&

  2489 	      counters.open_single_quote!=counters.close_single_quote+1)

  2490 		/*

  2491 		 * Flag it to be noted regardless of the

  2492 		 * first char of the next para.

  2493 		 */

  2494                 squot=1;

  2495             if (counters.r_brack)

  2496                 sprintf(rbrack_err,"    Line %ld - "

  2497 		  "Mismatched round brackets?\n",linecnt);

  2498             if (counters.s_brack)

  2499                 sprintf(sbrack_err,"    Line %ld - "

  2500 		  "Mismatched square brackets?\n",linecnt);

  2501             if (counters.c_brack)

  2502                 sprintf(cbrack_err,"    Line %ld - "

  2503 		  "Mismatched curly brackets?\n",linecnt);

  2504             if (counters.c_unders%2)

  2505                 sprintf(unders_err,"    Line %ld - Mismatched underscores?\n",

  2506 		  linecnt);

  2507 	    memset(&counters,0,sizeof(counters));

  2508 	    /* let the next iteration know that it's starting a new para */

  2509             isnewpara=1;

  2510 	}

  2511         /*

  2512 	 * Check for omitted punctuation at end of paragraph by working back

  2513 	 * through prevline. DW.

  2514          * Need to check this only for "normal" paras.

  2515          * So what is a "normal" para?

  2516          *    Not normal if one-liner (chapter headings, etc.)

  2517          *    Not normal if doesn't contain at least one locase letter

  2518          *    Not normal if starts with space

  2519 	 */

  2520         if (isemptyline)

  2521 	{

  2522 	    /* end of para */

  2523             for (s=prevline,i=0;*s && !i;s++)

  2524                 if (gcisletter(*s))

  2525 		    /* use i to indicate the presence of a letter on the line */

  2526                     i=1;

  2527             /*

  2528 	     * This next "if" is a problem.

  2529              * If we say "start_para_line <= linecnt - 1", that includes

  2530 	     * one-line "paragraphs" like chapter heads. Lotsa false positives.

  2531              * If we say "start_para_line < linecnt - 1" it doesn't, but then it

  2532              * misses genuine one-line paragraphs.

  2533 	     */

  2534             if (i && last.blen>2 && start_para_line<linecnt-1 &&

  2535 	      *prevline>CHAR_SPACE)

  2536 	    {

  2537                 for (i=strlen(prevline)-1;

  2538 		  (prevline[i]==CHAR_DQUOTE || prevline[i]==CHAR_SQUOTE) &&

  2539 		  prevline[i]>CHAR_SPACE && i>0;

  2540 		  i--)

  2541 		    ;

  2542                 for (;i>0;i--)

  2543 		{

  2544                     if (gcisalpha(prevline[i]))

  2545 		    {

  2546                         if (pswit[ECHO_SWITCH])

  2547 			    printf("\n%s\n",prevline);

  2548                         if (!pswit[OVERVIEW_SWITCH])

  2549                             printf("    Line %ld column %d - "

  2550 			      "No punctuation at para end?\n",

  2551 			      linecnt-1,strlen(prevline));

  2552                         else

  2553                             cnt_punct++;

  2554                         break;

  2555 		    }

  2556                     if (strchr("-.:!([{?}])",prevline[i]))

  2557                         break;

  2558 		}

  2559 	    }

  2560 	}

  2561         strcpy(prevline,aline);

  2562     }

  2563     fclose(infile);

  2564     if (!pswit[OVERVIEW_SWITCH])

  2565         for (i=0;i<MAX_QWORD;i++)

  2566             if (dupcnt[i])

  2567                 printf("\nNote: Queried word %s was duplicated %d time%s\n",

  2568 		  qword[i],dupcnt[i],"s");

  2569 }

  2571 /*

  2572  * flgets:

  2573  *

  2574  * Get one line from the input stream, checking for

  2575  * the existence of exactly one CR/LF line-end per line.

  2576  *

  2577  * Returns: a pointer to the line.

  2578  */

  2579 char *flgets(char *theline,int maxlen,FILE *thefile,long lcnt)

  2580 {

  2581     char c;

  2582     int len,isCR,cint;

  2583     *theline=0;

  2584     len=isCR=0;

  2585     c=cint=fgetc(thefile);

  2586     do

  2587     {

  2588         if (cint==EOF)

  2589             return NULL;

  2590 	/* either way, it's end of line */

  2591         if (c==10)

  2592 	{

  2593             if (isCR)

  2594                 break;

  2595             else

  2596 	    {

  2597 		/* Error - a LF without a preceding CR */

  2598                 if (pswit[LINE_END_SWITCH])

  2599 		{

  2600                     if (pswit[ECHO_SWITCH])

  2601 			printf("\n%s\n",theline);

  2602                     if (!pswit[OVERVIEW_SWITCH])

  2603                         printf("    Line %ld - No CR?\n",lcnt);

  2604                     else

  2605                         cnt_lineend++;

  2606 		}

  2607                 break;

  2608 	    }

  2609 	}

  2610         if (c==13)

  2611 	{

  2612             if (isCR)

  2613 	    {

  2614 		/* Error - two successive CRs */

  2615                 if (pswit[LINE_END_SWITCH])

  2616 		{

  2617                     if (pswit[ECHO_SWITCH])

  2618 			printf("\n%s\n",theline);

  2619                     if (!pswit[OVERVIEW_SWITCH])

  2620                         printf("    Line %ld - Two successive CRs?\n",lcnt);

  2621                     else

  2622                         cnt_lineend++;

  2623 		}

  2624 	    }

  2625             isCR=1;

  2626 	}

  2627         else

  2628 	{

  2629             if (pswit[LINE_END_SWITCH] && isCR)

  2630 	    {

  2631                 if (pswit[ECHO_SWITCH])

  2632 		    printf("\n%s\n",theline);

  2633                 if (!pswit[OVERVIEW_SWITCH])

  2634                     printf("    Line %ld column %d - CR without LF?\n",

  2635 		      lcnt,len+1);

  2636                 else

  2637                     cnt_lineend++;

  2638 	    }

  2639             theline[len]=c;

  2640             len++;

  2641             theline[len]=0;

  2642             isCR=0;

  2643 	}

  2644         c=cint=fgetc(thefile);

  2645     } while(len<maxlen);

  2646     if (pswit[MARKUP_SWITCH])

  2647         postprocess_for_HTML(theline);

  2648     if (pswit[DP_SWITCH])

  2649         postprocess_for_DP(theline);

  2650     return theline;

  2651 }

  /*
   * mixdigit:
   *
   * Takes a "word" as a parameter, and checks whether it
   * contains a mixture of alpha and digits. Generally, this is an
   * error, but may not be for cases like 4th or L5 12s. 3d.
   *
   * Accepted (not queried) mixtures are:
   *   - leading digits followed by st, rd, nd or th (any case),
   *     optionally with a trailing "s" or "ly" (1sts, 2ndly, ...);
   *   - leading digits followed by a single l, L, s or d;
   *   - a capital L followed by something that is not itself a
   *     queried mixture (L500).
   *
   * Returns: 0 if no error found, 1 if error.
   */
  int mixdigit(char *checkword)
  {
      int wehaveadigit,wehavealetter,firstdigits,query,wl;
      char *s;

      wehaveadigit=wehavealetter=query=0;
      for (s=checkword;*s;s++)
      {
          if (gcisalpha(*s))
              wehavealetter=1;
          else if (gcisdigit(*s))
              wehaveadigit=1;
      }
      if (wehaveadigit && wehavealetter)
      {
          /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
          query=1;
          wl=strlen(checkword);
          /* firstdigits = number of digits at the very start of the word */
          for (firstdigits=0;gcisdigit(checkword[firstdigits]);firstdigits++)
              ;
          /* digits, ending in st, rd, nd, th of either case */
          if (firstdigits+2==wl && (matchword(checkword+wl-2,"st") ||
            matchword(checkword+wl-2,"rd") || matchword(checkword+wl-2,"nd") ||
            matchword(checkword+wl-2,"th")))
              query=0;
          if (firstdigits+3==wl && (matchword(checkword+wl-3,"sts") ||
            matchword(checkword+wl-3,"rds") || matchword(checkword+wl-3,"nds") ||
            matchword(checkword+wl-3,"ths")))
              query=0;
          /*
           * The suffix is four characters long, so the word must be the
           * leading digits plus exactly four; this also guarantees that
           * checkword+wl-4 never points before the start of the word.
           */
          if (firstdigits+4==wl && (matchword(checkword+wl-4,"stly") ||
            matchword(checkword+wl-4,"rdly") ||
            matchword(checkword+wl-4,"ndly") || matchword(checkword+wl-4,"thly")))
              query=0;
          /* digits, ending in l, L, s or d */
          if (firstdigits+1==wl && (checkword[wl-1]=='l' ||
            checkword[wl-1]=='L' || checkword[wl-1]=='s' || checkword[wl-1]=='d'))
              query=0;
          /*
           * L at the start of a number, representing British pounds, like L500.
           * We know the current word is mixeddigit. If the first letter is L,
           * there must be at least one digit following. If both digits and
           * letters follow, we have a genuine error, else we have a capital L
           * followed by digits, and we accept that as a non-error.
           */
          if (checkword[0]=='L' && !mixdigit(checkword+1))
              query=0;
      }
      return query;
  }

  2710 /*

  2711  * getaword:

  2712  *

  2713  * Extracts the first/next "word" from the line, and puts

  2714  * it into "thisword". A word is defined as one English word unit--or

  2715  * at least that's the aim.

  2716  *

  2717  * Returns: a pointer to the position in the line where we will start

  2718  *          looking for the next word.

  2719  */

  2720 char *getaword(char *fromline,char *thisword)

  2721 {

  2722     int i,wordlen;

  2723     char *s;

  2724     wordlen=0;

  2725     for (;!gcisdigit(*fromline) && !gcisalpha(*fromline) && *fromline;

  2726       fromline++)

  2727 	;

  2728     /*

  2729      * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.

  2730      * Especially yucky is the case of L1,000

  2731      * This section looks for a pattern of characters including a digit

  2732      * followed by a comma or period followed by one or more digits.

  2733      * If found, it returns this whole pattern as a word; otherwise we discard

  2734      * the results and resume our normal programming.

  2735      */

  2736     s=fromline;

  2737     for (;(gcisdigit(*s) || gcisalpha(*s) || *s==',' || *s=='.') &&

  2738       wordlen<MAXWORDLEN;s++)

  2739     {

  2740 	thisword[wordlen]=*s;

  2741         wordlen++;

  2742     }

  2743     thisword[wordlen]=0;

  2744     for (i=1;i<wordlen-1;i++)

  2745     {

  2746         if (thisword[i]=='.' || thisword[i]==',')

  2747 	{

  2748             if (gcisdigit(thisword[i-1]) && gcisdigit(thisword[i-1]))

  2749 	    {

  2750                 fromline=s;

  2751                 return fromline;

  2752 	    }

  2753 	}

  2754     }

  2755     /* we didn't find a punctuated number - do the regular getword thing */

  2756     wordlen=0;

  2757     for (;(gcisdigit(*fromline) || gcisalpha(*fromline) || *fromline=='\'') &&

  2758       wordlen<MAXWORDLEN;fromline++)

  2759     {

  2760         thisword[wordlen]=*fromline;

  2761         wordlen++;

  2762     }

  2763     thisword[wordlen]=0;

  2764     return fromline;

  2765 }

  /*
   * matchword:
   *
   * A case-insensitive string matcher.
   *
   * Returns: 1 if the two strings have the same length and every pair of
   *          characters is equal after toupper(), 0 otherwise.
   */
  int matchword(char *checkfor,char *thisword)
  {
      size_t i,len;

      len=strlen(checkfor);
      if (len!=strlen(thisword))
          return 0;
      for (i=0;i<len;i++)
      {
          /* toupper() needs an unsigned char value; plain char may be negative */
          if (toupper((unsigned char)checkfor[i])!=
            toupper((unsigned char)thisword[i]))
              return 0;
      }
      return 1;
  }

  /*
   * lowerit:
   *
   * Convert the line to lower case in place. Only the letters 'A'..'Z'
   * are changed; every other byte is left exactly as it was.
   */
  void lowerit(char *theline)
  {
      char *p=theline;

      while (*p)
      {
          if ('A'<=*p && *p<='Z')
              *p=*p+('a'-'A');
          p++;
      }
  }

  /*
   * isroman:
   *
   * Does this (lower-case) word LOOK like a Roman numeral?
   *
   * No attempt is made to validate the number: mxxxxxxxxxx is happily
   * accepted. The word is consumed left to right as
   *   any number of m, an optional d, an optional cm, an optional cd,
   *   any number of c, an optional xl, an optional xc, an optional l,
   *   any number of x, an optional ix, an optional iv, an optional v
   *   and any number of i.
   * Only lower-case letters are recognised.
   *
   * Returns: 1 if the whole word was consumed, 0 otherwise (including
   *          for a NULL or empty word).
   */
  int isroman(char *t)
  {
      const char *p=t;

      if (p==NULL || *p=='\0')
          return 0;
      p+=strspn(p,"m");
      if (*p=='d')
          p++;
      if (p[0]=='c' && p[1]=='m')
          p+=2;
      if (p[0]=='c' && p[1]=='d')
          p+=2;
      p+=strspn(p,"c");
      if (p[0]=='x' && p[1]=='l')
          p+=2;
      if (p[0]=='x' && p[1]=='c')
          p+=2;
      if (*p=='l')
          p++;
      p+=strspn(p,"x");
      if (p[0]=='i' && p[1]=='x')
          p+=2;
      if (p[0]=='i' && p[1]=='v')
          p+=2;
      if (*p=='v')
          p++;
      p+=strspn(p,"i");
      /* a Roman-looking word leaves nothing behind */
      return *p=='\0';
  }

  /*
   * gcisalpha:
   *
   * An isalpha() replacement that is lenient on 8-bit texts. With the
   * standard function, accented characters break words apart, which
   * causes over-reporting of errors. This version is aimed at the CP1252
   * (Windows) and ISO-8859-1 character sets, the most common PG 8-bit
   * types.
   *
   * Returns 1 for:
   *   - 'a'..'z' and 'A'..'Z';
   *   - codes 192..255 except 208, 215, 222, 240, 247 and 254;
   *   - codes 140, 142, 156, 158 and 159.
   * Returns 0 for everything else.
   */
  int gcisalpha(unsigned char c)
  {
      if ((c>='A' && c<='Z') || (c>='a' && c<='z'))
          return 1;
      if (c>=192)
      {
          switch (c)
          {
          case 208:
          case 215:
          case 222:
          case 240:
          case 247:
          case 254:
              return 0;
          default:
              return 1;
          }
      }
      switch (c)
      {
      case 140:
      case 142:
      case 156:
      case 158:
      case 159:
          return 1;
      default:
          return 0;
      }
  }

  /*
   * gcisdigit:
   *
   * An isdigit() replacement for 8-bit texts: accepts exactly the
   * characters '0' to '9' and nothing else.
   *
   * Returns: 1 for a digit, 0 otherwise.
   */
  int gcisdigit(unsigned char c)
  {
      if (c<'0')
          return 0;
      return c<='9';
  }

  /*
   * gcisletter:
   *
   * A letter test that doesn't get confused in 8-bit texts.
   * NB: this is ISO-8859-1-specific.
   *
   * Returns: 1 for 'A'..'Z', 'a'..'z' and every code from 192 upwards,
   *          0 otherwise.
   */
  int gcisletter(unsigned char c)
  {
      if (c>=192)
          return 1;
      if (c>='a')
          return c<='z';
      return c>='A' && c<='Z';
  }

  /*
   * gcstrchr:
   *
   * strchr() wrapper that never matches the terminating NUL: searching
   * for character zero yields NULL instead of a pointer to the end of s.
   *
   * Returns: what strchr(s,c) returns for a non-zero c, NULL for c==0.
   */
  char *gcstrchr(char *s,char c)
  {
      return c ? strchr(s,c) : NULL;
  }

  2904 /*

  2905  * postprocess_for_DP:

  2906  *

  2907  * Invoked with the -d switch from flgets().

  2908  * It simply "removes" from the line a hard-coded set of common

  2909  * DP-specific tags, so that the line passed to the main routine has

  2910  * been pre-cleaned of DP markup.

  2911  */

  2912 void postprocess_for_DP(char *theline)

  2913 {

  2914     char *s,*t;

  2915     int i;

  2916     if (!*theline)

  2917         return;

  2918     for (i=0;*DPmarkup[i];i++)

  2919     {

  2920         s=strstr(theline,DPmarkup[i]);

  2921         while (s)

  2922 	{

  2923             t=s+strlen(DPmarkup[i]);

  2924             while (*t)

  2925 	    {

  2926                 *s=*t;

  2927                 t++;

  2928 		s++;

  2929 	    }

  2930             *s=0;

  2931             s=strstr(theline,DPmarkup[i]);

  2932 	}

  2933     }

  2934 }

  /*
   * postprocess_for_HTML:
   *
   * Invoked with the -m switch from flgets().
   * Calls losemarkup() repeatedly until it reports nothing more to do
   * (only attempted when the line contains both a '<' and a '>'), then
   * does the same with loseentities(), so that the line passed to the
   * main routine has been pre-cleaned of HTML.
   */
  void postprocess_for_HTML(char *theline)
  {
      int has_open,has_close;

      has_open=strstr(theline,"<")!=NULL;
      has_close=strstr(theline,">")!=NULL;
      if (has_open && has_close)
      {
          for (;;)
          {
              if (!losemarkup(theline))
                  break;
          }
      }
      for (;;)
      {
          if (!loseentities(theline))
              break;
      }
  }

  2954 char *losemarkup(char *theline)

  2955 {

  2956     char *s,*t;

  2957     int i;

  2958     if (!*theline)

  2959         return NULL;

  2960     s=strstr(theline,"<");

  2961     t=strstr(theline,">");

  2962     if (!s || !t)

  2963 	return NULL;

  2964     for (i=0;*markup[i];i++)

  2965         if (!tagcomp(s+1,markup[i]))

  2966 	{

  2967             if (!t[1])

  2968 	    {

  2969                 *s=0;

  2970                 return s;

  2971 	    }

  2972             else if (t>s)

  2973 	    {

  2974 		strcpy(s,t+1);

  2975 		return s;

  2976 	    }

  2977         }

  2978     /* It's an unrecognized <xxx>. */

  2979     return NULL;

  2980 }

  2982 char *loseentities(char *theline)

  2983 {

  2984     int i;

  2985     char *s,*t;

  2986     if (!*theline)

  2987         return NULL;

  2988     for (i=0;*entities[i].htmlent;i++)

  2989     {

  2990         s=strstr(theline,entities[i].htmlent);

  2991         if (s)

  2992 	{

  2993             t=malloc((size_t)strlen(s));

  2994             if (!t)

  2995 		return NULL;

  2996             strcpy(t,s+strlen(entities[i].htmlent));

  2997             strcpy(s,entities[i].textent);

  2998             strcat(s,t);

  2999             free(t);

  3000             return theline;

  3001 	}

  3002     }

  3003     for (i=0;*entities[i].htmlnum;i++)

  3004     {

  3005         s=strstr(theline,entities[i].htmlnum);

  3006         if (s)

  3007 	{

  3008             t=malloc((size_t)strlen(s));

  3009             if (!t)

  3010 		return NULL;

  3011             strcpy(t,s+strlen(entities[i].htmlnum));

  3012             strcpy(s,entities[i].textent);

  3013             strcat(s,t);

  3014             free(t);

  3015             return theline;

  3016 	}

  3017     }

  3018     return NULL;

  3019 }

  /*
   * tagcomp:
   *
   * Case-insensitive comparison of the text following a '<' (strin)
   * with a tag name (basetag). One leading '/' in strin is ignored, so
   * closing tags compare like opening ones. Characters are compared only
   * until either string ends, so this is a prefix test: running out of
   * characters on either side counts as a match.
   *
   * Returns: 0 if no difference was found, 1 at the first difference.
   */
  int tagcomp(char *strin,char *basetag)
  {
      char *s,*t;

      s=basetag;
      t=strin;
      if (*t=='/')
          t++; /* ignore a slash */
      while (*s && *t)
      {
          /* tolower() needs an unsigned char value; plain char may be negative */
          if (tolower((unsigned char)*s)!=tolower((unsigned char)*t))
              return 1;
          s++;
          t++;
      }
      return 0;
  }

  3038 void proghelp()

  3039 {

  3040     fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);

  3041     fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);

  3042     fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);

  3043     fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "

  3044       "For details, read the file COPYING.\n",stderr);

  3045     fputs("This is Free Software; "

  3046       "you may redistribute it under certain conditions (GPL);\n",stderr);

  3047     fputs("read the file COPYING for details.\n\n",stderr);

  3048     fputs("Usage is: bookloupe [-setpxloyhud] filename\n",stderr);

  3049     fputs("  where -s checks single quotes, -e suppresses echoing lines, "

  3050       "-t checks typos\n",stderr);

  3051     fputs("  -x (paranoid) switches OFF -t and extra checks, "

  3052       "-l turns OFF line-end checks\n",stderr);

  3053     fputs("  -o just displays overview without detail, "

  3054       "-h echoes header fields\n",stderr);

  3055     fputs("  -v (verbose) unsuppresses duplicate reporting, "

  3056       "-m suppresses markup\n",stderr);

  3057     fputs("  -d ignores DP-specific markup,\n",stderr);

  3058     fputs("  -u uses a file gutcheck.typ to query user-defined "

  3059       "possible typos\n",stderr);

  3060     fputs("Sample usage: bookloupe warpeace.txt \n",stderr);

  3061     fputs("\n",stderr);

  3062     fputs("Bookloupe looks for errors in Project Gutenberg(TM) etexts.\n",

  3063       stderr);

  3064     fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "

  3065       "non-ASCII\n",stderr);

  3066     fputs("characters like accented letters, "

  3067       "lines longer than 75 or shorter than 55,\n",stderr);

  3068     fputs("unbalanced quotes or brackets, "

  3069       "a variety of badly formatted punctuation, \n",stderr);

  3070     fputs("HTML tags, some likely typos. "

  3071       "It is NOT a substitute for human judgement.\n",stderr);

  3072     fputs("\n",stderr);

  3073 }

author	ali <ali@juiblex.co.uk>
	Sat May 25 23:51:28 2013 +0100 (2013-05-25)
changeset 51	0d08cd5055d5
parent 50	1b646720d4a7
child 52	a1fd8d3f0940
permissions	-rw-r--r--