bookloupe-testing: bookloupe/bookloupe.c@adb087007d08

     1 /*************************************************************************/

     2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */

     3 /*									 */

     4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>			 */

     5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>			 */

     6 /*									 */

     7 /* This program is free software; you can redistribute it and/or modify  */

     8 /* it under the terms of the GNU General Public License as published by  */

     9 /* the Free Software Foundation; either version 2 of the License, or     */

    10 /* (at your option) any later version.					 */

    11 /*									 */

    12 /* This program is distributed in the hope that it will be useful,       */

    13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of	 */

    14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the		 */

    15 /* GNU General Public License for more details.				 */

    16 /*									 */

    17 /* You should have received a copy of the GNU General Public License	 */

    18 /* along with this program. If not, see <http://www.gnu.org/licenses/>.	 */

    19 /*************************************************************************/

    21 #include <stdio.h>

    22 #include <stdlib.h>

    23 #include <string.h>

    24 #include <ctype.h>

    26 #define MAXWORDLEN    80    /* max length of one word */

    27 #define LINEBUFSIZE 2048    /* buffer size for an input line */

    29 #define MAX_USER_TYPOS 1000

    30 #define USERTYPO_FILE "gutcheck.typ"

    32 #ifndef MAX_PATH

    33 #define MAX_PATH 16384

    34 #endif

    36 char aline[LINEBUFSIZE];

    37 char prevline[LINEBUFSIZE];

    39 /* Common typos. */

    40 char *typo[] = {

    41     "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",

    42     "nad", "te", "ig", "acn",  "ahve", "alot", "anbd", "andt", "awya", "aywa",

    43     "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",

    44     "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",

    45     "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",

    46     "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",

    47     "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",

    48     "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",

    49     "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",

    50     "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",

    51     "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",

    52     "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",

    53     "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",

    54     "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",

    55     "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",

    56     "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",

    57     "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",

    58     "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",

    59     "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",

    60     "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",

    61     "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",

    62     "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",

    63     "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",

    64     "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",

    65     "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",

    66     "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",

    67     "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",

    68     "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",

    69     "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",

    70     "se", ""

    71 };

    73 char *usertypo[MAX_USER_TYPOS];

    75 /* Common abbreviations and other OK words not to query as typos. */

    76 char *okword[] = {

    77     "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",

    78     "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",

    79     "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",

    80     "outbid", "outbids", "frostbite", "frostbitten", ""

    81 };

    83 /* Common abbreviations that cause otherwise unexplained periods. */

    84 char *abbrev[] = {

    85     "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",

    86     "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""

    87 };

    89 /*

    90  * Two-Letter combinations that rarely if ever start words,

    91  * but are common scannos or otherwise common letter combinations.

    92  */

    93 char *nostart[] = {

    94     "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""

    95 };

    97 /*

    98  * Two-Letter combinations that rarely if ever end words,

    99  * but are common scannos or otherwise common letter combinations.

   100  */

   101 char *noend[] = {

   102     "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",

   103     "sw", "gr", "sl", "cl", "iy", ""

   104 };

   106 char *markup[] = {

   107     "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",

   108     "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",

   109     "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",

   110     "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""

   111 };

   113 char *DPmarkup[] = {

   114     "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""

   115 };

   117 char *nocomma[] = {

   118     "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",

   119     "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",

   120     "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",

   121     "during", "let", "toward", "among", ""

   122 };

   124 char *noperiod[] = {

   125     "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",

   126     "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",

   127     "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",

   128     "among", "those", "into", "whom", "having", "thence", ""

   129 };

   131 char vowels[] = "aeiouàáâãäæèéêëìíîïòóôõöùúûü";

   133 struct {

   134     char *htmlent;

   135     char *htmlnum;

   136     char *textent;

   137 } entities[] = {

   138     "&amp;",	"&#38;",     "&",

   139     "&lt;",	"&#60;",     "<",

   140     "&gt;",	"&#62;",     ">",

   141     "&deg;",	"&#176;",    " degrees",

   142     "&pound;",	"&#163;",    "L",

   143     "&quot;",	"&#34;",     "\"", /* quotation mark = APL quote */

   144     "&OElig;",	"&#338;",    "OE", /* latin capital ligature OE */

   145     "&oelig;",	"&#339;",    "oe", /* latin small ligature oe */

   146     "&Scaron;",	"&#352;",    "S", /* latin capital letter S with caron */

   147     "&scaron;",	"&#353;",    "s", /* latin small letter s with caron */

   148     "&Yuml;",	"&#376;",    "Y", /* latin capital letter Y with diaeresis */

   149     "&circ;",	"&#710;",    "",  /* modifier letter circumflex accent */

   150     "&tilde;",	"&#732;",    "~", /* small tilde, U+02DC ISOdia */

   151     "&ensp;",	"&#8194;",   " ", /* en space, U+2002 ISOpub */

   152     "&emsp;",	"&#8195;",   " ", /* em space, U+2003 ISOpub */

   153     "&thinsp;",	"&#8201;",   " ", /* thin space, U+2009 ISOpub */

   154     "&ndash;",	"&#8211;",   "-", /* en dash, U+2013 ISOpub */

   155     "&mdash;",	"&#8212;",   "--", /* em dash, U+2014 ISOpub */

   156     "&rsquo;",	"&#8217;",   "'", /* right single quotation mark */

   157     "&sbquo;",	"&#8218;",   "'", /* single low-9 quotation mark */

   158     "&ldquo;",	"&#8220;",   "\"", /* left double quotation mark */

   159     "&rdquo;",	"&#8221;",   "\"", /* right double quotation mark */

   160     "&bdquo;",	"&#8222;",   "\"", /* double low-9 quotation mark */

   161     "&lsaquo;",	"&#8249;",   "\"", /* single left-pointing angle quotation mark */

   162     "&rsaquo;",	"&#8250;",   "\"", /* single right-pointing angle quotation mark */

   163     "&nbsp;",	"&#160;",    " ", /* no-break space = non-breaking space, */

   164     "&iexcl;",	"&#161;",    "!", /* inverted exclamation mark */

   165     "&cent;",	"&#162;",    "c", /* cent sign */

   166     "&pound;",	"&#163;",    "L", /* pound sign */

   167     "&curren;",	"&#164;",    "$", /* currency sign */

   168     "&yen;",	"&#165;",    "Y", /* yen sign = yuan sign */

   169     "&sect;",	"&#167;",    "--", /* section sign */

   170     "&uml;",	"&#168;",    " ", /* diaeresis = spacing diaeresis */

   171     "&copy;",	"&#169;",    "(C) ", /* copyright sign */

   172     "&ordf;",	"&#170;",    " ", /* feminine ordinal indicator */

   173     "&laquo;",	"&#171;",    "\"", /* left-pointing double angle quotation mark */

   174     "&shy;",	"&#173;",    "-", /* soft hyphen = discretionary hyphen */

   175     "&reg;",	"&#174;",    "(R) ", /* registered sign = registered trade mark sign */

   176     "&macr;",	"&#175;",    " ", /* macron = spacing macron = overline */

   177     "&deg;",	"&#176;",    " degrees", /* degree sign */

   178     "&plusmn;",	"&#177;",    "+-", /* plus-minus sign = plus-or-minus sign */

   179     "&sup2;",	"&#178;",    "2", /* superscript two = superscript digit two */

   180     "&sup3;",	"&#179;",    "3", /* superscript three = superscript digit three */

   181     "&acute;",	"&#180;",    " ", /* acute accent = spacing acute */

   182     "&micro;",	"&#181;",    "m", /* micro sign */

   183     "&para;",	"&#182;",    "--", /* pilcrow sign = paragraph sign */

   184     "&cedil;",	"&#184;",    " ", /* cedilla = spacing cedilla */

   185     "&sup1;",	"&#185;",    "1", /* superscript one = superscript digit one */

   186     "&ordm;",	"&#186;",    " ", /* masculine ordinal indicator */

   187     "&raquo;",	"&#187;",    "\"", /* right-pointing double angle quotation mark */

   188     "&frac14;",	"&#188;",    "1/4", /* vulgar fraction one quarter */

   189     "&frac12;",	"&#189;",    "1/2", /* vulgar fraction one half */

   190     "&frac34;",	"&#190;",    "3/4", /* vulgar fraction three quarters */

   191     "&iquest;",	"&#191;",    "?", /* inverted question mark */

   192     "&Agrave;",	"&#192;",    "A", /* latin capital letter A with grave */

   193     "&Aacute;",	"&#193;",    "A", /* latin capital letter A with acute */

   194     "&Acirc;",	"&#194;",    "A", /* latin capital letter A with circumflex */

   195     "&Atilde;",	"&#195;",    "A", /* latin capital letter A with tilde */

   196     "&Auml;",	"&#196;",    "A", /* latin capital letter A with diaeresis */

   197     "&Aring;",	"&#197;",    "A", /* latin capital letter A with ring above */

   198     "&AElig;",	"&#198;",    "AE", /* latin capital letter AE */

   199     "&Ccedil;",	"&#199;",    "C", /* latin capital letter C with cedilla */

   200     "&Egrave;",	"&#200;",    "E", /* latin capital letter E with grave */

   201     "&Eacute;",	"&#201;",    "E", /* latin capital letter E with acute */

   202     "&Ecirc;",	"&#202;",    "E", /* latin capital letter E with circumflex */

   203     "&Euml;",	"&#203;",    "E", /* latin capital letter E with diaeresis */

   204     "&Igrave;",	"&#204;",    "I", /* latin capital letter I with grave */

   205     "&Iacute;",	"&#205;",    "I", /* latin capital letter I with acute */

   206     "&Icirc;",	"&#206;",    "I", /* latin capital letter I with circumflex */

   207     "&Iuml;",	"&#207;",    "I", /* latin capital letter I with diaeresis */

   208     "&ETH;",	"&#208;",    "E", /* latin capital letter ETH */

   209     "&Ntilde;",	"&#209;",    "N", /* latin capital letter N with tilde */

   210     "&Ograve;",	"&#210;",    "O", /* latin capital letter O with grave */

   211     "&Oacute;",	"&#211;",    "O", /* latin capital letter O with acute */

   212     "&Ocirc;",	"&#212;",    "O", /* latin capital letter O with circumflex */

   213     "&Otilde;",	"&#213;",    "O", /* latin capital letter O with tilde */

   214     "&Ouml;",	"&#214;",    "O", /* latin capital letter O with diaeresis */

   215     "&times;",	"&#215;",    "*", /* multiplication sign */

   216     "&Oslash;",	"&#216;",    "O", /* latin capital letter O with stroke */

   217     "&Ugrave;",	"&#217;",    "U", /* latin capital letter U with grave */

   218     "&Uacute;",	"&#218;",    "U", /* latin capital letter U with acute */

   219     "&Ucirc;",	"&#219;",    "U", /* latin capital letter U with circumflex */

   220     "&Uuml;",	"&#220;",    "U", /* latin capital letter U with diaeresis */

   221     "&Yacute;",	"&#221;",    "Y", /* latin capital letter Y with acute */

   222     "&THORN;",	"&#222;",    "TH", /* latin capital letter THORN */

   223     "&szlig;",	"&#223;",    "sz", /* latin small letter sharp s = ess-zed */

   224     "&agrave;",	"&#224;",    "a", /* latin small letter a with grave */

   225     "&aacute;",	"&#225;",    "a", /* latin small letter a with acute */

   226     "&acirc;",	"&#226;",    "a", /* latin small letter a with circumflex */

   227     "&atilde;",	"&#227;",    "a", /* latin small letter a with tilde */

   228     "&auml;",	"&#228;",    "a", /* latin small letter a with diaeresis */

   229     "&aring;",	"&#229;",    "a", /* latin small letter a with ring above */

   230     "&aelig;",	"&#230;",    "ae", /* latin small letter ae */

   231     "&ccedil;",	"&#231;",    "c", /* latin small letter c with cedilla */

   232     "&egrave;",	"&#232;",    "e", /* latin small letter e with grave */

   233     "&eacute;",	"&#233;",    "e", /* latin small letter e with acute */

   234     "&ecirc;",	"&#234;",    "e", /* latin small letter e with circumflex */

   235     "&euml;",	"&#235;",    "e", /* latin small letter e with diaeresis */

   236     "&igrave;",	"&#236;",    "i", /* latin small letter i with grave */

   237     "&iacute;",	"&#237;",    "i", /* latin small letter i with acute */

   238     "&icirc;",	"&#238;",    "i", /* latin small letter i with circumflex */

   239     "&iuml;",	"&#239;",    "i", /* latin small letter i with diaeresis */

   240     "&eth;",	"&#240;",    "eth", /* latin small letter eth */

   241     "&ntilde;",	"&#241;",    "n", /* latin small letter n with tilde */

   242     "&ograve;",	"&#242;",    "o", /* latin small letter o with grave */

   243     "&oacute;",	"&#243;",    "o", /* latin small letter o with acute */

   244     "&ocirc;",	"&#244;",    "o", /* latin small letter o with circumflex */

   245     "&otilde;",	"&#245;",    "o", /* latin small letter o with tilde */

   246     "&ouml;",	"&#246;",    "o", /* latin small letter o with diaeresis */

   247     "&divide;",	"&#247;",    "/", /* division sign */

   248     "&oslash;",	"&#248;",    "o", /* latin small letter o with stroke */

   249     "&ugrave;",	"&#249;",    "u", /* latin small letter u with grave */

   250     "&uacute;",	"&#250;",    "u", /* latin small letter u with acute */

   251     "&ucirc;",	"&#251;",    "u", /* latin small letter u with circumflex */

   252     "&uuml;",	"&#252;",    "u", /* latin small letter u with diaeresis */

   253     "&yacute;",	"&#253;",    "y", /* latin small letter y with acute */

   254     "&thorn;",	"&#254;",    "th", /* latin small letter thorn */

   255     "&yuml;",	"&#255;",    "y", /* latin small letter y with diaeresis */

   256     "", ""

   257 };

   259 /* special characters */

   260 #define CHAR_SPACE	  32

   261 #define CHAR_TAB	   9

   262 #define CHAR_LF		  10

   263 #define CHAR_CR		  13

   264 #define CHAR_DQUOTE	  34

   265 #define CHAR_SQUOTE	  39

   266 #define CHAR_OPEN_SQUOTE  96

   267 #define CHAR_TILDE	 126

   268 #define CHAR_ASTERISK	  42

   269 #define CHAR_FORESLASH	  47

   270 #define CHAR_CARAT	  94

   272 #define CHAR_UNDERSCORE    '_'

   273 #define CHAR_OPEN_CBRACK   '{'

   274 #define CHAR_CLOSE_CBRACK  '}'

   275 #define CHAR_OPEN_RBRACK   '('

   276 #define CHAR_CLOSE_RBRACK  ')'

   277 #define CHAR_OPEN_SBRACK   '['

   278 #define CHAR_CLOSE_SBRACK  ']'

   280 /* longest and shortest normal PG line lengths */

   281 #define LONGEST_PG_LINE   75

   282 #define WAY_TOO_LONG      80

   283 #define SHORTEST_PG_LINE  55

   285 #define SWITCHES "ESTPXLOYHWVMUD" /* switches:- */

   286 				  /*     D - ignore DP-specific markup */

   287 				  /*     E - echo queried line */

   288 				  /*     S - check single quotes */

   289 				  /*     T - check common typos	*/

   290 				  /*     P - require closure of quotes on */

   291 				  /*	 every paragraph */

   292 				  /*     X - "Trust no one" :-) Paranoid! */

   293 				  /*	 Queries everything */

   294 				  /*     L - line end checking defaults on */

   295 				  /*	 -L turns it off */

   296 				  /*     O - overview. Just shows counts. */

   297 				  /*     Y - puts errors to stdout */

   298 				  /*	 instead of stderr */

   299 				  /*     H - Echoes header fields */

   300 				  /*     M - Ignore markup in < > */

   301 				  /*     U - Use file of User-defined Typos */

   302 				  /*     W - Defaults for use on Web upload */

   303 				  /*     V - Verbose - list EVERYTHING! */

   304 #define SWITNO 14		  /* max number of switch parms	*/

   305 				  /*	- used for defining array-size */

   306 #define MINARGS   1  /* minimum no of args excl switches */

   307 #define MAXARGS   1  /* maximum no of args excl switches */

   309 int pswit[SWITNO];   /* program switches set by SWITCHES */

   311 #define ECHO_SWITCH      0

   312 #define SQUOTE_SWITCH    1

   313 #define TYPO_SWITCH      2

   314 #define QPARA_SWITCH     3

   315 #define PARANOID_SWITCH  4

   316 #define LINE_END_SWITCH  5

   317 #define OVERVIEW_SWITCH  6

   318 #define STDOUT_SWITCH    7

   319 #define HEADER_SWITCH    8

   320 #define WEB_SWITCH       9

   321 #define VERBOSE_SWITCH   10

   322 #define MARKUP_SWITCH    11

   323 #define USERTYPO_SWITCH  12

   324 #define DP_SWITCH	 13

   326 long cnt_dquot;		/* for overview mode, count of doublequote queries */

   327 long cnt_squot;		/* for overview mode, count of singlequote queries */

   328 long cnt_brack;		/* for overview mode, count of brackets queries */

   329 long cnt_bin;		/* for overview mode, count of non-ASCII queries */

   330 long cnt_odd;		/* for overview mode, count of odd character queries */

   331 long cnt_long;		/* for overview mode, count of long line errors */

   332 long cnt_short;		/* for overview mode, count of short line queries */

   333 long cnt_punct;		/* for overview mode,

   334 			   count of punctuation and spacing queries */

   335 long cnt_dash;		/* for overview mode, count of dash-related queries */

   336 long cnt_word;		/* for overview mode, count of word queries */

   337 long cnt_html;		/* for overview mode, count of html queries */

   338 long cnt_lineend;	/* for overview mode, count of line-end queries */

   339 long cnt_spacend;	/* count of lines with space at end */

   340 long linecnt;		/* count of total lines in the file */

   341 long checked_linecnt;	/* count of lines actually checked */

   343 void proghelp(void);

   344 void procfile(char *);

   346 #define LOW_THRESHOLD    0

   347 #define HIGH_THRESHOLD   1

   349 #define START 0

   350 #define END 1

   351 #define PREV 0

   352 #define NEXT 1

   353 #define FIRST_OF_PAIR 0

   354 #define SECOND_OF_PAIR 1

   356 #define MAX_WORDPAIR 1000

   358 char running_from[MAX_PATH];

   360 int mixdigit(char *);

   361 const char *getaword(const char *,char *);

   362 int matchword(char *,char *);

   363 char *flgets(char *,int,FILE *,long);

   364 void lowerit(char *);

   365 int gcisalpha(unsigned char);

   366 int gcisdigit(unsigned char);

   367 int gcisletter(unsigned char);

   368 char *gcstrchr(char *s,char c);

   369 void postprocess_for_HTML(char *);

   370 char *linehasmarkup(char *);

   371 char *losemarkup(char *);

   372 int tagcomp(char *,char *);

   373 char *loseentities(char *);

   374 int isroman(char *);

   375 int usertypo_count;

   376 void postprocess_for_DP(char *);

   378 char wrk[LINEBUFSIZE];

   380 #define MAX_QWORD 50

   381 #define MAX_QWORD_LENGTH 40

   382 char qword[MAX_QWORD][MAX_QWORD_LENGTH];

   383 int dupcnt[MAX_QWORD];

   385 struct first_pass_results {

   386     long firstline,astline;

   387     long footerline,totlen,binlen,alphalen,endquote_count,shortline,dotcomma;

   388     long fslashline,hyphens,longline,verylongline,htmcount,standalone_digit;

   389     long spacedash,emdash,space_emdash,non_PG_space_emdash,PG_space_emdash;

   390     int Dutchcount,Frenchcount;

   391 };

   393 struct warnings {

   394     int shortline,longline,bin,dash,dotcomma,ast,fslash,digit,hyphen;

   395     int endquote,isDutch,isFrench;

   396 };

   398 struct counters {

   399     long quot;

   400     int c_unders,c_brack,s_brack,r_brack;

   401     int open_single_quote,close_single_quote;

   402 };

   404 struct line_properties {

   405     unsigned int len,blen;

   406     char start;

   407 };

   409 struct parities {

   410     int dquote,squote;

   411 };

   413 struct pending {

   414     char dquote[80],squote[80],rbrack[80],sbrack[80],cbrack[80],unders[80];

   415     long squot;

   416 };

   418 int main(int argc,char **argv)

   419 {

   420     char *argsw,*s;

   421     int i,switno,invarg;

   422     char usertypo_file[MAX_PATH];

   423     FILE *usertypofile;

   424     if (strlen(argv[0])<sizeof(running_from))

   425 	/* save the path to the executable */

   426 	strcpy(running_from,argv[0]);

   427     /* find out what directory we're running from */

   428     s=running_from+strlen(running_from);

   429     for (;*s!='/' && *s!='\\' && s>=running_from;s--)

   430 	*s=0;

   431     switno=strlen(SWITCHES);

   432     for (i=switno;--i>0;)

   433 	pswit[i]=0;	   /* initialise switches */

   434     /*

   435      * Standard loop to extract switches.

   436      * When we come out of this loop, the arguments will be

   437      * in argv[0] upwards and the switches used will be

   438      * represented by their equivalent elements in pswit[]

   439      */

   440     while (--argc>0 && **++argv=='-')

   441 	for (argsw=argv[0]+1;*argsw!='\0';argsw++)

   442 	    for (i=switno,invarg=1;(--i>=0) && invarg==1;)

   443 		if ((toupper(*argsw))==SWITCHES[i])

   444 		{

   445 		    invarg=0;

   446 		    pswit[i]=1;

   447 		}

   448     /* Paranoid checking is turned OFF, not on, by its switch */

   449     pswit[PARANOID_SWITCH]^=1;

   450     if (pswit[PARANOID_SWITCH])

   451 	/* if running in paranoid mode force typo checks as well   */

   452 	pswit[TYPO_SWITCH]=pswit[TYPO_SWITCH]^1;

   453     /* Line-end checking is turned OFF, not on, by its switch */

   454     pswit[LINE_END_SWITCH]^=1;

   455     /* Echoing is turned OFF, not on, by its switch */

   456     pswit[ECHO_SWITCH]^=1;

   457     if (pswit[OVERVIEW_SWITCH])

   458 	/* just print summary; don't echo */

   459 	pswit[ECHO_SWITCH]=0;

   460     /*

   461      * Web uploads - for the moment, this is really just a placeholder

   462      * until we decide what processing we really want to do on web uploads

   463      */

   464     if (pswit[WEB_SWITCH])

   465     {

   466 	/* specific override for web uploads */

   467 	pswit[ECHO_SWITCH]=1;

   468 	pswit[SQUOTE_SWITCH]=0;

   469 	pswit[TYPO_SWITCH]=1;

   470 	pswit[QPARA_SWITCH]=0;

   471 	pswit[PARANOID_SWITCH]=1;

   472 	pswit[LINE_END_SWITCH]=0;

   473 	pswit[OVERVIEW_SWITCH]=0;

   474 	pswit[STDOUT_SWITCH]=0;

   475 	pswit[HEADER_SWITCH]=1;

   476 	pswit[VERBOSE_SWITCH]=0;

   477 	pswit[MARKUP_SWITCH]=0;

   478 	pswit[USERTYPO_SWITCH]=0;

   479 	pswit[DP_SWITCH]=0;

   480     }

   481     if (argc<MINARGS || argc>MAXARGS)

   482     {

   483 	/* check number of args */

   484 	proghelp();

   485 	return 1;

   486     }

   487     /* read in the user-defined stealth scanno list */

   488     if (pswit[USERTYPO_SWITCH])

   489     {

   490 	/* ... we were told we had one! */

   491 	usertypofile=fopen(USERTYPO_FILE,"rb");

   492 	if (!usertypofile)

   493 	{

   494 	    /* not in cwd. try excuteable directory. */

   495 	    strcpy(usertypo_file,running_from);

   496 	    strcat(usertypo_file,USERTYPO_FILE);

   497 	    usertypofile=fopen(usertypo_file,"rb");

   498 	    if (!usertypofile) {

   499 		/* we ain't got no user typo file! */

   500 		printf("   --> I couldn't find gutcheck.typ "

   501 		  "-- proceeding without user typos.\n");

   502 	    }

   503 	}

   504 	usertypo_count=0;

   505 	if (usertypofile)

   506 	{

   507 	    /* we managed to open a User Typo File! */

   508 	    if (pswit[USERTYPO_SWITCH])

   509 	    {

   510 		while (flgets(aline,LINEBUFSIZE-1,usertypofile,

   511 		  (long)usertypo_count))

   512 		{

   513 		    if (strlen(aline)>1)

   514 		    {

   515 			if ((int)*aline>33)

   516 			{

   517 			    s=malloc(strlen(aline)+1);

   518 			    if (!s)

   519 			    {

   520 				fprintf(stderr,"bookloupe: cannot get enough "

   521 				  "memory for user typo file!\n");

   522 				exit(1);

   523 			    }

   524 			    strcpy(s,aline);

   525 			    usertypo[usertypo_count]=s;

   526 			    usertypo_count++;

   527 			    if (usertypo_count>=MAX_USER_TYPOS)

   528 			    {

   529 				printf("   --> Only %d user-defined typos "

   530 				  "allowed: ignoring the rest\n",

   531 				  MAX_USER_TYPOS);

   532 				break;

   533 			    }

   534 			}

   535 		    }

   536 		}

   537 	    }

   538 	    fclose(usertypofile);

   539 	}

   540     }

   541     fprintf(stderr,"bookloupe: Check and report on an e-text\n");

   542     cnt_dquot=cnt_squot=cnt_brack=cnt_bin=cnt_odd=cnt_long=

   543     cnt_short=cnt_punct=cnt_dash=cnt_word=cnt_html=cnt_lineend=

   544     cnt_spacend=0;

   545     procfile(argv[0]);

   546     if (pswit[OVERVIEW_SWITCH])

   547     {

   548 	printf("    Checked %ld lines of %ld (head+foot = %ld)\n\n",

   549 	  checked_linecnt,linecnt,linecnt-checked_linecnt);

   550 	printf("    --------------- Queries found --------------\n");

   551 	if (cnt_long)

   552 	    printf("    Long lines:		    %14ld\n",cnt_long);

   553 	if (cnt_short)

   554 	    printf("    Short lines:		   %14ld\n",cnt_short);

   555 	if (cnt_lineend)

   556 	    printf("    Line-end problems:	     %14ld\n",cnt_lineend);

   557 	if (cnt_word)

   558 	    printf("    Common typos:		  %14ld\n",cnt_word);

   559 	if (cnt_dquot)

   560 	    printf("    Unmatched quotes:	      %14ld\n",cnt_dquot);

   561 	if (cnt_squot)

   562 	    printf("    Unmatched SingleQuotes:	%14ld\n",cnt_squot);

   563 	if (cnt_brack)

   564 	    printf("    Unmatched brackets:	    %14ld\n",cnt_brack);

   565 	if (cnt_bin)

   566 	    printf("    Non-ASCII characters:	  %14ld\n",cnt_bin);

   567 	if (cnt_odd)

   568 	    printf("    Proofing characters:	   %14ld\n",cnt_odd);

   569 	if (cnt_punct)

   570 	    printf("    Punctuation & spacing queries: %14ld\n",cnt_punct);

   571 	if (cnt_dash)

   572 	    printf("    Non-standard dashes:	   %14ld\n",cnt_dash);

   573 	if (cnt_html)

   574 	    printf("    Possible HTML tags:	    %14ld\n",cnt_html);

   575 	printf("\n");

   576 	printf("    TOTAL QUERIES		  %14ld\n",

   577 	  cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+

   578 	  cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend);

   579     }

   580     return 0;

   581 }

   583 /*

   584  * first_pass:

   585  *

   586  * Run a first pass - verify that it's a valid PG

   587  * file, decide whether to report some things that

   588  * occur many times in the text like long or short

   589  * lines, non-standard dashes, etc.

   590  */

   591 struct first_pass_results *first_pass(FILE *infile)

   592 {

   593     char laststart=CHAR_SPACE;

   594     const char *s;

   595     int i,llen;

   596     unsigned int lastlen=0,lastblen=0;

   597     long spline=0,nspline=0;

   598     static struct first_pass_results results={0};

   599     char inword[MAXWORDLEN]="";

   600     while (fgets(aline,LINEBUFSIZE-1,infile))

   601     {

   602 	while (aline[strlen(aline)-1]==10 || aline[strlen(aline)-1]==13)

   603 	    aline[strlen(aline)-1]=0;

   604 	linecnt++;

   605 	if (strstr(aline,"*END") && strstr(aline,"SMALL PRINT") &&

   606 	  (strstr(aline,"PUBLIC DOMAIN") || strstr(aline,"COPYRIGHT")))

   607 	{

   608 	    if (spline)

   609 		printf("   --> Duplicate header?\n");

   610 	    spline=linecnt+1;   /* first line of non-header text, that is */

   611 	}

   612 	if (!strncmp(aline,"*** START",9) && strstr(aline,"PROJECT GUTENBERG"))

   613 	{

   614 	    if (nspline)

   615 		printf("   --> Duplicate header?\n");

   616 	    nspline=linecnt+1;   /* first line of non-header text, that is */

   617 	}

   618 	if (spline || nspline)

   619 	{

   620 	    lowerit(aline);

   621 	    if (strstr(aline,"end") && strstr(aline,"project gutenberg"))

   622 	    {

   623 		if (strstr(aline,"end")<strstr(aline,"project gutenberg"))

   624 		{

   625 		    if (results.footerline)

   626 		    {

   627 			/* it's an old-form header - we can detect duplicates */

   628 			if (!nspline)

   629 			    printf("   --> Duplicate footer?\n");

   630 		    }

   631 		    else

   632 			results.footerline=linecnt;

   633 		}

   634 	    }

   635 	}

   636 	if (spline)

   637 	    results.firstline=spline;

   638 	if (nspline)

   639 	    results.firstline=nspline;  /* override with new */

   640 	if (results.footerline)

   641 	    continue;    /* don't count the boilerplate in the footer */

   642 	llen=strlen(aline);

   643 	results.totlen+=llen;

   644 	for (i=0;i<llen;i++)

   645 	{

   646 	    if ((unsigned char)aline[i]>127)

   647 		results.binlen++;

   648 	    if (gcisalpha(aline[i]))

   649 		results.alphalen++;

   650 	    if (i>0 && aline[i]==CHAR_DQUOTE && isalpha(aline[i-1]))

   651 		results.endquote_count++;

   652 	}

   653 	if (strlen(aline)>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE &&

   654 	  lastblen>2 && lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)

   655 	    results.shortline++;

   656 	if (*aline && (unsigned char)aline[strlen(aline)-1]<=CHAR_SPACE)

   657 	    cnt_spacend++;

   658 	if (strstr(aline,".,"))

   659 	    results.dotcomma++;

   660 	/* only count ast lines for ignoring purposes where there is */

   661 	/* locase text on the line */

   662 	if (strstr(aline,"*"))

   663 	{

   664 	    for (s=aline;*s;s++)

   665 		if (*s>='a' && *s<='z')

   666 		    break;

   667 	     if (*s)

   668 		results.astline++;

   669 	}

   670 	if (strstr(aline,"/"))

   671 	    results.fslashline++;

   672 	for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--)

   673 	    ;

   674 	if (aline[i]=='-' && aline[i-1]!='-')

   675 	    results.hyphens++;

   676 	if (llen>LONGEST_PG_LINE)

   677 	    results.longline++;

   678 	if (llen>WAY_TOO_LONG)

   679 	    results.verylongline++;

   680 	if (strstr(aline,"<") && strstr(aline,">"))

   681 	{

   682 	    i=(int)(strstr(aline,">")-strstr(aline,"<")+1);

   683 	    if (i>0)

   684 		results.htmcount++;

   685 	    if (strstr(aline,"<i>"))

   686 		results.htmcount+=4; /* bonus marks! */

   687 	}

   688 	/* Check for spaced em-dashes */

   689 	if (strstr(aline,"--"))

   690 	{

   691 	    results.emdash++;

   692 	    if (*(strstr(aline,"--")-1)==CHAR_SPACE ||

   693 	       (*(strstr(aline,"--")+2)==CHAR_SPACE))

   694 		results.space_emdash++;

   695 	    if (*(strstr(aline,"--")-1)==CHAR_SPACE &&

   696 	       (*(strstr(aline,"--")+2)==CHAR_SPACE))

   697 		/* count of em-dashes with spaces both sides */

   698 		results.non_PG_space_emdash++;

   699 	    if (*(strstr(aline,"--")-1)!=CHAR_SPACE &&

   700 	       (*(strstr(aline,"--")+2)!=CHAR_SPACE))

   701 		/* count of PG-type em-dashes with no spaces */

   702 		results.PG_space_emdash++;

   703 	}

   704 	for (s=aline;*s;)

   705 	{

   706 	    s=getaword(s,inword);

   707 	    if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))

   708 		results.Dutchcount++;

   709 	    if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))

   710 		results.Frenchcount++;

   711 	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))

   712 		results.standalone_digit++;

   713 	}

   714 	/* Check for spaced dashes */

   715 	if (strstr(aline," -") && *(strstr(aline," -")+2)!='-')

   716 	    results.spacedash++;

   717 	lastblen=lastlen;

   718 	lastlen=strlen(aline);

   719 	laststart=aline[0];

   720     }

   721     return &results;

   722 }

   724 /*

   725  * report_first_pass:

   726  *

   727  * Make some snap decisions based on the first pass results.

   728  */

   729 struct warnings *report_first_pass(struct first_pass_results *results)

   730 {

   731     static struct warnings warnings={0};

   732     if (cnt_spacend>0)

   733 	printf("   --> %ld lines in this file have white space at end\n",

   734 	  cnt_spacend);

   735     warnings.dotcomma=1;

   736     if (results->dotcomma>5)

   737     {

   738 	warnings.dotcomma=0;

   739 	printf("   --> %ld lines in this file contain '.,'. "

   740 	  "Not reporting them.\n",results->dotcomma);

   741     }

   742     /*

   743      * If more than 50 lines, or one-tenth, are short,

   744      * don't bother reporting them.

   745      */

   746     warnings.shortline=1;

   747     if (results->shortline>50 || results->shortline*10>linecnt)

   748     {

   749 	warnings.shortline=0;

   750 	printf("   --> %ld lines in this file are short. "

   751 	  "Not reporting short lines.\n",results->shortline);

   752     }

   753     /*

   754      * If more than 50 lines, or one-tenth, are long,

   755      * don't bother reporting them.

   756      */

   757     warnings.longline=1;

   758     if (results->longline>50 || results->longline*10>linecnt)

   759     {

   760 	warnings.longline=0;

   761 	printf("   --> %ld lines in this file are long. "

   762 	  "Not reporting long lines.\n",results->longline);

   763     }

   764     /* If more than 10 lines contain asterisks, don't bother reporting them. */

   765     warnings.ast=1;

   766     if (results->astline>10)

   767     {

   768 	warnings.ast=0;

   769 	printf("   --> %ld lines in this file contain asterisks. "

   770 	  "Not reporting them.\n",results->astline);

   771     }

   772     /*

   773      * If more than 10 lines contain forward slashes,

   774      * don't bother reporting them.

   775      */

   776     warnings.fslash=1;

   777     if (results->fslashline>10)

   778     {

   779 	warnings.fslash=0;

   780 	printf("   --> %ld lines in this file contain forward slashes. "

   781 	  "Not reporting them.\n",results->fslashline);

   782     }

   783     /*

   784      * If more than 20 lines contain unpunctuated endquotes,

   785      * don't bother reporting them.

   786      */

   787     warnings.endquote=1;

   788     if (results->endquote_count>20)

   789     {

   790 	warnings.endquote=0;

   791 	printf("   --> %ld lines in this file contain unpunctuated endquotes. "

   792 	  "Not reporting them.\n",results->endquote_count);

   793     }

   794     /*

   795      * If more than 15 lines contain standalone digits,

   796      * don't bother reporting them.

   797      */

   798     warnings.digit=1;

   799     if (results->standalone_digit>10)

   800     {

   801 	warnings.digit=0;

   802 	printf("   --> %ld lines in this file contain standalone 0s and 1s. "

   803 	  "Not reporting them.\n",results->standalone_digit);

   804     }

   805     /*

   806      * If more than 20 lines contain hyphens at end,

   807      * don't bother reporting them.

   808      */

   809     warnings.hyphen=1;

   810     if (results->hyphens>20)

   811     {

   812 	warnings.hyphen=0;

   813 	printf("   --> %ld lines in this file have hyphens at end. "

   814 	  "Not reporting them.\n",results->hyphens);

   815     }

   816     if (results->htmcount>20 && !pswit[MARKUP_SWITCH])

   817     {

   818 	printf("   --> Looks like this is HTML. Switching HTML mode ON.\n");

   819 	pswit[MARKUP_SWITCH]=1;

   820     }

   821     if (results->verylongline>0)

   822 	printf("   --> %ld lines in this file are VERY long!\n",

   823 	  results->verylongline);

   824     /*

   825      * If there are more non-PG spaced dashes than PG em-dashes,

   826      * assume it's deliberate.

   827      * Current PG guidelines say don't use them, but older texts do,

   828      * and some people insist on them whatever the guidelines say.

   829      */

   830     warnings.dash=1;

   831     if (results->spacedash+results->non_PG_space_emdash>

   832       results->PG_space_emdash)

   833     {

   834 	warnings.dash=0;

   835 	printf("   --> There are %ld spaced dashes and em-dashes. "

   836 	  "Not reporting them.\n",

   837 	  results->spacedash+results->non_PG_space_emdash);

   838     }

   839     /* If more than a quarter of characters are hi-bit, bug out. */

   840     warnings.bin=1;

   841     if (results->binlen*4>results->totlen)

   842     {

   843 	printf("   --> This file does not appear to be ASCII. "

   844 	  "Terminating. Best of luck with it!\n");

   845 	exit(1);

   846     }

   847     if (results->alphalen*4<results->totlen)

   848     {

   849 	printf("   --> This file does not appear to be text. "

   850 	  "Terminating. Best of luck with it!\n");

   851 	exit(1);

   852     }

   853     if (results->binlen*100>results->totlen || results->binlen>100)

   854     {

   855 	printf("   --> There are a lot of foreign letters here. "

   856 	  "Not reporting them.\n");

   857 	warnings.bin=0;

   858     }

   859     warnings.isDutch=0;

   860     if (results->Dutchcount>50)

   861     {

   862 	warnings.isDutch=1;

   863 	printf("   --> This looks like Dutch - "

   864 	  "switching off dashes and warnings for 's Middags case.\n");

   865     }

   866     warnings.isFrench=0;

   867     if (results->Frenchcount>50)

   868     {

   869 	warnings.isFrench=1;

   870 	printf("   --> This looks like French - "

   871 	  "switching off some doublepunct.\n");

   872     }

   873     if (results->firstline && results->footerline)

   874 	printf("    The PG header and footer appear to be already on.\n");

   875     else

   876     {

   877 	if (results->firstline)

   878 	    printf("    The PG header is on - no footer.\n");

   879 	if (results->footerline)

   880 	    printf("    The PG footer is on - no header.\n");

   881     }

   882     printf("\n");

   883     if (pswit[VERBOSE_SWITCH])

   884     {

   885 	warnings.bin=1;

   886 	warnings.shortline=1;

   887 	warnings.dotcomma=1;

   888 	warnings.longline=1;

   889 	warnings.dash=1;

   890 	warnings.digit=1;

   891 	warnings.ast=1;

   892 	warnings.fslash=1;

   893 	warnings.hyphen=1;

   894 	warnings.endquote=1;

   895 	printf("   *** Verbose output is ON -- you asked for it! ***\n");

   896     }

   897     if (warnings.isDutch)

   898 	warnings.dash=0;

   899     if (results->footerline>0 && results->firstline>0 &&

   900       results->footerline>results->firstline &&

   901       results->footerline-results->firstline<100)

   902     {

   903 	printf("   --> I don't really know where this text starts. \n");

   904 	printf("       There are no reference points.\n");

   905 	printf("       I'm going to have to report the header and footer "

   906 	  "as well.\n");

   907 	results->firstline=0;

   908     }

   909     return &warnings;

   910 }

   912 /*

   913  * analyse_quotes:

   914  *

   915  * Look along the line, accumulate the count of quotes, and see

   916  * if this is an empty line - i.e. a line with nothing on it

   917  * but spaces.

   918  * If line has just spaces, period, * and/or - on it, don't

   919  * count it, since empty lines with asterisks or dashes to

   920  * separate sections are common.

   921  *

   922  * Returns: Non-zero if the line is empty.

   923  */

   924 int analyse_quotes(const char *s,struct counters *counters)

   925 {

   926     int guessquote=0;

   927     int isemptyline=1;    /* assume the line is empty until proven otherwise */

   928     while (*s)

   929     {

   930 	if (*s==CHAR_DQUOTE)

   931 	    counters->quot++;

   932 	if (*s==CHAR_SQUOTE || *s==CHAR_OPEN_SQUOTE)

   933 	{

   934 	    if (s==aline)

   935 	    {

   936 		/*

   937 		 * At start of line, it can only be an openquote.

   938 		 * Hardcode a very common exception!

   939 		 */

   940 		if (strncmp(s+2,"tis",3) && strncmp(s+2,"Tis",3))

   941 		    counters->open_single_quote++;

   942 	    }

   943 	    else if (gcisalpha(s[-1]) && gcisalpha(s[1]))

   944 		/* Do nothing! it's definitely an apostrophe, not a quote */

   945 		;

   946 	    /* it's outside a word - let's check it out */

   947 	    else if (*s==CHAR_OPEN_SQUOTE || gcisalpha(s[1]))

   948 	    {

   949 		/* it damwell better BE an openquote */

   950 		if (strncmp(s+1,"tis",3) && strncmp(s+1,"Tis",3))

   951 		    /* hardcode a very common exception! */

   952 		    counters->open_single_quote++;

   953 	    }

   954 	    else

   955 	    {

   956 		/* now - is it a closequote? */

   957 		guessquote=0;   /* accumulate clues */

   958 		if (gcisalpha(s[-1]))

   959 		{

   960 		    /* it follows a letter - could be either */

   961 		    guessquote++;

   962 		    if (s[-1]=='s')

   963 		    {

   964 			/* looks like a plural apostrophe */

   965 			guessquote-=3;

   966 			if (s[1]==CHAR_SPACE)  /* bonus marks! */

   967 			    guessquote-=2;

   968 		    }

   969 		}

   970 		/* it doesn't have a letter either side */

   971 		else if (strchr(".?!,;:",s[-1]) && strchr(".?!,;: ",s[1]))

   972 		    guessquote+=8; /* looks like a closequote */

   973 		else

   974 		    guessquote++;

   975 		if (counters->open_single_quote>counters->close_single_quote)

   976 		    /*

   977 		     * Give it the benefit of some doubt,

   978 		     * if a squote is already open.

   979 		     */

   980 		    guessquote++;

   981 		else

   982 		    guessquote--;

   983 		if (guessquote>=0)

   984 		    counters->close_single_quote++;

   985 	    }

   986 	}

   987 	if (*s!=CHAR_SPACE && *s!='-' && *s!='.' && *s!=CHAR_ASTERISK &&

   988 	  *s!=13 && *s!=10)

   989 	    isemptyline=0;  /* ignore lines like  *  *  *  as spacers */

   990 	if (*s==CHAR_UNDERSCORE)

   991 	    counters->c_unders++;

   992 	if (*s==CHAR_OPEN_CBRACK)

   993 	    counters->c_brack++;

   994 	if (*s==CHAR_CLOSE_CBRACK)

   995 	    counters->c_brack--;

   996 	if (*s==CHAR_OPEN_RBRACK)

   997 	    counters->r_brack++;

   998 	if (*s==CHAR_CLOSE_RBRACK)

   999 	    counters->r_brack--;

  1000 	if (*s==CHAR_OPEN_SBRACK)

  1001 	    counters->s_brack++;

  1002 	if (*s==CHAR_CLOSE_SBRACK)

  1003 	    counters->s_brack--;

  1004 	s++;

  1005     }

  1006     return isemptyline;

  1007 }

  1009 /*

  1010  * check_for_control_characters:

  1011  *

  1012  * Check for invalid or questionable characters in the line

  1013  * Anything above 127 is invalid for plain ASCII, and

  1014  * non-printable control characters should also be flagged.

  1015  * Tabs should generally not be there.

  1016  */

  1017 void check_for_control_characters(const char *aline)

  1018 {

  1019     unsigned char c;

  1020     const char *s;

  1021     for (s=aline;*s;s++)

  1022     {

  1023 	c=*(unsigned char *)s;

  1024 	if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)

  1025 	{

  1026 	    if (pswit[ECHO_SWITCH])

  1027 		printf("\n%s\n",aline);

  1028 	    if (!pswit[OVERVIEW_SWITCH])

  1029 		printf("    Line %ld column %d - Control character %d\n",

  1030 		  linecnt,(int)(s-aline)+1,c);

  1031 	    else

  1032 		cnt_bin++;

  1033 	}

  1034     }

  1035 }

  1037 /*

  1038  * check_for_odd_characters:

  1039  *

  1040  * Check for binary and other odd characters.

  1041  */

  1042 void check_for_odd_characters(const char *aline,const struct warnings *warnings,

  1043   int isemptyline)

  1044 {

  1045     /* Don't repeat multiple warnings on one line. */

  1046     int eNon_A=0,eTab=0,eTilde=0,eCarat=0,eFSlash=0,eAst=0;

  1047     const char *s;

  1048     unsigned char c;

  1049     for (s=aline;*s;s++)

  1050     {

  1051 	c=*(unsigned char *)s;

  1052 	if (!eNon_A && (*s<CHAR_SPACE && *s!=9 && *s!='\n' || c>127))

  1053 	{

  1054 	    if (pswit[ECHO_SWITCH])

  1055 		printf("\n%s\n",aline);

  1056 	    if (!pswit[OVERVIEW_SWITCH])

  1057 		if (c>127 && c<160)

  1058 		    printf("    Line %ld column %d - "

  1059 		      "Non-ISO-8859 character %d\n",linecnt,(int)(s-aline)+1,c);

  1060 		else

  1061 		    printf("    Line %ld column %d - Non-ASCII character %d\n",

  1062 		      linecnt,(int)(s-aline)+1,c);

  1063 	    else

  1064 		cnt_bin++;

  1065 	    eNon_A=1;

  1066 	}

  1067 	if (!eTab && *s==CHAR_TAB)

  1068 	{

  1069 	    if (pswit[ECHO_SWITCH])

  1070 		printf("\n%s\n",aline);

  1071 	    if (!pswit[OVERVIEW_SWITCH])

  1072 		printf("    Line %ld column %d - Tab character?\n",

  1073 		  linecnt,(int)(s-aline)+1);

  1074 	    else

  1075 		cnt_odd++;

  1076 	    eTab=1;

  1077 	}

  1078 	if (!eTilde && *s==CHAR_TILDE)

  1079 	{

  1080 	    /*

  1081 	     * Often used by OCR software to indicate an

  1082 	     * unrecognizable character.

  1083 	     */

  1084 	    if (pswit[ECHO_SWITCH])

  1085 		printf("\n%s\n",aline);

  1086 	    if (!pswit[OVERVIEW_SWITCH])

  1087 		printf("    Line %ld column %d - Tilde character?\n",

  1088 		  linecnt,(int)(s-aline)+1);

  1089 	    else

  1090 		cnt_odd++;

  1091 	    eTilde=1;

  1092 	}

  1093 	if (!eCarat && *s==CHAR_CARAT)

  1094 	{

  1095 	    if (pswit[ECHO_SWITCH])

  1096 		printf("\n%s\n",aline);

  1097 	    if (!pswit[OVERVIEW_SWITCH])

  1098 		printf("    Line %ld column %d - Carat character?\n",

  1099 		  linecnt,(int)(s-aline)+1);

  1100 	    else

  1101 		cnt_odd++;

  1102 	    eCarat=1;

  1103 	}

  1104 	if (!eFSlash && *s==CHAR_FORESLASH && warnings->fslash)

  1105 	{

  1106 	    if (pswit[ECHO_SWITCH])

  1107 		printf("\n%s\n",aline);

  1108 	    if (!pswit[OVERVIEW_SWITCH])

  1109 		printf("    Line %ld column %d - Forward slash?\n",

  1110 		  linecnt,(int)(s-aline)+1);

  1111 	    else

  1112 		cnt_odd++;

  1113 	    eFSlash=1;

  1114 	}

  1115 	/*

  1116 	 * Report asterisks only in paranoid mode,

  1117 	 * since they're often deliberate.

  1118 	 */

  1119 	if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&

  1120 	  *s==CHAR_ASTERISK)

  1121 	{

  1122 	    if (pswit[ECHO_SWITCH])

  1123 		printf("\n%s\n",aline);

  1124 	    if (!pswit[OVERVIEW_SWITCH])

  1125 		printf("    Line %ld column %d - Asterisk?\n",

  1126 		  linecnt,(int)(s-aline)+1);

  1127 	    else

  1128 		cnt_odd++;

  1129 	    eAst=1;

  1130 	}

  1131     }

  1132 }

  1134 /*

  1135  * check_for_long_line:

  1136  *

  1137  * Check for line too long.

  1138  */

  1139 void check_for_long_line(const char *aline)

  1140 {

  1141     if (strlen(aline)>LONGEST_PG_LINE)

  1142     {

  1143 	if (pswit[ECHO_SWITCH])

  1144 	    printf("\n%s\n",aline);

  1145 	if (!pswit[OVERVIEW_SWITCH])

  1146 	    printf("    Line %ld column %d - Long line %d\n",

  1147 	      linecnt,(int)strlen(aline),(int)strlen(aline));

  1148 	else

  1149 	    cnt_long++;

  1150     }

  1151 }

  1153 /*

  1154  * check_for_short_line:

  1155  *

  1156  * Check for line too short.

  1157  *

  1158  * This one is a bit trickier to implement: we don't want to

  1159  * flag the last line of a paragraph for being short, so we

  1160  * have to wait until we know that our current line is a

  1161  * "normal" line, then report the _previous_ line if it was too

  1162  * short. We also don't want to report indented lines like

  1163  * chapter heads or formatted quotations. We therefore keep

  1164  * last->len as the length of the last line examined, and

  1165  * last->blen as the length of the last but one, and try to

  1166  * suppress unnecessary warnings by checking that both were of

  1167  * "normal" length. We keep the first character of the last

  1168  * line in last->start, and if it was a space, we assume that

  1169  * the formatting is deliberate. I can't figure out a way to

  1170  * distinguish something like a quoted verse left-aligned or

  1171  * the header or footer of a letter from a paragraph of short

  1172  * lines - maybe if I examined the whole paragraph, and if the

  1173  * para has less than, say, 8 lines and if all lines are short,

  1174  * then just assume it's OK? Need to look at some texts to see

  1175  * how often a formula like this would get the right result.

  1176  */

  1177 void check_for_short_line(const char *aline,const struct line_properties *last)

  1178 {

  1179     if (strlen(aline)>1 && last->len>1 && last->len<SHORTEST_PG_LINE &&

  1180       last->blen>1 && last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)

  1181     {

  1182 	if (pswit[ECHO_SWITCH])

  1183 	    printf("\n%s\n",prevline);

  1184 	if (!pswit[OVERVIEW_SWITCH])

  1185 	    printf("    Line %ld column %d - Short line %d?\n",

  1186 	      linecnt-1,(int)strlen(prevline),(int)strlen(prevline));

  1187 	else

  1188 	    cnt_short++;

  1189     }

  1190 }

  1192 /*

  1193  * check_for_starting_punctuation:

  1194  *

  1195  * Look for punctuation other than full ellipses at start of line.

  1196  */

  1197 void check_for_starting_punctuation(const char *aline)

  1198 {

  1199     if (*aline && strchr(".?!,;:",aline[0]) && strncmp(". . .",aline,5))

  1200     {

  1201 	if (pswit[ECHO_SWITCH])

  1202 	    printf("\n%s\n",aline);

  1203 	if (!pswit[OVERVIEW_SWITCH])

  1204 	    printf("    Line %ld column 1 - Begins with punctuation?\n",

  1205 	      linecnt);

  1206 	else

  1207 	    cnt_punct++;

  1208     }

  1209 }

  1211 /*

  1212  * check_for_spaced_emdash:

  1213  *

  1214  * Check for spaced em-dashes.

  1215  *

  1216  * We must check _all_ occurrences of "--" on the line

  1217  * hence the loop - even if the first double-dash is OK

  1218  * there may be another that's wrong later on.

  1219  */

  1220 void check_for_spaced_emdash(const char *aline)

  1221 {

  1222     const char *s,*t;

  1223     s=aline;

  1224     while ((t=strstr(s,"--")))

  1225     {

  1226 	if (t>aline && t[-1]==CHAR_SPACE || t[2]==CHAR_SPACE)

  1227 	{

  1228 	    if (pswit[ECHO_SWITCH])

  1229 		printf("\n%s\n",aline);

  1230 	    if (!pswit[OVERVIEW_SWITCH])

  1231 		printf("    Line %ld column %d - Spaced em-dash?\n",

  1232 		  linecnt,(int)(t-aline)+1);

  1233 	    else

  1234 		cnt_dash++;

  1235 	}

  1236 	s=t+2;

  1237     }

  1238 }

  1240 /*

  1241  * check_for_spaced_dash:

  1242  *

  1243  * Check for spaced dashes.

  1244  */

  1245 void check_for_spaced_dash(const char *aline)

  1246 {

  1247     const char *s;

  1248     if ((s=strstr(aline," -")))

  1249     {

  1250 	if (s[2]!='-')

  1251 	{

  1252 	    if (pswit[ECHO_SWITCH])

  1253 		printf("\n%s\n",aline);

  1254 	    if (!pswit[OVERVIEW_SWITCH])

  1255 		printf("    Line %ld column %d - Spaced dash?\n",

  1256 		  linecnt,(int)(s-aline)+1);

  1257 	    else

  1258 		cnt_dash++;

  1259 	}

  1260     }

  1261     else if ((s=strstr(aline,"- ")))

  1262     {

  1263 	if (s==aline || s[-1]!='-')

  1264 	{

  1265 	    if (pswit[ECHO_SWITCH])

  1266 		printf("\n%s\n",aline);

  1267 	    if (!pswit[OVERVIEW_SWITCH])

  1268 		printf("    Line %ld column %d - Spaced dash?\n",

  1269 		  linecnt,(int)(s-aline)+1);

  1270 	    else

  1271 		cnt_dash++;

  1272 	}

  1273     }

  1274 }

  1276 /*

  1277  * check_for_unmarked_paragraphs:

  1278  *

  1279  * Check for unmarked paragraphs indicated by separate speakers.

  1280  *

  1281  * May well be false positive:

  1282  * "Bravo!" "Wonderful!" called the crowd.

  1283  * but useful all the same.

  1284  */

  1285 void check_for_unmarked_paragraphs(const char *aline)

  1286 {

  1287     const char *s;

  1288     s=strstr(aline,"\"  \"");

  1289     if (!s)

  1290 	s=strstr(aline,"\" \"");

  1291     if (s)

  1292     {

  1293 	if (pswit[ECHO_SWITCH])

  1294 	    printf("\n%s\n",aline);

  1295 	if (!pswit[OVERVIEW_SWITCH])

  1296 	    printf("    Line %ld column %d - Query missing paragraph break?\n",

  1297 	      linecnt,(int)(s-aline)+1);

  1298 	else

  1299 	    cnt_punct++;

  1300     }

  1301 }

  1303 /*

  1304  * check_for_jeebies:

  1305  *

  1306  * Check for "to he" and other easy h/b errors.

  1307  *

  1308  * This is a very inadequate effort on the h/b problem,

  1309  * but the phrase "to he" is always an error, whereas "to

  1310  * be" is quite common.

  1311  * Similarly, '"Quiet!", be said.' is a non-be error

  1312  * "to he" is _not_ always an error!:

  1313  *       "Where they went to he couldn't say."

  1314  * Another false positive:

  1315  *       What would "Cinderella" be without the . . .

  1316  * and another: "If he wants to he can see for himself."

  1317  */

  1318 void check_for_jeebies(const char *aline)

  1319 {

  1320     const char *s;

  1321     s=strstr(aline," be could ");

  1322     if (!s)

  1323 	s=strstr(aline," be would ");

  1324     if (!s)

  1325 	s=strstr(aline," was be ");

  1326     if (!s)

  1327 	s=strstr(aline," be is ");

  1328     if (!s)

  1329 	s=strstr(aline," is be ");

  1330     if (!s)

  1331 	s=strstr(aline,"\", be ");

  1332     if (!s)

  1333 	s=strstr(aline,"\" be ");

  1334     if (!s)

  1335 	s=strstr(aline,"\" be ");

  1336     if (!s)

  1337 	s=strstr(aline," to he ");

  1338     if (s)

  1339     {

  1340 	if (pswit[ECHO_SWITCH])

  1341 	    printf("\n%s\n",aline);

  1342 	if (!pswit[OVERVIEW_SWITCH])

  1343 	    printf("    Line %ld column %d - Query he/be error?\n",

  1344 	      linecnt,(int)(s-aline)+1);

  1345 	else

  1346 	    cnt_word++;

  1347     }

  1348     s=strstr(aline," the had ");

  1349     if (!s)

  1350 	s=strstr(aline," a had ");

  1351     if (!s)

  1352 	s=strstr(aline," they bad ");

  1353     if (!s)

  1354 	s=strstr(aline," she bad ");

  1355     if (!s)

  1356 	s=strstr(aline," he bad ");

  1357     if (!s)

  1358 	s=strstr(aline," you bad ");

  1359     if (!s)

  1360 	s=strstr(aline," i bad ");

  1361     if (s)

  1362     {

  1363 	if (pswit[ECHO_SWITCH])

  1364 	    printf("\n%s\n",aline);

  1365 	if (!pswit[OVERVIEW_SWITCH])

  1366 	    printf("    Line %ld column %d - Query had/bad error?\n",

  1367 	      linecnt,(int)(s-aline)+1);

  1368 	else

  1369 	    cnt_word++;

  1370     }

  1371     s=strstr(aline,"; hut ");

  1372     if (!s)

  1373 	s=strstr(aline,", hut ");

  1374     if (s)

  1375     {

  1376 	if (pswit[ECHO_SWITCH])

  1377 	    printf("\n%s\n",aline);

  1378 	if (!pswit[OVERVIEW_SWITCH])

  1379 	    printf("    Line %ld column %d - Query hut/but error?\n",

  1380 	      linecnt,(int)(s-aline)+1);

  1381 	else

  1382 	    cnt_word++;

  1383     }

  1384 }

  1386 /*

  1387  * check_for_mta_from:

  1388  *

  1389  * Special case - angled bracket in front of "From" placed there by an

  1390  * MTA when sending an e-mail.

  1391  */

  1392 void check_for_mta_from(const char *aline)

  1393 {

  1394     const char *s;

  1395     s=strstr(aline,">From");

  1396     if (s)

  1397     {

  1398 	if (pswit[ECHO_SWITCH])

  1399 	    printf("\n%s\n",aline);

  1400 	if (!pswit[OVERVIEW_SWITCH])

  1401 	    printf("    Line %ld column %d - Query angled bracket with From\n",

  1402 	      linecnt,(int)(s-aline)+1);

  1403 	else

  1404 	    cnt_punct++;

  1405     }

  1406 }

  1408 /*

  1409  * check_for_orphan_character:

  1410  *

  1411  * Check for a single character line -

  1412  * often an overflow from bad wrapping.

  1413  */

  1414 void check_for_orphan_character(const char *aline)

  1415 {

  1416     if (*aline && !aline[1])

  1417     {

  1418 	if (*aline=='I' || *aline=='V' || *aline=='X' || *aline=='L' ||

  1419 	  gcisdigit(*aline))

  1420 	    ; /* Nothing - ignore numerals alone on a line. */

  1421 	else

  1422 	{

  1423 	    if (pswit[ECHO_SWITCH])

  1424 		printf("\n%s\n",aline);

  1425 	    if (!pswit[OVERVIEW_SWITCH])

  1426 		printf("    Line %ld column 1 - Query single character line\n",

  1427 		  linecnt);

  1428 	    else

  1429 		cnt_punct++;

  1430 	}

  1431     }

  1432 }

  1434 /*

  1435  * check_for_pling_scanno:

  1436  *

  1437  * Check for I" - often should be !

  1438  */

  1439 void check_for_pling_scanno(const char *aline)

  1440 {

  1441     const char *s;

  1442     s=strstr(aline," I\"");

  1443     if (s)

  1444     {

  1445 	if (pswit[ECHO_SWITCH])

  1446 	    printf("\n%s\n",aline);

  1447 	if (!pswit[OVERVIEW_SWITCH])

  1448 	    printf("    Line %ld column %ld - Query I=exclamation mark?\n",

  1449 	      linecnt,s-aline);

  1450 	else

  1451 	    cnt_punct++;

  1452     }

  1453 }

  1455 /*

  1456  * check_for_extra_period:

  1457  *

  1458  * Check for period without a capital letter. Cut-down from gutspell.

  1459  * Only works when it happens on a single line.

  1460  */

  1461 void check_for_extra_period(const char *aline,const struct warnings *warnings)

  1462 {

  1463     const char *s,*t,*s1;

  1464     int i,istypo,isdup;

  1465     static char qperiod[MAX_QWORD][MAX_QWORD_LENGTH];

  1466     static int qperiod_index=0;

  1467     char testword[MAXWORDLEN]="";

  1468     if (pswit[PARANOID_SWITCH])

  1469     {

  1470 	for (t=s=aline;strstr(t,". ");)

  1471 	{

  1472 	    t=strstr(t,". ");

  1473 	    if (t==s)

  1474 	    {

  1475 		t++;

  1476 		/* start of line punctuation is handled elsewhere */

  1477 		continue;

  1478 	    }

  1479 	    if (!gcisalpha(t[-1]))

  1480 	    {

  1481 		t++;

  1482 		continue;

  1483 	    }

  1484 	    if (warnings->isDutch)

  1485 	    {

  1486 		/* For Frank & Jeroen -- 's Middags case */

  1487 		if (t[2]==CHAR_SQUOTE && t[3]>='a' && t[3]<='z' &&

  1488 		  t[4]==CHAR_SPACE && t[5]>='A' && t[5]<='Z')

  1489 		{

  1490 		    t++;

  1491 		    continue;

  1492 		}

  1493 	    }

  1494 	    s1=t+2;

  1495 	    while (*s1 && !gcisalpha(*s1) && !isdigit(*s1))

  1496 		s1++;

  1497 	    if (*s1>='a' && *s1<='z')

  1498 	    {

  1499 		/* we have something to investigate */

  1500 		istypo=1;

  1501 		/* so let's go back and find out */

  1502 		for (s1=t-1;s1>=s &&

  1503 		  (gcisalpha(*s1) || gcisdigit(*s1) || *s1==CHAR_SQUOTE &&

  1504 		  gcisalpha(s1[1]) && gcisalpha(s1[-1]));s1--)

  1505 		    ;

  1506 		s1++;

  1507 		for (i=0;*s1 && *s1!='.';s1++,i++)

  1508 		    testword[i]=*s1;

  1509 		testword[i]=0;

  1510 		for (i=0;*abbrev[i];i++)

  1511 		    if (!strcmp(testword,abbrev[i]))

  1512 			istypo=0;

  1513 		if (gcisdigit(*testword))

  1514 		    istypo=0;

  1515 		if (!testword[1])

  1516 		    istypo=0;

  1517 		if (isroman(testword))

  1518 		    istypo=0;

  1519 		if (istypo)

  1520 		{

  1521 		    istypo=0;

  1522 		    for (i=0;testword[i];i++)

  1523 			if (strchr(vowels,testword[i]))

  1524 			    istypo=1;

  1525 		}

  1526 		if (istypo)

  1527 		{

  1528 		    isdup=0;

  1529 		    if (strlen(testword)<MAX_QWORD_LENGTH &&

  1530 		      !pswit[VERBOSE_SWITCH])

  1531 			for (i=0;i<qperiod_index;i++)

  1532 			    if (!strcmp(testword,qperiod[i]))

  1533 				isdup=1;

  1534 		    if (!isdup)

  1535 		    {

  1536 			if (qperiod_index<MAX_QWORD &&

  1537 			  strlen(testword)<MAX_QWORD_LENGTH)

  1538 			{

  1539 			    strcpy(qperiod[qperiod_index],testword);

  1540 			    qperiod_index++;

  1541 			}

  1542 			if (pswit[ECHO_SWITCH])

  1543 			    printf("\n%s\n",aline);

  1544 			if (!pswit[OVERVIEW_SWITCH])

  1545 			    printf("    Line %ld column %d - Extra period?\n",

  1546 			      linecnt,(int)(t-aline)+1);

  1547 			else

  1548 			    cnt_punct++;

  1549 		    }

  1550 		}

  1551 	    }

  1552 	    t++;

  1553 	}

  1554     }

  1555 }

  1557 /*

  1558  * check_for_following_punctuation:

  1559  *

  1560  * Check for words usually not followed by punctuation.

  1561  */

  1562 void check_for_following_punctuation(const char *aline)

  1563 {

  1564     int i;

  1565     const char *s,*wordstart;

  1566     char inword[MAXWORDLEN];

  1567     if (pswit[TYPO_SWITCH])

  1568     {

  1569 	for (s=aline;*s;)

  1570 	{

  1571 	    wordstart=s;

  1572 	    s=getaword(s,inword);

  1573 	    if (!*inword)

  1574 		continue;

  1575 	    lowerit(inword);

  1576 	    for (i=0;*nocomma[i];i++)

  1577 		if (!strcmp(inword,nocomma[i]))

  1578 		{

  1579 		    if (*s==',' || *s==';' || *s==':')

  1580 		    {

  1581 			if (pswit[ECHO_SWITCH])

  1582 			    printf("\n%s\n",aline);

  1583 			if (!pswit[OVERVIEW_SWITCH])

  1584 			    printf("    Line %ld column %d - "

  1585 			      "Query punctuation after %s?\n",

  1586 			      linecnt,(int)(s-aline)+1,inword);

  1587 			else

  1588 			    cnt_punct++;

  1589 		    }

  1590 		}

  1591 	    for (i=0;*noperiod[i];i++)

  1592 		if (!strcmp(inword,noperiod[i]))

  1593 		{

  1594 		    if (*s=='.' || *s=='!')

  1595 		    {

  1596 			if (pswit[ECHO_SWITCH])

  1597 			    printf("\n%s\n",aline);

  1598 			if (!pswit[OVERVIEW_SWITCH])

  1599 			    printf("    Line %ld column %d - "

  1600 			      "Query punctuation after %s?\n",

  1601 			      linecnt,(int)(s-aline)+1,inword);

  1602 			else

  1603 			    cnt_punct++;

  1604 		    }

  1605 		}

  1606 	}

  1607     }

  1608 }

  1610 /*

  1611  * check_for_typos:

  1612  *

  1613  * Check for commonly mistyped words,

  1614  * and digits like 0 for O in a word.

  1615  */

  1616 void check_for_typos(const char *aline,struct warnings *warnings)

  1617 {

  1618     const char *s,*wordstart;

  1619     char inword[MAXWORDLEN],testword[MAXWORDLEN];

  1620     int i,istypo,isdup,alower,vowel,consonant;

  1621     static int qword_index=0;

  1622     for (s=aline;*s;)

  1623     {

  1624 	wordstart=s;

  1625 	s=getaword(s,inword);

  1626 	if (!*inword)

  1627 	    continue; /* don't bother with empty lines */

  1628 	if (mixdigit(inword))

  1629 	{

  1630 	    if (pswit[ECHO_SWITCH])

  1631 		printf("\n%s\n",aline);

  1632 	    if (!pswit[OVERVIEW_SWITCH])

  1633 		printf("    Line %ld column %d - Query digit in %s\n",

  1634 		  linecnt,(int)(wordstart-aline)+1,inword);

  1635 	    else

  1636 		cnt_word++;

  1637 	}

  1638 	/*

  1639 	 * Put the word through a series of tests for likely typos and OCR

  1640 	 * errors.

  1641 	 */

  1642 	if (pswit[TYPO_SWITCH])

  1643 	{

  1644 	    istypo=0;

  1645 	    strcpy(testword,inword);

  1646 	    alower=0;

  1647 	    for (i=0;i<(int)strlen(testword);i++)

  1648 	    {

  1649 		/* lowercase for testing */

  1650 		if (testword[i]>='a' && testword[i]<='z')

  1651 		    alower=1;

  1652 		if (alower && testword[i]>='A' && testword[i]<='Z')

  1653 		{

  1654 		    /*

  1655 		     * We have an uppercase mid-word. However, there are

  1656 		     * common cases:

  1657 		     *   Mac and Mc like McGill

  1658 		     *   French contractions like l'Abbe

  1659 		     */

  1660 		    if (i==2 && testword[0]=='m' && testword[1]=='c' ||

  1661 		      i==3 && testword[0]=='m' && testword[1]=='a' &&

  1662 		      testword[2]=='c' || i>0 && testword[i-1]==CHAR_SQUOTE)

  1663 			; /* do nothing! */

  1664 		    else

  1665 			istypo=1;

  1666 		}

  1667 		testword[i]=(char)tolower(testword[i]);

  1668 	    }

  1669 	    /*

  1670 	     * Check for certain unlikely two-letter combinations at word

  1671 	     * start and end.

  1672 	     */

  1673 	    if (strlen(testword)>1)

  1674 	    {

  1675 		for (i=0;*nostart[i];i++)

  1676 		    if (!strncmp(testword,nostart[i],2))

  1677 			istypo=1;

  1678 		for (i=0;*noend[i];i++)

  1679 		    if (!strncmp(testword+strlen(testword)-2,noend[i],2))

  1680 			istypo=1;

  1681 	    }

  1682 	    /* ght is common, gbt never. Like that. */

  1683 	    if (strstr(testword,"cb"))

  1684 		istypo=1;

  1685 	    if (strstr(testword,"gbt"))

  1686 		istypo=1;

  1687 	    if (strstr(testword,"pbt"))

  1688 		istypo=1;

  1689 	    if (strstr(testword,"tbs"))

  1690 		istypo=1;

  1691 	    if (strstr(testword,"mrn"))

  1692 		istypo=1;

  1693 	    if (strstr(testword,"ahle"))

  1694 		istypo=1;

  1695 	    if (strstr(testword,"ihle"))

  1696 		istypo=1;

  1697 	    /*

  1698 	     * "TBE" does happen - like HEARTBEAT - but uncommon.

  1699 	     * Also "TBI" - frostbite, outbid - but uncommon.

  1700 	     * Similarly "ii" like Hawaii, or Pompeii, and in Roman

  1701 	     * numerals, but "ii" is a common scanno.

  1702 	     */

  1703 	    if (strstr(testword,"tbi"))

  1704 		istypo=1;

  1705 	    if (strstr(testword,"tbe"))

  1706 		istypo=1;

  1707 	    if (strstr(testword,"ii"))

  1708 		istypo=1;

  1709 	    /*

  1710 	     * Check for no vowels or no consonants.

  1711 	     * If none, flag a typo.

  1712 	     */

  1713 	    if (!istypo && strlen(testword)>1)

  1714 	    {

  1715 		vowel=consonant=0;

  1716 		for (i=0;testword[i];i++)

  1717 		{

  1718 		    if (testword[i]=='y' || gcisdigit(testword[i]))

  1719 		    {

  1720 			/* Yah, this is loose. */

  1721 			vowel++;

  1722 			consonant++;

  1723 		    }

  1724 		    else if (strchr(vowels,testword[i]))

  1725 			vowel++;

  1726 		    else

  1727 			consonant++;

  1728 		}

  1729 		if (!vowel || !consonant)

  1730 		    istypo=1;

  1731 	    }

  1732 	    /*

  1733 	     * Now exclude the word from being reported if it's in

  1734 	     * the okword list.

  1735 	     */

  1736 	    for (i=0;*okword[i];i++)

  1737 		if (!strcmp(testword,okword[i]))

  1738 		    istypo=0;

  1739 	    /*

  1740 	     * What looks like a typo may be a Roman numeral.

  1741 	     * Exclude these.

  1742 	     */

  1743 	    if (istypo && isroman(testword))

  1744 		istypo=0;

  1745 	    /* Check the manual list of typos. */

  1746 	    if (!istypo)

  1747 		for (i=0;*typo[i];i++)

  1748 		    if (!strcmp(testword,typo[i]))

  1749 			istypo=1;

  1750 	    /*

  1751 	     * Check lowercase s, l, i and m - special cases.

  1752 	     *   "j" - often a semi-colon gone wrong.

  1753 	     *   "d" for a missing apostrophe - he d

  1754 	     *   "n" for "in"

  1755 	     */

  1756 	    if (!istypo && strlen(testword)==1 && strchr("slmijdn",*inword))

  1757 		istypo=1;

  1758 	    if (istypo)

  1759 	    {

  1760 		isdup=0;

  1761 		if (strlen(testword)<MAX_QWORD_LENGTH &&

  1762 		  !pswit[VERBOSE_SWITCH])

  1763 		    for (i=0;i<qword_index;i++)

  1764 			if (!strcmp(testword,qword[i]))

  1765 			{

  1766 			    isdup=1;

  1767 			    ++dupcnt[i];

  1768 			}

  1769 		if (!isdup)

  1770 		{

  1771 		    if (qword_index<MAX_QWORD &&

  1772 		      strlen(testword)<MAX_QWORD_LENGTH)

  1773 		    {

  1774 			strcpy(qword[qword_index],testword);

  1775 			qword_index++;

  1776 		    }

  1777 		    if (pswit[ECHO_SWITCH])

  1778 			printf("\n%s\n",aline);

  1779 		    if (!pswit[OVERVIEW_SWITCH])

  1780 		    {

  1781 			printf("    Line %ld column %d - Query word %s",

  1782 			  linecnt,(int)(wordstart-aline)+1,inword);

  1783 			if (strlen(testword)<MAX_QWORD_LENGTH &&

  1784 			  !pswit[VERBOSE_SWITCH])

  1785 			    printf(" - not reporting duplicates");

  1786 			printf("\n");

  1787 		    }

  1788 		    else

  1789 			cnt_word++;

  1790 		}

  1791 	    }

  1792 	}

  1793 	/* check the user's list of typos */

  1794 	if (!istypo && usertypo_count)

  1795 	    for (i=0;i<usertypo_count;i++)

  1796 		if (!strcmp(testword,usertypo[i]))

  1797 		{

  1798 		    if (pswit[ECHO_SWITCH])

  1799 			printf("\n%s\n",aline);

  1800 		    if (!pswit[OVERVIEW_SWITCH])

  1801 			printf("    Line %ld column %d - "

  1802 			  "Query possible scanno %s\n",

  1803 			  linecnt,(int)(wordstart-aline)+2,inword);

  1804 		}

  1805 	if (pswit[PARANOID_SWITCH] && warnings->digit)

  1806 	{

  1807 	    /* In paranoid mode, query all 0 and 1 standing alone. */

  1808 	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))

  1809 	    {

  1810 		if (pswit[ECHO_SWITCH])

  1811 		    printf("\n%s\n",aline);

  1812 		if (!pswit[OVERVIEW_SWITCH])

  1813 		    printf("    Line %ld column %d - Query standalone %s\n",

  1814 		      linecnt,(int)(wordstart-aline)+2,inword);

  1815 		else

  1816 		    cnt_word++;

  1817 	    }

  1818 	}

  1819     }

  1820 }

  1822 /*

  1823  * check_for_misspaced_punctuation:

  1824  *

  1825  * Look for added or missing spaces around punctuation and quotes.

  1826  * If there is a punctuation character like ! with no space on

  1827  * either side, suspect a missing!space. If there are spaces on

  1828  * both sides , assume a typo. If we see a double quote with no

  1829  * space or punctuation on either side of it, assume unspaced

  1830  * quotes "like"this.

  1831  */

  1832 void check_for_misspaced_punctuation(const char *aline,

  1833   struct parities *parities,int isemptyline)

  1834 {

  1835     int i,llen,isacro,isellipsis;

  1836     const char *s;

  1837     llen=strlen(aline);

  1838     for (i=1;i<llen;i++)

  1839     {

  1840 	/* For each character in the line after the first. */

  1841 	if (strchr(".?!,;:_",aline[i]))  /* if it's punctuation */

  1842 	{

  1843 	    /* we need to suppress warnings for acronyms like M.D. */

  1844 	    isacro=0;

  1845 	    /* we need to suppress warnings for ellipsis . . . */

  1846 	    isellipsis=0;

  1847 	    /* if there are letters on both sides of it or ... */

  1848 	    if (gcisalpha(aline[i-1]) && gcisalpha(aline[i+1]) ||

  1849 	       gcisalpha(aline[i+1]) && strchr("?!,;:",aline[i]))

  1850 	    {

  1851 		/* ...if it's strict punctuation followed by an alpha */

  1852 		if (aline[i]=='.')

  1853 		{

  1854 		    if (i>2 && aline[i-2]=='.')

  1855 			isacro=1;

  1856 		    if (i+2<llen && aline[i+2]=='.')

  1857 			isacro=1;

  1858 		}

  1859 		if (!isacro)

  1860 		{

  1861 		    if (pswit[ECHO_SWITCH])

  1862 			printf("\n%s\n",aline);

  1863 		    if (!pswit[OVERVIEW_SWITCH])

  1864 			printf("    Line %ld column %d - Missing space?\n",

  1865 			  linecnt,i+1);

  1866 		    else

  1867 			cnt_punct++;

  1868 		}

  1869 	    }

  1870 	    if (aline[i-1]==CHAR_SPACE &&

  1871 	      (aline[i+1]==CHAR_SPACE || aline[i+1]==0))

  1872 	    {

  1873 		/*

  1874 		 * If there are spaces on both sides,

  1875 		 * or space before and end of line.

  1876 		 */

  1877 		if (aline[i]=='.')

  1878 		{

  1879 		    if (i>2 && aline[i-2]=='.')

  1880 			isellipsis=1;

  1881 		    if (i+2<llen && aline[i+2]=='.')

  1882 			isellipsis=1;

  1883 		}

  1884 		if (!isemptyline && !isellipsis)

  1885 		{

  1886 		    if (pswit[ECHO_SWITCH])

  1887 			printf("\n%s\n",aline);

  1888 		    if (!pswit[OVERVIEW_SWITCH])

  1889 			printf("    Line %ld column %d - "

  1890 			  "Spaced punctuation?\n",linecnt,i+1);

  1891 		    else

  1892 			cnt_punct++;

  1893 		}

  1894 	    }

  1895 	}

  1896     }

  1897     /* Split out the characters that CANNOT be preceded by space. */

  1898     llen=strlen(aline);

  1899     for (i=1;i<llen;i++)

  1900     {

  1901 	/* for each character in the line after the first */

  1902 	if (strchr("?!,;:",aline[i]))

  1903 	{

  1904 	    /* if it's punctuation that _cannot_ have a space before it */

  1905 	    if (aline[i-1]==CHAR_SPACE && !isemptyline &&

  1906 	      aline[i+1]!=CHAR_SPACE)

  1907 	    {

  1908 		/*

  1909 		 * If aline[i+1) DOES == space,

  1910 		 * it was already reported just above.

  1911 		 */

  1912 		if (pswit[ECHO_SWITCH])

  1913 		    printf("\n%s\n",aline);

  1914 		if (!pswit[OVERVIEW_SWITCH])

  1915 		    printf("    Line %ld column %d - Spaced punctuation?\n",

  1916 		      linecnt,i+1);

  1917 		else

  1918 		    cnt_punct++;

  1919 	    }

  1920 	}

  1921     }

  1922     /*

  1923      * Special case " .X" where X is any alpha.

  1924      * This plugs a hole in the acronym code above.

  1925      * Inelegant, but maintainable.

  1926      */

  1927     llen=strlen(aline);

  1928     for (i=1;i<llen;i++)

  1929     {

  1930 	/* for each character in the line after the first */

  1931 	if (aline[i]=='.')

  1932 	{

  1933 	    /* if it's a period */

  1934 	    if (aline[i-1]==CHAR_SPACE && gcisalpha(aline[i+1]))

  1935 	    {

  1936 		/*

  1937 		 * If the period follows a space and

  1938 		 * is followed by a letter.

  1939 		 */

  1940 		if (pswit[ECHO_SWITCH])

  1941 		    printf("\n%s\n",aline);

  1942 		if (!pswit[OVERVIEW_SWITCH])

  1943 		    printf("    Line %ld column %d - Spaced punctuation?\n",

  1944 		      linecnt,i+1);

  1945 		else

  1946 		    cnt_punct++;

  1947 	    }

  1948 	}

  1949     }

  1950     for (i=1;i<llen;i++)

  1951     {

  1952 	/* for each character in the line after the first */

  1953 	if (aline[i]==CHAR_DQUOTE)

  1954 	{

  1955 	    if (!strchr(" _-.'`,;:!/([{?}])",aline[i-1]) &&

  1956 	      !strchr(" _-.'`,;:!/([{?}])",aline[i+1]) && aline[i+1] ||

  1957 	      !strchr(" _-([{'`",aline[i-1]) && gcisalpha(aline[i+1]))

  1958 	    {

  1959 		if (pswit[ECHO_SWITCH])

  1960 		    printf("\n%s\n",aline);

  1961 		if (!pswit[OVERVIEW_SWITCH])

  1962 		    printf("    Line %ld column %d - Unspaced quotes?\n",

  1963 		      linecnt,i+1);

  1964 		else

  1965 		    cnt_punct++;

  1966 	    }

  1967 	}

  1968     }

  1969     /* Check parity of quotes. */

  1970     for (s=aline;*s;s++)

  1971     {

  1972 	if (*s==CHAR_DQUOTE)

  1973 	{

  1974 	    parities->dquote=!parities->dquote;

  1975 	    if (!parities->dquote)

  1976 	    {

  1977 		/* parity even */

  1978 		if (!strchr("_-.'`/,;:!?)]} ",s[1]))

  1979 		{

  1980 		    if (pswit[ECHO_SWITCH])

  1981 			printf("\n%s\n",aline);

  1982 		    if (!pswit[OVERVIEW_SWITCH])

  1983 			printf("    Line %ld column %d - "

  1984 			  "Wrongspaced quotes?\n",linecnt,(int)(s-aline)+1);

  1985 		    else

  1986 			cnt_punct++;

  1987 		}

  1988 	    }

  1989 	    else

  1990 	    {

  1991 		/* parity odd */

  1992 		if (!gcisalpha(s[1]) && !isdigit(s[1]) &&

  1993 		  !strchr("_-/.'`([{$",s[1]) || !s[1])

  1994 		{

  1995 		    if (pswit[ECHO_SWITCH])

  1996 			printf("\n%s\n",aline);

  1997 		    if (!pswit[OVERVIEW_SWITCH])

  1998 			printf("    Line %ld column %d - "

  1999 			  "Wrongspaced quotes?\n",linecnt,(int)(s-aline)+1);

  2000 		    else

  2001 			cnt_punct++;

  2002 		}

  2003 	    }

  2004 	}

  2005     }

  2006     if (*aline==CHAR_DQUOTE)

  2007     {

  2008 	if (strchr(",;:!?)]} ",aline[1]))

  2009 	{

  2010 	    if (pswit[ECHO_SWITCH])

  2011 		printf("\n%s\n",aline);

  2012 	    if (!pswit[OVERVIEW_SWITCH])

  2013 		printf("    Line %ld column 1 - Wrongspaced quotes?\n",

  2014 		  linecnt);

  2015 	    else

  2016 		cnt_punct++;

  2017 	}

  2018     }

  2019     if (pswit[SQUOTE_SWITCH])

  2020     {

  2021 	for (s=aline;*s;s++)

  2022 	{

  2023 	    if ((*s==CHAR_SQUOTE || *s==CHAR_OPEN_SQUOTE) &&

  2024 	      (s==aline || s>aline && !gcisalpha(s[-1]) ||

  2025 	      !gcisalpha(s[1])))

  2026 	    {

  2027 		parities->squote=!parities->squote;

  2028 		if (!parities->squote)

  2029 		{

  2030 		    /* parity even */

  2031 		    if (!strchr("_-.'`/\",;:!?)]} ",s[1]))

  2032 		    {

  2033 			if (pswit[ECHO_SWITCH])

  2034 			    printf("\n%s\n",aline);

  2035 			if (!pswit[OVERVIEW_SWITCH])

  2036 			    printf("    Line %ld column %d - "

  2037 			      "Wrongspaced singlequotes?\n",

  2038 			      linecnt,(int)(s-aline)+1);

  2039 			else

  2040 			    cnt_punct++;

  2041 		    }

  2042 		}

  2043 		else

  2044 		{

  2045 		    /* parity odd */

  2046 		    if (!gcisalpha(s[1]) && !isdigit(s[1]) &&

  2047 		      !strchr("_-/\".'`",s[1]) || !s[1])

  2048 		    {

  2049 			if (pswit[ECHO_SWITCH])

  2050 			    printf("\n%s\n",aline);

  2051 			if (!pswit[OVERVIEW_SWITCH])

  2052 			    printf("    Line %ld column %d - "

  2053 			      "Wrongspaced singlequotes?\n",

  2054 			      linecnt,(int)(s-aline)+1);

  2055 			else

  2056 			    cnt_punct++;

  2057 		    }

  2058 		}

  2059 	    }

  2060 	}

  2061     }

  2062 }

  2064 /*

  2065  * check_for_double_punctuation:

  2066  *

  2067  * Look for double punctuation like ,. or ,,

  2068  * Thanks to DW for the suggestion!

  2069  * In books with references, ".," and ".;" are common

  2070  * e.g. "etc., etc.," and vol. 1.; vol 3.;

  2071  * OTOH, from my initial tests, there are also fairly

  2072  * common errors. What to do? Make these cases paranoid?

  2073  * ".," is the most common, so warnings->dotcomma is used

  2074  * to suppress detailed reporting if it occurs often.

  2075  */

  2076 void check_for_double_punctuation(const char *aline,struct warnings *warnings)

  2077 {

  2078     int i,llen;

  2079     llen=strlen(aline);

  2080     for (i=0;i<llen;i++)

  2081     {

  2082 	/* for each punctuation character in the line */

  2083 	if (strchr(".?!,;:",aline[i]) && strchr(".?!,;:",aline[i+1]) &&

  2084 	  aline[i] && aline[i+1])

  2085 	{

  2086 	    /* followed by punctuation, it's a query, unless . . . */

  2087 	    if (aline[i]==aline[i+1] && (aline[i]=='.' || aline[i]=='?' ||

  2088 	      aline[i]=='!') ||

  2089 	      !warnings->dotcomma && aline[i]=='.' && aline[i+1]==',' ||

  2090 	      warnings->isFrench && !strncmp(aline+i,",...",4) ||

  2091 	      warnings->isFrench && !strncmp(aline+i,"...,",4) ||

  2092 	      warnings->isFrench && !strncmp(aline+i,";...",4) ||

  2093 	      warnings->isFrench && !strncmp(aline+i,"...;",4) ||

  2094 	      warnings->isFrench && !strncmp(aline+i,":...",4) ||

  2095 	      warnings->isFrench && !strncmp(aline+i,"...:",4) ||

  2096 	      warnings->isFrench && !strncmp(aline+i,"!...",4) ||

  2097 	      warnings->isFrench && !strncmp(aline+i,"...!",4) ||

  2098 	      warnings->isFrench && !strncmp(aline+i,"?...",4) ||

  2099 	      warnings->isFrench && !strncmp(aline+i,"...?",4))

  2100 	    {

  2101 		if (warnings->isFrench && !strncmp(aline+i,",...",4) ||

  2102 		  warnings->isFrench && !strncmp(aline+i,"...,",4) ||

  2103 		  warnings->isFrench && !strncmp(aline+i,";...",4) ||

  2104 		  warnings->isFrench && !strncmp(aline+i,"...;",4) ||

  2105 		  warnings->isFrench && !strncmp(aline+i,":...",4) ||

  2106 		  warnings->isFrench && !strncmp(aline+i,"...:",4) ||

  2107 		  warnings->isFrench && !strncmp(aline+i,"!...",4) ||

  2108 		  warnings->isFrench && !strncmp(aline+i,"...!",4) ||

  2109 		  warnings->isFrench && !strncmp(aline+i,"?...",4) ||

  2110 		  warnings->isFrench && !strncmp(aline+i,"...?",4))

  2111 		    i+=4;

  2112 		; /* do nothing for .. !! and ?? which can be legit */

  2113 	    }

  2114 	    else

  2115 	    {

  2116 		if (pswit[ECHO_SWITCH])

  2117 		    printf("\n%s\n",aline);

  2118 		if (!pswit[OVERVIEW_SWITCH])

  2119 		    printf("    Line %ld column %d - Double punctuation?\n",

  2120 		      linecnt,i+1);

  2121 		else

  2122 		    cnt_punct++;

  2123 	    }

  2124 	}

  2125     }

  2126 }

  2128 /*

  2129  * check_for_spaced_quotes:

  2130  */

  2131 void check_for_spaced_quotes(const char *aline)

  2132 {

  2133     const char *s,*t;

  2134     s=aline;

  2135     while ((t=strstr(s," \" ")))

  2136     {

  2137 	if (pswit[ECHO_SWITCH])

  2138 	    printf("\n%s\n",aline);

  2139 	if (!pswit[OVERVIEW_SWITCH])

  2140 	    printf("    Line %ld column %d - Spaced doublequote?\n",

  2141 	      linecnt,(int)(t-aline+1));

  2142 	else

  2143 	    cnt_punct++;

  2144 	s=t+2;

  2145     }

  2146     s=aline;

  2147     while ((t=strstr(s," ' ")))

  2148     {

  2149 	if (pswit[ECHO_SWITCH])

  2150 	    printf("\n%s\n",aline);

  2151 	if (!pswit[OVERVIEW_SWITCH])

  2152 	    printf("    Line %ld column %d - Spaced singlequote?\n",

  2153 	      linecnt,(int)(t-aline+1));

  2154 	else

  2155 	    cnt_punct++;

  2156 	s=t+2;

  2157     }

  2158     s=aline;

  2159     while ((t=strstr(s," ` ")))

  2160     {

  2161 	if (pswit[ECHO_SWITCH])

  2162 	    printf("\n%s\n",aline);

  2163 	if (!pswit[OVERVIEW_SWITCH])

  2164 	    printf("    Line %ld column %d - Spaced singlequote?\n",

  2165 	      linecnt,(int)(t-aline+1));

  2166 	else

  2167 	    cnt_punct++;

  2168 	s=t+2;

  2169     }

  2170 }

  2172 /*

  2173  * check_for_miscased_genative:

  2174  *

  2175  * Check special case of 'S instead of 's at end of word.

  2176  */

  2177 void check_for_miscased_genative(const char *aline)

  2178 {

  2179     const char *s;

  2180     s=aline+1;

  2181     while (*s)

  2182     {

  2183 	if (*s==CHAR_SQUOTE && s[1]=='S' && s[-1]>='a' && s[-1]<='z')

  2184 	{

  2185 	    if (pswit[ECHO_SWITCH])

  2186 		printf("\n%s\n",aline);

  2187 	    if (!pswit[OVERVIEW_SWITCH])

  2188 		printf("    Line %ld column %d - Capital \"S\"?\n",

  2189 		  linecnt,(int)(s-aline+2));

  2190 	    else

  2191 		cnt_punct++;

  2192 	}

  2193 	s++;

  2194     }

  2195 }

  2197 /*

  2198  * check_end_of_line:

  2199  *

  2200  * Now check special cases - start and end of line -

  2201  * for single and double quotes. Start is sometimes [sic]

  2202  * but better to query it anyway.

  2203  * While we're here, check for dash at end of line.

  2204  */

  2205 void check_end_of_line(const char *aline,struct warnings *warnings)

  2206 {

  2207     int i,llen;

  2208     llen=strlen(aline);

  2209     if (llen>1)

  2210     {

  2211 	if (aline[llen-1]==CHAR_DQUOTE || aline[llen-1]==CHAR_SQUOTE ||

  2212 	  aline[llen-1]==CHAR_OPEN_SQUOTE)

  2213 	    if (aline[llen-2]==CHAR_SPACE)

  2214 	    {

  2215 		if (pswit[ECHO_SWITCH])

  2216 		    printf("\n%s\n",aline);

  2217 		if (!pswit[OVERVIEW_SWITCH])

  2218 		    printf("    Line %ld column %d - Spaced quote?\n",

  2219 		      linecnt,llen);

  2220 		else

  2221 		    cnt_punct++;

  2222 	    }

  2223 	if ((aline[0]==CHAR_SQUOTE || aline[0]==CHAR_OPEN_SQUOTE) &&

  2224 	  aline[1]==CHAR_SPACE)

  2225 	{

  2226 	    if (pswit[ECHO_SWITCH])

  2227 		printf("\n%s\n",aline);

  2228 	    if (!pswit[OVERVIEW_SWITCH])

  2229 		printf("    Line %ld column 1 - Spaced quote?\n",linecnt);

  2230 	    else

  2231 		cnt_punct++;

  2232 	}

  2233 	/*

  2234 	 * Dash at end of line may well be legit - paranoid mode only

  2235 	 * and don't report em-dash at line-end.

  2236 	 */

  2237 	if (pswit[PARANOID_SWITCH] && warnings->hyphen)

  2238 	{

  2239 	    for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--)

  2240 		;

  2241 	    if (aline[i]=='-' && aline[i-1]!='-')

  2242 	    {

  2243 		if (pswit[ECHO_SWITCH])

  2244 		    printf("\n%s\n",aline);

  2245 		if (!pswit[OVERVIEW_SWITCH])

  2246 		    printf("    Line %ld column %d - Hyphen at end of line?\n",

  2247 		      linecnt,i);

  2248 	    }

  2249 	}

  2250     }

  2251 }

  2253 /*

  2254  * check_for_unspaced_bracket:

  2255  *

  2256  * Brackets are often unspaced, but shouldn't be surrounded by alpha.

  2257  * If so, suspect a scanno like "a]most".

  2258  */

  2259 void check_for_unspaced_bracket(const char *aline)

  2260 {

  2261     int i,llen;

  2262     llen=strlen(aline);

  2263     for (i=1;i<llen-1;i++)

  2264     {

  2265 	/* for each bracket character in the line except 1st & last */

  2266 	if (strchr("{[()]}",aline[i]) && gcisalpha(aline[i-1]) &&

  2267 	  gcisalpha(aline[i+1]))

  2268 	{

  2269 	    if (pswit[ECHO_SWITCH])

  2270 		printf("\n%s\n",aline);

  2271 	    if (!pswit[OVERVIEW_SWITCH])

  2272 		printf("    Line %ld column %d - Unspaced bracket?\n",

  2273 		  linecnt,i);

  2274 	    else

  2275 		cnt_punct++;

  2276 	}

  2277     }

  2278 }

  2280 /*

  2281  * check_for_unpunctuated_endquote:

  2282  */

  2283 void check_for_unpunctuated_endquote(const char *aline)

  2284 {

  2285     int i,llen;

  2286     llen=strlen(aline);

  2287     for (i=1;i<llen;i++)

  2288     {

  2289 	/* for each character in the line except 1st */

  2290 	if (aline[i]==CHAR_DQUOTE && isalpha(aline[i-1]))

  2291 	{

  2292 	    if (pswit[ECHO_SWITCH])

  2293 		printf("\n%s\n",aline);

  2294 	    if (!pswit[OVERVIEW_SWITCH])

  2295 		printf("    Line %ld column %d - "

  2296 		  "endquote missing punctuation?\n",linecnt,i);

  2297 	    else

  2298 		cnt_punct++;

  2299 	}

  2300     }

  2301 }

  2303 /*

  2304  * check_for_html_tag:

  2305  *

  2306  * Check for <HTML TAG>.

  2307  *

  2308  * If there is a < in the line, followed at some point

  2309  * by a > then we suspect HTML.

  2310  */

  2311 void check_for_html_tag(const char *aline)

  2312 {

  2313     int i;

  2314     const char *open,*close;

  2315     open=strstr(aline,"<");

  2316     if (open)

  2317     {

  2318 	close=strstr(aline,">");

  2319 	if (close)

  2320 	{

  2321 	    i=(int)(close-open+1);

  2322 	    if (i>0)

  2323 	    {

  2324 		strncpy(wrk,open,i);

  2325 		wrk[i]=0;

  2326 		if (pswit[ECHO_SWITCH])

  2327 		    printf("\n%s\n",aline);

  2328 		if (!pswit[OVERVIEW_SWITCH])

  2329 		    printf("    Line %ld column %d - HTML Tag? %s \n",

  2330 		      linecnt,(int)(open-aline)+1,wrk);

  2331 		else

  2332 		    cnt_html++;

  2333 	    }

  2334 	}

  2335     }

  2336 }

  2338 /*

  2339  * check_for_html_entity:

  2340  *

  2341  * Check for &symbol; HTML.

  2342  *

  2343  * If there is a & in the line, followed at

  2344  * some point by a ; then we suspect HTML.

  2345  */

  2346 void check_for_html_entity(const char *aline)

  2347 {

  2348     int i;

  2349     const char *s,*amp,*scolon;

  2350     amp=strstr(aline,"&");

  2351     if (amp)

  2352     {

  2353 	scolon=strstr(aline,";");

  2354 	if (scolon)

  2355 	{

  2356 	    i=(int)(scolon-amp+1);

  2357 	    for (s=amp;s<scolon;s++)

  2358 		if (*s==CHAR_SPACE)

  2359 		    i=0;		/* Don't report "Jones & Son;" */

  2360 	    if (i>0)

  2361 	    {

  2362 		strncpy(wrk,amp,i);

  2363 		wrk[i]=0;

  2364 		if (pswit[ECHO_SWITCH])

  2365 		    printf("\n%s\n",aline);

  2366 		if (!pswit[OVERVIEW_SWITCH])

  2367 		    printf("    Line %ld column %d - HTML symbol? %s \n",

  2368 		      linecnt,(int)(amp-aline)+1,wrk);

  2369 		else

  2370 		    cnt_html++;

  2371 	    }

  2372 	}

  2373     }

  2374 }

  2376 /*

  2377  * print_pending:

  2378  *

  2379  * If we are in a state of unbalanced quotes, and this line

  2380  * doesn't begin with a quote, output the stored error message.

  2381  * If the -P switch was used, print the warning even if the

  2382  * new para starts with quotes.

  2383  */

  2384 void print_pending(const char *aline,const char *parastart,

  2385   struct pending *pending)

  2386 {

  2387     const char *s;

  2388     s=aline;

  2389     while (*s==' ')

  2390 	s++;

  2391     if (*pending->dquote)

  2392 	if (*s!=CHAR_DQUOTE || pswit[QPARA_SWITCH])

  2393 	{

  2394 	    if (!pswit[OVERVIEW_SWITCH])

  2395 	    {

  2396 		if (pswit[ECHO_SWITCH])

  2397 		    printf("\n%s\n",parastart);

  2398 		puts(pending->dquote);

  2399 	    }

  2400 	    else

  2401 		cnt_dquot++;

  2402 	}

  2403     if (*pending->squote)

  2404     {

  2405 	if (*s!=CHAR_SQUOTE && *s!=CHAR_OPEN_SQUOTE || pswit[QPARA_SWITCH] ||

  2406 	  pending->squot)

  2407 	{

  2408 	    if (!pswit[OVERVIEW_SWITCH])

  2409 	    {

  2410 		if (pswit[ECHO_SWITCH])

  2411 		    printf("\n%s\n",parastart);

  2412 		puts(pending->squote);

  2413 	    }

  2414 	    else

  2415 		cnt_squot++;

  2416 	}

  2417     }

  2418     if (*pending->rbrack)

  2419     {

  2420 	if (!pswit[OVERVIEW_SWITCH])

  2421 	{

  2422 	    if (pswit[ECHO_SWITCH])

  2423 		printf("\n%s\n",parastart);

  2424 	    puts(pending->rbrack);

  2425 	}

  2426 	else

  2427 	    cnt_brack++;

  2428     }

  2429     if (*pending->sbrack)

  2430     {

  2431 	if (!pswit[OVERVIEW_SWITCH])

  2432 	{

  2433 	    if (pswit[ECHO_SWITCH])

  2434 		printf("\n%s\n",parastart);

  2435 	    puts(pending->sbrack);

  2436 	}

  2437 	else

  2438 	    cnt_brack++;

  2439     }

  2440     if (*pending->cbrack)

  2441     {

  2442 	if (!pswit[OVERVIEW_SWITCH])

  2443 	{

  2444 	    if (pswit[ECHO_SWITCH])

  2445 		printf("\n%s\n",parastart);

  2446 	    puts(pending->cbrack);

  2447 	}

  2448 	else

  2449 	    cnt_brack++;

  2450     }

  2451     if (*pending->unders)

  2452     {

  2453 	if (!pswit[OVERVIEW_SWITCH])

  2454 	{

  2455 	    if (pswit[ECHO_SWITCH])

  2456 		printf("\n%s\n",parastart);

  2457 	    puts(pending->unders);

  2458 	}

  2459 	else

  2460 	    cnt_brack++;

  2461     }

  2462 }

  2464 /*

  2465  * check_for_mismatched_quotes:

  2466  *

  2467  * At end of paragraph, check for mismatched quotes.

  2468  *

  2469  * We don't want to report an error immediately, since it is a

  2470  * common convention to omit the quotes at end of paragraph if

  2471  * the next paragraph is a continuation of the same speaker.

  2472  * Where this is the case, the next para should begin with a

  2473  * quote, so we store the warning message and only display it

  2474  * at the top of the next iteration if the new para doesn't

  2475  * start with a quote.

  2476  * The -p switch overrides this default, and warns of unclosed

  2477  * quotes on _every_ paragraph, whether the next begins with a

  2478  * quote or not.

  2479  */

  2480 void check_for_mismatched_quotes(const struct counters *counters,

  2481   struct pending *pending)

  2482 {

  2483     if (counters->quot%2)

  2484 	sprintf(pending->dquote,"    Line %ld - Mismatched quotes",

  2485 	  linecnt);

  2486     if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&

  2487       counters->open_single_quote!=counters->close_single_quote)

  2488 	sprintf(pending->squote,"    Line %ld - Mismatched singlequotes?",

  2489 	  linecnt);

  2490     if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&

  2491       counters->open_single_quote!=counters->close_single_quote &&

  2492       counters->open_single_quote!=counters->close_single_quote+1)

  2493 	/*

  2494 	 * Flag it to be noted regardless of the

  2495 	 * first char of the next para.

  2496 	 */

  2497 	pending->squot=1;

  2498     if (counters->r_brack)

  2499 	sprintf(pending->rbrack,"    Line %ld - Mismatched round brackets?",

  2500 	  linecnt);

  2501     if (counters->s_brack)

  2502 	sprintf(pending->sbrack,"    Line %ld - Mismatched square brackets?",

  2503 	  linecnt);

  2504     if (counters->c_brack)

  2505 	sprintf(pending->cbrack,"    Line %ld - Mismatched curly brackets?",

  2506 	  linecnt);

  2507     if (counters->c_unders%2)

  2508 	sprintf(pending->unders,"    Line %ld - Mismatched underscores?",

  2509 	  linecnt);

  2510 }

  2512 /*

  2513  * check_for_omitted_punctuation:

  2514  *

  2515  * Check for omitted punctuation at end of paragraph by working back

  2516  * through prevline. DW.

  2517  * Need to check this only for "normal" paras.

  2518  * So what is a "normal" para?

  2519  *    Not normal if one-liner (chapter headings, etc.)

  2520  *    Not normal if doesn't contain at least one locase letter

  2521  *    Not normal if starts with space

  2522  */

  2523 void check_for_omitted_punctuation(const char *prevline,

  2524   struct line_properties *last,int start_para_line)

  2525 {

  2526     int i;

  2527     const char *s;

  2528     for (s=prevline,i=0;*s && !i;s++)

  2529 	if (gcisletter(*s))

  2530 	    /* use i to indicate the presence of a letter on the line */

  2531 	    i=1;

  2532     /*

  2533      * This next "if" is a problem.

  2534      * If we say "start_para_line <= linecnt - 1", that includes

  2535      * one-line "paragraphs" like chapter heads. Lotsa false positives.

  2536      * If we say "start_para_line < linecnt - 1" it doesn't, but then it

  2537      * misses genuine one-line paragraphs.

  2538      */

  2539     if (i && last->blen>2 && start_para_line<linecnt-1 && *prevline>CHAR_SPACE)

  2540     {

  2541 	for (i=strlen(prevline)-1;

  2542 	  (prevline[i]==CHAR_DQUOTE || prevline[i]==CHAR_SQUOTE) &&

  2543 	  prevline[i]>CHAR_SPACE && i>0;

  2544 	  i--)

  2545 	    ;

  2546 	for (;i>0;i--)

  2547 	{

  2548 	    if (gcisalpha(prevline[i]))

  2549 	    {

  2550 		if (pswit[ECHO_SWITCH])

  2551 		    printf("\n%s\n",prevline);

  2552 		if (!pswit[OVERVIEW_SWITCH])

  2553 		    printf("    Line %ld column %d - "

  2554 		      "No punctuation at para end?\n",

  2555 		      linecnt-1,(int)strlen(prevline));

  2556 		else

  2557 		    cnt_punct++;

  2558 		break;

  2559 	    }

  2560 	    if (strchr("-.:!([{?}])",prevline[i]))

  2561 		break;

  2562 	}

  2563     }

  2564 }

  2566 /*

  2567  * procfile:

  2568  *

  2569  * Process one file.

  2570  */

  2571 void procfile(char *filename)

  2572 {

  2573     const char *s;

  2574     char parastart[81];     /* first line of current para */

  2575     FILE *infile;

  2576     struct first_pass_results *first_pass_results;

  2577     struct warnings *warnings;

  2578     struct counters counters={0};

  2579     struct line_properties last={0};

  2580     struct parities parities={0};

  2581     struct pending pending={{0},};

  2582     int isemptyline;

  2583     long start_para_line=0;

  2584     int i,isnewpara=0,enddash=0;

  2585     last.start=CHAR_SPACE;

  2586     *prevline=0;

  2587     linecnt=checked_linecnt=0;

  2588     infile=fopen(filename,"rb");

  2589     if (!infile)

  2590     {

  2591 	if (pswit[STDOUT_SWITCH])

  2592 	    fprintf(stdout,"bookloupe: cannot open %s\n",filename);

  2593 	else

  2594 	    fprintf(stderr,"bookloupe: cannot open %s\n",filename);

  2595 	exit(1);

  2596     }

  2597     fprintf(stdout,"\n\nFile: %s\n\n",filename);

  2598     first_pass_results=first_pass(infile);

  2599     warnings=report_first_pass(first_pass_results);

  2600     /*

  2601      * Here we go with the main pass. Hold onto yer hat!

  2602      */

  2603     rewind(infile);

  2604     linecnt=0;

  2605     while (flgets(aline,LINEBUFSIZE-1,infile,linecnt+1))

  2606     {

  2607 	linecnt++;

  2608 	if (linecnt==1)

  2609 	    isnewpara=1;

  2610 	if (pswit[DP_SWITCH] && !strncmp(aline,"-----File: ",11))

  2611 	    continue;    // skip DP page separators completely

  2612 	if (linecnt<first_pass_results->firstline ||

  2613 	  (first_pass_results->footerline>0 &&

  2614 	  linecnt>first_pass_results->footerline))

  2615 	{

  2616 	    if (pswit[HEADER_SWITCH])

  2617 	    {

  2618 		if (!strncmp(aline,"Title:",6))

  2619 		    printf("    %s\n",aline);

  2620 		if (!strncmp(aline,"Author:",7))

  2621 		    printf("    %s\n",aline);

  2622 		if (!strncmp(aline,"Release Date:",13))

  2623 		    printf("    %s\n",aline);

  2624 		if (!strncmp(aline,"Edition:",8))

  2625 		    printf("    %s\n\n",aline);

  2626 	    }

  2627 	    continue;		/* skip through the header */

  2628 	}

  2629 	checked_linecnt++;

  2630 	print_pending(aline,parastart,&pending);

  2631 	memset(&pending,0,sizeof(pending));

  2632 	isemptyline=analyse_quotes(aline,&counters);

  2633 	if (isnewpara && !isemptyline)

  2634 	{

  2635 	    /* This line is the start of a new paragraph. */

  2636 	    start_para_line=linecnt;

  2637 	    /* Capture its first line in case we want to report it later. */

  2638 	    strncpy(parastart,aline,80);

  2639 	    parastart[79]=0;

  2640 	    memset(&parities,0,sizeof(parities));  /* restart the quote count */

  2641 	    s=aline;

  2642 	    while (!gcisalpha(*s) && !gcisdigit(*s) && *s)

  2643 		s++;

  2644 	    if (*s>='a' && *s<='z')

  2645 	    {

  2646 		/* and its first letter is lowercase */

  2647 		if (pswit[ECHO_SWITCH])

  2648 		    printf("\n%s\n",aline);

  2649 		if (!pswit[OVERVIEW_SWITCH])

  2650 		    printf("    Line %ld column %d - "

  2651 		      "Paragraph starts with lower-case\n",

  2652 		      linecnt,(int)(s-aline)+1);

  2653 		else

  2654 		    cnt_punct++;

  2655 	    }

  2656 	    isnewpara=0; /* Signal the end of new para processing. */

  2657 	}

  2658 	/* Check for an em-dash broken at line end. */

  2659 	if (enddash && *aline=='-')

  2660 	{

  2661 	    if (pswit[ECHO_SWITCH])

  2662 		printf("\n%s\n",aline);

  2663 	    if (!pswit[OVERVIEW_SWITCH])

  2664 		printf("    Line %ld column 1 - Broken em-dash?\n",linecnt);

  2665 	    else

  2666 		cnt_punct++;

  2667 	}

  2668 	enddash=0;

  2669 	for (s=aline+strlen(aline)-1;*s==' ' && s>aline;s--)

  2670 	    ;

  2671 	if (s>=aline && *s=='-')

  2672 	    enddash=1;

  2673 	check_for_control_characters(aline);

  2674 	if (warnings->bin)

  2675 	    check_for_odd_characters(aline,warnings,isemptyline);

  2676 	if (warnings->longline)

  2677 	    check_for_long_line(aline);

  2678 	if (warnings->shortline)

  2679 	    check_for_short_line(aline,&last);

  2680 	last.blen=last.len;

  2681 	last.len=strlen(aline);

  2682 	last.start=aline[0];

  2683 	check_for_starting_punctuation(aline);

  2684 	if (warnings->dash)

  2685 	{

  2686 	    check_for_spaced_emdash(aline);

  2687 	    check_for_spaced_dash(aline);

  2688 	}

  2689 	check_for_unmarked_paragraphs(aline);

  2690 	check_for_jeebies(aline);

  2691 	check_for_mta_from(aline);

  2692 	check_for_orphan_character(aline);

  2693 	check_for_pling_scanno(aline);

  2694 	check_for_extra_period(aline,warnings);

  2695 	check_for_following_punctuation(aline);

  2696 	check_for_typos(aline,warnings);

  2697 	check_for_misspaced_punctuation(aline,&parities,isemptyline);

  2698 	check_for_double_punctuation(aline,warnings);

  2699 	check_for_spaced_quotes(aline);

  2700 	check_for_miscased_genative(aline);

  2701 	check_end_of_line(aline,warnings);

  2702 	check_for_unspaced_bracket(aline);

  2703 	if (warnings->endquote)

  2704 	    check_for_unpunctuated_endquote(aline);

  2705 	check_for_html_tag(aline);

  2706 	check_for_html_entity(aline);

  2707 	if (isemptyline)

  2708 	{

  2709 	    check_for_mismatched_quotes(&counters,&pending);

  2710 	    memset(&counters,0,sizeof(counters));

  2711 	    /* let the next iteration know that it's starting a new para */

  2712 	    isnewpara=1;

  2713 	    check_for_omitted_punctuation(prevline,&last,start_para_line);

  2714 	}

  2715 	strcpy(prevline,aline);

  2716     }

  2717     fclose(infile);

  2718     if (!pswit[OVERVIEW_SWITCH])

  2719 	for (i=0;i<MAX_QWORD;i++)

  2720 	    if (dupcnt[i])

  2721 		printf("\nNote: Queried word %s was duplicated %d time%s\n",

  2722 		  qword[i],dupcnt[i],"s");

  2723 }

  2725 /*

  2726  * flgets:

  2727  *

  2728  * Get one line from the input stream, checking for

  2729  * the existence of exactly one CR/LF line-end per line.

  2730  *

  2731  * Returns: a pointer to the line.

  2732  */

  2733 char *flgets(char *theline,int maxlen,FILE *thefile,long lcnt)

  2734 {

  2735     char c;

  2736     int len,isCR,cint;

  2737     *theline=0;

  2738     len=isCR=0;

  2739     c=cint=fgetc(thefile);

  2740     do

  2741     {

  2742 	if (cint==EOF)

  2743 	    return NULL;

  2744 	/* either way, it's end of line */

  2745 	if (c==10)

  2746 	{

  2747 	    if (isCR)

  2748 		break;

  2749 	    else

  2750 	    {

  2751 		/* Error - a LF without a preceding CR */

  2752 		if (pswit[LINE_END_SWITCH])

  2753 		{

  2754 		    if (pswit[ECHO_SWITCH])

  2755 			printf("\n%s\n",theline);

  2756 		    if (!pswit[OVERVIEW_SWITCH])

  2757 			printf("    Line %ld - No CR?\n",lcnt);

  2758 		    else

  2759 			cnt_lineend++;

  2760 		}

  2761 		break;

  2762 	    }

  2763 	}

  2764 	if (c==13)

  2765 	{

  2766 	    if (isCR)

  2767 	    {

  2768 		/* Error - two successive CRs */

  2769 		if (pswit[LINE_END_SWITCH])

  2770 		{

  2771 		    if (pswit[ECHO_SWITCH])

  2772 			printf("\n%s\n",theline);

  2773 		    if (!pswit[OVERVIEW_SWITCH])

  2774 			printf("    Line %ld - Two successive CRs?\n",lcnt);

  2775 		    else

  2776 			cnt_lineend++;

  2777 		}

  2778 	    }

  2779 	    isCR=1;

  2780 	}

  2781 	else

  2782 	{

  2783 	    if (pswit[LINE_END_SWITCH] && isCR)

  2784 	    {

  2785 		if (pswit[ECHO_SWITCH])

  2786 		    printf("\n%s\n",theline);

  2787 		if (!pswit[OVERVIEW_SWITCH])

  2788 		    printf("    Line %ld column %d - CR without LF?\n",

  2789 		      lcnt,len+1);

  2790 		else

  2791 		    cnt_lineend++;

  2792 	    }

  2793 	    theline[len]=c;

  2794 	    len++;

  2795 	    theline[len]=0;

  2796 	    isCR=0;

  2797 	}

  2798 	c=cint=fgetc(thefile);

  2799     } while(len<maxlen);

  2800     if (pswit[MARKUP_SWITCH])

  2801 	postprocess_for_HTML(theline);

  2802     if (pswit[DP_SWITCH])

  2803 	postprocess_for_DP(theline);

  2804     return theline;

  2805 }

  2807 /*

  2808  * mixdigit:

  2809  *

  2810  * Takes a "word" as a parameter, and checks whether it

  2811  * contains a mixture of alpha and digits. Generally, this is an

  2812  * error, but may not be for cases like 4th or L5 12s. 3d.

  2813  *

  2814  * Returns: 0 if no error found, 1 if error.

  2815  */

  2816 int mixdigit(char *checkword)

  2817 {

  2818     int wehaveadigit,wehavealetter,firstdigits,query,wl;

  2819     char *s;

  2820     wehaveadigit=wehavealetter=query=0;

  2821     for (s=checkword;*s;s++)

  2822 	if (gcisalpha(*s))

  2823 	    wehavealetter=1;

  2824 	else

  2825 	    if (gcisdigit(*s))

  2826 		wehaveadigit=1;

  2827     if (wehaveadigit && wehavealetter)

  2828     {

  2829 	/* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */

  2830 	query=1;

  2831 	wl=strlen(checkword);

  2832 	for (firstdigits=0;gcisdigit(checkword[firstdigits]);firstdigits++)

  2833 	    ;

  2834 	/* digits, ending in st, rd, nd, th of either case */

  2835 	if (firstdigits+2==wl && (matchword(checkword+wl-2,"st") ||

  2836 	  matchword(checkword+wl-2,"rd") || matchword(checkword+wl-2,"nd") ||

  2837 	  matchword(checkword+wl-2,"th")))

  2838 	    query=0;

  2839 	if (firstdigits+3==wl && (matchword(checkword+wl-3,"sts") ||

  2840 	  matchword(checkword+wl-3,"rds") || matchword(checkword+wl-3,"nds") ||

  2841 	  matchword(checkword+wl-3,"ths")))

  2842 	    query=0;

  2843 	if (firstdigits+3==wl && (matchword(checkword+wl-4,"stly") ||

  2844 	  matchword(checkword+wl-4,"rdly") ||

  2845 	  matchword(checkword+wl-4,"ndly") || matchword(checkword+wl-4,"thly")))

  2846 	    query=0;

  2847 	/* digits, ending in l, L, s or d */

  2848 	if (firstdigits+1==wl && (checkword[wl-1]=='l' ||

  2849 	  checkword[wl-1]=='L' || checkword[wl-1]=='s' || checkword[wl-1]=='d'))

  2850 	    query=0;

  2851 	/*

  2852 	 * L at the start of a number, representing Britsh pounds, like L500.

  2853 	 * This is cute. We know the current word is mixeddigit. If the first

  2854 	 * letter is L, there must be at least one digit following. If both

  2855 	 * digits and letters follow, we have a genuine error, else we have a

  2856 	 * capital L followed by digits, and we accept that as a non-error.

  2857 	 */

  2858 	if (checkword[0]=='L' && !mixdigit(checkword+1))

  2859 	    query=0;

  2860     }

  2861     return query;

  2862 }

  2864 /*

  2865  * getaword:

  2866  *

  2867  * Extracts the first/next "word" from the line, and puts

  2868  * it into "thisword". A word is defined as one English word unit--or

  2869  * at least that's the aim.

  2870  *

  2871  * Returns: a pointer to the position in the line where we will start

  2872  *	  looking for the next word.

  2873  */

  2874 const char *getaword(const char *fromline,char *thisword)

  2875 {

  2876     int i,wordlen;

  2877     const char *s;

  2878     wordlen=0;

  2879     for (;!gcisdigit(*fromline) && !gcisalpha(*fromline) && *fromline;

  2880       fromline++)

  2881 	;

  2882     /*

  2883      * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.

  2884      * Especially yucky is the case of L1,000

  2885      * This section looks for a pattern of characters including a digit

  2886      * followed by a comma or period followed by one or more digits.

  2887      * If found, it returns this whole pattern as a word; otherwise we discard

  2888      * the results and resume our normal programming.

  2889      */

  2890     s=fromline;

  2891     for (;(gcisdigit(*s) || gcisalpha(*s) || *s==',' || *s=='.') &&

  2892       wordlen<MAXWORDLEN;s++)

  2893     {

  2894 	thisword[wordlen]=*s;

  2895 	wordlen++;

  2896     }

  2897     thisword[wordlen]=0;

  2898     for (i=1;i<wordlen-1;i++)

  2899     {

  2900 	if (thisword[i]=='.' || thisword[i]==',')

  2901 	{

  2902 	    if (gcisdigit(thisword[i-1]) && gcisdigit(thisword[i-1]))

  2903 	    {

  2904 		fromline=s;

  2905 		return fromline;

  2906 	    }

  2907 	}

  2908     }

  2909     /* we didn't find a punctuated number - do the regular getword thing */

  2910     wordlen=0;

  2911     for (;(gcisdigit(*fromline) || gcisalpha(*fromline) || *fromline=='\'') &&

  2912       wordlen<MAXWORDLEN;fromline++)

  2913     {

  2914 	thisword[wordlen]=*fromline;

  2915 	wordlen++;

  2916     }

  2917     thisword[wordlen]=0;

  2918     return fromline;

  2919 }

  2921 /*

  2922  * matchword:

  2923  *

  2924  * A case-insensitive string matcher.

  2925  */

  2926 int matchword(char *checkfor,char *thisword)

  2927 {

  2928     unsigned int ismatch,i;

  2929     if (strlen(checkfor)!=strlen(thisword))

  2930 	return 0;

  2931     ismatch=1;     /* assume a match until we find a difference */

  2932     for (i=0;i<strlen(checkfor);i++)

  2933 	if (toupper(checkfor[i])!=toupper(thisword[i]))

  2934 	    ismatch=0;

  2935     return ismatch;

  2936 }

  2938 /*

  2939  * lowerit:

  2940  *

  2941  * Lowercase the line.

  2942  */

  2943 void lowerit(char *theline)

  2944 {

  2945     for (;*theline;theline++)

  2946 	if (*theline>='A' && *theline<='Z')

  2947 	    *theline+=32;

  2948 }

  2950 /*

  2951  * isroman:

  2952  *

  2953  * Is this word a Roman Numeral?

  2954  *

  2955  * It doesn't actually validate that the number is a valid Roman Numeral--for

  2956  * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not

  2957  * what we're here to do. If it passes this, it LOOKS like a Roman numeral.

  2958  * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or

  2959  * expressions thereof, except when it came to taxes. Allow any number of M,

  2960  * an optional D, an optional CM or CD, any number of optional Cs, an optional

  2961  * XL or an optional XC, an optional IX or IV, an optional V and any number

  2962  * of optional Is.

  2963  */

  2964 int isroman(char *t)

  2965 {

  2966     char *s;

  2967     if (!t || !*t)

  2968 	return 0;

  2969     s=t;

  2970     while (*t=='m' && *t)

  2971 	t++;

  2972     if (*t=='d')

  2973 	t++;

  2974     if (*t=='c' && t[1]=='m')

  2975 	t+=2;

  2976     if (*t=='c' && t[1]=='d')

  2977 	t+=2;

  2978     while (*t=='c' && *t)

  2979 	t++;

  2980     if (*t=='x' && t[1]=='l')

  2981 	t+=2;

  2982     if (*t=='x' && t[1]=='c')

  2983 	t+=2;

  2984     if (*t=='l')

  2985 	t++;

  2986     while (*t=='x' && *t)

  2987 	t++;

  2988     if (*t=='i' && t[1]=='x')

  2989 	t+=2;

  2990     if (*t=='i' && t[1]=='v')

  2991 	t+=2;

  2992     if (*t=='v')

  2993 	t++;

  2994     while (*t=='i' && *t)

  2995 	t++;

  2996     return !*t;

  2997 }

  2999 /*

  3000  * gcisalpha:

  3001  *

  3002  * A version of isalpha() that is somewhat lenient on 8-bit texts.

  3003  * If we use the standard function, 8-bit accented characters break

  3004  * words, so that tete with accented characters appears to be two words, "t"

  3005  * and "t", with 8-bit characters between them. This causes over-reporting of

  3006  * errors. gcisalpha() recognizes accented letters from the CP1252 (Windows)

  3007  * and ISO-8859-1 character sets, which are the most common PG 8-bit types.

  3008  */

  3009 int gcisalpha(unsigned char c)

  3010 {

  3011     if (c>='a' && c<='z')

  3012 	return 1;

  3013     if (c>='A' && c<='Z')

  3014 	return 1;

  3015     if (c<140)

  3016 	return 0;

  3017     if (c>=192 && c!=208 && c!=215 && c!=222 && c!=240 && c!=247 && c!=254)

  3018 	return 1;

  3019     if (c==140 || c==142 || c==156 || c==158 || c==159)

  3020 	return 1;

  3021     return 0;

  3022 }

  3024 /*

  3025  * gcisdigit:

  3026  *

  3027  * A version of isdigit() that doesn't get confused in 8-bit texts.

  3028  */

  3029 int gcisdigit(unsigned char c)

  3030 {

  3031     return c>='0' && c<='9';

  3032 }

  3034 /*

  3035  * gcisletter:

  3036  *

  3037  * A version of isletter() that doesn't get confused in 8-bit texts.

  3038  * NB: this is ISO-8891-1-specific.

  3039  */

  3040 int gcisletter(unsigned char c)

  3041 {

  3042     return c>='A' && c<='Z' || c>='a' && c<='z' || c>=192;

  3043 }

  3045 /*

  3046  * gcstrchr:

  3047  *

  3048  * Wraps strchr to return NULL if the character being searched for is zero.

  3049  */

  3050 char *gcstrchr(char *s,char c)

  3051 {

  3052     if (!c)

  3053 	return NULL;

  3054     return strchr(s,c);

  3055 }

  3057 /*

  3058  * postprocess_for_DP:

  3059  *

  3060  * Invoked with the -d switch from flgets().

  3061  * It simply "removes" from the line a hard-coded set of common

  3062  * DP-specific tags, so that the line passed to the main routine has

  3063  * been pre-cleaned of DP markup.

  3064  */

  3065 void postprocess_for_DP(char *theline)

  3066 {

  3067     char *s,*t;

  3068     int i;

  3069     if (!*theline)

  3070 	return;

  3071     for (i=0;*DPmarkup[i];i++)

  3072     {

  3073 	s=strstr(theline,DPmarkup[i]);

  3074 	while (s)

  3075 	{

  3076 	    t=s+strlen(DPmarkup[i]);

  3077 	    while (*t)

  3078 	    {

  3079 		*s=*t;

  3080 		t++;

  3081 		s++;

  3082 	    }

  3083 	    *s=0;

  3084 	    s=strstr(theline,DPmarkup[i]);

  3085 	}

  3086     }

  3087 }

  3089 /*

  3090  * postprocess_for_HTML:

  3091  *

  3092  * Invoked with the -m switch from flgets().

  3093  * It simply "removes" from the line a hard-coded set of common

  3094  * HTML tags and "replaces" a hard-coded set of common HTML

  3095  * entities, so that the line passed to the main routine has

  3096  * been pre-cleaned of HTML.

  3097  */

  3098 void postprocess_for_HTML(char *theline)

  3099 {

  3100     if (strstr(theline,"<") && strstr(theline,">"))

  3101 	while (losemarkup(theline))

  3102 	    ;

  3103     while (loseentities(theline))

  3104 	;

  3105 }

  3107 char *losemarkup(char *theline)

  3108 {

  3109     char *s,*t;

  3110     int i;

  3111     if (!*theline)

  3112 	return NULL;

  3113     s=strstr(theline,"<");

  3114     t=strstr(theline,">");

  3115     if (!s || !t)

  3116 	return NULL;

  3117     for (i=0;*markup[i];i++)

  3118 	if (!tagcomp(s+1,markup[i]))

  3119 	{

  3120 	    if (!t[1])

  3121 	    {

  3122 		*s=0;

  3123 		return s;

  3124 	    }

  3125 	    else if (t>s)

  3126 	    {

  3127 		strcpy(s,t+1);

  3128 		return s;

  3129 	    }

  3130 	}

  3131     /* It's an unrecognized <xxx>. */

  3132     return NULL;

  3133 }

  3135 char *loseentities(char *theline)

  3136 {

  3137     int i;

  3138     char *s,*t;

  3139     if (!*theline)

  3140 	return NULL;

  3141     for (i=0;*entities[i].htmlent;i++)

  3142     {

  3143 	s=strstr(theline,entities[i].htmlent);

  3144 	if (s)

  3145 	{

  3146 	    t=malloc((size_t)strlen(s));

  3147 	    if (!t)

  3148 		return NULL;

  3149 	    strcpy(t,s+strlen(entities[i].htmlent));

  3150 	    strcpy(s,entities[i].textent);

  3151 	    strcat(s,t);

  3152 	    free(t);

  3153 	    return theline;

  3154 	}

  3155     }

  3156     for (i=0;*entities[i].htmlnum;i++)

  3157     {

  3158 	s=strstr(theline,entities[i].htmlnum);

  3159 	if (s)

  3160 	{

  3161 	    t=malloc((size_t)strlen(s));

  3162 	    if (!t)

  3163 		return NULL;

  3164 	    strcpy(t,s+strlen(entities[i].htmlnum));

  3165 	    strcpy(s,entities[i].textent);

  3166 	    strcat(s,t);

  3167 	    free(t);

  3168 	    return theline;

  3169 	}

  3170     }

  3171     return NULL;

  3172 }

  3174 int tagcomp(char *strin,char *basetag)

  3175 {

  3176     char *s,*t;

  3177     s=basetag;

  3178     t=strin;

  3179     if (*t=='/')

  3180 	t++; /* ignore a slash */

  3181     while (*s && *t)

  3182     {

  3183 	if (tolower(*s)!=tolower(*t))

  3184 	    return 1;

  3185 	s++;

  3186 	t++;

  3187     }

  3188     return 0;

  3189 }

  3191 void proghelp()

  3192 {

  3193     fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);

  3194     fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);

  3195     fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);

  3196     fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "

  3197       "For details, read the file COPYING.\n",stderr);

  3198     fputs("This is Free Software; "

  3199       "you may redistribute it under certain conditions (GPL);\n",stderr);

  3200     fputs("read the file COPYING for details.\n\n",stderr);

  3201     fputs("Usage is: bookloupe [-setpxloyhud] filename\n",stderr);

  3202     fputs("  where -s checks single quotes, -e suppresses echoing lines, "

  3203       "-t checks typos\n",stderr);

  3204     fputs("  -x (paranoid) switches OFF -t and extra checks, "

  3205       "-l turns OFF line-end checks\n",stderr);

  3206     fputs("  -o just displays overview without detail, "

  3207       "-h echoes header fields\n",stderr);

  3208     fputs("  -v (verbose) unsuppresses duplicate reporting, "

  3209       "-m suppresses markup\n",stderr);

  3210     fputs("  -d ignores DP-specific markup,\n",stderr);

  3211     fputs("  -u uses a file gutcheck.typ to query user-defined "

  3212       "possible typos\n",stderr);

  3213     fputs("Sample usage: bookloupe warpeace.txt \n",stderr);

  3214     fputs("\n",stderr);

  3215     fputs("Bookloupe looks for errors in Project Gutenberg(TM) etexts.\n",

  3216       stderr);

  3217     fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "

  3218       "non-ASCII\n",stderr);

  3219     fputs("characters like accented letters, "

  3220       "lines longer than 75 or shorter than 55,\n",stderr);

  3221     fputs("unbalanced quotes or brackets, "

  3222       "a variety of badly formatted punctuation, \n",stderr);

  3223     fputs("HTML tags, some likely typos. "

  3224       "It is NOT a substitute for human judgement.\n",stderr);

  3225     fputs("\n",stderr);

  3226 }

author	ali <ali@juiblex.co.uk>
	Mon May 27 09:03:04 2013 +0100 (2013-05-27)
changeset 68	adb087007d08
parent 67	865063352146
child 69	1016349e619f
permissions	-rw-r--r--