1 /*************************************************************************/
2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com> */
5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk> */
7 /* This program is free software; you can redistribute it and/or modify */
8 /* it under the terms of the GNU General Public License as published by */
9 /* the Free Software Foundation; either version 2 of the License, or */
10 /* (at your option) any later version. */
12 /* This program is distributed in the hope that it will be useful, */
13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
15 /* GNU General Public License for more details. */
17 /* You should have received a copy of the GNU General Public License */
18 /* along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 /*************************************************************************/
26 #define MAXWORDLEN 80 /* max length of one word */
27 #define LINEBUFSIZE 2048 /* buffer size for an input line */
29 #define MAX_USER_TYPOS 1000 /* number of slots in the usertypo[] pointer array */
30 #define USERTYPO_FILE "gutcheck.typ" /* user typo list; main() tries the cwd, then the executable's directory */
33 #define MAX_PATH 16384 /* size of the running_from and usertypo_file path buffers */
36 char aline[LINEBUFSIZE];
37 char prevline[LINEBUFSIZE];
41 "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
42 "nad", "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa",
43 "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
44 "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
45 "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
46 "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
47 "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
48 "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
49 "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
50 "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
51 "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
52 "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
53 "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
54 "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
55 "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
56 "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
57 "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
58 "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
59 "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
60 "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
61 "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
62 "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
63 "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
64 "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
65 "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
66 "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
67 "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
68 "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
69 "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
73 char *usertypo[MAX_USER_TYPOS];
75 /* Common abbreviations and other OK words not to query as typos. */
77 "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
78 "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
79 "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
80 "outbid", "outbids", "frostbite", "frostbitten", ""
83 /* Common abbreviations that cause otherwise unexplained periods. */
85 "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
86 "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
90 * Two-Letter combinations that rarely if ever start words,
91 * but are common scannos or otherwise common letter combinations.
94 "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
98 * Two-Letter combinations that rarely if ever end words,
99 * but are common scannos or otherwise common letter combinations.
102 "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
103 "sw", "gr", "sl", "cl", "iy", ""
107 "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
108 "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
109 "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
110 "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
114 "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
118 "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
119 "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
120 "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
121 "during", "let", "toward", "among", ""
125 "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
126 "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
127 "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
128 "among", "those", "into", "whom", "having", "thence", ""
131 char vowels[] = "aeiouàáâãäæèéêëìíîïòóôõöùúûü";
138 "&", "&", "&",
139 "<", "<", "<",
140 ">", ">", ">",
141 "°", "°", " degrees",
142 "£", "£", "L",
143 """, """, "\"", /* quotation mark = APL quote */
144 "Œ", "Œ", "OE", /* latin capital ligature OE */
145 "œ", "œ", "oe", /* latin small ligature oe */
146 "Š", "Š", "S", /* latin capital letter S with caron */
147 "š", "š", "s", /* latin small letter s with caron */
148 "Ÿ", "Ÿ", "Y", /* latin capital letter Y with diaeresis */
149 "ˆ", "ˆ", "", /* modifier letter circumflex accent */
150 "˜", "˜", "~", /* small tilde, U+02DC ISOdia */
151 " ", " ", " ", /* en space, U+2002 ISOpub */
152 " ", " ", " ", /* em space, U+2003 ISOpub */
153 " ", " ", " ", /* thin space, U+2009 ISOpub */
154 "–", "–", "-", /* en dash, U+2013 ISOpub */
155 "—", "—", "--", /* em dash, U+2014 ISOpub */
156 "’", "’", "'", /* right single quotation mark */
157 "‚", "‚", "'", /* single low-9 quotation mark */
158 "“", "“", "\"", /* left double quotation mark */
159 "”", "”", "\"", /* right double quotation mark */
160 "„", "„", "\"", /* double low-9 quotation mark */
161 "‹", "‹", "\"", /* single left-pointing angle quotation mark */
162 "›", "›", "\"", /* single right-pointing angle quotation mark */
163 " ", " ", " ", /* no-break space = non-breaking space, */
164 "¡", "¡", "!", /* inverted exclamation mark */
165 "¢", "¢", "c", /* cent sign */
166 "£", "£", "L", /* pound sign */
167 "¤", "¤", "$", /* currency sign */
168 "¥", "¥", "Y", /* yen sign = yuan sign */
169 "§", "§", "--", /* section sign */
170 "¨", "¨", " ", /* diaeresis = spacing diaeresis */
171 "©", "©", "(C) ", /* copyright sign */
172 "ª", "ª", " ", /* feminine ordinal indicator */
173 "«", "«", "\"", /* left-pointing double angle quotation mark */
174 "­", "­", "-", /* soft hyphen = discretionary hyphen */
175 "®", "®", "(R) ", /* registered sign = registered trade mark sign */
176 "¯", "¯", " ", /* macron = spacing macron = overline */
177 "°", "°", " degrees", /* degree sign */
178 "±", "±", "+-", /* plus-minus sign = plus-or-minus sign */
179 "²", "²", "2", /* superscript two = superscript digit two */
180 "³", "³", "3", /* superscript three = superscript digit three */
181 "´", "´", " ", /* acute accent = spacing acute */
182 "µ", "µ", "m", /* micro sign */
183 "¶", "¶", "--", /* pilcrow sign = paragraph sign */
184 "¸", "¸", " ", /* cedilla = spacing cedilla */
185 "¹", "¹", "1", /* superscript one = superscript digit one */
186 "º", "º", " ", /* masculine ordinal indicator */
187 "»", "»", "\"", /* right-pointing double angle quotation mark */
188 "¼", "¼", "1/4", /* vulgar fraction one quarter */
189 "½", "½", "1/2", /* vulgar fraction one half */
190 "¾", "¾", "3/4", /* vulgar fraction three quarters */
191 "¿", "¿", "?", /* inverted question mark */
192 "À", "À", "A", /* latin capital letter A with grave */
193 "Á", "Á", "A", /* latin capital letter A with acute */
194 "Â", "Â", "A", /* latin capital letter A with circumflex */
195 "Ã", "Ã", "A", /* latin capital letter A with tilde */
196 "Ä", "Ä", "A", /* latin capital letter A with diaeresis */
197 "Å", "Å", "A", /* latin capital letter A with ring above */
198 "Æ", "Æ", "AE", /* latin capital letter AE */
199 "Ç", "Ç", "C", /* latin capital letter C with cedilla */
200 "È", "È", "E", /* latin capital letter E with grave */
201 "É", "É", "E", /* latin capital letter E with acute */
202 "Ê", "Ê", "E", /* latin capital letter E with circumflex */
203 "Ë", "Ë", "E", /* latin capital letter E with diaeresis */
204 "Ì", "Ì", "I", /* latin capital letter I with grave */
205 "Í", "Í", "I", /* latin capital letter I with acute */
206 "Î", "Î", "I", /* latin capital letter I with circumflex */
207 "Ï", "Ï", "I", /* latin capital letter I with diaeresis */
208 "Ð", "Ð", "E", /* latin capital letter ETH */
209 "Ñ", "Ñ", "N", /* latin capital letter N with tilde */
210 "Ò", "Ò", "O", /* latin capital letter O with grave */
211 "Ó", "Ó", "O", /* latin capital letter O with acute */
212 "Ô", "Ô", "O", /* latin capital letter O with circumflex */
213 "Õ", "Õ", "O", /* latin capital letter O with tilde */
214 "Ö", "Ö", "O", /* latin capital letter O with diaeresis */
215 "×", "×", "*", /* multiplication sign */
216 "Ø", "Ø", "O", /* latin capital letter O with stroke */
217 "Ù", "Ù", "U", /* latin capital letter U with grave */
218 "Ú", "Ú", "U", /* latin capital letter U with acute */
219 "Û", "Û", "U", /* latin capital letter U with circumflex */
220 "Ü", "Ü", "U", /* latin capital letter U with diaeresis */
221 "Ý", "Ý", "Y", /* latin capital letter Y with acute */
222 "Þ", "Þ", "TH", /* latin capital letter THORN */
223 "ß", "ß", "sz", /* latin small letter sharp s = ess-zed */
224 "à", "à", "a", /* latin small letter a with grave */
225 "á", "á", "a", /* latin small letter a with acute */
226 "â", "â", "a", /* latin small letter a with circumflex */
227 "ã", "ã", "a", /* latin small letter a with tilde */
228 "ä", "ä", "a", /* latin small letter a with diaeresis */
229 "å", "å", "a", /* latin small letter a with ring above */
230 "æ", "æ", "ae", /* latin small letter ae */
231 "ç", "ç", "c", /* latin small letter c with cedilla */
232 "è", "è", "e", /* latin small letter e with grave */
233 "é", "é", "e", /* latin small letter e with acute */
234 "ê", "ê", "e", /* latin small letter e with circumflex */
235 "ë", "ë", "e", /* latin small letter e with diaeresis */
236 "ì", "ì", "i", /* latin small letter i with grave */
237 "í", "í", "i", /* latin small letter i with acute */
238 "î", "î", "i", /* latin small letter i with circumflex */
239 "ï", "ï", "i", /* latin small letter i with diaeresis */
240 "ð", "ð", "eth", /* latin small letter eth */
241 "ñ", "ñ", "n", /* latin small letter n with tilde */
242 "ò", "ò", "o", /* latin small letter o with grave */
243 "ó", "ó", "o", /* latin small letter o with acute */
244 "ô", "ô", "o", /* latin small letter o with circumflex */
245 "õ", "õ", "o", /* latin small letter o with tilde */
246 "ö", "ö", "o", /* latin small letter o with diaeresis */
247 "÷", "÷", "/", /* division sign */
248 "ø", "ø", "o", /* latin small letter o with stroke */
249 "ù", "ù", "u", /* latin small letter u with grave */
250 "ú", "ú", "u", /* latin small letter u with acute */
251 "û", "û", "u", /* latin small letter u with circumflex */
252 "ü", "ü", "u", /* latin small letter u with diaeresis */
253 "ý", "ý", "y", /* latin small letter y with acute */
254 "þ", "þ", "th", /* latin small letter thorn */
255 "ÿ", "ÿ", "y", /* latin small letter y with diaeresis */
259 /* special characters */
260 #define CHAR_SPACE 32
264 #define CHAR_DQUOTE 34
265 #define CHAR_SQUOTE 39
266 #define CHAR_OPEN_SQUOTE 96
267 #define CHAR_TILDE 126
268 #define CHAR_ASTERISK 42
269 #define CHAR_FORESLASH 47
270 #define CHAR_CARAT 94
272 #define CHAR_UNDERSCORE '_'
273 #define CHAR_OPEN_CBRACK '{'
274 #define CHAR_CLOSE_CBRACK '}'
275 #define CHAR_OPEN_RBRACK '('
276 #define CHAR_CLOSE_RBRACK ')'
277 #define CHAR_OPEN_SBRACK '['
278 #define CHAR_CLOSE_SBRACK ']'
280 /* longest and shortest normal PG line lengths */
281 #define LONGEST_PG_LINE 75
282 #define WAY_TOO_LONG 80
283 #define SHORTEST_PG_LINE 55
285 #define SWITCHES "ESTPXLOYHWVMUD" /* switches:- */
286 /* D - ignore DP-specific markup */
287 /* E - echo queried line */
288 /* S - check single quotes */
289 /* T - check common typos */
290 /* P - require closure of quotes on */
291 /* every paragraph */
292 /* X - "Trust no one" :-) Paranoid! */
293 /* Queries everything */
294 /* L - line end checking defaults on */
295 /* -L turns it off */
296 /* O - overview. Just shows counts. */
297 /* Y - puts errors to stdout */
298 /* instead of stderr */
299 /* H - Echoes header fields */
300 /* M - Ignore markup in < > */
301 /* U - Use file of User-defined Typos*/
302 /* W - Defaults for use on Web upload*/
303 /* V - Verbose - list EVERYTHING! */
304 #define SWITNO 14 /* max number of switch parms */
305 /* - used for defining array-size */
306 #define MINARGS 1 /* minimum no of args excl switches */
307 #define MAXARGS 1 /* maximum no of args excl switches */
309 int pswit[SWITNO]; /* program switches set by SWITCHES */
/*
 * Indices into pswit[]. Each value is the position of the switch's
 * letter within SWITCHES ("ESTPXLOYHWVMUD"); main() compares each
 * command-line letter against SWITCHES[i] (presumably setting pswit[i]
 * on a match -- that line is not shown here).
 * NOTE(review): no index is defined in these lines for position 9 ('W')
 * or position 13 ('D'), although main() reads pswit[WEB_SWITCH].
 */
311 #define ECHO_SWITCH 0 /* 'E' */
312 #define SQUOTE_SWITCH 1 /* 'S' */
313 #define TYPO_SWITCH 2 /* 'T' */
314 #define QPARA_SWITCH 3 /* 'P' */
315 #define PARANOID_SWITCH 4 /* 'X' */
316 #define LINE_END_SWITCH 5 /* 'L' */
317 #define OVERVIEW_SWITCH 6 /* 'O' */
318 #define STDOUT_SWITCH 7 /* 'Y' */
319 #define HEADER_SWITCH 8 /* 'H' */
321 #define VERBOSE_SWITCH 10 /* 'V' */
322 #define MARKUP_SWITCH 11 /* 'M' */
323 #define USERTYPO_SWITCH 12 /* 'U' */
326 long cnt_dquot; /* for overview mode, count of doublequote queries */
327 long cnt_squot; /* for overview mode, count of singlequote queries */
328 long cnt_brack; /* for overview mode, count of brackets queries */
329 long cnt_bin; /* for overview mode, count of non-ASCII queries */
330 long cnt_odd; /* for overview mode, count of odd character queries */
331 long cnt_long; /* for overview mode, count of long line errors */
332 long cnt_short; /* for overview mode, count of short line queries */
333 long cnt_punct; /* for overview mode, count of punctuation and spacing queries */
334 long cnt_dash; /* for overview mode, count of dash-related queries */
335 long cnt_word; /* for overview mode, count of word queries */
336 long cnt_html; /* for overview mode, count of html queries */
337 long cnt_lineend; /* for overview mode, count of line-end queries */
338 long cnt_spacend; /* count of lines with space at end */
339 long linecnt; /* count of total lines in the file */
340 long checked_linecnt; /* count of lines actually checked */
343 void procfile(char *);
345 #define LOW_THRESHOLD 0
346 #define HIGH_THRESHOLD 1
352 #define FIRST_OF_PAIR 0
353 #define SECOND_OF_PAIR 1
355 #define MAX_WORDPAIR 1000
357 char running_from[MAX_PATH];
359 int mixdigit(char *);
360 const char *getaword(const char *,char *);
361 int matchword(char *,char *);
362 char *flgets(char *,int,FILE *,long);
363 void lowerit(char *);
364 int gcisalpha(unsigned char);
365 int gcisdigit(unsigned char);
366 int gcisletter(unsigned char);
367 char *gcstrchr(char *s,char c);
368 void postprocess_for_HTML(char *);
369 char *linehasmarkup(char *);
370 char *losemarkup(char *);
371 int tagcomp(char *,char *);
372 char *loseentities(char *);
375 void postprocess_for_DP(char *);
377 char wrk[LINEBUFSIZE];
380 #define MAX_QWORD_LENGTH 40
381 char qword[MAX_QWORD][MAX_QWORD_LENGTH];
382 signed int dupcnt[MAX_QWORD];
/*
 * main: program entry point (only part of the body is visible here).
 * Visible steps: copy argv[0] into running_from and scan back to the
 * last path separator, parse '-' switches, apply switch defaults and
 * overrides, optionally load the user typo list into usertypo[], and
 * print the query counts when the overview switch is set.
 */
384 int main(int argc,char **argv)
388 char usertypo_file[MAX_PATH];
390 if (strlen(argv[0])<sizeof(running_from))
391 /* save the path to the executable */
392 strcpy(running_from,argv[0]);
393 /* find out what directory we're running from */
394 s=running_from+strlen(running_from);
/*
 * NOTE(review): *s is read before s>=running_from is tested, so when
 * the path contains no separator the loop reads running_from[-1]
 * before stopping. The bounds test should come first.
 */
395 for (;*s!='/' && *s!='\\' && s>=running_from;s--)
397 switno=strlen(SWITCHES);
/*
 * NOTE(review): --i>0 runs i from switno-1 down to 1, so pswit[0] is
 * not assigned here. pswit is a file-scope array, so it starts zeroed
 * anyway.
 */
398 for (i=switno;--i>0;)
399 pswit[i]=0; /* initialise switches */
401 * Standard loop to extract switches.
402 * When we come out of this loop, the arguments will be
403 * in argv[0] upwards and the switches used will be
404 * represented by their equivalent elements in pswit[]
406 while (--argc>0 && **++argv=='-')
407 for (argsw=argv[0]+1;*argsw!='\0';argsw++)
408 for (i=switno,invarg=1;(--i>=0) && invarg==1;)
409 if ((toupper(*argsw))==SWITCHES[i])
414 /* Paranoid checking is turned OFF, not on, by its switch */
415 pswit[PARANOID_SWITCH]^=1;
416 if (pswit[PARANOID_SWITCH])
417 /* in paranoid mode, invert the typo-check switch as well */
418 pswit[TYPO_SWITCH]=pswit[TYPO_SWITCH]^1;
419 /* Line-end checking is turned OFF, not on, by its switch */
420 pswit[LINE_END_SWITCH]^=1;
421 /* Echoing is turned OFF, not on, by its switch */
422 pswit[ECHO_SWITCH]^=1;
423 if (pswit[OVERVIEW_SWITCH])
424 /* just print summary; don't echo */
425 pswit[ECHO_SWITCH]=0;
427 * Web uploads - for the moment, this is really just a placeholder
428 * until we decide what processing we really want to do on web uploads
430 if (pswit[WEB_SWITCH])
432 /* specific override for web uploads */
433 pswit[ECHO_SWITCH]=1;
434 pswit[SQUOTE_SWITCH]=0;
435 pswit[TYPO_SWITCH]=1;
436 pswit[QPARA_SWITCH]=0;
437 pswit[PARANOID_SWITCH]=1;
438 pswit[LINE_END_SWITCH]=0;
439 pswit[OVERVIEW_SWITCH]=0;
440 pswit[STDOUT_SWITCH]=0;
441 pswit[HEADER_SWITCH]=1;
442 pswit[VERBOSE_SWITCH]=0;
443 pswit[MARKUP_SWITCH]=0;
444 pswit[USERTYPO_SWITCH]=0;
447 if (argc<MINARGS || argc>MAXARGS)
449 /* check number of args */
453 /* read in the user-defined stealth scanno list */
454 if (pswit[USERTYPO_SWITCH])
456 /* ... we were told we had one! */
457 usertypofile=fopen(USERTYPO_FILE,"rb");
460 /* not in cwd. try executable directory. */
/*
 * NOTE(review): usertypo_file and running_from are both MAX_PATH bytes
 * and there is no length check before the strcat, so a directory path
 * close to MAX_PATH overflows usertypo_file.
 */
461 strcpy(usertypo_file,running_from);
462 strcat(usertypo_file,USERTYPO_FILE);
463 usertypofile=fopen(usertypo_file,"rb");
465 /* we ain't got no user typo file! */
466 printf(" --> I couldn't find gutcheck.typ "
467 "-- proceeding without user typos.\n");
473 /* we managed to open a User Typo File! */
474 if (pswit[USERTYPO_SWITCH])
476 while (flgets(aline,LINEBUFSIZE-1,usertypofile,
477 (long)usertypo_count))
/* each stored typo gets its own heap copy sized to the line just read */
483 s=malloc(strlen(aline)+1);
486 fprintf(stderr,"bookloupe: cannot get enough "
487 "memory for user typo file!\n");
491 usertypo[usertypo_count]=s;
493 if (usertypo_count>=MAX_USER_TYPOS)
495 printf(" --> Only %d user-defined typos "
496 "allowed: ignoring the rest\n",
504 fclose(usertypofile);
507 fprintf(stderr,"bookloupe: Check and report on an e-text\n");
508 cnt_dquot=cnt_squot=cnt_brack=cnt_bin=cnt_odd=cnt_long=
509 cnt_short=cnt_punct=cnt_dash=cnt_word=cnt_html=cnt_lineend=
/* overview mode: print the per-category query counts and their total */
512 if (pswit[OVERVIEW_SWITCH])
514 printf(" Checked %ld lines of %ld (head+foot = %ld)\n\n",
515 checked_linecnt,linecnt,linecnt-checked_linecnt);
516 printf(" --------------- Queries found --------------\n");
518 printf(" Long lines: %14ld\n",cnt_long);
520 printf(" Short lines: %14ld\n",cnt_short);
522 printf(" Line-end problems: %14ld\n",cnt_lineend);
524 printf(" Common typos: %14ld\n",cnt_word);
526 printf(" Unmatched quotes: %14ld\n",cnt_dquot);
528 printf(" Unmatched SingleQuotes: %14ld\n",cnt_squot);
530 printf(" Unmatched brackets: %14ld\n",cnt_brack);
532 printf(" Non-ASCII characters: %14ld\n",cnt_bin);
534 printf(" Proofing characters: %14ld\n",cnt_odd);
536 printf(" Punctuation & spacing queries: %14ld\n",cnt_punct);
538 printf(" Non-standard dashes: %14ld\n",cnt_dash);
540 printf(" Possible HTML tags: %14ld\n",cnt_html);
542 printf(" TOTAL QUERIES %14ld\n",
543 cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+
544 cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend);
549 struct first_pass_results {
550 long firstline,astline;
551 long footerline,totlen,binlen,alphalen,endquote_count,shortline,dotcomma;
552 long fslashline,hyphens,longline,verylongline,htmcount,standalone_digit;
553 long spacedash,emdash,space_emdash,non_PG_space_emdash,PG_space_emdash;
554 signed int Dutchcount,Frenchcount;
560 * Run a first pass - verify that it's a valid PG
561 * file, decide whether to report some things that
562 * occur many times in the text like long or short
563 * lines, non-standard dashes, etc.
/*
 * first_pass: reads infile line by line into the global aline buffer
 * and accumulates per-file statistics in a static first_pass_results.
 */
565 struct first_pass_results *first_pass(FILE *infile)
567 char laststart=CHAR_SPACE;
570 unsigned int lastlen=0,lastblen=0;
571 long spline=0,nspline=0;
572 static struct first_pass_results results={0};
573 char inword[MAXWORDLEN]="";
574 while (fgets(aline,LINEBUFSIZE-1,infile))
/*
 * Strip trailing LF (10) / CR (13) characters.
 * NOTE(review): once the line has been stripped to "", strlen(aline)-1
 * wraps around and the test reads the byte before aline[0] (and the
 * next line would write to it if that byte is 10 or 13). An explicit
 * length>0 check is needed.
 */
576 while (aline[strlen(aline)-1]==10 || aline[strlen(aline)-1]==13)
577 aline[strlen(aline)-1]=0;
579 if (strstr(aline,"*END") && strstr(aline,"SMALL PRINT") &&
580 (strstr(aline,"PUBLIC DOMAIN") || strstr(aline,"COPYRIGHT")))
583 printf(" --> Duplicate header?\n");
584 spline=linecnt+1; /* first line of non-header text, that is */
586 if (!strncmp(aline,"*** START",9) && strstr(aline,"PROJECT GUTENBERG"))
589 printf(" --> Duplicate header?\n");
590 nspline=linecnt+1; /* first line of non-header text, that is */
592 if (spline || nspline)
595 if (strstr(aline,"end") && strstr(aline,"project gutenberg"))
597 if (strstr(aline,"end")<strstr(aline,"project gutenberg"))
599 if (results.footerline)
601 /* it's an old-form header - we can detect duplicates */
603 printf(" --> Duplicate footer?\n");
606 results.footerline=linecnt;
611 results.firstline=spline;
613 results.firstline=nspline; /* override with new */
614 if (results.footerline)
615 continue; /* don't count the boilerplate in the footer */
617 results.totlen+=llen;
620 if ((unsigned char)aline[i]>127)
622 if (gcisalpha(aline[i]))
/*
 * NOTE(review): isalpha() is passed a plain char here; for bytes above
 * 127 on a signed-char platform that is a negative value, which
 * isalpha() does not accept. The neighbouring tests use gcisalpha() or
 * an unsigned char cast.
 */
624 if (i>0 && aline[i]==CHAR_DQUOTE && isalpha(aline[i-1]))
625 results.endquote_count++;
/* NOTE(review): lastblen>2 is implied by lastblen>SHORTEST_PG_LINE. */
627 if (strlen(aline)>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE &&
628 lastblen>2 && lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
630 if (*aline && (unsigned char)aline[strlen(aline)-1]<=CHAR_SPACE)
632 if (strstr(aline,".,"))
634 /* only count asterisk lines for ignoring purposes where there is */
635 /* lower-case text on the line */
636 if (strstr(aline,"*"))
639 if (*s>='a' && *s<='z')
644 if (strstr(aline,"/"))
645 results.fslashline++;
/* walk i back from the end of the line over trailing white space */
646 for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--)
/*
 * NOTE(review): the loop above can leave i at 0 (or at -1 if llen is
 * 0), in which case aline[i-1] (or aline[i] itself) indexes before the
 * start of the buffer -- confirm against the lines not shown here.
 */
648 if (aline[i]=='-' && aline[i-1]!='-')
650 if (llen>LONGEST_PG_LINE)
652 if (llen>WAY_TOO_LONG)
653 results.verylongline++;
654 if (strstr(aline,"<") && strstr(aline,">"))
656 i=(signed int)(strstr(aline,">")-strstr(aline,"<")+1);
659 if (strstr(aline,"<i>"))
660 results.htmcount+=4; /* bonus marks! */
662 /* Check for spaced em-dashes */
663 if (strstr(aline,"--"))
/*
 * Only the first "--" on the line is examined (strstr is re-run for
 * each test and always returns the first match).
 * NOTE(review): when the line begins with "--", strstr(...)-1 points
 * one byte before aline[0]; the three tests below read that byte.
 */
666 if (*(strstr(aline,"--")-1)==CHAR_SPACE ||
667 (*(strstr(aline,"--")+2)==CHAR_SPACE))
668 results.space_emdash++;
669 if (*(strstr(aline,"--")-1)==CHAR_SPACE &&
670 (*(strstr(aline,"--")+2)==CHAR_SPACE))
671 /* count of em-dashes with spaces both sides */
672 results.non_PG_space_emdash++;
673 if (*(strstr(aline,"--")-1)!=CHAR_SPACE &&
674 (*(strstr(aline,"--")+2)!=CHAR_SPACE))
675 /* count of PG-type em-dashes with no spaces */
676 results.PG_space_emdash++;
680 s=getaword(s,inword);
/* language hints: two marker words each for Dutch and French */
681 if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))
682 results.Dutchcount++;
683 if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))
684 results.Frenchcount++;
685 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
686 results.standalone_digit++;
688 /* Check for spaced dashes */
689 if (strstr(aline," -") && *(strstr(aline," -")+2)!='-')
692 lastlen=strlen(aline);
699 signed int shortline,longline,bin,dash,dotcomma,ast,fslash,digit,hyphen;
700 signed int endquote,isDutch,isFrench;
706 * Make some snap decisions based on the first pass results.
708 struct warnings *report_first_pass(struct first_pass_results *results)
710 static struct warnings warnings={0};
712 printf(" --> %ld lines in this file have white space at end\n",
715 if (results->dotcomma>5)
718 printf(" --> %ld lines in this file contain '.,'. "
719 "Not reporting them.\n",results->dotcomma);
722 * If more than 50 lines, or one-tenth, are short,
723 * don't bother reporting them.
725 warnings.shortline=1;
726 if (results->shortline>50 || results->shortline*10>linecnt)
728 warnings.shortline=0;
729 printf(" --> %ld lines in this file are short. "
730 "Not reporting short lines.\n",results->shortline);
733 * If more than 50 lines, or one-tenth, are long,
734 * don't bother reporting them.
737 if (results->longline>50 || results->longline*10>linecnt)
740 printf(" --> %ld lines in this file are long. "
741 "Not reporting long lines.\n",results->longline);
743 /* If more than 10 lines contain asterisks, don't bother reporting them. */
745 if (results->astline>10)
748 printf(" --> %ld lines in this file contain asterisks. "
749 "Not reporting them.\n",results->astline);
752 * If more than 10 lines contain forward slashes,
753 * don't bother reporting them.
756 if (results->fslashline>10)
759 printf(" --> %ld lines in this file contain forward slashes. "
760 "Not reporting them.\n",results->fslashline);
763 * If more than 20 lines contain unpunctuated endquotes,
764 * don't bother reporting them.
767 if (results->endquote_count>20)
770 printf(" --> %ld lines in this file contain unpunctuated endquotes. "
771 "Not reporting them.\n",results->endquote_count);
774 * If more than 10 lines contain standalone digits,
775 * don't bother reporting them.
778 if (results->standalone_digit>10)
781 printf(" --> %ld lines in this file contain standalone 0s and 1s. "
782 "Not reporting them.\n",results->standalone_digit);
785 * If more than 20 lines contain hyphens at end,
786 * don't bother reporting them.
789 if (results->hyphens>20)
792 printf(" --> %ld lines in this file have hyphens at end. "
793 "Not reporting them.\n",results->hyphens);
795 if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
797 printf(" --> Looks like this is HTML. Switching HTML mode ON.\n");
798 pswit[MARKUP_SWITCH]=1;
800 if (results->verylongline>0)
801 printf(" --> %ld lines in this file are VERY long!\n",
802 results->verylongline);
804 * If there are more non-PG spaced dashes than PG em-dashes,
805 * assume it's deliberate.
806 * Current PG guidelines say don't use them, but older texts do,
807 * and some people insist on them whatever the guidelines say.
810 if (results->spacedash+results->non_PG_space_emdash>
811 results->PG_space_emdash)
814 printf(" --> There are %ld spaced dashes and em-dashes. "
815 "Not reporting them.\n",
816 results->spacedash+results->non_PG_space_emdash);
818 /* If more than a quarter of characters are hi-bit, bug out. */
820 if (results->binlen*4>results->totlen)
822 printf(" --> This file does not appear to be ASCII. "
823 "Terminating. Best of luck with it!\n");
826 if (results->alphalen*4<results->totlen)
828 printf(" --> This file does not appear to be text. "
829 "Terminating. Best of luck with it!\n");
832 if (results->binlen*100>results->totlen || results->binlen>100)
834 printf(" --> There are a lot of foreign letters here. "
835 "Not reporting them.\n");
839 if (results->Dutchcount>50)
842 printf(" --> This looks like Dutch - "
843 "switching off dashes and warnings for 's Middags case.\n");
846 if (results->Frenchcount>50)
849 printf(" --> This looks like French - "
850 "switching off some doublepunct.\n");
852 if (results->firstline && results->footerline)
853 printf(" The PG header and footer appear to be already on.\n");
856 if (results->firstline)
857 printf(" The PG header is on - no footer.\n");
858 if (results->footerline)
859 printf(" The PG footer is on - no header.\n");
862 if (pswit[VERBOSE_SWITCH])
865 warnings.shortline=1;
874 printf(" *** Verbose output is ON -- you asked for it! ***\n");
876 if (warnings.isDutch)
878 if (results->footerline>0 && results->firstline>0 &&
879 results->footerline>results->firstline &&
880 results->footerline-results->firstline<100)
882 printf(" --> I don't really know where this text starts. \n");
883 printf(" There are no reference points.\n");
884 printf(" I'm going to have to report the header and footer "
886 results->firstline=0;
893 signed int c_unders,c_brack,s_brack,r_brack;
894 signed int open_single_quote,close_single_quote;
900 * Look along the line, accumulate the count of quotes, and see
901 * if this is an empty line - i.e. a line with nothing on it
903 * If line has just spaces, period, * and/or - on it, don't
904 * count it, since empty lines with asterisks or dashes to
905 * separate sections are common.
907 * Returns: Non-zero if the line is empty.
/*
 * analyse_quotes: scans the characters of s, updating the single-quote,
 * underscore and bracket tallies in *counters, and tracks in
 * isemptyline whether anything other than spacer characters was seen.
 */
909 int analyse_quotes(const char *s,struct counters *counters)
911 signed int guessquote=0;
912 int isemptyline=1; /* assume the line is empty until proven otherwise */
917 if (*s==CHAR_SQUOTE || *s==CHAR_OPEN_SQUOTE)
922 * At start of line, it can only be an openquote.
923 * Hardcode a very common exception!
/*
 * NOTE(review): this start-of-line test compares at s+2, whereas the
 * equivalent test further down compares at s+1. With s pointing at the
 * quote, s+2 skips the first letter after it, so "'tis" would not
 * match here -- confirm which offset is intended.
 */
925 if (strncmp(s+2,"tis",3) && strncmp(s+2,"Tis",3))
926 counters->open_single_quote++;
/*
 * NOTE(review): s[-1] below is only valid if this branch is never
 * reached with s at the first character of the line -- presumably
 * guarded by a start-of-line test not shown here.
 */
928 else if (gcisalpha(s[-1]) && gcisalpha(s[1]))
929 /* Do nothing! it's definitely an apostrophe, not a quote */
931 /* it's outside a word - let's check it out */
932 else if (*s==CHAR_OPEN_SQUOTE || gcisalpha(s[1]))
934 /* it damwell better BE an openquote */
935 if (strncmp(s+1,"tis",3) && strncmp(s+1,"Tis",3))
936 /* hardcode a very common exception! */
937 counters->open_single_quote++;
941 /* now - is it a closequote? */
942 guessquote=0; /* accumulate clues */
943 if (gcisalpha(s[-1]))
945 /* it follows a letter - could be either */
949 /* looks like a plural apostrophe */
951 if (s[1]==CHAR_SPACE) /* bonus marks! */
955 /* it doesn't have a letter either side */
/*
 * strchr() also matches the string's terminating NUL, so a quote that
 * is the last character on the line (s[1]=='\0') passes the second
 * test below.
 */
956 else if (strchr(".?!,;:",s[-1]) && strchr(".?!,;: ",s[1]))
957 guessquote+=8; /* looks like a closequote */
960 if (counters->open_single_quote>counters->close_single_quote)
962 * Give it the benefit of some doubt,
963 * if a squote is already open.
969 counters->close_single_quote++;
972 if (*s!=CHAR_SPACE && *s!='-' && *s!='.' && *s!=CHAR_ASTERISK &&
974 isemptyline=0; /* ignore lines like * * * as spacers */
/* tally underscores and each kind of bracket for later balance checks */
975 if (*s==CHAR_UNDERSCORE)
976 counters->c_unders++;
977 if (*s==CHAR_OPEN_CBRACK)
979 if (*s==CHAR_CLOSE_CBRACK)
981 if (*s==CHAR_OPEN_RBRACK)
983 if (*s==CHAR_CLOSE_RBRACK)
985 if (*s==CHAR_OPEN_SBRACK)
987 if (*s==CHAR_CLOSE_SBRACK)
995 * check_for_control_characters:
997 * Check for invalid or questionable characters in the line
998 * Anything above 127 is invalid for plain ASCII, and
999 * non-printable control characters should also be flagged.
1000 * Tabs should generally not be there.
1002 void check_for_control_characters(const char *aline)
1006 for (s=aline;*s;s++)
1008 c=*(unsigned char *)s;
1009 if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
1011 if (pswit[ECHO_SWITCH])
1012 printf("\n%s\n",aline);
1013 if (!pswit[OVERVIEW_SWITCH])
1014 printf(" Line %ld column %d - Control character %d\n",
1015 linecnt,(int)(s-aline)+1,c);
1023 * check_for_odd_characters:
1025 * Check for binary and other odd characters.
/*
 * Scans aline and queries: control or non-ASCII bytes, tabs, tildes,
 * carets, forward slashes (only when warnings->fslash is set) and
 * asterisks (only with pswit[PARANOID_SWITCH], warnings->ast and
 * !isemptyline). Every test is gated by its own e* flag so that, as the
 * note below says, a warning is not repeated on one line.
 * Each report echoes the line under pswit[ECHO_SWITCH] and prints the
 * query, with a 1-based column, unless pswit[OVERVIEW_SWITCH] is set.
 */
1027 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
1030 /* Don't repeat multiple warnings on one line. */
1031 signed int eNon_A=0,eTab=0,eTilde=0,eCarat=0,eFSlash=0,eAst=0;
1034 for (s=aline;*s;s++)
1036 c=*(unsigned char *)s;
/*
 * && binds tighter than ||, so this reads as
 *   (below CHAR_SPACE and neither tab nor newline) || c>127.
 * The first three comparisons use *s as plain char; c is the same byte
 * viewed as unsigned char.
 */
1037 if (!eNon_A && (*s<CHAR_SPACE && *s!=9 && *s!='\n' || c>127))
1039 if (pswit[ECHO_SWITCH])
1040 printf("\n%s\n",aline);
1041 if (!pswit[OVERVIEW_SWITCH])
/* NOTE(review): confirm which condition selects between the two messages below. */
1043 printf(" Line %ld column %d - "
1044 "Non-ISO-8859 character %d\n",linecnt,(int)(s-aline)+1,c);
1046 printf(" Line %ld column %d - Non-ASCII character %d\n",
1047 linecnt,(int)(s-aline)+1,c);
1052 if (!eTab && *s==CHAR_TAB)
1054 if (pswit[ECHO_SWITCH])
1055 printf("\n%s\n",aline);
1056 if (!pswit[OVERVIEW_SWITCH])
1057 printf(" Line %ld column %d - Tab character?\n",
1058 linecnt,(int)(s-aline)+1);
1063 if (!eTilde && *s==CHAR_TILDE)
1066 * Often used by OCR software to indicate an
1067 * unrecognizable character.
1069 if (pswit[ECHO_SWITCH])
1070 printf("\n%s\n",aline);
1071 if (!pswit[OVERVIEW_SWITCH])
1072 printf(" Line %ld column %d - Tilde character?\n",
1073 linecnt,(int)(s-aline)+1);
1078 if (!eCarat && *s==CHAR_CARAT)
1080 if (pswit[ECHO_SWITCH])
1081 printf("\n%s\n",aline);
1082 if (!pswit[OVERVIEW_SWITCH])
1083 printf(" Line %ld column %d - Carat character?\n",
1084 linecnt,(int)(s-aline)+1);
/* The slash query is additionally switched by warnings->fslash. */
1089 if (!eFSlash && *s==CHAR_FORESLASH && warnings->fslash)
1091 if (pswit[ECHO_SWITCH])
1092 printf("\n%s\n",aline);
1093 if (!pswit[OVERVIEW_SWITCH])
1094 printf(" Line %ld column %d - Forward slash?\n",
1095 linecnt,(int)(s-aline)+1);
1101 * Report asterisks only in paranoid mode,
1102 * since they're often deliberate.
1104 if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
1107 if (pswit[ECHO_SWITCH])
1108 printf("\n%s\n",aline);
1109 if (!pswit[OVERVIEW_SWITCH])
1110 printf(" Line %ld column %d - Asterisk?\n",
1111 linecnt,(int)(s-aline)+1);
1120 * check_for_long_line:
1122 * Check for line too long.
/*
 * Queries any line whose strlen() exceeds LONGEST_PG_LINE. The length is
 * printed twice: once as the column and once as the reported length.
 */
1124 void check_for_long_line(const char *aline)
1126 if (strlen(aline)>LONGEST_PG_LINE)
1128 if (pswit[ECHO_SWITCH])
1129 printf("\n%s\n",aline);
1130 if (!pswit[OVERVIEW_SWITCH])
/*
 * NOTE(review): strlen() yields size_t but both values are consumed by %d
 * conversions. A cast to int (as the column arguments of the other checks
 * use) or a %zu conversion would make the types agree.
 */
1131 printf(" Line %ld column %d - Long line %d\n",
1132 linecnt,strlen(aline),strlen(aline));
/* Per-line bookkeeping read by the short-line and paragraph-end checks. */
1138 struct line_properties {
/*
 * len is the length of the last line examined and blen the length of the
 * last but one, as described in the check_for_short_line notes.
 */
1139 unsigned int len,blen;
1144 * check_for_short_line:
1146 * Check for line too short.
1148 * This one is a bit trickier to implement: we don't want to
1149 * flag the last line of a paragraph for being short, so we
1150 * have to wait until we know that our current line is a
1151 * "normal" line, then report the _previous_ line if it was too
1152 * short. We also don't want to report indented lines like
1153 * chapter heads or formatted quotations. We therefore keep
1154 * last->len as the length of the last line examined, and
1155 * last->blen as the length of the last but one, and try to
1156 * suppress unnecessary warnings by checking that both were of
1157 * "normal" length. We keep the first character of the last
1158 * line in last->start, and if it was a space, we assume that
1159 * the formatting is deliberate. I can't figure out a way to
1160 * distinguish something like a quoted verse left-aligned or
1161 * the header or footer of a letter from a paragraph of short
1162 * lines - maybe if I examined the whole paragraph, and if the
1163 * para has less than, say, 8 lines and if all lines are short,
1164 * then just assume it's OK? Need to look at some texts to see
1165 * how often a formula like this would get the right result.
/*
 * Queries the previous line as too short. It fires only when the current
 * line is longer than one character, last->len lies strictly between 1 and
 * SHORTEST_PG_LINE, last->blen exceeds SHORTEST_PG_LINE and last->start is
 * not CHAR_SPACE. The text reported is the file-scope prevline buffer and
 * the line number is linecnt-1.
 */
1167 void check_for_short_line(const char *aline,const struct line_properties *last)
/*
 * "last->blen>1" is implied by "last->blen>SHORTEST_PG_LINE" whenever
 * SHORTEST_PG_LINE is at least 1 -- TODO confirm its value.
 */
1169 if (strlen(aline)>1 && last->len>1 && last->len<SHORTEST_PG_LINE &&
1170 last->blen>1 && last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
1172 if (pswit[ECHO_SWITCH])
1173 printf("\n%s\n",prevline);
1174 if (!pswit[OVERVIEW_SWITCH])
/*
 * NOTE(review): strlen() yields size_t but is consumed by %d here; a cast
 * to int or a %zu conversion would make the types agree.
 */
1175 printf(" Line %ld column %d - Short line %d?\n",
1176 linecnt-1,strlen(prevline),strlen(prevline));
1183 * check_for_starting_punctuation:
1185 * Look for punctuation other than full ellipses at start of line.
/*
 * Queries a line whose first character is one of . ? ! , ; : unless the
 * line begins with the spaced ellipsis ". . .".
 */
1187 void check_for_starting_punctuation(const char *aline)
/*
 * "*aline" is tested first because strchr() also matches the terminating
 * NUL, so an empty line would otherwise be treated as punctuation.
 * strncmp() returning non-zero means the line does not start with ". . .".
 */
1189 if (*aline && strchr(".?!,;:",aline[0]) && strncmp(". . .",aline,5))
1191 if (pswit[ECHO_SWITCH])
1192 printf("\n%s\n",aline);
1193 if (!pswit[OVERVIEW_SWITCH])
1194 printf(" Line %ld column 1 - Begins with punctuation?\n",
1202 * check_for_spaced_emdash:
1204 * Check for spaced em-dashes.
1206 * We must check _all_ occurrences of "--" on the line
1207 * hence the loop - even if the first double-dash is OK
1208 * there may be another that's wrong later on.
/*
 * Locates "--" with strstr() in a loop and queries an occurrence that has
 * a space immediately before it or immediately after it. The reported
 * column is the 1-based position of the first dash.
 */
1210 void check_for_spaced_emdash(const char *aline)
1214 while ((t=strstr(s,"--")))
/*
 * "t>aline" guards the t[-1] read. && binds tighter than ||, so the t[2]
 * test is applied even when the "--" sits at the very start of the line.
 */
1216 if (t>aline && t[-1]==CHAR_SPACE || t[2]==CHAR_SPACE)
1218 if (pswit[ECHO_SWITCH])
1219 printf("\n%s\n",aline);
1220 if (!pswit[OVERVIEW_SWITCH])
1221 printf(" Line %ld column %d - Spaced em-dash?\n",
1222 linecnt,(int)(t-aline)+1);
1231 * check_for_spaced_dash:
1233 * Check for spaced dashes.
/*
 * Queries a single dash with a space beside it. The first " -" on the
 * line is looked up; only if there is none is the first "- " looked up.
 * strstr() is called once per pattern, so later occurrences on the same
 * line are not examined.
 */
1235 void check_for_spaced_dash(const char *aline)
1238 if ((s=strstr(aline," -")))
1242 if (pswit[ECHO_SWITCH])
1243 printf("\n%s\n",aline);
1244 if (!pswit[OVERVIEW_SWITCH])
1245 printf(" Line %ld column %d - Spaced dash?\n",
1246 linecnt,(int)(s-aline)+1);
1251 else if ((s=strstr(aline,"- ")))
/*
 * Skip a dash that is the tail of "--"; the s==aline test comes first so
 * s[-1] is never read before the start of the line.
 */
1253 if (s==aline || s[-1]!='-')
1255 if (pswit[ECHO_SWITCH])
1256 printf("\n%s\n",aline);
1257 if (!pswit[OVERVIEW_SWITCH])
1258 printf(" Line %ld column %d - Spaced dash?\n",
1259 linecnt,(int)(s-aline)+1);
1267 * check_for_unmarked_paragraphs:
1269 * Check for unmarked paragraphs indicated by separate speakers.
1271 * May well be false positive:
1272 * "Bravo!" "Wonderful!" called the crowd.
1273 * but useful all the same.
/*
 * Queries a closing double quote followed by whitespace and an opening
 * double quote on the same line, which may mark a change of speaker
 * without a paragraph break. The column reported is that of the first
 * quote (1-based).
 */
1275 void check_for_unmarked_paragraphs(const char *aline)
/*
 * NOTE(review): the two search patterns below read identically here. If
 * they are meant to differ in the number of spaces between the quotes,
 * confirm that the literals still do; otherwise the second lookup is
 * redundant.
 */
1278 s=strstr(aline,"\" \"");
1280 s=strstr(aline,"\" \"");
1283 if (pswit[ECHO_SWITCH])
1284 printf("\n%s\n",aline);
1285 if (!pswit[OVERVIEW_SWITCH])
1286 printf(" Line %ld column %d - Query missing paragraph break?\n",
1287 linecnt,(int)(s-aline)+1);
1294 * check_for_jeebies:
1296 * Check for "to he" and other easy h/b errors.
1298 * This is a very inadequate effort on the h/b problem,
1299 * but the phrase "to he" is always an error, whereas "to
1300 * be" is quite common.
1301 * Similarly, '"Quiet!", be said.' is a non-be error
1302 * "to he" is _not_ always an error!:
1303 * "Where they went to he couldn't say."
1304 * Another false positive:
1305 * What would "Cinderella" be without the . . .
1306 * and another: "If he wants to he can see for himself."
/*
 * Searches aline with strstr() for three families of fixed phrases and
 * prints one query per family: he/be, had/bad and hut/but. The matching
 * is case-sensitive and most patterns carry a leading and trailing space,
 * so a phrase at the very start or end of the line is not matched by
 * those patterns.
 */
1308 void check_for_jeebies(const char *aline)
/* Family 1: phrases suggesting "be" was scanned for "he", or vice versa. */
1311 s=strstr(aline," be could ");
1313 s=strstr(aline," be would ");
1315 s=strstr(aline," was be ");
1317 s=strstr(aline," be is ");
1319 s=strstr(aline," is be ");
1321 s=strstr(aline,"\", be ");
/*
 * NOTE(review): the next two patterns read identically here; confirm
 * whether they are meant to differ in spacing.
 */
1323 s=strstr(aline,"\" be ");
1325 s=strstr(aline,"\" be ");
1327 s=strstr(aline," to he ");
1330 if (pswit[ECHO_SWITCH])
1331 printf("\n%s\n",aline);
1332 if (!pswit[OVERVIEW_SWITCH])
1333 printf(" Line %ld column %d - Query he/be error?\n",
1334 linecnt,(int)(s-aline)+1);
/* Family 2: "had" and "bad" confusions. */
1338 s=strstr(aline," the had ");
1340 s=strstr(aline," a had ");
1342 s=strstr(aline," they bad ");
1344 s=strstr(aline," she bad ");
1346 s=strstr(aline," he bad ");
1348 s=strstr(aline," you bad ");
/* NOTE(review): lower-case "i" -- presumably relies on the text being lower-cased before the search; confirm. */
1350 s=strstr(aline," i bad ");
1353 if (pswit[ECHO_SWITCH])
1354 printf("\n%s\n",aline);
1355 if (!pswit[OVERVIEW_SWITCH])
1356 printf(" Line %ld column %d - Query had/bad error?\n",
1357 linecnt,(int)(s-aline)+1);
/* Family 3: "hut" after a semicolon or comma, where "but" is likely. */
1361 s=strstr(aline,"; hut ");
1363 s=strstr(aline,", hut ");
1366 if (pswit[ECHO_SWITCH])
1367 printf("\n%s\n",aline);
1368 if (!pswit[OVERVIEW_SWITCH])
1369 printf(" Line %ld column %d - Query hut/but error?\n",
1370 linecnt,(int)(s-aline)+1);
1377 * check_for_mta_from:
1379 * Special case - angled bracket in front of "From" placed there by an
1380 * MTA when sending an e-mail.
/*
 * Looks up the first ">From" anywhere on the line and queries it,
 * reporting the 1-based column of the ">".
 */
1382 void check_for_mta_from(const char *aline)
1385 s=strstr(aline,">From");
1388 if (pswit[ECHO_SWITCH])
1389 printf("\n%s\n",aline);
1390 if (!pswit[OVERVIEW_SWITCH])
1391 printf(" Line %ld column %d - Query angled bracket with From\n",
1392 linecnt,(int)(s-aline)+1);
1399 * check_for_orphan_character:
1401 * Check for a single character line -
1402 * often an overflow from bad wrapping.
/*
 * Queries a line that consists of exactly one character, except when that
 * character is accepted as a numeral standing alone.
 */
1404 void check_for_orphan_character(const char *aline)
/* True only for a one-character line: first byte set, second byte is the NUL. */
1406 if (*aline && !aline[1])
/* Upper-case Roman numeral letters (the test continues past the ones listed on this line). */
1408 if (*aline=='I' || *aline=='V' || *aline=='X' || *aline=='L' ||
1410 ; /* Nothing - ignore numerals alone on a line. */
1413 if (pswit[ECHO_SWITCH])
1414 printf("\n%s\n",aline);
1415 if (!pswit[OVERVIEW_SWITCH])
1416 printf(" Line %ld column 1 - Query single character line\n",
1425 * check_for_pling_scanno:
1427 * Check for I" - often should be !
/*
 * Looks up the first occurrence of a space, a capital I and a double
 * quote in sequence, and queries it as a possible misread exclamation
 * mark.
 */
1429 void check_for_pling_scanno(const char *aline)
1432 s=strstr(aline," I\"");
1435 if (pswit[ECHO_SWITCH])
1436 printf("\n%s\n",aline);
1437 if (!pswit[OVERVIEW_SWITCH])
/*
 * NOTE(review): the column uses %ld here while the sibling checks use %d
 * with an (int) cast; confirm the argument passed is a long.
 */
1438 printf(" Line %ld column %ld - Query I=exclamation mark?\n",
1446 * check_for_extra_period:
1448 * Check for period without a capital letter. Cut-down from gutspell.
1449 * Only works when it happens on a single line.
/*
 * Paranoid-mode check (pswit[PARANOID_SWITCH]) for a period that is
 * followed by a lower-case word. For each ". " on the line the word before
 * the period is collected into testword and then compared with the abbrev
 * list, tested for a leading digit, tested with isroman() and scanned for
 * vowels before an "Extra period?" query is printed.
 * Words already queried are remembered in the static qperiod table so
 * that, unless pswit[VERBOSE_SWITCH] is set, the same word is not reported
 * twice; the table keeps its contents across calls.
 */
1451 void check_for_extra_period(const char *aline,const struct warnings *warnings)
1453 const char *s,*t,*s1;
1454 signed int i,istypo,isdup;
1455 static char qperiod[MAX_QWORD][MAX_QWORD_LENGTH];
1456 static int qperiod_index=0;
1457 char testword[MAXWORDLEN]="";
1458 if (pswit[PARANOID_SWITCH])
1460 for (t=s=aline;strstr(t,". ");)
1466 /* start of line punctuation is handled elsewhere */
1469 if (!gcisalpha(t[-1]))
1474 if (warnings->isDutch)
1476 /* For Frank & Jeroen -- 's Middags case */
1477 if (t[2]==CHAR_SQUOTE && t[3]>='a' && t[3]<='z' &&
1478 t[4]==CHAR_SPACE && t[5]>='A' && t[5]<='Z')
/*
 * NOTE(review): isdigit() receives a plain char here; a byte above 127 is
 * negative where char is signed, which <ctype.h> does not permit. The
 * later loops use gcisdigit() instead.
 */
1485 while (*s1 && !gcisalpha(*s1) && !isdigit(*s1))
1487 if (*s1>='a' && *s1<='z')
1489 /* we have something to investigate */
1491 /* so let's go back and find out */
/* Walk back over letters, digits and apostrophes that sit between two letters. */
1492 for (s1=t-1;s1>=s &&
1493 (gcisalpha(*s1) || gcisdigit(*s1) || *s1==CHAR_SQUOTE &&
1494 gcisalpha(s1[1]) && gcisalpha(s1[-1]));s1--)
/*
 * NOTE(review): this loop stops only at '.' or NUL; confirm that the copy
 * into testword cannot exceed MAXWORDLEN for a very long word.
 */
1497 for (i=0;*s1 && *s1!='.';s1++,i++)
/* abbrev is terminated by an empty string, hence the *abbrev[i] test. */
1500 for (i=0;*abbrev[i];i++)
1501 if (!strcmp(testword,abbrev[i]))
1503 if (gcisdigit(*testword))
1507 if (isroman(testword))
1512 for (i=0;testword[i];i++)
1513 if (strchr(vowels,testword[i]))
1519 if (strlen(testword)<MAX_QWORD_LENGTH &&
1520 !pswit[VERBOSE_SWITCH])
1521 for (i=0;i<qperiod_index;i++)
1522 if (!strcmp(testword,qperiod[i]))
/* Both bounds are checked before the strcpy(): free slot and word fits in a row. */
1526 if (qperiod_index<MAX_QWORD &&
1527 strlen(testword)<MAX_QWORD_LENGTH)
1529 strcpy(qperiod[qperiod_index],testword);
1532 if (pswit[ECHO_SWITCH])
1533 printf("\n%s\n",aline);
1534 if (!pswit[OVERVIEW_SWITCH])
1535 printf(" Line %ld column %d - Extra period?\n",
1536 linecnt,(int)(t-aline)+1);
1548 * check_for_following_punctuation:
1550 * Check for words usually not followed by punctuation.
/*
 * Under pswit[TYPO_SWITCH], pulls words from the line with getaword() and
 * queries a word from the nocomma list that is followed by , ; or : and a
 * word from the noperiod list that is followed by . or !. The column
 * reported is that of the punctuation character (1-based).
 */
1552 void check_for_following_punctuation(const char *aline)
1555 const char *s,*wordstart;
1556 char inword[MAXWORDLEN];
1557 if (pswit[TYPO_SWITCH])
1562 s=getaword(s,inword);
/* nocomma is terminated by an empty string, hence the *nocomma[i] test. */
1566 for (i=0;*nocomma[i];i++)
1567 if (!strcmp(inword,nocomma[i]))
/* After getaword() s is used as the position of the character following the word. */
1569 if (*s==',' || *s==';' || *s==':')
1571 if (pswit[ECHO_SWITCH])
1572 printf("\n%s\n",aline);
1573 if (!pswit[OVERVIEW_SWITCH])
1574 printf(" Line %ld column %d - "
1575 "Query punctuation after %s?\n",
1576 linecnt,(int)(s-aline)+1,inword);
/* noperiod is scanned the same way, for a following . or ! */
1581 for (i=0;*noperiod[i];i++)
1582 if (!strcmp(inword,noperiod[i]))
1584 if (*s=='.' || *s=='!')
1586 if (pswit[ECHO_SWITCH])
1587 printf("\n%s\n",aline);
1588 if (!pswit[OVERVIEW_SWITCH])
1589 printf(" Line %ld column %d - "
1590 "Query punctuation after %s?\n",
1591 linecnt,(int)(s-aline)+1,inword);
1603 * Check for commonly mistyped words,
1604 * and digits like 0 for O in a word.
/*
 * Word-by-word typo check. Each word is fetched with getaword() and:
 *   - queried if mixdigit() reports a digit inside it;
 *   - under pswit[TYPO_SWITCH], lower-cased into testword and run through
 *     letter-pattern heuristics (nostart/noend pairs, fixed substrings
 *     such as "cb" or "gbt", a vowel/consonant test), then the okword
 *     list, isroman() and the typo list, before a "Query word" report;
 *   - compared with the user's typo list (usertypo);
 *   - under pswit[PARANOID_SWITCH] with warnings->digit, queried when the
 *     word is exactly "0" or "1".
 * Words already reported are looked up in qword so that duplicates are
 * suppressed unless pswit[VERBOSE_SWITCH] is set; qword_index is static
 * and so persists across calls.
 */
1606 void check_for_typos(const char *aline,struct warnings *warnings)
1608 const char *s,*wordstart;
1609 char inword[MAXWORDLEN],testword[MAXWORDLEN];
1610 int i,istypo,isdup,alower,vowel,consonant;
1611 static int qword_index=0;
1615 s=getaword(s,inword);
1617 continue; /* don't bother with empty lines */
1618 if (mixdigit(inword))
1620 if (pswit[ECHO_SWITCH])
1621 printf("\n%s\n",aline);
1622 if (!pswit[OVERVIEW_SWITCH])
1623 printf(" Line %ld column %d - Query digit in %s\n",
1624 linecnt,(int)(wordstart-aline)+1,inword);
1629 * Put the word through a series of tests for likely typos and OCR
1632 if (pswit[TYPO_SWITCH])
/* inword and testword are both MAXWORDLEN bytes, so the copy fits. */
1635 strcpy(testword,inword);
/* strlen() is re-evaluated on every pass of this loop; the length does not change inside it. */
1637 for (i=0;i<(signed int)strlen(testword);i++)
1639 /* lowercase for testing */
1640 if (testword[i]>='a' && testword[i]<='z')
1642 if (alower && testword[i]>='A' && testword[i]<='Z')
1645 * We have an uppercase mid-word. However, there are
1647 * Mac and Mc like McGill
1648 * French contractions like l'Abbe
/*
 * The earlier characters of testword are already lower-case at this
 * point, which is why the prefixes are compared as "mc" and "mac".
 */
1650 if (i==2 && testword[0]=='m' && testword[1]=='c' ||
1651 i==3 && testword[0]=='m' && testword[1]=='a' &&
1652 testword[2]=='c' || i>0 && testword[i-1]==CHAR_SQUOTE)
1657 testword[i]=(char)tolower(testword[i]);
1660 * Check for certain unlikely two-letter combinations at word
1663 if (strlen(testword)>1)
1665 for (i=0;*nostart[i];i++)
1666 if (!strncmp(testword,nostart[i],2))
1668 for (i=0;*noend[i];i++)
/* The strlen(testword)>1 guard above keeps strlen(testword)-2 from going below zero. */
1669 if (!strncmp(testword+strlen(testword)-2,noend[i],2))
1672 /* ght is common, gbt never. Like that. */
1673 if (strstr(testword,"cb"))
1675 if (strstr(testword,"gbt"))
1677 if (strstr(testword,"pbt"))
1679 if (strstr(testword,"tbs"))
1681 if (strstr(testword,"mrn"))
1683 if (strstr(testword,"ahle"))
1685 if (strstr(testword,"ihle"))
1688 * "TBE" does happen - like HEARTBEAT - but uncommon.
1689 * Also "TBI" - frostbite, outbid - but uncommon.
1690 * Similarly "ii" like Hawaii, or Pompeii, and in Roman
1691 * numerals, but "ii" is a common scanno.
1693 if (strstr(testword,"tbi"))
1695 if (strstr(testword,"tbe"))
1697 if (strstr(testword,"ii"))
1700 * Check for no vowels or no consonants.
1701 * If none, flag a typo.
1703 if (!istypo && strlen(testword)>1)
1706 for (i=0;testword[i];i++)
1708 if (testword[i]=='y' || gcisdigit(testword[i]))
1710 /* Yah, this is loose. */
1714 else if (strchr(vowels,testword[i]))
1719 if (!vowel || !consonant)
1723 * Now exclude the word from being reported if it's in
1726 for (i=0;*okword[i];i++)
1727 if (!strcmp(testword,okword[i]))
1730 * What looks like a typo may be a Roman numeral.
1733 if (istypo && isroman(testword))
1735 /* Check the manual list of typos. */
1737 for (i=0;*typo[i];i++)
1738 if (!strcmp(testword,typo[i]))
1741 * Check lowercase s, l, i and m - special cases.
1742 * "j" - often a semi-colon gone wrong.
1743 * "d" for a missing apostrophe - he d
/* The letter is taken from inword, i.e. before lower-casing, so only a lower-case original matches. */
1746 if (!istypo && strlen(testword)==1 && strchr("slmijdn",*inword))
1751 if (strlen(testword)<MAX_QWORD_LENGTH &&
1752 !pswit[VERBOSE_SWITCH])
1753 for (i=0;i<qword_index;i++)
1754 if (!strcmp(testword,qword[i]))
/* Both bounds are checked before the strcpy(): free slot and word fits in a row. */
1761 if (qword_index<MAX_QWORD &&
1762 strlen(testword)<MAX_QWORD_LENGTH)
1764 strcpy(qword[qword_index],testword);
1767 if (pswit[ECHO_SWITCH])
1768 printf("\n%s\n",aline);
1769 if (!pswit[OVERVIEW_SWITCH])
1771 printf(" Line %ld column %d - Query word %s",
1772 linecnt,(int)(wordstart-aline)+1,inword);
1773 if (strlen(testword)<MAX_QWORD_LENGTH &&
1774 !pswit[VERBOSE_SWITCH])
1775 printf(" - not reporting duplicates");
1783 /* check the user's list of typos */
1784 if (!istypo && usertypo_count)
1785 for (i=0;i<usertypo_count;i++)
1786 if (!strcmp(testword,usertypo[i]))
1788 if (pswit[ECHO_SWITCH])
1789 printf("\n%s\n",aline);
1790 if (!pswit[OVERVIEW_SWITCH])
/*
 * NOTE(review): the column is computed with +2 here and in the standalone
 * digit query below, whereas the two queries above use +1 for the same
 * wordstart pointer. Confirm which offset is intended.
 */
1791 printf(" Line %ld column %d - "
1792 "Query possible scanno %s\n",
1793 linecnt,(int)(wordstart-aline)+2,inword);
1795 if (pswit[PARANOID_SWITCH] && warnings->digit)
1797 /* In paranoid mode, query all 0 and 1 standing alone. */
1798 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
1800 if (pswit[ECHO_SWITCH])
1801 printf("\n%s\n",aline);
1802 if (!pswit[OVERVIEW_SWITCH])
1803 printf(" Line %ld column %d - Query standalone %s\n",
1804 linecnt,(int)(wordstart-aline)+2,inword);
1817 * check_for_misspaced_punctuation:
1819 * Look for added or missing spaces around punctuation and quotes.
1820 * If there is a punctuation character like ! with no space on
1821 * either side, suspect a missing!space. If there are spaces on
1822 * both sides , assume a typo. If we see a double quote with no
1823 * space or punctuation on either side of it, assume unspaced
1824 * quotes "like"this.
/*
 * Runs several independent passes over aline, each starting at index 1 or
 * at the first character:
 *   1. punctuation with letters hard against it ("Missing space?") or with
 *      a space before and a space or end of line after ("Spaced
 *      punctuation?"), with ellipses exempted;
 *   2. ? ! , ; : preceded by a space and not followed by one;
 *   3. a period preceded by a space and followed by a letter;
 *   4. a double quote with no space or punctuation on either side
 *      ("Unspaced quotes?");
 *   5. quote parity: parities->dquote (and, under pswit[SQUOTE_SWITCH],
 *      parities->squote) is toggled at each quote and the character after
 *      the quote is checked against what may follow a closing or an
 *      opening quote ("Wrongspaced quotes?").
 * parities is updated in place, so its state carries over to the next
 * call until the caller resets it.
 */
1826 void check_for_misspaced_punctuation(const char *aline,
1827 struct parities *parities,int isemptyline)
1829 int i,llen,isacro,isellipsis;
/* Loops start at 1 so that aline[i-1] is always inside the line; aline[i+1] is at worst the terminating NUL. */
1832 for (i=1;i<llen;i++)
1834 /* For each character in the line after the first. */
1835 if (strchr(".?!,;:_",aline[i])) /* if it's punctuation */
1837 /* we need to suppress warnings for acronyms like M.D. */
1839 /* we need to suppress warnings for ellipsis . . . */
1841 /* if there are letters on both sides of it or ... */
1842 if (gcisalpha(aline[i-1]) && gcisalpha(aline[i+1]) ||
1843 gcisalpha(aline[i+1]) && strchr("?!,;:",aline[i]))
1845 /* ...if it's strict punctuation followed by an alpha */
1848 if (i>2 && aline[i-2]=='.')
1850 if (i+2<llen && aline[i+2]=='.')
1855 if (pswit[ECHO_SWITCH])
1856 printf("\n%s\n",aline);
1857 if (!pswit[OVERVIEW_SWITCH])
1858 printf(" Line %ld column %d - Missing space?\n",
1864 if (aline[i-1]==CHAR_SPACE &&
1865 (aline[i+1]==CHAR_SPACE || aline[i+1]==0))
1868 * If there are spaces on both sides,
1869 * or space before and end of line.
1873 if (i>2 && aline[i-2]=='.')
1875 if (i+2<llen && aline[i+2]=='.')
1878 if (!isemptyline && !isellipsis)
1880 if (pswit[ECHO_SWITCH])
1881 printf("\n%s\n",aline);
1882 if (!pswit[OVERVIEW_SWITCH])
1883 printf(" Line %ld column %d - "
1884 "Spaced punctuation?\n",linecnt,i+1);
1891 /* Split out the characters that CANNOT be preceded by space. */
1893 for (i=1;i<llen;i++)
1895 /* for each character in the line after the first */
1896 if (strchr("?!,;:",aline[i]))
1898 /* if it's punctuation that _cannot_ have a space before it */
1899 if (aline[i-1]==CHAR_SPACE && !isemptyline &&
1900 aline[i+1]!=CHAR_SPACE)
1903 * If aline[i+1) DOES == space,
1904 * it was already reported just above.
1906 if (pswit[ECHO_SWITCH])
1907 printf("\n%s\n",aline);
1908 if (!pswit[OVERVIEW_SWITCH])
1909 printf(" Line %ld column %d - Spaced punctuation?\n",
1917 * Special case " .X" where X is any alpha.
1918 * This plugs a hole in the acronym code above.
1919 * Inelegant, but maintainable.
1922 for (i=1;i<llen;i++)
1924 /* for each character in the line after the first */
1927 /* if it's a period */
1928 if (aline[i-1]==CHAR_SPACE && gcisalpha(aline[i+1]))
1931 * If the period follows a space and
1932 * is followed by a letter.
1934 if (pswit[ECHO_SWITCH])
1935 printf("\n%s\n",aline);
1936 if (!pswit[OVERVIEW_SWITCH])
1937 printf(" Line %ld column %d - Spaced punctuation?\n",
1944 for (i=1;i<llen;i++)
1946 /* for each character in the line after the first */
1947 if (aline[i]==CHAR_DQUOTE)
/*
 * strchr() also matches the terminating NUL, so the explicit
 * "&& aline[i+1]" keeps a quote at the end of the line out of the first
 * alternative. && binds tighter than ||.
 */
1949 if (!strchr(" _-.'`,;:!/([{?}])",aline[i-1]) &&
1950 !strchr(" _-.'`,;:!/([{?}])",aline[i+1]) && aline[i+1] ||
1951 !strchr(" _-([{'`",aline[i-1]) && gcisalpha(aline[i+1]))
1953 if (pswit[ECHO_SWITCH])
1954 printf("\n%s\n",aline);
1955 if (!pswit[OVERVIEW_SWITCH])
1956 printf(" Line %ld column %d - Unspaced quotes?\n",
1963 /* Check parity of quotes. */
1964 for (s=aline;*s;s++)
1966 if (*s==CHAR_DQUOTE)
/* Toggle first: a value of 0 afterwards means this quote closed a pair. */
1968 parities->dquote=!parities->dquote;
1969 if (!parities->dquote)
/* s[1] may be the NUL, which strchr() matches, so a closing quote at end of line is accepted. */
1972 if (!strchr("_-.'`/,;:!?)]} ",s[1]))
1974 if (pswit[ECHO_SWITCH])
1975 printf("\n%s\n",aline);
1976 if (!pswit[OVERVIEW_SWITCH])
1977 printf(" Line %ld column %d - "
1978 "Wrongspaced quotes?\n",linecnt,(int)(s-aline)+1);
/*
 * NOTE(review): isdigit() receives a plain char here; a byte above 127 is
 * negative where char is signed, which <ctype.h> does not permit.
 */
1986 if (!gcisalpha(s[1]) && !isdigit(s[1]) &&
1987 !strchr("_-/.'`([{$",s[1]) || !s[1])
1989 if (pswit[ECHO_SWITCH])
1990 printf("\n%s\n",aline);
1991 if (!pswit[OVERVIEW_SWITCH])
1992 printf(" Line %ld column %d - "
1993 "Wrongspaced quotes?\n",linecnt,(int)(s-aline)+1);
/* A quote in column 1 that is immediately followed by closing punctuation or a space. */
2000 if (*aline==CHAR_DQUOTE)
2002 if (strchr(",;:!?)]} ",aline[1]))
2004 if (pswit[ECHO_SWITCH])
2005 printf("\n%s\n",aline);
2006 if (!pswit[OVERVIEW_SWITCH])
2007 printf(" Line %ld column 1 - Wrongspaced quotes?\n",
/* The same parity logic for single quotes, only when requested. */
2013 if (pswit[SQUOTE_SWITCH])
2015 for (s=aline;*s;s++)
/* The s==aline and s>aline tests keep s[-1] from being read before the start of the line. */
2017 if ((*s==CHAR_SQUOTE || *s==CHAR_OPEN_SQUOTE) &&
2018 (s==aline || s>aline && !gcisalpha(s[-1]) ||
2021 parities->squote=!parities->squote;
2022 if (!parities->squote)
2025 if (!strchr("_-.'`/\",;:!?)]} ",s[1]))
2027 if (pswit[ECHO_SWITCH])
2028 printf("\n%s\n",aline);
2029 if (!pswit[OVERVIEW_SWITCH])
2030 printf(" Line %ld column %d - "
2031 "Wrongspaced singlequotes?\n",
2032 linecnt,(int)(s-aline)+1);
2040 if (!gcisalpha(s[1]) && !isdigit(s[1]) &&
2041 !strchr("_-/\".'`",s[1]) || !s[1])
2043 if (pswit[ECHO_SWITCH])
2044 printf("\n%s\n",aline);
2045 if (!pswit[OVERVIEW_SWITCH])
2046 printf(" Line %ld column %d - "
2047 "Wrongspaced singlequotes?\n",
2048 linecnt,(int)(s-aline)+1);
2059 * check_for_double_punctuation:
2061 * Look for double punctuation like ,. or ,,
2062 * Thanks to DW for the suggestion!
2063 * In books with references, ".," and ".;" are common
2064 * e.g. "etc., etc.," and vol. 1.; vol 3.;
2065 * OTOH, from my initial tests, there are also fairly
2066 * common errors. What to do? Make these cases paranoid?
2067 * ".," is the most common, so warnings->dotcomma is used
2068 * to suppress detailed reporting if it occurs often.
/*
 * Queries two adjacent characters from the set . ? ! , ; : with a
 * "Double punctuation?" report, except for the combinations excused
 * below: a doubled character such as ".." or "??", ".," while
 * warnings->dotcomma is clear, and, for texts flagged warnings->isFrench,
 * an ellipsis directly before or after one of , ; : ! ?.
 */
2070 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
2074 for (i=0;i<llen;i++)
2076 /* for each punctuation character in the line */
/*
 * The trailing "aline[i] && aline[i+1]" tests matter because strchr()
 * also matches the terminating NUL; without them the last character of
 * the line would pair with the terminator.
 */
2077 if (strchr(".?!,;:",aline[i]) && strchr(".?!,;:",aline[i+1]) &&
2078 aline[i] && aline[i+1])
2080 /* followed by punctuation, it's a query, unless . . . */
2081 if (aline[i]==aline[i+1] && (aline[i]=='.' || aline[i]=='?' ||
2083 !warnings->dotcomma && aline[i]=='.' && aline[i+1]==',' ||
/* All French patterns are compared starting at aline+i, the first of the two characters. */
2084 warnings->isFrench && !strncmp(aline+i,",...",4) ||
2085 warnings->isFrench && !strncmp(aline+i,"...,",4) ||
2086 warnings->isFrench && !strncmp(aline+i,";...",4) ||
2087 warnings->isFrench && !strncmp(aline+i,"...;",4) ||
2088 warnings->isFrench && !strncmp(aline+i,":...",4) ||
2089 warnings->isFrench && !strncmp(aline+i,"...:",4) ||
2090 warnings->isFrench && !strncmp(aline+i,"!...",4) ||
2091 warnings->isFrench && !strncmp(aline+i,"...!",4) ||
2092 warnings->isFrench && !strncmp(aline+i,"?...",4) ||
2093 warnings->isFrench && !strncmp(aline+i,"...?",4))
/*
 * The same ten French patterns are tested a second time here, inside the
 * branch taken for an excused combination.
 */
2095 if (warnings->isFrench && !strncmp(aline+i,",...",4) ||
2096 warnings->isFrench && !strncmp(aline+i,"...,",4) ||
2097 warnings->isFrench && !strncmp(aline+i,";...",4) ||
2098 warnings->isFrench && !strncmp(aline+i,"...;",4) ||
2099 warnings->isFrench && !strncmp(aline+i,":...",4) ||
2100 warnings->isFrench && !strncmp(aline+i,"...:",4) ||
2101 warnings->isFrench && !strncmp(aline+i,"!...",4) ||
2102 warnings->isFrench && !strncmp(aline+i,"...!",4) ||
2103 warnings->isFrench && !strncmp(aline+i,"?...",4) ||
2104 warnings->isFrench && !strncmp(aline+i,"...?",4))
2106 ; /* do nothing for .. !! and ?? which can be legit */
2110 if (pswit[ECHO_SWITCH])
2111 printf("\n%s\n",aline);
2112 if (!pswit[OVERVIEW_SWITCH])
2113 printf(" Line %ld column %d - Double punctuation?\n",
2123 * check_for_spaced_quotes:
/*
 * Queries a quote character that has a space on both sides. Three
 * strstr() loops run in turn, one each for the double quote, the
 * apostrophe and the backquote; the last two share the "Spaced
 * singlequote?" message. The reported column is that of the leading space
 * (1-based).
 */
2125 void check_for_spaced_quotes(const char *aline)
2129 while ((t=strstr(s," \" ")))
2131 if (pswit[ECHO_SWITCH])
2132 printf("\n%s\n",aline);
2133 if (!pswit[OVERVIEW_SWITCH])
2134 printf(" Line %ld column %d - Spaced doublequote?\n",
2135 linecnt,(int)(t-aline+1));
2141 while ((t=strstr(s," ' ")))
2143 if (pswit[ECHO_SWITCH])
2144 printf("\n%s\n",aline);
2145 if (!pswit[OVERVIEW_SWITCH])
2146 printf(" Line %ld column %d - Spaced singlequote?\n",
2147 linecnt,(int)(t-aline+1));
2153 while ((t=strstr(s," ` ")))
2155 if (pswit[ECHO_SWITCH])
2156 printf("\n%s\n",aline);
2157 if (!pswit[OVERVIEW_SWITCH])
2158 printf(" Line %ld column %d - Spaced singlequote?\n",
2159 linecnt,(int)(t-aline+1));
2167 * check_for_miscased_genative:
2169 * Check special case of 'S instead of 's at end of word.
/*
 * Queries an apostrophe that follows a lower-case ASCII letter and is
 * itself followed by a capital S. The column reported (s-aline+2) is the
 * 1-based position of the S, not of the apostrophe.
 */
2171 void check_for_miscased_genative(const char *aline)
/*
 * NOTE(review): s[-1] is read whenever *s is an apostrophe followed by
 * 'S'; confirm the scan starts after the first character so that this
 * never looks before the start of the line.
 */
2177 if (*s==CHAR_SQUOTE && s[1]=='S' && s[-1]>='a' && s[-1]<='z')
2179 if (pswit[ECHO_SWITCH])
2180 printf("\n%s\n",aline);
2181 if (!pswit[OVERVIEW_SWITCH])
2182 printf(" Line %ld column %d - Capital \"S\"?\n",
2183 linecnt,(int)(s-aline+2));
2192 * check_end_of_line:
2194 * Now check special cases - start and end of line -
2195 * for single and double quotes. Start is sometimes [sic]
2196 * but better to query it anyway.
2197 * While we're here, check for dash at end of line.
/*
 * Edge-of-line checks:
 *   - a quote character as the last character with a space before it;
 *   - a single quote or open single quote as the first character with a
 *     space after it;
 *   - under pswit[PARANOID_SWITCH] with warnings->hyphen, a single hyphen
 *     (not the tail of "--") as the last non-blank character.
 */
2199 void check_end_of_line(const char *aline,struct warnings *warnings)
/*
 * NOTE(review): aline[llen-1] and aline[llen-2] require llen to be at
 * least 1 and 2 respectively; confirm the guard on llen.
 */
2205 if (aline[llen-1]==CHAR_DQUOTE || aline[llen-1]==CHAR_SQUOTE ||
2206 aline[llen-1]==CHAR_OPEN_SQUOTE)
2207 if (aline[llen-2]==CHAR_SPACE)
2209 if (pswit[ECHO_SWITCH])
2210 printf("\n%s\n",aline);
2211 if (!pswit[OVERVIEW_SWITCH])
2212 printf(" Line %ld column %d - Spaced quote?\n",
2217 if ((aline[0]==CHAR_SQUOTE || aline[0]==CHAR_OPEN_SQUOTE) &&
2218 aline[1]==CHAR_SPACE)
2220 if (pswit[ECHO_SWITCH])
2221 printf("\n%s\n",aline);
2222 if (!pswit[OVERVIEW_SWITCH])
2223 printf(" Line %ld column 1 - Spaced quote?\n",linecnt);
2228 * Dash at end of line may well be legit - paranoid mode only
2229 * and don't report em-dash at line-end.
2231 if (pswit[PARANOID_SWITCH] && warnings->hyphen)
/* Step back over trailing bytes at or below CHAR_SPACE; stopping at i>0 keeps aline[i-1] valid below. */
2233 for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--)
2235 if (aline[i]=='-' && aline[i-1]!='-')
2237 if (pswit[ECHO_SWITCH])
2238 printf("\n%s\n",aline);
2239 if (!pswit[OVERVIEW_SWITCH])
2240 printf(" Line %ld column %d - Hyphen at end of line?\n",
2248 * check_for_unspaced_bracket:
2250 * Brackets are often unspaced, but shouldn't be surrounded by alpha.
2251 * If so, suspect a scanno like "a]most".
/*
 * Queries any bracket character ( ) [ ] { } that has a letter (per
 * gcisalpha()) immediately on both sides.
 */
2253 void check_for_unspaced_bracket(const char *aline)
/* The first and last characters are skipped, so aline[i-1] and aline[i+1] are both real characters of the line. */
2257 for (i=1;i<llen-1;i++)
2259 /* for each bracket character in the line except 1st & last */
2260 if (strchr("{[()]}",aline[i]) && gcisalpha(aline[i-1]) &&
2261 gcisalpha(aline[i+1]))
2263 if (pswit[ECHO_SWITCH])
2264 printf("\n%s\n",aline);
2265 if (!pswit[OVERVIEW_SWITCH])
2266 printf(" Line %ld column %d - Unspaced bracket?\n",
2275 * check_for_unpunctuated_endquote:
/*
 * Queries a double quote that directly follows a letter, i.e. a quote
 * with no punctuation before it.
 */
2277 void check_for_unpunctuated_endquote(const char *aline)
2281 for (i=1;i<llen;i++)
2283 /* for each character in the line except 1st */
/*
 * NOTE(review): this uses isalpha() on a plain char while the sibling
 * checks use gcisalpha(); a byte above 127 is negative where char is
 * signed, which <ctype.h> does not permit. Confirm whether gcisalpha()
 * was intended.
 */
2284 if (aline[i]==CHAR_DQUOTE && isalpha(aline[i-1]))
2286 if (pswit[ECHO_SWITCH])
2287 printf("\n%s\n",aline);
2288 if (!pswit[OVERVIEW_SWITCH])
/* The column printed is i, which in 1-based terms is the letter before the quote rather than the quote itself. */
2289 printf(" Line %ld column %d - "
2290 "endquote missing punctuation?\n",linecnt,i);
2298 * check_for_html_tag:
2300 * Check for <HTML TAG>.
2302 * If there is a < in the line, followed at some point
2303 * by a > then we suspect HTML.
/*
 * Queries a suspected HTML tag: the first "<" and the first ">" on the
 * line are located, the text between them (inclusive) is copied into wrk
 * and printed with the 1-based column of the "<".
 */
2305 void check_for_html_tag(const char *aline)
2308 const char *open,*close;
2309 open=strstr(aline,"<");
/*
 * NOTE(review): ">" is searched from the start of the line, not from
 * open, so the first ">" may lie before the "<". In that case
 * close-open+1 is zero or negative; confirm that i is checked before it
 * is used as the strncpy() length.
 */
2312 close=strstr(aline,">");
2315 i=(signed int)(close-open+1);
/* strncpy() does not write a terminating NUL when it copies exactly i bytes; confirm wrk is terminated before it is printed. */
2318 strncpy(wrk,open,i);
2320 if (pswit[ECHO_SWITCH])
2321 printf("\n%s\n",aline);
2322 if (!pswit[OVERVIEW_SWITCH])
2323 printf(" Line %ld column %d - HTML Tag? %s \n",
2324 linecnt,(int)(open-aline)+1,wrk);
2333 * check_for_html_entity:
2335 * Check for &symbol; HTML.
2337 * If there is a & in the line, followed at
2338 * some point by a ; then we suspect HTML.
/*
 * Queries a suspected HTML entity: the first "&" and the first ";" on the
 * line are located and the span between them is reported via wrk with
 * the 1-based column of the "&". A span is discarded (i set to 0) by the
 * scan between the two, which is what keeps text such as "Jones & Son;"
 * from being reported.
 */
2340 void check_for_html_entity(const char *aline)
2343 const char *s,*amp,*scolon;
2344 amp=strstr(aline,"&");
/*
 * NOTE(review): ";" is searched from the start of the line, not from amp,
 * so it may lie before the "&"; confirm that a non-positive length is
 * rejected.
 */
2347 scolon=strstr(aline,";");
2350 i=(int)(scolon-amp+1);
2351 for (s=amp;s<scolon;s++)
2353 i=0; /* Don't report "Jones & Son;" */
2358 if (pswit[ECHO_SWITCH])
2359 printf("\n%s\n",aline);
2360 if (!pswit[OVERVIEW_SWITCH])
2361 printf(" Line %ld column %d - HTML symbol? %s \n",
2362 linecnt,(int)(amp-aline)+1,wrk);
/*
 * One 80-byte message buffer per kind of unbalanced delimiter. An empty
 * string (first byte 0) means nothing is pending for that kind; the
 * buffers are filled by check_for_mismatched_quotes() and printed by
 * print_pending().
 */
2371 char dquote[80],squote[80],rbrack[80],sbrack[80],cbrack[80],unders[80];
2378 * If we are in a state of unbalanced quotes, and this line
2379 * doesn't begin with a quote, output the stored error message.
2380 * If the -P switch was used, print the warning even if the
2381 * new para starts with quotes.
/*
 * Emits the warnings stored in pending. A buffer is considered set when
 * its first byte is non-zero. Each message is printed with puts() unless
 * pswit[OVERVIEW_SWITCH] is set, preceded by parastart when
 * pswit[ECHO_SWITCH] is set.
 * The two quote messages are additionally conditional on the first
 * character examined (*s) or on pswit[QPARA_SWITCH]; the bracket and
 * underscore messages have no such condition here.
 */
2383 void print_pending(const char *aline,const char *parastart,
2384 struct pending *pending)
2390 if (*pending->dquote)
/* Suppressed when *s is a double quote, unless QPARA_SWITCH forces the report. */
2391 if (*s!=CHAR_DQUOTE || pswit[QPARA_SWITCH])
2393 if (!pswit[OVERVIEW_SWITCH])
2395 if (pswit[ECHO_SWITCH])
2396 printf("\n%s\n",parastart);
2397 puts(pending->dquote);
2402 if (*pending->squote)
/* && binds tighter than ||: (neither kind of single quote) || QPARA_SWITCH || ... */
2404 if (*s!=CHAR_SQUOTE && *s!=CHAR_OPEN_SQUOTE || pswit[QPARA_SWITCH] ||
2407 if (!pswit[OVERVIEW_SWITCH])
2409 if (pswit[ECHO_SWITCH])
2410 printf("\n%s\n",parastart);
2411 puts(pending->squote);
2417 if (*pending->rbrack)
2419 if (!pswit[OVERVIEW_SWITCH])
2421 if (pswit[ECHO_SWITCH])
2422 printf("\n%s\n",parastart);
2423 puts(pending->rbrack);
2428 if (*pending->sbrack)
2430 if (!pswit[OVERVIEW_SWITCH])
2432 if (pswit[ECHO_SWITCH])
2433 printf("\n%s\n",parastart);
2434 puts(pending->sbrack);
2439 if (*pending->cbrack)
2441 if (!pswit[OVERVIEW_SWITCH])
2443 if (pswit[ECHO_SWITCH])
2444 printf("\n%s\n",parastart);
2445 puts(pending->cbrack);
2450 if (*pending->unders)
2452 if (!pswit[OVERVIEW_SWITCH])
2454 if (pswit[ECHO_SWITCH])
2455 printf("\n%s\n",parastart);
2456 puts(pending->unders);
2464 * check_for_mismatched_quotes:
2466 * At end of paragraph, check for mismatched quotes.
2468 * We don't want to report an error immediately, since it is a
2469 * common convention to omit the quotes at end of paragraph if
2470 * the next paragraph is a continuation of the same speaker.
2471 * Where this is the case, the next para should begin with a
2472 * quote, so we store the warning message and only display it
2473 * at the top of the next iteration if the new para doesn't
2474 * start with a quote.
2475 * The -p switch overrides this default, and warns of unclosed
2476 * quotes on _every_ paragraph, whether the next begins with a
/*
 * Turns the paragraph's counters into stored warning messages in pending:
 *   - an odd counters->quot gives "Mismatched quotes";
 *   - under pswit[SQUOTE_SWITCH], a non-zero open_single_quote that
 *     differs from close_single_quote gives "Mismatched singlequotes?";
 *   - non-zero r_brack, s_brack or c_brack gives the matching bracket
 *     message;
 *   - an odd counters->c_unders gives "Mismatched underscores?".
 * Nothing is printed here; the messages are only written into the
 * buffers.
 * NOTE(review): the messages are written with sprintf() into 80-byte
 * buffers. The fixed text plus one %ld fits, but snprintf() with the
 * buffer size would make the bound explicit.
 */
2479 void check_for_mismatched_quotes(const struct counters *counters,
2480 struct pending *pending)
2482 if (counters->quot%2)
2483 sprintf(pending->dquote," Line %ld - Mismatched quotes",
2485 if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&
2486 counters->open_single_quote!=counters->close_single_quote)
2487 sprintf(pending->squote," Line %ld - Mismatched singlequotes?",
/* Stricter case: the counts differ and the opens are not exactly one more than the closes. */
2489 if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&
2490 counters->open_single_quote!=counters->close_single_quote &&
2491 counters->open_single_quote!=counters->close_single_quote+1)
2493 * Flag it to be noted regardless of the
2494 * first char of the next para.
2497 if (counters->r_brack)
2498 sprintf(pending->rbrack," Line %ld - Mismatched round brackets?",
2500 if (counters->s_brack)
2501 sprintf(pending->sbrack," Line %ld - Mismatched square brackets?",
2503 if (counters->c_brack)
2504 sprintf(pending->cbrack," Line %ld - Mismatched curly brackets?",
2506 if (counters->c_unders%2)
2507 sprintf(pending->unders," Line %ld - Mismatched underscores?",
2512 * check_for_omitted_punctuation:
2514 * Check for omitted punctuation at end of paragraph by working back
2515 * through prevline. DW.
2516 * Need to check this only for "normal" paras.
2517 * So what is a "normal" para?
2518 * Not normal if one-liner (chapter headings, etc.)
2519 * Not normal if it doesn't contain at least one lowercase letter
2520 * Not normal if starts with space
/*
 * Queries a paragraph whose last line (prevline) ends in a letter, after
 * stepping back over any trailing quote characters. The check applies
 * only when prevline contains a letter, last->blen is greater than 2, the
 * paragraph started before linecnt-1 and prevline begins with a character
 * above CHAR_SPACE. The report names line linecnt-1.
 * The prevline parameter has the same name as the file-scope prevline
 * buffer and hides it inside this function.
 */
2522 void check_for_omitted_punctuation(const char *prevline,
2523 struct line_properties *last,int start_para_line)
2527 for (s=prevline,i=0;*s && !i;s++)
2529 /* use i to indicate the presence of a letter on the line */
2532 * This next "if" is a problem.
2533 * If we say "start_para_line <= linecnt - 1", that includes
2534 * one-line "paragraphs" like chapter heads. Lotsa false positives.
2535 * If we say "start_para_line < linecnt - 1" it doesn't, but then it
2536 * misses genuine one-line paragraphs.
/* "*prevline>CHAR_SPACE" also guarantees prevline is non-empty, so strlen(prevline)-1 below is a valid index. */
2538 if (i && last->blen>2 && start_para_line<linecnt-1 && *prevline>CHAR_SPACE)
/*
 * Step back over trailing quote characters. The "prevline[i]>CHAR_SPACE"
 * test adds nothing if both quote constants are above CHAR_SPACE -- TODO
 * confirm their values.
 */
2540 for (i=strlen(prevline)-1;
2541 (prevline[i]==CHAR_DQUOTE || prevline[i]==CHAR_SQUOTE) &&
2542 prevline[i]>CHAR_SPACE && i>0;
2547 if (gcisalpha(prevline[i]))
2549 if (pswit[ECHO_SWITCH])
2550 printf("\n%s\n",prevline);
2551 if (!pswit[OVERVIEW_SWITCH])
/*
 * NOTE(review): strlen() yields size_t but is consumed by %d here; a cast
 * to int or a %zu conversion would make the types agree.
 */
2552 printf(" Line %ld column %d - "
2553 "No punctuation at para end?\n",
2554 linecnt-1,strlen(prevline));
2559 if (strchr("-.:!([{?}])",prevline[i]))
/*
 * Run every check over one input file and print the findings.
 *
 * filename - path of the text to examine; opened with fopen(...,"rb").
 *
 * Outline of the visible code:
 *  - If the file cannot be opened, "cannot open" is reported; one
 *    fprintf targets stdout (under STDOUT_SWITCH) and one stderr.
 *  - A "File:" banner is printed, then first_pass() scans the file and
 *    report_first_pass() turns its results into the warnings settings
 *    that gate some of the per-line checks below.
 *  - The main loop reads lines into aline with flgets().  Under
 *    DP_SWITCH, lines starting "-----File: " are skipped.  Lines before
 *    firstline or after a positive footerline are skipped too; under
 *    HEADER_SWITCH the Title/Author/Release Date/Edition lines found
 *    there are echoed.
 *  - For each remaining line: pending reports are printed and cleared,
 *    quotes are counted by analyse_quotes(), and, when the line starts a
 *    new paragraph, its number and first 80 bytes are remembered, the
 *    quote parities are reset and a lower-case first letter is queried.
 *    A line starting with '-' after a line flagged by enddash is queried
 *    as a broken em-dash.  The check_for_*() battery then runs on aline.
 *  - The block calling check_for_mismatched_quotes() and
 *    check_for_omitted_punctuation() handles the end of a paragraph;
 *    its guarding condition is not visible here.
 *  - aline is copied to prevline before the next iteration.
 *  - After the loop, unless OVERVIEW_SWITCH is set, duplicated queried
 *    words are listed.
 *
 * NOTE(review): strncpy(parastart,aline,80) does not terminate
 * parastart when aline holds 80 or more bytes; confirm that
 * parastart[80] is set to 0 next to the call.
 * NOTE(review): the "time%s" note always receives "s", so a count of 1
 * would print "1 times" - confirm whether that can occur.
 * NOTE(review): start_para_line is a long here but
 * check_for_omitted_punctuation() takes an int.
 */
2570 void procfile(char *filename)
2573 char parastart[81]; /* first line of current para */
2575 struct first_pass_results *first_pass_results;
2576 struct warnings *warnings;
2577 struct counters counters={0};
2578 struct line_properties last={0};
2579 struct parities parities={0};
2580 struct pending pending={{0},};
2582 long start_para_line;
2583 signed int i,llen,isacro,isellipsis;
2584 signed int isnewpara;
2586 last.start=CHAR_SPACE;
2588 linecnt=checked_linecnt=start_para_line=0;
2589 i=llen=isacro=isellipsis=0;
2590 isnewpara=enddash=0;
2591 infile=fopen(filename,"rb");
2594 if (pswit[STDOUT_SWITCH])
2595 fprintf(stdout,"bookloupe: cannot open %s\n",filename);
2597 fprintf(stderr,"bookloupe: cannot open %s\n",filename);
2600 fprintf(stdout,"\n\nFile: %s\n\n",filename);
2601 first_pass_results=first_pass(infile);
2602 warnings=report_first_pass(first_pass_results);
2604 * Here we go with the main pass. Hold onto yer hat!
2608 while (flgets(aline,LINEBUFSIZE-1,infile,linecnt+1))
2613 if (pswit[DP_SWITCH] && !strncmp(aline,"-----File: ",11))
2614 continue; // skip DP page separators completely
2615 if (linecnt<first_pass_results->firstline ||
2616 (first_pass_results->footerline>0 &&
2617 linecnt>first_pass_results->footerline))
2619 if (pswit[HEADER_SWITCH])
2621 if (!strncmp(aline,"Title:",6))
2622 printf(" %s\n",aline);
2623 if (!strncmp(aline,"Author:",7))
2624 printf(" %s\n",aline);
2625 if (!strncmp(aline,"Release Date:",13))
2626 printf(" %s\n",aline);
2627 if (!strncmp(aline,"Edition:",8))
2628 printf(" %s\n\n",aline);
2630 continue; /* skip through the header */
2633 print_pending(aline,parastart,&pending);
2634 memset(&pending,0,sizeof(pending));
2635 isemptyline=analyse_quotes(aline,&counters);
2636 if (isnewpara && !isemptyline)
2638 /* This line is the start of a new paragraph. */
2639 start_para_line=linecnt;
2640 /* Capture its first line in case we want to report it later. */
2641 strncpy(parastart,aline,80);
2643 memset(&parities,0,sizeof(parities)); /* restart the quote count */
2645 while (!gcisalpha(*s) && !gcisdigit(*s) && *s)
2647 if (*s>='a' && *s<='z')
2649 /* and its first letter is lowercase */
2650 if (pswit[ECHO_SWITCH])
2651 printf("\n%s\n",aline);
2652 if (!pswit[OVERVIEW_SWITCH])
2653 printf(" Line %ld column %d - "
2654 "Paragraph starts with lower-case\n",
2655 linecnt,(int)(s-aline)+1);
2659 isnewpara=0; /* Signal the end of new para processing. */
2661 /* Check for an em-dash broken at line end. */
2662 if (enddash && *aline=='-')
2664 if (pswit[ECHO_SWITCH])
2665 printf("\n%s\n",aline);
2666 if (!pswit[OVERVIEW_SWITCH])
2667 printf(" Line %ld column 1 - Broken em-dash?\n",linecnt);
2672 for (s=aline+strlen(aline)-1;*s==' ' && s>aline;s--)
2674 if (s>=aline && *s=='-')
2676 check_for_control_characters(aline);
2678 check_for_odd_characters(aline,warnings,isemptyline);
2679 if (warnings->longline)
2680 check_for_long_line(aline);
2681 if (warnings->shortline)
2682 check_for_short_line(aline,&last);
2684 last.len=strlen(aline);
2685 last.start=aline[0];
2686 check_for_starting_punctuation(aline);
2689 check_for_spaced_emdash(aline);
2690 check_for_spaced_dash(aline);
2692 check_for_unmarked_paragraphs(aline);
2693 check_for_jeebies(aline);
2694 check_for_mta_from(aline);
2695 check_for_orphan_character(aline);
2696 check_for_pling_scanno(aline);
2697 check_for_extra_period(aline,warnings);
2698 check_for_following_punctuation(aline);
2699 check_for_typos(aline,warnings);
2700 check_for_misspaced_punctuation(aline,&parities,isemptyline);
2701 check_for_double_punctuation(aline,warnings);
2702 check_for_spaced_quotes(aline);
2703 check_for_miscased_genative(aline);
2704 check_end_of_line(aline,warnings);
2705 check_for_unspaced_bracket(aline);
2706 if (warnings->endquote)
2707 check_for_unpunctuated_endquote(aline);
2708 check_for_html_tag(aline);
2709 check_for_html_entity(aline);
2712 check_for_mismatched_quotes(&counters,&pending);
2713 memset(&counters,0,sizeof(counters));
2714 /* let the next iteration know that it's starting a new para */
2716 check_for_omitted_punctuation(prevline,&last,start_para_line);
2718 strcpy(prevline,aline);
2721 if (!pswit[OVERVIEW_SWITCH])
2722 for (i=0;i<MAX_QWORD;i++)
2724 printf("\nNote: Queried word %s was duplicated %d time%s\n",
2725 qword[i],dupcnt[i],"s");
2731 * Get one line from the input stream, checking for
2732 * the existence of exactly one CR/LF line-end per line.
2734 * Returns: a pointer to the line.
/*
 * Read one line from thefile into theline, one character at a time
 * with fgetc(), looping while len is below maxlen.
 *
 * theline - destination buffer
 * maxlen  - upper bound on the number of characters stored
 * thefile - input stream
 * lcnt    - line number, used in the diagnostics printed here
 *
 * When LINE_END_SWITCH is set, three line-end anomalies are reported:
 * a LF without a preceding CR, two successive CRs, and a CR that is not
 * followed by a LF.  Each report echoes the line under ECHO_SWITCH and
 * prints the message unless OVERVIEW_SWITCH is set.
 *
 * Once the line is complete it is passed to postprocess_for_HTML() if
 * MARKUP_SWITCH is set and to postprocess_for_DP() if DP_SWITCH is set.
 *
 * Returns a char *; procfile() uses the result as its loop condition,
 * so a null pointer presumably signals end of input - the return
 * statements are not visible here, confirm.
 */
2736 char *flgets(char *theline,int maxlen,FILE *thefile,long lcnt)
2742 c=cint=fgetc(thefile);
2747 /* either way, it's end of line */
2754 /* Error - a LF without a preceding CR */
2755 if (pswit[LINE_END_SWITCH])
2757 if (pswit[ECHO_SWITCH])
2758 printf("\n%s\n",theline);
2759 if (!pswit[OVERVIEW_SWITCH])
2760 printf(" Line %ld - No CR?\n",lcnt);
2771 /* Error - two successive CRs */
2772 if (pswit[LINE_END_SWITCH])
2774 if (pswit[ECHO_SWITCH])
2775 printf("\n%s\n",theline);
2776 if (!pswit[OVERVIEW_SWITCH])
2777 printf(" Line %ld - Two successive CRs?\n",lcnt);
2786 if (pswit[LINE_END_SWITCH] && isCR)
2788 if (pswit[ECHO_SWITCH])
2789 printf("\n%s\n",theline);
2790 if (!pswit[OVERVIEW_SWITCH])
2791 printf(" Line %ld column %d - CR without LF?\n",
2801 c=cint=fgetc(thefile);
2802 } while(len<maxlen);
2803 if (pswit[MARKUP_SWITCH])
2804 postprocess_for_HTML(theline);
2805 if (pswit[DP_SWITCH])
2806 postprocess_for_DP(theline);
2813 * Takes a "word" as a parameter, and checks whether it
2814 * contains a mixture of alpha and digits. Generally, this is an
2815 * error, but may not be for cases like 4th or L5 12s. 3d.
2817 * Returns: 0 if no error found, 1 if error.
/*
 * mixdigit:
 *
 * Decide whether a "word" that mixes letters and digits should be
 * queried.
 *
 * checkword - NUL-terminated word to examine (not modified)
 *
 * Accepted without query:
 *   - leading digits followed by st, rd, nd or th (21st), by the plural
 *     sts, rds, nds or ths (20ths), or by stly, rdly, ndly or thly
 *     (1stly), in either case;
 *   - leading digits followed by a single l, L, s or d (12l. 3s. 11d.);
 *   - a capital L followed only by digits (L500).
 *
 * Returns: 0 if no error found, 1 if error.
 */
int mixdigit(char *checkword)
{
    int wehaveadigit,wehavealetter,firstdigits,query,wl;
    char *s;

    wehaveadigit=wehavealetter=query=0;
    for (s=checkword;*s;s++)
    {
        if (gcisalpha(*s))
            wehavealetter=1;
        else if (gcisdigit(*s))
            wehaveadigit=1;
    }
    if (wehaveadigit && wehavealetter)
    {
        /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
        query=1;
        wl=(int)strlen(checkword);
        for (firstdigits=0;gcisdigit(checkword[firstdigits]);firstdigits++)
            ;
        /* digits, ending in st, rd, nd, th of either case */
        if (firstdigits+2==wl && (matchword(checkword+wl-2,"st") ||
          matchword(checkword+wl-2,"rd") || matchword(checkword+wl-2,"nd") ||
          matchword(checkword+wl-2,"th")))
            query=0;
        if (firstdigits+3==wl && (matchword(checkword+wl-3,"sts") ||
          matchword(checkword+wl-3,"rds") || matchword(checkword+wl-3,"nds") ||
          matchword(checkword+wl-3,"ths")))
            query=0;
        /*
         * The suffix here is four letters long, so the word must be the
         * leading digits plus four.  Testing against firstdigits+3 could
         * never match, and pointed one byte before the word when it was
         * three letters long with no leading digit.
         */
        if (firstdigits+4==wl && (matchword(checkword+wl-4,"stly") ||
          matchword(checkword+wl-4,"rdly") ||
          matchword(checkword+wl-4,"ndly") || matchword(checkword+wl-4,"thly")))
            query=0;
        /* digits, ending in l, L, s or d */
        if (firstdigits+1==wl && (checkword[wl-1]=='l' ||
          checkword[wl-1]=='L' || checkword[wl-1]=='s' || checkword[wl-1]=='d'))
            query=0;
        /*
         * L at the start of a number, representing British pounds, like L500.
         * This is cute. We know the current word is mixeddigit. If the first
         * letter is L, there must be at least one digit following. If both
         * digits and letters follow, we have a genuine error, else we have a
         * capital L followed by digits, and we accept that as a non-error.
         */
        if (checkword[0]=='L' && !mixdigit(checkword+1))
            query=0;
    }
    return query;
}
2870 * Extracts the first/next "word" from the line, and puts
2871 * it into "thisword". A word is defined as one English word unit--or
2872 * at least that's the aim.
2874 * Returns: a pointer to the position in the line where we will start
2875 * looking for the next word.
2877 const char *getaword(const char *fromline,char *thisword)
2882 for (;!gcisdigit(*fromline) && !gcisalpha(*fromline) && *fromline;
2886 * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
2887 * Especially yucky is the case of L1,000
2888 * This section looks for a pattern of characters including a digit
2889 * followed by a comma or period followed by one or more digits.
2890 * If found, it returns this whole pattern as a word; otherwise we discard
2891 * the results and resume our normal programming.
2894 for (;(gcisdigit(*s) || gcisalpha(*s) || *s==',' || *s=='.') &&
2895 wordlen<MAXWORDLEN;s++)
2897 thisword[wordlen]=*s;
2900 thisword[wordlen]=0;
2901 for (i=1;i<wordlen-1;i++)
2903 if (thisword[i]=='.' || thisword[i]==',')
2905 if (gcisdigit(thisword[i-1]) && gcisdigit(thisword[i-1]))
2912 /* we didn't find a punctuated number - do the regular getword thing */
2914 for (;(gcisdigit(*fromline) || gcisalpha(*fromline) || *fromline=='\'') &&
2915 wordlen<MAXWORDLEN;fromline++)
2917 thisword[wordlen]=*fromline;
2920 thisword[wordlen]=0;
2927 * A case-insensitive string matcher.
/*
 * matchword:
 *
 * A case-insensitive string matcher.
 *
 * checkfor - string to look for
 * thisword - string to compare against it
 *
 * Returns: 1 if both strings have the same length and are equal when
 * case is ignored (per toupper() in the current locale), 0 otherwise.
 */
int matchword(char *checkfor,char *thisword)
{
    size_t i,len;

    len=strlen(checkfor);
    if (len!=strlen(thisword))
        return 0;
    for (i=0;i<len;i++)
    {
        /*
         * toupper() is only defined for EOF and unsigned char values;
         * 8-bit text yields negative plain chars, so convert first.
         */
        if (toupper((unsigned char)checkfor[i])!=
          toupper((unsigned char)thisword[i]))
            return 0;
    }
    return 1;
}
2944 * Lowercase the line.
/*
 * lowerit:
 *
 * Convert the ASCII capitals A-Z in theline to lower case, in place.
 * Every other byte, including 8-bit accented letters, is left alone.
 */
void lowerit(char *theline)
{
    char *p=theline;

    while (*p)
    {
        if (*p>='A' && *p<='Z')
            *p=(char)(*p-'A'+'a');
        p++;
    }
}
2957 * Is this word a Roman Numeral?
2959 * It doesn't actually validate that the number is a valid Roman Numeral--for
2960 * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
2961 * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
2962 * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
2963 * expressions thereof, except when it came to taxes. Allow any number of M,
2964 * an optional D, an optional CM or CD, any number of optional Cs, an optional
2965 * XL or an optional XC, an optional IX or IV, an optional V and any number
/*
 * Loose test for whether t looks like a Roman numeral.
 *
 * t - word to test.  Only lower-case letters are compared here, so the
 *     caller is presumably expected to lower-case the word first
 *     (TODO confirm against callers).
 *
 * The pointer walks over the string in a fixed order.  The visible
 * steps are: any run of 'm'; "cm"; "cd"; any run of 'c'; "xl"; "xc";
 * any run of 'x'; "ix"; "iv"; any run of 'i'.  The single-letter steps
 * between them and the return statement are not visible here.
 *
 * The "&& *t" in each while condition is redundant: *t=='m' (etc.)
 * already implies that *t is non-zero.
 */
2968 int isroman(char *t)
2974 while (*t=='m' && *t)
2978 if (*t=='c' && t[1]=='m')
2980 if (*t=='c' && t[1]=='d')
2982 while (*t=='c' && *t)
2984 if (*t=='x' && t[1]=='l')
2986 if (*t=='x' && t[1]=='c')
2990 while (*t=='x' && *t)
2992 if (*t=='i' && t[1]=='x')
2994 if (*t=='i' && t[1]=='v')
2998 while (*t=='i' && *t)
3006 * A version of isalpha() that is somewhat lenient on 8-bit texts.
3007 * If we use the standard function, 8-bit accented characters break
3008 * words, so that tete with accented characters appears to be two words, "t"
3009 * and "t", with 8-bit characters between them. This causes over-reporting of
3010 * errors. gcisalpha() recognizes accented letters from the CP1252 (Windows)
3011 * and ISO-8859-1 character sets, which are the most common PG 8-bit types.
/*
 * gcisalpha:
 *
 * isalpha() replacement that also accepts accented 8-bit letters.
 *
 * c - byte value to classify
 *
 * Returns 1 for ASCII letters, for the five codes 140, 142, 156, 158
 * and 159, and for every code from 192 upwards other than 208, 215,
 * 222, 240, 247 and 254; 0 for everything else.
 */
int gcisalpha(unsigned char c)
{
    switch (c)
    {
    case 140: case 142: case 156: case 158: case 159:
        /* letters in the 128-159 range */
        return 1;
    case 208: case 215: case 222: case 240: case 247: case 254:
        /* codes at or above 192 that are deliberately not accepted */
        return 0;
    default:
        break;
    }
    if ((c>='a' && c<='z') || (c>='A' && c<='Z'))
        return 1;
    return c>=192;
}
3031 * A version of isdigit() that doesn't get confused in 8-bit texts.
/*
 * gcisdigit:
 *
 * isdigit() replacement that looks only at the byte value.
 *
 * Returns 1 if c is one of '0'..'9', 0 otherwise.
 */
int gcisdigit(unsigned char c)
{
    if (c<'0')
        return 0;
    return c<='9';
}
3041 * A version of isletter() that doesn't get confused in 8-bit texts.
3042 * NB: this is ISO-8859-1-specific.
/*
 * gcisletter:
 *
 * Coarse letter test for 8-bit text.
 *
 * Returns 1 for the ASCII letters and for every code from 192 upwards,
 * 0 otherwise.  Unlike gcisalpha(), no code at or above 192 is
 * excluded and none below 192 is added.
 */
int gcisletter(unsigned char c)
{
    if (c>=192)
        return 1;
    if (c>='A' && c<='Z')
        return 1;
    return c>='a' && c<='z';
}
3052 * Wraps strchr to return NULL if the character being searched for is zero.
/*
 * s - string to search; c - character to look for (a plain char).
 * Only the signature is visible here; the body is documented by the
 * description above.
 */
3054 char *gcstrchr(char *s,char c)
3062 * postprocess_for_DP:
3064 * Invoked with the -d switch from flgets().
3065 * It simply "removes" from the line a hard-coded set of common
3066 * DP-specific tags, so that the line passed to the main routine has
3067 * been pre-cleaned of DP markup.
/*
 * Strip DP markup from theline, in place.
 *
 * theline - the line to clean
 *
 * Walks the DPmarkup[] table until its empty-string terminator.  Each
 * entry is located in the line with strstr(), t is set just past the
 * match, and strstr() is called again afterwards, so every occurrence
 * of a tag is presumably handled, not just the first (the loop
 * structure itself is not visible here).
 */
3069 void postprocess_for_DP(char *theline)
3075 for (i=0;*DPmarkup[i];i++)
3077 s=strstr(theline,DPmarkup[i]);
3080 t=s+strlen(DPmarkup[i]);
3088 s=strstr(theline,DPmarkup[i]);
3094 * postprocess_for_HTML:
3096 * Invoked with the -m switch from flgets().
3097 * It simply "removes" from the line a hard-coded set of common
3098 * HTML tags and "replaces" a hard-coded set of common HTML
3099 * entities, so that the line passed to the main routine has
3100 * been pre-cleaned of HTML.
/*
 * Strip HTML from theline, in place.
 *
 * theline - the line to clean
 *
 * If the line contains both "<" and ">", losemarkup() is called
 * repeatedly until it returns a null pointer.  loseentities() is then
 * called repeatedly in the same way.
 */
3102 void postprocess_for_HTML(char *theline)
3104 if (strstr(theline,"<") && strstr(theline,">"))
3105 while (losemarkup(theline))
3107 while (loseentities(theline))
/*
 * Handle one piece of markup in theline.
 *
 * theline - the line being cleaned
 *
 * s is the first "<" and t the first ">" in the line.  The text after
 * "<" is compared with each entry of markup[] (up to its empty-string
 * terminator) using tagcomp(), where a result of 0 means the tag is
 * recognised.  An unrecognised <xxx> is handled separately.
 *
 * Returns a char *; postprocess_for_HTML() keeps calling while the
 * result is non-null.
 *
 * NOTE(review): t is the first ">" anywhere in the line and may lie
 * before s; confirm the hidden lines cope with that ordering.
 */
3111 char *losemarkup(char *theline)
3117 s=strstr(theline,"<");
3118 t=strstr(theline,">");
3121 for (i=0;*markup[i];i++)
3122 if (!tagcomp(s+1,markup[i]))
3135 /* It's an unrecognized <xxx>. */
/*
 * Replace one HTML entity in theline with its text equivalent.
 *
 * theline - the line being cleaned
 *
 * The entities[] table is scanned twice, first by htmlent and then by
 * htmlnum, each pass stopping at an entry whose string is empty.  When
 * strstr() finds an entry at s, the tail of the line after the entity
 * is saved in a malloc()ed buffer t and entities[i].textent is copied
 * over the entity at s.
 *
 * strlen(s) bytes are enough for t: the tail needs
 * strlen(s)-strlen(entity)+1 bytes and the entity is at least one
 * character long, since an empty string ends the loop.
 *
 * Returns a char *; postprocess_for_HTML() keeps calling while the
 * result is non-null.
 *
 * NOTE(review): the malloc() result check, the re-appending of t and
 * free(t) are not visible here - confirm they exist.  The in-place
 * replacement also assumes textent is no longer than the entity.
 */
3139 char *loseentities(char *theline)
3145 for (i=0;*entities[i].htmlent;i++)
3147 s=strstr(theline,entities[i].htmlent);
3150 t=malloc((size_t)strlen(s));
3153 strcpy(t,s+strlen(entities[i].htmlent));
3154 strcpy(s,entities[i].textent);
3160 for (i=0;*entities[i].htmlnum;i++)
3162 s=strstr(theline,entities[i].htmlnum);
3165 t=malloc((size_t)strlen(s));
3168 strcpy(t,s+strlen(entities[i].htmlnum));
3169 strcpy(s,entities[i].textent);
/*
 * Case-insensitive comparison of the tag text at strin with basetag.
 *
 * strin   - text following a "<" in the line
 * basetag - tag name from the markup[] table
 *
 * A slash is skipped before comparing, and characters are compared
 * with tolower() on both sides.  losemarkup() treats a return value of
 * 0 as a match.
 *
 * NOTE(review): tolower() is applied to plain char values; with 8-bit
 * input these can be negative, which is undefined - cast to unsigned
 * char first.
 */
3178 int tagcomp(char *strin,char *basetag)
3184 t++; /* ignore a slash */
3187 if (tolower(*s)!=tolower(*t))
/*
 * Version banner, licence notice, option summary and a short
 * description of the program, all written to stderr.
 * The warranty line previously read "comes wih"; corrected to "with".
 * NOTE(review): the usage line lists [-setpxloyhud], yet -v and -m are
 * described below and -p and -y are not - confirm the intended set.
 */
3197 fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
3198 fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
3199 fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
3200 fputs("Bookloupe comes with ABSOLUTELY NO WARRANTY. "
3201 "For details, read the file COPYING.\n",stderr);
3202 fputs("This is Free Software; "
3203 "you may redistribute it under certain conditions (GPL);\n",stderr);
3204 fputs("read the file COPYING for details.\n\n",stderr);
3205 fputs("Usage is: bookloupe [-setpxloyhud] filename\n",stderr);
3206 fputs(" where -s checks single quotes, -e suppresses echoing lines, "
3207 "-t checks typos\n",stderr);
3208 fputs(" -x (paranoid) switches OFF -t and extra checks, "
3209 "-l turns OFF line-end checks\n",stderr);
3210 fputs(" -o just displays overview without detail, "
3211 "-h echoes header fields\n",stderr);
3212 fputs(" -v (verbose) unsuppresses duplicate reporting, "
3213 "-m suppresses markup\n",stderr);
3214 fputs(" -d ignores DP-specific markup,\n",stderr);
3215 fputs(" -u uses a file gutcheck.typ to query user-defined "
3216 "possible typos\n",stderr);
3217 fputs("Sample usage: bookloupe warpeace.txt \n",stderr);
3219 fputs("Bookloupe looks for errors in Project Gutenberg(TM) etexts.\n",
3221 fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
3222 "non-ASCII\n",stderr);
3223 fputs("characters like accented letters, "
3224 "lines longer than 75 or shorter than 55,\n",stderr);
3225 fputs("unbalanced quotes or brackets, "
3226 "a variety of badly formatted punctuation, \n",stderr);
3227 fputs("HTML tags, some likely typos. "
3228 "It is NOT a substitute for human judgement.\n",stderr);
3228 "It is NOT a substitute for human judgement.\n",stderr);