1 /*************************************************************************/
2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com> */
5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk> */
7 /* This program is free software; you can redistribute it and/or modify */
8 /* it under the terms of the GNU General Public License as published by */
9 /* the Free Software Foundation; either version 2 of the License, or */
10 /* (at your option) any later version. */
12 /* This program is distributed in the hope that it will be useful, */
13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
15 /* GNU General Public License for more details. */
17 /* You should have received a copy of the GNU General Public License */
18 /* along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 /*************************************************************************/
/* Sizing constants and the shared line buffers used while reading input. */
26 #define MAXWORDLEN 80 /* max length of one word */
27 #define LINEBUFSIZE 2048 /* buffer size for an input line */
29 #define MAX_USER_TYPOS 1000 /* number of slots in the usertypo[] table */
30 #define USERTYPO_FILE "gutcheck.typ" /* file name of the user-defined typo list */
33 #define MAX_PATH 16384 /* size of the path buffers (running_from, usertypo_file) */
36 char aline[LINEBUFSIZE]; /* the input line currently being read and examined */
37 char prevline[LINEBUFSIZE]; /* presumably the previously read line -- only its reset is visible here; confirm */
41 "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
42 "nad", "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa",
43 "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
44 "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
45 "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
46 "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
47 "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
48 "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
49 "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
50 "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
51 "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
52 "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
53 "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
54 "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
55 "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
56 "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
57 "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
58 "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
59 "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
60 "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
61 "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
62 "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
63 "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
64 "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
65 "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
66 "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
67 "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
68 "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
69 "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
73 char *usertypo[MAX_USER_TYPOS];
75 /* Common abbreviations and other OK words not to query as typos. */
77 "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
78 "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
79 "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
80 "outbid", "outbids", "frostbite", "frostbitten", ""
83 /* Common abbreviations that cause otherwise unexplained periods. */
85 "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
86 "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
90 * Two-Letter combinations that rarely if ever start words,
91 * but are common scannos or otherwise common letter combinations.
94 "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
98 * Two-Letter combinations that rarely if ever end words,
99 * but are common scannos or otherwise common letter combinations.
102 "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
103 "sw", "gr", "sl", "cl", "iy", ""
107 "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
108 "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
109 "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
110 "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
114 "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
118 "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
119 "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
120 "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
121 "during", "let", "toward", "among", ""
125 "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
126 "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
127 "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
128 "among", "those", "into", "whom", "having", "thence", ""
131 char vowels[] = "aeiouàáâãäæèéêëìíîïòóôõöùúûü";
138 "&", "&", "&",
139 "<", "<", "<",
140 ">", ">", ">",
141 "°", "°", " degrees",
142 "£", "£", "L",
143 """, """, "\"", /* quotation mark = APL quote */
144 "Œ", "Œ", "OE", /* latin capital ligature OE */
145 "œ", "œ", "oe", /* latin small ligature oe */
146 "Š", "Š", "S", /* latin capital letter S with caron */
147 "š", "š", "s", /* latin small letter s with caron */
148 "Ÿ", "Ÿ", "Y", /* latin capital letter Y with diaeresis */
149 "ˆ", "ˆ", "", /* modifier letter circumflex accent */
150 "˜", "˜", "~", /* small tilde, U+02DC ISOdia */
151 " ", " ", " ", /* en space, U+2002 ISOpub */
152 " ", " ", " ", /* em space, U+2003 ISOpub */
153 " ", " ", " ", /* thin space, U+2009 ISOpub */
154 "–", "–", "-", /* en dash, U+2013 ISOpub */
155 "—", "—", "--", /* em dash, U+2014 ISOpub */
156 "’", "’", "'", /* right single quotation mark */
157 "‚", "‚", "'", /* single low-9 quotation mark */
158 "“", "“", "\"", /* left double quotation mark */
159 "”", "”", "\"", /* right double quotation mark */
160 "„", "„", "\"", /* double low-9 quotation mark */
161 "‹", "‹", "\"", /* single left-pointing angle quotation mark */
162 "›", "›", "\"", /* single right-pointing angle quotation mark */
163 " ", " ", " ", /* no-break space = non-breaking space, */
164 "¡", "¡", "!", /* inverted exclamation mark */
165 "¢", "¢", "c", /* cent sign */
166 "£", "£", "L", /* pound sign */
167 "¤", "¤", "$", /* currency sign */
168 "¥", "¥", "Y", /* yen sign = yuan sign */
169 "§", "§", "--", /* section sign */
170 "¨", "¨", " ", /* diaeresis = spacing diaeresis */
171 "©", "©", "(C) ", /* copyright sign */
172 "ª", "ª", " ", /* feminine ordinal indicator */
173 "«", "«", "\"", /* left-pointing double angle quotation mark */
174 "­", "­", "-", /* soft hyphen = discretionary hyphen */
175 "®", "®", "(R) ", /* registered sign = registered trade mark sign */
176 "¯", "¯", " ", /* macron = spacing macron = overline */
177 "°", "°", " degrees", /* degree sign */
178 "±", "±", "+-", /* plus-minus sign = plus-or-minus sign */
179 "²", "²", "2", /* superscript two = superscript digit two */
180 "³", "³", "3", /* superscript three = superscript digit three */
181 "´", "´", " ", /* acute accent = spacing acute */
182 "µ", "µ", "m", /* micro sign */
183 "¶", "¶", "--", /* pilcrow sign = paragraph sign */
184 "¸", "¸", " ", /* cedilla = spacing cedilla */
185 "¹", "¹", "1", /* superscript one = superscript digit one */
186 "º", "º", " ", /* masculine ordinal indicator */
187 "»", "»", "\"", /* right-pointing double angle quotation mark */
188 "¼", "¼", "1/4", /* vulgar fraction one quarter */
189 "½", "½", "1/2", /* vulgar fraction one half */
190 "¾", "¾", "3/4", /* vulgar fraction three quarters */
191 "¿", "¿", "?", /* inverted question mark */
192 "À", "À", "A", /* latin capital letter A with grave */
193 "Á", "Á", "A", /* latin capital letter A with acute */
194 "Â", "Â", "A", /* latin capital letter A with circumflex */
195 "Ã", "Ã", "A", /* latin capital letter A with tilde */
196 "Ä", "Ä", "A", /* latin capital letter A with diaeresis */
197 "Å", "Å", "A", /* latin capital letter A with ring above */
198 "Æ", "Æ", "AE", /* latin capital letter AE */
199 "Ç", "Ç", "C", /* latin capital letter C with cedilla */
200 "È", "È", "E", /* latin capital letter E with grave */
201 "É", "É", "E", /* latin capital letter E with acute */
202 "Ê", "Ê", "E", /* latin capital letter E with circumflex */
203 "Ë", "Ë", "E", /* latin capital letter E with diaeresis */
204 "Ì", "Ì", "I", /* latin capital letter I with grave */
205 "Í", "Í", "I", /* latin capital letter I with acute */
206 "Î", "Î", "I", /* latin capital letter I with circumflex */
207 "Ï", "Ï", "I", /* latin capital letter I with diaeresis */
208 "Ð", "Ð", "E", /* latin capital letter ETH */
209 "Ñ", "Ñ", "N", /* latin capital letter N with tilde */
210 "Ò", "Ò", "O", /* latin capital letter O with grave */
211 "Ó", "Ó", "O", /* latin capital letter O with acute */
212 "Ô", "Ô", "O", /* latin capital letter O with circumflex */
213 "Õ", "Õ", "O", /* latin capital letter O with tilde */
214 "Ö", "Ö", "O", /* latin capital letter O with diaeresis */
215 "×", "×", "*", /* multiplication sign */
216 "Ø", "Ø", "O", /* latin capital letter O with stroke */
217 "Ù", "Ù", "U", /* latin capital letter U with grave */
218 "Ú", "Ú", "U", /* latin capital letter U with acute */
219 "Û", "Û", "U", /* latin capital letter U with circumflex */
220 "Ü", "Ü", "U", /* latin capital letter U with diaeresis */
221 "Ý", "Ý", "Y", /* latin capital letter Y with acute */
222 "Þ", "Þ", "TH", /* latin capital letter THORN */
223 "ß", "ß", "sz", /* latin small letter sharp s = ess-zed */
224 "à", "à", "a", /* latin small letter a with grave */
225 "á", "á", "a", /* latin small letter a with acute */
226 "â", "â", "a", /* latin small letter a with circumflex */
227 "ã", "ã", "a", /* latin small letter a with tilde */
228 "ä", "ä", "a", /* latin small letter a with diaeresis */
229 "å", "å", "a", /* latin small letter a with ring above */
230 "æ", "æ", "ae", /* latin small letter ae */
231 "ç", "ç", "c", /* latin small letter c with cedilla */
232 "è", "è", "e", /* latin small letter e with grave */
233 "é", "é", "e", /* latin small letter e with acute */
234 "ê", "ê", "e", /* latin small letter e with circumflex */
235 "ë", "ë", "e", /* latin small letter e with diaeresis */
236 "ì", "ì", "i", /* latin small letter i with grave */
237 "í", "í", "i", /* latin small letter i with acute */
238 "î", "î", "i", /* latin small letter i with circumflex */
239 "ï", "ï", "i", /* latin small letter i with diaeresis */
240 "ð", "ð", "eth", /* latin small letter eth */
241 "ñ", "ñ", "n", /* latin small letter n with tilde */
242 "ò", "ò", "o", /* latin small letter o with grave */
243 "ó", "ó", "o", /* latin small letter o with acute */
244 "ô", "ô", "o", /* latin small letter o with circumflex */
245 "õ", "õ", "o", /* latin small letter o with tilde */
246 "ö", "ö", "o", /* latin small letter o with diaeresis */
247 "÷", "÷", "/", /* division sign */
248 "ø", "ø", "o", /* latin small letter o with stroke */
249 "ù", "ù", "u", /* latin small letter u with grave */
250 "ú", "ú", "u", /* latin small letter u with acute */
251 "û", "û", "u", /* latin small letter u with circumflex */
252 "ü", "ü", "u", /* latin small letter u with diaeresis */
253 "ý", "ý", "y", /* latin small letter y with acute */
254 "þ", "þ", "th", /* latin small letter thorn */
255 "ÿ", "ÿ", "y", /* latin small letter y with diaeresis */
/*
 * Character codes tested throughout the checks. Some are written as
 * ASCII numbers and some as character literals.
 * NOTE(review): CHAR_LF, CHAR_CR and CHAR_TAB are used by the control
 * character check but are not defined in this span -- confirm they exist.
 */
259 /* special characters */
260 #define CHAR_SPACE 32 /* ' ' */
264 #define CHAR_DQUOTE 34 /* '"' */
265 #define CHAR_SQUOTE 39 /* apostrophe / closing single quote */
266 #define CHAR_OPEN_SQUOTE 96 /* '`' used as an opening single quote */
267 #define CHAR_TILDE 126 /* '~' */
268 #define CHAR_ASTERISK 42 /* asterisk */
269 #define CHAR_FORESLASH 47 /* forward slash */
270 #define CHAR_CARAT 94 /* '^' (caret) */
272 #define CHAR_UNDERSCORE '_'
273 #define CHAR_OPEN_CBRACK '{' /* curly bracket */
274 #define CHAR_CLOSE_CBRACK '}'
275 #define CHAR_OPEN_RBRACK '(' /* round bracket */
276 #define CHAR_CLOSE_RBRACK ')'
277 #define CHAR_OPEN_SBRACK '[' /* square bracket */
278 #define CHAR_CLOSE_SBRACK ']'
280 /* longest and shortest normal PG line lengths */
281 #define LONGEST_PG_LINE 75 /* first-pass test: llen > LONGEST_PG_LINE */
282 #define WAY_TOO_LONG 80 /* first-pass test: llen > WAY_TOO_LONG */
283 #define SHORTEST_PG_LINE 55 /* compared with the previous line's length in the short-line test */
/*
 * Command-line switch letters. The *_SWITCH constants further down are
 * indices into pswit[], and each index equals the position of the
 * corresponding letter in SWITCHES (E=0, S=1, T=2, ...).
 */
285 #define SWITCHES "ESTPXLOYHWVMUD" /* switches:- */
286 /* D - ignore DP-specific markup */
287 /* E - echo queried line */
288 /* S - check single quotes */
289 /* T - check common typos */
290 /* P - require closure of quotes on */
291 /* every paragraph */
292 /* X - "Trust no one" :-) Paranoid! */
293 /* Queries everything */
294 /* L - line end checking defaults on */
295 /* -L turns it off */
296 /* O - overview. Just shows counts. */
297 /* Y - puts errors to stdout */
298 /* instead of stderr */
299 /* H - Echoes header fields */
300 /* M - Ignore markup in < > */
301 /* U - Use file of User-defined Typos*/
302 /* W - Defaults for use on Web upload*/
303 /* V - Verbose - list EVERYTHING! */
/* SWITNO must stay equal to the number of letters in SWITCHES (14). */
304 #define SWITNO 14 /* max number of switch parms */
305 /* - used for defining array-size */
306 #define MINARGS 1 /* minimum no of args excl switches */
307 #define MAXARGS 1 /* maximum no of args excl switches */
309 int pswit[SWITNO]; /* program switches set by SWITCHES */
/*
 * Indices into pswit[], in the same order as the letters of SWITCHES.
 * NOTE(review): index 9 ('W') and index 13 ('D') have no constant in this
 * span, although WEB_SWITCH and DP_SWITCH are used by the code below --
 * confirm that both are defined.
 */
311 #define ECHO_SWITCH 0
312 #define SQUOTE_SWITCH 1
313 #define TYPO_SWITCH 2
314 #define QPARA_SWITCH 3
315 #define PARANOID_SWITCH 4
316 #define LINE_END_SWITCH 5
317 #define OVERVIEW_SWITCH 6
318 #define STDOUT_SWITCH 7
319 #define HEADER_SWITCH 8
321 #define VERBOSE_SWITCH 10
322 #define MARKUP_SWITCH 11
323 #define USERTYPO_SWITCH 12
326 long cnt_dquot; /* for overview mode, count of doublequote queries */
327 long cnt_squot; /* for overview mode, count of singlequote queries */
328 long cnt_brack; /* for overview mode, count of brackets queries */
329 long cnt_bin; /* for overview mode, count of non-ASCII queries */
330 long cnt_odd; /* for overview mode, count of odd character queries */
331 long cnt_long; /* for overview mode, count of long line errors */
332 long cnt_short; /* for overview mode, count of short line queries */
333 long cnt_punct; /* for overview mode, count of punctuation and spacing queries */
334 long cnt_dash; /* for overview mode, count of dash-related queries */
335 long cnt_word; /* for overview mode, count of word queries */
336 long cnt_html; /* for overview mode, count of html queries */
337 long cnt_lineend; /* for overview mode, count of line-end queries */
338 long cnt_spacend; /* count of lines with space at end */
339 long linecnt; /* count of total lines in the file */
340 long checked_linecnt; /* count of lines actually checked */
343 void procfile(char *);
345 #define LOW_THRESHOLD 0
346 #define HIGH_THRESHOLD 1
352 #define FIRST_OF_PAIR 0
353 #define SECOND_OF_PAIR 1
355 #define MAX_WORDPAIR 1000
357 char running_from[MAX_PATH];
359 int mixdigit(char *);
360 char *getaword(char *,char *);
361 int matchword(char *,char *);
362 char *flgets(char *,int,FILE *,long);
363 void lowerit(char *);
364 int gcisalpha(unsigned char);
365 int gcisdigit(unsigned char);
366 int gcisletter(unsigned char);
367 char *gcstrchr(char *s,char c);
368 void postprocess_for_HTML(char *);
369 char *linehasmarkup(char *);
370 char *losemarkup(char *);
371 int tagcomp(char *,char *);
372 char *loseentities(char *);
375 void postprocess_for_DP(char *);
377 char wrk[LINEBUFSIZE];
380 #define MAX_QWORD_LENGTH 40
381 char qword[MAX_QWORD][MAX_QWORD_LENGTH];
382 char qperiod[MAX_QWORD][MAX_QWORD_LENGTH];
383 signed int dupcnt[MAX_QWORD];
/*
 * main() - program entry point.
 *
 * NOTE(review): only part of this function is present in this span
 * (braces, local declarations and several statements are absent), so the
 * notes below describe the visible statements only.
 *
 * Visible steps, in order:
 *  - remember argv[0] in running_from and scan it backwards for a path
 *    separator;
 *  - walk the leading "-x" arguments, comparing each upper-cased letter
 *    with the letters of SWITCHES;
 *  - toggle the paranoid, line-end and echo switches, which are on unless
 *    their switch letter is given;
 *  - when the user-typo switch is set, read lines from USERTYPO_FILE into
 *    usertypo[];
 *  - in overview mode, print the per-category query counts and their total.
 */
385 int main(int argc,char **argv)
389 char usertypo_file[MAX_PATH];
/* Only keep argv[0] when it fits in running_from, terminator included. */
391 if (strlen(argv[0])<sizeof(running_from))
392 /* save the path to the executable */
393 strcpy(running_from,argv[0]);
394 /* find out what directory we're running from */
395 s=running_from+strlen(running_from);
/*
 * NOTE(review): *s is read before s>=running_from is tested, so when
 * running_from contains no '/' or '\\' the last iteration reads the byte
 * in front of the array. Testing the bound first would avoid that.
 */
396 for (;*s!='/' && *s!='\\' && s>=running_from;s--)
398 switno=strlen(SWITCHES);
/* Runs i from switno-1 down to 1; pswit[0] is not written by this loop. */
399 for (i=switno;--i>0;)
400 pswit[i]=0; /* initialise switches */
402 * Standard loop to extract switches.
403 * When we come out of this loop, the arguments will be
404 * in argv[0] upwards and the switches used will be
405 * represented by their equivalent elements in pswit[]
407 while (--argc>0 && **++argv=='-')
408 for (argsw=argv[0]+1;*argsw!='\0';argsw++)
409 for (i=switno,invarg=1;(--i>=0) && invarg==1;)
410 if ((toupper(*argsw))==SWITCHES[i])
415 /* Paranoid checking is turned OFF, not on, by its switch */
416 pswit[PARANOID_SWITCH]^=1;
417 if (pswit[PARANOID_SWITCH])
418 /* in paranoid mode flip the typo switch. NOTE(review): this is an XOR, so typo checks end up off if the typo switch was already set -- confirm that is intended */
419 pswit[TYPO_SWITCH]=pswit[TYPO_SWITCH]^1;
420 /* Line-end checking is turned OFF, not on, by its switch */
421 pswit[LINE_END_SWITCH]^=1;
422 /* Echoing is turned OFF, not on, by its switch */
423 pswit[ECHO_SWITCH]^=1;
424 if (pswit[OVERVIEW_SWITCH])
425 /* just print summary; don't echo */
426 pswit[ECHO_SWITCH]=0;
428 * Web uploads - for the moment, this is really just a placeholder
429 * until we decide what processing we really want to do on web uploads
431 if (pswit[WEB_SWITCH])
433 /* specific override for web uploads: every other switch gets a fixed value */
434 pswit[ECHO_SWITCH]=1;
435 pswit[SQUOTE_SWITCH]=0;
436 pswit[TYPO_SWITCH]=1;
437 pswit[QPARA_SWITCH]=0;
438 pswit[PARANOID_SWITCH]=1;
439 pswit[LINE_END_SWITCH]=0;
440 pswit[OVERVIEW_SWITCH]=0;
441 pswit[STDOUT_SWITCH]=0;
442 pswit[HEADER_SWITCH]=1;
443 pswit[VERBOSE_SWITCH]=0;
444 pswit[MARKUP_SWITCH]=0;
445 pswit[USERTYPO_SWITCH]=0;
/* After the switch loop argc counts the remaining non-switch arguments. */
448 if (argc<MINARGS || argc>MAXARGS)
450 /* check number of args */
454 /* read in the user-defined stealth scanno list */
455 if (pswit[USERTYPO_SWITCH])
457 /* ... we were told we had one! */
458 usertypofile=fopen(USERTYPO_FILE,"rb");
461 /* not in cwd. try executable directory. */
/*
 * NOTE(review): strcpy/strcat are unbounded. running_from and
 * usertypo_file are both MAX_PATH bytes, so appending USERTYPO_FILE can
 * overflow unless running_from was shortened in lines not shown here.
 */
462 strcpy(usertypo_file,running_from);
463 strcat(usertypo_file,USERTYPO_FILE);
464 usertypofile=fopen(usertypo_file,"rb");
466 /* we ain't got no user typo file! */
467 printf(" --> I couldn't find gutcheck.typ "
468 "-- proceeding without user typos.\n");
474 /* we managed to open a User Typo File! */
475 if (pswit[USERTYPO_SWITCH])
/* Read the typo file one line at a time into aline. */
477 while (flgets(aline,LINEBUFSIZE-1,usertypofile,
478 (long)usertypo_count))
/* Room for a copy of the line plus its terminator. */
484 s=malloc(strlen(aline)+1);
487 fprintf(stderr,"bookloupe: cannot get enough "
488 "memory for user typo file!\n");
492 usertypo[usertypo_count]=s;
494 if (usertypo_count>=MAX_USER_TYPOS)
/*
 * NOTE(review): the format contains %d but no argument is passed, which
 * is undefined behaviour; MAX_USER_TYPOS is presumably the intended value.
 */
496 printf(" --> Only %d user-defined typos "
497 "allowed: ignoring the rest\n");
504 fclose(usertypofile);
507 fprintf(stderr,"bookloupe: Check and report on an e-text\n");
508 cnt_dquot=cnt_squot=cnt_brack=cnt_bin=cnt_odd=cnt_long=
509 cnt_short=cnt_punct=cnt_dash=cnt_word=cnt_html=cnt_lineend=
/* Overview mode: print the line totals, each query counter and the grand total. */
512 if (pswit[OVERVIEW_SWITCH])
514 printf(" Checked %ld lines of %ld (head+foot = %ld)\n\n",
515 checked_linecnt,linecnt,linecnt-checked_linecnt);
516 printf(" --------------- Queries found --------------\n");
518 printf(" Long lines: %14ld\n",cnt_long);
520 printf(" Short lines: %14ld\n",cnt_short);
522 printf(" Line-end problems: %14ld\n",cnt_lineend);
524 printf(" Common typos: %14ld\n",cnt_word);
526 printf(" Unmatched quotes: %14ld\n",cnt_dquot);
528 printf(" Unmatched SingleQuotes: %14ld\n",cnt_squot);
530 printf(" Unmatched brackets: %14ld\n",cnt_brack);
532 printf(" Non-ASCII characters: %14ld\n",cnt_bin);
534 printf(" Proofing characters: %14ld\n",cnt_odd);
536 printf(" Punctuation & spacing queries: %14ld\n",cnt_punct);
538 printf(" Non-standard dashes: %14ld\n",cnt_dash);
540 printf(" Possible HTML tags: %14ld\n",cnt_html);
542 printf(" TOTAL QUERIES %14ld\n",
543 cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+
544 cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend);
554 void procfile(char *filename)
556 char *s,*t,*s1,laststart,*wordstart;
557 char inword[MAXWORDLEN],testword[MAXWORDLEN];
558 char parastart[81]; /* first line of current para */
560 long quot,squot,firstline,alphalen,totlen,binlen,
561 shortline,longline,verylongline,spacedash,emdash,
562 space_emdash,non_PG_space_emdash,PG_space_emdash,
563 footerline,dotcomma,start_para_line,astline,fslashline,
564 standalone_digit,hyphens,htmcount,endquote_count;
566 signed int i,j,llen,isemptyline,isacro,isellipsis,istypo,alower,
567 eNon_A,eTab,eTilde,eAst,eFSlash,eCarat;
568 signed int warn_short,warn_long,warn_bin,warn_dash,warn_dotcomma,
569 warn_ast,warn_fslash,warn_digit,warn_hyphen,warn_endquote;
570 unsigned int lastlen,lastblen;
571 signed int s_brack,c_brack,r_brack,c_unders;
572 signed int open_single_quote,close_single_quote,guessquote,dquotepar,
574 signed int isnewpara,vowel,consonant;
575 char dquote_err[80],squote_err[80],rbrack_err[80],sbrack_err[80],
576 cbrack_err[80],unders_err[80];
577 signed int qword_index,qperiod_index,isdup;
579 signed int Dutchcount,isDutch,Frenchcount,isFrench;
580 laststart=CHAR_SPACE;
582 *dquote_err=*squote_err=*rbrack_err=*cbrack_err=*sbrack_err=
583 *unders_err=*prevline=0;
584 linecnt=firstline=alphalen=totlen=binlen=
585 shortline=longline=spacedash=emdash=checked_linecnt=
586 space_emdash=non_PG_space_emdash=PG_space_emdash=
587 footerline=dotcomma=start_para_line=astline=fslashline=
588 standalone_digit=hyphens=htmcount=endquote_count=0;
589 quot=squot=s_brack=c_brack=r_brack=c_unders=0;
590 i=llen=isemptyline=isacro=isellipsis=istypo=0;
591 warn_short=warn_long=warn_bin=warn_dash=warn_dotcomma=
592 warn_ast=warn_fslash=warn_digit=warn_endquote=0;
593 isnewpara=vowel=consonant=enddash=0;
595 qword_index=qperiod_index=isdup=0;
597 open_single_quote=close_single_quote=guessquote=dquotepar=squotepar=0;
598 Dutchcount=isDutch=Frenchcount=isFrench=0;
599 for (j=0;j<MAX_QWORD;j++)
602 for (i=0;i<MAX_QWORD_LENGTH;i++)
608 infile=fopen(filename,"rb");
611 if (pswit[STDOUT_SWITCH])
612 fprintf(stdout,"bookloupe: cannot open %s\n",filename);
614 fprintf(stderr,"bookloupe: cannot open %s\n",filename);
617 fprintf(stdout,"\n\nFile: %s\n\n",filename);
618 firstline=shortline=longline=verylongline=0;
620 * Run a first pass - verify that it's a valid PG
621 * file, decide whether to report some things that
622 * occur many times in the text like long or short
623 * lines, non-standard dashes, etc.
625 while (fgets(aline,LINEBUFSIZE-1,infile))
627 while (aline[strlen(aline)-1]==10 || aline[strlen(aline)-1]==13)
628 aline[strlen(aline)-1]=0;
630 if (strstr(aline,"*END") && strstr(aline,"SMALL PRINT") &&
631 (strstr(aline,"PUBLIC DOMAIN") || strstr(aline,"COPYRIGHT")))
634 printf(" --> Duplicate header?\n");
635 spline=linecnt+1; /* first line of non-header text, that is */
637 if (!strncmp(aline,"*** START",9) && strstr(aline,"PROJECT GUTENBERG"))
640 printf(" --> Duplicate header?\n");
641 nspline=linecnt+1; /* first line of non-header text, that is */
643 if (spline || nspline)
646 if (strstr(aline,"end") && strstr(aline,"project gutenberg"))
648 if (strstr(aline,"end")<strstr(aline,"project gutenberg"))
652 /* it's an old-form header - we can detect duplicates */
654 printf(" --> Duplicate footer?\n");
664 firstline=nspline; /* override with new */
666 continue; /* don't count the boilerplate in the footer */
671 if ((unsigned char)aline[i]>127)
673 if (gcisalpha(aline[i]))
675 if (i>0 && aline[i]==CHAR_DQUOTE && isalpha(aline[i-1]))
678 if (strlen(aline)>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE &&
679 lastblen>2 && lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
681 if (*aline && (unsigned char)aline[strlen(aline)-1]<=CHAR_SPACE)
683 if (strstr(aline,".,"))
685 /* only count ast lines for ignoring purposes where there is */
686 /* locase text on the line */
687 if (strstr(aline,"*"))
690 if (*s>='a' && *s<='z')
695 if (strstr(aline,"/"))
697 for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--)
699 if (aline[i]=='-' && aline[i-1]!='-')
701 if (llen>LONGEST_PG_LINE)
703 if (llen>WAY_TOO_LONG)
705 if (strstr(aline,"<") && strstr(aline,">"))
707 i=(signed int)(strstr(aline,">")-strstr(aline,"<")+1);
710 if (strstr(aline,"<i>"))
711 htmcount+=4; /* bonus marks! */
713 /* Check for spaced em-dashes */
714 if (strstr(aline,"--"))
717 if (*(strstr(aline,"--")-1)==CHAR_SPACE ||
718 (*(strstr(aline,"--")+2)==CHAR_SPACE))
720 if (*(strstr(aline,"--")-1)==CHAR_SPACE &&
721 (*(strstr(aline,"--")+2)==CHAR_SPACE))
722 /* count of em-dashes with spaces both sides */
723 non_PG_space_emdash++;
724 if (*(strstr(aline,"--")-1)!=CHAR_SPACE &&
725 (*(strstr(aline,"--")+2)!=CHAR_SPACE))
726 /* count of PG-type em-dashes with no spaces */
731 s=getaword(s,inword);
732 if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))
734 if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))
736 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
739 /* Check for spaced dashes */
740 if (strstr(aline," -") && *(strstr(aline," -")+2)!='-')
743 lastlen=strlen(aline);
747 /* now, based on this quick view, make some snap decisions */
749 printf(" --> %ld lines in this file have white space at end\n",
755 printf(" --> %ld lines in this file contain '.,'. "
756 "Not reporting them.\n",dotcomma);
758 /* if more than 50 lines, or one-tenth, are short,
759 * don't bother reporting them */
761 if (shortline>50 || shortline*10>linecnt)
764 printf(" --> %ld lines in this file are short. "
765 "Not reporting short lines.\n",shortline);
768 * If more than 50 lines, or one-tenth, are long,
769 * don't bother reporting them.
772 if (longline>50 || longline*10>linecnt)
775 printf(" --> %ld lines in this file are long. "
776 "Not reporting long lines.\n",longline);
778 /* If more than 10 lines contain asterisks, don't bother reporting them. */
783 printf(" --> %ld lines in this file contain asterisks. "
784 "Not reporting them.\n",astline);
787 * If more than 10 lines contain forward slashes,
788 * don't bother reporting them.
794 printf(" --> %ld lines in this file contain forward slashes. "
795 "Not reporting them.\n",fslashline);
798 * If more than 20 lines contain unpunctuated endquotes,
799 * don't bother reporting them.
802 if (endquote_count>20)
805 printf(" --> %ld lines in this file contain unpunctuated endquotes. "
806 "Not reporting them.\n",endquote_count);
809 * If more than 10 lines contain standalone digits,
810 * don't bother reporting them.
813 if (standalone_digit>10)
816 printf(" --> %ld lines in this file contain standalone 0s and 1s. "
817 "Not reporting them.\n",standalone_digit);
820 * If more than 20 lines contain hyphens at end,
821 * don't bother reporting them.
827 printf(" --> %ld lines in this file have hyphens at end. "
828 "Not reporting them.\n",hyphens);
830 if (htmcount>20 && !pswit[MARKUP_SWITCH])
832 printf(" --> Looks like this is HTML. Switching HTML mode ON.\n");
833 pswit[MARKUP_SWITCH]=1;
836 printf(" --> %ld lines in this file are VERY long!\n",verylongline);
838 * If there are more non-PG spaced dashes than PG em-dashes,
839 * assume it's deliberate.
840 * Current PG guidelines say don't use them, but older texts do,
841 * and some people insist on them whatever the guidelines say.
844 if (spacedash+non_PG_space_emdash>PG_space_emdash)
847 printf(" --> There are %ld spaced dashes and em-dashes. "
848 "Not reporting them.\n",spacedash+non_PG_space_emdash);
850 /* If more than a quarter of characters are hi-bit, bug out. */
854 printf(" --> This file does not appear to be ASCII. "
855 "Terminating. Best of luck with it!\n");
858 if (alphalen*4<totlen)
860 printf(" --> This file does not appear to be text. "
861 "Terminating. Best of luck with it!\n");
864 if (binlen*100>totlen || binlen>100)
866 printf(" --> There are a lot of foreign letters here. "
867 "Not reporting them.\n");
874 printf(" --> This looks like Dutch - "
875 "switching off dashes and warnings for 's Middags case.\n");
881 printf(" --> This looks like French - "
882 "switching off some doublepunct.\n");
884 if (firstline && footerline)
885 printf(" The PG header and footer appear to be already on.\n");
889 printf(" The PG header is on - no footer.\n");
891 printf(" The PG footer is on - no header.\n");
894 if (pswit[VERBOSE_SWITCH])
906 printf(" *** Verbose output is ON -- you asked for it! ***\n");
910 infile=fopen(filename,"rb");
913 if (pswit[STDOUT_SWITCH])
914 fprintf(stdout,"bookloupe: cannot open %s\n",filename);
916 fprintf(stderr,"bookloupe: cannot open %s\n",filename);
919 if (footerline>0 && firstline>0 && footerline>firstline &&
920 footerline-firstline<100)
922 printf(" --> I don't really know where this text starts. \n");
923 printf(" There are no reference points.\n");
924 printf(" I'm going to have to report the header and footer "
929 * Here we go with the main pass. Hold onto yer hat!
930 * Re-init some variables we've dirtied.
932 quot=squot=linecnt=0;
933 laststart=CHAR_SPACE;
935 while (flgets(aline,LINEBUFSIZE-1,infile,linecnt+1))
940 if (pswit[DP_SWITCH] && !strncmp(aline,"-----File: ",11))
941 continue; // skip DP page separators completely
942 if (linecnt<firstline || (footerline>0 && linecnt>footerline))
944 if (pswit[HEADER_SWITCH])
946 if (!strncmp(aline,"Title:",6))
947 printf(" %s\n",aline);
948 if (!strncmp(aline,"Author:",7))
949 printf(" %s\n",aline);
950 if (!strncmp(aline,"Release Date:",13))
951 printf(" %s\n",aline);
952 if (!strncmp(aline,"Edition:",8))
953 printf(" %s\n\n",aline);
955 continue; /* skip through the header */
959 isemptyline=1; /* assume the line is empty until proven otherwise */
961 * If we are in a state of unbalanced quotes, and this line
962 * doesn't begin with a quote, output the stored error message.
963 * If the -P switch was used, print the warning even if the
964 * new para starts with quotes.
970 if (*t!=CHAR_DQUOTE || pswit[QPARA_SWITCH])
972 if (!pswit[OVERVIEW_SWITCH])
974 if (pswit[ECHO_SWITCH])
975 printf("\n%s\n",parastart);
983 if (*t!=CHAR_SQUOTE && *t!=CHAR_OPEN_SQUOTE ||
984 pswit[QPARA_SWITCH] || squot)
986 if (!pswit[OVERVIEW_SWITCH])
988 if (pswit[ECHO_SWITCH])
989 printf("\n%s\n",parastart);
999 if (!pswit[OVERVIEW_SWITCH])
1001 if (pswit[ECHO_SWITCH])
1002 printf("\n%s\n",parastart);
1010 if (!pswit[OVERVIEW_SWITCH])
1012 if (pswit[ECHO_SWITCH])
1013 printf("\n%s\n",parastart);
1021 if (!pswit[OVERVIEW_SWITCH])
1023 if (pswit[ECHO_SWITCH])
1024 printf("\n%s\n",parastart);
1032 if (!pswit[OVERVIEW_SWITCH])
1034 if (pswit[ECHO_SWITCH])
1035 printf("\n%s\n",parastart);
1041 *dquote_err=*squote_err=*rbrack_err=*cbrack_err=
1042 *sbrack_err=*unders_err=0;
1044 * Look along the line, accumulate the count of quotes, and see
1045 * if this is an empty line - i.e. a line with nothing on it
1047 * If line has just spaces, period, * and/or - on it, don't
1048 * count it, since empty lines with asterisks or dashes to
1049 * separate sections are common.
1054 if (*s==CHAR_DQUOTE)
1056 if (*s==CHAR_SQUOTE || *s==CHAR_OPEN_SQUOTE)
1061 * At start of line, it can only be an openquote.
1062 * Hardcode a very common exception!
1064 if (strncmp(s+2,"tis",3) && strncmp(s+2,"Tis",3))
1065 open_single_quote++;
1067 else if (gcisalpha(*(s-1)) && gcisalpha(*(s+1)))
1068 /* Do nothing! it's definitely an apostrophe, not a quote */
1070 /* it's outside a word - let's check it out */
1071 else if (*s==CHAR_OPEN_SQUOTE || gcisalpha(*(s+1)))
1073 /* it damwell better BE an openquote */
1074 if (strncmp(s+1,"tis",3) && strncmp(s+1,"Tis",3))
1075 /* hardcode a very common exception! */
1076 open_single_quote++;
1080 /* now - is it a closequote? */
1081 guessquote=0; /* accumulate clues */
1082 if (gcisalpha(s[-1]))
1084 /* it follows a letter - could be either */
1088 /* looks like a plural apostrophe */
1090 if (s[1]==CHAR_SPACE) /* bonus marks! */
1094 /* it doesn't have a letter either side */
1095 else if (strchr(".?!,;:",s[-1]) && strchr(".?!,;: ",s[1]))
1096 guessquote+=8; /* looks like a closequote */
1099 if (open_single_quote>close_single_quote)
1101 * Give it the benefit of some doubt,
1102 * if a squote is already open.
1108 close_single_quote++;
1111 if (*s!=CHAR_SPACE && *s!='-' && *s!='.' && *s!=CHAR_ASTERISK &&
1113 isemptyline=0; /* ignore lines like * * * as spacers */
1114 if (*s==CHAR_UNDERSCORE)
1116 if (*s==CHAR_OPEN_CBRACK)
1118 if (*s==CHAR_CLOSE_CBRACK)
1120 if (*s==CHAR_OPEN_RBRACK)
1122 if (*s==CHAR_CLOSE_RBRACK)
1124 if (*s==CHAR_OPEN_SBRACK)
1126 if (*s==CHAR_CLOSE_SBRACK)
1130 if (isnewpara && !isemptyline)
1132 /* This line is the start of a new paragraph. */
1133 start_para_line=linecnt;
1134 /* Capture its first line in case we want to report it later. */
1135 strncpy(parastart,aline,80);
1137 dquotepar=squotepar=0; /* restart the quote count */
1139 while (!gcisalpha(*s) && !gcisdigit(*s) && *s)
1141 if (*s>='a' && *s<='z')
1143 /* and its first letter is lowercase */
1144 if (pswit[ECHO_SWITCH])
1145 printf("\n%s\n",aline);
1146 if (!pswit[OVERVIEW_SWITCH])
1147 printf(" Line %ld column %d - "
1148 "Paragraph starts with lower-case\n",
1149 linecnt,(int)(s-aline)+1);
1153 isnewpara=0; /* Signal the end of new para processing. */
1155 /* Check for an em-dash broken at line end. */
1156 if (enddash && *aline=='-')
1158 if (pswit[ECHO_SWITCH])
1159 printf("\n%s\n",aline);
1160 if (!pswit[OVERVIEW_SWITCH])
1161 printf(" Line %ld column 1 - Broken em-dash?\n",linecnt);
1166 for (s=aline+strlen(aline)-1;*s==' ' && s>aline;s--)
1168 if (s>=aline && *s=='-')
1171 * Check for invalid or questionable characters in the line
1172 * Anything above 127 is invalid for plain ASCII, and
1173 * non-printable control characters should also be flagged.
1174 * Tabs should generally not be there.
1176 for (s=aline;*s;s++)
1178 i=(unsigned char)*s;
1179 if (i<CHAR_SPACE && i!=CHAR_LF && i!=CHAR_CR && i!=CHAR_TAB)
1181 if (pswit[ECHO_SWITCH])
1182 printf("\n%s\n",aline);
1183 if (!pswit[OVERVIEW_SWITCH])
1184 printf(" Line %ld column %d - Control character %d\n",
1185 linecnt,(int)(s-aline)+1,i);
1192 /* Don't repeat multiple warnings on one line. */
1193 eNon_A=eTab=eTilde=eCarat=eFSlash=eAst=0;
1194 for (s=aline;*s;s++)
1197 (*s<CHAR_SPACE && *s!=9 && *s!='\n' || (unsigned char)*s>127))
1199 i=*s; /* annoying kludge for signed chars */
1202 if (pswit[ECHO_SWITCH])
1203 printf("\n%s\n",aline);
1204 if (!pswit[OVERVIEW_SWITCH])
1206 printf(" Line %ld column %d - "
1207 "Non-ISO-8859 character %d\n",
1208 linecnt,(int)(s-aline)+1,i);
1210 printf(" Line %ld column %d - "
1211 "Non-ASCII character %d\n",
1212 linecnt,(int)(s-aline)+1,i);
1217 if (!eTab && *s==CHAR_TAB)
1219 if (pswit[ECHO_SWITCH])
1220 printf("\n%s\n",aline);
1221 if (!pswit[OVERVIEW_SWITCH])
1222 printf(" Line %ld column %d - Tab character?\n",
1223 linecnt,(int)(s-aline)+1);
1228 if (!eTilde && *s==CHAR_TILDE)
1231 * Often used by OCR software to indicate an
1232 * unrecognizable character.
1234 if (pswit[ECHO_SWITCH])
1235 printf("\n%s\n",aline);
1236 if (!pswit[OVERVIEW_SWITCH])
1237 printf(" Line %ld column %d - Tilde character?\n",
1238 linecnt,(int)(s-aline)+1);
1243 if (!eCarat && *s==CHAR_CARAT)
1245 if (pswit[ECHO_SWITCH])
1246 printf("\n%s\n",aline);
1247 if (!pswit[OVERVIEW_SWITCH])
1248 printf(" Line %ld column %d - Carat character?\n",
1249 linecnt,(int)(s-aline)+1);
1254 if (!eFSlash && *s==CHAR_FORESLASH && warn_fslash)
1256 if (pswit[ECHO_SWITCH])
1257 printf("\n%s\n",aline);
1258 if (!pswit[OVERVIEW_SWITCH])
1259 printf(" Line %ld column %d - Forward slash?\n",
1260 linecnt,(int)(s-aline)+1);
1266 * Report asterisks only in paranoid mode,
1267 * since they're often deliberate.
1269 if (!eAst && pswit[PARANOID_SWITCH] && warn_ast &&
1270 !isemptyline && *s==CHAR_ASTERISK)
1272 if (pswit[ECHO_SWITCH])
1273 printf("\n%s\n",aline);
1274 if (!pswit[OVERVIEW_SWITCH])
1275 printf(" Line %ld column %d - Asterisk?\n",
1276 linecnt,(int)(s-aline)+1);
1283 /* Check for line too long. */
1286 if (strlen(aline)>LONGEST_PG_LINE)
1288 if (pswit[ECHO_SWITCH])
1289 printf("\n%s\n",aline);
1290 if (!pswit[OVERVIEW_SWITCH])
1291 printf(" Line %ld column %d - Long line %d\n",
1292 linecnt,strlen(aline),strlen(aline));
1298 * Check for line too short.
1299 * This one is a bit trickier to implement: we don't want to
1300 * flag the last line of a paragraph for being short, so we
1301 * have to wait until we know that our current line is a
1302 * "normal" line, then report the _previous_ line if it was too
1303 * short. We also don't want to report indented lines like
1304 * chapter heads or formatted quotations. We therefore keep
1305 * lastlen as the length of the last line examined, and
1306 * lastblen as the length of the last but one, and try to
1307 * suppress unnecessary warnings by checking that both were of
1308 * "normal" length. We keep the first character of the last
1309 * line in laststart, and if it was a space, we assume that the
1310 * formatting is deliberate. I can't figure out a way to
1311 * distinguish something like a quoted verse left-aligned or
1312 * the header or footer of a letter from a paragraph of short
1313 * lines - maybe if I examined the whole paragraph, and if the
1314 * para has less than, say, 8 lines and if all lines are short,
1315 * then just assume it's OK? Need to look at some texts to see
1316 * how often a formula like this would get the right result.
1318 if (warn_short && strlen(aline)>1 && lastlen>1 &&
1319 lastlen<SHORTEST_PG_LINE && lastblen>1 && lastblen>SHORTEST_PG_LINE &&
1320 laststart!=CHAR_SPACE)
1322 if (pswit[ECHO_SWITCH])
1323 printf("\n%s\n",prevline);
1324 if (!pswit[OVERVIEW_SWITCH])
1325 printf(" Line %ld column %d - Short line %d?\n",
1326 linecnt-1,strlen(prevline),strlen(prevline));
1331 lastlen=strlen(aline);
1333 /* Look for punctuation other than full ellipses at start of line. */
1334 if (*aline && strchr(".?!,;:",aline[0]) && strncmp(". . .",aline,5))
1336 if (pswit[ECHO_SWITCH])
1337 printf("\n%s\n",aline);
1338 if (!pswit[OVERVIEW_SWITCH])
1339 printf(" Line %ld column 1 - Begins with punctuation?\n",
1345 * Check for spaced em-dashes.
1346 * We must check _all_ occurrences of "--" on the line
1347 * hence the loop - even if the first double-dash is OK
1348 * there may be another that's wrong later on.
1353 while (strstr(s,"--"))
1355 if (*(strstr(s,"--")-1)==CHAR_SPACE ||
1356 (*(strstr(s,"--")+2)==CHAR_SPACE))
1358 if (pswit[ECHO_SWITCH])
1359 printf("\n%s\n",aline);
1360 if (!pswit[OVERVIEW_SWITCH])
1361 printf(" Line %ld column %d - Spaced em-dash?\n",
1362 linecnt,(int)(strstr(s,"--")-aline)+1);
1369 /* Check for spaced dashes. */
1372 if (strstr(aline," -"))
1374 if (*(strstr(aline," -")+2)!='-')
1376 if (pswit[ECHO_SWITCH])
1377 printf("\n%s\n",aline);
1378 if (!pswit[OVERVIEW_SWITCH])
1379 printf(" Line %ld column %d - Spaced dash?\n",
1380 linecnt,(int)(strstr(aline," -")-aline)+1);
1385 else if (strstr(aline,"- "))
1387 if (*(strstr(aline,"- ")-1)!='-')
1389 if (pswit[ECHO_SWITCH])
1390 printf("\n%s\n",aline);
1391 if (!pswit[OVERVIEW_SWITCH])
1392 printf(" Line %ld column %d - Spaced dash?\n",
1393 linecnt,(int)(strstr(aline,"- ")-aline)+1);
1400 * Check for unmarked paragraphs indicated by separate speakers.
1401 * May well be false positive:
1402 * "Bravo!" "Wonderful!" called the crowd.
1403 * but useful all the same.
1407 if (strstr(aline,"\" \""))
1408 s=strstr(aline,"\" \"");
1409 if (strstr(aline,"\" \""))
1410 s=strstr(aline,"\" \"");
1413 if (pswit[ECHO_SWITCH])
1414 printf("\n%s\n",aline);
1415 if (!pswit[OVERVIEW_SWITCH])
1416 printf(" Line %ld column %d - "
1417 "Query missing paragraph break?\n",
1418 linecnt,(int)(s-aline)+1);
1423 * Check for "to he" and other easy he/be errors.
1424 * This is a very inadequate effort on the he/be problem,
1425 * but the phrase "to he" is always an error, whereas "to
1426 * be" is quite common.
1427 * Similarly, '"Quiet!", be said.' is a non-be error
1428 * "to he" is _not_ always an error!:
1429 * "Where they went to he couldn't say."
1430 * Another false positive:
1431 * What would "Cinderella" be without the . . .
1432 * and another: "If he wants to he can see for himself."
1436 if (strstr(aline," to he "))
1437 s=strstr(aline," to he ");
1438 if (strstr(aline,"\" be "))
1439 s=strstr(aline,"\" be ");
1440 if (strstr(aline,"\", be "))
1441 s=strstr(aline,"\", be ");
1442 if (strstr(aline," is be "))
1443 s=strstr(aline," is be ");
1444 if (strstr(aline," be is "))
1445 s=strstr(aline," be is ");
1446 if (strstr(aline," was be "))
1447 s=strstr(aline," was be ");
1448 if (strstr(aline," be would "))
1449 s=strstr(aline," be would ");
1450 if (strstr(aline," be could "))
1451 s=strstr(aline," be could ");
1454 if (pswit[ECHO_SWITCH])
1455 printf("\n%s\n",aline);
1456 if (!pswit[OVERVIEW_SWITCH])
1457 printf(" Line %ld column %d - Query he/be error?\n",
1458 linecnt,(int)(s-aline)+1);
1464 if (strstr(aline," i bad "))
1465 s=strstr(aline," i bad ");
1466 if (strstr(aline," you bad "))
1467 s=strstr(aline," you bad ");
1468 if (strstr(aline," he bad "))
1469 s=strstr(aline," he bad ");
1470 if (strstr(aline," she bad "))
1471 s=strstr(aline," she bad ");
1472 if (strstr(aline," they bad "))
1473 s=strstr(aline," they bad ");
1474 if (strstr(aline," a had "))
1475 s=strstr(aline," a had ");
1476 if (strstr(aline," the had "))
1477 s=strstr(aline," the had ");
1480 if (pswit[ECHO_SWITCH])
1481 printf("\n%s\n",aline);
1482 if (!pswit[OVERVIEW_SWITCH])
1483 printf(" Line %ld column %d - Query had/bad error?\n",
1484 linecnt,(int)(s-aline)+1);
1490 if (strstr(aline,", hut "))
1491 s=strstr(aline,", hut ");
1492 if (strstr(aline,"; hut "))
1493 s=strstr(aline,"; hut ");
1496 if (pswit[ECHO_SWITCH])
1497 printf("\n%s\n",aline);
1498 if (!pswit[OVERVIEW_SWITCH])
1499 printf(" Line %ld column %d - Query hut/but error?\n",
1500 linecnt,(int)(s-aline)+1);
1505 * Special case - angled bracket in front of "From" placed there by an
1506 * MTA when sending an e-mail.
1508 if (strstr(aline,">From"))
1510 if (pswit[ECHO_SWITCH])
1511 printf("\n%s\n",aline);
1512 if (!pswit[OVERVIEW_SWITCH])
1513 printf(" Line %ld column %d - "
1514 "Query angled bracket with From\n",
1515 linecnt,(int)(strstr(aline,">From")-aline)+1);
1520 * Check for a single character line -
1521 * often an overflow from bad wrapping.
1523 if (*aline && !aline[1])
1525 if (*aline=='I' || *aline=='V' || *aline=='X' || *aline=='L' ||
1527 ; /* Nothing - ignore numerals alone on a line. */
1530 if (pswit[ECHO_SWITCH])
1531 printf("\n%s\n",aline);
1532 if (!pswit[OVERVIEW_SWITCH])
1533 printf(" Line %ld column 1 - "
1534 "Query single character line\n",linecnt);
1539 /* Check for I" - often should be ! */
1540 if (strstr(aline," I\""))
1542 if (pswit[ECHO_SWITCH])
1543 printf("\n%s\n",aline);
1544 if (!pswit[OVERVIEW_SWITCH])
1545 printf(" Line %ld column %ld - Query I=exclamation mark?\n",
1546 linecnt,strstr(aline," I\"")-aline);
1551 * Check for period without a capital letter. Cut-down from gutspell.
1552 * Only works when it happens on a single line.
1554 if (pswit[PARANOID_SWITCH])
1556 for (t=s=aline;strstr(t,". ");)
1562 /* start of line punctuation is handled elsewhere */
1565 if (!gcisalpha(t[-1]))
1572 /* For Frank & Jeroen -- 's Middags case */
1573 if (t[2]==CHAR_SQUOTE && t[3]>='a' && t[3]<='z' &&
1574 t[4]==CHAR_SPACE && t[5]>='A' && t[5]<='Z')
1581 while (*s1 && !gcisalpha(*s1) && !isdigit(*s1))
1583 if (*s1>='a' && *s1<='z')
1585 /* we have something to investigate */
1587 /* so let's go back and find out */
1588 for (s1=t-1;s1>=s &&
1589 (gcisalpha(*s1) || gcisdigit(*s1) || *s1==CHAR_SQUOTE &&
1590 gcisalpha(s1[1]) && gcisalpha(s1[-1]));s1--)
1593 for (i=0;*s1 && *s1!='.';s1++,i++)
1596 for (i=0;*abbrev[i];i++)
1597 if (!strcmp(testword,abbrev[i]))
1599 if (gcisdigit(*testword))
1603 if (isroman(testword))
1608 for (i=0;testword[i];i++)
1609 if (strchr(vowels,testword[i]))
1615 if (strlen(testword)<MAX_QWORD_LENGTH &&
1616 !pswit[VERBOSE_SWITCH])
1617 for (i=0;i<qperiod_index;i++)
1618 if (!strcmp(testword,qperiod[i]))
1622 if (qperiod_index<MAX_QWORD &&
1623 strlen(testword)<MAX_QWORD_LENGTH)
1625 strcpy(qperiod[qperiod_index],testword);
1628 if (pswit[ECHO_SWITCH])
1629 printf("\n%s\n",aline);
1630 if (!pswit[OVERVIEW_SWITCH])
1631 printf(" Line %ld column %d - "
1632 "Extra period?\n",linecnt,(int)(t-aline)+1);
1641 if (pswit[TYPO_SWITCH])
1643 /* Check for words usually not followed by punctuation. */
1647 s=getaword(s,inword);
1651 for (i=0;*nocomma[i];i++)
1652 if (!strcmp(inword,nocomma[i]))
1654 if (*s==',' || *s==';' || *s==':')
1656 if (pswit[ECHO_SWITCH])
1657 printf("\n%s\n",aline);
1658 if (!pswit[OVERVIEW_SWITCH])
1659 printf(" Line %ld column %d - "
1660 "Query punctuation after %s?\n",
1661 linecnt,(int)(s-aline)+1,inword);
1666 for (i=0;*noperiod[i];i++)
1667 if (!strcmp(inword,noperiod[i]))
1669 if (*s=='.' || *s=='!')
1671 if (pswit[ECHO_SWITCH])
1672 printf("\n%s\n",aline);
1673 if (!pswit[OVERVIEW_SWITCH])
1674 printf(" Line %ld column %d - "
1675 "Query punctuation after %s?\n",
1676 linecnt,(int)(s-aline)+1,inword);
1684 * Check for commonly mistyped words,
1685 * and digits like 0 for O in a word.
1690 s=getaword(s,inword);
1692 continue; /* don't bother with empty lines */
1693 if (mixdigit(inword))
1695 if (pswit[ECHO_SWITCH])
1696 printf("\n%s\n",aline);
1697 if (!pswit[OVERVIEW_SWITCH])
1698 printf(" Line %ld column %ld - Query digit in %s\n",
1699 linecnt,(int)(wordstart-aline)+1,inword);
1704 * Put the word through a series of tests for likely typos and OCR
1707 if (pswit[TYPO_SWITCH])
1710 strcpy(testword,inword);
1712 for (i=0;i<(signed int)strlen(testword);i++)
1714 /* lowercase for testing */
1715 if (testword[i]>='a' && testword[i]<='z')
1717 if (alower && testword[i]>='A' && testword[i]<='Z')
1720 * We have an uppercase mid-word. However, there are
1722 * Mac and Mc like McGill
1723 * French contractions like l'Abbe
1725 if (i==2 && testword[0]=='m' && testword[1]=='c' ||
1726 i==3 && testword[0]=='m' && testword[1]=='a' &&
1727 testword[2]=='c' || i>0 && testword[i-1]==CHAR_SQUOTE)
1732 testword[i]=(char)tolower(testword[i]);
1735 * Check for certain unlikely two-letter combinations at word
1738 if (strlen(testword)>1)
1740 for (i=0;*nostart[i];i++)
1741 if (!strncmp(testword,nostart[i],2))
1743 for (i=0;*noend[i];i++)
1744 if (!strncmp(testword+strlen(testword)-2,noend[i],2))
1747 /* ght is common, gbt never. Like that. */
1748 if (strstr(testword,"cb"))
1750 if (strstr(testword,"gbt"))
1752 if (strstr(testword,"pbt"))
1754 if (strstr(testword,"tbs"))
1756 if (strstr(testword,"mrn"))
1758 if (strstr(testword,"ahle"))
1760 if (strstr(testword,"ihle"))
1763 * "TBE" does happen - like HEARTBEAT - but uncommon.
1764 * Also "TBI" - frostbite, outbid - but uncommon.
1765 * Similarly "ii" like Hawaii, or Pompeii, and in Roman
1766 * numerals, but "ii" is a common scanno.
1768 if (strstr(testword,"tbi"))
1770 if (strstr(testword,"tbe"))
1772 if (strstr(testword,"ii"))
1775 * Check for no vowels or no consonants.
1776 * If none, flag a typo.
1778 if (!istypo && strlen(testword)>1)
1781 for (i=0;testword[i];i++)
1783 if (testword[i]=='y' || gcisdigit(testword[i]))
1785 /* Yah, this is loose. */
1789 else if (strchr(vowels,testword[i]))
1794 if (!vowel || !consonant)
1798 * Now exclude the word from being reported if it's in
1801 for (i=0;*okword[i];i++)
1802 if (!strcmp(testword,okword[i]))
1805 * What looks like a typo may be a Roman numeral.
1808 if (istypo && isroman(testword))
1810 /* Check the manual list of typos. */
1812 for (i=0;*typo[i];i++)
1813 if (!strcmp(testword,typo[i]))
1816 * Check lowercase s, l, i and m - special cases.
1817 * "j" - often a semi-colon gone wrong.
1818 * "d" for a missing apostrophe - he d
1821 if (!istypo && strlen(testword)==1 && strchr("slmijdn",*inword))
1826 if (strlen(testword)<MAX_QWORD_LENGTH &&
1827 !pswit[VERBOSE_SWITCH])
1828 for (i=0;i<qword_index;i++)
1829 if (!strcmp(testword,qword[i]))
1836 if (qword_index<MAX_QWORD &&
1837 strlen(testword)<MAX_QWORD_LENGTH)
1839 strcpy(qword[qword_index],testword);
1842 if (pswit[ECHO_SWITCH])
1843 printf("\n%s\n",aline);
1844 if (!pswit[OVERVIEW_SWITCH])
1846 printf(" Line %ld column %d - Query word %s",
1847 linecnt,(int)(wordstart-aline)+1,inword);
1848 if (strlen(testword)<MAX_QWORD_LENGTH &&
1849 !pswit[VERBOSE_SWITCH])
1850 printf(" - not reporting duplicates");
1858 /* check the user's list of typos */
1859 if (!istypo && usertypo_count)
1860 for (i=0;i<usertypo_count;i++)
1861 if (!strcmp(testword,usertypo[i]))
1863 if (pswit[ECHO_SWITCH])
1864 printf("\n%s\n",aline);
1865 if (!pswit[OVERVIEW_SWITCH])
1866 printf(" Line %ld column %d - "
1867 "Query possible scanno %s\n",
1868 linecnt,(int)(wordstart-aline)+2,inword);
1870 if (pswit[PARANOID_SWITCH] && warn_digit)
1872 /* In paranoid mode, query all 0 and 1 standing alone. */
1873 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
1875 if (pswit[ECHO_SWITCH])
1876 printf("\n%s\n",aline);
1877 if (!pswit[OVERVIEW_SWITCH])
1878 printf(" Line %ld column %d - Query standalone %s\n",
1879 linecnt,(int)(wordstart-aline)+2,inword);
1886 * Look for added or missing spaces around punctuation and quotes.
1887 * If there is a punctuation character like ! with no space on
1888 * either side, suspect a missing!space. If there are spaces on
1889 * both sides , assume a typo. If we see a double quote with no
1890 * space or punctuation on either side of it, assume unspaced
1891 * quotes "like"this.
1894 for (i=1;i<llen;i++)
1896 /* For each character in the line after the first. */
1897 if (strchr(".?!,;:_",aline[i])) /* if it's punctuation */
1899 /* we need to suppress warnings for acronyms like M.D. */
1901 /* we need to suppress warnings for ellipsis . . . */
1903 /* if there are letters on both sides of it or ... */
1904 if (gcisalpha(aline[i-1]) && gcisalpha(aline[i+1]) ||
1905 gcisalpha(aline[i+1]) && strchr("?!,;:",aline[i]))
1907 /* ...if it's strict punctuation followed by an alpha */
1910 if (i>2 && aline[i-2]=='.')
1912 if (i+2<llen && aline[i+2]=='.')
1917 if (pswit[ECHO_SWITCH])
1918 printf("\n%s\n",aline);
1919 if (!pswit[OVERVIEW_SWITCH])
1920 printf(" Line %ld column %d - Missing space?\n",
1926 if (aline[i-1]==CHAR_SPACE &&
1927 (aline[i+1]==CHAR_SPACE || aline[i+1]==0))
1930 * If there are spaces on both sides,
1931 * or space before and end of line.
1935 if (i>2 && aline[i-2]=='.')
1937 if (i+2<llen && aline[i+2]=='.')
1940 if (!isemptyline && !isellipsis)
1942 if (pswit[ECHO_SWITCH])
1943 printf("\n%s\n",aline);
1944 if (!pswit[OVERVIEW_SWITCH])
1945 printf(" Line %ld column %d - "
1946 "Spaced punctuation?\n",linecnt,i+1);
1953 /* Split out the characters that CANNOT be preceded by space. */
1955 for (i=1;i<llen;i++)
1957 /* for each character in the line after the first */
1958 if (strchr("?!,;:",aline[i]))
1960 /* if it's punctuation that _cannot_ have a space before it */
1961 if (aline[i-1]==CHAR_SPACE && !isemptyline &&
1962 aline[i+1]!=CHAR_SPACE)
1965 * If aline[i+1] DOES == space,
1966 * it was already reported just above.
1968 if (pswit[ECHO_SWITCH])
1969 printf("\n%s\n",aline);
1970 if (!pswit[OVERVIEW_SWITCH])
1971 printf(" Line %ld column %d - Spaced punctuation?\n",
1979 * Special case " .X" where X is any alpha.
1980 * This plugs a hole in the acronym code above.
1981 * Inelegant, but maintainable.
1984 for (i=1;i<llen;i++)
1986 /* for each character in the line after the first */
1989 /* if it's a period */
1990 if (aline[i-1]==CHAR_SPACE && gcisalpha(aline[i+1]))
1993 * If the period follows a space and
1994 * is followed by a letter.
1996 if (pswit[ECHO_SWITCH])
1997 printf("\n%s\n",aline);
1998 if (!pswit[OVERVIEW_SWITCH])
1999 printf(" Line %ld column %d - Spaced punctuation?\n",
2006 for (i=1;i<llen;i++)
2008 /* for each character in the line after the first */
2009 if (aline[i]==CHAR_DQUOTE)
2011 if (!strchr(" _-.'`,;:!/([{?}])",aline[i-1]) &&
2012 !strchr(" _-.'`,;:!/([{?}])",aline[i+1]) && aline[i+1] ||
2013 !strchr(" _-([{'`",aline[i-1]) && gcisalpha(aline[i+1]))
2015 if (pswit[ECHO_SWITCH])
2016 printf("\n%s\n",aline);
2017 if (!pswit[OVERVIEW_SWITCH])
2018 printf(" Line %ld column %d - Unspaced quotes?\n",
2025 /* Check parity of quotes. */
2026 for (s=aline;*s;s++)
2028 if (*s==CHAR_DQUOTE)
2030 if (!(dquotepar=!dquotepar))
2033 if (!strchr("_-.'`/,;:!?)]} ",s[1]))
2035 if (pswit[ECHO_SWITCH])
2036 printf("\n%s\n",aline);
2037 if (!pswit[OVERVIEW_SWITCH])
2038 printf(" Line %ld column %d - "
2039 "Wrongspaced quotes?\n",linecnt,(int)(s-aline)+1);
2047 if (!gcisalpha(s[1]) && !isdigit(s[1]) &&
2048 !strchr("_-/.'`([{$",s[1]) || !s[1])
2050 if (pswit[ECHO_SWITCH])
2051 printf("\n%s\n",aline);
2052 if (!pswit[OVERVIEW_SWITCH])
2053 printf(" Line %ld column %d - "
2054 "Wrongspaced quotes?\n",linecnt,(int)(s-aline)+1);
2061 if (*aline==CHAR_DQUOTE)
2063 if (strchr(",;:!?)]} ",aline[1]))
2065 if (pswit[ECHO_SWITCH])
2066 printf("\n%s\n",aline);
2067 if (!pswit[OVERVIEW_SWITCH])
2068 printf(" Line %ld column 1 - Wrongspaced quotes?\n",
2069 linecnt,(int)(s-aline)+1);
2074 if (pswit[SQUOTE_SWITCH])
2076 for (s=aline;*s;s++)
2078 if ((*s==CHAR_SQUOTE || *s==CHAR_OPEN_SQUOTE) &&
2079 (s==aline || s>aline && !gcisalpha(s[-1]) ||
2082 if (!(squotepar=!squotepar))
2085 if (!strchr("_-.'`/\",;:!?)]} ",s[1]))
2087 if (pswit[ECHO_SWITCH])
2088 printf("\n%s\n",aline);
2089 if (!pswit[OVERVIEW_SWITCH])
2090 printf(" Line %ld column %d - "
2091 "Wrongspaced singlequotes?\n",
2092 linecnt,(int)(s-aline)+1);
2100 if (!gcisalpha(s[1]) && !isdigit(s[1]) &&
2101 !strchr("_-/\".'`",s[1]) || !s[1])
2103 if (pswit[ECHO_SWITCH])
2104 printf("\n%s\n",aline);
2105 if (!pswit[OVERVIEW_SWITCH])
2106 printf(" Line %ld column %d - "
2107 "Wrongspaced singlequotes?\n",
2108 linecnt,(int)(s-aline)+1);
2117 * Look for double punctuation like ,. or ,,
2118 * Thanks to DW for the suggestion!
2119 * In books with references, ".," and ".;" are common
2120 * e.g. "etc., etc.," and vol. 1.; vol 3.;
2121 * OTOH, from my initial tests, there are also fairly
2122 * common errors. What to do? Make these cases paranoid?
2123 * ".," is the most common, so warn_dotcomma is used
2124 * to suppress detailed reporting if it occurs often.
2127 for (i=0;i<llen;i++)
2129 /* for each punctuation character in the line */
2130 if (strchr(".?!,;:",aline[i]) && (strchr(".?!,;:",aline[i+1])) &&
2131 aline[i] && aline[i+1])
2133 /* followed by punctuation, it's a query, unless . . . */
2134 if (aline[i]==aline[i+1] && (aline[i]=='.' || aline[i]=='?' ||
2136 !warn_dotcomma && aline[i]=='.' && aline[i+1]==',' ||
2137 isFrench && !strncmp(aline+i,",...",4) ||
2138 isFrench && !strncmp(aline+i,"...,",4) ||
2139 isFrench && !strncmp(aline+i,";...",4) ||
2140 isFrench && !strncmp(aline+i,"...;",4) ||
2141 isFrench && !strncmp(aline+i,":...",4) ||
2142 isFrench && !strncmp(aline+i,"...:",4) ||
2143 isFrench && !strncmp(aline+i,"!...",4) ||
2144 isFrench && !strncmp(aline+i,"...!",4) ||
2145 isFrench && !strncmp(aline+i,"?...",4) ||
2146 isFrench && !strncmp(aline+i,"...?",4))
2148 if (isFrench && !strncmp(aline+i,",...",4) ||
2149 isFrench && !strncmp(aline+i,"...,",4) ||
2150 isFrench && !strncmp(aline+i,";...",4) ||
2151 isFrench && !strncmp(aline+i,"...;",4) ||
2152 isFrench && !strncmp(aline+i,":...",4) ||
2153 isFrench && !strncmp(aline+i,"...:",4) ||
2154 isFrench && !strncmp(aline+i,"!...",4) ||
2155 isFrench && !strncmp(aline+i,"...!",4) ||
2156 isFrench && !strncmp(aline+i,"?...",4) ||
2157 isFrench && !strncmp(aline+i,"...?",4))
2159 ; /* do nothing for .. !! and ?? which can be legit */
2163 if (pswit[ECHO_SWITCH])
2164 printf("\n%s\n",aline);
2165 if (!pswit[OVERVIEW_SWITCH])
2166 printf(" Line %ld column %d - Double punctuation?\n",
2174 while (strstr(s," \" "))
2176 if (pswit[ECHO_SWITCH])
2177 printf("\n%s\n",aline);
2178 if (!pswit[OVERVIEW_SWITCH])
2179 printf(" Line %ld column %d - Spaced doublequote?\n",
2180 linecnt,(int)(strstr(s," \" ")-aline+1));
2183 s=strstr(s," \" ")+2;
2186 while (strstr(s," ' "))
2188 if (pswit[ECHO_SWITCH])
2189 printf("\n%s\n",aline);
2190 if (!pswit[OVERVIEW_SWITCH])
2191 printf(" Line %ld column %d - Spaced singlequote?\n",
2192 linecnt,(int)(strstr(s," ' ")-aline+1));
2195 s=strstr(s," ' ")+2;
2198 while (strstr(s," ` "))
2200 if (pswit[ECHO_SWITCH])
2201 printf("\n%s\n",aline);
2202 if (!pswit[OVERVIEW_SWITCH])
2203 printf(" Line %ld column %d - Spaced singlequote?\n",
2204 linecnt,(int)(strstr(s," ` ")-aline+1));
2207 s=strstr(s," ` ")+2;
2209 /* check special case of 'S instead of 's at end of word */
2213 if (*s==CHAR_SQUOTE && s[1]=='S' && s[-1]>='a' && s[-1]<='z')
2215 if (pswit[ECHO_SWITCH])
2216 printf("\n%s\n",aline);
2217 if (!pswit[OVERVIEW_SWITCH])
2218 printf(" Line %ld column %d - Capital \"S\"?\n",
2219 linecnt,(int)(s-aline+2));
2226 * Now check special cases - start and end of line -
2227 * for single and double quotes. Start is sometimes [sic]
2228 * but better to query it anyway.
2229 * While we're here, check for dash at end of line.
2234 if (aline[llen-1]==CHAR_DQUOTE || aline[llen-1]==CHAR_SQUOTE ||
2235 aline[llen-1]==CHAR_OPEN_SQUOTE)
2236 if (aline[llen-2]==CHAR_SPACE)
2238 if (pswit[ECHO_SWITCH])
2239 printf("\n%s\n",aline);
2240 if (!pswit[OVERVIEW_SWITCH])
2241 printf(" Line %ld column %d - Spaced quote?\n",
2246 if ((aline[0]==CHAR_SQUOTE || aline[0]==CHAR_OPEN_SQUOTE) &&
2247 aline[1]==CHAR_SPACE)
2249 if (pswit[ECHO_SWITCH])
2250 printf("\n%s\n",aline);
2251 if (!pswit[OVERVIEW_SWITCH])
2252 printf(" Line %ld column 1 - Spaced quote?\n",linecnt);
2257 * Dash at end of line may well be legit - paranoid mode only
2258 * and don't report em-dash at line-end.
2260 if (pswit[PARANOID_SWITCH] && warn_hyphen)
2262 for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--)
2264 if (aline[i]=='-' && aline[i-1]!='-')
2266 if (pswit[ECHO_SWITCH])
2267 printf("\n%s\n",aline);
2268 if (!pswit[OVERVIEW_SWITCH])
2269 printf(" Line %ld column %d - "
2270 "Hyphen at end of line?\n",linecnt,i);
2275 * Brackets are often unspaced, but shouldn't be surrounded by alpha.
2276 * If so, suspect a scanno like "a]most".
2279 for (i=1;i<llen-1;i++)
2281 /* for each bracket character in the line except 1st & last */
2282 if (strchr("{[()]}",aline[i]) && gcisalpha(aline[i-1]) &&
2283 gcisalpha(aline[i+1]))
2285 if (pswit[ECHO_SWITCH])
2286 printf("\n%s\n",aline);
2287 if (!pswit[OVERVIEW_SWITCH])
2288 printf(" Line %ld column %d - Unspaced bracket?\n",
2297 for (i=1;i<llen;i++)
2299 /* for each character in the line except 1st */
2300 if (aline[i]==CHAR_DQUOTE && isalpha(aline[i-1]))
2302 if (pswit[ECHO_SWITCH])
2303 printf("\n%s\n",aline);
2304 if (!pswit[OVERVIEW_SWITCH])
2305 printf(" Line %ld column %d - "
2306 "endquote missing punctuation?\n",linecnt,i);
2313 * Check for <HTML TAG>.
2314 * If there is a < in the line, followed at some point
2315 * by a > then we suspect HTML.
2317 if (strstr(aline,"<") && strstr(aline,">"))
2319 i=(signed int)(strstr(aline,">")-strstr(aline,"<")+1);
2322 strncpy(wrk,strstr(aline,"<"),i);
2324 if (pswit[ECHO_SWITCH])
2325 printf("\n%s\n",aline);
2326 if (!pswit[OVERVIEW_SWITCH])
2327 printf(" Line %ld column %d - HTML Tag? %s \n",
2328 linecnt,(int)(strstr(aline,"<")-aline)+1,wrk);
2334 * Check for &symbol; HTML.
2335 * If there is a & in the line, followed at
2336 * some point by a ; then we suspect HTML.
2338 if (strstr(aline,"&") && strstr(aline,";"))
2340 i=(int)(strstr(aline,";")-strstr(aline,"&")+1);
2341 for (s=strstr(aline,"&");s<strstr(aline,";");s++)
2343 i=0; /* Don't report "Jones & Son;" */
2346 strncpy(wrk,strstr(aline,"&"),i);
2348 if (pswit[ECHO_SWITCH])
2349 printf("\n%s\n",aline);
2350 if (!pswit[OVERVIEW_SWITCH])
2351 printf(" Line %ld column %d - HTML symbol? %s \n",
2352 linecnt,(int)(strstr(aline,"&")-aline)+1,wrk);
2358 * At end of paragraph, check for mismatched quotes.
2359 * We don't want to report an error immediately, since it is a
2360 * common convention to omit the quotes at end of paragraph if
2361 * the next paragraph is a continuation of the same speaker.
2362 * Where this is the case, the next para should begin with a
2363 * quote, so we store the warning message and only display it
2364 * at the top of the next iteration if the new para doesn't
2365 * start with a quote.
2366 * The -p switch overrides this default, and warns of unclosed
2367 * quotes on _every_ paragraph, whether the next begins with a
2372 /* end of para - add up the totals */
2374 sprintf(dquote_err," Line %ld - Mismatched quotes\n",
2376 if (pswit[SQUOTE_SWITCH] && open_single_quote &&
2377 open_single_quote!=close_single_quote)
2378 sprintf(squote_err," Line %ld - Mismatched singlequotes?\n",
2380 if (pswit[SQUOTE_SWITCH] && open_single_quote &&
2381 open_single_quote!=close_single_quote &&
2382 open_single_quote!=close_single_quote+1)
2384 * Flag it to be noted regardless of the
2385 * first char of the next para.
2389 sprintf(rbrack_err," Line %ld - "
2390 "Mismatched round brackets?\n",linecnt);
2392 sprintf(sbrack_err," Line %ld - "
2393 "Mismatched square brackets?\n",linecnt);
2395 sprintf(cbrack_err," Line %ld - "
2396 "Mismatched curly brackets?\n",linecnt);
2398 sprintf(unders_err," Line %ld - Mismatched underscores?\n",
2400 quot=s_brack=c_brack=r_brack=c_unders=open_single_quote=
2401 close_single_quote=0;
2402 /* let the next iteration know that it's starting a new para */
2406 * Check for omitted punctuation at end of paragraph by working back
2407 * through prevline. DW.
2408 * Need to check this only for "normal" paras.
2409 * So what is a "normal" para?
2410 * Not normal if one-liner (chapter headings, etc.)
2411 * Not normal if doesn't contain at least one locase letter
2412 * Not normal if starts with space
2417 for (s=prevline,i=0;*s && !i;s++)
2419 /* use i to indicate the presence of a letter on the line */
2422 * This next "if" is a problem.
2423 * If we say "start_para_line <= linecnt - 1", that includes
2424 * one-line "paragraphs" like chapter heads. Lotsa false positives.
2425 * If we say "start_para_line < linecnt - 1" it doesn't, but then it
2426 * misses genuine one-line paragraphs.
2428 if (i && lastblen>2 && start_para_line<linecnt-1 &&
2429 *prevline>CHAR_SPACE)
2431 for (i=strlen(prevline)-1;
2432 (prevline[i]==CHAR_DQUOTE || prevline[i]==CHAR_SQUOTE) &&
2433 prevline[i]>CHAR_SPACE && i>0;
2438 if (gcisalpha(prevline[i]))
2440 if (pswit[ECHO_SWITCH])
2441 printf("\n%s\n",prevline);
2442 if (!pswit[OVERVIEW_SWITCH])
2443 printf(" Line %ld column %d - "
2444 "No punctuation at para end?\n",
2445 linecnt-1,strlen(prevline));
2450 if (strchr("-.:!([{?}])",prevline[i]))
2455 strcpy(prevline,aline);
2458 if (!pswit[OVERVIEW_SWITCH])
2459 for (i=0;i<MAX_QWORD;i++)
2461 printf("\nNote: Queried word %s was duplicated %d time%s\n",
2462 qword[i],dupcnt[i],"s");
2468 * Get one line from the input stream, checking for
2469 * the existence of exactly one CR/LF line-end per line.
2471 * Returns: a pointer to the line.
2473 char *flgets(char *theline,int maxlen,FILE *thefile,long lcnt)
/*
 * theline - buffer for the line; it is echoed in diagnostics and handed
 *           to the post-processing helpers at the end.
 * maxlen  - upper bound used by the closing while(len<maxlen) test.
 * thefile - stream the characters are read from, one fgetc() at a time.
 * lcnt    - line number printed in the diagnostics below.
 *
 * NOTE(review): only part of the body is visible in this listing, so the
 * comments added here describe the visible statements only.
 */
2479 c=cint=fgetc(thefile);
2484 /* either way, it's end of line */
2491 /* Error - a LF without a preceding CR */
/*
 * The line-end diagnostics are tested against pswit[LINE_END_SWITCH].
 * When ECHO_SWITCH is set the line text read so far is printed first,
 * and OVERVIEW_SWITCH suppresses the message itself.
 */
2492 if (pswit[LINE_END_SWITCH])
2494 if (pswit[ECHO_SWITCH])
2495 printf("\n%s\n",theline);
2496 if (!pswit[OVERVIEW_SWITCH])
2497 printf(" Line %ld - No CR?\n",lcnt);
2508 /* Error - two successive CRs */
2509 if (pswit[LINE_END_SWITCH])
2511 if (pswit[ECHO_SWITCH])
2512 printf("\n%s\n",theline);
2513 if (!pswit[OVERVIEW_SWITCH])
2514 printf(" Line %ld - Two successive CRs?\n",lcnt);
/*
 * isCR presumably records that a CR was seen and is still waiting for
 * its LF when another character arrives - TODO confirm against the
 * statements that set it, which are not visible here.
 */
2523 if (pswit[LINE_END_SWITCH] && isCR)
2525 if (pswit[ECHO_SWITCH])
2526 printf("\n%s\n",theline);
2527 if (!pswit[OVERVIEW_SWITCH])
2528 printf(" Line %ld column %d - CR without LF?\n",
/* Fetch the next character; the loop stops once len reaches maxlen. */
2538 c=cint=fgetc(thefile);
2539 } while(len<maxlen);
/* Optional post-processing of the line, selected by the MARKUP and DP switches. */
2540 if (pswit[MARKUP_SWITCH])
2541 postprocess_for_HTML(theline);
2542 if (pswit[DP_SWITCH])
2543 postprocess_for_DP(theline);
2550 * Takes a "word" as a parameter, and checks whether it
2551 * contains a mixture of alpha and digits. Generally, this is an
2552 * error, but may not be for cases like 4th or L5 12s. 3d.
2554 * Returns: 0 if no error found, 1 if error.
/*
 * mixdigit:
 *
 * Takes a "word" and decides whether it should be queried for mixing
 * letters and digits. A mixture is normally an error, but these common
 * legitimate shapes are accepted:
 *   - leading digits followed by st, nd, rd or th (21st), by the plurals
 *     sts, nds, rds, ths, or by stly, ndly, rdly, thly (1stly), in
 *     either case;
 *   - leading digits followed by a single l, L, s or d (12l. 3s. 11d.);
 *   - a capital L followed by something that is itself acceptable, which
 *     covers British pounds written like L500.
 *
 * Returns: 0 if no error found, 1 if error.
 */
int mixdigit(char *checkword)
{
    /*
     * matchword() only succeeds on strings of equal length, so comparing
     * the text after the leading digits against each entry also checks
     * that nothing else follows the suffix.
     */
    static char *const ordinal[]={
	"st","nd","rd","th",
	"sts","nds","rds","ths",
	"stly","ndly","rdly","thly"
    };
    int has_digit=0,has_letter=0,firstdigits,wl;
    size_t k;
    char *s,*suffix;
    for (s=checkword;*s;s++)
    {
	if (gcisalpha(*s))
	    has_letter=1;
	else if (gcisdigit(*s))
	    has_digit=1;
    }
    if (!has_digit || !has_letter)
	return 0;
    /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
    wl=(int)strlen(checkword);
    for (firstdigits=0;gcisdigit(checkword[firstdigits]);firstdigits++)
	;
    suffix=checkword+firstdigits;
    /*
     * digits, ending in an ordinal suffix of either case. The suffix is
     * always addressed from the end of the digits, so the pointer can
     * never move in front of the start of checkword.
     */
    for (k=0;k<sizeof ordinal/sizeof ordinal[0];k++)
	if (matchword(suffix,ordinal[k]))
	    return 0;
    /* digits, ending in l, L, s or d */
    if (firstdigits+1==wl && (*suffix=='l' || *suffix=='L' ||
      *suffix=='s' || *suffix=='d'))
	return 0;
    /*
     * L at the start of a number, representing British pounds, like L500.
     * We know the current word is mixed. If the first letter is L, there
     * must be at least one digit following. If the remainder is itself a
     * queryable mixture we have a genuine error, else we accept it.
     */
    if (checkword[0]=='L' && !mixdigit(checkword+1))
	return 0;
    return 1;
}
2607 * Extracts the first/next "word" from the line, and puts
2608 * it into "thisword". A word is defined as one English word unit--or
2609 * at least that's the aim.
2611 * Returns: a pointer to the position in the line where we will start
2612 * looking for the next word.
2614 char *getaword(char *fromline,char *thisword)
2619 for (;!gcisdigit(*fromline) && !gcisalpha(*fromline) && *fromline;
2623 * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
2624 * Especially yucky is the case of L1,000
2625 * This section looks for a pattern of characters including a digit
2626 * followed by a comma or period followed by one or more digits.
2627 * If found, it returns this whole pattern as a word; otherwise we discard
2628 * the results and resume our normal programming.
2631 for (;(gcisdigit(*s) || gcisalpha(*s) || *s==',' || *s=='.') &&
2632 wordlen<MAXWORDLEN;s++)
2634 thisword[wordlen]=*s;
2637 thisword[wordlen]=0;
2638 for (i=1;i<wordlen-1;i++)
2640 if (thisword[i]=='.' || thisword[i]==',')
2642 if (gcisdigit(thisword[i-1]) && gcisdigit(thisword[i-1]))
2649 /* we didn't find a punctuated number - do the regular getword thing */
2651 for (;(gcisdigit(*fromline) || gcisalpha(*fromline) || *fromline=='\'') &&
2652 wordlen<MAXWORDLEN;fromline++)
2654 thisword[wordlen]=*fromline;
2657 thisword[wordlen]=0;
2664 * A case-insensitive string matcher.
/*
 * matchword:
 *
 * A case-insensitive string matcher.
 *
 * Returns: 1 if checkfor and thisword have the same length and are equal
 * when case is ignored, 0 otherwise. Note that this is NOT the strcmp()
 * convention.
 */
int matchword(char *checkfor,char *thisword)
{
    size_t i,len;
    len=strlen(checkfor);
    if (len!=strlen(thisword))
	return 0;
    for (i=0;i<len;i++)
    {
	/*
	 * toupper() is only defined for values representable as unsigned
	 * char (or EOF); 8-bit text would otherwise hand it negative
	 * values where plain char is signed.
	 */
	if (toupper((unsigned char)checkfor[i])!=
	  toupper((unsigned char)thisword[i]))
	    return 0;
    }
    return 1;
}
2681 * Lowercase the line.
/*
 * lowerit:
 *
 * Converts the unaccented capitals A-Z in theline to lower case, in
 * place. Every other byte, including accented 8-bit letters, is left
 * untouched.
 */
void lowerit(char *theline)
{
    char *p=theline;
    while (*p!='\0')
    {
	if ('A'<=*p && *p<='Z')
	    *p=(char)(*p+32);	/* 32 is the distance from 'A' to 'a' in ASCII */
	p++;
    }
}
2694 * Is this word a Roman Numeral?
2696 * It doesn't actually validate that the number is a valid Roman Numeral--for
2697 * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
2698 * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
2699 * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
2700 * expressions thereof, except when it came to taxes. Allow any number of M,
2701 * an optional D, an optional CM or CD, any number of optional Cs, an optional
2702 * XL or an optional XC, an optional IX or IV, an optional V and any number
/*
 * isroman: walks the string t through the shape of a Roman numeral in
 * order of decreasing value. Only lower-case letters are compared, so the
 * caller presumably passes an already lower-cased word - TODO confirm.
 * NOTE(review): the statements that advance t after each test, the tests
 * for single d, l and v, and the return statements are not present in
 * this listing; the comments below describe only the conditions shown.
 */
2705 int isroman(char *t)
/* run of m; the "&& *t" is redundant because *t=='m' already rules out
 * the terminating NUL (the same holds for the other runs below) */
2711 while (*t=='m' && *t)
/* two-letter subtractive pairs in the hundreds: cm, then cd */
2715 if (*t=='c' && t[1]=='m')
2717 if (*t=='c' && t[1]=='d')
/* run of c */
2719 while (*t=='c' && *t)
/* two-letter subtractive pairs in the tens: xl, then xc */
2721 if (*t=='x' && t[1]=='l')
2723 if (*t=='x' && t[1]=='c')
/* run of x */
2727 while (*t=='x' && *t)
/* two-letter subtractive pairs in the units: ix, then iv */
2729 if (*t=='i' && t[1]=='x')
2731 if (*t=='i' && t[1]=='v')
/* run of i */
2735 while (*t=='i' && *t)
2743 * A version of isalpha() that is somewhat lenient on 8-bit texts.
2744 * If we use the standard function, 8-bit accented characters break
2745 * words, so that tete with accented characters appears to be two words, "t"
2746 * and "t", with 8-bit characters between them. This causes over-reporting of
2747 * errors. gcisalpha() recognizes accented letters from the CP1252 (Windows)
2748 * and ISO-8859-1 character sets, which are the most common PG 8-bit types.
/*
 * gcisalpha:
 *
 * Lenient isalpha() for 8-bit text. Besides the ASCII letters it accepts
 * every code from 192 upwards except 208, 215, 222, 240, 247 and 254,
 * plus the five codes 140, 142, 156, 158 and 159 below that range.
 *
 * Returns: 1 if c counts as a letter, 0 otherwise.
 */
int gcisalpha(unsigned char c)
{
    if ((c>='a' && c<='z') || (c>='A' && c<='Z'))
	return 1;
    switch (c)
    {
    /* accepted codes below 192 */
    case 140: case 142: case 156: case 158: case 159:
	return 1;
    /* codes from 192 upwards that are excluded */
    case 208: case 215: case 222: case 240: case 247: case 254:
	return 0;
    default:
	return c>=192;
    }
}
2768 * A version of isdigit() that doesn't get confused in 8-bit texts.
/*
 * gcisdigit:
 *
 * A version of isdigit() that only ever accepts the ten characters
 * '0' to '9', whatever 8-bit bytes the text contains.
 *
 * Returns: 1 for a decimal digit, 0 otherwise.
 */
int gcisdigit(unsigned char c)
{
    if (c<'0')
	return 0;
    return c<='9';
}
2778 * A version of isletter() that doesn't get confused in 8-bit texts.
2779 * NB: this is ISO-8859-1-specific.
/*
 * gcisletter:
 *
 * Treats the ASCII letters and every code from 192 upwards as a letter.
 *
 * Returns: 1 if c is a letter by that rule, 0 otherwise.
 */
int gcisletter(unsigned char c)
{
    if (c>=192)
	return 1;
    if (c>='a' && c<='z')
	return 1;
    return c>='A' && c<='Z';
}
2789 * Wraps strchr to return NULL if the character being searched for is zero.
/*
 * gcstrchr:
 *
 * strchr() with one difference: searching for the NUL character yields
 * NULL instead of a pointer to the string's terminator.
 *
 * Returns: a pointer to the first occurrence of c in s, or NULL if c is
 * zero or does not occur.
 */
char *gcstrchr(char *s,char c)
{
    return c ? strchr(s,c) : NULL;
}
2799 * postprocess_for_DP:
2801 * Invoked with the -d switch from flgets().
2802 * It simply "removes" from the line a hard-coded set of common
2803 * DP-specific tags, so that the line passed to the main routine has
2804 * been pre-cleaned of DP markup.
/*
 * postprocess_for_DP: searches theline for each string in the DPmarkup
 * table in turn.
 * NOTE(review): the statements that actually remove a tag from the line
 * are not present in this listing; only the search and the set-up of the
 * two working pointers are visible.
 */
2806 void postprocess_for_DP(char *theline)
/* the table is walked until an entry whose first character is NUL */
2812 for (i=0;*DPmarkup[i];i++)
/* first occurrence of this tag in the line, if any */
2814 s=strstr(theline,DPmarkup[i]);
/* t is set to the first character after the tag that was found */
2817 t=s+strlen(DPmarkup[i]);
/* search again from the start of the line for the same tag */
2825 s=strstr(theline,DPmarkup[i]);
2831 * postprocess_for_HTML:
2833 * Invoked with the -m switch from flgets().
2834 * It simply "removes" from the line a hard-coded set of common
2835 * HTML tags and "replaces" a hard-coded set of common HTML
2836 * entities, so that the line passed to the main routine has
2837 * been pre-cleaned of HTML.
/*
 * postprocess_for_HTML:
 *
 * Cleans HTML out of theline in place. Tags are only looked for when the
 * line contains both a '<' and a '>'; losemarkup() is then called
 * repeatedly until it returns NULL. After that loseentities() is called
 * repeatedly, again until it returns NULL.
 */
void postprocess_for_HTML(char *theline)
{
    int may_have_tag;
    may_have_tag=strchr(theline,'<')!=NULL && strchr(theline,'>')!=NULL;
    if (may_have_tag)
    {
	while (losemarkup(theline)!=NULL)
	    continue;
    }
    while (loseentities(theline)!=NULL)
	continue;
}
/*
 * losemarkup: locates the first '<' and the first '>' in theline and
 * compares the text following the '<' against each entry of the markup
 * table with tagcomp().
 * NOTE(review): the statements that edit the line and the return values
 * are not present in this listing. postprocess_for_HTML() keeps calling
 * this function until it returns NULL.
 */
2848 char *losemarkup(char *theline)
/* s: first '<' in the line; t: first '>' in the line. Both searches
 * start at the beginning of the line. */
2854 s=strstr(theline,"<");
2855 t=strstr(theline,">");
/* the table is walked until an entry whose first character is NUL */
2858 for (i=0;*markup[i];i++)
/* a zero result from tagcomp() selects this entry - presumably zero
 * means "same tag"; verify against tagcomp() */
2859 if (!tagcomp(s+1,markup[i]))
2872 /* It's an unrecognized <xxx>. */
/*
 * loseentities: looks in theline for each entity of the entities table,
 * first by its htmlent spelling and then by its htmlnum spelling, and
 * writes the entry's textent over the place where the entity was found.
 * NOTE(review): the tests on s and on the malloc() result, the statement
 * that puts the saved tail back, the release of t and the return
 * statements are not present in this listing. postprocess_for_HTML()
 * keeps calling this function until it returns NULL.
 */
2876 char *loseentities(char *theline)
/* first pass: htmlent spellings; stops at an entry whose htmlent starts
 * with NUL */
2882 for (i=0;*entities[i].htmlent;i++)
2884 s=strstr(theline,entities[i].htmlent);
/* strlen(s) bytes are enough for the tail copied below plus its NUL,
 * because the loop condition guarantees a non-empty entity string */
2887 t=malloc((size_t)strlen(s));
/* save everything that follows the entity ... */
2890 strcpy(t,s+strlen(entities[i].htmlent));
/* ... and write the replacement text where the entity started.
 * NOTE(review): if textent were longer than the entity the line would
 * grow; confirm the line buffer has room for that. */
2891 strcpy(s,entities[i].textent);
/* second pass: the same steps for the htmlnum spellings */
2897 for (i=0;*entities[i].htmlnum;i++)
2899 s=strstr(theline,entities[i].htmlnum);
2902 t=malloc((size_t)strlen(s));
2905 strcpy(t,s+strlen(entities[i].htmlnum));
2906 strcpy(s,entities[i].textent);
/*
 * tagcomp: compares strin with basetag character by character, ignoring
 * case by way of tolower().
 * NOTE(review): the declarations, the loop and the return statements are
 * not present in this listing. losemarkup() tests the result with "!", so
 * zero presumably means the tag matched - TODO confirm.
 * NOTE(review): tolower() receives the dereferenced pointers directly; if
 * these are plain char and a byte of 0x80 or above can reach this point,
 * a cast to unsigned char would be needed - confirm against the
 * declarations of s and t.
 */
2915 int tagcomp(char *strin,char *basetag)
2921 t++; /* ignore a slash */
2924 if (tolower(*s)!=tolower(*t))
2934 fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
2935 fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
2936 fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
2937 fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
2938 "For details, read the file COPYING.\n",stderr);
2939 fputs("This is Free Software; "
2940 "you may redistribute it under certain conditions (GPL);\n",stderr);
2941 fputs("read the file COPYING for details.\n\n",stderr);
2942 fputs("Usage is: bookloupe [-setpxloyhud] filename\n",stderr);
2943 fputs(" where -s checks single quotes, -e suppresses echoing lines, "
2944 "-t checks typos\n",stderr);
2945 fputs(" -x (paranoid) switches OFF -t and extra checks, "
2946 "-l turns OFF line-end checks\n",stderr);
2947 fputs(" -o just displays overview without detail, "
2948 "-h echoes header fields\n",stderr);
2949 fputs(" -v (verbose) unsuppresses duplicate reporting, "
2950 "-m suppresses markup\n",stderr);
2951 fputs(" -d ignores DP-specific markup,\n",stderr);
2952 fputs(" -u uses a file gutcheck.typ to query user-defined "
2953 "possible typos\n",stderr);
2954 fputs("Sample usage: bookloupe warpeace.txt \n",stderr);
2956 fputs("Bookloupe looks for errors in Project Gutenberg(TM) etexts.\n",
2958 fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
2959 "non-ASCII\n",stderr);
2960 fputs("characters like accented letters, "
2961 "lines longer than 75 or shorter than 55,\n",stderr);
2962 fputs("unbalanced quotes or brackets, "
2963 "a variety of badly formatted punctuation, \n",stderr);
2964 fputs("HTML tags, some likely typos. "
2965 "It is NOT a substitute for human judgement.\n",stderr);