1 /*************************************************************************/
2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com> */
5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk> */
7 /* This program is free software; you can redistribute it and/or modify */
8 /* it under the terms of the GNU General Public License as published by */
9 /* the Free Software Foundation; either version 2 of the License, or */
10 /* (at your option) any later version. */
12 /* This program is distributed in the hope that it will be useful, */
13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
15 /* GNU General Public License for more details. */
17 /* You should have received a copy of the GNU General Public License */
18 /* along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 /*************************************************************************/
/* Size limits and the shared line buffers used by the checker. */
26 #define MAXWORDLEN 80 /* max length of one word */
27 #define LINEBUFSIZE 2048 /* buffer size for an input line */
/* Upper bound on the number of entries kept in usertypo[]. */
29 #define MAX_USER_TYPOS 1000
/* Name of the user typo list; main() tries the current directory first, then the running_from prefix. */
30 #define USERTYPO_FILE "gutcheck.typ"
/* Capacity of the path buffers running_from and usertypo_file. */
33 #define MAX_PATH 16384
/* The line currently being examined; filled by fgets() and flgets(). */
36 char aline[LINEBUFSIZE];
/* NOTE(review): presumably the previously examined line -- only its reset in procfile() is visible here. */
37 char prevline[LINEBUFSIZE];
41 "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
42 "nad", "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa",
43 "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
44 "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
45 "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
46 "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
47 "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
48 "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
49 "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
50 "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
51 "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
52 "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
53 "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
54 "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
55 "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
56 "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
57 "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
58 "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
59 "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
60 "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
61 "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
62 "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
63 "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
64 "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
65 "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
66 "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
67 "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
68 "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
69 "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
73 char *usertypo[MAX_USER_TYPOS];
75 /* Common abbreviations and other OK words not to query as typos. */
77 "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
78 "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
79 "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
80 "outbid", "outbids", "frostbite", "frostbitten", ""
83 /* Common abbreviations that cause otherwise unexplained periods. */
85 "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
86 "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
90 * Two-Letter combinations that rarely if ever start words,
91 * but are common scannos or otherwise common letter combinations.
94 "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
98 * Two-Letter combinations that rarely if ever end words,
99 * but are common scannos or otherwise common letter combinations.
102 "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
103 "sw", "gr", "sl", "cl", "iy", ""
107 "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
108 "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
109 "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
110 "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
114 "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
118 "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
119 "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
120 "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
121 "during", "let", "toward", "among", ""
125 "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
126 "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
127 "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
128 "among", "those", "into", "whom", "having", "thence", ""
131 char vowels[] = "aeiouàáâãäæèéêëìíîïòóôõöùúûü";
138 "&", "&", "&",
139 "<", "<", "<",
140 ">", ">", ">",
141 "°", "°", " degrees",
142 "£", "£", "L",
143 """, """, "\"", /* quotation mark = APL quote */
144 "Œ", "Œ", "OE", /* latin capital ligature OE */
145 "œ", "œ", "oe", /* latin small ligature oe */
146 "Š", "Š", "S", /* latin capital letter S with caron */
147 "š", "š", "s", /* latin small letter s with caron */
148 "Ÿ", "Ÿ", "Y", /* latin capital letter Y with diaeresis */
149 "ˆ", "ˆ", "", /* modifier letter circumflex accent */
150 "˜", "˜", "~", /* small tilde, U+02DC ISOdia */
151 " ", " ", " ", /* en space, U+2002 ISOpub */
152 " ", " ", " ", /* em space, U+2003 ISOpub */
153 " ", " ", " ", /* thin space, U+2009 ISOpub */
154 "–", "–", "-", /* en dash, U+2013 ISOpub */
155 "—", "—", "--", /* em dash, U+2014 ISOpub */
156 "’", "’", "'", /* right single quotation mark */
157 "‚", "‚", "'", /* single low-9 quotation mark */
158 "“", "“", "\"", /* left double quotation mark */
159 "”", "”", "\"", /* right double quotation mark */
160 "„", "„", "\"", /* double low-9 quotation mark */
161 "‹", "‹", "\"", /* single left-pointing angle quotation mark */
162 "›", "›", "\"", /* single right-pointing angle quotation mark */
163 " ", " ", " ", /* no-break space = non-breaking space, */
164 "¡", "¡", "!", /* inverted exclamation mark */
165 "¢", "¢", "c", /* cent sign */
166 "£", "£", "L", /* pound sign */
167 "¤", "¤", "$", /* currency sign */
168 "¥", "¥", "Y", /* yen sign = yuan sign */
169 "§", "§", "--", /* section sign */
170 "¨", "¨", " ", /* diaeresis = spacing diaeresis */
171 "©", "©", "(C) ", /* copyright sign */
172 "ª", "ª", " ", /* feminine ordinal indicator */
173 "«", "«", "\"", /* left-pointing double angle quotation mark */
174 "­", "­", "-", /* soft hyphen = discretionary hyphen */
175 "®", "®", "(R) ", /* registered sign = registered trade mark sign */
176 "¯", "¯", " ", /* macron = spacing macron = overline */
177 "°", "°", " degrees", /* degree sign */
178 "±", "±", "+-", /* plus-minus sign = plus-or-minus sign */
179 "²", "²", "2", /* superscript two = superscript digit two */
180 "³", "³", "3", /* superscript three = superscript digit three */
181 "´", "´", " ", /* acute accent = spacing acute */
182 "µ", "µ", "m", /* micro sign */
183 "¶", "¶", "--", /* pilcrow sign = paragraph sign */
184 "¸", "¸", " ", /* cedilla = spacing cedilla */
185 "¹", "¹", "1", /* superscript one = superscript digit one */
186 "º", "º", " ", /* masculine ordinal indicator */
187 "»", "»", "\"", /* right-pointing double angle quotation mark */
188 "¼", "¼", "1/4", /* vulgar fraction one quarter */
189 "½", "½", "1/2", /* vulgar fraction one half */
190 "¾", "¾", "3/4", /* vulgar fraction three quarters */
191 "¿", "¿", "?", /* inverted question mark */
192 "À", "À", "A", /* latin capital letter A with grave */
193 "Á", "Á", "A", /* latin capital letter A with acute */
194 "Â", "Â", "A", /* latin capital letter A with circumflex */
195 "Ã", "Ã", "A", /* latin capital letter A with tilde */
196 "Ä", "Ä", "A", /* latin capital letter A with diaeresis */
197 "Å", "Å", "A", /* latin capital letter A with ring above */
198 "Æ", "Æ", "AE", /* latin capital letter AE */
199 "Ç", "Ç", "C", /* latin capital letter C with cedilla */
200 "È", "È", "E", /* latin capital letter E with grave */
201 "É", "É", "E", /* latin capital letter E with acute */
202 "Ê", "Ê", "E", /* latin capital letter E with circumflex */
203 "Ë", "Ë", "E", /* latin capital letter E with diaeresis */
204 "Ì", "Ì", "I", /* latin capital letter I with grave */
205 "Í", "Í", "I", /* latin capital letter I with acute */
206 "Î", "Î", "I", /* latin capital letter I with circumflex */
207 "Ï", "Ï", "I", /* latin capital letter I with diaeresis */
208 "Ð", "Ð", "E", /* latin capital letter ETH */
209 "Ñ", "Ñ", "N", /* latin capital letter N with tilde */
210 "Ò", "Ò", "O", /* latin capital letter O with grave */
211 "Ó", "Ó", "O", /* latin capital letter O with acute */
212 "Ô", "Ô", "O", /* latin capital letter O with circumflex */
213 "Õ", "Õ", "O", /* latin capital letter O with tilde */
214 "Ö", "Ö", "O", /* latin capital letter O with diaeresis */
215 "×", "×", "*", /* multiplication sign */
216 "Ø", "Ø", "O", /* latin capital letter O with stroke */
217 "Ù", "Ù", "U", /* latin capital letter U with grave */
218 "Ú", "Ú", "U", /* latin capital letter U with acute */
219 "Û", "Û", "U", /* latin capital letter U with circumflex */
220 "Ü", "Ü", "U", /* latin capital letter U with diaeresis */
221 "Ý", "Ý", "Y", /* latin capital letter Y with acute */
222 "Þ", "Þ", "TH", /* latin capital letter THORN */
223 "ß", "ß", "sz", /* latin small letter sharp s = ess-zed */
224 "à", "à", "a", /* latin small letter a with grave */
225 "á", "á", "a", /* latin small letter a with acute */
226 "â", "â", "a", /* latin small letter a with circumflex */
227 "ã", "ã", "a", /* latin small letter a with tilde */
228 "ä", "ä", "a", /* latin small letter a with diaeresis */
229 "å", "å", "a", /* latin small letter a with ring above */
230 "æ", "æ", "ae", /* latin small letter ae */
231 "ç", "ç", "c", /* latin small letter c with cedilla */
232 "è", "è", "e", /* latin small letter e with grave */
233 "é", "é", "e", /* latin small letter e with acute */
234 "ê", "ê", "e", /* latin small letter e with circumflex */
235 "ë", "ë", "e", /* latin small letter e with diaeresis */
236 "ì", "ì", "i", /* latin small letter i with grave */
237 "í", "í", "i", /* latin small letter i with acute */
238 "î", "î", "i", /* latin small letter i with circumflex */
239 "ï", "ï", "i", /* latin small letter i with diaeresis */
240 "ð", "ð", "eth", /* latin small letter eth */
241 "ñ", "ñ", "n", /* latin small letter n with tilde */
242 "ò", "ò", "o", /* latin small letter o with grave */
243 "ó", "ó", "o", /* latin small letter o with acute */
244 "ô", "ô", "o", /* latin small letter o with circumflex */
245 "õ", "õ", "o", /* latin small letter o with tilde */
246 "ö", "ö", "o", /* latin small letter o with diaeresis */
247 "÷", "÷", "/", /* division sign */
248 "ø", "ø", "o", /* latin small letter o with stroke */
249 "ù", "ù", "u", /* latin small letter u with grave */
250 "ú", "ú", "u", /* latin small letter u with acute */
251 "û", "û", "u", /* latin small letter u with circumflex */
252 "ü", "ü", "u", /* latin small letter u with diaeresis */
253 "ý", "ý", "y", /* latin small letter y with acute */
254 "þ", "þ", "th", /* latin small letter thorn */
255 "ÿ", "ÿ", "y", /* latin small letter y with diaeresis */
259 /* special characters; this first group is spelled as ASCII code points */
/* NOTE(review): CHAR_TAB is used by check_for_odd_characters() but its definition is not visible in this excerpt. */
260 #define CHAR_SPACE 32
264 #define CHAR_DQUOTE 34
265 #define CHAR_SQUOTE 39
266 #define CHAR_OPEN_SQUOTE 96
267 #define CHAR_TILDE 126
268 #define CHAR_ASTERISK 42
269 #define CHAR_FORESLASH 47
270 #define CHAR_CARAT 94
/* The remaining characters are spelled as character literals. */
272 #define CHAR_UNDERSCORE '_'
273 #define CHAR_OPEN_CBRACK '{'
274 #define CHAR_CLOSE_CBRACK '}'
275 #define CHAR_OPEN_RBRACK '('
276 #define CHAR_CLOSE_RBRACK ')'
277 #define CHAR_OPEN_SBRACK '['
278 #define CHAR_CLOSE_SBRACK ']'
280 /* longest and shortest normal PG line lengths */
/* first_pass() counts lines above LONGEST_PG_LINE as long and above WAY_TOO_LONG as very long. */
281 #define LONGEST_PG_LINE 75
282 #define WAY_TOO_LONG 80
283 #define SHORTEST_PG_LINE 55
/*
 * Command-line switches.  main() compares each switch character,
 * upper-cased, with SWITCHES[i]; the *_SWITCH constants below follow
 * the same ordering (E=0, S=1, T=2, ... V=10, M=11, U=12) and are used
 * as indices into pswit[].
 * NOTE(review): no constants are visible here for positions 9 (W) and
 * 13 (D), although main() refers to WEB_SWITCH -- presumably they are
 * defined on lines missing from this excerpt.
 */
285 #define SWITCHES "ESTPXLOYHWVMUD" /* switches:- */
286 /* D - ignore DP-specific markup */
287 /* E - echo queried line */
288 /* S - check single quotes */
289 /* T - check common typos */
290 /* P - require closure of quotes on */
291 /* every paragraph */
292 /* X - "Trust no one" :-) Paranoid! */
293 /* Queries everything */
294 /* L - line end checking defaults on */
295 /* -L turns it off */
296 /* O - overview. Just shows counts. */
297 /* Y - puts errors to stdout */
298 /* instead of stderr */
299 /* H - Echoes header fields */
300 /* M - Ignore markup in < > */
301 /* U - Use file of User-defined Typos*/
302 /* W - Defaults for use on Web upload*/
303 /* V - Verbose - list EVERYTHING! */
/* SWITNO must stay equal to the number of characters in SWITCHES (14). */
304 #define SWITNO 14 /* max number of switch parms */
305 /* - used for defining array-size */
/* main() rejects any count of non-switch arguments outside MINARGS..MAXARGS. */
306 #define MINARGS 1 /* minimum no of args excl switches */
307 #define MAXARGS 1 /* maximum no of args excl switches */
309 int pswit[SWITNO]; /* program switches set by SWITCHES */
/* Indices into pswit[]. */
311 #define ECHO_SWITCH 0
312 #define SQUOTE_SWITCH 1
313 #define TYPO_SWITCH 2
314 #define QPARA_SWITCH 3
315 #define PARANOID_SWITCH 4
316 #define LINE_END_SWITCH 5
317 #define OVERVIEW_SWITCH 6
318 #define STDOUT_SWITCH 7
319 #define HEADER_SWITCH 8
321 #define VERBOSE_SWITCH 10
322 #define MARKUP_SWITCH 11
323 #define USERTYPO_SWITCH 12
/*
 * Per-category query counters.  main() prints them when
 * OVERVIEW_SWITCH is set.  The TOTAL QUERIES figure printed there sums
 * the twelve counters from cnt_dquot to cnt_lineend; cnt_spacend is
 * not part of that sum.
 */
326 long cnt_dquot; /* for overview mode, count of doublequote queries */
327 long cnt_squot; /* for overview mode, count of singlequote queries */
328 long cnt_brack; /* for overview mode, count of brackets queries */
329 long cnt_bin; /* for overview mode, count of non-ASCII queries */
330 long cnt_odd; /* for overview mode, count of odd character queries */
331 long cnt_long; /* for overview mode, count of long line errors */
332 long cnt_short; /* for overview mode, count of short line queries */
333 long cnt_punct; /* for overview mode, count of punctuation and spacing queries */
334 long cnt_dash; /* for overview mode, count of dash-related queries */
335 long cnt_word; /* for overview mode, count of word queries */
336 long cnt_html; /* for overview mode, count of html queries */
337 long cnt_lineend; /* for overview mode, count of line-end queries */
338 long cnt_spacend; /* count of lines with space at end */
339 long linecnt; /* count of total lines in the file */
340 long checked_linecnt; /* count of lines actually checked */
/* Forward declaration of the per-file driver, defined further down. */
343 void procfile(char *);
/* NOTE(review): no use of the following five constants is visible in this excerpt. */
345 #define LOW_THRESHOLD 0
346 #define HIGH_THRESHOLD 1
352 #define FIRST_OF_PAIR 0
353 #define SECOND_OF_PAIR 1
355 #define MAX_WORDPAIR 1000
/* Receives a copy of argv[0] in main(), which then scans it for the last path separator. */
357 char running_from[MAX_PATH];
/* Helper prototypes; the definitions are not part of this excerpt. */
359 int mixdigit(char *);
360 char *getaword(char *,char *);
361 int matchword(char *,char *);
362 char *flgets(char *,int,FILE *,long);
363 void lowerit(char *);
364 int gcisalpha(unsigned char);
365 int gcisdigit(unsigned char);
366 int gcisletter(unsigned char);
367 char *gcstrchr(char *s,char c);
368 void postprocess_for_HTML(char *);
369 char *linehasmarkup(char *);
370 char *losemarkup(char *);
371 int tagcomp(char *,char *);
372 char *loseentities(char *);
375 void postprocess_for_DP(char *);
/* Line-sized scratch buffer. NOTE(review): its use is not visible in this excerpt. */
377 char wrk[LINEBUFSIZE];
/* NOTE(review): MAX_QWORD sizes the arrays below but its definition is not visible in this excerpt. */
380 #define MAX_QWORD_LENGTH 40
381 char qword[MAX_QWORD][MAX_QWORD_LENGTH];
382 char qperiod[MAX_QWORD][MAX_QWORD_LENGTH];
383 signed int dupcnt[MAX_QWORD];
/*
 * main: program entry point.
 *
 * argc, argv - the usual command line; switches come first, each
 *              introduced by a dash, followed by the file argument.
 *
 * Copies argv[0] into running_from, parses the switches into pswit[],
 * applies the switches whose sense is inverted, optionally loads the
 * user typo list into usertypo[], and prints the per-category totals
 * when OVERVIEW_SWITCH is set.
 * NOTE(review): the call that processes the input file and the return
 * statements are not visible in this excerpt.
 */
385 int main(int argc,char **argv)
389 char usertypo_file[MAX_PATH];
/* argv[0] is copied only when it fits, so this strcpy cannot overflow running_from. */
391 if (strlen(argv[0])<sizeof(running_from))
392 /* save the path to the executable */
393 strcpy(running_from,argv[0]);
394 /* find out what directory we're running from */
395 s=running_from+strlen(running_from);
/*
 * Walk backwards towards the last path separator.
 * NOTE(review): *s is tested before s>=running_from, so when the path
 * holds no separator the loop reads the byte in front of the buffer
 * before stopping; the bounds test should come first.
 */
396 for (;*s!='/' && *s!='\\' && s>=running_from;s--)
398 switno=strlen(SWITCHES);
/* NOTE(review): --i>0 stops before index 0, so pswit[0] relies on its static zero initialisation. */
399 for (i=switno;--i>0;)
400 pswit[i]=0; /* initialise switches */
402 * Standard loop to extract switches.
403 * When we come out of this loop, the arguments will be
404 * in argv[0] upwards and the switches used will be
405 * represented by their equivalent elements in pswit[]
/* Every character after the dash is upper-cased and compared with each entry of SWITCHES. */
407 while (--argc>0 && **++argv=='-')
408 for (argsw=argv[0]+1;*argsw!='\0';argsw++)
409 for (i=switno,invarg=1;(--i>=0) && invarg==1;)
410 if ((toupper(*argsw))==SWITCHES[i])
415 /* Paranoid checking is turned OFF, not on, by its switch */
416 pswit[PARANOID_SWITCH]^=1;
417 if (pswit[PARANOID_SWITCH])
418 /* if running in paranoid mode force typo checks as well */
/* NOTE(review): the XOR below toggles TYPO_SWITCH rather than setting it; confirm that is the intent. */
419 pswit[TYPO_SWITCH]=pswit[TYPO_SWITCH]^1;
420 /* Line-end checking is turned OFF, not on, by its switch */
421 pswit[LINE_END_SWITCH]^=1;
422 /* Echoing is turned OFF, not on, by its switch */
423 pswit[ECHO_SWITCH]^=1;
424 if (pswit[OVERVIEW_SWITCH])
425 /* just print summary; don't echo */
426 pswit[ECHO_SWITCH]=0;
428 * Web uploads - for the moment, this is really just a placeholder
429 * until we decide what processing we really want to do on web uploads
431 if (pswit[WEB_SWITCH])
433 /* specific override for web uploads */
434 pswit[ECHO_SWITCH]=1;
435 pswit[SQUOTE_SWITCH]=0;
436 pswit[TYPO_SWITCH]=1;
437 pswit[QPARA_SWITCH]=0;
438 pswit[PARANOID_SWITCH]=1;
439 pswit[LINE_END_SWITCH]=0;
440 pswit[OVERVIEW_SWITCH]=0;
441 pswit[STDOUT_SWITCH]=0;
442 pswit[HEADER_SWITCH]=1;
443 pswit[VERBOSE_SWITCH]=0;
444 pswit[MARKUP_SWITCH]=0;
445 pswit[USERTYPO_SWITCH]=0;
/* After switch parsing argc counts only the remaining non-switch arguments. */
448 if (argc<MINARGS || argc>MAXARGS)
450 /* check number of args */
454 /* read in the user-defined stealth scanno list */
455 if (pswit[USERTYPO_SWITCH])
457 /* ... we were told we had one! */
458 usertypofile=fopen(USERTYPO_FILE,"rb");
461 /* not in cwd. try executable directory. */
/*
 * NOTE(review): strcpy and strcat are unbounded here; running_from can
 * hold up to MAX_PATH-1 characters, so appending the file name could
 * overflow usertypo_file for a very long path.
 */
462 strcpy(usertypo_file,running_from);
463 strcat(usertypo_file,USERTYPO_FILE);
464 usertypofile=fopen(usertypo_file,"rb");
466 /* we ain't got no user typo file! */
467 printf(" --> I couldn't find gutcheck.typ "
468 "-- proceeding without user typos.\n");
474 /* we managed to open a User Typo File! */
475 if (pswit[USERTYPO_SWITCH])
/* Each line read is copied into freshly allocated storage and stored in usertypo[]. */
477 while (flgets(aline,LINEBUFSIZE-1,usertypofile,
478 (long)usertypo_count))
484 s=malloc(strlen(aline)+1);
487 fprintf(stderr,"bookloupe: cannot get enough "
488 "memory for user typo file!\n");
492 usertypo[usertypo_count]=s;
/* Stop loading once the table is full. */
494 if (usertypo_count>=MAX_USER_TYPOS)
496 printf(" --> Only %d user-defined typos "
497 "allowed: ignoring the rest\n",
505 fclose(usertypofile);
508 fprintf(stderr,"bookloupe: Check and report on an e-text\n");
/* Reset all query counters (the end of this chained assignment is not visible here). */
509 cnt_dquot=cnt_squot=cnt_brack=cnt_bin=cnt_odd=cnt_long=
510 cnt_short=cnt_punct=cnt_dash=cnt_word=cnt_html=cnt_lineend=
/* Overview mode: print one total per query category, then the grand total. */
513 if (pswit[OVERVIEW_SWITCH])
515 printf(" Checked %ld lines of %ld (head+foot = %ld)\n\n",
516 checked_linecnt,linecnt,linecnt-checked_linecnt);
517 printf(" --------------- Queries found --------------\n");
519 printf(" Long lines: %14ld\n",cnt_long);
521 printf(" Short lines: %14ld\n",cnt_short);
523 printf(" Line-end problems: %14ld\n",cnt_lineend);
525 printf(" Common typos: %14ld\n",cnt_word);
527 printf(" Unmatched quotes: %14ld\n",cnt_dquot);
529 printf(" Unmatched SingleQuotes: %14ld\n",cnt_squot);
531 printf(" Unmatched brackets: %14ld\n",cnt_brack);
533 printf(" Non-ASCII characters: %14ld\n",cnt_bin);
535 printf(" Proofing characters: %14ld\n",cnt_odd);
537 printf(" Punctuation & spacing queries: %14ld\n",cnt_punct);
539 printf(" Non-standard dashes: %14ld\n",cnt_dash);
541 printf(" Possible HTML tags: %14ld\n",cnt_html);
543 printf(" TOTAL QUERIES %14ld\n",
544 cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+
545 cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend);
/*
 * Statistics gathered by first_pass() over the whole input file and
 * consumed by report_first_pass().  firstline and footerline record
 * the line numbers where the header ends and the footer starts (0 when
 * not found); the remaining members are running totals.
 */
550 struct first_pass_results {
551 long firstline,astline;
552 long footerline,totlen,binlen,alphalen,endquote_count,shortline,dotcomma;
553 long fslashline,hyphens,longline,verylongline,htmcount,standalone_digit;
554 long spacedash,emdash,space_emdash,non_PG_space_emdash,PG_space_emdash;
/* Hit counts for the Dutch (hij, niet) and French (dans, avec) marker words. */
555 signed int Dutchcount,Frenchcount;
561 * Run a first pass - verify that it's a valid PG
562 * file, decide whether to report some things that
563 * occur many times in the text like long or short
564 * lines, non-standard dashes, etc.
/*
 * first_pass: read infile once and gather whole-file statistics.
 *
 * infile - open stream to read; each line is read into the global
 *          buffer aline.
 *
 * Totals accumulate in the function-local static struct results.
 * NOTE(review): the return statement is not visible in this excerpt;
 * presumably a pointer to that static struct is returned, in which
 * case the totals would also persist across calls unless reset on a
 * line not shown here.
 */
566 struct first_pass_results *first_pass(FILE *infile)
568 char laststart=CHAR_SPACE,*s;
570 unsigned int lastlen=0,lastblen=0;
/* Line numbers following the old-style (spline) and new-style (nspline) header markers. */
571 long spline=0,nspline=0;
572 static struct first_pass_results results={0};
573 char inword[MAXWORDLEN]="";
574 while (fgets(aline,LINEBUFSIZE-1,infile))
/*
 * Strip trailing LF (10) and CR (13) characters.
 * NOTE(review): once a blank line has been stripped to the empty
 * string, strlen(aline)-1 wraps around and the loop condition reads
 * outside the buffer; an explicit length check is needed.
 */
576 while (aline[strlen(aline)-1]==10 || aline[strlen(aline)-1]==13)
577 aline[strlen(aline)-1]=0;
/* Old-style header terminator. */
579 if (strstr(aline,"*END") && strstr(aline,"SMALL PRINT") &&
580 (strstr(aline,"PUBLIC DOMAIN") || strstr(aline,"COPYRIGHT")))
583 printf(" --> Duplicate header?\n");
584 spline=linecnt+1; /* first line of non-header text, that is */
/* New-style header terminator. */
586 if (!strncmp(aline,"*** START",9) && strstr(aline,"PROJECT GUTENBERG"))
589 printf(" --> Duplicate header?\n");
590 nspline=linecnt+1; /* first line of non-header text, that is */
/* Footer detection only applies once a header has been seen. */
592 if (spline || nspline)
/* NOTE(review): these lower-case searches presumably run on a lower-cased copy of the line made on a line not visible here. */
595 if (strstr(aline,"end") && strstr(aline,"project gutenberg"))
597 if (strstr(aline,"end")<strstr(aline,"project gutenberg"))
599 if (results.footerline)
601 /* it's an old-form header - we can detect duplicates */
603 printf(" --> Duplicate footer?\n");
606 results.footerline=linecnt;
611 results.firstline=spline;
613 results.firstline=nspline; /* override with new */
614 if (results.footerline)
615 continue; /* don't count the boilerplate in the footer */
617 results.totlen+=llen;
/* Per-character tallies: bytes above 127, alphabetic characters, unpunctuated endquotes. */
620 if ((unsigned char)aline[i]>127)
622 if (gcisalpha(aline[i]))
/* NOTE(review): isalpha() is handed a plain char; a negative value is undefined behaviour, so a cast to unsigned char is advisable. */
624 if (i>0 && aline[i]==CHAR_DQUOTE && isalpha(aline[i-1]))
625 results.endquote_count++;
/* Short-line test based on the lengths of the two preceding lines; lastblen>2 is implied by lastblen>SHORTEST_PG_LINE. */
627 if (strlen(aline)>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE &&
628 lastblen>2 && lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
/* Line ends in a space or control character. */
630 if (*aline && (unsigned char)aline[strlen(aline)-1]<=CHAR_SPACE)
632 if (strstr(aline,".,"))
634 /* only count asterisk lines for ignoring purposes where there is */
635 /* lowercase text on the line */
636 if (strstr(aline,"*"))
639 if (*s>='a' && *s<='z')
644 if (strstr(aline,"/"))
645 results.fslashline++;
/* Skip trailing white space, then look for a single hyphen ending the line. */
646 for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--)
648 if (aline[i]=='-' && aline[i-1]!='-')
650 if (llen>LONGEST_PG_LINE)
652 if (llen>WAY_TOO_LONG)
653 results.verylongline++;
/* HTML heuristic: distance from the first opening to the first closing angle bracket on the line. */
654 if (strstr(aline,"<") && strstr(aline,">"))
656 i=(signed int)(strstr(aline,">")-strstr(aline,"<")+1);
659 if (strstr(aline,"<i>"))
660 results.htmcount+=4; /* bonus marks! */
662 /* Check for spaced em-dashes */
663 if (strstr(aline,"--"))
/*
 * NOTE(review): when the line starts with the double dash,
 * strstr(...)-1 points before aline and is dereferenced, unless that
 * case is excluded on a line not visible here.
 */
666 if (*(strstr(aline,"--")-1)==CHAR_SPACE ||
667 (*(strstr(aline,"--")+2)==CHAR_SPACE))
668 results.space_emdash++;
669 if (*(strstr(aline,"--")-1)==CHAR_SPACE &&
670 (*(strstr(aline,"--")+2)==CHAR_SPACE))
671 /* count of em-dashes with spaces both sides */
672 results.non_PG_space_emdash++;
673 if (*(strstr(aline,"--")-1)!=CHAR_SPACE &&
674 (*(strstr(aline,"--")+2)!=CHAR_SPACE))
675 /* count of PG-type em-dashes with no spaces */
676 results.PG_space_emdash++;
/* Word-by-word scan: language marker words and standalone 0 or 1. */
680 s=getaword(s,inword);
681 if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))
682 results.Dutchcount++;
683 if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))
684 results.Frenchcount++;
685 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
686 results.standalone_digit++;
688 /* Check for spaced dashes */
689 if (strstr(aline," -") && *(strstr(aline," -")+2)!='-')
/* Remember this line length for the short-line test on the next line. */
692 lastlen=strlen(aline);
699 signed int shortline,longline,bin,dash,dotcomma,ast,fslash,digit,hyphen;
700 signed int endquote,isDutch,isFrench;
706 * Make some snap decisions based on the first pass results.
/*
 * report_first_pass: turn the first-pass statistics into reporting
 * decisions.
 *
 * results - totals gathered by first_pass(); results->firstline is
 *           reset to 0 here when header and footer are both present
 *           but fewer than 100 lines apart.
 *
 * For each category whose count exceeds its threshold an explanatory
 * note is printed.  The decisions are kept in the function-local
 * static struct warnings; only the shortline flag assignments are
 * visible in this excerpt, the other flag assignments are on lines not
 * shown.  pswit[MARKUP_SWITCH] is switched on when htmcount exceeds 20
 * and it was not already set.
 *
 * NOTE(review): the return statement and the actions following the
 * two Terminating messages are not visible in this excerpt; presumably
 * a pointer to the static struct is returned.
 * NOTE(review): the standalone-digit comment below says 15 lines but
 * the test is standalone_digit>10; one of the two is out of date.
 */
708 struct warnings *report_first_pass(struct first_pass_results *results)
710 static struct warnings warnings={0};
712 printf(" --> %ld lines in this file have white space at end\n",
715 if (results->dotcomma>5)
718 printf(" --> %ld lines in this file contain '.,'. "
719 "Not reporting them.\n",results->dotcomma);
722 * If more than 50 lines, or one-tenth, are short,
723 * don't bother reporting them.
725 warnings.shortline=1;
726 if (results->shortline>50 || results->shortline*10>linecnt)
728 warnings.shortline=0;
729 printf(" --> %ld lines in this file are short. "
730 "Not reporting short lines.\n",results->shortline);
733 * If more than 50 lines, or one-tenth, are long,
734 * don't bother reporting them.
737 if (results->longline>50 || results->longline*10>linecnt)
740 printf(" --> %ld lines in this file are long. "
741 "Not reporting long lines.\n",results->longline);
743 /* If more than 10 lines contain asterisks, don't bother reporting them. */
745 if (results->astline>10)
748 printf(" --> %ld lines in this file contain asterisks. "
749 "Not reporting them.\n",results->astline);
752 * If more than 10 lines contain forward slashes,
753 * don't bother reporting them.
756 if (results->fslashline>10)
759 printf(" --> %ld lines in this file contain forward slashes. "
760 "Not reporting them.\n",results->fslashline);
763 * If more than 20 lines contain unpunctuated endquotes,
764 * don't bother reporting them.
767 if (results->endquote_count>20)
770 printf(" --> %ld lines in this file contain unpunctuated endquotes. "
771 "Not reporting them.\n",results->endquote_count);
774 * If more than 15 lines contain standalone digits,
775 * don't bother reporting them.
778 if (results->standalone_digit>10)
781 printf(" --> %ld lines in this file contain standalone 0s and 1s. "
782 "Not reporting them.\n",results->standalone_digit);
785 * If more than 20 lines contain hyphens at end,
786 * don't bother reporting them.
789 if (results->hyphens>20)
792 printf(" --> %ld lines in this file have hyphens at end. "
793 "Not reporting them.\n",results->hyphens);
795 if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
797 printf(" --> Looks like this is HTML. Switching HTML mode ON.\n");
798 pswit[MARKUP_SWITCH]=1;
800 if (results->verylongline>0)
801 printf(" --> %ld lines in this file are VERY long!\n",
802 results->verylongline);
804 * If there are more non-PG spaced dashes than PG em-dashes,
805 * assume it's deliberate.
806 * Current PG guidelines say don't use them, but older texts do,
807 * and some people insist on them whatever the guidelines say.
810 if (results->spacedash+results->non_PG_space_emdash>
811 results->PG_space_emdash)
814 printf(" --> There are %ld spaced dashes and em-dashes. "
815 "Not reporting them.\n",
816 results->spacedash+results->non_PG_space_emdash);
818 /* If more than a quarter of characters are hi-bit, bug out. */
820 if (results->binlen*4>results->totlen)
822 printf(" --> This file does not appear to be ASCII. "
823 "Terminating. Best of luck with it!\n");
826 if (results->alphalen*4<results->totlen)
828 printf(" --> This file does not appear to be text. "
829 "Terminating. Best of luck with it!\n");
832 if (results->binlen*100>results->totlen || results->binlen>100)
834 printf(" --> There are a lot of foreign letters here. "
835 "Not reporting them.\n");
839 if (results->Dutchcount>50)
842 printf(" --> This looks like Dutch - "
843 "switching off dashes and warnings for 's Middags case.\n");
846 if (results->Frenchcount>50)
849 printf(" --> This looks like French - "
850 "switching off some doublepunct.\n");
852 if (results->firstline && results->footerline)
853 printf(" The PG header and footer appear to be already on.\n");
856 if (results->firstline)
857 printf(" The PG header is on - no footer.\n");
858 if (results->footerline)
859 printf(" The PG footer is on - no header.\n");
862 if (pswit[VERBOSE_SWITCH])
865 warnings.shortline=1;
874 printf(" *** Verbose output is ON -- you asked for it! ***\n");
876 if (warnings.isDutch)
878 if (results->footerline>0 && results->firstline>0 &&
879 results->footerline>results->firstline &&
880 results->footerline-results->firstline<100)
882 printf(" --> I don't really know where this text starts. \n");
883 printf(" There are no reference points.\n");
884 printf(" I'm going to have to report the header and footer "
886 results->firstline=0;
893 signed int c_unders,c_brack,s_brack,r_brack;
894 signed int open_single_quote,close_single_quote;
900 * Look along the line, accumulate the count of quotes, and see
901 * if this is an empty line - i.e. a line with nothing on it
903 * If line has just spaces, period, * and/or - on it, don't
904 * count it, since empty lines with asterisks or dashes to
905 * separate sections are common.
907 * Returns: Non-zero if the line is empty.
/*
 * analyse_quotes: tally single quotes, underscores and brackets on one
 * line and decide whether the line counts as empty.
 *
 * s        - text of the line to examine.
 * counters - running totals updated in place; open_single_quote,
 *            close_single_quote and c_unders are incremented here, and
 *            presumably the bracket counts too (those increments are
 *            on lines not visible in this excerpt).
 *
 * A single quote or backtick is treated as an opening quote, a closing
 * quote or an in-word apostrophe depending on its neighbours; closing
 * candidates collect a score in guessquote.  isemptyline starts at 1
 * and is cleared by a character other than space, dash, period or
 * asterisk (further exclusions may sit on a line not visible here).
 *
 * NOTE(review): the start-of-line branch compares s+2 with tis/Tis
 * whereas the mid-line branch uses s+1; s+2 skips the character right
 * after the quote and looks like an off-by-one -- confirm.
 */
909 int analyse_quotes(const char *s,struct counters *counters)
911 signed int guessquote=0;
912 int isemptyline=1; /* assume the line is empty until proven otherwise */
917 if (*s==CHAR_SQUOTE || *s==CHAR_OPEN_SQUOTE)
922 * At start of line, it can only be an openquote.
923 * Hardcode a very common exception!
925 if (strncmp(s+2,"tis",3) && strncmp(s+2,"Tis",3))
926 counters->open_single_quote++;
/* Letters on both sides: an apostrophe inside a word. */
928 else if (gcisalpha(s[-1]) && gcisalpha(s[1]))
929 /* Do nothing! it's definitely an apostrophe, not a quote */
931 /* it's outside a word - let's check it out */
932 else if (*s==CHAR_OPEN_SQUOTE || gcisalpha(s[1]))
934 /* it damwell better BE an openquote */
935 if (strncmp(s+1,"tis",3) && strncmp(s+1,"Tis",3))
936 /* hardcode a very common exception! */
937 counters->open_single_quote++;
941 /* now - is it a closequote? */
942 guessquote=0; /* accumulate clues */
943 if (gcisalpha(s[-1]))
945 /* it follows a letter - could be either */
949 /* looks like a plural apostrophe */
951 if (s[1]==CHAR_SPACE) /* bonus marks! */
955 /* it doesn't have a letter either side */
956 else if (strchr(".?!,;:",s[-1]) && strchr(".?!,;: ",s[1]))
957 guessquote+=8; /* looks like a closequote */
960 if (counters->open_single_quote>counters->close_single_quote)
962 * Give it the benefit of some doubt,
963 * if a squote is already open.
969 counters->close_single_quote++;
972 if (*s!=CHAR_SPACE && *s!='-' && *s!='.' && *s!=CHAR_ASTERISK &&
974 isemptyline=0; /* ignore lines like * * * as spacers */
975 if (*s==CHAR_UNDERSCORE)
976 counters->c_unders++;
/* Curly, round and square brackets, opening and closing. */
977 if (*s==CHAR_OPEN_CBRACK)
979 if (*s==CHAR_CLOSE_CBRACK)
981 if (*s==CHAR_OPEN_RBRACK)
983 if (*s==CHAR_CLOSE_RBRACK)
985 if (*s==CHAR_OPEN_SBRACK)
987 if (*s==CHAR_CLOSE_SBRACK)
995 * check_for_odd_characters:
997 * Check for binary and other odd characters.
/*
 * check_for_odd_characters: report suspicious characters on one line.
 *
 * aline    - text of the line to scan (this parameter hides the global
 *            buffer of the same name).
 * warnings - first-pass decisions; the fslash and ast members gate the
 *            forward slash and asterisk reports.
 * The rest of the parameter list, which must include isemptyline, is
 * on a line not visible in this excerpt.
 *
 * Scans the line byte by byte and queries a non-ASCII or control
 * character, tab, tilde, caret, forward slash and asterisk.  The e*
 * flags ensure each kind is reported at most once per line.  For each
 * report the line is echoed when ECHO_SWITCH is set and the detail
 * message is printed unless OVERVIEW_SWITCH is set.
 */
999 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
1002 /* Don't repeat multiple warnings on one line. */
1003 signed int eNon_A=0,eTab=0,eTilde=0,eCarat=0,eFSlash=0,eAst=0;
1006 for (s=aline;*s;s++)
1008 c=*(unsigned char *)s;
/*
 * Because && binds tighter than ||, this reads as: a character below
 * space that is neither tab (9) nor newline, OR a byte above 127.
 * NOTE(review): *s is a plain char, so where char is signed a high-bit
 * byte also satisfies *s<CHAR_SPACE; the c>127 alternative catches it
 * either way.
 */
1009 if (!eNon_A && (*s<CHAR_SPACE && *s!=9 && *s!='\n' || c>127))
1011 if (pswit[ECHO_SWITCH])
1012 printf("\n%s\n",aline);
1013 if (!pswit[OVERVIEW_SWITCH])
/* Two alternative wordings; the condition selecting between them is not visible here. */
1015 printf(" Line %ld column %d - "
1016 "Non-ISO-8859 character %d\n",linecnt,(int)(s-aline)+1,c);
1018 printf(" Line %ld column %d - Non-ASCII character %d\n",
1019 linecnt,(int)(s-aline)+1,c);
1024 if (!eTab && *s==CHAR_TAB)
1026 if (pswit[ECHO_SWITCH])
1027 printf("\n%s\n",aline);
1028 if (!pswit[OVERVIEW_SWITCH])
1029 printf(" Line %ld column %d - Tab character?\n",
1030 linecnt,(int)(s-aline)+1);
1035 if (!eTilde && *s==CHAR_TILDE)
1038 * Often used by OCR software to indicate an
1039 * unrecognizable character.
1041 if (pswit[ECHO_SWITCH])
1042 printf("\n%s\n",aline);
1043 if (!pswit[OVERVIEW_SWITCH])
1044 printf(" Line %ld column %d - Tilde character?\n",
1045 linecnt,(int)(s-aline)+1);
1050 if (!eCarat && *s==CHAR_CARAT)
1052 if (pswit[ECHO_SWITCH])
1053 printf("\n%s\n",aline);
1054 if (!pswit[OVERVIEW_SWITCH])
1055 printf(" Line %ld column %d - Carat character?\n",
1056 linecnt,(int)(s-aline)+1);
/* Forward slashes are queried only while warnings->fslash is non-zero. */
1061 if (!eFSlash && *s==CHAR_FORESLASH && warnings->fslash)
1063 if (pswit[ECHO_SWITCH])
1064 printf("\n%s\n",aline);
1065 if (!pswit[OVERVIEW_SWITCH])
1066 printf(" Line %ld column %d - Forward slash?\n",
1067 linecnt,(int)(s-aline)+1);
1073 * Report asterisks only in paranoid mode,
1074 * since they're often deliberate.
1076 if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
1079 if (pswit[ECHO_SWITCH])
1080 printf("\n%s\n",aline);
1081 if (!pswit[OVERVIEW_SWITCH])
1082 printf(" Line %ld column %d - Asterisk?\n",
1083 linecnt,(int)(s-aline)+1);
/*
 * Runs the checks on one input file.
 *
 * filename: path handed to fopen; it is also printed in the File: banner
 *           and in the cannot-open message.
 *
 * Visible flow: open the file, call first_pass and report_first_pass,
 * then read the text line by line with flgets and apply the per-line
 * checks below. Reports are written with printf; most are preceded by an
 * echo of the line when ECHO_SWITCH is set and are printed only when
 * OVERVIEW_SWITCH is clear.
 *
 * NOTE(review): many short lines of this function (braces, else arms,
 * simple assignments) are absent from this listing, so the notes added
 * below describe only what the visible lines establish.
 */
1096 void procfile(char *filename)
1098 char *s,*t,*s1,laststart,*wordstart;
1099 char inword[MAXWORDLEN],testword[MAXWORDLEN];
1100 char parastart[81]; /* first line of current para */
1102 struct first_pass_results *first_pass_results;
1103 struct warnings *warnings;
1104 struct counters counters={0};
1106 long squot,start_para_line;
1107 signed int i,j,llen,isacro,isellipsis,istypo,alower;
1108 unsigned int lastlen,lastblen;
1109 signed int dquotepar,squotepar;
1110 signed int isnewpara,vowel,consonant;
1111 char dquote_err[80],squote_err[80],rbrack_err[80],sbrack_err[80],
1112 cbrack_err[80],unders_err[80];
1113 signed int qword_index,qperiod_index,isdup;
1115 laststart=CHAR_SPACE;
1117 *dquote_err=*squote_err=*rbrack_err=*cbrack_err=*sbrack_err=
1118 *unders_err=*prevline=0;
1119 linecnt=checked_linecnt=start_para_line=0;
1121 i=llen=isacro=isellipsis=istypo=0;
1122 isnewpara=vowel=consonant=enddash=0;
1123 qword_index=qperiod_index=isdup=0;
1124 *inword=*testword=0;
1125 dquotepar=squotepar=0;
1126 for (j=0;j<MAX_QWORD;j++)
1129 for (i=0;i<MAX_QWORD_LENGTH;i++)
/*
 * The file is opened in binary mode, presumably so that flgets sees the
 * raw CR and LF bytes it checks (TODO confirm).
 */
1135 infile=fopen(filename,"rb");
/*
 * The cannot-open message goes to stdout or to stderr and STDOUT_SWITCH
 * is consulted; the test for a failed fopen and the else arm are on lines
 * not shown in this listing.
 */
1138 if (pswit[STDOUT_SWITCH])
1139 fprintf(stdout,"bookloupe: cannot open %s\n",filename);
1141 fprintf(stderr,"bookloupe: cannot open %s\n",filename);
1144 fprintf(stdout,"\n\nFile: %s\n\n",filename);
/*
 * The result of first_pass feeds report_first_pass, whose return value
 * is the warnings structure consulted by the checks below.
 */
1145 first_pass_results=first_pass(infile);
1146 warnings=report_first_pass(first_pass_results);
1149 * Here we go with the main pass. Hold onto yer hat!
1150 * Re-init some variables we've dirtied.
1153 laststart=CHAR_SPACE;
/*
 * flgets receives LINEBUFSIZE-1 as its length limit and linecnt+1 as the
 * line number it prints in its own line-ending messages.
 */
1155 while (flgets(aline,LINEBUFSIZE-1,infile,linecnt+1))
1160 if (pswit[DP_SWITCH] && !strncmp(aline,"-----File: ",11))
1161 continue; // skip DP page separators completely
/*
 * Lines before firstline, or after a positive footerline, are not
 * checked; with HEADER_SWITCH set a few header fields are echoed.
 */
1162 if (linecnt<first_pass_results->firstline ||
1163 (first_pass_results->footerline>0 &&
1164 linecnt>first_pass_results->footerline))
1166 if (pswit[HEADER_SWITCH])
1168 if (!strncmp(aline,"Title:",6))
1169 printf(" %s\n",aline);
1170 if (!strncmp(aline,"Author:",7))
1171 printf(" %s\n",aline);
1172 if (!strncmp(aline,"Release Date:",13))
1173 printf(" %s\n",aline);
1174 if (!strncmp(aline,"Edition:",8))
1175 printf(" %s\n\n",aline);
1177 continue; /* skip through the header */
1182 * If we are in a state of unbalanced quotes, and this line
1183 * doesn't begin with a quote, output the stored error message.
1184 * If the -P switch was used, print the warning even if the
1185 * new para starts with quotes.
1191 if (*t!=CHAR_DQUOTE || pswit[QPARA_SWITCH])
1193 if (!pswit[OVERVIEW_SWITCH])
1195 if (pswit[ECHO_SWITCH])
1196 printf("\n%s\n",parastart);
1204 if (*t!=CHAR_SQUOTE && *t!=CHAR_OPEN_SQUOTE ||
1205 pswit[QPARA_SWITCH] || squot)
1207 if (!pswit[OVERVIEW_SWITCH])
1209 if (pswit[ECHO_SWITCH])
1210 printf("\n%s\n",parastart);
1220 if (!pswit[OVERVIEW_SWITCH])
1222 if (pswit[ECHO_SWITCH])
1223 printf("\n%s\n",parastart);
1231 if (!pswit[OVERVIEW_SWITCH])
1233 if (pswit[ECHO_SWITCH])
1234 printf("\n%s\n",parastart);
1242 if (!pswit[OVERVIEW_SWITCH])
1244 if (pswit[ECHO_SWITCH])
1245 printf("\n%s\n",parastart);
1253 if (!pswit[OVERVIEW_SWITCH])
1255 if (pswit[ECHO_SWITCH])
1256 printf("\n%s\n",parastart);
1262 *dquote_err=*squote_err=*rbrack_err=*cbrack_err=
1263 *sbrack_err=*unders_err=0;
/*
 * analyse_quotes is given the per-paragraph counters and its return
 * value becomes the empty-line flag used by the checks below.
 */
1264 isemptyline=analyse_quotes(aline,&counters);
1265 if (isnewpara && !isemptyline)
1267 /* This line is the start of a new paragraph. */
1268 start_para_line=linecnt;
1269 /* Capture its first line in case we want to report it later. */
/*
 * NOTE(review): strncpy writes no terminator when aline holds 80 or more
 * characters. parastart has 81 bytes, so the terminator is presumably
 * stored on a line not shown in this listing (TODO confirm).
 */
1270 strncpy(parastart,aline,80);
1272 dquotepar=squotepar=0; /* restart the quote count */
1274 while (!gcisalpha(*s) && !gcisdigit(*s) && *s)
1276 if (*s>='a' && *s<='z')
1278 /* and its first letter is lowercase */
1279 if (pswit[ECHO_SWITCH])
1280 printf("\n%s\n",aline);
1281 if (!pswit[OVERVIEW_SWITCH])
1282 printf(" Line %ld column %d - "
1283 "Paragraph starts with lower-case\n",
1284 linecnt,(int)(s-aline)+1);
1288 isnewpara=0; /* Signal the end of new para processing. */
1290 /* Check for an em-dash broken at line end. */
1291 if (enddash && *aline=='-')
1293 if (pswit[ECHO_SWITCH])
1294 printf("\n%s\n",aline);
1295 if (!pswit[OVERVIEW_SWITCH])
1296 printf(" Line %ld column 1 - Broken em-dash?\n",linecnt);
1301 for (s=aline+strlen(aline)-1;*s==' ' && s>aline;s--)
1303 if (s>=aline && *s=='-')
1306 * Check for invalid or questionable characters in the line
1307 * Anything above 127 is invalid for plain ASCII, and
1308 * non-printable control characters should also be flagged.
1309 * Tabs should generally not be there.
1311 for (s=aline;*s;s++)
1313 i=(unsigned char)*s;
1314 if (i<CHAR_SPACE && i!=CHAR_LF && i!=CHAR_CR && i!=CHAR_TAB)
1316 if (pswit[ECHO_SWITCH])
1317 printf("\n%s\n",aline);
1318 if (!pswit[OVERVIEW_SWITCH])
1319 printf(" Line %ld column %d - Control character %d\n",
1320 linecnt,(int)(s-aline)+1,i);
1326 check_for_odd_characters(aline,warnings,isemptyline);
1327 /* Check for line too long. */
1328 if (warnings->longline)
/*
 * NOTE(review): the message below passes two strlen results (size_t) to
 * %d conversions; the argument types do not match the format. A cast to
 * int, as used for the other column values, would make them agree.
 */
1330 if (strlen(aline)>LONGEST_PG_LINE)
1332 if (pswit[ECHO_SWITCH])
1333 printf("\n%s\n",aline);
1334 if (!pswit[OVERVIEW_SWITCH])
1335 printf(" Line %ld column %d - Long line %d\n",
1336 linecnt,strlen(aline),strlen(aline));
1342 * Check for line too short.
1343 * This one is a bit trickier to implement: we don't want to
1344 * flag the last line of a paragraph for being short, so we
1345 * have to wait until we know that our current line is a
1346 * "normal" line, then report the _previous_ line if it was too
1347 * short. We also don't want to report indented lines like
1348 * chapter heads or formatted quotations. We therefore keep
1349 * lastlen as the length of the last line examined, and
1350 * lastblen as the length of the last but one, and try to
1351 * suppress unnecessary warnings by checking that both were of
1352 * "normal" length. We keep the first character of the last
1353 * line in laststart, and if it was a space, we assume that the
1354 * formatting is deliberate. I can't figure out a way to
1355 * distinguish something like a quoted verse left-aligned or
1356 * the header or footer of a letter from a paragraph of short
1357 * lines - maybe if I examined the whole paragraph, and if the
1358 * para has less than, say, 8 lines and if all lines are short,
1359 * then just assume it's OK? Need to look at some texts to see
1360 * how often a formula like this would get the right result.
/*
 * NOTE(review): as in the long-line message, strlen(prevline) is passed
 * twice to %d conversions below without a cast.
 */
1362 if (warnings->shortline && strlen(aline)>1 && lastlen>1 &&
1363 lastlen<SHORTEST_PG_LINE && lastblen>1 && lastblen>SHORTEST_PG_LINE &&
1364 laststart!=CHAR_SPACE)
1366 if (pswit[ECHO_SWITCH])
1367 printf("\n%s\n",prevline);
1368 if (!pswit[OVERVIEW_SWITCH])
1369 printf(" Line %ld column %d - Short line %d?\n",
1370 linecnt-1,strlen(prevline),strlen(prevline));
1375 lastlen=strlen(aline);
1377 /* Look for punctuation other than full ellipses at start of line. */
1378 if (*aline && strchr(".?!,;:",aline[0]) && strncmp(". . .",aline,5))
1380 if (pswit[ECHO_SWITCH])
1381 printf("\n%s\n",aline);
1382 if (!pswit[OVERVIEW_SWITCH])
1383 printf(" Line %ld column 1 - Begins with punctuation?\n",
1389 * Check for spaced em-dashes.
1390 * We must check _all_ occurrences of "--" on the line
1391 * hence the loop - even if the first double-dash is OK
1392 * there may be another that is wrong later on.
1397 while (strstr(s,"--"))
1399 if (*(strstr(s,"--")-1)==CHAR_SPACE ||
1400 (*(strstr(s,"--")+2)==CHAR_SPACE))
1402 if (pswit[ECHO_SWITCH])
1403 printf("\n%s\n",aline);
1404 if (!pswit[OVERVIEW_SWITCH])
1405 printf(" Line %ld column %d - Spaced em-dash?\n",
1406 linecnt,(int)(strstr(s,"--")-aline)+1);
1413 /* Check for spaced dashes. */
/*
 * NOTE(review): the dash-space arm below reads the byte before the match.
 * When the line begins with a dash followed by a space that byte lies
 * before aline. Confirm whether a guard exists on a line not shown.
 */
1416 if (strstr(aline," -"))
1418 if (*(strstr(aline," -")+2)!='-')
1420 if (pswit[ECHO_SWITCH])
1421 printf("\n%s\n",aline);
1422 if (!pswit[OVERVIEW_SWITCH])
1423 printf(" Line %ld column %d - Spaced dash?\n",
1424 linecnt,(int)(strstr(aline," -")-aline)+1);
1429 else if (strstr(aline,"- "))
1431 if (*(strstr(aline,"- ")-1)!='-')
1433 if (pswit[ECHO_SWITCH])
1434 printf("\n%s\n",aline);
1435 if (!pswit[OVERVIEW_SWITCH])
1436 printf(" Line %ld column %d - Spaced dash?\n",
1437 linecnt,(int)(strstr(aline,"- ")-aline)+1);
1444 * Check for unmarked paragraphs indicated by separate speakers.
1445 * May well be false positive:
1446 * "Bravo!" "Wonderful!" called the crowd.
1447 * but useful all the same.
/*
 * NOTE(review): the two tests below are textually identical in this
 * listing, so the second adds nothing. One of them possibly searched for
 * a different spacing between the quotes - confirm against upstream.
 */
1451 if (strstr(aline,"\" \""))
1452 s=strstr(aline,"\" \"");
1453 if (strstr(aline,"\" \""))
1454 s=strstr(aline,"\" \"");
1457 if (pswit[ECHO_SWITCH])
1458 printf("\n%s\n",aline);
1459 if (!pswit[OVERVIEW_SWITCH])
1460 printf(" Line %ld column %d - "
1461 "Query missing paragraph break?\n",
1462 linecnt,(int)(s-aline)+1);
1467 * Check for "to he" and other easy he/be errors.
1468 * This is a very inadequate effort on the he/be problem,
1469 * but the phrase "to he" is always an error, whereas "to
1470 * be" is quite common.
1471 * Similarly, '"Quiet!", be said.' is a non-be error
1472 * "to he" is _not_ always an error!:
1473 * "Where they went to he couldn't say."
1474 * Another false positive:
1475 * What would "Cinderella" be without the . . .
1476 * and another: "If he wants to he can see for himself."
1480 if (strstr(aline," to he "))
1481 s=strstr(aline," to he ");
1482 if (strstr(aline,"\" be "))
1483 s=strstr(aline,"\" be ");
1484 if (strstr(aline,"\", be "))
1485 s=strstr(aline,"\", be ");
1486 if (strstr(aline," is be "))
1487 s=strstr(aline," is be ");
1488 if (strstr(aline," be is "))
1489 s=strstr(aline," be is ");
1490 if (strstr(aline," was be "))
1491 s=strstr(aline," was be ");
1492 if (strstr(aline," be would "))
1493 s=strstr(aline," be would ");
1494 if (strstr(aline," be could "))
1495 s=strstr(aline," be could ");
1498 if (pswit[ECHO_SWITCH])
1499 printf("\n%s\n",aline);
1500 if (!pswit[OVERVIEW_SWITCH])
1501 printf(" Line %ld column %d - Query he/be error?\n",
1502 linecnt,(int)(s-aline)+1);
1508 if (strstr(aline," i bad "))
1509 s=strstr(aline," i bad ");
1510 if (strstr(aline," you bad "))
1511 s=strstr(aline," you bad ");
1512 if (strstr(aline," he bad "))
1513 s=strstr(aline," he bad ");
1514 if (strstr(aline," she bad "))
1515 s=strstr(aline," she bad ");
1516 if (strstr(aline," they bad "))
1517 s=strstr(aline," they bad ");
1518 if (strstr(aline," a had "))
1519 s=strstr(aline," a had ");
1520 if (strstr(aline," the had "))
1521 s=strstr(aline," the had ");
1524 if (pswit[ECHO_SWITCH])
1525 printf("\n%s\n",aline);
1526 if (!pswit[OVERVIEW_SWITCH])
1527 printf(" Line %ld column %d - Query had/bad error?\n",
1528 linecnt,(int)(s-aline)+1);
1534 if (strstr(aline,", hut "))
1535 s=strstr(aline,", hut ");
1536 if (strstr(aline,"; hut "))
1537 s=strstr(aline,"; hut ");
1540 if (pswit[ECHO_SWITCH])
1541 printf("\n%s\n",aline);
1542 if (!pswit[OVERVIEW_SWITCH])
1543 printf(" Line %ld column %d - Query hut/but error?\n",
1544 linecnt,(int)(s-aline)+1);
1549 * Special case - angled bracket in front of "From" placed there by an
1550 * MTA when sending an e-mail.
1552 if (strstr(aline,">From"))
1554 if (pswit[ECHO_SWITCH])
1555 printf("\n%s\n",aline);
1556 if (!pswit[OVERVIEW_SWITCH])
1557 printf(" Line %ld column %d - "
1558 "Query angled bracket with From\n",
1559 linecnt,(int)(strstr(aline,">From")-aline)+1);
1564 * Check for a single character line -
1565 * often an overflow from bad wrapping.
1567 if (*aline && !aline[1])
1569 if (*aline=='I' || *aline=='V' || *aline=='X' || *aline=='L' ||
1571 ; /* Nothing - ignore numerals alone on a line. */
1574 if (pswit[ECHO_SWITCH])
1575 printf("\n%s\n",aline);
1576 if (!pswit[OVERVIEW_SWITCH])
1577 printf(" Line %ld column 1 - "
1578 "Query single character line\n",linecnt);
1583 /* Check for I" - often should be ! */
/*
 * NOTE(review): the column below is a pointer difference printed with
 * %ld and, unlike the other checks, without a cast or the +1 offset.
 * Confirm the intended column and make the type match the format.
 */
1584 if (strstr(aline," I\""))
1586 if (pswit[ECHO_SWITCH])
1587 printf("\n%s\n",aline);
1588 if (!pswit[OVERVIEW_SWITCH])
1589 printf(" Line %ld column %ld - Query I=exclamation mark?\n",
1590 linecnt,strstr(aline," I\"")-aline);
1595 * Check for period without a capital letter. Cut-down from gutspell.
1596 * Only works when it happens on a single line.
1598 if (pswit[PARANOID_SWITCH])
1600 for (t=s=aline;strstr(t,". ");)
1606 /* start of line punctuation is handled elsewhere */
1609 if (!gcisalpha(t[-1]))
1614 if (warnings->isDutch)
1616 /* For Frank & Jeroen -- 's Middags case */
1617 if (t[2]==CHAR_SQUOTE && t[3]>='a' && t[3]<='z' &&
1618 t[4]==CHAR_SPACE && t[5]>='A' && t[5]<='Z')
1625 while (*s1 && !gcisalpha(*s1) && !isdigit(*s1))
1627 if (*s1>='a' && *s1<='z')
1629 /* we have something to investigate */
1631 /* so let's go back and find out */
1632 for (s1=t-1;s1>=s &&
1633 (gcisalpha(*s1) || gcisdigit(*s1) || *s1==CHAR_SQUOTE &&
1634 gcisalpha(s1[1]) && gcisalpha(s1[-1]));s1--)
1637 for (i=0;*s1 && *s1!='.';s1++,i++)
1640 for (i=0;*abbrev[i];i++)
1641 if (!strcmp(testword,abbrev[i]))
1643 if (gcisdigit(*testword))
1647 if (isroman(testword))
1652 for (i=0;testword[i];i++)
1653 if (strchr(vowels,testword[i]))
1659 if (strlen(testword)<MAX_QWORD_LENGTH &&
1660 !pswit[VERBOSE_SWITCH])
1661 for (i=0;i<qperiod_index;i++)
1662 if (!strcmp(testword,qperiod[i]))
1666 if (qperiod_index<MAX_QWORD &&
1667 strlen(testword)<MAX_QWORD_LENGTH)
1669 strcpy(qperiod[qperiod_index],testword);
1672 if (pswit[ECHO_SWITCH])
1673 printf("\n%s\n",aline);
1674 if (!pswit[OVERVIEW_SWITCH])
1675 printf(" Line %ld column %d - "
1676 "Extra period?\n",linecnt,(int)(t-aline)+1);
1685 if (pswit[TYPO_SWITCH])
1687 /* Check for words usually not followed by punctuation. */
1691 s=getaword(s,inword);
1695 for (i=0;*nocomma[i];i++)
1696 if (!strcmp(inword,nocomma[i]))
1698 if (*s==',' || *s==';' || *s==':')
1700 if (pswit[ECHO_SWITCH])
1701 printf("\n%s\n",aline);
1702 if (!pswit[OVERVIEW_SWITCH])
1703 printf(" Line %ld column %d - "
1704 "Query punctuation after %s?\n",
1705 linecnt,(int)(s-aline)+1,inword);
1710 for (i=0;*noperiod[i];i++)
1711 if (!strcmp(inword,noperiod[i]))
1713 if (*s=='.' || *s=='!')
1715 if (pswit[ECHO_SWITCH])
1716 printf("\n%s\n",aline);
1717 if (!pswit[OVERVIEW_SWITCH])
1718 printf(" Line %ld column %d - "
1719 "Query punctuation after %s?\n",
1720 linecnt,(int)(s-aline)+1,inword);
1728 * Check for commonly mistyped words,
1729 * and digits like 0 for O in a word.
1734 s=getaword(s,inword);
1736 continue; /* don't bother with empty lines */
1737 if (mixdigit(inword))
1739 if (pswit[ECHO_SWITCH])
1740 printf("\n%s\n",aline);
1741 if (!pswit[OVERVIEW_SWITCH])
1742 printf(" Line %ld column %d - Query digit in %s\n",
1743 linecnt,(int)(wordstart-aline)+1,inword);
1748 * Put the word through a series of tests for likely typos and OCR
1751 if (pswit[TYPO_SWITCH])
1754 strcpy(testword,inword);
1756 for (i=0;i<(signed int)strlen(testword);i++)
1758 /* lowercase for testing */
1759 if (testword[i]>='a' && testword[i]<='z')
1761 if (alower && testword[i]>='A' && testword[i]<='Z')
1764 * We have an uppercase mid-word. However, there are
1766 * Mac and Mc like McGill
1767 * French contractions like l'Abbe
1769 if (i==2 && testword[0]=='m' && testword[1]=='c' ||
1770 i==3 && testword[0]=='m' && testword[1]=='a' &&
1771 testword[2]=='c' || i>0 && testword[i-1]==CHAR_SQUOTE)
1776 testword[i]=(char)tolower(testword[i]);
1779 * Check for certain unlikely two-letter combinations at word
1782 if (strlen(testword)>1)
1784 for (i=0;*nostart[i];i++)
1785 if (!strncmp(testword,nostart[i],2))
1787 for (i=0;*noend[i];i++)
1788 if (!strncmp(testword+strlen(testword)-2,noend[i],2))
1791 /* ght is common, gbt never. Like that. */
1792 if (strstr(testword,"cb"))
1794 if (strstr(testword,"gbt"))
1796 if (strstr(testword,"pbt"))
1798 if (strstr(testword,"tbs"))
1800 if (strstr(testword,"mrn"))
1802 if (strstr(testword,"ahle"))
1804 if (strstr(testword,"ihle"))
1807 * "TBE" does happen - like HEARTBEAT - but uncommon.
1808 * Also "TBI" - frostbite, outbid - but uncommon.
1809 * Similarly "ii" like Hawaii, or Pompeii, and in Roman
1810 * numerals, but "ii" is a common scanno.
1812 if (strstr(testword,"tbi"))
1814 if (strstr(testword,"tbe"))
1816 if (strstr(testword,"ii"))
1819 * Check for no vowels or no consonants.
1820 * If none, flag a typo.
1822 if (!istypo && strlen(testword)>1)
1825 for (i=0;testword[i];i++)
1827 if (testword[i]=='y' || gcisdigit(testword[i]))
1829 /* Yah, this is loose. */
1833 else if (strchr(vowels,testword[i]))
1838 if (!vowel || !consonant)
1842 * Now exclude the word from being reported if it's in
1845 for (i=0;*okword[i];i++)
1846 if (!strcmp(testword,okword[i]))
1849 * What looks like a typo may be a Roman numeral.
1852 if (istypo && isroman(testword))
1854 /* Check the manual list of typos. */
1856 for (i=0;*typo[i];i++)
1857 if (!strcmp(testword,typo[i]))
1860 * Check lowercase s, l, i and m - special cases.
1861 * "j" - often a semi-colon gone wrong.
1862 * "d" for a missing apostrophe - he d
1865 if (!istypo && strlen(testword)==1 && strchr("slmijdn",*inword))
1870 if (strlen(testword)<MAX_QWORD_LENGTH &&
1871 !pswit[VERBOSE_SWITCH])
1872 for (i=0;i<qword_index;i++)
1873 if (!strcmp(testword,qword[i]))
1880 if (qword_index<MAX_QWORD &&
1881 strlen(testword)<MAX_QWORD_LENGTH)
1883 strcpy(qword[qword_index],testword);
1886 if (pswit[ECHO_SWITCH])
1887 printf("\n%s\n",aline);
1888 if (!pswit[OVERVIEW_SWITCH])
1890 printf(" Line %ld column %d - Query word %s",
1891 linecnt,(int)(wordstart-aline)+1,inword);
1892 if (strlen(testword)<MAX_QWORD_LENGTH &&
1893 !pswit[VERBOSE_SWITCH])
1894 printf(" - not reporting duplicates");
1902 /* check the user's list of typos */
1903 if (!istypo && usertypo_count)
1904 for (i=0;i<usertypo_count;i++)
1905 if (!strcmp(testword,usertypo[i]))
1907 if (pswit[ECHO_SWITCH])
1908 printf("\n%s\n",aline);
1909 if (!pswit[OVERVIEW_SWITCH])
1910 printf(" Line %ld column %d - "
1911 "Query possible scanno %s\n",
1912 linecnt,(int)(wordstart-aline)+2,inword);
1914 if (pswit[PARANOID_SWITCH] && warnings->digit)
1916 /* In paranoid mode, query all 0 and 1 standing alone. */
1917 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
1919 if (pswit[ECHO_SWITCH])
1920 printf("\n%s\n",aline);
1921 if (!pswit[OVERVIEW_SWITCH])
1922 printf(" Line %ld column %d - Query standalone %s\n",
1923 linecnt,(int)(wordstart-aline)+2,inword);
1930 * Look for added or missing spaces around punctuation and quotes.
1931 * If there is a punctuation character like ! with no space on
1932 * either side, suspect a missing!space. If there are spaces on
1933 * both sides , assume a typo. If we see a double quote with no
1934 * space or punctuation on either side of it, assume unspaced
1935 * quotes "like"this.
1938 for (i=1;i<llen;i++)
1940 /* For each character in the line after the first. */
1941 if (strchr(".?!,;:_",aline[i])) /* if it's punctuation */
1943 /* we need to suppress warnings for acronyms like M.D. */
1945 /* we need to suppress warnings for ellipsis . . . */
1947 /* if there are letters on both sides of it or ... */
1948 if (gcisalpha(aline[i-1]) && gcisalpha(aline[i+1]) ||
1949 gcisalpha(aline[i+1]) && strchr("?!,;:",aline[i]))
1951 /* ...if it's strict punctuation followed by an alpha */
1954 if (i>2 && aline[i-2]=='.')
1956 if (i+2<llen && aline[i+2]=='.')
1961 if (pswit[ECHO_SWITCH])
1962 printf("\n%s\n",aline);
1963 if (!pswit[OVERVIEW_SWITCH])
1964 printf(" Line %ld column %d - Missing space?\n",
1970 if (aline[i-1]==CHAR_SPACE &&
1971 (aline[i+1]==CHAR_SPACE || aline[i+1]==0))
1974 * If there are spaces on both sides,
1975 * or space before and end of line.
1979 if (i>2 && aline[i-2]=='.')
1981 if (i+2<llen && aline[i+2]=='.')
1984 if (!isemptyline && !isellipsis)
1986 if (pswit[ECHO_SWITCH])
1987 printf("\n%s\n",aline);
1988 if (!pswit[OVERVIEW_SWITCH])
1989 printf(" Line %ld column %d - "
1990 "Spaced punctuation?\n",linecnt,i+1);
1997 /* Split out the characters that CANNOT be preceded by space. */
1999 for (i=1;i<llen;i++)
2001 /* for each character in the line after the first */
2002 if (strchr("?!,;:",aline[i]))
2004 /* if it's punctuation that _cannot_ have a space before it */
2005 if (aline[i-1]==CHAR_SPACE && !isemptyline &&
2006 aline[i+1]!=CHAR_SPACE)
2009 * If aline[i+1) DOES == space,
2010 * it was already reported just above.
2012 if (pswit[ECHO_SWITCH])
2013 printf("\n%s\n",aline);
2014 if (!pswit[OVERVIEW_SWITCH])
2015 printf(" Line %ld column %d - Spaced punctuation?\n",
2023 * Special case " .X" where X is any alpha.
2024 * This plugs a hole in the acronym code above.
2025 * Inelegant, but maintainable.
2028 for (i=1;i<llen;i++)
2030 /* for each character in the line after the first */
2033 /* if it's a period */
2034 if (aline[i-1]==CHAR_SPACE && gcisalpha(aline[i+1]))
2037 * If the period follows a space and
2038 * is followed by a letter.
2040 if (pswit[ECHO_SWITCH])
2041 printf("\n%s\n",aline);
2042 if (!pswit[OVERVIEW_SWITCH])
2043 printf(" Line %ld column %d - Spaced punctuation?\n",
2050 for (i=1;i<llen;i++)
2052 /* for each character in the line after the first */
2053 if (aline[i]==CHAR_DQUOTE)
2055 if (!strchr(" _-.'`,;:!/([{?}])",aline[i-1]) &&
2056 !strchr(" _-.'`,;:!/([{?}])",aline[i+1]) && aline[i+1] ||
2057 !strchr(" _-([{'`",aline[i-1]) && gcisalpha(aline[i+1]))
2059 if (pswit[ECHO_SWITCH])
2060 printf("\n%s\n",aline);
2061 if (!pswit[OVERVIEW_SWITCH])
2062 printf(" Line %ld column %d - Unspaced quotes?\n",
2069 /* Check parity of quotes. */
2070 for (s=aline;*s;s++)
2072 if (*s==CHAR_DQUOTE)
2074 if (!(dquotepar=!dquotepar))
2077 if (!strchr("_-.'`/,;:!?)]} ",s[1]))
2079 if (pswit[ECHO_SWITCH])
2080 printf("\n%s\n",aline);
2081 if (!pswit[OVERVIEW_SWITCH])
2082 printf(" Line %ld column %d - "
2083 "Wrongspaced quotes?\n",linecnt,(int)(s-aline)+1);
2091 if (!gcisalpha(s[1]) && !isdigit(s[1]) &&
2092 !strchr("_-/.'`([{$",s[1]) || !s[1])
2094 if (pswit[ECHO_SWITCH])
2095 printf("\n%s\n",aline);
2096 if (!pswit[OVERVIEW_SWITCH])
2097 printf(" Line %ld column %d - "
2098 "Wrongspaced quotes?\n",linecnt,(int)(s-aline)+1);
2105 if (*aline==CHAR_DQUOTE)
2107 if (strchr(",;:!?)]} ",aline[1]))
2109 if (pswit[ECHO_SWITCH])
2110 printf("\n%s\n",aline);
2111 if (!pswit[OVERVIEW_SWITCH])
2112 printf(" Line %ld column 1 - Wrongspaced quotes?\n",
2118 if (pswit[SQUOTE_SWITCH])
2120 for (s=aline;*s;s++)
2122 if ((*s==CHAR_SQUOTE || *s==CHAR_OPEN_SQUOTE) &&
2123 (s==aline || s>aline && !gcisalpha(s[-1]) ||
2126 if (!(squotepar=!squotepar))
2129 if (!strchr("_-.'`/\",;:!?)]} ",s[1]))
2131 if (pswit[ECHO_SWITCH])
2132 printf("\n%s\n",aline);
2133 if (!pswit[OVERVIEW_SWITCH])
2134 printf(" Line %ld column %d - "
2135 "Wrongspaced singlequotes?\n",
2136 linecnt,(int)(s-aline)+1);
2144 if (!gcisalpha(s[1]) && !isdigit(s[1]) &&
2145 !strchr("_-/\".'`",s[1]) || !s[1])
2147 if (pswit[ECHO_SWITCH])
2148 printf("\n%s\n",aline);
2149 if (!pswit[OVERVIEW_SWITCH])
2150 printf(" Line %ld column %d - "
2151 "Wrongspaced singlequotes?\n",
2152 linecnt,(int)(s-aline)+1);
2161 * Look for double punctuation like ,. or ,,
2162 * Thanks to DW for the suggestion!
2163 * In books with references, ".," and ".;" are common
2164 * e.g. "etc., etc.," and vol. 1.; vol 3.;
2165 * OTOH, from my initial tests, there are also fairly
2166 * common errors. What to do? Make these cases paranoid?
2167 * ".," is the most common, so warnings->dotcomma is used
2168 * to suppress detailed reporting if it occurs often.
2171 for (i=0;i<llen;i++)
2173 /* for each punctuation character in the line */
2174 if (strchr(".?!,;:",aline[i]) && (strchr(".?!,;:",aline[i+1])) &&
2175 aline[i] && aline[i+1])
2177 /* followed by punctuation, it's a query, unless . . . */
2178 if (aline[i]==aline[i+1] && (aline[i]=='.' || aline[i]=='?' ||
2180 !warnings->dotcomma && aline[i]=='.' && aline[i+1]==',' ||
2181 warnings->isFrench && !strncmp(aline+i,",...",4) ||
2182 warnings->isFrench && !strncmp(aline+i,"...,",4) ||
2183 warnings->isFrench && !strncmp(aline+i,";...",4) ||
2184 warnings->isFrench && !strncmp(aline+i,"...;",4) ||
2185 warnings->isFrench && !strncmp(aline+i,":...",4) ||
2186 warnings->isFrench && !strncmp(aline+i,"...:",4) ||
2187 warnings->isFrench && !strncmp(aline+i,"!...",4) ||
2188 warnings->isFrench && !strncmp(aline+i,"...!",4) ||
2189 warnings->isFrench && !strncmp(aline+i,"?...",4) ||
2190 warnings->isFrench && !strncmp(aline+i,"...?",4))
2192 if (warnings->isFrench && !strncmp(aline+i,",...",4) ||
2193 warnings->isFrench && !strncmp(aline+i,"...,",4) ||
2194 warnings->isFrench && !strncmp(aline+i,";...",4) ||
2195 warnings->isFrench && !strncmp(aline+i,"...;",4) ||
2196 warnings->isFrench && !strncmp(aline+i,":...",4) ||
2197 warnings->isFrench && !strncmp(aline+i,"...:",4) ||
2198 warnings->isFrench && !strncmp(aline+i,"!...",4) ||
2199 warnings->isFrench && !strncmp(aline+i,"...!",4) ||
2200 warnings->isFrench && !strncmp(aline+i,"?...",4) ||
2201 warnings->isFrench && !strncmp(aline+i,"...?",4))
2203 ; /* do nothing for .. !! and ?? which can be legit */
2207 if (pswit[ECHO_SWITCH])
2208 printf("\n%s\n",aline);
2209 if (!pswit[OVERVIEW_SWITCH])
2210 printf(" Line %ld column %d - Double punctuation?\n",
2218 while (strstr(s," \" "))
2220 if (pswit[ECHO_SWITCH])
2221 printf("\n%s\n",aline);
2222 if (!pswit[OVERVIEW_SWITCH])
2223 printf(" Line %ld column %d - Spaced doublequote?\n",
2224 linecnt,(int)(strstr(s," \" ")-aline+1));
2227 s=strstr(s," \" ")+2;
2230 while (strstr(s," ' "))
2232 if (pswit[ECHO_SWITCH])
2233 printf("\n%s\n",aline);
2234 if (!pswit[OVERVIEW_SWITCH])
2235 printf(" Line %ld column %d - Spaced singlequote?\n",
2236 linecnt,(int)(strstr(s," ' ")-aline+1));
2239 s=strstr(s," ' ")+2;
2242 while (strstr(s," ` "))
2244 if (pswit[ECHO_SWITCH])
2245 printf("\n%s\n",aline);
2246 if (!pswit[OVERVIEW_SWITCH])
2247 printf(" Line %ld column %d - Spaced singlequote?\n",
2248 linecnt,(int)(strstr(s," ` ")-aline+1));
2251 s=strstr(s," ` ")+2;
2253 /* check special case of 'S instead of 's at end of word */
2257 if (*s==CHAR_SQUOTE && s[1]=='S' && s[-1]>='a' && s[-1]<='z')
2259 if (pswit[ECHO_SWITCH])
2260 printf("\n%s\n",aline);
2261 if (!pswit[OVERVIEW_SWITCH])
2262 printf(" Line %ld column %d - Capital \"S\"?\n",
2263 linecnt,(int)(s-aline+2));
2270 * Now check special cases - start and end of line -
2271 * for single and double quotes. Start is sometimes [sic]
2272 * but better to query it anyway.
2273 * While we're here, check for dash at end of line.
2278 if (aline[llen-1]==CHAR_DQUOTE || aline[llen-1]==CHAR_SQUOTE ||
2279 aline[llen-1]==CHAR_OPEN_SQUOTE)
2280 if (aline[llen-2]==CHAR_SPACE)
2282 if (pswit[ECHO_SWITCH])
2283 printf("\n%s\n",aline);
2284 if (!pswit[OVERVIEW_SWITCH])
2285 printf(" Line %ld column %d - Spaced quote?\n",
2290 if ((aline[0]==CHAR_SQUOTE || aline[0]==CHAR_OPEN_SQUOTE) &&
2291 aline[1]==CHAR_SPACE)
2293 if (pswit[ECHO_SWITCH])
2294 printf("\n%s\n",aline);
2295 if (!pswit[OVERVIEW_SWITCH])
2296 printf(" Line %ld column 1 - Spaced quote?\n",linecnt);
2301 * Dash at end of line may well be legit - paranoid mode only
2302 * and don't report em-dash at line-end.
/*
 * NOTE(review): the message below prints i as the column although the
 * hyphen is aline[i]; the spaced-punctuation check above prints i+1 for
 * aline[i]. Confirm whether the zero-based value is intended.
 */
2304 if (pswit[PARANOID_SWITCH] && warnings->hyphen)
2306 for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--)
2308 if (aline[i]=='-' && aline[i-1]!='-')
2310 if (pswit[ECHO_SWITCH])
2311 printf("\n%s\n",aline);
2312 if (!pswit[OVERVIEW_SWITCH])
2313 printf(" Line %ld column %d - "
2314 "Hyphen at end of line?\n",linecnt,i);
2319 * Brackets are often unspaced, but shouldn't be surrounded by alpha.
2320 * If so, suspect a scanno like "a]most".
2323 for (i=1;i<llen-1;i++)
2325 /* for each bracket character in the line except 1st & last */
2326 if (strchr("{[()]}",aline[i]) && gcisalpha(aline[i-1]) &&
2327 gcisalpha(aline[i+1]))
2329 if (pswit[ECHO_SWITCH])
2330 printf("\n%s\n",aline);
2331 if (!pswit[OVERVIEW_SWITCH])
2332 printf(" Line %ld column %d - Unspaced bracket?\n",
2339 /*
 * NOTE(review): this check calls isalpha on a plain char, where the other
 * checks use gcisalpha; a byte above 127 is a negative argument where
 * char is signed. It also prints i, not i+1, as the column - possibly to
 * point at the letter before the quote (TODO confirm).
 */
if (warnings->endquote)
2341 for (i=1;i<llen;i++)
2343 /* for each character in the line except 1st */
2344 if (aline[i]==CHAR_DQUOTE && isalpha(aline[i-1]))
2346 if (pswit[ECHO_SWITCH])
2347 printf("\n%s\n",aline);
2348 if (!pswit[OVERVIEW_SWITCH])
2349 printf(" Line %ld column %d - "
2350 "endquote missing punctuation?\n",linecnt,i);
2357 * Check for <HTML TAG>.
2358 * If there is a < in the line, followed at some point
2359 * by a > then we suspect HTML.
2361 if (strstr(aline,"<") && strstr(aline,">"))
2363 i=(signed int)(strstr(aline,">")-strstr(aline,"<")+1);
2366 strncpy(wrk,strstr(aline,"<"),i);
2368 if (pswit[ECHO_SWITCH])
2369 printf("\n%s\n",aline);
2370 if (!pswit[OVERVIEW_SWITCH])
2371 printf(" Line %ld column %d - HTML Tag? %s \n",
2372 linecnt,(int)(strstr(aline,"<")-aline)+1,wrk);
2378 * Check for &symbol; HTML.
2379 * If there is a & in the line, followed at
2380 * some point by a ; then we suspect HTML.
2382 if (strstr(aline,"&") && strstr(aline,";"))
2384 i=(int)(strstr(aline,";")-strstr(aline,"&")+1);
2385 for (s=strstr(aline,"&");s<strstr(aline,";");s++)
2387 i=0; /* Don't report "Jones & Son;" */
2390 strncpy(wrk,strstr(aline,"&"),i);
2392 if (pswit[ECHO_SWITCH])
2393 printf("\n%s\n",aline);
2394 if (!pswit[OVERVIEW_SWITCH])
2395 printf(" Line %ld column %d - HTML symbol? %s \n",
2396 linecnt,(int)(strstr(aline,"&")-aline)+1,wrk);
2402 * At end of paragraph, check for mismatched quotes.
2403 * We don't want to report an error immediately, since it is a
2404 * common convention to omit the quotes at end of paragraph if
2405 * the next paragraph is a continuation of the same speaker.
2406 * Where this is the case, the next para should begin with a
2407 * quote, so we store the warning message and only display it
2408 * at the top of the next iteration if the new para doesn't
2409 * start with a quote.
2410 * The -p switch overrides this default, and warns of unclosed
2411 * quotes on _every_ paragraph, whether the next begins with a
2416 /* end of para - add up the totals */
2417 if (counters.quot%2)
2418 sprintf(dquote_err," Line %ld - Mismatched quotes\n",
2420 if (pswit[SQUOTE_SWITCH] && counters.open_single_quote &&
2421 counters.open_single_quote!=counters.close_single_quote)
2422 sprintf(squote_err," Line %ld - Mismatched singlequotes?\n",
2424 if (pswit[SQUOTE_SWITCH] && counters.open_single_quote &&
2425 counters.open_single_quote!=counters.close_single_quote &&
2426 counters.open_single_quote!=counters.close_single_quote+1)
2428 * Flag it to be noted regardless of the
2429 * first char of the next para.
2432 if (counters.r_brack)
2433 sprintf(rbrack_err," Line %ld - "
2434 "Mismatched round brackets?\n",linecnt);
2435 if (counters.s_brack)
2436 sprintf(sbrack_err," Line %ld - "
2437 "Mismatched square brackets?\n",linecnt);
2438 if (counters.c_brack)
2439 sprintf(cbrack_err," Line %ld - "
2440 "Mismatched curly brackets?\n",linecnt);
2441 if (counters.c_unders%2)
2442 sprintf(unders_err," Line %ld - Mismatched underscores?\n",
2444 memset(&counters,0,sizeof(counters));
2445 /* let the next iteration know that it's starting a new para */
2449 * Check for omitted punctuation at end of paragraph by working back
2450 * through prevline. DW.
2451 * Need to check this only for "normal" paras.
2452 * So what is a "normal" para?
2453 * Not normal if one-liner (chapter headings, etc.)
2454 * Not normal if doesn't contain at least one locase letter
2455 * Not normal if starts with space
2460 for (s=prevline,i=0;*s && !i;s++)
2462 /* use i to indicate the presence of a letter on the line */
2465 * This next "if" is a problem.
2466 * If we say "start_para_line <= linecnt - 1", that includes
2467 * one-line "paragraphs" like chapter heads. Lotsa false positives.
2468 * If we say "start_para_line < linecnt - 1" it doesn't, but then it
2469 * misses genuine one-line paragraphs.
/*
 * NOTE(review): the para-end message below passes strlen(prevline), a
 * size_t, to a %d conversion without a cast.
 */
2471 if (i && lastblen>2 && start_para_line<linecnt-1 &&
2472 *prevline>CHAR_SPACE)
2474 for (i=strlen(prevline)-1;
2475 (prevline[i]==CHAR_DQUOTE || prevline[i]==CHAR_SQUOTE) &&
2476 prevline[i]>CHAR_SPACE && i>0;
2481 if (gcisalpha(prevline[i]))
2483 if (pswit[ECHO_SWITCH])
2484 printf("\n%s\n",prevline);
2485 if (!pswit[OVERVIEW_SWITCH])
2486 printf(" Line %ld column %d - "
2487 "No punctuation at para end?\n",
2488 linecnt-1,strlen(prevline));
2493 if (strchr("-.:!([{?}])",prevline[i]))
2498 strcpy(prevline,aline);
/*
 * NOTE(review): the suffix argument of the summary below is the constant
 * plural ending, so a count of one is printed with the plural form.
 */
2501 if (!pswit[OVERVIEW_SWITCH])
2502 for (i=0;i<MAX_QWORD;i++)
2504 printf("\nNote: Queried word %s was duplicated %d time%s\n",
2505 qword[i],dupcnt[i],"s");
2511 * Get one line from the input stream, checking for
2512 * the existence of exactly one CR/LF line-end per line.
2514 * Returns: a pointer to the line.
/*
 * flgets: read one line from thefile into theline, reporting suspicious
 * line endings on the way.
 *
 *   theline - destination buffer; also handed to the markup clean-up
 *             routines at the end
 *   maxlen  - the read loop keeps going only while len<maxlen
 *   thefile - input stream, consumed one character at a time with fgetc()
 *   lcnt    - line number quoted in the diagnostics
 *
 * NOTE(review): only part of the body is visible in this listing (braces,
 * declarations and return statements are not shown), so the remarks below
 * cover the visible statements only.
 */
2516 char *flgets(char *theline,int maxlen,FILE *thefile,long lcnt)
/* Prime the loop: the first character is stored in both c and cint. */
2522 c=cint=fgetc(thefile);
2527 /* either way, it's end of line */
2534 /* Error - a LF without a preceding CR */
/*
 * Each line-end diagnostic below is gated on pswit[LINE_END_SWITCH]; the
 * offending line is echoed when pswit[ECHO_SWITCH] is set and the message
 * itself is printed only when pswit[OVERVIEW_SWITCH] is clear.
 */
2535 if (pswit[LINE_END_SWITCH])
2537 if (pswit[ECHO_SWITCH])
2538 printf("\n%s\n",theline);
2539 if (!pswit[OVERVIEW_SWITCH])
2540 printf(" Line %ld - No CR?\n",lcnt);
2551 /* Error - two successive CRs */
2552 if (pswit[LINE_END_SWITCH])
2554 if (pswit[ECHO_SWITCH])
2555 printf("\n%s\n",theline);
2556 if (!pswit[OVERVIEW_SWITCH])
2557 printf(" Line %ld - Two successive CRs?\n",lcnt);
/* isCR still set at this point is reported as a CR with no LF after it. */
2566 if (pswit[LINE_END_SWITCH] && isCR)
2568 if (pswit[ECHO_SWITCH])
2569 printf("\n%s\n",theline);
2570 if (!pswit[OVERVIEW_SWITCH])
2571 printf(" Line %ld column %d - CR without LF?\n",
/* Fetch the next character for the following pass of the loop. */
2581 c=cint=fgetc(thefile);
2582 } while(len<maxlen);
/* Optional clean-up of the finished line: HTML first, then DP markup. */
2583 if (pswit[MARKUP_SWITCH])
2584 postprocess_for_HTML(theline);
2585 if (pswit[DP_SWITCH])
2586 postprocess_for_DP(theline);
/*
 * mixdigit:
 *
 * Takes a "word" as a parameter, and checks whether it contains a mixture
 * of letters and digits. Generally, this is an error, but it is accepted
 * for common legitimate forms such as 4th, 21sts, 2ndly, 12l. 3s. 11d.
 * and L500.
 *
 * Returns: 0 if no error found, 1 if error.
 */
int mixdigit(char *checkword)
{
    int wehaveadigit,wehavealetter,firstdigits,query,wl;
    char *s;

    wehaveadigit=wehavealetter=query=0;
    for (s=checkword;*s;s++)
    {
        if (gcisalpha(*s))
            wehavealetter=1;
        else if (gcisdigit(*s))
            wehaveadigit=1;
    }
    if (wehaveadigit && wehavealetter)
    {
        /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
        query=1;
        wl=(int)strlen(checkword);
        /* firstdigits = length of the leading run of digits */
        for (firstdigits=0;gcisdigit(checkword[firstdigits]);firstdigits++)
            ;
        /* digits, ending in st, rd, nd, th of either case */
        if (firstdigits+2==wl && (matchword(checkword+wl-2,"st") ||
          matchword(checkword+wl-2,"rd") || matchword(checkword+wl-2,"nd") ||
          matchword(checkword+wl-2,"th")))
            query=0;
        /* the same, pluralised: 21sts, 3rds, ... */
        if (firstdigits+3==wl && (matchword(checkword+wl-3,"sts") ||
          matchword(checkword+wl-3,"rds") || matchword(checkword+wl-3,"nds") ||
          matchword(checkword+wl-3,"ths")))
            query=0;
        /*
         * The adverb forms: 1stly, 2ndly, ...  The suffix is four characters
         * long, so the digits must account for all but the last four;
         * testing against wl-3 here could never match and pointed the
         * comparison before the start of a three-character word.
         */
        if (firstdigits+4==wl && (matchword(checkword+wl-4,"stly") ||
          matchword(checkword+wl-4,"rdly") ||
          matchword(checkword+wl-4,"ndly") || matchword(checkword+wl-4,"thly")))
            query=0;
        /* digits, ending in l, L, s or d */
        if (firstdigits+1==wl && (checkword[wl-1]=='l' ||
          checkword[wl-1]=='L' || checkword[wl-1]=='s' || checkword[wl-1]=='d'))
            query=0;
        /*
         * L at the start of a number, representing British pounds, like L500.
         * This is cute. We know the current word is mixeddigit. If the first
         * letter is L, there must be at least one digit following. If both
         * digits and letters follow, we have a genuine error, else we have a
         * capital L followed by digits, and we accept that as a non-error.
         */
        if (checkword[0]=='L' && !mixdigit(checkword+1))
            query=0;
    }
    return query;
}
2650 * Extracts the first/next "word" from the line, and puts
2651 * it into "thisword". A word is defined as one English word unit--or
2652 * at least that's the aim.
2654 * Returns: a pointer to the position in the line where we will start
2655 * looking for the next word.
2657 char *getaword(char *fromline,char *thisword)
2662 for (;!gcisdigit(*fromline) && !gcisalpha(*fromline) && *fromline;
2666 * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
2667 * Especially yucky is the case of L1,000
2668 * This section looks for a pattern of characters including a digit
2669 * followed by a comma or period followed by one or more digits.
2670 * If found, it returns this whole pattern as a word; otherwise we discard
2671 * the results and resume our normal programming.
2674 for (;(gcisdigit(*s) || gcisalpha(*s) || *s==',' || *s=='.') &&
2675 wordlen<MAXWORDLEN;s++)
2677 thisword[wordlen]=*s;
2680 thisword[wordlen]=0;
2681 for (i=1;i<wordlen-1;i++)
2683 if (thisword[i]=='.' || thisword[i]==',')
2685 if (gcisdigit(thisword[i-1]) && gcisdigit(thisword[i-1]))
2692 /* we didn't find a punctuated number - do the regular getword thing */
2694 for (;(gcisdigit(*fromline) || gcisalpha(*fromline) || *fromline=='\'') &&
2695 wordlen<MAXWORDLEN;fromline++)
2697 thisword[wordlen]=*fromline;
2700 thisword[wordlen]=0;
/*
 * matchword:
 *
 * A case-insensitive string matcher.
 *
 * Returns: 1 if the two strings have the same length and are equal once
 * case is ignored, 0 otherwise.
 */
int matchword(char *checkfor,char *thisword)
{
    size_t len,i;

    len=strlen(checkfor);
    if (len!=strlen(thisword))
        return 0;
    for (i=0;i<len;i++)
    {
        /*
         * toupper() is only defined for values representable as unsigned
         * char (or EOF); 8-bit text can hold negative chars, hence the casts.
         */
        if (toupper((unsigned char)checkfor[i])!=
          toupper((unsigned char)thisword[i]))
            return 0;
    }
    return 1;
}
/*
 * lowerit:
 *
 * Lowercase the line in place. Only the ASCII letters 'A'..'Z' are
 * changed; every other byte is left as it is.
 */
void lowerit(char *theline)
{
    for (;*theline;theline++)
    {
        if (*theline>='A' && *theline<='Z')
            *theline+='a'-'A';
    }
}
/*
 * isroman:
 *
 * Is this word a Roman Numeral? The word is expected in lower case.
 *
 * It doesn't actually validate that the number is a valid Roman Numeral--for
 * example it will pass mxxxxxxxxxx as a valid Roman Numeral, but that's not
 * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
 * Allow, in this order: any number of Ms, an optional D, an optional CM,
 * an optional CD, any number of Cs, an optional XL, an optional XC, an
 * optional L, any number of Xs, an optional IX, an optional IV, an optional
 * V and any number of Is.
 *
 * Returns: non-zero if the whole word fits that pattern, 0 if it does not
 * or if the word is null or empty.
 */
int isroman(char *t)
{
    if (!t || !*t)
        return 0;
    while (*t=='m')
        t++;
    if (*t=='d')
        t++;
    if (*t=='c' && t[1]=='m')
        t+=2;
    if (*t=='c' && t[1]=='d')
        t+=2;
    while (*t=='c')
        t++;
    if (*t=='x' && t[1]=='l')
        t+=2;
    if (*t=='x' && t[1]=='c')
        t+=2;
    if (*t=='l')
        t++;
    while (*t=='x')
        t++;
    if (*t=='i' && t[1]=='x')
        t+=2;
    if (*t=='i' && t[1]=='v')
        t+=2;
    if (*t=='v')
        t++;
    while (*t=='i')
        t++;
    /* It looks like a numeral only if nothing is left over. */
    return *t=='\0';
}
/*
 * gcisalpha:
 *
 * A version of isalpha() that is somewhat lenient on 8-bit texts.
 * If we use the standard function, 8-bit accented characters break
 * words, so that tete with accented characters appears to be two words, "t"
 * and "t", with 8-bit characters between them. This causes over-reporting of
 * errors. gcisalpha() recognizes accented letters from the CP1252 (Windows)
 * and ISO-8859-1 character sets, which are the most common PG 8-bit types.
 *
 * Returns: 1 if c counts as a letter, 0 otherwise.
 */
int gcisalpha(unsigned char c)
{
    if ((c>='a' && c<='z') || (c>='A' && c<='Z'))
        return 1;
    /*
     * The upper quarter of ISO-8859-1 is letters, apart from the
     * multiplication sign (215) and the division sign (247); Eth (208, 240)
     * and Thorn (222, 254) are also left out.
     */
    if (c>=192 && c!=208 && c!=215 && c!=222 && c!=240 && c!=247 && c!=254)
        return 1;
    /* The letters CP1252 places in the 128..159 block. */
    if (c==140 || c==142 || c==156 || c==158 || c==159)
        return 1;
    return 0;
}
/*
 * gcisdigit:
 *
 * A version of isdigit() that doesn't get confused in 8-bit texts.
 *
 * Returns: 1 for the ASCII digits '0'..'9', 0 for anything else.
 */
int gcisdigit(unsigned char c)
{
    return c>='0' && c<='9';
}
/*
 * gcisletter:
 *
 * A version of isletter() that doesn't get confused in 8-bit texts.
 * NB: this is ISO-8859-1-specific. Unlike gcisalpha(), every code from
 * 192 upwards is accepted without exception.
 *
 * Returns: 1 if c is an ASCII letter or is 192 or above, 0 otherwise.
 */
int gcisletter(unsigned char c)
{
    return (c>='A' && c<='Z') || (c>='a' && c<='z') || c>=192;
}
/*
 * gcstrchr:
 *
 * Wraps strchr to return NULL if the character being searched for is zero.
 * (Plain strchr would instead find the string's terminating NUL.)
 *
 * Returns: a pointer to the first occurrence of c in s, or NULL if c is
 * zero or does not occur.
 */
char *gcstrchr(char *s,char c)
{
    if (!c)
        return NULL;
    return strchr(s,c);
}
2842 * postprocess_for_DP:
2844 * Invoked with the -d switch from flgets().
2845 * It simply "removes" from the line a hard-coded set of common
2846 * DP-specific tags, so that the line passed to the main routine has
2847 * been pre-cleaned of DP markup.
2849 void postprocess_for_DP(char *theline)
2855 for (i=0;*DPmarkup[i];i++)
2857 s=strstr(theline,DPmarkup[i]);
2860 t=s+strlen(DPmarkup[i]);
2868 s=strstr(theline,DPmarkup[i]);
/*
 * postprocess_for_HTML:
 *
 * Invoked with the -m switch from flgets().
 * It simply "removes" from the line a hard-coded set of common
 * HTML tags and "replaces" a hard-coded set of common HTML
 * entities, so that the line passed to the main routine has
 * been pre-cleaned of HTML.
 */
void postprocess_for_HTML(char *theline)
{
    /* Tags are only looked for when the line holds both a '<' and a '>'. */
    if (strstr(theline,"<") && strstr(theline,">"))
    {
        /* Keep going until losemarkup() has nothing more to report. */
        while (losemarkup(theline))
            ;
    }
    /* Likewise for entities, one replacement per call. */
    while (loseentities(theline))
        ;
}
/*
 * losemarkup:
 *
 * Locates the first '<' and the first '>' in theline and tests the text
 * that follows the '<' against each entry of markup[] (a list ended by an
 * empty string) using tagcomp().
 *
 * postprocess_for_HTML() calls this repeatedly until it returns a null
 * pointer.
 *
 * NOTE(review): only part of the body is visible in this listing; what is
 * done with a recognised tag, and the exact return values, are not shown.
 */
2891 char *losemarkup(char *theline)
/* s: first '<' in the line; t: first '>' in the line. */
2897 s=strstr(theline,"<");
2898 t=strstr(theline,">");
/* tagcomp() returning zero means the tag matched this markup[] entry. */
2901 for (i=0;*markup[i];i++)
2902 if (!tagcomp(s+1,markup[i]))
2915 /* It's an unrecognized <xxx>. */
2919 char *loseentities(char *theline)
2925 for (i=0;*entities[i].htmlent;i++)
2927 s=strstr(theline,entities[i].htmlent);
2930 t=malloc((size_t)strlen(s));
2933 strcpy(t,s+strlen(entities[i].htmlent));
2934 strcpy(s,entities[i].textent);
2940 for (i=0;*entities[i].htmlnum;i++)
2942 s=strstr(theline,entities[i].htmlnum);
2945 t=malloc((size_t)strlen(s));
2948 strcpy(t,s+strlen(entities[i].htmlnum));
2949 strcpy(s,entities[i].textent);
/*
 * tagcomp:
 *
 * Compares the start of strin (the text just after a '<') with basetag,
 * ignoring case and one leading '/' on strin. The comparison stops at the
 * end of whichever string is shorter, so it is a prefix test rather than
 * an exact match.
 *
 * Returns: 0 if no differing character is found, 1 otherwise.
 */
int tagcomp(char *strin,char *basetag)
{
    char *s,*t;

    s=basetag;
    t=strin;
    if (*t=='/')
        t++; /* ignore a slash */
    for (;*s && *t;s++,t++)
    {
        /* tolower() needs an unsigned char value; 8-bit text may not be. */
        if (tolower((unsigned char)*s)!=tolower((unsigned char)*t))
            return 1;
    }
    return 0;
}
2977 fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
2978 fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
2979 fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
2980 fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
2981 "For details, read the file COPYING.\n",stderr);
2982 fputs("This is Free Software; "
2983 "you may redistribute it under certain conditions (GPL);\n",stderr);
2984 fputs("read the file COPYING for details.\n\n",stderr);
2985 fputs("Usage is: bookloupe [-setpxloyhud] filename\n",stderr);
2986 fputs(" where -s checks single quotes, -e suppresses echoing lines, "
2987 "-t checks typos\n",stderr);
2988 fputs(" -x (paranoid) switches OFF -t and extra checks, "
2989 "-l turns OFF line-end checks\n",stderr);
2990 fputs(" -o just displays overview without detail, "
2991 "-h echoes header fields\n",stderr);
2992 fputs(" -v (verbose) unsuppresses duplicate reporting, "
2993 "-m suppresses markup\n",stderr);
2994 fputs(" -d ignores DP-specific markup,\n",stderr);
2995 fputs(" -u uses a file gutcheck.typ to query user-defined "
2996 "possible typos\n",stderr);
2997 fputs("Sample usage: bookloupe warpeace.txt \n",stderr);
2999 fputs("Bookloupe looks for errors in Project Gutenberg(TM) etexts.\n",
3001 fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
3002 "non-ASCII\n",stderr);
3003 fputs("characters like accented letters, "
3004 "lines longer than 75 or shorter than 55,\n",stderr);
3005 fputs("unbalanced quotes or brackets, "
3006 "a variety of badly formatted punctuation, \n",stderr);
3007 fputs("HTML tags, some likely typos. "
3008 "It is NOT a substitute for human judgement.\n",stderr);