1 /*************************************************************************/
2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com> */
5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk> */
7 /* This program is free software; you can redistribute it and/or modify */
8 /* it under the terms of the GNU General Public License as published by */
9 /* the Free Software Foundation; either version 2 of the License, or */
10 /* (at your option) any later version. */
12 /* This program is distributed in the hope that it will be useful, */
13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
15 /* GNU General Public License for more details. */
17 /* You should have received a copy of the GNU General Public License */
18 /* along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 /*************************************************************************/
26 #define MAXWORDLEN 80 /* max length of one word */
27 #define LINEBUFSIZE 2048 /* buffer size for an input line */
29 #define MAX_USER_TYPOS 1000
30 #define USERTYPO_FILE "gutcheck.typ"
33 #define MAX_PATH 16384
36 char aline[LINEBUFSIZE];
37 char prevline[LINEBUFSIZE];
41 "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
42 "nad", "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa",
43 "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
44 "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
45 "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
46 "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
47 "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
48 "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
49 "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
50 "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
51 "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
52 "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
53 "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
54 "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
55 "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
56 "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
57 "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
58 "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
59 "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
60 "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
61 "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
62 "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
63 "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
64 "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
65 "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
66 "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
67 "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
68 "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
69 "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
73 char *usertypo[MAX_USER_TYPOS];
75 /* Common abbreviations and other OK words not to query as typos. */
77 "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
78 "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
79 "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
80 "outbid", "outbids", "frostbite", "frostbitten", ""
83 /* Common abbreviations that cause otherwise unexplained periods. */
85 "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
86 "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
90 * Two-Letter combinations that rarely if ever start words,
91 * but are common scannos or otherwise common letter combinations.
94 "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
98 * Two-Letter combinations that rarely if ever end words,
99 * but are common scannos or otherwise common letter combinations.
102 "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
103 "sw", "gr", "sl", "cl", "iy", ""
107 "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
108 "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
109 "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
110 "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
114 "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
118 "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
119 "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
120 "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
121 "during", "let", "toward", "among", ""
125 "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
126 "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
127 "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
128 "among", "those", "into", "whom", "having", "thence", ""
131 char vowels[] = "aeiouàáâãäæèéêëìíîïòóôõöùúûü";
138 "&", "&", "&",
139 "<", "<", "<",
140 ">", ">", ">",
141 "°", "°", " degrees",
142 "£", "£", "L",
143 """, """, "\"", /* quotation mark = APL quote */
144 "Œ", "Œ", "OE", /* latin capital ligature OE */
145 "œ", "œ", "oe", /* latin small ligature oe */
146 "Š", "Š", "S", /* latin capital letter S with caron */
147 "š", "š", "s", /* latin small letter s with caron */
148 "Ÿ", "Ÿ", "Y", /* latin capital letter Y with diaeresis */
149 "ˆ", "ˆ", "", /* modifier letter circumflex accent */
150 "˜", "˜", "~", /* small tilde, U+02DC ISOdia */
151 " ", " ", " ", /* en space, U+2002 ISOpub */
152 " ", " ", " ", /* em space, U+2003 ISOpub */
153 " ", " ", " ", /* thin space, U+2009 ISOpub */
154 "–", "–", "-", /* en dash, U+2013 ISOpub */
155 "—", "—", "--", /* em dash, U+2014 ISOpub */
156 "’", "’", "'", /* right single quotation mark */
157 "‚", "‚", "'", /* single low-9 quotation mark */
158 "“", "“", "\"", /* left double quotation mark */
159 "”", "”", "\"", /* right double quotation mark */
160 "„", "„", "\"", /* double low-9 quotation mark */
161 "‹", "‹", "\"", /* single left-pointing angle quotation mark */
162 "›", "›", "\"", /* single right-pointing angle quotation mark */
163 " ", " ", " ", /* no-break space = non-breaking space, */
164 "¡", "¡", "!", /* inverted exclamation mark */
165 "¢", "¢", "c", /* cent sign */
166 "£", "£", "L", /* pound sign */
167 "¤", "¤", "$", /* currency sign */
168 "¥", "¥", "Y", /* yen sign = yuan sign */
169 "§", "§", "--", /* section sign */
170 "¨", "¨", " ", /* diaeresis = spacing diaeresis */
171 "©", "©", "(C) ", /* copyright sign */
172 "ª", "ª", " ", /* feminine ordinal indicator */
173 "«", "«", "\"", /* left-pointing double angle quotation mark */
174 "­", "­", "-", /* soft hyphen = discretionary hyphen */
175 "®", "®", "(R) ", /* registered sign = registered trade mark sign */
176 "¯", "¯", " ", /* macron = spacing macron = overline */
177 "°", "°", " degrees", /* degree sign */
178 "±", "±", "+-", /* plus-minus sign = plus-or-minus sign */
179 "²", "²", "2", /* superscript two = superscript digit two */
180 "³", "³", "3", /* superscript three = superscript digit three */
181 "´", "´", " ", /* acute accent = spacing acute */
182 "µ", "µ", "m", /* micro sign */
183 "¶", "¶", "--", /* pilcrow sign = paragraph sign */
184 "¸", "¸", " ", /* cedilla = spacing cedilla */
185 "¹", "¹", "1", /* superscript one = superscript digit one */
186 "º", "º", " ", /* masculine ordinal indicator */
187 "»", "»", "\"", /* right-pointing double angle quotation mark */
188 "¼", "¼", "1/4", /* vulgar fraction one quarter */
189 "½", "½", "1/2", /* vulgar fraction one half */
190 "¾", "¾", "3/4", /* vulgar fraction three quarters */
191 "¿", "¿", "?", /* inverted question mark */
192 "À", "À", "A", /* latin capital letter A with grave */
193 "Á", "Á", "A", /* latin capital letter A with acute */
194 "Â", "Â", "A", /* latin capital letter A with circumflex */
195 "Ã", "Ã", "A", /* latin capital letter A with tilde */
196 "Ä", "Ä", "A", /* latin capital letter A with diaeresis */
197 "Å", "Å", "A", /* latin capital letter A with ring above */
198 "Æ", "Æ", "AE", /* latin capital letter AE */
199 "Ç", "Ç", "C", /* latin capital letter C with cedilla */
200 "È", "È", "E", /* latin capital letter E with grave */
201 "É", "É", "E", /* latin capital letter E with acute */
202 "Ê", "Ê", "E", /* latin capital letter E with circumflex */
203 "Ë", "Ë", "E", /* latin capital letter E with diaeresis */
204 "Ì", "Ì", "I", /* latin capital letter I with grave */
205 "Í", "Í", "I", /* latin capital letter I with acute */
206 "Î", "Î", "I", /* latin capital letter I with circumflex */
207 "Ï", "Ï", "I", /* latin capital letter I with diaeresis */
208 "Ð", "Ð", "E", /* latin capital letter ETH */
209 "Ñ", "Ñ", "N", /* latin capital letter N with tilde */
210 "Ò", "Ò", "O", /* latin capital letter O with grave */
211 "Ó", "Ó", "O", /* latin capital letter O with acute */
212 "Ô", "Ô", "O", /* latin capital letter O with circumflex */
213 "Õ", "Õ", "O", /* latin capital letter O with tilde */
214 "Ö", "Ö", "O", /* latin capital letter O with diaeresis */
215 "×", "×", "*", /* multiplication sign */
216 "Ø", "Ø", "O", /* latin capital letter O with stroke */
217 "Ù", "Ù", "U", /* latin capital letter U with grave */
218 "Ú", "Ú", "U", /* latin capital letter U with acute */
219 "Û", "Û", "U", /* latin capital letter U with circumflex */
220 "Ü", "Ü", "U", /* latin capital letter U with diaeresis */
221 "Ý", "Ý", "Y", /* latin capital letter Y with acute */
222 "Þ", "Þ", "TH", /* latin capital letter THORN */
223 "ß", "ß", "sz", /* latin small letter sharp s = ess-zed */
224 "à", "à", "a", /* latin small letter a with grave */
225 "á", "á", "a", /* latin small letter a with acute */
226 "â", "â", "a", /* latin small letter a with circumflex */
227 "ã", "ã", "a", /* latin small letter a with tilde */
228 "ä", "ä", "a", /* latin small letter a with diaeresis */
229 "å", "å", "a", /* latin small letter a with ring above */
230 "æ", "æ", "ae", /* latin small letter ae */
231 "ç", "ç", "c", /* latin small letter c with cedilla */
232 "è", "è", "e", /* latin small letter e with grave */
233 "é", "é", "e", /* latin small letter e with acute */
234 "ê", "ê", "e", /* latin small letter e with circumflex */
235 "ë", "ë", "e", /* latin small letter e with diaeresis */
236 "ì", "ì", "i", /* latin small letter i with grave */
237 "í", "í", "i", /* latin small letter i with acute */
238 "î", "î", "i", /* latin small letter i with circumflex */
239 "ï", "ï", "i", /* latin small letter i with diaeresis */
240 "ð", "ð", "eth", /* latin small letter eth */
241 "ñ", "ñ", "n", /* latin small letter n with tilde */
242 "ò", "ò", "o", /* latin small letter o with grave */
243 "ó", "ó", "o", /* latin small letter o with acute */
244 "ô", "ô", "o", /* latin small letter o with circumflex */
245 "õ", "õ", "o", /* latin small letter o with tilde */
246 "ö", "ö", "o", /* latin small letter o with diaeresis */
247 "÷", "÷", "/", /* division sign */
248 "ø", "ø", "o", /* latin small letter o with stroke */
249 "ù", "ù", "u", /* latin small letter u with grave */
250 "ú", "ú", "u", /* latin small letter u with acute */
251 "û", "û", "u", /* latin small letter u with circumflex */
252 "ü", "ü", "u", /* latin small letter u with diaeresis */
253 "ý", "ý", "y", /* latin small letter y with acute */
254 "þ", "þ", "th", /* latin small letter thorn */
255 "ÿ", "ÿ", "y", /* latin small letter y with diaeresis */
259 /* special characters */
260 #define CHAR_SPACE 32
264 #define CHAR_DQUOTE 34
265 #define CHAR_SQUOTE 39
266 #define CHAR_OPEN_SQUOTE 96
267 #define CHAR_TILDE 126
268 #define CHAR_ASTERISK 42
269 #define CHAR_FORESLASH 47
270 #define CHAR_CARAT 94
272 #define CHAR_UNDERSCORE '_'
273 #define CHAR_OPEN_CBRACK '{'
274 #define CHAR_CLOSE_CBRACK '}'
275 #define CHAR_OPEN_RBRACK '('
276 #define CHAR_CLOSE_RBRACK ')'
277 #define CHAR_OPEN_SBRACK '['
278 #define CHAR_CLOSE_SBRACK ']'
280 /* longest and shortest normal PG line lengths */
281 #define LONGEST_PG_LINE 75
282 #define WAY_TOO_LONG 80
283 #define SHORTEST_PG_LINE 55
285 #define SWITCHES "ESTPXLOYHWVMUD" /* switches:- */
286 /* D - ignore DP-specific markup */
287 /* E - echo queried line */
288 /* S - check single quotes */
289 /* T - check common typos */
290 /* P - require closure of quotes on */
291 /* every paragraph */
292 /* X - "Trust no one" :-) Paranoid! */
293 /* Queries everything */
294 /* L - line end checking defaults on */
295 /* -L turns it off */
296 /* O - overview. Just shows counts. */
297 /* Y - puts errors to stdout */
298 /* instead of stderr */
299 /* H - Echoes header fields */
300 /* M - Ignore markup in < > */
301 /* U - Use file of User-defined Typos*/
302 /* W - Defaults for use on Web upload*/
303 /* V - Verbose - list EVERYTHING! */
304 #define SWITNO 14 /* max number of switch parms */
305 /* - used for defining array-size */
306 #define MINARGS 1 /* minimum no of args excl switches */
307 #define MAXARGS 1 /* maximum no of args excl switches */
309 int pswit[SWITNO]; /* program switches set by SWITCHES */
311 #define ECHO_SWITCH 0
312 #define SQUOTE_SWITCH 1
313 #define TYPO_SWITCH 2
314 #define QPARA_SWITCH 3
315 #define PARANOID_SWITCH 4
316 #define LINE_END_SWITCH 5
317 #define OVERVIEW_SWITCH 6
318 #define STDOUT_SWITCH 7
319 #define HEADER_SWITCH 8
321 #define VERBOSE_SWITCH 10
322 #define MARKUP_SWITCH 11
323 #define USERTYPO_SWITCH 12
326 long cnt_dquot; /* for overview mode, count of doublequote queries */
327 long cnt_squot; /* for overview mode, count of singlequote queries */
328 long cnt_brack; /* for overview mode, count of brackets queries */
329 long cnt_bin; /* for overview mode, count of non-ASCII queries */
330 long cnt_odd; /* for overview mode, count of odd character queries */
331 long cnt_long; /* for overview mode, count of long line errors */
332 long cnt_short; /* for overview mode, count of short line queries */
333 long cnt_punct; /* for overview mode, count of punctuation and spacing queries */
334 long cnt_dash; /* for overview mode, count of dash-related queries */
335 long cnt_word; /* for overview mode, count of word queries */
336 long cnt_html; /* for overview mode, count of html queries */
337 long cnt_lineend; /* for overview mode, count of line-end queries */
338 long cnt_spacend; /* count of lines with space at end */
339 long linecnt; /* count of total lines in the file */
340 long checked_linecnt; /* count of lines actually checked */
343 void procfile(char *);
345 #define LOW_THRESHOLD 0
346 #define HIGH_THRESHOLD 1
352 #define FIRST_OF_PAIR 0
353 #define SECOND_OF_PAIR 1
355 #define MAX_WORDPAIR 1000
357 char running_from[MAX_PATH];
359 int mixdigit(char *);
360 char *getaword(char *,char *);
361 int matchword(char *,char *);
362 char *flgets(char *,int,FILE *,long);
363 void lowerit(char *);
364 int gcisalpha(unsigned char);
365 int gcisdigit(unsigned char);
366 int gcisletter(unsigned char);
367 char *gcstrchr(char *s,char c);
368 void postprocess_for_HTML(char *);
369 char *linehasmarkup(char *);
370 char *losemarkup(char *);
371 int tagcomp(char *,char *);
372 char *loseentities(char *);
375 void postprocess_for_DP(char *);
377 char wrk[LINEBUFSIZE];
380 #define MAX_QWORD_LENGTH 40
381 char qword[MAX_QWORD][MAX_QWORD_LENGTH];
382 char qperiod[MAX_QWORD][MAX_QWORD_LENGTH];
383 signed int dupcnt[MAX_QWORD];
385 int main(int argc,char **argv)
389 char usertypo_file[MAX_PATH];
391 if (strlen(argv[0])<sizeof(running_from))
392 /* save the path to the executable */
393 strcpy(running_from,argv[0]);
394 /* find out what directory we're running from */
395 s=running_from+strlen(running_from);
396 for (;*s!='/' && *s!='\\' && s>=running_from;s--)
398 switno=strlen(SWITCHES);
/* run down to index 0 inclusive, matching the switch-parsing loop below */
399 for (i=switno;--i>=0;)
400 pswit[i]=0; /* initialise switches */
402 * Standard loop to extract switches.
403 * When we come out of this loop, the arguments will be
404 * in argv[0] upwards and the switches used will be
405 * represented by their equivalent elements in pswit[]
407 while (--argc>0 && **++argv=='-')
408 for (argsw=argv[0]+1;*argsw!='\0';argsw++)
409 for (i=switno,invarg=1;(--i>=0) && invarg==1;)
410 if ((toupper(*argsw))==SWITCHES[i])
415 /* Paranoid checking is turned OFF, not on, by its switch */
416 pswit[PARANOID_SWITCH]^=1;
417 if (pswit[PARANOID_SWITCH])
418 /* if running in paranoid mode force typo checks as well */
419 pswit[TYPO_SWITCH]=pswit[TYPO_SWITCH]^1;
420 /* Line-end checking is turned OFF, not on, by its switch */
421 pswit[LINE_END_SWITCH]^=1;
422 /* Echoing is turned OFF, not on, by its switch */
423 pswit[ECHO_SWITCH]^=1;
424 if (pswit[OVERVIEW_SWITCH])
425 /* just print summary; don't echo */
426 pswit[ECHO_SWITCH]=0;
428 * Web uploads - for the moment, this is really just a placeholder
429 * until we decide what processing we really want to do on web uploads
431 if (pswit[WEB_SWITCH])
433 /* specific override for web uploads */
434 pswit[ECHO_SWITCH]=1;
435 pswit[SQUOTE_SWITCH]=0;
436 pswit[TYPO_SWITCH]=1;
437 pswit[QPARA_SWITCH]=0;
438 pswit[PARANOID_SWITCH]=1;
439 pswit[LINE_END_SWITCH]=0;
440 pswit[OVERVIEW_SWITCH]=0;
441 pswit[STDOUT_SWITCH]=0;
442 pswit[HEADER_SWITCH]=1;
443 pswit[VERBOSE_SWITCH]=0;
444 pswit[MARKUP_SWITCH]=0;
445 pswit[USERTYPO_SWITCH]=0;
448 if (argc<MINARGS || argc>MAXARGS)
450 /* check number of args */
454 /* read in the user-defined stealth scanno list */
455 if (pswit[USERTYPO_SWITCH])
457 /* ... we were told we had one! */
458 usertypofile=fopen(USERTYPO_FILE,"rb");
461 /* not in cwd. try executable directory. */
/*
 * Bounded join: running_from may already be close to MAX_PATH long, so
 * appending the file name with strcpy/strcat could overrun usertypo_file.
 * A truncated path simply fails to open and is reported below.
 */
462 snprintf(usertypo_file,sizeof(usertypo_file),"%s%s",
463 running_from,USERTYPO_FILE);
464 usertypofile=fopen(usertypo_file,"rb");
466 /* we ain't got no user typo file! */
467 printf(" --> I couldn't find gutcheck.typ "
468 "-- proceeding without user typos.\n");
474 /* we managed to open a User Typo File! */
475 if (pswit[USERTYPO_SWITCH])
477 while (flgets(aline,LINEBUFSIZE-1,usertypofile,
478 (long)usertypo_count))
484 s=malloc(strlen(aline)+1);
487 fprintf(stderr,"bookloupe: cannot get enough "
488 "memory for user typo file!\n");
492 usertypo[usertypo_count]=s;
494 if (usertypo_count>=MAX_USER_TYPOS)
/* supply the limit that the %d conversion expects */
496 printf(" --> Only %d user-defined typos "
497 "allowed: ignoring the rest\n",MAX_USER_TYPOS);
504 fclose(usertypofile);
507 fprintf(stderr,"bookloupe: Check and report on an e-text\n");
508 cnt_dquot=cnt_squot=cnt_brack=cnt_bin=cnt_odd=cnt_long=
509 cnt_short=cnt_punct=cnt_dash=cnt_word=cnt_html=cnt_lineend=
512 if (pswit[OVERVIEW_SWITCH])
514 printf(" Checked %ld lines of %ld (head+foot = %ld)\n\n",
515 checked_linecnt,linecnt,linecnt-checked_linecnt);
516 printf(" --------------- Queries found --------------\n");
518 printf(" Long lines: %14ld\n",cnt_long);
520 printf(" Short lines: %14ld\n",cnt_short);
522 printf(" Line-end problems: %14ld\n",cnt_lineend);
524 printf(" Common typos: %14ld\n",cnt_word);
526 printf(" Unmatched quotes: %14ld\n",cnt_dquot);
528 printf(" Unmatched SingleQuotes: %14ld\n",cnt_squot);
530 printf(" Unmatched brackets: %14ld\n",cnt_brack);
532 printf(" Non-ASCII characters: %14ld\n",cnt_bin);
534 printf(" Proofing characters: %14ld\n",cnt_odd);
536 printf(" Punctuation & spacing queries: %14ld\n",cnt_punct);
538 printf(" Non-standard dashes: %14ld\n",cnt_dash);
540 printf(" Possible HTML tags: %14ld\n",cnt_html);
542 printf(" TOTAL QUERIES %14ld\n",
543 cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+
544 cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend);
/*
 * Tallies gathered by first_pass() over the input file. procfile() reads
 * them to decide which classes of query to suppress and to print its
 * summary notes.
 *
 * firstline            - line number of the first line of non-header text
 * footerline           - line number at which a PG footer was detected;
 *                        non-zero once a footer has been seen
 * totlen               - running total of line lengths
 * binlen, alphalen     - presumably counts of hi-bit and alphabetic
 *                        characters; procfile() compares them with totlen
 *                        (TODO confirm: the increments are not visible here)
 * endquote_count       - double quotes that directly follow a letter
 * astline, shortline, dotcomma, hyphens, longline, spacedash
 *                      - reported by procfile() as counts of lines with
 *                        asterisks, short lines, lines containing ".,",
 *                        lines with hyphens at end, long lines and spaced
 *                        dashes respectively
 * fslashline           - lines containing a forward slash
 * verylongline         - lines longer than WAY_TOO_LONG
 * htmcount             - score for HTML-looking markup ("<i>" adds 4)
 * standalone_digit     - words that are exactly "0" or "1"
 * space_emdash         - "--" with a space on at least one side
 * non_PG_space_emdash  - "--" with a space on both sides
 * PG_space_emdash      - "--" with no space on either side
 * Dutchcount           - occurrences of the words "hij" / "niet"
 * Frenchcount          - occurrences of the words "dans" / "avec"
 */
549 struct first_pass_results {
550 long firstline,astline;
551 long footerline,totlen,binlen,alphalen,endquote_count,shortline,dotcomma;
552 long fslashline,hyphens,longline,verylongline,htmcount,standalone_digit;
553 long spacedash,emdash,space_emdash,non_PG_space_emdash,PG_space_emdash;
554 signed int Dutchcount,Frenchcount;
560 * Run a first pass - verify that it's a valid PG
561 * file, decide whether to report some things that
562 * occur many times in the text like long or short
563 * lines, non-standard dashes, etc.
565 struct first_pass_results *first_pass(FILE *infile)
567 char laststart=CHAR_SPACE,*s;
569 unsigned int lastlen=0,lastblen=0;
570 long spline=0,nspline=0;
571 static struct first_pass_results results={0};
572 char inword[MAXWORDLEN]="";
573 while (fgets(aline,LINEBUFSIZE-1,infile))
/*
 * Strip trailing CR/LF. Stop once the line is empty: otherwise
 * strlen(aline)-1 wraps and the loop reads (and may write) before aline[0],
 * which happens on every blank input line.
 */
575 while (*aline && (aline[strlen(aline)-1]==10 || aline[strlen(aline)-1]==13))
576 aline[strlen(aline)-1]=0;
578 if (strstr(aline,"*END") && strstr(aline,"SMALL PRINT") &&
579 (strstr(aline,"PUBLIC DOMAIN") || strstr(aline,"COPYRIGHT")))
582 printf(" --> Duplicate header?\n");
583 spline=linecnt+1; /* first line of non-header text, that is */
585 if (!strncmp(aline,"*** START",9) && strstr(aline,"PROJECT GUTENBERG"))
588 printf(" --> Duplicate header?\n");
589 nspline=linecnt+1; /* first line of non-header text, that is */
591 if (spline || nspline)
594 if (strstr(aline,"end") && strstr(aline,"project gutenberg"))
596 if (strstr(aline,"end")<strstr(aline,"project gutenberg"))
598 if (results.footerline)
600 /* it's an old-form header - we can detect duplicates */
602 printf(" --> Duplicate footer?\n");
605 results.footerline=linecnt;
610 results.firstline=spline;
612 results.firstline=nspline; /* override with new */
613 if (results.footerline)
614 continue; /* don't count the boilerplate in the footer */
616 results.totlen+=llen;
619 if ((unsigned char)aline[i]>127)
621 if (gcisalpha(aline[i]))
/* isalpha() needs an unsigned char value; plain char may be negative */
623 if (i>0 && aline[i]==CHAR_DQUOTE && isalpha((unsigned char)aline[i-1]))
624 results.endquote_count++;
626 if (strlen(aline)>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE &&
627 lastblen>2 && lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
629 if (*aline && (unsigned char)aline[strlen(aline)-1]<=CHAR_SPACE)
631 if (strstr(aline,".,"))
633 /* only count asterisk lines for ignoring purposes where there is */
634 /* lower-case text on the line */
635 if (strstr(aline,"*"))
638 if (*s>='a' && *s<='z')
643 if (strstr(aline,"/"))
644 results.fslashline++;
645 for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--)
/* i is -1 for an empty line and 0 for a one-character line: never read aline[-1] */
647 if (i>=0 && aline[i]=='-' && (i==0 || aline[i-1]!='-'))
649 if (llen>LONGEST_PG_LINE)
651 if (llen>WAY_TOO_LONG)
652 results.verylongline++;
653 if (strstr(aline,"<") && strstr(aline,">"))
655 i=(signed int)(strstr(aline,">")-strstr(aline,"<")+1);
658 if (strstr(aline,"<i>"))
659 results.htmcount+=4; /* bonus marks! */
661 /* Check for spaced em-dashes */
662 if (strstr(aline,"--"))
/*
 * A "--" at the very start of the line has no preceding character;
 * treat that as "no space before" instead of reading aline[-1].
 */
665 if ((strstr(aline,"--")>aline && *(strstr(aline,"--")-1)==CHAR_SPACE) ||
666 (*(strstr(aline,"--")+2)==CHAR_SPACE))
667 results.space_emdash++;
668 if (strstr(aline,"--")>aline && *(strstr(aline,"--")-1)==CHAR_SPACE &&
669 (*(strstr(aline,"--")+2)==CHAR_SPACE))
670 /* count of em-dashes with spaces both sides */
671 results.non_PG_space_emdash++;
672 if ((strstr(aline,"--")==aline || *(strstr(aline,"--")-1)!=CHAR_SPACE) &&
673 (*(strstr(aline,"--")+2)!=CHAR_SPACE))
674 /* count of PG-type em-dashes with no spaces */
675 results.PG_space_emdash++;
679 s=getaword(s,inword);
680 if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))
681 results.Dutchcount++;
682 if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))
683 results.Frenchcount++;
684 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
685 results.standalone_digit++;
687 /* Check for spaced dashes */
688 if (strstr(aline," -") && *(strstr(aline," -")+2)!='-')
691 lastlen=strlen(aline);
702 void procfile(char *filename)
704 char *s,*t,*s1,laststart,*wordstart;
705 char inword[MAXWORDLEN],testword[MAXWORDLEN];
706 char parastart[81]; /* first line of current para */
708 struct first_pass_results *first_pass_results;
709 long quot,squot,start_para_line;
710 signed int i,j,llen,isemptyline,isacro,isellipsis,istypo,alower,
711 eNon_A,eTab,eTilde,eAst,eFSlash,eCarat;
712 signed int warn_short,warn_long,warn_bin,warn_dash,warn_dotcomma,
713 warn_ast,warn_fslash,warn_digit,warn_hyphen,warn_endquote;
714 unsigned int lastlen,lastblen;
715 signed int s_brack,c_brack,r_brack,c_unders;
716 signed int open_single_quote,close_single_quote,guessquote,dquotepar,
718 signed int isnewpara,vowel,consonant;
719 char dquote_err[80],squote_err[80],rbrack_err[80],sbrack_err[80],
720 cbrack_err[80],unders_err[80];
721 signed int qword_index,qperiod_index,isdup;
723 signed int isDutch,isFrench;
724 laststart=CHAR_SPACE;
726 *dquote_err=*squote_err=*rbrack_err=*cbrack_err=*sbrack_err=
727 *unders_err=*prevline=0;
728 linecnt=checked_linecnt=start_para_line=0;
729 quot=squot=s_brack=c_brack=r_brack=c_unders=0;
730 i=llen=isemptyline=isacro=isellipsis=istypo=0;
731 warn_short=warn_long=warn_bin=warn_dash=warn_dotcomma=
732 warn_ast=warn_fslash=warn_digit=warn_endquote=0;
733 isnewpara=vowel=consonant=enddash=0;
734 qword_index=qperiod_index=isdup=0;
736 open_single_quote=close_single_quote=guessquote=dquotepar=squotepar=0;
738 for (j=0;j<MAX_QWORD;j++)
741 for (i=0;i<MAX_QWORD_LENGTH;i++)
747 infile=fopen(filename,"rb");
750 if (pswit[STDOUT_SWITCH])
751 fprintf(stdout,"bookloupe: cannot open %s\n",filename);
753 fprintf(stderr,"bookloupe: cannot open %s\n",filename);
756 fprintf(stdout,"\n\nFile: %s\n\n",filename);
758 first_pass_results=first_pass(infile);
761 /* now, based on this quick view, make some snap decisions */
763 printf(" --> %ld lines in this file have white space at end\n",
766 if (first_pass_results->dotcomma>5)
769 printf(" --> %ld lines in this file contain '.,'. "
770 "Not reporting them.\n",first_pass_results->dotcomma);
772 /* if more than 50 lines, or one-tenth, are short,
773 * don't bother reporting them */
775 if (first_pass_results->shortline>50 ||
776 first_pass_results->shortline*10>linecnt)
779 printf(" --> %ld lines in this file are short. "
780 "Not reporting short lines.\n",first_pass_results->shortline);
783 * If more than 50 lines, or one-tenth, are long,
784 * don't bother reporting them.
787 if (first_pass_results->longline>50 ||
788 first_pass_results->longline*10>linecnt)
791 printf(" --> %ld lines in this file are long. "
792 "Not reporting long lines.\n",first_pass_results->longline);
794 /* If more than 10 lines contain asterisks, don't bother reporting them. */
796 if (first_pass_results->astline>10)
799 printf(" --> %ld lines in this file contain asterisks. "
800 "Not reporting them.\n",first_pass_results->astline);
803 * If more than 10 lines contain forward slashes,
804 * don't bother reporting them.
807 if (first_pass_results->fslashline>10)
810 printf(" --> %ld lines in this file contain forward slashes. "
811 "Not reporting them.\n",first_pass_results->fslashline);
814 * If more than 20 lines contain unpunctuated endquotes,
815 * don't bother reporting them.
818 if (first_pass_results->endquote_count>20)
821 printf(" --> %ld lines in this file contain unpunctuated endquotes. "
822 "Not reporting them.\n",first_pass_results->endquote_count);
825 * If more than 10 lines contain standalone digits,
826 * don't bother reporting them.
829 if (first_pass_results->standalone_digit>10)
832 printf(" --> %ld lines in this file contain standalone 0s and 1s. "
833 "Not reporting them.\n",first_pass_results->standalone_digit);
836 * If more than 20 lines contain hyphens at end,
837 * don't bother reporting them.
840 if (first_pass_results->hyphens>20)
843 printf(" --> %ld lines in this file have hyphens at end. "
844 "Not reporting them.\n",first_pass_results->hyphens);
846 if (first_pass_results->htmcount>20 && !pswit[MARKUP_SWITCH])
848 printf(" --> Looks like this is HTML. Switching HTML mode ON.\n");
849 pswit[MARKUP_SWITCH]=1;
851 if (first_pass_results->verylongline>0)
852 printf(" --> %ld lines in this file are VERY long!\n",
853 first_pass_results->verylongline);
855 * If there are more non-PG spaced dashes than PG em-dashes,
856 * assume it's deliberate.
857 * Current PG guidelines say don't use them, but older texts do,
858 * and some people insist on them whatever the guidelines say.
861 if (first_pass_results->spacedash+first_pass_results->non_PG_space_emdash>
862 first_pass_results->PG_space_emdash)
865 printf(" --> There are %ld spaced dashes and em-dashes. "
866 "Not reporting them.\n",first_pass_results->spacedash+
867 first_pass_results->non_PG_space_emdash);
869 /* If more than a quarter of characters are hi-bit, bug out. */
871 if (first_pass_results->binlen*4>first_pass_results->totlen)
873 printf(" --> This file does not appear to be ASCII. "
874 "Terminating. Best of luck with it!\n");
877 if (first_pass_results->alphalen*4<first_pass_results->totlen)
879 printf(" --> This file does not appear to be text. "
880 "Terminating. Best of luck with it!\n");
883 if (first_pass_results->binlen*100>first_pass_results->totlen ||
884 first_pass_results->binlen>100)
886 printf(" --> There are a lot of foreign letters here. "
887 "Not reporting them.\n");
891 if (first_pass_results->Dutchcount>50)
894 printf(" --> This looks like Dutch - "
895 "switching off dashes and warnings for 's Middags case.\n");
898 if (first_pass_results->Frenchcount>50)
901 printf(" --> This looks like French - "
902 "switching off some doublepunct.\n");
904 if (first_pass_results->firstline && first_pass_results->footerline)
905 printf(" The PG header and footer appear to be already on.\n");
908 if (first_pass_results->firstline)
909 printf(" The PG header is on - no footer.\n");
910 if (first_pass_results->footerline)
911 printf(" The PG footer is on - no header.\n");
914 if (pswit[VERBOSE_SWITCH])
926 printf(" *** Verbose output is ON -- you asked for it! ***\n");
930 infile=fopen(filename,"rb");
933 if (pswit[STDOUT_SWITCH])
934 fprintf(stdout,"bookloupe: cannot open %s\n",filename);
936 fprintf(stderr,"bookloupe: cannot open %s\n",filename);
939 if (first_pass_results->footerline>0 && first_pass_results->firstline>0 &&
940 first_pass_results->footerline>first_pass_results->firstline &&
941 first_pass_results->footerline-first_pass_results->firstline<100)
943 printf(" --> I don't really know where this text starts. \n");
944 printf(" There are no reference points.\n");
945 printf(" I'm going to have to report the header and footer "
947 first_pass_results->firstline=0;
950 * Here we go with the main pass. Hold onto yer hat!
951 * Re-init some variables we've dirtied.
953 quot=squot=linecnt=0;
954 laststart=CHAR_SPACE;
956 while (flgets(aline,LINEBUFSIZE-1,infile,linecnt+1))
961 if (pswit[DP_SWITCH] && !strncmp(aline,"-----File: ",11))
962 continue; // skip DP page separators completely
963 if (linecnt<first_pass_results->firstline ||
964 (first_pass_results->footerline>0 &&
965 linecnt>first_pass_results->footerline))
967 if (pswit[HEADER_SWITCH])
969 if (!strncmp(aline,"Title:",6))
970 printf(" %s\n",aline);
971 if (!strncmp(aline,"Author:",7))
972 printf(" %s\n",aline);
973 if (!strncmp(aline,"Release Date:",13))
974 printf(" %s\n",aline);
975 if (!strncmp(aline,"Edition:",8))
976 printf(" %s\n\n",aline);
978 continue; /* skip through the header */
982 isemptyline=1; /* assume the line is empty until proven otherwise */
984 * If we are in a state of unbalanced quotes, and this line
985 * doesn't begin with a quote, output the stored error message.
986 * If the -P switch was used, print the warning even if the
987 * new para starts with quotes.
993 if (*t!=CHAR_DQUOTE || pswit[QPARA_SWITCH])
995 if (!pswit[OVERVIEW_SWITCH])
997 if (pswit[ECHO_SWITCH])
998 printf("\n%s\n",parastart);
1006 if (*t!=CHAR_SQUOTE && *t!=CHAR_OPEN_SQUOTE ||
1007 pswit[QPARA_SWITCH] || squot)
1009 if (!pswit[OVERVIEW_SWITCH])
1011 if (pswit[ECHO_SWITCH])
1012 printf("\n%s\n",parastart);
1022 if (!pswit[OVERVIEW_SWITCH])
1024 if (pswit[ECHO_SWITCH])
1025 printf("\n%s\n",parastart);
1033 if (!pswit[OVERVIEW_SWITCH])
1035 if (pswit[ECHO_SWITCH])
1036 printf("\n%s\n",parastart);
1044 if (!pswit[OVERVIEW_SWITCH])
1046 if (pswit[ECHO_SWITCH])
1047 printf("\n%s\n",parastart);
1055 if (!pswit[OVERVIEW_SWITCH])
1057 if (pswit[ECHO_SWITCH])
1058 printf("\n%s\n",parastart);
1064 *dquote_err=*squote_err=*rbrack_err=*cbrack_err=
1065 *sbrack_err=*unders_err=0;
1067 * Look along the line, accumulate the count of quotes, and see
1068 * if this is an empty line - i.e. a line with nothing on it
1070 * If line has just spaces, period, * and/or - on it, don't
1071 * count it, since empty lines with asterisks or dashes to
1072 * separate sections are common.
1077 if (*s==CHAR_DQUOTE)
1079 if (*s==CHAR_SQUOTE || *s==CHAR_OPEN_SQUOTE)
1084 * At start of line, it can only be an openquote.
1085 * Hardcode a very common exception!
1087 if (strncmp(s+2,"tis",3) && strncmp(s+2,"Tis",3))
1088 open_single_quote++;
1090 else if (gcisalpha(*(s-1)) && gcisalpha(*(s+1)))
1091 /* Do nothing! it's definitely an apostrophe, not a quote */
1093 /* it's outside a word - let's check it out */
1094 else if (*s==CHAR_OPEN_SQUOTE || gcisalpha(*(s+1)))
1096 /* it damwell better BE an openquote */
1097 if (strncmp(s+1,"tis",3) && strncmp(s+1,"Tis",3))
1098 /* hardcode a very common exception! */
1099 open_single_quote++;
1103 /* now - is it a closequote? */
1104 guessquote=0; /* accumulate clues */
1105 if (gcisalpha(s[-1]))
1107 /* it follows a letter - could be either */
1111 /* looks like a plural apostrophe */
1113 if (s[1]==CHAR_SPACE) /* bonus marks! */
1117 /* it doesn't have a letter either side */
1118 else if (strchr(".?!,;:",s[-1]) && strchr(".?!,;: ",s[1]))
1119 guessquote+=8; /* looks like a closequote */
1122 if (open_single_quote>close_single_quote)
1124 * Give it the benefit of some doubt,
1125 * if a squote is already open.
1131 close_single_quote++;
1134 if (*s!=CHAR_SPACE && *s!='-' && *s!='.' && *s!=CHAR_ASTERISK &&
1136 isemptyline=0; /* ignore lines like * * * as spacers */
1137 if (*s==CHAR_UNDERSCORE)
1139 if (*s==CHAR_OPEN_CBRACK)
1141 if (*s==CHAR_CLOSE_CBRACK)
1143 if (*s==CHAR_OPEN_RBRACK)
1145 if (*s==CHAR_CLOSE_RBRACK)
1147 if (*s==CHAR_OPEN_SBRACK)
1149 if (*s==CHAR_CLOSE_SBRACK)
1153 if (isnewpara && !isemptyline)
1155 /* This line is the start of a new paragraph. */
1156 start_para_line=linecnt;
1157 /* Capture its first line in case we want to report it later. */
1158 strncpy(parastart,aline,80);
1160 dquotepar=squotepar=0; /* restart the quote count */
1162 while (!gcisalpha(*s) && !gcisdigit(*s) && *s)
1164 if (*s>='a' && *s<='z')
1166 /* and its first letter is lowercase */
1167 if (pswit[ECHO_SWITCH])
1168 printf("\n%s\n",aline);
1169 if (!pswit[OVERVIEW_SWITCH])
1170 printf(" Line %ld column %d - "
1171 "Paragraph starts with lower-case\n",
1172 linecnt,(int)(s-aline)+1);
1176 isnewpara=0; /* Signal the end of new para processing. */
1178 /* Check for an em-dash broken at line end. */
1179 if (enddash && *aline=='-')
1181 if (pswit[ECHO_SWITCH])
1182 printf("\n%s\n",aline);
1183 if (!pswit[OVERVIEW_SWITCH])
1184 printf(" Line %ld column 1 - Broken em-dash?\n",linecnt);
1189 for (s=aline+strlen(aline)-1;*s==' ' && s>aline;s--)
1191 if (s>=aline && *s=='-')
1194 * Check for invalid or questionable characters in the line
1195 * Anything above 127 is invalid for plain ASCII, and
1196 * non-printable control characters should also be flagged.
1197 * Tabs should generally not be there.
1199 for (s=aline;*s;s++)
1201 i=(unsigned char)*s;
1202 if (i<CHAR_SPACE && i!=CHAR_LF && i!=CHAR_CR && i!=CHAR_TAB)
1204 if (pswit[ECHO_SWITCH])
1205 printf("\n%s\n",aline);
1206 if (!pswit[OVERVIEW_SWITCH])
1207 printf(" Line %ld column %d - Control character %d\n",
1208 linecnt,(int)(s-aline)+1,i);
1215 /* Don't repeat multiple warnings on one line. */
1216 eNon_A=eTab=eTilde=eCarat=eFSlash=eAst=0;
1217 for (s=aline;*s;s++)
1220 (*s<CHAR_SPACE && *s!=9 && *s!='\n' || (unsigned char)*s>127))
1222 i=*s; /* annoying kludge for signed chars */
1225 if (pswit[ECHO_SWITCH])
1226 printf("\n%s\n",aline);
1227 if (!pswit[OVERVIEW_SWITCH])
1229 printf(" Line %ld column %d - "
1230 "Non-ISO-8859 character %d\n",
1231 linecnt,(int)(s-aline)+1,i);
1233 printf(" Line %ld column %d - "
1234 "Non-ASCII character %d\n",
1235 linecnt,(int)(s-aline)+1,i);
1240 if (!eTab && *s==CHAR_TAB)
1242 if (pswit[ECHO_SWITCH])
1243 printf("\n%s\n",aline);
1244 if (!pswit[OVERVIEW_SWITCH])
1245 printf(" Line %ld column %d - Tab character?\n",
1246 linecnt,(int)(s-aline)+1);
1251 if (!eTilde && *s==CHAR_TILDE)
1254 * Often used by OCR software to indicate an
1255 * unrecognizable character.
1257 if (pswit[ECHO_SWITCH])
1258 printf("\n%s\n",aline);
1259 if (!pswit[OVERVIEW_SWITCH])
1260 printf(" Line %ld column %d - Tilde character?\n",
1261 linecnt,(int)(s-aline)+1);
1266 if (!eCarat && *s==CHAR_CARAT)
1268 if (pswit[ECHO_SWITCH])
1269 printf("\n%s\n",aline);
1270 if (!pswit[OVERVIEW_SWITCH])
1271 printf(" Line %ld column %d - Carat character?\n",
1272 linecnt,(int)(s-aline)+1);
1277 if (!eFSlash && *s==CHAR_FORESLASH && warn_fslash)
1279 if (pswit[ECHO_SWITCH])
1280 printf("\n%s\n",aline);
1281 if (!pswit[OVERVIEW_SWITCH])
1282 printf(" Line %ld column %d - Forward slash?\n",
1283 linecnt,(int)(s-aline)+1);
1289 * Report asterisks only in paranoid mode,
1290 * since they're often deliberate.
1292 if (!eAst && pswit[PARANOID_SWITCH] && warn_ast &&
1293 !isemptyline && *s==CHAR_ASTERISK)
1295 if (pswit[ECHO_SWITCH])
1296 printf("\n%s\n",aline);
1297 if (!pswit[OVERVIEW_SWITCH])
1298 printf(" Line %ld column %d - Asterisk?\n",
1299 linecnt,(int)(s-aline)+1);
1306 /* Check for line too long. */
1309 if (strlen(aline)>LONGEST_PG_LINE)
1311 if (pswit[ECHO_SWITCH])
1312 printf("\n%s\n",aline);
1313 if (!pswit[OVERVIEW_SWITCH])
1314 printf(" Line %ld column %d - Long line %d\n",
1315 linecnt,strlen(aline),strlen(aline));
1321 * Check for line too short.
1322 * This one is a bit trickier to implement: we don't want to
1323 * flag the last line of a paragraph for being short, so we
1324 * have to wait until we know that our current line is a
1325 * "normal" line, then report the _previous_ line if it was too
1326 * short. We also don't want to report indented lines like
1327 * chapter heads or formatted quotations. We therefore keep
1328 * lastlen as the length of the last line examined, and
1329 * lastblen as the length of the last but one, and try to
1330 * suppress unnecessary warnings by checking that both were of
1331 * "normal" length. We keep the first character of the last
1332 * line in laststart, and if it was a space, we assume that the
1333 * formatting is deliberate. I can't figure out a way to
1334 * distinguish something like a quoted verse left-aligned or
1335 * the header or footer of a letter from a paragraph of short
1336 * lines - maybe if I examined the whole paragraph, and if the
1337 * para has less than, say, 8 lines and if all lines are short,
1338 * then just assume it's OK? Need to look at some texts to see
1339 * how often a formula like this would get the right result.
1341 if (warn_short && strlen(aline)>1 && lastlen>1 &&
1342 lastlen<SHORTEST_PG_LINE && lastblen>1 && lastblen>SHORTEST_PG_LINE &&
1343 laststart!=CHAR_SPACE)
1345 if (pswit[ECHO_SWITCH])
1346 printf("\n%s\n",prevline);
1347 if (!pswit[OVERVIEW_SWITCH])
1348 printf(" Line %ld column %d - Short line %d?\n",
1349 linecnt-1,strlen(prevline),strlen(prevline));
1354 lastlen=strlen(aline);
1356 /* Look for punctuation other than full ellipses at start of line. */
1357 if (*aline && strchr(".?!,;:",aline[0]) && strncmp(". . .",aline,5))
1359 if (pswit[ECHO_SWITCH])
1360 printf("\n%s\n",aline);
1361 if (!pswit[OVERVIEW_SWITCH])
1362 printf(" Line %ld column 1 - Begins with punctuation?\n",
1368 * Check for spaced em-dashes.
1369 * We must check _all_ occurrences of "--" on the line
1370 * hence the loop - even if the first double-dash is OK
1371 * there may be another that's wrong later on.
1376 while (strstr(s,"--"))
1378 if (*(strstr(s,"--")-1)==CHAR_SPACE ||
1379 (*(strstr(s,"--")+2)==CHAR_SPACE))
1381 if (pswit[ECHO_SWITCH])
1382 printf("\n%s\n",aline);
1383 if (!pswit[OVERVIEW_SWITCH])
1384 printf(" Line %ld column %d - Spaced em-dash?\n",
1385 linecnt,(int)(strstr(s,"--")-aline)+1);
1392 /* Check for spaced dashes. */
1395 if (strstr(aline," -"))
1397 if (*(strstr(aline," -")+2)!='-')
1399 if (pswit[ECHO_SWITCH])
1400 printf("\n%s\n",aline);
1401 if (!pswit[OVERVIEW_SWITCH])
1402 printf(" Line %ld column %d - Spaced dash?\n",
1403 linecnt,(int)(strstr(aline," -")-aline)+1);
1408 else if (strstr(aline,"- "))
1410 if (*(strstr(aline,"- ")-1)!='-')
1412 if (pswit[ECHO_SWITCH])
1413 printf("\n%s\n",aline);
1414 if (!pswit[OVERVIEW_SWITCH])
1415 printf(" Line %ld column %d - Spaced dash?\n",
1416 linecnt,(int)(strstr(aline,"- ")-aline)+1);
1423 * Check for unmarked paragraphs indicated by separate speakers.
1424 * May well be false positive:
1425 * "Bravo!" "Wonderful!" called the crowd.
1426 * but useful all the same.
1430 if (strstr(aline,"\" \""))
1431 s=strstr(aline,"\" \"");
1432 if (strstr(aline,"\" \""))
1433 s=strstr(aline,"\" \"");
1436 if (pswit[ECHO_SWITCH])
1437 printf("\n%s\n",aline);
1438 if (!pswit[OVERVIEW_SWITCH])
1439 printf(" Line %ld column %d - "
1440 "Query missing paragraph break?\n",
1441 linecnt,(int)(s-aline)+1);
1446 * Check for "to he" and other easy he/be errors.
1447 * This is a very inadequate effort on the he/be problem,
1448 * but the phrase "to he" is always an error, whereas "to
1449 * be" is quite common.
1450 * Similarly, '"Quiet!", be said.' is a non-be error
1451 * "to he" is _not_ always an error!:
1452 * "Where they went to he couldn't say."
1453 * Another false positive:
1454 * What would "Cinderella" be without the . . .
1455 * and another: "If he wants to he can see for himself."
1459 if (strstr(aline," to he "))
1460 s=strstr(aline," to he ");
1461 if (strstr(aline,"\" be "))
1462 s=strstr(aline,"\" be ");
1463 if (strstr(aline,"\", be "))
1464 s=strstr(aline,"\", be ");
1465 if (strstr(aline," is be "))
1466 s=strstr(aline," is be ");
1467 if (strstr(aline," be is "))
1468 s=strstr(aline," be is ");
1469 if (strstr(aline," was be "))
1470 s=strstr(aline," was be ");
1471 if (strstr(aline," be would "))
1472 s=strstr(aline," be would ");
1473 if (strstr(aline," be could "))
1474 s=strstr(aline," be could ");
1477 if (pswit[ECHO_SWITCH])
1478 printf("\n%s\n",aline);
1479 if (!pswit[OVERVIEW_SWITCH])
1480 printf(" Line %ld column %d - Query he/be error?\n",
1481 linecnt,(int)(s-aline)+1);
1487 if (strstr(aline," i bad "))
1488 s=strstr(aline," i bad ");
1489 if (strstr(aline," you bad "))
1490 s=strstr(aline," you bad ");
1491 if (strstr(aline," he bad "))
1492 s=strstr(aline," he bad ");
1493 if (strstr(aline," she bad "))
1494 s=strstr(aline," she bad ");
1495 if (strstr(aline," they bad "))
1496 s=strstr(aline," they bad ");
1497 if (strstr(aline," a had "))
1498 s=strstr(aline," a had ");
1499 if (strstr(aline," the had "))
1500 s=strstr(aline," the had ");
1503 if (pswit[ECHO_SWITCH])
1504 printf("\n%s\n",aline);
1505 if (!pswit[OVERVIEW_SWITCH])
1506 printf(" Line %ld column %d - Query had/bad error?\n",
1507 linecnt,(int)(s-aline)+1);
1513 if (strstr(aline,", hut "))
1514 s=strstr(aline,", hut ");
1515 if (strstr(aline,"; hut "))
1516 s=strstr(aline,"; hut ");
1519 if (pswit[ECHO_SWITCH])
1520 printf("\n%s\n",aline);
1521 if (!pswit[OVERVIEW_SWITCH])
1522 printf(" Line %ld column %d - Query hut/but error?\n",
1523 linecnt,(int)(s-aline)+1);
1528 * Special case - angled bracket in front of "From" placed there by an
1529 * MTA when sending an e-mail.
1531 if (strstr(aline,">From"))
1533 if (pswit[ECHO_SWITCH])
1534 printf("\n%s\n",aline);
1535 if (!pswit[OVERVIEW_SWITCH])
1536 printf(" Line %ld column %d - "
1537 "Query angled bracket with From\n",
1538 linecnt,(int)(strstr(aline,">From")-aline)+1);
1543 * Check for a single character line -
1544 * often an overflow from bad wrapping.
1546 if (*aline && !aline[1])
1548 if (*aline=='I' || *aline=='V' || *aline=='X' || *aline=='L' ||
1550 ; /* Nothing - ignore numerals alone on a line. */
1553 if (pswit[ECHO_SWITCH])
1554 printf("\n%s\n",aline);
1555 if (!pswit[OVERVIEW_SWITCH])
1556 printf(" Line %ld column 1 - "
1557 "Query single character line\n",linecnt);
1562 /* Check for I" - often should be ! */
1563 if (strstr(aline," I\""))
1565 if (pswit[ECHO_SWITCH])
1566 printf("\n%s\n",aline);
1567 if (!pswit[OVERVIEW_SWITCH])
1568 printf(" Line %ld column %ld - Query I=exclamation mark?\n",
1569 linecnt,strstr(aline," I\"")-aline);
1574 * Check for period without a capital letter. Cut-down from gutspell.
1575 * Only works when it happens on a single line.
1577 if (pswit[PARANOID_SWITCH])
1579 for (t=s=aline;strstr(t,". ");)
1585 /* start of line punctuation is handled elsewhere */
1588 if (!gcisalpha(t[-1]))
1595 /* For Frank & Jeroen -- 's Middags case */
1596 if (t[2]==CHAR_SQUOTE && t[3]>='a' && t[3]<='z' &&
1597 t[4]==CHAR_SPACE && t[5]>='A' && t[5]<='Z')
1604 while (*s1 && !gcisalpha(*s1) && !isdigit(*s1))
1606 if (*s1>='a' && *s1<='z')
1608 /* we have something to investigate */
1610 /* so let's go back and find out */
1611 for (s1=t-1;s1>=s &&
1612 (gcisalpha(*s1) || gcisdigit(*s1) || *s1==CHAR_SQUOTE &&
1613 gcisalpha(s1[1]) && gcisalpha(s1[-1]));s1--)
1616 for (i=0;*s1 && *s1!='.';s1++,i++)
1619 for (i=0;*abbrev[i];i++)
1620 if (!strcmp(testword,abbrev[i]))
1622 if (gcisdigit(*testword))
1626 if (isroman(testword))
1631 for (i=0;testword[i];i++)
1632 if (strchr(vowels,testword[i]))
1638 if (strlen(testword)<MAX_QWORD_LENGTH &&
1639 !pswit[VERBOSE_SWITCH])
1640 for (i=0;i<qperiod_index;i++)
1641 if (!strcmp(testword,qperiod[i]))
1645 if (qperiod_index<MAX_QWORD &&
1646 strlen(testword)<MAX_QWORD_LENGTH)
1648 strcpy(qperiod[qperiod_index],testword);
1651 if (pswit[ECHO_SWITCH])
1652 printf("\n%s\n",aline);
1653 if (!pswit[OVERVIEW_SWITCH])
1654 printf(" Line %ld column %d - "
1655 "Extra period?\n",linecnt,(int)(t-aline)+1);
1664 if (pswit[TYPO_SWITCH])
1666 /* Check for words usually not followed by punctuation. */
1670 s=getaword(s,inword);
1674 for (i=0;*nocomma[i];i++)
1675 if (!strcmp(inword,nocomma[i]))
1677 if (*s==',' || *s==';' || *s==':')
1679 if (pswit[ECHO_SWITCH])
1680 printf("\n%s\n",aline);
1681 if (!pswit[OVERVIEW_SWITCH])
1682 printf(" Line %ld column %d - "
1683 "Query punctuation after %s?\n",
1684 linecnt,(int)(s-aline)+1,inword);
1689 for (i=0;*noperiod[i];i++)
1690 if (!strcmp(inword,noperiod[i]))
1692 if (*s=='.' || *s=='!')
1694 if (pswit[ECHO_SWITCH])
1695 printf("\n%s\n",aline);
1696 if (!pswit[OVERVIEW_SWITCH])
1697 printf(" Line %ld column %d - "
1698 "Query punctuation after %s?\n",
1699 linecnt,(int)(s-aline)+1,inword);
1707 * Check for commonly mistyped words,
1708 * and digits like 0 for O in a word.
1713 s=getaword(s,inword);
1715 continue; /* don't bother with empty lines */
1716 if (mixdigit(inword))
1718 if (pswit[ECHO_SWITCH])
1719 printf("\n%s\n",aline);
1720 if (!pswit[OVERVIEW_SWITCH])
1721 printf(" Line %ld column %ld - Query digit in %s\n",
1722 linecnt,(int)(wordstart-aline)+1,inword);
1727 * Put the word through a series of tests for likely typos and OCR
1730 if (pswit[TYPO_SWITCH])
1733 strcpy(testword,inword);
1735 for (i=0;i<(signed int)strlen(testword);i++)
1737 /* lowercase for testing */
1738 if (testword[i]>='a' && testword[i]<='z')
1740 if (alower && testword[i]>='A' && testword[i]<='Z')
1743 * We have an uppercase mid-word. However, there are
1745 * Mac and Mc like McGill
1746 * French contractions like l'Abbe
1748 if (i==2 && testword[0]=='m' && testword[1]=='c' ||
1749 i==3 && testword[0]=='m' && testword[1]=='a' &&
1750 testword[2]=='c' || i>0 && testword[i-1]==CHAR_SQUOTE)
1755 testword[i]=(char)tolower(testword[i]);
1758 * Check for certain unlikely two-letter combinations at word
1761 if (strlen(testword)>1)
1763 for (i=0;*nostart[i];i++)
1764 if (!strncmp(testword,nostart[i],2))
1766 for (i=0;*noend[i];i++)
1767 if (!strncmp(testword+strlen(testword)-2,noend[i],2))
1770 /* ght is common, gbt never. Like that. */
1771 if (strstr(testword,"cb"))
1773 if (strstr(testword,"gbt"))
1775 if (strstr(testword,"pbt"))
1777 if (strstr(testword,"tbs"))
1779 if (strstr(testword,"mrn"))
1781 if (strstr(testword,"ahle"))
1783 if (strstr(testword,"ihle"))
1786 * "TBE" does happen - like HEARTBEAT - but uncommon.
1787 * Also "TBI" - frostbite, outbid - but uncommon.
1788 * Similarly "ii" like Hawaii, or Pompeii, and in Roman
1789 * numerals, but "ii" is a common scanno.
1791 if (strstr(testword,"tbi"))
1793 if (strstr(testword,"tbe"))
1795 if (strstr(testword,"ii"))
1798 * Check for no vowels or no consonants.
1799 * If none, flag a typo.
1801 if (!istypo && strlen(testword)>1)
1804 for (i=0;testword[i];i++)
1806 if (testword[i]=='y' || gcisdigit(testword[i]))
1808 /* Yah, this is loose. */
1812 else if (strchr(vowels,testword[i]))
1817 if (!vowel || !consonant)
1821 * Now exclude the word from being reported if it's in
1824 for (i=0;*okword[i];i++)
1825 if (!strcmp(testword,okword[i]))
1828 * What looks like a typo may be a Roman numeral.
1831 if (istypo && isroman(testword))
1833 /* Check the manual list of typos. */
1835 for (i=0;*typo[i];i++)
1836 if (!strcmp(testword,typo[i]))
1839 * Check lowercase s, l, i and m - special cases.
1840 * "j" - often a semi-colon gone wrong.
1841 * "d" for a missing apostrophe - he d
1844 if (!istypo && strlen(testword)==1 && strchr("slmijdn",*inword))
1849 if (strlen(testword)<MAX_QWORD_LENGTH &&
1850 !pswit[VERBOSE_SWITCH])
1851 for (i=0;i<qword_index;i++)
1852 if (!strcmp(testword,qword[i]))
1859 if (qword_index<MAX_QWORD &&
1860 strlen(testword)<MAX_QWORD_LENGTH)
1862 strcpy(qword[qword_index],testword);
1865 if (pswit[ECHO_SWITCH])
1866 printf("\n%s\n",aline);
1867 if (!pswit[OVERVIEW_SWITCH])
1869 printf(" Line %ld column %d - Query word %s",
1870 linecnt,(int)(wordstart-aline)+1,inword);
1871 if (strlen(testword)<MAX_QWORD_LENGTH &&
1872 !pswit[VERBOSE_SWITCH])
1873 printf(" - not reporting duplicates");
1881 /* check the user's list of typos */
1882 if (!istypo && usertypo_count)
1883 for (i=0;i<usertypo_count;i++)
1884 if (!strcmp(testword,usertypo[i]))
1886 if (pswit[ECHO_SWITCH])
1887 printf("\n%s\n",aline);
1888 if (!pswit[OVERVIEW_SWITCH])
1889 printf(" Line %ld column %d - "
1890 "Query possible scanno %s\n",
1891 linecnt,(int)(wordstart-aline)+2,inword);
1893 if (pswit[PARANOID_SWITCH] && warn_digit)
1895 /* In paranoid mode, query all 0 and 1 standing alone. */
1896 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
1898 if (pswit[ECHO_SWITCH])
1899 printf("\n%s\n",aline);
1900 if (!pswit[OVERVIEW_SWITCH])
1901 printf(" Line %ld column %d - Query standalone %s\n",
1902 linecnt,(int)(wordstart-aline)+2,inword);
1909 * Look for added or missing spaces around punctuation and quotes.
1910 * If there is a punctuation character like ! with no space on
1911 * either side, suspect a missing!space. If there are spaces on
1912 * both sides , assume a typo. If we see a double quote with no
1913 * space or punctuation on either side of it, assume unspaced
1914 * quotes "like"this.
1917 for (i=1;i<llen;i++)
1919 /* For each character in the line after the first. */
1920 if (strchr(".?!,;:_",aline[i])) /* if it's punctuation */
1922 /* we need to suppress warnings for acronyms like M.D. */
1924 /* we need to suppress warnings for ellipsis . . . */
1926 /* if there are letters on both sides of it or ... */
1927 if (gcisalpha(aline[i-1]) && gcisalpha(aline[i+1]) ||
1928 gcisalpha(aline[i+1]) && strchr("?!,;:",aline[i]))
1930 /* ...if it's strict punctuation followed by an alpha */
1933 if (i>2 && aline[i-2]=='.')
1935 if (i+2<llen && aline[i+2]=='.')
1940 if (pswit[ECHO_SWITCH])
1941 printf("\n%s\n",aline);
1942 if (!pswit[OVERVIEW_SWITCH])
1943 printf(" Line %ld column %d - Missing space?\n",
1949 if (aline[i-1]==CHAR_SPACE &&
1950 (aline[i+1]==CHAR_SPACE || aline[i+1]==0))
1953 * If there are spaces on both sides,
1954 * or space before and end of line.
1958 if (i>2 && aline[i-2]=='.')
1960 if (i+2<llen && aline[i+2]=='.')
1963 if (!isemptyline && !isellipsis)
1965 if (pswit[ECHO_SWITCH])
1966 printf("\n%s\n",aline);
1967 if (!pswit[OVERVIEW_SWITCH])
1968 printf(" Line %ld column %d - "
1969 "Spaced punctuation?\n",linecnt,i+1);
1976 /* Split out the characters that CANNOT be preceded by space. */
1978 for (i=1;i<llen;i++)
1980 /* for each character in the line after the first */
1981 if (strchr("?!,;:",aline[i]))
1983 /* if it's punctuation that _cannot_ have a space before it */
1984 if (aline[i-1]==CHAR_SPACE && !isemptyline &&
1985 aline[i+1]!=CHAR_SPACE)
1988 * If aline[i+1] DOES == space,
1989 * it was already reported just above.
1991 if (pswit[ECHO_SWITCH])
1992 printf("\n%s\n",aline);
1993 if (!pswit[OVERVIEW_SWITCH])
1994 printf(" Line %ld column %d - Spaced punctuation?\n",
2002 * Special case " .X" where X is any alpha.
2003 * This plugs a hole in the acronym code above.
2004 * Inelegant, but maintainable.
2007 for (i=1;i<llen;i++)
2009 /* for each character in the line after the first */
2012 /* if it's a period */
2013 if (aline[i-1]==CHAR_SPACE && gcisalpha(aline[i+1]))
2016 * If the period follows a space and
2017 * is followed by a letter.
2019 if (pswit[ECHO_SWITCH])
2020 printf("\n%s\n",aline);
2021 if (!pswit[OVERVIEW_SWITCH])
2022 printf(" Line %ld column %d - Spaced punctuation?\n",
2029 for (i=1;i<llen;i++)
2031 /* for each character in the line after the first */
2032 if (aline[i]==CHAR_DQUOTE)
2034 if (!strchr(" _-.'`,;:!/([{?}])",aline[i-1]) &&
2035 !strchr(" _-.'`,;:!/([{?}])",aline[i+1]) && aline[i+1] ||
2036 !strchr(" _-([{'`",aline[i-1]) && gcisalpha(aline[i+1]))
2038 if (pswit[ECHO_SWITCH])
2039 printf("\n%s\n",aline);
2040 if (!pswit[OVERVIEW_SWITCH])
2041 printf(" Line %ld column %d - Unspaced quotes?\n",
2048 /* Check parity of quotes. */
2049 for (s=aline;*s;s++)
2051 if (*s==CHAR_DQUOTE)
2053 if (!(dquotepar=!dquotepar))
2056 if (!strchr("_-.'`/,;:!?)]} ",s[1]))
2058 if (pswit[ECHO_SWITCH])
2059 printf("\n%s\n",aline);
2060 if (!pswit[OVERVIEW_SWITCH])
2061 printf(" Line %ld column %d - "
2062 "Wrongspaced quotes?\n",linecnt,(int)(s-aline)+1);
2070 if (!gcisalpha(s[1]) && !isdigit(s[1]) &&
2071 !strchr("_-/.'`([{$",s[1]) || !s[1])
2073 if (pswit[ECHO_SWITCH])
2074 printf("\n%s\n",aline);
2075 if (!pswit[OVERVIEW_SWITCH])
2076 printf(" Line %ld column %d - "
2077 "Wrongspaced quotes?\n",linecnt,(int)(s-aline)+1);
2084 if (*aline==CHAR_DQUOTE)
2086 if (strchr(",;:!?)]} ",aline[1]))
2088 if (pswit[ECHO_SWITCH])
2089 printf("\n%s\n",aline);
2090 if (!pswit[OVERVIEW_SWITCH])
2091 printf(" Line %ld column 1 - Wrongspaced quotes?\n",
2092 linecnt,(int)(s-aline)+1);
2097 if (pswit[SQUOTE_SWITCH])
2099 for (s=aline;*s;s++)
2101 if ((*s==CHAR_SQUOTE || *s==CHAR_OPEN_SQUOTE) &&
2102 (s==aline || s>aline && !gcisalpha(s[-1]) ||
2105 if (!(squotepar=!squotepar))
2108 if (!strchr("_-.'`/\",;:!?)]} ",s[1]))
2110 if (pswit[ECHO_SWITCH])
2111 printf("\n%s\n",aline);
2112 if (!pswit[OVERVIEW_SWITCH])
2113 printf(" Line %ld column %d - "
2114 "Wrongspaced singlequotes?\n",
2115 linecnt,(int)(s-aline)+1);
2123 if (!gcisalpha(s[1]) && !isdigit(s[1]) &&
2124 !strchr("_-/\".'`",s[1]) || !s[1])
2126 if (pswit[ECHO_SWITCH])
2127 printf("\n%s\n",aline);
2128 if (!pswit[OVERVIEW_SWITCH])
2129 printf(" Line %ld column %d - "
2130 "Wrongspaced singlequotes?\n",
2131 linecnt,(int)(s-aline)+1);
2140 * Look for double punctuation like ,. or ,,
2141 * Thanks to DW for the suggestion!
2142 * In books with references, ".," and ".;" are common
2143 * e.g. "etc., etc.," and vol. 1.; vol 3.;
2144 * OTOH, from my initial tests, there are also fairly
2145 * common errors. What to do? Make these cases paranoid?
2146 * ".," is the most common, so warn_dotcomma is used
2147 * to suppress detailed reporting if it occurs often.
2150 for (i=0;i<llen;i++)
2152 /* for each punctuation character in the line */
2153 if (strchr(".?!,;:",aline[i]) && (strchr(".?!,;:",aline[i+1])) &&
2154 aline[i] && aline[i+1])
2156 /* followed by punctuation, it's a query, unless . . . */
2157 if (aline[i]==aline[i+1] && (aline[i]=='.' || aline[i]=='?' ||
2159 !warn_dotcomma && aline[i]=='.' && aline[i+1]==',' ||
2160 isFrench && !strncmp(aline+i,",...",4) ||
2161 isFrench && !strncmp(aline+i,"...,",4) ||
2162 isFrench && !strncmp(aline+i,";...",4) ||
2163 isFrench && !strncmp(aline+i,"...;",4) ||
2164 isFrench && !strncmp(aline+i,":...",4) ||
2165 isFrench && !strncmp(aline+i,"...:",4) ||
2166 isFrench && !strncmp(aline+i,"!...",4) ||
2167 isFrench && !strncmp(aline+i,"...!",4) ||
2168 isFrench && !strncmp(aline+i,"?...",4) ||
2169 isFrench && !strncmp(aline+i,"...?",4))
2171 if (isFrench && !strncmp(aline+i,",...",4) ||
2172 isFrench && !strncmp(aline+i,"...,",4) ||
2173 isFrench && !strncmp(aline+i,";...",4) ||
2174 isFrench && !strncmp(aline+i,"...;",4) ||
2175 isFrench && !strncmp(aline+i,":...",4) ||
2176 isFrench && !strncmp(aline+i,"...:",4) ||
2177 isFrench && !strncmp(aline+i,"!...",4) ||
2178 isFrench && !strncmp(aline+i,"...!",4) ||
2179 isFrench && !strncmp(aline+i,"?...",4) ||
2180 isFrench && !strncmp(aline+i,"...?",4))
2182 ; /* do nothing for .. !! and ?? which can be legit */
2186 if (pswit[ECHO_SWITCH])
2187 printf("\n%s\n",aline);
2188 if (!pswit[OVERVIEW_SWITCH])
2189 printf(" Line %ld column %d - Double punctuation?\n",
2197 while (strstr(s," \" "))
2199 if (pswit[ECHO_SWITCH])
2200 printf("\n%s\n",aline);
2201 if (!pswit[OVERVIEW_SWITCH])
2202 printf(" Line %ld column %d - Spaced doublequote?\n",
2203 linecnt,(int)(strstr(s," \" ")-aline+1));
2206 s=strstr(s," \" ")+2;
2209 while (strstr(s," ' "))
2211 if (pswit[ECHO_SWITCH])
2212 printf("\n%s\n",aline);
2213 if (!pswit[OVERVIEW_SWITCH])
2214 printf(" Line %ld column %d - Spaced singlequote?\n",
2215 linecnt,(int)(strstr(s," ' ")-aline+1));
2218 s=strstr(s," ' ")+2;
2221 while (strstr(s," ` "))
2223 if (pswit[ECHO_SWITCH])
2224 printf("\n%s\n",aline);
2225 if (!pswit[OVERVIEW_SWITCH])
2226 printf(" Line %ld column %d - Spaced singlequote?\n",
2227 linecnt,(int)(strstr(s," ` ")-aline+1));
2230 s=strstr(s," ` ")+2;
2232 /* check special case of 'S instead of 's at end of word */
2236 if (*s==CHAR_SQUOTE && s[1]=='S' && s[-1]>='a' && s[-1]<='z')
2238 if (pswit[ECHO_SWITCH])
2239 printf("\n%s\n",aline);
2240 if (!pswit[OVERVIEW_SWITCH])
2241 printf(" Line %ld column %d - Capital \"S\"?\n",
2242 linecnt,(int)(s-aline+2));
2249 * Now check special cases - start and end of line -
2250 * for single and double quotes. Start is sometimes [sic]
2251 * but better to query it anyway.
2252 * While we're here, check for dash at end of line.
2257 if (aline[llen-1]==CHAR_DQUOTE || aline[llen-1]==CHAR_SQUOTE ||
2258 aline[llen-1]==CHAR_OPEN_SQUOTE)
2259 if (aline[llen-2]==CHAR_SPACE)
2261 if (pswit[ECHO_SWITCH])
2262 printf("\n%s\n",aline);
2263 if (!pswit[OVERVIEW_SWITCH])
2264 printf(" Line %ld column %d - Spaced quote?\n",
2269 if ((aline[0]==CHAR_SQUOTE || aline[0]==CHAR_OPEN_SQUOTE) &&
2270 aline[1]==CHAR_SPACE)
2272 if (pswit[ECHO_SWITCH])
2273 printf("\n%s\n",aline);
2274 if (!pswit[OVERVIEW_SWITCH])
2275 printf(" Line %ld column 1 - Spaced quote?\n",linecnt);
2280 * Dash at end of line may well be legit - paranoid mode only
2281 * and don't report em-dash at line-end.
2283 if (pswit[PARANOID_SWITCH] && warn_hyphen)
2285 for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--)
2287 if (aline[i]=='-' && aline[i-1]!='-')
2289 if (pswit[ECHO_SWITCH])
2290 printf("\n%s\n",aline);
2291 if (!pswit[OVERVIEW_SWITCH])
2292 printf(" Line %ld column %d - "
2293 "Hyphen at end of line?\n",linecnt,i);
2298 * Brackets are often unspaced, but shouldn't be surrounded by alpha.
2299 * If so, suspect a scanno like "a]most".
2302 for (i=1;i<llen-1;i++)
2304 /* for each bracket character in the line except 1st & last */
2305 if (strchr("{[()]}",aline[i]) && gcisalpha(aline[i-1]) &&
2306 gcisalpha(aline[i+1]))
2308 if (pswit[ECHO_SWITCH])
2309 printf("\n%s\n",aline);
2310 if (!pswit[OVERVIEW_SWITCH])
2311 printf(" Line %ld column %d - Unspaced bracket?\n",
2320 for (i=1;i<llen;i++)
2322 /* for each character in the line except 1st */
2323 if (aline[i]==CHAR_DQUOTE && isalpha(aline[i-1]))
2325 if (pswit[ECHO_SWITCH])
2326 printf("\n%s\n",aline);
2327 if (!pswit[OVERVIEW_SWITCH])
2328 printf(" Line %ld column %d - "
2329 "endquote missing punctuation?\n",linecnt,i);
2336 * Check for <HTML TAG>.
2337 * If there is a < in the line, followed at some point
2338 * by a > then we suspect HTML.
2340 if (strstr(aline,"<") && strstr(aline,">"))
2342 i=(signed int)(strstr(aline,">")-strstr(aline,"<")+1);
2345 strncpy(wrk,strstr(aline,"<"),i);
2347 if (pswit[ECHO_SWITCH])
2348 printf("\n%s\n",aline);
2349 if (!pswit[OVERVIEW_SWITCH])
2350 printf(" Line %ld column %d - HTML Tag? %s \n",
2351 linecnt,(int)(strstr(aline,"<")-aline)+1,wrk);
2357 * Check for &symbol; HTML.
2358 * If there is a & in the line, followed at
2359 * some point by a ; then we suspect HTML.
2361 if (strstr(aline,"&") && strstr(aline,";"))
2363 i=(int)(strstr(aline,";")-strstr(aline,"&")+1);
2364 for (s=strstr(aline,"&");s<strstr(aline,";");s++)
2366 i=0; /* Don't report "Jones & Son;" */
2369 strncpy(wrk,strstr(aline,"&"),i);
2371 if (pswit[ECHO_SWITCH])
2372 printf("\n%s\n",aline);
2373 if (!pswit[OVERVIEW_SWITCH])
2374 printf(" Line %ld column %d - HTML symbol? %s \n",
2375 linecnt,(int)(strstr(aline,"&")-aline)+1,wrk);
2381 * At end of paragraph, check for mismatched quotes.
2382 * We don't want to report an error immediately, since it is a
2383 * common convention to omit the quotes at end of paragraph if
2384 * the next paragraph is a continuation of the same speaker.
2385 * Where this is the case, the next para should begin with a
2386 * quote, so we store the warning message and only display it
2387 * at the top of the next iteration if the new para doesn't
2388 * start with a quote.
2389 * The -p switch overrides this default, and warns of unclosed
2390 * quotes on _every_ paragraph, whether the next begins with a
2395 /* end of para - add up the totals */
2397 sprintf(dquote_err," Line %ld - Mismatched quotes\n",
2399 if (pswit[SQUOTE_SWITCH] && open_single_quote &&
2400 open_single_quote!=close_single_quote)
2401 sprintf(squote_err," Line %ld - Mismatched singlequotes?\n",
2403 if (pswit[SQUOTE_SWITCH] && open_single_quote &&
2404 open_single_quote!=close_single_quote &&
2405 open_single_quote!=close_single_quote+1)
2407 * Flag it to be noted regardless of the
2408 * first char of the next para.
2412 sprintf(rbrack_err," Line %ld - "
2413 "Mismatched round brackets?\n",linecnt);
2415 sprintf(sbrack_err," Line %ld - "
2416 "Mismatched square brackets?\n",linecnt);
2418 sprintf(cbrack_err," Line %ld - "
2419 "Mismatched curly brackets?\n",linecnt);
2421 sprintf(unders_err," Line %ld - Mismatched underscores?\n",
2423 quot=s_brack=c_brack=r_brack=c_unders=open_single_quote=
2424 close_single_quote=0;
2425 /* let the next iteration know that it's starting a new para */
2429 * Check for omitted punctuation at end of paragraph by working back
2430 * through prevline. DW.
2431 * Need to check this only for "normal" paras.
2432 * So what is a "normal" para?
2433 * Not normal if one-liner (chapter headings, etc.)
2434 * Not normal if doesn't contain at least one locase letter
2435 * Not normal if starts with space
2440 for (s=prevline,i=0;*s && !i;s++)
2442 /* use i to indicate the presence of a letter on the line */
2445 * This next "if" is a problem.
2446 * If we say "start_para_line <= linecnt - 1", that includes
2447 * one-line "paragraphs" like chapter heads. Lotsa false positives.
2448 * If we say "start_para_line < linecnt - 1" it doesn't, but then it
2449 * misses genuine one-line paragraphs.
2451 if (i && lastblen>2 && start_para_line<linecnt-1 &&
2452 *prevline>CHAR_SPACE)
2454 for (i=strlen(prevline)-1;
2455 (prevline[i]==CHAR_DQUOTE || prevline[i]==CHAR_SQUOTE) &&
2456 prevline[i]>CHAR_SPACE && i>0;
2461 if (gcisalpha(prevline[i]))
2463 if (pswit[ECHO_SWITCH])
2464 printf("\n%s\n",prevline);
2465 if (!pswit[OVERVIEW_SWITCH])
2466 printf(" Line %ld column %d - "
2467 "No punctuation at para end?\n",
2468 linecnt-1,strlen(prevline));
2473 if (strchr("-.:!([{?}])",prevline[i]))
2478 strcpy(prevline,aline);
2481 if (!pswit[OVERVIEW_SWITCH])
2482 for (i=0;i<MAX_QWORD;i++)
2484 printf("\nNote: Queried word %s was duplicated %d time%s\n",
2485 qword[i],dupcnt[i],"s");
/*
 * flgets (review notes; this listing is an excerpt, so statements between
 * the numbered lines below are not visible):
 *  - characters are fetched one at a time with fgetc() into c/cint, and the
 *    read loop repeats while len<maxlen;
 *  - the line-end diagnostics (No CR, Two successive CRs, CR without LF) are
 *    only considered when pswit[LINE_END_SWITCH] is set; the CR-without-LF
 *    case additionally requires isCR;
 *  - for each diagnostic the offending line is echoed first when
 *    pswit[ECHO_SWITCH] is set, and the message itself is printed only when
 *    pswit[OVERVIEW_SWITCH] is not set;
 *  - after the loop the line is handed to postprocess_for_HTML() when
 *    pswit[MARKUP_SWITCH] is set and to postprocess_for_DP() when
 *    pswit[DP_SWITCH] is set.
 * In the lines shown, lcnt is used only as the line number in diagnostics.
 * NOTE(review): the end-of-file return path is not visible here - confirm
 * what callers receive at EOF.
 */
2491 * Get one line from the input stream, checking for
2492 * the existence of exactly one CR/LF line-end per line.
2494 * Returns: a pointer to the line.
2496 char *flgets(char *theline,int maxlen,FILE *thefile,long lcnt)
2502 c=cint=fgetc(thefile);
2507 /* either way, it's end of line */
2514 /* Error - a LF without a preceding CR */
2515 if (pswit[LINE_END_SWITCH])
2517 if (pswit[ECHO_SWITCH])
2518 printf("\n%s\n",theline);
2519 if (!pswit[OVERVIEW_SWITCH])
2520 printf(" Line %ld - No CR?\n",lcnt);
2531 /* Error - two successive CRs */
2532 if (pswit[LINE_END_SWITCH])
2534 if (pswit[ECHO_SWITCH])
2535 printf("\n%s\n",theline);
2536 if (!pswit[OVERVIEW_SWITCH])
2537 printf(" Line %ld - Two successive CRs?\n",lcnt);
2546 if (pswit[LINE_END_SWITCH] && isCR)
2548 if (pswit[ECHO_SWITCH])
2549 printf("\n%s\n",theline);
2550 if (!pswit[OVERVIEW_SWITCH])
2551 printf(" Line %ld column %d - CR without LF?\n",
2561 c=cint=fgetc(thefile);
2562 } while(len<maxlen);
2563 if (pswit[MARKUP_SWITCH])
2564 postprocess_for_HTML(theline);
2565 if (pswit[DP_SWITCH])
2566 postprocess_for_DP(theline);
/*
 * mixdigit:
 *
 * Takes a "word" as a parameter, and checks whether it
 * contains a mixture of alpha and digits. Generally, this is an
 * error, but may not be for cases like 4th or L5 12s. 3d.
 *
 * Returns: 0 if no error found, 1 if error.
 */
int mixdigit(char *checkword)
{
    int wehaveadigit,wehavealetter,firstdigits,query,wl;
    char *s;
    wehaveadigit=wehavealetter=query=0;
    for (s=checkword;*s;s++)
    {
        if (gcisalpha(*s))
            wehavealetter=1;
        else if (gcisdigit(*s))
            wehaveadigit=1;
    }
    if (wehaveadigit && wehavealetter)
    {
        /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
        query=1;
        wl=(int)strlen(checkword);
        /* firstdigits = length of the leading run of digits */
        for (firstdigits=0;gcisdigit(checkword[firstdigits]);firstdigits++)
            ;
        /* digits, ending in st, rd, nd, th of either case */
        if (firstdigits+2==wl && (matchword(checkword+wl-2,"st") ||
          matchword(checkword+wl-2,"rd") || matchword(checkword+wl-2,"nd") ||
          matchword(checkword+wl-2,"th")))
            query=0;
        /* digits, ending in the plurals sts, rds, nds, ths */
        if (firstdigits+3==wl && (matchword(checkword+wl-3,"sts") ||
          matchword(checkword+wl-3,"rds") || matchword(checkword+wl-3,"nds") ||
          matchword(checkword+wl-3,"ths")))
            query=0;
        /*
         * digits, ending in stly, rdly, ndly, thly.  The suffix is four
         * characters long, so the digit run must stop four characters
         * before the end of the word; testing for three could never match
         * and, for a three-character word, pointed before the buffer.
         */
        if (firstdigits+4==wl && (matchword(checkword+wl-4,"stly") ||
          matchword(checkword+wl-4,"rdly") ||
          matchword(checkword+wl-4,"ndly") || matchword(checkword+wl-4,"thly")))
            query=0;
        /* digits, ending in l, L, s or d */
        if (firstdigits+1==wl && (checkword[wl-1]=='l' ||
          checkword[wl-1]=='L' || checkword[wl-1]=='s' || checkword[wl-1]=='d'))
            query=0;
        /*
         * L at the start of a number, representing British pounds, like L500.
         * This is cute. We know the current word is mixeddigit. If the first
         * letter is L, there must be at least one digit following. If both
         * digits and letters follow, we have a genuine error, else we have a
         * capital L followed by digits, and we accept that as a non-error.
         */
        if (checkword[0]=='L' && !mixdigit(checkword+1))
            query=0;
    }
    return query;
}
2630 * Extracts the first/next "word" from the line, and puts
2631 * it into "thisword". A word is defined as one English word unit--or
2632 * at least that's the aim.
2634 * Returns: a pointer to the position in the line where we will start
2635 * looking for the next word.
2637 char *getaword(char *fromline,char *thisword)
2642 for (;!gcisdigit(*fromline) && !gcisalpha(*fromline) && *fromline;
2646 * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
2647 * Especially yucky is the case of L1,000
2648 * This section looks for a pattern of characters including a digit
2649 * followed by a comma or period followed by one or more digits.
2650 * If found, it returns this whole pattern as a word; otherwise we discard
2651 * the results and resume our normal programming.
2654 for (;(gcisdigit(*s) || gcisalpha(*s) || *s==',' || *s=='.') &&
2655 wordlen<MAXWORDLEN;s++)
2657 thisword[wordlen]=*s;
2660 thisword[wordlen]=0;
2661 for (i=1;i<wordlen-1;i++)
2663 if (thisword[i]=='.' || thisword[i]==',')
2665 if (gcisdigit(thisword[i-1]) && gcisdigit(thisword[i-1]))
2672 /* we didn't find a punctuated number - do the regular getword thing */
2674 for (;(gcisdigit(*fromline) || gcisalpha(*fromline) || *fromline=='\'') &&
2675 wordlen<MAXWORDLEN;fromline++)
2677 thisword[wordlen]=*fromline;
2680 thisword[wordlen]=0;
/*
 * matchword:
 *
 * A case-insensitive string matcher.
 *
 * Returns: 1 if both strings have the same length and are equal when
 * case is ignored, 0 otherwise.
 */
int matchword(char *checkfor,char *thisword)
{
    size_t i,len;
    len=strlen(checkfor);
    if (len!=strlen(thisword))
        return 0;
    for (i=0;i<len;i++)
    {
        /*
         * Go through unsigned char: handing toupper() a negative plain
         * char (any 8-bit character where char is signed) is undefined.
         */
        if (toupper((unsigned char)checkfor[i])!=
          toupper((unsigned char)thisword[i]))
            return 0;
    }
    return 1;
}
/*
 * lowerit:
 *
 * Lowercase the line, in place. Only the ASCII letters A-Z are changed;
 * every other character is left as it is.
 */
void lowerit(char *theline)
{
    for (;*theline;theline++)
    {
        if (*theline>='A' && *theline<='Z')
            *theline+='a'-'A';
    }
}
/*
 * isroman (review notes; this listing is an excerpt, so statements between
 * the numbered lines below are not visible):
 * The tests shown look, in this order, for the lower-case letters and pairs
 * m, cm, cd, c, xl, xc, x, ix, iv, i.  The tests for d, l and v that the
 * description mentions, the statements that advance t, and the value
 * returned are not among the lines shown.
 * NOTE(review): the tests are against lower-case letters only, so presumably
 * the caller lower-cases the word first - confirm.
 * NOTE(review): the second operand in each while condition is redundant;
 * a character equal to a letter cannot be NUL.
 */
2717 * Is this word a Roman Numeral?
2719 * It doesn't actually validate that the number is a valid Roman Numeral--for
2720 * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
2721 * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
2722 * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
2723 * expressions thereof, except when it came to taxes. Allow any number of M,
2724 * an optional D, an optional CM or CD, any number of optional Cs, an optional
2725 * XL or an optional XC, an optional IX or IV, an optional V and any number
2728 int isroman(char *t)
2734 while (*t=='m' && *t)
2738 if (*t=='c' && t[1]=='m')
2740 if (*t=='c' && t[1]=='d')
2742 while (*t=='c' && *t)
2744 if (*t=='x' && t[1]=='l')
2746 if (*t=='x' && t[1]=='c')
2750 while (*t=='x' && *t)
2752 if (*t=='i' && t[1]=='x')
2754 if (*t=='i' && t[1]=='v')
2758 while (*t=='i' && *t)
/*
 * gcisalpha:
 *
 * A version of isalpha() that is somewhat lenient on 8-bit texts.
 * If we use the standard function, 8-bit accented characters break
 * words, so that tete with accented characters appears to be two words, "t"
 * and "t", with 8-bit characters between them. This causes over-reporting of
 * errors. gcisalpha() recognizes accented letters from the CP1252 (Windows)
 * and ISO-8859-1 character sets, which are the most common PG 8-bit types.
 *
 * Returns: 1 if c is treated as a letter, 0 otherwise.
 */
int gcisalpha(unsigned char c)
{
    /* plain ASCII letters */
    if (c>='a' && c<='z')
        return 1;
    if (c>='A' && c<='Z')
        return 1;
    /* upper half from 192 up, minus six codes that are not accepted */
    if (c>=192 && c!=208 && c!=215 && c!=222 && c!=240 && c!=247 && c!=254)
        return 1;
    /* the five codes below 192 that are accepted as letters */
    if (c==140 || c==142 || c==156 || c==158 || c==159)
        return 1;
    return 0;
}
/*
 * gcisdigit:
 *
 * A version of isdigit() that doesn't get confused in 8-bit texts.
 *
 * Returns: non-zero only for the ASCII digits 0-9.
 */
int gcisdigit(unsigned char c)
{
    return c>='0' && c<='9';
}
/*
 * gcisletter:
 *
 * A version of isletter() that doesn't get confused in 8-bit texts.
 * NB: this is ISO-8859-1-specific.
 *
 * Returns: non-zero for A-Z, a-z and for every code from 192 upwards.
 */
int gcisletter(unsigned char c)
{
    return (c>='A' && c<='Z') || (c>='a' && c<='z') || c>=192;
}
/*
 * gcstrchr:
 *
 * Wraps strchr to return NULL if the character being searched for is zero.
 * (Plain strchr would instead return a pointer to the terminating NUL.)
 *
 * Returns: a pointer to the first occurrence of c in s, or NULL if c is
 * zero or does not occur in s.
 */
char *gcstrchr(char *s,char c)
{
    if (!c)
        return NULL;
    return strchr(s,c);
}
/*
 * postprocess_for_DP (review notes; this listing is an excerpt, so
 * statements between the numbered lines below are not visible):
 * DPmarkup[] is walked until an entry whose first character is NUL.  For
 * each entry, theline is searched with strstr(); on a hit at s, t is set to
 * the first character after the matched tag.  The search is issued twice in
 * the lines shown, so presumably it is repeated until the tag no longer
 * occurs - confirm.
 * NOTE(review): the statements that close the gap between s and t are not
 * shown; presumably theline is edited in place - confirm.
 */
2822 * postprocess_for_DP:
2824 * Invoked with the -d switch from flgets().
2825 * It simply "removes" from the line a hard-coded set of common
2826 * DP-specific tags, so that the line passed to the main routine has
2827 * been pre-cleaned of DP markup.
2829 void postprocess_for_DP(char *theline)
2835 for (i=0;*DPmarkup[i];i++)
2837 s=strstr(theline,DPmarkup[i]);
2840 t=s+strlen(DPmarkup[i]);
2848 s=strstr(theline,DPmarkup[i]);
/*
 * postprocess_for_HTML:
 *
 * Invoked with the -m switch from flgets().
 * It simply "removes" from the line a hard-coded set of common
 * HTML tags and "replaces" a hard-coded set of common HTML
 * entities, so that the line passed to the main routine has
 * been pre-cleaned of HTML.
 */
void postprocess_for_HTML(char *theline)
{
    /* tags are only looked for when the line holds both a < and a > */
    if (strchr(theline,'<') && strchr(theline,'>'))
    {
        /* each call is repeated until losemarkup() returns NULL */
        while (losemarkup(theline))
            ;
    }
    /* entities are looked for on every line, tags or not */
    while (loseentities(theline))
        ;
}
/*
 * losemarkup (review notes; this listing is an excerpt, so statements
 * between the numbered lines below are not visible):
 * s is set to the first < in theline and t to the first > (both found with
 * strstr).  The text following the < is then compared with each entry of
 * markup[], up to an entry whose first character is NUL, using tagcomp();
 * a zero result from tagcomp() is the case acted upon.
 * NOTE(review): what is done with a recognized tag, with an unrecognized
 * one, and the value returned are not shown - confirm before relying on it.
 */
2871 char *losemarkup(char *theline)
2877 s=strstr(theline,"<");
2878 t=strstr(theline,">");
2881 for (i=0;*markup[i];i++)
2882 if (!tagcomp(s+1,markup[i]))
2895 /* It's an unrecognized <xxx>. */
/*
 * loseentities (review notes; this listing is an excerpt, so statements
 * between the numbered lines below are not visible):
 * Two passes over entities[]: the first searches theline for each htmlent
 * form, the second for each htmlnum form; each pass stops at an entry whose
 * string is empty.  On a hit at s, the text following the entity is copied
 * into a malloc()ed scratch buffer t, and textent is then copied over the
 * entity at s.
 * The scratch buffer holds strlen(s) bytes.  That is enough for the tail
 * plus its terminator because the matched entity is at least one character
 * long (the loop conditions reject empty strings).
 * NOTE(review): the check of the malloc() result, the re-appending of t,
 * the release of t and the value returned are not shown - confirm.
 */
2899 char *loseentities(char *theline)
2905 for (i=0;*entities[i].htmlent;i++)
2907 s=strstr(theline,entities[i].htmlent);
2910 t=malloc((size_t)strlen(s));
2913 strcpy(t,s+strlen(entities[i].htmlent));
2914 strcpy(s,entities[i].textent);
2920 for (i=0;*entities[i].htmlnum;i++)
2922 s=strstr(theline,entities[i].htmlnum);
2925 t=malloc((size_t)strlen(s));
2928 strcpy(t,s+strlen(entities[i].htmlnum));
2929 strcpy(s,entities[i].textent);
/*
 * tagcomp (review notes; this listing is an excerpt, so statements between
 * the numbered lines below are not visible):
 * Compares characters through tolower(), i.e. without regard to case, and
 * steps t past a slash before comparing.
 * NOTE(review): which of strin/basetag the locals s and t point at, and the
 * value returned, are not shown - confirm.
 * NOTE(review): *s and *t reach tolower() as plain char; presumably tags
 * are ASCII only, otherwise a cast to unsigned char is needed - confirm.
 */
2938 int tagcomp(char *strin,char *basetag)
2944 t++; /* ignore a slash */
2947 if (tolower(*s)!=tolower(*t))
2957 fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
2958 fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
2959 fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
2960 fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
2961 "For details, read the file COPYING.\n",stderr);
2962 fputs("This is Free Software; "
2963 "you may redistribute it under certain conditions (GPL);\n",stderr);
2964 fputs("read the file COPYING for details.\n\n",stderr);
2965 fputs("Usage is: bookloupe [-setpxloyhud] filename\n",stderr);
2966 fputs(" where -s checks single quotes, -e suppresses echoing lines, "
2967 "-t checks typos\n",stderr);
2968 fputs(" -x (paranoid) switches OFF -t and extra checks, "
2969 "-l turns OFF line-end checks\n",stderr);
2970 fputs(" -o just displays overview without detail, "
2971 "-h echoes header fields\n",stderr);
2972 fputs(" -v (verbose) unsuppresses duplicate reporting, "
2973 "-m suppresses markup\n",stderr);
2974 fputs(" -d ignores DP-specific markup,\n",stderr);
2975 fputs(" -u uses a file gutcheck.typ to query user-defined "
2976 "possible typos\n",stderr);
2977 fputs("Sample usage: bookloupe warpeace.txt \n",stderr);
2979 fputs("Bookloupe looks for errors in Project Gutenberg(TM) etexts.\n",
2981 fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
2982 "non-ASCII\n",stderr);
2983 fputs("characters like accented letters, "
2984 "lines longer than 75 or shorter than 55,\n",stderr);
2985 fputs("unbalanced quotes or brackets, "
2986 "a variety of badly formatted punctuation, \n",stderr);
2987 fputs("HTML tags, some likely typos. "
2988 "It is NOT a substitute for human judgement.\n",stderr);