1 /*************************************************************************/
2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com> */
5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk> */
7 /* This program is free software; you can redistribute it and/or modify */
8 /* it under the terms of the GNU General Public License as published by */
9 /* the Free Software Foundation; either version 2 of the License, or */
10 /* (at your option) any later version. */
12 /* This program is distributed in the hope that it will be useful, */
13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
15 /* GNU General Public License for more details. */
17 /* You should have received a copy of the GNU General Public License */
18 /* along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 /*************************************************************************/
26 #define MAXWORDLEN 80 /* max length of one word */
27 #define LINEBUFSIZE 2048 /* buffer size for an input line */
29 #define MAX_USER_TYPOS 1000
30 #define USERTYPO_FILE "gutcheck.typ"
33 #define MAX_PATH 16384
36 char aline[LINEBUFSIZE];
37 char prevline[LINEBUFSIZE];
41 "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
42 "nad", "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa",
43 "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
44 "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
45 "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
46 "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
47 "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
48 "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
49 "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
50 "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
51 "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
52 "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
53 "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
54 "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
55 "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
56 "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
57 "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
58 "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
59 "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
60 "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
61 "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
62 "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
63 "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
64 "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
65 "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
66 "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
67 "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
68 "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
69 "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
73 char *usertypo[MAX_USER_TYPOS];
75 /* Common abbreviations and other OK words not to query as typos. */
77 "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
78 "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
79 "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
80 "outbid", "outbids", "frostbite", "frostbitten", ""
83 /* Common abbreviations that cause otherwise unexplained periods. */
85 "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
86 "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
90 * Two-Letter combinations that rarely if ever start words,
91 * but are common scannos or otherwise common letter combinations.
94 "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
98 * Two-Letter combinations that rarely if ever end words,
99 * but are common scannos or otherwise common letter combinations.
102 "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
103 "sw", "gr", "sl", "cl", "iy", ""
107 "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
108 "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
109 "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
110 "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
114 "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
118 "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
119 "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
120 "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
121 "during", "let", "toward", "among", ""
125 "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
126 "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
127 "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
128 "among", "those", "into", "whom", "having", "thence", ""
131 char vowels[] = "aeiouàáâãäæèéêëìíîïòóôõöùúûü";
138 "&", "&", "&",
139 "<", "<", "<",
140 ">", ">", ">",
141 "°", "°", " degrees",
142 "£", "£", "L",
143 """, """, "\"", /* quotation mark = APL quote */
144 "Œ", "Œ", "OE", /* latin capital ligature OE */
145 "œ", "œ", "oe", /* latin small ligature oe */
146 "Š", "Š", "S", /* latin capital letter S with caron */
147 "š", "š", "s", /* latin small letter s with caron */
148 "Ÿ", "Ÿ", "Y", /* latin capital letter Y with diaeresis */
149 "ˆ", "ˆ", "", /* modifier letter circumflex accent */
150 "˜", "˜", "~", /* small tilde, U+02DC ISOdia */
151 " ", " ", " ", /* en space, U+2002 ISOpub */
152 " ", " ", " ", /* em space, U+2003 ISOpub */
153 " ", " ", " ", /* thin space, U+2009 ISOpub */
154 "–", "–", "-", /* en dash, U+2013 ISOpub */
155 "—", "—", "--", /* em dash, U+2014 ISOpub */
156 "’", "’", "'", /* right single quotation mark */
157 "‚", "‚", "'", /* single low-9 quotation mark */
158 "“", "“", "\"", /* left double quotation mark */
159 "”", "”", "\"", /* right double quotation mark */
160 "„", "„", "\"", /* double low-9 quotation mark */
161 "‹", "‹", "\"", /* single left-pointing angle quotation mark */
162 "›", "›", "\"", /* single right-pointing angle quotation mark */
163 " ", " ", " ", /* no-break space = non-breaking space, */
164 "¡", "¡", "!", /* inverted exclamation mark */
165 "¢", "¢", "c", /* cent sign */
166 "£", "£", "L", /* pound sign */
167 "¤", "¤", "$", /* currency sign */
168 "¥", "¥", "Y", /* yen sign = yuan sign */
169 "§", "§", "--", /* section sign */
170 "¨", "¨", " ", /* diaeresis = spacing diaeresis */
171 "©", "©", "(C) ", /* copyright sign */
172 "ª", "ª", " ", /* feminine ordinal indicator */
173 "«", "«", "\"", /* left-pointing double angle quotation mark */
174 "­", "­", "-", /* soft hyphen = discretionary hyphen */
175 "®", "®", "(R) ", /* registered sign = registered trade mark sign */
176 "¯", "¯", " ", /* macron = spacing macron = overline */
177 "°", "°", " degrees", /* degree sign */
178 "±", "±", "+-", /* plus-minus sign = plus-or-minus sign */
179 "²", "²", "2", /* superscript two = superscript digit two */
180 "³", "³", "3", /* superscript three = superscript digit three */
181 "´", "´", " ", /* acute accent = spacing acute */
182 "µ", "µ", "m", /* micro sign */
183 "¶", "¶", "--", /* pilcrow sign = paragraph sign */
184 "¸", "¸", " ", /* cedilla = spacing cedilla */
185 "¹", "¹", "1", /* superscript one = superscript digit one */
186 "º", "º", " ", /* masculine ordinal indicator */
187 "»", "»", "\"", /* right-pointing double angle quotation mark */
188 "¼", "¼", "1/4", /* vulgar fraction one quarter */
189 "½", "½", "1/2", /* vulgar fraction one half */
190 "¾", "¾", "3/4", /* vulgar fraction three quarters */
191 "¿", "¿", "?", /* inverted question mark */
192 "À", "À", "A", /* latin capital letter A with grave */
193 "Á", "Á", "A", /* latin capital letter A with acute */
194 "Â", "Â", "A", /* latin capital letter A with circumflex */
195 "Ã", "Ã", "A", /* latin capital letter A with tilde */
196 "Ä", "Ä", "A", /* latin capital letter A with diaeresis */
197 "Å", "Å", "A", /* latin capital letter A with ring above */
198 "Æ", "Æ", "AE", /* latin capital letter AE */
199 "Ç", "Ç", "C", /* latin capital letter C with cedilla */
200 "È", "È", "E", /* latin capital letter E with grave */
201 "É", "É", "E", /* latin capital letter E with acute */
202 "Ê", "Ê", "E", /* latin capital letter E with circumflex */
203 "Ë", "Ë", "E", /* latin capital letter E with diaeresis */
204 "Ì", "Ì", "I", /* latin capital letter I with grave */
205 "Í", "Í", "I", /* latin capital letter I with acute */
206 "Î", "Î", "I", /* latin capital letter I with circumflex */
207 "Ï", "Ï", "I", /* latin capital letter I with diaeresis */
208 "Ð", "Ð", "E", /* latin capital letter ETH */
209 "Ñ", "Ñ", "N", /* latin capital letter N with tilde */
210 "Ò", "Ò", "O", /* latin capital letter O with grave */
211 "Ó", "Ó", "O", /* latin capital letter O with acute */
212 "Ô", "Ô", "O", /* latin capital letter O with circumflex */
213 "Õ", "Õ", "O", /* latin capital letter O with tilde */
214 "Ö", "Ö", "O", /* latin capital letter O with diaeresis */
215 "×", "×", "*", /* multiplication sign */
216 "Ø", "Ø", "O", /* latin capital letter O with stroke */
217 "Ù", "Ù", "U", /* latin capital letter U with grave */
218 "Ú", "Ú", "U", /* latin capital letter U with acute */
219 "Û", "Û", "U", /* latin capital letter U with circumflex */
220 "Ü", "Ü", "U", /* latin capital letter U with diaeresis */
221 "Ý", "Ý", "Y", /* latin capital letter Y with acute */
222 "Þ", "Þ", "TH", /* latin capital letter THORN */
223 "ß", "ß", "sz", /* latin small letter sharp s = ess-zed */
224 "à", "à", "a", /* latin small letter a with grave */
225 "á", "á", "a", /* latin small letter a with acute */
226 "â", "â", "a", /* latin small letter a with circumflex */
227 "ã", "ã", "a", /* latin small letter a with tilde */
228 "ä", "ä", "a", /* latin small letter a with diaeresis */
229 "å", "å", "a", /* latin small letter a with ring above */
230 "æ", "æ", "ae", /* latin small letter ae */
231 "ç", "ç", "c", /* latin small letter c with cedilla */
232 "è", "è", "e", /* latin small letter e with grave */
233 "é", "é", "e", /* latin small letter e with acute */
234 "ê", "ê", "e", /* latin small letter e with circumflex */
235 "ë", "ë", "e", /* latin small letter e with diaeresis */
236 "ì", "ì", "i", /* latin small letter i with grave */
237 "í", "í", "i", /* latin small letter i with acute */
238 "î", "î", "i", /* latin small letter i with circumflex */
239 "ï", "ï", "i", /* latin small letter i with diaeresis */
240 "ð", "ð", "eth", /* latin small letter eth */
241 "ñ", "ñ", "n", /* latin small letter n with tilde */
242 "ò", "ò", "o", /* latin small letter o with grave */
243 "ó", "ó", "o", /* latin small letter o with acute */
244 "ô", "ô", "o", /* latin small letter o with circumflex */
245 "õ", "õ", "o", /* latin small letter o with tilde */
246 "ö", "ö", "o", /* latin small letter o with diaeresis */
247 "÷", "÷", "/", /* division sign */
248 "ø", "ø", "o", /* latin small letter o with stroke */
249 "ù", "ù", "u", /* latin small letter u with grave */
250 "ú", "ú", "u", /* latin small letter u with acute */
251 "û", "û", "u", /* latin small letter u with circumflex */
252 "ü", "ü", "u", /* latin small letter u with diaeresis */
253 "ý", "ý", "y", /* latin small letter y with acute */
254 "þ", "þ", "th", /* latin small letter thorn */
255 "ÿ", "ÿ", "y", /* latin small letter y with diaeresis */
259 /* special characters */
260 #define CHAR_SPACE 32
264 #define CHAR_DQUOTE 34
265 #define CHAR_SQUOTE 39
266 #define CHAR_OPEN_SQUOTE 96
267 #define CHAR_TILDE 126
268 #define CHAR_ASTERISK 42
269 #define CHAR_FORESLASH 47
270 #define CHAR_CARAT 94
272 #define CHAR_UNDERSCORE '_'
273 #define CHAR_OPEN_CBRACK '{'
274 #define CHAR_CLOSE_CBRACK '}'
275 #define CHAR_OPEN_RBRACK '('
276 #define CHAR_CLOSE_RBRACK ')'
277 #define CHAR_OPEN_SBRACK '['
278 #define CHAR_CLOSE_SBRACK ']'
280 /* longest and shortest normal PG line lengths */
281 #define LONGEST_PG_LINE 75
282 #define WAY_TOO_LONG 80
283 #define SHORTEST_PG_LINE 55
285 #define SWITCHES "ESTPXLOYHWVMUD" /* switches:- */
286 /* D - ignore DP-specific markup */
287 /* E - echo queried line */
288 /* S - check single quotes */
289 /* T - check common typos */
290 /* P - require closure of quotes on */
291 /* every paragraph */
292 /* X - "Trust no one" :-) Paranoid! */
293 /* Queries everything */
294 /* L - line end checking defaults on */
295 /* -L turns it off */
296 /* O - overview. Just shows counts. */
297 /* Y - puts errors to stdout */
298 /* instead of stderr */
299 /* H - Echoes header fields */
300 /* M - Ignore markup in < > */
301 /* U - Use file of User-defined Typos*/
302 /* W - Defaults for use on Web upload*/
303 /* V - Verbose - list EVERYTHING! */
304 #define SWITNO 14 /* max number of switch parms */
305 /* - used for defining array-size */
306 #define MINARGS 1 /* minimum no of args excl switches */
307 #define MAXARGS 1 /* maximum no of args excl switches */
309 int pswit[SWITNO]; /* program switches set by SWITCHES */
311 #define ECHO_SWITCH 0
312 #define SQUOTE_SWITCH 1
313 #define TYPO_SWITCH 2
314 #define QPARA_SWITCH 3
315 #define PARANOID_SWITCH 4
316 #define LINE_END_SWITCH 5
317 #define OVERVIEW_SWITCH 6
318 #define STDOUT_SWITCH 7
319 #define HEADER_SWITCH 8
321 #define VERBOSE_SWITCH 10
322 #define MARKUP_SWITCH 11
323 #define USERTYPO_SWITCH 12
326 long cnt_dquot; /* for overview mode, count of doublequote queries */
327 long cnt_squot; /* for overview mode, count of singlequote queries */
328 long cnt_brack; /* for overview mode, count of brackets queries */
329 long cnt_bin; /* for overview mode, count of non-ASCII queries */
330 long cnt_odd; /* for overview mode, count of odd character queries */
331 long cnt_long; /* for overview mode, count of long line errors */
332 long cnt_short; /* for overview mode, count of short line queries */
333 long cnt_punct; /* for overview mode, count of punctuation and spacing queries */
334 long cnt_dash; /* for overview mode, count of dash-related queries */
335 long cnt_word; /* for overview mode, count of word queries */
336 long cnt_html; /* for overview mode, count of html queries */
337 long cnt_lineend; /* for overview mode, count of line-end queries */
338 long cnt_spacend; /* count of lines with space at end */
339 long linecnt; /* count of total lines in the file */
340 long checked_linecnt; /* count of lines actually checked */
343 void procfile(char *);
345 #define LOW_THRESHOLD 0
346 #define HIGH_THRESHOLD 1
352 #define FIRST_OF_PAIR 0
353 #define SECOND_OF_PAIR 1
355 #define MAX_WORDPAIR 1000
357 char running_from[MAX_PATH];
359 int mixdigit(char *);
360 const char *getaword(const char *,char *);
361 int matchword(char *,char *);
362 char *flgets(char *,int,FILE *,long);
363 void lowerit(char *);
364 int gcisalpha(unsigned char);
365 int gcisdigit(unsigned char);
366 int gcisletter(unsigned char);
367 char *gcstrchr(char *s,char c);
368 void postprocess_for_HTML(char *);
369 char *linehasmarkup(char *);
370 char *losemarkup(char *);
371 int tagcomp(char *,char *);
372 char *loseentities(char *);
375 void postprocess_for_DP(char *);
377 char wrk[LINEBUFSIZE];
380 #define MAX_QWORD_LENGTH 40
381 char qword[MAX_QWORD][MAX_QWORD_LENGTH];
382 signed int dupcnt[MAX_QWORD];
384 int main(int argc,char **argv)
388 char usertypo_file[MAX_PATH];
390 if (strlen(argv[0])<sizeof(running_from))
391 /* save the path to the executable */
392 strcpy(running_from,argv[0]);
393 /* find out what directory we're running from */
394 s=running_from+strlen(running_from);
395 for (;*s!='/' && *s!='\\' && s>=running_from;s--)
397 switno=strlen(SWITCHES);
398 for (i=switno;--i>0;)
399 pswit[i]=0; /* initialise switches */
401 * Standard loop to extract switches.
402 * When we come out of this loop, the arguments will be
403 * in argv[0] upwards and the switches used will be
404 * represented by their equivalent elements in pswit[]
406 while (--argc>0 && **++argv=='-')
407 for (argsw=argv[0]+1;*argsw!='\0';argsw++)
408 for (i=switno,invarg=1;(--i>=0) && invarg==1;)
409 if ((toupper(*argsw))==SWITCHES[i])
414 /* Paranoid checking is turned OFF, not on, by its switch */
415 pswit[PARANOID_SWITCH]^=1;
416 if (pswit[PARANOID_SWITCH])
417 /* if running in paranoid mode force typo checks as well */
418 pswit[TYPO_SWITCH]=pswit[TYPO_SWITCH]^1;
419 /* Line-end checking is turned OFF, not on, by its switch */
420 pswit[LINE_END_SWITCH]^=1;
421 /* Echoing is turned OFF, not on, by its switch */
422 pswit[ECHO_SWITCH]^=1;
423 if (pswit[OVERVIEW_SWITCH])
424 /* just print summary; don't echo */
425 pswit[ECHO_SWITCH]=0;
427 * Web uploads - for the moment, this is really just a placeholder
428 * until we decide what processing we really want to do on web uploads
430 if (pswit[WEB_SWITCH])
432 /* specific override for web uploads */
433 pswit[ECHO_SWITCH]=1;
434 pswit[SQUOTE_SWITCH]=0;
435 pswit[TYPO_SWITCH]=1;
436 pswit[QPARA_SWITCH]=0;
437 pswit[PARANOID_SWITCH]=1;
438 pswit[LINE_END_SWITCH]=0;
439 pswit[OVERVIEW_SWITCH]=0;
440 pswit[STDOUT_SWITCH]=0;
441 pswit[HEADER_SWITCH]=1;
442 pswit[VERBOSE_SWITCH]=0;
443 pswit[MARKUP_SWITCH]=0;
444 pswit[USERTYPO_SWITCH]=0;
447 if (argc<MINARGS || argc>MAXARGS)
449 /* check number of args */
453 /* read in the user-defined stealth scanno list */
454 if (pswit[USERTYPO_SWITCH])
456 /* ... we were told we had one! */
457 usertypofile=fopen(USERTYPO_FILE,"rb");
460 /* not in cwd. try excuteable directory. */
461 strcpy(usertypo_file,running_from);
462 strcat(usertypo_file,USERTYPO_FILE);
463 usertypofile=fopen(usertypo_file,"rb");
465 /* we ain't got no user typo file! */
466 printf(" --> I couldn't find gutcheck.typ "
467 "-- proceeding without user typos.\n");
473 /* we managed to open a User Typo File! */
474 if (pswit[USERTYPO_SWITCH])
476 while (flgets(aline,LINEBUFSIZE-1,usertypofile,
477 (long)usertypo_count))
483 s=malloc(strlen(aline)+1);
486 fprintf(stderr,"bookloupe: cannot get enough "
487 "memory for user typo file!\n");
491 usertypo[usertypo_count]=s;
493 if (usertypo_count>=MAX_USER_TYPOS)
495 printf(" --> Only %d user-defined typos "
496 "allowed: ignoring the rest\n",
504 fclose(usertypofile);
507 fprintf(stderr,"bookloupe: Check and report on an e-text\n");
508 cnt_dquot=cnt_squot=cnt_brack=cnt_bin=cnt_odd=cnt_long=
509 cnt_short=cnt_punct=cnt_dash=cnt_word=cnt_html=cnt_lineend=
512 if (pswit[OVERVIEW_SWITCH])
514 printf(" Checked %ld lines of %ld (head+foot = %ld)\n\n",
515 checked_linecnt,linecnt,linecnt-checked_linecnt);
516 printf(" --------------- Queries found --------------\n");
518 printf(" Long lines: %14ld\n",cnt_long);
520 printf(" Short lines: %14ld\n",cnt_short);
522 printf(" Line-end problems: %14ld\n",cnt_lineend);
524 printf(" Common typos: %14ld\n",cnt_word);
526 printf(" Unmatched quotes: %14ld\n",cnt_dquot);
528 printf(" Unmatched SingleQuotes: %14ld\n",cnt_squot);
530 printf(" Unmatched brackets: %14ld\n",cnt_brack);
532 printf(" Non-ASCII characters: %14ld\n",cnt_bin);
534 printf(" Proofing characters: %14ld\n",cnt_odd);
536 printf(" Punctuation & spacing queries: %14ld\n",cnt_punct);
538 printf(" Non-standard dashes: %14ld\n",cnt_dash);
540 printf(" Possible HTML tags: %14ld\n",cnt_html);
542 printf(" TOTAL QUERIES %14ld\n",
543 cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+
544 cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend);
549 struct first_pass_results {
550 long firstline,astline;
551 long footerline,totlen,binlen,alphalen,endquote_count,shortline,dotcomma;
552 long fslashline,hyphens,longline,verylongline,htmcount,standalone_digit;
553 long spacedash,emdash,space_emdash,non_PG_space_emdash,PG_space_emdash;
554 signed int Dutchcount,Frenchcount;
560 * Run a first pass - verify that it's a valid PG
561 * file, decide whether to report some things that
562 * occur many times in the text like long or short
563 * lines, non-standard dashes, etc.
/*
 * first_pass: read infile line by line into the global aline and
 * accumulate statistics in a function-static first_pass_results
 * (header/footer line numbers, character totals, counts of short, long,
 * hyphen-ended and HTML-looking lines, dash styles, and a few
 * language/digit word counts).  NOTE: this listing omits parts of the
 * body, so only the visible steps are documented.
 */
565 struct first_pass_results *first_pass(FILE *infile)
567 char laststart=CHAR_SPACE;
570 unsigned int lastlen=0,lastblen=0;
571 long spline=0,nspline=0;
572 static struct first_pass_results results={0};
573 char inword[MAXWORDLEN]="";
574 while (fgets(aline,LINEBUFSIZE-1,infile))
/* Stop once the string is empty: strlen()-1 would otherwise wrap and index before aline on every blank line. */
576 while (*aline && (aline[strlen(aline)-1]==10 || aline[strlen(aline)-1]==13))
577 aline[strlen(aline)-1]=0;
579 if (strstr(aline,"*END") && strstr(aline,"SMALL PRINT") &&
580 (strstr(aline,"PUBLIC DOMAIN") || strstr(aline,"COPYRIGHT")))
583 printf(" --> Duplicate header?\n");
584 spline=linecnt+1; /* first line of non-header text, that is */
586 if (!strncmp(aline,"*** START",9) && strstr(aline,"PROJECT GUTENBERG"))
589 printf(" --> Duplicate header?\n");
590 nspline=linecnt+1; /* first line of non-header text, that is */
592 if (spline || nspline)
595 if (strstr(aline,"end") && strstr(aline,"project gutenberg"))
597 if (strstr(aline,"end")<strstr(aline,"project gutenberg"))
599 if (results.footerline)
601 /* it's an old-form header - we can detect duplicates */
603 printf(" --> Duplicate footer?\n");
606 results.footerline=linecnt;
611 results.firstline=spline;
613 results.firstline=nspline; /* override with new */
614 if (results.footerline)
615 continue; /* don't count the boilerplate in the footer */
617 results.totlen+=llen;
620 if ((unsigned char)aline[i]>127)
622 if (gcisalpha(aline[i]))
/* <ctype.h> functions need a value representable as unsigned char. */
624 if (i>0 && aline[i]==CHAR_DQUOTE && isalpha((unsigned char)aline[i-1]))
625 results.endquote_count++;
627 if (strlen(aline)>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE &&
628 lastblen>2 && lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
630 if (*aline && (unsigned char)aline[strlen(aline)-1]<=CHAR_SPACE)
632 if (strstr(aline,".,"))
634 /* only count ast lines for ignoring purposes where there is */
635 /* lowercase text on the line */
636 if (strstr(aline,"*"))
639 if (*s>='a' && *s<='z')
644 if (strstr(aline,"/"))
645 results.fslashline++;
/* Skip trailing white space; i can end at 0, or at -1 for an empty line (assumes i is signed, as the cast further down suggests). */
646 for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--)
/* Guard both indices so nothing before aline[0] is read. */
648 if (i>=0 && aline[i]=='-' && (i==0 || aline[i-1]!='-'))
650 if (llen>LONGEST_PG_LINE)
652 if (llen>WAY_TOO_LONG)
653 results.verylongline++;
654 if (strstr(aline,"<") && strstr(aline,">"))
656 i=(signed int)(strstr(aline,">")-strstr(aline,"<")+1);
659 if (strstr(aline,"<i>"))
660 results.htmcount+=4; /* bonus marks! */
662 /* Check for spaced em-dashes */
663 if (strstr(aline,"--"))
/* A "--" at column 0 has no preceding character: treat that as "no space before" instead of reading aline[-1]. */
666 if ((strstr(aline,"--")>aline && *(strstr(aline,"--")-1)==CHAR_SPACE) ||
667 (*(strstr(aline,"--")+2)==CHAR_SPACE))
668 results.space_emdash++;
669 if (strstr(aline,"--")>aline && *(strstr(aline,"--")-1)==CHAR_SPACE &&
670 (*(strstr(aline,"--")+2)==CHAR_SPACE))
671 /* count of em-dashes with spaces both sides */
672 results.non_PG_space_emdash++;
673 if ((strstr(aline,"--")==aline || *(strstr(aline,"--")-1)!=CHAR_SPACE) &&
674 (*(strstr(aline,"--")+2)!=CHAR_SPACE))
675 /* count of PG-type em-dashes with no spaces */
676 results.PG_space_emdash++;
680 s=getaword(s,inword);
681 if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))
682 results.Dutchcount++;
683 if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))
684 results.Frenchcount++;
685 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
686 results.standalone_digit++;
688 /* Check for spaced dashes */
689 if (strstr(aline," -") && *(strstr(aline," -")+2)!='-')
692 lastlen=strlen(aline);
699 signed int shortline,longline,bin,dash,dotcomma,ast,fslash,digit,hyphen;
700 signed int endquote,isDutch,isFrench;
706 * Make some snap decisions based on the first pass results.
708 struct warnings *report_first_pass(struct first_pass_results *results)
710 static struct warnings warnings={0};
712 printf(" --> %ld lines in this file have white space at end\n",
715 if (results->dotcomma>5)
718 printf(" --> %ld lines in this file contain '.,'. "
719 "Not reporting them.\n",results->dotcomma);
722 * If more than 50 lines, or one-tenth, are short,
723 * don't bother reporting them.
725 warnings.shortline=1;
726 if (results->shortline>50 || results->shortline*10>linecnt)
728 warnings.shortline=0;
729 printf(" --> %ld lines in this file are short. "
730 "Not reporting short lines.\n",results->shortline);
733 * If more than 50 lines, or one-tenth, are long,
734 * don't bother reporting them.
737 if (results->longline>50 || results->longline*10>linecnt)
740 printf(" --> %ld lines in this file are long. "
741 "Not reporting long lines.\n",results->longline);
743 /* If more than 10 lines contain asterisks, don't bother reporting them. */
745 if (results->astline>10)
748 printf(" --> %ld lines in this file contain asterisks. "
749 "Not reporting them.\n",results->astline);
752 * If more than 10 lines contain forward slashes,
753 * don't bother reporting them.
756 if (results->fslashline>10)
759 printf(" --> %ld lines in this file contain forward slashes. "
760 "Not reporting them.\n",results->fslashline);
763 * If more than 20 lines contain unpunctuated endquotes,
764 * don't bother reporting them.
767 if (results->endquote_count>20)
770 printf(" --> %ld lines in this file contain unpunctuated endquotes. "
771 "Not reporting them.\n",results->endquote_count);
774 * If more than 10 lines contain standalone digits,
775 * don't bother reporting them.
778 if (results->standalone_digit>10)
781 printf(" --> %ld lines in this file contain standalone 0s and 1s. "
782 "Not reporting them.\n",results->standalone_digit);
785 * If more than 20 lines contain hyphens at end,
786 * don't bother reporting them.
789 if (results->hyphens>20)
792 printf(" --> %ld lines in this file have hyphens at end. "
793 "Not reporting them.\n",results->hyphens);
795 if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
797 printf(" --> Looks like this is HTML. Switching HTML mode ON.\n");
798 pswit[MARKUP_SWITCH]=1;
800 if (results->verylongline>0)
801 printf(" --> %ld lines in this file are VERY long!\n",
802 results->verylongline);
804 * If there are more non-PG spaced dashes than PG em-dashes,
805 * assume it's deliberate.
806 * Current PG guidelines say don't use them, but older texts do,
807 * and some people insist on them whatever the guidelines say.
810 if (results->spacedash+results->non_PG_space_emdash>
811 results->PG_space_emdash)
814 printf(" --> There are %ld spaced dashes and em-dashes. "
815 "Not reporting them.\n",
816 results->spacedash+results->non_PG_space_emdash);
818 /* If more than a quarter of characters are hi-bit, bug out. */
820 if (results->binlen*4>results->totlen)
822 printf(" --> This file does not appear to be ASCII. "
823 "Terminating. Best of luck with it!\n");
826 if (results->alphalen*4<results->totlen)
828 printf(" --> This file does not appear to be text. "
829 "Terminating. Best of luck with it!\n");
832 if (results->binlen*100>results->totlen || results->binlen>100)
834 printf(" --> There are a lot of foreign letters here. "
835 "Not reporting them.\n");
839 if (results->Dutchcount>50)
842 printf(" --> This looks like Dutch - "
843 "switching off dashes and warnings for 's Middags case.\n");
846 if (results->Frenchcount>50)
849 printf(" --> This looks like French - "
850 "switching off some doublepunct.\n");
852 if (results->firstline && results->footerline)
853 printf(" The PG header and footer appear to be already on.\n");
856 if (results->firstline)
857 printf(" The PG header is on - no footer.\n");
858 if (results->footerline)
859 printf(" The PG footer is on - no header.\n");
862 if (pswit[VERBOSE_SWITCH])
865 warnings.shortline=1;
874 printf(" *** Verbose output is ON -- you asked for it! ***\n");
876 if (warnings.isDutch)
878 if (results->footerline>0 && results->firstline>0 &&
879 results->footerline>results->firstline &&
880 results->footerline-results->firstline<100)
882 printf(" --> I don't really know where this text starts. \n");
883 printf(" There are no reference points.\n");
884 printf(" I'm going to have to report the header and footer "
886 results->firstline=0;
893 signed int c_unders,c_brack,s_brack,r_brack;
894 signed int open_single_quote,close_single_quote;
900 * Look along the line, accumulate the count of quotes, and see
901 * if this is an empty line - i.e. a line with nothing on it
903 * If line has just spaces, period, * and/or - on it, don't
904 * count it, since empty lines with asterisks or dashes to
905 * separate sections are common.
907 * Returns: Non-zero if the line is empty.
/*
 * analyse_quotes: walk the line s, updating the single-quote, underscore
 * and bracket tallies in *counters, and work out whether the line counts
 * as empty (isemptyline starts at 1 and is cleared by the first
 * significant character).  NOTE: this listing omits parts of the body.
 */
909 int analyse_quotes(const char *s,struct counters *counters)
911 signed int guessquote=0;
912 int isemptyline=1; /* assume the line is empty until proven otherwise */
917 if (*s==CHAR_SQUOTE || *s==CHAR_OPEN_SQUOTE)
922 * At start of line, it can only be an openquote.
923 * Hardcode a very common exception!
/* The word starts right after the quote, i.e. at s+1 (same offset as the mid-line 'tis test below); s+2 skipped the 't'. */
925 if (strncmp(s+1,"tis",3) && strncmp(s+1,"Tis",3))
926 counters->open_single_quote++;
928 else if (gcisalpha(s[-1]) && gcisalpha(s[1]))
929 /* Do nothing! it's definitely an apostrophe, not a quote */
931 /* it's outside a word - let's check it out */
932 else if (*s==CHAR_OPEN_SQUOTE || gcisalpha(s[1]))
934 /* it damwell better BE an openquote */
935 if (strncmp(s+1,"tis",3) && strncmp(s+1,"Tis",3))
936 /* hardcode a very common exception! */
937 counters->open_single_quote++;
941 /* now - is it a closequote? */
942 guessquote=0; /* accumulate clues */
943 if (gcisalpha(s[-1]))
945 /* it follows a letter - could be either */
949 /* looks like a plural apostrophe */
951 if (s[1]==CHAR_SPACE) /* bonus marks! */
955 /* it doesn't have a letter either side */
956 else if (strchr(".?!,;:",s[-1]) && strchr(".?!,;: ",s[1]))
957 guessquote+=8; /* looks like a closequote */
960 if (counters->open_single_quote>counters->close_single_quote)
962 * Give it the benefit of some doubt,
963 * if a squote is already open.
969 counters->close_single_quote++;
972 if (*s!=CHAR_SPACE && *s!='-' && *s!='.' && *s!=CHAR_ASTERISK &&
974 isemptyline=0; /* ignore lines like * * * as spacers */
975 if (*s==CHAR_UNDERSCORE)
976 counters->c_unders++;
977 if (*s==CHAR_OPEN_CBRACK)
979 if (*s==CHAR_CLOSE_CBRACK)
981 if (*s==CHAR_OPEN_RBRACK)
983 if (*s==CHAR_CLOSE_RBRACK)
985 if (*s==CHAR_OPEN_SBRACK)
987 if (*s==CHAR_CLOSE_SBRACK)
995 * check_for_odd_characters:
997 * Check for binary and other odd characters.
/*
 * check_for_odd_characters: scan aline one character at a time and query
 * control/hi-bit bytes, tabs, tildes, carets, forward slashes (only when
 * warnings->fslash is set) and asterisks (only in paranoid mode, when
 * warnings->ast is set and the line is not empty).  For each query the
 * line is echoed when ECHO_SWITCH is set and the message is printed
 * unless OVERVIEW_SWITCH is set.
 * NOTE(review): this listing omits parts of the body, including the rest
 * of the parameter list; isemptyline is presumably declared there.
 */
999 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
1002 /* Don't repeat multiple warnings on one line. */
/* One flag per query type; each test below is skipped once its flag is non-zero (the assignments are not visible in this listing). */
1003 signed int eNon_A=0,eTab=0,eTilde=0,eCarat=0,eFSlash=0,eAst=0;
1006 for (s=aline;*s;s++)
/* c holds the byte as an unsigned value so that it can be compared with 127 and printed as a character code. */
1008 c=*(unsigned char *)s;
/* Odd means: below space but neither tab (9) nor newline, or above 127. */
1009 if (!eNon_A && (*s<CHAR_SPACE && *s!=9 && *s!='\n' || c>127))
1011 if (pswit[ECHO_SWITCH])
1012 printf("\n%s\n",aline);
1013 if (!pswit[OVERVIEW_SWITCH])
1015 printf(" Line %ld column %d - "
1016 "Non-ISO-8859 character %d\n",linecnt,(int)(s-aline)+1,c);
1018 printf(" Line %ld column %d - Non-ASCII character %d\n",
1019 linecnt,(int)(s-aline)+1,c);
1024 if (!eTab && *s==CHAR_TAB)
1026 if (pswit[ECHO_SWITCH])
1027 printf("\n%s\n",aline);
1028 if (!pswit[OVERVIEW_SWITCH])
1029 printf(" Line %ld column %d - Tab character?\n",
1030 linecnt,(int)(s-aline)+1);
1035 if (!eTilde && *s==CHAR_TILDE)
1038 * Often used by OCR software to indicate an
1039 * unrecognizable character.
1041 if (pswit[ECHO_SWITCH])
1042 printf("\n%s\n",aline);
1043 if (!pswit[OVERVIEW_SWITCH])
1044 printf(" Line %ld column %d - Tilde character?\n",
1045 linecnt,(int)(s-aline)+1);
1050 if (!eCarat && *s==CHAR_CARAT)
1052 if (pswit[ECHO_SWITCH])
1053 printf("\n%s\n",aline);
1054 if (!pswit[OVERVIEW_SWITCH])
1055 printf(" Line %ld column %d - Carat character?\n",
1056 linecnt,(int)(s-aline)+1);
1061 if (!eFSlash && *s==CHAR_FORESLASH && warnings->fslash)
1063 if (pswit[ECHO_SWITCH])
1064 printf("\n%s\n",aline);
1065 if (!pswit[OVERVIEW_SWITCH])
1066 printf(" Line %ld column %d - Forward slash?\n",
1067 linecnt,(int)(s-aline)+1);
1073 * Report asterisks only in paranoid mode,
1074 * since they're often deliberate.
1076 if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
1079 if (pswit[ECHO_SWITCH])
1080 printf("\n%s\n",aline);
1081 if (!pswit[OVERVIEW_SWITCH])
1082 printf(" Line %ld column %d - Asterisk?\n",
1083 linecnt,(int)(s-aline)+1);
1092 * check_for_long_line:
1094 * Check for line too long.
/*
 * check_for_long_line: query aline when strlen(aline) exceeds
 * LONGEST_PG_LINE. With ECHO_SWITCH the offending line is echoed first;
 * unless OVERVIEW_SWITCH is set, a message giving the line length (as both
 * the column and the "Long line" value) is printed.
 *
 * strlen() returns size_t, which is not the type %d expects, so the length
 * is converted to int before it is handed to printf().
 */
1096 void check_for_long_line(const char *aline)
1098 if (strlen(aline)>LONGEST_PG_LINE)
1100 if (pswit[ECHO_SWITCH])
1101 printf("\n%s\n",aline);
1102 if (!pswit[OVERVIEW_SWITCH])
1103 printf(" Line %ld column %d - Long line %d\n",
1104 linecnt,(int)strlen(aline),(int)strlen(aline));
/*
 * State carried from line to line for check_for_short_line(): len is the
 * length of the last line examined and blen that of the last but one.
 * The code also reads and writes a "start" member; its declaration is
 * presumably on a line omitted from this listing.
 */
1110 struct line_properties {
1111 unsigned int len,blen;
1116 * check_for_short_line:
1118 * Check for line too short.
1120 * This one is a bit trickier to implement: we don't want to
1121 * flag the last line of a paragraph for being short, so we
1122 * have to wait until we know that our current line is a
1123 * "normal" line, then report the _previous_ line if it was too
1124 * short. We also don't want to report indented lines like
1125 * chapter heads or formatted quotations. We therefore keep
1126 * last->len as the length of the last line examined, and
1127 * last->blen as the length of the last but one, and try to
1128 * suppress unnecessary warnings by checking that both were of
1129 * "normal" length. We keep the first character of the last
1130 * line in last->start, and if it was a space, we assume that
1131 * the formatting is deliberate. I can't figure out a way to
1132 * distinguish something like a quoted verse left-aligned or
1133 * the header or footer of a letter from a paragraph of short
1134 * lines - maybe if I examined the whole paragraph, and if the
1135 * para has less than, say, 8 lines and if all lines are short,
1136 * then just assume it's OK? Need to look at some texts to see
1137 * how often a formula like this would get the right result.
/*
 * check_for_short_line: query the PREVIOUS line (prevline, numbered
 * linecnt-1) when all of the following hold:
 *   - the current line aline is longer than one character;
 *   - last->len is above 1 and below SHORTEST_PG_LINE;
 *   - last->blen is above SHORTEST_PG_LINE;
 *   - last->start is not a space.
 *
 * strlen() returns size_t, which is not the type %d expects, so the length
 * is converted to int before it is handed to printf().
 */
1139 void check_for_short_line(const char *aline,const struct line_properties *last)
1141 if (strlen(aline)>1 && last->len>1 && last->len<SHORTEST_PG_LINE &&
1142 last->blen>1 && last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
1144 if (pswit[ECHO_SWITCH])
1145 printf("\n%s\n",prevline);
1146 if (!pswit[OVERVIEW_SWITCH])
1147 printf(" Line %ld column %d - Short line %d?\n",
1148 linecnt-1,(int)strlen(prevline),(int)strlen(prevline));
1155 * check_for_starting_punctuation:
1157 * Look for punctuation other than full ellipses at start of line.
/*
 * check_for_starting_punctuation: query a non-empty line whose first
 * character is one of . ? ! , ; : unless the line begins with the ellipsis
 * literal tested by strncmp(). The column is always reported as 1.
 * The argument line of the final printf() is omitted from this listing.
 */
1159 void check_for_starting_punctuation(const char *aline)
1161 if (*aline && strchr(".?!,;:",aline[0]) && strncmp(". . .",aline,5))
1163 if (pswit[ECHO_SWITCH])
1164 printf("\n%s\n",aline);
1165 if (!pswit[OVERVIEW_SWITCH])
1166 printf(" Line %ld column 1 - Begins with punctuation?\n",
1174 * check_for_spaced_emdash:
1176 * Check for spaced em-dashes.
1178 * We must check _all_ occurrences of "--" on the line
1179 * hence the loop - even if the first double-dash is OK
1180 * there may be another that's wrong later on.
/*
 * check_for_spaced_emdash: for every "--" that strstr() finds from s
 * onward, query it when it is preceded by a space (and is not at the very
 * start of the line) or when the character after it, t[2], is a space.
 * The declarations and the statement that advances s past each match are
 * on lines omitted from this listing.
 */
1182 void check_for_spaced_emdash(const char *aline)
1186 while ((t=strstr(s,"--")))
1188 if (t>aline && t[-1]==CHAR_SPACE || t[2]==CHAR_SPACE)
1190 if (pswit[ECHO_SWITCH])
1191 printf("\n%s\n",aline);
1192 if (!pswit[OVERVIEW_SWITCH])
1193 printf(" Line %ld column %d - Spaced em-dash?\n",
1194 linecnt,(int)(t-aline)+1);
1203 * check_for_spaced_dash:
1205 * Check for spaced dashes.
/*
 * check_for_spaced_dash: look first for " -" and, only when that is absent,
 * for "- ". A "- " match is queried when it is at the start of the line or
 * is not preceded by another '-'.
 * NOTE(review): the test applied to a " -" match is on lines omitted from
 * this listing; check it in the full source.
 */
1207 void check_for_spaced_dash(const char *aline)
1210 if ((s=strstr(aline," -")))
1214 if (pswit[ECHO_SWITCH])
1215 printf("\n%s\n",aline);
1216 if (!pswit[OVERVIEW_SWITCH])
1217 printf(" Line %ld column %d - Spaced dash?\n",
1218 linecnt,(int)(s-aline)+1);
1223 else if ((s=strstr(aline,"- ")))
1225 if (s==aline || s[-1]!='-')
1227 if (pswit[ECHO_SWITCH])
1228 printf("\n%s\n",aline);
1229 if (!pswit[OVERVIEW_SWITCH])
1230 printf(" Line %ld column %d - Spaced dash?\n",
1231 linecnt,(int)(s-aline)+1);
1239 * check_for_unmarked_paragraphs:
1241 * Check for unmarked paragraphs indicated by separate speakers.
1243 * May well be false positive:
1244 * "Bravo!" "Wonderful!" called the crowd.
1245 * but useful all the same.
/*
 * check_for_unmarked_paragraphs: search aline for a closing double quote
 * followed by an opening one and query the column of the match.
 * NOTE(review): the two strstr() literals below look identical in this
 * listing; they may differ only in whitespace that the listing collapsed.
 * Verify both patterns, and the omitted tests around them, in the full
 * source.
 */
1247 void check_for_unmarked_paragraphs(const char *aline)
1250 s=strstr(aline,"\" \"");
1252 s=strstr(aline,"\" \"");
1255 if (pswit[ECHO_SWITCH])
1256 printf("\n%s\n",aline);
1257 if (!pswit[OVERVIEW_SWITCH])
1258 printf(" Line %ld column %d - Query missing paragraph break?\n",
1259 linecnt,(int)(s-aline)+1);
1266 * check_for_jeebies:
1268 * Check for "to he" and other easy h/b errors.
1270 * This is a very inadequate effort on the h/b problem,
1271 * but the phrase "to he" is always an error, whereas "to
1272 * be" is quite common.
1273 * Similarly, '"Quiet!", be said.' is a non-be error
1274 * "to he" is _not_ always an error!:
1275 * "Where they went to he couldn't say."
1276 * Another false positive:
1277 * What would "Cinderella" be without the . . .
1278 * and another: "If he wants to he can see for himself."
/*
 * check_for_jeebies: search aline for fixed phrases that usually indicate
 * an h/b scanning error and query the column of the match. There are three
 * groups, each with its own message: he/be, had/bad and hut/but.
 * The tests between successive strstr() calls are on lines omitted from
 * this listing; presumably each later pattern is tried only when the
 * earlier ones found nothing.
 * NOTE(review): two of the he/be literals look identical here, possibly
 * because the listing collapsed whitespace.
 */
1280 void check_for_jeebies(const char *aline)
/* Group 1: "be" where "he" was meant, and "to he". */
1283 s=strstr(aline," be could ");
1285 s=strstr(aline," be would ");
1287 s=strstr(aline," was be ");
1289 s=strstr(aline," be is ");
1291 s=strstr(aline," is be ");
1293 s=strstr(aline,"\", be ");
1295 s=strstr(aline,"\" be ");
1297 s=strstr(aline,"\" be ");
1299 s=strstr(aline," to he ");
1302 if (pswit[ECHO_SWITCH])
1303 printf("\n%s\n",aline);
1304 if (!pswit[OVERVIEW_SWITCH])
1305 printf(" Line %ld column %d - Query he/be error?\n",
1306 linecnt,(int)(s-aline)+1);
/* Group 2: "had"/"bad" confusions. */
1310 s=strstr(aline," the had ");
1312 s=strstr(aline," a had ");
1314 s=strstr(aline," they bad ");
1316 s=strstr(aline," she bad ");
1318 s=strstr(aline," he bad ");
1320 s=strstr(aline," you bad ");
1322 s=strstr(aline," i bad ");
1325 if (pswit[ECHO_SWITCH])
1326 printf("\n%s\n",aline);
1327 if (!pswit[OVERVIEW_SWITCH])
1328 printf(" Line %ld column %d - Query had/bad error?\n",
1329 linecnt,(int)(s-aline)+1);
/* Group 3: "hut" after a semi-colon or comma, where "but" is likely. */
1333 s=strstr(aline,"; hut ");
1335 s=strstr(aline,", hut ");
1338 if (pswit[ECHO_SWITCH])
1339 printf("\n%s\n",aline);
1340 if (!pswit[OVERVIEW_SWITCH])
1341 printf(" Line %ld column %d - Query hut/but error?\n",
1342 linecnt,(int)(s-aline)+1);
1349 * check_for_mta_from:
1351 * Special case - angled bracket in front of "From" placed there by an
1352 * MTA when sending an e-mail.
/*
 * check_for_mta_from: search aline for the text ">From" and query the
 * column where it was found. The test on the strstr() result is on a line
 * omitted from this listing.
 */
1354 void check_for_mta_from(const char *aline)
1357 s=strstr(aline,">From");
1360 if (pswit[ECHO_SWITCH])
1361 printf("\n%s\n",aline);
1362 if (!pswit[OVERVIEW_SWITCH])
1363 printf(" Line %ld column %d - Query angled bracket with From\n",
1364 linecnt,(int)(s-aline)+1);
1371 * check_for_orphan_character:
1373 * Check for a single character line -
1374 * often an overflow from bad wrapping.
/*
 * check_for_orphan_character: handle a line made of exactly one character
 * (*aline is set and aline[1] is the terminator). Characters accepted as
 * numerals - I, V, X, L and whatever the omitted rest of the condition
 * lists - are ignored; anything else is queried at column 1.
 */
1376 void check_for_orphan_character(const char *aline)
1378 if (*aline && !aline[1])
1380 if (*aline=='I' || *aline=='V' || *aline=='X' || *aline=='L' ||
1382 ; /* Nothing - ignore numerals alone on a line. */
1385 if (pswit[ECHO_SWITCH])
1386 printf("\n%s\n",aline);
1387 if (!pswit[OVERVIEW_SWITCH])
1388 printf(" Line %ld column 1 - Query single character line\n",
1397 * check_for_pling_scanno:
1399 * Check for I" - often should be !
/*
 * check_for_pling_scanno: search aline for a space, a capital I and a
 * double quote in sequence, which often should have been an exclamation
 * mark, and query it.
 * NOTE(review): the column is printed with %ld, unlike the %d used by the
 * sibling checks. The argument line is omitted from this listing, so
 * confirm that the value passed there really has type long.
 */
1401 void check_for_pling_scanno(const char *aline)
1404 s=strstr(aline," I\"");
1407 if (pswit[ECHO_SWITCH])
1408 printf("\n%s\n",aline);
1409 if (!pswit[OVERVIEW_SWITCH])
1410 printf(" Line %ld column %ld - Query I=exclamation mark?\n",
1418 * check_for_extra_period:
1420 * Check for period without a capital letter. Cut-down from gutspell.
1421 * Only works when it happens on a single line.
/*
 * check_for_extra_period: in paranoid mode only, look at each ". " in
 * aline and query it as a possible stray period.
 *
 * Visible steps for each match:
 *   - the character before the period is tested with gcisalpha();
 *   - for Dutch texts a special "'s Middags" pattern is recognised;
 *   - s1 scans forward to the next letter or digit, and the match is only
 *     investigated further when that character is a lowercase a-z;
 *   - s1 then scans back over the word before the period, and testword is
 *     compared with the abbrev list, tested for a leading digit, tested
 *     with isroman() and scanned for vowels;
 *   - unless VERBOSE_SWITCH is set, words already stored in the static
 *     qperiod table are looked up, and new words are stored there while
 *     there is room.
 * What each of those tests decides is on lines omitted from this listing.
 *
 * isdigit() requires a value representable as unsigned char, and plain char
 * may be negative for bytes above 127, so the argument is cast before the
 * call.
 */
1423 void check_for_extra_period(const char *aline,const struct warnings *warnings)
1425 const char *s,*t,*s1;
1426 signed int i,istypo,isdup;
1427 static char qperiod[MAX_QWORD][MAX_QWORD_LENGTH];
1428 static int qperiod_index=0;
1429 char testword[MAXWORDLEN]="";
1430 if (pswit[PARANOID_SWITCH])
1432 for (t=s=aline;strstr(t,". ");)
1438 /* start of line punctuation is handled elsewhere */
1441 if (!gcisalpha(t[-1]))
1446 if (warnings->isDutch)
1448 /* For Frank & Jeroen -- 's Middags case */
1449 if (t[2]==CHAR_SQUOTE && t[3]>='a' && t[3]<='z' &&
1450 t[4]==CHAR_SPACE && t[5]>='A' && t[5]<='Z')
1457 while (*s1 && !gcisalpha(*s1) && !isdigit((unsigned char)*s1))
1459 if (*s1>='a' && *s1<='z')
1461 /* we have something to investigate */
1463 /* so let's go back and find out */
1464 for (s1=t-1;s1>=s &&
1465 (gcisalpha(*s1) || gcisdigit(*s1) || *s1==CHAR_SQUOTE &&
1466 gcisalpha(s1[1]) && gcisalpha(s1[-1]));s1--)
1469 for (i=0;*s1 && *s1!='.';s1++,i++)
1472 for (i=0;*abbrev[i];i++)
1473 if (!strcmp(testword,abbrev[i]))
1475 if (gcisdigit(*testword))
1479 if (isroman(testword))
1484 for (i=0;testword[i];i++)
1485 if (strchr(vowels,testword[i]))
1491 if (strlen(testword)<MAX_QWORD_LENGTH &&
1492 !pswit[VERBOSE_SWITCH])
1493 for (i=0;i<qperiod_index;i++)
1494 if (!strcmp(testword,qperiod[i]))
1498 if (qperiod_index<MAX_QWORD &&
1499 strlen(testword)<MAX_QWORD_LENGTH)
1501 strcpy(qperiod[qperiod_index],testword);
1504 if (pswit[ECHO_SWITCH])
1505 printf("\n%s\n",aline);
1506 if (!pswit[OVERVIEW_SWITCH])
1507 printf(" Line %ld column %d - Extra period?\n",
1508 linecnt,(int)(t-aline)+1);
1520 * check_for_following_punctuation:
1522 * Check for words usually not followed by punctuation.
/*
 * check_for_following_punctuation: only when TYPO_SWITCH is set, take
 * words from aline with getaword() and query:
 *   - a word found in the nocomma list when the next character is
 *     a comma, semi-colon or colon;
 *   - a word found in the noperiod list when the next character is
 *     a period or exclamation mark.
 * The loop over the words and any preparation of inword before the list
 * lookups are on lines omitted from this listing.
 */
1524 void check_for_following_punctuation(const char *aline)
1527 const char *s,*wordstart;
1528 char inword[MAXWORDLEN];
1529 if (pswit[TYPO_SWITCH])
1534 s=getaword(s,inword);
1538 for (i=0;*nocomma[i];i++)
1539 if (!strcmp(inword,nocomma[i]))
1541 if (*s==',' || *s==';' || *s==':')
1543 if (pswit[ECHO_SWITCH])
1544 printf("\n%s\n",aline);
1545 if (!pswit[OVERVIEW_SWITCH])
1546 printf(" Line %ld column %d - "
1547 "Query punctuation after %s?\n",
1548 linecnt,(int)(s-aline)+1,inword);
1553 for (i=0;*noperiod[i];i++)
1554 if (!strcmp(inword,noperiod[i]))
1556 if (*s=='.' || *s=='!')
1558 if (pswit[ECHO_SWITCH])
1559 printf("\n%s\n",aline);
1560 if (!pswit[OVERVIEW_SWITCH])
1561 printf(" Line %ld column %d - "
1562 "Query punctuation after %s?\n",
1563 linecnt,(int)(s-aline)+1,inword);
/*
 * procfile: run every check on one text file and print the queries.
 *
 * filename is opened with fopen(); the "cannot open" message goes to stdout
 * when STDOUT_SWITCH is set and otherwise, it appears, to stderr. After the
 * "File:" banner, first_pass() and report_first_pass() are run; the
 * warnings structure they yield gates several of the later checks.
 *
 * The main loop reads lines into aline with flgets(). For each line it:
 *   - skips "-----File: " page separators when DP_SWITCH is set;
 *   - skips lines before firstline or after footerline, echoing the Title,
 *     Author, Release Date and Edition lines when HEADER_SWITCH is set;
 *   - prints any stored quote, bracket or underscore mismatch message, with
 *     the paragraph's first line (parastart) when ECHO_SWITCH is set;
 *   - updates the quote counters through analyse_quotes();
 *   - at a paragraph start, remembers the line and queries a lower-case
 *     first letter;
 *   - queries a broken em-dash, control characters, and everything the
 *     check_for_*() helpers cover;
 *   - tests each word for mixed digits, unlikely letter combinations, the
 *     built-in and user typo lists, and (paranoid mode) standalone 0 and 1;
 *   - checks spacing around punctuation, quotes and brackets, double
 *     punctuation, a capital S after an apostrophe, a hyphen at line end,
 *     and HTML tags or entities;
 *   - at the end of a paragraph stores mismatch messages in the *_err
 *     buffers and clears the counters;
 *   - queries missing punctuation at the end of the previous paragraph;
 *   - copies aline into prevline.
 * After the loop, unless OVERVIEW_SWITCH is set, the duplicated queried
 * words are listed.
 *
 * Changes in this revision:
 *   - strlen() results are cast to int where they are printed with %d,
 *     because size_t is not the type %d expects;
 *   - arguments to isdigit() and isalpha() are cast to unsigned char,
 *     because plain char may be negative for bytes above 127 and these
 *     functions require a value representable as unsigned char. That cast
 *     is applied to both quote-parity checks and to the endquote check.
 *
 * NOTE(review): this listing omits short lines (braces, else branches,
 * short statements), so the exact control flow must be confirmed in the
 * full source. The column argument also varies between reports (+1 in most
 * places, +2 for user typos and standalone digits, plain i for the
 * end-of-line hyphen and endquote reports) - confirm that is intended.
 */
1577 void procfile(char *filename)
1579 const char *s,*t,*wordstart;
1580 char inword[MAXWORDLEN],testword[MAXWORDLEN];
1581 char parastart[81]; /* first line of current para */
1583 struct first_pass_results *first_pass_results;
1584 struct warnings *warnings;
1585 struct counters counters={0};
1586 struct line_properties last={0};
1588 long squot,start_para_line;
1589 signed int i,llen,isacro,isellipsis,istypo,alower;
1590 signed int dquotepar,squotepar;
1591 signed int isnewpara,vowel,consonant;
1592 char dquote_err[80],squote_err[80],rbrack_err[80],sbrack_err[80],
1593 cbrack_err[80],unders_err[80];
1594 signed int qword_index,isdup;
1596 last.start=CHAR_SPACE;
1597 *dquote_err=*squote_err=*rbrack_err=*cbrack_err=*sbrack_err=
1598 *unders_err=*prevline=0;
1599 linecnt=checked_linecnt=start_para_line=0;
1601 i=llen=isacro=isellipsis=0;
1602 isnewpara=vowel=consonant=enddash=0;
1604 *inword=*testword=0;
1605 dquotepar=squotepar=0;
/* The file is opened in binary mode. */
1606 infile=fopen(filename,"rb");
1609 if (pswit[STDOUT_SWITCH])
1610 fprintf(stdout,"bookloupe: cannot open %s\n",filename);
1612 fprintf(stderr,"bookloupe: cannot open %s\n",filename);
1615 fprintf(stdout,"\n\nFile: %s\n\n",filename);
/* First pass over the file; the warnings it yields gate later checks. */
1616 first_pass_results=first_pass(infile);
1617 warnings=report_first_pass(first_pass_results);
1620 * Here we go with the main pass. Hold onto yer hat!
1621 * Re-init some variables we've dirtied.
1624 while (flgets(aline,LINEBUFSIZE-1,infile,linecnt+1))
1629 if (pswit[DP_SWITCH] && !strncmp(aline,"-----File: ",11))
1630 continue; // skip DP page separators completely
1631 if (linecnt<first_pass_results->firstline ||
1632 (first_pass_results->footerline>0 &&
1633 linecnt>first_pass_results->footerline))
1635 if (pswit[HEADER_SWITCH])
1637 if (!strncmp(aline,"Title:",6))
1638 printf(" %s\n",aline);
1639 if (!strncmp(aline,"Author:",7))
1640 printf(" %s\n",aline);
1641 if (!strncmp(aline,"Release Date:",13))
1642 printf(" %s\n",aline);
1643 if (!strncmp(aline,"Edition:",8))
1644 printf(" %s\n\n",aline);
1646 continue; /* skip through the header */
1651 * If we are in a state of unbalanced quotes, and this line
1652 * doesn't begin with a quote, output the stored error message.
1653 * If the -P switch was used, print the warning even if the
1654 * new para starts with quotes.
1660 if (*t!=CHAR_DQUOTE || pswit[QPARA_SWITCH])
1662 if (!pswit[OVERVIEW_SWITCH])
1664 if (pswit[ECHO_SWITCH])
1665 printf("\n%s\n",parastart);
1673 if (*t!=CHAR_SQUOTE && *t!=CHAR_OPEN_SQUOTE ||
1674 pswit[QPARA_SWITCH] || squot)
1676 if (!pswit[OVERVIEW_SWITCH])
1678 if (pswit[ECHO_SWITCH])
1679 printf("\n%s\n",parastart);
1689 if (!pswit[OVERVIEW_SWITCH])
1691 if (pswit[ECHO_SWITCH])
1692 printf("\n%s\n",parastart);
1700 if (!pswit[OVERVIEW_SWITCH])
1702 if (pswit[ECHO_SWITCH])
1703 printf("\n%s\n",parastart);
1711 if (!pswit[OVERVIEW_SWITCH])
1713 if (pswit[ECHO_SWITCH])
1714 printf("\n%s\n",parastart);
1722 if (!pswit[OVERVIEW_SWITCH])
1724 if (pswit[ECHO_SWITCH])
1725 printf("\n%s\n",parastart);
1731 *dquote_err=*squote_err=*rbrack_err=*cbrack_err=
1732 *sbrack_err=*unders_err=0;
1733 isemptyline=analyse_quotes(aline,&counters);
1734 if (isnewpara && !isemptyline)
1736 /* This line is the start of a new paragraph. */
1737 start_para_line=linecnt;
1738 /* Capture its first line in case we want to report it later. */
1739 strncpy(parastart,aline,80);
1741 dquotepar=squotepar=0; /* restart the quote count */
1743 while (!gcisalpha(*s) && !gcisdigit(*s) && *s)
1745 if (*s>='a' && *s<='z')
1747 /* and its first letter is lowercase */
1748 if (pswit[ECHO_SWITCH])
1749 printf("\n%s\n",aline);
1750 if (!pswit[OVERVIEW_SWITCH])
1751 printf(" Line %ld column %d - "
1752 "Paragraph starts with lower-case\n",
1753 linecnt,(int)(s-aline)+1);
1757 isnewpara=0; /* Signal the end of new para processing. */
1759 /* Check for an em-dash broken at line end. */
1760 if (enddash && *aline=='-')
1762 if (pswit[ECHO_SWITCH])
1763 printf("\n%s\n",aline);
1764 if (!pswit[OVERVIEW_SWITCH])
1765 printf(" Line %ld column 1 - Broken em-dash?\n",linecnt);
1770 for (s=aline+strlen(aline)-1;*s==' ' && s>aline;s--)
1772 if (s>=aline && *s=='-')
1775 * Check for invalid or questionable characters in the line
1776 * Anything above 127 is invalid for plain ASCII, and
1777 * non-printable control characters should also be flagged.
1778 * Tabs should generally not be there.
1780 for (s=aline;*s;s++)
1782 i=(unsigned char)*s;
1783 if (i<CHAR_SPACE && i!=CHAR_LF && i!=CHAR_CR && i!=CHAR_TAB)
1785 if (pswit[ECHO_SWITCH])
1786 printf("\n%s\n",aline);
1787 if (!pswit[OVERVIEW_SWITCH])
1788 printf(" Line %ld column %d - Control character %d\n",
1789 linecnt,(int)(s-aline)+1,i);
/* Per-line checks that live in their own functions. */
1795 check_for_odd_characters(aline,warnings,isemptyline);
1796 if (warnings->longline)
1797 check_for_long_line(aline);
1798 if (warnings->shortline)
1799 check_for_short_line(aline,&last);
1801 last.len=strlen(aline);
1802 last.start=aline[0];
1803 check_for_starting_punctuation(aline);
1806 check_for_spaced_emdash(aline);
1807 check_for_spaced_dash(aline);
1809 check_for_unmarked_paragraphs(aline);
1810 check_for_jeebies(aline);
1811 check_for_mta_from(aline);
1812 check_for_orphan_character(aline);
1813 check_for_pling_scanno(aline);
1814 check_for_extra_period(aline,warnings);
1815 check_for_following_punctuation(aline);
1817 * Check for commonly mistyped words,
1818 * and digits like 0 for O in a word.
1823 s=getaword(s,inword);
1825 continue; /* don't bother with empty lines */
1826 if (mixdigit(inword))
1828 if (pswit[ECHO_SWITCH])
1829 printf("\n%s\n",aline);
1830 if (!pswit[OVERVIEW_SWITCH])
1831 printf(" Line %ld column %d - Query digit in %s\n",
1832 linecnt,(int)(wordstart-aline)+1,inword);
1837 * Put the word through a series of tests for likely typos and OCR
1840 if (pswit[TYPO_SWITCH])
1843 strcpy(testword,inword);
1845 for (i=0;i<(signed int)strlen(testword);i++)
1847 /* lowercase for testing */
1848 if (testword[i]>='a' && testword[i]<='z')
1850 if (alower && testword[i]>='A' && testword[i]<='Z')
1853 * We have an uppercase mid-word. However, there are
1855 * Mac and Mc like McGill
1856 * French contractions like l'Abbe
1858 if (i==2 && testword[0]=='m' && testword[1]=='c' ||
1859 i==3 && testword[0]=='m' && testword[1]=='a' &&
1860 testword[2]=='c' || i>0 && testword[i-1]==CHAR_SQUOTE)
1865 testword[i]=(char)tolower(testword[i]);
1868 * Check for certain unlikely two-letter combinations at word
1871 if (strlen(testword)>1)
1873 for (i=0;*nostart[i];i++)
1874 if (!strncmp(testword,nostart[i],2))
1876 for (i=0;*noend[i];i++)
1877 if (!strncmp(testword+strlen(testword)-2,noend[i],2))
1880 /* ght is common, gbt never. Like that. */
1881 if (strstr(testword,"cb"))
1883 if (strstr(testword,"gbt"))
1885 if (strstr(testword,"pbt"))
1887 if (strstr(testword,"tbs"))
1889 if (strstr(testword,"mrn"))
1891 if (strstr(testword,"ahle"))
1893 if (strstr(testword,"ihle"))
1896 * "TBE" does happen - like HEARTBEAT - but uncommon.
1897 * Also "TBI" - frostbite, outbid - but uncommon.
1898 * Similarly "ii" like Hawaii, or Pompeii, and in Roman
1899 * numerals, but "ii" is a common scanno.
1901 if (strstr(testword,"tbi"))
1903 if (strstr(testword,"tbe"))
1905 if (strstr(testword,"ii"))
1908 * Check for no vowels or no consonants.
1909 * If none, flag a typo.
1911 if (!istypo && strlen(testword)>1)
1914 for (i=0;testword[i];i++)
1916 if (testword[i]=='y' || gcisdigit(testword[i]))
1918 /* Yah, this is loose. */
1922 else if (strchr(vowels,testword[i]))
1927 if (!vowel || !consonant)
1931 * Now exclude the word from being reported if it's in
1934 for (i=0;*okword[i];i++)
1935 if (!strcmp(testword,okword[i]))
1938 * What looks like a typo may be a Roman numeral.
1941 if (istypo && isroman(testword))
1943 /* Check the manual list of typos. */
1945 for (i=0;*typo[i];i++)
1946 if (!strcmp(testword,typo[i]))
1949 * Check lowercase s, l, i and m - special cases.
1950 * "j" - often a semi-colon gone wrong.
1951 * "d" for a missing apostrophe - he d
1954 if (!istypo && strlen(testword)==1 && strchr("slmijdn",*inword))
1959 if (strlen(testword)<MAX_QWORD_LENGTH &&
1960 !pswit[VERBOSE_SWITCH])
1961 for (i=0;i<qword_index;i++)
1962 if (!strcmp(testword,qword[i]))
1969 if (qword_index<MAX_QWORD &&
1970 strlen(testword)<MAX_QWORD_LENGTH)
1972 strcpy(qword[qword_index],testword);
1975 if (pswit[ECHO_SWITCH])
1976 printf("\n%s\n",aline);
1977 if (!pswit[OVERVIEW_SWITCH])
1979 printf(" Line %ld column %d - Query word %s",
1980 linecnt,(int)(wordstart-aline)+1,inword);
1981 if (strlen(testword)<MAX_QWORD_LENGTH &&
1982 !pswit[VERBOSE_SWITCH])
1983 printf(" - not reporting duplicates");
1991 /* check the user's list of typos */
1992 if (!istypo && usertypo_count)
1993 for (i=0;i<usertypo_count;i++)
1994 if (!strcmp(testword,usertypo[i]))
1996 if (pswit[ECHO_SWITCH])
1997 printf("\n%s\n",aline);
1998 if (!pswit[OVERVIEW_SWITCH])
1999 printf(" Line %ld column %d - "
2000 "Query possible scanno %s\n",
2001 linecnt,(int)(wordstart-aline)+2,inword);
2003 if (pswit[PARANOID_SWITCH] && warnings->digit)
2005 /* In paranoid mode, query all 0 and 1 standing alone. */
2006 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
2008 if (pswit[ECHO_SWITCH])
2009 printf("\n%s\n",aline);
2010 if (!pswit[OVERVIEW_SWITCH])
2011 printf(" Line %ld column %d - Query standalone %s\n",
2012 linecnt,(int)(wordstart-aline)+2,inword);
2019 * Look for added or missing spaces around punctuation and quotes.
2020 * If there is a punctuation character like ! with no space on
2021 * either side, suspect a missing!space. If there are spaces on
2022 * both sides , assume a typo. If we see a double quote with no
2023 * space or punctuation on either side of it, assume unspaced
2024 * quotes "like"this.
2027 for (i=1;i<llen;i++)
2029 /* For each character in the line after the first. */
2030 if (strchr(".?!,;:_",aline[i])) /* if it's punctuation */
2032 /* we need to suppress warnings for acronyms like M.D. */
2034 /* we need to suppress warnings for ellipsis . . . */
2036 /* if there are letters on both sides of it or ... */
2037 if (gcisalpha(aline[i-1]) && gcisalpha(aline[i+1]) ||
2038 gcisalpha(aline[i+1]) && strchr("?!,;:",aline[i]))
2040 /* ...if it's strict punctuation followed by an alpha */
2043 if (i>2 && aline[i-2]=='.')
2045 if (i+2<llen && aline[i+2]=='.')
2050 if (pswit[ECHO_SWITCH])
2051 printf("\n%s\n",aline);
2052 if (!pswit[OVERVIEW_SWITCH])
2053 printf(" Line %ld column %d - Missing space?\n",
2059 if (aline[i-1]==CHAR_SPACE &&
2060 (aline[i+1]==CHAR_SPACE || aline[i+1]==0))
2063 * If there are spaces on both sides,
2064 * or space before and end of line.
2068 if (i>2 && aline[i-2]=='.')
2070 if (i+2<llen && aline[i+2]=='.')
2073 if (!isemptyline && !isellipsis)
2075 if (pswit[ECHO_SWITCH])
2076 printf("\n%s\n",aline);
2077 if (!pswit[OVERVIEW_SWITCH])
2078 printf(" Line %ld column %d - "
2079 "Spaced punctuation?\n",linecnt,i+1);
2086 /* Split out the characters that CANNOT be preceded by space. */
2088 for (i=1;i<llen;i++)
2090 /* for each character in the line after the first */
2091 if (strchr("?!,;:",aline[i]))
2093 /* if it's punctuation that _cannot_ have a space before it */
2094 if (aline[i-1]==CHAR_SPACE && !isemptyline &&
2095 aline[i+1]!=CHAR_SPACE)
2098 * If aline[i+1) DOES == space,
2099 * it was already reported just above.
2101 if (pswit[ECHO_SWITCH])
2102 printf("\n%s\n",aline);
2103 if (!pswit[OVERVIEW_SWITCH])
2104 printf(" Line %ld column %d - Spaced punctuation?\n",
2112 * Special case " .X" where X is any alpha.
2113 * This plugs a hole in the acronym code above.
2114 * Inelegant, but maintainable.
2117 for (i=1;i<llen;i++)
2119 /* for each character in the line after the first */
2122 /* if it's a period */
2123 if (aline[i-1]==CHAR_SPACE && gcisalpha(aline[i+1]))
2126 * If the period follows a space and
2127 * is followed by a letter.
2129 if (pswit[ECHO_SWITCH])
2130 printf("\n%s\n",aline);
2131 if (!pswit[OVERVIEW_SWITCH])
2132 printf(" Line %ld column %d - Spaced punctuation?\n",
2139 for (i=1;i<llen;i++)
2141 /* for each character in the line after the first */
2142 if (aline[i]==CHAR_DQUOTE)
2144 if (!strchr(" _-.'`,;:!/([{?}])",aline[i-1]) &&
2145 !strchr(" _-.'`,;:!/([{?}])",aline[i+1]) && aline[i+1] ||
2146 !strchr(" _-([{'`",aline[i-1]) && gcisalpha(aline[i+1]))
2148 if (pswit[ECHO_SWITCH])
2149 printf("\n%s\n",aline);
2150 if (!pswit[OVERVIEW_SWITCH])
2151 printf(" Line %ld column %d - Unspaced quotes?\n",
2158 /* Check parity of quotes. */
2159 for (s=aline;*s;s++)
2161 if (*s==CHAR_DQUOTE)
2163 if (!(dquotepar=!dquotepar))
2166 if (!strchr("_-.'`/,;:!?)]} ",s[1]))
2168 if (pswit[ECHO_SWITCH])
2169 printf("\n%s\n",aline);
2170 if (!pswit[OVERVIEW_SWITCH])
2171 printf(" Line %ld column %d - "
2172 "Wrongspaced quotes?\n",linecnt,(int)(s-aline)+1);
/* isdigit() needs an unsigned char value; plain char may be negative. */
2180 if (!gcisalpha(s[1]) && !isdigit((unsigned char)s[1]) &&
2181 !strchr("_-/.'`([{$",s[1]) || !s[1])
2183 if (pswit[ECHO_SWITCH])
2184 printf("\n%s\n",aline);
2185 if (!pswit[OVERVIEW_SWITCH])
2186 printf(" Line %ld column %d - "
2187 "Wrongspaced quotes?\n",linecnt,(int)(s-aline)+1);
2194 if (*aline==CHAR_DQUOTE)
2196 if (strchr(",;:!?)]} ",aline[1]))
2198 if (pswit[ECHO_SWITCH])
2199 printf("\n%s\n",aline);
2200 if (!pswit[OVERVIEW_SWITCH])
2201 printf(" Line %ld column 1 - Wrongspaced quotes?\n",
2207 if (pswit[SQUOTE_SWITCH])
2209 for (s=aline;*s;s++)
2211 if ((*s==CHAR_SQUOTE || *s==CHAR_OPEN_SQUOTE) &&
2212 (s==aline || s>aline && !gcisalpha(s[-1]) ||
2215 if (!(squotepar=!squotepar))
2218 if (!strchr("_-.'`/\",;:!?)]} ",s[1]))
2220 if (pswit[ECHO_SWITCH])
2221 printf("\n%s\n",aline);
2222 if (!pswit[OVERVIEW_SWITCH])
2223 printf(" Line %ld column %d - "
2224 "Wrongspaced singlequotes?\n",
2225 linecnt,(int)(s-aline)+1);
/* isdigit() needs an unsigned char value; plain char may be negative. */
2233 if (!gcisalpha(s[1]) && !isdigit((unsigned char)s[1]) &&
2234 !strchr("_-/\".'`",s[1]) || !s[1])
2236 if (pswit[ECHO_SWITCH])
2237 printf("\n%s\n",aline);
2238 if (!pswit[OVERVIEW_SWITCH])
2239 printf(" Line %ld column %d - "
2240 "Wrongspaced singlequotes?\n",
2241 linecnt,(int)(s-aline)+1);
2250 * Look for double punctuation like ,. or ,,
2251 * Thanks to DW for the suggestion!
2252 * In books with references, ".," and ".;" are common
2253 * e.g. "etc., etc.," and vol. 1.; vol 3.;
2254 * OTOH, from my initial tests, there are also fairly
2255 * common errors. What to do? Make these cases paranoid?
2256 * ".," is the most common, so warnings->dotcomma is used
2257 * to suppress detailed reporting if it occurs often.
2260 for (i=0;i<llen;i++)
2262 /* for each punctuation character in the line */
2263 if (strchr(".?!,;:",aline[i]) && (strchr(".?!,;:",aline[i+1])) &&
2264 aline[i] && aline[i+1])
2266 /* followed by punctuation, it's a query, unless . . . */
2267 if (aline[i]==aline[i+1] && (aline[i]=='.' || aline[i]=='?' ||
2269 !warnings->dotcomma && aline[i]=='.' && aline[i+1]==',' ||
2270 warnings->isFrench && !strncmp(aline+i,",...",4) ||
2271 warnings->isFrench && !strncmp(aline+i,"...,",4) ||
2272 warnings->isFrench && !strncmp(aline+i,";...",4) ||
2273 warnings->isFrench && !strncmp(aline+i,"...;",4) ||
2274 warnings->isFrench && !strncmp(aline+i,":...",4) ||
2275 warnings->isFrench && !strncmp(aline+i,"...:",4) ||
2276 warnings->isFrench && !strncmp(aline+i,"!...",4) ||
2277 warnings->isFrench && !strncmp(aline+i,"...!",4) ||
2278 warnings->isFrench && !strncmp(aline+i,"?...",4) ||
2279 warnings->isFrench && !strncmp(aline+i,"...?",4))
2281 if (warnings->isFrench && !strncmp(aline+i,",...",4) ||
2282 warnings->isFrench && !strncmp(aline+i,"...,",4) ||
2283 warnings->isFrench && !strncmp(aline+i,";...",4) ||
2284 warnings->isFrench && !strncmp(aline+i,"...;",4) ||
2285 warnings->isFrench && !strncmp(aline+i,":...",4) ||
2286 warnings->isFrench && !strncmp(aline+i,"...:",4) ||
2287 warnings->isFrench && !strncmp(aline+i,"!...",4) ||
2288 warnings->isFrench && !strncmp(aline+i,"...!",4) ||
2289 warnings->isFrench && !strncmp(aline+i,"?...",4) ||
2290 warnings->isFrench && !strncmp(aline+i,"...?",4))
2292 ; /* do nothing for .. !! and ?? which can be legit */
2296 if (pswit[ECHO_SWITCH])
2297 printf("\n%s\n",aline);
2298 if (!pswit[OVERVIEW_SWITCH])
2299 printf(" Line %ld column %d - Double punctuation?\n",
2307 while (strstr(s," \" "))
2309 if (pswit[ECHO_SWITCH])
2310 printf("\n%s\n",aline);
2311 if (!pswit[OVERVIEW_SWITCH])
2312 printf(" Line %ld column %d - Spaced doublequote?\n",
2313 linecnt,(int)(strstr(s," \" ")-aline+1));
2316 s=strstr(s," \" ")+2;
2319 while (strstr(s," ' "))
2321 if (pswit[ECHO_SWITCH])
2322 printf("\n%s\n",aline);
2323 if (!pswit[OVERVIEW_SWITCH])
2324 printf(" Line %ld column %d - Spaced singlequote?\n",
2325 linecnt,(int)(strstr(s," ' ")-aline+1));
2328 s=strstr(s," ' ")+2;
2331 while (strstr(s," ` "))
2333 if (pswit[ECHO_SWITCH])
2334 printf("\n%s\n",aline);
2335 if (!pswit[OVERVIEW_SWITCH])
2336 printf(" Line %ld column %d - Spaced singlequote?\n",
2337 linecnt,(int)(strstr(s," ` ")-aline+1));
2340 s=strstr(s," ` ")+2;
2342 /* check special case of 'S instead of 's at end of word */
2346 if (*s==CHAR_SQUOTE && s[1]=='S' && s[-1]>='a' && s[-1]<='z')
2348 if (pswit[ECHO_SWITCH])
2349 printf("\n%s\n",aline);
2350 if (!pswit[OVERVIEW_SWITCH])
2351 printf(" Line %ld column %d - Capital \"S\"?\n",
2352 linecnt,(int)(s-aline+2));
2359 * Now check special cases - start and end of line -
2360 * for single and double quotes. Start is sometimes [sic]
2361 * but better to query it anyway.
2362 * While we're here, check for dash at end of line.
2367 if (aline[llen-1]==CHAR_DQUOTE || aline[llen-1]==CHAR_SQUOTE ||
2368 aline[llen-1]==CHAR_OPEN_SQUOTE)
2369 if (aline[llen-2]==CHAR_SPACE)
2371 if (pswit[ECHO_SWITCH])
2372 printf("\n%s\n",aline);
2373 if (!pswit[OVERVIEW_SWITCH])
2374 printf(" Line %ld column %d - Spaced quote?\n",
2379 if ((aline[0]==CHAR_SQUOTE || aline[0]==CHAR_OPEN_SQUOTE) &&
2380 aline[1]==CHAR_SPACE)
2382 if (pswit[ECHO_SWITCH])
2383 printf("\n%s\n",aline);
2384 if (!pswit[OVERVIEW_SWITCH])
2385 printf(" Line %ld column 1 - Spaced quote?\n",linecnt);
2390 * Dash at end of line may well be legit - paranoid mode only
2391 * and don't report em-dash at line-end.
2393 if (pswit[PARANOID_SWITCH] && warnings->hyphen)
2395 for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--)
2397 if (aline[i]=='-' && aline[i-1]!='-')
2399 if (pswit[ECHO_SWITCH])
2400 printf("\n%s\n",aline);
2401 if (!pswit[OVERVIEW_SWITCH])
2402 printf(" Line %ld column %d - "
2403 "Hyphen at end of line?\n",linecnt,i);
2408 * Brackets are often unspaced, but shouldn't be surrounded by alpha.
2409 * If so, suspect a scanno like "a]most".
2412 for (i=1;i<llen-1;i++)
2414 /* for each bracket character in the line except 1st & last */
2415 if (strchr("{[()]}",aline[i]) && gcisalpha(aline[i-1]) &&
2416 gcisalpha(aline[i+1]))
2418 if (pswit[ECHO_SWITCH])
2419 printf("\n%s\n",aline);
2420 if (!pswit[OVERVIEW_SWITCH])
2421 printf(" Line %ld column %d - Unspaced bracket?\n",
2428 if (warnings->endquote)
2430 for (i=1;i<llen;i++)
2432 /* for each character in the line except 1st */
2433 if (aline[i]==CHAR_DQUOTE && isalpha((unsigned char)aline[i-1]))
2435 if (pswit[ECHO_SWITCH])
2436 printf("\n%s\n",aline);
2437 if (!pswit[OVERVIEW_SWITCH])
2438 printf(" Line %ld column %d - "
2439 "endquote missing punctuation?\n",linecnt,i);
2446 * Check for <HTML TAG>.
2447 * If there is a < in the line, followed at some point
2448 * by a > then we suspect HTML.
2450 if (strstr(aline,"<") && strstr(aline,">"))
2452 i=(signed int)(strstr(aline,">")-strstr(aline,"<")+1);
2455 strncpy(wrk,strstr(aline,"<"),i);
2457 if (pswit[ECHO_SWITCH])
2458 printf("\n%s\n",aline);
2459 if (!pswit[OVERVIEW_SWITCH])
2460 printf(" Line %ld column %d - HTML Tag? %s \n",
2461 linecnt,(int)(strstr(aline,"<")-aline)+1,wrk);
2467 * Check for &symbol; HTML.
2468 * If there is a & in the line, followed at
2469 * some point by a ; then we suspect HTML.
2471 if (strstr(aline,"&") && strstr(aline,";"))
2473 i=(int)(strstr(aline,";")-strstr(aline,"&")+1);
2474 for (s=strstr(aline,"&");s<strstr(aline,";");s++)
2476 i=0; /* Don't report "Jones & Son;" */
2479 strncpy(wrk,strstr(aline,"&"),i);
2481 if (pswit[ECHO_SWITCH])
2482 printf("\n%s\n",aline);
2483 if (!pswit[OVERVIEW_SWITCH])
2484 printf(" Line %ld column %d - HTML symbol? %s \n",
2485 linecnt,(int)(strstr(aline,"&")-aline)+1,wrk);
2491 * At end of paragraph, check for mismatched quotes.
2492 * We don't want to report an error immediately, since it is a
2493 * common convention to omit the quotes at end of paragraph if
2494 * the next paragraph is a continuation of the same speaker.
2495 * Where this is the case, the next para should begin with a
2496 * quote, so we store the warning message and only display it
2497 * at the top of the next iteration if the new para doesn't
2498 * start with a quote.
2499 * The -p switch overrides this default, and warns of unclosed
2500 * quotes on _every_ paragraph, whether the next begins with a
2505 /* end of para - add up the totals */
2506 if (counters.quot%2)
2507 sprintf(dquote_err," Line %ld - Mismatched quotes\n",
2509 if (pswit[SQUOTE_SWITCH] && counters.open_single_quote &&
2510 counters.open_single_quote!=counters.close_single_quote)
2511 sprintf(squote_err," Line %ld - Mismatched singlequotes?\n",
2513 if (pswit[SQUOTE_SWITCH] && counters.open_single_quote &&
2514 counters.open_single_quote!=counters.close_single_quote &&
2515 counters.open_single_quote!=counters.close_single_quote+1)
2517 * Flag it to be noted regardless of the
2518 * first char of the next para.
2521 if (counters.r_brack)
2522 sprintf(rbrack_err," Line %ld - "
2523 "Mismatched round brackets?\n",linecnt);
2524 if (counters.s_brack)
2525 sprintf(sbrack_err," Line %ld - "
2526 "Mismatched square brackets?\n",linecnt);
2527 if (counters.c_brack)
2528 sprintf(cbrack_err," Line %ld - "
2529 "Mismatched curly brackets?\n",linecnt);
2530 if (counters.c_unders%2)
2531 sprintf(unders_err," Line %ld - Mismatched underscores?\n",
2533 memset(&counters,0,sizeof(counters));
2534 /* let the next iteration know that it's starting a new para */
2538 * Check for omitted punctuation at end of paragraph by working back
2539 * through prevline. DW.
2540 * Need to check this only for "normal" paras.
2541 * So what is a "normal" para?
2542 * Not normal if one-liner (chapter headings, etc.)
2543 * Not normal if doesn't contain at least one locase letter
2544 * Not normal if starts with space
2549 for (s=prevline,i=0;*s && !i;s++)
2551 /* use i to indicate the presence of a letter on the line */
2554 * This next "if" is a problem.
2555 * If we say "start_para_line <= linecnt - 1", that includes
2556 * one-line "paragraphs" like chapter heads. Lotsa false positives.
2557 * If we say "start_para_line < linecnt - 1" it doesn't, but then it
2558 * misses genuine one-line paragraphs.
2560 if (i && last.blen>2 && start_para_line<linecnt-1 &&
2561 *prevline>CHAR_SPACE)
2563 for (i=strlen(prevline)-1;
2564 (prevline[i]==CHAR_DQUOTE || prevline[i]==CHAR_SQUOTE) &&
2565 prevline[i]>CHAR_SPACE && i>0;
2570 if (gcisalpha(prevline[i]))
2572 if (pswit[ECHO_SWITCH])
2573 printf("\n%s\n",prevline);
2574 if (!pswit[OVERVIEW_SWITCH])
/* strlen() yields size_t; convert it for the %d conversion. */
2575 printf(" Line %ld column %d - "
2576 "No punctuation at para end?\n",
2577 linecnt-1,(int)strlen(prevline));
2582 if (strchr("-.:!([{?}])",prevline[i]))
2587 strcpy(prevline,aline);
2590 if (!pswit[OVERVIEW_SWITCH])
2591 for (i=0;i<MAX_QWORD;i++)
2593 printf("\nNote: Queried word %s was duplicated %d time%s\n",
2594 qword[i],dupcnt[i],"s");
2600 * Get one line from the input stream, checking for
2601 * the existence of exactly one CR/LF line-end per line.
2603 * Returns: a pointer to the line.
/*
 * flgets: read one line from thefile into theline, a character at a
 * time with fgetc(), reporting line-end problems on the way.
 *
 *   theline - buffer that receives the line
 *   maxlen  - bound on the read loop, which runs while len<maxlen
 *   thefile - input stream
 *   lcnt    - line number quoted in the diagnostics
 *
 * NOTE(review): this listing omits several statements of the function,
 * including its return statements and the code that stores each
 * character, so the comments below describe only the visible lines.
 * NOTE(review): because the loop bound is len<maxlen, confirm that the
 * caller passes a maxlen smaller than the size of theline if a
 * terminator is written after the last stored character.
 */
2605 char *flgets(char *theline,int maxlen,FILE *thefile,long lcnt)
/* Prime the loop; the fgetc() result is kept in both cint and c. */
2611 c=cint=fgetc(thefile);
2616 /* either way, it's end of line */
2623 /* Error - a LF without a preceding CR */
/*
 * Line-end diagnostics are tested against LINE_END_SWITCH. Each one
 * echoes the line when ECHO_SWITCH is set and prints its message
 * unless OVERVIEW_SWITCH is set.
 */
2624 if (pswit[LINE_END_SWITCH])
2626 if (pswit[ECHO_SWITCH])
2627 printf("\n%s\n",theline);
2628 if (!pswit[OVERVIEW_SWITCH])
2629 printf(" Line %ld - No CR?\n",lcnt);
2640 /* Error - two successive CRs */
2641 if (pswit[LINE_END_SWITCH])
2643 if (pswit[ECHO_SWITCH])
2644 printf("\n%s\n",theline);
2645 if (!pswit[OVERVIEW_SWITCH])
2646 printf(" Line %ld - Two successive CRs?\n",lcnt);
/* An ordinary character arriving while isCR is still set. */
2655 if (pswit[LINE_END_SWITCH] && isCR)
2657 if (pswit[ECHO_SWITCH])
2658 printf("\n%s\n",theline);
2659 if (!pswit[OVERVIEW_SWITCH])
2660 printf(" Line %ld column %d - CR without LF?\n",
/* Fetch the next character and go round again while there is room. */
2670 c=cint=fgetc(thefile);
2671 } while(len<maxlen);
/* Optional clean-up of the finished line: -m (markup) then -d (DP). */
2672 if (pswit[MARKUP_SWITCH])
2673 postprocess_for_HTML(theline);
2674 if (pswit[DP_SWITCH])
2675 postprocess_for_DP(theline);
/*
 * mixdigit:
 *
 * Takes a "word" as a parameter, and checks whether it contains a
 * mixture of alpha and digits. Generally, this is an error, but may
 * not be for cases like 4th or L5 12s. 3d.
 *
 * checkword is the NUL-terminated word to examine; it is only read.
 *
 * Returns: 0 if no error found, 1 if error.
 */
int mixdigit(char *checkword)
{
    int wehaveadigit,wehavealetter,firstdigits,query,wl;
    char *s;
    wehaveadigit=wehavealetter=query=0;
    for (s=checkword;*s;s++)
    {
        if (gcisalpha(*s))
            wehavealetter=1;
        else if (gcisdigit(*s))
            wehaveadigit=1;
    }
    if (wehaveadigit && wehavealetter)
    {
        /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
        query=1;
        wl=(int)strlen(checkword);
        /* firstdigits becomes the length of the leading run of digits */
        for (firstdigits=0;gcisdigit(checkword[firstdigits]);firstdigits++)
            ;
        /* digits, ending in st, rd, nd, th of either case */
        if (firstdigits+2==wl && (matchword(checkword+wl-2,"st") ||
          matchword(checkword+wl-2,"rd") || matchword(checkword+wl-2,"nd") ||
          matchword(checkword+wl-2,"th")))
            query=0;
        /* the same, in the plural: 20ths, 3rds */
        if (firstdigits+3==wl && (matchword(checkword+wl-3,"sts") ||
          matchword(checkword+wl-3,"rds") || matchword(checkword+wl-3,"nds") ||
          matchword(checkword+wl-3,"ths")))
            query=0;
        /*
         * The adverbs: 1stly, 2ndly. These suffixes are four characters
         * long, so exactly four characters must follow the digits.
         * Requiring three made the comparison start inside the digits
         * (it could never match) and, for a word with no leading digits
         * such as "ab1", start one byte before the word.
         */
        if (firstdigits+4==wl && (matchword(checkword+wl-4,"stly") ||
          matchword(checkword+wl-4,"rdly") ||
          matchword(checkword+wl-4,"ndly") || matchword(checkword+wl-4,"thly")))
            query=0;
        /* digits, ending in l, L, s or d */
        if (firstdigits+1==wl && (checkword[wl-1]=='l' ||
          checkword[wl-1]=='L' || checkword[wl-1]=='s' || checkword[wl-1]=='d'))
            query=0;
        /*
         * L at the start of a number, representing British pounds, like L500.
         * This is cute. We know the current word is mixeddigit. If the first
         * letter is L, there must be at least one digit following. If both
         * digits and letters follow, we have a genuine error, else we have a
         * capital L followed by digits, and we accept that as a non-error.
         */
        if (checkword[0]=='L' && !mixdigit(checkword+1))
            query=0;
    }
    return query;
}
2739 * Extracts the first/next "word" from the line, and puts
2740 * it into "thisword". A word is defined as one English word unit--or
2741 * at least that's the aim.
2743 * Returns: a pointer to the position in the line where we will start
2744 * looking for the next word.
2746 const char *getaword(const char *fromline,char *thisword)
2751 for (;!gcisdigit(*fromline) && !gcisalpha(*fromline) && *fromline;
2755 * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
2756 * Especially yucky is the case of L1,000
2757 * This section looks for a pattern of characters including a digit
2758 * followed by a comma or period followed by one or more digits.
2759 * If found, it returns this whole pattern as a word; otherwise we discard
2760 * the results and resume our normal programming.
2763 for (;(gcisdigit(*s) || gcisalpha(*s) || *s==',' || *s=='.') &&
2764 wordlen<MAXWORDLEN;s++)
2766 thisword[wordlen]=*s;
2769 thisword[wordlen]=0;
2770 for (i=1;i<wordlen-1;i++)
2772 if (thisword[i]=='.' || thisword[i]==',')
2774 if (gcisdigit(thisword[i-1]) && gcisdigit(thisword[i-1]))
2781 /* we didn't find a punctuated number - do the regular getword thing */
2783 for (;(gcisdigit(*fromline) || gcisalpha(*fromline) || *fromline=='\'') &&
2784 wordlen<MAXWORDLEN;fromline++)
2786 thisword[wordlen]=*fromline;
2789 thisword[wordlen]=0;
/*
 * matchword:
 *
 * A case-insensitive string matcher.
 *
 * checkfor and thisword are the two NUL-terminated strings to compare;
 * neither is modified.
 *
 * Returns: 1 if the strings have the same length and are equal when
 * case is ignored, 0 otherwise. (Note that this is NOT the strcmp()
 * convention.)
 */
int matchword(char *checkfor,char *thisword)
{
    size_t i,len;
    len=strlen(checkfor);
    if (len!=strlen(thisword))
        return 0;
    for (i=0;i<len;i++)
    {
        /*
         * toupper() is only defined for values in the unsigned char
         * range (or EOF); 8-bit text would otherwise pass negative
         * values on platforms where char is signed.
         */
        if (toupper((unsigned char)checkfor[i])!=
          toupper((unsigned char)thisword[i]))
            return 0; /* first difference settles it */
    }
    return 1;
}
2813 * Lowercase the line.
/*
 * lowerit: walk theline, in place, up to its terminating NUL.
 * Only the ASCII range 'A'..'Z' is tested, so digits, punctuation and
 * 8-bit accented capitals never satisfy the test.
 * NOTE(review): the statement applied to a matching character is not
 * visible in this listing; presumably it converts it to lower case.
 */
2816 void lowerit(char *theline)
2818 for (;*theline;theline++)
2819 if (*theline>='A' && *theline<='Z')
2826 * Is this word a Roman Numeral?
2828 * It doesn't actually validate that the number is a valid Roman Numeral--for
2829 * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
2830 * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
2831 * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
2832 * expressions thereof, except when it came to taxes. Allow any number of M,
2833 * an optional D, an optional CM or CD, any number of optional Cs, an optional
2834 * XL or an optional XC, an optional IX or IV, an optional V and any number
/*
 * isroman: t is the candidate word. Every visible test compares against
 * a lower-case letter, so an upper-case numeral will not match here;
 * presumably the caller lower-cases the word first - confirm.
 * The two-letter tests read t[1] only after *t has matched a letter,
 * so they never look past the terminating NUL.
 * NOTE(review): the statements that follow each test, the tests for
 * the single letters, and the return statements are not visible in
 * this listing. Presumably each test steps t past what it matched and
 * the result says whether the whole word was consumed.
 */
2837 int isroman(char *t)
/* thousands; "&& *t" is redundant, as *t=='m' already rules out NUL */
2843 while (*t=='m' && *t)
/* cm (900) or cd (400) */
2847 if (*t=='c' && t[1]=='m')
2849 if (*t=='c' && t[1]=='d')
/* hundreds */
2851 while (*t=='c' && *t)
/* xl (40) or xc (90) */
2853 if (*t=='x' && t[1]=='l')
2855 if (*t=='x' && t[1]=='c')
/* tens */
2859 while (*t=='x' && *t)
/* ix (9) or iv (4) */
2861 if (*t=='i' && t[1]=='x')
2863 if (*t=='i' && t[1]=='v')
/* units */
2867 while (*t=='i' && *t)
2875 * A version of isalpha() that is somewhat lenient on 8-bit texts.
2876 * If we use the standard function, 8-bit accented characters break
2877 * words, so that tete with accented characters appears to be two words, "t"
2878 * and "t", with 8-bit characters between them. This causes over-reporting of
2879 * errors. gcisalpha() recognizes accented letters from the CP1252 (Windows)
2880 * and ISO-8859-1 character sets, which are the most common PG 8-bit types.
/*
 * gcisalpha: c is one byte of text, taken as unsigned so that 8-bit
 * codes compare as 128..255.
 * NOTE(review): the statements guarded by these tests are not visible
 * in this listing; presumably each returns non-zero and the function
 * returns zero when none of them fires.
 */
2882 int gcisalpha(unsigned char c)
/* unaccented ASCII letters */
2884 if (c>='a' && c<='z')
2886 if (c>='A' && c<='Z')
/*
 * 192..255 except six codes: 215 and 247 are the multiplication and
 * division signs in ISO-8859-1/CP1252; 208, 222, 240 and 254 are
 * Eth and Thorn in upper and lower case.
 */
2890 if (c>=192 && c!=208 && c!=215 && c!=222 && c!=240 && c!=247 && c!=254)
/* CP1252 only: OE, Z-caron, oe, z-caron and Y-diaeresis */
2892 if (c==140 || c==142 || c==156 || c==158 || c==159)
/*
 * gcisdigit:
 *
 * Digit test for 8-bit texts: only the ten ASCII characters '0' to '9'
 * count, so no 8-bit code is ever taken for a digit.
 *
 * Returns: 1 if c is an ASCII decimal digit, 0 otherwise.
 */
int gcisdigit(unsigned char c)
{
    if (c<'0')
        return 0;
    return c<='9';
}
/*
 * gcisletter:
 *
 * Letter test for 8-bit texts.
 * NB: this is ISO-8859-1-specific: every code from 192 upwards is
 * accepted as a letter, as are the unaccented ASCII letters.
 *
 * Returns: 1 if c counts as a letter, 0 otherwise.
 */
int gcisletter(unsigned char c)
{
    if (c>=192)
        return 1;
    if (c>='a' && c<='z')
        return 1;
    return c>='A' && c<='Z';
}
2921 * Wraps strchr to return NULL if the character being searched for is zero.
/*
 * gcstrchr: s is the string to search and c the character to look for.
 * NOTE(review): only the signature survives in this listing. Going by
 * the description above it, a zero c yields NULL (plain strchr() would
 * return a pointer to the terminator) and any other c is passed on to
 * strchr() - confirm against the body.
 */
2923 char *gcstrchr(char *s,char c)
2931 * postprocess_for_DP:
2933 * Invoked with the -d switch from flgets().
2934 * It simply "removes" from the line a hard-coded set of common
2935 * DP-specific tags, so that the line passed to the main routine has
2936 * been pre-cleaned of DP markup.
/*
 * postprocess_for_DP: theline is the line to clean.
 * NOTE(review): the statements that actually remove a tag are not
 * visible in this listing.
 */
2938 void postprocess_for_DP(char *theline)
/* DPmarkup[] is walked until an entry that is the empty string. */
2944 for (i=0;*DPmarkup[i];i++)
/* s: first occurrence of this tag in the line */
2946 s=strstr(theline,DPmarkup[i]);
/* t: the first character after the tag */
2949 t=s+strlen(DPmarkup[i]);
/* look for the same tag again, presumably to catch repeats - confirm */
2957 s=strstr(theline,DPmarkup[i]);
2963 * postprocess_for_HTML:
2965 * Invoked with the -m switch from flgets().
2966 * It simply "removes" from the line a hard-coded set of common
2967 * HTML tags and "replaces" a hard-coded set of common HTML
2968 * entities, so that the line passed to the main routine has
2969 * been pre-cleaned of HTML.
/*
 * postprocess_for_HTML: theline is the line to clean.
 * NOTE(review): the loop bodies are not visible in this listing;
 * presumably they are empty.
 */
2971 void postprocess_for_HTML(char *theline)
/* Without both a '<' and a '>' there can be no tag on the line. */
2973 if (strstr(theline,"<") && strstr(theline,">"))
/* keep calling until losemarkup() returns a null pointer */
2974 while (losemarkup(theline))
/* likewise for the entities */
2976 while (loseentities(theline))
/*
 * losemarkup: theline is the line to examine. postprocess_for_HTML()
 * calls this repeatedly for as long as it returns non-null.
 * NOTE(review): the statements that remove a tag and the return
 * statements are not visible in this listing.
 */
2980 char *losemarkup(char *theline)
/* s: first '<' on the line; t: first '>' on the line */
2986 s=strstr(theline,"<");
2987 t=strstr(theline,">");
/* markup[] is walked until an entry that is the empty string */
2990 for (i=0;*markup[i];i++)
/* s+1 skips the '<'; a zero result from tagcomp() is the hit */
2991 if (!tagcomp(s+1,markup[i]))
3004 /* It's an unrecognized <xxx>. */
/*
 * loseentities: theline is the line to examine. postprocess_for_HTML()
 * calls this repeatedly for as long as it returns non-null.
 * The entities[] table is searched twice, first by each entry's named
 * form (htmlent) and then by its numeric form (htmlnum); in both cases
 * the entity is overwritten with the entry's textent.
 * NOTE(review): this listing does not show whether the malloc() result
 * is checked, how the saved tail is put back, where t is freed, or the
 * return statements.
 * NOTE(review): the in-place strcpy() of textent stays within the line
 * only if textent is no longer than the entity it replaces - confirm
 * against the table.
 */
3008 char *loseentities(char *theline)
/* named entities; the table ends at an entry with an empty htmlent */
3014 for (i=0;*entities[i].htmlent;i++)
3016 s=strstr(theline,entities[i].htmlent);
/*
 * t saves the text after the entity. strlen(s) bytes are enough: the
 * tail plus its NUL needs strlen(s)-strlen(htmlent)+1 bytes, and the
 * loop condition guarantees htmlent is not empty.
 */
3019 t=malloc((size_t)strlen(s));
3022 strcpy(t,s+strlen(entities[i].htmlent));
/* overwrite the entity with its plain-text replacement */
3023 strcpy(s,entities[i].textent);
/* numeric entities; same procedure, keyed on htmlnum */
3029 for (i=0;*entities[i].htmlnum;i++)
3031 s=strstr(theline,entities[i].htmlnum);
3034 t=malloc((size_t)strlen(s));
3037 strcpy(t,s+strlen(entities[i].htmlnum));
3038 strcpy(s,entities[i].textent);
/*
 * tagcomp: compare the text at strin with the tag name basetag,
 * ignoring case (both sides go through tolower()).
 * losemarkup() acts on a zero return, so zero appears to mean "match".
 * NOTE(review): the loop around the comparison, the assignments to s
 * and t, and the return statements are not visible in this listing.
 * NOTE(review): tolower() is handed plain char values; its argument
 * must be in the unsigned char range, so 8-bit bytes need a cast to
 * unsigned char on platforms where char is signed.
 */
3047 int tagcomp(char *strin,char *basetag)
3053 t++; /* ignore a slash */
3056 if (tolower(*s)!=tolower(*t))
3066 fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
3067 fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
3068 fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
3069 fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
3070 "For details, read the file COPYING.\n",stderr);
3071 fputs("This is Free Software; "
3072 "you may redistribute it under certain conditions (GPL);\n",stderr);
3073 fputs("read the file COPYING for details.\n\n",stderr);
3074 fputs("Usage is: bookloupe [-setpxloyhud] filename\n",stderr);
3075 fputs(" where -s checks single quotes, -e suppresses echoing lines, "
3076 "-t checks typos\n",stderr);
3077 fputs(" -x (paranoid) switches OFF -t and extra checks, "
3078 "-l turns OFF line-end checks\n",stderr);
3079 fputs(" -o just displays overview without detail, "
3080 "-h echoes header fields\n",stderr);
3081 fputs(" -v (verbose) unsuppresses duplicate reporting, "
3082 "-m suppresses markup\n",stderr);
3083 fputs(" -d ignores DP-specific markup,\n",stderr);
3084 fputs(" -u uses a file gutcheck.typ to query user-defined "
3085 "possible typos\n",stderr);
3086 fputs("Sample usage: bookloupe warpeace.txt \n",stderr);
3088 fputs("Bookloupe looks for errors in Project Gutenberg(TM) etexts.\n",
3090 fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
3091 "non-ASCII\n",stderr);
3092 fputs("characters like accented letters, "
3093 "lines longer than 75 or shorter than 55,\n",stderr);
3094 fputs("unbalanced quotes or brackets, "
3095 "a variety of badly formatted punctuation, \n",stderr);
3096 fputs("HTML tags, some likely typos. "
3097 "It is NOT a substitute for human judgement.\n",stderr);