1 /*************************************************************************/
2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com> */
5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk> */
7 /* This program is free software; you can redistribute it and/or modify */
8 /* it under the terms of the GNU General Public License as published by */
9 /* the Free Software Foundation; either version 2 of the License, or */
10 /* (at your option) any later version. */
12 /* This program is distributed in the hope that it will be useful, */
13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
15 /* GNU General Public License for more details. */
17 /* You should have received a copy of the GNU General Public License */
18 /* along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 /*************************************************************************/
26 #define MAXWORDLEN 80 /* max length of one word */
27 #define LINEBUFSIZE 2048 /* buffer size for an input line */
29 #define MAX_USER_TYPOS 1000
30 #define USERTYPO_FILE "gutcheck.typ"
33 #define MAX_PATH 16384
36 char aline[LINEBUFSIZE];
37 char prevline[LINEBUFSIZE];
41 "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
42 "nad", "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa",
43 "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
44 "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
45 "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
46 "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
47 "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
48 "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
49 "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
50 "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
51 "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
52 "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
53 "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
54 "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
55 "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
56 "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
57 "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
58 "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
59 "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
60 "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
61 "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
62 "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
63 "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
64 "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
65 "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
66 "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
67 "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
68 "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
69 "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
73 char *usertypo[MAX_USER_TYPOS];
75 /* Common abbreviations and other OK words not to query as typos. */
77 "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
78 "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
79 "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
80 "outbid", "outbids", "frostbite", "frostbitten", ""
83 /* Common abbreviations that cause otherwise unexplained periods. */
85 "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
86 "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
90 * Two-Letter combinations that rarely if ever start words,
91 * but are common scannos or otherwise common letter combinations.
94 "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
98 * Two-Letter combinations that rarely if ever end words,
99 * but are common scannos or otherwise common letter combinations.
102 "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
103 "sw", "gr", "sl", "cl", "iy", ""
107 "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
108 "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
109 "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
110 "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
114 "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
118 "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
119 "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
120 "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
121 "during", "let", "toward", "among", ""
125 "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
126 "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
127 "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
128 "among", "those", "into", "whom", "having", "thence", ""
131 char vowels[] = "aeiouàáâãäæèéêëìíîïòóôõöùúûü";
138 "&", "&", "&",
139 "<", "<", "<",
140 ">", ">", ">",
141 "°", "°", " degrees",
142 "£", "£", "L",
143 """, """, "\"", /* quotation mark = APL quote */
144 "Œ", "Œ", "OE", /* latin capital ligature OE */
145 "œ", "œ", "oe", /* latin small ligature oe */
146 "Š", "Š", "S", /* latin capital letter S with caron */
147 "š", "š", "s", /* latin small letter s with caron */
148 "Ÿ", "Ÿ", "Y", /* latin capital letter Y with diaeresis */
149 "ˆ", "ˆ", "", /* modifier letter circumflex accent */
150 "˜", "˜", "~", /* small tilde, U+02DC ISOdia */
151 " ", " ", " ", /* en space, U+2002 ISOpub */
152 " ", " ", " ", /* em space, U+2003 ISOpub */
153 " ", " ", " ", /* thin space, U+2009 ISOpub */
154 "–", "–", "-", /* en dash, U+2013 ISOpub */
155 "—", "—", "--", /* em dash, U+2014 ISOpub */
156 "’", "’", "'", /* right single quotation mark */
157 "‚", "‚", "'", /* single low-9 quotation mark */
158 "“", "“", "\"", /* left double quotation mark */
159 "”", "”", "\"", /* right double quotation mark */
160 "„", "„", "\"", /* double low-9 quotation mark */
161 "‹", "‹", "\"", /* single left-pointing angle quotation mark */
162 "›", "›", "\"", /* single right-pointing angle quotation mark */
163 " ", " ", " ", /* no-break space = non-breaking space, */
164 "¡", "¡", "!", /* inverted exclamation mark */
165 "¢", "¢", "c", /* cent sign */
166 "£", "£", "L", /* pound sign */
167 "¤", "¤", "$", /* currency sign */
168 "¥", "¥", "Y", /* yen sign = yuan sign */
169 "§", "§", "--", /* section sign */
170 "¨", "¨", " ", /* diaeresis = spacing diaeresis */
171 "©", "©", "(C) ", /* copyright sign */
172 "ª", "ª", " ", /* feminine ordinal indicator */
173 "«", "«", "\"", /* left-pointing double angle quotation mark */
174 "­", "­", "-", /* soft hyphen = discretionary hyphen */
175 "®", "®", "(R) ", /* registered sign = registered trade mark sign */
176 "¯", "¯", " ", /* macron = spacing macron = overline */
177 "°", "°", " degrees", /* degree sign */
178 "±", "±", "+-", /* plus-minus sign = plus-or-minus sign */
179 "²", "²", "2", /* superscript two = superscript digit two */
180 "³", "³", "3", /* superscript three = superscript digit three */
181 "´", "´", " ", /* acute accent = spacing acute */
182 "µ", "µ", "m", /* micro sign */
183 "¶", "¶", "--", /* pilcrow sign = paragraph sign */
184 "¸", "¸", " ", /* cedilla = spacing cedilla */
185 "¹", "¹", "1", /* superscript one = superscript digit one */
186 "º", "º", " ", /* masculine ordinal indicator */
187 "»", "»", "\"", /* right-pointing double angle quotation mark */
188 "¼", "¼", "1/4", /* vulgar fraction one quarter */
189 "½", "½", "1/2", /* vulgar fraction one half */
190 "¾", "¾", "3/4", /* vulgar fraction three quarters */
191 "¿", "¿", "?", /* inverted question mark */
192 "À", "À", "A", /* latin capital letter A with grave */
193 "Á", "Á", "A", /* latin capital letter A with acute */
194 "Â", "Â", "A", /* latin capital letter A with circumflex */
195 "Ã", "Ã", "A", /* latin capital letter A with tilde */
196 "Ä", "Ä", "A", /* latin capital letter A with diaeresis */
197 "Å", "Å", "A", /* latin capital letter A with ring above */
198 "Æ", "Æ", "AE", /* latin capital letter AE */
199 "Ç", "Ç", "C", /* latin capital letter C with cedilla */
200 "È", "È", "E", /* latin capital letter E with grave */
201 "É", "É", "E", /* latin capital letter E with acute */
202 "Ê", "Ê", "E", /* latin capital letter E with circumflex */
203 "Ë", "Ë", "E", /* latin capital letter E with diaeresis */
204 "Ì", "Ì", "I", /* latin capital letter I with grave */
205 "Í", "Í", "I", /* latin capital letter I with acute */
206 "Î", "Î", "I", /* latin capital letter I with circumflex */
207 "Ï", "Ï", "I", /* latin capital letter I with diaeresis */
208 "Ð", "Ð", "E", /* latin capital letter ETH */
209 "Ñ", "Ñ", "N", /* latin capital letter N with tilde */
210 "Ò", "Ò", "O", /* latin capital letter O with grave */
211 "Ó", "Ó", "O", /* latin capital letter O with acute */
212 "Ô", "Ô", "O", /* latin capital letter O with circumflex */
213 "Õ", "Õ", "O", /* latin capital letter O with tilde */
214 "Ö", "Ö", "O", /* latin capital letter O with diaeresis */
215 "×", "×", "*", /* multiplication sign */
216 "Ø", "Ø", "O", /* latin capital letter O with stroke */
217 "Ù", "Ù", "U", /* latin capital letter U with grave */
218 "Ú", "Ú", "U", /* latin capital letter U with acute */
219 "Û", "Û", "U", /* latin capital letter U with circumflex */
220 "Ü", "Ü", "U", /* latin capital letter U with diaeresis */
221 "Ý", "Ý", "Y", /* latin capital letter Y with acute */
222 "Þ", "Þ", "TH", /* latin capital letter THORN */
223 "ß", "ß", "sz", /* latin small letter sharp s = ess-zed */
224 "à", "à", "a", /* latin small letter a with grave */
225 "á", "á", "a", /* latin small letter a with acute */
226 "â", "â", "a", /* latin small letter a with circumflex */
227 "ã", "ã", "a", /* latin small letter a with tilde */
228 "ä", "ä", "a", /* latin small letter a with diaeresis */
229 "å", "å", "a", /* latin small letter a with ring above */
230 "æ", "æ", "ae", /* latin small letter ae */
231 "ç", "ç", "c", /* latin small letter c with cedilla */
232 "è", "è", "e", /* latin small letter e with grave */
233 "é", "é", "e", /* latin small letter e with acute */
234 "ê", "ê", "e", /* latin small letter e with circumflex */
235 "ë", "ë", "e", /* latin small letter e with diaeresis */
236 "ì", "ì", "i", /* latin small letter i with grave */
237 "í", "í", "i", /* latin small letter i with acute */
238 "î", "î", "i", /* latin small letter i with circumflex */
239 "ï", "ï", "i", /* latin small letter i with diaeresis */
240 "ð", "ð", "eth", /* latin small letter eth */
241 "ñ", "ñ", "n", /* latin small letter n with tilde */
242 "ò", "ò", "o", /* latin small letter o with grave */
243 "ó", "ó", "o", /* latin small letter o with acute */
244 "ô", "ô", "o", /* latin small letter o with circumflex */
245 "õ", "õ", "o", /* latin small letter o with tilde */
246 "ö", "ö", "o", /* latin small letter o with diaeresis */
247 "÷", "÷", "/", /* division sign */
248 "ø", "ø", "o", /* latin small letter o with stroke */
249 "ù", "ù", "u", /* latin small letter u with grave */
250 "ú", "ú", "u", /* latin small letter u with acute */
251 "û", "û", "u", /* latin small letter u with circumflex */
252 "ü", "ü", "u", /* latin small letter u with diaeresis */
253 "ý", "ý", "y", /* latin small letter y with acute */
254 "þ", "þ", "th", /* latin small letter thorn */
255 "ÿ", "ÿ", "y", /* latin small letter y with diaeresis */
259 /* special characters */
260 #define CHAR_SPACE 32
264 #define CHAR_DQUOTE 34
265 #define CHAR_SQUOTE 39
266 #define CHAR_OPEN_SQUOTE 96
267 #define CHAR_TILDE 126
268 #define CHAR_ASTERISK 42
269 #define CHAR_FORESLASH 47
270 #define CHAR_CARAT 94
272 #define CHAR_UNDERSCORE '_'
273 #define CHAR_OPEN_CBRACK '{'
274 #define CHAR_CLOSE_CBRACK '}'
275 #define CHAR_OPEN_RBRACK '('
276 #define CHAR_CLOSE_RBRACK ')'
277 #define CHAR_OPEN_SBRACK '['
278 #define CHAR_CLOSE_SBRACK ']'
280 /* longest and shortest normal PG line lengths */
281 #define LONGEST_PG_LINE 75
282 #define WAY_TOO_LONG 80
283 #define SHORTEST_PG_LINE 55
285 #define SWITCHES "ESTPXLOYHWVMUD" /* switches:- */
286 /* D - ignore DP-specific markup */
287 /* E - echo queried line */
288 /* S - check single quotes */
289 /* T - check common typos */
290 /* P - require closure of quotes on */
291 /* every paragraph */
292 /* X - "Trust no one" :-) Paranoid! */
293 /* Queries everything */
294 /* L - line end checking defaults on */
295 /* -L turns it off */
296 /* O - overview. Just shows counts. */
297 /* Y - puts errors to stdout */
298 /* instead of stderr */
299 /* H - Echoes header fields */
300 /* M - Ignore markup in < > */
301 /* U - Use file of User-defined Typos*/
302 /* W - Defaults for use on Web upload*/
303 /* V - Verbose - list EVERYTHING! */
304 #define SWITNO 14 /* max number of switch parms */
305 /* - used for defining array-size */
306 #define MINARGS 1 /* minimum no of args excl switches */
307 #define MAXARGS 1 /* maximum no of args excl switches */
309 int pswit[SWITNO]; /* program switches set by SWITCHES */
311 #define ECHO_SWITCH 0
312 #define SQUOTE_SWITCH 1
313 #define TYPO_SWITCH 2
314 #define QPARA_SWITCH 3
315 #define PARANOID_SWITCH 4
316 #define LINE_END_SWITCH 5
317 #define OVERVIEW_SWITCH 6
318 #define STDOUT_SWITCH 7
319 #define HEADER_SWITCH 8
321 #define VERBOSE_SWITCH 10
322 #define MARKUP_SWITCH 11
323 #define USERTYPO_SWITCH 12
326 long cnt_dquot; /* for overview mode, count of doublequote queries */
327 long cnt_squot; /* for overview mode, count of singlequote queries */
328 long cnt_brack; /* for overview mode, count of brackets queries */
329 long cnt_bin; /* for overview mode, count of non-ASCII queries */
330 long cnt_odd; /* for overview mode, count of odd character queries */
331 long cnt_long; /* for overview mode, count of long line errors */
332 long cnt_short; /* for overview mode, count of short line queries */
333 long cnt_punct; /* for overview mode, count of punctuation and spacing queries */
334 long cnt_dash; /* for overview mode, count of dash-related queries */
335 long cnt_word; /* for overview mode, count of word queries */
336 long cnt_html; /* for overview mode, count of html queries */
337 long cnt_lineend; /* for overview mode, count of line-end queries */
338 long cnt_spacend; /* count of lines with space at end */
339 long linecnt; /* count of total lines in the file */
340 long checked_linecnt; /* count of lines actually checked */
343 void procfile(char *);
345 #define LOW_THRESHOLD 0
346 #define HIGH_THRESHOLD 1
352 #define FIRST_OF_PAIR 0
353 #define SECOND_OF_PAIR 1
355 #define MAX_WORDPAIR 1000
357 char running_from[MAX_PATH];
359 int mixdigit(char *);
360 char *getaword(char *,char *);
361 int matchword(char *,char *);
362 char *flgets(char *,int,FILE *,long);
363 void lowerit(char *);
364 int gcisalpha(unsigned char);
365 int gcisdigit(unsigned char);
366 int gcisletter(unsigned char);
367 char *gcstrchr(char *s,char c);
368 void postprocess_for_HTML(char *);
369 char *linehasmarkup(char *);
370 char *losemarkup(char *);
371 int tagcomp(char *,char *);
372 char *loseentities(char *);
375 void postprocess_for_DP(char *);
377 char wrk[LINEBUFSIZE];
380 #define MAX_QWORD_LENGTH 40
381 char qword[MAX_QWORD][MAX_QWORD_LENGTH];
382 signed int dupcnt[MAX_QWORD];
384 int main(int argc,char **argv)
388 char usertypo_file[MAX_PATH];
390 if (strlen(argv[0])<sizeof(running_from))
391 /* save the path to the executable */
392 strcpy(running_from,argv[0]);
393 /* find out what directory we're running from */
394 s=running_from+strlen(running_from);
395 for (;*s!='/' && *s!='\\' && s>=running_from;s--)
397 switno=strlen(SWITCHES);
398 for (i=switno;--i>0;)
399 pswit[i]=0; /* initialise switches */
401 * Standard loop to extract switches.
402 * When we come out of this loop, the arguments will be
403 * in argv[0] upwards and the switches used will be
404 * represented by their equivalent elements in pswit[]
406 while (--argc>0 && **++argv=='-')
407 for (argsw=argv[0]+1;*argsw!='\0';argsw++)
408 for (i=switno,invarg=1;(--i>=0) && invarg==1;)
409 if ((toupper(*argsw))==SWITCHES[i])
414 /* Paranoid checking is turned OFF, not on, by its switch */
415 pswit[PARANOID_SWITCH]^=1;
416 if (pswit[PARANOID_SWITCH])
417 /* if running in paranoid mode force typo checks as well */
418 pswit[TYPO_SWITCH]=pswit[TYPO_SWITCH]^1;
419 /* Line-end checking is turned OFF, not on, by its switch */
420 pswit[LINE_END_SWITCH]^=1;
421 /* Echoing is turned OFF, not on, by its switch */
422 pswit[ECHO_SWITCH]^=1;
423 if (pswit[OVERVIEW_SWITCH])
424 /* just print summary; don't echo */
425 pswit[ECHO_SWITCH]=0;
427 * Web uploads - for the moment, this is really just a placeholder
428 * until we decide what processing we really want to do on web uploads
430 if (pswit[WEB_SWITCH])
432 /* specific override for web uploads */
433 pswit[ECHO_SWITCH]=1;
434 pswit[SQUOTE_SWITCH]=0;
435 pswit[TYPO_SWITCH]=1;
436 pswit[QPARA_SWITCH]=0;
437 pswit[PARANOID_SWITCH]=1;
438 pswit[LINE_END_SWITCH]=0;
439 pswit[OVERVIEW_SWITCH]=0;
440 pswit[STDOUT_SWITCH]=0;
441 pswit[HEADER_SWITCH]=1;
442 pswit[VERBOSE_SWITCH]=0;
443 pswit[MARKUP_SWITCH]=0;
444 pswit[USERTYPO_SWITCH]=0;
447 if (argc<MINARGS || argc>MAXARGS)
449 /* check number of args */
453 /* read in the user-defined stealth scanno list */
454 if (pswit[USERTYPO_SWITCH])
456 /* ... we were told we had one! */
457 usertypofile=fopen(USERTYPO_FILE,"rb");
460 /* not in cwd. try excuteable directory. */
461 strcpy(usertypo_file,running_from);
462 strcat(usertypo_file,USERTYPO_FILE);
463 usertypofile=fopen(usertypo_file,"rb");
465 /* we ain't got no user typo file! */
466 printf(" --> I couldn't find gutcheck.typ "
467 "-- proceeding without user typos.\n");
473 /* we managed to open a User Typo File! */
474 if (pswit[USERTYPO_SWITCH])
476 while (flgets(aline,LINEBUFSIZE-1,usertypofile,
477 (long)usertypo_count))
483 s=malloc(strlen(aline)+1);
486 fprintf(stderr,"bookloupe: cannot get enough "
487 "memory for user typo file!\n");
491 usertypo[usertypo_count]=s;
493 if (usertypo_count>=MAX_USER_TYPOS)
495 printf(" --> Only %d user-defined typos "
496 "allowed: ignoring the rest\n",
504 fclose(usertypofile);
507 fprintf(stderr,"bookloupe: Check and report on an e-text\n");
508 cnt_dquot=cnt_squot=cnt_brack=cnt_bin=cnt_odd=cnt_long=
509 cnt_short=cnt_punct=cnt_dash=cnt_word=cnt_html=cnt_lineend=
512 if (pswit[OVERVIEW_SWITCH])
514 printf(" Checked %ld lines of %ld (head+foot = %ld)\n\n",
515 checked_linecnt,linecnt,linecnt-checked_linecnt);
516 printf(" --------------- Queries found --------------\n");
518 printf(" Long lines: %14ld\n",cnt_long);
520 printf(" Short lines: %14ld\n",cnt_short);
522 printf(" Line-end problems: %14ld\n",cnt_lineend);
524 printf(" Common typos: %14ld\n",cnt_word);
526 printf(" Unmatched quotes: %14ld\n",cnt_dquot);
528 printf(" Unmatched SingleQuotes: %14ld\n",cnt_squot);
530 printf(" Unmatched brackets: %14ld\n",cnt_brack);
532 printf(" Non-ASCII characters: %14ld\n",cnt_bin);
534 printf(" Proofing characters: %14ld\n",cnt_odd);
536 printf(" Punctuation & spacing queries: %14ld\n",cnt_punct);
538 printf(" Non-standard dashes: %14ld\n",cnt_dash);
540 printf(" Possible HTML tags: %14ld\n",cnt_html);
542 printf(" TOTAL QUERIES %14ld\n",
543 cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+
544 cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend);
549 struct first_pass_results {
550 long firstline,astline;
551 long footerline,totlen,binlen,alphalen,endquote_count,shortline,dotcomma;
552 long fslashline,hyphens,longline,verylongline,htmcount,standalone_digit;
553 long spacedash,emdash,space_emdash,non_PG_space_emdash,PG_space_emdash;
554 signed int Dutchcount,Frenchcount;
560 * Run a first pass - verify that it's a valid PG
561 * file, decide whether to report some things that
562 * occur many times in the text like long or short
563 * lines, non-standard dashes, etc.
/*
 * first_pass: read infile a line at a time into the global aline buffer,
 * strip trailing CR/LF, and accumulate whole-file statistics (header and
 * footer position, line-length classes, dash and markup usage, a few
 * language marker words) in a function-static first_pass_results.
 */
565 struct first_pass_results *first_pass(FILE *infile)
567 char laststart=CHAR_SPACE,*s;
569 unsigned int lastlen=0,lastblen=0;
570 long spline=0,nspline=0;
571 static struct first_pass_results results={0};
572 char inword[MAXWORDLEN]="";
573 while (fgets(aline,LINEBUFSIZE-1,infile))
/* stop once the line is empty: strlen()-1 would otherwise wrap around */
575 while (*aline && (aline[strlen(aline)-1]==10 || aline[strlen(aline)-1]==13))
576 aline[strlen(aline)-1]=0;
578 if (strstr(aline,"*END") && strstr(aline,"SMALL PRINT") &&
579 (strstr(aline,"PUBLIC DOMAIN") || strstr(aline,"COPYRIGHT")))
582 printf(" --> Duplicate header?\n");
583 spline=linecnt+1; /* first line of non-header text, that is */
585 if (!strncmp(aline,"*** START",9) && strstr(aline,"PROJECT GUTENBERG"))
588 printf(" --> Duplicate header?\n");
589 nspline=linecnt+1; /* first line of non-header text, that is */
591 if (spline || nspline)
594 if (strstr(aline,"end") && strstr(aline,"project gutenberg"))
596 if (strstr(aline,"end")<strstr(aline,"project gutenberg"))
598 if (results.footerline)
600 /* it's an old-form header - we can detect duplicates */
602 printf(" --> Duplicate footer?\n");
605 results.footerline=linecnt;
610 results.firstline=spline;
612 results.firstline=nspline; /* override with new */
613 if (results.footerline)
614 continue; /* don't count the boilerplate in the footer */
616 results.totlen+=llen;
619 if ((unsigned char)aline[i]>127)
621 if (gcisalpha(aline[i]))
/* <ctype.h> functions need an unsigned char value; plain char may be negative */
623 if (i>0 && aline[i]==CHAR_DQUOTE && isalpha((unsigned char)aline[i-1]))
624 results.endquote_count++;
626 if (strlen(aline)>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE &&
627 lastblen>2 && lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
629 if (*aline && (unsigned char)aline[strlen(aline)-1]<=CHAR_SPACE)
631 if (strstr(aline,".,"))
633 /* only count asterisk lines, for the purpose of ignoring them, */
634 /* where there is lowercase text on the line */
635 if (strstr(aline,"*"))
638 if (*s>='a' && *s<='z')
643 if (strstr(aline,"/"))
644 results.fslashline++;
645 for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--)
/* i can end at 0 (or below for an empty line): never look at aline[i-1] then */
647 if (i>0 && aline[i]=='-' && aline[i-1]!='-')
649 if (llen>LONGEST_PG_LINE)
651 if (llen>WAY_TOO_LONG)
652 results.verylongline++;
653 if (strstr(aline,"<") && strstr(aline,">"))
655 i=(signed int)(strstr(aline,">")-strstr(aline,"<")+1);
658 if (strstr(aline,"<i>"))
659 results.htmcount+=4; /* bonus marks! */
661 /* Check for spaced em-dashes */
662 if (strstr(aline,"--"))
/*
 * A "--" at the very start of the line has no preceding character;
 * treat that as "no space before" instead of reading aline[-1].
 */
665 if ((strstr(aline,"--")>aline && *(strstr(aline,"--")-1)==CHAR_SPACE) ||
666 (*(strstr(aline,"--")+2)==CHAR_SPACE))
667 results.space_emdash++;
668 if (strstr(aline,"--")>aline && *(strstr(aline,"--")-1)==CHAR_SPACE &&
669 (*(strstr(aline,"--")+2)==CHAR_SPACE))
670 /* count of em-dashes with spaces both sides */
671 results.non_PG_space_emdash++;
672 if ((strstr(aline,"--")==aline || *(strstr(aline,"--")-1)!=CHAR_SPACE) &&
673 (*(strstr(aline,"--")+2)!=CHAR_SPACE))
674 /* count of PG-type em-dashes with no spaces */
675 results.PG_space_emdash++;
679 s=getaword(s,inword);
680 if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))
681 results.Dutchcount++;
682 if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))
683 results.Frenchcount++;
684 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
685 results.standalone_digit++;
687 /* Check for spaced dashes */
688 if (strstr(aline," -") && *(strstr(aline," -")+2)!='-')
691 lastlen=strlen(aline);
698 signed int shortline,longline,bin,dash,dotcomma,ast,fslash,digit,hyphen;
699 signed int endquote,isDutch,isFrench;
705 * Make some snap decisions based on the first pass results.
707 struct warnings *report_first_pass(struct first_pass_results *results)
709 static struct warnings warnings={0};
711 printf(" --> %ld lines in this file have white space at end\n",
714 if (results->dotcomma>5)
717 printf(" --> %ld lines in this file contain '.,'. "
718 "Not reporting them.\n",results->dotcomma);
721 * If more than 50 lines, or one-tenth, are short,
722 * don't bother reporting them.
724 warnings.shortline=1;
725 if (results->shortline>50 || results->shortline*10>linecnt)
727 warnings.shortline=0;
728 printf(" --> %ld lines in this file are short. "
729 "Not reporting short lines.\n",results->shortline);
732 * If more than 50 lines, or one-tenth, are long,
733 * don't bother reporting them.
736 if (results->longline>50 || results->longline*10>linecnt)
739 printf(" --> %ld lines in this file are long. "
740 "Not reporting long lines.\n",results->longline);
742 /* If more than 10 lines contain asterisks, don't bother reporting them. */
744 if (results->astline>10)
747 printf(" --> %ld lines in this file contain asterisks. "
748 "Not reporting them.\n",results->astline);
751 * If more than 10 lines contain forward slashes,
752 * don't bother reporting them.
755 if (results->fslashline>10)
758 printf(" --> %ld lines in this file contain forward slashes. "
759 "Not reporting them.\n",results->fslashline);
762 * If more than 20 lines contain unpunctuated endquotes,
763 * don't bother reporting them.
766 if (results->endquote_count>20)
769 printf(" --> %ld lines in this file contain unpunctuated endquotes. "
770 "Not reporting them.\n",results->endquote_count);
773 * If more than 10 lines contain standalone digits,
774 * don't bother reporting them.
777 if (results->standalone_digit>10)
780 printf(" --> %ld lines in this file contain standalone 0s and 1s. "
781 "Not reporting them.\n",results->standalone_digit);
784 * If more than 20 lines contain hyphens at end,
785 * don't bother reporting them.
788 if (results->hyphens>20)
791 printf(" --> %ld lines in this file have hyphens at end. "
792 "Not reporting them.\n",results->hyphens);
794 if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
796 printf(" --> Looks like this is HTML. Switching HTML mode ON.\n");
797 pswit[MARKUP_SWITCH]=1;
799 if (results->verylongline>0)
800 printf(" --> %ld lines in this file are VERY long!\n",
801 results->verylongline);
803 * If there are more non-PG spaced dashes than PG em-dashes,
804 * assume it's deliberate.
805 * Current PG guidelines say don't use them, but older texts do,
806 * and some people insist on them whatever the guidelines say.
809 if (results->spacedash+results->non_PG_space_emdash>
810 results->PG_space_emdash)
813 printf(" --> There are %ld spaced dashes and em-dashes. "
814 "Not reporting them.\n",
815 results->spacedash+results->non_PG_space_emdash);
817 /* If more than a quarter of characters are hi-bit, bug out. */
819 if (results->binlen*4>results->totlen)
821 printf(" --> This file does not appear to be ASCII. "
822 "Terminating. Best of luck with it!\n");
825 if (results->alphalen*4<results->totlen)
827 printf(" --> This file does not appear to be text. "
828 "Terminating. Best of luck with it!\n");
831 if (results->binlen*100>results->totlen || results->binlen>100)
833 printf(" --> There are a lot of foreign letters here. "
834 "Not reporting them.\n");
838 if (results->Dutchcount>50)
841 printf(" --> This looks like Dutch - "
842 "switching off dashes and warnings for 's Middags case.\n");
845 if (results->Frenchcount>50)
848 printf(" --> This looks like French - "
849 "switching off some doublepunct.\n");
851 if (results->firstline && results->footerline)
852 printf(" The PG header and footer appear to be already on.\n");
855 if (results->firstline)
856 printf(" The PG header is on - no footer.\n");
857 if (results->footerline)
858 printf(" The PG footer is on - no header.\n");
861 if (pswit[VERBOSE_SWITCH])
864 warnings.shortline=1;
873 printf(" *** Verbose output is ON -- you asked for it! ***\n");
875 if (warnings.isDutch)
877 if (results->footerline>0 && results->firstline>0 &&
878 results->footerline>results->firstline &&
879 results->footerline-results->firstline<100)
881 printf(" --> I don't really know where this text starts. \n");
882 printf(" There are no reference points.\n");
883 printf(" I'm going to have to report the header and footer "
885 results->firstline=0;
892 signed int c_unders,c_brack,s_brack,r_brack;
893 signed int open_single_quote,close_single_quote;
899 * Look along the line, accumulate the count of quotes, and see
900 * if this is an empty line - i.e. a line with nothing on it
902 * If line has just spaces, period, * and/or - on it, don't
903 * count it, since empty lines with asterisks or dashes to
904 * separate sections are common.
906 * Returns: Non-zero if the line is empty.
/*
 * analyse_quotes: walk the line at s, updating the single-quote,
 * underscore and bracket tallies in *counters, and track whether
 * anything other than spaces, '-', '.' or '*' was seen.
 */
908 int analyse_quotes(const char *s,struct counters *counters)
910 signed int guessquote=0;
911 int isemptyline=1; /* assume the line is empty until proven otherwise */
916 if (*s==CHAR_SQUOTE || *s==CHAR_OPEN_SQUOTE)
921 * At start of line, it can only be an openquote.
922 * Hardcode a very common exception!
/* s is on the quote itself, so the word begins at s+1 (as in the test below) */
924 if (strncmp(s+1,"tis",3) && strncmp(s+1,"Tis",3))
925 counters->open_single_quote++;
927 else if (gcisalpha(s[-1]) && gcisalpha(s[1]))
928 /* Do nothing! it's definitely an apostrophe, not a quote */
930 /* it's outside a word - let's check it out */
931 else if (*s==CHAR_OPEN_SQUOTE || gcisalpha(s[1]))
933 /* it damwell better BE an openquote */
934 if (strncmp(s+1,"tis",3) && strncmp(s+1,"Tis",3))
935 /* hardcode a very common exception! */
936 counters->open_single_quote++;
940 /* now - is it a closequote? */
941 guessquote=0; /* accumulate clues */
942 if (gcisalpha(s[-1]))
944 /* it follows a letter - could be either */
948 /* looks like a plural apostrophe */
950 if (s[1]==CHAR_SPACE) /* bonus marks! */
954 /* it doesn't have a letter either side */
955 else if (strchr(".?!,;:",s[-1]) && strchr(".?!,;: ",s[1]))
956 guessquote+=8; /* looks like a closequote */
959 if (counters->open_single_quote>counters->close_single_quote)
961 * Give it the benefit of some doubt,
962 * if a squote is already open.
968 counters->close_single_quote++;
971 if (*s!=CHAR_SPACE && *s!='-' && *s!='.' && *s!=CHAR_ASTERISK &&
973 isemptyline=0; /* ignore lines like * * * as spacers */
974 if (*s==CHAR_UNDERSCORE)
975 counters->c_unders++;
976 if (*s==CHAR_OPEN_CBRACK)
978 if (*s==CHAR_CLOSE_CBRACK)
980 if (*s==CHAR_OPEN_RBRACK)
982 if (*s==CHAR_CLOSE_RBRACK)
984 if (*s==CHAR_OPEN_SBRACK)
986 if (*s==CHAR_CLOSE_SBRACK)
994 * check_for_odd_characters:
996 * Check for binary and other odd characters.
/*
 * Scan aline one byte at a time and query characters that rarely belong
 * in a plain e-text: control and high-bit bytes, tabs, tildes, carets,
 * forward slashes and asterisks. Each category is flagged by its own
 * e* variable so that it is tested only while that flag is still zero.
 * A query echoes the line when ECHO_SWITCH is set and prints its message
 * unless OVERVIEW_SWITCH is set.
 */
998 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
1001 /* Don't repeat multiple warnings on one line. */
1002 signed int eNon_A=0,eTab=0,eTilde=0,eCarat=0,eFSlash=0,eAst=0;
1005 for (s=aline;*s;s++)
1007 c=*(unsigned char *)s;
/* && binds tighter than ||: (control char other than tab/newline) or c>127 */
1008 if (!eNon_A && (*s<CHAR_SPACE && *s!=9 && *s!='\n' || c>127))
1010 if (pswit[ECHO_SWITCH])
1011 printf("\n%s\n",aline);
1012 if (!pswit[OVERVIEW_SWITCH])
1014 printf(" Line %ld column %d - "
1015 "Non-ISO-8859 character %d\n",linecnt,(int)(s-aline)+1,c);
1017 printf(" Line %ld column %d - Non-ASCII character %d\n",
1018 linecnt,(int)(s-aline)+1,c);
1023 if (!eTab && *s==CHAR_TAB)
1025 if (pswit[ECHO_SWITCH])
1026 printf("\n%s\n",aline);
1027 if (!pswit[OVERVIEW_SWITCH])
1028 printf(" Line %ld column %d - Tab character?\n",
1029 linecnt,(int)(s-aline)+1);
1034 if (!eTilde && *s==CHAR_TILDE)
1037 * Often used by OCR software to indicate an
1038 * unrecognizable character.
1040 if (pswit[ECHO_SWITCH])
1041 printf("\n%s\n",aline);
1042 if (!pswit[OVERVIEW_SWITCH])
1043 printf(" Line %ld column %d - Tilde character?\n",
1044 linecnt,(int)(s-aline)+1);
1049 if (!eCarat && *s==CHAR_CARAT)
1051 if (pswit[ECHO_SWITCH])
1052 printf("\n%s\n",aline);
1053 if (!pswit[OVERVIEW_SWITCH])
1054 printf(" Line %ld column %d - Carat character?\n",
1055 linecnt,(int)(s-aline)+1);
/* forward slashes are queried only while warnings->fslash is non-zero */
1060 if (!eFSlash && *s==CHAR_FORESLASH && warnings->fslash)
1062 if (pswit[ECHO_SWITCH])
1063 printf("\n%s\n",aline);
1064 if (!pswit[OVERVIEW_SWITCH])
1065 printf(" Line %ld column %d - Forward slash?\n",
1066 linecnt,(int)(s-aline)+1);
1072 * Report asterisks only in paranoid mode,
1073 * since they're often deliberate.
1075 if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
1078 if (pswit[ECHO_SWITCH])
1079 printf("\n%s\n",aline);
1080 if (!pswit[OVERVIEW_SWITCH])
1081 printf(" Line %ld column %d - Asterisk?\n",
1082 linecnt,(int)(s-aline)+1);
1091 * check_for_long_line:
1093 * Check for line too long.
/*
 * Queries aline when its length exceeds LONGEST_PG_LINE. The line is
 * echoed when ECHO_SWITCH is set and the message (with the length used as
 * both the column and the reported size) is printed unless
 * OVERVIEW_SWITCH is set.
 */
1095 void check_for_long_line(const char *aline)
1097 if (strlen(aline)>LONGEST_PG_LINE)
1099 if (pswit[ECHO_SWITCH])
1100 printf("\n%s\n",aline);
1101 if (!pswit[OVERVIEW_SWITCH])
/* NOTE(review): strlen() returns size_t but is printed with %d; a cast to int (or %zu) would make the argument match the format. */
1102 printf(" Line %ld column %d - Long line %d\n",
1103 linecnt,strlen(aline),strlen(aline));
/*
 * Lengths of recently read lines, used by check_for_short_line():
 * len is the length of the last line examined and blen the length of the
 * one before it. A start member (first character of the last line) is
 * referenced by callers; its declaration is not visible in this excerpt.
 */
1109 struct line_properties {
1110 unsigned int len,blen;
1115 * check_for_short_line:
1117 * Check for line too short.
1119 * This one is a bit trickier to implement: we don't want to
1120 * flag the last line of a paragraph for being short, so we
1121 * have to wait until we know that our current line is a
1122 * "normal" line, then report the _previous_ line if it was too
1123 * short. We also don't want to report indented lines like
1124 * chapter heads or formatted quotations. We therefore keep
1125 * last->len as the length of the last line examined, and
1126 * last->blen as the length of the last but one, and try to
1127 * suppress unnecessary warnings by checking that both were of
1128 * "normal" length. We keep the first character of the last
1129 * line in last->start, and if it was a space, we assume that
1130 * the formatting is deliberate. I can't figure out a way to
1131 * distinguish something like a quoted verse left-aligned or
1132 * the header or footer of a letter from a paragraph of short
1133 * lines - maybe if I examined the whole paragraph, and if the
1134 * para has less than, say, 8 lines and if all lines are short,
1135 * then just assume it's OK? Need to look at some texts to see
1136 * how often a formula like this would get the right result.
/*
 * Queries the previous line (the global prevline, reported as line
 * linecnt-1) as too short. All of the following must hold: the current
 * line has more than one character; the previous line's length
 * (last->len) is above 1 and below SHORTEST_PG_LINE; the length before
 * that (last->blen) is above SHORTEST_PG_LINE; and the previous line did
 * not start with a space.
 */
1138 void check_for_short_line(const char *aline,const struct line_properties *last)
1140 if (strlen(aline)>1 && last->len>1 && last->len<SHORTEST_PG_LINE &&
1141 last->blen>1 && last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
1143 if (pswit[ECHO_SWITCH])
1144 printf("\n%s\n",prevline);
1145 if (!pswit[OVERVIEW_SWITCH])
/* NOTE(review): strlen() returns size_t but is printed with %d; a cast to int (or %zu) would make the argument match the format. */
1146 printf(" Line %ld column %d - Short line %d?\n",
1147 linecnt-1,strlen(prevline),strlen(prevline));
1154 * check_for_starting_punctuation:
1156 * Look for punctuation other than full ellipses at start of line.
/*
 * Queries a non-empty line whose first character is one of . ? ! , ; :
 * unless the line begins with the five characters of a spaced ellipsis.
 * The column reported is always 1.
 */
1158 void check_for_starting_punctuation(const char *aline)
1160 if (*aline && strchr(".?!,;:",aline[0]) && strncmp(". . .",aline,5))
1162 if (pswit[ECHO_SWITCH])
1163 printf("\n%s\n",aline);
1164 if (!pswit[OVERVIEW_SWITCH])
1165 printf(" Line %ld column 1 - Begins with punctuation?\n",
1173 * check_for_spaced_emdash:
1175 * Check for spaced em-dashes.
1177 * We must check _all_ occurrences of "--" on the line
1178 * hence the loop - even if the first double-dash is OK
1179 * there may be another that's wrong later on.
/*
 * Looks at every "--" found in aline and queries it when it is preceded
 * by a space (and is not at the very start of the line) or is followed
 * by a space.
 * NOTE(review): the declarations of s and t and the statement that
 * advances s past each match are not visible in this excerpt.
 */
1181 void check_for_spaced_emdash(const char *aline)
1185 while ((t=strstr(s,"--")))
1187 if (t>aline && t[-1]==CHAR_SPACE || t[2]==CHAR_SPACE)
1189 if (pswit[ECHO_SWITCH])
1190 printf("\n%s\n",aline);
1191 if (!pswit[OVERVIEW_SWITCH])
1192 printf(" Line %ld column %d - Spaced em-dash?\n",
1193 linecnt,(int)(t-aline)+1);
1202 * check_for_spaced_dash:
1204 * Check for spaced dashes.
/*
 * Queries a single dash with a space next to it. Only the first " -" in
 * the line is considered; if there is none, the first "- " is considered
 * and is queried when it is at the start of the line or is not preceded
 * by another dash.
 * NOTE(review): the condition applied to the " -" match before it is
 * reported is not visible in this excerpt.
 */
1206 void check_for_spaced_dash(const char *aline)
1209 if ((s=strstr(aline," -")))
1213 if (pswit[ECHO_SWITCH])
1214 printf("\n%s\n",aline);
1215 if (!pswit[OVERVIEW_SWITCH])
1216 printf(" Line %ld column %d - Spaced dash?\n",
1217 linecnt,(int)(s-aline)+1);
1222 else if ((s=strstr(aline,"- ")))
1224 if (s==aline || s[-1]!='-')
1226 if (pswit[ECHO_SWITCH])
1227 printf("\n%s\n",aline);
1228 if (!pswit[OVERVIEW_SWITCH])
1229 printf(" Line %ld column %d - Spaced dash?\n",
1230 linecnt,(int)(s-aline)+1);
1238 * check_for_unmarked_paragraphs:
1240 * Check for unmarked paragraphs indicated by separate speakers.
1242 * May well be false positive:
1243 * "Bravo!" "Wonderful!" called the crowd.
1244 * but useful all the same.
/*
 * Queries a closing double quote followed by whitespace and an opening
 * double quote on the same line, which may mean two speakers share one
 * paragraph. The column reported is that of the first quote.
 * NOTE(review): both searches below use the same literal as shown; if
 * that is what the file really contains the second search is redundant.
 * Confirm whether one of them was meant to match a different spacing.
 */
1246 void check_for_unmarked_paragraphs(const char *aline)
1249 s=strstr(aline,"\" \"");
1251 s=strstr(aline,"\" \"");
1254 if (pswit[ECHO_SWITCH])
1255 printf("\n%s\n",aline);
1256 if (!pswit[OVERVIEW_SWITCH])
1257 printf(" Line %ld column %d - Query missing paragraph break?\n",
1258 linecnt,(int)(s-aline)+1);
1265 * check_for_jeebies:
1267 * Check for "to he" and other easy h/b errors.
1269 * This is a very inadequate effort on the h/b problem,
1270 * but the phrase "to he" is always an error, whereas "to
1271 * be" is quite common.
1272 * Similarly, '"Quiet!", be said.' is a non-be error
1273 * "to he" is _not_ always an error!:
1274 * "Where they went to he couldn't say."
1275 * Another false positive:
1276 * What would "Cinderella" be without the . . .
1277 * and another: "If he wants to he can see for himself."
/*
 * Searches aline for three families of fixed phrases that usually come
 * from confusing similar letters, and prints one query per family:
 * he/be, had/bad and hut/but. The searches use strstr(), so the phrases
 * must appear exactly as written, including case and surrounding spaces.
 * NOTE(review): the statements between consecutive searches (presumably
 * tests that skip a search once s is already set) are not visible in
 * this excerpt.
 */
1279 void check_for_jeebies(const char *aline)
1282 s=strstr(aline," be could ");
1284 s=strstr(aline," be would ");
1286 s=strstr(aline," was be ");
1288 s=strstr(aline," be is ");
1290 s=strstr(aline," is be ");
1292 s=strstr(aline,"\", be ");
/* NOTE(review): the next two literals are identical as shown; confirm whether they differ in spacing. */
1294 s=strstr(aline,"\" be ");
1296 s=strstr(aline,"\" be ");
1298 s=strstr(aline," to he ");
1301 if (pswit[ECHO_SWITCH])
1302 printf("\n%s\n",aline);
1303 if (!pswit[OVERVIEW_SWITCH])
1304 printf(" Line %ld column %d - Query he/be error?\n",
1305 linecnt,(int)(s-aline)+1);
1309 s=strstr(aline," the had ");
1311 s=strstr(aline," a had ");
1313 s=strstr(aline," they bad ");
1315 s=strstr(aline," she bad ");
1317 s=strstr(aline," he bad ");
1319 s=strstr(aline," you bad ");
/* NOTE(review): matches a lower-case "i" only, as written. */
1321 s=strstr(aline," i bad ");
1324 if (pswit[ECHO_SWITCH])
1325 printf("\n%s\n",aline);
1326 if (!pswit[OVERVIEW_SWITCH])
1327 printf(" Line %ld column %d - Query had/bad error?\n",
1328 linecnt,(int)(s-aline)+1);
1332 s=strstr(aline,"; hut ");
1334 s=strstr(aline,", hut ");
1337 if (pswit[ECHO_SWITCH])
1338 printf("\n%s\n",aline);
1339 if (!pswit[OVERVIEW_SWITCH])
1340 printf(" Line %ld column %d - Query hut/but error?\n",
1341 linecnt,(int)(s-aline)+1);
1348 * check_for_mta_from:
1350 * Special case - angled bracket in front of "From" placed there by an
1351 * MTA when sending an e-mail.
/*
 * Queries the first occurrence of ">From" anywhere in aline and reports
 * the column of the angled bracket.
 * NOTE(review): the declaration of s and the test that s is non-NULL are
 * not visible in this excerpt.
 */
1353 void check_for_mta_from(const char *aline)
1356 s=strstr(aline,">From");
1359 if (pswit[ECHO_SWITCH])
1360 printf("\n%s\n",aline);
1361 if (!pswit[OVERVIEW_SWITCH])
1362 printf(" Line %ld column %d - Query angled bracket with From\n",
1363 linecnt,(int)(s-aline)+1);
1370 * check_for_orphan_character:
1372 * Check for a single character line -
1373 * often an overflow from bad wrapping.
/*
 * Queries a line that consists of exactly one character, unless that
 * character is treated as a numeral standing alone (I, V, X, L and
 * whatever else the continuation of the test, not visible in this
 * excerpt, accepts).
 */
1375 void check_for_orphan_character(const char *aline)
1377 if (*aline && !aline[1])
1379 if (*aline=='I' || *aline=='V' || *aline=='X' || *aline=='L' ||
1381 ; /* Nothing - ignore numerals alone on a line. */
1384 if (pswit[ECHO_SWITCH])
1385 printf("\n%s\n",aline);
1386 if (!pswit[OVERVIEW_SWITCH])
1387 printf(" Line %ld column 1 - Query single character line\n",
1396 * check_for_pling_scanno:
1398 * Check for I" - often should be !
/*
 * Queries the first occurrence of a space, a capital I and a double
 * quote in aline, since the I is often a misread exclamation mark.
 * NOTE(review): the declaration of s, the test that s is non-NULL and
 * the printf arguments are not visible in this excerpt; the column is
 * formatted with %ld here, unlike the %d used by the neighbouring
 * checks, so confirm the argument really is a long.
 */
1400 void check_for_pling_scanno(const char *aline)
1403 s=strstr(aline," I\"");
1406 if (pswit[ECHO_SWITCH])
1407 printf("\n%s\n",aline);
1408 if (!pswit[OVERVIEW_SWITCH])
1409 printf(" Line %ld column %ld - Query I=exclamation mark?\n",
1417 * check_for_extra_period:
1419 * Check for period without a capital letter. Cut-down from gutspell.
1420 * Only works when it happens on a single line.
/*
 * In paranoid mode, walks through the ". " sequences in aline and
 * investigates those where the first letter or digit after the period is
 * a lower-case letter. The word in front of the period is collected into
 * testword and compared with the abbrev list, tested for a leading
 * digit, tested with isroman() and scanned for vowels. Words already
 * stored in the static qperiod table are looked up unless VERBOSE_SWITCH
 * is set, and new words are stored while there is room. The query gives
 * the column of t.
 * NOTE(review): many statements of this function (loop bodies, the
 * effect of each test on istypo and isdup, and the final condition for
 * reporting) are not visible in this excerpt, so the exact rules should
 * be confirmed against the complete function.
 */
1422 void check_for_extra_period(const char *aline,const struct warnings *warnings)
1424 const char *s,*t,*s1;
1425 signed int i,istypo,isdup;
1426 static char qperiod[MAX_QWORD][MAX_QWORD_LENGTH];
1427 static int qperiod_index=0;
1428 char testword[MAXWORDLEN]="";
1429 if (pswit[PARANOID_SWITCH])
1431 for (t=s=aline;strstr(t,". ");)
1437 /* start of line punctuation is handled elsewhere */
1440 if (!gcisalpha(t[-1]))
1445 if (warnings->isDutch)
1447 /* For Frank & Jeroen -- 's Middags case */
1448 if (t[2]==CHAR_SQUOTE && t[3]>='a' && t[3]<='z' &&
1449 t[4]==CHAR_SPACE && t[5]>='A' && t[5]<='Z')
/* NOTE(review): isdigit() receives a plain char here, unlike gcisdigit() below; a negative value is undefined behaviour for <ctype.h> functions. */
1456 while (*s1 && !gcisalpha(*s1) && !isdigit(*s1))
1458 if (*s1>='a' && *s1<='z')
1460 /* we have something to investigate */
1462 /* so let's go back and find out */
1463 for (s1=t-1;s1>=s &&
1464 (gcisalpha(*s1) || gcisdigit(*s1) || *s1==CHAR_SQUOTE &&
1465 gcisalpha(s1[1]) && gcisalpha(s1[-1]));s1--)
/* NOTE(review): this loop has no visible bound on i; confirm the word cannot exceed MAXWORDLEN-1 characters. */
1468 for (i=0;*s1 && *s1!='.';s1++,i++)
1471 for (i=0;*abbrev[i];i++)
1472 if (!strcmp(testword,abbrev[i]))
1474 if (gcisdigit(*testword))
1478 if (isroman(testword))
1483 for (i=0;testword[i];i++)
1484 if (strchr(vowels,testword[i]))
1490 if (strlen(testword)<MAX_QWORD_LENGTH &&
1491 !pswit[VERBOSE_SWITCH])
1492 for (i=0;i<qperiod_index;i++)
1493 if (!strcmp(testword,qperiod[i]))
1497 if (qperiod_index<MAX_QWORD &&
1498 strlen(testword)<MAX_QWORD_LENGTH)
1500 strcpy(qperiod[qperiod_index],testword);
1503 if (pswit[ECHO_SWITCH])
1504 printf("\n%s\n",aline);
1505 if (!pswit[OVERVIEW_SWITCH])
1506 printf(" Line %ld column %d - Extra period?\n",
1507 linecnt,(int)(t-aline)+1);
1523 void procfile(char *filename)
1525 char *s,*t,*wordstart;
1526 char inword[MAXWORDLEN],testword[MAXWORDLEN];
1527 char parastart[81]; /* first line of current para */
1529 struct first_pass_results *first_pass_results;
1530 struct warnings *warnings;
1531 struct counters counters={0};
1532 struct line_properties last={0};
1534 long squot,start_para_line;
1535 signed int i,llen,isacro,isellipsis,istypo,alower;
1536 signed int dquotepar,squotepar;
1537 signed int isnewpara,vowel,consonant;
1538 char dquote_err[80],squote_err[80],rbrack_err[80],sbrack_err[80],
1539 cbrack_err[80],unders_err[80];
1540 signed int qword_index,isdup;
1542 last.start=CHAR_SPACE;
1543 *dquote_err=*squote_err=*rbrack_err=*cbrack_err=*sbrack_err=
1544 *unders_err=*prevline=0;
1545 linecnt=checked_linecnt=start_para_line=0;
1547 i=llen=isacro=isellipsis=0;
1548 isnewpara=vowel=consonant=enddash=0;
1550 *inword=*testword=0;
1551 dquotepar=squotepar=0;
1552 infile=fopen(filename,"rb");
1555 if (pswit[STDOUT_SWITCH])
1556 fprintf(stdout,"bookloupe: cannot open %s\n",filename);
1558 fprintf(stderr,"bookloupe: cannot open %s\n",filename);
1561 fprintf(stdout,"\n\nFile: %s\n\n",filename);
1562 first_pass_results=first_pass(infile);
1563 warnings=report_first_pass(first_pass_results);
1566 * Here we go with the main pass. Hold onto yer hat!
1567 * Re-init some variables we've dirtied.
1570 while (flgets(aline,LINEBUFSIZE-1,infile,linecnt+1))
1575 if (pswit[DP_SWITCH] && !strncmp(aline,"-----File: ",11))
1576 continue; // skip DP page separators completely
1577 if (linecnt<first_pass_results->firstline ||
1578 (first_pass_results->footerline>0 &&
1579 linecnt>first_pass_results->footerline))
1581 if (pswit[HEADER_SWITCH])
1583 if (!strncmp(aline,"Title:",6))
1584 printf(" %s\n",aline);
1585 if (!strncmp(aline,"Author:",7))
1586 printf(" %s\n",aline);
1587 if (!strncmp(aline,"Release Date:",13))
1588 printf(" %s\n",aline);
1589 if (!strncmp(aline,"Edition:",8))
1590 printf(" %s\n\n",aline);
1592 continue; /* skip through the header */
1597 * If we are in a state of unbalanced quotes, and this line
1598 * doesn't begin with a quote, output the stored error message.
1599 * If the -P switch was used, print the warning even if the
1600 * new para starts with quotes.
1606 if (*t!=CHAR_DQUOTE || pswit[QPARA_SWITCH])
1608 if (!pswit[OVERVIEW_SWITCH])
1610 if (pswit[ECHO_SWITCH])
1611 printf("\n%s\n",parastart);
1619 if (*t!=CHAR_SQUOTE && *t!=CHAR_OPEN_SQUOTE ||
1620 pswit[QPARA_SWITCH] || squot)
1622 if (!pswit[OVERVIEW_SWITCH])
1624 if (pswit[ECHO_SWITCH])
1625 printf("\n%s\n",parastart);
1635 if (!pswit[OVERVIEW_SWITCH])
1637 if (pswit[ECHO_SWITCH])
1638 printf("\n%s\n",parastart);
1646 if (!pswit[OVERVIEW_SWITCH])
1648 if (pswit[ECHO_SWITCH])
1649 printf("\n%s\n",parastart);
1657 if (!pswit[OVERVIEW_SWITCH])
1659 if (pswit[ECHO_SWITCH])
1660 printf("\n%s\n",parastart);
1668 if (!pswit[OVERVIEW_SWITCH])
1670 if (pswit[ECHO_SWITCH])
1671 printf("\n%s\n",parastart);
1677 *dquote_err=*squote_err=*rbrack_err=*cbrack_err=
1678 *sbrack_err=*unders_err=0;
1679 isemptyline=analyse_quotes(aline,&counters);
1680 if (isnewpara && !isemptyline)
1682 /* This line is the start of a new paragraph. */
1683 start_para_line=linecnt;
1684 /* Capture its first line in case we want to report it later. */
1685 strncpy(parastart,aline,80);
1687 dquotepar=squotepar=0; /* restart the quote count */
1689 while (!gcisalpha(*s) && !gcisdigit(*s) && *s)
1691 if (*s>='a' && *s<='z')
1693 /* and its first letter is lowercase */
1694 if (pswit[ECHO_SWITCH])
1695 printf("\n%s\n",aline);
1696 if (!pswit[OVERVIEW_SWITCH])
1697 printf(" Line %ld column %d - "
1698 "Paragraph starts with lower-case\n",
1699 linecnt,(int)(s-aline)+1);
1703 isnewpara=0; /* Signal the end of new para processing. */
1705 /* Check for an em-dash broken at line end. */
1706 if (enddash && *aline=='-')
1708 if (pswit[ECHO_SWITCH])
1709 printf("\n%s\n",aline);
1710 if (!pswit[OVERVIEW_SWITCH])
1711 printf(" Line %ld column 1 - Broken em-dash?\n",linecnt);
1716 for (s=aline+strlen(aline)-1;*s==' ' && s>aline;s--)
1718 if (s>=aline && *s=='-')
1721 * Check for invalid or questionable characters in the line
1722 * Anything above 127 is invalid for plain ASCII, and
1723 * non-printable control characters should also be flagged.
1724 * Tabs should generally not be there.
1726 for (s=aline;*s;s++)
1728 i=(unsigned char)*s;
1729 if (i<CHAR_SPACE && i!=CHAR_LF && i!=CHAR_CR && i!=CHAR_TAB)
1731 if (pswit[ECHO_SWITCH])
1732 printf("\n%s\n",aline);
1733 if (!pswit[OVERVIEW_SWITCH])
1734 printf(" Line %ld column %d - Control character %d\n",
1735 linecnt,(int)(s-aline)+1,i);
1741 check_for_odd_characters(aline,warnings,isemptyline);
1742 if (warnings->longline)
1743 check_for_long_line(aline);
1744 if (warnings->shortline)
1745 check_for_short_line(aline,&last);
1747 last.len=strlen(aline);
1748 last.start=aline[0];
1749 check_for_starting_punctuation(aline);
1752 check_for_spaced_emdash(aline);
1753 check_for_spaced_dash(aline);
1755 check_for_unmarked_paragraphs(aline);
1756 check_for_jeebies(aline);
1757 check_for_mta_from(aline);
1758 check_for_orphan_character(aline);
1759 check_for_pling_scanno(aline);
1760 check_for_extra_period(aline,warnings);
1761 if (pswit[TYPO_SWITCH])
1763 /* Check for words usually not followed by punctuation. */
1767 s=getaword(s,inword);
1771 for (i=0;*nocomma[i];i++)
1772 if (!strcmp(inword,nocomma[i]))
1774 if (*s==',' || *s==';' || *s==':')
1776 if (pswit[ECHO_SWITCH])
1777 printf("\n%s\n",aline);
1778 if (!pswit[OVERVIEW_SWITCH])
1779 printf(" Line %ld column %d - "
1780 "Query punctuation after %s?\n",
1781 linecnt,(int)(s-aline)+1,inword);
1786 for (i=0;*noperiod[i];i++)
1787 if (!strcmp(inword,noperiod[i]))
1789 if (*s=='.' || *s=='!')
1791 if (pswit[ECHO_SWITCH])
1792 printf("\n%s\n",aline);
1793 if (!pswit[OVERVIEW_SWITCH])
1794 printf(" Line %ld column %d - "
1795 "Query punctuation after %s?\n",
1796 linecnt,(int)(s-aline)+1,inword);
1804 * Check for commonly mistyped words,
1805 * and digits like 0 for O in a word.
1810 s=getaword(s,inword);
1812 continue; /* don't bother with empty lines */
1813 if (mixdigit(inword))
1815 if (pswit[ECHO_SWITCH])
1816 printf("\n%s\n",aline);
1817 if (!pswit[OVERVIEW_SWITCH])
1818 printf(" Line %ld column %d - Query digit in %s\n",
1819 linecnt,(int)(wordstart-aline)+1,inword);
1824 * Put the word through a series of tests for likely typos and OCR
1827 if (pswit[TYPO_SWITCH])
1830 strcpy(testword,inword);
1832 for (i=0;i<(signed int)strlen(testword);i++)
1834 /* lowercase for testing */
1835 if (testword[i]>='a' && testword[i]<='z')
1837 if (alower && testword[i]>='A' && testword[i]<='Z')
1840 * We have an uppercase mid-word. However, there are
1842 * Mac and Mc like McGill
1843 * French contractions like l'Abbe
1845 if (i==2 && testword[0]=='m' && testword[1]=='c' ||
1846 i==3 && testword[0]=='m' && testword[1]=='a' &&
1847 testword[2]=='c' || i>0 && testword[i-1]==CHAR_SQUOTE)
1852 testword[i]=(char)tolower(testword[i]);
1855 * Check for certain unlikely two-letter combinations at word
1858 if (strlen(testword)>1)
1860 for (i=0;*nostart[i];i++)
1861 if (!strncmp(testword,nostart[i],2))
1863 for (i=0;*noend[i];i++)
1864 if (!strncmp(testword+strlen(testword)-2,noend[i],2))
1867 /* ght is common, gbt never. Like that. */
1868 if (strstr(testword,"cb"))
1870 if (strstr(testword,"gbt"))
1872 if (strstr(testword,"pbt"))
1874 if (strstr(testword,"tbs"))
1876 if (strstr(testword,"mrn"))
1878 if (strstr(testword,"ahle"))
1880 if (strstr(testword,"ihle"))
1883 * "TBE" does happen - like HEARTBEAT - but uncommon.
1884 * Also "TBI" - frostbite, outbid - but uncommon.
1885 * Similarly "ii" like Hawaii, or Pompeii, and in Roman
1886 * numerals, but "ii" is a common scanno.
1888 if (strstr(testword,"tbi"))
1890 if (strstr(testword,"tbe"))
1892 if (strstr(testword,"ii"))
1895 * Check for no vowels or no consonants.
1896 * If none, flag a typo.
1898 if (!istypo && strlen(testword)>1)
1901 for (i=0;testword[i];i++)
1903 if (testword[i]=='y' || gcisdigit(testword[i]))
1905 /* Yah, this is loose. */
1909 else if (strchr(vowels,testword[i]))
1914 if (!vowel || !consonant)
1918 * Now exclude the word from being reported if it's in
1921 for (i=0;*okword[i];i++)
1922 if (!strcmp(testword,okword[i]))
1925 * What looks like a typo may be a Roman numeral.
1928 if (istypo && isroman(testword))
1930 /* Check the manual list of typos. */
1932 for (i=0;*typo[i];i++)
1933 if (!strcmp(testword,typo[i]))
1936 * Check lowercase s, l, i and m - special cases.
1937 * "j" - often a semi-colon gone wrong.
1938 * "d" for a missing apostrophe - he d
1941 if (!istypo && strlen(testword)==1 && strchr("slmijdn",*inword))
1946 if (strlen(testword)<MAX_QWORD_LENGTH &&
1947 !pswit[VERBOSE_SWITCH])
1948 for (i=0;i<qword_index;i++)
1949 if (!strcmp(testword,qword[i]))
1956 if (qword_index<MAX_QWORD &&
1957 strlen(testword)<MAX_QWORD_LENGTH)
1959 strcpy(qword[qword_index],testword);
1962 if (pswit[ECHO_SWITCH])
1963 printf("\n%s\n",aline);
1964 if (!pswit[OVERVIEW_SWITCH])
1966 printf(" Line %ld column %d - Query word %s",
1967 linecnt,(int)(wordstart-aline)+1,inword);
1968 if (strlen(testword)<MAX_QWORD_LENGTH &&
1969 !pswit[VERBOSE_SWITCH])
1970 printf(" - not reporting duplicates");
1978 /* check the user's list of typos */
1979 if (!istypo && usertypo_count)
1980 for (i=0;i<usertypo_count;i++)
1981 if (!strcmp(testword,usertypo[i]))
1983 if (pswit[ECHO_SWITCH])
1984 printf("\n%s\n",aline);
1985 if (!pswit[OVERVIEW_SWITCH])
1986 printf(" Line %ld column %d - "
1987 "Query possible scanno %s\n",
1988 linecnt,(int)(wordstart-aline)+2,inword);
1990 if (pswit[PARANOID_SWITCH] && warnings->digit)
1992 /* In paranoid mode, query all 0 and 1 standing alone. */
1993 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
1995 if (pswit[ECHO_SWITCH])
1996 printf("\n%s\n",aline);
1997 if (!pswit[OVERVIEW_SWITCH])
1998 printf(" Line %ld column %d - Query standalone %s\n",
1999 linecnt,(int)(wordstart-aline)+2,inword);
2006 * Look for added or missing spaces around punctuation and quotes.
2007 * If there is a punctuation character like ! with no space on
2008 * either side, suspect a missing!space. If there are spaces on
2009 * both sides , assume a typo. If we see a double quote with no
2010 * space or punctuation on either side of it, assume unspaced
2011 * quotes "like"this.
2014 for (i=1;i<llen;i++)
2016 /* For each character in the line after the first. */
2017 if (strchr(".?!,;:_",aline[i])) /* if it's punctuation */
2019 /* we need to suppress warnings for acronyms like M.D. */
2021 /* we need to suppress warnings for ellipsis . . . */
2023 /* if there are letters on both sides of it or ... */
2024 if (gcisalpha(aline[i-1]) && gcisalpha(aline[i+1]) ||
2025 gcisalpha(aline[i+1]) && strchr("?!,;:",aline[i]))
2027 /* ...if it's strict punctuation followed by an alpha */
2030 if (i>2 && aline[i-2]=='.')
2032 if (i+2<llen && aline[i+2]=='.')
2037 if (pswit[ECHO_SWITCH])
2038 printf("\n%s\n",aline);
2039 if (!pswit[OVERVIEW_SWITCH])
2040 printf(" Line %ld column %d - Missing space?\n",
2046 if (aline[i-1]==CHAR_SPACE &&
2047 (aline[i+1]==CHAR_SPACE || aline[i+1]==0))
2050 * If there are spaces on both sides,
2051 * or space before and end of line.
2055 if (i>2 && aline[i-2]=='.')
2057 if (i+2<llen && aline[i+2]=='.')
2060 if (!isemptyline && !isellipsis)
2062 if (pswit[ECHO_SWITCH])
2063 printf("\n%s\n",aline);
2064 if (!pswit[OVERVIEW_SWITCH])
2065 printf(" Line %ld column %d - "
2066 "Spaced punctuation?\n",linecnt,i+1);
2073 /* Split out the characters that CANNOT be preceded by space. */
2075 for (i=1;i<llen;i++)
2077 /* for each character in the line after the first */
2078 if (strchr("?!,;:",aline[i]))
2080 /* if it's punctuation that _cannot_ have a space before it */
2081 if (aline[i-1]==CHAR_SPACE && !isemptyline &&
2082 aline[i+1]!=CHAR_SPACE)
2085 * If aline[i+1) DOES == space,
2086 * it was already reported just above.
2088 if (pswit[ECHO_SWITCH])
2089 printf("\n%s\n",aline);
2090 if (!pswit[OVERVIEW_SWITCH])
2091 printf(" Line %ld column %d - Spaced punctuation?\n",
2099 * Special case " .X" where X is any alpha.
2100 * This plugs a hole in the acronym code above.
2101 * Inelegant, but maintainable.
2104 for (i=1;i<llen;i++)
2106 /* for each character in the line after the first */
2109 /* if it's a period */
2110 if (aline[i-1]==CHAR_SPACE && gcisalpha(aline[i+1]))
2113 * If the period follows a space and
2114 * is followed by a letter.
2116 if (pswit[ECHO_SWITCH])
2117 printf("\n%s\n",aline);
2118 if (!pswit[OVERVIEW_SWITCH])
2119 printf(" Line %ld column %d - Spaced punctuation?\n",
2126 for (i=1;i<llen;i++)
2128 /* for each character in the line after the first */
2129 if (aline[i]==CHAR_DQUOTE)
2131 if (!strchr(" _-.'`,;:!/([{?}])",aline[i-1]) &&
2132 !strchr(" _-.'`,;:!/([{?}])",aline[i+1]) && aline[i+1] ||
2133 !strchr(" _-([{'`",aline[i-1]) && gcisalpha(aline[i+1]))
2135 if (pswit[ECHO_SWITCH])
2136 printf("\n%s\n",aline);
2137 if (!pswit[OVERVIEW_SWITCH])
2138 printf(" Line %ld column %d - Unspaced quotes?\n",
2145 /* Check parity of quotes. */
2146 for (s=aline;*s;s++)
2148 if (*s==CHAR_DQUOTE)
2150 if (!(dquotepar=!dquotepar))
2153 if (!strchr("_-.'`/,;:!?)]} ",s[1]))
2155 if (pswit[ECHO_SWITCH])
2156 printf("\n%s\n",aline);
2157 if (!pswit[OVERVIEW_SWITCH])
2158 printf(" Line %ld column %d - "
2159 "Wrongspaced quotes?\n",linecnt,(int)(s-aline)+1);
2167 if (!gcisalpha(s[1]) && !isdigit(s[1]) &&
2168 !strchr("_-/.'`([{$",s[1]) || !s[1])
2170 if (pswit[ECHO_SWITCH])
2171 printf("\n%s\n",aline);
2172 if (!pswit[OVERVIEW_SWITCH])
2173 printf(" Line %ld column %d - "
2174 "Wrongspaced quotes?\n",linecnt,(int)(s-aline)+1);
2181 if (*aline==CHAR_DQUOTE)
2183 if (strchr(",;:!?)]} ",aline[1]))
2185 if (pswit[ECHO_SWITCH])
2186 printf("\n%s\n",aline);
2187 if (!pswit[OVERVIEW_SWITCH])
2188 printf(" Line %ld column 1 - Wrongspaced quotes?\n",
2194 if (pswit[SQUOTE_SWITCH])
2196 for (s=aline;*s;s++)
2198 if ((*s==CHAR_SQUOTE || *s==CHAR_OPEN_SQUOTE) &&
2199 (s==aline || s>aline && !gcisalpha(s[-1]) ||
2202 if (!(squotepar=!squotepar))
2205 if (!strchr("_-.'`/\",;:!?)]} ",s[1]))
2207 if (pswit[ECHO_SWITCH])
2208 printf("\n%s\n",aline);
2209 if (!pswit[OVERVIEW_SWITCH])
2210 printf(" Line %ld column %d - "
2211 "Wrongspaced singlequotes?\n",
2212 linecnt,(int)(s-aline)+1);
2220 if (!gcisalpha(s[1]) && !isdigit(s[1]) &&
2221 !strchr("_-/\".'`",s[1]) || !s[1])
2223 if (pswit[ECHO_SWITCH])
2224 printf("\n%s\n",aline);
2225 if (!pswit[OVERVIEW_SWITCH])
2226 printf(" Line %ld column %d - "
2227 "Wrongspaced singlequotes?\n",
2228 linecnt,(int)(s-aline)+1);
2237 * Look for double punctuation like ,. or ,,
2238 * Thanks to DW for the suggestion!
2239 * In books with references, ".," and ".;" are common
2240 * e.g. "etc., etc.," and vol. 1.; vol 3.;
2241 * OTOH, from my initial tests, there are also fairly
2242 * common errors. What to do? Make these cases paranoid?
2243 * ".," is the most common, so warnings->dotcomma is used
2244 * to suppress detailed reporting if it occurs often.
2247 for (i=0;i<llen;i++)
2249 /* for each punctuation character in the line */
2250 if (strchr(".?!,;:",aline[i]) && (strchr(".?!,;:",aline[i+1])) &&
2251 aline[i] && aline[i+1])
2253 /* followed by punctuation, it's a query, unless . . . */
2254 if (aline[i]==aline[i+1] && (aline[i]=='.' || aline[i]=='?' ||
2256 !warnings->dotcomma && aline[i]=='.' && aline[i+1]==',' ||
2257 warnings->isFrench && !strncmp(aline+i,",...",4) ||
2258 warnings->isFrench && !strncmp(aline+i,"...,",4) ||
2259 warnings->isFrench && !strncmp(aline+i,";...",4) ||
2260 warnings->isFrench && !strncmp(aline+i,"...;",4) ||
2261 warnings->isFrench && !strncmp(aline+i,":...",4) ||
2262 warnings->isFrench && !strncmp(aline+i,"...:",4) ||
2263 warnings->isFrench && !strncmp(aline+i,"!...",4) ||
2264 warnings->isFrench && !strncmp(aline+i,"...!",4) ||
2265 warnings->isFrench && !strncmp(aline+i,"?...",4) ||
2266 warnings->isFrench && !strncmp(aline+i,"...?",4))
2268 if (warnings->isFrench && !strncmp(aline+i,",...",4) ||
2269 warnings->isFrench && !strncmp(aline+i,"...,",4) ||
2270 warnings->isFrench && !strncmp(aline+i,";...",4) ||
2271 warnings->isFrench && !strncmp(aline+i,"...;",4) ||
2272 warnings->isFrench && !strncmp(aline+i,":...",4) ||
2273 warnings->isFrench && !strncmp(aline+i,"...:",4) ||
2274 warnings->isFrench && !strncmp(aline+i,"!...",4) ||
2275 warnings->isFrench && !strncmp(aline+i,"...!",4) ||
2276 warnings->isFrench && !strncmp(aline+i,"?...",4) ||
2277 warnings->isFrench && !strncmp(aline+i,"...?",4))
2279 ; /* do nothing for .. !! and ?? which can be legit */
2283 if (pswit[ECHO_SWITCH])
2284 printf("\n%s\n",aline);
2285 if (!pswit[OVERVIEW_SWITCH])
2286 printf(" Line %ld column %d - Double punctuation?\n",
2294 while (strstr(s," \" "))
2296 if (pswit[ECHO_SWITCH])
2297 printf("\n%s\n",aline);
2298 if (!pswit[OVERVIEW_SWITCH])
2299 printf(" Line %ld column %d - Spaced doublequote?\n",
2300 linecnt,(int)(strstr(s," \" ")-aline+1));
2303 s=strstr(s," \" ")+2;
2306 while (strstr(s," ' "))
2308 if (pswit[ECHO_SWITCH])
2309 printf("\n%s\n",aline);
2310 if (!pswit[OVERVIEW_SWITCH])
2311 printf(" Line %ld column %d - Spaced singlequote?\n",
2312 linecnt,(int)(strstr(s," ' ")-aline+1));
2315 s=strstr(s," ' ")+2;
2318 while (strstr(s," ` "))
2320 if (pswit[ECHO_SWITCH])
2321 printf("\n%s\n",aline);
2322 if (!pswit[OVERVIEW_SWITCH])
2323 printf(" Line %ld column %d - Spaced singlequote?\n",
2324 linecnt,(int)(strstr(s," ` ")-aline+1));
2327 s=strstr(s," ` ")+2;
2329 /* check special case of 'S instead of 's at end of word */
2333 if (*s==CHAR_SQUOTE && s[1]=='S' && s[-1]>='a' && s[-1]<='z')
2335 if (pswit[ECHO_SWITCH])
2336 printf("\n%s\n",aline);
2337 if (!pswit[OVERVIEW_SWITCH])
2338 printf(" Line %ld column %d - Capital \"S\"?\n",
2339 linecnt,(int)(s-aline+2));
2346 * Now check special cases - start and end of line -
2347 * for single and double quotes. Start is sometimes [sic]
2348 * but better to query it anyway.
2349 * While we're here, check for dash at end of line.
2354 if (aline[llen-1]==CHAR_DQUOTE || aline[llen-1]==CHAR_SQUOTE ||
2355 aline[llen-1]==CHAR_OPEN_SQUOTE)
2356 if (aline[llen-2]==CHAR_SPACE)
2358 if (pswit[ECHO_SWITCH])
2359 printf("\n%s\n",aline);
2360 if (!pswit[OVERVIEW_SWITCH])
2361 printf(" Line %ld column %d - Spaced quote?\n",
2366 if ((aline[0]==CHAR_SQUOTE || aline[0]==CHAR_OPEN_SQUOTE) &&
2367 aline[1]==CHAR_SPACE)
2369 if (pswit[ECHO_SWITCH])
2370 printf("\n%s\n",aline);
2371 if (!pswit[OVERVIEW_SWITCH])
2372 printf(" Line %ld column 1 - Spaced quote?\n",linecnt);
2377 * Dash at end of line may well be legit - paranoid mode only
2378 * and don't report em-dash at line-end.
2380 if (pswit[PARANOID_SWITCH] && warnings->hyphen)
2382 for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--)
2384 if (aline[i]=='-' && aline[i-1]!='-')
2386 if (pswit[ECHO_SWITCH])
2387 printf("\n%s\n",aline);
2388 if (!pswit[OVERVIEW_SWITCH])
2389 printf(" Line %ld column %d - "
2390 "Hyphen at end of line?\n",linecnt,i);
2395 * Brackets are often unspaced, but shouldn't be surrounded by alpha.
2396 * If so, suspect a scanno like "a]most".
2399 for (i=1;i<llen-1;i++)
2401 /* for each bracket character in the line except 1st & last */
2402 if (strchr("{[()]}",aline[i]) && gcisalpha(aline[i-1]) &&
2403 gcisalpha(aline[i+1]))
2405 if (pswit[ECHO_SWITCH])
2406 printf("\n%s\n",aline);
2407 if (!pswit[OVERVIEW_SWITCH])
2408 printf(" Line %ld column %d - Unspaced bracket?\n",
2415 if (warnings->endquote)
2417 for (i=1;i<llen;i++)
2419 /* for each character in the line except 1st */
2420 if (aline[i]==CHAR_DQUOTE && isalpha(aline[i-1]))
2422 if (pswit[ECHO_SWITCH])
2423 printf("\n%s\n",aline);
2424 if (!pswit[OVERVIEW_SWITCH])
2425 printf(" Line %ld column %d - "
2426 "endquote missing punctuation?\n",linecnt,i);
2433 * Check for <HTML TAG>.
2434 * If there is a < in the line, followed at some point
2435 * by a > then we suspect HTML.
2437 if (strstr(aline,"<") && strstr(aline,">"))
2439 i=(signed int)(strstr(aline,">")-strstr(aline,"<")+1);
2442 strncpy(wrk,strstr(aline,"<"),i);
2444 if (pswit[ECHO_SWITCH])
2445 printf("\n%s\n",aline);
2446 if (!pswit[OVERVIEW_SWITCH])
2447 printf(" Line %ld column %d - HTML Tag? %s \n",
2448 linecnt,(int)(strstr(aline,"<")-aline)+1,wrk);
2454 * Check for &symbol; HTML.
2455 * If there is a & in the line, followed at
2456 * some point by a ; then we suspect HTML.
2458 if (strstr(aline,"&") && strstr(aline,";"))
2460 i=(int)(strstr(aline,";")-strstr(aline,"&")+1);
2461 for (s=strstr(aline,"&");s<strstr(aline,";");s++)
2463 i=0; /* Don't report "Jones & Son;" */
2466 strncpy(wrk,strstr(aline,"&"),i);
2468 if (pswit[ECHO_SWITCH])
2469 printf("\n%s\n",aline);
2470 if (!pswit[OVERVIEW_SWITCH])
2471 printf(" Line %ld column %d - HTML symbol? %s \n",
2472 linecnt,(int)(strstr(aline,"&")-aline)+1,wrk);
2478 * At end of paragraph, check for mismatched quotes.
2479 * We don't want to report an error immediately, since it is a
2480 * common convention to omit the quotes at end of paragraph if
2481 * the next paragraph is a continuation of the same speaker.
2482 * Where this is the case, the next para should begin with a
2483 * quote, so we store the warning message and only display it
2484 * at the top of the next iteration if the new para doesn't
2485 * start with a quote.
2486 * The -p switch overrides this default, and warns of unclosed
2487 * quotes on _every_ paragraph, whether the next begins with a
2492 /* end of para - add up the totals */
2493 if (counters.quot%2)
2494 sprintf(dquote_err," Line %ld - Mismatched quotes\n",
2496 if (pswit[SQUOTE_SWITCH] && counters.open_single_quote &&
2497 counters.open_single_quote!=counters.close_single_quote)
2498 sprintf(squote_err," Line %ld - Mismatched singlequotes?\n",
2500 if (pswit[SQUOTE_SWITCH] && counters.open_single_quote &&
2501 counters.open_single_quote!=counters.close_single_quote &&
2502 counters.open_single_quote!=counters.close_single_quote+1)
2504 * Flag it to be noted regardless of the
2505 * first char of the next para.
2508 if (counters.r_brack)
2509 sprintf(rbrack_err," Line %ld - "
2510 "Mismatched round brackets?\n",linecnt);
2511 if (counters.s_brack)
2512 sprintf(sbrack_err," Line %ld - "
2513 "Mismatched square brackets?\n",linecnt);
2514 if (counters.c_brack)
2515 sprintf(cbrack_err," Line %ld - "
2516 "Mismatched curly brackets?\n",linecnt);
2517 if (counters.c_unders%2)
2518 sprintf(unders_err," Line %ld - Mismatched underscores?\n",
2520 memset(&counters,0,sizeof(counters));
2521 /* let the next iteration know that it's starting a new para */
2525 * Check for omitted punctuation at end of paragraph by working back
2526 * through prevline. DW.
2527 * Need to check this only for "normal" paras.
2528 * So what is a "normal" para?
2529 * Not normal if one-liner (chapter headings, etc.)
2530 * Not normal if doesn't contain at least one lowercase letter
2531 * Not normal if starts with space
2536 for (s=prevline,i=0;*s && !i;s++)
2538 /* use i to indicate the presence of a letter on the line */
2541 * This next "if" is a problem.
2542 * If we say "start_para_line <= linecnt - 1", that includes
2543 * one-line "paragraphs" like chapter heads. Lotsa false positives.
2544 * If we say "start_para_line < linecnt - 1" it doesn't, but then it
2545 * misses genuine one-line paragraphs.
2547 if (i && last.blen>2 && start_para_line<linecnt-1 &&
2548 *prevline>CHAR_SPACE)
2550 for (i=strlen(prevline)-1;
2551 (prevline[i]==CHAR_DQUOTE || prevline[i]==CHAR_SQUOTE) &&
2552 prevline[i]>CHAR_SPACE && i>0;
2557 if (gcisalpha(prevline[i]))
2559 if (pswit[ECHO_SWITCH])
2560 printf("\n%s\n",prevline);
2561 if (!pswit[OVERVIEW_SWITCH])
2562 printf(" Line %ld column %d - "
2563 "No punctuation at para end?\n",
2564 linecnt-1,strlen(prevline));
2569 if (strchr("-.:!([{?}])",prevline[i]))
2574 strcpy(prevline,aline);
2577 if (!pswit[OVERVIEW_SWITCH])
2578 for (i=0;i<MAX_QWORD;i++)
2580 printf("\nNote: Queried word %s was duplicated %d time%s\n",
2581 qword[i],dupcnt[i],"s");
2587 * Get one line from the input stream, checking for
2588 * the existence of exactly one CR/LF line-end per line.
2590 * Returns: a pointer to the line.
/*
 * NOTE(review): summary of what the lines below establish.
 * Input is fetched one character at a time with fgetc(); each result is
 * stored in both c and cint. The read loop is a do/while that repeats
 * while len<maxlen.
 * Three line-end anomalies are diagnosed: LF without a preceding CR,
 * two successive CRs, and CR without LF. All three are acted on only
 * when pswit[LINE_END_SWITCH] is set. Each report echoes the line when
 * pswit[ECHO_SWITCH] is set and prints its detail message unless
 * pswit[OVERVIEW_SWITCH] is set.
 * In these lines lcnt appears only as the line number quoted in those
 * messages.
 * After the loop the line is handed to postprocess_for_HTML() when
 * pswit[MARKUP_SWITCH] is set and to postprocess_for_DP() when
 * pswit[DP_SWITCH] is set.
 */
2592 char *flgets(char *theline,int maxlen,FILE *thefile,long lcnt)
2598 c=cint=fgetc(thefile);
2603 /* either way, it's end of line */
2610 /* Error - a LF without a preceding CR */
2611 if (pswit[LINE_END_SWITCH])
2613 if (pswit[ECHO_SWITCH])
2614 printf("\n%s\n",theline);
2615 if (!pswit[OVERVIEW_SWITCH])
2616 printf(" Line %ld - No CR?\n",lcnt);
2627 /* Error - two successive CRs */
2628 if (pswit[LINE_END_SWITCH])
2630 if (pswit[ECHO_SWITCH])
2631 printf("\n%s\n",theline);
2632 if (!pswit[OVERVIEW_SWITCH])
2633 printf(" Line %ld - Two successive CRs?\n",lcnt);
2642 if (pswit[LINE_END_SWITCH] && isCR)
2644 if (pswit[ECHO_SWITCH])
2645 printf("\n%s\n",theline);
2646 if (!pswit[OVERVIEW_SWITCH])
2647 printf(" Line %ld column %d - CR without LF?\n",
/* fetch the next character before the loop condition is tested */
2657 c=cint=fgetc(thefile);
/*
 * NOTE(review): the loop stops once len reaches maxlen; presumably theline
 * must have room for maxlen characters plus a terminator -- confirm
 * against the callers' buffer sizes.
 */
2658 } while(len<maxlen);
2659 if (pswit[MARKUP_SWITCH])
2660 postprocess_for_HTML(theline);
2661 if (pswit[DP_SWITCH])
2662 postprocess_for_DP(theline);
/*
 * mixdigit:
 *
 * Takes a "word" as a parameter, and checks whether it
 * contains a mixture of alpha and digits. Generally, this is an
 * error, but may not be for cases like 4th or L5 12s. 3d.
 *
 * checkword: NUL-terminated word to examine; it is not modified.
 *
 * Returns: 0 if no error found, 1 if error.
 */
int mixdigit(char *checkword)
{
    int wehaveadigit,wehavealetter,firstdigits,query,wl;
    char *s;
    wehaveadigit=wehavealetter=query=0;
    for (s=checkword;*s;s++)
    {
        if (gcisalpha(*s))
            wehavealetter=1;
        else if (gcisdigit(*s))
            wehaveadigit=1;
    }
    if (wehaveadigit && wehavealetter)
    {
        /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
        query=1;
        wl=(int)strlen(checkword);
        /* count the run of digits the word starts with */
        for (firstdigits=0;gcisdigit(checkword[firstdigits]);firstdigits++)
            ;
        /* digits, ending in st, rd, nd, th of either case */
        if (firstdigits+2==wl && (matchword(checkword+wl-2,"st") ||
          matchword(checkword+wl-2,"rd") || matchword(checkword+wl-2,"nd") ||
          matchword(checkword+wl-2,"th")))
            query=0;
        /* the same, pluralised: 20ths, 3rds */
        if (firstdigits+3==wl && (matchword(checkword+wl-3,"sts") ||
          matchword(checkword+wl-3,"rds") || matchword(checkword+wl-3,"nds") ||
          matchword(checkword+wl-3,"ths")))
            query=0;
        /*
         * The adverbial forms: 1stly, 2ndly. The suffix is four characters
         * long, so the digits must account for all but four of the word;
         * this also guarantees that checkword+wl-4 stays inside the word.
         */
        if (firstdigits+4==wl && (matchword(checkword+wl-4,"stly") ||
          matchword(checkword+wl-4,"rdly") ||
          matchword(checkword+wl-4,"ndly") || matchword(checkword+wl-4,"thly")))
            query=0;
        /* digits, ending in l, L, s or d */
        if (firstdigits+1==wl && (checkword[wl-1]=='l' ||
          checkword[wl-1]=='L' || checkword[wl-1]=='s' || checkword[wl-1]=='d'))
            query=0;
        /*
         * L at the start of a number, representing British pounds, like L500.
         * This is cute. We know the current word is mixeddigit. If the first
         * letter is L, there must be at least one digit following. If both
         * digits and letters follow, we have a genuine error, else we have a
         * capital L followed by digits, and we accept that as a non-error.
         */
        if (checkword[0]=='L' && !mixdigit(checkword+1))
            query=0;
    }
    return query;
}
2726 * Extracts the first/next "word" from the line, and puts
2727 * it into "thisword". A word is defined as one English word unit--or
2728 * at least that's the aim.
2730 * Returns: a pointer to the position in the line where we will start
2731 * looking for the next word.
2733 char *getaword(char *fromline,char *thisword)
2738 for (;!gcisdigit(*fromline) && !gcisalpha(*fromline) && *fromline;
2742 * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
2743 * Especially yucky is the case of L1,000
2744 * This section looks for a pattern of characters including a digit
2745 * followed by a comma or period followed by one or more digits.
2746 * If found, it returns this whole pattern as a word; otherwise we discard
2747 * the results and resume our normal programming.
2750 for (;(gcisdigit(*s) || gcisalpha(*s) || *s==',' || *s=='.') &&
2751 wordlen<MAXWORDLEN;s++)
2753 thisword[wordlen]=*s;
2756 thisword[wordlen]=0;
2757 for (i=1;i<wordlen-1;i++)
2759 if (thisword[i]=='.' || thisword[i]==',')
2761 if (gcisdigit(thisword[i-1]) && gcisdigit(thisword[i-1]))
2768 /* we didn't find a punctuated number - do the regular getword thing */
2770 for (;(gcisdigit(*fromline) || gcisalpha(*fromline) || *fromline=='\'') &&
2771 wordlen<MAXWORDLEN;fromline++)
2773 thisword[wordlen]=*fromline;
2776 thisword[wordlen]=0;
/*
 * matchword:
 *
 * A case-insensitive string matcher.
 *
 * checkfor, thisword: the two NUL-terminated strings to compare.
 *
 * Returns: 1 if the strings have the same length and are equal when
 * case is ignored, 0 otherwise.
 */
int matchword(char *checkfor,char *thisword)
{
    size_t i,len;
    len=strlen(checkfor);
    if (len!=strlen(thisword))
        return 0;
    for (i=0;i<len;i++)
    {
        /*
         * toupper() is only defined for values representable as
         * unsigned char (or EOF), and 8-bit text can hold negative chars.
         */
        if (toupper((unsigned char)checkfor[i])!=
          toupper((unsigned char)thisword[i]))
            return 0;
    }
    return 1;
}
/*
 * lowerit:
 *
 * Lowercase the line.
 *
 * theline: NUL-terminated string, converted in place. Only the letters
 * 'A' to 'Z' are changed; every other character is left as it is.
 */
void lowerit(char *theline)
{
    for (;*theline;theline++)
    {
        if (*theline>='A' && *theline<='Z')
            *theline+='a'-'A';
    }
}
/*
 * isroman:
 *
 * Is this word a Roman Numeral?
 *
 * It doesn't actually validate that the number is a valid Roman Numeral--for
 * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
 * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
 * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
 * expressions thereof, except when it came to taxes. Allow, in this order:
 * any number of M, an optional D, an optional CM, an optional CD, any
 * number of C, an optional XL, an optional XC, an optional L, any number
 * of X, an optional IX, an optional IV, an optional V and any number of I.
 *
 * t: NUL-terminated word. Only lowercase letters are recognised.
 *
 * Returns: 1 if the whole word fits the pattern, 0 if it does not or if
 * t is NULL or empty.
 */
int isroman(char *t)
{
    if (!t || !*t)
        return 0;
    while (*t=='m')
        t++;
    if (*t=='d')
        t++;
    if (*t=='c' && t[1]=='m')
        t+=2;
    if (*t=='c' && t[1]=='d')
        t+=2;
    while (*t=='c')
        t++;
    if (*t=='x' && t[1]=='l')
        t+=2;
    if (*t=='x' && t[1]=='c')
        t+=2;
    if (*t=='l')
        t++;
    while (*t=='x')
        t++;
    if (*t=='i' && t[1]=='x')
        t+=2;
    if (*t=='i' && t[1]=='v')
        t+=2;
    if (*t=='v')
        t++;
    while (*t=='i')
        t++;
    /* it looks like a numeral only if nothing is left over */
    return !*t;
}
/*
 * gcisalpha:
 *
 * A version of isalpha() that is somewhat lenient on 8-bit texts.
 * If we use the standard function, 8-bit accented characters break
 * words, so that tete with accented characters appears to be two words, "t"
 * and "t", with 8-bit characters between them. This causes over-reporting of
 * errors. gcisalpha() recognizes accented letters from the CP1252 (Windows)
 * and ISO-8859-1 character sets, which are the most common PG 8-bit types.
 *
 * Returns: 1 for 'a'-'z', 'A'-'Z', the codes 140, 142, 156, 158 and 159,
 * and every code from 192 up except 208, 215, 222, 240, 247 and 254;
 * 0 for anything else.
 */
int gcisalpha(unsigned char c)
{
    if ((c>='a' && c<='z') || (c>='A' && c<='Z'))
        return 1;
    /* nothing else below 140 counts as a letter */
    if (c<140)
        return 0;
    if (c>=192 && c!=208 && c!=215 && c!=222 && c!=240 && c!=247 && c!=254)
        return 1;
    if (c==140 || c==142 || c==156 || c==158 || c==159)
        return 1;
    return 0;
}
/*
 * gcisdigit:
 *
 * A version of isdigit() that doesn't get confused in 8-bit texts.
 *
 * Returns: 1 if c is one of '0' to '9', 0 otherwise.
 */
int gcisdigit(unsigned char c)
{
    return c>='0' && c<='9';
}
/*
 * gcisletter:
 *
 * A version of isletter() that doesn't get confused in 8-bit texts.
 * NB: this is ISO-8859-1-specific.
 *
 * Returns: 1 for 'A'-'Z', 'a'-'z' and every code from 192 up,
 * 0 otherwise.
 */
int gcisletter(unsigned char c)
{
    return (c>='A' && c<='Z') || (c>='a' && c<='z') || c>=192;
}
/*
 * gcstrchr:
 *
 * Wraps strchr to return NULL if the character being searched for is zero.
 * (Plain strchr would instead return a pointer to the terminator.)
 *
 * s: NUL-terminated string to search.
 * c: character to look for.
 *
 * Returns: a pointer to the first occurrence of c in s, or NULL if c is
 * zero or does not occur.
 */
char *gcstrchr(char *s,char c)
{
    if (!c)
        return NULL;
    return strchr(s,c);
}
2918 * postprocess_for_DP:
2920 * Invoked with the -d switch from flgets().
2921 * It simply "removes" from the line a hard-coded set of common
2922 * DP-specific tags, so that the line passed to the main routine has
2923 * been pre-cleaned of DP markup.
/*
 * NOTE(review): theline is presumably edited in place -- the lines that
 * do the removal are not shown here; confirm.
 */
2925 void postprocess_for_DP(char *theline)
/* walk DPmarkup[] until an entry whose first character is NUL */
2931 for (i=0;*DPmarkup[i];i++)
/* find the first occurrence of this tag in the line, if any */
2933 s=strstr(theline,DPmarkup[i]);
/* t is the first character after the matched tag */
2936 t=s+strlen(DPmarkup[i]);
/* search again from the start of the line for another occurrence */
2944 s=strstr(theline,DPmarkup[i]);
/*
 * postprocess_for_HTML:
 *
 * Invoked with the -m switch from flgets().
 * It simply "removes" from the line a hard-coded set of common
 * HTML tags and "replaces" a hard-coded set of common HTML
 * entities, so that the line passed to the main routine has
 * been pre-cleaned of HTML.
 *
 * theline: NUL-terminated line, handed to losemarkup() and
 * loseentities() repeatedly until each returns NULL.
 */
void postprocess_for_HTML(char *theline)
{
    /* a tag needs both delimiters; don't bother looking otherwise */
    if (strchr(theline,'<') && strchr(theline,'>'))
    {
        while (losemarkup(theline))
            ;
    }
    /* entities are looked for whether or not the line holds any tags */
    while (loseentities(theline))
        ;
}
/*
 * Locates the first '<' and the first '>' in theline, then compares the
 * text following the '<' with each entry of markup[] (walked until an
 * entry whose first character is NUL) using tagcomp(); a zero result
 * from tagcomp() selects that entry.
 * The caller, postprocess_for_HTML(), keeps calling until NULL is returned.
 * NOTE(review): t is the first '>' anywhere in the line, not necessarily
 * one that follows s -- confirm how a '>' ahead of the '<' is handled.
 */
2967 char *losemarkup(char *theline)
2973 s=strstr(theline,"<");
2974 t=strstr(theline,">");
2977 for (i=0;*markup[i];i++)
/* s+1 skips the '<' itself so the comparison starts at the tag name */
2978 if (!tagcomp(s+1,markup[i]))
2991 /* It's an unrecognized <xxx>. */
/*
 * Makes two passes over entities[]: the first looks for each entry's
 * named form (htmlent), the second for its numeric form (htmlnum). Each
 * pass stops at an entry whose string is empty. Where a form is found in
 * theline, the text after it is copied to a malloc'd scratch buffer and
 * the entry's textent is written over the position of the match.
 * The caller, postprocess_for_HTML(), keeps calling until NULL is returned.
 * NOTE(review): writing textent at s presumably relies on the replacement
 * never growing the line beyond its buffer -- confirm against the
 * entities[] table.
 */
2995 char *loseentities(char *theline)
3001 for (i=0;*entities[i].htmlent;i++)
3003 s=strstr(theline,entities[i].htmlent);
/*
 * strlen(s) bytes are enough for the tail plus its terminator, because
 * the tail is shorter than s by the length of the (non-empty) entity.
 */
3006 t=malloc((size_t)strlen(s));
/* save everything that follows the entity */
3009 strcpy(t,s+strlen(entities[i].htmlent));
/* overwrite the entity with its plain-text equivalent */
3010 strcpy(s,entities[i].textent);
3016 for (i=0;*entities[i].htmlnum;i++)
3018 s=strstr(theline,entities[i].htmlnum);
/* same sizing argument as in the first pass */
3021 t=malloc((size_t)strlen(s));
3024 strcpy(t,s+strlen(entities[i].htmlnum));
3025 strcpy(s,entities[i].textent);
/*
 * Compares a tag found in the line (strin) with a known tag (basetag),
 * ignoring case. losemarkup() treats a zero result as a match.
 * NOTE(review): tolower() is given plain char values here; if the line
 * can hold 8-bit characters these may be negative -- confirm.
 */
3034 int tagcomp(char *strin,char *basetag)
3040 t++; /* ignore a slash */
/* the comparison is case-insensitive */
3043 if (tolower(*s)!=tolower(*t))
3053 fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
3054 fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
3055 fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
3056 fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
3057 "For details, read the file COPYING.\n",stderr);
3058 fputs("This is Free Software; "
3059 "you may redistribute it under certain conditions (GPL);\n",stderr);
3060 fputs("read the file COPYING for details.\n\n",stderr);
3061 fputs("Usage is: bookloupe [-setpxloyhud] filename\n",stderr);
3062 fputs(" where -s checks single quotes, -e suppresses echoing lines, "
3063 "-t checks typos\n",stderr);
3064 fputs(" -x (paranoid) switches OFF -t and extra checks, "
3065 "-l turns OFF line-end checks\n",stderr);
3066 fputs(" -o just displays overview without detail, "
3067 "-h echoes header fields\n",stderr);
3068 fputs(" -v (verbose) unsuppresses duplicate reporting, "
3069 "-m suppresses markup\n",stderr);
3070 fputs(" -d ignores DP-specific markup,\n",stderr);
3071 fputs(" -u uses a file gutcheck.typ to query user-defined "
3072 "possible typos\n",stderr);
3073 fputs("Sample usage: bookloupe warpeace.txt \n",stderr);
3075 fputs("Bookloupe looks for errors in Project Gutenberg(TM) etexts.\n",
3077 fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
3078 "non-ASCII\n",stderr);
3079 fputs("characters like accented letters, "
3080 "lines longer than 75 or shorter than 55,\n",stderr);
3081 fputs("unbalanced quotes or brackets, "
3082 "a variety of badly formatted punctuation, \n",stderr);
3083 fputs("HTML tags, some likely typos. "
3084 "It is NOT a substitute for human judgement.\n",stderr);