1 /*************************************************************************/
2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com> */
5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk> */
7 /* This program is free software; you can redistribute it and/or modify */
8 /* it under the terms of the GNU General Public License as published by */
9 /* the Free Software Foundation; either version 2 of the License, or */
10 /* (at your option) any later version. */
12 /* This program is distributed in the hope that it will be useful, */
13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
15 /* GNU General Public License for more details. */
17 /* You should have received a copy of the GNU General Public License */
18 /* along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 /*************************************************************************/
26 #define MAXWORDLEN 80 /* max length of one word */
27 #define LINEBUFSIZE 2048 /* buffer size for an input line */
29 #define MAX_USER_TYPOS 1000
30 #define USERTYPO_FILE "gutcheck.typ"
33 #define MAX_PATH 16384
36 char aline[LINEBUFSIZE];
37 char prevline[LINEBUFSIZE];
41 "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
42 "nad", "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa",
43 "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
44 "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
45 "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
46 "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
47 "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
48 "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
49 "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
50 "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
51 "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
52 "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
53 "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
54 "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
55 "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
56 "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
57 "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
58 "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
59 "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
60 "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
61 "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
62 "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
63 "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
64 "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
65 "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
66 "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
67 "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
68 "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
69 "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
73 char *usertypo[MAX_USER_TYPOS];
75 /* Common abbreviations and other OK words not to query as typos. */
77 "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
78 "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
79 "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
80 "outbid", "outbids", "frostbite", "frostbitten", ""
83 /* Common abbreviations that cause otherwise unexplained periods. */
85 "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
86 "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
90 * Two-Letter combinations that rarely if ever start words,
91 * but are common scannos or otherwise common letter combinations.
94 "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
98 * Two-Letter combinations that rarely if ever end words,
99 * but are common scannos or otherwise common letter combinations.
102 "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
103 "sw", "gr", "sl", "cl", "iy", ""
107 "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
108 "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
109 "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
110 "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
114 "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
118 "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
119 "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
120 "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
121 "during", "let", "toward", "among", ""
125 "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
126 "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
127 "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
128 "among", "those", "into", "whom", "having", "thence", ""
131 char vowels[] = "aeiouàáâãäæèéêëìíîïòóôõöùúûü";
138 "&", "&", "&",
139 "<", "<", "<",
140 ">", ">", ">",
141 "°", "°", " degrees",
142 "£", "£", "L",
143 """, """, "\"", /* quotation mark = APL quote */
144 "Œ", "Œ", "OE", /* latin capital ligature OE */
145 "œ", "œ", "oe", /* latin small ligature oe */
146 "Š", "Š", "S", /* latin capital letter S with caron */
147 "š", "š", "s", /* latin small letter s with caron */
148 "Ÿ", "Ÿ", "Y", /* latin capital letter Y with diaeresis */
149 "ˆ", "ˆ", "", /* modifier letter circumflex accent */
150 "˜", "˜", "~", /* small tilde, U+02DC ISOdia */
151 " ", " ", " ", /* en space, U+2002 ISOpub */
152 " ", " ", " ", /* em space, U+2003 ISOpub */
153 " ", " ", " ", /* thin space, U+2009 ISOpub */
154 "–", "–", "-", /* en dash, U+2013 ISOpub */
155 "—", "—", "--", /* em dash, U+2014 ISOpub */
156 "’", "’", "'", /* right single quotation mark */
157 "‚", "‚", "'", /* single low-9 quotation mark */
158 "“", "“", "\"", /* left double quotation mark */
159 "”", "”", "\"", /* right double quotation mark */
160 "„", "„", "\"", /* double low-9 quotation mark */
161 "‹", "‹", "\"", /* single left-pointing angle quotation mark */
162 "›", "›", "\"", /* single right-pointing angle quotation mark */
163 " ", " ", " ", /* no-break space = non-breaking space, */
164 "¡", "¡", "!", /* inverted exclamation mark */
165 "¢", "¢", "c", /* cent sign */
166 "£", "£", "L", /* pound sign */
167 "¤", "¤", "$", /* currency sign */
168 "¥", "¥", "Y", /* yen sign = yuan sign */
169 "§", "§", "--", /* section sign */
170 "¨", "¨", " ", /* diaeresis = spacing diaeresis */
171 "©", "©", "(C) ", /* copyright sign */
172 "ª", "ª", " ", /* feminine ordinal indicator */
173 "«", "«", "\"", /* left-pointing double angle quotation mark */
174 "­", "­", "-", /* soft hyphen = discretionary hyphen */
175 "®", "®", "(R) ", /* registered sign = registered trade mark sign */
176 "¯", "¯", " ", /* macron = spacing macron = overline */
177 "°", "°", " degrees", /* degree sign */
178 "±", "±", "+-", /* plus-minus sign = plus-or-minus sign */
179 "²", "²", "2", /* superscript two = superscript digit two */
180 "³", "³", "3", /* superscript three = superscript digit three */
181 "´", "´", " ", /* acute accent = spacing acute */
182 "µ", "µ", "m", /* micro sign */
183 "¶", "¶", "--", /* pilcrow sign = paragraph sign */
184 "¸", "¸", " ", /* cedilla = spacing cedilla */
185 "¹", "¹", "1", /* superscript one = superscript digit one */
186 "º", "º", " ", /* masculine ordinal indicator */
187 "»", "»", "\"", /* right-pointing double angle quotation mark */
188 "¼", "¼", "1/4", /* vulgar fraction one quarter */
189 "½", "½", "1/2", /* vulgar fraction one half */
190 "¾", "¾", "3/4", /* vulgar fraction three quarters */
191 "¿", "¿", "?", /* inverted question mark */
192 "À", "À", "A", /* latin capital letter A with grave */
193 "Á", "Á", "A", /* latin capital letter A with acute */
194 "Â", "Â", "A", /* latin capital letter A with circumflex */
195 "Ã", "Ã", "A", /* latin capital letter A with tilde */
196 "Ä", "Ä", "A", /* latin capital letter A with diaeresis */
197 "Å", "Å", "A", /* latin capital letter A with ring above */
198 "Æ", "Æ", "AE", /* latin capital letter AE */
199 "Ç", "Ç", "C", /* latin capital letter C with cedilla */
200 "È", "È", "E", /* latin capital letter E with grave */
201 "É", "É", "E", /* latin capital letter E with acute */
202 "Ê", "Ê", "E", /* latin capital letter E with circumflex */
203 "Ë", "Ë", "E", /* latin capital letter E with diaeresis */
204 "Ì", "Ì", "I", /* latin capital letter I with grave */
205 "Í", "Í", "I", /* latin capital letter I with acute */
206 "Î", "Î", "I", /* latin capital letter I with circumflex */
207 "Ï", "Ï", "I", /* latin capital letter I with diaeresis */
208 "Ð", "Ð", "E", /* latin capital letter ETH */
209 "Ñ", "Ñ", "N", /* latin capital letter N with tilde */
210 "Ò", "Ò", "O", /* latin capital letter O with grave */
211 "Ó", "Ó", "O", /* latin capital letter O with acute */
212 "Ô", "Ô", "O", /* latin capital letter O with circumflex */
213 "Õ", "Õ", "O", /* latin capital letter O with tilde */
214 "Ö", "Ö", "O", /* latin capital letter O with diaeresis */
215 "×", "×", "*", /* multiplication sign */
216 "Ø", "Ø", "O", /* latin capital letter O with stroke */
217 "Ù", "Ù", "U", /* latin capital letter U with grave */
218 "Ú", "Ú", "U", /* latin capital letter U with acute */
219 "Û", "Û", "U", /* latin capital letter U with circumflex */
220 "Ü", "Ü", "U", /* latin capital letter U with diaeresis */
221 "Ý", "Ý", "Y", /* latin capital letter Y with acute */
222 "Þ", "Þ", "TH", /* latin capital letter THORN */
223 "ß", "ß", "sz", /* latin small letter sharp s = ess-zed */
224 "à", "à", "a", /* latin small letter a with grave */
225 "á", "á", "a", /* latin small letter a with acute */
226 "â", "â", "a", /* latin small letter a with circumflex */
227 "ã", "ã", "a", /* latin small letter a with tilde */
228 "ä", "ä", "a", /* latin small letter a with diaeresis */
229 "å", "å", "a", /* latin small letter a with ring above */
230 "æ", "æ", "ae", /* latin small letter ae */
231 "ç", "ç", "c", /* latin small letter c with cedilla */
232 "è", "è", "e", /* latin small letter e with grave */
233 "é", "é", "e", /* latin small letter e with acute */
234 "ê", "ê", "e", /* latin small letter e with circumflex */
235 "ë", "ë", "e", /* latin small letter e with diaeresis */
236 "ì", "ì", "i", /* latin small letter i with grave */
237 "í", "í", "i", /* latin small letter i with acute */
238 "î", "î", "i", /* latin small letter i with circumflex */
239 "ï", "ï", "i", /* latin small letter i with diaeresis */
240 "ð", "ð", "eth", /* latin small letter eth */
241 "ñ", "ñ", "n", /* latin small letter n with tilde */
242 "ò", "ò", "o", /* latin small letter o with grave */
243 "ó", "ó", "o", /* latin small letter o with acute */
244 "ô", "ô", "o", /* latin small letter o with circumflex */
245 "õ", "õ", "o", /* latin small letter o with tilde */
246 "ö", "ö", "o", /* latin small letter o with diaeresis */
247 "÷", "÷", "/", /* division sign */
248 "ø", "ø", "o", /* latin small letter o with stroke */
249 "ù", "ù", "u", /* latin small letter u with grave */
250 "ú", "ú", "u", /* latin small letter u with acute */
251 "û", "û", "u", /* latin small letter u with circumflex */
252 "ü", "ü", "u", /* latin small letter u with diaeresis */
253 "ý", "ý", "y", /* latin small letter y with acute */
254 "þ", "þ", "th", /* latin small letter thorn */
255 "ÿ", "ÿ", "y", /* latin small letter y with diaeresis */
259 /* special characters */
260 #define CHAR_SPACE 32
264 #define CHAR_DQUOTE 34
265 #define CHAR_SQUOTE 39
266 #define CHAR_OPEN_SQUOTE 96
267 #define CHAR_TILDE 126
268 #define CHAR_ASTERISK 42
269 #define CHAR_FORESLASH 47
270 #define CHAR_CARAT 94
272 #define CHAR_UNDERSCORE '_'
273 #define CHAR_OPEN_CBRACK '{'
274 #define CHAR_CLOSE_CBRACK '}'
275 #define CHAR_OPEN_RBRACK '('
276 #define CHAR_CLOSE_RBRACK ')'
277 #define CHAR_OPEN_SBRACK '['
278 #define CHAR_CLOSE_SBRACK ']'
280 /* longest and shortest normal PG line lengths */
281 #define LONGEST_PG_LINE 75
282 #define WAY_TOO_LONG 80
283 #define SHORTEST_PG_LINE 55
285 #define SWITCHES "ESTPXLOYHWVMUD" /* switches:- */
286 /* D - ignore DP-specific markup */
287 /* E - echo queried line */
288 /* S - check single quotes */
289 /* T - check common typos */
290 /* P - require closure of quotes on */
291 /* every paragraph */
292 /* X - "Trust no one" :-) Paranoid! */
293 /* Queries everything */
294 /* L - line end checking defaults on */
295 /* -L turns it off */
296 /* O - overview. Just shows counts. */
297 /* Y - puts errors to stdout */
298 /* instead of stderr */
299 /* H - Echoes header fields */
300 /* M - Ignore markup in < > */
301 /* U - Use file of User-defined Typos */
302 /* W - Defaults for use on Web upload */
303 /* V - Verbose - list EVERYTHING! */
304 #define SWITNO 14 /* max number of switch parms */
305 /* - used for defining array-size */
306 #define MINARGS 1 /* minimum no of args excl switches */
307 #define MAXARGS 1 /* maximum no of args excl switches */
309 int pswit[SWITNO]; /* program switches set by SWITCHES */
311 #define ECHO_SWITCH 0
312 #define SQUOTE_SWITCH 1
313 #define TYPO_SWITCH 2
314 #define QPARA_SWITCH 3
315 #define PARANOID_SWITCH 4
316 #define LINE_END_SWITCH 5
317 #define OVERVIEW_SWITCH 6
318 #define STDOUT_SWITCH 7
319 #define HEADER_SWITCH 8
321 #define VERBOSE_SWITCH 10
322 #define MARKUP_SWITCH 11
323 #define USERTYPO_SWITCH 12
326 long cnt_dquot; /* for overview mode, count of doublequote queries */
327 long cnt_squot; /* for overview mode, count of singlequote queries */
328 long cnt_brack; /* for overview mode, count of brackets queries */
329 long cnt_bin; /* for overview mode, count of non-ASCII queries */
330 long cnt_odd; /* for overview mode, count of odd character queries */
331 long cnt_long; /* for overview mode, count of long line errors */
332 long cnt_short; /* for overview mode, count of short line queries */
333 long cnt_punct; /* for overview mode,
334 count of punctuation and spacing queries */
335 long cnt_dash; /* for overview mode, count of dash-related queries */
336 long cnt_word; /* for overview mode, count of word queries */
337 long cnt_html; /* for overview mode, count of html queries */
338 long cnt_lineend; /* for overview mode, count of line-end queries */
339 long cnt_spacend; /* count of lines with space at end */
340 long linecnt; /* count of total lines in the file */
341 long checked_linecnt; /* count of lines actually checked */
344 void procfile(char *);
346 #define LOW_THRESHOLD 0
347 #define HIGH_THRESHOLD 1
353 #define FIRST_OF_PAIR 0
354 #define SECOND_OF_PAIR 1
356 #define MAX_WORDPAIR 1000
358 char running_from[MAX_PATH];
360 int mixdigit(char *);
361 const char *getaword(const char *,char *);
362 int matchword(char *,char *);
363 char *flgets(char *,int,FILE *,long);
364 void lowerit(char *);
365 int gcisalpha(unsigned char);
366 int gcisdigit(unsigned char);
367 int gcisletter(unsigned char);
368 char *gcstrchr(char *s,char c);
369 void postprocess_for_HTML(char *);
370 char *linehasmarkup(char *);
371 char *losemarkup(char *);
372 int tagcomp(char *,char *);
373 char *loseentities(char *);
376 void postprocess_for_DP(char *);
378 char wrk[LINEBUFSIZE];
381 #define MAX_QWORD_LENGTH 40
382 char qword[MAX_QWORD][MAX_QWORD_LENGTH];
383 int dupcnt[MAX_QWORD];
385 struct first_pass_results {
386 long firstline,astline;
387 long footerline,totlen,binlen,alphalen,endquote_count,shortline,dotcomma;
388 long fslashline,hyphens,longline,verylongline,htmcount,standalone_digit;
389 long spacedash,emdash,space_emdash,non_PG_space_emdash,PG_space_emdash;
390 int Dutchcount,Frenchcount;
394 int shortline,longline,bin,dash,dotcomma,ast,fslash,digit,hyphen;
395 int endquote,isDutch,isFrench;
400 int c_unders,c_brack,s_brack,r_brack;
401 int open_single_quote,close_single_quote;
404 struct line_properties {
405 unsigned int len,blen;
414 char dquote[80],squote[80],rbrack[80],sbrack[80],cbrack[80],unders[80];
418 int main(int argc,char **argv)
422 char usertypo_file[MAX_PATH];
424 if (strlen(argv[0])<sizeof(running_from))
425 /* save the path to the executable */
426 strcpy(running_from,argv[0]);
427 /* find out what directory we're running from */
428 s=running_from+strlen(running_from);
429 for (;*s!='/' && *s!='\\' && s>=running_from;s--)
431 switno=strlen(SWITCHES);
432 for (i=switno;--i>0;)
433 pswit[i]=0; /* initialise switches */
435 * Standard loop to extract switches.
436 * When we come out of this loop, the arguments will be
437 * in argv[0] upwards and the switches used will be
438 * represented by their equivalent elements in pswit[]
440 while (--argc>0 && **++argv=='-')
441 for (argsw=argv[0]+1;*argsw!='\0';argsw++)
442 for (i=switno,invarg=1;(--i>=0) && invarg==1;)
443 if ((toupper(*argsw))==SWITCHES[i])
448 /* Paranoid checking is turned OFF, not on, by its switch */
449 pswit[PARANOID_SWITCH]^=1;
450 if (pswit[PARANOID_SWITCH])
451 /* in paranoid mode, flip the typo switch too: this toggles it, not forces it on */
452 pswit[TYPO_SWITCH]=pswit[TYPO_SWITCH]^1;
453 /* Line-end checking is turned OFF, not on, by its switch */
454 pswit[LINE_END_SWITCH]^=1;
455 /* Echoing is turned OFF, not on, by its switch */
456 pswit[ECHO_SWITCH]^=1;
457 if (pswit[OVERVIEW_SWITCH])
458 /* just print summary; don't echo */
459 pswit[ECHO_SWITCH]=0;
461 * Web uploads - for the moment, this is really just a placeholder
462 * until we decide what processing we really want to do on web uploads
464 if (pswit[WEB_SWITCH])
466 /* specific override for web uploads */
467 pswit[ECHO_SWITCH]=1;
468 pswit[SQUOTE_SWITCH]=0;
469 pswit[TYPO_SWITCH]=1;
470 pswit[QPARA_SWITCH]=0;
471 pswit[PARANOID_SWITCH]=1;
472 pswit[LINE_END_SWITCH]=0;
473 pswit[OVERVIEW_SWITCH]=0;
474 pswit[STDOUT_SWITCH]=0;
475 pswit[HEADER_SWITCH]=1;
476 pswit[VERBOSE_SWITCH]=0;
477 pswit[MARKUP_SWITCH]=0;
478 pswit[USERTYPO_SWITCH]=0;
481 if (argc<MINARGS || argc>MAXARGS)
483 /* check number of args */
487 /* read in the user-defined stealth scanno list */
488 if (pswit[USERTYPO_SWITCH])
490 /* ... we were told we had one! */
491 usertypofile=fopen(USERTYPO_FILE,"rb");
494 /* not in cwd. try executable directory. */
495 strcpy(usertypo_file,running_from);
496 strcat(usertypo_file,USERTYPO_FILE);
497 usertypofile=fopen(usertypo_file,"rb");
499 /* we ain't got no user typo file! */
500 printf(" --> I couldn't find gutcheck.typ "
501 "-- proceeding without user typos.\n");
507 /* we managed to open a User Typo File! */
508 if (pswit[USERTYPO_SWITCH])
510 while (flgets(aline,LINEBUFSIZE-1,usertypofile,
511 (long)usertypo_count))
517 s=malloc(strlen(aline)+1);
520 fprintf(stderr,"bookloupe: cannot get enough "
521 "memory for user typo file!\n");
525 usertypo[usertypo_count]=s;
527 if (usertypo_count>=MAX_USER_TYPOS)
529 printf(" --> Only %d user-defined typos "
530 "allowed: ignoring the rest\n",
538 fclose(usertypofile);
541 fprintf(stderr,"bookloupe: Check and report on an e-text\n");
542 cnt_dquot=cnt_squot=cnt_brack=cnt_bin=cnt_odd=cnt_long=
543 cnt_short=cnt_punct=cnt_dash=cnt_word=cnt_html=cnt_lineend=
546 if (pswit[OVERVIEW_SWITCH])
548 printf(" Checked %ld lines of %ld (head+foot = %ld)\n\n",
549 checked_linecnt,linecnt,linecnt-checked_linecnt);
550 printf(" --------------- Queries found --------------\n");
552 printf(" Long lines: %14ld\n",cnt_long);
554 printf(" Short lines: %14ld\n",cnt_short);
556 printf(" Line-end problems: %14ld\n",cnt_lineend);
558 printf(" Common typos: %14ld\n",cnt_word);
560 printf(" Unmatched quotes: %14ld\n",cnt_dquot);
562 printf(" Unmatched SingleQuotes: %14ld\n",cnt_squot);
564 printf(" Unmatched brackets: %14ld\n",cnt_brack);
566 printf(" Non-ASCII characters: %14ld\n",cnt_bin);
568 printf(" Proofing characters: %14ld\n",cnt_odd);
570 printf(" Punctuation & spacing queries: %14ld\n",cnt_punct);
572 printf(" Non-standard dashes: %14ld\n",cnt_dash);
574 printf(" Possible HTML tags: %14ld\n",cnt_html);
576 printf(" TOTAL QUERIES %14ld\n",
577 cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+
578 cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend);
586 * Run a first pass - verify that it's a valid PG
587 * file, decide whether to report some things that
588 * occur many times in the text like long or short
589 * lines, non-standard dashes, etc.
591 struct first_pass_results *first_pass(FILE *infile)
593 char laststart=CHAR_SPACE;
596 unsigned int lastlen=0,lastblen=0;
597 long spline=0,nspline=0;
598 static struct first_pass_results results={0};
599 char inword[MAXWORDLEN]="";
600 while (fgets(aline,LINEBUFSIZE-1,infile))
602 while (aline[strlen(aline)-1]==10 || aline[strlen(aline)-1]==13)
603 aline[strlen(aline)-1]=0;
605 if (strstr(aline,"*END") && strstr(aline,"SMALL PRINT") &&
606 (strstr(aline,"PUBLIC DOMAIN") || strstr(aline,"COPYRIGHT")))
609 printf(" --> Duplicate header?\n");
610 spline=linecnt+1; /* first line of non-header text, that is */
612 if (!strncmp(aline,"*** START",9) && strstr(aline,"PROJECT GUTENBERG"))
615 printf(" --> Duplicate header?\n");
616 nspline=linecnt+1; /* first line of non-header text, that is */
618 if (spline || nspline)
621 if (strstr(aline,"end") && strstr(aline,"project gutenberg"))
623 if (strstr(aline,"end")<strstr(aline,"project gutenberg"))
625 if (results.footerline)
627 /* it's an old-form header - we can detect duplicates */
629 printf(" --> Duplicate footer?\n");
632 results.footerline=linecnt;
637 results.firstline=spline;
639 results.firstline=nspline; /* override with new */
640 if (results.footerline)
641 continue; /* don't count the boilerplate in the footer */
643 results.totlen+=llen;
646 if ((unsigned char)aline[i]>127)
648 if (gcisalpha(aline[i]))
650 if (i>0 && aline[i]==CHAR_DQUOTE && isalpha(aline[i-1]))
651 results.endquote_count++;
653 if (strlen(aline)>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE &&
654 lastblen>2 && lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
656 if (*aline && (unsigned char)aline[strlen(aline)-1]<=CHAR_SPACE)
658 if (strstr(aline,".,"))
660 /* only count asterisk lines for ignoring purposes where there is */
661 /* lowercase text on the line */
662 if (strstr(aline,"*"))
665 if (*s>='a' && *s<='z')
670 if (strstr(aline,"/"))
671 results.fslashline++;
672 for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--)
674 if (aline[i]=='-' && aline[i-1]!='-')
676 if (llen>LONGEST_PG_LINE)
678 if (llen>WAY_TOO_LONG)
679 results.verylongline++;
680 if (strstr(aline,"<") && strstr(aline,">"))
682 i=(int)(strstr(aline,">")-strstr(aline,"<")+1);
685 if (strstr(aline,"<i>"))
686 results.htmcount+=4; /* bonus marks! */
688 /* Check for spaced em-dashes */
689 if (strstr(aline,"--"))
692 if (*(strstr(aline,"--")-1)==CHAR_SPACE ||
693 (*(strstr(aline,"--")+2)==CHAR_SPACE))
694 results.space_emdash++;
695 if (*(strstr(aline,"--")-1)==CHAR_SPACE &&
696 (*(strstr(aline,"--")+2)==CHAR_SPACE))
697 /* count of em-dashes with spaces both sides */
698 results.non_PG_space_emdash++;
699 if (*(strstr(aline,"--")-1)!=CHAR_SPACE &&
700 (*(strstr(aline,"--")+2)!=CHAR_SPACE))
701 /* count of PG-type em-dashes with no spaces */
702 results.PG_space_emdash++;
706 s=getaword(s,inword);
707 if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))
708 results.Dutchcount++;
709 if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))
710 results.Frenchcount++;
711 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
712 results.standalone_digit++;
714 /* Check for spaced dashes */
715 if (strstr(aline," -") && *(strstr(aline," -")+2)!='-')
718 lastlen=strlen(aline);
727 * Make some snap decisions based on the first pass results.
729 struct warnings *report_first_pass(struct first_pass_results *results)
731 static struct warnings warnings={0};
733 printf(" --> %ld lines in this file have white space at end\n",
736 if (results->dotcomma>5)
739 printf(" --> %ld lines in this file contain '.,'. "
740 "Not reporting them.\n",results->dotcomma);
743 * If more than 50 lines, or one-tenth, are short,
744 * don't bother reporting them.
746 warnings.shortline=1;
747 if (results->shortline>50 || results->shortline*10>linecnt)
749 warnings.shortline=0;
750 printf(" --> %ld lines in this file are short. "
751 "Not reporting short lines.\n",results->shortline);
754 * If more than 50 lines, or one-tenth, are long,
755 * don't bother reporting them.
758 if (results->longline>50 || results->longline*10>linecnt)
761 printf(" --> %ld lines in this file are long. "
762 "Not reporting long lines.\n",results->longline);
764 /* If more than 10 lines contain asterisks, don't bother reporting them. */
766 if (results->astline>10)
769 printf(" --> %ld lines in this file contain asterisks. "
770 "Not reporting them.\n",results->astline);
773 * If more than 10 lines contain forward slashes,
774 * don't bother reporting them.
777 if (results->fslashline>10)
780 printf(" --> %ld lines in this file contain forward slashes. "
781 "Not reporting them.\n",results->fslashline);
784 * If more than 20 lines contain unpunctuated endquotes,
785 * don't bother reporting them.
788 if (results->endquote_count>20)
791 printf(" --> %ld lines in this file contain unpunctuated endquotes. "
792 "Not reporting them.\n",results->endquote_count);
795 * If more than 10 lines contain standalone digits,
796 * don't bother reporting them.
799 if (results->standalone_digit>10)
802 printf(" --> %ld lines in this file contain standalone 0s and 1s. "
803 "Not reporting them.\n",results->standalone_digit);
806 * If more than 20 lines contain hyphens at end,
807 * don't bother reporting them.
810 if (results->hyphens>20)
813 printf(" --> %ld lines in this file have hyphens at end. "
814 "Not reporting them.\n",results->hyphens);
816 if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
818 printf(" --> Looks like this is HTML. Switching HTML mode ON.\n");
819 pswit[MARKUP_SWITCH]=1;
821 if (results->verylongline>0)
822 printf(" --> %ld lines in this file are VERY long!\n",
823 results->verylongline);
825 * If there are more non-PG spaced dashes than PG em-dashes,
826 * assume it's deliberate.
827 * Current PG guidelines say don't use them, but older texts do,
828 * and some people insist on them whatever the guidelines say.
831 if (results->spacedash+results->non_PG_space_emdash>
832 results->PG_space_emdash)
835 printf(" --> There are %ld spaced dashes and em-dashes. "
836 "Not reporting them.\n",
837 results->spacedash+results->non_PG_space_emdash);
839 /* If more than a quarter of characters are hi-bit, bug out. */
841 if (results->binlen*4>results->totlen)
843 printf(" --> This file does not appear to be ASCII. "
844 "Terminating. Best of luck with it!\n");
847 if (results->alphalen*4<results->totlen)
849 printf(" --> This file does not appear to be text. "
850 "Terminating. Best of luck with it!\n");
853 if (results->binlen*100>results->totlen || results->binlen>100)
855 printf(" --> There are a lot of foreign letters here. "
856 "Not reporting them.\n");
860 if (results->Dutchcount>50)
863 printf(" --> This looks like Dutch - "
864 "switching off dashes and warnings for 's Middags case.\n");
867 if (results->Frenchcount>50)
870 printf(" --> This looks like French - "
871 "switching off some doublepunct.\n");
873 if (results->firstline && results->footerline)
874 printf(" The PG header and footer appear to be already on.\n");
877 if (results->firstline)
878 printf(" The PG header is on - no footer.\n");
879 if (results->footerline)
880 printf(" The PG footer is on - no header.\n");
883 if (pswit[VERBOSE_SWITCH])
886 warnings.shortline=1;
895 printf(" *** Verbose output is ON -- you asked for it! ***\n");
897 if (warnings.isDutch)
899 if (results->footerline>0 && results->firstline>0 &&
900 results->footerline>results->firstline &&
901 results->footerline-results->firstline<100)
903 printf(" --> I don't really know where this text starts. \n");
904 printf(" There are no reference points.\n");
905 printf(" I'm going to have to report the header and footer "
907 results->firstline=0;
915 * Look along the line, accumulate the count of quotes, and see
916 * if this is an empty line - i.e. a line with nothing on it
918 * If line has just spaces, period, * and/or - on it, don't
919 * count it, since empty lines with asterisks or dashes to
920 * separate sections are common.
922 * Returns: Non-zero if the line is empty.
924 int analyse_quotes(const char *s,struct counters *counters)
927 int isemptyline=1; /* assume the line is empty until proven otherwise */
932 if (*s==CHAR_SQUOTE || *s==CHAR_OPEN_SQUOTE)
937 * At start of line, it can only be an openquote.
938 * Hardcode a very common exception!
940 if (strncmp(s+2,"tis",3) && strncmp(s+2,"Tis",3))
941 counters->open_single_quote++;
943 else if (gcisalpha(s[-1]) && gcisalpha(s[1]))
944 /* Do nothing! it's definitely an apostrophe, not a quote */
946 /* it's outside a word - let's check it out */
947 else if (*s==CHAR_OPEN_SQUOTE || gcisalpha(s[1]))
949 /* it damwell better BE an openquote */
950 if (strncmp(s+1,"tis",3) && strncmp(s+1,"Tis",3))
951 /* hardcode a very common exception! */
952 counters->open_single_quote++;
956 /* now - is it a closequote? */
957 guessquote=0; /* accumulate clues */
958 if (gcisalpha(s[-1]))
960 /* it follows a letter - could be either */
964 /* looks like a plural apostrophe */
966 if (s[1]==CHAR_SPACE) /* bonus marks! */
970 /* it doesn't have a letter either side */
971 else if (strchr(".?!,;:",s[-1]) && strchr(".?!,;: ",s[1]))
972 guessquote+=8; /* looks like a closequote */
975 if (counters->open_single_quote>counters->close_single_quote)
977 * Give it the benefit of some doubt,
978 * if a squote is already open.
984 counters->close_single_quote++;
987 if (*s!=CHAR_SPACE && *s!='-' && *s!='.' && *s!=CHAR_ASTERISK &&
989 isemptyline=0; /* ignore lines like * * * as spacers */
990 if (*s==CHAR_UNDERSCORE)
991 counters->c_unders++;
992 if (*s==CHAR_OPEN_CBRACK)
994 if (*s==CHAR_CLOSE_CBRACK)
996 if (*s==CHAR_OPEN_RBRACK)
998 if (*s==CHAR_CLOSE_RBRACK)
1000 if (*s==CHAR_OPEN_SBRACK)
1001 counters->s_brack++;
1002 if (*s==CHAR_CLOSE_SBRACK)
1003 counters->s_brack--;
1010 * check_for_control_characters:
1012 * Check for invalid or questionable characters in the line
1013 * Anything above 127 is invalid for plain ASCII, and
1014 * non-printable control characters should also be flagged.
1015 * Tabs should generally not be there.
1017 void check_for_control_characters(const char *aline)
1021 for (s=aline;*s;s++)
1023 c=*(unsigned char *)s;
1024 if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
1026 if (pswit[ECHO_SWITCH])
1027 printf("\n%s\n",aline);
1028 if (!pswit[OVERVIEW_SWITCH])
1029 printf(" Line %ld column %d - Control character %d\n",
1030 linecnt,(int)(s-aline)+1,c);
1038 * check_for_odd_characters:
1040 * Check for binary and other odd characters.
/*
 * Scans aline for characters that are suspicious in a plain-text book:
 * binary/non-ASCII bytes, tabs, tildes, carets, forward slashes (only
 * when warnings->fslash is set) and asterisks (only in paranoid mode with
 * warnings->ast set and a non-empty line).
 * Each category is guarded by its own e* flag so that, per the comment
 * below, a category is not reported repeatedly for one line.
 * NOTE(review): the end of the parameter list, the declarations of s and
 * c, the statements that set the e* flags, the braces and the else
 * branches are missing from this listing.
 */
1042 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
1045 /* Don't repeat multiple warnings on one line. */
1046 int eNon_A=0,eTab=0,eTilde=0,eCarat=0,eFSlash=0,eAst=0;
1049 for (s=aline;*s;s++)
1051 c=*(unsigned char *)s;
/*
 * *s is examined as a plain char, c is the same byte read as unsigned
 * char.  Flags bytes below CHAR_SPACE other than tab (9) and newline,
 * and any byte above 127.
 */
1052 if (!eNon_A && (*s<CHAR_SPACE && *s!=9 && *s!='\n' || c>127))
1054 if (pswit[ECHO_SWITCH])
1055 printf("\n%s\n",aline);
1056 if (!pswit[OVERVIEW_SWITCH])
/*
 * NOTE(review): two alternative messages follow; the condition that
 * selects between them is on a line missing from this listing.
 */
1058 printf(" Line %ld column %d - "
1059 "Non-ISO-8859 character %d\n",linecnt,(int)(s-aline)+1,c);
1061 printf(" Line %ld column %d - Non-ASCII character %d\n",
1062 linecnt,(int)(s-aline)+1,c);
1067 if (!eTab && *s==CHAR_TAB)
1069 if (pswit[ECHO_SWITCH])
1070 printf("\n%s\n",aline);
1071 if (!pswit[OVERVIEW_SWITCH])
1072 printf(" Line %ld column %d - Tab character?\n",
1073 linecnt,(int)(s-aline)+1);
1078 if (!eTilde && *s==CHAR_TILDE)
1081 * Often used by OCR software to indicate an
1082 * unrecognizable character.
1084 if (pswit[ECHO_SWITCH])
1085 printf("\n%s\n",aline);
1086 if (!pswit[OVERVIEW_SWITCH])
1087 printf(" Line %ld column %d - Tilde character?\n",
1088 linecnt,(int)(s-aline)+1);
1093 if (!eCarat && *s==CHAR_CARAT)
1095 if (pswit[ECHO_SWITCH])
1096 printf("\n%s\n",aline);
1097 if (!pswit[OVERVIEW_SWITCH])
1098 printf(" Line %ld column %d - Carat character?\n",
1099 linecnt,(int)(s-aline)+1);
/* Forward slashes are only queried when the first pass left warnings->fslash set. */
1104 if (!eFSlash && *s==CHAR_FORESLASH && warnings->fslash)
1106 if (pswit[ECHO_SWITCH])
1107 printf("\n%s\n",aline);
1108 if (!pswit[OVERVIEW_SWITCH])
1109 printf(" Line %ld column %d - Forward slash?\n",
1110 linecnt,(int)(s-aline)+1);
1116 * Report asterisks only in paranoid mode,
1117 * since they're often deliberate.
/* NOTE(review): the rest of this condition is on a line missing from this listing. */
1119 if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
1122 if (pswit[ECHO_SWITCH])
1123 printf("\n%s\n",aline);
1124 if (!pswit[OVERVIEW_SWITCH])
1125 printf(" Line %ld column %d - Asterisk?\n",
1126 linecnt,(int)(s-aline)+1);
1135 * check_for_long_line:
1137 * Check for line too long.
/*
 * Reports aline when strlen(aline) exceeds LONGEST_PG_LINE.
 * The same value, strlen(aline), is printed both as the column and as
 * the line length.
 * NOTE(review): braces and any overview-mode branch are missing from
 * this listing.
 */
1139 void check_for_long_line(const char *aline)
1141 if (strlen(aline)>LONGEST_PG_LINE)
1143 if (pswit[ECHO_SWITCH])
1144 printf("\n%s\n",aline);
1145 if (!pswit[OVERVIEW_SWITCH])
1146 printf(" Line %ld column %d - Long line %d\n",
1147 linecnt,(int)strlen(aline),(int)strlen(aline));
1154 * check_for_short_line:
1156 * Check for line too short.
1158 * This one is a bit trickier to implement: we don't want to
1159 * flag the last line of a paragraph for being short, so we
1160 * have to wait until we know that our current line is a
1161 * "normal" line, then report the _previous_ line if it was too
1162 * short. We also don't want to report indented lines like
1163 * chapter heads or formatted quotations. We therefore keep
1164 * last->len as the length of the last line examined, and
1165 * last->blen as the length of the last but one, and try to
1166 * suppress unnecessary warnings by checking that both were of
1167 * "normal" length. We keep the first character of the last
1168 * line in last->start, and if it was a space, we assume that
1169 * the formatting is deliberate. I can't figure out a way to
1170 * distinguish something like a quoted verse left-aligned or
1171 * the header or footer of a letter from a paragraph of short
1172 * lines - maybe if I examined the whole paragraph, and if the
1173 * para has less than, say, 8 lines and if all lines are short,
1174 * then just assume it's OK? Need to look at some texts to see
1175 * how often a formula like this would get the right result.
/*
 * Reports the PREVIOUS line (the file-scope buffer prevline, numbered
 * linecnt-1) as short when all of these hold:
 *   - the current line aline has more than one character;
 *   - last->len is greater than 1 and below SHORTEST_PG_LINE;
 *   - last->blen is greater than SHORTEST_PG_LINE;
 *   - last->start is not CHAR_SPACE.
 * NOTE(review): last->blen>1 is implied by last->blen>SHORTEST_PG_LINE
 * provided SHORTEST_PG_LINE is at least 1 - confirm its value.
 * NOTE(review): braces and any overview-mode branch are missing from
 * this listing.
 */
1177 void check_for_short_line(const char *aline,const struct line_properties *last)
1179 if (strlen(aline)>1 && last->len>1 && last->len<SHORTEST_PG_LINE &&
1180 last->blen>1 && last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
1182 if (pswit[ECHO_SWITCH])
1183 printf("\n%s\n",prevline);
1184 if (!pswit[OVERVIEW_SWITCH])
1185 printf(" Line %ld column %d - Short line %d?\n",
1186 linecnt-1,(int)strlen(prevline),(int)strlen(prevline));
1193 * check_for_starting_punctuation:
1195 * Look for punctuation other than full ellipses at start of line.
/*
 * Queries a line whose first character is one of . ? ! , ; : unless the
 * line starts with the five characters ". . ." (strncmp returns 0 for
 * that prefix, which makes the condition false).
 * NOTE(review): the printf argument line, braces and any overview-mode
 * branch are missing from this listing.
 */
1197 void check_for_starting_punctuation(const char *aline)
/* The *aline test keeps strchr from matching the terminating NUL of an empty line. */
1199 if (*aline && strchr(".?!,;:",aline[0]) && strncmp(". . .",aline,5))
1201 if (pswit[ECHO_SWITCH])
1202 printf("\n%s\n",aline);
1203 if (!pswit[OVERVIEW_SWITCH])
1204 printf(" Line %ld column 1 - Begins with punctuation?\n",
1212 * check_for_spaced_emdash:
1214 * Check for spaced em-dashes.
1216 * We must check _all_ occurrences of "--" on the line
1217 * hence the loop - even if the first double-dash is OK
1218 * there may be another that's wrong later on.
/*
 * Looks at each "--" located by strstr and queries it when there is a
 * space immediately after it, or immediately before it (the character
 * before is only inspected when the match is not at the start of aline).
 * NOTE(review): the declarations and initial values of s and t, and the
 * statement that moves s past each match, are missing from this listing.
 */
1220 void check_for_spaced_emdash(const char *aline)
1224 while ((t=strstr(s,"--")))
1226 if (t>aline && t[-1]==CHAR_SPACE || t[2]==CHAR_SPACE)
1228 if (pswit[ECHO_SWITCH])
1229 printf("\n%s\n",aline);
1230 if (!pswit[OVERVIEW_SWITCH])
1231 printf(" Line %ld column %d - Spaced em-dash?\n",
1232 linecnt,(int)(t-aline)+1);
1241 * check_for_spaced_dash:
1243 * Check for spaced dashes.
/*
 * Queries a single dash that has a space next to it.
 * Only the first " -" in the line is examined; "- " is looked for only
 * when the line contains no " -" at all (else-if), and again only its
 * first occurrence is examined.
 * NOTE(review): the declaration of s, the extra test applied to the " -"
 * match, and the braces are missing from this listing.
 */
1245 void check_for_spaced_dash(const char *aline)
1248 if ((s=strstr(aline," -")))
1252 if (pswit[ECHO_SWITCH])
1253 printf("\n%s\n",aline);
1254 if (!pswit[OVERVIEW_SWITCH])
1255 printf(" Line %ld column %d - Spaced dash?\n",
1256 linecnt,(int)(s-aline)+1);
1261 else if ((s=strstr(aline,"- ")))
/* A "- " that is the tail of "--" is not reported here. */
1263 if (s==aline || s[-1]!='-')
1265 if (pswit[ECHO_SWITCH])
1266 printf("\n%s\n",aline);
1267 if (!pswit[OVERVIEW_SWITCH])
1268 printf(" Line %ld column %d - Spaced dash?\n",
1269 linecnt,(int)(s-aline)+1);
1277 * check_for_unmarked_paragraphs:
1279 * Check for unmarked paragraphs indicated by separate speakers.
1281 * May well be false positive:
1282 * "Bravo!" "Wonderful!" called the crowd.
1283 * but useful all the same.
/*
 * Searches aline for a closing double quote, whitespace, and an opening
 * double quote, and queries a possible missing paragraph break at the
 * column of the first quote.
 * NOTE(review): the two search patterns below read identically in this
 * listing; presumably they differed in the amount of whitespace between
 * the quotes before the listing collapsed it - confirm against the
 * original file.  The declaration of s and the tests between and after
 * the two searches are also missing.
 */
1285 void check_for_unmarked_paragraphs(const char *aline)
1288 s=strstr(aline,"\" \"");
1290 s=strstr(aline,"\" \"");
1293 if (pswit[ECHO_SWITCH])
1294 printf("\n%s\n",aline);
1295 if (!pswit[OVERVIEW_SWITCH])
1296 printf(" Line %ld column %d - Query missing paragraph break?\n",
1297 linecnt,(int)(s-aline)+1);
1304 * check_for_jeebies:
1306 * Check for "to he" and other easy h/b errors.
1308 * This is a very inadequate effort on the h/b problem,
1309 * but the phrase "to he" is almost always an error, whereas "to
1310 * be" is quite common.
1311 * Similarly, '"Quiet!", be said.' is a non-be error
1312 * "to he" is _not_ always an error!:
1313 * "Where they went to he couldn't say."
1314 * Another false positive:
1315 * What would "Cinderella" be without the . . .
1316 * and another: "If he wants to he can see for himself."
/*
 * Searches aline for fixed phrases that usually result from OCR
 * confusing h and b, in three groups, each with its own message:
 * he/be, had/bad and hut/but.  The column reported is that of the first
 * character of the matched pattern (normally the leading space).
 * All searches use strstr and are therefore case-sensitive.
 * NOTE(review): the declaration of s and the tests that chain the
 * searches together (and decide whether to report) are missing from
 * this listing.
 */
1318 void check_for_jeebies(const char *aline)
/* Group 1: he/be confusions. */
1321 s=strstr(aline," be could ");
1323 s=strstr(aline," be would ");
1325 s=strstr(aline," was be ");
1327 s=strstr(aline," be is ");
1329 s=strstr(aline," is be ");
1331 s=strstr(aline,"\", be ");
/*
 * NOTE(review): the next two patterns read identically in this listing;
 * presumably they differed in whitespace originally - confirm.
 */
1333 s=strstr(aline,"\" be ");
1335 s=strstr(aline,"\" be ");
1337 s=strstr(aline," to he ");
1340 if (pswit[ECHO_SWITCH])
1341 printf("\n%s\n",aline);
1342 if (!pswit[OVERVIEW_SWITCH])
1343 printf(" Line %ld column %d - Query he/be error?\n",
1344 linecnt,(int)(s-aline)+1);
/* Group 2: had/bad confusions. */
1348 s=strstr(aline," the had ");
1350 s=strstr(aline," a had ");
1352 s=strstr(aline," they bad ");
1354 s=strstr(aline," she bad ");
1356 s=strstr(aline," he bad ");
1358 s=strstr(aline," you bad ");
/* NOTE(review): lower-case "i" - this cannot match the pronoun "I" as normally written. */
1360 s=strstr(aline," i bad ");
1363 if (pswit[ECHO_SWITCH])
1364 printf("\n%s\n",aline);
1365 if (!pswit[OVERVIEW_SWITCH])
1366 printf(" Line %ld column %d - Query had/bad error?\n",
1367 linecnt,(int)(s-aline)+1);
/* Group 3: hut/but after a semicolon or comma. */
1371 s=strstr(aline,"; hut ");
1373 s=strstr(aline,", hut ");
1376 if (pswit[ECHO_SWITCH])
1377 printf("\n%s\n",aline);
1378 if (!pswit[OVERVIEW_SWITCH])
1379 printf(" Line %ld column %d - Query hut/but error?\n",
1380 linecnt,(int)(s-aline)+1);
1387 * check_for_mta_from:
1389 * Special case - angled bracket in front of "From" placed there by an
1390 * MTA when sending an e-mail.
/*
 * Queries the first occurrence of ">From" anywhere in aline, reporting
 * the 1-based column of the '>'.
 * NOTE(review): the declaration of s and the test on the strstr result
 * are missing from this listing.
 */
1392 void check_for_mta_from(const char *aline)
1395 s=strstr(aline,">From");
1398 if (pswit[ECHO_SWITCH])
1399 printf("\n%s\n",aline);
1400 if (!pswit[OVERVIEW_SWITCH])
1401 printf(" Line %ld column %d - Query angled bracket with From\n",
1402 linecnt,(int)(s-aline)+1);
1409 * check_for_orphan_character:
1411 * Check for a single character line -
1412 * often an overflow from bad wrapping.
/*
 * Queries a line that consists of exactly one character, except when
 * that character is one the code treats as a numeral standing alone.
 * NOTE(review): only I, V, X and L are visible in the exemption test;
 * its continuation line, the else, the printf arguments and the braces
 * are missing from this listing.
 */
1414 void check_for_orphan_character(const char *aline)
/* True only when the line is non-empty and its second byte is the terminator. */
1416 if (*aline && !aline[1])
1418 if (*aline=='I' || *aline=='V' || *aline=='X' || *aline=='L' ||
1420 ; /* Nothing - ignore numerals alone on a line. */
1423 if (pswit[ECHO_SWITCH])
1424 printf("\n%s\n",aline);
1425 if (!pswit[OVERVIEW_SWITCH])
1426 printf(" Line %ld column 1 - Query single character line\n",
1435 * check_for_pling_scanno:
1437 * Check for I" - often should be !
/*
 * Queries the first occurrence of a space, a capital I and a double
 * quote in a row, where the I may be a misread exclamation mark.
 * NOTE(review): the column is printed with %ld here whereas the other
 * checks use %d; the argument line is missing from this listing, so
 * confirm that the argument really is a long.  The declaration of s and
 * the test on the strstr result are missing too.
 */
1439 void check_for_pling_scanno(const char *aline)
1442 s=strstr(aline," I\"");
1445 if (pswit[ECHO_SWITCH])
1446 printf("\n%s\n",aline);
1447 if (!pswit[OVERVIEW_SWITCH])
1448 printf(" Line %ld column %ld - Query I=exclamation mark?\n",
1456 * check_for_extra_period:
1458 * Check for period without a capital letter. Cut-down from gutspell.
1459 * Only works when it happens on a single line.
/*
 * Paranoid-mode check for a period that is followed by a word starting
 * with a lower-case letter.
 * For each ". " in the line the code finds the next letter or digit;
 * when that is a lower-case letter it walks backwards from the period to
 * find the preceding word, puts it in testword and compares it against
 * the abbrev list, a leading digit, isroman() and the vowels string.
 * Words already queried are remembered in the static qperiod table.
 * NOTE(review): the results of those tests, the copy into testword, the
 * loop advance, several declarations and all braces are on lines missing
 * from this listing, so the exact decision logic cannot be read here.
 */
1461 void check_for_extra_period(const char *aline,const struct warnings *warnings)
1463 const char *s,*t,*s1;
/* Table of words already reported; kept across calls. */
1465 static char qperiod[MAX_QWORD][MAX_QWORD_LENGTH];
1466 static int qperiod_index=0;
1467 char testword[MAXWORDLEN]="";
1468 if (pswit[PARANOID_SWITCH])
1470 for (t=s=aline;strstr(t,". ");)
1476 /* start of line punctuation is handled elsewhere */
1479 if (!gcisalpha(t[-1]))
1484 if (warnings->isDutch)
1486 /* For Frank & Jeroen -- 's Middags case */
1487 if (t[2]==CHAR_SQUOTE && t[3]>='a' && t[3]<='z' &&
1488 t[4]==CHAR_SPACE && t[5]>='A' && t[5]<='Z')
/* Skip forward to the first letter or digit after the period. */
1495 while (*s1 && !gcisalpha(*s1) && !isdigit(*s1))
1497 if (*s1>='a' && *s1<='z')
1499 /* we have something to investigate */
1501 /* so let's go back and find out */
/*
 * Walk back over letters, digits and apostrophes that sit between two
 * letters.
 * NOTE(review): s1[-1] is read while s1 may equal s, and s starts at
 * aline, so this can look one byte before the buffer - confirm.
 */
1502 for (s1=t-1;s1>=s &&
1503 (gcisalpha(*s1) || gcisdigit(*s1) || *s1==CHAR_SQUOTE &&
1504 gcisalpha(s1[1]) && gcisalpha(s1[-1]));s1--)
1507 for (i=0;*s1 && *s1!='.';s1++,i++)
1510 for (i=0;*abbrev[i];i++)
1511 if (!strcmp(testword,abbrev[i]))
1513 if (gcisdigit(*testword))
1517 if (isroman(testword))
1522 for (i=0;testword[i];i++)
1523 if (strchr(vowels,testword[i]))
/* Duplicate lookup is skipped in verbose mode and for words too long for the table. */
1529 if (strlen(testword)<MAX_QWORD_LENGTH &&
1530 !pswit[VERBOSE_SWITCH])
1531 for (i=0;i<qperiod_index;i++)
1532 if (!strcmp(testword,qperiod[i]))
/* The strcpy is bounded by the two checks that precede it. */
1536 if (qperiod_index<MAX_QWORD &&
1537 strlen(testword)<MAX_QWORD_LENGTH)
1539 strcpy(qperiod[qperiod_index],testword);
1542 if (pswit[ECHO_SWITCH])
1543 printf("\n%s\n",aline);
1544 if (!pswit[OVERVIEW_SWITCH])
1545 printf(" Line %ld column %d - Extra period?\n",
1546 linecnt,(int)(t-aline)+1);
1558 * check_for_following_punctuation:
1560 * Check for words usually not followed by punctuation.
/*
 * Only active when TYPO_SWITCH is set.  Takes words from the line with
 * getaword() and, for a word found in the nocomma list, queries a
 * following , ; or :, and for a word found in the noperiod list,
 * queries a following . or !.  The column reported is that of the
 * punctuation character (s after getaword).
 * NOTE(review): the loop around getaword, the handling of wordstart,
 * any normalisation of inword before the comparisons, and all braces
 * are missing from this listing.
 */
1562 void check_for_following_punctuation(const char *aline)
1565 const char *s,*wordstart;
1566 char inword[MAXWORDLEN];
1567 if (pswit[TYPO_SWITCH])
1572 s=getaword(s,inword);
/* Both word lists are scanned up to their first empty-string entry. */
1576 for (i=0;*nocomma[i];i++)
1577 if (!strcmp(inword,nocomma[i]))
1579 if (*s==',' || *s==';' || *s==':')
1581 if (pswit[ECHO_SWITCH])
1582 printf("\n%s\n",aline);
1583 if (!pswit[OVERVIEW_SWITCH])
1584 printf(" Line %ld column %d - "
1585 "Query punctuation after %s?\n",
1586 linecnt,(int)(s-aline)+1,inword);
1591 for (i=0;*noperiod[i];i++)
1592 if (!strcmp(inword,noperiod[i]))
1594 if (*s=='.' || *s=='!')
1596 if (pswit[ECHO_SWITCH])
1597 printf("\n%s\n",aline);
1598 if (!pswit[OVERVIEW_SWITCH])
1599 printf(" Line %ld column %d - "
1600 "Query punctuation after %s?\n",
1601 linecnt,(int)(s-aline)+1,inword);
1613 * Check for commonly mistyped words,
1614 * and digits like 0 for O in a word.
/*
 * Runs every word of aline (obtained with getaword) through the typo
 * heuristics:
 *   1. mixdigit(): digits mixed into a word are always queried;
 *   2. with TYPO_SWITCH: a lower-cased copy (testword) is tested for
 *      mid-word capitals, unlikely start/end letter pairs (nostart,
 *      noend), unlikely letter groups, absence of vowels or consonants,
 *      the okword exclusion list, Roman numerals, the built-in typo list
 *      and certain single lower-case letters;
 *   3. the user's own typo list (usertypo);
 *   4. in paranoid mode with warnings->digit: a standalone 0 or 1.
 * Words already queried are remembered in qword so duplicates are not
 * repeated unless VERBOSE_SWITCH is set.
 * NOTE(review): almost all the statements that record a test result,
 * the loop control, and the braces are on lines missing from this
 * listing; the comments below describe only what the visible lines show.
 */
1616 void check_for_typos(const char *aline,struct warnings *warnings)
1618 const char *s,*wordstart;
1619 char inword[MAXWORDLEN],testword[MAXWORDLEN];
1620 int i,istypo,isdup,alower,vowel,consonant;
1621 static int qword_index=0;
1625 s=getaword(s,inword);
1627 continue; /* don't bother with empty lines */
1628 if (mixdigit(inword))
1630 if (pswit[ECHO_SWITCH])
1631 printf("\n%s\n",aline);
1632 if (!pswit[OVERVIEW_SWITCH])
1633 printf(" Line %ld column %d - Query digit in %s\n",
1634 linecnt,(int)(wordstart-aline)+1,inword);
1639 * Put the word through a series of tests for likely typos and OCR
1642 if (pswit[TYPO_SWITCH])
/* inword and testword are both MAXWORDLEN bytes, so this copy fits. */
1645 strcpy(testword,inword);
1647 for (i=0;i<(int)strlen(testword);i++)
1649 /* lowercase for testing */
1650 if (testword[i]>='a' && testword[i]<='z')
1652 if (alower && testword[i]>='A' && testword[i]<='Z')
1655 * We have an uppercase mid-word. However, there are
1657 * Mac and Mc like McGill
1658 * French contractions like l'Abbe
/*
 * The earlier characters of testword have already been lower-cased by
 * this loop, hence the comparison with 'm', 'c' and 'a'.
 */
1660 if (i==2 && testword[0]=='m' && testword[1]=='c' ||
1661 i==3 && testword[0]=='m' && testword[1]=='a' &&
1662 testword[2]=='c' || i>0 && testword[i-1]==CHAR_SQUOTE)
1667 testword[i]=(char)tolower(testword[i]);
1670 * Check for certain unlikely two-letter combinations at word
1673 if (strlen(testword)>1)
1675 for (i=0;*nostart[i];i++)
1676 if (!strncmp(testword,nostart[i],2))
1678 for (i=0;*noend[i];i++)
1679 if (!strncmp(testword+strlen(testword)-2,noend[i],2))
1682 /* ght is common, gbt never. Like that. */
1683 if (strstr(testword,"cb"))
1685 if (strstr(testword,"gbt"))
1687 if (strstr(testword,"pbt"))
1689 if (strstr(testword,"tbs"))
1691 if (strstr(testword,"mrn"))
1693 if (strstr(testword,"ahle"))
1695 if (strstr(testword,"ihle"))
1698 * "TBE" does happen - like HEARTBEAT - but uncommon.
1699 * Also "TBI" - frostbite, outbid - but uncommon.
1700 * Similarly "ii" like Hawaii, or Pompeii, and in Roman
1701 * numerals, but "ii" is a common scanno.
1703 if (strstr(testword,"tbi"))
1705 if (strstr(testword,"tbe"))
1707 if (strstr(testword,"ii"))
1710 * Check for no vowels or no consonants.
1711 * If none, flag a typo.
1713 if (!istypo && strlen(testword)>1)
1716 for (i=0;testword[i];i++)
1718 if (testword[i]=='y' || gcisdigit(testword[i]))
1720 /* Yah, this is loose. */
1724 else if (strchr(vowels,testword[i]))
1729 if (!vowel || !consonant)
1733 * Now exclude the word from being reported if it's in
1736 for (i=0;*okword[i];i++)
1737 if (!strcmp(testword,okword[i]))
1740 * What looks like a typo may be a Roman numeral.
1743 if (istypo && isroman(testword))
1745 /* Check the manual list of typos. */
1747 for (i=0;*typo[i];i++)
1748 if (!strcmp(testword,typo[i]))
1751 * Check lowercase s, l, i and m - special cases.
1752 * "j" - often a semi-colon gone wrong.
1753 * "d" for a missing apostrophe - he d
/* The letter is taken from inword (original case), so only a lower-case letter matches. */
1756 if (!istypo && strlen(testword)==1 && strchr("slmijdn",*inword))
1761 if (strlen(testword)<MAX_QWORD_LENGTH &&
1762 !pswit[VERBOSE_SWITCH])
1763 for (i=0;i<qword_index;i++)
1764 if (!strcmp(testword,qword[i]))
/* The strcpy into qword is bounded by the two checks that precede it. */
1771 if (qword_index<MAX_QWORD &&
1772 strlen(testword)<MAX_QWORD_LENGTH)
1774 strcpy(qword[qword_index],testword);
1777 if (pswit[ECHO_SWITCH])
1778 printf("\n%s\n",aline);
1779 if (!pswit[OVERVIEW_SWITCH])
1781 printf(" Line %ld column %d - Query word %s",
1782 linecnt,(int)(wordstart-aline)+1,inword);
1783 if (strlen(testword)<MAX_QWORD_LENGTH &&
1784 !pswit[VERBOSE_SWITCH])
1785 printf(" - not reporting duplicates");
1793 /* check the user's list of typos */
1794 if (!istypo && usertypo_count)
1795 for (i=0;i<usertypo_count;i++)
1796 if (!strcmp(testword,usertypo[i]))
1798 if (pswit[ECHO_SWITCH])
1799 printf("\n%s\n",aline);
1800 if (!pswit[OVERVIEW_SWITCH])
/*
 * NOTE(review): this message and the "standalone" one below add 2 to
 * the word offset, while the two queries above add 1 - confirm which
 * column convention is intended.
 */
1801 printf(" Line %ld column %d - "
1802 "Query possible scanno %s\n",
1803 linecnt,(int)(wordstart-aline)+2,inword);
1805 if (pswit[PARANOID_SWITCH] && warnings->digit)
1807 /* In paranoid mode, query all 0 and 1 standing alone. */
1808 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
1810 if (pswit[ECHO_SWITCH])
1811 printf("\n%s\n",aline);
1812 if (!pswit[OVERVIEW_SWITCH])
1813 printf(" Line %ld column %d - Query standalone %s\n",
1814 linecnt,(int)(wordstart-aline)+2,inword);
1823 * check_for_misspaced_punctuation:
1825 * Look for added or missing spaces around punctuation and quotes.
1826 * If there is a punctuation character like ! with no space on
1827 * either side, suspect a missing!space. If there are spaces on
1828 * both sides , assume a typo. If we see a double quote with no
1829 * space or punctuation on either side of it, assume unspaced
1830 * quotes "like"this.
/*
 * Makes several independent passes over aline looking for spacing
 * problems around punctuation and quotes:
 *   1. punctuation with a letter on both sides, or strict punctuation
 *      directly followed by a letter ("Missing space?"), and punctuation
 *      with a space before and a space or end of line after
 *      ("Spaced punctuation?");
 *   2. ? ! , ; : preceded by a space and not followed by one;
 *   3. a period preceded by a space and followed by a letter;
 *   4. a double quote with no space or punctuation next to it
 *      ("Unspaced quotes?");
 *   5. double-quote parity tracked in parities->dquote, querying opening
 *      and closing quotes whose neighbour looks wrong;
 *   6. the same for single quotes in parities->squote, only when
 *      SQUOTE_SWITCH is set.
 * Passes 1 to 4 start at index 1, so the first character of the line is
 * only ever looked at as a left-hand neighbour.
 * NOTE(review): the assignment of llen, the acronym and ellipsis flag
 * updates, several printf argument lines, else branches and all braces
 * are on lines missing from this listing.
 */
1832 void check_for_misspaced_punctuation(const char *aline,
1833 struct parities *parities,int isemptyline)
1835 int i,llen,isacro,isellipsis;
1838 for (i=1;i<llen;i++)
1840 /* For each character in the line after the first. */
1841 if (strchr(".?!,;:_",aline[i])) /* if it's punctuation */
1843 /* we need to suppress warnings for acronyms like M.D. */
1845 /* we need to suppress warnings for ellipsis . . . */
1847 /* if there are letters on both sides of it or ... */
1848 if (gcisalpha(aline[i-1]) && gcisalpha(aline[i+1]) ||
1849 gcisalpha(aline[i+1]) && strchr("?!,;:",aline[i]))
1851 /* ...if it's strict punctuation followed by an alpha */
/* A period two places away on either side is examined here. */
1854 if (i>2 && aline[i-2]=='.')
1856 if (i+2<llen && aline[i+2]=='.')
1861 if (pswit[ECHO_SWITCH])
1862 printf("\n%s\n",aline);
1863 if (!pswit[OVERVIEW_SWITCH])
1864 printf(" Line %ld column %d - Missing space?\n",
1870 if (aline[i-1]==CHAR_SPACE &&
1871 (aline[i+1]==CHAR_SPACE || aline[i+1]==0))
1874 * If there are spaces on both sides,
1875 * or space before and end of line.
1879 if (i>2 && aline[i-2]=='.')
1881 if (i+2<llen && aline[i+2]=='.')
1884 if (!isemptyline && !isellipsis)
1886 if (pswit[ECHO_SWITCH])
1887 printf("\n%s\n",aline);
1888 if (!pswit[OVERVIEW_SWITCH])
1889 printf(" Line %ld column %d - "
1890 "Spaced punctuation?\n",linecnt,i+1);
1897 /* Split out the characters that CANNOT be preceded by space. */
1899 for (i=1;i<llen;i++)
1901 /* for each character in the line after the first */
1902 if (strchr("?!,;:",aline[i]))
1904 /* if it's punctuation that _cannot_ have a space before it */
1905 if (aline[i-1]==CHAR_SPACE && !isemptyline &&
1906 aline[i+1]!=CHAR_SPACE)
1909 * If aline[i+1) DOES == space,
1910 * it was already reported just above.
1912 if (pswit[ECHO_SWITCH])
1913 printf("\n%s\n",aline);
1914 if (!pswit[OVERVIEW_SWITCH])
1915 printf(" Line %ld column %d - Spaced punctuation?\n",
1923 * Special case " .X" where X is any alpha.
1924 * This plugs a hole in the acronym code above.
1925 * Inelegant, but maintainable.
1928 for (i=1;i<llen;i++)
1930 /* for each character in the line after the first */
1933 /* if it's a period */
1934 if (aline[i-1]==CHAR_SPACE && gcisalpha(aline[i+1]))
1937 * If the period follows a space and
1938 * is followed by a letter.
1940 if (pswit[ECHO_SWITCH])
1941 printf("\n%s\n",aline);
1942 if (!pswit[OVERVIEW_SWITCH])
1943 printf(" Line %ld column %d - Spaced punctuation?\n",
1950 for (i=1;i<llen;i++)
1952 /* for each character in the line after the first */
1953 if (aline[i]==CHAR_DQUOTE)
/*
 * The explicit aline[i+1] test is needed because strchr also matches
 * the terminating NUL, which would otherwise count as "punctuation".
 */
1955 if (!strchr(" _-.'`,;:!/([{?}])",aline[i-1]) &&
1956 !strchr(" _-.'`,;:!/([{?}])",aline[i+1]) && aline[i+1] ||
1957 !strchr(" _-([{'`",aline[i-1]) && gcisalpha(aline[i+1]))
1959 if (pswit[ECHO_SWITCH])
1960 printf("\n%s\n",aline);
1961 if (!pswit[OVERVIEW_SWITCH])
1962 printf(" Line %ld column %d - Unspaced quotes?\n",
1969 /* Check parity of quotes. */
1970 for (s=aline;*s;s++)
1972 if (*s==CHAR_DQUOTE)
/* After the toggle, dquote==0 means this quote closed a pair. */
1974 parities->dquote=!parities->dquote;
1975 if (!parities->dquote)
1978 if (!strchr("_-.'`/,;:!?)]} ",s[1]))
1980 if (pswit[ECHO_SWITCH])
1981 printf("\n%s\n",aline);
1982 if (!pswit[OVERVIEW_SWITCH])
1983 printf(" Line %ld column %d - "
1984 "Wrongspaced quotes?\n",linecnt,(int)(s-aline)+1);
1992 if (!gcisalpha(s[1]) && !isdigit(s[1]) &&
1993 !strchr("_-/.'`([{$",s[1]) || !s[1])
1995 if (pswit[ECHO_SWITCH])
1996 printf("\n%s\n",aline);
1997 if (!pswit[OVERVIEW_SWITCH])
1998 printf(" Line %ld column %d - "
1999 "Wrongspaced quotes?\n",linecnt,(int)(s-aline)+1);
/* A double quote in column 1 directly followed by closing punctuation or a space. */
2006 if (*aline==CHAR_DQUOTE)
2008 if (strchr(",;:!?)]} ",aline[1]))
2010 if (pswit[ECHO_SWITCH])
2011 printf("\n%s\n",aline);
2012 if (!pswit[OVERVIEW_SWITCH])
2013 printf(" Line %ld column 1 - Wrongspaced quotes?\n",
2019 if (pswit[SQUOTE_SWITCH])
2021 for (s=aline;*s;s++)
/* NOTE(review): the remainder of this condition is on lines missing from this listing. */
2023 if ((*s==CHAR_SQUOTE || *s==CHAR_OPEN_SQUOTE) &&
2024 (s==aline || s>aline && !gcisalpha(s[-1]) ||
2027 parities->squote=!parities->squote;
2028 if (!parities->squote)
2031 if (!strchr("_-.'`/\",;:!?)]} ",s[1]))
2033 if (pswit[ECHO_SWITCH])
2034 printf("\n%s\n",aline);
2035 if (!pswit[OVERVIEW_SWITCH])
2036 printf(" Line %ld column %d - "
2037 "Wrongspaced singlequotes?\n",
2038 linecnt,(int)(s-aline)+1);
2046 if (!gcisalpha(s[1]) && !isdigit(s[1]) &&
2047 !strchr("_-/\".'`",s[1]) || !s[1])
2049 if (pswit[ECHO_SWITCH])
2050 printf("\n%s\n",aline);
2051 if (!pswit[OVERVIEW_SWITCH])
2052 printf(" Line %ld column %d - "
2053 "Wrongspaced singlequotes?\n",
2054 linecnt,(int)(s-aline)+1);
2065 * check_for_double_punctuation:
2067 * Look for double punctuation like ,. or ,,
2068 * Thanks to DW for the suggestion!
2069 * In books with references, ".," and ".;" are common
2070 * e.g. "etc., etc.," and vol. 1.; vol 3.;
2071 * OTOH, from my initial tests, there are also fairly
2072 * common errors. What to do? Make these cases paranoid?
2073 * ".," is the most common, so warnings->dotcomma is used
2074 * to suppress detailed reporting if it occurs often.
/*
 * Queries two adjacent punctuation characters from the set . ? ! , ; :
 * unless the pair is one of the tolerated cases:
 *   - a doubled character (the visible part of the test lists . and ?);
 *   - ".," when warnings->dotcomma is not set;
 *   - for French texts (warnings->isFrench), an ellipsis directly before
 *     or after one of , ; : ! ?.
 * NOTE(review): the declarations, the assignment of llen, the
 * continuation of the doubled-character test, whatever is done inside
 * the second French test, the else and the printf arguments are on
 * lines missing from this listing.
 */
2076 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
2080 for (i=0;i<llen;i++)
2082 /* for each punctuation character in the line */
/* The aline[i] and aline[i+1] tests reject the NUL that strchr would otherwise match. */
2083 if (strchr(".?!,;:",aline[i]) && strchr(".?!,;:",aline[i+1]) &&
2084 aline[i] && aline[i+1])
2086 /* followed by punctuation, it's a query, unless . . . */
2087 if (aline[i]==aline[i+1] && (aline[i]=='.' || aline[i]=='?' ||
2089 !warnings->dotcomma && aline[i]=='.' && aline[i+1]==',' ||
2090 warnings->isFrench && !strncmp(aline+i,",...",4) ||
2091 warnings->isFrench && !strncmp(aline+i,"...,",4) ||
2092 warnings->isFrench && !strncmp(aline+i,";...",4) ||
2093 warnings->isFrench && !strncmp(aline+i,"...;",4) ||
2094 warnings->isFrench && !strncmp(aline+i,":...",4) ||
2095 warnings->isFrench && !strncmp(aline+i,"...:",4) ||
2096 warnings->isFrench && !strncmp(aline+i,"!...",4) ||
2097 warnings->isFrench && !strncmp(aline+i,"...!",4) ||
2098 warnings->isFrench && !strncmp(aline+i,"?...",4) ||
2099 warnings->isFrench && !strncmp(aline+i,"...?",4))
/* The same ten French ellipsis patterns are tested a second time here. */
2101 if (warnings->isFrench && !strncmp(aline+i,",...",4) ||
2102 warnings->isFrench && !strncmp(aline+i,"...,",4) ||
2103 warnings->isFrench && !strncmp(aline+i,";...",4) ||
2104 warnings->isFrench && !strncmp(aline+i,"...;",4) ||
2105 warnings->isFrench && !strncmp(aline+i,":...",4) ||
2106 warnings->isFrench && !strncmp(aline+i,"...:",4) ||
2107 warnings->isFrench && !strncmp(aline+i,"!...",4) ||
2108 warnings->isFrench && !strncmp(aline+i,"...!",4) ||
2109 warnings->isFrench && !strncmp(aline+i,"?...",4) ||
2110 warnings->isFrench && !strncmp(aline+i,"...?",4))
2112 ; /* do nothing for .. !! and ?? which can be legit */
2116 if (pswit[ECHO_SWITCH])
2117 printf("\n%s\n",aline);
2118 if (!pswit[OVERVIEW_SWITCH])
2119 printf(" Line %ld column %d - Double punctuation?\n",
2129 * check_for_spaced_quotes:
/*
 * Queries every quote character that has a space on both sides, in
 * three separate passes: double quote, single quote (') and backquote
 * (`).  The last two share the message "Spaced singlequote?".  The
 * column reported is that of the space before the quote.
 * NOTE(review): the declarations of s and t, the resetting of s before
 * each pass and its advance after each match are on lines missing from
 * this listing.
 */
2131 void check_for_spaced_quotes(const char *aline)
2135 while ((t=strstr(s," \" ")))
2137 if (pswit[ECHO_SWITCH])
2138 printf("\n%s\n",aline);
2139 if (!pswit[OVERVIEW_SWITCH])
2140 printf(" Line %ld column %d - Spaced doublequote?\n",
2141 linecnt,(int)(t-aline+1));
2147 while ((t=strstr(s," ' ")))
2149 if (pswit[ECHO_SWITCH])
2150 printf("\n%s\n",aline);
2151 if (!pswit[OVERVIEW_SWITCH])
2152 printf(" Line %ld column %d - Spaced singlequote?\n",
2153 linecnt,(int)(t-aline+1));
2159 while ((t=strstr(s," ` ")))
2161 if (pswit[ECHO_SWITCH])
2162 printf("\n%s\n",aline);
2163 if (!pswit[OVERVIEW_SWITCH])
2164 printf(" Line %ld column %d - Spaced singlequote?\n",
2165 linecnt,(int)(t-aline+1));
2173 * check_for_miscased_genative:
2175 * Check special case of 'S instead of 's at end of word.
/*
 * Queries an apostrophe that follows a lower-case letter and is itself
 * followed by a capital S.  The column reported (offset + 2) is that of
 * the S, not of the apostrophe.
 * NOTE(review): the loop that moves s along the line is missing from
 * this listing; since s[-1] is read, confirm that the scan does not
 * start at the first character of aline.
 */
2177 void check_for_miscased_genative(const char *aline)
2183 if (*s==CHAR_SQUOTE && s[1]=='S' && s[-1]>='a' && s[-1]<='z')
2185 if (pswit[ECHO_SWITCH])
2186 printf("\n%s\n",aline);
2187 if (!pswit[OVERVIEW_SWITCH])
2188 printf(" Line %ld column %d - Capital \"S\"?\n",
2189 linecnt,(int)(s-aline+2));
2198 * check_end_of_line:
2200 * Now check special cases - start and end of line -
2201 * for single and double quotes. Start is sometimes [sic]
2202 * but better to query it anyway.
2203 * While we're here, check for dash at end of line.
/*
 * Three checks on the edges of the line:
 *   - a double, single or open-single quote as the last character,
 *     preceded by a space ("Spaced quote?");
 *   - a single or open-single quote as the first character, followed by
 *     a space ("Spaced quote?" at column 1);
 *   - in paranoid mode with warnings->hyphen set: after stepping back
 *     over trailing bytes at or below CHAR_SPACE, a single '-' that is
 *     not the second half of "--" ("Hyphen at end of line?").
 * NOTE(review): aline[llen-1] and aline[llen-2] are read below; the
 * assignment of llen and any guard on its value are on lines missing
 * from this listing - confirm that lines shorter than two characters
 * cannot reach these reads.
 */
2205 void check_end_of_line(const char *aline,struct warnings *warnings)
2211 if (aline[llen-1]==CHAR_DQUOTE || aline[llen-1]==CHAR_SQUOTE ||
2212 aline[llen-1]==CHAR_OPEN_SQUOTE)
2213 if (aline[llen-2]==CHAR_SPACE)
2215 if (pswit[ECHO_SWITCH])
2216 printf("\n%s\n",aline);
2217 if (!pswit[OVERVIEW_SWITCH])
2218 printf(" Line %ld column %d - Spaced quote?\n",
2223 if ((aline[0]==CHAR_SQUOTE || aline[0]==CHAR_OPEN_SQUOTE) &&
2224 aline[1]==CHAR_SPACE)
2226 if (pswit[ECHO_SWITCH])
2227 printf("\n%s\n",aline);
2228 if (!pswit[OVERVIEW_SWITCH])
2229 printf(" Line %ld column 1 - Spaced quote?\n",linecnt);
2234 * Dash at end of line may well be legit - paranoid mode only
2235 * and don't report em-dash at line-end.
2237 if (pswit[PARANOID_SWITCH] && warnings->hyphen)
/* The loop stops at index 1 at the latest, so aline[i-1] below stays inside the line. */
2239 for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--)
2241 if (aline[i]=='-' && aline[i-1]!='-')
2243 if (pswit[ECHO_SWITCH])
2244 printf("\n%s\n",aline);
2245 if (!pswit[OVERVIEW_SWITCH])
2246 printf(" Line %ld column %d - Hyphen at end of line?\n",
2254 * check_for_unspaced_bracket:
2256 * Brackets are often unspaced, but shouldn't be surrounded by alpha.
2257 * If so, suspect a scanno like "a]most".
/*
 * Queries any bracket character ({ [ ( ) ] }) that has a letter directly
 * on both sides.  The first and last characters of the line are never
 * tested as brackets because the loop runs from index 1 to llen-2.
 * NOTE(review): the declarations, the assignment of llen, the printf
 * arguments and the braces are on lines missing from this listing.
 */
2259 void check_for_unspaced_bracket(const char *aline)
2263 for (i=1;i<llen-1;i++)
2265 /* for each bracket character in the line except 1st & last */
2266 if (strchr("{[()]}",aline[i]) && gcisalpha(aline[i-1]) &&
2267 gcisalpha(aline[i+1]))
2269 if (pswit[ECHO_SWITCH])
2270 printf("\n%s\n",aline);
2271 if (!pswit[OVERVIEW_SWITCH])
2272 printf(" Line %ld column %d - Unspaced bracket?\n",
2281 * check_for_unpunctuated_endquote:
/*
 * Queries every double quote that directly follows a letter, i.e. a
 * closing quote with no punctuation before it.
 * The column printed is i, which (counting from 1) is the position of
 * the letter before the quote rather than of the quote itself.
 * NOTE(review): this check calls the C library isalpha() on a plain
 * char where the neighbouring checks use gcisalpha(); for bytes above
 * 127 on a signed-char platform the argument is negative, which
 * <ctype.h> does not permit - confirm whether gcisalpha was intended.
 * NOTE(review): the declarations, the assignment of llen and the braces
 * are on lines missing from this listing.
 */
2283 void check_for_unpunctuated_endquote(const char *aline)
2287 for (i=1;i<llen;i++)
2289 /* for each character in the line except 1st */
2290 if (aline[i]==CHAR_DQUOTE && isalpha(aline[i-1]))
2292 if (pswit[ECHO_SWITCH])
2293 printf("\n%s\n",aline);
2294 if (!pswit[OVERVIEW_SWITCH])
2295 printf(" Line %ld column %d - "
2296 "endquote missing punctuation?\n",linecnt,i);
2304 * check_for_html_tag:
2306 * Check for <HTML TAG>.
2308 * If there is a < in the line, followed at some point
2309 * by a > then we suspect HTML.
/*
 * Looks for the first '<' and the first '>' in aline; the text from the
 * '<' to the '>' inclusive (i characters) is copied into wrk and
 * reported as a possible HTML tag at the column of the '<'.
 * NOTE(review): both searches start at the beginning of the line, so the
 * '>' found may come before the '<'.  The tests on open, close and i,
 * the declaration and size of wrk, and the termination of wrk after
 * strncpy (which does not add a NUL when it copies exactly i bytes) are
 * all on lines missing from this listing - confirm that they bound i
 * and terminate wrk.
 */
2311 void check_for_html_tag(const char *aline)
2314 const char *open,*close;
2315 open=strstr(aline,"<");
2318 close=strstr(aline,">");
2321 i=(int)(close-open+1);
2324 strncpy(wrk,open,i);
2326 if (pswit[ECHO_SWITCH])
2327 printf("\n%s\n",aline);
2328 if (!pswit[OVERVIEW_SWITCH])
2329 printf(" Line %ld column %d - HTML Tag? %s \n",
2330 linecnt,(int)(open-aline)+1,wrk);
2339 * check_for_html_entity:
2341 * Check for &symbol; HTML.
2343 * If there is a & in the line, followed at
2344 * some point by a ; then we suspect HTML.
/*
 * Looks for the first '&' and the first ';' in aline and reports the
 * text between them (held in wrk) as a possible HTML entity at the
 * column of the '&'.  The span is scanned character by character and i
 * is reset to 0 to suppress the report for text such as "Jones & Son;".
 * NOTE(review): both searches start at the beginning of the line, so the
 * ';' found may come before the '&'.  The tests on amp, scolon and i,
 * the per-character test inside the scan, and the code that fills and
 * terminates wrk are on lines missing from this listing.
 */
2346 void check_for_html_entity(const char *aline)
2349 const char *s,*amp,*scolon;
2350 amp=strstr(aline,"&");
2353 scolon=strstr(aline,";");
2356 i=(int)(scolon-amp+1);
2357 for (s=amp;s<scolon;s++)
2359 i=0; /* Don't report "Jones & Son;" */
2364 if (pswit[ECHO_SWITCH])
2365 printf("\n%s\n",aline);
2366 if (!pswit[OVERVIEW_SWITCH])
2367 printf(" Line %ld column %d - HTML symbol? %s \n",
2368 linecnt,(int)(amp-aline)+1,wrk);
2379 * If we are in a state of unbalanced quotes, and this line
2380 * doesn't begin with a quote, output the stored error message.
2381 * If the -P switch was used, print the warning even if the
2382 * new para starts with quotes.
/*
 * Prints the warnings stored in *pending by an earlier paragraph.
 * A message is considered present when its first character is non-NUL.
 *   - dquote: printed unless the line starts with a double quote, or
 *     always when QPARA_SWITCH is set;
 *   - squote: printed unless the line starts with a single or
 *     open-single quote, or when QPARA_SWITCH is set (a further
 *     alternative is on a line missing from this listing);
 *   - rbrack, sbrack, cbrack, unders: printed whenever present.
 * With ECHO_SWITCH the stored first line of the paragraph (parastart)
 * is echoed before each message; nothing is printed with
 * OVERVIEW_SWITCH set.
 * NOTE(review): the declaration and positioning of s within aline, any
 * overview-mode counting and the braces are on lines missing from this
 * listing.
 */
2384 void print_pending(const char *aline,const char *parastart,
2385 struct pending *pending)
2391 if (*pending->dquote)
2392 if (*s!=CHAR_DQUOTE || pswit[QPARA_SWITCH])
2394 if (!pswit[OVERVIEW_SWITCH])
2396 if (pswit[ECHO_SWITCH])
2397 printf("\n%s\n",parastart);
2398 puts(pending->dquote);
2403 if (*pending->squote)
2405 if (*s!=CHAR_SQUOTE && *s!=CHAR_OPEN_SQUOTE || pswit[QPARA_SWITCH] ||
2408 if (!pswit[OVERVIEW_SWITCH])
2410 if (pswit[ECHO_SWITCH])
2411 printf("\n%s\n",parastart);
2412 puts(pending->squote);
2418 if (*pending->rbrack)
2420 if (!pswit[OVERVIEW_SWITCH])
2422 if (pswit[ECHO_SWITCH])
2423 printf("\n%s\n",parastart);
2424 puts(pending->rbrack);
2429 if (*pending->sbrack)
2431 if (!pswit[OVERVIEW_SWITCH])
2433 if (pswit[ECHO_SWITCH])
2434 printf("\n%s\n",parastart);
2435 puts(pending->sbrack);
2440 if (*pending->cbrack)
2442 if (!pswit[OVERVIEW_SWITCH])
2444 if (pswit[ECHO_SWITCH])
2445 printf("\n%s\n",parastart);
2446 puts(pending->cbrack);
2451 if (*pending->unders)
2453 if (!pswit[OVERVIEW_SWITCH])
2455 if (pswit[ECHO_SWITCH])
2456 printf("\n%s\n",parastart);
2457 puts(pending->unders);
2465 * check_for_mismatched_quotes:
2467 * At end of paragraph, check for mismatched quotes.
2469 * We don't want to report an error immediately, since it is a
2470 * common convention to omit the quotes at end of paragraph if
2471 * the next paragraph is a continuation of the same speaker.
2472 * Where this is the case, the next para should begin with a
2473 * quote, so we store the warning message and only display it
2474 * at the top of the next iteration if the new para doesn't
2475 * start with a quote.
2476 * The -p switch overrides this default, and warns of unclosed
2477 * quotes on _every_ paragraph, whether or not the next begins with a quote.
/*
 * At the end of a paragraph, turns the running totals in *counters into
 * warning messages stored in *pending (they are not printed here):
 *   - an odd number of double quotes (counters->quot);
 *   - with SQUOTE_SWITCH: opening single quotes present and not equal in
 *     number to closing ones;
 *   - non-zero round, square or curly bracket balance;
 *   - an odd number of underscores.
 * NOTE(review): the line-number argument of every sprintf, the action
 * taken when the single-quote counts differ by more than one, and the
 * braces are on lines missing from this listing.  The sizes of the
 * pending buffers are not visible here either, so the unbounded
 * sprintf calls cannot be checked against them.
 */
2480 void check_for_mismatched_quotes(const struct counters *counters,
2481 struct pending *pending)
2483 if (counters->quot%2)
2484 sprintf(pending->dquote," Line %ld - Mismatched quotes",
2486 if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&
2487 counters->open_single_quote!=counters->close_single_quote)
2488 sprintf(pending->squote," Line %ld - Mismatched singlequotes?",
/* Same test as above, narrowed to a difference other than exactly one extra opening quote. */
2490 if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&
2491 counters->open_single_quote!=counters->close_single_quote &&
2492 counters->open_single_quote!=counters->close_single_quote+1)
2494 * Flag it to be noted regardless of the
2495 * first char of the next para.
2498 if (counters->r_brack)
2499 sprintf(pending->rbrack," Line %ld - Mismatched round brackets?",
2501 if (counters->s_brack)
2502 sprintf(pending->sbrack," Line %ld - Mismatched square brackets?",
2504 if (counters->c_brack)
2505 sprintf(pending->cbrack," Line %ld - Mismatched curly brackets?",
2507 if (counters->c_unders%2)
2508 sprintf(pending->unders," Line %ld - Mismatched underscores?",
2513 * check_for_omitted_punctuation:
2515 * Check for omitted punctuation at end of paragraph by working back
2516 * through prevline. DW.
2517 * Need to check this only for "normal" paras.
2518 * So what is a "normal" para?
2519 * Not normal if one-liner (chapter headings, etc.)
2520 * Not normal if it doesn't contain at least one lowercase letter
2521 * Not normal if starts with space
/*
 * Queries a paragraph whose last line (prevline) ends in a letter,
 * optionally followed by closing quotes, i.e. with no final punctuation.
 * The check is applied only when prevline contains what the first loop
 * is looking for (per the comment, a letter), last->blen is above 2, the
 * paragraph started before line linecnt-1, and prevline does not begin
 * with a space or control character.
 * The parameter prevline hides the file-scope buffer of the same name
 * inside this function.
 * NOTE(review): the declarations, the body of the first loop, the step
 * expression of the second loop, what follows the final strchr test and
 * the braces are on lines missing from this listing.
 */
2523 void check_for_omitted_punctuation(const char *prevline,
2524 struct line_properties *last,int start_para_line)
2528 for (s=prevline,i=0;*s && !i;s++)
2530 /* use i to indicate the presence of a letter on the line */
2533 * This next "if" is a problem.
2534 * If we say "start_para_line <= linecnt - 1", that includes
2535 * one-line "paragraphs" like chapter heads. Lotsa false positives.
2536 * If we say "start_para_line < linecnt - 1" it doesn't, but then it
2537 * misses genuine one-line paragraphs.
2539 if (i && last->blen>2 && start_para_line<linecnt-1 && *prevline>CHAR_SPACE)
/* Step back from the end of the line over trailing double and single quotes. */
2541 for (i=strlen(prevline)-1;
2542 (prevline[i]==CHAR_DQUOTE || prevline[i]==CHAR_SQUOTE) &&
2543 prevline[i]>CHAR_SPACE && i>0;
2548 if (gcisalpha(prevline[i]))
2550 if (pswit[ECHO_SWITCH])
2551 printf("\n%s\n",prevline);
2552 if (!pswit[OVERVIEW_SWITCH])
2553 printf(" Line %ld column %d - "
2554 "No punctuation at para end?\n",
2555 linecnt-1,(int)strlen(prevline));
2560 if (strchr("-.:!([{?}])",prevline[i]))
/*
 * procfile: run all checks over one input file.
 *
 * The file is opened, first_pass() and report_first_pass() are run over it,
 * and then every line is read with flgets() and handed to the per-line
 * check_for_* routines. Lines outside the range given by
 * first_pass_results->firstline and ->footerline are skipped.
 *
 * NOTE(review): this listing omits many short statements (braces, else
 * branches, flag assignments), so the comments here describe the visible
 * lines only.
 */
2571 void procfile(char *filename)
/* 81 bytes: the strncpy below copies at most 80 characters into it. */
2574 char parastart[81]; /* first line of current para */
2576 struct first_pass_results *first_pass_results;
2577 struct warnings *warnings;
2578 struct counters counters={0};
2579 struct line_properties last={0};
2580 struct parities parities={0};
2581 struct pending pending={{0},};
2583 long start_para_line=0;
2584 int i,isnewpara=0,enddash=0;
2585 last.start=CHAR_SPACE;
2587 linecnt=checked_linecnt=0;
/* Opened in binary mode; presumably so flgets can see raw CR and LF bytes. */
2588 infile=fopen(filename,"rb");
/* The cannot-open message goes to stdout or stderr depending on STDOUT_SWITCH. */
2591 if (pswit[STDOUT_SWITCH])
2592 fprintf(stdout,"bookloupe: cannot open %s\n",filename);
2594 fprintf(stderr,"bookloupe: cannot open %s\n",filename);
2597 fprintf(stdout,"\n\nFile: %s\n\n",filename);
/*
 * The first pass produces the results from which report_first_pass builds
 * the warnings settings consulted below (longline, shortline, endquote).
 */
2598 first_pass_results=first_pass(infile);
2599 warnings=report_first_pass(first_pass_results);
2601 * Here we go with the main pass. Hold onto yer hat!
/* linecnt+1 is handed to flgets as the line number for its own messages. */
2605 while (flgets(aline,LINEBUFSIZE-1,infile,linecnt+1))
2610 if (pswit[DP_SWITCH] && !strncmp(aline,"-----File: ",11))
2611 continue; // skip DP page separators completely
/*
 * Lines before firstline, or after footerline when that is positive, are
 * not checked. With HEADER_SWITCH the Title, Author, Release Date and
 * Edition lines among them are echoed.
 */
2612 if (linecnt<first_pass_results->firstline ||
2613 (first_pass_results->footerline>0 &&
2614 linecnt>first_pass_results->footerline))
2616 if (pswit[HEADER_SWITCH])
2618 if (!strncmp(aline,"Title:",6))
2619 printf(" %s\n",aline);
2620 if (!strncmp(aline,"Author:",7))
2621 printf(" %s\n",aline);
2622 if (!strncmp(aline,"Release Date:",13))
2623 printf(" %s\n",aline);
2624 if (!strncmp(aline,"Edition:",8))
2625 printf(" %s\n\n",aline);
2627 continue; /* skip through the header */
/* Flush anything queued in pending, then clear it for this line. */
2630 print_pending(aline,parastart,&pending);
2631 memset(&pending,0,sizeof(pending));
2632 isemptyline=analyse_quotes(aline,&counters);
2633 if (isnewpara && !isemptyline)
2635 /* This line is the start of a new paragraph. */
2636 start_para_line=linecnt;
2637 /* Capture its first line in case we want to report it later. */
2638 strncpy(parastart,aline,80);
2640 memset(&parities,0,sizeof(parities)); /* restart the quote count */
/* Skip leading characters that are neither letters nor digits. */
2642 while (!gcisalpha(*s) && !gcisdigit(*s) && *s)
2644 if (*s>='a' && *s<='z')
2646 /* and its first letter is lowercase */
2647 if (pswit[ECHO_SWITCH])
2648 printf("\n%s\n",aline);
2649 if (!pswit[OVERVIEW_SWITCH])
2650 printf(" Line %ld column %d - "
2651 "Paragraph starts with lower-case\n",
2652 linecnt,(int)(s-aline)+1);
2656 isnewpara=0; /* Signal the end of new para processing. */
2658 /* Check for an em-dash broken at line end. */
2659 if (enddash && *aline=='-')
2661 if (pswit[ECHO_SWITCH])
2662 printf("\n%s\n",aline);
2663 if (!pswit[OVERVIEW_SWITCH])
2664 printf(" Line %ld column 1 - Broken em-dash?\n",linecnt);
/*
 * Walk back over trailing spaces and look for a hyphen at the end of the
 * line (presumably this sets enddash for the next iteration; the
 * assignment is not visible here).
 */
2669 for (s=aline+strlen(aline)-1;*s==' ' && s>aline;s--)
2671 if (s>=aline && *s=='-')
2673 check_for_control_characters(aline);
2675 check_for_odd_characters(aline,warnings,isemptyline);
/* Line length checks run only when the first pass enabled them. */
2676 if (warnings->longline)
2677 check_for_long_line(aline);
2678 if (warnings->shortline)
2679 check_for_short_line(aline,&last);
/* Remember the length and first character of this line in last. */
2681 last.len=strlen(aline);
2682 last.start=aline[0];
2683 check_for_starting_punctuation(aline);
2686 check_for_spaced_emdash(aline);
2687 check_for_spaced_dash(aline);
2689 check_for_unmarked_paragraphs(aline);
2690 check_for_jeebies(aline);
2691 check_for_mta_from(aline);
2692 check_for_orphan_character(aline);
2693 check_for_pling_scanno(aline);
2694 check_for_extra_period(aline,warnings);
2695 check_for_following_punctuation(aline);
2696 check_for_typos(aline,warnings);
2697 check_for_misspaced_punctuation(aline,&parities,isemptyline);
2698 check_for_double_punctuation(aline,warnings);
2699 check_for_spaced_quotes(aline);
2700 check_for_miscased_genative(aline);
2701 check_end_of_line(aline,warnings);
2702 check_for_unspaced_bracket(aline);
2703 if (warnings->endquote)
2704 check_for_unpunctuated_endquote(aline);
2705 check_for_html_tag(aline);
2706 check_for_html_entity(aline);
/* Quote totals are checked and then reset to zero. */
2709 check_for_mismatched_quotes(&counters,&pending);
2710 memset(&counters,0,sizeof(counters));
2711 /* let the next iteration know that it's starting a new para */
2713 check_for_omitted_punctuation(prevline,&last,start_para_line);
/* Keep a copy of this line; it is the prevline of the next iteration. */
2715 strcpy(prevline,aline);
/* After the last line, report on the queried-word table unless in overview mode. */
2718 if (!pswit[OVERVIEW_SWITCH])
2719 for (i=0;i<MAX_QWORD;i++)
2721 printf("\nNote: Queried word %s was duplicated %d time%s\n",
2722 qword[i],dupcnt[i],"s");
2728 * Get one line from the input stream, checking for
2729 * the existence of exactly one CR/LF line-end per line.
2731 * Returns: a pointer to the line.
/*
 * flgets: read one line from thefile into theline, one character at a
 * time with fgetc, for as long as fewer than maxlen characters are stored.
 *
 * theline - destination buffer
 * maxlen  - upper bound on the number of characters read
 * thefile - input stream
 * lcnt    - line number used in the messages printed here
 *
 * With LINE_END_SWITCH set, three line-end anomalies are reported: an LF
 * without a preceding CR, two successive CRs, and a CR without an LF.
 * Before returning, the line is passed through postprocess_for_HTML when
 * MARKUP_SWITCH is set and postprocess_for_DP when DP_SWITCH is set.
 * The caller in procfile loops until the return value is a null pointer.
 *
 * NOTE(review): most of the character-handling statements are not visible
 * in this listing, so the comments here describe the visible lines only.
 */
2733 char *flgets(char *theline,int maxlen,FILE *thefile,long lcnt)
/* c and cint both receive the character; cint keeps the int value of fgetc. */
2739 c=cint=fgetc(thefile);
2744 /* either way, it's end of line */
2751 /* Error - a LF without a preceding CR */
2752 if (pswit[LINE_END_SWITCH])
2754 if (pswit[ECHO_SWITCH])
2755 printf("\n%s\n",theline);
2756 if (!pswit[OVERVIEW_SWITCH])
2757 printf(" Line %ld - No CR?\n",lcnt);
2768 /* Error - two successive CRs */
2769 if (pswit[LINE_END_SWITCH])
2771 if (pswit[ECHO_SWITCH])
2772 printf("\n%s\n",theline);
2773 if (!pswit[OVERVIEW_SWITCH])
2774 printf(" Line %ld - Two successive CRs?\n",lcnt);
/* A pending CR followed by an ordinary character: CR without LF. */
2783 if (pswit[LINE_END_SWITCH] && isCR)
2785 if (pswit[ECHO_SWITCH])
2786 printf("\n%s\n",theline);
2787 if (!pswit[OVERVIEW_SWITCH])
2788 printf(" Line %ld column %d - CR without LF?\n",
2798 c=cint=fgetc(thefile);
2799 } while(len<maxlen);
/* Optional clean-up of markup before the line is handed back. */
2800 if (pswit[MARKUP_SWITCH])
2801 postprocess_for_HTML(theline);
2802 if (pswit[DP_SWITCH])
2803 postprocess_for_DP(theline);
/*
 * mixdigit:
 *
 * Takes a "word" as a parameter, and checks whether it
 * contains a mixture of alpha and digits. Generally, this is an
 * error, but may not be for cases like 4th or L5 12s. 3d.
 *
 * Returns: 0 if no error found, 1 if error.
 */
int mixdigit(char *checkword)
{
    int wehaveadigit,wehavealetter,firstdigits,query,wl;
    char *s;
    wehaveadigit=wehavealetter=query=0;
    for (s=checkword;*s;s++)
    {
        if (gcisalpha(*s))
            wehavealetter=1;
        else if (gcisdigit(*s))
            wehaveadigit=1;
    }
    if (wehaveadigit && wehavealetter)
    {
        /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
        query=1;
        wl=strlen(checkword);
        /* Count the run of digits at the start of the word. */
        for (firstdigits=0;gcisdigit(checkword[firstdigits]);firstdigits++)
            ;
        /* digits, ending in st, rd, nd, th of either case */
        if (firstdigits+2==wl && (matchword(checkword+wl-2,"st") ||
          matchword(checkword+wl-2,"rd") || matchword(checkword+wl-2,"nd") ||
          matchword(checkword+wl-2,"th")))
            query=0;
        /* digits, ending in sts, rds, nds, ths */
        if (firstdigits+3==wl && (matchword(checkword+wl-3,"sts") ||
          matchword(checkword+wl-3,"rds") || matchword(checkword+wl-3,"nds") ||
          matchword(checkword+wl-3,"ths")))
            query=0;
        /*
         * digits, ending in stly, rdly, ndly, thly. The suffix is four
         * characters long, so the digits must account for all but four
         * characters of the word; this also guarantees that
         * checkword+wl-4 never points before the start of the word.
         */
        if (firstdigits+4==wl && (matchword(checkword+wl-4,"stly") ||
          matchword(checkword+wl-4,"rdly") ||
          matchword(checkword+wl-4,"ndly") || matchword(checkword+wl-4,"thly")))
            query=0;
        /* digits, ending in l, L, s or d */
        if (firstdigits+1==wl && (checkword[wl-1]=='l' ||
          checkword[wl-1]=='L' || checkword[wl-1]=='s' || checkword[wl-1]=='d'))
            query=0;
        /*
         * L at the start of a number, representing British pounds, like L500.
         * This is cute. We know the current word is mixeddigit. If the first
         * letter is L, there must be at least one digit following. If both
         * digits and letters follow, we have a genuine error, else we have a
         * capital L followed by digits, and we accept that as a non-error.
         */
        if (checkword[0]=='L' && !mixdigit(checkword+1))
            query=0;
    }
    return query;
}
2867 * Extracts the first/next "word" from the line, and puts
2868 * it into "thisword". A word is defined as one English word unit--or
2869 * at least that's the aim.
2871 * Returns: a pointer to the position in the line where we will start
2872 * looking for the next word.
2874 const char *getaword(const char *fromline,char *thisword)
2879 for (;!gcisdigit(*fromline) && !gcisalpha(*fromline) && *fromline;
2883 * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
2884 * Especially yucky is the case of L1,000
2885 * This section looks for a pattern of characters including a digit
2886 * followed by a comma or period followed by one or more digits.
2887 * If found, it returns this whole pattern as a word; otherwise we discard
2888 * the results and resume our normal programming.
2891 for (;(gcisdigit(*s) || gcisalpha(*s) || *s==',' || *s=='.') &&
2892 wordlen<MAXWORDLEN;s++)
2894 thisword[wordlen]=*s;
2897 thisword[wordlen]=0;
2898 for (i=1;i<wordlen-1;i++)
2900 if (thisword[i]=='.' || thisword[i]==',')
2902 if (gcisdigit(thisword[i-1]) && gcisdigit(thisword[i-1]))
2909 /* we didn't find a punctuated number - do the regular getword thing */
2911 for (;(gcisdigit(*fromline) || gcisalpha(*fromline) || *fromline=='\'') &&
2912 wordlen<MAXWORDLEN;fromline++)
2914 thisword[wordlen]=*fromline;
2917 thisword[wordlen]=0;
/*
 * matchword:
 *
 * A case-insensitive string matcher.
 *
 * Returns: 1 if checkfor and thisword have the same length and are equal
 * when case is ignored, 0 otherwise.
 */
int matchword(char *checkfor,char *thisword)
{
    size_t i,len;
    len=strlen(checkfor);
    if (len!=strlen(thisword))
        return 0;
    for (i=0;i<len;i++)
    {
        /*
         * toupper() is only defined for values representable as unsigned
         * char (or EOF), so convert first; plain char may be signed and
         * 8-bit text would otherwise pass negative values.
         */
        if (toupper((unsigned char)checkfor[i])!=
          toupper((unsigned char)thisword[i]))
            return 0;
    }
    return 1;
}
/*
 * lowerit:
 *
 * Lowercase the line in place. Only the ASCII letters A-Z are converted;
 * every other byte is left as it is.
 */
void lowerit(char *theline)
{
    char *p=theline;
    while (*p)
    {
        if (*p>='A' && *p<='Z')
            *p=(char)(*p-'A'+'a');
        p++;
    }
}
/*
 * isroman:
 *
 * Is this word a Roman Numeral?
 *
 * The word is expected in lower case. No attempt is made to check that
 * the number is well formed--MXXXXXXXXXX passes, for example. The only
 * question answered is whether the word LOOKS like a Roman numeral.
 * Accepted, in this order: any number of M, an optional D, an optional CM,
 * an optional CD, any number of C, an optional XL, an optional XC, an
 * optional L, any number of X, an optional IX, an optional IV, an optional
 * V and any number of I.
 *
 * Returns: 1 if the whole word is consumed by that pattern, 0 if not
 * (including for a null or empty word).
 */
int isroman(char *t)
{
    if (!t || !*t)
        return 0;
    while (*t=='m')
        t++;
    if (*t=='d')
        t++;
    if (t[0]=='c' && t[1]=='m')
        t+=2;
    if (t[0]=='c' && t[1]=='d')
        t+=2;
    while (*t=='c')
        t++;
    if (t[0]=='x' && t[1]=='l')
        t+=2;
    if (t[0]=='x' && t[1]=='c')
        t+=2;
    if (*t=='l')
        t++;
    while (*t=='x')
        t++;
    if (t[0]=='i' && t[1]=='x')
        t+=2;
    if (t[0]=='i' && t[1]=='v')
        t+=2;
    if (*t=='v')
        t++;
    while (*t=='i')
        t++;
    /* Anything left over means the word did not fit the pattern. */
    return *t==0;
}
/*
 * gcisalpha:
 *
 * A version of isalpha() that is somewhat lenient on 8-bit texts.
 * The standard function treats 8-bit accented characters as word breaks,
 * so a word such as tete written with accents would be seen as "t" and
 * "t" with 8-bit characters in between, and errors would be over-reported.
 * This version also accepts accented letters from the CP1252 (Windows) and
 * ISO-8859-1 character sets, which are the most common PG 8-bit types.
 *
 * Returns: 1 if c counts as a letter, 0 if not.
 */
int gcisalpha(unsigned char c)
{
    /* Plain ASCII letters. */
    if ((c>='a' && c<='z') || (c>='A' && c<='Z'))
        return 1;
    switch (c)
    {
        /* Accepted codes below 192. */
        case 140: case 142: case 156: case 158: case 159:
            return 1;
        /* Codes in the upper range that are deliberately rejected. */
        case 208: case 215: case 222: case 240: case 247: case 254:
            return 0;
        /* Everything else from 192 upwards is accepted. */
        default:
            return c>=192;
    }
}
/*
 * gcisdigit:
 *
 * A version of isdigit() that doesn't get confused in 8-bit texts.
 *
 * Returns: 1 for the ASCII digits 0-9, 0 for every other value.
 */
int gcisdigit(unsigned char c)
{
    if (c<'0')
        return 0;
    return c<='9';
}
/*
 * gcisletter:
 *
 * A version of isletter() that doesn't get confused in 8-bit texts.
 * NB: this is ISO-8859-1-specific.
 *
 * Returns: 1 for A-Z, a-z and every code from 192 upwards, 0 otherwise.
 */
int gcisletter(unsigned char c)
{
    if (c>='A' && c<='Z')
        return 1;
    if (c>='a' && c<='z')
        return 1;
    return c>=192;
}
/*
 * gcstrchr:
 *
 * Wraps strchr to return NULL if the character being searched for is zero.
 * (Plain strchr would return a pointer to the terminator in that case.)
 */
char *gcstrchr(char *s,char c)
{
    return c?strchr(s,c):NULL;
}
3058 * postprocess_for_DP:
3060 * Invoked with the -d switch from flgets().
3061 * It simply "removes" from the line a hard-coded set of common
3062 * DP-specific tags, so that the line passed to the main routine has
3063 * been pre-cleaned of DP markup.
3065 void postprocess_for_DP(char *theline)
3071 for (i=0;*DPmarkup[i];i++)
3073 s=strstr(theline,DPmarkup[i]);
3076 t=s+strlen(DPmarkup[i]);
3084 s=strstr(theline,DPmarkup[i]);
/*
 * postprocess_for_HTML:
 *
 * Invoked with the -m switch from flgets().
 * Cleans the line in place before the main routine sees it: known HTML
 * tags are taken out by losemarkup() and known HTML entities are swapped
 * for their text form by loseentities(). Each helper is called again and
 * again until it reports that nothing more was changed.
 */
void postprocess_for_HTML(char *theline)
{
    /* Only look for tags when both delimiters occur somewhere on the line. */
    int maybe_tag=strstr(theline,"<") && strstr(theline,">");
    if (maybe_tag)
    {
        while (losemarkup(theline))
            ;
    }
    while (loseentities(theline))
        ;
}
/*
 * losemarkup: helper for postprocess_for_HTML, which calls it repeatedly
 * until it returns a null pointer.
 *
 * It locates the first < and the first > on the line and compares the text
 * after the < with every entry of the markup table using tagcomp(); a zero
 * result from tagcomp() is treated as a match. The table ends at its
 * first empty string.
 *
 * NOTE(review): the statements that act on a match and the return
 * statements are not visible in this listing.
 */
3107 char *losemarkup(char *theline)
3113 s=strstr(theline,"<");
3114 t=strstr(theline,">");
3117 for (i=0;*markup[i];i++)
3118 if (!tagcomp(s+1,markup[i]))
3131 /* It's an unrecognized <xxx>. */
/*
 * loseentities: helper for postprocess_for_HTML, which calls it repeatedly
 * until it returns a null pointer.
 *
 * The entities table is searched twice, first by the htmlent field and
 * then by the htmlnum field. When an entry is found on the line, the text
 * after it is saved in a temporary buffer and the textent field is copied
 * over the place where the entry started.
 *
 * NOTE(review): several statements (including whatever follows each malloc
 * and the final reassembly of the line) are not visible in this listing.
 */
3135 char *loseentities(char *theline)
3141 for (i=0;*entities[i].htmlent;i++)
3143 s=strstr(theline,entities[i].htmlent);
/*
 * strlen(s) bytes are enough for the tail plus its terminator, because the
 * entity being removed is at least one character long.
 * NOTE(review): confirm that a failed malloc is handled in the hidden lines.
 */
3146 t=malloc((size_t)strlen(s));
3149 strcpy(t,s+strlen(entities[i].htmlent));
3150 strcpy(s,entities[i].textent);
/* Same procedure again for the htmlnum form of each entity. */
3156 for (i=0;*entities[i].htmlnum;i++)
3158 s=strstr(theline,entities[i].htmlnum);
3161 t=malloc((size_t)strlen(s));
3164 strcpy(t,s+strlen(entities[i].htmlnum));
3165 strcpy(s,entities[i].textent);
/*
 * tagcomp: compare tag text strin with basetag without regard to case
 * (both sides go through tolower), skipping over a slash. losemarkup
 * treats a zero result as a match.
 *
 * NOTE(review): most of this function is not visible in this listing.
 * NOTE(review): tolower is applied to plain char values here; confirm that
 * only 7-bit text can reach it, or convert through unsigned char.
 */
3174 int tagcomp(char *strin,char *basetag)
3180 t++; /* ignore a slash */
3183 if (tolower(*s)!=tolower(*t))
3193 fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
3194 fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
3195 fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
3196 fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
3197 "For details, read the file COPYING.\n",stderr);
3198 fputs("This is Free Software; "
3199 "you may redistribute it under certain conditions (GPL);\n",stderr);
3200 fputs("read the file COPYING for details.\n\n",stderr);
3201 fputs("Usage is: bookloupe [-setpxloyhud] filename\n",stderr);
3202 fputs(" where -s checks single quotes, -e suppresses echoing lines, "
3203 "-t checks typos\n",stderr);
3204 fputs(" -x (paranoid) switches OFF -t and extra checks, "
3205 "-l turns OFF line-end checks\n",stderr);
3206 fputs(" -o just displays overview without detail, "
3207 "-h echoes header fields\n",stderr);
3208 fputs(" -v (verbose) unsuppresses duplicate reporting, "
3209 "-m suppresses markup\n",stderr);
3210 fputs(" -d ignores DP-specific markup,\n",stderr);
3211 fputs(" -u uses a file gutcheck.typ to query user-defined "
3212 "possible typos\n",stderr);
3213 fputs("Sample usage: bookloupe warpeace.txt \n",stderr);
3215 fputs("Bookloupe looks for errors in Project Gutenberg(TM) etexts.\n",
3217 fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
3218 "non-ASCII\n",stderr);
3219 fputs("characters like accented letters, "
3220 "lines longer than 75 or shorter than 55,\n",stderr);
3221 fputs("unbalanced quotes or brackets, "
3222 "a variety of badly formatted punctuation, \n",stderr);
3223 fputs("HTML tags, some likely typos. "
3224 "It is NOT a substitute for human judgement.\n",stderr);