1 /*************************************************************************/
2 /* gutcheck - check for assorted weirdnesses in a PG candidate text file */
5 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com> */
7 /* This program is free software; you can redistribute it and/or modify */
8 /* it under the terms of the GNU General Public License as published by */
9 /* the Free Software Foundation; either version 2 of the License, or */
10 /* (at your option) any later version. */
12 /* This program is distributed in the hope that it will be useful, */
13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
15 /* GNU General Public License for more details. */
17 /* You should have received a copy of the GNU General Public License */
18 /* along with this program; if not, write to the */
19 /* Free Software Foundation, Inc., */
20 /* 59 Temple Place, */
22 /* Boston, MA 02111-1307 USA */
26 /* Overview comments: */
28 /* If you're reading this, you're either interested in how to detect */
29 /* formatting errors, or very very bored. */
31 /* Gutcheck is a homebrew formatting checker specifically for */
32 /* spotting common formatting problems in a PG e-text. I typically */
33 /* run it once or twice on a file I'm about to submit; it usually */
34 /* finds a few formatting problems. It also usually finds lots of */
35 /* queries that aren't problems at all; it _really_ doesn't like */
36 /* the standard PG header, for example. It's optimized for straight */
37 /* prose; poetry and non-fiction involving tables tend to trigger false alarms. */
40 /* The code of gutcheck is not very interesting, but the experience */
41 /* of what constitutes a possible error may be, and the best way to */
42 /* illustrate that is by example. */
45 /* Here are some common typos found in PG texts that gutcheck */
46 /* will flag as errors: */
48 /* "Look!John , over there!" */
49 /* <this is a HTML tag> */
51 /* Margaret said: " Now you should start for school." */
52 /* Margaret said: "Now you should start for school. (if end of para) */
53 /* The horse is said to he worth a lot. */
54 /* 0K - this'11 make you look close1y. */
55 /* "If you do. you'll regret it!" */
57 /* There are some complications . The extra space left around that */
58 /* period was an error . . . but that ellipsis wasn't. */
60 /* The last line of a paragraph */
61 /* is usually short. */
63 /* This period is an error.But the periods in a.m. aren't. */
65 /* Checks that are do-able but not (well) implemented are: */
66 /* Single-quote checking. */
67 /* Despite 3 attempts at it, singlequote checking is still */
68 /* crap in gutcheck. It may not be possible without analysis */
69 /* of the whole paragraph. */
71 /*************************************************************************/
/* Size limits and the shared line buffers used throughout the checker. */
79 #define MAXWORDLEN 80 /* max length of one word */
80 #define LINEBUFSIZE 2048 /* buffer size for an input line */
82 #define MAX_USER_TYPOS 1000 /* capacity of the usertypo[] pointer table */
83 #define USERTYPO_FILE "gutcheck.typ" /* user typo list: tried in the cwd first, then in the executable's directory */
86 #define MAX_PATH 16384 /* size of the running_from and usertypo_file path buffers */
89 char aline[LINEBUFSIZE]; /* line currently being read (input file or user typo file) */
90 char prevline[LINEBUFSIZE]; /* presumably the previously read line -- usage not visible here, verify */
93 char *typo[] = { "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane", "nad",
94 "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa", "bakc", "om",
95 "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt", "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr",
96 "hmi", "hse", "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd", "gerat", "goign",
97 "gruop", "haev", "hda", "hearign", "seeign", "sayign", "herat", "hge", "hsa", "hsi", "hte", "htere",
98 "htese", "htey", "htis", "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut", "loev",
99 "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter", "omre", "onyl", "otehr", "otu", "owrk",
100 "owuld", "peice", "peices", "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
101 "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe", "sohw", "stnad", "stopry",
102 "stoyr", "stpo", "tahn", "taht", "tath", "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge",
103 "thier", "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne", "tirne", "tkae",
104 "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey", "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih",
105 "whihc", "whta", "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
106 "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking", "wtih", "wuould", "wya", "yera",
107 "yeras", "yersa", "yoiu", "youve", "ytou", "yuor",
108 /* added h/b words for version 12 - removed a few with "tbe" v.25 */
109 "abead", "ahle", "ahout", "ahove", "altbough", "balf", "bardly", "bas", "bave", "baving", "bebind",
110 "beld", "belp", "belped", "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge", "dehates",
111 "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan", "hegin", "heing",
112 "helieve", "henefit", "hetter", "hetween", "heyond", "hig", "higber", "huild", "huy", "hy", "jobn", "joh",
113 "meanwbile", "memher", "memhers", "numher", "numhers",
114 "perbaps", "prohlem", "puhlic", "witbout",
115 /* and a few more for .18 */
116 "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud", "prornise", "prornised", "modem", "bo",
117 "heside", "chapteb", "chaptee", "se",
120 char *usertypo[MAX_USER_TYPOS];
122 /* Common abbreviations and other OK words not to query as typos. */
123 /* 0.99 last-minute - removed "ms" */
124 char *okword[] = {"mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm", "rd", "sh", "br",
125 "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "pompeii","hawaii","hawaiian",
126 "hotbed", "heartbeat", "heartbeats", "outbid", "outbids", "frostbite", "frostbitten",
129 /* Common abbreviations that cause otherwise unexplained periods. */
130 char *abbrev[] = {"cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op", "cit",
131 "deg", "min", "chap", "oz", "mme", "mlle", "mssrs",
133 /* Two-Letter combinations that rarely if ever start words, */
134 /* but are common scannos or otherwise common letter combinations. */
136 char *nostart[] = { "hr", "hl", "cb", "sb", "tb", "wb", "tl",
137 "tn", "rn", "lt", "tj",
140 /* Two-Letter combinations that rarely if ever end words */
141 /* but are common scannos or otherwise common letter combinations. */
143 char *noend[] = { "cb", "gb", "pb", "sb", "tb",
144 "wh","fr","br","qu","tw","gl","fl","sw","gr","sl","cl",
148 char *markup[] = { "a", "b", "big", "blockquote", "body", "br", "center",
149 "col", "div", "em", "font", "h1", "h2", "h3", "h4",
150 "h5", "h6", "head", "hr", "html", "i", "img", "li",
151 "meta", "ol", "p", "pre", "small", "span", "strong",
152 "sub", "sup", "table", "td", "tfoot", "thead", "title",
153 "tr", "tt", "u", "ul",
156 char *DPmarkup[] = { "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>",
157 ""}; /* <tb> added .991 */
159 char *nocomma[] = { "the", "it's", "their", "an", "mrs", "a", "our", "that's",
160 "its", "whose", "every", "i'll", "your", "my",
161 "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "rd",
162 "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
163 "i'm", "during", "let", "toward", "among",
167 char *noperiod[] = { "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
168 "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
169 "i'll", "whose", "who", "because", "when", "let", "till", "very",
170 "an", "among", "those", "into", "whom", "having", "thence",
174 char vowels[] = "aeiouàáâãäæèéêëìíîïòóôõöùúûü"; /* Carlo's old suggestion, updated .991 */
180 } entities[] = { "&", "&", "&",
181 "<", "<", "<",
182 ">", ">", ">",
183 "°", "°", " degrees",
184 "£", "£", "L",
185 """, """, "\"", /* -- quotation mark = APL quote, */
186 "Œ", "Œ", "OE", /* -- latin capital ligature OE, */
187 "œ", "œ", "oe", /* -- latin small ligature oe, U+0153 ISOlat2 --> */
188 "Š", "Š", "S", /* -- latin capital letter S with caron, */
189 "š", "š", "s", /* -- latin small letter s with caron, */
190 "Ÿ", "Ÿ", "Y", /* -- latin capital letter Y with diaeresis, */
191 "ˆ", "ˆ", "", /* -- modifier letter circumflex accent, */
192 "˜", "˜", "~", /* -- small tilde, U+02DC ISOdia --> */
193 " ", " ", " ", /* -- en space, U+2002 ISOpub --> */
194 " ", " ", " ", /* -- em space, U+2003 ISOpub --> */
195 " ", " ", " ", /* -- thin space, U+2009 ISOpub --> */
196 "–", "–", "-", /* -- en dash, U+2013 ISOpub --> */
197 "—", "—", "--", /* -- em dash, U+2014 ISOpub --> */
198 "‘", "‘", "'", /* -- left single quotation mark, */
199 "’", "’", "'", /* -- right single quotation mark, */
200 "‚", "‚", "'", /* -- single low-9 quotation mark, U+201A NEW --> */
201 "“", "“", "\"", /* -- left double quotation mark, */
202 "”", "”", "\"", /* -- right double quotation mark, */
203 "„", "„", "\"", /* -- double low-9 quotation mark, U+201E NEW --> */
204 "‹", "‹", "\"", /* -- single left-pointing angle quotation mark, */
205 "›", "›", "\"", /* -- single right-pointing angle quotation mark, */
206 " ", " ", " ", /* -- no-break space = non-breaking space, */
207 "¡", "¡", "!", /* -- inverted exclamation mark, U+00A1 ISOnum --> */
208 "¢", "¢", "c", /* -- cent sign, U+00A2 ISOnum --> */
209 "£", "£", "L", /* -- pound sign, U+00A3 ISOnum --> */
210 "¤", "¤", "$", /* -- currency sign, U+00A4 ISOnum --> */
211 "¥", "¥", "Y", /* -- yen sign = yuan sign, U+00A5 ISOnum --> */
212 "§", "§", "--", /* -- section sign, U+00A7 ISOnum --> */
213 "¨", "¨", " ", /* -- diaeresis = spacing diaeresis, */
214 "©", "©", "(C) ", /* -- copyright sign, U+00A9 ISOnum --> */
215 "ª", "ª", " ", /* -- feminine ordinal indicator, U+00AA ISOnum --> */
216 "«", "«", "\"", /* -- left-pointing double angle quotation mark */
217 "­", "­", "-", /* -- soft hyphen = discretionary hyphen, */
218 "®", "®", "(R) ", /* -- registered sign = registered trade mark sign, */
219 "¯", "¯", " ", /* -- macron = spacing macron = overline */
220 "°", "°", " degrees", /* -- degree sign, U+00B0 ISOnum --> */
221 "±", "±", "+-", /* -- plus-minus sign = plus-or-minus sign, */
222 "²", "²", "2", /* -- superscript two = superscript digit two */
223 "³", "³", "3", /* -- superscript three = superscript digit three */
224 "´", "´", " ", /* -- acute accent = spacing acute, */
225 "µ", "µ", "m", /* -- micro sign, U+00B5 ISOnum --> */
226 "¶", "¶", "--", /* -- pilcrow sign = paragraph sign, */
227 "¸", "¸", " ", /* -- cedilla = spacing cedilla, U+00B8 ISOdia --> */
228 "¹", "¹", "1", /* -- superscript one = superscript digit one, */
229 "º", "º", " ", /* -- masculine ordinal indicator, */
230 "»", "»", "\"", /* -- right-pointing double angle quotation mark */
231 "¼", "¼", "1/4", /* -- vulgar fraction one quarter */
232 "½", "½", "1/2", /* -- vulgar fraction one half */
233 "¾", "¾", "3/4", /* -- vulgar fraction three quarters */
234 "¿", "¿", "?", /* -- inverted question mark */
235 "À", "À", "A", /* -- latin capital letter A with grave */
236 "Á", "Á", "A", /* -- latin capital letter A with acute, */
237 "Â", "Â", "A", /* -- latin capital letter A with circumflex, */
238 "Ã", "Ã", "A", /* -- latin capital letter A with tilde, */
239 "Ä", "Ä", "A", /* -- latin capital letter A with diaeresis, */
240 "Å", "Å", "A", /* -- latin capital letter A with ring above */
241 "Æ", "Æ", "AE", /* -- latin capital letter AE */
242 "Ç", "Ç", "C", /* -- latin capital letter C with cedilla, */
243 "È", "È", "E", /* -- latin capital letter E with grave, */
244 "É", "É", "E", /* -- latin capital letter E with acute, */
245 "Ê", "Ê", "E", /* -- latin capital letter E with circumflex, */
246 "Ë", "Ë", "E", /* -- latin capital letter E with diaeresis, */
247 "Ì", "Ì", "I", /* -- latin capital letter I with grave, */
248 "Í", "Í", "I", /* -- latin capital letter I with acute, */
249 "Î", "Î", "I", /* -- latin capital letter I with circumflex, */
250 "Ï", "Ï", "I", /* -- latin capital letter I with diaeresis, */
251 "Ð", "Ð", "E", /* -- latin capital letter ETH, U+00D0 ISOlat1 --> */
252 "Ñ", "Ñ", "N", /* -- latin capital letter N with tilde, */
253 "Ò", "Ò", "O", /* -- latin capital letter O with grave, */
254 "Ó", "Ó", "O", /* -- latin capital letter O with acute, */
255 "Ô", "Ô", "O", /* -- latin capital letter O with circumflex, */
256 "Õ", "Õ", "O", /* -- latin capital letter O with tilde, */
257 "Ö", "Ö", "O", /* -- latin capital letter O with diaeresis, */
258 "×", "×", "*", /* -- multiplication sign, U+00D7 ISOnum --> */
259 "Ø", "Ø", "O", /* -- latin capital letter O with stroke */
260 "Ù", "Ù", "U", /* -- latin capital letter U with grave, */
261 "Ú", "Ú", "U", /* -- latin capital letter U with acute, */
262 "Û", "Û", "U", /* -- latin capital letter U with circumflex, */
263 "Ü", "Ü", "U", /* -- latin capital letter U with diaeresis, */
264 "Ý", "Ý", "Y", /* -- latin capital letter Y with acute, */
265 "Þ", "Þ", "TH", /* -- latin capital letter THORN, */
266 "ß", "ß", "sz", /* -- latin small letter sharp s = ess-zed, */
267 "à", "à", "a", /* -- latin small letter a with grave */
268 "á", "á", "a", /* -- latin small letter a with acute, */
269 "â", "â", "a", /* -- latin small letter a with circumflex, */
270 "ã", "ã", "a", /* -- latin small letter a with tilde, */
271 "ä", "ä", "a", /* -- latin small letter a with diaeresis, */
272 "å", "å", "a", /* -- latin small letter a with ring above */
273 "æ", "æ", "ae", /* -- latin small letter ae */
274 "ç", "ç", "c", /* -- latin small letter c with cedilla, */
275 "è", "è", "e", /* -- latin small letter e with grave, */
276 "é", "é", "e", /* -- latin small letter e with acute, */
277 "ê", "ê", "e", /* -- latin small letter e with circumflex, */
278 "ë", "ë", "e", /* -- latin small letter e with diaeresis, */
279 "ì", "ì", "i", /* -- latin small letter i with grave, */
280 "í", "í", "i", /* -- latin small letter i with acute, */
281 "î", "î", "i", /* -- latin small letter i with circumflex, */
282 "ï", "ï", "i", /* -- latin small letter i with diaeresis, */
283 "ð", "ð", "eth", /* -- latin small letter eth, U+00F0 ISOlat1 --> */
284 "ñ", "ñ", "n", /* -- latin small letter n with tilde, */
285 "ò", "ò", "o", /* -- latin small letter o with grave, */
286 "ó", "ó", "o", /* -- latin small letter o with acute, */
287 "ô", "ô", "o", /* -- latin small letter o with circumflex, */
288 "õ", "õ", "o", /* -- latin small letter o with tilde, */
289 "ö", "ö", "o", /* -- latin small letter o with diaeresis, */
290 "÷", "÷", "/", /* -- division sign, U+00F7 ISOnum --> */
291 "ø", "ø", "o", /* -- latin small letter o with stroke, */
292 "ù", "ù", "u", /* -- latin small letter u with grave, */
293 "ú", "ú", "u", /* -- latin small letter u with acute, */
294 "û", "û", "u", /* -- latin small letter u with circumflex, */
295 "ü", "ü", "u", /* -- latin small letter u with diaeresis, */
296 "ý", "ý", "y", /* -- latin small letter y with acute, */
297 "þ", "þ", "th", /* -- latin small letter thorn, */
298 "ÿ", "ÿ", "y", /* -- latin small letter y with diaeresis, */
301 /* ---- list of special characters ---- */
302 #define CHAR_SPACE 32
306 #define CHAR_DQUOTE 34
307 #define CHAR_SQUOTE 39
308 #define CHAR_OPEN_SQUOTE 96
309 #define CHAR_TILDE 126
310 #define CHAR_ASTERISK 42
311 #define CHAR_FORESLASH 47
312 #define CHAR_CARAT 94
314 #define CHAR_UNDERSCORE '_'
315 #define CHAR_OPEN_CBRACK '{'
316 #define CHAR_CLOSE_CBRACK '}'
317 #define CHAR_OPEN_RBRACK '('
318 #define CHAR_CLOSE_RBRACK ')'
319 #define CHAR_OPEN_SBRACK '['
320 #define CHAR_CLOSE_SBRACK ']'
326 /* ---- longest and shortest normal PG line lengths ----*/
327 #define LONGEST_PG_LINE 75
328 #define WAY_TOO_LONG 80
329 #define SHORTEST_PG_LINE 55
331 #define SWITCHES "ESTPXLOYHWVMUD" /* switches:- */
332 /* D - ignore DP-specific markup */
333 /* E - echo queried line */
334 /* S - check single quotes */
335 /* T - check common typos */
336 /* P - require closure of quotes on */
337 /* every paragraph */
338 /* X - "Trust no one" :-) Paranoid! */
339 /* Queries everything */
340 /* L - line end checking defaults on */
341 /* -L turns it off */
342 /* O - overview. Just shows counts. */
343 /* Y - puts errors to stdout */
344 /* instead of stderr */
345 /* H - Echoes header fields */
346 /* M - Ignore markup in < > */
347 /* U - Use file of User-defined Typos*/
348 /* W - Defaults for use on Web upload*/
349 /* V - Verbose - list EVERYTHING! */
350 #define SWITNO 14 /* max number of switch parms */
351 /* - used for defining array-size */
352 #define MINARGS 1 /* minimum no of args excl switches */
353 #define MAXARGS 1 /* maximum no of args excl switches */
/* pswit[] holds one flag per command-line switch.  The switch-parsing  */
/* loop in main() matches each option letter against SWITCHES[i] and    */
/* the result is represented by pswit[i], so every index below must     */
/* equal the position of its letter in the SWITCHES string.             */
/* NOTE(review): main() also uses WEB_SWITCH and DP_SWITCH, whose       */
/* definitions are not visible in this run; by position in SWITCHES     */
/* ('W' and 'D') they would be 9 and 13 -- confirm.                     */
355 int pswit[SWITNO]; /* program switches set by SWITCHES */
357 #define ECHO_SWITCH 0 /* 'E' */
358 #define SQUOTE_SWITCH 1 /* 'S' */
359 #define TYPO_SWITCH 2 /* 'T' */
360 #define QPARA_SWITCH 3 /* 'P' */
361 #define PARANOID_SWITCH 4 /* 'X' */
362 #define LINE_END_SWITCH 5 /* 'L' */
363 #define OVERVIEW_SWITCH 6 /* 'O' */
364 #define STDOUT_SWITCH 7 /* 'Y' */
365 #define HEADER_SWITCH 8 /* 'H' */
367 #define VERBOSE_SWITCH 10 /* 'V' */
368 #define MARKUP_SWITCH 11 /* 'M' */
369 #define USERTYPO_SWITCH 12 /* 'U' */
374 long cnt_dquot; /* for overview mode, count of doublequote queries */
375 long cnt_squot; /* for overview mode, count of singlequote queries */
376 long cnt_brack; /* for overview mode, count of brackets queries */
377 long cnt_bin; /* for overview mode, count of non-ASCII queries */
378 long cnt_odd; /* for overview mode, count of odd character queries */
379 long cnt_long; /* for overview mode, count of long line errors */
380 long cnt_short; /* for overview mode, count of short line queries */
381 long cnt_punct; /* for overview mode, count of punctuation and spacing queries */
382 long cnt_dash; /* for overview mode, count of dash-related queries */
383 long cnt_word; /* for overview mode, count of word queries */
384 long cnt_html; /* for overview mode, count of html queries */
385 long cnt_lineend; /* for overview mode, count of line-end queries */
386 long cnt_spacend; /* count of lines with space at end V .21 */
387 long linecnt; /* count of total lines in the file */
388 long checked_linecnt; /* count of lines actually gutchecked V .26 */
391 void procfile(char *);
393 #define LOW_THRESHOLD 0
394 #define HIGH_THRESHOLD 1
400 #define FIRST_OF_PAIR 0
401 #define SECOND_OF_PAIR 1
403 #define MAX_WORDPAIR 1000
405 char running_from[MAX_PATH];
407 int mixdigit(char *);
408 char *getaword(char *, char *);
409 int matchword(char *, char *);
410 char *flgets(char *, int, FILE *, long);
411 void lowerit(char *);
412 int gcisalpha(unsigned char);
413 int gcisdigit(unsigned char);
414 int gcisletter(unsigned char);
415 char *gcstrchr(char *s, char c);
416 void postprocess_for_HTML(char *);
417 char *linehasmarkup(char *);
418 char *losemarkup(char *);
419 int tagcomp(char *, char *);
420 char *loseentities(char *);
423 void postprocess_for_DP(char *);
425 char wrk[LINEBUFSIZE];
427 /* This is disgustingly lazy, predefining max words & lengths, */
428 /* but now I'm out of 16-bit restrictions, what's a couple of K? */
430 #define MAX_QWORD_LENGTH 40
431 char qword[MAX_QWORD][MAX_QWORD_LENGTH];
432 char qperiod[MAX_QWORD][MAX_QWORD_LENGTH];
433 signed int dupcnt[MAX_QWORD];
438 int main(int argc, char **argv)
441 int i, switno, invarg;
442 char usertypo_file[MAX_PATH];
446 if (strlen(argv[0]) < sizeof(running_from))
447 strcpy(running_from, argv[0]); /* save the path to the executable gutcheck */
449 /* find out what directory we're running from */
450 for (s = running_from + strlen(running_from); *s != '/' && *s != '\\' && s >= running_from; s--)
454 switno = strlen(SWITCHES);
455 for (i = switno ; --i >0 ; )
456 pswit[i] = 0; /* initialise switches */
458 /* Standard loop to extract switches. */
459 /* When we come out of this loop, the arguments will be */
460 /* in argv[0] upwards and the switches used will be */
461 /* represented by their equivalent elements in pswit[] */
462 while ( --argc > 0 && **++argv == '-')
463 for (argsw = argv[0]+1; *argsw !='\0'; argsw++)
464 for (i = switno, invarg = 1; (--i >= 0) && invarg == 1 ; )
465 if ((toupper(*argsw)) == SWITCHES[i] ) {
470 pswit[PARANOID_SWITCH] ^= 1; /* Paranoid checking is turned OFF, not on, by its switch */
472 if (pswit[PARANOID_SWITCH]) { /* if running in paranoid mode */
473 pswit[TYPO_SWITCH] = pswit[TYPO_SWITCH] ^ 1; /* force typo checks as well */
474 } /* v.20 removed s and p switches from paranoid mode */
476 pswit[LINE_END_SWITCH] ^= 1; /* Line-end checking is turned OFF, not on, by its switch */
477 pswit[ECHO_SWITCH] ^= 1; /* V.21 Echoing is turned OFF, not on, by its switch */
479 if (pswit[OVERVIEW_SWITCH]) /* just print summary; don't echo */
480 pswit[ECHO_SWITCH] = 0;
482 /* Web uploads - for the moment, this is really just a placeholder */
483 /* until we decide what processing we really want to do on web uploads */
484 if (pswit[WEB_SWITCH]) { /* specific override for web uploads */
485 pswit[ECHO_SWITCH] = 1;
486 pswit[SQUOTE_SWITCH] = 0;
487 pswit[TYPO_SWITCH] = 1;
488 pswit[QPARA_SWITCH] = 0;
489 pswit[PARANOID_SWITCH] = 1;
490 pswit[LINE_END_SWITCH] = 0;
491 pswit[OVERVIEW_SWITCH] = 0;
492 pswit[STDOUT_SWITCH] = 0;
493 pswit[HEADER_SWITCH] = 1;
494 pswit[VERBOSE_SWITCH] = 0;
495 pswit[MARKUP_SWITCH] = 0;
496 pswit[USERTYPO_SWITCH] = 0;
497 pswit[DP_SWITCH] = 0;
501 if (argc < MINARGS || argc > MAXARGS) { /* check number of args */
503 return(1); /* exit */
507 /* read in the user-defined stealth scanno list */
509 if (pswit[USERTYPO_SWITCH]) { /* ... we were told we had one! */
510 if ((usertypofile = fopen(USERTYPO_FILE, "rb")) == NULL) { /* not in cwd. try gutcheck directory. */
511 strcpy(usertypo_file, running_from);
512 strcat(usertypo_file, USERTYPO_FILE);
513 if ((usertypofile = fopen(usertypo_file, "rb")) == NULL) { /* we ain't got no user typo file! */
514 printf(" --> I couldn't find gutcheck.typ -- proceeding without user typos.\n");
519 if (usertypofile) { /* we managed to open a User Typo File! */
520 if (pswit[USERTYPO_SWITCH]) {
521 while (flgets(aline, LINEBUFSIZE-1, usertypofile, (long)usertypo_count)) {
522 if (strlen(aline) > 1) {
523 if ((int)*aline > 33) {
524 s = malloc(strlen(aline)+1);
526 fprintf(stderr, "gutcheck: cannot get enough memory for user typo file!!\n");
530 usertypo[usertypo_count] = s;
532 if (usertypo_count >= MAX_USER_TYPOS) {
533 printf(" --> Only %d user-defined typos allowed: ignoring the rest\n");
540 fclose(usertypofile);
547 fprintf(stderr, "gutcheck: Check and report on an e-text\n");
549 cnt_dquot = cnt_squot = cnt_brack = cnt_bin = cnt_odd = cnt_long =
550 cnt_short = cnt_punct = cnt_dash = cnt_word = cnt_html = cnt_lineend =
555 if (pswit[OVERVIEW_SWITCH]) {
556 printf(" Checked %ld lines of %ld (head+foot = %ld)\n\n",
557 checked_linecnt, linecnt, linecnt - checked_linecnt);
558 printf(" --------------- Queries found --------------\n");
559 if (cnt_long) printf(" Long lines: %5ld\n",cnt_long);
560 if (cnt_short) printf(" Short lines: %5ld\n",cnt_short);
561 if (cnt_lineend) printf(" Line-end problems: %5ld\n",cnt_lineend);
562 if (cnt_word) printf(" Common typos: %5ld\n",cnt_word);
563 if (cnt_dquot) printf(" Unmatched quotes: %5ld\n",cnt_dquot);
564 if (cnt_squot) printf(" Unmatched SingleQuotes: %5ld\n",cnt_squot);
565 if (cnt_brack) printf(" Unmatched brackets: %5ld\n",cnt_brack);
566 if (cnt_bin) printf(" Non-ASCII characters: %5ld\n",cnt_bin);
567 if (cnt_odd) printf(" Proofing characters: %5ld\n",cnt_odd);
568 if (cnt_punct) printf(" Punctuation & spacing queries: %5ld\n",cnt_punct);
569 if (cnt_dash) printf(" Non-standard dashes: %5ld\n",cnt_dash);
570 if (cnt_html) printf(" Possible HTML tags: %5ld\n",cnt_html);
572 printf(" TOTAL QUERIES %5ld\n",
573 cnt_dquot + cnt_squot + cnt_brack + cnt_bin + cnt_odd + cnt_long +
574 cnt_short + cnt_punct + cnt_dash + cnt_word + cnt_html + cnt_lineend);
582 /* procfile - process one file */
584 void procfile(char *filename)
587 char *s, *t, *s1, laststart, *wordstart;
588 char inword[MAXWORDLEN], testword[MAXWORDLEN];
589 char parastart[81]; /* first line of current para */
591 long quot, squot, firstline, alphalen, totlen, binlen,
592 shortline, longline, verylongline, spacedash, emdash,
593 space_emdash, non_PG_space_emdash, PG_space_emdash,
594 footerline, dotcomma, start_para_line, astline, fslashline,
595 standalone_digit, hyphens, htmcount, endquote_count;
596 long spline, nspline;
597 signed int i, j, llen, isemptyline, isacro, isellipsis, istypo, alower,
598 eNon_A, eTab, eTilde, eAst, eFSlash, eCarat;
599 signed int warn_short, warn_long, warn_bin, warn_dash, warn_dotcomma,
600 warn_ast, warn_fslash, warn_digit, warn_hyphen, warn_endquote;
601 unsigned int lastlen, lastblen;
602 signed int s_brack, c_brack, r_brack, c_unders;
603 signed int open_single_quote, close_single_quote, guessquote, dquotepar, squotepar;
604 signed int isnewpara, vowel, consonant;
605 char dquote_err[80], squote_err[80], rbrack_err[80], sbrack_err[80], cbrack_err[80],
607 signed int qword_index, qperiod_index, isdup;
609 signed int Dutchcount, isDutch, Frenchcount, isFrench;
615 laststart = CHAR_SPACE;
616 lastlen = lastblen = 0;
617 *dquote_err = *squote_err = *rbrack_err = *cbrack_err = *sbrack_err =
618 *unders_err = *prevline = 0;
619 linecnt = firstline = alphalen = totlen = binlen =
620 shortline = longline = spacedash = emdash = checked_linecnt =
621 space_emdash = non_PG_space_emdash = PG_space_emdash =
622 footerline = dotcomma = start_para_line = astline = fslashline =
623 standalone_digit = hyphens = htmcount = endquote_count = 0;
624 quot = squot = s_brack = c_brack = r_brack = c_unders = 0;
625 i = llen = isemptyline = isacro = isellipsis = istypo = 0;
626 warn_short = warn_long = warn_bin = warn_dash = warn_dotcomma =
627 warn_ast = warn_fslash = warn_digit = warn_endquote = 0;
628 isnewpara = vowel = consonant = enddash = 0;
629 spline = nspline = 0;
630 qword_index = qperiod_index = isdup = 0;
631 *inword = *testword = 0;
632 open_single_quote = close_single_quote = guessquote = dquotepar = squotepar = 0;
633 Dutchcount = isDutch = Frenchcount = isFrench = 0;
636 for (j = 0; j < MAX_QWORD; j++) {
638 for (i = 0; i < MAX_QWORD_LENGTH; i++)
644 if ((infile = fopen(filename, "rb")) == NULL) {
645 if (pswit[STDOUT_SWITCH])
646 fprintf(stdout, "gutcheck: cannot open %s\n", filename);
648 fprintf(stderr, "gutcheck: cannot open %s\n", filename);
652 fprintf(stdout, "\n\nFile: %s\n\n", filename);
653 firstline = shortline = longline = verylongline = 0;
656 /*****************************************************/
658 /* Run a first pass - verify that it's a valid PG */
659 /* file, decide whether to report some things that */
660 /* occur many times in the text like long or short */
661 /* lines, non-standard dashes, and other good stuff */
662 /* I'll doubtless think of later. */
664 /*****************************************************/
666 /*****************************************************/
667 /* V.24 Sigh. Yet Another Header Change */
668 /*****************************************************/
670 while (fgets(aline, LINEBUFSIZE-1, infile)) {
671 while (aline[strlen(aline)-1] == 10 || aline[strlen(aline)-1] == 13 ) aline[strlen(aline)-1] = 0;
673 if (strstr(aline, "*END") && strstr(aline, "SMALL PRINT") && (strstr(aline, "PUBLIC DOMAIN") || strstr(aline, "COPYRIGHT"))) {
675 printf(" --> Duplicate header?\n");
676 spline = linecnt + 1; /* first line of non-header text, that is */
678 if (!strncmp(aline, "*** START", 9) && strstr(aline, "PROJECT GUTENBERG")) {
680 printf(" --> Duplicate header?\n");
681 nspline = linecnt + 1; /* first line of non-header text, that is */
683 if (spline || nspline) {
685 if (strstr(aline, "end") && strstr(aline, "project gutenberg")) {
686 if (strstr(aline, "end") < strstr(aline, "project gutenberg")) {
688 if (!nspline) /* it's an old-form header - we can detect duplicates */
689 printf(" --> Duplicate footer?\n");
694 footerline = linecnt;
699 if (spline) firstline = spline;
700 if (nspline) firstline = nspline; /* override with new */
702 if (footerline) continue; /* 0.99+ don't count the boilerplate in the footer */
704 llen = strlen(aline);
706 for (i = 0; i < llen; i++) {
707 if ((unsigned char)aline[i] > 127) binlen++;
708 if (gcisalpha(aline[i])) alphalen++;
710 if (aline[i] == CHAR_DQUOTE && isalpha(aline[i-1]))
713 if (strlen(aline) > 2
714 && lastlen > 2 && lastlen < SHORTEST_PG_LINE
715 && lastblen > 2 && lastblen > SHORTEST_PG_LINE
716 && laststart != CHAR_SPACE)
719 if (*aline) /* fixed line below for 0.96 */
720 if ((unsigned char)aline[strlen(aline)-1] <= CHAR_SPACE) cnt_spacend++;
722 if (strstr(aline, ".,")) dotcomma++;
723 /* 0.98 only count ast lines for ignoring purposes where there is */
724 /* locase text on the line */
725 if (strstr(aline, "*")) {
726 for (s = aline; *s; s++)
727 if (*s >='a' && *s <= 'z')
731 if (strstr(aline, "/"))
733 for (i = llen-1; i > 0 && (unsigned char)aline[i] <= CHAR_SPACE; i--);
734 if (aline[i] == '-' && aline[i-1] != '-') hyphens++;
736 if (llen > LONGEST_PG_LINE) longline++;
737 if (llen > WAY_TOO_LONG) verylongline++;
739 if (strstr(aline, "<") && strstr(aline, ">")) {
740 i = (signed int) (strstr(aline, ">") - strstr(aline, "<") + 1);
743 if (strstr(aline, "<i>")) htmcount +=4; /* bonus marks! */
746 /* Check for spaced em-dashes */
747 if (strstr(aline,"--")) {
749 if (*(strstr(aline, "--")-1) == CHAR_SPACE ||
750 (*(strstr(aline, "--")+2) == CHAR_SPACE))
752 if (*(strstr(aline, "--")-1) == CHAR_SPACE &&
753 (*(strstr(aline, "--")+2) == CHAR_SPACE))
754 non_PG_space_emdash++; /* count of em-dashes with spaces both sides */
755 if (*(strstr(aline, "--")-1) != CHAR_SPACE &&
756 (*(strstr(aline, "--")+2) != CHAR_SPACE))
757 PG_space_emdash++; /* count of PG-type em-dashes with no spaces */
760 for (s = aline; *s;) {
761 s = getaword(s, inword);
762 if (!strcmp(inword, "hij") || !strcmp(inword, "niet"))
764 if (!strcmp(inword, "dans") || !strcmp(inword, "avec"))
766 if (!strcmp(inword, "0") || !strcmp(inword, "1"))
770 /* Check for spaced dashes */
771 if (strstr(aline," -"))
772 if (*(strstr(aline, " -")+2) != '-')
775 lastlen = strlen(aline);
776 laststart = aline[0];
782 /* now, based on this quick view, make some snap decisions */
783 if (cnt_spacend > 0) {
784 printf(" --> %ld lines in this file have white space at end\n", cnt_spacend);
790 printf(" --> %ld lines in this file contain '.,'. Not reporting them.\n", dotcomma);
793 /* if more than 50 lines, or one-tenth, are short, don't bother reporting them */
795 if (shortline > 50 || shortline * 10 > linecnt) {
797 printf(" --> %ld lines in this file are short. Not reporting short lines.\n", shortline);
800 /* if more than 50 lines, or one-tenth, are long, don't bother reporting them */
802 if (longline > 50 || longline * 10 > linecnt) {
804 printf(" --> %ld lines in this file are long. Not reporting long lines.\n", longline);
807 /* if more than 10 lines contain asterisks, don't bother reporting them V.0.97 */
811 printf(" --> %ld lines in this file contain asterisks. Not reporting them.\n", astline);
814 /* if more than 10 lines contain forward slashes, don't bother reporting them V.0.99 */
816 if (fslashline > 10 ) {
818 printf(" --> %ld lines in this file contain forward slashes. Not reporting them.\n", fslashline);
821 /* if more than 20 lines contain unpunctuated endquotes, don't bother reporting them V.0.99 */
823 if (endquote_count > 20 ) {
825 printf(" --> %ld lines in this file contain unpunctuated endquotes. Not reporting them.\n", endquote_count);
828 /* if more than 10 lines contain standalone digits, don't bother reporting them V.0.97 */
830 if (standalone_digit > 10 ) {
832 printf(" --> %ld lines in this file contain standalone 0s and 1s. Not reporting them.\n", standalone_digit);
835 /* if more than 20 lines contain hyphens at end, don't bother reporting them V.0.98 */
839 printf(" --> %ld lines in this file have hyphens at end. Not reporting them.\n", hyphens);
842 if (htmcount > 20 && !pswit[MARKUP_SWITCH]) {
843 printf(" --> Looks like this is HTML. Switching HTML mode ON.\n");
844 pswit[MARKUP_SWITCH] = 1;
847 if (verylongline > 0) {
848 printf(" --> %ld lines in this file are VERY long!\n", verylongline);
851 /* If there are more non-PG spaced dashes than PG em-dashes, */
852 /* assume it's deliberate */
853 /* Current PG guidelines say don't use them, but older texts do,*/
854 /* and some people insist on them whatever the guidelines say. */
855 /* V.20 removed requirement that PG_space_emdash be greater than*/
856 /* ten before turning off warnings about spaced dashes. */
858 if (spacedash + non_PG_space_emdash > PG_space_emdash) {
860 printf(" --> There are %ld spaced dashes and em-dashes. Not reporting them.\n", spacedash + non_PG_space_emdash);
863 /* if more than a quarter of characters are hi-bit, bug out */
865 if (binlen * 4 > totlen) {
866 printf(" --> This file does not appear to be ASCII. Terminating. Best of luck with it!\n");
869 if (alphalen * 4 < totlen) {
870 printf(" --> This file does not appear to be text. Terminating. Best of luck with it!\n");
873 if ((binlen * 100 > totlen) || (binlen > 100)) {
874 printf(" --> There are a lot of foreign letters here. Not reporting them.\n");
878 /* isDutch and isFrench added .991 Feb 06 for Frank, Jeroen, Renald */
880 if (Dutchcount > 50) {
882 printf(" --> This looks like Dutch - switching off dashes and warnings for 's Middags case.\n");
886 if (Frenchcount > 50) {
888 printf(" --> This looks like French - switching off some doublepunct.\n");
891 if (firstline && footerline)
892 printf(" The PG header and footer appear to be already on.\n");
895 printf(" The PG header is on - no footer.\n");
897 printf(" The PG footer is on - no header.\n");
901 /* V.22 George Davis asked for an override switch to force it to list everything */
902 if (pswit[VERBOSE_SWITCH]) {
913 printf(" *** Verbose output is ON -- you asked for it! ***\n");
917 warn_dash = 0; /* Frank suggested turning it REALLY off for Dutch */
919 if ((infile = fopen(filename, "rb")) == NULL) {
920 if (pswit[STDOUT_SWITCH])
921 fprintf(stdout, "gutcheck: cannot open %s\n", filename);
923 fprintf(stderr, "gutcheck: cannot open %s\n", filename);
927 if (footerline > 0 && firstline > 0 && footerline > firstline && footerline - firstline < 100) { /* ugh */
928 printf(" --> I don't really know where this text starts. \n");
929 printf(" There are no reference points.\n");
930 printf(" I'm going to have to report the header and footer as well.\n");
936 /*****************************************************/
938 /* Here we go with the main pass. Hold onto yer hat! */
940 /*****************************************************/
942 /* Re-init some variables we've dirtied */
943 quot = squot = linecnt = 0;
944 laststart = CHAR_SPACE;
945 lastlen = lastblen = 0;
947 while (flgets(aline, LINEBUFSIZE-1, infile, linecnt+1)) {
949 if (linecnt == 1) isnewpara = 1;
950 if (pswit[DP_SWITCH])
951 if (!strncmp(aline, "-----File: ", 11))
952 continue; // skip DP page separators completely
953 if (linecnt < firstline || (footerline > 0 && linecnt > footerline)) {
954 if (pswit[HEADER_SWITCH]) {
955 if (!strncmp(aline, "Title:", 6))
956 printf(" %s\n", aline);
957 if (!strncmp (aline, "Author:", 7))
958 printf(" %s\n", aline);
959 if (!strncmp(aline, "Release Date:", 13))
960 printf(" %s\n", aline);
961 if (!strncmp(aline, "Edition:", 8))
962 printf(" %s\n\n", aline);
964 continue; /* skip through the header */
968 isemptyline = 1; /* assume the line is empty until proven otherwise */
970 /* If we are in a state of unbalanced quotes, and this line */
971 /* doesn't begin with a quote, output the stored error message */
972 /* If the -P switch was used, print the warning even if the */
973 /* new para starts with quotes */
974 /* Version .20 - if the new paragraph does start with a quote, */
975 /* but is indented, I was giving a spurious error. Need to */
976 /* check the first _non-space_ character on the line rather */
977 /* than the first character when deciding whether the para */
978 /* starts with a quote. Using *t for this. */
980 while (*t == ' ') t++;
982 if (*t != CHAR_DQUOTE || pswit[QPARA_SWITCH]) {
983 if (!pswit[OVERVIEW_SWITCH]) {
984 if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart);
991 if (*t != CHAR_SQUOTE && *t != CHAR_OPEN_SQUOTE || pswit[QPARA_SWITCH] || squot) {
992 if (!pswit[OVERVIEW_SWITCH]) {
993 if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart);
1002 if (!pswit[OVERVIEW_SWITCH]) {
1003 if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart);
1010 if (!pswit[OVERVIEW_SWITCH]) {
1011 if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart);
1018 if (!pswit[OVERVIEW_SWITCH]) {
1019 if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart);
1026 if (!pswit[OVERVIEW_SWITCH]) {
1027 if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart);
1034 *dquote_err = *squote_err = *rbrack_err = *cbrack_err =
1035 *sbrack_err = *unders_err = 0;
1038 /* look along the line, accumulate the count of quotes, and see */
1039 /* if this is an empty line - i.e. a line with nothing on it */
1041 /* V .12 also if line has just spaces, * and/or - on it, don't */
1042 /* count it, since empty lines with asterisks or dashes to */
1043 /* separate sections are common. */
1044 /* V .15 new single-quote checking - has to be better than the */
1045 /* previous version, but how much better? fingers crossed! */
1046 /* V .20 add period to * and - as characters on a separator line*/
1049 if (*s == CHAR_DQUOTE) quot++;
1050 if (*s == CHAR_SQUOTE || *s == CHAR_OPEN_SQUOTE)
1051 if (s == aline) { /* at start of line, it can only be an openquote */
1052 if (strncmp(s+2, "tis", 3) && strncmp(s+2, "Tis", 3)) /* hardcode a very common exception! */
1053 open_single_quote++;
1056 if (gcisalpha(*(s-1)) && gcisalpha(*(s+1)))
1057 ; /* do nothing! - it's definitely an apostrophe, not a quote */
1058 else /* it's outside a word - let's check it out */
1059 if (*s == CHAR_OPEN_SQUOTE || gcisalpha(*(s+1))) { /* it damwell better BE an openquote */
1060 if (strncmp(s+1, "tis", 3) && strncmp(s+1, "Tis", 3)) /* hardcode a very common exception! */
1061 open_single_quote++;
1063 else { /* now - is it a closequote? */
1064 guessquote = 0; /* accumulate clues */
1065 if (gcisalpha(*(s-1))) { /* it follows a letter - could be either */
1067 if (*(s-1) == 's') { /* looks like a plural apostrophe */
1069 if (*(s+1) == CHAR_SPACE) /* bonus marks! */
1073 else /* it doesn't have a letter either side */
1074 if (strchr(".?!,;:", *(s-1)) && (strchr(".?!,;: ", *(s+1))))
1075 guessquote += 8; /* looks like a closequote */
1078 if (open_single_quote > close_single_quote)
1079 guessquote += 1; /* give it the benefit of some doubt - if a squote is already open */
1082 if (guessquote >= 0)
1083 close_single_quote++;
1086 if (*s != CHAR_SPACE
1089 && *s != CHAR_ASTERISK
1091 && *s != 10) isemptyline = 0; /* ignore lines like * * * as spacers */
1092 if (*s == CHAR_UNDERSCORE) c_unders++;
1093 if (*s == CHAR_OPEN_CBRACK) c_brack++;
1094 if (*s == CHAR_CLOSE_CBRACK) c_brack--;
1095 if (*s == CHAR_OPEN_RBRACK) r_brack++;
1096 if (*s == CHAR_CLOSE_RBRACK) r_brack--;
1097 if (*s == CHAR_OPEN_SBRACK) s_brack++;
1098 if (*s == CHAR_CLOSE_SBRACK) s_brack--;
1102 if (isnewpara && !isemptyline) { /* This line is the start of a new paragraph */
1103 start_para_line = linecnt;
1104 strncpy(parastart, aline, 80); /* Capture its first line in case we want to report it later */
1106 dquotepar = squotepar = 0; /* restart the quote count 0.98 */
1108 while (!gcisalpha(*s) && !gcisdigit(*s) && *s) s++; /* V.97 fixed bug - overran line and gave false warning - rare */
1109 if (*s >= 'a' && *s <='z') { /* and its first letter is lowercase */
1110 if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1111 if (!pswit[OVERVIEW_SWITCH])
1112 printf(" Line %ld column %d - Paragraph starts with lower-case\n", linecnt, (int)(s - aline) +1);
1116 isnewpara = 0; /* Signal the end of new para processing */
1119 /* Check for an em-dash broken at line end */
1120 if (enddash && *aline == '-') {
1121 if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1122 if (!pswit[OVERVIEW_SWITCH])
1123 printf(" Line %ld column 1 - Broken em-dash?\n", linecnt);
1128 for (s = aline + strlen(aline) - 1; *s == ' ' && s > aline; s--);
1129 if (s >= aline && *s == '-')
1133 /* Check for invalid or questionable characters in the line */
1134 /* Anything above 127 is invalid for plain ASCII, and */
1135 /* non-printable control characters should also be flagged. */
1136 /* Tabs should generally not be there. */
1137 /* Jan 06, in 0.99: Hm. For some strange reason, I either */
1138 /* never created or deleted the check for unprintable */
1139 /* control characters. They should be reported even if */
1140 /* warn_bin is on, I think, and in full. */
1142 for (s = aline; *s; s++) {
1143 i = (unsigned char) *s;
1144 if (i < CHAR_SPACE && i != CHAR_LF && i != CHAR_CR && i != CHAR_TAB) {
1145 if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1146 if (!pswit[OVERVIEW_SWITCH])
1147 printf(" Line %ld column %d - Control character %d\n", linecnt, (int) (s - aline) + 1, i);
1154 eNon_A = eTab = eTilde = eCarat = eFSlash = eAst = 0; /* don't repeat multiple warnings on one line */
1155 for (s = aline; *s; s++) {
1156 if (!eNon_A && ((*s < CHAR_SPACE && *s != 9 && *s != '\n') || (unsigned char)*s > 127)) {
1157 i = *s; /* annoying kludge for signed chars */
1158 if (i < 0) i += 256;
1159 if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1160 if (!pswit[OVERVIEW_SWITCH])
1161 if (i > 127 && i < 160)
1162 printf(" Line %ld column %d - Non-ISO-8859 character %d\n", linecnt, (int) (s - aline) + 1, i);
1164 printf(" Line %ld column %d - Non-ASCII character %d\n", linecnt, (int) (s - aline) + 1, i);
1169 if (!eTab && *s == CHAR_TAB) {
1170 if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1171 if (!pswit[OVERVIEW_SWITCH])
1172 printf(" Line %ld column %d - Tab character?\n", linecnt, (int) (s - aline) + 1);
1177 if (!eTilde && *s == CHAR_TILDE) { /* often used by OCR software to indicate an unrecognizable character */
1178 if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1179 if (!pswit[OVERVIEW_SWITCH])
1180 printf(" Line %ld column %d - Tilde character?\n", linecnt, (int) (s - aline) + 1);
1185 if (!eCarat && *s == CHAR_CARAT) {
1186 if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1187 if (!pswit[OVERVIEW_SWITCH])
1188 printf(" Line %ld column %d - Carat character?\n", linecnt, (int) (s - aline) + 1);
1193 if (!eFSlash && *s == CHAR_FORESLASH && warn_fslash) {
1194 if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1195 if (!pswit[OVERVIEW_SWITCH])
1196 printf(" Line %ld column %d - Forward slash?\n", linecnt, (int) (s - aline) + 1);
1201 /* report asterisks only in paranoid mode, since they're often deliberate */
1202 if (!eAst && pswit[PARANOID_SWITCH] && warn_ast && !isemptyline && *s == CHAR_ASTERISK) {
1203 if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1204 if (!pswit[OVERVIEW_SWITCH])
1205 printf(" Line %ld column %d - Asterisk?\n", linecnt, (int) (s - aline) + 1);
1213 /* Check for line too long */
1215 if (strlen(aline) > LONGEST_PG_LINE) {
1216 if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1217 if (!pswit[OVERVIEW_SWITCH])
1218 printf(" Line %ld column %d - Long line %d\n", linecnt, strlen(aline), strlen(aline));
1224 /* Check for line too short. */
1225 /* This one is a bit trickier to implement: we don't want to */
1226 /* flag the last line of a paragraph for being short, so we */
1227 /* have to wait until we know that our current line is a */
1228 /* "normal" line, then report the _previous_ line if it was too */
1229 /* short. We also don't want to report indented lines like */
1230 /* chapter heads or formatted quotations. We therefore keep */
1231 /* lastlen as the length of the last line examined, and */
1232 /* lastblen as the length of the last but one, and try to */
1233 /* suppress unnecessary warnings by checking that both were of */
1234 /* "normal" length. We keep the first character of the last */
1235 /* line in laststart, and if it was a space, we assume that the */
1236 /* formatting is deliberate. I can't figure out a way to */
1237 /* distinguish something like a quoted verse left-aligned or */
1238 /* the header or footer of a letter from a paragraph of short */
1239 /* lines - maybe if I examined the whole paragraph, and if the */
1240 /* para has less than, say, 8 lines and if all lines are short, */
1241 /* then just assume it's OK? Need to look at some texts to see */
1242 /* how often a formula like this would get the right result. */
1243 /* V0.99 changed the tolerance for length to ignore from 2 to 1 */
1245 if (strlen(aline) > 1
1246 && lastlen > 1 && lastlen < SHORTEST_PG_LINE
1247 && lastblen > 1 && lastblen > SHORTEST_PG_LINE
1248 && laststart != CHAR_SPACE) {
1249 if (pswit[ECHO_SWITCH]) printf("\n%s\n", prevline);
1250 if (!pswit[OVERVIEW_SWITCH])
1251 printf(" Line %ld column %d - Short line %d?\n", linecnt-1, strlen(prevline), strlen(prevline));
1257 lastlen = strlen(aline);
1258 laststart = aline[0];
1260 /* look for punctuation at start of line */
1261 if (*aline && strchr(".?!,;:", aline[0])) { /* if it's punctuation */
1262 if (strncmp(". . .", aline, 5)) { /* exception for ellipsis: V.98 tightened up to except only a full ellipsis */
1263 if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1264 if (!pswit[OVERVIEW_SWITCH])
1265 printf(" Line %ld column 1 - Begins with punctuation?\n", linecnt);
1271 /* Check for spaced em-dashes */
1272 /* V.20 must check _all_ occurrences of "--" on the line */
1273 /* hence the loop - even if the first double-dash is OK */
1274 /* there may be another that's wrong later on. */
1277 while (strstr(s,"--")) {
1278 if (*(strstr(s, "--")-1) == CHAR_SPACE ||
1279 (*(strstr(s, "--")+2) == CHAR_SPACE)) {
1280 if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1281 if (!pswit[OVERVIEW_SWITCH])
1282 printf(" Line %ld column %d - Spaced em-dash?\n", linecnt, (int) (strstr(s,"--") - aline) + 1);
1286 s = strstr(s,"--") + 2;
1290 /* Check for spaced dashes */
1292 if (strstr(aline," -")) {
1293 if (*(strstr(aline, " -")+2) != '-') {
1294 if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1295 if (!pswit[OVERVIEW_SWITCH])
1296 printf(" Line %ld column %d - Spaced dash?\n", linecnt, (int) (strstr(aline," -") - aline) + 1);
1302 if (strstr(aline,"- ")) {
1303 if (*(strstr(aline, "- ")-1) != '-') {
1304 if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1305 if (!pswit[OVERVIEW_SWITCH])
1306 printf(" Line %ld column %d - Spaced dash?\n", linecnt, (int) (strstr(aline,"- ") - aline) + 1);
1313 /* Check for unmarked paragraphs indicated by separate speakers */
1314 /* May well be false positive: */
1315 /* "Bravo!" "Wonderful!" called the crowd. */
1316 /* but useful all the same. */
1319 if (strstr(aline, "\" \"")) s = strstr(aline, "\" \"");
1320 if (strstr(aline, "\" \"")) s = strstr(aline, "\" \"");
1322 if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1323 if (!pswit[OVERVIEW_SWITCH])
1324 printf(" Line %ld column %d - Query missing paragraph break?\n", linecnt, (int)(s - aline) +1);
1331 /* Check for "to he" and other easy he/be errors */
1332 /* This is a very inadequate effort on the he/be problem, */
1333 /* but the phrase "to he" is always an error, whereas "to */
1334 /* be" is quite common. I chuckle when it does catch one! */
1335 /* Similarly, '"Quiet!", be said.' is a non-be error */
1336 /* V .18 - "to he" is _not_ always an error!: */
1337 /* "Where they went to he couldn't say." */
1338 /* but I'm leaving it in anyway. */
1339 /* V .20 Another false positive: */
1340 /* What would "Cinderella" be without the . . . */
1341 /* and another "If he wants to he can see for himself." */
1342 /* V .21 Added " is be " and " be is " and " was be " */
1343 /* V .99 Added jeebies code -- removed again. */
1344 /* Is jeebies code worth adding? Rare to see he/be */
1345 /* errors with modern OCR. Separate program? Yes! */
1346 /* jeebies does the job without cluttering up this. */
1347 /* We do get a few more queryable pairs from the */
1348 /* project though -- they're cheap to implement. */
1349 /* Also added a column number for guiguts. */
1353 if (strstr(aline," to he ")) s = strstr(aline," to he ");
1354 if (strstr(aline,"\" be ")) s = strstr(aline,"\" be ");
1355 if (strstr(aline,"\", be ")) s = strstr(aline,"\", be ");
1356 if (strstr(aline," is be ")) s = strstr(aline," is be ");
1357 if (strstr(aline," be is ")) s = strstr(aline," be is ");
1358 if (strstr(aline," was be ")) s = strstr(aline," was be ");
1359 if (strstr(aline," be would ")) s = strstr(aline," be would ");
1360 if (strstr(aline," be could ")) s = strstr(aline," be could ");
1362 if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1363 if (!pswit[OVERVIEW_SWITCH])
1364 printf(" Line %ld column %d - Query he/be error?\n", linecnt, (int)(s - aline) +1);
1371 if (strstr(aline," i bad ")) s = strstr(aline," i bad ");
1372 if (strstr(aline," you bad ")) s = strstr(aline," you bad ");
1373 if (strstr(aline," he bad ")) s = strstr(aline," he bad ");
1374 if (strstr(aline," she bad ")) s = strstr(aline," she bad ");
1375 if (strstr(aline," they bad ")) s = strstr(aline," they bad ");
1376 if (strstr(aline," a had ")) s = strstr(aline," a had ");
1377 if (strstr(aline," the had ")) s = strstr(aline," the had ");
1379 if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1380 if (!pswit[OVERVIEW_SWITCH])
1381 printf(" Line %ld column %d - Query had/bad error?\n", linecnt, (int)(s - aline) +1);
1387 /* V .97 Added ", hut " Not too common, hut pretty certain */
1388 /* V.99 changed to add a column number for guiguts */
1391 if (strstr(aline,", hut ")) s = strstr(aline,", hut ");
1392 if (strstr(aline,"; hut ")) s = strstr(aline,"; hut ");
1394 if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1395 if (!pswit[OVERVIEW_SWITCH])
1396 printf(" Line %ld column %d - Query hut/but error?\n", linecnt, (int)(s - aline) +1);
1401 /* Special case - angled bracket in front of "From" placed there by an MTA */
1402 /* when sending an e-mail. V .21 */
1403 if (strstr(aline, ">From")) {
1404 if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1405 if (!pswit[OVERVIEW_SWITCH])
1406 printf(" Line %ld column %d - Query angled bracket with From\n", linecnt, (int)(strstr(aline, ">From") - aline) +1);
1411 /* V 0.98 Check for a single character line - often an overflow from bad wrapping. */
1412 if (*aline && !*(aline+1)) {
1413 if (*aline == 'I' || *aline == 'V' || *aline == 'X' || *aline == 'L' || gcisdigit(*aline))
1414 ; /* nothing - ignore numerals alone on a line. */
1416 if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1417 if (!pswit[OVERVIEW_SWITCH])
1418 printf(" Line %ld column 1 - Query single character line\n", linecnt);
1424 /* V 0.98 Check for I" - often should be ! */
1425 if (strstr(aline, " I\"")) {
1426 if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1427 if (!pswit[OVERVIEW_SWITCH])
1428 printf(" Line %ld column %ld - Query I=exclamation mark?\n", linecnt, strstr(aline, " I\"") - aline);
1433 /* V 0.98 Check for period without a capital letter. Cut-down from gutspell */
1434 /* Only works when it happens on a single line. */
1436 if (pswit[PARANOID_SWITCH])
1437 for (t = s = aline; strstr(t,". ");) {
1438 t = strstr(t, ". ");
1441 continue; /* start of line punctuation is handled elsewhere */
1443 if (!gcisalpha(*(t-1))) {
1447 if (isDutch) { /* For Frank & Jeroen -- 's Middags case */
1448 if (*(t+2) == CHAR_SQUOTE &&
1449 *(t+3)>='a' && *(t+3)<='z' &&
1450 *(t+4) == CHAR_SPACE &&
1451 *(t+5)>='A' && *(t+5)<='Z') {
1457 while (*s1 && !gcisalpha(*s1) && !isdigit(*s1))
1459 if (*s1 >= 'a' && *s1 <= 'z') { /* we have something to investigate */
1461 for (s1 = t - 1; s1 >= s &&
1462 (gcisalpha(*s1) || gcisdigit(*s1) ||
1463 (*s1 == CHAR_SQUOTE && gcisalpha(*(s1+1)) && gcisalpha(*(s1-1)))); s1--); /* so let's go back and find out */
1465 for (i = 0; *s1 && *s1 != '.'; s1++, i++)
1468 for (i = 0; *abbrev[i]; i++)
1469 if (!strcmp(testword, abbrev[i]))
1471 // if (*testword >= 'A' && *testword <= 'Z')
1473 if (gcisdigit(*testword)) istypo = 0;
1474 if (!*(testword+1)) istypo = 0;
1475 if (isroman(testword)) istypo = 0;
1478 for (i = 0; testword[i]; i++)
1479 if (strchr(vowels, testword[i]))
1484 if (strlen(testword) < MAX_QWORD_LENGTH && !pswit[VERBOSE_SWITCH])
1485 for (i = 0; i < qperiod_index; i++)
1486 if (!strcmp(testword, qperiod[i])) {
1490 if (qperiod_index < MAX_QWORD && strlen(testword) < MAX_QWORD_LENGTH) {
1491 strcpy(qperiod[qperiod_index], testword);
1494 if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1495 if (!pswit[OVERVIEW_SWITCH])
1496 printf(" Line %ld column %d - Extra period?\n", linecnt, (int)(t - aline)+1);
1506 if (pswit[TYPO_SWITCH]) { /* Should have put this condition in at the start of 0.99. Duh! */
1507 /* Check for words usually not followed by punctuation 0.99 */
1508 for (s = aline; *s;) {
1510 s = getaword(s, inword);
1511 if (!*inword) continue;
1513 for (i = 0; *nocomma[i]; i++)
1514 if (!strcmp(inword, nocomma[i])) {
1515 if (*s == ',' || *s == ';' || *s == ':') {
1516 if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1517 if (!pswit[OVERVIEW_SWITCH])
1518 printf(" Line %ld column %d - Query punctuation after %s?\n", linecnt, (int)(s - aline)+1, inword);
1523 for (i = 0; *noperiod[i]; i++)
1524 if (!strcmp(inword, noperiod[i])) {
1525 if (*s == '.' || *s == '!') {
1526 if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1527 if (!pswit[OVERVIEW_SWITCH])
1528 printf(" Line %ld column %d - Query punctuation after %s?\n", linecnt, (int)(s - aline)+1, inword);
1538 /* Check for commonly mistyped words, and digits like 0 for O in a word */
1539 for (s = aline; *s;) {
1541 s = getaword(s, inword);
1542 if (!*inword) continue; /* don't bother with empty lines */
1543 if (mixdigit(inword)) {
1544 if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1545 if (!pswit[OVERVIEW_SWITCH])
1546 printf(" Line %ld column %ld - Query digit in %s\n", linecnt, (int)(wordstart - aline) + 1, inword);
1551 /* put the word through a series of tests for likely typos and OCR errors */
1552 /* V.21 I had allowed lots of typo-checking even with the typo switch */
1553 /* turned off, but I really should disallow reporting of them when */
1554 /* the switch is off. Hence the "if" below. */
1555 if (pswit[TYPO_SWITCH]) {
1557 strcpy(testword, inword);
1559 for (i = 0; i < (signed int)strlen(testword); i++) { /* lowercase for testing */
1560 if (testword[i] >= 'a' && testword[i] <= 'z') alower = 1;
1561 if (alower && testword[i] >= 'A' && testword[i] <= 'Z') {
1562 /* we have an uppercase mid-word. However, there are common cases: */
1563 /* Mac and Mc like McGill */
1564 /* French contractions like l'Abbe */
1565 if ((i == 2 && testword[0] == 'm' && testword[1] == 'c') ||
1566 (i == 3 && testword[0] == 'm' && testword[1] == 'a' && testword[2] == 'c') ||
1567 (i > 0 && testword[i-1] == CHAR_SQUOTE))
1570 else { /* V.97 - remove separate case of uppercase within word so that */
1571 /* names like VanAllen fall into qword_index and get reported only once */
1575 testword[i] = (char)tolower(testword[i]);
1578 /* check for certain unlikely two-letter combinations at word start and end */
1579 /* V.0.97 - this replaces individual hardcoded checks in previous versions */
1580 if (strlen(testword) > 1) {
1581 for (i = 0; *nostart[i]; i++)
1582 if (!strncmp(testword, nostart[i], 2))
1584 for (i = 0; *noend[i]; i++)
1585 if (!strncmp(testword + strlen(testword) -2, noend[i], 2))
1590 /* ght is common, gbt never. Like that. */
1591 if (strstr(testword, "cb")) istypo = 1;
1592 if (strstr(testword, "gbt")) istypo = 1;
1593 if (strstr(testword, "pbt")) istypo = 1;
1594 if (strstr(testword, "tbs")) istypo = 1;
1595 if (strstr(testword, "mrn")) istypo = 1;
1596 if (strstr(testword, "ahle")) istypo = 1;
1597 if (strstr(testword, "ihle")) istypo = 1;
1599 /* "TBE" does happen - like HEARTBEAT - but uncommon. */
1600 /* Also "TBI" - frostbite, outbid - but uncommon. */
1601 /* Similarly "ii" like Hawaii, or Pompeii, and in Roman numerals, */
1602 /* but these are covered in V.20. "ii" is a common scanno. */
1603 if (strstr(testword, "tbi")) istypo = 1;
1604 if (strstr(testword, "tbe")) istypo = 1;
1605 if (strstr(testword, "ii")) istypo = 1;
1607 /* check for no vowels or no consonants. */
1608 /* If none, flag a typo */
1609 if (!istypo && strlen(testword)>1) {
1610 vowel = consonant = 0;
1611 for (i = 0; testword[i]; i++)
1612 if (testword[i] == 'y' || gcisdigit(testword[i])) { /* Yah, this is loose. */
1617 if (strchr(vowels, testword[i])) vowel++;
1619 if (!vowel || !consonant) {
1624 /* now exclude the word from being reported if it's in */
1625 /* the okword list */
1626 for (i = 0; *okword[i]; i++)
1627 if (!strcmp(testword, okword[i]))
1630 /* what looks like a typo may be a Roman numeral. Exclude these */
1632 if (isroman(testword))
1635 /* check the manual list of typos */
1637 for (i = 0; *typo[i]; i++)
1638 if (!strcmp(testword, typo[i]))
1642 /* V.21 - check lowercase s and l - special cases */
1643 /* V.98 - added "i" and "m" */
1644 /* V.99 - added "j" often a semi-colon gone wrong */
1645 /* - and "d" for a missing apostrophe - he d */
1646 /* - and "n" for "in" */
1647 if (!istypo && strlen(testword) == 1)
1648 if (strchr("slmijdn", *inword))
1654 if (strlen(testword) < MAX_QWORD_LENGTH && !pswit[VERBOSE_SWITCH])
1655 for (i = 0; i < qword_index; i++)
1656 if (!strcmp(testword, qword[i])) {
1661 if (qword_index < MAX_QWORD && strlen(testword) < MAX_QWORD_LENGTH) {
1662 strcpy(qword[qword_index], testword);
1665 if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1666 if (!pswit[OVERVIEW_SWITCH]) {
1667 printf(" Line %ld column %d - Query word %s", linecnt, (int)(wordstart - aline) + 1, inword);
1668 if (strlen(testword) < MAX_QWORD_LENGTH && !pswit[VERBOSE_SWITCH])
1669 printf(" - not reporting duplicates");
1676 } /* end of typo-checking */
1678 /* check the user's list of typos */
1681 for (i = 0; i < usertypo_count; i++)
1682 if (!strcmp(testword, usertypo[i])) {
1683 if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1684 if (!pswit[OVERVIEW_SWITCH])
1685 printf(" Line %ld column %d - Query possible scanno %s\n", linecnt, (int)(wordstart - aline) + 2, inword);
1690 if (pswit[PARANOID_SWITCH] && warn_digit) { /* in paranoid mode, query all 0 and 1 standing alone - added warn_digit V.97*/
1691 if (!strcmp(inword, "0") || !strcmp(inword, "1")) {
1692 if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1693 if (!pswit[OVERVIEW_SWITCH])
1694 printf(" Line %ld column %d - Query standalone %s\n", linecnt, (int)(wordstart - aline) + 2, inword);
1701 /* look for added or missing spaces around punctuation and quotes */
1702 /* If there is a punctuation character like ! with no space on */
1703 /* either side, suspect a missing!space. If there are spaces on */
1704 /* both sides , assume a typo. If we see a double quote with no */
1705 /* space or punctuation on either side of it, assume unspaced */
1706 /* quotes "like"this. */
1707 llen = strlen(aline);
1708 for (i = 1; i < llen; i++) { /* for each character in the line after the first */
1709 if (strchr(".?!,;:_", aline[i])) { /* if it's punctuation */
1710 isacro = 0; /* we need to suppress warnings for acronyms like M.D. */
1711 isellipsis = 0; /* we need to suppress warnings for ellipsis . . . */
1712 if ( (gcisalpha(aline[i-1]) && gcisalpha(aline[i+1])) || /* if there are letters on both sides of it or ... */
1713 (gcisalpha(aline[i+1]) && strchr("?!,;:", aline[i]))) { /* ...if it's strict punctuation followed by an alpha */
1714 if (aline[i] == '.') {
1716 if (aline[i-2] == '.') isacro = 1;
1718 if (aline[i+2] == '.') isacro = 1;
1721 if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1722 if (!pswit[OVERVIEW_SWITCH])
1723 printf(" Line %ld column %d - Missing space?\n", linecnt, i+1);
1728 if (aline[i-1] == CHAR_SPACE && (aline[i+1] == CHAR_SPACE || aline[i+1] == 0)) { /* if there are spaces on both sides, or space before and end of line */
1729 if (aline[i] == '.') {
1731 if (aline[i-2] == '.') isellipsis = 1;
1733 if (aline[i+2] == '.') isellipsis = 1;
1735 if (!isemptyline && !isellipsis) {
1736 if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1737 if (!pswit[OVERVIEW_SWITCH])
1738 printf(" Line %ld column %d - Spaced punctuation?\n", linecnt, i+1);
1746 /* 0.98 -- split out the characters that CANNOT be preceded by space */
1747 llen = strlen(aline);
1748 for (i = 1; i < llen; i++) { /* for each character in the line after the first */
1749 if (strchr("?!,;:", aline[i])) { /* if it's punctuation that _cannot_ have a space before it */
1750 if (aline[i-1] == CHAR_SPACE && !isemptyline && aline[i+1] != CHAR_SPACE) { /* if aline[i+1) DOES == space, it was already reported just above */
1751 if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1752 if (!pswit[OVERVIEW_SWITCH])
1753 printf(" Line %ld column %d - Spaced punctuation?\n", linecnt, i+1);
1761 /* 0.99 -- special case " .X" where X is any alpha. */
1762 /* This plugs a hole in the acronym code above. Inelegant, but maintainable. */
1763 llen = strlen(aline);
1764 for (i = 1; i < llen; i++) { /* for each character in the line after the first */
1765 if (aline[i] == '.') { /* if it's a period */
1766 if (aline[i-1] == CHAR_SPACE && gcisalpha(aline[i+1])) { /* if the period follows a space and is followed by a letter */
1767 if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1768 if (!pswit[OVERVIEW_SWITCH])
1769 printf(" Line %ld column %d - Spaced punctuation?\n", linecnt, i+1);
1779 /* v.21 breaking out the search for unspaced doublequotes */
1780 /* This is not as efficient, but it's more maintainable */
1781 /* V.97 added underscore to the list of characters not to query, */
1782 /* since underscores are commonly used as italics indicators. */
1783 /* V.98 Added slash as well, same reason. */
1784 for (i = 1; i < llen; i++) { /* for each character in the line after the first */
1785 if (aline[i] == CHAR_DQUOTE) {
1786 if ((!strchr(" _-.'`,;:!/([{?}])", aline[i-1]) &&
1787 !strchr(" _-.'`,;:!/([{?}])", aline[i+1]) &&
1789 || (!strchr(" _-([{'`", aline[i-1]) && gcisalpha(aline[i+1])))) {
1790 if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1791 if (!pswit[OVERVIEW_SWITCH])
1792 printf(" Line %ld column %d - Unspaced quotes?\n", linecnt, i+1);
1800 /* v.98 check parity of quotes */
1801 /* v.99 added !*(s+1) in some tests to catch "I am," he said, but I will not be soon". */
1802 for (s = aline; *s; s++) {
1803 if (*s == CHAR_DQUOTE) {
1804 if (!(dquotepar = !dquotepar)) { /* parity even */
1805 if (!strchr("_-.'`/,;:!?)]} ", *(s+1))) {
1806 if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1807 if (!pswit[OVERVIEW_SWITCH])
1808 printf(" Line %ld column %d - Wrongspaced quotes?\n", linecnt, (int)(s - aline)+1);
1813 else { /* parity odd */
1814 if (!gcisalpha(*(s+1)) && !isdigit(*(s+1)) && !strchr("_-/.'`([{$", *(s+1)) || !*(s+1)) {
1815 if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1816 if (!pswit[OVERVIEW_SWITCH])
1817 printf(" Line %ld column %d - Wrongspaced quotes?\n", linecnt, (int)(s - aline)+1);
1825 if (*aline == CHAR_DQUOTE) {
1826 if (strchr(",;:!?)]} ", aline[1])) {
1827 if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1828 if (!pswit[OVERVIEW_SWITCH])
1829 printf(" Line %ld column 1 - Wrongspaced quotes?\n", linecnt, (int)(s - aline)+1);
1835 if (pswit[SQUOTE_SWITCH])
1836 for (s = aline; *s; s++) {
1837 if ((*s == CHAR_SQUOTE || *s == CHAR_OPEN_SQUOTE)
1838 && ( s == aline || (s > aline && !gcisalpha(*(s-1))) || !gcisalpha(*(s+1)))) {
1839 if (!(squotepar = !squotepar)) { /* parity even */
1840 if (!strchr("_-.'`/\",;:!?)]} ", *(s+1))) {
1841 if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1842 if (!pswit[OVERVIEW_SWITCH])
1843 printf(" Line %ld column %d - Wrongspaced singlequotes?\n", linecnt, (int)(s - aline)+1);
1848 else { /* parity odd */
1849 if (!gcisalpha(*(s+1)) && !isdigit(*(s+1)) && !strchr("_-/\".'`", *(s+1)) || !*(s+1)) {
1850 if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1851 if (!pswit[OVERVIEW_SWITCH])
1852 printf(" Line %ld column %d - Wrongspaced singlequotes?\n", linecnt, (int)(s - aline)+1);
1861 /* v.20 also look for double punctuation like ,. or ,, */
1862 /* Thanks to DW for the suggestion! */
1863 /* I'm putting this in a separate loop for clarity */
1864 /* In books with references, ".," and ".;" are common */
1865 /* e.g. "etc., etc.," and vol. 1.; vol 3.; */
1866 /* OTOH, from my initial tests, there are also fairly */
1867 /* common errors. What to do? Make these cases paranoid? */
1868 /* V.21 ".," is the most common, so invented warn_dotcomma */
1869 /* to suppress detailed reporting if it occurs often */
1870 llen = strlen(aline);
1871 for (i = 0; i < llen; i++) /* for each character in the line */
1872 if (strchr(".?!,;:", aline[i]) /* if it's punctuation */
1873 && (strchr(".?!,;:", aline[i+1]))
1874 && aline[i] && aline[i+1]) /* followed by punctuation, it's a query, unless . . . */
1876 (aline[i] == aline[i+1]
1877 && (aline[i] == '.' || aline[i] == '?' || aline[i] == '!'))
1878 || (!warn_dotcomma && aline[i] == '.' && aline[i+1] == ',')
1879 || (isFrench && !strncmp(aline+i, ",...", 4))
1880 || (isFrench && !strncmp(aline+i, "...,", 4))
1881 || (isFrench && !strncmp(aline+i, ";...", 4))
1882 || (isFrench && !strncmp(aline+i, "...;", 4))
1883 || (isFrench && !strncmp(aline+i, ":...", 4))
1884 || (isFrench && !strncmp(aline+i, "...:", 4))
1885 || (isFrench && !strncmp(aline+i, "!...", 4))
1886 || (isFrench && !strncmp(aline+i, "...!", 4))
1887 || (isFrench && !strncmp(aline+i, "?...", 4))
1888 || (isFrench && !strncmp(aline+i, "...?", 4))
1890 if ((isFrench && !strncmp(aline+i, ",...", 4)) /* could this BE any more awkward? */
1891 || (isFrench && !strncmp(aline+i, "...,", 4))
1892 || (isFrench && !strncmp(aline+i, ";...", 4))
1893 || (isFrench && !strncmp(aline+i, "...;", 4))
1894 || (isFrench && !strncmp(aline+i, ":...", 4))
1895 || (isFrench && !strncmp(aline+i, "...:", 4))
1896 || (isFrench && !strncmp(aline+i, "!...", 4))
1897 || (isFrench && !strncmp(aline+i, "...!", 4))
1898 || (isFrench && !strncmp(aline+i, "?...", 4))
1899 || (isFrench && !strncmp(aline+i, "...?", 4)))
1901 ; /* do nothing for .. !! and ?? which can be legit */
1904 if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1905 if (!pswit[OVERVIEW_SWITCH])
1906 printf(" Line %ld column %d - Double punctuation?\n", linecnt, i+1);
1911 /* v.21 breaking out the search for spaced doublequotes */
1912 /* This is not as efficient, but it's more maintainable */
1914 while (strstr(s," \" ")) {
1915 if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1916 if (!pswit[OVERVIEW_SWITCH])
1917 printf(" Line %ld column %d - Spaced doublequote?\n", linecnt, (int)(strstr(s," \" ")-aline+1));
1920 s = strstr(s," \" ") + 2;
1923 /* v.20 also look for spaced singlequotes ' and ` */
1925 while (strstr(s," ' ")) {
1926 if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1927 if (!pswit[OVERVIEW_SWITCH])
1928 printf(" Line %ld column %d - Spaced singlequote?\n", linecnt, (int)(strstr(s," ' ")-aline+1));
1931 s = strstr(s," ' ") + 2;
1935 while (strstr(s," ` ")) {
1936 if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1937 if (!pswit[OVERVIEW_SWITCH])
1938 printf(" Line %ld column %d - Spaced singlequote?\n", linecnt, (int)(strstr(s," ` ")-aline+1));
1941 s = strstr(s," ` ") + 2;
1944 /* v.99 check special case of 'S instead of 's at end of word */
1947 if (*s == CHAR_SQUOTE && *(s+1) == 'S' && *(s-1)>='a' && *(s-1)<='z') {
1948 if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1949 if (!pswit[OVERVIEW_SWITCH])
1950 printf(" Line %ld column %d - Capital \"S\"?\n", linecnt, (int)(s-aline+2));
1958 /* v.21 Now check special cases - start and end of line - */
1959 /* for single and double quotes. Start is sometimes [sic] */
1960 /* but better to query it anyway. */
1961 /* While I'm here, check for dash at end of line */
1962 llen = strlen(aline);
1964 if (aline[llen-1] == CHAR_DQUOTE ||
1965 aline[llen-1] == CHAR_SQUOTE ||
1966 aline[llen-1] == CHAR_OPEN_SQUOTE)
1967 if (aline[llen-2] == CHAR_SPACE) {
1968 if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1969 if (!pswit[OVERVIEW_SWITCH])
1970 printf(" Line %ld column %d - Spaced quote?\n", linecnt, llen);
1975 /* V 0.98 removed aline[0] == CHAR_DQUOTE from the test below, since */
1976 /* Wrongspaced quotes test also catches it for " */
1977 if (aline[0] == CHAR_SQUOTE ||
1978 aline[0] == CHAR_OPEN_SQUOTE)
1979 if (aline[1] == CHAR_SPACE) {
1980 if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1981 if (!pswit[OVERVIEW_SWITCH])
1982 printf(" Line %ld column 1 - Spaced quote?\n", linecnt);
1986 /* dash at end of line may well be legit - paranoid mode only */
1987 /* and don't report em-dash at line-end */
1988 if (pswit[PARANOID_SWITCH] && warn_hyphen) {
1989 for (i = llen-1; i > 0 && (unsigned char)aline[i] <= CHAR_SPACE; i--);
1990 if (aline[i] == '-' && aline[i-1] != '-') {
1991 if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1992 if (!pswit[OVERVIEW_SWITCH])
1993 printf(" Line %ld column %d - Hyphen at end of line?\n", linecnt, i);
1998 /* v.21 also look for brackets surrounded by alpha */
1999 /* Brackets are often unspaced, but shouldn't be surrounded by alpha. */
2000 /* If so, suspect a scanno like "a]most" */
2001 llen = strlen(aline);
2002 for (i = 1; i < llen-1; i++) { /* for each character in the line except 1st & last*/
2003 if (strchr("{[()]}", aline[i]) /* if it's a bracket */
2004 && gcisalpha(aline[i-1]) && gcisalpha(aline[i+1])) {
2005 if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
2006 if (!pswit[OVERVIEW_SWITCH])
2007 printf(" Line %ld column %d - Unspaced bracket?\n", linecnt, i);
2012 /* The "Cinderella" case, back in again! :-S Give it another shot */
2013 if (warn_endquote) {
2014 llen = strlen(aline);
2015 for (i = 1; i < llen; i++) { /* for each character in the line except 1st */
2016 if (aline[i] == CHAR_DQUOTE)
2017 if (isalpha(aline[i-1])) {
2018 if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
2019 if (!pswit[OVERVIEW_SWITCH])
2020 printf(" Line %ld column %d - endquote missing punctuation?\n", linecnt, i);
2027 llen = strlen(aline);
2029 /* Check for <HTML TAG> */
2030 /* If there is a < in the line, followed at some point */
2031 /* by a > then we suspect HTML */
2032 if (strstr(aline, "<") && strstr(aline, ">")) {
2033 i = (signed int) (strstr(aline, ">") - strstr(aline, "<") + 1);
2035 strncpy(wrk, strstr(aline, "<"), i);
2037 if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
2038 if (!pswit[OVERVIEW_SWITCH])
2039 printf(" Line %ld column %d - HTML Tag? %s \n", linecnt, (int)(strstr(aline, "<") - aline) + 1, wrk);
2045 /* Check for &symbol; HTML */
2046 /* If there is a & in the line, followed at */
2047 /* some point by a ; then we suspect HTML */
2048 if (strstr(aline, "&") && strstr(aline, ";")) {
2049 i = (int)(strstr(aline, ";") - strstr(aline, "&") + 1);
2050 for (s = strstr(aline, "&"); s < strstr(aline, ";"); s++)
2051 if (*s == CHAR_SPACE) i = 0; /* 0.99 don't report "Jones & Son;" */
2053 strncpy(wrk, strstr(aline,"&"), i);
2055 if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
2056 if (!pswit[OVERVIEW_SWITCH])
2057 printf(" Line %ld column %d - HTML symbol? %s \n", linecnt, (int)(strstr(aline, "&") - aline) + 1, wrk);
2063 /* At end of paragraph, check for mismatched quotes. */
2064 /* We don't want to report an error immediately, since it is a */
2065 /* common convention to omit the quotes at end of paragraph if */
2066 /* the next paragraph is a continuation of the same speaker. */
2067 /* Where this is the case, the next para should begin with a */
2068 /* quote, so we store the warning message and only display it */
2069 /* at the top of the next iteration if the new para doesn't */
2070 /* start with a quote. */
2071 /* The -p switch overrides this default, and warns of unclosed */
2072 /* quotes on _every_ paragraph, whether the next begins with a */
2074 /* Version .16 - only report mismatched single quotes if */
2075 /* an open_single_quotes was found. */
2077 if (isemptyline) { /* end of para - add up the totals */
2079 sprintf(dquote_err, " Line %ld - Mismatched quotes\n", linecnt);
2080 if (pswit[SQUOTE_SWITCH] && open_single_quote && (open_single_quote != close_single_quote) )
2081 sprintf(squote_err," Line %ld - Mismatched singlequotes?\n", linecnt);
2082 if (pswit[SQUOTE_SWITCH] && open_single_quote
2083 && (open_single_quote != close_single_quote)
2084 && (open_single_quote != close_single_quote +1) )
2085 squot = 1; /* flag it to be noted regardless of the first char of the next para */
2087 sprintf(rbrack_err, " Line %ld - Mismatched round brackets?\n", linecnt);
2089 sprintf(sbrack_err, " Line %ld - Mismatched square brackets?\n", linecnt);
2091 sprintf(cbrack_err, " Line %ld - Mismatched curly brackets?\n", linecnt);
2093 sprintf(unders_err, " Line %ld - Mismatched underscores?\n", linecnt);
2094 quot = s_brack = c_brack = r_brack = c_unders =
2095 open_single_quote = close_single_quote = 0;
2096 isnewpara = 1; /* let the next iteration know that it's starting a new para */
2099 /* V.21 _ALSO_ at end of paragraph, check for omitted punctuation. */
2100 /* by working back through prevline. DW. */
2101 /* Hmmm. Need to check this only for "normal" paras. */
2102 /* So what is a "normal" para? ouch! */
2103 /* Not normal if one-liner (chapter headings, etc.) */
2104 /* Not normal if doesn't contain at least one lowercase letter */
2105 /* Not normal if starts with space */
2107 /* 0.99 tighten up on para end checks. Disallow comma and */
2108 /* semi-colon. Check for legit para end before quotes. */
2109 if (isemptyline) { /* end of para */
2110 for (s = prevline, i = 0; *s && !i; s++)
2112 i = 1; /* use i to indicate the presence of a letter on the line */
2113 /* This next "if" is a problem. */
2114 /* If I say "start_para_line <= linecnt - 1", that includes one-line */
2115 /* "paragraphs" like chapter heads. Lotsa false positives. */
2116 /* If I say "start_para_line < linecnt - 1" it doesn't, but then it */
2117 /* misses genuine one-line paragraphs. */
2118 /* So what do I do? */
2121 && start_para_line < linecnt - 1
2122 && *prevline > CHAR_SPACE
2124 for (i = strlen(prevline)-1; (prevline[i] == CHAR_DQUOTE || prevline[i] == CHAR_SQUOTE) && prevline[i] > CHAR_SPACE && i > 0; i--);
2125 for ( ; i > 0; i--) {
2126 if (gcisalpha(prevline[i])) {
2127 if (pswit[ECHO_SWITCH]) printf("\n%s\n", prevline);
2128 if (!pswit[OVERVIEW_SWITCH])
2129 printf(" Line %ld column %d - No punctuation at para end?\n", linecnt-1, strlen(prevline));
2134 if (strchr("-.:!([{?}])", prevline[i]))
2139 strcpy(prevline, aline);
2142 if (!pswit[OVERVIEW_SWITCH])
2143 for (i = 0; i < MAX_QWORD; i++)
2145 printf("\nNote: Queried word %s was duplicated %d time%s\n", qword[i], dupcnt[i], "s");
2150 /* flgets - get one line from the input stream, checking for */
2151 /* the existence of exactly one CR/LF line-end per line. */
2152 /* Returns a pointer to the line. */
2154 char *flgets(char *theline, int maxlen, FILE *thefile, long lcnt)
2157 int len, isCR, cint;
2161 c = cint = fgetc(thefile);
2165 if (c == 10) /* either way, it's end of line */
2168 else { /* Error - a LF without a preceding CR */
2169 if (pswit[LINE_END_SWITCH]) {
2170 if (pswit[ECHO_SWITCH]) printf("\n%s\n", theline);
2171 if (!pswit[OVERVIEW_SWITCH])
2172 printf(" Line %ld - No CR?\n", lcnt);
2179 if (isCR) { /* Error - two successive CRs */
2180 if (pswit[LINE_END_SWITCH]) {
2181 if (pswit[ECHO_SWITCH]) printf("\n%s\n", theline);
2182 if (!pswit[OVERVIEW_SWITCH])
2183 printf(" Line %ld - Two successive CRs?\n", lcnt);
2191 if (pswit[LINE_END_SWITCH] && isCR) {
2192 if (pswit[ECHO_SWITCH]) printf("\n%s\n", theline);
2193 if (!pswit[OVERVIEW_SWITCH])
2194 printf(" Line %ld column %d - CR without LF?\n", lcnt, len+1);
2203 c = cint = fgetc(thefile);
2204 } while(len < maxlen);
2205 if (pswit[MARKUP_SWITCH])
2206 postprocess_for_HTML(theline);
2207 if (pswit[DP_SWITCH])
2208 postprocess_for_DP(theline);
2215 /* mixdigit - takes a "word" as a parameter, and checks whether it */
2216 /* contains a mixture of alpha and digits. Generally, this is an */
2217 /* error, but may not be for cases like 4th or L5 12s. 3d. */
2218 /* Returns 0 if no error found, 1 if error. */
/* mixdigit - decide whether a "word" that mixes letters and digits   */
/* should be queried.                                                 */
/*   checkword: NUL-terminated word to examine.                       */
/*   Returns 1 if the mixture looks like an error, 0 otherwise.       */
int mixdigit(char *checkword)   /* check for digits like 1 or 0 in words */
{
    int wehaveadigit, wehavealetter, firstdigits, query, wl;
    char *s;

    wehaveadigit = wehavealetter = query = 0;
    for (s = checkword; *s; s++)
        if (gcisalpha(*s))
            wehavealetter = 1;
        else
            if (gcisdigit(*s))
                wehaveadigit = 1;

    if (wehaveadigit && wehavealetter) {  /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
        query = 1;
        wl = (int)strlen(checkword);

        /* count the run of digits at the start of the word */
        for (firstdigits = 0; gcisdigit(checkword[firstdigits]); firstdigits++)
            ;

        /* digits, ending in st, rd, nd, th of either case */
        /* 0.99 donovan points out an error below. Turns out */
        /*      I was using matchword like strcmp when the   */
        /*      return values are different! Duh.            */
        if (firstdigits + 2 == wl &&
              (matchword(checkword + wl - 2, "st")
            || matchword(checkword + wl - 2, "rd")
            || matchword(checkword + wl - 2, "nd")
            || matchword(checkword + wl - 2, "th"))
            ) query = 0;

        /* digits, ending in sts, rds, nds, ths */
        if (firstdigits + 3 == wl &&
              (matchword(checkword + wl - 3, "sts")
            || matchword(checkword + wl - 3, "rds")
            || matchword(checkword + wl - 3, "nds")
            || matchword(checkword + wl - 3, "ths"))
            ) query = 0;

        /* digits, ending in stly, rdly, ndly, thly.                   */
        /* The suffix is four characters long, so the digit run must   */
        /* end four characters before the end of the word; testing     */
        /* "+ 3" here never matched and could index before the start   */
        /* of checkword.                                               */
        if (firstdigits + 4 == wl &&
              (matchword(checkword + wl - 4, "stly")
            || matchword(checkword + wl - 4, "rdly")
            || matchword(checkword + wl - 4, "ndly")
            || matchword(checkword + wl - 4, "thly"))
            ) query = 0;

        /* digits, ending in l, L, s or d */
        if (firstdigits + 1 == wl &&
            (checkword[wl-1] == 'l'
            || checkword[wl-1] == 'L'
            || checkword[wl-1] == 's'
            || checkword[wl-1] == 'd'))
            query = 0;

        /* L at the start of a number, representing British pounds, like L500  */
        /* This is cute. We know the current word is mixeddigit. If the first  */
        /* letter is L, there must be at least one digit following. If both    */
        /* digits and letters follow, we have a genuine error, else we have a  */
        /* capital L followed by digits, and we accept that as a non-error.    */
        if (checkword[0] == 'L')
            if (!mixdigit(checkword+1))
                query = 0;
        }
    return (query);
}
2286 /* getaword - extracts the first/next "word" from the line, and puts */
2287 /* it into "thisword". A word is defined as one English word unit */
2288 /* -- or at least that's what I'm trying for. */
2289 /* Returns a pointer to the position in the line where we will start */
2290 /* looking for the next word. */
2292 char *getaword(char *fromline, char *thisword)
2298 for ( ; !gcisdigit(*fromline) && !gcisalpha(*fromline) && *fromline ; fromline++ );
2301 /* add a look-ahead to handle exceptions for numbers like 1,000 and 1.35. */
2302 /* Especially yucky is the case of L1,000 */
2303 /* I hate this, and I see other ways, but I don't see that any is _better_.*/
2304 /* This section looks for a pattern of characters including a digit */
2305 /* followed by a comma or period followed by one or more digits. */
2306 /* If found, it returns this whole pattern as a word; otherwise we discard */
2307 /* the results and resume our normal programming. */
2309 for ( ; (gcisdigit(*s) || gcisalpha(*s) || *s == ',' || *s == '.') && wordlen < MAXWORDLEN ; s++ ) {
2310 thisword[wordlen] = *s;
2313 thisword[wordlen] = 0;
2314 for (i = 1; i < wordlen -1; i++) {
2315 if (thisword[i] == '.' || thisword[i] == ',') {
2316 if (gcisdigit(thisword[i-1]) && gcisdigit(thisword[i-1])) { /* we have one of the damned things */
2323 /* we didn't find a punctuated number - do the regular getword thing */
2325 for ( ; (gcisdigit(*fromline) || gcisalpha(*fromline) || *fromline == '\'') && wordlen < MAXWORDLEN ; fromline++ ) {
2326 thisword[wordlen] = *fromline;
2329 thisword[wordlen] = 0;
2337 /* matchword - just a case-insensitive string matcher */
2338 /* yes, I know this is not efficient. I'll worry about */
2339 /* that when I have a clear idea where I'm going with it.*/
/* Returns 1 when checkfor and thisword have the same length and are  */
/* equal ignoring case, 0 otherwise (note: NOT strcmp-style).         */
int matchword(char *checkfor, char *thisword)
{
    size_t len, i;

    len = strlen(checkfor);
    if (len != strlen(thisword)) return(0);

    for (i = 0; i < len; i++)
        /* toupper() is undefined for negative values, and plain char */
        /* may be signed, so hi-bit characters must be cast first.    */
        if (toupper((unsigned char)checkfor[i]) != toupper((unsigned char)thisword[i]))
            return(0);

    return(1);
}
2358 /* lowerit - lowercase the line. Yes, strlwr does the same job, */
2359 /* but not on all platforms, and I'm a bit paranoid about what */
2360 /* some implementations of tolower might do to hi-bit characters,*/
2361 /* which shouldn't matter, but better safe than sorry. */
/* Lowercases the ASCII letters A-Z of theline in place; every other  */
/* byte, including hi-bit characters, is left untouched.              */
void lowerit(char *theline)
{
    for ( ; *theline; theline++)
        if (*theline >= 'A' && *theline <= 'Z')
            *theline = (char)(*theline + ('a' - 'A'));
}
2371 /* Is this word a Roman Numeral? */
2372 /* v 0.99 improved to be better. It still doesn't actually */
2373 /* validate that the number is a valid Roman Numeral -- for example */
2374 /* it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not*/
2375 /* what we're here to do. If it passes this, it LOOKS like a Roman */
2376 /* numeral. Anyway, the actual Romans were pretty tolerant of bad */
2377 /* arithmetic, or expressions thereof, except when it came to taxes.*/
2378 /* Allow any number of M, an optional D, an optional CM or CD, */
2379 /* any number of optional Cs, an optional XL or an optional XC, an */
2380 /* optional IX or IV, an optional V and any number of optional Is. */
2381 /* Good enough for jazz chords. */
/* Returns 1 if the lowercase word t LOOKS like a Roman numeral,      */
/* 0 otherwise (including for a NULL or empty string). The word is    */
/* consumed left to right: m*, d?, cm?, cd?, c*, xl?, xc?, l?, x*,    */
/* ix?, iv?, v?, i*; it matches only if nothing is left over.         */
/* NOTE(review): only lowercase letters are recognised here;          */
/* presumably callers lowercase the word first - confirm.             */
int isroman(char *t)
{
    if (!t || !*t) return (0);

    while (*t == 'm') t++;
    if (*t == 'd') t++;
    if (*t == 'c' && *(t+1) == 'm') t+=2;
    if (*t == 'c' && *(t+1) == 'd') t+=2;
    while (*t == 'c') t++;
    if (*t == 'x' && *(t+1) == 'l') t+=2;
    if (*t == 'x' && *(t+1) == 'c') t+=2;
    if (*t == 'l') t++;
    while (*t == 'x') t++;
    if (*t == 'i' && *(t+1) == 'x') t+=2;
    if (*t == 'i' && *(t+1) == 'v') t+=2;
    if (*t == 'v') t++;
    while (*t == 'i') t++;
    if (!*t) return (1);

    return (0);
}
2412 /* gcisalpha is a special version that is somewhat lenient on 8-bit texts. */
2413 /* If we use the standard isalpha() function, 8-bit accented characters break */
2414 /* words, so that tete with accented characters appears to be two words, "t" */
2415 /* and "t", with 8-bit characters between them. This causes over-reporting of */
2416 /* errors. gcisalpha() recognizes accented letters from the CP1252 (Windows) */
2417 /* and ISO-8859-1 character sets, which are the most common PG 8-bit types. */
/* Returns 1 if c is an ASCII letter or an accented letter of          */
/* ISO-8859-1 / CP1252, 0 otherwise.                                   */
int gcisalpha(unsigned char c)
{
    if (c >='a' && c <='z') return(1);
    if (c >='A' && c <='Z') return(1);
    if (c < 138) return(0);
    /* Latin-1 letter block; the excluded codes include the            */
    /* multiplication (215) and division (247) signs.                  */
    if (c >=192 && c != 208 && c != 215 && c != 222 && c != 240 && c != 247 && c != 254) return(1);
    /* CP1252-only letters: S/s-caron (138, 154), OE/oe ligatures      */
    /* (140, 156), Z/z-caron (142, 158) and Y-diaeresis (159).         */
    if (c == 138 || c == 140 || c == 142 || c == 154 || c == 156 || c == 158 || c == 159) return (1);
    return(0);
}
2429 /* gcisdigit is a special version that doesn't get confused in 8-bit texts. */
/* Returns 1 for the ASCII digits '0'..'9' only, 0 for anything else. */
int gcisdigit(unsigned char c)
{
    return (c >= '0' && c <= '9') ? 1 : 0;
}
2436 /* gcisletter is a special version that doesn't get confused in 8-bit texts. */
2437 /* Yeah, we're ISO-8859-1-specific. So sue me. */
/* Returns 1 for ASCII letters and for every byte value >= 192,       */
/* 0 otherwise.                                                       */
int gcisletter(unsigned char c)
{
    if ((c >= 'A' && c <='Z') || (c >= 'a' && c <='z') || c >= 192) return(1);
    return(0);
}
2447 /* gcstrchr wraps strchr to return NULL if the character being searched for is zero */
/* Like strchr(), except that searching for the NUL character yields  */
/* NULL instead of a pointer to the string's terminator.              */
char *gcstrchr(char *s, char c)
{
    if (c == 0) return(NULL);
    return(strchr(s,c));
}
2455 /* postprocess_for_DP is derived from postprocess_for_HTML */
2456 /* It is invoked with the -d switch from flgets(). */
2457 /* It simply "removes" from the line a hard-coded set of common */
2458 /* DP-specific tags, so that the line passed to the main routine has*/
2459 /* been pre-cleaned of DP markup. */
2461 void postprocess_for_DP(char *theline)
2470 for (i = 0; *DPmarkup[i]; i++) {
2471 s = strstr(theline, DPmarkup[i]);
2473 t = s + strlen(DPmarkup[i]);
2479 s = strstr(theline, DPmarkup[i]);
2486 /* postprocess_for_HTML is, at the moment (0.97), a very nasty */
2487 /* short-term fix for Charlz. Nasty, nasty, nasty. */
2488 /* It is invoked with the -m switch from flgets(). */
2489 /* It simply "removes" from the line a hard-coded set of common */
2490 /* HTML tags and "replaces" a hard-coded set of common HTML */
2491 /* entities, so that the line passed to the main routine has */
2492 /* been pre-cleaned of HTML. This is _so_ not the right way to */
2493 /* deal with HTML, but what Charlz needs now is not HTML handling */
2494 /* proper: just ignoring <i> tags and some others. */
2495 /* To be revisited in future releases! */
2497 void postprocess_for_HTML(char *theline)
2500 if (strstr(theline, "<") && strstr(theline, ">"))
2501 while (losemarkup(theline))
2503 while (loseentities(theline))
2507 char *losemarkup(char *theline)
2515 s = strstr(theline, "<");
2516 t = strstr(theline, ">");
2517 if (!s || !t) return(NULL);
2518 for (i = 0; *markup[i]; i++)
2519 if (!tagcomp(s+1, markup[i])) {
2530 /* it's an unrecognized <xxx> */
2534 char *loseentities(char *theline)
2542 for (i = 0; *entities[i].htmlent; i++) {
2543 s = strstr(theline, entities[i].htmlent);
2545 t = malloc((size_t)strlen(s));
2546 if (!t) return(NULL);
2547 strcpy(t, s + strlen(entities[i].htmlent));
2548 strcpy(s, entities[i].textent);
2555 /* V0.97 Duh. Forgot to check the htmlnum member */
2556 for (i = 0; *entities[i].htmlnum; i++) {
2557 s = strstr(theline, entities[i].htmlnum);
2559 t = malloc((size_t)strlen(s));
2560 if (!t) return(NULL);
2561 strcpy(t, s + strlen(entities[i].htmlnum));
2562 strcpy(s, entities[i].textent);
2572 int tagcomp(char *strin, char *basetag)
2578 if (*t == '/') t++; /* ignore a slash */
2580 if (tolower(*s) != tolower(*t)) return(1);
2583 /* OK, we have < followed by a valid tag start */
2584 /* should I do something about length? */
2585 /* this is messy. The length of an <i> tag is */
2586 /* limited, but a <table> could go on for miles */
2587 /* so I'd have to parse the tags . . . ugh. */
2588 /* It isn't what Charlz needs now, so mark it */
2593 void proghelp() /* explain program usage here */
2595 fputs("V. 0.991. Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
2596 fputs("Gutcheck comes wih ABSOLUTELY NO WARRANTY. For details, read the file COPYING.\n", stderr);
2597 fputs("This is Free Software; you may redistribute it under certain conditions (GPL);\n", stderr);
2598 fputs("read the file COPYING for details.\n\n", stderr);
2599 fputs("Usage is: gutcheck [-setpxloyhud] filename\n",stderr);
2600 fputs(" where -s checks single quotes, -e suppresses echoing lines, -t checks typos\n",stderr);
2601 fputs(" -x (paranoid) switches OFF -t and extra checks, -l turns OFF line-end checks\n",stderr);
2602 fputs(" -o just displays overview without detail, -h echoes header fields\n",stderr);
2603 fputs(" -v (verbose) unsuppresses duplicate reporting, -m suppresses markup\n",stderr);
2604 fputs(" -d ignores DP-specific markup,\n",stderr);
2605 fputs(" -u uses a file gutcheck.typ to query user-defined possible typos\n",stderr);
2606 fputs("Sample usage: gutcheck warpeace.txt \n",stderr);
2608 fputs("Gutcheck looks for errors in Project Gutenberg(TM) etexts.\n", stderr);
2609 fputs("Gutcheck queries anything it thinks shouldn't be in a PG text; non-ASCII\n",stderr);
2610 fputs("characters like accented letters, lines longer than 75 or shorter than 55,\n",stderr);
2611 fputs("unbalanced quotes or brackets, a variety of badly formatted punctuation, \n",stderr);
2612 fputs("HTML tags, some likely typos. It is NOT a substitute for human judgement.\n",stderr);
2618 /*********************************************************************
2621 04/22/01 Cleaned up some stuff and released .10
2625 05/09/01 Added the typo list, added two extra cases of he/be error,
2626 added -p switch, OPEN_SINGLE QUOTE char as .11
2630 05/20/01 Increased the typo list,
2631 added paranoid mode,
2632 ANSIfied the code and added some casts
2633 so the compiler wouldn't keep asking if I knew what I was doing,
2634 fixed bug in l.s.d. condition (thanks, Dave!),
2635 standardized spacing when echoing,
2636 added letter-combo checking code to typo section,
2637 added more h/b words to typo array.
2638 Not too sure about putting letter combos outside of the TYPO conditions -
2639 someone is sure to have a book about the tbaka tribe, or something. Anyway, let's see.
2644 06/01/01 Removed duplicate reporting of Tildes, asterisks, etc.
2645 06/10/01 Added flgets routine to help with platform-independent
2646 detection of invalid line-ends. All PG text files should
2647 have CR/LF (13/10) at end of line, regardless of system.
2648 Gutcheck now validates this by default. (Thanks, Charles!)
2653 06/11/01 Added parenthesis match checking. (c_brack, cbrack_err etc.)
2658 06/23/01 Fixed: 'No',he said. not being flagged.
2660 Improved: better single-quotes checking:
2662 Ignore singlequotes surrounded by alpha, like didn't. (was OK)
2664 If a singlequote is at the END of a word AND the word ends in "s":
2665 The dogs' tails wagged.
2666 it's probably an apostrophe, but less commonly may be a closequote:
2667 "These 'pack dogs' of yours look more like wolves."
2669 If it's got punctuation before it and is followed by a space
2671 . . . was a problem,' he said
2672 . . . was a problem,'"
2673 it is probably (certainly?) a closequote.
2675 If it's at start of paragraph, it's probably an openquote.
2678 Words with ' at beginning and end are probably quoted:
2679 "You have the word 'chivalry' frequently on your lips."
2680 (Not specifically implemented)
2681 V.18 I'm glad I didn't implement this, 'cos it jest ain't so
2682 where the convention is to punctuate outside the quotes.
2683 'Come', he said, 'and join the party'.
2685 If it is followed by an alpha, and especially a capital:
2687 it is either an openquote or dialect.
2689 Dialect breaks ALL the rules:
2690 A man's a man for a' that.
2691 "Aye, but 'tis all in the pas' now."
2692 "'Tis often the way," he said.
2695 This version looks to be an improvement, and produces
2696 fewer false positives, but is still not perfect. The
2697 'pack dogs' case still fools it, and dialect is still
2698 a problem. Oh, well, it's an improvement, and I have
2699 a weighted structure in place for refining guesses at
2700 closequotes. Maybe next time, I'll add a bit of logic
2701 where if there is an open quote and one that was guessed
2702 to be a possessive apostrophe after s, I'll re-guess it
2703 to be a closequote. Let's see how this one flies, first.
2705 (Afterview: it's still crap. Needs much work, and a deeper insight.)
2709 TODO: More he/be checks. Can't be perfect - counterexamples:
2710 I gave my son good advice: be married regardless of the world's opinion.
2711 I gave my son good advice: he married regardless of the world's opinion.
2713 If by "primitive" be meant "crude", we can understand the sentence.
2714 If by "primitive" he meant "crude", we can understand the sentence.
2716 No matter what be said, I must go on.
2717 No matter what he said, I must go on.
2719 No value, however great, can be set upon them.
2720 No value, however great, can he set upon them.
2722 Real-Life one from a DP International Weekly Miscellany:
2723 He wandered through the forest without fear, sleeping
2724 much, for in sleep be had companionship--the Great
2725 Spirit teaching him what he should know in dreams.
2726 That one found by jeebies, and it turned out to be "he".
2731 07/01/01 Added -O option.
2732 Improved singlequotes by reporting mismatched single quotes
2733 only if an open_single_quotes was found.
2739 08/27/01 Added -Y switch for Robert Rowe to allow his app to
2740 catch the error output.
2746 09/08/01 Added checking Capitals at start of paragraph, but not
2747 checking them at start of sentence.
2749 TODO: Parse sentences out so can check reliably for start of
2750 sentence. Need a whole different approach for that.
2751 (Can't just rely on periods, since they are also
2752 used for abbreviations, etc.)
2754 Added checking for all vowels or all consonants in a word.
2756 While I was in, I added "ii" checking and "tl" at start of word.
2758 Added echoing of first line of paragraph when reporting
2759 mismatched quotes or brackets (thanks to David Widger for the
2762 Not querying L at start of a number (used for British pounds).
2764 The spelling changes are sort of half-done but released anyway
2765 Skipped .18 because I had given out a couple of test versions
2768 09/25/01 Released as .19
2773 Use the logic from my new version of safewrap to stop querying
2774 short lines like poems and TOCs.
2775 Ignore non-standard ellipses like . . . or ...
2779 10/01/01 Made any line over 80 a VERY long line (was 85).
2780 Recognized openquotes on indented paragraphs as continuations
2782 Added "cf" to the okword list (how did I forget _that_?) and a few others.
2783 Moved abbrev to okword and made it more general.
2784 Removed requirement that PG_space_emdash be greater than
2785 ten before turning off warnings about spaced dashes.
2786 Added period to list of characters that might constitute a separator line.
2787 Now checking for double punctuation (Thanks, David!)
2788 Now if two spaced em-dashes on a line, reports both. (DW)
2789 Bug: Wasn't catching spaced punctuation at line-end since I
2790 added flgets in version .13 - fixed.
2791 Bug: Wasn't catching spaced singlequotes - fixed
2792 Now reads punctuated numbers like 1,000 as a single word.
2793 (Used to give "standalone 1" type queries)
2794 Changed paranoid mode - not including s and p options. -ex is now quite usable.
2795 Bug: was calling `"For it is perfectly impossible," Unspaced Quotes - fixed
2796 Bug: Sometimes gave _next_ line number for queried word at end of line - fixed
2798 10/22/01 Released as .20
2802 Added count of lines with spaces at end. (cnt_spacend) (Thanks, Brett!)
2803 Reduced the number of hi-bit letters needed to stop reporting them
2804 from 1/20 to 1/100 or 200 in total.
2805 Added PG footer check.
2806 Added the -h switch.
2807 Fixed platform-specific CHAR_EOL checking for isemptyline - changed to 13 and 10
2808 Not reporting ".," when there are many of them, such as a book with many references to "Vol 1., p. 23"
2809 Added unspaced brackets check when surrounded by alpha.
2810 Removed all typo reporting unless the typo switch is on.
2811 Added gcisalpha to ease over-reporting of 8-bit queries.
2812 ECHO_SWITCH is now ON by default!
2813 PARANOID_SWITCH is now ON by default!
2814 Checking for ">From" placed there by e-mail MTA (Thanks Andrew & Greg)
2815 Checking for standalone lowercase "l"
2816 Checking for standalone lowercase "s"
2817 Considering "is be" and "be is" "be was" "was be" as he/be errors
2818 Looking at punct at end of para
2820 01/20/02 Released as .21
2824 Added VERBOSE_SWITCH to make it list everything. (George Davis)
2828 02/17/02 Added cint in flgets to try fix an EOF failure on a compiler I don't have.
2830 This line caused a coredump on Solaris - fixed.
2831 Da sagte die Figur: " Das ist alles gar schoen, und man mag die Puppe
2832 03/09/02 Changed header recognition for another header change
2834 03/29/02 Added qword[][] so I can suppress massive overreporting
2835 of queried "words" like "FN", "Wm.", "th'", people's
2836 initials, chemical formulae and suchlike in some texts.
2838 04/07/02 The qword summary reports at end shouldn't show in OVERVIEW mode. Fixed.
2839 Added linecounts in overview mode.
2840 Wow! gutcheck gutcheck.exe doesn't report a binary! :-) Need to tighten up. Done.
2841 "m" is a not uncommon scanno for "in", but also appears in "a.m." - Can I get round that?
2843 Added checking for broken em-dash at line-end (enddash)
2845 08/17/02 Fixed a bug that treated some hi-bit characters as spaces. Thanks, Carlo.
2847 10/10/02 Suppressing some annoying multiple reports by default:
2848 Standalone Ones, Asterisks, Square Brackets.
2849 Digit 1 occurs often in many scientific texts.
2850 Asterisk occurs often in multi-footnoted texts.
2851 Mismatch Square Brackets occurs often in multi-para footnotes.
2852 Added -m switch for Charlz. Horrible. Nasty. Kludgy. Evil.
2853 . . . but it does more or less work for the main cases.
2854 Removed uppercase within a word as a separate category so
2855 that names like VanAllen get reported only once, like other
2857 11/24/02 Fixed - -m switch wasn't looking at htmlnum in
2858 loseentities (Thanks, Brett!)
2859 Fixed bug which occasionally gave false warning of
2860 paragraph starting with lowercase.
2861 Added underscore as character not to query around doublequotes.
2862 Split the "Non-ASCII" message into "Non-ASCII" vs. "Non-ISO-8859"
2863 . . . this is to help detect things like CP1252 characters.
2866 12/01/02 Hacked a simplified version of the "Wrongspaced quotes" out of gutspell,
2867 for doublequotes only. Replaces "Spaced quote", since it also covers that
2869 Added "warn_hyphen" to ease over-reporting of hyphens.
2871 12/20/02 Added "extra period" checks.
2872 Added single character line check
2873 Added I" check - is usually an exclam
2876 1/5/03 Eeek! Left in a lowerit(argv[0]) at the start before procfile()
2877 from when I was looking at ways to identify markup. Refuses to
2878 open files for *nix users with upcase in the filenames. Removed.
2879 Fixed quickly and released as 0.981
2881 1/8/03 Added "arid" to the list of typos, slightly against my better
2882 judgement, but the DP gang are all excited about it. :-)
2883 Added a check for comma followed by capital letter, where
2884 a period has OCRed into a comma. (DW). Not sure about this
2886 Compiling for Win32 to allow long filenames.
2888 6/1/04 A messy test release for DW to include the "gutcheck.typ"
2889 process. And the gutcheck.jee trials. Removed "arid" --
2890 it can go in gutcheck.typ
2892 Added checks for carets ^ and slants / but disabling slant
2893 queries if more than 20 of them, because some people use them
2894 for /italics/. Slants are commonly mistaken italic "I"s.
2896 Later: removed gutcheck.jee -- wrote jeebies instead.
2899 Check brackets more closely, like quotes, so that it becomes
2900 easy to find the error in long paragraphs full of brackets.
2903 11/4/04 Assorted cleanup. Fixed case where text started with an
2904 unbalanced paragraph.
2906 1/2/05 Has it really been that long? Added "nocomma", "noperiod" check.
2907 Bits and pieces: improved isroman(). Added isletter().
2908 Other stuff I never noted before this.
2910 7/3/05 Stuck in a quick start on DP-markup ignoring
2911 at BillFlis's suggestion.
2913 1/23/06 Took out nocomma etc if typos are off. Why did I ever leave that in?
2914 Don't count footer for dotcomma etc.
2981 *********************************************************************/