1 /*************************************************************************/
2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com> */
5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk> */
7 /* This program is free software; you can redistribute it and/or modify */
8 /* it under the terms of the GNU General Public License as published by */
9 /* the Free Software Foundation; either version 2 of the License, or */
10 /* (at your option) any later version. */
12 /* This program is distributed in the hope that it will be useful, */
13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
15 /* GNU General Public License for more details. */
17 /* You should have received a copy of the GNU General Public License */
18 /* along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 /*************************************************************************/
32 "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
33 "nad", "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa",
34 "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
35 "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
36 "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
37 "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
38 "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
39 "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
40 "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
41 "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
42 "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
43 "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
44 "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
45 "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
46 "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
47 "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
48 "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
49 "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
50 "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
51 "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
52 "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
53 "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
54 "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
55 "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
56 "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
57 "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
58 "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
59 "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
60 "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
66 /* Common abbreviations and other OK words not to query as typos. */
68 "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
69 "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
70 "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
71 "outbid", "outbids", "frostbite", "frostbitten", ""
74 /* Common abbreviations that cause otherwise unexplained periods. */
76 "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
77 "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
81 * Two-Letter combinations that rarely if ever start words,
82 * but are common scannos or otherwise common letter combinations.
85 "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
89 * Two-Letter combinations that rarely if ever end words,
90 * but are common scannos or otherwise common letter combinations.
93 "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
94 "sw", "gr", "sl", "cl", "iy", ""
98 "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
99 "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
100 "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
101 "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
105 "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
109 "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
110 "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
111 "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
112 "during", "let", "toward", "among", ""
116 "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
117 "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
118 "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
119 "among", "those", "into", "whom", "having", "thence", ""
/*
 * Lower-case vowels, including accented forms and the ae ligature.
 * The accented letters are non-ASCII, so the bytes stored in this array
 * depend on the encoding this source file is saved and compiled in.
 */
122 char vowels[] = "aeiouàáâãäæèéêëìíîïòóôõöùúûü";
129 "&", "&", "&",
130 "<", "<", "<",
131 ">", ">", ">",
132 "°", "°", " degrees",
133 "£", "£", "L",
134 """, """, "\"", /* quotation mark = APL quote */
135 "Œ", "Œ", "OE", /* latin capital ligature OE */
136 "œ", "œ", "oe", /* latin small ligature oe */
137 "Š", "Š", "S", /* latin capital letter S with caron */
138 "š", "š", "s", /* latin small letter s with caron */
139 "Ÿ", "Ÿ", "Y", /* latin capital letter Y with diaeresis */
140 "ˆ", "ˆ", "", /* modifier letter circumflex accent */
141 "˜", "˜", "~", /* small tilde, U+02DC ISOdia */
142 " ", " ", " ", /* en space, U+2002 ISOpub */
143 " ", " ", " ", /* em space, U+2003 ISOpub */
144 " ", " ", " ", /* thin space, U+2009 ISOpub */
145 "–", "–", "-", /* en dash, U+2013 ISOpub */
146 "—", "—", "--", /* em dash, U+2014 ISOpub */
147 "’", "’", "'", /* right single quotation mark */
148 "‚", "‚", "'", /* single low-9 quotation mark */
149 "“", "“", "\"", /* left double quotation mark */
150 "”", "”", "\"", /* right double quotation mark */
151 "„", "„", "\"", /* double low-9 quotation mark */
152 "‹", "‹", "\"", /* single left-pointing angle quotation mark */
153 "›", "›", "\"", /* single right-pointing angle quotation mark */
154 " ", " ", " ", /* no-break space = non-breaking space, */
155 "¡", "¡", "!", /* inverted exclamation mark */
156 "¢", "¢", "c", /* cent sign */
157 "£", "£", "L", /* pound sign */
158 "¤", "¤", "$", /* currency sign */
159 "¥", "¥", "Y", /* yen sign = yuan sign */
160 "§", "§", "--", /* section sign */
161 "¨", "¨", " ", /* diaeresis = spacing diaeresis */
162 "©", "©", "(C) ", /* copyright sign */
163 "ª", "ª", " ", /* feminine ordinal indicator */
164 "«", "«", "\"", /* left-pointing double angle quotation mark */
165 "­", "­", "-", /* soft hyphen = discretionary hyphen */
166 "®", "®", "(R) ", /* registered sign = registered trade mark sign */
167 "¯", "¯", " ", /* macron = spacing macron = overline */
168 "°", "°", " degrees", /* degree sign */
169 "±", "±", "+-", /* plus-minus sign = plus-or-minus sign */
170 "²", "²", "2", /* superscript two = superscript digit two */
171 "³", "³", "3", /* superscript three = superscript digit three */
172 "´", "´", " ", /* acute accent = spacing acute */
173 "µ", "µ", "m", /* micro sign */
174 "¶", "¶", "--", /* pilcrow sign = paragraph sign */
175 "¸", "¸", " ", /* cedilla = spacing cedilla */
176 "¹", "¹", "1", /* superscript one = superscript digit one */
177 "º", "º", " ", /* masculine ordinal indicator */
178 "»", "»", "\"", /* right-pointing double angle quotation mark */
179 "¼", "¼", "1/4", /* vulgar fraction one quarter */
180 "½", "½", "1/2", /* vulgar fraction one half */
181 "¾", "¾", "3/4", /* vulgar fraction three quarters */
182 "¿", "¿", "?", /* inverted question mark */
183 "À", "À", "A", /* latin capital letter A with grave */
184 "Á", "Á", "A", /* latin capital letter A with acute */
185 "Â", "Â", "A", /* latin capital letter A with circumflex */
186 "Ã", "Ã", "A", /* latin capital letter A with tilde */
187 "Ä", "Ä", "A", /* latin capital letter A with diaeresis */
188 "Å", "Å", "A", /* latin capital letter A with ring above */
189 "Æ", "Æ", "AE", /* latin capital letter AE */
190 "Ç", "Ç", "C", /* latin capital letter C with cedilla */
191 "È", "È", "E", /* latin capital letter E with grave */
192 "É", "É", "E", /* latin capital letter E with acute */
193 "Ê", "Ê", "E", /* latin capital letter E with circumflex */
194 "Ë", "Ë", "E", /* latin capital letter E with diaeresis */
195 "Ì", "Ì", "I", /* latin capital letter I with grave */
196 "Í", "Í", "I", /* latin capital letter I with acute */
197 "Î", "Î", "I", /* latin capital letter I with circumflex */
198 "Ï", "Ï", "I", /* latin capital letter I with diaeresis */
199 "Ð", "Ð", "E", /* latin capital letter ETH */
200 "Ñ", "Ñ", "N", /* latin capital letter N with tilde */
201 "Ò", "Ò", "O", /* latin capital letter O with grave */
202 "Ó", "Ó", "O", /* latin capital letter O with acute */
203 "Ô", "Ô", "O", /* latin capital letter O with circumflex */
204 "Õ", "Õ", "O", /* latin capital letter O with tilde */
205 "Ö", "Ö", "O", /* latin capital letter O with diaeresis */
206 "×", "×", "*", /* multiplication sign */
207 "Ø", "Ø", "O", /* latin capital letter O with stroke */
208 "Ù", "Ù", "U", /* latin capital letter U with grave */
209 "Ú", "Ú", "U", /* latin capital letter U with acute */
210 "Û", "Û", "U", /* latin capital letter U with circumflex */
211 "Ü", "Ü", "U", /* latin capital letter U with diaeresis */
212 "Ý", "Ý", "Y", /* latin capital letter Y with acute */
213 "Þ", "Þ", "TH", /* latin capital letter THORN */
214 "ß", "ß", "sz", /* latin small letter sharp s = ess-zed */
215 "à", "à", "a", /* latin small letter a with grave */
216 "á", "á", "a", /* latin small letter a with acute */
217 "â", "â", "a", /* latin small letter a with circumflex */
218 "ã", "ã", "a", /* latin small letter a with tilde */
219 "ä", "ä", "a", /* latin small letter a with diaeresis */
220 "å", "å", "a", /* latin small letter a with ring above */
221 "æ", "æ", "ae", /* latin small letter ae */
222 "ç", "ç", "c", /* latin small letter c with cedilla */
223 "è", "è", "e", /* latin small letter e with grave */
224 "é", "é", "e", /* latin small letter e with acute */
225 "ê", "ê", "e", /* latin small letter e with circumflex */
226 "ë", "ë", "e", /* latin small letter e with diaeresis */
227 "ì", "ì", "i", /* latin small letter i with grave */
228 "í", "í", "i", /* latin small letter i with acute */
229 "î", "î", "i", /* latin small letter i with circumflex */
230 "ï", "ï", "i", /* latin small letter i with diaeresis */
231 "ð", "ð", "eth", /* latin small letter eth */
232 "ñ", "ñ", "n", /* latin small letter n with tilde */
233 "ò", "ò", "o", /* latin small letter o with grave */
234 "ó", "ó", "o", /* latin small letter o with acute */
235 "ô", "ô", "o", /* latin small letter o with circumflex */
236 "õ", "õ", "o", /* latin small letter o with tilde */
237 "ö", "ö", "o", /* latin small letter o with diaeresis */
238 "÷", "÷", "/", /* division sign */
239 "ø", "ø", "o", /* latin small letter o with stroke */
240 "ù", "ù", "u", /* latin small letter u with grave */
241 "ú", "ú", "u", /* latin small letter u with acute */
242 "û", "û", "u", /* latin small letter u with circumflex */
243 "ü", "ü", "u", /* latin small letter u with diaeresis */
244 "ý", "ý", "y", /* latin small letter y with acute */
245 "þ", "þ", "th", /* latin small letter thorn */
246 "ÿ", "ÿ", "y", /* latin small letter y with diaeresis */
250 /* special characters */
251 #define CHAR_SPACE 32
255 #define CHAR_DQUOTE 34
256 #define CHAR_SQUOTE 39
257 #define CHAR_OPEN_SQUOTE 96
258 #define CHAR_TILDE 126
259 #define CHAR_ASTERISK 42
260 #define CHAR_FORESLASH 47
261 #define CHAR_CARAT 94
263 #define CHAR_UNDERSCORE '_'
264 #define CHAR_OPEN_CBRACK '{'
265 #define CHAR_CLOSE_CBRACK '}'
266 #define CHAR_OPEN_RBRACK '('
267 #define CHAR_CLOSE_RBRACK ')'
268 #define CHAR_OPEN_SBRACK '['
269 #define CHAR_CLOSE_SBRACK ']'
271 /* longest and shortest normal PG line lengths */
272 #define LONGEST_PG_LINE 75
273 #define WAY_TOO_LONG 80
274 #define SHORTEST_PG_LINE 55
294 gboolean pswit[SWITNO]; /* program switches */
296 static GOptionEntry options[]={
297 { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
298 "Ignore DP-specific markup", NULL },
299 { "noecho", 'e', 0, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
300 "Don't echo queried line", NULL },
301 { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
302 "Check single quotes", NULL },
303 { "typo", 't', 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
304 "Check common typos", NULL },
305 { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
306 "Require closure of quotes on every paragraph", NULL },
307 { "relaxed", 'x', 0, G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
308 "Disable paranoid querying of everything", NULL },
309 { "line-end", 'l', 0, G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
310 "Disable line end checking", NULL },
311 { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
312 "Overview: just show counts", NULL },
313 { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
314 "Output errors to stdout instead of stderr", NULL },
315 { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
316 "Echo header fields", NULL },
317 { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
318 "Ignore markup in < >", NULL },
319 { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
320 "Use file of user-defined typos", NULL },
321 { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
322 "Defaults for use on www upload", NULL },
323 { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
324 "Verbose - list everything", NULL },
328 long cnt_dquot; /* for overview mode, count of doublequote queries */
329 long cnt_squot; /* for overview mode, count of singlequote queries */
330 long cnt_brack; /* for overview mode, count of brackets queries */
331 long cnt_bin; /* for overview mode, count of non-ASCII queries */
332 long cnt_odd; /* for overview mode, count of odd character queries */
333 long cnt_long; /* for overview mode, count of long line errors */
334 long cnt_short; /* for overview mode, count of short line queries */
335 long cnt_punct; /* for overview mode,
336 count of punctuation and spacing queries */
337 long cnt_dash; /* for overview mode, count of dash-related queries */
338 long cnt_word; /* for overview mode, count of word queries */
339 long cnt_html; /* for overview mode, count of html queries */
340 long cnt_lineend; /* for overview mode, count of line-end queries */
341 long cnt_spacend; /* count of lines with space at end */
342 long linecnt; /* count of total lines in the file */
343 long checked_linecnt; /* count of lines actually checked */
345 void proghelp(GOptionContext *context);
346 void procfile(const char *);
350 int mixdigit(const char *);
351 gchar *getaword(const char **);
352 char *flgets(char **,long);
353 gboolean gcisalpha(unsigned char);
354 gboolean gcisdigit(unsigned char);
355 gboolean gcisletter(unsigned char);
356 void postprocess_for_HTML(char *);
357 char *linehasmarkup(char *);
358 char *losemarkup(char *);
359 int tagcomp(const char *,const char *);
360 char *loseentities(char *);
361 gboolean isroman(const char *);
362 void postprocess_for_DP(char *);
364 GTree *qword,*qperiod;
366 struct first_pass_results {
367 long firstline,astline;
368 long footerline,totlen,binlen,alphalen,endquote_count,shortline,dotcomma;
369 long fslashline,hyphens,longline,verylongline,htmcount,standalone_digit;
370 long spacedash,emdash,space_emdash,non_PG_space_emdash,PG_space_emdash;
371 int Dutchcount,Frenchcount;
375 int shortline,longline,bin,dash,dotcomma,ast,fslash,digit,hyphen;
377 gboolean isDutch,isFrench;
382 int c_unders,c_brack,s_brack,r_brack;
383 int open_single_quote,close_single_quote;
386 struct line_properties {
387 unsigned int len,blen;
396 char *dquote,*squote,*rbrack,*sbrack,*cbrack,*unders;
/*
 * parse_options:
 *
 * Parse the command line with GOption, storing each flag in pswit[]
 * through the options[] table, then normalise the switches: some are
 * stored inverted relative to what the user typed, and the overview
 * and web switches force other switches to fixed values.
 *
 * argc, argv: addresses of main()'s arguments, passed straight on to
 * g_option_context_parse().
 */
400 void parse_options(int *argc,char ***argv)
403 GOptionContext *context;
404 context=g_option_context_new(
405 "file - looks for errors in Project Gutenberg(TM) etexts");
406 g_option_context_add_main_entries(context,options,NULL);
407 if (!g_option_context_parse(context,argc,argv,&err))
/* parse failure: show GLib's message and point the user at --help */
409 g_printerr("Bookloupe: %s\n",err->message);
410 g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
413 /* Paranoid checking is turned OFF, not on, by its switch */
414 pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
415 if (pswit[PARANOID_SWITCH])
416 /* if running in paranoid mode, typo checks default to enabled */
/* (so giving the typo option in paranoid mode switches them off) */
417 pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
418 /* Line-end checking is turned OFF, not on, by its switch */
419 pswit[LINE_END_SWITCH]=!pswit[LINE_END_SWITCH];
420 /* Echoing is turned OFF, not on, by its switch */
421 pswit[ECHO_SWITCH]=!pswit[ECHO_SWITCH];
422 if (pswit[OVERVIEW_SWITCH])
423 /* just print summary; don't echo */
424 pswit[ECHO_SWITCH]=FALSE;
426 * Web uploads - for the moment, this is really just a placeholder
427 * until we decide what processing we really want to do on web uploads
429 if (pswit[WEB_SWITCH])
431 /* specific override for web uploads */
/* every switch below is forced, whatever else was on the command line */
432 pswit[ECHO_SWITCH]=TRUE;
433 pswit[SQUOTE_SWITCH]=FALSE;
434 pswit[TYPO_SWITCH]=TRUE;
435 pswit[QPARA_SWITCH]=FALSE;
436 pswit[PARANOID_SWITCH]=TRUE;
437 pswit[LINE_END_SWITCH]=FALSE;
438 pswit[OVERVIEW_SWITCH]=FALSE;
439 pswit[STDOUT_SWITCH]=FALSE;
440 pswit[HEADER_SWITCH]=TRUE;
441 pswit[VERBOSE_SWITCH]=FALSE;
442 pswit[MARKUP_SWITCH]=FALSE;
443 pswit[USERTYPO_SWITCH]=FALSE;
444 pswit[DP_SWITCH]=FALSE;
451 g_option_context_free(context);
457 * Read in the user-defined stealth scanno list.
/*
 * read_user_scannos:
 *
 * Load the user typo list into the usertypo tree. Four file names are
 * tried in order, each one only when the previous attempt failed with
 * G_FILE_ERROR_NOENT: "bookloupe.typ" as a bare relative name,
 * "bookloupe.typ" inside running_from, then "gutcheck.typ" in the same
 * two places.
 *
 * Each line of the file whose first byte is greater than '!' becomes a
 * key in usertypo; the tree is created with g_free as its key destroy
 * function, so it takes ownership of the inserted strings.
 */
459 void read_user_scannos(void)
462 gchar *usertypo_file;
466 gchar *contents,**lines;
467 usertypo_file=g_strdup("bookloupe.typ");
468 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
469 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
472 g_free(usertypo_file);
473 usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
474 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
476 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
/* fall back to the older gutcheck file name */
479 g_free(usertypo_file);
480 usertypo_file=g_strdup("gutcheck.typ");
481 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
483 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
486 g_free(usertypo_file);
487 usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
488 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
490 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
/* none of the four candidates exists */
492 g_free(usertypo_file);
493 printf(" --> I couldn't find bookloupe.typ "
494 "-- proceeding without user typos.\n");
/* any other error is reported together with the file name that caused it */
499 fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
500 g_free(usertypo_file);
504 lines=g_strsplit(contents,"\n",0);
/*
 * NOTE(review): strcmp takes two arguments but is called through the
 * three-argument GCompareDataFunc type; a small wrapper that ignores
 * user_data would avoid relying on that cast.
 */
505 usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
506 for (i=0;lines[i];i++)
/* skip lines starting with a space, a control byte or '!', and empty lines */
507 if (*(unsigned char *)lines[i]>'!')
508 g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
518 * Read an etext returning an array of lines. Lines are normally expected
519 * to be terminated by CR LF. Solitary LFs delimit lines but are left
520 * embedded at the end of the line for further processing. Solitary CRs
521 * do not delimit lines.
/*
 * read_etext:
 *
 * filename: file to load with g_file_get_contents().
 * err: GError location handed to g_file_get_contents().
 *
 * Returns: a NULL-terminated array of line strings. The contents are
 * split on CR LF; a piece that still contains a bare LF is cut again
 * after each LF, with the LF kept at the end of the resulting line.
 */
523 gchar **read_etext(const char *filename,GError **err)
531 if (!g_file_get_contents(filename,&contents,&len,err))
533 raw_lines=g_strsplit(contents,"\r\n",0);
/* room for every CR LF delimited piece plus the terminating NULL */
534 lines=g_ptr_array_sized_new(g_strv_length(raw_lines)+1);
535 for (i=0;raw_lines[i];i++)
537 t=strchr(raw_lines[i],'\n');
541 while ((t=strchr(s,'\n')))
/* t-s+1 bytes: the copy includes the LF itself */
543 g_ptr_array_add(lines,g_strndup(s,t-s+1));
/* remainder after the last bare LF, then drop the unsplit original */
546 g_ptr_array_add(lines,g_strdup(s));
547 g_free(raw_lines[i]);
/* no bare LF: the string from g_strsplit() is reused as the line */
550 g_ptr_array_add(lines,raw_lines[i]);
553 g_ptr_array_add(lines,NULL);
/* FALSE keeps the element array alive and hands it to the caller */
554 return (gchar **)g_ptr_array_free(lines,FALSE);
560 * Read an etext returning a newly allocated string containing the file
561 * contents or NULL on error.
563 gchar *read_etext(const char *filename,GError **err)
567 if (!g_file_get_contents(filename,&contents,&len,err))
/*
 * main:
 *
 * Records the directory part of argv[0] in running_from, parses the
 * command line into pswit[], prints a banner on stderr and, when the
 * overview switch is set, prints the per-category query counts and
 * their total. running_from and the usertypo tree are released at the
 * end.
 */
573 int main(int argc,char **argv)
575 running_from=g_path_get_dirname(argv[0]);
576 parse_options(&argc,&argv);
577 if (pswit[USERTYPO_SWITCH])
579 fprintf(stderr,"bookloupe: Check and report on an e-text\n");
581 if (pswit[OVERVIEW_SWITCH])
/* head+foot is the number of lines that were read but not checked */
583 printf(" Checked %ld lines of %ld (head+foot = %ld)\n\n",
584 checked_linecnt,linecnt,linecnt-checked_linecnt);
585 printf(" --------------- Queries found --------------\n");
587 printf(" Long lines: %14ld\n",cnt_long);
589 printf(" Short lines: %14ld\n",cnt_short);
591 printf(" Line-end problems: %14ld\n",cnt_lineend);
593 printf(" Common typos: %14ld\n",cnt_word);
595 printf(" Unmatched quotes: %14ld\n",cnt_dquot);
597 printf(" Unmatched SingleQuotes: %14ld\n",cnt_squot);
599 printf(" Unmatched brackets: %14ld\n",cnt_brack);
601 printf(" Non-ASCII characters: %14ld\n",cnt_bin);
603 printf(" Proofing characters: %14ld\n",cnt_odd);
605 printf(" Punctuation & spacing queries: %14ld\n",cnt_punct);
607 printf(" Non-standard dashes: %14ld\n",cnt_dash);
609 printf(" Possible HTML tags: %14ld\n",cnt_html);
/* the total adds the twelve counters printed above; cnt_spacend is not part of it */
611 printf(" TOTAL QUERIES %14ld\n",
612 cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+
613 cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend);
615 g_free(running_from);
617 g_tree_unref(usertypo);
624 * Run a first pass - verify that it's a valid PG
625 * file, decide whether to report some things that
626 * occur many times in the text like long or short
627 * lines, non-standard dashes, etc.
/*
 * first_pass:
 *
 * etext: the whole text as one NUL-terminated string; it is split on
 * '\n' here and every resulting line is examined once.
 *
 * The counters of struct first_pass_results (header and footer line
 * numbers, total length, unpunctuated endquotes, slashes, long lines,
 * markup hints, spaced and unspaced em-dashes, Dutch/French marker
 * words, standalone digits, ...) are accumulated in a function-local
 * static structure, so they start at zero and persist between calls.
 */
629 struct first_pass_results *first_pass(const char *etext)
631 char laststart=CHAR_SPACE;
636 unsigned int lastlen=0,lastblen=0;
637 long spline=0,nspline=0;
638 static struct first_pass_results results={0};
640 lines=g_strsplit(etext,"\n",0);
641 for (j=0;lines[j];j++)
643 llen=strlen(lines[j]);
/*
 * Strip trailing CRs. The length is decremented before the store so
 * that the CR itself is overwritten, and the llen>0 test keeps an
 * empty line from being indexed at -1.
 */
644 while(llen>0 && lines[j][llen-1]=='\r')
645 lines[j][--llen]='\0';
/* old-style header: the "*END ... SMALL PRINT" line */
647 if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
648 (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
651 printf(" --> Duplicate header?\n");
652 spline=linecnt+1; /* first line of non-header text, that is */
/* new-style header: the "*** START ... PROJECT GUTENBERG" line */
654 if (!strncmp(lines[j],"*** START",9) &&
655 strstr(lines[j],"PROJECT GUTENBERG"))
658 printf(" --> Duplicate header?\n");
659 nspline=linecnt+1; /* first line of non-header text, that is */
/* a footer is only looked for once some header has been seen */
661 if (spline || nspline)
663 lc_line=g_ascii_strdown(lines[j],llen);
664 if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
/* "end" has to come before "project gutenberg" on the line */
666 if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
668 if (results.footerline)
670 /* it's an old-form header - we can detect duplicates */
672 printf(" --> Duplicate footer?\n");
675 results.footerline=linecnt;
681 results.firstline=spline;
683 results.firstline=nspline; /* override with new */
684 if (results.footerline)
685 continue; /* don't count the boilerplate in the footer */
686 results.totlen+=llen;
689 if ((unsigned char)lines[j][i]>127)
691 if (gcisalpha(lines[j][i]))
/* <ctype.h> functions need an unsigned char value, hence the cast */
693 if (i>0 && lines[j][i]==CHAR_DQUOTE && isalpha((unsigned char)lines[j][i-1]))
694 results.endquote_count++;
696 if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
697 lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
699 if (llen>0 && (unsigned char)lines[j][llen-1]<=CHAR_SPACE)
701 if (strstr(lines[j],".,"))
703 /* only count ast lines for ignoring purposes where there is */
704 /* locase text on the line */
705 if (strchr(lines[j],'*'))
707 for (s=lines[j];*s;s++)
708 if (*s>='a' && *s<='z')
713 if (strchr(lines[j],'/'))
714 results.fslashline++;
/* step back over trailing white space to look at the last real character */
715 for (i=llen-1;i>0 && (unsigned char)lines[j][i]<=CHAR_SPACE;i--)
717 if (i>1 && lines[j][i]=='-' && lines[j][i-1]!='-')
719 if (llen>LONGEST_PG_LINE)
721 if (llen>WAY_TOO_LONG)
722 results.verylongline++;
723 if (strchr(lines[j],'<') && strchr(lines[j],'>'))
/* span from the first '<' to the first '>' on the line, inclusive */
725 i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
728 if (strstr(lines[j],"<i>"))
729 results.htmcount+=4; /* bonus marks! */
731 /* Check for spaced em-dashes */
/* searching from lines[j]+1 guarantees that s[-1] is inside the line */
732 if (lines[j][0] && (s=strstr(lines[j]+1,"--")))
735 if (s[-1]==CHAR_SPACE || (s[2]==CHAR_SPACE))
736 results.space_emdash++;
737 if (s[-1]==CHAR_SPACE && (s[2]==CHAR_SPACE))
738 /* count of em-dashes with spaces both sides */
739 results.non_PG_space_emdash++;
740 if (s[-1]!=CHAR_SPACE && (s[2]!=CHAR_SPACE))
741 /* count of PG-type em-dashes with no spaces */
742 results.PG_space_emdash++;
/* marker words used later to guess the language of the text */
747 if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))
748 results.Dutchcount++;
749 if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))
750 results.Frenchcount++;
751 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
752 results.standalone_digit++;
755 /* Check for spaced dashes */
756 if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
760 laststart=lines[j][0];
769 * Make some snap decisions based on the first pass results.
/*
 * report_first_pass:
 *
 * results: the counts gathered by the first pass.
 *
 * Compares the counts with fixed thresholds, prints a " --> " notice
 * for each kind of query that is too frequent to be worth reporting,
 * and records the decisions (for example shortline, isDutch, isFrench)
 * in a function-local static struct warnings. It may also switch
 * pswit[MARKUP_SWITCH] on and reset results->firstline.
 */
771 struct warnings *report_first_pass(struct first_pass_results *results)
773 static struct warnings warnings={0};
775 printf(" --> %ld lines in this file have white space at end\n",
778 if (results->dotcomma>5)
781 printf(" --> %ld lines in this file contain '.,'. "
782 "Not reporting them.\n",results->dotcomma);
785 * If more than 50 lines, or one-tenth, are short,
786 * don't bother reporting them.
788 warnings.shortline=1;
789 if (results->shortline>50 || results->shortline*10>linecnt)
791 warnings.shortline=0;
792 printf(" --> %ld lines in this file are short. "
793 "Not reporting short lines.\n",results->shortline);
796 * If more than 50 lines, or one-tenth, are long,
797 * don't bother reporting them.
800 if (results->longline>50 || results->longline*10>linecnt)
803 printf(" --> %ld lines in this file are long. "
804 "Not reporting long lines.\n",results->longline);
806 /* If more than 10 lines contain asterisks, don't bother reporting them. */
808 if (results->astline>10)
811 printf(" --> %ld lines in this file contain asterisks. "
812 "Not reporting them.\n",results->astline);
815 * If more than 10 lines contain forward slashes,
816 * don't bother reporting them.
819 if (results->fslashline>10)
822 printf(" --> %ld lines in this file contain forward slashes. "
823 "Not reporting them.\n",results->fslashline);
826 * If more than 20 lines contain unpunctuated endquotes,
827 * don't bother reporting them.
830 if (results->endquote_count>20)
833 printf(" --> %ld lines in this file contain unpunctuated endquotes. "
834 "Not reporting them.\n",results->endquote_count);
837 * If more than 15 lines contain standalone digits,
838 * don't bother reporting them.
/* NOTE(review): the comment says 15, but the threshold tested here is 10. */
841 if (results->standalone_digit>10)
844 printf(" --> %ld lines in this file contain standalone 0s and 1s. "
845 "Not reporting them.\n",results->standalone_digit);
848 * If more than 20 lines contain hyphens at end,
849 * don't bother reporting them.
852 if (results->hyphens>20)
855 printf(" --> %ld lines in this file have hyphens at end. "
856 "Not reporting them.\n",results->hyphens);
/* enough markup-like hits: switch markup handling on unless already set */
858 if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
860 printf(" --> Looks like this is HTML. Switching HTML mode ON.\n");
861 pswit[MARKUP_SWITCH]=1;
863 if (results->verylongline>0)
864 printf(" --> %ld lines in this file are VERY long!\n",
865 results->verylongline);
867 * If there are more non-PG spaced dashes than PG em-dashes,
868 * assume it's deliberate.
869 * Current PG guidelines say don't use them, but older texts do,
870 * and some people insist on them whatever the guidelines say.
873 if (results->spacedash+results->non_PG_space_emdash>
874 results->PG_space_emdash)
877 printf(" --> There are %ld spaced dashes and em-dashes. "
878 "Not reporting them.\n",
879 results->spacedash+results->non_PG_space_emdash);
881 /* If more than a quarter of characters are hi-bit, bug out. */
883 if (results->binlen*4>results->totlen)
885 printf(" --> This file does not appear to be ASCII. "
886 "Terminating. Best of luck with it!\n");
/* fewer than a quarter of the bytes are letters */
889 if (results->alphalen*4<results->totlen)
891 printf(" --> This file does not appear to be text. "
892 "Terminating. Best of luck with it!\n");
/* more than 1% of the bytes, or more than 100 bytes, are hi-bit */
895 if (results->binlen*100>results->totlen || results->binlen>100)
897 printf(" --> There are a lot of foreign letters here. "
898 "Not reporting them.\n");
901 warnings.isDutch=FALSE;
902 if (results->Dutchcount>50)
904 warnings.isDutch=TRUE;
905 printf(" --> This looks like Dutch - "
906 "switching off dashes and warnings for 's Middags case.\n");
908 warnings.isFrench=FALSE;
909 if (results->Frenchcount>50)
911 warnings.isFrench=TRUE;
912 printf(" --> This looks like French - "
913 "switching off some doublepunct.\n");
915 if (results->firstline && results->footerline)
916 printf(" The PG header and footer appear to be already on.\n");
919 if (results->firstline)
920 printf(" The PG header is on - no footer.\n");
921 if (results->footerline)
922 printf(" The PG footer is on - no header.\n");
925 if (pswit[VERBOSE_SWITCH])
/* verbose mode turns short-line reporting back on whatever was decided above */
928 warnings.shortline=1;
937 printf(" *** Verbose output is ON -- you asked for it! ***\n");
939 if (warnings.isDutch)
/* header and footer found in order but fewer than 100 lines apart */
941 if (results->footerline>0 && results->firstline>0 &&
942 results->footerline>results->firstline &&
943 results->footerline-results->firstline<100)
945 printf(" --> I don't really know where this text starts. \n");
946 printf(" There are no reference points.\n");
947 printf(" I'm going to have to report the header and footer "
949 results->firstline=0;
957 * Look along the line, accumulate the count of quotes, and see
958 * if this is an empty line - i.e. a line with nothing on it
960 * If line has just spaces, period, * and/or - on it, don't
961 * count it, since empty lines with asterisks or dashes to
962 * separate sections are common.
964 * Returns: TRUE if the line is empty.
/*
 * analyse_quotes:
 *
 * aline: the NUL-terminated line to examine.
 * counters: running totals; the single-quote, underscore and
 * curly/round/square bracket counts are updated here.
 *
 * Each apostrophe or backquote is classified from its neighbours as an
 * opening quote, a closing quote or an in-word apostrophe; "'tis" and
 * "'Tis" are hardcoded as not opening a quotation.
 *
 * Returns: TRUE if the line counts as empty, i.e. isemptyline was
 * never cleared by one of its characters.
 */
966 gboolean analyse_quotes(const char *aline,struct counters *counters)
969 /* assume the line is empty until proven otherwise */
970 gboolean isemptyline=TRUE;
976 if (*s==CHAR_SQUOTE || *s==CHAR_OPEN_SQUOTE)
981 * At start of line, it can only be an openquote.
982 * Hardcode a very common exception!
/*
 * s points at the quote itself, so the word starts at s+1, the same
 * offset the in-line test further down uses. Comparing at s+2 never
 * matched "'tis" and could look past the terminating NUL when the
 * quote was the only character on the line.
 */
984 if (strncmp(s+1,"tis",3) && strncmp(s+1,"Tis",3))
985 counters->open_single_quote++;
987 else if (gcisalpha(s[-1]) && gcisalpha(s[1]))
988 /* Do nothing! it's definitely an apostrophe, not a quote */
990 /* it's outside a word - let's check it out */
991 else if (*s==CHAR_OPEN_SQUOTE || gcisalpha(s[1]))
993 /* it damwell better BE an openquote */
994 if (strncmp(s+1,"tis",3) && strncmp(s+1,"Tis",3))
995 /* hardcode a very common exception! */
996 counters->open_single_quote++;
1000 /* now - is it a closequote? */
1001 guessquote=0; /* accumulate clues */
1002 if (gcisalpha(s[-1]))
1004 /* it follows a letter - could be either */
1008 /* looks like a plural apostrophe */
1010 if (s[1]==CHAR_SPACE) /* bonus marks! */
1014 /* it doesn't have a letter either side */
1015 else if (strchr(".?!,;:",s[-1]) && strchr(".?!,;: ",s[1]))
1016 guessquote+=8; /* looks like a closequote */
1019 if (counters->open_single_quote>counters->close_single_quote)
1021 * Give it the benefit of some doubt,
1022 * if a squote is already open.
1028 counters->close_single_quote++;
1031 if (*s!=CHAR_SPACE && *s!='-' && *s!='.' && *s!=CHAR_ASTERISK &&
1033 isemptyline=FALSE; /* ignore lines like * * * as spacers */
/* underscores are counted; each bracket count goes up on open, down on close */
1034 if (*s==CHAR_UNDERSCORE)
1035 counters->c_unders++;
1036 if (*s==CHAR_OPEN_CBRACK)
1037 counters->c_brack++;
1038 if (*s==CHAR_CLOSE_CBRACK)
1039 counters->c_brack--;
1040 if (*s==CHAR_OPEN_RBRACK)
1041 counters->r_brack++;
1042 if (*s==CHAR_CLOSE_RBRACK)
1043 counters->r_brack--;
1044 if (*s==CHAR_OPEN_SBRACK)
1045 counters->s_brack++;
1046 if (*s==CHAR_CLOSE_SBRACK)
1047 counters->s_brack--;
1054 * check_for_control_characters:
1056 * Check for control characters in the line: any byte below
1057 * CHAR_SPACE other than LF, CR and TAB is reported.
1058 * Bytes above 127 and tabs are not examined here;
1059 * check_for_odd_characters() queries those.
/*
 * Scans aline byte by byte and reports every byte below CHAR_SPACE that is
 * not CHAR_LF, CHAR_CR or CHAR_TAB (tabs are deliberately excluded here).
 *
 * aline: NUL-terminated text of the current line; it is not modified.
 *
 * Reads the globals pswit[] and linecnt. With ECHO_SWITCH set the whole
 * line is echoed first; unless OVERVIEW_SWITCH is set, a message with the
 * line number, the 1-based column and the numeric byte value is printed.
 * What happens in overview mode is on lines not shown in this listing.
 */
1061 void check_for_control_characters(const char *aline)
1065 for (s=aline;*s;s++)
/* Fetch the byte through an unsigned char pointer before comparing it. */
1067 c=*(unsigned char *)s;
1068 if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
1070 if (pswit[ECHO_SWITCH])
1071 printf("\n%s\n",aline);
1072 if (!pswit[OVERVIEW_SWITCH])
1073 printf(" Line %ld column %d - Control character %d\n",
1074 linecnt,(int)(s-aline)+1,c);
1082 * check_for_odd_characters:
1084 * Check for binary and other odd characters.
/*
 * Scans aline and queries several kinds of unusual character: bytes that
 * are control codes or above 127, tabs, tildes, carats, forward slashes
 * and asterisks.
 *
 * aline:       NUL-terminated text of the current line.
 * warnings:    warnings->fslash gates the forward slash query and
 *              warnings->ast gates the asterisk query.
 * isemptyline: when set, the asterisk query is skipped.
 *
 * Each category is tested only while its e* flag is still zero; the
 * statements that set those flags are on lines not shown in this listing.
 * The asterisk query additionally needs PARANOID_SWITCH, and its condition
 * continues on a line that is not shown.
 * Two message texts exist for the first category (Non-ISO-8859 and
 * Non-ASCII); the test that chooses between them is not shown either.
 */
1086 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
1087 gboolean isemptyline)
1089 /* Don't repeat multiple warnings on one line. */
1090 int eNon_A=0,eTab=0,eTilde=0,eCarat=0,eFSlash=0,eAst=0;
1093 for (s=aline;*s;s++)
1095 c=*(unsigned char *)s;
/* Control codes are tested through the plain char *s, bytes above 127 through the unsigned copy c. */
1096 if (!eNon_A && (*s<CHAR_SPACE && *s!=9 && *s!='\n' || c>127))
1098 if (pswit[ECHO_SWITCH])
1099 printf("\n%s\n",aline);
1100 if (!pswit[OVERVIEW_SWITCH])
1102 printf(" Line %ld column %d - "
1103 "Non-ISO-8859 character %d\n",linecnt,(int)(s-aline)+1,c);
1105 printf(" Line %ld column %d - Non-ASCII character %d\n",
1106 linecnt,(int)(s-aline)+1,c);
1111 if (!eTab && *s==CHAR_TAB)
1113 if (pswit[ECHO_SWITCH])
1114 printf("\n%s\n",aline);
1115 if (!pswit[OVERVIEW_SWITCH])
1116 printf(" Line %ld column %d - Tab character?\n",
1117 linecnt,(int)(s-aline)+1);
1122 if (!eTilde && *s==CHAR_TILDE)
1125 * Often used by OCR software to indicate an
1126 * unrecognizable character.
1128 if (pswit[ECHO_SWITCH])
1129 printf("\n%s\n",aline);
1130 if (!pswit[OVERVIEW_SWITCH])
1131 printf(" Line %ld column %d - Tilde character?\n",
1132 linecnt,(int)(s-aline)+1);
1137 if (!eCarat && *s==CHAR_CARAT)
1139 if (pswit[ECHO_SWITCH])
1140 printf("\n%s\n",aline);
1141 if (!pswit[OVERVIEW_SWITCH])
1142 printf(" Line %ld column %d - Carat character?\n",
1143 linecnt,(int)(s-aline)+1);
1148 if (!eFSlash && *s==CHAR_FORESLASH && warnings->fslash)
1150 if (pswit[ECHO_SWITCH])
1151 printf("\n%s\n",aline);
1152 if (!pswit[OVERVIEW_SWITCH])
1153 printf(" Line %ld column %d - Forward slash?\n",
1154 linecnt,(int)(s-aline)+1);
1160 * Report asterisks only in paranoid mode,
1161 * since they're often deliberate.
1163 if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
1166 if (pswit[ECHO_SWITCH])
1167 printf("\n%s\n",aline);
1168 if (!pswit[OVERVIEW_SWITCH])
1169 printf(" Line %ld column %d - Asterisk?\n",
1170 linecnt,(int)(s-aline)+1);
1179 * check_for_long_line:
1181 * Check for line too long.
/*
 * Reports the line when strlen(aline) exceeds LONGEST_PG_LINE.
 *
 * aline: NUL-terminated text of the current line.
 *
 * The message prints the line length twice: once as the column and once
 * as the reported length. Reads the globals pswit[] and linecnt; the
 * overview-mode branch is on lines not shown in this listing.
 */
1183 void check_for_long_line(const char *aline)
1185 if (strlen(aline)>LONGEST_PG_LINE)
1187 if (pswit[ECHO_SWITCH])
1188 printf("\n%s\n",aline);
1189 if (!pswit[OVERVIEW_SWITCH])
1190 printf(" Line %ld column %d - Long line %d\n",
1191 linecnt,(int)strlen(aline),(int)strlen(aline));
1198 * check_for_short_line:
1200 * Check for line too short.
1202 * This one is a bit trickier to implement: we don't want to
1203 * flag the last line of a paragraph for being short, so we
1204 * have to wait until we know that our current line is a
1205 * "normal" line, then report the _previous_ line if it was too
1206 * short. We also don't want to report indented lines like
1207 * chapter heads or formatted quotations. We therefore keep
1208 * last->len as the length of the last line examined, and
1209 * last->blen as the length of the last but one, and try to
1210 * suppress unnecessary warnings by checking that both were of
1211 * "normal" length. We keep the first character of the last
1212 * line in last->start, and if it was a space, we assume that
1213 * the formatting is deliberate. I can't figure out a way to
1214 * distinguish something like a quoted verse left-aligned or
1215 * the header or footer of a letter from a paragraph of short
1216 * lines - maybe if I examined the whole paragraph, and if the
1217 * para has less than, say, 8 lines and if all lines are short,
1218 * then just assume it's OK? Need to look at some texts to see
1219 * how often a formula like this would get the right result.
/*
 * Reports the PREVIOUS line as short. The text echoed and measured is the
 * global prevline and the line number printed is linecnt-1; aline itself
 * is never printed.
 *
 * aline: current line; only its length is used and it must exceed 1.
 * last:  last->len must lie strictly between 1 and SHORTEST_PG_LINE,
 *        last->blen must exceed SHORTEST_PG_LINE, and last->start must
 *        not be CHAR_SPACE.
 *
 * NOTE(review): the last->blen>1 test looks redundant beside
 * last->blen>SHORTEST_PG_LINE unless that constant can be below 1 - confirm.
 */
1221 void check_for_short_line(const char *aline,const struct line_properties *last)
1223 if (strlen(aline)>1 && last->len>1 && last->len<SHORTEST_PG_LINE &&
1224 last->blen>1 && last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
1226 if (pswit[ECHO_SWITCH])
1227 printf("\n%s\n",prevline);
1228 if (!pswit[OVERVIEW_SWITCH])
1229 printf(" Line %ld column %d - Short line %d?\n",
1230 linecnt-1,(int)strlen(prevline),(int)strlen(prevline));
1237 * check_for_starting_punctuation:
1239 * Look for punctuation other than full ellipses at start of line.
/*
 * Reports a line whose first character is one of . ? ! , ; : unless the
 * line begins with the five characters of a spaced ellipsis.
 *
 * aline: NUL-terminated text of the current line.
 *
 * The *aline test comes first because strchr also matches the terminating
 * NUL of its search set, so an empty line would otherwise be treated as
 * starting with punctuation. The column is always reported as 1.
 */
1241 void check_for_starting_punctuation(const char *aline)
1243 if (*aline && strchr(".?!,;:",aline[0]) && strncmp(". . .",aline,5))
1245 if (pswit[ECHO_SWITCH])
1246 printf("\n%s\n",aline);
1247 if (!pswit[OVERVIEW_SWITCH])
1248 printf(" Line %ld column 1 - Begins with punctuation?\n",
1256 * check_for_spaced_emdash:
1258 * Check for spaced em-dashes.
1260 * We must check _all_ occurrences of "--" on the line
1261 * hence the loop - even if the first double-dash is OK
1262 * there may be another that's wrong later on.
/*
 * Reports each double hyphen that has a space directly before or directly
 * after it.
 *
 * aline: NUL-terminated text of the current line.
 *
 * t points at each double hyphen found by strstr starting from s. The
 * t>aline guard keeps the t[-1] read inside the line. How s is initialised
 * and advanced past each match is on lines not shown in this listing.
 */
1264 void check_for_spaced_emdash(const char *aline)
1268 while ((t=strstr(s,"--")))
1270 if (t>aline && t[-1]==CHAR_SPACE || t[2]==CHAR_SPACE)
1272 if (pswit[ECHO_SWITCH])
1273 printf("\n%s\n",aline);
1274 if (!pswit[OVERVIEW_SWITCH])
1275 printf(" Line %ld column %d - Spaced em-dash?\n",
1276 linecnt,(int)(t-aline)+1);
1285 * check_for_spaced_dash:
1287 * Check for spaced dashes.
/*
 * Reports a hyphen that has a space next to it.
 *
 * aline: NUL-terminated text of the current line.
 *
 * Only the first match of each pattern is examined. A space followed by a
 * hyphen is searched for first; the extra condition applied to that match
 * is on lines not shown in this listing. The second search, for a hyphen
 * followed by a space, sits in an else branch and reports the hyphen when
 * it starts the line or is not preceded by another hyphen.
 * NOTE(review): which if the else pairs with cannot be seen here - confirm.
 */
1289 void check_for_spaced_dash(const char *aline)
1292 if ((s=strstr(aline," -")))
1296 if (pswit[ECHO_SWITCH])
1297 printf("\n%s\n",aline);
1298 if (!pswit[OVERVIEW_SWITCH])
1299 printf(" Line %ld column %d - Spaced dash?\n",
1300 linecnt,(int)(s-aline)+1);
1305 else if ((s=strstr(aline,"- ")))
/* s==aline is tested first so that s[-1] is never read before the line start. */
1307 if (s==aline || s[-1]!='-')
1309 if (pswit[ECHO_SWITCH])
1310 printf("\n%s\n",aline);
1311 if (!pswit[OVERVIEW_SWITCH])
1312 printf(" Line %ld column %d - Spaced dash?\n",
1313 linecnt,(int)(s-aline)+1);
1321 * check_for_unmarked_paragraphs:
1323 * Check for unmarked paragraphs indicated by separate speakers.
1325 * May well be false positive:
1326 * "Bravo!" "Wonderful!" called the crowd.
1327 * but useful all the same.
/*
 * Searches aline for a double quote, white space, double quote sequence
 * and queries a missing paragraph break at the column of the match.
 *
 * aline: NUL-terminated text of the current line.
 *
 * NOTE(review): the two strstr patterns look identical in this listing.
 * The listing appears to collapse runs of spaces, so the second pattern
 * may contain a different amount of white space in the real source, and
 * the test between the two searches is not shown - confirm against the
 * original file.
 */
1329 void check_for_unmarked_paragraphs(const char *aline)
1332 s=strstr(aline,"\" \"");
1334 s=strstr(aline,"\" \"");
1337 if (pswit[ECHO_SWITCH])
1338 printf("\n%s\n",aline);
1339 if (!pswit[OVERVIEW_SWITCH])
1340 printf(" Line %ld column %d - Query missing paragraph break?\n",
1341 linecnt,(int)(s-aline)+1);
1348 * check_for_jeebies:
1350 * Check for "to he" and other easy h/b errors.
1352 * This is a very inadequate effort on the h/b problem,
1353 * but the phrase "to he" is nearly always an error, whereas "to
1354 * be" is quite common.
1355 * Similarly, '"Quiet!", be said.' is a non-be error
1356 * "to he" is _not_ always an error!:
1357 * "Where they went to he couldn't say."
1358 * Another false positive:
1359 * What would "Cinderella" be without the . . .
1360 * and another: "If he wants to he can see for himself."
/*
 * Looks for a fixed set of word sequences that usually come from OCR
 * confusion of h and b. There are three groups, each with its own
 * message: he/be, had/bad and hut/but.
 *
 * aline: NUL-terminated text of the current line.
 *
 * Every pattern is a case-sensitive strstr search that includes its
 * surrounding spaces or punctuation, so a phrase at the very start or the
 * very end of the line is not matched. s holds the match whose position is
 * reported as the column. The tests that chain the searches within a group
 * are on lines not shown in this listing.
 */
1362 void check_for_jeebies(const char *aline)
/* Group 1: be where he was meant, and the reverse. */
1365 s=strstr(aline," be could ");
1367 s=strstr(aline," be would ");
1369 s=strstr(aline," was be ");
1371 s=strstr(aline," be is ");
1373 s=strstr(aline," is be ");
1375 s=strstr(aline,"\", be ");
1377 s=strstr(aline,"\" be ");
1379 s=strstr(aline,"\" be ");
1381 s=strstr(aline," to he ");
1384 if (pswit[ECHO_SWITCH])
1385 printf("\n%s\n",aline);
1386 if (!pswit[OVERVIEW_SWITCH])
1387 printf(" Line %ld column %d - Query he/be error?\n",
1388 linecnt,(int)(s-aline)+1);
/* Group 2: had and bad confusions. */
1392 s=strstr(aline," the had ");
1394 s=strstr(aline," a had ");
1396 s=strstr(aline," they bad ");
1398 s=strstr(aline," she bad ");
1400 s=strstr(aline," he bad ");
1402 s=strstr(aline," you bad ");
1404 s=strstr(aline," i bad ");
1407 if (pswit[ECHO_SWITCH])
1408 printf("\n%s\n",aline);
1409 if (!pswit[OVERVIEW_SWITCH])
1410 printf(" Line %ld column %d - Query had/bad error?\n",
1411 linecnt,(int)(s-aline)+1);
/* Group 3: hut where but was meant, after a semicolon or comma. */
1415 s=strstr(aline,"; hut ");
1417 s=strstr(aline,", hut ");
1420 if (pswit[ECHO_SWITCH])
1421 printf("\n%s\n",aline);
1422 if (!pswit[OVERVIEW_SWITCH])
1423 printf(" Line %ld column %d - Query hut/but error?\n",
1424 linecnt,(int)(s-aline)+1);
1431 * check_for_mta_from:
1433 * Special case - angled bracket in front of "From" placed there by an
1434 * MTA when sending an e-mail.
/*
 * Queries a greater-than sign immediately followed by the word From.
 *
 * aline: NUL-terminated text of the current line.
 *
 * The search is a case-sensitive strstr over the whole line, so the
 * sequence is found anywhere, not only at the start. The reported column
 * is that of the greater-than sign. The test on s before reporting is on
 * a line not shown in this listing.
 */
1436 void check_for_mta_from(const char *aline)
1439 s=strstr(aline,">From");
1442 if (pswit[ECHO_SWITCH])
1443 printf("\n%s\n",aline);
1444 if (!pswit[OVERVIEW_SWITCH])
1445 printf(" Line %ld column %d - Query angled bracket with From\n",
1446 linecnt,(int)(s-aline)+1);
1453 * check_for_orphan_character:
1455 * Check for a single character line -
1456 * often an overflow from bad wrapping.
/*
 * Queries a line that consists of exactly one character.
 *
 * aline: NUL-terminated text of the current line.
 *
 * The test *aline && !aline[1] is true only for a one-character line.
 * Capital I, V, X and L are exempt as numerals; the exemption list
 * continues on a line not shown in this listing.
 */
1458 void check_for_orphan_character(const char *aline)
1460 if (*aline && !aline[1])
1462 if (*aline=='I' || *aline=='V' || *aline=='X' || *aline=='L' ||
1464 ; /* Nothing - ignore numerals alone on a line. */
1467 if (pswit[ECHO_SWITCH])
1468 printf("\n%s\n",aline);
1469 if (!pswit[OVERVIEW_SWITCH])
1470 printf(" Line %ld column 1 - Query single character line\n",
1479 * check_for_pling_scanno:
1481 * Check for I" - often should be !
/*
 * Queries a capital I that is preceded by a space and followed directly
 * by a double quote, which is often a misread exclamation mark.
 *
 * aline: NUL-terminated text of the current line.
 *
 * NOTE(review): this message formats the column with %ld, whereas the
 * sibling checks use %d with an int cast. The argument list is on a line
 * not shown in this listing - confirm the argument type matches.
 */
1483 void check_for_pling_scanno(const char *aline)
1486 s=strstr(aline," I\"");
1489 if (pswit[ECHO_SWITCH])
1490 printf("\n%s\n",aline);
1491 if (!pswit[OVERVIEW_SWITCH])
1492 printf(" Line %ld column %ld - Query I=exclamation mark?\n",
1500 * check_for_extra_period:
1502 * Check for period without a capital letter. Cut-down from gutspell.
1503 * Only works when it happens on a single line.
/*
 * In paranoid mode only, queries a period followed by a space where the
 * next word starts with a lower-case letter, suggesting that the period
 * does not end a sentence.
 *
 * aline:    NUL-terminated text of the current line.
 * warnings: warnings->isDutch enables a special case for a period followed
 *           by an apostrophe, one lower-case letter, a space and a capital.
 *
 * Visible outline (several linking statements are not shown here):
 *  - t walks over occurrences of a period plus space in aline;
 *  - s1 skips forward to the next letter or digit, and the candidate is
 *    pursued only when that character is a lower-case ASCII letter;
 *  - s1 then walks backwards from t-1 over letters, digits and in-word
 *    apostrophes to find the word in front of the period;
 *  - that word is copied to testword with g_strndup or g_strdup and is
 *    compared against abbrev[], tested for a leading digit, tested with
 *    isroman, and scanned for vowels;
 *  - a word is reported when VERBOSE_SWITCH is set or it is not yet in the
 *    qperiod tree; a copy of it is then inserted into qperiod.
 * NOTE(review): where testword is released is not visible - confirm it is
 * freed on every path.
 */
1505 void check_for_extra_period(const char *aline,const struct warnings *warnings)
1507 const char *s,*t,*s1;
1511 if (pswit[PARANOID_SWITCH])
1513 for (t=aline;strstr(t,". ");)
1519 /* start of line punctuation is handled elsewhere */
1522 if (!gcisalpha(t[-1]))
1527 if (warnings->isDutch)
1529 /* For Frank & Jeroen -- 's Middags case */
1530 if (t[2]==CHAR_SQUOTE && t[3]>='a' && t[3]<='z' &&
1531 t[4]==CHAR_SPACE && t[5]>='A' && t[5]<='Z')
1538 while (*s1 && !gcisalpha(*s1) && !isdigit(*s1))
1540 if (*s1>='a' && *s1<='z')
1542 /* we have something to investigate */
1544 /* so let's go back and find out */
1545 for (s1=t-1;s1>=aline &&
1546 (gcisalpha(*s1) || gcisdigit(*s1) || *s1==CHAR_SQUOTE &&
1547 gcisalpha(s1[1]) && gcisalpha(s1[-1]));s1--)
1552 testword=g_strndup(s1,s-s1);
1554 testword=g_strdup(s1);
1555 for (i=0;*abbrev[i];i++)
1556 if (!strcmp(testword,abbrev[i]))
1558 if (gcisdigit(*testword))
1562 if (isroman(testword))
1567 for (i=0;testword[i];i++)
1568 if (strchr(vowels,testword[i]))
1572 (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
1574 g_tree_insert(qperiod,g_strdup(testword),
1575 GINT_TO_POINTER(1));
1576 if (pswit[ECHO_SWITCH])
1577 printf("\n%s\n",aline);
1578 if (!pswit[OVERVIEW_SWITCH])
1579 printf(" Line %ld column %d - Extra period?\n",
1580 linecnt,(int)(t-aline)+1);
1592 * check_for_following_punctuation:
1594 * Check for words usually not followed by punctuation.
/*
 * When TYPO_SWITCH is set, queries punctuation that directly follows a
 * word which is normally not followed by that punctuation.
 *
 * aline: NUL-terminated text of the current line.
 *
 * Each word is lower-cased with g_ascii_strdown into inword and compared
 * against two NUL-string-terminated tables:
 *  - nocomma[]:  queried when the next character is a comma, semicolon or
 *                colon;
 *  - noperiod[]: queried when the next character is a period or an
 *                exclamation mark.
 * s points at the character examined and gives the reported column. The
 * word extraction loop and the release of inword are on lines not shown
 * in this listing.
 */
1596 void check_for_following_punctuation(const char *aline)
1599 const char *s,*wordstart;
1601 if (pswit[TYPO_SWITCH])
1612 inword=g_ascii_strdown(t,-1);
1614 for (i=0;*nocomma[i];i++)
1615 if (!strcmp(inword,nocomma[i]))
1617 if (*s==',' || *s==';' || *s==':')
1619 if (pswit[ECHO_SWITCH])
1620 printf("\n%s\n",aline);
1621 if (!pswit[OVERVIEW_SWITCH])
1622 printf(" Line %ld column %d - "
1623 "Query punctuation after %s?\n",
1624 linecnt,(int)(s-aline)+1,inword);
1629 for (i=0;*noperiod[i];i++)
1630 if (!strcmp(inword,noperiod[i]))
1632 if (*s=='.' || *s=='!')
1634 if (pswit[ECHO_SWITCH])
1635 printf("\n%s\n",aline);
1636 if (!pswit[OVERVIEW_SWITCH])
1637 printf(" Line %ld column %d - "
1638 "Query punctuation after %s?\n",
1639 linecnt,(int)(s-aline)+1,inword);
1652 * Check for commonly mistyped words,
1653 * and digits like 0 for O in a word.
/*
 * Walks the words of aline (fetched with getaword) and queries likely
 * typos and scannos.
 *
 * aline:    NUL-terminated text of the current line.
 * warnings: warnings->digit gates the standalone 0 and 1 query.
 *
 * Visible checks, in order:
 *  1. mixdigit(inword): digit mixed into a word.
 *  2. With TYPO_SWITCH or USERTYPO_SWITCH: a lower-cased copy (testword)
 *     is built; an upper-case letter after a lower-case one is examined,
 *     with exemptions for the mc and mac prefixes and for a letter that
 *     follows an apostrophe.
 *  3. With TYPO_SWITCH: unlikely two-letter starts (nostart[]) and ends
 *     (noend[]), a set of unlikely letter runs, words with no vowel or no
 *     consonant, the okword[] exclusion list, Roman numerals, the typo[]
 *     list, and certain single lower-case letters taken from inword.
 *     Reported words are counted in the qword tree; unless VERBOSE_SWITCH
 *     is set a word already present is treated as a duplicate.
 *  4. The user list: testword looked up in the usertypo tree.
 *  5. With PARANOID_SWITCH and warnings->digit: a word that is exactly
 *     0 or 1.
 * The statements that set istypo, alower, vowel and consonant, and those
 * that free inword and testword, are on lines not shown in this listing.
 *
 * NOTE(review): the first two messages report the word start offset plus
 * 1 while the last two report it plus 2 - confirm which is intended.
 * NOTE(review): the single-letter test also accepts n, which the comment
 * in front of it does not mention.
 */
1655 void check_for_typos(const char *aline,struct warnings *warnings)
1657 const char *s,*wordstart;
1658 gchar *inword,*testword;
1659 int i,alower,vowel,consonant,*dupcnt;
1660 gboolean isdup,istypo;
1664 inword=getaword(&s);
1668 continue; /* don't bother with empty lines */
1670 if (mixdigit(inword))
1672 if (pswit[ECHO_SWITCH])
1673 printf("\n%s\n",aline);
1674 if (!pswit[OVERVIEW_SWITCH])
1675 printf(" Line %ld column %d - Query digit in %s\n",
1676 linecnt,(int)(wordstart-aline)+1,inword);
1681 * Put the word through a series of tests for likely typos and OCR
1684 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
1687 testword=g_strdup(inword);
1689 for (i=0;i<(int)strlen(testword);i++)
1691 /* lowercase for testing */
1692 if (testword[i]>='a' && testword[i]<='z')
1694 if (alower && testword[i]>='A' && testword[i]<='Z')
1697 * We have an uppercase mid-word. However, there are
1699 * Mac and Mc like McGill
1700 * French contractions like l'Abbe
1702 if (i==2 && testword[0]=='m' && testword[1]=='c' ||
1703 i==3 && testword[0]=='m' && testword[1]=='a' &&
1704 testword[2]=='c' || i>0 && testword[i-1]==CHAR_SQUOTE)
1709 testword[i]=(char)tolower(testword[i]);
1712 if (pswit[TYPO_SWITCH])
1715 * Check for certain unlikely two-letter combinations at word
1718 if (strlen(testword)>1)
1720 for (i=0;*nostart[i];i++)
1721 if (!strncmp(testword,nostart[i],2))
1723 for (i=0;*noend[i];i++)
1724 if (!strncmp(testword+strlen(testword)-2,noend[i],2))
1727 /* ght is common, gbt never. Like that. */
1728 if (strstr(testword,"cb"))
1730 if (strstr(testword,"gbt"))
1732 if (strstr(testword,"pbt"))
1734 if (strstr(testword,"tbs"))
1736 if (strstr(testword,"mrn"))
1738 if (strstr(testword,"ahle"))
1740 if (strstr(testword,"ihle"))
1743 * "TBE" does happen - like HEARTBEAT - but uncommon.
1744 * Also "TBI" - frostbite, outbid - but uncommon.
1745 * Similarly "ii" like Hawaii, or Pompeii, and in Roman
1746 * numerals, but "ii" is a common scanno.
1748 if (strstr(testword,"tbi"))
1750 if (strstr(testword,"tbe"))
1752 if (strstr(testword,"ii"))
1755 * Check for no vowels or no consonants.
1756 * If none, flag a typo.
1758 if (!istypo && strlen(testword)>1)
1761 for (i=0;testword[i];i++)
1763 if (testword[i]=='y' || gcisdigit(testword[i]))
1765 /* Yah, this is loose. */
1769 else if (strchr(vowels,testword[i]))
1774 if (!vowel || !consonant)
1778 * Now exclude the word from being reported if it's in
1781 for (i=0;*okword[i];i++)
1782 if (!strcmp(testword,okword[i]))
1785 * What looks like a typo may be a Roman numeral.
1788 if (istypo && isroman(testword))
1790 /* Check the manual list of typos. */
1792 for (i=0;*typo[i];i++)
1793 if (!strcmp(testword,typo[i]))
1796 * Check lowercase s, l, i and m - special cases.
1797 * "j" - often a semi-colon gone wrong.
1798 * "d" for a missing apostrophe - he d
1801 if (!istypo && strlen(testword)==1 && strchr("slmijdn",*inword))
1805 dupcnt=g_tree_lookup(qword,testword);
1809 isdup=!pswit[VERBOSE_SWITCH];
1813 dupcnt=g_new0(int,1);
1814 g_tree_insert(qword,g_strdup(testword),dupcnt);
1819 if (pswit[ECHO_SWITCH])
1820 printf("\n%s\n",aline);
1821 if (!pswit[OVERVIEW_SWITCH])
1823 printf(" Line %ld column %d - Query word %s",
1824 linecnt,(int)(wordstart-aline)+1,inword);
1825 if (!pswit[VERBOSE_SWITCH])
1826 printf(" - not reporting duplicates");
1834 /* check the user's list of typos */
1835 if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
1837 if (pswit[ECHO_SWITCH])
1838 printf("\n%s\n",aline);
1839 if (!pswit[OVERVIEW_SWITCH])
1840 printf(" Line %ld column %d - Query possible scanno %s\n",
1841 linecnt,(int)(wordstart-aline)+2,inword);
1843 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
1845 if (pswit[PARANOID_SWITCH] && warnings->digit)
1847 /* In paranoid mode, query all 0 and 1 standing alone. */
1848 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
1850 if (pswit[ECHO_SWITCH])
1851 printf("\n%s\n",aline);
1852 if (!pswit[OVERVIEW_SWITCH])
1853 printf(" Line %ld column %d - Query standalone %s\n",
1854 linecnt,(int)(wordstart-aline)+2,inword);
1864 * check_for_misspaced_punctuation:
1866 * Look for added or missing spaces around punctuation and quotes.
1867 * If there is a punctuation character like ! with no space on
1868 * either side, suspect a missing!space. If there are spaces on
1869 * both sides , assume a typo. If we see a double quote with no
1870 * space or punctuation on either side of it, assume unspaced
1871 * quotes "like"this.
/*
 * Runs a series of passes over aline looking for missing or surplus
 * spaces around punctuation and quotes.
 *
 * aline:       NUL-terminated text of the current line.
 * parities:    running open/closed state for double quotes (dquote) and
 *              single quotes (squote). Both fields are toggled here, so
 *              the state carries over from one call to the next.
 * isemptyline: suppresses the spaced punctuation queries when set.
 *
 * Visible passes, in order:
 *  1. Each character after the first that is one of . ? ! , ; : _
 *     - Missing space: letters on both sides, or one of ? ! , ; : followed
 *       by a letter. Periods two positions away are inspected for the
 *       acronym exemption.
 *     - Spaced punctuation: a space before and a space or end of line
 *       after, unless the line is empty or the ellipsis exemption applies.
 *  2. One of ? ! , ; : preceded by a space and not followed by a space.
 *  3. A period that follows a space and precedes a letter.
 *  4. Unspaced quotes: a double quote whose neighbours are not in the
 *     accepted punctuation sets.
 *  5. Wrongspaced quotes: every double quote toggles parities->dquote.
 *     After the toggle a zero value is checked against characters that may
 *     follow a closing quote; the other test checks characters that may
 *     follow an opening quote and also fires at end of line.
 *  6. A double quote in column 1 followed by a closing-type character.
 *  7. With SQUOTE_SWITCH: the same parity scheme for single quotes that
 *     are not preceded by a letter, using parities->squote.
 * Several conditions and all assignments to isacro and isellipsis are on
 * lines not shown in this listing, as are the column arguments of some
 * messages.
 */
1873 void check_for_misspaced_punctuation(const char *aline,
1874 struct parities *parities,gboolean isemptyline)
1877 gboolean isacro,isellipsis;
1880 for (i=1;i<llen;i++)
1882 /* For each character in the line after the first. */
1883 if (strchr(".?!,;:_",aline[i])) /* if it's punctuation */
1885 /* we need to suppress warnings for acronyms like M.D. */
1887 /* we need to suppress warnings for ellipsis . . . */
1889 /* if there are letters on both sides of it or ... */
1890 if (gcisalpha(aline[i-1]) && gcisalpha(aline[i+1]) ||
1891 gcisalpha(aline[i+1]) && strchr("?!,;:",aline[i]))
1893 /* ...if it's strict punctuation followed by an alpha */
1896 if (i>2 && aline[i-2]=='.')
1898 if (i+2<llen && aline[i+2]=='.')
1903 if (pswit[ECHO_SWITCH])
1904 printf("\n%s\n",aline);
1905 if (!pswit[OVERVIEW_SWITCH])
1906 printf(" Line %ld column %d - Missing space?\n",
1912 if (aline[i-1]==CHAR_SPACE &&
1913 (aline[i+1]==CHAR_SPACE || aline[i+1]==0))
1916 * If there are spaces on both sides,
1917 * or space before and end of line.
1921 if (i>2 && aline[i-2]=='.')
1923 if (i+2<llen && aline[i+2]=='.')
1926 if (!isemptyline && !isellipsis)
1928 if (pswit[ECHO_SWITCH])
1929 printf("\n%s\n",aline);
1930 if (!pswit[OVERVIEW_SWITCH])
1931 printf(" Line %ld column %d - "
1932 "Spaced punctuation?\n",linecnt,i+1);
1939 /* Split out the characters that CANNOT be preceded by space. */
1941 for (i=1;i<llen;i++)
1943 /* for each character in the line after the first */
1944 if (strchr("?!,;:",aline[i]))
1946 /* if it's punctuation that _cannot_ have a space before it */
1947 if (aline[i-1]==CHAR_SPACE && !isemptyline &&
1948 aline[i+1]!=CHAR_SPACE)
1951 * If aline[i+1) DOES == space,
1952 * it was already reported just above.
1954 if (pswit[ECHO_SWITCH])
1955 printf("\n%s\n",aline);
1956 if (!pswit[OVERVIEW_SWITCH])
1957 printf(" Line %ld column %d - Spaced punctuation?\n",
1965 * Special case " .X" where X is any alpha.
1966 * This plugs a hole in the acronym code above.
1967 * Inelegant, but maintainable.
1970 for (i=1;i<llen;i++)
1972 /* for each character in the line after the first */
1975 /* if it's a period */
1976 if (aline[i-1]==CHAR_SPACE && gcisalpha(aline[i+1]))
1979 * If the period follows a space and
1980 * is followed by a letter.
1982 if (pswit[ECHO_SWITCH])
1983 printf("\n%s\n",aline);
1984 if (!pswit[OVERVIEW_SWITCH])
1985 printf(" Line %ld column %d - Spaced punctuation?\n",
1992 for (i=1;i<llen;i++)
1994 /* for each character in the line after the first */
1995 if (aline[i]==CHAR_DQUOTE)
1997 if (!strchr(" _-.'`,;:!/([{?}])",aline[i-1]) &&
1998 !strchr(" _-.'`,;:!/([{?}])",aline[i+1]) && aline[i+1] ||
1999 !strchr(" _-([{'`",aline[i-1]) && gcisalpha(aline[i+1]))
2001 if (pswit[ECHO_SWITCH])
2002 printf("\n%s\n",aline);
2003 if (!pswit[OVERVIEW_SWITCH])
2004 printf(" Line %ld column %d - Unspaced quotes?\n",
2011 /* Check parity of quotes. */
2012 for (s=aline;*s;s++)
2014 if (*s==CHAR_DQUOTE)
2016 parities->dquote=!parities->dquote;
2017 if (!parities->dquote)
2020 if (!strchr("_-.'`/,;:!?)]} ",s[1]))
2022 if (pswit[ECHO_SWITCH])
2023 printf("\n%s\n",aline);
2024 if (!pswit[OVERVIEW_SWITCH])
2025 printf(" Line %ld column %d - "
2026 "Wrongspaced quotes?\n",linecnt,(int)(s-aline)+1);
2034 if (!gcisalpha(s[1]) && !isdigit(s[1]) &&
2035 !strchr("_-/.'`([{$",s[1]) || !s[1])
2037 if (pswit[ECHO_SWITCH])
2038 printf("\n%s\n",aline);
2039 if (!pswit[OVERVIEW_SWITCH])
2040 printf(" Line %ld column %d - "
2041 "Wrongspaced quotes?\n",linecnt,(int)(s-aline)+1);
2048 if (*aline==CHAR_DQUOTE)
2050 if (strchr(",;:!?)]} ",aline[1]))
2052 if (pswit[ECHO_SWITCH])
2053 printf("\n%s\n",aline);
2054 if (!pswit[OVERVIEW_SWITCH])
2055 printf(" Line %ld column 1 - Wrongspaced quotes?\n",
2061 if (pswit[SQUOTE_SWITCH])
2063 for (s=aline;*s;s++)
2065 if ((*s==CHAR_SQUOTE || *s==CHAR_OPEN_SQUOTE) &&
2066 (s==aline || s>aline && !gcisalpha(s[-1]) ||
2069 parities->squote=!parities->squote;
2070 if (!parities->squote)
2073 if (!strchr("_-.'`/\",;:!?)]} ",s[1]))
2075 if (pswit[ECHO_SWITCH])
2076 printf("\n%s\n",aline);
2077 if (!pswit[OVERVIEW_SWITCH])
2078 printf(" Line %ld column %d - "
2079 "Wrongspaced singlequotes?\n",
2080 linecnt,(int)(s-aline)+1);
2088 if (!gcisalpha(s[1]) && !isdigit(s[1]) &&
2089 !strchr("_-/\".'`",s[1]) || !s[1])
2091 if (pswit[ECHO_SWITCH])
2092 printf("\n%s\n",aline);
2093 if (!pswit[OVERVIEW_SWITCH])
2094 printf(" Line %ld column %d - "
2095 "Wrongspaced singlequotes?\n",
2096 linecnt,(int)(s-aline)+1);
2107 * check_for_double_punctuation:
2109 * Look for double punctuation like ,. or ,,
2110 * Thanks to DW for the suggestion!
2111 * In books with references, ".," and ".;" are common
2112 * e.g. "etc., etc.," and vol. 1.; vol 3.;
2113 * OTOH, from my initial tests, there are also fairly
2114 * common errors. What to do? Make these cases paranoid?
2115 * ".," is the most common, so warnings->dotcomma is used
2116 * to suppress detailed reporting if it occurs often.
/*
 * Queries two adjacent punctuation characters from the set . ? ! , ; :
 *
 * aline:    NUL-terminated text of the current line.
 * warnings: warnings->dotcomma, when zero, exempts a period followed by a
 *           comma; warnings->isFrench exempts an ellipsis of three periods
 *           that is directly preceded or followed by one of , ; : ! ?
 *
 * The aline[i] && aline[i+1] tests are needed because strchr also matches
 * the terminating NUL of its search set.
 * A pair of identical characters is exempt for the period, the question
 * mark and whatever the continuation line (not shown here) adds.
 * The French ellipsis list appears twice: once in the exemption test and
 * once in an inner test whose body is on lines not shown in this listing.
 */
2118 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
2122 for (i=0;i<llen;i++)
2124 /* for each punctuation character in the line */
2125 if (strchr(".?!,;:",aline[i]) && strchr(".?!,;:",aline[i+1]) &&
2126 aline[i] && aline[i+1])
2128 /* followed by punctuation, it's a query, unless . . . */
2129 if (aline[i]==aline[i+1] && (aline[i]=='.' || aline[i]=='?' ||
2131 !warnings->dotcomma && aline[i]=='.' && aline[i+1]==',' ||
2132 warnings->isFrench && !strncmp(aline+i,",...",4) ||
2133 warnings->isFrench && !strncmp(aline+i,"...,",4) ||
2134 warnings->isFrench && !strncmp(aline+i,";...",4) ||
2135 warnings->isFrench && !strncmp(aline+i,"...;",4) ||
2136 warnings->isFrench && !strncmp(aline+i,":...",4) ||
2137 warnings->isFrench && !strncmp(aline+i,"...:",4) ||
2138 warnings->isFrench && !strncmp(aline+i,"!...",4) ||
2139 warnings->isFrench && !strncmp(aline+i,"...!",4) ||
2140 warnings->isFrench && !strncmp(aline+i,"?...",4) ||
2141 warnings->isFrench && !strncmp(aline+i,"...?",4))
2143 if (warnings->isFrench && !strncmp(aline+i,",...",4) ||
2144 warnings->isFrench && !strncmp(aline+i,"...,",4) ||
2145 warnings->isFrench && !strncmp(aline+i,";...",4) ||
2146 warnings->isFrench && !strncmp(aline+i,"...;",4) ||
2147 warnings->isFrench && !strncmp(aline+i,":...",4) ||
2148 warnings->isFrench && !strncmp(aline+i,"...:",4) ||
2149 warnings->isFrench && !strncmp(aline+i,"!...",4) ||
2150 warnings->isFrench && !strncmp(aline+i,"...!",4) ||
2151 warnings->isFrench && !strncmp(aline+i,"?...",4) ||
2152 warnings->isFrench && !strncmp(aline+i,"...?",4))
2154 ; /* do nothing for .. !! and ?? which can be legit */
2158 if (pswit[ECHO_SWITCH])
2159 printf("\n%s\n",aline);
2160 if (!pswit[OVERVIEW_SWITCH])
2161 printf(" Line %ld column %d - Double punctuation?\n",
2171 * check_for_spaced_quotes:
/*
 * Queries every quote character that has a space on both sides.
 *
 * aline: NUL-terminated text of the current line.
 *
 * Three loops run one after another over the line: one for a spaced double
 * quote, one for a spaced apostrophe and one for a spaced backtick. The
 * last two print the same singlequote message. The reported column is
 * t-aline+1, which is the position of the leading space of the match.
 * How s is initialised and advanced past each match is on lines not shown
 * in this listing.
 */
2173 void check_for_spaced_quotes(const char *aline)
2177 while ((t=strstr(s," \" ")))
2179 if (pswit[ECHO_SWITCH])
2180 printf("\n%s\n",aline);
2181 if (!pswit[OVERVIEW_SWITCH])
2182 printf(" Line %ld column %d - Spaced doublequote?\n",
2183 linecnt,(int)(t-aline+1));
2189 while ((t=strstr(s," ' ")))
2191 if (pswit[ECHO_SWITCH])
2192 printf("\n%s\n",aline);
2193 if (!pswit[OVERVIEW_SWITCH])
2194 printf(" Line %ld column %d - Spaced singlequote?\n",
2195 linecnt,(int)(t-aline+1));
2201 while ((t=strstr(s," ` ")))
2203 if (pswit[ECHO_SWITCH])
2204 printf("\n%s\n",aline);
2205 if (!pswit[OVERVIEW_SWITCH])
2206 printf(" Line %ld column %d - Spaced singlequote?\n",
2207 linecnt,(int)(t-aline+1));
2215 * check_for_miscased_genative:
2217 * Check special case of 'S instead of 's at end of word.
/*
 * Queries a capital S that follows an apostrophe when the character in
 * front of the apostrophe is a lower-case ASCII letter.
 *
 * aline: NUL-terminated text of the current line.
 *
 * s points at the apostrophe, so the reported column s-aline+2 is the
 * 1-based position of the capital S. The loop that moves s along the line,
 * and whatever keeps s[-1] inside the line, are on lines not shown in this
 * listing.
 */
2219 void check_for_miscased_genative(const char *aline)
2227 if (*s==CHAR_SQUOTE && s[1]=='S' && s[-1]>='a' && s[-1]<='z')
2229 if (pswit[ECHO_SWITCH])
2230 printf("\n%s\n",aline);
2231 if (!pswit[OVERVIEW_SWITCH])
2232 printf(" Line %ld column %d - Capital \"S\"?\n",
2233 linecnt,(int)(s-aline+2));
2242 * check_end_of_line:
2244 * Now check special cases - start and end of line -
2245 * for single and double quotes. Start is sometimes [sic]
2246 * but better to query it anyway.
2247 * While we're here, check for dash at end of line.
2249 void check_end_of_line(const char *aline,struct warnings *warnings)
2255 if (aline[llen-1]==CHAR_DQUOTE || aline[llen-1]==CHAR_SQUOTE ||
2256 aline[llen-1]==CHAR_OPEN_SQUOTE)
2257 if (aline[llen-2]==CHAR_SPACE)
2259 if (pswit[ECHO_SWITCH])
2260 printf("\n%s\n",aline);
2261 if (!pswit[OVERVIEW_SWITCH])
2262 printf(" Line %ld column %d - Spaced quote?\n",
2267 if ((aline[0]==CHAR_SQUOTE || aline[0]==CHAR_OPEN_SQUOTE) &&
2268 aline[1]==CHAR_SPACE)
2270 if (pswit[ECHO_SWITCH])
2271 printf("\n%s\n",aline);
2272 if (!pswit[OVERVIEW_SWITCH])
2273 printf(" Line %ld column 1 - Spaced quote?\n",linecnt);
2278 * Dash at end of line may well be legit - paranoid mode only
2279 * and don't report em-dash at line-end.
2281 if (pswit[PARANOID_SWITCH] && warnings->hyphen)
2283 for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--)
2285 if (aline[i]=='-' && aline[i-1]!='-')
2287 if (pswit[ECHO_SWITCH])
2288 printf("\n%s\n",aline);
2289 if (!pswit[OVERVIEW_SWITCH])
2290 printf(" Line %ld column %d - Hyphen at end of line?\n",
2298 * check_for_unspaced_bracket:
2300 * Brackets are often unspaced, but shouldn't be surrounded by alpha.
2301 * If so, suspect a scanno like "a]most".
/*
 * Queries a bracket character that has a letter on both sides.
 *
 * aline: NUL-terminated text of the current line.
 *
 * The loop runs i from 1 to llen-2, so the first and last characters of
 * the line are never treated as the bracket and both neighbours always
 * exist. Curly, square and round brackets are all covered.
 */
2303 void check_for_unspaced_bracket(const char *aline)
2307 for (i=1;i<llen-1;i++)
2309 /* for each bracket character in the line except 1st & last */
2310 if (strchr("{[()]}",aline[i]) && gcisalpha(aline[i-1]) &&
2311 gcisalpha(aline[i+1]))
2313 if (pswit[ECHO_SWITCH])
2314 printf("\n%s\n",aline);
2315 if (!pswit[OVERVIEW_SWITCH])
2316 printf(" Line %ld column %d - Unspaced bracket?\n",
2325 * check_for_unpunctuated_endquote:
/*
 * Queries a double quote that directly follows a letter, which suggests
 * that the closing punctuation inside the quotation is missing.
 *
 * aline: NUL-terminated text of the current line.
 *
 * The reported column is i, the 0-based index of the quote, which equals
 * the 1-based column of the letter in front of it.
 * NOTE(review): this check calls isalpha on a plain char while the other
 * checks in this file call gcisalpha - confirm that bytes above 127
 * cannot reach isalpha as negative values.
 */
2327 void check_for_unpunctuated_endquote(const char *aline)
2331 for (i=1;i<llen;i++)
2333 /* for each character in the line except 1st */
2334 if (aline[i]==CHAR_DQUOTE && isalpha(aline[i-1]))
2336 if (pswit[ECHO_SWITCH])
2337 printf("\n%s\n",aline);
2338 if (!pswit[OVERVIEW_SWITCH])
2339 printf(" Line %ld column %d - "
2340 "endquote missing punctuation?\n",linecnt,i);
2348 * check_for_html_tag:
2350 * Check for <HTML TAG>.
2352 * If there is a < in the line, followed at some point
2353 * by a > then we suspect HTML.
/*
 * Queries what looks like an HTML tag on the line.
 *
 * aline: NUL-terminated text of the current line.
 *
 * open is the first less-than sign and close is the first greater-than
 * sign; both searches start at the beginning of aline. i is the length of
 * the span from open to close inclusive and is used as both the field
 * width and the precision when printing the suspected tag.
 * NOTE(review): because close is searched from the line start it can lie
 * before open, which makes i zero or negative. Any guard on i is on lines
 * not shown in this listing - confirm.
 */
2355 void check_for_html_tag(const char *aline)
2358 const char *open,*close;
2359 open=strstr(aline,"<");
2362 close=strstr(aline,">");
2365 i=(int)(close-open+1);
2368 if (pswit[ECHO_SWITCH])
2369 printf("\n%s\n",aline);
2370 if (!pswit[OVERVIEW_SWITCH])
2371 printf(" Line %ld column %d - HTML Tag? %*.*s \n",
2372 linecnt,(int)(open-aline)+1,i,i,open);
2381 * check_for_html_entity:
2383 * Check for &symbol; HTML.
2385 * If there is a & in the line, followed at
2386 * some point by a ; then we suspect HTML.
/*
 * Queries what looks like an HTML character entity on the line.
 *
 * aline: NUL-terminated text of the current line.
 *
 * amp is the first ampersand and scolon is the first semicolon; both
 * searches start at the beginning of aline. i starts as the length of the
 * span from amp to scolon inclusive. The loop walks that span and resets
 * i to zero under a condition that is on a line not shown in this
 * listing; the existing comment indicates it rejects spans such as an
 * ampersand used between two ordinary words.
 * NOTE(review): scolon can lie before amp, giving a non-positive i. Any
 * guard on i is on lines not shown here - confirm.
 */
2388 void check_for_html_entity(const char *aline)
2391 const char *s,*amp,*scolon;
2392 amp=strstr(aline,"&");
2395 scolon=strstr(aline,";");
2398 i=(int)(scolon-amp+1);
2399 for (s=amp;s<scolon;s++)
2401 i=0; /* Don't report "Jones & Son;" */
2404 if (pswit[ECHO_SWITCH])
2405 printf("\n%s\n",aline);
2406 if (!pswit[OVERVIEW_SWITCH])
2407 printf(" Line %ld column %d - HTML symbol? %*.*s \n",
2408 linecnt,(int)(amp-aline)+1,i,i,amp);
2419 * If we are in a state of unbalanced quotes, and this line
2420 * doesn't begin with a quote, output the stored error message.
2421 * If the -P switch was used, print the warning even if the
2422 * new para starts with quotes.
/*
 * Emits the warning messages stored in pending and then clears them.
 *
 * aline:     current line; the character examined through s comes from it
 *            (the statements that position s are not shown here).
 * parastart: first line of the paragraph, echoed when ECHO_SWITCH is set.
 * pending:   six optional heap strings: dquote, squote, rbrack, sbrack,
 *            cbrack and unders. Each non-NULL string is released with
 *            g_free and its field is reset to NULL.
 *
 * The double quote message is printed only when the examined character is
 * not a double quote or QPARA_SWITCH is set. The single quote message has
 * a similar test whose last term is on a line not shown in this listing.
 * The bracket and underscore messages have no such test. Nothing is
 * printed with puts when OVERVIEW_SWITCH is set.
 */
2424 void print_pending(const char *aline,const char *parastart,
2425 struct pending *pending)
2431 if (pending->dquote)
2433 if (*s!=CHAR_DQUOTE || pswit[QPARA_SWITCH])
2435 if (!pswit[OVERVIEW_SWITCH])
2437 if (pswit[ECHO_SWITCH])
2438 printf("\n%s\n",parastart);
2439 puts(pending->dquote);
2444 g_free(pending->dquote);
2445 pending->dquote=NULL;
2447 if (pending->squote)
2449 if (*s!=CHAR_SQUOTE && *s!=CHAR_OPEN_SQUOTE || pswit[QPARA_SWITCH] ||
2452 if (!pswit[OVERVIEW_SWITCH])
2454 if (pswit[ECHO_SWITCH])
2455 printf("\n%s\n",parastart);
2456 puts(pending->squote);
2461 g_free(pending->squote);
2462 pending->squote=NULL;
2464 if (pending->rbrack)
2466 if (!pswit[OVERVIEW_SWITCH])
2468 if (pswit[ECHO_SWITCH])
2469 printf("\n%s\n",parastart);
2470 puts(pending->rbrack);
2474 g_free(pending->rbrack);
2475 pending->rbrack=NULL;
2477 if (pending->sbrack)
2479 if (!pswit[OVERVIEW_SWITCH])
2481 if (pswit[ECHO_SWITCH])
2482 printf("\n%s\n",parastart);
2483 puts(pending->sbrack);
2487 g_free(pending->sbrack);
2488 pending->sbrack=NULL;
2490 if (pending->cbrack)
2492 if (!pswit[OVERVIEW_SWITCH])
2494 if (pswit[ECHO_SWITCH])
2495 printf("\n%s\n",parastart);
2496 puts(pending->cbrack);
2500 g_free(pending->cbrack);
2501 pending->cbrack=NULL;
2503 if (pending->unders)
2505 if (!pswit[OVERVIEW_SWITCH])
2507 if (pswit[ECHO_SWITCH])
2508 printf("\n%s\n",parastart);
2509 puts(pending->unders);
2513 g_free(pending->unders);
2514 pending->unders=NULL;
2519 * check_for_mismatched_quotes:
2521 * At end of paragraph, check for mismatched quotes.
2523 * We don't want to report an error immediately, since it is a
2524 * common convention to omit the quotes at end of paragraph if
2525 * the next paragraph is a continuation of the same speaker.
2526 * Where this is the case, the next para should begin with a
2527 * quote, so we store the warning message and only display it
2528 * at the top of the next iteration if the new para doesn't
2529 * start with a quote.
2530 * The -p switch overrides this default, and warns of unclosed
2531 * quotes on _every_ paragraph, whether the next begins with a
/*
 * Builds the end-of-paragraph warning messages for unbalanced quotes,
 * brackets and underscores from the paragraph counters.
 *
 * counters: totals for the paragraph; read only.
 * pending:  presumably receives the messages built here. The assignment
 *           targets are on lines not shown in this listing - confirm.
 *
 * Conditions visible here:
 *  - counters->quot is odd: mismatched quotes;
 *  - SQUOTE_SWITCH set, at least one open single quote, and open and close
 *    counts differ: mismatched singlequotes;
 *  - the same, and the counts differ by something other than exactly one
 *    extra open quote: the case is flagged to be noted regardless of how
 *    the next paragraph begins (the flag itself is not shown);
 *  - r_brack, s_brack or c_brack non-zero: mismatched round, square or
 *    curly brackets;
 *  - c_unders is odd: mismatched underscores.
 * Each message is allocated with g_strdup_printf and carries linecnt.
 */
2534 void check_for_mismatched_quotes(const struct counters *counters,
2535 struct pending *pending)
2537 if (counters->quot%2)
2539 g_strdup_printf(" Line %ld - Mismatched quotes",linecnt);
2540 if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&
2541 counters->open_single_quote!=counters->close_single_quote)
2543 g_strdup_printf(" Line %ld - Mismatched singlequotes?",linecnt);
2544 if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&
2545 counters->open_single_quote!=counters->close_single_quote &&
2546 counters->open_single_quote!=counters->close_single_quote+1)
2548 * Flag it to be noted regardless of the
2549 * first char of the next para.
2552 if (counters->r_brack)
2554 g_strdup_printf(" Line %ld - Mismatched round brackets?",linecnt);
2555 if (counters->s_brack)
2557 g_strdup_printf(" Line %ld - Mismatched square brackets?",linecnt);
2558 if (counters->c_brack)
2560 g_strdup_printf(" Line %ld - Mismatched curly brackets?",linecnt);
2561 if (counters->c_unders%2)
2563 g_strdup_printf(" Line %ld - Mismatched underscores?",linecnt);
2567 * check_for_omitted_punctuation:
2569 * Check for omitted punctuation at end of paragraph by working back
2570 * through prevline. DW.
2571 * Need to check this only for "normal" paras.
2572 * So what is a "normal" para?
2573 * Not normal if one-liner (chapter headings, etc.)
2574 * Not normal if doesn't contain at least one locase letter
2575 * Not normal if starts with space
/*
 * At the end of a paragraph, queries a final line that ends in a letter
 * rather than punctuation.
 *
 * prevline:        last line of the paragraph just ended. This parameter
 *                  has the same name as a global used elsewhere in the
 *                  file and hides it inside this function.
 * last:            last->blen must exceed 2 for the check to run.
 * start_para_line: line number where the paragraph started; it must be
 *                  below linecnt-1, so one-line paragraphs are skipped.
 *
 * Visible outline:
 *  - the first loop scans prevline and uses i as a found flag (the test
 *    that sets it is on a line not shown in this listing);
 *  - the check proceeds only if the flag is set and prevline starts with a
 *    character above CHAR_SPACE, which also means prevline is not empty;
 *  - i then steps back from the last character over trailing double
 *    quotes and apostrophes;
 *  - a letter at that position is reported against line linecnt-1;
 *  - a final test against a punctuation set has its body on lines that
 *    are not shown.
 * NOTE(review): in the backwards loop the prevline[i]>CHAR_SPACE term
 * looks redundant once the character is known to be a quote, and i>0 is
 * tested after prevline[i] has been read - confirm this is intended.
 */
2577 void check_for_omitted_punctuation(const char *prevline,
2578 struct line_properties *last,int start_para_line)
2582 for (s=prevline,i=0;*s && !i;s++)
2584 /* use i to indicate the presence of a letter on the line */
2587 * This next "if" is a problem.
2588 * If we say "start_para_line <= linecnt - 1", that includes
2589 * one-line "paragraphs" like chapter heads. Lotsa false positives.
2590 * If we say "start_para_line < linecnt - 1" it doesn't, but then it
2591 * misses genuine one-line paragraphs.
2593 if (i && last->blen>2 && start_para_line<linecnt-1 && *prevline>CHAR_SPACE)
2595 for (i=strlen(prevline)-1;
2596 (prevline[i]==CHAR_DQUOTE || prevline[i]==CHAR_SQUOTE) &&
2597 prevline[i]>CHAR_SPACE && i>0;
2602 if (gcisalpha(prevline[i]))
2604 if (pswit[ECHO_SWITCH])
2605 printf("\n%s\n",prevline);
2606 if (!pswit[OVERVIEW_SWITCH])
2607 printf(" Line %ld column %d - "
2608 "No punctuation at para end?\n",
2609 linecnt-1,(int)strlen(prevline));
2614 if (strchr("-.:!([{?}])",prevline[i]))
2620 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
2622 const char *word=key;
2625 printf("\nNote: Queried word %s was duplicated %d times\n",
2635 void procfile(const char *filename)
2638 gchar *parastart=NULL; /* first line of current para */
2639 gchar *etext,*aline;
2642 struct first_pass_results *first_pass_results;
2643 struct warnings *warnings;
2644 struct counters counters={0};
2645 struct line_properties last={0};
2646 struct parities parities={0};
2647 struct pending pending={0};
2648 gboolean isemptyline;
2649 long start_para_line=0;
2650 gboolean isnewpara=FALSE,enddash=FALSE;
2651 last.start=CHAR_SPACE;
2652 linecnt=checked_linecnt=0;
2653 etext=read_etext(filename,&err);
2656 if (pswit[STDOUT_SWITCH])
2657 fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
2659 fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
2662 fprintf(stdout,"\n\nFile: %s\n\n",filename);
2663 first_pass_results=first_pass(etext);
2664 warnings=report_first_pass(first_pass_results);
2665 qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
2666 qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
2668 * Here we go with the main pass. Hold onto yer hat!
2672 while ((aline=flgets(&etext_ptr,linecnt+1)))
2677 if (pswit[DP_SWITCH] && !strncmp(aline,"-----File: ",11))
2678 continue; // skip DP page separators completely
2679 if (linecnt<first_pass_results->firstline ||
2680 (first_pass_results->footerline>0 &&
2681 linecnt>first_pass_results->footerline))
2683 if (pswit[HEADER_SWITCH])
2685 if (!strncmp(aline,"Title:",6))
2686 printf(" %s\n",aline);
2687 if (!strncmp(aline,"Author:",7))
2688 printf(" %s\n",aline);
2689 if (!strncmp(aline,"Release Date:",13))
2690 printf(" %s\n",aline);
2691 if (!strncmp(aline,"Edition:",8))
2692 printf(" %s\n\n",aline);
2694 continue; /* skip through the header */
2697 print_pending(aline,parastart,&pending);
2698 memset(&pending,0,sizeof(pending));
2699 isemptyline=analyse_quotes(aline,&counters);
2700 if (isnewpara && !isemptyline)
2702 /* This line is the start of a new paragraph. */
2703 start_para_line=linecnt;
2704 /* Capture its first line in case we want to report it later. */
2706 parastart=g_strdup(aline);
2707 memset(&parities,0,sizeof(parities)); /* restart the quote count */
2709 while (!gcisalpha(*s) && !gcisdigit(*s) && *s)
2711 if (*s>='a' && *s<='z')
2713 /* and its first letter is lowercase */
2714 if (pswit[ECHO_SWITCH])
2715 printf("\n%s\n",aline);
2716 if (!pswit[OVERVIEW_SWITCH])
2717 printf(" Line %ld column %d - "
2718 "Paragraph starts with lower-case\n",
2719 linecnt,(int)(s-aline)+1);
2723 isnewpara=FALSE; /* Signal the end of new para processing. */
2725 /* Check for an em-dash broken at line end. */
2726 if (enddash && *aline=='-')
2728 if (pswit[ECHO_SWITCH])
2729 printf("\n%s\n",aline);
2730 if (!pswit[OVERVIEW_SWITCH])
2731 printf(" Line %ld column 1 - Broken em-dash?\n",linecnt);
2736 for (s=aline+strlen(aline)-1;*s==' ' && s>aline;s--)
2738 if (s>=aline && *s=='-')
2740 check_for_control_characters(aline);
2742 check_for_odd_characters(aline,warnings,isemptyline);
2743 if (warnings->longline)
2744 check_for_long_line(aline);
2745 if (warnings->shortline)
2746 check_for_short_line(aline,&last);
2748 last.len=strlen(aline);
2749 last.start=aline[0];
2750 check_for_starting_punctuation(aline);
2753 check_for_spaced_emdash(aline);
2754 check_for_spaced_dash(aline);
2756 check_for_unmarked_paragraphs(aline);
2757 check_for_jeebies(aline);
2758 check_for_mta_from(aline);
2759 check_for_orphan_character(aline);
2760 check_for_pling_scanno(aline);
2761 check_for_extra_period(aline,warnings);
2762 check_for_following_punctuation(aline);
2763 check_for_typos(aline,warnings);
2764 check_for_misspaced_punctuation(aline,&parities,isemptyline);
2765 check_for_double_punctuation(aline,warnings);
2766 check_for_spaced_quotes(aline);
2767 check_for_miscased_genative(aline);
2768 check_end_of_line(aline,warnings);
2769 check_for_unspaced_bracket(aline);
2770 if (warnings->endquote)
2771 check_for_unpunctuated_endquote(aline);
2772 check_for_html_tag(aline);
2773 check_for_html_entity(aline);
2776 check_for_mismatched_quotes(&counters,&pending);
2777 memset(&counters,0,sizeof(counters));
2778 /* let the next iteration know that it's starting a new para */
2781 check_for_omitted_punctuation(prevline,&last,start_para_line);
2784 prevline=g_strdup(aline);
2794 if (!pswit[OVERVIEW_SWITCH])
2795 g_tree_foreach(qword,report_duplicate_queries,NULL);
2796 g_tree_unref(qword);
2797 g_tree_unref(qperiod);
2803 * Get one line from the input text, checking for
2804 * the existence of exactly one CR/LF line-end per line.
2806 * Returns: a pointer to the line.
2808 char *flgets(char **etext,long lcnt)
2812 gboolean isCR=FALSE;
2813 char *theline=*etext;
2820 /* either way, it's end of line */
2827 /* Error - a LF without a preceding CR */
2828 if (pswit[LINE_END_SWITCH])
2830 if (pswit[ECHO_SWITCH])
2831 printf("\n%*.*s\n",len,len,theline);
2832 if (!pswit[OVERVIEW_SWITCH])
2833 printf(" Line %ld - No CR?\n",lcnt);
2844 /* Error - two successive CRs */
2845 if (pswit[LINE_END_SWITCH])
2847 if (pswit[ECHO_SWITCH])
2848 printf("\n%*.*s\n",len,len,theline);
2849 if (!pswit[OVERVIEW_SWITCH])
2850 printf(" Line %ld - Two successive CRs?\n",lcnt);
2859 if (pswit[LINE_END_SWITCH] && isCR)
2861 if (pswit[ECHO_SWITCH])
2862 printf("\n%*.*s\n",len,len,theline);
2863 if (!pswit[OVERVIEW_SWITCH])
2864 printf(" Line %ld column %d - CR without LF?\n",
2875 if (pswit[MARKUP_SWITCH])
2876 postprocess_for_HTML(theline);
2877 if (pswit[DP_SWITCH])
2878 postprocess_for_DP(theline);
/*
 * mixdigit:
 *
 * Takes a "word" as a parameter, and checks whether it
 * contains a mixture of alpha and digits. Generally, this is an
 * error, but may not be for cases like 4th or L5 12s. 3d.
 *
 * Returns: 0 if no error found, 1 if error.
 */
int mixdigit(const char *checkword)
{
    int wehaveadigit,wehavealetter,firstdigits,query,wl;
    const char *s;
    wehaveadigit=wehavealetter=query=0;
    for (s=checkword;*s;s++)
    {
        if (gcisalpha(*s))
            wehavealetter=1;
        else if (gcisdigit(*s))
            wehaveadigit=1;
    }
    if (wehaveadigit && wehavealetter)
    {
        /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
        query=1;
        wl=(int)strlen(checkword);
        /* firstdigits = length of the leading run of digits */
        for (firstdigits=0;gcisdigit(checkword[firstdigits]);firstdigits++)
            ;
        /* digits, ending in st, rd, nd, th of either case */
        if (firstdigits+2==wl && (!g_ascii_strcasecmp(checkword+wl-2,"st") ||
          !g_ascii_strcasecmp(checkword+wl-2,"rd") ||
          !g_ascii_strcasecmp(checkword+wl-2,"nd") ||
          !g_ascii_strcasecmp(checkword+wl-2,"th")))
            query=0;
        /* digits, ending in the plural forms sts, rds, nds, ths */
        if (firstdigits+3==wl && (!g_ascii_strcasecmp(checkword+wl-3,"sts") ||
          !g_ascii_strcasecmp(checkword+wl-3,"rds") ||
          !g_ascii_strcasecmp(checkword+wl-3,"nds") ||
          !g_ascii_strcasecmp(checkword+wl-3,"ths")))
            query=0;
        /*
         * digits, ending in stly, rdly, ndly, thly.
         * The suffix is four characters long, so the digits must end
         * exactly four characters before the end of the word; testing
         * for three made the comparison start on the last digit (never
         * a match) and could index before the start of a short word.
         */
        if (firstdigits+4==wl && (!g_ascii_strcasecmp(checkword+wl-4,"stly") ||
          !g_ascii_strcasecmp(checkword+wl-4,"rdly") ||
          !g_ascii_strcasecmp(checkword+wl-4,"ndly") ||
          !g_ascii_strcasecmp(checkword+wl-4,"thly")))
            query=0;
        /* digits, ending in l, L, s or d */
        if (firstdigits+1==wl && (checkword[wl-1]=='l' ||
          checkword[wl-1]=='L' || checkword[wl-1]=='s' || checkword[wl-1]=='d'))
            query=0;
        /*
         * L at the start of a number, representing British pounds, like L500.
         * This is cute. We know the current word is mixeddigit. If the first
         * letter is L, there must be at least one digit following. If both
         * digits and letters follow, we have a genuine error, else we have a
         * capital L followed by digits, and we accept that as a non-error.
         */
        if (checkword[0]=='L' && !mixdigit(checkword+1))
            query=0;
    }
    return query;
}
2945 * Extracts the first/next "word" from the line, and returns it.
2946 * A word is defined as one English word unit--or at least that's the aim.
2947 * "ptr" is advanced to the position in the line where we will start
2948 * looking for the next word.
2950 * Returns: A newly-allocated string.
2952 gchar *getaword(const char **ptr)
2957 word=g_string_new(NULL);
2958 for (;!gcisdigit(**ptr) && !gcisalpha(**ptr) && **ptr;(*ptr)++)
2961 * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
2962 * Especially yucky is the case of L1,000
2963 * This section looks for a pattern of characters including a digit
2964 * followed by a comma or period followed by one or more digits.
2965 * If found, it returns this whole pattern as a word; otherwise we discard
2966 * the results and resume our normal programming.
2969 for (;gcisdigit(*s) || gcisalpha(*s) || *s==',' || *s=='.';s++)
2970 g_string_append_c(word,*s);
2971 for (i=1;i+1<word->len;i++)
2973 if (word->str[i]=='.' || word->str[i]==',')
2975 if (gcisdigit(word->str[i-1]) && gcisdigit(word->str[i-1]))
2978 return g_string_free(word,FALSE);
2982 /* we didn't find a punctuated number - do the regular getword thing */
2983 g_string_truncate(word,0);
2984 for (;gcisdigit(**ptr) || gcisalpha(**ptr) || **ptr=='\'';(*ptr)++)
2985 g_string_append_c(word,**ptr);
2986 return g_string_free(word,FALSE);
2992 * Is this word a Roman Numeral?
2994 * It doesn't actually validate that the number is a valid Roman Numeral--for
2995 * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
2996 * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
2997 * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
2998 * expressions thereof, except when it came to taxes. Allow any number of M,
2999 * an optional D, an optional CM or CD, any number of optional Cs, an optional
3000 * XL or an optional XC, an optional IX or IV, an optional V and any number
3003 gboolean isroman(const char *t)
3009 while (*t=='m' && *t)
3013 if (*t=='c' && t[1]=='m')
3015 if (*t=='c' && t[1]=='d')
3017 while (*t=='c' && *t)
3019 if (*t=='x' && t[1]=='l')
3021 if (*t=='x' && t[1]=='c')
3025 while (*t=='x' && *t)
3027 if (*t=='i' && t[1]=='x')
3029 if (*t=='i' && t[1]=='v')
3033 while (*t=='i' && *t)
3041 * A version of isalpha() that is somewhat lenient on 8-bit texts.
3042 * If we use the standard function, 8-bit accented characters break
3043 * words, so that tete with accented characters appears to be two words, "t"
3044 * and "t", with 8-bit characters between them. This causes over-reporting of
3045 * errors. gcisalpha() recognizes accented letters from the CP1252 (Windows)
3046 * and ISO-8859-1 character sets, which are the most common PG 8-bit types.
3048 gboolean gcisalpha(unsigned char c)
3050 if (c>='a' && c<='z')
3052 if (c>='A' && c<='Z')
3056 if (c>=192 && c!=208 && c!=215 && c!=222 && c!=240 && c!=247 && c!=254)
3058 if (c==140 || c==142 || c==156 || c==158 || c==159)
3066 * A version of isdigit() that doesn't get confused in 8-bit texts.
3068 gboolean gcisdigit(unsigned char c)
3070 return c>='0' && c<='9';
3076 * A version of isletter() that doesn't get confused in 8-bit texts.
3077 * NB: this is ISO-8891-1-specific.
3079 gboolean gcisletter(unsigned char c)
3081 return c>='A' && c<='Z' || c>='a' && c<='z' || c>=192;
3085 * postprocess_for_DP:
3087 * Invoked with the -d switch from flgets().
3088 * It simply "removes" from the line a hard-coded set of common
3089 * DP-specific tags, so that the line passed to the main routine has
3090 * been pre-cleaned of DP markup.
3092 void postprocess_for_DP(char *theline)
3098 for (i=0;*DPmarkup[i];i++)
3100 s=strstr(theline,DPmarkup[i]);
3103 t=s+strlen(DPmarkup[i]);
3111 s=strstr(theline,DPmarkup[i]);
3117 * postprocess_for_HTML:
3119 * Invoked with the -m switch from flgets().
3120 * It simply "removes" from the line a hard-coded set of common
3121 * HTML tags and "replaces" a hard-coded set of common HTML
3122 * entities, so that the line passed to the main routine has
3123 * been pre-cleaned of HTML.
3125 void postprocess_for_HTML(char *theline)
3127 if (strchr(theline,'<') && strchr(theline,'>'))
3128 while (losemarkup(theline))
3130 while (loseentities(theline))
3134 char *losemarkup(char *theline)
3140 s=strstr(theline,"<");
3141 t=strstr(theline,">");
3144 for (i=0;*markup[i];i++)
3145 if (!tagcomp(s+1,markup[i]))
3158 /* It's an unrecognized <xxx>. */
3162 char *loseentities(char *theline)
3168 for (i=0;*entities[i].htmlent;i++)
3170 s=strstr(theline,entities[i].htmlent);
3173 t=malloc((size_t)strlen(s));
3176 strcpy(t,s+strlen(entities[i].htmlent));
3177 strcpy(s,entities[i].textent);
3183 for (i=0;*entities[i].htmlnum;i++)
3185 s=strstr(theline,entities[i].htmlnum);
3188 t=malloc((size_t)strlen(s));
3191 strcpy(t,s+strlen(entities[i].htmlnum));
3192 strcpy(s,entities[i].textent);
3201 int tagcomp(const char *strin,const char *basetag)
3207 t++; /* ignore a slash */
3210 if (tolower(*s)!=tolower(*t))
3218 void proghelp(GOptionContext *context)
3221 fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
3222 fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
3223 fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
3224 fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
3225 "For details, read the file COPYING.\n",stderr);
3226 fputs("This is Free Software; "
3227 "you may redistribute it under certain conditions (GPL);\n",stderr);
3228 fputs("read the file COPYING for details.\n\n",stderr);
3229 help=g_option_context_get_help(context,TRUE,NULL);
3232 fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
3233 fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
3234 "non-ASCII\n",stderr);
3235 fputs("characters like accented letters, "
3236 "lines longer than 75 or shorter than 55,\n",stderr);
3237 fputs("unbalanced quotes or brackets, "
3238 "a variety of badly formatted punctuation, \n",stderr);
3239 fputs("HTML tags, some likely typos. "
3240 "It is NOT a substitute for human judgement.\n",stderr);