1 /*************************************************************************/
2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com> */
5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk> */
7 /* This program is free software; you can redistribute it and/or modify */
8 /* it under the terms of the GNU General Public License as published by */
9 /* the Free Software Foundation; either version 2 of the License, or */
10 /* (at your option) any later version. */
12 /* This program is distributed in the hope that it will be useful, */
13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
15 /* GNU General Public License for more details. */
17 /* You should have received a copy of the GNU General Public License */
18 /* along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 /*************************************************************************/
32 "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
33 "nad", "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa",
34 "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
35 "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
36 "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
37 "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
38 "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
39 "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
40 "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
41 "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
42 "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
43 "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
44 "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
45 "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
46 "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
47 "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
48 "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
49 "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
50 "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
51 "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
52 "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
53 "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
54 "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
55 "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
56 "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
57 "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
58 "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
59 "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
60 "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
66 /* Common abbreviations and other OK words not to query as typos. */
68 "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
69 "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
70 "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
71 "outbid", "outbids", "frostbite", "frostbitten", ""
74 /* Common abbreviations that cause otherwise unexplained periods. */
76 "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
77 "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
81 * Two-Letter combinations that rarely if ever start words,
82 * but are common scannos or otherwise common letter combinations.
85 "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
89 * Two-Letter combinations that rarely if ever end words,
90 * but are common scannos or otherwise common letter combinations.
93 "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
94 "sw", "gr", "sl", "cl", "iy", ""
98 "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
99 "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
100 "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
101 "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
105 "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
109 "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
110 "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
111 "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
112 "during", "let", "toward", "among", ""
116 "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
117 "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
118 "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
119 "among", "those", "into", "whom", "having", "thence", ""
127 "&", "&", "&",
128 "<", "<", "<",
129 ">", ">", ">",
130 "°", "°", " degrees",
131 "£", "£", "L",
132 """, """, "\"", /* quotation mark = APL quote */
133 "Œ", "Œ", "OE", /* latin capital ligature OE */
134 "œ", "œ", "oe", /* latin small ligature oe */
135 "Š", "Š", "S", /* latin capital letter S with caron */
136 "š", "š", "s", /* latin small letter s with caron */
137 "Ÿ", "Ÿ", "Y", /* latin capital letter Y with diaeresis */
138 "ˆ", "ˆ", "", /* modifier letter circumflex accent */
139 "˜", "˜", "~", /* small tilde, U+02DC ISOdia */
140 " ", " ", " ", /* en space, U+2002 ISOpub */
141 " ", " ", " ", /* em space, U+2003 ISOpub */
142 " ", " ", " ", /* thin space, U+2009 ISOpub */
143 "–", "–", "-", /* en dash, U+2013 ISOpub */
144 "—", "—", "--", /* em dash, U+2014 ISOpub */
145 "’", "’", "'", /* right single quotation mark */
146 "‚", "‚", "'", /* single low-9 quotation mark */
147 "“", "“", "\"", /* left double quotation mark */
148 "”", "”", "\"", /* right double quotation mark */
149 "„", "„", "\"", /* double low-9 quotation mark */
150 "‹", "‹", "\"", /* single left-pointing angle quotation mark */
151 "›", "›", "\"", /* single right-pointing angle quotation mark */
152 " ", " ", " ", /* no-break space = non-breaking space, */
153 "¡", "¡", "!", /* inverted exclamation mark */
154 "¢", "¢", "c", /* cent sign */
155 "£", "£", "L", /* pound sign */
156 "¤", "¤", "$", /* currency sign */
157 "¥", "¥", "Y", /* yen sign = yuan sign */
158 "§", "§", "--", /* section sign */
159 "¨", "¨", " ", /* diaeresis = spacing diaeresis */
160 "©", "©", "(C) ", /* copyright sign */
161 "ª", "ª", " ", /* feminine ordinal indicator */
162 "«", "«", "\"", /* left-pointing double angle quotation mark */
163 "­", "­", "-", /* soft hyphen = discretionary hyphen */
164 "®", "®", "(R) ", /* registered sign = registered trade mark sign */
165 "¯", "¯", " ", /* macron = spacing macron = overline */
166 "°", "°", " degrees", /* degree sign */
167 "±", "±", "+-", /* plus-minus sign = plus-or-minus sign */
168 "²", "²", "2", /* superscript two = superscript digit two */
169 "³", "³", "3", /* superscript three = superscript digit three */
170 "´", "´", " ", /* acute accent = spacing acute */
171 "µ", "µ", "m", /* micro sign */
172 "¶", "¶", "--", /* pilcrow sign = paragraph sign */
173 "¸", "¸", " ", /* cedilla = spacing cedilla */
174 "¹", "¹", "1", /* superscript one = superscript digit one */
175 "º", "º", " ", /* masculine ordinal indicator */
176 "»", "»", "\"", /* right-pointing double angle quotation mark */
177 "¼", "¼", "1/4", /* vulgar fraction one quarter */
178 "½", "½", "1/2", /* vulgar fraction one half */
179 "¾", "¾", "3/4", /* vulgar fraction three quarters */
180 "¿", "¿", "?", /* inverted question mark */
181 "À", "À", "A", /* latin capital letter A with grave */
182 "Á", "Á", "A", /* latin capital letter A with acute */
183 "Â", "Â", "A", /* latin capital letter A with circumflex */
184 "Ã", "Ã", "A", /* latin capital letter A with tilde */
185 "Ä", "Ä", "A", /* latin capital letter A with diaeresis */
186 "Å", "Å", "A", /* latin capital letter A with ring above */
187 "Æ", "Æ", "AE", /* latin capital letter AE */
188 "Ç", "Ç", "C", /* latin capital letter C with cedilla */
189 "È", "È", "E", /* latin capital letter E with grave */
190 "É", "É", "E", /* latin capital letter E with acute */
191 "Ê", "Ê", "E", /* latin capital letter E with circumflex */
192 "Ë", "Ë", "E", /* latin capital letter E with diaeresis */
193 "Ì", "Ì", "I", /* latin capital letter I with grave */
194 "Í", "Í", "I", /* latin capital letter I with acute */
195 "Î", "Î", "I", /* latin capital letter I with circumflex */
196 "Ï", "Ï", "I", /* latin capital letter I with diaeresis */
197 "Ð", "Ð", "E", /* latin capital letter ETH */
198 "Ñ", "Ñ", "N", /* latin capital letter N with tilde */
199 "Ò", "Ò", "O", /* latin capital letter O with grave */
200 "Ó", "Ó", "O", /* latin capital letter O with acute */
201 "Ô", "Ô", "O", /* latin capital letter O with circumflex */
202 "Õ", "Õ", "O", /* latin capital letter O with tilde */
203 "Ö", "Ö", "O", /* latin capital letter O with diaeresis */
204 "×", "×", "*", /* multiplication sign */
205 "Ø", "Ø", "O", /* latin capital letter O with stroke */
206 "Ù", "Ù", "U", /* latin capital letter U with grave */
207 "Ú", "Ú", "U", /* latin capital letter U with acute */
208 "Û", "Û", "U", /* latin capital letter U with circumflex */
209 "Ü", "Ü", "U", /* latin capital letter U with diaeresis */
210 "Ý", "Ý", "Y", /* latin capital letter Y with acute */
211 "Þ", "Þ", "TH", /* latin capital letter THORN */
212 "ß", "ß", "sz", /* latin small letter sharp s = ess-zed */
213 "à", "à", "a", /* latin small letter a with grave */
214 "á", "á", "a", /* latin small letter a with acute */
215 "â", "â", "a", /* latin small letter a with circumflex */
216 "ã", "ã", "a", /* latin small letter a with tilde */
217 "ä", "ä", "a", /* latin small letter a with diaeresis */
218 "å", "å", "a", /* latin small letter a with ring above */
219 "æ", "æ", "ae", /* latin small letter ae */
220 "ç", "ç", "c", /* latin small letter c with cedilla */
221 "è", "è", "e", /* latin small letter e with grave */
222 "é", "é", "e", /* latin small letter e with acute */
223 "ê", "ê", "e", /* latin small letter e with circumflex */
224 "ë", "ë", "e", /* latin small letter e with diaeresis */
225 "ì", "ì", "i", /* latin small letter i with grave */
226 "í", "í", "i", /* latin small letter i with acute */
227 "î", "î", "i", /* latin small letter i with circumflex */
228 "ï", "ï", "i", /* latin small letter i with diaeresis */
229 "ð", "ð", "eth", /* latin small letter eth */
230 "ñ", "ñ", "n", /* latin small letter n with tilde */
231 "ò", "ò", "o", /* latin small letter o with grave */
232 "ó", "ó", "o", /* latin small letter o with acute */
233 "ô", "ô", "o", /* latin small letter o with circumflex */
234 "õ", "õ", "o", /* latin small letter o with tilde */
235 "ö", "ö", "o", /* latin small letter o with diaeresis */
236 "÷", "÷", "/", /* division sign */
237 "ø", "ø", "o", /* latin small letter o with stroke */
238 "ù", "ù", "u", /* latin small letter u with grave */
239 "ú", "ú", "u", /* latin small letter u with acute */
240 "û", "û", "u", /* latin small letter u with circumflex */
241 "ü", "ü", "u", /* latin small letter u with diaeresis */
242 "ý", "ý", "y", /* latin small letter y with acute */
243 "þ", "þ", "th", /* latin small letter thorn */
244 "ÿ", "ÿ", "y", /* latin small letter y with diaeresis */
248 /* special characters */
/* Some codes below are written as decimal ASCII values and the rest as
   character literals; the trailing notes name the character meant. */
249 #define CHAR_SPACE 32 /* space */
253 #define CHAR_DQUOTE 34 /* double quote */
254 #define CHAR_SQUOTE 39 /* apostrophe */
255 #define CHAR_OPEN_SQUOTE 96 /* backtick */
256 #define CHAR_TILDE 126 /* tilde */
257 #define CHAR_ASTERISK 42 /* asterisk */
258 #define CHAR_FORESLASH 47 /* forward slash */
259 #define CHAR_CARAT 94 /* circumflex */
261 #define CHAR_UNDERSCORE '_'
262 #define CHAR_OPEN_CBRACK '{'
263 #define CHAR_CLOSE_CBRACK '}'
264 #define CHAR_OPEN_RBRACK '('
265 #define CHAR_CLOSE_RBRACK ')'
266 #define CHAR_OPEN_SBRACK '['
267 #define CHAR_CLOSE_SBRACK ']'
269 /* longest and shortest normal PG line lengths */
/* first_pass() compares each line length against LONGEST_PG_LINE and
   counts lines above WAY_TOO_LONG in verylongline; SHORTEST_PG_LINE is
   used there in the short-line test on the previous line. */
270 #define LONGEST_PG_LINE 75
271 #define WAY_TOO_LONG 80
272 #define SHORTEST_PG_LINE 55
292 gboolean pswit[SWITNO]; /* program switches */
/*
 * Command-line option table. parse_options() hands it to
 * g_option_context_add_main_entries(); every entry is a plain flag
 * (G_OPTION_ARG_NONE) whose target is one element of pswit[].
 * The relaxed, line-end and noecho flags set PARANOID_SWITCH,
 * LINE_END_SWITCH and ECHO_SWITCH, which parse_options() inverts after
 * parsing, so giving the flag turns the corresponding behaviour off.
 */
294 static GOptionEntry options[]={
295 { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
296 "Ignore DP-specific markup", NULL },
297 { "noecho", 'e', 0, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
298 "Don't echo queried line", NULL },
299 { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
300 "Check single quotes", NULL },
301 { "typo", 't', 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
302 "Check common typos", NULL },
303 { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
304 "Require closure of quotes on every paragraph", NULL },
305 { "relaxed", 'x', 0, G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
306 "Disable paranoid querying of everything", NULL },
307 { "line-end", 'l', 0, G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
308 "Disable line end checking", NULL },
309 { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
310 "Overview: just show counts", NULL },
311 { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
312 "Output errors to stdout instead of stderr", NULL },
313 { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
314 "Echo header fields", NULL },
315 { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
316 "Ignore markup in < >", NULL },
317 { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
318 "Use file of user-defined typos", NULL },
319 { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
320 "Defaults for use on www upload", NULL },
321 { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
322 "Verbose - list everything", NULL },
/*
 * File-scope tallies. The cnt_* query counters from cnt_dquot through
 * cnt_lineend are printed, and summed into the total, by main() when
 * OVERVIEW_SWITCH is set; linecnt and checked_linecnt are printed in
 * the same summary.
 */
326 long cnt_dquot; /* for overview mode, count of doublequote queries */
327 long cnt_squot; /* for overview mode, count of singlequote queries */
328 long cnt_brack; /* for overview mode, count of brackets queries */
329 long cnt_bin; /* for overview mode, count of non-ASCII queries */
330 long cnt_odd; /* for overview mode, count of odd character queries */
331 long cnt_long; /* for overview mode, count of long line errors */
332 long cnt_short; /* for overview mode, count of short line queries */
333 long cnt_punct; /* for overview mode,
334 count of punctuation and spacing queries */
335 long cnt_dash; /* for overview mode, count of dash-related queries */
336 long cnt_word; /* for overview mode, count of word queries */
337 long cnt_html; /* for overview mode, count of html queries */
338 long cnt_lineend; /* for overview mode, count of line-end queries */
339 long cnt_spacend; /* count of lines with space at end */
340 long linecnt; /* count of total lines in the file */
341 long checked_linecnt; /* count of lines actually checked */
/* Forward declarations. */
343 void proghelp(GOptionContext *context);
344 void procfile(const char *);
348 gboolean mixdigit(const char *);
349 gchar *getaword(const char **);
350 char *flgets(char **,long);
351 void postprocess_for_HTML(char *);
352 char *linehasmarkup(char *);
353 char *losemarkup(char *);
354 gboolean tagcomp(const char *,const char *);
355 char *loseentities(char *);
356 gboolean isroman(const char *);
357 void postprocess_for_DP(char *);
/* NOTE(review): presumably lookup trees of already-queried words and
   periods - confirm against the code that fills them. */
359 GTree *qword,*qperiod;
/*
 * Totals collected by first_pass() over the whole text and read by
 * report_first_pass(). firstline and footerline hold line numbers
 * (first line of non-header text, and the footer line); the other
 * members are counts.
 */
361 struct first_pass_results {
362 long firstline,astline;
363 long footerline,totlen,binlen,alphalen,endquote_count,shortline,dotcomma;
364 long fslashline,hyphens,longline,verylongline,htmcount,standalone_digit;
365 long spacedash,emdash,space_emdash,non_PG_space_emdash,PG_space_emdash;
366 int Dutchcount,Frenchcount;
/* NOTE(review): the next two lines match members that
   report_first_pass() and check_for_odd_characters() access through
   struct warnings (shortline, fslash, isDutch, isFrench) - confirm. */
370 int shortline,longline,bin,dash,dotcomma,ast,fslash,digit,hyphen;
372 gboolean isDutch,isFrench;
/* NOTE(review): the next two lines match members that analyse_quotes()
   updates through struct counters - confirm. */
377 int c_unders,c_brack,s_brack,r_brack;
378 int open_single_quote,close_single_quote;
381 struct line_properties {
382 unsigned int len,blen;
/* NOTE(review): owning struct of the pointers below is not evident
   from the surrounding lines - confirm before relying on it. */
391 char *dquote,*squote,*rbrack,*sbrack,*cbrack,*unders;
/*
 * parse_options:
 *
 * Parse the command line with the GLib option parser, using the
 * options[] table, and then normalise pswit[]. argc and argv are
 * passed by address and handed straight to g_option_context_parse().
 * If parsing fails, the GError message and a usage hint are printed
 * with g_printerr().
 *
 * After parsing:
 *  - PARANOID_SWITCH, LINE_END_SWITCH and ECHO_SWITCH are inverted,
 *    because their command-line flags mean turn off;
 *  - when PARANOID_SWITCH ends up set, TYPO_SWITCH is inverted too;
 *  - OVERVIEW_SWITCH forces ECHO_SWITCH off;
 *  - WEB_SWITCH overwrites every switch assigned below with a fixed
 *    value, regardless of the other flags given.
 */
395 void parse_options(int *argc,char ***argv)
398 GOptionContext *context;
399 context=g_option_context_new(
400 "file - looks for errors in Project Gutenberg(TM) etexts");
401 g_option_context_add_main_entries(context,options,NULL);
402 if (!g_option_context_parse(context,argc,argv,&err))
404 g_printerr("Bookloupe: %s\n",err->message);
405 g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
408 /* Paranoid checking is turned OFF, not on, by its switch */
409 pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
410 if (pswit[PARANOID_SWITCH])
411 /* if running in paranoid mode, typo checks default to enabled */
/* i.e. here the typo flag switches the typo checks off instead of on */
412 pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
413 /* Line-end checking is turned OFF, not on, by its switch */
414 pswit[LINE_END_SWITCH]=!pswit[LINE_END_SWITCH];
415 /* Echoing is turned OFF, not on, by its switch */
416 pswit[ECHO_SWITCH]=!pswit[ECHO_SWITCH];
417 if (pswit[OVERVIEW_SWITCH])
418 /* just print summary; don't echo */
419 pswit[ECHO_SWITCH]=FALSE;
421 * Web uploads - for the moment, this is really just a placeholder
422 * until we decide what processing we really want to do on web uploads
424 if (pswit[WEB_SWITCH])
426 /* specific override for web uploads */
427 pswit[ECHO_SWITCH]=TRUE;
428 pswit[SQUOTE_SWITCH]=FALSE;
429 pswit[TYPO_SWITCH]=TRUE;
430 pswit[QPARA_SWITCH]=FALSE;
431 pswit[PARANOID_SWITCH]=TRUE;
432 pswit[LINE_END_SWITCH]=FALSE;
433 pswit[OVERVIEW_SWITCH]=FALSE;
434 pswit[STDOUT_SWITCH]=FALSE;
435 pswit[HEADER_SWITCH]=TRUE;
436 pswit[VERBOSE_SWITCH]=FALSE;
437 pswit[MARKUP_SWITCH]=FALSE;
438 pswit[USERTYPO_SWITCH]=FALSE;
439 pswit[DP_SWITCH]=FALSE;
/* the option context is only needed while parsing */
446 g_option_context_free(context);
452 * Read in the user-defined stealth scanno list.
/*
 * read_user_scannos:
 *
 * Load the user typo list into the usertypo tree. The file is looked
 * for under four names in turn, moving on only while the previous
 * attempt failed with G_FILE_ERROR_NOENT:
 *   1. bookloupe.typ by bare name,
 *   2. bookloupe.typ inside running_from,
 *   3. gutcheck.typ by bare name,
 *   4. gutcheck.typ inside running_from.
 * If none exists a notice is printed with g_print(); any other read
 * error is written to stderr together with the file name.
 *
 * The contents are converted from WINDOWS-1252 to UTF-8 and split on
 * CR and LF. Each resulting line whose first byte is greater than '!'
 * becomes a key in usertypo, with GINT_TO_POINTER(1) as its value.
 */
454 void read_user_scannos(void)
457 gchar *usertypo_file;
461 gchar *contents,*utf8,**lines;
462 usertypo_file=g_strdup("bookloupe.typ");
463 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
464 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
467 g_free(usertypo_file);
468 usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
469 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
471 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
474 g_free(usertypo_file);
475 usertypo_file=g_strdup("gutcheck.typ");
476 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
478 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
481 g_free(usertypo_file);
482 usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
483 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
485 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
487 g_free(usertypo_file);
488 g_print(" --> I couldn't find bookloupe.typ "
489 "-- proceeding without user typos.\n");
494 fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
495 g_free(usertypo_file);
/* NOTE(review): no GError is requested from g_convert() (last argument
   is NULL) - confirm that a NULL result is handled before the split. */
499 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
501 lines=g_strsplit_set(utf8,"\r\n",0);
/* keys are released with g_free() when the tree drops them */
503 usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
504 for (i=0;lines[i];i++)
/* skips empty lines and lines starting with a space, a control
   character or an exclamation mark */
505 if (*(unsigned char *)lines[i]>'!')
/* NOTE(review): inserted strings now belong to the tree; strings that
   are not inserted still belong to lines - confirm both are freed. */
506 g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
515 * Read an etext returning a newly allocated string containing the file
516 * contents or NULL on error.
/*
 * read_etext:
 * @filename: path of the etext to load.
 * @err: receives the GError from g_file_get_contents() on failure.
 *
 * Reads the whole file with g_file_get_contents() and converts it from
 * WINDOWS-1252 to UTF-8 with g_convert().
 * NOTE(review): g_convert() is called without a GError here - confirm
 * how a failed conversion is reported to the caller.
 */
518 gchar *read_etext(const char *filename,GError **err)
520 gchar *contents,*utf8;
522 if (!g_file_get_contents(filename,&contents,&len,err))
524 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
/*
 * main:
 *
 * Records the directory of argv[0] in running_from, parses the command
 * line with parse_options(), and tests USERTYPO_SWITCH. A banner is
 * written to stderr. When OVERVIEW_SWITCH is set, the per-category
 * query counters and their total are printed with g_print(). Finally
 * running_from is freed and the usertypo tree is unreferenced.
 */
529 int main(int argc,char **argv)
/* kept so that read_user_scannos() can look beside the executable */
531 running_from=g_path_get_dirname(argv[0]);
532 parse_options(&argc,&argv);
533 if (pswit[USERTYPO_SWITCH])
535 fprintf(stderr,"bookloupe: Check and report on an e-text\n");
537 if (pswit[OVERVIEW_SWITCH])
/* head+foot is the number of lines that were read but not checked */
539 g_print(" Checked %ld lines of %ld (head+foot = %ld)\n\n",
540 checked_linecnt,linecnt,linecnt-checked_linecnt);
541 g_print(" --------------- Queries found --------------\n");
543 g_print(" Long lines: %14ld\n",cnt_long);
545 g_print(" Short lines: %14ld\n",cnt_short);
547 g_print(" Line-end problems: %14ld\n",cnt_lineend);
549 g_print(" Common typos: %14ld\n",cnt_word);
551 g_print(" Unmatched quotes: %14ld\n",cnt_dquot);
553 g_print(" Unmatched SingleQuotes: %14ld\n",cnt_squot);
555 g_print(" Unmatched brackets: %14ld\n",cnt_brack);
557 g_print(" Non-ASCII characters: %14ld\n",cnt_bin);
559 g_print(" Proofing characters: %14ld\n",cnt_odd);
561 g_print(" Punctuation & spacing queries: %14ld\n",cnt_punct);
563 g_print(" Non-standard dashes: %14ld\n",cnt_dash);
565 g_print(" Possible HTML tags: %14ld\n",cnt_html);
/* the total adds up the same twelve counters listed above */
567 g_print(" TOTAL QUERIES %14ld\n",
568 cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+
569 cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend);
571 g_free(running_from);
573 g_tree_unref(usertypo);
580 * Run a first pass - verify that it's a valid PG
581 * file, decide whether to report some things that
582 * occur many times in the text like long or short
583 * lines, non-standard dashes, etc.
585 struct first_pass_results *first_pass(const char *etext)
587 gunichar laststart=CHAR_SPACE;
592 unsigned int lastlen=0,lastblen=0;
593 long spline=0,nspline=0;
594 static struct first_pass_results results={0};
596 lines=g_strsplit(etext,"\n",0);
597 for (j=0;lines[j];j++)
599 lbytes=strlen(lines[j]);
600 while (lines[j][lbytes-1]=='\r')
601 lines[j][--lbytes]='\0';
602 llen=g_utf8_strlen(lines[j],lbytes);
604 if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
605 (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
608 g_print(" --> Duplicate header?\n");
609 spline=linecnt+1; /* first line of non-header text, that is */
611 if (!strncmp(lines[j],"*** START",9) &&
612 strstr(lines[j],"PROJECT GUTENBERG"))
615 g_print(" --> Duplicate header?\n");
616 nspline=linecnt+1; /* first line of non-header text, that is */
618 if (spline || nspline)
620 lc_line=g_utf8_strdown(lines[j],lbytes);
621 if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
623 if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
625 if (results.footerline)
627 /* it's an old-form header - we can detect duplicates */
629 g_print(" --> Duplicate footer?\n");
632 results.footerline=linecnt;
638 results.firstline=spline;
640 results.firstline=nspline; /* override with new */
641 if (results.footerline)
642 continue; /* don't count the boilerplate in the footer */
643 results.totlen+=llen;
644 for (s=lines[j];*s;s=g_utf8_next_char(s))
646 if (g_utf8_get_char(s)>127)
648 if (g_unichar_isalpha(g_utf8_get_char(s)))
650 if (s>lines[j] && g_utf8_get_char(s)==CHAR_DQUOTE &&
651 isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
652 results.endquote_count++;
654 if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
655 lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
658 g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
660 if (strstr(lines[j],".,"))
662 /* only count ast lines for ignoring purposes where there is */
663 /* locase text on the line */
664 if (strchr(lines[j],'*'))
666 for (s=lines[j];*s;s=g_utf8_next_char(s))
667 if (g_unichar_islower(g_utf8_get_char(s)))
672 if (strchr(lines[j],'/'))
673 results.fslashline++;
674 for (s=g_utf8_prev_char(lines[j]+lbytes);
675 s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
677 if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
678 g_utf8_get_char(g_utf8_prev_char(s))!='-')
680 if (llen>LONGEST_PG_LINE)
682 if (llen>WAY_TOO_LONG)
683 results.verylongline++;
684 if (strchr(lines[j],'<') && strchr(lines[j],'>'))
686 i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
689 if (strstr(lines[j],"<i>"))
690 results.htmcount+=4; /* bonus marks! */
692 /* Check for spaced em-dashes */
693 if (lines[j][0] && (s=strstr(g_utf8_next_char(lines[j]),"--")))
696 if (s[-1]==CHAR_SPACE || s[2]==CHAR_SPACE)
697 results.space_emdash++;
698 if (s[-1]==CHAR_SPACE && s[2]==CHAR_SPACE)
699 /* count of em-dashes with spaces both sides */
700 results.non_PG_space_emdash++;
701 if (s[-1]!=CHAR_SPACE && s[2]!=CHAR_SPACE)
702 /* count of PG-type em-dashes with no spaces */
703 results.PG_space_emdash++;
708 if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))
709 results.Dutchcount++;
710 if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))
711 results.Frenchcount++;
712 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
713 results.standalone_digit++;
716 /* Check for spaced dashes */
717 if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
721 laststart=lines[j][0];
730 * Make some snap decisions based on the first pass results.
/*
 * report_first_pass:
 * @results: totals produced by first_pass(); firstline may be reset to
 *           0 here when header and footer are less than 100 lines apart.
 *
 * Turns the first-pass totals into reporting decisions stored in a
 * static struct warnings, printing a " --> " notice with g_print() for
 * each class of query found too often to be worth reporting. It also
 * switches MARKUP_SWITCH on when the text looks like HTML, detects
 * Dutch and French texts from their marker-word counts, and reports
 * whether the PG header and footer are present. VERBOSE_SWITCH turns
 * reporting back on.
 */
732 struct warnings *report_first_pass(struct first_pass_results *results)
734 static struct warnings warnings={0};
736 g_print(" --> %ld lines in this file have white space at end\n",
739 if (results->dotcomma>5)
742 g_print(" --> %ld lines in this file contain '.,'. "
743 "Not reporting them.\n",results->dotcomma);
746 * If more than 50 lines, or one-tenth, are short,
747 * don't bother reporting them.
749 warnings.shortline=1;
750 if (results->shortline>50 || results->shortline*10>linecnt)
752 warnings.shortline=0;
753 g_print(" --> %ld lines in this file are short. "
754 "Not reporting short lines.\n",results->shortline);
757 * If more than 50 lines, or one-tenth, are long,
758 * don't bother reporting them.
761 if (results->longline>50 || results->longline*10>linecnt)
764 g_print(" --> %ld lines in this file are long. "
765 "Not reporting long lines.\n",results->longline);
767 /* If more than 10 lines contain asterisks, don't bother reporting them. */
769 if (results->astline>10)
772 g_print(" --> %ld lines in this file contain asterisks. "
773 "Not reporting them.\n",results->astline);
776 * If more than 10 lines contain forward slashes,
777 * don't bother reporting them.
780 if (results->fslashline>10)
783 g_print(" --> %ld lines in this file contain forward slashes. "
784 "Not reporting them.\n",results->fslashline);
787 * If more than 20 lines contain unpunctuated endquotes,
788 * don't bother reporting them.
791 if (results->endquote_count>20)
794 g_print(" --> %ld lines in this file contain unpunctuated endquotes. "
795 "Not reporting them.\n",results->endquote_count);
798 * If more than 15 lines contain standalone digits,
799 * don't bother reporting them.
/* NOTE(review): the test below uses a threshold of 10, not the 15
   stated in the description above - one of the two needs correcting. */
802 if (results->standalone_digit>10)
805 g_print(" --> %ld lines in this file contain standalone 0s and 1s. "
806 "Not reporting them.\n",results->standalone_digit);
809 * If more than 20 lines contain hyphens at end,
810 * don't bother reporting them.
813 if (results->hyphens>20)
816 g_print(" --> %ld lines in this file have hyphens at end. "
817 "Not reporting them.\n",results->hyphens);
/* htmcount is a score built up by first_pass() from lines with tags */
819 if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
821 g_print(" --> Looks like this is HTML. Switching HTML mode ON.\n");
822 pswit[MARKUP_SWITCH]=1;
824 if (results->verylongline>0)
825 g_print(" --> %ld lines in this file are VERY long!\n",
826 results->verylongline);
828 * If there are more non-PG spaced dashes than PG em-dashes,
829 * assume it's deliberate.
830 * Current PG guidelines say don't use them, but older texts do,
831 * and some people insist on them whatever the guidelines say.
834 if (results->spacedash+results->non_PG_space_emdash>
835 results->PG_space_emdash)
838 g_print(" --> There are %ld spaced dashes and em-dashes. "
839 "Not reporting them.\n",
840 results->spacedash+results->non_PG_space_emdash);
842 /* If more than a quarter of characters are hi-bit, bug out. */
844 if (results->binlen*4>results->totlen)
846 g_print(" --> This file does not appear to be ASCII. "
847 "Terminating. Best of luck with it!\n");
/* fewer than a quarter of the characters are letters */
850 if (results->alphalen*4<results->totlen)
852 g_print(" --> This file does not appear to be text. "
853 "Terminating. Best of luck with it!\n");
/* more than one percent, or more than 100, characters above 127 */
856 if (results->binlen*100>results->totlen || results->binlen>100)
858 g_print(" --> There are a lot of foreign letters here. "
859 "Not reporting them.\n");
862 warnings.isDutch=FALSE;
863 if (results->Dutchcount>50)
865 warnings.isDutch=TRUE;
866 g_print(" --> This looks like Dutch - "
867 "switching off dashes and warnings for 's Middags case.\n");
869 warnings.isFrench=FALSE;
870 if (results->Frenchcount>50)
872 warnings.isFrench=TRUE;
873 g_print(" --> This looks like French - "
874 "switching off some doublepunct.\n");
876 if (results->firstline && results->footerline)
877 g_print(" The PG header and footer appear to be already on.\n");
880 if (results->firstline)
881 g_print(" The PG header is on - no footer.\n");
882 if (results->footerline)
883 g_print(" The PG footer is on - no header.\n");
886 if (pswit[VERBOSE_SWITCH])
889 warnings.shortline=1;
898 g_print(" *** Verbose output is ON -- you asked for it! ***\n");
900 if (warnings.isDutch)
/* header and footer both found, in order, but less than 100 lines
   apart: do not trust them as the boundaries of the text */
902 if (results->footerline>0 && results->firstline>0 &&
903 results->footerline>results->firstline &&
904 results->footerline-results->firstline<100)
906 g_print(" --> I don't really know where this text starts. \n");
907 g_print(" There are no reference points.\n");
908 g_print(" I'm going to have to report the header and footer "
910 results->firstline=0;
918 * Look along the line, accumulate the count of quotes, and see
919 * if this is an empty line - i.e. a line with nothing on it
921 * If line has just spaces, period, * and/or - on it, don't
922 * count it, since empty lines with asterisks or dashes to
923 * separate sections are common.
925 * Returns: TRUE if the line is empty.
/*
 * analyse_quotes:
 * @aline: the line to scan, UTF-8.
 * @counters: running totals updated in place: open_single_quote,
 *            close_single_quote, c_unders, and the bracket balances
 *            c_brack, r_brack and s_brack (incremented on an opening
 *            bracket, decremented on a closing one).
 *
 * Walks the line one character at a time. An apostrophe or backtick
 * is classified as an opening quote, a closing quote, or an apostrophe
 * inside a word; the closing-quote decision accumulates evidence in
 * guessquote. The words 'tis and 'Tis are hard-coded as not opening a
 * quote.
 *
 * Returns: TRUE if the line is empty in the sense described above.
 */
927 gboolean analyse_quotes(const char *aline,struct counters *counters)
930 /* assume the line is empty until proven otherwise */
931 gboolean isemptyline=TRUE;
932 const char *s=aline,*sprev,*snext;
937 snext=g_utf8_next_char(s);
938 c=g_utf8_get_char(s);
941 if (c==CHAR_SQUOTE || c==CHAR_OPEN_SQUOTE)
946 * At start of line, it can only be an openquote.
947 * Hardcode a very common exception!
949 if (!g_str_has_prefix(snext,"tis") &&
950 !g_str_has_prefix(snext,"Tis"))
951 counters->open_single_quote++;
953 else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
954 g_unichar_isalpha(g_utf8_get_char(snext)))
955 /* Do nothing! it's definitely an apostrophe, not a quote */
957 /* it's outside a word - let's check it out */
958 else if (c==CHAR_OPEN_SQUOTE ||
959 g_unichar_isalpha(g_utf8_get_char(snext)))
961 /* it damwell better BE an openquote */
962 if (!g_str_has_prefix(snext,"tis") &&
963 !g_str_has_prefix(snext,"Tis"))
964 /* hardcode a very common exception! */
965 counters->open_single_quote++;
969 /* now - is it a closequote? */
970 guessquote=0; /* accumulate clues */
971 if (g_unichar_isalpha(g_utf8_get_char(sprev)))
973 /* it follows a letter - could be either */
975 if (g_utf8_get_char(sprev)=='s')
977 /* looks like a plural apostrophe */
979 if (g_utf8_get_char(snext)==CHAR_SPACE)
984 /* it doesn't have a letter either side */
/* NOTE(review): strchr() converts its second argument to char, so a
   code point above 255 is truncated before the comparison, and a NUL
   (end of line) matches the string terminator - confirm both effects
   are intended. */
985 else if (strchr(".?!,;:",g_utf8_get_char(sprev)) &&
986 strchr(".?!,;: ",g_utf8_get_char(snext)))
987 guessquote+=8; /* looks like a closequote */
990 if (counters->open_single_quote>counters->close_single_quote)
992 * Give it the benefit of some doubt,
993 * if a squote is already open.
999 counters->close_single_quote++;
1002 if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
1004 isemptyline=FALSE; /* ignore lines like * * * as spacers */
/* underscores are counted; brackets are kept as running balances */
1005 if (c==CHAR_UNDERSCORE)
1006 counters->c_unders++;
1007 if (c==CHAR_OPEN_CBRACK)
1008 counters->c_brack++;
1009 if (c==CHAR_CLOSE_CBRACK)
1010 counters->c_brack--;
1011 if (c==CHAR_OPEN_RBRACK)
1012 counters->r_brack++;
1013 if (c==CHAR_CLOSE_RBRACK)
1014 counters->r_brack--;
1015 if (c==CHAR_OPEN_SBRACK)
1016 counters->s_brack++;
1017 if (c==CHAR_CLOSE_SBRACK)
1018 counters->s_brack--;
1026 * check_for_control_characters:
1028 * Check for non-printable control characters in the line.
1029 * Any character below space other than LF, CR and tab is reported
1030 * here. Characters above 127 and tabs are queried separately by
1031 * check_for_odd_characters().
/*
 * @aline: the line to scan, UTF-8.
 *
 * Every character below CHAR_SPACE other than LF, CR and tab is
 * queried: the line is echoed when ECHO_SWITCH is set and, unless
 * OVERVIEW_SWITCH is set, a message giving linecnt, the 1-based
 * character column and the code point is printed.
 */
1033 void check_for_control_characters(const char *aline)
1037 for (s=aline;*s;s=g_utf8_next_char(s))
1039 c=g_utf8_get_char(s);
1040 if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
1042 if (pswit[ECHO_SWITCH])
1043 g_print("\n%s\n",aline);
1044 if (!pswit[OVERVIEW_SWITCH])
1045 g_print(" Line %ld column %ld - Control character %u\n",
/* g_utf8_pointer_to_offset() takes the start of the string first and
   the position second; the reverse order yields a non-positive column */
1046 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1054 * check_for_odd_characters:
1056 * Check for binary and other odd characters.
1058 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
1059 gboolean isemptyline)
/*
 * Scan aline one UTF-8 character at a time and query characters that
 * rarely belong in a plain text: non-ASCII/control characters, tabs,
 * tildes, carats, forward slashes and asterisks.
 *
 * aline       - the line to examine.
 * warnings    - warnings->fslash gates the forward slash query and
 *               warnings->ast gates the asterisk query.
 * isemptyline - when TRUE the asterisk query is skipped.
 *
 * Each query is guarded by its own e* flag so that it is raised at most
 * once per line (NOTE(review): the statements that set the flags are
 * not visible in this view - confirm).
 */
1061 /* Don't repeat multiple warnings on one line. */
1062 gboolean eNon_A=FALSE,eTab=FALSE,eTilde=FALSE;
1063 gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
1066 for (s=aline;*s;s=g_utf8_next_char(s))
1068 c=g_utf8_get_char(s);
/* Control characters other than tab and newline, or anything above 127. */
1069 if (!eNon_A && (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
1071 if (pswit[ECHO_SWITCH])
1072 g_print("\n%s\n",aline);
1073 if (!pswit[OVERVIEW_SWITCH])
/* Code points 128-159 or above 255 get the "Non-ISO-8859" wording. */
1074 if (c>127 && c<160 || c>255)
1075 g_print(" Line %ld column %ld - "
1076 "Non-ISO-8859 character %u\n",
1077 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1079 g_print(" Line %ld column %ld - "
1080 "Non-ASCII character %u\n",
1081 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1086 if (!eTab && c==CHAR_TAB)
1088 if (pswit[ECHO_SWITCH])
1089 g_print("\n%s\n",aline);
1090 if (!pswit[OVERVIEW_SWITCH])
1091 g_print(" Line %ld column %ld - Tab character?\n",
1092 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1097 if (!eTilde && c==CHAR_TILDE)
1100 * Often used by OCR software to indicate an
1101 * unrecognizable character.
1103 if (pswit[ECHO_SWITCH])
1104 g_print("\n%s\n",aline);
1105 if (!pswit[OVERVIEW_SWITCH])
1106 g_print(" Line %ld column %ld - Tilde character?\n",
1107 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1112 if (!eCarat && c==CHAR_CARAT)
1114 if (pswit[ECHO_SWITCH])
1115 g_print("\n%s\n",aline);
1116 if (!pswit[OVERVIEW_SWITCH])
1117 g_print(" Line %ld column %ld - Carat character?\n",
1118 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* The forward slash query is only raised while warnings->fslash is set. */
1123 if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
1125 if (pswit[ECHO_SWITCH])
1126 g_print("\n%s\n",aline);
1127 if (!pswit[OVERVIEW_SWITCH])
1128 g_print(" Line %ld column %ld - Forward slash?\n",
1129 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1135 * Report asterisks only in paranoid mode,
1136 * since they're often deliberate.
1138 if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
1141 if (pswit[ECHO_SWITCH])
1142 g_print("\n%s\n",aline);
1143 if (!pswit[OVERVIEW_SWITCH])
1144 g_print(" Line %ld column %ld - Asterisk?\n",
1145 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1154 * check_for_long_line:
1156 * Check for line too long.
1158 void check_for_long_line(const char *aline)
/*
 * Query aline when its length in characters (not bytes) exceeds
 * LONGEST_PG_LINE.  The character count is printed twice: once as the
 * column and once as the reported length.
 */
1160 if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
1162 if (pswit[ECHO_SWITCH])
1163 g_print("\n%s\n",aline);
1164 if (!pswit[OVERVIEW_SWITCH])
1165 g_print(" Line %ld column %ld - Long line %ld\n",
1166 linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
1173 * check_for_short_line:
1175 * Check for line too short.
1177 * This one is a bit trickier to implement: we don't want to
1178 * flag the last line of a paragraph for being short, so we
1179 * have to wait until we know that our current line is a
1180 * "normal" line, then report the _previous_ line if it was too
1181 * short. We also don't want to report indented lines like
1182 * chapter heads or formatted quotations. We therefore keep
1183 * last->len as the length of the last line examined, and
1184 * last->blen as the length of the last but one, and try to
1185 * suppress unnecessary warnings by checking that both were of
1186 * "normal" length. We keep the first character of the last
1187 * line in last->start, and if it was a space, we assume that
1188 * the formatting is deliberate. I can't figure out a way to
1189 * distinguish something like a quoted verse left-aligned or
1190 * the header or footer of a letter from a paragraph of short
1191 * lines - maybe if I examined the whole paragraph, and if the
1192 * para has less than, say, 8 lines and if all lines are short,
1193 * then just assume it's OK? Need to look at some texts to see
1194 * how often a formula like this would get the right result.
1196 void check_for_short_line(const char *aline,const struct line_properties *last)
/*
 * Query the previous line (prevline, numbered linecnt-1) for being
 * short.  The query fires only when all of the following hold:
 *   - the current line has more than one character,
 *   - last->len is above 1 and below SHORTEST_PG_LINE,
 *   - last->blen is above both 1 and SHORTEST_PG_LINE,
 *   - last->start is not a space.
 * The length of prevline in characters is printed both as the column
 * and as the reported length.
 */
1198 if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
1199 last->len<SHORTEST_PG_LINE && last->blen>1 &&
1200 last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
1202 if (pswit[ECHO_SWITCH])
1203 g_print("\n%s\n",prevline);
1204 if (!pswit[OVERVIEW_SWITCH])
1205 g_print(" Line %ld column %ld - Short line %ld?\n",
1206 linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
1213 * check_for_starting_punctuation:
1215 * Look for punctuation other than full ellipses at start of line.
1217 void check_for_starting_punctuation(const char *aline)
/*
 * Query a non-empty line whose first character is one of ".?!,;:",
 * unless the line begins with the spaced ellipsis ". . .".
 * The column is always reported as 1.
 */
1219 if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
1220 !g_str_has_prefix(aline,". . ."))
1222 if (pswit[ECHO_SWITCH])
1223 g_print("\n%s\n",aline);
1224 if (!pswit[OVERVIEW_SWITCH])
1225 g_print(" Line %ld column 1 - Begins with punctuation?\n",
1233 * check_for_spaced_emdash:
1235 * Check for spaced em-dashes.
1237 * We must check _all_ occurrences of "--" on the line
1238 * hence the loop - even if the first double-dash is OK
1239 * there may be another that's wrong later on.
1241 void check_for_spaced_emdash(const char *aline)
/*
 * Visit every "--" in aline and query it when it has a space
 * immediately before it or immediately after it.
 * s is where the next search starts, t is the current "--" and next
 * is the character following it.
 */
1243 const char *s,*t,*next;
1244 for (s=aline;t=strstr(s,"--");s=next)
1246 next=g_utf8_next_char(g_utf8_next_char(t));
/* t>aline keeps the look-behind from stepping in front of the line. */
1247 if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
1248 g_utf8_get_char(next)==CHAR_SPACE)
1250 if (pswit[ECHO_SWITCH])
1251 g_print("\n%s\n",aline);
1252 if (!pswit[OVERVIEW_SWITCH])
1253 g_print(" Line %ld column %ld - Spaced em-dash?\n",
1254 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1262 * check_for_spaced_dash:
1264 * Check for spaced dashes.
1266 void check_for_spaced_dash(const char *aline)
/*
 * Query a single dash with a space on one side.  Only the first " -"
 * in the line is examined; "- " is looked for only when the line
 * contains no " -" at all.  A dash that is part of "--" is left alone
 * (spaced em-dashes are a separate query).
 */
1269 if ((s=strstr(aline," -")))
/* Skip " --": the character after the dash must not be another dash. */
1271 if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
1273 if (pswit[ECHO_SWITCH])
1274 g_print("\n%s\n",aline);
1275 if (!pswit[OVERVIEW_SWITCH])
1276 g_print(" Line %ld column %ld - Spaced dash?\n",
1277 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1282 else if ((s=strstr(aline,"- ")))
/* Skip "-- ": the dash must be at line start or not follow a dash. */
1284 if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
1286 if (pswit[ECHO_SWITCH])
1287 g_print("\n%s\n",aline);
1288 if (!pswit[OVERVIEW_SWITCH])
1289 g_print(" Line %ld column %ld - Spaced dash?\n",
1290 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1298 * check_for_unmarked_paragraphs:
1300 * Check for unmarked paragraphs indicated by separate speakers.
1302 * May well be false positive:
1303 * "Bravo!" "Wonderful!" called the crowd.
1304 * but useful all the same.
1306 void check_for_unmarked_paragraphs(const char *aline)
/*
 * Look for a closing double quote followed by whitespace and an opening
 * double quote on the same line, and query a missing paragraph break
 * at that position.
 * NOTE(review): two searches are made; their literals look identical
 * in this view and the statement between them is not visible - confirm
 * what distinguishes them before changing either.
 */
1309 s=strstr(aline,"\" \"");
1311 s=strstr(aline,"\" \"");
1314 if (pswit[ECHO_SWITCH])
1315 g_print("\n%s\n",aline);
1316 if (!pswit[OVERVIEW_SWITCH])
1317 g_print(" Line %ld column %ld - "
1318 "Query missing paragraph break?\n",
1319 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1326 * check_for_jeebies:
1328 * Check for "to he" and other easy h/b errors.
1330 * This is a very inadequate effort on the h/b problem,
1331 * but the phrase "to he" is always an error, whereas "to
1332 * be" is quite common.
1333 * Similarly, '"Quiet!", be said.' is a non-be error
1334 * "to he" is _not_ always an error!:
1335 * "Where they went to he couldn't say."
1336 * Another false positive:
1337 * What would "Cinderella" be without the . . .
1338 * and another: "If he wants to he can see for himself."
1340 void check_for_jeebies(const char *aline)
/*
 * Search aline for fixed phrases that usually indicate an h/b mix-up
 * and raise one of three queries, each reporting the column where the
 * matched phrase starts:
 *   - "Query he/be error?"   for the be/he phrases,
 *   - "Query had/bad error?" for the had/bad phrases,
 *   - "Query hut/but error?" for "; hut " and ", hut ".
 * All matching is case-sensitive substring search via strstr().
 * NOTE(review): the tests between successive strstr() calls are not
 * visible in this view - presumably each later search runs only when
 * the earlier ones found nothing; confirm.
 */
1343 s=strstr(aline," be could ");
1345 s=strstr(aline," be would ");
1347 s=strstr(aline," was be ");
1349 s=strstr(aline," be is ");
1351 s=strstr(aline," is be ");
1353 s=strstr(aline,"\", be ");
1355 s=strstr(aline,"\" be ");
1357 s=strstr(aline,"\" be ");
1359 s=strstr(aline," to he ");
1362 if (pswit[ECHO_SWITCH])
1363 g_print("\n%s\n",aline);
1364 if (!pswit[OVERVIEW_SWITCH])
1365 g_print(" Line %ld column %ld - Query he/be error?\n",
1366 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Second group: "had" misread as "bad" and vice versa. */
1370 s=strstr(aline," the had ");
1372 s=strstr(aline," a had ");
1374 s=strstr(aline," they bad ");
1376 s=strstr(aline," she bad ");
1378 s=strstr(aline," he bad ");
1380 s=strstr(aline," you bad ");
1382 s=strstr(aline," i bad ");
1385 if (pswit[ECHO_SWITCH])
1386 g_print("\n%s\n",aline);
1387 if (!pswit[OVERVIEW_SWITCH])
1388 g_print(" Line %ld column %ld - Query had/bad error?\n",
1389 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Third group: "but" misread as "hut" after a semicolon or comma. */
1393 s=strstr(aline,"; hut ");
1395 s=strstr(aline,", hut ");
1398 if (pswit[ECHO_SWITCH])
1399 g_print("\n%s\n",aline);
1400 if (!pswit[OVERVIEW_SWITCH])
1401 g_print(" Line %ld column %ld - Query hut/but error?\n",
1402 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1409 * check_for_mta_from:
1411 * Special case - angled bracket in front of "From" placed there by an
1412 * MTA when sending an e-mail.
1414 void check_for_mta_from(const char *aline)
/*
 * Search aline for the literal ">From" and query it, reporting the
 * 1-based column of the '>'.  Only the first occurrence is located.
 */
1417 s=strstr(aline,">From");
1420 if (pswit[ECHO_SWITCH])
1421 g_print("\n%s\n",aline);
1422 if (!pswit[OVERVIEW_SWITCH])
1423 g_print(" Line %ld column %ld - "
1424 "Query angled bracket with From\n",
1425 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1432 * check_for_orphan_character:
1434 * Check for a single character line -
1435 * often an overflow from bad wrapping.
1437 void check_for_orphan_character(const char *aline)
/*
 * Query a line that consists of exactly one character, except when
 * that character is 'I', 'V', 'X', 'L' or a digit.
 */
1440 c=g_utf8_get_char(aline);
/* True when the line is non-empty and nothing follows its first character. */
1441 if (c && !*g_utf8_next_char(aline))
1443 if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
1444 ; /* Nothing - ignore numerals alone on a line. */
1447 if (pswit[ECHO_SWITCH])
1448 g_print("\n%s\n",aline);
1449 if (!pswit[OVERVIEW_SWITCH])
1450 g_print(" Line %ld column 1 - Query single character line\n",
1459 * check_for_pling_scanno:
1461 * Check for I" - often should be !
1463 void check_for_pling_scanno(const char *aline)
/*
 * Search aline for a space, a capital I and a double quote in a row
 * and query whether the I should have been an exclamation mark.
 * NOTE(review): the column is the 0-based character offset of the
 * space, without the +1 that most sibling checks add - confirm this is
 * intended before aligning it with them.
 */
1466 s=strstr(aline," I\"");
1469 if (pswit[ECHO_SWITCH])
1470 g_print("\n%s\n",aline);
1471 if (!pswit[OVERVIEW_SWITCH])
1472 g_print(" Line %ld column %ld - Query I=exclamation mark?\n",
1473 linecnt,g_utf8_pointer_to_offset(aline,s));
1480 * check_for_extra_period:
1482 * Check for period without a capital letter. Cut-down from gutspell.
1483 * Only works when it happens on a single line.
1485 void check_for_extra_period(const char *aline,const struct warnings *warnings)
/*
 * Query a period that is followed by a word starting in lower case.
 * The check only runs when pswit[PARANOID_SWITCH] is set.
 *
 * aline    - the line to examine.
 * warnings - warnings->isDutch enables an exception for a quoted
 *            lower-case letter followed by a capitalised word.
 *
 * For every ". " in the line, the next letter or digit is located; if
 * it is a lower-case letter, the word in front of the period is
 * extracted into testword and compared against abbrev[], and tested
 * for starting with a digit, being a single character, being a Roman
 * numeral and containing a vowel.  A query is recorded in the qperiod
 * tree, so a given word is reported again only when
 * pswit[VERBOSE_SWITCH] is set.
 * NOTE(review): the statements that act on each of those tests are not
 * visible in this view - confirm which outcomes suppress the query.
 */
1487 const char *s,*t,*s1;
1492 gunichar *decomposition;
1493 if (pswit[PARANOID_SWITCH])
1495 for (t=aline;t=strstr(t,". ");)
1499 t=g_utf8_next_char(t);
1500 /* start of line punctuation is handled elsewhere */
1503 if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
1505 t=g_utf8_next_char(t);
1508 if (warnings->isDutch)
1510 /* For Frank & Jeroen -- 's Middags case */
1511 gunichar c2,c3,c4,c5;
1512 c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
1513 c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
1514 c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
1515 c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
1516 if (c2==CHAR_SQUOTE && g_unichar_islower(c3) &&
1517 c4==CHAR_SPACE && g_unichar_isupper(c5))
1519 t=g_utf8_next_char(t);
1523 s1=g_utf8_next_char(g_utf8_next_char(t));
/*
 * Skip forward to the next letter or digit.  The value tested is a
 * full Unicode code point, so the GLib classifier is used: <ctype.h>
 * isdigit() is only defined for unsigned char values and EOF.
 */
1524 while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
1525 !g_unichar_isdigit(g_utf8_get_char(s1)))
1526 s1=g_utf8_next_char(s1);
1527 if (g_unichar_islower(g_utf8_get_char(s1)))
1529 /* we have something to investigate */
1531 /* so let's go back and find out */
1532 for (s1=g_utf8_prev_char(t);s1>=aline &&
1533 (g_unichar_isalpha(g_utf8_get_char(s1)) ||
1534 g_unichar_isdigit(g_utf8_get_char(s1)) ||
1535 g_utf8_get_char(s1)==CHAR_SQUOTE &&
1536 g_unichar_isalpha(g_utf8_get_char(g_utf8_next_char(s1))) &&
1537 g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s1))));
1538 s1=g_utf8_prev_char(s1))
1540 s1=g_utf8_next_char(s1);
1543 testword=g_strndup(s1,s-s1);
1545 testword=g_strdup(s1);
1546 for (i=0;*abbrev[i];i++)
1547 if (!strcmp(testword,abbrev[i]))
1549 if (g_unichar_isdigit(g_utf8_get_char(testword)))
1551 if (!*g_utf8_next_char(testword))
1553 if (isroman(testword))
1558 for (s=testword;*s;s=g_utf8_next_char(s))
1560 decomposition=g_unicode_canonical_decomposition(
1561 g_utf8_get_char(s),&len);
1562 if (g_utf8_strchr("aeiou",-1,decomposition[0]))
1564 g_free(decomposition);
1568 (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
1570 g_tree_insert(qperiod,g_strdup(testword),
1571 GINT_TO_POINTER(1));
1572 if (pswit[ECHO_SWITCH])
1573 g_print("\n%s\n",aline);
1574 if (!pswit[OVERVIEW_SWITCH])
1575 g_print(" Line %ld column %ld - Extra period?\n",
1576 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1582 t=g_utf8_next_char(t);
1588 * check_for_following_punctuation:
1590 * Check for words usually not followed by punctuation.
1592 void check_for_following_punctuation(const char *aline)
/*
 * Query words that are not normally followed by punctuation.
 * The check only runs when pswit[TYPO_SWITCH] is set.
 *
 * Each word is lower-cased into inword and then:
 *   - if it appears in nocomma[] and the character at s is ',', ';' or
 *     ':', a "Query punctuation after" message is printed;
 *   - if it appears in noperiod[] and the character at s is '.' or
 *     '!', the same message is printed.
 * NOTE(review): how s, t and wordstart are advanced is not visible in
 * this view - presumably s points just past the current word; confirm.
 */
1595 const char *s,*wordstart;
1598 if (pswit[TYPO_SWITCH])
1609 inword=g_utf8_strdown(t,-1);
1611 for (i=0;*nocomma[i];i++)
1612 if (!strcmp(inword,nocomma[i]))
1614 c=g_utf8_get_char(s);
1615 if (c==',' || c==';' || c==':')
1617 if (pswit[ECHO_SWITCH])
1618 g_print("\n%s\n",aline);
1619 if (!pswit[OVERVIEW_SWITCH])
1620 g_print(" Line %ld column %ld - "
1621 "Query punctuation after %s?\n",
1622 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
1628 for (i=0;*noperiod[i];i++)
1629 if (!strcmp(inword,noperiod[i]))
1631 c=g_utf8_get_char(s);
1632 if (c=='.' || c=='!')
1634 if (pswit[ECHO_SWITCH])
1635 g_print("\n%s\n",aline);
1636 if (!pswit[OVERVIEW_SWITCH])
1637 g_print(" Line %ld column %ld - "
1638 "Query punctuation after %s?\n",
1639 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
1653 * Check for commonly mistyped words,
1654 * and digits like 0 for O in a word.
1656 void check_for_typos(const char *aline,struct warnings *warnings)
/*
 * Examine aline word by word (words come from getaword()) and query
 * likely typos and scannos.
 *
 * aline    - the line to examine.
 * warnings - warnings->digit gates the standalone 0/1 query made in
 *            paranoid mode.
 *
 * Tests applied to each word, in the order they appear below:
 *   - mixdigit(): "Query digit in <word>".
 *   - With TYPO_SWITCH or USERTYPO_SWITCH: an upper-case or title-case
 *     letter after a lower-case one, with allowances for "Mc"/"Mac"
 *     prefixes and for a letter that follows an apostrophe.
 *   - With TYPO_SWITCH, on the case-folded testword: prefixes from
 *     nostart[], suffixes from noend[], a set of unlikely letter
 *     groups, and words with no vowel or no consonant; then the
 *     okword[] list, isroman(), the typo[] list and the single letters
 *     in "slmijdn".  Reported words are tracked in the qword tree.
 *   - The user's own list in the usertypo tree:
 *     "Query possible scanno <word>".
 *   - With PARANOID_SWITCH and warnings->digit: a word that is exactly
 *     "0" or "1".
 * NOTE(review): the statements that set istypo/isdup after each test
 * are not visible in this view - confirm their exact effect.
 */
1658 const char *s,*t,*nt,*wordstart;
1660 gunichar *decomposition;
1662 int i,vowel,consonant,*dupcnt;
1663 gboolean isdup,istypo,alower;
1666 gsize decomposition_len;
1670 inword=getaword(&s);
1674 continue; /* don't bother with empty lines */
1676 if (mixdigit(inword))
1678 if (pswit[ECHO_SWITCH])
1679 g_print("\n%s\n",aline);
1680 if (!pswit[OVERVIEW_SWITCH])
1681 g_print(" Line %ld column %ld - Query digit in %s\n",
1682 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
1687 * Put the word through a series of tests for likely typos and OCR
1690 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
1694 for (t=inword;*t;t=g_utf8_next_char(t))
1696 c=g_utf8_get_char(t);
1697 nt=g_utf8_next_char(t);
1698 /* lowercase for testing */
1699 if (g_unichar_islower(c))
1701 if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
1704 * We have an uppercase mid-word. However, there are
1706 * Mac and Mc like McGill
1707 * French contractions like l'Abbe
1709 offset=g_utf8_pointer_to_offset(inword,t);
1710 if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
1711 offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
1712 g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
1714 g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SQUOTE)
1720 testword=g_utf8_casefold(inword,-1);
1722 if (pswit[TYPO_SWITCH])
1725 * Check for certain unlikely two-letter combinations at word
1728 len=g_utf8_strlen(testword,-1);
1731 for (i=0;*nostart[i];i++)
1732 if (g_str_has_prefix(testword,nostart[i]))
1734 for (i=0;*noend[i];i++)
1735 if (g_str_has_suffix(testword,noend[i]))
1738 /* ght is common, gbt never. Like that. */
1739 if (strstr(testword,"cb"))
1741 if (strstr(testword,"gbt"))
1743 if (strstr(testword,"pbt"))
1745 if (strstr(testword,"tbs"))
1747 if (strstr(testword,"mrn"))
1749 if (strstr(testword,"ahle"))
1751 if (strstr(testword,"ihle"))
1754 * "TBE" does happen - like HEARTBEAT - but uncommon.
1755 * Also "TBI" - frostbite, outbid - but uncommon.
1756 * Similarly "ii" like Hawaii, or Pompeii, and in Roman
1757 * numerals, but "ii" is a common scanno.
1759 if (strstr(testword,"tbi"))
1761 if (strstr(testword,"tbe"))
1763 if (strstr(testword,"ii"))
1766 * Check for no vowels or no consonants.
1767 * If none, flag a typo.
1769 if (!istypo && len>1)
1772 for (t=testword;*t;t=g_utf8_next_char(t))
1774 c=g_utf8_get_char(t);
1776 g_unicode_canonical_decomposition(c,&decomposition_len);
1777 if (c=='y' || g_unichar_isdigit(c))
1779 /* Yah, this is loose. */
1783 else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
1787 g_free(decomposition);
1789 if (!vowel || !consonant)
1793 * Now exclude the word from being reported if it's in
1796 for (i=0;*okword[i];i++)
1797 if (!strcmp(testword,okword[i]))
1800 * What looks like a typo may be a Roman numeral.
1803 if (istypo && isroman(testword))
1805 /* Check the manual list of typos. */
1807 for (i=0;*typo[i];i++)
1808 if (!strcmp(testword,typo[i]))
1811 * Check lowercase s, l, i and m - special cases.
1812 * "j" - often a semi-colon gone wrong.
1813 * "d" for a missing apostrophe - he d
1816 if (!istypo && len==1 &&
1817 g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
/* qword maps each reported word to a heap-allocated int counter. */
1821 dupcnt=g_tree_lookup(qword,testword);
1825 isdup=!pswit[VERBOSE_SWITCH];
1829 dupcnt=g_new0(int,1);
1830 g_tree_insert(qword,g_strdup(testword),dupcnt);
1835 if (pswit[ECHO_SWITCH])
1836 g_print("\n%s\n",aline);
1837 if (!pswit[OVERVIEW_SWITCH])
1839 g_print(" Line %ld column %ld - Query word %s",
1840 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
1842 if (!pswit[VERBOSE_SWITCH])
1843 g_print(" - not reporting duplicates");
1851 /* check the user's list of typos */
1852 if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
1854 if (pswit[ECHO_SWITCH])
1855 g_print("\n%s\n",aline);
1856 if (!pswit[OVERVIEW_SWITCH])
/*
 * NOTE(review): this query and the standalone 0/1 query below add 2 to
 * the word offset, whereas the queries above add 1 - confirm whether
 * the difference is intended.
 */
1857 g_print(" Line %ld column %ld - Query possible scanno %s\n",
1858 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
1860 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
1862 if (pswit[PARANOID_SWITCH] && warnings->digit)
1864 /* In paranoid mode, query all 0 and 1 standing alone. */
1865 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
1867 if (pswit[ECHO_SWITCH])
1868 g_print("\n%s\n",aline);
1869 if (!pswit[OVERVIEW_SWITCH])
1870 g_print(" Line %ld column %ld - Query standalone %s\n",
1871 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
1882 * check_for_misspaced_punctuation:
1884 * Look for added or missing spaces around punctuation and quotes.
1885 * If there is a punctuation character like ! with no space on
1886 * either side, suspect a missing!space. If there are spaces on
1887 * both sides , assume a typo. If we see a double quote with no
1888 * space or punctuation on either side of it, assume unspaced
1889 * quotes "like"this.
1891 void check_for_misspaced_punctuation(const char *aline,
1892 struct parities *parities,gboolean isemptyline)
/*
 * Query added or missing spaces around punctuation and quotes.
 *
 * aline       - the line to examine.
 * parities    - parities->dquote and parities->squote are toggled each
 *               time a double or single quote is counted, so quote
 *               state carries over between calls.
 * isemptyline - when TRUE the "Spaced punctuation?" queries are
 *               suppressed.
 *
 * The line is scanned several times.  In the passes that start at the
 * second character, pc, c and nc hold the previous, current and next
 * character.  In order:
 *   1. ".?!,;:_" with letters around it ("Missing space?") or with a
 *      space before and a space or end of line after ("Spaced
 *      punctuation?"), with allowances for acronyms and ellipses.
 *   2. "?!,;:" preceded by a space but not followed by one.
 *   3. A period preceded by a space and followed by a letter.
 *   4. "Unspaced quotes?".
 *   5. Double quote parity: "Wrongspaced quotes?" for a quote whose
 *      neighbour does not fit an opening or closing position.
 *   6. A double quote in column 1 followed by one of ",;:!?)]} ".
 *   7. With pswit[SQUOTE_SWITCH]: the same parity test for single
 *      quotes that are not inside a word.
 */
1894 gboolean isacro,isellipsis;
1896 gunichar c,nc,pc,n2c;
1897 c=g_utf8_get_char(aline);
1898 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1899 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1903 nc=g_utf8_get_char(g_utf8_next_char(s));
1904 /* For each character in the line after the first. */
1905 if (g_utf8_strchr(".?!,;:_",-1,c)) /* if it's punctuation */
1907 /* we need to suppress warnings for acronyms like M.D. */
1909 /* we need to suppress warnings for ellipsis . . . */
1912 * If there are letters on both sides of it or
1913 * if it's strict punctuation followed by an alpha.
1915 if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
1916 g_utf8_strchr("?!,;:",-1,c)))
1920 if (g_utf8_pointer_to_offset(aline,s)>2 &&
1921 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
1923 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
1929 if (pswit[ECHO_SWITCH])
1930 g_print("\n%s\n",aline);
1931 if (!pswit[OVERVIEW_SWITCH])
1932 g_print(" Line %ld column %ld - Missing space?\n",
1933 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1938 if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
1941 * If there are spaces on both sides,
1942 * or space before and end of line.
1946 if (g_utf8_pointer_to_offset(aline,s)>2 &&
1947 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
1949 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
1953 if (!isemptyline && !isellipsis)
1955 if (pswit[ECHO_SWITCH])
1956 g_print("\n%s\n",aline);
1957 if (!pswit[OVERVIEW_SWITCH])
1958 g_print(" Line %ld column %ld - "
1959 "Spaced punctuation?\n",linecnt,
1960 g_utf8_pointer_to_offset(aline,s)+1);
1967 /* Split out the characters that CANNOT be preceded by space. */
1968 c=g_utf8_get_char(aline);
1969 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1970 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1974 nc=g_utf8_get_char(g_utf8_next_char(s));
1975 /* for each character in the line after the first */
1976 if (g_utf8_strchr("?!,;:",-1,c))
1978 /* if it's punctuation that _cannot_ have a space before it */
1979 if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
1982 * If nc DOES == space,
1983 * it was already reported just above.
1985 if (pswit[ECHO_SWITCH])
1986 g_print("\n%s\n",aline);
1987 if (!pswit[OVERVIEW_SWITCH])
1988 g_print(" Line %ld column %ld - Spaced punctuation?\n",
1989 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1996 * Special case " .X" where X is any alpha.
1997 * This plugs a hole in the acronym code above.
1998 * Inelegant, but maintainable.
2000 c=g_utf8_get_char(aline);
2001 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2002 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2006 nc=g_utf8_get_char(g_utf8_next_char(s));
2007 /* for each character in the line after the first */
2010 /* if it's a period */
2011 if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
2014 * If the period follows a space and
2015 * is followed by a letter.
2017 if (pswit[ECHO_SWITCH])
2018 g_print("\n%s\n",aline);
2019 if (!pswit[OVERVIEW_SWITCH])
2020 g_print(" Line %ld column %ld - Spaced punctuation?\n",
2021 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2027 c=g_utf8_get_char(aline);
2028 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2029 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2033 nc=g_utf8_get_char(g_utf8_next_char(s));
2034 /* for each character in the line after the first */
2037 if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
2038 !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
2039 !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
2041 if (pswit[ECHO_SWITCH])
2042 g_print("\n%s\n",aline);
2043 if (!pswit[OVERVIEW_SWITCH])
2044 g_print(" Line %ld column %ld - Unspaced quotes?\n",
2045 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2051 /* Check parity of quotes. */
2052 nc=g_utf8_get_char(aline);
2053 for (s=aline;*s;s=g_utf8_next_char(s))
2056 nc=g_utf8_get_char(g_utf8_next_char(s));
2059 parities->dquote=!parities->dquote;
2060 if (!parities->dquote)
2063 if (!g_utf8_strchr("_-.'`/,;:!?)]} ",-1,nc))
2065 if (pswit[ECHO_SWITCH])
2066 g_print("\n%s\n",aline);
2067 if (!pswit[OVERVIEW_SWITCH])
2068 g_print(" Line %ld column %ld - "
2069 "Wrongspaced quotes?\n",
2070 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/*
 * nc is a full Unicode code point, so it is classified with
 * g_unichar_isdigit(); <ctype.h> isdigit() is only defined for
 * unsigned char values and EOF.
 */
2078 if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
2079 !g_utf8_strchr("_-/.'`([{$",-1,nc) || !nc)
2081 if (pswit[ECHO_SWITCH])
2082 g_print("\n%s\n",aline);
2083 if (!pswit[OVERVIEW_SWITCH])
2084 g_print(" Line %ld column %ld - "
2085 "Wrongspaced quotes?\n",
2086 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2093 if (g_utf8_get_char(aline)==CHAR_DQUOTE)
2095 if (g_utf8_strchr(",;:!?)]} ",-1,
2096 g_utf8_get_char(g_utf8_next_char(aline))))
2098 if (pswit[ECHO_SWITCH])
2099 g_print("\n%s\n",aline);
2100 if (!pswit[OVERVIEW_SWITCH])
2101 g_print(" Line %ld column 1 - Wrongspaced quotes?\n",
2107 if (pswit[SQUOTE_SWITCH])
2109 nc=g_utf8_get_char(aline);
2110 for (s=aline;*s;s=g_utf8_next_char(s))
2113 nc=g_utf8_get_char(g_utf8_next_char(s));
2114 if ((c==CHAR_SQUOTE || c==CHAR_OPEN_SQUOTE) && (s==aline ||
2116 !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
2117 !g_unichar_isalpha(nc)))
2119 parities->squote=!parities->squote;
2120 if (!parities->squote)
2123 if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
2125 if (pswit[ECHO_SWITCH])
2126 g_print("\n%s\n",aline);
2127 if (!pswit[OVERVIEW_SWITCH])
2128 g_print(" Line %ld column %ld - "
2129 "Wrongspaced singlequotes?\n",
2130 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Same reasoning as for double quotes: classify nc with the GLib call. */
2138 if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
2139 !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
2141 if (pswit[ECHO_SWITCH])
2142 g_print("\n%s\n",aline);
2143 if (!pswit[OVERVIEW_SWITCH])
2144 g_print(" Line %ld column %ld - "
2145 "Wrongspaced singlequotes?\n",
2146 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2157 * check_for_double_punctuation:
2159 * Look for double punctuation like ,. or ,,
2160 * Thanks to DW for the suggestion!
2161 * In books with references, ".," and ".;" are common
2162 * e.g. "etc., etc.," and vol. 1.; vol 3.;
2163 * OTOH, from my initial tests, there are also fairly
2164 * common errors. What to do? Make these cases paranoid?
2165 * ".," is the most common, so warnings->dotcomma is used
2166 * to suppress detailed reporting if it occurs often.
2168 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
/*
 * Query two adjacent punctuation characters from ".?!,;:".
 *
 * aline    - the line to examine.
 * warnings - warnings->dotcomma must be set for ".," to be queried;
 *            warnings->isFrench exempts an ellipsis directly before or
 *            after one of ",;:!?".
 *
 * A pair is not queried when both characters are the same '.', '?' or
 * '!', when it is ".," and warnings->dotcomma is unset, or when one of
 * the French ellipsis forms starts at the current position.
 * NOTE(review): for the French forms the scan appears to step past the
 * matched text, but the stepping statements are not fully visible in
 * this view - confirm.
 */
2172 nc=g_utf8_get_char(aline);
2173 for (s=aline;*s;s=g_utf8_next_char(s))
2176 nc=g_utf8_get_char(g_utf8_next_char(s));
2177 /* for each punctuation character in the line */
2178 if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
2179 g_utf8_strchr(".?!,;:",-1,nc))
2181 /* followed by punctuation, it's a query, unless . . . */
2182 if (c==nc && (c=='.' || c=='?' || c=='!') ||
2183 !warnings->dotcomma && c=='.' && nc==',' ||
2184 warnings->isFrench && g_str_has_prefix(s,",...") ||
2185 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2186 warnings->isFrench && g_str_has_prefix(s,";...") ||
2187 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2188 warnings->isFrench && g_str_has_prefix(s,":...") ||
2189 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2190 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2191 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2192 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2193 warnings->isFrench && g_str_has_prefix(s,"...?"))
2195 if (warnings->isFrench && g_str_has_prefix(s,",...") ||
2196 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2197 warnings->isFrench && g_str_has_prefix(s,";...") ||
2198 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2199 warnings->isFrench && g_str_has_prefix(s,":...") ||
2200 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2201 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2202 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2203 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2204 warnings->isFrench && g_str_has_prefix(s,"...?"))
2207 nc=g_utf8_get_char(g_utf8_next_char(s));
2209 ; /* do nothing for .. !! and ?? which can be legit */
2213 if (pswit[ECHO_SWITCH])
2214 g_print("\n%s\n",aline);
2215 if (!pswit[OVERVIEW_SWITCH])
2216 g_print(" Line %ld column %ld - Double punctuation?\n",
2217 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2226 * check_for_spaced_quotes:
2228 void check_for_spaced_quotes(const char *aline)
/*
 * Query every quote mark that has a space on both sides.  Three
 * searches are made in turn: for a double quote, for an apostrophe
 * and for a backtick; the last two share the "Spaced singlequote?"
 * message.  After each hit the search resumes two characters past the
 * start of the match, so the trailing space can begin the next match.
 */
2232 while ((t=strstr(s," \" ")))
2234 if (pswit[ECHO_SWITCH])
2235 g_print("\n%s\n",aline);
2236 if (!pswit[OVERVIEW_SWITCH])
2237 g_print(" Line %ld column %ld - Spaced doublequote?\n",
2238 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2241 s=g_utf8_next_char(g_utf8_next_char(t));
2244 while ((t=strstr(s," ' ")))
2246 if (pswit[ECHO_SWITCH])
2247 g_print("\n%s\n",aline);
2248 if (!pswit[OVERVIEW_SWITCH])
2249 g_print(" Line %ld column %ld - Spaced singlequote?\n",
2250 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2253 s=g_utf8_next_char(g_utf8_next_char(t));
2256 while ((t=strstr(s," ` ")))
2258 if (pswit[ECHO_SWITCH])
2259 g_print("\n%s\n",aline);
2260 if (!pswit[OVERVIEW_SWITCH])
2261 g_print(" Line %ld column %ld - Spaced singlequote?\n",
2262 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2265 s=g_utf8_next_char(g_utf8_next_char(t));
2270 * check_for_miscased_genative:
2272 * Check special case of 'S instead of 's at end of word.
2274 void check_for_miscased_genative(const char *aline)
/*
 * Query an apostrophe that follows a lower-case letter and is itself
 * followed by a capital S.  The scan starts at the second character;
 * the test uses pc, c and nc as the previous, current and next
 * character.  The reported column is that of the S: the apostrophe's
 * 0-based offset plus 2.
 */
2280 c=g_utf8_get_char(aline);
2281 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2282 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2286 nc=g_utf8_get_char(g_utf8_next_char(s));
2287 if (c==CHAR_SQUOTE && nc=='S' && g_unichar_islower(pc))
2289 if (pswit[ECHO_SWITCH])
2290 g_print("\n%s\n",aline);
2291 if (!pswit[OVERVIEW_SWITCH])
2292 g_print(" Line %ld column %ld - Capital \"S\"?\n",
2293 linecnt,g_utf8_pointer_to_offset(aline,s)+2);
2301 * check_end_of_line:
2303 * Now check special cases - start and end of line -
2304 * for single and double quotes. Start is sometimes [sic]
2305 * but better to query it anyway.
2306 * While we're here, check for dash at end of line.
2308 void check_end_of_line(const char *aline,struct warnings *warnings)
/*
 * Check the two ends of aline for quote and hyphen problems.
 *
 * aline    - the line to examine.
 * warnings - warnings->hyphen gates the end-of-line hyphen query.
 *
 * For lines longer than one character:
 *   - a final double quote, apostrophe or open single quote is examined
 *     together with the character before it ("Spaced quote?" at the
 *     last column);
 *   - a leading apostrophe or open single quote followed by a space is
 *     queried at column 1.
 * With pswit[PARANOID_SWITCH] and warnings->hyphen, trailing characters
 * at or below CHAR_SPACE are skipped and a single '-' (not part of
 * "--") found there is queried as "Hyphen at end of line?".
 */
2313 lbytes=strlen(aline);
2314 if (g_utf8_strlen(aline,lbytes)>1)
2316 s=g_utf8_prev_char(aline+lbytes);
2317 c1=g_utf8_get_char(s);
2318 c2=g_utf8_get_char(g_utf8_prev_char(s));
2319 if ((c1==CHAR_DQUOTE || c1==CHAR_SQUOTE || c1==CHAR_OPEN_SQUOTE) &&
2322 if (pswit[ECHO_SWITCH])
2323 g_print("\n%s\n",aline);
2324 if (!pswit[OVERVIEW_SWITCH])
2325 g_print(" Line %ld column %ld - Spaced quote?\n",linecnt,
2326 g_utf8_strlen(aline,lbytes));
2330 c1=g_utf8_get_char(aline);
2331 c2=g_utf8_get_char(g_utf8_next_char(aline));
2332 if ((c1==CHAR_SQUOTE || c1==CHAR_OPEN_SQUOTE) && c2==CHAR_SPACE)
2334 if (pswit[ECHO_SWITCH])
2335 g_print("\n%s\n",aline);
2336 if (!pswit[OVERVIEW_SWITCH])
2337 g_print(" Line %ld column 1 - Spaced quote?\n",linecnt);
2342 * Dash at end of line may well be legit - paranoid mode only
2343 * and don't report em-dash at line-end.
/*
 * lbytes>0: with an empty line g_utf8_prev_char(aline+lbytes) would
 * already point in front of the buffer.
 */
2345 if (pswit[PARANOID_SWITCH] && warnings->hyphen && lbytes>0)
2347 for (s=g_utf8_prev_char(aline+lbytes);
2348 s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
/*
 * The backward scan can stop on the first character of the line; only
 * look at the character before the hyphen when there is one.
 */
2350 if (g_utf8_get_char(s)=='-' &&
2351 (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-'))
2353 if (pswit[ECHO_SWITCH])
2354 g_print("\n%s\n",aline);
2355 if (!pswit[OVERVIEW_SWITCH])
2356 g_print(" Line %ld column %ld - "
2357 "Hyphen at end of line?\n",
2358 linecnt,g_utf8_pointer_to_offset(aline,s));
2365 * check_for_unspaced_bracket:
2367 * Brackets are often unspaced, but shouldn't be surrounded by alpha.
2368 * If so, suspect a scanno like "a]most".
2370 void check_for_unspaced_bracket(const char *aline)
/*
 * Query a bracket from "{[()]}" that has a letter immediately before
 * and immediately after it.  The scan starts at the second character
 * and stops before the last, so brackets at either end of the line are
 * never tested.
 * NOTE(review): the column is the 0-based character offset of the
 * bracket, without the +1 that most sibling checks add - confirm.
 */
2374 c=g_utf8_get_char(aline);
2375 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2376 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2380 nc=g_utf8_get_char(g_utf8_next_char(s));
2383 /* for each bracket character in the line except 1st & last */
2384 if (g_utf8_strchr("{[()]}",-1,c) &&
2385 g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
2387 if (pswit[ECHO_SWITCH])
2388 g_print("\n%s\n",aline);
2389 if (!pswit[OVERVIEW_SWITCH])
2390 g_print(" Line %ld column %ld - Unspaced bracket?\n",
2391 linecnt,g_utf8_pointer_to_offset(aline,s));
2399 * check_for_unpunctuated_endquote:
2401 void check_for_unpunctuated_endquote(const char *aline)
/*
 * Query a double quote that directly follows a letter, i.e. a closing
 * quote with no punctuation in front of it.  The scan starts at the
 * second character and stops before the last; the test uses pc and c
 * as the previous and current character.  The reported column is the
 * 0-based character offset of the quote, which is the 1-based column
 * of the letter before it.
 */
2405 c=g_utf8_get_char(aline);
2406 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2407 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2411 nc=g_utf8_get_char(g_utf8_next_char(s));
2412 /* for each character in the line except 1st */
/*
 * pc is a full Unicode code point, so it is classified with
 * g_unichar_isalpha(); <ctype.h> isalpha() is only defined for
 * unsigned char values and EOF, and would miss accented letters.
 */
2413 if (c==CHAR_DQUOTE && g_unichar_isalpha(pc))
2415 if (pswit[ECHO_SWITCH])
2416 g_print("\n%s\n",aline);
2417 if (!pswit[OVERVIEW_SWITCH])
2418 g_print(" Line %ld column %ld - "
2419 "endquote missing punctuation?\n",
2420 linecnt,g_utf8_pointer_to_offset(aline,s));
2428 * check_for_html_tag:
2430 * Check for <HTML TAG>.
2432 * If there is a < in the line, followed at some point
2433 * by a > then we suspect HTML.
2435 void check_for_html_tag(const char *aline)
/*
 * Query what looks like an HTML tag: the first '<' in aline together
 * with the first '>' that follows it.  The text from '<' to '>'
 * inclusive is copied into tag and printed with the 1-based column of
 * the '<'.
 */
2437 const char *open,*close;
2439 open=strchr(aline,'<');
2442 close=strchr(g_utf8_next_char(open),'>');
2445 if (pswit[ECHO_SWITCH])
2446 g_print("\n%s\n",aline);
2447 if (!pswit[OVERVIEW_SWITCH])
/* close-open+1 bytes: the span from '<' up to and including '>'. */
2449 tag=g_strndup(open,close-open+1);
2450 g_print(" Line %ld column %ld - HTML Tag? %s \n",
2451 linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
2461 * check_for_html_entity:
2463 * Check for &symbol; HTML.
2465 * If there is a & in the line, followed at
2466 * some point by a ; then we suspect HTML.
/*
 * Report a suspected HTML entity: an ampersand followed later on the
 * line by a semicolon, with no space between the two.
 *
 * aline: the NUL-terminated UTF-8 line to examine.
 *
 * The column printed is the 1-based character offset of the ampersand.
 * It is computed with g_utf8_pointer_to_offset() rather than by pointer
 * subtraction, so that it counts characters instead of bytes and agrees
 * with the column printed by check_for_html_tag() on lines that contain
 * multi-byte characters.
 */
2468 void check_for_html_entity(const char *aline)
2470 const char *s,*amp,*scolon;
2472 amp=strchr(aline,'&');
2475 scolon=strchr(amp,';');
/* Walk from the ampersand towards the semicolon looking for a space. */
2478 for (s=amp;s<scolon;s=g_utf8_next_char(s))
2479 if (g_utf8_get_char(s)==CHAR_SPACE)
2480 break; /* Don't report "Jones & Son;" */
/* Echo the offending line first when ECHO_SWITCH is set. */
2483 if (pswit[ECHO_SWITCH])
2484 g_print("\n%s\n",aline);
2485 if (!pswit[OVERVIEW_SWITCH])
/* Copy the text from the ampersand up to and including the semicolon. */
2487 entity=g_strndup(amp,scolon-amp+1);
2488 g_print(" Line %ld column %ld - HTML symbol? %s \n",
2489 linecnt,g_utf8_pointer_to_offset(aline,amp)+1,entity);
2502 * If we are in a state of unbalanced quotes, and this line
2503 * doesn't begin with a quote, output the stored error message.
2504 * If the -P switch was used, print the warning even if the
2505 * new para starts with quotes.
/*
 * Output the warnings that were deferred at the end of the previous
 * paragraph.
 *
 * aline: the line now being processed.
 * parastart: text echoed before each warning when ECHO_SWITCH is set.
 * pending: the stored messages (dquote, squote, rbrack, sbrack, cbrack,
 * unders); each one handled here is freed and reset to NULL.
 *
 * NOTE(review): c is read from s, presumably the first significant
 * character of aline - confirm.
 */
2507 void print_pending(const char *aline,const char *parastart,
2508 struct pending *pending)
2515 c=g_utf8_get_char(s);
2516 if (pending->dquote)
/* Shown unless the line opens with a double quote; QPARA_SWITCH forces it. */
2518 if (c!=CHAR_DQUOTE || pswit[QPARA_SWITCH])
2520 if (!pswit[OVERVIEW_SWITCH])
2522 if (pswit[ECHO_SWITCH])
2523 g_print("\n%s\n",parastart);
2524 g_print("%s\n",pending->dquote);
2529 g_free(pending->dquote);
2530 pending->dquote=NULL;
2532 if (pending->squote)
/* Same idea for single quotes; either form of opening single quote counts. */
2534 if (c!=CHAR_SQUOTE && c!=CHAR_OPEN_SQUOTE || pswit[QPARA_SWITCH] ||
2537 if (!pswit[OVERVIEW_SWITCH])
2539 if (pswit[ECHO_SWITCH])
2540 g_print("\n%s\n",parastart);
2541 g_print("%s\n",pending->squote);
2546 g_free(pending->squote);
2547 pending->squote=NULL;
/* Bracket and underscore messages have no test on the first character. */
2549 if (pending->rbrack)
2551 if (!pswit[OVERVIEW_SWITCH])
2553 if (pswit[ECHO_SWITCH])
2554 g_print("\n%s\n",parastart);
2555 g_print("%s\n",pending->rbrack);
2559 g_free(pending->rbrack);
2560 pending->rbrack=NULL;
2562 if (pending->sbrack)
2564 if (!pswit[OVERVIEW_SWITCH])
2566 if (pswit[ECHO_SWITCH])
2567 g_print("\n%s\n",parastart);
2568 g_print("%s\n",pending->sbrack);
2572 g_free(pending->sbrack);
2573 pending->sbrack=NULL;
2575 if (pending->cbrack)
2577 if (!pswit[OVERVIEW_SWITCH])
2579 if (pswit[ECHO_SWITCH])
2580 g_print("\n%s\n",parastart);
2581 g_print("%s\n",pending->cbrack);
2585 g_free(pending->cbrack);
2586 pending->cbrack=NULL;
2588 if (pending->unders)
2590 if (!pswit[OVERVIEW_SWITCH])
2592 if (pswit[ECHO_SWITCH])
2593 g_print("\n%s\n",parastart);
2594 g_print("%s\n",pending->unders);
2598 g_free(pending->unders);
2599 pending->unders=NULL;
2604 * check_for_mismatched_quotes:
2606 * At end of paragraph, check for mismatched quotes.
2608 * We don't want to report an error immediately, since it is a
2609 * common convention to omit the quotes at end of paragraph if
2610 * the next paragraph is a continuation of the same speaker.
2611 * Where this is the case, the next para should begin with a
2612 * quote, so we store the warning message and only display it
2613 * at the top of the next iteration if the new para doesn't
2614 * start with a quote.
2615 * The -p switch overrides this default, and warns of unclosed
2616 * quotes on _every_ paragraph, whether the next begins with a
/*
 * At the end of a paragraph, build a warning message for every kind of
 * imbalance recorded in counters.
 *
 * counters: the per-paragraph tallies (quot, open_single_quote,
 * close_single_quote, r_brack, s_brack, c_brack, c_unders).
 * pending: receives the deferred warnings.
 *
 * Each message is allocated with g_strdup_printf() and carries the
 * current linecnt.
 * NOTE(review): the messages are presumably stored in the matching
 * fields of pending for print_pending() to emit - confirm.
 */
2619 void check_for_mismatched_quotes(const struct counters *counters,
2620 struct pending *pending)
/* An odd number of double quotes means one is unmatched. */
2622 if (counters->quot%2)
2624 g_strdup_printf(" Line %ld - Mismatched quotes",linecnt);
/* Single quotes are only examined when SQUOTE_SWITCH is set. */
2625 if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&
2626 counters->open_single_quote!=counters->close_single_quote)
2628 g_strdup_printf(" Line %ld - Mismatched singlequotes?",linecnt);
/* Opening count differs from closing count by something other than +1. */
2629 if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&
2630 counters->open_single_quote!=counters->close_single_quote &&
2631 counters->open_single_quote!=counters->close_single_quote+1)
2633 * Flag it to be noted regardless of the
2634 * first char of the next para.
/* A non-zero bracket counter means that kind of bracket is unbalanced. */
2637 if (counters->r_brack)
2639 g_strdup_printf(" Line %ld - Mismatched round brackets?",linecnt);
2640 if (counters->s_brack)
2642 g_strdup_printf(" Line %ld - Mismatched square brackets?",linecnt);
2643 if (counters->c_brack)
2645 g_strdup_printf(" Line %ld - Mismatched curly brackets?",linecnt);
/* An odd number of underscores means one is unmatched. */
2646 if (counters->c_unders%2)
2648 g_strdup_printf(" Line %ld - Mismatched underscores?",linecnt);
2652 * check_for_omitted_punctuation:
2654 * Check for omitted punctuation at end of paragraph by working back
2655 * through prevline. DW.
2656 * Need to check this only for "normal" paras.
2657 * So what is a "normal" para?
2658 * Not normal if one-liner (chapter headings, etc.)
2659 * Not normal if doesn't contain at least one locase letter
2660 * Not normal if starts with space
/*
 * Check the final line of a paragraph for missing end punctuation.
 *
 * prevline: the last line of the paragraph that has just ended.
 * last: properties of that line; only blen is read here.
 * start_para_line: line number on which the paragraph started.
 *
 * The check only runs when prevline contains a letter, last->blen is
 * greater than 2, the paragraph started before linecnt-1, and prevline
 * begins with a character above CHAR_SPACE.  It then steps back over
 * trailing quote characters and scans backwards from there: reaching a
 * letter produces the warning, while the punctuation test at the end of
 * the block recognises the characters accepted as a paragraph end.
 * The reported column is the character length of prevline.
 */
2662 void check_for_omitted_punctuation(const char *prevline,
2663 struct line_properties *last,int start_para_line)
2665 gboolean letter_on_line=FALSE;
2667 for (s=prevline;*s;s=g_utf8_next_char(s))
2668 if (g_unichar_isalpha(g_utf8_get_char(s)))
2670 letter_on_line=TRUE;
2674 * This next "if" is a problem.
2675 * If we say "start_para_line <= linecnt - 1", that includes
2676 * one-line "paragraphs" like chapter heads. Lotsa false positives.
2677 * If we say "start_para_line < linecnt - 1" it doesn't, but then it
2678 * misses genuine one-line paragraphs.
2680 if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
2681 g_utf8_get_char(prevline)>CHAR_SPACE)
2683 for (s=g_utf8_prev_char(prevline+strlen(prevline));
2684 (g_utf8_get_char(s)==CHAR_DQUOTE ||
2685 g_utf8_get_char(s)==CHAR_SQUOTE) &&
2686 g_utf8_get_char(s)>CHAR_SPACE && s>prevline;
2687 s=g_utf8_prev_char(s))
2689 for (;s>prevline;s=g_utf8_prev_char(s))
2691 if (g_unichar_isalpha(g_utf8_get_char(s)))
2693 if (pswit[ECHO_SWITCH])
2694 g_print("\n%s\n",prevline);
2695 if (!pswit[OVERVIEW_SWITCH])
2696 g_print(" Line %ld column %ld - "
2697 "No punctuation at para end?\n",
2698 linecnt-1,g_utf8_strlen(prevline,-1));
2703 if (g_utf8_strchr("-.:!([{?}])",-1,g_utf8_get_char(s)))
/*
 * Tree traversal callback passed to g_tree_foreach() on the qword tree by
 * procfile(); prints a note naming the queried word and a repeat count.
 *
 * key: the queried word, used as a C string.
 * value, data: as supplied by g_tree_foreach(); data is passed as NULL by
 * procfile().
 */
2709 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
2711 const char *word=key;
2714 g_print("\nNote: Queried word %s was duplicated %d times\n",
/*
 * Print handler installed by procfile() with g_set_print_handler():
 * writes string to stdout converted from UTF-8 to WINDOWS-1252.
 *
 * string: the UTF-8 text to print.  procfile() also calls this with
 * NULL once it has finished.
 * NOTE(review): the close-and-reset lines below are presumably guarded
 * by a test for a NULL string - confirm.
 *
 * The conversion descriptor is static so that it is opened on first use,
 * reused by later calls, and still known when the time comes to close
 * it.  (GIConv)-1 is the value g_iconv_open() returns on failure and is
 * used here to mean "not open".
 */
2719 void print_as_windows_1252(const char *string)
2721 gsize inbytes,outbytes;
2723 static GIConv converter=(GIConv)-1;
/* Release the descriptor and mark it as not open. */
2726 if (converter!=(GIConv)-1)
2727 g_iconv_close(converter);
2728 converter=(GIConv)-1;
/* Open the converter only if it is not already open (comparison, not assignment). */
2731 if (converter==(GIConv)-1)
2732 converter=g_iconv_open("WINDOWS-1252","UTF-8");
2733 if (converter!=(GIConv)-1)
/* The output buffer is given as many bytes as the input, plus one more. */
2735 inbytes=outbytes=strlen(string);
2736 bp=buf=g_malloc(outbytes+1);
/* NOTE(review): the g_iconv() result is not checked on this line. */
2737 g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
/* Write the text unconverted; presumably the path taken when no converter is available. */
2743 fputs(string,stdout);
/*
 * Run all of the checks over one etext file.
 *
 * filename: passed to read_etext() and shown in the messages printed
 * here.
 *
 * The text is loaded with read_etext(), summarised by first_pass() and
 * report_first_pass(), and then read line by line with flgets(); each
 * line of the body is handed to the individual check routines.
 * Paragraph-level checks use counters, parities, pending and last, which
 * all start out zeroed.
 */
2751 void procfile(const char *filename)
2754 gchar *parastart=NULL; /* first line of current para */
2755 gchar *etext,*aline;
2758 struct first_pass_results *first_pass_results;
2759 struct warnings *warnings;
2760 struct counters counters={0};
2761 struct line_properties last={0};
2762 struct parities parities={0};
2763 struct pending pending={0};
2764 gboolean isemptyline;
2765 long start_para_line=0;
2766 gboolean isnewpara=FALSE,enddash=FALSE;
2767 last.start=CHAR_SPACE;
2768 linecnt=checked_linecnt=0;
2769 etext=read_etext(filename,&err);
/* A read error goes to stdout when STDOUT_SWITCH is set; presumably to stderr otherwise. */
2772 if (pswit[STDOUT_SWITCH])
2773 fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
2775 fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
/* Route g_print() output through print_as_windows_1252(). */
2778 g_set_print_handler(print_as_windows_1252);
2779 g_print("\n\nFile: %s\n\n",filename);
2780 first_pass_results=first_pass(etext);
2781 warnings=report_first_pass(first_pass_results);
/* qword frees both keys and values with g_free; qperiod frees only its keys. */
2782 qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
2783 qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
2785 * Here we go with the main pass. Hold onto yer hat!
2789 while ((aline=flgets(&etext_ptr,linecnt+1)))
2794 if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
2795 continue; // skip DP page separators completely
/* Lines before firstline, or after a positive footerline, are not checked. */
2796 if (linecnt<first_pass_results->firstline ||
2797 (first_pass_results->footerline>0 &&
2798 linecnt>first_pass_results->footerline))
/* With HEADER_SWITCH set, selected header fields are echoed. */
2800 if (pswit[HEADER_SWITCH])
2802 if (g_str_has_prefix(aline,"Title:"))
2803 g_print(" %s\n",aline);
2804 if (g_str_has_prefix(aline,"Author:"))
2805 g_print(" %s\n",aline);
2806 if (g_str_has_prefix(aline,"Release Date:"))
2807 g_print(" %s\n",aline);
2808 if (g_str_has_prefix(aline,"Edition:"))
2809 g_print(" %s\n\n",aline);
2811 continue; /* skip through the header */
/* Emit the warnings deferred from the previous paragraph, then clear them. */
2814 print_pending(aline,parastart,&pending);
2815 memset(&pending,0,sizeof(pending));
2816 isemptyline=analyse_quotes(aline,&counters);
2817 if (isnewpara && !isemptyline)
2819 /* This line is the start of a new paragraph. */
2820 start_para_line=linecnt;
2821 /* Capture its first line in case we want to report it later. */
2823 parastart=g_strdup(aline);
2824 memset(&parities,0,sizeof(parities)); /* restart the quote count */
/* Advance s to the first letter or digit. */
2826 while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
2827 !g_unichar_isdigit(g_utf8_get_char(s)))
2828 s=g_utf8_next_char(s);
2829 if (g_unichar_islower(g_utf8_get_char(s)))
2831 /* and its first letter is lowercase */
2832 if (pswit[ECHO_SWITCH])
2833 g_print("\n%s\n",aline);
2834 if (!pswit[OVERVIEW_SWITCH])
2835 g_print(" Line %ld column %ld - "
2836 "Paragraph starts with lower-case\n",
2837 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2841 isnewpara=FALSE; /* Signal the end of new para processing. */
2843 /* Check for an em-dash broken at line end. */
2844 if (enddash && g_utf8_get_char(aline)=='-')
2846 if (pswit[ECHO_SWITCH])
2847 g_print("\n%s\n",aline);
2848 if (!pswit[OVERVIEW_SWITCH])
2849 g_print(" Line %ld column 1 - Broken em-dash?\n",linecnt);
/* Step back over trailing spaces and test the last character for a hyphen. */
2854 for (s=g_utf8_prev_char(aline+strlen(aline));
2855 g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
2857 if (s>=aline && g_utf8_get_char(s)=='-')
/* Per-line checks; some only run when the first pass enabled their warning. */
2859 check_for_control_characters(aline);
2861 check_for_odd_characters(aline,warnings,isemptyline);
2862 if (warnings->longline)
2863 check_for_long_line(aline);
2864 if (warnings->shortline)
2865 check_for_short_line(aline,&last);
/* Record the length and first character of this line in last. */
2867 last.len=g_utf8_strlen(aline,-1);
2868 last.start=g_utf8_get_char(aline);
2869 check_for_starting_punctuation(aline);
2872 check_for_spaced_emdash(aline);
2873 check_for_spaced_dash(aline);
2875 check_for_unmarked_paragraphs(aline);
2876 check_for_jeebies(aline);
2877 check_for_mta_from(aline);
2878 check_for_orphan_character(aline);
2879 check_for_pling_scanno(aline);
2880 check_for_extra_period(aline,warnings);
2881 check_for_following_punctuation(aline);
2882 check_for_typos(aline,warnings);
2883 check_for_misspaced_punctuation(aline,&parities,isemptyline);
2884 check_for_double_punctuation(aline,warnings);
2885 check_for_spaced_quotes(aline);
2886 check_for_miscased_genative(aline);
2887 check_end_of_line(aline,warnings);
2888 check_for_unspaced_bracket(aline);
2889 if (warnings->endquote)
2890 check_for_unpunctuated_endquote(aline);
2891 check_for_html_tag(aline);
2892 check_for_html_entity(aline);
/* Paragraph-end work: queue mismatch warnings and zero the counters. */
2895 check_for_mismatched_quotes(&counters,&pending);
2896 memset(&counters,0,sizeof(counters));
2897 /* let the next iteration know that it's starting a new para */
2900 check_for_omitted_punctuation(prevline,&last,start_para_line);
/* Keep a copy of this line for use as prevline. */
2903 prevline=g_strdup(aline);
/* Outside overview mode, report the queried words collected in qword. */
2913 if (!pswit[OVERVIEW_SWITCH])
2914 g_tree_foreach(qword,report_duplicate_queries,NULL);
2915 g_tree_unref(qword);
2916 g_tree_unref(qperiod);
/* Remove the custom print handler, then call it once more with NULL. */
2917 g_set_print_handler(NULL);
2918 print_as_windows_1252(NULL);
2924 * Get one line from the input text, checking for
2925 * the existence of exactly one CR/LF line-end per line.
2927 * Returns: a pointer to the line.
/*
 * Take the next line from the text at *etext, advancing *etext as
 * characters are consumed.
 *
 * etext: in/out pointer into the text; the line starts where *etext
 * points on entry.
 * lcnt: line number used in the messages printed here.
 *
 * With LINE_END_SWITCH set, three line-end irregularities are reported:
 * a LF with no preceding CR, two successive CRs, and a CR without LF.
 * Each report echoes the line first when ECHO_SWITCH is set and prints
 * its message unless OVERVIEW_SWITCH is set.  Before returning, the line
 * is passed through postprocess_for_HTML() when MARKUP_SWITCH is set and
 * postprocess_for_DP() when DP_SWITCH is set.
 * NOTE(review): procfile() loops until this returns NULL, so a NULL
 * return presumably marks the end of the text - confirm.
 */
2929 char *flgets(char **etext,long lcnt)
2932 gboolean isCR=FALSE;
2933 char *theline=*etext;
/* Read one character and step *etext past it. */
2938 c=g_utf8_get_char(*etext);
2939 *etext=g_utf8_next_char(*etext);
2942 /* either way, it's end of line */
2949 /* Error - a LF without a preceding CR */
2950 if (pswit[LINE_END_SWITCH])
2952 if (pswit[ECHO_SWITCH])
/* Copy the text from theline up to eos so it can be echoed on its own. */
2954 s=g_strndup(theline,eos-theline);
2955 g_print("\n%s\n",s);
2958 if (!pswit[OVERVIEW_SWITCH])
2959 g_print(" Line %ld - No CR?\n",lcnt);
2970 /* Error - two successive CRs */
2971 if (pswit[LINE_END_SWITCH])
2973 if (pswit[ECHO_SWITCH])
2975 s=g_strndup(theline,eos-theline);
2976 g_print("\n%s\n",s);
2979 if (!pswit[OVERVIEW_SWITCH])
2980 g_print(" Line %ld - Two successive CRs?\n",lcnt);
/* A CR has been seen and is still unmatched at this point. */
2989 if (pswit[LINE_END_SWITCH] && isCR)
2991 if (pswit[ECHO_SWITCH])
2993 s=g_strndup(theline,eos-theline);
2994 g_print("\n%s\n",s);
2997 if (!pswit[OVERVIEW_SWITCH])
2998 g_print(" Line %ld column %ld - CR without LF?\n",
2999 lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
3005 eos=g_utf8_next_char(eos);
/* Optional clean-up of HTML and DP markup, done in place on the line. */
3009 if (pswit[MARKUP_SWITCH])
3010 postprocess_for_HTML(theline);
3011 if (pswit[DP_SWITCH])
3012 postprocess_for_DP(theline);
3019 * Takes a "word" as a parameter, and checks whether it
3020 * contains a mixture of alpha and digits. Generally, this is an
3021 * error, but may not be for cases like 4th or L5 12s. 3d.
3023 * Returns: TRUE iff an error is found.
/*
 * Decide whether checkword is a suspicious mixture of letters and digits.
 *
 * checkword: the NUL-terminated UTF-8 word to test.
 *
 * The word is first scanned for letters and digits.  When both occur,
 * the accepted forms are excluded: leading digits followed by st, rd,
 * nd or th, by sts, rds, nds or ths, or by stly, rdly, ndly or thly (all
 * compared without regard to ASCII case); leading digits followed by l
 * in either case or by lower-case s or d; and a leading capital L whose
 * remainder is itself not a mixed word.
 */
3025 gboolean mixdigit(const char *checkword)
3027 gboolean wehaveadigit,wehavealetter,query;
3028 const char *s,*nondigit;
3029 wehaveadigit=wehavealetter=query=FALSE;
3030 for (s=checkword;*s;s=g_utf8_next_char(s))
3031 if (g_unichar_isalpha(g_utf8_get_char(s)))
3033 else if (g_unichar_isdigit(g_utf8_get_char(s)))
3035 if (wehaveadigit && wehavealetter)
3037 /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
/* Advance nondigit past the leading run of digits. */
3039 for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
3040 nondigit=g_utf8_next_char(nondigit))
3042 /* digits, ending in st, rd, nd, th of either case */
3043 if (!g_ascii_strcasecmp(nondigit,"st") ||
3044 !g_ascii_strcasecmp(nondigit,"rd") ||
3045 !g_ascii_strcasecmp(nondigit,"nd") ||
3046 !g_ascii_strcasecmp(nondigit,"th"))
/* The same ordinal endings with a trailing s. */
3048 if (!g_ascii_strcasecmp(nondigit,"sts") ||
3049 !g_ascii_strcasecmp(nondigit,"rds") ||
3050 !g_ascii_strcasecmp(nondigit,"nds") ||
3051 !g_ascii_strcasecmp(nondigit,"ths"))
/* The same ordinal endings with a trailing ly. */
3053 if (!g_ascii_strcasecmp(nondigit,"stly") ||
3054 !g_ascii_strcasecmp(nondigit,"rdly") ||
3055 !g_ascii_strcasecmp(nondigit,"ndly") ||
3056 !g_ascii_strcasecmp(nondigit,"thly"))
3058 /* digits, ending in l, L, s or d */
3059 if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
3060 !strcmp(nondigit,"d"))
3063 * L at the start of a number, representing Britsh pounds, like L500.
3064 * This is cute. We know the current word is mixed digit. If the first
3065 * letter is L, there must be at least one digit following. If both
3066 * digits and letters follow, we have a genuine error, else we have a
3067 * capital L followed by digits, and we accept that as a non-error.
3069 if (g_utf8_get_char(checkword)=='L' &&
3070 !mixdigit(g_utf8_next_char(checkword)))
3079 * Extracts the first/next "word" from the line, and returns it.
3080 * A word is defined as one English word unit--or at least that's the aim.
3081 * "ptr" is advanced to the position in the line where we will start
3082 * looking for the next word.
3084 * Returns: A newly-allocated string.
/*
 * Extract the next word from the text at *ptr.
 *
 * ptr: in/out position in the line; it is first advanced past anything
 * that is neither a letter nor a digit.
 *
 * Returns: a newly allocated string, obtained from g_string_free() with
 * its second argument FALSE so that the character data is kept.
 *
 * Two attempts are made.  The look-ahead collects letters, digits,
 * commas and periods through s and scans the result for a comma or
 * period that directly follows a digit, returning the whole collected
 * text from inside that scan.  Otherwise the buffer is emptied and the
 * word is rebuilt from *ptr using letters, digits and apostrophes only.
 */
3086 gchar *getaword(const char **ptr)
3091 word=g_string_new(NULL);
/* Skip leading characters that cannot start a word; stop at the NUL. */
3092 for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
3093 !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
3094 **ptr;*ptr=g_utf8_next_char(*ptr))
3097 * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
3098 * Especially yucky is the case of L1,000
3099 * This section looks for a pattern of characters including a digit
3100 * followed by a comma or period followed by one or more digits.
3101 * If found, it returns this whole pattern as a word; otherwise we discard
3102 * the results and resume our normal programming.
3105 for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
3106 g_unichar_isalpha(g_utf8_get_char(s)) ||
3107 g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
3108 g_string_append_unichar(word,g_utf8_get_char(s));
/* Examine each character after the first, stopping before the last one. */
3109 for (t=g_utf8_next_char(word->str);*g_utf8_next_char(t);
3110 t=g_utf8_next_char(t))
3112 c=g_utf8_get_char(t);
3113 pc=g_utf8_get_char(g_utf8_prev_char(t));
3114 if ((c=='.' || c==',') && g_unichar_isdigit(pc))
3117 return g_string_free(word,FALSE);
3120 /* we didn't find a punctuated number - do the regular getword thing */
3121 g_string_truncate(word,0);
3122 for (;g_unichar_isdigit(g_utf8_get_char(*ptr)) ||
3123 g_unichar_isalpha(g_utf8_get_char(*ptr)) ||
3124 g_utf8_get_char(*ptr)=='\'';*ptr=g_utf8_next_char(*ptr))
3125 g_string_append_unichar(word,g_utf8_get_char(*ptr));
3126 return g_string_free(word,FALSE);
3132 * Is this word a Roman Numeral?
3134 * It doesn't actually validate that the number is a valid Roman Numeral--for
3135 * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
3136 * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
3137 * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
3138 * expressions thereof, except when it came to taxes. Allow any number of M,
3139 * an optional D, an optional CM or CD, any number of optional Cs, an optional
3140 * XL or an optional XC, an optional IX or IV, an optional V and any number
/*
 * Test whether t looks like a Roman numeral.
 *
 * t: the NUL-terminated word to test.
 *
 * The tests are applied in a fixed order and compare against lower-case
 * letters only: a run of m, a d, the pairs cm and cd, a run of c, the
 * pairs xl and xc, an l, a run of x, the pairs ix and iv, a v, and a
 * run of i.
 * NOTE(review): t is presumably advanced past each part that matches,
 * and the caller presumably supplies lower-case text - confirm.
 */
3143 gboolean isroman(const char *t)
3149 while (g_utf8_get_char(t)=='m' && *t)
3151 if (g_utf8_get_char(t)=='d')
3153 if (g_str_has_prefix(t,"cm"))
3155 if (g_str_has_prefix(t,"cd"))
3157 while (g_utf8_get_char(t)=='c' && *t)
3159 if (g_str_has_prefix(t,"xl"))
3161 if (g_str_has_prefix(t,"xc"))
3163 if (g_utf8_get_char(t)=='l')
3165 while (g_utf8_get_char(t)=='x' && *t)
3167 if (g_str_has_prefix(t,"ix"))
3169 if (g_str_has_prefix(t,"iv"))
3171 if (g_utf8_get_char(t)=='v')
3173 while (g_utf8_get_char(t)=='i' && *t)
3179 * postprocess_for_DP:
3181 * Invoked with the -d switch from flgets().
3182 * It simply "removes" from the line a hard-coded set of common
3183 * DP-specific tags, so that the line passed to the main routine has
3184 * been pre-cleaned of DP markup.
/*
 * Remove every occurrence of each DPmarkup[] string from theline, in
 * place.
 *
 * theline: the writable, NUL-terminated line to clean.
 *
 * The DPmarkup[] table is walked until an empty string is reached.
 */
3186 void postprocess_for_DP(char *theline)
3192 for (i=0;*DPmarkup[i];i++)
/* Repeat until this tag no longer occurs in the line. */
3193 while ((s=strstr(theline,DPmarkup[i])))
3195 t=s+strlen(DPmarkup[i]);
/* Close the gap; the +1 moves the terminating NUL as well. */
3196 memmove(s,t,strlen(t)+1);
3201 * postprocess_for_HTML:
3203 * Invoked with the -m switch from flgets().
3204 * It simply "removes" from the line a hard-coded set of common
3205 * HTML tags and "replaces" a hard-coded set of common HTML
3206 * entities, so that the line passed to the main routine has
3207 * been pre-cleaned of HTML.
/*
 * Clean HTML out of theline in place: losemarkup() is called repeatedly
 * until it returns NULL, then loseentities() likewise.
 *
 * theline: the writable, NUL-terminated line to clean.
 */
3209 void postprocess_for_HTML(char *theline)
3211 while (losemarkup(theline))
3213 while (loseentities(theline))
/*
 * Remove one recognised HTML tag from theline, in place.
 *
 * theline: the writable, NUL-terminated line to clean.
 *
 * s is the first less-than sign in the line and t the first
 * greater-than sign at or after it (NULL when there is no less-than
 * sign).  The text following s is compared with each markup[] entry by
 * tagcomp(); on a match everything from s through t is removed.
 * NOTE(review): postprocess_for_HTML() keeps calling this until it
 * returns NULL, so a non-NULL return presumably means that a tag was
 * removed - confirm.
 */
3217 char *losemarkup(char *theline)
3221 s=strchr(theline,'<');
3222 t=s?strchr(s,'>'):NULL;
/* The markup[] table is walked until an empty string is reached. */
3225 for (i=0;*markup[i];i++)
3226 if (tagcomp(g_utf8_next_char(s),markup[i]))
/* Step past the closing sign and close the gap, NUL included. */
3228 t=g_utf8_next_char(t);
3229 memmove(s,t,strlen(t)+1);
3232 /* It's an unrecognized <xxx>. */
/*
 * Replace one HTML entity in theline with its text equivalent, in place.
 *
 * theline: the writable, NUL-terminated line to clean.
 *
 * The entities[] table is searched twice, first by the htmlent field and
 * then by the htmlnum field; each walk stops at an empty string.  When
 * an entity is found, the text after it is copied aside and the textent
 * replacement is written over the entity.
 * NOTE(review): the saved tail is presumably appended again and freed on
 * lines not visible here, and the replacement is presumably never longer
 * than the entity it overwrites - confirm.
 */
3236 char *loseentities(char *theline)
3242 for (i=0;*entities[i].htmlent;i++)
3244 s=strstr(theline,entities[i].htmlent);
/* Save what follows the entity before overwriting it. */
3247 t=g_strdup(s+strlen(entities[i].htmlent));
3248 strcpy(s,entities[i].textent);
/* Second walk: the same replacement keyed on the htmlnum field. */
3254 for (i=0;*entities[i].htmlnum;i++)
3256 s=strstr(theline,entities[i].htmlnum);
3259 t=g_strdup(s+strlen(entities[i].htmlnum));
3260 strcpy(s,entities[i].textent);
/*
 * Compare the start of strin with basetag, ignoring case.
 *
 * strin: the text that follows the opening sign of a tag.
 * basetag: the tag name to compare against.
 *
 * A leading slash in strin is skipped, so closing tags match as well.
 * Both strings are case-folded with g_utf8_casefold() and retval is set
 * to whether the folded strin begins with the folded basetag.
 * NOTE(review): g_utf8_casefold() returns newly allocated strings; they
 * are presumably freed on lines not visible here - confirm.
 */
3269 gboolean tagcomp(const char *strin,const char *basetag)
3273 if (g_utf8_get_char(strin)=='/')
3274 t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
3276 t=g_utf8_casefold(strin,-1);
3277 s=g_utf8_casefold(basetag,-1);
3278 retval=g_str_has_prefix(t,s);
3284 void proghelp(GOptionContext *context)
3287 fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
3288 fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
3289 fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
3290 fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
3291 "For details, read the file COPYING.\n",stderr);
3292 fputs("This is Free Software; "
3293 "you may redistribute it under certain conditions (GPL);\n",stderr);
3294 fputs("read the file COPYING for details.\n\n",stderr);
3295 help=g_option_context_get_help(context,TRUE,NULL);
3298 fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
3299 fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
3300 "non-ASCII\n",stderr);
3301 fputs("characters like accented letters, "
3302 "lines longer than 75 or shorter than 55,\n",stderr);
3303 fputs("unbalanced quotes or brackets, "
3304 "a variety of badly formatted punctuation, \n",stderr);
3305 fputs("HTML tags, some likely typos. "
3306 "It is NOT a substitute for human judgement.\n",stderr);