1 /*************************************************************************/
2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com> */
5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk> */
7 /* This program is free software; you can redistribute it and/or modify */
8 /* it under the terms of the GNU General Public License as published by */
9 /* the Free Software Foundation; either version 2 of the License, or */
10 /* (at your option) any later version. */
12 /* This program is distributed in the hope that it will be useful, */
13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
15 /* GNU General Public License for more details. */
17 /* You should have received a copy of the GNU General Public License */
18 /* along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 /*************************************************************************/
30 #include "bookloupe.h"
33 #include "HTMLentities.h"
35 gchar *charset; /* Or NULL for auto (ISO_8859-1/ASCII or UNICODE) */
36 GIConv charset_validator=(GIConv)-1;
42 "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
43 "nad", "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa",
44 "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
45 "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
46 "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
47 "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
48 "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
49 "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
50 "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
51 "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
52 "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
53 "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
54 "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
55 "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
56 "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
57 "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
58 "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
59 "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
60 "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
61 "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
62 "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
63 "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
64 "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
65 "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
66 "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
67 "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
68 "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
69 "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
70 "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
76 /* Common abbreviations and other OK words not to query as typos. */
78 "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
79 "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
80 "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
81 "outbid", "outbids", "frostbite", "frostbitten", ""
84 /* Common abbreviations that cause otherwise unexplained periods. */
86 "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
87 "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
91 * Two-Letter combinations that rarely if ever start words,
92 * but are common scannos or otherwise common letter combinations.
95 "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
99 * Two-Letter combinations that rarely if ever end words,
100 * but are common scannos or otherwise common letter combinations.
103 "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
104 "sw", "gr", "sl", "cl", "iy", ""
108 "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
109 "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
110 "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
111 "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
115 "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
119 "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
120 "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
121 "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
122 "during", "let", "toward", "among", ""
126 "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
127 "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
128 "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
129 "among", "those", "into", "whom", "having", "thence", ""
132 gboolean pswit[SWITNO]; /* program switches */
135 gboolean typo_compat,paranoid_compat;
/*
 * Main option table.  Every G_OPTION_ARG_NONE entry stores a gboolean in
 * one element of pswit[]; --charset stores its string argument in
 * opt_charset.  Most switches come as a plain/"no-" pair aimed at the same
 * pswit[] element: the member carrying G_OPTION_FLAG_REVERSE clears the
 * switch instead of setting it, and the member carrying
 * G_OPTION_FLAG_HIDDEN is left out of the --help listing.
 * config_file_update() and parse_config_file() also walk this table.
 */
137 static GOptionEntry options[]={
138 { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
139 "Ignore DP-specific markup", NULL },
140 { "no-dp", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
141 G_OPTION_ARG_NONE, pswit+DP_SWITCH,
142 "Don't ignore DP-specific markup", NULL },
143 { "echo", 0, G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
144 "Echo queried line", NULL },
145 { "no-echo", 'e', G_OPTION_FLAG_REVERSE,
146 G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
147 "Don't echo queried line", NULL },
148 { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
149 "Check single quotes", NULL },
150 { "no-squote", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
151 G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
152 "Don't check single quotes", NULL },
153 { "typo", 0, 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
154 "Check common typos", NULL },
155 { "no-typo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
156 G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
157 "Don't check common typos", NULL },
158 { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
159 "Require closure of quotes on every paragraph", NULL },
160 { "no-qpara", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
161 G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
162 "Don't require closure of quotes on every paragraph", NULL },
163 { "paranoid", 0, G_OPTION_FLAG_HIDDEN,
164 G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
165 "Enable paranoid querying of everything", NULL },
166 { "no-paranoid", 0, G_OPTION_FLAG_REVERSE,
167 G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
168 "Disable paranoid querying of everything", NULL },
169 { "line-end", 0, G_OPTION_FLAG_HIDDEN,
170 G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
171 "Enable line end checking", NULL },
172 { "no-line-end", 'l', G_OPTION_FLAG_REVERSE,
173 G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
174 "Disable line end checking", NULL },
175 { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
176 "Overview: just show counts", NULL },
177 { "no-overview", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
178 G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
179 "Show individual warnings", NULL },
180 { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
181 "Output errors to stdout instead of stderr", NULL },
182 { "no-stdout", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
183 G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
184 "Output errors to stderr instead of stdout", NULL },
185 { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
186 "Echo header fields", NULL },
187 { "no-header", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
188 G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
189 "Don't echo header fields", NULL },
190 { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
191 "Ignore markup in < >", NULL },
192 { "no-markup", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
193 G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
194 "No special handling for markup in < >", NULL },
195 { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
196 "Use file of user-defined typos", NULL },
197 { "no-usertypo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
198 G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
199 "Ignore file of user-defined typos", NULL },
200 { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
201 "Verbose - list everything", NULL },
202 { "no-verbose", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
203 G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
204 "Switch off verbose mode", NULL },
205 { "charset", 0, 0, G_OPTION_ARG_STRING, &opt_charset,
206 "Set of characters valid for this ebook", "NAME" },
211 * Options relating to configuration which make no sense from inside
212 * a configuration file.
/*
 * Command-line-only switches: --web sets pswit[WEB_SWITCH] and
 * --dump-config sets pswit[DUMP_CONFIG_SWITCH].  They live in their own
 * table because the configuration-file code iterates options[] only.
 */
215 static GOptionEntry config_options[]={
216 { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
217 "Defaults for use on www upload", NULL },
218 { "dump-config", 0, 0, G_OPTION_ARG_NONE, pswit+DUMP_CONFIG_SWITCH,
219 "Dump current config settings", NULL },
/*
 * Gutcheck-compatible toggle switches.  -t records its presence in
 * typo_compat and -x in paranoid_compat; parse_options() inverts
 * pswit[TYPO_SWITCH] and pswit[PARANOID_SWITCH] afterwards
 * (presumably conditional on these two flags -- confirm in parse_options).
 */
223 static GOptionEntry compatibility_options[]={
224 { "toggle-typo", 't', 0, G_OPTION_ARG_NONE, &typo_compat,
225 "Toggle checking for common typos", NULL },
226 { "toggle-relaxed", 'x', 0, G_OPTION_ARG_NONE, &paranoid_compat,
227 "Toggle both paranoid mode and common typos", NULL },
231 long cnt_dquot; /* for overview mode, count of doublequote queries */
232 long cnt_squot; /* for overview mode, count of singlequote queries */
233 long cnt_brack; /* for overview mode, count of brackets queries */
234 long cnt_bin; /* for overview mode, count of non-ASCII queries */
235 long cnt_odd; /* for overview mode, count of odd character queries */
236 long cnt_long; /* for overview mode, count of long line errors */
237 long cnt_short; /* for overview mode, count of short line queries */
238 long cnt_punct; /* for overview mode,
239 count of punctuation and spacing queries */
240 long cnt_dash; /* for overview mode, count of dash-related queries */
241 long cnt_word; /* for overview mode, count of word queries */
242 long cnt_html; /* for overview mode, count of html queries */
243 long cnt_lineend; /* for overview mode, count of line-end queries */
244 long cnt_spacend; /* count of lines with space at end */
245 long linecnt; /* count of total lines in the file */
246 long checked_linecnt; /* count of lines actually checked */
248 void proghelp(GOptionContext *context);
249 void procfile(const char *);
253 gboolean mixdigit(const char *);
254 gchar *getaword(const char **);
255 char *flgets(char **,long);
256 void postprocess_for_HTML(char *);
257 char *linehasmarkup(char *);
258 char *losemarkup(char *);
259 gboolean tagcomp(const char *,const char *);
260 void loseentities(char *);
261 gboolean isroman(const char *);
262 void postprocess_for_DP(char *);
263 void print_as_windows_1252(const char *string);
264 void print_as_utf_8(const char *string);
266 GTree *qword,*qperiod;
/*
 * set_charset:
 * @name: character set name, "auto", or NULL.
 * @err: return location for a GError.
 *
 * Select the character set the e-text is validated against.  Any open
 * charset_validator is closed first.  NULL or "auto" (any letter case)
 * leaves the validator unset.  Every Unicode encoding alias is stored in
 * the global charset as "UTF-8", which needs no validator.  Any other
 * name opens a converter from UTF-8 to that character set; if none can
 * be opened, @err is set to G_CONVERT_ERROR_NO_CONVERSION.
 *
 * Callers test the result with !set_charset(...), i.e. a false return
 * signals failure.
 */
272 gboolean set_charset(const char *name,GError **err)
274 /* The various UNICODE encodings all share the same character set. */
275 const char *unicode_aliases[]={ "UCS-2", "UCS-2BE", "UCS-2LE", "UCS-4",
276 "UCS-4BE", "UCS-4LE", "UCS2", "UCS4", "UNICODE", "UNICODEBIG",
277 "UNICODELITTLE", "UTF-7", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE",
278 "UTF-32", "UTF-32BE", "UTF-32LE", "UTF7", "UTF8", "UTF16", "UTF16BE",
279 "UTF16LE", "UTF32", "UTF32BE", "UTF32LE" };
283 if (charset_validator!=(GIConv)-1)
284 g_iconv_close(charset_validator);
/* Charset names are ASCII: compare without regard to the locale. */
285 if (!name || !g_ascii_strcasecmp(name,"auto"))
288 charset_validator=(GIConv)-1;
292 charset=g_strdup(name);
293 for(i=0;i<G_N_ELEMENTS(unicode_aliases);i++)
294 if (!g_ascii_strcasecmp(charset,unicode_aliases[i]))
297 charset=g_strdup("UTF-8");
300 if (!strcmp(charset,"UTF-8"))
301 charset_validator=(GIConv)-1;
/* g_iconv_open() takes the target codeset first, the source second. */
304 charset_validator=g_iconv_open(charset,"UTF-8");
305 if (charset_validator==(GIConv)-1)
307 g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_NO_CONVERSION,
308 "Unknown character set \"%s\"",charset);
/*
 * config_file_update:
 * @kf: key file to update.
 *
 * Walk options[] and store the current value of each G_OPTION_ARG_NONE
 * entry as a boolean in the "options" group of @kf, keyed by the entry's
 * long name.  Entries whose long name starts with "no-" and entries
 * flagged G_OPTION_FLAG_REVERSE are tested for explicitly.
 * NOTE(review): presumably "no-" entries are skipped and reversed
 * entries have their value inverted before being written -- confirm.
 * An entry of any other argument type hits g_assert_not_reached().
 */
317 void config_file_update(GKeyFile *kf)
321 for(i=0;options[i].long_name;i++)
323 if (g_str_has_prefix(options[i].long_name,"no-"))
325 if (options[i].arg==G_OPTION_ARG_NONE)
327 sw=*(gboolean *)options[i].arg_data;
328 if (options[i].flags&G_OPTION_FLAG_REVERSE)
330 g_key_file_set_boolean(kf,"options",options[i].long_name,sw);
333 g_assert_not_reached();
/*
 * config_file_add_comments:
 * @kf: key file to annotate.
 *
 * Put a top-of-file comment on @kf, then walk options[] and attach each
 * entry's description (prefixed with one space) as the comment of the
 * same-named key in the "options" group.  Long names starting with "no-"
 * are tested for explicitly.
 * NOTE(review): presumably those entries are skipped -- confirm.
 */
337 void config_file_add_comments(GKeyFile *kf)
341 g_key_file_set_comment(kf,NULL,NULL," Default configuration for bookloupe",
343 for(i=0;options[i].long_name;i++)
345 if (g_str_has_prefix(options[i].long_name,"no-"))
347 comment=g_strconcat(" ",options[i].description,NULL);
348 g_key_file_set_comment(kf,"options",options[i].long_name,comment,NULL);
/*
 * dump_config:
 *
 * Serialise the current switch settings as key-file text.  The global
 * config is either refreshed in place with config_file_update(), or
 * created with g_key_file_new() and then filled with both values and
 * comments.
 * NOTE(review): presumably the choice depends on whether a configuration
 * file was loaded earlier -- confirm.
 */
353 void dump_config(void)
357 config_file_update(config);
360 config=g_key_file_new();
361 config_file_update(config);
362 config_file_add_comments(config);
364 s=g_key_file_to_data(config,NULL,NULL);
/*
 * read_config_file:
 * @full_path: NOTE(review): presumably receives the path of the file that
 *             was loaded; parse_config_file() prints it in diagnostics.
 *
 * Look for "bookloupe.ini" in a list of directories and load it into a
 * GKeyFile, keeping comments and translations.  The directory list comes
 * from the BOOKLOUPE_CONFIG_PATH environment variable, split on ";" or on
 * ":" (presumably chosen per platform -- confirm); otherwise it is the
 * current directory, running_from and the user configuration directory,
 * in that order.  A load error other than "file not found" is reported on
 * stderr together with the offending path.
 */
370 GKeyFile *read_config_file(gchar **full_path)
376 const char *search_path;
379 search_path=g_getenv("BOOKLOUPE_CONFIG_PATH");
383 search_dirs=g_strsplit(search_path,";",0);
385 search_dirs=g_strsplit(search_path,":",0);
390 search_dirs=g_new(gchar *,4);
391 search_dirs[0]=g_get_current_dir();
392 search_dirs[1]=g_strdup(running_from);
393 search_dirs[2]=g_strdup(g_get_user_config_dir());
396 for(i=0;search_dirs[i];i++)
398 path=g_build_filename(search_dirs[i],"bookloupe.ini",NULL);
399 if (g_key_file_load_from_file(kf,path,
400 G_KEY_FILE_KEEP_COMMENTS|G_KEY_FILE_KEEP_TRANSLATIONS,&err))
402 if (!g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
404 g_printerr("Bookloupe: Error reading %s\n",path);
405 g_printerr("%s\n",err->message);
417 g_strfreev(search_dirs);
/*
 * parse_config_file:
 *
 * Load the configuration file through read_config_file() and apply each
 * key of its "options" group to the matching entry of options[].  A key
 * is matched against the entries' long names; entries whose long name
 * starts with "no-" are tested for before the comparison.  For a
 * G_OPTION_ARG_NONE entry the boolean value is read and written through
 * the entry's arg_data pointer, with G_OPTION_FLAG_REVERSE tested for
 * first (presumably to invert the value -- confirm).  A value that cannot
 * be read, and a key that matches no entry, are both reported on stderr.
 * An entry of any other argument type hits g_assert_not_reached().
 */
425 void parse_config_file(void)
432 config=read_config_file(&path);
434 keys=g_key_file_get_keys(config,"options",NULL,NULL);
441 for(j=0;options[j].long_name;j++)
443 if (g_str_has_prefix(options[j].long_name,"no-"))
445 else if (!strcmp(keys[i],options[j].long_name))
447 if (options[j].arg==G_OPTION_ARG_NONE)
449 sw=g_key_file_get_boolean(config,"options",keys[i],
453 g_printerr("Bookloupe: %s: options.%s: %s\n",
454 path,keys[i],err->message);
457 if (options[j].flags&G_OPTION_FLAG_REVERSE)
459 *(gboolean *)options[j].arg_data=sw;
463 g_assert_not_reached();
466 if (!options[j].long_name)
467 g_printerr("Bookloupe: %s: Unknown option \"%s\" ignored\n",
/*
 * parse_options:
 * @argc: address of main()'s argument count.
 * @argv: address of main()'s argument vector.
 *
 * Build a GOptionContext from options[], config_options[] and the
 * "compatibility" group, then parse the command line.  A parse failure
 * is reported on stderr with a hint to use --help.  After parsing:
 *  - pswit[TYPO_SWITCH] and pswit[PARANOID_SWITCH] are inverted
 *    (presumably when the gutcheck toggle flags were given -- confirm);
 *  - --web overrides every listed switch with fixed upload defaults;
 *  - a --charset value is applied with set_charset(), its error message
 *    being printed on failure;
 *  - pswit[DUMP_CONFIG_SWITCH] is acted upon;
 *  - overview mode switches line echoing off.
 * The option context is freed before returning.
 */
476 void parse_options(int *argc,char ***argv)
479 GOptionContext *context;
480 GOptionGroup *compatibility;
481 context=g_option_context_new(
482 "file - look for errors in Project Gutenberg(TM) etexts");
483 g_option_context_add_main_entries(context,options,NULL);
484 g_option_context_add_main_entries(context,config_options,NULL);
485 compatibility=g_option_group_new("compatibility",
486 "Options for Compatibility with Gutcheck:",
487 "Show compatibility options",NULL,NULL);
488 g_option_group_add_entries(compatibility,compatibility_options);
489 g_option_context_add_group(context,compatibility);
490 g_option_context_set_description(context,
491 "For simplicity, only the switch options which reverse the\n"
492 "default configuration are listed. In most cases, both vanilla\n"
493 "and \"no-\" prefixed versions are available for use.");
494 if (!g_option_context_parse(context,argc,argv,&err))
496 g_printerr("Bookloupe: %s\n",err->message);
497 g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
501 pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
504 pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
505 pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
508 * Web uploads - for the moment, this is really just a placeholder
509 * until we decide what processing we really want to do on web uploads
511 if (pswit[WEB_SWITCH])
513 /* specific override for web uploads */
514 pswit[ECHO_SWITCH]=TRUE;
515 pswit[SQUOTE_SWITCH]=FALSE;
516 pswit[TYPO_SWITCH]=TRUE;
517 pswit[QPARA_SWITCH]=FALSE;
518 pswit[PARANOID_SWITCH]=TRUE;
519 pswit[LINE_END_SWITCH]=FALSE;
520 pswit[OVERVIEW_SWITCH]=FALSE;
521 pswit[STDOUT_SWITCH]=FALSE;
522 pswit[HEADER_SWITCH]=TRUE;
523 pswit[VERBOSE_SWITCH]=FALSE;
524 pswit[MARKUP_SWITCH]=FALSE;
525 pswit[USERTYPO_SWITCH]=FALSE;
526 pswit[DP_SWITCH]=FALSE;
528 if (opt_charset && !set_charset(opt_charset,&err))
530 g_printerr("%s\n",err->message);
535 if (pswit[DUMP_CONFIG_SWITCH])
540 if (pswit[OVERVIEW_SWITCH])
541 /* just print summary; don't echo */
542 pswit[ECHO_SWITCH]=FALSE;
548 g_option_context_free(context);
554 * Read in the user-defined stealth scanno list.
/*
 * read_user_scannos:
 *
 * Load the user's typo list into the usertypo tree.  Candidate files are
 * tried in this order, each one after a "file not found" result:
 * "bookloupe.typ", then the same name inside running_from, then
 * "gutcheck.typ", then that name inside running_from.  A final "file not
 * found" prints a notice that processing continues without user typos;
 * another error is printed on stderr with the file name.
 *
 * Contents that are valid UTF-8 are normalised to composed form and the
 * charset is set to "UNICODE"; anything else is converted from
 * WINDOWS-1252.  The text is split on CR and LF, and every line whose
 * first byte is greater than '!' becomes a key of usertypo (the tree
 * frees its keys with g_free).
 */
556 void read_user_scannos(void)
559 gchar *usertypo_file;
563 gchar *contents,*utf8,**lines;
564 usertypo_file=g_strdup("bookloupe.typ");
565 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
566 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
569 g_free(usertypo_file);
570 usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
571 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
573 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
576 g_free(usertypo_file);
577 usertypo_file=g_strdup("gutcheck.typ");
578 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
580 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
583 g_free(usertypo_file);
584 usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
585 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
587 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
589 g_free(usertypo_file);
590 g_print(" --> I couldn't find bookloupe.typ "
591 "-- proceeding without user typos.\n");
596 fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
597 g_free(usertypo_file);
601 if (g_utf8_validate(contents,len,NULL))
603 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
605 (void)set_charset("UNICODE",NULL);
608 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
610 lines=g_strsplit_set(utf8,"\r\n",0);
612 usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
613 for (i=0;lines[i];i++)
614 if (*(unsigned char *)lines[i]>'!')
615 g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
624 * Read an etext returning a newly allocated string containing the file
625 * contents or NULL on error.
/*
 * read_etext:
 * @filename: file to load.
 * @err: return location for a GError.
 *
 * Load @filename and produce its text as UTF-8.  Contents that validate
 * as UTF-8 are normalised to composed form and g_print() output is routed
 * through print_as_utf_8().  Otherwise the contents are converted from
 * WINDOWS-1252 and output is routed through print_as_windows_1252().
 *
 * When the conversion stops on an illegal sequence, the bytes before the
 * failure point are scanned for '\n' and non-'\r' characters to work out
 * the line and column, and @err describes the offending byte.  Other
 * conversion errors are propagated to @err unchanged.
 *
 * NOTE(review): the SetConsoleOutputCP() calls are presumably compiled
 * for Windows only -- confirm.
 */
627 gchar *read_etext(const char *filename,GError **err)
629 GError *tmp_err=NULL;
630 gchar *contents,*utf8;
631 gsize len,bytes_read,bytes_written;
633 if (!g_file_get_contents(filename,&contents,&len,err))
635 if (g_utf8_validate(contents,len,NULL))
637 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
638 g_set_print_handler(print_as_utf_8);
640 SetConsoleOutputCP(CP_UTF8);
645 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,
646 &bytes_written,&tmp_err);
647 if (g_error_matches(tmp_err,G_CONVERT_ERROR,
648 G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
651 for(i=0;i<bytes_read;i++)
652 if (contents[i]=='\n')
657 else if (contents[i]!='\r')
659 g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
660 "Input conversion failed. Byte %d at line %d, column %d is not a "
661 "valid Windows-1252 character",
662 ((unsigned char *)contents)[bytes_read],line,col);
665 g_propagate_error(err,tmp_err);
666 g_set_print_handler(print_as_windows_1252);
668 SetConsoleOutputCP(1252);
/*
 * cleanup_on_exit:
 *
 * atexit() handler registered by main(): puts the console output code
 * page back to saved_cp, the value main() read at start-up.
 */
675 void cleanup_on_exit(void)
678 SetConsoleOutputCP(saved_cp);
/*
 * main:
 *
 * Program entry point.  Registers cleanup_on_exit(), remembers the
 * console code page and the directory the program was started from, and
 * pre-sets the four switches whose command-line option turns them off
 * (paranoid, typo, line-end, echo).  The command line is then parsed.
 * In overview mode a summary is printed: lines checked versus total, one
 * count per query category, and the sum of all categories.  Finally the
 * start-up directory string, the user typo tree, the charset state and
 * the configuration key file are released.
 */
682 int main(int argc,char **argv)
685 atexit(cleanup_on_exit);
686 saved_cp=GetConsoleOutputCP();
688 running_from=g_path_get_dirname(argv[0]);
689 /* Paranoid checking is turned OFF, not on, by its switch */
690 pswit[PARANOID_SWITCH]=TRUE;
691 /* if running in paranoid mode, typo checks default to enabled */
692 pswit[TYPO_SWITCH]=TRUE;
693 /* Line-end checking is turned OFF, not on, by its switch */
694 pswit[LINE_END_SWITCH]=TRUE;
695 /* Echoing is turned OFF, not on, by its switch */
696 pswit[ECHO_SWITCH]=TRUE;
698 parse_options(&argc,&argv);
699 if (pswit[USERTYPO_SWITCH])
701 fprintf(stderr,"bookloupe: Check and report on an e-text\n");
703 if (pswit[OVERVIEW_SWITCH])
705 g_print(" Checked %ld lines of %ld (head+foot = %ld)\n\n",
706 checked_linecnt,linecnt,linecnt-checked_linecnt);
707 g_print(" --------------- Queries found --------------\n");
709 g_print(" Long lines: %14ld\n",cnt_long);
711 g_print(" Short lines: %14ld\n",cnt_short);
713 g_print(" Line-end problems: %14ld\n",cnt_lineend);
715 g_print(" Common typos: %14ld\n",cnt_word);
717 g_print(" Unmatched quotes: %14ld\n",cnt_dquot);
719 g_print(" Unmatched SingleQuotes: %14ld\n",cnt_squot);
721 g_print(" Unmatched brackets: %14ld\n",cnt_brack);
723 g_print(" Non-ASCII characters: %14ld\n",cnt_bin);
725 g_print(" Proofing characters: %14ld\n",cnt_odd);
727 g_print(" Punctuation & spacing queries: %14ld\n",cnt_punct);
729 g_print(" Non-standard dashes: %14ld\n",cnt_dash);
731 g_print(" Possible HTML tags: %14ld\n",cnt_html);
733 g_print(" TOTAL QUERIES %14ld\n",
734 cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+
735 cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend);
737 g_free(running_from);
739 g_tree_unref(usertypo);
740 set_charset(NULL,NULL);
742 g_key_file_free(config);
749 * Run a first pass - verify that it's a valid PG
750 * file, decide whether to report some things that
751 * occur many times in the text like long or short
752 * lines, non-standard dashes, etc.
/*
 * first_pass:
 * @etext: the whole e-text as one UTF-8 string.
 *
 * Scan the text line by line (split on '\n', trailing '\r' removed) and
 * gather statistics in a function-static first_pass_results structure.
 * The scan looks for old-style ("*END ... SMALL PRINT") and new-style
 * ("*** START ... PROJECT GUTENBERG") header markers and for the footer
 * marker, recording firstline and footerline and warning about
 * duplicates.  Lines after the footer are not counted.  For the others it
 * accumulates the total length, letters preceding a double quote, ".,"
 * occurrences, asterisk and forward-slash lines, trailing hyphens, long
 * and very long lines, HTML-looking markup, spaced and unspaced
 * em-dashes, spaced dashes, a few Dutch/French marker words and
 * standalone "0"/"1" words.
 */
754 struct first_pass_results *first_pass(const char *etext)
756 gunichar laststart=CHAR_SPACE;
761 unsigned int lastlen=0,lastblen=0;
762 long spline=0,nspline=0;
763 static struct first_pass_results results={0};
765 lines=g_strsplit(etext,"\n",0);
766 for (j=0;lines[j];j++)
768 lbytes=strlen(lines[j]);
769 while (lbytes>0 && lines[j][lbytes-1]=='\r')
770 lines[j][--lbytes]='\0';
771 llen=g_utf8_strlen(lines[j],lbytes);
773 if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
774 (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
777 g_print(" --> Duplicate header?\n");
778 spline=linecnt+1; /* first line of non-header text, that is */
780 if (!strncmp(lines[j],"*** START",9) &&
781 strstr(lines[j],"PROJECT GUTENBERG"))
784 g_print(" --> Duplicate header?\n");
785 nspline=linecnt+1; /* first line of non-header text, that is */
787 if (spline || nspline)
789 lc_line=g_utf8_strdown(lines[j],lbytes);
790 if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
792 if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
794 if (results.footerline)
796 /* it's an old-form header - we can detect duplicates */
798 g_print(" --> Duplicate footer?\n");
801 results.footerline=linecnt;
807 results.firstline=spline;
809 results.firstline=nspline; /* override with new */
810 if (results.footerline)
811 continue; /* don't count the boilerplate in the footer */
812 results.totlen+=llen;
813 for (s=lines[j];*s;s=g_utf8_next_char(s))
815 if (g_utf8_get_char(s)>127)
817 if (g_unichar_isalpha(g_utf8_get_char(s)))
/*
 * The preceding character is a full Unicode code point, so it must be
 * classified with g_unichar_isalpha(); isalpha() is only defined for
 * unsigned char values and EOF.
 */
819 if (s>lines[j] && g_utf8_get_char(s)==CHAR_DQUOTE &&
820 g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
821 results.endquote_count++;
823 if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
824 lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
827 g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
829 if (strstr(lines[j],".,"))
831 /* only count asterisk lines for ignoring purposes where there is */
832 /* lower-case text on the line */
833 if (strchr(lines[j],'*'))
835 for (s=lines[j];*s;s=g_utf8_next_char(s))
836 if (g_unichar_islower(g_utf8_get_char(s)))
841 if (strchr(lines[j],'/'))
842 results.fslashline++;
845 for (s=g_utf8_prev_char(lines[j]+lbytes);
846 s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;
847 s=g_utf8_prev_char(s))
849 if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
850 g_utf8_get_char(g_utf8_prev_char(s))!='-')
853 if (llen>LONGEST_PG_LINE)
855 if (llen>WAY_TOO_LONG)
856 results.verylongline++;
857 if (strchr(lines[j],'<') && strchr(lines[j],'>'))
859 i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
862 if (strstr(lines[j],"<i>"))
863 results.htmcount+=4; /* bonus marks! */
865 /* Check for spaced em-dashes */
866 if (lines[j][0] && (s=strstr(g_utf8_next_char(lines[j]),"--")))
869 if (s[-1]==CHAR_SPACE || s[2]==CHAR_SPACE)
870 results.space_emdash++;
871 if (s[-1]==CHAR_SPACE && s[2]==CHAR_SPACE)
872 /* count of em-dashes with spaces both sides */
873 results.non_PG_space_emdash++;
874 if (s[-1]!=CHAR_SPACE && s[2]!=CHAR_SPACE)
875 /* count of PG-type em-dashes with no spaces */
876 results.PG_space_emdash++;
881 if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))
882 results.Dutchcount++;
883 if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))
884 results.Frenchcount++;
885 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
886 results.standalone_digit++;
889 /* Check for spaced dashes */
890 if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
894 laststart=lines[j][0];
903 * Make some snap decisions based on the first pass results.
/*
 * report_first_pass:
 * @results: statistics gathered by first_pass().
 *
 * Turn the first-pass statistics into reporting decisions held in a
 * function-static struct warnings, printing a " --> " notice for each
 * decision.  Categories that occur too often are announced as "Not
 * reporting them"; the thresholds tested are: ".," on more than 5 lines;
 * short or long lines above 50 or above one-tenth of linecnt; asterisk
 * and forward-slash lines above 10; unpunctuated endquotes above 20;
 * standalone digits above 10; end-of-line hyphens above 20.
 * NOTE(review): the inline comment for standalone digits says 15 but
 * the code tests >10 -- one of the two should be corrected.
 *
 * It also switches pswit[MARKUP_SWITCH] on when htmcount exceeds 20,
 * announces very long lines, compares spaced dashes with PG-style
 * em-dashes, checks the share of hi-bit and alphabetic characters,
 * sets isDutch/isFrench when the marker-word counts exceed 50, and
 * reports which of the PG header and footer were found.  Verbose mode
 * turns warnings back on.  When header and footer are both present but
 * fewer than 100 lines apart, firstline is reset to 0.
 */
905 struct warnings *report_first_pass(struct first_pass_results *results)
907 static struct warnings warnings={0};
909 g_print(" --> %ld lines in this file have white space at end\n",
912 if (results->dotcomma>5)
915 g_print(" --> %ld lines in this file contain '.,'. "
916 "Not reporting them.\n",results->dotcomma);
919 * If more than 50 lines, or one-tenth, are short,
920 * don't bother reporting them.
922 warnings.shortline=1;
923 if (results->shortline>50 || results->shortline*10>linecnt)
925 warnings.shortline=0;
926 g_print(" --> %ld lines in this file are short. "
927 "Not reporting short lines.\n",results->shortline);
930 * If more than 50 lines, or one-tenth, are long,
931 * don't bother reporting them.
934 if (results->longline>50 || results->longline*10>linecnt)
937 g_print(" --> %ld lines in this file are long. "
938 "Not reporting long lines.\n",results->longline);
940 /* If more than 10 lines contain asterisks, don't bother reporting them. */
942 if (results->astline>10)
945 g_print(" --> %ld lines in this file contain asterisks. "
946 "Not reporting them.\n",results->astline);
949 * If more than 10 lines contain forward slashes,
950 * don't bother reporting them.
953 if (results->fslashline>10)
956 g_print(" --> %ld lines in this file contain forward slashes. "
957 "Not reporting them.\n",results->fslashline);
960 * If more than 20 lines contain unpunctuated endquotes,
961 * don't bother reporting them.
964 if (results->endquote_count>20)
967 g_print(" --> %ld lines in this file contain unpunctuated endquotes. "
968 "Not reporting them.\n",results->endquote_count);
971 * If more than 15 lines contain standalone digits,
972 * don't bother reporting them.
975 if (results->standalone_digit>10)
978 g_print(" --> %ld lines in this file contain standalone 0s and 1s. "
979 "Not reporting them.\n",results->standalone_digit);
982 * If more than 20 lines contain hyphens at end,
983 * don't bother reporting them.
986 if (results->hyphens>20)
989 g_print(" --> %ld lines in this file have hyphens at end. "
990 "Not reporting them.\n",results->hyphens);
992 if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
994 g_print(" --> Looks like this is HTML. Switching HTML mode ON.\n");
995 pswit[MARKUP_SWITCH]=1;
997 if (results->verylongline>0)
998 g_print(" --> %ld lines in this file are VERY long!\n",
999 results->verylongline);
1001 * If there are more non-PG spaced dashes than PG em-dashes,
1002 * assume it's deliberate.
1003 * Current PG guidelines say don't use them, but older texts do,
1004 * and some people insist on them whatever the guidelines say.
1007 if (results->spacedash+results->non_PG_space_emdash>
1008 results->PG_space_emdash)
1011 g_print(" --> There are %ld spaced dashes and em-dashes. "
1012 "Not reporting them.\n",
1013 results->spacedash+results->non_PG_space_emdash);
1019 /* Charset ISO_8859-1/ASCII checks for compatibility with gutcheck */
1021 /* If more than a quarter of characters are hi-bit, bug out. */
1022 if (results->binlen*4>results->totlen)
1024 g_print(" --> This file does not appear to be ASCII. "
1025 "Terminating. Best of luck with it!\n");
1028 if (results->alphalen*4<results->totlen)
1030 g_print(" --> This file does not appear to be text. "
1031 "Terminating. Best of luck with it!\n");
1034 if (results->binlen*100>results->totlen || results->binlen>100)
1036 g_print(" --> There are a lot of foreign letters here. "
1037 "Not reporting them.\n");
1038 if (!pswit[VERBOSE_SWITCH])
1042 warnings.isDutch=FALSE;
1043 if (results->Dutchcount>50)
1045 warnings.isDutch=TRUE;
1046 g_print(" --> This looks like Dutch - "
1047 "switching off dashes and warnings for 's Middags case.\n");
1049 warnings.isFrench=FALSE;
1050 if (results->Frenchcount>50)
1052 warnings.isFrench=TRUE;
1053 g_print(" --> This looks like French - "
1054 "switching off some doublepunct.\n");
1056 if (results->firstline && results->footerline)
1057 g_print(" The PG header and footer appear to be already on.\n");
1060 if (results->firstline)
1061 g_print(" The PG header is on - no footer.\n");
1062 if (results->footerline)
1063 g_print(" The PG footer is on - no header.\n");
1066 if (pswit[VERBOSE_SWITCH])
1068 warnings.shortline=1;
1069 warnings.dotcomma=1;
1070 warnings.longline=1;
1076 warnings.endquote=1;
1077 g_print(" *** Verbose output is ON -- you asked for it! ***\n");
1079 if (warnings.isDutch)
1081 if (results->footerline>0 && results->firstline>0 &&
1082 results->footerline>results->firstline &&
1083 results->footerline-results->firstline<100)
1085 g_print(" --> I don't really know where this text starts. \n");
1086 g_print(" There are no reference points.\n");
1087 g_print(" I'm going to have to report the header and footer "
1089 results->firstline=0;
1097 * Look along the line, accumulate the count of quotes, and see
1098 * if this is an empty line - i.e. a line with nothing on it
1100 * If line has just spaces, period, * and/or - on it, don't
1101 * count it, since empty lines with asterisks or dashes to
1102 * separate sections are common.
1104 * Returns: TRUE if the line is empty.
/*
 * analyse_quotes:
 * @aline: one line of the text, UTF-8.
 * @counters: running quote and bracket counters, updated in place.
 *
 * Walk @aline one UTF-8 character at a time.  A single-quote character
 * is classified from its neighbours: letters on both sides make it an
 * apostrophe and it is ignored; an explicit opening quote, or one
 * followed by a letter, is counted as opening unless the following text
 * starts with "tis" or "Tis"; otherwise clues are accumulated in
 * guessquote (preceding letter, preceding 's', following space,
 * surrounding punctuation, a single quote already open) before it is
 * counted as closing.
 * NOTE(review): the guessquote value needed to count a closing quote is
 * not stated here -- confirm against the comparison in the body.
 *
 * Underscores are tallied in c_unders.  Square, curly and round brackets
 * are counted as opening or closing; a "[Illustration:" at the very
 * start of the line, and a ']' at the very end of a line, are tracked
 * under COUNTER_ILLUSTRATION instead when the related counters allow it.
 * A character other than space, '-', '.' or asterisk (plus any further
 * exclusions in the same test) marks the line as not empty.
 */
1106 gboolean analyse_quotes(const char *aline,struct counters *counters)
1109 /* assume the line is empty until proven otherwise */
1110 gboolean isemptyline=TRUE;
1111 const char *s=aline,*sprev,*snext;
1116 snext=g_utf8_next_char(s);
1117 c=g_utf8_get_char(s);
1120 if (CHAR_IS_SQUOTE(c))
1125 * At start of line, it can only be an openquote.
1126 * Hardcode a very common exception!
1128 if (!g_str_has_prefix(snext,"tis") &&
1129 !g_str_has_prefix(snext,"Tis"))
1130 increment_matching(counters,c,TRUE);
1132 else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
1133 g_unichar_isalpha(g_utf8_get_char(snext)))
1134 /* Do nothing! it's definitely an apostrophe, not a quote */
1136 /* it's outside a word - let's check it out */
1137 else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||
1138 g_unichar_isalpha(g_utf8_get_char(snext)))
1140 /* it damwell better BE an openquote */
1141 if (!g_str_has_prefix(snext,"tis") &&
1142 !g_str_has_prefix(snext,"Tis"))
1143 /* hardcode a very common exception! */
1144 increment_matching(counters,c,TRUE);
1148 /* now - is it a closequote? */
1149 guessquote=0; /* accumulate clues */
1150 if (g_unichar_isalpha(g_utf8_get_char(sprev)))
1152 /* it follows a letter - could be either */
1154 if (g_utf8_get_char(sprev)=='s')
1156 /* looks like a plural apostrophe */
1158 if (g_utf8_get_char(snext)==CHAR_SPACE)
1163 /* it doesn't have a letter either side */
1164 else if (strchr(".?!,;:",g_utf8_get_char(sprev)) &&
1165 strchr(".?!,;: ",g_utf8_get_char(snext)))
1166 guessquote+=8; /* looks like a closequote */
1169 if (matching_difference(counters,CHAR_SQUOTE)>0)
1171 * Give it the benefit of some doubt,
1172 * if a squote is already open.
1178 increment_matching(counters,c,FALSE);
1181 if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
1183 isemptyline=FALSE; /* ignore lines like * * * as spacers */
1184 if (c==CHAR_UNDERSCORE)
1185 counters->c_unders++;
1186 if (c==CHAR_OPEN_SBRACK)
1188 if (!matching_difference(counters,COUNTER_ILLUSTRATION) &&
1189 !matching_difference(counters,c) && s==aline &&
1190 g_str_has_prefix(s,"[Illustration:"))
1191 increment_matching(counters,COUNTER_ILLUSTRATION,TRUE);
1193 increment_matching(counters,c,TRUE);
1195 else if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK)
1196 increment_matching(counters,c,TRUE);
1197 if (c==CHAR_CLOSE_SBRACK)
1199 if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) &&
1200 !matching_difference(counters,c) && !*snext)
1201 increment_matching(counters,COUNTER_ILLUSTRATION,FALSE);
1203 increment_matching(counters,c,FALSE);
1205 else if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK)
1206 increment_matching(counters,c,FALSE);
1214 * check_for_control_characters:
1216 * Check for invalid or questionable characters in the line
1217 * Anything above 127 is invalid for plain ASCII, and
1218 * non-printable control characters should also be flagged.
1219 * Tabs should generally not be there.
1221 void check_for_control_characters(const char *aline)
/*
 * Scans aline for code points below CHAR_SPACE other than LF, CR and TAB.
 * For such a character the line is echoed when pswit[ECHO_SWITCH] is set and
 * a "Control character" message with line and column is printed unless
 * pswit[OVERVIEW_SWITCH] is set.
 */
1225 for (s=aline;*s;s=g_utf8_next_char(s))
1227 c=g_utf8_get_char(s);
1228 if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
1230 if (pswit[ECHO_SWITCH])
1231 g_print("\n%s\n",aline);
1232 if (!pswit[OVERVIEW_SWITCH])
1233 g_print(" Line %ld column %ld - Control character %u\n",
/* g_utf8_pointer_to_offset() takes (start of string, position). */
1234 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1242 * check_for_odd_characters:
1244 * Check for binary and other odd characters.
1246 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
1247 gboolean isemptyline)
/*
 * Scans aline for characters that deserve a query: non-ASCII / non-ISO-8859
 * code points (when warnings->bin is set), code points that are unassigned,
 * private-use or not convertible with charset_validator (when a charset is
 * set), and tab, tilde, carat, forward slash and asterisk. The e* flags
 * below limit each kind of warning to once per line. Each report echoes the
 * line under pswit[ECHO_SWITCH] and prints its message unless
 * pswit[OVERVIEW_SWITCH] is set.
 */
1249 /* Don't repeat multiple warnings on one line. */
1250 gboolean eInvalidChar=FALSE,eTab=FALSE,eTilde=FALSE;
1251 gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
1256 for (s=aline;*s;s=g_utf8_next_char(s))
1258 c=g_utf8_get_char(s);
1259 if (warnings->bin && !eInvalidChar &&
1260 (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
1262 if (pswit[ECHO_SWITCH])
1263 g_print("\n%s\n",aline);
1264 if (!pswit[OVERVIEW_SWITCH])
1265 if (c>127 && c<160 || c>255)
1266 g_print(" Line %ld column %ld - "
1267 "Non-ISO-8859 character %u\n",
1268 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1270 g_print(" Line %ld column %ld - "
1271 "Non-ASCII character %u\n",
1272 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1277 if (!eInvalidChar && charset)
/* (GIConv)-1 means no converter is open for the charset. */
1279 if (charset_validator==(GIConv)-1)
1281 if (!g_unichar_isdefined(c))
1283 if (pswit[ECHO_SWITCH])
1284 g_print("\n%s\n",aline);
1285 if (!pswit[OVERVIEW_SWITCH])
1286 g_print(" Line %ld column %ld - Unassigned UNICODE "
1287 "code point U+%04" G_GINT32_MODIFIER "X\n",
1288 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
/*
 * Private Use Areas: U+E000..U+F8FF, plane 15 (U+F0000..U+FFFFD) and
 * plane 16 (U+100000..U+10FFFD). The plane 16 bound must be hexadecimal;
 * decimal 100000 is U+186A0 and would sweep in assigned characters.
 */
1293 else if (c>=0xE000 && c<=0xF8FF || c>=0xF0000 && c<=0xFFFFD ||
1294 c>=0x100000 && c<=0x10FFFD)
1296 if (pswit[ECHO_SWITCH])
1297 g_print("\n%s\n",aline);
1298 if (!pswit[OVERVIEW_SWITCH])
1299 g_print(" Line %ld column %ld - Private Use "
1300 "character U+%04" G_GINT32_MODIFIER "X\n",
1301 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
/* Try converting just this one character with the charset converter. */
1309 t=g_convert_with_iconv(s,g_utf8_next_char(s)-s,
1310 charset_validator,NULL,&nb,NULL);
1315 if (pswit[ECHO_SWITCH])
1316 g_print("\n%s\n",aline);
1317 if (!pswit[OVERVIEW_SWITCH])
1318 g_print(" Line %ld column %ld - Non-%s "
1319 "character %u\n",linecnt,
1320 g_utf8_pointer_to_offset(aline,s)+1,charset,c);
1327 if (!eTab && c==CHAR_TAB)
1329 if (pswit[ECHO_SWITCH])
1330 g_print("\n%s\n",aline);
1331 if (!pswit[OVERVIEW_SWITCH])
1332 g_print(" Line %ld column %ld - Tab character?\n",
1333 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1338 if (!eTilde && c==CHAR_TILDE)
1341 * Often used by OCR software to indicate an
1342 * unrecognizable character.
1344 if (pswit[ECHO_SWITCH])
1345 g_print("\n%s\n",aline);
1346 if (!pswit[OVERVIEW_SWITCH])
1347 g_print(" Line %ld column %ld - Tilde character?\n",
1348 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1353 if (!eCarat && c==CHAR_CARAT)
1355 if (pswit[ECHO_SWITCH])
1356 g_print("\n%s\n",aline);
1357 if (!pswit[OVERVIEW_SWITCH])
1358 g_print(" Line %ld column %ld - Carat character?\n",
1359 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1364 if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
1366 if (pswit[ECHO_SWITCH])
1367 g_print("\n%s\n",aline);
1368 if (!pswit[OVERVIEW_SWITCH])
1369 g_print(" Line %ld column %ld - Forward slash?\n",
1370 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1376 * Report asterisks only in paranoid mode,
1377 * since they're often deliberate.
1379 if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
1382 if (pswit[ECHO_SWITCH])
1383 g_print("\n%s\n",aline);
1384 if (!pswit[OVERVIEW_SWITCH])
1385 g_print(" Line %ld column %ld - Asterisk?\n",
1386 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1395 * check_for_long_line:
1397 * Check for line too long.
1399 void check_for_long_line(const char *aline)
/*
 * Queries aline when its length in characters (not bytes) exceeds
 * LONGEST_PG_LINE. The line is echoed under pswit[ECHO_SWITCH]; the message
 * is printed unless pswit[OVERVIEW_SWITCH] is set.
 */
1401 if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
1403 if (pswit[ECHO_SWITCH])
1404 g_print("\n%s\n",aline);
1405 if (!pswit[OVERVIEW_SWITCH])
/* The character count serves as both the column and the reported length. */
1406 g_print(" Line %ld column %ld - Long line %ld\n",
1407 linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
1414 * check_for_short_line:
1416 * Check for line too short.
1418 * This one is a bit trickier to implement: we don't want to
1419 * flag the last line of a paragraph for being short, so we
1420 * have to wait until we know that our current line is a
1421 * "normal" line, then report the _previous_ line if it was too
1422 * short. We also don't want to report indented lines like
1423 * chapter heads or formatted quotations. We therefore keep
1424 * last->len as the length of the last line examined, and
1425 * last->blen as the length of the last but one, and try to
1426 * suppress unnecessary warnings by checking that both were of
1427 * "normal" length. We keep the first character of the last
1428 * line in last->start, and if it was a space, we assume that
1429 * the formatting is deliberate. I can't figure out a way to
1430 * distinguish something like a quoted verse left-aligned or
1431 * the header or footer of a letter from a paragraph of short
1432 * lines - maybe if I examined the whole paragraph, and if the
1433 * para has less than, say, 8 lines and if all lines are short,
1434 * then just assume it's OK? Need to look at some texts to see
1435 * how often a formula like this would get the right result.
1437 void check_for_short_line(const char *aline,const struct line_properties *last)
/*
 * Queries the previous line (prevline, line number linecnt-1), not aline.
 * It fires when aline has more than one character, last->len lies between
 * 2 and SHORTEST_PG_LINE-1, last->blen exceeds SHORTEST_PG_LINE and
 * last->start is not a space.
 */
1439 if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
1440 last->len<SHORTEST_PG_LINE && last->blen>1 &&
1441 last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
1443 if (pswit[ECHO_SWITCH])
1444 g_print("\n%s\n",prevline);
1445 if (!pswit[OVERVIEW_SWITCH])
/* The length of prevline is used both as column and as reported length. */
1446 g_print(" Line %ld column %ld - Short line %ld?\n",
1447 linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
1454 * check_for_starting_punctuation:
1456 * Look for punctuation other than full ellipses at start of line.
1458 void check_for_starting_punctuation(const char *aline)
/*
 * Queries a non-empty line whose first character is one of . ? ! , ; :
 * unless the line starts with the spaced ellipsis ". . .".
 */
1460 if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
1461 !g_str_has_prefix(aline,". . ."))
1463 if (pswit[ECHO_SWITCH])
1464 g_print("\n%s\n",aline);
1465 if (!pswit[OVERVIEW_SWITCH])
1466 g_print(" Line %ld column 1 - Begins with punctuation?\n",
1474 * check_for_spaced_emdash:
1476 * Check for spaced em-dashes.
1478 * We must check _all_ occurrences of "--" on the line
1479 * hence the loop - even if the first double-dash is OK
1480 * there may be another that's wrong later on.
1482 void check_for_spaced_emdash(const char *aline)
/*
 * Visits every "--" in aline and queries those with a space immediately
 * before or immediately after. The reported column is that of the first
 * dash.
 */
1484 const char *s,*t,*next;
/* t is the current "--"; the search resumes at next, just past it. */
1485 for (s=aline;t=strstr(s,"--");s=next)
1487 next=g_utf8_next_char(g_utf8_next_char(t));
1488 if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
1489 g_utf8_get_char(next)==CHAR_SPACE)
1491 if (pswit[ECHO_SWITCH])
1492 g_print("\n%s\n",aline);
1493 if (!pswit[OVERVIEW_SWITCH])
1494 g_print(" Line %ld column %ld - Spaced em-dash?\n",
1495 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1503 * check_for_spaced_dash:
1505 * Check for spaced dashes.
1507 void check_for_spaced_dash(const char *aline)
/*
 * Looks at the first " -" in aline, or failing that the first "- ", and
 * queries it unless the dash is part of a double dash.
 * NOTE(review): only the first match of each pattern is examined, and "- "
 * is not searched for at all once a " -" exists - confirm this is intended.
 */
1510 if ((s=strstr(aline," -")))
/* Skip " --": the character after the dash must not be another dash. */
1512 if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
1514 if (pswit[ECHO_SWITCH])
1515 g_print("\n%s\n",aline);
1516 if (!pswit[OVERVIEW_SWITCH])
1517 g_print(" Line %ld column %ld - Spaced dash?\n",
1518 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1523 else if ((s=strstr(aline,"- ")))
/* Skip "-- ": the character before the dash must not be another dash. */
1525 if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
1527 if (pswit[ECHO_SWITCH])
1528 g_print("\n%s\n",aline);
1529 if (!pswit[OVERVIEW_SWITCH])
1530 g_print(" Line %ld column %ld - Spaced dash?\n",
1531 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1539 * check_for_unmarked_paragraphs:
1541 * Check for unmarked paragraphs indicated by separate speakers.
1543 * May well be false positive:
1544 * "Bravo!" "Wonderful!" called the crowd.
1545 * but useful all the same.
1547 void check_for_unmarked_paragraphs(const char *aline)
/*
 * Searches aline for a closing double quote followed by whitespace and an
 * opening double quote, and queries a possible missing paragraph break at
 * that position.
 * NOTE(review): the two lookups below read identically here; presumably the
 * second was meant to match a different amount of whitespace - verify.
 */
1550 s=strstr(aline,"\" \"");
1552 s=strstr(aline,"\" \"");
1555 if (pswit[ECHO_SWITCH])
1556 g_print("\n%s\n",aline);
1557 if (!pswit[OVERVIEW_SWITCH])
1558 g_print(" Line %ld column %ld - "
1559 "Query missing paragraph break?\n",
1560 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1567 * check_for_jeebies:
1569 * Check for "to he" and other easy h/b errors.
1571 * This is a very inadequate effort on the h/b problem,
1572 * but the phrase "to he" is always an error, whereas "to
1573 * be" is quite common.
1574 * Similarly, '"Quiet!", be said.' is a non-be error
1575 * "to he" is _not_ always an error!:
1576 * "Where they went to he couldn't say."
1577 * Another false positive:
1578 * What would "Cinderella" be without the . . .
1579 * and another: "If he wants to he can see for himself."
1581 void check_for_jeebies(const char *aline)
/*
 * Searches aline for fixed phrases that suggest an OCR confusion between
 * h and b, in three groups: he/be, had/bad and hut/but. Each group prints
 * its own query at the position of the matched phrase. All matching is
 * done with strstr(), so it is case-sensitive.
 */
1584 s=strstr(aline," be could ");
1586 s=strstr(aline," be would ");
1588 s=strstr(aline," was be ");
1590 s=strstr(aline," be is ");
1592 s=strstr(aline," is be ");
1594 s=strstr(aline,"\", be ");
/*
 * NOTE(review): the next two lookups read identically here; presumably
 * they were meant to differ in whitespace - verify.
 */
1596 s=strstr(aline,"\" be ");
1598 s=strstr(aline,"\" be ");
1600 s=strstr(aline," to he ");
1603 if (pswit[ECHO_SWITCH])
1604 g_print("\n%s\n",aline);
1605 if (!pswit[OVERVIEW_SWITCH])
1606 g_print(" Line %ld column %ld - Query he/be error?\n",
1607 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1611 s=strstr(aline," the had ");
1613 s=strstr(aline," a had ");
1615 s=strstr(aline," they bad ");
1617 s=strstr(aline," she bad ");
1619 s=strstr(aline," he bad ");
1621 s=strstr(aline," you bad ");
/*
 * NOTE(review): lower-case " i bad " cannot match the pronoun "I" unless
 * the caller passes a lower-cased line - confirm.
 */
1623 s=strstr(aline," i bad ");
1626 if (pswit[ECHO_SWITCH])
1627 g_print("\n%s\n",aline);
1628 if (!pswit[OVERVIEW_SWITCH])
1629 g_print(" Line %ld column %ld - Query had/bad error?\n",
1630 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1634 s=strstr(aline,"; hut ");
1636 s=strstr(aline,", hut ");
1639 if (pswit[ECHO_SWITCH])
1640 g_print("\n%s\n",aline);
1641 if (!pswit[OVERVIEW_SWITCH])
1642 g_print(" Line %ld column %ld - Query hut/but error?\n",
1643 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1650 * check_for_mta_from:
1652 * Special case - angled bracket in front of "From" placed there by an
1653 * MTA when sending an e-mail.
1655 void check_for_mta_from(const char *aline)
/*
 * Searches aline for the first ">From" (anywhere in the line) and queries
 * it, reporting the column of the '>'.
 */
1658 s=strstr(aline,">From");
1661 if (pswit[ECHO_SWITCH])
1662 g_print("\n%s\n",aline);
1663 if (!pswit[OVERVIEW_SWITCH])
1664 g_print(" Line %ld column %ld - "
1665 "Query angled bracket with From\n",
1666 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1673 * check_for_orphan_character:
1675 * Check for a single character line -
1676 * often an overflow from bad wrapping.
1678 void check_for_orphan_character(const char *aline)
/*
 * Queries a line that consists of exactly one character, except when that
 * character is a digit or one of the letters I, V, X, L.
 */
1681 c=g_utf8_get_char(aline);
/* True only when the line holds one character followed by the terminator. */
1682 if (c && !*g_utf8_next_char(aline))
1684 if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
1685 ; /* Nothing - ignore numerals alone on a line. */
1688 if (pswit[ECHO_SWITCH])
1689 g_print("\n%s\n",aline);
1690 if (!pswit[OVERVIEW_SWITCH])
1691 g_print(" Line %ld column 1 - Query single character line\n",
1700 * check_for_pling_scanno:
1702 * Check for I" - often should be !
1704 void check_for_pling_scanno(const char *aline)
/*
 * Searches aline for a space, a capital I and a double quote in sequence,
 * and queries whether the I should have been an exclamation mark.
 */
1707 s=strstr(aline," I\"");
1710 if (pswit[ECHO_SWITCH])
1711 g_print("\n%s\n",aline);
1712 if (!pswit[OVERVIEW_SWITCH])
/*
 * NOTE(review): s points at the leading space and no +1 is added, unlike
 * most other checks in this file - confirm the intended column.
 */
1713 g_print(" Line %ld column %ld - Query I=exclamation mark?\n",
1714 linecnt,g_utf8_pointer_to_offset(aline,s));
1721 * check_for_extra_period:
1723 * Check for period without a capital letter. Cut-down from gutspell.
1724 * Only works when it happens on a single line.
1726 void check_for_extra_period(const char *aline,const struct warnings *warnings)
/*
 * Looks at each ". " in aline that follows a letter and is followed by a
 * lower-case word. The word before the period is extracted into testword
 * and checked against the abbrev[] list, digits, single letters, Roman
 * numerals and vowel content before "Extra period?" is queried. Words
 * already queried are remembered in the qperiod tree. The first test
 * below is pswit[PARANOID_SWITCH].
 */
1728 const char *s,*t,*s1,*sprev;
1733 gunichar c,nc,pc,*decomposition;
1734 if (pswit[PARANOID_SWITCH])
1736 for (t=aline;t=strstr(t,". ");)
1740 t=g_utf8_next_char(t);
1741 /* start of line punctuation is handled elsewhere */
1744 if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
1746 t=g_utf8_next_char(t);
1749 if (warnings->isDutch)
1751 /* For Frank & Jeroen -- 's Middags case */
/*
 * NOTE(review): g_utf8_offset_to_pointer() does not appear to stop at the
 * terminating NUL, so these look-aheads may read past the end of a line
 * that ends shortly after the period - confirm.
 */
1752 gunichar c2,c3,c4,c5;
1753 c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
1754 c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
1755 c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
1756 c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
1757 if (CHAR_IS_APOSTROPHE(c2) &&
1758 g_unichar_islower(c3) && c4==CHAR_SPACE &&
1759 g_unichar_isupper(c5))
1761 t=g_utf8_next_char(t);
/* Skip ahead to the first letter or digit after the period. */
1765 s1=g_utf8_next_char(g_utf8_next_char(t));
/*
 * g_unichar_isdigit() rather than isdigit(): the argument is a gunichar,
 * which <ctype.h> functions cannot take once it exceeds UCHAR_MAX.
 */
1766 while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
1767 !g_unichar_isdigit(g_utf8_get_char(s1)))
1768 s1=g_utf8_next_char(s1);
1769 if (g_unichar_islower(g_utf8_get_char(s1)))
1771 /* we have something to investigate */
1773 /* so let's go back and find out */
1774 nc=g_utf8_get_char(t);
1775 s1=g_utf8_prev_char(t);
1776 c=g_utf8_get_char(s1);
1777 sprev=g_utf8_prev_char(s1);
1778 pc=g_utf8_get_char(sprev);
1780 (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||
1781 g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&
1782 g_unichar_isalpha(nc)))
1787 sprev=g_utf8_prev_char(s1);
1788 pc=g_utf8_get_char(sprev);
1790 s1=g_utf8_next_char(s1);
1793 testword=g_strndup(s1,s-s1);
1795 testword=g_strdup(s1);
1796 for (i=0;*abbrev[i];i++)
1797 if (!strcmp(testword,abbrev[i]))
1799 if (g_unichar_isdigit(g_utf8_get_char(testword)))
1801 if (!*g_utf8_next_char(testword))
1803 if (isroman(testword))
1808 for (s=testword;*s;s=g_utf8_next_char(s))
/* Decompose so that an accented vowel is recognised by its base letter. */
1810 decomposition=g_unicode_canonical_decomposition(
1811 g_utf8_get_char(s),&len);
1812 if (g_utf8_strchr("aeiou",-1,decomposition[0]))
1814 g_free(decomposition);
1818 (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
1820 g_tree_insert(qperiod,g_strdup(testword),
1821 GINT_TO_POINTER(1));
1822 if (pswit[ECHO_SWITCH])
1823 g_print("\n%s\n",aline);
1824 if (!pswit[OVERVIEW_SWITCH])
1825 g_print(" Line %ld column %ld - Extra period?\n",
1826 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1832 t=g_utf8_next_char(t);
1838 * check_for_following_punctuation:
1840 * Check for words usually not followed by punctuation.
1842 void check_for_following_punctuation(const char *aline)
/*
 * Compares a lower-cased word (inword) against the nocomma[] and noperiod[]
 * lists. A nocomma word is queried when the character at s is , ; or :
 * and a noperiod word when it is . or !. The first test below is
 * pswit[TYPO_SWITCH].
 */
1845 const char *s,*wordstart;
1848 if (pswit[TYPO_SWITCH])
1859 inword=g_utf8_strdown(t,-1);
1861 for (i=0;*nocomma[i];i++)
1862 if (!strcmp(inword,nocomma[i]))
1864 c=g_utf8_get_char(s);
1865 if (c==',' || c==';' || c==':')
1867 if (pswit[ECHO_SWITCH])
1868 g_print("\n%s\n",aline);
1869 if (!pswit[OVERVIEW_SWITCH])
1870 g_print(" Line %ld column %ld - "
1871 "Query punctuation after %s?\n",
1872 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
1878 for (i=0;*noperiod[i];i++)
1879 if (!strcmp(inword,noperiod[i]))
1881 c=g_utf8_get_char(s);
1882 if (c=='.' || c=='!')
1884 if (pswit[ECHO_SWITCH])
1885 g_print("\n%s\n",aline);
1886 if (!pswit[OVERVIEW_SWITCH])
1887 g_print(" Line %ld column %ld - "
1888 "Query punctuation after %s?\n",
1889 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
1903 * Check for commonly mistyped words,
1904 * and digits like 0 for O in a word.
1906 void check_for_typos(const char *aline,struct warnings *warnings)
/*
 * Takes words from aline with getaword() and queries:
 *  - words mixing digits and letters (mixdigit());
 *  - under pswit[TYPO_SWITCH], words that look like typos or scannos:
 *    unlikely prefixes/suffixes (nostart[], noend[]), unlikely letter
 *    groups, no vowels or no consonants, entries of typo[], and certain
 *    one-letter words; okword[] entries and Roman numerals are tested too;
 *  - words found in the usertypo tree;
 *  - under pswit[PARANOID_SWITCH] with warnings->digit, a standalone
 *    "0" or "1".
 * The qword tree is consulted for words that were looked up before.
 */
1908 const char *s,*t,*nt,*wordstart;
1910 gunichar *decomposition;
1912 int i,vowel,consonant,*dupcnt;
1913 gboolean isdup,istypo,alower;
1916 gsize decomposition_len;
1920 inword=getaword(&s);
1924 continue; /* don't bother with empty lines */
1926 if (mixdigit(inword))
1928 if (pswit[ECHO_SWITCH])
1929 g_print("\n%s\n",aline);
1930 if (!pswit[OVERVIEW_SWITCH])
1931 g_print(" Line %ld column %ld - Query digit in %s\n",
1932 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
1937 * Put the word through a series of tests for likely typos and OCR
1940 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
1944 for (t=inword;*t;t=g_utf8_next_char(t))
1946 c=g_utf8_get_char(t);
1947 nt=g_utf8_next_char(t);
1948 /* lowercase for testing */
1949 if (g_unichar_islower(c))
1951 if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
1954 * We have an uppercase mid-word. However, there are
1956 * Mac and Mc like McGill
1957 * French contractions like l'Abbe
1959 offset=g_utf8_pointer_to_offset(inword,t);
1961 pc=g_utf8_get_char(g_utf8_prev_char(t));
1964 if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
1965 offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
1966 g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
1967 CHAR_IS_APOSTROPHE(pc))
/* All list comparisons below use the case-folded copy of the word. */
1973 testword=g_utf8_casefold(inword,-1);
1975 if (pswit[TYPO_SWITCH])
1978 * Check for certain unlikely two-letter combinations at word
1981 len=g_utf8_strlen(testword,-1);
1984 for (i=0;*nostart[i];i++)
1985 if (g_str_has_prefix(testword,nostart[i]))
1987 for (i=0;*noend[i];i++)
1988 if (g_str_has_suffix(testword,noend[i]))
1991 /* ght is common, gbt never. Like that. */
1992 if (strstr(testword,"cb"))
1994 if (strstr(testword,"gbt"))
1996 if (strstr(testword,"pbt"))
1998 if (strstr(testword,"tbs"))
2000 if (strstr(testword,"mrn"))
2002 if (strstr(testword,"ahle"))
2004 if (strstr(testword,"ihle"))
2007 * "TBE" does happen - like HEARTBEAT - but uncommon.
2008 * Also "TBI" - frostbite, outbid - but uncommon.
2009 * Similarly "ii" like Hawaii, or Pompeii, and in Roman
2010 * numerals, but "ii" is a common scanno.
2012 if (strstr(testword,"tbi"))
2014 if (strstr(testword,"tbe"))
2016 if (strstr(testword,"ii"))
2019 * Check for no vowels or no consonants.
2020 * If none, flag a typo.
2022 if (!istypo && len>1)
2025 for (t=testword;*t;t=g_utf8_next_char(t))
2027 c=g_utf8_get_char(t);
2029 g_unicode_canonical_decomposition(c,&decomposition_len);
2030 if (c=='y' || g_unichar_isdigit(c))
2032 /* Yah, this is loose. */
2036 else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
2040 g_free(decomposition);
2042 if (!vowel || !consonant)
2046 * Now exclude the word from being reported if it's in
2049 for (i=0;*okword[i];i++)
2050 if (!strcmp(testword,okword[i]))
2053 * What looks like a typo may be a Roman numeral.
2056 if (istypo && isroman(testword))
2058 /* Check the manual list of typos. */
2060 for (i=0;*typo[i];i++)
2061 if (!strcmp(testword,typo[i]))
2064 * Check lowercase s, l, i and m - special cases.
2065 * "j" - often a semi-colon gone wrong.
2066 * "d" for a missing apostrophe - he d
2069 if (!istypo && len==1 &&
2070 g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
2074 dupcnt=g_tree_lookup(qword,testword);
2078 isdup=!pswit[VERBOSE_SWITCH];
2082 dupcnt=g_new0(int,1);
2083 g_tree_insert(qword,g_strdup(testword),dupcnt);
2088 if (pswit[ECHO_SWITCH])
2089 g_print("\n%s\n",aline);
2090 if (!pswit[OVERVIEW_SWITCH])
2092 g_print(" Line %ld column %ld - Query word %s",
2093 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
2095 if (!pswit[VERBOSE_SWITCH])
2096 g_print(" - not reporting duplicates");
2104 /* check the user's list of typos */
2105 if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
2107 if (pswit[ECHO_SWITCH])
2108 g_print("\n%s\n",aline);
2109 if (!pswit[OVERVIEW_SWITCH])
/*
 * NOTE(review): this column and the "standalone" one further down use
 * wordstart+2, while the "Query digit" and "Query word" messages above use
 * wordstart+1 - confirm which is intended.
 */
2110 g_print(" Line %ld column %ld - Query possible scanno %s\n",
2111 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
2113 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
2115 if (pswit[PARANOID_SWITCH] && warnings->digit)
2117 /* In paranoid mode, query all 0 and 1 standing alone. */
2118 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
2120 if (pswit[ECHO_SWITCH])
2121 g_print("\n%s\n",aline);
2122 if (!pswit[OVERVIEW_SWITCH])
2123 g_print(" Line %ld column %ld - Query standalone %s\n",
2124 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
2135 * check_for_misspaced_punctuation:
2137 * Look for added or missing spaces around punctuation and quotes.
2138 * If there is a punctuation character like ! with no space on
2139 * either side, suspect a missing!space. If there are spaces on
2140 * both sides , assume a typo. If we see a double quote with no
2141 * space or punctuation on either side of it, assume unspaced
2142 * quotes "like"this.
2144 void check_for_misspaced_punctuation(const char *aline,
2145 struct parities *parities,gboolean isemptyline)
/*
 * Makes several passes over aline, each reporting by line and column:
 *  1. punctuation (. ? ! , ; : _) with no space where one is expected
 *     ("Missing space?") or with space on both sides ("Spaced
 *     punctuation?"), allowing for acronyms and ellipses;
 *  2. ? ! , ; : preceded by a space;
 *  3. a period preceded by a space and followed by a letter;
 *  4. "Unspaced quotes?";
 *  5. double-quote parity, toggling parities->dquote at each quote and
 *     checking the following character ("Wrongspaced quotes?");
 *  6. a double quote at the start of the line followed by closing
 *     punctuation or a space;
 *  7. under pswit[SQUOTE_SWITCH], the same parity check for single quotes
 *     using parities->squote.
 * Passes 1 to 4 start at the second character, keeping the previous,
 * current and next characters in pc, c and nc.
 */
2147 gboolean isacro,isellipsis;
2149 gunichar c,nc,pc,n2c;
2150 c=g_utf8_get_char(aline);
2151 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2152 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2156 nc=g_utf8_get_char(g_utf8_next_char(s));
2157 /* For each character in the line after the first. */
2158 if (g_utf8_strchr(".?!,;:_",-1,c)) /* if it's punctuation */
2160 /* we need to suppress warnings for acronyms like M.D. */
2162 /* we need to suppress warnings for ellipsis . . . */
2165 * If there are letters on both sides of it or
2166 * if it's strict punctuation followed by an alpha.
2168 if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
2169 g_utf8_strchr("?!,;:",-1,c)))
2173 if (g_utf8_pointer_to_offset(aline,s)>2 &&
2174 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
2176 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
2182 if (pswit[ECHO_SWITCH])
2183 g_print("\n%s\n",aline);
2184 if (!pswit[OVERVIEW_SWITCH])
2185 g_print(" Line %ld column %ld - Missing space?\n",
2186 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2191 if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
2194 * If there are spaces on both sides,
2195 * or space before and end of line.
2199 if (g_utf8_pointer_to_offset(aline,s)>2 &&
2200 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
2202 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
2206 if (!isemptyline && !isellipsis)
2208 if (pswit[ECHO_SWITCH])
2209 g_print("\n%s\n",aline);
2210 if (!pswit[OVERVIEW_SWITCH])
2211 g_print(" Line %ld column %ld - "
2212 "Spaced punctuation?\n",linecnt,
2213 g_utf8_pointer_to_offset(aline,s)+1);
2220 /* Split out the characters that CANNOT be preceded by space. */
2221 c=g_utf8_get_char(aline);
2222 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2223 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2227 nc=g_utf8_get_char(g_utf8_next_char(s));
2228 /* for each character in the line after the first */
2229 if (g_utf8_strchr("?!,;:",-1,c))
2231 /* if it's punctuation that _cannot_ have a space before it */
2232 if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
2235 * If nc DOES == space,
2236 * it was already reported just above.
2238 if (pswit[ECHO_SWITCH])
2239 g_print("\n%s\n",aline);
2240 if (!pswit[OVERVIEW_SWITCH])
2241 g_print(" Line %ld column %ld - Spaced punctuation?\n",
2242 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2249 * Special case " .X" where X is any alpha.
2250 * This plugs a hole in the acronym code above.
2251 * Inelegant, but maintainable.
2253 c=g_utf8_get_char(aline);
2254 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2255 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2259 nc=g_utf8_get_char(g_utf8_next_char(s));
2260 /* for each character in the line after the first */
2263 /* if it's a period */
2264 if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
2267 * If the period follows a space and
2268 * is followed by a letter.
2270 if (pswit[ECHO_SWITCH])
2271 g_print("\n%s\n",aline);
2272 if (!pswit[OVERVIEW_SWITCH])
2273 g_print(" Line %ld column %ld - Spaced punctuation?\n",
2274 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2280 c=g_utf8_get_char(aline);
2281 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2282 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2286 nc=g_utf8_get_char(g_utf8_next_char(s));
2287 /* for each character in the line after the first */
2290 if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
2291 !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
2292 !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
2294 if (pswit[ECHO_SWITCH])
2295 g_print("\n%s\n",aline);
2296 if (!pswit[OVERVIEW_SWITCH])
2297 g_print(" Line %ld column %ld - Unspaced quotes?\n",
2298 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2304 /* Check parity of quotes. */
2305 nc=g_utf8_get_char(aline);
2306 for (s=aline;*s;s=g_utf8_next_char(s))
2309 nc=g_utf8_get_char(g_utf8_next_char(s));
2312 parities->dquote=!parities->dquote;
2313 if (!parities->dquote)
2316 if (!g_utf8_strchr("_-.'`/,;:!?)]} ",-1,nc))
2318 if (pswit[ECHO_SWITCH])
2319 g_print("\n%s\n",aline);
2320 if (!pswit[OVERVIEW_SWITCH])
2321 g_print(" Line %ld column %ld - "
2322 "Wrongspaced quotes?\n",
2323 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/*
 * nc is a gunichar, so it is tested with g_unichar_isdigit(); <ctype.h>
 * isdigit() is undefined for values above UCHAR_MAX.
 */
2331 if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
2332 !g_utf8_strchr("_-/.'`([{$",-1,nc) || !nc)
2334 if (pswit[ECHO_SWITCH])
2335 g_print("\n%s\n",aline);
2336 if (!pswit[OVERVIEW_SWITCH])
2337 g_print(" Line %ld column %ld - "
2338 "Wrongspaced quotes?\n",
2339 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2346 if (g_utf8_get_char(aline)==CHAR_DQUOTE)
2348 if (g_utf8_strchr(",;:!?)]} ",-1,
2349 g_utf8_get_char(g_utf8_next_char(aline))))
2351 if (pswit[ECHO_SWITCH])
2352 g_print("\n%s\n",aline);
2353 if (!pswit[OVERVIEW_SWITCH])
2354 g_print(" Line %ld column 1 - Wrongspaced quotes?\n",
2360 if (pswit[SQUOTE_SWITCH])
2362 nc=g_utf8_get_char(aline);
2363 for (s=aline;*s;s=g_utf8_next_char(s))
2366 nc=g_utf8_get_char(g_utf8_next_char(s));
/* Only single quotes that are not enclosed by letters count as quotes. */
2367 if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&
2368 !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
2369 !g_unichar_isalpha(nc)))
2371 parities->squote=!parities->squote;
2372 if (!parities->squote)
2375 if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
2377 if (pswit[ECHO_SWITCH])
2378 g_print("\n%s\n",aline);
2379 if (!pswit[OVERVIEW_SWITCH])
2380 g_print(" Line %ld column %ld - "
2381 "Wrongspaced singlequotes?\n",
2382 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Same gunichar-safe digit test as in the double-quote pass. */
2390 if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
2391 !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
2393 if (pswit[ECHO_SWITCH])
2394 g_print("\n%s\n",aline);
2395 if (!pswit[OVERVIEW_SWITCH])
2396 g_print(" Line %ld column %ld - "
2397 "Wrongspaced singlequotes?\n",
2398 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2409 * check_for_double_punctuation:
2411 * Look for double punctuation like ,. or ,,
2412 * Thanks to DW for the suggestion!
2413 * In books with references, ".," and ".;" are common
2414 * e.g. "etc., etc.," and vol. 1.; vol 3.;
2415 * OTOH, from my initial tests, there are also fairly
2416 * common errors. What to do? Make these cases paranoid?
2417 * ".," is the most common, so warnings->dotcomma is used
2418 * to suppress detailed reporting if it occurs often.
2420 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
/*
 * Visits each pair of adjacent punctuation characters (from . ? ! , ; :) in
 * aline and queries "Double punctuation?" unless the pair is a legitimate
 * case: a doubled . ? or !, "., " when warnings->dotcomma is clear, or, for
 * warnings->isFrench, an ellipsis combined with , ; : ! or ?.
 */
2424 nc=g_utf8_get_char(aline);
2425 for (s=aline;*s;s=g_utf8_next_char(s))
2428 nc=g_utf8_get_char(g_utf8_next_char(s));
2429 /* for each punctuation character in the line */
2430 if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
2431 g_utf8_strchr(".?!,;:",-1,nc))
2433 /* followed by punctuation, it's a query, unless . . . */
2434 if (c==nc && (c=='.' || c=='?' || c=='!') ||
2435 !warnings->dotcomma && c=='.' && nc==',' ||
2436 warnings->isFrench && g_str_has_prefix(s,",...") ||
2437 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2438 warnings->isFrench && g_str_has_prefix(s,";...") ||
2439 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2440 warnings->isFrench && g_str_has_prefix(s,":...") ||
2441 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2442 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2443 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2444 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2445 warnings->isFrench && g_str_has_prefix(s,"...?"))
/* The French ellipsis cases are tested again on their own below. */
2447 if (warnings->isFrench && g_str_has_prefix(s,",...") ||
2448 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2449 warnings->isFrench && g_str_has_prefix(s,";...") ||
2450 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2451 warnings->isFrench && g_str_has_prefix(s,":...") ||
2452 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2453 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2454 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2455 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2456 warnings->isFrench && g_str_has_prefix(s,"...?"))
2459 nc=g_utf8_get_char(g_utf8_next_char(s));
2461 ; /* do nothing for .. !! and ?? which can be legit */
2465 if (pswit[ECHO_SWITCH])
2466 g_print("\n%s\n",aline);
2467 if (!pswit[OVERVIEW_SWITCH])
2468 g_print(" Line %ld column %ld - Double punctuation?\n",
2469 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2478 * check_for_spaced_quotes:
2480 void check_for_spaced_quotes(const char *aline)
/*
 * Queries every double quote that has a space on both sides, then does the
 * same for each character in single_quotes[], building the search pattern
 * (space, quote, space) in a GString that is freed at the end.
 */
2484 const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,
2488 while ((t=strstr(s," \" ")))
2490 if (pswit[ECHO_SWITCH])
2491 g_print("\n%s\n",aline);
2492 if (!pswit[OVERVIEW_SWITCH])
2493 g_print(" Line %ld column %ld - Spaced doublequote?\n",
2494 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
/* Resume two characters on, at the space after the quote. */
2497 s=g_utf8_next_char(g_utf8_next_char(t));
2499 pattern=g_string_new(NULL);
2500 for(i=0;i<G_N_ELEMENTS(single_quotes);i++)
2502 g_string_assign(pattern," ");
2503 g_string_append_unichar(pattern,single_quotes[i]);
2504 g_string_append_c(pattern,' ');
2506 while ((t=strstr(s,pattern->str)))
2508 if (pswit[ECHO_SWITCH])
2509 g_print("\n%s\n",aline);
2510 if (!pswit[OVERVIEW_SWITCH])
2511 g_print(" Line %ld column %ld - Spaced singlequote?\n",
2512 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2515 s=g_utf8_next_char(g_utf8_next_char(t));
2518 g_string_free(pattern,TRUE);
2522 * check_for_miscased_genative:
2524 * Check special case of 'S instead of 's at end of word.
2526 void check_for_miscased_genative(const char *aline)
/*
 * Queries an apostrophe that follows a lower-case letter and is followed by
 * a capital S. The scan starts at the second character of aline, with pc,
 * c and nc holding the previous, current and next characters.
 */
2532 c=g_utf8_get_char(aline);
2533 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2534 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2538 nc=g_utf8_get_char(g_utf8_next_char(s));
2539 if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))
2541 if (pswit[ECHO_SWITCH])
2542 g_print("\n%s\n",aline);
2543 if (!pswit[OVERVIEW_SWITCH])
/* +2 rather than +1: presumably the column of the S, not the apostrophe. */
2544 g_print(" Line %ld column %ld - Capital \"S\"?\n",
2545 linecnt,g_utf8_pointer_to_offset(aline,s)+2);
2553 * check_end_of_line:
2555 * Now check special cases - start and end of line -
2556 * for single and double quotes. Start is sometimes [sic]
2557 * but better to query it anyway.
2558 * While we're here, check for dash at end of line.
2560 void check_end_of_line(const char *aline,struct warnings *warnings)
/*
 * For lines longer than one character, queries a quote at the end of the
 * line preceded by a space and a single quote at the start of the line
 * followed by a space. Under pswit[PARANOID_SWITCH] with warnings->hyphen
 * it also queries a single hyphen at the end of the line, after skipping
 * back over trailing characters at or below CHAR_SPACE.
 */
2565 lbytes=strlen(aline);
2566 if (g_utf8_strlen(aline,lbytes)>1)
/* s is the last character; c1 and c2 are the last and last-but-one. */
2568 s=g_utf8_prev_char(aline+lbytes);
2569 c1=g_utf8_get_char(s);
2570 c2=g_utf8_get_char(g_utf8_prev_char(s));
2571 if ((c1==CHAR_DQUOTE || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)
2573 if (pswit[ECHO_SWITCH])
2574 g_print("\n%s\n",aline);
2575 if (!pswit[OVERVIEW_SWITCH])
2576 g_print(" Line %ld column %ld - Spaced quote?\n",linecnt,
2577 g_utf8_strlen(aline,lbytes));
2581 c1=g_utf8_get_char(aline);
2582 c2=g_utf8_get_char(g_utf8_next_char(aline));
2583 if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)
2585 if (pswit[ECHO_SWITCH])
2586 g_print("\n%s\n",aline);
2587 if (!pswit[OVERVIEW_SWITCH])
2588 g_print(" Line %ld column 1 - Spaced quote?\n",linecnt);
2593 * Dash at end of line may well be legit - paranoid mode only
2594 * and don't report em-dash at line-end.
2596 if (pswit[PARANOID_SWITCH] && warnings->hyphen)
2598 for (s=g_utf8_prev_char(aline+lbytes);
2599 s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
/*
 * The scan above can stop with s==aline (e.g. the line "- "); there is
 * then no previous character, so do not step back before the buffer.
 */
2601 if (g_utf8_get_char(s)=='-' &&
2602 (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-'))
2604 if (pswit[ECHO_SWITCH])
2605 g_print("\n%s\n",aline);
2606 if (!pswit[OVERVIEW_SWITCH])
2607 g_print(" Line %ld column %ld - "
2608 "Hyphen at end of line?\n",
2609 linecnt,g_utf8_pointer_to_offset(aline,s));
2616 * check_for_unspaced_bracket:
2618 * Brackets are often unspaced, but shouldn't be surrounded by alpha.
2619 * If so, suspect a scanno like "a]most".
/*
 * Walks aline one UTF-8 character at a time. A query is printed when c
 * is one of the characters {[()]} and both pc and nc are alphabetic.
 * NOTE(review): the statements that update pc and c inside the loop are
 * not visible in this listing; presumably pc/c/nc are the previous,
 * current and next characters.
 */
2621 void check_for_unspaced_bracket(const char *aline)
2625 c=g_utf8_get_char(aline);
/* nc is 0 when the line is empty, so the loop below is never entered. */
2626 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
/* The loop ends when the look-ahead character is the terminating NUL. */
2627 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2631 nc=g_utf8_get_char(g_utf8_next_char(s));
2634 /* for each bracket character in the line except 1st & last */
2635 if (g_utf8_strchr("{[()]}",-1,c) &&
2636 g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
2638 if (pswit[ECHO_SWITCH])
2639 g_print("\n%s\n",aline);
/* The column printed is the plain character offset of s within aline. */
2640 if (!pswit[OVERVIEW_SWITCH])
2641 g_print(" Line %ld column %ld - Unspaced bracket?\n",
2642 linecnt,g_utf8_pointer_to_offset(aline,s));
2650 * check_for_unpunctuated_endquote:
/*
 * Walks aline one UTF-8 character at a time and prints a query when a
 * double quote (c) comes directly after a letter (pc), i.e. a closing
 * quote with no punctuation in front of it.
 * NOTE(review): the statements that update pc and c inside the loop are
 * not visible in this listing; presumably pc is the character before c.
 */
2652 void check_for_unpunctuated_endquote(const char *aline)
2656 c=g_utf8_get_char(aline);
/* nc is 0 when the line is empty, so the loop below is never entered. */
2657 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2658 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2662 nc=g_utf8_get_char(g_utf8_next_char(s));
2663 /* for each character in the line except 1st */
/*
 * pc comes from g_utf8_get_char() and so may be any Unicode code point.
 * The <ctype.h> isalpha() is only defined for unsigned char values and
 * EOF, so the Unicode-aware test is used, as in
 * check_for_unspaced_bracket().
 */
2664 if (c==CHAR_DQUOTE && g_unichar_isalpha(pc))
2666 if (pswit[ECHO_SWITCH])
2667 g_print("\n%s\n",aline);
/* The column printed is the plain character offset of s within aline. */
2668 if (!pswit[OVERVIEW_SWITCH])
2669 g_print(" Line %ld column %ld - "
2670 "endquote missing punctuation?\n",
2671 linecnt,g_utf8_pointer_to_offset(aline,s));
2679 * check_for_html_tag:
2681 * Check for <HTML TAG>.
2683 * If there is a < in the line, followed at some point
2684 * by a > then we suspect HTML.
/*
 * Only the first '<' in aline and the first '>' after it are examined.
 * NOTE(review): the tests that guard the use of open and close, and the
 * release of the tag copy, are not visible in this listing.
 */
2686 void check_for_html_tag(const char *aline)
2688 const char *open,*close;
2690 open=strchr(aline,'<');
/* Search for the closing '>' starting just after the '<'. */
2693 close=strchr(g_utf8_next_char(open),'>');
2696 if (pswit[ECHO_SWITCH])
2697 g_print("\n%s\n",aline);
2698 if (!pswit[OVERVIEW_SWITCH])
/* tag is a copy of the text from '<' through '>' inclusive. */
2700 tag=g_strndup(open,close-open+1);
/* The column is the 1-based character offset of the '<'. */
2701 g_print(" Line %ld column %ld - HTML Tag? %s \n",
2702 linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
2712 * check_for_html_entity:
2714 * Check for &symbol; HTML.
2716 * If there is a & in the line, followed at
2717 * some point by a ; then we suspect HTML.
/*
 * Only the first '&' in aline and the first ';' after it are examined.
 * NOTE(review): the tests that guard the use of amp and scolon, and the
 * release of the entity copy, are not visible in this listing.
 */
2719 void check_for_html_entity(const char *aline)
2721 const char *s,*amp,*scolon;
2723 amp=strchr(aline,'&');
2726 scolon=strchr(amp,';');
/* Look for a space anywhere between the '&' and the ';'. */
2729 for (s=amp;s<scolon;s=g_utf8_next_char(s))
2730 if (g_utf8_get_char(s)==CHAR_SPACE)
2731 break; /* Don't report "Jones & Son;" */
2734 if (pswit[ECHO_SWITCH])
2735 g_print("\n%s\n",aline);
2736 if (!pswit[OVERVIEW_SWITCH])
/* entity is a copy of the text from '&' through ';' inclusive. */
2738 entity=g_strndup(amp,scolon-amp+1);
/*
 * Report the column as a 1-based character offset, like
 * check_for_html_tag(), rather than a byte offset, so that multi-byte
 * characters earlier in the line do not shift the reported column.
 */
2739 g_print(" Line %ld column %ld - HTML symbol? %s \n",
2740 linecnt,g_utf8_pointer_to_offset(aline,amp)+1,entity);
2751 * check_for_omitted_punctuation:
2753 * Check for omitted punctuation at end of paragraph by working back
2754 * through prevline. DW.
2755 * Need to check this only for "normal" paras.
2756 * So what is a "normal" para?
2757 * Not normal if one-liner (chapter headings, etc.)
2758 * Not normal if doesn't contain at least one locase letter
2759 * Not normal if starts with space
/*
 * prevline is the last line of the paragraph that has just ended, last
 * carries properties of that line (only last->blen is read here) and
 * start_para_line is the line number on which the paragraph began.
 */
2761 void check_for_omitted_punctuation(const char *prevline,
2762 struct line_properties *last,int start_para_line)
2764 gboolean letter_on_line=FALSE;
/* Note whether prevline contains any alphabetic character at all. */
2767 for (s=prevline;*s;s=g_utf8_next_char(s))
2768 if (g_unichar_isalpha(g_utf8_get_char(s)))
2770 letter_on_line=TRUE;
2774 * This next "if" is a problem.
2775 * If we say "start_para_line <= linecnt - 1", that includes
2776 * one-line "paragraphs" like chapter heads. Lotsa false positives.
2777 * If we say "start_para_line < linecnt - 1" it doesn't, but then it
2778 * misses genuine one-line paragraphs.
2780 if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
2781 g_utf8_get_char(prevline)>CHAR_SPACE)
/* Start at the terminating NUL and step back over closing quotes. */
2783 s=prevline+strlen(prevline);
2786 s=g_utf8_prev_char(s);
2787 c=g_utf8_get_char(s);
2788 } while (CHAR_IS_CLOSING_QUOTE(c) && c>CHAR_SPACE && s>prevline);
/*
 * Continue backwards from there. Reaching a letter produces the query,
 * reported against line linecnt-1 with the character count of prevline
 * as the column.
 */
2789 for (;s>prevline;s=g_utf8_prev_char(s))
2791 if (g_unichar_isalpha(g_utf8_get_char(s)))
2793 if (pswit[ECHO_SWITCH])
2794 g_print("\n%s\n",prevline);
2795 if (!pswit[OVERVIEW_SWITCH])
2796 g_print(" Line %ld column %ld - "
2797 "No punctuation at para end?\n",
2798 linecnt-1,g_utf8_strlen(prevline,-1));
/*
 * NOTE(review): the statement controlled by this test is not visible in
 * this listing; presumably finding one of these characters ends the
 * backward search without a query - confirm.
 */
2803 if (g_utf8_strchr("-.:!([{?}])",-1,g_utf8_get_char(s)))
/*
 * Tree traversal callback; procfile() passes it to g_tree_foreach() on
 * the qword tree. key is used as the queried word.
 * NOTE(review): the arguments of the g_print() call and the return
 * value are not visible in this listing.
 */
2809 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
2811 const char *word=key;
2814 g_print("\nNote: Queried word %s was duplicated %d times\n",
/*
 * Converts string from UTF-8 to WINDOWS-1252 with g_iconv(). The iconv
 * descriptor is kept in a static variable and opened on first use.
 * NOTE(review): the conditions guarding the reset section and the final
 * fputs() are not visible in this listing. procfile() calls this with
 * NULL at the end of a file, so presumably NULL triggers the reset and
 * the fputs() is the fallback when no converter is available.
 */
2819 void print_as_windows_1252(const char *string)
2821 gsize inbytes,outbytes;
2823 static GIConv converter=(GIConv)-1;
/* Reset: close the cached descriptor, if any, and mark it unopened. */
2826 if (converter!=(GIConv)-1)
2827 g_iconv_close(converter);
2828 converter=(GIConv)-1;
/* Open the descriptor if it is not open already. */
2831 if (converter==(GIConv)-1)
2832 converter=g_iconv_open("WINDOWS-1252","UTF-8");
2833 if (converter!=(GIConv)-1)
/* The output buffer holds strlen(string) bytes plus one spare byte. */
2835 inbytes=outbytes=strlen(string);
2836 bp=buf=g_malloc(outbytes+1);
2837 g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
2843 fputs(string,stdout);
/* Writes string to stdout unchanged. */
2846 void print_as_utf_8(const char *string)
2848 fputs(string,stdout);
/*
 * Processes one file: reads it, runs the first pass, then fetches the
 * text line by line with flgets() and applies the per-line checks.
 * NOTE(review): many short statements (braces, else branches, short
 * assignments) are not visible in this listing, so the comments below
 * describe only what the visible lines establish.
 */
2856 void procfile(const char *filename)
2859 gchar *parastart=NULL; /* first line of current para */
2860 gchar *etext,*aline;
2863 struct first_pass_results *first_pass_results;
2864 struct warnings *warnings;
2865 struct counters counters={0};
2866 struct line_properties last={0};
2867 struct parities parities={0};
2868 struct pending pending={0};
2869 gboolean isemptyline;
2870 long start_para_line=0;
2871 gboolean isnewpara=FALSE,enddash=FALSE;
2872 last.start=CHAR_SPACE;
2873 linecnt=checked_linecnt=0;
/* Load the text; err carries the message printed if loading fails. */
2874 etext=read_etext(filename,&err);
/* The error goes to stdout when STDOUT_SWITCH is set, else to stderr. */
2877 if (pswit[STDOUT_SWITCH])
2878 fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
2880 fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
2883 g_print("\n\nFile: %s\n\n",filename);
/* The first pass yields the warnings that enable optional checks below. */
2884 first_pass_results=first_pass(etext);
2885 warnings=report_first_pass(first_pass_results);
/* qword frees both keys and values; qperiod frees only its keys. */
2886 qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
2887 qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
2889 * Here we go with the main pass. Hold onto yer hat!
2893 while ((aline=flgets(&etext_ptr,linecnt+1)))
2898 if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
2899 continue; // skip DP page separators completely
/*
 * Lines before firstline, or after footerline when one was found, are
 * not checked. With HEADER_SWITCH set, a few header fields are echoed.
 */
2900 if (linecnt<first_pass_results->firstline ||
2901 (first_pass_results->footerline>0 &&
2902 linecnt>first_pass_results->footerline))
2904 if (pswit[HEADER_SWITCH])
2906 if (g_str_has_prefix(aline,"Title:"))
2907 g_print(" %s\n",aline);
2908 if (g_str_has_prefix(aline,"Author:"))
2909 g_print(" %s\n",aline);
2910 if (g_str_has_prefix(aline,"Release Date:"))
2911 g_print(" %s\n",aline);
2912 if (g_str_has_prefix(aline,"Edition:"))
2913 g_print(" %s\n\n",aline);
2915 continue; /* skip through the header */
2918 print_pending(aline,parastart,&pending);
/* analyse_quotes() also reports whether the line is empty. */
2919 isemptyline=analyse_quotes(aline,&counters);
2920 if (isnewpara && !isemptyline)
2922 /* This line is the start of a new paragraph. */
2923 start_para_line=linecnt;
2924 /* Capture its first line in case we want to report it later. */
2926 parastart=g_strdup(aline);
2927 memset(&parities,0,sizeof(parities)); /* restart the quote count */
/* Skip ahead to the first letter or digit of the line. */
2929 while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
2930 !g_unichar_isdigit(g_utf8_get_char(s)))
2931 s=g_utf8_next_char(s);
2932 if (g_unichar_islower(g_utf8_get_char(s)))
2934 /* and its first letter is lowercase */
2935 if (pswit[ECHO_SWITCH])
2936 g_print("\n%s\n",aline);
2937 if (!pswit[OVERVIEW_SWITCH])
2938 g_print(" Line %ld column %ld - "
2939 "Paragraph starts with lower-case\n",
2940 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2944 isnewpara=FALSE; /* Signal the end of new para processing. */
2946 /* Check for an em-dash broken at line end. */
2947 if (enddash && g_utf8_get_char(aline)=='-')
2949 if (pswit[ECHO_SWITCH])
2950 g_print("\n%s\n",aline);
2951 if (!pswit[OVERVIEW_SWITCH])
2952 g_print(" Line %ld column 1 - Broken em-dash?\n",linecnt);
/*
 * Step back over trailing spaces and test whether this line ends with
 * a hyphen.
 * NOTE(review): the statement controlled by the test is not visible;
 * presumably it sets enddash for the next iteration. Also, for an empty
 * aline the initial g_utf8_prev_char() steps before aline - confirm
 * that this is safe for the buffer flgets() returns.
 */
2957 for (s=g_utf8_prev_char(aline+strlen(aline));
2958 g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
2960 if (s>=aline && g_utf8_get_char(s)=='-')
/* Per-line checks. Some run only when the first pass enabled them. */
2962 check_for_control_characters(aline);
2963 check_for_odd_characters(aline,warnings,isemptyline);
2964 if (warnings->longline)
2965 check_for_long_line(aline);
2966 if (warnings->shortline)
2967 check_for_short_line(aline,&last);
/* Record the length and first character of this line in last. */
2969 last.len=g_utf8_strlen(aline,-1);
2970 last.start=g_utf8_get_char(aline);
2971 check_for_starting_punctuation(aline);
2974 check_for_spaced_emdash(aline);
2975 check_for_spaced_dash(aline);
2977 check_for_unmarked_paragraphs(aline);
2978 check_for_jeebies(aline);
2979 check_for_mta_from(aline);
2980 check_for_orphan_character(aline);
2981 check_for_pling_scanno(aline);
2982 check_for_extra_period(aline,warnings);
2983 check_for_following_punctuation(aline);
2984 check_for_typos(aline,warnings);
2985 check_for_misspaced_punctuation(aline,&parities,isemptyline);
2986 check_for_double_punctuation(aline,warnings);
2987 check_for_spaced_quotes(aline);
2988 check_for_miscased_genative(aline);
2989 check_end_of_line(aline,warnings);
2990 check_for_unspaced_bracket(aline);
2991 if (warnings->endquote)
2992 check_for_unpunctuated_endquote(aline);
2993 check_for_html_tag(aline);
2994 check_for_html_entity(aline);
/*
 * End-of-paragraph handling: quote counters are checked and reset, and
 * the previous line is checked for missing final punctuation.
 * NOTE(review): the condition selecting this section is not visible.
 */
2997 check_for_mismatched_quotes(&counters,&pending);
2998 counters_reset(&counters);
2999 /* let the next iteration know that it's starting a new para */
3002 check_for_omitted_punctuation(prevline,&last,start_para_line);
/* Keep a private copy of this line as prevline for the next iteration. */
3005 prevline=g_strdup(aline);
/* After the last line: final quote check, then flush pending queries. */
3008 check_for_mismatched_quotes(&counters,&pending);
3009 print_pending(NULL,parastart,&pending);
3010 reset_pending(&pending);
/* Duplicate-query notes are skipped in overview and verbose modes. */
3019 if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])
3020 g_tree_foreach(qword,report_duplicate_queries,NULL);
/* Release the per-file resources and restore the default print handler. */
3021 g_tree_unref(qword);
3022 g_tree_unref(qperiod);
3023 counters_destroy(&counters);
3024 g_set_print_handler(NULL);
3025 print_as_windows_1252(NULL);
3026 if (pswit[MARKUP_SWITCH])
3033 * Get one line from the input text, checking for
3034 * the existence of exactly one CR/LF line-end per line.
3036 * Returns: a pointer to the line.
/*
 * *etext is the read position in the text and is advanced as characters
 * are consumed. lcnt is the line number printed in the line-end queries.
 * NOTE(review): the loop structure, the tests on c and the handling of
 * eos are only partly visible in this listing.
 */
3038 char *flgets(char **etext,long lcnt)
3041 gboolean isCR=FALSE;
/* The line starts where *etext points on entry. */
3042 char *theline=*etext;
/* Fetch one UTF-8 character and move the caller's position past it. */
3047 c=g_utf8_get_char(*etext);
3048 *etext=g_utf8_next_char(*etext);
3051 /* either way, it's end of line */
3058 /* Error - a LF without a preceding CR */
3059 if (pswit[LINE_END_SWITCH])
3061 if (pswit[ECHO_SWITCH])
/* Echo a copy of the bytes from theline up to eos. */
3063 s=g_strndup(theline,eos-theline);
3064 g_print("\n%s\n",s);
3067 if (!pswit[OVERVIEW_SWITCH])
3068 g_print(" Line %ld - No CR?\n",lcnt);
3079 /* Error - two successive CRs */
3080 if (pswit[LINE_END_SWITCH])
3082 if (pswit[ECHO_SWITCH])
3084 s=g_strndup(theline,eos-theline);
3085 g_print("\n%s\n",s);
3088 if (!pswit[OVERVIEW_SWITCH])
3089 g_print(" Line %ld - Two successive CRs?\n",lcnt);
/* A CR has been seen (isCR) and the line-end checks are enabled. */
3098 if (pswit[LINE_END_SWITCH] && isCR)
3100 if (pswit[ECHO_SWITCH])
3102 s=g_strndup(theline,eos-theline);
3103 g_print("\n%s\n",s);
/* The column is the 1-based character offset of eos within the line. */
3106 if (!pswit[OVERVIEW_SWITCH])
3107 g_print(" Line %ld column %ld - CR without LF?\n",
3108 lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
3114 eos=g_utf8_next_char(eos);
/* Optional in-place clean-up of the line before it is returned. */
3118 if (pswit[MARKUP_SWITCH])
3119 postprocess_for_HTML(theline);
3120 if (pswit[DP_SWITCH])
3121 postprocess_for_DP(theline);
3128 * Takes a "word" as a parameter, and checks whether it
3129 * contains a mixture of alpha and digits. Generally, this is an
3130 * error, but may not be for cases like 4th or L5 12s. 3d.
3132 * Returns: TRUE iff an is error found.
/*
 * NOTE(review): the statements that set the flags and the query result
 * are not visible in this listing; presumably each exception test below
 * clears query - confirm.
 */
3134 gboolean mixdigit(const char *checkword)
3136 gboolean wehaveadigit,wehavealetter,query;
3137 const char *s,*nondigit;
3138 wehaveadigit=wehavealetter=query=FALSE;
/* Classify every character of the word as a letter or a digit. */
3139 for (s=checkword;*s;s=g_utf8_next_char(s))
3140 if (g_unichar_isalpha(g_utf8_get_char(s)))
3142 else if (g_unichar_isdigit(g_utf8_get_char(s)))
3144 if (wehaveadigit && wehavealetter)
3146 /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
/* Advance nondigit past the leading digits to the word's suffix. */
3148 for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
3149 nondigit=g_utf8_next_char(nondigit))
3151 /* digits, ending in st, rd, nd, th of either case */
3152 if (!g_ascii_strcasecmp(nondigit,"st") ||
3153 !g_ascii_strcasecmp(nondigit,"rd") ||
3154 !g_ascii_strcasecmp(nondigit,"nd") ||
3155 !g_ascii_strcasecmp(nondigit,"th"))
/* The same ordinal suffixes followed by "s". */
3157 if (!g_ascii_strcasecmp(nondigit,"sts") ||
3158 !g_ascii_strcasecmp(nondigit,"rds") ||
3159 !g_ascii_strcasecmp(nondigit,"nds") ||
3160 !g_ascii_strcasecmp(nondigit,"ths"))
/* The same ordinal suffixes followed by "ly". */
3162 if (!g_ascii_strcasecmp(nondigit,"stly") ||
3163 !g_ascii_strcasecmp(nondigit,"rdly") ||
3164 !g_ascii_strcasecmp(nondigit,"ndly") ||
3165 !g_ascii_strcasecmp(nondigit,"thly"))
3167 /* digits, ending in l, L, s or d */
/* Only the "l" comparison ignores case; "s" and "d" must be lower-case. */
3168 if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
3169 !strcmp(nondigit,"d"))
3172 * L at the start of a number, representing Britsh pounds, like L500.
3173 * This is cute. We know the current word is mixed digit. If the first
3174 * letter is L, there must be at least one digit following. If both
3175 * digits and letters follow, we have a genuine error, else we have a
3176 * capital L followed by digits, and we accept that as a non-error.
/* Recurses on the remainder of the word after the leading 'L'. */
3178 if (g_utf8_get_char(checkword)=='L' &&
3179 !mixdigit(g_utf8_next_char(checkword)))
3188 * Extracts the first/next "word" from the line, and returns it.
3189 * A word is defined as one English word unit--or at least that's the aim.
3190 * "ptr" is advanced to the position in the line where we will start
3191 * looking for the next word.
3193 * Returns: A newly-allocated string.
/*
 * The result is built in a GString and handed back with
 * g_string_free(word,FALSE), which returns the character data itself.
 * NOTE(review): some short statements (the initialisation of s and the
 * update of *ptr on the punctuated-number path) are not visible in this
 * listing.
 */
3195 gchar *getaword(const char **ptr)
3200 word=g_string_new(NULL);
/* Skip forward to the first digit or letter, stopping at end of line. */
3201 for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
3202 !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
3203 **ptr;*ptr=g_utf8_next_char(*ptr))
3206 * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
3207 * Especially yucky is the case of L1,000
3208 * This section looks for a pattern of characters including a digit
3209 * followed by a comma or period followed by one or more digits.
3210 * If found, it returns this whole pattern as a word; otherwise we discard
3211 * the results and resume our normal programming.
/* Collect a run of digits, letters, commas and periods into word. */
3214 for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
3215 g_unichar_isalpha(g_utf8_get_char(s)) ||
3216 g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
3217 g_string_append_unichar(word,g_utf8_get_char(s));
/* From the second character on, look for '.' or ',' right after a digit. */
3220 for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))
3222 c=g_utf8_get_char(t);
3223 pc=g_utf8_get_char(g_utf8_prev_char(t));
3224 if ((c=='.' || c==',') && g_unichar_isdigit(pc))
3227 return g_string_free(word,FALSE);
3231 /* we didn't find a punctuated number - do the regular getword thing */
/* Start again from *ptr: a word is digits, letters and apostrophes. */
3232 g_string_truncate(word,0);
3233 c=g_utf8_get_char(*ptr);
3234 for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);
3235 *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))
3236 g_string_append_unichar(word,c);
3237 return g_string_free(word,FALSE);
3243 * Is this word a Roman Numeral?
3245 * It doesn't actually validate that the number is a valid Roman Numeral--for
3246 * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
3247 * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
3248 * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
3249 * expressions thereof, except when it came to taxes. Allow any number of M,
3250 * an optional D, an optional CM or CD, any number of optional Cs, an optional
3251 * XL or an optional XC, an optional IX or IV, an optional V and any number
/*
 * All comparisons below are against lower-case letters.
 * NOTE(review): the statements that advance t after each match, and the
 * final return, are not visible in this listing; presumably t is moved
 * past each matched group and the word passes if nothing is left.
 */
3254 gboolean isroman(const char *t)
/* Thousands. */
3260 while (g_utf8_get_char(t)=='m' && *t)
/* Hundreds: d, cm, cd, then any run of c. */
3262 if (g_utf8_get_char(t)=='d')
3264 if (g_str_has_prefix(t,"cm"))
3266 if (g_str_has_prefix(t,"cd"))
3268 while (g_utf8_get_char(t)=='c' && *t)
/* Tens: xl, xc, l, then any run of x. */
3270 if (g_str_has_prefix(t,"xl"))
3272 if (g_str_has_prefix(t,"xc"))
3274 if (g_utf8_get_char(t)=='l')
3276 while (g_utf8_get_char(t)=='x' && *t)
/* Units: ix, iv, v, then any run of i. */
3278 if (g_str_has_prefix(t,"ix"))
3280 if (g_str_has_prefix(t,"iv"))
3282 if (g_utf8_get_char(t)=='v')
3284 while (g_utf8_get_char(t)=='i' && *t)
3290 * postprocess_for_DP:
3292 * Invoked with the -d switch from flgets().
3293 * It simply "removes" from the line a hard-coded set of common
3294 * DP-specific tags, so that the line passed to the main routine has
3295 * been pre-cleaned of DP markup.
/* theline is edited in place and only ever gets shorter. */
3297 void postprocess_for_DP(char *theline)
/* The DPmarkup table is terminated by an empty string. */
3303 for (i=0;*DPmarkup[i];i++)
/* Remove every occurrence of this tag, not just the first. */
3304 while ((s=strstr(theline,DPmarkup[i])))
/* Slide the rest of the line, including its NUL, down over the tag. */
3306 t=s+strlen(DPmarkup[i]);
3307 memmove(s,t,strlen(t)+1);
3312 * postprocess_for_HTML:
3314 * Invoked with the -m switch from flgets().
3315 * It simply "removes" from the line a hard-coded set of common
3316 * HTML tags and "replaces" a hard-coded set of common HTML
3317 * entities, so that the line passed to the main routine has
3318 * been pre-cleaned of HTML.
3320 void postprocess_for_HTML(char *theline)
/* Call losemarkup() repeatedly until it returns NULL. */
3322 while (losemarkup(theline))
/* Then process the entities in whatever text remains. */
3324 loseentities(theline);
/*
 * Looks at the first '<' in theline and the first '>' after it and
 * removes that tag in place if it matches an entry of the markup table.
 * NOTE(review): the return statements are not visible in this listing;
 * postprocess_for_HTML() loops while the result is non-NULL.
 */
3327 char *losemarkup(char *theline)
3331 s=strchr(theline,'<');
/* t stays NULL when there is no '<' or no '>' after it. */
3332 t=s?strchr(s,'>'):NULL;
/* The markup table is terminated by an empty string. */
3335 for (i=0;*markup[i];i++)
/* Compare the text just after the '<' against this table entry. */
3336 if (tagcomp(g_utf8_next_char(s),markup[i]))
/* Slide the text after the '>' (with its NUL) down over the tag. */
3338 t=g_utf8_next_char(t);
3339 memmove(s,t,strlen(t)+1);
3342 /* It's an unrecognized <xxx>. */
/*
 * Replaces HTML entities (&name;, &#NNN; and &#xHHH;) in theline, in
 * place, with the characters they stand for. Calling it with a NULL
 * theline releases the cached lookup tree and iconv descriptors.
 * NOTE(review): several guards and short statements are not visible in
 * this listing; the comments below describe only the visible lines.
 */
3346 void loseentities(char *theline)
/*
 * The lookup tree is cached across calls like the iconv descriptors
 * below. It must be static so that it is built once, not rebuilt and
 * leaked on every call, and so that the NULL-argument cleanup can
 * actually find and destroy it.
 */
3353 static GTree *entities=NULL;
3354 static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
/* Cleanup: drop the tree and close both descriptors. */
3358 g_tree_destroy(entities);
3360 if (translit!=(GIConv)-1)
3361 g_iconv_close(translit);
3362 translit=(GIConv)-1;
3363 if (to_utf8!=(GIConv)-1)
3364 g_iconv_close(to_utf8);
/* Build the name -> code point tree from the HTMLentities table. */
3372 entities=g_tree_new((GCompareFunc)strcmp);
3373 for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
3374 g_tree_insert(entities,HTMLentities[i].name,
3375 GUINT_TO_POINTER(HTMLentities[i].c));
/* Open the descriptors on first use. */
3377 if (translit==(GIConv)-1)
3378 translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
3379 if (to_utf8==(GIConv)-1)
3380 to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
/* Handle each '&' in turn; theline is moved forward as text is consumed. */
3381 while((amp=strchr(theline,'&')))
3383 scolon=strchr(amp,';');
/* Numeric forms: all-decimal digits, or 'x' followed by hex digits. */
3388 if (amp+2+strspn(amp+2,"0123456789")==scolon)
3389 c=strtol(amp+2,NULL,10);
3390 else if (amp[2]=='x' &&
3391 amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
3392 c=strtol(amp+3,NULL,16);
/* Named form: look the text between '&' and ';' up in the tree. */
3396 s=g_strndup(amp+1,scolon-(amp+1));
3397 c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
3406 if (c<128 || c>=192 && c<=255) /* An ISO-8859-1 character */
3407 theline+=g_unichar_to_utf8(c,theline);
/*
 * Otherwise encode c as UTF-8, transliterate it to ISO-8859-1 and
 * convert the result back to UTF-8 before copying it into the line.
 */
3411 nb=g_unichar_to_utf8(c,s);
3412 t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
3414 s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
3416 memcpy(theline,s,nb);
/* Close the gap: move the text after the ';' (with its NUL) down. */
3420 memmove(theline,g_utf8_next_char(scolon),
3421 strlen(g_utf8_next_char(scolon))+1);
/* Resume the search just after this '&'. */
3424 theline=g_utf8_next_char(amp);
/*
 * Case-insensitive test of whether strin begins with basetag, after
 * skipping one leading '/' in strin if present.
 * NOTE(review): the release of s and t and the return statement are not
 * visible in this listing; presumably retval is returned.
 */
3428 gboolean tagcomp(const char *strin,const char *basetag)
3432 if (g_utf8_get_char(strin)=='/')
3433 t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
3435 t=g_utf8_casefold(strin,-1);
/* Both sides are case-folded before the prefix comparison. */
3436 s=g_utf8_casefold(basetag,-1);
3437 retval=g_str_has_prefix(t,s);
3443 void proghelp(GOptionContext *context)
3446 fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
3447 fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
3448 fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
3449 fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
3450 "For details, read the file COPYING.\n",stderr);
3451 fputs("This is Free Software; "
3452 "you may redistribute it under certain conditions (GPL);\n",stderr);
3453 fputs("read the file COPYING for details.\n\n",stderr);
3454 help=g_option_context_get_help(context,TRUE,NULL);
3457 fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
3458 fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
3459 "non-ASCII\n",stderr);
3460 fputs("characters like accented letters, "
3461 "lines longer than 75 or shorter than 55,\n",stderr);
3462 fputs("unbalanced quotes or brackets, "
3463 "a variety of badly formatted punctuation, \n",stderr);
3464 fputs("HTML tags, some likely typos. "
3465 "It is NOT a substitute for human judgement.\n",stderr);