1 /*************************************************************************/
2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com> */
5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk> */
7 /* This program is free software; you can redistribute it and/or modify */
8 /* it under the terms of the GNU General Public License as published by */
9 /* the Free Software Foundation; either version 2 of the License, or */
10 /* (at your option) any later version. */
12 /* This program is distributed in the hope that it will be useful, */
13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
15 /* GNU General Public License for more details. */
17 /* You should have received a copy of the GNU General Public License */
18 /* along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 /*************************************************************************/
30 #include "bookloupe.h"
33 #include "HTMLentities.h"
/*
 * Name of the character set selected for the text being checked.
 * set_charset() stores a g_strdup() copy of the requested name here.
 */
35 gchar *charset; /* Or NULL for auto (ISO_8859-1/ASCII or UNICODE) */
/*
 * iconv descriptor managed by set_charset(); (GIConv)-1 means no descriptor
 * is open. set_charset() closes any open descriptor and, for a charset
 * other than UTF-8, opens a new one with g_iconv_open(charset,"UTF-8").
 */
36 GIConv charset_validator=(GIConv)-1;
42 "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
43 "nad", "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa",
44 "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
45 "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
46 "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
47 "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
48 "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
49 "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
50 "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
51 "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
52 "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
53 "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
54 "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
55 "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
56 "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
57 "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
58 "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
59 "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
60 "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
61 "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
62 "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
63 "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
64 "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
65 "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
66 "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
67 "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
68 "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
69 "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
70 "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
76 /* Common abbreviations and other OK words not to query as typos. */
78 "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
79 "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
80 "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
81 "outbid", "outbids", "frostbite", "frostbitten", ""
84 /* Common abbreviations that cause otherwise unexplained periods. */
86 "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
87 "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
91 * Two-Letter combinations that rarely if ever start words,
92 * but are common scannos or otherwise common letter combinations.
95 "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
99 * Two-Letter combinations that rarely if ever end words,
100 * but are common scannos or otherwise common letter combinations.
103 "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
104 "sw", "gr", "sl", "cl", "iy", ""
108 "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
109 "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
110 "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
111 "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
115 "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
119 "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
120 "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
121 "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
122 "during", "let", "toward", "among", ""
126 "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
127 "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
128 "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
129 "among", "those", "into", "whom", "having", "thence", ""
/*
 * Boolean switch table indexed by the *_SWITCH constants. The option
 * tables in this file point their arg_data at individual slots
 * (pswit+X_SWITCH), so option parsing writes straight into this array.
 */
132 gboolean pswit[SWITNO]; /* program switches */
/*
 * Flags set by the gutcheck compatibility options. --toggle-typo stores
 * into typo_compat. NOTE(review): paranoid_compat is presumably the target
 * of --toggle-relaxed; confirm against compatibility_options[].
 */
135 gboolean typo_compat,paranoid_compat;
/*
 * Command-line switches that can also be set from the configuration file.
 *
 * Each boolean switch appears twice: a plain entry and a "no-" entry
 * carrying G_OPTION_FLAG_REVERSE, both writing the same pswit[] slot.
 * One entry of each pair is marked G_OPTION_FLAG_HIDDEN; the description
 * installed by parse_options() explains that only the entries which
 * reverse the default configuration are listed in the help output.
 * config_file_update(), config_file_add_comments() and parse_config_file()
 * walk this table and test for the "no-" prefix on each long name.
 *
 * The final visible entry, "charset", is string-valued
 * (G_OPTION_ARG_STRING) and stores into opt_charset.
 */
137 static GOptionEntry options[]={
138 { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
139 "Ignore DP-specific markup", NULL },
140 { "no-dp", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
141 G_OPTION_ARG_NONE, pswit+DP_SWITCH,
142 "Don't ignore DP-specific markup", NULL },
143 { "echo", 0, G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
144 "Echo queried line", NULL },
145 { "no-echo", 'e', G_OPTION_FLAG_REVERSE,
146 G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
147 "Don't echo queried line", NULL },
148 { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
149 "Check single quotes", NULL },
150 { "no-squote", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
151 G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
152 "Don't check single quotes", NULL },
153 { "typo", 0, 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
154 "Check common typos", NULL },
155 { "no-typo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
156 G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
157 "Don't check common typos", NULL },
158 { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
159 "Require closure of quotes on every paragraph", NULL },
160 { "no-qpara", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
161 G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
162 "Don't require closure of quotes on every paragraph", NULL },
163 { "paranoid", 0, G_OPTION_FLAG_HIDDEN,
164 G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
165 "Enable paranoid querying of everything", NULL },
166 { "no-paranoid", 0, G_OPTION_FLAG_REVERSE,
167 G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
168 "Disable paranoid querying of everything", NULL },
169 { "line-end", 0, G_OPTION_FLAG_HIDDEN,
170 G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
171 "Enable line end checking", NULL },
172 { "no-line-end", 'l', G_OPTION_FLAG_REVERSE,
173 G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
174 "Disable line end checking", NULL },
175 { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
176 "Overview: just show counts", NULL },
177 { "no-overview", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
178 G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
179 "Show individual warnings", NULL },
180 { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
181 "Output errors to stdout instead of stderr", NULL },
182 { "no-stdout", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
183 G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
184 "Output errors to stderr instead of stdout", NULL },
185 { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
186 "Echo header fields", NULL },
187 { "no-header", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
188 G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
189 "Don't echo header fields", NULL },
190 { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
191 "Ignore markup in < >", NULL },
192 { "no-markup", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
193 G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
194 "No special handling for markup in < >", NULL },
195 { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
196 "Use file of user-defined typos", NULL },
197 { "no-usertypo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
198 G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
199 "Ignore file of user-defined typos", NULL },
200 { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
201 "Verbose - list everything", NULL },
202 { "no-verbose", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
203 G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
204 "Switch off verbose mode", NULL },
205 { "charset", 0, 0, G_OPTION_ARG_STRING, &opt_charset,
206 "Set of characters valid for this ebook", "NAME" },
211 * Options relating to configuration which make no sense from inside
212 * a configuration file.
/*
 * Command-line-only switches. parse_options() registers this table as a
 * second set of main entries. --web sets pswit[WEB_SWITCH], after which
 * parse_options() overwrites the other switches with fixed values;
 * --dump-config sets pswit[DUMP_CONFIG_SWITCH], which parse_options()
 * tests after parsing.
 */
215 static GOptionEntry config_options[]={
216 { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
217 "Defaults for use on www upload", NULL },
218 { "dump-config", 0, 0, G_OPTION_ARG_NONE, pswit+DUMP_CONFIG_SWITCH,
219 "Dump current config settings", NULL },
/*
 * Switches kept for command-line compatibility with gutcheck.
 * parse_options() adds them as a separate "compatibility" option group.
 * They store into typo_compat and paranoid_compat rather than directly
 * into pswit[]; parse_options() then toggles the corresponding pswit[]
 * entries after parsing.
 */
223 static GOptionEntry compatibility_options[]={
224 { "toggle-typo", 't', 0, G_OPTION_ARG_NONE, &typo_compat,
225 "Toggle checking for common typos", NULL },
226 { "toggle-relaxed", 'x', 0, G_OPTION_ARG_NONE, &paranoid_compat,
227 "Toggle both paranoid mode and common typos", NULL },
/*
 * Global tallies. main() prints the cnt_* values in overview mode; its
 * TOTAL QUERIES line sums cnt_quote, cnt_brack, cnt_bin, cnt_odd, cnt_long,
 * cnt_short, cnt_punct, cnt_dash, cnt_word, cnt_html and cnt_lineend.
 * cnt_spacend is not part of that sum. linecnt and checked_linecnt are
 * printed by main() as the total and checked line counts.
 */
231 long cnt_quote; /* for overview mode, count of quote queries */
232 long cnt_brack; /* for overview mode, count of brackets queries */
233 long cnt_bin; /* for overview mode, count of non-ASCII queries */
234 long cnt_odd; /* for overview mode, count of odd character queries */
235 long cnt_long; /* for overview mode, count of long line errors */
236 long cnt_short; /* for overview mode, count of short line queries */
237 long cnt_punct; /* for overview mode,
238 count of punctuation and spacing queries */
239 long cnt_dash; /* for overview mode, count of dash-related queries */
240 long cnt_word; /* for overview mode, count of word queries */
241 long cnt_html; /* for overview mode, count of html queries */
242 long cnt_lineend; /* for overview mode, count of line-end queries */
243 long cnt_spacend; /* count of lines with space at end */
244 long linecnt; /* count of total lines in the file */
245 long checked_linecnt; /* count of lines actually checked */
/*
 * Forward declarations of helper functions used by this file.
 * NOTE(review): the definitions are not all visible in the reviewed text;
 * where each one lives should be confirmed before changing a signature.
 */
247 void proghelp(GOptionContext *context);
248 void procfile(const char *);
252 gboolean mixdigit(const char *);
253 gchar *getaword(const char **);
254 char *flgets(char **,long);
255 void postprocess_for_HTML(char *);
256 char *linehasmarkup(char *);
257 char *losemarkup(char *);
258 gboolean tagcomp(const char *,const char *);
259 void loseentities(char *);
260 gboolean isroman(const char *);
261 void postprocess_for_DP(char *);
262 void print_as_windows_1252(const char *string);
263 void print_as_utf_8(const char *string);
/* Two global GLib balanced trees; NOTE(review): their use is not visible here. */
265 GTree *qword,*qperiod;
/*
 * Write the current value of each boolean switch in options[] into the
 * "options" group of kf, keyed by the option long name.
 *
 * For every entry the long name is first tested for a "no-" prefix
 * (the statement guarded by that test is not visible here; presumably the
 * entry is skipped). For G_OPTION_ARG_NONE entries the gboolean behind
 * arg_data is read, G_OPTION_FLAG_REVERSE is tested (presumably to invert
 * the value), and the result is stored with g_key_file_set_boolean().
 *
 * NOTE(review): g_assert_not_reached() appears to be the fallback for
 * entries that are not G_OPTION_ARG_NONE. options[] contains the
 * string-valued "charset" entry, so confirm that this entry cannot reach
 * the assertion.
 */
273 void config_file_update(GKeyFile *kf)
277 for(i=0;options[i].long_name;i++)
279 if (g_str_has_prefix(options[i].long_name,"no-"))
281 if (options[i].arg==G_OPTION_ARG_NONE)
283 sw=*(gboolean *)options[i].arg_data;
284 if (options[i].flags&G_OPTION_FLAG_REVERSE)
286 g_key_file_set_boolean(kf,"options",options[i].long_name,sw);
289 g_assert_not_reached();
/*
 * Annotate kf with human-readable comments: a file-level comment
 * (" Default configuration for bookloupe") and, for entries of options[],
 * a per-key comment in the "options" group built from the option
 * description prefixed with a space.
 *
 * Each long name is tested for a "no-" prefix before the comment is
 * built (the guarded statement is not visible; presumably a skip).
 * NOTE(review): the string returned by g_strconcat() is newly allocated;
 * its release is not visible in the reviewed text and should be confirmed.
 */
293 void config_file_add_comments(GKeyFile *kf)
297 g_key_file_set_comment(kf,NULL,NULL," Default configuration for bookloupe",
299 for(i=0;options[i].long_name;i++)
301 if (g_str_has_prefix(options[i].long_name,"no-"))
303 comment=g_strconcat(" ",options[i].description,NULL);
304 g_key_file_set_comment(kf,"options",options[i].long_name,comment,NULL);
/*
 * Serialise the current switch settings as key-file text.
 *
 * Two paths are visible: one calls config_file_update() on the existing
 * global config, the other creates a fresh GKeyFile with g_key_file_new(),
 * fills it with config_file_update() and adds comments with
 * config_file_add_comments(). The condition choosing between them is not
 * visible (presumably whether a config file was loaded). The result of
 * g_key_file_to_data() is assigned to s; what is done with s afterwards
 * is not visible here.
 */
309 void dump_config(void)
313 config_file_update(config);
316 config=g_key_file_new();
317 config_file_update(config);
318 config_file_add_comments(config);
320 s=g_key_file_to_data(config,NULL,NULL);
/*
 * Locate and load "bookloupe.ini".
 *
 * The directory list comes from the BOOKLOUPE_CONFIG_PATH environment
 * variable, split on ";" or on ":" (the condition selecting the separator
 * is not visible; presumably platform-dependent). Otherwise a four-slot
 * array is allocated and filled with the current directory, running_from
 * and the user configuration directory (the fourth slot is presumably the
 * NULL terminator required by the loop and by g_strfreev()).
 *
 * Each directory is tried in order with g_key_file_load_from_file(),
 * keeping comments and translations. Load errors other than
 * G_FILE_ERROR_NOENT are reported with g_printerr(). The directory list
 * is released with g_strfreev().
 *
 * NOTE(review): how *full_path and the return value are set is not
 * visible in the reviewed text.
 */
326 GKeyFile *read_config_file(gchar **full_path)
332 const char *search_path;
335 search_path=g_getenv("BOOKLOUPE_CONFIG_PATH");
339 search_dirs=g_strsplit(search_path,";",0);
341 search_dirs=g_strsplit(search_path,":",0);
346 search_dirs=g_new(gchar *,4);
347 search_dirs[0]=g_get_current_dir();
348 search_dirs[1]=g_strdup(running_from);
349 search_dirs[2]=g_strdup(g_get_user_config_dir());
352 for(i=0;search_dirs[i];i++)
354 path=g_build_filename(search_dirs[i],"bookloupe.ini",NULL);
355 if (g_key_file_load_from_file(kf,path,
356 G_KEY_FILE_KEEP_COMMENTS|G_KEY_FILE_KEEP_TRANSLATIONS,&err))
358 if (!g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
360 g_printerr("Bookloupe: Error reading %s\n",path);
361 g_printerr("%s\n",err->message);
373 g_strfreev(search_dirs);
/*
 * Apply settings from the configuration file to the switch table.
 *
 * The file is loaded into the global config with read_config_file(), and
 * the keys of its "options" group are fetched. For a key keys[i] (the
 * loop over i is not visible here) options[] is scanned: entries whose
 * long name starts with "no-" are tested first, otherwise an exact
 * strcmp() match selects the entry. For G_OPTION_ARG_NONE entries the
 * value is read with g_key_file_get_boolean(); a read error is reported
 * with g_printerr() as "options.<key>". G_OPTION_FLAG_REVERSE is tested
 * (presumably to invert the value) before it is stored through arg_data.
 * A key that matches no entry is reported as an unknown option.
 *
 * NOTE(review): no guard for a NULL result from read_config_file() is
 * visible before g_key_file_get_keys(); confirm one exists.
 * NOTE(review): g_assert_not_reached() appears to be the fallback for a
 * matched entry that is not G_OPTION_ARG_NONE; options[] contains the
 * string-valued "charset" entry, so a "charset" key in the file may reach
 * the assertion. Confirm.
 */
381 void parse_config_file(void)
388 config=read_config_file(&path);
390 keys=g_key_file_get_keys(config,"options",NULL,NULL);
397 for(j=0;options[j].long_name;j++)
399 if (g_str_has_prefix(options[j].long_name,"no-"))
401 else if (!strcmp(keys[i],options[j].long_name))
403 if (options[j].arg==G_OPTION_ARG_NONE)
405 sw=g_key_file_get_boolean(config,"options",keys[i],
409 g_printerr("Bookloupe: %s: options.%s: %s\n",
410 path,keys[i],err->message);
413 if (options[j].flags&G_OPTION_FLAG_REVERSE)
415 *(gboolean *)options[j].arg_data=sw;
419 g_assert_not_reached();
422 if (!options[j].long_name)
423 g_printerr("Bookloupe: %s: Unknown option \"%s\" ignored\n",
/*
 * Select the character set used for validation.
 *
 * name: charset name, or NULL / "auto" (case-insensitive) for automatic.
 * err:  receives a G_CONVERT_ERROR_NO_CONVERSION error when iconv cannot
 *       convert from UTF-8 to the requested charset.
 *
 * Any iconv descriptor already open in charset_validator is closed first.
 * For NULL or "auto" the validator is reset to (GIConv)-1. Otherwise the
 * name is duplicated into the global charset; if it matches one of the
 * UNICODE encoding aliases (case-insensitive) charset is set to a copy of
 * "UTF-8". A charset of exactly "UTF-8" leaves the validator at
 * (GIConv)-1; any other charset opens a descriptor with
 * g_iconv_open(charset,"UTF-8").
 *
 * NOTE(review): the return statements and the release of the previous
 * charset string are not visible in the reviewed text.
 * NOTE(review): g_strcasecmp() is deprecated in GLib; g_ascii_strcasecmp()
 * is the documented replacement for ASCII names such as these.
 */
432 gboolean set_charset(const char *name,GError **err)
434 /* The various UNICODE encodings all share the same character set. */
435 const char *unicode_aliases[]={ "UCS-2", "UCS-2BE", "UCS-2LE", "UCS-4",
436 "UCS-4BE", "UCS-4LE", "UCS2", "UCS4", "UNICODE", "UNICODEBIG",
437 "UNICODELITTLE", "UTF-7", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE",
438 "UTF-32", "UTF-32BE", "UTF-32LE", "UTF7", "UTF8", "UTF16", "UTF16BE",
439 "UTF16LE", "UTF32", "UTF32BE", "UTF32LE" };
443 if (charset_validator!=(GIConv)-1)
444 g_iconv_close(charset_validator);
445 if (!name || !g_strcasecmp(name,"auto"))
448 charset_validator=(GIConv)-1;
452 charset=g_strdup(name);
453 for(i=0;i<G_N_ELEMENTS(unicode_aliases);i++)
454 if (!g_strcasecmp(charset,unicode_aliases[i]))
457 charset=g_strdup("UTF-8");
460 if (!strcmp(charset,"UTF-8"))
461 charset_validator=(GIConv)-1;
464 charset_validator=g_iconv_open(charset,"UTF-8");
465 if (charset_validator==(GIConv)-1)
467 g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_NO_CONVERSION,
468 "Unknown character set \"%s\"",charset);
/*
 * Parse the command line into pswit[] and related globals.
 *
 * argc, argv: pointers to the values received by main(); they are passed
 *             to g_option_context_parse().
 *
 * Steps visible here, in order:
 *  - build a GOptionContext with options[] and config_options[] as main
 *    entries and compatibility_options[] as a separate group;
 *  - parse; on failure print the error and a hint to use --help;
 *  - toggle pswit[TYPO_SWITCH], and toggle both pswit[PARANOID_SWITCH]
 *    and pswit[TYPO_SWITCH] (the guarding conditions are not visible;
 *    presumably typo_compat and paranoid_compat respectively);
 *  - when pswit[WEB_SWITCH] is set, overwrite the other switches with the
 *    fixed values listed below;
 *  - apply opt_charset through set_charset(), printing its error message
 *    on failure;
 *  - test pswit[DUMP_CONFIG_SWITCH] (the action taken is not visible);
 *  - in overview mode force pswit[ECHO_SWITCH] off;
 *  - free the option context.
 */
475 void parse_options(int *argc,char ***argv)
478 GOptionContext *context;
479 GOptionGroup *compatibility;
480 context=g_option_context_new(
481 "file - look for errors in Project Gutenberg(TM) etexts");
482 g_option_context_add_main_entries(context,options,NULL);
483 g_option_context_add_main_entries(context,config_options,NULL);
484 compatibility=g_option_group_new("compatibility",
485 "Options for Compatibility with Gutcheck:",
486 "Show compatibility options",NULL,NULL);
487 g_option_group_add_entries(compatibility,compatibility_options);
488 g_option_context_add_group(context,compatibility);
489 g_option_context_set_description(context,
490 "For simplicity, only the switch options which reverse the\n"
491 "default configuration are listed. In most cases, both vanilla\n"
492 "and \"no-\" prefixed versions are available for use.");
493 if (!g_option_context_parse(context,argc,argv,&err))
495 g_printerr("Bookloupe: %s\n",err->message);
496 g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
500 pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
503 pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
504 pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
507 * Web uploads - for the moment, this is really just a placeholder
508 * until we decide what processing we really want to do on web uploads
510 if (pswit[WEB_SWITCH])
512 /* specific override for web uploads */
513 pswit[ECHO_SWITCH]=TRUE;
514 pswit[SQUOTE_SWITCH]=FALSE;
515 pswit[TYPO_SWITCH]=TRUE;
516 pswit[QPARA_SWITCH]=FALSE;
517 pswit[PARANOID_SWITCH]=TRUE;
518 pswit[LINE_END_SWITCH]=FALSE;
519 pswit[OVERVIEW_SWITCH]=FALSE;
520 pswit[STDOUT_SWITCH]=FALSE;
521 pswit[HEADER_SWITCH]=TRUE;
522 pswit[VERBOSE_SWITCH]=FALSE;
523 pswit[MARKUP_SWITCH]=FALSE;
524 pswit[USERTYPO_SWITCH]=FALSE;
525 pswit[DP_SWITCH]=FALSE;
527 if (opt_charset && !set_charset(opt_charset,&err))
529 g_printerr("%s\n",err->message);
532 if (pswit[DUMP_CONFIG_SWITCH])
539 if (pswit[OVERVIEW_SWITCH])
540 /* just print summary; don't echo */
541 pswit[ECHO_SWITCH]=FALSE;
547 g_option_context_free(context);
553 * Read in the user-defined stealth scanno list.
/*
 * Load the user typo list into the global usertypo tree.
 *
 * Candidate files are tried in this order, moving on only when the
 * previous attempt failed with G_FILE_ERROR_NOENT: "bookloupe.typ" in the
 * current directory, "bookloupe.typ" in running_from, "gutcheck.typ" in
 * the current directory, "gutcheck.typ" in running_from. If the last
 * attempt is also NOENT a notice is printed with g_print(); a different
 * error is written to stderr with the file name.
 *
 * Valid UTF-8 contents are normalised with G_NORMALIZE_DEFAULT_COMPOSE;
 * otherwise the bytes are converted from WINDOWS-1252 to UTF-8. The text
 * is split on CR and LF, and every line whose first byte is greater than
 * the exclamation mark is inserted as a key in usertypo. The tree is
 * created with g_free as key destructor, so inserted strings are owned by
 * the tree.
 *
 * NOTE(review): the condition guarding set_charset("UNICODE",NULL) is not
 * visible here.
 * NOTE(review): g_convert() is called with a NULL error pointer and no
 * check of its result is visible before g_strsplit_set(); confirm a
 * failed conversion cannot reach the loop with a NULL lines array.
 * NOTE(review): strcmp is cast to GCompareDataFunc, so it is invoked with
 * an extra user-data argument it does not declare.
 * NOTE(review): release of contents, utf8, the lines array and the lines
 * that were not inserted is not visible in the reviewed text.
 */
555 void read_user_scannos(void)
558 gchar *usertypo_file;
562 gchar *contents,*utf8,**lines;
563 usertypo_file=g_strdup("bookloupe.typ");
564 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
565 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
568 g_free(usertypo_file);
569 usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
570 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
572 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
575 g_free(usertypo_file);
576 usertypo_file=g_strdup("gutcheck.typ");
577 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
579 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
582 g_free(usertypo_file);
583 usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
584 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
586 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
588 g_free(usertypo_file);
589 g_print(" --> I couldn't find bookloupe.typ "
590 "-- proceeding without user typos.\n");
595 fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
596 g_free(usertypo_file);
600 if (g_utf8_validate(contents,len,NULL))
602 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
604 (void)set_charset("UNICODE",NULL);
607 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
609 lines=g_strsplit_set(utf8,"\r\n",0);
611 usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
612 for (i=0;lines[i];i++)
613 if (*(unsigned char *)lines[i]>'!')
614 g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
623 * Read an etext returning a newly allocated string containing the file
624 * contents or NULL on error.
/*
 * Load filename and produce its text as UTF-8.
 *
 * filename: path handed to g_file_get_contents().
 * err:      receives the read error, or a conversion error.
 *
 * If the bytes are valid UTF-8 they are normalised with
 * G_NORMALIZE_DEFAULT_COMPOSE and print_as_utf_8 is installed as the
 * g_print handler. Otherwise the bytes are converted from WINDOWS-1252
 * and print_as_windows_1252 is installed. The SetConsoleOutputCP() calls
 * are presumably compiled for Windows only (the guards are not visible).
 *
 * On G_CONVERT_ERROR_ILLEGAL_SEQUENCE the bytes before the failure offset
 * (bytes_read) are scanned, tracking newlines and skipping carriage
 * returns, to report the offending byte with a line and column; any other
 * conversion error is propagated unchanged.
 *
 * NOTE(review): the return statements and the release of contents are not
 * visible in the reviewed text.
 */
626 gchar *read_etext(const char *filename,GError **err)
628 GError *tmp_err=NULL;
629 gchar *contents,*utf8;
630 gsize len,bytes_read,bytes_written;
632 if (!g_file_get_contents(filename,&contents,&len,err))
634 if (g_utf8_validate(contents,len,NULL))
636 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
637 g_set_print_handler(print_as_utf_8);
639 SetConsoleOutputCP(CP_UTF8);
644 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,
645 &bytes_written,&tmp_err);
646 if (g_error_matches(tmp_err,G_CONVERT_ERROR,
647 G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
650 for(i=0;i<bytes_read;i++)
651 if (contents[i]=='\n')
656 else if (contents[i]!='\r')
658 g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
659 "Input conversion failed. Byte %d at line %d, column %d is not a "
660 "valid Windows-1252 character",
661 ((unsigned char *)contents)[bytes_read],line,col);
664 g_propagate_error(err,tmp_err);
665 g_set_print_handler(print_as_windows_1252);
667 SetConsoleOutputCP(1252);
/*
 * atexit() handler registered by main(): restores the console output code
 * page that main() saved in saved_cp. NOTE(review): the call is presumably
 * compiled for Windows only; the guard is not visible here.
 */
674 void cleanup_on_exit(void)
677 SetConsoleOutputCP(saved_cp);
/*
 * Program entry point.
 *
 * Registers cleanup_on_exit() and saves the console code page (presumably
 * on Windows only; the guards are not visible), records the directory of
 * argv[0] in running_from, and sets the switches that default to on
 * (paranoid, typo, line-end, echo) before calling parse_options().
 * pswit[USERTYPO_SWITCH] is tested afterwards (the guarded statement is
 * not visible; presumably the user typo list is loaded). A banner is
 * written to stderr.
 *
 * In overview mode the line counts and the per-category query counts are
 * printed, followed by a TOTAL QUERIES line that sums eleven cnt_*
 * counters. NOTE(review): each per-category g_print is preceded by a line
 * that is not visible here, presumably a test that the count is non-zero.
 *
 * Finally running_from, the usertypo tree, the charset state (through
 * set_charset(NULL,NULL)) and the config key file are released.
 * NOTE(review): any guards around g_tree_unref() and g_key_file_free()
 * are not visible; confirm they tolerate unset globals.
 */
681 int main(int argc,char **argv)
684 atexit(cleanup_on_exit);
685 saved_cp=GetConsoleOutputCP();
687 running_from=g_path_get_dirname(argv[0]);
688 /* Paranoid checking is turned OFF, not on, by its switch */
689 pswit[PARANOID_SWITCH]=TRUE;
690 /* if running in paranoid mode, typo checks default to enabled */
691 pswit[TYPO_SWITCH]=TRUE;
692 /* Line-end checking is turned OFF, not on, by its switch */
693 pswit[LINE_END_SWITCH]=TRUE;
694 /* Echoing is turned OFF, not on, by its switch */
695 pswit[ECHO_SWITCH]=TRUE;
697 parse_options(&argc,&argv);
698 if (pswit[USERTYPO_SWITCH])
700 fprintf(stderr,"bookloupe: Check and report on an e-text\n");
702 if (pswit[OVERVIEW_SWITCH])
704 g_print(" Checked %ld lines of %ld (head+foot = %ld)\n\n",
705 checked_linecnt,linecnt,linecnt-checked_linecnt);
706 g_print(" --------------- Queries found --------------\n");
708 g_print(" Long lines: %14ld\n",cnt_long);
710 g_print(" Short lines: %14ld\n",cnt_short);
712 g_print(" Line-end problems: %14ld\n",cnt_lineend);
714 g_print(" Common typos: %14ld\n",cnt_word);
716 g_print(" Unmatched quotes: %14ld\n",cnt_quote);
718 g_print(" Unmatched brackets: %14ld\n",cnt_brack);
720 g_print(" Non-ASCII characters: %14ld\n",cnt_bin);
722 g_print(" Proofing characters: %14ld\n",cnt_odd);
724 g_print(" Punctuation & spacing queries: %14ld\n",cnt_punct);
726 g_print(" Non-standard dashes: %14ld\n",cnt_dash);
728 g_print(" Possible HTML tags: %14ld\n",cnt_html);
730 g_print(" TOTAL QUERIES %14ld\n",
731 cnt_quote+cnt_brack+cnt_bin+cnt_odd+cnt_long+cnt_short+cnt_punct+
732 cnt_dash+cnt_word+cnt_html+cnt_lineend);
734 g_free(running_from);
736 g_tree_unref(usertypo);
737 set_charset(NULL,NULL);
739 g_key_file_free(config);
/*
 * Classify the occurrences of dash within line by the spacing around them.
 *
 * line:    one line of text.
 * dash:    the dash string to look for (first_pass() passes "--" and the
 *          em-dash character).
 * results: tallies updated in place; the visible code increments
 *          results->non_PG_space, and the remaining updates are on lines
 *          not visible here.
 *
 * The line is split on dash with g_strsplit(). For each split point, pc is
 * the last character of the preceding token and nc the first character of
 * the following token; g_unichar_isspace() on the pair distinguishes space
 * on either side, space on both sides, and no space on either side.
 *
 * NOTE(review): when the preceding token is empty (the line starts with
 * dash, or two occurrences are adjacent) tokens[i-1]+strlen(tokens[i-1])
 * is the start of that token, and g_utf8_prev_char() then steps before
 * the start of the buffer. Confirm whether a guard exists on a line that
 * is not visible here.
 */
743 void count_dashes(const char *line,const char *dash,
744 struct dash_results *results)
749 gboolean spaced=FALSE,unspaced=FALSE,spaced2=FALSE;
752 tokens=g_strsplit(line,dash,0);
755 for(i=1;tokens[i];i++)
757 pc=g_utf8_get_char(g_utf8_prev_char(tokens[i-1]+strlen(tokens[i-1])));
758 nc=g_utf8_get_char(tokens[i]);
759 if (g_unichar_isspace(pc) || g_unichar_isspace(nc))
761 if (g_unichar_isspace(pc) && g_unichar_isspace(nc))
763 else if (!g_unichar_isspace(pc) && !g_unichar_isspace(nc))
769 /* count of lines with em-dashes with spaces both sides */
770 results->non_PG_space++;
772 /* count of lines with PG-type em-dashes with no spaces */
780 * Run a first pass - verify that it's a valid PG
781 * file, decide whether to report some things that
782 * occur many times in the text like long or short
783 * lines, non-standard dashes, etc.
/*
 * Scan the whole text once and collect statistics in a first_pass_results
 * structure.
 *
 * etext: the complete text; it is split on newline and trailing carriage
 *        returns are stripped from each line. lbytes is the byte length of
 *        a line and llen its length in UTF-8 characters.
 *
 * Header and footer detection: an old-form header line contains "*END",
 * "SMALL PRINT" and either "PUBLIC DOMAIN" or "COPYRIGHT"; a new-form
 * header line starts with "*** START" and contains "PROJECT GUTENBERG".
 * Either records linecnt+1 as the first line of body text. Once a header
 * has been seen, a lower-cased line with "end" occurring before
 * "project gutenberg" is recorded as the footer line. Lines after the
 * footer are not counted.
 *
 * Per-line tallies visible here include total length, unpunctuated
 * closing quotes, forward slashes, very long lines, HTML-like markup,
 * em-dash spacing (via count_dashes() for "--" and the em-dash character)
 * and Dutch / French / stand-alone digit words.
 *
 * The results structure is static, so the data persists between calls and
 * the function is not reentrant. NOTE(review): the return statement is not
 * visible; presumably the address of that static structure is returned.
 *
 * NOTE(review): g_utf8_prev_char() is applied to s and to lines[j]+lbytes;
 * if s is the first character of the line, or the line is empty, this
 * steps before the start of the buffer unless a guard exists on a line
 * that is not visible here.
 * NOTE(review): laststart receives lines[j][0], a single char, although it
 * is declared gunichar; for a line starting with a multi-byte character
 * this is the lead byte, not the code point.
 * NOTE(review): the markup width i is computed from the first greater-than
 * sign minus the first less-than sign, and is negative when the former
 * comes first.
 */
785 struct first_pass_results *first_pass(const char *etext)
787 gunichar laststart=CHAR_SPACE;
792 unsigned int lastlen=0,lastblen=0;
793 long spline=0,nspline=0;
794 static struct first_pass_results results={0};
795 struct dash_results tmp_dash_results;
798 lines=g_strsplit(etext,"\n",0);
799 for (j=0;lines[j];j++)
801 lbytes=strlen(lines[j]);
802 while (lbytes>0 && lines[j][lbytes-1]=='\r')
803 lines[j][--lbytes]='\0';
804 llen=g_utf8_strlen(lines[j],lbytes);
806 if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
807 (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
810 g_print(" --> Duplicate header?\n");
811 spline=linecnt+1; /* first line of non-header text, that is */
813 if (!strncmp(lines[j],"*** START",9) &&
814 strstr(lines[j],"PROJECT GUTENBERG"))
817 g_print(" --> Duplicate header?\n");
818 nspline=linecnt+1; /* first line of non-header text, that is */
820 if (spline || nspline)
822 lc_line=g_utf8_strdown(lines[j],lbytes);
823 if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
825 if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
827 if (results.footerline)
829 /* it's an old-form header - we can detect duplicates */
831 g_print(" --> Duplicate footer?\n");
834 results.footerline=linecnt;
840 results.firstline=spline;
842 results.firstline=nspline; /* override with new */
843 if (results.footerline)
844 continue; /* don't count the boilerplate in the footer */
845 results.totlen+=llen;
846 for (s=lines[j];*s;s=g_utf8_next_char(s))
848 if (g_utf8_get_char(s)>127)
850 if (g_unichar_isalpha(g_utf8_get_char(s)))
854 if (CHAR_IS_DQUOTE(g_utf8_get_char(s)))
855 qc=QUOTE_CLASS(g_utf8_get_char(s));
858 if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) &&
859 g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
860 results.endquote_count++;
863 if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
864 lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
867 g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
869 if (strstr(lines[j],".,"))
871 /* only count asterisk lines for ignoring purposes where there is */
872 /* lower-case text on the line */
873 if (strchr(lines[j],'*'))
875 for (s=lines[j];*s;s=g_utf8_next_char(s))
876 if (g_unichar_islower(g_utf8_get_char(s)))
881 if (strchr(lines[j],'/'))
882 results.fslashline++;
885 for (s=g_utf8_prev_char(lines[j]+lbytes);
886 s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;
887 s=g_utf8_prev_char(s))
889 if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
890 g_utf8_get_char(g_utf8_prev_char(s))!='-')
893 if (llen>LONGEST_PG_LINE)
895 if (llen>WAY_TOO_LONG)
896 results.verylongline++;
897 if (strchr(lines[j],'<') && strchr(lines[j],'>'))
899 i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
902 if (strstr(lines[j],"<i>"))
903 results.htmcount+=4; /* bonus marks! */
905 /* Check for spaced em-dashes */
906 memset(&tmp_dash_results,0,sizeof(tmp_dash_results));
907 count_dashes(lines[j],"--",&tmp_dash_results);
908 count_dashes(lines[j],"—",&tmp_dash_results);
909 if (tmp_dash_results.base)
910 results.emdash.base++;
911 if (tmp_dash_results.non_PG_space)
912 results.emdash.non_PG_space++;
913 if (tmp_dash_results.PG_space)
914 results.emdash.PG_space++;
918 if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))
919 results.Dutchcount++;
920 if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))
921 results.Frenchcount++;
922 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
923 results.standalone_digit++;
926 /* Check for spaced dashes */
927 if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
931 laststart=lines[j][0];
940 * Make some snap decisions based on the first pass results.
/*
 * Turn the first-pass statistics into a set of warning switches and print
 * a notice for every category that will be suppressed.
 *
 * results: statistics gathered by the first pass. This function also
 *          writes to it: results->firstline is reset to 0 when header and
 *          footer are both present, in order, and fewer than 100 lines
 *          apart.
 *
 * Suppression thresholds visible in the code: more than 5 lines with ".,";
 * more than 50 short lines or more than one tenth of linecnt; the same for
 * long lines; more than 10 lines with asterisks; more than 10 lines with
 * forward slashes; more than 20 unpunctuated endquotes; more than 10
 * stand-alone digits; more than 20 hyphens at line end. Spaced dashes are
 * suppressed when spaced dashes plus non-PG-spaced em-dashes outnumber
 * PG-spaced em-dashes. More than 20 HTML points switches
 * pswit[MARKUP_SWITCH] on. More than 50 Dutch or French marker words set
 * isDutch or isFrench. Verbose mode re-enables the warnings.
 *
 * NOTE(review): the comment on the stand-alone digit check says 15 lines
 * while the code tests for more than 10; one of the two should be
 * corrected.
 *
 * The warnings structure is static, so the function is not reentrant.
 * NOTE(review): the return statement is not visible; presumably the
 * address of that static structure is returned.
 */
942 struct warnings *report_first_pass(struct first_pass_results *results)
944 static struct warnings warnings={0};
946 g_print(" --> %ld lines in this file have white space at end\n",
949 if (results->dotcomma>5)
952 g_print(" --> %ld lines in this file contain '.,'. "
953 "Not reporting them.\n",results->dotcomma);
956 * If more than 50 lines, or one-tenth, are short,
957 * don't bother reporting them.
959 warnings.shortline=1;
960 if (results->shortline>50 || results->shortline*10>linecnt)
962 warnings.shortline=0;
963 g_print(" --> %ld lines in this file are short. "
964 "Not reporting short lines.\n",results->shortline);
967 * If more than 50 lines, or one-tenth, are long,
968 * don't bother reporting them.
971 if (results->longline>50 || results->longline*10>linecnt)
974 g_print(" --> %ld lines in this file are long. "
975 "Not reporting long lines.\n",results->longline);
977 /* If more than 10 lines contain asterisks, don't bother reporting them. */
979 if (results->astline>10)
982 g_print(" --> %ld lines in this file contain asterisks. "
983 "Not reporting them.\n",results->astline);
986 * If more than 10 lines contain forward slashes,
987 * don't bother reporting them.
990 if (results->fslashline>10)
993 g_print(" --> %ld lines in this file contain forward slashes. "
994 "Not reporting them.\n",results->fslashline);
997 * If more than 20 lines contain unpunctuated endquotes,
998 * don't bother reporting them.
1000 warnings.endquote=1;
1001 if (results->endquote_count>20)
1003 warnings.endquote=0;
1004 g_print(" --> %ld lines in this file contain unpunctuated endquotes. "
1005 "Not reporting them.\n",results->endquote_count);
1008 * If more than 15 lines contain standalone digits,
1009 * don't bother reporting them.
1012 if (results->standalone_digit>10)
1015 g_print(" --> %ld lines in this file contain standalone 0s and 1s. "
1016 "Not reporting them.\n",results->standalone_digit);
1019 * If more than 20 lines contain hyphens at end,
1020 * don't bother reporting them.
1023 if (results->hyphens>20)
1026 g_print(" --> %ld lines in this file have hyphens at end. "
1027 "Not reporting them.\n",results->hyphens);
1029 if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
1031 g_print(" --> Looks like this is HTML. Switching HTML mode ON.\n");
1032 pswit[MARKUP_SWITCH]=1;
1034 if (results->verylongline>0)
1035 g_print(" --> %ld lines in this file are VERY long!\n",
1036 results->verylongline);
1038 * If there are more non-PG spaced dashes than PG em-dashes,
1039 * assume it's deliberate.
1040 * Current PG guidelines say don't use them, but older texts do,
1041 * and some people insist on them whatever the guidelines say.
1044 if (results->spacedash+results->emdash.non_PG_space>
1045 results->emdash.PG_space)
1048 g_print(" --> There are %ld spaced dashes and em-dashes. "
1049 "Not reporting them.\n",
1050 results->spacedash+results->emdash.non_PG_space);
1056 /* Charset ISO_8859-1/ASCII checks for compatibility with gutcheck */
1058 /* If more than a quarter of characters are hi-bit, bug out. */
1059 if (results->binlen*4>results->totlen)
1061 g_print(" --> This file does not appear to be ASCII. "
1062 "Terminating. Best of luck with it!\n");
1065 if (results->alphalen*4<results->totlen)
1067 g_print(" --> This file does not appear to be text. "
1068 "Terminating. Best of luck with it!\n");
1071 if (results->binlen*100>results->totlen || results->binlen>100)
1073 g_print(" --> There are a lot of foreign letters here. "
1074 "Not reporting them.\n");
1075 if (!pswit[VERBOSE_SWITCH])
1079 warnings.isDutch=FALSE;
1080 if (results->Dutchcount>50)
1082 warnings.isDutch=TRUE;
1083 g_print(" --> This looks like Dutch - "
1084 "switching off dashes and warnings for 's Middags case.\n");
1086 warnings.isFrench=FALSE;
1087 if (results->Frenchcount>50)
1089 warnings.isFrench=TRUE;
1090 g_print(" --> This looks like French - "
1091 "switching off some doublepunct.\n");
1093 if (results->firstline && results->footerline)
1094 g_print(" The PG header and footer appear to be already on.\n");
1097 if (results->firstline)
1098 g_print(" The PG header is on - no footer.\n");
1099 if (results->footerline)
1100 g_print(" The PG footer is on - no header.\n");
1103 if (pswit[VERBOSE_SWITCH])
1105 warnings.shortline=1;
1106 warnings.dotcomma=1;
1107 warnings.longline=1;
1113 warnings.endquote=1;
1114 g_print(" *** Verbose output is ON -- you asked for it! ***\n");
1116 if (warnings.isDutch)
1118 if (results->footerline>0 && results->firstline>0 &&
1119 results->footerline>results->firstline &&
1120 results->footerline-results->firstline<100)
1122 g_print(" --> I don't really know where this text starts. \n");
1123 g_print(" There are no reference points.\n");
1124 g_print(" I'm going to have to report the header and footer "
1126 results->firstline=0;
1134 * Look along the line, accumulate the count of quotes, and see
1135 * if this is an empty line - i.e. a line with nothing on it
1137 * If line has just spaces, period, * and/or - on it, don't
1138 * count it, since empty lines with asterisks or dashes to
1139 * separate sections are common.
1141 * Returns: TRUE if the line is empty.
/*
 * Walk one line character by character, updating quote, underscore and
 * bracket tallies in counters.
 *
 * aline:    the line to analyse (UTF-8).
 * counters: running tallies, updated through count_quote(),
 *           increment_matching() and the c_unders field.
 *
 * Double quotes are counted with their QUOTE_CLASS. Single quotes are
 * examined only when pswit[SQUOTE_SWITCH] is set: a quote between two
 * letters is treated as an apostrophe and ignored; a quote followed by
 * "tis" or "Tis" is exempted; otherwise the neighbouring characters decide
 * whether it is counted as opening, closing or neutral.
 *
 * An error set by count_quote() is reported with the line number and the
 * one-based column (suppressed in overview mode) and then cleared.
 *
 * An opening square bracket at the very start of a line that begins
 * "[Illustration:" is tallied under COUNTER_ILLUSTRATION, as is a closing
 * square bracket at the end of the line, subject to the matching tests
 * below; other brackets are tallied under their own character.
 *
 * NOTE(review): strchr() is called with a gunichar. strchr() converts its
 * argument to char, so code points above 255 are truncated and may match
 * one of the punctuation marks by accident.
 * NOTE(review): the loop header and the assignment of sprev are not
 * visible in the reviewed text.
 */
1143 gboolean analyse_quotes(const char *aline,struct counters *counters)
1146 /* assume the line is empty until proven otherwise */
1147 gboolean isemptyline=TRUE;
1148 const char *s=aline,*sprev,*snext;
1151 GError *tmp_err=NULL;
1154 snext=g_utf8_next_char(s);
1155 c=g_utf8_get_char(s);
1156 if (CHAR_IS_DQUOTE(c))
1157 (void)count_quote(counters,c,QUOTE_CLASS(c),&tmp_err);
1158 else if (CHAR_IS_SQUOTE(c) && pswit[SQUOTE_SWITCH])
1163 * At start of line, it can only be a quotation mark.
1164 * Hardcode a very common exception!
1166 if (!g_str_has_prefix(snext,"tis") &&
1167 !g_str_has_prefix(snext,"Tis"))
1168 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
1170 else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
1171 g_unichar_isalpha(g_utf8_get_char(snext)))
1172 /* Do nothing! it's definitely an apostrophe, not a quote */
1174 /* it's outside a word - let's check it out */
1175 else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||
1176 g_unichar_isalpha(g_utf8_get_char(snext)))
1178 /* certainly looks like a quotation mark */
1179 if (!g_str_has_prefix(snext,"tis") &&
1180 !g_str_has_prefix(snext,"Tis"))
1181 /* hardcode a very common exception! */
1183 if (strchr(".?!,;:",g_utf8_get_char(sprev)))
1184 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
1186 (void)count_quote(counters,c,OPENING_QUOTE,&tmp_err);
1191 /* now - is it a quotation mark? */
1192 guessquote=0; /* accumulate clues */
1193 if (g_unichar_isalpha(g_utf8_get_char(sprev)))
1195 /* it follows a letter - could be either */
1197 if (g_utf8_get_char(sprev)=='s')
1199 /* looks like a plural apostrophe */
1201 if (g_utf8_get_char(snext)==CHAR_SPACE)
1205 if (innermost_quote_matches(counters,c))
1207 * Give it the benefit of some doubt,
1208 * if a squote is already open.
1214 (void)count_quote(counters,c,CLOSING_QUOTE,&tmp_err);
1217 /* no adjacent letter - it must be a quote of some kind */
1218 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
1223 if (pswit[ECHO_SWITCH])
1224 g_print("\n%s\n",aline);
1225 if (!pswit[OVERVIEW_SWITCH])
1226 g_print(" Line %ld column %ld - %s\n",
1227 linecnt,g_utf8_pointer_to_offset(aline,s)+1,tmp_err->message);
1228 g_clear_error(&tmp_err);
1230 if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
1232 isemptyline=FALSE; /* ignore lines like * * * as spacers */
1233 if (c==CHAR_UNDERSCORE)
1234 counters->c_unders++;
1235 if (c==CHAR_OPEN_SBRACK)
1237 if (!matching_difference(counters,COUNTER_ILLUSTRATION) &&
1238 !matching_difference(counters,c) && s==aline &&
1239 g_str_has_prefix(s,"[Illustration:"))
1240 increment_matching(counters,COUNTER_ILLUSTRATION,TRUE);
1242 increment_matching(counters,c,TRUE);
1244 else if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK)
1245 increment_matching(counters,c,TRUE);
1246 if (c==CHAR_CLOSE_SBRACK)
1248 if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) &&
1249 !matching_difference(counters,c) && !*snext)
1250 increment_matching(counters,COUNTER_ILLUSTRATION,FALSE);
1252 increment_matching(counters,c,FALSE);
1254 else if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK)
1255 increment_matching(counters,c,FALSE);
1263 * check_for_control_characters:
1265 * Check for invalid or questionable characters in the line
1266 * Anything above 127 is invalid for plain ASCII, and
1267 * non-printable control characters should also be flagged.
1268 * Tabs should generally not be there.
/*
 * check_for_control_characters (reviewer notes):
 *
 * Scans <aline> for characters below CHAR_SPACE other than LF, CR and
 * tab.  For such a character the line is echoed when ECHO_SWITCH is set
 * and, unless OVERVIEW_SWITCH is set, the line number, 1-based column and
 * character code are printed.
 */
1270 void check_for_control_characters(const char *aline)
1274 for (s=aline;*s;s=g_utf8_next_char(s))
1276 c=g_utf8_get_char(s);
1277 if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
1279 if (pswit[ECHO_SWITCH])
1280 g_print("\n%s\n",aline);
1281 if (!pswit[OVERVIEW_SWITCH])
/*
 * g_utf8_pointer_to_offset() takes (str,pos).  Passing (s,aline)
 * measured backwards from s and gave a zero or negative column; the
 * column is the offset of s within aline, plus one.
 */
1282 g_print(" Line %ld column %ld - Control character %u\n",
1283 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1291 * check_for_odd_characters:
1293 * Check for binary and other odd characters.
/*
 * check_for_odd_characters (reviewer notes):
 *
 * Walks <aline> character by character and queries characters that are
 * unusual for a plain text: non-ASCII / non-ISO-8859 characters (when
 * warnings->bin is set), characters that are unassigned, private-use or
 * not convertible to the configured <charset>, tabs, tildes, carats,
 * forward slashes (when warnings->fslash is set) and, in paranoid mode
 * with warnings->ast set on a non-empty line, asterisks.
 *
 * The e* flags exist so that each category is not reported repeatedly on
 * one line.  Every query echoes the line when ECHO_SWITCH is set and,
 * unless OVERVIEW_SWITCH is set, prints the line number and 1-based
 * column.
 */
1295 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
1296 gboolean isemptyline)
1298 /* Don't repeat multiple warnings on one line. */
1299 gboolean eInvalidChar=FALSE,eTab=FALSE,eTilde=FALSE;
1300 gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
1305 for (s=aline;*s;s=g_utf8_next_char(s))
1307 c=g_utf8_get_char(s);
/* Control characters other than tab/newline, or anything above ASCII. */
1308 if (warnings->bin && !eInvalidChar &&
1309 (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
1311 if (pswit[ECHO_SWITCH])
1312 g_print("\n%s\n",aline);
1313 if (!pswit[OVERVIEW_SWITCH])
/* 128..159 and everything above 255 has no ISO-8859 graphic character. */
1314 if (c>127 && c<160 || c>255)
1315 g_print(" Line %ld column %ld - "
1316 "Non-ISO-8859 character %u\n",
1317 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1319 g_print(" Line %ld column %ld - "
1320 "Non-ASCII character %u\n",
1321 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1326 if (!eInvalidChar && charset)
/* No iconv validator is open: fall back to generic Unicode checks. */
1328 if (charset_validator==(GIConv)-1)
1330 if (!g_unichar_isdefined(c))
1332 if (pswit[ECHO_SWITCH])
1333 g_print("\n%s\n",aline);
1334 if (!pswit[OVERVIEW_SWITCH])
1335 g_print(" Line %ld column %ld - Unassigned UNICODE "
1336 "code point U+%04" G_GINT32_MODIFIER "X\n",
1337 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
/*
 * Unicode private-use ranges: U+E000..U+F8FF, U+F0000..U+FFFFD and
 * U+100000..U+10FFFD.  The last lower bound has to be hexadecimal;
 * decimal 100000 is U+186A0 and made every defined character from
 * there upwards report as Private Use.
 */
1342 else if (c>=0xE000 && c<=0xF8FF || c>=0xF0000 && c<=0xFFFFD ||
1343 c>=0x100000 && c<=0x10FFFD)
1345 if (pswit[ECHO_SWITCH])
1346 g_print("\n%s\n",aline);
1347 if (!pswit[OVERVIEW_SWITCH])
1348 g_print(" Line %ld column %ld - Private Use "
1349 "character U+%04" G_GINT32_MODIFIER "X\n",
1350 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
/* Otherwise try converting this one character with the validator. */
1358 t=g_convert_with_iconv(s,g_utf8_next_char(s)-s,
1359 charset_validator,NULL,&nb,NULL);
1364 if (pswit[ECHO_SWITCH])
1365 g_print("\n%s\n",aline);
1366 if (!pswit[OVERVIEW_SWITCH])
1367 g_print(" Line %ld column %ld - Non-%s "
1368 "character %u\n",linecnt,
1369 g_utf8_pointer_to_offset(aline,s)+1,charset,c);
1376 if (!eTab && c==CHAR_TAB)
1378 if (pswit[ECHO_SWITCH])
1379 g_print("\n%s\n",aline);
1380 if (!pswit[OVERVIEW_SWITCH])
1381 g_print(" Line %ld column %ld - Tab character?\n",
1382 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1387 if (!eTilde && c==CHAR_TILDE)
1390 * Often used by OCR software to indicate an
1391 * unrecognizable character.
1393 if (pswit[ECHO_SWITCH])
1394 g_print("\n%s\n",aline);
1395 if (!pswit[OVERVIEW_SWITCH])
1396 g_print(" Line %ld column %ld - Tilde character?\n",
1397 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1402 if (!eCarat && c==CHAR_CARAT)
1404 if (pswit[ECHO_SWITCH])
1405 g_print("\n%s\n",aline);
1406 if (!pswit[OVERVIEW_SWITCH])
1407 g_print(" Line %ld column %ld - Carat character?\n",
1408 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1413 if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
1415 if (pswit[ECHO_SWITCH])
1416 g_print("\n%s\n",aline);
1417 if (!pswit[OVERVIEW_SWITCH])
1418 g_print(" Line %ld column %ld - Forward slash?\n",
1419 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1425 * Report asterisks only in paranoid mode,
1426 * since they're often deliberate.
1428 if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
1431 if (pswit[ECHO_SWITCH])
1432 g_print("\n%s\n",aline);
1433 if (!pswit[OVERVIEW_SWITCH])
1434 g_print(" Line %ld column %ld - Asterisk?\n",
1435 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1444 * check_for_long_line:
1446 * Check for line too long.
/*
 * Queries <aline> when its length in UTF-8 characters exceeds
 * LONGEST_PG_LINE.  The same length is printed twice: as the column
 * (i.e. the last column of the line) and as the reported line length.
 */
1448 void check_for_long_line(const char *aline)
1450 if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
1452 if (pswit[ECHO_SWITCH])
1453 g_print("\n%s\n",aline);
1454 if (!pswit[OVERVIEW_SWITCH])
1455 g_print(" Line %ld column %ld - Long line %ld\n",
1456 linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
1463 * check_for_short_line:
1465 * Check for line too short.
1467 * This one is a bit trickier to implement: we don't want to
1468 * flag the last line of a paragraph for being short, so we
1469 * have to wait until we know that our current line is a
1470 * "normal" line, then report the _previous_ line if it was too
1471 * short. We also don't want to report indented lines like
1472 * chapter heads or formatted quotations. We therefore keep
1473 * last->len as the length of the last line examined, and
1474 * last->blen as the length of the last but one, and try to
1475 * suppress unnecessary warnings by checking that both were of
1476 * "normal" length. We keep the first character of the last
1477 * line in last->start, and if it was a space, we assume that
1478 * the formatting is deliberate. I can't figure out a way to
1479 * distinguish something like a quoted verse left-aligned or
1480 * the header or footer of a letter from a paragraph of short
1481 * lines - maybe if I examined the whole paragraph, and if the
1482 * para has less than, say, 8 lines and if all lines are short,
1483 * then just assume it's OK? Need to look at some texts to see
1484 * how often a formula like this would get the right result.
/*
 * Queries the PREVIOUS line (prevline, reported as linecnt-1) as short.
 * All of the following must hold: the current line has more than one
 * character; last->len is between 2 and SHORTEST_PG_LINE-1; last->blen is
 * above SHORTEST_PG_LINE; and last->start is not a space.
 *
 * NOTE(review): "last->blen>1" is implied by "last->blen>SHORTEST_PG_LINE"
 * whenever SHORTEST_PG_LINE is at least 1 - confirm and simplify.
 */
1486 void check_for_short_line(const char *aline,const struct line_properties *last)
1488 if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
1489 last->len<SHORTEST_PG_LINE && last->blen>1 &&
1490 last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
1492 if (pswit[ECHO_SWITCH])
1493 g_print("\n%s\n",prevline);
1494 if (!pswit[OVERVIEW_SWITCH])
1495 g_print(" Line %ld column %ld - Short line %ld?\n",
1496 linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
1503 * check_for_starting_punctuation:
1505 * Look for punctuation other than full ellipses at start of line.
/*
 * Queries a non-empty <aline> whose first character is one of . ? ! , ; :
 * unless the line starts with the spaced-ellipsis prefix tested below.
 * The column reported is always 1.
 */
1507 void check_for_starting_punctuation(const char *aline)
1509 if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
1510 !g_str_has_prefix(aline,". . ."))
1512 if (pswit[ECHO_SWITCH])
1513 g_print("\n%s\n",aline);
1514 if (!pswit[OVERVIEW_SWITCH])
1515 g_print(" Line %ld column 1 - Begins with punctuation?\n",
1525 * Find the first em-dash, return a pointer to it and set <next> to the
1526 * character following the dash.
/*
 * NOTE(review): only the assignments to *next are visible in this listing;
 * the searches that produce s1 and s2 and the return statements are not.
 * From what is shown, *next is set either one character past s2 or two
 * characters past s1 - presumably s2 is a single-character em-dash and s1
 * a two-character "--" form; confirm against the full source.
 */
1528 char *str_emdash(const char *s,const char **next)
1536 *next=g_utf8_next_char(s2);
1541 *next=g_utf8_next_char(g_utf8_next_char(s1));
1546 *next=g_utf8_next_char(g_utf8_next_char(s1));
1551 *next=g_utf8_next_char(s2);
1557 * check_for_spaced_emdash:
1559 * Check for spaced em-dashes.
1561 * We must check _all_ occurrences of em-dashes on the line
1562 * hence the loop - even if the first dash is OK
1563 * there may be another that's wrong later on.
/*
 * Visits every em-dash that str_emdash() finds in <aline>, restarting the
 * search at <next> each time, and queries a dash that has a space right
 * before it (when it is not at the start of the line) or right after it.
 */
1565 void check_for_spaced_emdash(const char *aline)
1567 const char *s,*t,*next;
1568 for (s=aline;t=str_emdash(s,&next);s=next)
/* t>aline guards the look-behind; && binds tighter than ||. */
1570 if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
1571 g_utf8_get_char(next)==CHAR_SPACE)
1573 if (pswit[ECHO_SWITCH])
1574 g_print("\n%s\n",aline);
1575 if (!pswit[OVERVIEW_SWITCH])
1576 g_print(" Line %ld column %ld - Spaced em-dash?\n",
1577 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1585 * check_for_spaced_dash:
1587 * Check for spaced dashes.
/*
 * Queries a single dash with a space on one side.  Only the FIRST " -" in
 * the line is examined (and, when there is none, only the first "- "),
 * because strstr() is called once for each pattern.
 *
 * NOTE(review): the "else if" below appears to pair with the outer
 * " -" test, so a line whose first " -" is really " --" never reaches the
 * "- " test - confirm against the brace structure of the full source.
 */
1589 void check_for_spaced_dash(const char *aline)
1592 if ((s=strstr(aline," -")))
/* " --" is the start of an em-dash, which is checked elsewhere. */
1594 if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
1596 if (pswit[ECHO_SWITCH])
1597 g_print("\n%s\n",aline);
1598 if (!pswit[OVERVIEW_SWITCH])
1599 g_print(" Line %ld column %ld - Spaced dash?\n",
1600 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1605 else if ((s=strstr(aline,"- ")))
/* Likewise skip "-- "; s==aline avoids looking before the buffer. */
1607 if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
1609 if (pswit[ECHO_SWITCH])
1610 g_print("\n%s\n",aline);
1611 if (!pswit[OVERVIEW_SWITCH])
1612 g_print(" Line %ld column %ld - Spaced dash?\n",
1613 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1621 * check_for_unmarked_paragraphs:
1623 * Check for unmarked paragraphs indicated by separate speakers.
1625 * May well be false positive:
1626 * "Bravo!" "Wonderful!" called the crowd.
1627 * but useful all the same.
/*
 * Searches <aline> for a closing double quote followed by an opening one
 * and queries a possible missing paragraph break at the match.
 *
 * NOTE(review): the two strstr() patterns are identical as shown here;
 * presumably they differ in the number of spaces between the quotes and
 * this listing has collapsed the whitespace - verify against the source.
 */
1629 void check_for_unmarked_paragraphs(const char *aline)
1632 s=strstr(aline,"\" \"");
1634 s=strstr(aline,"\" \"");
1637 if (pswit[ECHO_SWITCH])
1638 g_print("\n%s\n",aline);
1639 if (!pswit[OVERVIEW_SWITCH])
1640 g_print(" Line %ld column %ld - "
1641 "Query missing paragraph break?\n",
1642 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1649 * check_for_jeebies:
1651 * Check for "to he" and other easy h/b errors.
1653 * This is a very inadequate effort on the h/b problem,
1654 * but the phrase "to he" is always an error, whereas "to
1655 * be" is quite common.
1656 * Similarly, '"Quiet!", be said.' is a non-be error
1657 * "to he" is _not_ always an error!:
1658 * "Where they went to he couldn't say."
1659 * Another false positive:
1660 * What would "Cinderella" be without the . . .
1661 * and another: "If he wants to he can see for himself."
/*
 * Looks for three families of fixed phrases that suggest an OCR confusion
 * between h and b, and prints one query per family:
 *   he/be   - " be could ", " was be ", " to he ", ...
 *   had/bad - " the had ", " they bad ", ...
 *   hut/but - "; hut ", ", hut "
 * The searches are case-sensitive byte searches (strstr).  The conditions
 * that chain them together are not visible in this listing.
 *
 * NOTE(review): the two consecutive he/be patterns after the quote-comma
 * one are identical as shown; presumably one has two spaces after the
 * quote and the listing collapsed the whitespace - verify.
 */
1663 void check_for_jeebies(const char *aline)
1666 s=strstr(aline," be could ");
1668 s=strstr(aline," be would ");
1670 s=strstr(aline," was be ");
1672 s=strstr(aline," be is ");
1674 s=strstr(aline," is be ");
1676 s=strstr(aline,"\", be ");
1678 s=strstr(aline,"\" be ");
1680 s=strstr(aline,"\" be ");
1682 s=strstr(aline," to he ");
1685 if (pswit[ECHO_SWITCH])
1686 g_print("\n%s\n",aline);
1687 if (!pswit[OVERVIEW_SWITCH])
1688 g_print(" Line %ld column %ld - Query he/be error?\n",
1689 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* had/bad family */
1693 s=strstr(aline," the had ");
1695 s=strstr(aline," a had ");
1697 s=strstr(aline," they bad ");
1699 s=strstr(aline," she bad ");
1701 s=strstr(aline," he bad ");
1703 s=strstr(aline," you bad ");
1705 s=strstr(aline," i bad ");
1708 if (pswit[ECHO_SWITCH])
1709 g_print("\n%s\n",aline);
1710 if (!pswit[OVERVIEW_SWITCH])
1711 g_print(" Line %ld column %ld - Query had/bad error?\n",
1712 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* hut/but family */
1716 s=strstr(aline,"; hut ");
1718 s=strstr(aline,", hut ");
1721 if (pswit[ECHO_SWITCH])
1722 g_print("\n%s\n",aline);
1723 if (!pswit[OVERVIEW_SWITCH])
1724 g_print(" Line %ld column %ld - Query hut/but error?\n",
1725 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1732 * check_for_mta_from:
1734 * Special case - angled bracket in front of "From" placed there by an
1735 * MTA when sending an e-mail.
/*
 * Queries the first occurrence of the byte sequence ">From" anywhere in
 * <aline>, reporting the 1-based column of the '>'.
 */
1737 void check_for_mta_from(const char *aline)
1740 s=strstr(aline,">From");
1743 if (pswit[ECHO_SWITCH])
1744 g_print("\n%s\n",aline);
1745 if (!pswit[OVERVIEW_SWITCH])
1746 g_print(" Line %ld column %ld - "
1747 "Query angled bracket with From\n",
1748 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1755 * check_for_orphan_character:
1757 * Check for a single character line -
1758 * often an overflow from bad wrapping.
/*
 * Queries a line that consists of exactly one character, unless that
 * character is a digit or one of the Roman-numeral letters I, V, X, L.
 */
1760 void check_for_orphan_character(const char *aline)
1763 c=g_utf8_get_char(aline);
/* Non-empty, and nothing follows the first character. */
1764 if (c && !*g_utf8_next_char(aline))
1766 if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
1767 ; /* Nothing - ignore numerals alone on a line. */
1770 if (pswit[ECHO_SWITCH])
1771 g_print("\n%s\n",aline);
1772 if (!pswit[OVERVIEW_SWITCH])
1773 g_print(" Line %ld column 1 - Query single character line\n",
1782 * check_for_pling_scanno:
1784 * Check for I" - often should be !
/*
 * Queries the first occurrence of a space, a capital I and a double quote
 * in <aline>, which often stands for a mis-scanned exclamation mark.
 *
 * NOTE(review): unlike the neighbouring checks, the column printed here is
 * the raw 0-based offset of the matched SPACE (no "+1"), i.e. two columns
 * before the "I".  This may be deliberate for compatibility with existing
 * expected output - confirm before changing.
 */
1786 void check_for_pling_scanno(const char *aline)
1789 s=strstr(aline," I\"");
1792 if (pswit[ECHO_SWITCH])
1793 g_print("\n%s\n",aline);
1794 if (!pswit[OVERVIEW_SWITCH])
1795 g_print(" Line %ld column %ld - Query I=exclamation mark?\n",
1796 linecnt,g_utf8_pointer_to_offset(aline,s));
1803 * check_for_extra_period:
1805 * Check for period without a capital letter. Cut-down from gutspell.
1806 * Only works when it happens on a single line.
/*
 * Paranoid-mode check for a period that is not followed by a capital.
 *
 * For each ". " found in <aline>: skip it when the character before the
 * period is not a letter; in Dutch texts skip the apostrophe + lowercase
 * letter + space + capital pattern; otherwise find the next letter or
 * digit after the period and, if it is lower case, extract the word that
 * precedes the period into testword.  testword is then compared against
 * abbrev[], tested for a leading digit, single-character length, Roman
 * numerals (isroman) and the presence of a vowel.  A surviving word is
 * recorded in the qperiod tree and queried as "Extra period?"; words
 * already in qperiod are queried again only with VERBOSE_SWITCH.
 *
 * NOTE(review): the statements that act on each of these tests are not
 * visible in this listing, so the exact exclusion logic is presumed from
 * the visible conditions.
 */
1808 void check_for_extra_period(const char *aline,const struct warnings *warnings)
1810 const char *s,*t,*s1,*sprev;
1815 gunichar c,nc,pc,*decomposition;
1816 if (pswit[PARANOID_SWITCH])
1818 for (t=aline;t=strstr(t,". ");)
1822 t=g_utf8_next_char(t);
1823 /* start of line punctuation is handled elsewhere */
1826 if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
1828 t=g_utf8_next_char(t);
1831 if (warnings->isDutch)
1833 /* For Frank & Jeroen -- 's Middags case */
1834 gunichar c2,c3,c4,c5;
/* c2..c5 are the 2nd..5th characters after the period at t. */
1835 c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
1836 c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
1837 c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
1838 c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
1839 if (CHAR_IS_APOSTROPHE(c2) &&
1840 g_unichar_islower(c3) && c4==CHAR_SPACE &&
1841 g_unichar_isupper(c5))
1843 t=g_utf8_next_char(t);
/* Skip ". " and any further non-alphanumerics to reach the next word. */
1847 s1=g_utf8_next_char(g_utf8_next_char(t));
1848 while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
1849 !g_unichar_isdigit(g_utf8_get_char(s1)))
1850 s1=g_utf8_next_char(s1);
1851 if (g_unichar_islower(g_utf8_get_char(s1)))
1853 /* we have something to investigate */
1855 /* so let's go back and find out */
/* nc/c/pc: the period itself, the character before it, and the one before that. */
1856 nc=g_utf8_get_char(t);
1857 s1=g_utf8_prev_char(t);
1858 c=g_utf8_get_char(s1);
1859 sprev=g_utf8_prev_char(s1);
1860 pc=g_utf8_get_char(sprev);
/* Word characters: letters, digits, or an apostrophe between letters. */
1862 (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||
1863 g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&
1864 g_unichar_isalpha(nc)))
1869 sprev=g_utf8_prev_char(s1);
1870 pc=g_utf8_get_char(sprev);
1872 s1=g_utf8_next_char(s1);
1875 testword=g_strndup(s1,s-s1);
1877 testword=g_strdup(s1);
1878 for (i=0;*abbrev[i];i++)
1879 if (!strcmp(testword,abbrev[i]))
1881 if (g_unichar_isdigit(g_utf8_get_char(testword)))
1883 if (!*g_utf8_next_char(testword))
1885 if (isroman(testword))
1890 for (s=testword;*s;s=g_utf8_next_char(s))
/* Decompose so that an accented vowel is tested by its base letter. */
1892 decomposition=g_unicode_canonical_decomposition(
1893 g_utf8_get_char(s),&len);
1894 if (g_utf8_strchr("aeiou",-1,decomposition[0]))
1896 g_free(decomposition);
1900 (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
1902 g_tree_insert(qperiod,g_strdup(testword),
1903 GINT_TO_POINTER(1));
1904 if (pswit[ECHO_SWITCH])
1905 g_print("\n%s\n",aline);
1906 if (!pswit[OVERVIEW_SWITCH])
1907 g_print(" Line %ld column %ld - Extra period?\n",
1908 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1914 t=g_utf8_next_char(t);
1920 * check_for_following_punctuation:
1922 * Check for words usually not followed by punctuation.
/*
 * Active only with TYPO_SWITCH.  Each word is lower-cased into inword and
 * looked up in two lists:
 *   nocomma[]  - queried when the character at s is , ; or :
 *   noperiod[] - queried when the character at s is . or !
 * The query reports the 1-based column of that punctuation character.
 * How s and the word are obtained is not visible in this listing
 * (presumably s points just past the word - verify).
 */
1924 void check_for_following_punctuation(const char *aline)
1927 const char *s,*wordstart;
1930 if (pswit[TYPO_SWITCH])
1941 inword=g_utf8_strdown(t,-1);
1943 for (i=0;*nocomma[i];i++)
1944 if (!strcmp(inword,nocomma[i]))
1946 c=g_utf8_get_char(s);
1947 if (c==',' || c==';' || c==':')
1949 if (pswit[ECHO_SWITCH])
1950 g_print("\n%s\n",aline);
1951 if (!pswit[OVERVIEW_SWITCH])
1952 g_print(" Line %ld column %ld - "
1953 "Query punctuation after %s?\n",
1954 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
1960 for (i=0;*noperiod[i];i++)
1961 if (!strcmp(inword,noperiod[i]))
1963 c=g_utf8_get_char(s);
1964 if (c=='.' || c=='!')
1966 if (pswit[ECHO_SWITCH])
1967 g_print("\n%s\n",aline);
1968 if (!pswit[OVERVIEW_SWITCH])
1969 g_print(" Line %ld column %ld - "
1970 "Query punctuation after %s?\n",
1971 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
1985 * Check for commonly mistyped words,
1986 * and digits like 0 for O in a word.
/*
 * check_for_typos (reviewer notes):
 *
 * Takes words from <aline> with getaword() and, for each non-empty word:
 *  1. queries a digit inside the word (mixdigit);
 *  2. with TYPO_SWITCH or USERTYPO_SWITCH, looks for an upper-case letter
 *     after a lower-case one, allowing Mc/Mac names and a capital that
 *     follows an apostrophe;
 *  3. with TYPO_SWITCH, tests the case-folded word for unlikely prefixes
 *     (nostart[]), suffixes (noend[]), unlikely letter groups ("cb",
 *     "gbt", "ii", ...), absence of vowels or of consonants, and
 *     membership of typo[]; okword[] and isroman() supply exceptions;
 *     single letters s, l, m, i, j, d, n are special-cased;
 *  4. counts queried words in the qword tree so that duplicates are
 *     reported only with VERBOSE_SWITCH;
 *  5. consults the user-supplied usertypo tree;
 *  6. in paranoid mode with warnings->digit, queries a standalone 0 or 1.
 *
 * NOTE(review): the column is wordstart+1 for "Query digit" and "Query
 * word" but wordstart+2 for "Query possible scanno" and "Query
 * standalone" - inconsistent; confirm which is intended.
 */
1988 void check_for_typos(const char *aline,struct warnings *warnings)
1990 const char *s,*t,*nt,*wordstart;
1992 gunichar *decomposition;
1994 int i,vowel,consonant,*dupcnt;
1995 gboolean isdup,istypo,alower;
1998 gsize decomposition_len;
2002 inword=getaword(&s);
2006 continue; /* don't bother with empty lines */
2008 if (mixdigit(inword))
2010 if (pswit[ECHO_SWITCH])
2011 g_print("\n%s\n",aline);
2012 if (!pswit[OVERVIEW_SWITCH])
2013 g_print(" Line %ld column %ld - Query digit in %s\n",
2014 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
2019 * Put the word through a series of tests for likely typos and OCR
2022 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
2026 for (t=inword;*t;t=g_utf8_next_char(t))
2028 c=g_utf8_get_char(t);
2029 nt=g_utf8_next_char(t);
2030 /* lowercase for testing */
2031 if (g_unichar_islower(c))
2033 if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
2036 * We have an uppercase mid-word. However, there are
2038 * Mac and Mc like McGill
2039 * French contractions like l'Abbe
2041 offset=g_utf8_pointer_to_offset(inword,t);
2043 pc=g_utf8_get_char(g_utf8_prev_char(t));
/*
 * NOTE(review): c is compared with lower-case 'm' although this branch
 * is entered for an upper-case c; presumably c is re-assigned in lines
 * not visible here - verify.
 */
2046 if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
2047 offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
2048 g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
2049 CHAR_IS_APOSTROPHE(pc))
2055 testword=g_utf8_casefold(inword,-1);
2057 if (pswit[TYPO_SWITCH])
2060 * Check for certain unlikely two-letter combinations at word
2063 len=g_utf8_strlen(testword,-1);
2066 for (i=0;*nostart[i];i++)
2067 if (g_str_has_prefix(testword,nostart[i]))
2069 for (i=0;*noend[i];i++)
2070 if (g_str_has_suffix(testword,noend[i]))
2073 /* ght is common, gbt never. Like that. */
2074 if (strstr(testword,"cb"))
2076 if (strstr(testword,"gbt"))
2078 if (strstr(testword,"pbt"))
2080 if (strstr(testword,"tbs"))
2082 if (strstr(testword,"mrn"))
2084 if (strstr(testword,"ahle"))
2086 if (strstr(testword,"ihle"))
2089 * "TBE" does happen - like HEARTBEAT - but uncommon.
2090 * Also "TBI" - frostbite, outbid - but uncommon.
2091 * Similarly "ii" like Hawaii, or Pompeii, and in Roman
2092 * numerals, but "ii" is a common scanno.
2094 if (strstr(testword,"tbi"))
2096 if (strstr(testword,"tbe"))
2098 if (strstr(testword,"ii"))
2101 * Check for no vowels or no consonants.
2102 * If none, flag a typo.
2104 if (!istypo && len>1)
2107 for (t=testword;*t;t=g_utf8_next_char(t))
2109 c=g_utf8_get_char(t);
/* Decompose so that an accented letter is classified by its base letter. */
2111 g_unicode_canonical_decomposition(c,&decomposition_len);
2112 if (c=='y' || g_unichar_isdigit(c))
2114 /* Yah, this is loose. */
2118 else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
2122 g_free(decomposition);
2124 if (!vowel || !consonant)
2128 * Now exclude the word from being reported if it's in
2131 for (i=0;*okword[i];i++)
2132 if (!strcmp(testword,okword[i]))
2135 * What looks like a typo may be a Roman numeral.
2138 if (istypo && isroman(testword))
2140 /* Check the manual list of typos. */
2142 for (i=0;*typo[i];i++)
2143 if (!strcmp(testword,typo[i]))
2146 * Check lowercase s, l, i and m - special cases.
2147 * "j" - often a semi-colon gone wrong.
2148 * "d" for a missing apostrophe - he d
2151 if (!istypo && len==1 &&
2152 g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
/* qword maps a queried word to a heap-allocated occurrence counter. */
2156 dupcnt=g_tree_lookup(qword,testword);
2160 isdup=!pswit[VERBOSE_SWITCH];
2164 dupcnt=g_new0(int,1);
2165 g_tree_insert(qword,g_strdup(testword),dupcnt);
2170 if (pswit[ECHO_SWITCH])
2171 g_print("\n%s\n",aline);
2172 if (!pswit[OVERVIEW_SWITCH])
2174 g_print(" Line %ld column %ld - Query word %s",
2175 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
2177 if (!pswit[VERBOSE_SWITCH])
2178 g_print(" - not reporting duplicates");
2186 /* check the user's list of typos */
2187 if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
2189 if (pswit[ECHO_SWITCH])
2190 g_print("\n%s\n",aline);
2191 if (!pswit[OVERVIEW_SWITCH])
2192 g_print(" Line %ld column %ld - Query possible scanno %s\n",
2193 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
2195 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
2197 if (pswit[PARANOID_SWITCH] && warnings->digit)
2199 /* In paranoid mode, query all 0 and 1 standing alone. */
2200 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
2202 if (pswit[ECHO_SWITCH])
2203 g_print("\n%s\n",aline);
2204 if (!pswit[OVERVIEW_SWITCH])
2205 g_print(" Line %ld column %ld - Query standalone %s\n",
2206 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
2217 * check_for_misspaced_punctuation:
2219 * Look for added or missing spaces around punctuation and quotes.
2220 * If there is a punctuation character like ! with no space on
2221 * either side, suspect a missing!space. If there are spaces on
2222 * both sides , assume a typo. If we see a double quote with no
2223 * space or punctuation on either side of it, assume unspaced
2224 * quotes "like"this.
/*
 * check_for_misspaced_punctuation (reviewer notes):
 *
 * Makes several independent passes over <aline>.  In the passes that
 * start at the second character, c is the current character, nc the next
 * one and pc the previous one (c and pc are presumably shifted at the top
 * of each iteration, in lines not visible in this listing).
 *
 *  Pass 1: punctuation . ? ! , ; : _ with a letter straight after it
 *          ("Missing space?"), or with a space before and a space or end
 *          of line after ("Spaced punctuation?").  Acronyms and
 *          ellipses are exempted via isacro / isellipsis.
 *  Pass 2: ? ! , ; : preceded by a space and not followed by one.
 *  Pass 3: a period preceded by a space and followed by a letter.
 *  Pass 4: a double quote with no space or punctuation on either side.
 *  Pass 5: double-quote parity, tracked in parities->dquote across calls;
 *          queries quotes whose following character does not fit the
 *          opening/closing role ("Wrongspaced quotes?").
 *  Then:   a double quote in column 1 followed by closing punctuation or
 *          a space.
 *  Pass 6: with SQUOTE_SWITCH, the same parity test for single quotes
 *          that are not inside a word, using parities->squote.
 */
2226 void check_for_misspaced_punctuation(const char *aline,
2227 struct parities *parities,gboolean isemptyline)
2229 gboolean isacro,isellipsis;
2231 gunichar c,nc,pc,n2c;
2233 c=g_utf8_get_char(aline);
2234 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
/* The loop stops when the look-ahead character nc reaches the terminator. */
2235 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2239 nc=g_utf8_get_char(g_utf8_next_char(s));
2240 /* For each character in the line after the first. */
2241 if (g_utf8_strchr(".?!,;:_",-1,c)) /* if it's punctuation */
2243 /* we need to suppress warnings for acronyms like M.D. */
2245 /* we need to suppress warnings for ellipsis . . . */
2248 * If there are letters on both sides of it or
2249 * if it's strict punctuation followed by an alpha.
2251 if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
2252 g_utf8_strchr("?!,;:",-1,c)))
/* A period two characters back or ahead suggests an acronym. */
2256 if (g_utf8_pointer_to_offset(aline,s)>2 &&
2257 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
2259 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
2265 if (pswit[ECHO_SWITCH])
2266 g_print("\n%s\n",aline);
2267 if (!pswit[OVERVIEW_SWITCH])
2268 g_print(" Line %ld column %ld - Missing space?\n",
2269 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2274 if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
2277 * If there are spaces on both sides,
2278 * or space before and end of line.
2282 if (g_utf8_pointer_to_offset(aline,s)>2 &&
2283 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
2285 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
2289 if (!isemptyline && !isellipsis)
2291 if (pswit[ECHO_SWITCH])
2292 g_print("\n%s\n",aline);
2293 if (!pswit[OVERVIEW_SWITCH])
2294 g_print(" Line %ld column %ld - "
2295 "Spaced punctuation?\n",linecnt,
2296 g_utf8_pointer_to_offset(aline,s)+1);
2303 /* Split out the characters that CANNOT be preceded by space. */
2304 c=g_utf8_get_char(aline);
2305 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2306 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2310 nc=g_utf8_get_char(g_utf8_next_char(s));
2311 /* for each character in the line after the first */
2312 if (g_utf8_strchr("?!,;:",-1,c))
2314 /* if it's punctuation that _cannot_ have a space before it */
2315 if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
2318 * If nc DOES == space,
2319 * it was already reported just above.
2321 if (pswit[ECHO_SWITCH])
2322 g_print("\n%s\n",aline);
2323 if (!pswit[OVERVIEW_SWITCH])
2324 g_print(" Line %ld column %ld - Spaced punctuation?\n",
2325 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2332 * Special case " .X" where X is any alpha.
2333 * This plugs a hole in the acronym code above.
2334 * Inelegant, but maintainable.
2336 c=g_utf8_get_char(aline);
2337 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2338 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2342 nc=g_utf8_get_char(g_utf8_next_char(s));
2343 /* for each character in the line after the first */
2346 /* if it's a period */
2347 if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
2350 * If the period follows a space and
2351 * is followed by a letter.
2353 if (pswit[ECHO_SWITCH])
2354 g_print("\n%s\n",aline);
2355 if (!pswit[OVERVIEW_SWITCH])
2356 g_print(" Line %ld column %ld - Spaced punctuation?\n",
2357 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2363 c=g_utf8_get_char(aline);
2364 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2365 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2369 nc=g_utf8_get_char(g_utf8_next_char(s));
2370 /* for each character in the line after the first */
2371 if (CHAR_IS_DQUOTE(c))
/*
 * Either neither neighbour is a space or punctuation (and the quote is
 * not last on the line), or a letter follows while the previous
 * character is not something an opening quote can sit after.
 */
2373 if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
2374 !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
2375 !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
2377 if (pswit[ECHO_SWITCH])
2378 g_print("\n%s\n",aline);
2379 if (!pswit[OVERVIEW_SWITCH])
2380 g_print(" Line %ld column %ld - Unspaced quotes?\n",
2381 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2387 /* Check parity of quotes. */
2388 nc=g_utf8_get_char(aline);
2389 for (s=aline;*s;s=g_utf8_next_char(s))
2392 nc=g_utf8_get_char(g_utf8_next_char(s));
2393 if (CHAR_IS_DQUOTE(c))
/* Each double quote flips the running parity kept in <parities>. */
2397 parities->dquote=!parities->dquote;
2398 parity=parities->dquote;
2400 else if (c==CHAR_LD_QUOTE)
/* Characters that may legitimately follow a closing quote. */
2407 if (!g_utf8_strchr("_-.'`‘’/,;:!?)]} ",-1,nc))
2409 if (pswit[ECHO_SWITCH])
2410 g_print("\n%s\n",aline);
2411 if (!pswit[OVERVIEW_SWITCH])
2412 g_print(" Line %ld column %ld - "
2413 "Wrongspaced quotes?\n",
2414 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* An opening quote should be followed by a word or opening punctuation. */
2422 if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
2423 !g_utf8_strchr("_-/.'`‘’([{$",-1,nc) || !nc)
2425 if (pswit[ECHO_SWITCH])
2426 g_print("\n%s\n",aline);
2427 if (!pswit[OVERVIEW_SWITCH])
2428 g_print(" Line %ld column %ld - "
2429 "Wrongspaced quotes?\n",
2430 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2437 c=g_utf8_get_char(aline);
2438 if (CHAR_IS_DQUOTE(c))
2440 if (g_utf8_strchr(",;:!?)]} ",-1,
2441 g_utf8_get_char(g_utf8_next_char(aline))))
2443 if (pswit[ECHO_SWITCH])
2444 g_print("\n%s\n",aline);
2445 if (!pswit[OVERVIEW_SWITCH])
2446 g_print(" Line %ld column 1 - Wrongspaced quotes?\n",
2452 if (pswit[SQUOTE_SWITCH])
2454 nc=g_utf8_get_char(aline);
2455 for (s=aline;*s;s=g_utf8_next_char(s))
2458 nc=g_utf8_get_char(g_utf8_next_char(s));
/* Only single quotes with a non-letter on at least one side count. */
2459 if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&
2460 !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
2461 !g_unichar_isalpha(nc)))
2463 parities->squote=!parities->squote;
2464 if (!parities->squote)
2467 if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
2469 if (pswit[ECHO_SWITCH])
2470 g_print("\n%s\n",aline);
2471 if (!pswit[OVERVIEW_SWITCH])
2472 g_print(" Line %ld column %ld - "
2473 "Wrongspaced singlequotes?\n",
2474 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2482 if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
2483 !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
2485 if (pswit[ECHO_SWITCH])
2486 g_print("\n%s\n",aline);
2487 if (!pswit[OVERVIEW_SWITCH])
2488 g_print(" Line %ld column %ld - "
2489 "Wrongspaced singlequotes?\n",
2490 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2501 * check_for_double_punctuation:
2503 * Look for double punctuation like ,. or ,,
2504 * Thanks to DW for the suggestion!
2505 * In books with references, ".," and ".;" are common
2506 * e.g. "etc., etc.," and vol. 1.; vol 3.;
2507 * OTOH, from my initial tests, there are also fairly
2508 * common errors. What to do? Make these cases paranoid?
2509 * ".," is the most common, so warnings->dotcomma is used
2510 * to suppress detailed reporting if it occurs often.
/*
 * Queries two adjacent characters from the set . ? ! , ; : in <aline>.
 * Not queried (first condition chain below): a doubled . ? or !; the
 * pair ".," while warnings->dotcomma is clear; and, for French texts,
 * an ellipsis next to , ; : ! or ?.
 *
 * NOTE(review): the ten French-ellipsis tests are repeated verbatim in a
 * second condition, apparently so that the scan can step over the
 * ellipsis (the statements doing so are not visible here).  A shared
 * helper would remove the duplication - confirm before refactoring.
 */
2512 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
2516 nc=g_utf8_get_char(aline);
2517 for (s=aline;*s;s=g_utf8_next_char(s))
2520 nc=g_utf8_get_char(g_utf8_next_char(s));
2521 /* for each punctuation character in the line */
2522 if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
2523 g_utf8_strchr(".?!,;:",-1,nc))
2525 /* followed by punctuation, it's a query, unless . . . */
2526 if (c==nc && (c=='.' || c=='?' || c=='!') ||
2527 !warnings->dotcomma && c=='.' && nc==',' ||
2528 warnings->isFrench && g_str_has_prefix(s,",...") ||
2529 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2530 warnings->isFrench && g_str_has_prefix(s,";...") ||
2531 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2532 warnings->isFrench && g_str_has_prefix(s,":...") ||
2533 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2534 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2535 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2536 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2537 warnings->isFrench && g_str_has_prefix(s,"...?"))
2539 if (warnings->isFrench && g_str_has_prefix(s,",...") ||
2540 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2541 warnings->isFrench && g_str_has_prefix(s,";...") ||
2542 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2543 warnings->isFrench && g_str_has_prefix(s,":...") ||
2544 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2545 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2546 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2547 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2548 warnings->isFrench && g_str_has_prefix(s,"...?"))
2551 nc=g_utf8_get_char(g_utf8_next_char(s));
2553 ; /* do nothing for .. !! and ?? which can be legit */
2557 if (pswit[ECHO_SWITCH])
2558 g_print("\n%s\n",aline);
2559 if (!pswit[OVERVIEW_SWITCH])
2560 g_print(" Line %ld column %ld - Double punctuation?\n",
2561 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2570 * check_for_spaced_quotes:
/*
 * Queries every quote mark in <aline> that has a space on both sides.
 * The straight double quote is searched for directly; for single quotes
 * a search pattern (space, quote, space) is built in a GString for each
 * entry of single_quotes[] and searched for repeatedly.  After a match
 * the search resumes two characters further on, so the trailing space of
 * one match can be the leading space of the next.
 */
2572 void check_for_spaced_quotes(const char *aline)
2576 const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,
2580 while ((t=strstr(s," \" ")))
2582 if (pswit[ECHO_SWITCH])
2583 g_print("\n%s\n",aline);
2584 if (!pswit[OVERVIEW_SWITCH])
2585 g_print(" Line %ld column %ld - Spaced doublequote?\n",
2586 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2589 s=g_utf8_next_char(g_utf8_next_char(t));
2591 pattern=g_string_new(NULL);
2592 for(i=0;i<G_N_ELEMENTS(single_quotes);i++)
/* Rebuild the pattern from scratch for this quote character. */
2594 g_string_assign(pattern," ");
2595 g_string_append_unichar(pattern,single_quotes[i]);
2596 g_string_append_c(pattern,' ');
2598 while ((t=strstr(s,pattern->str)))
2600 if (pswit[ECHO_SWITCH])
2601 g_print("\n%s\n",aline);
2602 if (!pswit[OVERVIEW_SWITCH])
2603 g_print(" Line %ld column %ld - Spaced singlequote?\n",
2604 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2607 s=g_utf8_next_char(g_utf8_next_char(t));
/* TRUE also frees the character data owned by the GString. */
2610 g_string_free(pattern,TRUE);
2614 * check_for_miscased_genative:
2616 * Check special case of 'S instead of 's at end of word.
/*
 * Queries an apostrophe that follows a lower-case letter (pc) and is
 * followed by a capital S (nc), as in "John'S".  The column reported is
 * offset+2, i.e. the column of the "S" rather than of the apostrophe.
 * c and pc are presumably shifted at the top of each iteration, in lines
 * not visible in this listing.
 */
2618 void check_for_miscased_genative(const char *aline)
2624 c=g_utf8_get_char(aline);
2625 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2626 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2630 nc=g_utf8_get_char(g_utf8_next_char(s));
2631 if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))
2633 if (pswit[ECHO_SWITCH])
2634 g_print("\n%s\n",aline);
2635 if (!pswit[OVERVIEW_SWITCH])
2636 g_print(" Line %ld column %ld - Capital \"S\"?\n",
2637 linecnt,g_utf8_pointer_to_offset(aline,s)+2);
2645 * check_end_of_line:
2647 * Now check special cases - start and end of line -
2648 * for single and double quotes. Start is sometimes [sic]
2649 * but better to query it anyway.
2650 * While we're here, check for dash at end of line.
/*
 * Three start/end-of-line checks on aline:
 *
 *  1. When the line is longer than one character: if the last character
 *     is a double or single quote and the one before it is a space,
 *     query "Spaced quote?" at the column of the last character
 *     (the character length of the line).
 *  2. If the first character is a single quote and the second is a
 *     space, query "Spaced quote?" at column 1.
 *  3. Only when pswit[PARANOID_SWITCH] is set and warnings->hyphen is
 *     true: step backwards from the end of the line over characters
 *     whose value is <= CHAR_SPACE, then query "Hyphen at end of line?"
 *     if the character reached is a '-' that is not itself preceded by
 *     another '-' (so a "--" em-dash at line end is not reported).
 *
 * NOTE(review): both backward scans start from
 * g_utf8_prev_char(aline+lbytes); for an empty line that steps before
 * the start of the buffer unless a guard exists on a line not shown
 * here - confirm. The hyphen query also prints
 * g_utf8_pointer_to_offset(aline,s) without the "+1" used by the other
 * pointer-based column reports - confirm whether that is intended.
 */
2652 void check_end_of_line(const char *aline,struct warnings *warnings)
2657 lbytes=strlen(aline);
2658 if (g_utf8_strlen(aline,lbytes)>1)
2660 s=g_utf8_prev_char(aline+lbytes);
2661 c1=g_utf8_get_char(s);
2662 c2=g_utf8_get_char(g_utf8_prev_char(s));
2663 if ((CHAR_IS_DQUOTE(c1) || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)
2665 if (pswit[ECHO_SWITCH])
2666 g_print("\n%s\n",aline);
2667 if (!pswit[OVERVIEW_SWITCH])
2668 g_print(" Line %ld column %ld - Spaced quote?\n",linecnt,
2669 g_utf8_strlen(aline,lbytes));
2673 c1=g_utf8_get_char(aline);
2674 c2=g_utf8_get_char(g_utf8_next_char(aline));
2675 if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)
2677 if (pswit[ECHO_SWITCH])
2678 g_print("\n%s\n",aline);
2679 if (!pswit[OVERVIEW_SWITCH])
2680 g_print(" Line %ld column 1 - Spaced quote?\n",linecnt);
2685 * Dash at end of line may well be legit - paranoid mode only
2686 * and don't report em-dash at line-end.
2688 if (pswit[PARANOID_SWITCH] && warnings->hyphen)
2690 for (s=g_utf8_prev_char(aline+lbytes);
2691 s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
2693 if (g_utf8_get_char(s)=='-' &&
2694 g_utf8_get_char(g_utf8_prev_char(s))!='-')
2696 if (pswit[ECHO_SWITCH])
2697 g_print("\n%s\n",aline);
2698 if (!pswit[OVERVIEW_SWITCH])
2699 g_print(" Line %ld column %ld - "
2700 "Hyphen at end of line?\n",
2701 linecnt,g_utf8_pointer_to_offset(aline,s));
2708 * check_for_unspaced_bracket:
2710 * Brackets are often unspaced, but shouldn't be surrounded by alpha.
2711 * If so, suspect a scanno like "a]most".
/*
 * Walk aline with a previous/current/next character window (pc, c, nc)
 * and query "Unspaced bracket?" when the current character is one of
 * "{[()]}" and both of its neighbours are alphabetic.
 *
 * Because the loop runs only while nc is non-zero and the window needs a
 * previous character, a bracket that is the first or last character of
 * the line is never reported.
 *
 * The column printed is g_utf8_pointer_to_offset(aline,s) with no "+1".
 * NOTE(review): the statements advancing pc and c are not visible in
 * this listing; confirm which character that column designates.
 */
2713 void check_for_unspaced_bracket(const char *aline)
2717 c=g_utf8_get_char(aline);
2718 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2719 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2723 nc=g_utf8_get_char(g_utf8_next_char(s));
2726 /* for each bracket character in the line except 1st & last */
2727 if (g_utf8_strchr("{[()]}",-1,c) &&
2728 g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
2730 if (pswit[ECHO_SWITCH])
2731 g_print("\n%s\n",aline);
2732 if (!pswit[OVERVIEW_SWITCH])
2733 g_print(" Line %ld column %ld - Unspaced bracket?\n",
2734 linecnt,g_utf8_pointer_to_offset(aline,s));
2742 * check_for_unpunctuated_endquote:
/*
 * Walk aline with a previous/current/next character window and query
 * "endquote missing punctuation?" when the current character is a double
 * quote whose class is CLOSING_QUOTE or NEUTRAL_QUOTE and the character
 * before it (pc) is alphabetic - i.e. speech that appears to end
 * without a full stop, comma, etc. before the closing quote.
 *
 * Characters that are not double quotes are given the class
 * INVALID_QUOTE and therefore never match. procfile() calls this only
 * when warnings->endquote is set.
 *
 * The column printed is g_utf8_pointer_to_offset(aline,s) with no "+1".
 * NOTE(review): the statements advancing pc and c are not visible in
 * this listing; confirm which character that column designates.
 */
2744 void check_for_unpunctuated_endquote(const char *aline)
2749 c=g_utf8_get_char(aline);
2750 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2751 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2755 qc=CHAR_IS_DQUOTE(c)?QUOTE_CLASS(c):INVALID_QUOTE;
2756 nc=g_utf8_get_char(g_utf8_next_char(s));
2757 /* for each character in the line except 1st */
2758 if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) && g_unichar_isalpha(pc))
2760 if (pswit[ECHO_SWITCH])
2761 g_print("\n%s\n",aline);
2762 if (!pswit[OVERVIEW_SWITCH])
2763 g_print(" Line %ld column %ld - "
2764 "endquote missing punctuation?\n",
2765 linecnt,g_utf8_pointer_to_offset(aline,s));
2773 * check_for_html_tag:
2775 * Check for <HTML TAG>.
2777 * If there is a < in the line, followed at some point
2778 * by a > then we suspect HTML.
/*
 * Look for the first '<' in aline and then for the first '>' that
 * follows it. When both are found the text from '<' to '>' inclusive is
 * copied with g_strndup() and queried as "HTML Tag?", with the 1-based
 * character column of the '<'.
 *
 * Only the first '<' of the line is considered, so at most one query is
 * produced per line by the code visible here.
 *
 * NOTE(review): the release of the g_strndup() copy (tag) is not visible
 * in this listing - confirm it is freed after printing.
 */
2780 void check_for_html_tag(const char *aline)
2782 const char *open,*close;
2784 open=strchr(aline,'<');
2787 close=strchr(g_utf8_next_char(open),'>');
2790 if (pswit[ECHO_SWITCH])
2791 g_print("\n%s\n",aline);
2792 if (!pswit[OVERVIEW_SWITCH])
2794 tag=g_strndup(open,close-open+1);
2795 g_print(" Line %ld column %ld - HTML Tag? %s \n",
2796 linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
2806 * check_for_html_entity:
2808 * Check for &symbol; HTML.
2810 * If there is a & in the line, followed at
2811 * some point by a ; then we suspect HTML.
/*
 * Look for the first '&' in aline and then for the first ';' at or
 * after it. The characters between the two are scanned for a space; the
 * scan stops early at a space so that ordinary text such as
 * "Jones & Son;" is not mistaken for an entity. Otherwise the text from
 * '&' to ';' inclusive is copied with g_strndup() and queried as
 * "HTML symbol?".
 *
 * Only the first '&' of the line is considered.
 *
 * NOTE(review): the column is computed as the byte distance
 * (amp-aline)+1, whereas check_for_html_tag() reports a character offset
 * via g_utf8_pointer_to_offset(); the two differ when multi-byte UTF-8
 * characters precede the '&' - confirm whether that is intended.
 * The test that decides whether the scan hit a space, and the release of
 * the g_strndup() copy, are not visible in this listing.
 */
2813 void check_for_html_entity(const char *aline)
2815 const char *s,*amp,*scolon;
2817 amp=strchr(aline,'&');
2820 scolon=strchr(amp,';');
2823 for (s=amp;s<scolon;s=g_utf8_next_char(s))
2824 if (g_utf8_get_char(s)==CHAR_SPACE)
2825 break; /* Don't report "Jones & Son;" */
2828 if (pswit[ECHO_SWITCH])
2829 g_print("\n%s\n",aline);
2830 if (!pswit[OVERVIEW_SWITCH])
2832 entity=g_strndup(amp,scolon-amp+1);
2833 g_print(" Line %ld column %d - HTML symbol? %s \n",
2834 linecnt,(int)(amp-aline)+1,entity);
2845 * check_for_omitted_punctuation:
2847 * Check for omitted punctuation at end of paragraph by working back
2848 * through prevline. DW.
2849 * Need to check this only for "normal" paras.
2850 * So what is a "normal" para?
2851 * Not normal if one-liner (chapter headings, etc.)
2852 * Not normal if it doesn't contain at least one lowercase letter
2853 * Not normal if starts with space
/*
 * Query "No punctuation at para end?" for prevline, the line preceding
 * the current one, when it looks like the last line of an ordinary
 * paragraph that ends without punctuation.
 *
 * prevline is only examined when all of the following hold:
 *   - it contains at least one alphabetic character;
 *   - last->blen is greater than 2;
 *   - start_para_line is less than linecnt-1 (the paragraph spans more
 *     than one line);
 *   - its first character is greater than CHAR_SPACE (not indented).
 *
 * The scan then works backwards from the end of prevline. Characters
 * whose quote class is CLOSING_QUOTE or NEUTRAL_QUOTE are stepped over
 * first. Continuing backwards, reaching an alphabetic character produces
 * the query (reported against line linecnt-1, at the column equal to
 * the character length of prevline).
 *
 * NOTE(review): the action taken when one of the punctuation characters
 * in the final g_utf8_strchr() test is reached is on a line not shown
 * here - presumably the scan stops without a query; confirm.
 */
2855 void check_for_omitted_punctuation(const char *prevline,
2856 struct line_properties *last,int start_para_line)
2858 gboolean letter_on_line=FALSE;
2861 gboolean closing_quote;
2862 for (s=prevline;*s;s=g_utf8_next_char(s))
2863 if (g_unichar_isalpha(g_utf8_get_char(s)))
2865 letter_on_line=TRUE;
2869 * This next "if" is a problem.
2870 * If we say "start_para_line <= linecnt - 1", that includes
2871 * one-line "paragraphs" like chapter heads. Lotsa false positives.
2872 * If we say "start_para_line < linecnt - 1" it doesn't, but then it
2873 * misses genuine one-line paragraphs.
2875 if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
2876 g_utf8_get_char(prevline)>CHAR_SPACE)
2878 s=prevline+strlen(prevline);
2881 s=g_utf8_prev_char(s);
2882 c=g_utf8_get_char(s);
2883 if (QUOTE_CLASS(c)==CLOSING_QUOTE || QUOTE_CLASS(c)==NEUTRAL_QUOTE)
2886 closing_quote=FALSE;
2887 } while (closing_quote && s>prevline);
2888 for (;s>prevline;s=g_utf8_prev_char(s))
2890 if (g_unichar_isalpha(g_utf8_get_char(s)))
2892 if (pswit[ECHO_SWITCH])
2893 g_print("\n%s\n",prevline);
2894 if (!pswit[OVERVIEW_SWITCH])
2895 g_print(" Line %ld column %ld - "
2896 "No punctuation at para end?\n",
2897 linecnt-1,g_utf8_strlen(prevline,-1));
2902 if (g_utf8_strchr("—-.:!([{?}])",-1,g_utf8_get_char(s)))
/*
 * Tree-traversal callback; procfile() passes it to g_tree_foreach() over
 * the qword tree. key is the queried word; a "Queried word ... was
 * duplicated N times" note is printed for it.
 *
 * NOTE(review): the condition guarding the message, the arguments passed
 * to g_print() and the return value are on lines not shown here -
 * presumably value carries the duplicate count and the function returns
 * FALSE so that traversal continues; confirm.
 */
2908 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
2910 const char *word=key;
2913 g_print("\nNote: Queried word %s was duplicated %d times\n",
/*
 * Print handler that writes string to stdout converted from UTF-8 to
 * WINDOWS-1252.
 *
 * The iconv converter is kept in a function-static variable and opened
 * on first use. One branch closes the converter and resets it to
 * (GIConv)-1; procfile() calls this function with NULL at the end of a
 * run, so presumably a NULL string selects that clean-up branch (the
 * test itself is on a line not shown here - confirm).
 *
 * For conversion, an output buffer of strlen(string)+1 bytes is
 * allocated and g_iconv() is run once over the whole input.
 * The final fputs(string,stdout) writes the text unconverted;
 * presumably it is the fallback for when the converter could not be
 * opened - confirm against the elided lines.
 *
 * NOTE(review): the return value of g_iconv() is not used on the line
 * visible here; how unconvertible characters are handled is not shown.
 */
2918 void print_as_windows_1252(const char *string)
2920 gsize inbytes,outbytes;
2922 static GIConv converter=(GIConv)-1;
2925 if (converter!=(GIConv)-1)
2926 g_iconv_close(converter);
2927 converter=(GIConv)-1;
2930 if (converter==(GIConv)-1)
2931 converter=g_iconv_open("WINDOWS-1252","UTF-8");
2932 if (converter!=(GIConv)-1)
2934 inbytes=outbytes=strlen(string);
2935 bp=buf=g_malloc(outbytes+1);
2936 g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
2942 fputs(string,stdout);
/*
 * Print handler that writes string to stdout unchanged, i.e. as the
 * UTF-8 text it already is.
 */
2945 void print_as_utf_8(const char *string)
2947 fputs(string,stdout);
/*
 * procfile:
 *
 * Run the complete set of checks over one e-text file.
 *
 * Set-up:
 *   - The file is loaded with read_etext(). If that reports an error the
 *     message is written to stdout when pswit[STDOUT_SWITCH] is set and
 *     to stderr otherwise.
 *   - A "File:" banner is printed, first_pass() gathers statistics and
 *     report_first_pass() turns them into the set of warnings that
 *     control which checks run.
 *   - The qword and qperiod trees are created; qword owns both keys
 *     and values (g_free), qperiod owns only its keys.
 *
 * Main loop, one line at a time via flgets():
 *   - With pswit[DP_SWITCH], lines starting "-----File: " are skipped.
 *   - Lines before first_pass_results->firstline, or after a positive
 *     footerline, are treated as header/footer and skipped; with
 *     pswit[HEADER_SWITCH] the Title/Author/Release Date/Edition lines
 *     among them are echoed first.
 *   - Pending quote warnings are printed and the quotes on the line are
 *     analysed; analyse_quotes() also says whether the line is empty.
 *   - On the first non-empty line of a paragraph the line is remembered
 *     in parastart, the quote parities are reset, and a paragraph whose
 *     first letter is lower-case is queried.
 *   - A line starting with '-' directly after a line that ended with a
 *     dash is queried as "Broken em-dash?".
 *   - The individual check_for_*() routines and check_end_of_line() are
 *     then applied, some only when the matching warnings flag is set.
 *   - At a paragraph break, mismatched quotes are checked, the counters
 *     are reset and check_for_omitted_punctuation() is run on prevline.
 *
 * Tear-down: a final mismatched-quote check, pending output is flushed,
 * duplicate queried words are reported unless the OVERVIEW or VERBOSE
 * switch is set, the trees and counters are released, the print handler
 * is reset to the default and print_as_windows_1252(NULL) is called.
 *
 * NOTE(review): the trailing-dash scan starts at
 * g_utf8_prev_char(aline+strlen(aline)) and reads the character before
 * testing s>aline; for an empty line this looks before the start of the
 * buffer unless an elided guard prevents it - confirm. Many branch
 * conditions and the frees of etext, parastart and prevline are on
 * lines not shown in this listing.
 */
2955 void procfile(const char *filename)
2958 gchar *parastart=NULL; /* first line of current para */
2959 gchar *etext,*aline;
2962 struct first_pass_results *first_pass_results;
2963 struct warnings *warnings;
2964 struct counters counters={0};
2965 struct line_properties last={0};
2966 struct parities parities={0};
2967 struct pending pending={0};
2968 gboolean isemptyline;
2969 long start_para_line=0;
2970 gboolean isnewpara=FALSE,enddash=FALSE;
2971 last.start=CHAR_SPACE;
2972 linecnt=checked_linecnt=0;
2973 etext=read_etext(filename,&err);
2976 if (pswit[STDOUT_SWITCH])
2977 fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
2979 fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
2982 g_print("\n\nFile: %s\n\n",filename);
2983 first_pass_results=first_pass(etext);
2984 warnings=report_first_pass(first_pass_results);
2985 qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
2986 qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
2988 * Here we go with the main pass. Hold onto yer hat!
2992 while ((aline=flgets(&etext_ptr,linecnt+1)))
2997 if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
2998 continue; // skip DP page separators completely
2999 if (linecnt<first_pass_results->firstline ||
3000 (first_pass_results->footerline>0 &&
3001 linecnt>first_pass_results->footerline))
3003 if (pswit[HEADER_SWITCH])
3005 if (g_str_has_prefix(aline,"Title:"))
3006 g_print(" %s\n",aline);
3007 if (g_str_has_prefix(aline,"Author:"))
3008 g_print(" %s\n",aline);
3009 if (g_str_has_prefix(aline,"Release Date:"))
3010 g_print(" %s\n",aline);
3011 if (g_str_has_prefix(aline,"Edition:"))
3012 g_print(" %s\n\n",aline);
3014 continue; /* skip through the header */
3017 print_pending(aline,parastart,&pending);
3018 isemptyline=analyse_quotes(aline,&counters);
3019 if (isnewpara && !isemptyline)
3021 /* This line is the start of a new paragraph. */
3022 start_para_line=linecnt;
3023 /* Capture its first line in case we want to report it later. */
3025 parastart=g_strdup(aline);
3026 memset(&parities,0,sizeof(parities)); /* restart the quote count */
3028 while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
3029 !g_unichar_isdigit(g_utf8_get_char(s)))
3030 s=g_utf8_next_char(s);
3031 if (g_unichar_islower(g_utf8_get_char(s)))
3033 /* and its first letter is lowercase */
3034 if (pswit[ECHO_SWITCH])
3035 g_print("\n%s\n",aline);
3036 if (!pswit[OVERVIEW_SWITCH])
3037 g_print(" Line %ld column %ld - "
3038 "Paragraph starts with lower-case\n",
3039 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
3043 isnewpara=FALSE; /* Signal the end of new para processing. */
3045 /* Check for an em-dash broken at line end. */
3046 if (enddash && g_utf8_get_char(aline)=='-')
3048 if (pswit[ECHO_SWITCH])
3049 g_print("\n%s\n",aline);
3050 if (!pswit[OVERVIEW_SWITCH])
3051 g_print(" Line %ld column 1 - Broken em-dash?\n",linecnt);
3056 for (s=g_utf8_prev_char(aline+strlen(aline));
3057 g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
3059 if (s>=aline && g_utf8_get_char(s)=='-')
3061 check_for_control_characters(aline);
3062 check_for_odd_characters(aline,warnings,isemptyline);
3063 if (warnings->longline)
3064 check_for_long_line(aline);
3065 if (warnings->shortline)
3066 check_for_short_line(aline,&last);
3068 last.len=g_utf8_strlen(aline,-1);
3069 last.start=g_utf8_get_char(aline);
3070 check_for_starting_punctuation(aline);
3073 check_for_spaced_emdash(aline);
3074 check_for_spaced_dash(aline);
3076 check_for_unmarked_paragraphs(aline);
3077 check_for_jeebies(aline);
3078 check_for_mta_from(aline);
3079 check_for_orphan_character(aline);
3080 check_for_pling_scanno(aline);
3081 check_for_extra_period(aline,warnings);
3082 check_for_following_punctuation(aline);
3083 check_for_typos(aline,warnings);
3084 check_for_misspaced_punctuation(aline,&parities,isemptyline);
3085 check_for_double_punctuation(aline,warnings);
3086 check_for_spaced_quotes(aline);
3087 check_for_miscased_genative(aline);
3088 check_end_of_line(aline,warnings);
3089 check_for_unspaced_bracket(aline);
3090 if (warnings->endquote)
3091 check_for_unpunctuated_endquote(aline);
3092 check_for_html_tag(aline);
3093 check_for_html_entity(aline);
3096 check_for_mismatched_quotes(&counters,&pending);
3097 counters_reset(&counters);
3098 /* let the next iteration know that it's starting a new para */
3101 check_for_omitted_punctuation(prevline,&last,start_para_line);
3104 prevline=g_strdup(aline);
3107 check_for_mismatched_quotes(&counters,&pending);
3108 print_pending(NULL,parastart,&pending);
3109 reset_pending(&pending);
3118 if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])
3119 g_tree_foreach(qword,report_duplicate_queries,NULL);
3120 g_tree_unref(qword);
3121 g_tree_unref(qperiod);
3122 counters_destroy(&counters);
3123 g_set_print_handler(NULL);
3124 print_as_windows_1252(NULL);
3125 if (pswit[MARKUP_SWITCH])
3132 * Get one line from the input text, checking for
3133 * the existence of exactly one CR/LF line-end per line.
3135 * Returns: a pointer to the line.
/*
 * Fetch the next line of the text starting at *etext and advance *etext
 * for the following call. lcnt is the line number used in any
 * line-ending queries.
 *
 * When pswit[LINE_END_SWITCH] is set, irregular line endings are
 * queried: "No LF?", "No CR?" (a LF without a preceding CR),
 * "Two successive CRs?" and "CR without LF?". For these queries the
 * text of the line so far (theline up to eos) is echoed when
 * pswit[ECHO_SWITCH] is set, and the query itself is suppressed by
 * pswit[OVERVIEW_SWITCH].
 *
 * Before the line is handed back it is cleaned in place:
 * postprocess_for_HTML() when pswit[MARKUP_SWITCH] is set and
 * postprocess_for_DP() when pswit[DP_SWITCH] is set.
 *
 * NOTE(review): theline is initialised from *etext, so the returned
 * line appears to live inside the caller's text buffer rather than
 * being a fresh allocation; how the line is terminated, and what is
 * returned at end of text, are on lines not shown here - procfile()
 * loops until the result is NULL. Confirm against the full source.
 */
3137 char *flgets(char **etext,long lcnt)
3140 gboolean isCR=FALSE;
3141 char *theline=*etext;
3146 c=g_utf8_get_char(*etext);
3149 if (*etext==theline)
3151 else if (pswit[LINE_END_SWITCH])
3153 if (pswit[ECHO_SWITCH])
3155 s=g_strndup(theline,eos-theline);
3156 g_print("\n%s\n",s);
3159 if (!pswit[OVERVIEW_SWITCH])
3160 /* There may, or may not, have been a CR */
3161 g_print(" Line %ld - No LF?\n",lcnt);
3167 *etext=g_utf8_next_char(*etext);
3168 /* either way, it's end of line */
3175 /* Error - a LF without a preceding CR */
3176 if (pswit[LINE_END_SWITCH])
3178 if (pswit[ECHO_SWITCH])
3180 s=g_strndup(theline,eos-theline);
3181 g_print("\n%s\n",s);
3184 if (!pswit[OVERVIEW_SWITCH])
3185 g_print(" Line %ld - No CR?\n",lcnt);
3196 /* Error - two successive CRs */
3197 if (pswit[LINE_END_SWITCH])
3199 if (pswit[ECHO_SWITCH])
3201 s=g_strndup(theline,eos-theline);
3202 g_print("\n%s\n",s);
3205 if (!pswit[OVERVIEW_SWITCH])
3206 g_print(" Line %ld - Two successive CRs?\n",lcnt);
3215 if (pswit[LINE_END_SWITCH] && isCR)
3217 if (pswit[ECHO_SWITCH])
3219 s=g_strndup(theline,eos-theline);
3220 g_print("\n%s\n",s);
3223 if (!pswit[OVERVIEW_SWITCH])
3224 g_print(" Line %ld column %ld - CR without LF?\n",
3225 lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
3231 eos=g_utf8_next_char(eos);
3235 if (pswit[MARKUP_SWITCH])
3236 postprocess_for_HTML(theline);
3237 if (pswit[DP_SWITCH])
3238 postprocess_for_DP(theline);
3245 * Takes a "word" as a parameter, and checks whether it
3246 * contains a mixture of alpha and digits. Generally, this is an
3247 * error, but may not be for cases like 4th or L5 12s. 3d.
3249 * Returns: TRUE iff an error is found.
/*
 * Decide whether checkword suspiciously mixes letters and digits.
 *
 * The word is scanned once to note whether it contains any alphabetic
 * character and any digit. Only when it contains both are the
 * exceptions considered. nondigit is advanced past the leading run of
 * digits and the remainder is compared against the accepted suffixes:
 *   - "st", "rd", "nd", "th" in either case (21st, 3RD);
 *   - "sts", "rds", "nds", "ths" and "stly", "rdly", "ndly", "thly",
 *     also in either case;
 *   - "l" in either case, or lower-case "s" or "d" (old British money:
 *     12l. 3s. 11d.).
 * Finally a leading capital 'L' is accepted when the rest of the word,
 * checked by a recursive call, is not itself flagged.
 *
 * NOTE(review): the assignments to the flags and to query, and the
 * return statement, are on lines not shown in this listing; presumably
 * query is set when both kinds of character are present and cleared
 * by each exception - confirm.
 */
3251 gboolean mixdigit(const char *checkword)
3253 gboolean wehaveadigit,wehavealetter,query;
3254 const char *s,*nondigit;
3255 wehaveadigit=wehavealetter=query=FALSE;
3256 for (s=checkword;*s;s=g_utf8_next_char(s))
3257 if (g_unichar_isalpha(g_utf8_get_char(s)))
3259 else if (g_unichar_isdigit(g_utf8_get_char(s)))
3261 if (wehaveadigit && wehavealetter)
3263 /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
3265 for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
3266 nondigit=g_utf8_next_char(nondigit))
3268 /* digits, ending in st, rd, nd, th of either case */
3269 if (!g_ascii_strcasecmp(nondigit,"st") ||
3270 !g_ascii_strcasecmp(nondigit,"rd") ||
3271 !g_ascii_strcasecmp(nondigit,"nd") ||
3272 !g_ascii_strcasecmp(nondigit,"th"))
3274 if (!g_ascii_strcasecmp(nondigit,"sts") ||
3275 !g_ascii_strcasecmp(nondigit,"rds") ||
3276 !g_ascii_strcasecmp(nondigit,"nds") ||
3277 !g_ascii_strcasecmp(nondigit,"ths"))
3279 if (!g_ascii_strcasecmp(nondigit,"stly") ||
3280 !g_ascii_strcasecmp(nondigit,"rdly") ||
3281 !g_ascii_strcasecmp(nondigit,"ndly") ||
3282 !g_ascii_strcasecmp(nondigit,"thly"))
3284 /* digits, ending in l, L, s or d */
3285 if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
3286 !strcmp(nondigit,"d"))
3289 * L at the start of a number, representing Britsh pounds, like L500.
3290 * This is cute. We know the current word is mixed digit. If the first
3291 * letter is L, there must be at least one digit following. If both
3292 * digits and letters follow, we have a genuine error, else we have a
3293 * capital L followed by digits, and we accept that as a non-error.
3295 if (g_utf8_get_char(checkword)=='L' &&
3296 !mixdigit(g_utf8_next_char(checkword)))
3305 * Extracts the first/next "word" from the line, and returns it.
3306 * A word is defined as one English word unit--or at least that's the aim.
3307 * "ptr" is advanced to the position in the line where we will start
3308 * looking for the next word.
3310 * Returns: A newly-allocated string.
/*
 * Extract the next word from the text at *ptr.
 *
 * Leading characters that are neither digits nor letters are skipped,
 * advancing *ptr (the skip also stops at the terminating NUL).
 *
 * A look-ahead pass then collects a run of digits, letters, ',' and
 * '.' into the GString. If that run contains a '.' or ',' immediately
 * preceded by a digit, it is taken to be a punctuated number such as
 * 1,000 or 1.35 and returned whole.
 *
 * Otherwise the GString is emptied and an ordinary word is collected
 * from *ptr: a run of digits, letters and apostrophes, with *ptr left
 * on the first character after the run.
 *
 * Both return paths use g_string_free(word,FALSE), which hands the
 * character buffer to the caller; the caller must release it with
 * g_free().
 *
 * NOTE(review): the start position of the look-ahead pointer s, any
 * guard for an empty look-ahead result, and the update of *ptr on the
 * punctuated-number path are on lines not shown here - confirm.
 */
3312 gchar *getaword(const char **ptr)
3317 word=g_string_new(NULL);
3318 for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
3319 !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
3320 **ptr;*ptr=g_utf8_next_char(*ptr))
3323 * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
3324 * Especially yucky is the case of L1,000
3325 * This section looks for a pattern of characters including a digit
3326 * followed by a comma or period followed by one or more digits.
3327 * If found, it returns this whole pattern as a word; otherwise we discard
3328 * the results and resume our normal programming.
3331 for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
3332 g_unichar_isalpha(g_utf8_get_char(s)) ||
3333 g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
3334 g_string_append_unichar(word,g_utf8_get_char(s));
3337 for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))
3339 c=g_utf8_get_char(t);
3340 pc=g_utf8_get_char(g_utf8_prev_char(t));
3341 if ((c=='.' || c==',') && g_unichar_isdigit(pc))
3344 return g_string_free(word,FALSE);
3348 /* we didn't find a punctuated number - do the regular getword thing */
3349 g_string_truncate(word,0);
3350 c=g_utf8_get_char(*ptr);
3351 for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);
3352 *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))
3353 g_string_append_unichar(word,c);
3354 return g_string_free(word,FALSE);
3360 * Is this word a Roman Numeral?
3362 * It doesn't actually validate that the number is a valid Roman Numeral--for
3363 * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
3364 * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
3365 * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
3366 * expressions thereof, except when it came to taxes. Allow any number of M,
3367 * an optional D, an optional CM or CD, any number of optional Cs, an optional
3368 * XL or an optional XC, an optional IX or IV, an optional V and any number
/*
 * Test whether t has the general shape of a Roman numeral by consuming,
 * in order: any number of 'm'; an optional 'd'; an optional "cm" or
 * "cd"; any number of 'c'; an optional "xl" or "xc"; an optional 'l';
 * any number of 'x'; an optional "ix" or "iv"; an optional 'v'; and any
 * number of 'i'.
 *
 * All comparisons are against lower-case letters.
 *
 * NOTE(review): the statements that advance t after each match, any
 * initial check for an empty string, and the final test that produces
 * the result are on lines not shown in this listing. Presumably callers
 * pass an already lower-cased word and the result is TRUE when the
 * whole string has been consumed - confirm.
 */
3371 gboolean isroman(const char *t)
3377 while (g_utf8_get_char(t)=='m' && *t)
3379 if (g_utf8_get_char(t)=='d')
3381 if (g_str_has_prefix(t,"cm"))
3383 if (g_str_has_prefix(t,"cd"))
3385 while (g_utf8_get_char(t)=='c' && *t)
3387 if (g_str_has_prefix(t,"xl"))
3389 if (g_str_has_prefix(t,"xc"))
3391 if (g_utf8_get_char(t)=='l')
3393 while (g_utf8_get_char(t)=='x' && *t)
3395 if (g_str_has_prefix(t,"ix"))
3397 if (g_str_has_prefix(t,"iv"))
3399 if (g_utf8_get_char(t)=='v')
3401 while (g_utf8_get_char(t)=='i' && *t)
3407 * postprocess_for_DP:
3409 * Invoked with the -d switch from flgets().
3410 * It simply "removes" from the line a hard-coded set of common
3411 * DP-specific tags, so that the line passed to the main routine has
3412 * been pre-cleaned of DP markup.
/*
 * Remove every occurrence of every DPmarkup[] entry from theline, in
 * place. The table is walked until an entry that is the empty string.
 * Each occurrence found with strstr() is cut out by moving the rest of
 * the line, including its terminating NUL, down over it with memmove().
 */
3414 void postprocess_for_DP(char *theline)
3420 for (i=0;*DPmarkup[i];i++)
3421 while ((s=strstr(theline,DPmarkup[i])))
3423 t=s+strlen(DPmarkup[i]);
3424 memmove(s,t,strlen(t)+1);
3429 * postprocess_for_HTML:
3431 * Invoked with the -m switch from flgets().
3432 * It simply "removes" from the line a hard-coded set of common
3433 * HTML tags and "replaces" a hard-coded set of common HTML
3434 * entities, so that the line passed to the main routine has
3435 * been pre-cleaned of HTML.
/*
 * Clean theline of HTML in place: losemarkup() is called repeatedly
 * for as long as it returns non-NULL, then loseentities() is run once
 * over the result.
 */
3437 void postprocess_for_HTML(char *theline)
3439 while (losemarkup(theline))
3441 loseentities(theline);
/*
 * Try to remove one recognised HTML tag from theline, in place.
 *
 * The first '<' of the line and the first '>' at or after it delimit
 * the candidate. The text following the '<' is compared with each
 * markup[] entry (the table ends at an empty string) using tagcomp();
 * on a match everything from '<' through '>' is removed by moving the
 * remainder of the line, including its NUL, down with memmove().
 *
 * NOTE(review): the return statements are on lines not shown here.
 * postprocess_for_HTML() keeps calling this function while the result
 * is non-NULL, so presumably a non-NULL pointer is returned after a
 * removal and NULL when no tag is found or the <xxx> is unrecognised -
 * confirm.
 */
3444 char *losemarkup(char *theline)
3448 s=strchr(theline,'<');
3449 t=s?strchr(s,'>'):NULL;
3452 for (i=0;*markup[i];i++)
3453 if (tagcomp(g_utf8_next_char(s),markup[i]))
3455 t=g_utf8_next_char(t);
3456 memmove(s,t,strlen(t)+1);
3459 /* It's an unrecognized <xxx>. */
/*
 * Replace HTML entities in theline, in place.
 *
 * Resources: a GTree mapping entity names from HTMLentities[] to their
 * code points, and two iconv handles - translit (UTF-8 to ISO_8859-1
 * with transliteration) and to_utf8 (ISO_8859-1 back to UTF-8). One
 * branch destroys the tree and closes both handles; presumably it is
 * selected by a NULL argument as a clean-up call (the test is on a line
 * not shown here - confirm).
 *
 * Processing: for each '&' with a following ';' the code point c is
 * obtained from a decimal reference (strtol base 10 from amp+2), a
 * hexadecimal reference (an 'x' at amp[2], strtol base 16 from amp+3),
 * or a lookup of the name between '&' and ';' in the tree.
 *   - If c is below 128 or in the range 192..255 it is written
 *     directly as UTF-8 over the entity.
 *   - Otherwise c is encoded as UTF-8, passed through translit and then
 *     to_utf8, and the resulting bytes are copied over the entity.
 * The text after the ';' is then moved down to close the gap, and the
 * search continues.
 *
 * NOTE(review): entities is declared here without "static" while the
 * two iconv handles are static. As shown, the tree would be rebuilt on
 * every call and the clean-up branch would never see a tree to destroy -
 * confirm against the full source whether "static" is intended.
 * The check that amp[1] is '#', the handling of unknown names, and
 * the frees of s and t are on lines not visible in this listing.
 */
3463 void loseentities(char *theline)
3470 GTree *entities=NULL;
3471 static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
3475 g_tree_destroy(entities);
3477 if (translit!=(GIConv)-1)
3478 g_iconv_close(translit);
3479 translit=(GIConv)-1;
3480 if (to_utf8!=(GIConv)-1)
3481 g_iconv_close(to_utf8);
3489 entities=g_tree_new((GCompareFunc)strcmp);
3490 for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
3491 g_tree_insert(entities,HTMLentities[i].name,
3492 GUINT_TO_POINTER(HTMLentities[i].c));
3494 if (translit==(GIConv)-1)
3495 translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
3496 if (to_utf8==(GIConv)-1)
3497 to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
3498 while((amp=strchr(theline,'&')))
3500 scolon=strchr(amp,';');
3505 if (amp+2+strspn(amp+2,"0123456789")==scolon)
3506 c=strtol(amp+2,NULL,10);
3507 else if (amp[2]=='x' &&
3508 amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
3509 c=strtol(amp+3,NULL,16);
3513 s=g_strndup(amp+1,scolon-(amp+1));
3514 c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
3523 if (c<128 || c>=192 && c<=255) /* An ISO-8859-1 character */
3524 theline+=g_unichar_to_utf8(c,theline);
3528 nb=g_unichar_to_utf8(c,s);
3529 t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
3531 s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
3533 memcpy(theline,s,nb);
3537 memmove(theline,g_utf8_next_char(scolon),
3538 strlen(g_utf8_next_char(scolon))+1);
3541 theline=g_utf8_next_char(amp);
/*
 * Case-insensitive test of whether the tag text at strin begins with
 * basetag. A leading '/' on strin is skipped first, so closing tags
 * match the same table entries as opening tags. Both strings are
 * case-folded with g_utf8_casefold() and compared with
 * g_str_has_prefix(); the result of that comparison is kept in retval.
 *
 * NOTE(review): g_utf8_casefold() returns newly allocated strings; the
 * frees of s and t and the return statement are on lines not shown in
 * this listing - confirm both copies are released.
 */
3545 gboolean tagcomp(const char *strin,const char *basetag)
3549 if (g_utf8_get_char(strin)=='/')
3550 t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
3552 t=g_utf8_casefold(strin,-1);
3553 s=g_utf8_casefold(basetag,-1);
3554 retval=g_str_has_prefix(t,s);
3560 void proghelp(GOptionContext *context)
3563 fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
3564 fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
3565 fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
3566 fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
3567 "For details, read the file COPYING.\n",stderr);
3568 fputs("This is Free Software; "
3569 "you may redistribute it under certain conditions (GPL);\n",stderr);
3570 fputs("read the file COPYING for details.\n\n",stderr);
3571 help=g_option_context_get_help(context,TRUE,NULL);
3574 fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
3575 fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
3576 "non-ASCII\n",stderr);
3577 fputs("characters like accented letters, "
3578 "lines longer than 75 or shorter than 55,\n",stderr);
3579 fputs("unbalanced quotes or brackets, "
3580 "a variety of badly formatted punctuation, \n",stderr);
3581 fputs("HTML tags, some likely typos. "
3582 "It is NOT a substitute for human judgement.\n",stderr);