1 /*************************************************************************/
2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com> */
5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk> */
7 /* This program is free software; you can redistribute it and/or modify */
8 /* it under the terms of the GNU General Public License as published by */
9 /* the Free Software Foundation; either version 2 of the License, or */
10 /* (at your option) any later version. */
12 /* This program is distributed in the hope that it will be useful, */
13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
15 /* GNU General Public License for more details. */
17 /* You should have received a copy of the GNU General Public License */
18 /* along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 /*************************************************************************/
30 #include "bookloupe.h"
33 #include "HTMLentities.h"
/*
 * Character-set state shared by set_charset and its callers.
 * charset_validator starts as (GIConv)-1, the value this file tests for
 * before closing or reporting on the converter.
 */
35 gchar *charset; /* Or NULL for auto (ISO_8859-1/ASCII or UNICODE) */
36 GIConv charset_validator=(GIConv)-1;
/*
 * Table of lower-case word strings. Many entries look like letter
 * transpositions (teh, adn, yuo) and others look like b/h or rn/m
 * misreadings (bave, heen, hirn, prornise).
 * NOTE(review): the declaration line and the table terminator are not
 * visible in this listing; presumably this is the list consulted by the
 * common-typo check - TODO confirm against the full file.
 */
42 "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
43 "nad", "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa",
44 "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
45 "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
46 "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
47 "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
48 "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
49 "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
50 "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
51 "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
52 "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
53 "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
54 "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
55 "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
56 "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
57 "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
58 "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
59 "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
60 "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
61 "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
62 "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
63 "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
64 "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
65 "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
66 "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
67 "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
68 "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
69 "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
70 "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
/*
 * Whitelist table; all entries are lower case and the last visible entry
 * is an empty string, presumably the end-of-table marker - TODO confirm.
 * The declaration line is not visible in this listing.
 */
76 /* Common abbreviations and other OK words not to query as typos. */
78 "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
79 "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
80 "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
81 "outbid", "outbids", "frostbite", "frostbitten", ""
/*
 * Abbreviation table; lower-case entries ending with an empty string,
 * presumably the end-of-table marker - TODO confirm. The declaration line
 * is not visible in this listing.
 */
84 /* Common abbreviations that cause otherwise unexplained periods. */
86 "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
87 "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
/*
 * Word-start letter pairs. The comment delimiters and the declaration line
 * of this table are not visible in this listing; the list ends with an
 * empty string, presumably the end-of-table marker - TODO confirm.
 */
91 * Two-Letter combinations that rarely if ever start words,
92 * but are common scannos or otherwise common letter combinations.
95 "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
/*
 * Word-end letter pairs. The comment delimiters and the declaration line
 * of this table are not visible in this listing; the list ends with an
 * empty string, presumably the end-of-table marker - TODO confirm.
 */
99 * Two-Letter combinations that rarely if ever end words,
100 * but are common scannos or otherwise common letter combinations.
103 "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
104 "sw", "gr", "sl", "cl", "iy", ""
/*
 * Table of HTML element names without angle brackets, ending with an empty
 * string. NOTE(review): the declaration line is not visible; presumably
 * this is the tag list used when markup handling is enabled - TODO confirm.
 */
108 "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
109 "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
110 "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
111 "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
/*
 * Table of literal markup tokens ending with an empty string.
 * NOTE(review): the declaration line is not visible; presumably these are
 * the DP-specific markers ignored under the dp switch - TODO confirm.
 */
115 "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
/*
 * Lower-case word table ending with an empty string. Its declaration line
 * and any describing comment are not visible in this listing, so its
 * purpose cannot be stated from here - TODO confirm against the full file.
 * Note that mrs appears twice among the entries.
 */
119 "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
120 "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
121 "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
122 "during", "let", "toward", "among", ""
/*
 * Lower-case word table ending with an empty string. Its declaration line
 * and any describing comment are not visible in this listing, so its
 * purpose cannot be stated from here - TODO confirm against the full file.
 */
126 "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
127 "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
128 "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
129 "among", "those", "into", "whom", "having", "thence", ""
/*
 * Switch storage. The option tables store into pswit through
 * pswit+X_SWITCH, so each slot is indexed by one of the *_SWITCH constants.
 * typo_compat and paranoid_compat receive the toggle-typo and
 * toggle-relaxed compatibility options.
 */
132 gboolean pswit[SWITNO]; /* program switches */
135 gboolean typo_compat,paranoid_compat;
/*
 * Main command-line switch table, registered by parse_options and also
 * walked by config_file_update, config_file_add_comments and
 * parse_config_file.
 *
 * Every switch appears as a pair sharing one pswit slot: a plain long name
 * and a no- prefixed twin carrying G_OPTION_FLAG_REVERSE. One member of
 * each pair is marked G_OPTION_FLAG_HIDDEN.
 * NOTE(review): the terminating entry and closing brace are not visible in
 * this listing; the loops over this table stop at a NULL long_name.
 */
137 static GOptionEntry options[]={
138 { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
139 "Ignore DP-specific markup", NULL },
140 { "no-dp", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
141 G_OPTION_ARG_NONE, pswit+DP_SWITCH,
142 "Don't ignore DP-specific markup", NULL },
143 { "echo", 0, G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
144 "Echo queried line", NULL },
145 { "no-echo", 'e', G_OPTION_FLAG_REVERSE,
146 G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
147 "Don't echo queried line", NULL },
148 { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
149 "Check single quotes", NULL },
150 { "no-squote", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
151 G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
152 "Don't check single quotes", NULL },
153 { "typo", 0, 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
154 "Check common typos", NULL },
155 { "no-typo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
156 G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
157 "Don't check common typos", NULL },
158 { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
159 "Require closure of quotes on every paragraph", NULL },
160 { "no-qpara", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
161 G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
162 "Don't require closure of quotes on every paragraph", NULL },
163 { "paranoid", 0, G_OPTION_FLAG_HIDDEN,
164 G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
165 "Enable paranoid querying of everything", NULL },
166 { "no-paranoid", 0, G_OPTION_FLAG_REVERSE,
167 G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
168 "Disable paranoid querying of everything", NULL },
169 { "line-end", 0, G_OPTION_FLAG_HIDDEN,
170 G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
171 "Enable line end checking", NULL },
172 { "no-line-end", 'l', G_OPTION_FLAG_REVERSE,
173 G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
174 "Disable line end checking", NULL },
175 { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
176 "Overview: just show counts", NULL },
177 { "no-overview", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
178 G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
179 "Show individual warnings", NULL },
180 { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
181 "Output errors to stdout instead of stderr", NULL },
182 { "no-stdout", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
183 G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
184 "Output errors to stderr instead of stdout", NULL },
185 { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
186 "Echo header fields", NULL },
187 { "no-header", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
188 G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
189 "Don't echo header fields", NULL },
190 { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
191 "Ignore markup in < >", NULL },
192 { "no-markup", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
193 G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
194 "No special handling for markup in < >", NULL },
195 { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
196 "Use file of user-defined typos", NULL },
197 { "no-usertypo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
198 G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
199 "Ignore file of user-defined typos", NULL },
200 { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
201 "Verbose - list everything", NULL },
202 { "no-verbose", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
203 G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
204 "Switch off verbose mode", NULL },
/*
 * Second option table added to the main option group by parse_options.
 * web and dump-config store into pswit; charset stores a string into
 * opt_charset (declared outside the lines visible here).
 * NOTE(review): the comment delimiters around the two lines below, the
 * terminating entry and the closing brace are not visible in this listing.
 */
209 * Options relating to configuration which make no sense from inside
210 * a configuration file.
213 static GOptionEntry config_options[]={
214 { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
215 "Defaults for use on www upload", NULL },
216 { "dump-config", 0, 0, G_OPTION_ARG_NONE, pswit+DUMP_CONFIG_SWITCH,
217 "Dump current config settings", NULL },
218 { "charset", 0, 0, G_OPTION_ARG_STRING, &opt_charset,
219 "Set of characters valid for this ebook", "NAME" },
/*
 * Gutcheck-compatibility toggles, registered by parse_options as the
 * separate compatibility option group.
 *
 * Unlike the main table these do not write into pswit directly: each one
 * sets its own gboolean (typo_compat, paranoid_compat) and parse_options
 * flips the affected pswit slots afterwards.
 *
 * The arg_data of toggle-relaxed is the address of paranoid_compat; the
 * listing had the leading ampersand and the letters para fused into a
 * single pilcrow character by HTML entity decoding, which is repaired here.
 * NOTE(review): the terminating entry and closing brace are not visible in
 * this listing.
 */
223 static GOptionEntry compatibility_options[]={
224 { "toggle-typo", 't', 0, G_OPTION_ARG_NONE, &typo_compat,
225 "Toggle checking for common typos", NULL },
226 { "toggle-relaxed", 'x', 0, G_OPTION_ARG_NONE, &paranoid_compat,
227 "Toggle both paranoid mode and common typos", NULL },
/*
 * Query tallies and line counts. main prints the cnt_* values in the
 * overview summary and adds eleven of them into the TOTAL QUERIES figure;
 * cnt_spacend is not part of that sum.
 */
231 long cnt_quote; /* for overview mode, count of quote queries */
232 long cnt_brack; /* for overview mode, count of brackets queries */
233 long cnt_bin; /* for overview mode, count of non-ASCII queries */
234 long cnt_odd; /* for overview mode, count of odd character queries */
235 long cnt_long; /* for overview mode, count of long line errors */
236 long cnt_short; /* for overview mode, count of short line queries */
237 long cnt_punct; /* for overview mode,
238 count of punctuation and spacing queries */
239 long cnt_dash; /* for overview mode, count of dash-related queries */
240 long cnt_word; /* for overview mode, count of word queries */
241 long cnt_html; /* for overview mode, count of html queries */
242 long cnt_lineend; /* for overview mode, count of line-end queries */
243 long cnt_spacend; /* count of lines with space at end */
244 long linecnt; /* count of total lines in the file */
245 long checked_linecnt; /* count of lines actually checked */
/*
 * Forward declarations for routines whose bodies are not among the lines
 * visible here, followed by two GTree handles (qword, qperiod).
 * NOTE(review): some prototypes from the original run are missing from
 * this listing, so this is not the complete set.
 */
247 void proghelp(GOptionContext *context);
248 void procfile(const char *);
252 gboolean mixdigit(const char *);
253 gchar *getaword(const char **);
254 char *flgets(char **,long,int);
255 void postprocess_for_HTML(char *);
256 char *linehasmarkup(char *);
257 char *losemarkup(char *);
258 gboolean tagcomp(const char *,const char *);
259 void loseentities(char *);
260 gboolean isroman(const char *);
261 void postprocess_for_DP(char *);
262 void print_as_windows_1252(const char *string);
263 void print_as_utf_8(const char *string);
265 GTree *qword,*qperiod;
/*
 * config_file_update: copy the current switch values into the options
 * group of the key file kf.
 *
 * Walks options[] until an entry with a NULL long_name. Entries whose
 * long_name starts with no- are tested for separately (the action taken is
 * not visible in this listing; presumably they are skipped - TODO confirm).
 * For G_OPTION_ARG_NONE entries the gboolean behind arg_data is read,
 * G_OPTION_FLAG_REVERSE is tested (presumably to invert the value - TODO
 * confirm), and the result is written with g_key_file_set_boolean under the
 * entry long_name. g_assert_not_reached() is present, presumably on the
 * path for any other argument type.
 */
273 void config_file_update(GKeyFile *kf)
277 for(i=0;options[i].long_name;i++)
279 if (g_str_has_prefix(options[i].long_name,"no-"))
281 if (options[i].arg==G_OPTION_ARG_NONE)
283 sw=*(gboolean *)options[i].arg_data;
284 if (options[i].flags&G_OPTION_FLAG_REVERSE)
286 g_key_file_set_boolean(kf,"options",options[i].long_name,sw);
289 g_assert_not_reached();
/*
 * config_file_add_comments: annotate the key file kf.
 *
 * Sets a file-level comment (group and key both NULL), then walks
 * options[] and, for each entry, attaches the option description prefixed
 * with one space as the comment on the matching key in the options group.
 * Entries with a no- prefix are tested for separately; the action taken is
 * not visible in this listing (presumably skipped - TODO confirm).
 * NOTE(review): the string built by g_strconcat needs freeing; confirm the
 * g_free is among the lines not shown.
 */
293 void config_file_add_comments(GKeyFile *kf)
297 g_key_file_set_comment(kf,NULL,NULL," Default configuration for bookloupe",
299 for(i=0;options[i].long_name;i++)
301 if (g_str_has_prefix(options[i].long_name,"no-"))
303 comment=g_strconcat(" ",options[i].description,NULL);
304 g_key_file_set_comment(kf,"options",options[i].long_name,comment,NULL);
/*
 * dump_config: serialise the current configuration.
 *
 * Two paths are visible: one calls config_file_update on the existing
 * config, the other creates a fresh key file and fills it with
 * config_file_update plus config_file_add_comments. The condition choosing
 * between them is not visible (presumably whether a config file was
 * loaded - TODO confirm). The key file is then turned into text with
 * g_key_file_to_data; what happens to that text is not shown here.
 */
309 void dump_config(void)
313 config_file_update(config);
316 config=g_key_file_new();
317 config_file_update(config);
318 config_file_add_comments(config);
320 s=g_key_file_to_data(config,NULL,NULL);
/*
 * read_config_file: look for bookloupe.ini and load it as a key file.
 *
 * Search directories come from the BOOKLOUPE_CONFIG_PATH environment
 * variable, split on a semicolon or on a colon (two alternative calls are
 * visible; what selects between them is not - presumably a platform
 * conditional, TODO confirm). Another visible path builds a list from the
 * current directory, running_from and the user config directory; the
 * condition selecting it is not shown.
 *
 * Each directory is tried in order with g_key_file_load_from_file, keeping
 * comments and translations. A load error other than G_FILE_ERROR_NOENT is
 * reported on stderr with the path and the GError message.
 *
 * full_path: out parameter; how it is filled is not visible here.
 * Returns a GKeyFile pointer; the value returned when nothing is found is
 * not visible in this listing.
 */
326 GKeyFile *read_config_file(gchar **full_path)
332 const char *search_path;
335 search_path=g_getenv("BOOKLOUPE_CONFIG_PATH");
339 search_dirs=g_strsplit(search_path,";",0);
341 search_dirs=g_strsplit(search_path,":",0);
346 search_dirs=g_new(gchar *,4);
347 search_dirs[0]=g_get_current_dir();
348 search_dirs[1]=g_strdup(running_from);
349 search_dirs[2]=g_strdup(g_get_user_config_dir());
352 for(i=0;search_dirs[i];i++)
354 path=g_build_filename(search_dirs[i],"bookloupe.ini",NULL);
355 if (g_key_file_load_from_file(kf,path,
356 G_KEY_FILE_KEEP_COMMENTS|G_KEY_FILE_KEEP_TRANSLATIONS,&err))
358 if (!g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
360 g_printerr("Bookloupe: Error reading %s\n",path);
361 g_printerr("%s\n",err->message);
373 g_strfreev(search_dirs);
/*
 * parse_config_file: apply settings from the configuration file to the
 * switch variables.
 *
 * Loads the file through read_config_file, fetches the keys of the options
 * group, and matches each key against options[] by long_name (entries with
 * a no- prefix are tested for separately; the action is not visible).
 * For a G_OPTION_ARG_NONE match the boolean is read from the key file, a
 * read error is reported on stderr, G_OPTION_FLAG_REVERSE is tested
 * (presumably to invert - TODO confirm) and the value is stored through
 * arg_data. A key that matches no table entry produces an
 * Unknown option ... ignored message.
 * NOTE(review): the loop over keys and the handling of a missing config
 * file are not visible in this listing.
 */
381 void parse_config_file(void)
388 config=read_config_file(&path);
390 keys=g_key_file_get_keys(config,"options",NULL,NULL);
397 for(j=0;options[j].long_name;j++)
399 if (g_str_has_prefix(options[j].long_name,"no-"))
401 else if (!strcmp(keys[i],options[j].long_name))
403 if (options[j].arg==G_OPTION_ARG_NONE)
405 sw=g_key_file_get_boolean(config,"options",keys[i],
409 g_printerr("Bookloupe: %s: options.%s: %s\n",
410 path,keys[i],err->message);
413 if (options[j].flags&G_OPTION_FLAG_REVERSE)
415 *(gboolean *)options[j].arg_data=sw;
419 g_assert_not_reached();
422 if (!options[j].long_name)
423 g_printerr("Bookloupe: %s: Unknown option \"%s\" ignored\n",
/*
 * set_charset: select the character set used to validate the ebook.
 *
 * name: character set name, or NULL / auto (case-insensitive) for
 *       automatic mode, in which charset_validator is reset to (GIConv)-1.
 * err:  receives a G_CONVERT_ERROR_NO_CONVERSION error when g_iconv_open
 *       cannot open a converter for the requested set.
 *
 * Any converter already open is closed first. Names found in the
 * unicode_aliases list are replaced by UTF-8, and for UTF-8 no converter is
 * opened. Otherwise a converter from UTF-8 to the named set is opened.
 * The return statements are not visible in this listing; parse_options
 * treats a false return as failure.
 * NOTE(review): g_strcasecmp is deprecated in GLib; g_ascii_strcasecmp is
 * the documented replacement. Also confirm that the first g_strdup result
 * is freed before charset is reassigned on an alias match.
 */
432 gboolean set_charset(const char *name,GError **err)
434 /* The various UNICODE encodings all share the same character set. */
435 const char *unicode_aliases[]={ "UCS-2", "UCS-2BE", "UCS-2LE", "UCS-4",
436 "UCS-4BE", "UCS-4LE", "UCS2", "UCS4", "UNICODE", "UNICODEBIG",
437 "UNICODELITTLE", "UTF-7", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE",
438 "UTF-32", "UTF-32BE", "UTF-32LE", "UTF7", "UTF8", "UTF16", "UTF16BE",
439 "UTF16LE", "UTF32", "UTF32BE", "UTF32LE" };
443 if (charset_validator!=(GIConv)-1)
444 g_iconv_close(charset_validator);
445 if (!name || !g_strcasecmp(name,"auto"))
448 charset_validator=(GIConv)-1;
452 charset=g_strdup(name);
453 for(i=0;i<G_N_ELEMENTS(unicode_aliases);i++)
454 if (!g_strcasecmp(charset,unicode_aliases[i]))
457 charset=g_strdup("UTF-8");
460 if (!strcmp(charset,"UTF-8"))
461 charset_validator=(GIConv)-1;
464 charset_validator=g_iconv_open(charset,"UTF-8");
465 if (charset_validator==(GIConv)-1)
467 g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_NO_CONVERSION,
468 "Unknown character set \"%s\"",charset);
/*
 * parse_options: parse the command line into pswit and related globals.
 *
 * argc, argv: passed by address to g_option_context_parse.
 *
 * Builds a GOptionContext from options[] and config_options[] as main
 * entries plus a separate compatibility group, then parses. A parse error
 * is reported on stderr together with a hint to use --help.
 *
 * After parsing, in order:
 *  - TYPO_SWITCH is flipped, and PARANOID_SWITCH plus TYPO_SWITCH are
 *    flipped (the guarding conditions are not visible; presumably
 *    typo_compat and paranoid_compat respectively - TODO confirm);
 *  - WEB_SWITCH forces a fixed set of switch values;
 *  - a charset given on the command line is applied through set_charset,
 *    with the error message printed on failure;
 *  - DUMP_CONFIG_SWITCH is tested (action not visible);
 *  - OVERVIEW_SWITCH forces ECHO_SWITCH off.
 * The context is freed at the end. Exit paths are not visible here.
 */
475 void parse_options(int *argc,char ***argv)
478 GOptionContext *context;
479 GOptionGroup *compatibility;
480 context=g_option_context_new(
481 "file - look for errors in Project Gutenberg(TM) etexts");
482 g_option_context_add_main_entries(context,options,NULL);
483 g_option_context_add_main_entries(context,config_options,NULL);
484 compatibility=g_option_group_new("compatibility",
485 "Options for Compatibility with Gutcheck:",
486 "Show compatibility options",NULL,NULL);
487 g_option_group_add_entries(compatibility,compatibility_options);
488 g_option_context_add_group(context,compatibility);
489 g_option_context_set_description(context,
490 "For simplicity, only the switch options which reverse the\n"
491 "default configuration are listed. In most cases, both vanilla\n"
492 "and \"no-\" prefixed versions are available for use.");
493 if (!g_option_context_parse(context,argc,argv,&err))
495 g_printerr("Bookloupe: %s\n",err->message);
496 g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
500 pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
503 pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
504 pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
507 * Web uploads - for the moment, this is really just a placeholder
508 * until we decide what processing we really want to do on web uploads
510 if (pswit[WEB_SWITCH])
512 /* specific override for web uploads */
513 pswit[ECHO_SWITCH]=TRUE;
514 pswit[SQUOTE_SWITCH]=FALSE;
515 pswit[TYPO_SWITCH]=TRUE;
516 pswit[QPARA_SWITCH]=FALSE;
517 pswit[PARANOID_SWITCH]=TRUE;
518 pswit[LINE_END_SWITCH]=FALSE;
519 pswit[OVERVIEW_SWITCH]=FALSE;
520 pswit[STDOUT_SWITCH]=FALSE;
521 pswit[HEADER_SWITCH]=TRUE;
522 pswit[VERBOSE_SWITCH]=FALSE;
523 pswit[MARKUP_SWITCH]=FALSE;
524 pswit[USERTYPO_SWITCH]=FALSE;
525 pswit[DP_SWITCH]=FALSE;
527 if (opt_charset && !set_charset(opt_charset,&err))
529 g_printerr("%s\n",err->message);
532 if (pswit[DUMP_CONFIG_SWITCH])
539 if (pswit[OVERVIEW_SWITCH])
540 /* just print summary; don't echo */
541 pswit[ECHO_SWITCH]=FALSE;
547 g_option_context_free(context);
/*
 * read_user_scannos: load the user typo list into the usertypo tree.
 *
 * Candidate files are tried in this order, moving on only when the previous
 * attempt failed with G_FILE_ERROR_NOENT: bookloupe.typ in the current
 * directory, bookloupe.typ next to the executable (running_from),
 * gutcheck.typ in the current directory, gutcheck.typ next to the
 * executable. If none exists a notice is printed and processing continues
 * without user typos; any other error is written to stderr with the file
 * name.
 *
 * Contents that validate as UTF-8 are normalised with
 * G_NORMALIZE_DEFAULT_COMPOSE; otherwise they are converted from
 * WINDOWS-1252. set_charset is also called with UNICODE (the guarding
 * condition is not visible in this listing).
 *
 * The text is split on CR and LF, and every line whose first byte is
 * greater than the exclamation mark character is inserted as a key into
 * usertypo, which compares keys with strcmp and frees them with g_free.
 */
553 * Read in the user-defined stealth scanno list.
555 void read_user_scannos(void)
558 gchar *usertypo_file;
562 gchar *contents,*utf8,**lines;
563 usertypo_file=g_strdup("bookloupe.typ");
564 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
565 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
568 g_free(usertypo_file);
569 usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
570 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
572 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
575 g_free(usertypo_file);
576 usertypo_file=g_strdup("gutcheck.typ");
577 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
579 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
582 g_free(usertypo_file);
583 usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
584 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
586 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
588 g_free(usertypo_file);
589 g_print(" --> I couldn't find bookloupe.typ "
590 "-- proceeding without user typos.\n");
595 fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
596 g_free(usertypo_file);
600 if (g_utf8_validate(contents,len,NULL))
602 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
604 (void)set_charset("UNICODE",NULL);
607 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
609 lines=g_strsplit_set(utf8,"\r\n",0);
611 usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
612 for (i=0;lines[i];i++)
613 if (*(unsigned char *)lines[i]>'!')
614 g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
/*
 * read_etext: load an etext file and return its contents as UTF-8.
 *
 * filename: file to read with g_file_get_contents.
 * err:      receives the read error, or a conversion error.
 *
 * Input that validates as UTF-8 is normalised with
 * G_NORMALIZE_DEFAULT_COMPOSE and the print handler becomes
 * print_as_utf_8. Otherwise the bytes are converted from WINDOWS-1252 and
 * the print handler becomes print_as_windows_1252. In both cases a console
 * code page call is visible (presumably Windows-only code - TODO confirm).
 *
 * When conversion fails with G_CONVERT_ERROR_ILLEGAL_SEQUENCE, the bytes
 * before the failure point are scanned for LF and CR (the updates of line
 * and col are not visible here) and the error reports the offending byte
 * value with its line and column. A further path propagates the original
 * conversion error via g_propagate_error; what selects it is not visible.
 */
623 * Read an etext returning a newly allocated string containing the file
624 * contents or NULL on error.
626 gchar *read_etext(const char *filename,GError **err)
628 GError *tmp_err=NULL;
629 gchar *contents,*utf8;
630 gsize len,bytes_read,bytes_written;
632 if (!g_file_get_contents(filename,&contents,&len,err))
634 if (g_utf8_validate(contents,len,NULL))
636 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
637 g_set_print_handler(print_as_utf_8);
639 SetConsoleOutputCP(CP_UTF8);
644 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,
645 &bytes_written,&tmp_err);
646 if (g_error_matches(tmp_err,G_CONVERT_ERROR,
647 G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
650 for(i=0;i<bytes_read;i++)
651 if (contents[i]=='\n')
656 else if (contents[i]!='\r')
658 g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
659 "Input conversion failed. Byte %d at line %d, column %d is not a "
660 "valid Windows-1252 character",
661 ((unsigned char *)contents)[bytes_read],line,col);
664 g_propagate_error(err,tmp_err);
665 g_set_print_handler(print_as_windows_1252);
667 SetConsoleOutputCP(1252);
/*
 * cleanup_on_exit: atexit handler registered by main. Restores the console
 * output code page to saved_cp, the value main captured with
 * GetConsoleOutputCP.
 * NOTE(review): presumably the call sits inside a Windows-only conditional
 * whose directive lines are not visible here - TODO confirm.
 */
674 void cleanup_on_exit(void)
677 SetConsoleOutputCP(saved_cp);
/*
 * main: program entry point.
 *
 * Registers cleanup_on_exit, saves the console code page, and records the
 * directory of argv[0] in running_from. PARANOID, TYPO, LINE_END and ECHO
 * switches default to TRUE before parse_options runs. USERTYPO_SWITCH is
 * then tested (the action is not visible; presumably read_user_scannos -
 * TODO confirm) and a banner is written to stderr.
 *
 * In overview mode the per-category query counts and their total are
 * printed. On the way out running_from is freed, the usertypo tree is
 * released, set_charset(NULL,NULL) closes any open converter, and the
 * config key file is freed.
 * NOTE(review): argument checking, the call that processes the input file
 * and the return value are not visible in this listing.
 */
681 int main(int argc,char **argv)
684 atexit(cleanup_on_exit);
685 saved_cp=GetConsoleOutputCP();
687 running_from=g_path_get_dirname(argv[0]);
688 /* Paranoid checking is turned OFF, not on, by its switch */
689 pswit[PARANOID_SWITCH]=TRUE;
690 /* if running in paranoid mode, typo checks default to enabled */
691 pswit[TYPO_SWITCH]=TRUE;
692 /* Line-end checking is turned OFF, not on, by its switch */
693 pswit[LINE_END_SWITCH]=TRUE;
694 /* Echoing is turned OFF, not on, by its switch */
695 pswit[ECHO_SWITCH]=TRUE;
697 parse_options(&argc,&argv);
698 if (pswit[USERTYPO_SWITCH])
700 fprintf(stderr,"bookloupe: Check and report on an e-text\n");
702 if (pswit[OVERVIEW_SWITCH])
704 g_print(" Checked %ld lines of %ld (head+foot = %ld)\n\n",
705 checked_linecnt,linecnt,linecnt-checked_linecnt);
706 g_print(" --------------- Queries found --------------\n");
708 g_print(" Long lines: %14ld\n",cnt_long);
710 g_print(" Short lines: %14ld\n",cnt_short);
712 g_print(" Line-end problems: %14ld\n",cnt_lineend);
714 g_print(" Common typos: %14ld\n",cnt_word);
716 g_print(" Unmatched quotes: %14ld\n",cnt_quote);
718 g_print(" Unmatched brackets: %14ld\n",cnt_brack);
720 g_print(" Non-ASCII characters: %14ld\n",cnt_bin);
722 g_print(" Proofing characters: %14ld\n",cnt_odd);
724 g_print(" Punctuation & spacing queries: %14ld\n",cnt_punct);
726 g_print(" Non-standard dashes: %14ld\n",cnt_dash);
728 g_print(" Possible HTML tags: %14ld\n",cnt_html);
730 g_print(" TOTAL QUERIES %14ld\n",
731 cnt_quote+cnt_brack+cnt_bin+cnt_odd+cnt_long+cnt_short+cnt_punct+
732 cnt_dash+cnt_word+cnt_html+cnt_lineend);
734 g_free(running_from);
736 g_tree_unref(usertypo);
737 set_charset(NULL,NULL);
739 g_key_file_free(config);
/*
 * count_dashes: classify the occurrences of one dash string on a line.
 *
 * line:    UTF-8 text of the line.
 * dash:    the dash sequence to look for (first_pass passes a double hyphen
 *          and a literal em-dash).
 * results: counters updated in place; only the non_PG_space increment is
 *          visible in this listing.
 *
 * The line is split on dash. For every join between two neighbouring
 * pieces, the character just before the dash (pc) and the one just after
 * (nc) are tested with g_unichar_isspace: space on either side, space on
 * both sides, or space on neither. The flag assignments following these
 * tests are not visible here.
 * NOTE(review): when the piece before a dash is empty, g_utf8_prev_char is
 * applied to the start of that string; confirm a guard exists among the
 * lines not shown.
 */
743 void count_dashes(const char *line,const char *dash,
744 struct dash_results *results)
749 gboolean spaced=FALSE,unspaced=FALSE,spaced2=FALSE;
752 tokens=g_strsplit(line,dash,0);
755 for(i=1;tokens[i];i++)
757 pc=g_utf8_get_char(g_utf8_prev_char(tokens[i-1]+strlen(tokens[i-1])));
758 nc=g_utf8_get_char(tokens[i]);
759 if (g_unichar_isspace(pc) || g_unichar_isspace(nc))
761 if (g_unichar_isspace(pc) && g_unichar_isspace(nc))
763 else if (!g_unichar_isspace(pc) && !g_unichar_isspace(nc))
769 /* count of lines with em-dashes with spaces both sides */
770 results->non_PG_space++;
772 /* count of lines with PG-type em-dashes with no spaces */
/*
 * first_pass: scan the whole etext once and gather statistics.
 *
 * etext: the complete text as one UTF-8 string.
 * Returns a pointer to a function-static first_pass_results structure
 * (the return statement itself is not visible in this listing), so the
 * result is overwritten by a later call.
 *
 * Line terminators: the text is split on LF; a second split on CR is
 * visible for the case described as having no LFs. results.newlines is set
 * to UNIX_NEWLINES, OS9_NEWLINES or DOS_NEWLINES accordingly, and trailing
 * CR bytes are stripped from each line.
 *
 * Per line, the visible code:
 *  - looks for old-style (SMALL PRINT) and new-style (*** START ... PROJECT
 *    GUTENBERG) header markers and records the following line number in
 *    spline / nspline, printing a duplicate-header notice on one path;
 *  - once a header was seen, looks for a footer line containing end before
 *    project gutenberg (case-folded) and records it in results.footerline;
 *  - skips the remaining statistics for lines after the footer;
 *  - accumulates total length and tests each character for being above
 *    127, alphabetic, or a double quote followed by classification;
 *  - tests for short lines, trailing white space, the sequence dot-comma,
 *    asterisks on lines with lower-case text, forward slashes, a single
 *    trailing hyphen, long and very long lines, angle-bracket markup, and
 *    spaced dashes and em-dashes (via count_dashes);
 *  - compares words against hij/niet, dans/avec and 0/1 to count Dutch,
 *    French and standalone-digit hints.
 * Many of the counter increments sit on lines not visible here.
 * NOTE(review): laststart is assigned the first byte of the line rather
 * than a decoded character, and g_utf8_prev_char is applied to
 * lines[j]+lbytes; confirm behaviour for non-ASCII first characters and
 * for empty lines against the full file.
 */
780 * Run a first pass - verify that it's a valid PG
781 * file, decide whether to report some things that
782 * occur many times in the text like long or short
783 * lines, non-standard dashes, etc.
785 struct first_pass_results *first_pass(const char *etext)
787 gunichar laststart=CHAR_SPACE;
792 unsigned int lastlen=0,lastblen=0;
793 long spline=0,nspline=0;
794 static struct first_pass_results results={0};
795 struct dash_results tmp_dash_results;
798 lines=g_strsplit(etext,"\n",0);
801 /* An empty etext has no terminators */
802 results.newlines=DOS_NEWLINES;
807 * If there are no LFs, we don't have UNIX-style
808 * terminators, but we might have OS9-style ones.
810 results.newlines=OS9_NEWLINES;
812 lines=g_strsplit(etext,"\r",0);
813 if (!lines[0] || !lines[1])
814 /* Looks like we don't have any terminators at all */
815 results.newlines=DOS_NEWLINES;
819 /* We might have UNIX-style terminators */
820 results.newlines=UNIX_NEWLINES;
822 for (j=0;lines[j];j++)
824 lbytes=strlen(lines[j]);
825 if (lbytes>0 && lines[j][lbytes-1]=='\r')
827 results.newlines=DOS_NEWLINES;
830 lines[j][--lbytes]='\0';
831 } while (lbytes>0 && lines[j][lbytes-1]=='\r');
833 llen=g_utf8_strlen(lines[j],lbytes);
835 if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
836 (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
839 g_print(" --> Duplicate header?\n");
840 spline=linecnt+1; /* first line of non-header text, that is */
842 if (!strncmp(lines[j],"*** START",9) &&
843 strstr(lines[j],"PROJECT GUTENBERG"))
846 g_print(" --> Duplicate header?\n");
847 nspline=linecnt+1; /* first line of non-header text, that is */
849 if (spline || nspline)
851 lc_line=g_utf8_strdown(lines[j],lbytes);
852 if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
854 if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
856 if (results.footerline)
858 /* it's an old-form header - we can detect duplicates */
860 g_print(" --> Duplicate footer?\n");
863 results.footerline=linecnt;
869 results.firstline=spline;
871 results.firstline=nspline; /* override with new */
872 if (results.footerline)
873 continue; /* don't count the boilerplate in the footer */
874 results.totlen+=llen;
875 for (s=lines[j];*s;s=g_utf8_next_char(s))
877 if (g_utf8_get_char(s)>127)
879 if (g_unichar_isalpha(g_utf8_get_char(s)))
883 if (CHAR_IS_DQUOTE(g_utf8_get_char(s)))
884 qc=QUOTE_CLASS(g_utf8_get_char(s));
887 if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) &&
888 g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
889 results.endquote_count++;
892 if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
893 lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
896 g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
898 if (strstr(lines[j],".,"))
900 /* only count ast lines for ignoring purposes where there is */
901 /* locase text on the line */
902 if (strchr(lines[j],'*'))
904 for (s=lines[j];*s;s=g_utf8_next_char(s))
905 if (g_unichar_islower(g_utf8_get_char(s)))
910 if (strchr(lines[j],'/'))
911 results.fslashline++;
914 for (s=g_utf8_prev_char(lines[j]+lbytes);
915 s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;
916 s=g_utf8_prev_char(s))
918 if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
919 g_utf8_get_char(g_utf8_prev_char(s))!='-')
922 if (llen>LONGEST_PG_LINE)
924 if (llen>WAY_TOO_LONG)
925 results.verylongline++;
926 if (strchr(lines[j],'<') && strchr(lines[j],'>'))
928 i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
931 if (strstr(lines[j],"<i>"))
932 results.htmcount+=4; /* bonus marks! */
934 /* Check for spaced em-dashes */
935 memset(&tmp_dash_results,0,sizeof(tmp_dash_results));
936 count_dashes(lines[j],"--",&tmp_dash_results);
937 count_dashes(lines[j],"—",&tmp_dash_results);
938 if (tmp_dash_results.base)
939 results.emdash.base++;
940 if (tmp_dash_results.non_PG_space)
941 results.emdash.non_PG_space++;
942 if (tmp_dash_results.PG_space)
943 results.emdash.PG_space++;
947 if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))
948 results.Dutchcount++;
949 if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))
950 results.Frenchcount++;
951 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
952 results.standalone_digit++;
955 /* Check for spaced dashes */
956 if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
960 laststart=lines[j][0];
/*
 * report_first_pass: turn first-pass statistics into warning settings and
 * print a notice for each category that will be suppressed.
 *
 * results: statistics from first_pass; firstline is reset to 0 here when
 *          header and footer are fewer than 100 lines apart.
 * Returns a pointer to a function-static warnings structure (the return
 * statement itself is not visible in this listing).
 *
 * Thresholds visible in the code (a category above its threshold is not
 * reported individually):
 *   dot-comma lines          more than 5
 *   short lines, long lines  more than 50, or more than a tenth of linecnt
 *   asterisk lines           more than 10
 *   forward-slash lines      more than 10
 *   unpunctuated endquotes   more than 20
 *   standalone digits        more than 10
 *   end-of-line hyphens      more than 20
 *   spaced dashes            when spaced dashes plus spaced em-dashes
 *                            outnumber unspaced em-dashes
 * An htmcount above 20 switches MARKUP_SWITCH on. More than 50 Dutch or
 * French marker words set isDutch / isFrench. Hi-bit and alphabetic ratios
 * trigger the not-ASCII and not-text notices (the terminating action is not
 * visible). VERBOSE_SWITCH turns the visible warning flags back on.
 * NOTE(review): the in-body comment for standalone digits mentions 15
 * lines while the visible test is standalone_digit>10; one of the two
 * should be corrected.
 */
969 * Make some snap decisions based on the first pass results.
971 struct warnings *report_first_pass(struct first_pass_results *results)
973 static struct warnings warnings={0};
974 warnings.newlines=results->newlines;
975 if (warnings.newlines==UNIX_NEWLINES)
976 g_print(" --> No lines in this file have a CR. Not reporting them. "
977 "Project Gutenberg requires that all lineends be CR-LF.\n");
978 else if (warnings.newlines==OS9_NEWLINES)
979 g_print(" --> No lines in this file have a LF. Not reporting them. "
980 "Project Gutenberg requires that all lineends be CR-LF.\n");
982 g_print(" --> %ld lines in this file have white space at end\n",
985 if (results->dotcomma>5)
988 g_print(" --> %ld lines in this file contain '.,'. "
989 "Not reporting them.\n",results->dotcomma);
992 * If more than 50 lines, or one-tenth, are short,
993 * don't bother reporting them.
995 warnings.shortline=1;
996 if (results->shortline>50 || results->shortline*10>linecnt)
998 warnings.shortline=0;
999 g_print(" --> %ld lines in this file are short. "
1000 "Not reporting short lines.\n",results->shortline);
1003 * If more than 50 lines, or one-tenth, are long,
1004 * don't bother reporting them.
1006 warnings.longline=1;
1007 if (results->longline>50 || results->longline*10>linecnt)
1009 warnings.longline=0;
1010 g_print(" --> %ld lines in this file are long. "
1011 "Not reporting long lines.\n",results->longline);
1013 /* If more than 10 lines contain asterisks, don't bother reporting them. */
1015 if (results->astline>10)
1018 g_print(" --> %ld lines in this file contain asterisks. "
1019 "Not reporting them.\n",results->astline);
1022 * If more than 10 lines contain forward slashes,
1023 * don't bother reporting them.
1026 if (results->fslashline>10)
1029 g_print(" --> %ld lines in this file contain forward slashes. "
1030 "Not reporting them.\n",results->fslashline);
1033 * If more than 20 lines contain unpunctuated endquotes,
1034 * don't bother reporting them.
1036 warnings.endquote=1;
1037 if (results->endquote_count>20)
1039 warnings.endquote=0;
1040 g_print(" --> %ld lines in this file contain unpunctuated endquotes. "
1041 "Not reporting them.\n",results->endquote_count);
1044 * If more than 15 lines contain standalone digits,
1045 * don't bother reporting them.
1048 if (results->standalone_digit>10)
1051 g_print(" --> %ld lines in this file contain standalone 0s and 1s. "
1052 "Not reporting them.\n",results->standalone_digit);
1055 * If more than 20 lines contain hyphens at end,
1056 * don't bother reporting them.
1059 if (results->hyphens>20)
1062 g_print(" --> %ld lines in this file have hyphens at end. "
1063 "Not reporting them.\n",results->hyphens);
1065 if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
1067 g_print(" --> Looks like this is HTML. Switching HTML mode ON.\n");
1068 pswit[MARKUP_SWITCH]=1;
1070 if (results->verylongline>0)
1071 g_print(" --> %ld lines in this file are VERY long!\n",
1072 results->verylongline);
1074 * If there are more non-PG spaced dashes than PG em-dashes,
1075 * assume it's deliberate.
1076 * Current PG guidelines say don't use them, but older texts do,
1077 * and some people insist on them whatever the guidelines say.
1080 if (results->spacedash+results->emdash.non_PG_space>
1081 results->emdash.PG_space)
1084 g_print(" --> There are %ld spaced dashes and em-dashes. "
1085 "Not reporting them.\n",
1086 results->spacedash+results->emdash.non_PG_space);
1092 /* Charset ISO_8859-1/ASCII checks for compatibility with gutcheck */
1094 /* If more than a quarter of characters are hi-bit, bug out. */
1095 if (results->binlen*4>results->totlen)
1097 g_print(" --> This file does not appear to be ASCII. "
1098 "Terminating. Best of luck with it!\n");
1101 if (results->alphalen*4<results->totlen)
1103 g_print(" --> This file does not appear to be text. "
1104 "Terminating. Best of luck with it!\n");
1107 if (results->binlen*100>results->totlen || results->binlen>100)
1109 g_print(" --> There are a lot of foreign letters here. "
1110 "Not reporting them.\n");
1111 if (!pswit[VERBOSE_SWITCH])
1115 warnings.isDutch=FALSE;
1116 if (results->Dutchcount>50)
1118 warnings.isDutch=TRUE;
1119 g_print(" --> This looks like Dutch - "
1120 "switching off dashes and warnings for 's Middags case.\n");
1122 warnings.isFrench=FALSE;
1123 if (results->Frenchcount>50)
1125 warnings.isFrench=TRUE;
1126 g_print(" --> This looks like French - "
1127 "switching off some doublepunct.\n");
1129 if (results->firstline && results->footerline)
1130 g_print(" The PG header and footer appear to be already on.\n");
1133 if (results->firstline)
1134 g_print(" The PG header is on - no footer.\n");
1135 if (results->footerline)
1136 g_print(" The PG footer is on - no header.\n");
1139 if (pswit[VERBOSE_SWITCH])
1141 warnings.shortline=1;
1142 warnings.dotcomma=1;
1143 warnings.longline=1;
1149 warnings.endquote=1;
1150 g_print(" *** Verbose output is ON -- you asked for it! ***\n");
1152 if (warnings.isDutch)
1154 if (results->footerline>0 && results->firstline>0 &&
1155 results->footerline>results->firstline &&
1156 results->footerline-results->firstline<100)
1158 g_print(" --> I don't really know where this text starts. \n");
1159 g_print(" There are no reference points.\n");
1160 g_print(" I'm going to have to report the header and footer "
1162 results->firstline=0;
1170 * Look along the line, accumulate the count of quotes, and see
1171 * if this is an empty line - i.e. a line with nothing on it
1173 * If line has just spaces, period, * and/or - on it, don't
1174 * count it, since empty lines with asterisks or dashes to
1175 * separate sections are common.
1177 * Returns: TRUE if the line is empty.
/*
 * Parameters:
 *   aline    - the line to scan; characters are read with
 *              g_utf8_get_char() and stepped with g_utf8_next_char().
 *   counters - running tallies updated here: quotes (through
 *              count_quote()), c_unders for underscores, and the
 *              bracket/illustration counters (through
 *              increment_matching()).
 *
 * Single quotes are only classified when pswit[SQUOTE_SWITCH] is set.
 * Any error handed back by count_quote() in tmp_err is printed with its
 * line and column and then cleared with g_clear_error().
 */
1179 gboolean analyse_quotes(const char *aline,struct counters *counters)
1182 /* assume the line is empty until proven otherwise */
1183 gboolean isemptyline=TRUE;
1184 const char *s=aline,*sprev,*snext;
1187 GError *tmp_err=NULL;
1190 snext=g_utf8_next_char(s);
1191 c=g_utf8_get_char(s);
1192 if (CHAR_IS_DQUOTE(c))
1193 (void)count_quote(counters,c,QUOTE_CLASS(c),&tmp_err);
1194 else if (CHAR_IS_SQUOTE(c) && pswit[SQUOTE_SWITCH])
1199 * At start of line, it can only be a quotation mark.
1200 * Hardcode a very common exception!
1202 if (!g_str_has_prefix(snext,"tis") &&
1203 !g_str_has_prefix(snext,"Tis"))
1204 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
1206 else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
1207 g_unichar_isalpha(g_utf8_get_char(snext)))
1208 /* Do nothing! it's definitely an apostrophe, not a quote */
1210 /* it's outside a word - let's check it out */
1211 else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||
1212 g_unichar_isalpha(g_utf8_get_char(snext)))
1214 /* certainly looks like a quotation mark */
1215 if (!g_str_has_prefix(snext,"tis") &&
1216 !g_str_has_prefix(snext,"Tis"))
1217 /* hardcode a very common exception! */
/*
 * Use g_utf8_strchr() rather than strchr() here: strchr() converts its
 * second argument to char, so a code point above 255 could be truncated
 * into one of the punctuation bytes (or into the terminating NUL) and
 * match by accident.
 */
1219 if (g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(sprev)))
1220 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
1222 (void)count_quote(counters,c,OPENING_QUOTE,&tmp_err);
1227 /* now - is it a quotation mark? */
1228 guessquote=0; /* accumulate clues */
1229 if (g_unichar_isalpha(g_utf8_get_char(sprev)))
1231 /* it follows a letter - could be either */
1233 if (g_utf8_get_char(sprev)=='s')
1235 /* looks like a plural apostrophe */
1237 if (g_utf8_get_char(snext)==CHAR_SPACE)
1241 if (innermost_quote_matches(counters,c))
1243 * Give it the benefit of some doubt,
1244 * if a squote is already open.
1250 (void)count_quote(counters,c,CLOSING_QUOTE,&tmp_err);
1253 /* no adjacent letter - it must be a quote of some kind */
1254 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
/* Report whatever count_quote() complained about, then reset the error. */
1259 if (pswit[ECHO_SWITCH])
1260 g_print("\n%s\n",aline);
1261 if (!pswit[OVERVIEW_SWITCH])
1262 g_print(" Line %ld column %ld - %s\n",
1263 linecnt,g_utf8_pointer_to_offset(aline,s)+1,tmp_err->message);
1264 g_clear_error(&tmp_err);
1266 if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
1268 isemptyline=FALSE; /* ignore lines like * * * as spacers */
1269 if (c==CHAR_UNDERSCORE)
1270 counters->c_unders++;
/*
 * Brackets: a "[Illustration:" prefix at the very start of the line is
 * tracked under COUNTER_ILLUSTRATION, and a ']' that ends the line can
 * close it; every other bracket is tallied under its own character.
 */
1271 if (c==CHAR_OPEN_SBRACK)
1273 if (!matching_difference(counters,COUNTER_ILLUSTRATION) &&
1274 !matching_difference(counters,c) && s==aline &&
1275 g_str_has_prefix(s,"[Illustration:"))
1276 increment_matching(counters,COUNTER_ILLUSTRATION,TRUE);
1278 increment_matching(counters,c,TRUE);
1280 else if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK)
1281 increment_matching(counters,c,TRUE);
1282 if (c==CHAR_CLOSE_SBRACK)
1284 if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) &&
1285 !matching_difference(counters,c) && !*snext)
1286 increment_matching(counters,COUNTER_ILLUSTRATION,FALSE);
1288 increment_matching(counters,c,FALSE);
1290 else if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK)
1291 increment_matching(counters,c,FALSE);
1299 * check_for_control_characters:
1301 * Check for invalid or questionable characters in the line
1302 * Anything above 127 is invalid for plain ASCII, and
1303 * non-printable control characters should also be flagged.
1304 * Tabs should generally not be there.
/*
 * aline: the line to scan (UTF-8).
 *
 * Every character below CHAR_SPACE other than LF, CR and TAB is
 * reported. The line is echoed first when ECHO_SWITCH is set, and the
 * detail message is printed unless OVERVIEW_SWITCH is set.
 */
1306 void check_for_control_characters(const char *aline)
1310 for (s=aline;*s;s=g_utf8_next_char(s))
1312 c=g_utf8_get_char(s);
1313 if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
1315 if (pswit[ECHO_SWITCH])
1316 g_print("\n%s\n",aline);
1317 if (!pswit[OVERVIEW_SWITCH])
1318 g_print(" Line %ld column %ld - Control character %u\n",
/*
 * g_utf8_pointer_to_offset() takes (start, position); with the
 * arguments the other way round the offset comes back negative and the
 * column printed is wrong.
 */
1319 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1327 * check_for_odd_characters:
1329 * Check for binary and other odd characters.
/*
 * Parameters:
 *   aline       - the line to scan (UTF-8).
 *   warnings    - read-only switches: bin gates the non-ASCII report,
 *                 fslash the forward-slash query, ast the asterisk query.
 *   isemptyline - when TRUE the asterisk query is suppressed.
 *
 * Each category is guarded by its own e* flag (see the declarations
 * below). When the global charset is set, characters are additionally
 * checked against it: with no converter available (charset_validator
 * is (GIConv)-1) unassigned and Private Use code points are reported;
 * otherwise the character is passed through g_convert_with_iconv() and
 * a "Non-<charset>" message may be printed.
 */
1331 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
1332 gboolean isemptyline)
1334 /* Don't repeat multiple warnings on one line. */
1335 gboolean eInvalidChar=FALSE,eTab=FALSE,eTilde=FALSE;
1336 gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
1341 for (s=aline;*s;s=g_utf8_next_char(s))
1343 c=g_utf8_get_char(s);
1344 if (warnings->bin && !eInvalidChar &&
1345 (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
1347 if (pswit[ECHO_SWITCH])
1348 g_print("\n%s\n",aline);
1349 if (!pswit[OVERVIEW_SWITCH])
/* 128-159 and anything above 255 falls outside ISO-8859 as well as ASCII. */
1350 if (c>127 && c<160 || c>255)
1351 g_print(" Line %ld column %ld - "
1352 "Non-ISO-8859 character %u\n",
1353 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1355 g_print(" Line %ld column %ld - "
1356 "Non-ASCII character %u\n",
1357 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1362 if (!eInvalidChar && charset)
1364 if (charset_validator==(GIConv)-1)
1366 if (!g_unichar_isdefined(c))
1368 if (pswit[ECHO_SWITCH])
1369 g_print("\n%s\n",aline);
1370 if (!pswit[OVERVIEW_SWITCH])
1371 g_print(" Line %ld column %ld - Unassigned UNICODE "
1372 "code point U+%04" G_GINT32_MODIFIER "X\n",
1373 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
/*
 * Private Use ranges: U+E000..U+F8FF in the BMP, U+F0000..U+FFFFD
 * (plane 15) and U+100000..U+10FFFD (plane 16). The plane 16 lower
 * bound must be hexadecimal; a decimal 100000 would begin the range at
 * U+186A0 and sweep in ordinary assigned characters.
 */
1378 else if (c>=0xE000 && c<=0xF8FF || c>=0xF0000 && c<=0xFFFFD ||
1379 c>=0x100000 && c<=0x10FFFD)
1381 if (pswit[ECHO_SWITCH])
1382 g_print("\n%s\n",aline);
1383 if (!pswit[OVERVIEW_SWITCH])
1384 g_print(" Line %ld column %ld - Private Use "
1385 "character U+%04" G_GINT32_MODIFIER "X\n",
1386 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
/* Convert just this one character (its byte length) with the validator. */
1394 t=g_convert_with_iconv(s,g_utf8_next_char(s)-s,
1395 charset_validator,NULL,&nb,NULL);
1400 if (pswit[ECHO_SWITCH])
1401 g_print("\n%s\n",aline);
1402 if (!pswit[OVERVIEW_SWITCH])
1403 g_print(" Line %ld column %ld - Non-%s "
1404 "character %u\n",linecnt,
1405 g_utf8_pointer_to_offset(aline,s)+1,charset,c);
1412 if (!eTab && c==CHAR_TAB)
1414 if (pswit[ECHO_SWITCH])
1415 g_print("\n%s\n",aline);
1416 if (!pswit[OVERVIEW_SWITCH])
1417 g_print(" Line %ld column %ld - Tab character?\n",
1418 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1423 if (!eTilde && c==CHAR_TILDE)
1426 * Often used by OCR software to indicate an
1427 * unrecognizable character.
1429 if (pswit[ECHO_SWITCH])
1430 g_print("\n%s\n",aline);
1431 if (!pswit[OVERVIEW_SWITCH])
1432 g_print(" Line %ld column %ld - Tilde character?\n",
1433 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1438 if (!eCarat && c==CHAR_CARAT)
1440 if (pswit[ECHO_SWITCH])
1441 g_print("\n%s\n",aline);
1442 if (!pswit[OVERVIEW_SWITCH])
1443 g_print(" Line %ld column %ld - Carat character?\n",
1444 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1449 if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
1451 if (pswit[ECHO_SWITCH])
1452 g_print("\n%s\n",aline);
1453 if (!pswit[OVERVIEW_SWITCH])
1454 g_print(" Line %ld column %ld - Forward slash?\n",
1455 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1461 * Report asterisks only in paranoid mode,
1462 * since they're often deliberate.
1464 if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
1467 if (pswit[ECHO_SWITCH])
1468 g_print("\n%s\n",aline);
1469 if (!pswit[OVERVIEW_SWITCH])
1470 g_print(" Line %ld column %ld - Asterisk?\n",
1471 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1480 * check_for_long_line:
1482 * Check for line too long.
/*
 * aline: the line under test (UTF-8).
 *
 * A line is "long" when its character count, as returned by
 * g_utf8_strlen(), exceeds LONGEST_PG_LINE. The line is echoed first
 * when ECHO_SWITCH is set; the detail message is printed unless
 * OVERVIEW_SWITCH is set. The column reported is the line length
 * itself, i.e. the position of the last character.
 */
1484 void check_for_long_line(const char *aline)
1486 if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
1488 if (pswit[ECHO_SWITCH])
1489 g_print("\n%s\n",aline);
1490 if (!pswit[OVERVIEW_SWITCH])
1491 g_print(" Line %ld column %ld - Long line %ld\n",
1492 linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
1499 * check_for_short_line:
1501 * Check for line too short.
1503 * This one is a bit trickier to implement: we don't want to
1504 * flag the last line of a paragraph for being short, so we
1505 * have to wait until we know that our current line is a
1506 * "normal" line, then report the _previous_ line if it was too
1507 * short. We also don't want to report indented lines like
1508 * chapter heads or formatted quotations. We therefore keep
1509 * last->len as the length of the last line examined, and
1510 * last->blen as the length of the last but one, and try to
1511 * suppress unnecessary warnings by checking that both were of
1512 * "normal" length. We keep the first character of the last
1513 * line in last->start, and if it was a space, we assume that
1514 * the formatting is deliberate. I can't figure out a way to
1515 * distinguish something like a quoted verse left-aligned or
1516 * the header or footer of a letter from a paragraph of short
1517 * lines - maybe if I examined the whole paragraph, and if the
1518 * para has less than, say, 8 lines and if all lines are short,
1519 * then just assume it's OK? Need to look at some texts to see
1520 * how often a formula like this would get the right result.
/*
 * Parameters:
 *   aline - the current line; it only has to be longer than one
 *           character for the test to fire.
 *   last  - properties of the preceding lines (len, blen, start), read
 *           but not modified here.
 *
 * The line that gets echoed and reported is the global prevline, with
 * line number linecnt-1 and its own length as the column.
 */
1522 void check_for_short_line(const char *aline,const struct line_properties *last)
/*
 * Fire only when: current line is non-trivial, previous line is
 * non-trivial but shorter than SHORTEST_PG_LINE, the one before that
 * is longer than SHORTEST_PG_LINE, and the previous line did not start
 * with a space.
 */
1524 if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
1525 last->len<SHORTEST_PG_LINE && last->blen>1 &&
1526 last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
1528 if (pswit[ECHO_SWITCH])
1529 g_print("\n%s\n",prevline);
1530 if (!pswit[OVERVIEW_SWITCH])
1531 g_print(" Line %ld column %ld - Short line %ld?\n",
1532 linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
1539 * check_for_starting_punctuation:
1541 * Look for punctuation other than full ellipses at start of line.
/*
 * aline: the line under test (UTF-8).
 *
 * Queries a non-empty line whose first character is one of . ? ! , ; :
 * unless the line begins with the spaced-ellipsis prefix tested below.
 * The column reported is always 1.
 */
1543 void check_for_starting_punctuation(const char *aline)
/* The *aline test keeps the NUL of an empty line away from g_utf8_strchr(). */
1545 if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
1546 !g_str_has_prefix(aline,". . ."))
1548 if (pswit[ECHO_SWITCH])
1549 g_print("\n%s\n",aline);
1550 if (!pswit[OVERVIEW_SWITCH])
1551 g_print(" Line %ld column 1 - Begins with punctuation?\n",
1561 * Find the first em-dash, return a pointer to it and set <next> to the
1562 * character following the dash.
/*
 * Parameters:
 *   s    - string to search.
 *   next - out parameter; receives the position just past the dash.
 *
 * NOTE(review): s1 and s2 are assigned on lines not visible here.
 * Judging by the advances below, s2 presumably marks a dash that is a
 * single character and s1 one that is two characters wide (the ASCII
 * "--" form) - confirm against the search calls.
 */
1564 char *str_emdash(const char *s,const char **next)
/* One character past s2. */
1572 *next=g_utf8_next_char(s2);
/* Two characters past s1. */
1577 *next=g_utf8_next_char(g_utf8_next_char(s1));
1582 *next=g_utf8_next_char(g_utf8_next_char(s1));
1587 *next=g_utf8_next_char(s2);
1593 * check_for_spaced_emdash:
1595 * Check for spaced em-dashes.
1597 * We must check _all_ occurrences of em-dashes on the line
1598 * hence the loop - even if the first dash is OK
1599 * there may be another that's wrong later on.
/*
 * aline: the line under test (UTF-8).
 *
 * Each dash located by str_emdash() is queried when the character
 * before it is a space (and the dash is not at the very start of the
 * line) or the character after it is a space. The column reported is
 * that of the dash.
 */
1601 void check_for_spaced_emdash(const char *aline)
1603 const char *s,*t,*next;
/* t is the dash found; next (set by str_emdash) is where the search resumes. */
1604 for (s=aline;t=str_emdash(s,&next);s=next)
/* t>aline guards g_utf8_prev_char() against stepping before the line. */
1606 if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
1607 g_utf8_get_char(next)==CHAR_SPACE)
1609 if (pswit[ECHO_SWITCH])
1610 g_print("\n%s\n",aline);
1611 if (!pswit[OVERVIEW_SWITCH])
1612 g_print(" Line %ld column %ld - Spaced em-dash?\n",
1613 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1621 * check_for_spaced_dash:
1623 * Check for spaced dashes.
/*
 * aline: the line under test (UTF-8).
 *
 * Only the first match of each pattern is examined (strstr() is called
 * once per pattern), and the dash-then-space pattern is considered only
 * when no space-then-dash pattern exists anywhere on the line.
 */
1625 void check_for_spaced_dash(const char *aline)
/* Space followed by a dash: query it unless a second dash follows. */
1628 if ((s=strstr(aline," -")))
1630 if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
1632 if (pswit[ECHO_SWITCH])
1633 g_print("\n%s\n",aline);
1634 if (!pswit[OVERVIEW_SWITCH])
1635 g_print(" Line %ld column %ld - Spaced dash?\n",
1636 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Dash followed by a space: query it unless another dash precedes it. */
1641 else if ((s=strstr(aline,"- ")))
/* s==aline is tested first so g_utf8_prev_char() never steps before the line. */
1643 if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
1645 if (pswit[ECHO_SWITCH])
1646 g_print("\n%s\n",aline);
1647 if (!pswit[OVERVIEW_SWITCH])
1648 g_print(" Line %ld column %ld - Spaced dash?\n",
1649 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1657 * check_for_unmarked_paragraphs:
1659 * Check for unmarked paragraphs indicated by separate speakers.
1661 * May well be false positive:
1662 * "Bravo!" "Wonderful!" called the crowd.
1663 * but useful all the same.
/*
 * aline: the line under test.
 *
 * Searches for a double quote, whitespace, double quote sequence with
 * strstr() and, when found, reports the column of the first quote.
 *
 * NOTE(review): the two search literals below are identical as written,
 * so the second call cannot find anything the first did not. If the
 * second was meant to match a wider gap (e.g. two spaces), confirm the
 * literal.
 */
1665 void check_for_unmarked_paragraphs(const char *aline)
1668 s=strstr(aline,"\" \"");
1670 s=strstr(aline,"\" \"");
1673 if (pswit[ECHO_SWITCH])
1674 g_print("\n%s\n",aline);
1675 if (!pswit[OVERVIEW_SWITCH])
1676 g_print(" Line %ld column %ld - "
1677 "Query missing paragraph break?\n",
1678 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1685 * check_for_jeebies:
1687 * Check for "to he" and other easy h/b errors.
1689 * This is a very inadequate effort on the h/b problem,
1690 * but the phrase "to he" is always an error, whereas "to
1691 * be" is quite common.
1692 * Similarly, '"Quiet!", be said.' is a non-be error
1693 * "to he" is _not_ always an error!:
1694 * "Where they went to he couldn't say."
1695 * Another false positive:
1696 * What would "Cinderella" be without the . . .
1697 * and another: "If he wants to he can see for himself."
/*
 * aline: the line under test.
 *
 * Three independent groups of literal phrases are searched with
 * strstr(): he/be, had/bad and hut/but. Each group produces its own
 * query, with the column taken from the start of the matched phrase
 * (the leading space or punctuation). The matching is byte-exact, so
 * case and spacing must match the literals.
 *
 * NOTE(review): the last two quote-then-be literals in the first group
 * are identical as written; if one was meant to allow a wider gap,
 * confirm the literal.
 */
1699 void check_for_jeebies(const char *aline)
/* Group 1: "be" where "he" was meant, and "to he". */
1702 s=strstr(aline," be could ");
1704 s=strstr(aline," be would ");
1706 s=strstr(aline," was be ");
1708 s=strstr(aline," be is ");
1710 s=strstr(aline," is be ");
1712 s=strstr(aline,"\", be ");
1714 s=strstr(aline,"\" be ");
1716 s=strstr(aline,"\" be ");
1718 s=strstr(aline," to he ");
1721 if (pswit[ECHO_SWITCH])
1722 g_print("\n%s\n",aline);
1723 if (!pswit[OVERVIEW_SWITCH])
1724 g_print(" Line %ld column %ld - Query he/be error?\n",
1725 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Group 2: "had"/"bad" confusions. */
1729 s=strstr(aline," the had ");
1731 s=strstr(aline," a had ");
1733 s=strstr(aline," they bad ");
1735 s=strstr(aline," she bad ");
1737 s=strstr(aline," he bad ");
1739 s=strstr(aline," you bad ");
1741 s=strstr(aline," i bad ");
1744 if (pswit[ECHO_SWITCH])
1745 g_print("\n%s\n",aline);
1746 if (!pswit[OVERVIEW_SWITCH])
1747 g_print(" Line %ld column %ld - Query had/bad error?\n",
1748 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Group 3: "hut" after a semicolon or comma, where "but" is likely. */
1752 s=strstr(aline,"; hut ");
1754 s=strstr(aline,", hut ");
1757 if (pswit[ECHO_SWITCH])
1758 g_print("\n%s\n",aline);
1759 if (!pswit[OVERVIEW_SWITCH])
1760 g_print(" Line %ld column %ld - Query hut/but error?\n",
1761 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1768 * check_for_mta_from:
1770 * Special case - angled bracket in front of "From" placed there by an
1771 * MTA when sending an e-mail.
/*
 * aline: the line under test.
 *
 * Searches anywhere in the line (not only at its start) for the
 * literal below and reports the column of the angled bracket.
 */
1773 void check_for_mta_from(const char *aline)
1776 s=strstr(aline,">From");
1779 if (pswit[ECHO_SWITCH])
1780 g_print("\n%s\n",aline);
1781 if (!pswit[OVERVIEW_SWITCH])
1782 g_print(" Line %ld column %ld - "
1783 "Query angled bracket with From\n",
1784 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1791 * check_for_orphan_character:
1793 * Check for a single character line -
1794 * often an overflow from bad wrapping.
/*
 * aline: the line under test (UTF-8).
 *
 * A line qualifies when it holds exactly one character: the first
 * character is non-NUL and nothing follows it. The letters I, V, X, L
 * and any digit are let through; anything else is queried at column 1.
 */
1796 void check_for_orphan_character(const char *aline)
1799 c=g_utf8_get_char(aline);
/* c non-zero and the next character NUL means a one-character line. */
1800 if (c && !*g_utf8_next_char(aline))
1802 if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
1803 ; /* Nothing - ignore numerals alone on a line. */
1806 if (pswit[ECHO_SWITCH])
1807 g_print("\n%s\n",aline);
1808 if (!pswit[OVERVIEW_SWITCH])
1809 g_print(" Line %ld column 1 - Query single character line\n",
1818 * check_for_pling_scanno:
1820 * Check for I" - often should be !
/*
 * aline: the line under test.
 *
 * Searches for a space, capital I, double quote sequence and queries
 * the first occurrence.
 *
 * NOTE(review): the column here is the raw zero-based offset of the
 * matched space, whereas the other checks in this file add 1 to the
 * offset. Confirm whether the missing +1 is intentional before
 * changing it, since it alters the reported column.
 */
1822 void check_for_pling_scanno(const char *aline)
1825 s=strstr(aline," I\"");
1828 if (pswit[ECHO_SWITCH])
1829 g_print("\n%s\n",aline);
1830 if (!pswit[OVERVIEW_SWITCH])
1831 g_print(" Line %ld column %ld - Query I=exclamation mark?\n",
1832 linecnt,g_utf8_pointer_to_offset(aline,s));
1839 * check_for_extra_period:
1841 * Check for period without a capital letter. Cut-down from gutspell.
1842 * Only works when it happens on a single line.
/*
 * Parameters:
 *   aline    - the line under test (UTF-8).
 *   warnings - read-only; isDutch enables the 's Middags exception.
 *
 * Runs only when PARANOID_SWITCH is set. For each period-space pair on
 * the line the visible steps are:
 *   - skip it when the character before the period is not a letter;
 *   - in Dutch texts, skip the apostrophe + lowercase + space +
 *     uppercase shape that follows the period;
 *   - look ahead to the next letter or digit, and carry on only if it
 *     is lowercase;
 *   - walk back to isolate the word before the period (testword) and
 *     compare it with the abbrev list, numerals, one-character words
 *     and Roman numerals;
 *   - look for a vowel, comparing the first code point of each
 *     character's canonical decomposition so accented vowels count;
 *   - report, remembering the word in qperiod so it is not reported
 *     again unless VERBOSE_SWITCH is set.
 */
1844 void check_for_extra_period(const char *aline,const struct warnings *warnings)
1846 const char *s,*t,*s1,*sprev;
1851 gunichar c,nc,pc,*decomposition;
1852 if (pswit[PARANOID_SWITCH])
1854 for (t=aline;t=strstr(t,". ");)
1858 t=g_utf8_next_char(t);
1859 /* start of line punctuation is handled elsewhere */
1862 if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
1864 t=g_utf8_next_char(t);
1867 if (warnings->isDutch)
1869 /* For Frank & Jeroen -- 's Middags case */
1870 gunichar c2,c3,c4,c5;
/* Characters 2..5 after the period: apostrophe, letter, space, capital. */
1871 c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
1872 c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
1873 c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
1874 c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
1875 if (CHAR_IS_APOSTROPHE(c2) &&
1876 g_unichar_islower(c3) && c4==CHAR_SPACE &&
1877 g_unichar_isupper(c5))
1879 t=g_utf8_next_char(t);
/* Skip past the period and space, then any further non-alphanumerics. */
1883 s1=g_utf8_next_char(g_utf8_next_char(t));
1884 while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
1885 !g_unichar_isdigit(g_utf8_get_char(s1)))
1886 s1=g_utf8_next_char(s1);
1887 if (g_unichar_islower(g_utf8_get_char(s1)))
1889 /* we have something to investigate */
1891 /* so let's go back and find out */
1892 nc=g_utf8_get_char(t);
1893 s1=g_utf8_prev_char(t);
1894 c=g_utf8_get_char(s1);
1895 sprev=g_utf8_prev_char(s1);
1896 pc=g_utf8_get_char(sprev);
/* Word characters: letters, digits, or an apostrophe between two letters. */
1898 (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||
1899 g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&
1900 g_unichar_isalpha(nc)))
1905 sprev=g_utf8_prev_char(s1);
1906 pc=g_utf8_get_char(sprev);
1908 s1=g_utf8_next_char(s1);
1911 testword=g_strndup(s1,s-s1);
1913 testword=g_strdup(s1);
1914 for (i=0;*abbrev[i];i++)
1915 if (!strcmp(testword,abbrev[i]))
1917 if (g_unichar_isdigit(g_utf8_get_char(testword)))
1919 if (!*g_utf8_next_char(testword))
1921 if (isroman(testword))
1926 for (s=testword;*s;s=g_utf8_next_char(s))
1928 decomposition=g_unicode_canonical_decomposition(
1929 g_utf8_get_char(s),&len);
1930 if (g_utf8_strchr("aeiou",-1,decomposition[0]))
1932 g_free(decomposition);
1936 (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
/* The tree takes its own copy of the word as the key. */
1938 g_tree_insert(qperiod,g_strdup(testword),
1939 GINT_TO_POINTER(1));
1940 if (pswit[ECHO_SWITCH])
1941 g_print("\n%s\n",aline);
1942 if (!pswit[OVERVIEW_SWITCH])
1943 g_print(" Line %ld column %ld - Extra period?\n",
1944 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1950 t=g_utf8_next_char(t);
1956 * check_for_following_punctuation:
1958 * Check for words usually not followed by punctuation.
/*
 * aline: the line under test (UTF-8).
 *
 * Runs only when TYPO_SWITCH is set. Each word is lowercased with
 * g_utf8_strdown() and compared with two word lists:
 *   nocomma  - queried when the character at s is , ; or :
 *   noperiod - queried when the character at s is . or !
 * The column reported is that of the character at s.
 * NOTE(review): how s and wordstart are positioned is set on lines not
 * visible here; s presumably points just past the word - confirm.
 */
1960 void check_for_following_punctuation(const char *aline)
1963 const char *s,*wordstart;
1966 if (pswit[TYPO_SWITCH])
1977 inword=g_utf8_strdown(t,-1);
1979 for (i=0;*nocomma[i];i++)
1980 if (!strcmp(inword,nocomma[i]))
1982 c=g_utf8_get_char(s);
1983 if (c==',' || c==';' || c==':')
1985 if (pswit[ECHO_SWITCH])
1986 g_print("\n%s\n",aline);
1987 if (!pswit[OVERVIEW_SWITCH])
1988 g_print(" Line %ld column %ld - "
1989 "Query punctuation after %s?\n",
1990 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
1996 for (i=0;*noperiod[i];i++)
1997 if (!strcmp(inword,noperiod[i]))
1999 c=g_utf8_get_char(s);
2000 if (c=='.' || c=='!')
2002 if (pswit[ECHO_SWITCH])
2003 g_print("\n%s\n",aline);
2004 if (!pswit[OVERVIEW_SWITCH])
2005 g_print(" Line %ld column %ld - "
2006 "Query punctuation after %s?\n",
2007 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
2021 * Check for commonly mistyped words,
2022 * and digits like 0 for O in a word.
/*
 * Parameters:
 *   aline    - the line under test (UTF-8); words are pulled from it
 *              with getaword().
 *   warnings - the digit member gates the paranoid standalone 0/1 query.
 *
 * Visible checks, per word:
 *   - mixdigit(): a digit mixed into a word ("Query digit in ...");
 *   - with TYPO_SWITCH or USERTYPO_SWITCH: an uppercase letter in
 *     mid-word, excepting Mc/Mac prefixes and a preceding apostrophe;
 *   - with TYPO_SWITCH, on the case-folded word: unlikely word starts
 *     and ends (nostart, noend), unlikely letter clusters, words with
 *     no vowel or no consonant, the okword and Roman-numeral
 *     exclusions, the typo list, and certain one-letter words.
 *     Reports go through qword so that duplicates are suppressed
 *     unless VERBOSE_SWITCH is set;
 *   - the user's own list (usertypo), when present;
 *   - in paranoid mode with warnings->digit: a standalone 0 or 1.
 *
 * NOTE(review): the "Query digit" and "Query word" messages report
 * wordstart's offset +1, but "Query possible scanno" and "Query
 * standalone" report +2 - confirm which is intended.
 */
2024 void check_for_typos(const char *aline,struct warnings *warnings)
2026 const char *s,*t,*nt,*wordstart;
2028 gunichar *decomposition;
2030 int i,vowel,consonant,*dupcnt;
2031 gboolean isdup,istypo,alower;
2034 gsize decomposition_len;
2038 inword=getaword(&s);
2042 continue; /* don't bother with empty lines */
2044 if (mixdigit(inword))
2046 if (pswit[ECHO_SWITCH])
2047 g_print("\n%s\n",aline);
2048 if (!pswit[OVERVIEW_SWITCH])
2049 g_print(" Line %ld column %ld - Query digit in %s\n",
2050 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
2055 * Put the word through a series of tests for likely typos and OCR
2058 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
2062 for (t=inword;*t;t=g_utf8_next_char(t))
2064 c=g_utf8_get_char(t);
2065 nt=g_utf8_next_char(t);
2066 /* lowercase for testing */
2067 if (g_unichar_islower(c))
2069 if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
2072 * We have an uppercase mid-word. However, there are
2074 * Mac and Mc like McGill
2075 * French contractions like l'Abbe
2077 offset=g_utf8_pointer_to_offset(inword,t);
2079 pc=g_utf8_get_char(g_utf8_prev_char(t));
2082 if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
2083 offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
2084 g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
2085 CHAR_IS_APOSTROPHE(pc))
/* All list comparisons below use the case-folded copy of the word. */
2091 testword=g_utf8_casefold(inword,-1);
2093 if (pswit[TYPO_SWITCH])
2096 * Check for certain unlikely two-letter combinations at word
2099 len=g_utf8_strlen(testword,-1);
2102 for (i=0;*nostart[i];i++)
2103 if (g_str_has_prefix(testword,nostart[i]))
2105 for (i=0;*noend[i];i++)
2106 if (g_str_has_suffix(testword,noend[i]))
2109 /* ght is common, gbt never. Like that. */
2110 if (strstr(testword,"cb"))
2112 if (strstr(testword,"gbt"))
2114 if (strstr(testword,"pbt"))
2116 if (strstr(testword,"tbs"))
2118 if (strstr(testword,"mrn"))
2120 if (strstr(testword,"ahle"))
2122 if (strstr(testword,"ihle"))
2125 * "TBE" does happen - like HEARTBEAT - but uncommon.
2126 * Also "TBI" - frostbite, outbid - but uncommon.
2127 * Similarly "ii" like Hawaii, or Pompeii, and in Roman
2128 * numerals, but "ii" is a common scanno.
2130 if (strstr(testword,"tbi"))
2132 if (strstr(testword,"tbe"))
2134 if (strstr(testword,"ii"))
2137 * Check for no vowels or no consonants.
2138 * If none, flag a typo.
2140 if (!istypo && len>1)
2143 for (t=testword;*t;t=g_utf8_next_char(t))
2145 c=g_utf8_get_char(t);
/* Decompose so an accented vowel is recognised by its base letter. */
2147 g_unicode_canonical_decomposition(c,&decomposition_len);
2148 if (c=='y' || g_unichar_isdigit(c))
2150 /* Yah, this is loose. */
2154 else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
2158 g_free(decomposition);
2160 if (!vowel || !consonant)
2164 * Now exclude the word from being reported if it's in
2167 for (i=0;*okword[i];i++)
2168 if (!strcmp(testword,okword[i]))
2171 * What looks like a typo may be a Roman numeral.
2174 if (istypo && isroman(testword))
2176 /* Check the manual list of typos. */
2178 for (i=0;*typo[i];i++)
2179 if (!strcmp(testword,typo[i]))
2182 * Check lowercase s, l, i and m - special cases.
2183 * "j" - often a semi-colon gone wrong.
2184 * "d" for a missing apostrophe - he d
2187 if (!istypo && len==1 &&
2188 g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
/* qword maps each reported word to a heap-allocated duplicate count. */
2192 dupcnt=g_tree_lookup(qword,testword);
2196 isdup=!pswit[VERBOSE_SWITCH];
2200 dupcnt=g_new0(int,1);
2201 g_tree_insert(qword,g_strdup(testword),dupcnt);
2206 if (pswit[ECHO_SWITCH])
2207 g_print("\n%s\n",aline);
2208 if (!pswit[OVERVIEW_SWITCH])
2210 g_print(" Line %ld column %ld - Query word %s",
2211 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
2213 if (!pswit[VERBOSE_SWITCH])
2214 g_print(" - not reporting duplicates");
2222 /* check the user's list of typos */
2223 if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
2225 if (pswit[ECHO_SWITCH])
2226 g_print("\n%s\n",aline);
2227 if (!pswit[OVERVIEW_SWITCH])
2228 g_print(" Line %ld column %ld - Query possible scanno %s\n",
2229 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
2231 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
2233 if (pswit[PARANOID_SWITCH] && warnings->digit)
2235 /* In paranoid mode, query all 0 and 1 standing alone. */
2236 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
2238 if (pswit[ECHO_SWITCH])
2239 g_print("\n%s\n",aline);
2240 if (!pswit[OVERVIEW_SWITCH])
2241 g_print(" Line %ld column %ld - Query standalone %s\n",
2242 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
2253 * check_for_misspaced_punctuation:
2255 * Look for added or missing spaces around punctuation and quotes.
2256 * If there is a punctuation character like ! with no space on
2257 * either side, suspect a missing!space. If there are spaces on
2258 * both sides , assume a typo. If we see a double quote with no
2259 * space or punctuation on either side of it, assume unspaced
2260 * quotes "like"this.
/*
 * Parameters:
 *   aline       - the line under test (UTF-8).
 *   parities    - open/closed state of double and single quotes;
 *                 dquote and squote are toggled here, so the state
 *                 is carried from one call to the next.
 *   isemptyline - when TRUE the spaced-punctuation queries are skipped.
 *
 * The line is scanned several times, each pass starting afresh:
 *   1. punctuation with a missing space, or with space on both sides
 *      (acronyms and ellipses are let through);
 *   2. ? ! , ; : preceded by a space but not followed by one;
 *   3. a period after a space and directly before a letter;
 *   4. a double quote with no space or punctuation on either side;
 *   5. double-quote parity, judging each quote by what follows it;
 *   6. a double quote in column 1 followed by closing punctuation;
 *   7. with SQUOTE_SWITCH, the same parity check for single quotes.
 * The passes that begin at the second character track pc, c and nc
 * (previous, current and next character).
 */
2262 void check_for_misspaced_punctuation(const char *aline,
2263 struct parities *parities,gboolean isemptyline)
2265 gboolean isacro,isellipsis;
2267 gunichar c,nc,pc,n2c;
/* Pass 1: missing or surplus spaces around . ? ! , ; : _ */
2269 c=g_utf8_get_char(aline);
2270 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2271 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2275 nc=g_utf8_get_char(g_utf8_next_char(s));
2276 /* For each character in the line after the first. */
2277 if (g_utf8_strchr(".?!,;:_",-1,c)) /* if it's punctuation */
2279 /* we need to suppress warnings for acronyms like M.D. */
2281 /* we need to suppress warnings for ellipsis . . . */
2284 * If there are letters on both sides of it or
2285 * if it's strict punctuation followed by an alpha.
2287 if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
2288 g_utf8_strchr("?!,;:",-1,c)))
/* A period two characters back suggests an acronym such as M.D. */
2292 if (g_utf8_pointer_to_offset(aline,s)>2 &&
2293 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
2295 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
2301 if (pswit[ECHO_SWITCH])
2302 g_print("\n%s\n",aline);
2303 if (!pswit[OVERVIEW_SWITCH])
2304 g_print(" Line %ld column %ld - Missing space?\n",
2305 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2310 if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
2313 * If there are spaces on both sides,
2314 * or space before and end of line.
2318 if (g_utf8_pointer_to_offset(aline,s)>2 &&
2319 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
2321 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
2325 if (!isemptyline && !isellipsis)
2327 if (pswit[ECHO_SWITCH])
2328 g_print("\n%s\n",aline);
2329 if (!pswit[OVERVIEW_SWITCH])
2330 g_print(" Line %ld column %ld - "
2331 "Spaced punctuation?\n",linecnt,
2332 g_utf8_pointer_to_offset(aline,s)+1);
/* Pass 2 */
2339 /* Split out the characters that CANNOT be preceded by space. */
2340 c=g_utf8_get_char(aline);
2341 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2342 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2346 nc=g_utf8_get_char(g_utf8_next_char(s));
2347 /* for each character in the line after the first */
2348 if (g_utf8_strchr("?!,;:",-1,c))
2350 /* if it's punctuation that _cannot_ have a space before it */
2351 if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
2354 * If nc DOES == space,
2355 * it was already reported just above.
2357 if (pswit[ECHO_SWITCH])
2358 g_print("\n%s\n",aline);
2359 if (!pswit[OVERVIEW_SWITCH])
2360 g_print(" Line %ld column %ld - Spaced punctuation?\n",
2361 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Pass 3 */
2368 * Special case " .X" where X is any alpha.
2369 * This plugs a hole in the acronym code above.
2370 * Inelegant, but maintainable.
2372 c=g_utf8_get_char(aline);
2373 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2374 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2378 nc=g_utf8_get_char(g_utf8_next_char(s));
2379 /* for each character in the line after the first */
2382 /* if it's a period */
2383 if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
2386 * If the period follows a space and
2387 * is followed by a letter.
2389 if (pswit[ECHO_SWITCH])
2390 g_print("\n%s\n",aline);
2391 if (!pswit[OVERVIEW_SWITCH])
2392 g_print(" Line %ld column %ld - Spaced punctuation?\n",
2393 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Pass 4: double quotes jammed between two words. */
2399 c=g_utf8_get_char(aline);
2400 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2401 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2405 nc=g_utf8_get_char(g_utf8_next_char(s));
2406 /* for each character in the line after the first */
2407 if (CHAR_IS_DQUOTE(c))
2409 if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
2410 !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
2411 !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
2413 if (pswit[ECHO_SWITCH])
2414 g_print("\n%s\n",aline);
2415 if (!pswit[OVERVIEW_SWITCH])
2416 g_print(" Line %ld column %ld - Unspaced quotes?\n",
2417 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Pass 5 */
2423 /* Check parity of quotes. */
2424 nc=g_utf8_get_char(aline);
2425 for (s=aline;*s;s=g_utf8_next_char(s))
2428 nc=g_utf8_get_char(g_utf8_next_char(s));
2429 if (CHAR_IS_DQUOTE(c))
/* Each double quote flips the persistent open/closed state. */
2433 parities->dquote=!parities->dquote;
2434 parity=parities->dquote;
2436 else if (c==CHAR_LD_QUOTE)
2443 if (!g_utf8_strchr("_-.'`‘’/,;:!?)]} ",-1,nc))
2445 if (pswit[ECHO_SWITCH])
2446 g_print("\n%s\n",aline);
2447 if (!pswit[OVERVIEW_SWITCH])
2448 g_print(" Line %ld column %ld - "
2449 "Wrongspaced quotes?\n",
2450 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2458 if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
2459 !g_utf8_strchr("_-/.'`‘’([{$",-1,nc) || !nc)
2461 if (pswit[ECHO_SWITCH])
2462 g_print("\n%s\n",aline);
2463 if (!pswit[OVERVIEW_SWITCH])
2464 g_print(" Line %ld column %ld - "
2465 "Wrongspaced quotes?\n",
2466 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Pass 6: a quote opening the line but followed by closing punctuation. */
2473 c=g_utf8_get_char(aline);
2474 if (CHAR_IS_DQUOTE(c))
2476 if (g_utf8_strchr(",;:!?)]} ",-1,
2477 g_utf8_get_char(g_utf8_next_char(aline))))
2479 if (pswit[ECHO_SWITCH])
2480 g_print("\n%s\n",aline);
2481 if (!pswit[OVERVIEW_SWITCH])
2482 g_print(" Line %ld column 1 - Wrongspaced quotes?\n",
/* Pass 7: single quotes, only when single-quote checking is enabled. */
2488 if (pswit[SQUOTE_SWITCH])
2490 nc=g_utf8_get_char(aline);
2491 for (s=aline;*s;s=g_utf8_next_char(s))
2494 nc=g_utf8_get_char(g_utf8_next_char(s));
/* Only a quote with a non-letter on at least one side counts; between two letters it is taken as an apostrophe. */
2495 if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&
2496 !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
2497 !g_unichar_isalpha(nc)))
2499 parities->squote=!parities->squote;
2500 if (!parities->squote)
2503 if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
2505 if (pswit[ECHO_SWITCH])
2506 g_print("\n%s\n",aline);
2507 if (!pswit[OVERVIEW_SWITCH])
2508 g_print(" Line %ld column %ld - "
2509 "Wrongspaced singlequotes?\n",
2510 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2518 if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
2519 !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
2521 if (pswit[ECHO_SWITCH])
2522 g_print("\n%s\n",aline);
2523 if (!pswit[OVERVIEW_SWITCH])
2524 g_print(" Line %ld column %ld - "
2525 "Wrongspaced singlequotes?\n",
2526 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2537 * check_for_double_punctuation:
2539 * Look for double punctuation like ,. or ,,
2540 * Thanks to DW for the suggestion!
2541 * In books with references, ".," and ".;" are common
2542 * e.g. "etc., etc.," and vol. 1.; vol 3.;
2543 * OTOH, from my initial tests, there are also fairly
2544 * common errors. What to do? Make these cases paranoid?
2545 * ".," is the most common, so warnings->dotcomma is used
2546 * to suppress detailed reporting if it occurs often.
/*
 * Parameters:
 *   aline    - the line under test (UTF-8).
 *   warnings - dotcomma and isFrench are read in the lines visible here.
 *
 * Two adjacent characters from . ? ! , ; : are queried unless:
 *   - both are the same and are . ? or !;
 *   - the pair is a period then a comma and dotcomma is off;
 *   - the text is French and the pair belongs to an ellipsis joined to
 *     another punctuation mark, in either order.
 * The column reported is that of the first character of the pair.
 */
2548 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
2552 nc=g_utf8_get_char(aline);
2553 for (s=aline;*s;s=g_utf8_next_char(s))
2556 nc=g_utf8_get_char(g_utf8_next_char(s));
2557 /* for each punctuation character in the line */
2558 if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
2559 g_utf8_strchr(".?!,;:",-1,nc))
2561 /* followed by punctuation, it's a query, unless . . . */
2562 if (c==nc && (c=='.' || c=='?' || c=='!') ||
2563 !warnings->dotcomma && c=='.' && nc==',' ||
2564 warnings->isFrench && g_str_has_prefix(s,",...") ||
2565 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2566 warnings->isFrench && g_str_has_prefix(s,";...") ||
2567 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2568 warnings->isFrench && g_str_has_prefix(s,":...") ||
2569 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2570 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2571 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2572 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2573 warnings->isFrench && g_str_has_prefix(s,"...?"))
/*
 * The French ellipsis cases are tested a second time on their own.
 * NOTE(review): the statements this guards are mostly not visible
 * here; it presumably advances past the ellipsis - confirm.
 */
2575 if (warnings->isFrench && g_str_has_prefix(s,",...") ||
2576 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2577 warnings->isFrench && g_str_has_prefix(s,";...") ||
2578 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2579 warnings->isFrench && g_str_has_prefix(s,":...") ||
2580 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2581 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2582 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2583 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2584 warnings->isFrench && g_str_has_prefix(s,"...?"))
2587 nc=g_utf8_get_char(g_utf8_next_char(s));
2589 ; /* do nothing for .. !! and ?? which can be legit */
2593 if (pswit[ECHO_SWITCH])
2594 g_print("\n%s\n",aline);
2595 if (!pswit[OVERVIEW_SWITCH])
2596 g_print(" Line %ld column %ld - Double punctuation?\n",
2597 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2606 * check_for_spaced_quotes:
/*
 * Query quote marks that have a space on both sides.
 *
 * aline: the line under test (UTF-8).
 *
 * The double quote is searched for as a literal. For single quotes a
 * three-character pattern (space, quote, space) is built in a GString
 * for each entry of single_quotes in turn. Every occurrence is
 * reported at the column of its leading space, and each search
 * resumes two characters past the previous hit. The GString is freed
 * before returning.
 */
2608 void check_for_spaced_quotes(const char *aline)
2612 const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,
2616 while ((t=strstr(s," \" ")))
2618 if (pswit[ECHO_SWITCH])
2619 g_print("\n%s\n",aline);
2620 if (!pswit[OVERVIEW_SWITCH])
2621 g_print(" Line %ld column %ld - Spaced doublequote?\n",
2622 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
/* Resume at the trailing space so it can open the next match. */
2625 s=g_utf8_next_char(g_utf8_next_char(t));
2627 pattern=g_string_new(NULL);
2628 for(i=0;i<G_N_ELEMENTS(single_quotes);i++)
/* Rebuild the pattern for this quote character. */
2630 g_string_assign(pattern," ");
2631 g_string_append_unichar(pattern,single_quotes[i]);
2632 g_string_append_c(pattern,' ');
2634 while ((t=strstr(s,pattern->str)))
2636 if (pswit[ECHO_SWITCH])
2637 g_print("\n%s\n",aline);
2638 if (!pswit[OVERVIEW_SWITCH])
2639 g_print(" Line %ld column %ld - Spaced singlequote?\n",
2640 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2643 s=g_utf8_next_char(g_utf8_next_char(t));
2646 g_string_free(pattern,TRUE);
2650 * check_for_miscased_genative:
2652 * Check special case of 'S instead of 's at end of word.
2654 void check_for_miscased_genative(const char *aline)
2660 c=g_utf8_get_char(aline);
2661 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2662 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2666 nc=g_utf8_get_char(g_utf8_next_char(s));
2667 if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))
2669 if (pswit[ECHO_SWITCH])
2670 g_print("\n%s\n",aline);
2671 if (!pswit[OVERVIEW_SWITCH])
2672 g_print(" Line %ld column %ld - Capital \"S\"?\n",
2673 linecnt,g_utf8_pointer_to_offset(aline,s)+2);
2681 * check_end_of_line:
2683 * Now check special cases - start and end of line -
2684 * for single and double quotes. Start is sometimes [sic]
2685 * but better to query it anyway.
2686 * While we're here, check for dash at end of line.
2688 void check_end_of_line(const char *aline,struct warnings *warnings)
2693 lbytes=strlen(aline);
2694 if (g_utf8_strlen(aline,lbytes)>1)
2696 s=g_utf8_prev_char(aline+lbytes);
2697 c1=g_utf8_get_char(s);
2698 c2=g_utf8_get_char(g_utf8_prev_char(s));
2699 if ((CHAR_IS_DQUOTE(c1) || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)
2701 if (pswit[ECHO_SWITCH])
2702 g_print("\n%s\n",aline);
2703 if (!pswit[OVERVIEW_SWITCH])
2704 g_print(" Line %ld column %ld - Spaced quote?\n",linecnt,
2705 g_utf8_strlen(aline,lbytes));
2709 c1=g_utf8_get_char(aline);
2710 c2=g_utf8_get_char(g_utf8_next_char(aline));
2711 if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)
2713 if (pswit[ECHO_SWITCH])
2714 g_print("\n%s\n",aline);
2715 if (!pswit[OVERVIEW_SWITCH])
2716 g_print(" Line %ld column 1 - Spaced quote?\n",linecnt);
2721 * Dash at end of line may well be legit - paranoid mode only
2722 * and don't report em-dash at line-end.
2724 if (pswit[PARANOID_SWITCH] && warnings->hyphen)
2726 for (s=g_utf8_prev_char(aline+lbytes);
2727 s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
2729 if (g_utf8_get_char(s)=='-' &&
2730 g_utf8_get_char(g_utf8_prev_char(s))!='-')
2732 if (pswit[ECHO_SWITCH])
2733 g_print("\n%s\n",aline);
2734 if (!pswit[OVERVIEW_SWITCH])
2735 g_print(" Line %ld column %ld - "
2736 "Hyphen at end of line?\n",
2737 linecnt,g_utf8_pointer_to_offset(aline,s));
2744 * check_for_unspaced_bracket:
2746 * Brackets are often unspaced, but shouldn't be surrounded by alpha.
2747 * If so, suspect a scanno like "a]most".
2749 void check_for_unspaced_bracket(const char *aline)
2753 c=g_utf8_get_char(aline);
2754 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2755 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2759 nc=g_utf8_get_char(g_utf8_next_char(s));
2762 /* for each bracket character in the line except 1st & last */
2763 if (g_utf8_strchr("{[()]}",-1,c) &&
2764 g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
2766 if (pswit[ECHO_SWITCH])
2767 g_print("\n%s\n",aline);
2768 if (!pswit[OVERVIEW_SWITCH])
2769 g_print(" Line %ld column %ld - Unspaced bracket?\n",
2770 linecnt,g_utf8_pointer_to_offset(aline,s));
2778 * check_for_unpunctuated_endquote:
2780 void check_for_unpunctuated_endquote(const char *aline)
2785 c=g_utf8_get_char(aline);
2786 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2787 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2791 qc=CHAR_IS_DQUOTE(c)?QUOTE_CLASS(c):INVALID_QUOTE;
2792 nc=g_utf8_get_char(g_utf8_next_char(s));
2793 /* for each character in the line except 1st */
2794 if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) && g_unichar_isalpha(pc))
2796 if (pswit[ECHO_SWITCH])
2797 g_print("\n%s\n",aline);
2798 if (!pswit[OVERVIEW_SWITCH])
2799 g_print(" Line %ld column %ld - "
2800 "endquote missing punctuation?\n",
2801 linecnt,g_utf8_pointer_to_offset(aline,s));
2809 * check_for_html_tag:
2811 * Check for <HTML TAG>.
2813 * If there is a < in the line, followed at some point
2814 * by a > then we suspect HTML.
2816 void check_for_html_tag(const char *aline)
2818 const char *open,*close;
2820 open=strchr(aline,'<');
2823 close=strchr(g_utf8_next_char(open),'>');
2826 if (pswit[ECHO_SWITCH])
2827 g_print("\n%s\n",aline);
2828 if (!pswit[OVERVIEW_SWITCH])
2830 tag=g_strndup(open,close-open+1);
2831 g_print(" Line %ld column %ld - HTML Tag? %s \n",
2832 linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
2842 * check_for_html_entity:
2844 * Check for &symbol; HTML.
2846 * If there is a & in the line, followed at
2847 * some point by a ; then we suspect HTML.
2849 void check_for_html_entity(const char *aline)
2851 const char *s,*amp,*scolon;
2853 amp=strchr(aline,'&');
2856 scolon=strchr(amp,';');
2859 for (s=amp;s<scolon;s=g_utf8_next_char(s))
2860 if (g_utf8_get_char(s)==CHAR_SPACE)
2861 break; /* Don't report "Jones & Son;" */
2864 if (pswit[ECHO_SWITCH])
2865 g_print("\n%s\n",aline);
2866 if (!pswit[OVERVIEW_SWITCH])
2868 entity=g_strndup(amp,scolon-amp+1);
2869 g_print(" Line %ld column %d - HTML symbol? %s \n",
2870 linecnt,(int)(amp-aline)+1,entity);
2881 * check_for_omitted_punctuation:
2883 * Check for omitted punctuation at end of paragraph by working back
2884 * through prevline. DW.
2885 * Need to check this only for "normal" paras.
2886 * So what is a "normal" para?
2887 * Not normal if one-liner (chapter headings, etc.)
2888 * Not normal if doesn't contain at least one locase letter
2889 * Not normal if starts with space
2891 void check_for_omitted_punctuation(const char *prevline,
2892 struct line_properties *last,int start_para_line)
2894 gboolean letter_on_line=FALSE;
2897 gboolean closing_quote;
2898 for (s=prevline;*s;s=g_utf8_next_char(s))
2899 if (g_unichar_isalpha(g_utf8_get_char(s)))
2901 letter_on_line=TRUE;
2905 * This next "if" is a problem.
2906 * If we say "start_para_line <= linecnt - 1", that includes
2907 * one-line "paragraphs" like chapter heads. Lotsa false positives.
2908 * If we say "start_para_line < linecnt - 1" it doesn't, but then it
2909 * misses genuine one-line paragraphs.
2911 if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
2912 g_utf8_get_char(prevline)>CHAR_SPACE)
2914 s=prevline+strlen(prevline);
2917 s=g_utf8_prev_char(s);
2918 c=g_utf8_get_char(s);
2919 if (QUOTE_CLASS(c)==CLOSING_QUOTE || QUOTE_CLASS(c)==NEUTRAL_QUOTE)
2922 closing_quote=FALSE;
2923 } while (closing_quote && s>prevline);
2924 for (;s>prevline;s=g_utf8_prev_char(s))
2926 if (g_unichar_isalpha(g_utf8_get_char(s)))
2928 if (pswit[ECHO_SWITCH])
2929 g_print("\n%s\n",prevline);
2930 if (!pswit[OVERVIEW_SWITCH])
2931 g_print(" Line %ld column %ld - "
2932 "No punctuation at para end?\n",
2933 linecnt-1,g_utf8_strlen(prevline,-1));
2938 if (g_utf8_strchr("—-.:!([{?}])",-1,g_utf8_get_char(s)))
2944 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
2946 const char *word=key;
2949 g_print("\nNote: Queried word %s was duplicated %d times\n",
2954 void print_as_windows_1252(const char *string)
2956 gsize inbytes,outbytes;
2958 static GIConv converter=(GIConv)-1;
2961 if (converter!=(GIConv)-1)
2962 g_iconv_close(converter);
2963 converter=(GIConv)-1;
2966 if (converter==(GIConv)-1)
2967 converter=g_iconv_open("WINDOWS-1252","UTF-8");
2968 if (converter!=(GIConv)-1)
2970 inbytes=outbytes=strlen(string);
2971 bp=buf=g_malloc(outbytes+1);
2972 g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
2978 fputs(string,stdout);
/* Write string to stdout exactly as given, with no conversion. */
void print_as_utf_8(const char *string)
{
    (void)fputs(string,stdout);
}
/*
 * procfile: check one file. The text is loaded with read_etext(), a first
 * pass decides which warnings apply, and then every line returned by
 * flgets() is put through the individual check_for_*() routines.
 * Queries go out through g_print().
 */
2991 void procfile(const char *filename)
2994 gchar *parastart=NULL; /* first line of current para */
2995 gchar *etext,*aline;
2998 struct first_pass_results *first_pass_results;
2999 struct warnings *warnings;
3000 struct counters counters={0};
3001 struct line_properties last={0};
3002 struct parities parities={0};
3003 struct pending pending={0};
3004 gboolean isemptyline;
3005 long start_para_line=0;
3006 gboolean isnewpara=FALSE,enddash=FALSE;
3007 last.start=CHAR_SPACE;
3008 linecnt=checked_linecnt=0;
/* Load the file; err carries the message printed below on failure. */
3009 etext=read_etext(filename,&err);
/* The same message is formatted for stdout (STDOUT_SWITCH) and for stderr. */
3012 if (pswit[STDOUT_SWITCH])
3013 fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
3015 fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
3018 g_print("\n\nFile: %s\n\n",filename);
/* The first pass gathers whole-file results; warnings is derived from them. */
3019 first_pass_results=first_pass(etext);
3020 warnings=report_first_pass(first_pass_results);
/* Both trees compare keys with strcmp and free keys with g_free; */
/* only qword also frees its values. */
3021 qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
3022 qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
3024 * Here we go with the main pass. Hold onto yer hat!
/* flgets() is given linecnt+1 as the line number for its own queries. */
3028 while ((aline=flgets(&etext_ptr,linecnt+1,warnings->newlines)))
3033 if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
3034 continue; // skip DP page separators completely
/* Lines before firstline, or after footerline when one was found, */
/* are not checked. */
3035 if (linecnt<first_pass_results->firstline ||
3036 (first_pass_results->footerline>0 &&
3037 linecnt>first_pass_results->footerline))
/* With HEADER_SWITCH the well-known header fields are echoed. */
3039 if (pswit[HEADER_SWITCH])
3041 if (g_str_has_prefix(aline,"Title:"))
3042 g_print(" %s\n",aline);
3043 if (g_str_has_prefix(aline,"Author:"))
3044 g_print(" %s\n",aline);
3045 if (g_str_has_prefix(aline,"Release Date:"))
3046 g_print(" %s\n",aline);
3047 if (g_str_has_prefix(aline,"Edition:"))
3048 g_print(" %s\n\n",aline);
3050 continue; /* skip through the header */
3053 print_pending(aline,parastart,&pending);
/* analyse_quotes() also reports whether the line is empty. */
3054 isemptyline=analyse_quotes(aline,&counters);
3055 if (isnewpara && !isemptyline)
3057 /* This line is the start of a new paragraph. */
3058 start_para_line=linecnt;
3059 /* Capture its first line in case we want to report it later. */
3061 parastart=g_strdup(aline);
3062 memset(&parities,0,sizeof(parities)); /* restart the quote count */
/* Skip ahead to the first letter or digit of the paragraph. */
3064 while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
3065 !g_unichar_isdigit(g_utf8_get_char(s)))
3066 s=g_utf8_next_char(s);
3067 if (g_unichar_islower(g_utf8_get_char(s)))
3069 /* and its first letter is lowercase */
3070 if (pswit[ECHO_SWITCH])
3071 g_print("\n%s\n",aline);
3072 if (!pswit[OVERVIEW_SWITCH])
3073 g_print(" Line %ld column %ld - "
3074 "Paragraph starts with lower-case\n",
3075 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
3079 isnewpara=FALSE; /* Signal the end of new para processing. */
3081 /* Check for an em-dash broken at line end. */
3082 if (enddash && g_utf8_get_char(aline)=='-')
3084 if (pswit[ECHO_SWITCH])
3085 g_print("\n%s\n",aline);
3086 if (!pswit[OVERVIEW_SWITCH])
3087 g_print(" Line %ld column 1 - Broken em-dash?\n",linecnt);
/* Find the last non-space character and test it for '-'. */
/* NOTE(review): s is dereferenced before the s>aline test, and for an */
/* empty line it starts in front of aline; the s>=aline test below */
/* appears to allow for that - confirm. */
3092 for (s=g_utf8_prev_char(aline+strlen(aline));
3093 g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
3095 if (s>=aline && g_utf8_get_char(s)=='-')
/* The per-line checks; those for long lines, short lines and end */
/* quotes run only when the matching warnings flag is set. */
3097 check_for_control_characters(aline);
3098 check_for_odd_characters(aline,warnings,isemptyline);
3099 if (warnings->longline)
3100 check_for_long_line(aline);
3101 if (warnings->shortline)
3102 check_for_short_line(aline,&last);
/* Record this line's length and first character in last. */
3104 last.len=g_utf8_strlen(aline,-1);
3105 last.start=g_utf8_get_char(aline);
3106 check_for_starting_punctuation(aline);
3109 check_for_spaced_emdash(aline);
3110 check_for_spaced_dash(aline);
3112 check_for_unmarked_paragraphs(aline);
3113 check_for_jeebies(aline);
3114 check_for_mta_from(aline);
3115 check_for_orphan_character(aline);
3116 check_for_pling_scanno(aline);
3117 check_for_extra_period(aline,warnings);
3118 check_for_following_punctuation(aline);
3119 check_for_typos(aline,warnings);
3120 check_for_misspaced_punctuation(aline,&parities,isemptyline);
3121 check_for_double_punctuation(aline,warnings);
3122 check_for_spaced_quotes(aline);
3123 check_for_miscased_genative(aline);
3124 check_end_of_line(aline,warnings);
3125 check_for_unspaced_bracket(aline);
3126 if (warnings->endquote)
3127 check_for_unpunctuated_endquote(aline);
3128 check_for_html_tag(aline);
3129 check_for_html_entity(aline);
/* NOTE(review): the next calls look like end-of-paragraph handling, */
/* presumably run when the line is empty - confirm. */
3132 check_for_mismatched_quotes(&counters,&pending);
3133 counters_reset(&counters);
3134 /* let the next iteration know that it's starting a new para */
3137 check_for_omitted_punctuation(prevline,&last,start_para_line);
/* Keep a copy of this line for the next iteration. */
3140 prevline=g_strdup(aline);
/* After the last line: final quote check, then flush pending queries. */
3143 check_for_mismatched_quotes(&counters,&pending);
3144 print_pending(NULL,parastart,&pending);
3145 reset_pending(&pending);
/* Duplicate-word notes are printed unless overview or verbose is on. */
3154 if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])
3155 g_tree_foreach(qword,report_duplicate_queries,NULL);
3156 g_tree_unref(qword);
3157 g_tree_unref(qperiod);
3158 counters_destroy(&counters);
/* Restore the default print handler and close the 1252 converter. */
3159 g_set_print_handler(NULL);
3160 print_as_windows_1252(NULL);
3161 if (pswit[MARKUP_SWITCH])
/*
 * flgets:
 *
 * Get one line from the input text. The setting of newlines has the following
 * effect:
 *
 * DOS_NEWLINES: Check for the existence of exactly one CR-LF line-end per line.
 *
 * OS9_NEWLINES: Asserts that etext contains no LFs. CR is used as
 *               the newline character.
 *
 * UNIX_NEWLINES: Check for the presence of CRs.
 *
 * In all cases, check that the last line is correctly terminated.
 *
 * Returns: a pointer to the line.
 */
/*
 * etext:    in/out cursor into the text; the returned line starts where
 *           *etext pointed on entry, and *etext is moved forward.
 * lcnt:     line number, used only in the queries printed here.
 * newlines: DOS_NEWLINES, OS9_NEWLINES or UNIX_NEWLINES.
 * Line-end queries are printed only when LINE_END_SWITCH is set.
 */
3182 char *flgets(char **etext,long lcnt,int newlines)
3185 gboolean isCR=FALSE;
3186 char *theline=*etext;
3191 c=g_utf8_get_char(*etext);
/* NOTE(review): this looks like the end-of-text case. Nothing consumed */
/* presumably means no more lines; otherwise the last line has no */
/* terminator - confirm. */
3194 if (*etext==theline)
3196 else if (pswit[LINE_END_SWITCH])
3198 if (pswit[ECHO_SWITCH])
3200 s=g_strndup(theline,eos-theline);
3201 g_print("\n%s\n",s);
3204 if (!pswit[OVERVIEW_SWITCH])
3206 if (newlines==OS9_NEWLINES)
3207 g_print(" Line %ld - No CR?\n",lcnt);
3210 /* There may, or may not, have been a CR */
3211 g_print(" Line %ld - No LF?\n",lcnt);
3219 *etext=g_utf8_next_char(*etext);
3220 /* either way, it's end of line */
/* In a DOS file every LF must follow a CR. */
3223 if (newlines==DOS_NEWLINES && !isCR)
3225 /* Error - a LF without a preceding CR */
3226 if (pswit[LINE_END_SWITCH])
3228 if (pswit[ECHO_SWITCH])
3230 s=g_strndup(theline,eos-theline);
3231 g_print("\n%s\n",s);
3234 if (!pswit[OVERVIEW_SWITCH])
3235 g_print(" Line %ld - No CR?\n",lcnt);
3244 if (newlines==OS9_NEWLINES)
/* A CR after a CR, or any CR at all in a UNIX file, is queried. */
3246 if (isCR || newlines==UNIX_NEWLINES)
3248 if (pswit[LINE_END_SWITCH])
3250 if (pswit[ECHO_SWITCH])
3252 s=g_strndup(theline,eos-theline);
3253 g_print("\n%s\n",s);
3256 if (!pswit[OVERVIEW_SWITCH])
3258 if (newlines==UNIX_NEWLINES)
3259 g_print(" Line %ld column %ld - Embedded CR?\n",
3260 lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
3262 g_print(" Line %ld - Two successive CRs?\n",
3268 if (newlines==UNIX_NEWLINES)
3271 if (newlines==DOS_NEWLINES)
/* Some other character while a CR is still outstanding. */
3276 if (pswit[LINE_END_SWITCH] && isCR)
3278 if (pswit[ECHO_SWITCH])
3280 s=g_strndup(theline,eos-theline);
3281 g_print("\n%s\n",s);
3284 if (!pswit[OVERVIEW_SWITCH])
3285 g_print(" Line %ld column %ld - CR without LF?\n",
3286 lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
/* eos tracks the end of the line text; the echo code above prints */
/* theline up to eos. */
3292 eos=g_utf8_next_char(eos);
/* Optional in-place clean-up of the line before it is returned. */
3296 if (pswit[MARKUP_SWITCH])
3297 postprocess_for_HTML(theline);
3298 if (pswit[DP_SWITCH])
3299 postprocess_for_DP(theline);
3306 * Takes a "word" as a parameter, and checks whether it
3307 * contains a mixture of alpha and digits. Generally, this is an
3308 * error, but may not be for cases like 4th or L5 12s. 3d.
3310 * Returns: TRUE iff an is error found.
3312 gboolean mixdigit(const char *checkword)
3314 gboolean wehaveadigit,wehavealetter,query;
3315 const char *s,*nondigit;
3316 wehaveadigit=wehavealetter=query=FALSE;
3317 for (s=checkword;*s;s=g_utf8_next_char(s))
3318 if (g_unichar_isalpha(g_utf8_get_char(s)))
3320 else if (g_unichar_isdigit(g_utf8_get_char(s)))
3322 if (wehaveadigit && wehavealetter)
3324 /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
3326 for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
3327 nondigit=g_utf8_next_char(nondigit))
3329 /* digits, ending in st, rd, nd, th of either case */
3330 if (!g_ascii_strcasecmp(nondigit,"st") ||
3331 !g_ascii_strcasecmp(nondigit,"rd") ||
3332 !g_ascii_strcasecmp(nondigit,"nd") ||
3333 !g_ascii_strcasecmp(nondigit,"th"))
3335 if (!g_ascii_strcasecmp(nondigit,"sts") ||
3336 !g_ascii_strcasecmp(nondigit,"rds") ||
3337 !g_ascii_strcasecmp(nondigit,"nds") ||
3338 !g_ascii_strcasecmp(nondigit,"ths"))
3340 if (!g_ascii_strcasecmp(nondigit,"stly") ||
3341 !g_ascii_strcasecmp(nondigit,"rdly") ||
3342 !g_ascii_strcasecmp(nondigit,"ndly") ||
3343 !g_ascii_strcasecmp(nondigit,"thly"))
3345 /* digits, ending in l, L, s or d */
3346 if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
3347 !strcmp(nondigit,"d"))
3350 * L at the start of a number, representing Britsh pounds, like L500.
3351 * This is cute. We know the current word is mixed digit. If the first
3352 * letter is L, there must be at least one digit following. If both
3353 * digits and letters follow, we have a genuine error, else we have a
3354 * capital L followed by digits, and we accept that as a non-error.
3356 if (g_utf8_get_char(checkword)=='L' &&
3357 !mixdigit(g_utf8_next_char(checkword)))
3366 * Extracts the first/next "word" from the line, and returns it.
3367 * A word is defined as one English word unit--or at least that's the aim.
3368 * "ptr" is advanced to the position in the line where we will start
3369 * looking for the next word.
3371 * Returns: A newly-allocated string.
3373 gchar *getaword(const char **ptr)
3378 word=g_string_new(NULL);
3379 for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
3380 !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
3381 **ptr;*ptr=g_utf8_next_char(*ptr))
3383 /* Handle exceptions for footnote markers like [1] */
3384 if (g_utf8_get_char(*ptr)=='[')
3386 g_string_append_c(word,'[');
3387 s=g_utf8_next_char(*ptr);
3388 for (;g_unichar_isdigit(g_utf8_get_char(s));s=g_utf8_next_char(s))
3389 g_string_append_unichar(word,g_utf8_get_char(s));
3390 if (g_utf8_get_char(s)==']')
3392 g_string_append_c(word,']');
3393 *ptr=g_utf8_next_char(s);
3394 return g_string_free(word,FALSE);
3397 g_string_truncate(word,0);
3401 * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
3402 * Especially yucky is the case of L1,000
3403 * This section looks for a pattern of characters including a digit
3404 * followed by a comma or period followed by one or more digits.
3405 * If found, it returns this whole pattern as a word; otherwise we discard
3406 * the results and resume our normal programming.
3409 for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
3410 g_unichar_isalpha(g_utf8_get_char(s)) ||
3411 g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
3412 g_string_append_unichar(word,g_utf8_get_char(s));
3415 for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))
3417 c=g_utf8_get_char(t);
3418 pc=g_utf8_get_char(g_utf8_prev_char(t));
3419 if ((c=='.' || c==',') && g_unichar_isdigit(pc))
3422 return g_string_free(word,FALSE);
3426 /* we didn't find a punctuated number - do the regular getword thing */
3427 g_string_truncate(word,0);
3428 c=g_utf8_get_char(*ptr);
3429 for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);
3430 *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))
3431 g_string_append_unichar(word,c);
3432 return g_string_free(word,FALSE);
3438 * Is this word a Roman Numeral?
3440 * It doesn't actually validate that the number is a valid Roman Numeral--for
3441 * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
3442 * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
3443 * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
3444 * expressions thereof, except when it came to taxes. Allow any number of M,
3445 * an optional D, an optional CM or CD, any number of optional Cs, an optional
3446 * XL or an optional XC, an optional IX or IV, an optional V and any number
3449 gboolean isroman(const char *t)
3455 while (g_utf8_get_char(t)=='m' && *t)
3457 if (g_utf8_get_char(t)=='d')
3459 if (g_str_has_prefix(t,"cm"))
3461 if (g_str_has_prefix(t,"cd"))
3463 while (g_utf8_get_char(t)=='c' && *t)
3465 if (g_str_has_prefix(t,"xl"))
3467 if (g_str_has_prefix(t,"xc"))
3469 if (g_utf8_get_char(t)=='l')
3471 while (g_utf8_get_char(t)=='x' && *t)
3473 if (g_str_has_prefix(t,"ix"))
3475 if (g_str_has_prefix(t,"iv"))
3477 if (g_utf8_get_char(t)=='v')
3479 while (g_utf8_get_char(t)=='i' && *t)
3485 * postprocess_for_DP:
3487 * Invoked with the -d switch from flgets().
3488 * It simply "removes" from the line a hard-coded set of common
3489 * DP-specific tags, so that the line passed to the main routine has
3490 * been pre-cleaned of DP markup.
3492 void postprocess_for_DP(char *theline)
3498 for (i=0;*DPmarkup[i];i++)
3499 while ((s=strstr(theline,DPmarkup[i])))
3501 t=s+strlen(DPmarkup[i]);
3502 memmove(s,t,strlen(t)+1);
/*
 * postprocess_for_HTML:
 *
 * Invoked with the -m switch from flgets().
 * It simply "removes" from the line a hard-coded set of common
 * HTML tags and "replaces" a hard-coded set of common HTML
 * entities, so that the line passed to the main routine has
 * been pre-cleaned of HTML.
 */
void postprocess_for_HTML(char *theline)
{
    /* losemarkup() strips one tag per call; repeat until it finds none. */
    for (;;)
    {
        if (!losemarkup(theline))
            break;
    }
    loseentities(theline);
}
3522 char *losemarkup(char *theline)
3526 s=strchr(theline,'<');
3527 t=s?strchr(s,'>'):NULL;
3530 for (i=0;*markup[i];i++)
3531 if (tagcomp(g_utf8_next_char(s),markup[i]))
3533 t=g_utf8_next_char(t);
3534 memmove(s,t,strlen(t)+1);
3537 /* It's an unrecognized <xxx>. */
3541 void loseentities(char *theline)
3548 GTree *entities=NULL;
3549 static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
3553 g_tree_destroy(entities);
3555 if (translit!=(GIConv)-1)
3556 g_iconv_close(translit);
3557 translit=(GIConv)-1;
3558 if (to_utf8!=(GIConv)-1)
3559 g_iconv_close(to_utf8);
3567 entities=g_tree_new((GCompareFunc)strcmp);
3568 for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
3569 g_tree_insert(entities,HTMLentities[i].name,
3570 GUINT_TO_POINTER(HTMLentities[i].c));
3572 if (translit==(GIConv)-1)
3573 translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
3574 if (to_utf8==(GIConv)-1)
3575 to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
3576 while((amp=strchr(theline,'&')))
3578 scolon=strchr(amp,';');
3583 if (amp+2+strspn(amp+2,"0123456789")==scolon)
3584 c=strtol(amp+2,NULL,10);
3585 else if (amp[2]=='x' &&
3586 amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
3587 c=strtol(amp+3,NULL,16);
3591 s=g_strndup(amp+1,scolon-(amp+1));
3592 c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
3601 if (c<128 || c>=192 && c<=255) /* An ISO-8859-1 character */
3602 theline+=g_unichar_to_utf8(c,theline);
3606 nb=g_unichar_to_utf8(c,s);
3607 t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
3609 s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
3611 memcpy(theline,s,nb);
3615 memmove(theline,g_utf8_next_char(scolon),
3616 strlen(g_utf8_next_char(scolon))+1);
3619 theline=g_utf8_next_char(amp);
3623 gboolean tagcomp(const char *strin,const char *basetag)
3627 if (g_utf8_get_char(strin)=='/')
3628 t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
3630 t=g_utf8_casefold(strin,-1);
3631 s=g_utf8_casefold(basetag,-1);
3632 retval=g_str_has_prefix(t,s);
3638 void proghelp(GOptionContext *context)
3641 fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
3642 fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
3643 fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
3644 fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
3645 "For details, read the file COPYING.\n",stderr);
3646 fputs("This is Free Software; "
3647 "you may redistribute it under certain conditions (GPL);\n",stderr);
3648 fputs("read the file COPYING for details.\n\n",stderr);
3649 help=g_option_context_get_help(context,TRUE,NULL);
3652 fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
3653 fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
3654 "non-ASCII\n",stderr);
3655 fputs("characters like accented letters, "
3656 "lines longer than 75 or shorter than 55,\n",stderr);
3657 fputs("unbalanced quotes or brackets, "
3658 "a variety of badly formatted punctuation, \n",stderr);
3659 fputs("HTML tags, some likely typos. "
3660 "It is NOT a substitute for human judgement.\n",stderr);