1 /*************************************************************************/
2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com> */
5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk> */
7 /* This program is free software; you can redistribute it and/or modify */
8 /* it under the terms of the GNU General Public License as published by */
9 /* the Free Software Foundation; either version 2 of the License, or */
10 /* (at your option) any later version. */
12 /* This program is distributed in the hope that it will be useful, */
13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
15 /* GNU General Public License for more details. */
17 /* You should have received a copy of the GNU General Public License */
18 /* along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 /*************************************************************************/
30 #include "bookloupe.h"
33 #include "HTMLentities.h"
35 gchar *charset; /* Or NULL for auto (ISO_8859-1/ASCII or UNICODE) */
36 GIConv charset_validator=(GIConv)-1;
42 "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
43 "nad", "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa",
44 "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
45 "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
46 "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
47 "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
48 "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
49 "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
50 "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
51 "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
52 "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
53 "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
54 "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
55 "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
56 "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
57 "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
58 "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
59 "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
60 "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
61 "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
62 "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
63 "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
64 "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
65 "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
66 "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
67 "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
68 "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
69 "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
70 "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
76 /* Common abbreviations and other OK words not to query as typos. */
78 "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
79 "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
80 "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
81 "outbid", "outbids", "frostbite", "frostbitten", ""
84 /* Common abbreviations that cause otherwise unexplained periods. */
86 "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
87 "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
91 * Two-Letter combinations that rarely if ever start words,
92 * but are common scannos or otherwise common letter combinations.
95 "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
99 * Two-Letter combinations that rarely if ever end words,
100 * but are common scannos or otherwise common letter combinations.
103 "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
104 "sw", "gr", "sl", "cl", "iy", ""
108 "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
109 "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
110 "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
111 "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
115 "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
119 "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
120 "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
121 "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
122 "during", "let", "toward", "among", ""
126 "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
127 "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
128 "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
129 "among", "those", "into", "whom", "having", "thence", ""
132 gboolean pswit[SWITNO]; /* program switches */
135 gboolean typo_compat,paranoid_compat;
/*
 * Main command-line switches.  Each boolean entry stores into a slot of
 * pswit[]; for entries flagged G_OPTION_FLAG_REVERSE the sense of the
 * switch is inverted, and entries flagged G_OPTION_FLAG_HIDDEN are left
 * out of the --help listing.  The "charset" entry is string-valued and
 * stores into opt_charset.
 */
137 static GOptionEntry options[]={
138 { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
139 "Ignore DP-specific markup", NULL },
140 { "no-dp", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
141 G_OPTION_ARG_NONE, pswit+DP_SWITCH,
142 "Don't ignore DP-specific markup", NULL },
143 { "echo", 0, G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
144 "Echo queried line", NULL },
145 { "no-echo", 'e', G_OPTION_FLAG_REVERSE,
146 G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
147 "Don't echo queried line", NULL },
148 { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
149 "Check single quotes", NULL },
150 { "no-squote", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
151 G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
152 "Don't check single quotes", NULL },
153 { "typo", 0, 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
154 "Check common typos", NULL },
155 { "no-typo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
156 G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
157 "Don't check common typos", NULL },
158 { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
159 "Require closure of quotes on every paragraph", NULL },
160 { "no-qpara", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
161 G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
162 "Don't require closure of quotes on every paragraph", NULL },
163 { "paranoid", 0, G_OPTION_FLAG_HIDDEN,
164 G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
165 "Enable paranoid querying of everything", NULL },
166 { "no-paranoid", 0, G_OPTION_FLAG_REVERSE,
167 G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
168 "Disable paranoid querying of everything", NULL },
169 { "line-end", 0, G_OPTION_FLAG_HIDDEN,
170 G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
171 "Enable line end checking", NULL },
172 { "no-line-end", 'l', G_OPTION_FLAG_REVERSE,
173 G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
174 "Disable line end checking", NULL },
175 { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
176 "Overview: just show counts", NULL },
177 { "no-overview", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
178 G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
179 "Show individual warnings", NULL },
180 { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
181 "Output errors to stdout instead of stderr", NULL },
182 { "no-stdout", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
183 G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
184 "Output errors to stderr instead of stdout", NULL },
185 { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
186 "Echo header fields", NULL },
187 { "no-header", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
188 G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
189 "Don't echo header fields", NULL },
190 { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
191 "Ignore markup in < >", NULL },
192 { "no-markup", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
193 G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
194 "No special handling for markup in < >", NULL },
195 { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
196 "Use file of user-defined typos", NULL },
197 { "no-usertypo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
198 G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
199 "Ignore file of user-defined typos", NULL },
200 { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
201 "Verbose - list everything", NULL },
202 { "no-verbose", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
203 G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
204 "Switch off verbose mode", NULL },
205 { "charset", 0, 0, G_OPTION_ARG_STRING, &opt_charset,
206 "Set of characters valid for this ebook", "NAME" },
211 * Options relating to configuration which make no sense from inside
212 * a configuration file.
/* Switches accepted on the command line only: --web (-w) and --dump-config. */
215 static GOptionEntry config_options[]={
216 { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
217 "Defaults for use on www upload", NULL },
218 { "dump-config", 0, 0, G_OPTION_ARG_NONE, pswit+DUMP_CONFIG_SWITCH,
219 "Dump current config settings", NULL },
/*
 * Gutcheck-compatible toggle switches.  They only record that the toggle
 * was requested (in typo_compat and paranoid_compat); the second entry
 * must take the address of paranoid_compat.
 */
223 static GOptionEntry compatibility_options[]={
224 { "toggle-typo", 't', 0, G_OPTION_ARG_NONE, &typo_compat,
225 "Toggle checking for common typos", NULL },
226 { "toggle-relaxed", 'x', 0, G_OPTION_ARG_NONE, &paranoid_compat,
227 "Toggle both paranoid mode and common typos", NULL },
231 long cnt_quote; /* for overview mode, count of quote queries */
232 long cnt_brack; /* for overview mode, count of brackets queries */
233 long cnt_bin; /* for overview mode, count of non-ASCII queries */
234 long cnt_odd; /* for overview mode, count of odd character queries */
235 long cnt_long; /* for overview mode, count of long line errors */
236 long cnt_short; /* for overview mode, count of short line queries */
237 long cnt_punct; /* for overview mode,
238 count of punctuation and spacing queries */
239 long cnt_dash; /* for overview mode, count of dash-related queries */
240 long cnt_word; /* for overview mode, count of word queries */
241 long cnt_html; /* for overview mode, count of html queries */
242 long cnt_lineend; /* for overview mode, count of line-end queries */
243 long cnt_spacend; /* count of lines with space at end */
244 long linecnt; /* count of total lines in the file */
245 long checked_linecnt; /* count of lines actually checked */
247 void proghelp(GOptionContext *context);
248 void procfile(const char *);
252 gboolean mixdigit(const char *);
253 gchar *getaword(const char **);
254 char *flgets(char **,long);
255 void postprocess_for_HTML(char *);
256 char *linehasmarkup(char *);
257 char *losemarkup(char *);
258 gboolean tagcomp(const char *,const char *);
259 void loseentities(char *);
260 gboolean isroman(const char *);
261 void postprocess_for_DP(char *);
262 void print_as_windows_1252(const char *string);
263 void print_as_utf_8(const char *string);
265 GTree *qword,*qperiod;
/*
 * set_charset:
 * @name: character set name; NULL or "auto" (in any letter case) selects
 *        automatic mode.
 * @err: return location for a GError.
 *
 * Any validator that is currently open is closed first.  Names listed in
 * unicode_aliases are canonicalised to "UTF-8", for which no validator is
 * opened.  For every other name a converter from UTF-8 to that character
 * set is opened, and an "Unknown character set" error is stored in @err if
 * that fails.
 *
 * Names are compared with g_ascii_strcasecmp() so that matching does not
 * depend on the current locale; g_strcasecmp() is deprecated because its
 * result is locale dependent.
 */
271 gboolean set_charset(const char *name,GError **err)
273 /* The various UNICODE encodings all share the same character set. */
274 const char *unicode_aliases[]={ "UCS-2", "UCS-2BE", "UCS-2LE", "UCS-4",
275 "UCS-4BE", "UCS-4LE", "UCS2", "UCS4", "UNICODE", "UNICODEBIG",
276 "UNICODELITTLE", "UTF-7", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE",
277 "UTF-32", "UTF-32BE", "UTF-32LE", "UTF7", "UTF8", "UTF16", "UTF16BE",
278 "UTF16LE", "UTF32", "UTF32BE", "UTF32LE" };
282 if (charset_validator!=(GIConv)-1)
283 g_iconv_close(charset_validator);
284 if (!name || !g_ascii_strcasecmp(name,"auto"))
287 charset_validator=(GIConv)-1;
291 charset=g_strdup(name);
292 for(i=0;i<G_N_ELEMENTS(unicode_aliases);i++)
293 if (!g_ascii_strcasecmp(charset,unicode_aliases[i]))
296 charset=g_strdup("UTF-8");
299 if (!strcmp(charset,"UTF-8"))
300 charset_validator=(GIConv)-1;
303 charset_validator=g_iconv_open(charset,"UTF-8");
304 if (charset_validator==(GIConv)-1)
306 g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_NO_CONVERSION,
307 "Unknown character set \"%s\"",charset);
/*
 * config_file_update:
 * @kf: key file to fill in.
 *
 * Copy the current option settings into the "options" group of @kf.
 * options[] is walked up to the entry with a NULL long_name.  Boolean
 * (G_OPTION_ARG_NONE) entries are written with g_key_file_set_boolean(),
 * string (G_OPTION_ARG_STRING) entries with g_key_file_set_string(), and
 * any other argument type trips g_assert_not_reached().
 * NOTE(review): entries whose name starts with "no-" are presumably
 * skipped, and G_OPTION_FLAG_REVERSE entries presumably inverted before
 * being stored -- confirm against the guarded statements.
 */
316 void config_file_update(GKeyFile *kf)
321 for(i=0;options[i].long_name;i++)
323 if (g_str_has_prefix(options[i].long_name,"no-"))
325 if (options[i].arg==G_OPTION_ARG_NONE)
327 sw=*(gboolean *)options[i].arg_data;
328 if (options[i].flags&G_OPTION_FLAG_REVERSE)
330 g_key_file_set_boolean(kf,"options",options[i].long_name,sw);
332 else if (options[i].arg==G_OPTION_ARG_STRING)
334 s=*(gchar **)options[i].arg_data;
337 g_key_file_set_string(kf,"options",options[i].long_name,s);
340 g_assert_not_reached();
/*
 * config_file_add_comments:
 * @kf: key file to annotate.
 *
 * Add human-readable comments to @kf: a file-level comment
 * (" Default configuration for bookloupe") and, for entries of options[],
 * a comment on the matching key of the "options" group consisting of a
 * space followed by the option description.
 * NOTE(review): the "no-" prefix test presumably skips the negated
 * aliases -- confirm.
 */
344 void config_file_add_comments(GKeyFile *kf)
348 g_key_file_set_comment(kf,NULL,NULL," Default configuration for bookloupe",
350 for(i=0;options[i].long_name;i++)
352 if (g_str_has_prefix(options[i].long_name,"no-"))
354 comment=g_strconcat(" ",options[i].description,NULL);
355 g_key_file_set_comment(kf,"options",options[i].long_name,comment,NULL);
/*
 * dump_config:
 * Serialise the current settings with g_key_file_to_data().  Either the
 * existing config key file is refreshed through config_file_update(), or a
 * new key file is created, filled in and commented.
 * NOTE(review): the test choosing between the two paths is presumably
 * whether a configuration file was loaded -- confirm.
 */
360 void dump_config(void)
364 config_file_update(config);
367 config=g_key_file_new();
368 config_file_update(config);
369 config_file_add_comments(config);
371 s=g_key_file_to_data(config,NULL,NULL);
/*
 * read_config_file:
 * @full_path: out parameter; presumably receives the path of the file
 *             that was loaded (TODO confirm).
 *
 * Look for a file named "bookloupe.ini".  The directories to search come
 * from the BOOKLOUPE_CONFIG_PATH environment variable, split on ";" or ":"
 * (presumably chosen per platform), or else default to the current
 * directory, running_from and the user configuration directory, tried in
 * that order.  The file is loaded keeping comments and translations; an
 * error other than G_FILE_ERROR_NOENT is reported with g_printerr().
 * The list of search directories is freed before returning.
 */
377 GKeyFile *read_config_file(gchar **full_path)
383 const char *search_path;
386 search_path=g_getenv("BOOKLOUPE_CONFIG_PATH");
390 search_dirs=g_strsplit(search_path,";",0);
392 search_dirs=g_strsplit(search_path,":",0);
397 search_dirs=g_new(gchar *,4);
398 search_dirs[0]=g_get_current_dir();
399 search_dirs[1]=g_strdup(running_from);
400 search_dirs[2]=g_strdup(g_get_user_config_dir());
403 for(i=0;search_dirs[i];i++)
405 path=g_build_filename(search_dirs[i],"bookloupe.ini",NULL);
406 if (g_key_file_load_from_file(kf,path,
407 G_KEY_FILE_KEEP_COMMENTS|G_KEY_FILE_KEEP_TRANSLATIONS,&err))
409 if (!g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
411 g_printerr("Bookloupe: Error reading %s\n",path);
412 g_printerr("%s\n",err->message);
424 g_strfreev(search_dirs);
/*
 * parse_config_file:
 * Apply the "options" group of the configuration file to the variables
 * behind options[].  Each key is matched against the long option names.
 * Boolean values are written through arg_data (G_OPTION_FLAG_REVERSE
 * entries are tested for separately); string values replace the previous
 * string, with the value "auto" stored as NULL.  Read errors and unknown
 * keys are reported with g_printerr(), and an unexpected argument type
 * trips g_assert_not_reached().
 */
432 void parse_config_file(void)
439 config=read_config_file(&path);
441 keys=g_key_file_get_keys(config,"options",NULL,NULL);
448 for(j=0;options[j].long_name;j++)
450 if (g_str_has_prefix(options[j].long_name,"no-"))
452 else if (!strcmp(keys[i],options[j].long_name))
454 if (options[j].arg==G_OPTION_ARG_NONE)
456 sw=g_key_file_get_boolean(config,"options",keys[i],
460 g_printerr("Bookloupe: %s: options.%s: %s\n",
461 path,keys[i],err->message);
466 if (options[j].flags&G_OPTION_FLAG_REVERSE)
468 *(gboolean *)options[j].arg_data=sw;
472 else if (options[j].arg==G_OPTION_ARG_STRING)
474 s=g_key_file_get_string(config,"options",keys[i],
478 g_printerr("Bookloupe: %s: options.%s: %s\n",
479 path,keys[i],err->message);
484 g_free(*(gchar **)options[j].arg_data);
485 if (!g_strcmp0(s,"auto"))
487 *(gchar **)options[j].arg_data=NULL;
491 *(gchar **)options[j].arg_data=s;
496 g_assert_not_reached();
499 if (!options[j].long_name)
500 g_printerr("Bookloupe: %s: Unknown option \"%s\" ignored\n",
/*
 * parse_options:
 * @argc: pointer to the argument count, passed to g_option_context_parse().
 * @argv: pointer to the argument vector, passed to g_option_context_parse().
 *
 * Build the option context from options[], config_options[] and the
 * "compatibility" group, parse the command line and report parse errors
 * with g_printerr().  Afterwards the typo and paranoid switches may be
 * toggled (presumably when typo_compat / paranoid_compat were set --
 * confirm), --web forces a fixed set of switch values, the requested
 * character set is applied through set_charset(), and overview mode turns
 * echoing off.  The option context is freed at the end.
 */
509 void parse_options(int *argc,char ***argv)
512 GOptionContext *context;
513 GOptionGroup *compatibility;
514 context=g_option_context_new(
515 "file - look for errors in Project Gutenberg(TM) etexts");
516 g_option_context_add_main_entries(context,options,NULL);
517 g_option_context_add_main_entries(context,config_options,NULL);
518 compatibility=g_option_group_new("compatibility",
519 "Options for Compatibility with Gutcheck:",
520 "Show compatibility options",NULL,NULL);
521 g_option_group_add_entries(compatibility,compatibility_options);
522 g_option_context_add_group(context,compatibility);
523 g_option_context_set_description(context,
524 "For simplicity, only the switch options which reverse the\n"
525 "default configuration are listed. In most cases, both vanilla\n"
526 "and \"no-\" prefixed versions are available for use.");
527 if (!g_option_context_parse(context,argc,argv,&err))
529 g_printerr("Bookloupe: %s\n",err->message);
530 g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
534 pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
537 pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
538 pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
541 * Web uploads - for the moment, this is really just a placeholder
542 * until we decide what processing we really want to do on web uploads
544 if (pswit[WEB_SWITCH])
546 /* specific override for web uploads */
547 pswit[ECHO_SWITCH]=TRUE;
548 pswit[SQUOTE_SWITCH]=FALSE;
549 pswit[TYPO_SWITCH]=TRUE;
550 pswit[QPARA_SWITCH]=FALSE;
551 pswit[PARANOID_SWITCH]=TRUE;
552 pswit[LINE_END_SWITCH]=FALSE;
553 pswit[OVERVIEW_SWITCH]=FALSE;
554 pswit[STDOUT_SWITCH]=FALSE;
555 pswit[HEADER_SWITCH]=TRUE;
556 pswit[VERBOSE_SWITCH]=FALSE;
557 pswit[MARKUP_SWITCH]=FALSE;
558 pswit[USERTYPO_SWITCH]=FALSE;
559 pswit[DP_SWITCH]=FALSE;
561 if (opt_charset && !set_charset(opt_charset,&err))
563 g_printerr("%s\n",err->message);
566 if (pswit[DUMP_CONFIG_SWITCH])
573 if (pswit[OVERVIEW_SWITCH])
574 /* just print summary; don't echo */
575 pswit[ECHO_SWITCH]=FALSE;
581 g_option_context_free(context);
587 * Read in the user-defined stealth scanno list.
/*
 * read_user_scannos:
 * Load the user typo list into the usertypo tree.  The file is looked for
 * as "bookloupe.typ" and then as "gutcheck.typ", each first by bare name
 * and then inside running_from; G_FILE_ERROR_NOENT is tested for after
 * each attempt.  If none is found a note is printed and processing goes on
 * without user typos; other errors are written to stderr.
 * Valid UTF-8 content is normalised to composed form, anything else is
 * converted from WINDOWS-1252.  The text is split on CR and LF, and every
 * line whose first byte is greater than '!' becomes a key of the tree.
 */
589 void read_user_scannos(void)
592 gchar *usertypo_file;
596 gchar *contents,*utf8,**lines;
597 usertypo_file=g_strdup("bookloupe.typ");
598 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
599 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
602 g_free(usertypo_file);
603 usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
604 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
606 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
609 g_free(usertypo_file);
610 usertypo_file=g_strdup("gutcheck.typ");
611 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
613 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
616 g_free(usertypo_file);
617 usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
618 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
620 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
622 g_free(usertypo_file);
623 g_print(" --> I couldn't find bookloupe.typ "
624 "-- proceeding without user typos.\n");
629 fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
630 g_free(usertypo_file);
634 if (g_utf8_validate(contents,len,NULL))
636 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
638 (void)set_charset("UNICODE",NULL);
641 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
643 lines=g_strsplit_set(utf8,"\r\n",0);
645 usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
646 for (i=0;lines[i];i++)
647 if (*(unsigned char *)lines[i]>'!')
648 g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
657 * Read an etext returning a newly allocated string containing the file
658 * contents or NULL on error.
/*
 * read_etext:
 * @filename: file to read with g_file_get_contents().
 * @err: return location for a GError.
 *
 * Valid UTF-8 input is normalised to composed form and printing is routed
 * through print_as_utf_8().  Otherwise the bytes are converted from
 * WINDOWS-1252 to UTF-8 and printing goes through print_as_windows_1252().
 * When the conversion stops on an illegal sequence, the bytes before the
 * offending one are scanned for newlines so that the error message can
 * name a line and column; any other conversion error is propagated to
 * @err.  The SetConsoleOutputCP() calls are presumably compiled only on
 * Windows (TODO confirm).
 */
660 gchar *read_etext(const char *filename,GError **err)
662 GError *tmp_err=NULL;
663 gchar *contents,*utf8;
664 gsize len,bytes_read,bytes_written;
666 if (!g_file_get_contents(filename,&contents,&len,err))
668 if (g_utf8_validate(contents,len,NULL))
670 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
671 g_set_print_handler(print_as_utf_8);
673 SetConsoleOutputCP(CP_UTF8);
678 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,
679 &bytes_written,&tmp_err);
680 if (g_error_matches(tmp_err,G_CONVERT_ERROR,
681 G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
684 for(i=0;i<bytes_read;i++)
685 if (contents[i]=='\n')
690 else if (contents[i]!='\r')
692 g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
693 "Input conversion failed. Byte %d at line %d, column %d is not a "
694 "valid Windows-1252 character",
695 ((unsigned char *)contents)[bytes_read],line,col);
698 g_propagate_error(err,tmp_err);
699 g_set_print_handler(print_as_windows_1252);
701 SetConsoleOutputCP(1252);
/* Exit handler (registered with atexit() in main): restore the console
 * output code page that was saved in saved_cp. */
708 void cleanup_on_exit(void)
711 SetConsoleOutputCP(saved_cp);
/*
 * main:
 * Registers cleanup_on_exit(), records the directory of the executable in
 * running_from, switches on the defaults that the command line can only
 * turn off (paranoid, typo, line-end, echo) and parses the options.
 * In overview mode the per-category query counts and their total are
 * printed.  running_from, the usertypo tree, the charset state and the
 * config key file are released before returning.
 */
715 int main(int argc,char **argv)
718 atexit(cleanup_on_exit);
719 saved_cp=GetConsoleOutputCP();
721 running_from=g_path_get_dirname(argv[0]);
722 /* Paranoid checking is turned OFF, not on, by its switch */
723 pswit[PARANOID_SWITCH]=TRUE;
724 /* if running in paranoid mode, typo checks default to enabled */
725 pswit[TYPO_SWITCH]=TRUE;
726 /* Line-end checking is turned OFF, not on, by its switch */
727 pswit[LINE_END_SWITCH]=TRUE;
728 /* Echoing is turned OFF, not on, by its switch */
729 pswit[ECHO_SWITCH]=TRUE;
731 parse_options(&argc,&argv);
732 if (pswit[USERTYPO_SWITCH])
734 fprintf(stderr,"bookloupe: Check and report on an e-text\n");
736 if (pswit[OVERVIEW_SWITCH])
738 g_print(" Checked %ld lines of %ld (head+foot = %ld)\n\n",
739 checked_linecnt,linecnt,linecnt-checked_linecnt);
740 g_print(" --------------- Queries found --------------\n");
742 g_print(" Long lines: %14ld\n",cnt_long);
744 g_print(" Short lines: %14ld\n",cnt_short);
746 g_print(" Line-end problems: %14ld\n",cnt_lineend);
748 g_print(" Common typos: %14ld\n",cnt_word);
750 g_print(" Unmatched quotes: %14ld\n",cnt_quote);
752 g_print(" Unmatched brackets: %14ld\n",cnt_brack);
754 g_print(" Non-ASCII characters: %14ld\n",cnt_bin);
756 g_print(" Proofing characters: %14ld\n",cnt_odd);
758 g_print(" Punctuation & spacing queries: %14ld\n",cnt_punct);
760 g_print(" Non-standard dashes: %14ld\n",cnt_dash);
762 g_print(" Possible HTML tags: %14ld\n",cnt_html);
764 g_print(" TOTAL QUERIES %14ld\n",
765 cnt_quote+cnt_brack+cnt_bin+cnt_odd+cnt_long+cnt_short+cnt_punct+
766 cnt_dash+cnt_word+cnt_html+cnt_lineend);
768 g_free(running_from);
770 g_tree_unref(usertypo);
771 set_charset(NULL,NULL);
773 g_key_file_free(config);
780 * Run a first pass - verify that it's a valid PG
781 * file, decide whether to report some things that
782 * occur many times in the text like long or short
783 * lines, non-standard dashes, etc.
/*
 * first_pass:
 * @etext: the whole text, split here into lines on "\n".
 *
 * Survey the text line by line and accumulate statistics in a static
 * first_pass_results structure: header and footer positions, total length,
 * unpunctuated end quotes, lines containing slashes, very long lines,
 * probable HTML markup, spaced dashes and em-dashes, standalone 0/1 and
 * Dutch/French marker words.  Trailing carriage returns are stripped from
 * each line before it is examined.
 *
 * The letter test in front of a closing or neutral double quote uses
 * g_unichar_isalpha(): the value tested is a gunichar, which is outside
 * the domain of the <ctype.h> isalpha() for anything but single-byte
 * values.
 */
785 struct first_pass_results *first_pass(const char *etext)
787 gunichar laststart=CHAR_SPACE;
792 unsigned int lastlen=0,lastblen=0;
793 long spline=0,nspline=0;
794 static struct first_pass_results results={0};
797 lines=g_strsplit(etext,"\n",0);
798 for (j=0;lines[j];j++)
800 lbytes=strlen(lines[j]);
801 while (lbytes>0 && lines[j][lbytes-1]=='\r')
802 lines[j][--lbytes]='\0';
803 llen=g_utf8_strlen(lines[j],lbytes);
805 if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
806 (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
809 g_print(" --> Duplicate header?\n");
810 spline=linecnt+1; /* first line of non-header text, that is */
812 if (!strncmp(lines[j],"*** START",9) &&
813 strstr(lines[j],"PROJECT GUTENBERG"))
816 g_print(" --> Duplicate header?\n");
817 nspline=linecnt+1; /* first line of non-header text, that is */
819 if (spline || nspline)
821 lc_line=g_utf8_strdown(lines[j],lbytes);
822 if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
824 if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
826 if (results.footerline)
828 /* it's an old-form header - we can detect duplicates */
830 g_print(" --> Duplicate footer?\n");
833 results.footerline=linecnt;
839 results.firstline=spline;
841 results.firstline=nspline; /* override with new */
842 if (results.footerline)
843 continue; /* don't count the boilerplate in the footer */
844 results.totlen+=llen;
845 for (s=lines[j];*s;s=g_utf8_next_char(s))
847 if (g_utf8_get_char(s)>127)
849 if (g_unichar_isalpha(g_utf8_get_char(s)))
853 if (CHAR_IS_DQUOTE(g_utf8_get_char(s)))
854 qc=QUOTE_CLASS(g_utf8_get_char(s));
857 if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) &&
858 g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
859 results.endquote_count++;
862 if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
863 lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
866 g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
868 if (strstr(lines[j],".,"))
870 /* only count asterisk lines for ignoring purposes where there is */
871 /* lower-case text on the line */
872 if (strchr(lines[j],'*'))
874 for (s=lines[j];*s;s=g_utf8_next_char(s))
875 if (g_unichar_islower(g_utf8_get_char(s)))
880 if (strchr(lines[j],'/'))
881 results.fslashline++;
884 for (s=g_utf8_prev_char(lines[j]+lbytes);
885 s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;
886 s=g_utf8_prev_char(s))
888 if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
889 g_utf8_get_char(g_utf8_prev_char(s))!='-')
892 if (llen>LONGEST_PG_LINE)
894 if (llen>WAY_TOO_LONG)
895 results.verylongline++;
896 if (strchr(lines[j],'<') && strchr(lines[j],'>'))
898 i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
901 if (strstr(lines[j],"<i>"))
902 results.htmcount+=4; /* bonus marks! */
904 /* Check for spaced em-dashes */
905 if (lines[j][0] && (s=strstr(g_utf8_next_char(lines[j]),"--")))
908 if (s[-1]==CHAR_SPACE || s[2]==CHAR_SPACE)
909 results.space_emdash++;
910 if (s[-1]==CHAR_SPACE && s[2]==CHAR_SPACE)
911 /* count of em-dashes with spaces both sides */
912 results.non_PG_space_emdash++;
913 if (s[-1]!=CHAR_SPACE && s[2]!=CHAR_SPACE)
914 /* count of PG-type em-dashes with no spaces */
915 results.PG_space_emdash++;
920 if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))
921 results.Dutchcount++;
922 if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))
923 results.Frenchcount++;
924 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
925 results.standalone_digit++;
928 /* Check for spaced dashes */
929 if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
933 laststart=lines[j][0];
942 * Make some snap decisions based on the first pass results.
/*
 * report_first_pass:
 * @results: statistics gathered by the first pass.
 *
 * Decide, category by category, whether individual queries are worth
 * reporting, and print a " --> " note whenever a category is switched off
 * because it occurs too often.  Also switches markup handling on when the
 * text looks like HTML, flags texts that look Dutch or French, reports
 * whether the PG header and footer were found, and re-enables warnings in
 * verbose mode.  The decisions are kept in a static struct warnings.
 *
 * The comment on the standalone-digit threshold now states 10, matching
 * the value the code actually tests.
 */
944 struct warnings *report_first_pass(struct first_pass_results *results)
946 static struct warnings warnings={0};
948 g_print(" --> %ld lines in this file have white space at end\n",
951 if (results->dotcomma>5)
954 g_print(" --> %ld lines in this file contain '.,'. "
955 "Not reporting them.\n",results->dotcomma);
958 * If more than 50 lines, or one-tenth, are short,
959 * don't bother reporting them.
961 warnings.shortline=1;
962 if (results->shortline>50 || results->shortline*10>linecnt)
964 warnings.shortline=0;
965 g_print(" --> %ld lines in this file are short. "
966 "Not reporting short lines.\n",results->shortline);
969 * If more than 50 lines, or one-tenth, are long,
970 * don't bother reporting them.
973 if (results->longline>50 || results->longline*10>linecnt)
976 g_print(" --> %ld lines in this file are long. "
977 "Not reporting long lines.\n",results->longline);
979 /* If more than 10 lines contain asterisks, don't bother reporting them. */
981 if (results->astline>10)
984 g_print(" --> %ld lines in this file contain asterisks. "
985 "Not reporting them.\n",results->astline);
988 * If more than 10 lines contain forward slashes,
989 * don't bother reporting them.
992 if (results->fslashline>10)
995 g_print(" --> %ld lines in this file contain forward slashes. "
996 "Not reporting them.\n",results->fslashline);
999 * If more than 20 lines contain unpunctuated endquotes,
1000 * don't bother reporting them.
1002 warnings.endquote=1;
1003 if (results->endquote_count>20)
1005 warnings.endquote=0;
1006 g_print(" --> %ld lines in this file contain unpunctuated endquotes. "
1007 "Not reporting them.\n",results->endquote_count);
1010 * If more than 10 lines contain standalone digits,
1011 * don't bother reporting them.
1014 if (results->standalone_digit>10)
1017 g_print(" --> %ld lines in this file contain standalone 0s and 1s. "
1018 "Not reporting them.\n",results->standalone_digit);
1021 * If more than 20 lines contain hyphens at end,
1022 * don't bother reporting them.
1025 if (results->hyphens>20)
1028 g_print(" --> %ld lines in this file have hyphens at end. "
1029 "Not reporting them.\n",results->hyphens);
1031 if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
1033 g_print(" --> Looks like this is HTML. Switching HTML mode ON.\n");
1034 pswit[MARKUP_SWITCH]=1;
1036 if (results->verylongline>0)
1037 g_print(" --> %ld lines in this file are VERY long!\n",
1038 results->verylongline);
1040 * If there are more non-PG spaced dashes than PG em-dashes,
1041 * assume it's deliberate.
1042 * Current PG guidelines say don't use them, but older texts do,
1043 * and some people insist on them whatever the guidelines say.
1046 if (results->spacedash+results->non_PG_space_emdash>
1047 results->PG_space_emdash)
1050 g_print(" --> There are %ld spaced dashes and em-dashes. "
1051 "Not reporting them.\n",
1052 results->spacedash+results->non_PG_space_emdash);
1058 /* Charset ISO_8859-1/ASCII checks for compatibility with gutcheck */
1060 /* If more than a quarter of characters are hi-bit, bug out. */
1061 if (results->binlen*4>results->totlen)
1063 g_print(" --> This file does not appear to be ASCII. "
1064 "Terminating. Best of luck with it!\n");
1067 if (results->alphalen*4<results->totlen)
1069 g_print(" --> This file does not appear to be text. "
1070 "Terminating. Best of luck with it!\n");
1073 if (results->binlen*100>results->totlen || results->binlen>100)
1075 g_print(" --> There are a lot of foreign letters here. "
1076 "Not reporting them.\n");
1077 if (!pswit[VERBOSE_SWITCH])
1081 warnings.isDutch=FALSE;
1082 if (results->Dutchcount>50)
1084 warnings.isDutch=TRUE;
1085 g_print(" --> This looks like Dutch - "
1086 "switching off dashes and warnings for 's Middags case.\n");
1088 warnings.isFrench=FALSE;
1089 if (results->Frenchcount>50)
1091 warnings.isFrench=TRUE;
1092 g_print(" --> This looks like French - "
1093 "switching off some doublepunct.\n");
1095 if (results->firstline && results->footerline)
1096 g_print(" The PG header and footer appear to be already on.\n");
1099 if (results->firstline)
1100 g_print(" The PG header is on - no footer.\n");
1101 if (results->footerline)
1102 g_print(" The PG footer is on - no header.\n");
1105 if (pswit[VERBOSE_SWITCH])
1107 warnings.shortline=1;
1108 warnings.dotcomma=1;
1109 warnings.longline=1;
1115 warnings.endquote=1;
1116 g_print(" *** Verbose output is ON -- you asked for it! ***\n");
1118 if (warnings.isDutch)
1120 if (results->footerline>0 && results->firstline>0 &&
1121 results->footerline>results->firstline &&
1122 results->footerline-results->firstline<100)
1124 g_print(" --> I don't really know where this text starts. \n");
1125 g_print(" There are no reference points.\n");
1126 g_print(" I'm going to have to report the header and footer "
1128 results->firstline=0;
1136 * Look along the line, accumulate the count of quotes, and see
1137 * if this is an empty line - i.e. a line with nothing on it
1139 * If line has just spaces, period, * and/or - on it, don't
1140 * count it, since empty lines with asterisks or dashes to
1141 * separate sections are common.
1143 * Returns: TRUE if the line is empty.
/*
 * analyse_quotes:
 * @aline: line to scan.
 * @linecnt: line number used in the messages printed here.
 * @counters: running quote, bracket and underscore counters.
 *
 * Double quotes are always passed to count_quote(); single quotes are
 * classified (apostrophe, opening, closing or neutral quote) only when the
 * squote switch is on.  An error raised by count_quote() is printed with
 * the line and column, unless overview mode is on, and then cleared.
 * Square, curly and round brackets update the matching counters, with
 * "[Illustration:" at the start of a line tracked under its own counter.
 *
 * @linecnt is an int, so it is cast to long for the %ld conversion;
 * passing an int where printf-style formatting expects a long is
 * undefined behaviour.
 */
1145 gboolean analyse_quotes(const char *aline,int linecnt,struct counters *counters)
1148 /* assume the line is empty until proven otherwise */
1149 gboolean isemptyline=TRUE;
1150 const char *s=aline,*sprev,*snext;
1153 GError *tmp_err=NULL;
1156 snext=g_utf8_next_char(s);
1157 c=g_utf8_get_char(s);
1158 if (CHAR_IS_DQUOTE(c))
1159 (void)count_quote(counters,c,QUOTE_CLASS(c),&tmp_err);
1160 else if (CHAR_IS_SQUOTE(c) && pswit[SQUOTE_SWITCH])
1165 * At start of line, it can only be a quotation mark.
1166 * Hardcode a very common exception!
1168 if (!g_str_has_prefix(snext,"tis") &&
1169 !g_str_has_prefix(snext,"Tis"))
1170 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
1172 else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
1173 g_unichar_isalpha(g_utf8_get_char(snext)))
1174 /* Do nothing! it's definitely an apostrophe, not a quote */
1176 /* it's outside a word - let's check it out */
1177 else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||
1178 g_unichar_isalpha(g_utf8_get_char(snext)))
1180 /* certainly looks like a quotation mark */
1181 if (!g_str_has_prefix(snext,"tis") &&
1182 !g_str_has_prefix(snext,"Tis"))
1183 /* hardcode a very common exception! */
1185 if (strchr(".?!,;:",g_utf8_get_char(sprev)))
1186 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
1188 (void)count_quote(counters,c,OPENING_QUOTE,&tmp_err);
1193 /* now - is it a quotation mark? */
1194 guessquote=0; /* accumulate clues */
1195 if (g_unichar_isalpha(g_utf8_get_char(sprev)))
1197 /* it follows a letter - could be either */
1199 if (g_utf8_get_char(sprev)=='s')
1201 /* looks like a plural apostrophe */
1203 if (g_utf8_get_char(snext)==CHAR_SPACE)
1207 if (innermost_quote_matches(counters,c))
1209 * Give it the benefit of some doubt,
1210 * if a squote is already open.
1216 (void)count_quote(counters,c,CLOSING_QUOTE,&tmp_err);
1219 /* no adjacent letter - it must be a quote of some kind */
1220 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
1225 if (pswit[ECHO_SWITCH])
1226 g_print("\n%s\n",aline);
1227 if (!pswit[OVERVIEW_SWITCH])
1228 g_print(" Line %ld column %ld - %s\n",
1229 (long)linecnt,g_utf8_pointer_to_offset(aline,s)+1,tmp_err->message);
1230 g_clear_error(&tmp_err);
1232 if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
1234 isemptyline=FALSE; /* ignore lines like * * * as spacers */
1235 if (c==CHAR_UNDERSCORE)
1236 counters->c_unders++;
1237 if (c==CHAR_OPEN_SBRACK)
1239 if (!matching_difference(counters,COUNTER_ILLUSTRATION) &&
1240 !matching_difference(counters,c) && s==aline &&
1241 g_str_has_prefix(s,"[Illustration:"))
1242 increment_matching(counters,COUNTER_ILLUSTRATION,TRUE);
1244 increment_matching(counters,c,TRUE);
1246 else if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK)
1247 increment_matching(counters,c,TRUE);
1248 if (c==CHAR_CLOSE_SBRACK)
1250 if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) &&
1251 !matching_difference(counters,c) && !*snext)
1252 increment_matching(counters,COUNTER_ILLUSTRATION,FALSE);
1254 increment_matching(counters,c,FALSE);
1256 else if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK)
1257 increment_matching(counters,c,FALSE);
1265 * check_for_control_characters:
1267 * Check for invalid or questionable characters in the line
1268 * Anything above 127 is invalid for plain ASCII, and
1269 * non-printable control characters should also be flagged.
1270 * Tabs should generally not be there.
/*
 * Scans aline character by character and reports a character whose
 * code is below CHAR_SPACE and is not LF, CR or TAB.
 *
 * aline: the line to scan (UTF-8, NUL terminated).
 *
 * With ECHO_SWITCH the line itself is echoed; unless OVERVIEW_SWITCH is
 * set, the line number, column and character code are printed.
 *
 * Fix applied in review: g_utf8_pointer_to_offset() takes the string
 * start first and the position second. The arguments were swapped,
 * which yields a negative offset and therefore a non-positive column.
 */
1272 void check_for_control_characters(const char *aline)
1276 for (s=aline;*s;s=g_utf8_next_char(s))
1278 c=g_utf8_get_char(s);
1279 if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
1281 if (pswit[ECHO_SWITCH])
1282 g_print("\n%s\n",aline);
1283 if (!pswit[OVERVIEW_SWITCH])
1284 g_print(" Line %ld column %ld - Control character %u\n",
1285 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1293 * check_for_odd_characters:
1295 * Check for binary and other odd characters.
/*
 * Scans aline for characters that are suspicious in a plain-text book
 * and prints one query per category.
 *
 * aline:       the line to scan (UTF-8).
 * warnings:    per-file switches; bin, fslash and ast are read here.
 * isemptyline: when TRUE the asterisk query is suppressed.
 *
 * The e* flags below exist so that each kind of warning is not repeated
 * on one line (see the comment on their declaration).
 *
 * Fix applied in review: the lower bound of the Plane 16 private-use
 * range was written as decimal 100000 (0x186A0) instead of 0x100000, so
 * defined characters from U+186A0 upwards were reported as "Private Use".
 */
1297 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
1298 gboolean isemptyline)
1300 /* Don't repeat multiple warnings on one line. */
1301 gboolean eInvalidChar=FALSE,eTab=FALSE,eTilde=FALSE;
1302 gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
1307 for (s=aline;*s;s=g_utf8_next_char(s))
1309 c=g_utf8_get_char(s);
/*
 * Binary check: control characters other than tab/newline, or anything
 * above 127. Codes 128-159 and above 255 are labelled non-ISO-8859;
 * the remainder are labelled non-ASCII.
 */
1310 if (warnings->bin && !eInvalidChar &&
1311 (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
1313 if (pswit[ECHO_SWITCH])
1314 g_print("\n%s\n",aline);
1315 if (!pswit[OVERVIEW_SWITCH])
1316 if (c>127 && c<160 || c>255)
1317 g_print(" Line %ld column %ld - "
1318 "Non-ISO-8859 character %u\n",
1319 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1321 g_print(" Line %ld column %ld - "
1322 "Non-ASCII character %u\n",
1323 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
/* Charset validation, only when a charset has been named. */
1328 if (!eInvalidChar && charset)
/* No converter open: validate against Unicode itself. */
1330 if (charset_validator==(GIConv)-1)
1332 if (!g_unichar_isdefined(c))
1334 if (pswit[ECHO_SWITCH])
1335 g_print("\n%s\n",aline);
1336 if (!pswit[OVERVIEW_SWITCH])
1337 g_print(" Line %ld column %ld - Unassigned UNICODE "
1338 "code point U+%04" G_GINT32_MODIFIER "X\n",
1339 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
/* Private Use Area, plus the private-use planes 15 and 16. */
1344 else if (c>=0xE000 && c<=0xF8FF || c>=0xF0000 && c<=0xFFFFD ||
1345 c>=0x100000 && c<=0x10FFFD)
1347 if (pswit[ECHO_SWITCH])
1348 g_print("\n%s\n",aline);
1349 if (!pswit[OVERVIEW_SWITCH])
1350 g_print(" Line %ld column %ld - Private Use "
1351 "character U+%04" G_GINT32_MODIFIER "X\n",
1352 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
/*
 * Try converting just this one character through charset_validator
 * (presumably the branch taken when a converter is open - TODO confirm).
 */
1360 t=g_convert_with_iconv(s,g_utf8_next_char(s)-s,
1361 charset_validator,NULL,&nb,NULL);
1366 if (pswit[ECHO_SWITCH])
1367 g_print("\n%s\n",aline);
1368 if (!pswit[OVERVIEW_SWITCH])
1369 g_print(" Line %ld column %ld - Non-%s "
1370 "character %u\n",linecnt,
1371 g_utf8_pointer_to_offset(aline,s)+1,charset,c);
1378 if (!eTab && c==CHAR_TAB)
1380 if (pswit[ECHO_SWITCH])
1381 g_print("\n%s\n",aline);
1382 if (!pswit[OVERVIEW_SWITCH])
1383 g_print(" Line %ld column %ld - Tab character?\n",
1384 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1389 if (!eTilde && c==CHAR_TILDE)
1392 * Often used by OCR software to indicate an
1393 * unrecognizable character.
1395 if (pswit[ECHO_SWITCH])
1396 g_print("\n%s\n",aline);
1397 if (!pswit[OVERVIEW_SWITCH])
1398 g_print(" Line %ld column %ld - Tilde character?\n",
1399 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1404 if (!eCarat && c==CHAR_CARAT)
1406 if (pswit[ECHO_SWITCH])
1407 g_print("\n%s\n",aline);
1408 if (!pswit[OVERVIEW_SWITCH])
1409 g_print(" Line %ld column %ld - Carat character?\n",
1410 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Forward slashes are only queried when warnings->fslash is set. */
1415 if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
1417 if (pswit[ECHO_SWITCH])
1418 g_print("\n%s\n",aline);
1419 if (!pswit[OVERVIEW_SWITCH])
1420 g_print(" Line %ld column %ld - Forward slash?\n",
1421 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1427 * Report asterisks only in paranoid mode,
1428 * since they're often deliberate.
1430 if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
1433 if (pswit[ECHO_SWITCH])
1434 g_print("\n%s\n",aline);
1435 if (!pswit[OVERVIEW_SWITCH])
1436 g_print(" Line %ld column %ld - Asterisk?\n",
1437 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1446 * check_for_long_line:
1448 * Check for line too long.
/*
 * Queries aline when its length, counted in UTF-8 characters rather
 * than bytes, exceeds LONGEST_PG_LINE.
 *
 * aline: the line to measure.
 */
1450 void check_for_long_line(const char *aline)
1452 if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
/* ECHO_SWITCH echoes the line; the report is printed unless OVERVIEW_SWITCH is set. */
1454 if (pswit[ECHO_SWITCH])
1455 g_print("\n%s\n",aline);
1456 if (!pswit[OVERVIEW_SWITCH])
/* The character count is used both as the column and as the reported length. */
1457 g_print(" Line %ld column %ld - Long line %ld\n",
1458 linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
1465 * check_for_short_line:
1467 * Check for line too short.
1469 * This one is a bit trickier to implement: we don't want to
1470 * flag the last line of a paragraph for being short, so we
1471 * have to wait until we know that our current line is a
1472 * "normal" line, then report the _previous_ line if it was too
1473 * short. We also don't want to report indented lines like
1474 * chapter heads or formatted quotations. We therefore keep
1475 * last->len as the length of the last line examined, and
1476 * last->blen as the length of the last but one, and try to
1477 * suppress unnecessary warnings by checking that both were of
1478 * "normal" length. We keep the first character of the last
1479 * line in last->start, and if it was a space, we assume that
1480 * the formatting is deliberate. I can't figure out a way to
1481 * distinguish something like a quoted verse left-aligned or
1482 * the header or footer of a letter from a paragraph of short
1483 * lines - maybe if I examined the whole paragraph, and if the
1484 * para has less than, say, 8 lines and if all lines are short,
1485 * then just assume it's OK? Need to look at some texts to see
1486 * how often a formula like this would get the right result.
/*
 * Queries the previous line (prevline) as too short, judged from the
 * current line and the recorded properties of the two lines before it.
 *
 * aline: the current line; it must hold more than one character.
 * last:  len   - length of the previous line,
 *        blen  - length of the line before that,
 *        start - first character of the previous line.
 */
1488 void check_for_short_line(const char *aline,const struct line_properties *last)
/*
 * All of: current line non-trivial, previous line non-trivial but
 * shorter than SHORTEST_PG_LINE, the line before it longer than
 * SHORTEST_PG_LINE, and the previous line not starting with a space.
 */
1490 if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
1491 last->len<SHORTEST_PG_LINE && last->blen>1 &&
1492 last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
1494 if (pswit[ECHO_SWITCH])
1495 g_print("\n%s\n",prevline);
1496 if (!pswit[OVERVIEW_SWITCH])
/* Reported against linecnt-1 because the short line is the previous one. */
1497 g_print(" Line %ld column %ld - Short line %ld?\n",
1498 linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
1505 * check_for_starting_punctuation:
1507 * Look for punctuation other than full ellipses at start of line.
/*
 * Queries a line whose first character is one of . ? ! , ; : unless the
 * line begins with the spaced ellipsis ". . .".
 *
 * aline: the line to test; an empty line is never queried.
 */
1509 void check_for_starting_punctuation(const char *aline)
1511 if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
1512 !g_str_has_prefix(aline,". . ."))
1514 if (pswit[ECHO_SWITCH])
1515 g_print("\n%s\n",aline);
1516 if (!pswit[OVERVIEW_SWITCH])
/* The column is fixed at 1 because only the first character is examined. */
1517 g_print(" Line %ld column 1 - Begins with punctuation?\n",
1525 * check_for_spaced_emdash:
1527 * Check for spaced em-dashes.
1529 * We must check _all_ occurrences of "--" on the line
1530 * hence the loop - even if the first double-dash is OK
1531 * there may be another that's wrong later on.
/*
 * Queries every "--" on the line that has a space immediately before
 * or immediately after it.
 *
 * aline: the line to scan.
 */
1533 void check_for_spaced_emdash(const char *aline)
1535 const char *s,*t,*next;
/* t is the current "--"; the search resumes from next, just past it. */
1536 for (s=aline;t=strstr(s,"--");s=next)
1538 next=g_utf8_next_char(g_utf8_next_char(t));
/* The look-behind is guarded by t>aline so a leading "--" never reads before the line. */
1539 if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
1540 g_utf8_get_char(next)==CHAR_SPACE)
1542 if (pswit[ECHO_SWITCH])
1543 g_print("\n%s\n",aline);
1544 if (!pswit[OVERVIEW_SWITCH])
1545 g_print(" Line %ld column %ld - Spaced em-dash?\n",
1546 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1554 * check_for_spaced_dash:
1556 * Check for spaced dashes.
/*
 * Queries a single dash with a space on one side. Only the first " -"
 * on the line is examined; "- " is searched for only when the line
 * contains no " -".
 *
 * aline: the line to scan.
 */
1558 void check_for_spaced_dash(const char *aline)
1561 if ((s=strstr(aline," -")))
/* Skip " --": the character two positions after the space must not be a second dash. */
1563 if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
1565 if (pswit[ECHO_SWITCH])
1566 g_print("\n%s\n",aline);
1567 if (!pswit[OVERVIEW_SWITCH])
1568 g_print(" Line %ld column %ld - Spaced dash?\n",
1569 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1574 else if ((s=strstr(aline,"- ")))
/* Skip "-- ": the dash must not be preceded by another dash. */
1576 if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
1578 if (pswit[ECHO_SWITCH])
1579 g_print("\n%s\n",aline);
1580 if (!pswit[OVERVIEW_SWITCH])
1581 g_print(" Line %ld column %ld - Spaced dash?\n",
1582 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1590 * check_for_unmarked_paragraphs:
1592 * Check for unmarked paragraphs indicated by separate speakers.
1594 * May well be false positive:
1595 * "Bravo!" "Wonderful!" called the crowd.
1596 * but useful all the same.
/*
 * Queries a line containing a closing double quote, whitespace and an
 * opening double quote, which may mark a missing paragraph break
 * between two speakers.
 *
 * aline: the line to scan.
 */
1598 void check_for_unmarked_paragraphs(const char *aline)
/*
 * NOTE(review): the two searches below read identically here;
 * presumably they differ in the amount of whitespace between the
 * quotes - confirm against the original file.
 */
1601 s=strstr(aline,"\" \"");
1603 s=strstr(aline,"\" \"");
1606 if (pswit[ECHO_SWITCH])
1607 g_print("\n%s\n",aline);
1608 if (!pswit[OVERVIEW_SWITCH])
1609 g_print(" Line %ld column %ld - "
1610 "Query missing paragraph break?\n",
1611 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1618 * check_for_jeebies:
1620 * Check for "to he" and other easy h/b errors.
1622 * This is a very inadequate effort on the h/b problem,
1623 * but the phrase "to he" is always an error, whereas "to
1624 * be" is quite common.
1625 * Similarly, '"Quiet!", be said.' is a non-be error
1626 * "to he" is _not_ always an error!:
1627 * "Where they went to he couldn't say."
1628 * Another false positive:
1629 * What would "Cinderella" be without the . . .
1630 * and another: "If he wants to he can see for himself."
/*
 * Searches aline for fixed phrases that usually come from an OCR
 * confusion between 'h' and 'b', in three groups, each with its own
 * query: he/be, had/bad and hut/but.
 *
 * aline: the line to scan. The phrases are matched with strstr(), so
 *        the match is case-sensitive and needs the surrounding spaces
 *        or punctuation to be on this line.
 */
1632 void check_for_jeebies(const char *aline)
/* Group 1: "be" where "he" was meant, and "to he" where "to be" was meant. */
1635 s=strstr(aline," be could ");
1637 s=strstr(aline," be would ");
1639 s=strstr(aline," was be ");
1641 s=strstr(aline," be is ");
1643 s=strstr(aline," is be ");
1645 s=strstr(aline,"\", be ");
/*
 * NOTE(review): the next two patterns read identically here;
 * presumably they differ in whitespace - confirm against the original file.
 */
1647 s=strstr(aline,"\" be ");
1649 s=strstr(aline,"\" be ");
1651 s=strstr(aline," to he ");
1654 if (pswit[ECHO_SWITCH])
1655 g_print("\n%s\n",aline);
1656 if (!pswit[OVERVIEW_SWITCH])
1657 g_print(" Line %ld column %ld - Query he/be error?\n",
1658 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Group 2: "had" after an article, or "bad" after a pronoun. */
1662 s=strstr(aline," the had ");
1664 s=strstr(aline," a had ");
1666 s=strstr(aline," they bad ");
1668 s=strstr(aline," she bad ");
1670 s=strstr(aline," he bad ");
1672 s=strstr(aline," you bad ");
1674 s=strstr(aline," i bad ");
1677 if (pswit[ECHO_SWITCH])
1678 g_print("\n%s\n",aline);
1679 if (!pswit[OVERVIEW_SWITCH])
1680 g_print(" Line %ld column %ld - Query had/bad error?\n",
1681 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Group 3: "hut" directly after a semicolon or comma. */
1685 s=strstr(aline,"; hut ");
1687 s=strstr(aline,", hut ");
1690 if (pswit[ECHO_SWITCH])
1691 g_print("\n%s\n",aline);
1692 if (!pswit[OVERVIEW_SWITCH])
1693 g_print(" Line %ld column %ld - Query hut/but error?\n",
1694 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1701 * check_for_mta_from:
1703 * Special case - angled bracket in front of "From" placed there by an
1704 * MTA when sending an e-mail.
/*
 * Queries a line containing the literal ">From".
 *
 * aline: the line to scan; the match may be anywhere on the line.
 */
1706 void check_for_mta_from(const char *aline)
1709 s=strstr(aline,">From");
1712 if (pswit[ECHO_SWITCH])
1713 g_print("\n%s\n",aline);
1714 if (!pswit[OVERVIEW_SWITCH])
/* The column reported is that of the '>' character. */
1715 g_print(" Line %ld column %ld - "
1716 "Query angled bracket with From\n",
1717 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1724 * check_for_orphan_character:
1726 * Check for a single character line -
1727 * often an overflow from bad wrapping.
/*
 * Queries a line that consists of exactly one character, unless that
 * character could be a numeral standing alone.
 *
 * aline: the line to test.
 */
1729 void check_for_orphan_character(const char *aline)
1732 c=g_utf8_get_char(aline);
/* Non-empty, and nothing follows the first character. */
1733 if (c && !*g_utf8_next_char(aline))
/* I, V, X and L cover single-letter Roman numerals; digits cover Arabic ones. */
1735 if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
1736 ; /* Nothing - ignore numerals alone on a line. */
1739 if (pswit[ECHO_SWITCH])
1740 g_print("\n%s\n",aline);
1741 if (!pswit[OVERVIEW_SWITCH])
1742 g_print(" Line %ld column 1 - Query single character line\n",
1751 * check_for_pling_scanno:
1753 * Check for I" - often should be !
/*
 * Queries a space followed by a capital I and a double quote, where the
 * I is often a misread exclamation mark.
 *
 * aline: the line to scan.
 */
1755 void check_for_pling_scanno(const char *aline)
1758 s=strstr(aline," I\"");
1761 if (pswit[ECHO_SWITCH])
1762 g_print("\n%s\n",aline);
1763 if (!pswit[OVERVIEW_SWITCH])
/*
 * NOTE(review): the column here is the raw character offset of the
 * match, without the +1 used by the neighbouring checks - confirm
 * whether that is intended.
 */
1764 g_print(" Line %ld column %ld - Query I=exclamation mark?\n",
1765 linecnt,g_utf8_pointer_to_offset(aline,s));
1772 * check_for_extra_period:
1774 * Check for period without a capital letter. Cut-down from gutspell.
1775 * Only works when it happens on a single line.
/*
 * Looks for a period followed by a space where the next word starts
 * with a lower-case letter, and queries it as a possible extra period.
 * The check runs under pswit[PARANOID_SWITCH].
 *
 * aline:    the line to scan; only ". " occurrences on this line are seen.
 * warnings: isDutch enables a special case for the "'s Middags" form.
 *
 * The word before the period is copied into testword and compared with
 * abbrev[], tested for digits, single characters, Roman numerals and
 * vowels, and looked up in qperiod so that a word is not queried twice
 * unless pswit[VERBOSE_SWITCH] is set.
 *
 * Fix applied in review: the forward scan used isdigit() on a gunichar.
 * isdigit() is only defined for unsigned char values and EOF, so any
 * non-ASCII character was undefined behaviour. It now uses
 * g_unichar_isdigit(), matching the digit test further down.
 */
1777 void check_for_extra_period(const char *aline,const struct warnings *warnings)
1779 const char *s,*t,*s1,*sprev;
1784 gunichar c,nc,pc,*decomposition;
1785 if (pswit[PARANOID_SWITCH])
1787 for (t=aline;t=strstr(t,". ");)
1791 t=g_utf8_next_char(t);
1792 /* start of line punctuation is handled elsewhere */
1795 if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
1797 t=g_utf8_next_char(t);
1800 if (warnings->isDutch)
1802 /* For Frank & Jeroen -- 's Middags case */
1803 gunichar c2,c3,c4,c5;
1804 c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
1805 c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
1806 c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
1807 c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
1808 if (CHAR_IS_APOSTROPHE(c2) &&
1809 g_unichar_islower(c3) && c4==CHAR_SPACE &&
1810 g_unichar_isupper(c5))
1812 t=g_utf8_next_char(t);
/* Skip forward from just past ". " to the next letter or digit. */
1816 s1=g_utf8_next_char(g_utf8_next_char(t));
1817 while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
1818 !g_unichar_isdigit(g_utf8_get_char(s1)))
1819 s1=g_utf8_next_char(s1);
1820 if (g_unichar_islower(g_utf8_get_char(s1)))
1822 /* we have something to investigate */
1824 /* so let's go back and find out */
1825 nc=g_utf8_get_char(t);
1826 s1=g_utf8_prev_char(t);
1827 c=g_utf8_get_char(s1);
1828 sprev=g_utf8_prev_char(s1);
1829 pc=g_utf8_get_char(sprev);
1831 (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||
1832 g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&
1833 g_unichar_isalpha(nc)))
1838 sprev=g_utf8_prev_char(s1);
1839 pc=g_utf8_get_char(sprev);
1841 s1=g_utf8_next_char(s1);
1844 testword=g_strndup(s1,s-s1);
1846 testword=g_strdup(s1);
1847 for (i=0;*abbrev[i];i++)
1848 if (!strcmp(testword,abbrev[i]))
1850 if (g_unichar_isdigit(g_utf8_get_char(testword)))
1852 if (!*g_utf8_next_char(testword))
1854 if (isroman(testword))
/* Vowel test on the base letter, so accented vowels count as well. */
1859 for (s=testword;*s;s=g_utf8_next_char(s))
1861 decomposition=g_unicode_canonical_decomposition(
1862 g_utf8_get_char(s),&len);
1863 if (g_utf8_strchr("aeiou",-1,decomposition[0]))
1865 g_free(decomposition);
1869 (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
1871 g_tree_insert(qperiod,g_strdup(testword),
1872 GINT_TO_POINTER(1));
1873 if (pswit[ECHO_SWITCH])
1874 g_print("\n%s\n",aline);
1875 if (!pswit[OVERVIEW_SWITCH])
1876 g_print(" Line %ld column %ld - Extra period?\n",
1877 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1883 t=g_utf8_next_char(t);
1889 * check_for_following_punctuation:
1891 * Check for words usually not followed by punctuation.
/*
 * Under pswit[TYPO_SWITCH], queries words from the nocomma[] and
 * noperiod[] lists when they are directly followed by punctuation.
 *
 * aline: the line to scan.
 */
1893 void check_for_following_punctuation(const char *aline)
1896 const char *s,*wordstart;
1899 if (pswit[TYPO_SWITCH])
/* The comparison is made on a lower-cased copy of the word. */
1910 inword=g_utf8_strdown(t,-1);
/* Words in nocomma[] are queried when the character at s is a comma, semicolon or colon. */
1912 for (i=0;*nocomma[i];i++)
1913 if (!strcmp(inword,nocomma[i]))
1915 c=g_utf8_get_char(s);
1916 if (c==',' || c==';' || c==':')
1918 if (pswit[ECHO_SWITCH])
1919 g_print("\n%s\n",aline);
1920 if (!pswit[OVERVIEW_SWITCH])
1921 g_print(" Line %ld column %ld - "
1922 "Query punctuation after %s?\n",
1923 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
/* Words in noperiod[] are queried when the character at s is a period or exclamation mark. */
1929 for (i=0;*noperiod[i];i++)
1930 if (!strcmp(inword,noperiod[i]))
1932 c=g_utf8_get_char(s);
1933 if (c=='.' || c=='!')
1935 if (pswit[ECHO_SWITCH])
1936 g_print("\n%s\n",aline);
1937 if (!pswit[OVERVIEW_SWITCH])
1938 g_print(" Line %ld column %ld - "
1939 "Query punctuation after %s?\n",
1940 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
1954 * Check for commonly mistyped words,
1955 * and digits like 0 for O in a word.
/*
 * Takes words from aline with getaword() and runs each through a
 * series of typo and scanno tests.
 *
 * aline:    the line to scan.
 * warnings: the digit flag enables the paranoid-mode standalone 0/1 query.
 *
 * Tests visible here: mixed digits (mixdigit), upper case in mid-word
 * with exceptions, the nostart[]/noend[] letter pairs, a set of unlikely
 * letter sequences, words with no vowel or no consonant, the typo[]
 * list, single-letter words, and the user's own typo list (usertypo).
 * okword[] and isroman() are consulted as exclusions. Words are looked
 * up in qword, and the report mentions "not reporting duplicates" when
 * pswit[VERBOSE_SWITCH] is off.
 */
1957 void check_for_typos(const char *aline,struct warnings *warnings)
1959 const char *s,*t,*nt,*wordstart;
1961 gunichar *decomposition;
1963 int i,vowel,consonant,*dupcnt;
1964 gboolean isdup,istypo,alower;
1967 gsize decomposition_len;
1971 inword=getaword(&s);
1975 continue; /* don't bother with empty lines */
1977 if (mixdigit(inword))
1979 if (pswit[ECHO_SWITCH])
1980 g_print("\n%s\n",aline);
1981 if (!pswit[OVERVIEW_SWITCH])
1982 g_print(" Line %ld column %ld - Query digit in %s\n",
1983 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
1988 * Put the word through a series of tests for likely typos and OCR
1991 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
1995 for (t=inword;*t;t=g_utf8_next_char(t))
1997 c=g_utf8_get_char(t);
1998 nt=g_utf8_next_char(t);
1999 /* lowercase for testing */
2000 if (g_unichar_islower(c))
2002 if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
2005 * We have an uppercase mid-word. However, there are
2007 * Mac and Mc like McGill
2008 * French contractions like l'Abbe
2010 offset=g_utf8_pointer_to_offset(inword,t);
2012 pc=g_utf8_get_char(g_utf8_prev_char(t));
2015 if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
2016 offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
2017 g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
2018 CHAR_IS_APOSTROPHE(pc))
/* The remaining tests work on a case-folded copy of the word. */
2024 testword=g_utf8_casefold(inword,-1);
2026 if (pswit[TYPO_SWITCH])
2029 * Check for certain unlikely two-letter combinations at word
2032 len=g_utf8_strlen(testword,-1);
2035 for (i=0;*nostart[i];i++)
2036 if (g_str_has_prefix(testword,nostart[i]))
2038 for (i=0;*noend[i];i++)
2039 if (g_str_has_suffix(testword,noend[i]))
2042 /* ght is common, gbt never. Like that. */
2043 if (strstr(testword,"cb"))
2045 if (strstr(testword,"gbt"))
2047 if (strstr(testword,"pbt"))
2049 if (strstr(testword,"tbs"))
2051 if (strstr(testword,"mrn"))
2053 if (strstr(testword,"ahle"))
2055 if (strstr(testword,"ihle"))
2058 * "TBE" does happen - like HEARTBEAT - but uncommon.
2059 * Also "TBI" - frostbite, outbid - but uncommon.
2060 * Similarly "ii" like Hawaii, or Pompeii, and in Roman
2061 * numerals, but "ii" is a common scanno.
2063 if (strstr(testword,"tbi"))
2065 if (strstr(testword,"tbe"))
2067 if (strstr(testword,"ii"))
2070 * Check for no vowels or no consonants.
2071 * If none, flag a typo.
2073 if (!istypo && len>1)
2076 for (t=testword;*t;t=g_utf8_next_char(t))
2078 c=g_utf8_get_char(t);
2080 g_unicode_canonical_decomposition(c,&decomposition_len);
2081 if (c=='y' || g_unichar_isdigit(c))
2083 /* Yah, this is loose. */
2087 else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
2091 g_free(decomposition);
2093 if (!vowel || !consonant)
2097 * Now exclude the word from being reported if it's in
2100 for (i=0;*okword[i];i++)
2101 if (!strcmp(testword,okword[i]))
2104 * What looks like a typo may be a Roman numeral.
2107 if (istypo && isroman(testword))
2109 /* Check the manual list of typos. */
2111 for (i=0;*typo[i];i++)
2112 if (!strcmp(testword,typo[i]))
2115 * Check lowercase s, l, i and m - special cases.
2116 * "j" - often a semi-colon gone wrong.
2117 * "d" for a missing apostrophe - he d
2120 if (!istypo && len==1 &&
2121 g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
/* qword holds a per-word counter, allocated the first time a word is stored. */
2125 dupcnt=g_tree_lookup(qword,testword);
2129 isdup=!pswit[VERBOSE_SWITCH];
2133 dupcnt=g_new0(int,1);
2134 g_tree_insert(qword,g_strdup(testword),dupcnt);
2139 if (pswit[ECHO_SWITCH])
2140 g_print("\n%s\n",aline);
2141 if (!pswit[OVERVIEW_SWITCH])
2143 g_print(" Line %ld column %ld - Query word %s",
2144 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
2146 if (!pswit[VERBOSE_SWITCH])
2147 g_print(" - not reporting duplicates");
2155 /* check the user's list of typos */
2156 if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
2158 if (pswit[ECHO_SWITCH])
2159 g_print("\n%s\n",aline);
2160 if (!pswit[OVERVIEW_SWITCH])
/*
 * NOTE(review): this report and the standalone 0/1 report below use
 * offset+2, while the other reports in this function use offset+1 -
 * confirm which is intended.
 */
2161 g_print(" Line %ld column %ld - Query possible scanno %s\n",
2162 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
2164 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
2166 if (pswit[PARANOID_SWITCH] && warnings->digit)
2168 /* In paranoid mode, query all 0 and 1 standing alone. */
2169 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
2171 if (pswit[ECHO_SWITCH])
2172 g_print("\n%s\n",aline);
2173 if (!pswit[OVERVIEW_SWITCH])
2174 g_print(" Line %ld column %ld - Query standalone %s\n",
2175 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
2186 * check_for_misspaced_punctuation:
2188 * Look for added or missing spaces around punctuation and quotes.
2189 * If there is a punctuation character like ! with no space on
2190 * either side, suspect a missing!space. If there are spaces on
2191 * both sides , assume a typo. If we see a double quote with no
2192 * space or punctuation on either side of it, assume unspaced
2193 * quotes "like"this.
/*
 * Makes several passes over aline looking for spacing errors around
 * punctuation and quotes.
 *
 * aline:       the line to scan.
 * parities:    running open/closed state for double and single quotes;
 *              dquote and squote are toggled here.
 * isemptyline: when TRUE the "Spaced punctuation?" queries of the first
 *              two passes are suppressed.
 *
 * Passes visible here, in order:
 *  1. . ? ! , ; : _ with letters around it ("Missing space?") or with
 *     space on both sides ("Spaced punctuation?"), with allowances for
 *     acronyms and ellipses.
 *  2. ? ! , ; : preceded by a space and not followed by one.
 *  3. A period preceded by a space and followed by a letter.
 *  4. Double quotes with no space or punctuation on either side.
 *  5. Double-quote parity, querying the character after each quote.
 *  6. A double quote at the start of the line followed by closing
 *     punctuation or a space.
 *  7. Single-quote parity, only under pswit[SQUOTE_SWITCH].
 *
 * Fix applied in review: the two "character after an opening quote"
 * tests used isdigit() on a gunichar. isdigit() is only defined for
 * unsigned char values and EOF, so they now use g_unichar_isdigit().
 */
2195 void check_for_misspaced_punctuation(const char *aline,
2196 struct parities *parities,gboolean isemptyline)
2198 gboolean isacro,isellipsis;
2200 gunichar c,nc,pc,n2c;
/* Each pass starts at the second character; the loop runs while a next character exists. */
2202 c=g_utf8_get_char(aline);
2203 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2204 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2208 nc=g_utf8_get_char(g_utf8_next_char(s));
2209 /* For each character in the line after the first. */
2210 if (g_utf8_strchr(".?!,;:_",-1,c)) /* if it's punctuation */
2212 /* we need to suppress warnings for acronyms like M.D. */
2214 /* we need to suppress warnings for ellipsis . . . */
2217 * If there are letters on both sides of it or
2218 * if it's strict punctuation followed by an alpha.
2220 if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
2221 g_utf8_strchr("?!,;:",-1,c)))
2225 if (g_utf8_pointer_to_offset(aline,s)>2 &&
2226 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
2228 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
2234 if (pswit[ECHO_SWITCH])
2235 g_print("\n%s\n",aline);
2236 if (!pswit[OVERVIEW_SWITCH])
2237 g_print(" Line %ld column %ld - Missing space?\n",
2238 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2243 if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
2246 * If there are spaces on both sides,
2247 * or space before and end of line.
2251 if (g_utf8_pointer_to_offset(aline,s)>2 &&
2252 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
2254 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
2258 if (!isemptyline && !isellipsis)
2260 if (pswit[ECHO_SWITCH])
2261 g_print("\n%s\n",aline);
2262 if (!pswit[OVERVIEW_SWITCH])
2263 g_print(" Line %ld column %ld - "
2264 "Spaced punctuation?\n",linecnt,
2265 g_utf8_pointer_to_offset(aline,s)+1);
2272 /* Split out the characters that CANNOT be preceded by space. */
2273 c=g_utf8_get_char(aline);
2274 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2275 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2279 nc=g_utf8_get_char(g_utf8_next_char(s));
2280 /* for each character in the line after the first */
2281 if (g_utf8_strchr("?!,;:",-1,c))
2283 /* if it's punctuation that _cannot_ have a space before it */
2284 if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
2287 * If nc DOES == space,
2288 * it was already reported just above.
2290 if (pswit[ECHO_SWITCH])
2291 g_print("\n%s\n",aline);
2292 if (!pswit[OVERVIEW_SWITCH])
2293 g_print(" Line %ld column %ld - Spaced punctuation?\n",
2294 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2301 * Special case " .X" where X is any alpha.
2302 * This plugs a hole in the acronym code above.
2303 * Inelegant, but maintainable.
2305 c=g_utf8_get_char(aline);
2306 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2307 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2311 nc=g_utf8_get_char(g_utf8_next_char(s));
2312 /* for each character in the line after the first */
2315 /* if it's a period */
2316 if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
2319 * If the period follows a space and
2320 * is followed by a letter.
2322 if (pswit[ECHO_SWITCH])
2323 g_print("\n%s\n",aline);
2324 if (!pswit[OVERVIEW_SWITCH])
2325 g_print(" Line %ld column %ld - Spaced punctuation?\n",
2326 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Pass 4: double quotes jammed between two words. */
2332 c=g_utf8_get_char(aline);
2333 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2334 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2338 nc=g_utf8_get_char(g_utf8_next_char(s));
2339 /* for each character in the line after the first */
2340 if (CHAR_IS_DQUOTE(c))
2342 if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
2343 !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
2344 !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
2346 if (pswit[ECHO_SWITCH])
2347 g_print("\n%s\n",aline);
2348 if (!pswit[OVERVIEW_SWITCH])
2349 g_print(" Line %ld column %ld - Unspaced quotes?\n",
2350 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2356 /* Check parity of quotes. */
2357 nc=g_utf8_get_char(aline);
2358 for (s=aline;*s;s=g_utf8_next_char(s))
2361 nc=g_utf8_get_char(g_utf8_next_char(s));
2362 if (CHAR_IS_DQUOTE(c))
2366 parities->dquote=!parities->dquote;
2367 parity=parities->dquote;
2369 else if (c==CHAR_LD_QUOTE)
2376 if (!g_utf8_strchr("_-.'`/,;:!?)]} ",-1,nc))
2378 if (pswit[ECHO_SWITCH])
2379 g_print("\n%s\n",aline);
2380 if (!pswit[OVERVIEW_SWITCH])
2381 g_print(" Line %ld column %ld - "
2382 "Wrongspaced quotes?\n",
2383 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Unicode-aware digit test; nc is a full gunichar. */
2391 if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
2392 !g_utf8_strchr("_-/.'`([{$",-1,nc) || !nc)
2394 if (pswit[ECHO_SWITCH])
2395 g_print("\n%s\n",aline);
2396 if (!pswit[OVERVIEW_SWITCH])
2397 g_print(" Line %ld column %ld - "
2398 "Wrongspaced quotes?\n",
2399 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Pass 6: a double quote as the very first character of the line. */
2406 c=g_utf8_get_char(aline);
2407 if (CHAR_IS_DQUOTE(c))
2409 if (g_utf8_strchr(",;:!?)]} ",-1,
2410 g_utf8_get_char(g_utf8_next_char(aline))))
2412 if (pswit[ECHO_SWITCH])
2413 g_print("\n%s\n",aline);
2414 if (!pswit[OVERVIEW_SWITCH])
2415 g_print(" Line %ld column 1 - Wrongspaced quotes?\n",
2421 if (pswit[SQUOTE_SWITCH])
2423 nc=g_utf8_get_char(aline);
2424 for (s=aline;*s;s=g_utf8_next_char(s))
2427 nc=g_utf8_get_char(g_utf8_next_char(s));
/* Only single quotes that are not between two letters take part in the parity. */
2428 if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&
2429 !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
2430 !g_unichar_isalpha(nc)))
2432 parities->squote=!parities->squote;
2433 if (!parities->squote)
2436 if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
2438 if (pswit[ECHO_SWITCH])
2439 g_print("\n%s\n",aline);
2440 if (!pswit[OVERVIEW_SWITCH])
2441 g_print(" Line %ld column %ld - "
2442 "Wrongspaced singlequotes?\n",
2443 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Unicode-aware digit test; nc is a full gunichar. */
2451 if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
2452 !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
2454 if (pswit[ECHO_SWITCH])
2455 g_print("\n%s\n",aline);
2456 if (!pswit[OVERVIEW_SWITCH])
2457 g_print(" Line %ld column %ld - "
2458 "Wrongspaced singlequotes?\n",
2459 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2470 * check_for_double_punctuation:
2472 * Look for double punctuation like ,. or ,,
2473 * Thanks to DW for the suggestion!
2474 * In books with references, ".," and ".;" are common
2475 * e.g. "etc., etc.," and vol. 1.; vol 3.;
2476 * OTOH, from my initial tests, there are also fairly
2477 * common errors. What to do? Make these cases paranoid?
2478 * ".," is the most common, so warnings->dotcomma is used
2479 * to suppress detailed reporting if it occurs often.
/*
 * Queries two adjacent punctuation characters from the set . ? ! , ; :
 * unless the pair matches one of the exceptions tested below.
 *
 * aline:    the line to scan.
 * warnings: dotcomma is read for the ".," pair; isFrench enables the
 *           exceptions for an ellipsis next to , ; : ! or ?.
 */
2481 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
2485 nc=g_utf8_get_char(aline);
2486 for (s=aline;*s;s=g_utf8_next_char(s))
2489 nc=g_utf8_get_char(g_utf8_next_char(s));
2490 /* for each punctuation character in the line */
2491 if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
2492 g_utf8_strchr(".?!,;:",-1,nc))
2494 /* followed by punctuation, it's a query, unless . . . */
/*
 * Exceptions: a doubled . ? or !, the pair ".," while
 * warnings->dotcomma is clear, and the French ellipsis combinations.
 */
2495 if (c==nc && (c=='.' || c=='?' || c=='!') ||
2496 !warnings->dotcomma && c=='.' && nc==',' ||
2497 warnings->isFrench && g_str_has_prefix(s,",...") ||
2498 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2499 warnings->isFrench && g_str_has_prefix(s,";...") ||
2500 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2501 warnings->isFrench && g_str_has_prefix(s,":...") ||
2502 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2503 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2504 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2505 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2506 warnings->isFrench && g_str_has_prefix(s,"...?"))
/* The French ellipsis cases are singled out again and nc is re-read afterwards. */
2508 if (warnings->isFrench && g_str_has_prefix(s,",...") ||
2509 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2510 warnings->isFrench && g_str_has_prefix(s,";...") ||
2511 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2512 warnings->isFrench && g_str_has_prefix(s,":...") ||
2513 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2514 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2515 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2516 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2517 warnings->isFrench && g_str_has_prefix(s,"...?"))
2520 nc=g_utf8_get_char(g_utf8_next_char(s));
2522 ; /* do nothing for .. !! and ?? which can be legit */
2526 if (pswit[ECHO_SWITCH])
2527 g_print("\n%s\n",aline);
2528 if (!pswit[OVERVIEW_SWITCH])
2529 g_print(" Line %ld column %ld - Double punctuation?\n",
2530 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2539 * check_for_spaced_quotes:
/*
 * Queries every quote mark that has a space on both sides of it: first
 * the ASCII double quote, then each character listed in single_quotes[].
 *
 * aline: the line to scan.
 */
2541 void check_for_spaced_quotes(const char *aline)
2545 const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,
2549 while ((t=strstr(s," \" ")))
2551 if (pswit[ECHO_SWITCH])
2552 g_print("\n%s\n",aline);
2553 if (!pswit[OVERVIEW_SWITCH])
2554 g_print(" Line %ld column %ld - Spaced doublequote?\n",
2555 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
/* Resume two characters past the match so that its trailing space can start the next match. */
2558 s=g_utf8_next_char(g_utf8_next_char(t));
/* One pattern buffer is reused: space, quote character, space. */
2560 pattern=g_string_new(NULL);
2561 for(i=0;i<G_N_ELEMENTS(single_quotes);i++)
2563 g_string_assign(pattern," ");
2564 g_string_append_unichar(pattern,single_quotes[i]);
2565 g_string_append_c(pattern,' ');
2567 while ((t=strstr(s,pattern->str)))
2569 if (pswit[ECHO_SWITCH])
2570 g_print("\n%s\n",aline);
2571 if (!pswit[OVERVIEW_SWITCH])
2572 g_print(" Line %ld column %ld - Spaced singlequote?\n",
2573 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2576 s=g_utf8_next_char(g_utf8_next_char(t));
/* TRUE also frees the character data held by the GString. */
2579 g_string_free(pattern,TRUE);
2583 * check_for_miscased_genative:
2585 * Check special case of 'S instead of 's at end of word.
/*
 * Queries an apostrophe that follows a lower-case letter and is itself
 * followed by a capital S.
 *
 * aline: the line to scan, starting from its second character.
 */
2587 void check_for_miscased_genative(const char *aline)
2593 c=g_utf8_get_char(aline);
2594 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2595 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2599 nc=g_utf8_get_char(g_utf8_next_char(s));
2600 if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))
2602 if (pswit[ECHO_SWITCH])
2603 g_print("\n%s\n",aline);
2604 if (!pswit[OVERVIEW_SWITCH])
/* +2 rather than +1: the column reported is that of the character after s. */
2605 g_print(" Line %ld column %ld - Capital \"S\"?\n",
2606 linecnt,g_utf8_pointer_to_offset(aline,s)+2);
2614 * check_end_of_line:
2616 * Now check special cases - start and end of line -
2617 * for single and double quotes. Start is sometimes [sic]
2618 * but better to query it anyway.
2619 * While we're here, check for dash at end of line.
/*
 * Query suspicious characters at the two ends of a line.
 *
 * aline:    NUL-terminated line to examine.
 * warnings: only warnings->hyphen is read here; it gates the hyphen query.
 *
 * Up to three queries can be printed: a quote preceded by a space at the
 * end of the line, a single quote followed by a space at the start of the
 * line, and (with PARANOID_SWITCH) a lone hyphen as last visible character.
 * Each query echoes the line when ECHO_SWITCH is set and is printed unless
 * OVERVIEW_SWITCH is set.
 */
2621 void check_end_of_line(const char *aline,struct warnings *warnings)
2626 lbytes=strlen(aline);
/* The end-of-line test needs both a last and a next-to-last character. */
2627 if (g_utf8_strlen(aline,lbytes)>1)
2629 s=g_utf8_prev_char(aline+lbytes);
2630 c1=g_utf8_get_char(s);
2631 c2=g_utf8_get_char(g_utf8_prev_char(s));
2632 if ((CHAR_IS_DQUOTE(c1) || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)
2634 if (pswit[ECHO_SWITCH])
2635 g_print("\n%s\n",aline);
2636 if (!pswit[OVERVIEW_SWITCH])
/* The column is the character count of the line, i.e. the final quote. */
2637 g_print(" Line %ld column %ld - Spaced quote?\n",linecnt,
2638 g_utf8_strlen(aline,lbytes));
/* Start of line: only single quotes are tested here, not double quotes. */
2642 c1=g_utf8_get_char(aline);
2643 c2=g_utf8_get_char(g_utf8_next_char(aline));
2644 if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)
2646 if (pswit[ECHO_SWITCH])
2647 g_print("\n%s\n",aline);
2648 if (!pswit[OVERVIEW_SWITCH])
2649 g_print(" Line %ld column 1 - Spaced quote?\n",linecnt);
2654 * Dash at end of line may well be legit - paranoid mode only
2655 * and don't report em-dash at line-end.
2657 if (pswit[PARANOID_SWITCH] && warnings->hyphen)
/*
 * Walk back from the last character over anything <= CHAR_SPACE so that s
 * rests on the last visible character, or on aline itself.
 * NOTE(review): g_utf8_prev_char(aline+lbytes) assumes a non-empty line;
 * whether an enclosing guard ensures that cannot be told from these lines.
 */
2659 for (s=g_utf8_prev_char(aline+lbytes);
2660 s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
/*
 * A single trailing hyphen is queried; a hyphen preceded by another
 * hyphen is not.
 * NOTE(review): the column is printed without the +1 that the other
 * column computations in this file add - confirm that this is intended.
 */
2662 if (g_utf8_get_char(s)=='-' &&
2663 g_utf8_get_char(g_utf8_prev_char(s))!='-')
2665 if (pswit[ECHO_SWITCH])
2666 g_print("\n%s\n",aline);
2667 if (!pswit[OVERVIEW_SWITCH])
2668 g_print(" Line %ld column %ld - "
2669 "Hyphen at end of line?\n",
2670 linecnt,g_utf8_pointer_to_offset(aline,s));
2677 * check_for_unspaced_bracket:
2679 * Brackets are often unspaced, but shouldn't be surrounded by alpha.
2680 * If so, suspect a scanno like "a]most".
/*
 * Query a bracket character that has an alphabetic character on both sides.
 *
 * aline: NUL-terminated line to scan, walked one UTF-8 character at a time.
 *
 * The bracket set is the six characters { [ ( ) ] }. c and nc are primed
 * from the first two characters and the loop ends when nc becomes 0.
 * NOTE(review): pc is presumably the character before c; the statements
 * that shift pc and c are not on the visible lines - confirm.
 */
2682 void check_for_unspaced_bracket(const char *aline)
2686 c=g_utf8_get_char(aline);
/* Only look at the second character when the line is not empty. */
2687 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2688 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2692 nc=g_utf8_get_char(g_utf8_next_char(s));
2695 /* for each bracket character in the line except 1st & last */
2696 if (g_utf8_strchr("{[()]}",-1,c) &&
2697 g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
2699 if (pswit[ECHO_SWITCH])
2700 g_print("\n%s\n",aline);
2701 if (!pswit[OVERVIEW_SWITCH])
/* The column printed is the plain character offset of s (no +1 here). */
2702 g_print(" Line %ld column %ld - Unspaced bracket?\n",
2703 linecnt,g_utf8_pointer_to_offset(aline,s));
2711 * check_for_unpunctuated_endquote:
/*
 * Query a closing or neutral double quote whose pc character is a letter,
 * i.e. a quotation that appears to end without punctuation.
 *
 * aline: NUL-terminated line to scan, walked one UTF-8 character at a time.
 *
 * NOTE(review): pc is presumably the character before c; the statements
 * that shift pc and c are not on the visible lines - confirm.
 */
2713 void check_for_unpunctuated_endquote(const char *aline)
2718 c=g_utf8_get_char(aline);
/* Only look at the second character when the line is not empty. */
2719 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2720 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
/* Characters that are not double quotes are classed INVALID_QUOTE. */
2724 qc=CHAR_IS_DQUOTE(c)?QUOTE_CLASS(c):INVALID_QUOTE;
2725 nc=g_utf8_get_char(g_utf8_next_char(s));
2726 /* for each character in the line except 1st */
/*
 * NOTE(review): isalpha() is applied to pc here, while the sibling checks
 * use g_unichar_isalpha(); if pc is a gunichar above the unsigned char
 * range the isalpha() call is not well defined - confirm and align.
 */
2727 if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) && isalpha(pc))
2729 if (pswit[ECHO_SWITCH])
2730 g_print("\n%s\n",aline);
2731 if (!pswit[OVERVIEW_SWITCH])
/* The column printed is the plain character offset of s (no +1 here). */
2732 g_print(" Line %ld column %ld - "
2733 "endquote missing punctuation?\n",
2734 linecnt,g_utf8_pointer_to_offset(aline,s));
2742 * check_for_html_tag:
2744 * Check for <HTML TAG>.
2746 * If there is a < in the line, followed at some point
2747 * by a > then we suspect HTML.
/*
 * Query text that looks like an HTML tag: the first < on the line together
 * with the first > found after it.
 *
 * aline: NUL-terminated line to scan.
 *
 * Only the first such pair on the line is considered by the visible code.
 * NOTE(review): the tests for a missing < or > are not on the visible
 * lines; both strchr results are presumably checked before use - confirm.
 */
2749 void check_for_html_tag(const char *aline)
2751 const char *open,*close;
2753 open=strchr(aline,'<');
/* The search for > starts one character after the <. */
2756 close=strchr(g_utf8_next_char(open),'>');
2759 if (pswit[ECHO_SWITCH])
2760 g_print("\n%s\n",aline);
2761 if (!pswit[OVERVIEW_SWITCH])
/*
 * Copy the candidate tag including both angle brackets. The copy comes
 * from g_strndup; its release is not on the visible lines.
 */
2763 tag=g_strndup(open,close-open+1);
/* The column is the 1-based character position of the <. */
2764 g_print(" Line %ld column %ld - HTML Tag? %s \n",
2765 linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
2775 * check_for_html_entity:
2777 * Check for &symbol; HTML.
2779 * If there is a & in the line, followed at
2780 * some point by a ; then we suspect HTML.
/*
 * Query text that looks like an HTML entity: the first ampersand on the
 * line and the first semicolon at or after it, with no space in between.
 *
 * aline: NUL-terminated line to scan.
 *
 * NOTE(review): the tests for a missing ampersand or semicolon are not on
 * the visible lines; both strchr results are presumably checked - confirm.
 */
2782 void check_for_html_entity(const char *aline)
2784 const char *s,*amp,*scolon;
2786 amp=strchr(aline,'&');
2789 scolon=strchr(amp,';');
/* Leave the loop early, with s short of scolon, when a space intervenes. */
2792 for (s=amp;s<scolon;s=g_utf8_next_char(s))
2793 if (g_utf8_get_char(s)==CHAR_SPACE)
2794 break; /* Don't report "Jones & Son;" */
2797 if (pswit[ECHO_SWITCH])
2798 g_print("\n%s\n",aline);
2799 if (!pswit[OVERVIEW_SWITCH])
/*
 * Copy the candidate entity including the ampersand and the semicolon.
 * The copy comes from g_strndup; its release is not on the visible lines.
 */
2801 entity=g_strndup(amp,scolon-amp+1);
/*
 * NOTE(review): this column is a byte offset (amp-aline), whereas
 * check_for_html_tag reports a character offset through
 * g_utf8_pointer_to_offset; the two differ on lines that contain
 * multi-byte characters before the ampersand - confirm and align.
 */
2802 g_print(" Line %ld column %d - HTML symbol? %s \n",
2803 linecnt,(int)(amp-aline)+1,entity);
2814 * check_for_omitted_punctuation:
2816 * Check for omitted punctuation at end of paragraph by working back
2817 * through prevline. DW.
2818 * Need to check this only for "normal" paras.
2819 * So what is a "normal" para?
2820 * Not normal if one-liner (chapter headings, etc.)
2821 * Not normal if doesn't contain at least one locase letter
2822 * Not normal if starts with space
/*
 * Query a paragraph whose final line ends in a letter rather than in
 * punctuation.
 *
 * prevline:        NUL-terminated text of the previous line, which is the
 *                  line that gets examined and reported.
 * last:            only last->blen is read here.
 * start_para_line: line number at which the current paragraph started.
 *
 * The query is reported against line linecnt-1.
 */
2824 void check_for_omitted_punctuation(const char *prevline,
2825 struct line_properties *last,int start_para_line)
2827 gboolean letter_on_line=FALSE;
2830 gboolean closing_quote;
/* Step 1: does the previous line contain any alphabetic character? */
2831 for (s=prevline;*s;s=g_utf8_next_char(s))
2832 if (g_unichar_isalpha(g_utf8_get_char(s)))
2834 letter_on_line=TRUE;
2838 * This next "if" is a problem.
2839 * If we say "start_para_line <= linecnt - 1", that includes
2840 * one-line "paragraphs" like chapter heads. Lotsa false positives.
2841 * If we say "start_para_line < linecnt - 1" it doesn't, but then it
2842 * misses genuine one-line paragraphs.
2844 if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
2845 g_utf8_get_char(prevline)>CHAR_SPACE)
/*
 * Step 2: starting from the terminator, step back over trailing
 * closing or neutral quote characters.
 * NOTE(review): the branch that sets closing_quote to TRUE is not on the
 * visible lines - confirm.
 */
2847 s=prevline+strlen(prevline);
2850 s=g_utf8_prev_char(s);
2851 c=g_utf8_get_char(s);
2852 if (QUOTE_CLASS(c)==CLOSING_QUOTE || QUOTE_CLASS(c)==NEUTRAL_QUOTE)
2855 closing_quote=FALSE;
2856 } while (closing_quote && s>prevline);
/*
 * Step 3: continue backwards; the loop condition s>prevline means the
 * first character of the line is never examined by this loop.
 */
2857 for (;s>prevline;s=g_utf8_prev_char(s))
2859 if (g_unichar_isalpha(g_utf8_get_char(s)))
2861 if (pswit[ECHO_SWITCH])
2862 g_print("\n%s\n",prevline);
2863 if (!pswit[OVERVIEW_SWITCH])
/* The column is the character length of the previous line. */
2864 g_print(" Line %ld column %ld - "
2865 "No punctuation at para end?\n",
2866 linecnt-1,g_utf8_strlen(prevline,-1));
/*
 * These characters count as acceptable paragraph-end punctuation.
 * NOTE(review): the statement controlled by this test (presumably a
 * break) is not on the visible lines - confirm.
 */
2871 if (g_utf8_strchr("-.:!([{?}])",-1,g_utf8_get_char(s)))
/*
 * Traversal callback that prints a note about a queried word that was
 * seen more than once. procfile passes it to g_tree_foreach over qword.
 *
 * key:   the tree key, used as the word text.
 * value: the tree value for that key.
 * data:  user data from g_tree_foreach; procfile passes NULL.
 *
 * NOTE(review): how value is read, the g_print arguments and the return
 * value are not on the visible lines.
 */
2877 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
2879 const char *word=key;
2882 g_print("\nNote: Queried word %s was duplicated %d times\n",
/*
 * Write a UTF-8 string to stdout converted to WINDOWS-1252.
 *
 * string: NUL-terminated text to print. procfile also calls this function
 *         with NULL when it finishes.
 *
 * The iconv descriptor is kept in a function-local static so that it is
 * opened at most once and then reused by later calls.
 */
2887 void print_as_windows_1252(const char *string)
2889 gsize inbytes,outbytes;
2891 static GIConv converter=(GIConv)-1;
/*
 * Close and forget the cached converter.
 * NOTE(review): the guard around these three lines is not visible; it is
 * presumably the string==NULL case used by procfile - confirm.
 */
2894 if (converter!=(GIConv)-1)
2895 g_iconv_close(converter);
2896 converter=(GIConv)-1;
/* Open the converter lazily on first use. */
2899 if (converter==(GIConv)-1)
2900 converter=g_iconv_open("WINDOWS-1252","UTF-8");
2901 if (converter!=(GIConv)-1)
/* The output buffer gets as many bytes as the input plus a terminator. */
2903 inbytes=outbytes=strlen(string);
2904 bp=buf=g_malloc(outbytes+1);
/*
 * The cast drops const because g_iconv takes a non-const input pointer.
 * NOTE(review): the g_iconv return value is not checked on this line, so
 * an unconvertible character would go unnoticed here - confirm handling.
 */
2905 g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
/*
 * Print the string unconverted.
 * NOTE(review): presumably the fallback for a converter that could not be
 * opened; the else line is not visible - confirm.
 */
2911 fputs(string,stdout);
/*
 * Write a string to stdout unchanged.
 *
 * string: NUL-terminated text; it is passed straight to fputs, so no
 *         conversion is applied and no newline is appended.
 */
2914 void print_as_utf_8(const char *string)
2916 fputs(string,stdout);
/*
 * Run every check over one input file and print the resulting queries.
 *
 * filename: name handed to read_etext and echoed in the report heading.
 *
 * The text is read in one piece, summarised by first_pass, and then
 * processed line by line: each body line goes through the individual
 * check_for_* routines, and paragraph-level checks run on a line for
 * which analyse_quotes returned TRUE (stored in isemptyline).
 */
2924 void procfile(const char *filename)
2927 gchar *parastart=NULL; /* first line of current para */
2928 gchar *etext,*aline;
2931 struct first_pass_results *first_pass_results;
2932 struct warnings *warnings;
2933 struct counters counters={0};
2934 struct line_properties last={0};
2935 struct parities parities={0};
2936 struct pending pending={0};
2937 gboolean isemptyline;
2938 long start_para_line=0;
2939 gboolean isnewpara=FALSE,enddash=FALSE;
2940 last.start=CHAR_SPACE;
2941 linecnt=checked_linecnt=0;
/* Load the whole file; a failure is described by err. */
2942 etext=read_etext(filename,&err);
/*
 * The read error goes to stdout when STDOUT_SWITCH is set.
 * NOTE(review): the stderr line is presumably the else branch - confirm.
 */
2945 if (pswit[STDOUT_SWITCH])
2946 fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
2948 fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
2951 g_print("\n\nFile: %s\n\n",filename);
2952 first_pass_results=first_pass(etext);
2953 warnings=report_first_pass(first_pass_results);
/*
 * Both trees are keyed by strings ordered with strcmp. qword frees its
 * keys and its values with g_free; qperiod frees its keys only.
 */
2954 qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
2955 qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
2957 * Here we go with the main pass. Hold onto yer hat!
/* flgets is told the number of the line it is about to return. */
2961 while ((aline=flgets(&etext_ptr,linecnt+1)))
2966 if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
2967 continue; // skip DP page separators completely
/*
 * Lines before firstline, or after footerline when that is positive, get
 * no checks; with HEADER_SWITCH a few metadata lines are echoed.
 */
2968 if (linecnt<first_pass_results->firstline ||
2969 (first_pass_results->footerline>0 &&
2970 linecnt>first_pass_results->footerline))
2972 if (pswit[HEADER_SWITCH])
2974 if (g_str_has_prefix(aline,"Title:"))
2975 g_print(" %s\n",aline);
2976 if (g_str_has_prefix(aline,"Author:"))
2977 g_print(" %s\n",aline);
2978 if (g_str_has_prefix(aline,"Release Date:"))
2979 g_print(" %s\n",aline);
2980 if (g_str_has_prefix(aline,"Edition:"))
2981 g_print(" %s\n\n",aline);
2983 continue; /* skip through the header */
2986 print_pending(aline,parastart,&pending);
2987 isemptyline=analyse_quotes(aline,linecnt,&counters);
2988 if (isnewpara && !isemptyline)
2990 /* This line is the start of a new paragraph. */
2991 start_para_line=linecnt;
2992 /* Capture its first line in case we want to report it later. */
2994 parastart=g_strdup(aline);
2995 memset(&parities,0,sizeof(parities)); /* restart the quote count */
/* Skip forward to the first letter or digit of the paragraph. */
2997 while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
2998 !g_unichar_isdigit(g_utf8_get_char(s)))
2999 s=g_utf8_next_char(s);
3000 if (g_unichar_islower(g_utf8_get_char(s)))
3002 /* and its first letter is lowercase */
3003 if (pswit[ECHO_SWITCH])
3004 g_print("\n%s\n",aline);
3005 if (!pswit[OVERVIEW_SWITCH])
3006 g_print(" Line %ld column %ld - "
3007 "Paragraph starts with lower-case\n",
3008 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
3012 isnewpara=FALSE; /* Signal the end of new para processing. */
3014 /* Check for an em-dash broken at line end. */
3015 if (enddash && g_utf8_get_char(aline)=='-')
3017 if (pswit[ECHO_SWITCH])
3018 g_print("\n%s\n",aline);
3019 if (!pswit[OVERVIEW_SWITCH])
3020 g_print(" Line %ld column 1 - Broken em-dash?\n",linecnt);
/*
 * Find the last character that is not a trailing space and test it for a
 * hyphen.
 * NOTE(review): the loop reads the character at s before testing s>aline,
 * and g_utf8_prev_char(aline+strlen(aline)) assumes a non-empty line; on
 * an empty line s would start before the buffer - confirm that callers
 * or an unseen guard rule this out.
 */
3025 for (s=g_utf8_prev_char(aline+strlen(aline));
3026 g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
3028 if (s>=aline && g_utf8_get_char(s)=='-')
3030 check_for_control_characters(aline);
3031 check_for_odd_characters(aline,warnings,isemptyline);
/* Line-length checks run only when the first pass enabled them. */
3032 if (warnings->longline)
3033 check_for_long_line(aline);
3034 if (warnings->shortline)
3035 check_for_short_line(aline,&last);
/* Record this line: length in characters and first character. */
3037 last.len=g_utf8_strlen(aline,-1);
3038 last.start=g_utf8_get_char(aline);
3039 check_for_starting_punctuation(aline);
3042 check_for_spaced_emdash(aline);
3043 check_for_spaced_dash(aline);
3045 check_for_unmarked_paragraphs(aline);
3046 check_for_jeebies(aline);
3047 check_for_mta_from(aline);
3048 check_for_orphan_character(aline);
3049 check_for_pling_scanno(aline);
3050 check_for_extra_period(aline,warnings);
3051 check_for_following_punctuation(aline);
3052 check_for_typos(aline,warnings);
3053 check_for_misspaced_punctuation(aline,&parities,isemptyline);
3054 check_for_double_punctuation(aline,warnings);
3055 check_for_spaced_quotes(aline);
3056 check_for_miscased_genative(aline);
3057 check_end_of_line(aline,warnings);
3058 check_for_unspaced_bracket(aline);
3059 if (warnings->endquote)
3060 check_for_unpunctuated_endquote(aline);
3061 check_for_html_tag(aline);
3062 check_for_html_entity(aline);
/*
 * Paragraph-level checks.
 * NOTE(review): the condition guarding them is not on the visible lines.
 */
3065 check_for_mismatched_quotes(&counters,&pending);
3066 counters_reset(&counters);
3067 /* let the next iteration know that it's starting a new para */
3070 check_for_omitted_punctuation(prevline,&last,start_para_line);
/*
 * Keep a private copy of this line for the next iteration.
 * NOTE(review): the release of the previous copy is not on the visible
 * lines - confirm.
 */
3073 prevline=g_strdup(aline);
/* End of input: flush the quote state of the final paragraph. */
3076 check_for_mismatched_quotes(&counters,&pending);
3077 print_pending(NULL,parastart,&pending);
3078 reset_pending(&pending);
/* Duplicate-query notes are printed unless overview or verbose is set. */
3087 if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])
3088 g_tree_foreach(qword,report_duplicate_queries,NULL);
3089 g_tree_unref(qword);
3090 g_tree_unref(qperiod);
3091 counters_destroy(&counters);
/* Restore the default print handler before the final calls. */
3092 g_set_print_handler(NULL);
3093 print_as_windows_1252(NULL);
3094 if (pswit[MARKUP_SWITCH])
3101 * Get one line from the input text, checking for
3102 * the existence of exactly one CR/LF line-end per line.
3104 * Returns: a pointer to the line.
/*
 * Take the next line from the in-memory text and query irregular line
 * endings on the way.
 *
 * etext: in/out cursor into the text; it is advanced one UTF-8 character
 *        at a time as the line is consumed.
 * lcnt:  line number printed in the line-end queries.
 *
 * The line starts at the position *etext had on entry, so the caller
 * receives a pointer into its own buffer rather than a copy. procfile
 * treats a NULL result as the end of the text.
 * NOTE(review): the assignments to eos and isCR, the line termination and
 * the return statements are not on the visible lines.
 */
3106 char *flgets(char **etext,long lcnt)
3109 gboolean isCR=FALSE;
3110 char *theline=*etext;
/* Fetch one character and move the caller cursor past it. */
3115 c=g_utf8_get_char(*etext);
3116 *etext=g_utf8_next_char(*etext);
3119 /* either way, it's end of line */
3126 /* Error - a LF without a preceding CR */
3127 if (pswit[LINE_END_SWITCH])
3129 if (pswit[ECHO_SWITCH])
/* Echo needs a terminated copy of the text between theline and eos. */
3131 s=g_strndup(theline,eos-theline);
3132 g_print("\n%s\n",s);
3135 if (!pswit[OVERVIEW_SWITCH])
3136 g_print(" Line %ld - No CR?\n",lcnt);
3147 /* Error - two successive CRs */
3148 if (pswit[LINE_END_SWITCH])
3150 if (pswit[ECHO_SWITCH])
3152 s=g_strndup(theline,eos-theline);
3153 g_print("\n%s\n",s);
3156 if (!pswit[OVERVIEW_SWITCH])
3157 g_print(" Line %ld - Two successive CRs?\n",lcnt);
/* A pending CR that is followed by an ordinary character. */
3166 if (pswit[LINE_END_SWITCH] && isCR)
3168 if (pswit[ECHO_SWITCH])
3170 s=g_strndup(theline,eos-theline);
3171 g_print("\n%s\n",s);
3174 if (!pswit[OVERVIEW_SWITCH])
3175 g_print(" Line %ld column %ld - CR without LF?\n",
3176 lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
3182 eos=g_utf8_next_char(eos);
/* Optional clean-up passes; both receive the line as a writable buffer. */
3186 if (pswit[MARKUP_SWITCH])
3187 postprocess_for_HTML(theline);
3188 if (pswit[DP_SWITCH])
3189 postprocess_for_DP(theline);
3196 * Takes a "word" as a parameter, and checks whether it
3197 * contains a mixture of alpha and digits. Generally, this is an
3198 * error, but may not be for cases like 4th or L5 12s. 3d.
3200 * Returns: TRUE iff an is error found.
/*
 * Decide whether a word that mixes letters and digits should be queried.
 *
 * checkword: NUL-terminated word to classify.
 *
 * The word is first scanned for at least one letter and one digit. Words
 * that have both are then compared against a list of accepted shapes:
 * digits with an ordinal suffix, digits with a currency letter, and a
 * capital L followed by digits only.
 * NOTE(review): the statements that set the flags and query, and the
 * return statements, are not on the visible lines.
 */
3202 gboolean mixdigit(const char *checkword)
3204 gboolean wehaveadigit,wehavealetter,query;
3205 const char *s,*nondigit;
3206 wehaveadigit=wehavealetter=query=FALSE;
3207 for (s=checkword;*s;s=g_utf8_next_char(s))
3208 if (g_unichar_isalpha(g_utf8_get_char(s)))
3210 else if (g_unichar_isdigit(g_utf8_get_char(s)))
3212 if (wehaveadigit && wehavealetter)
3214 /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
/* Skip the leading digits; nondigit is left on the suffix, if any. */
3216 for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
3217 nondigit=g_utf8_next_char(nondigit))
3219 /* digits, ending in st, rd, nd, th of either case */
3220 if (!g_ascii_strcasecmp(nondigit,"st") ||
3221 !g_ascii_strcasecmp(nondigit,"rd") ||
3222 !g_ascii_strcasecmp(nondigit,"nd") ||
3223 !g_ascii_strcasecmp(nondigit,"th"))
/* The same ordinal suffixes in plural form. */
3225 if (!g_ascii_strcasecmp(nondigit,"sts") ||
3226 !g_ascii_strcasecmp(nondigit,"rds") ||
3227 !g_ascii_strcasecmp(nondigit,"nds") ||
3228 !g_ascii_strcasecmp(nondigit,"ths"))
/* The same ordinal suffixes followed by ly. */
3230 if (!g_ascii_strcasecmp(nondigit,"stly") ||
3231 !g_ascii_strcasecmp(nondigit,"rdly") ||
3232 !g_ascii_strcasecmp(nondigit,"ndly") ||
3233 !g_ascii_strcasecmp(nondigit,"thly"))
3235 /* digits, ending in l, L, s or d */
/* Only the l test ignores case; s and d must be lower case (strcmp). */
3236 if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
3237 !strcmp(nondigit,"d"))
3240 * L at the start of a number, representing Britsh pounds, like L500.
3241 * This is cute. We know the current word is mixed digit. If the first
3242 * letter is L, there must be at least one digit following. If both
3243 * digits and letters follow, we have a genuine error, else we have a
3244 * capital L followed by digits, and we accept that as a non-error.
/* Recurse on the remainder of the word after the leading L. */
3246 if (g_utf8_get_char(checkword)=='L' &&
3247 !mixdigit(g_utf8_next_char(checkword)))
3256 * Extracts the first/next "word" from the line, and returns it.
3257 * A word is defined as one English word unit--or at least that's the aim.
3258 * "ptr" is advanced to the position in the line where we will start
3259 * looking for the next word.
3261 * Returns: A newly-allocated string.
/*
 * Extract the next word from a line and advance the caller cursor.
 *
 * ptr: in/out cursor into the line; it is moved past leading characters
 *      that are neither letters nor digits, and in the regular path past
 *      the word itself.
 *
 * Returns the character data of a GString through g_string_free(word,
 * FALSE), so the result is a separate buffer from the line.
 */
3263 gchar *getaword(const char **ptr)
3268 word=g_string_new(NULL);
/*
 * Skip to the first letter or digit. The terminator test comes last, so
 * the loop also stops at the end of the string.
 */
3269 for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
3270 !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
3271 **ptr;*ptr=g_utf8_next_char(*ptr))
3274 * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
3275 * Especially yucky is the case of L1,000
3276 * This section looks for a pattern of characters including a digit
3277 * followed by a comma or period followed by one or more digits.
3278 * If found, it returns this whole pattern as a word; otherwise we discard
3279 * the results and resume our normal programming.
/*
 * Look-ahead: collect letters, digits, commas and periods through the
 * separate cursor s.
 * NOTE(review): the initial value of s is not on the visible lines.
 */
3282 for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
3283 g_unichar_isalpha(g_utf8_get_char(s)) ||
3284 g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
3285 g_string_append_unichar(word,g_utf8_get_char(s));
/* From the second character on, look for a period or comma after a digit. */
3288 for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))
3290 c=g_utf8_get_char(t);
3291 pc=g_utf8_get_char(g_utf8_prev_char(t));
3292 if ((c=='.' || c==',') && g_unichar_isdigit(pc))
3295 return g_string_free(word,FALSE);
3299 /* we didn't find a punctuated number - do the regular getword thing */
/* Discard the look-ahead text and rebuild the word from *ptr. */
3300 g_string_truncate(word,0);
3301 c=g_utf8_get_char(*ptr);
/* A regular word is a run of digits, letters and apostrophes. */
3302 for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);
3303 *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))
3304 g_string_append_unichar(word,c);
3305 return g_string_free(word,FALSE);
3311 * Is this word a Roman Numeral?
3313 * It doesn't actually validate that the number is a valid Roman Numeral--for
3314 * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
3315 * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
3316 * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
3317 * expressions thereof, except when it came to taxes. Allow any number of M,
3318 * an optional D, an optional CM or CD, any number of optional Cs, an optional
3319 * XL or an optional XC, an optional IX or IV, an optional V and any number
/*
 * Test whether a word has the general shape of a Roman numeral.
 *
 * t: NUL-terminated word to test; the comparisons below use lower-case
 *    letters only.
 *
 * The tests run in a fixed order: any m, then d, cm, cd, any c, then xl,
 * xc, l, any x, then ix, iv, v and finally any i.
 * NOTE(review): the statements that advance t after each successful test,
 * and the return statements, are not on the visible lines.
 */
3322 gboolean isroman(const char *t)
3328 while (g_utf8_get_char(t)=='m' && *t)
3330 if (g_utf8_get_char(t)=='d')
3332 if (g_str_has_prefix(t,"cm"))
3334 if (g_str_has_prefix(t,"cd"))
3336 while (g_utf8_get_char(t)=='c' && *t)
3338 if (g_str_has_prefix(t,"xl"))
3340 if (g_str_has_prefix(t,"xc"))
3342 if (g_utf8_get_char(t)=='l')
3344 while (g_utf8_get_char(t)=='x' && *t)
3346 if (g_str_has_prefix(t,"ix"))
3348 if (g_str_has_prefix(t,"iv"))
3350 if (g_utf8_get_char(t)=='v')
3352 while (g_utf8_get_char(t)=='i' && *t)
3358 * postprocess_for_DP:
3360 * Invoked with the -d switch from flgets().
3361 * It simply "removes" from the line a hard-coded set of common
3362 * DP-specific tags, so that the line passed to the main routine has
3363 * been pre-cleaned of DP markup.
/*
 * Delete every occurrence of each DPmarkup[] string from a line, in place.
 *
 * theline: writable NUL-terminated line; it only ever gets shorter.
 */
3365 void postprocess_for_DP(char *theline)
/* The table is walked until an entry that is an empty string. */
3371 for (i=0;*DPmarkup[i];i++)
3372 while ((s=strstr(theline,DPmarkup[i])))
/* t is the text after the tag; move it, terminator included, onto s. */
3374 t=s+strlen(DPmarkup[i]);
3375 memmove(s,t,strlen(t)+1);
3380 * postprocess_for_HTML:
3382 * Invoked with the -m switch from flgets().
3383 * It simply "removes" from the line a hard-coded set of common
3384 * HTML tags and "replaces" a hard-coded set of common HTML
3385 * entities, so that the line passed to the main routine has
3386 * been pre-cleaned of HTML.
/*
 * Clean HTML out of a line, in place.
 *
 * theline: writable NUL-terminated line.
 *
 * losemarkup is called repeatedly for as long as it returns a non-NULL
 * pointer; after that loseentities is called once on the same line.
 */
3388 void postprocess_for_HTML(char *theline)
3390 while (losemarkup(theline))
3392 loseentities(theline);
/*
 * Remove one recognised HTML tag from a line, in place.
 *
 * theline: writable NUL-terminated line.
 *
 * The first < on the line and the first > at or after it delimit the
 * candidate tag, which is compared against each markup[] entry.
 * NOTE(review): the return statements are not on the visible lines;
 * postprocess_for_HTML keeps calling while the result is non-NULL.
 */
3395 char *losemarkup(char *theline)
3399 s=strchr(theline,'<');
/* Without a < there is nothing to close, so t becomes NULL. */
3400 t=s?strchr(s,'>'):NULL;
/* The table is walked until an entry that is an empty string. */
3403 for (i=0;*markup[i];i++)
3404 if (tagcomp(g_utf8_next_char(s),markup[i]))
/* Step past the > and move the rest of the line, terminator included. */
3406 t=g_utf8_next_char(t);
3407 memmove(s,t,strlen(t)+1);
3410 /* It's an unrecognized <xxx>. */
/*
 * Replace HTML entities in a line with the characters they stand for,
 * in place.
 *
 * theline: writable NUL-terminated line. procfile ends with a test of
 *          MARKUP_SWITCH that presumably leads to a clean-up call of this
 *          function with NULL - the guard for the clean-up lines below
 *          is not visible.
 *
 * Numeric references in decimal and hexadecimal form and the names listed
 * in HTMLentities[] are handled. The two iconv descriptors are kept in
 * function-local statics so that they are opened once and then reused.
 */
3414 void loseentities(char *theline)
/*
 * NOTE(review): entities is declared without static on this line, unlike
 * the iconv descriptors on the next one; if that is accurate the lookup
 * tree would be rebuilt on every call and the destroy call below could
 * never see a tree from an earlier call - confirm.
 */
3421 GTree *entities=NULL;
3422 static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
/* Clean-up: release the tree and close both converters. */
3426 g_tree_destroy(entities);
3428 if (translit!=(GIConv)-1)
3429 g_iconv_close(translit);
3430 translit=(GIConv)-1;
3431 if (to_utf8!=(GIConv)-1)
3432 g_iconv_close(to_utf8);
/* Build the name to code point map; the keys are ordered with strcmp. */
3440 entities=g_tree_new((GCompareFunc)strcmp);
3441 for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
3442 g_tree_insert(entities,HTMLentities[i].name,
3443 GUINT_TO_POINTER(HTMLentities[i].c));
/* Open the converters lazily on first use. */
3445 if (translit==(GIConv)-1)
3446 translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
3447 if (to_utf8==(GIConv)-1)
3448 to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
3449 while((amp=strchr(theline,'&')))
3451 scolon=strchr(amp,';');
/*
 * Numeric reference: everything from amp+2 up to the semicolon is made of
 * decimal digits, or amp[2] is x and the rest is hexadecimal digits.
 * NOTE(review): the test that amp[1] is the # sign is not on the visible
 * lines - confirm.
 */
3456 if (amp+2+strspn(amp+2,"0123456789")==scolon)
3457 c=strtol(amp+2,NULL,10);
3458 else if (amp[2]=='x' &&
3459 amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
3460 c=strtol(amp+3,NULL,16);
/* Named entity: look up the text between the ampersand and semicolon. */
3464 s=g_strndup(amp+1,scolon-(amp+1));
3465 c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
/*
 * Code points below 128 and from 192 to 255 are written directly as
 * UTF-8; in this test the && binds more tightly than the ||.
 */
3474 if (c<128 || c>=192 && c<=255) /* An ISO-8859-1 character */
3475 theline+=g_unichar_to_utf8(c,theline);
/*
 * Other code points go through a round trip: UTF-8 to ISO_8859-1 with
 * transliteration, then back to UTF-8, and the result is copied in.
 */
3479 nb=g_unichar_to_utf8(c,s);
3480 t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
3482 s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
3484 memcpy(theline,s,nb);
/* Close the gap: move the text after the semicolon down to theline. */
3488 memmove(theline,g_utf8_next_char(scolon),
3489 strlen(g_utf8_next_char(scolon))+1);
/* Continue the search after this ampersand. */
3492 theline=g_utf8_next_char(amp);
/*
 * Case-insensitive test of whether a tag body starts with a given name.
 *
 * strin:   text that follows the < of a candidate tag.
 * basetag: tag name to compare against.
 *
 * Both strings are case-folded and the folded strin must have the folded
 * basetag as a prefix.
 * NOTE(review): g_utf8_casefold returns allocated strings; their release
 * and the return statement are not on the visible lines.
 */
3496 gboolean tagcomp(const char *strin,const char *basetag)
3500 if (g_utf8_get_char(strin)=='/')
3501 t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
3503 t=g_utf8_casefold(strin,-1);
3504 s=g_utf8_casefold(basetag,-1);
3505 retval=g_str_has_prefix(t,s);
3511 void proghelp(GOptionContext *context)
3514 fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
3515 fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
3516 fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
3517 fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
3518 "For details, read the file COPYING.\n",stderr);
3519 fputs("This is Free Software; "
3520 "you may redistribute it under certain conditions (GPL);\n",stderr);
3521 fputs("read the file COPYING for details.\n\n",stderr);
3522 help=g_option_context_get_help(context,TRUE,NULL);
3525 fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
3526 fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
3527 "non-ASCII\n",stderr);
3528 fputs("characters like accented letters, "
3529 "lines longer than 75 or shorter than 55,\n",stderr);
3530 fputs("unbalanced quotes or brackets, "
3531 "a variety of badly formatted punctuation, \n",stderr);
3532 fputs("HTML tags, some likely typos. "
3533 "It is NOT a substitute for human judgement.\n",stderr);