1 /*************************************************************************/
2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com> */
5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk> */
7 /* This program is free software; you can redistribute it and/or modify */
8 /* it under the terms of the GNU General Public License as published by */
9 /* the Free Software Foundation; either version 2 of the License, or */
10 /* (at your option) any later version. */
12 /* This program is distributed in the hope that it will be useful, */
13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
15 /* GNU General Public License for more details. */
17 /* You should have received a copy of the GNU General Public License */
18 /* along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 /*************************************************************************/
30 #include "bookloupe.h"
33 #include "HTMLentities.h"
35 gchar *charset; /* Or NULL for auto (ISO_8859-1/ASCII or UNICODE) */
36 GIConv charset_validator=(GIConv)-1; /* (GIConv)-1 = no converter open; set_charset() opens a UTF-8 -> charset converter here */
42 "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
43 "nad", "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa",
44 "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
45 "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
46 "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
47 "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
48 "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
49 "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
50 "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
51 "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
52 "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
53 "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
54 "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
55 "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
56 "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
57 "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
58 "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
59 "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
60 "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
61 "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
62 "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
63 "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
64 "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
65 "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
66 "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
67 "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
68 "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
69 "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
70 "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
76 /* Common abbreviations and other OK words not to query as typos. */
78 "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
79 "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
80 "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
81 "outbid", "outbids", "frostbite", "frostbitten", ""
84 /* Common abbreviations that cause otherwise unexplained periods. */
86 "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
87 "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
91 * Two-Letter combinations that rarely if ever start words,
92 * but are common scannos or otherwise common letter combinations.
95 "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
99 * Two-Letter combinations that rarely if ever end words,
100 * but are common scannos or otherwise common letter combinations.
103 "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
104 "sw", "gr", "sl", "cl", "iy", ""
108 "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
109 "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
110 "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
111 "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
115 "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
119 "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
120 "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
121 "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
122 "during", "let", "toward", "among", ""
126 "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
127 "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
128 "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
129 "among", "those", "into", "whom", "having", "thence", ""
132 gboolean pswit[SWITNO]; /* program switches, indexed by the *_SWITCH constants */
134 gboolean typo_compat,paranoid_compat; /* targets of the gutcheck-compatibility toggles (--toggle-typo, --toggle-relaxed) */
/*
 * Switches that may be given on the command line or (by long name) in the
 * configuration file: config_file_update() and parse_config_file() both
 * walk this table. Every boolean switch appears twice, as "name" and
 * "no-name", both writing the same pswit[] slot; the "no-" form carries
 * G_OPTION_FLAG_REVERSE. One member of each pair carries
 * G_OPTION_FLAG_HIDDEN. "charset" is the only string-valued entry.
 */
136 static GOptionEntry options[]={
137 { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
138 "Ignore DP-specific markup", NULL },
139 { "no-dp", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
140 G_OPTION_ARG_NONE, pswit+DP_SWITCH,
141 "Don't ignore DP-specific markup", NULL },
142 { "echo", 0, G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
143 "Echo queried line", NULL },
144 { "no-echo", 'e', G_OPTION_FLAG_REVERSE,
145 G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
146 "Don't echo queried line", NULL },
147 { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
148 "Check single quotes", NULL },
149 { "no-squote", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
150 G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
151 "Don't check single quotes", NULL },
152 { "typo", 0, 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
153 "Check common typos", NULL },
154 { "no-typo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
155 G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
156 "Don't check common typos", NULL },
157 { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
158 "Require closure of quotes on every paragraph", NULL },
159 { "no-qpara", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
160 G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
161 "Don't require closure of quotes on every paragraph", NULL },
162 { "paranoid", 0, G_OPTION_FLAG_HIDDEN,
163 G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
164 "Enable paranoid querying of everything", NULL },
165 { "no-paranoid", 0, G_OPTION_FLAG_REVERSE,
166 G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
167 "Disable paranoid querying of everything", NULL },
168 { "line-end", 0, G_OPTION_FLAG_HIDDEN,
169 G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
170 "Enable line end checking", NULL },
171 { "no-line-end", 'l', G_OPTION_FLAG_REVERSE,
172 G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
173 "Disable line end checking", NULL }, /* help text previously misspelled "Diable" */
174 { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
175 "Overview: just show counts", NULL },
176 { "no-overview", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
177 G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
178 "Show individual warnings", NULL },
179 { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
180 "Output errors to stdout instead of stderr", NULL },
181 { "no-stdout", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
182 G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
183 "Output errors to stderr instead of stdout", NULL },
184 { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
185 "Echo header fields", NULL },
186 { "no-header", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
187 G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
188 "Don't echo header fields", NULL },
189 { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
190 "Ignore markup in < >", NULL },
191 { "no-markup", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
192 G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
193 "No special handling for markup in < >", NULL },
194 { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
195 "Use file of user-defined typos", NULL },
196 { "no-usertypo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
197 G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
198 "Ignore file of user-defined typos", NULL },
199 { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
200 "Verbose - list everything", NULL },
201 { "no-verbose", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
202 G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
203 "Switch off verbose mode", NULL },
204 { "charset", 0, 0, G_OPTION_ARG_STRING, &opt_charset,
205 "Set of characters valid for this ebook", "NAME" },
210 * Options relating to configuration which make no sense from inside
211 * a configuration file.
214 static GOptionEntry config_options[]={ /* registered by parse_options() as main entries, after options[] */
215 { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
216 "Defaults for use on www upload", NULL },
217 { "charset", 0, 0, G_OPTION_ARG_STRING, &opt_charset, /* NOTE(review): "charset" is also an entry of options[]; confirm the duplicate registration is intended */
218 "Set of characters valid for this ebook", "NAME" },
219 { "dump-config", 0, 0, G_OPTION_ARG_NONE, pswit+DUMP_CONFIG_SWITCH,
220 "Dump current config settings", NULL },
224 static GOptionEntry compatibility_options[]={ /* gutcheck-style toggles; parse_options() adds them as the "compatibility" group */
225 { "toggle-typo", 't', 0, G_OPTION_ARG_NONE, &typo_compat,
226 "Toggle checking for common typos", NULL },
227 { "toggle-relaxed", 'x', 0, G_OPTION_ARG_NONE, &paranoid_compat, /* was the mis-decoded text "¶noid_compat" (entity "&para" swallowed) */
228 "Toggle both paranoid mode and common typos", NULL },
/* Running tallies. main() prints the cnt_* query counts and the two line counts in its overview summary. */
232 long cnt_quote; /* for overview mode, count of quote queries */
233 long cnt_brack; /* for overview mode, count of brackets queries */
234 long cnt_bin; /* for overview mode, count of non-ASCII queries */
235 long cnt_odd; /* for overview mode, count of odd character queries */
236 long cnt_long; /* for overview mode, count of long line errors */
237 long cnt_short; /* for overview mode, count of short line queries */
238 long cnt_punct; /* for overview mode,
239 count of punctuation and spacing queries */
240 long cnt_dash; /* for overview mode, count of dash-related queries */
241 long cnt_word; /* for overview mode, count of word queries */
242 long cnt_html; /* for overview mode, count of html queries */
243 long cnt_lineend; /* for overview mode, count of line-end queries */
244 long cnt_spacend; /* count of lines with space at end */
245 long linecnt; /* count of total lines in the file */
246 long checked_linecnt; /* count of lines actually checked */
/* Forward declarations; most of the definitions are not visible in this part of the file. */
248 void proghelp(GOptionContext *context);
249 void procfile(const char *);
253 gboolean mixdigit(const char *);
254 gchar *getaword(const char **);
255 char *flgets(char **,long);
256 void postprocess_for_HTML(char *);
257 char *linehasmarkup(char *);
258 char *losemarkup(char *);
259 gboolean tagcomp(const char *,const char *);
260 void loseentities(char *);
261 gboolean isroman(const char *);
262 void postprocess_for_DP(char *);
263 void print_as_windows_1252(const char *string); /* installed as the GLib print handler by read_etext() for non-UTF-8 input */
264 void print_as_utf_8(const char *string); /* installed as the GLib print handler by read_etext() for UTF-8 input */
266 GTree *qword,*qperiod; /* NOTE(review): not referenced in the visible part of the file; purpose unconfirmed */
272 gboolean set_charset(const char *name,GError **err)
/*
 * Select the character set the text is expected to fit in.
 *
 * name: charset name, or NULL / "auto" for automatic selection. Names are
 *       matched with g_ascii_strcasecmp() so the comparison does not
 *       depend on the current locale (the deprecated g_strcasecmp() does).
 * err:  receives a G_CONVERT_ERROR_NO_CONVERSION error when no converter
 *       from UTF-8 to the requested charset can be opened.
 *
 * Any previously opened charset_validator is closed first. Every Unicode
 * encoding alias is collapsed to "UTF-8", for which no validator is
 * opened; any other charset gets a UTF-8 -> charset converter.
 * The return statements are not visible in this listing; parse_options()
 * treats a FALSE result as failure with *err set.
 */
274 /* The various UNICODE encodings all share the same character set. */
275 const char *unicode_aliases[]={ "UCS-2", "UCS-2BE", "UCS-2LE", "UCS-4",
276 "UCS-4BE", "UCS-4LE", "UCS2", "UCS4", "UNICODE", "UNICODEBIG",
277 "UNICODELITTLE", "UTF-7", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE",
278 "UTF-32", "UTF-32BE", "UTF-32LE", "UTF7", "UTF8", "UTF16", "UTF16BE",
279 "UTF16LE", "UTF32", "UTF32BE", "UTF32LE" };
283 if (charset_validator!=(GIConv)-1)
284 g_iconv_close(charset_validator);
285 if (!name || !g_ascii_strcasecmp(name,"auto"))
288 charset_validator=(GIConv)-1;
292 charset=g_strdup(name);
293 for(i=0;i<G_N_ELEMENTS(unicode_aliases);i++)
294 if (!g_ascii_strcasecmp(charset,unicode_aliases[i]))
297 charset=g_strdup("UTF-8");
300 if (!strcmp(charset,"UTF-8"))
301 charset_validator=(GIConv)-1;
304 charset_validator=g_iconv_open(charset,"UTF-8"); /* g_iconv_open(to,from): converts from UTF-8 into the target charset */
305 if (charset_validator==(GIConv)-1)
307 g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_NO_CONVERSION,
308 "Unknown character set \"%s\"",charset);
317 void config_file_update(GKeyFile *kf)
/*
 * Copy the current setting of the entries in options[] into the "options"
 * group of kf, keyed by each entry's long name. Boolean entries are read
 * through arg_data as gboolean, string entries as gchar *.
 */
322 for(i=0;options[i].long_name;i++)
324 if (g_str_has_prefix(options[i].long_name,"no-")) /* "no-" variants are special-cased (statement not visible here; presumably skipped) */
326 if (options[i].arg==G_OPTION_ARG_NONE)
328 sw=*(gboolean *)options[i].arg_data;
329 if (options[i].flags&G_OPTION_FLAG_REVERSE) /* reversed entries are adjusted before storing (statement not visible here) */
331 g_key_file_set_boolean(kf,"options",options[i].long_name,sw);
333 else if (options[i].arg==G_OPTION_ARG_STRING)
335 s=*(gchar **)options[i].arg_data;
338 g_key_file_set_string(kf,"options",options[i].long_name,s);
341 g_assert_not_reached(); /* presumably the fall-through for any other argument type */
345 void config_file_add_comments(GKeyFile *kf)
/*
 * Annotate kf: a file-level comment at the top, plus a per-key comment in
 * the "options" group built from the description of entries in options[].
 */
349 g_key_file_set_comment(kf,NULL,NULL," Default configuration for bookloupe",
351 for(i=0;options[i].long_name;i++)
353 if (g_str_has_prefix(options[i].long_name,"no-")) /* "no-" variants are special-cased (statement not visible here; presumably skipped) */
355 comment=g_strconcat(" ",options[i].description,NULL); /* newly allocated string */
356 g_key_file_set_comment(kf,"options",options[i].long_name,comment,NULL);
361 void dump_config(void)
/*
 * Serialise the current settings as key-file text. Two paths are visible:
 * updating the existing `config`, or creating a fresh key file that is
 * filled in and commented. The condition choosing between them, and what
 * is done with the resulting text, are not visible in this listing.
 */
365 config_file_update(config);
368 config=g_key_file_new();
369 config_file_update(config);
370 config_file_add_comments(config);
372 s=g_key_file_to_data(config,NULL,NULL); /* newly allocated text form of the key file */
378 GKeyFile *read_config_file(gchar **full_path)
/*
 * Look for "bookloupe.ini" in a list of directories and load it with
 * comments and translations kept. The list comes from the
 * BOOKLOUPE_CONFIG_PATH environment variable when it is used, otherwise
 * it is the current directory, running_from and the user config
 * directory, in that order. Load errors other than "file not found" are
 * reported on stderr. How full_path and the return value are set is not
 * visible in this listing.
 */
384 const char *search_path;
387 search_path=g_getenv("BOOKLOUPE_CONFIG_PATH");
391 search_dirs=g_strsplit(search_path,";",0); /* presumably the Windows separator; the selecting #if is not visible */
393 search_dirs=g_strsplit(search_path,":",0);
398 search_dirs=g_new(gchar *,4); /* three entries; presumably plus a NULL terminator */
399 search_dirs[0]=g_get_current_dir();
400 search_dirs[1]=g_strdup(running_from);
401 search_dirs[2]=g_strdup(g_get_user_config_dir());
404 for(i=0;search_dirs[i];i++)
406 path=g_build_filename(search_dirs[i],"bookloupe.ini",NULL);
407 if (g_key_file_load_from_file(kf,path,
408 G_KEY_FILE_KEEP_COMMENTS|G_KEY_FILE_KEEP_TRANSLATIONS,&err))
410 if (!g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT)) /* a missing file is not reported */
412 g_printerr("Bookloupe: Error reading %s\n",path);
413 g_printerr("%s\n",err->message);
425 g_strfreev(search_dirs);
433 void parse_config_file(void)
/*
 * Load the configuration file via read_config_file() and apply each key of
 * its "options" group to the matching entry of options[]. Boolean keys
 * are stored through the entry's arg_data, and string keys replace the
 * previous string (the value "auto" is stored as NULL). Read errors and
 * unknown keys are reported on stderr.
 */
440 config=read_config_file(&path);
442 keys=g_key_file_get_keys(config,"options",NULL,NULL);
449 for(j=0;options[j].long_name;j++)
451 if (g_str_has_prefix(options[j].long_name,"no-")) /* "no-" variants are special-cased (statement not visible here) */
453 else if (!strcmp(keys[i],options[j].long_name))
455 if (options[j].arg==G_OPTION_ARG_NONE)
457 sw=g_key_file_get_boolean(config,"options",keys[i],
461 g_printerr("Bookloupe: %s: options.%s: %s\n",
462 path,keys[i],err->message);
467 if (options[j].flags&G_OPTION_FLAG_REVERSE) /* reversed entries are adjusted before storing (statement not visible here) */
469 *(gboolean *)options[j].arg_data=sw;
473 else if (options[j].arg==G_OPTION_ARG_STRING)
475 s=g_key_file_get_string(config,"options",keys[i],
479 g_printerr("Bookloupe: %s: options.%s: %s\n",
480 path,keys[i],err->message);
485 g_free(*(gchar **)options[j].arg_data); /* drop the previous value before replacing it */
486 if (!g_strcmp0(s,"auto"))
488 *(gchar **)options[j].arg_data=NULL; /* "auto" is represented as NULL */
492 *(gchar **)options[j].arg_data=s;
497 g_assert_not_reached(); /* presumably the fall-through for any other argument type */
500 if (!options[j].long_name) /* the scan reached the table terminator: no entry matched this key */
501 g_printerr("Bookloupe: %s: Unknown option \"%s\" ignored\n",
510 void parse_options(int *argc,char ***argv)
/*
 * Parse the command line with a GOptionContext built from options[],
 * config_options[] and the "compatibility" group, then post-process the
 * switches: apply the compatibility toggles, force the --web preset,
 * install the requested charset, and turn echo off in overview mode.
 * argc/argv are passed to g_option_context_parse().
 */
513 GOptionContext *context;
514 GOptionGroup *compatibility;
515 context=g_option_context_new(
516 "file - look for errors in Project Gutenberg(TM) etexts");
517 g_option_context_add_main_entries(context,options,NULL);
518 g_option_context_add_main_entries(context,config_options,NULL);
519 compatibility=g_option_group_new("compatibility",
520 "Options for Compatibility with Gutcheck:",
521 "Show compatibility options",NULL,NULL);
522 g_option_group_add_entries(compatibility,compatibility_options);
523 g_option_context_add_group(context,compatibility);
524 g_option_context_set_description(context,
525 "For simplicity, only the switch options which reverse the\n"
526 "default configuration are listed. In most cases, both vanilla\n"
527 "and \"no-\" prefixed versions are available for use.");
528 if (!g_option_context_parse(context,argc,argv,&err))
530 g_printerr("Bookloupe: %s\n",err->message);
531 g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
535 pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH]; /* presumably guarded by typo_compat (condition not visible) */
538 pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH]; /* presumably guarded by paranoid_compat (condition not visible) */
539 pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
542 * Web uploads - for the moment, this is really just a placeholder
543 * until we decide what processing we really want to do on web uploads
545 if (pswit[WEB_SWITCH])
547 /* specific override for web uploads */
548 pswit[ECHO_SWITCH]=TRUE;
549 pswit[SQUOTE_SWITCH]=FALSE;
550 pswit[TYPO_SWITCH]=TRUE;
551 pswit[QPARA_SWITCH]=FALSE;
552 pswit[PARANOID_SWITCH]=TRUE;
553 pswit[LINE_END_SWITCH]=FALSE;
554 pswit[OVERVIEW_SWITCH]=FALSE;
555 pswit[STDOUT_SWITCH]=FALSE;
556 pswit[HEADER_SWITCH]=TRUE;
557 pswit[VERBOSE_SWITCH]=FALSE;
558 pswit[MARKUP_SWITCH]=FALSE;
559 pswit[USERTYPO_SWITCH]=FALSE;
560 pswit[DP_SWITCH]=FALSE;
562 if (opt_charset && !set_charset(opt_charset,&err)) /* only called when a charset was actually given */
564 g_printerr("%s\n",err->message);
567 if (pswit[DUMP_CONFIG_SWITCH])
574 if (pswit[OVERVIEW_SWITCH])
575 /* just print summary; don't echo */
576 pswit[ECHO_SWITCH]=FALSE;
582 g_option_context_free(context);
588 * Read in the user-defined stealth scanno list.
590 void read_user_scannos(void)
/*
 * Load the user typo list into the usertypo tree. Candidate files are
 * tried in order while the previous attempt failed with "file not found":
 * "bookloupe.typ", running_from/"bookloupe.typ", "gutcheck.typ",
 * running_from/"gutcheck.typ". Valid UTF-8 content is normalised and the
 * charset is switched to UNICODE; anything else is converted from
 * WINDOWS-1252. Every line whose first byte is greater than '!' becomes a
 * key of the tree.
 */
593 gchar *usertypo_file;
597 gchar *contents,*utf8,**lines;
598 usertypo_file=g_strdup("bookloupe.typ");
599 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
600 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
603 g_free(usertypo_file);
604 usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
605 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
607 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
610 g_free(usertypo_file);
611 usertypo_file=g_strdup("gutcheck.typ"); /* fall back to the gutcheck file name */
612 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
614 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
617 g_free(usertypo_file);
618 usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
619 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
621 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT)) /* none of the four candidates exists */
623 g_free(usertypo_file);
624 g_print(" --> I couldn't find bookloupe.typ "
625 "-- proceeding without user typos.\n");
630 fprintf(stderr,"%s: %s\n",usertypo_file,err->message); /* presumably the branch for any other read error */
631 g_free(usertypo_file);
635 if (g_utf8_validate(contents,len,NULL))
637 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
639 (void)set_charset("UNICODE",NULL);
642 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL); /* NOTE(review): conversion errors are not collected here */
644 lines=g_strsplit_set(utf8,"\r\n",0); /* split on either CR or LF */
646 usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL); /* keys are freed with g_free when the tree is destroyed */
647 for (i=0;lines[i];i++)
648 if (*(unsigned char *)lines[i]>'!')
649 g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
658 * Read an etext returning a newly allocated string containing the file
659 * contents or NULL on error.
661 gchar *read_etext(const char *filename,GError **err)
/*
 * filename: file to load with g_file_get_contents().
 * err:      receives the load error, or a conversion error naming the
 *           offending byte with its line and column.
 *
 * Content that validates as UTF-8 is normalised and the UTF-8 print
 * handler is installed; otherwise it is converted from WINDOWS-1252 and
 * the Windows-1252 print handler is installed.
 */
663 GError *tmp_err=NULL;
664 gchar *contents,*utf8;
665 gsize len,bytes_read,bytes_written;
667 if (!g_file_get_contents(filename,&contents,&len,err))
669 if (g_utf8_validate(contents,len,NULL))
671 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
672 g_set_print_handler(print_as_utf_8);
674 SetConsoleOutputCP(CP_UTF8); /* Win32 console API; presumably inside a platform #ifdef that is not visible */
679 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,
680 &bytes_written,&tmp_err);
681 if (g_error_matches(tmp_err,G_CONVERT_ERROR,
682 G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
685 for(i=0;i<bytes_read;i++) /* walk the bytes consumed so far to derive line and column for the message */
686 if (contents[i]=='\n')
691 else if (contents[i]!='\r')
693 g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
694 "Input conversion failed. Byte %d at line %d, column %d is not a "
695 "valid Windows-1252 character",
696 ((unsigned char *)contents)[bytes_read],line,col); /* contents[bytes_read] is the first byte that was not converted */
699 g_propagate_error(err,tmp_err);
700 g_set_print_handler(print_as_windows_1252);
702 SetConsoleOutputCP(1252); /* Win32 console API; presumably inside a platform #ifdef that is not visible */
709 void cleanup_on_exit(void)
/* atexit() handler registered by main(): restores the console output code page that main() saved in saved_cp. */
712 SetConsoleOutputCP(saved_cp); /* Win32 console API; presumably inside a platform #ifdef that is not visible */
716 int main(int argc,char **argv)
/*
 * Program entry point: set the switch defaults, parse the options, and,
 * in overview mode, print the per-category query counts and their total.
 * Ends by releasing running_from, the user typo tree, the charset state
 * and the loaded configuration. Calls between these steps (for example
 * the one that processes the input file) are not visible in this listing.
 */
719 atexit(cleanup_on_exit);
720 saved_cp=GetConsoleOutputCP(); /* saved so cleanup_on_exit() can restore it */
722 running_from=g_path_get_dirname(argv[0]); /* directory part of the program path; used to locate config and typo files */
723 /* Paranoid checking is turned OFF, not on, by its switch */
724 pswit[PARANOID_SWITCH]=TRUE;
725 /* if running in paranoid mode, typo checks default to enabled */
726 pswit[TYPO_SWITCH]=TRUE;
727 /* Line-end checking is turned OFF, not on, by its switch */
728 pswit[LINE_END_SWITCH]=TRUE;
729 /* Echoing is turned OFF, not on, by its switch */
730 pswit[ECHO_SWITCH]=TRUE;
732 parse_options(&argc,&argv);
733 if (pswit[USERTYPO_SWITCH])
735 fprintf(stderr,"bookloupe: Check and report on an e-text\n");
737 if (pswit[OVERVIEW_SWITCH])
739 g_print(" Checked %ld lines of %ld (head+foot = %ld)\n\n",
740 checked_linecnt,linecnt,linecnt-checked_linecnt);
741 g_print(" --------------- Queries found --------------\n");
743 g_print(" Long lines: %14ld\n",cnt_long);
745 g_print(" Short lines: %14ld\n",cnt_short);
747 g_print(" Line-end problems: %14ld\n",cnt_lineend);
749 g_print(" Common typos: %14ld\n",cnt_word);
751 g_print(" Unmatched quotes: %14ld\n",cnt_quote);
753 g_print(" Unmatched brackets: %14ld\n",cnt_brack);
755 g_print(" Non-ASCII characters: %14ld\n",cnt_bin);
757 g_print(" Proofing characters: %14ld\n",cnt_odd);
759 g_print(" Punctuation & spacing queries: %14ld\n",cnt_punct);
761 g_print(" Non-standard dashes: %14ld\n",cnt_dash);
763 g_print(" Possible HTML tags: %14ld\n",cnt_html);
765 g_print(" TOTAL QUERIES %14ld\n",
766 cnt_quote+cnt_brack+cnt_bin+cnt_odd+cnt_long+cnt_short+cnt_punct+
767 cnt_dash+cnt_word+cnt_html+cnt_lineend);
769 g_free(running_from);
771 g_tree_unref(usertypo);
772 set_charset(NULL,NULL); /* NULL selects "auto", which also closes any open validator */
774 g_key_file_free(config);
781 * Run a first pass - verify that it's a valid PG
782 * file, decide whether to report some things that
783 * occur many times in the text like long or short
784 * lines, non-standard dashes, etc.
786 struct first_pass_results *first_pass(const char *etext)
/*
 * etext: the whole text, UTF-8; it is split on "\n" and any trailing '\r'
 *        is stripped from each line.
 *
 * Gathers whole-file statistics into the function-static `results`
 * (header/footer line numbers, character-class totals, counts of short,
 * long, asterisk, slash, hyphen-ended and HTML-looking lines, dash and
 * language-hint counts). Presumably returns a pointer to that static
 * struct (the return statement is not visible), which would make the
 * function non-reentrant.
 *
 * The unpunctuated-endquote test classifies the character before the
 * quote with g_unichar_isalpha(): that character is a gunichar, and the
 * C isalpha() is only defined for unsigned char values and EOF.
 */
788 gunichar laststart=CHAR_SPACE;
793 unsigned int lastlen=0,lastblen=0;
794 long spline=0,nspline=0;
795 static struct first_pass_results results={0};
797 lines=g_strsplit(etext,"\n",0);
798 for (j=0;lines[j];j++)
800 lbytes=strlen(lines[j]);
801 while (lbytes>0 && lines[j][lbytes-1]=='\r')
802 lines[j][--lbytes]='\0';
803 llen=g_utf8_strlen(lines[j],lbytes); /* llen counts characters, lbytes counts bytes */
805 if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
806 (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
809 g_print(" --> Duplicate header?\n");
810 spline=linecnt+1; /* first line of non-header text, that is */
812 if (!strncmp(lines[j],"*** START",9) &&
813 strstr(lines[j],"PROJECT GUTENBERG"))
816 g_print(" --> Duplicate header?\n");
817 nspline=linecnt+1; /* first line of non-header text, that is */
819 if (spline || nspline)
821 lc_line=g_utf8_strdown(lines[j],lbytes);
822 if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
824 if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg")) /* "end" must come before "project gutenberg" on the line */
826 if (results.footerline)
828 /* it's an old-form header - we can detect duplicates */
830 g_print(" --> Duplicate footer?\n");
833 results.footerline=linecnt;
839 results.firstline=spline;
841 results.firstline=nspline; /* override with new */
842 if (results.footerline)
843 continue; /* don't count the boilerplate in the footer */
844 results.totlen+=llen;
845 for (s=lines[j];*s;s=g_utf8_next_char(s))
847 if (g_utf8_get_char(s)>127)
849 if (g_unichar_isalpha(g_utf8_get_char(s)))
851 if (s>lines[j] && g_utf8_get_char(s)==CHAR_DQUOTE &&
852 g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s)))) /* was isalpha(): not defined for code points beyond unsigned char */
853 results.endquote_count++;
855 if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
856 lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
859 g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
861 if (strstr(lines[j],".,"))
863 /* only count asterisk lines for ignoring purposes where there is */
864 /* lowercase text on the line */
865 if (strchr(lines[j],'*'))
867 for (s=lines[j];*s;s=g_utf8_next_char(s))
868 if (g_unichar_islower(g_utf8_get_char(s)))
873 if (strchr(lines[j],'/'))
874 results.fslashline++;
877 for (s=g_utf8_prev_char(lines[j]+lbytes);
878 s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;
879 s=g_utf8_prev_char(s))
881 if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
882 g_utf8_get_char(g_utf8_prev_char(s))!='-')
885 if (llen>LONGEST_PG_LINE)
887 if (llen>WAY_TOO_LONG)
888 results.verylongline++;
889 if (strchr(lines[j],'<') && strchr(lines[j],'>'))
891 i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1); /* distance from the first '<' to the first '>' */
894 if (strstr(lines[j],"<i>"))
895 results.htmcount+=4; /* bonus marks! */
897 /* Check for spaced em-dashes */
898 if (lines[j][0] && (s=strstr(g_utf8_next_char(lines[j]),"--"))) /* search starts at the second character, so s[-1] below is inside the line */
901 if (s[-1]==CHAR_SPACE || s[2]==CHAR_SPACE)
902 results.space_emdash++;
903 if (s[-1]==CHAR_SPACE && s[2]==CHAR_SPACE)
904 /* count of em-dashes with spaces both sides */
905 results.non_PG_space_emdash++;
906 if (s[-1]!=CHAR_SPACE && s[2]!=CHAR_SPACE)
907 /* count of PG-type em-dashes with no spaces */
908 results.PG_space_emdash++;
913 if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))
914 results.Dutchcount++;
915 if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))
916 results.Frenchcount++;
917 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
918 results.standalone_digit++;
921 /* Check for spaced dashes */
922 if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
926 laststart=lines[j][0];
935 * Make some snap decisions based on the first pass results.
937 struct warnings *report_first_pass(struct first_pass_results *results)
/*
 * results: statistics gathered by the first pass; firstline may be reset
 *          to 0 here when header and footer are less than 100 lines apart.
 *
 * Turns the statistics into reporting decisions held in the
 * function-static `warnings`, printing a " --> " note for each category
 * that is too frequent to be reported line by line. May also switch
 * pswit[MARKUP_SWITCH] on when the text looks like HTML. Verbose mode
 * re-enables warnings afterwards. Presumably returns a pointer to the
 * static struct (the return statement is not visible).
 */
939 static struct warnings warnings={0};
941 g_print(" --> %ld lines in this file have white space at end\n",
944 if (results->dotcomma>5)
947 g_print(" --> %ld lines in this file contain '.,'. "
948 "Not reporting them.\n",results->dotcomma);
951 * If more than 50 lines, or one-tenth, are short,
952 * don't bother reporting them.
954 warnings.shortline=1;
955 if (results->shortline>50 || results->shortline*10>linecnt)
957 warnings.shortline=0;
958 g_print(" --> %ld lines in this file are short. "
959 "Not reporting short lines.\n",results->shortline);
962 * If more than 50 lines, or one-tenth, are long,
963 * don't bother reporting them.
966 if (results->longline>50 || results->longline*10>linecnt)
969 g_print(" --> %ld lines in this file are long. "
970 "Not reporting long lines.\n",results->longline);
972 /* If more than 10 lines contain asterisks, don't bother reporting them. */
974 if (results->astline>10)
977 g_print(" --> %ld lines in this file contain asterisks. "
978 "Not reporting them.\n",results->astline);
981 * If more than 10 lines contain forward slashes,
982 * don't bother reporting them.
985 if (results->fslashline>10)
988 g_print(" --> %ld lines in this file contain forward slashes. "
989 "Not reporting them.\n",results->fslashline);
992 * If more than 20 lines contain unpunctuated endquotes,
993 * don't bother reporting them.
996 if (results->endquote_count>20)
999 g_print(" --> %ld lines in this file contain unpunctuated endquotes. "
1000 "Not reporting them.\n",results->endquote_count);
1003 * If more than 15 lines contain standalone digits,
1004 * don't bother reporting them.
1007 if (results->standalone_digit>10) /* NOTE(review): the comment above says 15, but the threshold tested is 10 */
1010 g_print(" --> %ld lines in this file contain standalone 0s and 1s. "
1011 "Not reporting them.\n",results->standalone_digit);
1014 * If more than 20 lines contain hyphens at end,
1015 * don't bother reporting them.
1018 if (results->hyphens>20)
1021 g_print(" --> %ld lines in this file have hyphens at end. "
1022 "Not reporting them.\n",results->hyphens);
1024 if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
1026 g_print(" --> Looks like this is HTML. Switching HTML mode ON.\n");
1027 pswit[MARKUP_SWITCH]=1; /* side effect on the global switch table */
1029 if (results->verylongline>0)
1030 g_print(" --> %ld lines in this file are VERY long!\n",
1031 results->verylongline);
1033 * If there are more non-PG spaced dashes than PG em-dashes,
1034 * assume it's deliberate.
1035 * Current PG guidelines say don't use them, but older texts do,
1036 * and some people insist on them whatever the guidelines say.
1039 if (results->spacedash+results->non_PG_space_emdash>
1040 results->PG_space_emdash)
1043 g_print(" --> There are %ld spaced dashes and em-dashes. "
1044 "Not reporting them.\n",
1045 results->spacedash+results->non_PG_space_emdash);
1051 /* Charset ISO_8859-1/ASCII checks for compatibility with gutcheck */
1053 /* If more than a quarter of characters are hi-bit, bug out. */
1054 if (results->binlen*4>results->totlen)
1056 g_print(" --> This file does not appear to be ASCII. "
1057 "Terminating. Best of luck with it!\n");
1060 if (results->alphalen*4<results->totlen) /* fewer than a quarter of the characters are letters */
1062 g_print(" --> This file does not appear to be text. "
1063 "Terminating. Best of luck with it!\n");
1066 if (results->binlen*100>results->totlen || results->binlen>100) /* more than 1% hi-bit characters, or more than 100 of them */
1068 g_print(" --> There are a lot of foreign letters here. "
1069 "Not reporting them.\n");
1070 if (!pswit[VERBOSE_SWITCH])
1074 warnings.isDutch=FALSE;
1075 if (results->Dutchcount>50)
1077 warnings.isDutch=TRUE;
1078 g_print(" --> This looks like Dutch - "
1079 "switching off dashes and warnings for 's Middags case.\n");
1081 warnings.isFrench=FALSE;
1082 if (results->Frenchcount>50)
1084 warnings.isFrench=TRUE;
1085 g_print(" --> This looks like French - "
1086 "switching off some doublepunct.\n");
1088 if (results->firstline && results->footerline)
1089 g_print(" The PG header and footer appear to be already on.\n");
1092 if (results->firstline)
1093 g_print(" The PG header is on - no footer.\n");
1094 if (results->footerline)
1095 g_print(" The PG footer is on - no header.\n");
1098 if (pswit[VERBOSE_SWITCH]) /* verbose mode turns suppressed warnings back on */
1100 warnings.shortline=1;
1101 warnings.dotcomma=1;
1102 warnings.longline=1;
1108 warnings.endquote=1;
1109 g_print(" *** Verbose output is ON -- you asked for it! ***\n");
1111 if (warnings.isDutch)
1113 if (results->footerline>0 && results->firstline>0 &&
1114 results->footerline>results->firstline &&
1115 results->footerline-results->firstline<100) /* header and footer are less than 100 lines apart */
1117 g_print(" --> I don't really know where this text starts. \n");
1118 g_print(" There are no reference points.\n");
1119 g_print(" I'm going to have to report the header and footer "
1121 results->firstline=0;
1129 * Look along the line, accumulate the count of quotes, and see
1130 * if this is an empty line - i.e. a line with nothing on it
1132 * If line has just spaces, period, * and/or - on it, don't
1133 * count it, since empty lines with asterisks or dashes to
1134 * separate sections are common.
1136 * Returns: TRUE if the line is empty.
1138 gboolean analyse_quotes(const char *aline,int linecnt,struct counters *counters)
/*
 * aline:    the line to scan, UTF-8.
 * linecnt:  line number used in query messages. It is an int and is cast
 *           to long where it is printed with %ld, so the argument type
 *           matches the format specifier. The parameter shadows the
 *           file-scope counter of the same name.
 * counters: quote, bracket and underscore tallies updated through
 *           count_quote(), increment_matching() and c_unders.
 *
 * Double quotes are always counted. Single quotes are counted only when
 * pswit[SQUOTE_SWITCH] is set, and are classified as opening, closing or
 * neutral from the neighbouring characters. A count_quote() error is
 * reported as a query against this line and then cleared.
 */
1141 /* assume the line is empty until proven otherwise */
1142 gboolean isemptyline=TRUE;
1143 const char *s=aline,*sprev,*snext;
1146 GError *tmp_err=NULL;
1149 snext=g_utf8_next_char(s);
1150 c=g_utf8_get_char(s);
1151 if (CHAR_IS_DQUOTE(c))
1152 (void)count_quote(counters,c,QUOTE_CLASS(c),&tmp_err);
1153 else if (CHAR_IS_SQUOTE(c) && pswit[SQUOTE_SWITCH])
1158 * At start of line, it can only be a quotation mark.
1159 * Hardcode a very common exception!
1161 if (!g_str_has_prefix(snext,"tis") &&
1162 !g_str_has_prefix(snext,"Tis"))
1163 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
1165 else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
1166 g_unichar_isalpha(g_utf8_get_char(snext)))
1167 /* Do nothing! it's definitely an apostrophe, not a quote */
1169 /* it's outside a word - let's check it out */
1170 else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||
1171 g_unichar_isalpha(g_utf8_get_char(snext)))
1173 /* certainly looks like a quotation mark */
1174 if (!g_str_has_prefix(snext,"tis") &&
1175 !g_str_has_prefix(snext,"Tis"))
1176 /* hardcode a very common exception! */
1178 if (strchr(".?!,;:",g_utf8_get_char(sprev)))
1179 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
1181 (void)count_quote(counters,c,OPENING_QUOTE,&tmp_err);
1186 /* now - is it a quotation mark? */
1187 guessquote=0; /* accumulate clues */
1188 if (g_unichar_isalpha(g_utf8_get_char(sprev)))
1190 /* it follows a letter - could be either */
1192 if (g_utf8_get_char(sprev)=='s')
1194 /* looks like a plural apostrophe */
1196 if (g_utf8_get_char(snext)==CHAR_SPACE)
1200 if (innermost_quote_matches(counters,c))
1202 * Give it the benefit of some doubt,
1203 * if a squote is already open.
1209 (void)count_quote(counters,c,CLOSING_QUOTE,&tmp_err);
1212 /* no adjacent letter - it must be a quote of some kind */
1213 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
1218 if (pswit[ECHO_SWITCH])
1219 g_print("\n%s\n",aline);
1220 if (!pswit[OVERVIEW_SWITCH])
1221 g_print(" Line %ld column %ld - %s\n",
1222 (long)linecnt,g_utf8_pointer_to_offset(aline,s)+1,tmp_err->message); /* cast: linecnt is int, the format expects long */
1223 g_clear_error(&tmp_err);
1225 if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
1227 isemptyline=FALSE; /* ignore lines like * * * as spacers */
1228 if (c==CHAR_UNDERSCORE)
1229 counters->c_unders++;
1230 if (c==CHAR_OPEN_SBRACK)
1232 if (!matching_difference(counters,COUNTER_ILLUSTRATION) &&
1233 !matching_difference(counters,c) && s==aline &&
1234 g_str_has_prefix(s,"[Illustration:")) /* "[Illustration:" at the very start of a line is tracked separately */
1235 increment_matching(counters,COUNTER_ILLUSTRATION,TRUE);
1237 increment_matching(counters,c,TRUE);
1239 else if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK)
1240 increment_matching(counters,c,TRUE);
1241 if (c==CHAR_CLOSE_SBRACK)
1243 if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) &&
1244 !matching_difference(counters,c) && !*snext) /* !*snext: the ']' is the last character of the line */
1245 increment_matching(counters,COUNTER_ILLUSTRATION,FALSE);
1247 increment_matching(counters,c,FALSE);
1249 else if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK)
1250 increment_matching(counters,c,FALSE);
1258 * check_for_control_characters:
1260 * Check for invalid or questionable characters in the line
1261 * Anything above 127 is invalid for plain ASCII, and
1262 * non-printable control characters should also be flagged.
1263 * Tabs should generally not be there.
/*
 * Scans aline one UTF-8 character at a time and reports each character
 * below CHAR_SPACE other than LF, CR and tab.  The line is echoed first
 * when pswit[ECHO_SWITCH] is set; the report itself is printed unless
 * pswit[OVERVIEW_SWITCH] is set.
 */
1265 void check_for_control_characters(const char *aline)
1269 for (s=aline;*s;s=g_utf8_next_char(s))
1271 c=g_utf8_get_char(s);
1272 if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
1274 if (pswit[ECHO_SWITCH])
1275 g_print("\n%s\n",aline);
1276 if (!pswit[OVERVIEW_SWITCH])
/*
 * g_utf8_pointer_to_offset() takes the string start first and the
 * position second; with the arguments the other way round the offset
 * is negative and the column printed is wrong.
 */
1277 g_print(" Line %ld column %ld - Control character %u\n",
1278 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1286 * check_for_odd_characters:
1288 * Check for binary and other odd characters.
/*
 * Examines every character of aline and queries the ones that rarely
 * belong in a text: characters outside the expected character set, tabs,
 * tildes, carats, forward slashes and asterisks.  Each kind of query has
 * its own e* flag, tested before reporting, so that (per the comment
 * below) a warning is not repeated on one line.
 *
 * aline:       the line being examined
 * warnings:    per-file switches read here (bin, fslash, ast)
 * isemptyline: when TRUE the asterisk query is suppressed
 */
1290 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
1291 gboolean isemptyline)
1293 /* Don't repeat multiple warnings on one line. */
1294 gboolean eInvalidChar=FALSE,eTab=FALSE,eTilde=FALSE;
1295 gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
1300 for (s=aline;*s;s=g_utf8_next_char(s))
1302 c=g_utf8_get_char(s);
/*
 * With warnings->bin set: anything below CHAR_SPACE other than tab and
 * newline, or anything above 127, is reported.  Code points in 128-159
 * or above 255 are called "Non-ISO-8859", the rest "Non-ASCII".
 */
1303 if (warnings->bin && !eInvalidChar &&
1304 (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
1306 if (pswit[ECHO_SWITCH])
1307 g_print("\n%s\n",aline);
1308 if (!pswit[OVERVIEW_SWITCH])
1309 if (c>127 && c<160 || c>255)
1310 g_print(" Line %ld column %ld - "
1311 "Non-ISO-8859 character %u\n",
1312 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1314 g_print(" Line %ld column %ld - "
1315 "Non-ASCII character %u\n",
1316 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
/* The checks below only run when an explicit charset was given. */
1321 if (!eInvalidChar && charset)
/*
 * (GIConv)-1 is the value charset_validator is initialised with;
 * presumably it means no converter is in use, so generic Unicode
 * checks are applied instead - confirm.
 */
1323 if (charset_validator==(GIConv)-1)
1325 if (!g_unichar_isdefined(c))
1327 if (pswit[ECHO_SWITCH])
1328 g_print("\n%s\n",aline);
1329 if (!pswit[OVERVIEW_SWITCH])
1330 g_print(" Line %ld column %ld - Unassigned UNICODE "
1331 "code point U+%04" G_GINT32_MODIFIER "X\n",
1332 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
/*
 * Private Use ranges, all in hexadecimal: U+E000-U+F8FF,
 * U+F0000-U+FFFFD and U+100000-U+10FFFD.  The last lower bound must be
 * 0x100000; decimal 100000 would start the range at U+186A0.
 */
1337 else if (c>=0xE000 && c<=0xF8FF || c>=0xF0000 && c<=0xFFFFD ||
1338 c>=0x100000 && c<=0x10FFFD)
1340 if (pswit[ECHO_SWITCH])
1341 g_print("\n%s\n",aline);
1342 if (!pswit[OVERVIEW_SWITCH])
1343 g_print(" Line %ld column %ld - Private Use "
1344 "character U+%04" G_GINT32_MODIFIER "X\n",
1345 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
/*
 * Convert just this one character with the validator; presumably a
 * failed conversion is what triggers the "Non-<charset>" report - confirm.
 */
1353 t=g_convert_with_iconv(s,g_utf8_next_char(s)-s,
1354 charset_validator,NULL,&nb,NULL);
1359 if (pswit[ECHO_SWITCH])
1360 g_print("\n%s\n",aline);
1361 if (!pswit[OVERVIEW_SWITCH])
1362 g_print(" Line %ld column %ld - Non-%s "
1363 "character %u\n",linecnt,
1364 g_utf8_pointer_to_offset(aline,s)+1,charset,c);
1371 if (!eTab && c==CHAR_TAB)
1373 if (pswit[ECHO_SWITCH])
1374 g_print("\n%s\n",aline);
1375 if (!pswit[OVERVIEW_SWITCH])
1376 g_print(" Line %ld column %ld - Tab character?\n",
1377 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1382 if (!eTilde && c==CHAR_TILDE)
1385 * Often used by OCR software to indicate an
1386 * unrecognizable character.
1388 if (pswit[ECHO_SWITCH])
1389 g_print("\n%s\n",aline);
1390 if (!pswit[OVERVIEW_SWITCH])
1391 g_print(" Line %ld column %ld - Tilde character?\n",
1392 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1397 if (!eCarat && c==CHAR_CARAT)
1399 if (pswit[ECHO_SWITCH])
1400 g_print("\n%s\n",aline);
1401 if (!pswit[OVERVIEW_SWITCH])
1402 g_print(" Line %ld column %ld - Carat character?\n",
1403 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Forward slashes are only queried while warnings->fslash is set. */
1408 if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
1410 if (pswit[ECHO_SWITCH])
1411 g_print("\n%s\n",aline);
1412 if (!pswit[OVERVIEW_SWITCH])
1413 g_print(" Line %ld column %ld - Forward slash?\n",
1414 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1420 * Report asterisks only in paranoid mode,
1421 * since they're often deliberate.
1423 if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
1426 if (pswit[ECHO_SWITCH])
1427 g_print("\n%s\n",aline);
1428 if (!pswit[OVERVIEW_SWITCH])
1429 g_print(" Line %ld column %ld - Asterisk?\n",
1430 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1439 * check_for_long_line:
1441 * Check for line too long.
/*
 * Queries aline when it holds more than LONGEST_PG_LINE characters.
 * The length comes from g_utf8_strlen(), so it counts characters rather
 * than bytes, and the same value is printed both as the column and as
 * the line length.
 */
1443 void check_for_long_line(const char *aline)
1445 if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
1447 if (pswit[ECHO_SWITCH])
1448 g_print("\n%s\n",aline);
1449 if (!pswit[OVERVIEW_SWITCH])
1450 g_print(" Line %ld column %ld - Long line %ld\n",
1451 linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
1458 * check_for_short_line:
1460 * Check for line too short.
1462 * This one is a bit trickier to implement: we don't want to
1463 * flag the last line of a paragraph for being short, so we
1464 * have to wait until we know that our current line is a
1465 * "normal" line, then report the _previous_ line if it was too
1466 * short. We also don't want to report indented lines like
1467 * chapter heads or formatted quotations. We therefore keep
1468 * last->len as the length of the last line examined, and
1469 * last->blen as the length of the last but one, and try to
1470 * suppress unnecessary warnings by checking that both were of
1471 * "normal" length. We keep the first character of the last
1472 * line in last->start, and if it was a space, we assume that
1473 * the formatting is deliberate. I can't figure out a way to
1474 * distinguish something like a quoted verse left-aligned or
1475 * the header or footer of a letter from a paragraph of short
1476 * lines - maybe if I examined the whole paragraph, and if the
1477 * para has less than, say, 8 lines and if all lines are short,
1478 * then just assume it's OK? Need to look at some texts to see
1479 * how often a formula like this would get the right result.
/*
 * Reports the previous line (prevline, numbered linecnt-1) as short.
 * All of these must hold: aline has more than one character, last->len
 * is greater than 1 but below SHORTEST_PG_LINE, last->blen is above
 * SHORTEST_PG_LINE, and last->start is not a space.  The length of
 * prevline is printed both as the column and as the line length.
 */
1481 void check_for_short_line(const char *aline,const struct line_properties *last)
1483 if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
1484 last->len<SHORTEST_PG_LINE && last->blen>1 &&
1485 last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
1487 if (pswit[ECHO_SWITCH])
/* The echo shows prevline, the line being reported, not aline. */
1488 g_print("\n%s\n",prevline);
1489 if (!pswit[OVERVIEW_SWITCH])
1490 g_print(" Line %ld column %ld - Short line %ld?\n",
1491 linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
1498 * check_for_starting_punctuation:
1500 * Look for punctuation other than full ellipses at start of line.
/*
 * Queries a non-empty line whose first character is one of . ? ! , ; :
 * unless the line begins with the spaced ellipsis ". . .".  The column
 * in the message is always 1.
 */
1502 void check_for_starting_punctuation(const char *aline)
1504 if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
1505 !g_str_has_prefix(aline,". . ."))
1507 if (pswit[ECHO_SWITCH])
1508 g_print("\n%s\n",aline);
1509 if (!pswit[OVERVIEW_SWITCH])
1510 g_print(" Line %ld column 1 - Begins with punctuation?\n",
1518 * check_for_spaced_emdash:
1520 * Check for spaced em-dashes.
1522 * We must check _all_ occurrences of "--" on the line
1523 * hence the loop - even if the first double-dash is OK
1524 * there may be another that's wrong later on.
/*
 * Visits every "--" in aline, found with strstr(), and queries it when
 * the character just before it or the character just after it is a space.
 * The column reported is that of the first dash.
 */
1526 void check_for_spaced_emdash(const char *aline)
1528 const char *s,*t,*next;
/* The loop ends when strstr() finds no further "--"; each pass resumes at next. */
1529 for (s=aline;t=strstr(s,"--");s=next)
/* next addresses the character following the two dashes. */
1531 next=g_utf8_next_char(g_utf8_next_char(t));
/* && binds tighter than ||: the t>aline guard protects only the look-behind. */
1532 if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
1533 g_utf8_get_char(next)==CHAR_SPACE)
1535 if (pswit[ECHO_SWITCH])
1536 g_print("\n%s\n",aline);
1537 if (!pswit[OVERVIEW_SWITCH])
1538 g_print(" Line %ld column %ld - Spaced em-dash?\n",
1539 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1547 * check_for_spaced_dash:
1549 * Check for spaced dashes.
/*
 * Queries a single dash that has a space beside it.  Only the first " -"
 * in the line is examined, and "- " is searched for only when the line
 * contains no " -" at all (else-if).  A match that is part of a double
 * dash is not reported: " -" followed by another '-', or "- " preceded
 * by another '-'.
 */
1551 void check_for_spaced_dash(const char *aline)
1554 if ((s=strstr(aline," -")))
/* Skip " --": the character after the matched dash must not be a dash. */
1556 if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
1558 if (pswit[ECHO_SWITCH])
1559 g_print("\n%s\n",aline);
1560 if (!pswit[OVERVIEW_SWITCH])
1561 g_print(" Line %ld column %ld - Spaced dash?\n",
1562 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1567 else if ((s=strstr(aline,"- ")))
/* s==aline is tested first so that no character before the line start is read. */
1569 if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
1571 if (pswit[ECHO_SWITCH])
1572 g_print("\n%s\n",aline);
1573 if (!pswit[OVERVIEW_SWITCH])
1574 g_print(" Line %ld column %ld - Spaced dash?\n",
1575 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1583 * check_for_unmarked_paragraphs:
1585 * Check for unmarked paragraphs indicated by separate speakers.
1587 * May well be false positive:
1588 * "Bravo!" "Wonderful!" called the crowd.
1589 * but useful all the same.
/*
 * Searches aline for a double quote, one separator character and another
 * double quote, which suggests two speeches run together without a
 * paragraph break, and reports the column of the first quote.
 */
1591 void check_for_unmarked_paragraphs(const char *aline)
1594 s=strstr(aline,"\" \"");
/*
 * NOTE(review): this search looks identical to the one above; presumably
 * the separator here is a different whitespace character (for example a
 * no-break space) - confirm.
 */
1596 s=strstr(aline,"\" \"");
1599 if (pswit[ECHO_SWITCH])
1600 g_print("\n%s\n",aline);
1601 if (!pswit[OVERVIEW_SWITCH])
1602 g_print(" Line %ld column %ld - "
1603 "Query missing paragraph break?\n",
1604 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1611 * check_for_jeebies:
1613 * Check for "to he" and other easy h/b errors.
1615 * This is a very inadequate effort on the h/b problem,
1616 * but the phrase "to he" is always an error, whereas "to
1617 * be" is quite common.
1618 * Similarly, '"Quiet!", be said.' is a non-be error
1619 * "to he" is _not_ always an error!:
1620 * "Where they went to he couldn't say."
1621 * Another false positive:
1622 * What would "Cinderella" be without the . . .
1623 * and another: "If he wants to he can see for himself."
/*
 * Looks for fixed phrases that usually result from an h/b confusion and
 * prints one of three queries: "he/be", "had/bad" or "hut/but".  Every
 * phrase is matched literally with strstr(), so case and the surrounding
 * spaces or punctuation must match exactly.  The column reported is that
 * of the first character of the matched phrase.
 */
1625 void check_for_jeebies(const char *aline)
/*
 * First group: "be" where "he" was probably meant, plus " to he ".
 * s is assigned by each search in turn; presumably later searches only
 * run while s is still NULL - confirm.
 */
1628 s=strstr(aline," be could ");
1630 s=strstr(aline," be would ");
1632 s=strstr(aline," was be ");
1634 s=strstr(aline," be is ");
1636 s=strstr(aline," is be ");
1638 s=strstr(aline,"\", be ");
1640 s=strstr(aline,"\" be ");
/*
 * NOTE(review): this pattern looks identical to the previous one;
 * presumably the whitespace differs - confirm.
 */
1642 s=strstr(aline,"\" be ");
1644 s=strstr(aline," to he ");
1647 if (pswit[ECHO_SWITCH])
1648 g_print("\n%s\n",aline);
1649 if (!pswit[OVERVIEW_SWITCH])
1650 g_print(" Line %ld column %ld - Query he/be error?\n",
1651 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Second group: "had" after an article, or "bad" after a pronoun. */
1655 s=strstr(aline," the had ");
1657 s=strstr(aline," a had ");
1659 s=strstr(aline," they bad ");
1661 s=strstr(aline," she bad ");
1663 s=strstr(aline," he bad ");
1665 s=strstr(aline," you bad ");
1667 s=strstr(aline," i bad ");
1670 if (pswit[ECHO_SWITCH])
1671 g_print("\n%s\n",aline);
1672 if (!pswit[OVERVIEW_SWITCH])
1673 g_print(" Line %ld column %ld - Query had/bad error?\n",
1674 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Third group: "hut" directly after a semicolon or comma. */
1678 s=strstr(aline,"; hut ");
1680 s=strstr(aline,", hut ");
1683 if (pswit[ECHO_SWITCH])
1684 g_print("\n%s\n",aline);
1685 if (!pswit[OVERVIEW_SWITCH])
1686 g_print(" Line %ld column %ld - Query hut/but error?\n",
1687 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1694 * check_for_mta_from:
1696 * Special case - angled bracket in front of "From" placed there by an
1697 * MTA when sending an e-mail.
/*
 * Searches aline for the literal text ">From" (anywhere in the line, case
 * sensitive) and reports the column of the '>'.
 */
1699 void check_for_mta_from(const char *aline)
1702 s=strstr(aline,">From");
1705 if (pswit[ECHO_SWITCH])
1706 g_print("\n%s\n",aline);
1707 if (!pswit[OVERVIEW_SWITCH])
1708 g_print(" Line %ld column %ld - "
1709 "Query angled bracket with From\n",
1710 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1717 * check_for_orphan_character:
1719 * Check for a single character line -
1720 * often an overflow from bad wrapping.
/*
 * Queries a line that consists of exactly one character, unless that
 * character is I, V, X, L or a digit.  The column in the message is
 * always 1.
 */
1722 void check_for_orphan_character(const char *aline)
1725 c=g_utf8_get_char(aline);
/* True when the line is non-empty and the string ends right after its first character. */
1726 if (c && !*g_utf8_next_char(aline))
1728 if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
1729 ; /* Nothing - ignore numerals alone on a line. */
1732 if (pswit[ECHO_SWITCH])
1733 g_print("\n%s\n",aline);
1734 if (!pswit[OVERVIEW_SWITCH])
1735 g_print(" Line %ld column 1 - Query single character line\n",
1744 * check_for_pling_scanno:
1746 * Check for I" - often should be !
/*
 * Searches aline for a space, a capital I and a double quote in sequence
 * and queries whether the I should have been an exclamation mark.
 */
1748 void check_for_pling_scanno(const char *aline)
1751 s=strstr(aline," I\"");
1754 if (pswit[ECHO_SWITCH])
1755 g_print("\n%s\n",aline);
1756 if (!pswit[OVERVIEW_SWITCH])
/*
 * NOTE(review): the column is the bare offset of the match; the other
 * checks in this file add 1 - confirm this is intended.
 */
1757 g_print(" Line %ld column %ld - Query I=exclamation mark?\n",
1758 linecnt,g_utf8_pointer_to_offset(aline,s));
1765 * check_for_extra_period:
1767 * Check for period without a capital letter. Cut-down from gutspell.
1768 * Only works when it happens on a single line.
/*
 * In paranoid mode, examines each ". " in aline.  When the first letter
 * found after the period is lower case, the word in front of the period
 * is copied into testword and put through several tests: membership of
 * abbrev[], a leading digit, a length of one character, isroman(), and a
 * search for a vowel.  Presumably each of these can suppress the query -
 * confirm.  A word already present in the qperiod tree is not reported
 * again unless pswit[VERBOSE_SWITCH] is set.
 *
 * aline:    the line being examined
 * warnings: per-file switches; only isDutch is read here
 */
1770 void check_for_extra_period(const char *aline,const struct warnings *warnings)
1772 const char *s,*t,*s1,*sprev;
1777 gunichar c,nc,pc,*decomposition;
1778 if (pswit[PARANOID_SWITCH])
1780 for (t=aline;t=strstr(t,". ");)
1784 t=g_utf8_next_char(t);
1785 /* start of line punctuation is handled elsewhere */
1788 if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
1790 t=g_utf8_next_char(t);
1793 if (warnings->isDutch)
1795 /* For Frank & Jeroen -- 's Middags case */
1796 gunichar c2,c3,c4,c5;
/*
 * NOTE(review): these look up to five characters past t without an
 * end-of-string test being visible - confirm they cannot run past the
 * terminator.
 */
1797 c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
1798 c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
1799 c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
1800 c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
1801 if (CHAR_IS_APOSTROPHE(c2) &&
1802 g_unichar_islower(c3) && c4==CHAR_SPACE &&
1803 g_unichar_isupper(c5))
1805 t=g_utf8_next_char(t);
/* Skip forward from just past the ". " to the first letter or digit. */
1809 s1=g_utf8_next_char(g_utf8_next_char(t));
/*
 * The character is a gunichar, so it is classified with
 * g_unichar_isdigit(); isdigit() is only defined for unsigned char
 * values and EOF.
 */
1810 while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
1811 !g_unichar_isdigit(g_utf8_get_char(s1)))
1812 s1=g_utf8_next_char(s1);
1813 if (g_unichar_islower(g_utf8_get_char(s1)))
1815 /* we have something to investigate */
1817 /* so let's go back and find out */
1818 nc=g_utf8_get_char(t);
1819 s1=g_utf8_prev_char(t);
1820 c=g_utf8_get_char(s1);
1821 sprev=g_utf8_prev_char(s1);
1822 pc=g_utf8_get_char(sprev);
1824 (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||
1825 g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&
1826 g_unichar_isalpha(nc)))
1831 sprev=g_utf8_prev_char(s1);
1832 pc=g_utf8_get_char(sprev);
1834 s1=g_utf8_next_char(s1);
1837 testword=g_strndup(s1,s-s1);
1839 testword=g_strdup(s1);
/* abbrev[] is scanned up to its empty-string terminator. */
1840 for (i=0;*abbrev[i];i++)
1841 if (!strcmp(testword,abbrev[i]))
1843 if (g_unichar_isdigit(g_utf8_get_char(testword)))
1845 if (!*g_utf8_next_char(testword))
1847 if (isroman(testword))
/* Vowel search on the first element of each character's canonical decomposition. */
1852 for (s=testword;*s;s=g_utf8_next_char(s))
1854 decomposition=g_unicode_canonical_decomposition(
1855 g_utf8_get_char(s),&len);
1856 if (g_utf8_strchr("aeiou",-1,decomposition[0]))
1858 g_free(decomposition);
1862 (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
/* Record the word in qperiod; the tree receives its own copy of the key. */
1864 g_tree_insert(qperiod,g_strdup(testword),
1865 GINT_TO_POINTER(1));
1866 if (pswit[ECHO_SWITCH])
1867 g_print("\n%s\n",aline);
1868 if (!pswit[OVERVIEW_SWITCH])
1869 g_print(" Line %ld column %ld - Extra period?\n",
1870 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1876 t=g_utf8_next_char(t);
1882 * check_for_following_punctuation:
1884 * Check for words usually not followed by punctuation.
/*
 * When pswit[TYPO_SWITCH] is set, compares a lower-cased word (inword)
 * with two word lists and queries the punctuation found at s:
 *   - a word in nocomma[] with ',', ';' or ':' at s
 *   - a word in noperiod[] with '.' or '!' at s
 * Both lists are scanned up to their empty-string terminator.
 */
1886 void check_for_following_punctuation(const char *aline)
1889 const char *s,*wordstart;
1892 if (pswit[TYPO_SWITCH])
/* g_utf8_strdown() makes the comparison with the word lists case-insensitive. */
1903 inword=g_utf8_strdown(t,-1);
1905 for (i=0;*nocomma[i];i++)
1906 if (!strcmp(inword,nocomma[i]))
1908 c=g_utf8_get_char(s);
1909 if (c==',' || c==';' || c==':')
1911 if (pswit[ECHO_SWITCH])
1912 g_print("\n%s\n",aline);
1913 if (!pswit[OVERVIEW_SWITCH])
1914 g_print(" Line %ld column %ld - "
1915 "Query punctuation after %s?\n",
1916 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
1922 for (i=0;*noperiod[i];i++)
1923 if (!strcmp(inword,noperiod[i]))
1925 c=g_utf8_get_char(s);
1926 if (c=='.' || c=='!')
1928 if (pswit[ECHO_SWITCH])
1929 g_print("\n%s\n",aline);
1930 if (!pswit[OVERVIEW_SWITCH])
1931 g_print(" Line %ld column %ld - "
1932 "Query punctuation after %s?\n",
1933 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
1947 * Check for commonly mistyped words,
1948 * and digits like 0 for O in a word.
/*
 * Takes words from aline with getaword() and tests each one:
 *   - mixdigit(): "Query digit in <word>"
 *   - with TYPO_SWITCH or USERTYPO_SWITCH set: a case-folded copy
 *     (testword) is checked against the built-in heuristics and word
 *     lists ("Query word") and against the usertypo tree
 *     ("Query possible scanno")
 *   - in paranoid mode with warnings->digit set: a word that is exactly
 *     "0" or "1" ("Query standalone")
 * Words reported by "Query word" are tracked in the qword tree, which is
 * looked up before each report.
 */
1950 void check_for_typos(const char *aline,struct warnings *warnings)
1952 const char *s,*t,*nt,*wordstart;
1954 gunichar *decomposition;
1956 int i,vowel,consonant,*dupcnt;
1957 gboolean isdup,istypo,alower;
1960 gsize decomposition_len;
1964 inword=getaword(&s);
1968 continue; /* don't bother with empty lines */
1970 if (mixdigit(inword))
1972 if (pswit[ECHO_SWITCH])
1973 g_print("\n%s\n",aline);
1974 if (!pswit[OVERVIEW_SWITCH])
1975 g_print(" Line %ld column %ld - Query digit in %s\n",
1976 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
1981 * Put the word through a series of tests for likely typos and OCR
1984 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
/* Character-by-character pass over the word as written, looking at letter case. */
1988 for (t=inword;*t;t=g_utf8_next_char(t))
1990 c=g_utf8_get_char(t);
1991 nt=g_utf8_next_char(t);
1992 /* lowercase for testing */
1993 if (g_unichar_islower(c))
1995 if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
1998 * We have an uppercase mid-word. However, there are
2000 * Mac and Mc like McGill
2001 * French contractions like l'Abbe
2003 offset=g_utf8_pointer_to_offset(inword,t);
2005 pc=g_utf8_get_char(g_utf8_prev_char(t));
/*
 * Exceptions: offset 2 with "mc", offset 3 with "mac", or an
 * apostrophe just before this character.
 */
2008 if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
2009 offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
2010 g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
2011 CHAR_IS_APOSTROPHE(pc))
/* All list comparisons below use the case-folded copy. */
2017 testword=g_utf8_casefold(inword,-1);
2019 if (pswit[TYPO_SWITCH])
2022 * Check for certain unlikely two-letter combinations at word
2025 len=g_utf8_strlen(testword,-1);
2028 for (i=0;*nostart[i];i++)
2029 if (g_str_has_prefix(testword,nostart[i]))
2031 for (i=0;*noend[i];i++)
2032 if (g_str_has_suffix(testword,noend[i]))
2035 /* ght is common, gbt never. Like that. */
2036 if (strstr(testword,"cb"))
2038 if (strstr(testword,"gbt"))
2040 if (strstr(testword,"pbt"))
2042 if (strstr(testword,"tbs"))
2044 if (strstr(testword,"mrn"))
2046 if (strstr(testword,"ahle"))
2048 if (strstr(testword,"ihle"))
2051 * "TBE" does happen - like HEARTBEAT - but uncommon.
2052 * Also "TBI" - frostbite, outbid - but uncommon.
2053 * Similarly "ii" like Hawaii, or Pompeii, and in Roman
2054 * numerals, but "ii" is a common scanno.
2056 if (strstr(testword,"tbi"))
2058 if (strstr(testword,"tbe"))
2060 if (strstr(testword,"ii"))
2063 * Check for no vowels or no consonants.
2064 * If none, flag a typo.
2066 if (!istypo && len>1)
2069 for (t=testword;*t;t=g_utf8_next_char(t))
2071 c=g_utf8_get_char(t);
2073 g_unicode_canonical_decomposition(c,&decomposition_len);
2074 if (c=='y' || g_unichar_isdigit(c))
2076 /* Yah, this is loose. */
/* decomposition[0] is tested so that an accented vowel is recognised by its base letter. */
2080 else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
2084 g_free(decomposition);
2086 if (!vowel || !consonant)
2090 * Now exclude the word from being reported if it's in
2093 for (i=0;*okword[i];i++)
2094 if (!strcmp(testword,okword[i]))
2097 * What looks like a typo may be a Roman numeral.
2100 if (istypo && isroman(testword))
2102 /* Check the manual list of typos. */
2104 for (i=0;*typo[i];i++)
2105 if (!strcmp(testword,typo[i]))
2108 * Check lowercase s, l, i and m - special cases.
2109 * "j" - often a semi-colon gone wrong.
2110 * "d" for a missing apostrophe - he d
2113 if (!istypo && len==1 &&
2114 g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
/* qword maps each word to an int allocated below; presumably a per-word report count - confirm. */
2118 dupcnt=g_tree_lookup(qword,testword);
2122 isdup=!pswit[VERBOSE_SWITCH];
2126 dupcnt=g_new0(int,1);
2127 g_tree_insert(qword,g_strdup(testword),dupcnt);
2132 if (pswit[ECHO_SWITCH])
2133 g_print("\n%s\n",aline);
2134 if (!pswit[OVERVIEW_SWITCH])
2136 g_print(" Line %ld column %ld - Query word %s",
2137 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
2139 if (!pswit[VERBOSE_SWITCH])
2140 g_print(" - not reporting duplicates");
2148 /* check the user's list of typos */
2149 if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
2151 if (pswit[ECHO_SWITCH])
2152 g_print("\n%s\n",aline);
2153 if (!pswit[OVERVIEW_SWITCH])
/* NOTE(review): this column uses +2 where the "Query word" report uses +1 - confirm. */
2154 g_print(" Line %ld column %ld - Query possible scanno %s\n",
2155 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
2157 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
2159 if (pswit[PARANOID_SWITCH] && warnings->digit)
2161 /* In paranoid mode, query all 0 and 1 standing alone. */
2162 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
2164 if (pswit[ECHO_SWITCH])
2165 g_print("\n%s\n",aline);
2166 if (!pswit[OVERVIEW_SWITCH])
2167 g_print(" Line %ld column %ld - Query standalone %s\n",
2168 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
2179 * check_for_misspaced_punctuation:
2181 * Look for added or missing spaces around punctuation and quotes.
2182 * If there is a punctuation character like ! with no space on
2183 * either side, suspect a missing!space. If there are spaces on
2184 * both sides , assume a typo. If we see a double quote with no
2185 * space or punctuation on either side of it, assume unspaced
2186 * quotes "like"this.
/*
 * Makes several separate passes over aline, each looking at a character
 * together with its neighbours (pc before, nc after):
 *   1. punctuation from ".?!,;:_" with no space where one is expected
 *      ("Missing space?") or with a space before it and a space or the
 *      end of the line after it ("Spaced punctuation?")
 *   2. "?!,;:" preceded by a space and not followed by one
 *   3. a period preceded by a space and followed by a letter
 *   4. quotes with no space or punctuation on either side
 *      ("Unspaced quotes?")
 *   5. double-quote parity, kept in parities->dquote across calls
 *      ("Wrongspaced quotes?")
 *   6. single-quote parity, kept in parities->squote, only when
 *      pswit[SQUOTE_SWITCH] is set ("Wrongspaced singlequotes?")
 *
 * aline:       the line being examined
 * parities:    open/closed state of double and single quotes, toggled here
 * isemptyline: suppresses the spaced-punctuation queries when TRUE
 */
2188 void check_for_misspaced_punctuation(const char *aline,
2189 struct parities *parities,gboolean isemptyline)
2191 gboolean isacro,isellipsis;
2193 gunichar c,nc,pc,n2c;
/* Pass 1: the loop starts at the second character and runs while a next character exists. */
2194 c=g_utf8_get_char(aline);
2195 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2196 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2200 nc=g_utf8_get_char(g_utf8_next_char(s));
2201 /* For each character in the line after the first. */
2202 if (g_utf8_strchr(".?!,;:_",-1,c)) /* if it's punctuation */
2204 /* we need to suppress warnings for acronyms like M.D. */
2206 /* we need to suppress warnings for ellipsis . . . */
2209 * If there are letters on both sides of it or
2210 * if it's strict punctuation followed by an alpha.
2212 if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
2213 g_utf8_strchr("?!,;:",-1,c)))
2217 if (g_utf8_pointer_to_offset(aline,s)>2 &&
2218 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
2220 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
2226 if (pswit[ECHO_SWITCH])
2227 g_print("\n%s\n",aline);
2228 if (!pswit[OVERVIEW_SWITCH])
2229 g_print(" Line %ld column %ld - Missing space?\n",
2230 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2235 if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
2238 * If there are spaces on both sides,
2239 * or space before and end of line.
2243 if (g_utf8_pointer_to_offset(aline,s)>2 &&
2244 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
2246 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
2250 if (!isemptyline && !isellipsis)
2252 if (pswit[ECHO_SWITCH])
2253 g_print("\n%s\n",aline);
2254 if (!pswit[OVERVIEW_SWITCH])
2255 g_print(" Line %ld column %ld - "
2256 "Spaced punctuation?\n",linecnt,
2257 g_utf8_pointer_to_offset(aline,s)+1);
2264 /* Split out the characters that CANNOT be preceded by space. */
2265 c=g_utf8_get_char(aline);
2266 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2267 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2271 nc=g_utf8_get_char(g_utf8_next_char(s));
2272 /* for each character in the line after the first */
2273 if (g_utf8_strchr("?!,;:",-1,c))
2275 /* if it's punctuation that _cannot_ have a space before it */
2276 if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
2279 * If nc DOES == space,
2280 * it was already reported just above.
2282 if (pswit[ECHO_SWITCH])
2283 g_print("\n%s\n",aline);
2284 if (!pswit[OVERVIEW_SWITCH])
2285 g_print(" Line %ld column %ld - Spaced punctuation?\n",
2286 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2293 * Special case " .X" where X is any alpha.
2294 * This plugs a hole in the acronym code above.
2295 * Inelegant, but maintainable.
2297 c=g_utf8_get_char(aline);
2298 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2299 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2303 nc=g_utf8_get_char(g_utf8_next_char(s));
2304 /* for each character in the line after the first */
2307 /* if it's a period */
2308 if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
2311 * If the period follows a space and
2312 * is followed by a letter.
2314 if (pswit[ECHO_SWITCH])
2315 g_print("\n%s\n",aline);
2316 if (!pswit[OVERVIEW_SWITCH])
2317 g_print(" Line %ld column %ld - Spaced punctuation?\n",
2318 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Pass 4: unspaced quotes. */
2324 c=g_utf8_get_char(aline);
2325 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2326 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2330 nc=g_utf8_get_char(g_utf8_next_char(s));
2331 /* for each character in the line after the first */
2334 if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
2335 !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
2336 !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
2338 if (pswit[ECHO_SWITCH])
2339 g_print("\n%s\n",aline);
2340 if (!pswit[OVERVIEW_SWITCH])
2341 g_print(" Line %ld column %ld - Unspaced quotes?\n",
2342 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2348 /* Check parity of quotes. */
2349 nc=g_utf8_get_char(aline);
2350 for (s=aline;*s;s=g_utf8_next_char(s))
2353 nc=g_utf8_get_char(g_utf8_next_char(s));
/* The parity flag is flipped first; the tests after it look at what follows the quote. */
2356 parities->dquote=!parities->dquote;
2357 if (!parities->dquote)
2360 if (!g_utf8_strchr("_-.'`/,;:!?)]} ",-1,nc))
2362 if (pswit[ECHO_SWITCH])
2363 g_print("\n%s\n",aline);
2364 if (!pswit[OVERVIEW_SWITCH])
2365 g_print(" Line %ld column %ld - "
2366 "Wrongspaced quotes?\n",
2367 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/*
 * nc is a gunichar, so it is classified with g_unichar_isdigit();
 * isdigit() is only defined for unsigned char values and EOF.
 */
2375 if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
2376 !g_utf8_strchr("_-/.'`([{$",-1,nc) || !nc)
2378 if (pswit[ECHO_SWITCH])
2379 g_print("\n%s\n",aline);
2380 if (!pswit[OVERVIEW_SWITCH])
2381 g_print(" Line %ld column %ld - "
2382 "Wrongspaced quotes?\n",
2383 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* A double quote in column 1 followed by closing punctuation or a space. */
2390 if (g_utf8_get_char(aline)==CHAR_DQUOTE)
2392 if (g_utf8_strchr(",;:!?)]} ",-1,
2393 g_utf8_get_char(g_utf8_next_char(aline))))
2395 if (pswit[ECHO_SWITCH])
2396 g_print("\n%s\n",aline);
2397 if (!pswit[OVERVIEW_SWITCH])
2398 g_print(" Line %ld column 1 - Wrongspaced quotes?\n",
2404 if (pswit[SQUOTE_SWITCH])
2406 nc=g_utf8_get_char(aline);
2407 for (s=aline;*s;s=g_utf8_next_char(s))
2410 nc=g_utf8_get_char(g_utf8_next_char(s));
/*
 * Only single quotes at the start of the line, or without a letter on
 * at least one side, are treated as quotation marks.
 */
2411 if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&
2412 !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
2413 !g_unichar_isalpha(nc)))
2415 parities->squote=!parities->squote;
2416 if (!parities->squote)
2419 if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
2421 if (pswit[ECHO_SWITCH])
2422 g_print("\n%s\n",aline);
2423 if (!pswit[OVERVIEW_SWITCH])
2424 g_print(" Line %ld column %ld - "
2425 "Wrongspaced singlequotes?\n",
2426 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* As for double quotes: g_unichar_isdigit() because nc is a gunichar. */
2434 if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
2435 !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
2437 if (pswit[ECHO_SWITCH])
2438 g_print("\n%s\n",aline);
2439 if (!pswit[OVERVIEW_SWITCH])
2440 g_print(" Line %ld column %ld - "
2441 "Wrongspaced singlequotes?\n",
2442 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2453 * check_for_double_punctuation:
2455 * Look for double punctuation like ,. or ,,
2456 * Thanks to DW for the suggestion!
2457 * In books with references, ".," and ".;" are common
2458 * e.g. "etc., etc.," and vol. 1.; vol 3.;
2459 * OTOH, from my initial tests, there are also fairly
2460 * common errors. What to do? Make these cases paranoid?
2461 * ".," is the most common, so warnings->dotcomma is used
2462 * to suppress detailed reporting if it occurs often.
/*
 * Queries two adjacent characters that both come from ".?!,;:".
 * The visible exceptions are:
 *   - a doubled '.', '?' or '!'
 *   - ".," while warnings->dotcomma is not set
 *   - with warnings->isFrench set, an ellipsis "..." directly before or
 *     after one of , ; : ! ?
 * The column reported is that of the first of the two characters.
 *
 * aline:    the line being examined
 * warnings: per-file switches; dotcomma and isFrench are read here
 */
2464 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
2468 nc=g_utf8_get_char(aline);
2469 for (s=aline;*s;s=g_utf8_next_char(s))
2472 nc=g_utf8_get_char(g_utf8_next_char(s));
2473 /* for each punctuation character in the line */
2474 if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
2475 g_utf8_strchr(".?!,;:",-1,nc))
2477 /* followed by punctuation, it's a query, unless . . . */
2478 if (c==nc && (c=='.' || c=='?' || c=='!') ||
2479 !warnings->dotcomma && c=='.' && nc==',' ||
2480 warnings->isFrench && g_str_has_prefix(s,",...") ||
2481 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2482 warnings->isFrench && g_str_has_prefix(s,";...") ||
2483 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2484 warnings->isFrench && g_str_has_prefix(s,":...") ||
2485 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2486 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2487 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2488 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2489 warnings->isFrench && g_str_has_prefix(s,"...?"))
/*
 * The French ellipsis forms are tested a second time on their own;
 * presumably so that s can be moved past the whole sequence before nc
 * is re-read below - confirm.
 */
2491 if (warnings->isFrench && g_str_has_prefix(s,",...") ||
2492 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2493 warnings->isFrench && g_str_has_prefix(s,";...") ||
2494 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2495 warnings->isFrench && g_str_has_prefix(s,":...") ||
2496 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2497 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2498 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2499 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2500 warnings->isFrench && g_str_has_prefix(s,"...?"))
2503 nc=g_utf8_get_char(g_utf8_next_char(s));
2505 ; /* do nothing for .. !! and ?? which can be legit */
2509 if (pswit[ECHO_SWITCH])
2510 g_print("\n%s\n",aline);
2511 if (!pswit[OVERVIEW_SWITCH])
2512 g_print(" Line %ld column %ld - Double punctuation?\n",
2513 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2522 * check_for_spaced_quotes:
/*
 * Queries quotation marks that have a space on both sides.  Every
 * occurrence of space, double quote, space is reported first; then, for
 * each character in single_quotes[], the same three-character pattern is
 * built in a GString and every occurrence of it is reported.  The column
 * reported is that of the leading space.
 */
2524 void check_for_spaced_quotes(const char *aline)
2528 const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,
2532 while ((t=strstr(s," \" ")))
2534 if (pswit[ECHO_SWITCH])
2535 g_print("\n%s\n",aline);
2536 if (!pswit[OVERVIEW_SWITCH])
2537 g_print(" Line %ld column %ld - Spaced doublequote?\n",
2538 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
/*
 * Resume two characters after the match start, i.e. at the trailing
 * space, so that it can serve as the leading space of a following match.
 */
2541 s=g_utf8_next_char(g_utf8_next_char(t));
2543 pattern=g_string_new(NULL);
2544 for(i=0;i<G_N_ELEMENTS(single_quotes);i++)
/* Rebuild the pattern as space + this quote character + space. */
2546 g_string_assign(pattern," ");
2547 g_string_append_unichar(pattern,single_quotes[i]);
2548 g_string_append_c(pattern,' ');
2550 while ((t=strstr(s,pattern->str)))
2552 if (pswit[ECHO_SWITCH])
2553 g_print("\n%s\n",aline);
2554 if (!pswit[OVERVIEW_SWITCH])
2555 g_print(" Line %ld column %ld - Spaced singlequote?\n",
2556 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2559 s=g_utf8_next_char(g_utf8_next_char(t));
/* TRUE also releases the character data held by the GString. */
2562 g_string_free(pattern,TRUE);
2566 * check_for_miscased_genative:
2568 * Check special case of 'S instead of 's at end of word.
/*
 * Queries an apostrophe that is preceded by a lower-case letter and
 * followed by a capital 'S'.  The loop starts at the second character of
 * aline and runs while a next character exists.
 */
2570 void check_for_miscased_genative(const char *aline)
2576 c=g_utf8_get_char(aline);
2577 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2578 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2582 nc=g_utf8_get_char(g_utf8_next_char(s));
2583 if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))
2585 if (pswit[ECHO_SWITCH])
2586 g_print("\n%s\n",aline);
2587 if (!pswit[OVERVIEW_SWITCH])
/* Presumably s addresses the apostrophe, so +2 gives the column of the 'S' - confirm. */
2588 g_print(" Line %ld column %ld - Capital \"S\"?\n",
2589 linecnt,g_utf8_pointer_to_offset(aline,s)+2);
2597 * check_end_of_line:
2599 * Now check special cases - start and end of line -
2600 * for single and double quotes. Start is sometimes [sic]
2601 * but better to query it anyway.
2602 * While we're here, check for dash at end of line.
/*
 * aline:    the NUL-terminated UTF-8 line to examine.
 * warnings: only warnings->hyphen is read here; it gates the hyphen check.
 *
 * Three queries can be raised:
 *  - the last character is a double or single quote and the character
 *    before it is a space (column reported is the character length of
 *    the line);
 *  - the first character is a single quote and the second is a space
 *    (always reported as column 1);
 *  - with pswit[PARANOID_SWITCH] set and warnings->hyphen non-zero, the
 *    last character above CHAR_SPACE is a single hyphen, i.e. a hyphen
 *    not itself preceded by another hyphen.
 *
 * NOTE(review): the hyphen query prints the character offset of the
 * hyphen without adding one, unlike the quote queries - confirm that
 * this column convention is intended.
 *
 * NOTE(review): in the hyphen test, g_utf8_prev_char(s) is evaluated even
 * when the backward scan has stopped with s equal to aline (for example a
 * line consisting of a hyphen followed by a space). That would read the
 * byte before the start of the line - confirm that this is harmless for
 * the buffers passed in.
 */
2604 void check_end_of_line(const char *aline,struct warnings *warnings)
2609 lbytes=strlen(aline);
/* The end-of-line quote test needs at least two characters. */
2610 if (g_utf8_strlen(aline,lbytes)>1)
/* c1 is the last character of the line, c2 the one before it. */
2612 s=g_utf8_prev_char(aline+lbytes);
2613 c1=g_utf8_get_char(s);
2614 c2=g_utf8_get_char(g_utf8_prev_char(s));
2615 if ((c1==CHAR_DQUOTE || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)
2617 if (pswit[ECHO_SWITCH])
2618 g_print("\n%s\n",aline);
2619 if (!pswit[OVERVIEW_SWITCH])
2620 g_print(" Line %ld column %ld - Spaced quote?\n",linecnt,
2621 g_utf8_strlen(aline,lbytes));
/* Now c1 is the first character of the line and c2 the second. */
2625 c1=g_utf8_get_char(aline);
2626 c2=g_utf8_get_char(g_utf8_next_char(aline));
2627 if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)
2629 if (pswit[ECHO_SWITCH])
2630 g_print("\n%s\n",aline);
2631 if (!pswit[OVERVIEW_SWITCH])
2632 g_print(" Line %ld column 1 - Spaced quote?\n",linecnt);
2637 * Dash at end of line may well be legit - paranoid mode only
2638 * and don't report em-dash at line-end.
2640 if (pswit[PARANOID_SWITCH] && warnings->hyphen)
2642 for (s=g_utf8_prev_char(aline+lbytes);
2643 s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
2645 if (g_utf8_get_char(s)=='-' &&
2646 g_utf8_get_char(g_utf8_prev_char(s))!='-')
2648 if (pswit[ECHO_SWITCH])
2649 g_print("\n%s\n",aline);
2650 if (!pswit[OVERVIEW_SWITCH])
2651 g_print(" Line %ld column %ld - "
2652 "Hyphen at end of line?\n",
2653 linecnt,g_utf8_pointer_to_offset(aline,s));
2660 * check_for_unspaced_bracket:
2662 * Brackets are often unspaced, but shouldn't be surrounded by alpha.
2663 * If so, suspect a scanno like "a]most".
/*
 * aline: the NUL-terminated UTF-8 line to examine.
 *
 * Walks the line one UTF-8 character at a time, stopping when the
 * look-ahead character nc is NUL. A query is raised when c is one of
 * the six bracket characters listed in the test below and both pc and
 * nc are alphabetic according to g_unichar_isalpha().
 */
2665 void check_for_unspaced_bracket(const char *aline)
2669 c=g_utf8_get_char(aline);
/* Only look one character ahead when the line is not empty. */
2670 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2671 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2675 nc=g_utf8_get_char(g_utf8_next_char(s));
2678 /* for each bracket character in the line except 1st & last */
2679 if (g_utf8_strchr("{[()]}",-1,c) &&
2680 g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
2682 if (pswit[ECHO_SWITCH])
2683 g_print("\n%s\n",aline);
2684 if (!pswit[OVERVIEW_SWITCH])
/*
 * NOTE(review): the column printed is the character offset of s with
 * nothing added, whereas several other checks in this file add one to
 * the offset - confirm that this difference is intended.
 */
2685 g_print(" Line %ld column %ld - Unspaced bracket?\n",
2686 linecnt,g_utf8_pointer_to_offset(aline,s));
2694 * check_for_unpunctuated_endquote:
/*
 * aline: the NUL-terminated UTF-8 line to examine.
 *
 * Walks the line one UTF-8 character at a time, stopping when the
 * look-ahead character nc is NUL. A query is raised when c is
 * CHAR_DQUOTE and isalpha(pc) is true. procfile() only calls this
 * when warnings->endquote is set.
 *
 * NOTE(review): isalpha() is the <ctype.h> classifier, whose argument
 * must be representable as an unsigned char or be EOF. The characters
 * here are decoded with g_utf8_get_char(), and check_for_unspaced_bracket()
 * uses g_unichar_isalpha() for the equivalent test. If pc can hold a
 * code point above 255 the call is undefined behaviour - consider
 * g_unichar_isalpha(pc) instead.
 */
2696 void check_for_unpunctuated_endquote(const char *aline)
2700 c=g_utf8_get_char(aline);
/* Only look one character ahead when the line is not empty. */
2701 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2702 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2706 nc=g_utf8_get_char(g_utf8_next_char(s));
2707 /* for each character in the line except 1st */
2708 if (c==CHAR_DQUOTE && isalpha(pc))
2710 if (pswit[ECHO_SWITCH])
2711 g_print("\n%s\n",aline);
2712 if (!pswit[OVERVIEW_SWITCH])
2713 g_print(" Line %ld column %ld - "
2714 "endquote missing punctuation?\n",
2715 linecnt,g_utf8_pointer_to_offset(aline,s));
2723 * check_for_html_tag:
2725 * Check for <HTML TAG>.
2727 * If there is a < in the line, followed at some point
2728 * by a > then we suspect HTML.
/*
 * aline: the NUL-terminated UTF-8 line to examine.
 *
 * Locates the first < in the line and the first > after it, and
 * reports the text between them (inclusive) as a suspected HTML tag.
 * The column reported is the 1-based character position of the <.
 */
2730 void check_for_html_tag(const char *aline)
2732 const char *open,*close;
/* First opening angle bracket in the line, if any. */
2734 open=strchr(aline,'<');
/* First closing angle bracket that follows it. */
2737 close=strchr(g_utf8_next_char(open),'>');
2740 if (pswit[ECHO_SWITCH])
2741 g_print("\n%s\n",aline);
2742 if (!pswit[OVERVIEW_SWITCH])
/* Copy the candidate tag, from the < up to and including the >. */
2744 tag=g_strndup(open,close-open+1);
2745 g_print(" Line %ld column %ld - HTML Tag? %s \n",
2746 linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
2756 * check_for_html_entity:
2758 * Check for &symbol; HTML.
2760 * If there is a & in the line, followed at
2761 * some point by a ; then we suspect HTML.
/*
 * aline: the NUL-terminated UTF-8 line to examine.
 *
 * Locates the first ampersand in the line and the first semicolon at or
 * after it, then scans the characters in between for a space. The
 * text from the ampersand through the semicolon is what gets printed
 * as the suspected HTML symbol.
 *
 * NOTE(review): the column is computed as the byte distance amp-aline
 * plus one, whereas the other checks in this file convert positions with
 * g_utf8_pointer_to_offset(). With multi-byte characters before the
 * ampersand the two differ - confirm which is wanted.
 */
2763 void check_for_html_entity(const char *aline)
2765 const char *s,*amp,*scolon;
/* First ampersand in the line, if any. */
2767 amp=strchr(aline,'&');
/* First semicolon from that point on. */
2770 scolon=strchr(amp,';');
2773 for (s=amp;s<scolon;s=g_utf8_next_char(s))
2774 if (g_utf8_get_char(s)==CHAR_SPACE)
2775 break; /* Don't report "Jones & Son;" */
2778 if (pswit[ECHO_SWITCH])
2779 g_print("\n%s\n",aline);
2780 if (!pswit[OVERVIEW_SWITCH])
/* Copy the candidate entity, ampersand through semicolon inclusive. */
2782 entity=g_strndup(amp,scolon-amp+1);
2783 g_print(" Line %ld column %d - HTML symbol? %s \n",
2784 linecnt,(int)(amp-aline)+1,entity);
2795 * check_for_omitted_punctuation:
2797 * Check for omitted punctuation at end of paragraph by working back
2798 * through prevline. DW.
2799 * Need to check this only for "normal" paras.
2800 * So what is a "normal" para?
2801 * Not normal if one-liner (chapter headings, etc.)
2802 * Not normal if doesn't contain at least one lower-case letter
2803 * Not normal if starts with space
/*
 * prevline:        the NUL-terminated UTF-8 text of the previous line.
 * last:            only last->blen is read here; it must exceed 2 for
 *                  the check to run.
 * start_para_line: compared against linecnt-1 in the guard below.
 *
 * Outline of what the code below does:
 *  1. Scans prevline for an alphabetic character and records the
 *     result in letter_on_line.
 *  2. Proceeds only if a letter was seen, last->blen is greater than 2,
 *     start_para_line is less than linecnt-1 and the first character of
 *     prevline is above CHAR_SPACE.
 *  3. Starting from the end of prevline, steps backwards in a do/while
 *     loop while closing_quote remains set; the quote class of each
 *     character (closing or neutral) is what is tested.
 *  4. Continues backwards from there. Meeting an alphabetic character
 *     prints the query against line linecnt-1, with the character length
 *     of prevline as the column. Each character is also tested against
 *     the punctuation set given in the final g_utf8_strchr() call.
 */
2805 void check_for_omitted_punctuation(const char *prevline,
2806 struct line_properties *last,int start_para_line)
2808 gboolean letter_on_line=FALSE;
2811 gboolean closing_quote;
2812 for (s=prevline;*s;s=g_utf8_next_char(s))
2813 if (g_unichar_isalpha(g_utf8_get_char(s)))
2815 letter_on_line=TRUE;
2819 * This next "if" is a problem.
2820 * If we say "start_para_line <= linecnt - 1", that includes
2821 * one-line "paragraphs" like chapter heads. Lotsa false positives.
2822 * If we say "start_para_line < linecnt - 1" it doesn't, but then it
2823 * misses genuine one-line paragraphs.
2825 if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
2826 g_utf8_get_char(prevline)>CHAR_SPACE)
2828 s=prevline+strlen(prevline);
2831 s=g_utf8_prev_char(s);
2832 c=g_utf8_get_char(s);
2833 if (QUOTE_CLASS(c)==CLOSING_QUOTE || QUOTE_CLASS(c)==NEUTRAL_QUOTE)
2836 closing_quote=FALSE;
2837 } while (closing_quote && s>prevline);
2838 for (;s>prevline;s=g_utf8_prev_char(s))
2840 if (g_unichar_isalpha(g_utf8_get_char(s)))
2842 if (pswit[ECHO_SWITCH])
2843 g_print("\n%s\n",prevline);
2844 if (!pswit[OVERVIEW_SWITCH])
2845 g_print(" Line %ld column %ld - "
2846 "No punctuation at para end?\n",
2847 linecnt-1,g_utf8_strlen(prevline,-1));
2852 if (g_utf8_strchr("-.:!([{?}])",-1,g_utf8_get_char(s)))
/*
 * Traversal callback: procfile() passes this function to g_tree_foreach()
 * over the qword tree, with NULL as the user data.
 *
 * key:   the tree key, used here as the queried word.
 * value: the tree value stored for that word.
 * data:  user data from g_tree_foreach().
 */
2858 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
2860 const char *word=key;
2863 g_print("\nNote: Queried word %s was duplicated %d times\n",
/*
 * Writes string to stdout, using an iconv converter from UTF-8 to
 * WINDOWS-1252 that is kept in a function-local static between calls.
 *
 * string: the text to print. procfile() also calls this function with
 *         NULL at the end of a file.
 *
 * NOTE(review): presumably a NULL argument is what selects the branch
 * that closes and resets the cached converter - confirm.
 */
2868 void print_as_windows_1252(const char *string)
2870 gsize inbytes,outbytes;
/* (GIConv)-1 is used throughout as the no-converter marker. */
2872 static GIConv converter=(GIConv)-1;
2875 if (converter!=(GIConv)-1)
2876 g_iconv_close(converter);
2877 converter=(GIConv)-1;
/* Open the converter on first use; arguments are target then source. */
2880 if (converter==(GIConv)-1)
2881 converter=g_iconv_open("WINDOWS-1252","UTF-8");
2882 if (converter!=(GIConv)-1)
/*
 * The output buffer is given the same byte count as the input, plus one.
 * WINDOWS-1252 is a single-byte encoding, so the converted text cannot
 * need more bytes than its UTF-8 source.
 */
2884 inbytes=outbytes=strlen(string);
2885 bp=buf=g_malloc(outbytes+1);
2886 g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
/* NOTE(review): presumably the unconverted fallback path - confirm. */
2892 fputs(string,stdout);
/*
 * Writes string to stdout unchanged.
 *
 * string: the NUL-terminated text to print.
 */
2895 void print_as_utf_8(const char *string)
2897 fputs(string,stdout);
/*
 * Runs all the checks over one file.
 *
 * filename: passed to read_etext() and used in the messages printed here.
 *
 * Sequence, as far as the statements below show:
 *  1. Reads the text with read_etext(); its error message is printed to
 *     stdout or stderr depending on pswit[STDOUT_SWITCH].
 *  2. Runs first_pass() and report_first_pass(); the latter yields the
 *     warnings structure that gates several of the later checks.
 *  3. Creates the qword and qperiod trees.
 *  4. Reads the text line by line with flgets(). Lines outside the
 *     firstline/footerline range are skipped, optionally echoing some
 *     header fields. Each remaining line goes through the check_for_*()
 *     routines in turn.
 *  5. After the loop, flushes pending quote reports, reports duplicated
 *     queries and releases the trees and counters.
 *
 * NOTE(review): start_para_line is a long here but is passed to
 * check_for_omitted_punctuation(), which declares it as int - confirm
 * the narrowing is acceptable.
 */
2905 void procfile(const char *filename)
2908 gchar *parastart=NULL; /* first line of current para */
2909 gchar *etext,*aline;
2912 struct first_pass_results *first_pass_results;
2913 struct warnings *warnings;
2914 struct counters counters={0};
2915 struct line_properties last={0};
2916 struct parities parities={0};
2917 struct pending pending={0};
2918 gboolean isemptyline;
2919 long start_para_line=0;
2920 gboolean isnewpara=FALSE,enddash=FALSE;
2921 last.start=CHAR_SPACE;
2922 linecnt=checked_linecnt=0;
2923 etext=read_etext(filename,&err);
/* The same message goes to stdout or stderr, chosen by STDOUT_SWITCH. */
2926 if (pswit[STDOUT_SWITCH])
2927 fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
2929 fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
2932 g_print("\n\nFile: %s\n\n",filename);
2933 first_pass_results=first_pass(etext);
2934 warnings=report_first_pass(first_pass_results);
/*
 * Both trees order their keys with strcmp and free keys with g_free.
 * qword also frees its values with g_free; qperiod has no value
 * destructor.
 * NOTE(review): strcmp takes two arguments but is cast to
 * GCompareDataFunc, which takes three - confirm this is acceptable on
 * the supported platforms.
 */
2935 qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
2936 qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
2938 * Here we go with the main pass. Hold onto yer hat!
2942 while ((aline=flgets(&etext_ptr,linecnt+1))) /* linecnt+1 is the lcnt flgets prints in its queries */
2947 if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
2948 continue; // skip DP page separators completely
/* Lines before firstline, or after a positive footerline, are not checked. */
2949 if (linecnt<first_pass_results->firstline ||
2950 (first_pass_results->footerline>0 &&
2951 linecnt>first_pass_results->footerline))
/* With HEADER_SWITCH set, echo the recognised header fields. */
2953 if (pswit[HEADER_SWITCH])
2955 if (g_str_has_prefix(aline,"Title:"))
2956 g_print(" %s\n",aline);
2957 if (g_str_has_prefix(aline,"Author:"))
2958 g_print(" %s\n",aline);
2959 if (g_str_has_prefix(aline,"Release Date:"))
2960 g_print(" %s\n",aline);
2961 if (g_str_has_prefix(aline,"Edition:"))
2962 g_print(" %s\n\n",aline);
2964 continue; /* skip through the header */
2967 print_pending(aline,parastart,&pending);
2968 isemptyline=analyse_quotes(aline,linecnt,&counters);
2969 if (isnewpara && !isemptyline)
2971 /* This line is the start of a new paragraph. */
2972 start_para_line=linecnt;
2973 /* Capture its first line in case we want to report it later. */
2975 parastart=g_strdup(aline);
2976 memset(&parities,0,sizeof(parities)); /* restart the quote count */
/* Advance s to the first letter or digit on the line, if there is one. */
2978 while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
2979 !g_unichar_isdigit(g_utf8_get_char(s)))
2980 s=g_utf8_next_char(s);
2981 if (g_unichar_islower(g_utf8_get_char(s)))
2983 /* and its first letter is lowercase */
2984 if (pswit[ECHO_SWITCH])
2985 g_print("\n%s\n",aline);
2986 if (!pswit[OVERVIEW_SWITCH])
2987 g_print(" Line %ld column %ld - "
2988 "Paragraph starts with lower-case\n",
2989 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2993 isnewpara=FALSE; /* Signal the end of new para processing. */
2995 /* Check for an em-dash broken at line end. */
2996 if (enddash && g_utf8_get_char(aline)=='-')
2998 if (pswit[ECHO_SWITCH])
2999 g_print("\n%s\n",aline);
3000 if (!pswit[OVERVIEW_SWITCH])
3001 g_print(" Line %ld column 1 - Broken em-dash?\n",linecnt);
/* Step back over trailing spaces, then test whether the line ends in a hyphen. */
3006 for (s=g_utf8_prev_char(aline+strlen(aline));
3007 g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
3009 if (s>=aline && g_utf8_get_char(s)=='-')
3011 check_for_control_characters(aline);
3012 check_for_odd_characters(aline,warnings,isemptyline);
/* The long-line, short-line and endquote checks are gated by first-pass warnings. */
3013 if (warnings->longline)
3014 check_for_long_line(aline);
3015 if (warnings->shortline)
3016 check_for_short_line(aline,&last);
/* Record this line's character length and first character in last. */
3018 last.len=g_utf8_strlen(aline,-1);
3019 last.start=g_utf8_get_char(aline);
3020 check_for_starting_punctuation(aline);
3023 check_for_spaced_emdash(aline);
3024 check_for_spaced_dash(aline);
3026 check_for_unmarked_paragraphs(aline);
3027 check_for_jeebies(aline);
3028 check_for_mta_from(aline);
3029 check_for_orphan_character(aline);
3030 check_for_pling_scanno(aline);
3031 check_for_extra_period(aline,warnings);
3032 check_for_following_punctuation(aline);
3033 check_for_typos(aline,warnings);
3034 check_for_misspaced_punctuation(aline,&parities,isemptyline);
3035 check_for_double_punctuation(aline,warnings);
3036 check_for_spaced_quotes(aline);
3037 check_for_miscased_genative(aline);
3038 check_end_of_line(aline,warnings);
3039 check_for_unspaced_bracket(aline);
3040 if (warnings->endquote)
3041 check_for_unpunctuated_endquote(aline);
3042 check_for_html_tag(aline);
3043 check_for_html_entity(aline);
3046 check_for_mismatched_quotes(&counters,&pending);
3047 counters_reset(&counters);
3048 /* let the next iteration know that it's starting a new para */
3051 check_for_omitted_punctuation(prevline,&last,start_para_line);
/* Keep a private copy of this line; it is the prevline of the next pass. */
3054 prevline=g_strdup(aline);
/* End of input: flush whatever quote reports are still outstanding. */
3057 check_for_mismatched_quotes(&counters,&pending);
3058 print_pending(NULL,parastart,&pending);
3059 reset_pending(&pending);
/* Duplicate-query notes are suppressed in overview and verbose modes. */
3068 if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])
3069 g_tree_foreach(qword,report_duplicate_queries,NULL);
3070 g_tree_unref(qword);
3071 g_tree_unref(qperiod);
3072 counters_destroy(&counters);
/* Passing NULL restores the default GLib print handler. */
3073 g_set_print_handler(NULL);
/* Call the Windows-1252 printer with NULL now that the file is finished. */
3074 print_as_windows_1252(NULL);
3075 if (pswit[MARKUP_SWITCH])
3082 * Get one line from the input text, checking for
3083 * the existence of exactly one CR/LF line-end per line.
3085 * Returns: a pointer to the line.
/*
 * etext: in/out cursor into the text buffer. It is advanced past each
 *        character as it is read; the line returned begins at the value
 *        it had on entry.
 * lcnt:  line number printed in the line-end queries.
 *
 * The three line-end queries (No CR, Two successive CRs, CR without LF)
 * are only raised when pswit[LINE_END_SWITCH] is set. When echoing, a
 * temporary copy of the line up to eos is made, because the line is not
 * yet terminated at that point. Before returning, the line is passed
 * through postprocess_for_HTML() and postprocess_for_DP() when the
 * corresponding switches are set.
 */
3087 char *flgets(char **etext,long lcnt)
3090 gboolean isCR=FALSE;
3091 char *theline=*etext;
/* Fetch the next character and move the caller's cursor past it. */
3096 c=g_utf8_get_char(*etext);
3097 *etext=g_utf8_next_char(*etext);
3100 /* either way, it's end of line */
3107 /* Error - a LF without a preceding CR */
3108 if (pswit[LINE_END_SWITCH])
3110 if (pswit[ECHO_SWITCH])
3112 s=g_strndup(theline,eos-theline);
3113 g_print("\n%s\n",s);
3116 if (!pswit[OVERVIEW_SWITCH])
3117 g_print(" Line %ld - No CR?\n",lcnt);
3128 /* Error - two successive CRs */
3129 if (pswit[LINE_END_SWITCH])
3131 if (pswit[ECHO_SWITCH])
3133 s=g_strndup(theline,eos-theline);
3134 g_print("\n%s\n",s);
3137 if (!pswit[OVERVIEW_SWITCH])
3138 g_print(" Line %ld - Two successive CRs?\n",lcnt);
/* A CR was seen but the character now in hand is not the LF that should follow it. */
3147 if (pswit[LINE_END_SWITCH] && isCR)
3149 if (pswit[ECHO_SWITCH])
3151 s=g_strndup(theline,eos-theline);
3152 g_print("\n%s\n",s);
3155 if (!pswit[OVERVIEW_SWITCH])
3156 g_print(" Line %ld column %ld - CR without LF?\n",
3157 lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
3163 eos=g_utf8_next_char(eos);
/* Optional in-place clean-up of the finished line. */
3167 if (pswit[MARKUP_SWITCH])
3168 postprocess_for_HTML(theline);
3169 if (pswit[DP_SWITCH])
3170 postprocess_for_DP(theline);
3177 * Takes a "word" as a parameter, and checks whether it
3178 * contains a mixture of alpha and digits. Generally, this is an
3179 * error, but may not be for cases like 4th or L5 12s. 3d.
3181 * Returns: TRUE iff an error is found.
/*
 * checkword: the NUL-terminated UTF-8 word to classify.
 *
 * The word is first scanned for letters and digits. When both are
 * present, a series of exclusions is applied to whatever follows the
 * leading run of digits, and finally to a leading capital L followed by
 * a remainder that is not itself mixed.
 */
3183 gboolean mixdigit(const char *checkword)
3185 gboolean wehaveadigit,wehavealetter,query;
3186 const char *s,*nondigit;
3187 wehaveadigit=wehavealetter=query=FALSE;
/* Classify every character of the word as letter or digit. */
3188 for (s=checkword;*s;s=g_utf8_next_char(s))
3189 if (g_unichar_isalpha(g_utf8_get_char(s)))
3191 else if (g_unichar_isdigit(g_utf8_get_char(s)))
3193 if (wehaveadigit && wehavealetter)
3195 /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
/* Leave nondigit pointing at the first character that is not a digit. */
3197 for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
3198 nondigit=g_utf8_next_char(nondigit))
3200 /* digits, ending in st, rd, nd, th of either case */
3201 if (!g_ascii_strcasecmp(nondigit,"st") ||
3202 !g_ascii_strcasecmp(nondigit,"rd") ||
3203 !g_ascii_strcasecmp(nondigit,"nd") ||
3204 !g_ascii_strcasecmp(nondigit,"th"))
/* The same ordinal suffixes in plural form, compared without regard to ASCII case. */
3206 if (!g_ascii_strcasecmp(nondigit,"sts") ||
3207 !g_ascii_strcasecmp(nondigit,"rds") ||
3208 !g_ascii_strcasecmp(nondigit,"nds") ||
3209 !g_ascii_strcasecmp(nondigit,"ths"))
/* The same ordinal suffixes with ly appended, compared the same way. */
3211 if (!g_ascii_strcasecmp(nondigit,"stly") ||
3212 !g_ascii_strcasecmp(nondigit,"rdly") ||
3213 !g_ascii_strcasecmp(nondigit,"ndly") ||
3214 !g_ascii_strcasecmp(nondigit,"thly"))
3216 /* digits, ending in l, L, s or d */
/* Only the l is matched in either case; s and d must be lower-case. */
3217 if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
3218 !strcmp(nondigit,"d"))
3221 * L at the start of a number, representing Britsh pounds, like L500.
3222 * This is cute. We know the current word is mixed digit. If the first
3223 * letter is L, there must be at least one digit following. If both
3224 * digits and letters follow, we have a genuine error, else we have a
3225 * capital L followed by digits, and we accept that as a non-error.
3227 if (g_utf8_get_char(checkword)=='L' &&
3228 !mixdigit(g_utf8_next_char(checkword))) /* recurse on the text after the L */
3237 * Extracts the first/next "word" from the line, and returns it.
3238 * A word is defined as one English word unit--or at least that's the aim.
3239 * "ptr" is advanced to the position in the line where we will start
3240 * looking for the next word.
3242 * Returns: A newly-allocated string.
/*
 * ptr: in/out cursor into the line. Leading characters that are neither
 *      letters nor digits are always skipped through it, and the regular
 *      word path at the end advances it past the word it collects.
 *
 * The return value comes from g_string_free(word,FALSE), which hands the
 * accumulated character data to the caller; the caller is responsible
 * for releasing it with g_free().
 */
3244 gchar *getaword(const char **ptr)
3249 word=g_string_new(NULL);
3250 for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
3251 !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
3252 **ptr;*ptr=g_utf8_next_char(*ptr)) /* skip anything that cannot start a word */
3255 * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
3256 * Especially yucky is the case of L1,000
3257 * This section looks for a pattern of characters including a digit
3258 * followed by a comma or period followed by one or more digits.
3259 * If found, it returns this whole pattern as a word; otherwise we discard
3260 * the results and resume our normal programming.
3263 for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
3264 g_unichar_isalpha(g_utf8_get_char(s)) ||
3265 g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
3266 g_string_append_unichar(word,g_utf8_get_char(s)); /* collect the look-ahead candidate */
/* Examine the candidate from its second character onwards. */
3269 for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))
3271 c=g_utf8_get_char(t);
3272 pc=g_utf8_get_char(g_utf8_prev_char(t));
/* A period or comma that directly follows a digit. */
3273 if ((c=='.' || c==',') && g_unichar_isdigit(pc))
3276 return g_string_free(word,FALSE);
3280 /* we didn't find a punctuated number - do the regular getword thing */
3281 g_string_truncate(word,0);
3282 c=g_utf8_get_char(*ptr);
/* A regular word is a run of digits, letters and apostrophes. */
3283 for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);
3284 *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))
3285 g_string_append_unichar(word,c);
3286 return g_string_free(word,FALSE);
3292 * Is this word a Roman Numeral?
3294 * It doesn't actually validate that the number is a valid Roman Numeral--for
3295 * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
3296 * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
3297 * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
3298 * expressions thereof, except when it came to taxes. Allow any number of M,
3299 * an optional D, an optional CM or CD, any number of optional Cs, an optional
3300 * XL or an optional XC, an optional IX or IV, an optional V and any number
/*
 * t: the NUL-terminated word to test.
 *
 * The tests below consume, in order: any number of m; one d; the pair
 * cm; the pair cd; any number of c; the pair xl; the pair xc; one l;
 * any number of x; the pair ix; the pair iv; one v; any number of i.
 *
 * Every comparison is against a lower-case letter.
 * NOTE(review): presumably callers pass a word that has already been
 * lower-cased - confirm.
 *
 * The extra *t test in each while condition is redundant: a character
 * that compares equal to a letter cannot be the terminating NUL.
 */
3303 gboolean isroman(const char *t)
3309 while (g_utf8_get_char(t)=='m' && *t)
3311 if (g_utf8_get_char(t)=='d')
3313 if (g_str_has_prefix(t,"cm"))
3315 if (g_str_has_prefix(t,"cd"))
3317 while (g_utf8_get_char(t)=='c' && *t)
3319 if (g_str_has_prefix(t,"xl"))
3321 if (g_str_has_prefix(t,"xc"))
3323 if (g_utf8_get_char(t)=='l')
3325 while (g_utf8_get_char(t)=='x' && *t)
3327 if (g_str_has_prefix(t,"ix"))
3329 if (g_str_has_prefix(t,"iv"))
3331 if (g_utf8_get_char(t)=='v')
3333 while (g_utf8_get_char(t)=='i' && *t)
3339 * postprocess_for_DP:
3341 * Invoked with the -d switch from flgets().
3342 * It simply "removes" from the line a hard-coded set of common
3343 * DP-specific tags, so that the line passed to the main routine has
3344 * been pre-cleaned of DP markup.
/*
 * theline: the NUL-terminated line, edited in place.
 *
 * Every occurrence of every DPmarkup[] entry is cut out of the line.
 * The table is walked until an entry whose first byte is NUL, so it
 * must end with an empty string.
 */
3346 void postprocess_for_DP(char *theline)
3352 for (i=0;*DPmarkup[i];i++)
3353 while ((s=strstr(theline,DPmarkup[i])))
/* t is the first byte after the matched tag. */
3355 t=s+strlen(DPmarkup[i]);
/* Close the gap; the +1 carries the terminating NUL along as well. */
3356 memmove(s,t,strlen(t)+1);
3361 * postprocess_for_HTML:
3363 * Invoked with the -m switch from flgets().
3364 * It simply "removes" from the line a hard-coded set of common
3365 * HTML tags and "replaces" a hard-coded set of common HTML
3366 * entities, so that the line passed to the main routine has
3367 * been pre-cleaned of HTML.
/*
 * theline: the NUL-terminated line, edited in place by the helpers.
 *
 * Calls losemarkup() for as long as it keeps returning a non-NULL
 * value, then calls loseentities() once on the result.
 */
3369 void postprocess_for_HTML(char *theline)
3371 while (losemarkup(theline))
3373 loseentities(theline);
/*
 * theline: the NUL-terminated line, edited in place.
 *
 * Finds the first < in the line and the first > at or after it, then
 * compares the text following the < against each markup[] entry with
 * tagcomp(). The table is walked until an entry whose first byte is NUL,
 * so it must end with an empty string. On a match the tag is cut out of
 * the line.
 */
3376 char *losemarkup(char *theline)
3380 s=strchr(theline,'<');
/* Only look for the closing bracket when an opening one was found. */
3381 t=s?strchr(s,'>'):NULL;
3384 for (i=0;*markup[i];i++)
3385 if (tagcomp(g_utf8_next_char(s),markup[i]))
/* Step past the > and close the gap, NUL terminator included. */
3387 t=g_utf8_next_char(t);
3388 memmove(s,t,strlen(t)+1);
3391 /* It's an unrecognized <xxx>. */
/*
 * theline: the NUL-terminated line, edited in place.
 *
 * For each ampersand that has a later semicolon, the text between them
 * is interpreted as a decimal reference, a hexadecimal reference or a
 * name looked up in the HTMLentities table. The resulting code point is
 * written back into the line as UTF-8 and the rest of the line is moved
 * down over the remainder of the entity.
 *
 * Two iconv converters are cached in function-local statics; there is
 * also a branch that destroys the tree and closes the converters.
 *
 * NOTE(review): entities is an automatic variable initialised to NULL on
 * every call, unlike the static converters declared beside it. If the
 * build and destroy paths test its value, as they appear to, the tree
 * would be rebuilt on every call and the destroy path would never see a
 * tree - confirm whether it was meant to be static.
 */
3395 void loseentities(char *theline)
3402 GTree *entities=NULL;
3403 static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
3407 g_tree_destroy(entities);
3409 if (translit!=(GIConv)-1)
3410 g_iconv_close(translit);
3411 translit=(GIConv)-1;
3412 if (to_utf8!=(GIConv)-1)
3413 g_iconv_close(to_utf8);
/* Index the entity table by name; the code point is stored as the value. */
3421 entities=g_tree_new((GCompareFunc)strcmp);
3422 for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
3423 g_tree_insert(entities,HTMLentities[i].name,
3424 GUINT_TO_POINTER(HTMLentities[i].c));
/* Open the converters on first use; arguments are target then source. */
3426 if (translit==(GIConv)-1)
3427 translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
3428 if (to_utf8==(GIConv)-1)
3429 to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
3430 while((amp=strchr(theline,'&')))
3432 scolon=strchr(amp,';');
/*
 * Numeric references: the digits start two bytes after the ampersand
 * and must run right up to the semicolon.
 * NOTE(review): presumably this is only reached when amp[1] is a hash
 * sign - confirm.
 */
3437 if (amp+2+strspn(amp+2,"0123456789")==scolon)
3438 c=strtol(amp+2,NULL,10);
3439 else if (amp[2]=='x' &&
3440 amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
3441 c=strtol(amp+3,NULL,16);
/* Named entity: look up the text between the ampersand and the semicolon. */
3445 s=g_strndup(amp+1,scolon-(amp+1));
3446 c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
/* && binds tighter than ||, so this accepts c below 128 or c in 192..255. */
3455 if (c<128 || c>=192 && c<=255) /* An ISO-8859-1 character */
3456 theline+=g_unichar_to_utf8(c,theline);
/*
 * Otherwise encode c as UTF-8, convert that to ISO_8859-1 with
 * transliteration, convert the result back to UTF-8 and copy those
 * bytes into the line.
 */
3460 nb=g_unichar_to_utf8(c,s);
3461 t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
3463 s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
3465 memcpy(theline,s,nb);
/* Move the text after the semicolon down, NUL terminator included. */
3469 memmove(theline,g_utf8_next_char(scolon),
3470 strlen(g_utf8_next_char(scolon))+1);
/* Continue the search just after this ampersand. */
3473 theline=g_utf8_next_char(amp);
/*
 * strin:   text to test; losemarkup() passes the text that follows a <.
 * basetag: the tag name to compare against.
 *
 * Tests whether strin, after one optional leading slash, begins with
 * basetag. Both strings are case-folded with g_utf8_casefold() before
 * the prefix comparison, so the match ignores case. g_utf8_casefold()
 * returns newly allocated strings.
 */
3477 gboolean tagcomp(const char *strin,const char *basetag)
3481 if (g_utf8_get_char(strin)=='/')
3482 t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
3484 t=g_utf8_casefold(strin,-1);
3485 s=g_utf8_casefold(basetag,-1);
3486 retval=g_str_has_prefix(t,s);
3492 void proghelp(GOptionContext *context)
3495 fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
3496 fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
3497 fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
3498 fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
3499 "For details, read the file COPYING.\n",stderr);
3500 fputs("This is Free Software; "
3501 "you may redistribute it under certain conditions (GPL);\n",stderr);
3502 fputs("read the file COPYING for details.\n\n",stderr);
3503 help=g_option_context_get_help(context,TRUE,NULL);
3506 fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
3507 fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
3508 "non-ASCII\n",stderr);
3509 fputs("characters like accented letters, "
3510 "lines longer than 75 or shorter than 55,\n",stderr);
3511 fputs("unbalanced quotes or brackets, "
3512 "a variety of badly formatted punctuation, \n",stderr);
3513 fputs("HTML tags, some likely typos. "
3514 "It is NOT a substitute for human judgement.\n",stderr);