1 /*************************************************************************/
2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com> */
5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk> */
7 /* This program is free software; you can redistribute it and/or modify */
8 /* it under the terms of the GNU General Public License as published by */
9 /* the Free Software Foundation; either version 2 of the License, or */
10 /* (at your option) any later version. */
12 /* This program is distributed in the hope that it will be useful, */
13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
15 /* GNU General Public License for more details. */
17 /* You should have received a copy of the GNU General Public License */
18 /* along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 /*************************************************************************/
30 #include "bookloupe.h"
33 #include "HTMLentities.h"
/*
 * Character-set state.  charset_validator holds (GIConv)-1 whenever no
 * conversion descriptor is open; set_charset() closes any open descriptor
 * before replacing it.
 */
35 gchar *charset; /* Or NULL for auto (ISO_8859-1/ASCII or UNICODE) */
36 GIConv charset_validator=(GIConv)-1;
42 "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
43 "nad", "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa",
44 "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
45 "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
46 "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
47 "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
48 "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
49 "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
50 "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
51 "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
52 "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
53 "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
54 "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
55 "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
56 "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
57 "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
58 "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
59 "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
60 "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
61 "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
62 "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
63 "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
64 "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
65 "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
66 "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
67 "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
68 "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
69 "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
70 "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
76 /* Common abbreviations and other OK words not to query as typos. */
78 "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
79 "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
80 "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
81 "outbid", "outbids", "frostbite", "frostbitten", ""
84 /* Common abbreviations that cause otherwise unexplained periods. */
86 "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
87 "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
91 * Two-Letter combinations that rarely if ever start words,
92 * but are common scannos or otherwise common letter combinations.
95 "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
99 * Two-Letter combinations that rarely if ever end words,
100 * but are common scannos or otherwise common letter combinations.
103 "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
104 "sw", "gr", "sl", "cl", "iy", ""
108 "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
109 "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
110 "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
111 "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
115 "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
119 "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
120 "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
121 "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
122 "during", "let", "toward", "among", ""
126 "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
127 "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
128 "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
129 "among", "those", "into", "whom", "having", "thence", ""
132 gboolean pswit[SWITNO]; /* program switches */
/* Storage for the two toggle switches registered in compatibility_options[]. */
135 gboolean typo_compat,paranoid_compat;
/*
 * Main option table, registered with the option context by parse_options()
 * and walked by the config-file routines.  Each boolean switch appears
 * twice, as "name" and "no-name", both writing the same pswit[] slot; the
 * "no-" form carries G_OPTION_FLAG_REVERSE, and one entry of each pair is
 * marked G_OPTION_FLAG_HIDDEN.  "charset" is the only string-valued entry.
 */
137 static GOptionEntry options[]={
138 { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
139 "Ignore DP-specific markup", NULL },
140 { "no-dp", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
141 G_OPTION_ARG_NONE, pswit+DP_SWITCH,
142 "Don't ignore DP-specific markup", NULL },
143 { "echo", 0, G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
144 "Echo queried line", NULL },
145 { "no-echo", 'e', G_OPTION_FLAG_REVERSE,
146 G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
147 "Don't echo queried line", NULL },
148 { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
149 "Check single quotes", NULL },
150 { "no-squote", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
151 G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
152 "Don't check single quotes", NULL },
153 { "typo", 0, 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
154 "Check common typos", NULL },
155 { "no-typo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
156 G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
157 "Don't check common typos", NULL },
158 { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
159 "Require closure of quotes on every paragraph", NULL },
160 { "no-qpara", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
161 G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
162 "Don't require closure of quotes on every paragraph", NULL },
163 { "paranoid", 0, G_OPTION_FLAG_HIDDEN,
164 G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
165 "Enable paranoid querying of everything", NULL },
166 { "no-paranoid", 0, G_OPTION_FLAG_REVERSE,
167 G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
168 "Disable paranoid querying of everything", NULL },
169 { "line-end", 0, G_OPTION_FLAG_HIDDEN,
170 G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
171 "Enable line end checking", NULL },
172 { "no-line-end", 'l', G_OPTION_FLAG_REVERSE,
173 G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
/* Help text below previously read "Diable"; corrected spelling. */
174 "Disable line end checking", NULL },
175 { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
176 "Overview: just show counts", NULL },
177 { "no-overview", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
178 G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
179 "Show individual warnings", NULL },
180 { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
181 "Output errors to stdout instead of stderr", NULL },
182 { "no-stdout", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
183 G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
184 "Output errors to stderr instead of stdout", NULL },
185 { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
186 "Echo header fields", NULL },
187 { "no-header", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
188 G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
189 "Don't echo header fields", NULL },
190 { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
191 "Ignore markup in < >", NULL },
192 { "no-markup", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
193 G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
194 "No special handling for markup in < >", NULL },
195 { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
196 "Use file of user-defined typos", NULL },
197 { "no-usertypo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
198 G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
199 "Ignore file of user-defined typos", NULL },
200 { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
201 "Verbose - list everything", NULL },
202 { "no-verbose", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
203 G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
204 "Switch off verbose mode", NULL },
205 { "charset", 0, 0, G_OPTION_ARG_STRING, &opt_charset,
206 "Set of characters valid for this ebook", "NAME" },
211 * Options relating to configuration which make no sense from inside
212 * a configuration file.
/*
 * Registered by parse_options() as main entries alongside options[], but
 * not walked by the config-file routines, which iterate options[] only.
 */
215 static GOptionEntry config_options[]={
216 { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
217 "Defaults for use on www upload", NULL },
218 { "dump-config", 0, 0, G_OPTION_ARG_NONE, pswit+DUMP_CONFIG_SWITCH,
219 "Dump current config settings", NULL },
/*
 * Gutcheck-compatible toggle switches.  parse_options() adds these to the
 * option context as the separate "compatibility" group.  Each entry stores
 * its flag in one of the typo_compat / paranoid_compat globals.
 */
223 static GOptionEntry compatibility_options[]={
224 { "toggle-typo", 't', 0, G_OPTION_ARG_NONE, &typo_compat,
225 "Toggle checking for common typos", NULL },
/* The target below had been garbled to a pilcrow; it is &paranoid_compat. */
226 { "toggle-relaxed", 'x', 0, G_OPTION_ARG_NONE, &paranoid_compat,
227 "Toggle both paranoid mode and common typos", NULL },
/*
 * Query counters.  main() prints them in its overview summary; the
 * TOTAL QUERIES figure there sums the twelve per-category cnt_* counters
 * and leaves out cnt_spacend, linecnt and checked_linecnt.
 */
231 long cnt_dquot; /* for overview mode, count of doublequote queries */
232 long cnt_squot; /* for overview mode, count of singlequote queries */
233 long cnt_brack; /* for overview mode, count of brackets queries */
234 long cnt_bin; /* for overview mode, count of non-ASCII queries */
235 long cnt_odd; /* for overview mode, count of odd character queries */
236 long cnt_long; /* for overview mode, count of long line errors */
237 long cnt_short; /* for overview mode, count of short line queries */
238 long cnt_punct; /* for overview mode,
239 count of punctuation and spacing queries */
240 long cnt_dash; /* for overview mode, count of dash-related queries */
241 long cnt_word; /* for overview mode, count of word queries */
242 long cnt_html; /* for overview mode, count of html queries */
243 long cnt_lineend; /* for overview mode, count of line-end queries */
244 long cnt_spacend; /* count of lines with space at end */
245 long linecnt; /* count of total lines in the file */
246 long checked_linecnt; /* count of lines actually checked */
/*
 * Forward declarations.  print_as_utf_8() and print_as_windows_1252() are
 * the routines read_etext() installs as GLib print handlers.
 */
248 void proghelp(GOptionContext *context);
249 void procfile(const char *);
253 gboolean mixdigit(const char *);
254 gchar *getaword(const char **);
255 char *flgets(char **,long);
256 void postprocess_for_HTML(char *);
257 char *linehasmarkup(char *);
258 char *losemarkup(char *);
259 gboolean tagcomp(const char *,const char *);
260 void loseentities(char *);
261 gboolean isroman(const char *);
262 void postprocess_for_DP(char *);
263 void print_as_windows_1252(const char *string);
264 void print_as_utf_8(const char *string);
266 GTree *qword,*qperiod;
/*
 * set_charset:
 * @name: character set name, "auto", or NULL
 * @err: location for a GError
 *
 * Select the character set used for validation.  A NULL name or "auto"
 * (compared case-insensitively) leaves charset_validator at (GIConv)-1.
 * A name matching one of unicode_aliases[] is replaced by "UTF-8", for
 * which no validator is opened.  For any other name a converter from
 * UTF-8 to that character set is opened; if that fails, err is set to
 * G_CONVERT_ERROR_NO_CONVERSION.
 *
 * NOTE(review): the return statements are not visible here; callers test
 * !set_charset(...), so presumably FALSE is returned on failure -- confirm.
 */
272 gboolean set_charset(const char *name,GError **err)
274 /* The various UNICODE encodings all share the same character set. */
275 const char *unicode_aliases[]={ "UCS-2", "UCS-2BE", "UCS-2LE", "UCS-4",
276 "UCS-4BE", "UCS-4LE", "UCS2", "UCS4", "UNICODE", "UNICODEBIG",
277 "UNICODELITTLE", "UTF-7", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE",
278 "UTF-32", "UTF-32BE", "UTF-32LE", "UTF7", "UTF8", "UTF16", "UTF16BE",
279 "UTF16LE", "UTF32", "UTF32BE", "UTF32LE" };
/* Drop any conversion descriptor left open by a previous call. */
283 if (charset_validator!=(GIConv)-1)
284 g_iconv_close(charset_validator);
/*
 * NOTE(review): g_strcasecmp() is deprecated in GLib; g_ascii_strcasecmp()
 * is the documented replacement.
 */
285 if (!name || !g_strcasecmp(name,"auto"))
288 charset_validator=(GIConv)-1;
292 charset=g_strdup(name);
/* Fold every UNICODE alias onto the single canonical name "UTF-8". */
293 for(i=0;i<G_N_ELEMENTS(unicode_aliases);i++)
294 if (!g_strcasecmp(charset,unicode_aliases[i]))
297 charset=g_strdup("UTF-8");
/* UTF-8 itself gets no validator. */
300 if (!strcmp(charset,"UTF-8"))
301 charset_validator=(GIConv)-1;
/* g_iconv_open(to,from): open a converter from UTF-8 to the chosen set. */
304 charset_validator=g_iconv_open(charset,"UTF-8");
305 if (charset_validator==(GIConv)-1)
307 g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_NO_CONVERSION,
308 "Unknown character set \"%s\"",charset);
/*
 * config_file_update:
 * @kf: key file to update
 *
 * Copy the current value of every entry in options[] into group "options"
 * of kf, keyed by the option's long name.  Entries whose long name starts
 * with "no-" are tested for separately (handling not visible here).
 * Switches (G_OPTION_ARG_NONE) are stored as booleans, string options as
 * strings; any other argument type reaches g_assert_not_reached().
 */
317 void config_file_update(GKeyFile *kf)
322 for(i=0;options[i].long_name;i++)
324 if (g_str_has_prefix(options[i].long_name,"no-"))
326 if (options[i].arg==G_OPTION_ARG_NONE)
328 sw=*(gboolean *)options[i].arg_data;
/* NOTE(review): the statement guarded by this test is not visible here. */
329 if (options[i].flags&G_OPTION_FLAG_REVERSE)
331 g_key_file_set_boolean(kf,"options",options[i].long_name,sw);
333 else if (options[i].arg==G_OPTION_ARG_STRING)
335 s=*(gchar **)options[i].arg_data;
338 g_key_file_set_string(kf,"options",options[i].long_name,s);
341 g_assert_not_reached();
/*
 * config_file_add_comments:
 * @kf: key file to annotate
 *
 * Set a file-level comment on kf, then give each key in group "options" a
 * comment made of a single space followed by that option's description
 * text from options[].  Entries whose long name starts with "no-" are
 * tested for separately (handling not visible here).
 */
345 void config_file_add_comments(GKeyFile *kf)
349 g_key_file_set_comment(kf,NULL,NULL," Default configuration for bookloupe",
351 for(i=0;options[i].long_name;i++)
353 if (g_str_has_prefix(options[i].long_name,"no-"))
355 comment=g_strconcat(" ",options[i].description,NULL);
356 g_key_file_set_comment(kf,"options",options[i].long_name,comment,NULL);
/*
 * dump_config:
 *
 * Serialise the current settings with g_key_file_to_data().  Two paths are
 * visible: refresh the existing key file `config`, or create a new key file
 * and fill it with both the current values and the descriptive comments.
 *
 * NOTE(review): the condition choosing between the two paths, and what is
 * done with the serialised string, are not visible here.
 */
361 void dump_config(void)
365 config_file_update(config);
368 config=g_key_file_new();
369 config_file_update(config);
370 config_file_add_comments(config);
372 s=g_key_file_to_data(config,NULL,NULL);
/*
 * read_config_file:
 * @full_path: presumably receives the path of the file that was loaded
 *             (the assignment is not visible here -- confirm)
 *
 * Look for "bookloupe.ini" in each directory of a search list and load it
 * as a key file, keeping comments and translations.  The search list comes
 * from the BOOKLOUPE_CONFIG_PATH environment variable when it is used;
 * otherwise it is the current directory, the directory the program is
 * running from, and the user configuration directory, in that order.
 * Load errors other than "file does not exist" are reported on stderr.
 */
378 GKeyFile *read_config_file(gchar **full_path)
384 const char *search_path;
387 search_path=g_getenv("BOOKLOUPE_CONFIG_PATH");
/*
 * NOTE(review): two separators are visible; the selection between ";" and
 * ":" is presumably a platform conditional that is not visible here.
 */
391 search_dirs=g_strsplit(search_path,";",0);
393 search_dirs=g_strsplit(search_path,":",0);
/* Default list: three entries in an array allocated with four slots. */
398 search_dirs=g_new(gchar *,4);
399 search_dirs[0]=g_get_current_dir();
400 search_dirs[1]=g_strdup(running_from);
401 search_dirs[2]=g_strdup(g_get_user_config_dir());
404 for(i=0;search_dirs[i];i++)
406 path=g_build_filename(search_dirs[i],"bookloupe.ini",NULL);
407 if (g_key_file_load_from_file(kf,path,
408 G_KEY_FILE_KEEP_COMMENTS|G_KEY_FILE_KEEP_TRANSLATIONS,&err))
/* A missing file is expected; any other failure is reported. */
410 if (!g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
412 g_printerr("Bookloupe: Error reading %s\n",path);
413 g_printerr("%s\n",err->message);
425 g_strfreev(search_dirs);
/*
 * parse_config_file:
 *
 * Load the configuration file via read_config_file() and apply each key of
 * its "options" group.  A key is matched against the long names in
 * options[]; entries beginning with "no-" are tested for first.  Switch
 * values are read as booleans and string values as strings, then stored
 * through the matching entry's arg_data pointer.  A string value of "auto"
 * is stored as NULL.  Read errors and unknown keys are reported on stderr
 * with the config file path and key name.
 */
433 void parse_config_file(void)
440 config=read_config_file(&path);
442 keys=g_key_file_get_keys(config,"options",NULL,NULL);
449 for(j=0;options[j].long_name;j++)
451 if (g_str_has_prefix(options[j].long_name,"no-"))
453 else if (!strcmp(keys[i],options[j].long_name))
455 if (options[j].arg==G_OPTION_ARG_NONE)
457 sw=g_key_file_get_boolean(config,"options",keys[i],
461 g_printerr("Bookloupe: %s: options.%s: %s\n",
462 path,keys[i],err->message);
/* NOTE(review): the statement guarded by this test is not visible here. */
467 if (options[j].flags&G_OPTION_FLAG_REVERSE)
469 *(gboolean *)options[j].arg_data=sw;
473 else if (options[j].arg==G_OPTION_ARG_STRING)
475 s=g_key_file_get_string(config,"options",keys[i],
479 g_printerr("Bookloupe: %s: options.%s: %s\n",
480 path,keys[i],err->message);
/* Release the previous string value before replacing it. */
485 g_free(*(gchar **)options[j].arg_data);
486 if (!g_strcmp0(s,"auto"))
488 *(gchar **)options[j].arg_data=NULL;
492 *(gchar **)options[j].arg_data=s;
497 g_assert_not_reached();
/* The scan ran off the end of options[] without finding the key. */
500 if (!options[j].long_name)
501 g_printerr("Bookloupe: %s: Unknown option \"%s\" ignored\n",
/*
 * parse_options:
 * @argc: address of main()'s argument count
 * @argv: address of main()'s argument vector
 *
 * Build a GOptionContext from options[], config_options[] and the
 * "compatibility" group, parse the command line into pswit[] and the other
 * option variables, and then apply the follow-up adjustments visible below:
 * the compatibility toggles, the --web overrides, the character set,
 * --dump-config, and switching echo off in overview mode.
 *
 * NOTE(review): the exit paths after the error messages are not visible
 * here.
 */
510 void parse_options(int *argc,char ***argv)
513 GOptionContext *context;
514 GOptionGroup *compatibility;
515 context=g_option_context_new(
516 "file - look for errors in Project Gutenberg(TM) etexts");
517 g_option_context_add_main_entries(context,options,NULL);
518 g_option_context_add_main_entries(context,config_options,NULL);
519 compatibility=g_option_group_new("compatibility",
520 "Options for Compatibility with Gutcheck:",
521 "Show compatibility options",NULL,NULL);
522 g_option_group_add_entries(compatibility,compatibility_options);
523 g_option_context_add_group(context,compatibility);
524 g_option_context_set_description(context,
525 "For simplicity, only the switch options which reverse the\n"
526 "default configuration are listed. In most cases, both vanilla\n"
527 "and \"no-\" prefixed versions are available for use.");
528 if (!g_option_context_parse(context,argc,argv,&err))
530 g_printerr("Bookloupe: %s\n",err->message);
531 g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
/*
 * NOTE(review): the guards on the next three assignments are not visible;
 * presumably the first is under typo_compat and the pair under
 * paranoid_compat, matching the help text of compatibility_options[].
 */
535 pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
538 pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
539 pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
542 * Web uploads - for the moment, this is really just a placeholder
543 * until we decide what processing we really want to do on web uploads
545 if (pswit[WEB_SWITCH])
547 /* specific override for web uploads */
548 pswit[ECHO_SWITCH]=TRUE;
549 pswit[SQUOTE_SWITCH]=FALSE;
550 pswit[TYPO_SWITCH]=TRUE;
551 pswit[QPARA_SWITCH]=FALSE;
552 pswit[PARANOID_SWITCH]=TRUE;
553 pswit[LINE_END_SWITCH]=FALSE;
554 pswit[OVERVIEW_SWITCH]=FALSE;
555 pswit[STDOUT_SWITCH]=FALSE;
556 pswit[HEADER_SWITCH]=TRUE;
557 pswit[VERBOSE_SWITCH]=FALSE;
558 pswit[MARKUP_SWITCH]=FALSE;
559 pswit[USERTYPO_SWITCH]=FALSE;
560 pswit[DP_SWITCH]=FALSE;
/* Apply --charset only when one was supplied; report a bad name. */
562 if (opt_charset && !set_charset(opt_charset,&err))
564 g_printerr("%s\n",err->message);
567 if (pswit[DUMP_CONFIG_SWITCH])
574 if (pswit[OVERVIEW_SWITCH])
575 /* just print summary; don't echo */
576 pswit[ECHO_SWITCH]=FALSE;
582 g_option_context_free(context);
588 * Read in the user-defined stealth scanno list.
/*
 * read_user_scannos:
 *
 * Load the user's typo list into the usertypo tree.  Four locations are
 * tried in order, moving on only when the previous one does not exist:
 * "bookloupe.typ" in the current directory, "bookloupe.typ" beside the
 * program, "gutcheck.typ" in the current directory, and "gutcheck.typ"
 * beside the program.  If none exists a notice is printed; any other read
 * error is reported on stderr with the file name.
 *
 * File contents that validate as UTF-8 are normalised; otherwise they are
 * converted from WINDOWS-1252 to UTF-8.
 */
590 void read_user_scannos(void)
593 gchar *usertypo_file;
597 gchar *contents,*utf8,**lines;
598 usertypo_file=g_strdup("bookloupe.typ");
599 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
600 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
603 g_free(usertypo_file);
604 usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
605 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
607 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
610 g_free(usertypo_file);
611 usertypo_file=g_strdup("gutcheck.typ");
612 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
614 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
617 g_free(usertypo_file);
618 usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
619 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
/* Still missing after all four attempts: carry on without a list. */
621 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
623 g_free(usertypo_file);
624 g_print(" --> I couldn't find bookloupe.typ "
625 "-- proceeding without user typos.\n");
630 fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
631 g_free(usertypo_file);
635 if (g_utf8_validate(contents,len,NULL))
637 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
/* NOTE(review): any guard on this call is not visible here. */
639 (void)set_charset("UNICODE",NULL);
642 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
/*
 * Splitting on either CR or LF turns a CRLF pair into an empty string,
 * which the first-byte test below then rejects.
 */
644 lines=g_strsplit_set(utf8,"\r\n",0);
/* The tree frees its keys with g_free, so inserted lines belong to it. */
646 usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
647 for (i=0;lines[i];i++)
/* Only lines whose first byte is greater than '!' are kept. */
648 if (*(unsigned char *)lines[i]>'!')
649 g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
658 * Read an etext returning a newly allocated string containing the file
659 * contents or NULL on error.
/*
 * read_etext:
 * @filename: file to read
 * @err: location for a GError
 *
 * Read the whole file.  Contents that validate as UTF-8 are normalised and
 * print_as_utf_8 becomes the GLib print handler.  Otherwise the contents
 * are converted from WINDOWS-1252 to UTF-8 and print_as_windows_1252
 * becomes the print handler.  An illegal byte sequence during conversion is
 * reported through err with the offending byte's value, line and column;
 * other conversion errors are propagated unchanged.
 *
 * NOTE(review): the SetConsoleOutputCP() calls are presumably inside
 * Windows-only conditionals that are not visible here.
 */
661 gchar *read_etext(const char *filename,GError **err)
663 GError *tmp_err=NULL;
664 gchar *contents,*utf8;
665 gsize len,bytes_read,bytes_written;
667 if (!g_file_get_contents(filename,&contents,&len,err))
669 if (g_utf8_validate(contents,len,NULL))
671 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
672 g_set_print_handler(print_as_utf_8);
674 SetConsoleOutputCP(CP_UTF8);
679 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,
680 &bytes_written,&tmp_err);
681 if (g_error_matches(tmp_err,G_CONVERT_ERROR,
682 G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
/*
 * Walk the bytes consumed before the failure, treating '\n' and '\r'
 * specially, to derive the line and column used in the message below.
 */
685 for(i=0;i<bytes_read;i++)
686 if (contents[i]=='\n')
691 else if (contents[i]!='\r')
693 g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
694 "Input conversion failed. Byte %d at line %d, column %d is not a "
695 "valid Windows-1252 character",
696 ((unsigned char *)contents)[bytes_read],line,col);
699 g_propagate_error(err,tmp_err);
700 g_set_print_handler(print_as_windows_1252);
702 SetConsoleOutputCP(1252);
/*
 * cleanup_on_exit:
 *
 * Registered with atexit() by main().  Restores the console output code
 * page that main() saved in saved_cp.
 */
709 void cleanup_on_exit(void)
712 SetConsoleOutputCP(saved_cp);
/*
 * main:
 *
 * Program entry point.  Sets the switch defaults, parses the command line,
 * and in overview mode prints the per-category query counts and their
 * total.  Before exit it releases running_from, the usertypo tree, the
 * character-set validator (via set_charset(NULL,NULL)) and the config key
 * file.
 *
 * NOTE(review): the calls that process the input file are not visible
 * here.
 */
716 int main(int argc,char **argv)
719 atexit(cleanup_on_exit);
720 saved_cp=GetConsoleOutputCP();
/* Directory part of argv[0]; used when looking for config and typo files. */
722 running_from=g_path_get_dirname(argv[0]);
723 /* Paranoid checking is turned OFF, not on, by its switch */
724 pswit[PARANOID_SWITCH]=TRUE;
725 /* if running in paranoid mode, typo checks default to enabled */
726 pswit[TYPO_SWITCH]=TRUE;
727 /* Line-end checking is turned OFF, not on, by its switch */
728 pswit[LINE_END_SWITCH]=TRUE;
729 /* Echoing is turned OFF, not on, by its switch */
730 pswit[ECHO_SWITCH]=TRUE;
732 parse_options(&argc,&argv);
733 if (pswit[USERTYPO_SWITCH])
735 fprintf(stderr,"bookloupe: Check and report on an e-text\n");
737 if (pswit[OVERVIEW_SWITCH])
739 g_print(" Checked %ld lines of %ld (head+foot = %ld)\n\n",
740 checked_linecnt,linecnt,linecnt-checked_linecnt);
741 g_print(" --------------- Queries found --------------\n");
743 g_print(" Long lines: %14ld\n",cnt_long);
745 g_print(" Short lines: %14ld\n",cnt_short);
747 g_print(" Line-end problems: %14ld\n",cnt_lineend);
749 g_print(" Common typos: %14ld\n",cnt_word);
751 g_print(" Unmatched quotes: %14ld\n",cnt_dquot);
753 g_print(" Unmatched SingleQuotes: %14ld\n",cnt_squot);
755 g_print(" Unmatched brackets: %14ld\n",cnt_brack);
757 g_print(" Non-ASCII characters: %14ld\n",cnt_bin);
759 g_print(" Proofing characters: %14ld\n",cnt_odd);
761 g_print(" Punctuation & spacing queries: %14ld\n",cnt_punct);
763 g_print(" Non-standard dashes: %14ld\n",cnt_dash);
765 g_print(" Possible HTML tags: %14ld\n",cnt_html);
/* The total covers the twelve category counters; cnt_spacend is not in it. */
767 g_print(" TOTAL QUERIES %14ld\n",
768 cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+
769 cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend);
771 g_free(running_from);
773 g_tree_unref(usertypo);
/* A NULL name makes set_charset() close any open validator. */
774 set_charset(NULL,NULL);
776 g_key_file_free(config);
783 * Run a first pass - verify that it's a valid PG
784 * file, decide whether to report some things that
785 * occur many times in the text like long or short
786 * lines, non-standard dashes, etc.
/*
 * first_pass:
 * @etext: the whole text, with lines separated by '\n'
 *
 * Scan every line once, looking for the Project Gutenberg header and
 * footer markers and gathering whole-file statistics (line lengths, hi-bit
 * and alphabetic characters, dashes, markup, a few language marker words)
 * in a struct first_pass_results.
 *
 * NOTE(review): many lines of this function are not visible here,
 * including most counter increments and the return statement.  The
 * comments below describe only the tests that are visible.
 */
788 struct first_pass_results *first_pass(const char *etext)
790 gunichar laststart=CHAR_SPACE;
795 unsigned int lastlen=0,lastblen=0;
796 long spline=0,nspline=0;
/* Static storage, so the collected results outlive this call. */
797 static struct first_pass_results results={0};
799 lines=g_strsplit(etext,"\n",0);
800 for (j=0;lines[j];j++)
/* Strip trailing CRs; lbytes is the byte length, llen the character count. */
802 lbytes=strlen(lines[j]);
803 while (lbytes>0 && lines[j][lbytes-1]=='\r')
804 lines[j][--lbytes]='\0';
805 llen=g_utf8_strlen(lines[j],lbytes);
/* Old-form header end: "*END" with "SMALL PRINT" and a rights phrase. */
807 if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
808 (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
811 g_print(" --> Duplicate header?\n");
812 spline=linecnt+1; /* first line of non-header text, that is */
/* New-form header: line begins "*** START" and names PROJECT GUTENBERG. */
814 if (!strncmp(lines[j],"*** START",9) &&
815 strstr(lines[j],"PROJECT GUTENBERG"))
818 g_print(" --> Duplicate header?\n");
819 nspline=linecnt+1; /* first line of non-header text, that is */
/* Footer detection only starts once some header form has been seen. */
821 if (spline || nspline)
823 lc_line=g_utf8_strdown(lines[j],lbytes);
824 if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
/* "end" must come before "project gutenberg" on the line. */
826 if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
828 if (results.footerline)
830 /* it's an old-form header - we can detect duplicates */
832 g_print(" --> Duplicate footer?\n");
835 results.footerline=linecnt;
841 results.firstline=spline;
843 results.firstline=nspline; /* override with new */
844 if (results.footerline)
845 continue; /* don't count the boilerplate in the footer */
846 results.totlen+=llen;
847 for (s=lines[j];*s;s=g_utf8_next_char(s))
849 if (g_utf8_get_char(s)>127)
851 if (g_unichar_isalpha(g_utf8_get_char(s)))
/*
 * NOTE(review): isalpha() is handed a gunichar here, which may be outside
 * the range isalpha() accepts; the test above uses g_unichar_isalpha().
 */
853 if (s>lines[j] && g_utf8_get_char(s)==CHAR_DQUOTE &&
854 isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
855 results.endquote_count++;
857 if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
858 lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
861 g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
863 if (strstr(lines[j],".,"))
865 /* only count asterisk lines for ignoring purposes where there is */
866 /* lowercase text on the line */
867 if (strchr(lines[j],'*'))
869 for (s=lines[j];*s;s=g_utf8_next_char(s))
870 if (g_unichar_islower(g_utf8_get_char(s)))
875 if (strchr(lines[j],'/'))
876 results.fslashline++;
/* Back up over trailing white space, then look for a lone final hyphen. */
879 for (s=g_utf8_prev_char(lines[j]+lbytes);
880 s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;
881 s=g_utf8_prev_char(s))
883 if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
884 g_utf8_get_char(g_utf8_prev_char(s))!='-')
887 if (llen>LONGEST_PG_LINE)
889 if (llen>WAY_TOO_LONG)
890 results.verylongline++;
/* A line holding both '<' and '>' is examined as possible markup. */
891 if (strchr(lines[j],'<') && strchr(lines[j],'>'))
/* Span from the first '<' to the first '>', inclusive. */
893 i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
896 if (strstr(lines[j],"<i>"))
897 results.htmcount+=4; /* bonus marks! */
899 /* Check for spaced em-dashes */
/* The search starts at the second character, so s[-1] stays in the line. */
900 if (lines[j][0] && (s=strstr(g_utf8_next_char(lines[j]),"--")))
903 if (s[-1]==CHAR_SPACE || s[2]==CHAR_SPACE)
904 results.space_emdash++;
905 if (s[-1]==CHAR_SPACE && s[2]==CHAR_SPACE)
906 /* count of em-dashes with spaces both sides */
907 results.non_PG_space_emdash++;
908 if (s[-1]!=CHAR_SPACE && s[2]!=CHAR_SPACE)
909 /* count of PG-type em-dashes with no spaces */
910 results.PG_space_emdash++;
/* Marker words feeding the Dutch, French and standalone-digit counts. */
915 if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))
916 results.Dutchcount++;
917 if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))
918 results.Frenchcount++;
919 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
920 results.standalone_digit++;
923 /* Check for spaced dashes */
924 if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
/* Records the first byte of the line, not a decoded character. */
928 laststart=lines[j][0];
937 * Make some snap decisions based on the first pass results.
/*
 * report_first_pass:
 * @results: statistics gathered by first_pass()
 *
 * Decide, from the first-pass statistics, which categories of query occur
 * too often to be worth reporting, and print a line explaining each such
 * decision.  The decisions are recorded in a static struct warnings.  The
 * function also switches MARKUP_SWITCH on when the text looks like HTML,
 * and clears results->firstline when the header and footer lie fewer than
 * 100 lines apart.
 *
 * NOTE(review): many lines are not visible here, including several of the
 * warnings.* assignments, the early exits after the "Terminating" messages
 * and the return statement.
 */
939 struct warnings *report_first_pass(struct first_pass_results *results)
941 static struct warnings warnings={0};
943 g_print(" --> %ld lines in this file have white space at end\n",
946 if (results->dotcomma>5)
949 g_print(" --> %ld lines in this file contain '.,'. "
950 "Not reporting them.\n",results->dotcomma);
953 * If more than 50 lines, or one-tenth, are short,
954 * don't bother reporting them.
956 warnings.shortline=1;
957 if (results->shortline>50 || results->shortline*10>linecnt)
959 warnings.shortline=0;
960 g_print(" --> %ld lines in this file are short. "
961 "Not reporting short lines.\n",results->shortline);
964 * If more than 50 lines, or one-tenth, are long,
965 * don't bother reporting them.
968 if (results->longline>50 || results->longline*10>linecnt)
971 g_print(" --> %ld lines in this file are long. "
972 "Not reporting long lines.\n",results->longline);
974 /* If more than 10 lines contain asterisks, don't bother reporting them. */
976 if (results->astline>10)
979 g_print(" --> %ld lines in this file contain asterisks. "
980 "Not reporting them.\n",results->astline);
983 * If more than 10 lines contain forward slashes,
984 * don't bother reporting them.
987 if (results->fslashline>10)
990 g_print(" --> %ld lines in this file contain forward slashes. "
991 "Not reporting them.\n",results->fslashline);
994 * If more than 20 lines contain unpunctuated endquotes,
995 * don't bother reporting them.
998 if (results->endquote_count>20)
1000 warnings.endquote=0;
1001 g_print(" --> %ld lines in this file contain unpunctuated endquotes. "
1002 "Not reporting them.\n",results->endquote_count);
1005 * If more than 15 lines contain standalone digits,
1006 * don't bother reporting them.
/*
 * NOTE(review): the comment above says 15, but the test below fires at
 * more than 10.
 */
1009 if (results->standalone_digit>10)
1012 g_print(" --> %ld lines in this file contain standalone 0s and 1s. "
1013 "Not reporting them.\n",results->standalone_digit);
1016 * If more than 20 lines contain hyphens at end,
1017 * don't bother reporting them.
1020 if (results->hyphens>20)
1023 g_print(" --> %ld lines in this file have hyphens at end. "
1024 "Not reporting them.\n",results->hyphens);
/* Side effect on the global switches: HTML handling is turned on. */
1026 if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
1028 g_print(" --> Looks like this is HTML. Switching HTML mode ON.\n");
1029 pswit[MARKUP_SWITCH]=1;
1031 if (results->verylongline>0)
1032 g_print(" --> %ld lines in this file are VERY long!\n",
1033 results->verylongline);
1035 * If there are more non-PG spaced dashes than PG em-dashes,
1036 * assume it's deliberate.
1037 * Current PG guidelines say don't use them, but older texts do,
1038 * and some people insist on them whatever the guidelines say.
1041 if (results->spacedash+results->non_PG_space_emdash>
1042 results->PG_space_emdash)
1045 g_print(" --> There are %ld spaced dashes and em-dashes. "
1046 "Not reporting them.\n",
1047 results->spacedash+results->non_PG_space_emdash);
1053 /* Charset ISO_8859-1/ASCII checks for compatibility with gutcheck */
1055 /* If more than a quarter of characters are hi-bit, bug out. */
1056 if (results->binlen*4>results->totlen)
1058 g_print(" --> This file does not appear to be ASCII. "
1059 "Terminating. Best of luck with it!\n");
/* Fewer than a quarter of the characters are alphabetic. */
1062 if (results->alphalen*4<results->totlen)
1064 g_print(" --> This file does not appear to be text. "
1065 "Terminating. Best of luck with it!\n");
/* More than one character in a hundred, or more than 100 in all, is hi-bit. */
1068 if (results->binlen*100>results->totlen || results->binlen>100)
1070 g_print(" --> There are a lot of foreign letters here. "
1071 "Not reporting them.\n");
1072 if (!pswit[VERBOSE_SWITCH])
1076 warnings.isDutch=FALSE;
1077 if (results->Dutchcount>50)
1079 warnings.isDutch=TRUE;
1080 g_print(" --> This looks like Dutch - "
1081 "switching off dashes and warnings for 's Middags case.\n");
1083 warnings.isFrench=FALSE;
1084 if (results->Frenchcount>50)
1086 warnings.isFrench=TRUE;
1087 g_print(" --> This looks like French - "
1088 "switching off some doublepunct.\n");
1090 if (results->firstline && results->footerline)
1091 g_print(" The PG header and footer appear to be already on.\n");
1094 if (results->firstline)
1095 g_print(" The PG header is on - no footer.\n");
1096 if (results->footerline)
1097 g_print(" The PG footer is on - no header.\n");
/* Verbose mode re-enables warnings that the tests above may have cleared. */
1100 if (pswit[VERBOSE_SWITCH])
1102 warnings.shortline=1;
1103 warnings.dotcomma=1;
1104 warnings.longline=1;
1110 warnings.endquote=1;
1111 g_print(" *** Verbose output is ON -- you asked for it! ***\n");
1113 if (warnings.isDutch)
/* Header and footer both found, in order, but under 100 lines apart. */
1115 if (results->footerline>0 && results->firstline>0 &&
1116 results->footerline>results->firstline &&
1117 results->footerline-results->firstline<100)
1119 g_print(" --> I don't really know where this text starts. \n");
1120 g_print(" There are no reference points.\n");
1121 g_print(" I'm going to have to report the header and footer "
1123 results->firstline=0;
1131 * Look along the line, accumulate the count of quotes, and see
1132 * if this is an empty line - i.e. a line with nothing on it
1134 * If line has just spaces, period, * and/or - on it, don't
1135 * count it, since empty lines with asterisks or dashes to
1136 * separate sections are common.
1138 * Returns: TRUE if the line is empty.
/*
 * analyse_quotes:
 * @aline: the line to examine
 * @counters: running counts of quotes, brackets and underscores
 *
 * Walk the line one UTF-8 character at a time.  Single quotes are
 * classified as opening quote, apostrophe or closing quote from the
 * characters on either side; brackets and underscores are counted.  The
 * line counts as empty unless a character other than the spacer
 * characters tested below is found.
 *
 * NOTE(review): the loop header, the guessquote increments and the return
 * statement are not visible here.
 */
1140 gboolean analyse_quotes(const char *aline,struct counters *counters)
1143 /* assume the line is empty until proven otherwise */
1144 gboolean isemptyline=TRUE;
1145 const char *s=aline,*sprev,*snext;
1150 snext=g_utf8_next_char(s);
1151 c=g_utf8_get_char(s);
1154 if (CHAR_IS_SQUOTE(c))
1159 * At start of line, it can only be an openquote.
1160 * Hardcode a very common exception!
1162 if (!g_str_has_prefix(snext,"tis") &&
1163 !g_str_has_prefix(snext,"Tis"))
1164 increment_matching(counters,c,TRUE);
/* A letter on both sides: the quote is inside a word. */
1166 else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
1167 g_unichar_isalpha(g_utf8_get_char(snext)))
1168 /* Do nothing! it's definitely an apostrophe, not a quote */
1170 /* it's outside a word - let's check it out */
1171 else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||
1172 g_unichar_isalpha(g_utf8_get_char(snext)))
1174 /* it damwell better BE an openquote */
1175 if (!g_str_has_prefix(snext,"tis") &&
1176 !g_str_has_prefix(snext,"Tis"))
1177 /* hardcode a very common exception! */
1178 increment_matching(counters,c,TRUE);
1182 /* now - is it a closequote? */
1183 guessquote=0; /* accumulate clues */
1184 if (g_unichar_isalpha(g_utf8_get_char(sprev)))
1186 /* it follows a letter - could be either */
1188 if (g_utf8_get_char(sprev)=='s')
1190 /* looks like a plural apostrophe */
1192 if (g_utf8_get_char(snext)==CHAR_SPACE)
1197 /* it doesn't have a letter either side */
1198 else if (strchr(".?!,;:",g_utf8_get_char(sprev)) &&
1199 strchr(".?!,;: ",g_utf8_get_char(snext)))
1200 guessquote+=8; /* looks like a closequote */
1203 if (matching_difference(counters,CHAR_SQUOTE)>0)
1205 * Give it the benefit of some doubt,
1206 * if a squote is already open.
1212 increment_matching(counters,c,FALSE);
1215 if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
1217 isemptyline=FALSE; /* ignore lines like * * * as spacers */
1218 if (c==CHAR_UNDERSCORE)
1219 counters->c_unders++;
1220 if (c==CHAR_OPEN_SBRACK)
/*
 * "[Illustration:" at the very start of the line, with no square bracket
 * or illustration already open, is counted under COUNTER_ILLUSTRATION.
 */
1222 if (!matching_difference(counters,COUNTER_ILLUSTRATION) &&
1223 !matching_difference(counters,c) && s==aline &&
1224 g_str_has_prefix(s,"[Illustration:"))
1225 increment_matching(counters,COUNTER_ILLUSTRATION,TRUE);
1227 increment_matching(counters,c,TRUE);
1229 else if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK)
1230 increment_matching(counters,c,TRUE);
1231 if (c==CHAR_CLOSE_SBRACK)
/* The illustration branch requires the ']' to be the last character. */
1233 if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) &&
1234 !matching_difference(counters,c) && !*snext)
1235 increment_matching(counters,COUNTER_ILLUSTRATION,FALSE);
1237 increment_matching(counters,c,FALSE);
1239 else if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK)
1240 increment_matching(counters,c,FALSE);
1248 * check_for_control_characters:
1250 * Check for invalid or questionable characters in the line
1251 * Anything above 127 is invalid for plain ASCII, and
1252 * non-printable control characters should also be flagged.
1253 * Tabs should generally not be there.
/*
 * check_for_control_characters:
 * @aline: NUL-terminated UTF-8 text of the line to check.
 *
 * Walks the line one UTF-8 character at a time and reports every code
 * point below CHAR_SPACE other than CHAR_LF, CHAR_CR and CHAR_TAB.
 * With ECHO_SWITCH set the line itself is printed first; the detailed
 * message is printed unless OVERVIEW_SWITCH is set.
 */
1255 void check_for_control_characters(const char *aline)
1259 for (s=aline;*s;s=g_utf8_next_char(s))
1261 c=g_utf8_get_char(s);
1262 if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
1264 if (pswit[ECHO_SWITCH])
1265 g_print("\n%s\n",aline);
/*
 * The column is the 1-based character offset of s from the start of the
 * line, so the line start must be the first argument; with the arguments
 * swapped the offset comes out zero or negative.
 */
1266 if (!pswit[OVERVIEW_SWITCH])
1267 g_print(" Line %ld column %ld - Control character %u\n",
1268 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1276 * check_for_odd_characters:
1278 * Check for binary and other odd characters.
/*
 * check_for_odd_characters:
 * @aline: NUL-terminated UTF-8 text of the line to check.
 * @warnings: per-file switches; bin, fslash and ast are read here.
 * @isemptyline: TRUE for spacer lines; suppresses the asterisk query.
 *
 * Scans the line for characters that deserve a query: non-ASCII or
 * control characters, characters not valid in the selected charset,
 * tabs, tildes, carats, forward slashes and asterisks.  The e* flags
 * exist so that each kind of warning is not repeated on one line.
 *
 * NOTE(review): the statements that set the e* flags, and the handling
 * of the conversion result t, are not among the statements documented
 * here - confirm against the full function.
 */
1280 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
1281 gboolean isemptyline)
1283 /* Don't repeat multiple warnings on one line. */
1284 gboolean eInvalidChar=FALSE,eTab=FALSE,eTilde=FALSE;
1285 gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
1290 for (s=aline;*s;s=g_utf8_next_char(s))
1292 c=g_utf8_get_char(s);
/* Control characters other than tab and newline, or anything above 127. */
1293 if (warnings->bin && !eInvalidChar &&
1294 (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
1296 if (pswit[ECHO_SWITCH])
1297 g_print("\n%s\n",aline);
/* 128-159 and anything above 255 gets the ISO-8859 wording, the rest the ASCII wording. */
1298 if (!pswit[OVERVIEW_SWITCH])
1299 if (c>127 && c<160 || c>255)
1300 g_print(" Line %ld column %ld - "
1301 "Non-ISO-8859 character %u\n",
1302 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1304 g_print(" Line %ld column %ld - "
1305 "Non-ASCII character %u\n",
1306 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1311 if (!eInvalidChar && charset)
/* No converter open: validate the code point directly instead. */
1313 if (charset_validator==(GIConv)-1)
1315 if (!g_unichar_isdefined(c))
1317 if (pswit[ECHO_SWITCH])
1318 g_print("\n%s\n",aline);
1319 if (!pswit[OVERVIEW_SWITCH])
1320 g_print(" Line %ld column %ld - Unassigned UNICODE "
1321 "code point U+%04" G_GINT32_MODIFIER "X\n",
1322 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
/*
 * Private Use ranges: U+E000..U+F8FF in the BMP, U+F0000..U+FFFFD
 * (plane 15) and U+100000..U+10FFFD (plane 16).  The last lower bound
 * must be hexadecimal; as decimal 100000 it would also catch assigned
 * characters from U+186A0 upwards.
 */
1327 else if (c>=0xE000 && c<=0xF8FF || c>=0xF0000 && c<=0xFFFFD ||
1328 c>=0x100000 && c<=0x10FFFD)
1330 if (pswit[ECHO_SWITCH])
1331 g_print("\n%s\n",aline);
1332 if (!pswit[OVERVIEW_SWITCH])
1333 g_print(" Line %ld column %ld - Private Use "
1334 "character U+%04" G_GINT32_MODIFIER "X\n",
1335 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
/* Otherwise try converting just this one character with the charset converter. */
1343 t=g_convert_with_iconv(s,g_utf8_next_char(s)-s,
1344 charset_validator,NULL,&nb,NULL);
1349 if (pswit[ECHO_SWITCH])
1350 g_print("\n%s\n",aline);
1351 if (!pswit[OVERVIEW_SWITCH])
1352 g_print(" Line %ld column %ld - Non-%s "
1353 "character %u\n",linecnt,
1354 g_utf8_pointer_to_offset(aline,s)+1,charset,c);
1361 if (!eTab && c==CHAR_TAB)
1363 if (pswit[ECHO_SWITCH])
1364 g_print("\n%s\n",aline);
1365 if (!pswit[OVERVIEW_SWITCH])
1366 g_print(" Line %ld column %ld - Tab character?\n",
1367 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1372 if (!eTilde && c==CHAR_TILDE)
1375 * Often used by OCR software to indicate an
1376 * unrecognizable character.
1378 if (pswit[ECHO_SWITCH])
1379 g_print("\n%s\n",aline);
1380 if (!pswit[OVERVIEW_SWITCH])
1381 g_print(" Line %ld column %ld - Tilde character?\n",
1382 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1387 if (!eCarat && c==CHAR_CARAT)
1389 if (pswit[ECHO_SWITCH])
1390 g_print("\n%s\n",aline);
1391 if (!pswit[OVERVIEW_SWITCH])
1392 g_print(" Line %ld column %ld - Carat character?\n",
1393 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Forward slashes are only queried when warnings->fslash is set. */
1398 if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
1400 if (pswit[ECHO_SWITCH])
1401 g_print("\n%s\n",aline);
1402 if (!pswit[OVERVIEW_SWITCH])
1403 g_print(" Line %ld column %ld - Forward slash?\n",
1404 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1410 * Report asterisks only in paranoid mode,
1411 * since they're often deliberate.
1413 if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
1416 if (pswit[ECHO_SWITCH])
1417 g_print("\n%s\n",aline);
1418 if (!pswit[OVERVIEW_SWITCH])
1419 g_print(" Line %ld column %ld - Asterisk?\n",
1420 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1429 * check_for_long_line:
1431 * Check for line too long.
/*
 * check_for_long_line:
 * @aline: NUL-terminated UTF-8 text of the line to check.
 *
 * Queries a line whose length in characters (not bytes) exceeds
 * LONGEST_PG_LINE.  The line length is printed both as the column and
 * as the reported length.
 */
1433 void check_for_long_line(const char *aline)
1435 if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
1437 if (pswit[ECHO_SWITCH])
1438 g_print("\n%s\n",aline);
1439 if (!pswit[OVERVIEW_SWITCH])
1440 g_print(" Line %ld column %ld - Long line %ld\n",
1441 linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
1448 * check_for_short_line:
1450 * Check for line too short.
1452 * This one is a bit trickier to implement: we don't want to
1453 * flag the last line of a paragraph for being short, so we
1454 * have to wait until we know that our current line is a
1455 * "normal" line, then report the _previous_ line if it was too
1456 * short. We also don't want to report indented lines like
1457 * chapter heads or formatted quotations. We therefore keep
1458 * last->len as the length of the last line examined, and
1459 * last->blen as the length of the last but one, and try to
1460 * suppress unnecessary warnings by checking that both were of
1461 * "normal" length. We keep the first character of the last
1462 * line in last->start, and if it was a space, we assume that
1463 * the formatting is deliberate. I can't figure out a way to
1464 * distinguish something like a quoted verse left-aligned or
1465 * the header or footer of a letter from a paragraph of short
1466 * lines - maybe if I examined the whole paragraph, and if the
1467 * para has less than, say, 8 lines and if all lines are short,
1468 * then just assume it's OK? Need to look at some texts to see
1469 * how often a formula like this would get the right result.
/*
 * check_for_short_line:
 * @aline: NUL-terminated UTF-8 text of the current line.
 * @last: properties of earlier lines; len, blen and start are read.
 *
 * Queries the previous line (prevline, numbered linecnt-1) when the
 * current line has more than one character, last->len is between 2 and
 * SHORTEST_PG_LINE-1, last->blen exceeds SHORTEST_PG_LINE and
 * last->start is not a space.
 */
1471 void check_for_short_line(const char *aline,const struct line_properties *last)
1473 if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
1474 last->len<SHORTEST_PG_LINE && last->blen>1 &&
1475 last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
/* The report refers to the previous line, hence prevline and linecnt-1. */
1477 if (pswit[ECHO_SWITCH])
1478 g_print("\n%s\n",prevline);
1479 if (!pswit[OVERVIEW_SWITCH])
1480 g_print(" Line %ld column %ld - Short line %ld?\n",
1481 linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
1488 * check_for_starting_punctuation:
1490 * Look for punctuation other than full ellipses at start of line.
/*
 * check_for_starting_punctuation:
 * @aline: NUL-terminated UTF-8 text of the line to check.
 *
 * Queries a non-empty line whose first character is one of . ? ! , ; :
 * unless the line starts with the spaced ellipsis ". . .".
 */
1492 void check_for_starting_punctuation(const char *aline)
1494 if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
1495 !g_str_has_prefix(aline,". . ."))
1497 if (pswit[ECHO_SWITCH])
1498 g_print("\n%s\n",aline);
1499 if (!pswit[OVERVIEW_SWITCH])
1500 g_print(" Line %ld column 1 - Begins with punctuation?\n",
1508 * check_for_spaced_emdash:
1510 * Check for spaced em-dashes.
1512 * We must check _all_ occurrences of "--" on the line
1513 * hence the loop - even if the first double-dash is OK
1514 * there may be another that's wrong later on.
/*
 * check_for_spaced_emdash:
 * @aline: NUL-terminated UTF-8 text of the line to check.
 *
 * Finds every "--" on the line and queries those that have a space
 * immediately before (when not at the start of the line) or
 * immediately after.
 */
1516 void check_for_spaced_emdash(const char *aline)
1518 const char *s,*t,*next;
/* t is the next "--" at or after s; the loop ends when none is left. */
1519 for (s=aline;t=strstr(s,"--");s=next)
/* next is the character just past the two dashes; the search resumes there. */
1521 next=g_utf8_next_char(g_utf8_next_char(t));
1522 if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
1523 g_utf8_get_char(next)==CHAR_SPACE)
1525 if (pswit[ECHO_SWITCH])
1526 g_print("\n%s\n",aline);
1527 if (!pswit[OVERVIEW_SWITCH])
1528 g_print(" Line %ld column %ld - Spaced em-dash?\n",
1529 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1537 * check_for_spaced_dash:
1539 * Check for spaced dashes.
/*
 * check_for_spaced_dash:
 * @aline: NUL-terminated UTF-8 text of the line to check.
 *
 * Queries a single dash with a space before it (" -" not followed by
 * another dash) or, failing any " -" on the line, a single dash with a
 * space after it ("- " not preceded by another dash).  Only the first
 * occurrence of each pattern is examined.
 *
 * NOTE(review): because the "- " search hangs off an else, a line whose
 * first " -" turns out to be part of " --" is never checked for "- ".
 * Confirm that is intended.
 */
1541 void check_for_spaced_dash(const char *aline)
1544 if ((s=strstr(aline," -")))
/* Skip " --": that is an em-dash. */
1546 if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
1548 if (pswit[ECHO_SWITCH])
1549 g_print("\n%s\n",aline);
1550 if (!pswit[OVERVIEW_SWITCH])
1551 g_print(" Line %ld column %ld - Spaced dash?\n",
1552 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1557 else if ((s=strstr(aline,"- ")))
/* Skip "-- ": the dash found is the second half of an em-dash. */
1559 if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
1561 if (pswit[ECHO_SWITCH])
1562 g_print("\n%s\n",aline);
1563 if (!pswit[OVERVIEW_SWITCH])
1564 g_print(" Line %ld column %ld - Spaced dash?\n",
1565 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1573 * check_for_unmarked_paragraphs:
1575 * Check for unmarked paragraphs indicated by separate speakers.
1577 * May well be false positive:
1578 * "Bravo!" "Wonderful!" called the crowd.
1579 * but useful all the same.
/*
 * check_for_unmarked_paragraphs:
 * @aline: NUL-terminated UTF-8 text of the line to check.
 *
 * Searches the line for a closing double quote, a gap and an opening
 * double quote, and queries a possible missing paragraph break at the
 * position found.
 *
 * NOTE(review): the two searches below use the same literal as written
 * here; presumably one of them was meant to match a wider gap (e.g. two
 * spaces) - confirm.  The conditions that guard the second search and
 * the report are not among the statements documented here.
 */
1581 void check_for_unmarked_paragraphs(const char *aline)
1584 s=strstr(aline,"\" \"");
1586 s=strstr(aline,"\" \"");
1589 if (pswit[ECHO_SWITCH])
1590 g_print("\n%s\n",aline);
1591 if (!pswit[OVERVIEW_SWITCH])
1592 g_print(" Line %ld column %ld - "
1593 "Query missing paragraph break?\n",
1594 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1601 * check_for_jeebies:
1603 * Check for "to he" and other easy h/b errors.
1605 * This is a very inadequate effort on the h/b problem,
1606 * but the phrase "to he" is always an error, whereas "to
1607 * be" is quite common.
1608 * Similarly, '"Quiet!", be said.' is a non-be error
1609 * "to he" is _not_ always an error!:
1610 * "Where they went to he couldn't say."
1611 * Another false positive:
1612 * What would "Cinderella" be without the . . .
1613 * and another: "If he wants to he can see for himself."
/*
 * check_for_jeebies:
 * @aline: NUL-terminated UTF-8 text of the line to check.
 *
 * Searches the line for fixed phrases that usually indicate an h/b
 * scanning error, in three groups, each with its own query message:
 * he/be, had/bad and hut/but.  The column reported is that of the
 * first character of the matched phrase.
 *
 * NOTE(review): presumably each search after the first in a group only
 * runs when the earlier ones found nothing, and each report only when s
 * is non-NULL - those guards are not among the statements documented
 * here; confirm.
 */
1615 void check_for_jeebies(const char *aline)
/* Group 1: "be" where "he" was meant, and "to he". */
1618 s=strstr(aline," be could ");
1620 s=strstr(aline," be would ");
1622 s=strstr(aline," was be ");
1624 s=strstr(aline," be is ");
1626 s=strstr(aline," is be ");
1628 s=strstr(aline,"\", be ");
/*
 * NOTE(review): the next two literals are identical as written here;
 * presumably one was meant to differ in spacing - confirm.
 */
1630 s=strstr(aline,"\" be ");
1632 s=strstr(aline,"\" be ");
1634 s=strstr(aline," to he ");
1637 if (pswit[ECHO_SWITCH])
1638 g_print("\n%s\n",aline);
1639 if (!pswit[OVERVIEW_SWITCH])
1640 g_print(" Line %ld column %ld - Query he/be error?\n",
1641 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Group 2: "had" and "bad" confused. */
1645 s=strstr(aline," the had ");
1647 s=strstr(aline," a had ");
1649 s=strstr(aline," they bad ");
1651 s=strstr(aline," she bad ");
1653 s=strstr(aline," he bad ");
1655 s=strstr(aline," you bad ");
1657 s=strstr(aline," i bad ");
1660 if (pswit[ECHO_SWITCH])
1661 g_print("\n%s\n",aline);
1662 if (!pswit[OVERVIEW_SWITCH])
1663 g_print(" Line %ld column %ld - Query had/bad error?\n",
1664 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Group 3: "hut" after a semicolon or comma, where "but" was meant. */
1668 s=strstr(aline,"; hut ");
1670 s=strstr(aline,", hut ");
1673 if (pswit[ECHO_SWITCH])
1674 g_print("\n%s\n",aline);
1675 if (!pswit[OVERVIEW_SWITCH])
1676 g_print(" Line %ld column %ld - Query hut/but error?\n",
1677 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1684 * check_for_mta_from:
1686 * Special case - angled bracket in front of "From" placed there by an
1687 * MTA when sending an e-mail.
/*
 * check_for_mta_from:
 * @aline: NUL-terminated UTF-8 text of the line to check.
 *
 * Searches the line for the text ">From" and queries it, reporting the
 * column of the angled bracket.
 *
 * NOTE(review): the test that s is non-NULL before reporting is not
 * among the statements documented here - confirm.
 */
1689 void check_for_mta_from(const char *aline)
1692 s=strstr(aline,">From");
1695 if (pswit[ECHO_SWITCH])
1696 g_print("\n%s\n",aline);
1697 if (!pswit[OVERVIEW_SWITCH])
1698 g_print(" Line %ld column %ld - "
1699 "Query angled bracket with From\n",
1700 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1707 * check_for_orphan_character:
1709 * Check for a single character line -
1710 * often an overflow from bad wrapping.
/*
 * check_for_orphan_character:
 * @aline: NUL-terminated UTF-8 text of the line to check.
 *
 * Queries a line that consists of exactly one character, except when
 * that character is a digit or one of the letters I, V, X or L, which
 * are taken to be numerals standing alone.
 */
1712 void check_for_orphan_character(const char *aline)
1715 c=g_utf8_get_char(aline);
/* Exactly one character: c is non-zero and nothing follows it. */
1716 if (c && !*g_utf8_next_char(aline))
1718 if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
1719 ; /* Nothing - ignore numerals alone on a line. */
1722 if (pswit[ECHO_SWITCH])
1723 g_print("\n%s\n",aline);
1724 if (!pswit[OVERVIEW_SWITCH])
1725 g_print(" Line %ld column 1 - Query single character line\n",
1734 * check_for_pling_scanno:
1736 * Check for I" - often should be !
/*
 * check_for_pling_scanno:
 * @aline: NUL-terminated UTF-8 text of the line to check.
 *
 * Searches the line for a space, a capital I and a double quote, which
 * is often a mis-scanned exclamation mark before a closing quote.
 *
 * NOTE(review): unlike most reports in this file the column is printed
 * without "+1", so it is the 0-based offset of the space that starts
 * the match - confirm whether that is intended.  The test that s is
 * non-NULL is not among the statements documented here.
 */
1738 void check_for_pling_scanno(const char *aline)
1741 s=strstr(aline," I\"");
1744 if (pswit[ECHO_SWITCH])
1745 g_print("\n%s\n",aline);
1746 if (!pswit[OVERVIEW_SWITCH])
1747 g_print(" Line %ld column %ld - Query I=exclamation mark?\n",
1748 linecnt,g_utf8_pointer_to_offset(aline,s));
1755 * check_for_extra_period:
1757 * Check for period without a capital letter. Cut-down from gutspell.
1758 * Only works when it happens on a single line.
/*
 * check_for_extra_period:
 * @aline: NUL-terminated UTF-8 text of the line to check.
 * @warnings: per-file switches; isDutch is read here.
 *
 * Only active with PARANOID_SWITCH.  For each ". " on the line, looks
 * at the first letter or digit that follows; if that is a lower-case
 * letter, the word before the period is extracted into testword and
 * run through a series of tests (the abbrev[] list, leading digit,
 * single character, Roman numeral, presence of a vowel) before an
 * "Extra period?" query is printed.  Words already queried are
 * remembered in qperiod so that, unless VERBOSE_SWITCH is set, each is
 * only reported once.
 *
 * NOTE(review): the statements acted on by most of these tests, and the
 * release of testword, are not among the statements documented here -
 * confirm against the full function.
 */
1760 void check_for_extra_period(const char *aline,const struct warnings *warnings)
1762 const char *s,*t,*s1,*sprev;
1767 gunichar c,nc,pc,*decomposition;
1768 if (pswit[PARANOID_SWITCH])
/* t is repositioned on each ". " in turn. */
1770 for (t=aline;t=strstr(t,". ");)
1774 t=g_utf8_next_char(t);
1775 /* start of line punctuation is handled elsewhere */
1778 if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
1780 t=g_utf8_next_char(t);
1783 if (warnings->isDutch)
1785 /* For Frank & Jeroen -- 's Middags case */
1786 gunichar c2,c3,c4,c5;
/*
 * NOTE(review): these look up to five characters past t without first
 * checking for the end of the line - confirm the line is long enough.
 */
1787 c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
1788 c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
1789 c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
1790 c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
1791 if (CHAR_IS_APOSTROPHE(c2) &&
1792 g_unichar_islower(c3) && c4==CHAR_SPACE &&
1793 g_unichar_isupper(c5))
1795 t=g_utf8_next_char(t);
/*
 * Skip forward from just after ". " to the first letter or digit.  The
 * digit test uses g_unichar_isdigit() because the argument is a
 * gunichar; the <ctype.h> isdigit() is only defined for unsigned char
 * values and EOF.
 */
1799 s1=g_utf8_next_char(g_utf8_next_char(t));
1800 while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
1801 !g_unichar_isdigit(g_utf8_get_char(s1)))
1802 s1=g_utf8_next_char(s1);
1803 if (g_unichar_islower(g_utf8_get_char(s1)))
1805 /* we have something to investigate */
1807 /* so let's go back and find out */
/* nc, c and pc are the characters at t, before t, and before that. */
1808 nc=g_utf8_get_char(t);
1809 s1=g_utf8_prev_char(t);
1810 c=g_utf8_get_char(s1);
1811 sprev=g_utf8_prev_char(s1);
1812 pc=g_utf8_get_char(sprev);
1814 (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||
1815 g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&
1816 g_unichar_isalpha(nc)))
1821 sprev=g_utf8_prev_char(s1);
1822 pc=g_utf8_get_char(sprev);
1824 s1=g_utf8_next_char(s1);
/* testword is a freshly allocated copy of the word starting at s1. */
1827 testword=g_strndup(s1,s-s1);
1829 testword=g_strdup(s1);
1830 for (i=0;*abbrev[i];i++)
1831 if (!strcmp(testword,abbrev[i]))
1833 if (g_unichar_isdigit(g_utf8_get_char(testword)))
1835 if (!*g_utf8_next_char(testword))
1837 if (isroman(testword))
/* Look for a vowel, comparing the base letter of each decomposed character. */
1842 for (s=testword;*s;s=g_utf8_next_char(s))
1844 decomposition=g_unicode_canonical_decomposition(
1845 g_utf8_get_char(s),&len);
1846 if (g_utf8_strchr("aeiou",-1,decomposition[0]))
1848 g_free(decomposition);
1852 (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
/* Remember the word; qperiod is handed its own copy. */
1854 g_tree_insert(qperiod,g_strdup(testword),
1855 GINT_TO_POINTER(1));
1856 if (pswit[ECHO_SWITCH])
1857 g_print("\n%s\n",aline);
1858 if (!pswit[OVERVIEW_SWITCH])
1859 g_print(" Line %ld column %ld - Extra period?\n",
1860 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1866 t=g_utf8_next_char(t);
1872 * check_for_following_punctuation:
1874 * Check for words usually not followed by punctuation.
/*
 * check_for_following_punctuation:
 * @aline: NUL-terminated UTF-8 text of the line to check.
 *
 * Only active with TYPO_SWITCH.  Each word is lower-cased into inword
 * and compared with two word lists: a word found in nocomma[] is
 * queried when the character at s is , ; or : and a word found in
 * noperiod[] is queried when the character at s is . or !
 *
 * NOTE(review): how the words are extracted, where s is positioned and
 * how inword is released are not among the statements documented here -
 * presumably s points just past the current word; confirm.
 */
1876 void check_for_following_punctuation(const char *aline)
1879 const char *s,*wordstart;
1882 if (pswit[TYPO_SWITCH])
/* g_utf8_strdown() returns a newly allocated lower-case copy. */
1893 inword=g_utf8_strdown(t,-1);
/* Both lists are terminated by an empty string. */
1895 for (i=0;*nocomma[i];i++)
1896 if (!strcmp(inword,nocomma[i]))
1898 c=g_utf8_get_char(s);
1899 if (c==',' || c==';' || c==':')
1901 if (pswit[ECHO_SWITCH])
1902 g_print("\n%s\n",aline);
1903 if (!pswit[OVERVIEW_SWITCH])
1904 g_print(" Line %ld column %ld - "
1905 "Query punctuation after %s?\n",
1906 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
1912 for (i=0;*noperiod[i];i++)
1913 if (!strcmp(inword,noperiod[i]))
1915 c=g_utf8_get_char(s);
1916 if (c=='.' || c=='!')
1918 if (pswit[ECHO_SWITCH])
1919 g_print("\n%s\n",aline);
1920 if (!pswit[OVERVIEW_SWITCH])
1921 g_print(" Line %ld column %ld - "
1922 "Query punctuation after %s?\n",
1923 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
1937 * Check for commonly mistyped words,
1938 * and digits like 0 for O in a word.
/*
 * check_for_typos:
 * @aline: NUL-terminated UTF-8 text of the line to check.
 * @warnings: per-file switches; digit is read here.
 *
 * Takes the line one word at a time (getaword()) and applies, in order:
 *  - a mixed letters-and-digits query (mixdigit());
 *  - with TYPO_SWITCH or USERTYPO_SWITCH, a scan for upper-case letters
 *    in mid-word, followed by case-folding the word into testword;
 *  - with TYPO_SWITCH, the built-in heuristics: unlikely word starts and
 *    endings (nostart[], noend[]), unlikely letter groups, words with no
 *    vowel or no consonant, the okword[] exclusions, Roman numerals, the
 *    typo[] list and some single-letter words;
 *  - a lookup in the user's own list (usertypo);
 *  - with PARANOID_SWITCH and warnings->digit, a query for "0" or "1"
 *    standing alone.
 * Words queried are counted in qword so that duplicates can be
 * suppressed unless VERBOSE_SWITCH is set.
 *
 * NOTE(review): the statements that set istypo, vowel, consonant and
 * alower, and those that release inword and testword, are not among
 * the statements documented here - confirm against the full function.
 */
1940 void check_for_typos(const char *aline,struct warnings *warnings)
1942 const char *s,*t,*nt,*wordstart;
1944 gunichar *decomposition;
1946 int i,vowel,consonant,*dupcnt;
1947 gboolean isdup,istypo,alower;
1950 gsize decomposition_len;
1954 inword=getaword(&s);
1958 continue; /* don't bother with empty lines */
1960 if (mixdigit(inword))
1962 if (pswit[ECHO_SWITCH])
1963 g_print("\n%s\n",aline);
1964 if (!pswit[OVERVIEW_SWITCH])
1965 g_print(" Line %ld column %ld - Query digit in %s\n",
1966 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
1971 * Put the word through a series of tests for likely typos and OCR
1974 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
1978 for (t=inword;*t;t=g_utf8_next_char(t))
1980 c=g_utf8_get_char(t);
1981 nt=g_utf8_next_char(t);
1982 /* lowercase for testing */
1983 if (g_unichar_islower(c))
1985 if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
1988 * We have an uppercase mid-word. However, there are
1990 * Mac and Mc like McGill
1991 * French contractions like l'Abbe
/* offset is the character position of t within the word; pc the character before it. */
1993 offset=g_utf8_pointer_to_offset(inword,t);
1995 pc=g_utf8_get_char(g_utf8_prev_char(t));
1998 if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
1999 offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
2000 g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
2001 CHAR_IS_APOSTROPHE(pc))
/* All list comparisons below are made against the case-folded copy. */
2007 testword=g_utf8_casefold(inword,-1);
2009 if (pswit[TYPO_SWITCH])
2012 * Check for certain unlikely two-letter combinations at word
2015 len=g_utf8_strlen(testword,-1);
2018 for (i=0;*nostart[i];i++)
2019 if (g_str_has_prefix(testword,nostart[i]))
2021 for (i=0;*noend[i];i++)
2022 if (g_str_has_suffix(testword,noend[i]))
2025 /* ght is common, gbt never. Like that. */
2026 if (strstr(testword,"cb"))
2028 if (strstr(testword,"gbt"))
2030 if (strstr(testword,"pbt"))
2032 if (strstr(testword,"tbs"))
2034 if (strstr(testword,"mrn"))
2036 if (strstr(testword,"ahle"))
2038 if (strstr(testword,"ihle"))
2041 * "TBE" does happen - like HEARTBEAT - but uncommon.
2042 * Also "TBI" - frostbite, outbid - but uncommon.
2043 * Similarly "ii" like Hawaii, or Pompeii, and in Roman
2044 * numerals, but "ii" is a common scanno.
2046 if (strstr(testword,"tbi"))
2048 if (strstr(testword,"tbe"))
2050 if (strstr(testword,"ii"))
2053 * Check for no vowels or no consonants.
2054 * If none, flag a typo.
2056 if (!istypo && len>1)
2059 for (t=testword;*t;t=g_utf8_next_char(t))
2061 c=g_utf8_get_char(t);
2063 g_unicode_canonical_decomposition(c,&decomposition_len);
2064 if (c=='y' || g_unichar_isdigit(c))
2066 /* Yah, this is loose. */
/* The vowel test uses the base letter of the decomposed character. */
2070 else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
2074 g_free(decomposition);
2076 if (!vowel || !consonant)
2080 * Now exclude the word from being reported if it's in
2083 for (i=0;*okword[i];i++)
2084 if (!strcmp(testword,okword[i]))
2087 * What looks like a typo may be a Roman numeral.
2090 if (istypo && isroman(testword))
2092 /* Check the manual list of typos. */
2094 for (i=0;*typo[i];i++)
2095 if (!strcmp(testword,typo[i]))
2098 * Check lowercase s, l, i and m - special cases.
2099 * "j" - often a semi-colon gone wrong.
2100 * "d" for a missing apostrophe - he d
2103 if (!istypo && len==1 &&
2104 g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
/* qword maps each queried word to a heap-allocated occurrence count. */
2108 dupcnt=g_tree_lookup(qword,testword);
2112 isdup=!pswit[VERBOSE_SWITCH];
2116 dupcnt=g_new0(int,1);
2117 g_tree_insert(qword,g_strdup(testword),dupcnt);
2122 if (pswit[ECHO_SWITCH])
2123 g_print("\n%s\n",aline);
2124 if (!pswit[OVERVIEW_SWITCH])
2126 g_print(" Line %ld column %ld - Query word %s",
2127 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
2129 if (!pswit[VERBOSE_SWITCH])
2130 g_print(" - not reporting duplicates");
2138 /* check the user's list of typos */
2139 if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
2141 if (pswit[ECHO_SWITCH])
2142 g_print("\n%s\n",aline);
/*
 * NOTE(review): this report and the "standalone" one below add 2 to the
 * word offset where the two reports above add 1 - confirm which column
 * is intended.
 */
2143 if (!pswit[OVERVIEW_SWITCH])
2144 g_print(" Line %ld column %ld - Query possible scanno %s\n",
2145 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
2147 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
2149 if (pswit[PARANOID_SWITCH] && warnings->digit)
2151 /* In paranoid mode, query all 0 and 1 standing alone. */
2152 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
2154 if (pswit[ECHO_SWITCH])
2155 g_print("\n%s\n",aline);
2156 if (!pswit[OVERVIEW_SWITCH])
2157 g_print(" Line %ld column %ld - Query standalone %s\n",
2158 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
2169 * check_for_misspaced_punctuation:
2171 * Look for added or missing spaces around punctuation and quotes.
2172 * If there is a punctuation character like ! with no space on
2173 * either side, suspect a missing!space. If there are spaces on
2174 * both sides , assume a typo. If we see a double quote with no
2175 * space or punctuation on either side of it, assume unspaced
2176 * quotes "like"this.
/*
 * check_for_misspaced_punctuation:
 * @aline: NUL-terminated UTF-8 text of the line to check.
 * @parities: open/closed state of double and single quotes, toggled
 *            here each time a quote is seen.
 * @isemptyline: TRUE for spacer lines; suppresses the spaced
 *               punctuation queries.
 *
 * Makes several passes over the line, each looking at a character
 * together with its neighbours (pc before, c current, nc after):
 *  1. punctuation with no space where one is expected, or with space
 *     on both sides;
 *  2. ? ! , ; : preceded by a space;
 *  3. a period preceded by a space and followed by a letter;
 *  4. quotes with no space or punctuation around them;
 *  5. double-quote parity, to judge whether a quote opens or closes;
 *  6. a double quote at the very start of the line;
 *  7. with SQUOTE_SWITCH, the same parity check for single quotes.
 *
 * NOTE(review): the statements that shift pc and c at the top of each
 * pass, the character tests that select passes 3 to 5 and the code that
 * sets isacro and isellipsis are not among the statements documented
 * here - confirm against the full function.
 */
2178 void check_for_misspaced_punctuation(const char *aline,
2179 struct parities *parities,gboolean isemptyline)
2181 gboolean isacro,isellipsis;
2183 gunichar c,nc,pc,n2c;
/* Pass 1: missing space after punctuation, or space on both sides. */
2184 c=g_utf8_get_char(aline);
2185 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2186 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2190 nc=g_utf8_get_char(g_utf8_next_char(s));
2191 /* For each character in the line after the first. */
2192 if (g_utf8_strchr(".?!,;:_",-1,c)) /* if it's punctuation */
2194 /* we need to suppress warnings for acronyms like M.D. */
2196 /* we need to suppress warnings for ellipsis . . . */
2199 * If there are letters on both sides of it or
2200 * if it's strict punctuation followed by an alpha.
2202 if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
2203 g_utf8_strchr("?!,;:",-1,c)))
/* A period two characters back, or two ahead, suggests an acronym. */
2207 if (g_utf8_pointer_to_offset(aline,s)>2 &&
2208 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
2210 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
2216 if (pswit[ECHO_SWITCH])
2217 g_print("\n%s\n",aline);
2218 if (!pswit[OVERVIEW_SWITCH])
2219 g_print(" Line %ld column %ld - Missing space?\n",
2220 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2225 if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
2228 * If there are spaces on both sides,
2229 * or space before and end of line.
2233 if (g_utf8_pointer_to_offset(aline,s)>2 &&
2234 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
2236 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
2240 if (!isemptyline && !isellipsis)
2242 if (pswit[ECHO_SWITCH])
2243 g_print("\n%s\n",aline);
2244 if (!pswit[OVERVIEW_SWITCH])
2245 g_print(" Line %ld column %ld - "
2246 "Spaced punctuation?\n",linecnt,
2247 g_utf8_pointer_to_offset(aline,s)+1);
2254 /* Split out the characters that CANNOT be preceded by space. */
2255 c=g_utf8_get_char(aline);
2256 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2257 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2261 nc=g_utf8_get_char(g_utf8_next_char(s));
2262 /* for each character in the line after the first */
2263 if (g_utf8_strchr("?!,;:",-1,c))
2265 /* if it's punctuation that _cannot_ have a space before it */
2266 if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
2269 * If nc DOES == space,
2270 * it was already reported just above.
2272 if (pswit[ECHO_SWITCH])
2273 g_print("\n%s\n",aline);
2274 if (!pswit[OVERVIEW_SWITCH])
2275 g_print(" Line %ld column %ld - Spaced punctuation?\n",
2276 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2283 * Special case " .X" where X is any alpha.
2284 * This plugs a hole in the acronym code above.
2285 * Inelegant, but maintainable.
2287 c=g_utf8_get_char(aline);
2288 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2289 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2293 nc=g_utf8_get_char(g_utf8_next_char(s));
2294 /* for each character in the line after the first */
2297 /* if it's a period */
2298 if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
2301 * If the period follows a space and
2302 * is followed by a letter.
2304 if (pswit[ECHO_SWITCH])
2305 g_print("\n%s\n",aline);
2306 if (!pswit[OVERVIEW_SWITCH])
2307 g_print(" Line %ld column %ld - Spaced punctuation?\n",
2308 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Pass 4: quotes jammed between other characters. */
2314 c=g_utf8_get_char(aline);
2315 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2316 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2320 nc=g_utf8_get_char(g_utf8_next_char(s));
2321 /* for each character in the line after the first */
2324 if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
2325 !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
2326 !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
2328 if (pswit[ECHO_SWITCH])
2329 g_print("\n%s\n",aline);
2330 if (!pswit[OVERVIEW_SWITCH])
2331 g_print(" Line %ld column %ld - Unspaced quotes?\n",
2332 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2338 /* Check parity of quotes. */
2339 nc=g_utf8_get_char(aline);
2340 for (s=aline;*s;s=g_utf8_next_char(s))
2343 nc=g_utf8_get_char(g_utf8_next_char(s));
/* Each double quote flips the parity; the state after the flip selects which test applies. */
2346 parities->dquote=!parities->dquote;
2347 if (!parities->dquote)
2350 if (!g_utf8_strchr("_-.'`/,;:!?)]} ",-1,nc))
2352 if (pswit[ECHO_SWITCH])
2353 g_print("\n%s\n",aline);
2354 if (!pswit[OVERVIEW_SWITCH])
2355 g_print(" Line %ld column %ld - "
2356 "Wrongspaced quotes?\n",
2357 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/*
 * nc is a gunichar, so the digit test must be g_unichar_isdigit(); the
 * <ctype.h> isdigit() is only defined for unsigned char values and EOF.
 */
2365 if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
2366 !g_utf8_strchr("_-/.'`([{$",-1,nc) || !nc)
2368 if (pswit[ECHO_SWITCH])
2369 g_print("\n%s\n",aline);
2370 if (!pswit[OVERVIEW_SWITCH])
2371 g_print(" Line %ld column %ld - "
2372 "Wrongspaced quotes?\n",
2373 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Pass 6: a double quote in column 1 followed by closing punctuation or a space. */
2380 if (g_utf8_get_char(aline)==CHAR_DQUOTE)
2382 if (g_utf8_strchr(",;:!?)]} ",-1,
2383 g_utf8_get_char(g_utf8_next_char(aline))))
2385 if (pswit[ECHO_SWITCH])
2386 g_print("\n%s\n",aline);
2387 if (!pswit[OVERVIEW_SWITCH])
2388 g_print(" Line %ld column 1 - Wrongspaced quotes?\n",
2394 if (pswit[SQUOTE_SWITCH])
2396 nc=g_utf8_get_char(aline);
2397 for (s=aline;*s;s=g_utf8_next_char(s))
2400 nc=g_utf8_get_char(g_utf8_next_char(s));
/* Only single quotes that are not flanked by letters on both sides count as quotes. */
2401 if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&
2402 !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
2403 !g_unichar_isalpha(nc)))
2405 parities->squote=!parities->squote;
2406 if (!parities->squote)
2409 if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
2411 if (pswit[ECHO_SWITCH])
2412 g_print("\n%s\n",aline);
2413 if (!pswit[OVERVIEW_SWITCH])
2414 g_print(" Line %ld column %ld - "
2415 "Wrongspaced singlequotes?\n",
2416 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Same gunichar-safe digit test as in the double-quote pass. */
2424 if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
2425 !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
2427 if (pswit[ECHO_SWITCH])
2428 g_print("\n%s\n",aline);
2429 if (!pswit[OVERVIEW_SWITCH])
2430 g_print(" Line %ld column %ld - "
2431 "Wrongspaced singlequotes?\n",
2432 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2443 * check_for_double_punctuation:
2445 * Look for double punctuation like ,. or ,,
2446 * Thanks to DW for the suggestion!
2447 * In books with references, ".," and ".;" are common
2448 * e.g. "etc., etc.," and vol. 1.; vol 3.;
2449 * OTOH, from my initial tests, there are also fairly
2450 * common errors. What to do? Make these cases paranoid?
2451 * ".," is the most common, so warnings->dotcomma is used
2452 * to suppress detailed reporting if it occurs often.
/*
 * check_for_double_punctuation:
 * @aline: NUL-terminated UTF-8 text of the line to check.
 * @warnings: per-file switches; dotcomma and isFrench are read here.
 *
 * Queries any pair of adjacent characters that are both one of
 * . ? ! , ; : with these exceptions: a doubled . ? or ! is accepted,
 * "., " style pairs are accepted when warnings->dotcomma is clear, and
 * for French texts an ellipsis directly before or after , ; : ! ? is
 * accepted.
 *
 * NOTE(review): the statement that advances c at the top of the loop is
 * not among the statements documented here, and the second French test
 * presumably steps s past the ellipsis - confirm.
 */
2454 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
2458 nc=g_utf8_get_char(aline);
2459 for (s=aline;*s;s=g_utf8_next_char(s))
2462 nc=g_utf8_get_char(g_utf8_next_char(s));
2463 /* for each punctuation character in the line */
2464 if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
2465 g_utf8_strchr(".?!,;:",-1,nc))
2467 /* followed by punctuation, it's a query, unless . . . */
2468 if (c==nc && (c=='.' || c=='?' || c=='!') ||
2469 !warnings->dotcomma && c=='.' && nc==',' ||
2470 warnings->isFrench && g_str_has_prefix(s,",...") ||
2471 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2472 warnings->isFrench && g_str_has_prefix(s,";...") ||
2473 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2474 warnings->isFrench && g_str_has_prefix(s,":...") ||
2475 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2476 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2477 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2478 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2479 warnings->isFrench && g_str_has_prefix(s,"...?"))
/* The French ellipsis cases are tested again on their own. */
2481 if (warnings->isFrench && g_str_has_prefix(s,",...") ||
2482 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2483 warnings->isFrench && g_str_has_prefix(s,";...") ||
2484 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2485 warnings->isFrench && g_str_has_prefix(s,":...") ||
2486 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2487 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2488 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2489 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2490 warnings->isFrench && g_str_has_prefix(s,"...?"))
2493 nc=g_utf8_get_char(g_utf8_next_char(s));
2495 ; /* do nothing for .. !! and ?? which can be legit */
2499 if (pswit[ECHO_SWITCH])
2500 g_print("\n%s\n",aline);
2501 if (!pswit[OVERVIEW_SWITCH])
2502 g_print(" Line %ld column %ld - Double punctuation?\n",
2503 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2512 * check_for_spaced_quotes:
/*
 * check_for_spaced_quotes:
 * @aline: NUL-terminated UTF-8 text of the line to check.
 *
 * Queries every double quote that has a space on both sides, then does
 * the same for each kind of single quote listed in single_quotes[],
 * building the " <quote> " search pattern for each in turn.
 *
 * NOTE(review): the initialisation of s before each search loop is not
 * among the statements documented here - presumably it is reset to
 * aline for every pattern; confirm.
 */
2514 void check_for_spaced_quotes(const char *aline)
2518 const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,
2522 while ((t=strstr(s," \" ")))
2524 if (pswit[ECHO_SWITCH])
2525 g_print("\n%s\n",aline);
2526 if (!pswit[OVERVIEW_SWITCH])
2527 g_print(" Line %ld column %ld - Spaced doublequote?\n",
2528 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
/* Resume two characters on, at the space that follows the quote. */
2531 s=g_utf8_next_char(g_utf8_next_char(t));
/* pattern is rebuilt as space, quote, space for every entry in single_quotes[]. */
2533 pattern=g_string_new(NULL);
2534 for(i=0;i<G_N_ELEMENTS(single_quotes);i++)
2536 g_string_assign(pattern," ");
2537 g_string_append_unichar(pattern,single_quotes[i]);
2538 g_string_append_c(pattern,' ');
2540 while ((t=strstr(s,pattern->str)))
2542 if (pswit[ECHO_SWITCH])
2543 g_print("\n%s\n",aline);
2544 if (!pswit[OVERVIEW_SWITCH])
2545 g_print(" Line %ld column %ld - Spaced singlequote?\n",
2546 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2549 s=g_utf8_next_char(g_utf8_next_char(t));
/* Release the pattern buffer together with its character data. */
2552 g_string_free(pattern,TRUE);
2556 * check_for_miscased_genative:
2558 * Check special case of 'S instead of 's at end of word.
/*
 * check_for_miscased_genative:
 * @aline: NUL-terminated UTF-8 text of the line to check.
 *
 * Queries an apostrophe that follows a lower-case letter and is
 * followed by a capital S.  The column reported (offset of the
 * apostrophe plus 2) is that of the S.
 *
 * NOTE(review): the statements that shift pc and c at the top of the
 * loop are not among the statements documented here - confirm.
 */
2560 void check_for_miscased_genative(const char *aline)
2566 c=g_utf8_get_char(aline);
2567 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2568 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2572 nc=g_utf8_get_char(g_utf8_next_char(s));
2573 if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))
2575 if (pswit[ECHO_SWITCH])
2576 g_print("\n%s\n",aline);
2577 if (!pswit[OVERVIEW_SWITCH])
2578 g_print(" Line %ld column %ld - Capital \"S\"?\n",
2579 linecnt,g_utf8_pointer_to_offset(aline,s)+2);
2587 * check_end_of_line:
2589 * Now check special cases - start and end of line -
2590 * for single and double quotes. Start is sometimes [sic]
2591 * but better to query it anyway.
2592 * While we're here, check for dash at end of line.
/*
 * check_end_of_line:
 * @aline: NUL-terminated UTF-8 text of the line to check.
 * @warnings: per-file switches; hyphen is read here.
 *
 * Three checks on the edges of the line:
 *  - a quote as the last character with a space before it;
 *  - a single quote as the first character with a space after it;
 *  - with PARANOID_SWITCH and warnings->hyphen, a single hyphen (not an
 *    em-dash) as the last character, ignoring trailing characters at or
 *    below CHAR_SPACE.
 *
 * NOTE(review): the hyphen report prints the offset without "+1",
 * unlike most reports in this file - confirm the intended column.
 * Whether the last two checks sit inside the length>1 test, which would
 * protect the g_utf8_prev_char() calls on very short lines, is not
 * visible from the statements documented here - confirm.
 */
2594 void check_end_of_line(const char *aline,struct warnings *warnings)
2599 lbytes=strlen(aline);
2600 if (g_utf8_strlen(aline,lbytes)>1)
/* c1 is the last character of the line, c2 the one before it. */
2602 s=g_utf8_prev_char(aline+lbytes);
2603 c1=g_utf8_get_char(s);
2604 c2=g_utf8_get_char(g_utf8_prev_char(s));
2605 if ((c1==CHAR_DQUOTE || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)
2607 if (pswit[ECHO_SWITCH])
2608 g_print("\n%s\n",aline);
2609 if (!pswit[OVERVIEW_SWITCH])
2610 g_print(" Line %ld column %ld - Spaced quote?\n",linecnt,
2611 g_utf8_strlen(aline,lbytes));
/* Now c1 and c2 are the first two characters of the line. */
2615 c1=g_utf8_get_char(aline);
2616 c2=g_utf8_get_char(g_utf8_next_char(aline));
2617 if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)
2619 if (pswit[ECHO_SWITCH])
2620 g_print("\n%s\n",aline);
2621 if (!pswit[OVERVIEW_SWITCH])
2622 g_print(" Line %ld column 1 - Spaced quote?\n",linecnt);
2627 * Dash at end of line may well be legit - paranoid mode only
2628 * and don't report em-dash at line-end.
2630 if (pswit[PARANOID_SWITCH] && warnings->hyphen)
/* Walk back from the end over characters at or below CHAR_SPACE. */
2632 for (s=g_utf8_prev_char(aline+lbytes);
2633 s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
2635 if (g_utf8_get_char(s)=='-' &&
2636 g_utf8_get_char(g_utf8_prev_char(s))!='-')
2638 if (pswit[ECHO_SWITCH])
2639 g_print("\n%s\n",aline);
2640 if (!pswit[OVERVIEW_SWITCH])
2641 g_print(" Line %ld column %ld - "
2642 "Hyphen at end of line?\n",
2643 linecnt,g_utf8_pointer_to_offset(aline,s));
2650 * check_for_unspaced_bracket:
2652 * Brackets are often unspaced, but shouldn't be surrounded by alpha.
2653 * If so, suspect a scanno like "a]most".
/*
 * Scans aline (UTF-8) and queries any of the bracket characters
 * {[()]} for which pc and nc are both alphabetic.
 * NOTE(review): the statements that set pc and advance c are not
 * visible here; presumably pc/c/nc are the previous/current/next
 * characters - confirm.
 */
2655 void check_for_unspaced_bracket(const char *aline)
2659 c=g_utf8_get_char(aline);
/* The look-ahead is 0 for an empty line, so the loop below never runs. */
2660 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2661 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2665 nc=g_utf8_get_char(g_utf8_next_char(s));
2668 /* for each bracket character in the line except 1st & last */
2669 if (g_utf8_strchr("{[()]}",-1,c) &&
2670 g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
2672 if (pswit[ECHO_SWITCH])
2673 g_print("\n%s\n",aline);
2674 if (!pswit[OVERVIEW_SWITCH])
2675 g_print(" Line %ld column %ld - Unspaced bracket?\n",
2676 linecnt,g_utf8_pointer_to_offset(aline,s));
2684 * check_for_unpunctuated_endquote:
/*
 * Scans aline (UTF-8) and queries a double quote for which pc is an
 * alphabetic character, i.e. a quote with no punctuation before it.
 * NOTE(review): the statements that set pc and advance c are not
 * visible here; presumably pc/c/nc are the previous/current/next
 * characters - confirm.
 */
2686 void check_for_unpunctuated_endquote(const char *aline)
2690 c=g_utf8_get_char(aline);
/* The look-ahead is 0 for an empty line, so the loop below never runs. */
2691 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2692 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2696 nc=g_utf8_get_char(g_utf8_next_char(s));
2697 /* for each character in the line except 1st */
/*
 * The characters here are Unicode code points, so classify them with
 * GLib as the sibling checks do; isalpha() from <ctype.h> is only
 * defined for unsigned char values and EOF.
 */
2698 if (c==CHAR_DQUOTE && g_unichar_isalpha(pc))
2700 if (pswit[ECHO_SWITCH])
2701 g_print("\n%s\n",aline);
2702 if (!pswit[OVERVIEW_SWITCH])
2703 g_print(" Line %ld column %ld - "
2704 "endquote missing punctuation?\n",
2705 linecnt,g_utf8_pointer_to_offset(aline,s));
2713 * check_for_html_tag:
2715 * Check for <HTML TAG>.
2717 * If there is a < in the line, followed at some point
2718 * by a > then we suspect HTML.
/*
 * Looks for the first < in aline and a > somewhere after it, and
 * queries the text from < through > as a possible HTML tag.
 * NOTE(review): the tests on open and close and the release of tag
 * are not visible here - confirm.
 */
2720 void check_for_html_tag(const char *aline)
2722 const char *open,*close;
2724 open=strchr(aline,'<');
/* Search for the closing > only after the opening <. */
2727 close=strchr(g_utf8_next_char(open),'>');
2730 if (pswit[ECHO_SWITCH])
2731 g_print("\n%s\n",aline);
2732 if (!pswit[OVERVIEW_SWITCH])
/* Copy the span from < through > inclusive for the report. */
2734 tag=g_strndup(open,close-open+1);
2735 g_print(" Line %ld column %ld - HTML Tag? %s \n",
2736 linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
2746 * check_for_html_entity:
2748 * Check for &symbol; HTML.
2750 * If there is a & in the line, followed at
2751 * some point by a ; then we suspect HTML.
/*
 * Looks for the first & in aline and a ; somewhere after it, and
 * queries the text from & through ; as a possible HTML entity unless
 * a space lies between them.
 * NOTE(review): the tests on amp and scolon and the release of entity
 * are not visible here - confirm.
 */
2753 void check_for_html_entity(const char *aline)
2755 const char *s,*amp,*scolon;
2757 amp=strchr(aline,'&');
2760 scolon=strchr(amp,';');
2763 for (s=amp;s<scolon;s=g_utf8_next_char(s))
2764 if (g_utf8_get_char(s)==CHAR_SPACE)
2765 break; /* Don't report "Jones & Son;" */
2768 if (pswit[ECHO_SWITCH])
2769 g_print("\n%s\n",aline);
2770 if (!pswit[OVERVIEW_SWITCH])
/* Copy the span from & through ; inclusive for the report. */
2772 entity=g_strndup(amp,scolon-amp+1);
/*
 * Report the column in characters, like the other checks, rather than
 * as a byte offset, which is wrong after any multi-byte character.
 */
2773 g_print(" Line %ld column %ld - HTML symbol? %s \n",
2774 linecnt,g_utf8_pointer_to_offset(aline,amp)+1,entity);
2785 * check_for_omitted_punctuation:
2787 * Check for omitted punctuation at end of paragraph by working back
2788 * through prevline. DW.
2789 * Need to check this only for "normal" paras.
2790 * So what is a "normal" para?
2791 * Not normal if one-liner (chapter headings, etc.)
2792 * Not normal if doesn't contain at least one locase letter
2793 * Not normal if starts with space
/*
 * Queries a paragraph whose last line, prevline, appears to end
 * without punctuation.  The check only runs when prevline contains a
 * letter, last->blen is greater than 2, start_para_line is less than
 * linecnt-1 and prevline starts with a character above CHAR_SPACE.
 * It then steps back over trailing closing quotes and walks towards
 * the start of prevline; meeting a letter produces the query, which
 * is reported against line linecnt-1.
 * NOTE(review): the action taken when one of the characters in
 * -.:!([{?}]) is met is not visible here; presumably it ends the
 * walk without a query - confirm.
 */
2795 void check_for_omitted_punctuation(const char *prevline,
2796 struct line_properties *last,int start_para_line)
2798 gboolean letter_on_line=FALSE;
2801 for (s=prevline;*s;s=g_utf8_next_char(s))
2802 if (g_unichar_isalpha(g_utf8_get_char(s)))
2804 letter_on_line=TRUE;
2808 * This next "if" is a problem.
2809 * If we say "start_para_line <= linecnt - 1", that includes
2810 * one-line "paragraphs" like chapter heads. Lotsa false positives.
2811 * If we say "start_para_line < linecnt - 1" it doesn't, but then it
2812 * misses genuine one-line paragraphs.
2814 if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
2815 g_utf8_get_char(prevline)>CHAR_SPACE)
2817 s=prevline+strlen(prevline);
2820 s=g_utf8_prev_char(s);
2821 c=g_utf8_get_char(s);
2822 } while (CHAR_IS_CLOSING_QUOTE(c) && c>CHAR_SPACE && s>prevline);
2823 for (;s>prevline;s=g_utf8_prev_char(s))
2825 if (g_unichar_isalpha(g_utf8_get_char(s)))
2827 if (pswit[ECHO_SWITCH])
2828 g_print("\n%s\n",prevline);
2829 if (!pswit[OVERVIEW_SWITCH])
2830 g_print(" Line %ld column %ld - "
2831 "No punctuation at para end?\n",
2832 linecnt-1,g_utf8_strlen(prevline,-1));
2837 if (g_utf8_strchr("-.:!([{?}])",-1,g_utf8_get_char(s)))
/*
 * Traversal callback handed to g_tree_foreach() over the qword tree by
 * procfile().  key is the queried word; it is printed together with a
 * count in a note about duplicated queries.
 * NOTE(review): the remaining g_print arguments and the return value
 * are not visible here; presumably value carries the count - confirm.
 */
2843 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
2845 const char *word=key;
2848 g_print("\nNote: Queried word %s was duplicated %d times\n",
/*
 * Print routine that converts string from UTF-8 to WINDOWS-1252 using
 * a converter kept in a static variable between calls.
 * NOTE(review): the conditions guarding the close-and-reset path and
 * the final fputs() are not visible here.  procfile() calls this with
 * NULL after its main pass, so presumably NULL means release the
 * converter, and the plain fputs() is the fallback when no converter
 * could be opened - confirm.
 */
2853 void print_as_windows_1252(const char *string)
2855 gsize inbytes,outbytes;
2857 static GIConv converter=(GIConv)-1;
2860 if (converter!=(GIConv)-1)
2861 g_iconv_close(converter);
2862 converter=(GIConv)-1;
/* Open the converter on first use; (GIConv)-1 marks it as not open. */
2865 if (converter==(GIConv)-1)
2866 converter=g_iconv_open("WINDOWS-1252","UTF-8");
2867 if (converter!=(GIConv)-1)
/* The output buffer is sized to the input byte length plus one. */
2869 inbytes=outbytes=strlen(string);
2870 bp=buf=g_malloc(outbytes+1);
2871 g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
2877 fputs(string,stdout);
/* Print routine that writes string to stdout unchanged. */
2880 void print_as_utf_8(const char *string)
2882 fputs(string,stdout);
/*
 * Checks one file.  The text is loaded with read_etext(), summarised
 * by first_pass() and report_first_pass(), and then read line by line
 * with flgets().  Lines before first_pass_results->firstline or after
 * a positive footerline are skipped, apart from optionally echoing
 * some header fields; every other line is handed to the individual
 * check_for_* routines.  After the loop the duplicate queries are
 * reported and the per-file trees and counters are released.
 * filename is the path handed to read_etext() and used in messages.
 */
2890 void procfile(const char *filename)
2893 gchar *parastart=NULL; /* first line of current para */
2894 gchar *etext,*aline;
2897 struct first_pass_results *first_pass_results;
2898 struct warnings *warnings;
2899 struct counters counters={0};
2900 struct line_properties last={0};
2901 struct parities parities={0};
2902 struct pending pending={0};
2903 gboolean isemptyline;
2904 long start_para_line=0;
2905 gboolean isnewpara=FALSE,enddash=FALSE;
2906 last.start=CHAR_SPACE;
2907 linecnt=checked_linecnt=0;
2908 etext=read_etext(filename,&err);
2911 if (pswit[STDOUT_SWITCH])
2912 fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
2914 fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
2917 g_print("\n\nFile: %s\n\n",filename);
2918 first_pass_results=first_pass(etext);
2919 warnings=report_first_pass(first_pass_results);
2920 qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
2921 qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
2923 * Here we go with the main pass. Hold onto yer hat!
2927 while ((aline=flgets(&etext_ptr,linecnt+1)))
2932 if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
2933 continue; // skip DP page separators completely
2934 if (linecnt<first_pass_results->firstline ||
2935 (first_pass_results->footerline>0 &&
2936 linecnt>first_pass_results->footerline))
2938 if (pswit[HEADER_SWITCH])
2940 if (g_str_has_prefix(aline,"Title:"))
2941 g_print(" %s\n",aline);
2942 if (g_str_has_prefix(aline,"Author:"))
2943 g_print(" %s\n",aline);
2944 if (g_str_has_prefix(aline,"Release Date:"))
2945 g_print(" %s\n",aline);
2946 if (g_str_has_prefix(aline,"Edition:"))
2947 g_print(" %s\n\n",aline);
2949 continue; /* skip through the header */
2952 print_pending(aline,parastart,&pending);
2953 isemptyline=analyse_quotes(aline,&counters);
2954 if (isnewpara && !isemptyline)
2956 /* This line is the start of a new paragraph. */
2957 start_para_line=linecnt;
2958 /* Capture its first line in case we want to report it later. */
2960 parastart=g_strdup(aline);
2961 memset(&parities,0,sizeof(parities)); /* restart the quote count */
2963 while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
2964 !g_unichar_isdigit(g_utf8_get_char(s)))
2965 s=g_utf8_next_char(s);
2966 if (g_unichar_islower(g_utf8_get_char(s)))
2968 /* and its first letter is lowercase */
2969 if (pswit[ECHO_SWITCH])
2970 g_print("\n%s\n",aline);
2971 if (!pswit[OVERVIEW_SWITCH])
2972 g_print(" Line %ld column %ld - "
2973 "Paragraph starts with lower-case\n",
2974 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2978 isnewpara=FALSE; /* Signal the end of new para processing. */
2980 /* Check for an em-dash broken at line end. */
2981 if (enddash && g_utf8_get_char(aline)=='-')
2983 if (pswit[ECHO_SWITCH])
2984 g_print("\n%s\n",aline);
2985 if (!pswit[OVERVIEW_SWITCH])
2986 g_print(" Line %ld column 1 - Broken em-dash?\n",linecnt);
/*
 * Find the last non-space character.  Start at aline itself for an
 * empty line, and test the bound before reading through s, so that
 * nothing in front of aline is ever examined.
 */
2991 for (s=*aline?g_utf8_prev_char(aline+strlen(aline)):aline;
2992 s>aline && g_utf8_get_char(s)==' ';s=g_utf8_prev_char(s))
2994 if (s>=aline && g_utf8_get_char(s)=='-')
2996 check_for_control_characters(aline);
2997 check_for_odd_characters(aline,warnings,isemptyline);
2998 if (warnings->longline)
2999 check_for_long_line(aline);
3000 if (warnings->shortline)
3001 check_for_short_line(aline,&last);
3003 last.len=g_utf8_strlen(aline,-1);
3004 last.start=g_utf8_get_char(aline);
3005 check_for_starting_punctuation(aline);
3008 check_for_spaced_emdash(aline);
3009 check_for_spaced_dash(aline);
3011 check_for_unmarked_paragraphs(aline);
3012 check_for_jeebies(aline);
3013 check_for_mta_from(aline);
3014 check_for_orphan_character(aline);
3015 check_for_pling_scanno(aline);
3016 check_for_extra_period(aline,warnings);
3017 check_for_following_punctuation(aline);
3018 check_for_typos(aline,warnings);
3019 check_for_misspaced_punctuation(aline,&parities,isemptyline);
3020 check_for_double_punctuation(aline,warnings);
3021 check_for_spaced_quotes(aline);
3022 check_for_miscased_genative(aline);
3023 check_end_of_line(aline,warnings);
3024 check_for_unspaced_bracket(aline);
3025 if (warnings->endquote)
3026 check_for_unpunctuated_endquote(aline);
3027 check_for_html_tag(aline);
3028 check_for_html_entity(aline);
3031 check_for_mismatched_quotes(&counters,&pending);
3032 counters_reset(&counters);
3033 /* let the next iteration know that it's starting a new para */
3036 check_for_omitted_punctuation(prevline,&last,start_para_line);
3039 prevline=g_strdup(aline);
3042 check_for_mismatched_quotes(&counters,&pending);
3043 print_pending(NULL,parastart,&pending);
3044 reset_pending(&pending);
3053 if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])
3054 g_tree_foreach(qword,report_duplicate_queries,NULL);
3055 g_tree_unref(qword);
3056 g_tree_unref(qperiod);
3057 counters_destroy(&counters);
3058 g_set_print_handler(NULL);
3059 print_as_windows_1252(NULL);
3060 if (pswit[MARKUP_SWITCH])
3067 * Get one line from the input text, checking for
3068 * the existence of exactly one CR/LF line-end per line.
3070 * Returns: a pointer to the line.
/*
 * Reads one line starting at *etext, advancing *etext one UTF-8
 * character at a time.  lcnt is the line number used in messages.
 * When pswit[LINE_END_SWITCH] is set, irregular line ends are queried:
 * a LF without a CR, two successive CRs, and a CR without a LF.
 * The line is finally passed through postprocess_for_HTML() and/or
 * postprocess_for_DP() when the matching switches are set.
 * NOTE(review): the loop structure, the tests on c and the return
 * statements are not visible here - confirm the details.
 */
3072 char *flgets(char **etext,long lcnt)
3075 gboolean isCR=FALSE;
3076 char *theline=*etext;
/* Fetch the next character and step the caller's position past it. */
3081 c=g_utf8_get_char(*etext);
3082 *etext=g_utf8_next_char(*etext);
3085 /* either way, it's end of line */
3092 /* Error - a LF without a preceding CR */
3093 if (pswit[LINE_END_SWITCH])
3095 if (pswit[ECHO_SWITCH])
/* Echo a copy of the text from theline up to eos. */
3097 s=g_strndup(theline,eos-theline);
3098 g_print("\n%s\n",s);
3101 if (!pswit[OVERVIEW_SWITCH])
3102 g_print(" Line %ld - No CR?\n",lcnt);
3113 /* Error - two successive CRs */
3114 if (pswit[LINE_END_SWITCH])
3116 if (pswit[ECHO_SWITCH])
3118 s=g_strndup(theline,eos-theline);
3119 g_print("\n%s\n",s);
3122 if (!pswit[OVERVIEW_SWITCH])
3123 g_print(" Line %ld - Two successive CRs?\n",lcnt);
3132 if (pswit[LINE_END_SWITCH] && isCR)
3134 if (pswit[ECHO_SWITCH])
3136 s=g_strndup(theline,eos-theline);
3137 g_print("\n%s\n",s);
3140 if (!pswit[OVERVIEW_SWITCH])
3141 g_print(" Line %ld column %ld - CR without LF?\n",
3142 lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
3148 eos=g_utf8_next_char(eos);
/* Strip markup in place before the line is handed to the caller. */
3152 if (pswit[MARKUP_SWITCH])
3153 postprocess_for_HTML(theline);
3154 if (pswit[DP_SWITCH])
3155 postprocess_for_DP(theline);
3162 * Takes a "word" as a parameter, and checks whether it
3163 * contains a mixture of alpha and digits. Generally, this is an
3164 * error, but may not be for cases like 4th or L5 12s. 3d.
3166 * Returns: TRUE iff an error is found.
/*
 * Decides whether checkword mixes letters and digits in a way worth
 * querying.  The word is first scanned for alphabetic and digit
 * characters.  If both occur, the text after the leading digits is
 * compared with a list of accepted suffixes: st, rd, nd, th and their
 * -s and -ly forms in either case, l in either case, and lower-case
 * s and d.  A word starting with capital L is also tested by calling
 * mixdigit() on the remainder.
 * NOTE(review): the assignments to the three flags and the return
 * statement are not visible here; presumably query is set when both
 * kinds occur, cleared by each accepted case, and returned - confirm.
 */
3168 gboolean mixdigit(const char *checkword)
3170 gboolean wehaveadigit,wehavealetter,query;
3171 const char *s,*nondigit;
3172 wehaveadigit=wehavealetter=query=FALSE;
3173 for (s=checkword;*s;s=g_utf8_next_char(s))
3174 if (g_unichar_isalpha(g_utf8_get_char(s)))
3176 else if (g_unichar_isdigit(g_utf8_get_char(s)))
3178 if (wehaveadigit && wehavealetter)
3180 /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
/* Step nondigit past the leading digits to the start of the suffix. */
3182 for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
3183 nondigit=g_utf8_next_char(nondigit))
3185 /* digits, ending in st, rd, nd, th of either case */
3186 if (!g_ascii_strcasecmp(nondigit,"st") ||
3187 !g_ascii_strcasecmp(nondigit,"rd") ||
3188 !g_ascii_strcasecmp(nondigit,"nd") ||
3189 !g_ascii_strcasecmp(nondigit,"th"))
3191 if (!g_ascii_strcasecmp(nondigit,"sts") ||
3192 !g_ascii_strcasecmp(nondigit,"rds") ||
3193 !g_ascii_strcasecmp(nondigit,"nds") ||
3194 !g_ascii_strcasecmp(nondigit,"ths"))
3196 if (!g_ascii_strcasecmp(nondigit,"stly") ||
3197 !g_ascii_strcasecmp(nondigit,"rdly") ||
3198 !g_ascii_strcasecmp(nondigit,"ndly") ||
3199 !g_ascii_strcasecmp(nondigit,"thly"))
3201 /* digits, ending in l, L, s or d */
3202 if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
3203 !strcmp(nondigit,"d"))
3206 * L at the start of a number, representing Britsh pounds, like L500.
3207 * This is cute. We know the current word is mixed digit. If the first
3208 * letter is L, there must be at least one digit following. If both
3209 * digits and letters follow, we have a genuine error, else we have a
3210 * capital L followed by digits, and we accept that as a non-error.
3212 if (g_utf8_get_char(checkword)=='L' &&
3213 !mixdigit(g_utf8_next_char(checkword)))
3222 * Extracts the first/next "word" from the line, and returns it.
3223 * A word is defined as one English word unit--or at least that's the aim.
3224 * "ptr" is advanced to the position in the line where we will start
3225 * looking for the next word.
3227 * Returns: A newly-allocated string.
/*
 * Extracts the next word from the text at *ptr into a newly allocated
 * string and advances *ptr.  Characters that are neither digits nor
 * letters are skipped first.  A look-ahead then collects digits,
 * letters, commas and periods and searches the result for a comma or
 * period that directly follows a digit; on that path the collected
 * text is returned.  Otherwise the word is rebuilt from digits,
 * letters and apostrophes only.
 * NOTE(review): the initialisation of s and the update of *ptr on the
 * look-ahead path are not visible here - confirm.
 */
3229 gchar *getaword(const char **ptr)
3234 word=g_string_new(NULL);
/* Skip anything that cannot start a word, stopping at the terminator. */
3235 for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
3236 !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
3237 **ptr;*ptr=g_utf8_next_char(*ptr))
3240 * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
3241 * Especially yucky is the case of L1,000
3242 * This section looks for a pattern of characters including a digit
3243 * followed by a comma or period followed by one or more digits.
3244 * If found, it returns this whole pattern as a word; otherwise we discard
3245 * the results and resume our normal programming.
3248 for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
3249 g_unichar_isalpha(g_utf8_get_char(s)) ||
3250 g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
3251 g_string_append_unichar(word,g_utf8_get_char(s));
3254 for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))
3256 c=g_utf8_get_char(t);
3257 pc=g_utf8_get_char(g_utf8_prev_char(t));
3258 if ((c=='.' || c==',') && g_unichar_isdigit(pc))
3261 return g_string_free(word,FALSE);
3265 /* we didn't find a punctuated number - do the regular getword thing */
3266 g_string_truncate(word,0);
3267 c=g_utf8_get_char(*ptr);
3268 for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);
3269 *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))
3270 g_string_append_unichar(word,c);
3271 return g_string_free(word,FALSE);
3277 * Is this word a Roman Numeral?
3279 * It doesn't actually validate that the number is a valid Roman Numeral--for
3280 * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
3281 * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
3282 * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
3283 * expressions thereof, except when it came to taxes. Allow any number of M,
3284 * an optional D, an optional CM or CD, any number of optional Cs, an optional
3285 * XL or an optional XC, an optional IX or IV, an optional V and any number
/*
 * Tests t against the Roman numeral letters in a fixed order:
 * m, d, cm, cd, c, xl, xc, l, x, ix, iv, v and finally i.
 * Only lower-case letters are compared.
 * NOTE(review): the statements that advance t after each match and
 * the return statement are not visible here; presumably the result is
 * whether the whole string was consumed, and callers pass lower-case
 * text - confirm.
 */
3288 gboolean isroman(const char *t)
3294 while (g_utf8_get_char(t)=='m' && *t)
3296 if (g_utf8_get_char(t)=='d')
3298 if (g_str_has_prefix(t,"cm"))
3300 if (g_str_has_prefix(t,"cd"))
3302 while (g_utf8_get_char(t)=='c' && *t)
3304 if (g_str_has_prefix(t,"xl"))
3306 if (g_str_has_prefix(t,"xc"))
3308 if (g_utf8_get_char(t)=='l')
3310 while (g_utf8_get_char(t)=='x' && *t)
3312 if (g_str_has_prefix(t,"ix"))
3314 if (g_str_has_prefix(t,"iv"))
3316 if (g_utf8_get_char(t)=='v')
3318 while (g_utf8_get_char(t)=='i' && *t)
3324 * postprocess_for_DP:
3326 * Invoked with the -d switch from flgets().
3327 * It simply "removes" from the line a hard-coded set of common
3328 * DP-specific tags, so that the line passed to the main routine has
3329 * been pre-cleaned of DP markup.
/*
 * Removes every occurrence of each DPmarkup[] string from theline,
 * editing the buffer in place.  The table is walked until an entry
 * that is an empty string.
 */
3331 void postprocess_for_DP(char *theline)
3337 for (i=0;*DPmarkup[i];i++)
3338 while ((s=strstr(theline,DPmarkup[i])))
/* Close the gap by moving the tail, terminator included, over the tag. */
3340 t=s+strlen(DPmarkup[i]);
3341 memmove(s,t,strlen(t)+1);
3346 * postprocess_for_HTML:
3348 * Invoked with the -m switch from flgets().
3349 * It simply "removes" from the line a hard-coded set of common
3350 * HTML tags and "replaces" a hard-coded set of common HTML
3351 * entities, so that the line passed to the main routine has
3352 * been pre-cleaned of HTML.
/*
 * Cleans theline of HTML: losemarkup() is called repeatedly for as
 * long as it returns a non-null result, then loseentities() is called
 * once on the same buffer.
 * NOTE(review): the loop body is not visible here - confirm.
 */
3354 void postprocess_for_HTML(char *theline)
3356 while (losemarkup(theline))
3358 loseentities(theline);
/*
 * Looks for the first < in theline and a > after it, then compares
 * the text following the < with each markup[] entry using tagcomp().
 * A match is removed in place, up to and including the >.
 * NOTE(review): the return values are not visible here; the caller
 * loops while the result is non-null, so presumably a pointer is
 * returned after a removal and NULL otherwise - confirm.
 */
3361 char *losemarkup(char *theline)
3365 s=strchr(theline,'<');
3366 t=s?strchr(s,'>'):NULL;
/* The markup[] table is walked until an entry that is an empty string. */
3369 for (i=0;*markup[i];i++)
3370 if (tagcomp(g_utf8_next_char(s),markup[i]))
/* Move the tail, terminator included, over the tag. */
3372 t=g_utf8_next_char(t);
3373 memmove(s,t,strlen(t)+1);
3376 /* It's an unrecognized <xxx>. */
/*
 * Replaces HTML entities in theline, editing the buffer in place.
 * Decimal and hexadecimal numeric references are parsed with
 * strtol(); anything else between & and ; is looked up by name in a
 * tree built from the HTMLentities[] table.  A code point below 128
 * or from 192 to 255 is written directly as UTF-8; other values are
 * converted through the ISO_8859-1//TRANSLIT converter and back to
 * UTF-8 before being copied in.
 * The lookup tree and both converters are kept between calls.
 * NOTE(review): the tests guarding the cleanup path and the tree
 * construction are not visible here.  Presumably a NULL theline
 * releases the cached state and the tree is only built when it does
 * not exist yet - confirm.
 */
3380 void loseentities(char *theline)
/*
 * The tree must be static like the converters below; as an automatic
 * variable it was NULL on every entry, so the g_tree_destroy() cleanup
 * could never reach a tree built by an earlier call.
 */
3387 static GTree *entities=NULL;
3388 static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
3392 g_tree_destroy(entities);
3394 if (translit!=(GIConv)-1)
3395 g_iconv_close(translit);
3396 translit=(GIConv)-1;
3397 if (to_utf8!=(GIConv)-1)
3398 g_iconv_close(to_utf8);
3406 entities=g_tree_new((GCompareFunc)strcmp);
3407 for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
3408 g_tree_insert(entities,HTMLentities[i].name,
3409 GUINT_TO_POINTER(HTMLentities[i].c));
/* Open each converter on first use; (GIConv)-1 marks it as not open. */
3411 if (translit==(GIConv)-1)
3412 translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
3413 if (to_utf8==(GIConv)-1)
3414 to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
3415 while((amp=strchr(theline,'&')))
3417 scolon=strchr(amp,';');
/* A numeric reference must consist of digits all the way to the ;. */
3422 if (amp+2+strspn(amp+2,"0123456789")==scolon)
3423 c=strtol(amp+2,NULL,10);
3424 else if (amp[2]=='x' &&
3425 amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
3426 c=strtol(amp+3,NULL,16);
/* Otherwise look up the name between & and ; in the entity tree. */
3430 s=g_strndup(amp+1,scolon-(amp+1));
3431 c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
3440 if (c<128 || (c>=192 && c<=255)) /* An ISO-8859-1 character */
3441 theline+=g_unichar_to_utf8(c,theline);
3445 nb=g_unichar_to_utf8(c,s);
3446 t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
3448 s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
3450 memcpy(theline,s,nb);
/* Move the text after the ; up, terminator included. */
3454 memmove(theline,g_utf8_next_char(scolon),
3455 strlen(g_utf8_next_char(scolon))+1);
3458 theline=g_utf8_next_char(amp);
/*
 * Compares the tag text at strin with basetag, ignoring case and an
 * optional leading slash.  Both strings are case-folded and retval is
 * set to whether the folded strin starts with the folded basetag.
 * NOTE(review): the release of s and t and the return statement are
 * not visible here; presumably retval is returned - confirm.
 */
3462 gboolean tagcomp(const char *strin,const char *basetag)
3466 if (g_utf8_get_char(strin)=='/')
3467 t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
3469 t=g_utf8_casefold(strin,-1);
3470 s=g_utf8_casefold(basetag,-1);
3471 retval=g_str_has_prefix(t,s);
3477 void proghelp(GOptionContext *context)
3480 fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
3481 fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
3482 fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
3483 fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
3484 "For details, read the file COPYING.\n",stderr);
3485 fputs("This is Free Software; "
3486 "you may redistribute it under certain conditions (GPL);\n",stderr);
3487 fputs("read the file COPYING for details.\n\n",stderr);
3488 help=g_option_context_get_help(context,TRUE,NULL);
3491 fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
3492 fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
3493 "non-ASCII\n",stderr);
3494 fputs("characters like accented letters, "
3495 "lines longer than 75 or shorter than 55,\n",stderr);
3496 fputs("unbalanced quotes or brackets, "
3497 "a variety of badly formatted punctuation, \n",stderr);
3498 fputs("HTML tags, some likely typos. "
3499 "It is NOT a substitute for human judgement.\n",stderr);