1 /*************************************************************************/
2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com> */
5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk> */
7 /* This program is free software; you can redistribute it and/or modify */
8 /* it under the terms of the GNU General Public License as published by */
9 /* the Free Software Foundation; either version 2 of the License, or */
10 /* (at your option) any later version. */
12 /* This program is distributed in the hope that it will be useful, */
13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
15 /* GNU General Public License for more details. */
17 /* You should have received a copy of the GNU General Public License */
18 /* along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 /*************************************************************************/
30 #include "bookloupe.h"
33 #include "HTMLentities.h"
35 gchar *charset; /* Or NULL for auto (ISO_8859-1/ASCII or UNICODE) */
36 GIConv charset_validator=(GIConv)-1;
42 "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
43 "nad", "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa",
44 "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
45 "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
46 "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
47 "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
48 "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
49 "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
50 "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
51 "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
52 "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
53 "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
54 "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
55 "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
56 "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
57 "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
58 "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
59 "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
60 "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
61 "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
62 "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
63 "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
64 "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
65 "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
66 "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
67 "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
68 "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
69 "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
70 "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
76 /* Common abbreviations and other OK words not to query as typos. */
78 "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
79 "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
80 "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
81 "outbid", "outbids", "frostbite", "frostbitten", ""
84 /* Common abbreviations that cause otherwise unexplained periods. */
86 "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
87 "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
91 * Two-Letter combinations that rarely if ever start words,
92 * but are common scannos or otherwise common letter combinations.
95 "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
99 * Two-Letter combinations that rarely if ever end words,
100 * but are common scannos or otherwise common letter combinations.
103 "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
104 "sw", "gr", "sl", "cl", "iy", ""
108 "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
109 "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
110 "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
111 "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
115 "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
119 "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
120 "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
121 "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
122 "during", "let", "toward", "among", ""
126 "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
127 "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
128 "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
129 "among", "those", "into", "whom", "having", "thence", ""
132 gboolean pswit[SWITNO]; /* program switches */
135 gboolean typo_compat,paranoid_compat;
/*
 * Switches accepted on the command line.  config_file_update() and
 * parse_config_file() walk this same table to map entries onto keys of the
 * "options" group of the configuration file.
 *
 * Every boolean switch stores into pswit[]; each is declared as an
 * "x" / "no-x" pair writing to the same slot, the "no-x" entry carrying
 * G_OPTION_FLAG_REVERSE.  One member of each pair is flagged
 * G_OPTION_FLAG_HIDDEN so that it is left out of the --help listing.
 * "charset" is the only string-valued entry and stores into opt_charset.
 */
137 static GOptionEntry options[]={
138 { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
139 "Ignore DP-specific markup", NULL },
140 { "no-dp", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
141 G_OPTION_ARG_NONE, pswit+DP_SWITCH,
142 "Don't ignore DP-specific markup", NULL },
143 { "echo", 0, G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
144 "Echo queried line", NULL },
145 { "no-echo", 'e', G_OPTION_FLAG_REVERSE,
146 G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
147 "Don't echo queried line", NULL },
148 { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
149 "Check single quotes", NULL },
150 { "no-squote", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
151 G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
152 "Don't check single quotes", NULL },
153 { "typo", 0, 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
154 "Check common typos", NULL },
155 { "no-typo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
156 G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
157 "Don't check common typos", NULL },
158 { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
159 "Require closure of quotes on every paragraph", NULL },
160 { "no-qpara", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
161 G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
162 "Don't require closure of quotes on every paragraph", NULL },
163 { "paranoid", 0, G_OPTION_FLAG_HIDDEN,
164 G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
165 "Enable paranoid querying of everything", NULL },
166 { "no-paranoid", 0, G_OPTION_FLAG_REVERSE,
167 G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
168 "Disable paranoid querying of everything", NULL },
169 { "line-end", 0, G_OPTION_FLAG_HIDDEN,
170 G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
171 "Enable line end checking", NULL },
172 { "no-line-end", 'l', G_OPTION_FLAG_REVERSE,
173 G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
174 "Disable line end checking", NULL },
175 { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
176 "Overview: just show counts", NULL },
177 { "no-overview", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
178 G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
179 "Show individual warnings", NULL },
180 { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
181 "Output errors to stdout instead of stderr", NULL },
182 { "no-stdout", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
183 G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
184 "Output errors to stderr instead of stdout", NULL },
185 { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
186 "Echo header fields", NULL },
187 { "no-header", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
188 G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
189 "Don't echo header fields", NULL },
190 { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
191 "Ignore markup in < >", NULL },
192 { "no-markup", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
193 G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
194 "No special handling for markup in < >", NULL },
195 { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
196 "Use file of user-defined typos", NULL },
197 { "no-usertypo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
198 G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
199 "Ignore file of user-defined typos", NULL },
200 { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
201 "Verbose - list everything", NULL },
202 { "no-verbose", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
203 G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
204 "Switch off verbose mode", NULL },
205 { "charset", 0, 0, G_OPTION_ARG_STRING, &opt_charset,
206 "Set of characters valid for this ebook", "NAME" },
211 * Options relating to configuration which make no sense from inside
212 * a configuration file.
215 static GOptionEntry config_options[]={
/* Both entries are plain flags (G_OPTION_ARG_NONE) stored in pswit[];   */
/* parse_options() registers this table alongside options[], while the   */
/* configuration-file routines iterate over options[] only.              */
216 { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
217 "Defaults for use on www upload", NULL },
218 { "dump-config", 0, 0, G_OPTION_ARG_NONE, pswit+DUMP_CONFIG_SWITCH,
219 "Dump current config settings", NULL },
/*
 * Gutcheck-compatible toggles.  Unlike the entries in options[], these do
 * not write to pswit[] directly: they set typo_compat / paranoid_compat,
 * and parse_options() flips the corresponding switches after parsing.
 */
223 static GOptionEntry compatibility_options[]={
224 { "toggle-typo", 't', 0, G_OPTION_ARG_NONE, &typo_compat,
225 "Toggle checking for common typos", NULL },
226 { "toggle-relaxed", 'x', 0, G_OPTION_ARG_NONE, &paranoid_compat,
227 "Toggle both paranoid mode and common typos", NULL },
231 long cnt_quote; /* for overview mode, count of quote queries */
232 long cnt_brack; /* for overview mode, count of brackets queries */
233 long cnt_bin; /* for overview mode, count of non-ASCII queries */
234 long cnt_odd; /* for overview mode, count of odd character queries */
235 long cnt_long; /* for overview mode, count of long line errors */
236 long cnt_short; /* for overview mode, count of short line queries */
237 long cnt_punct; /* for overview mode,
238 count of punctuation and spacing queries */
239 long cnt_dash; /* for overview mode, count of dash-related queries */
240 long cnt_word; /* for overview mode, count of word queries */
241 long cnt_html; /* for overview mode, count of html queries */
242 long cnt_lineend; /* for overview mode, count of line-end queries */
243 long cnt_spacend; /* count of lines with space at end */
244 long linecnt; /* count of total lines in the file */
245 long checked_linecnt; /* count of lines actually checked */
247 void proghelp(GOptionContext *context);
248 void procfile(const char *);
252 gboolean mixdigit(const char *);
253 gchar *getaword(const char **);
254 char *flgets(char **,long);
255 void postprocess_for_HTML(char *);
256 char *linehasmarkup(char *);
257 char *losemarkup(char *);
258 gboolean tagcomp(const char *,const char *);
259 void loseentities(char *);
260 gboolean isroman(const char *);
261 void postprocess_for_DP(char *);
262 void print_as_windows_1252(const char *string);
263 void print_as_utf_8(const char *string);
265 GTree *qword,*qperiod;
/*
 * set_charset - select the character set the ebook is validated against.
 *
 * name: a character set name, or NULL / "auto" (matched case-insensitively)
 *       for automatic selection.
 * err:  receives a G_CONVERT_ERROR_NO_CONVERSION error when g_iconv_open()
 *       cannot open a UTF-8 -> <name> converter.
 *
 * Any converter already held in charset_validator is closed first.  Every
 * name listed in unicode_aliases[] is folded to "UTF-8", and no converter
 * is opened for UTF-8 (charset_validator is left at (GIConv)-1).
 *
 * Names are compared with g_ascii_strcasecmp() rather than the deprecated,
 * locale-dependent g_strcasecmp(): character set names are plain ASCII and
 * must match the same way under every locale.
 */
271 gboolean set_charset(const char *name,GError **err)
273 /* The various UNICODE encodings all share the same character set. */
274 const char *unicode_aliases[]={ "UCS-2", "UCS-2BE", "UCS-2LE", "UCS-4",
275 "UCS-4BE", "UCS-4LE", "UCS2", "UCS4", "UNICODE", "UNICODEBIG",
276 "UNICODELITTLE", "UTF-7", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE",
277 "UTF-32", "UTF-32BE", "UTF-32LE", "UTF7", "UTF8", "UTF16", "UTF16BE",
278 "UTF16LE", "UTF32", "UTF32BE", "UTF32LE" };
282 if (charset_validator!=(GIConv)-1)
283 g_iconv_close(charset_validator);
284 if (!name || !g_ascii_strcasecmp(name,"auto"))
287 charset_validator=(GIConv)-1;
291 charset=g_strdup(name);
292 for(i=0;i<G_N_ELEMENTS(unicode_aliases);i++)
293 if (!g_ascii_strcasecmp(charset,unicode_aliases[i]))
296 charset=g_strdup("UTF-8");
299 if (!strcmp(charset,"UTF-8"))
300 charset_validator=(GIConv)-1;
/* Converter direction is UTF-8 (from) to the requested set (to). */
303 charset_validator=g_iconv_open(charset,"UTF-8");
304 if (charset_validator==(GIConv)-1)
306 g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_NO_CONVERSION,
307 "Unknown character set \"%s\"",charset);
/*
 * config_file_update - copy the current value of every entry in options[]
 * into the "options" group of the key file kf, keyed by the entry's long
 * name.  Boolean entries are written with g_key_file_set_boolean(), string
 * entries with g_key_file_set_string(); any other argument type trips
 * g_assert_not_reached().
 */
316 void config_file_update(GKeyFile *kf)
321 for(i=0;options[i].long_name;i++)
323 if (g_str_has_prefix(options[i].long_name,"no-"))
/* NOTE(review): the statement taken for "no-" entries is not shown here; */
/* presumably they are skipped so each switch is written once - confirm.  */
325 if (options[i].arg==G_OPTION_ARG_NONE)
327 sw=*(gboolean *)options[i].arg_data;
/* NOTE(review): presumably sw is inverted for REVERSE entries - confirm. */
328 if (options[i].flags&G_OPTION_FLAG_REVERSE)
330 g_key_file_set_boolean(kf,"options",options[i].long_name,sw);
332 else if (options[i].arg==G_OPTION_ARG_STRING)
334 s=*(gchar **)options[i].arg_data;
337 g_key_file_set_string(kf,"options",options[i].long_name,s);
340 g_assert_not_reached();
/*
 * config_file_add_comments - annotate the key file kf: a file-level comment
 * (group and key both NULL), then one comment per key in the "options"
 * group, built from the matching entry's description in options[].
 */
344 void config_file_add_comments(GKeyFile *kf)
348 g_key_file_set_comment(kf,NULL,NULL," Default configuration for bookloupe",
350 for(i=0;options[i].long_name;i++)
352 if (g_str_has_prefix(options[i].long_name,"no-"))
/* The comment text is the help description with one leading space. */
354 comment=g_strconcat(" ",options[i].description,NULL);
355 g_key_file_set_comment(kf,"options",options[i].long_name,comment,NULL);
/*
 * dump_config - serialise the current settings as key-file text.
 *
 * Two paths are visible: refresh the existing global `config` with
 * config_file_update(), or create a new GKeyFile, fill it and add the
 * descriptive comments.  The result is rendered with g_key_file_to_data().
 * NOTE(review): the condition choosing between the two paths and the code
 * that emits/frees `s` are not shown here - confirm.
 */
360 void dump_config(void)
364 config_file_update(config);
367 config=g_key_file_new();
368 config_file_update(config);
369 config_file_add_comments(config);
371 s=g_key_file_to_data(config,NULL,NULL);
/*
 * read_config_file - look for "bookloupe.ini" in a list of directories and
 * load it into a GKeyFile.
 *
 * The directory list comes from the BOOKLOUPE_CONFIG_PATH environment
 * variable (split on ";" or ":") or, failing that, from the current
 * directory, the directory bookloupe is running from (running_from) and
 * the user configuration directory, tried in that order.
 *
 * A load failure other than "file does not exist" is reported on stderr.
 * NOTE(review): how full_path is filled in and what is returned when no
 * file is found are not shown here - confirm.
 */
377 GKeyFile *read_config_file(gchar **full_path)
383 const char *search_path;
386 search_path=g_getenv("BOOKLOUPE_CONFIG_PATH");
/* NOTE(review): two separators; presumably chosen per platform by     */
/* preprocessor conditionals that are not shown here (";" on Windows?). */
390 search_dirs=g_strsplit(search_path,";",0);
392 search_dirs=g_strsplit(search_path,":",0);
/* Built-in search list: three directories in a four-slot vector; the  */
/* loop below relies on a NULL terminator in the last slot.            */
397 search_dirs=g_new(gchar *,4);
398 search_dirs[0]=g_get_current_dir();
399 search_dirs[1]=g_strdup(running_from);
400 search_dirs[2]=g_strdup(g_get_user_config_dir());
403 for(i=0;search_dirs[i];i++)
405 path=g_build_filename(search_dirs[i],"bookloupe.ini",NULL);
406 if (g_key_file_load_from_file(kf,path,
407 G_KEY_FILE_KEEP_COMMENTS|G_KEY_FILE_KEEP_TRANSLATIONS,&err))
/* A missing file is expected; only other errors are reported. */
409 if (!g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
411 g_printerr("Bookloupe: Error reading %s\n",path);
412 g_printerr("%s\n",err->message);
424 g_strfreev(search_dirs);
/*
 * parse_config_file - load the configuration file into the global `config`
 * and apply each key of its "options" group to the matching entry in
 * options[].
 *
 * Boolean keys are read with g_key_file_get_boolean() and stored through
 * the entry's arg_data; string keys replace the previous string, with the
 * value "auto" stored as NULL.  Read errors are reported on stderr together
 * with the file path and key name, and a key that matches no entry draws
 * an "Unknown option ... ignored" message.
 */
432 void parse_config_file(void)
439 config=read_config_file(&path);
441 keys=g_key_file_get_keys(config,"options",NULL,NULL);
/* For each key (outer loop not shown here), find the option it names. */
448 for(j=0;options[j].long_name;j++)
450 if (g_str_has_prefix(options[j].long_name,"no-"))
452 else if (!strcmp(keys[i],options[j].long_name))
454 if (options[j].arg==G_OPTION_ARG_NONE)
456 sw=g_key_file_get_boolean(config,"options",keys[i],
460 g_printerr("Bookloupe: %s: options.%s: %s\n",
461 path,keys[i],err->message);
/* NOTE(review): presumably sw is inverted for REVERSE entries - confirm. */
466 if (options[j].flags&G_OPTION_FLAG_REVERSE)
468 *(gboolean *)options[j].arg_data=sw;
472 else if (options[j].arg==G_OPTION_ARG_STRING)
474 s=g_key_file_get_string(config,"options",keys[i],
478 g_printerr("Bookloupe: %s: options.%s: %s\n",
479 path,keys[i],err->message);
/* Release the old string before installing the new one. */
484 g_free(*(gchar **)options[j].arg_data);
485 if (!g_strcmp0(s,"auto"))
487 *(gchar **)options[j].arg_data=NULL;
491 *(gchar **)options[j].arg_data=s;
496 g_assert_not_reached();
/* The search loop ran off the end of options[]: no entry matched. */
499 if (!options[j].long_name)
500 g_printerr("Bookloupe: %s: Unknown option \"%s\" ignored\n",
/*
 * parse_options - parse the command line and settle the final switch state.
 *
 * argc, argv: pointers to main()'s arguments, passed on to
 *             g_option_context_parse().
 *
 * Registers options[] and config_options[] as main entries and
 * compatibility_options[] as a separate "compatibility" group, then parses.
 * Afterwards it applies the compatibility toggles, the --web overrides, the
 * requested character set (via set_charset()), --dump-config, and finally
 * turns echoing off when overview mode is selected.
 */
509 void parse_options(int *argc,char ***argv)
512 GOptionContext *context;
513 GOptionGroup *compatibility;
514 context=g_option_context_new(
515 "file - look for errors in Project Gutenberg(TM) etexts");
516 g_option_context_add_main_entries(context,options,NULL);
517 g_option_context_add_main_entries(context,config_options,NULL);
518 compatibility=g_option_group_new("compatibility",
519 "Options for Compatibility with Gutcheck:",
520 "Show compatibility options",NULL,NULL);
521 g_option_group_add_entries(compatibility,compatibility_options);
522 g_option_context_add_group(context,compatibility);
523 g_option_context_set_description(context,
524 "For simplicity, only the switch options which reverse the\n"
525 "default configuration are listed. In most cases, both vanilla\n"
526 "and \"no-\" prefixed versions are available for use.");
527 if (!g_option_context_parse(context,argc,argv,&err))
529 g_printerr("Bookloupe: %s\n",err->message);
530 g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
/* Compatibility toggles flip switches rather than set them.            */
/* NOTE(review): the guarding conditions are not shown here; presumably */
/* typo_compat guards the first flip and paranoid_compat the pair.      */
534 pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
537 pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
538 pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
541 * Web uploads - for the moment, this is really just a placeholder
542 * until we decide what processing we really want to do on web uploads
544 if (pswit[WEB_SWITCH])
546 /* specific override for web uploads */
547 pswit[ECHO_SWITCH]=TRUE;
548 pswit[SQUOTE_SWITCH]=FALSE;
549 pswit[TYPO_SWITCH]=TRUE;
550 pswit[QPARA_SWITCH]=FALSE;
551 pswit[PARANOID_SWITCH]=TRUE;
552 pswit[LINE_END_SWITCH]=FALSE;
553 pswit[OVERVIEW_SWITCH]=FALSE;
554 pswit[STDOUT_SWITCH]=FALSE;
555 pswit[HEADER_SWITCH]=TRUE;
556 pswit[VERBOSE_SWITCH]=FALSE;
557 pswit[MARKUP_SWITCH]=FALSE;
558 pswit[USERTYPO_SWITCH]=FALSE;
559 pswit[DP_SWITCH]=FALSE;
/* Only an explicitly requested charset is validated here. */
561 if (opt_charset && !set_charset(opt_charset,&err))
563 g_printerr("%s\n",err->message);
566 if (pswit[DUMP_CONFIG_SWITCH])
573 if (pswit[OVERVIEW_SWITCH])
574 /* just print summary; don't echo */
575 pswit[ECHO_SWITCH]=FALSE;
581 g_option_context_free(context);
587 * Read in the user-defined stealth scanno list.
/*
 * read_user_scannos - load the user's typo list into the `usertypo` tree.
 *
 * Candidate files are tried in this order, moving on only while the
 * previous attempt failed with G_FILE_ERROR_NOENT:
 *   "bookloupe.typ", <running_from>/bookloupe.typ,
 *   "gutcheck.typ",  <running_from>/gutcheck.typ.
 * If none exists a notice is printed and processing continues without user
 * typos; any other error is reported on stderr with the file name.
 *
 * Valid UTF-8 content is normalised (and the charset set to "UNICODE");
 * anything else is converted from WINDOWS-1252.  The text is split on CR
 * and LF, and every line whose first byte is greater than '!' becomes a key
 * of the tree.
 */
589 void read_user_scannos(void)
592 gchar *usertypo_file;
596 gchar *contents,*utf8,**lines;
597 usertypo_file=g_strdup("bookloupe.typ");
598 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
599 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
602 g_free(usertypo_file);
603 usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
604 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
606 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
609 g_free(usertypo_file);
/* Fall back to the file names used by gutcheck. */
610 usertypo_file=g_strdup("gutcheck.typ");
611 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
613 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
616 g_free(usertypo_file);
617 usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
618 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
620 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
622 g_free(usertypo_file);
623 g_print(" --> I couldn't find bookloupe.typ "
624 "-- proceeding without user typos.\n");
629 fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
630 g_free(usertypo_file);
634 if (g_utf8_validate(contents,len,NULL))
636 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
638 (void)set_charset("UNICODE",NULL);
641 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
643 lines=g_strsplit_set(utf8,"\r\n",0);
/* Keys are owned by the tree (g_free as key destructor); the value is  */
/* a dummy non-NULL pointer, so the tree is used as a set.              */
645 usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
646 for (i=0;lines[i];i++)
/* Skips empty lines and lines starting with a control char, ' ' or '!'. */
647 if (*(unsigned char *)lines[i]>'!')
648 g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
657 * Read an etext returning a newly allocated string containing the file
658 * contents or NULL on error.
/*
 * read_etext - read the file `filename` and return its text as UTF-8.
 *
 * filename: path of the etext to load.
 * err:      receives the error from g_file_get_contents() or from the
 *           Windows-1252 conversion.
 *
 * Content that validates as UTF-8 is normalised and output is routed
 * through print_as_utf_8(); otherwise the bytes are converted from
 * WINDOWS-1252 and output is routed through print_as_windows_1252().
 * When the conversion hits an illegal sequence, the consumed bytes are
 * scanned to work out a line/column for the error message.
 */
660 gchar *read_etext(const char *filename,GError **err)
662 GError *tmp_err=NULL;
663 gchar *contents,*utf8;
664 gsize len,bytes_read,bytes_written;
666 if (!g_file_get_contents(filename,&contents,&len,err))
668 if (g_utf8_validate(contents,len,NULL))
670 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
671 g_set_print_handler(print_as_utf_8);
673 SetConsoleOutputCP(CP_UTF8);
678 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,
679 &bytes_written,&tmp_err);
680 if (g_error_matches(tmp_err,G_CONVERT_ERROR,
681 G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
/* Walk the bytes consumed before the failure; '\n' and non-'\r' bytes  */
/* drive the line/column bookkeeping (the updates are not shown here).  */
684 for(i=0;i<bytes_read;i++)
685 if (contents[i]=='\n')
690 else if (contents[i]!='\r')
/* contents[bytes_read] is the first byte that was not converted. */
692 g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
693 "Input conversion failed. Byte %d at line %d, column %d is not a "
694 "valid Windows-1252 character",
695 ((unsigned char *)contents)[bytes_read],line,col);
698 g_propagate_error(err,tmp_err);
699 g_set_print_handler(print_as_windows_1252);
701 SetConsoleOutputCP(1252);
/*
 * cleanup_on_exit - atexit() handler registered by main(): puts the console
 * output code page back to saved_cp, the value main() captured with
 * GetConsoleOutputCP().
 * NOTE(review): presumably the body is Windows-only; the preprocessor
 * guard is not shown here.
 */
708 void cleanup_on_exit(void)
711 SetConsoleOutputCP(saved_cp);
/*
 * main - program entry point.
 *
 * Registers cleanup_on_exit(), records the directory the program was
 * started from, sets the switches that default to on (paranoid, typo,
 * line-end, echo), parses the options, and, in overview mode, prints the
 * per-category query counters and their total.  Before returning it
 * releases running_from, the user typo tree, the charset state and the
 * loaded configuration.
 */
715 int main(int argc,char **argv)
718 atexit(cleanup_on_exit);
719 saved_cp=GetConsoleOutputCP();
721 running_from=g_path_get_dirname(argv[0]);
722 /* Paranoid checking is turned OFF, not on, by its switch */
723 pswit[PARANOID_SWITCH]=TRUE;
724 /* if running in paranoid mode, typo checks default to enabled */
725 pswit[TYPO_SWITCH]=TRUE;
726 /* Line-end checking is turned OFF, not on, by its switch */
727 pswit[LINE_END_SWITCH]=TRUE;
728 /* Echoing is turned OFF, not on, by its switch */
729 pswit[ECHO_SWITCH]=TRUE;
731 parse_options(&argc,&argv);
732 if (pswit[USERTYPO_SWITCH])
734 fprintf(stderr,"bookloupe: Check and report on an e-text\n");
/* Overview mode: summary counts instead of individual queries. */
736 if (pswit[OVERVIEW_SWITCH])
738 g_print(" Checked %ld lines of %ld (head+foot = %ld)\n\n",
739 checked_linecnt,linecnt,linecnt-checked_linecnt);
740 g_print(" --------------- Queries found --------------\n");
742 g_print(" Long lines: %14ld\n",cnt_long);
744 g_print(" Short lines: %14ld\n",cnt_short);
746 g_print(" Line-end problems: %14ld\n",cnt_lineend);
748 g_print(" Common typos: %14ld\n",cnt_word);
750 g_print(" Unmatched quotes: %14ld\n",cnt_quote);
752 g_print(" Unmatched brackets: %14ld\n",cnt_brack);
754 g_print(" Non-ASCII characters: %14ld\n",cnt_bin);
756 g_print(" Proofing characters: %14ld\n",cnt_odd);
758 g_print(" Punctuation & spacing queries: %14ld\n",cnt_punct);
760 g_print(" Non-standard dashes: %14ld\n",cnt_dash);
762 g_print(" Possible HTML tags: %14ld\n",cnt_html);
/* The total is the sum of the eleven counters printed above. */
764 g_print(" TOTAL QUERIES %14ld\n",
765 cnt_quote+cnt_brack+cnt_bin+cnt_odd+cnt_long+cnt_short+cnt_punct+
766 cnt_dash+cnt_word+cnt_html+cnt_lineend);
768 g_free(running_from);
770 g_tree_unref(usertypo);
/* A NULL name makes set_charset() close any open validator. */
771 set_charset(NULL,NULL);
773 g_key_file_free(config);
/*
 * count_dashes - classify the occurrences of `dash` in `line` by the
 * characters on either side and record the findings in *results.
 *
 * line:    UTF-8 text of one line.
 * dash:    the dash string to look for; first_pass() calls this with "--"
 *          and with the em-dash character.
 * results: counters that are incremented, never reset, by this function.
 *
 * The line is split on `dash`; for each split point the character before
 * (pc) and after (nc) the dash is tested with g_unichar_isspace().
 */
777 void count_dashes(const char *line,const char *dash,
778 struct dash_results *results)
783 gboolean spaced=FALSE,unspaced=FALSE,spaced2=FALSE;
786 tokens=g_strsplit(line,dash,0);
789 for(i=1;tokens[i];i++)
/* NOTE(review): when tokens[i-1] is empty (line starts with the dash,  */
/* or two dashes are adjacent) g_utf8_prev_char() steps to a byte       */
/* before the start of that token - check for an out-of-bounds read.    */
791 pc=g_utf8_get_char(g_utf8_prev_char(tokens[i-1]+strlen(tokens[i-1])));
792 nc=g_utf8_get_char(tokens[i]);
793 if (g_unichar_isspace(pc) || g_unichar_isspace(nc))
795 if (g_unichar_isspace(pc) && g_unichar_isspace(nc))
797 else if (!g_unichar_isspace(pc) && !g_unichar_isspace(nc))
803 /* count of lines with em-dashes with spaces both sides */
804 results->non_PG_space++;
806 /* count of lines with PG-type em-dashes with no spaces */
814 * Run a first pass - verify that it's a valid PG
815 * file, decide whether to report some things that
816 * occur many times in the text like long or short
817 * lines, non-standard dashes, etc.
/*
 * first_pass - scan the whole text once, gathering the statistics that
 * report_first_pass() later turns into reporting decisions.
 *
 * etext: the complete text; it is split on '\n' and trailing '\r'
 *        characters are stripped from each line.
 *
 * The statistics are accumulated in a function-static first_pass_results,
 * so they persist between calls (NOTE(review): presumably its address is
 * what gets returned - the return statement is not shown here).
 */
819 struct first_pass_results *first_pass(const char *etext)
821 gunichar laststart=CHAR_SPACE;
826 unsigned int lastlen=0,lastblen=0;
827 long spline=0,nspline=0;
828 static struct first_pass_results results={0};
829 struct dash_results tmp_dash_results;
832 lines=g_strsplit(etext,"\n",0);
833 for (j=0;lines[j];j++)
/* lbytes is the length in bytes, llen the length in characters. */
835 lbytes=strlen(lines[j]);
836 while (lbytes>0 && lines[j][lbytes-1]=='\r')
837 lines[j][--lbytes]='\0';
838 llen=g_utf8_strlen(lines[j],lbytes);
/* Old-style header end marker: "*END ... SMALL PRINT ...". */
840 if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
841 (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
844 g_print(" --> Duplicate header?\n");
845 spline=linecnt+1; /* first line of non-header text, that is */
/* New-style header marker: "*** START ... PROJECT GUTENBERG". */
847 if (!strncmp(lines[j],"*** START",9) &&
848 strstr(lines[j],"PROJECT GUTENBERG"))
851 g_print(" --> Duplicate header?\n");
852 nspline=linecnt+1; /* first line of non-header text, that is */
/* Once a header has been seen, look for the footer, case-insensitively. */
854 if (spline || nspline)
856 lc_line=g_utf8_strdown(lines[j],lbytes);
857 if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
859 if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
861 if (results.footerline)
863 /* it's an old-form header - we can detect duplicates */
865 g_print(" --> Duplicate footer?\n");
868 results.footerline=linecnt;
874 results.firstline=spline;
876 results.firstline=nspline; /* override with new */
877 if (results.footerline)
878 continue; /* don't count the boilerplate in the footer */
879 results.totlen+=llen;
/* Per-character scan: code points above 127, letters, double quotes. */
880 for (s=lines[j];*s;s=g_utf8_next_char(s))
882 if (g_utf8_get_char(s)>127)
884 if (g_unichar_isalpha(g_utf8_get_char(s)))
888 if (CHAR_IS_DQUOTE(g_utf8_get_char(s)))
889 qc=QUOTE_CLASS(g_utf8_get_char(s));
/* A closing or neutral quote directly after a letter counts as an     */
/* unpunctuated endquote.                                              */
892 if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) &&
893 g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
894 results.endquote_count++;
/* Short-line test looks at the previous line (lastlen) and the one    */
/* before it (lastblen), not at the current line alone.                */
897 if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
898 lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
901 g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
903 if (strstr(lines[j],".,"))
905 /* only count ast lines for ignoring purposes where there is */
906 /* locase text on the line */
907 if (strchr(lines[j],'*'))
909 for (s=lines[j];*s;s=g_utf8_next_char(s))
910 if (g_unichar_islower(g_utf8_get_char(s)))
915 if (strchr(lines[j],'/'))
916 results.fslashline++;
/* Back up over trailing white space, then test for a single hyphen   */
/* (not preceded by another '-') at the end of the line.              */
919 for (s=g_utf8_prev_char(lines[j]+lbytes);
920 s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;
921 s=g_utf8_prev_char(s))
923 if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
924 g_utf8_get_char(g_utf8_prev_char(s))!='-')
927 if (llen>LONGEST_PG_LINE)
929 if (llen>WAY_TOO_LONG)
930 results.verylongline++;
/* HTML heuristic: i spans the first '<' to the first '>' inclusive. */
931 if (strchr(lines[j],'<') && strchr(lines[j],'>'))
933 i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
936 if (strstr(lines[j],"<i>"))
937 results.htmcount+=4; /* bonus marks! */
939 /* Check for spaced em-dashes */
/* Dash results are gathered per line, so each emdash counter below    */
/* goes up by at most one per line.                                    */
940 memset(&tmp_dash_results,0,sizeof(tmp_dash_results));
941 count_dashes(lines[j],"--",&tmp_dash_results);
942 count_dashes(lines[j],"—",&tmp_dash_results);
943 if (tmp_dash_results.base)
944 results.emdash.base++;
945 if (tmp_dash_results.non_PG_space)
946 results.emdash.non_PG_space++;
947 if (tmp_dash_results.PG_space)
948 results.emdash.PG_space++;
/* Language and digit hints, tested per extracted word (inword). */
952 if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))
953 results.Dutchcount++;
954 if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))
955 results.Frenchcount++;
956 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
957 results.standalone_digit++;
960 /* Check for spaced dashes */
961 if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
/* laststart receives the first byte of the line, not a decoded       */
/* character.                                                         */
965 laststart=lines[j][0];
974 * Make some snap decisions based on the first pass results.
/*
 * report_first_pass - turn the statistics gathered by first_pass() into
 * reporting decisions.
 *
 * results: the first-pass statistics.  They are mostly read, but
 *          results->firstline is reset to 0 when the header and footer
 *          are found less than 100 lines apart.
 *
 * The decisions are stored in a function-static struct warnings.  For each
 * class of query that is being suppressed because it occurs too often, a
 * " --> " note with the count is printed.  The function may also switch
 * pswit[MARKUP_SWITCH] on when the text looks like HTML, and in verbose
 * mode it re-enables the suppressed warnings.
 */
976 struct warnings *report_first_pass(struct first_pass_results *results)
978 static struct warnings warnings={0};
980 g_print(" --> %ld lines in this file have white space at end\n",
983 if (results->dotcomma>5)
986 g_print(" --> %ld lines in this file contain '.,'. "
987 "Not reporting them.\n",results->dotcomma);
990 * If more than 50 lines, or one-tenth, are short,
991 * don't bother reporting them.
993 warnings.shortline=1;
994 if (results->shortline>50 || results->shortline*10>linecnt)
996 warnings.shortline=0;
997 g_print(" --> %ld lines in this file are short. "
998 "Not reporting short lines.\n",results->shortline);
1001 * If more than 50 lines, or one-tenth, are long,
1002 * don't bother reporting them.
1004 warnings.longline=1;
1005 if (results->longline>50 || results->longline*10>linecnt)
1007 warnings.longline=0;
1008 g_print(" --> %ld lines in this file are long. "
1009 "Not reporting long lines.\n",results->longline);
1011 /* If more than 10 lines contain asterisks, don't bother reporting them. */
1013 if (results->astline>10)
1016 g_print(" --> %ld lines in this file contain asterisks. "
1017 "Not reporting them.\n",results->astline);
1020 * If more than 10 lines contain forward slashes,
1021 * don't bother reporting them.
1024 if (results->fslashline>10)
1027 g_print(" --> %ld lines in this file contain forward slashes. "
1028 "Not reporting them.\n",results->fslashline);
1031 * If more than 20 lines contain unpunctuated endquotes,
1032 * don't bother reporting them.
1034 warnings.endquote=1;
1035 if (results->endquote_count>20)
1037 warnings.endquote=0;
1038 g_print(" --> %ld lines in this file contain unpunctuated endquotes. "
1039 "Not reporting them.\n",results->endquote_count);
1042 * If more than 10 lines contain standalone digits,
1043 * don't bother reporting them.
1046 if (results->standalone_digit>10)
1049 g_print(" --> %ld lines in this file contain standalone 0s and 1s. "
1050 "Not reporting them.\n",results->standalone_digit);
1053 * If more than 20 lines contain hyphens at end,
1054 * don't bother reporting them.
1057 if (results->hyphens>20)
1060 g_print(" --> %ld lines in this file have hyphens at end. "
1061 "Not reporting them.\n",results->hyphens);
1063 if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
1065 g_print(" --> Looks like this is HTML. Switching HTML mode ON.\n");
1066 pswit[MARKUP_SWITCH]=1;
1068 if (results->verylongline>0)
1069 g_print(" --> %ld lines in this file are VERY long!\n",
1070 results->verylongline);
1072 * If there are more non-PG spaced dashes than PG em-dashes,
1073 * assume it's deliberate.
1074 * Current PG guidelines say don't use them, but older texts do,
1075 * and some people insist on them whatever the guidelines say.
1078 if (results->spacedash+results->emdash.non_PG_space>
1079 results->emdash.PG_space)
1082 g_print(" --> There are %ld spaced dashes and em-dashes. "
1083 "Not reporting them.\n",
1084 results->spacedash+results->emdash.non_PG_space);
1090 /* Charset ISO_8859-1/ASCII checks for compatibility with gutcheck */
1092 /* If more than a quarter of characters are hi-bit, bug out. */
1093 if (results->binlen*4>results->totlen)
1095 g_print(" --> This file does not appear to be ASCII. "
1096 "Terminating. Best of luck with it!\n");
1099 if (results->alphalen*4<results->totlen)
1101 g_print(" --> This file does not appear to be text. "
1102 "Terminating. Best of luck with it!\n");
1105 if (results->binlen*100>results->totlen || results->binlen>100)
1107 g_print(" --> There are a lot of foreign letters here. "
1108 "Not reporting them.\n");
1109 if (!pswit[VERBOSE_SWITCH])
1113 warnings.isDutch=FALSE;
1114 if (results->Dutchcount>50)
1116 warnings.isDutch=TRUE;
1117 g_print(" --> This looks like Dutch - "
1118 "switching off dashes and warnings for 's Middags case.\n");
1120 warnings.isFrench=FALSE;
1121 if (results->Frenchcount>50)
1123 warnings.isFrench=TRUE;
1124 g_print(" --> This looks like French - "
1125 "switching off some doublepunct.\n");
1127 if (results->firstline && results->footerline)
1128 g_print(" The PG header and footer appear to be already on.\n");
1131 if (results->firstline)
1132 g_print(" The PG header is on - no footer.\n");
1133 if (results->footerline)
1134 g_print(" The PG footer is on - no header.\n");
1137 if (pswit[VERBOSE_SWITCH])
1139 warnings.shortline=1;
1140 warnings.dotcomma=1;
1141 warnings.longline=1;
1147 warnings.endquote=1;
1148 g_print(" *** Verbose output is ON -- you asked for it! ***\n");
1150 if (warnings.isDutch)
1152 if (results->footerline>0 && results->firstline>0 &&
1153 results->footerline>results->firstline &&
1154 results->footerline-results->firstline<100)
1156 g_print(" --> I don't really know where this text starts. \n");
1157 g_print(" There are no reference points.\n");
1158 g_print(" I'm going to have to report the header and footer "
1160 results->firstline=0;
1168 * Look along the line, accumulate the count of quotes, and see
1169 * if this is an empty line - i.e. a line with nothing on it
1171 * If line has just spaces, period, * and/or - on it, don't
1172 * count it, since empty lines with asterisks or dashes to
1173 * separate sections are common.
1175 * Returns: TRUE if the line is empty.
/*
 * analyse_quotes (annotated listing; short lines such as braces, local
 * declarations and some else branches are not shown here, so only the
 * visible statements are described):
 *
 * Walks <aline> one UTF-8 character at a time.
 *  - Double quotes are passed to count_quote() with QUOTE_CLASS(c).
 *  - Single quotes are examined only when pswit[SQUOTE_SWITCH] is set.
 *    A quote between two letters is treated as an apostrophe and ignored;
 *    an opening-style quote, or one followed by a letter, is counted as
 *    NEUTRAL_QUOTE when the previous character is sentence punctuation
 *    and as OPENING_QUOTE otherwise (the tis/Tis prefix is excluded);
 *    remaining cases are counted as CLOSING_QUOTE or NEUTRAL_QUOTE.
 *  - Any GError set by count_quote() is printed (subject to ECHO_SWITCH
 *    and OVERVIEW_SWITCH) and then cleared.
 *  - Underscores increment counters->c_unders and bracket characters are
 *    fed to increment_matching(), with [Illustration: ...] tracked under
 *    COUNTER_ILLUSTRATION.
 *  - isemptyline is cleared by any character outside the spacer set.
 *
 * Fix: the punctuation test before an opening quote used strchr() on a
 * gunichar. strchr() converts its argument to char, so a code point whose
 * low byte equals a punctuation byte (for example U+012E, low byte 0x2E)
 * was mistaken for punctuation. g_utf8_strchr() compares whole code
 * points, matching the other punctuation tests in this file.
 */
1177 gboolean analyse_quotes(const char *aline,struct counters *counters)
1180 /* assume the line is empty until proven otherwise */
1181 gboolean isemptyline=TRUE;
1182 const char *s=aline,*sprev,*snext;
1185 GError *tmp_err=NULL;
1188 snext=g_utf8_next_char(s);
1189 c=g_utf8_get_char(s);
1190 if (CHAR_IS_DQUOTE(c))
1191 (void)count_quote(counters,c,QUOTE_CLASS(c),&tmp_err);
1192 else if (CHAR_IS_SQUOTE(c) && pswit[SQUOTE_SWITCH])
1197 * At start of line, it can only be a quotation mark.
1198 * Hardcode a very common exception!
1200 if (!g_str_has_prefix(snext,"tis") &&
1201 !g_str_has_prefix(snext,"Tis"))
1202 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
1204 else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
1205 g_unichar_isalpha(g_utf8_get_char(snext)))
1206 /* Do nothing! it's definitely an apostrophe, not a quote */
1208 /* it's outside a word - let's check it out */
1209 else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||
1210 g_unichar_isalpha(g_utf8_get_char(snext)))
1212 /* certainly looks like a quotation mark */
1213 if (!g_str_has_prefix(snext,"tis") &&
1214 !g_str_has_prefix(snext,"Tis"))
1215 /* hardcode a very common exception! */
1217 if (g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(sprev)))
1218 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
1220 (void)count_quote(counters,c,OPENING_QUOTE,&tmp_err);
1225 /* now - is it a quotation mark? */
1226 guessquote=0; /* accumulate clues */
1227 if (g_unichar_isalpha(g_utf8_get_char(sprev)))
1229 /* it follows a letter - could be either */
1231 if (g_utf8_get_char(sprev)=='s')
1233 /* looks like a plural apostrophe */
1235 if (g_utf8_get_char(snext)==CHAR_SPACE)
1239 if (innermost_quote_matches(counters,c))
1241 * Give it the benefit of some doubt,
1242 * if a squote is already open.
1248 (void)count_quote(counters,c,CLOSING_QUOTE,&tmp_err);
1251 /* no adjacent letter - it must be a quote of some kind */
1252 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
1257 if (pswit[ECHO_SWITCH])
1258 g_print("\n%s\n",aline);
1259 if (!pswit[OVERVIEW_SWITCH])
1260 g_print(" Line %ld column %ld - %s\n",
1261 linecnt,g_utf8_pointer_to_offset(aline,s)+1,tmp_err->message);
1262 g_clear_error(&tmp_err);
1264 if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
1266 isemptyline=FALSE; /* ignore lines like * * * as spacers */
1267 if (c==CHAR_UNDERSCORE)
1268 counters->c_unders++;
1269 if (c==CHAR_OPEN_SBRACK)
1271 if (!matching_difference(counters,COUNTER_ILLUSTRATION) &&
1272 !matching_difference(counters,c) && s==aline &&
1273 g_str_has_prefix(s,"[Illustration:"))
1274 increment_matching(counters,COUNTER_ILLUSTRATION,TRUE);
1276 increment_matching(counters,c,TRUE);
1278 else if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK)
1279 increment_matching(counters,c,TRUE);
1280 if (c==CHAR_CLOSE_SBRACK)
1282 if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) &&
1283 !matching_difference(counters,c) && !*snext)
1284 increment_matching(counters,COUNTER_ILLUSTRATION,FALSE);
1286 increment_matching(counters,c,FALSE);
1288 else if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK)
1289 increment_matching(counters,c,FALSE);
1297 * check_for_control_characters:
1299 * Check for invalid or questionable characters in the line
1300 * Anything above 127 is invalid for plain ASCII, and
1301 * non-printable control characters should also be flagged.
1302 * Tabs should generally not be there.
/*
 * check_for_control_characters (annotated listing; braces, local
 * declarations and any else branch are not shown here):
 *
 * Walks <aline> one UTF-8 character at a time and reports every code
 * point below CHAR_SPACE other than CHAR_LF, CHAR_CR and CHAR_TAB.
 * The line is echoed first when pswit[ECHO_SWITCH] is set, and the
 * message is printed unless pswit[OVERVIEW_SWITCH] is set.
 *
 * Fix: g_utf8_pointer_to_offset() takes the string start first and the
 * position second. The arguments were swapped, which yields the negated
 * offset and therefore a zero or negative column number for every hit
 * after the first character.
 */
1304 void check_for_control_characters(const char *aline)
1308 for (s=aline;*s;s=g_utf8_next_char(s))
1310 c=g_utf8_get_char(s);
1311 if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
1313 if (pswit[ECHO_SWITCH])
1314 g_print("\n%s\n",aline);
1315 if (!pswit[OVERVIEW_SWITCH])
1316 g_print(" Line %ld column %ld - Control character %u\n",
1317 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1325 * check_for_odd_characters:
1327 * Check for binary and other odd characters.
/*
 * check_for_odd_characters (annotated listing; braces, local
 * declarations, flag assignments and else branches are not shown here,
 * so only the visible statements are described):
 *
 * Walks <aline> one UTF-8 character at a time and queries:
 *  - when warnings->bin is set: control characters other than tab and
 *    newline, and anything above 127 (reported as Non-ISO-8859 for
 *    128..159 and above 255, otherwise as Non-ASCII);
 *  - when a charset is configured: with no iconv validator, unassigned
 *    code points and Private Use characters; otherwise (presumably when
 *    the conversion fails; the test itself is not visible) characters
 *    outside <charset>;
 *  - tab, tilde, caret, forward slash (if warnings->fslash) and, in
 *    paranoid mode with warnings->ast on a non-empty line, asterisk.
 * The e* booleans exist so that each kind of warning is not repeated on
 * one line (the assignments that set them are not visible here).
 *
 * Fix: the lower bound of the Plane 16 Private Use Area was written as
 * decimal 100000 (U+186A0) instead of 0x100000, so every assigned
 * character from U+186A0 upwards (for example emoji in U+1F300..) was
 * reported as a Private Use character.
 */
1329 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
1330 gboolean isemptyline)
1332 /* Don't repeat multiple warnings on one line. */
1333 gboolean eInvalidChar=FALSE,eTab=FALSE,eTilde=FALSE;
1334 gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
1339 for (s=aline;*s;s=g_utf8_next_char(s))
1341 c=g_utf8_get_char(s);
1342 if (warnings->bin && !eInvalidChar &&
1343 (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
1345 if (pswit[ECHO_SWITCH])
1346 g_print("\n%s\n",aline);
1347 if (!pswit[OVERVIEW_SWITCH])
1348 if (c>127 && c<160 || c>255)
1349 g_print(" Line %ld column %ld - "
1350 "Non-ISO-8859 character %u\n",
1351 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1353 g_print(" Line %ld column %ld - "
1354 "Non-ASCII character %u\n",
1355 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1360 if (!eInvalidChar && charset)
1362 if (charset_validator==(GIConv)-1)
1364 if (!g_unichar_isdefined(c))
1366 if (pswit[ECHO_SWITCH])
1367 g_print("\n%s\n",aline);
1368 if (!pswit[OVERVIEW_SWITCH])
1369 g_print(" Line %ld column %ld - Unassigned UNICODE "
1370 "code point U+%04" G_GINT32_MODIFIER "X\n",
1371 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1376 else if (c>=0xE000 && c<=0xF8FF || c>=0xF0000 && c<=0xFFFFD ||
1377 c>=0x100000 && c<=0x10FFFD)
1379 if (pswit[ECHO_SWITCH])
1380 g_print("\n%s\n",aline);
1381 if (!pswit[OVERVIEW_SWITCH])
1382 g_print(" Line %ld column %ld - Private Use "
1383 "character U+%04" G_GINT32_MODIFIER "X\n",
1384 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1392 t=g_convert_with_iconv(s,g_utf8_next_char(s)-s,
1393 charset_validator,NULL,&nb,NULL);
1398 if (pswit[ECHO_SWITCH])
1399 g_print("\n%s\n",aline);
1400 if (!pswit[OVERVIEW_SWITCH])
1401 g_print(" Line %ld column %ld - Non-%s "
1402 "character %u\n",linecnt,
1403 g_utf8_pointer_to_offset(aline,s)+1,charset,c);
1410 if (!eTab && c==CHAR_TAB)
1412 if (pswit[ECHO_SWITCH])
1413 g_print("\n%s\n",aline);
1414 if (!pswit[OVERVIEW_SWITCH])
1415 g_print(" Line %ld column %ld - Tab character?\n",
1416 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1421 if (!eTilde && c==CHAR_TILDE)
1424 * Often used by OCR software to indicate an
1425 * unrecognizable character.
1427 if (pswit[ECHO_SWITCH])
1428 g_print("\n%s\n",aline);
1429 if (!pswit[OVERVIEW_SWITCH])
1430 g_print(" Line %ld column %ld - Tilde character?\n",
1431 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1436 if (!eCarat && c==CHAR_CARAT)
1438 if (pswit[ECHO_SWITCH])
1439 g_print("\n%s\n",aline);
1440 if (!pswit[OVERVIEW_SWITCH])
1441 g_print(" Line %ld column %ld - Carat character?\n",
1442 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1447 if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
1449 if (pswit[ECHO_SWITCH])
1450 g_print("\n%s\n",aline);
1451 if (!pswit[OVERVIEW_SWITCH])
1452 g_print(" Line %ld column %ld - Forward slash?\n",
1453 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1459 * Report asterisks only in paranoid mode,
1460 * since they're often deliberate.
1462 if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
1465 if (pswit[ECHO_SWITCH])
1466 g_print("\n%s\n",aline);
1467 if (!pswit[OVERVIEW_SWITCH])
1468 g_print(" Line %ld column %ld - Asterisk?\n",
1469 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1478 * check_for_long_line:
1480 * Check for line too long.
/*
 * Annotation (braces and any else branch are not shown in this listing):
 * when <aline> holds more than LONGEST_PG_LINE characters, counted with
 * g_utf8_strlen(), the line is echoed if pswit[ECHO_SWITCH] is set and a
 * Long line message is printed unless pswit[OVERVIEW_SWITCH] is set.
 * Both the column and the length in the message are the character count
 * of the line, so the reported column is the end of the line.
 */
1482 void check_for_long_line(const char *aline)
1484 if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
1486 if (pswit[ECHO_SWITCH])
1487 g_print("\n%s\n",aline);
1488 if (!pswit[OVERVIEW_SWITCH])
1489 g_print(" Line %ld column %ld - Long line %ld\n",
1490 linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
1497 * check_for_short_line:
1499 * Check for line too short.
1501 * This one is a bit trickier to implement: we don't want to
1502 * flag the last line of a paragraph for being short, so we
1503 * have to wait until we know that our current line is a
1504 * "normal" line, then report the _previous_ line if it was too
1505 * short. We also don't want to report indented lines like
1506 * chapter heads or formatted quotations. We therefore keep
1507 * last->len as the length of the last line examined, and
1508 * last->blen as the length of the last but one, and try to
1509 * suppress unnecessary warnings by checking that both were of
1510 * "normal" length. We keep the first character of the last
1511 * line in last->start, and if it was a space, we assume that
1512 * the formatting is deliberate. I can't figure out a way to
1513 * distinguish something like a quoted verse left-aligned or
1514 * the header or footer of a letter from a paragraph of short
1515 * lines - maybe if I examined the whole paragraph, and if the
1516 * para has less than, say, 8 lines and if all lines are short,
1517 * then just assume it's OK? Need to look at some texts to see
1518 * how often a formula like this would get the right result.
/*
 * Annotation (braces and any else branch are not shown in this listing):
 * reports the PREVIOUS line (prevline, numbered linecnt-1) as short when
 *  - the current line <aline> has more than one character,
 *  - last->len is greater than 1 and below SHORTEST_PG_LINE,
 *  - last->blen is greater than SHORTEST_PG_LINE, and
 *  - last->start is not CHAR_SPACE (not an indented line).
 * The column and the length printed are both the character count of
 * prevline.
 * NOTE(review): the separate last->blen>1 test adds nothing if
 * SHORTEST_PG_LINE is at least 1 - confirm against its definition.
 */
1520 void check_for_short_line(const char *aline,const struct line_properties *last)
1522 if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
1523 last->len<SHORTEST_PG_LINE && last->blen>1 &&
1524 last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
1526 if (pswit[ECHO_SWITCH])
1527 g_print("\n%s\n",prevline);
1528 if (!pswit[OVERVIEW_SWITCH])
1529 g_print(" Line %ld column %ld - Short line %ld?\n",
1530 linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
1537 * check_for_starting_punctuation:
1539 * Look for punctuation other than full ellipses at start of line.
/*
 * Annotation (braces, the final g_print argument and any else branch are
 * not shown in this listing): a non-empty line whose first character is
 * one of . ? ! , ; : is queried at column 1, unless the line starts with
 * a spaced ellipsis (the literal prefix tested by g_str_has_prefix).
 * Output is gated by pswit[ECHO_SWITCH] and pswit[OVERVIEW_SWITCH].
 */
1541 void check_for_starting_punctuation(const char *aline)
1543 if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
1544 !g_str_has_prefix(aline,". . ."))
1546 if (pswit[ECHO_SWITCH])
1547 g_print("\n%s\n",aline);
1548 if (!pswit[OVERVIEW_SWITCH])
1549 g_print(" Line %ld column 1 - Begins with punctuation?\n",
1559 * Find the first em-dash, return a pointer to it and set <next> to the
1560 * character following the dash.
/*
 * Annotation: almost all of this function (the searches that produce s1
 * and s2, the comparisons between them and the return statements) is
 * missing from this listing. The visible statements only show that
 * *next is set either to the character after s2 or to the position two
 * characters after s1.
 * NOTE(review): presumably s1 is a match on a two-character dash
 * sequence and s2 a match on a single em-dash character - confirm
 * against the full source.
 */
1562 char *str_emdash(const char *s,const char **next)
1570 *next=g_utf8_next_char(s2);
1575 *next=g_utf8_next_char(g_utf8_next_char(s1));
1580 *next=g_utf8_next_char(g_utf8_next_char(s1));
1585 *next=g_utf8_next_char(s2);
1591 * check_for_spaced_emdash:
1593 * Check for spaced em-dashes.
1595 * We must check _all_ occurrences of em-dashes on the line
1596 * hence the loop - even if the first dash is OK
1597 * there may be another that's wrong later on.
/*
 * Annotation (braces and any else branch are not shown in this listing):
 * repeatedly calls str_emdash(), resuming each search at the position it
 * returns in next, so every em-dash on the line is examined. A dash is
 * queried when the character before it is a space (only tested when the
 * dash is not at the start of the line) or when the character at next is
 * a space. The reported column is that of the dash itself.
 */
1599 void check_for_spaced_emdash(const char *aline)
1601 const char *s,*t,*next;
1602 for (s=aline;t=str_emdash(s,&next);s=next)
1604 if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
1605 g_utf8_get_char(next)==CHAR_SPACE)
1607 if (pswit[ECHO_SWITCH])
1608 g_print("\n%s\n",aline);
1609 if (!pswit[OVERVIEW_SWITCH])
1610 g_print(" Line %ld column %ld - Spaced em-dash?\n",
1611 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1619 * check_for_spaced_dash:
1621 * Check for spaced dashes.
/*
 * Annotation (braces, the declaration of s and any else branches for the
 * overview counter are not shown in this listing):
 *  - If the line contains a space followed by a dash, the first such
 *    match is queried unless the character after the dash is another
 *    dash.
 *  - Otherwise, if the line contains a dash followed by a space, the
 *    first such match is queried when it is at the start of the line or
 *    the character before it is not a dash.
 * The reported column is the start of the match.
 * NOTE(review): each pattern is searched once from the start of the
 * line and no loop is visible, so later occurrences on the same line
 * appear not to be examined - confirm against the full source.
 */
1623 void check_for_spaced_dash(const char *aline)
1626 if ((s=strstr(aline," -")))
1628 if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
1630 if (pswit[ECHO_SWITCH])
1631 g_print("\n%s\n",aline);
1632 if (!pswit[OVERVIEW_SWITCH])
1633 g_print(" Line %ld column %ld - Spaced dash?\n",
1634 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1639 else if ((s=strstr(aline,"- ")))
1641 if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
1643 if (pswit[ECHO_SWITCH])
1644 g_print("\n%s\n",aline);
1645 if (!pswit[OVERVIEW_SWITCH])
1646 g_print(" Line %ld column %ld - Spaced dash?\n",
1647 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1655 * check_for_unmarked_paragraphs:
1657 * Check for unmarked paragraphs indicated by separate speakers.
1659 * May well be false positive:
1660 * "Bravo!" "Wonderful!" called the crowd.
1661 * but useful all the same.
/*
 * Annotation (braces, the declaration of s and the tests between the
 * two searches are not shown in this listing): searches <aline> for a
 * closing double quote, a gap and an opening double quote, and queries
 * a missing paragraph break at the column where the match starts.
 * NOTE(review): the two strstr() literals are identical as shown here.
 * Whitespace in this listing is collapsed, so presumably the second
 * originally contained a wider gap (two spaces) - confirm against the
 * full source before treating it as a redundant call.
 */
1663 void check_for_unmarked_paragraphs(const char *aline)
1666 s=strstr(aline,"\" \"");
1668 s=strstr(aline,"\" \"");
1671 if (pswit[ECHO_SWITCH])
1672 g_print("\n%s\n",aline);
1673 if (!pswit[OVERVIEW_SWITCH])
1674 g_print(" Line %ld column %ld - "
1675 "Query missing paragraph break?\n",
1676 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1683 * check_for_jeebies:
1685 * Check for "to he" and other easy h/b errors.
1687 * This is a very inadequate effort on the h/b problem,
1688 * but the phrase "to he" is usually an error, whereas "to
1689 * be" is quite common.
1690 * Similarly, '"Quiet!", be said.' is a non-be error
1691 * "to he" is _not_ always an error!:
1692 * "Where they went to he couldn't say."
1693 * Another false positive:
1694 * What would "Cinderella" be without the . . .
1695 * and another: "If he wants to he can see for himself."
/*
 * Annotation (braces, the declaration of s and the tests that guard each
 * successive search are not shown in this listing): three groups of
 * fixed phrases are searched for in <aline>, and each group ends in one
 * report at the column where the match starts:
 *  - he/be confusions (be could, be would, was be, be is, is be, be
 *    after a closing quote, to he);
 *  - had/bad confusions (the had, a had, they/she/he/you/i bad);
 *  - hut/but after a semicolon or comma.
 * NOTE(review): presumably each later search runs only if the earlier
 * ones in its group found nothing - the guards are not visible.
 * NOTE(review): two of the quote-be literals are identical as shown;
 * whitespace is collapsed in this listing, so presumably one of them
 * originally had two spaces - confirm.
 * NOTE(review): the i bad literal uses a lower-case i, so it can only
 * match if the line is lower-cased by code not visible here - confirm.
 */
1697 void check_for_jeebies(const char *aline)
1700 s=strstr(aline," be could ");
1702 s=strstr(aline," be would ");
1704 s=strstr(aline," was be ");
1706 s=strstr(aline," be is ");
1708 s=strstr(aline," is be ");
1710 s=strstr(aline,"\", be ");
1712 s=strstr(aline,"\" be ");
1714 s=strstr(aline,"\" be ");
1716 s=strstr(aline," to he ");
1719 if (pswit[ECHO_SWITCH])
1720 g_print("\n%s\n",aline);
1721 if (!pswit[OVERVIEW_SWITCH])
1722 g_print(" Line %ld column %ld - Query he/be error?\n",
1723 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1727 s=strstr(aline," the had ");
1729 s=strstr(aline," a had ");
1731 s=strstr(aline," they bad ");
1733 s=strstr(aline," she bad ");
1735 s=strstr(aline," he bad ");
1737 s=strstr(aline," you bad ");
1739 s=strstr(aline," i bad ");
1742 if (pswit[ECHO_SWITCH])
1743 g_print("\n%s\n",aline);
1744 if (!pswit[OVERVIEW_SWITCH])
1745 g_print(" Line %ld column %ld - Query had/bad error?\n",
1746 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1750 s=strstr(aline,"; hut ");
1752 s=strstr(aline,", hut ");
1755 if (pswit[ECHO_SWITCH])
1756 g_print("\n%s\n",aline);
1757 if (!pswit[OVERVIEW_SWITCH])
1758 g_print(" Line %ld column %ld - Query hut/but error?\n",
1759 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1766 * check_for_mta_from:
1768 * Special case - angled bracket in front of "From" placed there by an
1769 * MTA when sending an e-mail.
/*
 * Annotation (braces, the declaration of s and the test on s are not
 * shown in this listing): searches <aline> for the first occurrence of
 * an angle bracket immediately followed by From and queries it at the
 * column of the bracket. Output is gated by pswit[ECHO_SWITCH] and
 * pswit[OVERVIEW_SWITCH]. The match is not anchored to the start of the
 * line.
 */
1771 void check_for_mta_from(const char *aline)
1774 s=strstr(aline,">From");
1777 if (pswit[ECHO_SWITCH])
1778 g_print("\n%s\n",aline);
1779 if (!pswit[OVERVIEW_SWITCH])
1780 g_print(" Line %ld column %ld - "
1781 "Query angled bracket with From\n",
1782 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1789 * check_for_orphan_character:
1791 * Check for a single character line -
1792 * often an overflow from bad wrapping.
/*
 * Annotation (braces, the declaration of c, the else keyword and the
 * final g_print argument are not shown in this listing): acts only when
 * <aline> consists of exactly one character (the first character is
 * non-zero and nothing follows it). A lone I, V, X or L, or any digit
 * according to g_unichar_isdigit(), is accepted silently as a numeral;
 * any other single character is queried at column 1.
 */
1794 void check_for_orphan_character(const char *aline)
1797 c=g_utf8_get_char(aline);
1798 if (c && !*g_utf8_next_char(aline))
1800 if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
1801 ; /* Nothing - ignore numerals alone on a line. */
1804 if (pswit[ECHO_SWITCH])
1805 g_print("\n%s\n",aline);
1806 if (!pswit[OVERVIEW_SWITCH])
1807 g_print(" Line %ld column 1 - Query single character line\n",
1816 * check_for_pling_scanno:
1818 * Check for I" - often should be !
/*
 * check_for_pling_scanno (annotated listing; braces, the declaration of
 * s and the test on s are not shown here):
 *
 * Searches <aline> for a space, a capital I and a double quote - a
 * common OCR misreading of an exclamation mark before a closing quote -
 * and queries the first match.
 *
 * Fix: the column was printed as the raw 0-based offset of the match,
 * whereas every other strstr()-based check in this file prints offset+1
 * (1-based column of the start of the match). The report therefore
 * pointed one column too far left. It now follows the same convention.
 */
1820 void check_for_pling_scanno(const char *aline)
1823 s=strstr(aline," I\"");
1826 if (pswit[ECHO_SWITCH])
1827 g_print("\n%s\n",aline);
1828 if (!pswit[OVERVIEW_SWITCH])
1829 g_print(" Line %ld column %ld - Query I=exclamation mark?\n",
1830 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1837 * check_for_extra_period:
1839 * Check for period without a capital letter. Cut-down from gutspell.
1840 * Only works when it happens on a single line.
/*
 * check_for_extra_period (annotated listing; braces, several local
 * declarations, flag assignments, continue statements and else branches
 * are not shown here, so only the visible statements are described):
 *
 * Runs only when pswit[PARANOID_SWITCH] is set. For each period followed
 * by a space in <aline>:
 *  - the match is passed over when the character before the period is
 *    not a letter;
 *  - for Dutch texts, the pattern period, space, apostrophe, lower-case
 *    letter, space, upper-case letter is passed over;
 *  - the text after the period is scanned to the next letter or digit,
 *    and only a lower-case letter there makes the period suspicious;
 *  - the word before the period is then isolated (letters, digits and
 *    apostrophes between letters) into testword and compared with the
 *    abbrev list, tested for a leading digit, for being a single
 *    character, for being a Roman numeral and for containing a vowel
 *    after canonical decomposition;
 *  - a surviving word is recorded in qperiod and queried, once per
 *    distinct word unless pswit[VERBOSE_SWITCH] is set.
 *
 * Fix: the scan after the period called isdigit() on a gunichar. The
 * ctype functions are only defined for values representable as unsigned
 * char (or EOF), so any code point above 255 was undefined behaviour.
 * g_unichar_isdigit(), already used elsewhere in this function, accepts
 * any code point.
 */
1842 void check_for_extra_period(const char *aline,const struct warnings *warnings)
1844 const char *s,*t,*s1,*sprev;
1849 gunichar c,nc,pc,*decomposition;
1850 if (pswit[PARANOID_SWITCH])
1852 for (t=aline;t=strstr(t,". ");)
1856 t=g_utf8_next_char(t);
1857 /* start of line punctuation is handled elsewhere */
1860 if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
1862 t=g_utf8_next_char(t);
1865 if (warnings->isDutch)
1867 /* For Frank & Jeroen -- 's Middags case */
1868 gunichar c2,c3,c4,c5;
1869 c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
1870 c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
1871 c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
1872 c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
1873 if (CHAR_IS_APOSTROPHE(c2) &&
1874 g_unichar_islower(c3) && c4==CHAR_SPACE &&
1875 g_unichar_isupper(c5))
1877 t=g_utf8_next_char(t);
1881 s1=g_utf8_next_char(g_utf8_next_char(t));
1882 while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
1883 !g_unichar_isdigit(g_utf8_get_char(s1)))
1884 s1=g_utf8_next_char(s1);
1885 if (g_unichar_islower(g_utf8_get_char(s1)))
1887 /* we have something to investigate */
1889 /* so let's go back and find out */
1890 nc=g_utf8_get_char(t);
1891 s1=g_utf8_prev_char(t);
1892 c=g_utf8_get_char(s1);
1893 sprev=g_utf8_prev_char(s1);
1894 pc=g_utf8_get_char(sprev);
1896 (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||
1897 g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&
1898 g_unichar_isalpha(nc)))
1903 sprev=g_utf8_prev_char(s1);
1904 pc=g_utf8_get_char(sprev);
1906 s1=g_utf8_next_char(s1);
1909 testword=g_strndup(s1,s-s1);
1911 testword=g_strdup(s1);
1912 for (i=0;*abbrev[i];i++)
1913 if (!strcmp(testword,abbrev[i]))
1915 if (g_unichar_isdigit(g_utf8_get_char(testword)))
1917 if (!*g_utf8_next_char(testword))
1919 if (isroman(testword))
1924 for (s=testword;*s;s=g_utf8_next_char(s))
1926 decomposition=g_unicode_canonical_decomposition(
1927 g_utf8_get_char(s),&len);
1928 if (g_utf8_strchr("aeiou",-1,decomposition[0]))
1930 g_free(decomposition);
1934 (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
1936 g_tree_insert(qperiod,g_strdup(testword),
1937 GINT_TO_POINTER(1));
1938 if (pswit[ECHO_SWITCH])
1939 g_print("\n%s\n",aline);
1940 if (!pswit[OVERVIEW_SWITCH])
1941 g_print(" Line %ld column %ld - Extra period?\n",
1942 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1948 t=g_utf8_next_char(t);
1954 * check_for_following_punctuation:
1956 * Check for words usually not followed by punctuation.
/*
 * Annotation (braces, several declarations, the word-extraction loop and
 * the trailing g_print arguments are not shown in this listing): runs
 * only when pswit[TYPO_SWITCH] is set. Each word is lower-cased with
 * g_utf8_strdown() into inword and then:
 *  - if it appears in the nocomma list and the character at s is a
 *    comma, semicolon or colon, punctuation after the word is queried;
 *  - if it appears in the noperiod list and the character at s is a
 *    period or exclamation mark, punctuation after the word is queried.
 * Both lists are scanned up to their empty-string terminator.
 * NOTE(review): presumably s points just past the word when the lists
 * are consulted - the code that positions it is not visible.
 */
1958 void check_for_following_punctuation(const char *aline)
1961 const char *s,*wordstart;
1964 if (pswit[TYPO_SWITCH])
1975 inword=g_utf8_strdown(t,-1);
1977 for (i=0;*nocomma[i];i++)
1978 if (!strcmp(inword,nocomma[i]))
1980 c=g_utf8_get_char(s);
1981 if (c==',' || c==';' || c==':')
1983 if (pswit[ECHO_SWITCH])
1984 g_print("\n%s\n",aline);
1985 if (!pswit[OVERVIEW_SWITCH])
1986 g_print(" Line %ld column %ld - "
1987 "Query punctuation after %s?\n",
1988 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
1994 for (i=0;*noperiod[i];i++)
1995 if (!strcmp(inword,noperiod[i]))
1997 c=g_utf8_get_char(s);
1998 if (c=='.' || c=='!')
2000 if (pswit[ECHO_SWITCH])
2001 g_print("\n%s\n",aline);
2002 if (!pswit[OVERVIEW_SWITCH])
2003 g_print(" Line %ld column %ld - "
2004 "Query punctuation after %s?\n",
2005 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
2019 * Check for commonly mistyped words,
2020 * and digits like 0 for O in a word.
/*
 * Annotation (braces, several declarations, the outer word loop, the
 * istypo assignments and else branches are not shown in this listing,
 * so only the visible statements are described):
 *
 * Words are taken from <aline> with getaword(). For each word:
 *  - mixdigit() hits are queried as a digit inside a word;
 *  - when TYPO_SWITCH or USERTYPO_SWITCH is set, the word is scanned for
 *    an upper-case letter after a lower-case one (with exceptions for
 *    Mc/Mac prefixes and a preceding apostrophe), and a case-folded copy
 *    is kept in testword;
 *  - when TYPO_SWITCH is set, testword is tested against the nostart and
 *    noend lists, a set of unlikely letter sequences (cb, gbt, pbt, tbs,
 *    mrn, ahle, ihle, tbi, tbe, ii), a no-vowel/no-consonant test based
 *    on canonical decomposition, the okword list, isroman() and the
 *    typo list; single letters from the set s l m i j d n are also
 *    considered. Reported words are tracked in qword so that duplicates
 *    are suppressed unless pswit[VERBOSE_SWITCH] is set;
 *  - words found in the usertypo tree are queried as possible scannos;
 *  - in paranoid mode with warnings->digit, a standalone 0 or 1 is
 *    queried.
 *
 * NOTE(review): the digit and query-word reports print the column as
 * wordstart offset +1, but the scanno and standalone reports print
 * offset +2 for the same pointer. One of the two conventions looks off
 * by one - confirm which is intended before changing, since expected
 * output may depend on it.
 */
2022 void check_for_typos(const char *aline,struct warnings *warnings)
2024 const char *s,*t,*nt,*wordstart;
2026 gunichar *decomposition;
2028 int i,vowel,consonant,*dupcnt;
2029 gboolean isdup,istypo,alower;
2032 gsize decomposition_len;
2036 inword=getaword(&s);
2040 continue; /* don't bother with empty lines */
2042 if (mixdigit(inword))
2044 if (pswit[ECHO_SWITCH])
2045 g_print("\n%s\n",aline);
2046 if (!pswit[OVERVIEW_SWITCH])
2047 g_print(" Line %ld column %ld - Query digit in %s\n",
2048 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
2053 * Put the word through a series of tests for likely typos and OCR
2056 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
2060 for (t=inword;*t;t=g_utf8_next_char(t))
2062 c=g_utf8_get_char(t);
2063 nt=g_utf8_next_char(t);
2064 /* lowercase for testing */
2065 if (g_unichar_islower(c))
2067 if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
2070 * We have an uppercase mid-word. However, there are
2072 * Mac and Mc like McGill
2073 * French contractions like l'Abbe
2075 offset=g_utf8_pointer_to_offset(inword,t);
2077 pc=g_utf8_get_char(g_utf8_prev_char(t));
2080 if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
2081 offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
2082 g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
2083 CHAR_IS_APOSTROPHE(pc))
2089 testword=g_utf8_casefold(inword,-1);
2091 if (pswit[TYPO_SWITCH])
2094 * Check for certain unlikely two-letter combinations at word
2097 len=g_utf8_strlen(testword,-1);
2100 for (i=0;*nostart[i];i++)
2101 if (g_str_has_prefix(testword,nostart[i]))
2103 for (i=0;*noend[i];i++)
2104 if (g_str_has_suffix(testword,noend[i]))
2107 /* ght is common, gbt never. Like that. */
2108 if (strstr(testword,"cb"))
2110 if (strstr(testword,"gbt"))
2112 if (strstr(testword,"pbt"))
2114 if (strstr(testword,"tbs"))
2116 if (strstr(testword,"mrn"))
2118 if (strstr(testword,"ahle"))
2120 if (strstr(testword,"ihle"))
2123 * "TBE" does happen - like HEARTBEAT - but uncommon.
2124 * Also "TBI" - frostbite, outbid - but uncommon.
2125 * Similarly "ii" like Hawaii, or Pompeii, and in Roman
2126 * numerals, but "ii" is a common scanno.
2128 if (strstr(testword,"tbi"))
2130 if (strstr(testword,"tbe"))
2132 if (strstr(testword,"ii"))
2135 * Check for no vowels or no consonants.
2136 * If none, flag a typo.
2138 if (!istypo && len>1)
2141 for (t=testword;*t;t=g_utf8_next_char(t))
2143 c=g_utf8_get_char(t);
2145 g_unicode_canonical_decomposition(c,&decomposition_len);
2146 if (c=='y' || g_unichar_isdigit(c))
2148 /* Yah, this is loose. */
2152 else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
2156 g_free(decomposition);
2158 if (!vowel || !consonant)
2162 * Now exclude the word from being reported if it's in
2165 for (i=0;*okword[i];i++)
2166 if (!strcmp(testword,okword[i]))
2169 * What looks like a typo may be a Roman numeral.
2172 if (istypo && isroman(testword))
2174 /* Check the manual list of typos. */
2176 for (i=0;*typo[i];i++)
2177 if (!strcmp(testword,typo[i]))
2180 * Check lowercase s, l, i and m - special cases.
2181 * "j" - often a semi-colon gone wrong.
2182 * "d" for a missing apostrophe - he d
2185 if (!istypo && len==1 &&
2186 g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
2190 dupcnt=g_tree_lookup(qword,testword);
2194 isdup=!pswit[VERBOSE_SWITCH];
2198 dupcnt=g_new0(int,1);
2199 g_tree_insert(qword,g_strdup(testword),dupcnt);
2204 if (pswit[ECHO_SWITCH])
2205 g_print("\n%s\n",aline);
2206 if (!pswit[OVERVIEW_SWITCH])
2208 g_print(" Line %ld column %ld - Query word %s",
2209 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
2211 if (!pswit[VERBOSE_SWITCH])
2212 g_print(" - not reporting duplicates");
2220 /* check the user's list of typos */
2221 if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
2223 if (pswit[ECHO_SWITCH])
2224 g_print("\n%s\n",aline);
2225 if (!pswit[OVERVIEW_SWITCH])
2226 g_print(" Line %ld column %ld - Query possible scanno %s\n",
2227 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
2229 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
2231 if (pswit[PARANOID_SWITCH] && warnings->digit)
2233 /* In paranoid mode, query all 0 and 1 standing alone. */
2234 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
2236 if (pswit[ECHO_SWITCH])
2237 g_print("\n%s\n",aline);
2238 if (!pswit[OVERVIEW_SWITCH])
2239 g_print(" Line %ld column %ld - Query standalone %s\n",
2240 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
2251 * check_for_misspaced_punctuation:
2253 * Look for added or missing spaces around punctuation and quotes.
2254 * If there is a punctuation character like ! with no space on
2255 * either side, suspect a missing!space. If there are spaces on
2256 * both sides , assume a typo. If we see a double quote with no
2257 * space or punctuation on either side of it, assume unspaced
2258 * quotes "like"this.
/*
 * check_for_misspaced_punctuation (annotated listing; braces, several
 * declarations, the pc/c updates at the top of each loop, flag
 * assignments and else branches are not shown here, so only the visible
 * statements are described):
 *
 * Makes several passes over <aline>, each tracking the previous, current
 * and next character (pc, c, nc):
 *  1. punctuation from the set . ? ! , ; : _ with a letter after it
 *     (and a letter before it, or strict punctuation) is queried as a
 *     missing space, with allowances for acronyms and ellipses;
 *     punctuation with a space before it and a space or end of line
 *     after it is queried as spaced punctuation;
 *  2. strict punctuation (? ! , ; :) preceded by a space and not
 *     followed by one is queried as spaced punctuation;
 *  3. a period preceded by a space and followed by a letter is queried
 *     as spaced punctuation;
 *  4. a double quote with no space or punctuation on either side is
 *     queried as unspaced quotes;
 *  5. double-quote parity is tracked in parities->dquote and the
 *     character after each quote is checked against what may follow a
 *     closing or an opening quote (Wrongspaced quotes);
 *  6. a double quote at the start of the line followed by closing
 *     punctuation or a space is queried at column 1;
 *  7. when pswit[SQUOTE_SWITCH] is set, the same parity check is applied
 *     to single quotes that are not inside a word, using
 *     parities->squote.
 *
 * Fix: both opening-quote tests called isdigit() on nc, a gunichar. The
 * ctype functions are only defined for values representable as unsigned
 * char (or EOF), so a code point above 255 after a quote was undefined
 * behaviour. g_unichar_isdigit() accepts any code point and is the
 * digit test used elsewhere in this file.
 */
2260 void check_for_misspaced_punctuation(const char *aline,
2261 struct parities *parities,gboolean isemptyline)
2263 gboolean isacro,isellipsis;
2265 gunichar c,nc,pc,n2c;
2267 c=g_utf8_get_char(aline);
2268 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2269 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2273 nc=g_utf8_get_char(g_utf8_next_char(s));
2274 /* For each character in the line after the first. */
2275 if (g_utf8_strchr(".?!,;:_",-1,c)) /* if it's punctuation */
2277 /* we need to suppress warnings for acronyms like M.D. */
2279 /* we need to suppress warnings for ellipsis . . . */
2282 * If there are letters on both sides of it or
2283 * if it's strict punctuation followed by an alpha.
2285 if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
2286 g_utf8_strchr("?!,;:",-1,c)))
2290 if (g_utf8_pointer_to_offset(aline,s)>2 &&
2291 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
2293 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
2299 if (pswit[ECHO_SWITCH])
2300 g_print("\n%s\n",aline);
2301 if (!pswit[OVERVIEW_SWITCH])
2302 g_print(" Line %ld column %ld - Missing space?\n",
2303 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2308 if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
2311 * If there are spaces on both sides,
2312 * or space before and end of line.
2316 if (g_utf8_pointer_to_offset(aline,s)>2 &&
2317 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
2319 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
2323 if (!isemptyline && !isellipsis)
2325 if (pswit[ECHO_SWITCH])
2326 g_print("\n%s\n",aline);
2327 if (!pswit[OVERVIEW_SWITCH])
2328 g_print(" Line %ld column %ld - "
2329 "Spaced punctuation?\n",linecnt,
2330 g_utf8_pointer_to_offset(aline,s)+1);
2337 /* Split out the characters that CANNOT be preceded by space. */
2338 c=g_utf8_get_char(aline);
2339 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2340 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2344 nc=g_utf8_get_char(g_utf8_next_char(s));
2345 /* for each character in the line after the first */
2346 if (g_utf8_strchr("?!,;:",-1,c))
2348 /* if it's punctuation that _cannot_ have a space before it */
2349 if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
2352 * If nc DOES == space,
2353 * it was already reported just above.
2355 if (pswit[ECHO_SWITCH])
2356 g_print("\n%s\n",aline);
2357 if (!pswit[OVERVIEW_SWITCH])
2358 g_print(" Line %ld column %ld - Spaced punctuation?\n",
2359 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2366 * Special case " .X" where X is any alpha.
2367 * This plugs a hole in the acronym code above.
2368 * Inelegant, but maintainable.
2370 c=g_utf8_get_char(aline);
2371 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2372 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2376 nc=g_utf8_get_char(g_utf8_next_char(s));
2377 /* for each character in the line after the first */
2380 /* if it's a period */
2381 if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
2384 * If the period follows a space and
2385 * is followed by a letter.
2387 if (pswit[ECHO_SWITCH])
2388 g_print("\n%s\n",aline);
2389 if (!pswit[OVERVIEW_SWITCH])
2390 g_print(" Line %ld column %ld - Spaced punctuation?\n",
2391 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2397 c=g_utf8_get_char(aline);
2398 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2399 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2403 nc=g_utf8_get_char(g_utf8_next_char(s));
2404 /* for each character in the line after the first */
2405 if (CHAR_IS_DQUOTE(c))
2407 if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
2408 !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
2409 !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
2411 if (pswit[ECHO_SWITCH])
2412 g_print("\n%s\n",aline);
2413 if (!pswit[OVERVIEW_SWITCH])
2414 g_print(" Line %ld column %ld - Unspaced quotes?\n",
2415 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2421 /* Check parity of quotes. */
2422 nc=g_utf8_get_char(aline);
2423 for (s=aline;*s;s=g_utf8_next_char(s))
2426 nc=g_utf8_get_char(g_utf8_next_char(s));
2427 if (CHAR_IS_DQUOTE(c))
2431 parities->dquote=!parities->dquote;
2432 parity=parities->dquote;
2434 else if (c==CHAR_LD_QUOTE)
2441 if (!g_utf8_strchr("_-.'`/,;:!?)]} ",-1,nc))
2443 if (pswit[ECHO_SWITCH])
2444 g_print("\n%s\n",aline);
2445 if (!pswit[OVERVIEW_SWITCH])
2446 g_print(" Line %ld column %ld - "
2447 "Wrongspaced quotes?\n",
2448 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2456 if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
2457 !g_utf8_strchr("_-/.'`([{$",-1,nc) || !nc)
2459 if (pswit[ECHO_SWITCH])
2460 g_print("\n%s\n",aline);
2461 if (!pswit[OVERVIEW_SWITCH])
2462 g_print(" Line %ld column %ld - "
2463 "Wrongspaced quotes?\n",
2464 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2471 c=g_utf8_get_char(aline);
2472 if (CHAR_IS_DQUOTE(c))
2474 if (g_utf8_strchr(",;:!?)]} ",-1,
2475 g_utf8_get_char(g_utf8_next_char(aline))))
2477 if (pswit[ECHO_SWITCH])
2478 g_print("\n%s\n",aline);
2479 if (!pswit[OVERVIEW_SWITCH])
2480 g_print(" Line %ld column 1 - Wrongspaced quotes?\n",
2486 if (pswit[SQUOTE_SWITCH])
2488 nc=g_utf8_get_char(aline);
2489 for (s=aline;*s;s=g_utf8_next_char(s))
2492 nc=g_utf8_get_char(g_utf8_next_char(s));
2493 if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&
2494 !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
2495 !g_unichar_isalpha(nc)))
2497 parities->squote=!parities->squote;
2498 if (!parities->squote)
2501 if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
2503 if (pswit[ECHO_SWITCH])
2504 g_print("\n%s\n",aline);
2505 if (!pswit[OVERVIEW_SWITCH])
2506 g_print(" Line %ld column %ld - "
2507 "Wrongspaced singlequotes?\n",
2508 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2516 if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
2517 !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
2519 if (pswit[ECHO_SWITCH])
2520 g_print("\n%s\n",aline);
2521 if (!pswit[OVERVIEW_SWITCH])
2522 g_print(" Line %ld column %ld - "
2523 "Wrongspaced singlequotes?\n",
2524 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2535 * check_for_double_punctuation:
2537 * Look for double punctuation like ,. or ,,
2538 * Thanks to DW for the suggestion!
2539 * In books with references, ".," and ".;" are common
2540 * e.g. "etc., etc.," and vol. 1.; vol 3.;
2541 * OTOH, from my initial tests, there are also fairly
2542 * common errors. What to do? Make these cases paranoid?
2543 * ".," is the most common, so warnings->dotcomma is used
2544 * to suppress detailed reporting if it occurs often.
/*
 * Annotation (braces, declarations, the update of c at the top of the
 * loop, the statements inside the French branch and else keywords are
 * not shown in this listing): walks <aline> and, for every pair of
 * adjacent characters that both belong to the set . ? ! , ; : queries
 * double punctuation, except when
 *  - the pair is a repeated period, question mark or exclamation mark,
 *  - the pair is a period followed by a comma and warnings->dotcomma is
 *    off, or
 *  - warnings->isFrench is set and the text at s starts with an ellipsis
 *    combined with one of , ; : ! ? on either side.
 * The French prefixes are tested a second time inside the exemption
 * branch; nc is re-read from the character after s there.
 * NOTE(review): presumably the hidden statements in that inner branch
 * advance s past the ellipsis - confirm against the full source.
 */
2546 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
2550 nc=g_utf8_get_char(aline);
2551 for (s=aline;*s;s=g_utf8_next_char(s))
2554 nc=g_utf8_get_char(g_utf8_next_char(s));
2555 /* for each punctuation character in the line */
2556 if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
2557 g_utf8_strchr(".?!,;:",-1,nc))
2559 /* followed by punctuation, it's a query, unless . . . */
2560 if (c==nc && (c=='.' || c=='?' || c=='!') ||
2561 !warnings->dotcomma && c=='.' && nc==',' ||
2562 warnings->isFrench && g_str_has_prefix(s,",...") ||
2563 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2564 warnings->isFrench && g_str_has_prefix(s,";...") ||
2565 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2566 warnings->isFrench && g_str_has_prefix(s,":...") ||
2567 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2568 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2569 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2570 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2571 warnings->isFrench && g_str_has_prefix(s,"...?"))
2573 if (warnings->isFrench && g_str_has_prefix(s,",...") ||
2574 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2575 warnings->isFrench && g_str_has_prefix(s,";...") ||
2576 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2577 warnings->isFrench && g_str_has_prefix(s,":...") ||
2578 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2579 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2580 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2581 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2582 warnings->isFrench && g_str_has_prefix(s,"...?"))
2585 nc=g_utf8_get_char(g_utf8_next_char(s));
2587 ; /* do nothing for .. !! and ?? which can be legit */
2591 if (pswit[ECHO_SWITCH])
2592 g_print("\n%s\n",aline);
2593 if (!pswit[OVERVIEW_SWITCH])
2594 g_print(" Line %ld column %ld - Double punctuation?\n",
2595 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2604 * check_for_spaced_quotes:
/*
 * Annotation (braces, declarations, the rest of the single_quotes
 * initialiser, the initial assignments of s and any else branches are
 * not shown in this listing):
 *  - every occurrence of space, double quote, space in <aline> is
 *    queried as a spaced doublequote; the search resumes two characters
 *    after each match;
 *  - for each entry of single_quotes, a pattern of space, that quote
 *    character, space is built in a GString and every occurrence is
 *    queried as a spaced singlequote in the same way.
 * The reported column is the start of the match (the leading space).
 * The GString is freed before returning.
 */
2606 void check_for_spaced_quotes(const char *aline)
2610 const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,
2614 while ((t=strstr(s," \" ")))
2616 if (pswit[ECHO_SWITCH])
2617 g_print("\n%s\n",aline);
2618 if (!pswit[OVERVIEW_SWITCH])
2619 g_print(" Line %ld column %ld - Spaced doublequote?\n",
2620 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2623 s=g_utf8_next_char(g_utf8_next_char(t));
2625 pattern=g_string_new(NULL);
2626 for(i=0;i<G_N_ELEMENTS(single_quotes);i++)
2628 g_string_assign(pattern," ");
2629 g_string_append_unichar(pattern,single_quotes[i]);
2630 g_string_append_c(pattern,' ');
2632 while ((t=strstr(s,pattern->str)))
2634 if (pswit[ECHO_SWITCH])
2635 g_print("\n%s\n",aline);
2636 if (!pswit[OVERVIEW_SWITCH])
2637 g_print(" Line %ld column %ld - Spaced singlequote?\n",
2638 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2641 s=g_utf8_next_char(g_utf8_next_char(t));
2644 g_string_free(pattern,TRUE);
2648 * check_for_miscased_genative:
2650 * Check special case of 'S instead of 's at end of word.
/*
 * check_for_miscased_genative:
 *
 * Walk aline one UTF-8 character at a time and report a capital S that
 * directly follows an apostrophe which itself follows a lower-case
 * letter. The test works on three code points: pc, c and nc. The loop
 * ends when nc becomes zero.
 *
 * NOTE(review): the statements that shift pc and c along are not
 * visible in this listing. Presumably c is the character at s, which
 * would make offset+2 the 1-based column of the S - confirm against
 * the full source.
 */
2652 void check_for_miscased_genative(const char *aline)
/* Prime c and nc; nc is forced to 0 for an empty line so the loop is skipped. */
2658 c=g_utf8_get_char(aline);
2659 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2660 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2664 nc=g_utf8_get_char(g_utf8_next_char(s));
2665 if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))
2667 if (pswit[ECHO_SWITCH])
2668 g_print("\n%s\n",aline);
2669 if (!pswit[OVERVIEW_SWITCH])
2670 g_print(" Line %ld column %ld - Capital \"S\"?\n",
2671 linecnt,g_utf8_pointer_to_offset(aline,s)+2);
2679 * check_end_of_line:
2681 * Now check special cases - start and end of line -
2682 * for single and double quotes. Start is sometimes [sic]
2683 * but better to query it anyway.
2684 * While we're here, check for dash at end of line.
/*
 * check_end_of_line:
 *
 * Three checks on the edges of aline, all using UTF-8 aware stepping:
 *   1. Last character is a double or single quote and the character
 *      before it is a space: reported as a spaced quote, with the
 *      character length of the line as the column.
 *   2. First character is a single quote and the second is a space:
 *      reported as a spaced quote at column 1.
 *   3. Only when PARANOID_SWITCH is set and warnings->hyphen is true:
 *      step back from the end of the line over characters that are
 *      <= CHAR_SPACE, then report a single hyphen (one not preceded by
 *      another hyphen). The column printed for this report is the
 *      plain character offset of s, with no +1 added.
 *
 * The visible guard requires the line to be longer than one character
 * before the last-character test runs.
 *
 * NOTE(review): braces are missing from this listing, so it cannot be
 * seen here which of the later checks sit inside that guard.
 */
2686 void check_end_of_line(const char *aline,struct warnings *warnings)
/* lbytes is a byte count; character counts are derived from it below. */
2691 lbytes=strlen(aline);
2692 if (g_utf8_strlen(aline,lbytes)>1)
/* c1 is the last character of the line, c2 the one before it. */
2694 s=g_utf8_prev_char(aline+lbytes);
2695 c1=g_utf8_get_char(s);
2696 c2=g_utf8_get_char(g_utf8_prev_char(s));
2697 if ((CHAR_IS_DQUOTE(c1) || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)
2699 if (pswit[ECHO_SWITCH])
2700 g_print("\n%s\n",aline);
2701 if (!pswit[OVERVIEW_SWITCH])
2702 g_print(" Line %ld column %ld - Spaced quote?\n",linecnt,
2703 g_utf8_strlen(aline,lbytes));
/* Now c1 is the first character of the line and c2 the second. */
2707 c1=g_utf8_get_char(aline);
2708 c2=g_utf8_get_char(g_utf8_next_char(aline));
2709 if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)
2711 if (pswit[ECHO_SWITCH])
2712 g_print("\n%s\n",aline);
2713 if (!pswit[OVERVIEW_SWITCH])
2714 g_print(" Line %ld column 1 - Spaced quote?\n",linecnt);
2719 * Dash at end of line may well be legit - paranoid mode only
2720 * and don't report em-dash at line-end.
2722 if (pswit[PARANOID_SWITCH] && warnings->hyphen)
2724 for (s=g_utf8_prev_char(aline+lbytes);
2725 s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
2727 if (g_utf8_get_char(s)=='-' &&
2728 g_utf8_get_char(g_utf8_prev_char(s))!='-')
2730 if (pswit[ECHO_SWITCH])
2731 g_print("\n%s\n",aline);
2732 if (!pswit[OVERVIEW_SWITCH])
2733 g_print(" Line %ld column %ld - "
2734 "Hyphen at end of line?\n",
2735 linecnt,g_utf8_pointer_to_offset(aline,s));
2742 * check_for_unspaced_bracket:
2744 * Brackets are often unspaced, but shouldn't be surrounded by alpha.
2745 * If so, suspect a scanno like "a]most".
/*
 * check_for_unspaced_bracket:
 *
 * Report any of the six bracket characters { [ ( ) ] } whose previous
 * and next characters are both alphabetic. The loop ends when nc is
 * zero, so a bracket in the last position is never tested against a
 * following character. The column printed is the plain character
 * offset of s, with no +1 added.
 *
 * NOTE(review): the statements that shift pc and c along are not
 * visible in this listing; presumably c is the character at s.
 */
2747 void check_for_unspaced_bracket(const char *aline)
/* Prime c and nc; nc is forced to 0 for an empty line so the loop is skipped. */
2751 c=g_utf8_get_char(aline);
2752 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2753 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2757 nc=g_utf8_get_char(g_utf8_next_char(s));
2760 /* for each bracket character in the line except 1st & last */
2761 if (g_utf8_strchr("{[()]}",-1,c) &&
2762 g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
2764 if (pswit[ECHO_SWITCH])
2765 g_print("\n%s\n",aline);
2766 if (!pswit[OVERVIEW_SWITCH])
2767 g_print(" Line %ld column %ld - Unspaced bracket?\n",
2768 linecnt,g_utf8_pointer_to_offset(aline,s));
2776 * check_for_unpunctuated_endquote:
/*
 * check_for_unpunctuated_endquote:
 *
 * Report a double quote that is classified as closing or neutral and
 * whose preceding character is alphabetic, i.e. a quotation that ends
 * on a letter with no punctuation before the quote mark. Characters
 * that are not double quotes are given the class INVALID_QUOTE and so
 * never match. The column printed is the plain character offset of s,
 * with no +1 added.
 *
 * NOTE(review): the statements that shift pc and c along are not
 * visible in this listing; presumably c is the character at s.
 */
2778 void check_for_unpunctuated_endquote(const char *aline)
/* Prime c and nc; nc is forced to 0 for an empty line so the loop is skipped. */
2783 c=g_utf8_get_char(aline);
2784 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2785 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
/* Only double quotes are classified; everything else is INVALID_QUOTE. */
2789 qc=CHAR_IS_DQUOTE(c)?QUOTE_CLASS(c):INVALID_QUOTE;
2790 nc=g_utf8_get_char(g_utf8_next_char(s));
2791 /* for each character in the line except 1st */
2792 if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) && g_unichar_isalpha(pc))
2794 if (pswit[ECHO_SWITCH])
2795 g_print("\n%s\n",aline);
2796 if (!pswit[OVERVIEW_SWITCH])
2797 g_print(" Line %ld column %ld - "
2798 "endquote missing punctuation?\n",
2799 linecnt,g_utf8_pointer_to_offset(aline,s));
2807 * check_for_html_tag:
2809 * Check for <HTML TAG>.
2811 * If there is a < in the line, followed at some point
2812 * by a > then we suspect HTML.
/*
 * check_for_html_tag:
 *
 * Locate the first < in aline and the first > after it, and report the
 * text between them (both delimiters included) as a suspected HTML
 * tag. The column printed is the 1-based character position of the <.
 * Only the first such pair on the line is examined.
 *
 * NOTE(review): the NULL tests on open and close, and the release of
 * the duplicated tag string, are not visible in this listing; confirm
 * in the full source that tag is freed after printing.
 */
2814 void check_for_html_tag(const char *aline)
2816 const char *open,*close;
2818 open=strchr(aline,'<');
/* Search for the closer starting one character after the opener. */
2821 close=strchr(g_utf8_next_char(open),'>');
2824 if (pswit[ECHO_SWITCH])
2825 g_print("\n%s\n",aline);
2826 if (!pswit[OVERVIEW_SWITCH])
/* close-open+1 bytes: from the < through the > inclusive. */
2828 tag=g_strndup(open,close-open+1);
2829 g_print(" Line %ld column %ld - HTML Tag? %s \n",
2830 linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
2840 * check_for_html_entity:
2842 * Check for &symbol; HTML.
2844 * If there is a & in the line, followed at
2845 * some point by a ; then we suspect HTML.
/*
 * check_for_html_entity:
 *
 * Locate the first ampersand in aline and the first semicolon at or
 * after it. The span between them is scanned for a space; the scan
 * stops at the first space found, which is how text such as a company
 * name containing an ampersand is told apart from an entity. When
 * reported, the text from the ampersand through the semicolon is
 * printed as a suspected HTML symbol.
 *
 * NOTE(review): the column printed here is (amp-aline)+1, a byte
 * offset, whereas check_for_html_tag uses g_utf8_pointer_to_offset. On
 * a line with multi-byte characters before the ampersand the two will
 * disagree - confirm whether that is intended.
 *
 * NOTE(review): the NULL tests on amp and scolon, the test that decides
 * whether the scan reached scolon, and the release of entity are not
 * visible in this listing.
 */
2847 void check_for_html_entity(const char *aline)
2849 const char *s,*amp,*scolon;
2851 amp=strchr(aline,'&');
2854 scolon=strchr(amp,';');
2857 for (s=amp;s<scolon;s=g_utf8_next_char(s))
2858 if (g_utf8_get_char(s)==CHAR_SPACE)
2859 break; /* Don't report "Jones & Son;" */
2862 if (pswit[ECHO_SWITCH])
2863 g_print("\n%s\n",aline);
2864 if (!pswit[OVERVIEW_SWITCH])
/* scolon-amp+1 bytes: from the ampersand through the semicolon inclusive. */
2866 entity=g_strndup(amp,scolon-amp+1);
2867 g_print(" Line %ld column %d - HTML symbol? %s \n",
2868 linecnt,(int)(amp-aline)+1,entity);
2879 * check_for_omitted_punctuation:
2881 * Check for omitted punctuation at end of paragraph by working back
2882 * through prevline. DW.
2883 * Need to check this only for "normal" paras.
2884 * So what is a "normal" para?
2885 * Not normal if one-liner (chapter headings, etc.)
2886 * Not normal if doesn't contain at least one locase letter
2887 * Not normal if starts with space
/*
 * check_for_omitted_punctuation:
 *
 * Decide whether prevline ends without punctuation and, if so, report
 * it against line number linecnt-1 with the character length of
 * prevline as the column.
 *
 * The check only runs when all of these hold:
 *   - prevline contains at least one alphabetic character,
 *   - last->blen is greater than 2,
 *   - start_para_line is less than linecnt-1 (more than one line),
 *   - the first character of prevline is greater than CHAR_SPACE.
 *
 * It then starts at the end of prevline, steps back while the
 * character is a closing or neutral quote, and continues backwards:
 * meeting an alphabetic character produces the report, while meeting
 * one of the characters in the punctuation set on the last visible
 * line is tested separately.
 *
 * NOTE(review): the statements that end the scans (break or return)
 * are not visible in this listing, so the exact exit behaviour after a
 * report or after finding punctuation should be confirmed in the full
 * source.
 */
2889 void check_for_omitted_punctuation(const char *prevline,
2890 struct line_properties *last,int start_para_line)
2892 gboolean letter_on_line=FALSE;
2895 gboolean closing_quote;
/* Any alphabetic character at all marks the line as containing text. */
2896 for (s=prevline;*s;s=g_utf8_next_char(s))
2897 if (g_unichar_isalpha(g_utf8_get_char(s)))
2899 letter_on_line=TRUE;
2903 * This next "if" is a problem.
2904 * If we say "start_para_line <= linecnt - 1", that includes
2905 * one-line "paragraphs" like chapter heads. Lotsa false positives.
2906 * If we say "start_para_line < linecnt - 1" it doesn't, but then it
2907 * misses genuine one-line paragraphs.
2909 if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
2910 g_utf8_get_char(prevline)>CHAR_SPACE)
2912 s=prevline+strlen(prevline);
2915 s=g_utf8_prev_char(s);
2916 c=g_utf8_get_char(s);
2917 if (QUOTE_CLASS(c)==CLOSING_QUOTE || QUOTE_CLASS(c)==NEUTRAL_QUOTE)
2920 closing_quote=FALSE;
2921 } while (closing_quote && s>prevline);
2922 for (;s>prevline;s=g_utf8_prev_char(s))
2924 if (g_unichar_isalpha(g_utf8_get_char(s)))
2926 if (pswit[ECHO_SWITCH])
2927 g_print("\n%s\n",prevline);
2928 if (!pswit[OVERVIEW_SWITCH])
2929 g_print(" Line %ld column %ld - "
2930 "No punctuation at para end?\n",
2931 linecnt-1,g_utf8_strlen(prevline,-1));
2936 if (g_utf8_strchr("—-.:!([{?}])",-1,g_utf8_get_char(s)))
/*
 * report_duplicate_queries:
 *
 * Per-node callback passed to g_tree_foreach() over the qword tree by
 * procfile(). key is treated as the queried word and is printed in a
 * note about how many times it was duplicated.
 *
 * NOTE(review): the remaining g_print arguments (the count, presumably
 * taken from value), any threshold test and the return value are not
 * visible in this listing. For g_tree_foreach a TRUE return stops the
 * traversal - confirm the function returns FALSE.
 */
2942 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
2944 const char *word=key;
2947 g_print("\nNote: Queried word %s was duplicated %d times\n",
/*
 * print_as_windows_1252:
 *
 * Write string to stdout, converting it from UTF-8 to WINDOWS-1252
 * when a converter can be opened. The converter is kept in a static
 * variable so it is opened at most once and reused on later calls.
 * A cleanup path closes the converter and resets it to (GIConv)-1;
 * procfile() calls this function with NULL when it has finished.
 * The visible fputs of the unconverted string is presumably the
 * fallback used when the converter could not be opened.
 *
 * NOTE(review): the test that selects the cleanup path, the
 * termination and output of the converted buffer and the release of
 * buf are not visible in this listing. The g_iconv return value is not
 * checked on the visible line.
 */
2952 void print_as_windows_1252(const char *string)
2954 gsize inbytes,outbytes;
2956 static GIConv converter=(GIConv)-1;
/* Cleanup path: close and forget the cached converter. */
2959 if (converter!=(GIConv)-1)
2960 g_iconv_close(converter);
2961 converter=(GIConv)-1;
/* Lazy open; g_iconv_open takes the target codeset first. */
2964 if (converter==(GIConv)-1)
2965 converter=g_iconv_open("WINDOWS-1252","UTF-8");
2966 if (converter!=(GIConv)-1)
/* The output buffer is sized to the input byte length plus one. */
2968 inbytes=outbytes=strlen(string);
2969 bp=buf=g_malloc(outbytes+1);
2970 g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
2976 fputs(string,stdout);
/*
 * print_as_utf_8:
 *
 * Write string to stdout unchanged; no conversion is applied.
 */
2979 void print_as_utf_8(const char *string)
2981 fputs(string,stdout);
/*
 * procfile:
 *
 * Run every check over one e-text file.
 *
 * The text is loaded with read_etext(); the two fprintf calls print the
 * message from err to stdout or stderr depending on STDOUT_SWITCH,
 * presumably when the load fails. first_pass() and report_first_pass()
 * then produce the warnings settings that gate some per-line checks,
 * and the qword and qperiod trees are created.
 *
 * The main loop pulls lines with flgets(). DP page separator lines are
 * skipped when DP_SWITCH is set. Lines before firstline, or after a
 * non-zero footerline, are skipped too, after optionally echoing the
 * Title, Author, Release Date and Edition lines (HEADER_SWITCH). Each
 * remaining line is passed to the check_for_* routines in a fixed
 * order.
 *
 * After the loop, quote mismatches and pending messages are flushed,
 * duplicate queried words are reported unless OVERVIEW_SWITCH or
 * VERBOSE_SWITCH is set, and the trees, counters, print handler and
 * the WINDOWS-1252 converter are released or reset.
 *
 * NOTE(review): this listing omits short source lines (braces, brief
 * declarations, counter increments, flag assignments, frees), so the
 * nesting of the statements below and the upkeep of linecnt, isnewpara,
 * enddash and prevline cannot be confirmed from here.
 */
2989 void procfile(const char *filename)
2992 gchar *parastart=NULL; /* first line of current para */
2993 gchar *etext,*aline;
2996 struct first_pass_results *first_pass_results;
2997 struct warnings *warnings;
2998 struct counters counters={0};
2999 struct line_properties last={0};
3000 struct parities parities={0};
3001 struct pending pending={0};
3002 gboolean isemptyline;
3003 long start_para_line=0;
3004 gboolean isnewpara=FALSE,enddash=FALSE;
3005 last.start=CHAR_SPACE;
3006 linecnt=checked_linecnt=0;
3007 etext=read_etext(filename,&err);
/* Error report goes to stdout or stderr according to STDOUT_SWITCH. */
3010 if (pswit[STDOUT_SWITCH])
3011 fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
3013 fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
3016 g_print("\n\nFile: %s\n\n",filename);
3017 first_pass_results=first_pass(etext);
3018 warnings=report_first_pass(first_pass_results);
/* qword frees both keys and values; qperiod frees keys only. */
3019 qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
3020 qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
3022 * Here we go with the main pass. Hold onto yer hat!
3026 while ((aline=flgets(&etext_ptr,linecnt+1)))
3031 if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
3032 continue; // skip DP page separators completely
/* Outside the body of the text: optionally echo header fields, then skip. */
3033 if (linecnt<first_pass_results->firstline ||
3034 (first_pass_results->footerline>0 &&
3035 linecnt>first_pass_results->footerline))
3037 if (pswit[HEADER_SWITCH])
3039 if (g_str_has_prefix(aline,"Title:"))
3040 g_print(" %s\n",aline);
3041 if (g_str_has_prefix(aline,"Author:"))
3042 g_print(" %s\n",aline);
3043 if (g_str_has_prefix(aline,"Release Date:"))
3044 g_print(" %s\n",aline);
3045 if (g_str_has_prefix(aline,"Edition:"))
3046 g_print(" %s\n\n",aline);
3048 continue; /* skip through the header */
3051 print_pending(aline,parastart,&pending);
3052 isemptyline=analyse_quotes(aline,&counters);
3053 if (isnewpara && !isemptyline)
3055 /* This line is the start of a new paragraph. */
3056 start_para_line=linecnt;
3057 /* Capture its first line in case we want to report it later. */
3059 parastart=g_strdup(aline);
3060 memset(&parities,0,sizeof(parities)); /* restart the quote count */
/* Skip to the first letter or digit of the paragraph. */
3062 while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
3063 !g_unichar_isdigit(g_utf8_get_char(s)))
3064 s=g_utf8_next_char(s);
3065 if (g_unichar_islower(g_utf8_get_char(s)))
3067 /* and its first letter is lowercase */
3068 if (pswit[ECHO_SWITCH])
3069 g_print("\n%s\n",aline);
3070 if (!pswit[OVERVIEW_SWITCH])
3071 g_print(" Line %ld column %ld - "
3072 "Paragraph starts with lower-case\n",
3073 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
3077 isnewpara=FALSE; /* Signal the end of new para processing. */
3079 /* Check for an em-dash broken at line end. */
3080 if (enddash && g_utf8_get_char(aline)=='-')
3082 if (pswit[ECHO_SWITCH])
3083 g_print("\n%s\n",aline);
3084 if (!pswit[OVERVIEW_SWITCH])
3085 g_print(" Line %ld column 1 - Broken em-dash?\n",linecnt);
/* Step back over trailing spaces, then test for a hyphen at line end. */
/* Presumably this sets enddash for the next line; that line is not visible. */
3090 for (s=g_utf8_prev_char(aline+strlen(aline));
3091 g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
3093 if (s>=aline && g_utf8_get_char(s)=='-')
3095 check_for_control_characters(aline);
3096 check_for_odd_characters(aline,warnings,isemptyline);
3097 if (warnings->longline)
3098 check_for_long_line(aline);
3099 if (warnings->shortline)
3100 check_for_short_line(aline,&last);
/* Record the length and first character of this line in last. */
3102 last.len=g_utf8_strlen(aline,-1);
3103 last.start=g_utf8_get_char(aline);
3104 check_for_starting_punctuation(aline);
3107 check_for_spaced_emdash(aline);
3108 check_for_spaced_dash(aline);
3110 check_for_unmarked_paragraphs(aline);
3111 check_for_jeebies(aline);
3112 check_for_mta_from(aline);
3113 check_for_orphan_character(aline);
3114 check_for_pling_scanno(aline);
3115 check_for_extra_period(aline,warnings);
3116 check_for_following_punctuation(aline);
3117 check_for_typos(aline,warnings);
3118 check_for_misspaced_punctuation(aline,&parities,isemptyline);
3119 check_for_double_punctuation(aline,warnings);
3120 check_for_spaced_quotes(aline);
3121 check_for_miscased_genative(aline);
3122 check_end_of_line(aline,warnings);
3123 check_for_unspaced_bracket(aline);
3124 if (warnings->endquote)
3125 check_for_unpunctuated_endquote(aline);
3126 check_for_html_tag(aline);
3127 check_for_html_entity(aline);
/* Paragraph-end work; presumably guarded by an empty-line test not visible here. */
3130 check_for_mismatched_quotes(&counters,&pending);
3131 counters_reset(&counters);
3132 /* let the next iteration know that it's starting a new para */
3135 check_for_omitted_punctuation(prevline,&last,start_para_line);
/* Keep a copy of this line for the next iteration. */
3138 prevline=g_strdup(aline);
/* End of input: flush whatever is still outstanding. */
3141 check_for_mismatched_quotes(&counters,&pending);
3142 print_pending(NULL,parastart,&pending);
3143 reset_pending(&pending);
3152 if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])
3153 g_tree_foreach(qword,report_duplicate_queries,NULL);
3154 g_tree_unref(qword);
3155 g_tree_unref(qperiod);
3156 counters_destroy(&counters);
3157 g_set_print_handler(NULL);
/* A NULL argument asks print_as_windows_1252 to drop its cached converter. */
3158 print_as_windows_1252(NULL);
3159 if (pswit[MARKUP_SWITCH])
3166 * Get one line from the input text, checking for
3167 * the existence of exactly one CR/LF line-end per line.
3169 * Returns: a pointer to the line.
/*
 * flgets:
 *
 * Take the next line from the text addressed by *etext, advancing
 * *etext one UTF-8 character at a time. theline remembers where the
 * line starts. lcnt is the line number used in messages.
 *
 * When LINE_END_SWITCH is set, three line-ending problems are
 * reported: a LF with no preceding CR, two successive CRs, and a CR
 * that is not followed by LF. With ECHO_SWITCH the offending line text
 * is duplicated up to eos and printed first.
 *
 * Before returning, the line is passed to postprocess_for_HTML when
 * MARKUP_SWITCH is set and to postprocess_for_DP when DP_SWITCH is
 * set, in that order.
 *
 * NOTE(review): the loop header, the tests on c, the handling of eos
 * and isCR, the end-of-text case and the return statement are not
 * visible in this listing. procfile() loops while the return value is
 * non-NULL, so NULL presumably signals end of text - confirm.
 */
3171 char *flgets(char **etext,long lcnt)
3174 gboolean isCR=FALSE;
3175 char *theline=*etext;
/* Read one character and step the caller's cursor past it. */
3180 c=g_utf8_get_char(*etext);
3181 *etext=g_utf8_next_char(*etext);
3184 /* either way, it's end of line */
3191 /* Error - a LF without a preceding CR */
3192 if (pswit[LINE_END_SWITCH])
3194 if (pswit[ECHO_SWITCH])
3196 s=g_strndup(theline,eos-theline);
3197 g_print("\n%s\n",s);
3200 if (!pswit[OVERVIEW_SWITCH])
3201 g_print(" Line %ld - No CR?\n",lcnt);
3212 /* Error - two successive CRs */
3213 if (pswit[LINE_END_SWITCH])
3215 if (pswit[ECHO_SWITCH])
3217 s=g_strndup(theline,eos-theline);
3218 g_print("\n%s\n",s);
3221 if (!pswit[OVERVIEW_SWITCH])
3222 g_print(" Line %ld - Two successive CRs?\n",lcnt);
/* A CR was seen but the character after it is not a LF. */
3231 if (pswit[LINE_END_SWITCH] && isCR)
3233 if (pswit[ECHO_SWITCH])
3235 s=g_strndup(theline,eos-theline);
3236 g_print("\n%s\n",s);
3239 if (!pswit[OVERVIEW_SWITCH])
3240 g_print(" Line %ld column %ld - CR without LF?\n",
3241 lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
3247 eos=g_utf8_next_char(eos);
/* Optional in-place cleaning of the finished line. */
3251 if (pswit[MARKUP_SWITCH])
3252 postprocess_for_HTML(theline);
3253 if (pswit[DP_SWITCH])
3254 postprocess_for_DP(theline);
3261 * Takes a "word" as a parameter, and checks whether it
3262 * contains a mixture of alpha and digits. Generally, this is an
3263 * error, but may not be for cases like 4th or L5 12s. 3d.
3265 * Returns: TRUE iff an error is found.
/*
 * mixdigit:
 *
 * Decide whether checkword suspiciously mixes letters and digits.
 *
 * The first loop scans the whole word, testing each character for
 * alphabetic and digit class. If both kinds were seen, a set of
 * accepted forms is tested against nondigit, the text that follows
 * the leading run of digits:
 *   - ordinal endings st, rd, nd, th in either case,
 *   - their plural forms sts, rds, nds, ths,
 *   - their adverb forms stly, rdly, ndly, thly,
 *   - a lone l in either case, or a lone lower-case s or d,
 *   - a leading capital L followed by text that a recursive call does
 *     not itself flag.
 *
 * NOTE(review): the assignments to wehaveadigit, wehavealetter and
 * query, and the return statement, are not visible in this listing.
 * Presumably query is set when both kinds are present, cleared by each
 * accepted form, and returned - confirm against the full source.
 */
3267 gboolean mixdigit(const char *checkword)
3269 gboolean wehaveadigit,wehavealetter,query;
3270 const char *s,*nondigit;
3271 wehaveadigit=wehavealetter=query=FALSE;
/* Classify every character of the word. */
3272 for (s=checkword;*s;s=g_utf8_next_char(s))
3273 if (g_unichar_isalpha(g_utf8_get_char(s)))
3275 else if (g_unichar_isdigit(g_utf8_get_char(s)))
3277 if (wehaveadigit && wehavealetter)
3279 /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
/* Advance nondigit past the leading digits of the word. */
3281 for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
3282 nondigit=g_utf8_next_char(nondigit))
3284 /* digits, ending in st, rd, nd, th of either case */
3285 if (!g_ascii_strcasecmp(nondigit,"st") ||
3286 !g_ascii_strcasecmp(nondigit,"rd") ||
3287 !g_ascii_strcasecmp(nondigit,"nd") ||
3288 !g_ascii_strcasecmp(nondigit,"th"))
/* Plural ordinals. */
3290 if (!g_ascii_strcasecmp(nondigit,"sts") ||
3291 !g_ascii_strcasecmp(nondigit,"rds") ||
3292 !g_ascii_strcasecmp(nondigit,"nds") ||
3293 !g_ascii_strcasecmp(nondigit,"ths"))
/* Adverb forms of ordinals. */
3295 if (!g_ascii_strcasecmp(nondigit,"stly") ||
3296 !g_ascii_strcasecmp(nondigit,"rdly") ||
3297 !g_ascii_strcasecmp(nondigit,"ndly") ||
3298 !g_ascii_strcasecmp(nondigit,"thly"))
3300 /* digits, ending in l, L, s or d */
/* Only the l test ignores case; s and d must be lower-case. */
3301 if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
3302 !strcmp(nondigit,"d"))
3305 * L at the start of a number, representing Britsh pounds, like L500.
3306 * This is cute. We know the current word is mixed digit. If the first
3307 * letter is L, there must be at least one digit following. If both
3308 * digits and letters follow, we have a genuine error, else we have a
3309 * capital L followed by digits, and we accept that as a non-error.
3311 if (g_utf8_get_char(checkword)=='L' &&
3312 !mixdigit(g_utf8_next_char(checkword)))
3321 * Extracts the first/next "word" from the line, and returns it.
3322 * A word is defined as one English word unit--or at least that's the aim.
3323 * "ptr" is advanced to the position in the line where we will start
3324 * looking for the next word.
3326 * Returns: A newly-allocated string.
/*
 * getaword:
 *
 * Extract the next word starting at *ptr and return it as a newly
 * allocated string (g_string_free with FALSE hands the character data
 * to the caller, who must release it with g_free).
 *
 * Steps:
 *   1. Advance *ptr past characters that are neither digits nor
 *      letters, stopping at the end of the string.
 *   2. Look ahead from s, collecting digits, letters, commas and
 *      periods into word. If within that text a comma or period is
 *      found directly after a digit, the collected text is returned as
 *      one word (a punctuated number).
 *   3. Otherwise word is emptied and refilled from *ptr with digits,
 *      letters and apostrophes only, advancing *ptr as it goes.
 *
 * NOTE(review): the initialisation of s, the update of *ptr on the
 * punctuated-number path and any further conditions between the
 * look-ahead test and its return are not visible in this listing.
 */
3328 gchar *getaword(const char **ptr)
3333 word=g_string_new(NULL);
/* Step 1: skip leading non-word characters. */
3334 for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
3335 !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
3336 **ptr;*ptr=g_utf8_next_char(*ptr))
3339 * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
3340 * Especially yucky is the case of L1,000
3341 * This section looks for a pattern of characters including a digit
3342 * followed by a comma or period followed by one or more digits.
3343 * If found, it returns this whole pattern as a word; otherwise we discard
3344 * the results and resume our normal programming.
3347 for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
3348 g_unichar_isalpha(g_utf8_get_char(s)) ||
3349 g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
3350 g_string_append_unichar(word,g_utf8_get_char(s));
/* Scan from the second character so that a previous character always exists. */
3353 for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))
3355 c=g_utf8_get_char(t);
3356 pc=g_utf8_get_char(g_utf8_prev_char(t));
3357 if ((c=='.' || c==',') && g_unichar_isdigit(pc))
3360 return g_string_free(word,FALSE);
3364 /* we didn't find a punctuated number - do the regular getword thing */
3365 g_string_truncate(word,0);
3366 c=g_utf8_get_char(*ptr);
3367 for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);
3368 *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))
3369 g_string_append_unichar(word,c);
3370 return g_string_free(word,FALSE);
3376 * Is this word a Roman Numeral?
3378 * It doesn't actually validate that the number is a valid Roman Numeral--for
3379 * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
3380 * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
3381 * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
3382 * expressions thereof, except when it came to taxes. Allow any number of M,
3383 * an optional D, an optional CM or CD, any number of optional Cs, an optional
3384 * XL or an optional XC, an optional IX or IV, an optional V and any number
/*
 * isroman:
 *
 * Loose test for whether t looks like a Roman numeral. The visible
 * tests consume, in this order: any run of m, a d, the pairs cm and
 * cd, any run of c, the pairs xl and xc, an l, any run of x, the pairs
 * ix and iv, a v, and any run of i.
 *
 * All comparisons are against lower-case letters.
 *
 * NOTE(review): the statements that advance t after each match, any
 * lower-casing or empty-string guard at the top, and the final return
 * are not visible in this listing; presumably the result is whether t
 * has reached the end of the string. The extra *t test in each while
 * condition cannot change the outcome, since a character equal to a
 * letter is never the terminator.
 */
3387 gboolean isroman(const char *t)
/* Thousands. */
3393 while (g_utf8_get_char(t)=='m' && *t)
/* Hundreds. */
3395 if (g_utf8_get_char(t)=='d')
3397 if (g_str_has_prefix(t,"cm"))
3399 if (g_str_has_prefix(t,"cd"))
3401 while (g_utf8_get_char(t)=='c' && *t)
/* Tens. */
3403 if (g_str_has_prefix(t,"xl"))
3405 if (g_str_has_prefix(t,"xc"))
3407 if (g_utf8_get_char(t)=='l')
3409 while (g_utf8_get_char(t)=='x' && *t)
/* Units. */
3411 if (g_str_has_prefix(t,"ix"))
3413 if (g_str_has_prefix(t,"iv"))
3415 if (g_utf8_get_char(t)=='v')
3417 while (g_utf8_get_char(t)=='i' && *t)
3423 * postprocess_for_DP:
3425 * Invoked with the -d switch from flgets().
3426 * It simply "removes" from the line a hard-coded set of common
3427 * DP-specific tags, so that the line passed to the main routine has
3428 * been pre-cleaned of DP markup.
/*
 * postprocess_for_DP:
 *
 * Remove every occurrence of every string in the DPmarkup table from
 * theline, editing the buffer in place. The table is walked until an
 * entry whose first byte is zero (an empty string) is reached.
 */
3430 void postprocess_for_DP(char *theline)
3436 for (i=0;*DPmarkup[i];i++)
3437 while ((s=strstr(theline,DPmarkup[i])))
/* Close the gap: move the tail, terminator included, over the match. */
3439 t=s+strlen(DPmarkup[i]);
3440 memmove(s,t,strlen(t)+1);
3445 * postprocess_for_HTML:
3447 * Invoked with the -m switch from flgets().
3448 * It simply "removes" from the line a hard-coded set of common
3449 * HTML tags and "replaces" a hard-coded set of common HTML
3450 * entities, so that the line passed to the main routine has
3451 * been pre-cleaned of HTML.
/*
 * postprocess_for_HTML:
 *
 * Clean theline in place: call losemarkup() repeatedly for as long as
 * it returns a non-NULL pointer, then call loseentities() once.
 */
3453 void postprocess_for_HTML(char *theline)
3455 while (losemarkup(theline))
3457 loseentities(theline);
/*
 * losemarkup:
 *
 * Find the first < in theline and the first > at or after it. The text
 * after the < is compared by tagcomp() with each entry of the markup
 * table, which is walked until an empty-string entry is reached. On a
 * match the whole tag, from the < through the >, is removed in place.
 *
 * NOTE(review): the NULL test on s and t, the return values and the
 * handling of an unrecognized tag are not visible in this listing.
 * postprocess_for_HTML() loops while the result is non-NULL, so a
 * non-NULL return presumably means a tag was removed - confirm.
 */
3460 char *losemarkup(char *theline)
3464 s=strchr(theline,'<');
3465 t=s?strchr(s,'>'):NULL;
3468 for (i=0;*markup[i];i++)
3469 if (tagcomp(g_utf8_next_char(s),markup[i]))
/* Step t past the > and move the tail, terminator included, down to s. */
3471 t=g_utf8_next_char(t);
3472 memmove(s,t,strlen(t)+1);
3475 /* It's an unrecognized <xxx>. */
/*
 * loseentities:
 *
 * Replace HTML entities in theline, in place.
 *
 * A cleanup path destroys the entity tree and closes the two iconv
 * converters. Otherwise a tree mapping entity names to code points is
 * built from the HTMLentities table, and two converters are opened on
 * first use: translit (UTF-8 to ISO_8859-1 with transliteration) and
 * to_utf8 (ISO_8859-1 back to UTF-8).
 *
 * For each ampersand that has a semicolon after it, the code point c
 * is taken from a decimal reference, from a hexadecimal reference
 * introduced by x, or from a name looked up in the tree. If c is below
 * 128 or in the range 192 to 255 it is written as UTF-8 directly.
 * Other values are encoded to UTF-8, passed through translit and then
 * to_utf8, and the result is copied into the line. The text after the
 * semicolon is then moved down to close the gap.
 *
 * NOTE(review): entities is declared without static on the visible
 * line, unlike translit and to_utf8. If the tree is meant to persist
 * between calls, or to be released by the cleanup path, that needs
 * checking in the full source.
 *
 * NOTE(review): the test that selects the cleanup path, the test for a
 * numeric reference (the offsets 2 and 3 presumably skip an ampersand
 * and a hash sign), the handling of unknown names and failed
 * conversions, and the frees of s and t are not visible in this
 * listing.
 */
3479 void loseentities(char *theline)
3486 GTree *entities=NULL;
3487 static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
/* Cleanup path. */
3491 g_tree_destroy(entities);
3493 if (translit!=(GIConv)-1)
3494 g_iconv_close(translit);
3495 translit=(GIConv)-1;
3496 if (to_utf8!=(GIConv)-1)
3497 g_iconv_close(to_utf8);
/* Build the name to code point lookup; code points are stored as pointer-sized values. */
3505 entities=g_tree_new((GCompareFunc)strcmp);
3506 for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
3507 g_tree_insert(entities,HTMLentities[i].name,
3508 GUINT_TO_POINTER(HTMLentities[i].c));
/* g_iconv_open takes the target codeset first. */
3510 if (translit==(GIConv)-1)
3511 translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
3512 if (to_utf8==(GIConv)-1)
3513 to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
3514 while((amp=strchr(theline,'&')))
3516 scolon=strchr(amp,';');
/* Decimal reference: only digits between offset 2 and the semicolon. */
3521 if (amp+2+strspn(amp+2,"0123456789")==scolon)
3522 c=strtol(amp+2,NULL,10);
/* Hexadecimal reference: x, then only hex digits up to the semicolon. */
3523 else if (amp[2]=='x' &&
3524 amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
3525 c=strtol(amp+3,NULL,16);
/* Named entity: look up the text between the ampersand and the semicolon. */
3529 s=g_strndup(amp+1,scolon-(amp+1));
3530 c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
/* && binds tighter than ||: c below 128, or c from 192 to 255. */
3539 if (c<128 || c>=192 && c<=255) /* An ISO-8859-1 character */
3540 theline+=g_unichar_to_utf8(c,theline);
/* Round trip through ISO_8859-1 to get a transliterated replacement. */
3544 nb=g_unichar_to_utf8(c,s);
3545 t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
3547 s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
3549 memcpy(theline,s,nb);
/* Pull the rest of the line, terminator included, up behind the replacement. */
3553 memmove(theline,g_utf8_next_char(scolon),
3554 strlen(g_utf8_next_char(scolon))+1);
/* Continue the search one character after this ampersand. */
3557 theline=g_utf8_next_char(amp);
/*
 * tagcomp:
 *
 * Case-insensitive test of whether strin begins with basetag. A
 * leading slash on strin is skipped first, so closing tags match the
 * same table entry as opening tags. Both strings are case-folded with
 * g_utf8_casefold and compared with g_str_has_prefix.
 *
 * NOTE(review): g_utf8_casefold returns newly allocated strings; the
 * frees of s and t and the return of retval are not visible in this
 * listing.
 */
3561 gboolean tagcomp(const char *strin,const char *basetag)
3565 if (g_utf8_get_char(strin)=='/')
3566 t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
3568 t=g_utf8_casefold(strin,-1);
3569 s=g_utf8_casefold(basetag,-1);
3570 retval=g_str_has_prefix(t,s);
3576 void proghelp(GOptionContext *context)
3579 fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
3580 fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
3581 fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
3582 fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
3583 "For details, read the file COPYING.\n",stderr);
3584 fputs("This is Free Software; "
3585 "you may redistribute it under certain conditions (GPL);\n",stderr);
3586 fputs("read the file COPYING for details.\n\n",stderr);
3587 help=g_option_context_get_help(context,TRUE,NULL);
3590 fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
3591 fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
3592 "non-ASCII\n",stderr);
3593 fputs("characters like accented letters, "
3594 "lines longer than 75 or shorter than 55,\n",stderr);
3595 fputs("unbalanced quotes or brackets, "
3596 "a variety of badly formatted punctuation, \n",stderr);
3597 fputs("HTML tags, some likely typos. "
3598 "It is NOT a substitute for human judgement.\n",stderr);