1 /*************************************************************************/
2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com> */
5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk> */
7 /* This program is free software; you can redistribute it and/or modify */
8 /* it under the terms of the GNU General Public License as published by */
9 /* the Free Software Foundation; either version 2 of the License, or */
10 /* (at your option) any later version. */
12 /* This program is distributed in the hope that it will be useful, */
13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
15 /* GNU General Public License for more details. */
17 /* You should have received a copy of the GNU General Public License */
18 /* along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 /*************************************************************************/
30 #include "bookloupe.h"
33 #include "HTMLentities.h"
/*
 * Character-set selection state, written by set_charset().
 * charset_validator holds (GIConv)-1 whenever no iconv converter is open.
 */
35 gchar *charset; /* Or NULL for auto (ISO_8859-1/ASCII or UNICODE) */
36 GIConv charset_validator=(GIConv)-1;
42 "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
43 "nad", "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa",
44 "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
45 "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
46 "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
47 "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
48 "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
49 "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
50 "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
51 "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
52 "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
53 "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
54 "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
55 "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
56 "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
57 "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
58 "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
59 "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
60 "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
61 "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
62 "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
63 "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
64 "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
65 "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
66 "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
67 "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
68 "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
69 "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
70 "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
76 /* Common abbreviations and other OK words not to query as typos. */
78 "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
79 "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
80 "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
81 "outbid", "outbids", "frostbite", "frostbitten", ""
84 /* Common abbreviations that cause otherwise unexplained periods. */
86 "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
87 "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
91 * Two-Letter combinations that rarely if ever start words,
92 * but are common scannos or otherwise common letter combinations.
95 "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
99 * Two-Letter combinations that rarely if ever end words,
100 * but are common scannos or otherwise common letter combinations.
103 "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
104 "sw", "gr", "sl", "cl", "iy", ""
108 "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
109 "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
110 "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
111 "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
115 "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
119 "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
120 "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
121 "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
122 "during", "let", "toward", "among", ""
126 "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
127 "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
128 "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
129 "among", "those", "into", "whom", "having", "thence", ""
/* Indexed by the *_SWITCH constants; the option tables point into it. */
132 gboolean pswit[SWITNO]; /* program switches */
/*
 * Storage for the gutcheck-compatibility toggle options. parse_options()
 * presumably folds these into pswit[] after parsing -- the guarding
 * conditions are not visible in this listing.
 */
135 gboolean typo_compat,paranoid_compat;
/*
 * Main option table. It is registered with the GOption context in
 * parse_options() and is also walked by config_file_update(),
 * config_file_add_comments() and parse_config_file(), all of which test
 * for the "no-" prefix so that each switch is handled once.
 * Boolean entries store into pswit[]; "charset" stores into opt_charset.
 * Entries flagged G_OPTION_FLAG_REVERSE clear their switch when given;
 * G_OPTION_FLAG_HIDDEN entries are left out of --help output.
 * NOTE(review): the loops over this table stop at a NULL long_name; the
 * terminating entry is not visible in this listing.
 */
137 static GOptionEntry options[]={
138 { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
139 "Ignore DP-specific markup", NULL },
140 { "no-dp", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
141 G_OPTION_ARG_NONE, pswit+DP_SWITCH,
142 "Don't ignore DP-specific markup", NULL },
143 { "echo", 0, G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
144 "Echo queried line", NULL },
145 { "no-echo", 'e', G_OPTION_FLAG_REVERSE,
146 G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
147 "Don't echo queried line", NULL },
148 { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
149 "Check single quotes", NULL },
150 { "no-squote", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
151 G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
152 "Don't check single quotes", NULL },
153 { "typo", 0, 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
154 "Check common typos", NULL },
155 { "no-typo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
156 G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
157 "Don't check common typos", NULL },
158 { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
159 "Require closure of quotes on every paragraph", NULL },
160 { "no-qpara", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
161 G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
162 "Don't require closure of quotes on every paragraph", NULL },
163 { "paranoid", 0, G_OPTION_FLAG_HIDDEN,
164 G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
165 "Enable paranoid querying of everything", NULL },
166 { "no-paranoid", 0, G_OPTION_FLAG_REVERSE,
167 G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
168 "Disable paranoid querying of everything", NULL },
169 { "line-end", 0, G_OPTION_FLAG_HIDDEN,
170 G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
171 "Enable line end checking", NULL },
172 { "no-line-end", 'l', G_OPTION_FLAG_REVERSE,
173 G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
174 "Disable line end checking", NULL },
175 { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
176 "Overview: just show counts", NULL },
177 { "no-overview", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
178 G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
179 "Show individual warnings", NULL },
180 { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
181 "Output errors to stdout instead of stderr", NULL },
182 { "no-stdout", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
183 G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
184 "Output errors to stderr instead of stdout", NULL },
185 { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
186 "Echo header fields", NULL },
187 { "no-header", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
188 G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
189 "Don't echo header fields", NULL },
190 { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
191 "Ignore markup in < >", NULL },
192 { "no-markup", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
193 G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
194 "No special handling for markup in < >", NULL },
195 { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
196 "Use file of user-defined typos", NULL },
197 { "no-usertypo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
198 G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
199 "Ignore file of user-defined typos", NULL },
200 { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
201 "Verbose - list everything", NULL },
202 { "no-verbose", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
203 G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
204 "Switch off verbose mode", NULL },
205 { "charset", 0, 0, G_OPTION_ARG_STRING, &opt_charset,
206 "Set of characters valid for this ebook", "NAME" },
211 * Options relating to configuration which make no sense from inside
212 * a configuration file.
/*
 * Command-line-only switches. parse_options() registers this table with
 * the GOption context; the configuration-file routines iterate options[]
 * only, so these entries are never read from or written to a key file.
 */
215 static GOptionEntry config_options[]={
216 { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
217 "Defaults for use on www upload", NULL },
218 { "dump-config", 0, 0, G_OPTION_ARG_NONE, pswit+DUMP_CONFIG_SWITCH,
219 "Dump current config settings", NULL },
/*
 * Gutcheck-compatible toggle switches. parse_options() adds them as a
 * separate "compatibility" option group. They store into typo_compat and
 * paranoid_compat rather than directly into pswit[].
 */
223 static GOptionEntry compatibility_options[]={
224 { "toggle-typo", 't', 0, G_OPTION_ARG_NONE, &typo_compat,
225 "Toggle checking for common typos", NULL },
226 { "toggle-relaxed", 'x', 0, G_OPTION_ARG_NONE, &paranoid_compat,
227 "Toggle both paranoid mode and common typos", NULL },
/*
 * Global tallies. The eleven cnt_* query counters from cnt_quote through
 * cnt_lineend are printed, and summed into the TOTAL QUERIES figure, by
 * main() in overview mode; cnt_spacend is not part of that total.
 */
231 long cnt_quote; /* for overview mode, count of quote queries */
232 long cnt_brack; /* for overview mode, count of brackets queries */
233 long cnt_bin; /* for overview mode, count of non-ASCII queries */
234 long cnt_odd; /* for overview mode, count of odd character queries */
235 long cnt_long; /* for overview mode, count of long line errors */
236 long cnt_short; /* for overview mode, count of short line queries */
237 long cnt_punct; /* for overview mode,
238 count of punctuation and spacing queries */
239 long cnt_dash; /* for overview mode, count of dash-related queries */
240 long cnt_word; /* for overview mode, count of word queries */
241 long cnt_html; /* for overview mode, count of html queries */
242 long cnt_lineend; /* for overview mode, count of line-end queries */
243 long cnt_spacend; /* count of lines with space at end */
244 long linecnt; /* count of total lines in the file */
245 long checked_linecnt; /* count of lines actually checked */
/*
 * Forward declarations for functions whose definitions are not part of
 * this listing excerpt.
 */
247 void proghelp(GOptionContext *context);
248 void procfile(const char *);
252 gboolean mixdigit(const char *);
253 gchar *getaword(const char **);
254 char *flgets(char **,long,int);
255 void postprocess_for_HTML(char *);
256 char *linehasmarkup(char *);
257 char *losemarkup(char *);
258 gboolean tagcomp(const char *,const char *);
259 void loseentities(char *);
260 gboolean isroman(const char *);
261 void postprocess_for_DP(char *);
262 void print_as_windows_1252(const char *string);
263 void print_as_utf_8(const char *string);
/* NOTE(review): presumably lookup sets of queried words and periods -- their use is not visible here; confirm. */
265 GTree *qword,*qperiod;
/*
 * set_charset:
 * @name: character set name, or NULL / "auto" (compared case-insensitively)
 *        for automatic selection.
 * @err:  receives a G_CONVERT_ERROR_NO_CONVERSION error when no converter
 *        can be opened for the requested set.
 *
 * Closes any previously opened validator, records the requested name in
 * the global charset and, for non-Unicode sets, opens an iconv converter
 * in charset_validator.
 *
 * NOTE(review): the return statements are not visible in this listing;
 * parse_options() treats a false result as failure and prints
 * err->message.
 */
271 gboolean set_charset(const char *name,GError **err)
273 /* The various UNICODE encodings all share the same character set. */
274 const char *unicode_aliases[]={ "UCS-2", "UCS-2BE", "UCS-2LE", "UCS-4",
275 "UCS-4BE", "UCS-4LE", "UCS2", "UCS4", "UNICODE", "UNICODEBIG",
276 "UNICODELITTLE", "UTF-7", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE",
277 "UTF-32", "UTF-32BE", "UTF-32LE", "UTF7", "UTF8", "UTF16", "UTF16BE",
278 "UTF16LE", "UTF32", "UTF32BE", "UTF32LE" };
/* Release the converter left over from an earlier call, if any. */
282 if (charset_validator!=(GIConv)-1)
283 g_iconv_close(charset_validator);
/*
 * NOTE(review): g_strcasecmp() is deprecated in GLib;
 * g_ascii_strcasecmp() is the documented replacement.
 */
284 if (!name || !g_strcasecmp(name,"auto"))
287 charset_validator=(GIConv)-1;
291 charset=g_strdup(name);
/* Any spelling of a Unicode encoding is canonicalised to "UTF-8". */
292 for(i=0;i<G_N_ELEMENTS(unicode_aliases);i++)
293 if (!g_strcasecmp(charset,unicode_aliases[i]))
296 charset=g_strdup("UTF-8");
/* For UTF-8 no converter is opened. */
299 if (!strcmp(charset,"UTF-8"))
300 charset_validator=(GIConv)-1;
/* g_iconv_open(to,from): the converter maps UTF-8 into the target set. */
303 charset_validator=g_iconv_open(charset,"UTF-8");
304 if (charset_validator==(GIConv)-1)
306 g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_NO_CONVERSION,
307 "Unknown character set \"%s\"",charset);
/*
 * config_file_update:
 * @kf: key file to receive the current settings.
 *
 * Walks options[] and stores each setting under the "options" group of
 * @kf, keyed by the option's long name. "no-" prefixed entries are tested
 * for separately (the action taken is on a line not shown here, presumably
 * a skip). Boolean options are written with g_key_file_set_boolean() and
 * string options with g_key_file_set_string(); any other argument type
 * reaches g_assert_not_reached().
 */
316 void config_file_update(GKeyFile *kf)
321 for(i=0;options[i].long_name;i++)
323 if (g_str_has_prefix(options[i].long_name,"no-"))
325 if (options[i].arg==G_OPTION_ARG_NONE)
327 sw=*(gboolean *)options[i].arg_data;
/*
 * NOTE(review): the adjustment applied to sw for REVERSE-flagged switches
 * is not visible in this listing -- presumably the value is inverted.
 */
328 if (options[i].flags&G_OPTION_FLAG_REVERSE)
330 g_key_file_set_boolean(kf,"options",options[i].long_name,sw);
332 else if (options[i].arg==G_OPTION_ARG_STRING)
334 s=*(gchar **)options[i].arg_data;
337 g_key_file_set_string(kf,"options",options[i].long_name,s);
340 g_assert_not_reached();
/*
 * config_file_add_comments:
 * @kf: key file to annotate.
 *
 * Sets the file-level comment and then, for entries of options[], attaches
 * the option's description (prefixed with a space) as the comment of the
 * key named after the option in the "options" group.
 */
344 void config_file_add_comments(GKeyFile *kf)
348 g_key_file_set_comment(kf,NULL,NULL," Default configuration for bookloupe",
350 for(i=0;options[i].long_name;i++)
352 if (g_str_has_prefix(options[i].long_name,"no-"))
/*
 * NOTE(review): g_strconcat() returns newly allocated memory; the matching
 * g_free() is not visible in this listing -- confirm it exists.
 */
354 comment=g_strconcat(" ",options[i].description,NULL);
355 g_key_file_set_comment(kf,"options",options[i].long_name,comment,NULL);
/*
 * dump_config:
 *
 * Brings the global config key file up to date with the current option
 * values and serialises it with g_key_file_to_data().
 * NOTE(review): the branch structure is not visible here; it looks as if
 * an existing config is only updated, while a missing one is created and
 * also given explanatory comments. What happens to the serialised string
 * s is on lines not shown.
 */
360 void dump_config(void)
364 config_file_update(config);
367 config=g_key_file_new();
368 config_file_update(config);
369 config_file_add_comments(config);
371 s=g_key_file_to_data(config,NULL,NULL);
/*
 * read_config_file:
 * @full_path: NOTE(review): not referenced on any visible line; presumably
 *             receives the path of the file that was loaded -- confirm.
 *
 * Looks for "bookloupe.ini" in a list of directories and loads the first
 * one that g_key_file_load_from_file() accepts. The list comes from the
 * BOOKLOUPE_CONFIG_PATH environment variable when that is used, otherwise
 * it is the current directory, running_from and the user configuration
 * directory, in that order. Load errors other than "file not found" are
 * reported with g_printerr().
 */
377 GKeyFile *read_config_file(gchar **full_path)
383 const char *search_path;
386 search_path=g_getenv("BOOKLOUPE_CONFIG_PATH");
/*
 * NOTE(review): the two splits are presumably alternatives selected by a
 * platform #ifdef (";" versus ":" separators) on lines not shown.
 */
390 search_dirs=g_strsplit(search_path,";",0);
392 search_dirs=g_strsplit(search_path,":",0);
/*
 * Default search list. Four slots are allocated for three directories;
 * the loop below stops at a NULL entry, so the fourth slot must be set to
 * NULL on a line not visible here.
 */
397 search_dirs=g_new(gchar *,4);
398 search_dirs[0]=g_get_current_dir();
399 search_dirs[1]=g_strdup(running_from);
400 search_dirs[2]=g_strdup(g_get_user_config_dir());
403 for(i=0;search_dirs[i];i++)
405 path=g_build_filename(search_dirs[i],"bookloupe.ini",NULL);
406 if (g_key_file_load_from_file(kf,path,
407 G_KEY_FILE_KEEP_COMMENTS|G_KEY_FILE_KEEP_TRANSLATIONS,&err))
/* A missing file is expected and silent; anything else is reported. */
409 if (!g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
411 g_printerr("Bookloupe: Error reading %s\n",path);
412 g_printerr("%s\n",err->message);
424 g_strfreev(search_dirs);
/*
 * parse_config_file:
 *
 * Loads the configuration through read_config_file() and applies the keys
 * of its "options" group to the matching entries of options[]. Booleans
 * are read with g_key_file_get_boolean() and strings with
 * g_key_file_get_string(). Read errors and keys that match no option are
 * reported with g_printerr().
 */
432 void parse_config_file(void)
439 config=read_config_file(&path);
441 keys=g_key_file_get_keys(config,"options",NULL,NULL);
/* For each key, search options[] for an entry with the same long name. */
448 for(j=0;options[j].long_name;j++)
450 if (g_str_has_prefix(options[j].long_name,"no-"))
452 else if (!strcmp(keys[i],options[j].long_name))
454 if (options[j].arg==G_OPTION_ARG_NONE)
456 sw=g_key_file_get_boolean(config,"options",keys[i],
460 g_printerr("Bookloupe: %s: options.%s: %s\n",
461 path,keys[i],err->message);
/*
 * NOTE(review): the adjustment for REVERSE-flagged switches is on a line
 * not shown -- presumably sw is inverted before being stored.
 */
466 if (options[j].flags&G_OPTION_FLAG_REVERSE)
468 *(gboolean *)options[j].arg_data=sw;
472 else if (options[j].arg==G_OPTION_ARG_STRING)
474 s=g_key_file_get_string(config,"options",keys[i],
478 g_printerr("Bookloupe: %s: options.%s: %s\n",
479 path,keys[i],err->message);
/* Drop the previous string value; "auto" is stored as NULL. */
484 g_free(*(gchar **)options[j].arg_data);
485 if (!g_strcmp0(s,"auto"))
487 *(gchar **)options[j].arg_data=NULL;
491 *(gchar **)options[j].arg_data=s;
496 g_assert_not_reached();
/* Falling off the end of options[] means the key matched nothing. */
499 if (!options[j].long_name)
500 g_printerr("Bookloupe: %s: Unknown option \"%s\" ignored\n",
/*
 * parse_options:
 * @argc: pointer to main()'s argument count.
 * @argv: pointer to main()'s argument vector.
 *
 * Builds a GOption context from options[], config_options[] and the
 * "compatibility" group, parses the command line, and then applies in
 * order: the compatibility toggles, the --web overrides, the requested
 * character set, the --dump-config request and the rule that overview
 * mode switches echoing off.
 */
509 void parse_options(int *argc,char ***argv)
512 GOptionContext *context;
513 GOptionGroup *compatibility;
514 context=g_option_context_new(
515 "file - look for errors in Project Gutenberg(TM) etexts");
516 g_option_context_add_main_entries(context,options,NULL);
517 g_option_context_add_main_entries(context,config_options,NULL);
518 compatibility=g_option_group_new("compatibility",
519 "Options for Compatibility with Gutcheck:",
520 "Show compatibility options",NULL,NULL);
521 g_option_group_add_entries(compatibility,compatibility_options);
522 g_option_context_add_group(context,compatibility);
523 g_option_context_set_description(context,
524 "For simplicity, only the switch options which reverse the\n"
525 "default configuration are listed. In most cases, both vanilla\n"
526 "and \"no-\" prefixed versions are available for use.");
527 if (!g_option_context_parse(context,argc,argv,&err))
529 g_printerr("Bookloupe: %s\n",err->message);
530 g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
/*
 * Compatibility toggles flip the current switch values.
 * NOTE(review): the guarding conditions are on lines not shown --
 * presumably typo_compat for the first flip and paranoid_compat for the
 * following pair.
 */
534 pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
537 pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
538 pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
541 * Web uploads - for the moment, this is really just a placeholder
542 * until we decide what processing we really want to do on web uploads
544 if (pswit[WEB_SWITCH])
546 /* specific override for web uploads */
547 pswit[ECHO_SWITCH]=TRUE;
548 pswit[SQUOTE_SWITCH]=FALSE;
549 pswit[TYPO_SWITCH]=TRUE;
550 pswit[QPARA_SWITCH]=FALSE;
551 pswit[PARANOID_SWITCH]=TRUE;
552 pswit[LINE_END_SWITCH]=FALSE;
553 pswit[OVERVIEW_SWITCH]=FALSE;
554 pswit[STDOUT_SWITCH]=FALSE;
555 pswit[HEADER_SWITCH]=TRUE;
556 pswit[VERBOSE_SWITCH]=FALSE;
557 pswit[MARKUP_SWITCH]=FALSE;
558 pswit[USERTYPO_SWITCH]=FALSE;
559 pswit[DP_SWITCH]=FALSE;
/* An unusable --charset value is reported using the error's message. */
561 if (opt_charset && !set_charset(opt_charset,&err))
563 g_printerr("%s\n",err->message);
566 if (pswit[DUMP_CONFIG_SWITCH])
573 if (pswit[OVERVIEW_SWITCH])
574 /* just print summary; don't echo */
575 pswit[ECHO_SWITCH]=FALSE;
581 g_option_context_free(context);
587 * Read in the user-defined stealth scanno list.
/*
 * read_user_scannos:
 *
 * Looks for a user typo list, trying in order the relative name
 * "bookloupe.typ", the same name under running_from, the relative name
 * "gutcheck.typ" and finally that name under running_from. The contents
 * are brought to UTF-8 (normalised if already valid UTF-8, otherwise
 * converted from WINDOWS-1252), split into lines at CR and LF, and every
 * line whose first byte is greater than '!' is inserted into the usertypo
 * tree.
 *
 * NOTE(review): each fallback tests the same err variable for
 * G_FILE_ERROR_NOENT; it has to be cleared between attempts on lines not
 * visible in this listing.
 */
589 void read_user_scannos(void)
592 gchar *usertypo_file;
596 gchar *contents,*utf8,**lines;
597 usertypo_file=g_strdup("bookloupe.typ");
598 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
599 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
602 g_free(usertypo_file);
603 usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
604 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
606 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
609 g_free(usertypo_file);
610 usertypo_file=g_strdup("gutcheck.typ");
611 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
613 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
616 g_free(usertypo_file);
617 usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
618 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
/*
 * NOTE(review): the message below names only bookloupe.typ although
 * gutcheck.typ has been searched for as well.
 */
620 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
622 g_free(usertypo_file);
623 g_print(" --> I couldn't find bookloupe.typ "
624 "-- proceeding without user typos.\n");
/* NOTE(review): uses fprintf(stderr) where the config code uses g_printerr(). */
629 fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
630 g_free(usertypo_file);
634 if (g_utf8_validate(contents,len,NULL))
636 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
/* NOTE(review): appears to switch the charset to UNICODE for a UTF-8 typo file; any guard is on a line not shown. */
638 (void)set_charset("UNICODE",NULL);
/*
 * NOTE(review): the conversion error is discarded (NULL GError). If
 * g_convert() fails utf8 is NULL; confirm that is handled before the
 * split below.
 */
641 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
643 lines=g_strsplit_set(utf8,"\r\n",0);
/* Keys are released with g_free() by the tree, so inserted lines belong to it. */
645 usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
646 for (i=0;lines[i];i++)
647 if (*(unsigned char *)lines[i]>'!')
648 g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
657 * Read an etext returning a newly allocated string containing the file
658 * contents or NULL on error.
/*
 * read_etext:
 * @filename: path of the etext to load.
 * @err: return location for a GError.
 *
 * Reads the whole file with g_file_get_contents(). Valid UTF-8 input is
 * normalised with G_NORMALIZE_DEFAULT_COMPOSE and printing is routed
 * through print_as_utf_8(). Any other input is converted from
 * WINDOWS-1252 to UTF-8 and printing is routed through
 * print_as_windows_1252(). When the conversion stops at an illegal
 * sequence, the bytes consumed so far are scanned to work out a line and
 * column for the error placed in @err; other conversion errors are
 * propagated unchanged.
 */
660 gchar *read_etext(const char *filename,GError **err)
662 GError *tmp_err=NULL;
663 gchar *contents,*utf8;
664 gsize len,bytes_read,bytes_written;
666 if (!g_file_get_contents(filename,&contents,&len,err))
668 if (g_utf8_validate(contents,len,NULL))
670 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
671 g_set_print_handler(print_as_utf_8);
/* NOTE(review): SetConsoleOutputCP() is a Windows console call; presumably wrapped in a platform #ifdef not shown. */
673 SetConsoleOutputCP(CP_UTF8);
678 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,
679 &bytes_written,&tmp_err);
680 if (g_error_matches(tmp_err,G_CONVERT_ERROR,
681 G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
/* Walk the successfully converted prefix; LF and CR are treated specially when tracking line and col. */
684 for(i=0;i<bytes_read;i++)
685 if (contents[i]=='\n')
690 else if (contents[i]!='\r')
/* contents[bytes_read] is the first byte that could not be converted. */
692 g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
693 "Input conversion failed. Byte %d at line %d, column %d is not a "
694 "valid Windows-1252 character",
695 ((unsigned char *)contents)[bytes_read],line,col);
698 g_propagate_error(err,tmp_err);
699 g_set_print_handler(print_as_windows_1252);
701 SetConsoleOutputCP(1252);
/*
 * cleanup_on_exit:
 *
 * atexit() handler registered by main(); puts back the console output
 * code page that main() saved in saved_cp.
 */
708 void cleanup_on_exit(void)
711 SetConsoleOutputCP(saved_cp);
/*
 * main:
 *
 * Program entry point. Registers the exit handler, records the directory
 * the program was started from, sets the switches that default to on,
 * parses the command line, and in overview mode prints the per-category
 * query counts and their total before releasing global resources.
 * NOTE(review): the calls that read the configuration, load user typos
 * and process the input file are on lines not visible in this listing.
 */
715 int main(int argc,char **argv)
718 atexit(cleanup_on_exit);
719 saved_cp=GetConsoleOutputCP();
721 running_from=g_path_get_dirname(argv[0]);
722 /* Paranoid checking is turned OFF, not on, by its switch */
723 pswit[PARANOID_SWITCH]=TRUE;
724 /* if running in paranoid mode, typo checks default to enabled */
725 pswit[TYPO_SWITCH]=TRUE;
726 /* Line-end checking is turned OFF, not on, by its switch */
727 pswit[LINE_END_SWITCH]=TRUE;
728 /* Echoing is turned OFF, not on, by its switch */
729 pswit[ECHO_SWITCH]=TRUE;
731 parse_options(&argc,&argv);
732 if (pswit[USERTYPO_SWITCH])
734 fprintf(stderr,"bookloupe: Check and report on an e-text\n");
736 if (pswit[OVERVIEW_SWITCH])
738 g_print(" Checked %ld lines of %ld (head+foot = %ld)\n\n",
739 checked_linecnt,linecnt,linecnt-checked_linecnt);
740 g_print(" --------------- Queries found --------------\n");
742 g_print(" Long lines: %14ld\n",cnt_long);
744 g_print(" Short lines: %14ld\n",cnt_short);
746 g_print(" Line-end problems: %14ld\n",cnt_lineend);
748 g_print(" Common typos: %14ld\n",cnt_word);
750 g_print(" Unmatched quotes: %14ld\n",cnt_quote);
752 g_print(" Unmatched brackets: %14ld\n",cnt_brack);
754 g_print(" Non-ASCII characters: %14ld\n",cnt_bin);
756 g_print(" Proofing characters: %14ld\n",cnt_odd);
758 g_print(" Punctuation & spacing queries: %14ld\n",cnt_punct);
760 g_print(" Non-standard dashes: %14ld\n",cnt_dash);
762 g_print(" Possible HTML tags: %14ld\n",cnt_html);
/* The total is the sum of the eleven query counters; cnt_spacend is not included. */
764 g_print(" TOTAL QUERIES %14ld\n",
765 cnt_quote+cnt_brack+cnt_bin+cnt_odd+cnt_long+cnt_short+cnt_punct+
766 cnt_dash+cnt_word+cnt_html+cnt_lineend);
768 g_free(running_from);
770 g_tree_unref(usertypo);
/* Passing NULL makes set_charset() close any open validator. */
771 set_charset(NULL,NULL);
773 g_key_file_free(config);
/*
 * count_dashes:
 * @line: one line of text (UTF-8).
 * @dash: the dash string to look for; first_pass() passes "--" and an
 *        em-dash.
 * @results: tallies updated according to whether the dashes found are
 *           surrounded by white space.
 *
 * Splits @line on @dash and inspects the character on either side of each
 * occurrence.
 */
777 void count_dashes(const char *line,const char *dash,
778 struct dash_results *results)
783 gboolean spaced=FALSE,unspaced=FALSE,spaced2=FALSE;
786 tokens=g_strsplit(line,dash,0);
789 for(i=1;tokens[i];i++)
/*
 * pc is the last character of the preceding token, nc the first character
 * of the following one.
 * NOTE(review): when tokens[i-1] is empty (the line starts with the dash,
 * or two dashes are adjacent) g_utf8_prev_char() is handed the start of
 * the string and steps in front of it -- confirm this cannot read out of
 * bounds.
 */
791 pc=g_utf8_get_char(g_utf8_prev_char(tokens[i-1]+strlen(tokens[i-1])));
792 nc=g_utf8_get_char(tokens[i]);
793 if (g_unichar_isspace(pc) || g_unichar_isspace(nc))
795 if (g_unichar_isspace(pc) && g_unichar_isspace(nc))
797 else if (!g_unichar_isspace(pc) && !g_unichar_isspace(nc))
803 /* count of lines with em-dashes with spaces both sides */
804 results->non_PG_space++;
806 /* count of lines with PG-type em-dashes with no spaces */
814 * Run a first pass - verify that it's a valid PG
815 * file, decide whether to report some things that
816 * occur many times in the text like long or short
817 * lines, non-standard dashes, etc.
/*
 * first_pass:
 * @etext: the complete text as one NUL-terminated UTF-8 string.
 *
 * Splits the text into lines, works out the newline convention, looks for
 * the PG header and footer markers and accumulates whole-file statistics
 * (line lengths, character counts, dashes, markup, language hints and so
 * on) in a function-static results structure.
 *
 * NOTE(review): the return statement is not visible; presumably the
 * address of the static results is returned. Because results is static,
 * counts would carry over if this were called twice in one run.
 */
819 struct first_pass_results *first_pass(const char *etext)
821 gunichar laststart=CHAR_SPACE;
826 unsigned int lastlen=0,lastblen=0;
827 long spline=0,nspline=0;
828 static struct first_pass_results results={0};
829 struct dash_results tmp_dash_results;
/* Try LF as the line separator first. */
832 lines=g_strsplit(etext,"\n",0);
835 /* An empty etext has no terminators */
836 results.newlines=DOS_NEWLINES;
841 * If there are no LFs, we don't have UNIX-style
842 * terminators, but we might have OS9-style ones.
844 results.newlines=OS9_NEWLINES;
846 lines=g_strsplit(etext,"\r",0);
847 if (!lines[0] || !lines[1])
848 /* Looks like we don't have any terminators at all */
849 results.newlines=DOS_NEWLINES;
853 /* We might have UNIX-style terminators */
854 results.newlines=UNIX_NEWLINES;
856 for (j=0;lines[j];j++)
858 lbytes=strlen(lines[j]);
/* A CR left at the end of a line marks the file as DOS; all trailing CRs are stripped. */
859 if (lbytes>0 && lines[j][lbytes-1]=='\r')
861 results.newlines=DOS_NEWLINES;
864 lines[j][--lbytes]='\0';
865 } while (lbytes>0 && lines[j][lbytes-1]=='\r');
/* llen counts characters, lbytes counts bytes. */
867 llen=g_utf8_strlen(lines[j],lbytes);
/* Old-form header end: "*END ... SMALL PRINT ... PUBLIC DOMAIN/COPYRIGHT". */
869 if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
870 (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
873 g_print(" --> Duplicate header?\n");
874 spline=linecnt+1; /* first line of non-header text, that is */
/* New-form header end: a line starting "*** START" that mentions PROJECT GUTENBERG. */
876 if (!strncmp(lines[j],"*** START",9) &&
877 strstr(lines[j],"PROJECT GUTENBERG"))
880 g_print(" --> Duplicate header?\n");
881 nspline=linecnt+1; /* first line of non-header text, that is */
/* Once a header has been seen, look for a footer: "end" before "project gutenberg", case-insensitively. */
883 if (spline || nspline)
885 lc_line=g_utf8_strdown(lines[j],lbytes);
886 if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
888 if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
890 if (results.footerline)
892 /* it's an old-form header - we can detect duplicates */
894 g_print(" --> Duplicate footer?\n");
897 results.footerline=linecnt;
903 results.firstline=spline;
905 results.firstline=nspline; /* override with new */
906 if (results.footerline)
907 continue; /* don't count the boilerplate in the footer */
908 results.totlen+=llen;
/* Per-character scan of the line: code points above 127, alphabetic characters and double quotes. */
909 for (s=lines[j];*s;s=g_utf8_next_char(s))
911 if (g_utf8_get_char(s)>127)
913 if (g_unichar_isalpha(g_utf8_get_char(s)))
917 if (CHAR_IS_DQUOTE(g_utf8_get_char(s)))
918 qc=QUOTE_CLASS(g_utf8_get_char(s));
/* A closing or neutral quote directly after a letter counts as an unpunctuated endquote. */
921 if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) &&
922 g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
923 results.endquote_count++;
926 if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
927 lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
930 g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
932 if (strstr(lines[j],".,"))
934 /* only count ast lines for ignoring purposes where there is */
935 /* locase text on the line */
936 if (strchr(lines[j],'*'))
938 for (s=lines[j];*s;s=g_utf8_next_char(s))
939 if (g_unichar_islower(g_utf8_get_char(s)))
944 if (strchr(lines[j],'/'))
945 results.fslashline++;
/*
 * Skip back over trailing white space, then test for a single hyphen at
 * the end of the line.
 * NOTE(review): on an empty line g_utf8_prev_char() would step before the
 * start of the string unless a guard on a line not shown prevents it.
 */
948 for (s=g_utf8_prev_char(lines[j]+lbytes);
949 s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;
950 s=g_utf8_prev_char(s))
952 if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
953 g_utf8_get_char(g_utf8_prev_char(s))!='-')
956 if (llen>LONGEST_PG_LINE)
958 if (llen>WAY_TOO_LONG)
959 results.verylongline++;
/* i is the distance from the first '<' to the first '>' on the line, inclusive. */
960 if (strchr(lines[j],'<') && strchr(lines[j],'>'))
962 i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
965 if (strstr(lines[j],"<i>"))
966 results.htmcount+=4; /* bonus marks! */
968 /* Check for spaced em-dashes */
/* Dash tallies are per line: each kind adds at most one to the file totals. */
969 memset(&tmp_dash_results,0,sizeof(tmp_dash_results));
970 count_dashes(lines[j],"--",&tmp_dash_results);
971 count_dashes(lines[j],"—",&tmp_dash_results);
972 if (tmp_dash_results.base)
973 results.emdash.base++;
974 if (tmp_dash_results.non_PG_space)
975 results.emdash.non_PG_space++;
976 if (tmp_dash_results.PG_space)
977 results.emdash.PG_space++;
/* Word-level hints; how inword is obtained is on lines not shown. */
981 if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))
982 results.Dutchcount++;
983 if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))
984 results.Frenchcount++;
985 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
986 results.standalone_digit++;
989 /* Check for spaced dashes */
/* Matches " -" not followed by a second '-'. */
990 if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
/* NOTE(review): stores the first byte of the line, not its first UTF-8 character. */
994 laststart=lines[j][0];
1001 * report_first_pass:
1003 * Make some snap decisions based on the first pass results.
/*
 * report_first_pass:
 * @results: statistics gathered by first_pass(); firstline may be reset
 *           to 0 here.
 *
 * Prints advisory "-->" messages and decides, category by category,
 * whether individual queries are worth reporting or occur too often to be
 * useful. The decisions are kept in a function-static struct warnings.
 * May also switch pswit[MARKUP_SWITCH] on when the text looks like HTML.
 *
 * NOTE(review): the return statement is not visible; presumably the
 * address of the static warnings structure is returned.
 */
1005 struct warnings *report_first_pass(struct first_pass_results *results)
1007 static struct warnings warnings={0};
1008 warnings.newlines=results->newlines;
1009 if (warnings.newlines==UNIX_NEWLINES)
1010 g_print(" --> No lines in this file have a CR. Not reporting them. "
1011 "Project Gutenberg requires that all lineends be CR-LF.\n");
1012 else if (warnings.newlines==OS9_NEWLINES)
1013 g_print(" --> No lines in this file have a LF. Not reporting them. "
1014 "Project Gutenberg requires that all lineends be CR-LF.\n");
1016 g_print(" --> %ld lines in this file have white space at end\n",
/* Each category starts enabled and is switched off past its threshold. */
1018 warnings.dotcomma=1;
1019 if (results->dotcomma>5)
1021 warnings.dotcomma=0;
1022 g_print(" --> %ld lines in this file contain '.,'. "
1023 "Not reporting them.\n",results->dotcomma);
1026 * If more than 50 lines, or one-tenth, are short,
1027 * don't bother reporting them.
1029 warnings.shortline=1;
/* The one-tenth test is made against the global linecnt. */
1030 if (results->shortline>50 || results->shortline*10>linecnt)
1032 warnings.shortline=0;
1033 g_print(" --> %ld lines in this file are short. "
1034 "Not reporting short lines.\n",results->shortline);
1037 * If more than 50 lines, or one-tenth, are long,
1038 * don't bother reporting them.
1040 warnings.longline=1;
1041 if (results->longline>50 || results->longline*10>linecnt)
1043 warnings.longline=0;
1044 g_print(" --> %ld lines in this file are long. "
1045 "Not reporting long lines.\n",results->longline);
1047 /* If more than 10 lines contain asterisks, don't bother reporting them. */
1049 if (results->astline>10)
1052 g_print(" --> %ld lines in this file contain asterisks. "
1053 "Not reporting them.\n",results->astline);
1056 * If more than 10 lines contain forward slashes,
1057 * don't bother reporting them.
1060 if (results->fslashline>10)
1063 g_print(" --> %ld lines in this file contain forward slashes. "
1064 "Not reporting them.\n",results->fslashline);
1067 * If more than 20 lines contain unpunctuated endquotes,
1068 * don't bother reporting them.
1070 warnings.endquote=1;
1071 if (results->endquote_count>20)
1073 warnings.endquote=0;
1074 g_print(" --> %ld lines in this file contain unpunctuated endquotes. "
1075 "Not reporting them.\n",results->endquote_count);
1078 * If more than 15 lines contain standalone digits,
1079 * don't bother reporting them.
/* NOTE(review): the comment above says 15 but the test below uses 10 -- one of them is out of date. */
1082 if (results->standalone_digit>10)
1085 g_print(" --> %ld lines in this file contain standalone 0s and 1s. "
1086 "Not reporting them.\n",results->standalone_digit);
1089 * If more than 20 lines contain hyphens at end,
1090 * don't bother reporting them.
1093 if (results->hyphens>20)
1096 g_print(" --> %ld lines in this file have hyphens at end. "
1097 "Not reporting them.\n",results->hyphens);
/* A high markup score turns markup handling on unless it is already on. */
1099 if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
1101 g_print(" --> Looks like this is HTML. Switching HTML mode ON.\n");
1102 pswit[MARKUP_SWITCH]=1;
1104 if (results->verylongline>0)
1105 g_print(" --> %ld lines in this file are VERY long!\n",
1106 results->verylongline);
1108 * If there are more non-PG spaced dashes than PG em-dashes,
1109 * assume it's deliberate.
1110 * Current PG guidelines say don't use them, but older texts do,
1111 * and some people insist on them whatever the guidelines say.
1114 if (results->spacedash+results->emdash.non_PG_space>
1115 results->emdash.PG_space)
1118 g_print(" --> There are %ld spaced dashes and em-dashes. "
1119 "Not reporting them.\n",
1120 results->spacedash+results->emdash.non_PG_space);
1126 /* Charset ISO_8859-1/ASCII checks for compatibility with gutcheck */
1128 /* If more than a quarter of characters are hi-bit, bug out. */
1129 if (results->binlen*4>results->totlen)
1131 g_print(" --> This file does not appear to be ASCII. "
1132 "Terminating. Best of luck with it!\n");
/* Fewer than a quarter of the characters being alphabetic is treated as "not text". */
1135 if (results->alphalen*4<results->totlen)
1137 g_print(" --> This file does not appear to be text. "
1138 "Terminating. Best of luck with it!\n");
1141 if (results->binlen*100>results->totlen || results->binlen>100)
1143 g_print(" --> There are a lot of foreign letters here. "
1144 "Not reporting them.\n");
1145 if (!pswit[VERBOSE_SWITCH])
1149 warnings.isDutch=FALSE;
1150 if (results->Dutchcount>50)
1152 warnings.isDutch=TRUE;
1153 g_print(" --> This looks like Dutch - "
1154 "switching off dashes and warnings for 's Middags case.\n");
1156 warnings.isFrench=FALSE;
1157 if (results->Frenchcount>50)
1159 warnings.isFrench=TRUE;
1160 g_print(" --> This looks like French - "
1161 "switching off some doublepunct.\n");
1163 if (results->firstline && results->footerline)
1164 g_print(" The PG header and footer appear to be already on.\n");
1167 if (results->firstline)
1168 g_print(" The PG header is on - no footer.\n");
1169 if (results->footerline)
1170 g_print(" The PG footer is on - no header.\n");
/* Verbose mode re-enables the categories switched off above. */
1173 if (pswit[VERBOSE_SWITCH])
1175 warnings.shortline=1;
1176 warnings.dotcomma=1;
1177 warnings.longline=1;
1183 warnings.endquote=1;
1184 g_print(" *** Verbose output is ON -- you asked for it! ***\n");
1186 if (warnings.isDutch)
/* Header and footer fewer than 100 lines apart: the header position is discarded. */
1188 if (results->footerline>0 && results->firstline>0 &&
1189 results->footerline>results->firstline &&
1190 results->footerline-results->firstline<100)
1192 g_print(" --> I don't really know where this text starts. \n");
1193 g_print(" There are no reference points.\n");
1194 g_print(" I'm going to have to report the header and footer "
1196 results->firstline=0;
1204 * Look along the line, accumulate the count of quotes, and see
1205 * if this is an empty line - i.e. a line with nothing on it
1207 * If line has just spaces, period, * and/or - on it, don't
1208 * count it, since empty lines with asterisks or dashes to
1209 * separate sections are common.
1211 * Returns: TRUE if the line is empty.
/*
 * Walks <aline> one UTF-8 character at a time. Double quotes are always
 * passed to count_quote(); single quotes are classified (apostrophe,
 * opening, closing or neutral quote) only when SQUOTE_SWITCH is set.
 * Underscores and brackets are tallied in <counters>, and any character
 * other than the spacer characters tested below clears isemptyline.
 * Errors reported by count_quote() through tmp_err are printed with the
 * line and column and then cleared.
 */
1213 gboolean analyse_quotes(const char *aline,struct counters *counters)
1216 /* assume the line is empty until proven otherwise */
1217 gboolean isemptyline=TRUE;
1218 const char *s=aline,*sprev,*snext;
1221 GError *tmp_err=NULL;
1224 snext=g_utf8_next_char(s);
1225 c=g_utf8_get_char(s);
1226 if (CHAR_IS_DQUOTE(c))
1227 (void)count_quote(counters,c,QUOTE_CLASS(c),&tmp_err);
1228 else if (CHAR_IS_SQUOTE(c) && pswit[SQUOTE_SWITCH])
1233 * At start of line, it can only be a quotation mark.
1234 * Hardcode a very common exception!
1236 if (!g_str_has_prefix(snext,"tis") &&
1237 !g_str_has_prefix(snext,"Tis"))
1238 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
1240 else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
1241 g_unichar_isalpha(g_utf8_get_char(snext)))
1242 /* Do nothing! it's definitely an apostrophe, not a quote */
1244 /* it's outside a word - let's check it out */
1245 else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||
1246 g_unichar_isalpha(g_utf8_get_char(snext)))
1248 /* certainly looks like a quotation mark */
1249 if (!g_str_has_prefix(snext,"tis") &&
1250 !g_str_has_prefix(snext,"Tis"))
1251 /* hardcode a very common exception! */
/*
 * The previous character is a full Unicode code point, so it has to be
 * looked up with g_utf8_strchr(). strchr() converts its argument to
 * char, which would let unrelated characters match (for example
 * U+012E truncates to '.').
 */
1253 if (g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(sprev)))
1254 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
1256 (void)count_quote(counters,c,OPENING_QUOTE,&tmp_err);
1261 /* now - is it a quotation mark? */
1262 guessquote=0; /* accumulate clues */
1263 if (g_unichar_isalpha(g_utf8_get_char(sprev)))
1265 /* it follows a letter - could be either */
1267 if (g_utf8_get_char(sprev)=='s')
1269 /* looks like a plural apostrophe */
1271 if (g_utf8_get_char(snext)==CHAR_SPACE)
1275 if (innermost_quote_matches(counters,c))
1277 * Give it the benefit of some doubt,
1278 * if a squote is already open.
1284 (void)count_quote(counters,c,CLOSING_QUOTE,&tmp_err);
1287 /* no adjacent letter - it must be a quote of some kind */
1288 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
/* Report whatever count_quote() left in tmp_err, then reset it. */
1293 if (pswit[ECHO_SWITCH])
1294 g_print("\n%s\n",aline);
1295 if (!pswit[OVERVIEW_SWITCH])
1296 g_print(" Line %ld column %ld - %s\n",
1297 linecnt,g_utf8_pointer_to_offset(aline,s)+1,tmp_err->message);
1298 g_clear_error(&tmp_err);
1300 if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
1302 isemptyline=FALSE; /* ignore lines like * * * as spacers */
1303 if (c==CHAR_UNDERSCORE)
1304 counters->c_unders++;
/*
 * A '[' that starts the line with "[Illustration:" while no illustration
 * or square bracket is outstanding is counted as an illustration opener;
 * any other '[' is counted as an ordinary opening bracket.
 */
1305 if (c==CHAR_OPEN_SBRACK)
1307 if (!matching_difference(counters,COUNTER_ILLUSTRATION) &&
1308 !matching_difference(counters,c) && s==aline &&
1309 g_str_has_prefix(s,"[Illustration:"))
1310 increment_matching(counters,COUNTER_ILLUSTRATION,TRUE);
1312 increment_matching(counters,c,TRUE);
1314 else if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK)
1315 increment_matching(counters,c,TRUE);
/* The illustration closer is only recognised as the last character on the line. */
1316 if (c==CHAR_CLOSE_SBRACK)
1318 if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) &&
1319 !matching_difference(counters,c) && !*snext)
1320 increment_matching(counters,COUNTER_ILLUSTRATION,FALSE);
1322 increment_matching(counters,c,FALSE);
1324 else if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK)
1325 increment_matching(counters,c,FALSE);
1333 * check_for_control_characters:
1335 * Check for invalid or questionable characters in the line
1336 * Anything above 127 is invalid for plain ASCII, and
1337 * non-printable control characters should also be flagged.
1338 * Tabs should generally not be there.
/*
 * Scans <aline> for characters below CHAR_SPACE other than LF, CR and
 * TAB. For such a character the line is echoed under ECHO_SWITCH and,
 * unless OVERVIEW_SWITCH is set, a "Control character" message with the
 * 1-based column and the code point is printed.
 */
1340 void check_for_control_characters(const char *aline)
1344 for (s=aline;*s;s=g_utf8_next_char(s))
1346 c=g_utf8_get_char(s);
1347 if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
1349 if (pswit[ECHO_SWITCH])
1350 g_print("\n%s\n",aline);
1351 if (!pswit[OVERVIEW_SWITCH])
/*
 * g_utf8_pointer_to_offset() takes the start of the string first and the
 * position second; with the arguments swapped it returns a negative
 * offset and the reported column is wrong.
 */
1352 g_print(" Line %ld column %ld - Control character %u\n",
1353 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1361 * check_for_odd_characters:
1363 * Check for binary and other odd characters.
/*
 * Scans <aline> character by character and queries characters that are
 * unusual in a plain-text e-book: non-ASCII / non-ISO-8859 characters
 * (when warnings->bin is set), unassigned or Private Use code points or
 * characters that do not convert to <charset> (when a charset is set),
 * tabs, tildes, carats, forward slashes (when warnings->fslash is set)
 * and, in paranoid mode, asterisks. The e* flags keep each kind of
 * warning from being repeated on the same line. Each report echoes the
 * line under ECHO_SWITCH and prints its message unless OVERVIEW_SWITCH
 * is set.
 */
1365 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
1366 gboolean isemptyline)
1368 /* Don't repeat multiple warnings on one line. */
1369 gboolean eInvalidChar=FALSE,eTab=FALSE,eTilde=FALSE;
1370 gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
1375 for (s=aline;*s;s=g_utf8_next_char(s))
1377 c=g_utf8_get_char(s);
/* Control characters (other than tab and newline) and anything above 127. */
1378 if (warnings->bin && !eInvalidChar &&
1379 (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
1381 if (pswit[ECHO_SWITCH])
1382 g_print("\n%s\n",aline);
1383 if (!pswit[OVERVIEW_SWITCH])
/* 128-159 and anything above 255 are reported as outside ISO-8859. */
1384 if (c>127 && c<160 || c>255)
1385 g_print(" Line %ld column %ld - "
1386 "Non-ISO-8859 character %u\n",
1387 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1389 g_print(" Line %ld column %ld - "
1390 "Non-ASCII character %u\n",
1391 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1396 if (!eInvalidChar && charset)
/* No iconv validator available: fall back to generic Unicode checks. */
1398 if (charset_validator==(GIConv)-1)
1400 if (!g_unichar_isdefined(c))
1402 if (pswit[ECHO_SWITCH])
1403 g_print("\n%s\n",aline);
1404 if (!pswit[OVERVIEW_SWITCH])
1405 g_print(" Line %ld column %ld - Unassigned UNICODE "
1406 "code point U+%04" G_GINT32_MODIFIER "X\n",
1407 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
/*
 * Private Use ranges: U+E000-U+F8FF, U+F0000-U+FFFFD and
 * U+100000-U+10FFFD. The last range must start at hexadecimal
 * 0x100000; a decimal 100000 would wrongly include most assigned
 * supplementary-plane characters.
 */
1412 else if (c>=0xE000 && c<=0xF8FF || c>=0xF0000 && c<=0xFFFFD ||
1413 c>=0x100000 && c<=0x10FFFD)
1415 if (pswit[ECHO_SWITCH])
1416 g_print("\n%s\n",aline);
1417 if (!pswit[OVERVIEW_SWITCH])
1418 g_print(" Line %ld column %ld - Private Use "
1419 "character U+%04" G_GINT32_MODIFIER "X\n",
1420 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
/* Otherwise try converting this single character with the validator. */
1428 t=g_convert_with_iconv(s,g_utf8_next_char(s)-s,
1429 charset_validator,NULL,&nb,NULL);
1434 if (pswit[ECHO_SWITCH])
1435 g_print("\n%s\n",aline);
1436 if (!pswit[OVERVIEW_SWITCH])
1437 g_print(" Line %ld column %ld - Non-%s "
1438 "character %u\n",linecnt,
1439 g_utf8_pointer_to_offset(aline,s)+1,charset,c);
1446 if (!eTab && c==CHAR_TAB)
1448 if (pswit[ECHO_SWITCH])
1449 g_print("\n%s\n",aline);
1450 if (!pswit[OVERVIEW_SWITCH])
1451 g_print(" Line %ld column %ld - Tab character?\n",
1452 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1457 if (!eTilde && c==CHAR_TILDE)
1460 * Often used by OCR software to indicate an
1461 * unrecognizable character.
1463 if (pswit[ECHO_SWITCH])
1464 g_print("\n%s\n",aline);
1465 if (!pswit[OVERVIEW_SWITCH])
1466 g_print(" Line %ld column %ld - Tilde character?\n",
1467 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1472 if (!eCarat && c==CHAR_CARAT)
1474 if (pswit[ECHO_SWITCH])
1475 g_print("\n%s\n",aline);
1476 if (!pswit[OVERVIEW_SWITCH])
1477 g_print(" Line %ld column %ld - Carat character?\n",
1478 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1483 if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
1485 if (pswit[ECHO_SWITCH])
1486 g_print("\n%s\n",aline);
1487 if (!pswit[OVERVIEW_SWITCH])
1488 g_print(" Line %ld column %ld - Forward slash?\n",
1489 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1495 * Report asterisks only in paranoid mode,
1496 * since they're often deliberate.
1498 if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
1501 if (pswit[ECHO_SWITCH])
1502 g_print("\n%s\n",aline);
1503 if (!pswit[OVERVIEW_SWITCH])
1504 g_print(" Line %ld column %ld - Asterisk?\n",
1505 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1514 * check_for_long_line:
1516 * Check for line too long.
/*
 * Queries <aline> when its length in characters (g_utf8_strlen) exceeds
 * LONGEST_PG_LINE. The line is echoed under ECHO_SWITCH and the message
 * is printed unless OVERVIEW_SWITCH is set.
 */
1518 void check_for_long_line(const char *aline)
1520 if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
1522 if (pswit[ECHO_SWITCH])
1523 g_print("\n%s\n",aline);
1524 if (!pswit[OVERVIEW_SWITCH])
/* The line length is printed twice: as the column and as the length. */
1525 g_print(" Line %ld column %ld - Long line %ld\n",
1526 linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
1533 * check_for_short_line:
1535 * Check for line too short.
1537 * This one is a bit trickier to implement: we don't want to
1538 * flag the last line of a paragraph for being short, so we
1539 * have to wait until we know that our current line is a
1540 * "normal" line, then report the _previous_ line if it was too
1541 * short. We also don't want to report indented lines like
1542 * chapter heads or formatted quotations. We therefore keep
1543 * last->len as the length of the last line examined, and
1544 * last->blen as the length of the last but one, and try to
1545 * suppress unnecessary warnings by checking that both were of
1546 * "normal" length. We keep the first character of the last
1547 * line in last->start, and if it was a space, we assume that
1548 * the formatting is deliberate. I can't figure out a way to
1549 * distinguish something like a quoted verse left-aligned or
1550 * the header or footer of a letter from a paragraph of short
1551 * lines - maybe if I examined the whole paragraph, and if the
1552 * para has less than, say, 8 lines and if all lines are short,
1553 * then just assume it's OK? Need to look at some texts to see
1554 * how often a formula like this would get the right result.
/*
 * Reports the previous line (prevline, numbered linecnt-1) as short when
 * all of the following hold: the current line has more than one
 * character; last->len is greater than 1 but below SHORTEST_PG_LINE;
 * last->blen is above SHORTEST_PG_LINE; and last->start is not a space.
 */
1556 void check_for_short_line(const char *aline,const struct line_properties *last)
1558 if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
1559 last->len<SHORTEST_PG_LINE && last->blen>1 &&
1560 last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
1562 if (pswit[ECHO_SWITCH])
1563 g_print("\n%s\n",prevline);
1564 if (!pswit[OVERVIEW_SWITCH])
/* The length of prevline is used both as the column and as the length. */
1565 g_print(" Line %ld column %ld - Short line %ld?\n",
1566 linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
1573 * check_for_starting_punctuation:
1575 * Look for punctuation other than full ellipses at start of line.
/*
 * Queries a non-empty line whose first character is one of . ? ! , ; :
 * unless the line begins with the spaced ellipsis ". . .". The report
 * always gives column 1.
 */
1577 void check_for_starting_punctuation(const char *aline)
1579 if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
1580 !g_str_has_prefix(aline,". . ."))
1582 if (pswit[ECHO_SWITCH])
1583 g_print("\n%s\n",aline);
1584 if (!pswit[OVERVIEW_SWITCH])
1585 g_print(" Line %ld column 1 - Begins with punctuation?\n",
1595 * Find the first em-dash, return a pointer to it and set <next> to the
1596 * character following the dash.
/*
 * Stores in *next the position just after the dash that was found:
 * either one character past s2 or two characters past s1.
 * NOTE(review): presumably s1 marks a two-character dash sequence and s2
 * a single-character dash, with the earlier of the two winning -
 * confirm against the search and comparison logic.
 */
1598 char *str_emdash(const char *s,const char **next)
1606 *next=g_utf8_next_char(s2);
1611 *next=g_utf8_next_char(g_utf8_next_char(s1));
1616 *next=g_utf8_next_char(g_utf8_next_char(s1));
1621 *next=g_utf8_next_char(s2);
1627 * check_for_spaced_emdash:
1629 * Check for spaced em-dashes.
1631 * We must check _all_ occurrences of em-dashes on the line
1632 * hence the loop - even if the first dash is OK
1633 * there may be another that's wrong later on.
/*
 * Visits every em-dash that str_emdash() finds on <aline>, resuming each
 * search at the position str_emdash() returned in <next>. A dash is
 * queried when the character before it (if it is not at the start of
 * the line) or the character after it is a space.
 */
1635 void check_for_spaced_emdash(const char *aline)
1637 const char *s,*t,*next;
1638 for (s=aline;t=str_emdash(s,&next);s=next)
/* The t>aline guard avoids stepping back before the start of the line. */
1640 if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
1641 g_utf8_get_char(next)==CHAR_SPACE)
1643 if (pswit[ECHO_SWITCH])
1644 g_print("\n%s\n",aline);
1645 if (!pswit[OVERVIEW_SWITCH])
1646 g_print(" Line %ld column %ld - Spaced em-dash?\n",
1647 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1655 * check_for_spaced_dash:
1657 * Check for spaced dashes.
/*
 * Looks at the first " -" on the line, or failing that the first "- ",
 * and queries it as a spaced dash unless it is part of a double dash.
 * Only the first occurrence found by strstr() is examined.
 */
1659 void check_for_spaced_dash(const char *aline)
1662 if ((s=strstr(aline," -")))
/* Skip " --": the character after the dash must not be another dash. */
1664 if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
1666 if (pswit[ECHO_SWITCH])
1667 g_print("\n%s\n",aline);
1668 if (!pswit[OVERVIEW_SWITCH])
1669 g_print(" Line %ld column %ld - Spaced dash?\n",
1670 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1675 else if ((s=strstr(aline,"- ")))
/* Skip "-- ": the dash must be at the line start or not preceded by a dash. */
1677 if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
1679 if (pswit[ECHO_SWITCH])
1680 g_print("\n%s\n",aline);
1681 if (!pswit[OVERVIEW_SWITCH])
1682 g_print(" Line %ld column %ld - Spaced dash?\n",
1683 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1691 * check_for_unmarked_paragraphs:
1693 * Check for unmarked paragraphs indicated by separate speakers.
1695 * May well be false positive:
1696 * "Bravo!" "Wonderful!" called the crowd.
1697 * but useful all the same.
/*
 * Queries a line that contains a closing double quote followed by
 * whitespace and an opening double quote, which may mean a paragraph
 * break between two speakers is missing.
 */
1699 void check_for_unmarked_paragraphs(const char *aline)
/*
 * NOTE(review): the two searches below look identical as shown here;
 * presumably they are meant to differ (e.g. in the amount of
 * whitespace between the quotes) - confirm.
 */
1702 s=strstr(aline,"\" \"");
1704 s=strstr(aline,"\" \"");
1707 if (pswit[ECHO_SWITCH])
1708 g_print("\n%s\n",aline);
1709 if (!pswit[OVERVIEW_SWITCH])
1710 g_print(" Line %ld column %ld - "
1711 "Query missing paragraph break?\n",
1712 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1719 * check_for_jeebies:
1721 * Check for "to he" and other easy h/b errors.
1723 * This is a very inadequate effort on the h/b problem,
1724 * but the phrase "to he" is always an error, whereas "to
1725 * be" is quite common.
1726 * Similarly, '"Quiet!", be said.' is a non-be error
1727 * "to he" is _not_ always an error!:
1728 * "Where they went to he couldn't say."
1729 * Another false positive:
1730 * What would "Cinderella" be without the . . .
1731 * and another: "If he wants to he can see for himself."
/*
 * Searches <aline> for fixed phrases that suggest an OCR confusion
 * between similar letters and queries them in three groups:
 * he/be, had/bad and hut/but. All patterns are matched literally with
 * strstr(), so they are case-sensitive and need the surrounding spaces
 * or punctuation exactly as written.
 */
1733 void check_for_jeebies(const char *aline)
/* Group 1: "be" where "he" was probably meant, and "to he". */
1736 s=strstr(aline," be could ");
1738 s=strstr(aline," be would ");
1740 s=strstr(aline," was be ");
1742 s=strstr(aline," be is ");
1744 s=strstr(aline," is be ");
1746 s=strstr(aline,"\", be ");
/*
 * NOTE(review): the next two patterns look identical as shown here;
 * presumably they are meant to differ in whitespace - confirm.
 */
1748 s=strstr(aline,"\" be ");
1750 s=strstr(aline,"\" be ");
1752 s=strstr(aline," to he ");
1755 if (pswit[ECHO_SWITCH])
1756 g_print("\n%s\n",aline);
1757 if (!pswit[OVERVIEW_SWITCH])
1758 g_print(" Line %ld column %ld - Query he/be error?\n",
1759 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Group 2: "had" / "bad" confusions. */
1763 s=strstr(aline," the had ");
1765 s=strstr(aline," a had ");
1767 s=strstr(aline," they bad ");
1769 s=strstr(aline," she bad ");
1771 s=strstr(aline," he bad ");
1773 s=strstr(aline," you bad ");
1775 s=strstr(aline," i bad ");
1778 if (pswit[ECHO_SWITCH])
1779 g_print("\n%s\n",aline);
1780 if (!pswit[OVERVIEW_SWITCH])
1781 g_print(" Line %ld column %ld - Query had/bad error?\n",
1782 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Group 3: "hut" after a semicolon or comma, where "but" is likely. */
1786 s=strstr(aline,"; hut ");
1788 s=strstr(aline,", hut ");
1791 if (pswit[ECHO_SWITCH])
1792 g_print("\n%s\n",aline);
1793 if (!pswit[OVERVIEW_SWITCH])
1794 g_print(" Line %ld column %ld - Query hut/but error?\n",
1795 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1802 * check_for_mta_from:
1804 * Special case - angled bracket in front of "From" placed there by an
1805 * MTA when sending an e-mail.
/*
 * Queries the first occurrence of the literal text ">From" on <aline>,
 * reporting the 1-based column of the '>'.
 */
1807 void check_for_mta_from(const char *aline)
1810 s=strstr(aline,">From");
1813 if (pswit[ECHO_SWITCH])
1814 g_print("\n%s\n",aline);
1815 if (!pswit[OVERVIEW_SWITCH])
1816 g_print(" Line %ld column %ld - "
1817 "Query angled bracket with From\n",
1818 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1825 * check_for_orphan_character:
1827 * Check for a single character line -
1828 * often an overflow from bad wrapping.
/*
 * Queries a line consisting of exactly one character, unless that
 * character is I, V, X, L or a digit (a numeral standing alone).
 */
1830 void check_for_orphan_character(const char *aline)
1833 c=g_utf8_get_char(aline);
/* True when the line is non-empty and nothing follows the first character. */
1834 if (c && !*g_utf8_next_char(aline))
1836 if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
1837 ; /* Nothing - ignore numerals alone on a line. */
1840 if (pswit[ECHO_SWITCH])
1841 g_print("\n%s\n",aline);
1842 if (!pswit[OVERVIEW_SWITCH])
1843 g_print(" Line %ld column 1 - Query single character line\n",
1852 * check_for_pling_scanno:
1854 * Check for I" - often should be !
/*
 * Queries the first occurrence of a space, a capital I and a double
 * quote in sequence, where the I is often a misread exclamation mark.
 */
1856 void check_for_pling_scanno(const char *aline)
1859 s=strstr(aline," I\"");
1862 if (pswit[ECHO_SWITCH])
1863 g_print("\n%s\n",aline);
1864 if (!pswit[OVERVIEW_SWITCH])
/*
 * NOTE(review): unlike most other reports, the column here is the
 * zero-based offset of the leading space with no +1 added - confirm
 * whether that is intended.
 */
1865 g_print(" Line %ld column %ld - Query I=exclamation mark?\n",
1866 linecnt,g_utf8_pointer_to_offset(aline,s));
1873 * check_for_extra_period:
1875 * Check for period without a capital letter. Cut-down from gutspell.
1876 * Only works when it happens on a single line.
/*
 * Examines each ". " on <aline> whose following word starts with a
 * lowercase letter, and queries the period as possibly superfluous.
 * The word in front of the period is extracted into testword and
 * compared against the abbrev list, and tested for being a number, a
 * single character, a Roman numeral, or lacking a vowel. Words that
 * are reported are recorded in the qperiod tree so that, outside
 * verbose mode, each word is queried once.
 * NOTE(review): the exact effect of each of those tests on the final
 * decision depends on statements between the lines shown - confirm
 * before relying on this summary.
 */
1878 void check_for_extra_period(const char *aline,const struct warnings *warnings)
1880 const char *s,*t,*s1,*sprev;
1885 gunichar c,nc,pc,*decomposition;
1886 if (pswit[PARANOID_SWITCH])
1888 for (t=aline;t=strstr(t,". ");)
1892 t=g_utf8_next_char(t);
1893 /* start of line punctuation is handled elsewhere */
/* A period not preceded by a letter is not examined further. */
1896 if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
1898 t=g_utf8_next_char(t);
1901 if (warnings->isDutch)
1903 /* For Frank & Jeroen -- 's Middags case */
1904 gunichar c2,c3,c4,c5;
1905 c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
1906 c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
1907 c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
1908 c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
/* Pattern after the period: apostrophe, lowercase letter, space, uppercase letter. */
1909 if (CHAR_IS_APOSTROPHE(c2) &&
1910 g_unichar_islower(c3) && c4==CHAR_SPACE &&
1911 g_unichar_isupper(c5))
1913 t=g_utf8_next_char(t);
/* Skip forward from after ". " to the next letter or digit. */
1917 s1=g_utf8_next_char(g_utf8_next_char(t));
1918 while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
1919 !g_unichar_isdigit(g_utf8_get_char(s1)))
1920 s1=g_utf8_next_char(s1);
1921 if (g_unichar_islower(g_utf8_get_char(s1)))
1923 /* we have something to investigate */
1925 /* so let's go back and find out */
1926 nc=g_utf8_get_char(t);
1927 s1=g_utf8_prev_char(t);
1928 c=g_utf8_get_char(s1);
1929 sprev=g_utf8_prev_char(s1);
1930 pc=g_utf8_get_char(sprev);
/* Word characters: letters, digits, or an apostrophe between two letters. */
1932 (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||
1933 g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&
1934 g_unichar_isalpha(nc)))
1939 sprev=g_utf8_prev_char(s1);
1940 pc=g_utf8_get_char(sprev);
1942 s1=g_utf8_next_char(s1);
/* testword is a newly allocated copy of the word starting at s1. */
1945 testword=g_strndup(s1,s-s1);
1947 testword=g_strdup(s1);
1948 for (i=0;*abbrev[i];i++)
1949 if (!strcmp(testword,abbrev[i]))
1951 if (g_unichar_isdigit(g_utf8_get_char(testword)))
1953 if (!*g_utf8_next_char(testword))
1955 if (isroman(testword))
/* Look for a vowel, using the base letter of each character's canonical decomposition. */
1960 for (s=testword;*s;s=g_utf8_next_char(s))
1962 decomposition=g_unicode_canonical_decomposition(
1963 g_utf8_get_char(s),&len);
1964 if (g_utf8_strchr("aeiou",-1,decomposition[0]))
1966 g_free(decomposition);
1970 (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
/* Remember the word so it is not queried again outside verbose mode. */
1972 g_tree_insert(qperiod,g_strdup(testword),
1973 GINT_TO_POINTER(1));
1974 if (pswit[ECHO_SWITCH])
1975 g_print("\n%s\n",aline);
1976 if (!pswit[OVERVIEW_SWITCH])
1977 g_print(" Line %ld column %ld - Extra period?\n",
1978 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1984 t=g_utf8_next_char(t);
1990 * check_for_following_punctuation:
1992 * Check for words usually not followed by punctuation.
/*
 * When TYPO_SWITCH is set, lowercases each word on <aline> and compares
 * it with two word lists: a word in nocomma is queried when the
 * character at s is , ; or : and a word in noperiod is queried when the
 * character at s is . or !
 * NOTE(review): s presumably points just past the word - confirm.
 */
1994 void check_for_following_punctuation(const char *aline)
1997 const char *s,*wordstart;
2000 if (pswit[TYPO_SWITCH])
/* g_utf8_strdown() returns a newly allocated lowercase copy. */
2011 inword=g_utf8_strdown(t,-1);
/* Both lists are terminated by an empty string. */
2013 for (i=0;*nocomma[i];i++)
2014 if (!strcmp(inword,nocomma[i]))
2016 c=g_utf8_get_char(s);
2017 if (c==',' || c==';' || c==':')
2019 if (pswit[ECHO_SWITCH])
2020 g_print("\n%s\n",aline);
2021 if (!pswit[OVERVIEW_SWITCH])
2022 g_print(" Line %ld column %ld - "
2023 "Query punctuation after %s?\n",
2024 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
2030 for (i=0;*noperiod[i];i++)
2031 if (!strcmp(inword,noperiod[i]))
2033 c=g_utf8_get_char(s);
2034 if (c=='.' || c=='!')
2036 if (pswit[ECHO_SWITCH])
2037 g_print("\n%s\n",aline);
2038 if (!pswit[OVERVIEW_SWITCH])
2039 g_print(" Line %ld column %ld - "
2040 "Query punctuation after %s?\n",
2041 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
2055 * Check for commonly mistyped words,
2056 * and digits like 0 for O in a word.
/*
 * Takes <aline> apart word by word with getaword() and runs each word
 * through the following checks:
 *  - mixdigit(): a digit mixed into the word ("Query digit in").
 *  - With TYPO_SWITCH or USERTYPO_SWITCH: an uppercase letter in the
 *    middle of a word, with exceptions for Mc/Mac prefixes and for a
 *    letter that follows an apostrophe; the word is then case-folded
 *    into testword.
 *  - With TYPO_SWITCH: unlikely prefixes (nostart) and suffixes (noend),
 *    unlikely letter groups such as "cb", "gbt", "tbi" and "ii", words
 *    without a vowel or without a consonant, the fixed typo list, and
 *    certain single letters (s l m i j d n). Words in okword and Roman
 *    numerals are considered for exclusion. Queried words are tracked
 *    in the qword tree ("Query word").
 *  - The user-supplied list in the usertypo tree
 *    ("Query possible scanno").
 *  - In paranoid mode with warnings->digit set: a standalone "0" or "1".
 * NOTE(review): "Query possible scanno" and "Query standalone" add 2 to
 * the word offset for the column, while "Query digit in" and
 * "Query word" add 1 - confirm whether the difference is intended.
 */
2058 void check_for_typos(const char *aline,struct warnings *warnings)
2060 const char *s,*t,*nt,*wordstart;
2062 gunichar *decomposition;
2064 int i,vowel,consonant,*dupcnt;
2065 gboolean isdup,istypo,alower;
2068 gsize decomposition_len;
2072 inword=getaword(&s);
2076 continue; /* don't bother with empty lines */
2078 if (mixdigit(inword))
2080 if (pswit[ECHO_SWITCH])
2081 g_print("\n%s\n",aline);
2082 if (!pswit[OVERVIEW_SWITCH])
2083 g_print(" Line %ld column %ld - Query digit in %s\n",
2084 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
2089 * Put the word through a series of tests for likely typos and OCR
2092 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
2096 for (t=inword;*t;t=g_utf8_next_char(t))
2098 c=g_utf8_get_char(t);
2099 nt=g_utf8_next_char(t);
2100 /* lowercase for testing */
2101 if (g_unichar_islower(c))
2103 if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
2106 * We have an uppercase mid-word. However, there are
2108 * Mac and Mc like McGill
2109 * French contractions like l'Abbe
2111 offset=g_utf8_pointer_to_offset(inword,t);
2113 pc=g_utf8_get_char(g_utf8_prev_char(t));
2116 if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
2117 offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
2118 g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
2119 CHAR_IS_APOSTROPHE(pc))
2125 testword=g_utf8_casefold(inword,-1);
2127 if (pswit[TYPO_SWITCH])
2130 * Check for certain unlikely two-letter combinations at word
2133 len=g_utf8_strlen(testword,-1);
2136 for (i=0;*nostart[i];i++)
2137 if (g_str_has_prefix(testword,nostart[i]))
2139 for (i=0;*noend[i];i++)
2140 if (g_str_has_suffix(testword,noend[i]))
2143 /* ght is common, gbt never. Like that. */
2144 if (strstr(testword,"cb"))
2146 if (strstr(testword,"gbt"))
2148 if (strstr(testword,"pbt"))
2150 if (strstr(testword,"tbs"))
2152 if (strstr(testword,"mrn"))
2154 if (strstr(testword,"ahle"))
2156 if (strstr(testword,"ihle"))
2159 * "TBE" does happen - like HEARTBEAT - but uncommon.
2160 * Also "TBI" - frostbite, outbid - but uncommon.
2161 * Similarly "ii" like Hawaii, or Pompeii, and in Roman
2162 * numerals, but "ii" is a common scanno.
2164 if (strstr(testword,"tbi"))
2166 if (strstr(testword,"tbe"))
2168 if (strstr(testword,"ii"))
2171 * Check for no vowels or no consonants.
2172 * If none, flag a typo.
2174 if (!istypo && len>1)
2177 for (t=testword;*t;t=g_utf8_next_char(t))
2179 c=g_utf8_get_char(t);
2181 g_unicode_canonical_decomposition(c,&decomposition_len);
2182 if (c=='y' || g_unichar_isdigit(c))
2184 /* Yah, this is loose. */
2188 else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
2192 g_free(decomposition);
2194 if (!vowel || !consonant)
2198 * Now exclude the word from being reported if it's in
2201 for (i=0;*okword[i];i++)
2202 if (!strcmp(testword,okword[i]))
2205 * What looks like a typo may be a Roman numeral.
2208 if (istypo && isroman(testword))
2210 /* Check the manual list of typos. */
2212 for (i=0;*typo[i];i++)
2213 if (!strcmp(testword,typo[i]))
2216 * Check lowercase s, l, i and m - special cases.
2217 * "j" - often a semi-colon gone wrong.
2218 * "d" for a missing apostrophe - he d
2221 if (!istypo && len==1 &&
2222 g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
2226 dupcnt=g_tree_lookup(qword,testword);
2230 isdup=!pswit[VERBOSE_SWITCH];
2234 dupcnt=g_new0(int,1);
2235 g_tree_insert(qword,g_strdup(testword),dupcnt);
2240 if (pswit[ECHO_SWITCH])
2241 g_print("\n%s\n",aline);
2242 if (!pswit[OVERVIEW_SWITCH])
2244 g_print(" Line %ld column %ld - Query word %s",
2245 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
2247 if (!pswit[VERBOSE_SWITCH])
2248 g_print(" - not reporting duplicates");
2256 /* check the user's list of typos */
2257 if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
2259 if (pswit[ECHO_SWITCH])
2260 g_print("\n%s\n",aline);
2261 if (!pswit[OVERVIEW_SWITCH])
2262 g_print(" Line %ld column %ld - Query possible scanno %s\n",
2263 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
2265 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
2267 if (pswit[PARANOID_SWITCH] && warnings->digit)
2269 /* In paranoid mode, query all 0 and 1 standing alone. */
2270 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
2272 if (pswit[ECHO_SWITCH])
2273 g_print("\n%s\n",aline);
2274 if (!pswit[OVERVIEW_SWITCH])
2275 g_print(" Line %ld column %ld - Query standalone %s\n",
2276 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
2287 * check_for_misspaced_punctuation:
2289 * Look for added or missing spaces around punctuation and quotes.
2290 * If there is a punctuation character like ! with no space on
2291 * either side, suspect a missing!space. If there are spaces on
2292 * both sides , assume a typo. If we see a double quote with no
2293 * space or punctuation on either side of it, assume unspaced
2294 * quotes "like"this.
/*
 * Makes several independent passes over <aline>; in the passes that
 * start at the second character, c is the current character, pc the
 * previous one and nc the next one:
 *  1. Punctuation from the set . ? ! , ; : _ is queried as
 *     "Missing space?" when it runs into a following letter, and as
 *     "Spaced punctuation?" when it has a space before it and a space
 *     or the end of the line after it. Acronyms and ellipses are
 *     tracked through isacro and isellipsis, and spaced punctuation is
 *     not reported when isemptyline is set.
 *  2. One of ? ! , ; : preceded by a space but not followed by one.
 *  3. A period preceded by a space and followed by a letter.
 *  4. A double quote with no space or listed punctuation on either
 *     side ("Unspaced quotes?").
 *  5. Double-quote parity, kept in parities->dquote across calls, used
 *     to judge whether the character after a quote is plausible
 *     ("Wrongspaced quotes?").
 *  6. A double quote as the first character of the line followed by
 *     one of , ; : ! ? ) ] } or a space.
 *  7. With SQUOTE_SWITCH: the same parity test for single quotes that
 *     are not inside a word, kept in parities->squote
 *     ("Wrongspaced singlequotes?").
 */
2296 void check_for_misspaced_punctuation(const char *aline,
2297 struct parities *parities,gboolean isemptyline)
2299 gboolean isacro,isellipsis;
2301 gunichar c,nc,pc,n2c;
2303 c=g_utf8_get_char(aline);
2304 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2305 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2309 nc=g_utf8_get_char(g_utf8_next_char(s));
2310 /* For each character in the line after the first. */
2311 if (g_utf8_strchr(".?!,;:_",-1,c)) /* if it's punctuation */
2313 /* we need to suppress warnings for acronyms like M.D. */
2315 /* we need to suppress warnings for ellipsis . . . */
2318 * If there are letters on both sides of it or
2319 * if it's strict punctuation followed by an alpha.
2321 if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
2322 g_utf8_strchr("?!,;:",-1,c)))
2326 if (g_utf8_pointer_to_offset(aline,s)>2 &&
2327 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
2329 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
2335 if (pswit[ECHO_SWITCH])
2336 g_print("\n%s\n",aline);
2337 if (!pswit[OVERVIEW_SWITCH])
2338 g_print(" Line %ld column %ld - Missing space?\n",
2339 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2344 if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
2347 * If there are spaces on both sides,
2348 * or space before and end of line.
2352 if (g_utf8_pointer_to_offset(aline,s)>2 &&
2353 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
2355 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
2359 if (!isemptyline && !isellipsis)
2361 if (pswit[ECHO_SWITCH])
2362 g_print("\n%s\n",aline);
2363 if (!pswit[OVERVIEW_SWITCH])
2364 g_print(" Line %ld column %ld - "
2365 "Spaced punctuation?\n",linecnt,
2366 g_utf8_pointer_to_offset(aline,s)+1);
2373 /* Split out the characters that CANNOT be preceded by space. */
2374 c=g_utf8_get_char(aline);
2375 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2376 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2380 nc=g_utf8_get_char(g_utf8_next_char(s));
2381 /* for each character in the line after the first */
2382 if (g_utf8_strchr("?!,;:",-1,c))
2384 /* if it's punctuation that _cannot_ have a space before it */
2385 if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
2388 * If nc DOES == space,
2389 * it was already reported just above.
2391 if (pswit[ECHO_SWITCH])
2392 g_print("\n%s\n",aline);
2393 if (!pswit[OVERVIEW_SWITCH])
2394 g_print(" Line %ld column %ld - Spaced punctuation?\n",
2395 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2402 * Special case " .X" where X is any alpha.
2403 * This plugs a hole in the acronym code above.
2404 * Inelegant, but maintainable.
2406 c=g_utf8_get_char(aline);
2407 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2408 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2412 nc=g_utf8_get_char(g_utf8_next_char(s));
2413 /* for each character in the line after the first */
2416 /* if it's a period */
2417 if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
2420 * If the period follows a space and
2421 * is followed by a letter.
2423 if (pswit[ECHO_SWITCH])
2424 g_print("\n%s\n",aline);
2425 if (!pswit[OVERVIEW_SWITCH])
2426 g_print(" Line %ld column %ld - Spaced punctuation?\n",
2427 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2433 c=g_utf8_get_char(aline);
2434 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2435 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2439 nc=g_utf8_get_char(g_utf8_next_char(s));
2440 /* for each character in the line after the first */
2441 if (CHAR_IS_DQUOTE(c))
2443 if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
2444 !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
2445 !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
2447 if (pswit[ECHO_SWITCH])
2448 g_print("\n%s\n",aline);
2449 if (!pswit[OVERVIEW_SWITCH])
2450 g_print(" Line %ld column %ld - Unspaced quotes?\n",
2451 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2457 /* Check parity of quotes. */
2458 nc=g_utf8_get_char(aline);
2459 for (s=aline;*s;s=g_utf8_next_char(s))
2462 nc=g_utf8_get_char(g_utf8_next_char(s));
2463 if (CHAR_IS_DQUOTE(c))
2467 parities->dquote=!parities->dquote;
2468 parity=parities->dquote;
2470 else if (c==CHAR_LD_QUOTE)
2477 if (!g_utf8_strchr("_-.'`‘’/,;:!?)]} ",-1,nc))
2479 if (pswit[ECHO_SWITCH])
2480 g_print("\n%s\n",aline);
2481 if (!pswit[OVERVIEW_SWITCH])
2482 g_print(" Line %ld column %ld - "
2483 "Wrongspaced quotes?\n",
2484 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2492 if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
2493 !g_utf8_strchr("_-/.'`‘’([{$",-1,nc) || !nc)
2495 if (pswit[ECHO_SWITCH])
2496 g_print("\n%s\n",aline);
2497 if (!pswit[OVERVIEW_SWITCH])
2498 g_print(" Line %ld column %ld - "
2499 "Wrongspaced quotes?\n",
2500 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2507 c=g_utf8_get_char(aline);
2508 if (CHAR_IS_DQUOTE(c))
2510 if (g_utf8_strchr(",;:!?)]} ",-1,
2511 g_utf8_get_char(g_utf8_next_char(aline))))
2513 if (pswit[ECHO_SWITCH])
2514 g_print("\n%s\n",aline);
2515 if (!pswit[OVERVIEW_SWITCH])
2516 g_print(" Line %ld column 1 - Wrongspaced quotes?\n",
2522 if (pswit[SQUOTE_SWITCH])
2524 nc=g_utf8_get_char(aline);
2525 for (s=aline;*s;s=g_utf8_next_char(s))
2528 nc=g_utf8_get_char(g_utf8_next_char(s));
2529 if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&
2530 !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
2531 !g_unichar_isalpha(nc)))
2533 parities->squote=!parities->squote;
2534 if (!parities->squote)
2537 if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
2539 if (pswit[ECHO_SWITCH])
2540 g_print("\n%s\n",aline);
2541 if (!pswit[OVERVIEW_SWITCH])
2542 g_print(" Line %ld column %ld - "
2543 "Wrongspaced singlequotes?\n",
2544 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2552 if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
2553 !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
2555 if (pswit[ECHO_SWITCH])
2556 g_print("\n%s\n",aline);
2557 if (!pswit[OVERVIEW_SWITCH])
2558 g_print(" Line %ld column %ld - "
2559 "Wrongspaced singlequotes?\n",
2560 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2573 * Given a position p within a string str, determine whether it follows the
2574 * given word. This is roughly equivalent to the regular expression (?<=\bword)
2575 * but has different boundary conditions.
/*
 * Tests whether the text ending at <p> inside <str> is <word>, i.e.
 * whether the bytes starting strlen(word) before <p> begin with <word>.
 * When the match does not start at the beginning of <str>, the
 * character in front of it must not be alphabetic.
 */
2577 static gboolean str_follows_word(const char *str,const char *p,const char *word)
/* Length in bytes, so that p-len steps back over exactly the word. */
2579 int len=strlen(word);
2582 else if (!g_str_has_prefix(p-len,word))
/* A match that starts at the very beginning of str has nothing in front of it. */
2584 else if (p-len==str)
2587 /* Using non-alpha as a word boundary. See UAX #29 for a better way. */
2588 return !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(p-len)));
2592 * check_for_double_punctuation:
2594 * Look for double punctuation like ,. or ,,
2595 * Thanks to DW for the suggestion!
2596 * In books with references, ".," and ".;" are common
2597 * e.g. "etc., etc.," and vol. 1.; vol 3.;
2598 * OTOH, from my initial tests, there are also fairly
2599 * common errors. What to do? Make these cases paranoid?
2600 * ".," is the most common, so warnings->dotcomma is used
2601 * to suppress detailed reporting if it occurs often.
2602 * Indeed, ".," is so common after "etc" or "&c" that
2603 * we don't warn on these cases at all.
/*
 * Walks <aline> looking for two adjacent characters that both come from
 * the set . ? ! , ; : and queries them as double punctuation, with
 * these exceptions visible below: ellipsis combinations when
 * warnings->isFrench is set; a doubled . ? or ! and ".," when
 * warnings->dotcomma is off or the pair follows "etc" or "&c".
 */
2605 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
2610 nc=g_utf8_get_char(aline);
2611 for (s=aline;*s;s=g_utf8_next_char(s))
2614 nc=g_utf8_get_char(g_utf8_next_char(s));
2615 /* for each punctuation character in the line */
2616 if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
2617 g_utf8_strchr(".?!,;:",-1,nc))
2619 /* followed by punctuation, it's a query, unless . . . */
/* French texts: an ellipsis next to another punctuation mark. */
2621 if (warnings->isFrench &&
2622 (g_str_has_prefix(s,",...") || g_str_has_prefix(s,"...,") ||
2623 g_str_has_prefix(s,";...") || g_str_has_prefix(s,"...;") ||
2624 g_str_has_prefix(s,":...") || g_str_has_prefix(s,"...:") ||
2625 g_str_has_prefix(s,"!...") || g_str_has_prefix(s,"...!") ||
2626 g_str_has_prefix(s,"?...") || g_str_has_prefix(s,"...?")))
2629 nc=g_utf8_get_char(g_utf8_next_char(s));
2632 else if (c==nc && (c=='.' || c=='?' || c=='!'))
2634 /* do nothing for .. !! and ?? which can be legit */
2637 else if (c=='.' && nc==',')
/* ".," is tolerated when dotcomma warnings are off or after "etc" / "&c". */
2639 if (!warnings->dotcomma || str_follows_word(aline,s,"etc") ||
2640 str_follows_word(aline,s,"&c"))
2645 if (pswit[ECHO_SWITCH])
2646 g_print("\n%s\n",aline);
2647 if (!pswit[OVERVIEW_SWITCH])
2648 g_print(" Line %ld column %ld - Double punctuation?\n",
2649 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2658 * check_for_spaced_quotes:
2660 void check_for_spaced_quotes(const char *aline)
/*
 * Reports quote marks that have a space on both sides: first the ASCII
 * double quote, then each character listed in single_quotes.
 *   aline - the line being checked (UTF-8)
 * Columns are character offsets within aline plus one, pointing at the
 * space that opens the match.
 */
2664 const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,
2668 while ((t=strstr(s," \" ")))
2670 if (pswit[ECHO_SWITCH])
2671 g_print("\n%s\n",aline);
2672 if (!pswit[OVERVIEW_SWITCH])
2673 g_print(" Line %ld column %ld - Spaced doublequote?\n",
2674 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
/*
 * Resume two characters after the start of the match, i.e. on its trailing
 * space, so that the same space can open the next match.
 */
2677 s=g_utf8_next_char(g_utf8_next_char(t));
/* One GString is reused to build "space + quote + space" for each variant. */
2679 pattern=g_string_new(NULL);
2680 for(i=0;i<G_N_ELEMENTS(single_quotes);i++)
2682 g_string_assign(pattern," ");
2683 g_string_append_unichar(pattern,single_quotes[i]);
2684 g_string_append_c(pattern,' ');
/* Byte-wise strstr() is used to find the UTF-8 encoded pattern. */
2686 while ((t=strstr(s,pattern->str)))
2688 if (pswit[ECHO_SWITCH])
2689 g_print("\n%s\n",aline);
2690 if (!pswit[OVERVIEW_SWITCH])
2691 g_print(" Line %ld column %ld - Spaced singlequote?\n",
2692 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
/* Same restart rule as for the double quote above. */
2695 s=g_utf8_next_char(g_utf8_next_char(t));
/* TRUE also frees the character data held by the GString. */
2698 g_string_free(pattern,TRUE);
2702 * check_for_miscased_genative:
2704 * Check special case of 'S instead of 's at end of word.
2706 void check_for_miscased_genative(const char *aline)
/*
 * Queries an apostrophe that is followed by a capital S when the character
 * held in pc is lower case.
 *   aline - the line being checked (UTF-8)
 * NOTE(review): the statements that set pc and advance c inside the loop are
 * not visible in this listing; presumably pc=c and c=nc at the top of each
 * iteration, making pc/c/nc a sliding window of three characters - confirm.
 */
2712 c=g_utf8_get_char(aline);
/* nc stays 0 for an empty line, so the loop below is not entered. */
2713 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2714 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2718 nc=g_utf8_get_char(g_utf8_next_char(s));
2719 if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))
2721 if (pswit[ECHO_SWITCH])
2722 g_print("\n%s\n",aline);
2723 if (!pswit[OVERVIEW_SWITCH])
/*
 * The column is the offset of s plus two; presumably s is on the apostrophe
 * so this points at the S - confirm.
 */
2724 g_print(" Line %ld column %ld - Capital \"S\"?\n",
2725 linecnt,g_utf8_pointer_to_offset(aline,s)+2);
2733 * check_end_of_line:
2735 * Now check special cases - start and end of line -
2736 * for single and double quotes. Start is sometimes [sic]
2737 * but better to query it anyway.
2738 * While we're here, check for dash at end of line.
2740 void check_end_of_line(const char *aline,struct warnings *warnings)
/*
 * Checks the two ends of aline: a quote preceded by a space at the end of
 * the line, a single quote followed by a space at the start of the line,
 * and (only with PARANOID_SWITCH and warnings->hyphen) a lone hyphen at the
 * end of the line.
 *   aline    - the line being checked (UTF-8)
 *   warnings - only the hyphen member is read in the visible code
 */
2745 lbytes=strlen(aline);
/* The end-of-line test needs both a last and a next-to-last character. */
2746 if (g_utf8_strlen(aline,lbytes)>1)
2748 s=g_utf8_prev_char(aline+lbytes);
/* c1 is the last character of the line, c2 the one before it. */
2749 c1=g_utf8_get_char(s);
2750 c2=g_utf8_get_char(g_utf8_prev_char(s));
2751 if ((CHAR_IS_DQUOTE(c1) || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)
2753 if (pswit[ECHO_SWITCH])
2754 g_print("\n%s\n",aline);
2755 if (!pswit[OVERVIEW_SWITCH])
/* The line length in characters is the column of the last character. */
2756 g_print(" Line %ld column %ld - Spaced quote?\n",linecnt,
2757 g_utf8_strlen(aline,lbytes));
/* From here c1 is the first character of the line and c2 the second. */
2761 c1=g_utf8_get_char(aline);
2762 c2=g_utf8_get_char(g_utf8_next_char(aline));
2763 if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)
2765 if (pswit[ECHO_SWITCH])
2766 g_print("\n%s\n",aline);
2767 if (!pswit[OVERVIEW_SWITCH])
2768 g_print(" Line %ld column 1 - Spaced quote?\n",linecnt);
2773 * Dash at end of line may well be legit - paranoid mode only
2774 * and don't report em-dash at line-end.
2776 if (pswit[PARANOID_SWITCH] && warnings->hyphen)
/*
 * Start on the last character and step back while the character is
 * CHAR_SPACE or a lower code point, never going before aline.
 */
2778 for (s=g_utf8_prev_char(aline+lbytes);
2779 s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
/* A '-' that is not preceded by another '-' is queried. */
2781 if (g_utf8_get_char(s)=='-' &&
2782 g_utf8_get_char(g_utf8_prev_char(s))!='-')
2784 if (pswit[ECHO_SWITCH])
2785 g_print("\n%s\n",aline);
2786 if (!pswit[OVERVIEW_SWITCH])
/*
 * NOTE(review): the column here is the plain character offset of s, without
 * the +1 that several other reports in this file add - confirm intended.
 */
2787 g_print(" Line %ld column %ld - "
2788 "Hyphen at end of line?\n",
2789 linecnt,g_utf8_pointer_to_offset(aline,s));
2796 * check_for_unspaced_bracket:
2798 * Brackets are often unspaced, but shouldn't be surrounded by alpha.
2799 * If so, suspect a scanno like "a]most".
2801 void check_for_unspaced_bracket(const char *aline)
/*
 * Queries a bracket character from "{[()]}" when the characters held in pc
 * and nc are both alphabetic.
 *   aline - the line being checked (UTF-8)
 * NOTE(review): the statements that set pc and advance c inside the loop are
 * not visible in this listing; presumably pc=c and c=nc at the top of each
 * iteration - confirm.
 */
2805 c=g_utf8_get_char(aline);
/* nc stays 0 for an empty line, so the loop below is not entered. */
2806 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2807 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2811 nc=g_utf8_get_char(g_utf8_next_char(s));
2814 /* for each bracket character in the line except 1st & last */
2815 if (g_utf8_strchr("{[()]}",-1,c) &&
2816 g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
2818 if (pswit[ECHO_SWITCH])
2819 g_print("\n%s\n",aline);
2820 if (!pswit[OVERVIEW_SWITCH])
/*
 * NOTE(review): the column is the plain character offset of s, without the
 * +1 that several other reports in this file add - confirm intended.
 */
2821 g_print(" Line %ld column %ld - Unspaced bracket?\n",
2822 linecnt,g_utf8_pointer_to_offset(aline,s));
2830 * check_for_unpunctuated_endquote:
2832 void check_for_unpunctuated_endquote(const char *aline)
/*
 * Queries a closing or neutral double quote when the character held in pc
 * is alphabetic, i.e. no punctuation sits between the word and the quote.
 *   aline - the line being checked (UTF-8)
 * NOTE(review): the statements that set pc and advance c inside the loop are
 * not visible in this listing; presumably pc=c and c=nc at the top of each
 * iteration - confirm.
 */
2837 c=g_utf8_get_char(aline);
/* nc stays 0 for an empty line, so the loop below is not entered. */
2838 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2839 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
/* Characters that are not double quotes are classed INVALID_QUOTE. */
2843 qc=CHAR_IS_DQUOTE(c)?QUOTE_CLASS(c):INVALID_QUOTE;
2844 nc=g_utf8_get_char(g_utf8_next_char(s));
2845 /* for each character in the line except 1st */
2846 if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) && g_unichar_isalpha(pc))
2848 if (pswit[ECHO_SWITCH])
2849 g_print("\n%s\n",aline);
2850 if (!pswit[OVERVIEW_SWITCH])
/*
 * NOTE(review): the column is the plain character offset of s, without the
 * +1 that several other reports in this file add - confirm intended.
 */
2851 g_print(" Line %ld column %ld - "
2852 "endquote missing punctuation?\n",
2853 linecnt,g_utf8_pointer_to_offset(aline,s));
2861 * check_for_html_tag:
2863 * Check for <HTML TAG>.
2865 * If there is a < in the line, followed at some point
2866 * by a > then we suspect HTML.
2868 void check_for_html_tag(const char *aline)
/*
 * Looks for the first '<' in aline and a '>' somewhere after it, and
 * queries the text between them (inclusive) as a possible HTML tag.
 *   aline - the line being checked (UTF-8)
 * Only the first '<' of the line is examined in the visible code.
 */
2870 const char *open,*close;
2872 open=strchr(aline,'<');
/* The search for '>' starts one character after the '<'. */
2875 close=strchr(g_utf8_next_char(open),'>');
2878 if (pswit[ECHO_SWITCH])
2879 g_print("\n%s\n",aline);
2880 if (!pswit[OVERVIEW_SWITCH])
/* Copy "<...>" including both delimiters for the message. */
2882 tag=g_strndup(open,close-open+1);
2883 g_print(" Line %ld column %ld - HTML Tag? %s \n",
2884 linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
2894 * check_for_html_entity:
2896 * Check for &symbol; HTML.
2898 * If there is a & in the line, followed at
2899 * some point by a ; then we suspect HTML.
2901 void check_for_html_entity(const char *aline)
/*
 * Looks for the first '&' in aline and a ';' at or after it, and queries
 * the text between them (inclusive) as a possible HTML entity, unless a
 * space occurs in between.
 *   aline - the line being checked (UTF-8)
 */
2903 const char *s,*amp,*scolon;
2905 amp=strchr(aline,'&');
2908 scolon=strchr(amp,';');
/* Stop early at the first space between '&' and ';'. */
2911 for (s=amp;s<scolon;s=g_utf8_next_char(s))
2912 if (g_utf8_get_char(s)==CHAR_SPACE)
2913 break; /* Don't report "Jones & Son;" */
2916 if (pswit[ECHO_SWITCH])
2917 g_print("\n%s\n",aline);
2918 if (!pswit[OVERVIEW_SWITCH])
/* Copy "&...;" including both delimiters for the message. */
2920 entity=g_strndup(amp,scolon-amp+1);
/*
 * NOTE(review): this column is a byte offset (amp-aline), whereas other
 * reports in this file use g_utf8_pointer_to_offset(); the two differ when
 * multi-byte characters precede the '&' - confirm intended.
 */
2921 g_print(" Line %ld column %d - HTML symbol? %s \n",
2922 linecnt,(int)(amp-aline)+1,entity);
2933 * check_for_omitted_punctuation:
2935 * Check for omitted punctuation at end of paragraph by working back
2936 * through prevline. DW.
2937 * Need to check this only for "normal" paras.
2938 * So what is a "normal" para?
2939 * Not normal if one-liner (chapter headings, etc.)
2940 * Not normal if doesn't contain at least one lower-case letter
2941 * Not normal if starts with space
2943 void check_for_omitted_punctuation(const char *prevline,
2944 struct line_properties *last,int start_para_line)
/*
 * Works backwards from the end of prevline and queries a paragraph end that
 * reaches a letter before any of the accepted punctuation characters.
 *   prevline        - the last line of the paragraph that just ended
 *   last            - only last->blen is read in the visible code
 *   start_para_line - line number on which that paragraph started
 * The query is attributed to line linecnt-1.
 */
2946 gboolean letter_on_line=FALSE;
2949 gboolean closing_quote;
/* Record whether prevline contains any alphabetic character at all. */
2950 for (s=prevline;*s;s=g_utf8_next_char(s))
2951 if (g_unichar_isalpha(g_utf8_get_char(s)))
2953 letter_on_line=TRUE;
2957 * This next "if" is a problem.
2958 * If we say "start_para_line <= linecnt - 1", that includes
2959 * one-line "paragraphs" like chapter heads. Lotsa false positives.
2960 * If we say "start_para_line < linecnt - 1" it doesn't, but then it
2961 * misses genuine one-line paragraphs.
2963 if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
2964 g_utf8_get_char(prevline)>CHAR_SPACE)
/* Begin at the terminating NUL and step backwards from there. */
2966 s=prevline+strlen(prevline);
/*
 * This do/while repeats while closing_quote is set; closing_quote is
 * cleared when the character is neither a closing nor a neutral quote.
 * Presumably it is set to TRUE in the other branch (not visible) - confirm.
 */
2969 s=g_utf8_prev_char(s);
2970 c=g_utf8_get_char(s);
2971 if (QUOTE_CLASS(c)==CLOSING_QUOTE || QUOTE_CLASS(c)==NEUTRAL_QUOTE)
2974 closing_quote=FALSE;
2975 } while (closing_quote && s>prevline);
/* Keep walking backwards; the first character of prevline is not tested. */
2976 for (;s>prevline;s=g_utf8_prev_char(s))
2978 if (g_unichar_isalpha(g_utf8_get_char(s)))
2980 if (pswit[ECHO_SWITCH])
2981 g_print("\n%s\n",prevline);
2982 if (!pswit[OVERVIEW_SWITCH])
/* The reported column is the length of prevline in characters. */
2983 g_print(" Line %ld column %ld - "
2984 "No punctuation at para end?\n",
2985 linecnt-1,g_utf8_strlen(prevline,-1));
/*
 * Any of these characters counts as acceptable punctuation; presumably the
 * scan stops here (the guarded statement is not visible) - confirm.
 */
2990 if (g_utf8_strchr("—-.:!([{?}])",-1,g_utf8_get_char(s)))
2996 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
/*
 * Tree-traversal callback; procfile() hands it to g_tree_foreach() over
 * qword. Prints a note about a queried word that occurred more than once.
 *   key   - the queried word (used as a C string)
 *   value - presumably carries the count printed by %d; how it is decoded is
 *           not visible in this listing - confirm
 *   data  - not referenced in the visible code
 */
2998 const char *word=key;
3001 g_print("\nNote: Queried word %s was duplicated %d times\n",
3006 void print_as_windows_1252(const char *string)
/*
 * Converts a UTF-8 string to WINDOWS-1252 with g_iconv(); the conversion
 * descriptor is kept in a function-static variable between calls.
 *   string - UTF-8 text. procfile() also calls this with NULL; presumably
 *            that selects the close-and-reset path below (the guarding test
 *            is not visible in this listing) - confirm.
 */
3008 gsize inbytes,outbytes;
/* (GIConv)-1 marks "no converter open". */
3010 static GIConv converter=(GIConv)-1;
/* Release the cached converter and mark it closed again. */
3013 if (converter!=(GIConv)-1)
3014 g_iconv_close(converter);
3015 converter=(GIConv)-1;
/* Open the converter on first use. */
3018 if (converter==(GIConv)-1)
3019 converter=g_iconv_open("WINDOWS-1252","UTF-8");
3020 if (converter!=(GIConv)-1)
/*
 * The output buffer gets as many bytes as the input (plus one): every
 * WINDOWS-1252 character is one byte and every UTF-8 character is at least
 * one byte, so the converted text cannot be longer than the input.
 */
3022 inbytes=outbytes=strlen(string);
3023 bp=buf=g_malloc(outbytes+1);
3024 g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
/*
 * Writes string to stdout unchanged; presumably the fallback for when no
 * converter could be opened (the enclosing branch is not visible) - confirm.
 */
3030 fputs(string,stdout);
3033 void print_as_utf_8(const char *string)
/* Writes string to stdout exactly as given; no conversion is applied. */
3035 fputs(string,stdout);
3043 void procfile(const char *filename)
/*
 * Reads one etext, runs the first pass over it, then feeds every line
 * returned by flgets() through the individual check_for_* routines and
 * finally prints the end-of-file summary.
 *   filename - name of the file to check; also used in messages
 * NOTE(review): several declarations, braces and conditions of this
 * function are not visible in this listing; the comments below describe
 * only the statements that are.
 */
3046 gchar *parastart=NULL; /* first line of current para */
3047 gchar *etext,*aline;
3050 struct first_pass_results *first_pass_results;
3051 struct warnings *warnings;
3052 struct counters counters={0};
3053 struct line_properties last={0};
3054 struct parities parities={0};
3055 struct pending pending={0};
3056 gboolean isemptyline;
3057 long start_para_line=0;
3058 gboolean isnewpara=FALSE,enddash=FALSE;
3059 last.start=CHAR_SPACE;
/* linecnt and checked_linecnt are not declared here; both restart at 0. */
3060 linecnt=checked_linecnt=0;
3061 etext=read_etext(filename,&err);
/* A read error goes to stdout or stderr depending on STDOUT_SWITCH. */
3064 if (pswit[STDOUT_SWITCH])
3065 fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
3067 fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
3070 g_print("\n\nFile: %s\n\n",filename);
/* The first pass result is turned into the warnings used by the checks. */
3071 first_pass_results=first_pass(etext);
3072 warnings=report_first_pass(first_pass_results);
/*
 * Both trees are keyed by strings compared with strcmp and free their keys
 * with g_free; qword also frees its values, qperiod does not.
 */
3073 qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
3074 qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
3076 * Here we go with the main pass. Hold onto yer hat!
3080 while ((aline=flgets(&etext_ptr,linecnt+1,warnings->newlines)))
3085 if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
3086 continue; // skip DP page separators completely
/*
 * Lines before firstline, or after footerline when one was recorded, are
 * not checked; with HEADER_SWITCH a few header fields are echoed.
 */
3087 if (linecnt<first_pass_results->firstline ||
3088 (first_pass_results->footerline>0 &&
3089 linecnt>first_pass_results->footerline))
3091 if (pswit[HEADER_SWITCH])
3093 if (g_str_has_prefix(aline,"Title:"))
3094 g_print(" %s\n",aline);
3095 if (g_str_has_prefix(aline,"Author:"))
3096 g_print(" %s\n",aline);
3097 if (g_str_has_prefix(aline,"Release Date:"))
3098 g_print(" %s\n",aline);
3099 if (g_str_has_prefix(aline,"Edition:"))
3100 g_print(" %s\n\n",aline);
3102 continue; /* skip through the header */
3105 print_pending(aline,parastart,&pending);
/* analyse_quotes() updates counters and reports whether the line is empty. */
3106 isemptyline=analyse_quotes(aline,&counters);
3107 if (isnewpara && !isemptyline)
3109 /* This line is the start of a new paragraph. */
3110 start_para_line=linecnt;
3111 /* Capture its first line in case we want to report it later. */
3113 parastart=g_strdup(aline);
3114 memset(&parities,0,sizeof(parities)); /* restart the quote count */
/* Skip leading characters that are neither letters nor digits. */
3116 while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
3117 !g_unichar_isdigit(g_utf8_get_char(s)))
3118 s=g_utf8_next_char(s);
3119 if (g_unichar_islower(g_utf8_get_char(s)))
3121 /* and its first letter is lowercase */
3122 if (pswit[ECHO_SWITCH])
3123 g_print("\n%s\n",aline);
3124 if (!pswit[OVERVIEW_SWITCH])
3125 g_print(" Line %ld column %ld - "
3126 "Paragraph starts with lower-case\n",
3127 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
3131 isnewpara=FALSE; /* Signal the end of new para processing. */
3133 /* Check for an em-dash broken at line end. */
3134 if (enddash && g_utf8_get_char(aline)=='-')
3136 if (pswit[ECHO_SWITCH])
3137 g_print("\n%s\n",aline);
3138 if (!pswit[OVERVIEW_SWITCH])
3139 g_print(" Line %ld column 1 - Broken em-dash?\n",linecnt);
/* Step back over trailing spaces to the last non-space character. */
3144 for (s=g_utf8_prev_char(aline+strlen(aline));
3145 g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
/*
 * Presumably enddash is set when that character is '-' (the guarded
 * statement is not visible) - confirm.
 */
3147 if (s>=aline && g_utf8_get_char(s)=='-')
3149 check_for_control_characters(aline);
3150 check_for_odd_characters(aline,warnings,isemptyline);
/* Line-length checks run only when the first pass left them enabled. */
3151 if (warnings->longline)
3152 check_for_long_line(aline);
3153 if (warnings->shortline)
3154 check_for_short_line(aline,&last);
/* Remember this line's length and first character in last. */
3156 last.len=g_utf8_strlen(aline,-1);
3157 last.start=g_utf8_get_char(aline);
3158 check_for_starting_punctuation(aline);
3161 check_for_spaced_emdash(aline);
3162 check_for_spaced_dash(aline);
3164 check_for_unmarked_paragraphs(aline);
3165 check_for_jeebies(aline);
3166 check_for_mta_from(aline);
3167 check_for_orphan_character(aline);
3168 check_for_pling_scanno(aline);
3169 check_for_extra_period(aline,warnings);
3170 check_for_following_punctuation(aline);
3171 check_for_typos(aline,warnings);
3172 check_for_misspaced_punctuation(aline,&parities,isemptyline);
3173 check_for_double_punctuation(aline,warnings);
3174 check_for_spaced_quotes(aline);
3175 check_for_miscased_genative(aline);
3176 check_end_of_line(aline,warnings);
3177 check_for_unspaced_bracket(aline);
3178 if (warnings->endquote)
3179 check_for_unpunctuated_endquote(aline);
3180 check_for_html_tag(aline);
3181 check_for_html_entity(aline);
/*
 * Quote counters are checked and then reset here; the condition that guards
 * this section is not visible in this listing.
 */
3184 check_for_mismatched_quotes(&counters,&pending);
3185 counters_reset(&counters);
3186 /* let the next iteration know that it's starting a new para */
3189 check_for_omitted_punctuation(prevline,&last,start_para_line);
/* Keep a private copy of the line for the next iteration. */
3192 prevline=g_strdup(aline);
/* Final quote check; print_pending() gets NULL instead of a line here. */
3195 check_for_mismatched_quotes(&counters,&pending);
3196 print_pending(NULL,parastart,&pending);
3197 reset_pending(&pending);
/* Repeated queried words are summarised unless in overview or verbose mode. */
3206 if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])
3207 g_tree_foreach(qword,report_duplicate_queries,NULL);
3208 g_tree_unref(qword);
3209 g_tree_unref(qperiod);
3210 counters_destroy(&counters);
3211 g_set_print_handler(NULL);
/* Called with NULL rather than text to print. */
3212 print_as_windows_1252(NULL);
3213 if (pswit[MARKUP_SWITCH])
3220 * Get one line from the input text. The setting of newlines has the following
3223 * DOS_NEWLINES: Check for the existence of exactly one CR-LF line-end per line.
3225 * OS9_NEWLINES: Asserts that etext contains no LFs. CR is used as
3226 * the newline character.
3228 * UNIX_NEWLINES: Check for the presence of CRs.
3230 * In all cases, check that the last line is correctly terminated.
3232 * Returns: a pointer to the line.
3234 char *flgets(char **etext,long lcnt,int newlines)
/*
 * Extracts the next line starting at *etext and advances *etext.
 *   etext    - in/out cursor into the text
 *   lcnt     - line number used in the line-end messages
 *   newlines - DOS_NEWLINES, OS9_NEWLINES or UNIX_NEWLINES
 * Line-end problems are reported only when LINE_END_SWITCH is set. With
 * MARKUP_SWITCH or DP_SWITCH the line is passed through
 * postprocess_for_HTML() or postprocess_for_DP() at the end.
 * NOTE(review): the tests that select each section below (presumably on the
 * current character being NUL, LF or CR) are not visible in this listing;
 * the comments describe only the visible statements - confirm.
 */
3237 gboolean isCR=FALSE;
/* theline remembers where this line starts. */
3238 char *theline=*etext;
3243 c=g_utf8_get_char(*etext);
/* True when nothing has been consumed since the start of the line. */
3246 if (*etext==theline)
3248 else if (pswit[LINE_END_SWITCH])
3250 if (pswit[ECHO_SWITCH])
/* Echo only the text of the line, which runs from theline up to eos. */
3252 s=g_strndup(theline,eos-theline);
3253 g_print("\n%s\n",s);
3256 if (!pswit[OVERVIEW_SWITCH])
3258 if (newlines==OS9_NEWLINES)
3259 g_print(" Line %ld - No CR?\n",lcnt);
3262 /* There may, or may not, have been a CR */
3263 g_print(" Line %ld - No LF?\n",lcnt);
3271 *etext=g_utf8_next_char(*etext);
3272 /* either way, it's end of line */
3275 if (newlines==DOS_NEWLINES && !isCR)
3277 /* Error - a LF without a preceding CR */
3278 if (pswit[LINE_END_SWITCH])
3280 if (pswit[ECHO_SWITCH])
3282 s=g_strndup(theline,eos-theline);
3283 g_print("\n%s\n",s);
3286 if (!pswit[OVERVIEW_SWITCH])
3287 g_print(" Line %ld - No CR?\n",lcnt);
3296 if (newlines==OS9_NEWLINES)
/* A CR is queried when one was already seen or the file uses Unix newlines. */
3298 if (isCR || newlines==UNIX_NEWLINES)
3300 if (pswit[LINE_END_SWITCH])
3302 if (pswit[ECHO_SWITCH])
3304 s=g_strndup(theline,eos-theline);
3305 g_print("\n%s\n",s);
3308 if (!pswit[OVERVIEW_SWITCH])
3310 if (newlines==UNIX_NEWLINES)
3311 g_print(" Line %ld column %ld - Embedded CR?\n",
3312 lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
3314 g_print(" Line %ld - Two successive CRs?\n",
3320 if (newlines==UNIX_NEWLINES)
3323 if (newlines==DOS_NEWLINES)
/* isCR is still set here, which is reported as a CR with no LF after it. */
3328 if (pswit[LINE_END_SWITCH] && isCR)
3330 if (pswit[ECHO_SWITCH])
3332 s=g_strndup(theline,eos-theline);
3333 g_print("\n%s\n",s);
3336 if (!pswit[OVERVIEW_SWITCH])
3337 g_print(" Line %ld column %ld - CR without LF?\n",
3338 lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
/* eos moves forward by one character. */
3344 eos=g_utf8_next_char(eos);
/* Optional in-place clean-up of markup in the extracted line. */
3348 if (pswit[MARKUP_SWITCH])
3349 postprocess_for_HTML(theline);
3350 if (pswit[DP_SWITCH])
3351 postprocess_for_DP(theline);
3358 * Takes a "word" as a parameter, and checks whether it
3359 * contains a mixture of alpha and digits. Generally, this is an
3360 * error, but may not be for cases like 4th or L5 12s. 3d.
3362 * Returns: TRUE iff an error is found.
3364 gboolean mixdigit(const char *checkword)
/*
 * Decides whether checkword mixes letters and digits in a way that is worth
 * querying.
 *   checkword - the word to examine (UTF-8)
 * NOTE(review): the statements guarded by the tests below are not visible
 * in this listing; presumably each "legit" pattern clears query and query
 * is the value returned - confirm.
 */
3366 gboolean wehaveadigit,wehavealetter,query;
3367 const char *s,*nondigit;
3368 wehaveadigit=wehavealetter=query=FALSE;
/* Classify every character of the word as alphabetic, digit or other. */
3369 for (s=checkword;*s;s=g_utf8_next_char(s))
3370 if (g_unichar_isalpha(g_utf8_get_char(s)))
3372 else if (g_unichar_isdigit(g_utf8_get_char(s)))
3374 if (wehaveadigit && wehavealetter)
3376 /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
/* Skip the leading digits; nondigit ends up on the first non-digit. */
3378 for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
3379 nondigit=g_utf8_next_char(nondigit))
3381 /* digits, ending in st, rd, nd, th of either case */
3382 if (!g_ascii_strcasecmp(nondigit,"st") ||
3383 !g_ascii_strcasecmp(nondigit,"rd") ||
3384 !g_ascii_strcasecmp(nondigit,"nd") ||
3385 !g_ascii_strcasecmp(nondigit,"th"))
/* The same ordinal suffixes with a trailing "s". */
3387 if (!g_ascii_strcasecmp(nondigit,"sts") ||
3388 !g_ascii_strcasecmp(nondigit,"rds") ||
3389 !g_ascii_strcasecmp(nondigit,"nds") ||
3390 !g_ascii_strcasecmp(nondigit,"ths"))
/* The same ordinal suffixes with a trailing "ly". */
3392 if (!g_ascii_strcasecmp(nondigit,"stly") ||
3393 !g_ascii_strcasecmp(nondigit,"rdly") ||
3394 !g_ascii_strcasecmp(nondigit,"ndly") ||
3395 !g_ascii_strcasecmp(nondigit,"thly"))
3397 /* digits, ending in l, L, s or d */
/* "l" is matched in either case; "s" and "d" only in lower case. */
3398 if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
3399 !strcmp(nondigit,"d"))
3402 * L at the start of a number, representing Britsh pounds, like L500.
3403 * This is cute. We know the current word is mixed digit. If the first
3404 * letter is L, there must be at least one digit following. If both
3405 * digits and letters follow, we have a genuine error, else we have a
3406 * capital L followed by digits, and we accept that as a non-error.
3408 if (g_utf8_get_char(checkword)=='L' &&
3409 !mixdigit(g_utf8_next_char(checkword)))
3418 * Extracts the first/next "word" from the line, and returns it.
3419 * A word is defined as one English word unit--or at least that's the aim.
3420 * "ptr" is advanced to the position in the line where we will start
3421 * looking for the next word.
3423 * Returns: A newly-allocated string.
3425 gchar *getaword(const char **ptr)
/*
 * Builds the next word found at *ptr in a GString and returns its character
 * data; the caller owns the returned string.
 *   ptr - in/out cursor into the line; advanced as described in the comment
 *         above this function
 * NOTE(review): some statements (local declarations, the set-up of s for
 * the look-ahead, the handling of *ptr on the punctuated-number path) are
 * not visible in this listing.
 */
3430 word=g_string_new(NULL);
/* Skip forward until a digit, a letter or the end of the string. */
3431 for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
3432 !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
3433 **ptr;*ptr=g_utf8_next_char(*ptr))
3435 /* Handle exceptions for footnote markers like [1] */
3436 if (g_utf8_get_char(*ptr)=='[')
3438 g_string_append_c(word,'[');
/* Collect the digits after '[' without moving *ptr yet. */
3439 s=g_utf8_next_char(*ptr);
3440 for (;g_unichar_isdigit(g_utf8_get_char(s));s=g_utf8_next_char(s))
3441 g_string_append_unichar(word,g_utf8_get_char(s));
/* Only a closing ']' right after the digits makes this a marker. */
3442 if (g_utf8_get_char(s)==']')
3444 g_string_append_c(word,']');
3445 *ptr=g_utf8_next_char(s);
/* FALSE keeps the character data, which becomes the return value. */
3446 return g_string_free(word,FALSE);
/* Discard the partial marker text collected so far. */
3449 g_string_truncate(word,0);
3453 * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
3454 * Especially yucky is the case of L1,000
3455 * This section looks for a pattern of characters including a digit
3456 * followed by a comma or period followed by one or more digits.
3457 * If found, it returns this whole pattern as a word; otherwise we discard
3458 * the results and resume our normal programming.
3461 for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
3462 g_unichar_isalpha(g_utf8_get_char(s)) ||
3463 g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
3464 g_string_append_unichar(word,g_utf8_get_char(s));
/* Scan the collected text from its second character on. */
3467 for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))
3469 c=g_utf8_get_char(t);
3470 pc=g_utf8_get_char(g_utf8_prev_char(t));
/* A '.' or ',' directly after a digit marks a punctuated number. */
3471 if ((c=='.' || c==',') && g_unichar_isdigit(pc))
3474 return g_string_free(word,FALSE);
3478 /* we didn't find a punctuated number - do the regular getword thing */
3479 g_string_truncate(word,0);
3480 c=g_utf8_get_char(*ptr);
/* A regular word is a run of digits, letters and apostrophes. */
3481 for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);
3482 *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))
3483 g_string_append_unichar(word,c);
3484 return g_string_free(word,FALSE);
3490 * Is this word a Roman Numeral?
3492 * It doesn't actually validate that the number is a valid Roman Numeral--for
3493 * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
3494 * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
3495 * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
3496 * expressions thereof, except when it came to taxes. Allow any number of M,
3497 * an optional D, an optional CM or CD, any number of optional Cs, an optional
3498 * XL or an optional XC, an optional IX or IV, an optional V and any number
3501 gboolean isroman(const char *t)
/*
 * Tests t against a fixed sequence of Roman-numeral parts, in this order:
 * m..., d, cm, cd, c..., xl, xc, l, x..., ix, iv, v, i...
 *   t - the word to test; only lower-case letters are compared
 * NOTE(review): the statements that advance t after each match and the
 * final return are not visible in this listing; presumably the word passes
 * when t has reached its end - confirm.
 */
/* The extra "&& *t" is always true once the character equals 'm'. */
3507 while (g_utf8_get_char(t)=='m' && *t)
3509 if (g_utf8_get_char(t)=='d')
3511 if (g_str_has_prefix(t,"cm"))
3513 if (g_str_has_prefix(t,"cd"))
3515 while (g_utf8_get_char(t)=='c' && *t)
3517 if (g_str_has_prefix(t,"xl"))
3519 if (g_str_has_prefix(t,"xc"))
3521 if (g_utf8_get_char(t)=='l')
3523 while (g_utf8_get_char(t)=='x' && *t)
3525 if (g_str_has_prefix(t,"ix"))
3527 if (g_str_has_prefix(t,"iv"))
3529 if (g_utf8_get_char(t)=='v')
3531 while (g_utf8_get_char(t)=='i' && *t)
3537 * postprocess_for_DP:
3539 * Invoked with the -d switch from flgets().
3540 * It simply "removes" from the line a hard-coded set of common
3541 * DP-specific tags, so that the line passed to the main routine has
3542 * been pre-cleaned of DP markup.
3544 void postprocess_for_DP(char *theline)
/*
 * Deletes every occurrence of each DPmarkup[] string from theline, in
 * place.
 *   theline - NUL-terminated line buffer, modified in place
 */
/* The DPmarkup table ends at its first empty string. */
3550 for (i=0;*DPmarkup[i];i++)
3551 while ((s=strstr(theline,DPmarkup[i])))
/* Close the gap by moving the rest of the line, including its NUL. */
3553 t=s+strlen(DPmarkup[i]);
3554 memmove(s,t,strlen(t)+1);
3559 * postprocess_for_HTML:
3561 * Invoked with the -m switch from flgets().
3562 * It simply "removes" from the line a hard-coded set of common
3563 * HTML tags and "replaces" a hard-coded set of common HTML
3564 * entities, so that the line passed to the main routine has
3565 * been pre-cleaned of HTML.
3567 void postprocess_for_HTML(char *theline)
/*
 * Cleans HTML out of theline in place.
 *   theline - NUL-terminated line buffer, modified in place
 */
/* Call losemarkup() repeatedly until it returns NULL. */
3569 while (losemarkup(theline))
/* Then hand the line to loseentities(). */
3571 loseentities(theline);
3574 char *losemarkup(char *theline)
/*
 * Finds the first '<' in theline that has a '>' after it and, when the text
 * after '<' matches an entry of markup[] (see tagcomp()), deletes the whole
 * "<...>" in place.
 *   theline - NUL-terminated line buffer, modified in place
 * NOTE(review): the return statements are not visible in this listing; the
 * caller loops while the result is non-NULL - confirm what is returned for
 * a recognised and for an unrecognised tag.
 */
3578 s=strchr(theline,'<');
3579 t=s?strchr(s,'>'):NULL;
/* The markup table ends at its first empty string. */
3582 for (i=0;*markup[i];i++)
3583 if (tagcomp(g_utf8_next_char(s),markup[i]))
/* Move the text after '>' (with its NUL) down onto the '<'. */
3585 t=g_utf8_next_char(t);
3586 memmove(s,t,strlen(t)+1);
3589 /* It's an unrecognized <xxx>. */
3593 void loseentities(char *theline)
/*
 * Replaces HTML entities ("&name;", "&#NNN;", "&#xHHH;") in theline, in
 * place, with the characters they stand for.
 *   theline - NUL-terminated line buffer, modified in place. Presumably a
 *             NULL argument selects the clean-up path that destroys the
 *             lookup tree and closes both converters (the guarding test is
 *             not visible in this listing) - confirm.
 * NOTE(review): as shown, entities is declared without "static" while the
 * two converters are static; confirm whether the lookup tree is meant to
 * persist between calls.
 */
3600 GTree *entities=NULL;
/* (GIConv)-1 marks "converter not open". */
3601 static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
3605 g_tree_destroy(entities);
3607 if (translit!=(GIConv)-1)
3608 g_iconv_close(translit);
3609 translit=(GIConv)-1;
3610 if (to_utf8!=(GIConv)-1)
3611 g_iconv_close(to_utf8);
/* Build the name -> code point lookup from the HTMLentities table. */
3619 entities=g_tree_new((GCompareFunc)strcmp);
3620 for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
3621 g_tree_insert(entities,HTMLentities[i].name,
3622 GUINT_TO_POINTER(HTMLentities[i].c));
/* Open the converters on first use. */
3624 if (translit==(GIConv)-1)
3625 translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
3626 if (to_utf8==(GIConv)-1)
3627 to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
3628 while((amp=strchr(theline,'&')))
3630 scolon=strchr(amp,';');
/* Decimal form: every byte from amp+2 up to the ';' is a decimal digit. */
3635 if (amp+2+strspn(amp+2,"0123456789")==scolon)
3636 c=strtol(amp+2,NULL,10);
/* Hexadecimal form: an 'x' at amp[2] followed only by hex digits. */
3637 else if (amp[2]=='x' &&
3638 amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
3639 c=strtol(amp+3,NULL,16);
/* Named form: look the text between '&' and ';' up in the tree. */
3643 s=g_strndup(amp+1,scolon-(amp+1));
3644 c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
/*
 * && binds tighter than ||, so this reads c<128 || (c>=192 && c<=255).
 * NOTE(review): code points 128-191 take the other branch - confirm
 * intended.
 */
3653 if (c<128 || c>=192 && c<=255) /* An ISO-8859-1 character */
3654 theline+=g_unichar_to_utf8(c,theline);
/* Otherwise convert via translit and then to_utf8, and copy the result in. */
3658 nb=g_unichar_to_utf8(c,s);
3659 t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
3661 s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
3663 memcpy(theline,s,nb);
/* Close the gap: move the text after the ';' (with its NUL) up. */
3667 memmove(theline,g_utf8_next_char(scolon),
3668 strlen(g_utf8_next_char(scolon))+1);
/* Continue the search one character after this '&'. */
3671 theline=g_utf8_next_char(amp);
3675 gboolean tagcomp(const char *strin,const char *basetag)
/*
 * Case-insensitive test of whether strin begins with basetag, ignoring one
 * leading '/' in strin.
 *   strin   - text that follows a '<'
 *   basetag - tag name to compare against
 * Both strings are case-folded with g_utf8_casefold() before comparing.
 * NOTE(review): the release of the folded copies and the return statement
 * are not visible in this listing; presumably retval is returned - confirm.
 */
3679 if (g_utf8_get_char(strin)=='/')
3680 t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
3682 t=g_utf8_casefold(strin,-1);
3683 s=g_utf8_casefold(basetag,-1);
3684 retval=g_str_has_prefix(t,s);
3690 void proghelp(GOptionContext *context)
3693 fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
3694 fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
3695 fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
3696 fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
3697 "For details, read the file COPYING.\n",stderr);
3698 fputs("This is Free Software; "
3699 "you may redistribute it under certain conditions (GPL);\n",stderr);
3700 fputs("read the file COPYING for details.\n\n",stderr);
3701 help=g_option_context_get_help(context,TRUE,NULL);
3704 fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
3705 fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
3706 "non-ASCII\n",stderr);
3707 fputs("characters like accented letters, "
3708 "lines longer than 75 or shorter than 55,\n",stderr);
3709 fputs("unbalanced quotes or brackets, "
3710 "a variety of badly formatted punctuation, \n",stderr);
3711 fputs("HTML tags, some likely typos. "
3712 "It is NOT a substitute for human judgement.\n",stderr);