1 /*************************************************************************/
2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com> */
5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk> */
7 /* This program is free software; you can redistribute it and/or modify */
8 /* it under the terms of the GNU General Public License as published by */
9 /* the Free Software Foundation; either version 2 of the License, or */
10 /* (at your option) any later version. */
12 /* This program is distributed in the hope that it will be useful, */
13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
15 /* GNU General Public License for more details. */
17 /* You should have received a copy of the GNU General Public License */
18 /* along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 /*************************************************************************/
30 #include "bookloupe.h"
33 #include "HTMLentities.h"
35 gchar *charset; /* Or NULL for auto (ISO_8859-1/ASCII or UNICODE) */
/*
 * iconv descriptor used to validate text against the selected charset.
 * set_charset() opens it as a conversion from UTF-8 to `charset`; the value
 * (GIConv)-1 means that no descriptor is currently open.
 */
36 GIConv charset_validator=(GIConv)-1;
/*
 * Table of misspellings and likely scanning errors ("scannos").
 * NOTE(review): the declaration that opens this table is not visible here.
 * Judging by the neighbouring table of "OK words not to query as typos",
 * these are presumably the words that ARE queried -- confirm against the
 * declaration and its users.
 */
42 "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
43 "nad", "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa",
44 "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
45 "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
46 "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
47 "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
48 "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
49 "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
50 "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
51 "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
52 "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
53 "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
54 "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
55 "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
56 "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
57 "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
58 "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
59 "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
60 "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
61 "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
62 "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
63 "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
64 "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
65 "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
66 "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
67 "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
68 "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
69 "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
70 "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
76 /* Common abbreviations and other OK words not to query as typos. */
/* All entries are lower case; the list is closed by an empty string. */
78 "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
79 "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
80 "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
81 "outbid", "outbids", "frostbite", "frostbitten", ""
84 /* Common abbreviations that cause otherwise unexplained periods. */
/* Entries are written without the trailing period; "" closes the list. */
86 "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
87 "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
91 * Two-Letter combinations that rarely if ever start words,
92 * but are common scannos or otherwise common letter combinations.
/* Word-initial letter pairs; the empty string closes the list. */
95 "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
99 * Two-Letter combinations that rarely if ever end words,
100 * but are common scannos or otherwise common letter combinations.
/* Word-final letter pairs; the empty string closes the list. */
103 "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
104 "sw", "gr", "sl", "cl", "iy", ""
/*
 * Lower-case HTML element names, closed by an empty string.
 * NOTE(review): the declaration of this table is not visible here; it is
 * presumably what the markup handling matches tag names against -- confirm.
 */
108 "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
109 "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
110 "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
111 "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
/*
 * Literal markup sequences, closed by an empty string.
 * NOTE(review): presumably the DP-specific markup that the "dp" switch
 * ("Ignore DP-specific markup") refers to -- the declaration is not visible.
 */
115 "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
/*
 * Lower-case word list closed by an empty string.
 * NOTE(review): neither the declaration nor any use of this table is
 * visible here, so its purpose cannot be stated with confidence. Note that
 * "mrs" appears twice.
 */
119 "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
120 "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
121 "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
122 "during", "let", "toward", "among", ""
/*
 * Lower-case word list closed by an empty string.
 * NOTE(review): neither the declaration nor any use of this table is
 * visible here, so its purpose cannot be stated with confidence.
 */
126 "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
127 "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
128 "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
129 "among", "those", "into", "whom", "having", "thence", ""
/* Indexed by the *_SWITCH constants; written by the option parser. */
132 gboolean pswit[SWITNO]; /* program switches */
/*
 * Targets of the gutcheck-compatible "toggle-typo" and "toggle-relaxed"
 * command-line switches.
 */
135 gboolean typo_compat,paranoid_compat;
/*
 * Main command-line switches.
 *
 * Every boolean (G_OPTION_ARG_NONE) entry stores into one slot of pswit[],
 * and each such switch is listed twice: once under its plain name and once
 * with a "no-" prefix. The member of the pair carrying
 * G_OPTION_FLAG_REVERSE stores the reversed sense, and the member carrying
 * G_OPTION_FLAG_HIDDEN is left out of the help listing.
 *
 * "charset" is the only string-valued entry; it stores into opt_charset.
 *
 * The configuration-file helpers in this file iterate over this table by
 * long name, so the long names double as configuration keys.
 */
137 static GOptionEntry options[]={
138 { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
139 "Ignore DP-specific markup", NULL },
140 { "no-dp", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
141 G_OPTION_ARG_NONE, pswit+DP_SWITCH,
142 "Don't ignore DP-specific markup", NULL },
143 { "echo", 0, G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
144 "Echo queried line", NULL },
145 { "no-echo", 'e', G_OPTION_FLAG_REVERSE,
146 G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
147 "Don't echo queried line", NULL },
148 { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
149 "Check single quotes", NULL },
150 { "no-squote", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
151 G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
152 "Don't check single quotes", NULL },
153 { "typo", 0, 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
154 "Check common typos", NULL },
155 { "no-typo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
156 G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
157 "Don't check common typos", NULL },
158 { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
159 "Require closure of quotes on every paragraph", NULL },
160 { "no-qpara", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
161 G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
162 "Don't require closure of quotes on every paragraph", NULL },
163 { "paranoid", 0, G_OPTION_FLAG_HIDDEN,
164 G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
165 "Enable paranoid querying of everything", NULL },
166 { "no-paranoid", 0, G_OPTION_FLAG_REVERSE,
167 G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
168 "Disable paranoid querying of everything", NULL },
169 { "line-end", 0, G_OPTION_FLAG_HIDDEN,
170 G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
171 "Enable line end checking", NULL },
172 { "no-line-end", 'l', G_OPTION_FLAG_REVERSE,
173 G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
174 "Disable line end checking", NULL },
175 { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
176 "Overview: just show counts", NULL },
177 { "no-overview", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
178 G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
179 "Show individual warnings", NULL },
180 { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
181 "Output errors to stdout instead of stderr", NULL },
182 { "no-stdout", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
183 G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
184 "Output errors to stderr instead of stdout", NULL },
185 { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
186 "Echo header fields", NULL },
187 { "no-header", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
188 G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
189 "Don't echo header fields", NULL },
190 { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
191 "Ignore markup in < >", NULL },
192 { "no-markup", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
193 G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
194 "No special handling for markup in < >", NULL },
195 { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
196 "Use file of user-defined typos", NULL },
197 { "no-usertypo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
198 G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
199 "Ignore file of user-defined typos", NULL },
200 { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
201 "Verbose - list everything", NULL },
202 { "no-verbose", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
203 G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
204 "Switch off verbose mode", NULL },
205 { "charset", 0, 0, G_OPTION_ARG_STRING, &opt_charset,
206 "Set of characters valid for this ebook", "NAME" },
211 * Options relating to configuration which make no sense from inside
212 * a configuration file.
/*
 * Command-line-only switches. parse_options() registers them as main
 * entries alongside options[], but the configuration-file helpers walk
 * options[] only, so these never appear in or are read from a config file.
 */
215 static GOptionEntry config_options[]={
216 { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
217 "Defaults for use on www upload", NULL },
218 { "dump-config", 0, 0, G_OPTION_ARG_NONE, pswit+DUMP_CONFIG_SWITCH,
219 "Dump current config settings", NULL },
/*
 * Gutcheck-compatible toggle switches. parse_options() registers them in a
 * separate "compatibility" option group. Unlike the main switches they do
 * not store into pswit[] directly but into typo_compat and paranoid_compat.
 */
223 static GOptionEntry compatibility_options[]={
224 { "toggle-typo", 't', 0, G_OPTION_ARG_NONE, &typo_compat,
225 "Toggle checking for common typos", NULL },
226 { "toggle-relaxed", 'x', 0, G_OPTION_ARG_NONE, &paranoid_compat,
227 "Toggle both paranoid mode and common typos", NULL },
/*
 * Query counters. main() prints the cnt_* values (except cnt_spacend) and
 * their total as a summary when the overview switch is set, together with
 * linecnt and checked_linecnt.
 */
231 long cnt_quote; /* for overview mode, count of quote queries */
232 long cnt_brack; /* for overview mode, count of brackets queries */
233 long cnt_bin; /* for overview mode, count of non-ASCII queries */
234 long cnt_odd; /* for overview mode, count of odd character queries */
235 long cnt_long; /* for overview mode, count of long line errors */
236 long cnt_short; /* for overview mode, count of short line queries */
237 long cnt_punct; /* for overview mode,
238 count of punctuation and spacing queries */
239 long cnt_dash; /* for overview mode, count of dash-related queries */
240 long cnt_word; /* for overview mode, count of word queries */
241 long cnt_html; /* for overview mode, count of html queries */
242 long cnt_lineend; /* for overview mode, count of line-end queries */
243 long cnt_spacend; /* count of lines with space at end */
244 long linecnt; /* count of total lines in the file */
245 long checked_linecnt; /* count of lines actually checked */
247 void proghelp(GOptionContext *context);
248 void procfile(const char *);
252 gboolean mixdigit(const char *);
253 gchar *getaword(const char *,const char **);
254 char *flgets(char **,long,int);
255 void postprocess_for_HTML(char *);
256 char *linehasmarkup(char *);
257 char *losemarkup(char *);
258 gboolean tagcomp(const char *,const char *);
259 void loseentities(char *);
260 gboolean isroman(const char *);
261 void postprocess_for_DP(char *);
262 void print_as_windows_1252(const char *string);
263 void print_as_utf_8(const char *string);
265 GTree *qword,*qperiod;
/*
 * set_charset:
 * @name: name of the character set, or NULL / "auto" (compared without
 *        regard to case) for automatic mode.
 * @err:  location for a G_CONVERT_ERROR_NO_CONVERSION error when no
 *        conversion to the named character set can be opened.
 *
 * Select the character set that the text is validated against. Any
 * validator that is currently open is closed first. A name matching one of
 * the Unicode aliases is replaced by "UTF-8", for which no validator is
 * opened; for every other name a conversion from UTF-8 to that character
 * set is opened and kept in charset_validator.
 *
 * Returns: a gboolean. The return statements are not visible here; callers
 * in this file treat a false result as failure with @err set.
 *
 * NOTE(review): g_strcasecmp() is deprecated in GLib; the documented
 * replacement for ASCII names such as these is g_ascii_strcasecmp().
 */
271 gboolean set_charset(const char *name,GError **err)
273 /* The various UNICODE encodings all share the same character set. */
274 const char *unicode_aliases[]={ "UCS-2", "UCS-2BE", "UCS-2LE", "UCS-4",
275 "UCS-4BE", "UCS-4LE", "UCS2", "UCS4", "UNICODE", "UNICODEBIG",
276 "UNICODELITTLE", "UTF-7", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE",
277 "UTF-32", "UTF-32BE", "UTF-32LE", "UTF7", "UTF8", "UTF16", "UTF16BE",
278 "UTF16LE", "UTF32", "UTF32BE", "UTF32LE" };
/* Release the previous validator before deciding on a new one. */
282 if (charset_validator!=(GIConv)-1)
283 g_iconv_close(charset_validator);
284 if (!name || !g_strcasecmp(name,"auto"))
287 charset_validator=(GIConv)-1;
291 charset=g_strdup(name);
/* Fold every Unicode encoding name onto the single name "UTF-8". */
292 for(i=0;i<G_N_ELEMENTS(unicode_aliases);i++)
293 if (!g_strcasecmp(charset,unicode_aliases[i]))
296 charset=g_strdup("UTF-8");
/* UTF-8 needs no validator; anything else is validated through iconv. */
299 if (!strcmp(charset,"UTF-8"))
300 charset_validator=(GIConv)-1;
303 charset_validator=g_iconv_open(charset,"UTF-8");
304 if (charset_validator==(GIConv)-1)
306 g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_NO_CONVERSION,
307 "Unknown character set \"%s\"",charset);
/*
 * config_file_update:
 * @kf: key file to update.
 *
 * Copy the current value of the entries in options[] into the "options"
 * group of @kf, keyed by long option name. Boolean entries are written with
 * g_key_file_set_boolean() and string entries with g_key_file_set_string();
 * any other argument type trips g_assert_not_reached().
 *
 * Entries whose long name begins with "no-" are tested for separately. The
 * action taken for them is not visible here (presumably they are skipped so
 * that each switch is written once) -- confirm.
 */
316 void config_file_update(GKeyFile *kf)
321 for(i=0;options[i].long_name;i++)
323 if (g_str_has_prefix(options[i].long_name,"no-"))
325 if (options[i].arg==G_OPTION_ARG_NONE)
327 sw=*(gboolean *)options[i].arg_data;
/* Entries with the reversed sense get special treatment before storing. */
328 if (options[i].flags&G_OPTION_FLAG_REVERSE)
330 g_key_file_set_boolean(kf,"options",options[i].long_name,sw);
332 else if (options[i].arg==G_OPTION_ARG_STRING)
334 s=*(gchar **)options[i].arg_data;
337 g_key_file_set_string(kf,"options",options[i].long_name,s);
340 g_assert_not_reached();
/*
 * config_file_add_comments:
 * @kf: key file to annotate.
 *
 * Attach explanatory comments to @kf: one comment with NULL group and key
 * (which GLib places at the top of the file), and for entries in options[]
 * a comment on the matching key in the "options" group, built from a space
 * followed by the option's help description. Entries whose long name begins
 * with "no-" are tested for separately (presumably skipped -- the action is
 * not visible here).
 */
344 void config_file_add_comments(GKeyFile *kf)
348 g_key_file_set_comment(kf,NULL,NULL," Default configuration for bookloupe",
350 for(i=0;options[i].long_name;i++)
352 if (g_str_has_prefix(options[i].long_name,"no-"))
354 comment=g_strconcat(" ",options[i].description,NULL);
355 g_key_file_set_comment(kf,"options",options[i].long_name,comment,NULL);
/*
 * dump_config:
 * Serialise the current settings as key-file text.
 *
 * Two paths are visible: one updates the existing `config` key file in
 * place; the other creates a new key file, fills it from the current
 * settings and adds the explanatory comments. The condition choosing
 * between them is not visible here (presumably whether a configuration file
 * was loaded). The result of g_key_file_to_data() is kept in `s`; what is
 * done with it afterwards is not visible here.
 */
360 void dump_config(void)
364 config_file_update(config);
367 config=g_key_file_new();
368 config_file_update(config);
369 config_file_add_comments(config);
371 s=g_key_file_to_data(config,NULL,NULL);
/*
 * read_config_file:
 * @full_path: out parameter. NOTE(review): its assignment is not visible
 *             here; parse_config_file() uses the value in diagnostics, so
 *             it presumably receives the path of the file that was loaded.
 *
 * Look for "bookloupe.ini" in a list of directories and load it as a key
 * file, keeping comments and translations. The list comes from the
 * BOOKLOUPE_CONFIG_PATH environment variable, split on ";" or on ":" (the
 * choice between the two separators is not visible here, presumably
 * platform dependent). Otherwise it is the current directory, the
 * directory the program runs from, and the user configuration directory.
 *
 * A load error other than "no such file" is reported on stderr.
 */
377 GKeyFile *read_config_file(gchar **full_path)
383 const char *search_path;
386 search_path=g_getenv("BOOKLOUPE_CONFIG_PATH");
390 search_dirs=g_strsplit(search_path,";",0);
392 search_dirs=g_strsplit(search_path,":",0);
/* Default search list; room is reserved for three entries and a terminator. */
397 search_dirs=g_new(gchar *,4);
398 search_dirs[0]=g_get_current_dir();
399 search_dirs[1]=g_strdup(running_from);
400 search_dirs[2]=g_strdup(g_get_user_config_dir());
403 for(i=0;search_dirs[i];i++)
405 path=g_build_filename(search_dirs[i],"bookloupe.ini",NULL);
406 if (g_key_file_load_from_file(kf,path,
407 G_KEY_FILE_KEEP_COMMENTS|G_KEY_FILE_KEEP_TRANSLATIONS,&err))
/* A missing file is not an error worth reporting; anything else is. */
409 if (!g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
411 g_printerr("Bookloupe: Error reading %s\n",path);
412 g_printerr("%s\n",err->message);
424 g_strfreev(search_dirs);
/*
 * parse_config_file:
 * Load the configuration file and apply the keys of its "options" group to
 * the variables behind options[].
 *
 * Each key is compared with the long names in options[]; entries whose long
 * name begins with "no-" are tested for separately (presumably skipped --
 * the action is not visible here). Boolean keys are read with
 * g_key_file_get_boolean() and string keys with g_key_file_get_string();
 * a value that cannot be read is reported on stderr together with the
 * file path and key. A key matching no entry is reported as unknown.
 */
432 void parse_config_file(void)
439 config=read_config_file(&path);
441 keys=g_key_file_get_keys(config,"options",NULL,NULL);
448 for(j=0;options[j].long_name;j++)
450 if (g_str_has_prefix(options[j].long_name,"no-"))
452 else if (!strcmp(keys[i],options[j].long_name))
454 if (options[j].arg==G_OPTION_ARG_NONE)
456 sw=g_key_file_get_boolean(config,"options",keys[i],
460 g_printerr("Bookloupe: %s: options.%s: %s\n",
461 path,keys[i],err->message);
466 if (options[j].flags&G_OPTION_FLAG_REVERSE)
468 *(gboolean *)options[j].arg_data=sw;
472 else if (options[j].arg==G_OPTION_ARG_STRING)
474 s=g_key_file_get_string(config,"options",keys[i],
478 g_printerr("Bookloupe: %s: options.%s: %s\n",
479 path,keys[i],err->message);
/* Replace the old string value; "auto" is stored as NULL. */
484 g_free(*(gchar **)options[j].arg_data);
485 if (!g_strcmp0(s,"auto"))
487 *(gchar **)options[j].arg_data=NULL;
491 *(gchar **)options[j].arg_data=s;
496 g_assert_not_reached();
/* The loop ran off the end of options[]: the key matched nothing. */
499 if (!options[j].long_name)
500 g_printerr("Bookloupe: %s: Unknown option \"%s\" ignored\n",
/*
 * parse_options:
 * @argc: address of the argument count, passed on to the GLib parser.
 * @argv: address of the argument vector, passed on to the GLib parser.
 *
 * Parse the command line. options[] and config_options[] are registered as
 * main entries and compatibility_options[] as the "compatibility" group.
 * A parse failure is reported on stderr with a hint to use --help.
 *
 * After parsing, the visible code:
 *  - inverts the typo switch, and the paranoid and typo switches together
 *    (the guarding conditions are not visible here; presumably typo_compat
 *    and paranoid_compat respectively);
 *  - forces a fixed set of switch values when the web switch is set;
 *  - applies opt_charset through set_charset(), printing any error;
 *  - tests the dump-config switch (the action is not visible here);
 *  - turns echoing off when the overview switch is set.
 */
509 void parse_options(int *argc,char ***argv)
512 GOptionContext *context;
513 GOptionGroup *compatibility;
514 context=g_option_context_new(
515 "file - look for errors in Project Gutenberg(TM) etexts");
516 g_option_context_add_main_entries(context,options,NULL);
517 g_option_context_add_main_entries(context,config_options,NULL);
518 compatibility=g_option_group_new("compatibility",
519 "Options for Compatibility with Gutcheck:",
520 "Show compatibility options",NULL,NULL);
521 g_option_group_add_entries(compatibility,compatibility_options);
522 g_option_context_add_group(context,compatibility);
523 g_option_context_set_description(context,
524 "For simplicity, only the switch options which reverse the\n"
525 "default configuration are listed. In most cases, both vanilla\n"
526 "and \"no-\" prefixed versions are available for use.");
527 if (!g_option_context_parse(context,argc,argv,&err))
529 g_printerr("Bookloupe: %s\n",err->message);
530 g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
534 pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
537 pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
538 pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
541 * Web uploads - for the moment, this is really just a placeholder
542 * until we decide what processing we really want to do on web uploads
544 if (pswit[WEB_SWITCH])
546 /* specific override for web uploads */
547 pswit[ECHO_SWITCH]=TRUE;
548 pswit[SQUOTE_SWITCH]=FALSE;
549 pswit[TYPO_SWITCH]=TRUE;
550 pswit[QPARA_SWITCH]=FALSE;
551 pswit[PARANOID_SWITCH]=TRUE;
552 pswit[LINE_END_SWITCH]=FALSE;
553 pswit[OVERVIEW_SWITCH]=FALSE;
554 pswit[STDOUT_SWITCH]=FALSE;
555 pswit[HEADER_SWITCH]=TRUE;
556 pswit[VERBOSE_SWITCH]=FALSE;
557 pswit[MARKUP_SWITCH]=FALSE;
558 pswit[USERTYPO_SWITCH]=FALSE;
559 pswit[DP_SWITCH]=FALSE;
561 if (opt_charset && !set_charset(opt_charset,&err))
563 g_printerr("%s\n",err->message);
566 if (pswit[DUMP_CONFIG_SWITCH])
573 if (pswit[OVERVIEW_SWITCH])
574 /* just print summary; don't echo */
575 pswit[ECHO_SWITCH]=FALSE;
581 g_option_context_free(context);
587 * Read in the user-defined stealth scanno list.
/*
 * read_user_scannos:
 * Load the user's own list of typos into the `usertypo` tree.
 *
 * The file is looked for, in this order, as "bookloupe.typ" in the current
 * directory, "bookloupe.typ" in the directory the program runs from,
 * "gutcheck.typ" in the current directory and "gutcheck.typ" in the
 * program directory. Each later attempt follows a test for a "no such
 * file" error from the previous one. If none exists a notice is printed
 * and user typos are not used; another error is reported on stderr.
 *
 * Contents that are valid UTF-8 are normalised (default composition);
 * otherwise they are converted from WINDOWS-1252 to UTF-8. The text is
 * split on CR and LF, and every line whose first byte is greater than '!'
 * is inserted as a key of `usertypo`.
 */
589 void read_user_scannos(void)
592 gchar *usertypo_file;
596 gchar *contents,*utf8,**lines;
597 usertypo_file=g_strdup("bookloupe.typ");
598 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
599 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
602 g_free(usertypo_file);
603 usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
604 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
606 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
609 g_free(usertypo_file);
610 usertypo_file=g_strdup("gutcheck.typ");
611 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
613 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
616 g_free(usertypo_file);
617 usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
618 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
620 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
622 g_free(usertypo_file);
623 g_print(" --> I couldn't find bookloupe.typ "
624 "-- proceeding without user typos.\n");
629 fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
630 g_free(usertypo_file);
634 if (g_utf8_validate(contents,len,NULL))
636 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
/* NOTE(review): the condition guarding this charset switch is not visible. */
638 (void)set_charset("UNICODE",NULL);
641 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
643 lines=g_strsplit_set(utf8,"\r\n",0);
/* Keys are freed with g_free() when removed from the tree. */
645 usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
646 for (i=0;lines[i];i++)
647 if (*(unsigned char *)lines[i]>'!')
648 g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
657 * Read an etext returning a newly allocated string containing the file
658 * contents or NULL on error.
/*
 * read_etext:
 * @filename: path of the text to read.
 * @err:      location for an error from reading or converting the file.
 *
 * Read the whole file and hand it back as UTF-8. Contents that already
 * validate as UTF-8 are normalised (default composition) and output is
 * routed through print_as_utf_8(). Anything else is converted from
 * WINDOWS-1252 and output is routed through print_as_windows_1252().
 *
 * When the conversion stops at an illegal sequence, the bytes consumed so
 * far are scanned for line feeds and carriage returns to work out a line
 * and column, and an error naming the offending byte and its position is
 * set. Other conversion errors are propagated unchanged.
 *
 * The SetConsoleOutputCP() calls are presumably compiled for Windows only;
 * the preprocessor guards are not visible here.
 */
660 gchar *read_etext(const char *filename,GError **err)
662 GError *tmp_err=NULL;
663 gchar *contents,*utf8;
664 gsize len,bytes_read,bytes_written;
666 if (!g_file_get_contents(filename,&contents,&len,err))
668 if (g_utf8_validate(contents,len,NULL))
670 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
671 g_set_print_handler(print_as_utf_8);
673 SetConsoleOutputCP(CP_UTF8);
678 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,
679 &bytes_written,&tmp_err);
680 if (g_error_matches(tmp_err,G_CONVERT_ERROR,
681 G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
/* Walk the bytes before the failure to locate it as line and column. */
684 for(i=0;i<bytes_read;i++)
685 if (contents[i]=='\n')
690 else if (contents[i]!='\r')
692 g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
693 "Input conversion failed. Byte %d at line %d, column %d is not a "
694 "valid Windows-1252 character",
695 ((unsigned char *)contents)[bytes_read],line,col);
698 g_propagate_error(err,tmp_err);
699 g_set_print_handler(print_as_windows_1252);
701 SetConsoleOutputCP(1252);
/*
 * cleanup_on_exit:
 * atexit() handler registered by main(). It restores the console output
 * code page that main() saved in saved_cp before any output was produced.
 */
708 void cleanup_on_exit(void)
711 SetConsoleOutputCP(saved_cp);
/*
 * main:
 * Program entry point. It registers the exit handler, records the
 * directory the program was started from, sets the switches that default
 * to on (paranoid, typo, line-end, echo), parses the command line, checks
 * the text, prints the overview summary when requested and releases the
 * global resources.
 *
 * NOTE(review): the calls that load the configuration file, read the user
 * typo file and process the input file are not visible here.
 */
715 int main(int argc,char **argv)
718 atexit(cleanup_on_exit);
719 saved_cp=GetConsoleOutputCP();
721 running_from=g_path_get_dirname(argv[0]);
722 /* Paranoid checking is turned OFF, not on, by its switch */
723 pswit[PARANOID_SWITCH]=TRUE;
724 /* if running in paranoid mode, typo checks default to enabled */
725 pswit[TYPO_SWITCH]=TRUE;
726 /* Line-end checking is turned OFF, not on, by its switch */
727 pswit[LINE_END_SWITCH]=TRUE;
728 /* Echoing is turned OFF, not on, by its switch */
729 pswit[ECHO_SWITCH]=TRUE;
731 parse_options(&argc,&argv);
732 if (pswit[USERTYPO_SWITCH])
734 fprintf(stderr,"bookloupe: Check and report on an e-text\n");
/* Overview mode: print one total per query category instead of details. */
736 if (pswit[OVERVIEW_SWITCH])
738 g_print(" Checked %ld lines of %ld (head+foot = %ld)\n\n",
739 checked_linecnt,linecnt,linecnt-checked_linecnt);
740 g_print(" --------------- Queries found --------------\n");
742 g_print(" Long lines: %14ld\n",cnt_long);
744 g_print(" Short lines: %14ld\n",cnt_short);
746 g_print(" Line-end problems: %14ld\n",cnt_lineend);
748 g_print(" Common typos: %14ld\n",cnt_word);
750 g_print(" Unmatched quotes: %14ld\n",cnt_quote);
752 g_print(" Unmatched brackets: %14ld\n",cnt_brack);
754 g_print(" Non-ASCII characters: %14ld\n",cnt_bin);
756 g_print(" Proofing characters: %14ld\n",cnt_odd);
758 g_print(" Punctuation & spacing queries: %14ld\n",cnt_punct);
760 g_print(" Non-standard dashes: %14ld\n",cnt_dash);
762 g_print(" Possible HTML tags: %14ld\n",cnt_html);
764 g_print(" TOTAL QUERIES %14ld\n",
765 cnt_quote+cnt_brack+cnt_bin+cnt_odd+cnt_long+cnt_short+cnt_punct+
766 cnt_dash+cnt_word+cnt_html+cnt_lineend);
/* Release global resources; set_charset(NULL,...) closes any validator. */
768 g_free(running_from);
770 g_tree_unref(usertypo);
771 set_charset(NULL,NULL);
773 g_key_file_free(config);
/*
 * count_dashes:
 * @line:    one line of text (UTF-8).
 * @dash:    the dash sequence to look for, e.g. "--".
 * @results: tallies to update.
 *
 * Split @line on every occurrence of @dash and, for each occurrence, look
 * at the character just before it (last character of the preceding piece)
 * and the character just after it (first character of the following
 * piece). Three cases are distinguished: white space on at least one side,
 * white space on both sides, and white space on neither side. The
 * assignments made in each case and the conditions guarding the updates of
 * @results are not visible here.
 */
777 void count_dashes(const char *line,const char *dash,
778 struct dash_results *results)
783 gboolean spaced=FALSE,unspaced=FALSE,spaced2=FALSE;
786 tokens=g_strsplit(line,dash,0);
789 for(i=1;tokens[i];i++)
/*
 * NOTE(review): when the piece before the dash is empty (the line starts
 * with the dash, or two dash sequences are adjacent), the pointer passed to
 * g_utf8_prev_char() is the start of that piece. g_utf8_prev_char() does
 * not check for the start of the string, so it steps to a byte before the
 * buffer. No guard against this is visible here -- confirm.
 */
791 pc=g_utf8_get_char(g_utf8_prev_char(tokens[i-1]+strlen(tokens[i-1])));
792 nc=g_utf8_get_char(tokens[i]);
793 if (g_unichar_isspace(pc) || g_unichar_isspace(nc))
795 if (g_unichar_isspace(pc) && g_unichar_isspace(nc))
797 else if (!g_unichar_isspace(pc) && !g_unichar_isspace(nc))
803 /* count of lines with em-dashes with spaces both sides */
804 results->non_PG_space++;
806 /* count of lines with PG-type em-dashes with no spaces */
814 * Run a first pass - verify that it's a valid PG
815 * file, decide whether to report some things that
816 * occur many times in the text like long or short
817 * lines, non-standard dashes, etc.
/*
 * first_pass:
 * @etext: the whole text as one UTF-8 string.
 *
 * Gather whole-file statistics: the newline convention, the position of
 * the Project Gutenberg header and footer markers, and per-line tallies
 * (among them total length, ".," lines, lines with slashes, very long
 * lines, HTML-looking content, dash usage, Dutch and French marker words
 * and standalone 0/1 digits).
 *
 * The tallies live in a function-local static structure. The return
 * statement is not visible here; presumably a pointer to that structure is
 * returned, in which case the result is shared storage that is not reset
 * between calls -- confirm.
 */
819 struct first_pass_results *first_pass(const char *etext)
821 gunichar laststart=CHAR_SPACE;
826 unsigned int lastlen=0,lastblen=0;
827 long spline=0,nspline=0;
828 static struct first_pass_results results={0};
829 struct dash_results tmp_dash_results;
/* Work out the newline convention by splitting on LF first, then on CR. */
832 lines=g_strsplit(etext,"\n",0);
835 /* An empty etext has no terminators */
836 results.newlines=DOS_NEWLINES;
841 * If there are no LFs, we don't have UNIX-style
842 * terminators, but we might have OS9-style ones.
844 results.newlines=OS9_NEWLINES;
846 lines=g_strsplit(etext,"\r",0);
847 if (!lines[0] || !lines[1])
848 /* Looks like we don't have any terminators at all */
849 results.newlines=DOS_NEWLINES;
853 /* We might have UNIX-style terminators */
854 results.newlines=UNIX_NEWLINES;
856 for (j=0;lines[j];j++)
858 lbytes=strlen(lines[j]);
859 if (lbytes>0 && lines[j][lbytes-1]=='\r')
861 results.newlines=DOS_NEWLINES;
864 lines[j][--lbytes]='\0';
865 } while (lbytes>0 && lines[j][lbytes-1]=='\r');
867 llen=g_utf8_strlen(lines[j],lbytes);
/* Old-style header marker ("*END ... SMALL PRINT ..."). */
869 if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
870 (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
873 g_print(" --> Duplicate header?\n");
874 spline=linecnt+1; /* first line of non-header text, that is */
876 if (!strncmp(lines[j],"*** START",9) &&
877 strstr(lines[j],"PROJECT GUTENBERG"))
880 g_print(" --> Duplicate header?\n");
881 nspline=linecnt+1; /* first line of non-header text, that is */
/* Once a header has been seen, watch for the footer marker. */
883 if (spline || nspline)
885 lc_line=g_utf8_strdown(lines[j],lbytes);
886 if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
888 if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
890 if (results.footerline)
892 /* it's an old-form header - we can detect duplicates */
894 g_print(" --> Duplicate footer?\n");
897 results.footerline=linecnt;
903 results.firstline=spline;
905 results.firstline=nspline; /* override with new */
906 if (results.footerline)
907 continue; /* don't count the boilerplate in the footer */
/* Character statistics for this line. */
908 results.totlen+=llen;
909 for (s=lines[j];*s;s=g_utf8_next_char(s))
911 if (g_utf8_get_char(s)>127)
913 if (g_unichar_isalpha(g_utf8_get_char(s)))
917 if (CHAR_IS_DQUOTE(g_utf8_get_char(s)))
918 qc=QUOTE_CLASS(g_utf8_get_char(s));
921 if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) &&
922 g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
923 results.endquote_count++;
926 if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
927 lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
930 g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
932 if (strstr(lines[j],".,"))
934 /* only count ast lines for ignoring purposes where there is */
935 /* locase text on the line */
936 if (strchr(lines[j],'*'))
938 for (s=lines[j];*s;s=g_utf8_next_char(s))
939 if (g_unichar_islower(g_utf8_get_char(s)))
944 if (strchr(lines[j],'/'))
945 results.fslashline++;
948 for (s=g_utf8_prev_char(lines[j]+lbytes);
949 s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;
950 s=g_utf8_prev_char(s))
952 if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
953 g_utf8_get_char(g_utf8_prev_char(s))!='-')
956 if (llen>LONGEST_PG_LINE)
958 if (llen>WAY_TOO_LONG)
959 results.verylongline++;
960 if (strchr(lines[j],'<') && strchr(lines[j],'>'))
962 i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
965 if (strstr(lines[j],"<i>"))
966 results.htmcount+=4; /* bonus marks! */
968 /* Check for spaced em-dashes */
969 memset(&tmp_dash_results,0,sizeof(tmp_dash_results));
970 count_dashes(lines[j],"--",&tmp_dash_results);
971 count_dashes(lines[j],"—",&tmp_dash_results);
972 if (tmp_dash_results.base)
973 results.emdash.base++;
974 if (tmp_dash_results.non_PG_space)
975 results.emdash.non_PG_space++;
976 if (tmp_dash_results.PG_space)
977 results.emdash.PG_space++;
980 inword=getaword(NULL,&s);
981 if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))
982 results.Dutchcount++;
983 if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))
984 results.Frenchcount++;
985 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
986 results.standalone_digit++;
989 /* Check for spaced dashes */
990 if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
/*
 * NOTE(review): laststart is a gunichar but the assignment of lines[j][0]
 * stores the first BYTE of the line, not its first decoded character. It
 * is only compared with CHAR_SPACE in the visible code.
 */
994 laststart=lines[j][0];
1001 * report_first_pass:
1003 * Make some snap decisions based on the first pass results.
/*
 * report_first_pass:
 * @results: statistics gathered by the first pass. The visible code also
 *           writes to it (firstline is reset to 0 in one case).
 *
 * Turn the first-pass statistics into a set of warning flags, printing a
 * " --> ..." notice for each category that is so frequent that it will not
 * be reported line by line. When the verbose switch is set the flags are
 * switched back on regardless. The function also switches the markup
 * switch on when the text looks like HTML.
 *
 * The flags live in a function-local static structure. The return
 * statement is not visible here; presumably a pointer to that structure is
 * returned -- confirm.
 */
1005 struct warnings *report_first_pass(struct first_pass_results *results)
1007 static struct warnings warnings={0};
1008 warnings.newlines=results->newlines;
1009 if (warnings.newlines==UNIX_NEWLINES)
1010 g_print(" --> No lines in this file have a CR. Not reporting them. "
1011 "Project Gutenberg requires that all lineends be CR-LF.\n");
1012 else if (warnings.newlines==OS9_NEWLINES)
1013 g_print(" --> No lines in this file have a LF. Not reporting them. "
1014 "Project Gutenberg requires that all lineends be CR-LF.\n");
1016 g_print(" --> %ld lines in this file have white space at end\n",
1018 warnings.dotcomma=1;
/* More than 5 lines with ".," switches that warning off. */
1019 if (results->dotcomma>5)
1021 warnings.dotcomma=0;
1022 g_print(" --> %ld lines in this file contain '.,'. "
1023 "Not reporting them.\n",results->dotcomma);
1026 * If more than 50 lines, or one-tenth, are short,
1027 * don't bother reporting them.
1029 warnings.shortline=1;
1030 if (results->shortline>50 || results->shortline*10>linecnt)
1032 warnings.shortline=0;
1033 g_print(" --> %ld lines in this file are short. "
1034 "Not reporting short lines.\n",results->shortline);
1037 * If more than 50 lines, or one-tenth, are long,
1038 * don't bother reporting them.
1040 warnings.longline=1;
1041 if (results->longline>50 || results->longline*10>linecnt)
1043 warnings.longline=0;
1044 g_print(" --> %ld lines in this file are long. "
1045 "Not reporting long lines.\n",results->longline);
1047 /* If more than 10 lines contain asterisks, don't bother reporting them. */
1049 if (results->astline>10)
1052 g_print(" --> %ld lines in this file contain asterisks. "
1053 "Not reporting them.\n",results->astline);
1056 * If more than 10 lines contain forward slashes,
1057 * don't bother reporting them.
1060 if (results->fslashline>10)
1063 g_print(" --> %ld lines in this file contain forward slashes. "
1064 "Not reporting them.\n",results->fslashline);
1067 * If more than 20 lines contain unpunctuated endquotes,
1068 * don't bother reporting them.
1070 warnings.endquote=1;
1071 if (results->endquote_count>20)
1073 warnings.endquote=0;
1074 g_print(" --> %ld lines in this file contain unpunctuated endquotes. "
1075 "Not reporting them.\n",results->endquote_count);
1078 * If more than 15 lines contain standalone digits,
1079 * don't bother reporting them.
/*
 * NOTE(review): the threshold tested below is 10, not the 15 stated in the
 * comment above; one of the two is out of date.
 */
1082 if (results->standalone_digit>10)
1085 g_print(" --> %ld lines in this file contain standalone 0s and 1s. "
1086 "Not reporting them.\n",results->standalone_digit);
1089 * If more than 20 lines contain hyphens at end,
1090 * don't bother reporting them.
1093 if (results->hyphens>20)
1096 g_print(" --> %ld lines in this file have hyphens at end. "
1097 "Not reporting them.\n",results->hyphens);
/* A high HTML score turns markup handling on unless it is on already. */
1099 if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
1101 g_print(" --> Looks like this is HTML. Switching HTML mode ON.\n");
1102 pswit[MARKUP_SWITCH]=1;
1104 if (results->verylongline>0)
1105 g_print(" --> %ld lines in this file are VERY long!\n",
1106 results->verylongline);
1108 * If there are more non-PG spaced dashes than PG em-dashes,
1109 * assume it's deliberate.
1110 * Current PG guidelines say don't use them, but older texts do,
1111 * and some people insist on them whatever the guidelines say.
1114 if (results->spacedash+results->emdash.non_PG_space>
1115 results->emdash.PG_space)
1118 g_print(" --> There are %ld spaced dashes and em-dashes. "
1119 "Not reporting them.\n",
1120 results->spacedash+results->emdash.non_PG_space);
1126 /* Charset ISO_8859-1/ASCII checks for compatibility with gutcheck */
1128 /* If more than a quarter of characters are hi-bit, bug out. */
1129 if (results->binlen*4>results->totlen)
1131 g_print(" --> This file does not appear to be ASCII. "
1132 "Terminating. Best of luck with it!\n");
/* Fewer than a quarter of the characters being letters means "not text". */
1135 if (results->alphalen*4<results->totlen)
1137 g_print(" --> This file does not appear to be text. "
1138 "Terminating. Best of luck with it!\n");
1141 if (results->binlen*100>results->totlen || results->binlen>100)
1143 g_print(" --> There are a lot of foreign letters here. "
1144 "Not reporting them.\n");
1145 if (!pswit[VERBOSE_SWITCH])
1149 warnings.isDutch=FALSE;
1150 if (results->Dutchcount>50)
1152 warnings.isDutch=TRUE;
1153 g_print(" --> This looks like Dutch - "
1154 "switching off dashes and warnings for 's Middags case.\n");
1156 warnings.isFrench=FALSE;
1157 if (results->Frenchcount>50)
1159 warnings.isFrench=TRUE;
1160 g_print(" --> This looks like French - "
1161 "switching off some doublepunct.\n");
1163 if (results->firstline && results->footerline)
1164 g_print(" The PG header and footer appear to be already on.\n");
1167 if (results->firstline)
1168 g_print(" The PG header is on - no footer.\n");
1169 if (results->footerline)
1170 g_print(" The PG footer is on - no header.\n");
/* Verbose mode overrides the "too many to report" decisions made above. */
1173 if (pswit[VERBOSE_SWITCH])
1175 warnings.shortline=1;
1176 warnings.dotcomma=1;
1177 warnings.longline=1;
1183 warnings.endquote=1;
1184 g_print(" *** Verbose output is ON -- you asked for it! ***\n");
1186 if (warnings.isDutch)
/* Header and footer fewer than 100 lines apart: distrust both markers. */
1188 if (results->footerline>0 && results->firstline>0 &&
1189 results->footerline>results->firstline &&
1190 results->footerline-results->firstline<100)
1192 g_print(" --> I don't really know where this text starts. \n");
1193 g_print(" There are no reference points.\n");
1194 g_print(" I'm going to have to report the header and footer "
1196 results->firstline=0;
1204 * Look along the line, accumulate the count of quotes, and see
1205 * if this is an empty line - i.e. a line with nothing on it
1207 * If line has just spaces, period, * and/or - on it, don't
1208 * count it, since empty lines with asterisks or dashes to
1209 * separate sections are common.
1211 * Returns: TRUE if the line is empty.
/*
 * Walk <aline> one UTF-8 character at a time: hand quotation marks to
 * count_quote(), bracket characters to increment_matching(), count
 * underscores in counters->c_unders, and clear isemptyline once a
 * character outside the spacer set (space, '-', '.', asterisk, ...) is met.
 */
1213 gboolean analyse_quotes(const char *aline,struct counters *counters)
1216 /* assume the line is empty until proven otherwise */
1217 gboolean isemptyline=TRUE;
1218 const char *s=aline,*sprev,*snext;
/* tmp_err is handed to every count_quote() call to collect its error. */
1221 GError *tmp_err=NULL;
/* snext is the character following s; c is the code point at s. */
1224 snext=g_utf8_next_char(s);
1225 c=g_utf8_get_char(s);
/* Double quotes are always counted, classified through QUOTE_CLASS(). */
1226 if (CHAR_IS_DQUOTE(c))
1227 (void)count_quote(counters,c,QUOTE_CLASS(c),&tmp_err);
/* Single quotes are only analysed when SQUOTE_SWITCH is set. */
1228 else if (CHAR_IS_SQUOTE(c) && pswit[SQUOTE_SWITCH])
1233 * At start of line, it can only be a quotation mark.
1234 * Hardcode a very common exception!
1236 if (!g_str_has_prefix(snext,"tis") &&
1237 !g_str_has_prefix(snext,"Tis"))
1238 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
1240 else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
1241 g_unichar_isalpha(g_utf8_get_char(snext)))
1242 /* Do nothing! it's definitely an apostrophe, not a quote */
1244 /* it's outside a word - let's check it out */
1245 else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||
1246 g_unichar_isalpha(g_utf8_get_char(snext)))
1248 /* certainly looks like a quotation mark */
1249 if (!g_str_has_prefix(snext,"tis") &&
1250 !g_str_has_prefix(snext,"Tis"))
1251 /* hardcode a very common exception! */
/*
 * Search with g_utf8_strchr(), not strchr(): strchr() converts its
 * argument to char, so a code point such as U+203A (low byte 0x3A)
 * would wrongly be taken for ':' and the quote counted as neutral.
 */
1253 if (g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(sprev)))
1254 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
1256 (void)count_quote(counters,c,OPENING_QUOTE,&tmp_err);
1261 /* now - is it a quotation mark? */
1262 guessquote=0; /* accumulate clues */
1263 if (g_unichar_isalpha(g_utf8_get_char(sprev)))
1265 /* it follows a letter - could be either */
1267 if (g_utf8_get_char(sprev)=='s')
1269 /* looks like a plural apostrophe */
1271 if (g_utf8_get_char(snext)==CHAR_SPACE)
1275 if (innermost_quote_matches(counters,c))
1277 * Give it the benefit of some doubt,
1278 * if a squote is already open.
1284 (void)count_quote(counters,c,CLOSING_QUOTE,&tmp_err);
1287 /* no adjacent letter - it must be a quote of some kind */
1288 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
/* Print the message carried by tmp_err for this position, then release it. */
1293 if (pswit[ECHO_SWITCH])
1294 g_print("\n%s\n",aline);
1295 if (!pswit[OVERVIEW_SWITCH])
1296 g_print(" Line %ld column %ld - %s\n",
1297 linecnt,g_utf8_pointer_to_offset(aline,s)+1,tmp_err->message);
1298 g_clear_error(&tmp_err);
1300 if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
1302 isemptyline=FALSE; /* ignore lines like * * * as spacers */
1303 if (c==CHAR_UNDERSCORE)
1304 counters->c_unders++;
1305 if (c==CHAR_OPEN_SBRACK)
/*
 * A '[' at the very start of the line that begins "[Illustration:" is
 * booked under COUNTER_ILLUSTRATION, but only when both
 * matching_difference() calls return zero (presumably: nothing of either
 * kind is currently open - confirm against matching_difference()).
 */
1307 if (!matching_difference(counters,COUNTER_ILLUSTRATION) &&
1308 !matching_difference(counters,c) && s==aline &&
1309 g_str_has_prefix(s,"[Illustration:"))
1310 increment_matching(counters,COUNTER_ILLUSTRATION,TRUE);
1312 increment_matching(counters,c,TRUE);
1314 else if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK)
1315 increment_matching(counters,c,TRUE);
1316 if (c==CHAR_CLOSE_SBRACK)
/* A ']' that is the last character of the line (!*snext) may close an illustration. */
1318 if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) &&
1319 !matching_difference(counters,c) && !*snext)
1320 increment_matching(counters,COUNTER_ILLUSTRATION,FALSE);
1322 increment_matching(counters,c,FALSE);
1324 else if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK)
1325 increment_matching(counters,c,FALSE);
1333 * check_for_control_characters:
1335 * Check for invalid or questionable characters in the line
1336 * Anything above 127 is invalid for plain ASCII, and
1337 * non-printable control characters should also be flagged.
1338 * Tabs should generally not be there.
/*
 * Report a control character in <aline>: any code point below CHAR_SPACE
 * other than LF, CR and TAB. The line is echoed under ECHO_SWITCH and the
 * position is printed unless OVERVIEW_SWITCH is set.
 */
1340 void check_for_control_characters(const char *aline)
1344 for (s=aline;*s;s=g_utf8_next_char(s))
1346 c=g_utf8_get_char(s);
1347 if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
1349 if (pswit[ECHO_SWITCH])
1350 g_print("\n%s\n",aline);
1351 if (!pswit[OVERVIEW_SWITCH])
/*
 * g_utf8_pointer_to_offset() takes (start-of-string, position). With the
 * arguments the other way round it returns a negative offset, so the
 * reported column was wrong; pass aline first, as every other check does.
 */
1352 g_print(" Line %ld column %ld - Control character %u\n",
1353 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1361 * check_for_odd_characters:
1363 * Check for binary and other odd characters.
/*
 * Scan <aline> for characters that are suspicious in a plain-text book:
 * non-ASCII / non-charset characters, tabs, tildes, carats, forward
 * slashes and (in paranoid mode) asterisks. Each kind has its own e* flag
 * so that it is not warned about repeatedly on one line.
 */
1365 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
1366 gboolean isemptyline)
1368 /* Don't repeat multiple warnings on one line. */
1369 gboolean eInvalidChar=FALSE,eTab=FALSE,eTilde=FALSE;
1370 gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
1375 for (s=aline;*s;s=g_utf8_next_char(s))
1377 c=g_utf8_get_char(s);
/*
 * With warnings->bin set: flag control characters other than tab and
 * newline, and anything above 127.
 */
1378 if (warnings->bin && !eInvalidChar &&
1379 (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
1381 if (pswit[ECHO_SWITCH])
1382 g_print("\n%s\n",aline);
1383 if (!pswit[OVERVIEW_SWITCH])
/* 128-159 and anything above 255 fall outside ISO-8859's printable range. */
1384 if (c>127 && c<160 || c>255)
1385 g_print(" Line %ld column %ld - "
1386 "Non-ISO-8859 character %u\n",
1387 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1389 g_print(" Line %ld column %ld - "
1390 "Non-ASCII character %u\n",
1391 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
/* Charset-specific validation, only when a charset has been set. */
1396 if (!eInvalidChar && charset)
/* No iconv validator available: apply the Unicode checks below. */
1398 if (charset_validator==(GIConv)-1)
1400 if (!g_unichar_isdefined(c))
1402 if (pswit[ECHO_SWITCH])
1403 g_print("\n%s\n",aline);
1404 if (!pswit[OVERVIEW_SWITCH])
1405 g_print(" Line %ld column %ld - Unassigned UNICODE "
1406 "code point U+%04" G_GINT32_MODIFIER "X\n",
1407 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
/*
 * The three Unicode private-use ranges: U+E000-U+F8FF,
 * U+F0000-U+FFFFD and U+100000-U+10FFFD. The last lower bound must be
 * hexadecimal 0x100000; decimal 100000 (0x186A0) made every defined
 * character from U+186A0 upwards report as Private Use.
 */
1412 else if (c>=0xE000 && c<=0xF8FF || c>=0xF0000 && c<=0xFFFFD ||
1413 c>=0x100000 && c<=0x10FFFD)
1415 if (pswit[ECHO_SWITCH])
1416 g_print("\n%s\n",aline);
1417 if (!pswit[OVERVIEW_SWITCH])
1418 g_print(" Line %ld column %ld - Private Use "
1419 "character U+%04" G_GINT32_MODIFIER "X\n",
1420 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
/* Try converting just this one character through charset_validator. */
1428 t=g_convert_with_iconv(s,g_utf8_next_char(s)-s,
1429 charset_validator,NULL,&nb,NULL);
1434 if (pswit[ECHO_SWITCH])
1435 g_print("\n%s\n",aline);
1436 if (!pswit[OVERVIEW_SWITCH])
1437 g_print(" Line %ld column %ld - Non-%s "
1438 "character %u\n",linecnt,
1439 g_utf8_pointer_to_offset(aline,s)+1,charset,c);
1446 if (!eTab && c==CHAR_TAB)
1448 if (pswit[ECHO_SWITCH])
1449 g_print("\n%s\n",aline);
1450 if (!pswit[OVERVIEW_SWITCH])
1451 g_print(" Line %ld column %ld - Tab character?\n",
1452 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1457 if (!eTilde && c==CHAR_TILDE)
1460 * Often used by OCR software to indicate an
1461 * unrecognizable character.
1463 if (pswit[ECHO_SWITCH])
1464 g_print("\n%s\n",aline);
1465 if (!pswit[OVERVIEW_SWITCH])
1466 g_print(" Line %ld column %ld - Tilde character?\n",
1467 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1472 if (!eCarat && c==CHAR_CARAT)
1474 if (pswit[ECHO_SWITCH])
1475 g_print("\n%s\n",aline);
1476 if (!pswit[OVERVIEW_SWITCH])
1477 g_print(" Line %ld column %ld - Carat character?\n",
1478 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Forward slashes are only reported while warnings->fslash is set. */
1483 if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
1485 if (pswit[ECHO_SWITCH])
1486 g_print("\n%s\n",aline);
1487 if (!pswit[OVERVIEW_SWITCH])
1488 g_print(" Line %ld column %ld - Forward slash?\n",
1489 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1495 * Report asterisks only in paranoid mode,
1496 * since they're often deliberate.
1498 if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
1501 if (pswit[ECHO_SWITCH])
1502 g_print("\n%s\n",aline);
1503 if (!pswit[OVERVIEW_SWITCH])
1504 g_print(" Line %ld column %ld - Asterisk?\n",
1505 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1514 * check_for_long_line:
1516 * Check for line too long.
/*
 * Warn when <aline> holds more than LONGEST_PG_LINE characters (counted
 * as UTF-8 characters, not bytes).
 */
1518 void check_for_long_line(const char *aline)
1520 if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
1522 if (pswit[ECHO_SWITCH])
1523 g_print("\n%s\n",aline);
1524 if (!pswit[OVERVIEW_SWITCH])
/* The line length is printed twice: once as the column, once as the length. */
1525 g_print(" Line %ld column %ld - Long line %ld\n",
1526 linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
1533 * check_for_short_line:
1535 * Check for line too short.
1537 * This one is a bit trickier to implement: we don't want to
1538 * flag the last line of a paragraph for being short, so we
1539 * have to wait until we know that our current line is a
1540 * "normal" line, then report the _previous_ line if it was too
1541 * short. We also don't want to report indented lines like
1542 * chapter heads or formatted quotations. We therefore keep
1543 * last->len as the length of the last line examined, and
1544 * last->blen as the length of the last but one, and try to
1545 * suppress unnecessary warnings by checking that both were of
1546 * "normal" length. We keep the first character of the last
1547 * line in last->start, and if it was a space, we assume that
1548 * the formatting is deliberate. I can't figure out a way to
1549 * distinguish something like a quoted verse left-aligned or
1550 * the header or footer of a letter from a paragraph of short
1551 * lines - maybe if I examined the whole paragraph, and if the
1552 * para has less than, say, 8 lines and if all lines are short,
1553 * then just assume it's OK? Need to look at some texts to see
1554 * how often a formula like this would get the right result.
/*
 * Report the PREVIOUS line (prevline, at linecnt-1) as short once the
 * current line <aline> is known to be non-trivial.
 */
1556 void check_for_short_line(const char *aline,const struct line_properties *last)
/*
 * All of: current line longer than 1; last->len longer than 1 but below
 * SHORTEST_PG_LINE; last->blen above SHORTEST_PG_LINE; and last->start
 * not a space (a leading space is taken as deliberate formatting).
 */
1558 if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
1559 last->len<SHORTEST_PG_LINE && last->blen>1 &&
1560 last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
1562 if (pswit[ECHO_SWITCH])
1563 g_print("\n%s\n",prevline);
1564 if (!pswit[OVERVIEW_SWITCH])
/* The length of prevline serves as both the column and the reported length. */
1565 g_print(" Line %ld column %ld - Short line %ld?\n",
1566 linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
1573 * check_for_starting_punctuation:
1575 * Look for punctuation other than full ellipses at start of line.
/*
 * Warn when a non-empty <aline> starts with one of . ? ! , ; : unless
 * the line begins with the spaced ellipsis ". . .".
 */
1577 void check_for_starting_punctuation(const char *aline)
/* The *aline test keeps the NUL terminator from matching in g_utf8_strchr(). */
1579 if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
1580 !g_str_has_prefix(aline,". . ."))
1582 if (pswit[ECHO_SWITCH])
1583 g_print("\n%s\n",aline);
1584 if (!pswit[OVERVIEW_SWITCH])
1585 g_print(" Line %ld column 1 - Begins with punctuation?\n",
1595 * Find the first em-dash, return a pointer to it and set <next> to the
1596 * character following the dash.
/*
 * Locate an em-dash in <s> and store in *next the position just past it.
 * Only the *next assignments are visible here: *next is set either one
 * character past s2 or two characters past s1.
 * NOTE(review): presumably s1 marks a two-character dash and s2 a
 * single-character dash - confirm against the full body.
 */
1598 char *str_emdash(const char *s,const char **next)
1606 *next=g_utf8_next_char(s2);
1611 *next=g_utf8_next_char(g_utf8_next_char(s1));
1616 *next=g_utf8_next_char(g_utf8_next_char(s1));
1621 *next=g_utf8_next_char(s2);
1627 * check_for_spaced_emdash:
1629 * Check for spaced em-dashes.
1631 * We must check _all_ occurrences of em-dashes on the line
1632 * hence the loop - even if the first dash is OK
1633 * there may be another that's wrong later on.
/*
 * Warn about every em-dash in <aline> that has a space immediately
 * before or immediately after it.
 */
1635 void check_for_spaced_emdash(const char *aline)
1637 const char *s,*t,*next;
/* str_emdash() returns each dash in t and the resume point in next. */
1638 for (s=aline;t=str_emdash(s,&next);s=next)
/* t>aline guards the look-behind when the dash starts the line. */
1640 if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
1641 g_utf8_get_char(next)==CHAR_SPACE)
1643 if (pswit[ECHO_SWITCH])
1644 g_print("\n%s\n",aline);
1645 if (!pswit[OVERVIEW_SWITCH])
1646 g_print(" Line %ld column %ld - Spaced em-dash?\n",
1647 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1655 * check_for_spaced_dash:
1657 * Check for spaced dashes.
/*
 * Warn about a single dash with a space beside it. Only the first
 * occurrence found by strstr() is examined, and "- " is looked for only
 * when the line contains no " -" at all.
 */
1659 void check_for_spaced_dash(const char *aline)
1662 if ((s=strstr(aline," -")))
/* Skip " --": the character after the dash must not be another dash. */
1664 if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
1666 if (pswit[ECHO_SWITCH])
1667 g_print("\n%s\n",aline);
1668 if (!pswit[OVERVIEW_SWITCH])
1669 g_print(" Line %ld column %ld - Spaced dash?\n",
1670 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1675 else if ((s=strstr(aline,"- ")))
/* Skip "-- ": the dash must not be preceded by another dash. */
1677 if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
1679 if (pswit[ECHO_SWITCH])
1680 g_print("\n%s\n",aline);
1681 if (!pswit[OVERVIEW_SWITCH])
1682 g_print(" Line %ld column %ld - Spaced dash?\n",
1683 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1691 * check_for_unmarked_paragraphs:
1693 * Check for unmarked paragraphs indicated by separate speakers.
1695 * May well be false positive:
1696 * "Bravo!" "Wonderful!" called the crowd.
1697 * but useful all the same.
/*
 * Query a possible missing paragraph break where a closing double quote
 * is followed, after a space, by another double quote on the same line.
 */
1699 void check_for_unmarked_paragraphs(const char *aline)
/*
 * NOTE(review): both searches below show the same literal here; confirm
 * whether one of them is meant to differ in the amount of spacing.
 */
1702 s=strstr(aline,"\" \"");
1704 s=strstr(aline,"\" \"");
1707 if (pswit[ECHO_SWITCH])
1708 g_print("\n%s\n",aline);
1709 if (!pswit[OVERVIEW_SWITCH])
1710 g_print(" Line %ld column %ld - "
1711 "Query missing paragraph break?\n",
1712 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1719 * check_for_jeebies:
1721 * Check for "to he" and other easy h/b errors.
1723 * This is a very inadequate effort on the h/b problem,
1724 * but the phrase "to he" is always an error, whereas "to
1725 * be" is quite common.
1726 * Similarly, '"Quiet!", be said.' is a non-be error
1727 * "to he" is _not_ always an error!:
1728 * "Where they went to he couldn't say."
1729 * Another false positive:
1730 * What would "Cinderella" be without the . . .
1731 * and another: "If he wants to he can see for himself."
/*
 * Look in <aline> for fixed phrases that usually indicate an h/b
 * confusion. Three groups are searched with strstr(), each with its own
 * query message: he/be, had/bad and hut/but. The searches are
 * case-sensitive and most patterns need a space on both sides.
 */
1733 void check_for_jeebies(const char *aline)
/* Group 1: he/be confusions. */
1736 s=strstr(aline," be could ");
1738 s=strstr(aline," be would ");
1740 s=strstr(aline," was be ");
1742 s=strstr(aline," be is ");
1744 s=strstr(aline," is be ");
1746 s=strstr(aline,"\", be ");
/*
 * NOTE(review): the next two searches show the same literal here; confirm
 * whether one of them is meant to differ in the amount of spacing.
 */
1748 s=strstr(aline,"\" be ");
1750 s=strstr(aline,"\" be ");
1752 s=strstr(aline," to he ");
1755 if (pswit[ECHO_SWITCH])
1756 g_print("\n%s\n",aline);
1757 if (!pswit[OVERVIEW_SWITCH])
1758 g_print(" Line %ld column %ld - Query he/be error?\n",
1759 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Group 2: had/bad confusions. */
1763 s=strstr(aline," the had ");
1765 s=strstr(aline," a had ");
1767 s=strstr(aline," they bad ");
1769 s=strstr(aline," she bad ");
1771 s=strstr(aline," he bad ");
1773 s=strstr(aline," you bad ");
1775 s=strstr(aline," i bad ");
1778 if (pswit[ECHO_SWITCH])
1779 g_print("\n%s\n",aline);
1780 if (!pswit[OVERVIEW_SWITCH])
1781 g_print(" Line %ld column %ld - Query had/bad error?\n",
1782 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Group 3: "hut" after a semicolon or comma, probably meant as "but". */
1786 s=strstr(aline,"; hut ");
1788 s=strstr(aline,", hut ");
1791 if (pswit[ECHO_SWITCH])
1792 g_print("\n%s\n",aline);
1793 if (!pswit[OVERVIEW_SWITCH])
1794 g_print(" Line %ld column %ld - Query hut/but error?\n",
1795 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1802 * check_for_mta_from:
1804 * Special case - angled bracket in front of "From" placed there by an
1805 * MTA when sending an e-mail.
/*
 * Query the text ">From" anywhere in <aline>; the column reported is that
 * of the '>' character.
 */
1807 void check_for_mta_from(const char *aline)
1810 s=strstr(aline,">From");
1813 if (pswit[ECHO_SWITCH])
1814 g_print("\n%s\n",aline);
1815 if (!pswit[OVERVIEW_SWITCH])
1816 g_print(" Line %ld column %ld - "
1817 "Query angled bracket with From\n",
1818 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1825 * check_for_orphan_character:
1827 * Check for a single character line -
1828 * often an overflow from bad wrapping.
/*
 * Query a line that consists of exactly one character, except when that
 * character is a digit or one of the letters I, V, X, L.
 */
1830 void check_for_orphan_character(const char *aline)
1833 c=g_utf8_get_char(aline);
/* Non-empty line whose second character is already the terminator. */
1834 if (c && !*g_utf8_next_char(aline))
1836 if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
1837 ; /* Nothing - ignore numerals alone on a line. */
1840 if (pswit[ECHO_SWITCH])
1841 g_print("\n%s\n",aline);
1842 if (!pswit[OVERVIEW_SWITCH])
1843 g_print(" Line %ld column 1 - Query single character line\n",
1852 * check_for_pling_scanno:
1854 * Check for I" - often should be !
/*
 * Query the sequence space, capital I, double quote in <aline>: the I is
 * often a misread exclamation mark.
 */
1856 void check_for_pling_scanno(const char *aline)
/* s points at the space that precedes the I. */
1859 s=strstr(aline," I\"");
1862 if (pswit[ECHO_SWITCH])
1863 g_print("\n%s\n",aline);
1864 if (!pswit[OVERVIEW_SWITCH])
/*
 * NOTE(review): unlike most sibling checks, no +1 is added to the offset
 * here - confirm that this column value is intended.
 */
1865 g_print(" Line %ld column %ld - Query I=exclamation mark?\n",
1866 linecnt,g_utf8_pointer_to_offset(aline,s));
1873 * check_for_extra_period:
1875 * Check for period without a capital letter. Cut-down from gutspell.
1876 * Only works when it happens on a single line.
/*
 * In paranoid mode, query a period that is followed by a lowercase word.
 * The word before the period is extracted as testword, compared against
 * the abbrev[] list and several other tests, and reported as
 * "Extra period?" subject to a lookup in the qperiod tree.
 */
1878 void check_for_extra_period(const char *aline,const struct warnings *warnings)
1880 const char *s,*t,*s1,*sprev;
1885 gunichar c,nc,pc,*decomposition;
1886 if (pswit[PARANOID_SWITCH])
/* Visit each ". " in the line; t is advanced inside the loop body. */
1888 for (t=aline;t=strstr(t,". ");)
1892 t=g_utf8_next_char(t);
1893 /* start of line punctuation is handled elsewhere */
/* A period that does not directly follow a letter is skipped. */
1896 if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
1898 t=g_utf8_next_char(t);
1901 if (warnings->isDutch)
1903 /* For Frank & Jeroen -- 's Middags case */
1904 gunichar c2,c3,c4,c5;
/* c2..c5 are the characters at offsets 2 to 5 counted from the period. */
1905 c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
1906 c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
1907 c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
1908 c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
/* Pattern: apostrophe, lowercase letter, space, uppercase letter. */
1909 if (CHAR_IS_APOSTROPHE(c2) &&
1910 g_unichar_islower(c3) && c4==CHAR_SPACE &&
1911 g_unichar_isupper(c5))
1913 t=g_utf8_next_char(t);
/* Find the first letter or digit after the ". ". */
1917 s1=g_utf8_next_char(g_utf8_next_char(t));
1918 while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
1919 !g_unichar_isdigit(g_utf8_get_char(s1)))
1920 s1=g_utf8_next_char(s1);
1921 if (g_unichar_islower(g_utf8_get_char(s1)))
1923 /* we have something to investigate */
1925 /* so let's go back and find out */
/* nc, c, pc: the period, the character before it, and the one before that. */
1926 nc=g_utf8_get_char(t);
1927 s1=g_utf8_prev_char(t);
1928 c=g_utf8_get_char(s1);
1929 sprev=g_utf8_prev_char(s1);
1930 pc=g_utf8_get_char(sprev);
/*
 * Word characters while stepping back: letters, digits, or an
 * apostrophe that sits between two letters.
 */
1932 (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||
1933 g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&
1934 g_unichar_isalpha(nc)))
1939 sprev=g_utf8_prev_char(s1);
1940 pc=g_utf8_get_char(sprev);
1942 s1=g_utf8_next_char(s1);
/* testword is a freshly allocated copy of the word starting at s1. */
1945 testword=g_strndup(s1,s-s1);
1947 testword=g_strdup(s1);
/* abbrev[] is terminated by an empty string. */
1948 for (i=0;*abbrev[i];i++)
1949 if (!strcmp(testword,abbrev[i]))
1951 if (g_unichar_isdigit(g_utf8_get_char(testword)))
/* Single-character word. */
1953 if (!*g_utf8_next_char(testword))
1955 if (isroman(testword))
/*
 * Look for a vowel, testing the first code point of each character's
 * canonical decomposition so that accented vowels are recognised.
 */
1960 for (s=testword;*s;s=g_utf8_next_char(s))
1962 decomposition=g_unicode_canonical_decomposition(
1963 g_utf8_get_char(s),&len);
1964 if (g_utf8_strchr("aeiou",-1,decomposition[0]))
1966 g_free(decomposition);
/* Report when verbose, or when testword is not yet in the qperiod tree. */
1970 (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
1972 g_tree_insert(qperiod,g_strdup(testword),
1973 GINT_TO_POINTER(1));
1974 if (pswit[ECHO_SWITCH])
1975 g_print("\n%s\n",aline);
1976 if (!pswit[OVERVIEW_SWITCH])
1977 g_print(" Line %ld column %ld - Extra period?\n",
1978 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1984 t=g_utf8_next_char(t);
1990 * check_for_following_punctuation:
1992 * Check for words usually not followed by punctuation.
/*
 * With TYPO_SWITCH set, query words from the nocomma[] list that are
 * followed by , ; or : and words from the noperiod[] list that are
 * followed by . or !
 */
1994 void check_for_following_punctuation(const char *aline)
1997 const char *s,*wordstart;
2000 if (pswit[TYPO_SWITCH])
/* getaword() is called with a NULL line here; s is passed for it to update. */
2005 t=getaword(NULL,&s);
/* The list comparisons are made against a lowercased copy of the word. */
2011 inword=g_utf8_strdown(t,-1);
/* nocomma[] and noperiod[] are terminated by an empty string. */
2013 for (i=0;*nocomma[i];i++)
2014 if (!strcmp(inword,nocomma[i]))
/* c is the character at s, tested as the punctuation after the word. */
2016 c=g_utf8_get_char(s);
2017 if (c==',' || c==';' || c==':')
2019 if (pswit[ECHO_SWITCH])
2020 g_print("\n%s\n",aline);
2021 if (!pswit[OVERVIEW_SWITCH])
2022 g_print(" Line %ld column %ld - "
2023 "Query punctuation after %s?\n",
2024 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
2030 for (i=0;*noperiod[i];i++)
2031 if (!strcmp(inword,noperiod[i]))
2033 c=g_utf8_get_char(s);
2034 if (c=='.' || c=='!')
2036 if (pswit[ECHO_SWITCH])
2037 g_print("\n%s\n",aline);
2038 if (!pswit[OVERVIEW_SWITCH])
2039 g_print(" Line %ld column %ld - "
2040 "Query punctuation after %s?\n",
2041 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
2055 * Check for commonly mistyped words, and digits like 0 for O in a word.
2056 * Note that somewhat confusingly, this is also where we call getaword()
2057 * with a non-NULL line so that it will issue warnings.
/*
 * Take <aline> word by word via getaword() and query: words mixing digits
 * and letters, words matching typo/scanno heuristics (TYPO_SWITCH), words
 * in the user's typo list (usertypo), and - in paranoid mode with
 * warnings->digit set - a standalone 0 or 1.
 */
2059 void check_for_typos(const char *aline,struct warnings *warnings)
2061 const char *s,*t,*nt,*wordstart;
2063 gunichar *decomposition;
2065 int i,vowel,consonant,*dupcnt;
2066 gboolean isdup,istypo,alower;
2069 gsize decomposition_len;
2073 inword=getaword(aline,&s);
2077 continue; /* don't bother with empty lines */
2079 if (mixdigit(inword))
2081 if (pswit[ECHO_SWITCH])
2082 g_print("\n%s\n",aline);
2083 if (!pswit[OVERVIEW_SWITCH])
2084 g_print(" Line %ld column %ld - Query digit in %s\n",
2085 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
2090 * Put the word through a series of tests for likely typos and OCR
2093 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
/* First pass over the word: look for an uppercase letter after a lowercase one. */
2097 for (t=inword;*t;t=g_utf8_next_char(t))
2099 c=g_utf8_get_char(t);
2100 nt=g_utf8_next_char(t);
2101 /* lowercase for testing */
2102 if (g_unichar_islower(c))
2104 if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
2107 * We have an uppercase mid-word. However, there are
2109 * Mac and Mc like McGill
2110 * French contractions like l'Abbe
2112 offset=g_utf8_pointer_to_offset(inword,t);
2114 pc=g_utf8_get_char(g_utf8_prev_char(t));
/* Tests for "mc" at offset 2, "mac" at offset 3, or a preceding apostrophe. */
2117 if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
2118 offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
2119 g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
2120 CHAR_IS_APOSTROPHE(pc))
/* All list comparisons below use the case-folded copy of the word. */
2126 testword=g_utf8_casefold(inword,-1);
2128 if (pswit[TYPO_SWITCH])
2131 * Check for certain unlikely two-letter combinations at word
2134 len=g_utf8_strlen(testword,-1);
/* nostart[] and noend[] are terminated by an empty string. */
2137 for (i=0;*nostart[i];i++)
2138 if (g_str_has_prefix(testword,nostart[i]))
2140 for (i=0;*noend[i];i++)
2141 if (g_str_has_suffix(testword,noend[i]))
2144 /* ght is common, gbt never. Like that. */
2145 if (strstr(testword,"cb"))
2147 if (strstr(testword,"gbt"))
2149 if (strstr(testword,"pbt"))
2151 if (strstr(testword,"tbs"))
2153 if (strstr(testword,"mrn"))
2155 if (strstr(testword,"ahle"))
2157 if (strstr(testword,"ihle"))
2160 * "TBE" does happen - like HEARTBEAT - but uncommon.
2161 * Also "TBI" - frostbite, outbid - but uncommon.
2162 * Similarly "ii" like Hawaii, or Pompeii, and in Roman
2163 * numerals, but "ii" is a common scanno.
2165 if (strstr(testword,"tbi"))
2167 if (strstr(testword,"tbe"))
2169 if (strstr(testword,"ii"))
2172 * Check for no vowels or no consonants.
2173 * If none, flag a typo.
2175 if (!istypo && len>1)
2178 for (t=testword;*t;t=g_utf8_next_char(t))
2180 c=g_utf8_get_char(t);
/* Vowel test uses the first code point of the canonical decomposition. */
2182 g_unicode_canonical_decomposition(c,&decomposition_len);
2183 if (c=='y' || g_unichar_isdigit(c))
2185 /* Yah, this is loose. */
2189 else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
2193 g_free(decomposition);
2195 if (!vowel || !consonant)
2199 * Now exclude the word from being reported if it's in
2202 for (i=0;*okword[i];i++)
2203 if (!strcmp(testword,okword[i]))
2206 * What looks like a typo may be a Roman numeral.
2209 if (istypo && isroman(testword))
2211 /* Check the manual list of typos. */
2213 for (i=0;*typo[i];i++)
2214 if (!strcmp(testword,typo[i]))
2217 * Check lowercase s, l, i and m - special cases.
2218 * "j" - often a semi-colon gone wrong.
2219 * "d" for a missing apostrophe - he d
2222 if (!istypo && len==1 &&
2223 g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
/*
 * qword maps each queried word to a heap-allocated int (dupcnt). A word
 * found there counts as a duplicate unless VERBOSE_SWITCH is set;
 * otherwise a zeroed counter is inserted under a copy of testword.
 */
2227 dupcnt=g_tree_lookup(qword,testword);
2231 isdup=!pswit[VERBOSE_SWITCH];
2235 dupcnt=g_new0(int,1);
2236 g_tree_insert(qword,g_strdup(testword),dupcnt);
2241 if (pswit[ECHO_SWITCH])
2242 g_print("\n%s\n",aline);
2243 if (!pswit[OVERVIEW_SWITCH])
2245 g_print(" Line %ld column %ld - Query word %s",
2246 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
2248 if (!pswit[VERBOSE_SWITCH])
2249 g_print(" - not reporting duplicates");
2257 /* check the user's list of typos */
2258 if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
2260 if (pswit[ECHO_SWITCH])
2261 g_print("\n%s\n",aline);
2262 if (!pswit[OVERVIEW_SWITCH])
/*
 * NOTE(review): this report and the standalone-digit report below add 2
 * to the offset where the reports above add 1 - confirm this is intended.
 */
2263 g_print(" Line %ld column %ld - Query possible scanno %s\n",
2264 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
2266 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
2268 if (pswit[PARANOID_SWITCH] && warnings->digit)
2270 /* In paranoid mode, query all 0 and 1 standing alone. */
2271 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
2273 if (pswit[ECHO_SWITCH])
2274 g_print("\n%s\n",aline);
2275 if (!pswit[OVERVIEW_SWITCH])
2276 g_print(" Line %ld column %ld - Query standalone %s\n",
2277 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
2288 * check_for_misspaced_punctuation:
2290 * Look for added or missing spaces around punctuation and quotes.
2291 * If there is a punctuation character like ! with no space on
2292 * either side, suspect a missing!space. If there are spaces on
2293 * both sides , assume a typo. If we see a double quote with no
2294 * space or punctuation on either side of it, assume unspaced
2295 * quotes "like"this.
/*
 * Make several independent passes over <aline>, each looking at a
 * character together with its neighbours (pc = previous, c = current,
 * nc = next, n2c = the one after next):
 *   1. punctuation with a letter directly after it ("Missing space?") or
 *      with a space before and a space or end of line after it
 *      ("Spaced punctuation?");
 *   2. ? ! , ; : preceded by a space but not followed by one;
 *   3. a period preceded by a space and followed by a letter;
 *   4. double quotes with no separating character on either side;
 *   5. double-quote parity (parities->dquote) against the next character;
 *   6. a double quote opening the line followed by closing punctuation;
 *   7. with SQUOTE_SWITCH, the same parity check for single quotes
 *      (parities->squote).
 */
2297 void check_for_misspaced_punctuation(const char *aline,
2298 struct parities *parities,gboolean isemptyline)
2300 gboolean isacro,isellipsis;
2302 gunichar c,nc,pc,n2c;
/* Pass 1. The loop starts at the second character and ends when nc is NUL. */
2304 c=g_utf8_get_char(aline);
2305 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2306 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2310 nc=g_utf8_get_char(g_utf8_next_char(s));
2311 /* For each character in the line after the first. */
2312 if (g_utf8_strchr(".?!,;:_",-1,c)) /* if it's punctuation */
2314 /* we need to suppress warnings for acronyms like M.D. */
2316 /* we need to suppress warnings for ellipsis . . . */
2319 * If there are letters on both sides of it or
2320 * if it's strict punctuation followed by an alpha.
2322 if (c!='_' && g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
2323 g_utf8_strchr("?!,;:",-1,c)))
/* Look two characters back for a '.'; the offset test keeps the look-behind inside the line. */
2327 if (g_utf8_pointer_to_offset(aline,s)>2 &&
2328 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
2330 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
2336 if (pswit[ECHO_SWITCH])
2337 g_print("\n%s\n",aline);
2338 if (!pswit[OVERVIEW_SWITCH])
2339 g_print(" Line %ld column %ld - Missing space?\n",
2340 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2345 if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
2348 * If there are spaces on both sides,
2349 * or space before and end of line.
2353 if (g_utf8_pointer_to_offset(aline,s)>2 &&
2354 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
2356 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
2360 if (!isemptyline && !isellipsis)
2362 if (pswit[ECHO_SWITCH])
2363 g_print("\n%s\n",aline);
2364 if (!pswit[OVERVIEW_SWITCH])
2365 g_print(" Line %ld column %ld - "
2366 "Spaced punctuation?\n",linecnt,
2367 g_utf8_pointer_to_offset(aline,s)+1);
2374 /* Split out the characters that CANNOT be preceded by space. */
2375 c=g_utf8_get_char(aline);
2376 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2377 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2381 nc=g_utf8_get_char(g_utf8_next_char(s));
2382 /* for each character in the line after the first */
2383 if (g_utf8_strchr("?!,;:",-1,c))
2385 /* if it's punctuation that _cannot_ have a space before it */
2386 if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
2389 * If nc DOES == space,
2390 * it was already reported just above.
2392 if (pswit[ECHO_SWITCH])
2393 g_print("\n%s\n",aline);
2394 if (!pswit[OVERVIEW_SWITCH])
2395 g_print(" Line %ld column %ld - Spaced punctuation?\n",
2396 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2403 * Special case " .X" where X is any alpha.
2404 * This plugs a hole in the acronym code above.
2405 * Inelegant, but maintainable.
2407 c=g_utf8_get_char(aline);
2408 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2409 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2413 nc=g_utf8_get_char(g_utf8_next_char(s));
2414 /* for each character in the line after the first */
2417 /* if it's a period */
2418 if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
2421 * If the period follows a space and
2422 * is followed by a letter.
2424 if (pswit[ECHO_SWITCH])
2425 g_print("\n%s\n",aline);
2426 if (!pswit[OVERVIEW_SWITCH])
2427 g_print(" Line %ld column %ld - Spaced punctuation?\n",
2428 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Pass 4: double quotes jammed against their neighbours. */
2434 c=g_utf8_get_char(aline);
2435 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2436 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2440 nc=g_utf8_get_char(g_utf8_next_char(s));
2441 /* for each character in the line after the first */
2442 if (CHAR_IS_DQUOTE(c))
/*
 * Either neither neighbour is a separator from the first list (and the
 * quote is not last on the line), or the quote is followed by a letter
 * while the previous character is not in the second list.
 */
2444 if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
2445 !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
2446 !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
2448 if (pswit[ECHO_SWITCH])
2449 g_print("\n%s\n",aline);
2450 if (!pswit[OVERVIEW_SWITCH])
2451 g_print(" Line %ld column %ld - Unspaced quotes?\n",
2452 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2458 /* Check parity of quotes. */
2459 nc=g_utf8_get_char(aline);
2460 for (s=aline;*s;s=g_utf8_next_char(s))
2463 nc=g_utf8_get_char(g_utf8_next_char(s));
2464 if (CHAR_IS_DQUOTE(c))
/* parities->dquote is flipped here and its new value is used as the parity. */
2468 parities->dquote=!parities->dquote;
2469 parity=parities->dquote;
2471 else if (c==CHAR_LD_QUOTE)
/* This test accepts only punctuation or a space after the quote. */
2478 if (!g_utf8_strchr("_-.'`‘’/,;:!?)]} ",-1,nc))
2480 if (pswit[ECHO_SWITCH])
2481 g_print("\n%s\n",aline);
2482 if (!pswit[OVERVIEW_SWITCH])
2483 g_print(" Line %ld column %ld - "
2484 "Wrongspaced quotes?\n",
2485 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* This test accepts a letter, digit or listed opening character; end of line is rejected. */
2493 if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
2494 !g_utf8_strchr("_-/.'`‘’([{$",-1,nc) || !nc)
2496 if (pswit[ECHO_SWITCH])
2497 g_print("\n%s\n",aline);
2498 if (!pswit[OVERVIEW_SWITCH])
2499 g_print(" Line %ld column %ld - "
2500 "Wrongspaced quotes?\n",
2501 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Pass 6: a double quote as the first character of the line. */
2508 c=g_utf8_get_char(aline);
2509 if (CHAR_IS_DQUOTE(c))
2511 if (g_utf8_strchr(",;:!?)]} ",-1,
2512 g_utf8_get_char(g_utf8_next_char(aline))))
2514 if (pswit[ECHO_SWITCH])
2515 g_print("\n%s\n",aline);
2516 if (!pswit[OVERVIEW_SWITCH])
2517 g_print(" Line %ld column 1 - Wrongspaced quotes?\n",
2523 if (pswit[SQUOTE_SWITCH])
2525 nc=g_utf8_get_char(aline);
2526 for (s=aline;*s;s=g_utf8_next_char(s))
2529 nc=g_utf8_get_char(g_utf8_next_char(s));
/*
 * A single quote counts only when it is not between two letters: at the
 * start of the line, after a non-letter, or before a non-letter.
 */
2530 if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&
2531 !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
2532 !g_unichar_isalpha(nc)))
2534 parities->squote=!parities->squote;
2535 if (!parities->squote)
2538 if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
2540 if (pswit[ECHO_SWITCH])
2541 g_print("\n%s\n",aline);
2542 if (!pswit[OVERVIEW_SWITCH])
2543 g_print(" Line %ld column %ld - "
2544 "Wrongspaced singlequotes?\n",
2545 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2553 if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
2554 !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
2556 if (pswit[ECHO_SWITCH])
2557 g_print("\n%s\n",aline);
2558 if (!pswit[OVERVIEW_SWITCH])
2559 g_print(" Line %ld column %ld - "
2560 "Wrongspaced singlequotes?\n",
2561 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2574 * Given a position p within a string str, determine whether it follows the
2575 * given word. This is roughly equivalent to the regular expression (?<=\bword)
2576 * but has different boundary conditions.
/*
 * Test whether the text immediately before position <p> in <str> is
 * <word>, with <word> starting either at the beginning of <str> or after
 * a non-alphabetic character.
 */
2578 static gboolean str_follows_word(const char *str,const char *p,const char *word)
/* len is a byte count, used for pointer arithmetic on p. */
2580 int len=strlen(word);
/* The bytes ending at p must spell <word>. */
2583 else if (!g_str_has_prefix(p-len,word))
/* <word> begins exactly at the start of <str>. */
2585 else if (p-len==str)
2588 /* Using non-alpha as a word boundary. See UAX #29 for a better way. */
2589 return !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(p-len)));
2593 * check_for_double_punctuation:
2595 * Look for double punctuation like ,. or ,,
2596 * Thanks to DW for the suggestion!
2597 * In books with references, ".," and ".;" are common
2598 * e.g. "etc., etc.," and vol. 1.; vol 3.;
2599 * OTOH, from my initial tests, there are also fairly
2600 * common errors. What to do? Make these cases paranoid?
2601 * ".," is the most common, so warnings->dotcomma is used
2602 * to suppress detailed reporting if it occurs often.
2603 * Indeed, ".," is so common after "etc" or "&c" that
2604 * we don't warn on these cases at all.
/*
 * Query two adjacent punctuation characters from the set . ? ! , ; :
 * in <aline>, with exceptions for French ellipsis combinations, for a
 * repeated . ! or ?, and for ".," after "etc" or "&c" or when
 * warnings->dotcomma is off.
 */
2606 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
2611 nc=g_utf8_get_char(aline);
2612 for (s=aline;*s;s=g_utf8_next_char(s))
2615 nc=g_utf8_get_char(g_utf8_next_char(s));
2616 /* for each punctuation character in the line */
/* The c && nc tests keep a NUL from matching the terminator in g_utf8_strchr(). */
2617 if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
2618 g_utf8_strchr(".?!,;:",-1,nc))
2620 /* followed by punctuation, it's a query, unless . . . */
/* In French text an ellipsis joined to , ; : ! or ? on either side is accepted. */
2622 if (warnings->isFrench &&
2623 (g_str_has_prefix(s,",...") || g_str_has_prefix(s,"...,") ||
2624 g_str_has_prefix(s,";...") || g_str_has_prefix(s,"...;") ||
2625 g_str_has_prefix(s,":...") || g_str_has_prefix(s,"...:") ||
2626 g_str_has_prefix(s,"!...") || g_str_has_prefix(s,"...!") ||
2627 g_str_has_prefix(s,"?...") || g_str_has_prefix(s,"...?")))
2630 nc=g_utf8_get_char(g_utf8_next_char(s));
2633 else if (c==nc && (c=='.' || c=='?' || c=='!'))
2635 /* do nothing for .. !! and ?? which can be legit */
2638 else if (c=='.' && nc==',')
/* ".," is let through when dotcomma is off or the period follows "etc" or "&c". */
2640 if (!warnings->dotcomma || str_follows_word(aline,s,"etc") ||
2641 str_follows_word(aline,s,"&c"))
2646 if (pswit[ECHO_SWITCH])
2647 g_print("\n%s\n",aline);
2648 if (!pswit[OVERVIEW_SWITCH])
2649 g_print(" Line %ld column %ld - Double punctuation?\n",
2650 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2659 * check_for_spaced_quotes:
/*
 * check_for_spaced_quotes() - query quote marks with a space on both sides.
 *
 *   aline: the line to scan (UTF-8).
 *
 * Two passes are made over the line. The first searches for the three-byte
 * sequence space, ASCII double quote, space. The second builds the same
 * space/quote/space pattern in a GString for every entry of single_quotes
 * and searches for each in turn. Every hit is reported at the 1-based
 * character column of the leading space, and the search resumes two
 * characters after the start of the hit. The GString is freed at the end.
 * NOTE(review): where s is (re)initialised before each search is on lines
 * not shown here.
 */
2661 void check_for_spaced_quotes(const char *aline)
2665 const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,
2669 while ((t=strstr(s," \" ")))
2671 if (pswit[ECHO_SWITCH])
2672 g_print("\n%s\n",aline);
2673 if (!pswit[OVERVIEW_SWITCH])
2674 g_print(" Line %ld column %ld - Spaced doublequote?\n",
2675 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2678 s=g_utf8_next_char(g_utf8_next_char(t));
2680 pattern=g_string_new(NULL);
2681 for(i=0;i<G_N_ELEMENTS(single_quotes);i++)
/* rebuild the pattern as: space, this quote character, space */
2683 g_string_assign(pattern," ");
2684 g_string_append_unichar(pattern,single_quotes[i]);
2685 g_string_append_c(pattern,' ');
2687 while ((t=strstr(s,pattern->str)))
2689 if (pswit[ECHO_SWITCH])
2690 g_print("\n%s\n",aline);
2691 if (!pswit[OVERVIEW_SWITCH])
2692 g_print(" Line %ld column %ld - Spaced singlequote?\n",
2693 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2696 s=g_utf8_next_char(g_utf8_next_char(t));
2699 g_string_free(pattern,TRUE);
2703 * check_for_miscased_genative:
2705 * Check special case of 'S instead of 's at end of word.
/*
 * check_for_miscased_genative() - query an apostrophe followed by a
 * capital S after a lower-case letter.
 *
 *   aline: the line to scan (UTF-8).
 *
 * The loop keeps a three-character window: pc, c and nc. A report is made
 * when c is an apostrophe (CHAR_IS_APOSTROPHE), nc is 'S' and pc is lower
 * case. The reported column is the character offset of s plus 2.
 * The loop ends when nc becomes 0, so an empty line is never entered
 * (nc is forced to 0 when the first character is the terminator).
 * NOTE(review): the statements that shift pc and c along are on lines
 * not shown here.
 */
2707 void check_for_miscased_genative(const char *aline)
2713 c=g_utf8_get_char(aline);
2714 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2715 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2719 nc=g_utf8_get_char(g_utf8_next_char(s));
2720 if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))
2722 if (pswit[ECHO_SWITCH])
2723 g_print("\n%s\n",aline);
2724 if (!pswit[OVERVIEW_SWITCH])
2725 g_print(" Line %ld column %ld - Capital \"S\"?\n",
2726 linecnt,g_utf8_pointer_to_offset(aline,s)+2);
2734 * check_end_of_line:
2736 * Now check special cases - start and end of line -
2737 * for single and double quotes. Start is sometimes [sic]
2738 * but better to query it anyway.
2739 * While we're here, check for dash at end of line.
/*
 * check_end_of_line() - start-of-line and end-of-line special cases.
 *
 *   aline:    the line to check (UTF-8).
 *   warnings: its hyphen field gates the end-of-line dash check.
 *
 * Three checks are visible:
 *  1. Lines longer than one character: a double or single quote as the last
 *     character with a space before it is reported as "Spaced quote?" at a
 *     column equal to the line length in characters.
 *  2. A single quote as the first character followed by a space is reported
 *     at column 1.
 *  3. Only with PARANOID_SWITCH and warnings->hyphen set: trailing
 *     characters at or below CHAR_SPACE are skipped backwards, then a '-'
 *     that is not preceded by another '-' is reported as
 *     "Hyphen at end of line?".
 * NOTE(review): the hyphen report prints the character offset of s without
 * adding 1, unlike most other reports in this file; confirm that is the
 * intended column.
 */
2741 void check_end_of_line(const char *aline,struct warnings *warnings)
2746 lbytes=strlen(aline);
2747 if (g_utf8_strlen(aline,lbytes)>1)
/* c1 is the last character of the line, c2 the one before it */
2749 s=g_utf8_prev_char(aline+lbytes);
2750 c1=g_utf8_get_char(s);
2751 c2=g_utf8_get_char(g_utf8_prev_char(s));
2752 if ((CHAR_IS_DQUOTE(c1) || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)
2754 if (pswit[ECHO_SWITCH])
2755 g_print("\n%s\n",aline);
2756 if (!pswit[OVERVIEW_SWITCH])
2757 g_print(" Line %ld column %ld - Spaced quote?\n",linecnt,
2758 g_utf8_strlen(aline,lbytes));
/* c1 and c2 are reused for the first and second characters of the line */
2762 c1=g_utf8_get_char(aline);
2763 c2=g_utf8_get_char(g_utf8_next_char(aline));
2764 if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)
2766 if (pswit[ECHO_SWITCH])
2767 g_print("\n%s\n",aline);
2768 if (!pswit[OVERVIEW_SWITCH])
2769 g_print(" Line %ld column 1 - Spaced quote?\n",linecnt);
2774 * Dash at end of line may well be legit - paranoid mode only
2775 * and don't report em-dash at line-end.
2777 if (pswit[PARANOID_SWITCH] && warnings->hyphen)
2779 for (s=g_utf8_prev_char(aline+lbytes);
2780 s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
2782 if (g_utf8_get_char(s)=='-' &&
2783 g_utf8_get_char(g_utf8_prev_char(s))!='-')
2785 if (pswit[ECHO_SWITCH])
2786 g_print("\n%s\n",aline);
2787 if (!pswit[OVERVIEW_SWITCH])
2788 g_print(" Line %ld column %ld - "
2789 "Hyphen at end of line?\n",
2790 linecnt,g_utf8_pointer_to_offset(aline,s));
2797 * check_for_unspaced_bracket:
2799 * Brackets are often unspaced, but shouldn't be surrounded by alpha.
2800 * If so, suspect a scanno like "a]most".
/*
 * check_for_unspaced_bracket() - query a bracket wedged between letters.
 *
 *   aline: the line to scan (UTF-8).
 *
 * The loop keeps a three-character window: pc, c and nc. A report is made
 * when c is one of { [ ( ) ] } and both pc and nc are alphabetic. Because
 * the loop ends when nc becomes 0, the last character of the line is never
 * tested as c. The reported column is the character offset of s with
 * nothing added.
 * NOTE(review): the statements that shift pc and c along are on lines
 * not shown here.
 */
2802 void check_for_unspaced_bracket(const char *aline)
2806 c=g_utf8_get_char(aline);
2807 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2808 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2812 nc=g_utf8_get_char(g_utf8_next_char(s));
2815 /* for each bracket character in the line except 1st & last */
2816 if (g_utf8_strchr("{[()]}",-1,c) &&
2817 g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
2819 if (pswit[ECHO_SWITCH])
2820 g_print("\n%s\n",aline);
2821 if (!pswit[OVERVIEW_SWITCH])
2822 g_print(" Line %ld column %ld - Unspaced bracket?\n",
2823 linecnt,g_utf8_pointer_to_offset(aline,s));
2831 * check_for_unpunctuated_endquote:
/*
 * check_for_unpunctuated_endquote() - query a closing double quote that
 * directly follows a letter.
 *
 *   aline: the line to scan (UTF-8).
 *
 * For each character c after the first, qc is the quote class of c when c
 * is a double quote and INVALID_QUOTE otherwise. A report is made when qc
 * is CLOSING_QUOTE or NEUTRAL_QUOTE and the preceding character pc is
 * alphabetic. The reported column is the character offset of s with
 * nothing added. The caller visible in this file (procfile) runs this
 * check only when warnings->endquote is set.
 * NOTE(review): the statements that shift pc and c along are on lines
 * not shown here.
 */
2833 void check_for_unpunctuated_endquote(const char *aline)
2838 c=g_utf8_get_char(aline);
2839 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2840 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2844 qc=CHAR_IS_DQUOTE(c)?QUOTE_CLASS(c):INVALID_QUOTE;
2845 nc=g_utf8_get_char(g_utf8_next_char(s));
2846 /* for each character in the line except 1st */
2847 if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) && g_unichar_isalpha(pc))
2849 if (pswit[ECHO_SWITCH])
2850 g_print("\n%s\n",aline);
2851 if (!pswit[OVERVIEW_SWITCH])
2852 g_print(" Line %ld column %ld - "
2853 "endquote missing punctuation?\n",
2854 linecnt,g_utf8_pointer_to_offset(aline,s));
2862 * check_for_html_tag:
2864 * Check for <HTML TAG>.
2866 * If there is a < in the line, followed at some point
2867 * by a > then we suspect HTML.
/*
 * check_for_html_tag() - query text that looks like an HTML tag.
 *
 *   aline: the line to scan (UTF-8).
 *
 * Only the first '<' on the line is considered, together with the first
 * '>' that follows it. The text from '<' through '>' inclusive is copied
 * with g_strndup and printed after the message; the column is the 1-based
 * character offset of the '<'.
 * NOTE(review): the NULL tests on open and close and the release of the
 * copied tag are on lines not shown here.
 */
2869 void check_for_html_tag(const char *aline)
2871 const char *open,*close;
2873 open=strchr(aline,'<');
2876 close=strchr(g_utf8_next_char(open),'>');
2879 if (pswit[ECHO_SWITCH])
2880 g_print("\n%s\n",aline);
2881 if (!pswit[OVERVIEW_SWITCH])
2883 tag=g_strndup(open,close-open+1);
2884 g_print(" Line %ld column %ld - HTML Tag? %s \n",
2885 linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
2895 * check_for_html_entity:
2897 * Check for &symbol; HTML.
2899 * If there is a & in the line, followed at
2900 * some point by a ; then we suspect HTML.
/*
 * check_for_html_entity() - query text that looks like an HTML entity.
 *
 *   aline: the line to scan (UTF-8).
 *
 * Only the first '&' on the line is considered, together with the first
 * ';' at or after it. The span between them is scanned for a space; a
 * space breaks out of the scan so that phrases such as "Jones & Son;" are
 * not reported. Otherwise the text from '&' through ';' inclusive is
 * copied with g_strndup and printed after the message.
 * NOTE(review): the column here is computed from the byte distance
 * amp-aline, whereas check_for_html_tag() uses g_utf8_pointer_to_offset();
 * the two differ when multi-byte characters precede the '&'. Confirm
 * which is intended.
 * NOTE(review): the NULL tests, the test that decides whether the scan
 * found a space, and the release of the copy are on lines not shown here.
 */
2902 void check_for_html_entity(const char *aline)
2904 const char *s,*amp,*scolon;
2906 amp=strchr(aline,'&');
2909 scolon=strchr(amp,';');
2912 for (s=amp;s<scolon;s=g_utf8_next_char(s))
2913 if (g_utf8_get_char(s)==CHAR_SPACE)
2914 break; /* Don't report "Jones & Son;" */
2917 if (pswit[ECHO_SWITCH])
2918 g_print("\n%s\n",aline);
2919 if (!pswit[OVERVIEW_SWITCH])
2921 entity=g_strndup(amp,scolon-amp+1);
2922 g_print(" Line %ld column %d - HTML symbol? %s \n",
2923 linecnt,(int)(amp-aline)+1,entity);
2934 * check_for_omitted_punctuation:
2936 * Check for omitted punctuation at end of paragraph by working back
2937 * through prevline. DW.
2938 * Need to check this only for "normal" paras.
2939 * So what is a "normal" para?
2940 * Not normal if one-liner (chapter headings, etc.)
2941 * Not normal if doesn't contain at least one locase letter
2942 * Not normal if starts with space
/*
 * check_for_omitted_punctuation() - query a paragraph whose last line
 * ends in a letter.
 *
 *   prevline:        the final line of the paragraph that has just ended.
 *   last:            properties of that line; only blen is read here.
 *   start_para_line: line number on which that paragraph started.
 *
 * Steps visible in the code:
 *  1. letter_on_line records whether prevline contains any alphabetic
 *     character.
 *  2. The check proceeds only when there is such a letter, last->blen is
 *     greater than 2, the paragraph started before line linecnt-1, and
 *     the first character of prevline is above CHAR_SPACE.
 *  3. Starting at the end of prevline, trailing closing or neutral quotes
 *     are stepped over.
 *  4. The scan then continues backwards: an alphabetic character triggers
 *     the "No punctuation at para end?" report for line linecnt-1 with a
 *     column equal to the character length of prevline; a character from
 *     the punctuation set in the g_utf8_strchr() call is also tested for.
 * NOTE(review): what follows each of the two tests in step 4 (presumably
 * leaving the loop) is on lines not shown here.
 */
2944 void check_for_omitted_punctuation(const char *prevline,
2945 struct line_properties *last,int start_para_line)
2947 gboolean letter_on_line=FALSE;
2950 gboolean closing_quote;
2951 for (s=prevline;*s;s=g_utf8_next_char(s))
2952 if (g_unichar_isalpha(g_utf8_get_char(s)))
2954 letter_on_line=TRUE;
2958 * This next "if" is a problem.
2959 * If we say "start_para_line <= linecnt - 1", that includes
2960 * one-line "paragraphs" like chapter heads. Lotsa false positives.
2961 * If we say "start_para_line < linecnt - 1" it doesn't, but then it
2962 * misses genuine one-line paragraphs.
2964 if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
2965 g_utf8_get_char(prevline)>CHAR_SPACE)
2967 s=prevline+strlen(prevline);
/* step back over any run of closing or neutral quotes at the line end */
2970 s=g_utf8_prev_char(s);
2971 c=g_utf8_get_char(s);
2972 if (QUOTE_CLASS(c)==CLOSING_QUOTE || QUOTE_CLASS(c)==NEUTRAL_QUOTE)
2975 closing_quote=FALSE;
2976 } while (closing_quote && s>prevline);
2977 for (;s>prevline;s=g_utf8_prev_char(s))
2979 if (g_unichar_isalpha(g_utf8_get_char(s)))
2981 if (pswit[ECHO_SWITCH])
2982 g_print("\n%s\n",prevline);
2983 if (!pswit[OVERVIEW_SWITCH])
2984 g_print(" Line %ld column %ld - "
2985 "No punctuation at para end?\n",
2986 linecnt-1,g_utf8_strlen(prevline,-1));
2991 if (g_utf8_strchr("—-.:!([{?}])",-1,g_utf8_get_char(s)))
/*
 * report_duplicate_queries() - per-entry callback used by procfile() with
 * g_tree_foreach() on the qword tree.
 *
 *   key:   the queried word (used as a C string).
 *   value: the tree value stored for that word.
 *   data:  the user data given to g_tree_foreach(); procfile() passes NULL.
 *
 * Prints a "Queried word ... was duplicated N times" note.
 * NOTE(review): the condition guarding the print, the arguments of the
 * g_print() call and the return value are on lines not shown here.
 */
2997 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
2999 const char *word=key;
3002 g_print("\nNote: Queried word %s was duplicated %d times\n",
/*
 * print_as_windows_1252() - write a UTF-8 string converted to WINDOWS-1252.
 *
 *   string: the UTF-8 text to print.
 *
 * A UTF-8 to WINDOWS-1252 converter is opened on first use and kept in a
 * function-local static for later calls. The first branch closes that
 * converter and resets it; procfile() calls this function with NULL when
 * it finishes, which is presumably what selects that branch (the test
 * itself is on a line not shown here).
 * The output buffer is sized at strlen(string)+1 bytes, with inbytes and
 * outbytes both starting at strlen(string). A plain fputs() of the
 * unconverted string to stdout is also present; presumably it is the
 * fallback when no converter could be opened.
 * NOTE(review): the return value of g_iconv() is not used on the visible
 * line; the handling of the converted buffer is on lines not shown here.
 */
3007 void print_as_windows_1252(const char *string)
3009 gsize inbytes,outbytes;
3011 static GIConv converter=(GIConv)-1;
3014 if (converter!=(GIConv)-1)
3015 g_iconv_close(converter);
3016 converter=(GIConv)-1;
3019 if (converter==(GIConv)-1)
3020 converter=g_iconv_open("WINDOWS-1252","UTF-8");
3021 if (converter!=(GIConv)-1)
3023 inbytes=outbytes=strlen(string);
3024 bp=buf=g_malloc(outbytes+1);
3025 g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
3031 fputs(string,stdout);
/*
 * print_as_utf_8() - write the string to stdout unchanged.
 *
 *   string: the text to print; it is passed straight to fputs().
 */
3034 void print_as_utf_8(const char *string)
3036 fputs(string,stdout);
/*
 * procfile() - run every check over one input file.
 *
 *   filename: path of the text to examine.
 *
 * Outline of the visible flow:
 *  1. Reset the line counters and load the text with read_etext(); a load
 *     error is printed to stdout when STDOUT_SWITCH is set and to stderr
 *     otherwise.
 *  2. Print the "File:" banner, run first_pass() and turn its results into
 *     the warnings structure with report_first_pass().
 *  3. Create the qword and qperiod trees, keyed by strings compared with
 *     strcmp.
 *  4. Main pass: fetch lines with flgets(). DP page separators are skipped
 *     when DP_SWITCH is set. Lines before firstline or after a positive
 *     footerline are skipped, optionally echoing the Title, Author,
 *     Release Date and Edition lines when HEADER_SWITCH is set. Every
 *     remaining line is handed to the individual check_for_* routines;
 *     paragraph-level checks run at paragraph boundaries.
 *  5. After the loop: flush quote checks and pending messages, report
 *     duplicated queries unless OVERVIEW_SWITCH or VERBOSE_SWITCH is set,
 *     and release the trees, the counters, the print handler and the
 *     cached WINDOWS-1252 converter.
 * NOTE(review): many lines of this function (braces, conditions, frees,
 * counter updates) are not shown in this listing; the outline covers only
 * what is visible.
 */
3044 void procfile(const char *filename)
3047 gchar *parastart=NULL; /* first line of current para */
3048 gchar *etext,*aline;
3051 struct first_pass_results *first_pass_results;
3052 struct warnings *warnings;
3053 struct counters counters={0};
3054 struct line_properties last={0};
3055 struct parities parities={0};
3056 struct pending pending={0};
3057 gboolean isemptyline;
3058 long start_para_line=0;
3059 gboolean isnewpara=FALSE,enddash=FALSE;
3060 last.start=CHAR_SPACE;
3061 linecnt=checked_linecnt=0;
3062 etext=read_etext(filename,&err);
3065 if (pswit[STDOUT_SWITCH])
3066 fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
3068 fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
3071 g_print("\n\nFile: %s\n\n",filename);
3072 first_pass_results=first_pass(etext);
3073 warnings=report_first_pass(first_pass_results);
3074 qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
3075 qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
3077 * Here we go with the main pass. Hold onto yer hat!
3081 while ((aline=flgets(&etext_ptr,linecnt+1,warnings->newlines)))
3086 if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
3087 continue; // skip DP page separators completely
3088 if (linecnt<first_pass_results->firstline ||
3089 (first_pass_results->footerline>0 &&
3090 linecnt>first_pass_results->footerline))
3092 if (pswit[HEADER_SWITCH])
3094 if (g_str_has_prefix(aline,"Title:"))
3095 g_print(" %s\n",aline);
3096 if (g_str_has_prefix(aline,"Author:"))
3097 g_print(" %s\n",aline);
3098 if (g_str_has_prefix(aline,"Release Date:"))
3099 g_print(" %s\n",aline);
3100 if (g_str_has_prefix(aline,"Edition:"))
3101 g_print(" %s\n\n",aline);
3103 continue; /* skip through the header */
3106 print_pending(aline,parastart,&pending);
3107 isemptyline=analyse_quotes(aline,&counters);
3108 if (isnewpara && !isemptyline)
3110 /* This line is the start of a new paragraph. */
3111 start_para_line=linecnt;
3112 /* Capture its first line in case we want to report it later. */
3114 parastart=g_strdup(aline);
3115 memset(&parities,0,sizeof(parities)); /* restart the quote count */
/* advance s to the first letter or digit of the paragraph */
3117 while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
3118 !g_unichar_isdigit(g_utf8_get_char(s)))
3119 s=g_utf8_next_char(s);
3120 if (g_unichar_islower(g_utf8_get_char(s)))
3122 /* and its first letter is lowercase */
3123 if (pswit[ECHO_SWITCH])
3124 g_print("\n%s\n",aline);
3125 if (!pswit[OVERVIEW_SWITCH])
3126 g_print(" Line %ld column %ld - "
3127 "Paragraph starts with lower-case\n",
3128 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
3132 isnewpara=FALSE; /* Signal the end of new para processing. */
3134 /* Check for an em-dash broken at line end. */
3135 if (enddash && g_utf8_get_char(aline)=='-')
3137 if (pswit[ECHO_SWITCH])
3138 g_print("\n%s\n",aline);
3139 if (!pswit[OVERVIEW_SWITCH])
3140 g_print(" Line %ld column 1 - Broken em-dash?\n",linecnt);
/*
 * Walk back over trailing spaces to find the last visible character.
 * NOTE(review): s starts one character before the terminator and is
 * dereferenced before the s>aline test, so for an empty aline this
 * appears to read before the start of the buffer - confirm.
 */
3145 for (s=g_utf8_prev_char(aline+strlen(aline));
3146 g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
3148 if (s>=aline && g_utf8_get_char(s)=='-')
3150 check_for_control_characters(aline);
3151 check_for_odd_characters(aline,warnings,isemptyline);
3152 if (warnings->longline)
3153 check_for_long_line(aline);
3154 if (warnings->shortline)
3155 check_for_short_line(aline,&last);
3157 last.len=g_utf8_strlen(aline,-1);
3158 last.start=g_utf8_get_char(aline);
3159 check_for_starting_punctuation(aline);
3162 check_for_spaced_emdash(aline);
3163 check_for_spaced_dash(aline);
3165 check_for_unmarked_paragraphs(aline);
3166 check_for_jeebies(aline);
3167 check_for_mta_from(aline);
3168 check_for_orphan_character(aline);
3169 check_for_pling_scanno(aline);
3170 check_for_extra_period(aline,warnings);
3171 check_for_following_punctuation(aline);
3172 check_for_typos(aline,warnings);
3173 check_for_misspaced_punctuation(aline,&parities,isemptyline);
3174 check_for_double_punctuation(aline,warnings);
3175 check_for_spaced_quotes(aline);
3176 check_for_miscased_genative(aline);
3177 check_end_of_line(aline,warnings);
3178 check_for_unspaced_bracket(aline);
3179 if (warnings->endquote)
3180 check_for_unpunctuated_endquote(aline);
3181 check_for_html_tag(aline);
3182 check_for_html_entity(aline);
3185 check_for_mismatched_quotes(&counters,&pending);
3186 counters_reset(&counters);
3187 /* let the next iteration know that it's starting a new para */
3190 check_for_omitted_punctuation(prevline,&last,start_para_line);
/* keep a private copy of this line for the next iteration */
3193 prevline=g_strdup(aline);
3196 check_for_mismatched_quotes(&counters,&pending);
3197 print_pending(NULL,parastart,&pending);
3198 reset_pending(&pending);
3207 if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])
3208 g_tree_foreach(qword,report_duplicate_queries,NULL);
3209 g_tree_unref(qword);
3210 g_tree_unref(qperiod);
3211 counters_destroy(&counters);
3212 g_set_print_handler(NULL);
/* a NULL argument asks print_as_windows_1252() to drop its converter */
3213 print_as_windows_1252(NULL);
3214 if (pswit[MARKUP_SWITCH])
3221 * Get one line from the input text. The setting of newlines has the following
3224 * DOS_NEWLINES: Check for the existence of exactly one CR-LF line-end per line.
3226 * OS9_NEWLINES: Asserts that etext contains no LFs. CR is used as
3227 * the newline character.
3229 * UNIX_NEWLINES: Check for the presence of CRs.
3231 * In all cases, check that the last line is correctly terminated.
3233 * Returns: a pointer to the line.
/*
 * flgets() - fetch the next line from the in-memory text.
 *
 *   etext:    in/out cursor into the text; it is advanced as characters
 *             are consumed.
 *   lcnt:     line number to print in any line-end diagnostics.
 *   newlines: the newline convention being checked (DOS_NEWLINES,
 *             OS9_NEWLINES or UNIX_NEWLINES).
 *
 * theline remembers where the line starts and eos tracks where it ends.
 * Line-end problems are reported only when LINE_END_SWITCH is set; with
 * ECHO_SWITCH the offending line text (theline up to eos) is copied and
 * printed first. The visible diagnostics are: a final line with no
 * terminator ("No CR?" for OS9_NEWLINES, otherwise "No LF?"), an LF with
 * no preceding CR under DOS_NEWLINES, an embedded CR under UNIX_NEWLINES,
 * two successive CRs, and a CR that is not followed by LF.
 * Before returning, the line is passed to postprocess_for_HTML() when
 * MARKUP_SWITCH is set and to postprocess_for_DP() when DP_SWITCH is set;
 * both modify the line in place.
 * NOTE(review): the loop structure, the character dispatch, the
 * termination of the line and the return statements are on lines not
 * shown here; see the comment above the function for the stated contract.
 */
3235 char *flgets(char **etext,long lcnt,int newlines)
3238 gboolean isCR=FALSE;
3239 char *theline=*etext;
3244 c=g_utf8_get_char(*etext);
/* nothing consumed since the line start: presumably the end of the text */
3247 if (*etext==theline)
3249 else if (pswit[LINE_END_SWITCH])
3251 if (pswit[ECHO_SWITCH])
3253 s=g_strndup(theline,eos-theline);
3254 g_print("\n%s\n",s);
3257 if (!pswit[OVERVIEW_SWITCH])
3259 if (newlines==OS9_NEWLINES)
3260 g_print(" Line %ld - No CR?\n",lcnt);
3263 /* There may, or may not, have been a CR */
3264 g_print(" Line %ld - No LF?\n",lcnt);
3272 *etext=g_utf8_next_char(*etext);
3273 /* either way, it's end of line */
3276 if (newlines==DOS_NEWLINES && !isCR)
3278 /* Error - a LF without a preceding CR */
3279 if (pswit[LINE_END_SWITCH])
3281 if (pswit[ECHO_SWITCH])
3283 s=g_strndup(theline,eos-theline);
3284 g_print("\n%s\n",s);
3287 if (!pswit[OVERVIEW_SWITCH])
3288 g_print(" Line %ld - No CR?\n",lcnt);
3297 if (newlines==OS9_NEWLINES)
3299 if (isCR || newlines==UNIX_NEWLINES)
3301 if (pswit[LINE_END_SWITCH])
3303 if (pswit[ECHO_SWITCH])
3305 s=g_strndup(theline,eos-theline);
3306 g_print("\n%s\n",s);
3309 if (!pswit[OVERVIEW_SWITCH])
3311 if (newlines==UNIX_NEWLINES)
3312 g_print(" Line %ld column %ld - Embedded CR?\n",
3313 lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
3315 g_print(" Line %ld - Two successive CRs?\n",
3321 if (newlines==UNIX_NEWLINES)
3324 if (newlines==DOS_NEWLINES)
3329 if (pswit[LINE_END_SWITCH] && isCR)
3331 if (pswit[ECHO_SWITCH])
3333 s=g_strndup(theline,eos-theline);
3334 g_print("\n%s\n",s);
3337 if (!pswit[OVERVIEW_SWITCH])
3338 g_print(" Line %ld column %ld - CR without LF?\n",
3339 lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
3345 eos=g_utf8_next_char(eos);
3349 if (pswit[MARKUP_SWITCH])
3350 postprocess_for_HTML(theline);
3351 if (pswit[DP_SWITCH])
3352 postprocess_for_DP(theline);
3359 * Takes a "word" as a parameter, and checks whether it
3360 * contains a mixture of alpha and digits. Generally, this is an
3361 * error, but may not be for cases like 4th or L5 12s. 3d.
3363 * Returns: TRUE iff an error is found.
/*
 * mixdigit() - decide whether a word mixes letters and digits in a way
 * that deserves a query.
 *
 *   checkword: the NUL-terminated word to examine (UTF-8).
 *
 * The first loop records whether the word contains any alphabetic
 * character and any digit. When it has both, a series of exemptions is
 * tested against nondigit, which points at the first non-digit character
 * of the word:
 *   - ordinal suffixes st, rd, nd, th, their plural forms with a trailing
 *     s, and their adverb forms with a trailing ly, compared without
 *     regard to ASCII case;
 *   - a lone "l" in either case, or a lone lower-case "s" or "d" (these
 *     last two are compared with strcmp and so are case-sensitive);
 *   - a leading capital 'L' whose remainder is itself not a mixed word,
 *     tested by calling mixdigit() recursively on the rest.
 * NOTE(review): the assignments to the three flags and the return
 * statement are on lines not shown here.
 */
3365 gboolean mixdigit(const char *checkword)
3367 gboolean wehaveadigit,wehavealetter,query;
3368 const char *s,*nondigit;
3369 wehaveadigit=wehavealetter=query=FALSE;
3370 for (s=checkword;*s;s=g_utf8_next_char(s))
3371 if (g_unichar_isalpha(g_utf8_get_char(s)))
3373 else if (g_unichar_isdigit(g_utf8_get_char(s)))
3375 if (wehaveadigit && wehavealetter)
3377 /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
/* skip the leading digits so nondigit lands on the suffix */
3379 for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
3380 nondigit=g_utf8_next_char(nondigit))
3382 /* digits, ending in st, rd, nd, th of either case */
3383 if (!g_ascii_strcasecmp(nondigit,"st") ||
3384 !g_ascii_strcasecmp(nondigit,"rd") ||
3385 !g_ascii_strcasecmp(nondigit,"nd") ||
3386 !g_ascii_strcasecmp(nondigit,"th"))
3388 if (!g_ascii_strcasecmp(nondigit,"sts") ||
3389 !g_ascii_strcasecmp(nondigit,"rds") ||
3390 !g_ascii_strcasecmp(nondigit,"nds") ||
3391 !g_ascii_strcasecmp(nondigit,"ths"))
3393 if (!g_ascii_strcasecmp(nondigit,"stly") ||
3394 !g_ascii_strcasecmp(nondigit,"rdly") ||
3395 !g_ascii_strcasecmp(nondigit,"ndly") ||
3396 !g_ascii_strcasecmp(nondigit,"thly"))
3398 /* digits, ending in l, L, s or d */
3399 if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
3400 !strcmp(nondigit,"d"))
3403 * L at the start of a number, representing Britsh pounds, like L500.
3404 * This is cute. We know the current word is mixed digit. If the first
3405 * letter is L, there must be at least one digit following. If both
3406 * digits and letters follow, we have a genuine error, else we have a
3407 * capital L followed by digits, and we accept that as a non-error.
3409 if (g_utf8_get_char(checkword)=='L' &&
3410 !mixdigit(g_utf8_next_char(checkword)))
3419 * Extracts the first/next "word" from the line, and returns it.
3420 * A word is defined as one English word unit--or at least that's the aim.
3421 * "ptr" is advanced to the position in the line where we will start
3422 * looking for the next word.
3423 * If line is non-NULL, then it will be used to derive the column numbers for
3424 * any warnings issued. If line is NULL, then warnings will be suppressed.
3426 * Returns: A newly-allocated string.
/*
 * getaword() - extract the next word from a line.
 *
 *   line: the whole line, used only to compute columns for warnings.
 *   ptr:  in/out cursor; on return it points where the search for the
 *         next word should start.
 *
 * Returns a newly allocated string (the buffer of a GString released with
 * g_string_free(word,FALSE)); the caller owns it.
 *
 * Stages visible in the code:
 *  1. Skip characters that are neither digits nor letters. While skipping,
 *     a '[' followed by digits and a ']' is returned whole as a footnote
 *     marker such as [1].
 *  2. Look ahead over digits, letters, ',' and '.'; if a '.' or ','
 *     directly follows a digit inside that run, the look-ahead is returned
 *     as one word so that numbers like 1,000 stay together.
 *  3. Otherwise collect digits, letters, '_' and apostrophes, then handle
 *     underscores: underlining that wraps the whole word, partial
 *     underlining, and stray underscores, which produce the
 *     "Missing space or underscore?" warning.
 * NOTE(review): the tests on line (for suppressing warnings when it is
 * NULL), several assignments to s, t, t2 and adjust, and parts of the
 * underscore handling are on lines not shown here.
 * NOTE(review): the two "Missing space or underscore?" reports compute
 * their column differently (one adds 1 to the offset, the other does
 * not); confirm that this is intended.
 */
3428 gchar *getaword(const char *line,const char **ptr)
3430 const char *s,*t,*t2;
3434 gboolean initial_underlining=FALSE;
3435 word=g_string_new(NULL);
3436 for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
3437 !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
3438 **ptr;*ptr=g_utf8_next_char(*ptr))
3440 /* Handle exceptions for footnote markers like [1] */
3441 if (g_utf8_get_char(*ptr)=='[')
3443 g_string_append_c(word,'[');
3444 s=g_utf8_next_char(*ptr);
3445 for (;g_unichar_isdigit(g_utf8_get_char(s));s=g_utf8_next_char(s))
3446 g_string_append_unichar(word,g_utf8_get_char(s));
3447 if (g_utf8_get_char(s)==']')
3449 g_string_append_c(word,']');
3450 *ptr=g_utf8_next_char(s);
3451 return g_string_free(word,FALSE);
/* not a footnote marker after all: discard what was collected */
3454 g_string_truncate(word,0);
3456 initial_underlining=g_utf8_get_char(*ptr)=='_';
3459 * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
3460 * Especially yucky is the case of L1,000
3461 * This section looks for a pattern of characters including a digit
3462 * followed by a comma or period followed by one or more digits.
3463 * If found, it returns this whole pattern as a word; otherwise we discard
3464 * the results and resume our normal programming.
3467 for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
3468 g_unichar_isalpha(g_utf8_get_char(s)) ||
3469 g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
3470 g_string_append_unichar(word,g_utf8_get_char(s));
3473 for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))
3475 c=g_utf8_get_char(t);
3476 pc=g_utf8_get_char(g_utf8_prev_char(t));
3477 if ((c=='.' || c==',') && g_unichar_isdigit(pc))
3480 return g_string_free(word,FALSE);
3484 /* we didn't find a punctuated number - do the regular getword thing */
3485 g_string_truncate(word,0);
3487 c=g_utf8_get_char(s);
3488 for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || c=='_' ||
3489 CHAR_IS_APOSTROPHE(c); s=g_utf8_next_char(s),c=g_utf8_get_char(s))
3490 g_string_append_unichar(word,c);
3491 if (initial_underlining && word->str[word->len-1]=='_')
3493 /* _Simple_ or _Old-school_underlining_ */
3495 g_string_truncate(word,t-*ptr);
3497 *ptr=t; /* _Old-school_underlining_ */
3499 *ptr=s; /* _Simple_ */
3501 else if (initial_underlining || (t=strchr(word->str,'_')))
3503 /* Part_ial_ underlining */
3505 if (initial_underlining)
3507 t2=strchr(word->str,'_');
3510 g_string_erase(word,t2-word->str,1);
3517 if (pswit[ECHO_SWITCH])
3518 g_print("\n%s\n",line);
3519 if (!pswit[OVERVIEW_SWITCH])
3520 g_print(" Line %ld column %ld - "
3521 "Missing space or underscore?\n",linecnt,
3522 g_utf8_pointer_to_offset(line,*ptr));
3527 return g_string_free(word,FALSE);
3530 while ((t=strchr(word->str,'_')))
3535 g_string_erase(word,t-word->str,1);
3537 g_string_erase(word,t2-word->str,1);
3542 g_string_truncate(word,t-word->str);
3543 adjust+=g_utf8_pointer_to_offset(word->str,t);
3544 *ptr=g_utf8_offset_to_pointer(*ptr,adjust);
3547 if (pswit[ECHO_SWITCH])
3548 g_print("\n%s\n",line);
3549 if (!pswit[OVERVIEW_SWITCH])
3550 g_print(" Line %ld column %ld - "
3551 "Missing space or underscore?\n",linecnt,
3552 g_utf8_pointer_to_offset(line,*ptr)+1);
3556 return g_string_free(word,FALSE);
3562 /* No underlining */
3564 return g_string_free(word,FALSE);
3570 * Is this word a Roman Numeral?
3572 * It doesn't actually validate that the number is a valid Roman Numeral--for
3573 * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
3574 * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
3575 * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
3576 * expressions thereof, except when it came to taxes. Allow any number of M,
3577 * an optional D, an optional CM or CD, any number of optional Cs, an optional
3578 * XL or an optional XC, an optional IX or IV, an optional V and any number
/*
 * isroman() - test whether a word has the shape of a Roman numeral.
 *
 *   t: the word to test.
 *
 * The word is consumed from left to right in a fixed order: a run of m,
 * an optional d, cm, cd, a run of c, xl, xc, an optional l, a run of x,
 * ix, iv, an optional v and finally a run of i. Every comparison is
 * against a lower-case letter, so the input is presumably already
 * lower-cased by the caller (TODO confirm).
 * In the while conditions the extra "&& *t" test is redundant, because a
 * character equal to a letter cannot be the terminator.
 * NOTE(review): the statements that advance t after each match and the
 * final return are on lines not shown here.
 */
3581 gboolean isroman(const char *t)
3587 while (g_utf8_get_char(t)=='m' && *t)
3589 if (g_utf8_get_char(t)=='d')
3591 if (g_str_has_prefix(t,"cm"))
3593 if (g_str_has_prefix(t,"cd"))
3595 while (g_utf8_get_char(t)=='c' && *t)
3597 if (g_str_has_prefix(t,"xl"))
3599 if (g_str_has_prefix(t,"xc"))
3601 if (g_utf8_get_char(t)=='l')
3603 while (g_utf8_get_char(t)=='x' && *t)
3605 if (g_str_has_prefix(t,"ix"))
3607 if (g_str_has_prefix(t,"iv"))
3609 if (g_utf8_get_char(t)=='v')
3611 while (g_utf8_get_char(t)=='i' && *t)
3617 * postprocess_for_DP:
3619 * Invoked with the -d switch from flgets().
3620 * It simply "removes" from the line a hard-coded set of common
3621 * DP-specific tags, so that the line passed to the main routine has
3622 * been pre-cleaned of DP markup.
/*
 * postprocess_for_DP() - delete DP markup strings from a line in place.
 *
 *   theline: the line to clean; it is modified in place.
 *
 * DPmarkup is walked until an entry that is the empty string. For each
 * entry, every occurrence in theline is removed by moving the rest of the
 * line (including its terminator) down over it. The search restarts from
 * the beginning of the line after each removal.
 */
3624 void postprocess_for_DP(char *theline)
3630 for (i=0;*DPmarkup[i];i++)
3631 while ((s=strstr(theline,DPmarkup[i])))
/* t is the first byte after the matched markup */
3633 t=s+strlen(DPmarkup[i]);
3634 memmove(s,t,strlen(t)+1);
3639 * postprocess_for_HTML:
3641 * Invoked with the -m switch from flgets().
3642 * It simply "removes" from the line a hard-coded set of common
3643 * HTML tags and "replaces" a hard-coded set of common HTML
3644 * entities, so that the line passed to the main routine has
3645 * been pre-cleaned of HTML.
/*
 * postprocess_for_HTML() - strip HTML markup and entities from a line.
 *
 *   theline: the line to clean; it is modified in place.
 *
 * losemarkup() is called repeatedly for as long as it returns a non-NULL
 * pointer, then loseentities() is run once over the result.
 */
3647 void postprocess_for_HTML(char *theline)
3649 while (losemarkup(theline))
3651 loseentities(theline);
/*
 * losemarkup() - remove one recognised HTML tag from a line in place.
 *
 *   theline: the line to clean; it is modified in place.
 *
 * The first '<' on the line and the first '>' after it delimit the
 * candidate tag. The text following '<' is compared with each entry of
 * markup (walked until an empty-string entry) using tagcomp(); on a match
 * everything from '<' through '>' is removed by moving the rest of the
 * line down over it.
 * NOTE(review): the NULL tests on s and t and all return statements are
 * on lines not shown here; the only caller in this file keeps calling
 * while the result is non-NULL.
 */
3654 char *losemarkup(char *theline)
3658 s=strchr(theline,'<');
3659 t=s?strchr(s,'>'):NULL;
3662 for (i=0;*markup[i];i++)
3663 if (tagcomp(g_utf8_next_char(s),markup[i]))
/* step past the '>' so that it is removed together with the tag */
3665 t=g_utf8_next_char(t);
3666 memmove(s,t,strlen(t)+1);
3669 /* It's an unrecognized <xxx>. */
/*
 * loseentities() - replace HTML entities in a line, in place.
 *
 *   theline: the line to clean; it is modified in place.
 *
 * Resources: a GTree that maps entity names from HTMLentities to code
 * points, and two iconv converters (UTF-8 to ISO_8859-1 with
 * transliteration, and ISO_8859-1 back to UTF-8) that are kept in
 * function-local statics. The first visible branch destroys the tree and
 * closes both converters.
 *
 * For each '&' that has a following ';':
 *   - a decimal numeric reference is parsed with strtol base 10;
 *   - a hexadecimal one (an 'x' at amp[2]) is parsed with strtol base 16;
 *   - otherwise the name between '&' and ';' is looked up in the tree.
 * A code point below 128 or in the range 192..255 is written directly as
 * UTF-8; other code points go through the two converters first. The text
 * after the ';' is then moved down to close the gap.
 * NOTE(review): entities is declared without "static" on the visible
 * line while the converters are static; if that is accurate the tree is
 * rebuilt on every call - confirm against the full source.
 * NOTE(review): the return values of g_iconv_open() and
 * g_convert_with_iconv() are not tested on the visible lines.
 * NOTE(review): the tests that select the cleanup branch, the checks on
 * scolon and on the looked-up value, and the frees of the temporary
 * strings are on lines not shown here.
 */
3673 void loseentities(char *theline)
3680 GTree *entities=NULL;
3681 static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
3685 g_tree_destroy(entities);
3687 if (translit!=(GIConv)-1)
3688 g_iconv_close(translit);
3689 translit=(GIConv)-1;
3690 if (to_utf8!=(GIConv)-1)
3691 g_iconv_close(to_utf8);
3699 entities=g_tree_new((GCompareFunc)strcmp);
3700 for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
3701 g_tree_insert(entities,HTMLentities[i].name,
3702 GUINT_TO_POINTER(HTMLentities[i].c));
3704 if (translit==(GIConv)-1)
3705 translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
3706 if (to_utf8==(GIConv)-1)
3707 to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
3708 while((amp=strchr(theline,'&')))
3710 scolon=strchr(amp,';');
/* amp+2 skips the '&' and, presumably, a '#' tested on a line not shown */
3715 if (amp+2+strspn(amp+2,"0123456789")==scolon)
3716 c=strtol(amp+2,NULL,10);
3717 else if (amp[2]=='x' &&
3718 amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
3719 c=strtol(amp+3,NULL,16);
3723 s=g_strndup(amp+1,scolon-(amp+1));
3724 c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
/* && binds tighter than ||, so this reads: c<128 || (c>=192 && c<=255) */
3733 if (c<128 || c>=192 && c<=255) /* An ISO-8859-1 character */
3734 theline+=g_unichar_to_utf8(c,theline);
3738 nb=g_unichar_to_utf8(c,s);
3739 t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
3741 s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
3743 memcpy(theline,s,nb);
3747 memmove(theline,g_utf8_next_char(scolon),
3748 strlen(g_utf8_next_char(scolon))+1);
/* continue the search after this '&' */
3751 theline=g_utf8_next_char(amp);
/*
 * tagcomp() - case-insensitive test of whether tag text starts with a
 * given tag name.
 *
 *   strin:   the text that follows a '<' (a leading '/' is skipped, so
 *            closing tags are matched as well).
 *   basetag: the tag name to compare against.
 *
 * Both strings are case-folded with g_utf8_casefold() and the result is
 * whether the folded strin has the folded basetag as a prefix.
 * NOTE(review): the release of the two folded copies and the return
 * statement are on lines not shown here.
 */
3755 gboolean tagcomp(const char *strin,const char *basetag)
3759 if (g_utf8_get_char(strin)=='/')
3760 t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
3762 t=g_utf8_casefold(strin,-1);
3763 s=g_utf8_casefold(basetag,-1);
3764 retval=g_str_has_prefix(t,s);
3770 void proghelp(GOptionContext *context)
3773 fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
3774 fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
3775 fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
3776 fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
3777 "For details, read the file COPYING.\n",stderr);
3778 fputs("This is Free Software; "
3779 "you may redistribute it under certain conditions (GPL);\n",stderr);
3780 fputs("read the file COPYING for details.\n\n",stderr);
3781 help=g_option_context_get_help(context,TRUE,NULL);
3784 fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
3785 fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
3786 "non-ASCII\n",stderr);
3787 fputs("characters like accented letters, "
3788 "lines longer than 75 or shorter than 55,\n",stderr);
3789 fputs("unbalanced quotes or brackets, "
3790 "a variety of badly formatted punctuation, \n",stderr);
3791 fputs("HTML tags, some likely typos. "
3792 "It is NOT a substitute for human judgement.\n",stderr);