1 /*************************************************************************/
2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com> */
5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk> */
7 /* This program is free software; you can redistribute it and/or modify */
8 /* it under the terms of the GNU General Public License as published by */
9 /* the Free Software Foundation; either version 2 of the License, or */
10 /* (at your option) any later version. */
12 /* This program is distributed in the hope that it will be useful, */
13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
15 /* GNU General Public License for more details. */
17 /* You should have received a copy of the GNU General Public License */
18 /* along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 /*************************************************************************/
30 #include "bookloupe.h"
33 #include "HTMLentities.h"
39 "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
40 "nad", "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa",
41 "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
42 "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
43 "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
44 "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
45 "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
46 "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
47 "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
48 "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
49 "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
50 "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
51 "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
52 "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
53 "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
54 "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
55 "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
56 "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
57 "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
58 "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
59 "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
60 "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
61 "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
62 "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
63 "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
64 "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
65 "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
66 "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
67 "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
73 /* Common abbreviations and other OK words not to query as typos. */
75 "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
76 "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
77 "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
78 "outbid", "outbids", "frostbite", "frostbitten", ""
81 /* Common abbreviations that cause otherwise unexplained periods. */
83 "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
84 "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
88 * Two-Letter combinations that rarely if ever start words,
89 * but are common scannos or otherwise common letter combinations.
92 "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
96 * Two-Letter combinations that rarely if ever end words,
97 * but are common scannos or otherwise common letter combinations.
100 "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
101 "sw", "gr", "sl", "cl", "iy", ""
105 "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
106 "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
107 "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
108 "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
112 "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
116 "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
117 "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
118 "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
119 "during", "let", "toward", "among", ""
123 "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
124 "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
125 "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
126 "among", "those", "into", "whom", "having", "thence", ""
/* Global switch state; the option tables below store into pswit[] slots. */
129 gboolean pswit[SWITNO]; /* program switches */
/*
 * Flags for the gutcheck-compatible toggle options: typo_compat is the
 * target of --toggle-typo; paranoid_compat is presumably the target of
 * --toggle-relaxed (TODO confirm against compatibility_options[]).
 */
131 gboolean typo_compat,paranoid_compat;
/*
 * Main switch table, used by the command line parser and by the
 * configuration file code. Switches come in pairs: a plain name and a
 * "no-" prefixed name flagged G_OPTION_FLAG_REVERSE, both storing into the
 * same pswit[] slot. One member of each pair is flagged hidden.
 */
133 static GOptionEntry options[]={
134 { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
135 "Ignore DP-specific markup", NULL },
136 { "no-dp", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
137 G_OPTION_ARG_NONE, pswit+DP_SWITCH,
138 "Don't ignore DP-specific markup", NULL },
139 { "echo", 0, G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
140 "Echo queried line", NULL },
141 { "no-echo", 'e', G_OPTION_FLAG_REVERSE,
142 G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
143 "Don't echo queried line", NULL },
144 { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
145 "Check single quotes", NULL },
146 { "no-squote", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
147 G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
148 "Don't check single quotes", NULL },
149 { "typo", 0, 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
150 "Check common typos", NULL },
151 { "no-typo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
152 G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
153 "Don't check common typos", NULL },
154 { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
155 "Require closure of quotes on every paragraph", NULL },
156 { "no-qpara", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
157 G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
158 "Don't require closure of quotes on every paragraph", NULL },
159 { "paranoid", 0, G_OPTION_FLAG_HIDDEN,
160 G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
161 "Enable paranoid querying of everything", NULL },
162 { "no-paranoid", 0, G_OPTION_FLAG_REVERSE,
163 G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
164 "Disable paranoid querying of everything", NULL },
165 { "line-end", 0, G_OPTION_FLAG_HIDDEN,
166 G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
167 "Enable line end checking", NULL },
168 { "no-line-end", 'l', G_OPTION_FLAG_REVERSE,
169 G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
170 "Disable line end checking", NULL },
171 { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
172 "Overview: just show counts", NULL },
173 { "no-overview", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
174 G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
175 "Show individual warnings", NULL },
176 { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
177 "Output errors to stdout instead of stderr", NULL },
178 { "no-stdout", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
179 G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
180 "Output errors to stderr instead of stdout", NULL },
181 { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
182 "Echo header fields", NULL },
183 { "no-header", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
184 G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
185 "Don't echo header fields", NULL },
186 { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
187 "Ignore markup in < >", NULL },
188 { "no-markup", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
189 G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
190 "No special handling for markup in < >", NULL },
191 { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
192 "Use file of user-defined typos", NULL },
193 { "no-usertypo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
194 G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
195 "Ignore file of user-defined typos", NULL },
196 { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
197 "Verbose - list everything", NULL },
198 { "no-verbose", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
199 G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
200 "Switch off verbose mode", NULL },
205 * Options relating to configuration which make no sense from inside
206 * a configuration file.
/*
 * These entries live in their own table, separate from options[], which is
 * the table the configuration file code iterates over.
 */
209 static GOptionEntry config_options[]={
210 { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
211 "Defaults for use on www upload", NULL },
212 { "dump-config", 0, 0, G_OPTION_ARG_NONE, pswit+DUMP_CONFIG_SWITCH,
213 "Dump current config settings", NULL },
/*
 * Gutcheck-compatible toggle switches. They store into the standalone
 * typo_compat / paranoid_compat flags rather than into pswit[].
 */
217 static GOptionEntry compatibility_options[]={
218 { "toggle-typo", 't', 0, G_OPTION_ARG_NONE, &typo_compat,
219 "Toggle checking for common typos", NULL },
220 { "toggle-relaxed", 'x', 0, G_OPTION_ARG_NONE, &paranoid_compat,
221 "Toggle both paranoid mode and common typos", NULL },
/*
 * File-scope tallies. The cnt_* query counters are printed by main() as the
 * overview summary and added together for the TOTAL QUERIES line.
 */
225 long cnt_quote; /* for overview mode, count of quote queries */
226 long cnt_brack; /* for overview mode, count of brackets queries */
227 long cnt_bin; /* for overview mode, count of non-ASCII queries */
228 long cnt_odd; /* for overview mode, count of odd character queries */
229 long cnt_long; /* for overview mode, count of long line errors */
230 long cnt_short; /* for overview mode, count of short line queries */
231 long cnt_punct; /* for overview mode,
232 count of punctuation and spacing queries */
233 long cnt_dash; /* for overview mode, count of dash-related queries */
234 long cnt_word; /* for overview mode, count of word queries */
235 long cnt_html; /* for overview mode, count of html queries */
236 long cnt_lineend; /* for overview mode, count of line-end queries */
237 long cnt_spacend; /* count of lines with space at end */
238 long linecnt; /* count of total lines in the file */
239 long checked_linecnt; /* count of lines actually checked */
/* Forward declarations for functions defined later in this file. */
241 void proghelp(GOptionContext *context);
242 void procfile(const char *);
246 gboolean mixdigit(const char *);
247 gchar *getaword(const char **);
248 char *flgets(char **,long);
249 void postprocess_for_HTML(char *);
250 char *linehasmarkup(char *);
251 char *losemarkup(char *);
252 gboolean tagcomp(const char *,const char *);
253 void loseentities(char *);
254 gboolean isroman(const char *);
255 void postprocess_for_DP(char *);
256 void print_as_windows_1252(const char *string);
257 void print_as_utf_8(const char *string);
/* NOTE(review): qword/qperiod are not used in the visible code; purpose unconfirmed. */
259 GTree *qword,*qperiod;
/*
 * config_file_update:
 * @kf: key file to write into.
 *
 * Store the current state of the boolean switches from options[] in the
 * "options" group of @kf, keyed by each entry's long name.
 */
267 void config_file_update(GKeyFile *kf)
271 for(i=0;options[i].long_name;i++)
/* "no-" variants are tested for here; presumably they are skipped (TODO confirm). */
273 if (g_str_has_prefix(options[i].long_name,"no-"))
275 if (options[i].arg==G_OPTION_ARG_NONE)
/* arg_data points at the gboolean the option parser writes to. */
277 sw=*(gboolean *)options[i].arg_data;
/* Reversed entries are checked for; presumably the value is inverted (TODO confirm). */
278 if (options[i].flags&G_OPTION_FLAG_REVERSE)
280 g_key_file_set_boolean(kf,"options",options[i].long_name,sw);
/* Presumably reached only for an argument type other than G_OPTION_ARG_NONE. */
283 g_assert_not_reached();
/*
 * config_file_add_comments:
 * @kf: key file to annotate.
 *
 * Attach a file-level heading comment to @kf and, for entries of options[],
 * attach the entry's description as the comment on its key in the "options"
 * group.
 */
287 void config_file_add_comments(GKeyFile *kf)
291 g_key_file_set_comment(kf,NULL,NULL," Default configuration for bookloupe",
293 for(i=0;options[i].long_name;i++)
/* "no-" variants are tested for here; presumably they are skipped (TODO confirm). */
295 if (g_str_has_prefix(options[i].long_name,"no-"))
/* The description is prefixed with a space before being used as the comment. */
297 comment=g_strconcat(" ",options[i].description,NULL);
298 g_key_file_set_comment(kf,"options",options[i].long_name,comment,NULL);
/*
 * dump_config:
 *
 * Render the configuration as key-file text. The global key file is
 * refreshed from the current switches; on another path a new key file is
 * created, filled and commented. The condition choosing between the two
 * paths is not visible here (presumably whether a config file was loaded).
 */
303 void dump_config(void)
307 config_file_update(config);
310 config=g_key_file_new();
311 config_file_update(config);
312 config_file_add_comments(config);
/* Serialise the key file to a string. */
314 s=g_key_file_to_data(config,NULL,NULL);
/*
 * read_config_file:
 * @full_path: out parameter; NOTE(review): presumably receives the path of
 *             the file that was loaded - the assignment is not visible here.
 *
 * Look for "bookloupe.ini" in a list of directories and load it as a key
 * file. The list comes from the BOOKLOUPE_CONFIG_PATH environment variable
 * when that is used, otherwise it is the current directory, the directory
 * the program runs from and the user configuration directory.
 */
320 GKeyFile *read_config_file(gchar **full_path)
326 const char *search_path;
329 search_path=g_getenv("BOOKLOUPE_CONFIG_PATH");
/* Two separators are supported; presumably ';' on Windows and ':' elsewhere. */
333 search_dirs=g_strsplit(search_path,";",0);
335 search_dirs=g_strsplit(search_path,":",0);
/* Default search list: three entries; the fourth slot is presumably the NULL terminator. */
340 search_dirs=g_new(gchar *,4);
341 search_dirs[0]=g_get_current_dir();
342 search_dirs[1]=g_strdup(running_from);
343 search_dirs[2]=g_strdup(g_get_user_config_dir());
346 for(i=0;search_dirs[i];i++)
348 path=g_build_filename(search_dirs[i],"bookloupe.ini",NULL);
349 if (g_key_file_load_from_file(kf,path,
350 G_KEY_FILE_KEEP_COMMENTS|G_KEY_FILE_KEEP_TRANSLATIONS,&err))
/* A missing file is not reported; any other load error is printed. */
352 if (!g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
354 g_printerr("Bookloupe: Error reading %s\n",path);
355 g_printerr("%s\n",err->message);
367 g_strfreev(search_dirs);
/*
 * parse_config_file:
 *
 * Load the configuration file and apply each key of its "options" group to
 * the matching entry of options[]. Keys that match no entry are reported as
 * unknown and ignored.
 */
375 void parse_config_file(void)
382 config=read_config_file(&path);
384 keys=g_key_file_get_keys(config,"options",NULL,NULL);
391 for(j=0;options[j].long_name;j++)
/* "no-" variants are tested for here; presumably they are not matched against keys. */
393 if (g_str_has_prefix(options[j].long_name,"no-"))
395 else if (!strcmp(keys[i],options[j].long_name))
397 if (options[j].arg==G_OPTION_ARG_NONE)
399 sw=g_key_file_get_boolean(config,"options",keys[i],
/* A value that cannot be read as a boolean is reported with the file path and key. */
403 g_printerr("Bookloupe: %s: options.%s: %s\n",
404 path,keys[i],err->message);
/* Reversed entries are checked for; presumably the value is inverted before storing. */
407 if (options[j].flags&G_OPTION_FLAG_REVERSE)
409 *(gboolean *)options[j].arg_data=sw;
413 g_assert_not_reached();
/* The inner loop ran off the end of options[]: no entry matched this key. */
416 if (!options[j].long_name)
417 g_printerr("Bookloupe: %s: Unknown option \"%s\" ignored\n",
/*
 * parse_options:
 * @argc: pointer to the argument count, handed to g_option_context_parse().
 * @argv: pointer to the argument vector, handed to g_option_context_parse().
 *
 * Build the option context from the three option tables, parse the command
 * line, then apply the compatibility toggles and the --web, --dump-config
 * and --overview adjustments to pswit[].
 */
426 void parse_options(int *argc,char ***argv)
429 GOptionContext *context;
430 GOptionGroup *compatibility;
431 context=g_option_context_new(
432 "file - look for errors in Project Gutenberg(TM) etexts");
433 g_option_context_add_main_entries(context,options,NULL);
434 g_option_context_add_main_entries(context,config_options,NULL);
435 compatibility=g_option_group_new("compatibility",
436 "Options for Compatibility with Gutcheck:",
437 "Show compatibility options",NULL,NULL);
438 g_option_group_add_entries(compatibility,compatibility_options);
439 g_option_context_add_group(context,compatibility);
440 g_option_context_set_description(context,
441 "For simplicity, only the switch options which reverse the\n"
442 "default configuration are listed. In most cases, both vanilla\n"
443 "and \"no-\" prefixed versions are available for use.");
444 if (!g_option_context_parse(context,argc,argv,&err))
446 g_printerr("Bookloupe: %s\n",err->message);
447 g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
/*
 * Compatibility toggles. The guarding conditions are not visible here;
 * presumably the first flip is for typo_compat and the pair that follows
 * is for paranoid_compat (TODO confirm).
 */
451 pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
454 pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
455 pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
458 * Web uploads - for the moment, this is really just a placeholder
459 * until we decide what processing we really want to do on web uploads
461 if (pswit[WEB_SWITCH])
463 /* specific override for web uploads */
464 pswit[ECHO_SWITCH]=TRUE;
465 pswit[SQUOTE_SWITCH]=FALSE;
466 pswit[TYPO_SWITCH]=TRUE;
467 pswit[QPARA_SWITCH]=FALSE;
468 pswit[PARANOID_SWITCH]=TRUE;
469 pswit[LINE_END_SWITCH]=FALSE;
470 pswit[OVERVIEW_SWITCH]=FALSE;
471 pswit[STDOUT_SWITCH]=FALSE;
472 pswit[HEADER_SWITCH]=TRUE;
473 pswit[VERBOSE_SWITCH]=FALSE;
474 pswit[MARKUP_SWITCH]=FALSE;
475 pswit[USERTYPO_SWITCH]=FALSE;
476 pswit[DP_SWITCH]=FALSE;
/* The body of this branch is not visible here; presumably it dumps the config. */
478 if (pswit[DUMP_CONFIG_SWITCH])
483 if (pswit[OVERVIEW_SWITCH])
484 /* just print summary; don't echo */
485 pswit[ECHO_SWITCH]=FALSE;
491 g_option_context_free(context);
497 * Read in the user-defined stealth scanno list.
/*
 * The list is looked for under four names in turn: "bookloupe.typ", the
 * same name in the program's own directory, "gutcheck.typ", and that name
 * in the program's directory. A later name is tried when the previous
 * attempt failed with G_FILE_ERROR_NOENT. The text is brought to UTF-8,
 * split into lines, and lines are inserted into the usertypo tree as keys.
 */
499 void read_user_scannos(void)
502 gchar *usertypo_file;
506 gchar *contents,*utf8,**lines;
507 usertypo_file=g_strdup("bookloupe.typ");
508 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
509 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
512 g_free(usertypo_file);
513 usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
514 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
516 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
519 g_free(usertypo_file);
520 usertypo_file=g_strdup("gutcheck.typ");
521 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
523 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
526 g_free(usertypo_file);
527 usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
528 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
/* None of the four candidates exists: say so and carry on. */
530 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
532 g_free(usertypo_file);
533 g_print(" --> I couldn't find bookloupe.typ "
534 "-- proceeding without user typos.\n");
/* Presumably the path for any other read error (the condition is not visible). */
539 fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
540 g_free(usertypo_file);
/* Valid UTF-8 is normalised; anything else is converted from Windows-1252. */
544 if (g_utf8_validate(contents,len,NULL))
545 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
547 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
549 lines=g_strsplit_set(utf8,"\r\n",0);
/* The tree is created with g_free as its key destroy function. */
551 usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
552 for (i=0;lines[i];i++)
/* Only lines whose first byte is greater than '!' are inserted. */
553 if (*(unsigned char *)lines[i]>'!')
554 g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
563 * Read an etext returning a newly allocated string containing the file
564 * contents or NULL on error.
/*
 * @filename: file to read.
 * @err: return location for a GError.
 *
 * Contents that validate as UTF-8 are normalised and output is switched to
 * the UTF-8 print handler. Otherwise the bytes are converted from
 * Windows-1252 and the Windows-1252 print handler is installed. An illegal
 * byte sequence during conversion is reported with its line and column.
 */
566 gchar *read_etext(const char *filename,GError **err)
568 GError *tmp_err=NULL;
569 gchar *contents,*utf8;
570 gsize len,bytes_read,bytes_written;
572 if (!g_file_get_contents(filename,&contents,&len,err))
574 if (g_utf8_validate(contents,len,NULL))
576 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
577 g_set_print_handler(print_as_utf_8);
/* Console code page call; presumably compiled for Windows only (guard not visible). */
579 SetConsoleOutputCP(CP_UTF8);
584 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,
585 &bytes_written,&tmp_err);
586 if (g_error_matches(tmp_err,G_CONVERT_ERROR,
587 G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
/* Scan the bytes before the failure point to work out its line and column. */
590 for(i=0;i<bytes_read;i++)
591 if (contents[i]=='\n')
596 else if (contents[i]!='\r')
/* contents[bytes_read] is the byte the message reports as invalid. */
598 g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
599 "Input conversion failed. Byte %d at line %d, column %d is not a "
600 "valid Windows-1252 character",
601 ((unsigned char *)contents)[bytes_read],line,col);
/* Presumably the path for any other conversion error (condition not visible). */
604 g_propagate_error(err,tmp_err);
605 g_set_print_handler(print_as_windows_1252);
/* Console code page call; presumably compiled for Windows only (guard not visible). */
607 SetConsoleOutputCP(1252);
/*
 * cleanup_on_exit:
 *
 * Exit handler registered by main() with atexit(). It puts back the console
 * output code page that main() saved in saved_cp.
 */
614 void cleanup_on_exit(void)
617 SetConsoleOutputCP(saved_cp);
/*
 * main:
 *
 * Set the switch defaults, parse the options, and in overview mode print
 * the table of query counts. Frees running_from, the user typo tree and the
 * key file before finishing. The calls that load the user typos and process
 * the input file are not among the visible lines.
 */
621 int main(int argc,char **argv)
/* Register the handler that restores the console code page saved just below. */
624 atexit(cleanup_on_exit);
625 saved_cp=GetConsoleOutputCP();
/* Directory part of argv[0]; used when searching for config and typo files. */
627 running_from=g_path_get_dirname(argv[0]);
628 /* Paranoid checking is turned OFF, not on, by its switch */
629 pswit[PARANOID_SWITCH]=TRUE;
630 /* if running in paranoid mode, typo checks default to enabled */
631 pswit[TYPO_SWITCH]=TRUE;
632 /* Line-end checking is turned OFF, not on, by its switch */
633 pswit[LINE_END_SWITCH]=TRUE;
634 /* Echoing is turned OFF, not on, by its switch */
635 pswit[ECHO_SWITCH]=TRUE;
637 parse_options(&argc,&argv);
/* The body of this branch is not visible; presumably it reads the user typo list. */
638 if (pswit[USERTYPO_SWITCH])
640 fprintf(stderr,"bookloupe: Check and report on an e-text\n");
/* Overview mode: print the per-category counts and their total. */
642 if (pswit[OVERVIEW_SWITCH])
644 g_print(" Checked %ld lines of %ld (head+foot = %ld)\n\n",
645 checked_linecnt,linecnt,linecnt-checked_linecnt);
646 g_print(" --------------- Queries found --------------\n");
648 g_print(" Long lines: %14ld\n",cnt_long);
650 g_print(" Short lines: %14ld\n",cnt_short);
652 g_print(" Line-end problems: %14ld\n",cnt_lineend);
654 g_print(" Common typos: %14ld\n",cnt_word);
656 g_print(" Unmatched quotes: %14ld\n",cnt_quote);
658 g_print(" Unmatched brackets: %14ld\n",cnt_brack);
660 g_print(" Non-ASCII characters: %14ld\n",cnt_bin);
662 g_print(" Proofing characters: %14ld\n",cnt_odd);
664 g_print(" Punctuation & spacing queries: %14ld\n",cnt_punct);
666 g_print(" Non-standard dashes: %14ld\n",cnt_dash);
668 g_print(" Possible HTML tags: %14ld\n",cnt_html);
670 g_print(" TOTAL QUERIES %14ld\n",
671 cnt_quote+cnt_brack+cnt_bin+cnt_odd+cnt_long+cnt_short+cnt_punct+
672 cnt_dash+cnt_word+cnt_html+cnt_lineend);
674 g_free(running_from);
676 g_tree_unref(usertypo);
678 g_key_file_free(config);
685 * Run a first pass - verify that it's a valid PG
686 * file, decide whether to report some things that
687 * occur many times in the text like long or short
688 * lines, non-standard dashes, etc.
/*
 * @etext: the whole text, split here on '\n'.
 *
 * Statistics are accumulated in a function-static first_pass_results, so
 * they persist between calls. NOTE(review): the return statement is not
 * visible; presumably the address of that static struct is returned.
 */
690 struct first_pass_results *first_pass(const char *etext)
692 gunichar laststart=CHAR_SPACE;
697 unsigned int lastlen=0,lastblen=0;
698 long spline=0,nspline=0;
699 static struct first_pass_results results={0};
702 lines=g_strsplit(etext,"\n",0);
703 for (j=0;lines[j];j++)
/* Strip trailing carriage returns, then measure the line in characters. */
705 lbytes=strlen(lines[j]);
706 while (lbytes>0 && lines[j][lbytes-1]=='\r')
707 lines[j][--lbytes]='\0';
708 llen=g_utf8_strlen(lines[j],lbytes);
/* Old-style header end marker ("*END ... SMALL PRINT ..."). */
710 if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
711 (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
714 g_print(" --> Duplicate header?\n");
715 spline=linecnt+1; /* first line of non-header text, that is */
/* New-style header marker ("*** START ... PROJECT GUTENBERG"). */
717 if (!strncmp(lines[j],"*** START",9) &&
718 strstr(lines[j],"PROJECT GUTENBERG"))
721 g_print(" --> Duplicate header?\n");
722 nspline=linecnt+1; /* first line of non-header text, that is */
/* Once a header has been seen, look for the footer line. */
724 if (spline || nspline)
726 lc_line=g_utf8_strdown(lines[j],lbytes);
727 if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
729 if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
731 if (results.footerline)
733 /* it's an old-form header - we can detect duplicates */
735 g_print(" --> Duplicate footer?\n");
738 results.footerline=linecnt;
744 results.firstline=spline;
746 results.firstline=nspline; /* override with new */
747 if (results.footerline)
748 continue; /* don't count the boilerplate in the footer */
749 results.totlen+=llen;
/* Per-character statistics for this line. */
750 for (s=lines[j];*s;s=g_utf8_next_char(s))
752 if (g_utf8_get_char(s)>127)
754 if (g_unichar_isalpha(g_utf8_get_char(s)))
758 if (CHAR_IS_DQUOTE(g_utf8_get_char(s)))
759 qc=QUOTE_CLASS(g_utf8_get_char(s));
/*
 * The preceding character is a Unicode code point, so it is classified
 * with g_unichar_isalpha(); <ctype.h> isalpha() is only defined for
 * unsigned char values and EOF.
 */
762 if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) &&
763 g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
764 results.endquote_count++;
767 if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
768 lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
771 g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
773 if (strstr(lines[j],".,"))
775 /* only count asterisk lines for ignoring purposes where there is */
776 /* lowercase text on the line */
777 if (strchr(lines[j],'*'))
779 for (s=lines[j];*s;s=g_utf8_next_char(s))
780 if (g_unichar_islower(g_utf8_get_char(s)))
785 if (strchr(lines[j],'/'))
786 results.fslashline++;
/* Walk back over trailing white space, then test for a single hyphen at line end. */
789 for (s=g_utf8_prev_char(lines[j]+lbytes);
790 s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;
791 s=g_utf8_prev_char(s))
793 if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
794 g_utf8_get_char(g_utf8_prev_char(s))!='-')
797 if (llen>LONGEST_PG_LINE)
799 if (llen>WAY_TOO_LONG)
800 results.verylongline++;
/* A '<' and a '>' on the same line count as possible HTML. */
801 if (strchr(lines[j],'<') && strchr(lines[j],'>'))
803 i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
806 if (strstr(lines[j],"<i>"))
807 results.htmcount+=4; /* bonus marks! */
809 /* Check for spaced em-dashes */
810 if (lines[j][0] && (s=strstr(g_utf8_next_char(lines[j]),"--")))
813 if (s[-1]==CHAR_SPACE || s[2]==CHAR_SPACE)
814 results.space_emdash++;
815 if (s[-1]==CHAR_SPACE && s[2]==CHAR_SPACE)
816 /* count of em-dashes with spaces both sides */
817 results.non_PG_space_emdash++;
818 if (s[-1]!=CHAR_SPACE && s[2]!=CHAR_SPACE)
819 /* count of PG-type em-dashes with no spaces */
820 results.PG_space_emdash++;
/* Marker words used for language guessing and for standalone 0/1 detection. */
825 if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))
826 results.Dutchcount++;
827 if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))
828 results.Frenchcount++;
829 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
830 results.standalone_digit++;
833 /* Check for spaced dashes */
834 if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
838 laststart=lines[j][0];
847 * Make some snap decisions based on the first pass results.
/*
 * @results: statistics gathered by first_pass().
 *
 * Prints a note for each category the counts show to be too frequent to be
 * worth reporting, switches markup mode on when the text looks like HTML,
 * and records the decisions in a function-static struct warnings.
 * NOTE(review): the return statement is not visible; presumably the address
 * of that static struct is returned.
 */
849 struct warnings *report_first_pass(struct first_pass_results *results)
851 static struct warnings warnings={0};
853 g_print(" --> %ld lines in this file have white space at end\n",
856 if (results->dotcomma>5)
859 g_print(" --> %ld lines in this file contain '.,'. "
860 "Not reporting them.\n",results->dotcomma);
863 * If more than 50 lines, or one-tenth, are short,
864 * don't bother reporting them.
866 warnings.shortline=1;
867 if (results->shortline>50 || results->shortline*10>linecnt)
869 warnings.shortline=0;
870 g_print(" --> %ld lines in this file are short. "
871 "Not reporting short lines.\n",results->shortline);
874 * If more than 50 lines, or one-tenth, are long,
875 * don't bother reporting them.
878 if (results->longline>50 || results->longline*10>linecnt)
881 g_print(" --> %ld lines in this file are long. "
882 "Not reporting long lines.\n",results->longline);
884 /* If more than 10 lines contain asterisks, don't bother reporting them. */
886 if (results->astline>10)
889 g_print(" --> %ld lines in this file contain asterisks. "
890 "Not reporting them.\n",results->astline);
893 * If more than 10 lines contain forward slashes,
894 * don't bother reporting them.
897 if (results->fslashline>10)
900 g_print(" --> %ld lines in this file contain forward slashes. "
901 "Not reporting them.\n",results->fslashline);
904 * If more than 20 lines contain unpunctuated endquotes,
905 * don't bother reporting them.
908 if (results->endquote_count>20)
911 g_print(" --> %ld lines in this file contain unpunctuated endquotes. "
912 "Not reporting them.\n",results->endquote_count);
915 * If more than 15 lines contain standalone digits,
916 * don't bother reporting them.
/* NOTE(review): the test below uses a threshold of 10, not the 15 stated above. */
919 if (results->standalone_digit>10)
922 g_print(" --> %ld lines in this file contain standalone 0s and 1s. "
923 "Not reporting them.\n",results->standalone_digit);
926 * If more than 20 lines contain hyphens at end,
927 * don't bother reporting them.
930 if (results->hyphens>20)
933 g_print(" --> %ld lines in this file have hyphens at end. "
934 "Not reporting them.\n",results->hyphens);
/* Enough markup-like lines and markup mode not already on: turn it on. */
936 if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
938 g_print(" --> Looks like this is HTML. Switching HTML mode ON.\n");
939 pswit[MARKUP_SWITCH]=1;
941 if (results->verylongline>0)
942 g_print(" --> %ld lines in this file are VERY long!\n",
943 results->verylongline);
945 * If there are more non-PG spaced dashes than PG em-dashes,
946 * assume it's deliberate.
947 * Current PG guidelines say don't use them, but older texts do,
948 * and some people insist on them whatever the guidelines say.
951 if (results->spacedash+results->non_PG_space_emdash>
952 results->PG_space_emdash)
955 g_print(" --> There are %ld spaced dashes and em-dashes. "
956 "Not reporting them.\n",
957 results->spacedash+results->non_PG_space_emdash);
959 /* If more than a quarter of characters are hi-bit, bug out. */
961 if (results->binlen*4>results->totlen)
963 g_print(" --> This file does not appear to be ASCII. "
964 "Terminating. Best of luck with it!\n");
/* Fewer than a quarter of the characters are alphabetic: not text. */
967 if (results->alphalen*4<results->totlen)
969 g_print(" --> This file does not appear to be text. "
970 "Terminating. Best of luck with it!\n");
/* More than 1% of the characters, or more than 100 in all, are above 127. */
973 if (results->binlen*100>results->totlen || results->binlen>100)
975 g_print(" --> There are a lot of foreign letters here. "
976 "Not reporting them.\n");
/* Language guesses from the marker-word counts. */
979 warnings.isDutch=FALSE;
980 if (results->Dutchcount>50)
982 warnings.isDutch=TRUE;
983 g_print(" --> This looks like Dutch - "
984 "switching off dashes and warnings for 's Middags case.\n");
986 warnings.isFrench=FALSE;
987 if (results->Frenchcount>50)
989 warnings.isFrench=TRUE;
990 g_print(" --> This looks like French - "
991 "switching off some doublepunct.\n");
993 if (results->firstline && results->footerline)
994 g_print(" The PG header and footer appear to be already on.\n");
997 if (results->firstline)
998 g_print(" The PG header is on - no footer.\n");
999 if (results->footerline)
1000 g_print(" The PG footer is on - no header.\n");
/* Verbose mode switches the warnings set here back on. */
1003 if (pswit[VERBOSE_SWITCH])
1006 warnings.shortline=1;
1007 warnings.dotcomma=1;
1008 warnings.longline=1;
1014 warnings.endquote=1;
1015 g_print(" *** Verbose output is ON -- you asked for it! ***\n");
1017 if (warnings.isDutch)
/* Header and footer fewer than 100 lines apart: treat the markers as unreliable. */
1019 if (results->footerline>0 && results->firstline>0 &&
1020 results->footerline>results->firstline &&
1021 results->footerline-results->firstline<100)
1023 g_print(" --> I don't really know where this text starts. \n");
1024 g_print(" There are no reference points.\n");
1025 g_print(" I'm going to have to report the header and footer "
1027 results->firstline=0;
1035 * Look along the line, accumulate the count of quotes, and see
1036 * if this is an empty line - i.e. a line with nothing on it
1038 * If line has just spaces, period, * and/or - on it, don't
1039 * count it, since empty lines with asterisks or dashes to
1040 * separate sections are common.
1042 * Returns: TRUE if the line is empty.
/*
 * @aline: the line to scan (UTF-8).
 * @linecnt: line number used in messages. This int parameter shadows the
 *           global long of the same name, so it is widened to long where
 *           it is printed with %ld.
 * @counters: quote, bracket and underscore tallies updated while scanning.
 */
1044 gboolean analyse_quotes(const char *aline,int linecnt,struct counters *counters)
1047 /* assume the line is empty until proven otherwise */
1048 gboolean isemptyline=TRUE;
1049 const char *s=aline,*sprev,*snext;
1052 GError *tmp_err=NULL;
1055 snext=g_utf8_next_char(s);
1056 c=g_utf8_get_char(s);
1057 if (CHAR_IS_DQUOTE(c))
1058 (void)count_quote(counters,c,QUOTE_CLASS(c),&tmp_err);
/* Single quotes are only examined when the squote switch is on. */
1059 else if (CHAR_IS_SQUOTE(c) && pswit[SQUOTE_SWITCH])
1064 * At start of line, it can only be a quotation mark.
1065 * Hardcode a very common exception!
1067 if (!g_str_has_prefix(snext,"tis") &&
1068 !g_str_has_prefix(snext,"Tis"))
1069 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
1071 else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
1072 g_unichar_isalpha(g_utf8_get_char(snext)))
1073 /* Do nothing! it's definitely an apostrophe, not a quote */
1075 /* it's outside a word - let's check it out */
1076 else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||
1077 g_unichar_isalpha(g_utf8_get_char(snext)))
1079 /* certainly looks like a quotation mark */
1080 if (!g_str_has_prefix(snext,"tis") &&
1081 !g_str_has_prefix(snext,"Tis"))
1082 /* hardcode a very common exception! */
1084 if (strchr(".?!,;:",g_utf8_get_char(sprev)))
1085 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
1087 (void)count_quote(counters,c,OPENING_QUOTE,&tmp_err);
1092 /* now - is it a quotation mark? */
1093 guessquote=0; /* accumulate clues */
1094 if (g_unichar_isalpha(g_utf8_get_char(sprev)))
1096 /* it follows a letter - could be either */
1098 if (g_utf8_get_char(sprev)=='s')
1100 /* looks like a plural apostrophe */
1102 if (g_utf8_get_char(snext)==CHAR_SPACE)
1106 if (innermost_quote_matches(counters,c))
1108 * Give it the benefit of some doubt,
1109 * if a squote is already open.
1115 (void)count_quote(counters,c,CLOSING_QUOTE,&tmp_err);
1118 /* no adjacent letter - it must be a quote of some kind */
1119 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
/* Report the error left in tmp_err, then clear it. */
1124 if (pswit[ECHO_SWITCH])
1125 g_print("\n%s\n",aline);
1126 if (!pswit[OVERVIEW_SWITCH])
1127 g_print(" Line %ld column %ld - %s\n",
1128 (long)linecnt,g_utf8_pointer_to_offset(aline,s)+1,tmp_err->message);
1129 g_clear_error(&tmp_err);
1131 if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
1133 isemptyline=FALSE; /* ignore lines like * * * as spacers */
1134 if (c==CHAR_UNDERSCORE)
1135 counters->c_unders++;
/* A "[Illustration:" opener at the start of the line is tallied under its own counter. */
1136 if (c==CHAR_OPEN_SBRACK)
1138 if (!matching_difference(counters,COUNTER_ILLUSTRATION) &&
1139 !matching_difference(counters,c) && s==aline &&
1140 g_str_has_prefix(s,"[Illustration:"))
1141 increment_matching(counters,COUNTER_ILLUSTRATION,TRUE);
1143 increment_matching(counters,c,TRUE);
1145 else if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK)
1146 increment_matching(counters,c,TRUE);
/* A ']' that ends the line may be tallied under the illustration counter. */
1147 if (c==CHAR_CLOSE_SBRACK)
1149 if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) &&
1150 !matching_difference(counters,c) && !*snext)
1151 increment_matching(counters,COUNTER_ILLUSTRATION,FALSE);
1153 increment_matching(counters,c,FALSE);
1155 else if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK)
1156 increment_matching(counters,c,FALSE);
1164 * check_for_control_characters:
1166 * Check for invalid or questionable characters in the line
1167 * Anything above 127 is invalid for plain ASCII, and
1168 * non-printable control characters should also be flagged.
1169 * Tabs should generally not be there.
/*
 * @aline: the line to scan (UTF-8).
 *
 * Reports each character below CHAR_SPACE other than LF, CR and TAB. The
 * column is the character offset of the offender from the start of @aline,
 * plus one.
 */
1171 void check_for_control_characters(const char *aline)
1175 for (s=aline;*s;s=g_utf8_next_char(s))
1177 c=g_utf8_get_char(s);
1178 if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
1180 if (pswit[ECHO_SWITCH])
1181 g_print("\n%s\n",aline);
1182 if (!pswit[OVERVIEW_SWITCH])
/* g_utf8_pointer_to_offset() takes the string start first, then the position. */
1183 g_print(" Line %ld column %ld - Control character %u\n",
1184 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1192 * check_for_odd_characters:
1194 * Check for binary and other odd characters.
1196 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
1197 gboolean isemptyline)
1199 /* Don't repeat multiple warnings on one line. */
1200 gboolean eNon_A=FALSE,eTab=FALSE,eTilde=FALSE;
1201 gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
1204 for (s=aline;*s;s=g_utf8_next_char(s))
1206 c=g_utf8_get_char(s);
1207 if (!eNon_A && (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
1209 if (pswit[ECHO_SWITCH])
1210 g_print("\n%s\n",aline);
1211 if (!pswit[OVERVIEW_SWITCH])
1212 if (c>127 && c<160 || c>255)
1213 g_print(" Line %ld column %ld - "
1214 "Non-ISO-8859 character %u\n",
1215 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1217 g_print(" Line %ld column %ld - "
1218 "Non-ASCII character %u\n",
1219 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1224 if (!eTab && c==CHAR_TAB)
1226 if (pswit[ECHO_SWITCH])
1227 g_print("\n%s\n",aline);
1228 if (!pswit[OVERVIEW_SWITCH])
1229 g_print(" Line %ld column %ld - Tab character?\n",
1230 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1235 if (!eTilde && c==CHAR_TILDE)
1238 * Often used by OCR software to indicate an
1239 * unrecognizable character.
1241 if (pswit[ECHO_SWITCH])
1242 g_print("\n%s\n",aline);
1243 if (!pswit[OVERVIEW_SWITCH])
1244 g_print(" Line %ld column %ld - Tilde character?\n",
1245 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1250 if (!eCarat && c==CHAR_CARAT)
1252 if (pswit[ECHO_SWITCH])
1253 g_print("\n%s\n",aline);
1254 if (!pswit[OVERVIEW_SWITCH])
1255 g_print(" Line %ld column %ld - Carat character?\n",
1256 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1261 if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
1263 if (pswit[ECHO_SWITCH])
1264 g_print("\n%s\n",aline);
1265 if (!pswit[OVERVIEW_SWITCH])
1266 g_print(" Line %ld column %ld - Forward slash?\n",
1267 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1273 * Report asterisks only in paranoid mode,
1274 * since they're often deliberate.
1276 if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
1279 if (pswit[ECHO_SWITCH])
1280 g_print("\n%s\n",aline);
1281 if (!pswit[OVERVIEW_SWITCH])
1282 g_print(" Line %ld column %ld - Asterisk?\n",
1283 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1292 * check_for_long_line:
1294 * Check for line too long.
1296 void check_for_long_line(const char *aline)
1298 if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
1300 if (pswit[ECHO_SWITCH])
1301 g_print("\n%s\n",aline);
1302 if (!pswit[OVERVIEW_SWITCH])
1303 g_print(" Line %ld column %ld - Long line %ld\n",
1304 linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
1311 * check_for_short_line:
1313 * Check for line too short.
1315 * This one is a bit trickier to implement: we don't want to
1316 * flag the last line of a paragraph for being short, so we
1317 * have to wait until we know that our current line is a
1318 * "normal" line, then report the _previous_ line if it was too
1319 * short. We also don't want to report indented lines like
1320 * chapter heads or formatted quotations. We therefore keep
1321 * last->len as the length of the last line examined, and
1322 * last->blen as the length of the last but one, and try to
1323 * suppress unnecessary warnings by checking that both were of
1324 * "normal" length. We keep the first character of the last
1325 * line in last->start, and if it was a space, we assume that
1326 * the formatting is deliberate. I can't figure out a way to
1327 * distinguish something like a quoted verse left-aligned or
1328 * the header or footer of a letter from a paragraph of short
1329 * lines - maybe if I examined the whole paragraph, and if the
1330 * para has less than, say, 8 lines and if all lines are short,
1331 * then just assume it's OK? Need to look at some texts to see
1332 * how often a formula like this would get the right result.
1334 void check_for_short_line(const char *aline,const struct line_properties *last)
1336 if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
1337 last->len<SHORTEST_PG_LINE && last->blen>1 &&
1338 last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
1340 if (pswit[ECHO_SWITCH])
1341 g_print("\n%s\n",prevline);
1342 if (!pswit[OVERVIEW_SWITCH])
1343 g_print(" Line %ld column %ld - Short line %ld?\n",
1344 linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
1351 * check_for_starting_punctuation:
1353 * Look for punctuation other than full ellipses at start of line.
1355 void check_for_starting_punctuation(const char *aline)
1357 if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
1358 !g_str_has_prefix(aline,". . ."))
1360 if (pswit[ECHO_SWITCH])
1361 g_print("\n%s\n",aline);
1362 if (!pswit[OVERVIEW_SWITCH])
1363 g_print(" Line %ld column 1 - Begins with punctuation?\n",
1371 * check_for_spaced_emdash:
1373 * Check for spaced em-dashes.
1375 * We must check _all_ occurrences of "--" on the line
1376 * hence the loop - even if the first double-dash is OK
1377 * there may be another that's wrong later on.
1379 void check_for_spaced_emdash(const char *aline)
1381 const char *s,*t,*next;
1382 for (s=aline;t=strstr(s,"--");s=next)
1384 next=g_utf8_next_char(g_utf8_next_char(t));
1385 if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
1386 g_utf8_get_char(next)==CHAR_SPACE)
1388 if (pswit[ECHO_SWITCH])
1389 g_print("\n%s\n",aline);
1390 if (!pswit[OVERVIEW_SWITCH])
1391 g_print(" Line %ld column %ld - Spaced em-dash?\n",
1392 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1400 * check_for_spaced_dash:
1402 * Check for spaced dashes.
1404 void check_for_spaced_dash(const char *aline)
1407 if ((s=strstr(aline," -")))
1409 if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
1411 if (pswit[ECHO_SWITCH])
1412 g_print("\n%s\n",aline);
1413 if (!pswit[OVERVIEW_SWITCH])
1414 g_print(" Line %ld column %ld - Spaced dash?\n",
1415 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1420 else if ((s=strstr(aline,"- ")))
1422 if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
1424 if (pswit[ECHO_SWITCH])
1425 g_print("\n%s\n",aline);
1426 if (!pswit[OVERVIEW_SWITCH])
1427 g_print(" Line %ld column %ld - Spaced dash?\n",
1428 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1436 * check_for_unmarked_paragraphs:
1438 * Check for unmarked paragraphs indicated by separate speakers.
1440 * May well be false positive:
1441 * "Bravo!" "Wonderful!" called the crowd.
1442 * but useful all the same.
1444 void check_for_unmarked_paragraphs(const char *aline)
1447 s=strstr(aline,"\" \"");
1449 s=strstr(aline,"\" \"");
1452 if (pswit[ECHO_SWITCH])
1453 g_print("\n%s\n",aline);
1454 if (!pswit[OVERVIEW_SWITCH])
1455 g_print(" Line %ld column %ld - "
1456 "Query missing paragraph break?\n",
1457 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1464 * check_for_jeebies:
1466 * Check for "to he" and other easy h/b errors.
1468 * This is a very inadequate effort on the h/b problem,
1469 * but the phrase "to he" is always an error, whereas "to
1470 * be" is quite common.
1471 * Similarly, '"Quiet!", be said.' is a non-be error
1472 * "to he" is _not_ always an error!:
1473 * "Where they went to he couldn't say."
1474 * Another false positive:
1475 * What would "Cinderella" be without the . . .
1476 * and another: "If he wants to he can see for himself."
1478 void check_for_jeebies(const char *aline)
1481 s=strstr(aline," be could ");
1483 s=strstr(aline," be would ");
1485 s=strstr(aline," was be ");
1487 s=strstr(aline," be is ");
1489 s=strstr(aline," is be ");
1491 s=strstr(aline,"\", be ");
1493 s=strstr(aline,"\" be ");
1495 s=strstr(aline,"\" be ");
1497 s=strstr(aline," to he ");
1500 if (pswit[ECHO_SWITCH])
1501 g_print("\n%s\n",aline);
1502 if (!pswit[OVERVIEW_SWITCH])
1503 g_print(" Line %ld column %ld - Query he/be error?\n",
1504 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1508 s=strstr(aline," the had ");
1510 s=strstr(aline," a had ");
1512 s=strstr(aline," they bad ");
1514 s=strstr(aline," she bad ");
1516 s=strstr(aline," he bad ");
1518 s=strstr(aline," you bad ");
1520 s=strstr(aline," i bad ");
1523 if (pswit[ECHO_SWITCH])
1524 g_print("\n%s\n",aline);
1525 if (!pswit[OVERVIEW_SWITCH])
1526 g_print(" Line %ld column %ld - Query had/bad error?\n",
1527 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1531 s=strstr(aline,"; hut ");
1533 s=strstr(aline,", hut ");
1536 if (pswit[ECHO_SWITCH])
1537 g_print("\n%s\n",aline);
1538 if (!pswit[OVERVIEW_SWITCH])
1539 g_print(" Line %ld column %ld - Query hut/but error?\n",
1540 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1547 * check_for_mta_from:
1549 * Special case - angled bracket in front of "From" placed there by an
1550 * MTA when sending an e-mail.
1552 void check_for_mta_from(const char *aline)
1555 s=strstr(aline,">From");
1558 if (pswit[ECHO_SWITCH])
1559 g_print("\n%s\n",aline);
1560 if (!pswit[OVERVIEW_SWITCH])
1561 g_print(" Line %ld column %ld - "
1562 "Query angled bracket with From\n",
1563 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1570 * check_for_orphan_character:
1572 * Check for a single character line -
1573 * often an overflow from bad wrapping.
1575 void check_for_orphan_character(const char *aline)
1578 c=g_utf8_get_char(aline);
1579 if (c && !*g_utf8_next_char(aline))
1581 if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
1582 ; /* Nothing - ignore numerals alone on a line. */
1585 if (pswit[ECHO_SWITCH])
1586 g_print("\n%s\n",aline);
1587 if (!pswit[OVERVIEW_SWITCH])
1588 g_print(" Line %ld column 1 - Query single character line\n",
1597 * check_for_pling_scanno:
1599 * Check for I" - often should be !
1601 void check_for_pling_scanno(const char *aline)
1604 s=strstr(aline," I\"");
1607 if (pswit[ECHO_SWITCH])
1608 g_print("\n%s\n",aline);
1609 if (!pswit[OVERVIEW_SWITCH])
1610 g_print(" Line %ld column %ld - Query I=exclamation mark?\n",
1611 linecnt,g_utf8_pointer_to_offset(aline,s));
1618 * check_for_extra_period:
1620 * Check for period without a capital letter. Cut-down from gutspell.
1621 * Only works when it happens on a single line.
/*
 * Reviewer notes for check_for_extra_period:
 * The check runs only when PARANOID_SWITCH is set.  Each ". " located by
 * strstr() is examined, and an "Extra period?" query is printed for
 * position t when the tests below allow it.  The word involved (testword)
 * is stored in the qperiod tree; unless VERBOSE_SWITCH is set, a word
 * already present in that tree is not queried again.
 * NOTE(review): the listing this block comes from omits short lines
 * (braces, else/continue/break, flag assignments), so the exact control
 * flow between the visible tests cannot be confirmed from here.
 */
1623 void check_for_extra_period(const char *aline,const struct warnings *warnings)
1625 const char *s,*t,*s1,*sprev;
1630 gunichar c,nc,pc,*decomposition;
1631 if (pswit[PARANOID_SWITCH])
1633 for (t=aline;t=strstr(t,". ");)
1637 t=g_utf8_next_char(t);
1638 /* start of line punctuation is handled elsewhere */
1641 if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
1643 t=g_utf8_next_char(t);
1646 if (warnings->isDutch)
1648 /* For Frank & Jeroen -- 's Middags case */
1649 gunichar c2,c3,c4,c5;
/*
 * NOTE(review): the characters 2 to 5 positions after t are fetched with
 * no length check visible here; confirm the line cannot end sooner.
 */
1650 c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
1651 c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
1652 c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
1653 c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
1654 if (CHAR_IS_APOSTROPHE(c2) &&
1655 g_unichar_islower(c3) && c4==CHAR_SPACE &&
1656 g_unichar_isupper(c5))
1658 t=g_utf8_next_char(t);
1662 s1=g_utf8_next_char(g_utf8_next_char(t));
/*
 * Skip forward to the next letter or digit after the period.
 * NOTE(review): isdigit() is handed a gunichar here; <ctype.h> functions
 * are only defined for unsigned char values and EOF.  g_unichar_isdigit()
 * is what the backward scan further down uses.
 */
1663 while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
1664 !isdigit(g_utf8_get_char(s1)))
1665 s1=g_utf8_next_char(s1);
1666 if (g_unichar_islower(g_utf8_get_char(s1)))
1668 /* we have something to investigate */
1670 /* so let's go back and find out */
1671 nc=g_utf8_get_char(t);
1672 s1=g_utf8_prev_char(t);
1673 c=g_utf8_get_char(s1);
1674 sprev=g_utf8_prev_char(s1);
1675 pc=g_utf8_get_char(sprev);
1677 (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||
1678 g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&
1679 g_unichar_isalpha(nc)))
1684 sprev=g_utf8_prev_char(s1);
1685 pc=g_utf8_get_char(sprev);
1687 s1=g_utf8_next_char(s1);
/* testword is a freshly allocated copy of the word starting at s1. */
1690 testword=g_strndup(s1,s-s1);
1692 testword=g_strdup(s1);
/*
 * testword is compared with the abbrev list (which ends at an empty
 * string), then tested for a leading digit, for being a single character
 * and for being a Roman numeral.
 */
1693 for (i=0;*abbrev[i];i++)
1694 if (!strcmp(testword,abbrev[i]))
1696 if (g_unichar_isdigit(g_utf8_get_char(testword)))
1698 if (!*g_utf8_next_char(testword))
1700 if (isroman(testword))
/*
 * Each character is canonically decomposed and the first code point of
 * the decomposition is looked up in "aeiou"; the decomposition buffer is
 * freed again for every character.
 */
1705 for (s=testword;*s;s=g_utf8_next_char(s))
1707 decomposition=g_unicode_canonical_decomposition(
1708 g_utf8_get_char(s),&len);
1709 if (g_utf8_strchr("aeiou",-1,decomposition[0]))
1711 g_free(decomposition);
1715 (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
1717 g_tree_insert(qperiod,g_strdup(testword),
1718 GINT_TO_POINTER(1));
1719 if (pswit[ECHO_SWITCH])
1720 g_print("\n%s\n",aline);
1721 if (!pswit[OVERVIEW_SWITCH])
1722 g_print(" Line %ld column %ld - Extra period?\n",
1723 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1729 t=g_utf8_next_char(t);
1735 * check_for_following_punctuation:
1737 * Check for words usually not followed by punctuation.
/*
 * Reviewer notes for check_for_following_punctuation:
 * Active only when TYPO_SWITCH is set.  A lower-cased copy of a word
 * (inword, made with g_utf8_strdown) is compared with the nocomma and
 * noperiod lists, both of which end at an empty string.  A nocomma word
 * is queried when the character at s is , ; or :  and a noperiod word
 * when it is . or !
 * NOTE(review): the loop that walks the line, and the last argument of
 * each g_print call, sit on lines missing from this listing; presumably
 * every word of the line is tested and s points just past it - confirm.
 */
1739 void check_for_following_punctuation(const char *aline)
1742 const char *s,*wordstart;
1745 if (pswit[TYPO_SWITCH])
1756 inword=g_utf8_strdown(t,-1);
/* Words that should not be followed by a comma, semicolon or colon. */
1758 for (i=0;*nocomma[i];i++)
1759 if (!strcmp(inword,nocomma[i]))
1761 c=g_utf8_get_char(s);
1762 if (c==',' || c==';' || c==':')
1764 if (pswit[ECHO_SWITCH])
1765 g_print("\n%s\n",aline);
1766 if (!pswit[OVERVIEW_SWITCH])
1767 g_print(" Line %ld column %ld - "
1768 "Query punctuation after %s?\n",
1769 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
/* Words that should not be followed by a period or exclamation mark. */
1775 for (i=0;*noperiod[i];i++)
1776 if (!strcmp(inword,noperiod[i]))
1778 c=g_utf8_get_char(s);
1779 if (c=='.' || c=='!')
1781 if (pswit[ECHO_SWITCH])
1782 g_print("\n%s\n",aline);
1783 if (!pswit[OVERVIEW_SWITCH])
1784 g_print(" Line %ld column %ld - "
1785 "Query punctuation after %s?\n",
1786 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
1800 * Check for commonly mistyped words,
1801 * and digits like 0 for O in a word.
/*
 * Reviewer notes for check_for_typos:
 * Words are taken from the line with getaword().  A word for which
 * mixdigit() is true is queried as "Query digit in ...".  When TYPO_SWITCH
 * or USERTYPO_SWITCH is set, the case-folded word (testword) goes through
 * the tests visible below: unlikely word starts and endings (nostart,
 * noend), unlikely letter groups, no vowels or no consonants, the okword
 * and typo lists, Roman numerals and single-letter special cases.  The
 * qword tree maps each queried word to an int counter (dupcnt).
 * NOTE(review): the listing this block comes from omits short lines
 * (braces, else/break, assignments such as the istypo updates), so how
 * the visible tests combine cannot be confirmed from here.
 */
1803 void check_for_typos(const char *aline,struct warnings *warnings)
1805 const char *s,*t,*nt,*wordstart;
1807 gunichar *decomposition;
1809 int i,vowel,consonant,*dupcnt;
1810 gboolean isdup,istypo,alower;
1813 gsize decomposition_len;
1817 inword=getaword(&s);
1821 continue; /* don't bother with empty lines */
1823 if (mixdigit(inword))
1825 if (pswit[ECHO_SWITCH])
1826 g_print("\n%s\n",aline);
1827 if (!pswit[OVERVIEW_SWITCH])
1828 g_print(" Line %ld column %ld - Query digit in %s\n",
1829 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
1834 * Put the word through a series of tests for likely typos and OCR
1837 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
1841 for (t=inword;*t;t=g_utf8_next_char(t))
1843 c=g_utf8_get_char(t);
1844 nt=g_utf8_next_char(t);
1845 /* lowercase for testing */
1846 if (g_unichar_islower(c))
1848 if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
1851 * We have an uppercase mid-word. However, there are
1853 * Mac and Mc like McGill
1854 * French contractions like l'Abbe
1856 offset=g_utf8_pointer_to_offset(inword,t);
1858 pc=g_utf8_get_char(g_utf8_prev_char(t));
1861 if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
1862 offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
1863 g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
1864 CHAR_IS_APOSTROPHE(pc))
/* From here on the tests use a case-folded, newly allocated copy. */
1870 testword=g_utf8_casefold(inword,-1);
1872 if (pswit[TYPO_SWITCH])
1875 * Check for certain unlikely two-letter combinations at word
1878 len=g_utf8_strlen(testword,-1);
1881 for (i=0;*nostart[i];i++)
1882 if (g_str_has_prefix(testword,nostart[i]))
1884 for (i=0;*noend[i];i++)
1885 if (g_str_has_suffix(testword,noend[i]))
1888 /* ght is common, gbt never. Like that. */
1889 if (strstr(testword,"cb"))
1891 if (strstr(testword,"gbt"))
1893 if (strstr(testword,"pbt"))
1895 if (strstr(testword,"tbs"))
1897 if (strstr(testword,"mrn"))
1899 if (strstr(testword,"ahle"))
1901 if (strstr(testword,"ihle"))
1904 * "TBE" does happen - like HEARTBEAT - but uncommon.
1905 * Also "TBI" - frostbite, outbid - but uncommon.
1906 * Similarly "ii" like Hawaii, or Pompeii, and in Roman
1907 * numerals, but "ii" is a common scanno.
1909 if (strstr(testword,"tbi"))
1911 if (strstr(testword,"tbe"))
1913 if (strstr(testword,"ii"))
1916 * Check for no vowels or no consonants.
1917 * If none, flag a typo.
1919 if (!istypo && len>1)
1922 for (t=testword;*t;t=g_utf8_next_char(t))
1924 c=g_utf8_get_char(t);
1926 g_unicode_canonical_decomposition(c,&decomposition_len);
1927 if (c=='y' || g_unichar_isdigit(c))
1929 /* Yah, this is loose. */
1933 else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
1937 g_free(decomposition);
1939 if (!vowel || !consonant)
1943 * Now exclude the word from being reported if it's in
1946 for (i=0;*okword[i];i++)
1947 if (!strcmp(testword,okword[i]))
1950 * What looks like a typo may be a Roman numeral.
1953 if (istypo && isroman(testword))
1955 /* Check the manual list of typos. */
1957 for (i=0;*typo[i];i++)
1958 if (!strcmp(testword,typo[i]))
1961 * Check lowercase s, l, i and m - special cases.
1962 * "j" - often a semi-colon gone wrong.
1963 * "d" for a missing apostrophe - he d
1966 if (!istypo && len==1 &&
1967 g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
/* Look the word up in qword to see whether it has been queried before. */
1971 dupcnt=g_tree_lookup(qword,testword);
1975 isdup=!pswit[VERBOSE_SWITCH];
1979 dupcnt=g_new0(int,1);
1980 g_tree_insert(qword,g_strdup(testword),dupcnt);
1985 if (pswit[ECHO_SWITCH])
1986 g_print("\n%s\n",aline);
1987 if (!pswit[OVERVIEW_SWITCH])
1989 g_print(" Line %ld column %ld - Query word %s",
1990 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
1992 if (!pswit[VERBOSE_SWITCH])
1993 g_print(" - not reporting duplicates");
2001 /* check the user's list of typos */
/*
 * NOTE(review): this query and the standalone 0/1 query further down
 * print the offset of wordstart +2, whereas "Query digit" and "Query
 * word" above print +1 for the same pointer - confirm which is intended.
 */
2002 if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
2004 if (pswit[ECHO_SWITCH])
2005 g_print("\n%s\n",aline);
2006 if (!pswit[OVERVIEW_SWITCH])
2007 g_print(" Line %ld column %ld - Query possible scanno %s\n",
2008 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
2010 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
2012 if (pswit[PARANOID_SWITCH] && warnings->digit)
2014 /* In paranoid mode, query all 0 and 1 standing alone. */
2015 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
2017 if (pswit[ECHO_SWITCH])
2018 g_print("\n%s\n",aline);
2019 if (!pswit[OVERVIEW_SWITCH])
2020 g_print(" Line %ld column %ld - Query standalone %s\n",
2021 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
2032 * check_for_misspaced_punctuation:
2034 * Look for added or missing spaces around punctuation and quotes.
2035 * If there is a punctuation character like ! with no space on
2036 * either side, suspect a missing!space. If there are spaces on
2037 * both sides , assume a typo. If we see a double quote with no
2038 * space or punctuation on either side of it, assume unspaced
2039 * quotes "like"this.
/*
 * Reviewer notes for check_for_misspaced_punctuation:
 * The line is scanned several times, each scan keeping the previous,
 * current and next character in pc, c and nc:
 *   1. punctuation with no space where one is expected ("Missing
 *      space?") or with space on both sides ("Spaced punctuation?");
 *   2. ? ! , ; : preceded by a space but not followed by one;
 *   3. a period preceded by a space and followed by a letter;
 *   4. double quotes with no space or punctuation around them;
 *   5. double-quote spacing checked against the running parity kept in
 *      parities->dquote;
 *   6. a double quote in column 1 followed by closing punctuation or space;
 *   7. the same parity check for single quotes, when SQUOTE_SWITCH is set.
 * NOTE(review): the listing this block comes from omits short lines
 * (braces, pc=c/c=nc updates, isacro/isellipsis assignments, else
 * branches), so the exact control flow cannot be confirmed from here.
 */
2041 void check_for_misspaced_punctuation(const char *aline,
2042 struct parities *parities,gboolean isemptyline)
2044 gboolean isacro,isellipsis;
2046 gunichar c,nc,pc,n2c;
2048 c=g_utf8_get_char(aline);
2049 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2050 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2054 nc=g_utf8_get_char(g_utf8_next_char(s));
2055 /* For each character in the line after the first. */
2056 if (g_utf8_strchr(".?!,;:_",-1,c)) /* if it's punctuation */
2058 /* we need to suppress warnings for acronyms like M.D. */
2060 /* we need to suppress warnings for ellipsis . . . */
2063 * If there are letters on both sides of it or
2064 * if it's strict punctuation followed by an alpha.
2066 if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
2067 g_utf8_strchr("?!,;:",-1,c)))
2071 if (g_utf8_pointer_to_offset(aline,s)>2 &&
2072 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
2074 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
2080 if (pswit[ECHO_SWITCH])
2081 g_print("\n%s\n",aline);
2082 if (!pswit[OVERVIEW_SWITCH])
2083 g_print(" Line %ld column %ld - Missing space?\n",
2084 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2089 if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
2092 * If there are spaces on both sides,
2093 * or space before and end of line.
2097 if (g_utf8_pointer_to_offset(aline,s)>2 &&
2098 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
/*
 * NOTE(review): this branch is entered with nc==0 as well (end of line),
 * in which case stepping two characters on from s goes past the
 * terminator; confirm a guard exists on the lines not shown here.
 */
2100 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
2104 if (!isemptyline && !isellipsis)
2106 if (pswit[ECHO_SWITCH])
2107 g_print("\n%s\n",aline);
2108 if (!pswit[OVERVIEW_SWITCH])
2109 g_print(" Line %ld column %ld - "
2110 "Spaced punctuation?\n",linecnt,
2111 g_utf8_pointer_to_offset(aline,s)+1);
2118 /* Split out the characters that CANNOT be preceded by space. */
2119 c=g_utf8_get_char(aline);
2120 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2121 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2125 nc=g_utf8_get_char(g_utf8_next_char(s));
2126 /* for each character in the line after the first */
2127 if (g_utf8_strchr("?!,;:",-1,c))
2129 /* if it's punctuation that _cannot_ have a space before it */
2130 if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
2133 * If nc DOES == space,
2134 * it was already reported just above.
2136 if (pswit[ECHO_SWITCH])
2137 g_print("\n%s\n",aline);
2138 if (!pswit[OVERVIEW_SWITCH])
2139 g_print(" Line %ld column %ld - Spaced punctuation?\n",
2140 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2147 * Special case " .X" where X is any alpha.
2148 * This plugs a hole in the acronym code above.
2149 * Inelegant, but maintainable.
2151 c=g_utf8_get_char(aline);
2152 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2153 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2157 nc=g_utf8_get_char(g_utf8_next_char(s));
2158 /* for each character in the line after the first */
2161 /* if it's a period */
2162 if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
2165 * If the period follows a space and
2166 * is followed by a letter.
2168 if (pswit[ECHO_SWITCH])
2169 g_print("\n%s\n",aline);
2170 if (!pswit[OVERVIEW_SWITCH])
2171 g_print(" Line %ld column %ld - Spaced punctuation?\n",
2172 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Double quotes with neither space nor punctuation around them. */
2178 c=g_utf8_get_char(aline);
2179 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2180 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2184 nc=g_utf8_get_char(g_utf8_next_char(s));
2185 /* for each character in the line after the first */
2186 if (CHAR_IS_DQUOTE(c))
2188 if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
2189 !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
2190 !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
2192 if (pswit[ECHO_SWITCH])
2193 g_print("\n%s\n",aline);
2194 if (!pswit[OVERVIEW_SWITCH])
2195 g_print(" Line %ld column %ld - Unspaced quotes?\n",
2196 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2202 /* Check parity of quotes. */
2203 nc=g_utf8_get_char(aline);
2204 for (s=aline;*s;s=g_utf8_next_char(s))
2207 nc=g_utf8_get_char(g_utf8_next_char(s));
2208 if (CHAR_IS_DQUOTE(c))
/* Each double quote seen toggles parities->dquote. */
2212 parities->dquote=!parities->dquote;
2213 parity=parities->dquote;
2215 else if (c==CHAR_LD_QUOTE)
2222 if (!g_utf8_strchr("_-.'`/,;:!?)]} ",-1,nc))
2224 if (pswit[ECHO_SWITCH])
2225 g_print("\n%s\n",aline);
2226 if (!pswit[OVERVIEW_SWITCH])
2227 g_print(" Line %ld column %ld - "
2228 "Wrongspaced quotes?\n",
2229 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/*
 * NOTE(review): isdigit() is handed a gunichar here; <ctype.h> functions
 * are only defined for unsigned char values and EOF.
 */
2237 if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
2238 !g_utf8_strchr("_-/.'`([{$",-1,nc) || !nc)
2240 if (pswit[ECHO_SWITCH])
2241 g_print("\n%s\n",aline);
2242 if (!pswit[OVERVIEW_SWITCH])
2243 g_print(" Line %ld column %ld - "
2244 "Wrongspaced quotes?\n",
2245 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* A double quote in column 1 followed by closing punctuation or a space. */
2252 c=g_utf8_get_char(aline);
2253 if (CHAR_IS_DQUOTE(c))
2255 if (g_utf8_strchr(",;:!?)]} ",-1,
2256 g_utf8_get_char(g_utf8_next_char(aline))))
2258 if (pswit[ECHO_SWITCH])
2259 g_print("\n%s\n",aline);
2260 if (!pswit[OVERVIEW_SWITCH])
2261 g_print(" Line %ld column 1 - Wrongspaced quotes?\n",
2267 if (pswit[SQUOTE_SWITCH])
2269 nc=g_utf8_get_char(aline);
2270 for (s=aline;*s;s=g_utf8_next_char(s))
2273 nc=g_utf8_get_char(g_utf8_next_char(s));
/*
 * A single quote is treated as a quote mark (toggling parities->squote)
 * when it starts the line, or the character before it is not a letter,
 * or the character after it is not a letter.
 */
2274 if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&
2275 !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
2276 !g_unichar_isalpha(nc)))
2278 parities->squote=!parities->squote;
2279 if (!parities->squote)
2282 if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
2284 if (pswit[ECHO_SWITCH])
2285 g_print("\n%s\n",aline);
2286 if (!pswit[OVERVIEW_SWITCH])
2287 g_print(" Line %ld column %ld - "
2288 "Wrongspaced singlequotes?\n",
2289 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* NOTE(review): isdigit() on a gunichar again - see the note above. */
2297 if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
2298 !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
2300 if (pswit[ECHO_SWITCH])
2301 g_print("\n%s\n",aline);
2302 if (!pswit[OVERVIEW_SWITCH])
2303 g_print(" Line %ld column %ld - "
2304 "Wrongspaced singlequotes?\n",
2305 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2316 * check_for_double_punctuation:
2318 * Look for double punctuation like ,. or ,,
2319 * Thanks to DW for the suggestion!
2320 * In books with references, ".," and ".;" are common
2321 * e.g. "etc., etc.," and vol. 1.; vol 3.;
2322 * OTOH, from my initial tests, there are also fairly
2323 * common errors. What to do? Make these cases paranoid?
2324 * ".," is the most common, so warnings->dotcomma is used
2325 * to suppress detailed reporting if it occurs often.
/*
 * Reviewer notes for check_for_double_punctuation:
 * The line is walked with c as the current and nc as the next character.
 * When both are one of . ? ! , ; : the pair is a candidate.  Exempt are a
 * doubled . ? or !, the pair ".," while warnings->dotcomma is false, and,
 * for French texts (warnings->isFrench), an ellipsis next to one of
 * , ; : ! ?  Anything else is queried as "Double punctuation?".
 * NOTE(review): the listing this block comes from omits short lines
 * (braces, else, at least one statement in the French branch), so the
 * exact control flow cannot be confirmed from here.
 */
2327 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
2331 nc=g_utf8_get_char(aline);
2332 for (s=aline;*s;s=g_utf8_next_char(s))
2335 nc=g_utf8_get_char(g_utf8_next_char(s));
2336 /* for each punctuation character in the line */
2337 if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
2338 g_utf8_strchr(".?!,;:",-1,nc))
2340 /* followed by punctuation, it's a query, unless . . . */
2341 if (c==nc && (c=='.' || c=='?' || c=='!') ||
2342 !warnings->dotcomma && c=='.' && nc==',' ||
2343 warnings->isFrench && g_str_has_prefix(s,",...") ||
2344 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2345 warnings->isFrench && g_str_has_prefix(s,";...") ||
2346 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2347 warnings->isFrench && g_str_has_prefix(s,":...") ||
2348 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2349 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2350 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2351 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2352 warnings->isFrench && g_str_has_prefix(s,"...?"))
/*
 * The same ten French ellipsis patterns are tested a second time to pick
 * out the cases where nc is re-read from the line.
 */
2354 if (warnings->isFrench && g_str_has_prefix(s,",...") ||
2355 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2356 warnings->isFrench && g_str_has_prefix(s,";...") ||
2357 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2358 warnings->isFrench && g_str_has_prefix(s,":...") ||
2359 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2360 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2361 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2362 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2363 warnings->isFrench && g_str_has_prefix(s,"...?"))
2366 nc=g_utf8_get_char(g_utf8_next_char(s));
2368 ; /* do nothing for .. !! and ?? which can be legit */
2372 if (pswit[ECHO_SWITCH])
2373 g_print("\n%s\n",aline);
2374 if (!pswit[OVERVIEW_SWITCH])
2375 g_print(" Line %ld column %ld - Double punctuation?\n",
2376 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2385 * check_for_spaced_quotes:
/*
 * Reviewer notes for check_for_spaced_quotes:
 * Queries every double quote that has a space on both sides, then does
 * the same for each character listed in single_quotes.  The search text
 * for a single quote is built in the GString pattern (space, quote
 * character, space), which is reused for every entry and freed at the end.
 * NOTE(review): the rest of the single_quotes initialiser and the
 * overview-mode branches are on lines missing from this listing.
 */
2387 void check_for_spaced_quotes(const char *aline)
2391 const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,
2395 while ((t=strstr(s," \" ")))
2397 if (pswit[ECHO_SWITCH])
2398 g_print("\n%s\n",aline);
2399 if (!pswit[OVERVIEW_SWITCH])
2400 g_print(" Line %ld column %ld - Spaced doublequote?\n",
2401 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
/*
 * Resume two characters after the start of the match, i.e. at its
 * trailing space, so that space can begin the next match.
 */
2404 s=g_utf8_next_char(g_utf8_next_char(t));
2406 pattern=g_string_new(NULL);
2407 for(i=0;i<G_N_ELEMENTS(single_quotes);i++)
2409 g_string_assign(pattern," ");
2410 g_string_append_unichar(pattern,single_quotes[i]);
2411 g_string_append_c(pattern,' ');
2413 while ((t=strstr(s,pattern->str)))
2415 if (pswit[ECHO_SWITCH])
2416 g_print("\n%s\n",aline);
2417 if (!pswit[OVERVIEW_SWITCH])
2418 g_print(" Line %ld column %ld - Spaced singlequote?\n",
2419 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2422 s=g_utf8_next_char(g_utf8_next_char(t));
2425 g_string_free(pattern,TRUE);
2429 * check_for_miscased_genative:
2431 * Check special case of 'S instead of 's at end of word.
/*
 * Reviewer notes for check_for_miscased_genative:
 * Walks the line from its second character with pc, c and nc as the
 * previous, current and next character.  An apostrophe that follows a
 * lower-case letter and is followed by a capital S is queried.  The
 * column printed is the offset of the apostrophe +2, i.e. the 1-based
 * column of the S.
 * NOTE(review): the declarations, the pc/c updates inside the loop and
 * the overview-mode branch are on lines missing from this listing.
 */
2433 void check_for_miscased_genative(const char *aline)
2439 c=g_utf8_get_char(aline);
2440 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2441 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2445 nc=g_utf8_get_char(g_utf8_next_char(s));
2446 if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))
2448 if (pswit[ECHO_SWITCH])
2449 g_print("\n%s\n",aline);
2450 if (!pswit[OVERVIEW_SWITCH])
2451 g_print(" Line %ld column %ld - Capital \"S\"?\n",
2452 linecnt,g_utf8_pointer_to_offset(aline,s)+2);
2460 * check_end_of_line:
2462 * Now check special cases - start and end of line -
2463 * for single and double quotes. Start is sometimes [sic]
2464 * but better to query it anyway.
2465 * While we're here, check for dash at end of line.
2467 void check_end_of_line(const char *aline,struct warnings *warnings)
2472 lbytes=strlen(aline);
2473 if (g_utf8_strlen(aline,lbytes)>1)
2475 s=g_utf8_prev_char(aline+lbytes);
2476 c1=g_utf8_get_char(s);
2477 c2=g_utf8_get_char(g_utf8_prev_char(s));
2478 if ((CHAR_IS_DQUOTE(c1) || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)
2480 if (pswit[ECHO_SWITCH])
2481 g_print("\n%s\n",aline);
2482 if (!pswit[OVERVIEW_SWITCH])
2483 g_print(" Line %ld column %ld - Spaced quote?\n",linecnt,
2484 g_utf8_strlen(aline,lbytes));
2488 c1=g_utf8_get_char(aline);
2489 c2=g_utf8_get_char(g_utf8_next_char(aline));
2490 if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)
2492 if (pswit[ECHO_SWITCH])
2493 g_print("\n%s\n",aline);
2494 if (!pswit[OVERVIEW_SWITCH])
2495 g_print(" Line %ld column 1 - Spaced quote?\n",linecnt);
2500 * Dash at end of line may well be legit - paranoid mode only
2501 * and don't report em-dash at line-end.
2503 if (pswit[PARANOID_SWITCH] && warnings->hyphen)
2505 for (s=g_utf8_prev_char(aline+lbytes);
2506 s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
2508 if (g_utf8_get_char(s)=='-' &&
2509 g_utf8_get_char(g_utf8_prev_char(s))!='-')
2511 if (pswit[ECHO_SWITCH])
2512 g_print("\n%s\n",aline);
2513 if (!pswit[OVERVIEW_SWITCH])
2514 g_print(" Line %ld column %ld - "
2515 "Hyphen at end of line?\n",
2516 linecnt,g_utf8_pointer_to_offset(aline,s));
2523 * check_for_unspaced_bracket:
2525 * Brackets are often unspaced, but shouldn't be surrounded by alpha.
2526 * If so, suspect a scanno like "a]most".
/*
 * Reviewer notes for check_for_unspaced_bracket:
 * Walks the line from its second character with pc, c and nc as the
 * previous, current and next character, and queries any of { [ ( ) ] }
 * that has a letter on both sides.  The column printed is the plain
 * character offset of the bracket (no +1 applied).
 * NOTE(review): the declarations, the pc/c updates inside the loop, two
 * further short lines and the overview-mode branch are missing from this
 * listing.
 */
2528 void check_for_unspaced_bracket(const char *aline)
2532 c=g_utf8_get_char(aline);
2533 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2534 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2538 nc=g_utf8_get_char(g_utf8_next_char(s));
2541 /* for each bracket character in the line except 1st & last */
2542 if (g_utf8_strchr("{[()]}",-1,c) &&
2543 g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
2545 if (pswit[ECHO_SWITCH])
2546 g_print("\n%s\n",aline);
2547 if (!pswit[OVERVIEW_SWITCH])
2548 g_print(" Line %ld column %ld - Unspaced bracket?\n",
2549 linecnt,g_utf8_pointer_to_offset(aline,s));
2557 * check_for_unpunctuated_endquote:
2559 void check_for_unpunctuated_endquote(const char *aline)
2564 c=g_utf8_get_char(aline);
2565 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2566 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2570 qc=CHAR_IS_DQUOTE(c)?QUOTE_CLASS(c):INVALID_QUOTE;
2571 nc=g_utf8_get_char(g_utf8_next_char(s));
2572 /* for each character in the line except 1st */
2573 if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) && isalpha(pc))
2575 if (pswit[ECHO_SWITCH])
2576 g_print("\n%s\n",aline);
2577 if (!pswit[OVERVIEW_SWITCH])
2578 g_print(" Line %ld column %ld - "
2579 "endquote missing punctuation?\n",
2580 linecnt,g_utf8_pointer_to_offset(aline,s));
2588 * check_for_html_tag:
2590 * Check for <HTML TAG>.
2592 * If there is a < in the line, followed at some point
2593 * by a > then we suspect HTML.
/*
 * Query a suspected HTML tag: the first '<' in aline together with the
 * first '>' that follows it.
 *
 * aline: the NUL-terminated UTF-8 text of the current line.
 *
 * Output goes through g_print(): the line itself is echoed when
 * pswit[ECHO_SWITCH] is set, and the query message (with the suspected
 * tag text) is printed unless pswit[OVERVIEW_SWITCH] is set.
 * NOTE(review): the tests on open and close, and the release of tag, are
 * not visible in this listing.
 */
2595 void check_for_html_tag(const char *aline)
2597 const char *open,*close;
2599 open=strchr(aline,'<');
/* Look for the closing '>' starting just after the '<'. */
2602 close=strchr(g_utf8_next_char(open),'>');
2605 if (pswit[ECHO_SWITCH])
2606 g_print("\n%s\n",aline);
2607 if (!pswit[OVERVIEW_SWITCH])
/* Copy the span from '<' through '>' inclusive for the message. */
2609 tag=g_strndup(open,close-open+1);
/* The column is the character offset of the '<' plus one. */
2610 g_print(" Line %ld column %ld - HTML Tag? %s \n",
2611 linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
2621 * check_for_html_entity:
2623 * Check for &symbol; HTML.
2625 * If there is a & in the line, followed at
2626 * some point by a ; then we suspect HTML.
/*
 * Query a suspected HTML entity: the first '&' in aline together with the
 * first ';' at or after it, provided no space is found between the two.
 *
 * aline: the NUL-terminated UTF-8 text of the current line.
 *
 * Output goes through g_print(): the line itself is echoed when
 * pswit[ECHO_SWITCH] is set, and the query message (with the suspected
 * entity text) is printed unless pswit[OVERVIEW_SWITCH] is set.
 * NOTE(review): the tests on amp and scolon, the handling after the space
 * scan, and the release of entity are not visible in this listing.
 */
2628 void check_for_html_entity(const char *aline)
2630 const char *s,*amp,*scolon;
2632 amp=strchr(aline,'&');
2635 scolon=strchr(amp,';');
/* Scan the candidate entity for an embedded space. */
2638 for (s=amp;s<scolon;s=g_utf8_next_char(s))
2639 if (g_utf8_get_char(s)==CHAR_SPACE)
2640 break; /* Don't report "Jones & Son;" */
2643 if (pswit[ECHO_SWITCH])
2644 g_print("\n%s\n",aline);
2645 if (!pswit[OVERVIEW_SWITCH])
/* Copy the span from '&' through ';' inclusive for the message. */
2647 entity=g_strndup(amp,scolon-amp+1);
/*
 * NOTE(review): the column here is a byte offset (amp-aline), whereas
 * check_for_html_tag() reports a character offset obtained from
 * g_utf8_pointer_to_offset(). The two differ when multi-byte characters
 * precede the '&' - confirm which is intended.
 */
2648 g_print(" Line %ld column %d - HTML symbol? %s \n",
2649 linecnt,(int)(amp-aline)+1,entity);
2660 * check_for_omitted_punctuation:
2662 * Check for omitted punctuation at end of paragraph by working back
2663 * through prevline. DW.
2664 * Need to check this only for "normal" paras.
2665 * So what is a "normal" para?
2666 * Not normal if one-liner (chapter headings, etc.)
2667 * Not normal if it doesn't contain at least one lowercase letter
2668 * Not normal if starts with space
/*
 * Query a paragraph whose final line appears to end without punctuation.
 *
 * prevline:        the NUL-terminated UTF-8 text of the previous line,
 *                  which is the line that gets examined and reported.
 * last:            properties of an earlier line; only last->blen is read
 *                  in the visible code.
 * start_para_line: line number on which the current paragraph started.
 *
 * Output goes through g_print(): prevline is echoed when
 * pswit[ECHO_SWITCH] is set, and the query message is printed unless
 * pswit[OVERVIEW_SWITCH] is set.
 * NOTE(review): several statements (loop exits, the assignment that sets
 * closing_quote to TRUE, declarations) are not visible in this listing.
 */
2670 void check_for_omitted_punctuation(const char *prevline,
2671 struct line_properties *last,int start_para_line)
2673 gboolean letter_on_line=FALSE;
2676 gboolean closing_quote;
/* First establish whether prevline contains any alphabetic character. */
2677 for (s=prevline;*s;s=g_utf8_next_char(s))
2678 if (g_unichar_isalpha(g_utf8_get_char(s)))
2680 letter_on_line=TRUE;
2684 * This next "if" is a problem.
2685 * If we say "start_para_line <= linecnt - 1", that includes
2686 * one-line "paragraphs" like chapter heads. Lotsa false positives.
2687 * If we say "start_para_line < linecnt - 1" it doesn't, but then it
2688 * misses genuine one-line paragraphs.
2690 if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
2691 g_utf8_get_char(prevline)>CHAR_SPACE)
/* Start at the terminating NUL and step backwards over trailing quotes. */
2693 s=prevline+strlen(prevline);
2696 s=g_utf8_prev_char(s);
2697 c=g_utf8_get_char(s);
2698 if (QUOTE_CLASS(c)==CLOSING_QUOTE || QUOTE_CLASS(c)==NEUTRAL_QUOTE)
2701 closing_quote=FALSE;
2702 } while (closing_quote && s>prevline);
/* Keep walking backwards until a letter or a punctuation mark is met. */
2703 for (;s>prevline;s=g_utf8_prev_char(s))
2705 if (g_unichar_isalpha(g_utf8_get_char(s)))
2707 if (pswit[ECHO_SWITCH])
2708 g_print("\n%s\n",prevline);
2709 if (!pswit[OVERVIEW_SWITCH])
/* Reported against the previous line, at its length in characters. */
2710 g_print(" Line %ld column %ld - "
2711 "No punctuation at para end?\n",
2712 linecnt-1,g_utf8_strlen(prevline,-1));
/* These characters count as acceptable paragraph-ending punctuation. */
2717 if (g_utf8_strchr("-.:!([{?}])",-1,g_utf8_get_char(s)))
/*
 * Per-node callback with the key/value/data signature; procfile() passes
 * it to g_tree_foreach() over the qword tree.
 *
 * key:   the queried word, used as a C string.
 * value: the data stored for that word (its use is not visible here).
 * data:  user data from g_tree_foreach(); procfile() passes NULL.
 *
 * NOTE(review): the g_print() arguments, the condition guarding the
 * message and the return value are not visible in this listing.
 */
2723 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
2725 const char *word=key;
2728 g_print("\nNote: Queried word %s was duplicated %d times\n",
/*
 * Write string to stdout, converting it from UTF-8 to WINDOWS-1252 with a
 * GIConv converter that is kept in a static variable between calls.
 *
 * string: NUL-terminated UTF-8 text. procfile() also calls this function
 *         with NULL; presumably that selects the branch below that closes
 *         and resets the cached converter - the test itself is not
 *         visible in this listing.
 */
2733 void print_as_windows_1252(const char *string)
2735 gsize inbytes,outbytes;
/* (GIConv)-1 marks the converter as not open. */
2737 static GIConv converter=(GIConv)-1;
/* Tear-down path: close the cached converter and mark it unopened. */
2740 if (converter!=(GIConv)-1)
2741 g_iconv_close(converter);
2742 converter=(GIConv)-1;
/* Open the converter lazily on first use. */
2745 if (converter==(GIConv)-1)
2746 converter=g_iconv_open("WINDOWS-1252","UTF-8");
2747 if (converter!=(GIConv)-1)
/* The output buffer is sized to the input byte length plus one. */
2749 inbytes=outbytes=strlen(string);
2750 bp=buf=g_malloc(outbytes+1);
/* NOTE(review): the return value of g_iconv() is not checked here. */
2751 g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
/* Presumably the fallback when no converter is available - confirm. */
2757 fputs(string,stdout);
/*
 * Write string to stdout unchanged.
 *
 * string: NUL-terminated text, passed straight to fputs().
 */
2760 void print_as_utf_8(const char *string)
2762 fputs(string,stdout);
/*
 * Check one text file: read it, run the first pass, then feed every body
 * line through the individual check_for_* routines and print the queries.
 *
 * filename: path handed to read_etext(); also used in messages.
 *
 * Resets the file-scope counters linecnt and checked_linecnt, and creates
 * and later releases the qword and qperiod trees.
 * NOTE(review): many lines of this function (braces, else branches, some
 * declarations and assignments) are not visible in this listing, so the
 * comments below describe the visible statements only.
 */
2770 void procfile(const char *filename)
2773 gchar *parastart=NULL; /* first line of current para */
2774 gchar *etext,*aline;
2777 struct first_pass_results *first_pass_results;
2778 struct warnings *warnings;
2779 struct counters counters={0};
2780 struct line_properties last={0};
2781 struct parities parities={0};
2782 struct pending pending={0};
2783 gboolean isemptyline;
2784 long start_para_line=0;
2785 gboolean isnewpara=FALSE,enddash=FALSE;
2786 last.start=CHAR_SPACE;
2787 linecnt=checked_linecnt=0;
2788 etext=read_etext(filename,&err);
/* A read failure is reported on stdout or stderr, per STDOUT_SWITCH. */
2791 if (pswit[STDOUT_SWITCH])
2792 fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
2794 fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
2797 g_print("\n\nFile: %s\n\n",filename);
2798 first_pass_results=first_pass(etext);
2799 warnings=report_first_pass(first_pass_results);
/* qword frees both keys and values with g_free; qperiod only its keys. */
2800 qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
2801 qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
2803 * Here we go with the main pass. Hold onto yer hat!
/* flgets() hands back one line per iteration until it returns NULL. */
2807 while ((aline=flgets(&etext_ptr,linecnt+1)))
2812 if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
2813 continue; // skip DP page separators completely
/* Lines before firstline, or after a known footerline, are not checked. */
2814 if (linecnt<first_pass_results->firstline ||
2815 (first_pass_results->footerline>0 &&
2816 linecnt>first_pass_results->footerline))
/* With HEADER_SWITCH the main header fields are echoed as they pass. */
2818 if (pswit[HEADER_SWITCH])
2820 if (g_str_has_prefix(aline,"Title:"))
2821 g_print(" %s\n",aline);
2822 if (g_str_has_prefix(aline,"Author:"))
2823 g_print(" %s\n",aline);
2824 if (g_str_has_prefix(aline,"Release Date:"))
2825 g_print(" %s\n",aline);
2826 if (g_str_has_prefix(aline,"Edition:"))
2827 g_print(" %s\n\n",aline);
2829 continue; /* skip through the header */
2832 print_pending(aline,parastart,&pending);
2833 isemptyline=analyse_quotes(aline,linecnt,&counters);
2834 if (isnewpara && !isemptyline)
2836 /* This line is the start of a new paragraph. */
2837 start_para_line=linecnt;
2838 /* Capture its first line in case we want to report it later. */
2840 parastart=g_strdup(aline);
2841 memset(&parities,0,sizeof(parities)); /* restart the quote count */
/* Skip ahead to the first letter or digit of the paragraph. */
2843 while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
2844 !g_unichar_isdigit(g_utf8_get_char(s)))
2845 s=g_utf8_next_char(s);
2846 if (g_unichar_islower(g_utf8_get_char(s)))
2848 /* and its first letter is lowercase */
2849 if (pswit[ECHO_SWITCH])
2850 g_print("\n%s\n",aline);
2851 if (!pswit[OVERVIEW_SWITCH])
2852 g_print(" Line %ld column %ld - "
2853 "Paragraph starts with lower-case\n",
2854 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2858 isnewpara=FALSE; /* Signal the end of new para processing. */
2860 /* Check for an em-dash broken at line end. */
2861 if (enddash && g_utf8_get_char(aline)=='-')
2863 if (pswit[ECHO_SWITCH])
2864 g_print("\n%s\n",aline);
2865 if (!pswit[OVERVIEW_SWITCH])
2866 g_print(" Line %ld column 1 - Broken em-dash?\n",linecnt);
/* Find the last character of the line that is not a trailing space. */
2871 for (s=g_utf8_prev_char(aline+strlen(aline));
2872 g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
2874 if (s>=aline && g_utf8_get_char(s)=='-')
2876 check_for_control_characters(aline);
2878 check_for_odd_characters(aline,warnings,isemptyline);
/* Line-length checks run only when the first pass left them enabled. */
2879 if (warnings->longline)
2880 check_for_long_line(aline);
2881 if (warnings->shortline)
2882 check_for_short_line(aline,&last);
2884 last.len=g_utf8_strlen(aline,-1);
2885 last.start=g_utf8_get_char(aline);
2886 check_for_starting_punctuation(aline);
2889 check_for_spaced_emdash(aline);
2890 check_for_spaced_dash(aline);
2892 check_for_unmarked_paragraphs(aline);
2893 check_for_jeebies(aline);
2894 check_for_mta_from(aline);
2895 check_for_orphan_character(aline);
2896 check_for_pling_scanno(aline);
2897 check_for_extra_period(aline,warnings);
2898 check_for_following_punctuation(aline);
2899 check_for_typos(aline,warnings);
2900 check_for_misspaced_punctuation(aline,&parities,isemptyline);
2901 check_for_double_punctuation(aline,warnings);
2902 check_for_spaced_quotes(aline);
2903 check_for_miscased_genative(aline);
2904 check_end_of_line(aline,warnings);
2905 check_for_unspaced_bracket(aline);
2906 if (warnings->endquote)
2907 check_for_unpunctuated_endquote(aline);
2908 check_for_html_tag(aline);
2909 check_for_html_entity(aline);
2912 check_for_mismatched_quotes(&counters,&pending);
2913 counters_reset(&counters);
2914 /* let the next iteration know that it's starting a new para */
2917 check_for_omitted_punctuation(prevline,&last,start_para_line);
/* Keep a copy of this line for the next iteration. */
2920 prevline=g_strdup(aline);
/* After the last line: final quote check, then flush pending output. */
2923 check_for_mismatched_quotes(&counters,&pending);
2924 print_pending(NULL,parastart,&pending);
2925 reset_pending(&pending);
/* Duplicate-query notes are skipped in overview and verbose modes. */
2934 if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])
2935 g_tree_foreach(qword,report_duplicate_queries,NULL);
2936 g_tree_unref(qword);
2937 g_tree_unref(qperiod);
2938 counters_destroy(&counters);
/* Restore the default print handler and call the converter with NULL. */
2939 g_set_print_handler(NULL);
2940 print_as_windows_1252(NULL);
2941 if (pswit[MARKUP_SWITCH])
2948 * Get one line from the input text, checking for
2949 * the existence of exactly one CR/LF line-end per line.
2951 * Returns: a pointer to the line.
/*
 * Fetch the next line from the in-memory text and advance *etext past it,
 * querying irregular line endings on the way.
 *
 * etext: in/out cursor into the text; the line starts where it points on
 *        entry.
 * lcnt:  line number used in the line-ending messages.
 *
 * Line-ending queries are issued only when pswit[LINE_END_SWITCH] is set.
 * For each one the line is echoed when pswit[ECHO_SWITCH] is set, and the
 * message is printed unless pswit[OVERVIEW_SWITCH] is set.
 * NOTE(review): the tests on c, the updates of isCR and eos, the
 * termination of the line and the return statement are not visible in
 * this listing.
 */
2953 char *flgets(char **etext,long lcnt)
2956 gboolean isCR=FALSE;
2957 char *theline=*etext;
/* Read one character and move the caller's cursor past it. */
2962 c=g_utf8_get_char(*etext);
2963 *etext=g_utf8_next_char(*etext);
2966 /* either way, it's end of line */
2973 /* Error - a LF without a preceding CR */
2974 if (pswit[LINE_END_SWITCH])
2976 if (pswit[ECHO_SWITCH])
/* Echo a copy of the text between theline and eos. */
2978 s=g_strndup(theline,eos-theline);
2979 g_print("\n%s\n",s);
2982 if (!pswit[OVERVIEW_SWITCH])
2983 g_print(" Line %ld - No CR?\n",lcnt);
2994 /* Error - two successive CRs */
2995 if (pswit[LINE_END_SWITCH])
2997 if (pswit[ECHO_SWITCH])
2999 s=g_strndup(theline,eos-theline);
3000 g_print("\n%s\n",s);
3003 if (!pswit[OVERVIEW_SWITCH])
3004 g_print(" Line %ld - Two successive CRs?\n",lcnt);
/* isCR is still set here, so a CR was seen without a following LF. */
3013 if (pswit[LINE_END_SWITCH] && isCR)
3015 if (pswit[ECHO_SWITCH])
3017 s=g_strndup(theline,eos-theline);
3018 g_print("\n%s\n",s);
3021 if (!pswit[OVERVIEW_SWITCH])
3022 g_print(" Line %ld column %ld - CR without LF?\n",
3023 lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
3029 eos=g_utf8_next_char(eos);
/* Optional in-place clean-up of the line before it is handed back. */
3033 if (pswit[MARKUP_SWITCH])
3034 postprocess_for_HTML(theline);
3035 if (pswit[DP_SWITCH])
3036 postprocess_for_DP(theline);
3043 * Takes a "word" as a parameter, and checks whether it
3044 * contains a mixture of alpha and digits. Generally, this is an
3045 * error, but may not be for cases like 4th or L5 12s. 3d.
3047 * Returns: TRUE iff an error is found.
/*
 * Decide whether checkword mixes letters and digits in a way that should
 * be queried.
 *
 * checkword: NUL-terminated UTF-8 word to examine.
 *
 * The visible exemptions are: digits followed by st/rd/nd/th, by
 * sts/rds/nds/ths or by stly/rdly/ndly/thly (any case); digits followed
 * by l (any case), s or d; and a leading capital L followed by something
 * that is not itself a mixed word.
 * NOTE(review): the assignments to wehaveadigit, wehavealetter and query
 * and the return statement are not visible in this listing.
 */
3049 gboolean mixdigit(const char *checkword)
3051 gboolean wehaveadigit,wehavealetter,query;
3052 const char *s,*nondigit;
3053 wehaveadigit=wehavealetter=query=FALSE;
/* Classify every character of the word as letter or digit. */
3054 for (s=checkword;*s;s=g_utf8_next_char(s))
3055 if (g_unichar_isalpha(g_utf8_get_char(s)))
3057 else if (g_unichar_isdigit(g_utf8_get_char(s)))
3059 if (wehaveadigit && wehavealetter)
3061 /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
/* Move nondigit past the leading run of digits. */
3063 for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
3064 nondigit=g_utf8_next_char(nondigit))
3066 /* digits, ending in st, rd, nd, th of either case */
3067 if (!g_ascii_strcasecmp(nondigit,"st") ||
3068 !g_ascii_strcasecmp(nondigit,"rd") ||
3069 !g_ascii_strcasecmp(nondigit,"nd") ||
3070 !g_ascii_strcasecmp(nondigit,"th"))
3072 if (!g_ascii_strcasecmp(nondigit,"sts") ||
3073 !g_ascii_strcasecmp(nondigit,"rds") ||
3074 !g_ascii_strcasecmp(nondigit,"nds") ||
3075 !g_ascii_strcasecmp(nondigit,"ths"))
3077 if (!g_ascii_strcasecmp(nondigit,"stly") ||
3078 !g_ascii_strcasecmp(nondigit,"rdly") ||
3079 !g_ascii_strcasecmp(nondigit,"ndly") ||
3080 !g_ascii_strcasecmp(nondigit,"thly"))
3082 /* digits, ending in l, L, s or d */
/*
 * NOTE(review): "l" is matched in either case, but "s" and "d" are
 * compared with strcmp() and so match lower case only.
 */
3083 if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
3084 !strcmp(nondigit,"d"))
3087 * L at the start of a number, representing Britsh pounds, like L500.
3088 * This is cute. We know the current word is mixed digit. If the first
3089 * letter is L, there must be at least one digit following. If both
3090 * digits and letters follow, we have a genuine error, else we have a
3091 * capital L followed by digits, and we accept that as a non-error.
/* Recurse on the remainder of the word after the leading L. */
3093 if (g_utf8_get_char(checkword)=='L' &&
3094 !mixdigit(g_utf8_next_char(checkword)))
3103 * Extracts the first/next "word" from the line, and returns it.
3104 * A word is defined as one English word unit--or at least that's the aim.
3105 * "ptr" is advanced to the position in the line where we will start
3106 * looking for the next word.
3108 * Returns: A newly-allocated string.
/*
 * Extract the next word from the text at *ptr and advance *ptr.
 *
 * ptr: in/out cursor into a NUL-terminated UTF-8 line.
 *
 * Returns the character data of a GString released with
 * g_string_free(word,FALSE), i.e. a newly-allocated string owned by the
 * caller.
 * NOTE(review): the declarations, the initialisation of s, the test that
 * accepts a punctuated number and the matching update of *ptr are not
 * visible in this listing.
 */
3110 gchar *getaword(const char **ptr)
3115 word=g_string_new(NULL);
/* Skip everything up to the first digit or letter (or the NUL). */
3116 for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
3117 !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
3118 **ptr;*ptr=g_utf8_next_char(*ptr))
3121 * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
3122 * Especially yucky is the case of L1,000
3123 * This section looks for a pattern of characters including a digit
3124 * followed by a comma or period followed by one or more digits.
3125 * If found, it returns this whole pattern as a word; otherwise we discard
3126 * the results and resume our normal programming.
/* Tentatively collect digits, letters, commas and periods into word. */
3129 for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
3130 g_unichar_isalpha(g_utf8_get_char(s)) ||
3131 g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
3132 g_string_append_unichar(word,g_utf8_get_char(s));
/* From the second character on, look for '.' or ',' after a digit. */
3135 for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))
3137 c=g_utf8_get_char(t);
3138 pc=g_utf8_get_char(g_utf8_prev_char(t));
3139 if ((c=='.' || c==',') && g_unichar_isdigit(pc))
3142 return g_string_free(word,FALSE);
3146 /* we didn't find a punctuated number - do the regular getword thing */
3147 g_string_truncate(word,0);
3148 c=g_utf8_get_char(*ptr);
/* A regular word is a run of digits, letters and apostrophes. */
3149 for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);
3150 *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))
3151 g_string_append_unichar(word,c);
3152 return g_string_free(word,FALSE);
3158 * Is this word a Roman Numeral?
3160 * It doesn't actually validate that the number is a valid Roman Numeral--for
3161 * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
3162 * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
3163 * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
3164 * expressions thereof, except when it came to taxes. Allow any number of M,
3165 * an optional D, an optional CM or CD, any number of optional Cs, an optional
3166 * XL or an optional XC, an optional IX or IV, an optional V and any number of optional Is.
/*
 * Test whether t looks like a Roman numeral by consuming its parts in
 * order: a run of m, d, cm, cd, a run of c, xl, xc, l, a run of x, ix,
 * iv, v, and finally a run of i.
 *
 * t: NUL-terminated UTF-8 word. All comparisons are against lower-case
 *    letters, so the word is presumably expected in lower case - confirm
 *    against the callers.
 *
 * NOTE(review): the statements that advance t after each match and the
 * return statement are not visible in this listing.
 */
3169 gboolean isroman(const char *t)
/*
 * NOTE(review): the "&& *t" in the while conditions below is redundant,
 * because a character equal to a letter can never be the NUL.
 */
3175 while (g_utf8_get_char(t)=='m' && *t)
3177 if (g_utf8_get_char(t)=='d')
3179 if (g_str_has_prefix(t,"cm"))
3181 if (g_str_has_prefix(t,"cd"))
3183 while (g_utf8_get_char(t)=='c' && *t)
3185 if (g_str_has_prefix(t,"xl"))
3187 if (g_str_has_prefix(t,"xc"))
3189 if (g_utf8_get_char(t)=='l')
3191 while (g_utf8_get_char(t)=='x' && *t)
3193 if (g_str_has_prefix(t,"ix"))
3195 if (g_str_has_prefix(t,"iv"))
3197 if (g_utf8_get_char(t)=='v')
3199 while (g_utf8_get_char(t)=='i' && *t)
3205 * postprocess_for_DP:
3207 * Invoked with the -d switch from flgets().
3208 * It simply "removes" from the line a hard-coded set of common
3209 * DP-specific tags, so that the line passed to the main routine has
3210 * been pre-cleaned of DP markup.
/*
 * Delete every occurrence of each DPmarkup[] string from theline, in
 * place.
 *
 * theline: writable NUL-terminated line; it can only get shorter.
 */
3212 void postprocess_for_DP(char *theline)
/* The DPmarkup table ends at its first empty string. */
3218 for (i=0;*DPmarkup[i];i++)
/* Repeat until this marker no longer occurs anywhere in the line. */
3219 while ((s=strstr(theline,DPmarkup[i])))
3221 t=s+strlen(DPmarkup[i]);
/* Close the gap; the +1 carries the terminating NUL along. */
3222 memmove(s,t,strlen(t)+1);
3227 * postprocess_for_HTML:
3229 * Invoked with the -m switch from flgets().
3230 * It simply "removes" from the line a hard-coded set of common
3231 * HTML tags and "replaces" a hard-coded set of common HTML
3232 * entities, so that the line passed to the main routine has
3233 * been pre-cleaned of HTML.
/*
 * Clean HTML out of theline in place: call losemarkup() repeatedly for as
 * long as it returns a non-NULL pointer, then call loseentities().
 *
 * theline: writable NUL-terminated line.
 *
 * NOTE(review): the body of the while loop is not visible in this
 * listing.
 */
3235 void postprocess_for_HTML(char *theline)
3237 while (losemarkup(theline))
3239 loseentities(theline);
/*
 * Look at the first '<' ... '>' pair in theline and, when the text after
 * the '<' matches an entry of markup[] according to tagcomp(), remove the
 * pair and everything between from the line in place.
 *
 * theline: writable NUL-terminated line.
 *
 * NOTE(review): the return statements, the test on s and t, and the
 * handling of an unrecognized tag are not visible in this listing.
 */
3242 char *losemarkup(char *theline)
3246 s=strchr(theline,'<');
/* Only search for '>' when a '<' was found. */
3247 t=s?strchr(s,'>'):NULL;
/* The markup table ends at its first empty string. */
3250 for (i=0;*markup[i];i++)
3251 if (tagcomp(g_utf8_next_char(s),markup[i]))
/* Step past the '>' and close the gap, NUL included. */
3253 t=g_utf8_next_char(t);
3254 memmove(s,t,strlen(t)+1);
3257 /* It's an unrecognized <xxx>. */
/*
 * Replace HTML entities (&name; &#NNN; &#xHH;) in theline, in place, by
 * the characters they stand for.
 *
 * theline: writable NUL-terminated line. A clean-up branch that destroys
 *          the entity tree and closes both converters is also visible;
 *          its condition is not, but it is presumably taken when theline
 *          is NULL - confirm.
 *
 * Two GIConv converters are cached in static variables between calls.
 * NOTE(review): several statements (tests, else branches, frees and some
 * declarations) are not visible in this listing.
 */
3261 void loseentities(char *theline)
/*
 * NOTE(review): entities is an ordinary automatic variable set to NULL on
 * every call, unlike the static converters declared next to it. If the
 * tree is meant to persist between calls, as the clean-up branch
 * suggests, it would need to be static - confirm.
 */
3268 GTree *entities=NULL;
3269 static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
3273 g_tree_destroy(entities);
3275 if (translit!=(GIConv)-1)
3276 g_iconv_close(translit);
3277 translit=(GIConv)-1;
3278 if (to_utf8!=(GIConv)-1)
3279 g_iconv_close(to_utf8);
/* Build the name -> code point lookup tree from the HTMLentities table. */
3287 entities=g_tree_new((GCompareFunc)strcmp);
3288 for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
3289 g_tree_insert(entities,HTMLentities[i].name,
3290 GUINT_TO_POINTER(HTMLentities[i].c));
/* Open both converters lazily on first use. */
3292 if (translit==(GIConv)-1)
3293 translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
3294 if (to_utf8==(GIConv)-1)
3295 to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
3296 while((amp=strchr(theline,'&')))
3298 scolon=strchr(amp,';');
/* Numeric forms: all-decimal digits, or x followed by all-hex digits. */
3303 if (amp+2+strspn(amp+2,"0123456789")==scolon)
3304 c=strtol(amp+2,NULL,10);
3305 else if (amp[2]=='x' &&
3306 amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
3307 c=strtol(amp+3,NULL,16);
/* Named form: look the text between '&' and ';' up in the tree. */
3311 s=g_strndup(amp+1,scolon-(amp+1));
3312 c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
/*
 * NOTE(review): && binds tighter than ||, so this condition reads
 * c<128 || (c>=192 && c<=255); explicit parentheses would make that
 * plain.
 */
3321 if (c<128 || c>=192 && c<=255) /* An ISO-8859-1 character */
3322 theline+=g_unichar_to_utf8(c,theline);
/* Otherwise convert through ISO-8859-1 with //TRANSLIT and back. */
3326 nb=g_unichar_to_utf8(c,s);
3327 t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
3329 s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
3331 memcpy(theline,s,nb);
/* Pull the text after the ';' up against the replacement, NUL included. */
3335 memmove(theline,g_utf8_next_char(scolon),
3336 strlen(g_utf8_next_char(scolon))+1);
/* Resume the search just after this '&'. */
3339 theline=g_utf8_next_char(amp);
/*
 * Case-insensitively test whether strin starts with basetag, ignoring one
 * leading '/' in strin so that closing tags match as well.
 *
 * strin:   NUL-terminated text following a '<'.
 * basetag: tag name to compare against.
 *
 * The result of g_str_has_prefix() is stored in retval.
 * NOTE(review): the release of the two case-folded copies and the return
 * statement are not visible in this listing.
 */
3343 gboolean tagcomp(const char *strin,const char *basetag)
3347 if (g_utf8_get_char(strin)=='/')
3348 t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
3350 t=g_utf8_casefold(strin,-1);
3351 s=g_utf8_casefold(basetag,-1);
3352 retval=g_str_has_prefix(t,s);
3358 void proghelp(GOptionContext *context)
3361 fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
3362 fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
3363 fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
3364 fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
3365 "For details, read the file COPYING.\n",stderr);
3366 fputs("This is Free Software; "
3367 "you may redistribute it under certain conditions (GPL);\n",stderr);
3368 fputs("read the file COPYING for details.\n\n",stderr);
3369 help=g_option_context_get_help(context,TRUE,NULL);
3372 fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
3373 fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
3374 "non-ASCII\n",stderr);
3375 fputs("characters like accented letters, "
3376 "lines longer than 75 or shorter than 55,\n",stderr);
3377 fputs("unbalanced quotes or brackets, "
3378 "a variety of badly formatted punctuation, \n",stderr);
3379 fputs("HTML tags, some likely typos. "
3380 "It is NOT a substitute for human judgement.\n",stderr);