1 /*************************************************************************/
2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com> */
5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk> */
7 /* This program is free software; you can redistribute it and/or modify */
8 /* it under the terms of the GNU General Public License as published by */
9 /* the Free Software Foundation; either version 2 of the License, or */
10 /* (at your option) any later version. */
12 /* This program is distributed in the hope that it will be useful, */
13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
15 /* GNU General Public License for more details. */
17 /* You should have received a copy of the GNU General Public License */
18 /* along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 /*************************************************************************/
30 #include "bookloupe.h"
33 #include "HTMLentities.h"
39 "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
40 "nad", "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa",
41 "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
42 "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
43 "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
44 "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
45 "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
46 "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
47 "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
48 "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
49 "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
50 "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
51 "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
52 "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
53 "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
54 "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
55 "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
56 "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
57 "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
58 "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
59 "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
60 "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
61 "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
62 "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
63 "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
64 "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
65 "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
66 "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
67 "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
73 /* Common abbreviations and other OK words not to query as typos. */
75 "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
76 "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
77 "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
78 "outbid", "outbids", "frostbite", "frostbitten", ""
81 /* Common abbreviations that cause otherwise unexplained periods. */
83 "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
84 "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
88 * Two-Letter combinations that rarely if ever start words,
89 * but are common scannos or otherwise common letter combinations.
92 "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
96 * Two-Letter combinations that rarely if ever end words,
97 * but are common scannos or otherwise common letter combinations.
100 "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
101 "sw", "gr", "sl", "cl", "iy", ""
105 "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
106 "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
107 "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
108 "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
112 "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
116 "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
117 "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
118 "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
119 "during", "let", "toward", "among", ""
123 "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
124 "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
125 "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
126 "among", "those", "into", "whom", "having", "thence", ""
129 gboolean pswit[SWITNO]; /* program switches */
131 gboolean typo_compat,paranoid_compat;
133 static GOptionEntry options[]={
134 { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
135 "Ignore DP-specific markup", NULL },
136 { "no-dp", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
137 G_OPTION_ARG_NONE, pswit+DP_SWITCH,
138 "Don't ignore DP-specific markup", NULL },
139 { "echo", 0, G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
140 "Echo queried line", NULL },
141 { "no-echo", 'e', G_OPTION_FLAG_REVERSE,
142 G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
143 "Don't echo queried line", NULL },
144 { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
145 "Check single quotes", NULL },
146 { "no-squote", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
147 G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
148 "Don't check single quotes", NULL },
149 { "typo", 0, 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
150 "Check common typos", NULL },
151 { "no-typo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
152 G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
153 "Don't check common typos", NULL },
154 { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
155 "Require closure of quotes on every paragraph", NULL },
156 { "no-qpara", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
157 G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
158 "Don't require closure of quotes on every paragraph", NULL },
159 { "paranoid", 0, G_OPTION_FLAG_HIDDEN,
160 G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
161 "Enable paranoid querying of everything", NULL },
162 { "no-paranoid", 0, G_OPTION_FLAG_REVERSE,
163 G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
164 "Disable paranoid querying of everything", NULL },
165 { "line-end", 0, G_OPTION_FLAG_HIDDEN,
166 G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
167 "Enable line end checking", NULL },
168 { "no-line-end", 'l', G_OPTION_FLAG_REVERSE,
169 G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
170 "Disable line end checking", NULL },
171 { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
172 "Overview: just show counts", NULL },
173 { "no-overview", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
174 G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
175 "Show individual warnings", NULL },
176 { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
177 "Output errors to stdout instead of stderr", NULL },
178 { "no-stdout", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
179 G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
180 "Output errors to stderr instead of stdout", NULL },
181 { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
182 "Echo header fields", NULL },
183 { "no-header", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
184 G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
185 "Don't echo header fields", NULL },
186 { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
187 "Ignore markup in < >", NULL },
188 { "no-markup", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
189 G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
190 "No special handling for markup in < >", NULL },
191 { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
192 "Use file of user-defined typos", NULL },
193 { "no-usertypo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
194 G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
195 "Ignore file of user-defined typos", NULL },
196 { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
197 "Verbose - list everything", NULL },
198 { "no-verbose", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
199 G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
200 "Switch off verbose mode", NULL },
205 * Options relating to configuration which make no sense from inside
206 * a configuration file.
209 static GOptionEntry config_options[]={
210 { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
211 "Defaults for use on www upload", NULL },
212 { "dump-config", 0, 0, G_OPTION_ARG_NONE, pswit+DUMP_CONFIG_SWITCH,
213 "Dump current config settings", NULL },
217 static GOptionEntry compatibility_options[]={
218 { "toggle-typo", 't', 0, G_OPTION_ARG_NONE, &typo_compat,
219 "Toggle checking for common typos", NULL },
220 { "toggle-relaxed", 'x', 0, G_OPTION_ARG_NONE, &paranoid_compat,
221 "Toggle both paranoid mode and common typos", NULL },
225 long cnt_quote; /* for overview mode, count of quote queries */
226 long cnt_brack; /* for overview mode, count of brackets queries */
227 long cnt_bin; /* for overview mode, count of non-ASCII queries */
228 long cnt_odd; /* for overview mode, count of odd character queries */
229 long cnt_long; /* for overview mode, count of long line errors */
230 long cnt_short; /* for overview mode, count of short line queries */
231 long cnt_punct; /* for overview mode,
232 count of punctuation and spacing queries */
233 long cnt_dash; /* for overview mode, count of dash-related queries */
234 long cnt_word; /* for overview mode, count of word queries */
235 long cnt_html; /* for overview mode, count of html queries */
236 long cnt_lineend; /* for overview mode, count of line-end queries */
237 long cnt_spacend; /* count of lines with space at end */
238 long linecnt; /* count of total lines in the file */
239 long checked_linecnt; /* count of lines actually checked */
241 void proghelp(GOptionContext *context);
242 void procfile(const char *);
246 gboolean mixdigit(const char *);
247 gchar *getaword(const char **);
248 char *flgets(char **,long);
249 void postprocess_for_HTML(char *);
250 char *linehasmarkup(char *);
251 char *losemarkup(char *);
252 gboolean tagcomp(const char *,const char *);
253 void loseentities(char *);
254 gboolean isroman(const char *);
255 void postprocess_for_DP(char *);
256 void print_as_windows_1252(const char *string);
257 void print_as_utf_8(const char *string);
259 GTree *qword,*qperiod;
267 void config_file_update(GKeyFile *kf)
271 for(i=0;options[i].long_name;i++)
273 if (g_str_has_prefix(options[i].long_name,"no-"))
275 if (options[i].arg==G_OPTION_ARG_NONE)
277 sw=*(gboolean *)options[i].arg_data;
278 if (options[i].flags&G_OPTION_FLAG_REVERSE)
280 g_key_file_set_boolean(kf,"options",options[i].long_name,sw);
283 g_assert_not_reached();
287 void config_file_add_comments(GKeyFile *kf)
291 g_key_file_set_comment(kf,NULL,NULL," Default configuration for bookloupe",
293 for(i=0;options[i].long_name;i++)
295 if (g_str_has_prefix(options[i].long_name,"no-"))
297 comment=g_strconcat(" ",options[i].description,NULL);
298 g_key_file_set_comment(kf,"options",options[i].long_name,comment,NULL);
303 void dump_config(void)
307 config_file_update(config);
310 config=g_key_file_new();
311 config_file_update(config);
312 config_file_add_comments(config);
314 s=g_key_file_to_data(config,NULL,NULL);
320 GKeyFile *read_config_file(gchar **full_path)
326 const char *search_path;
329 search_path=g_getenv("BOOKLOUPE_CONFIG_PATH");
333 search_dirs=g_strsplit(search_path,";",0);
335 search_dirs=g_strsplit(search_path,":",0);
340 search_dirs=g_new(gchar *,4);
341 search_dirs[0]=g_get_current_dir();
342 search_dirs[1]=g_strdup(running_from);
343 search_dirs[2]=g_strdup(g_get_user_config_dir());
346 for(i=0;search_dirs[i];i++)
348 path=g_build_filename(search_dirs[i],"bookloupe.ini",NULL);
349 if (g_key_file_load_from_file(kf,path,
350 G_KEY_FILE_KEEP_COMMENTS|G_KEY_FILE_KEEP_TRANSLATIONS,&err))
352 if (!g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
354 g_printerr("Bookloupe: Error reading %s\n",path);
355 g_printerr("%s\n",err->message);
367 g_strfreev(search_dirs);
375 void parse_config_file(void)
382 config=read_config_file(&path);
384 keys=g_key_file_get_keys(config,"options",NULL,NULL);
391 for(j=0;options[j].long_name;j++)
393 if (g_str_has_prefix(options[j].long_name,"no-"))
395 else if (!strcmp(keys[i],options[j].long_name))
397 if (options[j].arg==G_OPTION_ARG_NONE)
399 sw=g_key_file_get_boolean(config,"options",keys[i],
403 g_printerr("Bookloupe: %s: options.%s: %s\n",
404 path,keys[i],err->message);
407 if (options[j].flags&G_OPTION_FLAG_REVERSE)
409 *(gboolean *)options[j].arg_data=sw;
413 g_assert_not_reached();
416 if (!options[j].long_name)
417 g_printerr("Bookloupe: %s: Unknown option \"%s\" ignored\n",
426 void parse_options(int *argc,char ***argv)
429 GOptionContext *context;
430 GOptionGroup *compatibility;
431 context=g_option_context_new(
432 "file - look for errors in Project Gutenberg(TM) etexts");
433 g_option_context_add_main_entries(context,options,NULL);
434 g_option_context_add_main_entries(context,config_options,NULL);
435 compatibility=g_option_group_new("compatibility",
436 "Options for Compatibility with Gutcheck:",
437 "Show compatibility options",NULL,NULL);
438 g_option_group_add_entries(compatibility,compatibility_options);
439 g_option_context_add_group(context,compatibility);
440 g_option_context_set_description(context,
441 "For simplicity, only the switch options which reverse the\n"
442 "default configuration are listed. In most cases, both vanilla\n"
443 "and \"no-\" prefixed versions are available for use.");
444 if (!g_option_context_parse(context,argc,argv,&err))
446 g_printerr("Bookloupe: %s\n",err->message);
447 g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
451 pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
454 pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
455 pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
458 * Web uploads - for the moment, this is really just a placeholder
459 * until we decide what processing we really want to do on web uploads
461 if (pswit[WEB_SWITCH])
463 /* specific override for web uploads */
464 pswit[ECHO_SWITCH]=TRUE;
465 pswit[SQUOTE_SWITCH]=FALSE;
466 pswit[TYPO_SWITCH]=TRUE;
467 pswit[QPARA_SWITCH]=FALSE;
468 pswit[PARANOID_SWITCH]=TRUE;
469 pswit[LINE_END_SWITCH]=FALSE;
470 pswit[OVERVIEW_SWITCH]=FALSE;
471 pswit[STDOUT_SWITCH]=FALSE;
472 pswit[HEADER_SWITCH]=TRUE;
473 pswit[VERBOSE_SWITCH]=FALSE;
474 pswit[MARKUP_SWITCH]=FALSE;
475 pswit[USERTYPO_SWITCH]=FALSE;
476 pswit[DP_SWITCH]=FALSE;
478 if (pswit[DUMP_CONFIG_SWITCH])
483 if (pswit[OVERVIEW_SWITCH])
484 /* just print summary; don't echo */
485 pswit[ECHO_SWITCH]=FALSE;
491 g_option_context_free(context);
497 * Read in the user-defined stealth scanno list.
499 void read_user_scannos(void)
502 gchar *usertypo_file;
506 gchar *contents,*utf8,**lines;
507 usertypo_file=g_strdup("bookloupe.typ");
508 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
509 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
512 g_free(usertypo_file);
513 usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
514 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
516 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
519 g_free(usertypo_file);
520 usertypo_file=g_strdup("gutcheck.typ");
521 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
523 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
526 g_free(usertypo_file);
527 usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
528 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
530 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
532 g_free(usertypo_file);
533 g_print(" --> I couldn't find bookloupe.typ "
534 "-- proceeding without user typos.\n");
539 fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
540 g_free(usertypo_file);
544 if (g_utf8_validate(contents,len,NULL))
545 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
547 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
549 lines=g_strsplit_set(utf8,"\r\n",0);
551 usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
552 for (i=0;lines[i];i++)
553 if (*(unsigned char *)lines[i]>'!')
554 g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
563 * Read an etext returning a newly allocated string containing the file
564 * contents or NULL on error.
566 gchar *read_etext(const char *filename,GError **err)
568 GError *tmp_err=NULL;
569 gchar *contents,*utf8;
570 gsize len,bytes_read,bytes_written;
572 if (!g_file_get_contents(filename,&contents,&len,err))
574 if (g_utf8_validate(contents,len,NULL))
576 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
577 g_set_print_handler(print_as_utf_8);
579 SetConsoleOutputCP(CP_UTF8);
584 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,
585 &bytes_written,&tmp_err);
586 if (g_error_matches(tmp_err,G_CONVERT_ERROR,
587 G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
590 for(i=0;i<bytes_read;i++)
591 if (contents[i]=='\n')
596 else if (contents[i]!='\r')
598 g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
599 "Input conversion failed. Byte %d at line %d, column %d is not a "
600 "valid Windows-1252 character",
601 ((unsigned char *)contents)[bytes_read],line,col);
604 g_propagate_error(err,tmp_err);
605 g_set_print_handler(print_as_windows_1252);
607 SetConsoleOutputCP(1252);
614 void cleanup_on_exit(void)
617 SetConsoleOutputCP(saved_cp);
621 int main(int argc,char **argv)
624 atexit(cleanup_on_exit);
625 saved_cp=GetConsoleOutputCP();
627 running_from=g_path_get_dirname(argv[0]);
628 /* Paranoid checking is turned OFF, not on, by its switch */
629 pswit[PARANOID_SWITCH]=TRUE;
630 /* if running in paranoid mode, typo checks default to enabled */
631 pswit[TYPO_SWITCH]=TRUE;
632 /* Line-end checking is turned OFF, not on, by its switch */
633 pswit[LINE_END_SWITCH]=TRUE;
634 /* Echoing is turned OFF, not on, by its switch */
635 pswit[ECHO_SWITCH]=TRUE;
637 parse_options(&argc,&argv);
638 if (pswit[USERTYPO_SWITCH])
640 fprintf(stderr,"bookloupe: Check and report on an e-text\n");
642 if (pswit[OVERVIEW_SWITCH])
644 g_print(" Checked %ld lines of %ld (head+foot = %ld)\n\n",
645 checked_linecnt,linecnt,linecnt-checked_linecnt);
646 g_print(" --------------- Queries found --------------\n");
648 g_print(" Long lines: %14ld\n",cnt_long);
650 g_print(" Short lines: %14ld\n",cnt_short);
652 g_print(" Line-end problems: %14ld\n",cnt_lineend);
654 g_print(" Common typos: %14ld\n",cnt_word);
656 g_print(" Unmatched quotes: %14ld\n",cnt_quote);
658 g_print(" Unmatched brackets: %14ld\n",cnt_brack);
660 g_print(" Non-ASCII characters: %14ld\n",cnt_bin);
662 g_print(" Proofing characters: %14ld\n",cnt_odd);
664 g_print(" Punctuation & spacing queries: %14ld\n",cnt_punct);
666 g_print(" Non-standard dashes: %14ld\n",cnt_dash);
668 g_print(" Possible HTML tags: %14ld\n",cnt_html);
670 g_print(" TOTAL QUERIES %14ld\n",
671 cnt_quote+cnt_brack+cnt_bin+cnt_odd+cnt_long+cnt_short+cnt_punct+
672 cnt_dash+cnt_word+cnt_html+cnt_lineend);
674 g_free(running_from);
676 g_tree_unref(usertypo);
678 g_key_file_free(config);
682 void count_dashes(const char *line,const char *dash,
683 struct dash_results *results)
688 gboolean spaced=FALSE,unspaced=FALSE,spaced2=FALSE;
691 tokens=g_strsplit(line,dash,0);
694 for(i=1;tokens[i];i++)
696 pc=g_utf8_get_char(g_utf8_prev_char(tokens[i-1]+strlen(tokens[i-1])));
697 nc=g_utf8_get_char(tokens[i]);
698 if (g_unichar_isspace(pc) || g_unichar_isspace(nc))
700 if (g_unichar_isspace(pc) && g_unichar_isspace(nc))
702 else if (!g_unichar_isspace(pc) && !g_unichar_isspace(nc))
708 /* count of lines with em-dashes with spaces both sides */
709 results->non_PG_space++;
711 /* count of lines with PG-type em-dashes with no spaces */
719 * Run a first pass - verify that it's a valid PG
720 * file, decide whether to report some things that
721 * occur many times in the text like long or short
722 * lines, non-standard dashes, etc.
724 struct first_pass_results *first_pass(const char *etext)
726 gunichar laststart=CHAR_SPACE;
731 unsigned int lastlen=0,lastblen=0;
732 long spline=0,nspline=0;
733 static struct first_pass_results results={0};
734 struct dash_results tmp_dash_results;
737 lines=g_strsplit(etext,"\n",0);
738 for (j=0;lines[j];j++)
740 lbytes=strlen(lines[j]);
741 while (lbytes>0 && lines[j][lbytes-1]=='\r')
742 lines[j][--lbytes]='\0';
743 llen=g_utf8_strlen(lines[j],lbytes);
745 if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
746 (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
749 g_print(" --> Duplicate header?\n");
750 spline=linecnt+1; /* first line of non-header text, that is */
752 if (!strncmp(lines[j],"*** START",9) &&
753 strstr(lines[j],"PROJECT GUTENBERG"))
756 g_print(" --> Duplicate header?\n");
757 nspline=linecnt+1; /* first line of non-header text, that is */
759 if (spline || nspline)
761 lc_line=g_utf8_strdown(lines[j],lbytes);
762 if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
764 if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
766 if (results.footerline)
768 /* it's an old-form header - we can detect duplicates */
770 g_print(" --> Duplicate footer?\n");
773 results.footerline=linecnt;
779 results.firstline=spline;
781 results.firstline=nspline; /* override with new */
782 if (results.footerline)
783 continue; /* don't count the boilerplate in the footer */
784 results.totlen+=llen;
785 for (s=lines[j];*s;s=g_utf8_next_char(s))
787 if (g_utf8_get_char(s)>127)
789 if (g_unichar_isalpha(g_utf8_get_char(s)))
793 if (CHAR_IS_DQUOTE(g_utf8_get_char(s)))
794 qc=QUOTE_CLASS(g_utf8_get_char(s));
797 if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) &&
798 g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
799 results.endquote_count++;
802 if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
803 lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
806 g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
808 if (strstr(lines[j],".,"))
810 /* Only count asterisk lines (for suppressing reports) when there is */
811 /* also lowercase text on the line. */
812 if (strchr(lines[j],'*'))
814 for (s=lines[j];*s;s=g_utf8_next_char(s))
815 if (g_unichar_islower(g_utf8_get_char(s)))
820 if (strchr(lines[j],'/'))
821 results.fslashline++;
824 for (s=g_utf8_prev_char(lines[j]+lbytes);
825 s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;
826 s=g_utf8_prev_char(s))
828 if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
829 g_utf8_get_char(g_utf8_prev_char(s))!='-')
832 if (llen>LONGEST_PG_LINE)
834 if (llen>WAY_TOO_LONG)
835 results.verylongline++;
836 if (strchr(lines[j],'<') && strchr(lines[j],'>'))
838 i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
841 if (strstr(lines[j],"<i>"))
842 results.htmcount+=4; /* bonus marks! */
844 /* Check for spaced em-dashes */
845 memset(&tmp_dash_results,0,sizeof(tmp_dash_results));
846 count_dashes(lines[j],"--",&tmp_dash_results);
847 count_dashes(lines[j],"—",&tmp_dash_results);
848 if (tmp_dash_results.base)
849 results.emdash.base++;
850 if (tmp_dash_results.non_PG_space)
851 results.emdash.non_PG_space++;
852 if (tmp_dash_results.PG_space)
853 results.emdash.PG_space++;
857 if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))
858 results.Dutchcount++;
859 if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))
860 results.Frenchcount++;
861 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
862 results.standalone_digit++;
865 /* Check for spaced dashes */
866 if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
870 laststart=lines[j][0];
879 * Make some snap decisions based on the first pass results.
881 struct warnings *report_first_pass(struct first_pass_results *results)
883 static struct warnings warnings={0};
885 g_print(" --> %ld lines in this file have white space at end\n",
888 if (results->dotcomma>5)
891 g_print(" --> %ld lines in this file contain '.,'. "
892 "Not reporting them.\n",results->dotcomma);
895 * If more than 50 lines, or one-tenth, are short,
896 * don't bother reporting them.
898 warnings.shortline=1;
899 if (results->shortline>50 || results->shortline*10>linecnt)
901 warnings.shortline=0;
902 g_print(" --> %ld lines in this file are short. "
903 "Not reporting short lines.\n",results->shortline);
906 * If more than 50 lines, or one-tenth, are long,
907 * don't bother reporting them.
910 if (results->longline>50 || results->longline*10>linecnt)
913 g_print(" --> %ld lines in this file are long. "
914 "Not reporting long lines.\n",results->longline);
916 /* If more than 10 lines contain asterisks, don't bother reporting them. */
918 if (results->astline>10)
921 g_print(" --> %ld lines in this file contain asterisks. "
922 "Not reporting them.\n",results->astline);
925 * If more than 10 lines contain forward slashes,
926 * don't bother reporting them.
929 if (results->fslashline>10)
932 g_print(" --> %ld lines in this file contain forward slashes. "
933 "Not reporting them.\n",results->fslashline);
936 * If more than 20 lines contain unpunctuated endquotes,
937 * don't bother reporting them.
940 if (results->endquote_count>20)
943 g_print(" --> %ld lines in this file contain unpunctuated endquotes. "
944 "Not reporting them.\n",results->endquote_count);
947 * If more than 10 lines contain standalone digits,
948 * don't bother reporting them.
951 if (results->standalone_digit>10)
954 g_print(" --> %ld lines in this file contain standalone 0s and 1s. "
955 "Not reporting them.\n",results->standalone_digit);
958 * If more than 20 lines contain hyphens at end,
959 * don't bother reporting them.
962 if (results->hyphens>20)
965 g_print(" --> %ld lines in this file have hyphens at end. "
966 "Not reporting them.\n",results->hyphens);
968 if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
970 g_print(" --> Looks like this is HTML. Switching HTML mode ON.\n");
971 pswit[MARKUP_SWITCH]=1;
973 if (results->verylongline>0)
974 g_print(" --> %ld lines in this file are VERY long!\n",
975 results->verylongline);
977 * If there are more non-PG spaced dashes than PG em-dashes,
978 * assume it's deliberate.
979 * Current PG guidelines say don't use them, but older texts do,
980 * and some people insist on them whatever the guidelines say.
983 if (results->spacedash+results->emdash.non_PG_space>
984 results->emdash.PG_space)
987 g_print(" --> There are %ld spaced dashes and em-dashes. "
988 "Not reporting them.\n",
989 results->spacedash+results->emdash.non_PG_space);
991 /* If more than a quarter of characters are hi-bit, bug out. */
993 if (results->binlen*4>results->totlen)
995 g_print(" --> This file does not appear to be ASCII. "
996 "Terminating. Best of luck with it!\n");
999 if (results->alphalen*4<results->totlen)
1001 g_print(" --> This file does not appear to be text. "
1002 "Terminating. Best of luck with it!\n");
1005 if (results->binlen*100>results->totlen || results->binlen>100)
1007 g_print(" --> There are a lot of foreign letters here. "
1008 "Not reporting them.\n");
1011 warnings.isDutch=FALSE;
1012 if (results->Dutchcount>50)
1014 warnings.isDutch=TRUE;
1015 g_print(" --> This looks like Dutch - "
1016 "switching off dashes and warnings for 's Middags case.\n");
1018 warnings.isFrench=FALSE;
1019 if (results->Frenchcount>50)
1021 warnings.isFrench=TRUE;
1022 g_print(" --> This looks like French - "
1023 "switching off some doublepunct.\n");
1025 if (results->firstline && results->footerline)
1026 g_print(" The PG header and footer appear to be already on.\n");
1029 if (results->firstline)
1030 g_print(" The PG header is on - no footer.\n");
1031 if (results->footerline)
1032 g_print(" The PG footer is on - no header.\n");
1035 if (pswit[VERBOSE_SWITCH])
1038 warnings.shortline=1;
1039 warnings.dotcomma=1;
1040 warnings.longline=1;
1046 warnings.endquote=1;
1047 g_print(" *** Verbose output is ON -- you asked for it! ***\n");
1049 if (warnings.isDutch)
1051 if (results->footerline>0 && results->firstline>0 &&
1052 results->footerline>results->firstline &&
1053 results->footerline-results->firstline<100)
1055 g_print(" --> I don't really know where this text starts. \n");
1056 g_print(" There are no reference points.\n");
1057 g_print(" I'm going to have to report the header and footer "
1059 results->firstline=0;
1067 * Look along the line, accumulate the count of quotes, and see
1068 * if this is an empty line - i.e. a line with nothing on it
1070 * If line has just spaces, period, * and/or - on it, don't
1071 * count it, since empty lines with asterisks or dashes to
1072 * separate sections are common.
1074 * Returns: TRUE if the line is empty.
1076 gboolean analyse_quotes(const char *aline,struct counters *counters)
1079 /* assume the line is empty until proven otherwise */
1080 gboolean isemptyline=TRUE;
1081 const char *s=aline,*sprev,*snext;
1084 GError *tmp_err=NULL;
1087 snext=g_utf8_next_char(s);
1088 c=g_utf8_get_char(s);
1089 if (CHAR_IS_DQUOTE(c))
1090 (void)count_quote(counters,c,QUOTE_CLASS(c),&tmp_err);
1091 else if (CHAR_IS_SQUOTE(c) && pswit[SQUOTE_SWITCH])
1096 * At start of line, it can only be a quotation mark.
1097 * Hardcode a very common exception!
1099 if (!g_str_has_prefix(snext,"tis") &&
1100 !g_str_has_prefix(snext,"Tis"))
1101 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
1103 else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
1104 g_unichar_isalpha(g_utf8_get_char(snext)))
1105 /* Do nothing! it's definitely an apostrophe, not a quote */
1107 /* it's outside a word - let's check it out */
1108 else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||
1109 g_unichar_isalpha(g_utf8_get_char(snext)))
1111 /* certainly looks like a quotation mark */
1112 if (!g_str_has_prefix(snext,"tis") &&
1113 !g_str_has_prefix(snext,"Tis"))
1114 /* hardcode a very common exception! */
1116 if (strchr(".?!,;:",g_utf8_get_char(sprev)))
1117 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
1119 (void)count_quote(counters,c,OPENING_QUOTE,&tmp_err);
1124 /* now - is it a quotation mark? */
1125 guessquote=0; /* accumulate clues */
1126 if (g_unichar_isalpha(g_utf8_get_char(sprev)))
1128 /* it follows a letter - could be either */
1130 if (g_utf8_get_char(sprev)=='s')
1132 /* looks like a plural apostrophe */
1134 if (g_utf8_get_char(snext)==CHAR_SPACE)
1138 if (innermost_quote_matches(counters,c))
1140 * Give it the benefit of some doubt,
1141 * if a squote is already open.
1147 (void)count_quote(counters,c,CLOSING_QUOTE,&tmp_err);
1150 /* no adjacent letter - it must be a quote of some kind */
1151 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
1156 if (pswit[ECHO_SWITCH])
1157 g_print("\n%s\n",aline);
1158 if (!pswit[OVERVIEW_SWITCH])
1159 g_print(" Line %ld column %ld - %s\n",
1160 linecnt,g_utf8_pointer_to_offset(aline,s)+1,tmp_err->message);
1161 g_clear_error(&tmp_err);
1163 if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
1165 isemptyline=FALSE; /* ignore lines like * * * as spacers */
1166 if (c==CHAR_UNDERSCORE)
1167 counters->c_unders++;
1168 if (c==CHAR_OPEN_SBRACK)
1170 if (!matching_difference(counters,COUNTER_ILLUSTRATION) &&
1171 !matching_difference(counters,c) && s==aline &&
1172 g_str_has_prefix(s,"[Illustration:"))
1173 increment_matching(counters,COUNTER_ILLUSTRATION,TRUE);
1175 increment_matching(counters,c,TRUE);
1177 else if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK)
1178 increment_matching(counters,c,TRUE);
1179 if (c==CHAR_CLOSE_SBRACK)
1181 if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) &&
1182 !matching_difference(counters,c) && !*snext)
1183 increment_matching(counters,COUNTER_ILLUSTRATION,FALSE);
1185 increment_matching(counters,c,FALSE);
1187 else if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK)
1188 increment_matching(counters,c,FALSE);
1196 * check_for_control_characters:
1198 * Check for non-printable control characters in the line.
1199 * Any code point below space is flagged, except LF, CR and TAB.
1200 * Non-ASCII characters and tabs are not reported here; they are
1201 * queried by check_for_odd_characters.
/*
 * Scan aline one UTF-8 character at a time and report any control
 * character (code point below CHAR_SPACE) other than LF, CR or TAB.
 * The line is echoed first when ECHO_SWITCH is set; the detailed
 * message is printed unless OVERVIEW_SWITCH is set.
 */
1203 void check_for_control_characters(const char *aline)
1207 for (s=aline;*s;s=g_utf8_next_char(s))
1209 c=g_utf8_get_char(s);
1210 if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
1212 if (pswit[ECHO_SWITCH])
1213 g_print("\n%s\n",aline);
1214 if (!pswit[OVERVIEW_SWITCH])
1215 g_print(" Line %ld column %ld - Control character %u\n",
/*
 * g_utf8_pointer_to_offset() takes the start of the string first and
 * the position second; passing them the other way round yields a
 * negated offset, so the column must be computed as (aline,s)+1.
 */
1216 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1224 * check_for_odd_characters:
1226 * Check for binary and other odd characters.
/*
 * Walk aline one UTF-8 character at a time and query characters that
 * rarely belong in a plain-text book: control or non-ASCII code points,
 * tabs, tildes, carets, forward slashes (only when warnings->fslash is
 * set) and asterisks (only with PARANOID_SWITCH, warnings->ast and a
 * line that is not flagged as empty).  Each report echoes the line when
 * ECHO_SWITCH is set and prints the 1-based character column unless
 * OVERVIEW_SWITCH is set.
 */
1228 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
1229 gboolean isemptyline)
1231 /* Don't repeat multiple warnings on one line. */
1232 gboolean eNon_A=FALSE,eTab=FALSE,eTilde=FALSE;
1233 gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
1236 for (s=aline;*s;s=g_utf8_next_char(s))
1238 c=g_utf8_get_char(s);
/* Control characters other than tab and newline, or anything above 127. */
1239 if (!eNon_A && (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
1241 if (pswit[ECHO_SWITCH])
1242 g_print("\n%s\n",aline);
1243 if (!pswit[OVERVIEW_SWITCH])
/* Code points 128-159 or above 255 get the Non-ISO-8859 wording. */
1244 if (c>127 && c<160 || c>255)
1245 g_print(" Line %ld column %ld - "
1246 "Non-ISO-8859 character %u\n",
1247 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1249 g_print(" Line %ld column %ld - "
1250 "Non-ASCII character %u\n",
1251 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1256 if (!eTab && c==CHAR_TAB)
1258 if (pswit[ECHO_SWITCH])
1259 g_print("\n%s\n",aline);
1260 if (!pswit[OVERVIEW_SWITCH])
1261 g_print(" Line %ld column %ld - Tab character?\n",
1262 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1267 if (!eTilde && c==CHAR_TILDE)
1270 * Often used by OCR software to indicate an
1271 * unrecognizable character.
1273 if (pswit[ECHO_SWITCH])
1274 g_print("\n%s\n",aline);
1275 if (!pswit[OVERVIEW_SWITCH])
1276 g_print(" Line %ld column %ld - Tilde character?\n",
1277 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1282 if (!eCarat && c==CHAR_CARAT)
1284 if (pswit[ECHO_SWITCH])
1285 g_print("\n%s\n",aline);
1286 if (!pswit[OVERVIEW_SWITCH])
1287 g_print(" Line %ld column %ld - Carat character?\n",
1288 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Forward slashes are only queried when the caller enabled warnings->fslash. */
1293 if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
1295 if (pswit[ECHO_SWITCH])
1296 g_print("\n%s\n",aline);
1297 if (!pswit[OVERVIEW_SWITCH])
1298 g_print(" Line %ld column %ld - Forward slash?\n",
1299 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1305 * Report asterisks only in paranoid mode,
1306 * since they're often deliberate.
1308 if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
1311 if (pswit[ECHO_SWITCH])
1312 g_print("\n%s\n",aline);
1313 if (!pswit[OVERVIEW_SWITCH])
1314 g_print(" Line %ld column %ld - Asterisk?\n",
1315 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1324 * check_for_long_line:
1326 * Check for line too long.
/*
 * Query a line whose length in characters (not bytes) exceeds
 * LONGEST_PG_LINE.  The length is reported both as the column and in
 * the message text.
 */
1328 void check_for_long_line(const char *aline)
1330 if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
1332 if (pswit[ECHO_SWITCH])
1333 g_print("\n%s\n",aline);
1334 if (!pswit[OVERVIEW_SWITCH])
1335 g_print(" Line %ld column %ld - Long line %ld\n",
1336 linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
1343 * check_for_short_line:
1345 * Check for line too short.
1347 * This one is a bit trickier to implement: we don't want to
1348 * flag the last line of a paragraph for being short, so we
1349 * have to wait until we know that our current line is a
1350 * "normal" line, then report the _previous_ line if it was too
1351 * short. We also don't want to report indented lines like
1352 * chapter heads or formatted quotations. We therefore keep
1353 * last->len as the length of the last line examined, and
1354 * last->blen as the length of the last but one, and try to
1355 * suppress unnecessary warnings by checking that both were of
1356 * "normal" length. We keep the first character of the last
1357 * line in last->start, and if it was a space, we assume that
1358 * the formatting is deliberate. I can't figure out a way to
1359 * distinguish something like a quoted verse left-aligned or
1360 * the header or footer of a letter from a paragraph of short
1361 * lines - maybe if I examined the whole paragraph, and if the
1362 * para has less than, say, 8 lines and if all lines are short,
1363 * then just assume it's OK? Need to look at some texts to see
1364 * how often a formula like this would get the right result.
/*
 * Query the previous line (prevline, reported as line linecnt-1) as
 * too short.  It is reported only when the current line has more than
 * one character, last->len is greater than 1 but below
 * SHORTEST_PG_LINE, last->blen is above SHORTEST_PG_LINE, and
 * last->start is not a space.
 */
1366 void check_for_short_line(const char *aline,const struct line_properties *last)
1368 if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
1369 last->len<SHORTEST_PG_LINE && last->blen>1 &&
1370 last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
/* The echo and the report both refer to prevline, not aline. */
1372 if (pswit[ECHO_SWITCH])
1373 g_print("\n%s\n",prevline);
1374 if (!pswit[OVERVIEW_SWITCH])
1375 g_print(" Line %ld column %ld - Short line %ld?\n",
1376 linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
1383 * check_for_starting_punctuation:
1385 * Look for punctuation other than full ellipses at start of line.
/*
 * Query a non-empty line whose first character is one of .?!,;: unless
 * the line begins with the spaced ellipsis ". . .".  The report is
 * always for column 1.
 */
1387 void check_for_starting_punctuation(const char *aline)
1389 if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
1390 !g_str_has_prefix(aline,". . ."))
1392 if (pswit[ECHO_SWITCH])
1393 g_print("\n%s\n",aline);
1394 if (!pswit[OVERVIEW_SWITCH])
1395 g_print(" Line %ld column 1 - Begins with punctuation?\n",
1405 * Find the first em-dash, return a pointer to it and set <next> to the
1406 * character following the dash.
/*
 * Locate an em-dash in s and store in *next a pointer just past it.
 * NOTE(review): in the visible assignments *next is either one
 * character past s2 or two characters past s1 - presumably a
 * single-character dash versus a two-character form; confirm against
 * the complete body.
 */
1408 char *str_emdash(const char *s,const char **next)
1416 *next=g_utf8_next_char(s2);
1421 *next=g_utf8_next_char(g_utf8_next_char(s1));
1426 *next=g_utf8_next_char(g_utf8_next_char(s1));
1431 *next=g_utf8_next_char(s2);
1437 * check_for_spaced_emdash:
1439 * Check for spaced em-dashes.
1441 * We must check _all_ occurrences of em-dashes on the line
1442 * hence the loop - even if the first dash is OK
1443 * there may be another that's wrong later on.
/*
 * For every em-dash that str_emdash() finds on the line, query it when
 * the character before it (if it is not at the start of the line) or
 * the character after it is a space.  Scanning resumes at the position
 * str_emdash() stored in next.
 */
1445 void check_for_spaced_emdash(const char *aline)
1447 const char *s,*t,*next;
1448 for (s=aline;t=str_emdash(s,&next);s=next)
/* t>aline guards the look-behind so the start of the line is never passed. */
1450 if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
1451 g_utf8_get_char(next)==CHAR_SPACE)
1453 if (pswit[ECHO_SWITCH])
1454 g_print("\n%s\n",aline);
1455 if (!pswit[OVERVIEW_SWITCH])
1456 g_print(" Line %ld column %ld - Spaced em-dash?\n",
1457 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1465 * check_for_spaced_dash:
1467 * Check for spaced dashes.
/*
 * Query a single dash that has a space on one side.  The first " -" on
 * the line is queried unless the character after the dash is another
 * '-'.  Otherwise the first "- " is located and queried when it is at
 * the start of the line or is not preceded by another '-'.  Only the
 * first occurrence of each pattern is examined.
 */
1469 void check_for_spaced_dash(const char *aline)
1472 if ((s=strstr(aline," -")))
/* Two characters on from the space is the character following the dash. */
1474 if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
1476 if (pswit[ECHO_SWITCH])
1477 g_print("\n%s\n",aline);
1478 if (!pswit[OVERVIEW_SWITCH])
1479 g_print(" Line %ld column %ld - Spaced dash?\n",
1480 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1485 else if ((s=strstr(aline,"- ")))
/* s==aline is tested first so the look-behind never precedes the line. */
1487 if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
1489 if (pswit[ECHO_SWITCH])
1490 g_print("\n%s\n",aline);
1491 if (!pswit[OVERVIEW_SWITCH])
1492 g_print(" Line %ld column %ld - Spaced dash?\n",
1493 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1501 * check_for_unmarked_paragraphs:
1503 * Check for unmarked paragraphs indicated by separate speakers.
1505 * May well be false positive:
1506 * "Bravo!" "Wonderful!" called the crowd.
1507 * but useful all the same.
/*
 * Look for a double quote, white space and another double quote on the
 * same line, and query a possible missing paragraph break at the
 * position of the first quote.
 */
1509 void check_for_unmarked_paragraphs(const char *aline)
1512 s=strstr(aline,"\" \"");
1514 s=strstr(aline,"\" \"");
1517 if (pswit[ECHO_SWITCH])
1518 g_print("\n%s\n",aline);
1519 if (!pswit[OVERVIEW_SWITCH])
1520 g_print(" Line %ld column %ld - "
1521 "Query missing paragraph break?\n",
1522 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1529 * check_for_jeebies:
1531 * Check for "to he" and other easy h/b errors.
1533 * This is a very inadequate effort on the h/b problem,
1534 * but the phrase "to he" is almost always an error, whereas "to
1535 * be" is quite common.
1536 * Similarly, '"Quiet!", be said.' is a non-be error.
1537 * However, "to he" is _not_ always an error:
1538 * "Where they went to he couldn't say."
1539 * Another false positive:
1540 * What would "Cinderella" be without the . . .
1541 * and another: "If he wants to he can see for himself."
/*
 * Search the line for fixed phrases that usually indicate an h/b
 * confusion and query them in three groups: he/be, had/bad and
 * hut/but.  All matching is case-sensitive plain substring search with
 * strstr(); each report gives the 1-based column where the matched
 * phrase starts.
 */
1543 void check_for_jeebies(const char *aline)
/* Group 1: "be" where "he" was probably meant, plus " to he ". */
1546 s=strstr(aline," be could ");
1548 s=strstr(aline," be would ");
1550 s=strstr(aline," was be ");
1552 s=strstr(aline," be is ");
1554 s=strstr(aline," is be ");
1556 s=strstr(aline,"\", be ");
1558 s=strstr(aline,"\" be ");
1560 s=strstr(aline,"\" be ");
1562 s=strstr(aline," to he ");
1565 if (pswit[ECHO_SWITCH])
1566 g_print("\n%s\n",aline);
1567 if (!pswit[OVERVIEW_SWITCH])
1568 g_print(" Line %ld column %ld - Query he/be error?\n",
1569 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Group 2: had/bad confusions. */
1573 s=strstr(aline," the had ");
1575 s=strstr(aline," a had ");
1577 s=strstr(aline," they bad ");
1579 s=strstr(aline," she bad ");
1581 s=strstr(aline," he bad ");
1583 s=strstr(aline," you bad ");
1585 s=strstr(aline," i bad ");
1588 if (pswit[ECHO_SWITCH])
1589 g_print("\n%s\n",aline);
1590 if (!pswit[OVERVIEW_SWITCH])
1591 g_print(" Line %ld column %ld - Query had/bad error?\n",
1592 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Group 3: "hut" after a semicolon or comma, where "but" is likely. */
1596 s=strstr(aline,"; hut ");
1598 s=strstr(aline,", hut ");
1601 if (pswit[ECHO_SWITCH])
1602 g_print("\n%s\n",aline);
1603 if (!pswit[OVERVIEW_SWITCH])
1604 g_print(" Line %ld column %ld - Query hut/but error?\n",
1605 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1612 * check_for_mta_from:
1614 * Special case - angled bracket in front of "From" placed there by an
1615 * MTA when sending an e-mail.
/*
 * Query the first occurrence of ">From" anywhere on the line, reporting
 * the 1-based column of the '>'.
 */
1617 void check_for_mta_from(const char *aline)
1620 s=strstr(aline,">From");
1623 if (pswit[ECHO_SWITCH])
1624 g_print("\n%s\n",aline);
1625 if (!pswit[OVERVIEW_SWITCH])
1626 g_print(" Line %ld column %ld - "
1627 "Query angled bracket with From\n",
1628 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1635 * check_for_orphan_character:
1637 * Check for a single character line -
1638 * often an overflow from bad wrapping.
/*
 * Query a line that consists of exactly one character.  The letters
 * I, V, X and L and any digit are treated as numerals and ignored.
 */
1640 void check_for_orphan_character(const char *aline)
1643 c=g_utf8_get_char(aline);
/* Non-empty line whose second character is the terminating NUL. */
1644 if (c && !*g_utf8_next_char(aline))
1646 if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
1647 ; /* Nothing - ignore numerals alone on a line. */
1650 if (pswit[ECHO_SWITCH])
1651 g_print("\n%s\n",aline);
1652 if (!pswit[OVERVIEW_SWITCH])
1653 g_print(" Line %ld column 1 - Query single character line\n",
1662 * check_for_pling_scanno:
1664 * Check for I" - often should be !
/*
 * Query the first occurrence of a space, a capital I and a double
 * quote, which is often a misread exclamation mark.
 * NOTE(review): unlike most checks in this file the column is the raw
 * character offset of the match with no +1 added - confirm whether
 * that is intended before changing it.
 */
1666 void check_for_pling_scanno(const char *aline)
1669 s=strstr(aline," I\"");
1672 if (pswit[ECHO_SWITCH])
1673 g_print("\n%s\n",aline);
1674 if (!pswit[OVERVIEW_SWITCH])
1675 g_print(" Line %ld column %ld - Query I=exclamation mark?\n",
1676 linecnt,g_utf8_pointer_to_offset(aline,s));
1683 * check_for_extra_period:
1685 * Check for period without a capital letter. Cut-down from gutspell.
1686 * Only works when it happens on a single line.
/*
 * Paranoid-mode check for a period that is not followed by a capital.
 * Each ". " on the line is examined; when the first letter or digit
 * after it is a lower-case letter, the word before the period is
 * copied into testword and run through a series of tests (the abbrev[]
 * list, a leading digit, a single character, a Roman numeral and a
 * vowel scan over canonically decomposed characters).  A surviving
 * word is recorded in qperiod and queried as "Extra period?"; a word
 * already in qperiod is only queried again with VERBOSE_SWITCH.
 */
1688 void check_for_extra_period(const char *aline,const struct warnings *warnings)
1690 const char *s,*t,*s1,*sprev;
1695 gunichar c,nc,pc,*decomposition;
1696 if (pswit[PARANOID_SWITCH])
1698 for (t=aline;t=strstr(t,". ");)
1702 t=g_utf8_next_char(t);
1703 /* start of line punctuation is handled elsewhere */
1706 if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
1708 t=g_utf8_next_char(t);
1711 if (warnings->isDutch)
1713 /* For Frank & Jeroen -- 's Middags case */
1714 gunichar c2,c3,c4,c5;
1715 c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
1716 c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
1717 c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
1718 c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
1719 if (CHAR_IS_APOSTROPHE(c2) &&
1720 g_unichar_islower(c3) && c4==CHAR_SPACE &&
1721 g_unichar_isupper(c5))
1723 t=g_utf8_next_char(t);
/* Skip ahead to the first letter or digit after the period and space. */
1727 s1=g_utf8_next_char(g_utf8_next_char(t));
1728 while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
1729 !g_unichar_isdigit(g_utf8_get_char(s1)))
1730 s1=g_utf8_next_char(s1);
1731 if (g_unichar_islower(g_utf8_get_char(s1)))
1733 /* we have something to investigate */
1735 /* so let's go back and find out */
1736 nc=g_utf8_get_char(t);
1737 s1=g_utf8_prev_char(t);
1738 c=g_utf8_get_char(s1);
1739 sprev=g_utf8_prev_char(s1);
1740 pc=g_utf8_get_char(sprev);
1742 (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||
1743 g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&
1744 g_unichar_isalpha(nc)))
1749 sprev=g_utf8_prev_char(s1);
1750 pc=g_utf8_get_char(sprev);
1752 s1=g_utf8_next_char(s1);
1755 testword=g_strndup(s1,s-s1);
1757 testword=g_strdup(s1);
1758 for (i=0;*abbrev[i];i++)
1759 if (!strcmp(testword,abbrev[i]))
1761 if (g_unichar_isdigit(g_utf8_get_char(testword)))
1763 if (!*g_utf8_next_char(testword))
1765 if (isroman(testword))
1770 for (s=testword;*s;s=g_utf8_next_char(s))
/* Decompose so an accented vowel is compared by its base letter. */
1772 decomposition=g_unicode_canonical_decomposition(
1773 g_utf8_get_char(s),&len);
1774 if (g_utf8_strchr("aeiou",-1,decomposition[0]))
1776 g_free(decomposition);
1780 (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
1782 g_tree_insert(qperiod,g_strdup(testword),
1783 GINT_TO_POINTER(1));
1784 if (pswit[ECHO_SWITCH])
1785 g_print("\n%s\n",aline);
1786 if (!pswit[OVERVIEW_SWITCH])
1787 g_print(" Line %ld column %ld - Extra period?\n",
1788 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1794 t=g_utf8_next_char(t);
1800 * check_for_following_punctuation:
1802 * Check for words usually not followed by punctuation.
/*
 * With TYPO_SWITCH set, compare a lower-cased copy of a word (inword)
 * against two word lists.  A word in nocomma[] is queried when the
 * character at s is a comma, semicolon or colon; a word in noperiod[]
 * is queried when that character is a period or exclamation mark.
 * Both lists are scanned until an empty-string entry is reached.
 */
1804 void check_for_following_punctuation(const char *aline)
1807 const char *s,*wordstart;
1810 if (pswit[TYPO_SWITCH])
1821 inword=g_utf8_strdown(t,-1);
1823 for (i=0;*nocomma[i];i++)
1824 if (!strcmp(inword,nocomma[i]))
1826 c=g_utf8_get_char(s);
1827 if (c==',' || c==';' || c==':')
1829 if (pswit[ECHO_SWITCH])
1830 g_print("\n%s\n",aline);
1831 if (!pswit[OVERVIEW_SWITCH])
1832 g_print(" Line %ld column %ld - "
1833 "Query punctuation after %s?\n",
1834 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
1840 for (i=0;*noperiod[i];i++)
1841 if (!strcmp(inword,noperiod[i]))
1843 c=g_utf8_get_char(s);
1844 if (c=='.' || c=='!')
1846 if (pswit[ECHO_SWITCH])
1847 g_print("\n%s\n",aline);
1848 if (!pswit[OVERVIEW_SWITCH])
1849 g_print(" Line %ld column %ld - "
1850 "Query punctuation after %s?\n",
1851 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
1865 * Check for commonly mistyped words,
1866 * and digits like 0 for O in a word.
/*
 * Examine each word obtained from getaword().  A word for which
 * mixdigit() is true is queried as "digit in".  With TYPO_SWITCH or
 * USERTYPO_SWITCH, a case-folded copy (testword) is tested: with
 * TYPO_SWITCH against the nostart[]/noend[] lists, a set of unlikely
 * letter sequences, a vowel/consonant scan, the okword[] and typo[]
 * lists and isroman(); then against the usertypo tree when one is
 * loaded.  Words are looked up in qword, and with VERBOSE_SWITCH off
 * the "Query word" message carries a "not reporting duplicates" note.
 * In paranoid mode with warnings->digit, a standalone "0" or "1" is
 * queried as well.
 */
1868 void check_for_typos(const char *aline,struct warnings *warnings)
1870 const char *s,*t,*nt,*wordstart;
1872 gunichar *decomposition;
1874 int i,vowel,consonant,*dupcnt;
1875 gboolean isdup,istypo,alower;
1878 gsize decomposition_len;
1882 inword=getaword(&s);
1886 continue; /* don't bother with empty lines */
1888 if (mixdigit(inword))
1890 if (pswit[ECHO_SWITCH])
1891 g_print("\n%s\n",aline);
1892 if (!pswit[OVERVIEW_SWITCH])
1893 g_print(" Line %ld column %ld - Query digit in %s\n",
1894 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
1899 * Put the word through a series of tests for likely typos and OCR
1902 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
1906 for (t=inword;*t;t=g_utf8_next_char(t))
1908 c=g_utf8_get_char(t);
1909 nt=g_utf8_next_char(t);
1910 /* lowercase for testing */
1911 if (g_unichar_islower(c))
1913 if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
1916 * We have an uppercase mid-word. However, there are
1918 * Mac and Mc like McGill
1919 * French contractions like l'Abbe
1921 offset=g_utf8_pointer_to_offset(inword,t);
1923 pc=g_utf8_get_char(g_utf8_prev_char(t));
1926 if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
1927 offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
1928 g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
1929 CHAR_IS_APOSTROPHE(pc))
/* All list comparisons below use the case-folded copy of the word. */
1935 testword=g_utf8_casefold(inword,-1);
1937 if (pswit[TYPO_SWITCH])
1940 * Check for certain unlikely two-letter combinations at word
1943 len=g_utf8_strlen(testword,-1);
1946 for (i=0;*nostart[i];i++)
1947 if (g_str_has_prefix(testword,nostart[i]))
1949 for (i=0;*noend[i];i++)
1950 if (g_str_has_suffix(testword,noend[i]))
1953 /* ght is common, gbt never. Like that. */
1954 if (strstr(testword,"cb"))
1956 if (strstr(testword,"gbt"))
1958 if (strstr(testword,"pbt"))
1960 if (strstr(testword,"tbs"))
1962 if (strstr(testword,"mrn"))
1964 if (strstr(testword,"ahle"))
1966 if (strstr(testword,"ihle"))
1969 * "TBE" does happen - like HEARTBEAT - but uncommon.
1970 * Also "TBI" - frostbite, outbid - but uncommon.
1971 * Similarly "ii" like Hawaii, or Pompeii, and in Roman
1972 * numerals, but "ii" is a common scanno.
1974 if (strstr(testword,"tbi"))
1976 if (strstr(testword,"tbe"))
1978 if (strstr(testword,"ii"))
1981 * Check for no vowels or no consonants.
1982 * If none, flag a typo.
1984 if (!istypo && len>1)
1987 for (t=testword;*t;t=g_utf8_next_char(t))
1989 c=g_utf8_get_char(t);
1991 g_unicode_canonical_decomposition(c,&decomposition_len);
1992 if (c=='y' || g_unichar_isdigit(c))
1994 /* Yah, this is loose. */
1998 else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
2002 g_free(decomposition);
2004 if (!vowel || !consonant)
2008 * Now exclude the word from being reported if it's in
2011 for (i=0;*okword[i];i++)
2012 if (!strcmp(testword,okword[i]))
2015 * What looks like a typo may be a Roman numeral.
2018 if (istypo && isroman(testword))
2020 /* Check the manual list of typos. */
2022 for (i=0;*typo[i];i++)
2023 if (!strcmp(testword,typo[i]))
2026 * Check lowercase s, l, i and m - special cases.
2027 * "j" - often a semi-colon gone wrong.
2028 * "d" for a missing apostrophe - he d
2031 if (!istypo && len==1 &&
2032 g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
/* qword maps each queried word to a heap-allocated int (dupcnt). */
2036 dupcnt=g_tree_lookup(qword,testword);
2040 isdup=!pswit[VERBOSE_SWITCH];
2044 dupcnt=g_new0(int,1);
2045 g_tree_insert(qword,g_strdup(testword),dupcnt);
2050 if (pswit[ECHO_SWITCH])
2051 g_print("\n%s\n",aline);
2052 if (!pswit[OVERVIEW_SWITCH])
2054 g_print(" Line %ld column %ld - Query word %s",
2055 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
2057 if (!pswit[VERBOSE_SWITCH])
2058 g_print(" - not reporting duplicates");
2066 /* check the user's list of typos */
2067 if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
2069 if (pswit[ECHO_SWITCH])
2070 g_print("\n%s\n",aline);
2071 if (!pswit[OVERVIEW_SWITCH])
/* NOTE(review): this column uses +2 where the queries above use +1 - confirm. */
2072 g_print(" Line %ld column %ld - Query possible scanno %s\n",
2073 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
2075 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
2077 if (pswit[PARANOID_SWITCH] && warnings->digit)
2079 /* In paranoid mode, query all 0 and 1 standing alone. */
2080 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
2082 if (pswit[ECHO_SWITCH])
2083 g_print("\n%s\n",aline);
2084 if (!pswit[OVERVIEW_SWITCH])
2085 g_print(" Line %ld column %ld - Query standalone %s\n",
2086 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
2097 * check_for_misspaced_punctuation:
2099 * Look for added or missing spaces around punctuation and quotes.
2100 * If there is a punctuation character like ! with no space on
2101 * either side, suspect a missing!space. If there are spaces on
2102 * both sides , assume a typo. If we see a double quote with no
2103 * space or punctuation on either side of it, assume unspaced
2104 * quotes "like"this.
/*
 * Run several independent passes over the line, each starting at the
 * second character and tracking the character at s together with the
 * following character nc:
 *   - punctuation from .?!,;:_ with letters around it ("Missing
 *     space?") or with a space before and a space or end of line after
 *     ("Spaced punctuation?"), with allowances for acronyms and
 *     ellipses;
 *   - ?!,;: preceded by a space and not followed by one;
 *   - a period preceded by a space and followed by a letter;
 *   - double quotes with no space or punctuation next to them
 *     ("Unspaced quotes?").
 * It then toggles parities->dquote for each double quote (and
 * parities->squote for qualifying single quotes when SQUOTE_SWITCH is
 * set), querying "Wrongspaced quotes?" / "Wrongspaced singlequotes?"
 * according to the character that follows, and finally checks a double
 * quote in column 1.
 */
2106 void check_for_misspaced_punctuation(const char *aline,
2107 struct parities *parities,gboolean isemptyline)
2109 gboolean isacro,isellipsis;
2111 gunichar c,nc,pc,n2c;
2113 c=g_utf8_get_char(aline);
/* nc is 0 for an empty line, so the loop below does not run at all. */
2114 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2115 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2119 nc=g_utf8_get_char(g_utf8_next_char(s));
2120 /* For each character in the line after the first. */
2121 if (g_utf8_strchr(".?!,;:_",-1,c)) /* if it's punctuation */
2123 /* we need to suppress warnings for acronyms like M.D. */
2125 /* we need to suppress warnings for ellipsis . . . */
2128 * If there are letters on both sides of it or
2129 * if it's strict punctuation followed by an alpha.
2131 if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
2132 g_utf8_strchr("?!,;:",-1,c)))
2136 if (g_utf8_pointer_to_offset(aline,s)>2 &&
2137 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
2139 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
2145 if (pswit[ECHO_SWITCH])
2146 g_print("\n%s\n",aline);
2147 if (!pswit[OVERVIEW_SWITCH])
2148 g_print(" Line %ld column %ld - Missing space?\n",
2149 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2154 if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
2157 * If there are spaces on both sides,
2158 * or space before and end of line.
2162 if (g_utf8_pointer_to_offset(aline,s)>2 &&
2163 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
2165 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
2169 if (!isemptyline && !isellipsis)
2171 if (pswit[ECHO_SWITCH])
2172 g_print("\n%s\n",aline);
2173 if (!pswit[OVERVIEW_SWITCH])
2174 g_print(" Line %ld column %ld - "
2175 "Spaced punctuation?\n",linecnt,
2176 g_utf8_pointer_to_offset(aline,s)+1);
2183 /* Split out the characters that CANNOT be preceded by space. */
2184 c=g_utf8_get_char(aline);
2185 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2186 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2190 nc=g_utf8_get_char(g_utf8_next_char(s));
2191 /* for each character in the line after the first */
2192 if (g_utf8_strchr("?!,;:",-1,c))
2194 /* if it's punctuation that _cannot_ have a space before it */
2195 if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
2198 * If nc DOES == space,
2199 * it was already reported just above.
2201 if (pswit[ECHO_SWITCH])
2202 g_print("\n%s\n",aline);
2203 if (!pswit[OVERVIEW_SWITCH])
2204 g_print(" Line %ld column %ld - Spaced punctuation?\n",
2205 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2212 * Special case " .X" where X is any alpha.
2213 * This plugs a hole in the acronym code above.
2214 * Inelegant, but maintainable.
2216 c=g_utf8_get_char(aline);
2217 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2218 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2222 nc=g_utf8_get_char(g_utf8_next_char(s));
2223 /* for each character in the line after the first */
2226 /* if it's a period */
2227 if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
2230 * If the period follows a space and
2231 * is followed by a letter.
2233 if (pswit[ECHO_SWITCH])
2234 g_print("\n%s\n",aline);
2235 if (!pswit[OVERVIEW_SWITCH])
2236 g_print(" Line %ld column %ld - Spaced punctuation?\n",
2237 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2243 c=g_utf8_get_char(aline);
2244 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2245 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2249 nc=g_utf8_get_char(g_utf8_next_char(s));
2250 /* for each character in the line after the first */
2251 if (CHAR_IS_DQUOTE(c))
2253 if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
2254 !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
2255 !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
2257 if (pswit[ECHO_SWITCH])
2258 g_print("\n%s\n",aline);
2259 if (!pswit[OVERVIEW_SWITCH])
2260 g_print(" Line %ld column %ld - Unspaced quotes?\n",
2261 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2267 /* Check parity of quotes. */
2268 nc=g_utf8_get_char(aline);
2269 for (s=aline;*s;s=g_utf8_next_char(s))
2272 nc=g_utf8_get_char(g_utf8_next_char(s));
2273 if (CHAR_IS_DQUOTE(c))
/* The parity state lives in the caller's struct and is flipped per quote. */
2277 parities->dquote=!parities->dquote;
2278 parity=parities->dquote;
2280 else if (c==CHAR_LD_QUOTE)
2287 if (!g_utf8_strchr("_-.'`‘’/,;:!?)]} ",-1,nc))
2289 if (pswit[ECHO_SWITCH])
2290 g_print("\n%s\n",aline);
2291 if (!pswit[OVERVIEW_SWITCH])
2292 g_print(" Line %ld column %ld - "
2293 "Wrongspaced quotes?\n",
2294 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2302 if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
2303 !g_utf8_strchr("_-/.'`‘’([{$",-1,nc) || !nc)
2305 if (pswit[ECHO_SWITCH])
2306 g_print("\n%s\n",aline);
2307 if (!pswit[OVERVIEW_SWITCH])
2308 g_print(" Line %ld column %ld - "
2309 "Wrongspaced quotes?\n",
2310 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Separate check for a double quote as the very first character. */
2317 c=g_utf8_get_char(aline);
2318 if (CHAR_IS_DQUOTE(c))
2320 if (g_utf8_strchr(",;:!?)]} ",-1,
2321 g_utf8_get_char(g_utf8_next_char(aline))))
2323 if (pswit[ECHO_SWITCH])
2324 g_print("\n%s\n",aline);
2325 if (!pswit[OVERVIEW_SWITCH])
2326 g_print(" Line %ld column 1 - Wrongspaced quotes?\n",
2332 if (pswit[SQUOTE_SWITCH])
2334 nc=g_utf8_get_char(aline);
2335 for (s=aline;*s;s=g_utf8_next_char(s))
2338 nc=g_utf8_get_char(g_utf8_next_char(s));
/* Only single quotes without a letter on both sides count as quotation marks. */
2339 if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&
2340 !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
2341 !g_unichar_isalpha(nc)))
2343 parities->squote=!parities->squote;
2344 if (!parities->squote)
2347 if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
2349 if (pswit[ECHO_SWITCH])
2350 g_print("\n%s\n",aline);
2351 if (!pswit[OVERVIEW_SWITCH])
2352 g_print(" Line %ld column %ld - "
2353 "Wrongspaced singlequotes?\n",
2354 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2362 if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
2363 !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
2365 if (pswit[ECHO_SWITCH])
2366 g_print("\n%s\n",aline);
2367 if (!pswit[OVERVIEW_SWITCH])
2368 g_print(" Line %ld column %ld - "
2369 "Wrongspaced singlequotes?\n",
2370 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2381 * check_for_double_punctuation:
2383 * Look for double punctuation like ,. or ,,
2384 * Thanks to DW for the suggestion!
2385 * In books with references, ".," and ".;" are common
2386 * e.g. "etc., etc.," and vol. 1.; vol 3.;
2387 * OTOH, from my initial tests, there are also fairly
2388 * common errors. What to do? Make these cases paranoid?
2389 * ".," is the most common, so warnings->dotcomma is used
2390 * to suppress detailed reporting if it occurs often.
/*
 * Query two adjacent characters that are both from .?!,;: unless the
 * pair is exempt.  Exempt pairs are a doubled period, question mark or
 * exclamation mark, a period followed by a comma while
 * warnings->dotcomma is unset, and - for French texts
 * (warnings->isFrench) - a three-dot ellipsis directly before or after
 * one of ,;:!? .
 */
2392 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
2396 nc=g_utf8_get_char(aline);
2397 for (s=aline;*s;s=g_utf8_next_char(s))
2400 nc=g_utf8_get_char(g_utf8_next_char(s));
2401 /* for each punctuation character in the line */
2402 if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
2403 g_utf8_strchr(".?!,;:",-1,nc))
2405 /* followed by punctuation, it's a query, unless . . . */
2406 if (c==nc && (c=='.' || c=='?' || c=='!') ||
2407 !warnings->dotcomma && c=='.' && nc==',' ||
2408 warnings->isFrench && g_str_has_prefix(s,",...") ||
2409 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2410 warnings->isFrench && g_str_has_prefix(s,";...") ||
2411 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2412 warnings->isFrench && g_str_has_prefix(s,":...") ||
2413 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2414 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2415 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2416 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2417 warnings->isFrench && g_str_has_prefix(s,"...?"))
/* The French ellipsis patterns are tested a second time on their own. */
2419 if (warnings->isFrench && g_str_has_prefix(s,",...") ||
2420 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2421 warnings->isFrench && g_str_has_prefix(s,";...") ||
2422 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2423 warnings->isFrench && g_str_has_prefix(s,":...") ||
2424 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2425 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2426 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2427 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2428 warnings->isFrench && g_str_has_prefix(s,"...?"))
2431 nc=g_utf8_get_char(g_utf8_next_char(s));
2433 ; /* do nothing for .. !! and ?? which can be legit */
2437 if (pswit[ECHO_SWITCH])
2438 g_print("\n%s\n",aline);
2439 if (!pswit[OVERVIEW_SWITCH])
2440 g_print(" Line %ld column %ld - Double punctuation?\n",
2441 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2450 * check_for_spaced_quotes:
/*
 * Query every quote character that has a space on both sides.  The
 * ASCII double quote is searched for with a literal pattern; for each
 * entry in single_quotes[] a space/quote/space pattern is built in a
 * GString and searched for in the same way.  After each hit the search
 * resumes two characters past the start of the match.
 */
2452 void check_for_spaced_quotes(const char *aline)
2456 const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,
2460 while ((t=strstr(s," \" ")))
2462 if (pswit[ECHO_SWITCH])
2463 g_print("\n%s\n",aline);
2464 if (!pswit[OVERVIEW_SWITCH])
2465 g_print(" Line %ld column %ld - Spaced doublequote?\n",
2466 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2469 s=g_utf8_next_char(g_utf8_next_char(t));
2471 pattern=g_string_new(NULL);
2472 for(i=0;i<G_N_ELEMENTS(single_quotes);i++)
/* Rebuild the pattern as: space, the i-th single quote, space. */
2474 g_string_assign(pattern," ");
2475 g_string_append_unichar(pattern,single_quotes[i]);
2476 g_string_append_c(pattern,' ');
2478 while ((t=strstr(s,pattern->str)))
2480 if (pswit[ECHO_SWITCH])
2481 g_print("\n%s\n",aline);
2482 if (!pswit[OVERVIEW_SWITCH])
2483 g_print(" Line %ld column %ld - Spaced singlequote?\n",
2484 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2487 s=g_utf8_next_char(g_utf8_next_char(t));
/* TRUE also releases the character data held by the GString. */
2490 g_string_free(pattern,TRUE);
2494 * check_for_miscased_genative:
2496 * Check special case of 'S instead of 's at end of word.
/*
 * Query an apostrophe that follows a lower-case letter and is followed
 * by a capital S.  The scan starts at the second character of the line.
 * NOTE(review): the column is the offset of s plus 2 - presumably the
 * position of the 'S' after the apostrophe; confirm.
 */
2498 void check_for_miscased_genative(const char *aline)
2504 c=g_utf8_get_char(aline);
2505 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2506 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2510 nc=g_utf8_get_char(g_utf8_next_char(s));
2511 if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))
2513 if (pswit[ECHO_SWITCH])
2514 g_print("\n%s\n",aline);
2515 if (!pswit[OVERVIEW_SWITCH])
2516 g_print(" Line %ld column %ld - Capital \"S\"?\n",
2517 linecnt,g_utf8_pointer_to_offset(aline,s)+2);
2525 * check_end_of_line:
2527 * Now check special cases - start and end of line -
2528 * for single and double quotes. Start is sometimes [sic]
2529 * but better to query it anyway.
2530 * While we're here, check for dash at end of line.
/*
 * Check the two ends of a line that is longer than one character: a
 * double or single quote as the last character with a space before it,
 * and a single quote as the first character with a space after it, are
 * both queried as "Spaced quote?".  With PARANOID_SWITCH and
 * warnings->hyphen set, the line is scanned backwards over trailing
 * characters at or below CHAR_SPACE, and a single '-' found there (not
 * preceded by another '-') is queried as a hyphen at end of line.
 */
2532 void check_end_of_line(const char *aline,struct warnings *warnings)
/* lbytes is the byte length; character counts come from g_utf8_strlen(). */
2537 lbytes=strlen(aline);
2538 if (g_utf8_strlen(aline,lbytes)>1)
2540 s=g_utf8_prev_char(aline+lbytes);
2541 c1=g_utf8_get_char(s);
2542 c2=g_utf8_get_char(g_utf8_prev_char(s));
2543 if ((CHAR_IS_DQUOTE(c1) || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)
2545 if (pswit[ECHO_SWITCH])
2546 g_print("\n%s\n",aline);
2547 if (!pswit[OVERVIEW_SWITCH])
2548 g_print(" Line %ld column %ld - Spaced quote?\n",linecnt,
2549 g_utf8_strlen(aline,lbytes));
2553 c1=g_utf8_get_char(aline);
2554 c2=g_utf8_get_char(g_utf8_next_char(aline));
2555 if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)
2557 if (pswit[ECHO_SWITCH])
2558 g_print("\n%s\n",aline);
2559 if (!pswit[OVERVIEW_SWITCH])
2560 g_print(" Line %ld column 1 - Spaced quote?\n",linecnt);
2565 * Dash at end of line may well be legit - paranoid mode only
2566 * and don't report em-dash at line-end.
2568 if (pswit[PARANOID_SWITCH] && warnings->hyphen)
/* Step back from the last character while it is at or below CHAR_SPACE. */
2570 for (s=g_utf8_prev_char(aline+lbytes);
2571 s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
2573 if (g_utf8_get_char(s)=='-' &&
2574 g_utf8_get_char(g_utf8_prev_char(s))!='-')
2576 if (pswit[ECHO_SWITCH])
2577 g_print("\n%s\n",aline);
2578 if (!pswit[OVERVIEW_SWITCH])
2579 g_print(" Line %ld column %ld - "
2580 "Hyphen at end of line?\n",
2581 linecnt,g_utf8_pointer_to_offset(aline,s));
2588 * check_for_unspaced_bracket:
2590 * Brackets are often unspaced, but shouldn't be surrounded by alpha.
2591 * If so, suspect a scanno like "a]most".
/*
 * Query any bracket character from {[()]} that has a letter immediately
 * before it and a letter immediately after it.  The scan starts at the
 * second character and stops once there is no following character.
 * NOTE(review): the reported column is the raw character offset of s
 * with no +1 added - confirm whether that is intended.
 */
2593 void check_for_unspaced_bracket(const char *aline)
2597 c=g_utf8_get_char(aline);
2598 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2599 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2603 nc=g_utf8_get_char(g_utf8_next_char(s));
2606 /* for each bracket character in the line except 1st & last */
2607 if (g_utf8_strchr("{[()]}",-1,c) &&
2608 g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
2610 if (pswit[ECHO_SWITCH])
2611 g_print("\n%s\n",aline);
2612 if (!pswit[OVERVIEW_SWITCH])
2613 g_print(" Line %ld column %ld - Unspaced bracket?\n",
2614 linecnt,g_utf8_pointer_to_offset(aline,s));
2622 * check_for_unpunctuated_endquote:
/*
 * Query a closing or neutral double quote that directly follows a letter,
 * which suggests the punctuation before the quote was dropped.
 *
 * aline is the NUL-terminated UTF-8 text of the line under test. The scan
 * starts at the second character and runs while the look-ahead character nc
 * is not NUL.
 *
 * qc is the quote class of c when c is a double quote and INVALID_QUOTE
 * otherwise, so only double quotes can trigger the query. The column
 * printed is the character offset of s within aline.
 *
 * NOTE(review): the statements that shift nc into c and c into pc are not
 * among the lines visible here; presumably pc is the character before the
 * one at s - confirm against the full source.
 *
 * Nothing is returned; output goes through g_print() against the global
 * linecnt, honouring ECHO_SWITCH and OVERVIEW_SWITCH.
 */
2624 void check_for_unpunctuated_endquote(const char *aline)
/* Prime the window: c is the first character, nc the second (0 if none). */
2629 c=g_utf8_get_char(aline);
2630 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2631 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2635 qc=CHAR_IS_DQUOTE(c)?QUOTE_CLASS(c):INVALID_QUOTE;
2636 nc=g_utf8_get_char(g_utf8_next_char(s));
2637 /* for each character in the line except 1st */
2638 if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) && g_unichar_isalpha(pc))
2640 if (pswit[ECHO_SWITCH])
2641 g_print("\n%s\n",aline);
2642 if (!pswit[OVERVIEW_SWITCH])
2643 g_print(" Line %ld column %ld - "
2644 "endquote missing punctuation?\n",
2645 linecnt,g_utf8_pointer_to_offset(aline,s));
2653 * check_for_html_tag:
2655 * Check for <HTML TAG>.
2657 * If there is a < in the line, followed at some point
2658 * by a > then we suspect HTML.
/*
 * Query text that looks like an HTML tag.
 *
 * aline is the NUL-terminated UTF-8 text of the line under test. Only the
 * first less-than sign in the line is considered, paired with the first
 * greater-than sign found after it.
 *
 * The text reported is a copy of the line from the opening sign through the
 * closing sign inclusive, and the column is the 1-based character position
 * of the opening sign.
 *
 * Nothing is returned; output goes through g_print() against the global
 * linecnt, honouring ECHO_SWITCH and OVERVIEW_SWITCH.
 */
2660 void check_for_html_tag(const char *aline)
2662 const char *open,*close;
2664 open=strchr(aline,'<');
/* Search for the closing sign from the character after the opening one. */
2667 close=strchr(g_utf8_next_char(open),'>');
2670 if (pswit[ECHO_SWITCH])
2671 g_print("\n%s\n",aline);
2672 if (!pswit[OVERVIEW_SWITCH])
/* close-open+1 bytes covers both delimiters. */
2674 tag=g_strndup(open,close-open+1);
2675 g_print(" Line %ld column %ld - HTML Tag? %s \n",
2676 linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
2686 * check_for_html_entity:
2688 * Check for &symbol; HTML.
2690 * If there is a & in the line, followed at
2691 * some point by a ; then we suspect HTML.
/*
 * Query text that looks like an HTML entity.
 *
 * aline is the NUL-terminated UTF-8 text of the line under test. Only the
 * first ampersand in the line is considered, paired with the first
 * semicolon found at or after it. The span between them is scanned for a
 * space so that ordinary prose is not reported.
 *
 * The text reported is a copy of the line from the ampersand through the
 * semicolon inclusive. The column is the 1-based character position of the
 * ampersand, computed with g_utf8_pointer_to_offset() in the same way as
 * check_for_html_tag(), so that multi-byte characters ahead of the
 * ampersand do not inflate it as a byte distance would.
 *
 * Nothing is returned; output goes through g_print() against the global
 * linecnt, honouring ECHO_SWITCH and OVERVIEW_SWITCH.
 */
2693 void check_for_html_entity(const char *aline)
2695 const char *s,*amp,*scolon;
2697 amp=strchr(aline,'&');
2700 scolon=strchr(amp,';');
2703 for (s=amp;s<scolon;s=g_utf8_next_char(s))
2704 if (g_utf8_get_char(s)==CHAR_SPACE)
2705 break; /* Don't report "Jones & Son;" */
2708 if (pswit[ECHO_SWITCH])
2709 g_print("\n%s\n",aline);
2710 if (!pswit[OVERVIEW_SWITCH])
/* scolon-amp+1 bytes covers both delimiters. */
2712 entity=g_strndup(amp,scolon-amp+1);
2713 g_print(" Line %ld column %ld - HTML symbol? %s \n",
2714 linecnt,g_utf8_pointer_to_offset(aline,amp)+1,entity);
2725 * check_for_omitted_punctuation:
2727 * Check for omitted punctuation at end of paragraph by working back
2728 * through prevline. DW.
2729 * Need to check this only for "normal" paras.
2730 * So what is a "normal" para?
2731 * Not normal if one-liner (chapter headings, etc.)
2732 * Not normal if doesn't contain at least one locase letter
2733 * Not normal if starts with space
/*
 * Query a paragraph whose final line ends in a letter rather than in
 * punctuation.
 *
 * prevline is the NUL-terminated UTF-8 text of the line to examine. last
 * supplies blen, which must exceed 2. start_para_line is compared with
 * linecnt-1 so that one-line paragraphs are left alone. The first character
 * of prevline must also be greater than CHAR_SPACE, and the line must
 * contain at least one alphabetic character.
 *
 * When those conditions hold, the line is walked backwards from its end:
 * first over characters whose quote class is closing or neutral, then
 * character by character until either a letter is met, which raises the
 * query, or one of the accepted punctuation characters is met.
 *
 * The query is attributed to line linecnt-1 and the column printed is the
 * length of prevline in characters. Nothing is returned; output goes
 * through g_print(), honouring ECHO_SWITCH and OVERVIEW_SWITCH.
 */
2735 void check_for_omitted_punctuation(const char *prevline,
2736 struct line_properties *last,int start_para_line)
2738 gboolean letter_on_line=FALSE;
2741 gboolean closing_quote;
/* Does the line contain any letter at all? */
2742 for (s=prevline;*s;s=g_utf8_next_char(s))
2743 if (g_unichar_isalpha(g_utf8_get_char(s)))
2745 letter_on_line=TRUE;
2749 * This next "if" is a problem.
2750 * If we say "start_para_line <= linecnt - 1", that includes
2751 * one-line "paragraphs" like chapter heads. Lotsa false positives.
2752 * If we say "start_para_line < linecnt - 1" it doesn't, but then it
2753 * misses genuine one-line paragraphs.
2755 if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
2756 g_utf8_get_char(prevline)>CHAR_SPACE)
2758 s=prevline+strlen(prevline);
2761 s=g_utf8_prev_char(s);
2762 c=g_utf8_get_char(s);
2763 if (QUOTE_CLASS(c)==CLOSING_QUOTE || QUOTE_CLASS(c)==NEUTRAL_QUOTE)
2766 closing_quote=FALSE;
2767 } while (closing_quote && s>prevline);
2768 for (;s>prevline;s=g_utf8_prev_char(s))
2770 if (g_unichar_isalpha(g_utf8_get_char(s)))
2772 if (pswit[ECHO_SWITCH])
2773 g_print("\n%s\n",prevline);
2774 if (!pswit[OVERVIEW_SWITCH])
2775 g_print(" Line %ld column %ld - "
2776 "No punctuation at para end?\n",
2777 linecnt-1,g_utf8_strlen(prevline,-1));
2782 if (g_utf8_strchr("—-.:!([{?}])",-1,g_utf8_get_char(s)))
/*
 * Per-node callback that prints a note about a queried word that was seen
 * more than once.
 *
 * The signature is the one g_tree_foreach() expects, and procfile() passes
 * this function to g_tree_foreach() over the qword tree. key is the queried
 * word as a C string.
 *
 * NOTE(review): the use of value, the condition guarding the message and
 * the return value are not among the lines visible here.
 */
2788 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
2790 const char *word=key;
2793 g_print("\nNote: Queried word %s was duplicated %d times\n",
/*
 * Write a UTF-8 string to stdout re-encoded as WINDOWS-1252.
 *
 * The iconv converter is kept in a function-local static so that it is
 * opened once, on first use, and reused afterwards. (GIConv)-1 marks the
 * converter as not open.
 *
 * The function also has a teardown path that closes the converter and
 * resets it to (GIConv)-1. procfile() calls this function with NULL at the
 * end of a file.
 * NOTE(review): the condition selecting the teardown path is not among the
 * lines visible here; presumably it is a NULL string - confirm.
 *
 * The output buffer is sized at strlen(string)+1 bytes, the same number of
 * bytes as the input plus one.
 * NOTE(review): the result of g_iconv() is not checked in the lines visible
 * here, and the final fputs() of the unconverted string is presumably the
 * fallback used when no converter could be opened - confirm.
 */
2798 void print_as_windows_1252(const char *string)
2800 gsize inbytes,outbytes;
2802 static GIConv converter=(GIConv)-1;
2805 if (converter!=(GIConv)-1)
2806 g_iconv_close(converter);
2807 converter=(GIConv)-1;
/* Open the converter lazily; it stays open across calls. */
2810 if (converter==(GIConv)-1)
2811 converter=g_iconv_open("WINDOWS-1252","UTF-8");
2812 if (converter!=(GIConv)-1)
2814 inbytes=outbytes=strlen(string);
2815 bp=buf=g_malloc(outbytes+1);
2816 g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
2822 fputs(string,stdout);
/*
 * Write string to stdout unchanged, using fputs(). No re-encoding is done
 * and no newline is appended.
 */
2825 void print_as_utf_8(const char *string)
2827 fputs(string,stdout);
/*
 * Process one e-text file: read it, run the first pass, then run every
 * per-line check over the body of the text and print the queries.
 *
 * filename is the path handed to read_etext() and echoed in the messages.
 *
 * Outline, as far as the visible lines show:
 *  - read the whole text with read_etext(); on failure the error message
 *    is written to stdout when STDOUT_SWITCH is set and to stderr otherwise;
 *  - run first_pass() and report_first_pass(), which yield the header and
 *    footer line numbers and the set of enabled warnings;
 *  - create the qword and qperiod trees, keyed by strings compared with
 *    strcmp();
 *  - fetch lines with flgets(), skipping DP page separators under
 *    DP_SWITCH and skipping everything before firstline or after
 *    footerline (selected header fields are printed under HEADER_SWITCH);
 *  - track paragraph starts, query a lower-case paragraph start and a
 *    broken em-dash, then call the individual check_for_* routines;
 *  - at the end, flush pending quote queries, report duplicated queried
 *    words, release the trees and counters, and reset the print handler
 *    and the WINDOWS-1252 converter.
 *
 * Nothing is returned. The globals linecnt and checked_linecnt are reset
 * to zero here.
 */
2835 void procfile(const char *filename)
2838 gchar *parastart=NULL; /* first line of current para */
2839 gchar *etext,*aline;
2842 struct first_pass_results *first_pass_results;
2843 struct warnings *warnings;
2844 struct counters counters={0};
2845 struct line_properties last={0};
2846 struct parities parities={0};
2847 struct pending pending={0};
2848 gboolean isemptyline;
2849 long start_para_line=0;
2850 gboolean isnewpara=FALSE,enddash=FALSE;
2851 last.start=CHAR_SPACE;
2852 linecnt=checked_linecnt=0;
2853 etext=read_etext(filename,&err);
2856 if (pswit[STDOUT_SWITCH])
2857 fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
2859 fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
2862 g_print("\n\nFile: %s\n\n",filename);
2863 first_pass_results=first_pass(etext);
2864 warnings=report_first_pass(first_pass_results);
/* qword frees both keys and values with g_free; qperiod frees keys only. */
2865 qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
2866 qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
2868 * Here we go with the main pass. Hold onto yer hat!
2872 while ((aline=flgets(&etext_ptr,linecnt+1)))
2877 if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
2878 continue; // skip DP page separators completely
2879 if (linecnt<first_pass_results->firstline ||
2880 (first_pass_results->footerline>0 &&
2881 linecnt>first_pass_results->footerline))
2883 if (pswit[HEADER_SWITCH])
2885 if (g_str_has_prefix(aline,"Title:"))
2886 g_print(" %s\n",aline);
2887 if (g_str_has_prefix(aline,"Author:"))
2888 g_print(" %s\n",aline);
2889 if (g_str_has_prefix(aline,"Release Date:"))
2890 g_print(" %s\n",aline);
2891 if (g_str_has_prefix(aline,"Edition:"))
2892 g_print(" %s\n\n",aline);
2894 continue; /* skip through the header */
2897 print_pending(aline,parastart,&pending);
2898 isemptyline=analyse_quotes(aline,&counters);
2899 if (isnewpara && !isemptyline)
2901 /* This line is the start of a new paragraph. */
2902 start_para_line=linecnt;
2903 /* Capture its first line in case we want to report it later. */
2905 parastart=g_strdup(aline);
2906 memset(&parities,0,sizeof(parities)); /* restart the quote count */
/* Skip to the first letter or digit of the paragraph. */
2908 while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
2909 !g_unichar_isdigit(g_utf8_get_char(s)))
2910 s=g_utf8_next_char(s);
2911 if (g_unichar_islower(g_utf8_get_char(s)))
2913 /* and its first letter is lowercase */
2914 if (pswit[ECHO_SWITCH])
2915 g_print("\n%s\n",aline);
2916 if (!pswit[OVERVIEW_SWITCH])
2917 g_print(" Line %ld column %ld - "
2918 "Paragraph starts with lower-case\n",
2919 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2923 isnewpara=FALSE; /* Signal the end of new para processing. */
2925 /* Check for an em-dash broken at line end. */
2926 if (enddash && g_utf8_get_char(aline)=='-')
2928 if (pswit[ECHO_SWITCH])
2929 g_print("\n%s\n",aline);
2930 if (!pswit[OVERVIEW_SWITCH])
2931 g_print(" Line %ld column 1 - Broken em-dash?\n",linecnt);
/*
 * Step back over trailing spaces to find the last significant character.
 * NOTE(review): the character at s is read before s>aline is tested, and
 * s starts one character before the terminator; worth confirming that an
 * empty aline cannot reach this loop.
 */
2936 for (s=g_utf8_prev_char(aline+strlen(aline));
2937 g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
2939 if (s>=aline && g_utf8_get_char(s)=='-')
2941 check_for_control_characters(aline);
2943 check_for_odd_characters(aline,warnings,isemptyline);
2944 if (warnings->longline)
2945 check_for_long_line(aline);
2946 if (warnings->shortline)
2947 check_for_short_line(aline,&last);
/* Remember this line's length and first character for later lines. */
2949 last.len=g_utf8_strlen(aline,-1);
2950 last.start=g_utf8_get_char(aline);
2951 check_for_starting_punctuation(aline);
2954 check_for_spaced_emdash(aline);
2955 check_for_spaced_dash(aline);
2957 check_for_unmarked_paragraphs(aline);
2958 check_for_jeebies(aline);
2959 check_for_mta_from(aline);
2960 check_for_orphan_character(aline);
2961 check_for_pling_scanno(aline);
2962 check_for_extra_period(aline,warnings);
2963 check_for_following_punctuation(aline);
2964 check_for_typos(aline,warnings);
2965 check_for_misspaced_punctuation(aline,&parities,isemptyline);
2966 check_for_double_punctuation(aline,warnings);
2967 check_for_spaced_quotes(aline);
2968 check_for_miscased_genative(aline);
2969 check_end_of_line(aline,warnings);
2970 check_for_unspaced_bracket(aline);
2971 if (warnings->endquote)
2972 check_for_unpunctuated_endquote(aline);
2973 check_for_html_tag(aline);
2974 check_for_html_entity(aline);
2977 check_for_mismatched_quotes(&counters,&pending);
2978 counters_reset(&counters);
2979 /* let the next iteration know that it's starting a new para */
2982 check_for_omitted_punctuation(prevline,&last,start_para_line);
2985 prevline=g_strdup(aline);
2988 check_for_mismatched_quotes(&counters,&pending);
2989 print_pending(NULL,parastart,&pending);
2990 reset_pending(&pending);
2999 if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])
3000 g_tree_foreach(qword,report_duplicate_queries,NULL);
3001 g_tree_unref(qword);
3002 g_tree_unref(qperiod);
3003 counters_destroy(&counters);
3004 g_set_print_handler(NULL);
3005 print_as_windows_1252(NULL);
3006 if (pswit[MARKUP_SWITCH])
3013 * Get one line from the input text, checking for
3014 * the existence of exactly one CR/LF line-end per line.
3016 * Returns: a pointer to the line.
/*
 * Fetch the next line from the in-memory text and query irregular line
 * endings.
 *
 * etext points at the read cursor, which is advanced as characters are
 * consumed. The line begins at the position the cursor had on entry.
 * lcnt is the line number quoted in any line-end message.
 *
 * Under LINE_END_SWITCH the following are queried: a line with no LF, a LF
 * with no preceding CR, two successive CRs, and a CR with no LF. Each
 * message honours ECHO_SWITCH, which prints a copy of the line text up to
 * eos, and OVERVIEW_SWITCH, which suppresses the message itself.
 *
 * Before the line is handed back it is passed through
 * postprocess_for_HTML() under MARKUP_SWITCH and through
 * postprocess_for_DP() under DP_SWITCH; both edit the line in place.
 *
 * NOTE(review): the loop structure, the way eos and isCR are maintained
 * and the return statements are not among the lines visible here.
 * procfile() treats a null result as end of input.
 */
3018 char *flgets(char **etext,long lcnt)
3021 gboolean isCR=FALSE;
3022 char *theline=*etext;
3027 c=g_utf8_get_char(*etext);
3030 if (*etext==theline)
3032 else if (pswit[LINE_END_SWITCH])
3034 if (pswit[ECHO_SWITCH])
3036 s=g_strndup(theline,eos-theline);
3037 g_print("\n%s\n",s);
3040 if (!pswit[OVERVIEW_SWITCH])
3041 /* There may, or may not, have been a CR */
3042 g_print(" Line %ld - No LF?\n",lcnt);
3048 *etext=g_utf8_next_char(*etext);
3049 /* either way, it's end of line */
3056 /* Error - a LF without a preceding CR */
3057 if (pswit[LINE_END_SWITCH])
3059 if (pswit[ECHO_SWITCH])
3061 s=g_strndup(theline,eos-theline);
3062 g_print("\n%s\n",s);
3065 if (!pswit[OVERVIEW_SWITCH])
3066 g_print(" Line %ld - No CR?\n",lcnt);
3077 /* Error - two successive CRs */
3078 if (pswit[LINE_END_SWITCH])
3080 if (pswit[ECHO_SWITCH])
3082 s=g_strndup(theline,eos-theline);
3083 g_print("\n%s\n",s);
3086 if (!pswit[OVERVIEW_SWITCH])
3087 g_print(" Line %ld - Two successive CRs?\n",lcnt);
3096 if (pswit[LINE_END_SWITCH] && isCR)
3098 if (pswit[ECHO_SWITCH])
3100 s=g_strndup(theline,eos-theline);
3101 g_print("\n%s\n",s);
3104 if (!pswit[OVERVIEW_SWITCH])
3105 g_print(" Line %ld column %ld - CR without LF?\n",
3106 lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
3112 eos=g_utf8_next_char(eos);
3116 if (pswit[MARKUP_SWITCH])
3117 postprocess_for_HTML(theline);
3118 if (pswit[DP_SWITCH])
3119 postprocess_for_DP(theline);
3126 * Takes a "word" as a parameter, and checks whether it
3127 * contains a mixture of alpha and digits. Generally, this is an
3128 * error, but may not be for cases like 4th or L5 12s. 3d.
3130 * Returns: TRUE iff an error is found.
/*
 * Decide whether a word that mixes letters and digits should be queried.
 *
 * checkword is a NUL-terminated UTF-8 word. It is first scanned for the
 * presence of at least one alphabetic character and at least one digit.
 * When both are present, the common legitimate forms are excluded:
 *  - leading digits followed by st, rd, nd or th in either case, the same
 *    with a trailing s, and the same with a trailing ly;
 *  - leading digits followed by l in either case, or by a lower-case s or
 *    d (these two are compared with strcmp, so they are case-sensitive);
 *  - a capital L followed by text that is not itself mixed, tested by a
 *    recursive call on the remainder of the word.
 *
 * nondigit is advanced past the leading run of digits and is what the
 * suffix comparisons are made against.
 *
 * NOTE(review): the statements that set the flags and query, and the
 * return statement, are not among the lines visible here.
 */
3132 gboolean mixdigit(const char *checkword)
3134 gboolean wehaveadigit,wehavealetter,query;
3135 const char *s,*nondigit;
3136 wehaveadigit=wehavealetter=query=FALSE;
3137 for (s=checkword;*s;s=g_utf8_next_char(s))
3138 if (g_unichar_isalpha(g_utf8_get_char(s)))
3140 else if (g_unichar_isdigit(g_utf8_get_char(s)))
3142 if (wehaveadigit && wehavealetter)
3144 /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
3146 for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
3147 nondigit=g_utf8_next_char(nondigit))
3149 /* digits, ending in st, rd, nd, th of either case */
3150 if (!g_ascii_strcasecmp(nondigit,"st") ||
3151 !g_ascii_strcasecmp(nondigit,"rd") ||
3152 !g_ascii_strcasecmp(nondigit,"nd") ||
3153 !g_ascii_strcasecmp(nondigit,"th"))
3155 if (!g_ascii_strcasecmp(nondigit,"sts") ||
3156 !g_ascii_strcasecmp(nondigit,"rds") ||
3157 !g_ascii_strcasecmp(nondigit,"nds") ||
3158 !g_ascii_strcasecmp(nondigit,"ths"))
3160 if (!g_ascii_strcasecmp(nondigit,"stly") ||
3161 !g_ascii_strcasecmp(nondigit,"rdly") ||
3162 !g_ascii_strcasecmp(nondigit,"ndly") ||
3163 !g_ascii_strcasecmp(nondigit,"thly"))
3165 /* digits, ending in l, L, s or d */
3166 if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
3167 !strcmp(nondigit,"d"))
3170 * L at the start of a number, representing Britsh pounds, like L500.
3171 * This is cute. We know the current word is mixed digit. If the first
3172 * letter is L, there must be at least one digit following. If both
3173 * digits and letters follow, we have a genuine error, else we have a
3174 * capital L followed by digits, and we accept that as a non-error.
3176 if (g_utf8_get_char(checkword)=='L' &&
3177 !mixdigit(g_utf8_next_char(checkword)))
3186 * Extracts the first/next "word" from the line, and returns it.
3187 * A word is defined as one English word unit--or at least that's the aim.
3188 * "ptr" is advanced to the position in the line where we will start
3189 * looking for the next word.
3191 * Returns: A newly-allocated string.
/*
 * Extract the next word from a line and advance the caller's cursor.
 *
 * ptr points at the read cursor into a NUL-terminated UTF-8 line. Leading
 * characters that are neither digits nor letters are skipped first.
 *
 * A look-ahead then gathers a run of digits, letters, commas and periods.
 * If that run contains a comma or period directly preceded by a digit, it
 * is treated as a punctuated number and can be returned as one word.
 * Otherwise the gathered text is discarded and the word is rebuilt from
 * digits, letters and apostrophes only, advancing *ptr as it goes.
 *
 * Returns a newly allocated string obtained from g_string_free(word,FALSE);
 * ownership passes to the caller.
 *
 * NOTE(review): the remaining conditions on the punctuated-number path and
 * the update of *ptr on that path are not among the lines visible here.
 */
3193 gchar *getaword(const char **ptr)
3198 word=g_string_new(NULL);
/* Skip anything that cannot start a word, stopping at the terminator. */
3199 for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
3200 !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
3201 **ptr;*ptr=g_utf8_next_char(*ptr))
3204 * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
3205 * Especially yucky is the case of L1,000
3206 * This section looks for a pattern of characters including a digit
3207 * followed by a comma or period followed by one or more digits.
3208 * If found, it returns this whole pattern as a word; otherwise we discard
3209 * the results and resume our normal programming.
3212 for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
3213 g_unichar_isalpha(g_utf8_get_char(s)) ||
3214 g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
3215 g_string_append_unichar(word,g_utf8_get_char(s));
/* Start at the second character so that a previous character exists. */
3218 for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))
3220 c=g_utf8_get_char(t);
3221 pc=g_utf8_get_char(g_utf8_prev_char(t));
3222 if ((c=='.' || c==',') && g_unichar_isdigit(pc))
3225 return g_string_free(word,FALSE);
3229 /* we didn't find a punctuated number - do the regular getword thing */
3230 g_string_truncate(word,0);
3231 c=g_utf8_get_char(*ptr);
3232 for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);
3233 *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))
3234 g_string_append_unichar(word,c);
3235 return g_string_free(word,FALSE);
3241 * Is this word a Roman Numeral?
3243 * It doesn't actually validate that the number is a valid Roman Numeral--for
3244 * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
3245 * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
3246 * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
3247 * expressions thereof, except when it came to taxes. Allow any number of M,
3248 * an optional D, an optional CM or CD, any number of optional Cs, an optional
3249 * XL or an optional XC, an optional IX or IV, an optional V and any number
/*
 * Test whether a word looks like a Roman numeral.
 *
 * t is a NUL-terminated UTF-8 word. The visible tests examine it, in this
 * order, for: any run of m, a d, the prefix cm, the prefix cd, any run of
 * c, the prefix xl, the prefix xc, an l, any run of x, the prefix ix, the
 * prefix iv, a v, and any run of i.
 *
 * NOTE(review): the statements that advance t after each match and the
 * final return are not among the lines visible here. All comparisons are
 * against lower-case letters, so the caller presumably passes case-folded
 * text - confirm.
 */
3252 gboolean isroman(const char *t)
3258 while (g_utf8_get_char(t)=='m' && *t)
3260 if (g_utf8_get_char(t)=='d')
3262 if (g_str_has_prefix(t,"cm"))
3264 if (g_str_has_prefix(t,"cd"))
3266 while (g_utf8_get_char(t)=='c' && *t)
3268 if (g_str_has_prefix(t,"xl"))
3270 if (g_str_has_prefix(t,"xc"))
3272 if (g_utf8_get_char(t)=='l')
3274 while (g_utf8_get_char(t)=='x' && *t)
3276 if (g_str_has_prefix(t,"ix"))
3278 if (g_str_has_prefix(t,"iv"))
3280 if (g_utf8_get_char(t)=='v')
3282 while (g_utf8_get_char(t)=='i' && *t)
3288 * postprocess_for_DP:
3290 * Invoked with the -d switch from flgets().
3291 * It simply "removes" from the line a hard-coded set of common
3292 * DP-specific tags, so that the line passed to the main routine has
3293 * been pre-cleaned of DP markup.
/*
 * Remove every occurrence of each DPmarkup entry from a line, in place.
 *
 * theline is a writable NUL-terminated string. The DPmarkup table is
 * walked until an entry whose first byte is NUL. For each entry, every
 * match found by strstr() is cut out by moving the tail of the line,
 * terminator included, down over it.
 */
3295 void postprocess_for_DP(char *theline)
3301 for (i=0;*DPmarkup[i];i++)
3302 while ((s=strstr(theline,DPmarkup[i])))
/* t is the first byte after the match; strlen(t)+1 keeps the NUL. */
3304 t=s+strlen(DPmarkup[i]);
3305 memmove(s,t,strlen(t)+1);
3310 * postprocess_for_HTML:
3312 * Invoked with the -m switch from flgets().
3313 * It simply "removes" from the line a hard-coded set of common
3314 * HTML tags and "replaces" a hard-coded set of common HTML
3315 * entities, so that the line passed to the main routine has
3316 * been pre-cleaned of HTML.
/*
 * Clean HTML out of a line, in place.
 *
 * theline is a writable NUL-terminated string. losemarkup() is called
 * repeatedly for as long as it returns a non-null pointer, and then
 * loseentities() is called once on the result.
 */
3318 void postprocess_for_HTML(char *theline)
3320 while (losemarkup(theline))
3322 loseentities(theline);
/*
 * Remove one recognised HTML tag from a line, in place.
 *
 * theline is a writable NUL-terminated string. The first less-than sign
 * and the first greater-than sign at or after it delimit the candidate
 * tag. The text after the less-than sign is compared by tagcomp() with
 * each entry of the markup table, which is walked until an entry whose
 * first byte is NUL.
 *
 * On a match, the tail of the line after the greater-than sign,
 * terminator included, is moved down over the tag.
 *
 * NOTE(review): the return statements are not among the lines visible
 * here. postprocess_for_HTML() keeps calling this function while the
 * result is non-null.
 */
3325 char *losemarkup(char *theline)
3329 s=strchr(theline,'<');
3330 t=s?strchr(s,'>'):NULL;
3333 for (i=0;*markup[i];i++)
3334 if (tagcomp(g_utf8_next_char(s),markup[i]))
/* Step past the closing sign so that it is removed as well. */
3336 t=g_utf8_next_char(t);
3337 memmove(s,t,strlen(t)+1);
3340 /* It's an unrecognized <xxx>. */
/*
 * Replace HTML entities in a line with the characters they stand for, in
 * place.
 *
 * theline is a writable NUL-terminated UTF-8 string. The function also has
 * a teardown path that destroys the lookup tree and closes both iconv
 * converters.
 * NOTE(review): the condition selecting the teardown path is not among the
 * lines visible here; presumably it is a NULL theline - confirm.
 *
 * State kept across calls:
 *  - entities maps each HTMLentities name to its code point. It is static
 *    so that the tree is built once and is still reachable when the
 *    teardown path destroys it. As an automatic variable it could not
 *    outlive a single call, so the tree built by g_tree_new() was lost on
 *    return and teardown had nothing to destroy.
 *  - translit converts UTF-8 to ISO_8859-1 with transliteration, and
 *    to_utf8 converts ISO_8859-1 back to UTF-8. (GIConv)-1 marks a
 *    converter as not open.
 *
 * For each ampersand with a following semicolon, the entity is decoded
 * either as a decimal or hexadecimal number with strtol() or by name
 * through the tree. A code point below 128 or in the range 192 to 255 is
 * written directly as UTF-8. Any other code point is passed through the
 * two converters first. The tail of the line after the semicolon,
 * terminator included, is then moved down behind the replacement.
 */
3344 void loseentities(char *theline)
3351 static GTree *entities=NULL;
3352 static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
3356 g_tree_destroy(entities);
3358 if (translit!=(GIConv)-1)
3359 g_iconv_close(translit);
3360 translit=(GIConv)-1;
3361 if (to_utf8!=(GIConv)-1)
3362 g_iconv_close(to_utf8);
3370 entities=g_tree_new((GCompareFunc)strcmp);
3371 for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
3372 g_tree_insert(entities,HTMLentities[i].name,
3373 GUINT_TO_POINTER(HTMLentities[i].c));
/* Open the converters lazily; they stay open across calls. */
3375 if (translit==(GIConv)-1)
3376 translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
3377 if (to_utf8==(GIConv)-1)
3378 to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
3379 while((amp=strchr(theline,'&')))
3381 scolon=strchr(amp,';');
/* Numeric entity: every character up to the semicolon must be a digit. */
3386 if (amp+2+strspn(amp+2,"0123456789")==scolon)
3387 c=strtol(amp+2,NULL,10);
3388 else if (amp[2]=='x' &&
3389 amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
3390 c=strtol(amp+3,NULL,16);
/* Named entity: look up the text between the ampersand and semicolon. */
3394 s=g_strndup(amp+1,scolon-(amp+1));
3395 c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
3404 if (c<128 || (c>=192 && c<=255)) /* An ISO-8859-1 character */
3405 theline+=g_unichar_to_utf8(c,theline);
3409 nb=g_unichar_to_utf8(c,s);
3410 t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
3412 s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
3414 memcpy(theline,s,nb);
3418 memmove(theline,g_utf8_next_char(scolon),
3419 strlen(g_utf8_next_char(scolon))+1);
3422 theline=g_utf8_next_char(amp);
/*
 * Case-insensitive test of whether a tag begins with a given base tag.
 *
 * strin is the text that follows the opening sign of a tag; a leading
 * slash is skipped so that closing tags compare like opening ones.
 * basetag is the tag name to look for. Both are case-folded with
 * g_utf8_casefold() before the prefix test.
 *
 * The result of g_str_has_prefix() on the folded strings is stored in
 * retval.
 * NOTE(review): the release of the two folded copies and the return
 * statement are not among the lines visible here.
 */
3426 gboolean tagcomp(const char *strin,const char *basetag)
3430 if (g_utf8_get_char(strin)=='/')
3431 t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
3433 t=g_utf8_casefold(strin,-1);
3434 s=g_utf8_casefold(basetag,-1);
3435 retval=g_str_has_prefix(t,s);
/*
 * Print the program banner, the option help and a short description to
 * stderr.
 *
 * context is the GOptionContext from which the option help text is
 * generated with g_option_context_get_help(context,TRUE,NULL).
 *
 * The banner gives the version, the copyright holders and the warranty
 * and licence notices. The closing paragraph summarises what the program
 * queries. Nothing is returned.
 */
3441 void proghelp(GOptionContext *context)
3444 fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
3445 fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
3446 fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
3447 fputs("Bookloupe comes with ABSOLUTELY NO WARRANTY. "
3448 "For details, read the file COPYING.\n",stderr);
3449 fputs("This is Free Software; "
3450 "you may redistribute it under certain conditions (GPL);\n",stderr);
3451 fputs("read the file COPYING for details.\n\n",stderr);
3452 help=g_option_context_get_help(context,TRUE,NULL);
3455 fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
3456 fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
3457 "non-ASCII\n",stderr);
3458 fputs("characters like accented letters, "
3459 "lines longer than 75 or shorter than 55,\n",stderr);
3460 fputs("unbalanced quotes or brackets, "
3461 "a variety of badly formatted punctuation, \n",stderr);
3462 fputs("HTML tags, some likely typos. "
3463 "It is NOT a substitute for human judgement.\n",stderr);