1 /*************************************************************************/
2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com> */
5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk> */
7 /* This program is free software; you can redistribute it and/or modify */
8 /* it under the terms of the GNU General Public License as published by */
9 /* the Free Software Foundation; either version 2 of the License, or */
10 /* (at your option) any later version. */
12 /* This program is distributed in the hope that it will be useful, */
13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
15 /* GNU General Public License for more details. */
17 /* You should have received a copy of the GNU General Public License */
18 /* along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 /*************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "bookloupe.h"
#include "HTMLentities.h"
39 "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
40 "nad", "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa",
41 "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
42 "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
43 "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
44 "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
45 "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
46 "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
47 "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
48 "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
49 "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
50 "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
51 "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
52 "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
53 "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
54 "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
55 "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
56 "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
57 "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
58 "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
59 "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
60 "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
61 "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
62 "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
63 "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
64 "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
65 "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
66 "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
67 "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
73 /* Common abbreviations and other OK words not to query as typos. */
75 "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
76 "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
77 "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
78 "outbid", "outbids", "frostbite", "frostbitten", ""
81 /* Common abbreviations that cause otherwise unexplained periods. */
83 "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
84 "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
/*
 * Two-Letter combinations that rarely if ever start words,
 * but are common scannos or otherwise common letter combinations.
 */
92 "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
/*
 * Two-Letter combinations that rarely if ever end words,
 * but are common scannos or otherwise common letter combinations.
 */
100 "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
101 "sw", "gr", "sl", "cl", "iy", ""
105 "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
106 "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
107 "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
108 "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
112 "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
116 "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
117 "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
118 "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
119 "during", "let", "toward", "among", ""
123 "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
124 "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
125 "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
126 "among", "those", "into", "whom", "having", "thence", ""
129 gboolean pswit[SWITNO]; /* program switches */
131 gboolean typo_compat,paranoid_compat;
133 static GOptionEntry options[]={
134 { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
135 "Ignore DP-specific markup", NULL },
136 { "no-dp", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
137 G_OPTION_ARG_NONE, pswit+DP_SWITCH,
138 "Don't ignore DP-specific markup", NULL },
139 { "echo", 0, G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
140 "Echo queried line", NULL },
141 { "no-echo", 'e', G_OPTION_FLAG_REVERSE,
142 G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
143 "Don't echo queried line", NULL },
144 { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
145 "Check single quotes", NULL },
146 { "no-squote", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
147 G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
148 "Don't check single quotes", NULL },
149 { "typo", 0, 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
150 "Check common typos", NULL },
151 { "no-typo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
152 G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
153 "Don't check common typos", NULL },
154 { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
155 "Require closure of quotes on every paragraph", NULL },
156 { "no-qpara", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
157 G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
158 "Don't require closure of quotes on every paragraph", NULL },
159 { "paranoid", 0, G_OPTION_FLAG_HIDDEN,
160 G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
161 "Enable paranoid querying of everything", NULL },
162 { "no-paranoid", 0, G_OPTION_FLAG_REVERSE,
163 G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
164 "Disable paranoid querying of everything", NULL },
165 { "line-end", 0, G_OPTION_FLAG_HIDDEN,
166 G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
167 "Enable line end checking", NULL },
168 { "no-line-end", 'l', G_OPTION_FLAG_REVERSE,
169 G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
170 "Disable line end checking", NULL },
171 { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
172 "Overview: just show counts", NULL },
173 { "no-overview", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
174 G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
175 "Show individual warnings", NULL },
176 { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
177 "Output errors to stdout instead of stderr", NULL },
178 { "no-stdout", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
179 G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
180 "Output errors to stderr instead of stdout", NULL },
181 { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
182 "Echo header fields", NULL },
183 { "no-header", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
184 G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
185 "Don't echo header fields", NULL },
186 { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
187 "Ignore markup in < >", NULL },
188 { "no-markup", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
189 G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
190 "No special handling for markup in < >", NULL },
191 { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
192 "Use file of user-defined typos", NULL },
193 { "no-usertypo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
194 G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
195 "Ignore file of user-defined typos", NULL },
196 { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
197 "Verbose - list everything", NULL },
198 { "no-verbose", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
199 G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
200 "Switch off verbose mode", NULL },
205 * Options relating to configuration which make no sense from inside
206 * a configuration file.
209 static GOptionEntry config_options[]={
210 { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
211 "Defaults for use on www upload", NULL },
212 { "dump-config", 0, 0, G_OPTION_ARG_NONE, pswit+DUMP_CONFIG_SWITCH,
213 "Dump current config settings", NULL },
217 static GOptionEntry compatibility_options[]={
218 { "toggle-typo", 't', 0, G_OPTION_ARG_NONE, &typo_compat,
219 "Toggle checking for common typos", NULL },
220 { "toggle-relaxed", 'x', 0, G_OPTION_ARG_NONE, ¶noid_compat,
221 "Toggle both paranoid mode and common typos", NULL },
/* Query counts, by category, for overview mode. */
long cnt_quote;         /* quote queries */
long cnt_brack;         /* brackets queries */
long cnt_bin;           /* non-ASCII queries */
long cnt_odd;           /* odd character queries */
long cnt_long;          /* long line errors */
long cnt_short;         /* short line queries */
long cnt_punct;         /* punctuation and spacing queries */
long cnt_dash;          /* dash-related queries */
long cnt_word;          /* word queries */
long cnt_html;          /* html queries */
long cnt_lineend;       /* line-end queries */

/* Line counts. */
long cnt_spacend;       /* lines with space at end */
long linecnt;           /* total lines in the file */
long checked_linecnt;   /* lines actually checked */
241 void proghelp(GOptionContext *context);
242 void procfile(const char *);
246 gboolean mixdigit(const char *);
247 gchar *getaword(const char **);
248 char *flgets(char **,long,int);
249 void postprocess_for_HTML(char *);
250 char *linehasmarkup(char *);
251 char *losemarkup(char *);
252 gboolean tagcomp(const char *,const char *);
253 void loseentities(char *);
254 gboolean isroman(const char *);
255 void postprocess_for_DP(char *);
256 void print_as_windows_1252(const char *string);
257 void print_as_utf_8(const char *string);
259 GTree *qword,*qperiod;
267 void config_file_update(GKeyFile *kf)
271 for(i=0;options[i].long_name;i++)
273 if (g_str_has_prefix(options[i].long_name,"no-"))
275 if (options[i].arg==G_OPTION_ARG_NONE)
277 sw=*(gboolean *)options[i].arg_data;
278 if (options[i].flags&G_OPTION_FLAG_REVERSE)
280 g_key_file_set_boolean(kf,"options",options[i].long_name,sw);
283 g_assert_not_reached();
287 void config_file_add_comments(GKeyFile *kf)
291 g_key_file_set_comment(kf,NULL,NULL," Default configuration for bookloupe",
293 for(i=0;options[i].long_name;i++)
295 if (g_str_has_prefix(options[i].long_name,"no-"))
297 comment=g_strconcat(" ",options[i].description,NULL);
298 g_key_file_set_comment(kf,"options",options[i].long_name,comment,NULL);
303 void dump_config(void)
307 config_file_update(config);
310 config=g_key_file_new();
311 config_file_update(config);
312 config_file_add_comments(config);
314 s=g_key_file_to_data(config,NULL,NULL);
320 GKeyFile *read_config_file(gchar **full_path)
326 const char *search_path;
329 search_path=g_getenv("BOOKLOUPE_CONFIG_PATH");
333 search_dirs=g_strsplit(search_path,";",0);
335 search_dirs=g_strsplit(search_path,":",0);
340 search_dirs=g_new(gchar *,4);
341 search_dirs[0]=g_get_current_dir();
342 search_dirs[1]=g_strdup(running_from);
343 search_dirs[2]=g_strdup(g_get_user_config_dir());
346 for(i=0;search_dirs[i];i++)
348 path=g_build_filename(search_dirs[i],"bookloupe.ini",NULL);
349 if (g_key_file_load_from_file(kf,path,
350 G_KEY_FILE_KEEP_COMMENTS|G_KEY_FILE_KEEP_TRANSLATIONS,&err))
352 if (!g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
354 g_printerr("Bookloupe: Error reading %s\n",path);
355 g_printerr("%s\n",err->message);
367 g_strfreev(search_dirs);
375 void parse_config_file(void)
382 config=read_config_file(&path);
384 keys=g_key_file_get_keys(config,"options",NULL,NULL);
391 for(j=0;options[j].long_name;j++)
393 if (g_str_has_prefix(options[j].long_name,"no-"))
395 else if (!strcmp(keys[i],options[j].long_name))
397 if (options[j].arg==G_OPTION_ARG_NONE)
399 sw=g_key_file_get_boolean(config,"options",keys[i],
403 g_printerr("Bookloupe: %s: options.%s: %s\n",
404 path,keys[i],err->message);
407 if (options[j].flags&G_OPTION_FLAG_REVERSE)
409 *(gboolean *)options[j].arg_data=sw;
413 g_assert_not_reached();
416 if (!options[j].long_name)
417 g_printerr("Bookloupe: %s: Unknown option \"%s\" ignored\n",
426 void parse_options(int *argc,char ***argv)
429 GOptionContext *context;
430 GOptionGroup *compatibility;
431 context=g_option_context_new(
432 "file - look for errors in Project Gutenberg(TM) etexts");
433 g_option_context_add_main_entries(context,options,NULL);
434 g_option_context_add_main_entries(context,config_options,NULL);
435 compatibility=g_option_group_new("compatibility",
436 "Options for Compatibility with Gutcheck:",
437 "Show compatibility options",NULL,NULL);
438 g_option_group_add_entries(compatibility,compatibility_options);
439 g_option_context_add_group(context,compatibility);
440 g_option_context_set_description(context,
441 "For simplicity, only the switch options which reverse the\n"
442 "default configuration are listed. In most cases, both vanilla\n"
443 "and \"no-\" prefixed versions are available for use.");
444 if (!g_option_context_parse(context,argc,argv,&err))
446 g_printerr("Bookloupe: %s\n",err->message);
447 g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
451 pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
454 pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
455 pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
458 * Web uploads - for the moment, this is really just a placeholder
459 * until we decide what processing we really want to do on web uploads
461 if (pswit[WEB_SWITCH])
463 /* specific override for web uploads */
464 pswit[ECHO_SWITCH]=TRUE;
465 pswit[SQUOTE_SWITCH]=FALSE;
466 pswit[TYPO_SWITCH]=TRUE;
467 pswit[QPARA_SWITCH]=FALSE;
468 pswit[PARANOID_SWITCH]=TRUE;
469 pswit[LINE_END_SWITCH]=FALSE;
470 pswit[OVERVIEW_SWITCH]=FALSE;
471 pswit[STDOUT_SWITCH]=FALSE;
472 pswit[HEADER_SWITCH]=TRUE;
473 pswit[VERBOSE_SWITCH]=FALSE;
474 pswit[MARKUP_SWITCH]=FALSE;
475 pswit[USERTYPO_SWITCH]=FALSE;
476 pswit[DP_SWITCH]=FALSE;
478 if (pswit[DUMP_CONFIG_SWITCH])
483 if (pswit[OVERVIEW_SWITCH])
484 /* just print summary; don't echo */
485 pswit[ECHO_SWITCH]=FALSE;
491 g_option_context_free(context);
497 * Read in the user-defined stealth scanno list.
499 void read_user_scannos(void)
502 gchar *usertypo_file;
506 gchar *contents,*utf8,**lines;
507 usertypo_file=g_strdup("bookloupe.typ");
508 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
509 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
512 g_free(usertypo_file);
513 usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
514 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
516 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
519 g_free(usertypo_file);
520 usertypo_file=g_strdup("gutcheck.typ");
521 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
523 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
526 g_free(usertypo_file);
527 usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
528 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
530 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
532 g_free(usertypo_file);
533 g_print(" --> I couldn't find bookloupe.typ "
534 "-- proceeding without user typos.\n");
539 fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
540 g_free(usertypo_file);
544 if (g_utf8_validate(contents,len,NULL))
545 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
547 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
549 lines=g_strsplit_set(utf8,"\r\n",0);
551 usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
552 for (i=0;lines[i];i++)
553 if (*(unsigned char *)lines[i]>'!')
554 g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
563 * Read an etext returning a newly allocated string containing the file
564 * contents or NULL on error.
566 gchar *read_etext(const char *filename,GError **err)
568 GError *tmp_err=NULL;
569 gchar *contents,*utf8;
570 gsize len,bytes_read,bytes_written;
572 if (!g_file_get_contents(filename,&contents,&len,err))
574 if (g_utf8_validate(contents,len,NULL))
576 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
577 g_set_print_handler(print_as_utf_8);
579 SetConsoleOutputCP(CP_UTF8);
584 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,
585 &bytes_written,&tmp_err);
586 if (g_error_matches(tmp_err,G_CONVERT_ERROR,
587 G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
590 for(i=0;i<bytes_read;i++)
591 if (contents[i]=='\n')
596 else if (contents[i]!='\r')
598 g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
599 "Input conversion failed. Byte %d at line %d, column %d is not a "
600 "valid Windows-1252 character",
601 ((unsigned char *)contents)[bytes_read],line,col);
604 g_propagate_error(err,tmp_err);
605 g_set_print_handler(print_as_windows_1252);
607 SetConsoleOutputCP(1252);
/*
 * cleanup_on_exit:
 *
 * Exit handler.  On Windows, puts the console output code page back to
 * the value held in saved_cp; elsewhere there is nothing to undo.
 */
void cleanup_on_exit(void)
{
#ifdef G_OS_WIN32
    SetConsoleOutputCP(saved_cp);
#endif
}
/*
 * main:
 *
 * Program entry point.  Records the directory the program was run from,
 * sets the switches which default to on, parses the command line and,
 * in overview mode, prints the count of queries in each category
 * followed by their total.
 */
621 int main(int argc,char **argv)
/*
 * NOTE(review): GetConsoleOutputCP() is a Windows console call, so this
 * pair is presumably compiled on Windows only - confirm.
 */
624 atexit(cleanup_on_exit);
625 saved_cp=GetConsoleOutputCP();
627 running_from=g_path_get_dirname(argv[0]);
628 /* Paranoid checking is turned OFF, not on, by its switch */
629 pswit[PARANOID_SWITCH]=TRUE;
630 /* if running in paranoid mode, typo checks default to enabled */
631 pswit[TYPO_SWITCH]=TRUE;
632 /* Line-end checking is turned OFF, not on, by its switch */
633 pswit[LINE_END_SWITCH]=TRUE;
634 /* Echoing is turned OFF, not on, by its switch */
635 pswit[ECHO_SWITCH]=TRUE;
/* The defaults above are set first, so that the options can override them. */
637 parse_options(&argc,&argv);
/*
 * NOTE(review): the statement controlled by this test is not visible
 * here; presumably it loads the user-defined typo list - confirm.
 */
638 if (pswit[USERTYPO_SWITCH])
640 fprintf(stderr,"bookloupe: Check and report on an e-text\n");
/* Overview mode: a summary of counts rather than individual queries. */
642 if (pswit[OVERVIEW_SWITCH])
644 g_print(" Checked %ld lines of %ld (head+foot = %ld)\n\n",
645 checked_linecnt,linecnt,linecnt-checked_linecnt);
646 g_print(" --------------- Queries found --------------\n");
648 g_print(" Long lines: %14ld\n",cnt_long);
650 g_print(" Short lines: %14ld\n",cnt_short);
652 g_print(" Line-end problems: %14ld\n",cnt_lineend);
654 g_print(" Common typos: %14ld\n",cnt_word);
656 g_print(" Unmatched quotes: %14ld\n",cnt_quote);
658 g_print(" Unmatched brackets: %14ld\n",cnt_brack);
660 g_print(" Non-ASCII characters: %14ld\n",cnt_bin);
662 g_print(" Proofing characters: %14ld\n",cnt_odd);
664 g_print(" Punctuation & spacing queries: %14ld\n",cnt_punct);
666 g_print(" Non-standard dashes: %14ld\n",cnt_dash);
668 g_print(" Possible HTML tags: %14ld\n",cnt_html);
/* The total is the sum of the eleven category counts printed above. */
670 g_print(" TOTAL QUERIES %14ld\n",
671 cnt_quote+cnt_brack+cnt_bin+cnt_odd+cnt_long+cnt_short+cnt_punct+
672 cnt_dash+cnt_word+cnt_html+cnt_lineend);
/* Release the global resources before returning. */
674 g_free(running_from);
676 g_tree_unref(usertypo);
678 g_key_file_free(config);
/*
 * count_dashes:
 *
 * Split line at every occurrence of dash and classify each occurrence
 * by whether the characters immediately before and after it are
 * whitespace.  The results are recorded in the counters of results.
 *
 * NOTE(review): when the text before an occurrence is empty (the dash
 * starts the line, or directly follows another occurrence),
 * g_utf8_prev_char() is applied to the start of that token and so looks
 * at the byte before it - confirm that this case is guarded.
 */
682 void count_dashes(const char *line,const char *dash,
683 struct dash_results *results)
688 gboolean spaced=FALSE,unspaced=FALSE,spaced2=FALSE;
691 tokens=g_strsplit(line,dash,0);
/* tokens[i-1] and tokens[i] are the text either side of one occurrence. */
694 for(i=1;tokens[i];i++)
/* pc: last character before the dash; nc: first character after it. */
696 pc=g_utf8_get_char(g_utf8_prev_char(tokens[i-1]+strlen(tokens[i-1])));
697 nc=g_utf8_get_char(tokens[i]);
/* Whitespace on at least one side. */
698 if (g_unichar_isspace(pc) || g_unichar_isspace(nc))
/* Whitespace on both sides. */
700 if (g_unichar_isspace(pc) && g_unichar_isspace(nc))
/* No whitespace on either side. */
702 else if (!g_unichar_isspace(pc) && !g_unichar_isspace(nc))
708 /* count of lines with em-dashes with spaces both sides */
709 results->non_PG_space++;
711 /* count of lines with PG-type em-dashes with no spaces */
719 * Run a first pass - verify that it's a valid PG
720 * file, decide whether to report some things that
721 * occur many times in the text like long or short
722 * lines, non-standard dashes, etc.
/*
 * first_pass:
 *
 * Split etext into lines and walk them once, gathering statistics in a
 * function-static first_pass_results structure.  The visible code works
 * out the newline convention, looks for the Project Gutenberg header and
 * footer marker lines, and counts features such as lines with slashes,
 * very long lines, HTML-looking markup, dashes and Dutch/French marker
 * words.
 *
 * NOTE(review): results is static, so anything accumulated presumably
 * carries over if this is called more than once - confirm it is called
 * once per run.
 */
724 struct first_pass_results *first_pass(const char *etext)
726 gunichar laststart=CHAR_SPACE;
731 unsigned int lastlen=0,lastblen=0;
/* Line numbers following the old-form (spline) and new-form (nspline) header markers. */
732 long spline=0,nspline=0;
733 static struct first_pass_results results={0};
734 struct dash_results tmp_dash_results;
737 lines=g_strsplit(etext,"\n",0);
740 /* An empty etext has no terminators */
741 results.newlines=DOS_NEWLINES;
746 * If there are no LFs, we don't have UNIX-style
747 * terminators, but we might have OS9-style ones.
749 results.newlines=OS9_NEWLINES;
751 lines=g_strsplit(etext,"\r",0);
752 if (!lines[0] || !lines[1])
753 /* Looks like we don't have any terminators at all */
754 results.newlines=DOS_NEWLINES;
758 /* We might have UNIX-style terminators */
759 results.newlines=UNIX_NEWLINES;
761 for (j=0;lines[j];j++)
/* lbytes is the length in bytes; llen, below, is the length in characters. */
763 lbytes=strlen(lines[j]);
/* A CR at the end of a line marks the file as DOS-style; strip all trailing CRs. */
764 if (lbytes>0 && lines[j][lbytes-1]=='\r')
766 results.newlines=DOS_NEWLINES;
769 lines[j][--lbytes]='\0';
770 } while (lbytes>0 && lines[j][lbytes-1]=='\r');
772 llen=g_utf8_strlen(lines[j],lbytes);
/* "*END ... SMALL PRINT" header marker line. */
774 if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
775 (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
778 g_print(" --> Duplicate header?\n");
779 spline=linecnt+1; /* first line of non-header text, that is */
/* "*** START ... PROJECT GUTENBERG" header marker line. */
781 if (!strncmp(lines[j],"*** START",9) &&
782 strstr(lines[j],"PROJECT GUTENBERG"))
785 g_print(" --> Duplicate header?\n");
786 nspline=linecnt+1; /* first line of non-header text, that is */
/* Once a header has been seen, look for the footer, ignoring case. */
788 if (spline || nspline)
790 lc_line=g_utf8_strdown(lines[j],lbytes);
791 if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
793 if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
795 if (results.footerline)
797 /* it's an old-form header - we can detect duplicates */
799 g_print(" --> Duplicate footer?\n");
802 results.footerline=linecnt;
808 results.firstline=spline;
810 results.firstline=nspline; /* override with new */
811 if (results.footerline)
812 continue; /* don't count the boilerplate in the footer */
813 results.totlen+=llen;
/* Character-by-character scan of the line. */
814 for (s=lines[j];*s;s=g_utf8_next_char(s))
816 if (g_utf8_get_char(s)>127)
818 if (g_unichar_isalpha(g_utf8_get_char(s)))
822 if (CHAR_IS_DQUOTE(g_utf8_get_char(s)))
823 qc=QUOTE_CLASS(g_utf8_get_char(s));
/* A closing or neutral quote directly after a letter. */
826 if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) &&
827 g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
828 results.endquote_count++;
831 if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
832 lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
835 g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
837 if (strstr(lines[j],".,"))
839 /* only count ast lines for ignoring purposes where there is */
840 /* locase text on the line */
841 if (strchr(lines[j],'*'))
843 for (s=lines[j];*s;s=g_utf8_next_char(s))
844 if (g_unichar_islower(g_utf8_get_char(s)))
849 if (strchr(lines[j],'/'))
850 results.fslashline++;
/* Step back over trailing characters at or below CHAR_SPACE. */
853 for (s=g_utf8_prev_char(lines[j]+lbytes);
854 s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;
855 s=g_utf8_prev_char(s))
/* A single hyphen, not a double one, as the last such character. */
857 if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
858 g_utf8_get_char(g_utf8_prev_char(s))!='-')
861 if (llen>LONGEST_PG_LINE)
863 if (llen>WAY_TOO_LONG)
864 results.verylongline++;
/* Something that looks like a tag: a '<' and a '>' on the same line. */
865 if (strchr(lines[j],'<') && strchr(lines[j],'>'))
867 i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
870 if (strstr(lines[j],"<i>"))
871 results.htmcount+=4; /* bonus marks! */
873 /* Check for spaced em-dashes */
/* Counted per line: each counter rises by at most one for this line. */
874 memset(&tmp_dash_results,0,sizeof(tmp_dash_results));
875 count_dashes(lines[j],"--",&tmp_dash_results);
876 count_dashes(lines[j],"—",&tmp_dash_results);
877 if (tmp_dash_results.base)
878 results.emdash.base++;
879 if (tmp_dash_results.non_PG_space)
880 results.emdash.non_PG_space++;
881 if (tmp_dash_results.PG_space)
882 results.emdash.PG_space++;
/* Marker words used to guess the language, and standalone 0/1. */
886 if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))
887 results.Dutchcount++;
888 if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))
889 results.Frenchcount++;
890 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
891 results.standalone_digit++;
894 /* Check for spaced dashes */
895 if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
/* Remember the first byte of this line for the next iteration. */
899 laststart=lines[j][0];
908 * Make some snap decisions based on the first pass results.
/*
 * report_first_pass:
 *
 * Decide, from the first-pass statistics in results, which kinds of
 * query are worth reporting.  The decisions are recorded in a
 * function-static warnings structure and an explanatory " --> " note is
 * printed for each.  May also switch pswit[MARKUP_SWITCH] on and reset
 * results->firstline.
 */
910 struct warnings *report_first_pass(struct first_pass_results *results)
912 static struct warnings warnings={0};
913 warnings.newlines=results->newlines;
914 if (warnings.newlines==UNIX_NEWLINES)
915 g_print(" --> No lines in this file have a CR. Not reporting them. "
916 "Project Gutenberg requires that all lineends be CR-LF.\n");
917 else if (warnings.newlines==OS9_NEWLINES)
918 g_print(" --> No lines in this file have a LF. Not reporting them. "
919 "Project Gutenberg requires that all lineends be CR-LF.\n");
921 g_print(" --> %ld lines in this file have white space at end\n",
924 if (results->dotcomma>5)
927 g_print(" --> %ld lines in this file contain '.,'. "
928 "Not reporting them.\n",results->dotcomma);
931 * If more than 50 lines, or one-tenth, are short,
932 * don't bother reporting them.
934 warnings.shortline=1;
935 if (results->shortline>50 || results->shortline*10>linecnt)
937 warnings.shortline=0;
938 g_print(" --> %ld lines in this file are short. "
939 "Not reporting short lines.\n",results->shortline);
942 * If more than 50 lines, or one-tenth, are long,
943 * don't bother reporting them.
946 if (results->longline>50 || results->longline*10>linecnt)
949 g_print(" --> %ld lines in this file are long. "
950 "Not reporting long lines.\n",results->longline);
952 /* If more than 10 lines contain asterisks, don't bother reporting them. */
954 if (results->astline>10)
957 g_print(" --> %ld lines in this file contain asterisks. "
958 "Not reporting them.\n",results->astline);
961 * If more than 10 lines contain forward slashes,
962 * don't bother reporting them.
965 if (results->fslashline>10)
968 g_print(" --> %ld lines in this file contain forward slashes. "
969 "Not reporting them.\n",results->fslashline);
972 * If more than 20 lines contain unpunctuated endquotes,
973 * don't bother reporting them.
976 if (results->endquote_count>20)
979 g_print(" --> %ld lines in this file contain unpunctuated endquotes. "
980 "Not reporting them.\n",results->endquote_count);
983 * If more than 15 lines contain standalone digits,
984 * don't bother reporting them.
/* NOTE(review): the test below uses 10, not the 15 mentioned above. */
987 if (results->standalone_digit>10)
990 g_print(" --> %ld lines in this file contain standalone 0s and 1s. "
991 "Not reporting them.\n",results->standalone_digit);
994 * If more than 20 lines contain hyphens at end,
995 * don't bother reporting them.
998 if (results->hyphens>20)
1001 g_print(" --> %ld lines in this file have hyphens at end. "
1002 "Not reporting them.\n",results->hyphens);
/* Enough tag-like text to treat the file as HTML, unless already asked to. */
1004 if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
1006 g_print(" --> Looks like this is HTML. Switching HTML mode ON.\n");
1007 pswit[MARKUP_SWITCH]=1;
1009 if (results->verylongline>0)
1010 g_print(" --> %ld lines in this file are VERY long!\n",
1011 results->verylongline);
1013 * If there are more non-PG spaced dashes than PG em-dashes,
1014 * assume it's deliberate.
1015 * Current PG guidelines say don't use them, but older texts do,
1016 * and some people insist on them whatever the guidelines say.
1019 if (results->spacedash+results->emdash.non_PG_space>
1020 results->emdash.PG_space)
1023 g_print(" --> There are %ld spaced dashes and em-dashes. "
1024 "Not reporting them.\n",
1025 results->spacedash+results->emdash.non_PG_space);
1027 /* If more than a quarter of characters are hi-bit, bug out. */
1029 if (results->binlen*4>results->totlen)
1031 g_print(" --> This file does not appear to be ASCII. "
1032 "Terminating. Best of luck with it!\n");
/* Likewise if fewer than a quarter of the characters are letters. */
1035 if (results->alphalen*4<results->totlen)
1037 g_print(" --> This file does not appear to be text. "
1038 "Terminating. Best of luck with it!\n");
/* More than 1% of the characters, or more than 100 of them, are hi-bit. */
1041 if (results->binlen*100>results->totlen || results->binlen>100)
1043 g_print(" --> There are a lot of foreign letters here. "
1044 "Not reporting them.\n");
1047 warnings.isDutch=FALSE;
1048 if (results->Dutchcount>50)
1050 warnings.isDutch=TRUE;
1051 g_print(" --> This looks like Dutch - "
1052 "switching off dashes and warnings for 's Middags case.\n");
1054 warnings.isFrench=FALSE;
1055 if (results->Frenchcount>50)
1057 warnings.isFrench=TRUE;
1058 g_print(" --> This looks like French - "
1059 "switching off some doublepunct.\n");
1061 if (results->firstline && results->footerline)
1062 g_print(" The PG header and footer appear to be already on.\n");
1065 if (results->firstline)
1066 g_print(" The PG header is on - no footer.\n");
1067 if (results->footerline)
1068 g_print(" The PG footer is on - no header.\n");
/* Verbose mode switches reporting back on. */
1071 if (pswit[VERBOSE_SWITCH])
1074 warnings.shortline=1;
1075 warnings.dotcomma=1;
1076 warnings.longline=1;
1082 warnings.endquote=1;
1083 g_print(" *** Verbose output is ON -- you asked for it! ***\n");
1085 if (warnings.isDutch)
/* Header and footer found, in that order, but fewer than 100 lines apart. */
1087 if (results->footerline>0 && results->firstline>0 &&
1088 results->footerline>results->firstline &&
1089 results->footerline-results->firstline<100)
1091 g_print(" --> I don't really know where this text starts. \n");
1092 g_print(" There are no reference points.\n");
1093 g_print(" I'm going to have to report the header and footer "
1095 results->firstline=0;
1103 * Look along the line, accumulate the count of quotes, and see
1104 * if this is an empty line - i.e. a line with nothing on it
1106 * If line has just spaces, period, * and/or - on it, don't
1107 * count it, since empty lines with asterisks or dashes to
1108 * separate sections are common.
1110 * Returns: TRUE if the line is empty.
/*
 * analyse_quotes:
 *
 * aline is the line to examine; counters receives the quote, underscore
 * and bracket tallies.  Double quotes are always counted; single quotes
 * only when pswit[SQUOTE_SWITCH] is set, and then only where they look
 * like quotation marks rather than apostrophes.  An error reported by
 * count_quote() is printed with its line and column and then cleared.
 */
1112 gboolean analyse_quotes(const char *aline,struct counters *counters)
1115 /* assume the line is empty until proven otherwise */
1116 gboolean isemptyline=TRUE;
1117 const char *s=aline,*sprev,*snext;
1120 GError *tmp_err=NULL;
/* c is the current character; snext points at the one after it. */
1123 snext=g_utf8_next_char(s);
1124 c=g_utf8_get_char(s);
1125 if (CHAR_IS_DQUOTE(c))
1126 (void)count_quote(counters,c,QUOTE_CLASS(c),&tmp_err);
1127 else if (CHAR_IS_SQUOTE(c) && pswit[SQUOTE_SWITCH])
1132 * At start of line, it can only be a quotation mark.
1133 * Hardcode a very common exception!
1135 if (!g_str_has_prefix(snext,"tis") &&
1136 !g_str_has_prefix(snext,"Tis"))
1137 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
/* A letter on both sides. */
1139 else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
1140 g_unichar_isalpha(g_utf8_get_char(snext)))
1141 /* Do nothing! it's definitely an apostrophe, not a quote */
1143 /* it's outside a word - let's check it out */
1144 else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||
1145 g_unichar_isalpha(g_utf8_get_char(snext)))
1147 /* certainly looks like a quotation mark */
1148 if (!g_str_has_prefix(snext,"tis") &&
1149 !g_str_has_prefix(snext,"Tis"))
1150 /* hardcode a very common exception! */
/* Straight after punctuation it is counted as neutral, otherwise as opening. */
1152 if (strchr(".?!,;:",g_utf8_get_char(sprev)))
1153 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
1155 (void)count_quote(counters,c,OPENING_QUOTE,&tmp_err);
1160 /* now - is it a quotation mark? */
1161 guessquote=0; /* accumulate clues */
1162 if (g_unichar_isalpha(g_utf8_get_char(sprev)))
1164 /* it follows a letter - could be either */
1166 if (g_utf8_get_char(sprev)=='s')
1168 /* looks like a plural apostrophe */
1170 if (g_utf8_get_char(snext)==CHAR_SPACE)
1174 if (innermost_quote_matches(counters,c))
1176 * Give it the benefit of some doubt,
1177 * if a squote is already open.
1183 (void)count_quote(counters,c,CLOSING_QUOTE,&tmp_err);
1186 /* no adjacent letter - it must be a quote of some kind */
1187 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
/* Report an error from count_quote(); the column is counted from 1. */
1192 if (pswit[ECHO_SWITCH])
1193 g_print("\n%s\n",aline);
1194 if (!pswit[OVERVIEW_SWITCH])
1195 g_print(" Line %ld column %ld - %s\n",
1196 linecnt,g_utf8_pointer_to_offset(aline,s)+1,tmp_err->message);
1197 g_clear_error(&tmp_err);
1199 if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
1201 isemptyline=FALSE; /* ignore lines like * * * as spacers */
1202 if (c==CHAR_UNDERSCORE)
1203 counters->c_unders++;
1204 if (c==CHAR_OPEN_SBRACK)
/*
 * "[Illustration:" at the very start of the line, with no illustration
 * or square bracket currently unmatched, is counted as an illustration
 * rather than as a square bracket.
 */
1206 if (!matching_difference(counters,COUNTER_ILLUSTRATION) &&
1207 !matching_difference(counters,c) && s==aline &&
1208 g_str_has_prefix(s,"[Illustration:"))
1209 increment_matching(counters,COUNTER_ILLUSTRATION,TRUE);
1211 increment_matching(counters,c,TRUE);
1213 else if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK)
1214 increment_matching(counters,c,TRUE);
1215 if (c==CHAR_CLOSE_SBRACK)
/* Only a ']' that ends the line can be counted as closing an illustration. */
1217 if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) &&
1218 !matching_difference(counters,c) && !*snext)
1219 increment_matching(counters,COUNTER_ILLUSTRATION,FALSE);
1221 increment_matching(counters,c,FALSE);
1223 else if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK)
1224 increment_matching(counters,c,FALSE);
1232 * check_for_control_characters:
1234 * Check for invalid or questionable characters in the line
1235 * Anything above 127 is invalid for plain ASCII, and
1236 * non-printable control characters should also be flagged.
1237 * Tabs should generally not be there.
1239 void check_for_control_characters(const char *aline)
1243 for (s=aline;*s;s=g_utf8_next_char(s))
1245 c=g_utf8_get_char(s);
1246 if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
1248 if (pswit[ECHO_SWITCH])
1249 g_print("\n%s\n",aline);
1250 if (!pswit[OVERVIEW_SWITCH])
1251 g_print(" Line %ld column %ld - Control character %u\n",
1252 linecnt,g_utf8_pointer_to_offset(s,aline)+1,c);
1260 * check_for_odd_characters:
1262 * Check for binary and other odd characters.
/*
 * Reviewer notes (some lines are not visible in this listing):
 *  - Scans aline one UTF-8 character at a time.
 *  - Each class of query has its own flag (eNon_A, eTab, eTilde, eCarat,
 *    eFSlash, eAst) tested before reporting; presumably the flag is set in
 *    the lines not shown so that each class is reported once per line.
 *  - A character below CHAR_SPACE other than tab/newline, or above 127, is
 *    queried; the message says "Non-ISO-8859" for 128..159 and for values
 *    above 255, otherwise "Non-ASCII".
 *  - Forward slashes are only queried when warnings->fslash is set.
 *  - Asterisks are only queried with PARANOID_SWITCH, warnings->ast and a
 *    non-empty line (the rest of that condition is not visible here).
 */
1264 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
1265 gboolean isemptyline)
1267 /* Don't repeat multiple warnings on one line. */
1268 gboolean eNon_A=FALSE,eTab=FALSE,eTilde=FALSE;
1269 gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
1272 for (s=aline;*s;s=g_utf8_next_char(s))
1274 c=g_utf8_get_char(s);
1275 if (!eNon_A && (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
1277 if (pswit[ECHO_SWITCH])
1278 g_print("\n%s\n",aline);
1279 if (!pswit[OVERVIEW_SWITCH])
1280 if (c>127 && c<160 || c>255)
1281 g_print(" Line %ld column %ld - "
1282 "Non-ISO-8859 character %u\n",
1283 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1285 g_print(" Line %ld column %ld - "
1286 "Non-ASCII character %u\n",
1287 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1292 if (!eTab && c==CHAR_TAB)
1294 if (pswit[ECHO_SWITCH])
1295 g_print("\n%s\n",aline);
1296 if (!pswit[OVERVIEW_SWITCH])
1297 g_print(" Line %ld column %ld - Tab character?\n",
1298 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1303 if (!eTilde && c==CHAR_TILDE)
1306 * Often used by OCR software to indicate an
1307 * unrecognizable character.
1309 if (pswit[ECHO_SWITCH])
1310 g_print("\n%s\n",aline);
1311 if (!pswit[OVERVIEW_SWITCH])
1312 g_print(" Line %ld column %ld - Tilde character?\n",
1313 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1318 if (!eCarat && c==CHAR_CARAT)
1320 if (pswit[ECHO_SWITCH])
1321 g_print("\n%s\n",aline);
1322 if (!pswit[OVERVIEW_SWITCH])
1323 g_print(" Line %ld column %ld - Carat character?\n",
1324 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1329 if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
1331 if (pswit[ECHO_SWITCH])
1332 g_print("\n%s\n",aline);
1333 if (!pswit[OVERVIEW_SWITCH])
1334 g_print(" Line %ld column %ld - Forward slash?\n",
1335 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1341 * Report asterisks only in paranoid mode,
1342 * since they're often deliberate.
1344 if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
1347 if (pswit[ECHO_SWITCH])
1348 g_print("\n%s\n",aline);
1349 if (!pswit[OVERVIEW_SWITCH])
1350 g_print(" Line %ld column %ld - Asterisk?\n",
1351 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1360 * check_for_long_line:
1362 * Check for line too long.
/*
 * Queries aline when its length in UTF-8 characters exceeds
 * LONGEST_PG_LINE.  The character length is printed twice: once as the
 * column (i.e. the last column of the line) and once as the line length.
 */
1364 void check_for_long_line(const char *aline)
1366 if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
1368 if (pswit[ECHO_SWITCH])
1369 g_print("\n%s\n",aline);
1370 if (!pswit[OVERVIEW_SWITCH])
1371 g_print(" Line %ld column %ld - Long line %ld\n",
1372 linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
1379 * check_for_short_line:
1381 * Check for line too short.
1383 * This one is a bit trickier to implement: we don't want to
1384 * flag the last line of a paragraph for being short, so we
1385 * have to wait until we know that our current line is a
1386 * "normal" line, then report the _previous_ line if it was too
1387 * short. We also don't want to report indented lines like
1388 * chapter heads or formatted quotations. We therefore keep
1389 * last->len as the length of the last line examined, and
1390 * last->blen as the length of the last but one, and try to
1391 * suppress unnecessary warnings by checking that both were of
1392 * "normal" length. We keep the first character of the last
1393 * line in last->start, and if it was a space, we assume that
1394 * the formatting is deliberate. I can't figure out a way to
1395 * distinguish something like a quoted verse left-aligned or
1396 * the header or footer of a letter from a paragraph of short
1397 * lines - maybe if I examined the whole paragraph, and if the
1398 * para has less than, say, 8 lines and if all lines are short,
1399 * then just assume it's OK? Need to look at some texts to see
1400 * how often a formula like this would get the right result.
/*
 * Queries the PREVIOUS line (prevline, reported as linecnt-1) when all of
 * the following hold: the current line has more than one character, the
 * previous line's length (last->len) is between 2 and SHORTEST_PG_LINE-1,
 * the line before that (last->blen) is longer than SHORTEST_PG_LINE, and
 * the previous line did not start with a space (last->start).
 * NOTE(review): "last->blen>1" is implied by "last->blen>SHORTEST_PG_LINE"
 * whenever SHORTEST_PG_LINE is at least 1 - confirm it can be dropped.
 */
1402 void check_for_short_line(const char *aline,const struct line_properties *last)
1404 if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
1405 last->len<SHORTEST_PG_LINE && last->blen>1 &&
1406 last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
1408 if (pswit[ECHO_SWITCH])
1409 g_print("\n%s\n",prevline);
1410 if (!pswit[OVERVIEW_SWITCH])
1411 g_print(" Line %ld column %ld - Short line %ld?\n",
1412 linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
1419 * check_for_starting_punctuation:
1421 * Look for punctuation other than full ellipses at start of line.
/*
 * Queries a non-empty line whose first character is one of ".?!,;:",
 * unless the line begins with the ". . ." prefix tested below.
 * The query is always reported at column 1.
 */
1423 void check_for_starting_punctuation(const char *aline)
1425 if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
1426 !g_str_has_prefix(aline,". . ."))
1428 if (pswit[ECHO_SWITCH])
1429 g_print("\n%s\n",aline);
1430 if (!pswit[OVERVIEW_SWITCH])
1431 g_print(" Line %ld column 1 - Begins with punctuation?\n",
1441 * Find the first em-dash, return a pointer to it and set <next> to the
1442 * character following the dash.
/*
 * Reviewer notes: only the assignments to *next are visible in this
 * listing.  They set *next either one character past s2 or two characters
 * past s1; how s1 and s2 are found is in lines not shown here -
 * presumably s1 is a two-character dash and s2 a single-character one,
 * TODO confirm against the full source.
 */
1444 char *str_emdash(const char *s,const char **next)
1452 *next=g_utf8_next_char(s2);
1457 *next=g_utf8_next_char(g_utf8_next_char(s1));
1462 *next=g_utf8_next_char(g_utf8_next_char(s1));
1467 *next=g_utf8_next_char(s2);
1473 * check_for_spaced_emdash:
1475 * Check for spaced em-dashes.
1477 * We must check _all_ occurrences of em-dashes on the line
1478 * hence the loop - even if the first dash is OK
1479 * there may be another that's wrong later on.
/*
 * Iterates over every em-dash that str_emdash() finds in aline, resuming
 * each search at the position str_emdash() stored in next.  A dash is
 * queried when the character before it (if it is not at the start of the
 * line) or the character at next is a space.
 */
1481 void check_for_spaced_emdash(const char *aline)
1483 const char *s,*t,*next;
1484 for (s=aline;t=str_emdash(s,&next);s=next)
1486 if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
1487 g_utf8_get_char(next)==CHAR_SPACE)
1489 if (pswit[ECHO_SWITCH])
1490 g_print("\n%s\n",aline);
1491 if (!pswit[OVERVIEW_SWITCH])
1492 g_print(" Line %ld column %ld - Spaced em-dash?\n",
1493 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1501 * check_for_spaced_dash:
1503 * Check for spaced dashes.
/*
 * Looks at the FIRST " -" in aline and queries it unless the character
 * after the dash is another '-'.  Only when the line contains no " -" at
 * all does it look at the first "- ", which is queried unless it is
 * preceded by another '-'.
 * NOTE(review): because of the else-if, a line whose first " -" is
 * followed by a second dash never has its "- " occurrences examined, and
 * later occurrences of either pattern are not examined - confirm that
 * this single-hit behaviour is intended.
 */
1505 void check_for_spaced_dash(const char *aline)
1508 if ((s=strstr(aline," -")))
1510 if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
1512 if (pswit[ECHO_SWITCH])
1513 g_print("\n%s\n",aline);
1514 if (!pswit[OVERVIEW_SWITCH])
1515 g_print(" Line %ld column %ld - Spaced dash?\n",
1516 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1521 else if ((s=strstr(aline,"- ")))
1523 if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
1525 if (pswit[ECHO_SWITCH])
1526 g_print("\n%s\n",aline);
1527 if (!pswit[OVERVIEW_SWITCH])
1528 g_print(" Line %ld column %ld - Spaced dash?\n",
1529 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1537 * check_for_unmarked_paragraphs:
1539 * Check for unmarked paragraphs indicated by separate speakers.
1541 * May well be false positive:
1542 * "Bravo!" "Wonderful!" called the crowd.
1543 * but useful all the same.
/*
 * Searches aline for a closing double quote followed by whitespace and
 * an opening double quote, and queries a possible missing paragraph
 * break at the position of the first quote.
 * NOTE(review): the two strstr() patterns below read identically in this
 * listing, which would make the second call redundant; presumably they
 * differ in the amount of whitespace between the quotes - confirm
 * against the original file.
 */
1545 void check_for_unmarked_paragraphs(const char *aline)
1548 s=strstr(aline,"\" \"");
1550 s=strstr(aline,"\" \"");
1553 if (pswit[ECHO_SWITCH])
1554 g_print("\n%s\n",aline);
1555 if (!pswit[OVERVIEW_SWITCH])
1556 g_print(" Line %ld column %ld - "
1557 "Query missing paragraph break?\n",
1558 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1565 * check_for_jeebies:
1567 * Check for "to he" and other easy h/b errors.
1569 * This is a very inadequate effort on the h/b problem,
1570 * but the phrase "to he" is always an error, whereas "to
1571 * be" is quite common.
1572 * Similarly, '"Quiet!", be said.' is a non-be error
1573 * "to he" is _not_ always an error!:
1574 * "Where they went to he couldn't say."
1575 * Another false positive:
1576 * What would "Cinderella" be without the . . .
1577 * and another: "If he wants to he can see for himself."
/*
 * Runs three groups of literal, case-sensitive strstr() searches on aline
 * and prints one query per group at the position of the match:
 *   - he/be confusions   ("Query he/be error?")
 *   - had/bad confusions ("Query had/bad error?")
 *   - hut/but confusions ("Query hut/but error?")
 * The guards between the searches are not visible in this listing;
 * presumably each later pattern is only tried when the earlier ones did
 * not match - TODO confirm.
 * NOTE(review): the pattern " i bad " uses a lowercase "i", so it cannot
 * match the normally capitalised pronoun - confirm this is intended.
 * NOTE(review): two of the quote-prefixed "be" patterns read identically
 * here; presumably they differ in whitespace in the original file.
 */
1579 void check_for_jeebies(const char *aline)
1582 s=strstr(aline," be could ");
1584 s=strstr(aline," be would ");
1586 s=strstr(aline," was be ");
1588 s=strstr(aline," be is ");
1590 s=strstr(aline," is be ");
1592 s=strstr(aline,"\", be ");
1594 s=strstr(aline,"\" be ");
1596 s=strstr(aline,"\" be ");
1598 s=strstr(aline," to he ");
1601 if (pswit[ECHO_SWITCH])
1602 g_print("\n%s\n",aline);
1603 if (!pswit[OVERVIEW_SWITCH])
1604 g_print(" Line %ld column %ld - Query he/be error?\n",
1605 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1609 s=strstr(aline," the had ");
1611 s=strstr(aline," a had ");
1613 s=strstr(aline," they bad ");
1615 s=strstr(aline," she bad ");
1617 s=strstr(aline," he bad ");
1619 s=strstr(aline," you bad ");
1621 s=strstr(aline," i bad ");
1624 if (pswit[ECHO_SWITCH])
1625 g_print("\n%s\n",aline);
1626 if (!pswit[OVERVIEW_SWITCH])
1627 g_print(" Line %ld column %ld - Query had/bad error?\n",
1628 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1632 s=strstr(aline,"; hut ");
1634 s=strstr(aline,", hut ");
1637 if (pswit[ECHO_SWITCH])
1638 g_print("\n%s\n",aline);
1639 if (!pswit[OVERVIEW_SWITCH])
1640 g_print(" Line %ld column %ld - Query hut/but error?\n",
1641 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1648 * check_for_mta_from:
1650 * Special case - angled bracket in front of "From" placed there by an
1651 * MTA when sending an e-mail.
/*
 * Searches aline for the literal text ">From" (anywhere in the line, not
 * only at its start) and queries it at the column of the '>'.
 */
1653 void check_for_mta_from(const char *aline)
1656 s=strstr(aline,">From");
1659 if (pswit[ECHO_SWITCH])
1660 g_print("\n%s\n",aline);
1661 if (!pswit[OVERVIEW_SWITCH])
1662 g_print(" Line %ld column %ld - "
1663 "Query angled bracket with From\n",
1664 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1671 * check_for_orphan_character:
1673 * Check for a single character line -
1674 * often an overflow from bad wrapping.
/*
 * Queries a line that consists of exactly one UTF-8 character, unless
 * that character is a digit or one of the capitals I, V, X, L (treated
 * as a numeral standing alone).  The query is reported at column 1.
 */
1676 void check_for_orphan_character(const char *aline)
1679 c=g_utf8_get_char(aline);
1680 if (c && !*g_utf8_next_char(aline))
1682 if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
1683 ; /* Nothing - ignore numerals alone on a line. */
1686 if (pswit[ECHO_SWITCH])
1687 g_print("\n%s\n",aline);
1688 if (!pswit[OVERVIEW_SWITCH])
1689 g_print(" Line %ld column 1 - Query single character line\n",
1698 * check_for_pling_scanno:
1700 * Check for I" - often should be !
/*
 * Searches aline for a space, a capital I and a double quote in sequence
 * and queries whether the I should have been an exclamation mark.
 * NOTE(review): s points at the space before the I and the column is
 * printed as the raw 0-based offset with no +1, unlike most other checks
 * in this file - confirm which column is meant to be reported.
 */
1702 void check_for_pling_scanno(const char *aline)
1705 s=strstr(aline," I\"");
1708 if (pswit[ECHO_SWITCH])
1709 g_print("\n%s\n",aline);
1710 if (!pswit[OVERVIEW_SWITCH])
1711 g_print(" Line %ld column %ld - Query I=exclamation mark?\n",
1712 linecnt,g_utf8_pointer_to_offset(aline,s));
1719 * check_for_extra_period:
1721 * Check for period without a capital letter. Cut-down from gutspell.
1722 * Only works when it happens on a single line.
/*
 * Reviewer notes (many lines of this function are not visible here):
 *  - Only active when pswit[PARANOID_SWITCH] is set.
 *  - Walks every ". " in aline; a period not preceded by a letter is
 *    skipped.
 *  - If the next letter or digit after the period is lowercase, the word
 *    before the period is copied into testword and compared against the
 *    abbrev[] list, tested for a leading digit, a single character, a
 *    Roman numeral (isroman) and for containing a vowel (after canonical
 *    decomposition).
 *  - A query is printed and the word remembered in the qperiod tree;
 *    without VERBOSE_SWITCH a word already in qperiod is not reported
 *    again.
 * NOTE(review): in the isDutch branch, g_utf8_offset_to_pointer(t,2..5)
 * is called with no visible check that that many characters remain before
 * the terminating NUL - confirm this cannot read past the end of aline.
 */
1724 void check_for_extra_period(const char *aline,const struct warnings *warnings)
1726 const char *s,*t,*s1,*sprev;
1731 gunichar c,nc,pc,*decomposition;
1732 if (pswit[PARANOID_SWITCH])
1734 for (t=aline;t=strstr(t,". ");)
1738 t=g_utf8_next_char(t);
1739 /* start of line punctuation is handled elsewhere */
1742 if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
1744 t=g_utf8_next_char(t);
1747 if (warnings->isDutch)
1749 /* For Frank & Jeroen -- 's Middags case */
1750 gunichar c2,c3,c4,c5;
1751 c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
1752 c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
1753 c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
1754 c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
1755 if (CHAR_IS_APOSTROPHE(c2) &&
1756 g_unichar_islower(c3) && c4==CHAR_SPACE &&
1757 g_unichar_isupper(c5))
1759 t=g_utf8_next_char(t);
1763 s1=g_utf8_next_char(g_utf8_next_char(t));
1764 while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
1765 !g_unichar_isdigit(g_utf8_get_char(s1)))
1766 s1=g_utf8_next_char(s1);
1767 if (g_unichar_islower(g_utf8_get_char(s1)))
1769 /* we have something to investigate */
1771 /* so let's go back and find out */
1772 nc=g_utf8_get_char(t);
1773 s1=g_utf8_prev_char(t);
1774 c=g_utf8_get_char(s1);
1775 sprev=g_utf8_prev_char(s1);
1776 pc=g_utf8_get_char(sprev);
1778 (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||
1779 g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&
1780 g_unichar_isalpha(nc)))
1785 sprev=g_utf8_prev_char(s1);
1786 pc=g_utf8_get_char(sprev);
1788 s1=g_utf8_next_char(s1);
1791 testword=g_strndup(s1,s-s1);
1793 testword=g_strdup(s1);
1794 for (i=0;*abbrev[i];i++)
1795 if (!strcmp(testword,abbrev[i]))
1797 if (g_unichar_isdigit(g_utf8_get_char(testword)))
1799 if (!*g_utf8_next_char(testword))
1801 if (isroman(testword))
1806 for (s=testword;*s;s=g_utf8_next_char(s))
1808 decomposition=g_unicode_canonical_decomposition(
1809 g_utf8_get_char(s),&len);
1810 if (g_utf8_strchr("aeiou",-1,decomposition[0]))
1812 g_free(decomposition);
1816 (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
1818 g_tree_insert(qperiod,g_strdup(testword),
1819 GINT_TO_POINTER(1));
1820 if (pswit[ECHO_SWITCH])
1821 g_print("\n%s\n",aline);
1822 if (!pswit[OVERVIEW_SWITCH])
1823 g_print(" Line %ld column %ld - Extra period?\n",
1824 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1830 t=g_utf8_next_char(t);
1836 * check_for_following_punctuation:
1838 * Check for words usually not followed by punctuation.
/*
 * Reviewer notes (the word-extraction loop is not visible in this
 * listing):
 *  - Only active when pswit[TYPO_SWITCH] is set.
 *  - Each word is lower-cased with g_utf8_strdown() into inword.
 *  - If inword is in nocomma[] and the character at s is ',', ';' or ':',
 *    or inword is in noperiod[] and the character at s is '.' or '!', a
 *    "Query punctuation after" message is printed at that character.
 *  - Both tables are scanned until an entry that is the empty string.
 */
1840 void check_for_following_punctuation(const char *aline)
1843 const char *s,*wordstart;
1846 if (pswit[TYPO_SWITCH])
1857 inword=g_utf8_strdown(t,-1);
1859 for (i=0;*nocomma[i];i++)
1860 if (!strcmp(inword,nocomma[i]))
1862 c=g_utf8_get_char(s);
1863 if (c==',' || c==';' || c==':')
1865 if (pswit[ECHO_SWITCH])
1866 g_print("\n%s\n",aline);
1867 if (!pswit[OVERVIEW_SWITCH])
1868 g_print(" Line %ld column %ld - "
1869 "Query punctuation after %s?\n",
1870 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
1876 for (i=0;*noperiod[i];i++)
1877 if (!strcmp(inword,noperiod[i]))
1879 c=g_utf8_get_char(s);
1880 if (c=='.' || c=='!')
1882 if (pswit[ECHO_SWITCH])
1883 g_print("\n%s\n",aline);
1884 if (!pswit[OVERVIEW_SWITCH])
1885 g_print(" Line %ld column %ld - "
1886 "Query punctuation after %s?\n",
1887 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
1901 * Check for commonly mistyped words,
1902 * and digits like 0 for O in a word.
/*
 * Reviewer notes (many lines of this function are not visible here):
 *  - Words are taken from the line with getaword(); a word for which
 *    mixdigit() is true is queried as "Query digit in".
 *  - With TYPO_SWITCH or USERTYPO_SWITCH the word is scanned for an
 *    uppercase letter after a lowercase one, allowing the Mc/Mac and
 *    apostrophe cases tested below, and then case-folded into testword.
 *  - With TYPO_SWITCH, testword is checked against nostart[]/noend[],
 *    a set of unlikely letter sequences (cb, gbt, pbt, tbs, mrn, ahle,
 *    ihle, tbi, tbe, ii), a no-vowel/no-consonant test, the okword[] and
 *    typo[] lists, isroman(), and the single letters "slmijdn".
 *  - Reported words are tracked in the qword tree; without VERBOSE_SWITCH
 *    duplicates are flagged as such rather than reported again.
 *  - The usertypo tree, when present, is consulted for words not already
 *    flagged.
 *  - With PARANOID_SWITCH and warnings->digit, a standalone "0" or "1" is
 *    queried.
 * NOTE(review): the column is wordstart offset +1 for the "Query digit"
 * and "Query word" messages but +2 for "Query possible scanno" and
 * "Query standalone" - confirm whether the difference is deliberate.
 */
1904 void check_for_typos(const char *aline,struct warnings *warnings)
1906 const char *s,*t,*nt,*wordstart;
1908 gunichar *decomposition;
1910 int i,vowel,consonant,*dupcnt;
1911 gboolean isdup,istypo,alower;
1914 gsize decomposition_len;
1918 inword=getaword(&s);
1922 continue; /* don't bother with empty lines */
1924 if (mixdigit(inword))
1926 if (pswit[ECHO_SWITCH])
1927 g_print("\n%s\n",aline);
1928 if (!pswit[OVERVIEW_SWITCH])
1929 g_print(" Line %ld column %ld - Query digit in %s\n",
1930 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
1935 * Put the word through a series of tests for likely typos and OCR
1938 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
1942 for (t=inword;*t;t=g_utf8_next_char(t))
1944 c=g_utf8_get_char(t);
1945 nt=g_utf8_next_char(t);
1946 /* lowercase for testing */
1947 if (g_unichar_islower(c))
1949 if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
1952 * We have an uppercase mid-word. However, there are
1954 * Mac and Mc like McGill
1955 * French contractions like l'Abbe
1957 offset=g_utf8_pointer_to_offset(inword,t);
1959 pc=g_utf8_get_char(g_utf8_prev_char(t));
1962 if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
1963 offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
1964 g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
1965 CHAR_IS_APOSTROPHE(pc))
1971 testword=g_utf8_casefold(inword,-1);
1973 if (pswit[TYPO_SWITCH])
1976 * Check for certain unlikely two-letter combinations at word
1979 len=g_utf8_strlen(testword,-1);
1982 for (i=0;*nostart[i];i++)
1983 if (g_str_has_prefix(testword,nostart[i]))
1985 for (i=0;*noend[i];i++)
1986 if (g_str_has_suffix(testword,noend[i]))
1989 /* ght is common, gbt never. Like that. */
1990 if (strstr(testword,"cb"))
1992 if (strstr(testword,"gbt"))
1994 if (strstr(testword,"pbt"))
1996 if (strstr(testword,"tbs"))
1998 if (strstr(testword,"mrn"))
2000 if (strstr(testword,"ahle"))
2002 if (strstr(testword,"ihle"))
2005 * "TBE" does happen - like HEARTBEAT - but uncommon.
2006 * Also "TBI" - frostbite, outbid - but uncommon.
2007 * Similarly "ii" like Hawaii, or Pompeii, and in Roman
2008 * numerals, but "ii" is a common scanno.
2010 if (strstr(testword,"tbi"))
2012 if (strstr(testword,"tbe"))
2014 if (strstr(testword,"ii"))
2017 * Check for no vowels or no consonants.
2018 * If none, flag a typo.
2020 if (!istypo && len>1)
2023 for (t=testword;*t;t=g_utf8_next_char(t))
2025 c=g_utf8_get_char(t);
2027 g_unicode_canonical_decomposition(c,&decomposition_len);
2028 if (c=='y' || g_unichar_isdigit(c))
2030 /* Yah, this is loose. */
2034 else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
2038 g_free(decomposition);
2040 if (!vowel || !consonant)
2044 * Now exclude the word from being reported if it's in
2047 for (i=0;*okword[i];i++)
2048 if (!strcmp(testword,okword[i]))
2051 * What looks like a typo may be a Roman numeral.
2054 if (istypo && isroman(testword))
2056 /* Check the manual list of typos. */
2058 for (i=0;*typo[i];i++)
2059 if (!strcmp(testword,typo[i]))
2062 * Check lowercase s, l, i and m - special cases.
2063 * "j" - often a semi-colon gone wrong.
2064 * "d" for a missing apostrophe - he d
2067 if (!istypo && len==1 &&
2068 g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
2072 dupcnt=g_tree_lookup(qword,testword);
2076 isdup=!pswit[VERBOSE_SWITCH];
2080 dupcnt=g_new0(int,1);
2081 g_tree_insert(qword,g_strdup(testword),dupcnt);
2086 if (pswit[ECHO_SWITCH])
2087 g_print("\n%s\n",aline);
2088 if (!pswit[OVERVIEW_SWITCH])
2090 g_print(" Line %ld column %ld - Query word %s",
2091 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
2093 if (!pswit[VERBOSE_SWITCH])
2094 g_print(" - not reporting duplicates");
2102 /* check the user's list of typos */
2103 if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
2105 if (pswit[ECHO_SWITCH])
2106 g_print("\n%s\n",aline);
2107 if (!pswit[OVERVIEW_SWITCH])
2108 g_print(" Line %ld column %ld - Query possible scanno %s\n",
2109 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
2111 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
2113 if (pswit[PARANOID_SWITCH] && warnings->digit)
2115 /* In paranoid mode, query all 0 and 1 standing alone. */
2116 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
2118 if (pswit[ECHO_SWITCH])
2119 g_print("\n%s\n",aline);
2120 if (!pswit[OVERVIEW_SWITCH])
2121 g_print(" Line %ld column %ld - Query standalone %s\n",
2122 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
2133 * check_for_misspaced_punctuation:
2135 * Look for added or missing spaces around punctuation and quotes.
2136 * If there is a punctuation character like ! with no space on
2137 * either side, suspect a missing!space. If there are spaces on
2138 * both sides , assume a typo. If we see a double quote with no
2139 * space or punctuation on either side of it, assume unspaced
2140 * quotes "like"this.
/*
 * Reviewer notes (some lines are not visible in this listing).  The
 * function makes several separate passes over aline:
 *  1. Punctuation ".?!,;:_" with a letter after it and either a letter
 *     before it or strict punctuation "?!,;:" -> "Missing space?";
 *     punctuation with a space before it and a space or end of line after
 *     it -> "Spaced punctuation?" (skipped on empty lines and ellipses).
 *     The acronym/ellipsis exemptions look two characters back and ahead.
 *  2. "?!,;:" preceded by a space and not followed by one ->
 *     "Spaced punctuation?".
 *  3. A character preceded by a space and followed by a letter (the test
 *     for the period itself is not visible) -> "Spaced punctuation?".
 *  4. A double quote with no space or punctuation on either side, or not
 *     preceded by an opener and followed by a letter -> "Unspaced quotes?".
 *  5. Double-quote parity, tracked in parities->dquote across calls ->
 *     "Wrongspaced quotes?" depending on the following character.
 *  6. A double quote at the start of the line followed by one of
 *     ",;:!?)]} " -> "Wrongspaced quotes?" at column 1.
 *  7. With SQUOTE_SWITCH, single-quote parity in parities->squote ->
 *     "Wrongspaced singlequotes?".
 * Passes 1-4 start at the second character and stop once nc is 0, so the
 * last character of the line is examined as c but never as s.
 */
2142 void check_for_misspaced_punctuation(const char *aline,
2143 struct parities *parities,gboolean isemptyline)
2145 gboolean isacro,isellipsis;
2147 gunichar c,nc,pc,n2c;
2149 c=g_utf8_get_char(aline);
2150 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2151 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2155 nc=g_utf8_get_char(g_utf8_next_char(s));
2156 /* For each character in the line after the first. */
2157 if (g_utf8_strchr(".?!,;:_",-1,c)) /* if it's punctuation */
2159 /* we need to suppress warnings for acronyms like M.D. */
2161 /* we need to suppress warnings for ellipsis . . . */
2164 * If there are letters on both sides of it or
2165 * if it's strict punctuation followed by an alpha.
2167 if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
2168 g_utf8_strchr("?!,;:",-1,c)))
2172 if (g_utf8_pointer_to_offset(aline,s)>2 &&
2173 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
2175 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
2181 if (pswit[ECHO_SWITCH])
2182 g_print("\n%s\n",aline);
2183 if (!pswit[OVERVIEW_SWITCH])
2184 g_print(" Line %ld column %ld - Missing space?\n",
2185 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2190 if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
2193 * If there are spaces on both sides,
2194 * or space before and end of line.
2198 if (g_utf8_pointer_to_offset(aline,s)>2 &&
2199 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
2201 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
2205 if (!isemptyline && !isellipsis)
2207 if (pswit[ECHO_SWITCH])
2208 g_print("\n%s\n",aline);
2209 if (!pswit[OVERVIEW_SWITCH])
2210 g_print(" Line %ld column %ld - "
2211 "Spaced punctuation?\n",linecnt,
2212 g_utf8_pointer_to_offset(aline,s)+1);
2219 /* Split out the characters that CANNOT be preceded by space. */
2220 c=g_utf8_get_char(aline);
2221 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2222 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2226 nc=g_utf8_get_char(g_utf8_next_char(s));
2227 /* for each character in the line after the first */
2228 if (g_utf8_strchr("?!,;:",-1,c))
2230 /* if it's punctuation that _cannot_ have a space before it */
2231 if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
2234 * If nc DOES == space,
2235 * it was already reported just above.
2237 if (pswit[ECHO_SWITCH])
2238 g_print("\n%s\n",aline);
2239 if (!pswit[OVERVIEW_SWITCH])
2240 g_print(" Line %ld column %ld - Spaced punctuation?\n",
2241 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2248 * Special case " .X" where X is any alpha.
2249 * This plugs a hole in the acronym code above.
2250 * Inelegant, but maintainable.
2252 c=g_utf8_get_char(aline);
2253 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2254 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2258 nc=g_utf8_get_char(g_utf8_next_char(s));
2259 /* for each character in the line after the first */
2262 /* if it's a period */
2263 if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
2266 * If the period follows a space and
2267 * is followed by a letter.
2269 if (pswit[ECHO_SWITCH])
2270 g_print("\n%s\n",aline);
2271 if (!pswit[OVERVIEW_SWITCH])
2272 g_print(" Line %ld column %ld - Spaced punctuation?\n",
2273 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2279 c=g_utf8_get_char(aline);
2280 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2281 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2285 nc=g_utf8_get_char(g_utf8_next_char(s));
2286 /* for each character in the line after the first */
2287 if (CHAR_IS_DQUOTE(c))
2289 if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
2290 !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
2291 !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
2293 if (pswit[ECHO_SWITCH])
2294 g_print("\n%s\n",aline);
2295 if (!pswit[OVERVIEW_SWITCH])
2296 g_print(" Line %ld column %ld - Unspaced quotes?\n",
2297 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2303 /* Check parity of quotes. */
2304 nc=g_utf8_get_char(aline);
2305 for (s=aline;*s;s=g_utf8_next_char(s))
2308 nc=g_utf8_get_char(g_utf8_next_char(s));
2309 if (CHAR_IS_DQUOTE(c))
2313 parities->dquote=!parities->dquote;
2314 parity=parities->dquote;
2316 else if (c==CHAR_LD_QUOTE)
2323 if (!g_utf8_strchr("_-.'`‘’/,;:!?)]} ",-1,nc))
2325 if (pswit[ECHO_SWITCH])
2326 g_print("\n%s\n",aline);
2327 if (!pswit[OVERVIEW_SWITCH])
2328 g_print(" Line %ld column %ld - "
2329 "Wrongspaced quotes?\n",
2330 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2338 if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
2339 !g_utf8_strchr("_-/.'`‘’([{$",-1,nc) || !nc)
2341 if (pswit[ECHO_SWITCH])
2342 g_print("\n%s\n",aline);
2343 if (!pswit[OVERVIEW_SWITCH])
2344 g_print(" Line %ld column %ld - "
2345 "Wrongspaced quotes?\n",
2346 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2353 c=g_utf8_get_char(aline);
2354 if (CHAR_IS_DQUOTE(c))
2356 if (g_utf8_strchr(",;:!?)]} ",-1,
2357 g_utf8_get_char(g_utf8_next_char(aline))))
2359 if (pswit[ECHO_SWITCH])
2360 g_print("\n%s\n",aline);
2361 if (!pswit[OVERVIEW_SWITCH])
2362 g_print(" Line %ld column 1 - Wrongspaced quotes?\n",
2368 if (pswit[SQUOTE_SWITCH])
2370 nc=g_utf8_get_char(aline);
2371 for (s=aline;*s;s=g_utf8_next_char(s))
2374 nc=g_utf8_get_char(g_utf8_next_char(s));
2375 if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&
2376 !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
2377 !g_unichar_isalpha(nc)))
2379 parities->squote=!parities->squote;
2380 if (!parities->squote)
2383 if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
2385 if (pswit[ECHO_SWITCH])
2386 g_print("\n%s\n",aline);
2387 if (!pswit[OVERVIEW_SWITCH])
2388 g_print(" Line %ld column %ld - "
2389 "Wrongspaced singlequotes?\n",
2390 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2398 if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
2399 !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
2401 if (pswit[ECHO_SWITCH])
2402 g_print("\n%s\n",aline);
2403 if (!pswit[OVERVIEW_SWITCH])
2404 g_print(" Line %ld column %ld - "
2405 "Wrongspaced singlequotes?\n",
2406 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2417 * check_for_double_punctuation:
2419 * Look for double punctuation like ,. or ,,
2420 * Thanks to DW for the suggestion!
2421 * In books with references, ".," and ".;" are common
2422 * e.g. "etc., etc.," and vol. 1.; vol 3.;
2423 * OTOH, from my initial tests, there are also fairly
2424 * common errors. What to do? Make these cases paranoid?
2425 * ".," is the most common, so warnings->dotcomma is used
2426 * to suppress detailed reporting if it occurs often.
/*
 * Reviewer notes (some lines are not visible in this listing):
 *  - For every pair of adjacent characters that are both in ".?!,;:" a
 *    "Double punctuation?" query is printed, except when:
 *      * both are the same and are '.', '?' or '!';
 *      * the pair is ".," and warnings->dotcomma is not set;
 *      * warnings->isFrench is set and the text at s starts with an
 *        ellipsis joined to one of , ; : ! ? (in either order).
 *  - The ten isFrench prefix tests are written out twice: once in the
 *    exemption condition and again inside it, where nc is re-read.
 *    Presumably s is advanced past the ellipsis in a line not shown here.
 * NOTE(review): if s is advanced by a fixed amount in that hidden line,
 * confirm the following g_utf8_next_char() calls cannot step past the
 * terminating NUL when the ellipsis ends the line.
 */
2428 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
2432 nc=g_utf8_get_char(aline);
2433 for (s=aline;*s;s=g_utf8_next_char(s))
2436 nc=g_utf8_get_char(g_utf8_next_char(s));
2437 /* for each punctuation character in the line */
2438 if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
2439 g_utf8_strchr(".?!,;:",-1,nc))
2441 /* followed by punctuation, it's a query, unless . . . */
2442 if (c==nc && (c=='.' || c=='?' || c=='!') ||
2443 !warnings->dotcomma && c=='.' && nc==',' ||
2444 warnings->isFrench && g_str_has_prefix(s,",...") ||
2445 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2446 warnings->isFrench && g_str_has_prefix(s,";...") ||
2447 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2448 warnings->isFrench && g_str_has_prefix(s,":...") ||
2449 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2450 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2451 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2452 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2453 warnings->isFrench && g_str_has_prefix(s,"...?"))
2455 if (warnings->isFrench && g_str_has_prefix(s,",...") ||
2456 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2457 warnings->isFrench && g_str_has_prefix(s,";...") ||
2458 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2459 warnings->isFrench && g_str_has_prefix(s,":...") ||
2460 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2461 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2462 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2463 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2464 warnings->isFrench && g_str_has_prefix(s,"...?"))
2467 nc=g_utf8_get_char(g_utf8_next_char(s));
2469 ; /* do nothing for .. !! and ?? which can be legit */
2473 if (pswit[ECHO_SWITCH])
2474 g_print("\n%s\n",aline);
2475 if (!pswit[OVERVIEW_SWITCH])
2476 g_print(" Line %ld column %ld - Double punctuation?\n",
2477 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2486 * check_for_spaced_quotes:
/*
 * Queries every quote mark that has a space on both sides.
 *  - First pass: each occurrence of space, ASCII double quote, space is
 *    reported as "Spaced doublequote?"; the search resumes two characters
 *    after the match, i.e. at the trailing space.
 *  - Second pass: for each code point in single_quotes[] a pattern of
 *    space, that quote, space is built in a GString and every occurrence
 *    is reported as "Spaced singlequote?".
 * The GString is freed before returning.  The initialisation of s before
 * each search loop is not visible in this listing.
 */
2488 void check_for_spaced_quotes(const char *aline)
2492 const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,
2496 while ((t=strstr(s," \" ")))
2498 if (pswit[ECHO_SWITCH])
2499 g_print("\n%s\n",aline);
2500 if (!pswit[OVERVIEW_SWITCH])
2501 g_print(" Line %ld column %ld - Spaced doublequote?\n",
2502 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2505 s=g_utf8_next_char(g_utf8_next_char(t));
2507 pattern=g_string_new(NULL);
2508 for(i=0;i<G_N_ELEMENTS(single_quotes);i++)
2510 g_string_assign(pattern," ");
2511 g_string_append_unichar(pattern,single_quotes[i]);
2512 g_string_append_c(pattern,' ');
2514 while ((t=strstr(s,pattern->str)))
2516 if (pswit[ECHO_SWITCH])
2517 g_print("\n%s\n",aline);
2518 if (!pswit[OVERVIEW_SWITCH])
2519 g_print(" Line %ld column %ld - Spaced singlequote?\n",
2520 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2523 s=g_utf8_next_char(g_utf8_next_char(t));
2526 g_string_free(pattern,TRUE);
2530 * check_for_miscased_genative:
2532 * Check special case of 'S instead of 's at end of word.
/*
 * Scans aline from its second character and queries an apostrophe that
 * follows a lowercase letter and is followed by a capital 'S'.
 * s points at the apostrophe, so the +2 below reports the column of the
 * 'S' itself.
 */
2534 void check_for_miscased_genative(const char *aline)
2540 c=g_utf8_get_char(aline);
2541 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2542 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2546 nc=g_utf8_get_char(g_utf8_next_char(s));
2547 if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))
2549 if (pswit[ECHO_SWITCH])
2550 g_print("\n%s\n",aline);
2551 if (!pswit[OVERVIEW_SWITCH])
2552 g_print(" Line %ld column %ld - Capital \"S\"?\n",
2553 linecnt,g_utf8_pointer_to_offset(aline,s)+2);
2561 * check_end_of_line:
2563 * Now check special cases - start and end of line -
2564 * for single and double quotes. Start is sometimes [sic]
2565 * but better to query it anyway.
2566 * While we're here, check for dash at end of line.
/*
 * Reviewer notes (some lines are not visible in this listing):
 *  - Only lines longer than one character are examined.
 *  - A single or double quote as the last character, preceded by a
 *    space, is queried at the last column ("Spaced quote?").
 *  - A single quote as the first character, followed by a space, is
 *    queried at column 1.
 *  - With PARANOID_SWITCH and warnings->hyphen, trailing characters at or
 *    below CHAR_SPACE are skipped backwards and a lone '-' (not preceded
 *    by another '-') is queried as "Hyphen at end of line?".
 * NOTE(review): the hyphen column is printed as the raw 0-based offset
 * with no +1, unlike most checks in this file - confirm intended.
 */
2568 void check_end_of_line(const char *aline,struct warnings *warnings)
2573 lbytes=strlen(aline);
2574 if (g_utf8_strlen(aline,lbytes)>1)
2576 s=g_utf8_prev_char(aline+lbytes);
2577 c1=g_utf8_get_char(s);
2578 c2=g_utf8_get_char(g_utf8_prev_char(s));
2579 if ((CHAR_IS_DQUOTE(c1) || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)
2581 if (pswit[ECHO_SWITCH])
2582 g_print("\n%s\n",aline);
2583 if (!pswit[OVERVIEW_SWITCH])
2584 g_print(" Line %ld column %ld - Spaced quote?\n",linecnt,
2585 g_utf8_strlen(aline,lbytes));
2589 c1=g_utf8_get_char(aline);
2590 c2=g_utf8_get_char(g_utf8_next_char(aline));
2591 if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)
2593 if (pswit[ECHO_SWITCH])
2594 g_print("\n%s\n",aline);
2595 if (!pswit[OVERVIEW_SWITCH])
2596 g_print(" Line %ld column 1 - Spaced quote?\n",linecnt);
2601 * Dash at end of line may well be legit - paranoid mode only
2602 * and don't report em-dash at line-end.
2604 if (pswit[PARANOID_SWITCH] && warnings->hyphen)
2606 for (s=g_utf8_prev_char(aline+lbytes);
2607 s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
2609 if (g_utf8_get_char(s)=='-' &&
2610 g_utf8_get_char(g_utf8_prev_char(s))!='-')
2612 if (pswit[ECHO_SWITCH])
2613 g_print("\n%s\n",aline);
2614 if (!pswit[OVERVIEW_SWITCH])
2615 g_print(" Line %ld column %ld - "
2616 "Hyphen at end of line?\n",
2617 linecnt,g_utf8_pointer_to_offset(aline,s));
2624 * check_for_unspaced_bracket:
2626 * Brackets are often unspaced, but shouldn't be surrounded by alpha.
2627 * If so, suspect a scanno like "a]most".
/*
 * Queries a bracket character that has a letter on both sides.
 *
 * aline: NUL-terminated UTF-8 text of the line being checked.
 *
 * The scan starts at the second character and stops once nc, the
 * character after s, is the terminator.  A query is raised when c is one
 * of the six characters in the bracket string below and both pc and nc
 * are alphabetic; the column printed is the character offset of s.
 * NOTE(review): pc and c are presumably the previous and current
 * characters of the scan - confirm where they are advanced.
 */
2629 void check_for_unspaced_bracket(const char *aline)
2633 c=g_utf8_get_char(aline);
/* nc stays 0 for an empty line, so the loop below is then never entered. */
2634 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2635 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2639 nc=g_utf8_get_char(g_utf8_next_char(s));
2642 /* for each bracket character in the line except 1st & last */
2643 if (g_utf8_strchr("{[()]}",-1,c) &&
2644 g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
/* Echo the line if asked to; print the query unless in overview mode. */
2646 if (pswit[ECHO_SWITCH])
2647 g_print("\n%s\n",aline);
2648 if (!pswit[OVERVIEW_SWITCH])
2649 g_print(" Line %ld column %ld - Unspaced bracket?\n",
2650 linecnt,g_utf8_pointer_to_offset(aline,s));
2658 * check_for_unpunctuated_endquote:
/*
 * Queries a closing or neutral double quote that comes straight after a
 * letter, i.e. with no punctuation in front of the quote.
 *
 * aline: NUL-terminated UTF-8 text of the line being checked.
 *
 * The caller in procfile only invokes this when warnings->endquote is set.
 * The column printed is the character offset of s.
 * NOTE(review): pc and c are presumably the previous and current
 * characters of the scan - confirm where they are advanced.
 */
2660 void check_for_unpunctuated_endquote(const char *aline)
2665 c=g_utf8_get_char(aline);
/* nc stays 0 for an empty line, so the loop below is then never entered. */
2666 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2667 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
/* Only double quotes are classified; anything else is INVALID_QUOTE. */
2671 qc=CHAR_IS_DQUOTE(c)?QUOTE_CLASS(c):INVALID_QUOTE;
2672 nc=g_utf8_get_char(g_utf8_next_char(s));
2673 /* for each character in the line except 1st */
2674 if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) && g_unichar_isalpha(pc))
2676 if (pswit[ECHO_SWITCH])
2677 g_print("\n%s\n",aline);
2678 if (!pswit[OVERVIEW_SWITCH])
2679 g_print(" Line %ld column %ld - "
2680 "endquote missing punctuation?\n",
2681 linecnt,g_utf8_pointer_to_offset(aline,s));
2689 * check_for_html_tag:
2691 * Check for <HTML TAG>.
2693 * If there is a < in the line, followed at some point
2694 * by a > then we suspect HTML.
/*
 * Queries what looks like an HTML tag on the line.
 *
 * aline: NUL-terminated UTF-8 text of the line being checked.
 *
 * Only the first less-than sign on the line is considered, paired with
 * the first greater-than sign after it.  The text from one to the other,
 * both included, is copied with g_strndup() and shown in the query; the
 * column printed is the one-based character offset of the less-than sign.
 */
2696 void check_for_html_tag(const char *aline)
2698 const char *open,*close;
2700 open=strchr(aline,'<');
/* Search for the closing bracket from the character after the opener. */
2703 close=strchr(g_utf8_next_char(open),'>');
2706 if (pswit[ECHO_SWITCH])
2707 g_print("\n%s\n",aline);
2708 if (!pswit[OVERVIEW_SWITCH])
2710 tag=g_strndup(open,close-open+1);
2711 g_print(" Line %ld column %ld - HTML Tag? %s \n",
2712 linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
2722 * check_for_html_entity:
2724 * Check for &symbol; HTML.
2726 * If there is a & in the line, followed at
2727 * some point by a ; then we suspect HTML.
/*
 * Queries what looks like an HTML entity on the line.
 *
 * aline: NUL-terminated UTF-8 text of the line being checked.
 *
 * Only the first ampersand on the line is considered, paired with the
 * first semicolon after it.  The span between them is scanned for a
 * space, which marks ordinary prose rather than an entity.  The text from
 * the ampersand to the semicolon, both included, is copied with
 * g_strndup() and shown in the query.
 *
 * NOTE(review): the column printed here is the byte difference amp-aline
 * plus 1, whereas check_for_html_tag uses g_utf8_pointer_to_offset().
 * The two differ when multi-byte characters precede the ampersand -
 * confirm which is intended.
 */
2729 void check_for_html_entity(const char *aline)
2731 const char *s,*amp,*scolon;
2733 amp=strchr(aline,'&');
2736 scolon=strchr(amp,';');
2739 for (s=amp;s<scolon;s=g_utf8_next_char(s))
2740 if (g_utf8_get_char(s)==CHAR_SPACE)
2741 break; /* Don't report "Jones & Son;" */
2744 if (pswit[ECHO_SWITCH])
2745 g_print("\n%s\n",aline);
2746 if (!pswit[OVERVIEW_SWITCH])
2748 entity=g_strndup(amp,scolon-amp+1);
2749 g_print(" Line %ld column %d - HTML symbol? %s \n",
2750 linecnt,(int)(amp-aline)+1,entity);
2761 * check_for_omitted_punctuation:
2763 * Check for omitted punctuation at end of paragraph by working back
2764 * through prevline. DW.
2765 * Need to check this only for "normal" paras.
2766 * So what is a "normal" para?
2767 * Not normal if one-liner (chapter headings, etc.)
2768 * Not normal if it doesn't contain at least one lower-case letter
2769 * Not normal if starts with space
/*
 * Queries a paragraph whose last line ends in a letter rather than in
 * punctuation.
 *
 * prevline:        NUL-terminated UTF-8 text of the previous line, i.e.
 *                  the last line of the paragraph that has just ended.
 * last:            properties of the preceding line; only blen is read on
 *                  the lines below.
 * start_para_line: line number on which that paragraph started.
 *
 * The check only runs when prevline contains at least one letter,
 * last->blen is greater than 2, the paragraph started before linecnt-1
 * and prevline does not begin with a space or control character.
 * It then steps back from the end of prevline over closing or neutral
 * quotes and scans backwards: reaching a letter first raises the query,
 * which is reported against line linecnt-1 with the character length of
 * prevline as the column.  Characters from the punctuation string in the
 * last test are looked for in the same backward scan.
 * NOTE(review): presumably finding one of those punctuation characters
 * ends the scan without a query - confirm.
 */
2771 void check_for_omitted_punctuation(const char *prevline,
2772 struct line_properties *last,int start_para_line)
2774 gboolean letter_on_line=FALSE;
2777 gboolean closing_quote;
/* Look for any alphabetic character on the line. */
2778 for (s=prevline;*s;s=g_utf8_next_char(s))
2779 if (g_unichar_isalpha(g_utf8_get_char(s)))
2781 letter_on_line=TRUE;
2785 * This next "if" is a problem.
2786 * If we say "start_para_line <= linecnt - 1", that includes
2787 * one-line "paragraphs" like chapter heads. Lotsa false positives.
2788 * If we say "start_para_line < linecnt - 1" it doesn't, but then it
2789 * misses genuine one-line paragraphs.
2791 if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
2792 g_utf8_get_char(prevline)>CHAR_SPACE)
2794 s=prevline+strlen(prevline);
2797 s=g_utf8_prev_char(s);
2798 c=g_utf8_get_char(s);
2799 if (QUOTE_CLASS(c)==CLOSING_QUOTE || QUOTE_CLASS(c)==NEUTRAL_QUOTE)
2802 closing_quote=FALSE;
2803 } while (closing_quote && s>prevline);
2804 for (;s>prevline;s=g_utf8_prev_char(s))
2806 if (g_unichar_isalpha(g_utf8_get_char(s)))
2808 if (pswit[ECHO_SWITCH])
2809 g_print("\n%s\n",prevline);
2810 if (!pswit[OVERVIEW_SWITCH])
2811 g_print(" Line %ld column %ld - "
2812 "No punctuation at para end?\n",
2813 linecnt-1,g_utf8_strlen(prevline,-1));
2818 if (g_utf8_strchr("—-.:!([{?}])",-1,g_utf8_get_char(s)))
/*
 * Tree traversal callback: procfile passes this to g_tree_foreach() over
 * the qword tree to print a note about a queried word that occurred more
 * than once.
 *
 * key:   the tree key, used here as the queried word (a C string).
 * value: the tree value for that word.
 * data:  user data from g_tree_foreach(); procfile passes NULL.
 *
 * NOTE(review): value presumably carries the duplicate count printed by
 * the %d conversion - confirm how it is stored and read.
 */
2824 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
2826 const char *word=key;
2829 g_print("\nNote: Queried word %s was duplicated %d times\n",
/*
 * Writes a UTF-8 string converted to the WINDOWS-1252 character set.
 *
 * string: NUL-terminated UTF-8 text to write.
 *
 * The iconv converter is kept in a function-level static, opened on first
 * use and reused by later calls.  The output buffer is sized from the
 * byte length of the input plus one.
 * NOTE(review): procfile calls this with NULL at the end of a run,
 * presumably to close the cached converter - confirm.
 * NOTE(review): the plain fputs() at the end is presumably the fallback
 * used when no converter could be opened - confirm.
 */
2834 void print_as_windows_1252(const char *string)
2836 gsize inbytes,outbytes;
2838 static GIConv converter=(GIConv)-1;
/* Release an open converter and mark it as closed again. */
2841 if (converter!=(GIConv)-1)
2842 g_iconv_close(converter);
2843 converter=(GIConv)-1;
/* Open the converter lazily; (GIConv)-1 means it is not open. */
2846 if (converter==(GIConv)-1)
2847 converter=g_iconv_open("WINDOWS-1252","UTF-8");
2848 if (converter!=(GIConv)-1)
2850 inbytes=outbytes=strlen(string);
2851 bp=buf=g_malloc(outbytes+1);
2852 g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
2858 fputs(string,stdout);
/*
 * Writes string to stdout unchanged, with no character set conversion.
 *
 * string: NUL-terminated text to write.
 */
2861 void print_as_utf_8(const char *string)
2863 fputs(string,stdout);
/*
 * Runs every check over one etext file and prints the resulting queries.
 *
 * filename: path handed to read_etext(); also shown in the output.
 *
 * Outline:
 *  - read the file with read_etext(); a failure message goes to stdout or
 *    stderr, with pswit[STDOUT_SWITCH] consulted to choose;
 *  - run first_pass() and report_first_pass() to obtain the warnings
 *    settings that enable or disable individual checks;
 *  - create the qword and qperiod trees, both keyed with strcmp;
 *  - fetch lines one at a time with flgets();
 *  - skip lines before first_pass_results->firstline or after a positive
 *    footerline; with pswit[HEADER_SWITCH] the Title, Author, Release
 *    Date and Edition lines among them are printed;
 *  - for the remaining lines run the per-line check_for_*() routines and
 *    track paragraph starts, broken em-dashes and the previous line;
 *  - run the mismatched-quote and omitted-punctuation checks as the loop
 *    goes, and the mismatched-quote check once more after the loop;
 *  - report duplicated queried words unless overview or verbose mode is
 *    on, release the trees and counters and reset the print handler.
 */
2871 void procfile(const char *filename)
2874 gchar *parastart=NULL; /* first line of current para */
2875 gchar *etext,*aline;
2878 struct first_pass_results *first_pass_results;
2879 struct warnings *warnings;
2880 struct counters counters={0};
2881 struct line_properties last={0};
2882 struct parities parities={0};
2883 struct pending pending={0};
2884 gboolean isemptyline;
2885 long start_para_line=0;
2886 gboolean isnewpara=FALSE,enddash=FALSE;
2887 last.start=CHAR_SPACE;
2888 linecnt=checked_linecnt=0;
2889 etext=read_etext(filename,&err);
2892 if (pswit[STDOUT_SWITCH])
2893 fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
2895 fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
2898 g_print("\n\nFile: %s\n\n",filename);
2899 first_pass_results=first_pass(etext);
2900 warnings=report_first_pass(first_pass_results);
/* qword frees both keys and values with g_free; qperiod frees keys only. */
2901 qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
2902 qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
2904 * Here we go with the main pass. Hold onto yer hat!
2908 while ((aline=flgets(&etext_ptr,linecnt+1,warnings->newlines)))
2913 if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
2914 continue; // skip DP page separators completely
2915 if (linecnt<first_pass_results->firstline ||
2916 (first_pass_results->footerline>0 &&
2917 linecnt>first_pass_results->footerline))
2919 if (pswit[HEADER_SWITCH])
2921 if (g_str_has_prefix(aline,"Title:"))
2922 g_print(" %s\n",aline);
2923 if (g_str_has_prefix(aline,"Author:"))
2924 g_print(" %s\n",aline);
2925 if (g_str_has_prefix(aline,"Release Date:"))
2926 g_print(" %s\n",aline);
2927 if (g_str_has_prefix(aline,"Edition:"))
2928 g_print(" %s\n\n",aline);
2930 continue; /* skip through the header */
2933 print_pending(aline,parastart,&pending);
2934 isemptyline=analyse_quotes(aline,&counters);
2935 if (isnewpara && !isemptyline)
2937 /* This line is the start of a new paragraph. */
2938 start_para_line=linecnt;
2939 /* Capture its first line in case we want to report it later. */
2941 parastart=g_strdup(aline);
2942 memset(&parities,0,sizeof(parities)); /* restart the quote count */
/* Skip forward to the first letter or digit on the line. */
2944 while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
2945 !g_unichar_isdigit(g_utf8_get_char(s)))
2946 s=g_utf8_next_char(s);
2947 if (g_unichar_islower(g_utf8_get_char(s)))
2949 /* and its first letter is lowercase */
2950 if (pswit[ECHO_SWITCH])
2951 g_print("\n%s\n",aline);
2952 if (!pswit[OVERVIEW_SWITCH])
2953 g_print(" Line %ld column %ld - "
2954 "Paragraph starts with lower-case\n",
2955 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2959 isnewpara=FALSE; /* Signal the end of new para processing. */
2961 /* Check for an em-dash broken at line end. */
2962 if (enddash && g_utf8_get_char(aline)=='-')
2964 if (pswit[ECHO_SWITCH])
2965 g_print("\n%s\n",aline);
2966 if (!pswit[OVERVIEW_SWITCH])
2967 g_print(" Line %ld column 1 - Broken em-dash?\n",linecnt);
/*
 * Walk back over trailing spaces to see whether this line ends in a
 * hyphen.
 * NOTE(review): s starts one character before the terminator and the
 * character at s is read before s>aline is tested; confirm this cannot
 * read in front of aline when the line is empty.
 */
2972 for (s=g_utf8_prev_char(aline+strlen(aline));
2973 g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
2975 if (s>=aline && g_utf8_get_char(s)=='-')
2977 check_for_control_characters(aline);
2979 check_for_odd_characters(aline,warnings,isemptyline);
2980 if (warnings->longline)
2981 check_for_long_line(aline);
2982 if (warnings->shortline)
2983 check_for_short_line(aline,&last);
/* Remember this line's length and first character for the next one. */
2985 last.len=g_utf8_strlen(aline,-1);
2986 last.start=g_utf8_get_char(aline);
2987 check_for_starting_punctuation(aline);
2990 check_for_spaced_emdash(aline);
2991 check_for_spaced_dash(aline);
2993 check_for_unmarked_paragraphs(aline);
2994 check_for_jeebies(aline);
2995 check_for_mta_from(aline);
2996 check_for_orphan_character(aline);
2997 check_for_pling_scanno(aline);
2998 check_for_extra_period(aline,warnings);
2999 check_for_following_punctuation(aline);
3000 check_for_typos(aline,warnings);
3001 check_for_misspaced_punctuation(aline,&parities,isemptyline);
3002 check_for_double_punctuation(aline,warnings);
3003 check_for_spaced_quotes(aline);
3004 check_for_miscased_genative(aline);
3005 check_end_of_line(aline,warnings);
3006 check_for_unspaced_bracket(aline);
3007 if (warnings->endquote)
3008 check_for_unpunctuated_endquote(aline);
3009 check_for_html_tag(aline);
3010 check_for_html_entity(aline);
3013 check_for_mismatched_quotes(&counters,&pending);
3014 counters_reset(&counters);
3015 /* let the next iteration know that it's starting a new para */
3018 check_for_omitted_punctuation(prevline,&last,start_para_line);
3021 prevline=g_strdup(aline);
/* After the last line: flush the quote checks for the final paragraph. */
3024 check_for_mismatched_quotes(&counters,&pending);
3025 print_pending(NULL,parastart,&pending);
3026 reset_pending(&pending);
3035 if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])
3036 g_tree_foreach(qword,report_duplicate_queries,NULL);
3037 g_tree_unref(qword);
3038 g_tree_unref(qperiod);
3039 counters_destroy(&counters);
3040 g_set_print_handler(NULL);
/* NOTE(review): presumably releases the cached converter - confirm. */
3041 print_as_windows_1252(NULL);
3042 if (pswit[MARKUP_SWITCH])
3049 * Get one line from the input text. The setting of newlines has the following
3052 * DOS_NEWLINES: Check for the existence of exactly one CR-LF line-end per line.
3054 * OS9_NEWLINES: Asserts that etext contains no LFs. CR is used as
3055 * the newline character.
3057 * UNIX_NEWLINES: Check for the presence of CRs.
3059 * In all cases, check that the last line is correctly terminated.
3061 * Returns: a pointer to the line.
/*
 * Fetches the next line from the text buffer and checks its line ending
 * against the expected newline convention.
 *
 * etext:    address of the read position in the buffer; it is advanced as
 *           characters are consumed.
 * lcnt:     line number to print in line-end queries.
 * newlines: one of DOS_NEWLINES, OS9_NEWLINES or UNIX_NEWLINES.
 *
 * theline remembers where the line began and isCR starts out FALSE.
 * Line-end queries are only printed when pswit[LINE_END_SWITCH] is set;
 * each one echoes a copy of the line text up to eos when
 * pswit[ECHO_SWITCH] is set and prints its message unless
 * pswit[OVERVIEW_SWITCH] is set.  The queries cover a last line with no
 * terminator, a LF with no CR before it under DOS_NEWLINES, an embedded
 * CR under UNIX_NEWLINES, two successive CRs, and a CR with no LF.
 * The finished line is then passed to postprocess_for_HTML() when
 * pswit[MARKUP_SWITCH] is set and to postprocess_for_DP() when
 * pswit[DP_SWITCH] is set.
 * NOTE(review): eos presumably marks the end of the line text and isCR
 * records that a CR has just been seen - confirm.
 */
3063 char *flgets(char **etext,long lcnt,int newlines)
3066 gboolean isCR=FALSE;
3067 char *theline=*etext;
3072 c=g_utf8_get_char(*etext);
3075 if (*etext==theline)
3077 else if (pswit[LINE_END_SWITCH])
3079 if (pswit[ECHO_SWITCH])
3081 s=g_strndup(theline,eos-theline);
3082 g_print("\n%s\n",s);
3085 if (!pswit[OVERVIEW_SWITCH])
3087 if (newlines==OS9_NEWLINES)
3088 g_print(" Line %ld - No CR?\n",lcnt);
3091 /* There may, or may not, have been a CR */
3092 g_print(" Line %ld - No LF?\n",lcnt);
3100 *etext=g_utf8_next_char(*etext);
3101 /* either way, it's end of line */
3104 if (newlines==DOS_NEWLINES && !isCR)
3106 /* Error - a LF without a preceding CR */
3107 if (pswit[LINE_END_SWITCH])
3109 if (pswit[ECHO_SWITCH])
3111 s=g_strndup(theline,eos-theline);
3112 g_print("\n%s\n",s);
3115 if (!pswit[OVERVIEW_SWITCH])
3116 g_print(" Line %ld - No CR?\n",lcnt);
3125 if (newlines==OS9_NEWLINES)
3127 if (isCR || newlines==UNIX_NEWLINES)
3129 if (pswit[LINE_END_SWITCH])
3131 if (pswit[ECHO_SWITCH])
3133 s=g_strndup(theline,eos-theline);
3134 g_print("\n%s\n",s);
3137 if (!pswit[OVERVIEW_SWITCH])
3139 if (newlines==UNIX_NEWLINES)
3140 g_print(" Line %ld column %ld - Embedded CR?\n",
3141 lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
3143 g_print(" Line %ld - Two successive CRs?\n",
3149 if (newlines==UNIX_NEWLINES)
3152 if (newlines==DOS_NEWLINES)
3157 if (pswit[LINE_END_SWITCH] && isCR)
3159 if (pswit[ECHO_SWITCH])
3161 s=g_strndup(theline,eos-theline);
3162 g_print("\n%s\n",s);
3165 if (!pswit[OVERVIEW_SWITCH])
3166 g_print(" Line %ld column %ld - CR without LF?\n",
3167 lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
3173 eos=g_utf8_next_char(eos);
3177 if (pswit[MARKUP_SWITCH])
3178 postprocess_for_HTML(theline);
3179 if (pswit[DP_SWITCH])
3180 postprocess_for_DP(theline);
3187 * Takes a "word" as a parameter, and checks whether it
3188 * contains a mixture of alpha and digits. Generally, this is an
3189 * error, but may not be for cases like 4th or L5 12s. 3d.
3191 * Returns: TRUE iff an error is found.
/*
 * Decides whether a word that mixes letters and digits should be queried.
 *
 * checkword: NUL-terminated UTF-8 word to test.
 *
 * The word is first scanned for at least one letter and at least one
 * digit.  Accepted forms are then excluded: after skipping the leading
 * digits, the remainder is compared against the ordinal endings st, rd,
 * nd, th, their plural forms with s and their adverb forms with ly, all
 * without regard to ASCII case, and against the single letters l (either
 * case), s and d (lower case only).  A word starting with a capital L is
 * accepted when the rest of it is not itself a mixed word, which is
 * tested by a recursive call.
 */
3193 gboolean mixdigit(const char *checkword)
3195 gboolean wehaveadigit,wehavealetter,query;
3196 const char *s,*nondigit;
3197 wehaveadigit=wehavealetter=query=FALSE;
3198 for (s=checkword;*s;s=g_utf8_next_char(s))
3199 if (g_unichar_isalpha(g_utf8_get_char(s)))
3201 else if (g_unichar_isdigit(g_utf8_get_char(s)))
3203 if (wehaveadigit && wehavealetter)
3205 /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
/* Advance nondigit past the leading run of digits. */
3207 for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
3208 nondigit=g_utf8_next_char(nondigit))
3210 /* digits, ending in st, rd, nd, th of either case */
3211 if (!g_ascii_strcasecmp(nondigit,"st") ||
3212 !g_ascii_strcasecmp(nondigit,"rd") ||
3213 !g_ascii_strcasecmp(nondigit,"nd") ||
3214 !g_ascii_strcasecmp(nondigit,"th"))
3216 if (!g_ascii_strcasecmp(nondigit,"sts") ||
3217 !g_ascii_strcasecmp(nondigit,"rds") ||
3218 !g_ascii_strcasecmp(nondigit,"nds") ||
3219 !g_ascii_strcasecmp(nondigit,"ths"))
3221 if (!g_ascii_strcasecmp(nondigit,"stly") ||
3222 !g_ascii_strcasecmp(nondigit,"rdly") ||
3223 !g_ascii_strcasecmp(nondigit,"ndly") ||
3224 !g_ascii_strcasecmp(nondigit,"thly"))
3226 /* digits, ending in l, L, s or d */
3227 if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
3228 !strcmp(nondigit,"d"))
3231 * L at the start of a number, representing Britsh pounds, like L500.
3232 * This is cute. We know the current word is mixed digit. If the first
3233 * letter is L, there must be at least one digit following. If both
3234 * digits and letters follow, we have a genuine error, else we have a
3235 * capital L followed by digits, and we accept that as a non-error.
3237 if (g_utf8_get_char(checkword)=='L' &&
3238 !mixdigit(g_utf8_next_char(checkword)))
3247 * Extracts the first/next "word" from the line, and returns it.
3248 * A word is defined as one English word unit--or at least that's the aim.
3249 * "ptr" is advanced to the position in the line where we will start
3250 * looking for the next word.
3252 * Returns: A newly-allocated string.
/*
 * Extracts the next word from a line and advances the read position.
 *
 * ptr: address of the read position in the line; on return it points to
 *      where the search for the following word should start.
 *
 * Returns a newly allocated string, detached from its GString with
 * g_string_free(word,FALSE), so the caller owns it.
 *
 * Three forms are recognised, tried in this order:
 *  - a footnote marker: an opening square bracket, digits, and a closing
 *    square bracket, returned with both brackets;
 *  - a punctuated number: a run of letters, digits, commas and periods
 *    in which a comma or period directly follows a digit;
 *  - otherwise a plain run of letters, digits and apostrophes.
 * Leading characters that are neither letters nor digits are skipped
 * first.
 */
3254 gchar *getaword(const char **ptr)
3259 word=g_string_new(NULL);
3260 for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
3261 !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
3262 **ptr;*ptr=g_utf8_next_char(*ptr))
3264 /* Handle exceptions for footnote markers like [1] */
3265 if (g_utf8_get_char(*ptr)=='[')
3267 g_string_append_c(word,'[');
3268 s=g_utf8_next_char(*ptr);
3269 for (;g_unichar_isdigit(g_utf8_get_char(s));s=g_utf8_next_char(s))
3270 g_string_append_unichar(word,g_utf8_get_char(s));
3271 if (g_utf8_get_char(s)==']')
3273 g_string_append_c(word,']');
3274 *ptr=g_utf8_next_char(s);
3275 return g_string_free(word,FALSE);
/* Not a footnote marker after all: discard what was collected. */
3278 g_string_truncate(word,0);
3282 * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
3283 * Especially yucky is the case of L1,000
3284 * This section looks for a pattern of characters including a digit
3285 * followed by a comma or period followed by one or more digits.
3286 * If found, it returns this whole pattern as a word; otherwise we discard
3287 * the results and resume our normal programming.
3290 for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
3291 g_unichar_isalpha(g_utf8_get_char(s)) ||
3292 g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
3293 g_string_append_unichar(word,g_utf8_get_char(s));
/* Scan the collected text from its second character for digit+separator. */
3296 for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))
3298 c=g_utf8_get_char(t);
3299 pc=g_utf8_get_char(g_utf8_prev_char(t));
3300 if ((c=='.' || c==',') && g_unichar_isdigit(pc))
3303 return g_string_free(word,FALSE);
3307 /* we didn't find a punctuated number - do the regular getword thing */
3308 g_string_truncate(word,0);
3309 c=g_utf8_get_char(*ptr);
3310 for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);
3311 *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))
3312 g_string_append_unichar(word,c);
3313 return g_string_free(word,FALSE);
3319 * Is this word a Roman Numeral?
3321 * It doesn't actually validate that the number is a valid Roman Numeral--for
3322 * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
3323 * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
3324 * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
3325 * expressions thereof, except when it came to taxes. Allow any number of M,
3326 * an optional D, an optional CM or CD, any number of optional Cs, an optional
3327 * XL or an optional XC, an optional IX or IV, an optional V and any number
/*
 * Tests whether a word has the shape of a Roman numeral.
 *
 * t: NUL-terminated word to test.
 *
 * The word is matched against numeral groups in this fixed order: any
 * run of m, then d, cm, cd, any run of c, then xl, xc, l, any run of x,
 * then ix, iv, v and finally any run of i.
 * NOTE(review): every comparison is against a lower-case letter, so
 * callers presumably pass a case-folded word - confirm.
 * NOTE(review): presumably t is advanced past each group that matches
 * and the result depends on whether the whole word was consumed -
 * confirm.
 */
3330 gboolean isroman(const char *t)
3336 while (g_utf8_get_char(t)=='m' && *t)
3338 if (g_utf8_get_char(t)=='d')
3340 if (g_str_has_prefix(t,"cm"))
3342 if (g_str_has_prefix(t,"cd"))
3344 while (g_utf8_get_char(t)=='c' && *t)
3346 if (g_str_has_prefix(t,"xl"))
3348 if (g_str_has_prefix(t,"xc"))
3350 if (g_utf8_get_char(t)=='l')
3352 while (g_utf8_get_char(t)=='x' && *t)
3354 if (g_str_has_prefix(t,"ix"))
3356 if (g_str_has_prefix(t,"iv"))
3358 if (g_utf8_get_char(t)=='v')
3360 while (g_utf8_get_char(t)=='i' && *t)
3366 * postprocess_for_DP:
3368 * Invoked with the -d switch from flgets().
3369 * It simply "removes" from the line a hard-coded set of common
3370 * DP-specific tags, so that the line passed to the main routine has
3371 * been pre-cleaned of DP markup.
/*
 * Removes every occurrence of each DPmarkup entry from the line, in
 * place.
 *
 * theline: writable NUL-terminated line buffer; it only ever gets
 *          shorter.
 *
 * The DPmarkup table is walked until an empty string is reached, which
 * acts as its end marker.
 */
3373 void postprocess_for_DP(char *theline)
3379 for (i=0;*DPmarkup[i];i++)
/* Repeat until this tag no longer occurs anywhere in the line. */
3380 while ((s=strstr(theline,DPmarkup[i])))
/* Close the gap: move the tail, terminator included, over the tag. */
3382 t=s+strlen(DPmarkup[i]);
3383 memmove(s,t,strlen(t)+1);
3388 * postprocess_for_HTML:
3390 * Invoked with the -m switch from flgets().
3391 * It simply "removes" from the line a hard-coded set of common
3392 * HTML tags and "replaces" a hard-coded set of common HTML
3393 * entities, so that the line passed to the main routine has
3394 * been pre-cleaned of HTML.
/*
 * Cleans HTML out of the line, in place: losemarkup() is called
 * repeatedly for as long as it returns a non-NULL result, then
 * loseentities() is called once.
 *
 * theline: writable NUL-terminated line buffer.
 */
3396 void postprocess_for_HTML(char *theline)
3398 while (losemarkup(theline))
3400 loseentities(theline);
/*
 * Removes one recognised HTML tag from the line, in place.
 *
 * theline: writable NUL-terminated line buffer.
 *
 * The first less-than sign and the first greater-than sign after it
 * delimit the candidate tag.  The text after the less-than sign is
 * compared with each markup entry by tagcomp(); the markup table is
 * walked until an empty string is reached.  On a match the tail of the
 * line, terminator included, is moved down over the tag.
 * NOTE(review): postprocess_for_HTML loops while this returns non-NULL,
 * so it presumably returns NULL once no further tag can be removed -
 * confirm the exact return values.
 */
3403 char *losemarkup(char *theline)
3407 s=strchr(theline,'<');
/* Only look for the closing bracket when an opening one was found. */
3408 t=s?strchr(s,'>'):NULL;
3411 for (i=0;*markup[i];i++)
3412 if (tagcomp(g_utf8_next_char(s),markup[i]))
/* Step past the closing bracket, then close the gap. */
3414 t=g_utf8_next_char(t);
3415 memmove(s,t,strlen(t)+1);
3418 /* It's an unrecognized <xxx>. */
/*
 * Replaces HTML entities in the line with the characters they stand for,
 * in place.
 *
 * theline: writable NUL-terminated line buffer.
 *
 * Each ampersand is paired with the next semicolon.  A numeric reference
 * is parsed with strtol(), in base 10, or in base 16 when an x follows;
 * anything else is looked up by name in a tree built from the
 * HTMLentities table.  A code point below 128 or from 192 to 255 is
 * written back directly as UTF-8.  Any other is converted to
 * ISO_8859-1 with transliteration and back to UTF-8 before being copied
 * in.  The text after the semicolon is then moved down to follow the
 * replacement.
 *
 * The two iconv converters live in function-level statics and are opened
 * on first use.
 * NOTE(review): entities is an automatic variable set to NULL on every
 * call, unlike the two static converters; verify that the tree is not
 * rebuilt and leaked on each call, and that the g_tree_destroy() below
 * can ever see a non-NULL tree.
 * NOTE(review): amp+2 presumably skips the ampersand and a hash sign -
 * confirm the hash is tested first.
 */
3422 void loseentities(char *theline)
3429 GTree *entities=NULL;
3430 static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
3434 g_tree_destroy(entities);
/* Close whichever converters are open; (GIConv)-1 means not open. */
3436 if (translit!=(GIConv)-1)
3437 g_iconv_close(translit);
3438 translit=(GIConv)-1;
3439 if (to_utf8!=(GIConv)-1)
3440 g_iconv_close(to_utf8);
/* Build the name to code point lookup from the HTMLentities table. */
3448 entities=g_tree_new((GCompareFunc)strcmp);
3449 for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
3450 g_tree_insert(entities,HTMLentities[i].name,
3451 GUINT_TO_POINTER(HTMLentities[i].c));
3453 if (translit==(GIConv)-1)
3454 translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
3455 if (to_utf8==(GIConv)-1)
3456 to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
3457 while((amp=strchr(theline,'&')))
3459 scolon=strchr(amp,';');
/* Accept a numeric form only if its digits run right up to the semicolon. */
3464 if (amp+2+strspn(amp+2,"0123456789")==scolon)
3465 c=strtol(amp+2,NULL,10);
3466 else if (amp[2]=='x' &&
3467 amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
3468 c=strtol(amp+3,NULL,16);
/* Named entity: look up the text between ampersand and semicolon. */
3472 s=g_strndup(amp+1,scolon-(amp+1));
3473 c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
/*
 * NOTE(review): && binds tighter than ||, so this test reads as
 * c<128 || (c>=192 && c<=255); values 128 to 191 take the other path.
 */
3482 if (c<128 || c>=192 && c<=255) /* An ISO-8859-1 character */
3483 theline+=g_unichar_to_utf8(c,theline);
3487 nb=g_unichar_to_utf8(c,s);
3488 t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
3490 s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
3492 memcpy(theline,s,nb);
3496 memmove(theline,g_utf8_next_char(scolon),
3497 strlen(g_utf8_next_char(scolon))+1);
3500 theline=g_utf8_next_char(amp);
/*
 * Tests whether the text starts with the given tag name, ignoring case
 * and one leading slash.
 *
 * strin:   text following the opening bracket of a candidate tag.
 * basetag: tag name to compare against.
 *
 * Both strings are case-folded with g_utf8_casefold() and the result is
 * whether the folded text has the folded tag name as a prefix.  This is
 * a prefix test only, so a longer tag name that begins with basetag also
 * matches.
 */
3504 gboolean tagcomp(const char *strin,const char *basetag)
3508 if (g_utf8_get_char(strin)=='/')
3509 t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
3511 t=g_utf8_casefold(strin,-1);
3512 s=g_utf8_casefold(basetag,-1);
3513 retval=g_str_has_prefix(t,s);
3519 void proghelp(GOptionContext *context)
3522 fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
3523 fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
3524 fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
3525 fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
3526 "For details, read the file COPYING.\n",stderr);
3527 fputs("This is Free Software; "
3528 "you may redistribute it under certain conditions (GPL);\n",stderr);
3529 fputs("read the file COPYING for details.\n\n",stderr);
3530 help=g_option_context_get_help(context,TRUE,NULL);
3533 fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
3534 fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
3535 "non-ASCII\n",stderr);
3536 fputs("characters like accented letters, "
3537 "lines longer than 75 or shorter than 55,\n",stderr);
3538 fputs("unbalanced quotes or brackets, "
3539 "a variety of badly formatted punctuation, \n",stderr);
3540 fputs("HTML tags, some likely typos. "
3541 "It is NOT a substitute for human judgement.\n",stderr);