1 /*************************************************************************/
2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com> */
5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk> */
7 /* This program is free software; you can redistribute it and/or modify */
8 /* it under the terms of the GNU General Public License as published by */
9 /* the Free Software Foundation; either version 2 of the License, or */
10 /* (at your option) any later version. */
12 /* This program is distributed in the hope that it will be useful, */
13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
15 /* GNU General Public License for more details. */
17 /* You should have received a copy of the GNU General Public License */
18 /* along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 /*************************************************************************/
30 #include "bookloupe.h"
33 #include "HTMLentities.h"
39 "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
40 "nad", "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa",
41 "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
42 "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
43 "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
44 "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
45 "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
46 "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
47 "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
48 "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
49 "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
50 "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
51 "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
52 "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
53 "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
54 "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
55 "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
56 "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
57 "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
58 "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
59 "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
60 "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
61 "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
62 "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
63 "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
64 "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
65 "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
66 "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
67 "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
73 /* Common abbreviations and other OK words not to query as typos. */
75 "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
76 "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
77 "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
78 "outbid", "outbids", "frostbite", "frostbitten", ""
81 /* Common abbreviations that cause otherwise unexplained periods. */
83 "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
84 "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
88 * Two-Letter combinations that rarely if ever start words,
89 * but are common scannos or otherwise common letter combinations.
92 "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
96 * Two-Letter combinations that rarely if ever end words,
97 * but are common scannos or otherwise common letter combinations.
100 "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
101 "sw", "gr", "sl", "cl", "iy", ""
105 "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
106 "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
107 "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
108 "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
112 "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
116 "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
117 "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
118 "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
119 "during", "let", "toward", "among", ""
123 "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
124 "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
125 "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
126 "among", "those", "into", "whom", "having", "thence", ""
129 gboolean pswit[SWITNO]; /* program switches */
131 gboolean typo_compat,paranoid_compat;
133 static GOptionEntry options[]={
134 { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
135 "Ignore DP-specific markup", NULL },
136 { "no-dp", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
137 G_OPTION_ARG_NONE, pswit+DP_SWITCH,
138 "Don't ignore DP-specific markup", NULL },
139 { "echo", 0, G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
140 "Echo queried line", NULL },
141 { "no-echo", 'e', G_OPTION_FLAG_REVERSE,
142 G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
143 "Don't echo queried line", NULL },
144 { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
145 "Check single quotes", NULL },
146 { "no-squote", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
147 G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
148 "Don't check single quotes", NULL },
149 { "typo", 0, 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
150 "Check common typos", NULL },
151 { "no-typo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
152 G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
153 "Don't check common typos", NULL },
154 { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
155 "Require closure of quotes on every paragraph", NULL },
156 { "no-qpara", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
157 G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
158 "Don't require closure of quotes on every paragraph", NULL },
159 { "paranoid", 0, G_OPTION_FLAG_HIDDEN,
160 G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
161 "Enable paranoid querying of everything", NULL },
162 { "no-paranoid", 0, G_OPTION_FLAG_REVERSE,
163 G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
164 "Disable paranoid querying of everything", NULL },
165 { "line-end", 0, G_OPTION_FLAG_HIDDEN,
166 G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
167 "Enable line end checking", NULL },
168 { "no-line-end", 'l', G_OPTION_FLAG_REVERSE,
169 G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
170 "Disable line end checking", NULL },
171 { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
172 "Overview: just show counts", NULL },
173 { "no-overview", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
174 G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
175 "Show individual warnings", NULL },
176 { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
177 "Output errors to stdout instead of stderr", NULL },
178 { "no-stdout", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
179 G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
180 "Output errors to stderr instead of stdout", NULL },
181 { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
182 "Echo header fields", NULL },
183 { "no-header", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
184 G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
185 "Don't echo header fields", NULL },
186 { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
187 "Ignore markup in < >", NULL },
188 { "no-markup", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
189 G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
190 "No special handling for markup in < >", NULL },
191 { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
192 "Use file of user-defined typos", NULL },
193 { "no-usertypo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
194 G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
195 "Ignore file of user-defined typos", NULL },
196 { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
197 "Verbose - list everything", NULL },
198 { "no-verbose", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
199 G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
200 "Switch off verbose mode", NULL },
205 * Options relating to configuration which make no sense from inside
206 * a configuration file.
209 static GOptionEntry config_options[]={
210 { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
211 "Defaults for use on www upload", NULL },
212 { "dump-config", 0, 0, G_OPTION_ARG_NONE, pswit+DUMP_CONFIG_SWITCH,
213 "Dump current config settings", NULL },
217 static GOptionEntry compatibility_options[]={
218 { "toggle-typo", 't', 0, G_OPTION_ARG_NONE, &typo_compat,
219 "Toggle checking for common typos", NULL },
220 { "toggle-relaxed", 'x', 0, G_OPTION_ARG_NONE, &paranoid_compat,
221 "Toggle both paranoid mode and common typos", NULL },
225 long cnt_quote; /* for overview mode, count of quote queries */
226 long cnt_brack; /* for overview mode, count of brackets queries */
227 long cnt_bin; /* for overview mode, count of non-ASCII queries */
228 long cnt_odd; /* for overview mode, count of odd character queries */
229 long cnt_long; /* for overview mode, count of long line errors */
230 long cnt_short; /* for overview mode, count of short line queries */
231 long cnt_punct; /* for overview mode,
232 count of punctuation and spacing queries */
233 long cnt_dash; /* for overview mode, count of dash-related queries */
234 long cnt_word; /* for overview mode, count of word queries */
235 long cnt_html; /* for overview mode, count of html queries */
236 long cnt_lineend; /* for overview mode, count of line-end queries */
237 long cnt_spacend; /* count of lines with space at end */
238 long linecnt; /* count of total lines in the file */
239 long checked_linecnt; /* count of lines actually checked */
241 void proghelp(GOptionContext *context);
242 void procfile(const char *);
246 gboolean mixdigit(const char *);
247 gchar *getaword(const char **);
248 char *flgets(char **,long,gboolean);
249 void postprocess_for_HTML(char *);
250 char *linehasmarkup(char *);
251 char *losemarkup(char *);
252 gboolean tagcomp(const char *,const char *);
253 void loseentities(char *);
254 gboolean isroman(const char *);
255 void postprocess_for_DP(char *);
256 void print_as_windows_1252(const char *string);
257 void print_as_utf_8(const char *string);
259 GTree *qword,*qperiod;
267 void config_file_update(GKeyFile *kf)
271 for(i=0;options[i].long_name;i++)
273 if (g_str_has_prefix(options[i].long_name,"no-"))
275 if (options[i].arg==G_OPTION_ARG_NONE)
277 sw=*(gboolean *)options[i].arg_data;
278 if (options[i].flags&G_OPTION_FLAG_REVERSE)
280 g_key_file_set_boolean(kf,"options",options[i].long_name,sw);
283 g_assert_not_reached();
287 void config_file_add_comments(GKeyFile *kf)
291 g_key_file_set_comment(kf,NULL,NULL," Default configuration for bookloupe",
293 for(i=0;options[i].long_name;i++)
295 if (g_str_has_prefix(options[i].long_name,"no-"))
297 comment=g_strconcat(" ",options[i].description,NULL);
298 g_key_file_set_comment(kf,"options",options[i].long_name,comment,NULL);
303 void dump_config(void)
307 config_file_update(config);
310 config=g_key_file_new();
311 config_file_update(config);
312 config_file_add_comments(config);
314 s=g_key_file_to_data(config,NULL,NULL);
320 GKeyFile *read_config_file(gchar **full_path)
326 const char *search_path;
329 search_path=g_getenv("BOOKLOUPE_CONFIG_PATH");
333 search_dirs=g_strsplit(search_path,";",0);
335 search_dirs=g_strsplit(search_path,":",0);
340 search_dirs=g_new(gchar *,4);
341 search_dirs[0]=g_get_current_dir();
342 search_dirs[1]=g_strdup(running_from);
343 search_dirs[2]=g_strdup(g_get_user_config_dir());
346 for(i=0;search_dirs[i];i++)
348 path=g_build_filename(search_dirs[i],"bookloupe.ini",NULL);
349 if (g_key_file_load_from_file(kf,path,
350 G_KEY_FILE_KEEP_COMMENTS|G_KEY_FILE_KEEP_TRANSLATIONS,&err))
352 if (!g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
354 g_printerr("Bookloupe: Error reading %s\n",path);
355 g_printerr("%s\n",err->message);
367 g_strfreev(search_dirs);
375 void parse_config_file(void)
382 config=read_config_file(&path);
384 keys=g_key_file_get_keys(config,"options",NULL,NULL);
391 for(j=0;options[j].long_name;j++)
393 if (g_str_has_prefix(options[j].long_name,"no-"))
395 else if (!strcmp(keys[i],options[j].long_name))
397 if (options[j].arg==G_OPTION_ARG_NONE)
399 sw=g_key_file_get_boolean(config,"options",keys[i],
403 g_printerr("Bookloupe: %s: options.%s: %s\n",
404 path,keys[i],err->message);
407 if (options[j].flags&G_OPTION_FLAG_REVERSE)
409 *(gboolean *)options[j].arg_data=sw;
413 g_assert_not_reached();
416 if (!options[j].long_name)
417 g_printerr("Bookloupe: %s: Unknown option \"%s\" ignored\n",
426 void parse_options(int *argc,char ***argv)
429 GOptionContext *context;
430 GOptionGroup *compatibility;
431 context=g_option_context_new(
432 "file - look for errors in Project Gutenberg(TM) etexts");
433 g_option_context_add_main_entries(context,options,NULL);
434 g_option_context_add_main_entries(context,config_options,NULL);
435 compatibility=g_option_group_new("compatibility",
436 "Options for Compatibility with Gutcheck:",
437 "Show compatibility options",NULL,NULL);
438 g_option_group_add_entries(compatibility,compatibility_options);
439 g_option_context_add_group(context,compatibility);
440 g_option_context_set_description(context,
441 "For simplicity, only the switch options which reverse the\n"
442 "default configuration are listed. In most cases, both vanilla\n"
443 "and \"no-\" prefixed versions are available for use.");
444 if (!g_option_context_parse(context,argc,argv,&err))
446 g_printerr("Bookloupe: %s\n",err->message);
447 g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
451 pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
454 pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
455 pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
458 * Web uploads - for the moment, this is really just a placeholder
459 * until we decide what processing we really want to do on web uploads
461 if (pswit[WEB_SWITCH])
463 /* specific override for web uploads */
464 pswit[ECHO_SWITCH]=TRUE;
465 pswit[SQUOTE_SWITCH]=FALSE;
466 pswit[TYPO_SWITCH]=TRUE;
467 pswit[QPARA_SWITCH]=FALSE;
468 pswit[PARANOID_SWITCH]=TRUE;
469 pswit[LINE_END_SWITCH]=FALSE;
470 pswit[OVERVIEW_SWITCH]=FALSE;
471 pswit[STDOUT_SWITCH]=FALSE;
472 pswit[HEADER_SWITCH]=TRUE;
473 pswit[VERBOSE_SWITCH]=FALSE;
474 pswit[MARKUP_SWITCH]=FALSE;
475 pswit[USERTYPO_SWITCH]=FALSE;
476 pswit[DP_SWITCH]=FALSE;
478 if (pswit[DUMP_CONFIG_SWITCH])
483 if (pswit[OVERVIEW_SWITCH])
484 /* just print summary; don't echo */
485 pswit[ECHO_SWITCH]=FALSE;
491 g_option_context_free(context);
497 * Read in the user-defined stealth scanno list.
499 void read_user_scannos(void)
502 gchar *usertypo_file;
506 gchar *contents,*utf8,**lines;
507 usertypo_file=g_strdup("bookloupe.typ");
508 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
509 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
512 g_free(usertypo_file);
513 usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
514 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
516 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
519 g_free(usertypo_file);
520 usertypo_file=g_strdup("gutcheck.typ");
521 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
523 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
526 g_free(usertypo_file);
527 usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
528 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
530 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
532 g_free(usertypo_file);
533 g_print(" --> I couldn't find bookloupe.typ "
534 "-- proceeding without user typos.\n");
539 fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
540 g_free(usertypo_file);
544 if (g_utf8_validate(contents,len,NULL))
545 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
547 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
549 lines=g_strsplit_set(utf8,"\r\n",0);
551 usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
552 for (i=0;lines[i];i++)
553 if (*(unsigned char *)lines[i]>'!')
554 g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
563 * Read an etext returning a newly allocated string containing the file
564 * contents or NULL on error.
566 gchar *read_etext(const char *filename,GError **err)
568 GError *tmp_err=NULL;
569 gchar *contents,*utf8;
570 gsize len,bytes_read,bytes_written;
572 if (!g_file_get_contents(filename,&contents,&len,err))
574 if (g_utf8_validate(contents,len,NULL))
576 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
577 g_set_print_handler(print_as_utf_8);
579 SetConsoleOutputCP(CP_UTF8);
584 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,
585 &bytes_written,&tmp_err);
586 if (g_error_matches(tmp_err,G_CONVERT_ERROR,
587 G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
590 for(i=0;i<bytes_read;i++)
591 if (contents[i]=='\n')
596 else if (contents[i]!='\r')
598 g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
599 "Input conversion failed. Byte %d at line %d, column %d is not a "
600 "valid Windows-1252 character",
601 ((unsigned char *)contents)[bytes_read],line,col);
604 g_propagate_error(err,tmp_err);
605 g_set_print_handler(print_as_windows_1252);
607 SetConsoleOutputCP(1252);
614 void cleanup_on_exit(void)
617 SetConsoleOutputCP(saved_cp);
621 int main(int argc,char **argv)
624 atexit(cleanup_on_exit);
625 saved_cp=GetConsoleOutputCP();
627 running_from=g_path_get_dirname(argv[0]);
628 /* Paranoid checking is turned OFF, not on, by its switch */
629 pswit[PARANOID_SWITCH]=TRUE;
630 /* if running in paranoid mode, typo checks default to enabled */
631 pswit[TYPO_SWITCH]=TRUE;
632 /* Line-end checking is turned OFF, not on, by its switch */
633 pswit[LINE_END_SWITCH]=TRUE;
634 /* Echoing is turned OFF, not on, by its switch */
635 pswit[ECHO_SWITCH]=TRUE;
637 parse_options(&argc,&argv);
638 if (pswit[USERTYPO_SWITCH])
640 fprintf(stderr,"bookloupe: Check and report on an e-text\n");
642 if (pswit[OVERVIEW_SWITCH])
644 g_print(" Checked %ld lines of %ld (head+foot = %ld)\n\n",
645 checked_linecnt,linecnt,linecnt-checked_linecnt);
646 g_print(" --------------- Queries found --------------\n");
648 g_print(" Long lines: %14ld\n",cnt_long);
650 g_print(" Short lines: %14ld\n",cnt_short);
652 g_print(" Line-end problems: %14ld\n",cnt_lineend);
654 g_print(" Common typos: %14ld\n",cnt_word);
656 g_print(" Unmatched quotes: %14ld\n",cnt_quote);
658 g_print(" Unmatched brackets: %14ld\n",cnt_brack);
660 g_print(" Non-ASCII characters: %14ld\n",cnt_bin);
662 g_print(" Proofing characters: %14ld\n",cnt_odd);
664 g_print(" Punctuation & spacing queries: %14ld\n",cnt_punct);
666 g_print(" Non-standard dashes: %14ld\n",cnt_dash);
668 g_print(" Possible HTML tags: %14ld\n",cnt_html);
670 g_print(" TOTAL QUERIES %14ld\n",
671 cnt_quote+cnt_brack+cnt_bin+cnt_odd+cnt_long+cnt_short+cnt_punct+
672 cnt_dash+cnt_word+cnt_html+cnt_lineend);
674 g_free(running_from);
676 g_tree_unref(usertypo);
678 g_key_file_free(config);
682 void count_dashes(const char *line,const char *dash,
683 struct dash_results *results)
688 gboolean spaced=FALSE,unspaced=FALSE,spaced2=FALSE;
691 tokens=g_strsplit(line,dash,0);
694 for(i=1;tokens[i];i++)
696 pc=g_utf8_get_char(g_utf8_prev_char(tokens[i-1]+strlen(tokens[i-1])));
697 nc=g_utf8_get_char(tokens[i]);
698 if (g_unichar_isspace(pc) || g_unichar_isspace(nc))
700 if (g_unichar_isspace(pc) && g_unichar_isspace(nc))
702 else if (!g_unichar_isspace(pc) && !g_unichar_isspace(nc))
708 /* count of lines with em-dashes with spaces both sides */
709 results->non_PG_space++;
711 /* count of lines with PG-type em-dashes with no spaces */
719 * Run a first pass - verify that it's a valid PG
720 * file, decide whether to report some things that
721 * occur many times in the text like long or short
722 * lines, non-standard dashes, etc.
724 struct first_pass_results *first_pass(const char *etext)
726 gunichar laststart=CHAR_SPACE;
731 unsigned int lastlen=0,lastblen=0;
732 long spline=0,nspline=0;
733 static struct first_pass_results results={0};
734 struct dash_results tmp_dash_results;
737 lines=g_strsplit(etext,"\n",0);
739 /* If there's at least one line, we might have UNIX-style terminators */
740 results.unix_lineends=TRUE;
741 for (j=0;lines[j];j++)
743 lbytes=strlen(lines[j]);
744 if (lbytes>0 && lines[j][lbytes-1]=='\r')
746 results.unix_lineends=FALSE;
749 lines[j][--lbytes]='\0';
750 } while (lbytes>0 && lines[j][lbytes-1]=='\r');
752 llen=g_utf8_strlen(lines[j],lbytes);
754 if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
755 (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
758 g_print(" --> Duplicate header?\n");
759 spline=linecnt+1; /* first line of non-header text, that is */
761 if (!strncmp(lines[j],"*** START",9) &&
762 strstr(lines[j],"PROJECT GUTENBERG"))
765 g_print(" --> Duplicate header?\n");
766 nspline=linecnt+1; /* first line of non-header text, that is */
768 if (spline || nspline)
770 lc_line=g_utf8_strdown(lines[j],lbytes);
771 if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
773 if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
775 if (results.footerline)
777 /* it's an old-form header - we can detect duplicates */
779 g_print(" --> Duplicate footer?\n");
782 results.footerline=linecnt;
788 results.firstline=spline;
790 results.firstline=nspline; /* override with new */
791 if (results.footerline)
792 continue; /* don't count the boilerplate in the footer */
793 results.totlen+=llen;
794 for (s=lines[j];*s;s=g_utf8_next_char(s))
796 if (g_utf8_get_char(s)>127)
798 if (g_unichar_isalpha(g_utf8_get_char(s)))
802 if (CHAR_IS_DQUOTE(g_utf8_get_char(s)))
803 qc=QUOTE_CLASS(g_utf8_get_char(s));
806 if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) &&
807 g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
808 results.endquote_count++;
811 if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
812 lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
815 g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
817 if (strstr(lines[j],".,"))
819 /* only count asterisk lines for ignoring purposes where there is */
820 /* lowercase text on the line */
821 if (strchr(lines[j],'*'))
823 for (s=lines[j];*s;s=g_utf8_next_char(s))
824 if (g_unichar_islower(g_utf8_get_char(s)))
829 if (strchr(lines[j],'/'))
830 results.fslashline++;
833 for (s=g_utf8_prev_char(lines[j]+lbytes);
834 s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;
835 s=g_utf8_prev_char(s))
837 if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
838 g_utf8_get_char(g_utf8_prev_char(s))!='-')
841 if (llen>LONGEST_PG_LINE)
843 if (llen>WAY_TOO_LONG)
844 results.verylongline++;
845 if (strchr(lines[j],'<') && strchr(lines[j],'>'))
847 i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
850 if (strstr(lines[j],"<i>"))
851 results.htmcount+=4; /* bonus marks! */
853 /* Check for spaced em-dashes */
854 memset(&tmp_dash_results,0,sizeof(tmp_dash_results));
855 count_dashes(lines[j],"--",&tmp_dash_results);
856 count_dashes(lines[j],"—",&tmp_dash_results);
857 if (tmp_dash_results.base)
858 results.emdash.base++;
859 if (tmp_dash_results.non_PG_space)
860 results.emdash.non_PG_space++;
861 if (tmp_dash_results.PG_space)
862 results.emdash.PG_space++;
866 if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))
867 results.Dutchcount++;
868 if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))
869 results.Frenchcount++;
870 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
871 results.standalone_digit++;
874 /* Check for spaced dashes */
875 if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
879 laststart=lines[j][0];
888 * Make some snap decisions based on the first pass results.
890 struct warnings *report_first_pass(struct first_pass_results *results)
892 static struct warnings warnings={0};
894 if (results->unix_lineends)
897 g_print(" --> No lines in this file have a CR. Not reporting them. "
898 "Project Gutenberg requires that all lineends be CR-LF.\n");
901 g_print(" --> %ld lines in this file have white space at end\n",
904 if (results->dotcomma>5)
907 g_print(" --> %ld lines in this file contain '.,'. "
908 "Not reporting them.\n",results->dotcomma);
911 * If more than 50 lines, or one-tenth, are short,
912 * don't bother reporting them.
914 warnings.shortline=1;
915 if (results->shortline>50 || results->shortline*10>linecnt)
917 warnings.shortline=0;
918 g_print(" --> %ld lines in this file are short. "
919 "Not reporting short lines.\n",results->shortline);
922 * If more than 50 lines, or one-tenth, are long,
923 * don't bother reporting them.
926 if (results->longline>50 || results->longline*10>linecnt)
929 g_print(" --> %ld lines in this file are long. "
930 "Not reporting long lines.\n",results->longline);
932 /* If more than 10 lines contain asterisks, don't bother reporting them. */
934 if (results->astline>10)
937 g_print(" --> %ld lines in this file contain asterisks. "
938 "Not reporting them.\n",results->astline);
941 * If more than 10 lines contain forward slashes,
942 * don't bother reporting them.
945 if (results->fslashline>10)
948 g_print(" --> %ld lines in this file contain forward slashes. "
949 "Not reporting them.\n",results->fslashline);
952 * If more than 20 lines contain unpunctuated endquotes,
953 * don't bother reporting them.
956 if (results->endquote_count>20)
959 g_print(" --> %ld lines in this file contain unpunctuated endquotes. "
960 "Not reporting them.\n",results->endquote_count);
963 * If more than 10 lines contain standalone digits,
964 * don't bother reporting them.
967 if (results->standalone_digit>10)
970 g_print(" --> %ld lines in this file contain standalone 0s and 1s. "
971 "Not reporting them.\n",results->standalone_digit);
974 * If more than 20 lines contain hyphens at end,
975 * don't bother reporting them.
978 if (results->hyphens>20)
981 g_print(" --> %ld lines in this file have hyphens at end. "
982 "Not reporting them.\n",results->hyphens);
984 if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
986 g_print(" --> Looks like this is HTML. Switching HTML mode ON.\n");
987 pswit[MARKUP_SWITCH]=1;
989 if (results->verylongline>0)
990 g_print(" --> %ld lines in this file are VERY long!\n",
991 results->verylongline);
993 * If there are more non-PG spaced dashes than PG em-dashes,
994 * assume it's deliberate.
995 * Current PG guidelines say don't use them, but older texts do,
996 * and some people insist on them whatever the guidelines say.
999 if (results->spacedash+results->emdash.non_PG_space>
1000 results->emdash.PG_space)
1003 g_print(" --> There are %ld spaced dashes and em-dashes. "
1004 "Not reporting them.\n",
1005 results->spacedash+results->emdash.non_PG_space);
1007 /* If more than a quarter of characters are hi-bit, bug out. */
1009 if (results->binlen*4>results->totlen)
1011 g_print(" --> This file does not appear to be ASCII. "
1012 "Terminating. Best of luck with it!\n");
1015 if (results->alphalen*4<results->totlen)
1017 g_print(" --> This file does not appear to be text. "
1018 "Terminating. Best of luck with it!\n");
1021 if (results->binlen*100>results->totlen || results->binlen>100)
1023 g_print(" --> There are a lot of foreign letters here. "
1024 "Not reporting them.\n");
1027 warnings.isDutch=FALSE;
1028 if (results->Dutchcount>50)
1030 warnings.isDutch=TRUE;
1031 g_print(" --> This looks like Dutch - "
1032 "switching off dashes and warnings for 's Middags case.\n");
1034 warnings.isFrench=FALSE;
1035 if (results->Frenchcount>50)
1037 warnings.isFrench=TRUE;
1038 g_print(" --> This looks like French - "
1039 "switching off some doublepunct.\n");
1041 if (results->firstline && results->footerline)
1042 g_print(" The PG header and footer appear to be already on.\n");
1045 if (results->firstline)
1046 g_print(" The PG header is on - no footer.\n");
1047 if (results->footerline)
1048 g_print(" The PG footer is on - no header.\n");
1051 if (pswit[VERBOSE_SWITCH])
1054 warnings.shortline=1;
1055 warnings.dotcomma=1;
1056 warnings.longline=1;
1062 warnings.endquote=1;
1063 g_print(" *** Verbose output is ON -- you asked for it! ***\n");
1065 if (warnings.isDutch)
1067 if (results->footerline>0 && results->firstline>0 &&
1068 results->footerline>results->firstline &&
1069 results->footerline-results->firstline<100)
1071 g_print(" --> I don't really know where this text starts. \n");
1072 g_print(" There are no reference points.\n");
1073 g_print(" I'm going to have to report the header and footer "
1075 results->firstline=0;
1083 * Look along the line, accumulate the count of quotes, and see
1084 * if this is an empty line - i.e. a line with nothing on it
1086 * If line has just spaces, period, * and/or - on it, don't
1087 * count it, since empty lines with asterisks or dashes to
1088 * separate sections are common.
1090 * Returns: TRUE if the line is empty.
1092 gboolean analyse_quotes(const char *aline,struct counters *counters)
1095 /* assume the line is empty until proven otherwise */
1096 gboolean isemptyline=TRUE;
1097 const char *s=aline,*sprev,*snext;
1100 GError *tmp_err=NULL;
1103 snext=g_utf8_next_char(s);
1104 c=g_utf8_get_char(s);
1105 if (CHAR_IS_DQUOTE(c))
1106 (void)count_quote(counters,c,QUOTE_CLASS(c),&tmp_err);
1107 else if (CHAR_IS_SQUOTE(c) && pswit[SQUOTE_SWITCH])
1112 * At start of line, it can only be a quotation mark.
1113 * Hardcode a very common exception!
1115 if (!g_str_has_prefix(snext,"tis") &&
1116 !g_str_has_prefix(snext,"Tis"))
1117 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
1119 else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
1120 g_unichar_isalpha(g_utf8_get_char(snext)))
1121 /* Do nothing! it's definitely an apostrophe, not a quote */
1123 /* it's outside a word - let's check it out */
1124 else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||
1125 g_unichar_isalpha(g_utf8_get_char(snext)))
1127 /* certainly looks like a quotation mark */
1128 if (!g_str_has_prefix(snext,"tis") &&
1129 !g_str_has_prefix(snext,"Tis"))
1130 /* hardcode a very common exception! */
1132 if (strchr(".?!,;:",g_utf8_get_char(sprev)))
1133 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
1135 (void)count_quote(counters,c,OPENING_QUOTE,&tmp_err);
1140 /* now - is it a quotation mark? */
1141 guessquote=0; /* accumulate clues */
1142 if (g_unichar_isalpha(g_utf8_get_char(sprev)))
1144 /* it follows a letter - could be either */
1146 if (g_utf8_get_char(sprev)=='s')
1148 /* looks like a plural apostrophe */
1150 if (g_utf8_get_char(snext)==CHAR_SPACE)
1154 if (innermost_quote_matches(counters,c))
1156 * Give it the benefit of some doubt,
1157 * if a squote is already open.
1163 (void)count_quote(counters,c,CLOSING_QUOTE,&tmp_err);
1166 /* no adjacent letter - it must be a quote of some kind */
1167 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
1172 if (pswit[ECHO_SWITCH])
1173 g_print("\n%s\n",aline);
1174 if (!pswit[OVERVIEW_SWITCH])
1175 g_print(" Line %ld column %ld - %s\n",
1176 linecnt,g_utf8_pointer_to_offset(aline,s)+1,tmp_err->message);
1177 g_clear_error(&tmp_err);
1179 if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
1181 isemptyline=FALSE; /* ignore lines like * * * as spacers */
1182 if (c==CHAR_UNDERSCORE)
1183 counters->c_unders++;
1184 if (c==CHAR_OPEN_SBRACK)
1186 if (!matching_difference(counters,COUNTER_ILLUSTRATION) &&
1187 !matching_difference(counters,c) && s==aline &&
1188 g_str_has_prefix(s,"[Illustration:"))
1189 increment_matching(counters,COUNTER_ILLUSTRATION,TRUE);
1191 increment_matching(counters,c,TRUE);
1193 else if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK)
1194 increment_matching(counters,c,TRUE);
1195 if (c==CHAR_CLOSE_SBRACK)
1197 if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) &&
1198 !matching_difference(counters,c) && !*snext)
1199 increment_matching(counters,COUNTER_ILLUSTRATION,FALSE);
1201 increment_matching(counters,c,FALSE);
1203 else if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK)
1204 increment_matching(counters,c,FALSE);
1212 * check_for_control_characters:
1214 * Check for invalid or questionable characters in the line
1215 * Anything above 127 is invalid for plain ASCII, and
1216 * non-printable control characters should also be flagged.
1217 * Tabs should generally not be there.
/*
 * Scan aline character by character and report a control character,
 * i.e. a code point below CHAR_SPACE other than CHAR_LF, CHAR_CR and
 * CHAR_TAB.
 *
 * aline: NUL-terminated UTF-8 line, walked with g_utf8_next_char().
 *
 * With pswit[ECHO_SWITCH] set the line is echoed first; unless
 * pswit[OVERVIEW_SWITCH] is set, a message giving linecnt, the 1-based
 * column and the numeric value of the character is printed.
 */
1219 void check_for_control_characters(const char *aline)
1223 for (s=aline;*s;s=g_utf8_next_char(s))
1225 c=g_utf8_get_char(s);
1226 if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
1228 if (pswit[ECHO_SWITCH])
1229 g_print("\n%s\n",aline);
1230 if (!pswit[OVERVIEW_SWITCH])
1231 g_print(" Line %ld column %ld - Control character %u\n",
/* g_utf8_pointer_to_offset() takes (string start, position); with the
 * arguments the other way round the offset comes out negated. */
1232 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1240 * check_for_odd_characters:
1242 * Check for binary and other odd characters.
/*
 * Query binary and other odd characters in aline: non-ASCII or control
 * characters, tabs, tildes, carats, forward slashes and asterisks.
 * Each kind of query is guarded by its own e* flag (see the existing
 * comment about not repeating warnings on one line).
 *
 * aline:       NUL-terminated UTF-8 line, walked with g_utf8_next_char().
 * warnings:    the fslash member enables the forward-slash query and the
 *              ast member is one of the conditions for the asterisk query.
 * isemptyline: when TRUE the asterisk query is suppressed.
 *
 * Every query echoes the line when pswit[ECHO_SWITCH] is set and prints
 * a message with linecnt and the 1-based column unless
 * pswit[OVERVIEW_SWITCH] is set.
 */
1244 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
1245 gboolean isemptyline)
1247 /* Don't repeat multiple warnings on one line. */
1248 gboolean eNon_A=FALSE,eTab=FALSE,eTilde=FALSE;
1249 gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
1252 for (s=aline;*s;s=g_utf8_next_char(s))
1254 c=g_utf8_get_char(s);
/* A control character other than tab and newline, or any code point
 * above 127 (&& binds tighter than ||). */
1255 if (!eNon_A && (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
1257 if (pswit[ECHO_SWITCH])
1258 g_print("\n%s\n",aline);
1259 if (!pswit[OVERVIEW_SWITCH])
/* Code points 128-159 and everything above 255 select the
 * Non-ISO-8859 message. */
1260 if (c>127 && c<160 || c>255)
1261 g_print(" Line %ld column %ld - "
1262 "Non-ISO-8859 character %u\n",
1263 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1265 g_print(" Line %ld column %ld - "
1266 "Non-ASCII character %u\n",
1267 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1272 if (!eTab && c==CHAR_TAB)
1274 if (pswit[ECHO_SWITCH])
1275 g_print("\n%s\n",aline);
1276 if (!pswit[OVERVIEW_SWITCH])
1277 g_print(" Line %ld column %ld - Tab character?\n",
1278 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1283 if (!eTilde && c==CHAR_TILDE)
1286 * Often used by OCR software to indicate an
1287 * unrecognizable character.
1289 if (pswit[ECHO_SWITCH])
1290 g_print("\n%s\n",aline);
1291 if (!pswit[OVERVIEW_SWITCH])
1292 g_print(" Line %ld column %ld - Tilde character?\n",
1293 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1298 if (!eCarat && c==CHAR_CARAT)
1300 if (pswit[ECHO_SWITCH])
1301 g_print("\n%s\n",aline);
1302 if (!pswit[OVERVIEW_SWITCH])
1303 g_print(" Line %ld column %ld - Carat character?\n",
1304 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Forward slashes are queried only while warnings->fslash is set. */
1309 if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
1311 if (pswit[ECHO_SWITCH])
1312 g_print("\n%s\n",aline);
1313 if (!pswit[OVERVIEW_SWITCH])
1314 g_print(" Line %ld column %ld - Forward slash?\n",
1315 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1321 * Report asterisks only in paranoid mode,
1322 * since they're often deliberate.
1324 if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
1327 if (pswit[ECHO_SWITCH])
1328 g_print("\n%s\n",aline);
1329 if (!pswit[OVERVIEW_SWITCH])
1330 g_print(" Line %ld column %ld - Asterisk?\n",
1331 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1340 * check_for_long_line:
1342 * Check for line too long.
/*
 * Report aline when it holds more than LONGEST_PG_LINE characters.
 * The length is counted in UTF-8 characters (g_utf8_strlen), not bytes.
 * The same length is printed twice in the message: once as the column
 * and once as the "Long line" value.
 *
 * aline: NUL-terminated UTF-8 line.
 */
1344 void check_for_long_line(const char *aline)
1346 if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
1348 if (pswit[ECHO_SWITCH])
1349 g_print("\n%s\n",aline);
1350 if (!pswit[OVERVIEW_SWITCH])
1351 g_print(" Line %ld column %ld - Long line %ld\n",
1352 linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
1359 * check_for_short_line:
1361 * Check for line too short.
1363 * This one is a bit trickier to implement: we don't want to
1364 * flag the last line of a paragraph for being short, so we
1365 * have to wait until we know that our current line is a
1366 * "normal" line, then report the _previous_ line if it was too
1367 * short. We also don't want to report indented lines like
1368 * chapter heads or formatted quotations. We therefore keep
1369 * last->len as the length of the last line examined, and
1370 * last->blen as the length of the last but one, and try to
1371 * suppress unnecessary warnings by checking that both were of
1372 * "normal" length. We keep the first character of the last
1373 * line in last->start, and if it was a space, we assume that
1374 * the formatting is deliberate. I can't figure out a way to
1375 * distinguish something like a quoted verse left-aligned or
1376 * the header or footer of a letter from a paragraph of short
1377 * lines - maybe if I examined the whole paragraph, and if the
1378 * para has less than, say, 8 lines and if all lines are short,
1379 * then just assume it's OK? Need to look at some texts to see
1380 * how often a formula like this would get the right result.
/*
 * Report the PREVIOUS line (global prevline, numbered linecnt-1) as
 * too short. The current line is used only for its length.
 *
 * aline: current line; must be longer than one character for a report.
 * last:  len must lie strictly between 1 and SHORTEST_PG_LINE, blen
 *        must exceed SHORTEST_PG_LINE and start must not be a space.
 *
 * The length of prevline is printed both as the column and as the
 * "Short line" value.
 */
1382 void check_for_short_line(const char *aline,const struct line_properties *last)
1384 if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
/* The blen>1 test is subsumed by blen>SHORTEST_PG_LINE whenever
 * SHORTEST_PG_LINE is at least 1. */
1385 last->len<SHORTEST_PG_LINE && last->blen>1 &&
1386 last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
1388 if (pswit[ECHO_SWITCH])
1389 g_print("\n%s\n",prevline);
1390 if (!pswit[OVERVIEW_SWITCH])
1391 g_print(" Line %ld column %ld - Short line %ld?\n",
1392 linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
1399 * check_for_starting_punctuation:
1401 * Look for punctuation other than full ellipses at start of line.
/*
 * Query a line whose first character is one of . ? ! , ; : unless the
 * line begins with the spaced ellipsis ". . .".
 * The column in the message is the literal 1.
 *
 * aline: NUL-terminated UTF-8 line; an empty line is never reported
 *        (the *aline test).
 */
1403 void check_for_starting_punctuation(const char *aline)
1405 if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
1406 !g_str_has_prefix(aline,". . ."))
1408 if (pswit[ECHO_SWITCH])
1409 g_print("\n%s\n",aline);
1410 if (!pswit[OVERVIEW_SWITCH])
1411 g_print(" Line %ld column 1 - Begins with punctuation?\n",
1421 * Find the first em-dash, return a pointer to it and set <next> to the
1422 * character following the dash.
/*
 * s:    string to search.
 * next: out parameter; receives the position following the dash.
 *
 * NOTE(review): only the assignments to *next are visible here. Two of
 * them step one character past s2 and two step two characters past s1;
 * presumably s2 marks a single-character em-dash and s1 a two-character
 * dash sequence - TODO confirm against the complete function.
 */
1424 char *str_emdash(const char *s,const char **next)
1432 *next=g_utf8_next_char(s2);
1437 *next=g_utf8_next_char(g_utf8_next_char(s1));
1442 *next=g_utf8_next_char(g_utf8_next_char(s1));
1447 *next=g_utf8_next_char(s2);
1453 * check_for_spaced_emdash:
1455 * Check for spaced em-dashes.
1457 * We must check _all_ occurrences of em-dashes on the line
1458 * hence the loop - even if the first dash is OK
1459 * there may be another that's wrong later on.
/*
 * Query every em-dash in aline that has a space immediately before or
 * immediately after it.
 *
 * str_emdash() yields each dash (t) together with the position that
 * follows it (next); the loop resumes searching from next and ends
 * when str_emdash() returns NULL. The column reported is the 1-based
 * column of the dash.
 */
1461 void check_for_spaced_emdash(const char *aline)
1463 const char *s,*t,*next;
1464 for (s=aline;t=str_emdash(s,&next);s=next)
/* t>aline keeps g_utf8_prev_char() from stepping before the start of
 * the line; && binds tighter than ||, so the test on next is made
 * regardless of t. */
1466 if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
1467 g_utf8_get_char(next)==CHAR_SPACE)
1469 if (pswit[ECHO_SWITCH])
1470 g_print("\n%s\n",aline);
1471 if (!pswit[OVERVIEW_SWITCH])
1472 g_print(" Line %ld column %ld - Spaced em-dash?\n",
1473 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1481 * check_for_spaced_dash:
1483 * Check for spaced dashes.
/*
 * Query a dash with a space on one side of it.
 *
 * Only the first occurrence found by strstr() is examined for each
 * pattern. A match is not reported when a second dash adjoins the
 * matched one (the character after " -", or the one before "- ").
 * NOTE(review): the brace structure is not visible here; the else-if
 * presumably pairs with the " -" search, so "- " is looked for only
 * when the line has no " -" at all - confirm.
 */
1485 void check_for_spaced_dash(const char *aline)
1488 if ((s=strstr(aline," -")))
/* s points at the space, so two steps on is the character after the
 * dash (the terminating NUL if the dash ends the line). */
1490 if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
1492 if (pswit[ECHO_SWITCH])
1493 g_print("\n%s\n",aline);
1494 if (!pswit[OVERVIEW_SWITCH])
1495 g_print(" Line %ld column %ld - Spaced dash?\n",
1496 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1501 else if ((s=strstr(aline,"- ")))
/* s==aline: nothing precedes a dash at the start of the line. */
1503 if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
1505 if (pswit[ECHO_SWITCH])
1506 g_print("\n%s\n",aline);
1507 if (!pswit[OVERVIEW_SWITCH])
1508 g_print(" Line %ld column %ld - Spaced dash?\n",
1509 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1517 * check_for_unmarked_paragraphs:
1519 * Check for unmarked paragraphs indicated by separate speakers.
1521 * May well be false positive:
1522 * "Bravo!" "Wonderful!" called the crowd.
1523 * but useful all the same.
/*
 * Query two double quotes separated only by a space, which may mean a
 * paragraph break between two speakers is missing.
 * The column reported is the 1-based column of the first quote.
 *
 * NOTE(review): the two strstr() patterns below read identically here;
 * presumably they differ in the amount of spacing between the quotes -
 * confirm against the original file.
 */
1525 void check_for_unmarked_paragraphs(const char *aline)
1528 s=strstr(aline,"\" \"");
1530 s=strstr(aline,"\" \"");
1533 if (pswit[ECHO_SWITCH])
1534 g_print("\n%s\n",aline);
1535 if (!pswit[OVERVIEW_SWITCH])
1536 g_print(" Line %ld column %ld - "
1537 "Query missing paragraph break?\n",
1538 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1545 * check_for_jeebies:
1547 * Check for "to he" and other easy h/b errors.
1549 * This is a very inadequate effort on the h/b problem,
1550 * but the phrase "to he" is always an error, whereas "to
1551 * be" is quite common.
1552 * Similarly, '"Quiet!", be said.' is a non-be error
1553 * "to he" is _not_ always an error!:
1554 * "Where they went to he couldn't say."
1555 * Another false positive:
1556 * What would "Cinderella" be without the . . .
1557 * and another: "If he wants to he can see for himself."
/*
 * Query likely h/b confusions by searching aline for fixed phrases.
 * Three groups of phrases are searched, each with its own message:
 *   he/be   - " be could " through " to he "
 *   had/bad - " the had " through " i bad "
 *   hut/but - "; hut " and ", hut "
 * The searches use strstr() and are therefore case-sensitive. The
 * column reported is the 1-based column where the matched phrase
 * starts (its leading space, quote or punctuation mark).
 *
 * NOTE(review): the two patterns starting with a quote followed by
 * " be " read identically here; presumably they differ in spacing.
 * NOTE(review): " i bad " has a lower-case i, so it matches only if
 * aline holds a lower-case i - confirm what the caller passes.
 */
1559 void check_for_jeebies(const char *aline)
1562 s=strstr(aline," be could ");
1564 s=strstr(aline," be would ");
1566 s=strstr(aline," was be ");
1568 s=strstr(aline," be is ");
1570 s=strstr(aline," is be ");
1572 s=strstr(aline,"\", be ");
1574 s=strstr(aline,"\" be ");
1576 s=strstr(aline,"\" be ");
1578 s=strstr(aline," to he ");
1581 if (pswit[ECHO_SWITCH])
1582 g_print("\n%s\n",aline);
1583 if (!pswit[OVERVIEW_SWITCH])
1584 g_print(" Line %ld column %ld - Query he/be error?\n",
1585 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Second group: had/bad. */
1589 s=strstr(aline," the had ");
1591 s=strstr(aline," a had ");
1593 s=strstr(aline," they bad ");
1595 s=strstr(aline," she bad ");
1597 s=strstr(aline," he bad ");
1599 s=strstr(aline," you bad ");
1601 s=strstr(aline," i bad ");
1604 if (pswit[ECHO_SWITCH])
1605 g_print("\n%s\n",aline);
1606 if (!pswit[OVERVIEW_SWITCH])
1607 g_print(" Line %ld column %ld - Query had/bad error?\n",
1608 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Third group: hut/but, only after a semicolon or comma. */
1612 s=strstr(aline,"; hut ");
1614 s=strstr(aline,", hut ");
1617 if (pswit[ECHO_SWITCH])
1618 g_print("\n%s\n",aline);
1619 if (!pswit[OVERVIEW_SWITCH])
1620 g_print(" Line %ld column %ld - Query hut/but error?\n",
1621 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1628 * check_for_mta_from:
1630 * Special case - angled bracket in front of "From" placed there by an
1631 * MTA when sending an e-mail.
/*
 * Query the first occurrence of ">From" in aline.
 * The search is case-sensitive (strstr) and is not anchored to the
 * start of the line. The column reported is the 1-based column of the
 * angled bracket.
 */
1633 void check_for_mta_from(const char *aline)
1636 s=strstr(aline,">From");
1639 if (pswit[ECHO_SWITCH])
1640 g_print("\n%s\n",aline);
1641 if (!pswit[OVERVIEW_SWITCH])
1642 g_print(" Line %ld column %ld - "
1643 "Query angled bracket with From\n",
1644 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1651 * check_for_orphan_character:
1653 * Check for a single character line -
1654 * often an overflow from bad wrapping.
/*
 * Query a line that consists of exactly one character, unless that
 * character is I, V, X, L or a digit (a numeral alone on a line).
 * The column in the message is the literal 1.
 *
 * aline: NUL-terminated UTF-8 line.
 */
1656 void check_for_orphan_character(const char *aline)
1659 c=g_utf8_get_char(aline);
/* c is non-zero (the line is not empty) and the character after it is
 * the terminating NUL. */
1660 if (c && !*g_utf8_next_char(aline))
1662 if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
1663 ; /* Nothing - ignore numerals alone on a line. */
1666 if (pswit[ECHO_SWITCH])
1667 g_print("\n%s\n",aline);
1668 if (!pswit[OVERVIEW_SWITCH])
1669 g_print(" Line %ld column 1 - Query single character line\n",
1678 * check_for_pling_scanno:
1680 * Check for I" - often should be !
/*
 * Query the first occurrence of a space, a capital I and a double
 * quote in sequence; the I is often a misread exclamation mark.
 *
 * NOTE(review): unlike most neighbouring checks no +1 is added to the
 * offset, so the value printed is the 0-based offset of the space -
 * confirm that this column is the intended one.
 */
1682 void check_for_pling_scanno(const char *aline)
1685 s=strstr(aline," I\"");
1688 if (pswit[ECHO_SWITCH])
1689 g_print("\n%s\n",aline);
1690 if (!pswit[OVERVIEW_SWITCH])
1691 g_print(" Line %ld column %ld - Query I=exclamation mark?\n",
1692 linecnt,g_utf8_pointer_to_offset(aline,s));
1699 * check_for_extra_period:
1701 * Check for period without a capital letter. Cut-down from gutspell.
1702 * Only works when it happens on a single line.
/*
 * Query a period that is followed by a space and then by a lower-case
 * word, i.e. a period that does not look like the end of a sentence.
 * The check runs only when pswit[PARANOID_SWITCH] is set.
 *
 * aline:    NUL-terminated UTF-8 line; every ". " in it is visited.
 * warnings: isDutch enables the exemption for an apostrophe, one
 *           lower-case letter, a space and an upper-case letter
 *           following the period.
 *
 * The word before the period (testword) is compared with abbrev[],
 * tested for a leading digit, for being a single character, with
 * isroman() and for containing a vowel. Words already stored in the
 * qperiod tree are not reported again unless pswit[VERBOSE_SWITCH] is
 * set. The column reported is the 1-based column of t.
 * NOTE(review): the statements executed when each of those tests
 * succeeds are not visible here - confirm before relying on this
 * summary.
 */
1704 void check_for_extra_period(const char *aline,const struct warnings *warnings)
1706 const char *s,*t,*s1,*sprev;
1711 gunichar c,nc,pc,*decomposition;
1712 if (pswit[PARANOID_SWITCH])
/* strstr() restarts from the current t on every iteration; t is
 * advanced inside the loop body. */
1714 for (t=aline;t=strstr(t,". ");)
1718 t=g_utf8_next_char(t);
1719 /* start of line punctuation is handled elsewhere */
1722 if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
1724 t=g_utf8_next_char(t);
1727 if (warnings->isDutch)
1729 /* For Frank & Jeroen -- 's Middags case */
1730 gunichar c2,c3,c4,c5;
/* NOTE(review): g_utf8_offset_to_pointer() does not stop at the end of
 * the string, so offsets 3 to 5 may lie beyond the terminating NUL
 * when fewer characters follow t - confirm this cannot happen. */
1731 c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
1732 c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
1733 c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
1734 c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
1735 if (CHAR_IS_APOSTROPHE(c2) &&
1736 g_unichar_islower(c3) && c4==CHAR_SPACE &&
1737 g_unichar_isupper(c5))
1739 t=g_utf8_next_char(t);
/* Skip everything after the period and space that is neither a letter
 * nor a digit, to reach the start of the following word. */
1743 s1=g_utf8_next_char(g_utf8_next_char(t));
1744 while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
1745 !g_unichar_isdigit(g_utf8_get_char(s1)))
1746 s1=g_utf8_next_char(s1);
1747 if (g_unichar_islower(g_utf8_get_char(s1)))
1749 /* we have something to investigate */
1751 /* so let's go back and find out */
1752 nc=g_utf8_get_char(t);
1753 s1=g_utf8_prev_char(t);
1754 c=g_utf8_get_char(s1);
1755 sprev=g_utf8_prev_char(s1);
1756 pc=g_utf8_get_char(sprev);
/* A character counts as part of the word when it is a letter, a digit,
 * or an apostrophe with a letter on both sides. */
1758 (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||
1759 g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&
1760 g_unichar_isalpha(nc)))
1765 sprev=g_utf8_prev_char(s1);
1766 pc=g_utf8_get_char(sprev);
1768 s1=g_utf8_next_char(s1);
1771 testword=g_strndup(s1,s-s1);
1773 testword=g_strdup(s1);
/* abbrev[] is terminated by an empty string. */
1774 for (i=0;*abbrev[i];i++)
1775 if (!strcmp(testword,abbrev[i]))
1777 if (g_unichar_isdigit(g_utf8_get_char(testword)))
1779 if (!*g_utf8_next_char(testword))
1781 if (isroman(testword))
/* The first code point of the canonical decomposition is tested, so an
 * accented vowel is matched through its base letter. */
1786 for (s=testword;*s;s=g_utf8_next_char(s))
1788 decomposition=g_unicode_canonical_decomposition(
1789 g_utf8_get_char(s),&len);
1790 if (g_utf8_strchr("aeiou",-1,decomposition[0]))
1792 g_free(decomposition);
1796 (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
/* The tree receives its own copy of the word as key. */
1798 g_tree_insert(qperiod,g_strdup(testword),
1799 GINT_TO_POINTER(1));
1800 if (pswit[ECHO_SWITCH])
1801 g_print("\n%s\n",aline);
1802 if (!pswit[OVERVIEW_SWITCH])
1803 g_print(" Line %ld column %ld - Extra period?\n",
1804 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1810 t=g_utf8_next_char(t);
1816 * check_for_following_punctuation:
1818 * Check for words usually not followed by punctuation.
/*
 * Query punctuation after words that are usually not followed by it.
 * The check runs only when pswit[TYPO_SWITCH] is set.
 *
 * Each word is lower-cased with g_utf8_strdown() and compared with two
 * lists, both terminated by an empty string:
 *   nocomma[]  - queried when the character at s is , ; or :
 *   noperiod[] - queried when the character at s is . or !
 * The column reported is the 1-based column of s.
 */
1820 void check_for_following_punctuation(const char *aline)
1823 const char *s,*wordstart;
1826 if (pswit[TYPO_SWITCH])
1837 inword=g_utf8_strdown(t,-1);
1839 for (i=0;*nocomma[i];i++)
1840 if (!strcmp(inword,nocomma[i]))
1842 c=g_utf8_get_char(s);
1843 if (c==',' || c==';' || c==':')
1845 if (pswit[ECHO_SWITCH])
1846 g_print("\n%s\n",aline);
1847 if (!pswit[OVERVIEW_SWITCH])
1848 g_print(" Line %ld column %ld - "
1849 "Query punctuation after %s?\n",
1850 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
1856 for (i=0;*noperiod[i];i++)
1857 if (!strcmp(inword,noperiod[i]))
1859 c=g_utf8_get_char(s);
1860 if (c=='.' || c=='!')
1862 if (pswit[ECHO_SWITCH])
1863 g_print("\n%s\n",aline);
1864 if (!pswit[OVERVIEW_SWITCH])
1865 g_print(" Line %ld column %ld - "
1866 "Query punctuation after %s?\n",
1867 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
1881 * Check for commonly mistyped words,
1882 * and digits like 0 for O in a word.
/*
 * Run each word of aline (obtained with getaword()) through a series
 * of typo and scanno tests:
 *   - mixdigit(): query a digit inside a word;
 *   - with pswit[TYPO_SWITCH] or pswit[USERTYPO_SWITCH]: look for an
 *     upper-case letter in mid-word, with exceptions for Mc, Mac and a
 *     preceding apostrophe;
 *   - with pswit[TYPO_SWITCH]: test the case-folded word (testword)
 *     against the nostart[] and noend[] lists, a set of unlikely letter
 *     groups, a vowel and consonant count, the okword[] list,
 *     isroman(), the typo[] list and a set of single letters; the
 *     qword tree keeps a counter per reported word;
 *   - the usertypo tree, when present, is consulted for words that
 *     were not already flagged;
 *   - with pswit[PARANOID_SWITCH] and warnings->digit: query a "0" or
 *     "1" standing alone.
 *
 * aline:    NUL-terminated UTF-8 line.
 * warnings: the digit member enables the standalone 0/1 query.
 *
 * NOTE(review): the statements that set istypo and the other flags are
 * not visible here; the list above names the tests only.
 */
1884 void check_for_typos(const char *aline,struct warnings *warnings)
1886 const char *s,*t,*nt,*wordstart;
1888 gunichar *decomposition;
1890 int i,vowel,consonant,*dupcnt;
1891 gboolean isdup,istypo,alower;
1894 gsize decomposition_len;
1898 inword=getaword(&s);
1902 continue; /* don't bother with empty lines */
1904 if (mixdigit(inword))
1906 if (pswit[ECHO_SWITCH])
1907 g_print("\n%s\n",aline);
1908 if (!pswit[OVERVIEW_SWITCH])
1909 g_print(" Line %ld column %ld - Query digit in %s\n",
1910 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
1915 * Put the word through a series of tests for likely typos and OCR
1918 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
1922 for (t=inword;*t;t=g_utf8_next_char(t))
1924 c=g_utf8_get_char(t);
1925 nt=g_utf8_next_char(t);
1926 /* lowercase for testing */
1927 if (g_unichar_islower(c))
1929 if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
1932 * We have an uppercase mid-word. However, there are
1934 * Mac and Mc like McGill
1935 * French contractions like l'Abbe
1937 offset=g_utf8_pointer_to_offset(inword,t);
1939 pc=g_utf8_get_char(g_utf8_prev_char(t));
/* offset is the position of the capital: 2 follows "mc" and 3 follows
 * "mac". NOTE(review): c and nt are compared with lower-case letters
 * here, so they have presumably been reassigned on lines that are not
 * visible - confirm. */
1942 if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
1943 offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
1944 g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
1945 CHAR_IS_APOSTROPHE(pc))
/* All list comparisons below use the case-folded copy of the word. */
1951 testword=g_utf8_casefold(inword,-1);
1953 if (pswit[TYPO_SWITCH])
1956 * Check for certain unlikely two-letter combinations at word
1959 len=g_utf8_strlen(testword,-1);
/* nostart[] and noend[] are terminated by an empty string. */
1962 for (i=0;*nostart[i];i++)
1963 if (g_str_has_prefix(testword,nostart[i]))
1965 for (i=0;*noend[i];i++)
1966 if (g_str_has_suffix(testword,noend[i]))
1969 /* ght is common, gbt never. Like that. */
1970 if (strstr(testword,"cb"))
1972 if (strstr(testword,"gbt"))
1974 if (strstr(testword,"pbt"))
1976 if (strstr(testword,"tbs"))
1978 if (strstr(testword,"mrn"))
1980 if (strstr(testword,"ahle"))
1982 if (strstr(testword,"ihle"))
1985 * "TBE" does happen - like HEARTBEAT - but uncommon.
1986 * Also "TBI" - frostbite, outbid - but uncommon.
1987 * Similarly "ii" like Hawaii, or Pompeii, and in Roman
1988 * numerals, but "ii" is a common scanno.
1990 if (strstr(testword,"tbi"))
1992 if (strstr(testword,"tbe"))
1994 if (strstr(testword,"ii"))
1997 * Check for no vowels or no consonants.
1998 * If none, flag a typo.
2000 if (!istypo && len>1)
2003 for (t=testword;*t;t=g_utf8_next_char(t))
2005 c=g_utf8_get_char(t);
2007 g_unicode_canonical_decomposition(c,&decomposition_len);
2008 if (c=='y' || g_unichar_isdigit(c))
2010 /* Yah, this is loose. */
2014 else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
2018 g_free(decomposition);
2020 if (!vowel || !consonant)
2024 * Now exclude the word from being reported if it's in
2027 for (i=0;*okword[i];i++)
2028 if (!strcmp(testword,okword[i]))
2031 * What looks like a typo may be a Roman numeral.
2034 if (istypo && isroman(testword))
2036 /* Check the manual list of typos. */
2038 for (i=0;*typo[i];i++)
2039 if (!strcmp(testword,typo[i]))
2042 * Check lowercase s, l, i and m - special cases.
2043 * "j" - often a semi-colon gone wrong.
2044 * "d" for a missing apostrophe - he d
2047 if (!istypo && len==1 &&
2048 g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
2052 dupcnt=g_tree_lookup(qword,testword);
2056 isdup=!pswit[VERBOSE_SWITCH];
2060 dupcnt=g_new0(int,1);
2061 g_tree_insert(qword,g_strdup(testword),dupcnt);
2066 if (pswit[ECHO_SWITCH])
2067 g_print("\n%s\n",aline);
2068 if (!pswit[OVERVIEW_SWITCH])
2070 g_print(" Line %ld column %ld - Query word %s",
2071 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
2073 if (!pswit[VERBOSE_SWITCH])
2074 g_print(" - not reporting duplicates");
2082 /* check the user's list of typos */
2083 if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
2085 if (pswit[ECHO_SWITCH])
2086 g_print("\n%s\n",aline);
2087 if (!pswit[OVERVIEW_SWITCH])
/* NOTE(review): this message and the standalone one below add 2 to the
 * offset of wordstart while the others add 1 - confirm the intended
 * column. */
2088 g_print(" Line %ld column %ld - Query possible scanno %s\n",
2089 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
2091 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
2093 if (pswit[PARANOID_SWITCH] && warnings->digit)
2095 /* In paranoid mode, query all 0 and 1 standing alone. */
2096 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
2098 if (pswit[ECHO_SWITCH])
2099 g_print("\n%s\n",aline);
2100 if (!pswit[OVERVIEW_SWITCH])
2101 g_print(" Line %ld column %ld - Query standalone %s\n",
2102 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
2113 * check_for_misspaced_punctuation:
2115 * Look for added or missing spaces around punctuation and quotes.
2116 * If there is a punctuation character like ! with no space on
2117 * either side, suspect a missing!space. If there are spaces on
2118 * both sides , assume a typo. If we see a double quote with no
2119 * space or punctuation on either side of it, assume unspaced
2120 * quotes "like"this.
/*
 * Query added or missing spaces around punctuation and quotes.
 * The line is scanned several times, one pass per kind of query:
 *   1. a character from .?!,;:_ with no space where one is expected
 *      ("Missing space?") or with space on both sides, or space before
 *      and end of line after ("Spaced punctuation?");
 *   2. a character from ?!,;: preceded by a space and not followed by
 *      one ("Spaced punctuation?");
 *   3. a period preceded by a space and followed by a letter;
 *   4. a double quote with no space or punctuation around it
 *      ("Unspaced quotes?");
 *   5. double quotes tracked through parities->dquote
 *      ("Wrongspaced quotes?"), plus a double quote at the very start
 *      of the line followed by one of ,;:!?)]} or a space;
 *   6. with pswit[SQUOTE_SWITCH]: single quotes tracked through
 *      parities->squote ("Wrongspaced singlequotes?").
 *
 * aline:       NUL-terminated UTF-8 line.
 * parities:    dquote and squote are toggled here, so their values
 *              carry over from one call to the next.
 * isemptyline: suppresses the "Spaced punctuation?" queries of passes
 *              1 and 2.
 *
 * In passes 1 to 4 the scan starts at the second character and stops
 * once nc, the character after the current one, is NUL.
 */
2122 void check_for_misspaced_punctuation(const char *aline,
2123 struct parities *parities,gboolean isemptyline)
2125 gboolean isacro,isellipsis;
2127 gunichar c,nc,pc,n2c;
2129 c=g_utf8_get_char(aline);
/* Guard: do not step past the terminating NUL of an empty line. */
2130 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2131 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2135 nc=g_utf8_get_char(g_utf8_next_char(s));
2136 /* For each character in the line after the first. */
2137 if (g_utf8_strchr(".?!,;:_",-1,c)) /* if it's punctuation */
2139 /* we need to suppress warnings for acronyms like M.D. */
2141 /* we need to suppress warnings for ellipsis . . . */
2144 * If there are letters on both sides of it or
2145 * if it's strict punctuation followed by an alpha.
2147 if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
2148 g_utf8_strchr("?!,;:",-1,c)))
2152 if (g_utf8_pointer_to_offset(aline,s)>2 &&
2153 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
2155 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
2161 if (pswit[ECHO_SWITCH])
2162 g_print("\n%s\n",aline);
2163 if (!pswit[OVERVIEW_SWITCH])
2164 g_print(" Line %ld column %ld - Missing space?\n",
2165 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2170 if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
2173 * If there are spaces on both sides,
2174 * or space before and end of line.
2178 if (g_utf8_pointer_to_offset(aline,s)>2 &&
2179 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
2181 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
2185 if (!isemptyline && !isellipsis)
2187 if (pswit[ECHO_SWITCH])
2188 g_print("\n%s\n",aline);
2189 if (!pswit[OVERVIEW_SWITCH])
2190 g_print(" Line %ld column %ld - "
2191 "Spaced punctuation?\n",linecnt,
2192 g_utf8_pointer_to_offset(aline,s)+1);
2199 /* Split out the characters that CANNOT be preceded by space. */
2200 c=g_utf8_get_char(aline);
2201 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2202 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2206 nc=g_utf8_get_char(g_utf8_next_char(s));
2207 /* for each character in the line after the first */
2208 if (g_utf8_strchr("?!,;:",-1,c))
2210 /* if it's punctuation that _cannot_ have a space before it */
2211 if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
2214 * If nc DOES == space,
2215 * it was already reported just above.
2217 if (pswit[ECHO_SWITCH])
2218 g_print("\n%s\n",aline);
2219 if (!pswit[OVERVIEW_SWITCH])
2220 g_print(" Line %ld column %ld - Spaced punctuation?\n",
2221 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2228 * Special case " .X" where X is any alpha.
2229 * This plugs a hole in the acronym code above.
2230 * Inelegant, but maintainable.
2232 c=g_utf8_get_char(aline);
2233 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2234 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2238 nc=g_utf8_get_char(g_utf8_next_char(s));
2239 /* for each character in the line after the first */
2242 /* if it's a period */
2243 if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
2246 * If the period follows a space and
2247 * is followed by a letter.
2249 if (pswit[ECHO_SWITCH])
2250 g_print("\n%s\n",aline);
2251 if (!pswit[OVERVIEW_SWITCH])
2252 g_print(" Line %ld column %ld - Spaced punctuation?\n",
2253 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2259 c=g_utf8_get_char(aline);
2260 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2261 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2265 nc=g_utf8_get_char(g_utf8_next_char(s));
2266 /* for each character in the line after the first */
2267 if (CHAR_IS_DQUOTE(c))
/* Either neither neighbour is a space or punctuation mark (and the
 * quote is not last on the line), or the quote is followed by a letter
 * without one of the listed opening characters before it. */
2269 if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
2270 !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
2271 !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
2273 if (pswit[ECHO_SWITCH])
2274 g_print("\n%s\n",aline);
2275 if (!pswit[OVERVIEW_SWITCH])
2276 g_print(" Line %ld column %ld - Unspaced quotes?\n",
2277 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2283 /* Check parity of quotes. */
2284 nc=g_utf8_get_char(aline);
2285 for (s=aline;*s;s=g_utf8_next_char(s))
2288 nc=g_utf8_get_char(g_utf8_next_char(s));
2289 if (CHAR_IS_DQUOTE(c))
/* Each double quote seen flips the stored parity. */
2293 parities->dquote=!parities->dquote;
2294 parity=parities->dquote;
2296 else if (c==CHAR_LD_QUOTE)
2303 if (!g_utf8_strchr("_-.'`‘’/,;:!?)]} ",-1,nc))
2305 if (pswit[ECHO_SWITCH])
2306 g_print("\n%s\n",aline);
2307 if (!pswit[OVERVIEW_SWITCH])
2308 g_print(" Line %ld column %ld - "
2309 "Wrongspaced quotes?\n",
2310 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2318 if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
2319 !g_utf8_strchr("_-/.'`‘’([{$",-1,nc) || !nc)
2321 if (pswit[ECHO_SWITCH])
2322 g_print("\n%s\n",aline);
2323 if (!pswit[OVERVIEW_SWITCH])
2324 g_print(" Line %ld column %ld - "
2325 "Wrongspaced quotes?\n",
2326 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2333 c=g_utf8_get_char(aline);
2334 if (CHAR_IS_DQUOTE(c))
2336 if (g_utf8_strchr(",;:!?)]} ",-1,
2337 g_utf8_get_char(g_utf8_next_char(aline))))
2339 if (pswit[ECHO_SWITCH])
2340 g_print("\n%s\n",aline);
2341 if (!pswit[OVERVIEW_SWITCH])
2342 g_print(" Line %ld column 1 - Wrongspaced quotes?\n",
2348 if (pswit[SQUOTE_SWITCH])
2350 nc=g_utf8_get_char(aline);
2351 for (s=aline;*s;s=g_utf8_next_char(s))
2354 nc=g_utf8_get_char(g_utf8_next_char(s));
/* A single quote is treated as a quotation mark when it starts the
 * line or does not have a letter on both sides. */
2355 if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&
2356 !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
2357 !g_unichar_isalpha(nc)))
2359 parities->squote=!parities->squote;
2360 if (!parities->squote)
2363 if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
2365 if (pswit[ECHO_SWITCH])
2366 g_print("\n%s\n",aline);
2367 if (!pswit[OVERVIEW_SWITCH])
2368 g_print(" Line %ld column %ld - "
2369 "Wrongspaced singlequotes?\n",
2370 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2378 if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
2379 !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
2381 if (pswit[ECHO_SWITCH])
2382 g_print("\n%s\n",aline);
2383 if (!pswit[OVERVIEW_SWITCH])
2384 g_print(" Line %ld column %ld - "
2385 "Wrongspaced singlequotes?\n",
2386 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2397 * check_for_double_punctuation:
2399 * Look for double punctuation like ,. or ,,
2400 * Thanks to DW for the suggestion!
2401 * In books with references, ".," and ".;" are common
2402 * e.g. "etc., etc.," and vol. 1.; vol 3.;
2403 * OTOH, from my initial tests, there are also fairly
2404 * common errors. What to do? Make these cases paranoid?
2405 * ".," is the most common, so warnings->dotcomma is used
2406 * to suppress detailed reporting if it occurs often.
/*
 * Query two adjacent characters that are both from the set .?!,;:
 *
 * aline:    NUL-terminated UTF-8 line.
 * warnings: dotcomma - while it is FALSE a period followed by a comma
 *           is exempt; isFrench - exempts an ellipsis ("...") directly
 *           before or after one of , ; : ! ?
 *
 * A doubled period, question mark or exclamation mark is always
 * exempt. The column reported is the 1-based column of the first of
 * the two characters.
 */
2408 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
2412 nc=g_utf8_get_char(aline);
2413 for (s=aline;*s;s=g_utf8_next_char(s))
2416 nc=g_utf8_get_char(g_utf8_next_char(s));
2417 /* for each punctuation character in the line */
2418 if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
2419 g_utf8_strchr(".?!,;:",-1,nc))
2421 /* followed by punctuation, it's a query, unless . . . */
2422 if (c==nc && (c=='.' || c=='?' || c=='!') ||
2423 !warnings->dotcomma && c=='.' && nc==',' ||
2424 warnings->isFrench && g_str_has_prefix(s,",...") ||
2425 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2426 warnings->isFrench && g_str_has_prefix(s,";...") ||
2427 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2428 warnings->isFrench && g_str_has_prefix(s,":...") ||
2429 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2430 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2431 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2432 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2433 warnings->isFrench && g_str_has_prefix(s,"...?"))
/* The French ellipsis cases are tested a second time inside the exempt
 * branch. NOTE(review): the statements they guard are only partly
 * visible here; presumably s is moved past the ellipsis - confirm. */
2435 if (warnings->isFrench && g_str_has_prefix(s,",...") ||
2436 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2437 warnings->isFrench && g_str_has_prefix(s,";...") ||
2438 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2439 warnings->isFrench && g_str_has_prefix(s,":...") ||
2440 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2441 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2442 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2443 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2444 warnings->isFrench && g_str_has_prefix(s,"...?"))
2447 nc=g_utf8_get_char(g_utf8_next_char(s));
2449 ; /* do nothing for .. !! and ?? which can be legit */
2453 if (pswit[ECHO_SWITCH])
2454 g_print("\n%s\n",aline);
2455 if (!pswit[OVERVIEW_SWITCH])
2456 g_print(" Line %ld column %ld - Double punctuation?\n",
2457 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2466 * check_for_spaced_quotes:
/*
 * Query a quote character that has a space on both sides.
 *
 * The ASCII double quote is searched for first. Then, for every
 * character in single_quotes[], a three-character pattern (space,
 * quote, space) is built in a GString and searched for in the same
 * way. The column reported is the 1-based column of the leading space.
 *
 * After a match the search resumes two characters after the start of
 * the match, i.e. at the trailing space, so that this space can also
 * open the next match.
 */
2468 void check_for_spaced_quotes(const char *aline)
2472 const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,
2476 while ((t=strstr(s," \" ")))
2478 if (pswit[ECHO_SWITCH])
2479 g_print("\n%s\n",aline);
2480 if (!pswit[OVERVIEW_SWITCH])
2481 g_print(" Line %ld column %ld - Spaced doublequote?\n",
2482 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2485 s=g_utf8_next_char(g_utf8_next_char(t));
2487 pattern=g_string_new(NULL);
2488 for(i=0;i<G_N_ELEMENTS(single_quotes);i++)
/* g_string_assign() replaces the previous contents, so the same
 * GString is reused for every quote character. */
2490 g_string_assign(pattern," ");
2491 g_string_append_unichar(pattern,single_quotes[i]);
2492 g_string_append_c(pattern,' ');
2494 while ((t=strstr(s,pattern->str)))
2496 if (pswit[ECHO_SWITCH])
2497 g_print("\n%s\n",aline);
2498 if (!pswit[OVERVIEW_SWITCH])
2499 g_print(" Line %ld column %ld - Spaced singlequote?\n",
2500 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2503 s=g_utf8_next_char(g_utf8_next_char(t));
/* TRUE also releases the character data held by the GString. */
2506 g_string_free(pattern,TRUE);
2510 * check_for_miscased_genative:
2512 * Check special case of 'S instead of 's at end of word.
/*
 * Query a capital S that follows an apostrophe which itself follows a
 * lower-case letter.
 *
 * The scan starts at the second character and stops once nc, the
 * character after the current one, is NUL. s points at the apostrophe,
 * so offset+2 is the 1-based column of the S.
 */
2514 void check_for_miscased_genative(const char *aline)
2520 c=g_utf8_get_char(aline);
/* Guard: do not step past the terminating NUL of an empty line. */
2521 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2522 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2526 nc=g_utf8_get_char(g_utf8_next_char(s));
2527 if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))
2529 if (pswit[ECHO_SWITCH])
2530 g_print("\n%s\n",aline);
2531 if (!pswit[OVERVIEW_SWITCH])
2532 g_print(" Line %ld column %ld - Capital \"S\"?\n",
2533 linecnt,g_utf8_pointer_to_offset(aline,s)+2);
2541 * check_end_of_line:
2543 * Now check special cases - start and end of line -
2544 * for single and double quotes. Start is sometimes [sic]
2545 * but better to query it anyway.
2546 * While we're here, check for dash at end of line.
/*
 * Check the two ends of aline:
 *   - a double or single quote as last character, preceded by a space
 *     (column reported: the line length in characters);
 *   - a single quote as first character, followed by a space
 *     (column reported: 1);
 *   - with pswit[PARANOID_SWITCH] and warnings->hyphen: a single
 *     hyphen (one not preceded by another hyphen) at the end of the
 *     line.
 *
 * aline:    NUL-terminated UTF-8 line; the end-of-line quote test is
 *           made only when the line has more than one character.
 * warnings: the hyphen member enables the hyphen query.
 */
2548 void check_end_of_line(const char *aline,struct warnings *warnings)
2553 lbytes=strlen(aline);
2554 if (g_utf8_strlen(aline,lbytes)>1)
/* aline+lbytes is the terminating NUL, so s is the last character and
 * c2 the one before it. */
2556 s=g_utf8_prev_char(aline+lbytes);
2557 c1=g_utf8_get_char(s);
2558 c2=g_utf8_get_char(g_utf8_prev_char(s));
2559 if ((CHAR_IS_DQUOTE(c1) || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)
2561 if (pswit[ECHO_SWITCH])
2562 g_print("\n%s\n",aline);
2563 if (!pswit[OVERVIEW_SWITCH])
2564 g_print(" Line %ld column %ld - Spaced quote?\n",linecnt,
2565 g_utf8_strlen(aline,lbytes));
/* c1 and c2 are reused for the first two characters of the line. */
2569 c1=g_utf8_get_char(aline);
2570 c2=g_utf8_get_char(g_utf8_next_char(aline));
2571 if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)
2573 if (pswit[ECHO_SWITCH])
2574 g_print("\n%s\n",aline);
2575 if (!pswit[OVERVIEW_SWITCH])
2576 g_print(" Line %ld column 1 - Spaced quote?\n",linecnt);
/* NOTE(review): the hyphen message prints the offset of s without the
 * +1 used by most other checks - confirm the intended column. */
2581 * Dash at end of line may well be legit - paranoid mode only
2582 * and don't report em-dash at line-end.
2584 if (pswit[PARANOID_SWITCH] && warnings->hyphen)
2586 for (s=g_utf8_prev_char(aline+lbytes);
2587 s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
2589 if (g_utf8_get_char(s)=='-' &&
2590 g_utf8_get_char(g_utf8_prev_char(s))!='-')
2592 if (pswit[ECHO_SWITCH])
2593 g_print("\n%s\n",aline);
2594 if (!pswit[OVERVIEW_SWITCH])
2595 g_print(" Line %ld column %ld - "
2596 "Hyphen at end of line?\n",
2597 linecnt,g_utf8_pointer_to_offset(aline,s));
2604 * check_for_unspaced_bracket:
2606 * Brackets are often unspaced, but shouldn't be surrounded by alpha.
2607 * If so, suspect a scanno like "a]most".
/*
 * Query a bracket character from {[()]} that has a letter immediately
 * before it and a letter immediately after it.
 *
 * The scan starts at the second character and stops once nc, the
 * character after the current one, is NUL, so the first and last
 * characters of the line are never tested.
 * NOTE(review): the column printed is the offset of s without +1,
 * unlike most other checks - confirm the intended column.
 */
2609 void check_for_unspaced_bracket(const char *aline)
2613 c=g_utf8_get_char(aline);
/* Guard: do not step past the terminating NUL of an empty line. */
2614 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2615 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2619 nc=g_utf8_get_char(g_utf8_next_char(s));
2622 /* for each bracket character in the line except 1st & last */
2623 if (g_utf8_strchr("{[()]}",-1,c) &&
2624 g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
2626 if (pswit[ECHO_SWITCH])
2627 g_print("\n%s\n",aline);
2628 if (!pswit[OVERVIEW_SWITCH])
2629 g_print(" Line %ld column %ld - Unspaced bracket?\n",
2630 linecnt,g_utf8_pointer_to_offset(aline,s));
2638 * check_for_unpunctuated_endquote:
/*
 * check_for_unpunctuated_endquote - query a closing or neutral double
 * quote that directly follows a letter.
 *
 * aline: the line to examine (NUL-terminated UTF-8).
 *
 * Only double quotes are classified: for any other character qc is
 * INVALID_QUOTE and nothing is reported. pc is assigned on a line that is
 * not visible here - presumably it is the character before c; confirm.
 */
2640 void check_for_unpunctuated_endquote(const char *aline)
2645 c=g_utf8_get_char(aline);
/* Guard the look-ahead so an empty line is not read past its terminator. */
2646 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2647 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
/* Classify c as a quote only when it is a double quote. */
2651 qc=CHAR_IS_DQUOTE(c)?QUOTE_CLASS(c):INVALID_QUOTE;
2652 nc=g_utf8_get_char(g_utf8_next_char(s));
2653 /* for each character in the line except 1st */
2654 if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) && g_unichar_isalpha(pc))
2656 if (pswit[ECHO_SWITCH])
2657 g_print("\n%s\n",aline);
2658 if (!pswit[OVERVIEW_SWITCH])
2659 g_print(" Line %ld column %ld - "
2660 "endquote missing punctuation?\n",
2661 linecnt,g_utf8_pointer_to_offset(aline,s));
2669 * check_for_html_tag:
2671 * Check for <HTML TAG>.
2673 * If there is a < in the line, followed at some point
2674 * by a > then we suspect HTML.
/*
 * check_for_html_tag - query text that looks like an HTML tag.
 *
 * aline: the line to examine (NUL-terminated UTF-8).
 *
 * open is the first < in the line and close the first > after it. The
 * column printed is the 1-based character position of the <, and the
 * message shows the text from < to > inclusive.
 * The NULL tests on open and close are not on the lines visible here.
 */
2676 void check_for_html_tag(const char *aline)
2678 const char *open,*close;
2680 open=strchr(aline,'<');
2683 close=strchr(g_utf8_next_char(open),'>');
2686 if (pswit[ECHO_SWITCH])
2687 g_print("\n%s\n",aline);
2688 if (!pswit[OVERVIEW_SWITCH])
/* tag is a newly allocated copy. NOTE(review): confirm it is released with g_free. */
2690 tag=g_strndup(open,close-open+1);
2691 g_print(" Line %ld column %ld - HTML Tag? %s \n",
2692 linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
2702 * check_for_html_entity:
2704 * Check for &symbol; HTML.
2706 * If there is a & in the line, followed at
2707 * some point by a ; then we suspect HTML.
/*
 * check_for_html_entity - query text that looks like an HTML entity.
 *
 * aline: the line to examine (NUL-terminated UTF-8).
 *
 * amp is the first & in the line and scolon the first ; at or after it.
 * The text between them is scanned for a space; the scan stops early on
 * one, which presumably suppresses the query (the test that follows the
 * scan is not on the lines visible here).
 *
 * NOTE(review): the column printed is the byte offset of the & plus one,
 * whereas check_for_html_tag prints a character offset. The two differ
 * when multi-byte characters come before the & - confirm which is meant.
 */
2709 void check_for_html_entity(const char *aline)
2711 const char *s,*amp,*scolon;
2713 amp=strchr(aline,'&');
2716 scolon=strchr(amp,';');
2719 for (s=amp;s<scolon;s=g_utf8_next_char(s))
2720 if (g_utf8_get_char(s)==CHAR_SPACE)
2721 break; /* Don't report "Jones & Son;" */
2724 if (pswit[ECHO_SWITCH])
2725 g_print("\n%s\n",aline);
2726 if (!pswit[OVERVIEW_SWITCH])
/* entity is a newly allocated copy of the text from & to ; inclusive. */
2728 entity=g_strndup(amp,scolon-amp+1);
2729 g_print(" Line %ld column %d - HTML symbol? %s \n",
2730 linecnt,(int)(amp-aline)+1,entity);
2741 * check_for_omitted_punctuation:
2743 * Check for omitted punctuation at end of paragraph by working back
2744 * through prevline. DW.
2745 * Need to check this only for "normal" paras.
2746 * So what is a "normal" para?
2747 * Not normal if one-liner (chapter headings, etc.)
2748 * Not normal if doesn't contain at least one locase letter
2749 * Not normal if starts with space
/*
 * check_for_omitted_punctuation - query a paragraph whose last line ends
 * in a letter rather than punctuation.
 *
 * prevline:        the final line of the paragraph (NUL-terminated UTF-8).
 * last:            properties of that line; only last->blen is read on the
 *                  lines visible here.
 * start_para_line: line number on which the paragraph started.
 *
 * The check runs only when prevline contains a letter, last->blen is
 * greater than 2, start_para_line is less than linecnt-1 and prevline
 * starts with a character above CHAR_SPACE. Trailing closing or neutral
 * quotes are stepped over, then the line is scanned backwards: reaching a
 * letter produces the query, reported against line linecnt-1. The
 * characters tested on the last visible line (dashes, stops, brackets and
 * so on) presumably end the scan without a query - the statement that
 * follows that test is not on the lines visible here.
 */
2751 void check_for_omitted_punctuation(const char *prevline,
2752 struct line_properties *last,int start_para_line)
2754 gboolean letter_on_line=FALSE;
2757 gboolean closing_quote;
/* Does the line contain any letter at all? */
2758 for (s=prevline;*s;s=g_utf8_next_char(s))
2759 if (g_unichar_isalpha(g_utf8_get_char(s)))
2761 letter_on_line=TRUE;
2765 * This next "if" is a problem.
2766 * If we say "start_para_line <= linecnt - 1", that includes
2767 * one-line "paragraphs" like chapter heads. Lotsa false positives.
2768 * If we say "start_para_line < linecnt - 1" it doesn't, but then it
2769 * misses genuine one-line paragraphs.
2771 if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
2772 g_utf8_get_char(prevline)>CHAR_SPACE)
2774 s=prevline+strlen(prevline);
2777 s=g_utf8_prev_char(s);
2778 c=g_utf8_get_char(s);
2779 if (QUOTE_CLASS(c)==CLOSING_QUOTE || QUOTE_CLASS(c)==NEUTRAL_QUOTE)
2782 closing_quote=FALSE;
2783 } while (closing_quote && s>prevline);
2784 for (;s>prevline;s=g_utf8_prev_char(s))
2786 if (g_unichar_isalpha(g_utf8_get_char(s)))
2788 if (pswit[ECHO_SWITCH])
2789 g_print("\n%s\n",prevline);
2790 if (!pswit[OVERVIEW_SWITCH])
2791 g_print(" Line %ld column %ld - "
2792 "No punctuation at para end?\n",
2793 linecnt-1,g_utf8_strlen(prevline,-1));
2798 if (g_utf8_strchr("—-.:!([{?}])",-1,g_utf8_get_char(s)))
/*
 * report_duplicate_queries - print a note for one entry of a query tree.
 *
 * Has the shape of a GTraverseFunc and is handed to g_tree_foreach() for
 * the qword tree in procfile().
 *
 * key:   the queried word (used as a string).
 * value: presumably holds the duplicate count printed with %d - the
 *        arguments of the g_print call are not on the lines visible here;
 *        confirm.
 * data:  not used on the lines visible here.
 *
 * The return value is set on a line that is not visible here.
 */
2804 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
2806 const char *word=key;
2809 g_print("\nNote: Queried word %s was duplicated %d times\n",
/*
 * print_as_windows_1252 - write a UTF-8 string to stdout converted to
 * Windows-1252.
 *
 * string: NUL-terminated UTF-8 text. procfile() also calls this with NULL
 *         when it has finished; presumably that is what selects the
 *         converter shutdown below (the test is not on the lines visible
 *         here) - confirm.
 *
 * The iconv converter is kept in a static variable and opened on first
 * use. The output buffer is sized from the byte length of the input.
 * NOTE(review): the result of g_iconv is not examined on the visible
 * line; confirm how unconvertible characters are handled.
 */
2814 void print_as_windows_1252(const char *string)
2816 gsize inbytes,outbytes;
2818 static GIConv converter=(GIConv)-1;
/* Release the cached converter and mark it as closed. */
2821 if (converter!=(GIConv)-1)
2822 g_iconv_close(converter);
2823 converter=(GIConv)-1;
/* Open the converter lazily and keep it for later calls. */
2826 if (converter==(GIConv)-1)
2827 converter=g_iconv_open("WINDOWS-1252","UTF-8");
2828 if (converter!=(GIConv)-1)
2830 inbytes=outbytes=strlen(string);
2831 bp=buf=g_malloc(outbytes+1);
2832 g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
/* Presumably the fallback when no converter could be opened: emit the bytes unchanged (TODO confirm). */
2838 fputs(string,stdout);
/*
 * print_as_utf_8 - write string to stdout unchanged.
 *
 * string: NUL-terminated text to print.
 */
2841 void print_as_utf_8(const char *string)
2843 fputs(string,stdout);
/*
 * procfile - run all checks over one text file and print the queries.
 *
 * filename: path of the file; passed to read_etext() and used in messages.
 *
 * Outline of the visible code:
 *  - read the file; a read error is printed on stdout when
 *    pswit[STDOUT_SWITCH] is set, otherwise on stderr;
 *  - run first_pass() and report_first_pass() to obtain the warnings
 *    structure that enables some of the later checks;
 *  - create the qword and qperiod trees;
 *  - main pass: fetch lines with flgets(), skip DP page separators and
 *    everything before firstline or after a known footerline, then run
 *    the per-line checks;
 *  - check quote balance and omitted punctuation around paragraph ends
 *    (the tests selecting those calls are not on the lines visible here);
 *  - finally report duplicated queries, release the trees and counters
 *    and reset the print handler.
 *
 * NOTE(review): strcmp is cast to GCompareDataFunc, which takes a third
 * argument; this relies on the extra argument being ignored.
 * NOTE(review): the trailing-space scan before the end-dash test reads
 * the character at s before testing s>aline, and s starts one character
 * before the end of the line; for an empty line that is in front of
 * aline - confirm this is harmless.
 */
2851 void procfile(const char *filename)
2854 gchar *parastart=NULL; /* first line of current para */
2855 gchar *etext,*aline;
2858 struct first_pass_results *first_pass_results;
2859 struct warnings *warnings;
2860 struct counters counters={0};
2861 struct line_properties last={0};
2862 struct parities parities={0};
2863 struct pending pending={0};
2864 gboolean isemptyline;
2865 long start_para_line=0;
2866 gboolean isnewpara=FALSE,enddash=FALSE;
2867 last.start=CHAR_SPACE;
2868 linecnt=checked_linecnt=0;
2869 etext=read_etext(filename,&err);
2872 if (pswit[STDOUT_SWITCH])
2873 fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
2875 fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
2878 g_print("\n\nFile: %s\n\n",filename);
2879 first_pass_results=first_pass(etext);
2880 warnings=report_first_pass(first_pass_results);
/* qword releases keys and values with g_free; qperiod releases keys only. */
2881 qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
2882 qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
2884 * Here we go with the main pass. Hold onto yer hat!
2888 while ((aline=flgets(&etext_ptr,linecnt+1,warnings->nocr)))
2893 if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
2894 continue; // skip DP page separators completely
/* Outside the body of the text: optionally print header fields, run no checks. */
2895 if (linecnt<first_pass_results->firstline ||
2896 (first_pass_results->footerline>0 &&
2897 linecnt>first_pass_results->footerline))
2899 if (pswit[HEADER_SWITCH])
2901 if (g_str_has_prefix(aline,"Title:"))
2902 g_print(" %s\n",aline);
2903 if (g_str_has_prefix(aline,"Author:"))
2904 g_print(" %s\n",aline);
2905 if (g_str_has_prefix(aline,"Release Date:"))
2906 g_print(" %s\n",aline);
2907 if (g_str_has_prefix(aline,"Edition:"))
2908 g_print(" %s\n\n",aline);
2910 continue; /* skip through the header */
2913 print_pending(aline,parastart,&pending);
2914 isemptyline=analyse_quotes(aline,&counters);
2915 if (isnewpara && !isemptyline)
2917 /* This line is the start of a new paragraph. */
2918 start_para_line=linecnt;
2919 /* Capture its first line in case we want to report it later. */
2921 parastart=g_strdup(aline);
2922 memset(&parities,0,sizeof(parities)); /* restart the quote count */
/* Advance s to the first letter or digit (or the end of the line). */
2924 while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
2925 !g_unichar_isdigit(g_utf8_get_char(s)))
2926 s=g_utf8_next_char(s);
2927 if (g_unichar_islower(g_utf8_get_char(s)))
2929 /* and its first letter is lowercase */
2930 if (pswit[ECHO_SWITCH])
2931 g_print("\n%s\n",aline);
2932 if (!pswit[OVERVIEW_SWITCH])
2933 g_print(" Line %ld column %ld - "
2934 "Paragraph starts with lower-case\n",
2935 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2939 isnewpara=FALSE; /* Signal the end of new para processing. */
2941 /* Check for an em-dash broken at line end. */
2942 if (enddash && g_utf8_get_char(aline)=='-')
2944 if (pswit[ECHO_SWITCH])
2945 g_print("\n%s\n",aline);
2946 if (!pswit[OVERVIEW_SWITCH])
2947 g_print(" Line %ld column 1 - Broken em-dash?\n",linecnt);
/* Step back over trailing spaces to the last significant character. */
2952 for (s=g_utf8_prev_char(aline+strlen(aline));
2953 g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
2955 if (s>=aline && g_utf8_get_char(s)=='-')
2957 check_for_control_characters(aline);
2959 check_for_odd_characters(aline,warnings,isemptyline);
2960 if (warnings->longline)
2961 check_for_long_line(aline);
2962 if (warnings->shortline)
2963 check_for_short_line(aline,&last);
/* Remember this line's character length and first character. */
2965 last.len=g_utf8_strlen(aline,-1);
2966 last.start=g_utf8_get_char(aline);
2967 check_for_starting_punctuation(aline);
2970 check_for_spaced_emdash(aline);
2971 check_for_spaced_dash(aline);
2973 check_for_unmarked_paragraphs(aline);
2974 check_for_jeebies(aline);
2975 check_for_mta_from(aline);
2976 check_for_orphan_character(aline);
2977 check_for_pling_scanno(aline);
2978 check_for_extra_period(aline,warnings);
2979 check_for_following_punctuation(aline);
2980 check_for_typos(aline,warnings);
2981 check_for_misspaced_punctuation(aline,&parities,isemptyline);
2982 check_for_double_punctuation(aline,warnings);
2983 check_for_spaced_quotes(aline);
2984 check_for_miscased_genative(aline);
2985 check_end_of_line(aline,warnings);
2986 check_for_unspaced_bracket(aline);
2987 if (warnings->endquote)
2988 check_for_unpunctuated_endquote(aline);
2989 check_for_html_tag(aline);
2990 check_for_html_entity(aline);
2993 check_for_mismatched_quotes(&counters,&pending);
2994 counters_reset(&counters);
2995 /* let the next iteration know that it's starting a new para */
2998 check_for_omitted_punctuation(prevline,&last,start_para_line);
/* Keep a private copy of this line for the next iteration. */
3001 prevline=g_strdup(aline);
3004 check_for_mismatched_quotes(&counters,&pending);
3005 print_pending(NULL,parastart,&pending);
3006 reset_pending(&pending);
3015 if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])
3016 g_tree_foreach(qword,report_duplicate_queries,NULL);
3017 g_tree_unref(qword);
3018 g_tree_unref(qperiod);
3019 counters_destroy(&counters);
3020 g_set_print_handler(NULL);
/* Presumably the NULL argument asks the printer to release its converter (TODO confirm). */
3021 print_as_windows_1252(NULL);
3022 if (pswit[MARKUP_SWITCH])
3029 * Get one line from the input text, checking for
3030 * the existence of exactly one CR/LF line-end per line.
3032 * Returns: a pointer to the line.
/*
 * flgets - fetch the next line from the text buffer.
 *
 * etext:     in/out cursor into the text; *etext is advanced as characters
 *            are consumed.
 * lcnt:      line number used in the messages printed here.
 * warn_nocr: a lone LF is queried only when this is set and
 *            pswit[LINE_END_SWITCH] is on.
 *
 * The line starts at the value *etext had on entry. With
 * pswit[LINE_END_SWITCH] set, line-end oddities are queried: a missing LF,
 * an LF without CR, two successive CRs and a CR without LF. Each query
 * that echoes the line prints a temporary copy of the text from theline
 * up to eos. Near the end the line is passed through
 * postprocess_for_HTML() when pswit[MARKUP_SWITCH] is set and through
 * postprocess_for_DP() when pswit[DP_SWITCH] is set.
 *
 * Much of the control flow (the loop, the tests on c, where eos is set
 * and the return statements) is not on the lines visible here - confirm
 * the details against the full source.
 */
3034 char *flgets(char **etext,long lcnt,gboolean warn_nocr)
3037 gboolean isCR=FALSE;
3038 char *theline=*etext;
3043 c=g_utf8_get_char(*etext);
3046 if (*etext==theline)
3048 else if (pswit[LINE_END_SWITCH])
3050 if (pswit[ECHO_SWITCH])
3052 s=g_strndup(theline,eos-theline);
3053 g_print("\n%s\n",s);
3056 if (!pswit[OVERVIEW_SWITCH])
3057 /* There may, or may not, have been a CR */
3058 g_print(" Line %ld - No LF?\n",lcnt);
3064 *etext=g_utf8_next_char(*etext);
3065 /* either way, it's end of line */
3072 /* Error - a LF without a preceding CR */
3073 if (pswit[LINE_END_SWITCH] && warn_nocr)
3075 if (pswit[ECHO_SWITCH])
3077 s=g_strndup(theline,eos-theline);
3078 g_print("\n%s\n",s);
3081 if (!pswit[OVERVIEW_SWITCH])
3082 g_print(" Line %ld - No CR?\n",lcnt);
3093 /* Error - two successive CRs */
3094 if (pswit[LINE_END_SWITCH])
3096 if (pswit[ECHO_SWITCH])
3098 s=g_strndup(theline,eos-theline);
3099 g_print("\n%s\n",s);
3102 if (!pswit[OVERVIEW_SWITCH])
3103 g_print(" Line %ld - Two successive CRs?\n",lcnt);
3112 if (pswit[LINE_END_SWITCH] && isCR)
3114 if (pswit[ECHO_SWITCH])
3116 s=g_strndup(theline,eos-theline);
3117 g_print("\n%s\n",s);
3120 if (!pswit[OVERVIEW_SWITCH])
3121 g_print(" Line %ld column %ld - CR without LF?\n",
3122 lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
3128 eos=g_utf8_next_char(eos);
3132 if (pswit[MARKUP_SWITCH])
3133 postprocess_for_HTML(theline);
3134 if (pswit[DP_SWITCH])
3135 postprocess_for_DP(theline);
3142 * Takes a "word" as a parameter, and checks whether it
3143 * contains a mixture of alpha and digits. Generally, this is an
3144 * error, but may not be for cases like 4th or L5 12s. 3d.
3146 * Returns: TRUE iff an error is found.
/*
 * mixdigit - decide whether a word that mixes letters and digits should
 * be queried.
 *
 * checkword: the word to test (NUL-terminated UTF-8).
 *
 * The word is first scanned for at least one letter and one digit. When
 * both are present a set of accepted shapes is tested: leading digits
 * followed by st, rd, nd or th, by their plural or -ly forms (any ASCII
 * case), by l (any case) or by s or d (lower case only); and a capital L
 * followed by a remainder for which mixdigit itself returns FALSE.
 *
 * The statements that set the flags and the return statement are not on
 * the lines visible here; presumably each accepted shape clears query and
 * query is returned - confirm.
 */
3148 gboolean mixdigit(const char *checkword)
3150 gboolean wehaveadigit,wehavealetter,query;
3151 const char *s,*nondigit;
3152 wehaveadigit=wehavealetter=query=FALSE;
3153 for (s=checkword;*s;s=g_utf8_next_char(s))
3154 if (g_unichar_isalpha(g_utf8_get_char(s)))
3156 else if (g_unichar_isdigit(g_utf8_get_char(s)))
3158 if (wehaveadigit && wehavealetter)
3160 /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
/* Skip the leading digits so that nondigit points at the suffix. */
3162 for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
3163 nondigit=g_utf8_next_char(nondigit))
3165 /* digits, ending in st, rd, nd, th of either case */
3166 if (!g_ascii_strcasecmp(nondigit,"st") ||
3167 !g_ascii_strcasecmp(nondigit,"rd") ||
3168 !g_ascii_strcasecmp(nondigit,"nd") ||
3169 !g_ascii_strcasecmp(nondigit,"th"))
3171 if (!g_ascii_strcasecmp(nondigit,"sts") ||
3172 !g_ascii_strcasecmp(nondigit,"rds") ||
3173 !g_ascii_strcasecmp(nondigit,"nds") ||
3174 !g_ascii_strcasecmp(nondigit,"ths"))
3176 if (!g_ascii_strcasecmp(nondigit,"stly") ||
3177 !g_ascii_strcasecmp(nondigit,"rdly") ||
3178 !g_ascii_strcasecmp(nondigit,"ndly") ||
3179 !g_ascii_strcasecmp(nondigit,"thly"))
3181 /* digits, ending in l, L, s or d */
3182 if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
3183 !strcmp(nondigit,"d"))
3186 * L at the start of a number, representing Britsh pounds, like L500.
3187 * This is cute. We know the current word is mixed digit. If the first
3188 * letter is L, there must be at least one digit following. If both
3189 * digits and letters follow, we have a genuine error, else we have a
3190 * capital L followed by digits, and we accept that as a non-error.
3192 if (g_utf8_get_char(checkword)=='L' &&
3193 !mixdigit(g_utf8_next_char(checkword)))
3202 * Extracts the first/next "word" from the line, and returns it.
3203 * A word is defined as one English word unit--or at least that's the aim.
3204 * "ptr" is advanced to the position in the line where we will start
3205 * looking for the next word.
3207 * Returns: A newly-allocated string.
/*
 * getaword - extract the next word from a line.
 *
 * ptr: in/out cursor into the line; it is moved forward over the text
 *      that has been consumed.
 *
 * Returns the character data of a GString released with
 * g_string_free(word,FALSE), i.e. a newly allocated string handed over to
 * the caller.
 *
 * Three shapes are recognised in turn:
 *  1. while skipping leading characters that are neither letters nor
 *     digits, a footnote marker: [ followed by digits and ];
 *  2. a run of letters, digits, commas and stops in which a stop or comma
 *     directly follows a digit (numbers such as 1,000 or 1.35);
 *  3. otherwise a run of letters, digits and apostrophes.
 *
 * The statements between the test in shape 2 and its return are not on
 * the lines visible here.
 */
3209 gchar *getaword(const char **ptr)
3214 word=g_string_new(NULL);
3215 for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
3216 !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
3217 **ptr;*ptr=g_utf8_next_char(*ptr))
3219 /* Handle exceptions for footnote markers like [1] */
3220 if (g_utf8_get_char(*ptr)=='[')
3222 g_string_append_c(word,'[');
3223 s=g_utf8_next_char(*ptr);
3224 for (;g_unichar_isdigit(g_utf8_get_char(s));s=g_utf8_next_char(s))
3225 g_string_append_unichar(word,g_utf8_get_char(s));
3226 if (g_utf8_get_char(s)==']')
3228 g_string_append_c(word,']');
3229 *ptr=g_utf8_next_char(s);
3230 return g_string_free(word,FALSE);
/* Not a complete marker: discard what was collected. */
3233 g_string_truncate(word,0);
3237 * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
3238 * Especially yucky is the case of L1,000
3239 * This section looks for a pattern of characters including a digit
3240 * followed by a comma or period followed by one or more digits.
3241 * If found, it returns this whole pattern as a word; otherwise we discard
3242 * the results and resume our normal programming.
3245 for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
3246 g_unichar_isalpha(g_utf8_get_char(s)) ||
3247 g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
3248 g_string_append_unichar(word,g_utf8_get_char(s));
/* Scan the candidate from its second character, comparing each character with the one before it. */
3251 for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))
3253 c=g_utf8_get_char(t);
3254 pc=g_utf8_get_char(g_utf8_prev_char(t));
3255 if ((c=='.' || c==',') && g_unichar_isdigit(pc))
3258 return g_string_free(word,FALSE);
3262 /* we didn't find a punctuated number - do the regular getword thing */
3263 g_string_truncate(word,0);
3264 c=g_utf8_get_char(*ptr);
3265 for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);
3266 *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))
3267 g_string_append_unichar(word,c);
3268 return g_string_free(word,FALSE);
3274 * Is this word a Roman Numeral?
3276 * It doesn't actually validate that the number is a valid Roman Numeral--for
3277 * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
3278 * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
3279 * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
3280 * expressions thereof, except when it came to taxes. Allow any number of M,
3281 * an optional D, an optional CM or CD, any number of optional Cs, an optional
3282 * XL or an optional XC, an optional IX or IV, an optional V and any number of optional Is.
/*
 * isroman - test whether a word looks like a Roman numeral.
 *
 * t: the word to test; the comparisons are against lower-case letters
 *    only.
 *
 * The visible tests look, in order, for: any number of m, one d, cm, cd,
 * any number of c, xl, xc, one l, any number of x, ix, iv, one v and any
 * number of i. The statements that advance t and the return statement
 * are not on the lines visible here.
 *
 * NOTE(review): the trailing test of *t in the while conditions cannot
 * change the result, because a character equal to a letter is never NUL.
 */
3285 gboolean isroman(const char *t)
3291 while (g_utf8_get_char(t)=='m' && *t)
3293 if (g_utf8_get_char(t)=='d')
3295 if (g_str_has_prefix(t,"cm"))
3297 if (g_str_has_prefix(t,"cd"))
3299 while (g_utf8_get_char(t)=='c' && *t)
3301 if (g_str_has_prefix(t,"xl"))
3303 if (g_str_has_prefix(t,"xc"))
3305 if (g_utf8_get_char(t)=='l')
3307 while (g_utf8_get_char(t)=='x' && *t)
3309 if (g_str_has_prefix(t,"ix"))
3311 if (g_str_has_prefix(t,"iv"))
3313 if (g_utf8_get_char(t)=='v')
3315 while (g_utf8_get_char(t)=='i' && *t)
3321 * postprocess_for_DP:
3323 * Invoked with the -d switch from flgets().
3324 * It simply "removes" from the line a hard-coded set of common
3325 * DP-specific tags, so that the line passed to the main routine has
3326 * been pre-cleaned of DP markup.
/*
 * postprocess_for_DP - delete DP markup from a line, in place.
 *
 * theline: writable NUL-terminated line; it can only get shorter.
 *
 * DPmarkup is walked until its first empty string. Every occurrence of
 * each entry is cut out by moving the remainder of the line, including
 * the terminating NUL, down over it.
 */
3328 void postprocess_for_DP(char *theline)
3334 for (i=0;*DPmarkup[i];i++)
3335 while ((s=strstr(theline,DPmarkup[i])))
/* t is the first byte after the markup; memmove is used because the regions overlap. */
3337 t=s+strlen(DPmarkup[i]);
3338 memmove(s,t,strlen(t)+1);
3343 * postprocess_for_HTML:
3345 * Invoked with the -m switch from flgets().
3346 * It simply "removes" from the line a hard-coded set of common
3347 * HTML tags and "replaces" a hard-coded set of common HTML
3348 * entities, so that the line passed to the main routine has
3349 * been pre-cleaned of HTML.
/*
 * postprocess_for_HTML - strip known HTML tags from a line and replace
 * HTML entities, in place.
 *
 * theline: writable NUL-terminated line.
 *
 * losemarkup() is called repeatedly for as long as it returns a non-NULL
 * pointer; loseentities() is then applied to the line.
 */
3351 void postprocess_for_HTML(char *theline)
3353 while (losemarkup(theline))
3355 loseentities(theline);
/*
 * losemarkup - remove one recognised HTML tag from a line, in place.
 *
 * theline: writable NUL-terminated line.
 *
 * s is the first < in the line and t the first > at or after it (NULL
 * when there is no <). The text after the < is compared with each entry
 * of markup, a list ended by an empty string, using tagcomp(); on a match
 * the span from s to t inclusive is cut out of the line.
 *
 * The return statements are not on the lines visible here;
 * postprocess_for_HTML() keeps calling while the result is non-NULL.
 */
3358 char *losemarkup(char *theline)
3362 s=strchr(theline,'<');
3363 t=s?strchr(s,'>'):NULL;
3366 for (i=0;*markup[i];i++)
3367 if (tagcomp(g_utf8_next_char(s),markup[i]))
/* Step past the > so that the whole tag is removed. */
3369 t=g_utf8_next_char(t);
3370 memmove(s,t,strlen(t)+1);
3373 /* It's an unrecognized <xxx>. */
/*
 * loseentities - replace HTML entities in a line, in place.
 *
 * theline: writable NUL-terminated line. The visible code also contains a
 *          cleanup path that destroys the entity tree and closes both
 *          converters - presumably taken when theline is NULL; confirm.
 *
 * For each & that has a ; after it the entity is decoded to a code point
 * c: a run of decimal digits starting at amp+2, or an x followed by
 * hexadecimal digits, is parsed with strtol; anything else is looked up
 * by name in a tree filled from HTMLentities. Code points below 128 or in
 * 192..255 are written as UTF-8 directly; others are transliterated to
 * ISO-8859-1 and converted back to UTF-8 before being copied in. The
 * remainder of the line is then moved down to follow the replacement.
 *
 * NOTE(review): entities is declared without static while both
 * converters are static. If that is accurate the tree is rebuilt on each
 * call and the cleanup path never sees it - confirm.
 * NOTE(review): in the range test below the && binds tighter than the
 * ||; parentheses would make that explicit.
 */
3377 void loseentities(char *theline)
3384 GTree *entities=NULL;
3385 static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
3389 g_tree_destroy(entities);
3391 if (translit!=(GIConv)-1)
3392 g_iconv_close(translit);
3393 translit=(GIConv)-1;
3394 if (to_utf8!=(GIConv)-1)
3395 g_iconv_close(to_utf8);
/* Build the name to code point lookup from the HTMLentities table. */
3403 entities=g_tree_new((GCompareFunc)strcmp);
3404 for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
3405 g_tree_insert(entities,HTMLentities[i].name,
3406 GUINT_TO_POINTER(HTMLentities[i].c));
/* Open both converters on first use and keep them for later calls. */
3408 if (translit==(GIConv)-1)
3409 translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
3410 if (to_utf8==(GIConv)-1)
3411 to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
3412 while((amp=strchr(theline,'&')))
3414 scolon=strchr(amp,';');
3419 if (amp+2+strspn(amp+2,"0123456789")==scolon)
3420 c=strtol(amp+2,NULL,10);
3421 else if (amp[2]=='x' &&
3422 amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
3423 c=strtol(amp+3,NULL,16);
/* Named entity: look up the text between the & and the ; in the tree. */
3427 s=g_strndup(amp+1,scolon-(amp+1));
3428 c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
3437 if (c<128 || c>=192 && c<=255) /* An ISO-8859-1 character */
3438 theline+=g_unichar_to_utf8(c,theline);
3442 nb=g_unichar_to_utf8(c,s);
3443 t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
3445 s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
3447 memcpy(theline,s,nb);
3451 memmove(theline,g_utf8_next_char(scolon),
3452 strlen(g_utf8_next_char(scolon))+1);
/* Continue the search just after this &. */
3455 theline=g_utf8_next_char(amp);
/*
 * tagcomp - case-insensitively test whether a tag starts with basetag.
 *
 * strin:   text following the opening < of a tag; a leading slash is
 *          skipped so that closing tags match as well.
 * basetag: the tag name to look for.
 *
 * Both strings are case-folded with g_utf8_casefold() and retval records
 * whether the folded tag has the folded basetag as a prefix.
 * NOTE(review): the folded copies are newly allocated; their release and
 * the return statement are not on the lines visible here - confirm.
 */
3459 gboolean tagcomp(const char *strin,const char *basetag)
3463 if (g_utf8_get_char(strin)=='/')
3464 t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
3466 t=g_utf8_casefold(strin,-1);
3467 s=g_utf8_casefold(basetag,-1);
3468 retval=g_str_has_prefix(t,s);
3474 void proghelp(GOptionContext *context)
3477 fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
3478 fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
3479 fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
3480 fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
3481 "For details, read the file COPYING.\n",stderr);
3482 fputs("This is Free Software; "
3483 "you may redistribute it under certain conditions (GPL);\n",stderr);
3484 fputs("read the file COPYING for details.\n\n",stderr);
3485 help=g_option_context_get_help(context,TRUE,NULL);
3488 fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
3489 fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
3490 "non-ASCII\n",stderr);
3491 fputs("characters like accented letters, "
3492 "lines longer than 75 or shorter than 55,\n",stderr);
3493 fputs("unbalanced quotes or brackets, "
3494 "a variety of badly formatted punctuation, \n",stderr);
3495 fputs("HTML tags, some likely typos. "
3496 "It is NOT a substitute for human judgement.\n",stderr);