1 /*************************************************************************/
2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com> */
5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk> */
7 /* This program is free software; you can redistribute it and/or modify */
8 /* it under the terms of the GNU General Public License as published by */
9 /* the Free Software Foundation; either version 2 of the License, or */
10 /* (at your option) any later version. */
12 /* This program is distributed in the hope that it will be useful, */
13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
15 /* GNU General Public License for more details. */
17 /* You should have received a copy of the GNU General Public License */
18 /* along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 /*************************************************************************/
30 #include "bookloupe.h"
33 #include "HTMLentities.h"
39 "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
40 "nad", "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa",
41 "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
42 "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
43 "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
44 "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
45 "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
46 "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
47 "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
48 "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
49 "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
50 "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
51 "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
52 "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
53 "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
54 "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
55 "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
56 "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
57 "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
58 "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
59 "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
60 "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
61 "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
62 "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
63 "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
64 "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
65 "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
66 "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
67 "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
73 /* Common abbreviations and other OK words not to query as typos. */
75 "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
76 "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
77 "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
78 "outbid", "outbids", "frostbite", "frostbitten", ""
81 /* Common abbreviations that cause otherwise unexplained periods. */
83 "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
84 "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
88 * Two-Letter combinations that rarely if ever start words,
89 * but are common scannos or otherwise common letter combinations.
92 "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
96 * Two-Letter combinations that rarely if ever end words,
97 * but are common scannos or otherwise common letter combinations.
100 "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
101 "sw", "gr", "sl", "cl", "iy", ""
105 "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
106 "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
107 "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
108 "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
112 "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
116 "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
117 "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
118 "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
119 "during", "let", "toward", "among", ""
123 "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
124 "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
125 "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
126 "among", "those", "into", "whom", "having", "thence", ""
129 gboolean pswit[SWITNO]; /* program switches */
131 gboolean typo_compat,paranoid_compat;
/*
 * Switches that may be given on the command line or, through
 * parse_config_file(), as keys of the configuration file.  Each entry
 * writes to one element of pswit[].  G_OPTION_FLAG_REVERSE makes GLib
 * store FALSE instead of TRUE when the option is seen, and
 * G_OPTION_FLAG_HIDDEN keeps the entry out of the --help listing.
 */
133 static GOptionEntry options[]={
134 { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
135 "Ignore DP-specific markup", NULL },
136 { "no-dp", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
137 G_OPTION_ARG_NONE, pswit+DP_SWITCH,
138 "Don't ignore DP-specific markup", NULL },
139 { "echo", 0, G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
140 "Echo queried line", NULL },
141 { "no-echo", 'e', G_OPTION_FLAG_REVERSE,
142 G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
143 "Don't echo queried line", NULL },
144 { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
145 "Check single quotes", NULL },
146 { "no-squote", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
147 G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
148 "Don't check single quotes", NULL },
149 { "typo", 0, 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
150 "Check common typos", NULL },
151 { "no-typo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
152 G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
153 "Don't check common typos", NULL },
154 { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
155 "Require closure of quotes on every paragraph", NULL },
156 { "no-qpara", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
157 G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
158 "Don't require closure of quotes on every paragraph", NULL },
159 { "paranoid", 0, G_OPTION_FLAG_HIDDEN,
160 G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
161 "Enable paranoid querying of everything", NULL },
162 { "no-paranoid", 0, G_OPTION_FLAG_REVERSE,
163 G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
164 "Disable paranoid querying of everything", NULL },
165 { "line-end", 0, G_OPTION_FLAG_HIDDEN,
166 G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
167 "Enable line end checking", NULL },
168 { "no-line-end", 'l', G_OPTION_FLAG_REVERSE,
169 G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
170 "Disable line end checking", NULL },
171 { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
172 "Overview: just show counts", NULL },
173 { "no-overview", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
174 G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
175 "Show individual warnings", NULL },
176 { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
177 "Output errors to stdout instead of stderr", NULL },
178 { "no-stdout", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
179 G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
180 "Output errors to stderr instead of stdout", NULL },
181 { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
182 "Echo header fields", NULL },
183 { "no-header", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
184 G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
185 "Don't echo header fields", NULL },
186 { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
187 "Ignore markup in < >", NULL },
188 { "no-markup", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
189 G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
190 "No special handling for markup in < >", NULL },
191 { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
192 "Use file of user-defined typos", NULL },
193 { "no-usertypo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
194 G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
195 "Ignore file of user-defined typos", NULL },
196 { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
197 "Verbose - list everything", NULL },
198 { "no-verbose", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
199 G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
200 "Switch off verbose mode", NULL },
205 * Options relating to configuration which make no sense from inside
206 * a configuration file.
/*
 * Switches accepted on the command line only: parse_config_file() matches
 * configuration keys against options[] and never consults this table.
 */
209 static GOptionEntry config_options[]={
210 { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
211 "Defaults for use on www upload", NULL },
212 { "dump-config", 0, 0, G_OPTION_ARG_NONE, pswit+DUMP_CONFIG_SWITCH,
213 "Dump current config settings", NULL },
/*
 * Gutcheck-compatible toggle switches.  They do not write to pswit[]
 * directly: GLib stores the request in typo_compat / paranoid_compat and
 * parse_options() inverts the corresponding pswit[] entries after parsing
 * (presumably only when the flag was set; the guarding tests are not
 * visible in this listing).
 */
217 static GOptionEntry compatibility_options[]={
218 { "toggle-typo", 't', 0, G_OPTION_ARG_NONE, &typo_compat,
219 "Toggle checking for common typos", NULL },
220 { "toggle-relaxed", 'x', 0, G_OPTION_ARG_NONE, &paranoid_compat,
221 "Toggle both paranoid mode and common typos", NULL },
225 long cnt_dquot; /* for overview mode, count of doublequote queries */
226 long cnt_squot; /* for overview mode, count of singlequote queries */
227 long cnt_brack; /* for overview mode, count of brackets queries */
228 long cnt_bin; /* for overview mode, count of non-ASCII queries */
229 long cnt_odd; /* for overview mode, count of odd character queries */
230 long cnt_long; /* for overview mode, count of long line errors */
231 long cnt_short; /* for overview mode, count of short line queries */
232 long cnt_punct; /* for overview mode,
233 count of punctuation and spacing queries */
234 long cnt_dash; /* for overview mode, count of dash-related queries */
235 long cnt_word; /* for overview mode, count of word queries */
236 long cnt_html; /* for overview mode, count of html queries */
237 long cnt_lineend; /* for overview mode, count of line-end queries */
238 long cnt_spacend; /* count of lines with space at end */
239 long linecnt; /* count of total lines in the file */
240 long checked_linecnt; /* count of lines actually checked */
242 void proghelp(GOptionContext *context);
243 void procfile(const char *);
247 gboolean mixdigit(const char *);
248 gchar *getaword(const char **);
249 char *flgets(char **,long);
250 void postprocess_for_HTML(char *);
251 char *linehasmarkup(char *);
252 char *losemarkup(char *);
253 gboolean tagcomp(const char *,const char *);
254 void loseentities(char *);
255 gboolean isroman(const char *);
256 void postprocess_for_DP(char *);
257 void print_as_windows_1252(const char *string);
258 void print_as_utf_8(const char *string);
260 GTree *qword,*qperiod;
/*
 * Record the current state of the switches listed in options[] in the
 * "options" group of kf, one boolean key per long option name.
 */
268 void config_file_update(GKeyFile *kf)
272 for(i=0;options[i].long_name;i++)
/* "no-" spellings are singled out; the statement executed for them is not visible in this listing (presumably they are skipped). */
274 if (g_str_has_prefix(options[i].long_name,"no-"))
/* Only argument-less (flag) options are expected; anything else reaches the assertion at the end. */
276 if (options[i].arg==G_OPTION_ARG_NONE)
278 sw=*(gboolean *)options[i].arg_data;
/* REVERSE entries get extra treatment before the value is stored (statement not visible here). */
279 if (options[i].flags&G_OPTION_FLAG_REVERSE)
281 g_key_file_set_boolean(kf,"options",options[i].long_name,sw);
284 g_assert_not_reached();
/*
 * Annotate kf: a file-level comment at the top, and for the entries of
 * options[] a per-key comment built from the option's description.
 */
288 void config_file_add_comments(GKeyFile *kf)
/* group_name and key both NULL: the comment is placed above the first group. */
292 g_key_file_set_comment(kf,NULL,NULL," Default configuration for bookloupe",
294 for(i=0;options[i].long_name;i++)
/* "no-" spellings are singled out; the statement executed for them is not visible in this listing. */
296 if (g_str_has_prefix(options[i].long_name,"no-"))
/* g_strconcat() returns a newly allocated string; its release is not visible in this listing. */
298 comment=g_strconcat(" ",options[i].description,NULL);
299 g_key_file_set_comment(kf,"options",options[i].long_name,comment,NULL);
/*
 * Serialise the effective configuration with g_key_file_to_data().
 * config_file_update() is applied either to the existing global config or
 * to a freshly created key file that additionally receives the descriptive
 * comments.  The test selecting between the two paths and whatever is done
 * with s afterwards are not visible in this listing.
 */
304 void dump_config(void)
308 config_file_update(config);
311 config=g_key_file_new();
312 config_file_update(config);
313 config_file_add_comments(config);
315 s=g_key_file_to_data(config,NULL,NULL);
/*
 * Look for "bookloupe.ini" in a list of directories and load it.
 *
 * The list is built from the BOOKLOUPE_CONFIG_PATH environment variable
 * (split on ';' or ':' -- the lines choosing between the two are not visible
 * here, presumably a platform test) or, failing that, from the current
 * directory, running_from and the user configuration directory, in that
 * order.
 *
 * full_path: NOTE(review): presumably receives the path of the file that
 * was loaded; the assignment is not visible in this listing.
 * Returns a GKeyFile; the return statements are not visible, so the value
 * returned when no file is found should be confirmed.
 */
321 GKeyFile *read_config_file(gchar **full_path)
327 const char *search_path;
330 search_path=g_getenv("BOOKLOUPE_CONFIG_PATH");
334 search_dirs=g_strsplit(search_path,";",0);
336 search_dirs=g_strsplit(search_path,":",0);
/* Default search list; the loop below relies on a NULL terminator, whose assignment is not visible here. */
341 search_dirs=g_new(gchar *,4);
342 search_dirs[0]=g_get_current_dir();
343 search_dirs[1]=g_strdup(running_from);
344 search_dirs[2]=g_strdup(g_get_user_config_dir());
347 for(i=0;search_dirs[i];i++)
349 path=g_build_filename(search_dirs[i],"bookloupe.ini",NULL);
350 if (g_key_file_load_from_file(kf,path,
351 G_KEY_FILE_KEEP_COMMENTS|G_KEY_FILE_KEEP_TRANSLATIONS,&err))
/* A missing file is not worth a message; any other load error is reported. */
353 if (!g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
355 g_printerr("Bookloupe: Error reading %s\n",path);
356 g_printerr("%s\n",err->message);
368 g_strfreev(search_dirs);
/*
 * Load the configuration file and copy every key of its "options" group
 * into the switch variable of the options[] entry with the same long name.
 * Keys that match no entry are reported and ignored.
 */
376 void parse_config_file(void)
383 config=read_config_file(&path);
385 keys=g_key_file_get_keys(config,"options",NULL,NULL);
/* Inner search over options[]; the enclosing loop over keys[i] is not visible in this listing. */
392 for(j=0;options[j].long_name;j++)
/* "no-" spellings are singled out; the statement executed for them is not visible here. */
394 if (g_str_has_prefix(options[j].long_name,"no-"))
396 else if (!strcmp(keys[i],options[j].long_name))
398 if (options[j].arg==G_OPTION_ARG_NONE)
400 sw=g_key_file_get_boolean(config,"options",keys[i],
/* Report a value that could not be read as a boolean. */
404 g_printerr("Bookloupe: %s: options.%s: %s\n",
405 path,keys[i],err->message);
/* REVERSE entries get extra treatment before the store (statement not visible here). */
408 if (options[j].flags&G_OPTION_FLAG_REVERSE)
410 *(gboolean *)options[j].arg_data=sw;
414 g_assert_not_reached();
/* Reaching the terminating entry means the search found no option with this name. */
417 if (!options[j].long_name)
418 g_printerr("Bookloupe: %s: Unknown option \"%s\" ignored\n",
/*
 * Parse the command line into pswit[] and then apply the settings derived
 * from it (compatibility toggles, --web overrides, --overview).
 *
 * argc, argv: passed by address so that g_option_context_parse() can remove
 * the options it recognises, leaving the remaining arguments for the caller.
 */
427 void parse_options(int *argc,char ***argv)
430 GOptionContext *context;
431 GOptionGroup *compatibility;
432 context=g_option_context_new(
433 "file - look for errors in Project Gutenberg(TM) etexts");
434 g_option_context_add_main_entries(context,options,NULL);
435 g_option_context_add_main_entries(context,config_options,NULL);
436 compatibility=g_option_group_new("compatibility",
437 "Options for Compatibility with Gutcheck:",
438 "Show compatibility options",NULL,NULL);
439 g_option_group_add_entries(compatibility,compatibility_options);
440 g_option_context_add_group(context,compatibility);
441 g_option_context_set_description(context,
442 "For simplicity, only the switch options which reverse the\n"
443 "default configuration are listed. In most cases, both vanilla\n"
444 "and \"no-\" prefixed versions are available for use.");
445 if (!g_option_context_parse(context,argc,argv,&err))
447 g_printerr("Bookloupe: %s\n",err->message);
448 g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
/* Gutcheck-style toggles: invert the switches (the guarding tests are not visible in this listing). */
452 pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
455 pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
456 pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
459 * Web uploads - for the moment, this is really just a placeholder
460 * until we decide what processing we really want to do on web uploads
462 if (pswit[WEB_SWITCH])
464 /* specific override for web uploads */
465 pswit[ECHO_SWITCH]=TRUE;
466 pswit[SQUOTE_SWITCH]=FALSE;
467 pswit[TYPO_SWITCH]=TRUE;
468 pswit[QPARA_SWITCH]=FALSE;
469 pswit[PARANOID_SWITCH]=TRUE;
470 pswit[LINE_END_SWITCH]=FALSE;
471 pswit[OVERVIEW_SWITCH]=FALSE;
472 pswit[STDOUT_SWITCH]=FALSE;
473 pswit[HEADER_SWITCH]=TRUE;
474 pswit[VERBOSE_SWITCH]=FALSE;
475 pswit[MARKUP_SWITCH]=FALSE;
476 pswit[USERTYPO_SWITCH]=FALSE;
477 pswit[DP_SWITCH]=FALSE;
/* --dump-config handling; the statements it executes are not visible in this listing. */
479 if (pswit[DUMP_CONFIG_SWITCH])
484 if (pswit[OVERVIEW_SWITCH])
485 /* just print summary; don't echo */
486 pswit[ECHO_SWITCH]=FALSE;
492 g_option_context_free(context);
498 * Read in the user-defined stealth scanno list.
/*
 * Load the user typo list into the usertypo tree.
 *
 * Candidate files are tried in this order, moving on whenever the previous
 * attempt failed with "no such file": "bookloupe.typ" in the current
 * directory, "bookloupe.typ" in running_from, "gutcheck.typ" in the current
 * directory, "gutcheck.typ" in running_from.
 */
500 void read_user_scannos(void)
503 gchar *usertypo_file;
507 gchar *contents,*utf8,**lines;
508 usertypo_file=g_strdup("bookloupe.typ");
509 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
510 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
513 g_free(usertypo_file);
514 usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
515 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
517 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
520 g_free(usertypo_file);
521 usertypo_file=g_strdup("gutcheck.typ");
522 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
524 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
527 g_free(usertypo_file);
528 usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
529 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
/* None of the four candidates exists: say so. */
531 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
533 g_free(usertypo_file);
534 g_print(" --> I couldn't find bookloupe.typ "
535 "-- proceeding without user typos.\n");
/* Failure message for a named file (the condition guarding it is not visible in this listing). */
540 fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
541 g_free(usertypo_file);
/* Valid UTF-8 is normalised; anything else is converted from Windows-1252. */
545 if (g_utf8_validate(contents,len,NULL))
546 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
548 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
/* Every CR and LF is a separator, so a CRLF ending yields an empty element between lines. */
550 lines=g_strsplit_set(utf8,"\r\n",0);
/* Keys are compared with strcmp() and released with g_free() when the tree drops them. */
552 usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
553 for (i=0;lines[i];i++)
/* Only elements whose first byte is greater than '!' are stored (this excludes empty elements). */
554 if (*(unsigned char *)lines[i]>'!')
555 g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
564 * Read an etext returning a newly allocated string containing the file
565 * contents or NULL on error.
/*
 * filename: file to load with g_file_get_contents().
 * err:      receives the load error, or a conversion error when the
 *           contents are neither valid UTF-8 nor convertible from
 *           Windows-1252.
 * The print handler (and, via SetConsoleOutputCP(), the console code page)
 * is chosen to match the encoding that was detected.
 */
567 gchar *read_etext(const char *filename,GError **err)
569 GError *tmp_err=NULL;
570 gchar *contents,*utf8;
571 gsize len,bytes_read,bytes_written;
573 if (!g_file_get_contents(filename,&contents,&len,err))
575 if (g_utf8_validate(contents,len,NULL))
577 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
578 g_set_print_handler(print_as_utf_8);
580 SetConsoleOutputCP(CP_UTF8);
/* Not valid UTF-8: treat the file as Windows-1252. */
585 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,
586 &bytes_written,&tmp_err);
587 if (g_error_matches(tmp_err,G_CONVERT_ERROR,
588 G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
/* Walk the bytes before the offending one to work out the line and column reported below. */
591 for(i=0;i<bytes_read;i++)
592 if (contents[i]=='\n')
597 else if (contents[i]!='\r')
599 g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
600 "Input conversion failed. Byte %d at line %d, column %d is not a "
601 "valid Windows-1252 character",
602 ((unsigned char *)contents)[bytes_read],line,col);
/* Hand tmp_err to the caller (the condition guarding this call is not visible in this listing). */
605 g_propagate_error(err,tmp_err);
606 g_set_print_handler(print_as_windows_1252);
608 SetConsoleOutputCP(1252);
/* atexit() handler registered by main(): puts back the console output code page that main() saved in saved_cp. */
615 void cleanup_on_exit(void)
618 SetConsoleOutputCP(saved_cp);
/*
 * Program entry point: establish the switch defaults, parse the options,
 * and in overview mode print the per-category query counts.  The calls that
 * read the user typo list and process the input file are not visible in
 * this listing.
 */
622 int main(int argc,char **argv)
/* Save the console code page and arrange for cleanup_on_exit() to restore it. */
625 atexit(cleanup_on_exit);
626 saved_cp=GetConsoleOutputCP();
/* Directory of the executable; read_config_file() and read_user_scannos() search it. */
628 running_from=g_path_get_dirname(argv[0]);
629 /* Paranoid checking is turned OFF, not on, by its switch */
630 pswit[PARANOID_SWITCH]=TRUE;
631 /* if running in paranoid mode, typo checks default to enabled */
632 pswit[TYPO_SWITCH]=TRUE;
633 /* Line-end checking is turned OFF, not on, by its switch */
634 pswit[LINE_END_SWITCH]=TRUE;
635 /* Echoing is turned OFF, not on, by its switch */
636 pswit[ECHO_SWITCH]=TRUE;
638 parse_options(&argc,&argv);
639 if (pswit[USERTYPO_SWITCH])
641 fprintf(stderr,"bookloupe: Check and report on an e-text\n");
/* Overview mode: the counters replace the individual warnings. */
643 if (pswit[OVERVIEW_SWITCH])
645 g_print(" Checked %ld lines of %ld (head+foot = %ld)\n\n",
646 checked_linecnt,linecnt,linecnt-checked_linecnt);
647 g_print(" --------------- Queries found --------------\n");
649 g_print(" Long lines: %14ld\n",cnt_long);
651 g_print(" Short lines: %14ld\n",cnt_short);
653 g_print(" Line-end problems: %14ld\n",cnt_lineend);
655 g_print(" Common typos: %14ld\n",cnt_word);
657 g_print(" Unmatched quotes: %14ld\n",cnt_dquot);
659 g_print(" Unmatched SingleQuotes: %14ld\n",cnt_squot);
661 g_print(" Unmatched brackets: %14ld\n",cnt_brack);
663 g_print(" Non-ASCII characters: %14ld\n",cnt_bin);
665 g_print(" Proofing characters: %14ld\n",cnt_odd);
667 g_print(" Punctuation & spacing queries: %14ld\n",cnt_punct);
669 g_print(" Non-standard dashes: %14ld\n",cnt_dash);
671 g_print(" Possible HTML tags: %14ld\n",cnt_html);
/* cnt_spacend is not part of the total. */
673 g_print(" TOTAL QUERIES %14ld\n",
674 cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+
675 cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend);
/* Release the global resources (any tests guarding these calls are not visible in this listing). */
677 g_free(running_from);
679 g_tree_unref(usertypo);
681 g_key_file_free(config);
688 * Run a first pass - verify that it's a valid PG
689 * file, decide whether to report some things that
690 * occur many times in the text like long or short
691 * lines, non-standard dashes, etc.
/*
 * etext: the whole text as one UTF-8 string; it is split on '\n' and each
 * line is examined once.
 *
 * The statistics are accumulated in a function-local static structure, so
 * they are zero-initialised only once per program run.  NOTE(review): the
 * return statement is not visible in this listing; presumably a pointer to
 * that static structure is returned.
 */
693 struct first_pass_results *first_pass(const char *etext)
695 gunichar laststart=CHAR_SPACE;
700 unsigned int lastlen=0,lastblen=0;
701 long spline=0,nspline=0;
702 static struct first_pass_results results={0};
704 lines=g_strsplit(etext,"\n",0);
705 for (j=0;lines[j];j++)
/* lbytes is the length in bytes, llen the length in characters, both without trailing CRs. */
707 lbytes=strlen(lines[j]);
708 while (lbytes>0 && lines[j][lbytes-1]=='\r')
709 lines[j][--lbytes]='\0';
710 llen=g_utf8_strlen(lines[j],lbytes);
/* Old-style header terminator ("*END ... SMALL PRINT ..."). */
712 if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
713 (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
716 g_print(" --> Duplicate header?\n");
717 spline=linecnt+1; /* first line of non-header text, that is */
/* New-style header marker ("*** START ... PROJECT GUTENBERG ..."). */
719 if (!strncmp(lines[j],"*** START",9) &&
720 strstr(lines[j],"PROJECT GUTENBERG"))
723 g_print(" --> Duplicate header?\n");
724 nspline=linecnt+1; /* first line of non-header text, that is */
/* Once a header has been seen, look for a footer: "end" followed by "project gutenberg", ignoring case. */
726 if (spline || nspline)
728 lc_line=g_utf8_strdown(lines[j],lbytes);
729 if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
731 if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
733 if (results.footerline)
735 /* it's an old-form header - we can detect duplicates */
737 g_print(" --> Duplicate footer?\n");
740 results.footerline=linecnt;
746 results.firstline=spline;
748 results.firstline=nspline; /* override with new */
749 if (results.footerline)
750 continue; /* don't count the boilerplate in the footer */
751 results.totlen+=llen;
752 for (s=lines[j];*s;s=g_utf8_next_char(s))
754 if (g_utf8_get_char(s)>127)
756 if (g_unichar_isalpha(g_utf8_get_char(s)))
/*
 * The previous character is a full Unicode code point, so it must be
 * classified with g_unichar_isalpha(); <ctype.h> isalpha() is undefined
 * for values that do not fit in an unsigned char.
 */
758 if (s>lines[j] && g_utf8_get_char(s)==CHAR_DQUOTE &&
759 g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
760 results.endquote_count++;
762 if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
763 lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
766 g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
768 if (strstr(lines[j],".,"))
770 /* only count asterisk lines for ignoring purposes where there is */
771 /* lower-case text on the line */
772 if (strchr(lines[j],'*'))
774 for (s=lines[j];*s;s=g_utf8_next_char(s))
775 if (g_unichar_islower(g_utf8_get_char(s)))
780 if (strchr(lines[j],'/'))
781 results.fslashline++;
/* Step back over trailing white space, then test for a single hyphen at the end of the line. */
784 for (s=g_utf8_prev_char(lines[j]+lbytes);
785 s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;
786 s=g_utf8_prev_char(s))
788 if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
789 g_utf8_get_char(g_utf8_prev_char(s))!='-')
792 if (llen>LONGEST_PG_LINE)
794 if (llen>WAY_TOO_LONG)
795 results.verylongline++;
/* i is the byte span from the first '<' to the first '>' inclusive; how it is used is not visible in this listing. */
796 if (strchr(lines[j],'<') && strchr(lines[j],'>'))
798 i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
801 if (strstr(lines[j],"<i>"))
802 results.htmcount+=4; /* bonus marks! */
804 /* Check for spaced em-dashes */
/* The search starts at the second character so that s[-1] below always stays inside the line. */
805 if (lines[j][0] && (s=strstr(g_utf8_next_char(lines[j]),"--")))
808 if (s[-1]==CHAR_SPACE || s[2]==CHAR_SPACE)
809 results.space_emdash++;
810 if (s[-1]==CHAR_SPACE && s[2]==CHAR_SPACE)
811 /* count of em-dashes with spaces both sides */
812 results.non_PG_space_emdash++;
813 if (s[-1]!=CHAR_SPACE && s[2]!=CHAR_SPACE)
814 /* count of PG-type em-dashes with no spaces */
815 results.PG_space_emdash++;
/* Cheap language and scanno hints taken from individual words (the loop that fills inword is not visible in this listing). */
820 if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))
821 results.Dutchcount++;
822 if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))
823 results.Frenchcount++;
824 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
825 results.standalone_digit++;
828 /* Check for spaced dashes */
829 if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
/* Only the first byte of the line is remembered for the short-line test on the next iteration. */
833 laststart=lines[j][0];
842 * Make some snap decisions based on the first pass results.
/*
 * results: statistics gathered by first_pass().
 *
 * Decides which kinds of query are worth reporting individually, printing a
 * note for every category that is suppressed, and records the decisions in
 * a function-local static struct warnings.  The function also writes to
 * pswit[MARKUP_SWITCH] and to results->firstline.  NOTE(review): the return
 * statement is not visible in this listing; presumably a pointer to the
 * static structure is returned.
 */
844 struct warnings *report_first_pass(struct first_pass_results *results)
846 static struct warnings warnings={0};
848 g_print(" --> %ld lines in this file have white space at end\n",
851 if (results->dotcomma>5)
854 g_print(" --> %ld lines in this file contain '.,'. "
855 "Not reporting them.\n",results->dotcomma);
858 * If more than 50 lines, or one-tenth, are short,
859 * don't bother reporting them.
861 warnings.shortline=1;
862 if (results->shortline>50 || results->shortline*10>linecnt)
864 warnings.shortline=0;
865 g_print(" --> %ld lines in this file are short. "
866 "Not reporting short lines.\n",results->shortline);
869 * If more than 50 lines, or one-tenth, are long,
870 * don't bother reporting them.
873 if (results->longline>50 || results->longline*10>linecnt)
876 g_print(" --> %ld lines in this file are long. "
877 "Not reporting long lines.\n",results->longline);
879 /* If more than 10 lines contain asterisks, don't bother reporting them. */
881 if (results->astline>10)
884 g_print(" --> %ld lines in this file contain asterisks. "
885 "Not reporting them.\n",results->astline);
888 * If more than 10 lines contain forward slashes,
889 * don't bother reporting them.
892 if (results->fslashline>10)
895 g_print(" --> %ld lines in this file contain forward slashes. "
896 "Not reporting them.\n",results->fslashline);
899 * If more than 20 lines contain unpunctuated endquotes,
900 * don't bother reporting them.
903 if (results->endquote_count>20)
906 g_print(" --> %ld lines in this file contain unpunctuated endquotes. "
907 "Not reporting them.\n",results->endquote_count);
910 * If more than 15 lines contain standalone digits,
911 * don't bother reporting them.
/* NOTE(review): the threshold actually tested here is 10, not the 15 stated in the comment above. */
914 if (results->standalone_digit>10)
917 g_print(" --> %ld lines in this file contain standalone 0s and 1s. "
918 "Not reporting them.\n",results->standalone_digit);
921 * If more than 20 lines contain hyphens at end,
922 * don't bother reporting them.
925 if (results->hyphens>20)
928 g_print(" --> %ld lines in this file have hyphens at end. "
929 "Not reporting them.\n",results->hyphens);
/* Enough markup was counted: turn on markup handling even though the user did not ask for it. */
931 if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
933 g_print(" --> Looks like this is HTML. Switching HTML mode ON.\n");
934 pswit[MARKUP_SWITCH]=1;
936 if (results->verylongline>0)
937 g_print(" --> %ld lines in this file are VERY long!\n",
938 results->verylongline);
940 * If there are more non-PG spaced dashes than PG em-dashes,
941 * assume it's deliberate.
942 * Current PG guidelines say don't use them, but older texts do,
943 * and some people insist on them whatever the guidelines say.
946 if (results->spacedash+results->non_PG_space_emdash>
947 results->PG_space_emdash)
950 g_print(" --> There are %ld spaced dashes and em-dashes. "
951 "Not reporting them.\n",
952 results->spacedash+results->non_PG_space_emdash);
954 /* If more than a quarter of characters are hi-bit, bug out. */
956 if (results->binlen*4>results->totlen)
958 g_print(" --> This file does not appear to be ASCII. "
959 "Terminating. Best of luck with it!\n");
/* Fewer than a quarter of the characters are alphabetic. */
962 if (results->alphalen*4<results->totlen)
964 g_print(" --> This file does not appear to be text. "
965 "Terminating. Best of luck with it!\n");
/* More than 1% of the characters, or more than 100 in total, are above 127. */
968 if (results->binlen*100>results->totlen || results->binlen>100)
970 g_print(" --> There are a lot of foreign letters here. "
971 "Not reporting them.\n");
974 warnings.isDutch=FALSE;
975 if (results->Dutchcount>50)
977 warnings.isDutch=TRUE;
978 g_print(" --> This looks like Dutch - "
979 "switching off dashes and warnings for 's Middags case.\n");
981 warnings.isFrench=FALSE;
982 if (results->Frenchcount>50)
984 warnings.isFrench=TRUE;
985 g_print(" --> This looks like French - "
986 "switching off some doublepunct.\n");
988 if (results->firstline && results->footerline)
989 g_print(" The PG header and footer appear to be already on.\n");
992 if (results->firstline)
993 g_print(" The PG header is on - no footer.\n");
994 if (results->footerline)
995 g_print(" The PG footer is on - no header.\n");
/* Verbose mode switches the warnings back on regardless of the counts above. */
998 if (pswit[VERBOSE_SWITCH])
1001 warnings.shortline=1;
1002 warnings.dotcomma=1;
1003 warnings.longline=1;
1009 warnings.endquote=1;
1010 g_print(" *** Verbose output is ON -- you asked for it! ***\n");
1012 if (warnings.isDutch)
/* Header and footer fewer than 100 lines apart: the markers are not trusted, so the recorded start line is discarded. */
1014 if (results->footerline>0 && results->firstline>0 &&
1015 results->footerline>results->firstline &&
1016 results->footerline-results->firstline<100)
1018 g_print(" --> I don't really know where this text starts. \n");
1019 g_print(" There are no reference points.\n");
1020 g_print(" I'm going to have to report the header and footer "
1022 results->firstline=0;
1030 * Look along the line, accumulate the count of quotes, and see
1031 * if this is an empty line - i.e. a line with nothing on it
1033 * If line has just spaces, period, * and/or - on it, don't
1034 * count it, since empty lines with asterisks or dashes to
1035 * separate sections are common.
1037 * Returns: TRUE if the line is empty.
/*
 * aline:    one line of text, UTF-8.
 * counters: running tallies updated through increment_matching() for
 *           quotes and brackets, plus the c_unders underscore count.
 */
1039 gboolean analyse_quotes(const char *aline,struct counters *counters)
1042 /* assume the line is empty until proven otherwise */
1043 gboolean isemptyline=TRUE;
1044 const char *s=aline,*sprev,*snext;
/* s is the current character, snext the following one; the loop header and the maintenance of sprev are not visible in this listing. */
1049 snext=g_utf8_next_char(s);
1050 c=g_utf8_get_char(s);
1053 if (CHAR_IS_SQUOTE(c))
1058 * At start of line, it can only be an openquote.
1059 * Hardcode a very common exception!
1061 if (!g_str_has_prefix(snext,"tis") &&
1062 !g_str_has_prefix(snext,"Tis"))
1063 increment_matching(counters,c,TRUE);
1065 else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
1066 g_unichar_isalpha(g_utf8_get_char(snext)))
1067 /* Do nothing! it's definitely an apostrophe, not a quote */
1069 /* it's outside a word - let's check it out */
1070 else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||
1071 g_unichar_isalpha(g_utf8_get_char(snext)))
1073 /* it damwell better BE an openquote */
1074 if (!g_str_has_prefix(snext,"tis") &&
1075 !g_str_has_prefix(snext,"Tis"))
1076 /* hardcode a very common exception! */
1077 increment_matching(counters,c,TRUE);
1081 /* now - is it a closequote? */
1082 guessquote=0; /* accumulate clues */
1083 if (g_unichar_isalpha(g_utf8_get_char(sprev)))
1085 /* it follows a letter - could be either */
1087 if (g_utf8_get_char(sprev)=='s')
1089 /* looks like a plural apostrophe */
1091 if (g_utf8_get_char(snext)==CHAR_SPACE)
1096 /* it doesn't have a letter either side */
/* NOTE(review): strchr() also matches the terminating NUL of its first argument, so a quote at the very end of the line passes the snext test. */
1097 else if (strchr(".?!,;:",g_utf8_get_char(sprev)) &&
1098 strchr(".?!,;: ",g_utf8_get_char(snext)))
1099 guessquote+=8; /* looks like a closequote */
1102 if (matching_difference(counters,CHAR_SQUOTE)>0)
1104 * Give it the benefit of some doubt,
1105 * if a squote is already open.
1111 increment_matching(counters,c,FALSE);
1114 if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
1116 isemptyline=FALSE; /* ignore lines like * * * as spacers */
1117 if (c==CHAR_UNDERSCORE)
1118 counters->c_unders++;
1119 if (c==CHAR_OPEN_SBRACK)
/* A "[Illustration:" at the start of the line, with nothing of that kind already open, is tallied separately from ordinary square brackets. */
1121 if (!matching_difference(counters,COUNTER_ILLUSTRATION) &&
1122 !matching_difference(counters,c) && s==aline &&
1123 g_str_has_prefix(s,"[Illustration:"))
1124 increment_matching(counters,COUNTER_ILLUSTRATION,TRUE);
1126 increment_matching(counters,c,TRUE);
1128 else if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK)
1129 increment_matching(counters,c,TRUE);
1130 if (c==CHAR_CLOSE_SBRACK)
/* A ']' that ends the line (!*snext) may close an illustration block rather than an ordinary bracket. */
1132 if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) &&
1133 !matching_difference(counters,c) && !*snext)
1134 increment_matching(counters,COUNTER_ILLUSTRATION,FALSE);
1136 increment_matching(counters,c,FALSE);
1138 else if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK)
1139 increment_matching(counters,c,FALSE);
1147 * check_for_control_characters:
1149 * Check for invalid or questionable characters in the line
1150 * Anything above 127 is invalid for plain ASCII, and
1151 * non-printable control characters should also be flagged.
1152 * Tabs should generally not be there.
/*
 * aline: one line of text, UTF-8.
 *
 * Scans the line for control characters, i.e. code points below CHAR_SPACE
 * other than CHAR_LF, CHAR_CR and CHAR_TAB.  For each one found the line is
 * echoed when pswit[ECHO_SWITCH] is set, and a message giving line number,
 * 1-based column and code point is printed unless pswit[OVERVIEW_SWITCH] is
 * set.
 */
1154 void check_for_control_characters(const char *aline)
1158 for (s=aline;*s;s=g_utf8_next_char(s))
1160 c=g_utf8_get_char(s);
1161 if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
1163 if (pswit[ECHO_SWITCH])
1164 g_print("\n%s\n",aline);
1165 if (!pswit[OVERVIEW_SWITCH])
/*
 * g_utf8_pointer_to_offset() takes the start of the string first and the
 * position second.  The arguments used to be passed the other way round,
 * which yields a negative offset and therefore a wrong column number.
 */
1166 g_print(" Line %ld column %ld - Control character %u\n",
1167 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1175 * check_for_odd_characters:
1177 * Check for binary and other odd characters.
/*
 * Scan aline character by character and query several kinds of unusual
 * character: non-ASCII/binary, tab, tilde, caret, forward slash and
 * asterisk.
 *
 * aline:       NUL-terminated line to scan, walked as UTF-8.
 * warnings:    settings read here: ->fslash gates the forward-slash query
 *              and ->ast gates the asterisk query.
 * isemptyline: when TRUE the asterisk query is suppressed.
 *
 * Each query echoes the line when pswit[ECHO_SWITCH] is set and prints
 * its message unless pswit[OVERVIEW_SWITCH] is set.
 */
1179 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
1180 gboolean isemptyline)
1182 /* Don't repeat multiple warnings on one line. */
/*
 * One flag per category, each tested before its query.
 * NOTE(review): the assignments that set them are not visible here;
 * presumably each is set to TRUE after the first report - confirm.
 */
1183 gboolean eNon_A=FALSE,eTab=FALSE,eTilde=FALSE;
1184 gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
1187 for (s=aline;*s;s=g_utf8_next_char(s))
1189 c=g_utf8_get_char(s);
/* Control characters other than tab and newline, or anything above 127. */
1190 if (!eNon_A && (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
1192 if (pswit[ECHO_SWITCH])
1193 g_print("\n%s\n",aline);
1194 if (!pswit[OVERVIEW_SWITCH])
/* 128-159 and anything above 255 get the "Non-ISO-8859" wording. */
1195 if (c>127 && c<160 || c>255)
1196 g_print(" Line %ld column %ld - "
1197 "Non-ISO-8859 character %u\n",
1198 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1200 g_print(" Line %ld column %ld - "
1201 "Non-ASCII character %u\n",
1202 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1207 if (!eTab && c==CHAR_TAB)
1209 if (pswit[ECHO_SWITCH])
1210 g_print("\n%s\n",aline);
1211 if (!pswit[OVERVIEW_SWITCH])
1212 g_print(" Line %ld column %ld - Tab character?\n",
1213 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1218 if (!eTilde && c==CHAR_TILDE)
1221 * Often used by OCR software to indicate an
1222 * unrecognizable character.
1224 if (pswit[ECHO_SWITCH])
1225 g_print("\n%s\n",aline);
1226 if (!pswit[OVERVIEW_SWITCH])
1227 g_print(" Line %ld column %ld - Tilde character?\n",
1228 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1233 if (!eCarat && c==CHAR_CARAT)
1235 if (pswit[ECHO_SWITCH])
1236 g_print("\n%s\n",aline);
1237 if (!pswit[OVERVIEW_SWITCH])
1238 g_print(" Line %ld column %ld - Carat character?\n",
1239 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Forward slashes are queried only while warnings->fslash is set. */
1244 if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
1246 if (pswit[ECHO_SWITCH])
1247 g_print("\n%s\n",aline);
1248 if (!pswit[OVERVIEW_SWITCH])
1249 g_print(" Line %ld column %ld - Forward slash?\n",
1250 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1256 * Report asterisks only in paranoid mode,
1257 * since they're often deliberate.
1259 if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
1262 if (pswit[ECHO_SWITCH])
1263 g_print("\n%s\n",aline);
1264 if (!pswit[OVERVIEW_SWITCH])
1265 g_print(" Line %ld column %ld - Asterisk?\n",
1266 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1275 * check_for_long_line:
1277 * Check for line too long.
/*
 * Query aline when its length in characters (g_utf8_strlen, not bytes)
 * exceeds LONGEST_PG_LINE.
 *
 * aline: NUL-terminated line to measure.
 *
 * The same character count is printed twice: once as the column and once
 * as the reported length.
 */
1279 void check_for_long_line(const char *aline)
1281 if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
1283 if (pswit[ECHO_SWITCH])
1284 g_print("\n%s\n",aline);
1285 if (!pswit[OVERVIEW_SWITCH])
1286 g_print(" Line %ld column %ld - Long line %ld\n",
1287 linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
1294 * check_for_short_line:
1296 * Check for line too short.
1298 * This one is a bit trickier to implement: we don't want to
1299 * flag the last line of a paragraph for being short, so we
1300 * have to wait until we know that our current line is a
1301 * "normal" line, then report the _previous_ line if it was too
1302 * short. We also don't want to report indented lines like
1303 * chapter heads or formatted quotations. We therefore keep
1304 * last->len as the length of the last line examined, and
1305 * last->blen as the length of the last but one, and try to
1306 * suppress unnecessary warnings by checking that both were of
1307 * "normal" length. We keep the first character of the last
1308 * line in last->start, and if it was a space, we assume that
1309 * the formatting is deliberate. I can't figure out a way to
1310 * distinguish something like a quoted verse left-aligned or
1311 * the header or footer of a letter from a paragraph of short
1312 * lines - maybe if I examined the whole paragraph, and if the
1313 * para has less than, say, 8 lines and if all lines are short,
1314 * then just assume it's OK? Need to look at some texts to see
1315 * how often a formula like this would get the right result.
/*
 * Query the PREVIOUS line (prevline, reported as line linecnt-1) for being
 * too short, now that the current line is known.
 *
 * aline: the current line; it must hold more than one character.
 * last:  properties of earlier lines; the fields read are ->len, ->blen
 *        and ->start.
 *
 * The query fires only when last->len is above 1 but below
 * SHORTEST_PG_LINE, last->blen is above SHORTEST_PG_LINE, and last->start
 * is not CHAR_SPACE.
 */
1317 void check_for_short_line(const char *aline,const struct line_properties *last)
1319 if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
1320 last->len<SHORTEST_PG_LINE && last->blen>1 &&
1321 last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
1323 if (pswit[ECHO_SWITCH])
1324 g_print("\n%s\n",prevline);
1325 if (!pswit[OVERVIEW_SWITCH])
/* Column and reported length are both the character count of prevline. */
1326 g_print(" Line %ld column %ld - Short line %ld?\n",
1327 linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
1334 * check_for_starting_punctuation:
1336 * Look for punctuation other than full ellipses at start of line.
/*
 * Query a line whose first character is one of . ? ! , ; : unless the
 * line starts with the spaced ellipsis ". . .".
 *
 * aline: NUL-terminated line to check; an empty line is never queried.
 */
1338 void check_for_starting_punctuation(const char *aline)
1340 if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
1341 !g_str_has_prefix(aline,". . ."))
1343 if (pswit[ECHO_SWITCH])
1344 g_print("\n%s\n",aline);
1345 if (!pswit[OVERVIEW_SWITCH])
1346 g_print(" Line %ld column 1 - Begins with punctuation?\n",
1354 * check_for_spaced_emdash:
1356 * Check for spaced em-dashes.
1358 * We must check _all_ occurrences of "--" on the line
1359 * hence the loop - even if the first double-dash is OK
1360 * there may be another that's wrong later on.
/*
 * Query every "--" in aline that has a space directly before it or
 * directly after it.
 *
 * aline: NUL-terminated line to check.
 */
1362 void check_for_spaced_emdash(const char *aline)
1364 const char *s,*t,*next;
/* t is the current "--" match; the search resumes at next, just past it. */
1365 for (s=aline;t=strstr(s,"--");s=next)
1367 next=g_utf8_next_char(g_utf8_next_char(t));
/* The look-behind is skipped when the match is at the start of the line. */
1368 if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
1369 g_utf8_get_char(next)==CHAR_SPACE)
1371 if (pswit[ECHO_SWITCH])
1372 g_print("\n%s\n",aline);
1373 if (!pswit[OVERVIEW_SWITCH])
1374 g_print(" Line %ld column %ld - Spaced em-dash?\n",
1375 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1383 * check_for_spaced_dash:
1385 * Check for spaced dashes.
/*
 * Query a single dash with a space on one side.
 *
 * aline: NUL-terminated line to check.
 *
 * Only the first " -" in the line is examined; the "- " search is tried
 * only when the line contains no " -" at all.
 */
1387 void check_for_spaced_dash(const char *aline)
1390 if ((s=strstr(aline," -")))
/* Skip " --": the character after the dash must not be another dash. */
1392 if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
1394 if (pswit[ECHO_SWITCH])
1395 g_print("\n%s\n",aline);
1396 if (!pswit[OVERVIEW_SWITCH])
1397 g_print(" Line %ld column %ld - Spaced dash?\n",
1398 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1403 else if ((s=strstr(aline,"- ")))
/* Skip "-- ": report only at line start or when not preceded by a dash. */
1405 if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
1407 if (pswit[ECHO_SWITCH])
1408 g_print("\n%s\n",aline);
1409 if (!pswit[OVERVIEW_SWITCH])
1410 g_print(" Line %ld column %ld - Spaced dash?\n",
1411 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1419 * check_for_unmarked_paragraphs:
1421 * Check for unmarked paragraphs indicated by separate speakers.
1423 * May well be false positive:
1424 * "Bravo!" "Wonderful!" called the crowd.
1425 * but useful all the same.
/*
 * Query a closing double quote followed by an opening double quote on the
 * same line, which may indicate a missing paragraph break.
 *
 * aline: NUL-terminated line to search.
 *
 * The column reported is that of the first quote of the matched pair.
 */
1427 void check_for_unmarked_paragraphs(const char *aline)
/*
 * NOTE(review): the two searches below read identically here; presumably
 * they differ in the number of spaces between the quotes - confirm.
 */
1430 s=strstr(aline,"\" \"");
1432 s=strstr(aline,"\" \"");
1435 if (pswit[ECHO_SWITCH])
1436 g_print("\n%s\n",aline);
1437 if (!pswit[OVERVIEW_SWITCH])
1438 g_print(" Line %ld column %ld - "
1439 "Query missing paragraph break?\n",
1440 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1447 * check_for_jeebies:
1449 * Check for "to he" and other easy h/b errors.
1451 * This is a very inadequate effort on the h/b problem,
1452 * but the phrase "to he" is always an error, whereas "to
1453 * be" is quite common.
1454 * Similarly, '"Quiet!", be said.' is a non-be error
1455 * "to he" is _not_ always an error!:
1456 * "Where they went to he couldn't say."
1457 * Another false positive:
1458 * What would "Cinderella" be without the . . .
1459 * and another: "If he wants to he can see for himself."
/*
 * Search aline for fixed phrases that suggest an h/b scanning error and
 * print one of three queries: he/be, had/bad or hut/but.
 *
 * aline: NUL-terminated line to search.
 *
 * All matching is done with strstr(), so it is case-sensitive and needs
 * the surrounding spaces or punctuation shown in each literal.
 * NOTE(review): the tests between successive searches are not visible
 * here; presumably each search runs only if the earlier ones in its group
 * found nothing - confirm.
 */
1461 void check_for_jeebies(const char *aline)
/* Group 1: "be" where "he" was probably meant, and "to he". */
1464 s=strstr(aline," be could ");
1466 s=strstr(aline," be would ");
1468 s=strstr(aline," was be ");
1470 s=strstr(aline," be is ");
1472 s=strstr(aline," is be ");
1474 s=strstr(aline,"\", be ");
/*
 * NOTE(review): the next two literals read identically here; presumably
 * they differ in spacing - confirm.
 */
1476 s=strstr(aline,"\" be ");
1478 s=strstr(aline,"\" be ");
1480 s=strstr(aline," to he ");
1483 if (pswit[ECHO_SWITCH])
1484 g_print("\n%s\n",aline);
1485 if (!pswit[OVERVIEW_SWITCH])
1486 g_print(" Line %ld column %ld - Query he/be error?\n",
1487 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Group 2: "had" and "bad" swapped. */
1491 s=strstr(aline," the had ");
1493 s=strstr(aline," a had ");
1495 s=strstr(aline," they bad ");
1497 s=strstr(aline," she bad ");
1499 s=strstr(aline," he bad ");
1501 s=strstr(aline," you bad ");
1503 s=strstr(aline," i bad ");
1506 if (pswit[ECHO_SWITCH])
1507 g_print("\n%s\n",aline);
1508 if (!pswit[OVERVIEW_SWITCH])
1509 g_print(" Line %ld column %ld - Query had/bad error?\n",
1510 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Group 3: "hut" after a semicolon or comma, probably "but". */
1514 s=strstr(aline,"; hut ");
1516 s=strstr(aline,", hut ");
1519 if (pswit[ECHO_SWITCH])
1520 g_print("\n%s\n",aline);
1521 if (!pswit[OVERVIEW_SWITCH])
1522 g_print(" Line %ld column %ld - Query hut/but error?\n",
1523 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1530 * check_for_mta_from:
1532 * Special case - angled bracket in front of "From" placed there by an
1533 * MTA when sending an e-mail.
/*
 * Query the first occurrence of the literal ">From" in aline.
 *
 * aline: NUL-terminated line to search.
 *
 * The column reported is that of the '>' character.
 */
1535 void check_for_mta_from(const char *aline)
1538 s=strstr(aline,">From");
1541 if (pswit[ECHO_SWITCH])
1542 g_print("\n%s\n",aline);
1543 if (!pswit[OVERVIEW_SWITCH])
1544 g_print(" Line %ld column %ld - "
1545 "Query angled bracket with From\n",
1546 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1553 * check_for_orphan_character:
1555 * Check for a single character line -
1556 * often an overflow from bad wrapping.
/*
 * Query a line that consists of exactly one character.
 *
 * aline: NUL-terminated line to check.
 *
 * A lone 'I', 'V', 'X', 'L' or any digit is accepted silently as a
 * numeral.
 */
1558 void check_for_orphan_character(const char *aline)
1561 c=g_utf8_get_char(aline);
/* Non-empty, and the character after the first is the terminator. */
1562 if (c && !*g_utf8_next_char(aline))
1564 if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
1565 ; /* Nothing - ignore numerals alone on a line. */
1568 if (pswit[ECHO_SWITCH])
1569 g_print("\n%s\n",aline);
1570 if (!pswit[OVERVIEW_SWITCH])
1571 g_print(" Line %ld column 1 - Query single character line\n",
1580 * check_for_pling_scanno:
1582 * Check for I" - often should be !
/*
 * Query the first occurrence of a space, a capital I and a double quote
 * in sequence; the I is often a misread exclamation mark.
 *
 * aline: NUL-terminated line to search.
 */
1584 void check_for_pling_scanno(const char *aline)
1587 s=strstr(aline," I\"");
1590 if (pswit[ECHO_SWITCH])
1591 g_print("\n%s\n",aline);
1592 if (!pswit[OVERVIEW_SWITCH])
/*
 * NOTE(review): the column is the 0-based offset of the matched space,
 * without the +1 most other checks add - confirm this is intended.
 */
1593 g_print(" Line %ld column %ld - Query I=exclamation mark?\n",
1594 linecnt,g_utf8_pointer_to_offset(aline,s));
1601 * check_for_extra_period:
1603 * Check for period without a capital letter. Cut-down from gutspell.
1604 * Only works when it happens on a single line.
/*
 * In paranoid mode (pswit[PARANOID_SWITCH]), look at each ". " in aline
 * and query it as a possible extra period when the text after it starts
 * with a lowercase letter and the word before it does not look like an
 * abbreviation.
 *
 * aline:    NUL-terminated line to search, handled as UTF-8.
 * warnings: settings; ->isDutch enables the 's Middags exception.
 *
 * The word before the period is copied into testword and checked against
 * the abbrev list, for a leading digit, for being a single character, for
 * being a Roman numeral, and for containing a vowel. Words are recorded in
 * the qperiod tree, which is consulted unless pswit[VERBOSE_SWITCH] is set.
 * NOTE(review): how each of those checks changes the outcome is decided on
 * lines not visible here - confirm before relying on this summary.
 */
1606 void check_for_extra_period(const char *aline,const struct warnings *warnings)
1608 const char *s,*t,*s1,*sprev;
1613 gunichar c,nc,pc,*decomposition;
1614 if (pswit[PARANOID_SWITCH])
1616 for (t=aline;t=strstr(t,". ");)
1620 t=g_utf8_next_char(t);
1621 /* start of line punctuation is handled elsewhere */
1624 if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
1626 t=g_utf8_next_char(t);
1629 if (warnings->isDutch)
1631 /* For Frank & Jeroen -- 's Middags case */
1632 gunichar c2,c3,c4,c5;
1633 c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
1634 c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
1635 c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
1636 c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
1637 if (CHAR_IS_APOSTROPHE(c2) &&
1638 g_unichar_islower(c3) && c4==CHAR_SPACE &&
1639 g_unichar_isupper(c5))
1641 t=g_utf8_next_char(t);
/* Skip ahead from the period to the next letter or digit. */
1645 s1=g_utf8_next_char(g_utf8_next_char(t));
/*
 * The operand is a gunichar, so the GLib classifier is used: passing a
 * value that is neither EOF nor representable as unsigned char to the
 * <ctype.h> isdigit() is undefined behaviour.
 */
1646 while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
1647 !g_unichar_isdigit(g_utf8_get_char(s1)))
1648 s1=g_utf8_next_char(s1);
1649 if (g_unichar_islower(g_utf8_get_char(s1)))
1651 /* we have something to investigate */
1653 /* so let's go back and find out */
/* Walk backwards from the period to find the start of the word. */
1654 nc=g_utf8_get_char(t);
1655 s1=g_utf8_prev_char(t);
1656 c=g_utf8_get_char(s1);
1657 sprev=g_utf8_prev_char(s1);
1658 pc=g_utf8_get_char(sprev);
1660 (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||
1661 g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&
1662 g_unichar_isalpha(nc)))
1667 sprev=g_utf8_prev_char(s1);
1668 pc=g_utf8_get_char(sprev);
1670 s1=g_utf8_next_char(s1);
1673 testword=g_strndup(s1,s-s1);
1675 testword=g_strdup(s1);
1676 for (i=0;*abbrev[i];i++)
1677 if (!strcmp(testword,abbrev[i]))
1679 if (g_unichar_isdigit(g_utf8_get_char(testword)))
1681 if (!*g_utf8_next_char(testword))
1683 if (isroman(testword))
/* Vowel test on the base letter, so accented vowels count as vowels. */
1688 for (s=testword;*s;s=g_utf8_next_char(s))
1690 decomposition=g_unicode_canonical_decomposition(
1691 g_utf8_get_char(s),&len);
1692 if (g_utf8_strchr("aeiou",-1,decomposition[0]))
1694 g_free(decomposition);
1698 (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
1700 g_tree_insert(qperiod,g_strdup(testword),
1701 GINT_TO_POINTER(1));
1702 if (pswit[ECHO_SWITCH])
1703 g_print("\n%s\n",aline);
1704 if (!pswit[OVERVIEW_SWITCH])
1705 g_print(" Line %ld column %ld - Extra period?\n",
1706 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1712 t=g_utf8_next_char(t);
1718 * check_for_following_punctuation:
1720 * Check for words usually not followed by punctuation.
/*
 * When pswit[TYPO_SWITCH] is set, query punctuation that directly follows
 * a word that is not normally followed by it.
 *
 * aline: NUL-terminated line to check.
 *
 * Each word is lower-cased with g_utf8_strdown() and compared with two
 * lists that end in an empty string: nocomma (queried before ',', ';' or
 * ':') and noperiod (queried before '.' or '!').
 * NOTE(review): s is presumably positioned on the character just after the
 * word; the code that advances it is not visible here - confirm.
 */
1722 void check_for_following_punctuation(const char *aline)
1725 const char *s,*wordstart;
1728 if (pswit[TYPO_SWITCH])
1739 inword=g_utf8_strdown(t,-1);
1741 for (i=0;*nocomma[i];i++)
1742 if (!strcmp(inword,nocomma[i]))
1744 c=g_utf8_get_char(s);
1745 if (c==',' || c==';' || c==':')
1747 if (pswit[ECHO_SWITCH])
1748 g_print("\n%s\n",aline);
1749 if (!pswit[OVERVIEW_SWITCH])
1750 g_print(" Line %ld column %ld - "
1751 "Query punctuation after %s?\n",
1752 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
1758 for (i=0;*noperiod[i];i++)
1759 if (!strcmp(inword,noperiod[i]))
1761 c=g_utf8_get_char(s);
1762 if (c=='.' || c=='!')
1764 if (pswit[ECHO_SWITCH])
1765 g_print("\n%s\n",aline);
1766 if (!pswit[OVERVIEW_SWITCH])
1767 g_print(" Line %ld column %ld - "
1768 "Query punctuation after %s?\n",
1769 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
1783 * Check for commonly mistyped words,
1784 * and digits like 0 for O in a word.
/*
 * Take aline apart word by word (getaword) and query words that look like
 * typos or scanning errors.
 *
 * aline:    NUL-terminated line to check.
 * warnings: settings; ->digit gates the paranoid standalone 0/1 query.
 *
 * Checks visible below, in order:
 *   - mixdigit(): a word mixing digits and letters ("Query digit in");
 *   - with pswit[TYPO_SWITCH] or pswit[USERTYPO_SWITCH]: an uppercase
 *     letter in the middle of a word, with exceptions for Mc/Mac names and
 *     a preceding apostrophe;
 *   - with pswit[TYPO_SWITCH]: unlikely word starts and ends (nostart,
 *     noend), unlikely letter groups, no vowels or no consonants, the
 *     okword exclusion list, Roman numerals, the typo list and certain
 *     single letters;
 *   - the user's own list (usertypo tree);
 *   - in paranoid mode, a standalone "0" or "1".
 * Words already queried are tracked in the qword tree.
 * NOTE(review): the statements that set or clear istypo after each test
 * are not visible here; the summary above assumes each match marks the
 * word as a typo and each exclusion clears it - confirm.
 */
1786 void check_for_typos(const char *aline,struct warnings *warnings)
1788 const char *s,*t,*nt,*wordstart;
1790 gunichar *decomposition;
1792 int i,vowel,consonant,*dupcnt;
1793 gboolean isdup,istypo,alower;
1796 gsize decomposition_len;
1800 inword=getaword(&s);
1804 continue; /* don't bother with empty lines */
1806 if (mixdigit(inword))
1808 if (pswit[ECHO_SWITCH])
1809 g_print("\n%s\n",aline);
1810 if (!pswit[OVERVIEW_SWITCH])
1811 g_print(" Line %ld column %ld - Query digit in %s\n",
1812 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
1817 * Put the word through a series of tests for likely typos and OCR
1820 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
1824 for (t=inword;*t;t=g_utf8_next_char(t))
1826 c=g_utf8_get_char(t);
1827 nt=g_utf8_next_char(t);
1828 /* lowercase for testing */
1829 if (g_unichar_islower(c))
1831 if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
1834 * We have an uppercase mid-word. However, there are
1836 * Mac and Mc like McGill
1837 * French contractions like l'Abbe
1839 offset=g_utf8_pointer_to_offset(inword,t);
1841 pc=g_utf8_get_char(g_utf8_prev_char(t));
1844 if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
1845 offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
1846 g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
1847 CHAR_IS_APOSTROPHE(pc))
/* All list comparisons below use the case-folded copy of the word. */
1853 testword=g_utf8_casefold(inword,-1);
1855 if (pswit[TYPO_SWITCH])
1858 * Check for certain unlikely two-letter combinations at word
1861 len=g_utf8_strlen(testword,-1);
1864 for (i=0;*nostart[i];i++)
1865 if (g_str_has_prefix(testword,nostart[i]))
1867 for (i=0;*noend[i];i++)
1868 if (g_str_has_suffix(testword,noend[i]))
1871 /* ght is common, gbt never. Like that. */
1872 if (strstr(testword,"cb"))
1874 if (strstr(testword,"gbt"))
1876 if (strstr(testword,"pbt"))
1878 if (strstr(testword,"tbs"))
1880 if (strstr(testword,"mrn"))
1882 if (strstr(testword,"ahle"))
1884 if (strstr(testword,"ihle"))
1887 * "TBE" does happen - like HEARTBEAT - but uncommon.
1888 * Also "TBI" - frostbite, outbid - but uncommon.
1889 * Similarly "ii" like Hawaii, or Pompeii, and in Roman
1890 * numerals, but "ii" is a common scanno.
1892 if (strstr(testword,"tbi"))
1894 if (strstr(testword,"tbe"))
1896 if (strstr(testword,"ii"))
1899 * Check for no vowels or no consonants.
1900 * If none, flag a typo.
1902 if (!istypo && len>1)
1905 for (t=testword;*t;t=g_utf8_next_char(t))
1907 c=g_utf8_get_char(t);
1909 g_unicode_canonical_decomposition(c,&decomposition_len);
1910 if (c=='y' || g_unichar_isdigit(c))
1912 /* Yah, this is loose. */
/* The base letter of the decomposition is tested, so accents are ignored. */
1916 else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
1920 g_free(decomposition);
1922 if (!vowel || !consonant)
1926 * Now exclude the word from being reported if it's in
1929 for (i=0;*okword[i];i++)
1930 if (!strcmp(testword,okword[i]))
1933 * What looks like a typo may be a Roman numeral.
1936 if (istypo && isroman(testword))
1938 /* Check the manual list of typos. */
1940 for (i=0;*typo[i];i++)
1941 if (!strcmp(testword,typo[i]))
1944 * Check lowercase s, l, i and m - special cases.
1945 * "j" - often a semi-colon gone wrong.
1946 * "d" for a missing apostrophe - he d
1949 if (!istypo && len==1 &&
1950 g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
/* qword maps each queried word to a heap-allocated int (dupcnt). */
1954 dupcnt=g_tree_lookup(qword,testword);
1958 isdup=!pswit[VERBOSE_SWITCH];
1962 dupcnt=g_new0(int,1);
1963 g_tree_insert(qword,g_strdup(testword),dupcnt);
1968 if (pswit[ECHO_SWITCH])
1969 g_print("\n%s\n",aline);
1970 if (!pswit[OVERVIEW_SWITCH])
1972 g_print(" Line %ld column %ld - Query word %s",
1973 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
1975 if (!pswit[VERBOSE_SWITCH])
1976 g_print(" - not reporting duplicates");
1984 /* check the user's list of typos */
1985 if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
1987 if (pswit[ECHO_SWITCH])
1988 g_print("\n%s\n",aline);
1989 if (!pswit[OVERVIEW_SWITCH])
/*
 * NOTE(review): this query and the standalone 0/1 query add 2 to the
 * offset, where the two queries above add 1 - confirm this is intended.
 */
1990 g_print(" Line %ld column %ld - Query possible scanno %s\n",
1991 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
1993 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
1995 if (pswit[PARANOID_SWITCH] && warnings->digit)
1997 /* In paranoid mode, query all 0 and 1 standing alone. */
1998 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
2000 if (pswit[ECHO_SWITCH])
2001 g_print("\n%s\n",aline);
2002 if (!pswit[OVERVIEW_SWITCH])
2003 g_print(" Line %ld column %ld - Query standalone %s\n",
2004 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
2015 * check_for_misspaced_punctuation:
2017 * Look for added or missing spaces around punctuation and quotes.
2018 * If there is a punctuation character like ! with no space on
2019 * either side, suspect a missing!space. If there are spaces on
2020 * both sides , assume a typo. If we see a double quote with no
2021 * space or punctuation on either side of it, assume unspaced
2022 * quotes "like"this.
/*
 * Query missing or surplus spaces around punctuation and quotes in aline.
 *
 * aline:       NUL-terminated line to check, walked as UTF-8.
 * parities:    running open/closed state of quotes; ->dquote is toggled
 *              here and ->squote is toggled when pswit[SQUOTE_SWITCH] is
 *              set.
 * isemptyline: when TRUE the "Spaced punctuation?" queries of the first
 *              two passes are suppressed.
 *
 * The line is scanned several times, each pass looking for one thing:
 *   1. punctuation with no space where one is expected ("Missing space?")
 *      or with space on both sides ("Spaced punctuation?"), with
 *      allowances for acronyms and ellipses;
 *   2. ? ! , ; : preceded by a space;
 *   3. a period preceded by a space and followed by a letter;
 *   4. quotes with no space or punctuation on either side
 *      ("Unspaced quotes?");
 *   5. quote parity: the character after each quote is checked against
 *      what may follow an opening or a closing quote ("Wrongspaced
 *      quotes?" / "Wrongspaced singlequotes?").
 * In each pass c is the current character, pc the previous and nc the
 * next. NOTE(review): the assignments of c and pc, and the tests selecting
 * quote characters in passes 4 and 5, are on lines not visible here -
 * confirm.
 */
2024 void check_for_misspaced_punctuation(const char *aline,
2025 struct parities *parities,gboolean isemptyline)
2027 gboolean isacro,isellipsis;
2029 gunichar c,nc,pc,n2c;
/* Pass 1: missing space after punctuation, or space on both sides. */
2030 c=g_utf8_get_char(aline);
2031 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2032 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2036 nc=g_utf8_get_char(g_utf8_next_char(s));
2037 /* For each character in the line after the first. */
2038 if (g_utf8_strchr(".?!,;:_",-1,c)) /* if it's punctuation */
2040 /* we need to suppress warnings for acronyms like M.D. */
2042 /* we need to suppress warnings for ellipsis . . . */
2045 * If there are letters on both sides of it or
2046 * if it's strict punctuation followed by an alpha.
2048 if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
2049 g_utf8_strchr("?!,;:",-1,c)))
2053 if (g_utf8_pointer_to_offset(aline,s)>2 &&
2054 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
2056 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
2062 if (pswit[ECHO_SWITCH])
2063 g_print("\n%s\n",aline);
2064 if (!pswit[OVERVIEW_SWITCH])
2065 g_print(" Line %ld column %ld - Missing space?\n",
2066 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2071 if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
2074 * If there are spaces on both sides,
2075 * or space before and end of line.
2079 if (g_utf8_pointer_to_offset(aline,s)>2 &&
2080 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
2082 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
2086 if (!isemptyline && !isellipsis)
2088 if (pswit[ECHO_SWITCH])
2089 g_print("\n%s\n",aline);
2090 if (!pswit[OVERVIEW_SWITCH])
2091 g_print(" Line %ld column %ld - "
2092 "Spaced punctuation?\n",linecnt,
2093 g_utf8_pointer_to_offset(aline,s)+1);
2100 /* Split out the characters that CANNOT be preceded by space. */
2101 c=g_utf8_get_char(aline);
2102 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2103 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2107 nc=g_utf8_get_char(g_utf8_next_char(s));
2108 /* for each character in the line after the first */
2109 if (g_utf8_strchr("?!,;:",-1,c))
2111 /* if it's punctuation that _cannot_ have a space before it */
2112 if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
2115 * If nc DOES == space,
2116 * it was already reported just above.
2118 if (pswit[ECHO_SWITCH])
2119 g_print("\n%s\n",aline);
2120 if (!pswit[OVERVIEW_SWITCH])
2121 g_print(" Line %ld column %ld - Spaced punctuation?\n",
2122 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2129 * Special case " .X" where X is any alpha.
2130 * This plugs a hole in the acronym code above.
2131 * Inelegant, but maintainable.
2133 c=g_utf8_get_char(aline);
2134 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2135 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2139 nc=g_utf8_get_char(g_utf8_next_char(s));
2140 /* for each character in the line after the first */
2143 /* if it's a period */
2144 if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
2147 * If the period follows a space and
2148 * is followed by a letter.
2150 if (pswit[ECHO_SWITCH])
2151 g_print("\n%s\n",aline);
2152 if (!pswit[OVERVIEW_SWITCH])
2153 g_print(" Line %ld column %ld - Spaced punctuation?\n",
2154 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Pass 4: quotes jammed between two non-space, non-punctuation characters. */
2160 c=g_utf8_get_char(aline);
2161 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2162 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2166 nc=g_utf8_get_char(g_utf8_next_char(s));
2167 /* for each character in the line after the first */
2170 if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
2171 !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
2172 !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
2174 if (pswit[ECHO_SWITCH])
2175 g_print("\n%s\n",aline);
2176 if (!pswit[OVERVIEW_SWITCH])
2177 g_print(" Line %ld column %ld - Unspaced quotes?\n",
2178 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2184 /* Check parity of quotes. */
2185 nc=g_utf8_get_char(aline);
2186 for (s=aline;*s;s=g_utf8_next_char(s))
2189 nc=g_utf8_get_char(g_utf8_next_char(s));
2192 parities->dquote=!parities->dquote;
2193 if (!parities->dquote)
2196 if (!g_utf8_strchr("_-.'`/,;:!?)]} ",-1,nc))
2198 if (pswit[ECHO_SWITCH])
2199 g_print("\n%s\n",aline);
2200 if (!pswit[OVERVIEW_SWITCH])
2201 g_print(" Line %ld column %ld - "
2202 "Wrongspaced quotes?\n",
2203 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/*
 * nc is a gunichar, so the GLib classifier is used: the <ctype.h>
 * isdigit() is undefined for values that are neither EOF nor
 * representable as unsigned char.
 */
2211 if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
2212 !g_utf8_strchr("_-/.'`([{$",-1,nc) || !nc)
2214 if (pswit[ECHO_SWITCH])
2215 g_print("\n%s\n",aline);
2216 if (!pswit[OVERVIEW_SWITCH])
2217 g_print(" Line %ld column %ld - "
2218 "Wrongspaced quotes?\n",
2219 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* A double quote at line start must not be followed by closing punctuation. */
2226 if (g_utf8_get_char(aline)==CHAR_DQUOTE)
2228 if (g_utf8_strchr(",;:!?)]} ",-1,
2229 g_utf8_get_char(g_utf8_next_char(aline))))
2231 if (pswit[ECHO_SWITCH])
2232 g_print("\n%s\n",aline);
2233 if (!pswit[OVERVIEW_SWITCH])
2234 g_print(" Line %ld column 1 - Wrongspaced quotes?\n",
2240 if (pswit[SQUOTE_SWITCH])
2242 nc=g_utf8_get_char(aline);
2243 for (s=aline;*s;s=g_utf8_next_char(s))
2246 nc=g_utf8_get_char(g_utf8_next_char(s));
/* Only a single quote with a non-letter on at least one side counts. */
2247 if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&
2248 !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
2249 !g_unichar_isalpha(nc)))
2251 parities->squote=!parities->squote;
2252 if (!parities->squote)
2255 if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
2257 if (pswit[ECHO_SWITCH])
2258 g_print("\n%s\n",aline);
2259 if (!pswit[OVERVIEW_SWITCH])
2260 g_print(" Line %ld column %ld - "
2261 "Wrongspaced singlequotes?\n",
2262 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Same gunichar-safe digit test as for double quotes. */
2270 if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
2271 !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
2273 if (pswit[ECHO_SWITCH])
2274 g_print("\n%s\n",aline);
2275 if (!pswit[OVERVIEW_SWITCH])
2276 g_print(" Line %ld column %ld - "
2277 "Wrongspaced singlequotes?\n",
2278 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2289 * check_for_double_punctuation:
2291 * Look for double punctuation like ,. or ,,
2292 * Thanks to DW for the suggestion!
2293 * In books with references, ".," and ".;" are common
2294 * e.g. "etc., etc.," and vol. 1.; vol 3.;
2295 * OTOH, from my initial tests, there are also fairly
2296 * common errors. What to do? Make these cases paranoid?
2297 * ".," is the most common, so warnings->dotcomma is used
2298 * to suppress detailed reporting if it occurs often.
/*
 * Query two adjacent punctuation characters (each one of . ? ! , ; :).
 *
 * aline:    NUL-terminated line to check.
 * warnings: settings; ->dotcomma controls whether "., " pairs are
 *           reported and ->isFrench enables the French ellipsis
 *           exceptions.
 *
 * Not queried: a doubled '.', '?' or '!'; ".," while warnings->dotcomma is
 * clear; and, for French texts, an ellipsis next to , ; : ! or ?.
 * NOTE(review): c is presumably the character at s; its assignment is not
 * visible here - confirm.
 */
2300 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
2304 nc=g_utf8_get_char(aline);
2305 for (s=aline;*s;s=g_utf8_next_char(s))
2308 nc=g_utf8_get_char(g_utf8_next_char(s));
2309 /* for each punctuation character in the line */
2310 if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
2311 g_utf8_strchr(".?!,;:",-1,nc))
2313 /* followed by punctuation, it's a query, unless . . . */
2314 if (c==nc && (c=='.' || c=='?' || c=='!') ||
2315 !warnings->dotcomma && c=='.' && nc==',' ||
2316 warnings->isFrench && g_str_has_prefix(s,",...") ||
2317 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2318 warnings->isFrench && g_str_has_prefix(s,";...") ||
2319 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2320 warnings->isFrench && g_str_has_prefix(s,":...") ||
2321 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2322 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2323 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2324 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2325 warnings->isFrench && g_str_has_prefix(s,"...?"))
/*
 * The French ellipsis patterns are tested again on their own.
 * NOTE(review): presumably to step s past the ellipsis before nc is
 * re-read below; the statement doing so is not visible here - confirm.
 */
2327 if (warnings->isFrench && g_str_has_prefix(s,",...") ||
2328 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2329 warnings->isFrench && g_str_has_prefix(s,";...") ||
2330 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2331 warnings->isFrench && g_str_has_prefix(s,":...") ||
2332 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2333 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2334 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2335 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2336 warnings->isFrench && g_str_has_prefix(s,"...?"))
2339 nc=g_utf8_get_char(g_utf8_next_char(s));
2341 ; /* do nothing for .. !! and ?? which can be legit */
2345 if (pswit[ECHO_SWITCH])
2346 g_print("\n%s\n",aline);
2347 if (!pswit[OVERVIEW_SWITCH])
2348 g_print(" Line %ld column %ld - Double punctuation?\n",
2349 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2358 * check_for_spaced_quotes:
/*
 * Query every quote mark in aline that has a space on both sides.
 *
 * aline: NUL-terminated line to search.
 *
 * Double quotes are searched for with a fixed pattern. For single quotes
 * a " <quote> " pattern is built in a GString for each entry of
 * single_quotes and searched for in turn.
 */
2360 void check_for_spaced_quotes(const char *aline)
2364 const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,
2368 while ((t=strstr(s," \" ")))
2370 if (pswit[ECHO_SWITCH])
2371 g_print("\n%s\n",aline);
2372 if (!pswit[OVERVIEW_SWITCH])
2373 g_print(" Line %ld column %ld - Spaced doublequote?\n",
2374 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
/*
 * Resume two characters on, at the space after the quote, so that space
 * can begin the next match.
 */
2377 s=g_utf8_next_char(g_utf8_next_char(t));
2379 pattern=g_string_new(NULL);
2380 for(i=0;i<G_N_ELEMENTS(single_quotes);i++)
2382 g_string_assign(pattern," ");
2383 g_string_append_unichar(pattern,single_quotes[i]);
2384 g_string_append_c(pattern,' ');
2386 while ((t=strstr(s,pattern->str)))
2388 if (pswit[ECHO_SWITCH])
2389 g_print("\n%s\n",aline);
2390 if (!pswit[OVERVIEW_SWITCH])
2391 g_print(" Line %ld column %ld - Spaced singlequote?\n",
2392 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2395 s=g_utf8_next_char(g_utf8_next_char(t));
/* TRUE also releases the character data held by the GString. */
2398 g_string_free(pattern,TRUE);
2402 * check_for_miscased_genative:
2404 * Check special case of 'S instead of 's at end of word.
/*
 * Query an apostrophe that follows a lowercase letter and is followed by
 * a capital 'S'.
 *
 * aline: NUL-terminated line to check.
 *
 * NOTE(review): c is presumably the character at s and pc the one before
 * it; their assignments are not visible here - confirm.
 */
2406 void check_for_miscased_genative(const char *aline)
2412 c=g_utf8_get_char(aline);
2413 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2414 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2418 nc=g_utf8_get_char(g_utf8_next_char(s));
2419 if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))
2421 if (pswit[ECHO_SWITCH])
2422 g_print("\n%s\n",aline);
2423 if (!pswit[OVERVIEW_SWITCH])
/* +2 reports the character after s, i.e. the 'S' if s is the apostrophe. */
2424 g_print(" Line %ld column %ld - Capital \"S\"?\n",
2425 linecnt,g_utf8_pointer_to_offset(aline,s)+2);
2433 * check_end_of_line:
2435 * Now check special cases - start and end of line -
2436 * for single and double quotes. Start is sometimes [sic]
2437 * but better to query it anyway.
2438 * While we're here, check for dash at end of line.
/*
 * Check the two ends of aline for spaced quotes and, in paranoid mode,
 * for a hyphen at the end of the line.
 *
 * aline:    NUL-terminated line to check.
 * warnings: settings; ->hyphen gates the end-of-line hyphen query.
 *
 * Queries made:
 *   - a double or single quote as the last character with a space before
 *     it;
 *   - a single quote as the first character with a space after it;
 *   - with pswit[PARANOID_SWITCH] and warnings->hyphen: a single '-' (not
 *     part of "--") as the last character, ignoring trailing characters at
 *     or below CHAR_SPACE.
 */
2440 void check_end_of_line(const char *aline,struct warnings *warnings)
2445 lbytes=strlen(aline);
/* At least two characters, so the look-behind below stays inside aline. */
2446 if (g_utf8_strlen(aline,lbytes)>1)
2448 s=g_utf8_prev_char(aline+lbytes);
2449 c1=g_utf8_get_char(s);
2450 c2=g_utf8_get_char(g_utf8_prev_char(s));
2451 if ((c1==CHAR_DQUOTE || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)
2453 if (pswit[ECHO_SWITCH])
2454 g_print("\n%s\n",aline);
2455 if (!pswit[OVERVIEW_SWITCH])
2456 g_print(" Line %ld column %ld - Spaced quote?\n",linecnt,
2457 g_utf8_strlen(aline,lbytes));
2461 c1=g_utf8_get_char(aline);
2462 c2=g_utf8_get_char(g_utf8_next_char(aline));
2463 if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)
2465 if (pswit[ECHO_SWITCH])
2466 g_print("\n%s\n",aline);
2467 if (!pswit[OVERVIEW_SWITCH])
2468 g_print(" Line %ld column 1 - Spaced quote?\n",linecnt);
2473 * Dash at end of line may well be legit - paranoid mode only
2474 * and don't report em-dash at line-end.
2476 if (pswit[PARANOID_SWITCH] && warnings->hyphen)
/* Back up over trailing blanks; s may end up on the first character. */
2478 for (s=g_utf8_prev_char(aline+lbytes);
2479 s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
/*
 * When s is the first character there is nothing before it: skip the
 * look-behind instead of reading ahead of the buffer. A lone leading dash
 * followed only by blanks is still queried.
 */
2481 if (g_utf8_get_char(s)=='-' &&
2482 (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-'))
2484 if (pswit[ECHO_SWITCH])
2485 g_print("\n%s\n",aline);
2486 if (!pswit[OVERVIEW_SWITCH])
2487 g_print(" Line %ld column %ld - "
2488 "Hyphen at end of line?\n",
2489 linecnt,g_utf8_pointer_to_offset(aline,s));
2496 * check_for_unspaced_bracket:
2498 * Brackets are often unspaced, but shouldn't be surrounded by alpha.
2499 * If so, suspect a scanno like "a]most".
/*
 * Query a bracket character (one of { [ ( ) ] }) that has a letter on
 * both sides.
 *
 * aline: NUL-terminated line to check.
 *
 * NOTE(review): c is presumably the character at s and pc the one before
 * it; their assignments are not visible here - confirm.
 */
2501 void check_for_unspaced_bracket(const char *aline)
2505 c=g_utf8_get_char(aline);
2506 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2507 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2511 nc=g_utf8_get_char(g_utf8_next_char(s));
2514 /* for each bracket character in the line except 1st & last */
2515 if (g_utf8_strchr("{[()]}",-1,c) &&
2516 g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
2518 if (pswit[ECHO_SWITCH])
2519 g_print("\n%s\n",aline);
2520 if (!pswit[OVERVIEW_SWITCH])
/*
 * NOTE(review): the column is the raw offset of s, without the +1 most
 * other checks add - confirm this is intended.
 */
2521 g_print(" Line %ld column %ld - Unspaced bracket?\n",
2522 linecnt,g_utf8_pointer_to_offset(aline,s));
2530 * check_for_unpunctuated_endquote:
/*
 * Query a double quote that directly follows a letter, i.e. a closing
 * quote with no punctuation before it.
 *
 * aline: NUL-terminated line to check, walked as UTF-8.
 *
 * NOTE(review): c is presumably the character at s and pc the one before
 * it; their assignments are not visible here - confirm.
 */
2532 void check_for_unpunctuated_endquote(const char *aline)
2536 c=g_utf8_get_char(aline);
2537 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2538 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2542 nc=g_utf8_get_char(g_utf8_next_char(s));
2543 /* for each character in the line except 1st */
/*
 * pc is a gunichar, so the GLib classifier is used: the <ctype.h>
 * isalpha() is undefined for values that are neither EOF nor
 * representable as unsigned char, and would miss non-ASCII letters.
 */
2544 if (c==CHAR_DQUOTE && g_unichar_isalpha(pc))
2546 if (pswit[ECHO_SWITCH])
2547 g_print("\n%s\n",aline);
2548 if (!pswit[OVERVIEW_SWITCH])
2549 g_print(" Line %ld column %ld - "
2550 "endquote missing punctuation?\n",
2551 linecnt,g_utf8_pointer_to_offset(aline,s));
2559 * check_for_html_tag:
2561 * Check for <HTML TAG>.
2563 * If there is a < in the line, followed at some point
2564 * by a > then we suspect HTML.
/*
 * Query the first '<' in aline that is followed somewhere later on the
 * line by a '>', and print the text between them as a suspected HTML tag.
 *
 * aline: NUL-terminated line to search.
 */
2566 void check_for_html_tag(const char *aline)
2568 const char *open,*close;
2570 open=strchr(aline,'<');
2573 close=strchr(g_utf8_next_char(open),'>');
2576 if (pswit[ECHO_SWITCH])
2577 g_print("\n%s\n",aline);
2578 if (!pswit[OVERVIEW_SWITCH])
/* Copy the span from '<' to '>' inclusive for the message. */
2580 tag=g_strndup(open,close-open+1);
2581 g_print(" Line %ld column %ld - HTML Tag? %s \n",
2582 linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
2592 * check_for_html_entity:
2594 * Check for &symbol; HTML.
2596 * If there is a & in the line, followed at
2597 * some point by a ; then we suspect HTML.
/*
 * Query the first '&' in aline that is followed later on the line by a
 * ';', and print the text between them as a suspected HTML entity.
 *
 * aline: NUL-terminated line to search, treated as UTF-8.
 *
 * The span from '&' to ';' is scanned for a space so that text such as
 * "Jones & Son;" can be told apart from an entity.
 */
2599 void check_for_html_entity(const char *aline)
2601 const char *s,*amp,*scolon;
2603 amp=strchr(aline,'&');
2606 scolon=strchr(amp,';');
2609 for (s=amp;s<scolon;s=g_utf8_next_char(s))
2610 if (g_utf8_get_char(s)==CHAR_SPACE)
2611 break; /* Don't report "Jones & Son;" */
2614 if (pswit[ECHO_SWITCH])
2615 g_print("\n%s\n",aline);
2616 if (!pswit[OVERVIEW_SWITCH])
/* Copy the span from '&' to ';' inclusive for the message. */
2618 entity=g_strndup(amp,scolon-amp+1);
/*
 * Report the column in characters, like check_for_html_tag(): a byte
 * offset would be too large when multi-byte characters precede the '&'.
 */
2619 g_print(" Line %ld column %ld - HTML symbol? %s \n",
2620 linecnt,g_utf8_pointer_to_offset(aline,amp)+1,entity);
2631 * check_for_omitted_punctuation:
2633 * Check for omitted punctuation at end of paragraph by working back
2634 * through prevline. DW.
2635 * Need to check this only for "normal" paras.
2636 * So what is a "normal" para?
2637 * Not normal if one-liner (chapter headings, etc.)
2638 * Not normal if it doesn't contain at least one lowercase letter
2639 * Not normal if starts with space
/*
 * Examines prevline and prints a "No punctuation at para end?" query,
 * reported against line linecnt-1, when a backwards scan from the end of
 * prevline reaches a letter.
 * last supplies blen and start_para_line is compared with linecnt; both
 * are used only in the gating condition below.
 * NOTE(review): the local declarations, the loop exits and the action
 * taken when a punctuation character is found are not visible in this
 * listing - confirm against the full file.
 */
2641 void check_for_omitted_punctuation(const char *prevline,
2642 struct line_properties *last,int start_para_line)
2644 gboolean letter_on_line=FALSE;
/* Record whether prevline contains any alphabetic character. */
2647 for (s=prevline;*s;s=g_utf8_next_char(s))
2648 if (g_unichar_isalpha(g_utf8_get_char(s)))
2650 letter_on_line=TRUE;
2654 * This next "if" is a problem.
2655 * If we say "start_para_line <= linecnt - 1", that includes
2656 * one-line "paragraphs" like chapter heads. Lotsa false positives.
2657 * If we say "start_para_line < linecnt - 1" it doesn't, but then it
2658 * misses genuine one-line paragraphs.
2660 if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
2661 g_utf8_get_char(prevline)>CHAR_SPACE)
/* Start at the terminating NUL of prevline and work backwards. */
2663 s=prevline+strlen(prevline);
/* Step back over characters for which CHAR_IS_CLOSING_QUOTE holds. */
2666 s=g_utf8_prev_char(s);
2667 c=g_utf8_get_char(s);
2668 } while (CHAR_IS_CLOSING_QUOTE(c) && c>CHAR_SPACE && s>prevline);
/* Continue backwards; the first character of prevline is not examined. */
2669 for (;s>prevline;s=g_utf8_prev_char(s))
2671 if (g_unichar_isalpha(g_utf8_get_char(s)))
2673 if (pswit[ECHO_SWITCH])
2674 g_print("\n%s\n",prevline);
/* The reported column is the length of prevline in characters. */
2675 if (!pswit[OVERVIEW_SWITCH])
2676 g_print(" Line %ld column %ld - "
2677 "No punctuation at para end?\n",
2678 linecnt-1,g_utf8_strlen(prevline,-1));
/* Characters treated as punctuation for the purpose of this scan. */
2683 if (g_utf8_strchr("-.:!([{?}])",-1,g_utf8_get_char(s)))
/*
 * Tree-traversal callback: procfile() passes it to g_tree_foreach() over
 * the qword tree. It prints a note naming a queried word and a count.
 * NOTE(review): the condition around the g_print(), its argument list
 * and the return statement are not visible in this listing - confirm
 * against the full file.
 */
2689 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
/* The tree key is used as the word text. */
2691 const char *word=key;
2694 g_print("\nNote: Queried word %s was duplicated %d times\n",
/*
 * Converts string from UTF-8 to WINDOWS-1252 with g_iconv(). The
 * conversion descriptor is kept in a function-static variable between
 * calls, with (GIConv)-1 meaning "not open".
 * NOTE(review): the conditions selecting the close path and the fputs()
 * path, and the code that writes and frees buf, are not visible in this
 * listing. procfile() calls this function with NULL, which presumably
 * selects the path that closes the converter - confirm.
 */
2699 void print_as_windows_1252(const char *string)
2701 gsize inbytes,outbytes;
2703 static GIConv converter=(GIConv)-1;
/* Close the cached descriptor, if open, and mark it as not open. */
2706 if (converter!=(GIConv)-1)
2707 g_iconv_close(converter);
2708 converter=(GIConv)-1;
/* Open the descriptor on first use; it is then reused by later calls. */
2711 if (converter==(GIConv)-1)
2712 converter=g_iconv_open("WINDOWS-1252","UTF-8");
2713 if (converter!=(GIConv)-1)
/* The output buffer is sized to the input byte length plus one. */
2715 inbytes=outbytes=strlen(string);
2716 bp=buf=g_malloc(outbytes+1);
2717 g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
/* Writes string to stdout without conversion. */
2723 fputs(string,stdout);
/* Writes string to stdout with fputs(), performing no conversion. */
2726 void print_as_utf_8(const char *string)
2728 fputs(string,stdout);
/*
 * Processes one file: loads it with read_etext(), runs first_pass() and
 * report_first_pass() over the text, then takes lines one at a time from
 * flgets() and hands each to the per-line check_for_*() routines, and
 * finally reports duplicate queries and releases the per-file state.
 * NOTE(review): many lines of this function (braces, some declarations,
 * several tests and assignments) are not visible in this listing; the
 * comments below describe only the statements that are shown.
 */
2736 void procfile(const char *filename)
2739 gchar *parastart=NULL; /* first line of current para */
2740 gchar *etext,*aline;
2743 struct first_pass_results *first_pass_results;
2744 struct warnings *warnings;
2745 struct counters counters={0};
2746 struct line_properties last={0};
2747 struct parities parities={0};
2748 struct pending pending={0};
2749 gboolean isemptyline;
2750 long start_para_line=0;
2751 gboolean isnewpara=FALSE,enddash=FALSE;
2752 last.start=CHAR_SPACE;
2753 linecnt=checked_linecnt=0;
/* Load the text; err receives the failure details reported below. */
2754 etext=read_etext(filename,&err);
/* The error message goes to stdout or stderr depending on STDOUT_SWITCH. */
2757 if (pswit[STDOUT_SWITCH])
2758 fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
2760 fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
2763 g_print("\n\nFile: %s\n\n",filename);
2764 first_pass_results=first_pass(etext);
2765 warnings=report_first_pass(first_pass_results);
/*
 * Two trees ordered with strcmp(). Keys of both are released with
 * g_free; values are released with g_free for qword only.
 */
2766 qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
2767 qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
2769 * Here we go with the main pass. Hold onto yer hat!
/* flgets() is given the number the line will have, linecnt+1. */
2773 while ((aline=flgets(&etext_ptr,linecnt+1)))
2778 if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
2779 continue; // skip DP page separators completely
/*
 * Lines numbered below firstline, or above footerline when footerline
 * is positive, are not checked.
 */
2780 if (linecnt<first_pass_results->firstline ||
2781 (first_pass_results->footerline>0 &&
2782 linecnt>first_pass_results->footerline))
/* With HEADER_SWITCH, selected header fields are echoed. */
2784 if (pswit[HEADER_SWITCH])
2786 if (g_str_has_prefix(aline,"Title:"))
2787 g_print(" %s\n",aline);
2788 if (g_str_has_prefix(aline,"Author:"))
2789 g_print(" %s\n",aline);
2790 if (g_str_has_prefix(aline,"Release Date:"))
2791 g_print(" %s\n",aline);
2792 if (g_str_has_prefix(aline,"Edition:"))
2793 g_print(" %s\n\n",aline);
2795 continue; /* skip through the header */
2798 print_pending(aline,parastart,&pending);
2799 isemptyline=analyse_quotes(aline,&counters);
2800 if (isnewpara && !isemptyline)
2802 /* This line is the start of a new paragraph. */
2803 start_para_line=linecnt;
2804 /* Capture its first line in case we want to report it later. */
2806 parastart=g_strdup(aline);
2807 memset(&parities,0,sizeof(parities)); /* restart the quote count */
/* Skip forward to the first letter or digit, or to the end of the line. */
2809 while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
2810 !g_unichar_isdigit(g_utf8_get_char(s)))
2811 s=g_utf8_next_char(s);
2812 if (g_unichar_islower(g_utf8_get_char(s)))
2814 /* and its first letter is lowercase */
2815 if (pswit[ECHO_SWITCH])
2816 g_print("\n%s\n",aline);
2817 if (!pswit[OVERVIEW_SWITCH])
2818 g_print(" Line %ld column %ld - "
2819 "Paragraph starts with lower-case\n",
2820 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2824 isnewpara=FALSE; /* Signal the end of new para processing. */
2826 /* Check for an em-dash broken at line end. */
2827 if (enddash && g_utf8_get_char(aline)=='-')
2829 if (pswit[ECHO_SWITCH])
2830 g_print("\n%s\n",aline);
2831 if (!pswit[OVERVIEW_SWITCH])
2832 g_print(" Line %ld column 1 - Broken em-dash?\n",linecnt);
/*
 * Walk back from the last character over trailing spaces.
 * NOTE(review): for an empty aline the starting point is the position
 * before aline itself - confirm that this case is handled or harmless.
 */
2837 for (s=g_utf8_prev_char(aline+strlen(aline));
2838 g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
/* Tests whether the last non-space character is a hyphen. */
2840 if (s>=aline && g_utf8_get_char(s)=='-')
2842 check_for_control_characters(aline);
2844 check_for_odd_characters(aline,warnings,isemptyline);
/* The long and short line checks run only when their warning flag is set. */
2845 if (warnings->longline)
2846 check_for_long_line(aline);
2847 if (warnings->shortline)
2848 check_for_short_line(aline,&last);
/* Remember this line's length in characters and its first character. */
2850 last.len=g_utf8_strlen(aline,-1);
2851 last.start=g_utf8_get_char(aline);
2852 check_for_starting_punctuation(aline);
2855 check_for_spaced_emdash(aline);
2856 check_for_spaced_dash(aline);
2858 check_for_unmarked_paragraphs(aline);
2859 check_for_jeebies(aline);
2860 check_for_mta_from(aline);
2861 check_for_orphan_character(aline);
2862 check_for_pling_scanno(aline);
2863 check_for_extra_period(aline,warnings);
2864 check_for_following_punctuation(aline);
2865 check_for_typos(aline,warnings);
2866 check_for_misspaced_punctuation(aline,&parities,isemptyline);
2867 check_for_double_punctuation(aline,warnings);
2868 check_for_spaced_quotes(aline);
2869 check_for_miscased_genative(aline);
2870 check_end_of_line(aline,warnings);
2871 check_for_unspaced_bracket(aline);
/* The endquote check runs only when its warning flag is set. */
2872 if (warnings->endquote)
2873 check_for_unpunctuated_endquote(aline);
2874 check_for_html_tag(aline);
2875 check_for_html_entity(aline);
2878 check_for_mismatched_quotes(&counters,&pending);
2879 counters_reset(&counters);
2880 /* let the next iteration know that it's starting a new para */
2883 check_for_omitted_punctuation(prevline,&last,start_para_line);
/* Keep a private copy of this line in prevline. */
2886 prevline=g_strdup(aline);
2889 check_for_mismatched_quotes(&counters,&pending);
2890 print_pending(NULL,parastart,&pending);
2891 reset_pending(&pending);
/* Duplicate-query notes are skipped in overview and in verbose mode. */
2900 if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])
2901 g_tree_foreach(qword,report_duplicate_queries,NULL);
/* Release the per-file trees and counters. */
2902 g_tree_unref(qword);
2903 g_tree_unref(qperiod);
2904 counters_destroy(&counters);
2905 g_set_print_handler(NULL);
2906 print_as_windows_1252(NULL);
2907 if (pswit[MARKUP_SWITCH])
2914 * Get one line from the input text, checking for
2915 * the existence of exactly one CR/LF line-end per line.
2917 * Returns: a pointer to the line.
/*
 * Takes the next line of text starting at *etext and advances *etext as
 * characters are consumed. lcnt is the line number used in the line-end
 * messages. The three line-end queries (LF without CR, two successive
 * CRs, CR without LF) are printed only when LINE_END_SWITCH is set.
 * NOTE(review): the loop structure, the tests that classify each
 * character, the termination of the line and the return statement are
 * not visible in this listing - confirm against the full file.
 */
2919 char *flgets(char **etext,long lcnt)
2922 gboolean isCR=FALSE;
/* The line begins where the previous call left *etext. */
2923 char *theline=*etext;
/* Read one UTF-8 character and step *etext past it. */
2928 c=g_utf8_get_char(*etext);
2929 *etext=g_utf8_next_char(*etext);
2932 /* either way, it's end of line */
2939 /* Error - a LF without a preceding CR */
2940 if (pswit[LINE_END_SWITCH])
2942 if (pswit[ECHO_SWITCH])
/* Echo a temporary copy of the line text from theline up to eos. */
2944 s=g_strndup(theline,eos-theline);
2945 g_print("\n%s\n",s);
2948 if (!pswit[OVERVIEW_SWITCH])
2949 g_print(" Line %ld - No CR?\n",lcnt);
2960 /* Error - two successive CRs */
2961 if (pswit[LINE_END_SWITCH])
2963 if (pswit[ECHO_SWITCH])
2965 s=g_strndup(theline,eos-theline);
2966 g_print("\n%s\n",s);
2969 if (!pswit[OVERVIEW_SWITCH])
2970 g_print(" Line %ld - Two successive CRs?\n",lcnt);
/* A CR is pending (isCR) when this point is reached. */
2979 if (pswit[LINE_END_SWITCH] && isCR)
2981 if (pswit[ECHO_SWITCH])
2983 s=g_strndup(theline,eos-theline);
2984 g_print("\n%s\n",s);
2987 if (!pswit[OVERVIEW_SWITCH])
2988 g_print(" Line %ld column %ld - CR without LF?\n",
2989 lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
2995 eos=g_utf8_next_char(eos);
/* Optional in-place clean-up of the line, selected by switches. */
2999 if (pswit[MARKUP_SWITCH])
3000 postprocess_for_HTML(theline);
3001 if (pswit[DP_SWITCH])
3002 postprocess_for_DP(theline);
3009 * Takes a "word" as a parameter, and checks whether it
3010 * contains a mixture of alpha and digits. Generally, this is an
3011 * error, but may not be for cases like 4th or L5 12s. 3d.
3013 * Returns: TRUE iff an error is found.
/*
 * Decides whether checkword mixes letters and digits in a way worth
 * querying. It first scans the word for alphabetic and digit characters,
 * then compares what follows the leading digits against a list of
 * accepted suffixes, and finally handles a leading 'L' by recursing on
 * the rest of the word.
 * NOTE(review): the assignments made inside each test and the return
 * statement are not visible in this listing - confirm against the full
 * file.
 */
3015 gboolean mixdigit(const char *checkword)
3017 gboolean wehaveadigit,wehavealetter,query;
3018 const char *s,*nondigit;
3019 wehaveadigit=wehavealetter=query=FALSE;
/* Classify every character of the word as letter, digit or neither. */
3020 for (s=checkword;*s;s=g_utf8_next_char(s))
3021 if (g_unichar_isalpha(g_utf8_get_char(s)))
3023 else if (g_unichar_isdigit(g_utf8_get_char(s)))
3025 if (wehaveadigit && wehavealetter)
3027 /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
/* Advance nondigit past the leading run of digits. */
3029 for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
3030 nondigit=g_utf8_next_char(nondigit))
3032 /* digits, ending in st, rd, nd, th of either case */
3033 if (!g_ascii_strcasecmp(nondigit,"st") ||
3034 !g_ascii_strcasecmp(nondigit,"rd") ||
3035 !g_ascii_strcasecmp(nondigit,"nd") ||
3036 !g_ascii_strcasecmp(nondigit,"th"))
/* The same suffixes followed by "s", compared case-insensitively. */
3038 if (!g_ascii_strcasecmp(nondigit,"sts") ||
3039 !g_ascii_strcasecmp(nondigit,"rds") ||
3040 !g_ascii_strcasecmp(nondigit,"nds") ||
3041 !g_ascii_strcasecmp(nondigit,"ths"))
/* The same suffixes followed by "ly", compared case-insensitively. */
3043 if (!g_ascii_strcasecmp(nondigit,"stly") ||
3044 !g_ascii_strcasecmp(nondigit,"rdly") ||
3045 !g_ascii_strcasecmp(nondigit,"ndly") ||
3046 !g_ascii_strcasecmp(nondigit,"thly"))
3048 /* digits, ending in l, L, s or d */
/* "l" matches in either case; "s" and "d" match in lowercase only. */
3049 if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
3050 !strcmp(nondigit,"d"))
3053 * L at the start of a number, representing Britsh pounds, like L500.
3054 * This is cute. We know the current word is mixed digit. If the first
3055 * letter is L, there must be at least one digit following. If both
3056 * digits and letters follow, we have a genuine error, else we have a
3057 * capital L followed by digits, and we accept that as a non-error.
/* Recurse on the word with its first character removed. */
3059 if (g_utf8_get_char(checkword)=='L' &&
3060 !mixdigit(g_utf8_next_char(checkword)))
3069 * Extracts the first/next "word" from the line, and returns it.
3070 * A word is defined as one English word unit--or at least that's the aim.
3071 * "ptr" is advanced to the position in the line where we will start
3072 * looking for the next word.
3074 * Returns: A newly-allocated string.
/*
 * Extracts the next word from the text at *ptr into a GString and
 * returns its character data; g_string_free(word,FALSE) hands the
 * buffer to the caller. *ptr is advanced as characters are consumed.
 * NOTE(review): the local declarations, the initialisation of s, any
 * guard around the look-ahead scan and the update of *ptr on the
 * punctuated-number path are not visible in this listing - confirm
 * against the full file.
 */
3076 gchar *getaword(const char **ptr)
3081 word=g_string_new(NULL);
/* Skip everything up to the first digit or letter, or the end of text. */
3082 for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
3083 !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
3084 **ptr;*ptr=g_utf8_next_char(*ptr))
3087 * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
3088 * Especially yucky is the case of L1,000
3089 * This section looks for a pattern of characters including a digit
3090 * followed by a comma or period followed by one or more digits.
3091 * If found, it returns this whole pattern as a word; otherwise we discard
3092 * the results and resume our normal programming.
/* Collect a run of digits, letters, commas and periods into word. */
3095 for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
3096 g_unichar_isalpha(g_utf8_get_char(s)) ||
3097 g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
3098 g_string_append_unichar(word,g_utf8_get_char(s));
/* From the second character on, look for '.' or ',' right after a digit. */
3101 for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))
3103 c=g_utf8_get_char(t);
3104 pc=g_utf8_get_char(g_utf8_prev_char(t));
3105 if ((c=='.' || c==',') && g_unichar_isdigit(pc))
3108 return g_string_free(word,FALSE);
3112 /* we didn't find a punctuated number - do the regular getword thing */
/* Discard the look-ahead text and start collecting again from *ptr. */
3113 g_string_truncate(word,0);
3114 c=g_utf8_get_char(*ptr);
/* A regular word is a run of digits, letters and apostrophes. */
3115 for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);
3116 *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))
3117 g_string_append_unichar(word,c);
3118 return g_string_free(word,FALSE);
3124 * Is this word a Roman Numeral?
3126 * It doesn't actually validate that the number is a valid Roman Numeral--for
3127 * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
3128 * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
3129 * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
3130 * expressions thereof, except when it came to taxes. Allow any number of M,
3131 * an optional D, an optional CM or CD, any number of optional Cs, an optional
3132 * XL or an optional XC, an optional IX or IV, an optional V and any number
/*
 * Tests t against a fixed left-to-right sequence of Roman-numeral
 * components: m, d, cm, cd, c, xl, xc, l, x, ix, iv, v, i.
 * All comparisons use lowercase letters.
 * NOTE(review): the statements that advance t after each match, any
 * preparation of t before the tests (for example case folding) and the
 * return statement are not visible in this listing - confirm against
 * the full file.
 */
3135 gboolean isroman(const char *t)
/* A repeatable component; the "&& *t" part is implied by the match. */
3141 while (g_utf8_get_char(t)=='m' && *t)
3143 if (g_utf8_get_char(t)=='d')
3145 if (g_str_has_prefix(t,"cm"))
3147 if (g_str_has_prefix(t,"cd"))
/* A repeatable component. */
3149 while (g_utf8_get_char(t)=='c' && *t)
3151 if (g_str_has_prefix(t,"xl"))
3153 if (g_str_has_prefix(t,"xc"))
3155 if (g_utf8_get_char(t)=='l')
/* A repeatable component. */
3157 while (g_utf8_get_char(t)=='x' && *t)
3159 if (g_str_has_prefix(t,"ix"))
3161 if (g_str_has_prefix(t,"iv"))
3163 if (g_utf8_get_char(t)=='v')
/* A repeatable component. */
3165 while (g_utf8_get_char(t)=='i' && *t)
3171 * postprocess_for_DP:
3173 * Invoked with the -d switch from flgets().
3174 * It simply "removes" from the line a hard-coded set of common
3175 * DP-specific tags, so that the line passed to the main routine has
3176 * been pre-cleaned of DP markup.
/*
 * Removes, in place, every occurrence in theline of each string listed
 * in the DPmarkup table.
 * NOTE(review): the local declarations are not visible in this listing.
 */
3178 void postprocess_for_DP(char *theline)
/* The table is walked until an entry whose first byte is NUL. */
3184 for (i=0;*DPmarkup[i];i++)
/* Repeat until this entry no longer occurs anywhere in the line. */
3185 while ((s=strstr(theline,DPmarkup[i])))
/* t points just past the match; move the tail, with its NUL, over it. */
3187 t=s+strlen(DPmarkup[i]);
3188 memmove(s,t,strlen(t)+1);
3193 * postprocess_for_HTML:
3195 * Invoked with the -m switch from flgets().
3196 * It simply "removes" from the line a hard-coded set of common
3197 * HTML tags and "replaces" a hard-coded set of common HTML
3198 * entities, so that the line passed to the main routine has
3199 * been pre-cleaned of HTML.
/*
 * Cleans theline in place: losemarkup() is called for as long as it
 * returns a non-null pointer, then loseentities() is called once.
 * NOTE(review): the body of the while loop is not visible in this
 * listing; presumably it is empty - confirm against the full file.
 */
3201 void postprocess_for_HTML(char *theline)
3203 while (losemarkup(theline))
3205 loseentities(theline);
/*
 * Finds the first '<' in theline and the first '>' at or after it, and
 * compares the text after the '<' with each entry of the markup table
 * using tagcomp(). On a match the span from '<' through '>' is removed
 * from the line in place.
 * NOTE(review): the local declarations, the handling of a missing '<'
 * or '>', the treatment of an unrecognised tag and the return
 * statements are not visible in this listing - confirm against the
 * full file.
 */
3208 char *losemarkup(char *theline)
3212 s=strchr(theline,'<');
/* Only look for '>' when a '<' was found. */
3213 t=s?strchr(s,'>'):NULL;
/* The table is walked until an entry whose first byte is NUL. */
3216 for (i=0;*markup[i];i++)
3217 if (tagcomp(g_utf8_next_char(s),markup[i]))
/* Step past the '>' and move the tail, with its NUL, over the tag. */
3219 t=g_utf8_next_char(t);
3220 memmove(s,t,strlen(t)+1);
3223 /* It's an unrecognized <xxx>. */
/*
 * Replaces HTML entities in theline, in place. Numeric references are
 * parsed with strtol(); other names are looked up in a tree built from
 * the HTMLentities table. Two iconv descriptors are kept in
 * function-static variables, with (GIConv)-1 meaning "not open".
 * NOTE(review): the local declarations, the conditions selecting the
 * clean-up path and the table-building path, the test for a "&#"
 * prefix and several else branches and frees are not visible in this
 * listing - confirm against the full file.
 */
3227 void loseentities(char *theline)
/*
 * NOTE(review): entities is shown here without "static", unlike the two
 * descriptors on the next line - confirm how its lifetime is managed.
 */
3234 GTree *entities=NULL;
3235 static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
/* Clean-up: destroy the tree and close any open descriptors. */
3239 g_tree_destroy(entities);
3241 if (translit!=(GIConv)-1)
3242 g_iconv_close(translit);
3243 translit=(GIConv)-1;
3244 if (to_utf8!=(GIConv)-1)
3245 g_iconv_close(to_utf8);
/* Build a tree ordered with strcmp(): entity name -> code point. */
3253 entities=g_tree_new((GCompareFunc)strcmp);
3254 for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
3255 g_tree_insert(entities,HTMLentities[i].name,
3256 GUINT_TO_POINTER(HTMLentities[i].c));
/* Open the two conversion descriptors if they are not open yet. */
3258 if (translit==(GIConv)-1)
3259 translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
3260 if (to_utf8==(GIConv)-1)
3261 to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
/* Handle each '&' in turn; theline is advanced as the loop proceeds. */
3262 while((amp=strchr(theline,'&')))
3264 scolon=strchr(amp,';');
/* Decimal reference: only digits between amp+2 and the ';'. */
3269 if (amp+2+strspn(amp+2,"0123456789")==scolon)
3270 c=strtol(amp+2,NULL,10);
/* Hexadecimal reference: 'x' at amp[2], then only hex digits. */
3271 else if (amp[2]=='x' &&
3272 amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
3273 c=strtol(amp+3,NULL,16);
/* Named entity: look up the text between '&' and ';' in the tree. */
3277 s=g_strndup(amp+1,scolon-(amp+1));
3278 c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
/* && binds tighter than ||: c<128, or c within 192..255 inclusive. */
3287 if (c<128 || c>=192 && c<=255) /* An ISO-8859-1 character */
3288 theline+=g_unichar_to_utf8(c,theline);
/* Otherwise convert through ISO_8859-1//TRANSLIT and back to UTF-8. */
3292 nb=g_unichar_to_utf8(c,s);
3293 t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
3295 s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
3297 memcpy(theline,s,nb);
/* Move the text after the ';', with its NUL, up to theline. */
3301 memmove(theline,g_utf8_next_char(scolon),
3302 strlen(g_utf8_next_char(scolon))+1);
/* Resume the search just after this '&'. */
3305 theline=g_utf8_next_char(amp);
/*
 * Compares the start of strin with basetag after case-folding both with
 * g_utf8_casefold(); a leading '/' in strin is skipped first. retval is
 * TRUE when the folded strin begins with the folded basetag.
 * NOTE(review): the local declarations, the release of the folded
 * copies and the return statement are not visible in this listing.
 */
3309 gboolean tagcomp(const char *strin,const char *basetag)
3313 if (g_utf8_get_char(strin)=='/')
3314 t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
3316 t=g_utf8_casefold(strin,-1);
3317 s=g_utf8_casefold(basetag,-1);
3318 retval=g_str_has_prefix(t,s);
3324 void proghelp(GOptionContext *context)
3327 fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
3328 fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
3329 fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
3330 fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
3331 "For details, read the file COPYING.\n",stderr);
3332 fputs("This is Free Software; "
3333 "you may redistribute it under certain conditions (GPL);\n",stderr);
3334 fputs("read the file COPYING for details.\n\n",stderr);
3335 help=g_option_context_get_help(context,TRUE,NULL);
3338 fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
3339 fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
3340 "non-ASCII\n",stderr);
3341 fputs("characters like accented letters, "
3342 "lines longer than 75 or shorter than 55,\n",stderr);
3343 fputs("unbalanced quotes or brackets, "
3344 "a variety of badly formatted punctuation, \n",stderr);
3345 fputs("HTML tags, some likely typos. "
3346 "It is NOT a substitute for human judgement.\n",stderr);