1 /*************************************************************************/
2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com> */
5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk> */
7 /* This program is free software; you can redistribute it and/or modify */
8 /* it under the terms of the GNU General Public License as published by */
9 /* the Free Software Foundation; either version 2 of the License, or */
10 /* (at your option) any later version. */
12 /* This program is distributed in the hope that it will be useful, */
13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
15 /* GNU General Public License for more details. */
17 /* You should have received a copy of the GNU General Public License */
18 /* along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 /*************************************************************************/
30 #include "bookloupe.h"
33 #include "HTMLentities.h"
/*
 * Character set selected for the e-text, plus the iconv descriptor that
 * set_charset() opens to validate characters against it. A descriptor value
 * of (GIConv)-1 means that no descriptor is currently open.
 */
35 gchar *charset; /* Or NULL for auto (ISO_8859-1/ASCII or UNICODE) */
36 GIConv charset_validator=(GIConv)-1;
42 "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
43 "nad", "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa",
44 "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
45 "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
46 "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
47 "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
48 "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
49 "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
50 "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
51 "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
52 "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
53 "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
54 "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
55 "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
56 "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
57 "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
58 "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
59 "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
60 "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
61 "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
62 "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
63 "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
64 "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
65 "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
66 "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
67 "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
68 "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
69 "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
70 "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
76 /* Common abbreviations and other OK words not to query as typos. */
78 "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
79 "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
80 "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
81 "outbid", "outbids", "frostbite", "frostbitten", ""
84 /* Common abbreviations that cause otherwise unexplained periods. */
86 "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
87 "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
91 * Two-Letter combinations that rarely if ever start words,
92 * but are common scannos or otherwise common letter combinations.
95 "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
99 * Two-Letter combinations that rarely if ever end words,
100 * but are common scannos or otherwise common letter combinations.
103 "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
104 "sw", "gr", "sl", "cl", "iy", ""
108 "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
109 "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
110 "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
111 "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
115 "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
119 "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
120 "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
121 "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
122 "during", "let", "toward", "among", ""
126 "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
127 "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
128 "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
129 "among", "those", "into", "whom", "having", "thence", ""
/* One flag per command-line switch, indexed by the *_SWITCH constants. */
132 gboolean pswit[SWITNO]; /* program switches */
/*
 * Command-line option table, registered with the GOption parser in
 * parse_options(). Each flag option stores directly into its pswit[] slot;
 * --charset stores its argument string in opt_charset.
 * Note that "relaxed", "line-end" and "noecho" are negative switches:
 * parse_options() inverts the corresponding pswit[] entries after parsing.
 */
135 static GOptionEntry options[]={
136 { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
137 "Ignore DP-specific markup", NULL },
138 { "noecho", 'e', 0, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
139 "Don't echo queried line", NULL },
140 { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
141 "Check single quotes", NULL },
142 { "typo", 't', 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
143 "Check common typos", NULL },
144 { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
145 "Require closure of quotes on every paragraph", NULL },
146 { "relaxed", 'x', 0, G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
147 "Disable paranoid querying of everything", NULL },
148 { "line-end", 'l', 0, G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
149 "Disable line end checking", NULL },
150 { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
151 "Overview: just show counts", NULL },
152 { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
153 "Output errors to stdout instead of stderr", NULL },
154 { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
155 "Echo header fields", NULL },
156 { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
157 "Ignore markup in < >", NULL },
158 { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
159 "Use file of user-defined typos", NULL },
160 { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
161 "Defaults for use on www upload", NULL },
162 { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
163 "Verbose - list everything", NULL },
/* The only option that takes an argument; it has no short form. */
164 { "charset", 0, 0, G_OPTION_ARG_STRING, &opt_charset,
165 "Set of characters valid for this ebook", "NAME" },
/*
 * Query counters. In overview mode main() prints most of the cnt_* values
 * individually and adds them together for the TOTAL QUERIES line;
 * cnt_spacend is not part of that total.
 */
169 long cnt_quote; /* for overview mode, count of quote queries */
170 long cnt_brack; /* for overview mode, count of brackets queries */
171 long cnt_bin; /* for overview mode, count of non-ASCII queries */
172 long cnt_odd; /* for overview mode, count of odd character queries */
173 long cnt_long; /* for overview mode, count of long line errors */
174 long cnt_short; /* for overview mode, count of short line queries */
175 long cnt_punct; /* for overview mode,
176 count of punctuation and spacing queries */
177 long cnt_dash; /* for overview mode, count of dash-related queries */
178 long cnt_word; /* for overview mode, count of word queries */
179 long cnt_html; /* for overview mode, count of html queries */
180 long cnt_lineend; /* for overview mode, count of line-end queries */
181 long cnt_spacend; /* count of lines with space at end */
182 long linecnt; /* count of total lines in the file */
183 long checked_linecnt; /* count of lines actually checked */
/*
 * Forward declarations. Not every definition is visible in this part of
 * the file.
 */
185 void proghelp(GOptionContext *context);
186 void procfile(const char *);
190 gboolean mixdigit(const char *);
191 gchar *getaword(const char **);
192 char *flgets(char **,long);
193 void postprocess_for_HTML(char *);
194 char *linehasmarkup(char *);
195 char *losemarkup(char *);
196 gboolean tagcomp(const char *,const char *);
197 void loseentities(char *);
198 gboolean isroman(const char *);
199 void postprocess_for_DP(char *);
/* Print handlers installed by read_etext() depending on the input encoding. */
200 void print_as_windows_1252(const char *string);
201 void print_as_utf_8(const char *string);
/* NOTE(review): purpose of these two trees is not visible here - confirm. */
203 GTree *qword,*qperiod;
/*
 * set_charset:
 * @name: character set name, "auto", or NULL.
 * @err: return location for a GError, or NULL.
 *
 * Select the character set that the e-text is validated against.
 * Any previously opened validator is closed first. NULL or "auto" leaves
 * the validator closed. Otherwise the name is stored in charset; every
 * UNICODE alias is collapsed to "UTF-8", which needs no validator. For any
 * other name an iconv descriptor converting from UTF-8 to that set is
 * opened, and a G_CONVERT_ERROR_NO_CONVERSION error is set if that fails.
 *
 * Charset names are ASCII identifiers, so they are compared with the
 * locale-independent g_ascii_strcasecmp() rather than the deprecated,
 * locale-sensitive g_strcasecmp().
 *
 * Callers treat a false return as failure (see parse_options()).
 */
209 gboolean set_charset(const char *name,GError **err)
211 /* The various UNICODE encodings all share the same character set. */
212 const char *unicode_aliases[]={ "UCS-2", "UCS-2BE", "UCS-2LE", "UCS-4",
213 "UCS-4BE", "UCS-4LE", "UCS2", "UCS4", "UNICODE", "UNICODEBIG",
214 "UNICODELITTLE", "UTF-7", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE",
215 "UTF-32", "UTF-32BE", "UTF-32LE", "UTF7", "UTF8", "UTF16", "UTF16BE",
216 "UTF16LE", "UTF32", "UTF32BE", "UTF32LE" };
/* Drop any validator left over from an earlier call. */
220 if (charset_validator!=(GIConv)-1)
221 g_iconv_close(charset_validator);
222 if (!name || !g_ascii_strcasecmp(name,"auto"))
225 charset_validator=(GIConv)-1;
229 charset=g_strdup(name);
230 for(i=0;i<G_N_ELEMENTS(unicode_aliases);i++)
231 if (!g_ascii_strcasecmp(charset,unicode_aliases[i]))
234 charset=g_strdup("UTF-8");
/* Text is held as UTF-8 internally, so UTF-8 itself needs no converter. */
237 if (!strcmp(charset,"UTF-8"))
238 charset_validator=(GIConv)-1;
/* g_iconv_open() takes the target set first: UTF-8 -> charset. */
241 charset_validator=g_iconv_open(charset,"UTF-8");
242 if (charset_validator==(GIConv)-1)
244 g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_NO_CONVERSION,
245 "Unknown character set \"%s\"",charset);
/*
 * parse_options:
 * @argc, @argv: pointers to main()'s arguments, passed on to the parser.
 *
 * Parse the command line into pswit[] and opt_charset using the options[]
 * table, then normalise the switches: the negative flags (relaxed,
 * line-end, noecho) are inverted, overview mode turns echoing off, and
 * --web forces a fixed set of switch values. A parse failure or an
 * unusable --charset name is reported through g_printerr().
 */
252 void parse_options(int *argc,char ***argv)
255 GOptionContext *context;
256 context=g_option_context_new(
257 "file - looks for errors in Project Gutenberg(TM) etexts");
258 g_option_context_add_main_entries(context,options,NULL);
259 if (!g_option_context_parse(context,argc,argv,&err))
261 g_printerr("Bookloupe: %s\n",err->message);
262 g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
265 /* Paranoid checking is turned OFF, not on, by its switch */
266 pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
267 if (pswit[PARANOID_SWITCH])
268 /* if running in paranoid mode, typo checks default to enabled */
/* Consequently --typo switches the typo checks off in paranoid mode. */
269 pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
270 /* Line-end checking is turned OFF, not on, by its switch */
271 pswit[LINE_END_SWITCH]=!pswit[LINE_END_SWITCH];
272 /* Echoing is turned OFF, not on, by its switch */
273 pswit[ECHO_SWITCH]=!pswit[ECHO_SWITCH];
274 if (pswit[OVERVIEW_SWITCH])
275 /* just print summary; don't echo */
276 pswit[ECHO_SWITCH]=FALSE;
278 * Web uploads - for the moment, this is really just a placeholder
279 * until we decide what processing we really want to do on web uploads
281 if (pswit[WEB_SWITCH])
283 /* specific override for web uploads */
284 pswit[ECHO_SWITCH]=TRUE;
285 pswit[SQUOTE_SWITCH]=FALSE;
286 pswit[TYPO_SWITCH]=TRUE;
287 pswit[QPARA_SWITCH]=FALSE;
288 pswit[PARANOID_SWITCH]=TRUE;
289 pswit[LINE_END_SWITCH]=FALSE;
290 pswit[OVERVIEW_SWITCH]=FALSE;
291 pswit[STDOUT_SWITCH]=FALSE;
292 pswit[HEADER_SWITCH]=TRUE;
293 pswit[VERBOSE_SWITCH]=FALSE;
294 pswit[MARKUP_SWITCH]=FALSE;
295 pswit[USERTYPO_SWITCH]=FALSE;
296 pswit[DP_SWITCH]=FALSE;
/* Apply --charset, if given; an unknown name is reported to the user. */
298 if (opt_charset && !set_charset(opt_charset,&err))
300 g_printerr("%s\n",err->message);
310 g_option_context_free(context);
316 * Read in the user-defined stealth scanno list.
/*
 * read_user_scannos:
 *
 * Load the user's typo list into the usertypo tree. The file is looked for
 * as "bookloupe.typ" in the current directory, then in the running_from
 * directory, then as "gutcheck.typ" in the same two places; each further
 * attempt is made only when the previous one failed with "no such file".
 * The contents are taken as UTF-8 when they validate as such, otherwise
 * they are converted from WINDOWS-1252. One entry is read per line.
 */
318 void read_user_scannos(void)
321 gchar *usertypo_file;
325 gchar *contents,*utf8,**lines;
326 usertypo_file=g_strdup("bookloupe.typ");
327 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
328 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
331 g_free(usertypo_file);
332 usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
333 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
/* Fall back to the legacy gutcheck file name. */
335 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
338 g_free(usertypo_file);
339 usertypo_file=g_strdup("gutcheck.typ");
340 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
342 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
345 g_free(usertypo_file);
346 usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
347 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
/* None of the four candidates exists: carry on without user typos. */
349 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
351 g_free(usertypo_file);
352 g_print(" --> I couldn't find bookloupe.typ "
353 "-- proceeding without user typos.\n");
/* Any other read error is reported together with the file name tried. */
358 fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
359 g_free(usertypo_file);
363 if (g_utf8_validate(contents,len,NULL))
365 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
/* NOTE(review): any condition guarding this call is not visible here. */
367 (void)set_charset("UNICODE",NULL);
370 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
/* Splitting on both CR and LF copes with either line-ending style. */
372 lines=g_strsplit_set(utf8,"\r\n",0);
/* The tree owns its keys (freed with g_free); values are a dummy 1. */
374 usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
375 for (i=0;lines[i];i++)
/* Keep only lines whose first byte is above '!' (skips blank lines). */
376 if (*(unsigned char *)lines[i]>'!')
377 g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
386 * Read an etext returning a newly allocated string containing the file
387 * contents or NULL on error.
/*
 * read_etext:
 * @filename: path of the e-text to load.
 * @err: return location for a GError.
 *
 * Load the file and produce UTF-8 text. Input that validates as UTF-8 is
 * normalised (default compose form) and output is routed through
 * print_as_utf_8(). Anything else is converted from WINDOWS-1252 and output
 * is routed through print_as_windows_1252(). If that conversion hits an
 * illegal sequence, an error naming the offending byte is set; its line and
 * column are worked out by scanning the bytes read so far.
 */
389 gchar *read_etext(const char *filename,GError **err)
391 GError *tmp_err=NULL;
392 gchar *contents,*utf8;
393 gsize len,bytes_read,bytes_written;
395 if (!g_file_get_contents(filename,&contents,&len,err))
397 if (g_utf8_validate(contents,len,NULL))
399 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
400 g_set_print_handler(print_as_utf_8);
/* NOTE(review): presumably Windows-only (guard not visible) - confirm. */
402 SetConsoleOutputCP(CP_UTF8);
407 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,
408 &bytes_written,&tmp_err);
409 if (g_error_matches(tmp_err,G_CONVERT_ERROR,
410 G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
/* Walk the successfully converted prefix to locate the bad byte. */
413 for(i=0;i<bytes_read;i++)
414 if (contents[i]=='\n')
419 else if (contents[i]!='\r')
421 g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
422 "Input conversion failed. Byte %d at line %d, column %d is not a "
423 "valid Windows-1252 character",
424 ((unsigned char *)contents)[bytes_read],line,col);
427 g_propagate_error(err,tmp_err);
428 g_set_print_handler(print_as_windows_1252);
430 SetConsoleOutputCP(1252);
/*
 * cleanup_on_exit:
 *
 * atexit() handler registered by main(). It puts back the console output
 * code page that main() saved in saved_cp.
 */
437 void cleanup_on_exit(void)
440 SetConsoleOutputCP(saved_cp);
/*
 * main:
 *
 * Program entry point. Records the console code page and the directory the
 * program was started from, parses the options, and, in overview mode,
 * prints the per-category query counts and their total. Before returning
 * it releases running_from, the usertypo tree and the charset validator.
 */
444 int main(int argc,char **argv)
447 atexit(cleanup_on_exit);
448 saved_cp=GetConsoleOutputCP();
/* Used by read_user_scannos() as a second place to look for typo files. */
450 running_from=g_path_get_dirname(argv[0]);
451 parse_options(&argc,&argv);
/* NOTE(review): the statement guarded by this test is not visible here. */
452 if (pswit[USERTYPO_SWITCH])
454 fprintf(stderr,"bookloupe: Check and report on an e-text\n");
456 if (pswit[OVERVIEW_SWITCH])
458 g_print(" Checked %ld lines of %ld (head+foot = %ld)\n\n",
459 checked_linecnt,linecnt,linecnt-checked_linecnt);
460 g_print(" --------------- Queries found --------------\n");
462 g_print(" Long lines: %14ld\n",cnt_long);
464 g_print(" Short lines: %14ld\n",cnt_short);
466 g_print(" Line-end problems: %14ld\n",cnt_lineend);
468 g_print(" Common typos: %14ld\n",cnt_word);
470 g_print(" Unmatched quotes: %14ld\n",cnt_quote);
472 g_print(" Unmatched brackets: %14ld\n",cnt_brack);
474 g_print(" Non-ASCII characters: %14ld\n",cnt_bin);
476 g_print(" Proofing characters: %14ld\n",cnt_odd);
478 g_print(" Punctuation & spacing queries: %14ld\n",cnt_punct);
480 g_print(" Non-standard dashes: %14ld\n",cnt_dash);
482 g_print(" Possible HTML tags: %14ld\n",cnt_html);
484 g_print(" TOTAL QUERIES %14ld\n",
485 cnt_quote+cnt_brack+cnt_bin+cnt_odd+cnt_long+cnt_short+cnt_punct+
486 cnt_dash+cnt_word+cnt_html+cnt_lineend);
488 g_free(running_from);
490 g_tree_unref(usertypo);
/* Passing NULL closes any open charset validator. */
491 set_charset(NULL,NULL);
/*
 * count_dashes:
 * @line: the line of text to examine.
 * @dash: the dash string to look for (for example "--").
 * @results: tallies updated according to how the dashes are spaced.
 *
 * Split the line on every occurrence of the dash and, for each occurrence,
 * look at the character immediately before it (pc) and after it (nc) to
 * decide whether the dash is spaced or unspaced.
 *
 * A token is empty when the line starts with the dash or when two dashes
 * are adjacent. Stepping back from the end of an empty token would read
 * the byte before the start of its allocation, so pc is taken as NUL in
 * that case. This mirrors nc, which is already NUL when the line ends
 * with a dash.
 */
495 void count_dashes(const char *line,const char *dash,
496 struct dash_results *results)
501 gboolean spaced=FALSE,unspaced=FALSE,spaced2=FALSE;
504 tokens=g_strsplit(line,dash,0);
507 for(i=1;tokens[i];i++)
509 pc=*tokens[i-1]?g_utf8_get_char(g_utf8_prev_char(tokens[i-1]+strlen(tokens[i-1]))):0;
510 nc=g_utf8_get_char(tokens[i]);
511 if (g_unichar_isspace(pc) || g_unichar_isspace(nc))
513 if (g_unichar_isspace(pc) && g_unichar_isspace(nc))
515 else if (!g_unichar_isspace(pc) && !g_unichar_isspace(nc))
521 /* count of lines with em-dashes with spaces both sides */
522 results->non_PG_space++;
524 /* count of lines with PG-type em-dashes with no spaces */
532 * Run a first pass - verify that it's a valid PG
533 * file, decide whether to report some things that
534 * occur many times in the text like long or short
535 * lines, non-standard dashes, etc.
/*
 * first_pass:
 * @etext: the whole e-text as one UTF-8 string.
 *
 * Walk the text line by line (split on LF, trailing CRs removed) and
 * gather statistics in a first_pass_results structure: header and footer
 * positions, total, non-ASCII and alphabetic character counts, and counts
 * of long, short, spaced-dash, asterisk, slash, ".,", hyphen-at-end and
 * HTML-looking lines, among others.
 *
 * NOTE(review): the results live in a function-local static, so a second
 * call would add to the totals of the first - confirm this is only ever
 * called once per run.
 */
537 struct first_pass_results *first_pass(const char *etext)
539 gunichar laststart=CHAR_SPACE;
544 unsigned int lastlen=0,lastblen=0;
545 long spline=0,nspline=0;
546 static struct first_pass_results results={0};
547 struct dash_results tmp_dash_results;
550 lines=g_strsplit(etext,"\n",0);
551 for (j=0;lines[j];j++)
/* lbytes is the length in bytes, llen the length in characters. */
553 lbytes=strlen(lines[j]);
554 while (lbytes>0 && lines[j][lbytes-1]=='\r')
555 lines[j][--lbytes]='\0';
556 llen=g_utf8_strlen(lines[j],lbytes);
/* Old-style header: the "*END ... SMALL PRINT" line. */
558 if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
559 (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
562 g_print(" --> Duplicate header?\n");
563 spline=linecnt+1; /* first line of non-header text, that is */
/* New-style header: the "*** START ... PROJECT GUTENBERG" line. */
565 if (!strncmp(lines[j],"*** START",9) &&
566 strstr(lines[j],"PROJECT GUTENBERG"))
569 g_print(" --> Duplicate header?\n");
570 nspline=linecnt+1; /* first line of non-header text, that is */
/* Once a header has been seen, start looking for the footer line. */
572 if (spline || nspline)
574 lc_line=g_utf8_strdown(lines[j],lbytes);
575 if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
577 if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
579 if (results.footerline)
581 /* it's an old-form header - we can detect duplicates */
583 g_print(" --> Duplicate footer?\n");
586 results.footerline=linecnt;
592 results.firstline=spline;
594 results.firstline=nspline; /* override with new */
595 if (results.footerline)
596 continue; /* don't count the boilerplate in the footer */
597 results.totlen+=llen;
/* Per-character tallies: non-ASCII, alphabetic, and closing quotes. */
598 for (s=lines[j];*s;s=g_utf8_next_char(s))
600 if (g_utf8_get_char(s)>127)
602 if (g_unichar_isalpha(g_utf8_get_char(s)))
606 if (CHAR_IS_DQUOTE(g_utf8_get_char(s)))
607 qc=QUOTE_CLASS(g_utf8_get_char(s));
610 if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) &&
611 g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
612 results.endquote_count++;
/* Same short-line test as check_for_short_line(), with a threshold of 2. */
615 if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
616 lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
619 g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
621 if (strstr(lines[j],".,"))
623 /* only count asterisk lines for ignoring purposes where there is */
624 /* lowercase text on the line */
625 if (strchr(lines[j],'*'))
627 for (s=lines[j];*s;s=g_utf8_next_char(s))
628 if (g_unichar_islower(g_utf8_get_char(s)))
633 if (strchr(lines[j],'/'))
634 results.fslashline++;
/* Skip back over trailing white space, then look for a single hyphen. */
637 for (s=g_utf8_prev_char(lines[j]+lbytes);
638 s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;
639 s=g_utf8_prev_char(s))
641 if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
642 g_utf8_get_char(g_utf8_prev_char(s))!='-')
645 if (llen>LONGEST_PG_LINE)
647 if (llen>WAY_TOO_LONG)
648 results.verylongline++;
/* A '<' ... '>' pair on one line is counted as a possible HTML tag. */
649 if (strchr(lines[j],'<') && strchr(lines[j],'>'))
651 i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
654 if (strstr(lines[j],"<i>"))
655 results.htmcount+=4; /* bonus marks! */
657 /* Check for spaced em-dashes */
/* Each tally rises by at most one per line, however many dashes it has. */
658 memset(&tmp_dash_results,0,sizeof(tmp_dash_results));
659 count_dashes(lines[j],"--",&tmp_dash_results);
660 count_dashes(lines[j],"—",&tmp_dash_results);
661 if (tmp_dash_results.base)
662 results.emdash.base++;
663 if (tmp_dash_results.non_PG_space)
664 results.emdash.non_PG_space++;
665 if (tmp_dash_results.PG_space)
666 results.emdash.PG_space++;
/* Marker words used later to guess Dutch or French text. */
670 if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))
671 results.Dutchcount++;
672 if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))
673 results.Frenchcount++;
674 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
675 results.standalone_digit++;
678 /* Check for spaced dashes */
679 if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
/*
 * NOTE(review): this stores the first byte of the line, not its first
 * code point. That is enough for the CHAR_SPACE comparison above.
 */
683 laststart=lines[j][0];
692 * Make some snap decisions based on the first pass results.
/*
 * report_first_pass:
 * @results: the statistics gathered by first_pass().
 *
 * Turn the first-pass statistics into decisions about which kinds of query
 * to report, printing a " --> " note for each class that is switched off
 * because it occurs too often. It may also turn on markup mode when the
 * text looks like HTML, and it clears results->firstline when header and
 * footer are found fewer than 100 lines apart.
 * The decisions are kept in a function-local static warnings structure.
 */
694 struct warnings *report_first_pass(struct first_pass_results *results)
696 static struct warnings warnings={0};
698 g_print(" --> %ld lines in this file have white space at end\n",
701 if (results->dotcomma>5)
704 g_print(" --> %ld lines in this file contain '.,'. "
705 "Not reporting them.\n",results->dotcomma);
708 * If more than 50 lines, or one-tenth, are short,
709 * don't bother reporting them.
711 warnings.shortline=1;
712 if (results->shortline>50 || results->shortline*10>linecnt)
714 warnings.shortline=0;
715 g_print(" --> %ld lines in this file are short. "
716 "Not reporting short lines.\n",results->shortline);
719 * If more than 50 lines, or one-tenth, are long,
720 * don't bother reporting them.
723 if (results->longline>50 || results->longline*10>linecnt)
726 g_print(" --> %ld lines in this file are long. "
727 "Not reporting long lines.\n",results->longline);
729 /* If more than 10 lines contain asterisks, don't bother reporting them. */
731 if (results->astline>10)
734 g_print(" --> %ld lines in this file contain asterisks. "
735 "Not reporting them.\n",results->astline);
738 * If more than 10 lines contain forward slashes,
739 * don't bother reporting them.
742 if (results->fslashline>10)
745 g_print(" --> %ld lines in this file contain forward slashes. "
746 "Not reporting them.\n",results->fslashline);
749 * If more than 20 lines contain unpunctuated endquotes,
750 * don't bother reporting them.
753 if (results->endquote_count>20)
756 g_print(" --> %ld lines in this file contain unpunctuated endquotes. "
757 "Not reporting them.\n",results->endquote_count);
760 * If more than 15 lines contain standalone digits,
761 * don't bother reporting them.
/* NOTE(review): the test below uses a threshold of 10, not the stated 15. */
764 if (results->standalone_digit>10)
767 g_print(" --> %ld lines in this file contain standalone 0s and 1s. "
768 "Not reporting them.\n",results->standalone_digit);
771 * If more than 20 lines contain hyphens at end,
772 * don't bother reporting them.
775 if (results->hyphens>20)
778 g_print(" --> %ld lines in this file have hyphens at end. "
779 "Not reporting them.\n",results->hyphens);
/* Enough tag-like lines: behave as if --markup had been given. */
781 if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
783 g_print(" --> Looks like this is HTML. Switching HTML mode ON.\n");
784 pswit[MARKUP_SWITCH]=1;
786 if (results->verylongline>0)
787 g_print(" --> %ld lines in this file are VERY long!\n",
788 results->verylongline);
790 * If there are more non-PG spaced dashes than PG em-dashes,
791 * assume it's deliberate.
792 * Current PG guidelines say don't use them, but older texts do,
793 * and some people insist on them whatever the guidelines say.
796 if (results->spacedash+results->emdash.non_PG_space>
797 results->emdash.PG_space)
800 g_print(" --> There are %ld spaced dashes and em-dashes. "
801 "Not reporting them.\n",
802 results->spacedash+results->emdash.non_PG_space);
808 /* Charset ISO_8859-1/ASCII checks for compatibility with gutcheck */
810 /* If more than a quarter of characters are hi-bit, bug out. */
811 if (results->binlen*4>results->totlen)
813 g_print(" --> This file does not appear to be ASCII. "
814 "Terminating. Best of luck with it!\n");
/* Fewer than a quarter of the characters are letters: not a text file. */
817 if (results->alphalen*4<results->totlen)
819 g_print(" --> This file does not appear to be text. "
820 "Terminating. Best of luck with it!\n");
/* More than 1% (or more than 100) hi-bit characters: stop reporting them. */
823 if (results->binlen*100>results->totlen || results->binlen>100)
825 g_print(" --> There are a lot of foreign letters here. "
826 "Not reporting them.\n");
827 if (!pswit[VERBOSE_SWITCH])
831 warnings.isDutch=FALSE;
832 if (results->Dutchcount>50)
834 warnings.isDutch=TRUE;
835 g_print(" --> This looks like Dutch - "
836 "switching off dashes and warnings for 's Middags case.\n");
838 warnings.isFrench=FALSE;
839 if (results->Frenchcount>50)
841 warnings.isFrench=TRUE;
842 g_print(" --> This looks like French - "
843 "switching off some doublepunct.\n");
845 if (results->firstline && results->footerline)
846 g_print(" The PG header and footer appear to be already on.\n");
849 if (results->firstline)
850 g_print(" The PG header is on - no footer.\n");
851 if (results->footerline)
852 g_print(" The PG footer is on - no header.\n");
/* Verbose mode switches reporting back on regardless of the above. */
855 if (pswit[VERBOSE_SWITCH])
857 warnings.shortline=1;
866 g_print(" *** Verbose output is ON -- you asked for it! ***\n");
868 if (warnings.isDutch)
/* Header and footer under 100 lines apart: do not trust the header. */
870 if (results->footerline>0 && results->firstline>0 &&
871 results->footerline>results->firstline &&
872 results->footerline-results->firstline<100)
874 g_print(" --> I don't really know where this text starts. \n");
875 g_print(" There are no reference points.\n");
876 g_print(" I'm going to have to report the header and footer "
878 results->firstline=0;
886 * Look along the line, accumulate the count of quotes, and see
887 * if this is an empty line - i.e. a line with nothing on it
889 * If line has just spaces, period, * and/or - on it, don't
890 * count it, since empty lines with asterisks or dashes to
891 * separate sections are common.
893 * Returns: TRUE if the line is empty.
/*
 * analyse_quotes:
 * @aline: the line to examine (UTF-8).
 * @counters: running quote, underscore and bracket counts to update.
 *
 * Walk the line one character at a time. Double quotes are always counted.
 * Single quotes are counted only when the squote switch is set, and then
 * only when the context suggests a quotation mark rather than an
 * apostrophe. Underscores and brackets are tallied as well, with a leading
 * "[Illustration:" tracked under its own counter.
 * Any error raised while counting a quote is printed against the current
 * line and column and then cleared.
 *
 * The punctuation test uses g_utf8_strchr(): plain strchr() converts its
 * argument to char, so code points such as U+012E or U+3000 were wrongly
 * taken for '.' or for the string terminator.
 *
 * Returns: TRUE if the line is empty.
 */
895 gboolean analyse_quotes(const char *aline,struct counters *counters)
898 /* assume the line is empty until proven otherwise */
899 gboolean isemptyline=TRUE;
900 const char *s=aline,*sprev,*snext;
903 GError *tmp_err=NULL;
906 snext=g_utf8_next_char(s);
907 c=g_utf8_get_char(s);
908 if (CHAR_IS_DQUOTE(c))
909 (void)count_quote(counters,c,QUOTE_CLASS(c),&tmp_err);
910 else if (CHAR_IS_SQUOTE(c) && pswit[SQUOTE_SWITCH])
915 * At start of line, it can only be a quotation mark.
916 * Hardcode a very common exception!
918 if (!g_str_has_prefix(snext,"tis") &&
919 !g_str_has_prefix(snext,"Tis"))
920 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
922 else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
923 g_unichar_isalpha(g_utf8_get_char(snext)))
924 /* Do nothing! it's definitely an apostrophe, not a quote */
926 /* it's outside a word - let's check it out */
927 else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||
928 g_unichar_isalpha(g_utf8_get_char(snext)))
930 /* certainly looks like a quotation mark */
931 if (!g_str_has_prefix(snext,"tis") &&
932 !g_str_has_prefix(snext,"Tis"))
933 /* hardcode a very common exception! */
/* Compare whole code points; see the header comment. */
935 if (g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(sprev)))
936 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
938 (void)count_quote(counters,c,OPENING_QUOTE,&tmp_err);
943 /* now - is it a quotation mark? */
944 guessquote=0; /* accumulate clues */
945 if (g_unichar_isalpha(g_utf8_get_char(sprev)))
947 /* it follows a letter - could be either */
949 if (g_utf8_get_char(sprev)=='s')
951 /* looks like a plural apostrophe */
953 if (g_utf8_get_char(snext)==CHAR_SPACE)
957 if (innermost_quote_matches(counters,c))
959 * Give it the benefit of some doubt,
960 * if a squote is already open.
966 (void)count_quote(counters,c,CLOSING_QUOTE,&tmp_err);
969 /* no adjacent letter - it must be a quote of some kind */
970 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
/* Report a counting error against this line, then clear it. */
975 if (pswit[ECHO_SWITCH])
976 g_print("\n%s\n",aline);
977 if (!pswit[OVERVIEW_SWITCH])
978 g_print(" Line %ld column %ld - %s\n",
979 linecnt,g_utf8_pointer_to_offset(aline,s)+1,tmp_err->message);
980 g_clear_error(&tmp_err);
982 if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
984 isemptyline=FALSE; /* ignore lines like * * * as spacers */
985 if (c==CHAR_UNDERSCORE)
986 counters->c_unders++;
987 if (c==CHAR_OPEN_SBRACK)
/* "[Illustration:" at the start of a line gets its own counter. */
989 if (!matching_difference(counters,COUNTER_ILLUSTRATION) &&
990 !matching_difference(counters,c) && s==aline &&
991 g_str_has_prefix(s,"[Illustration:"))
992 increment_matching(counters,COUNTER_ILLUSTRATION,TRUE);
994 increment_matching(counters,c,TRUE);
996 else if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK)
997 increment_matching(counters,c,TRUE);
998 if (c==CHAR_CLOSE_SBRACK)
1000 if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) &&
1001 !matching_difference(counters,c) && !*snext)
1002 increment_matching(counters,COUNTER_ILLUSTRATION,FALSE);
1004 increment_matching(counters,c,FALSE);
1006 else if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK)
1007 increment_matching(counters,c,FALSE);
1015 * check_for_control_characters:
1017 * Check for invalid or questionable characters in the line
1018 * Anything above 127 is invalid for plain ASCII, and
1019 * non-printable control characters should also be flagged.
1020 * Tabs should generally not be there.
/*
 * Report every control character in the line other than LF, CR and TAB.
 * The queried line is echoed when echoing is on, and the position is
 * printed unless overview mode is on.
 *
 * The column is the character offset of s from the start of aline, plus
 * one. g_utf8_pointer_to_offset() takes the string start first; the
 * arguments used to be swapped, which gave a zero or negative column.
 */
1022 void check_for_control_characters(const char *aline)
1026 for (s=aline;*s;s=g_utf8_next_char(s))
1028 c=g_utf8_get_char(s);
1029 if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
1031 if (pswit[ECHO_SWITCH])
1032 g_print("\n%s\n",aline);
1033 if (!pswit[OVERVIEW_SWITCH])
1034 g_print(" Line %ld column %ld - Control character %u\n",
1035 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1043 * check_for_odd_characters:
1045 * Check for binary and other odd characters.
/*
 * check_for_odd_characters:
 * @aline: the line to examine (UTF-8).
 * @warnings: decisions from the first pass; the bin, fslash and ast
 *            members enable the corresponding reports.
 * @isemptyline: TRUE if the line is a spacer; suppresses asterisk queries.
 *
 * Report characters that are outside ASCII/ISO-8859, not valid in the
 * selected charset (unassigned or Private Use code points when no
 * validator is open, unconvertible characters otherwise), and tabs,
 * tildes, carats, forward slashes and asterisks.
 *
 * The Private Use test covers U+E000..U+F8FF, U+F0000..U+FFFFD and
 * U+100000..U+10FFFD. The last lower bound used to be written as decimal
 * 100000, which made every assigned character from U+186A0 upwards
 * (emoji, for instance) look like a Private Use character.
 */
1047 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
1048 gboolean isemptyline)
1050 /* Don't repeat multiple warnings on one line. */
1051 gboolean eInvalidChar=FALSE,eTab=FALSE,eTilde=FALSE;
1052 gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
1057 for (s=aline;*s;s=g_utf8_next_char(s))
1059 c=g_utf8_get_char(s);
/* Legacy check: control characters and anything above 127. */
1060 if (warnings->bin && !eInvalidChar &&
1061 (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
1063 if (pswit[ECHO_SWITCH])
1064 g_print("\n%s\n",aline);
1065 if (!pswit[OVERVIEW_SWITCH])
1066 if (c>127 && c<160 || c>255)
1067 g_print(" Line %ld column %ld - "
1068 "Non-ISO-8859 character %u\n",
1069 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1071 g_print(" Line %ld column %ld - "
1072 "Non-ASCII character %u\n",
1073 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
/* Explicit charset: validate against UNICODE or through iconv. */
1078 if (!eInvalidChar && charset)
1080 if (charset_validator==(GIConv)-1)
1082 if (!g_unichar_isdefined(c))
1084 if (pswit[ECHO_SWITCH])
1085 g_print("\n%s\n",aline);
1086 if (!pswit[OVERVIEW_SWITCH])
1087 g_print(" Line %ld column %ld - Unassigned UNICODE "
1088 "code point U+%04" G_GINT32_MODIFIER "X\n",
1089 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1094 else if (c>=0xE000 && c<=0xF8FF || c>=0xF0000 && c<=0xFFFFD ||
1095 c>=0x100000 && c<=0x10FFFD)
1097 if (pswit[ECHO_SWITCH])
1098 g_print("\n%s\n",aline);
1099 if (!pswit[OVERVIEW_SWITCH])
1100 g_print(" Line %ld column %ld - Private Use "
1101 "character U+%04" G_GINT32_MODIFIER "X\n",
1102 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
/* Try converting this single character into the target charset. */
1110 t=g_convert_with_iconv(s,g_utf8_next_char(s)-s,
1111 charset_validator,NULL,&nb,NULL);
1116 if (pswit[ECHO_SWITCH])
1117 g_print("\n%s\n",aline);
1118 if (!pswit[OVERVIEW_SWITCH])
1119 g_print(" Line %ld column %ld - Non-%s "
1120 "character %u\n",linecnt,
1121 g_utf8_pointer_to_offset(aline,s)+1,charset,c);
1128 if (!eTab && c==CHAR_TAB)
1130 if (pswit[ECHO_SWITCH])
1131 g_print("\n%s\n",aline);
1132 if (!pswit[OVERVIEW_SWITCH])
1133 g_print(" Line %ld column %ld - Tab character?\n",
1134 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1139 if (!eTilde && c==CHAR_TILDE)
1142 * Often used by OCR software to indicate an
1143 * unrecognizable character.
1145 if (pswit[ECHO_SWITCH])
1146 g_print("\n%s\n",aline);
1147 if (!pswit[OVERVIEW_SWITCH])
1148 g_print(" Line %ld column %ld - Tilde character?\n",
1149 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1154 if (!eCarat && c==CHAR_CARAT)
1156 if (pswit[ECHO_SWITCH])
1157 g_print("\n%s\n",aline);
1158 if (!pswit[OVERVIEW_SWITCH])
1159 g_print(" Line %ld column %ld - Carat character?\n",
1160 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1165 if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
1167 if (pswit[ECHO_SWITCH])
1168 g_print("\n%s\n",aline);
1169 if (!pswit[OVERVIEW_SWITCH])
1170 g_print(" Line %ld column %ld - Forward slash?\n",
1171 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1177 * Report asterisks only in paranoid mode,
1178 * since they're often deliberate.
1180 if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
1183 if (pswit[ECHO_SWITCH])
1184 g_print("\n%s\n",aline);
1185 if (!pswit[OVERVIEW_SWITCH])
1186 g_print(" Line %ld column %ld - Asterisk?\n",
1187 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1196 * check_for_long_line:
1198 * Check for line too long.
/*
 * Warn when aline is longer than LONGEST_PG_LINE.  Length is counted in
 * UTF-8 characters (g_utf8_strlen), not bytes.  The line is echoed when
 * ECHO_SWITCH is set and the warning is printed unless OVERVIEW_SWITCH
 * is set.  The character count is printed twice: once as the column and
 * once as the reported length.
 */
1200 void check_for_long_line(const char *aline)
1202 if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
1204 if (pswit[ECHO_SWITCH])
1205 g_print("\n%s\n",aline);
1206 if (!pswit[OVERVIEW_SWITCH])
1207 g_print(" Line %ld column %ld - Long line %ld\n",
1208 linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
1215 * check_for_short_line:
1217 * Check for line too short.
1219 * This one is a bit trickier to implement: we don't want to
1220 * flag the last line of a paragraph for being short, so we
1221 * have to wait until we know that our current line is a
1222 * "normal" line, then report the _previous_ line if it was too
1223 * short. We also don't want to report indented lines like
1224 * chapter heads or formatted quotations. We therefore keep
1225 * last->len as the length of the last line examined, and
1226 * last->blen as the length of the last but one, and try to
1227 * suppress unnecessary warnings by checking that both were of
1228 * "normal" length. We keep the first character of the last
1229 * line in last->start, and if it was a space, we assume that
1230 * the formatting is deliberate. I can't figure out a way to
1231 * distinguish something like a quoted verse left-aligned or
1232 * the header or footer of a letter from a paragraph of short
1233 * lines - maybe if I examined the whole paragraph, and if the
1234 * para has less than, say, 8 lines and if all lines are short,
1235 * then just assume it's OK? Need to look at some texts to see
1236 * how often a formula like this would get the right result.
/*
 * Report the PREVIOUS line (prevline, numbered linecnt-1) as short.
 *
 * aline: the current line; it only needs to be longer than one character.
 * last:  properties of earlier lines.  The warning fires when last->len
 *        is greater than 1 but below SHORTEST_PG_LINE, last->blen is
 *        above SHORTEST_PG_LINE, and last->start is not a space.
 *
 * The length of prevline in characters is printed both as the column and
 * as the reported length.
 */
1238 void check_for_short_line(const char *aline,const struct line_properties *last)
1240 if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
1241 last->len<SHORTEST_PG_LINE && last->blen>1 &&
1242 last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
1244 if (pswit[ECHO_SWITCH])
1245 g_print("\n%s\n",prevline);
1246 if (!pswit[OVERVIEW_SWITCH])
1247 g_print(" Line %ld column %ld - Short line %ld?\n",
1248 linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
1255 * check_for_starting_punctuation:
1257 * Look for punctuation other than full ellipses at start of line.
/*
 * Flag a non-empty line whose first character is one of . ? ! , ; :
 * unless the line begins with the ellipsis prefix ". . .".
 * The reported column is always 1.
 */
1259 void check_for_starting_punctuation(const char *aline)
1261 if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
1262 !g_str_has_prefix(aline,". . ."))
1264 if (pswit[ECHO_SWITCH])
1265 g_print("\n%s\n",aline);
1266 if (!pswit[OVERVIEW_SWITCH])
1267 g_print(" Line %ld column 1 - Begins with punctuation?\n",
1277 * Find the first em-dash, return a pointer to it and set <next> to the
1278 * character following the dash.
/*
 * s:    string to search.
 * next: out-parameter; on a match it is set to the position just after
 *       the dash.
 *
 * Only the assignments to *next are visible in this listing.  Two of them
 * step one character past s2 and two step two characters past s1.
 * NOTE(review): presumably s2 marks a single em-dash character and s1 a
 * two-character "--" sequence - confirm against the full body.
 */
1280 char *str_emdash(const char *s,const char **next)
1288 *next=g_utf8_next_char(s2);
1293 *next=g_utf8_next_char(g_utf8_next_char(s1));
1298 *next=g_utf8_next_char(g_utf8_next_char(s1));
1303 *next=g_utf8_next_char(s2);
1309 * check_for_spaced_emdash:
1311 * Check for spaced em-dashes.
1313 * We must check _all_ occurrences of em-dashes on the line
1314 * hence the loop - even if the first dash is OK
1315 * there may be another that's wrong later on.
/*
 * Walk every em-dash that str_emdash() finds in aline, resuming each
 * search at the position it returns in next.  A dash is reported when the
 * character before it is a space (only tested if the dash is not at the
 * start of the line) or the character right after it is a space.  The
 * column is the dash's 1-based character offset.
 */
1317 void check_for_spaced_emdash(const char *aline)
1319 const char *s,*t,*next;
1320 for (s=aline;t=str_emdash(s,&next);s=next)
1322 if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
1323 g_utf8_get_char(next)==CHAR_SPACE)
1325 if (pswit[ECHO_SWITCH])
1326 g_print("\n%s\n",aline);
1327 if (!pswit[OVERVIEW_SWITCH])
1328 g_print(" Line %ld column %ld - Spaced em-dash?\n",
1329 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1337 * check_for_spaced_dash:
1339 * Check for spaced dashes.
/*
 * Look for a single dash with a space next to it.
 *
 * Each pattern is searched with one strstr() call, so only the first
 * occurrence of " -" (or, on the else branch, of "- ") is examined.
 * A match is not reported when the dash is part of "--": for " -" the
 * character after the dash must not be '-', and for "- " the character
 * before the dash must not be '-' (a dash at the start of the line is
 * always reported).
 */
1341 void check_for_spaced_dash(const char *aline)
1344 if ((s=strstr(aline," -")))
1346 if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
1348 if (pswit[ECHO_SWITCH])
1349 g_print("\n%s\n",aline);
1350 if (!pswit[OVERVIEW_SWITCH])
1351 g_print(" Line %ld column %ld - Spaced dash?\n",
1352 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1357 else if ((s=strstr(aline,"- ")))
1359 if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
1361 if (pswit[ECHO_SWITCH])
1362 g_print("\n%s\n",aline);
1363 if (!pswit[OVERVIEW_SWITCH])
1364 g_print(" Line %ld column %ld - Spaced dash?\n",
1365 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1373 * check_for_unmarked_paragraphs:
1375 * Check for unmarked paragraphs indicated by separate speakers.
1377 * May well be false positive:
1378 * "Bravo!" "Wonderful!" called the crowd.
1379 * but useful all the same.
/*
 * Search aline for a double quote, whitespace, and another double quote,
 * which suggests two speakers on one line.  s holds the match and the
 * reported column is its 1-based character offset.
 *
 * NOTE(review): the two strstr() patterns below look identical in this
 * listing - confirm whether they differ in the amount of whitespace
 * between the quotes.
 */
1381 void check_for_unmarked_paragraphs(const char *aline)
1384 s=strstr(aline,"\" \"");
1386 s=strstr(aline,"\" \"");
1389 if (pswit[ECHO_SWITCH])
1390 g_print("\n%s\n",aline);
1391 if (!pswit[OVERVIEW_SWITCH])
1392 g_print(" Line %ld column %ld - "
1393 "Query missing paragraph break?\n",
1394 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1401 * check_for_jeebies:
1403 * Check for "to he" and other easy h/b errors.
1405 * This is a very inadequate effort on the h/b problem,
1406 * but the phrase "to he" is usually an error, whereas "to
1407 * be" is quite common.
1408 * Similarly, '"Quiet!", be said.' is a "be" for "he" error.
1409 * Note that "to he" is _not_ always an error:
1410 * "Where they went to he couldn't say."
1411 * Another false positive:
1412 * What would "Cinderella" be without the . . .
1413 * and another: "If he wants to he can see for himself."
/*
 * Search aline for fixed phrases that usually come from h/b confusion.
 *
 * There are three groups of literal strstr() patterns, each followed by
 * its own report: he/be, had/bad and hut/but.  In every group s holds a
 * match and the reported column is its 1-based character offset.
 * Matching is done with strstr() directly on aline, so it is
 * case-sensitive.
 *
 * NOTE(review): the two "\" be " patterns look identical in this listing;
 * confirm whether they differ in whitespace.
 */
1415 void check_for_jeebies(const char *aline)
/* Group 1: "be" where "he" was meant, and "he" where "be" was meant. */
1418 s=strstr(aline," be could ");
1420 s=strstr(aline," be would ");
1422 s=strstr(aline," was be ");
1424 s=strstr(aline," be is ");
1426 s=strstr(aline," is be ");
1428 s=strstr(aline,"\", be ");
1430 s=strstr(aline,"\" be ");
1432 s=strstr(aline,"\" be ");
1434 s=strstr(aline," to he ");
1437 if (pswit[ECHO_SWITCH])
1438 g_print("\n%s\n",aline);
1439 if (!pswit[OVERVIEW_SWITCH])
1440 g_print(" Line %ld column %ld - Query he/be error?\n",
1441 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Group 2: had/bad confusion. */
1445 s=strstr(aline," the had ");
1447 s=strstr(aline," a had ");
1449 s=strstr(aline," they bad ");
1451 s=strstr(aline," she bad ");
1453 s=strstr(aline," he bad ");
1455 s=strstr(aline," you bad ");
1457 s=strstr(aline," i bad ");
1460 if (pswit[ECHO_SWITCH])
1461 g_print("\n%s\n",aline);
1462 if (!pswit[OVERVIEW_SWITCH])
1463 g_print(" Line %ld column %ld - Query had/bad error?\n",
1464 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Group 3: "hut" after a semicolon or comma, where "but" is likely. */
1468 s=strstr(aline,"; hut ");
1470 s=strstr(aline,", hut ");
1473 if (pswit[ECHO_SWITCH])
1474 g_print("\n%s\n",aline);
1475 if (!pswit[OVERVIEW_SWITCH])
1476 g_print(" Line %ld column %ld - Query hut/but error?\n",
1477 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1484 * check_for_mta_from:
1486 * Special case - angled bracket in front of "From" placed there by an
1487 * MTA when sending an e-mail.
/*
 * Search aline for the literal text ">From" and query it.  The reported
 * column is the 1-based character offset of the '>'.
 */
1489 void check_for_mta_from(const char *aline)
1492 s=strstr(aline,">From");
1495 if (pswit[ECHO_SWITCH])
1496 g_print("\n%s\n",aline);
1497 if (!pswit[OVERVIEW_SWITCH])
1498 g_print(" Line %ld column %ld - "
1499 "Query angled bracket with From\n",
1500 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1507 * check_for_orphan_character:
1509 * Check for a single character line -
1510 * often an overflow from bad wrapping.
/*
 * Query a line that consists of exactly one character: c is non-zero and
 * the character after it is the terminator.  The letters I, V, X and L
 * and any Unicode digit are accepted silently as numerals.
 */
1512 void check_for_orphan_character(const char *aline)
1515 c=g_utf8_get_char(aline);
1516 if (c && !*g_utf8_next_char(aline))
1518 if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
1519 ; /* Nothing - ignore numerals alone on a line. */
1522 if (pswit[ECHO_SWITCH])
1523 g_print("\n%s\n",aline);
1524 if (!pswit[OVERVIEW_SWITCH])
1525 g_print(" Line %ld column 1 - Query single character line\n",
1534 * check_for_pling_scanno:
1536 * Check for I" - often should be !
/*
 * Search aline for a space, a capital I and a double quote (" I\""),
 * which is often a mis-scanned exclamation mark before a closing quote.
 *
 * NOTE(review): s points at the leading space, and the column printed is
 * its character offset WITHOUT the +1 that most other checks add -
 * confirm which character the column is meant to identify.
 */
1538 void check_for_pling_scanno(const char *aline)
1541 s=strstr(aline," I\"");
1544 if (pswit[ECHO_SWITCH])
1545 g_print("\n%s\n",aline);
1546 if (!pswit[OVERVIEW_SWITCH])
1547 g_print(" Line %ld column %ld - Query I=exclamation mark?\n",
1548 linecnt,g_utf8_pointer_to_offset(aline,s));
1555 * check_for_extra_period:
1557 * Check for period without a capital letter. Cut-down from gutspell.
1558 * Only works when it happens on a single line.
/*
 * Query a period that is followed by a word starting in lower case.
 *
 * aline:    the line to examine.
 * warnings: only warnings->isDutch is read here, to skip the Dutch
 *           "'s Middags" pattern.
 *
 * The loop visits every ". " in aline.  A period not preceded by a letter
 * is skipped.  Otherwise the first letter or digit after the period is
 * located; if it is lower case, the word before the period is copied into
 * testword and put through several tests (abbrev[] list, leading digit,
 * single character, isroman(), and a vowel test on the canonical
 * decomposition of each character).  A surviving word is recorded in the
 * qperiod tree and reported.  Words already in qperiod are reported again
 * only when VERBOSE_SWITCH is set.
 *
 * NOTE(review): the statement controlled by the PARANOID_SWITCH test is
 * not visible in this listing - confirm how paranoid mode affects this
 * check.
 */
1560 void check_for_extra_period(const char *aline,const struct warnings *warnings)
1562 const char *s,*t,*s1,*sprev;
1567 gunichar c,nc,pc,*decomposition;
1568 if (pswit[PARANOID_SWITCH])
1570 for (t=aline;t=strstr(t,". ");)
1574 t=g_utf8_next_char(t);
1575 /* start of line punctuation is handled elsewhere */
1578 if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
1580 t=g_utf8_next_char(t);
1583 if (warnings->isDutch)
1585 /* For Frank & Jeroen -- 's Middags case */
1586 gunichar c2,c3,c4,c5;
1587 c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
1588 c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
1589 c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
1590 c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
1591 if (CHAR_IS_APOSTROPHE(c2) &&
1592 g_unichar_islower(c3) && c4==CHAR_SPACE &&
1593 g_unichar_isupper(c5))
1595 t=g_utf8_next_char(t);
/* Skip the period and the space, then any non-alphanumeric characters. */
1599 s1=g_utf8_next_char(g_utf8_next_char(t));
1600 while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
1601 !g_unichar_isdigit(g_utf8_get_char(s1)))
1602 s1=g_utf8_next_char(s1);
1603 if (g_unichar_islower(g_utf8_get_char(s1)))
1605 /* we have something to investigate */
1607 /* so let's go back and find out */
1608 nc=g_utf8_get_char(t);
1609 s1=g_utf8_prev_char(t);
1610 c=g_utf8_get_char(s1);
1611 sprev=g_utf8_prev_char(s1);
1612 pc=g_utf8_get_char(sprev);
1614 (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||
1615 g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&
1616 g_unichar_isalpha(nc)))
1621 sprev=g_utf8_prev_char(s1);
1622 pc=g_utf8_get_char(sprev);
1624 s1=g_utf8_next_char(s1);
1627 testword=g_strndup(s1,s-s1);
1629 testword=g_strdup(s1);
1630 for (i=0;*abbrev[i];i++)
1631 if (!strcmp(testword,abbrev[i]))
1633 if (g_unichar_isdigit(g_utf8_get_char(testword)))
1635 if (!*g_utf8_next_char(testword))
1637 if (isroman(testword))
/* Look for a vowel; decomposing first strips accents from the base letter. */
1642 for (s=testword;*s;s=g_utf8_next_char(s))
1644 decomposition=g_unicode_canonical_decomposition(
1645 g_utf8_get_char(s),&len);
1646 if (g_utf8_strchr("aeiou",-1,decomposition[0]))
1648 g_free(decomposition);
1652 (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
1654 g_tree_insert(qperiod,g_strdup(testword),
1655 GINT_TO_POINTER(1));
1656 if (pswit[ECHO_SWITCH])
1657 g_print("\n%s\n",aline);
1658 if (!pswit[OVERVIEW_SWITCH])
1659 g_print(" Line %ld column %ld - Extra period?\n",
1660 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1666 t=g_utf8_next_char(t);
1672 * check_for_following_punctuation:
1674 * Check for words usually not followed by punctuation.
/*
 * Query punctuation that follows a word which is rarely followed by it.
 * The check is guarded by TYPO_SWITCH.
 *
 * Each word is lower-cased into inword (g_utf8_strdown) and compared with
 * two lists:
 *   nocomma[]  - queried when the character at s is ',' ';' or ':'
 *   noperiod[] - queried when the character at s is '.' or '!'
 * Both lists end with an empty string.  The reported column is the
 * 1-based character offset of s.
 */
1676 void check_for_following_punctuation(const char *aline)
1679 const char *s,*wordstart;
1682 if (pswit[TYPO_SWITCH])
1693 inword=g_utf8_strdown(t,-1);
1695 for (i=0;*nocomma[i];i++)
1696 if (!strcmp(inword,nocomma[i]))
1698 c=g_utf8_get_char(s);
1699 if (c==',' || c==';' || c==':')
1701 if (pswit[ECHO_SWITCH])
1702 g_print("\n%s\n",aline);
1703 if (!pswit[OVERVIEW_SWITCH])
1704 g_print(" Line %ld column %ld - "
1705 "Query punctuation after %s?\n",
1706 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
1712 for (i=0;*noperiod[i];i++)
1713 if (!strcmp(inword,noperiod[i]))
1715 c=g_utf8_get_char(s);
1716 if (c=='.' || c=='!')
1718 if (pswit[ECHO_SWITCH])
1719 g_print("\n%s\n",aline);
1720 if (!pswit[OVERVIEW_SWITCH])
1721 g_print(" Line %ld column %ld - "
1722 "Query punctuation after %s?\n",
1723 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
1737 * Check for commonly mistyped words,
1738 * and digits like 0 for O in a word.
/*
 * Run each word of aline through the typo and scanno tests.
 *
 * aline:    the line to examine; words are taken from it with getaword().
 * warnings: warnings->digit gates the paranoid standalone 0/1 query.
 *
 * Tests visible in this listing, in order:
 *   - mixdigit(): a word mixing digits and letters gives "Query digit".
 *   - With TYPO_SWITCH or USERTYPO_SWITCH: an upper-case or title-case
 *     character after a lower-case one is examined, with exceptions for
 *     Mc/Mac names and a preceding apostrophe.  The word is then
 *     case-folded into testword.
 *   - With TYPO_SWITCH: nostart[] prefixes, noend[] suffixes, unlikely
 *     letter sequences, a vowel/consonant test for words longer than one
 *     character, the okword[] exclusion list, isroman(), the typo[] list
 *     and a set of suspicious single letters.
 *   - Queried words are counted per testword in the qword tree; a word
 *     already in the tree is treated as a duplicate unless VERBOSE_SWITCH
 *     is set.
 *   - A word found in the usertypo tree gives "Query possible scanno".
 *   - With PARANOID_SWITCH and warnings->digit: a word that is exactly
 *     "0" or "1" gives "Query standalone".
 */
1740 void check_for_typos(const char *aline,struct warnings *warnings)
1742 const char *s,*t,*nt,*wordstart;
1744 gunichar *decomposition;
1746 int i,vowel,consonant,*dupcnt;
1747 gboolean isdup,istypo,alower;
1750 gsize decomposition_len;
1754 inword=getaword(&s);
1758 continue; /* don't bother with empty lines */
1760 if (mixdigit(inword))
1762 if (pswit[ECHO_SWITCH])
1763 g_print("\n%s\n",aline);
1764 if (!pswit[OVERVIEW_SWITCH])
1765 g_print(" Line %ld column %ld - Query digit in %s\n",
1766 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
1771 * Put the word through a series of tests for likely typos and OCR
1774 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
1778 for (t=inword;*t;t=g_utf8_next_char(t))
1780 c=g_utf8_get_char(t);
1781 nt=g_utf8_next_char(t);
1782 /* lowercase for testing */
1783 if (g_unichar_islower(c))
1785 if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
1788 * We have an uppercase mid-word. However, there are
1790 * Mac and Mc like McGill
1791 * French contractions like l'Abbe
1793 offset=g_utf8_pointer_to_offset(inword,t);
1795 pc=g_utf8_get_char(g_utf8_prev_char(t));
1798 if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
1799 offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
1800 g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
1801 CHAR_IS_APOSTROPHE(pc))
1807 testword=g_utf8_casefold(inword,-1);
1809 if (pswit[TYPO_SWITCH])
1812 * Check for certain unlikely two-letter combinations at word
1815 len=g_utf8_strlen(testword,-1);
1818 for (i=0;*nostart[i];i++)
1819 if (g_str_has_prefix(testword,nostart[i]))
1821 for (i=0;*noend[i];i++)
1822 if (g_str_has_suffix(testword,noend[i]))
1825 /* ght is common, gbt never. Like that. */
1826 if (strstr(testword,"cb"))
1828 if (strstr(testword,"gbt"))
1830 if (strstr(testword,"pbt"))
1832 if (strstr(testword,"tbs"))
1834 if (strstr(testword,"mrn"))
1836 if (strstr(testword,"ahle"))
1838 if (strstr(testword,"ihle"))
1841 * "TBE" does happen - like HEARTBEAT - but uncommon.
1842 * Also "TBI" - frostbite, outbid - but uncommon.
1843 * Similarly "ii" like Hawaii, or Pompeii, and in Roman
1844 * numerals, but "ii" is a common scanno.
1846 if (strstr(testword,"tbi"))
1848 if (strstr(testword,"tbe"))
1850 if (strstr(testword,"ii"))
1853 * Check for no vowels or no consonants.
1854 * If none, flag a typo.
1856 if (!istypo && len>1)
1859 for (t=testword;*t;t=g_utf8_next_char(t))
1861 c=g_utf8_get_char(t);
1863 g_unicode_canonical_decomposition(c,&decomposition_len);
1864 if (c=='y' || g_unichar_isdigit(c))
1866 /* Yah, this is loose. */
1870 else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
1874 g_free(decomposition);
1876 if (!vowel || !consonant)
1880 * Now exclude the word from being reported if it's in
1883 for (i=0;*okword[i];i++)
1884 if (!strcmp(testword,okword[i]))
1887 * What looks like a typo may be a Roman numeral.
1890 if (istypo && isroman(testword))
1892 /* Check the manual list of typos. */
1894 for (i=0;*typo[i];i++)
1895 if (!strcmp(testword,typo[i]))
1898 * Check lowercase s, l, i and m - special cases.
1899 * "j" - often a semi-colon gone wrong.
1900 * "d" for a missing apostrophe - he d
1903 if (!istypo && len==1 &&
1904 g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
/* qword maps each queried testword to a heap-allocated counter. */
1908 dupcnt=g_tree_lookup(qword,testword);
1912 isdup=!pswit[VERBOSE_SWITCH];
1916 dupcnt=g_new0(int,1);
1917 g_tree_insert(qword,g_strdup(testword),dupcnt);
1922 if (pswit[ECHO_SWITCH])
1923 g_print("\n%s\n",aline);
1924 if (!pswit[OVERVIEW_SWITCH])
1926 g_print(" Line %ld column %ld - Query word %s",
1927 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
1929 if (!pswit[VERBOSE_SWITCH])
1930 g_print(" - not reporting duplicates");
1938 /* check the user's list of typos */
1939 if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
1941 if (pswit[ECHO_SWITCH])
1942 g_print("\n%s\n",aline);
1943 if (!pswit[OVERVIEW_SWITCH])
/* NOTE(review): column is offset+2 here but offset+1 for "Query word" above - confirm. */
1944 g_print(" Line %ld column %ld - Query possible scanno %s\n",
1945 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
1947 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
1949 if (pswit[PARANOID_SWITCH] && warnings->digit)
1951 /* In paranoid mode, query all 0 and 1 standing alone. */
1952 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
1954 if (pswit[ECHO_SWITCH])
1955 g_print("\n%s\n",aline);
1956 if (!pswit[OVERVIEW_SWITCH])
/* NOTE(review): offset+2 again, unlike "Query digit" at the top of the loop - confirm. */
1957 g_print(" Line %ld column %ld - Query standalone %s\n",
1958 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
1969 * check_for_misspaced_punctuation:
1971 * Look for added or missing spaces around punctuation and quotes.
1972 * If there is a punctuation character like ! with no space on
1973 * either side, suspect a missing!space. If there are spaces on
1974 * both sides , assume a typo. If we see a double quote with no
1975 * space or punctuation on either side of it, assume unspaced
1976 * quotes "like"this.
/*
 * Look for missing or superfluous spaces around punctuation and quotes.
 *
 * aline:       the line to examine.
 * parities:    running open/close state of quotes; parities->dquote and
 *              parities->squote are toggled here, so the state carries
 *              over to later calls.
 * isemptyline: suppresses some of the "Spaced punctuation?" reports.
 *
 * The line is scanned several times.  Most scans start at the second
 * character and track the previous, current and next characters in
 * pc, c and nc:
 *   1. Any of . ? ! , ; : _ with a letter after it and either a letter
 *      before it or strict punctuation -> "Missing space?", with
 *      acronym and ellipsis suppression.  Space before and space (or end
 *      of line) after -> "Spaced punctuation?".
 *   2. Any of ? ! , ; : preceded by a space and not followed by one.
 *   3. A period preceded by a space and followed by a letter.
 *   4. A double quote with no space or punctuation around it ->
 *      "Unspaced quotes?".
 *   5. Double-quote parity: the character after each quote is tested
 *      against one of two sets -> "Wrongspaced quotes?".
 *   6. A double quote at column 1 followed by closing punctuation or a
 *      space.
 *   7. With SQUOTE_SWITCH only: the same parity test for single quotes
 *      that are not between two letters -> "Wrongspaced singlequotes?".
 */
1978 void check_for_misspaced_punctuation(const char *aline,
1979 struct parities *parities,gboolean isemptyline)
1981 gboolean isacro,isellipsis;
1983 gunichar c,nc,pc,n2c;
/* Scan 1: missing space after, or spaces on both sides of, punctuation. */
1985 c=g_utf8_get_char(aline);
1986 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1987 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1991 nc=g_utf8_get_char(g_utf8_next_char(s));
1992 /* For each character in the line after the first. */
1993 if (g_utf8_strchr(".?!,;:_",-1,c)) /* if it's punctuation */
1995 /* we need to suppress warnings for acronyms like M.D. */
1997 /* we need to suppress warnings for ellipsis . . . */
2000 * If there are letters on both sides of it or
2001 * if it's strict punctuation followed by an alpha.
2003 if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
2004 g_utf8_strchr("?!,;:",-1,c)))
2008 if (g_utf8_pointer_to_offset(aline,s)>2 &&
2009 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
2011 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
2017 if (pswit[ECHO_SWITCH])
2018 g_print("\n%s\n",aline);
2019 if (!pswit[OVERVIEW_SWITCH])
2020 g_print(" Line %ld column %ld - Missing space?\n",
2021 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2026 if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
2029 * If there are spaces on both sides,
2030 * or space before and end of line.
2034 if (g_utf8_pointer_to_offset(aline,s)>2 &&
2035 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
2037 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
2041 if (!isemptyline && !isellipsis)
2043 if (pswit[ECHO_SWITCH])
2044 g_print("\n%s\n",aline);
2045 if (!pswit[OVERVIEW_SWITCH])
2046 g_print(" Line %ld column %ld - "
2047 "Spaced punctuation?\n",linecnt,
2048 g_utf8_pointer_to_offset(aline,s)+1);
2055 /* Split out the characters that CANNOT be preceded by space. */
2056 c=g_utf8_get_char(aline);
2057 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2058 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2062 nc=g_utf8_get_char(g_utf8_next_char(s));
2063 /* for each character in the line after the first */
2064 if (g_utf8_strchr("?!,;:",-1,c))
2066 /* if it's punctuation that _cannot_ have a space before it */
2067 if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
2070 * If nc DOES == space,
2071 * it was already reported just above.
2073 if (pswit[ECHO_SWITCH])
2074 g_print("\n%s\n",aline);
2075 if (!pswit[OVERVIEW_SWITCH])
2076 g_print(" Line %ld column %ld - Spaced punctuation?\n",
2077 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2084 * Special case " .X" where X is any alpha.
2085 * This plugs a hole in the acronym code above.
2086 * Inelegant, but maintainable.
2088 c=g_utf8_get_char(aline);
2089 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2090 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2094 nc=g_utf8_get_char(g_utf8_next_char(s));
2095 /* for each character in the line after the first */
2098 /* if it's a period */
2099 if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
2102 * If the period follows a space and
2103 * is followed by a letter.
2105 if (pswit[ECHO_SWITCH])
2106 g_print("\n%s\n",aline);
2107 if (!pswit[OVERVIEW_SWITCH])
2108 g_print(" Line %ld column %ld - Spaced punctuation?\n",
2109 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Scan 4: double quotes with no space or punctuation around them. */
2115 c=g_utf8_get_char(aline);
2116 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2117 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2121 nc=g_utf8_get_char(g_utf8_next_char(s));
2122 /* for each character in the line after the first */
2123 if (CHAR_IS_DQUOTE(c))
2125 if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
2126 !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
2127 !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
2129 if (pswit[ECHO_SWITCH])
2130 g_print("\n%s\n",aline);
2131 if (!pswit[OVERVIEW_SWITCH])
2132 g_print(" Line %ld column %ld - Unspaced quotes?\n",
2133 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2139 /* Check parity of quotes. */
2140 nc=g_utf8_get_char(aline);
2141 for (s=aline;*s;s=g_utf8_next_char(s))
2144 nc=g_utf8_get_char(g_utf8_next_char(s));
2145 if (CHAR_IS_DQUOTE(c))
2149 parities->dquote=!parities->dquote;
2150 parity=parities->dquote;
2152 else if (c==CHAR_LD_QUOTE)
2159 if (!g_utf8_strchr("_-.'`‘’/,;:!?)]} ",-1,nc))
2161 if (pswit[ECHO_SWITCH])
2162 g_print("\n%s\n",aline);
2163 if (!pswit[OVERVIEW_SWITCH])
2164 g_print(" Line %ld column %ld - "
2165 "Wrongspaced quotes?\n",
2166 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2174 if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
2175 !g_utf8_strchr("_-/.'`‘’([{$",-1,nc) || !nc)
2177 if (pswit[ECHO_SWITCH])
2178 g_print("\n%s\n",aline);
2179 if (!pswit[OVERVIEW_SWITCH])
2180 g_print(" Line %ld column %ld - "
2181 "Wrongspaced quotes?\n",
2182 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Scan 6: a double quote opening the line, followed by closing punctuation or a space. */
2189 c=g_utf8_get_char(aline);
2190 if (CHAR_IS_DQUOTE(c))
2192 if (g_utf8_strchr(",;:!?)]} ",-1,
2193 g_utf8_get_char(g_utf8_next_char(aline))))
2195 if (pswit[ECHO_SWITCH])
2196 g_print("\n%s\n",aline);
2197 if (!pswit[OVERVIEW_SWITCH])
2198 g_print(" Line %ld column 1 - Wrongspaced quotes?\n",
/* Scan 7: single-quote parity, only when SQUOTE_SWITCH is set. */
2204 if (pswit[SQUOTE_SWITCH])
2206 nc=g_utf8_get_char(aline);
2207 for (s=aline;*s;s=g_utf8_next_char(s))
2210 nc=g_utf8_get_char(g_utf8_next_char(s));
2211 if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&
2212 !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
2213 !g_unichar_isalpha(nc)))
2215 parities->squote=!parities->squote;
2216 if (!parities->squote)
2219 if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
2221 if (pswit[ECHO_SWITCH])
2222 g_print("\n%s\n",aline);
2223 if (!pswit[OVERVIEW_SWITCH])
2224 g_print(" Line %ld column %ld - "
2225 "Wrongspaced singlequotes?\n",
2226 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2234 if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
2235 !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
2237 if (pswit[ECHO_SWITCH])
2238 g_print("\n%s\n",aline);
2239 if (!pswit[OVERVIEW_SWITCH])
2240 g_print(" Line %ld column %ld - "
2241 "Wrongspaced singlequotes?\n",
2242 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2253 * check_for_double_punctuation:
2255 * Look for double punctuation like ,. or ,,
2256 * Thanks to DW for the suggestion!
2257 * In books with references, ".," and ".;" are common
2258 * e.g. "etc., etc.," and vol. 1.; vol 3.;
2259 * OTOH, from my initial tests, there are also fairly
2260 * common errors. What to do? Make these cases paranoid?
2261 * ".," is the most common, so warnings->dotcomma is used
2262 * to suppress detailed reporting if it occurs often.
/*
 * Query two adjacent punctuation characters from the set . ? ! , ; :
 *
 * aline:    the line to examine.
 * warnings: warnings->dotcomma and warnings->isFrench select exceptions.
 *
 * A pair falls under the exception test when any of these holds:
 *   - both characters are the same and are '.', '?' or '!';
 *   - the pair is ".," and warnings->dotcomma is clear;
 *   - warnings->isFrench is set and the text at s starts with an
 *     ellipsis joined to one of , ; : ! ?
 * For the French ellipsis forms nc is re-read from the line.  The
 * "Double punctuation?" report uses the 1-based character offset of the
 * first character of the pair.
 */
2264 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
2268 nc=g_utf8_get_char(aline);
2269 for (s=aline;*s;s=g_utf8_next_char(s))
2272 nc=g_utf8_get_char(g_utf8_next_char(s));
2273 /* for each punctuation character in the line */
2274 if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
2275 g_utf8_strchr(".?!,;:",-1,nc))
2277 /* followed by punctuation, it's a query, unless . . . */
2278 if (c==nc && (c=='.' || c=='?' || c=='!') ||
2279 !warnings->dotcomma && c=='.' && nc==',' ||
2280 warnings->isFrench && g_str_has_prefix(s,",...") ||
2281 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2282 warnings->isFrench && g_str_has_prefix(s,";...") ||
2283 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2284 warnings->isFrench && g_str_has_prefix(s,":...") ||
2285 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2286 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2287 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2288 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2289 warnings->isFrench && g_str_has_prefix(s,"...?"))
2291 if (warnings->isFrench && g_str_has_prefix(s,",...") ||
2292 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2293 warnings->isFrench && g_str_has_prefix(s,";...") ||
2294 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2295 warnings->isFrench && g_str_has_prefix(s,":...") ||
2296 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2297 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2298 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2299 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2300 warnings->isFrench && g_str_has_prefix(s,"...?"))
2303 nc=g_utf8_get_char(g_utf8_next_char(s));
2305 ; /* do nothing for .. !! and ?? which can be legit */
2309 if (pswit[ECHO_SWITCH])
2310 g_print("\n%s\n",aline);
2311 if (!pswit[OVERVIEW_SWITCH])
2312 g_print(" Line %ld column %ld - Double punctuation?\n",
2313 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2322 * check_for_spaced_quotes:
/*
 * Query quote marks that have a space on both sides.
 *
 * First every occurrence of space, double quote, space is reported as
 * "Spaced doublequote?".  Then, for each character in single_quotes[], a
 * three-character pattern (space, quote, space) is built in a GString and
 * every occurrence of it is reported as "Spaced singlequote?".  After a
 * match the search resumes two characters past it, so the trailing space
 * can start the next match.  The GString is freed before returning.
 */
2324 void check_for_spaced_quotes(const char *aline)
2328 const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,
2332 while ((t=strstr(s," \" ")))
2334 if (pswit[ECHO_SWITCH])
2335 g_print("\n%s\n",aline);
2336 if (!pswit[OVERVIEW_SWITCH])
2337 g_print(" Line %ld column %ld - Spaced doublequote?\n",
2338 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2341 s=g_utf8_next_char(g_utf8_next_char(t));
2343 pattern=g_string_new(NULL);
2344 for(i=0;i<G_N_ELEMENTS(single_quotes);i++)
2346 g_string_assign(pattern," ");
2347 g_string_append_unichar(pattern,single_quotes[i]);
2348 g_string_append_c(pattern,' ');
2350 while ((t=strstr(s,pattern->str)))
2352 if (pswit[ECHO_SWITCH])
2353 g_print("\n%s\n",aline);
2354 if (!pswit[OVERVIEW_SWITCH])
2355 g_print(" Line %ld column %ld - Spaced singlequote?\n",
2356 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2359 s=g_utf8_next_char(g_utf8_next_char(t));
2362 g_string_free(pattern,TRUE);
2366 * check_for_miscased_genative:
2368 * Check special case of 'S instead of 's at end of word.
/*
 * Query an apostrophe that follows a lower-case letter and is followed by
 * a capital 'S'.  The scan starts at the second character of aline.  The
 * column is the apostrophe's character offset plus 2, which is the
 * 1-based position of the 'S'.
 */
2370 void check_for_miscased_genative(const char *aline)
2376 c=g_utf8_get_char(aline);
2377 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2378 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2382 nc=g_utf8_get_char(g_utf8_next_char(s));
2383 if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))
2385 if (pswit[ECHO_SWITCH])
2386 g_print("\n%s\n",aline);
2387 if (!pswit[OVERVIEW_SWITCH])
2388 g_print(" Line %ld column %ld - Capital \"S\"?\n",
2389 linecnt,g_utf8_pointer_to_offset(aline,s)+2);
2397 * check_end_of_line:
2399 * Now check special cases - start and end of line -
2400 * for single and double quotes. Start is sometimes [sic]
2401 * but better to query it anyway.
2402 * While we're here, check for dash at end of line.
/*
 * Check the start and the end of aline for spaced quotes and, in
 * paranoid mode, for a trailing hyphen.
 *
 * aline:    the line to examine.
 * warnings: warnings->hyphen gates the trailing-hyphen query.
 *
 * For lines longer than one character:
 *   - last character is a single or double quote and the one before it
 *     is a space -> "Spaced quote?" at the last column;
 *   - first character is a single quote and the second is a space ->
 *     "Spaced quote?" at column 1.
 * With PARANOID_SWITCH and warnings->hyphen: trailing characters at or
 * below CHAR_SPACE are skipped backwards, and a '-' not preceded by
 * another '-' is queried.
 * NOTE(review): the hyphen column is the offset of s without +1 - confirm.
 */
2404 void check_end_of_line(const char *aline,struct warnings *warnings)
2409 lbytes=strlen(aline);
2410 if (g_utf8_strlen(aline,lbytes)>1)
2412 s=g_utf8_prev_char(aline+lbytes);
2413 c1=g_utf8_get_char(s);
2414 c2=g_utf8_get_char(g_utf8_prev_char(s));
2415 if ((CHAR_IS_DQUOTE(c1) || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)
2417 if (pswit[ECHO_SWITCH])
2418 g_print("\n%s\n",aline);
2419 if (!pswit[OVERVIEW_SWITCH])
2420 g_print(" Line %ld column %ld - Spaced quote?\n",linecnt,
2421 g_utf8_strlen(aline,lbytes));
2425 c1=g_utf8_get_char(aline);
2426 c2=g_utf8_get_char(g_utf8_next_char(aline));
2427 if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)
2429 if (pswit[ECHO_SWITCH])
2430 g_print("\n%s\n",aline);
2431 if (!pswit[OVERVIEW_SWITCH])
2432 g_print(" Line %ld column 1 - Spaced quote?\n",linecnt);
2437 * Dash at end of line may well be legit - paranoid mode only
2438 * and don't report em-dash at line-end.
2440 if (pswit[PARANOID_SWITCH] && warnings->hyphen)
2442 for (s=g_utf8_prev_char(aline+lbytes);
2443 s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
2445 if (g_utf8_get_char(s)=='-' &&
2446 g_utf8_get_char(g_utf8_prev_char(s))!='-')
2448 if (pswit[ECHO_SWITCH])
2449 g_print("\n%s\n",aline);
2450 if (!pswit[OVERVIEW_SWITCH])
2451 g_print(" Line %ld column %ld - "
2452 "Hyphen at end of line?\n",
2453 linecnt,g_utf8_pointer_to_offset(aline,s));
2460 * check_for_unspaced_bracket:
2462 * Brackets are often unspaced, but shouldn't be surrounded by alpha.
2463 * If so, suspect a scanno like "a]most".
/*
 * Query a bracket character ({ [ ( ) ] }) that has a letter on both
 * sides.  The scan starts at the second character and stops when there
 * is no next character, so the first and last characters of the line are
 * never tested as the bracket.
 * NOTE(review): the column is the offset of s without +1 - confirm.
 */
2465 void check_for_unspaced_bracket(const char *aline)
2469 c=g_utf8_get_char(aline);
2470 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2471 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2475 nc=g_utf8_get_char(g_utf8_next_char(s));
2478 /* for each bracket character in the line except 1st & last */
2479 if (g_utf8_strchr("{[()]}",-1,c) &&
2480 g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
2482 if (pswit[ECHO_SWITCH])
2483 g_print("\n%s\n",aline);
2484 if (!pswit[OVERVIEW_SWITCH])
2485 g_print(" Line %ld column %ld - Unspaced bracket?\n",
2486 linecnt,g_utf8_pointer_to_offset(aline,s));
2494 * check_for_unpunctuated_endquote:
/*
 * Query a closing or neutral double quote that directly follows a
 * letter, i.e. with no punctuation before the quote.  Each character
 * from the second onward is classified: double quotes get their
 * QUOTE_CLASS(), everything else is INVALID_QUOTE.
 * NOTE(review): the column is the offset of s without +1 - confirm.
 */
2496 void check_for_unpunctuated_endquote(const char *aline)
2501 c=g_utf8_get_char(aline);
2502 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2503 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2507 qc=CHAR_IS_DQUOTE(c)?QUOTE_CLASS(c):INVALID_QUOTE;
2508 nc=g_utf8_get_char(g_utf8_next_char(s));
2509 /* for each character in the line except 1st */
2510 if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) && g_unichar_isalpha(pc))
2512 if (pswit[ECHO_SWITCH])
2513 g_print("\n%s\n",aline);
2514 if (!pswit[OVERVIEW_SWITCH])
2515 g_print(" Line %ld column %ld - "
2516 "endquote missing punctuation?\n",
2517 linecnt,g_utf8_pointer_to_offset(aline,s));
2525 * check_for_html_tag:
2527 * Check for <HTML TAG>.
2529 * If there is a < in the line, followed at some point
2530 * by a > then we suspect HTML.
/*
 * Find the first '<' in aline and the first '>' after it.  The text from
 * '<' through '>' inclusive is copied with g_strndup() into tag and
 * printed in the "HTML Tag?" query.  The column is the 1-based character
 * offset of the '<'.
 */
2532 void check_for_html_tag(const char *aline)
2534 const char *open,*close;
2536 open=strchr(aline,'<');
2539 close=strchr(g_utf8_next_char(open),'>');
2542 if (pswit[ECHO_SWITCH])
2543 g_print("\n%s\n",aline);
2544 if (!pswit[OVERVIEW_SWITCH])
2546 tag=g_strndup(open,close-open+1);
2547 g_print(" Line %ld column %ld - HTML Tag? %s \n",
2548 linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
2558 * check_for_html_entity:
2560 * Check for &symbol; HTML.
2562 * If there is a & in the line, followed at
2563 * some point by a ; then we suspect HTML.
/*
 * Find the first '&' in aline and the first ';' after it.  If no space
 * lies between them, the text from '&' through ';' inclusive is copied
 * with g_strndup() into entity and printed in the "HTML symbol?" query.
 *
 * The column is the 1-based CHARACTER offset of the '&', computed with
 * g_utf8_pointer_to_offset() like every other check in this file.  A
 * plain byte difference (amp-aline) would give too large a column
 * whenever multi-byte UTF-8 characters precede the '&'.
 */
2565 void check_for_html_entity(const char *aline)
2567 const char *s,*amp,*scolon;
2569 amp=strchr(aline,'&');
2572 scolon=strchr(amp,';');
2575 for (s=amp;s<scolon;s=g_utf8_next_char(s))
2576 if (g_utf8_get_char(s)==CHAR_SPACE)
2577 break; /* Don't report "Jones & Son;" */
2580 if (pswit[ECHO_SWITCH])
2581 g_print("\n%s\n",aline);
2582 if (!pswit[OVERVIEW_SWITCH])
2584 entity=g_strndup(amp,scolon-amp+1);
2585 g_print(" Line %ld column %ld - HTML symbol? %s \n",
2586 linecnt,g_utf8_pointer_to_offset(aline,amp)+1,entity);
2597 * check_for_omitted_punctuation:
2599 * Check for omitted punctuation at end of paragraph by working back
2600 * through prevline. DW.
2601 * Need to check this only for "normal" paras.
2602 * So what is a "normal" para?
2603 * Not normal if one-liner (chapter headings, etc.)
2604 * Not normal if it doesn't contain at least one lowercase letter
2605 * Not normal if starts with space
/*
 * Query a paragraph whose last line (prevline) ends without punctuation.
 *
 * prevline:        the last line of the paragraph that just ended.
 * last:            last->blen must be greater than 2.
 * start_para_line: line number where the paragraph started; it must be
 *                  below linecnt-1, which rules out one-line paragraphs.
 *
 * The check only runs when prevline contains at least one letter and its
 * first character is above CHAR_SPACE.  Trailing closing or neutral
 * quotes are skipped backwards first, then the line is walked backwards:
 * reaching a letter produces the query, and the characters listed in the
 * g_utf8_strchr() set at the end of the loop are tested as acceptable
 * endings.  The line reported is linecnt-1 and the column is the length
 * of prevline in characters.
 */
2607 void check_for_omitted_punctuation(const char *prevline,
2608 struct line_properties *last,int start_para_line)
2610 gboolean letter_on_line=FALSE;
2613 gboolean closing_quote;
2614 for (s=prevline;*s;s=g_utf8_next_char(s))
2615 if (g_unichar_isalpha(g_utf8_get_char(s)))
2617 letter_on_line=TRUE;
2621 * This next "if" is a problem.
2622 * If we say "start_para_line <= linecnt - 1", that includes
2623 * one-line "paragraphs" like chapter heads. Lotsa false positives.
2624 * If we say "start_para_line < linecnt - 1" it doesn't, but then it
2625 * misses genuine one-line paragraphs.
2627 if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
2628 g_utf8_get_char(prevline)>CHAR_SPACE)
2630 s=prevline+strlen(prevline);
2633 s=g_utf8_prev_char(s);
2634 c=g_utf8_get_char(s);
2635 if (QUOTE_CLASS(c)==CLOSING_QUOTE || QUOTE_CLASS(c)==NEUTRAL_QUOTE)
2638 closing_quote=FALSE;
2639 } while (closing_quote && s>prevline);
2640 for (;s>prevline;s=g_utf8_prev_char(s))
2642 if (g_unichar_isalpha(g_utf8_get_char(s)))
2644 if (pswit[ECHO_SWITCH])
2645 g_print("\n%s\n",prevline);
2646 if (!pswit[OVERVIEW_SWITCH])
2647 g_print(" Line %ld column %ld - "
2648 "No punctuation at para end?\n",
2649 linecnt-1,g_utf8_strlen(prevline,-1));
2654 if (g_utf8_strchr("—-.:!([{?}])",-1,g_utf8_get_char(s)))
/*
 * Per-entry callback that prints a note about a queried word.
 *
 * key:   the queried word, used as a C string.
 * value: the value stored with the word; presumably its duplicate
 *        count, printed with %d - TODO confirm.
 * data:  no use of it is visible in this block.
 *
 * procfile() passes this function to g_tree_foreach() for the qword
 * tree.
 */
2660 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
2662 const char *word=key;
2665 g_print("\nNote: Queried word %s was duplicated %d times\n",
2670 void print_as_windows_1252(const char *string)
2672 gsize inbytes,outbytes;
2674 static GIConv converter=(GIConv)-1;
2677 if (converter!=(GIConv)-1)
2678 g_iconv_close(converter);
2679 converter=(GIConv)-1;
2682 if (converter==(GIConv)-1)
2683 converter=g_iconv_open("WINDOWS-1252","UTF-8");
2684 if (converter!=(GIConv)-1)
2686 inbytes=outbytes=strlen(string);
2687 bp=buf=g_malloc(outbytes+1);
2688 g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
2694 fputs(string,stdout);
/*
 * print_as_utf_8:
 *
 * Writes string to stdout exactly as given, with no character set
 * conversion. Counterpart of print_as_windows_1252().
 */
void print_as_utf_8(const char *string)
{
    (void)fputs(string,stdout);
}
/*
 * procfile:
 *
 * Runs the checks over one e-text and prints the resulting queries.
 *
 * filename: name of the file to read; it is also shown in the messages.
 *
 * Outline, as far as the visible statements show:
 *  - the text is read with read_etext(); on failure the error message
 *    is written to stdout or stderr according to pswit[STDOUT_SWITCH];
 *  - first_pass() and report_first_pass() produce the warnings that
 *    switch some of the per-line checks on or off;
 *  - lines are fetched one at a time with flgets() and the per-line
 *    checks are run on those that belong to the body of the text;
 *  - finally duplicated queried words are reported, the trees and
 *    counters are released, and the print handler and the cached
 *    Windows-1252 converter are reset.
 */
2707 void procfile(const char *filename)
2710 gchar *parastart=NULL; /* first line of current para */
2711 gchar *etext,*aline;
2714 struct first_pass_results *first_pass_results;
2715 struct warnings *warnings;
2716 struct counters counters={0};
2717 struct line_properties last={0};
2718 struct parities parities={0};
2719 struct pending pending={0};
2720 gboolean isemptyline;
2721 long start_para_line=0;
2722 gboolean isnewpara=FALSE,enddash=FALSE;
2723 last.start=CHAR_SPACE;
2724 linecnt=checked_linecnt=0;
2725 etext=read_etext(filename,&err);
/* The message held in err goes to stdout or to stderr. */
2728 if (pswit[STDOUT_SWITCH])
2729 fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
2731 fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
2734 g_print("\n\nFile: %s\n\n",filename);
2735 first_pass_results=first_pass(etext);
2736 warnings=report_first_pass(first_pass_results);
/* Both trees compare keys with strcmp and free them with g_free;
 * qword also frees its values with g_free, qperiod does not. */
2737 qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
2738 qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
2740 * Here we go with the main pass. Hold onto yer hat!
2744 while ((aline=flgets(&etext_ptr,linecnt+1)))
2749 if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
2750 continue; // skip DP page separators completely
/* Lines before firstline, or after a non-zero footerline, are not
 * checked; with pswit[HEADER_SWITCH] selected header fields found
 * there are echoed. */
2751 if (linecnt<first_pass_results->firstline ||
2752 (first_pass_results->footerline>0 &&
2753 linecnt>first_pass_results->footerline))
2755 if (pswit[HEADER_SWITCH])
2757 if (g_str_has_prefix(aline,"Title:"))
2758 g_print(" %s\n",aline);
2759 if (g_str_has_prefix(aline,"Author:"))
2760 g_print(" %s\n",aline);
2761 if (g_str_has_prefix(aline,"Release Date:"))
2762 g_print(" %s\n",aline);
2763 if (g_str_has_prefix(aline,"Edition:"))
2764 g_print(" %s\n\n",aline);
2766 continue; /* skip through the header */
2769 print_pending(aline,parastart,&pending);
2770 isemptyline=analyse_quotes(aline,&counters);
2771 if (isnewpara && !isemptyline)
2773 /* This line is the start of a new paragraph. */
2774 start_para_line=linecnt;
2775 /* Capture its first line in case we want to report it later. */
2777 parastart=g_strdup(aline);
2778 memset(&parities,0,sizeof(parities)); /* restart the quote count */
/* Advance to the first letter or digit of the line. */
2780 while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
2781 !g_unichar_isdigit(g_utf8_get_char(s)))
2782 s=g_utf8_next_char(s);
2783 if (g_unichar_islower(g_utf8_get_char(s)))
2785 /* and its first letter is lowercase */
2786 if (pswit[ECHO_SWITCH])
2787 g_print("\n%s\n",aline);
2788 if (!pswit[OVERVIEW_SWITCH])
2789 g_print(" Line %ld column %ld - "
2790 "Paragraph starts with lower-case\n",
2791 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2795 isnewpara=FALSE; /* Signal the end of new para processing. */
2797 /* Check for an em-dash broken at line end. */
2798 if (enddash && g_utf8_get_char(aline)=='-')
2800 if (pswit[ECHO_SWITCH])
2801 g_print("\n%s\n",aline);
2802 if (!pswit[OVERVIEW_SWITCH])
2803 g_print(" Line %ld column 1 - Broken em-dash?\n",linecnt);
/* Step back over trailing spaces to the last other character, then
 * test whether it is a '-'.
 * NOTE(review): for an empty aline the starting point lies before
 * aline; the s>=aline test below rejects it, but confirm that taking
 * g_utf8_prev_char() of the start of the buffer is acceptable. */
2808 for (s=g_utf8_prev_char(aline+strlen(aline));
2809 g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
2811 if (s>=aline && g_utf8_get_char(s)=='-')
/* Per-line checks; the long line, short line and end quote checks run
 * only when the matching warning is enabled. */
2813 check_for_control_characters(aline);
2814 check_for_odd_characters(aline,warnings,isemptyline);
2815 if (warnings->longline)
2816 check_for_long_line(aline);
2817 if (warnings->shortline)
2818 check_for_short_line(aline,&last);
/* Record this line's character length and first character in last. */
2820 last.len=g_utf8_strlen(aline,-1);
2821 last.start=g_utf8_get_char(aline);
2822 check_for_starting_punctuation(aline);
2825 check_for_spaced_emdash(aline);
2826 check_for_spaced_dash(aline);
2828 check_for_unmarked_paragraphs(aline);
2829 check_for_jeebies(aline);
2830 check_for_mta_from(aline);
2831 check_for_orphan_character(aline);
2832 check_for_pling_scanno(aline);
2833 check_for_extra_period(aline,warnings);
2834 check_for_following_punctuation(aline);
2835 check_for_typos(aline,warnings);
2836 check_for_misspaced_punctuation(aline,&parities,isemptyline);
2837 check_for_double_punctuation(aline,warnings);
2838 check_for_spaced_quotes(aline);
2839 check_for_miscased_genative(aline);
2840 check_end_of_line(aline,warnings);
2841 check_for_unspaced_bracket(aline);
2842 if (warnings->endquote)
2843 check_for_unpunctuated_endquote(aline);
2844 check_for_html_tag(aline);
2845 check_for_html_entity(aline);
/* Paragraph-level checks; the condition guarding them is not visible
 * in this block. */
2848 check_for_mismatched_quotes(&counters,&pending);
2849 counters_reset(&counters);
2850 /* let the next iteration know that it's starting a new para */
2853 check_for_omitted_punctuation(prevline,&last,start_para_line);
/* Keep a copy of this line for the next iteration. */
2856 prevline=g_strdup(aline);
/* After the last line: final quote check and pending output. */
2859 check_for_mismatched_quotes(&counters,&pending);
2860 print_pending(NULL,parastart,&pending);
2861 reset_pending(&pending);
2870 if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])
2871 g_tree_foreach(qword,report_duplicate_queries,NULL);
2872 g_tree_unref(qword);
2873 g_tree_unref(qperiod);
2874 counters_destroy(&counters);
/* Restore the default print handler and close the converter cached by
 * print_as_windows_1252(). */
2875 g_set_print_handler(NULL);
2876 print_as_windows_1252(NULL);
2877 if (pswit[MARKUP_SWITCH])
2884 * Get one line from the input text, checking for
2885 * the existence of exactly one CR/LF line-end per line.
2887 * Returns: a pointer to the line.
/*
 * flgets:
 *
 * etext: in/out cursor into the text; *etext is advanced as characters
 *        are consumed, and the line handed back starts where *etext
 *        pointed on entry.
 * lcnt:  line number to show in line-end queries.
 *
 * The line-end queries ("No LF?", "No CR?", "Two successive CRs?" and
 * "CR without LF?") are produced only when pswit[LINE_END_SWITCH] is
 * set. With pswit[ECHO_SWITCH] a copy of the line is echoed first, and
 * the query itself is printed unless pswit[OVERVIEW_SWITCH] is set.
 *
 * With pswit[MARKUP_SWITCH] the line is passed through
 * postprocess_for_HTML(), and with pswit[DP_SWITCH] through
 * postprocess_for_DP(), both of which edit it in place.
 */
2889 char *flgets(char **etext,long lcnt)
2892 gboolean isCR=FALSE;
2893 char *theline=*etext;
2898 c=g_utf8_get_char(*etext);
2901 if (*etext==theline)
2903 else if (pswit[LINE_END_SWITCH])
2905 if (pswit[ECHO_SWITCH])
/* Echo only the text of the line, from theline up to eos. */
2907 s=g_strndup(theline,eos-theline);
2908 g_print("\n%s\n",s);
2911 if (!pswit[OVERVIEW_SWITCH])
2912 /* There may, or may not, have been a CR */
2913 g_print(" Line %ld - No LF?\n",lcnt);
2919 *etext=g_utf8_next_char(*etext);
2920 /* either way, it's end of line */
2927 /* Error - a LF without a preceding CR */
2928 if (pswit[LINE_END_SWITCH])
2930 if (pswit[ECHO_SWITCH])
2932 s=g_strndup(theline,eos-theline);
2933 g_print("\n%s\n",s);
2936 if (!pswit[OVERVIEW_SWITCH])
2937 g_print(" Line %ld - No CR?\n",lcnt);
2948 /* Error - two successive CRs */
2949 if (pswit[LINE_END_SWITCH])
2951 if (pswit[ECHO_SWITCH])
2953 s=g_strndup(theline,eos-theline);
2954 g_print("\n%s\n",s);
2957 if (!pswit[OVERVIEW_SWITCH])
2958 g_print(" Line %ld - Two successive CRs?\n",lcnt);
/* A CR was seen earlier (isCR) and is not followed by a LF. */
2967 if (pswit[LINE_END_SWITCH] && isCR)
2969 if (pswit[ECHO_SWITCH])
2971 s=g_strndup(theline,eos-theline);
2972 g_print("\n%s\n",s);
2975 if (!pswit[OVERVIEW_SWITCH])
2976 g_print(" Line %ld column %ld - CR without LF?\n",
2977 lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
2983 eos=g_utf8_next_char(eos);
/* Optional clean-up of HTML and DP markup, done in place. */
2987 if (pswit[MARKUP_SWITCH])
2988 postprocess_for_HTML(theline);
2989 if (pswit[DP_SWITCH])
2990 postprocess_for_DP(theline);
2997 * Takes a "word" as a parameter, and checks whether it
2998 * contains a mixture of alpha and digits. Generally, this is an
2999 * error, but may not be for cases like 4th or L5 12s. 3d.
3001 * Returns: TRUE iff an is error found.
3003 gboolean mixdigit(const char *checkword)
3005 gboolean wehaveadigit,wehavealetter,query;
3006 const char *s,*nondigit;
3007 wehaveadigit=wehavealetter=query=FALSE;
3008 for (s=checkword;*s;s=g_utf8_next_char(s))
3009 if (g_unichar_isalpha(g_utf8_get_char(s)))
3011 else if (g_unichar_isdigit(g_utf8_get_char(s)))
3013 if (wehaveadigit && wehavealetter)
3015 /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
3017 for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
3018 nondigit=g_utf8_next_char(nondigit))
3020 /* digits, ending in st, rd, nd, th of either case */
3021 if (!g_ascii_strcasecmp(nondigit,"st") ||
3022 !g_ascii_strcasecmp(nondigit,"rd") ||
3023 !g_ascii_strcasecmp(nondigit,"nd") ||
3024 !g_ascii_strcasecmp(nondigit,"th"))
3026 if (!g_ascii_strcasecmp(nondigit,"sts") ||
3027 !g_ascii_strcasecmp(nondigit,"rds") ||
3028 !g_ascii_strcasecmp(nondigit,"nds") ||
3029 !g_ascii_strcasecmp(nondigit,"ths"))
3031 if (!g_ascii_strcasecmp(nondigit,"stly") ||
3032 !g_ascii_strcasecmp(nondigit,"rdly") ||
3033 !g_ascii_strcasecmp(nondigit,"ndly") ||
3034 !g_ascii_strcasecmp(nondigit,"thly"))
3036 /* digits, ending in l, L, s or d */
3037 if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
3038 !strcmp(nondigit,"d"))
3041 * L at the start of a number, representing Britsh pounds, like L500.
3042 * This is cute. We know the current word is mixed digit. If the first
3043 * letter is L, there must be at least one digit following. If both
3044 * digits and letters follow, we have a genuine error, else we have a
3045 * capital L followed by digits, and we accept that as a non-error.
3047 if (g_utf8_get_char(checkword)=='L' &&
3048 !mixdigit(g_utf8_next_char(checkword)))
3057 * Extracts the first/next "word" from the line, and returns it.
3058 * A word is defined as one English word unit--or at least that's the aim.
3059 * "ptr" is advanced to the position in the line where we will start
3060 * looking for the next word.
3062 * Returns: A newly-allocated string.
/*
 * ptr: in/out cursor into the line. Leading characters that are neither
 *      letters nor digits are skipped, and in the regular case *ptr is
 *      left just after the word that was collected.
 *
 * The result comes from g_string_free(word,FALSE), i.e. the character
 * data of the GString, which the caller is expected to release.
 */
3064 gchar *getaword(const char **ptr)
3069 word=g_string_new(NULL);
/* Skip anything that cannot start a word, stopping at the NUL. */
3070 for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
3071 !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
3072 **ptr;*ptr=g_utf8_next_char(*ptr))
3075 * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
3076 * Especially yucky is the case of L1,000
3077 * This section looks for a pattern of characters including a digit
3078 * followed by a comma or period followed by one or more digits.
3079 * If found, it returns this whole pattern as a word; otherwise we discard
3080 * the results and resume our normal programming.
/* Look-ahead: collect the run of digits, letters, commas and periods
 * that starts at s. */
3083 for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
3084 g_unichar_isalpha(g_utf8_get_char(s)) ||
3085 g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
3086 g_string_append_unichar(word,g_utf8_get_char(s));
/* From the second character on, look for a '.' or ',' that directly
 * follows a digit; if one is found the whole run is returned.
 * NOTE(review): the visible test looks only at the character before
 * the '.' or ',', not at what follows it - confirm against the
 * description above. */
3089 for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))
3091 c=g_utf8_get_char(t);
3092 pc=g_utf8_get_char(g_utf8_prev_char(t));
3093 if ((c=='.' || c==',') && g_unichar_isdigit(pc))
3096 return g_string_free(word,FALSE);
3100 /* we didn't find a punctuated number - do the regular getword thing */
3101 g_string_truncate(word,0);
/* A regular word is a run of digits, letters and apostrophes. */
3102 c=g_utf8_get_char(*ptr);
3103 for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);
3104 *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))
3105 g_string_append_unichar(word,c);
3106 return g_string_free(word,FALSE);
3112 * Is this word a Roman Numeral?
3114 * It doesn't actually validate that the number is a valid Roman Numeral--for
3115 * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
3116 * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
3117 * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
3118 * expressions thereof, except when it came to taxes. Allow any number of M,
3119 * an optional D, an optional CM or CD, any number of optional Cs, an optional
3120 * XL or an optional XC, an optional IX or IV, an optional V and any number
3123 gboolean isroman(const char *t)
3129 while (g_utf8_get_char(t)=='m' && *t)
3131 if (g_utf8_get_char(t)=='d')
3133 if (g_str_has_prefix(t,"cm"))
3135 if (g_str_has_prefix(t,"cd"))
3137 while (g_utf8_get_char(t)=='c' && *t)
3139 if (g_str_has_prefix(t,"xl"))
3141 if (g_str_has_prefix(t,"xc"))
3143 if (g_utf8_get_char(t)=='l')
3145 while (g_utf8_get_char(t)=='x' && *t)
3147 if (g_str_has_prefix(t,"ix"))
3149 if (g_str_has_prefix(t,"iv"))
3151 if (g_utf8_get_char(t)=='v')
3153 while (g_utf8_get_char(t)=='i' && *t)
3159 * postprocess_for_DP:
3161 * Invoked with the -d switch from flgets().
3162 * It simply "removes" from the line a hard-coded set of common
3163 * DP-specific tags, so that the line passed to the main routine has
3164 * been pre-cleaned of DP markup.
3166 void postprocess_for_DP(char *theline)
3172 for (i=0;*DPmarkup[i];i++)
3173 while ((s=strstr(theline,DPmarkup[i])))
3175 t=s+strlen(DPmarkup[i]);
3176 memmove(s,t,strlen(t)+1);
/*
 * postprocess_for_HTML:
 *
 * Invoked with the -m switch from flgets().
 * Calls losemarkup() until it has nothing more to remove and then
 * loseentities(), so that the line passed to the main routine has been
 * pre-cleaned of a hard-coded set of common HTML tags and entities.
 * The line is edited in place.
 */
void postprocess_for_HTML(char *theline)
{
    char *removed;

    do
        removed=losemarkup(theline);
    while (removed);
    loseentities(theline);
}
3196 char *losemarkup(char *theline)
3200 s=strchr(theline,'<');
3201 t=s?strchr(s,'>'):NULL;
3204 for (i=0;*markup[i];i++)
3205 if (tagcomp(g_utf8_next_char(s),markup[i]))
3207 t=g_utf8_next_char(t);
3208 memmove(s,t,strlen(t)+1);
3211 /* It's an unrecognized <xxx>. */
/*
 * loseentities:
 *
 * Replaces HTML entities in theline with the characters they stand for,
 * editing the line in place.
 *
 * For each "&" the text up to the following ";" is examined:
 *  - if everything from amp+2 up to the ";" is decimal digits it is
 *    read as a decimal number;
 *  - if amp[2] is 'x' and the rest up to the ";" is hexadecimal digits
 *    it is read as a hexadecimal number;
 *  - the remaining visible case looks the name between "&" and ";" up
 *    in a tree filled from the HTMLentities table.
 * A character below 128, or from 192 to 255, is written directly as
 * UTF-8; any other is first converted with the "ISO_8859-1//TRANSLIT"
 * converter and then back to UTF-8. The text after the ";" is then
 * moved up to follow the replacement.
 *
 * The two converters are static and are opened on first use; the
 * g_iconv_close() calls near the top release them again.
 *
 * NOTE(review): entities is shown without "static" while the converters
 * have it. If that is accurate the tree is built afresh by every call
 * that reaches g_tree_new() and the g_tree_destroy() call can never see
 * it - confirm whether "static" was intended.
 */
3215 void loseentities(char *theline)
3222 GTree *entities=NULL;
3223 static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
3227 g_tree_destroy(entities);
3229 if (translit!=(GIConv)-1)
3230 g_iconv_close(translit);
3231 translit=(GIConv)-1;
3232 if (to_utf8!=(GIConv)-1)
3233 g_iconv_close(to_utf8);
/* Name to code point map; names and values come from HTMLentities. */
3241 entities=g_tree_new((GCompareFunc)strcmp);
3242 for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
3243 g_tree_insert(entities,HTMLentities[i].name,
3244 GUINT_TO_POINTER(HTMLentities[i].c));
3246 if (translit==(GIConv)-1)
3247 translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
3248 if (to_utf8==(GIConv)-1)
3249 to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
3250 while((amp=strchr(theline,'&')))
3252 scolon=strchr(amp,';');
3257 if (amp+2+strspn(amp+2,"0123456789")==scolon)
3258 c=strtol(amp+2,NULL,10);
3259 else if (amp[2]=='x' &&
3260 amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
3261 c=strtol(amp+3,NULL,16);
/* Named entity: look up the text between "&" and ";". */
3265 s=g_strndup(amp+1,scolon-(amp+1));
3266 c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
3275 if (c<128 || c>=192 && c<=255) /* An ISO-8859-1 character */
3276 theline+=g_unichar_to_utf8(c,theline);
/* Otherwise go through ISO_8859-1 with transliteration and back. */
3280 nb=g_unichar_to_utf8(c,s);
3281 t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
3283 s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
3285 memcpy(theline,s,nb);
/* Move the rest of the line, including its NUL, up to theline. */
3289 memmove(theline,g_utf8_next_char(scolon),
3290 strlen(g_utf8_next_char(scolon))+1);
/* Resume the search after this "&". */
3293 theline=g_utf8_next_char(amp);
3297 gboolean tagcomp(const char *strin,const char *basetag)
3301 if (g_utf8_get_char(strin)=='/')
3302 t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
3304 t=g_utf8_casefold(strin,-1);
3305 s=g_utf8_casefold(basetag,-1);
3306 retval=g_str_has_prefix(t,s);
3312 void proghelp(GOptionContext *context)
3315 fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
3316 fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
3317 fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
3318 fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
3319 "For details, read the file COPYING.\n",stderr);
3320 fputs("This is Free Software; "
3321 "you may redistribute it under certain conditions (GPL);\n",stderr);
3322 fputs("read the file COPYING for details.\n\n",stderr);
3323 help=g_option_context_get_help(context,TRUE,NULL);
3326 fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
3327 fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
3328 "non-ASCII\n",stderr);
3329 fputs("characters like accented letters, "
3330 "lines longer than 75 or shorter than 55,\n",stderr);
3331 fputs("unbalanced quotes or brackets, "
3332 "a variety of badly formatted punctuation, \n",stderr);
3333 fputs("HTML tags, some likely typos. "
3334 "It is NOT a substitute for human judgement.\n",stderr);