1 /*************************************************************************/
2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com> */
5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk> */
7 /* This program is free software; you can redistribute it and/or modify */
8 /* it under the terms of the GNU General Public License as published by */
9 /* the Free Software Foundation; either version 2 of the License, or */
10 /* (at your option) any later version. */
12 /* This program is distributed in the hope that it will be useful, */
13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
15 /* GNU General Public License for more details. */
17 /* You should have received a copy of the GNU General Public License */
18 /* along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 /*************************************************************************/
30 #include "bookloupe.h"
33 #include "HTMLentities.h"
35 gchar *charset; /* Or NULL for auto (ISO_8859-1/ASCII or UNICODE) */
36 GIConv charset_validator=(GIConv)-1;
42 "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
43 "nad", "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa",
44 "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
45 "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
46 "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
47 "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
48 "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
49 "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
50 "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
51 "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
52 "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
53 "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
54 "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
55 "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
56 "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
57 "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
58 "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
59 "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
60 "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
61 "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
62 "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
63 "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
64 "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
65 "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
66 "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
67 "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
68 "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
69 "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
70 "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
76 /* Common abbreviations and other OK words not to query as typos. */
78 "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
79 "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
80 "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
81 "outbid", "outbids", "frostbite", "frostbitten", ""
84 /* Common abbreviations that cause otherwise unexplained periods. */
86 "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
87 "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
91 * Two-Letter combinations that rarely if ever start words,
92 * but are common scannos or otherwise common letter combinations.
95 "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
99 * Two-Letter combinations that rarely if ever end words,
100 * but are common scannos or otherwise common letter combinations.
103 "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
104 "sw", "gr", "sl", "cl", "iy", ""
108 "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
109 "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
110 "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
111 "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
115 "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
119 "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
120 "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
121 "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
122 "during", "let", "toward", "among", ""
126 "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
127 "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
128 "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
129 "among", "those", "into", "whom", "having", "thence", ""
132 gboolean pswit[SWITNO]; /* program switches */
135 static GOptionEntry options[]={
136 { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
137 "Ignore DP-specific markup", NULL },
138 { "noecho", 'e', 0, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
139 "Don't echo queried line", NULL },
140 { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
141 "Check single quotes", NULL },
142 { "typo", 't', 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
143 "Check common typos", NULL },
144 { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
145 "Require closure of quotes on every paragraph", NULL },
146 { "relaxed", 'x', 0, G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
147 "Disable paranoid querying of everything", NULL },
148 { "line-end", 'l', 0, G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
149 "Disable line end checking", NULL },
150 { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
151 "Overview: just show counts", NULL },
152 { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
153 "Output errors to stdout instead of stderr", NULL },
154 { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
155 "Echo header fields", NULL },
156 { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
157 "Ignore markup in < >", NULL },
158 { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
159 "Use file of user-defined typos", NULL },
160 { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
161 "Defaults for use on www upload", NULL },
162 { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
163 "Verbose - list everything", NULL },
164 { "charset", 0, 0, G_OPTION_ARG_STRING, &opt_charset,
165 "Set of characters valid for this ebook", "NAME" },
169 long cnt_quote; /* for overview mode, count of quote queries */
170 long cnt_brack; /* for overview mode, count of brackets queries */
171 long cnt_bin; /* for overview mode, count of non-ASCII queries */
172 long cnt_odd; /* for overview mode, count of odd character queries */
173 long cnt_long; /* for overview mode, count of long line errors */
174 long cnt_short; /* for overview mode, count of short line queries */
175 long cnt_punct; /* for overview mode,
176 count of punctuation and spacing queries */
177 long cnt_dash; /* for overview mode, count of dash-related queries */
178 long cnt_word; /* for overview mode, count of word queries */
179 long cnt_html; /* for overview mode, count of html queries */
180 long cnt_lineend; /* for overview mode, count of line-end queries */
181 long cnt_spacend; /* count of lines with space at end */
182 long linecnt; /* count of total lines in the file */
183 long checked_linecnt; /* count of lines actually checked */
185 void proghelp(GOptionContext *context);
186 void procfile(const char *);
190 gboolean mixdigit(const char *);
191 gchar *getaword(const char **);
192 char *flgets(char **,long);
193 void postprocess_for_HTML(char *);
194 char *linehasmarkup(char *);
195 char *losemarkup(char *);
196 gboolean tagcomp(const char *,const char *);
197 void loseentities(char *);
198 gboolean isroman(const char *);
199 void postprocess_for_DP(char *);
200 void print_as_windows_1252(const char *string);
201 void print_as_utf_8(const char *string);
203 GTree *qword,*qperiod;
209 gboolean set_charset(const char *name,GError **err)
211 /* The various UNICODE encodings all share the same character set. */
212 const char *unicode_aliases[]={ "UCS-2", "UCS-2BE", "UCS-2LE", "UCS-4",
213 "UCS-4BE", "UCS-4LE", "UCS2", "UCS4", "UNICODE", "UNICODEBIG",
214 "UNICODELITTLE", "UTF-7", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE",
215 "UTF-32", "UTF-32BE", "UTF-32LE", "UTF7", "UTF8", "UTF16", "UTF16BE",
216 "UTF16LE", "UTF32", "UTF32BE", "UTF32LE" };
220 if (charset_validator!=(GIConv)-1)
221 g_iconv_close(charset_validator);
222 if (!name || !g_strcasecmp(name,"auto"))
225 charset_validator=(GIConv)-1;
229 charset=g_strdup(name);
230 for(i=0;i<G_N_ELEMENTS(unicode_aliases);i++)
231 if (!g_strcasecmp(charset,unicode_aliases[i]))
234 charset=g_strdup("UTF-8");
237 if (!strcmp(charset,"UTF-8"))
238 charset_validator=(GIConv)-1;
241 charset_validator=g_iconv_open(charset,"UTF-8");
242 if (charset_validator==(GIConv)-1)
244 g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_NO_CONVERSION,
245 "Unknown character set \"%s\"",charset);
/*
 * parse_options: parse the command line through GOption using the
 * options table, then turn the raw switch values in pswit[] into the
 * settings used by the rest of the program.
 * argc/argv are passed by address to g_option_context_parse().
 */
252 void parse_options(int *argc,char ***argv)
255 GOptionContext *context;
256 context=g_option_context_new(
257 "file - looks for errors in Project Gutenberg(TM) etexts");
258 g_option_context_add_main_entries(context,options,NULL);
/* A parse failure is reported together with a hint to use --help. */
259 if (!g_option_context_parse(context,argc,argv,&err))
261 g_printerr("Bookloupe: %s\n",err->message);
262 g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
265 /* Paranoid checking is turned OFF, not on, by its switch */
266 pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
267 if (pswit[PARANOID_SWITCH])
268 /* if running in paranoid mode, typo checks default to enabled */
269 pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
270 /* Line-end checking is turned OFF, not on, by its switch */
271 pswit[LINE_END_SWITCH]=!pswit[LINE_END_SWITCH];
272 /* Echoing is turned OFF, not on, by its switch */
273 pswit[ECHO_SWITCH]=!pswit[ECHO_SWITCH];
274 if (pswit[OVERVIEW_SWITCH])
275 /* just print summary; don't echo */
276 pswit[ECHO_SWITCH]=FALSE;
278 * Web uploads - for the moment, this is really just a placeholder
279 * until we decide what processing we really want to do on web uploads
/* --web replaces every other switch with a fixed set of values. */
281 if (pswit[WEB_SWITCH])
283 /* specific override for web uploads */
284 pswit[ECHO_SWITCH]=TRUE;
285 pswit[SQUOTE_SWITCH]=FALSE;
286 pswit[TYPO_SWITCH]=TRUE;
287 pswit[QPARA_SWITCH]=FALSE;
288 pswit[PARANOID_SWITCH]=TRUE;
289 pswit[LINE_END_SWITCH]=FALSE;
290 pswit[OVERVIEW_SWITCH]=FALSE;
291 pswit[STDOUT_SWITCH]=FALSE;
292 pswit[HEADER_SWITCH]=TRUE;
293 pswit[VERBOSE_SWITCH]=FALSE;
294 pswit[MARKUP_SWITCH]=FALSE;
295 pswit[USERTYPO_SWITCH]=FALSE;
296 pswit[DP_SWITCH]=FALSE;
/* Apply --charset if given; set_charset() fills err for an unknown name. */
298 if (opt_charset && !set_charset(opt_charset,&err))
300 g_printerr("%s\n",err->message);
310 g_option_context_free(context);
316 * Read in the user-defined stealth scanno list.
/*
 * read_user_scannos: load the user's typo list into the usertypo tree.
 *
 * The file is looked for, in order, as "bookloupe.typ" in the current
 * directory, "bookloupe.typ" in running_from, "gutcheck.typ" in the
 * current directory and "gutcheck.typ" in running_from. Each later
 * attempt is made only when the previous one failed with
 * G_FILE_ERROR_NOENT.
 */
318 void read_user_scannos(void)
321 gchar *usertypo_file;
325 gchar *contents,*utf8,**lines;
326 usertypo_file=g_strdup("bookloupe.typ");
327 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
328 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
331 g_free(usertypo_file);
332 usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
333 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
335 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
338 g_free(usertypo_file);
339 usertypo_file=g_strdup("gutcheck.typ");
340 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
342 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
345 g_free(usertypo_file);
346 usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
347 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
/* Still not found after all four locations: tell the user. */
349 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
351 g_free(usertypo_file);
352 g_print(" --> I couldn't find bookloupe.typ "
353 "-- proceeding without user typos.\n");
/* Report the file name together with the error message. */
358 fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
359 g_free(usertypo_file);
/*
 * Valid UTF-8 is normalized (composed form) and the character set is
 * switched to UNICODE; anything else is converted from WINDOWS-1252.
 */
363 if (g_utf8_validate(contents,len,NULL))
365 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
367 (void)set_charset("UNICODE",NULL);
370 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
/* One entry per line; both CR and LF act as separators. */
372 lines=g_strsplit_set(utf8,"\r\n",0);
/* Keys are compared with strcmp and released with g_free by the tree. */
374 usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
375 for (i=0;lines[i];i++)
/* Only lines whose first byte is above '!' are stored. */
376 if (*(unsigned char *)lines[i]>'!')
377 g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
386 * Read an etext returning a newly allocated string containing the file
387 * contents or NULL on error.
/*
 * read_etext: read filename with g_file_get_contents() and produce its
 * text as UTF-8 in utf8. Input that validates as UTF-8 is normalized to
 * composed form; anything else is converted from WINDOWS-1252. The g_print
 * handler is set to match the encoding that was detected.
 */
389 gchar *read_etext(const char *filename,GError **err)
391 GError *tmp_err=NULL;
392 gchar *contents,*utf8;
393 gsize len,bytes_read,bytes_written;
395 if (!g_file_get_contents(filename,&contents,&len,err))
397 if (g_utf8_validate(contents,len,NULL))
399 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
400 g_set_print_handler(print_as_utf_8);
/* NOTE(review): presumably Windows-only; the guard is not visible here. */
402 SetConsoleOutputCP(CP_UTF8);
407 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,
408 &bytes_written,&tmp_err);
409 if (g_error_matches(tmp_err,G_CONVERT_ERROR,
410 G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
/*
 * Walk the bytes that converted successfully to work out the line and
 * column of the offending byte, which is contents[bytes_read].
 */
413 for(i=0;i<bytes_read;i++)
414 if (contents[i]=='\n')
419 else if (contents[i]!='\r')
421 g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
422 "Input conversion failed. Byte %d at line %d, column %d is not a "
423 "valid Windows-1252 character",
424 ((unsigned char *)contents)[bytes_read],line,col);
427 g_propagate_error(err,tmp_err);
428 g_set_print_handler(print_as_windows_1252);
/* NOTE(review): presumably Windows-only; the guard is not visible here. */
430 SetConsoleOutputCP(1252);
/*
 * cleanup_on_exit: registered with atexit() by main(); puts the console
 * output code page back to the value main() stored in saved_cp.
 * NOTE(review): presumably Windows-only; the guard is not visible here.
 */
437 void cleanup_on_exit(void)
440 SetConsoleOutputCP(saved_cp);
/*
 * main: program entry point. Records the directory the program was
 * started from in running_from, parses the options, optionally loads the
 * user typo list and, in overview mode, prints the per-category query
 * counts and their total. running_from, the usertypo tree and the charset
 * selection are released before leaving.
 */
444 int main(int argc,char **argv)
/* NOTE(review): the console code page handling is presumably Windows-only. */
447 atexit(cleanup_on_exit);
448 saved_cp=GetConsoleOutputCP();
/* Used by read_user_scannos() as a second place to look for typo files. */
450 running_from=g_path_get_dirname(argv[0]);
451 parse_options(&argc,&argv);
452 if (pswit[USERTYPO_SWITCH])
454 fprintf(stderr,"bookloupe: Check and report on an e-text\n");
456 if (pswit[OVERVIEW_SWITCH])
/* Lines not checked are reported as head+foot. */
458 g_print(" Checked %ld lines of %ld (head+foot = %ld)\n\n",
459 checked_linecnt,linecnt,linecnt-checked_linecnt);
460 g_print(" --------------- Queries found --------------\n");
462 g_print(" Long lines: %14ld\n",cnt_long);
464 g_print(" Short lines: %14ld\n",cnt_short);
466 g_print(" Line-end problems: %14ld\n",cnt_lineend);
468 g_print(" Common typos: %14ld\n",cnt_word);
470 g_print(" Unmatched quotes: %14ld\n",cnt_quote);
472 g_print(" Unmatched brackets: %14ld\n",cnt_brack);
474 g_print(" Non-ASCII characters: %14ld\n",cnt_bin);
476 g_print(" Proofing characters: %14ld\n",cnt_odd);
478 g_print(" Punctuation & spacing queries: %14ld\n",cnt_punct);
480 g_print(" Non-standard dashes: %14ld\n",cnt_dash);
482 g_print(" Possible HTML tags: %14ld\n",cnt_html);
/* The total is the sum of the eleven query counters listed above. */
484 g_print(" TOTAL QUERIES %14ld\n",
485 cnt_quote+cnt_brack+cnt_bin+cnt_odd+cnt_long+cnt_short+cnt_punct+
486 cnt_dash+cnt_word+cnt_html+cnt_lineend);
488 g_free(running_from);
490 g_tree_unref(usertypo);
/* A NULL name makes set_charset() close any open validator. */
491 set_charset(NULL,NULL);
498 * Run a first pass - verify that it's a valid PG
499 * file, decide whether to report some things that
500 * occur many times in the text like long or short
501 * lines, non-standard dashes, etc.
503 struct first_pass_results *first_pass(const char *etext)
505 gunichar laststart=CHAR_SPACE;
510 unsigned int lastlen=0,lastblen=0;
511 long spline=0,nspline=0;
512 static struct first_pass_results results={0};
515 lines=g_strsplit(etext,"\n",0);
516 for (j=0;lines[j];j++)
518 lbytes=strlen(lines[j]);
519 while (lbytes>0 && lines[j][lbytes-1]=='\r')
520 lines[j][--lbytes]='\0';
521 llen=g_utf8_strlen(lines[j],lbytes);
523 if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
524 (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
527 g_print(" --> Duplicate header?\n");
528 spline=linecnt+1; /* first line of non-header text, that is */
530 if (!strncmp(lines[j],"*** START",9) &&
531 strstr(lines[j],"PROJECT GUTENBERG"))
534 g_print(" --> Duplicate header?\n");
535 nspline=linecnt+1; /* first line of non-header text, that is */
537 if (spline || nspline)
539 lc_line=g_utf8_strdown(lines[j],lbytes);
540 if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
542 if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
544 if (results.footerline)
546 /* it's an old-form header - we can detect duplicates */
548 g_print(" --> Duplicate footer?\n");
551 results.footerline=linecnt;
557 results.firstline=spline;
559 results.firstline=nspline; /* override with new */
560 if (results.footerline)
561 continue; /* don't count the boilerplate in the footer */
562 results.totlen+=llen;
563 for (s=lines[j];*s;s=g_utf8_next_char(s))
565 if (g_utf8_get_char(s)>127)
567 if (g_unichar_isalpha(g_utf8_get_char(s)))
571 if (CHAR_IS_DQUOTE(g_utf8_get_char(s)))
572 qc=QUOTE_CLASS(g_utf8_get_char(s));
575 if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) &&
576 isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
577 results.endquote_count++;
580 if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
581 lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
584 g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
586 if (strstr(lines[j],".,"))
588 /* only count ast lines for ignoring purposes where there is */
589 /* locase text on the line */
590 if (strchr(lines[j],'*'))
592 for (s=lines[j];*s;s=g_utf8_next_char(s))
593 if (g_unichar_islower(g_utf8_get_char(s)))
598 if (strchr(lines[j],'/'))
599 results.fslashline++;
602 for (s=g_utf8_prev_char(lines[j]+lbytes);
603 s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;
604 s=g_utf8_prev_char(s))
606 if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
607 g_utf8_get_char(g_utf8_prev_char(s))!='-')
610 if (llen>LONGEST_PG_LINE)
612 if (llen>WAY_TOO_LONG)
613 results.verylongline++;
614 if (strchr(lines[j],'<') && strchr(lines[j],'>'))
616 i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
619 if (strstr(lines[j],"<i>"))
620 results.htmcount+=4; /* bonus marks! */
622 /* Check for spaced em-dashes */
623 if (lines[j][0] && (s=strstr(g_utf8_next_char(lines[j]),"--")))
626 if (s[-1]==CHAR_SPACE || s[2]==CHAR_SPACE)
627 results.space_emdash++;
628 if (s[-1]==CHAR_SPACE && s[2]==CHAR_SPACE)
629 /* count of em-dashes with spaces both sides */
630 results.non_PG_space_emdash++;
631 if (s[-1]!=CHAR_SPACE && s[2]!=CHAR_SPACE)
632 /* count of PG-type em-dashes with no spaces */
633 results.PG_space_emdash++;
638 if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))
639 results.Dutchcount++;
640 if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))
641 results.Frenchcount++;
642 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
643 results.standalone_digit++;
646 /* Check for spaced dashes */
647 if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
651 laststart=lines[j][0];
660 * Make some snap decisions based on the first pass results.
662 struct warnings *report_first_pass(struct first_pass_results *results)
664 static struct warnings warnings={0};
666 g_print(" --> %ld lines in this file have white space at end\n",
669 if (results->dotcomma>5)
672 g_print(" --> %ld lines in this file contain '.,'. "
673 "Not reporting them.\n",results->dotcomma);
676 * If more than 50 lines, or one-tenth, are short,
677 * don't bother reporting them.
679 warnings.shortline=1;
680 if (results->shortline>50 || results->shortline*10>linecnt)
682 warnings.shortline=0;
683 g_print(" --> %ld lines in this file are short. "
684 "Not reporting short lines.\n",results->shortline);
687 * If more than 50 lines, or one-tenth, are long,
688 * don't bother reporting them.
691 if (results->longline>50 || results->longline*10>linecnt)
694 g_print(" --> %ld lines in this file are long. "
695 "Not reporting long lines.\n",results->longline);
697 /* If more than 10 lines contain asterisks, don't bother reporting them. */
699 if (results->astline>10)
702 g_print(" --> %ld lines in this file contain asterisks. "
703 "Not reporting them.\n",results->astline);
706 * If more than 10 lines contain forward slashes,
707 * don't bother reporting them.
710 if (results->fslashline>10)
713 g_print(" --> %ld lines in this file contain forward slashes. "
714 "Not reporting them.\n",results->fslashline);
717 * If more than 20 lines contain unpunctuated endquotes,
718 * don't bother reporting them.
721 if (results->endquote_count>20)
724 g_print(" --> %ld lines in this file contain unpunctuated endquotes. "
725 "Not reporting them.\n",results->endquote_count);
728 * If more than 15 lines contain standalone digits,
729 * don't bother reporting them.
732 if (results->standalone_digit>10)
735 g_print(" --> %ld lines in this file contain standalone 0s and 1s. "
736 "Not reporting them.\n",results->standalone_digit);
739 * If more than 20 lines contain hyphens at end,
740 * don't bother reporting them.
743 if (results->hyphens>20)
746 g_print(" --> %ld lines in this file have hyphens at end. "
747 "Not reporting them.\n",results->hyphens);
749 if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
751 g_print(" --> Looks like this is HTML. Switching HTML mode ON.\n");
752 pswit[MARKUP_SWITCH]=1;
754 if (results->verylongline>0)
755 g_print(" --> %ld lines in this file are VERY long!\n",
756 results->verylongline);
758 * If there are more non-PG spaced dashes than PG em-dashes,
759 * assume it's deliberate.
760 * Current PG guidelines say don't use them, but older texts do,
761 * and some people insist on them whatever the guidelines say.
764 if (results->spacedash+results->non_PG_space_emdash>
765 results->PG_space_emdash)
768 g_print(" --> There are %ld spaced dashes and em-dashes. "
769 "Not reporting them.\n",
770 results->spacedash+results->non_PG_space_emdash);
776 /* Charset ISO_8859-1/ASCII checks for compatibility with gutcheck */
778 /* If more than a quarter of characters are hi-bit, bug out. */
779 if (results->binlen*4>results->totlen)
781 g_print(" --> This file does not appear to be ASCII. "
782 "Terminating. Best of luck with it!\n");
785 if (results->alphalen*4<results->totlen)
787 g_print(" --> This file does not appear to be text. "
788 "Terminating. Best of luck with it!\n");
791 if (results->binlen*100>results->totlen || results->binlen>100)
793 g_print(" --> There are a lot of foreign letters here. "
794 "Not reporting them.\n");
795 if (!pswit[VERBOSE_SWITCH])
799 warnings.isDutch=FALSE;
800 if (results->Dutchcount>50)
802 warnings.isDutch=TRUE;
803 g_print(" --> This looks like Dutch - "
804 "switching off dashes and warnings for 's Middags case.\n");
806 warnings.isFrench=FALSE;
807 if (results->Frenchcount>50)
809 warnings.isFrench=TRUE;
810 g_print(" --> This looks like French - "
811 "switching off some doublepunct.\n");
813 if (results->firstline && results->footerline)
814 g_print(" The PG header and footer appear to be already on.\n");
817 if (results->firstline)
818 g_print(" The PG header is on - no footer.\n");
819 if (results->footerline)
820 g_print(" The PG footer is on - no header.\n");
823 if (pswit[VERBOSE_SWITCH])
825 warnings.shortline=1;
834 g_print(" *** Verbose output is ON -- you asked for it! ***\n");
836 if (warnings.isDutch)
838 if (results->footerline>0 && results->firstline>0 &&
839 results->footerline>results->firstline &&
840 results->footerline-results->firstline<100)
842 g_print(" --> I don't really know where this text starts. \n");
843 g_print(" There are no reference points.\n");
844 g_print(" I'm going to have to report the header and footer "
846 results->firstline=0;
854 * Look along the line, accumulate the count of quotes, and see
855 * if this is an empty line - i.e. a line with nothing on it
857 * If line has just spaces, period, * and/or - on it, don't
858 * count it, since empty lines with asterisks or dashes to
859 * separate sections are common.
861 * Returns: TRUE if the line is empty.
863 gboolean analyse_quotes(const char *aline,int linecnt,struct counters *counters)
866 /* assume the line is empty until proven otherwise */
867 gboolean isemptyline=TRUE;
868 const char *s=aline,*sprev,*snext;
871 GError *tmp_err=NULL;
874 snext=g_utf8_next_char(s);
875 c=g_utf8_get_char(s);
876 if (CHAR_IS_DQUOTE(c))
877 (void)count_quote(counters,c,QUOTE_CLASS(c),&tmp_err);
878 else if (CHAR_IS_SQUOTE(c) && pswit[SQUOTE_SWITCH])
883 * At start of line, it can only be a quotation mark.
884 * Hardcode a very common exception!
886 if (!g_str_has_prefix(snext,"tis") &&
887 !g_str_has_prefix(snext,"Tis"))
888 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
890 else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
891 g_unichar_isalpha(g_utf8_get_char(snext)))
892 /* Do nothing! it's definitely an apostrophe, not a quote */
894 /* it's outside a word - let's check it out */
895 else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||
896 g_unichar_isalpha(g_utf8_get_char(snext)))
898 /* certainly looks like a quotation mark */
899 if (!g_str_has_prefix(snext,"tis") &&
900 !g_str_has_prefix(snext,"Tis"))
901 /* hardcode a very common exception! */
903 if (strchr(".?!,;:",g_utf8_get_char(sprev)))
904 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
906 (void)count_quote(counters,c,OPENING_QUOTE,&tmp_err);
911 /* now - is it a quotation mark? */
912 guessquote=0; /* accumulate clues */
913 if (g_unichar_isalpha(g_utf8_get_char(sprev)))
915 /* it follows a letter - could be either */
917 if (g_utf8_get_char(sprev)=='s')
919 /* looks like a plural apostrophe */
921 if (g_utf8_get_char(snext)==CHAR_SPACE)
925 if (innermost_quote_matches(counters,c))
927 * Give it the benefit of some doubt,
928 * if a squote is already open.
934 (void)count_quote(counters,c,CLOSING_QUOTE,&tmp_err);
937 /* no adjacent letter - it must be a quote of some kind */
938 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
943 if (pswit[ECHO_SWITCH])
944 g_print("\n%s\n",aline);
945 if (!pswit[OVERVIEW_SWITCH])
946 g_print(" Line %ld column %ld - %s\n",
947 linecnt,g_utf8_pointer_to_offset(aline,s)+1,tmp_err->message);
948 g_clear_error(&tmp_err);
950 if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
952 isemptyline=FALSE; /* ignore lines like * * * as spacers */
953 if (c==CHAR_UNDERSCORE)
954 counters->c_unders++;
955 if (c==CHAR_OPEN_SBRACK)
957 if (!matching_difference(counters,COUNTER_ILLUSTRATION) &&
958 !matching_difference(counters,c) && s==aline &&
959 g_str_has_prefix(s,"[Illustration:"))
960 increment_matching(counters,COUNTER_ILLUSTRATION,TRUE);
962 increment_matching(counters,c,TRUE);
964 else if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK)
965 increment_matching(counters,c,TRUE);
966 if (c==CHAR_CLOSE_SBRACK)
968 if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) &&
969 !matching_difference(counters,c) && !*snext)
970 increment_matching(counters,COUNTER_ILLUSTRATION,FALSE);
972 increment_matching(counters,c,FALSE);
974 else if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK)
975 increment_matching(counters,c,FALSE);
983 * check_for_control_characters:
985 * Check for invalid or questionable characters in the line
986 * Anything above 127 is invalid for plain ASCII, and
987 * non-printable control characters should also be flagged.
988 * Tabs should generally not be there.
990 void check_for_control_characters(const char *aline)
994 for (s=aline;*s;s=g_utf8_next_char(s))
996 c=g_utf8_get_char(s);
997 if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
999 if (pswit[ECHO_SWITCH])
1000 g_print("\n%s\n",aline);
1001 if (!pswit[OVERVIEW_SWITCH])
1002 g_print(" Line %ld column %ld - Control character %u\n",
1003 linecnt,g_utf8_pointer_to_offset(s,aline)+1,c);
1011 * check_for_odd_characters:
1013 * Check for binary and other odd characters.
1015 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
1016 gboolean isemptyline)
1018 /* Don't repeat multiple warnings on one line. */
1019 gboolean eInvalidChar=FALSE,eTab=FALSE,eTilde=FALSE;
1020 gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
1025 for (s=aline;*s;s=g_utf8_next_char(s))
1027 c=g_utf8_get_char(s);
1028 if (warnings->bin && !eInvalidChar &&
1029 (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
1031 if (pswit[ECHO_SWITCH])
1032 g_print("\n%s\n",aline);
1033 if (!pswit[OVERVIEW_SWITCH])
1034 if (c>127 && c<160 || c>255)
1035 g_print(" Line %ld column %ld - "
1036 "Non-ISO-8859 character %u\n",
1037 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1039 g_print(" Line %ld column %ld - "
1040 "Non-ASCII character %u\n",
1041 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1046 if (!eInvalidChar && charset)
1048 if (charset_validator==(GIConv)-1)
1050 if (!g_unichar_isdefined(c))
1052 if (pswit[ECHO_SWITCH])
1053 g_print("\n%s\n",aline);
1054 if (!pswit[OVERVIEW_SWITCH])
1055 g_print(" Line %ld column %ld - Unassigned UNICODE "
1056 "code point U+%04" G_GINT32_MODIFIER "X\n",
1057 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1062 else if (c>=0xE000 && c<=0xF8FF || c>=0xF0000 && c<=0xFFFFD ||
1063 c>=100000 && c<=0x10FFFD)
1065 if (pswit[ECHO_SWITCH])
1066 g_print("\n%s\n",aline);
1067 if (!pswit[OVERVIEW_SWITCH])
1068 g_print(" Line %ld column %ld - Private Use "
1069 "character U+%04" G_GINT32_MODIFIER "X\n",
1070 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1078 t=g_convert_with_iconv(s,g_utf8_next_char(s)-s,
1079 charset_validator,NULL,&nb,NULL);
1084 if (pswit[ECHO_SWITCH])
1085 g_print("\n%s\n",aline);
1086 if (!pswit[OVERVIEW_SWITCH])
1087 g_print(" Line %ld column %ld - Non-%s "
1088 "character %u\n",linecnt,
1089 g_utf8_pointer_to_offset(aline,s)+1,charset,c);
1096 if (!eTab && c==CHAR_TAB)
1098 if (pswit[ECHO_SWITCH])
1099 g_print("\n%s\n",aline);
1100 if (!pswit[OVERVIEW_SWITCH])
1101 g_print(" Line %ld column %ld - Tab character?\n",
1102 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1107 if (!eTilde && c==CHAR_TILDE)
1110 * Often used by OCR software to indicate an
1111 * unrecognizable character.
1113 if (pswit[ECHO_SWITCH])
1114 g_print("\n%s\n",aline);
1115 if (!pswit[OVERVIEW_SWITCH])
1116 g_print(" Line %ld column %ld - Tilde character?\n",
1117 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1122 if (!eCarat && c==CHAR_CARAT)
1124 if (pswit[ECHO_SWITCH])
1125 g_print("\n%s\n",aline);
1126 if (!pswit[OVERVIEW_SWITCH])
1127 g_print(" Line %ld column %ld - Carat character?\n",
1128 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1133 if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
1135 if (pswit[ECHO_SWITCH])
1136 g_print("\n%s\n",aline);
1137 if (!pswit[OVERVIEW_SWITCH])
1138 g_print(" Line %ld column %ld - Forward slash?\n",
1139 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1145 * Report asterisks only in paranoid mode,
1146 * since they're often deliberate.
1148 if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
1151 if (pswit[ECHO_SWITCH])
1152 g_print("\n%s\n",aline);
1153 if (!pswit[OVERVIEW_SWITCH])
1154 g_print(" Line %ld column %ld - Asterisk?\n",
1155 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1164 * check_for_long_line:
1166 * Check for line too long.
1168 void check_for_long_line(const char *aline)
1170 if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
1172 if (pswit[ECHO_SWITCH])
1173 g_print("\n%s\n",aline);
1174 if (!pswit[OVERVIEW_SWITCH])
1175 g_print(" Line %ld column %ld - Long line %ld\n",
1176 linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
1183 * check_for_short_line:
1185 * Check for line too short.
1187 * This one is a bit trickier to implement: we don't want to
1188 * flag the last line of a paragraph for being short, so we
1189 * have to wait until we know that our current line is a
1190 * "normal" line, then report the _previous_ line if it was too
1191 * short. We also don't want to report indented lines like
1192 * chapter heads or formatted quotations. We therefore keep
1193 * last->len as the length of the last line examined, and
1194 * last->blen as the length of the last but one, and try to
1195 * suppress unnecessary warnings by checking that both were of
1196 * "normal" length. We keep the first character of the last
1197 * line in last->start, and if it was a space, we assume that
1198 * the formatting is deliberate. I can't figure out a way to
1199 * distinguish something like a quoted verse left-aligned or
1200 * the header or footer of a letter from a paragraph of short
1201 * lines - maybe if I examined the whole paragraph, and if the
1202 * para has less than, say, 8 lines and if all lines are short,
1203 * then just assume it's OK? Need to look at some texts to see
1204 * how often a formula like this would get the right result.
/*
 * check_for_short_line: queries the PREVIOUS line, not aline itself.
 * The report prints prevline (a name that is not a parameter of this
 * function) and labels it linecnt-1. It fires only when all of these hold:
 *   - aline has more than one character;
 *   - last->len is greater than 1 and less than SHORTEST_PG_LINE;
 *   - last->blen is greater than SHORTEST_PG_LINE;
 *   - last->start is not CHAR_SPACE.
 * NOTE(review): the last->blen>1 test looks redundant next to
 * last->blen>SHORTEST_PG_LINE, assuming that constant is at least 1 - confirm.
 */
1206 void check_for_short_line(const char *aline,const struct line_properties *last)
1208 if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
1209 last->len<SHORTEST_PG_LINE && last->blen>1 &&
1210 last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
1212 if (pswit[ECHO_SWITCH])
1213 g_print("\n%s\n",prevline);
1214 if (!pswit[OVERVIEW_SWITCH])
/* Column and length are both the character count of prevline. */
1215 g_print(" Line %ld column %ld - Short line %ld?\n",
1216 linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
1223 * check_for_starting_punctuation:
1225 * Look for punctuation other than full ellipses at start of line.
/*
 * check_for_starting_punctuation: queries a non-empty line whose first
 * character is one of . ? ! , ; : unless the line begins with the
 * spaced-ellipsis prefix tested by g_str_has_prefix below.
 * The message always names column 1.
 */
1227 void check_for_starting_punctuation(const char *aline)
1229 if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
1230 !g_str_has_prefix(aline,". . ."))
1232 if (pswit[ECHO_SWITCH])
1233 g_print("\n%s\n",aline);
1234 if (!pswit[OVERVIEW_SWITCH])
1235 g_print(" Line %ld column 1 - Begins with punctuation?\n",
1243 * check_for_spaced_emdash:
1245 * Check for spaced em-dashes.
1247 * We must check _all_ occurrences of "--" on the line
1248 * hence the loop - even if the first double-dash is OK
1249 * there may be another that's wrong later on.
/*
 * check_for_spaced_emdash: walks every "--" that strstr finds in aline.
 * For each hit t, next is set two UTF-8 characters past t and becomes the
 * starting point of the following search. The hit is queried when the
 * character before t is a space (only examined when t is not at the start
 * of the line) or when the character at next is a space.
 * The reported column is the character offset of t plus one.
 */
1251 void check_for_spaced_emdash(const char *aline)
1253 const char *s,*t,*next;
1254 for (s=aline;t=strstr(s,"--");s=next)
1256 next=g_utf8_next_char(g_utf8_next_char(t));
/* && binds tighter than ||: (not at line start AND space before) OR space after. */
1257 if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
1258 g_utf8_get_char(next)==CHAR_SPACE)
1260 if (pswit[ECHO_SWITCH])
1261 g_print("\n%s\n",aline);
1262 if (!pswit[OVERVIEW_SWITCH])
1263 g_print(" Line %ld column %ld - Spaced em-dash?\n",
1264 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1272 * check_for_spaced_dash:
1274 * Check for spaced dashes.
/*
 * check_for_spaced_dash: examines the first " -" in aline, or the first
 * "- ", each located with a single strstr call (later occurrences on the
 * same line are not searched for).
 *   " -" is queried unless the character two positions after the space is
 *        another '-'.
 *   "- " is queried when the dash is the first character of the line or
 *        the character before it is not '-'.
 * NOTE(review): the second search is introduced by else-if; presumably it
 * pairs with the first strstr test, in which case it is not attempted
 * once " -" has been found, even if that hit was not reported - confirm.
 */
1276 void check_for_spaced_dash(const char *aline)
1279 if ((s=strstr(aline," -")))
1281 if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
1283 if (pswit[ECHO_SWITCH])
1284 g_print("\n%s\n",aline);
1285 if (!pswit[OVERVIEW_SWITCH])
1286 g_print(" Line %ld column %ld - Spaced dash?\n",
1287 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1292 else if ((s=strstr(aline,"- ")))
/* s==aline is tested first, so the previous character is never read at line start. */
1294 if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
1296 if (pswit[ECHO_SWITCH])
1297 g_print("\n%s\n",aline);
1298 if (!pswit[OVERVIEW_SWITCH])
1299 g_print(" Line %ld column %ld - Spaced dash?\n",
1300 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1308 * check_for_unmarked_paragraphs:
1310 * Check for unmarked paragraphs indicated by separate speakers.
1312 * May well be false positive:
1313 * "Bravo!" "Wonderful!" called the crowd.
1314 * but useful all the same.
/*
 * check_for_unmarked_paragraphs: searches aline for a double quote,
 * whitespace and another double quote in sequence, and queries a possibly
 * missing paragraph break at the column of the first quote (match offset
 * plus one).
 * NOTE(review): the two strstr calls below look identical as shown here;
 * presumably the patterns differ in the amount of whitespace between the
 * quotes and the second is a fallback for when the first finds nothing.
 * The guard on s before the report is also not visible - confirm both.
 */
1316 void check_for_unmarked_paragraphs(const char *aline)
1319 s=strstr(aline,"\" \"");
1321 s=strstr(aline,"\" \"");
1324 if (pswit[ECHO_SWITCH])
1325 g_print("\n%s\n",aline);
1326 if (!pswit[OVERVIEW_SWITCH])
1327 g_print(" Line %ld column %ld - "
1328 "Query missing paragraph break?\n",
1329 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1336 * check_for_jeebies:
1338 * Check for "to he" and other easy h/b errors.
1340 * This is a very inadequate effort on the h/b problem,
1341 * but the phrase "to he" is always an error, whereas "to
1342 * be" is quite common.
1343 * Similarly, '"Quiet!", be said.' is a non-be error
1344 * "to he" is _not_ always an error!:
1345 * "Where they went to he couldn't say."
1346 * Another false positive:
1347 * What would "Cinderella" be without the . . .
1348 * and another: "If he wants to he can see for himself."
/*
 * check_for_jeebies: three groups of literal-phrase searches on aline,
 * each group followed by its own report:
 *   he/be   - phrases such as " be could ", " was be ", " to he " and a
 *             double quote followed by " be ";
 *   had/bad - " the had ", " a had " and a pronoun followed by " bad ";
 *   hut/but - " hut " after a semicolon or a comma.
 * Each report prints the character offset of the match plus one.
 * The searches are byte-exact strstr calls, so they are case-sensitive and
 * need the surrounding spaces shown in each pattern.
 * NOTE(review): the statements between consecutive searches are not
 * visible here; presumably each later search runs only while s is still
 * NULL and each report is guarded by a test on s - confirm.
 * NOTE(review): two of the he/be searches look identical as shown;
 * presumably they differ in whitespace in the real source - confirm.
 */
1350 void check_for_jeebies(const char *aline)
/* Group 1: he/be confusions. */
1353 s=strstr(aline," be could ");
1355 s=strstr(aline," be would ");
1357 s=strstr(aline," was be ");
1359 s=strstr(aline," be is ");
1361 s=strstr(aline," is be ");
1363 s=strstr(aline,"\", be ");
1365 s=strstr(aline,"\" be ");
1367 s=strstr(aline,"\" be ");
1369 s=strstr(aline," to he ");
1372 if (pswit[ECHO_SWITCH])
1373 g_print("\n%s\n",aline);
1374 if (!pswit[OVERVIEW_SWITCH])
1375 g_print(" Line %ld column %ld - Query he/be error?\n",
1376 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Group 2: had/bad confusions. */
1380 s=strstr(aline," the had ");
1382 s=strstr(aline," a had ");
1384 s=strstr(aline," they bad ");
1386 s=strstr(aline," she bad ");
1388 s=strstr(aline," he bad ");
1390 s=strstr(aline," you bad ");
1392 s=strstr(aline," i bad ");
1395 if (pswit[ECHO_SWITCH])
1396 g_print("\n%s\n",aline);
1397 if (!pswit[OVERVIEW_SWITCH])
1398 g_print(" Line %ld column %ld - Query had/bad error?\n",
1399 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Group 3: hut/but confusions. */
1403 s=strstr(aline,"; hut ");
1405 s=strstr(aline,", hut ");
1408 if (pswit[ECHO_SWITCH])
1409 g_print("\n%s\n",aline);
1410 if (!pswit[OVERVIEW_SWITCH])
1411 g_print(" Line %ld column %ld - Query hut/but error?\n",
1412 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1419 * check_for_mta_from:
1421 * Special case - angled bracket in front of "From" placed there by an
1422 * MTA when sending an e-mail.
/*
 * check_for_mta_from: searches aline for the literal ">From" and queries
 * it at the column of the angled bracket (match offset plus one).
 * The search is a byte-exact strstr, so only this capitalisation matches.
 * NOTE(review): the test on s that guards the report is not visible here.
 */
1424 void check_for_mta_from(const char *aline)
1427 s=strstr(aline,">From");
1430 if (pswit[ECHO_SWITCH])
1431 g_print("\n%s\n",aline);
1432 if (!pswit[OVERVIEW_SWITCH])
1433 g_print(" Line %ld column %ld - "
1434 "Query angled bracket with From\n",
1435 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1442 * check_for_orphan_character:
1444 * Check for a single character line -
1445 * often an overflow from bad wrapping.
/*
 * check_for_orphan_character: handles a line made of exactly one
 * character, i.e. the first character is non-zero and the byte after it
 * is the terminating NUL. The capitals I, V, X and L and any Unicode
 * digit are deliberately ignored; the report always names column 1.
 * NOTE(review): the else joining the ignore branch to the report is not
 * visible here - presumably every other single character is reported.
 */
1447 void check_for_orphan_character(const char *aline)
1450 c=g_utf8_get_char(aline);
1451 if (c && !*g_utf8_next_char(aline))
1453 if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
1454 ; /* Nothing - ignore numerals alone on a line. */
1457 if (pswit[ECHO_SWITCH])
1458 g_print("\n%s\n",aline);
1459 if (!pswit[OVERVIEW_SWITCH])
1460 g_print(" Line %ld column 1 - Query single character line\n",
1469 * check_for_pling_scanno:
1471 * Check for I" - often should be !
/*
 * check_for_pling_scanno: searches aline for a space, a capital I and a
 * double quote in sequence; the I may be a misread exclamation mark.
 * The column printed is the raw character offset of the match with
 * nothing added, whereas most other checks in this file print the offset
 * plus one.
 * NOTE(review): unclear whether the missing +1 is intentional - confirm
 * before changing it, since that would alter the reported columns.
 * NOTE(review): the test on s that guards the report is not visible here.
 */
1473 void check_for_pling_scanno(const char *aline)
1476 s=strstr(aline," I\"");
1479 if (pswit[ECHO_SWITCH])
1480 g_print("\n%s\n",aline);
1481 if (!pswit[OVERVIEW_SWITCH])
1482 g_print(" Line %ld column %ld - Query I=exclamation mark?\n",
1483 linecnt,g_utf8_pointer_to_offset(aline,s));
1490 * check_for_extra_period:
1492 * Check for period without a capital letter. Cut-down from gutspell.
1493 * Only works when it happens on a single line.
/*
 * check_for_extra_period: scans aline for a period followed by a space
 * and, when the next word starts in lower case, inspects the word in
 * front of the period before printing an "Extra period?" query at the
 * column held in t.
 *
 * aline    - line being checked.
 * warnings - only the isDutch flag is read in the visible code.
 *
 * The word in front of the period is copied into testword and then
 * compared with the abbrev list, tested for a leading digit, for being a
 * single character, for being a Roman numeral (isroman) and for
 * containing a vowel. Queried words are stored in the qperiod tree; a
 * word already stored is not queried again unless pswit[VERBOSE_SWITCH]
 * is set.
 * NOTE(review): several statements of this function are not visible here
 * (loop guards, the assignments that decide the outcome of each test and
 * whatever releases testword); the comments describe only what is shown.
 */
1495 void check_for_extra_period(const char *aline,const struct warnings *warnings)
1497 const char *s,*t,*s1,*sprev;
1502 gunichar c,nc,pc,*decomposition;
/* NOTE(review): presumably the whole scan runs only in paranoid mode - confirm. */
1503 if (pswit[PARANOID_SWITCH])
/* t is moved to each period-space pair in turn; the loop has no increment clause. */
1505 for (t=aline;t=strstr(t,". ");)
1509 t=g_utf8_next_char(t);
1510 /* start of line punctuation is handled elsewhere */
/* Only a period that directly follows a letter is of interest. */
1513 if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
1515 t=g_utf8_next_char(t);
1518 if (warnings->isDutch)
1520 /* For Frank & Jeroen -- 's Middags case */
1521 gunichar c2,c3,c4,c5;
/* Characters 2 to 5 positions after t: apostrophe, lower-case letter, space, upper-case letter. */
1522 c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
1523 c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
1524 c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
1525 c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
1526 if (CHAR_IS_APOSTROPHE(c2) &&
1527 g_unichar_islower(c3) && c4==CHAR_SPACE &&
1528 g_unichar_isupper(c5))
1530 t=g_utf8_next_char(t);
/* Starting two characters after t, skip forward to the next letter or digit. */
1534 s1=g_utf8_next_char(g_utf8_next_char(t));
/*
 * NOTE(review): isdigit() is handed a gunichar here while the rest of the
 * function uses g_unichar_isdigit(); the C library defines isdigit() only
 * for values representable as unsigned char and for EOF.
 */
1535 while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
1536 !isdigit(g_utf8_get_char(s1)))
1537 s1=g_utf8_next_char(s1);
1538 if (g_unichar_islower(g_utf8_get_char(s1)))
1540 /* we have something to investigate */
1542 /* so let's go back and find out */
/* nc, c and pc are the characters at t, one before t and two before t. */
1543 nc=g_utf8_get_char(t);
1544 s1=g_utf8_prev_char(t);
1545 c=g_utf8_get_char(s1);
1546 sprev=g_utf8_prev_char(s1);
1547 pc=g_utf8_get_char(sprev);
/* Word characters: letters, digits, or an apostrophe with a letter on each side. */
1549 (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||
1550 g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&
1551 g_unichar_isalpha(nc)))
1556 sprev=g_utf8_prev_char(s1);
1557 pc=g_utf8_get_char(sprev);
1559 s1=g_utf8_next_char(s1);
/* testword is a heap copy of the word starting at s1. */
1562 testword=g_strndup(s1,s-s1);
1564 testword=g_strdup(s1);
/* The abbrev list is scanned up to an entry whose first byte is NUL. */
1565 for (i=0;*abbrev[i];i++)
1566 if (!strcmp(testword,abbrev[i]))
1568 if (g_unichar_isdigit(g_utf8_get_char(testword)))
1570 if (!*g_utf8_next_char(testword))
1572 if (isroman(testword))
1577 for (s=testword;*s;s=g_utf8_next_char(s))
/*
 * The first code point of the canonical decomposition is tested, so a
 * precomposed accented vowel matches its base letter.
 */
1579 decomposition=g_unicode_canonical_decomposition(
1580 g_utf8_get_char(s),&len);
1581 if (g_utf8_strchr("aeiou",-1,decomposition[0]))
1583 g_free(decomposition);
1587 (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
/* The tree is given its own copy of the word as the key. */
1589 g_tree_insert(qperiod,g_strdup(testword),
1590 GINT_TO_POINTER(1));
1591 if (pswit[ECHO_SWITCH])
1592 g_print("\n%s\n",aline);
1593 if (!pswit[OVERVIEW_SWITCH])
1594 g_print(" Line %ld column %ld - Extra period?\n",
1595 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1601 t=g_utf8_next_char(t);
1607 * check_for_following_punctuation:
1609 * Check for words usually not followed by punctuation.
/*
 * check_for_following_punctuation: compares a lower-cased word (inword,
 * produced by g_utf8_strdown) against two lists and looks at the
 * character at s:
 *   nocomma  entries are queried when that character is , ; or :
 *   noperiod entries are queried when that character is . or !
 * Both lists are scanned up to an entry whose first byte is NUL, and the
 * comparison is an exact strcmp. The reported column is the offset of s
 * plus one.
 * NOTE(review): the code that extracts each word and positions s is not
 * visible here; presumably s points just past the word and all of this
 * runs only when pswit[TYPO_SWITCH] is set - confirm.
 */
1611 void check_for_following_punctuation(const char *aline)
1614 const char *s,*wordstart;
1617 if (pswit[TYPO_SWITCH])
1628 inword=g_utf8_strdown(t,-1);
/* Words that should not be followed by a comma, semicolon or colon. */
1630 for (i=0;*nocomma[i];i++)
1631 if (!strcmp(inword,nocomma[i]))
1633 c=g_utf8_get_char(s);
1634 if (c==',' || c==';' || c==':')
1636 if (pswit[ECHO_SWITCH])
1637 g_print("\n%s\n",aline);
1638 if (!pswit[OVERVIEW_SWITCH])
1639 g_print(" Line %ld column %ld - "
1640 "Query punctuation after %s?\n",
1641 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
/* Words that should not be followed by a period or exclamation mark. */
1647 for (i=0;*noperiod[i];i++)
1648 if (!strcmp(inword,noperiod[i]))
1650 c=g_utf8_get_char(s);
1651 if (c=='.' || c=='!')
1653 if (pswit[ECHO_SWITCH])
1654 g_print("\n%s\n",aline);
1655 if (!pswit[OVERVIEW_SWITCH])
1656 g_print(" Line %ld column %ld - "
1657 "Query punctuation after %s?\n",
1658 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
1672 * Check for commonly mistyped words,
1673 * and digits like 0 for O in a word.
/*
 * check_for_typos: examines words of aline fetched with getaword() and
 * can print four kinds of query:
 *   "Query digit in"        - mixdigit(inword) is true;
 *   "Query word"            - the built-in heuristics below flag the word;
 *   "Query possible scanno" - the case-folded word is in the usertypo tree
 *                             and the heuristics did not flag it;
 *   "Query standalone"      - the word is exactly "0" or "1", with
 *                             pswit[PARANOID_SWITCH] and warnings->digit.
 *
 * aline    - line being checked.
 * warnings - only the digit flag is read in the visible code.
 *
 * Built-in heuristics visible below: unlikely word starts and ends
 * (nostart, noend), letter clusters such as "cb" and "gbt", words with no
 * vowel or no consonant, the manual typo list, and a few single letters.
 * The okword list and isroman() are consulted as well. Flagged words are
 * looked up in, and inserted into, the qword tree together with an int
 * counter.
 * NOTE(review): many short statements are not visible here (the word
 * loop, the assignments to istypo, isdup, vowel and consonant, and the
 * release of inword and testword); the comments describe only what is
 * shown.
 */
1675 void check_for_typos(const char *aline,struct warnings *warnings)
1677 const char *s,*t,*nt,*wordstart;
1679 gunichar *decomposition;
1681 int i,vowel,consonant,*dupcnt;
1682 gboolean isdup,istypo,alower;
1685 gsize decomposition_len;
1689 inword=getaword(&s);
1693 continue; /* don't bother with empty lines */
1695 if (mixdigit(inword))
1697 if (pswit[ECHO_SWITCH])
1698 g_print("\n%s\n",aline);
1699 if (!pswit[OVERVIEW_SWITCH])
1700 g_print(" Line %ld column %ld - Query digit in %s\n",
1701 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
1706 * Put the word through a series of tests for likely typos and OCR
1709 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
/* Walk the word looking for an upper-case letter that follows a lower-case one. */
1713 for (t=inword;*t;t=g_utf8_next_char(t))
1715 c=g_utf8_get_char(t);
1716 nt=g_utf8_next_char(t);
1717 /* lowercase for testing */
1718 if (g_unichar_islower(c))
1720 if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
1723 * We have an uppercase mid-word. However, there are
1725 * Mac and Mc like McGill
1726 * French contractions like l'Abbe
1728 offset=g_utf8_pointer_to_offset(inword,t);
1730 pc=g_utf8_get_char(g_utf8_prev_char(t));
/* Exceptions: an "mc" or "mac" pattern at a fixed offset, or a preceding apostrophe. */
1733 if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
1734 offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
1735 g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
1736 CHAR_IS_APOSTROPHE(pc))
/* All list comparisons below use the case-folded copy of the word. */
1742 testword=g_utf8_casefold(inword,-1);
1744 if (pswit[TYPO_SWITCH])
1747 * Check for certain unlikely two-letter combinations at word
1750 len=g_utf8_strlen(testword,-1);
1753 for (i=0;*nostart[i];i++)
1754 if (g_str_has_prefix(testword,nostart[i]))
1756 for (i=0;*noend[i];i++)
1757 if (g_str_has_suffix(testword,noend[i]))
1760 /* ght is common, gbt never. Like that. */
1761 if (strstr(testword,"cb"))
1763 if (strstr(testword,"gbt"))
1765 if (strstr(testword,"pbt"))
1767 if (strstr(testword,"tbs"))
1769 if (strstr(testword,"mrn"))
1771 if (strstr(testword,"ahle"))
1773 if (strstr(testword,"ihle"))
1776 * "TBE" does happen - like HEARTBEAT - but uncommon.
1777 * Also "TBI" - frostbite, outbid - but uncommon.
1778 * Similarly "ii" like Hawaii, or Pompeii, and in Roman
1779 * numerals, but "ii" is a common scanno.
1781 if (strstr(testword,"tbi"))
1783 if (strstr(testword,"tbe"))
1785 if (strstr(testword,"ii"))
1788 * Check for no vowels or no consonants.
1789 * If none, flag a typo.
1791 if (!istypo && len>1)
1794 for (t=testword;*t;t=g_utf8_next_char(t))
1796 c=g_utf8_get_char(t);
/* Each character is decomposed so that its first (base) code point can be classified. */
1798 g_unicode_canonical_decomposition(c,&decomposition_len);
1799 if (c=='y' || g_unichar_isdigit(c))
1801 /* Yah, this is loose. */
1805 else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
1809 g_free(decomposition);
1811 if (!vowel || !consonant)
1815 * Now exclude the word from being reported if it's in
1818 for (i=0;*okword[i];i++)
1819 if (!strcmp(testword,okword[i]))
1822 * What looks like a typo may be a Roman numeral.
1825 if (istypo && isroman(testword))
1827 /* Check the manual list of typos. */
1829 for (i=0;*typo[i];i++)
1830 if (!strcmp(testword,typo[i]))
1833 * Check lowercase s, l, i and m - special cases.
1834 * "j" - often a semi-colon gone wrong.
1835 * "d" for a missing apostrophe - he d
/* This test reads inword, so only the lower-case letters listed in the string match. */
1838 if (!istypo && len==1 &&
1839 g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
/* qword maps each flagged word to a heap-allocated int counter. */
1843 dupcnt=g_tree_lookup(qword,testword);
1847 isdup=!pswit[VERBOSE_SWITCH];
1851 dupcnt=g_new0(int,1);
1852 g_tree_insert(qword,g_strdup(testword),dupcnt);
1857 if (pswit[ECHO_SWITCH])
1858 g_print("\n%s\n",aline);
1859 if (!pswit[OVERVIEW_SWITCH])
1861 g_print(" Line %ld column %ld - Query word %s",
1862 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
1864 if (!pswit[VERBOSE_SWITCH])
1865 g_print(" - not reporting duplicates");
1873 /* check the user's list of typos */
1874 if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
1876 if (pswit[ECHO_SWITCH])
1877 g_print("\n%s\n",aline);
1878 if (!pswit[OVERVIEW_SWITCH])
/*
 * NOTE(review): this report and the standalone-digit report further down
 * add 2 to the offset of wordstart, while the two reports above add 1 -
 * confirm which column is intended.
 */
1879 g_print(" Line %ld column %ld - Query possible scanno %s\n",
1880 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
1882 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
1884 if (pswit[PARANOID_SWITCH] && warnings->digit)
1886 /* In paranoid mode, query all 0 and 1 standing alone. */
1887 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
1889 if (pswit[ECHO_SWITCH])
1890 g_print("\n%s\n",aline);
1891 if (!pswit[OVERVIEW_SWITCH])
1892 g_print(" Line %ld column %ld - Query standalone %s\n",
1893 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
1904 * check_for_misspaced_punctuation:
1906 * Look for added or missing spaces around punctuation and quotes.
1907 * If there is a punctuation character like ! with no space on
1908 * either side, suspect a missing!space. If there are spaces on
1909 * both sides , assume a typo. If we see a double quote with no
1910 * space or punctuation on either side of it, assume unspaced
1911 * quotes "like"this.
/*
 * check_for_misspaced_punctuation: makes several independent passes over
 * aline, each looking at a character together with its neighbours
 * (pc before, c current, nc after):
 *   1. punctuation from ".?!,;:_" with no space where one is expected
 *      ("Missing space?") or with space on both sides or before the end
 *      of the line ("Spaced punctuation?");
 *   2. ? ! , ; : preceded by a space but not followed by one;
 *   3. a period preceded by a space and followed by a letter;
 *   4. a double quote with no space or punctuation around it
 *      ("Unspaced quotes?");
 *   5. double-quote parity, toggling parities->dquote
 *      ("Wrongspaced quotes?"), plus a check of a quote in column 1;
 *   6. the same parity idea for single quotes, toggling parities->squote,
 *      only when pswit[SQUOTE_SWITCH] is set.
 *
 * aline       - line being checked.
 * parities    - running open/closed quote state; its dquote and squote
 *               members are flipped here.
 * isemptyline - when true, the spaced-punctuation reports of passes 1 and
 *               2 are suppressed.
 *
 * NOTE(review): inside each loop the statements that shift c into pc and
 * nc into c, and those that set isacro and isellipsis, are not visible
 * here; the comments describe only what is shown.
 */
1913 void check_for_misspaced_punctuation(const char *aline,
1914 struct parities *parities,gboolean isemptyline)
1916 gboolean isacro,isellipsis;
1918 gunichar c,nc,pc,n2c;
/* Pass 1: start at the second character; nc is 0 for an empty line, so the loop is skipped. */
1920 c=g_utf8_get_char(aline);
1921 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1922 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1926 nc=g_utf8_get_char(g_utf8_next_char(s));
1927 /* For each character in the line after the first. */
1928 if (g_utf8_strchr(".?!,;:_",-1,c)) /* if it's punctuation */
1930 /* we need to suppress warnings for acronyms like M.D. */
1932 /* we need to suppress warnings for ellipsis . . . */
1935 * If there are letters on both sides of it or
1936 * if it's strict punctuation followed by an alpha.
1938 if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
1939 g_utf8_strchr("?!,;:",-1,c)))
/* Look two characters back and two characters ahead for another period. */
1943 if (g_utf8_pointer_to_offset(aline,s)>2 &&
1944 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
1946 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
1952 if (pswit[ECHO_SWITCH])
1953 g_print("\n%s\n",aline);
1954 if (!pswit[OVERVIEW_SWITCH])
1955 g_print(" Line %ld column %ld - Missing space?\n",
1956 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1961 if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
1964 * If there are spaces on both sides,
1965 * or space before and end of line.
1969 if (g_utf8_pointer_to_offset(aline,s)>2 &&
1970 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
1972 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
1976 if (!isemptyline && !isellipsis)
1978 if (pswit[ECHO_SWITCH])
1979 g_print("\n%s\n",aline);
1980 if (!pswit[OVERVIEW_SWITCH])
1981 g_print(" Line %ld column %ld - "
1982 "Spaced punctuation?\n",linecnt,
1983 g_utf8_pointer_to_offset(aline,s)+1);
/* Pass 2. */
1990 /* Split out the characters that CANNOT be preceded by space. */
1991 c=g_utf8_get_char(aline);
1992 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1993 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1997 nc=g_utf8_get_char(g_utf8_next_char(s));
1998 /* for each character in the line after the first */
1999 if (g_utf8_strchr("?!,;:",-1,c))
2001 /* if it's punctuation that _cannot_ have a space before it */
2002 if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
2005 * If nc DOES == space,
2006 * it was already reported just above.
2008 if (pswit[ECHO_SWITCH])
2009 g_print("\n%s\n",aline);
2010 if (!pswit[OVERVIEW_SWITCH])
2011 g_print(" Line %ld column %ld - Spaced punctuation?\n",
2012 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Pass 3. */
2019 * Special case " .X" where X is any alpha.
2020 * This plugs a hole in the acronym code above.
2021 * Inelegant, but maintainable.
2023 c=g_utf8_get_char(aline);
2024 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2025 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2029 nc=g_utf8_get_char(g_utf8_next_char(s));
2030 /* for each character in the line after the first */
2033 /* if it's a period */
2034 if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
2037 * If the period follows a space and
2038 * is followed by a letter.
2040 if (pswit[ECHO_SWITCH])
2041 g_print("\n%s\n",aline);
2042 if (!pswit[OVERVIEW_SWITCH])
2043 g_print(" Line %ld column %ld - Spaced punctuation?\n",
2044 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Pass 4: double quotes jammed between other characters. */
2050 c=g_utf8_get_char(aline);
2051 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2052 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2056 nc=g_utf8_get_char(g_utf8_next_char(s));
2057 /* for each character in the line after the first */
2058 if (CHAR_IS_DQUOTE(c))
/*
 * Either neither neighbour is in the separator set and the quote is not
 * last on the line, or a letter follows and the previous character is not
 * one that may open a quotation.
 */
2060 if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
2061 !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
2062 !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
2064 if (pswit[ECHO_SWITCH])
2065 g_print("\n%s\n",aline);
2066 if (!pswit[OVERVIEW_SWITCH])
2067 g_print(" Line %ld column %ld - Unspaced quotes?\n",
2068 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Pass 5: unlike the passes above, this loop starts at the first character. */
2074 /* Check parity of quotes. */
2075 nc=g_utf8_get_char(aline);
2076 for (s=aline;*s;s=g_utf8_next_char(s))
2079 nc=g_utf8_get_char(g_utf8_next_char(s));
2080 if (CHAR_IS_DQUOTE(c))
/* Each double quote flips the running state; the new state is kept in parity. */
2084 parities->dquote=!parities->dquote;
2085 parity=parities->dquote;
2087 else if (c==CHAR_LD_QUOTE)
/* NOTE(review): presumably this test applies to a quote judged to be closing - confirm. */
2094 if (!g_utf8_strchr("_-.'`/,;:!?)]} ",-1,nc))
2096 if (pswit[ECHO_SWITCH])
2097 g_print("\n%s\n",aline);
2098 if (!pswit[OVERVIEW_SWITCH])
2099 g_print(" Line %ld column %ld - "
2100 "Wrongspaced quotes?\n",
2101 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/*
 * NOTE(review): isdigit() is handed a gunichar here; the C library defines
 * it only for unsigned char values and EOF, and g_unichar_isdigit() is the
 * GLib counterpart. Presumably this test applies to an opening quote.
 */
2109 if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
2110 !g_utf8_strchr("_-/.'`([{$",-1,nc) || !nc)
2112 if (pswit[ECHO_SWITCH])
2113 g_print("\n%s\n",aline);
2114 if (!pswit[OVERVIEW_SWITCH])
2115 g_print(" Line %ld column %ld - "
2116 "Wrongspaced quotes?\n",
2117 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* A double quote in column 1 followed by closing punctuation or a space. */
2124 c=g_utf8_get_char(aline);
2125 if (CHAR_IS_DQUOTE(c))
2127 if (g_utf8_strchr(",;:!?)]} ",-1,
2128 g_utf8_get_char(g_utf8_next_char(aline))))
2130 if (pswit[ECHO_SWITCH])
2131 g_print("\n%s\n",aline);
2132 if (!pswit[OVERVIEW_SWITCH])
2133 g_print(" Line %ld column 1 - Wrongspaced quotes?\n",
/* Pass 6: single quotes, optional. */
2139 if (pswit[SQUOTE_SWITCH])
2141 nc=g_utf8_get_char(aline);
2142 for (s=aline;*s;s=g_utf8_next_char(s))
2145 nc=g_utf8_get_char(g_utf8_next_char(s));
/* A single quote counts only when it lacks a letter on at least one side. */
2146 if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&
2147 !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
2148 !g_unichar_isalpha(nc)))
2150 parities->squote=!parities->squote;
2151 if (!parities->squote)
2154 if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
2156 if (pswit[ECHO_SWITCH])
2157 g_print("\n%s\n",aline);
2158 if (!pswit[OVERVIEW_SWITCH])
2159 g_print(" Line %ld column %ld - "
2160 "Wrongspaced singlequotes?\n",
2161 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* NOTE(review): same isdigit() on a gunichar as in the double-quote pass. */
2169 if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
2170 !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
2172 if (pswit[ECHO_SWITCH])
2173 g_print("\n%s\n",aline);
2174 if (!pswit[OVERVIEW_SWITCH])
2175 g_print(" Line %ld column %ld - "
2176 "Wrongspaced singlequotes?\n",
2177 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2188 * check_for_double_punctuation:
2190 * Look for double punctuation like ,. or ,,
2191 * Thanks to DW for the suggestion!
2192 * In books with references, ".," and ".;" are common
2193 * e.g. "etc., etc.," and vol. 1.; vol 3.;
2194 * OTOH, from my initial tests, there are also fairly
2195 * common errors. What to do? Make these cases paranoid?
2196 * ".," is the most common, so warnings->dotcomma is used
2197 * to suppress detailed reporting if it occurs often.
/*
 * check_for_double_punctuation: walks aline and queries any character
 * from ".?!,;:" that is immediately followed by another character from
 * the same set, except for the combinations listed in the first long
 * condition below:
 *   - a doubled period, question mark or exclamation mark;
 *   - a period followed by a comma while warnings->dotcomma is false;
 *   - when warnings->isFrench is set, a three-period ellipsis with one of
 *     , ; : ! ? directly before or after it.
 *
 * aline    - line being checked.
 * warnings - the dotcomma and isFrench flags are read.
 *
 * NOTE(review): the second French test repeats the same prefixes and is
 * followed by a fresh read of nc; presumably s is advanced past the
 * ellipsis by statements not visible here - confirm.
 */
2199 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
2203 nc=g_utf8_get_char(aline);
2204 for (s=aline;*s;s=g_utf8_next_char(s))
2207 nc=g_utf8_get_char(g_utf8_next_char(s));
2208 /* for each punctuation character in the line */
2209 if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
2210 g_utf8_strchr(".?!,;:",-1,nc))
2212 /* followed by punctuation, it's a query, unless . . . */
2213 if (c==nc && (c=='.' || c=='?' || c=='!') ||
2214 !warnings->dotcomma && c=='.' && nc==',' ||
2215 warnings->isFrench && g_str_has_prefix(s,",...") ||
2216 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2217 warnings->isFrench && g_str_has_prefix(s,";...") ||
2218 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2219 warnings->isFrench && g_str_has_prefix(s,":...") ||
2220 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2221 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2222 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2223 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2224 warnings->isFrench && g_str_has_prefix(s,"...?"))
2226 if (warnings->isFrench && g_str_has_prefix(s,",...") ||
2227 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2228 warnings->isFrench && g_str_has_prefix(s,";...") ||
2229 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2230 warnings->isFrench && g_str_has_prefix(s,":...") ||
2231 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2232 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2233 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2234 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2235 warnings->isFrench && g_str_has_prefix(s,"...?"))
2238 nc=g_utf8_get_char(g_utf8_next_char(s));
2240 ; /* do nothing for .. !! and ?? which can be legit */
2244 if (pswit[ECHO_SWITCH])
2245 g_print("\n%s\n",aline);
2246 if (!pswit[OVERVIEW_SWITCH])
2247 g_print(" Line %ld column %ld - Double punctuation?\n",
2248 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2257 * check_for_spaced_quotes:
/*
 * check_for_spaced_quotes: queries every quote mark that has a space on
 * both sides. The ASCII double quote is searched for with a literal
 * pattern; then, for each character in single_quotes, a pattern of
 * space + quote + space is built in a GString and searched for in the
 * same way.
 * After each hit the search resumes two characters past the start of the
 * match, and the reported column is the offset of the leading space plus
 * one. The GString is released at the end.
 * NOTE(review): the statement that resets s before each single-quote
 * search is not visible here - presumably s restarts at aline.
 */
2259 void check_for_spaced_quotes(const char *aline)
2263 const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,
2267 while ((t=strstr(s," \" ")))
2269 if (pswit[ECHO_SWITCH])
2270 g_print("\n%s\n",aline);
2271 if (!pswit[OVERVIEW_SWITCH])
2272 g_print(" Line %ld column %ld - Spaced doublequote?\n",
2273 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2276 s=g_utf8_next_char(g_utf8_next_char(t));
2278 pattern=g_string_new(NULL);
2279 for(i=0;i<G_N_ELEMENTS(single_quotes);i++)
/* Rebuild the pattern from scratch for each quote character. */
2281 g_string_assign(pattern," ");
2282 g_string_append_unichar(pattern,single_quotes[i]);
2283 g_string_append_c(pattern,' ');
2285 while ((t=strstr(s,pattern->str)))
2287 if (pswit[ECHO_SWITCH])
2288 g_print("\n%s\n",aline);
2289 if (!pswit[OVERVIEW_SWITCH])
2290 g_print(" Line %ld column %ld - Spaced singlequote?\n",
2291 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2294 s=g_utf8_next_char(g_utf8_next_char(t));
2297 g_string_free(pattern,TRUE);
2301 * check_for_miscased_genative:
2303 * Check special case of 'S instead of 's at end of word.
/*
 * check_for_miscased_genative: queries an apostrophe that is preceded by
 * a lower-case letter and followed by a capital S. The scan starts at the
 * second character of the line and is skipped for an empty line (nc is 0).
 * The report adds 2 to the offset of s.
 * NOTE(review): the statements that shift c into pc and nc into c are not
 * visible here; presumably s sits on the apostrophe, which would make the
 * printed column that of the S - confirm.
 */
2305 void check_for_miscased_genative(const char *aline)
2311 c=g_utf8_get_char(aline);
2312 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2313 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2317 nc=g_utf8_get_char(g_utf8_next_char(s));
2318 if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))
2320 if (pswit[ECHO_SWITCH])
2321 g_print("\n%s\n",aline);
2322 if (!pswit[OVERVIEW_SWITCH])
2323 g_print(" Line %ld column %ld - Capital \"S\"?\n",
2324 linecnt,g_utf8_pointer_to_offset(aline,s)+2);
2332 * check_end_of_line:
2334 * Now check special cases - start and end of line -
2335 * for single and double quotes. Start is sometimes [sic]
2336 * but better to query it anyway.
2337 * While we're here, check for dash at end of line.
/*
 * check_end_of_line: start-of-line and end-of-line special cases.
 *   - last character is a double or single quote and the one before it
 *     is a space: "Spaced quote?" at the last column;
 *   - first character is a single quote and the second is a space:
 *     "Spaced quote?" at column 1;
 *   - with pswit[PARANOID_SWITCH] and warnings->hyphen, a single '-'
 *     (not preceded by another '-') at the end of the line:
 *     "Hyphen at end of line?".
 *
 * aline    - line being checked.
 * warnings - only the hyphen flag is read.
 *
 * NOTE(review): presumably all checks apply only to lines longer than one
 * character, as the first test suggests - the braces are not visible.
 */
2339 void check_end_of_line(const char *aline,struct warnings *warnings)
2344 lbytes=strlen(aline);
2345 if (g_utf8_strlen(aline,lbytes)>1)
/* s is the last character of the line; c2 is the one before it. */
2347 s=g_utf8_prev_char(aline+lbytes);
2348 c1=g_utf8_get_char(s);
2349 c2=g_utf8_get_char(g_utf8_prev_char(s));
2350 if ((CHAR_IS_DQUOTE(c1) || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)
2352 if (pswit[ECHO_SWITCH])
2353 g_print("\n%s\n",aline);
2354 if (!pswit[OVERVIEW_SWITCH])
2355 g_print(" Line %ld column %ld - Spaced quote?\n",linecnt,
2356 g_utf8_strlen(aline,lbytes));
2360 c1=g_utf8_get_char(aline);
2361 c2=g_utf8_get_char(g_utf8_next_char(aline));
2362 if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)
2364 if (pswit[ECHO_SWITCH])
2365 g_print("\n%s\n",aline);
2366 if (!pswit[OVERVIEW_SWITCH])
2367 g_print(" Line %ld column 1 - Spaced quote?\n",linecnt);
2372 * Dash at end of line may well be legit - paranoid mode only
2373 * and don't report em-dash at line-end.
2375 if (pswit[PARANOID_SWITCH] && warnings->hyphen)
/* Step back from the end over characters not greater than CHAR_SPACE. */
2377 for (s=g_utf8_prev_char(aline+lbytes);
2378 s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
2380 if (g_utf8_get_char(s)=='-' &&
2381 g_utf8_get_char(g_utf8_prev_char(s))!='-')
2383 if (pswit[ECHO_SWITCH])
2384 g_print("\n%s\n",aline);
2385 if (!pswit[OVERVIEW_SWITCH])
/* The column printed here is the raw offset of s, with no +1. */
2386 g_print(" Line %ld column %ld - "
2387 "Hyphen at end of line?\n",
2388 linecnt,g_utf8_pointer_to_offset(aline,s));
2395 * check_for_unspaced_bracket:
2397 * Brackets are often unspaced, but shouldn't be surrounded by alpha.
2398 * If so, suspect a scanno like "a]most".
/*
 * check_for_unspaced_bracket: queries any bracket from "{[()]}" that has
 * a letter immediately before it and a letter immediately after it. The
 * scan starts at the second character and is skipped for an empty line.
 * The column printed is the raw offset of s, with no +1, unlike most
 * other checks in this file.
 * NOTE(review): the statements that shift c into pc and nc into c are not
 * visible here, so which character s sits on cannot be confirmed.
 */
2400 void check_for_unspaced_bracket(const char *aline)
2404 c=g_utf8_get_char(aline);
2405 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2406 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2410 nc=g_utf8_get_char(g_utf8_next_char(s));
2413 /* for each bracket character in the line except 1st & last */
2414 if (g_utf8_strchr("{[()]}",-1,c) &&
2415 g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
2417 if (pswit[ECHO_SWITCH])
2418 g_print("\n%s\n",aline);
2419 if (!pswit[OVERVIEW_SWITCH])
2420 g_print(" Line %ld column %ld - Unspaced bracket?\n",
2421 linecnt,g_utf8_pointer_to_offset(aline,s));
2429 * check_for_unpunctuated_endquote:
/*
 * check_for_unpunctuated_endquote: queries a double quote classified by
 * QUOTE_CLASS as closing or neutral when the character before it is a
 * letter, i.e. a quotation that ends without punctuation. Characters that
 * are not double quotes are given the class INVALID_QUOTE and never
 * match. The scan starts at the second character of the line.
 * The column printed is the raw offset of s, with no +1.
 */
2431 void check_for_unpunctuated_endquote(const char *aline)
2436 c=g_utf8_get_char(aline);
2437 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2438 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2442 qc=CHAR_IS_DQUOTE(c)?QUOTE_CLASS(c):INVALID_QUOTE;
2443 nc=g_utf8_get_char(g_utf8_next_char(s));
2444 /* for each character in the line except 1st */
/*
 * NOTE(review): isalpha() is handed pc here; if pc is a gunichar like its
 * neighbours, values outside the unsigned char range are not defined for
 * isalpha(), and g_unichar_isalpha() is what sibling checks use.
 */
2445 if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) && isalpha(pc))
2447 if (pswit[ECHO_SWITCH])
2448 g_print("\n%s\n",aline);
2449 if (!pswit[OVERVIEW_SWITCH])
2450 g_print(" Line %ld column %ld - "
2451 "endquote missing punctuation?\n",
2452 linecnt,g_utf8_pointer_to_offset(aline,s));
2460 * check_for_html_tag:
2462 * Check for <HTML TAG>.
2464 * If there is a < in the line, followed at some point
2465 * by a > then we suspect HTML.
/*
 * check_for_html_tag: finds the first '<' in aline and the first '>'
 * after it, and queries the text between them, brackets included, as a
 * possible HTML tag. Only this first pair on the line is examined.
 * The tag text is a heap copy made with g_strndup; the column is the
 * character offset of the '<' plus one.
 * NOTE(review): the tests on open and close, and the release of tag, are
 * not visible here.
 */
2467 void check_for_html_tag(const char *aline)
2469 const char *open,*close;
2471 open=strchr(aline,'<');
2474 close=strchr(g_utf8_next_char(open),'>');
2477 if (pswit[ECHO_SWITCH])
2478 g_print("\n%s\n",aline);
2479 if (!pswit[OVERVIEW_SWITCH])
/* close-open+1 bytes cover both angle brackets. */
2481 tag=g_strndup(open,close-open+1);
2482 g_print(" Line %ld column %ld - HTML Tag? %s \n",
2483 linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
2493 * check_for_html_entity:
2495 * Check for &symbol; HTML.
2497 * If there is a & in the line, followed at
2498 * some point by a ; then we suspect HTML.
/*
 * check_for_html_entity: finds the first '&' in aline and the first ';'
 * at or after it, then scans the characters in between for a space. The
 * text from '&' to ';' inclusive is copied with g_strndup and queried as
 * a possible HTML symbol. Only this first pair on the line is examined.
 * NOTE(review): the test that decides whether the space scan ran to
 * completion is not visible here; presumably a span containing a space is
 * not reported - confirm.
 */
2500 void check_for_html_entity(const char *aline)
2502 const char *s,*amp,*scolon;
2504 amp=strchr(aline,'&');
2507 scolon=strchr(amp,';');
2510 for (s=amp;s<scolon;s=g_utf8_next_char(s))
2511 if (g_utf8_get_char(s)==CHAR_SPACE)
2512 break; /* Don't report "Jones & Son;" */
2515 if (pswit[ECHO_SWITCH])
2516 g_print("\n%s\n",aline);
2517 if (!pswit[OVERVIEW_SWITCH])
2519 entity=g_strndup(amp,scolon-amp+1);
/*
 * NOTE(review): the column is a BYTE offset (pointer difference) printed
 * with %d, while check_for_html_tag reports a character offset obtained
 * from g_utf8_pointer_to_offset. The two differ whenever a multi-byte
 * character precedes the ampersand - confirm which is intended.
 */
2520 g_print(" Line %ld column %d - HTML symbol? %s \n",
2521 linecnt,(int)(amp-aline)+1,entity);
2532 * check_for_omitted_punctuation:
2534 * Check for omitted punctuation at end of paragraph by working back
2535 * through prevline. DW.
2536 * Need to check this only for "normal" paras.
2537 * So what is a "normal" para?
2538 * Not normal if one-liner (chapter headings, etc.)
2539 * Not normal if doesn't contain at least one locase letter
2540 * Not normal if starts with space
/*
 * check_for_omitted_punctuation: looks at prevline, treated as the last
 * line of a paragraph, and queries "No punctuation at para end?" for line
 * linecnt-1.
 *
 * prevline        - the line to inspect.
 * last            - only blen is read in the visible code.
 * start_para_line - line number at which the current paragraph started.
 *
 * The check is attempted only when prevline contains at least one letter,
 * last->blen is greater than 2, start_para_line is less than linecnt-1,
 * and the first character of prevline is greater than CHAR_SPACE.
 * It then works backwards from the end of the line: trailing closing or
 * neutral quotes are stepped over first, after which the backward scan
 * queries on reaching a letter and also tests for a character from the
 * punctuation set in the last condition.
 * NOTE(review): the break statements and the start of the do-while loop
 * are not visible here; presumably meeting punctuation first ends the
 * scan silently - confirm.
 */
2542 void check_for_omitted_punctuation(const char *prevline,
2543 struct line_properties *last,int start_para_line)
2545 gboolean letter_on_line=FALSE;
2548 gboolean closing_quote;
2549 for (s=prevline;*s;s=g_utf8_next_char(s))
2550 if (g_unichar_isalpha(g_utf8_get_char(s)))
2552 letter_on_line=TRUE;
2556 * This next "if" is a problem.
2557 * If we say "start_para_line <= linecnt - 1", that includes
2558 * one-line "paragraphs" like chapter heads. Lotsa false positives.
2559 * If we say "start_para_line < linecnt - 1" it doesn't, but then it
2560 * misses genuine one-line paragraphs.
2562 if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
2563 g_utf8_get_char(prevline)>CHAR_SPACE)
/* Start at the terminating NUL and step backwards one character at a time. */
2565 s=prevline+strlen(prevline);
2568 s=g_utf8_prev_char(s);
2569 c=g_utf8_get_char(s);
2570 if (QUOTE_CLASS(c)==CLOSING_QUOTE || QUOTE_CLASS(c)==NEUTRAL_QUOTE)
2573 closing_quote=FALSE;
2574 } while (closing_quote && s>prevline);
2575 for (;s>prevline;s=g_utf8_prev_char(s))
2577 if (g_unichar_isalpha(g_utf8_get_char(s)))
2579 if (pswit[ECHO_SWITCH])
2580 g_print("\n%s\n",prevline);
2581 if (!pswit[OVERVIEW_SWITCH])
/* The column printed is the character length of prevline, i.e. its last column. */
2582 g_print(" Line %ld column %ld - "
2583 "No punctuation at para end?\n",
2584 linecnt-1,g_utf8_strlen(prevline,-1));
2589 if (g_utf8_strchr("-.:!([{?}])",-1,g_utf8_get_char(s)))
/*
 * report_duplicate_queries: prints a note saying how many times a queried
 * word was duplicated. The (key, value, data) parameters and the gboolean
 * return type have the shape of a GLib GTraverseFunc; key is read as the
 * word.
 * NOTE(review): the arguments of the g_print call and the return value
 * are not visible here; presumably value points to the int counter stored
 * alongside the word - confirm.
 */
2595 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
2597 const char *word=key;
2600 g_print("\nNote: Queried word %s was duplicated %d times\n",
/*
 * print_as_windows_1252: converts a UTF-8 string to WINDOWS-1252 with
 * g_iconv. The converter is kept in a function-local static, opened on
 * first use and reused afterwards; (GIConv)-1 marks it as not open.
 * The output buffer is sized to the input byte length plus one, which is
 * enough because WINDOWS-1252 uses one byte per character.
 * NOTE(review): the condition guarding the close-and-reset branch is not
 * visible here (presumably a NULL string is the request to shut the
 * converter down), nor is the code that prints and frees buf; the final
 * fputs presumably is the fallback when no converter could be opened.
 * NOTE(review): the result of g_iconv is not checked in the visible code,
 * so characters with no WINDOWS-1252 equivalent may stop the conversion
 * early - confirm how that case is handled.
 */
2605 void print_as_windows_1252(const char *string)
2607 gsize inbytes,outbytes;
2609 static GIConv converter=(GIConv)-1;
2612 if (converter!=(GIConv)-1)
2613 g_iconv_close(converter);
2614 converter=(GIConv)-1;
2617 if (converter==(GIConv)-1)
2618 converter=g_iconv_open("WINDOWS-1252","UTF-8");
2619 if (converter!=(GIConv)-1)
2621 inbytes=outbytes=strlen(string);
2622 bp=buf=g_malloc(outbytes+1);
/* g_iconv advances string and bp and decrements both byte counters. */
2623 g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
2629 fputs(string,stdout);
/*
 * print_as_utf_8:
 *
 * Output routine that writes string to stdout unchanged via fputs().
 */
2632 void print_as_utf_8(const char *string)
2634 fputs(string,stdout);
/*
 * procfile:
 *
 * Process one file: read it with read_etext(), run first_pass() and
 * report_first_pass() over the whole text, then fetch it line by line
 * with flgets() and apply the per-line checks, and finally report
 * duplicated queries and release the per-file state.
 *
 * filename - name of the file to read; also used in messages.
 *
 * The order of the per-line checks determines the order of the output,
 * so statements here should not be rearranged casually.
 */
2642 void procfile(const char *filename)
2645 gchar *parastart=NULL; /* first line of current para */
2646 gchar *etext,*aline;
2649 struct first_pass_results *first_pass_results;
2650 struct warnings *warnings;
2651 struct counters counters={0};
2652 struct line_properties last={0};
2653 struct parities parities={0};
2654 struct pending pending={0};
2655 gboolean isemptyline;
2656 long start_para_line=0;
2657 gboolean isnewpara=FALSE,enddash=FALSE;
2658 last.start=CHAR_SPACE;
2659 linecnt=checked_linecnt=0;
/* Load the text; on failure err carries the message reported below. */
2660 etext=read_etext(filename,&err);
/* The failure message goes to stdout or stderr depending on STDOUT_SWITCH. */
2663 if (pswit[STDOUT_SWITCH])
2664 fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
2666 fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
2669 g_print("\n\nFile: %s\n\n",filename);
2670 first_pass_results=first_pass(etext);
2671 warnings=report_first_pass(first_pass_results);
/*
 * Both trees are keyed by strings compared with strcmp and free their
 * keys with g_free; only qword also frees its values.
 */
2672 qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
2673 qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
2675 * Here we go with the main pass. Hold onto yer hat!
/* flgets() is told the number the line will have (linecnt+1) for its messages. */
2679 while ((aline=flgets(&etext_ptr,linecnt+1)))
2684 if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
2685 continue; // skip DP page separators completely
/*
 * Lines before firstline, or after footerline when a footer line was
 * recorded (>0), get no checks; with HEADER_SWITCH selected, the
 * Title/Author/Release Date/Edition lines found there are echoed.
 */
2686 if (linecnt<first_pass_results->firstline ||
2687 (first_pass_results->footerline>0 &&
2688 linecnt>first_pass_results->footerline))
2690 if (pswit[HEADER_SWITCH])
2692 if (g_str_has_prefix(aline,"Title:"))
2693 g_print(" %s\n",aline);
2694 if (g_str_has_prefix(aline,"Author:"))
2695 g_print(" %s\n",aline);
2696 if (g_str_has_prefix(aline,"Release Date:"))
2697 g_print(" %s\n",aline);
2698 if (g_str_has_prefix(aline,"Edition:"))
2699 g_print(" %s\n\n",aline);
2701 continue; /* skip through the header */
2704 print_pending(aline,parastart,&pending);
2705 isemptyline=analyse_quotes(aline,linecnt,&counters);
2706 if (isnewpara && !isemptyline)
2708 /* This line is the start of a new paragraph. */
2709 start_para_line=linecnt;
2710 /* Capture its first line in case we want to report it later. */
2712 parastart=g_strdup(aline);
2713 memset(&parities,0,sizeof(parities)); /* restart the quote count */
/* Skip forward to the first letter or digit on the line. */
2715 while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
2716 !g_unichar_isdigit(g_utf8_get_char(s)))
2717 s=g_utf8_next_char(s);
2718 if (g_unichar_islower(g_utf8_get_char(s)))
2720 /* and its first letter is lowercase */
2721 if (pswit[ECHO_SWITCH])
2722 g_print("\n%s\n",aline);
2723 if (!pswit[OVERVIEW_SWITCH])
2724 g_print(" Line %ld column %ld - "
2725 "Paragraph starts with lower-case\n",
2726 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2730 isnewpara=FALSE; /* Signal the end of new para processing. */
2732 /* Check for an em-dash broken at line end. */
2733 if (enddash && g_utf8_get_char(aline)=='-')
2735 if (pswit[ECHO_SWITCH])
2736 g_print("\n%s\n",aline);
2737 if (!pswit[OVERVIEW_SWITCH])
2738 g_print(" Line %ld column 1 - Broken em-dash?\n",linecnt);
/*
 * Step back from the end of the line over trailing spaces, then test the
 * character reached for '-'.
 * NOTE(review): the statement guarded by that test is not visible in
 * this listing; presumably it sets enddash for the next line - confirm.
 */
2743 for (s=g_utf8_prev_char(aline+strlen(aline));
2744 g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
2746 if (s>=aline && g_utf8_get_char(s)=='-')
2748 check_for_control_characters(aline);
2749 check_for_odd_characters(aline,warnings,isemptyline);
/* The long-line and short-line checks run only when their warnings flag is set. */
2750 if (warnings->longline)
2751 check_for_long_line(aline);
2752 if (warnings->shortline)
2753 check_for_short_line(aline,&last);
/* Record this line's length (in characters) and first character in last. */
2755 last.len=g_utf8_strlen(aline,-1);
2756 last.start=g_utf8_get_char(aline);
2757 check_for_starting_punctuation(aline);
2760 check_for_spaced_emdash(aline);
2761 check_for_spaced_dash(aline);
2763 check_for_unmarked_paragraphs(aline);
2764 check_for_jeebies(aline);
2765 check_for_mta_from(aline);
2766 check_for_orphan_character(aline);
2767 check_for_pling_scanno(aline);
2768 check_for_extra_period(aline,warnings);
2769 check_for_following_punctuation(aline);
2770 check_for_typos(aline,warnings);
2771 check_for_misspaced_punctuation(aline,&parities,isemptyline);
2772 check_for_double_punctuation(aline,warnings);
2773 check_for_spaced_quotes(aline);
2774 check_for_miscased_genative(aline);
2775 check_end_of_line(aline,warnings);
2776 check_for_unspaced_bracket(aline);
2777 if (warnings->endquote)
2778 check_for_unpunctuated_endquote(aline);
2779 check_for_html_tag(aline);
2780 check_for_html_entity(aline);
2783 check_for_mismatched_quotes(&counters,&pending);
2784 counters_reset(&counters);
2785 /* let the next iteration know that it's starting a new para */
2788 check_for_omitted_punctuation(prevline,&last,start_para_line);
/* Keep a private copy of this line; it is the "prevline" of the next one. */
2791 prevline=g_strdup(aline);
/* Flush the quote checks and any pending report once more. */
2794 check_for_mismatched_quotes(&counters,&pending);
2795 print_pending(NULL,parastart,&pending);
2796 reset_pending(&pending);
/* Duplicate-query notes are printed only when neither overview nor verbose mode is on. */
2805 if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])
2806 g_tree_foreach(qword,report_duplicate_queries,NULL);
2807 g_tree_unref(qword);
2808 g_tree_unref(qperiod);
2809 counters_destroy(&counters);
/*
 * Passing NULL to g_set_print_handler() reinstates GLib's default handler.
 * NOTE(review): print_as_windows_1252(NULL) presumably releases that
 * routine's cached converter - confirm against its definition.
 */
2810 g_set_print_handler(NULL);
2811 print_as_windows_1252(NULL);
2812 if (pswit[MARKUP_SWITCH])
2819 * Get one line from the input text, checking for
2820 * the existence of exactly one CR/LF line-end per line.
2822 * Returns: a pointer to the line.
/*
 * etext - address of the read cursor into the text; it is advanced one
 *         UTF-8 character at a time as the line is consumed.
 * lcnt  - line number to quote in any line-end message.
 *
 * The line starts where *etext pointed on entry (theline). Line-end
 * problems are reported only when LINE_END_SWITCH is set. With
 * MARKUP_SWITCH and/or DP_SWITCH set, the line is cleaned in place by
 * postprocess_for_HTML() / postprocess_for_DP().
 */
2824 char *flgets(char **etext,long lcnt)
2827 gboolean isCR=FALSE;
2828 char *theline=*etext;
/* Fetch the next character and move the caller's cursor past it. */
2833 c=g_utf8_get_char(*etext);
2834 *etext=g_utf8_next_char(*etext);
2837 /* either way, it's end of line */
2844 /* Error - a LF without a preceding CR */
2845 if (pswit[LINE_END_SWITCH])
2847 if (pswit[ECHO_SWITCH])
/* Echo a temporary copy of the text between theline and eos. */
2849 s=g_strndup(theline,eos-theline);
2850 g_print("\n%s\n",s);
2853 if (!pswit[OVERVIEW_SWITCH])
2854 g_print(" Line %ld - No CR?\n",lcnt);
2865 /* Error - two successive CRs */
2866 if (pswit[LINE_END_SWITCH])
2868 if (pswit[ECHO_SWITCH])
2870 s=g_strndup(theline,eos-theline);
2871 g_print("\n%s\n",s);
2874 if (!pswit[OVERVIEW_SWITCH])
2875 g_print(" Line %ld - Two successive CRs?\n",lcnt);
/* A CR was seen but the character now being examined did not complete a CR/LF pair. */
2884 if (pswit[LINE_END_SWITCH] && isCR)
2886 if (pswit[ECHO_SWITCH])
2888 s=g_strndup(theline,eos-theline);
2889 g_print("\n%s\n",s);
2892 if (!pswit[OVERVIEW_SWITCH])
2893 g_print(" Line %ld column %ld - CR without LF?\n",
2894 lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
2900 eos=g_utf8_next_char(eos);
/* Optional in-place clean-up of the line. */
2904 if (pswit[MARKUP_SWITCH])
2905 postprocess_for_HTML(theline);
2906 if (pswit[DP_SWITCH])
2907 postprocess_for_DP(theline);
2914 * Takes a "word" as a parameter, and checks whether it
2915 * contains a mixture of alpha and digits. Generally, this is an
2916 * error, but may not be for cases like 4th or L5 12s. 3d.
2918 * Returns: TRUE iff an error is found.
/*
 * checkword - the NUL-terminated UTF-8 word to examine.
 *
 * Only words holding both a letter and a digit are examined further. The
 * text following the leading run of digits is then compared with a list
 * of accepted suffixes, and a leading 'L' is handled by recursing on the
 * rest of the word.
 */
2920 gboolean mixdigit(const char *checkword)
2922 gboolean wehaveadigit,wehavealetter,query;
2923 const char *s,*nondigit;
2924 wehaveadigit=wehavealetter=query=FALSE;
/* Classify every character of the word as letter or digit. */
2925 for (s=checkword;*s;s=g_utf8_next_char(s))
2926 if (g_unichar_isalpha(g_utf8_get_char(s)))
2928 else if (g_unichar_isdigit(g_utf8_get_char(s)))
2930 if (wehaveadigit && wehavealetter)
2932 /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
/* Advance nondigit past the leading digits, to the first non-digit. */
2934 for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
2935 nondigit=g_utf8_next_char(nondigit))
2937 /* digits, ending in st, rd, nd, th of either case */
2938 if (!g_ascii_strcasecmp(nondigit,"st") ||
2939 !g_ascii_strcasecmp(nondigit,"rd") ||
2940 !g_ascii_strcasecmp(nondigit,"nd") ||
2941 !g_ascii_strcasecmp(nondigit,"th"))
/* The same suffixes followed by "s". */
2943 if (!g_ascii_strcasecmp(nondigit,"sts") ||
2944 !g_ascii_strcasecmp(nondigit,"rds") ||
2945 !g_ascii_strcasecmp(nondigit,"nds") ||
2946 !g_ascii_strcasecmp(nondigit,"ths"))
/* The same suffixes followed by "ly". */
2948 if (!g_ascii_strcasecmp(nondigit,"stly") ||
2949 !g_ascii_strcasecmp(nondigit,"rdly") ||
2950 !g_ascii_strcasecmp(nondigit,"ndly") ||
2951 !g_ascii_strcasecmp(nondigit,"thly"))
2953 /* digits, ending in l, L, s or d */
/* "l" is matched in either case; "s" and "d" use strcmp, so lower-case only. */
2954 if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
2955 !strcmp(nondigit,"d"))
2958 * L at the start of a number, representing Britsh pounds, like L500.
2959 * This is cute. We know the current word is mixed digit. If the first
2960 * letter is L, there must be at least one digit following. If both
2961 * digits and letters follow, we have a genuine error, else we have a
2962 * capital L followed by digits, and we accept that as a non-error.
2964 if (g_utf8_get_char(checkword)=='L' &&
2965 !mixdigit(g_utf8_next_char(checkword)))
2974 * Extracts the first/next "word" from the line, and returns it.
2975 * A word is defined as one English word unit--or at least that's the aim.
2976 * "ptr" is advanced to the position in the line where we will start
2977 * looking for the next word.
2979 * Returns: A newly-allocated string.
/*
 * ptr - address of the caller's position in the line.
 *
 * The word is accumulated in a GString; g_string_free(word,FALSE) frees
 * only the GString wrapper and hands its character data to the caller,
 * who therefore owns the returned string.
 */
2981 gchar *getaword(const char **ptr)
2986 word=g_string_new(NULL);
/* Skip anything that is neither a digit nor a letter, stopping at the NUL. */
2987 for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
2988 !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
2989 **ptr;*ptr=g_utf8_next_char(*ptr))
2992 * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
2993 * Especially yucky is the case of L1,000
2994 * This section looks for a pattern of characters including a digit
2995 * followed by a comma or period followed by one or more digits.
2996 * If found, it returns this whole pattern as a word; otherwise we discard
2997 * the results and resume our normal programming.
/* Look-ahead: collect the run of digits, letters, commas and periods. */
3000 for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
3001 g_unichar_isalpha(g_utf8_get_char(s)) ||
3002 g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
3003 g_string_append_unichar(word,g_utf8_get_char(s));
/*
 * Search the collected text, from its second character on, for a '.' or
 * ',' whose preceding character is a digit.
 * NOTE(review): any further condition and the update of *ptr before the
 * return below are not visible in this listing.
 */
3006 for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))
3008 c=g_utf8_get_char(t);
3009 pc=g_utf8_get_char(g_utf8_prev_char(t));
3010 if ((c=='.' || c==',') && g_unichar_isdigit(pc))
3013 return g_string_free(word,FALSE);
3017 /* we didn't find a punctuated number - do the regular getword thing */
/* Discard the look-ahead text and collect digits, letters and apostrophes, advancing *ptr. */
3018 g_string_truncate(word,0);
3019 c=g_utf8_get_char(*ptr);
3020 for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);
3021 *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))
3022 g_string_append_unichar(word,c);
3023 return g_string_free(word,FALSE);
3029 * Is this word a Roman Numeral?
3031 * It doesn't actually validate that the number is a valid Roman Numeral--for
3032 * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
3033 * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
3034 * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
3035 * expressions thereof, except when it came to taxes. Allow any number of M,
3036 * an optional D, an optional CM or CD, any number of optional Cs, an optional
3037 * XL or an optional XC, an optional IX or IV, an optional V and any number
/*
 * t - the word to test.
 *
 * The tests run in a fixed order - m, d, cm, cd, c, xl, xc, l, x, ix, iv,
 * v, i - each consuming its letters when present.
 * Only lower-case letters are compared.
 * NOTE(review): presumably the word has been case-folded before these
 * tests (by the caller or by a statement not visible in this listing).
 * The "&& *t" in the while conditions is redundant: a NUL can never equal
 * the letter being compared.
 */
3040 gboolean isroman(const char *t)
/* Thousands: any number of m. */
3046 while (g_utf8_get_char(t)=='m' && *t)
/* Hundreds: d, cm, cd, then any number of c. */
3048 if (g_utf8_get_char(t)=='d')
3050 if (g_str_has_prefix(t,"cm"))
3052 if (g_str_has_prefix(t,"cd"))
3054 while (g_utf8_get_char(t)=='c' && *t)
/* Tens: xl, xc, l, then any number of x. */
3056 if (g_str_has_prefix(t,"xl"))
3058 if (g_str_has_prefix(t,"xc"))
3060 if (g_utf8_get_char(t)=='l')
3062 while (g_utf8_get_char(t)=='x' && *t)
/* Units: ix, iv, v, then any number of i. */
3064 if (g_str_has_prefix(t,"ix"))
3066 if (g_str_has_prefix(t,"iv"))
3068 if (g_utf8_get_char(t)=='v')
3070 while (g_utf8_get_char(t)=='i' && *t)
3076 * postprocess_for_DP:
3078 * Invoked with the -d switch from flgets().
3079 * It simply "removes" from the line a hard-coded set of common
3080 * DP-specific tags, so that the line passed to the main routine has
3081 * been pre-cleaned of DP markup.
/*
 * theline - the line to clean; it is modified in place.
 *
 * DPmarkup is walked until an entry that is the empty string, which acts
 * as the end-of-table marker.
 */
3083 void postprocess_for_DP(char *theline)
3089 for (i=0;*DPmarkup[i];i++)
/* Remove every occurrence of this tag, not just the first. */
3090 while ((s=strstr(theline,DPmarkup[i])))
/* t is the text after the tag; slide it, with its NUL, down over the tag. */
3092 t=s+strlen(DPmarkup[i]);
3093 memmove(s,t,strlen(t)+1);
3098 * postprocess_for_HTML:
3100 * Invoked with the -m switch from flgets().
3101 * It simply "removes" from the line a hard-coded set of common
3102 * HTML tags and "replaces" a hard-coded set of common HTML
3103 * entities, so that the line passed to the main routine has
3104 * been pre-cleaned of HTML.
/*
 * theline - the line to clean; it is passed to losemarkup() and
 *           loseentities(), which edit it in place.
 *
 * losemarkup() is called repeatedly until it returns a false (NULL)
 * value, then loseentities() is run once over the result.
 */
3106 void postprocess_for_HTML(char *theline)
3108 while (losemarkup(theline))
3110 loseentities(theline);
/*
 * losemarkup:
 *
 * Looks for the first '<' in theline and the first '>' after it, and
 * compares the text following the '<' with each entry of the markup
 * table (ended by an empty string) using tagcomp(). For a recognized tag
 * everything from the '<' through the '>' is removed in place.
 * NOTE(review): the return statements are not visible in this listing;
 * postprocess_for_HTML() keeps calling while the result is non-NULL, so
 * presumably non-NULL means "a tag was removed" - confirm.
 */
3113 char *losemarkup(char *theline)
3117 s=strchr(theline,'<');
3118 t=s?strchr(s,'>'):NULL;
3121 for (i=0;*markup[i];i++)
3122 if (tagcomp(g_utf8_next_char(s),markup[i]))
/* Step past the '>' and close the gap, moving the tail with its NUL. */
3124 t=g_utf8_next_char(t);
3125 memmove(s,t,strlen(t)+1);
3128 /* It's an unrecognized <xxx>. */
/*
 * loseentities:
 *
 * Replaces HTML entities of the form "&...;" in theline, in place.
 * Numeric forms are decoded with strtol() (decimal, or hexadecimal after
 * an 'x'); other names are looked up in a tree built from the
 * HTMLentities table. Characters not written directly are passed through
 * a transliterating UTF-8 to ISO_8859-1 conversion and back to UTF-8.
 *
 * The two iconv handles are function-level statics, opened on first use.
 *
 * NOTE(review): entities is declared without "static", unlike the iconv
 * handles beside it. If the tree is meant to persist between calls (the
 * g_tree_destroy() section suggests so), it would instead be rebuilt on
 * every call and never released - confirm against the statements not
 * visible in this listing.
 */
3132 void loseentities(char *theline)
3139 GTree *entities=NULL;
3140 static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
/* Tear-down: destroy the lookup tree and close the converters. */
3144 g_tree_destroy(entities);
3146 if (translit!=(GIConv)-1)
3147 g_iconv_close(translit);
3148 translit=(GIConv)-1;
3149 if (to_utf8!=(GIConv)-1)
3150 g_iconv_close(to_utf8);
/* Build the name -> code point lookup; the code point is stored in the value pointer itself. */
3158 entities=g_tree_new((GCompareFunc)strcmp);
3159 for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
3160 g_tree_insert(entities,HTMLentities[i].name,
3161 GUINT_TO_POINTER(HTMLentities[i].c));
3163 if (translit==(GIConv)-1)
3164 translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
3165 if (to_utf8==(GIConv)-1)
3166 to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
/* Handle each '&' in turn, together with the first ';' that follows it. */
3167 while((amp=strchr(theline,'&')))
3169 scolon=strchr(amp,';');
/* Accept a numeric form only if digits run all the way up to the ';'. */
3174 if (amp+2+strspn(amp+2,"0123456789")==scolon)
3175 c=strtol(amp+2,NULL,10);
3176 else if (amp[2]=='x' &&
3177 amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
3178 c=strtol(amp+3,NULL,16);
/* Named form: look up the text between '&' and ';'. */
3182 s=g_strndup(amp+1,scolon-(amp+1));
3183 c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
3192 if (c<128 || c>=192 && c<=255) /* An ISO-8859-1 character */
3193 theline+=g_unichar_to_utf8(c,theline);
/* Otherwise round-trip through ISO_8859-1 with transliteration. */
3197 nb=g_unichar_to_utf8(c,s);
3198 t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
3200 s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
3202 memcpy(theline,s,nb);
/* Close the gap: move the text after the ';' (with its NUL) down. */
3206 memmove(theline,g_utf8_next_char(scolon),
3207 strlen(g_utf8_next_char(scolon))+1);
/* Resume the search just after this '&'. */
3210 theline=g_utf8_next_char(amp);
/*
 * tagcomp:
 *
 * Tests whether strin begins with basetag, ignoring case and one leading
 * '/' on strin (so closing tags match too).
 *
 * strin   - text to test; losemarkup() passes the text following a '<'.
 * basetag - tag name to look for.
 *
 * Both strings are case-folded with g_utf8_casefold(), which returns
 * newly allocated copies, and the comparison is a prefix test.
 */
3214 gboolean tagcomp(const char *strin,const char *basetag)
3218 if (g_utf8_get_char(strin)=='/')
3219 t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
3221 t=g_utf8_casefold(strin,-1);
3222 s=g_utf8_casefold(basetag,-1);
3223 retval=g_str_has_prefix(t,s);
3229 void proghelp(GOptionContext *context)
3232 fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
3233 fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
3234 fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
3235 fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
3236 "For details, read the file COPYING.\n",stderr);
3237 fputs("This is Free Software; "
3238 "you may redistribute it under certain conditions (GPL);\n",stderr);
3239 fputs("read the file COPYING for details.\n\n",stderr);
3240 help=g_option_context_get_help(context,TRUE,NULL);
3243 fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
3244 fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
3245 "non-ASCII\n",stderr);
3246 fputs("characters like accented letters, "
3247 "lines longer than 75 or shorter than 55,\n",stderr);
3248 fputs("unbalanced quotes or brackets, "
3249 "a variety of badly formatted punctuation, \n",stderr);
3250 fputs("HTML tags, some likely typos. "
3251 "It is NOT a substitute for human judgement.\n",stderr);