1 /*************************************************************************/
2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com> */
5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk> */
7 /* This program is free software; you can redistribute it and/or modify */
8 /* it under the terms of the GNU General Public License as published by */
9 /* the Free Software Foundation; either version 2 of the License, or */
10 /* (at your option) any later version. */
12 /* This program is distributed in the hope that it will be useful, */
13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
15 /* GNU General Public License for more details. */
17 /* You should have received a copy of the GNU General Public License */
18 /* along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 /*************************************************************************/
30 #include "bookloupe.h"
33 #include "HTMLentities.h"
/* Name of the character set the text is validated against; assigned by set_charset(). */
35 gchar *charset; /* Or NULL for auto (ISO_8859-1/ASCII or UNICODE) */
/* Converter opened by set_charset() from UTF-8 to charset; (GIConv)-1 means none is open. */
36 GIConv charset_validator=(GIConv)-1;
/*
 * NOTE(review): the declaration lines that open and close the string
 * tables below are not visible in this excerpt.  Descriptions marked
 * "presumably" are inferred from the entries alone and should be
 * confirmed against the full file.
 */
/* Misspellings and OCR misreadings; presumably the built-in typo list. */
42 "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
43 "nad", "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa",
44 "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
45 "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
46 "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
47 "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
48 "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
49 "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
50 "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
51 "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
52 "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
53 "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
54 "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
55 "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
56 "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
57 "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
58 "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
59 "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
60 "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
61 "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
62 "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
63 "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
64 "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
65 "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
66 "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
67 "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
68 "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
69 "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
70 "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
76 /* Common abbreviations and other OK words not to query as typos. */
78 "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
79 "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
80 "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
81 "outbid", "outbids", "frostbite", "frostbitten", ""
84 /* Common abbreviations that cause otherwise unexplained periods. */
86 "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
87 "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
91 * Two-Letter combinations that rarely if ever start words,
92 * but are common scannos or otherwise common letter combinations.
95 "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
99 * Two-Letter combinations that rarely if ever end words,
100 * but are common scannos or otherwise common letter combinations.
103 "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
104 "sw", "gr", "sl", "cl", "iy", ""
/* HTML element names; presumably the tags recognised as markup - TODO confirm. */
108 "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
109 "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
110 "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
111 "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
/* Non-HTML markup tokens; presumably Distributed Proofreaders (DP) markup - TODO confirm. */
115 "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
/*
 * NOTE(review): the purpose of the next two word lists is not stated in
 * this excerpt; presumably they hold words that are queried when directly
 * followed by particular punctuation - confirm against their declarations.
 */
119 "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
120 "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
121 "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
122 "during", "let", "toward", "among", ""
126 "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
127 "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
128 "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
129 "among", "those", "into", "whom", "having", "thence", ""
/* Flag array indexed by the *_SWITCH constants; the option table below stores into it. */
132 gboolean pswit[SWITNO]; /* program switches */
/*
 * Command-line option table handed to g_option_context_add_main_entries()
 * by parse_options().  Every G_OPTION_ARG_NONE entry stores its flag in the
 * matching pswit[] slot; --charset stores its string argument in opt_charset.
 *
 * --noecho, --relaxed and --line-end are "turn off" switches: parse_options()
 * inverts their pswit[] slots after parsing.
 */
135 static GOptionEntry options[]={
136 { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
137 "Ignore DP-specific markup", NULL },
138 { "noecho", 'e', 0, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
139 "Don't echo queried line", NULL },
140 { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
141 "Check single quotes", NULL },
142 { "typo", 't', 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
143 "Check common typos", NULL },
144 { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
145 "Require closure of quotes on every paragraph", NULL },
146 { "relaxed", 'x', 0, G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
147 "Disable paranoid querying of everything", NULL },
148 { "line-end", 'l', 0, G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
149 "Disable line end checking", NULL },
150 { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
151 "Overview: just show counts", NULL },
152 { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
153 "Output errors to stdout instead of stderr", NULL },
154 { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
155 "Echo header fields", NULL },
156 { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
157 "Ignore markup in < >", NULL },
158 { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
159 "Use file of user-defined typos", NULL },
160 { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
161 "Defaults for use on www upload", NULL },
162 { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
163 "Verbose - list everything", NULL },
164 { "charset", 0, 0, G_OPTION_ARG_STRING, &opt_charset,
165 "Set of characters valid for this ebook", "NAME" },
/*
 * Query counters.  In overview mode main() prints most of these and sums
 * the twelve query counters into its TOTAL QUERIES line.
 */
169 long cnt_dquot; /* for overview mode, count of doublequote queries */
170 long cnt_squot; /* for overview mode, count of singlequote queries */
171 long cnt_brack; /* for overview mode, count of brackets queries */
172 long cnt_bin; /* for overview mode, count of non-ASCII queries */
173 long cnt_odd; /* for overview mode, count of odd character queries */
174 long cnt_long; /* for overview mode, count of long line errors */
175 long cnt_short; /* for overview mode, count of short line queries */
176 long cnt_punct; /* for overview mode,
177 count of punctuation and spacing queries */
178 long cnt_dash; /* for overview mode, count of dash-related queries */
179 long cnt_word; /* for overview mode, count of word queries */
180 long cnt_html; /* for overview mode, count of html queries */
181 long cnt_lineend; /* for overview mode, count of line-end queries */
182 long cnt_spacend; /* count of lines with space at end */
183 long linecnt; /* count of total lines in the file */
184 long checked_linecnt; /* count of lines actually checked */
/* Forward declarations. */
186 void proghelp(GOptionContext *context);
187 void procfile(const char *);
191 gboolean mixdigit(const char *);
192 gchar *getaword(const char **);
193 char *flgets(char **,long);
194 void postprocess_for_HTML(char *);
195 char *linehasmarkup(char *);
196 char *losemarkup(char *);
197 gboolean tagcomp(const char *,const char *);
198 void loseentities(char *);
199 gboolean isroman(const char *);
200 void postprocess_for_DP(char *);
201 void print_as_windows_1252(const char *string);
202 void print_as_utf_8(const char *string);
/* NOTE(review): the use of these two trees is not visible in this excerpt. */
204 GTree *qword,*qperiod;
/*
 * set_charset:
 *
 * Select the character set that the text is validated against.
 *
 * A NULL name or "auto" (compared case-insensitively) selects automatic
 * mode and leaves no converter open.  Any of the UNICODE encoding aliases
 * listed below is canonicalised to "UTF-8", which needs no converter
 * either.  For every other name a converter from UTF-8 to that character
 * set is opened and kept in charset_validator.
 *
 * If the converter cannot be opened, err is set to
 * G_CONVERT_ERROR_NO_CONVERSION with an "Unknown character set" message.
 * Callers in this file treat a FALSE return as failure.
 */
210 gboolean set_charset(const char *name,GError **err)
212 /* The various UNICODE encodings all share the same character set. */
213 const char *unicode_aliases[]={ "UCS-2", "UCS-2BE", "UCS-2LE", "UCS-4",
214 "UCS-4BE", "UCS-4LE", "UCS2", "UCS4", "UNICODE", "UNICODEBIG",
215 "UNICODELITTLE", "UTF-7", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE",
216 "UTF-32", "UTF-32BE", "UTF-32LE", "UTF7", "UTF8", "UTF16", "UTF16BE",
217 "UTF16LE", "UTF32", "UTF32BE", "UTF32LE" };
/*
 * Release the converter left open by a previous call.  (GIConv)-1 is the
 * "nothing open" sentinel: only a real descriptor may be handed to
 * g_iconv_close(), otherwise the old descriptor leaks and the sentinel is
 * closed instead.
 */
221 if (charset_validator!=(GIConv)-1)
222 g_iconv_close(charset_validator);
223 if (!name || !g_strcasecmp(name,"auto"))
226 charset_validator=(GIConv)-1;
230 charset=g_strdup(name);
/* Fold every UNICODE alias onto the single canonical name "UTF-8". */
231 for(i=0;i<G_N_ELEMENTS(unicode_aliases);i++)
232 if (!g_strcasecmp(charset,unicode_aliases[i]))
235 charset=g_strdup("UTF-8");
/* The text is held as UTF-8 internally, so UTF-8 needs no converter. */
238 if (!strcmp(charset,"UTF-8"))
239 charset_validator=(GIConv)-1;
242 charset_validator=g_iconv_open(charset,"UTF-8");
243 if (charset_validator==(GIConv)-1)
245 g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_NO_CONVERSION,
246 "Unknown character set \"%s\"",charset);
/*
 * parse_options:
 *
 * Parse the command line with GOption using the options[] table, then
 * post-process the switch array:
 *   - --relaxed, --line-end and --noecho are "turn off" switches, so their
 *     slots are inverted;
 *   - the typo switch is toggled when paranoid mode remains on;
 *   - --overview suppresses echoing;
 *   - --web overrides every switch with fixed values.
 * Finally the --charset argument, if given, is applied with set_charset().
 *
 * On a parse error or an unusable charset the GError message is printed
 * with g_printerr().  NOTE(review): the statements that follow those
 * messages are not visible in this excerpt.
 */
253 void parse_options(int *argc,char ***argv)
256 GOptionContext *context;
257 context=g_option_context_new(
258 "file - looks for errors in Project Gutenberg(TM) etexts");
259 g_option_context_add_main_entries(context,options,NULL);
260 if (!g_option_context_parse(context,argc,argv,&err))
262 g_printerr("Bookloupe: %s\n",err->message);
263 g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
266 /* Paranoid checking is turned OFF, not on, by its switch */
267 pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
268 if (pswit[PARANOID_SWITCH])
269 /* if running in paranoid mode, typo checks default to enabled */
270 pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
271 /* Line-end checking is turned OFF, not on, by its switch */
272 pswit[LINE_END_SWITCH]=!pswit[LINE_END_SWITCH];
273 /* Echoing is turned OFF, not on, by its switch */
274 pswit[ECHO_SWITCH]=!pswit[ECHO_SWITCH];
275 if (pswit[OVERVIEW_SWITCH])
276 /* just print summary; don't echo */
277 pswit[ECHO_SWITCH]=FALSE;
279 * Web uploads - for the moment, this is really just a placeholder
280 * until we decide what processing we really want to do on web uploads
282 if (pswit[WEB_SWITCH])
284 /* specific override for web uploads */
285 pswit[ECHO_SWITCH]=TRUE;
286 pswit[SQUOTE_SWITCH]=FALSE;
287 pswit[TYPO_SWITCH]=TRUE;
288 pswit[QPARA_SWITCH]=FALSE;
289 pswit[PARANOID_SWITCH]=TRUE;
290 pswit[LINE_END_SWITCH]=FALSE;
291 pswit[OVERVIEW_SWITCH]=FALSE;
292 pswit[STDOUT_SWITCH]=FALSE;
293 pswit[HEADER_SWITCH]=TRUE;
294 pswit[VERBOSE_SWITCH]=FALSE;
295 pswit[MARKUP_SWITCH]=FALSE;
296 pswit[USERTYPO_SWITCH]=FALSE;
297 pswit[DP_SWITCH]=FALSE;
/* Apply --charset only when it was given; set_charset() fills err on failure. */
299 if (opt_charset && !set_charset(opt_charset,&err))
301 g_printerr("%s\n",err->message);
311 g_option_context_free(context);
317 * Read in the user-defined stealth scanno list.
/*
 * read_user_scannos:
 *
 * Load the user typo list into the usertypo tree.
 *
 * The file is looked for, in order, as "bookloupe.typ" in the current
 * directory, "bookloupe.typ" in the running_from directory, then
 * "gutcheck.typ" in the same two places; each retry happens only when the
 * previous attempt failed with G_FILE_ERROR_NOENT.  If none exists a notice
 * is printed with g_print(); any other error is written to stderr.
 *
 * Contents that validate as UTF-8 are normalised (default composed form);
 * anything else is converted from WINDOWS-1252.  The text is then split on
 * CR/LF and each line whose first byte is greater than '!' becomes a key
 * in the tree.
 */
319 void read_user_scannos(void)
322 gchar *usertypo_file;
326 gchar *contents,*utf8,**lines;
327 usertypo_file=g_strdup("bookloupe.typ");
328 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
329 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
332 g_free(usertypo_file);
333 usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
334 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
336 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
339 g_free(usertypo_file);
340 usertypo_file=g_strdup("gutcheck.typ");
341 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
343 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
346 g_free(usertypo_file);
347 usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
348 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
350 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
352 g_free(usertypo_file);
353 g_print(" --> I couldn't find bookloupe.typ "
354 "-- proceeding without user typos.\n");
359 fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
360 g_free(usertypo_file);
364 if (g_utf8_validate(contents,len,NULL))
366 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
/* NOTE(review): any guard on the elided preceding line is not visible here. */
368 (void)set_charset("UNICODE",NULL);
371 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
373 lines=g_strsplit_set(utf8,"\r\n",0);
/* Keys are compared with strcmp() and released with g_free() by the tree. */
375 usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
376 for (i=0;lines[i];i++)
/* Skip empty lines and lines starting with a control character, space or '!'. */
377 if (*(unsigned char *)lines[i]>'!')
378 g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
387 * Read an etext returning a newly allocated string containing the file
388 * contents or NULL on error.
/*
 * read_etext:
 *
 * Load filename and hand back its text as UTF-8.
 *
 * Contents that validate as UTF-8 are normalised (default composed form)
 * and g_print() output is routed through print_as_utf_8().  Anything else
 * is converted from WINDOWS-1252 and output is routed through
 * print_as_windows_1252().
 *
 * If the conversion stops on an illegal byte sequence, the bytes consumed
 * so far are walked to work out the line and column of the offending byte,
 * and err receives a message naming that byte and position.  Other
 * conversion errors are propagated to err unchanged.
 */
390 gchar *read_etext(const char *filename,GError **err)
392 GError *tmp_err=NULL;
393 gchar *contents,*utf8;
394 gsize len,bytes_read,bytes_written;
396 if (!g_file_get_contents(filename,&contents,&len,err))
398 if (g_utf8_validate(contents,len,NULL))
400 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
401 g_set_print_handler(print_as_utf_8);
403 SetConsoleOutputCP(CP_UTF8);
408 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,
409 &bytes_written,&tmp_err);
410 if (g_error_matches(tmp_err,G_CONVERT_ERROR,
411 G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
/* bytes_read is the offset of the first byte that could not be converted. */
414 for(i=0;i<bytes_read;i++)
415 if (contents[i]=='\n')
420 else if (contents[i]!='\r')
422 g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
423 "Input conversion failed. Byte %d at line %d, column %d is not a "
424 "valid Windows-1252 character",
425 ((unsigned char *)contents)[bytes_read],line,col);
428 g_propagate_error(err,tmp_err);
429 g_set_print_handler(print_as_windows_1252);
431 SetConsoleOutputCP(1252);
/*
 * cleanup_on_exit:
 *
 * atexit() handler registered by main(): puts the console output code page
 * back to the value main() saved in saved_cp.
 * NOTE(review): SetConsoleOutputCP() is a Windows console call; presumably
 * it sits inside a platform conditional that is not visible in this excerpt.
 */
438 void cleanup_on_exit(void)
441 SetConsoleOutputCP(saved_cp);
/*
 * main:
 *
 * Program entry point.  Registers cleanup_on_exit(), remembers the current
 * console output code page and the directory the program was started from,
 * and parses the command line.  In overview mode it prints the line totals
 * and one line per query counter, followed by their sum.  Before leaving it
 * releases running_from and the user typo tree and resets the charset with
 * set_charset(NULL,NULL).
 * NOTE(review): the statements guarded by several of the conditions below
 * are not visible in this excerpt.
 */
445 int main(int argc,char **argv)
448 atexit(cleanup_on_exit);
449 saved_cp=GetConsoleOutputCP();
451 running_from=g_path_get_dirname(argv[0]);
452 parse_options(&argc,&argv);
453 if (pswit[USERTYPO_SWITCH])
455 fprintf(stderr,"bookloupe: Check and report on an e-text\n");
457 if (pswit[OVERVIEW_SWITCH])
459 g_print(" Checked %ld lines of %ld (head+foot = %ld)\n\n",
460 checked_linecnt,linecnt,linecnt-checked_linecnt);
461 g_print(" --------------- Queries found --------------\n");
463 g_print(" Long lines: %14ld\n",cnt_long);
465 g_print(" Short lines: %14ld\n",cnt_short);
467 g_print(" Line-end problems: %14ld\n",cnt_lineend);
469 g_print(" Common typos: %14ld\n",cnt_word);
471 g_print(" Unmatched quotes: %14ld\n",cnt_dquot);
473 g_print(" Unmatched SingleQuotes: %14ld\n",cnt_squot);
475 g_print(" Unmatched brackets: %14ld\n",cnt_brack);
477 g_print(" Non-ASCII characters: %14ld\n",cnt_bin);
479 g_print(" Proofing characters: %14ld\n",cnt_odd);
481 g_print(" Punctuation & spacing queries: %14ld\n",cnt_punct);
483 g_print(" Non-standard dashes: %14ld\n",cnt_dash);
485 g_print(" Possible HTML tags: %14ld\n",cnt_html);
487 g_print(" TOTAL QUERIES %14ld\n",
488 cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+
489 cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend);
491 g_free(running_from);
493 g_tree_unref(usertypo);
/* Passing NULL selects automatic mode, so no converter stays open. */
494 set_charset(NULL,NULL);
501 * Run a first pass - verify that it's a valid PG
502 * file, decide whether to report some things that
503 * occur many times in the text like long or short
504 * lines, non-standard dashes, etc.
/*
 * first_pass:
 *
 * Scan the whole text once, line by line, and gather the statistics that
 * report_first_pass() turns into decisions: header and footer positions,
 * character-class totals, and counts of short, long, asterisk, slash,
 * hyphen-ended, HTML-looking and dash-related lines, plus a few marker
 * words used to guess the language.
 *
 * etext is split on '\n' and any trailing '\r' is trimmed from each line.
 * Totals accumulate in a function-local static structure; presumably its
 * address is what is returned (the return statement is not visible in
 * this excerpt).
 */
506 struct first_pass_results *first_pass(const char *etext)
508 gunichar laststart=CHAR_SPACE;
513 unsigned int lastlen=0,lastblen=0;
514 long spline=0,nspline=0;
515 static struct first_pass_results results={0};
517 lines=g_strsplit(etext,"\n",0);
518 for (j=0;lines[j];j++)
/* lbytes is the length in bytes, llen the length in characters. */
520 lbytes=strlen(lines[j]);
521 while (lbytes>0 && lines[j][lbytes-1]=='\r')
522 lines[j][--lbytes]='\0';
523 llen=g_utf8_strlen(lines[j],lbytes);
/* Old-style header: "*END ... SMALL PRINT ... PUBLIC DOMAIN/COPYRIGHT". */
525 if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
526 (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
529 g_print(" --> Duplicate header?\n");
530 spline=linecnt+1; /* first line of non-header text, that is */
/* New-style header: "*** START ... PROJECT GUTENBERG". */
532 if (!strncmp(lines[j],"*** START",9) &&
533 strstr(lines[j],"PROJECT GUTENBERG"))
536 g_print(" --> Duplicate header?\n");
537 nspline=linecnt+1; /* first line of non-header text, that is */
/* Only look for a footer once some header has been seen. */
539 if (spline || nspline)
541 lc_line=g_utf8_strdown(lines[j],lbytes);
542 if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
544 if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
546 if (results.footerline)
548 /* it's an old-form header - we can detect duplicates */
550 g_print(" --> Duplicate footer?\n");
553 results.footerline=linecnt;
559 results.firstline=spline;
561 results.firstline=nspline; /* override with new */
562 if (results.footerline)
563 continue; /* don't count the boilerplate in the footer */
564 results.totlen+=llen;
565 for (s=lines[j];*s;s=g_utf8_next_char(s))
567 if (g_utf8_get_char(s)>127)
569 if (g_unichar_isalpha(g_utf8_get_char(s)))
/*
 * A double quote straight after a letter counts as an unpunctuated
 * endquote.  The preceding character is a gunichar and may lie outside
 * the range the <ctype.h> isalpha() accepts, so the Unicode-aware
 * classifier is used, matching the alphalen test above.
 */
571 if (s>lines[j] && g_utf8_get_char(s)==CHAR_DQUOTE &&
572 g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
573 results.endquote_count++;
/* The previous line is "short" when it sits between two normal-length lines. */
575 if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
576 lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
579 g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
581 if (strstr(lines[j],".,"))
583 /* only count ast lines for ignoring purposes where there is */
584 /* locase text on the line */
585 if (strchr(lines[j],'*'))
587 for (s=lines[j];*s;s=g_utf8_next_char(s))
588 if (g_unichar_islower(g_utf8_get_char(s)))
593 if (strchr(lines[j],'/'))
594 results.fslashline++;
/* Step back over trailing white space to the last visible character. */
597 for (s=g_utf8_prev_char(lines[j]+lbytes);
598 s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;
599 s=g_utf8_prev_char(s))
/* A single '-' (not "--") at the end of the line is a line-end hyphen. */
601 if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
602 g_utf8_get_char(g_utf8_prev_char(s))!='-')
605 if (llen>LONGEST_PG_LINE)
607 if (llen>WAY_TOO_LONG)
608 results.verylongline++;
609 if (strchr(lines[j],'<') && strchr(lines[j],'>'))
/* Distance from the first '<' to the first '>', both inclusive. */
611 i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
614 if (strstr(lines[j],"<i>"))
615 results.htmcount+=4; /* bonus marks! */
617 /* Check for spaced em-dashes */
/* The search starts at the second character so that s[-1] is always valid. */
618 if (lines[j][0] && (s=strstr(g_utf8_next_char(lines[j]),"--")))
621 if (s[-1]==CHAR_SPACE || s[2]==CHAR_SPACE)
622 results.space_emdash++;
623 if (s[-1]==CHAR_SPACE && s[2]==CHAR_SPACE)
624 /* count of em-dashes with spaces both sides */
625 results.non_PG_space_emdash++;
626 if (s[-1]!=CHAR_SPACE && s[2]!=CHAR_SPACE)
627 /* count of PG-type em-dashes with no spaces */
628 results.PG_space_emdash++;
/* Marker words used by report_first_pass() to guess Dutch or French text. */
633 if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))
634 results.Dutchcount++;
635 if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))
636 results.Frenchcount++;
637 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
638 results.standalone_digit++;
641 /* Check for spaced dashes */
642 if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
646 laststart=lines[j][0];
655 * Make some snap decisions based on the first pass results.
/*
 * report_first_pass:
 *
 * Turn the totals gathered by first_pass() into a set of warning flags
 * held in a function-local static structure.  A class of query that
 * occurs too often is announced with g_print() as "Not reporting ...".
 * The routine also announces text that looks like HTML, Dutch or French,
 * states whether a PG header and footer were found, and in verbose mode
 * switches reporting back on.  When header and footer are both present
 * but fewer than 100 lines apart it prints a notice and discards the
 * header position.
 * NOTE(review): most flag assignments and the return statement are not
 * visible in this excerpt.
 */
657 struct warnings *report_first_pass(struct first_pass_results *results)
659 static struct warnings warnings={0};
661 g_print(" --> %ld lines in this file have white space at end\n",
664 if (results->dotcomma>5)
667 g_print(" --> %ld lines in this file contain '.,'. "
668 "Not reporting them.\n",results->dotcomma);
671 * If more than 50 lines, or one-tenth, are short,
672 * don't bother reporting them.
674 warnings.shortline=1;
675 if (results->shortline>50 || results->shortline*10>linecnt)
677 warnings.shortline=0;
678 g_print(" --> %ld lines in this file are short. "
679 "Not reporting short lines.\n",results->shortline);
682 * If more than 50 lines, or one-tenth, are long,
683 * don't bother reporting them.
686 if (results->longline>50 || results->longline*10>linecnt)
689 g_print(" --> %ld lines in this file are long. "
690 "Not reporting long lines.\n",results->longline);
692 /* If more than 10 lines contain asterisks, don't bother reporting them. */
694 if (results->astline>10)
697 g_print(" --> %ld lines in this file contain asterisks. "
698 "Not reporting them.\n",results->astline);
701 * If more than 10 lines contain forward slashes,
702 * don't bother reporting them.
705 if (results->fslashline>10)
708 g_print(" --> %ld lines in this file contain forward slashes. "
709 "Not reporting them.\n",results->fslashline);
712 * If more than 20 lines contain unpunctuated endquotes,
713 * don't bother reporting them.
716 if (results->endquote_count>20)
719 g_print(" --> %ld lines in this file contain unpunctuated endquotes. "
720 "Not reporting them.\n",results->endquote_count);
723 * If more than 15 lines contain standalone digits,
724 * don't bother reporting them.
/* NOTE(review): the text above says 15, but the test below uses a threshold of 10. */
727 if (results->standalone_digit>10)
730 g_print(" --> %ld lines in this file contain standalone 0s and 1s. "
731 "Not reporting them.\n",results->standalone_digit);
734 * If more than 20 lines contain hyphens at end,
735 * don't bother reporting them.
738 if (results->hyphens>20)
741 g_print(" --> %ld lines in this file have hyphens at end. "
742 "Not reporting them.\n",results->hyphens);
/* Enough tag-like spans: treat the text as HTML unless markup mode is already on. */
744 if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
746 g_print(" --> Looks like this is HTML. Switching HTML mode ON.\n");
747 pswit[MARKUP_SWITCH]=1;
749 if (results->verylongline>0)
750 g_print(" --> %ld lines in this file are VERY long!\n",
751 results->verylongline);
753 * If there are more non-PG spaced dashes than PG em-dashes,
754 * assume it's deliberate.
755 * Current PG guidelines say don't use them, but older texts do,
756 * and some people insist on them whatever the guidelines say.
759 if (results->spacedash+results->non_PG_space_emdash>
760 results->PG_space_emdash)
763 g_print(" --> There are %ld spaced dashes and em-dashes. "
764 "Not reporting them.\n",
765 results->spacedash+results->non_PG_space_emdash);
771 /* Charset ISO_8859-1/ASCII checks for compatibility with gutcheck */
773 /* If more than a quarter of characters are hi-bit, bug out. */
774 if (results->binlen*4>results->totlen)
776 g_print(" --> This file does not appear to be ASCII. "
777 "Terminating. Best of luck with it!\n");
/* Fewer than a quarter of the characters are letters: not text. */
780 if (results->alphalen*4<results->totlen)
782 g_print(" --> This file does not appear to be text. "
783 "Terminating. Best of luck with it!\n");
/* More than 1% hi-bit characters, or more than 100 of them in total. */
786 if (results->binlen*100>results->totlen || results->binlen>100)
788 g_print(" --> There are a lot of foreign letters here. "
789 "Not reporting them.\n");
790 if (!pswit[VERBOSE_SWITCH])
794 warnings.isDutch=FALSE;
795 if (results->Dutchcount>50)
797 warnings.isDutch=TRUE;
798 g_print(" --> This looks like Dutch - "
799 "switching off dashes and warnings for 's Middags case.\n");
801 warnings.isFrench=FALSE;
802 if (results->Frenchcount>50)
804 warnings.isFrench=TRUE;
805 g_print(" --> This looks like French - "
806 "switching off some doublepunct.\n");
808 if (results->firstline && results->footerline)
809 g_print(" The PG header and footer appear to be already on.\n");
812 if (results->firstline)
813 g_print(" The PG header is on - no footer.\n");
814 if (results->footerline)
815 g_print(" The PG footer is on - no header.\n");
818 if (pswit[VERBOSE_SWITCH])
820 warnings.shortline=1;
829 g_print(" *** Verbose output is ON -- you asked for it! ***\n");
831 if (warnings.isDutch)
/* Header and footer both found, in order, but fewer than 100 lines apart. */
833 if (results->footerline>0 && results->firstline>0 &&
834 results->footerline>results->firstline &&
835 results->footerline-results->firstline<100)
837 g_print(" --> I don't really know where this text starts. \n");
838 g_print(" There are no reference points.\n");
839 g_print(" I'm going to have to report the header and footer "
841 results->firstline=0;
849 * Look along the line, accumulate the count of quotes, and see
850 * if this is an empty line - i.e. a line with nothing on it
852 * If line has just spaces, period, * and/or - on it, don't
853 * count it, since empty lines with asterisks or dashes to
854 * separate sections are common.
856 * Returns: TRUE if the line is empty.
/*
 * analyse_quotes:
 *
 * Walk aline one UTF-8 character at a time, updating the quote, bracket
 * and underscore tallies in counters, and work out whether the line is
 * "empty" in the sense described above.
 *
 * Single quotes are classified from their neighbours: an opening quote, an
 * apostrophe inside a word (ignored), or a possible closing quote scored
 * in guessquote.  A '[' that starts the line with "[Illustration:" and its
 * closing ']' at the end of a line are tracked under
 * COUNTER_ILLUSTRATION rather than as ordinary square brackets.
 * NOTE(review): several statements, including the double-quote handling
 * and the final guessquote decision, are not visible in this excerpt.
 */
858 gboolean analyse_quotes(const char *aline,struct counters *counters)
861 /* assume the line is empty until proven otherwise */
862 gboolean isemptyline=TRUE;
863 const char *s=aline,*sprev,*snext;
868 snext=g_utf8_next_char(s);
869 c=g_utf8_get_char(s);
872 if (CHAR_IS_SQUOTE(c))
877 * At start of line, it can only be an openquote.
878 * Hardcode a very common exception!
880 if (!g_str_has_prefix(snext,"tis") &&
881 !g_str_has_prefix(snext,"Tis"))
882 increment_matching(counters,c,TRUE);
884 else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
885 g_unichar_isalpha(g_utf8_get_char(snext)))
886 /* Do nothing! it's definitely an apostrophe, not a quote */
888 /* it's outside a word - let's check it out */
889 else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||
890 g_unichar_isalpha(g_utf8_get_char(snext)))
892 /* it damwell better BE an openquote */
893 if (!g_str_has_prefix(snext,"tis") &&
894 !g_str_has_prefix(snext,"Tis"))
895 /* hardcode a very common exception! */
896 increment_matching(counters,c,TRUE);
900 /* now - is it a closequote? */
901 guessquote=0; /* accumulate clues */
902 if (g_unichar_isalpha(g_utf8_get_char(sprev)))
904 /* it follows a letter - could be either */
906 if (g_utf8_get_char(sprev)=='s')
908 /* looks like a plural apostrophe */
910 if (g_utf8_get_char(snext)==CHAR_SPACE)
915 /* it doesn't have a letter either side */
916 else if (strchr(".?!,;:",g_utf8_get_char(sprev)) &&
917 strchr(".?!,;: ",g_utf8_get_char(snext)))
918 guessquote+=8; /* looks like a closequote */
921 if (matching_difference(counters,CHAR_SQUOTE)>0)
923 * Give it the benefit of some doubt,
924 * if a squote is already open.
930 increment_matching(counters,c,FALSE);
933 if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
935 isemptyline=FALSE; /* ignore lines like * * * as spacers */
936 if (c==CHAR_UNDERSCORE)
937 counters->c_unders++;
938 if (c==CHAR_OPEN_SBRACK)
/* Only an unnested "[Illustration:" at the very start of the line qualifies. */
940 if (!matching_difference(counters,COUNTER_ILLUSTRATION) &&
941 !matching_difference(counters,c) && s==aline &&
942 g_str_has_prefix(s,"[Illustration:"))
943 increment_matching(counters,COUNTER_ILLUSTRATION,TRUE);
945 increment_matching(counters,c,TRUE);
947 else if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK)
948 increment_matching(counters,c,TRUE);
949 if (c==CHAR_CLOSE_SBRACK)
/* !*snext: the ']' is the last character on the line. */
951 if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) &&
952 !matching_difference(counters,c) && !*snext)
953 increment_matching(counters,COUNTER_ILLUSTRATION,FALSE);
955 increment_matching(counters,c,FALSE);
957 else if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK)
958 increment_matching(counters,c,FALSE);
966 * check_for_control_characters:
968 * Check for invalid or questionable characters in the line
969 * Anything above 127 is invalid for plain ASCII, and
970 * non-printable control characters should also be flagged.
971 * Tabs should generally not be there.
/*
 * Scans aline for a character below CHAR_SPACE other than LF, CR and TAB.
 * When one is found the line is echoed if echoing is on, and outside
 * overview mode a query giving the line number, the 1-based column and
 * the character value is printed.
 */
973 void check_for_control_characters(const char *aline)
977 for (s=aline;*s;s=g_utf8_next_char(s))
979 c=g_utf8_get_char(s);
980 if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
982 if (pswit[ECHO_SWITCH])
983 g_print("\n%s\n",aline);
984 if (!pswit[OVERVIEW_SWITCH])
985 g_print(" Line %ld column %ld - Control character %u\n",
/*
 * g_utf8_pointer_to_offset() takes the start of the string first and the
 * position second; with the arguments the other way round the offset is
 * zero or negative instead of the character's column.
 */
986 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
994 * check_for_odd_characters:
996 * Check for binary and other odd characters.
/*
 * Scans aline character by character and queries, at most once per line
 * for each kind:
 *   - characters outside ASCII when warnings->bin is set, worded as
 *     "Non-ISO-8859" or "Non-ASCII" depending on the code point;
 *   - when a charset is configured: unassigned or Private Use code points
 *     if no converter is open (UNICODE), otherwise characters that the
 *     converter cannot represent in that charset;
 *   - tab, tilde, caret and, when warnings->fslash is set, forward slash;
 *   - in paranoid mode with warnings->ast set, on a non-empty line, the
 *     asterisk query.
 * Each query echoes the line when echoing is on and prints the line number
 * and 1-based column outside overview mode.
 */
998 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
999 gboolean isemptyline)
1001 /* Don't repeat multiple warnings on one line. */
1002 gboolean eInvalidChar=FALSE,eTab=FALSE,eTilde=FALSE;
1003 gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
1008 for (s=aline;*s;s=g_utf8_next_char(s))
1010 c=g_utf8_get_char(s);
1011 if (warnings->bin && !eInvalidChar &&
1012 (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
1014 if (pswit[ECHO_SWITCH])
1015 g_print("\n%s\n",aline);
1016 if (!pswit[OVERVIEW_SWITCH])
/* 128..159 and anything above 255 has no printable ISO-8859-1 meaning. */
1017 if (c>127 && c<160 || c>255)
1018 g_print(" Line %ld column %ld - "
1019 "Non-ISO-8859 character %u\n",
1020 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1022 g_print(" Line %ld column %ld - "
1023 "Non-ASCII character %u\n",
1024 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1029 if (!eInvalidChar && charset)
/* No converter open means the charset is UNICODE: check the code point itself. */
1031 if (charset_validator==(GIConv)-1)
1033 if (!g_unichar_isdefined(c))
1035 if (pswit[ECHO_SWITCH])
1036 g_print("\n%s\n",aline);
1037 if (!pswit[OVERVIEW_SWITCH])
1038 g_print(" Line %ld column %ld - Unassigned UNICODE "
1039 "code point U+%04" G_GINT32_MODIFIER "X\n",
1040 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
/*
 * Private Use ranges: U+E000..U+F8FF, U+F0000..U+FFFFD (plane 15) and
 * U+100000..U+10FFFD (plane 16).  The last lower bound must be the
 * hexadecimal constant; decimal 100000 is U+186A0 and would report every
 * defined character from there upwards as Private Use.
 */
1045 else if (c>=0xE000 && c<=0xF8FF || c>=0xF0000 && c<=0xFFFFD ||
1046 c>=0x100000 && c<=0x10FFFD)
1048 if (pswit[ECHO_SWITCH])
1049 g_print("\n%s\n",aline);
1050 if (!pswit[OVERVIEW_SWITCH])
1051 g_print(" Line %ld column %ld - Private Use "
1052 "character U+%04" G_GINT32_MODIFIER "X\n",
1053 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
/* Otherwise try to convert this one character into the configured charset. */
1061 t=g_convert_with_iconv(s,g_utf8_next_char(s)-s,
1062 charset_validator,NULL,&nb,NULL);
1067 if (pswit[ECHO_SWITCH])
1068 g_print("\n%s\n",aline);
1069 if (!pswit[OVERVIEW_SWITCH])
1070 g_print(" Line %ld column %ld - Non-%s "
1071 "character %u\n",linecnt,
1072 g_utf8_pointer_to_offset(aline,s)+1,charset,c);
1079 if (!eTab && c==CHAR_TAB)
1081 if (pswit[ECHO_SWITCH])
1082 g_print("\n%s\n",aline);
1083 if (!pswit[OVERVIEW_SWITCH])
1084 g_print(" Line %ld column %ld - Tab character?\n",
1085 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1090 if (!eTilde && c==CHAR_TILDE)
1093 * Often used by OCR software to indicate an
1094 * unrecognizable character.
1096 if (pswit[ECHO_SWITCH])
1097 g_print("\n%s\n",aline);
1098 if (!pswit[OVERVIEW_SWITCH])
1099 g_print(" Line %ld column %ld - Tilde character?\n",
1100 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1105 if (!eCarat && c==CHAR_CARAT)
1107 if (pswit[ECHO_SWITCH])
1108 g_print("\n%s\n",aline);
1109 if (!pswit[OVERVIEW_SWITCH])
1110 g_print(" Line %ld column %ld - Carat character?\n",
1111 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1116 if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
1118 if (pswit[ECHO_SWITCH])
1119 g_print("\n%s\n",aline);
1120 if (!pswit[OVERVIEW_SWITCH])
1121 g_print(" Line %ld column %ld - Forward slash?\n",
1122 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1128 * Report asterisks only in paranoid mode,
1129 * since they're often deliberate.
1131 if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
1134 if (pswit[ECHO_SWITCH])
1135 g_print("\n%s\n",aline);
1136 if (!pswit[OVERVIEW_SWITCH])
1137 g_print(" Line %ld column %ld - Asterisk?\n",
1138 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1147 * check_for_long_line:
1149 * Check for line too long.
/*
 * Queries aline when it holds more than LONGEST_PG_LINE characters
 * (counted as UTF-8 characters, not bytes).  The line is echoed if echoing
 * is on; outside overview mode the query is printed with the line length
 * used both as the column and as the reported length.
 */
1151 void check_for_long_line(const char *aline)
1153 if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
1155 if (pswit[ECHO_SWITCH])
1156 g_print("\n%s\n",aline);
1157 if (!pswit[OVERVIEW_SWITCH])
1158 g_print(" Line %ld column %ld - Long line %ld\n",
1159 linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
1166 * check_for_short_line:
1168 * Check for line too short.
1170 * This one is a bit trickier to implement: we don't want to
1171 * flag the last line of a paragraph for being short, so we
1172 * have to wait until we know that our current line is a
1173 * "normal" line, then report the _previous_ line if it was too
1174 * short. We also don't want to report indented lines like
1175 * chapter heads or formatted quotations. We therefore keep
1176 * last->len as the length of the last line examined, and
1177 * last->blen as the length of the last but one, and try to
1178 * suppress unnecessary warnings by checking that both were of
1179 * "normal" length. We keep the first character of the last
1180 * line in last->start, and if it was a space, we assume that
1181 * the formatting is deliberate. I can't figure out a way to
1182 * distinguish something like a quoted verse left-aligned or
1183 * the header or footer of a letter from a paragraph of short
1184 * lines - maybe if I examined the whole paragraph, and if the
1185 * para has less than, say, 8 lines and if all lines are short,
1186 * then just assume it's OK? Need to look at some texts to see
1187 * how often a formula like this would get the right result.
/*
 * Query the previous line as too short, decided once the current line is
 * available.
 *
 * aline: the current line; it must be longer than one character.
 * last:  properties of earlier lines; len, blen and start are read.
 *
 * The query is raised only when last->len lies strictly between 1 and
 * SHORTEST_PG_LINE, last->blen is greater than SHORTEST_PG_LINE and
 * last->start is not a space. The text echoed and measured is prevline,
 * which is not a parameter of this function, and the line number printed
 * is linecnt-1.
 * NOTE(review): some lines of this function are absent from this listing.
 */
1189 void check_for_short_line(const char *aline,const struct line_properties *last)
1191 if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
1192 last->len<SHORTEST_PG_LINE && last->blen>1 &&
1193 last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
1195 if (pswit[ECHO_SWITCH])
1196 g_print("\n%s\n",prevline);
1197 if (!pswit[OVERVIEW_SWITCH])
/* Column and length are both the character count of prevline. */
1198 g_print(" Line %ld column %ld - Short line %ld?\n",
1199 linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
1206 * check_for_starting_punctuation:
1208 * Look for punctuation other than full ellipses at start of line.
/*
 * Query a line whose first character is one of . ? ! , ; : unless the
 * line starts with the spaced ellipsis ". . .".
 *
 * aline: the line under test. The *aline test keeps an empty line from
 *        reaching g_utf8_strchr (presumably because a search for NUL
 *        would succeed - confirm).
 *
 * The column in the message is the literal 1.
 * NOTE(review): the argument line of the final g_print and other short
 * lines are absent from this listing.
 */
1210 void check_for_starting_punctuation(const char *aline)
1212 if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
1213 !g_str_has_prefix(aline,". . ."))
1215 if (pswit[ECHO_SWITCH])
1216 g_print("\n%s\n",aline);
1217 if (!pswit[OVERVIEW_SWITCH])
1218 g_print(" Line %ld column 1 - Begins with punctuation?\n",
1226 * check_for_spaced_emdash:
1228 * Check for spaced em-dashes.
1230 * We must check _all_ occurrences of "--" on the line
1231 * hence the loop - even if the first double-dash is OK
1232 * there may be another that's wrong later on.
/*
 * Query every "--" on the line that has a space immediately before it or
 * immediately after it.
 *
 * aline: the line under test.
 *
 * The reported column is the 1-based character offset of the first dash.
 * NOTE(review): brace-only and other short lines are absent from this
 * listing.
 */
1234 void check_for_spaced_emdash(const char *aline)
1236 const char *s,*t,*next;
/* The loop ends when strstr finds no further "--"; each pass resumes at next. */
1237 for (s=aline;t=strstr(s,"--");s=next)
/* next is the character just past the two dashes. */
1239 next=g_utf8_next_char(g_utf8_next_char(t));
/*
 * && binds tighter than ||: (not at line start and preceded by a space)
 * or (followed by a space). The t>aline test guards the step backwards.
 */
1240 if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
1241 g_utf8_get_char(next)==CHAR_SPACE)
1243 if (pswit[ECHO_SWITCH])
1244 g_print("\n%s\n",aline);
1245 if (!pswit[OVERVIEW_SWITCH])
1246 g_print(" Line %ld column %ld - Spaced em-dash?\n",
1247 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1255 * check_for_spaced_dash:
1257 * Check for spaced dashes.
/*
 * Query a single dash with a space beside it (" -" or "- "), leaving
 * double dashes alone.
 *
 * aline: the line under test.
 *
 * Only the first " -" on the line is examined; strstr is called once per
 * pattern, with no loop visible.
 * NOTE(review): judging by the skipped numbering, the "- " search appears
 * to run only when the line contains no " -" at all - confirm against the
 * full source.
 */
1259 void check_for_spaced_dash(const char *aline)
1262 if ((s=strstr(aline," -")))
/* s is at the space; the character two further on must not be a second dash. */
1264 if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
1266 if (pswit[ECHO_SWITCH])
1267 g_print("\n%s\n",aline);
1268 if (!pswit[OVERVIEW_SWITCH])
1269 g_print(" Line %ld column %ld - Spaced dash?\n",
1270 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1275 else if ((s=strstr(aline,"- ")))
/* s is at the dash; it is queried at line start or when not preceded by a dash. */
1277 if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
1279 if (pswit[ECHO_SWITCH])
1280 g_print("\n%s\n",aline);
1281 if (!pswit[OVERVIEW_SWITCH])
1282 g_print(" Line %ld column %ld - Spaced dash?\n",
1283 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1291 * check_for_unmarked_paragraphs:
1293 * Check for unmarked paragraphs indicated by separate speakers.
1295 * May well be false positive:
1296 * "Bravo!" "Wonderful!" called the crowd.
1297 * but useful all the same.
/*
 * Query a closing double quote followed by a space and an opening double
 * quote on the same line, which may indicate a missing paragraph break
 * between two speakers.
 *
 * aline: the line under test.
 *
 * The reported column is the 1-based character offset of the first quote.
 * NOTE(review): the two strstr calls look identical in this listing. If
 * whitespace was collapsed, one of them may originally search for a
 * different number of spaces - confirm against the full source.
 */
1299 void check_for_unmarked_paragraphs(const char *aline)
1302 s=strstr(aline,"\" \"");
1304 s=strstr(aline,"\" \"");
1307 if (pswit[ECHO_SWITCH])
1308 g_print("\n%s\n",aline);
1309 if (!pswit[OVERVIEW_SWITCH])
1310 g_print(" Line %ld column %ld - "
1311 "Query missing paragraph break?\n",
1312 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1319 * check_for_jeebies:
1321 * Check for "to he" and other easy h/b errors.
1323 * This is a very inadequate effort on the h/b problem,
1324 * but the phrase "to he" is always an error, whereas "to
1325 * be" is quite common.
1326 * Similarly, '"Quiet!", be said.' is a non-be error
1327 * "to he" is _not_ always an error!:
1328 * "Where they went to he couldn't say."
1329 * Another false positive:
1330 * What would "Cinderella" be without the . . .
1331 * and another: "If he wants to he can see for himself."
/*
 * Query fixed phrases that usually result from h/b confusion, in three
 * groups: he/be, had/bad and hut/but.
 *
 * aline: the line under test.
 *
 * Every pattern is a literal substring search with strstr, so matching is
 * case-sensitive and needs the surrounding spaces or punctuation exactly
 * as written. Each group prints at most one query, at the 1-based
 * character offset of the match held in s.
 * NOTE(review): the tests between the searches are absent from this
 * listing, so which pattern wins when several are present is not shown.
 */
1333 void check_for_jeebies(const char *aline)
/* Group 1: he/be. */
1336 s=strstr(aline," be could ");
1338 s=strstr(aline," be would ");
1340 s=strstr(aline," was be ");
1342 s=strstr(aline," be is ");
1344 s=strstr(aline," is be ");
1346 s=strstr(aline,"\", be ");
/*
 * NOTE(review): the next two patterns are identical as listed; one may
 * originally have held extra spaces - confirm.
 */
1348 s=strstr(aline,"\" be ");
1350 s=strstr(aline,"\" be ");
1352 s=strstr(aline," to he ");
1355 if (pswit[ECHO_SWITCH])
1356 g_print("\n%s\n",aline);
1357 if (!pswit[OVERVIEW_SWITCH])
1358 g_print(" Line %ld column %ld - Query he/be error?\n",
1359 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Group 2: had/bad. */
1363 s=strstr(aline," the had ");
1365 s=strstr(aline," a had ");
1367 s=strstr(aline," they bad ");
1369 s=strstr(aline," she bad ");
1371 s=strstr(aline," he bad ");
1373 s=strstr(aline," you bad ");
/*
 * NOTE(review): this matches a lowercase i only; unless the caller
 * lowercases aline, the pronoun "I" is never matched - confirm.
 */
1375 s=strstr(aline," i bad ");
1378 if (pswit[ECHO_SWITCH])
1379 g_print("\n%s\n",aline);
1380 if (!pswit[OVERVIEW_SWITCH])
1381 g_print(" Line %ld column %ld - Query had/bad error?\n",
1382 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Group 3: hut/but after a semicolon or comma. */
1386 s=strstr(aline,"; hut ");
1388 s=strstr(aline,", hut ");
1391 if (pswit[ECHO_SWITCH])
1392 g_print("\n%s\n",aline);
1393 if (!pswit[OVERVIEW_SWITCH])
1394 g_print(" Line %ld column %ld - Query hut/but error?\n",
1395 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1402 * check_for_mta_from:
1404 * Special case - angled bracket in front of "From" placed there by an
1405 * MTA when sending an e-mail.
/*
 * Query the first occurrence of ">From" anywhere on the line.
 *
 * aline: the line under test.
 *
 * The search is a case-sensitive strstr, and the reported column is the
 * 1-based character offset of the angle bracket.
 * NOTE(review): the test on s and other short lines are absent from this
 * listing.
 */
1407 void check_for_mta_from(const char *aline)
1410 s=strstr(aline,">From");
1413 if (pswit[ECHO_SWITCH])
1414 g_print("\n%s\n",aline);
1415 if (!pswit[OVERVIEW_SWITCH])
1416 g_print(" Line %ld column %ld - "
1417 "Query angled bracket with From\n",
1418 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1425 * check_for_orphan_character:
1427 * Check for a single character line -
1428 * often an overflow from bad wrapping.
/*
 * Query a line that consists of exactly one character.
 *
 * aline: the line under test.
 *
 * The letters I, V, X and L and any Unicode digit are ignored, as they
 * may be numerals standing alone. The column in the message is the
 * literal 1.
 * NOTE(review): the branch keyword between the ignore case and the report,
 * and the argument line of the final g_print, are absent from this
 * listing.
 */
1430 void check_for_orphan_character(const char *aline)
1433 c=g_utf8_get_char(aline);
/* Non-empty line whose second character is the terminator. */
1434 if (c && !*g_utf8_next_char(aline))
1436 if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
1437 ; /* Nothing - ignore numerals alone on a line. */
1440 if (pswit[ECHO_SWITCH])
1441 g_print("\n%s\n",aline);
1442 if (!pswit[OVERVIEW_SWITCH])
1443 g_print(" Line %ld column 1 - Query single character line\n",
1452 * check_for_pling_scanno:
1454 * Check for I" - often should be !
/*
 * Query the first occurrence of a space, a capital I and a double quote,
 * where the I is often a misread exclamation mark.
 *
 * aline: the line under test.
 *
 * NOTE(review): the column printed is the 0-based character offset of the
 * space, with no +1, whereas most other checks in this file add 1.
 * Confirm whether that is intended.
 * NOTE(review): the test on s and other short lines are absent from this
 * listing.
 */
1456 void check_for_pling_scanno(const char *aline)
1459 s=strstr(aline," I\"");
1462 if (pswit[ECHO_SWITCH])
1463 g_print("\n%s\n",aline);
1464 if (!pswit[OVERVIEW_SWITCH])
1465 g_print(" Line %ld column %ld - Query I=exclamation mark?\n",
1466 linecnt,g_utf8_pointer_to_offset(aline,s));
1473 * check_for_extra_period:
1475 * Check for period without a capital letter. Cut-down from gutspell.
1476 * Only works when it happens on a single line.
/*
 * In paranoid mode, query a period followed by a space when the next
 * letter on the line is lowercase, since the period may be spurious.
 *
 * aline:    the line under test.
 * warnings: isDutch is read to skip a Dutch apostrophe construction.
 *
 * For each ". " the visible code skips the match when the character
 * before the period is not alphabetic, looks ahead for the next letter or
 * digit, and on a lowercase letter walks back to isolate the word before
 * the period (testword). That word is compared with the abbrev list and
 * tested for a leading digit, a single character, a Roman numeral and
 * vowels. A query is recorded in the qperiod tree, and a word already in
 * the tree is not queried again unless the verbose switch is set.
 * NOTE(review): many lines are absent from this listing (the embedded
 * numbering skips), including the flags that combine these tests, so the
 * exact decision logic is not described.
 */
1478 void check_for_extra_period(const char *aline,const struct warnings *warnings)
1480 const char *s,*t,*s1,*sprev;
1485 gunichar c,nc,pc,*decomposition;
1486 if (pswit[PARANOID_SWITCH])
/* The loop ends when strstr finds no further ". ". */
1488 for (t=aline;t=strstr(t,". ");)
1492 t=g_utf8_next_char(t);
1493 /* start of line punctuation is handled elsewhere */
1496 if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
1498 t=g_utf8_next_char(t);
1501 if (warnings->isDutch)
1503 /* For Frank & Jeroen -- 's Middags case */
1504 gunichar c2,c3,c4,c5;
/*
 * NOTE(review): g_utf8_offset_to_pointer does not stop at the terminating
 * NUL, so on a line that ends soon after the period these offsets may
 * point past the end of the string - confirm.
 */
1505 c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
1506 c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
1507 c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
1508 c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
1509 if (CHAR_IS_APOSTROPHE(c2) &&
1510 g_unichar_islower(c3) && c4==CHAR_SPACE &&
1511 g_unichar_isupper(c5))
1513 t=g_utf8_next_char(t);
/* Look ahead, past the period and the space, for the next letter or digit. */
1517 s1=g_utf8_next_char(g_utf8_next_char(t));
/*
 * NOTE(review): isdigit() is given a gunichar here. The C library only
 * defines it for unsigned char values and EOF; g_unichar_isdigit, used
 * elsewhere in this function, looks like the intended call - confirm.
 */
1518 while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
1519 !isdigit(g_utf8_get_char(s1)))
1520 s1=g_utf8_next_char(s1);
1521 if (g_unichar_islower(g_utf8_get_char(s1)))
1523 /* we have something to investigate */
1525 /* so let's go back and find out */
1526 nc=g_utf8_get_char(t);
1527 s1=g_utf8_prev_char(t);
1528 c=g_utf8_get_char(s1);
1529 sprev=g_utf8_prev_char(s1);
1530 pc=g_utf8_get_char(sprev);
/* A word character is a letter, a digit, or an apostrophe between two letters. */
1532 (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||
1533 g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&
1534 g_unichar_isalpha(nc)))
1539 sprev=g_utf8_prev_char(s1);
1540 pc=g_utf8_get_char(sprev);
1542 s1=g_utf8_next_char(s1);
/* testword is a heap copy of the word before the period. */
1545 testword=g_strndup(s1,s-s1);
1547 testword=g_strdup(s1);
/* The abbrev list ends with an empty string. */
1548 for (i=0;*abbrev[i];i++)
1549 if (!strcmp(testword,abbrev[i]))
1551 if (g_unichar_isdigit(g_utf8_get_char(testword)))
1553 if (!*g_utf8_next_char(testword))
1555 if (isroman(testword))
1560 for (s=testword;*s;s=g_utf8_next_char(s))
/* Decompose so that the base letter of an accented vowel is what gets tested. */
1562 decomposition=g_unicode_canonical_decomposition(
1563 g_utf8_get_char(s),&len);
1564 if (g_utf8_strchr("aeiou",-1,decomposition[0]))
1566 g_free(decomposition);
1570 (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
/* The tree receives its own copy of the word as key. */
1572 g_tree_insert(qperiod,g_strdup(testword),
1573 GINT_TO_POINTER(1));
1574 if (pswit[ECHO_SWITCH])
1575 g_print("\n%s\n",aline);
1576 if (!pswit[OVERVIEW_SWITCH])
1577 g_print(" Line %ld column %ld - Extra period?\n",
1578 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1584 t=g_utf8_next_char(t);
1590 * check_for_following_punctuation:
1592 * Check for words usually not followed by punctuation.
/*
 * When the typo switch is set, query words that are rarely followed by
 * punctuation but are followed by it on this line.
 *
 * aline: the line under test.
 *
 * The lowercased word (inword) is compared with two lists, each ended by
 * an empty string. A nocomma word is queried when the character at s is
 * a comma, semicolon or colon; a noperiod word is queried when it is a
 * period or exclamation mark. The column is the 1-based character offset
 * of s.
 * NOTE(review): the word-extraction loop and the declaration of t are
 * absent from this listing, so how s and t are positioned is not shown.
 */
1594 void check_for_following_punctuation(const char *aline)
1597 const char *s,*wordstart;
1600 if (pswit[TYPO_SWITCH])
1611 inword=g_utf8_strdown(t,-1);
1613 for (i=0;*nocomma[i];i++)
1614 if (!strcmp(inword,nocomma[i]))
1616 c=g_utf8_get_char(s);
1617 if (c==',' || c==';' || c==':')
1619 if (pswit[ECHO_SWITCH])
1620 g_print("\n%s\n",aline);
1621 if (!pswit[OVERVIEW_SWITCH])
1622 g_print(" Line %ld column %ld - "
1623 "Query punctuation after %s?\n",
1624 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
1630 for (i=0;*noperiod[i];i++)
1631 if (!strcmp(inword,noperiod[i]))
1633 c=g_utf8_get_char(s);
1634 if (c=='.' || c=='!')
1636 if (pswit[ECHO_SWITCH])
1637 g_print("\n%s\n",aline);
1638 if (!pswit[OVERVIEW_SWITCH])
1639 g_print(" Line %ld column %ld - "
1640 "Query punctuation after %s?\n",
1641 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
1655 * Check for commonly mistyped words,
1656 * and digits like 0 for O in a word.
/*
 * Run each word of the line through a series of typo and scanno tests.
 *
 * aline:    the line under test.
 * warnings: the digit field is read for the standalone 0/1 query.
 *
 * Visible stages, in order:
 *  - words are taken with getaword(), and a word for which mixdigit()
 *    is true is queried as containing a digit;
 *  - with the typo or user-typo switch set, the word is scanned for an
 *    uppercase letter after a lowercase one, and a case-folded copy
 *    (testword) is made;
 *  - with the typo switch set, testword is tested against the nostart
 *    and noend lists, a set of unlikely letter sequences, a check for
 *    vowels and consonants, the okword list, isroman(), the typo list
 *    and a set of suspicious single letters; duplicate words are counted
 *    in the qword tree;
 *  - the user's own list (usertypo) is consulted;
 *  - in paranoid mode with warnings->digit set, a bare "0" or "1" is
 *    queried.
 * NOTE(review): the assignments that record the outcome of each test are
 * absent from this listing (the embedded numbering skips), so the exact
 * combination of the tests is not described.
 * NOTE(review): the first two queries print the word's character offset
 * plus 1, the last two plus 2 - confirm that the difference is intended.
 */
1658 void check_for_typos(const char *aline,struct warnings *warnings)
1660 const char *s,*t,*nt,*wordstart;
1662 gunichar *decomposition;
1664 int i,vowel,consonant,*dupcnt;
1665 gboolean isdup,istypo,alower;
1668 gsize decomposition_len;
1672 inword=getaword(&s);
1676 continue; /* don't bother with empty lines */
1678 if (mixdigit(inword))
1680 if (pswit[ECHO_SWITCH])
1681 g_print("\n%s\n",aline);
1682 if (!pswit[OVERVIEW_SWITCH])
1683 g_print(" Line %ld column %ld - Query digit in %s\n",
1684 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
1689 * Put the word through a series of tests for likely typos and OCR
1692 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
1696 for (t=inword;*t;t=g_utf8_next_char(t))
1698 c=g_utf8_get_char(t);
1699 nt=g_utf8_next_char(t);
1700 /* lowercase for testing */
1701 if (g_unichar_islower(c))
1703 if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
1706 * We have an uppercase mid-word. However, there are
1708 * Mac and Mc like McGill
1709 * French contractions like l'Abbe
1711 offset=g_utf8_pointer_to_offset(inword,t);
1713 pc=g_utf8_get_char(g_utf8_prev_char(t));
/*
 * NOTE(review): c has just tested as uppercase or titlecase, so the
 * comparisons with a lowercase m below look unreachable unless a line
 * absent from this listing changes c - confirm.
 */
1716 if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
1717 offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
1718 g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
1719 CHAR_IS_APOSTROPHE(pc))
1725 testword=g_utf8_casefold(inword,-1);
1727 if (pswit[TYPO_SWITCH])
1730 * Check for certain unlikely two-letter combinations at word
1733 len=g_utf8_strlen(testword,-1);
1736 for (i=0;*nostart[i];i++)
1737 if (g_str_has_prefix(testword,nostart[i]))
1739 for (i=0;*noend[i];i++)
1740 if (g_str_has_suffix(testword,noend[i]))
1743 /* ght is common, gbt never. Like that. */
1744 if (strstr(testword,"cb"))
1746 if (strstr(testword,"gbt"))
1748 if (strstr(testword,"pbt"))
1750 if (strstr(testword,"tbs"))
1752 if (strstr(testword,"mrn"))
1754 if (strstr(testword,"ahle"))
1756 if (strstr(testword,"ihle"))
1759 * "TBE" does happen - like HEARTBEAT - but uncommon.
1760 * Also "TBI" - frostbite, outbid - but uncommon.
1761 * Similarly "ii" like Hawaii, or Pompeii, and in Roman
1762 * numerals, but "ii" is a common scanno.
1764 if (strstr(testword,"tbi"))
1766 if (strstr(testword,"tbe"))
1768 if (strstr(testword,"ii"))
1771 * Check for no vowels or no consonants.
1772 * If none, flag a typo.
1774 if (!istypo && len>1)
1777 for (t=testword;*t;t=g_utf8_next_char(t))
1779 c=g_utf8_get_char(t);
1781 g_unicode_canonical_decomposition(c,&decomposition_len);
1782 if (c=='y' || g_unichar_isdigit(c))
1784 /* Yah, this is loose. */
1788 else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
1792 g_free(decomposition);
1794 if (!vowel || !consonant)
1798 * Now exclude the word from being reported if it's in
1801 for (i=0;*okword[i];i++)
1802 if (!strcmp(testword,okword[i]))
1805 * What looks like a typo may be a Roman numeral.
1808 if (istypo && isroman(testword))
1810 /* Check the manual list of typos. */
1812 for (i=0;*typo[i];i++)
1813 if (!strcmp(testword,typo[i]))
1816 * Check lowercase s, l, i and m - special cases.
1817 * "j" - often a semi-colon gone wrong.
1818 * "d" for a missing apostrophe - he d
1821 if (!istypo && len==1 &&
1822 g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
1826 dupcnt=g_tree_lookup(qword,testword);
1830 isdup=!pswit[VERBOSE_SWITCH];
1834 dupcnt=g_new0(int,1);
1835 g_tree_insert(qword,g_strdup(testword),dupcnt);
1840 if (pswit[ECHO_SWITCH])
1841 g_print("\n%s\n",aline);
1842 if (!pswit[OVERVIEW_SWITCH])
1844 g_print(" Line %ld column %ld - Query word %s",
1845 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
1847 if (!pswit[VERBOSE_SWITCH])
1848 g_print(" - not reporting duplicates");
1856 /* check the user's list of typos */
1857 if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
1859 if (pswit[ECHO_SWITCH])
1860 g_print("\n%s\n",aline);
1861 if (!pswit[OVERVIEW_SWITCH])
1862 g_print(" Line %ld column %ld - Query possible scanno %s\n",
1863 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
1865 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
1867 if (pswit[PARANOID_SWITCH] && warnings->digit)
1869 /* In paranoid mode, query all 0 and 1 standing alone. */
1870 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
1872 if (pswit[ECHO_SWITCH])
1873 g_print("\n%s\n",aline);
1874 if (!pswit[OVERVIEW_SWITCH])
1875 g_print(" Line %ld column %ld - Query standalone %s\n",
1876 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
1887 * check_for_misspaced_punctuation:
1889 * Look for added or missing spaces around punctuation and quotes.
1890 * If there is a punctuation character like ! with no space on
1891 * either side, suspect a missing!space. If there are spaces on
1892 * both sides , assume a typo. If we see a double quote with no
1893 * space or punctuation on either side of it, assume unspaced
1894 * quotes "like"this.
/*
 * Query added or missing spaces around punctuation and quotes.
 *
 * aline:       the line under test.
 * parities:    running open/close state of double and single quotes;
 *              dquote and squote are toggled here.
 * isemptyline: suppresses some of the spaced-punctuation queries.
 *
 * The line is scanned several times, each scan keeping the previous,
 * current and next characters in pc, c and nc:
 *  1. punctuation from ".?!,;:_" with a letter after it and either a
 *     letter before it or a strict mark ("?!,;:") - "Missing space?";
 *     the same marks with a space before and a space or end of line
 *     after - "Spaced punctuation?";
 *  2. "?!,;:" preceded by a space and not followed by one;
 *  3. a period preceded by a space and followed by a letter;
 *  4. quotes with no space or punctuation on either side - "Unspaced
 *     quotes?";
 *  5. double-quote parity, querying a quote whose neighbour does not suit
 *     its role - "Wrongspaced quotes?";
 *  6. a double quote at the start of the line followed by closing
 *     punctuation or a space;
 *  7. with the single-quote switch set, the same parity check for single
 *     quotes.
 * NOTE(review): the lines that advance pc and c, several conditions and
 * the acronym and ellipsis flags are absent from this listing (the
 * embedded numbering skips), so those details are not described.
 */
1896 void check_for_misspaced_punctuation(const char *aline,
1897 struct parities *parities,gboolean isemptyline)
1899 gboolean isacro,isellipsis;
1901 gunichar c,nc,pc,n2c;
/* Scan 1. */
1902 c=g_utf8_get_char(aline);
1903 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1904 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1908 nc=g_utf8_get_char(g_utf8_next_char(s));
1909 /* For each character in the line after the first. */
1910 if (g_utf8_strchr(".?!,;:_",-1,c)) /* if it's punctuation */
1912 /* we need to suppress warnings for acronyms like M.D. */
1914 /* we need to suppress warnings for ellipsis . . . */
1917 * If there are letters on both sides of it or
1918 * if it's strict punctuation followed by an alpha.
1920 if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
1921 g_utf8_strchr("?!,;:",-1,c)))
1925 if (g_utf8_pointer_to_offset(aline,s)>2 &&
1926 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
1928 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
1934 if (pswit[ECHO_SWITCH])
1935 g_print("\n%s\n",aline);
1936 if (!pswit[OVERVIEW_SWITCH])
1937 g_print(" Line %ld column %ld - Missing space?\n",
1938 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1943 if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
1946 * If there are spaces on both sides,
1947 * or space before and end of line.
1951 if (g_utf8_pointer_to_offset(aline,s)>2 &&
1952 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
1954 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
1958 if (!isemptyline && !isellipsis)
1960 if (pswit[ECHO_SWITCH])
1961 g_print("\n%s\n",aline);
1962 if (!pswit[OVERVIEW_SWITCH])
1963 g_print(" Line %ld column %ld - "
1964 "Spaced punctuation?\n",linecnt,
1965 g_utf8_pointer_to_offset(aline,s)+1);
/* Scan 2. */
1972 /* Split out the characters that CANNOT be preceded by space. */
1973 c=g_utf8_get_char(aline);
1974 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1975 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1979 nc=g_utf8_get_char(g_utf8_next_char(s));
1980 /* for each character in the line after the first */
1981 if (g_utf8_strchr("?!,;:",-1,c))
1983 /* if it's punctuation that _cannot_ have a space before it */
1984 if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
1987 * If nc DOES == space,
1988 * it was already reported just above.
1990 if (pswit[ECHO_SWITCH])
1991 g_print("\n%s\n",aline);
1992 if (!pswit[OVERVIEW_SWITCH])
1993 g_print(" Line %ld column %ld - Spaced punctuation?\n",
1994 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Scan 3. */
2001 * Special case " .X" where X is any alpha.
2002 * This plugs a hole in the acronym code above.
2003 * Inelegant, but maintainable.
2005 c=g_utf8_get_char(aline);
2006 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2007 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2011 nc=g_utf8_get_char(g_utf8_next_char(s));
2012 /* for each character in the line after the first */
2015 /* if it's a period */
2016 if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
2019 * If the period follows a space and
2020 * is followed by a letter.
2022 if (pswit[ECHO_SWITCH])
2023 g_print("\n%s\n",aline);
2024 if (!pswit[OVERVIEW_SWITCH])
2025 g_print(" Line %ld column %ld - Spaced punctuation?\n",
2026 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Scan 4. */
2032 c=g_utf8_get_char(aline);
2033 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2034 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2038 nc=g_utf8_get_char(g_utf8_next_char(s));
2039 /* for each character in the line after the first */
/* && binds tighter than ||, so this is (A && B && nc) || (C && D). */
2042 if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
2043 !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
2044 !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
2046 if (pswit[ECHO_SWITCH])
2047 g_print("\n%s\n",aline);
2048 if (!pswit[OVERVIEW_SWITCH])
2049 g_print(" Line %ld column %ld - Unspaced quotes?\n",
2050 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Scan 5. */
2056 /* Check parity of quotes. */
2057 nc=g_utf8_get_char(aline);
2058 for (s=aline;*s;s=g_utf8_next_char(s))
2061 nc=g_utf8_get_char(g_utf8_next_char(s));
2064 parities->dquote=!parities->dquote;
2065 if (!parities->dquote)
2068 if (!g_utf8_strchr("_-.'`/,;:!?)]} ",-1,nc))
2070 if (pswit[ECHO_SWITCH])
2071 g_print("\n%s\n",aline);
2072 if (!pswit[OVERVIEW_SWITCH])
2073 g_print(" Line %ld column %ld - "
2074 "Wrongspaced quotes?\n",
2075 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/*
 * NOTE(review): isdigit() is given a gunichar here. The C library only
 * defines it for unsigned char values and EOF; g_unichar_isdigit looks
 * like the intended call - confirm.
 */
2083 if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
2084 !g_utf8_strchr("_-/.'`([{$",-1,nc) || !nc)
2086 if (pswit[ECHO_SWITCH])
2087 g_print("\n%s\n",aline);
2088 if (!pswit[OVERVIEW_SWITCH])
2089 g_print(" Line %ld column %ld - "
2090 "Wrongspaced quotes?\n",
2091 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Scan 6. */
2098 if (g_utf8_get_char(aline)==CHAR_DQUOTE)
2100 if (g_utf8_strchr(",;:!?)]} ",-1,
2101 g_utf8_get_char(g_utf8_next_char(aline))))
2103 if (pswit[ECHO_SWITCH])
2104 g_print("\n%s\n",aline);
2105 if (!pswit[OVERVIEW_SWITCH])
2106 g_print(" Line %ld column 1 - Wrongspaced quotes?\n",
/* Scan 7. */
2112 if (pswit[SQUOTE_SWITCH])
2114 nc=g_utf8_get_char(aline);
2115 for (s=aline;*s;s=g_utf8_next_char(s))
2118 nc=g_utf8_get_char(g_utf8_next_char(s));
/* A single quote counts only when it is not between two letters. */
2119 if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&
2120 !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
2121 !g_unichar_isalpha(nc)))
2123 parities->squote=!parities->squote;
2124 if (!parities->squote)
2127 if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
2129 if (pswit[ECHO_SWITCH])
2130 g_print("\n%s\n",aline);
2131 if (!pswit[OVERVIEW_SWITCH])
2132 g_print(" Line %ld column %ld - "
2133 "Wrongspaced singlequotes?\n",
2134 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* NOTE(review): isdigit() on a gunichar again - see the note in scan 5. */
2142 if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
2143 !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
2145 if (pswit[ECHO_SWITCH])
2146 g_print("\n%s\n",aline);
2147 if (!pswit[OVERVIEW_SWITCH])
2148 g_print(" Line %ld column %ld - "
2149 "Wrongspaced singlequotes?\n",
2150 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2161 * check_for_double_punctuation:
2163 * Look for double punctuation like ,. or ,,
2164 * Thanks to DW for the suggestion!
2165 * In books with references, ".," and ".;" are common
2166 * e.g. "etc., etc.," and vol. 1.; vol 3.;
2167 * OTOH, from my initial tests, there are also fairly
2168 * common errors. What to do? Make these cases paranoid?
2169 * ".," is the most common, so warnings->dotcomma is used
2170 * to suppress detailed reporting if it occurs often.
/*
 * Query two adjacent punctuation characters from the set . ? ! , ; :
 *
 * aline:    the line under test.
 * warnings: dotcomma and isFrench are read to relax the check.
 *
 * Pairs that are not queried, according to the visible condition: a
 * doubled period, question mark or exclamation mark; ".," when
 * warnings->dotcomma is unset; and, when warnings->isFrench is set, an
 * ellipsis directly before or after one of , ; : ! ?
 * NOTE(review): the French ellipsis test is written out twice; the second
 * copy appears to guard extra steps on lines absent from this listing -
 * confirm.
 * NOTE(review): the lines that advance c are absent from this listing.
 */
2172 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
2176 nc=g_utf8_get_char(aline);
2177 for (s=aline;*s;s=g_utf8_next_char(s))
2180 nc=g_utf8_get_char(g_utf8_next_char(s));
2181 /* for each punctuation character in the line */
2182 if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
2183 g_utf8_strchr(".?!,;:",-1,nc))
2185 /* followed by punctuation, it's a query, unless . . . */
2186 if (c==nc && (c=='.' || c=='?' || c=='!') ||
2187 !warnings->dotcomma && c=='.' && nc==',' ||
2188 warnings->isFrench && g_str_has_prefix(s,",...") ||
2189 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2190 warnings->isFrench && g_str_has_prefix(s,";...") ||
2191 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2192 warnings->isFrench && g_str_has_prefix(s,":...") ||
2193 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2194 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2195 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2196 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2197 warnings->isFrench && g_str_has_prefix(s,"...?"))
2199 if (warnings->isFrench && g_str_has_prefix(s,",...") ||
2200 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2201 warnings->isFrench && g_str_has_prefix(s,";...") ||
2202 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2203 warnings->isFrench && g_str_has_prefix(s,":...") ||
2204 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2205 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2206 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2207 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2208 warnings->isFrench && g_str_has_prefix(s,"...?"))
2211 nc=g_utf8_get_char(g_utf8_next_char(s));
2213 ; /* do nothing for .. !! and ?? which can be legit */
2217 if (pswit[ECHO_SWITCH])
2218 g_print("\n%s\n",aline);
2219 if (!pswit[OVERVIEW_SWITCH])
2220 g_print(" Line %ld column %ld - Double punctuation?\n",
2221 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2230 * check_for_spaced_quotes:
/*
 * Query every quote mark that has a space on both sides.
 *
 * aline: the line under test.
 *
 * Double quotes are searched for as a fixed three-byte pattern. For
 * single quotes a pattern of space, quote, space is built in a GString
 * for each entry of single_quotes and searched for in the same way. Each
 * search resumes two characters past the previous match.
 * NOTE(review): the rest of the single_quotes initialiser and the lines
 * that set s before each loop are absent from this listing.
 */
2232 void check_for_spaced_quotes(const char *aline)
2236 const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,
2240 while ((t=strstr(s," \" ")))
2242 if (pswit[ECHO_SWITCH])
2243 g_print("\n%s\n",aline);
2244 if (!pswit[OVERVIEW_SWITCH])
2245 g_print(" Line %ld column %ld - Spaced doublequote?\n",
2246 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2249 s=g_utf8_next_char(g_utf8_next_char(t));
/* One pattern buffer is reused for every single-quote variant. */
2251 pattern=g_string_new(NULL);
2252 for(i=0;i<G_N_ELEMENTS(single_quotes);i++)
2254 g_string_assign(pattern," ");
2255 g_string_append_unichar(pattern,single_quotes[i]);
2256 g_string_append_c(pattern,' ');
2258 while ((t=strstr(s,pattern->str)))
2260 if (pswit[ECHO_SWITCH])
2261 g_print("\n%s\n",aline);
2262 if (!pswit[OVERVIEW_SWITCH])
2263 g_print(" Line %ld column %ld - Spaced singlequote?\n",
2264 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2267 s=g_utf8_next_char(g_utf8_next_char(t));
/* TRUE also releases the character data held by the GString. */
2270 g_string_free(pattern,TRUE);
2274 * check_for_miscased_genative:
2276 * Check special case of 'S instead of 's at end of word.
/*
 * Query an apostrophe that follows a lowercase letter and is followed by
 * a capital S.
 *
 * aline: the line under test.
 *
 * The scan starts at the second character and keeps the previous, current
 * and next characters in pc, c and nc. The column printed is the
 * character offset of the apostrophe plus 2, i.e. the 1-based column of
 * the S.
 * NOTE(review): the lines that advance pc and c are absent from this
 * listing.
 */
2278 void check_for_miscased_genative(const char *aline)
2284 c=g_utf8_get_char(aline);
2285 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2286 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2290 nc=g_utf8_get_char(g_utf8_next_char(s));
2291 if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))
2293 if (pswit[ECHO_SWITCH])
2294 g_print("\n%s\n",aline);
2295 if (!pswit[OVERVIEW_SWITCH])
2296 g_print(" Line %ld column %ld - Capital \"S\"?\n",
2297 linecnt,g_utf8_pointer_to_offset(aline,s)+2);
2305 * check_end_of_line:
2307 * Now check special cases - start and end of line -
2308 * for single and double quotes. Start is sometimes [sic]
2309 * but better to query it anyway.
2310 * While we're here, check for dash at end of line.
/*
 * Check the two ends of the line for spaced quotes and, in paranoid
 * mode, for a hyphen at the end.
 *
 * aline:    the line under test; lines of one character or fewer are not
 *           examined by the visible checks.
 * warnings: the hyphen field enables the end-of-line hyphen query.
 *
 * Visible checks:
 *  - the last character is a double or single quote preceded by a space
 *    (column printed is the character length of the line);
 *  - the first character is a single quote followed by a space (column
 *    1);
 *  - after skipping trailing characters at or below CHAR_SPACE, the last
 *    character is '-' and the one before it is not.
 * NOTE(review): the hyphen query prints the character offset of the
 * hyphen with no +1, unlike most checks in this file - confirm.
 * NOTE(review): brace-only and other short lines are absent from this
 * listing.
 */
2312 void check_end_of_line(const char *aline,struct warnings *warnings)
2317 lbytes=strlen(aline);
2318 if (g_utf8_strlen(aline,lbytes)>1)
/* s is the last character, c1 its value, c2 the character before it. */
2320 s=g_utf8_prev_char(aline+lbytes);
2321 c1=g_utf8_get_char(s);
2322 c2=g_utf8_get_char(g_utf8_prev_char(s));
2323 if ((c1==CHAR_DQUOTE || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)
2325 if (pswit[ECHO_SWITCH])
2326 g_print("\n%s\n",aline);
2327 if (!pswit[OVERVIEW_SWITCH])
2328 g_print(" Line %ld column %ld - Spaced quote?\n",linecnt,
2329 g_utf8_strlen(aline,lbytes));
/* c1 and c2 are reused for the first two characters of the line. */
2333 c1=g_utf8_get_char(aline);
2334 c2=g_utf8_get_char(g_utf8_next_char(aline));
2335 if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)
2337 if (pswit[ECHO_SWITCH])
2338 g_print("\n%s\n",aline);
2339 if (!pswit[OVERVIEW_SWITCH])
2340 g_print(" Line %ld column 1 - Spaced quote?\n",linecnt);
2345 * Dash at end of line may well be legit - paranoid mode only
2346 * and don't report em-dash at line-end.
2348 if (pswit[PARANOID_SWITCH] && warnings->hyphen)
2350 for (s=g_utf8_prev_char(aline+lbytes);
2351 s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
2353 if (g_utf8_get_char(s)=='-' &&
2354 g_utf8_get_char(g_utf8_prev_char(s))!='-')
2356 if (pswit[ECHO_SWITCH])
2357 g_print("\n%s\n",aline);
2358 if (!pswit[OVERVIEW_SWITCH])
2359 g_print(" Line %ld column %ld - "
2360 "Hyphen at end of line?\n",
2361 linecnt,g_utf8_pointer_to_offset(aline,s));
2368 * check_for_unspaced_bracket:
2370 * Brackets are often unspaced, but shouldn't be surrounded by alpha.
2371 * If so, suspect a scanno like "a]most".
/*
 * Query a bracket, brace or parenthesis that has a letter directly on
 * both sides.
 *
 * aline: the line under test.
 *
 * The scan starts at the second character and stops before the last one,
 * keeping the previous, current and next characters in pc, c and nc.
 * NOTE(review): the column printed is the character offset of the bracket
 * with no +1, unlike most checks in this file - confirm.
 * NOTE(review): the lines that advance pc and c are absent from this
 * listing.
 */
2373 void check_for_unspaced_bracket(const char *aline)
2377 c=g_utf8_get_char(aline);
2378 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2379 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2383 nc=g_utf8_get_char(g_utf8_next_char(s));
2386 /* for each bracket character in the line except 1st & last */
2387 if (g_utf8_strchr("{[()]}",-1,c) &&
2388 g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
2390 if (pswit[ECHO_SWITCH])
2391 g_print("\n%s\n",aline);
2392 if (!pswit[OVERVIEW_SWITCH])
2393 g_print(" Line %ld column %ld - Unspaced bracket?\n",
2394 linecnt,g_utf8_pointer_to_offset(aline,s));
2402 * check_for_unpunctuated_endquote:
/*
 * Query a double quote that directly follows a letter, i.e. a closing
 * quote with no punctuation before it.
 *
 * aline: the line under test.
 *
 * The scan starts at the second character, keeping the previous, current
 * and next characters in pc, c and nc. The column printed is the
 * character offset of the quote with no +1.
 * NOTE(review): the lines that advance pc and c are absent from this
 * listing.
 */
2404 void check_for_unpunctuated_endquote(const char *aline)
2408 c=g_utf8_get_char(aline);
2409 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2410 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2414 nc=g_utf8_get_char(g_utf8_next_char(s));
2415 /* for each character in the line except 1st */
/*
 * NOTE(review): isalpha() is given a gunichar here. The C library only
 * defines it for unsigned char values and EOF, and it does not know
 * non-ASCII letters; g_unichar_isalpha, used by the neighbouring checks,
 * looks like the intended call - confirm.
 */
2416 if (c==CHAR_DQUOTE && isalpha(pc))
2418 if (pswit[ECHO_SWITCH])
2419 g_print("\n%s\n",aline);
2420 if (!pswit[OVERVIEW_SWITCH])
2421 g_print(" Line %ld column %ld - "
2422 "endquote missing punctuation?\n",
2423 linecnt,g_utf8_pointer_to_offset(aline,s));
2431 * check_for_html_tag:
2433 * Check for <HTML TAG>.
2435 * If there is a < in the line, followed at some point
2436 * by a > then we suspect HTML.
/*
 * Query a possible HTML tag: the first '<' on the line together with the
 * first '>' that follows it.
 *
 * aline: the line under test.
 *
 * The text from '<' to '>' inclusive is copied with g_strndup and printed
 * in the query. The column is the 1-based character offset of the '<'.
 * NOTE(review): the tests on open and close and the release of tag are
 * absent from this listing - confirm that tag is freed.
 */
2438 void check_for_html_tag(const char *aline)
2440 const char *open,*close;
2442 open=strchr(aline,'<');
2445 close=strchr(g_utf8_next_char(open),'>');
2448 if (pswit[ECHO_SWITCH])
2449 g_print("\n%s\n",aline);
2450 if (!pswit[OVERVIEW_SWITCH])
/* The length close-open+1 takes in both angle brackets. */
2452 tag=g_strndup(open,close-open+1);
2453 g_print(" Line %ld column %ld - HTML Tag? %s \n",
2454 linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
2464 * check_for_html_entity:
2466 * Check for &symbol; HTML.
2468 * If there is a & in the line, followed at
2469 * some point by a ; then we suspect HTML.
/*
 * Query a possible HTML entity: the first '&' on the line together with
 * the first ';' that follows it.
 *
 * aline: the line under test.
 *
 * The span between the two is scanned for a space, so that text such as
 * "Jones & Son;" is not reported. The text from '&' to ';' inclusive is
 * copied with g_strndup and printed in the query.
 * NOTE(review): the column is computed as a byte offset (amp-aline),
 * whereas the other checks in this file print character offsets from
 * g_utf8_pointer_to_offset; the two differ on lines with multi-byte
 * characters before the '&' - confirm.
 * NOTE(review): the tests on amp and scolon and the release of entity are
 * absent from this listing.
 */
2471 void check_for_html_entity(const char *aline)
2473 const char *s,*amp,*scolon;
2475 amp=strchr(aline,'&');
2478 scolon=strchr(amp,';');
2481 for (s=amp;s<scolon;s=g_utf8_next_char(s))
2482 if (g_utf8_get_char(s)==CHAR_SPACE)
2483 break; /* Don't report "Jones & Son;" */
2486 if (pswit[ECHO_SWITCH])
2487 g_print("\n%s\n",aline);
2488 if (!pswit[OVERVIEW_SWITCH])
2490 entity=g_strndup(amp,scolon-amp+1);
2491 g_print(" Line %ld column %d - HTML symbol? %s \n",
2492 linecnt,(int)(amp-aline)+1,entity);
2503 * check_for_omitted_punctuation:
2505 * Check for omitted punctuation at end of paragraph by working back
2506 * through prevline. DW.
2507 * Need to check this only for "normal" paras.
2508 * So what is a "normal" para?
2509 * Not normal if one-liner (chapter headings, etc.)
2510 * Not normal if doesn't contain at least one locase letter
2511 * Not normal if starts with space
/*
 * Query a paragraph whose last line ends without punctuation.
 *
 * prevline:        the last line of the paragraph that has just ended.
 * last:            properties of earlier lines; blen must exceed 2.
 * start_para_line: line number on which that paragraph started.
 *
 * The check applies only when prevline contains a letter, the paragraph
 * started before linecnt-1 and prevline begins with a character above
 * CHAR_SPACE. Closing quotes at the end of prevline are skipped; the
 * scan then walks backwards and raises the query on reaching a letter.
 * The visible test against "-.:!([{?}])" presumably ends the scan
 * without a query (its body is not shown - confirm).
 * NOTE(review): procfile declares start_para_line as long, while this
 * parameter is int - confirm that the narrowing is harmless.
 * NOTE(review): brace-only and other short lines are absent from this
 * listing.
 */
2513 void check_for_omitted_punctuation(const char *prevline,
2514 struct line_properties *last,int start_para_line)
2516 gboolean letter_on_line=FALSE;
2519 for (s=prevline;*s;s=g_utf8_next_char(s))
2520 if (g_unichar_isalpha(g_utf8_get_char(s)))
2522 letter_on_line=TRUE;
2526 * This next "if" is a problem.
2527 * If we say "start_para_line <= linecnt - 1", that includes
2528 * one-line "paragraphs" like chapter heads. Lotsa false positives.
2529 * If we say "start_para_line < linecnt - 1" it doesn't, but then it
2530 * misses genuine one-line paragraphs.
2532 if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
2533 g_utf8_get_char(prevline)>CHAR_SPACE)
/* Start at the terminator and step back over any closing quotes. */
2535 s=prevline+strlen(prevline);
2538 s=g_utf8_prev_char(s);
2539 c=g_utf8_get_char(s);
2540 } while (CHAR_IS_CLOSING_QUOTE(c) && c>CHAR_SPACE && s>prevline);
2541 for (;s>prevline;s=g_utf8_prev_char(s))
2543 if (g_unichar_isalpha(g_utf8_get_char(s)))
2545 if (pswit[ECHO_SWITCH])
2546 g_print("\n%s\n",prevline);
2547 if (!pswit[OVERVIEW_SWITCH])
2548 g_print(" Line %ld column %ld - "
2549 "No punctuation at para end?\n",
2550 linecnt-1,g_utf8_strlen(prevline,-1));
2555 if (g_utf8_strchr("-.:!([{?}])",-1,g_utf8_get_char(s)))
/*
 * Callback taking a key, a value and a data pointer; it prints a note
 * saying how many times a queried word was duplicated.
 *
 * key: treated as the word (a C string).
 *
 * NOTE(review): presumably a GTree traversal callback for the qword tree,
 * with value holding the duplicate count - confirm. The use of value and
 * data, the arguments of the g_print and the return value are absent
 * from this listing.
 */
2561 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
2563 const char *word=key;
2566 g_print("\nNote: Queried word %s was duplicated %d times\n",
/*
 * Write string to stdout, converting it from UTF-8 to WINDOWS-1252 when a
 * converter can be opened.
 *
 * string: the UTF-8 text to print.
 *
 * The GIConv handle is kept in a function-level static and reused between
 * calls. The output buffer is allocated with room for strlen(string)
 * bytes plus one.
 * NOTE(review): the condition guarding the close-and-reset lines is
 * absent from this listing; presumably it is a NULL string used to
 * release the converter - confirm.
 * NOTE(review): the return value of g_iconv is not checked on the visible
 * lines, and the lines that print and free buf are absent from this
 * listing. The final fputs presumably belongs to the branch taken when no
 * converter is available - confirm.
 */
2571 void print_as_windows_1252(const char *string)
2573 gsize inbytes,outbytes;
2575 static GIConv converter=(GIConv)-1;
2578 if (converter!=(GIConv)-1)
2579 g_iconv_close(converter);
2580 converter=(GIConv)-1;
/* Open the converter lazily on first use. */
2583 if (converter==(GIConv)-1)
2584 converter=g_iconv_open("WINDOWS-1252","UTF-8");
2585 if (converter!=(GIConv)-1)
2587 inbytes=outbytes=strlen(string);
2588 bp=buf=g_malloc(outbytes+1);
/* g_iconv advances string and bp and decrements both byte counts. */
2589 g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
2595 fputs(string,stdout);
/*
 * Write string to stdout unchanged.
 *
 * string: the text to print; it is passed to fputs as is.
 */
2598 void print_as_utf_8(const char *string)
2600 fputs(string,stdout);
/*
 * procfile: run the checks over one e-text file.
 *
 * filename is handed to read_etext() to load the text. If that fails,
 * the error message is written to stdout or stderr depending on
 * pswit[STDOUT_SWITCH]. Otherwise the text is given to first_pass() and
 * report_first_pass(), and is then read one line at a time with
 * flgets(); each line inside the body of the text is passed to the
 * individual check_*() routines. Findings are printed with g_print().
 *
 * NOTE(review): linecnt, checked_linecnt, prevline, qword, qperiod and
 * pswit[] are not among the declarations visible here; they appear to
 * be file-level state - confirm before making this routine re-entrant.
 */
2608 void procfile(const char *filename)
2611 gchar *parastart=NULL; /* first line of current para */
2612 gchar *etext,*aline;
2615 struct first_pass_results *first_pass_results;
2616 struct warnings *warnings;
2617 struct counters counters={0};
2618 struct line_properties last={0};
2619 struct parities parities={0};
2620 struct pending pending={0};
2621 gboolean isemptyline;
2622 long start_para_line=0;
2623 gboolean isnewpara=FALSE,enddash=FALSE;
/* last.start is overwritten with the first character of every line checked below. */
2624 last.start=CHAR_SPACE;
2625 linecnt=checked_linecnt=0;
/* Load the text; on failure err carries the message reported below. */
2626 etext=read_etext(filename,&err);
2629 if (pswit[STDOUT_SWITCH])
2630 fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
2632 fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
2635 g_print("\n\nFile: %s\n\n",filename);
2636 first_pass_results=first_pass(etext);
2637 warnings=report_first_pass(first_pass_results);
/*
 * Both trees are ordered with strcmp and free their keys with g_free.
 * qword also frees its values with g_free; qperiod has no value
 * destructor.
 */
2638 qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
2639 qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
2641 * Here we go with the main pass. Hold onto yer hat!
/* flgets() is given linecnt+1, which it uses as the line number in its own messages. */
2645 while ((aline=flgets(&etext_ptr,linecnt+1)))
2650 if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
2651 continue; // skip DP page separators completely
/*
 * Lines before firstline, or after footerline when one was found
 * (footerline>0), are outside the body of the text: selected header
 * fields are echoed if requested and no checks are run on them.
 */
2652 if (linecnt<first_pass_results->firstline ||
2653 (first_pass_results->footerline>0 &&
2654 linecnt>first_pass_results->footerline))
2656 if (pswit[HEADER_SWITCH])
2658 if (g_str_has_prefix(aline,"Title:"))
2659 g_print(" %s\n",aline);
2660 if (g_str_has_prefix(aline,"Author:"))
2661 g_print(" %s\n",aline);
2662 if (g_str_has_prefix(aline,"Release Date:"))
2663 g_print(" %s\n",aline);
2664 if (g_str_has_prefix(aline,"Edition:"))
2665 g_print(" %s\n\n",aline);
2667 continue; /* skip through the header */
2670 print_pending(aline,parastart,&pending);
/* NOTE(review): the result of analyse_quotes() is used as the "line is empty" flag - confirm against its definition. */
2671 isemptyline=analyse_quotes(aline,&counters);
2672 if (isnewpara && !isemptyline)
2674 /* This line is the start of a new paragraph. */
2675 start_para_line=linecnt;
2676 /* Capture its first line in case we want to report it later. */
2678 parastart=g_strdup(aline);
2679 memset(&parities,0,sizeof(parities)); /* restart the quote count */
/* Advance s to the first letter or digit (or to the end of the line). */
2681 while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
2682 !g_unichar_isdigit(g_utf8_get_char(s)))
2683 s=g_utf8_next_char(s);
2684 if (g_unichar_islower(g_utf8_get_char(s)))
2686 /* and its first letter is lowercase */
2687 if (pswit[ECHO_SWITCH])
2688 g_print("\n%s\n",aline);
2689 if (!pswit[OVERVIEW_SWITCH])
2690 g_print(" Line %ld column %ld - "
2691 "Paragraph starts with lower-case\n",
2692 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2696 isnewpara=FALSE; /* Signal the end of new para processing. */
2698 /* Check for an em-dash broken at line end. */
2699 if (enddash && g_utf8_get_char(aline)=='-')
2701 if (pswit[ECHO_SWITCH])
2702 g_print("\n%s\n",aline);
2703 if (!pswit[OVERVIEW_SWITCH])
2704 g_print(" Line %ld column 1 - Broken em-dash?\n",linecnt);
/*
 * Step back over trailing spaces to the last significant character of
 * the line, then test whether it is a '-'. Presumably this is what
 * sets enddash for the next line - the assignment is not visible here.
 */
2709 for (s=g_utf8_prev_char(aline+strlen(aline));
2710 g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
2712 if (s>=aline && g_utf8_get_char(s)=='-')
2714 check_for_control_characters(aline);
2715 check_for_odd_characters(aline,warnings,isemptyline);
/* The long-line, short-line and end-quote checks only run when the corresponding flag in warnings is set. */
2716 if (warnings->longline)
2717 check_for_long_line(aline);
2718 if (warnings->shortline)
2719 check_for_short_line(aline,&last);
/* Record this line's length in characters and its first character. */
2721 last.len=g_utf8_strlen(aline,-1);
2722 last.start=g_utf8_get_char(aline);
2723 check_for_starting_punctuation(aline);
2726 check_for_spaced_emdash(aline);
2727 check_for_spaced_dash(aline);
2729 check_for_unmarked_paragraphs(aline);
2730 check_for_jeebies(aline);
2731 check_for_mta_from(aline);
2732 check_for_orphan_character(aline);
2733 check_for_pling_scanno(aline);
2734 check_for_extra_period(aline,warnings);
2735 check_for_following_punctuation(aline);
2736 check_for_typos(aline,warnings);
2737 check_for_misspaced_punctuation(aline,&parities,isemptyline);
2738 check_for_double_punctuation(aline,warnings);
2739 check_for_spaced_quotes(aline);
2740 check_for_miscased_genative(aline);
2741 check_end_of_line(aline,warnings);
2742 check_for_unspaced_bracket(aline);
2743 if (warnings->endquote)
2744 check_for_unpunctuated_endquote(aline);
2745 check_for_html_tag(aline);
2746 check_for_html_entity(aline);
2749 check_for_mismatched_quotes(&counters,&pending);
2750 counters_reset(&counters);
2751 /* let the next iteration know that it's starting a new para */
2754 check_for_omitted_punctuation(prevline,&last,start_para_line);
/* Keep a private copy of this line to serve as prevline. */
2757 prevline=g_strdup(aline);
/* Final quote check; print_pending() is called with NULL in place of a line. */
2760 check_for_mismatched_quotes(&counters,&pending);
2761 print_pending(NULL,parastart,&pending);
2762 reset_pending(&pending);
/* Duplicate-query notes are suppressed in overview and verbose modes. */
2771 if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])
2772 g_tree_foreach(qword,report_duplicate_queries,NULL);
2773 g_tree_unref(qword);
2774 g_tree_unref(qperiod);
2775 counters_destroy(&counters);
/* Clear any custom print handler; the NULL call asks print_as_windows_1252() to close its converter. */
2776 g_set_print_handler(NULL);
2777 print_as_windows_1252(NULL);
2778 if (pswit[MARKUP_SWITCH])
2785 * Get one line from the input text, checking for
2786 * the existence of exactly one CR/LF line-end per line.
2788 * Returns: a pointer to the line.
/*
 * flgets: take the next line from the in-memory text at *etext.
 *
 * etext is advanced one UTF-8 character at a time as the line is
 * consumed. lcnt is used only as the line number quoted in the
 * messages printed here.
 *
 * When pswit[LINE_END_SWITCH] is set, three line-ending problems are
 * reported: a LF with no preceding CR, two successive CRs, and a CR
 * not followed by a LF. With pswit[ECHO_SWITCH] the text of the line
 * read so far is echoed first; pswit[OVERVIEW_SWITCH] suppresses the
 * individual message.
 *
 * Before the line is handed back it is cleaned with
 * postprocess_for_HTML() and/or postprocess_for_DP() when the
 * matching switch is set.
 *
 * NOTE(review): the loop structure, the end-of-text test and the
 * return statements are not visible here; confirm what is returned at
 * end of text and how a final line without a line-end is handled.
 */
2790 char *flgets(char **etext,long lcnt)
2793 gboolean isCR=FALSE;
/* The line starts where the caller's cursor currently points. */
2794 char *theline=*etext;
2799 c=g_utf8_get_char(*etext);
2800 *etext=g_utf8_next_char(*etext);
2803 /* either way, it's end of line */
2810 /* Error - a LF without a preceding CR */
2811 if (pswit[LINE_END_SWITCH])
2813 if (pswit[ECHO_SWITCH])
/* Echo a temporary copy of the characters from theline up to eos. */
2815 s=g_strndup(theline,eos-theline);
2816 g_print("\n%s\n",s);
2819 if (!pswit[OVERVIEW_SWITCH])
2820 g_print(" Line %ld - No CR?\n",lcnt);
2831 /* Error - two successive CRs */
2832 if (pswit[LINE_END_SWITCH])
2834 if (pswit[ECHO_SWITCH])
2836 s=g_strndup(theline,eos-theline);
2837 g_print("\n%s\n",s);
2840 if (!pswit[OVERVIEW_SWITCH])
2841 g_print(" Line %ld - Two successive CRs?\n",lcnt);
/* A pending CR that was not followed by LF. */
2850 if (pswit[LINE_END_SWITCH] && isCR)
2852 if (pswit[ECHO_SWITCH])
2854 s=g_strndup(theline,eos-theline);
2855 g_print("\n%s\n",s);
2858 if (!pswit[OVERVIEW_SWITCH])
2859 g_print(" Line %ld column %ld - CR without LF?\n",
2860 lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
2866 eos=g_utf8_next_char(eos);
/* Optional in-place clean-up of HTML and DP markup. */
2870 if (pswit[MARKUP_SWITCH])
2871 postprocess_for_HTML(theline);
2872 if (pswit[DP_SWITCH])
2873 postprocess_for_DP(theline);
2880 * Takes a "word" as a parameter, and checks whether it
2881 * contains a mixture of alpha and digits. Generally, this is an
2882 * error, but may not be for cases like 4th or L5 12s. 3d.
2884 * Returns: TRUE iff an is error found.
2886 gboolean mixdigit(const char *checkword)
2888 gboolean wehaveadigit,wehavealetter,query;
2889 const char *s,*nondigit;
2890 wehaveadigit=wehavealetter=query=FALSE;
2891 for (s=checkword;*s;s=g_utf8_next_char(s))
2892 if (g_unichar_isalpha(g_utf8_get_char(s)))
2894 else if (g_unichar_isdigit(g_utf8_get_char(s)))
2896 if (wehaveadigit && wehavealetter)
2898 /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
2900 for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
2901 nondigit=g_utf8_next_char(nondigit))
2903 /* digits, ending in st, rd, nd, th of either case */
2904 if (!g_ascii_strcasecmp(nondigit,"st") ||
2905 !g_ascii_strcasecmp(nondigit,"rd") ||
2906 !g_ascii_strcasecmp(nondigit,"nd") ||
2907 !g_ascii_strcasecmp(nondigit,"th"))
2909 if (!g_ascii_strcasecmp(nondigit,"sts") ||
2910 !g_ascii_strcasecmp(nondigit,"rds") ||
2911 !g_ascii_strcasecmp(nondigit,"nds") ||
2912 !g_ascii_strcasecmp(nondigit,"ths"))
2914 if (!g_ascii_strcasecmp(nondigit,"stly") ||
2915 !g_ascii_strcasecmp(nondigit,"rdly") ||
2916 !g_ascii_strcasecmp(nondigit,"ndly") ||
2917 !g_ascii_strcasecmp(nondigit,"thly"))
2919 /* digits, ending in l, L, s or d */
2920 if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
2921 !strcmp(nondigit,"d"))
2924 * L at the start of a number, representing Britsh pounds, like L500.
2925 * This is cute. We know the current word is mixed digit. If the first
2926 * letter is L, there must be at least one digit following. If both
2927 * digits and letters follow, we have a genuine error, else we have a
2928 * capital L followed by digits, and we accept that as a non-error.
2930 if (g_utf8_get_char(checkword)=='L' &&
2931 !mixdigit(g_utf8_next_char(checkword)))
2940 * Extracts the first/next "word" from the line, and returns it.
2941 * A word is defined as one English word unit--or at least that's the aim.
2942 * "ptr" is advanced to the position in the line where we will start
2943 * looking for the next word.
2945 * Returns: A newly-allocated string.
2947 gchar *getaword(const char **ptr)
2952 word=g_string_new(NULL);
2953 for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
2954 !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
2955 **ptr;*ptr=g_utf8_next_char(*ptr))
2958 * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
2959 * Especially yucky is the case of L1,000
2960 * This section looks for a pattern of characters including a digit
2961 * followed by a comma or period followed by one or more digits.
2962 * If found, it returns this whole pattern as a word; otherwise we discard
2963 * the results and resume our normal programming.
2966 for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
2967 g_unichar_isalpha(g_utf8_get_char(s)) ||
2968 g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
2969 g_string_append_unichar(word,g_utf8_get_char(s));
2972 for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))
2974 c=g_utf8_get_char(t);
2975 pc=g_utf8_get_char(g_utf8_prev_char(t));
2976 if ((c=='.' || c==',') && g_unichar_isdigit(pc))
2979 return g_string_free(word,FALSE);
2983 /* we didn't find a punctuated number - do the regular getword thing */
2984 g_string_truncate(word,0);
2985 c=g_utf8_get_char(*ptr);
2986 for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);
2987 *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))
2988 g_string_append_unichar(word,c);
2989 return g_string_free(word,FALSE);
2995 * Is this word a Roman Numeral?
2997 * It doesn't actually validate that the number is a valid Roman Numeral--for
2998 * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
2999 * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
3000 * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
3001 * expressions thereof, except when it came to taxes. Allow any number of M,
3002 * an optional D, an optional CM or CD, any number of optional Cs, an optional
3003 * XL or an optional XC, an optional IX or IV, an optional V and any number
3006 gboolean isroman(const char *t)
3012 while (g_utf8_get_char(t)=='m' && *t)
3014 if (g_utf8_get_char(t)=='d')
3016 if (g_str_has_prefix(t,"cm"))
3018 if (g_str_has_prefix(t,"cd"))
3020 while (g_utf8_get_char(t)=='c' && *t)
3022 if (g_str_has_prefix(t,"xl"))
3024 if (g_str_has_prefix(t,"xc"))
3026 if (g_utf8_get_char(t)=='l')
3028 while (g_utf8_get_char(t)=='x' && *t)
3030 if (g_str_has_prefix(t,"ix"))
3032 if (g_str_has_prefix(t,"iv"))
3034 if (g_utf8_get_char(t)=='v')
3036 while (g_utf8_get_char(t)=='i' && *t)
3042 * postprocess_for_DP:
3044 * Invoked with the -d switch from flgets().
3045 * It simply "removes" from the line a hard-coded set of common
3046 * DP-specific tags, so that the line passed to the main routine has
3047 * been pre-cleaned of DP markup.
3049 void postprocess_for_DP(char *theline)
3055 for (i=0;*DPmarkup[i];i++)
3056 while ((s=strstr(theline,DPmarkup[i])))
3058 t=s+strlen(DPmarkup[i]);
3059 memmove(s,t,strlen(t)+1);
/*
 * postprocess_for_HTML:
 *
 * Invoked with the -m switch from flgets().
 * It calls losemarkup() until that returns NULL and then calls
 * loseentities(), so that the line passed to the main routine has been
 * pre-cleaned of common HTML tags and entities.
 */
void postprocess_for_HTML(char *theline)
{
    char *removed;
    do
        removed=losemarkup(theline);
    while (removed);
    loseentities(theline);
}
3079 char *losemarkup(char *theline)
3083 s=strchr(theline,'<');
3084 t=s?strchr(s,'>'):NULL;
3087 for (i=0;*markup[i];i++)
3088 if (tagcomp(g_utf8_next_char(s),markup[i]))
3090 t=g_utf8_next_char(t);
3091 memmove(s,t,strlen(t)+1);
3094 /* It's an unrecognized <xxx>. */
3098 void loseentities(char *theline)
3105 GTree *entities=NULL;
3106 static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
3110 g_tree_destroy(entities);
3112 if (translit!=(GIConv)-1)
3113 g_iconv_close(translit);
3114 translit=(GIConv)-1;
3115 if (to_utf8!=(GIConv)-1)
3116 g_iconv_close(to_utf8);
3124 entities=g_tree_new((GCompareFunc)strcmp);
3125 for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
3126 g_tree_insert(entities,HTMLentities[i].name,
3127 GUINT_TO_POINTER(HTMLentities[i].c));
3129 if (translit==(GIConv)-1)
3130 translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
3131 if (to_utf8==(GIConv)-1)
3132 to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
3133 while((amp=strchr(theline,'&')))
3135 scolon=strchr(amp,';');
3140 if (amp+2+strspn(amp+2,"0123456789")==scolon)
3141 c=strtol(amp+2,NULL,10);
3142 else if (amp[2]=='x' &&
3143 amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
3144 c=strtol(amp+3,NULL,16);
3148 s=g_strndup(amp+1,scolon-(amp+1));
3149 c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
3158 if (c<128 || c>=192 && c<=255) /* An ISO-8859-1 character */
3159 theline+=g_unichar_to_utf8(c,theline);
3163 nb=g_unichar_to_utf8(c,s);
3164 t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
3166 s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
3168 memcpy(theline,s,nb);
3172 memmove(theline,g_utf8_next_char(scolon),
3173 strlen(g_utf8_next_char(scolon))+1);
3176 theline=g_utf8_next_char(amp);
3180 gboolean tagcomp(const char *strin,const char *basetag)
3184 if (g_utf8_get_char(strin)=='/')
3185 t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
3187 t=g_utf8_casefold(strin,-1);
3188 s=g_utf8_casefold(basetag,-1);
3189 retval=g_str_has_prefix(t,s);
3195 void proghelp(GOptionContext *context)
3198 fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
3199 fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
3200 fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
3201 fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
3202 "For details, read the file COPYING.\n",stderr);
3203 fputs("This is Free Software; "
3204 "you may redistribute it under certain conditions (GPL);\n",stderr);
3205 fputs("read the file COPYING for details.\n\n",stderr);
3206 help=g_option_context_get_help(context,TRUE,NULL);
3209 fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
3210 fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
3211 "non-ASCII\n",stderr);
3212 fputs("characters like accented letters, "
3213 "lines longer than 75 or shorter than 55,\n",stderr);
3214 fputs("unbalanced quotes or brackets, "
3215 "a variety of badly formatted punctuation, \n",stderr);
3216 fputs("HTML tags, some likely typos. "
3217 "It is NOT a substitute for human judgement.\n",stderr);