1 /*************************************************************************/
2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com> */
5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk> */
7 /* This program is free software; you can redistribute it and/or modify */
8 /* it under the terms of the GNU General Public License as published by */
9 /* the Free Software Foundation; either version 2 of the License, or */
10 /* (at your option) any later version. */
12 /* This program is distributed in the hope that it will be useful, */
13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
15 /* GNU General Public License for more details. */
17 /* You should have received a copy of the GNU General Public License */
18 /* along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 /*************************************************************************/
27 #include "HTMLentities.h"
33 "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
34 "nad", "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa",
35 "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
36 "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
37 "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
38 "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
39 "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
40 "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
41 "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
42 "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
43 "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
44 "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
45 "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
46 "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
47 "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
48 "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
49 "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
50 "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
51 "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
52 "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
53 "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
54 "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
55 "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
56 "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
57 "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
58 "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
59 "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
60 "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
61 "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
67 /* Common abbreviations and other OK words not to query as typos. */
69 "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
70 "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
71 "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
72 "outbid", "outbids", "frostbite", "frostbitten", ""
75 /* Common abbreviations that cause otherwise unexplained periods. */
77 "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
78 "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
82 * Two-Letter combinations that rarely if ever start words,
83 * but are common scannos or otherwise common letter combinations.
86 "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
90 * Two-Letter combinations that rarely if ever end words,
91 * but are common scannos or otherwise common letter combinations.
94 "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
95 "sw", "gr", "sl", "cl", "iy", ""
99 "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
100 "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
101 "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
102 "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
106 "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
110 "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
111 "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
112 "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
113 "during", "let", "toward", "among", ""
117 "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
118 "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
119 "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
120 "among", "those", "into", "whom", "having", "thence", ""
123 /* special characters */
124 #define CHAR_SPACE 32
128 #define CHAR_DQUOTE 34
129 #define CHAR_SQUOTE 39
130 #define CHAR_OPEN_SQUOTE 96
131 #define CHAR_TILDE 126
132 #define CHAR_ASTERISK 42
133 #define CHAR_FORESLASH 47
134 #define CHAR_CARAT 94
136 #define CHAR_UNDERSCORE '_'
137 #define CHAR_OPEN_CBRACK '{'
138 #define CHAR_CLOSE_CBRACK '}'
139 #define CHAR_OPEN_RBRACK '('
140 #define CHAR_CLOSE_RBRACK ')'
141 #define CHAR_OPEN_SBRACK '['
142 #define CHAR_CLOSE_SBRACK ']'
144 /* longest and shortest normal PG line lengths */
145 #define LONGEST_PG_LINE 75
146 #define WAY_TOO_LONG 80
147 #define SHORTEST_PG_LINE 55
167 gboolean pswit[SWITNO]; /* program switches */
169 static GOptionEntry options[]={
170 { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
171 "Ignore DP-specific markup", NULL },
172 { "noecho", 'e', 0, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
173 "Don't echo queried line", NULL },
174 { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
175 "Check single quotes", NULL },
176 { "typo", 't', 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
177 "Check common typos", NULL },
178 { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
179 "Require closure of quotes on every paragraph", NULL },
180 { "relaxed", 'x', 0, G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
181 "Disable paranoid querying of everything", NULL },
182 { "line-end", 'l', 0, G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
183 "Disable line end checking", NULL },
184 { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
185 "Overview: just show counts", NULL },
186 { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
187 "Output errors to stdout instead of stderr", NULL },
188 { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
189 "Echo header fields", NULL },
190 { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
191 "Ignore markup in < >", NULL },
192 { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
193 "Use file of user-defined typos", NULL },
194 { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
195 "Defaults for use on www upload", NULL },
196 { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
197 "Verbose - list everything", NULL },
201 long cnt_dquot; /* for overview mode, count of doublequote queries */
202 long cnt_squot; /* for overview mode, count of singlequote queries */
203 long cnt_brack; /* for overview mode, count of brackets queries */
204 long cnt_bin; /* for overview mode, count of non-ASCII queries */
205 long cnt_odd; /* for overview mode, count of odd character queries */
206 long cnt_long; /* for overview mode, count of long line errors */
207 long cnt_short; /* for overview mode, count of short line queries */
208 long cnt_punct; /* for overview mode,
209 count of punctuation and spacing queries */
210 long cnt_dash; /* for overview mode, count of dash-related queries */
211 long cnt_word; /* for overview mode, count of word queries */
212 long cnt_html; /* for overview mode, count of html queries */
213 long cnt_lineend; /* for overview mode, count of line-end queries */
214 long cnt_spacend; /* count of lines with space at end */
215 long linecnt; /* count of total lines in the file */
216 long checked_linecnt; /* count of lines actually checked */
218 void proghelp(GOptionContext *context);
219 void procfile(const char *);
223 gboolean mixdigit(const char *);
224 gchar *getaword(const char **);
225 char *flgets(char **,long);
226 void postprocess_for_HTML(char *);
227 char *linehasmarkup(char *);
228 char *losemarkup(char *);
229 gboolean tagcomp(const char *,const char *);
230 void loseentities(char *);
231 gboolean isroman(const char *);
232 void postprocess_for_DP(char *);
233 void print_as_windows_1252(const char *string);
234 void print_as_utf_8(const char *string);
236 GTree *qword,*qperiod;
238 struct first_pass_results {
239 long firstline,astline;
240 long footerline,totlen,binlen,alphalen,endquote_count,shortline,dotcomma;
241 long fslashline,hyphens,longline,verylongline,htmcount,standalone_digit;
242 long spacedash,emdash,space_emdash,non_PG_space_emdash,PG_space_emdash;
243 int Dutchcount,Frenchcount;
247 int shortline,longline,bin,dash,dotcomma,ast,fslash,digit,hyphen;
249 gboolean isDutch,isFrench;
254 int c_unders,c_brack,s_brack,r_brack;
255 int open_single_quote,close_single_quote;
258 struct line_properties {
259 unsigned int len,blen;
268 char *dquote,*squote,*rbrack,*sbrack,*cbrack,*unders;
272 void parse_options(int *argc,char ***argv)
275 GOptionContext *context;
276 context=g_option_context_new(
277 "file - looks for errors in Project Gutenberg(TM) etexts");
278 g_option_context_add_main_entries(context,options,NULL);
279 if (!g_option_context_parse(context,argc,argv,&err))
281 g_printerr("Bookloupe: %s\n",err->message);
282 g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
285 /* Paranoid checking is turned OFF, not on, by its switch */
286 pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
287 if (pswit[PARANOID_SWITCH])
288 /* if running in paranoid mode, typo checks default to enabled */
289 pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
290 /* Line-end checking is turned OFF, not on, by its switch */
291 pswit[LINE_END_SWITCH]=!pswit[LINE_END_SWITCH];
292 /* Echoing is turned OFF, not on, by its switch */
293 pswit[ECHO_SWITCH]=!pswit[ECHO_SWITCH];
294 if (pswit[OVERVIEW_SWITCH])
295 /* just print summary; don't echo */
296 pswit[ECHO_SWITCH]=FALSE;
298 * Web uploads - for the moment, this is really just a placeholder
299 * until we decide what processing we really want to do on web uploads
301 if (pswit[WEB_SWITCH])
303 /* specific override for web uploads */
304 pswit[ECHO_SWITCH]=TRUE;
305 pswit[SQUOTE_SWITCH]=FALSE;
306 pswit[TYPO_SWITCH]=TRUE;
307 pswit[QPARA_SWITCH]=FALSE;
308 pswit[PARANOID_SWITCH]=TRUE;
309 pswit[LINE_END_SWITCH]=FALSE;
310 pswit[OVERVIEW_SWITCH]=FALSE;
311 pswit[STDOUT_SWITCH]=FALSE;
312 pswit[HEADER_SWITCH]=TRUE;
313 pswit[VERBOSE_SWITCH]=FALSE;
314 pswit[MARKUP_SWITCH]=FALSE;
315 pswit[USERTYPO_SWITCH]=FALSE;
316 pswit[DP_SWITCH]=FALSE;
323 g_option_context_free(context);
329 * Read in the user-defined stealth scanno list.
331 void read_user_scannos(void)
334 gchar *usertypo_file;
338 gchar *contents,*utf8,**lines;
339 usertypo_file=g_strdup("bookloupe.typ");
340 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
341 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
344 g_free(usertypo_file);
345 usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
346 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
348 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
351 g_free(usertypo_file);
352 usertypo_file=g_strdup("gutcheck.typ");
353 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
355 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
358 g_free(usertypo_file);
359 usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
360 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
362 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
364 g_free(usertypo_file);
365 g_print(" --> I couldn't find bookloupe.typ "
366 "-- proceeding without user typos.\n");
371 fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
372 g_free(usertypo_file);
376 if (g_utf8_validate(contents,len,NULL))
377 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
379 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
381 lines=g_strsplit_set(utf8,"\r\n",0);
383 usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
384 for (i=0;lines[i];i++)
385 if (*(unsigned char *)lines[i]>'!')
386 g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
395 * Read an etext returning a newly allocated string containing the file
396 * contents or NULL on error.
398 gchar *read_etext(const char *filename,GError **err)
400 gchar *contents,*utf8;
402 if (!g_file_get_contents(filename,&contents,&len,err))
404 if (g_utf8_validate(contents,len,NULL))
406 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
407 g_set_print_handler(print_as_utf_8);
411 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
412 g_set_print_handler(print_as_windows_1252);
418 int main(int argc,char **argv)
420 running_from=g_path_get_dirname(argv[0]);
421 parse_options(&argc,&argv);
422 if (pswit[USERTYPO_SWITCH])
424 fprintf(stderr,"bookloupe: Check and report on an e-text\n");
426 if (pswit[OVERVIEW_SWITCH])
428 g_print(" Checked %ld lines of %ld (head+foot = %ld)\n\n",
429 checked_linecnt,linecnt,linecnt-checked_linecnt);
430 g_print(" --------------- Queries found --------------\n");
432 g_print(" Long lines: %14ld\n",cnt_long);
434 g_print(" Short lines: %14ld\n",cnt_short);
436 g_print(" Line-end problems: %14ld\n",cnt_lineend);
438 g_print(" Common typos: %14ld\n",cnt_word);
440 g_print(" Unmatched quotes: %14ld\n",cnt_dquot);
442 g_print(" Unmatched SingleQuotes: %14ld\n",cnt_squot);
444 g_print(" Unmatched brackets: %14ld\n",cnt_brack);
446 g_print(" Non-ASCII characters: %14ld\n",cnt_bin);
448 g_print(" Proofing characters: %14ld\n",cnt_odd);
450 g_print(" Punctuation & spacing queries: %14ld\n",cnt_punct);
452 g_print(" Non-standard dashes: %14ld\n",cnt_dash);
454 g_print(" Possible HTML tags: %14ld\n",cnt_html);
456 g_print(" TOTAL QUERIES %14ld\n",
457 cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+
458 cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend);
460 g_free(running_from);
462 g_tree_unref(usertypo);
469 * Run a first pass - verify that it's a valid PG
470 * file, decide whether to report some things that
471 * occur many times in the text like long or short
472 * lines, non-standard dashes, etc.
474 struct first_pass_results *first_pass(const char *etext)
476 gunichar laststart=CHAR_SPACE;
481 unsigned int lastlen=0,lastblen=0;
482 long spline=0,nspline=0;
483 static struct first_pass_results results={0};
485 lines=g_strsplit(etext,"\n",0);
486 for (j=0;lines[j];j++)
488 lbytes=strlen(lines[j]);
489 while (lines[j][lbytes-1]=='\r')
490 lines[j][--lbytes]='\0';
491 llen=g_utf8_strlen(lines[j],lbytes);
493 if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
494 (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
497 g_print(" --> Duplicate header?\n");
498 spline=linecnt+1; /* first line of non-header text, that is */
500 if (!strncmp(lines[j],"*** START",9) &&
501 strstr(lines[j],"PROJECT GUTENBERG"))
504 g_print(" --> Duplicate header?\n");
505 nspline=linecnt+1; /* first line of non-header text, that is */
507 if (spline || nspline)
509 lc_line=g_utf8_strdown(lines[j],lbytes);
510 if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
512 if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
514 if (results.footerline)
516 /* it's an old-form header - we can detect duplicates */
518 g_print(" --> Duplicate footer?\n");
521 results.footerline=linecnt;
527 results.firstline=spline;
529 results.firstline=nspline; /* override with new */
530 if (results.footerline)
531 continue; /* don't count the boilerplate in the footer */
532 results.totlen+=llen;
533 for (s=lines[j];*s;s=g_utf8_next_char(s))
535 if (g_utf8_get_char(s)>127)
537 if (g_unichar_isalpha(g_utf8_get_char(s)))
539 if (s>lines[j] && g_utf8_get_char(s)==CHAR_DQUOTE &&
540 isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
541 results.endquote_count++;
543 if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
544 lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
547 g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
549 if (strstr(lines[j],".,"))
551 /* only count ast lines for ignoring purposes where there is */
552 /* locase text on the line */
553 if (strchr(lines[j],'*'))
555 for (s=lines[j];*s;s=g_utf8_next_char(s))
556 if (g_unichar_islower(g_utf8_get_char(s)))
561 if (strchr(lines[j],'/'))
562 results.fslashline++;
563 for (s=g_utf8_prev_char(lines[j]+lbytes);
564 s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
566 if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
567 g_utf8_get_char(g_utf8_prev_char(s))!='-')
569 if (llen>LONGEST_PG_LINE)
571 if (llen>WAY_TOO_LONG)
572 results.verylongline++;
573 if (strchr(lines[j],'<') && strchr(lines[j],'>'))
575 i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
578 if (strstr(lines[j],"<i>"))
579 results.htmcount+=4; /* bonus marks! */
581 /* Check for spaced em-dashes */
582 if (lines[j][0] && (s=strstr(g_utf8_next_char(lines[j]),"--")))
585 if (s[-1]==CHAR_SPACE || s[2]==CHAR_SPACE)
586 results.space_emdash++;
587 if (s[-1]==CHAR_SPACE && s[2]==CHAR_SPACE)
588 /* count of em-dashes with spaces both sides */
589 results.non_PG_space_emdash++;
590 if (s[-1]!=CHAR_SPACE && s[2]!=CHAR_SPACE)
591 /* count of PG-type em-dashes with no spaces */
592 results.PG_space_emdash++;
597 if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))
598 results.Dutchcount++;
599 if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))
600 results.Frenchcount++;
601 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
602 results.standalone_digit++;
605 /* Check for spaced dashes */
606 if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
610 laststart=lines[j][0];
619 * Make some snap decisions based on the first pass results.
621 struct warnings *report_first_pass(struct first_pass_results *results)
623 static struct warnings warnings={0};
625 g_print(" --> %ld lines in this file have white space at end\n",
628 if (results->dotcomma>5)
631 g_print(" --> %ld lines in this file contain '.,'. "
632 "Not reporting them.\n",results->dotcomma);
635 * If more than 50 lines, or one-tenth, are short,
636 * don't bother reporting them.
638 warnings.shortline=1;
639 if (results->shortline>50 || results->shortline*10>linecnt)
641 warnings.shortline=0;
642 g_print(" --> %ld lines in this file are short. "
643 "Not reporting short lines.\n",results->shortline);
646 * If more than 50 lines, or one-tenth, are long,
647 * don't bother reporting them.
650 if (results->longline>50 || results->longline*10>linecnt)
653 g_print(" --> %ld lines in this file are long. "
654 "Not reporting long lines.\n",results->longline);
656 /* If more than 10 lines contain asterisks, don't bother reporting them. */
658 if (results->astline>10)
661 g_print(" --> %ld lines in this file contain asterisks. "
662 "Not reporting them.\n",results->astline);
665 * If more than 10 lines contain forward slashes,
666 * don't bother reporting them.
669 if (results->fslashline>10)
672 g_print(" --> %ld lines in this file contain forward slashes. "
673 "Not reporting them.\n",results->fslashline);
676 * If more than 20 lines contain unpunctuated endquotes,
677 * don't bother reporting them.
680 if (results->endquote_count>20)
683 g_print(" --> %ld lines in this file contain unpunctuated endquotes. "
684 "Not reporting them.\n",results->endquote_count);
* If more than 10 lines contain standalone digits,
688 * don't bother reporting them.
691 if (results->standalone_digit>10)
694 g_print(" --> %ld lines in this file contain standalone 0s and 1s. "
695 "Not reporting them.\n",results->standalone_digit);
698 * If more than 20 lines contain hyphens at end,
699 * don't bother reporting them.
702 if (results->hyphens>20)
705 g_print(" --> %ld lines in this file have hyphens at end. "
706 "Not reporting them.\n",results->hyphens);
708 if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
710 g_print(" --> Looks like this is HTML. Switching HTML mode ON.\n");
711 pswit[MARKUP_SWITCH]=1;
713 if (results->verylongline>0)
714 g_print(" --> %ld lines in this file are VERY long!\n",
715 results->verylongline);
717 * If there are more non-PG spaced dashes than PG em-dashes,
718 * assume it's deliberate.
719 * Current PG guidelines say don't use them, but older texts do,
720 * and some people insist on them whatever the guidelines say.
723 if (results->spacedash+results->non_PG_space_emdash>
724 results->PG_space_emdash)
727 g_print(" --> There are %ld spaced dashes and em-dashes. "
728 "Not reporting them.\n",
729 results->spacedash+results->non_PG_space_emdash);
731 /* If more than a quarter of characters are hi-bit, bug out. */
733 if (results->binlen*4>results->totlen)
735 g_print(" --> This file does not appear to be ASCII. "
736 "Terminating. Best of luck with it!\n");
739 if (results->alphalen*4<results->totlen)
741 g_print(" --> This file does not appear to be text. "
742 "Terminating. Best of luck with it!\n");
745 if (results->binlen*100>results->totlen || results->binlen>100)
747 g_print(" --> There are a lot of foreign letters here. "
748 "Not reporting them.\n");
751 warnings.isDutch=FALSE;
752 if (results->Dutchcount>50)
754 warnings.isDutch=TRUE;
755 g_print(" --> This looks like Dutch - "
756 "switching off dashes and warnings for 's Middags case.\n");
758 warnings.isFrench=FALSE;
759 if (results->Frenchcount>50)
761 warnings.isFrench=TRUE;
762 g_print(" --> This looks like French - "
763 "switching off some doublepunct.\n");
765 if (results->firstline && results->footerline)
766 g_print(" The PG header and footer appear to be already on.\n");
769 if (results->firstline)
770 g_print(" The PG header is on - no footer.\n");
771 if (results->footerline)
772 g_print(" The PG footer is on - no header.\n");
775 if (pswit[VERBOSE_SWITCH])
778 warnings.shortline=1;
787 g_print(" *** Verbose output is ON -- you asked for it! ***\n");
789 if (warnings.isDutch)
791 if (results->footerline>0 && results->firstline>0 &&
792 results->footerline>results->firstline &&
793 results->footerline-results->firstline<100)
795 g_print(" --> I don't really know where this text starts. \n");
796 g_print(" There are no reference points.\n");
797 g_print(" I'm going to have to report the header and footer "
799 results->firstline=0;
807 * Look along the line, accumulate the count of quotes, and see
808 * if this is an empty line - i.e. a line with nothing on it
810 * If line has just spaces, period, * and/or - on it, don't
811 * count it, since empty lines with asterisks or dashes to
812 * separate sections are common.
814 * Returns: TRUE if the line is empty.
816 gboolean analyse_quotes(const char *aline,struct counters *counters)
819 /* assume the line is empty until proven otherwise */
820 gboolean isemptyline=TRUE;
821 const char *s=aline,*sprev,*snext;
826 snext=g_utf8_next_char(s);
827 c=g_utf8_get_char(s);
830 if (c==CHAR_SQUOTE || c==CHAR_OPEN_SQUOTE)
835 * At start of line, it can only be an openquote.
836 * Hardcode a very common exception!
838 if (!g_str_has_prefix(snext,"tis") &&
839 !g_str_has_prefix(snext,"Tis"))
840 counters->open_single_quote++;
842 else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
843 g_unichar_isalpha(g_utf8_get_char(snext)))
844 /* Do nothing! it's definitely an apostrophe, not a quote */
846 /* it's outside a word - let's check it out */
847 else if (c==CHAR_OPEN_SQUOTE ||
848 g_unichar_isalpha(g_utf8_get_char(snext)))
850 /* it damwell better BE an openquote */
851 if (!g_str_has_prefix(snext,"tis") &&
852 !g_str_has_prefix(snext,"Tis"))
853 /* hardcode a very common exception! */
854 counters->open_single_quote++;
858 /* now - is it a closequote? */
859 guessquote=0; /* accumulate clues */
860 if (g_unichar_isalpha(g_utf8_get_char(sprev)))
862 /* it follows a letter - could be either */
864 if (g_utf8_get_char(sprev)=='s')
866 /* looks like a plural apostrophe */
868 if (g_utf8_get_char(snext)==CHAR_SPACE)
873 /* it doesn't have a letter either side */
874 else if (strchr(".?!,;:",g_utf8_get_char(sprev)) &&
875 strchr(".?!,;: ",g_utf8_get_char(snext)))
876 guessquote+=8; /* looks like a closequote */
879 if (counters->open_single_quote>counters->close_single_quote)
881 * Give it the benefit of some doubt,
882 * if a squote is already open.
888 counters->close_single_quote++;
891 if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
893 isemptyline=FALSE; /* ignore lines like * * * as spacers */
894 if (c==CHAR_UNDERSCORE)
895 counters->c_unders++;
896 if (c==CHAR_OPEN_CBRACK)
898 if (c==CHAR_CLOSE_CBRACK)
900 if (c==CHAR_OPEN_RBRACK)
902 if (c==CHAR_CLOSE_RBRACK)
904 if (c==CHAR_OPEN_SBRACK)
906 if (c==CHAR_CLOSE_SBRACK)
915 * check_for_control_characters:
917 * Check for invalid or questionable characters in the line
918 * Anything above 127 is invalid for plain ASCII, and
919 * non-printable control characters should also be flagged.
920 * Tabs should generally not be there.
922 void check_for_control_characters(const char *aline)
926 for (s=aline;*s;s=g_utf8_next_char(s))
928 c=g_utf8_get_char(s);
929 if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
931 if (pswit[ECHO_SWITCH])
932 g_print("\n%s\n",aline);
933 if (!pswit[OVERVIEW_SWITCH])
934 g_print(" Line %ld column %ld - Control character %u\n",
935 linecnt,g_utf8_pointer_to_offset(s,aline)+1,c);
943 * check_for_odd_characters:
945 * Check for binary and other odd characters.
947 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
948 gboolean isemptyline)
950 /* Don't repeat multiple warnings on one line. */
951 gboolean eNon_A=FALSE,eTab=FALSE,eTilde=FALSE;
952 gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
955 for (s=aline;*s;s=g_utf8_next_char(s))
957 c=g_utf8_get_char(s);
958 if (!eNon_A && (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
960 if (pswit[ECHO_SWITCH])
961 g_print("\n%s\n",aline);
962 if (!pswit[OVERVIEW_SWITCH])
963 if (c>127 && c<160 || c>255)
964 g_print(" Line %ld column %ld - "
965 "Non-ISO-8859 character %u\n",
966 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
968 g_print(" Line %ld column %ld - "
969 "Non-ASCII character %u\n",
970 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
975 if (!eTab && c==CHAR_TAB)
977 if (pswit[ECHO_SWITCH])
978 g_print("\n%s\n",aline);
979 if (!pswit[OVERVIEW_SWITCH])
980 g_print(" Line %ld column %ld - Tab character?\n",
981 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
986 if (!eTilde && c==CHAR_TILDE)
989 * Often used by OCR software to indicate an
990 * unrecognizable character.
992 if (pswit[ECHO_SWITCH])
993 g_print("\n%s\n",aline);
994 if (!pswit[OVERVIEW_SWITCH])
995 g_print(" Line %ld column %ld - Tilde character?\n",
996 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1001 if (!eCarat && c==CHAR_CARAT)
1003 if (pswit[ECHO_SWITCH])
1004 g_print("\n%s\n",aline);
1005 if (!pswit[OVERVIEW_SWITCH])
1006 g_print(" Line %ld column %ld - Carat character?\n",
1007 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1012 if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
1014 if (pswit[ECHO_SWITCH])
1015 g_print("\n%s\n",aline);
1016 if (!pswit[OVERVIEW_SWITCH])
1017 g_print(" Line %ld column %ld - Forward slash?\n",
1018 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1024 * Report asterisks only in paranoid mode,
1025 * since they're often deliberate.
1027 if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
1030 if (pswit[ECHO_SWITCH])
1031 g_print("\n%s\n",aline);
1032 if (!pswit[OVERVIEW_SWITCH])
1033 g_print(" Line %ld column %ld - Asterisk?\n",
1034 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1043 * check_for_long_line:
1045 * Check for line too long.
1047 void check_for_long_line(const char *aline)
1049 if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
1051 if (pswit[ECHO_SWITCH])
1052 g_print("\n%s\n",aline);
1053 if (!pswit[OVERVIEW_SWITCH])
1054 g_print(" Line %ld column %ld - Long line %ld\n",
1055 linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
1062 * check_for_short_line:
1064 * Check for line too short.
1066 * This one is a bit trickier to implement: we don't want to
1067 * flag the last line of a paragraph for being short, so we
1068 * have to wait until we know that our current line is a
1069 * "normal" line, then report the _previous_ line if it was too
1070 * short. We also don't want to report indented lines like
1071 * chapter heads or formatted quotations. We therefore keep
1072 * last->len as the length of the last line examined, and
1073 * last->blen as the length of the last but one, and try to
1074 * suppress unnecessary warnings by checking that both were of
1075 * "normal" length. We keep the first character of the last
1076 * line in last->start, and if it was a space, we assume that
1077 * the formatting is deliberate. I can't figure out a way to
1078 * distinguish something like a quoted verse left-aligned or
1079 * the header or footer of a letter from a paragraph of short
1080 * lines - maybe if I examined the whole paragraph, and if the
1081 * para has less than, say, 8 lines and if all lines are short,
1082 * then just assume it's OK? Need to look at some texts to see
1083 * how often a formula like this would get the right result.
1085 void check_for_short_line(const char *aline,const struct line_properties *last)
1087 if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
1088 last->len<SHORTEST_PG_LINE && last->blen>1 &&
1089 last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
1091 if (pswit[ECHO_SWITCH])
1092 g_print("\n%s\n",prevline);
1093 if (!pswit[OVERVIEW_SWITCH])
1094 g_print(" Line %ld column %ld - Short line %ld?\n",
1095 linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
1102 * check_for_starting_punctuation:
1104 * Look for punctuation other than full ellipses at start of line.
1106 void check_for_starting_punctuation(const char *aline)
1108 if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
1109 !g_str_has_prefix(aline,". . ."))
1111 if (pswit[ECHO_SWITCH])
1112 g_print("\n%s\n",aline);
1113 if (!pswit[OVERVIEW_SWITCH])
1114 g_print(" Line %ld column 1 - Begins with punctuation?\n",
1122 * check_for_spaced_emdash:
1124 * Check for spaced em-dashes.
1126 * We must check _all_ occurrences of "--" on the line
1127 * hence the loop - even if the first double-dash is OK
1128 * there may be another that's wrong later on.
1130 void check_for_spaced_emdash(const char *aline)
1132 const char *s,*t,*next;
1133 for (s=aline;t=strstr(s,"--");s=next)
1135 next=g_utf8_next_char(g_utf8_next_char(t));
1136 if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
1137 g_utf8_get_char(next)==CHAR_SPACE)
1139 if (pswit[ECHO_SWITCH])
1140 g_print("\n%s\n",aline);
1141 if (!pswit[OVERVIEW_SWITCH])
1142 g_print(" Line %ld column %ld - Spaced em-dash?\n",
1143 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1151 * check_for_spaced_dash:
1153 * Check for spaced dashes.
/*
 * check_for_spaced_dash (annotated):
 *
 * aline: current line, NUL-terminated UTF-8.
 *
 * Looks at the first " -" on the line and, only when there is none,
 * at the first "- ".  Later occurrences are not examined, because each
 * pattern is searched once with strstr().
 * NOTE(review): the declaration of s is not visible in this listing.
 */
1155 void check_for_spaced_dash(const char *aline)
1158 if ((s=strstr(aline," -")))
/* " -" followed by another '-' is an em-dash, which is not reported here. */
1160 if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
1162 if (pswit[ECHO_SWITCH])
1163 g_print("\n%s\n",aline);
1164 if (!pswit[OVERVIEW_SWITCH])
1165 g_print(" Line %ld column %ld - Spaced dash?\n",
1166 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1171 else if ((s=strstr(aline,"- ")))
/* Likewise skip "- " when it is the tail of "--"; s==aline guards the look-behind. */
1173 if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
1175 if (pswit[ECHO_SWITCH])
1176 g_print("\n%s\n",aline);
1177 if (!pswit[OVERVIEW_SWITCH])
1178 g_print(" Line %ld column %ld - Spaced dash?\n",
1179 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1187 * check_for_unmarked_paragraphs:
1189 * Check for unmarked paragraphs indicated by separate speakers.
1191 * May well be false positive:
1192 * "Bravo!" "Wonderful!" called the crowd.
1193 * but useful all the same.
/*
 * check_for_unmarked_paragraphs (annotated):
 *
 * aline: current line, NUL-terminated UTF-8.
 *
 * Searches the line for a closing double quote, white space and an
 * opening double quote, and queries a missing paragraph break at the
 * 1-based character offset of the match.
 * NOTE(review): the two strstr() patterns read identically in this
 * listing; presumably they differ in the amount of white space and the
 * second is a fallback when the first fails - confirm.  The test on s
 * that guards the report is not visible here either.
 */
1195 void check_for_unmarked_paragraphs(const char *aline)
1198 s=strstr(aline,"\" \"");
1200 s=strstr(aline,"\" \"");
1203 if (pswit[ECHO_SWITCH])
1204 g_print("\n%s\n",aline);
1205 if (!pswit[OVERVIEW_SWITCH])
1206 g_print(" Line %ld column %ld - "
1207 "Query missing paragraph break?\n",
1208 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1215 * check_for_jeebies:
1217 * Check for "to he" and other easy h/b errors.
1219 * This is a very inadequate effort on the h/b problem,
1220 * but the phrase "to he" is always an error, whereas "to
1221 * be" is quite common.
1222 * Similarly, '"Quiet!", be said.' is a non-be error
1223 * "to he" is _not_ always an error!:
1224 * "Where they went to he couldn't say."
1225 * Another false positive:
1226 * What would "Cinderella" be without the . . .
1227 * and another: "If he wants to he can see for himself."
/*
 * check_for_jeebies (annotated):
 *
 * aline: current line, NUL-terminated UTF-8.
 *
 * Three groups of fixed phrases are searched with strstr(), each group
 * followed by its own report at the 1-based character offset of s:
 *   he/be   - " be could ", " to he ", ...
 *   had/bad - " the had ", " they bad ", ...
 *   hut/but - "; hut ", ", hut "
 * All patterns are lower case and strstr() is case-sensitive.
 * NOTE(review): how the searches inside a group are chained (each one
 * presumably tried only when the previous one failed) and the test on
 * s before each report are not visible in this listing.  Two of the
 * quote-plus-be patterns read identically here; presumably they
 * differ in white space - confirm.
 */
1229 void check_for_jeebies(const char *aline)
/* Group 1: "be" where "he" was meant, and "to he". */
1232 s=strstr(aline," be could ");
1234 s=strstr(aline," be would ");
1236 s=strstr(aline," was be ");
1238 s=strstr(aline," be is ");
1240 s=strstr(aline," is be ");
1242 s=strstr(aline,"\", be ");
1244 s=strstr(aline,"\" be ");
1246 s=strstr(aline,"\" be ");
1248 s=strstr(aline," to he ");
1251 if (pswit[ECHO_SWITCH])
1252 g_print("\n%s\n",aline);
1253 if (!pswit[OVERVIEW_SWITCH])
1254 g_print(" Line %ld column %ld - Query he/be error?\n",
1255 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Group 2: "had"/"bad" confusions. */
1259 s=strstr(aline," the had ");
1261 s=strstr(aline," a had ");
1263 s=strstr(aline," they bad ");
1265 s=strstr(aline," she bad ");
1267 s=strstr(aline," he bad ");
1269 s=strstr(aline," you bad ");
1271 s=strstr(aline," i bad ");
1274 if (pswit[ECHO_SWITCH])
1275 g_print("\n%s\n",aline);
1276 if (!pswit[OVERVIEW_SWITCH])
1277 g_print(" Line %ld column %ld - Query had/bad error?\n",
1278 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Group 3: "hut" after a semicolon or comma, where "but" is likely. */
1282 s=strstr(aline,"; hut ");
1284 s=strstr(aline,", hut ");
1287 if (pswit[ECHO_SWITCH])
1288 g_print("\n%s\n",aline);
1289 if (!pswit[OVERVIEW_SWITCH])
1290 g_print(" Line %ld column %ld - Query hut/but error?\n",
1291 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1298 * check_for_mta_from:
1300 * Special case - angled bracket in front of "From" placed there by an
1301 * MTA when sending an e-mail.
/*
 * check_for_mta_from (annotated):
 *
 * aline: current line, NUL-terminated UTF-8.
 *
 * Searches for the literal ">From" anywhere on the line and reports
 * the 1-based character offset of the '>'.
 * NOTE(review): the declaration of s and the test that guards the
 * report are not visible in this listing.
 */
1303 void check_for_mta_from(const char *aline)
1306 s=strstr(aline,">From");
1309 if (pswit[ECHO_SWITCH])
1310 g_print("\n%s\n",aline);
1311 if (!pswit[OVERVIEW_SWITCH])
1312 g_print(" Line %ld column %ld - "
1313 "Query angled bracket with From\n",
1314 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1321 * check_for_orphan_character:
1323 * Check for a single character line -
1324 * often an overflow from bad wrapping.
/*
 * check_for_orphan_character (annotated):
 *
 * aline: current line, NUL-terminated UTF-8.
 *
 * Queries a line that consists of exactly one character, unless that
 * character is I, V, X, L or a digit (numerals alone on a line).
 * NOTE(review): the declaration of c and the tail of the final
 * g_print() are not visible in this listing.
 */
1326 void check_for_orphan_character(const char *aline)
1329 c=g_utf8_get_char(aline);
/* Non-empty line whose second character is already the terminating NUL. */
1330 if (c && !*g_utf8_next_char(aline))
1332 if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
1333 ; /* Nothing - ignore numerals alone on a line. */
1336 if (pswit[ECHO_SWITCH])
1337 g_print("\n%s\n",aline);
1338 if (!pswit[OVERVIEW_SWITCH])
1339 g_print(" Line %ld column 1 - Query single character line\n",
1348 * check_for_pling_scanno:
1350 * Check for I" - often should be !
/*
 * check_for_pling_scanno (annotated):
 *
 * aline: current line, NUL-terminated UTF-8.
 *
 * Searches for a space, a capital I and a double quote in sequence;
 * the I is often a misread exclamation mark.
 * NOTE(review): the column printed is the 0-based character offset of
 * the space (no +1), unlike most other checks in this file - confirm
 * whether that is intended.  The declaration of s and the test that
 * guards the report are not visible in this listing.
 */
1352 void check_for_pling_scanno(const char *aline)
1355 s=strstr(aline," I\"");
1358 if (pswit[ECHO_SWITCH])
1359 g_print("\n%s\n",aline);
1360 if (!pswit[OVERVIEW_SWITCH])
1361 g_print(" Line %ld column %ld - Query I=exclamation mark?\n",
1362 linecnt,g_utf8_pointer_to_offset(aline,s));
1369 * check_for_extra_period:
1371 * Check for period without a capital letter. Cut-down from gutspell.
1372 * Only works when it happens on a single line.
/*
 * check_for_extra_period (annotated):
 *
 * aline:    current line, NUL-terminated UTF-8.
 * warnings: only warnings->isDutch is read in the lines visible here.
 *
 * Runs only when pswit[PARANOID_SWITCH] is set.  For each ". " on the
 * line it looks ahead for the next letter; when that letter is lower
 * case, the word in front of the period is extracted and put through
 * several tests (abbrev[] list, leading digit, single character,
 * isroman(), presence of a vowel) before "Extra period?" is printed.
 * Words already stored in the qperiod tree are not reported again
 * unless pswit[VERBOSE_SWITCH] is set.
 * NOTE(review): several declarations, the outcome of each individual
 * test and the freeing of testword are not visible in this listing.
 */
1374 void check_for_extra_period(const char *aline,const struct warnings *warnings)
1376 const char *s,*t,*s1;
1381 gunichar *decomposition;
1382 if (pswit[PARANOID_SWITCH])
/* t steps through every period that is followed by a space. */
1384 for (t=aline;t=strstr(t,". ");)
1388 t=g_utf8_next_char(t);
1389 /* start of line punctuation is handled elsewhere */
/* Only a period directly after a letter is of interest. */
1392 if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
1394 t=g_utf8_next_char(t);
1397 if (warnings->isDutch)
1399 /* For Frank & Jeroen -- 's Middags case */
1400 gunichar c2,c3,c4,c5;
1401 c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
1402 c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
1403 c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
1404 c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
1405 if (c2==CHAR_SQUOTE && g_unichar_islower(c3) &&
1406 c4==CHAR_SPACE && g_unichar_isupper(c5))
1408 t=g_utf8_next_char(t);
/* From two characters past t, skip forward to the next letter or digit. */
/* NOTE(review): isdigit() is handed a gunichar here while the backward */
/* scan below uses g_unichar_isdigit(); code points above the unsigned */
/* char range are not valid isdigit() arguments - consider unifying. */
1412 s1=g_utf8_next_char(g_utf8_next_char(t));
1413 while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
1414 !isdigit(g_utf8_get_char(s1)))
1415 s1=g_utf8_next_char(s1);
1416 if (g_unichar_islower(g_utf8_get_char(s1)))
1418 /* we have something to investigate */
1420 /* so let's go back and find out */
/* Walk back over letters, digits and apostrophes flanked by letters. */
1421 for (s1=g_utf8_prev_char(t);s1>=aline &&
1422 (g_unichar_isalpha(g_utf8_get_char(s1)) ||
1423 g_unichar_isdigit(g_utf8_get_char(s1)) ||
1424 g_utf8_get_char(s1)==CHAR_SQUOTE &&
1425 g_unichar_isalpha(g_utf8_get_char(g_utf8_next_char(s1))) &&
1426 g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s1))));
1427 s1=g_utf8_prev_char(s1))
/* The loop stops one character before the word, so step forward again. */
1429 s1=g_utf8_next_char(s1);
1432 testword=g_strndup(s1,s-s1);
1434 testword=g_strdup(s1);
/* abbrev[] is terminated by an empty string. */
1435 for (i=0;*abbrev[i];i++)
1436 if (!strcmp(testword,abbrev[i]))
1438 if (g_unichar_isdigit(g_utf8_get_char(testword)))
1440 if (!*g_utf8_next_char(testword))
1442 if (isroman(testword))
/* Vowel test on the first code point of each canonical decomposition. */
1447 for (s=testword;*s;s=g_utf8_next_char(s))
1449 decomposition=g_unicode_canonical_decomposition(
1450 g_utf8_get_char(s),&len);
1451 if (g_utf8_strchr("aeiou",-1,decomposition[0]))
1453 g_free(decomposition);
1457 (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
/* Remember the word so that later occurrences can be recognised. */
1459 g_tree_insert(qperiod,g_strdup(testword),
1460 GINT_TO_POINTER(1));
1461 if (pswit[ECHO_SWITCH])
1462 g_print("\n%s\n",aline);
1463 if (!pswit[OVERVIEW_SWITCH])
1464 g_print(" Line %ld column %ld - Extra period?\n",
1465 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1471 t=g_utf8_next_char(t);
1477 * check_for_following_punctuation:
1479 * Check for words usually not followed by punctuation.
/*
 * check_for_following_punctuation (annotated):
 *
 * aline: current line, NUL-terminated UTF-8.
 *
 * Runs only when pswit[TYPO_SWITCH] is set.  Each word is lower-cased
 * with g_utf8_strdown() and compared against two lists, both
 * terminated by an empty string:
 *   nocomma[]  - queried when the character at s is , ; or :
 *   noperiod[] - queried when the character at s is . or !
 * NOTE(review): how words are extracted (t, s, wordstart) and the last
 * argument of each g_print() are not visible in this listing.
 */
1481 void check_for_following_punctuation(const char *aline)
1484 const char *s,*wordstart;
1487 if (pswit[TYPO_SWITCH])
1498 inword=g_utf8_strdown(t,-1);
1500 for (i=0;*nocomma[i];i++)
1501 if (!strcmp(inword,nocomma[i]))
1503 c=g_utf8_get_char(s);
1504 if (c==',' || c==';' || c==':')
1506 if (pswit[ECHO_SWITCH])
1507 g_print("\n%s\n",aline);
1508 if (!pswit[OVERVIEW_SWITCH])
1509 g_print(" Line %ld column %ld - "
1510 "Query punctuation after %s?\n",
1511 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
1517 for (i=0;*noperiod[i];i++)
1518 if (!strcmp(inword,noperiod[i]))
1520 c=g_utf8_get_char(s);
1521 if (c=='.' || c=='!')
1523 if (pswit[ECHO_SWITCH])
1524 g_print("\n%s\n",aline);
1525 if (!pswit[OVERVIEW_SWITCH])
1526 g_print(" Line %ld column %ld - "
1527 "Query punctuation after %s?\n",
1528 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
1542 * Check for commonly mistyped words,
1543 * and digits like 0 for O in a word.
/*
 * check_for_typos (annotated):
 *
 * aline:    current line, NUL-terminated UTF-8.
 * warnings: only warnings->digit is read in the lines visible here.
 *
 * Words are fetched with getaword().  The checks visible in this
 * listing are:
 *   - mixdigit(): "Query digit in <word>".
 *   - With pswit[TYPO_SWITCH] or pswit[USERTYPO_SWITCH]: a scan of the
 *     word for an upper-case letter after a lower-case one, then a
 *     case-folded copy (testword) used by the remaining tests.
 *   - With pswit[TYPO_SWITCH]: unlikely prefixes (nostart[]) and
 *     suffixes (noend[]), unlikely letter groups, words without a
 *     vowel or without a consonant, the okword[] exclusion list,
 *     isroman(), the typo[] list and certain single letters.  Hits
 *     are counted per word in the qword tree and printed as
 *     "Query word".
 *   - The user's typo tree (usertypo): "Query possible scanno".
 *   - With pswit[PARANOID_SWITCH] and warnings->digit: a standalone
 *     "0" or "1".
 * NOTE(review): the statements that set istypo, vowel, consonant and
 * alower, the word loop itself and all frees are not visible in this
 * listing.
 */
1545 void check_for_typos(const char *aline,struct warnings *warnings)
1547 const char *s,*t,*nt,*wordstart;
1549 gunichar *decomposition;
1551 int i,vowel,consonant,*dupcnt;
1552 gboolean isdup,istypo,alower;
1555 gsize decomposition_len;
1559 inword=getaword(&s);
1563 continue; /* don't bother with empty lines */
1565 if (mixdigit(inword))
1567 if (pswit[ECHO_SWITCH])
1568 g_print("\n%s\n",aline);
1569 if (!pswit[OVERVIEW_SWITCH])
1570 g_print(" Line %ld column %ld - Query digit in %s\n",
1571 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
1576 * Put the word through a series of tests for likely typos and OCR
1579 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
1583 for (t=inword;*t;t=g_utf8_next_char(t))
1585 c=g_utf8_get_char(t);
1586 nt=g_utf8_next_char(t);
1587 /* lowercase for testing */
1588 if (g_unichar_islower(c))
1590 if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
1593 * We have an uppercase mid-word. However, there are
1595 * Mac and Mc like McGill
1596 * French contractions like l'Abbe
/* NOTE(review): c has just tested as upper or title case, so the */
/* c=='m' comparisons below look as if they can never hold, which */
/* would make the Mc/Mac exemption ineffective - confirm. */
1598 offset=g_utf8_pointer_to_offset(inword,t);
1599 if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
1600 offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
1601 g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
1603 g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SQUOTE)
/* All list comparisons from here on use the case-folded copy. */
1609 testword=g_utf8_casefold(inword,-1);
1611 if (pswit[TYPO_SWITCH])
1614 * Check for certain unlikely two-letter combinations at word
1617 len=g_utf8_strlen(testword,-1);
1620 for (i=0;*nostart[i];i++)
1621 if (g_str_has_prefix(testword,nostart[i]))
1623 for (i=0;*noend[i];i++)
1624 if (g_str_has_suffix(testword,noend[i]))
1627 /* ght is common, gbt never. Like that. */
1628 if (strstr(testword,"cb"))
1630 if (strstr(testword,"gbt"))
1632 if (strstr(testword,"pbt"))
1634 if (strstr(testword,"tbs"))
1636 if (strstr(testword,"mrn"))
1638 if (strstr(testword,"ahle"))
1640 if (strstr(testword,"ihle"))
1643 * "TBE" does happen - like HEARTBEAT - but uncommon.
1644 * Also "TBI" - frostbite, outbid - but uncommon.
1645 * Similarly "ii" like Hawaii, or Pompeii, and in Roman
1646 * numerals, but "ii" is a common scanno.
1648 if (strstr(testword,"tbi"))
1650 if (strstr(testword,"tbe"))
1652 if (strstr(testword,"ii"))
1655 * Check for no vowels or no consonants.
1656 * If none, flag a typo.
1658 if (!istypo && len>1)
1661 for (t=testword;*t;t=g_utf8_next_char(t))
1663 c=g_utf8_get_char(t);
1665 g_unicode_canonical_decomposition(c,&decomposition_len);
1666 if (c=='y' || g_unichar_isdigit(c))
1668 /* Yah, this is loose. */
1672 else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
1676 g_free(decomposition);
1678 if (!vowel || !consonant)
1682 * Now exclude the word from being reported if it's in
1685 for (i=0;*okword[i];i++)
1686 if (!strcmp(testword,okword[i]))
1689 * What looks like a typo may be a Roman numeral.
1692 if (istypo && isroman(testword))
1694 /* Check the manual list of typos. */
1696 for (i=0;*typo[i];i++)
1697 if (!strcmp(testword,typo[i]))
1700 * Check lowercase s, l, i and m - special cases.
1701 * "j" - often a semi-colon gone wrong.
1702 * "d" for a missing apostrophe - he d
1705 if (!istypo && len==1 &&
1706 g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
/* qword maps a case-folded word to a heap-allocated occurrence counter. */
1710 dupcnt=g_tree_lookup(qword,testword);
1714 isdup=!pswit[VERBOSE_SWITCH];
1718 dupcnt=g_new0(int,1);
1719 g_tree_insert(qword,g_strdup(testword),dupcnt);
1724 if (pswit[ECHO_SWITCH])
1725 g_print("\n%s\n",aline);
1726 if (!pswit[OVERVIEW_SWITCH])
1728 g_print(" Line %ld column %ld - Query word %s",
1729 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
1731 if (!pswit[VERBOSE_SWITCH])
1732 g_print(" - not reporting duplicates");
1740 /* check the user's list of typos */
/* NOTE(review): this report and the standalone 0/1 report below add 2 */
/* to the offset where the other reports add 1 - confirm. */
1741 if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
1743 if (pswit[ECHO_SWITCH])
1744 g_print("\n%s\n",aline);
1745 if (!pswit[OVERVIEW_SWITCH])
1746 g_print(" Line %ld column %ld - Query possible scanno %s\n",
1747 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
1749 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
1751 if (pswit[PARANOID_SWITCH] && warnings->digit)
1753 /* In paranoid mode, query all 0 and 1 standing alone. */
1754 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
1756 if (pswit[ECHO_SWITCH])
1757 g_print("\n%s\n",aline);
1758 if (!pswit[OVERVIEW_SWITCH])
1759 g_print(" Line %ld column %ld - Query standalone %s\n",
1760 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
1771 * check_for_misspaced_punctuation:
1773 * Look for added or missing spaces around punctuation and quotes.
1774 * If there is a punctuation character like ! with no space on
1775 * either side, suspect a missing!space. If there are spaces on
1776 * both sides , assume a typo. If we see a double quote with no
1777 * space or punctuation on either side of it, assume unspaced
1778 * quotes "like"this.
/*
 * check_for_misspaced_punctuation (annotated):
 *
 * aline:       current line, NUL-terminated UTF-8.
 * parities:    dquote and squote flags, toggled across calls as quotes
 *              are met.
 * isemptyline: when TRUE the "Spaced punctuation?" reports are
 *              suppressed.
 *
 * The line is scanned several times.  The first four scans start at
 * the second character and keep the previous, current and next
 * characters in pc, c and nc.
 * NOTE(review): the statements that shift pc and c inside each loop,
 * the acronym/ellipsis bookkeeping (isacro, isellipsis) and the tests
 * on c in the later scans are not visible in this listing.
 * NOTE(review): isdigit() is called on gunichar values in the quote
 * parity scans; code points above the unsigned char range are not
 * valid isdigit() arguments - consider g_unichar_isdigit().
 */
1780 void check_for_misspaced_punctuation(const char *aline,
1781 struct parities *parities,gboolean isemptyline)
1783 gboolean isacro,isellipsis;
1785 gunichar c,nc,pc,n2c;
/* Scan 1: "Missing space?" and "Spaced punctuation?" around . ? ! , ; : _ */
1786 c=g_utf8_get_char(aline);
1787 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1788 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1792 nc=g_utf8_get_char(g_utf8_next_char(s));
1793 /* For each character in the line after the first. */
1794 if (g_utf8_strchr(".?!,;:_",-1,c)) /* if it's punctuation */
1796 /* we need to suppress warnings for acronyms like M.D. */
1798 /* we need to suppress warnings for ellipsis . . . */
1801 * If there are letters on both sides of it or
1802 * if it's strict punctuation followed by an alpha.
1804 if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
1805 g_utf8_strchr("?!,;:",-1,c)))
1809 if (g_utf8_pointer_to_offset(aline,s)>2 &&
1810 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
1812 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
1818 if (pswit[ECHO_SWITCH])
1819 g_print("\n%s\n",aline);
1820 if (!pswit[OVERVIEW_SWITCH])
1821 g_print(" Line %ld column %ld - Missing space?\n",
1822 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1827 if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
1830 * If there are spaces on both sides,
1831 * or space before and end of line.
1835 if (g_utf8_pointer_to_offset(aline,s)>2 &&
1836 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
1838 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
1842 if (!isemptyline && !isellipsis)
1844 if (pswit[ECHO_SWITCH])
1845 g_print("\n%s\n",aline);
1846 if (!pswit[OVERVIEW_SWITCH])
1847 g_print(" Line %ld column %ld - "
1848 "Spaced punctuation?\n",linecnt,
1849 g_utf8_pointer_to_offset(aline,s)+1);
/* Scan 2: ? ! , ; : preceded by a space but not followed by one. */
1856 /* Split out the characters that CANNOT be preceded by space. */
1857 c=g_utf8_get_char(aline);
1858 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1859 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1863 nc=g_utf8_get_char(g_utf8_next_char(s));
1864 /* for each character in the line after the first */
1865 if (g_utf8_strchr("?!,;:",-1,c))
1867 /* if it's punctuation that _cannot_ have a space before it */
1868 if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
1871 * If nc DOES == space,
1872 * it was already reported just above.
1874 if (pswit[ECHO_SWITCH])
1875 g_print("\n%s\n",aline);
1876 if (!pswit[OVERVIEW_SWITCH])
1877 g_print(" Line %ld column %ld - Spaced punctuation?\n",
1878 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Scan 3: a period that follows a space and precedes a letter. */
1885 * Special case " .X" where X is any alpha.
1886 * This plugs a hole in the acronym code above.
1887 * Inelegant, but maintainable.
1889 c=g_utf8_get_char(aline);
1890 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1891 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1895 nc=g_utf8_get_char(g_utf8_next_char(s));
1896 /* for each character in the line after the first */
1899 /* if it's a period */
1900 if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
1903 * If the period follows a space and
1904 * is followed by a letter.
1906 if (pswit[ECHO_SWITCH])
1907 g_print("\n%s\n",aline);
1908 if (!pswit[OVERVIEW_SWITCH])
1909 g_print(" Line %ld column %ld - Spaced punctuation?\n",
1910 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Scan 4: "Unspaced quotes?" - pc and nc are tested against separator sets. */
1916 c=g_utf8_get_char(aline);
1917 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1918 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1922 nc=g_utf8_get_char(g_utf8_next_char(s));
1923 /* for each character in the line after the first */
1926 if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
1927 !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
1928 !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
1930 if (pswit[ECHO_SWITCH])
1931 g_print("\n%s\n",aline);
1932 if (!pswit[OVERVIEW_SWITCH])
1933 g_print(" Line %ld column %ld - Unspaced quotes?\n",
1934 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Scan 5: toggles parities->dquote and tests the character after a quote. */
1940 /* Check parity of quotes. */
1941 nc=g_utf8_get_char(aline);
1942 for (s=aline;*s;s=g_utf8_next_char(s))
1945 nc=g_utf8_get_char(g_utf8_next_char(s));
1948 parities->dquote=!parities->dquote;
1949 if (!parities->dquote)
1952 if (!g_utf8_strchr("_-.'`/,;:!?)]} ",-1,nc))
1954 if (pswit[ECHO_SWITCH])
1955 g_print("\n%s\n",aline);
1956 if (!pswit[OVERVIEW_SWITCH])
1957 g_print(" Line %ld column %ld - "
1958 "Wrongspaced quotes?\n",
1959 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1967 if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
1968 !g_utf8_strchr("_-/.'`([{$",-1,nc) || !nc)
1970 if (pswit[ECHO_SWITCH])
1971 g_print("\n%s\n",aline);
1972 if (!pswit[OVERVIEW_SWITCH])
1973 g_print(" Line %ld column %ld - "
1974 "Wrongspaced quotes?\n",
1975 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* A double quote in column 1 directly followed by closing punctuation or a space. */
1982 if (g_utf8_get_char(aline)==CHAR_DQUOTE)
1984 if (g_utf8_strchr(",;:!?)]} ",-1,
1985 g_utf8_get_char(g_utf8_next_char(aline))))
1987 if (pswit[ECHO_SWITCH])
1988 g_print("\n%s\n",aline);
1989 if (!pswit[OVERVIEW_SWITCH])
1990 g_print(" Line %ld column 1 - Wrongspaced quotes?\n",
/* Single-quote parity, only with pswit[SQUOTE_SWITCH]. */
1996 if (pswit[SQUOTE_SWITCH])
1998 nc=g_utf8_get_char(aline);
1999 for (s=aline;*s;s=g_utf8_next_char(s))
2002 nc=g_utf8_get_char(g_utf8_next_char(s));
2003 if ((c==CHAR_SQUOTE || c==CHAR_OPEN_SQUOTE) && (s==aline ||
2005 !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
2006 !g_unichar_isalpha(nc)))
2008 parities->squote=!parities->squote;
2009 if (!parities->squote)
2012 if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
2014 if (pswit[ECHO_SWITCH])
2015 g_print("\n%s\n",aline);
2016 if (!pswit[OVERVIEW_SWITCH])
2017 g_print(" Line %ld column %ld - "
2018 "Wrongspaced singlequotes?\n",
2019 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2027 if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
2028 !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
2030 if (pswit[ECHO_SWITCH])
2031 g_print("\n%s\n",aline);
2032 if (!pswit[OVERVIEW_SWITCH])
2033 g_print(" Line %ld column %ld - "
2034 "Wrongspaced singlequotes?\n",
2035 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2046 * check_for_double_punctuation:
2048 * Look for double punctuation like ,. or ,,
2049 * Thanks to DW for the suggestion!
2050 * In books with references, ".," and ".;" are common
2051 * e.g. "etc., etc.," and vol. 1.; vol 3.;
2052 * OTOH, from my initial tests, there are also fairly
2053 * common errors. What to do? Make these cases paranoid?
2054 * ".," is the most common, so warnings->dotcomma is used
2055 * to suppress detailed reporting if it occurs often.
/*
 * check_for_double_punctuation (annotated):
 *
 * aline:    current line, NUL-terminated UTF-8.
 * warnings: warnings->dotcomma and warnings->isFrench are read.
 *
 * For every pair of adjacent characters that are both in ".?!,;:" a
 * "Double punctuation?" query is printed, except for:
 *   - a doubled . ? or !
 *   - ".," while warnings->dotcomma is clear
 *   - in French text, an ellipsis combined with , ; : ! or ?
 * NOTE(review): the update of c inside the loop and the adjustment of
 * s for the French combinations are not visible in this listing.
 */
2057 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
2061 nc=g_utf8_get_char(aline);
2062 for (s=aline;*s;s=g_utf8_next_char(s))
2065 nc=g_utf8_get_char(g_utf8_next_char(s));
2066 /* for each punctuation character in the line */
2067 if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
2068 g_utf8_strchr(".?!,;:",-1,nc))
2070 /* followed by punctuation, it's a query, unless . . . */
2071 if (c==nc && (c=='.' || c=='?' || c=='!') ||
2072 !warnings->dotcomma && c=='.' && nc==',' ||
2073 warnings->isFrench && g_str_has_prefix(s,",...") ||
2074 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2075 warnings->isFrench && g_str_has_prefix(s,";...") ||
2076 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2077 warnings->isFrench && g_str_has_prefix(s,":...") ||
2078 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2079 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2080 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2081 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2082 warnings->isFrench && g_str_has_prefix(s,"...?"))
/* The French combinations are tested a second time; nc is then re-read. */
2084 if (warnings->isFrench && g_str_has_prefix(s,",...") ||
2085 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2086 warnings->isFrench && g_str_has_prefix(s,";...") ||
2087 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2088 warnings->isFrench && g_str_has_prefix(s,":...") ||
2089 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2090 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2091 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2092 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2093 warnings->isFrench && g_str_has_prefix(s,"...?"))
2096 nc=g_utf8_get_char(g_utf8_next_char(s));
2098 ; /* do nothing for .. !! and ?? which can be legit */
2102 if (pswit[ECHO_SWITCH])
2103 g_print("\n%s\n",aline);
2104 if (!pswit[OVERVIEW_SWITCH])
2105 g_print(" Line %ld column %ld - Double punctuation?\n",
2106 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2115 * check_for_spaced_quotes:
/*
 * check_for_spaced_quotes (annotated):
 *
 * aline: current line, NUL-terminated UTF-8.
 *
 * Three loops report every quote character that has a space on both
 * sides: the double quote, the apostrophe and the backquote.  After a
 * hit the search resumes two characters past the start of the match.
 * The last two loops print the same "Spaced singlequote?" text.
 * NOTE(review): the declarations and the (re)initialisation of s
 * before each loop are not visible in this listing.
 */
2117 void check_for_spaced_quotes(const char *aline)
2121 while ((t=strstr(s," \" ")))
2123 if (pswit[ECHO_SWITCH])
2124 g_print("\n%s\n",aline);
2125 if (!pswit[OVERVIEW_SWITCH])
2126 g_print(" Line %ld column %ld - Spaced doublequote?\n",
2127 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2130 s=g_utf8_next_char(g_utf8_next_char(t));
2133 while ((t=strstr(s," ' ")))
2135 if (pswit[ECHO_SWITCH])
2136 g_print("\n%s\n",aline);
2137 if (!pswit[OVERVIEW_SWITCH])
2138 g_print(" Line %ld column %ld - Spaced singlequote?\n",
2139 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2142 s=g_utf8_next_char(g_utf8_next_char(t));
2145 while ((t=strstr(s," ` ")))
2147 if (pswit[ECHO_SWITCH])
2148 g_print("\n%s\n",aline);
2149 if (!pswit[OVERVIEW_SWITCH])
2150 g_print(" Line %ld column %ld - Spaced singlequote?\n",
2151 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2154 s=g_utf8_next_char(g_utf8_next_char(t));
2159 * check_for_miscased_genative:
2161 * Check special case of 'S instead of 's at end of word.
/*
 * check_for_miscased_genative (annotated):
 *
 * aline: current line, NUL-terminated UTF-8.
 *
 * Scans from the second character and queries an apostrophe that
 * follows a lower-case letter and is followed by a capital S.  The
 * offset is taken at the apostrophe and 2 is added, so the reported
 * column is that of the S.
 * NOTE(review): the declarations and the statements that shift pc and
 * c inside the loop are not visible in this listing.
 */
2163 void check_for_miscased_genative(const char *aline)
2169 c=g_utf8_get_char(aline);
2170 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2171 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2175 nc=g_utf8_get_char(g_utf8_next_char(s));
2176 if (c==CHAR_SQUOTE && nc=='S' && g_unichar_islower(pc))
2178 if (pswit[ECHO_SWITCH])
2179 g_print("\n%s\n",aline);
2180 if (!pswit[OVERVIEW_SWITCH])
2181 g_print(" Line %ld column %ld - Capital \"S\"?\n",
2182 linecnt,g_utf8_pointer_to_offset(aline,s)+2);
2190 * check_end_of_line:
2192 * Now check special cases - start and end of line -
2193 * for single and double quotes. Start is sometimes [sic]
2194 * but better to query it anyway.
2195 * While we're here, check for dash at end of line.
/*
 * check_end_of_line (annotated):
 *
 * aline:    current line, NUL-terminated UTF-8.
 * warnings: only warnings->hyphen is read in the lines visible here.
 *
 * Three checks:
 *   - on lines longer than one character, a quote as the last
 *     character (c1) examined together with the character before it
 *     (c2): "Spaced quote?" at the last column;
 *   - a single quote in column 1 followed by a space: "Spaced quote?"
 *     at column 1;
 *   - with pswit[PARANOID_SWITCH] and warnings->hyphen, a single '-'
 *     (not "--") as the last character above CHAR_SPACE.
 * NOTE(review): the second half of the first condition is not visible
 * in this listing.
 */
2197 void check_end_of_line(const char *aline,struct warnings *warnings)
2202 lbytes=strlen(aline);
2203 if (g_utf8_strlen(aline,lbytes)>1)
/* s is the last character of the line, c2 the one before it. */
2205 s=g_utf8_prev_char(aline+lbytes);
2206 c1=g_utf8_get_char(s);
2207 c2=g_utf8_get_char(g_utf8_prev_char(s));
2208 if ((c1==CHAR_DQUOTE || c1==CHAR_SQUOTE || c1==CHAR_OPEN_SQUOTE) &&
2211 if (pswit[ECHO_SWITCH])
2212 g_print("\n%s\n",aline);
2213 if (!pswit[OVERVIEW_SWITCH])
2214 g_print(" Line %ld column %ld - Spaced quote?\n",linecnt,
2215 g_utf8_strlen(aline,lbytes));
/* Now the first two characters of the line. */
2219 c1=g_utf8_get_char(aline);
2220 c2=g_utf8_get_char(g_utf8_next_char(aline));
2221 if ((c1==CHAR_SQUOTE || c1==CHAR_OPEN_SQUOTE) && c2==CHAR_SPACE)
2223 if (pswit[ECHO_SWITCH])
2224 g_print("\n%s\n",aline);
2225 if (!pswit[OVERVIEW_SWITCH])
2226 g_print(" Line %ld column 1 - Spaced quote?\n",linecnt);
2231 * Dash at end of line may well be legit - paranoid mode only
2232 * and don't report em-dash at line-end.
2234 if (pswit[PARANOID_SWITCH] && warnings->hyphen)
/* Step back over trailing characters at or below CHAR_SPACE. */
2236 for (s=g_utf8_prev_char(aline+lbytes);
2237 s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
2239 if (g_utf8_get_char(s)=='-' &&
2240 g_utf8_get_char(g_utf8_prev_char(s))!='-')
2242 if (pswit[ECHO_SWITCH])
2243 g_print("\n%s\n",aline);
2244 if (!pswit[OVERVIEW_SWITCH])
2245 g_print(" Line %ld column %ld - "
2246 "Hyphen at end of line?\n",
2247 linecnt,g_utf8_pointer_to_offset(aline,s));
2254 * check_for_unspaced_bracket:
2256 * Brackets are often unspaced, but shouldn't be surrounded by alpha.
2257 * If so, suspect a scanno like "a]most".
/*
 * check_for_unspaced_bracket (annotated):
 *
 * aline: current line, NUL-terminated UTF-8.
 *
 * Scans from the second character and queries any of { [ ( ) ] } that
 * has a letter directly before and directly after it.  The column is
 * printed without the +1 that most other checks add.
 * NOTE(review): the declarations and the statements that shift pc and
 * c inside the loop are not visible in this listing.
 */
2259 void check_for_unspaced_bracket(const char *aline)
2263 c=g_utf8_get_char(aline);
2264 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2265 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2269 nc=g_utf8_get_char(g_utf8_next_char(s));
2272 /* for each bracket character in the line except 1st & last */
2273 if (g_utf8_strchr("{[()]}",-1,c) &&
2274 g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
2276 if (pswit[ECHO_SWITCH])
2277 g_print("\n%s\n",aline);
2278 if (!pswit[OVERVIEW_SWITCH])
2279 g_print(" Line %ld column %ld - Unspaced bracket?\n",
2280 linecnt,g_utf8_pointer_to_offset(aline,s));
2288 * check_for_unpunctuated_endquote:
/*
 * check_for_unpunctuated_endquote (annotated):
 *
 * aline: current line, NUL-terminated UTF-8.
 *
 * Scans from the second character and queries a double quote that
 * directly follows a letter.  The column is printed without the +1
 * that most other checks add.
 * NOTE(review): isalpha() from <ctype.h> is called on pc, which holds
 * a g_utf8_get_char() result elsewhere in this file; code points above
 * the unsigned char range are not valid isalpha() arguments - consider
 * g_unichar_isalpha().  The declarations and the statements that
 * shift pc and c are not visible in this listing.
 */
2290 void check_for_unpunctuated_endquote(const char *aline)
2294 c=g_utf8_get_char(aline);
2295 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2296 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2300 nc=g_utf8_get_char(g_utf8_next_char(s));
2301 /* for each character in the line except 1st */
2302 if (c==CHAR_DQUOTE && isalpha(pc))
2304 if (pswit[ECHO_SWITCH])
2305 g_print("\n%s\n",aline);
2306 if (!pswit[OVERVIEW_SWITCH])
2307 g_print(" Line %ld column %ld - "
2308 "endquote missing punctuation?\n",
2309 linecnt,g_utf8_pointer_to_offset(aline,s));
2317 * check_for_html_tag:
2319 * Check for <HTML TAG>.
2321 * If there is a < in the line, followed at some point
2322 * by a > then we suspect HTML.
/*
 * check_for_html_tag (annotated):
 *
 * aline: current line, NUL-terminated UTF-8.
 *
 * Finds the first '<' on the line and the first '>' after it.  The
 * text from '<' to '>' inclusive is copied with g_strndup() and
 * printed with the 1-based character offset of the '<'.
 * NOTE(review): the tests on open and close, the declaration of tag
 * and its release are not visible in this listing.
 */
2324 void check_for_html_tag(const char *aline)
2326 const char *open,*close;
2328 open=strchr(aline,'<');
2331 close=strchr(g_utf8_next_char(open),'>');
2334 if (pswit[ECHO_SWITCH])
2335 g_print("\n%s\n",aline);
2336 if (!pswit[OVERVIEW_SWITCH])
2338 tag=g_strndup(open,close-open+1);
2339 g_print(" Line %ld column %ld - HTML Tag? %s \n",
2340 linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
2350 * check_for_html_entity:
2352 * Check for &symbol; HTML.
2354 * If there is a & in the line, followed at
2355 * some point by a ; then we suspect HTML.
/*
 * check_for_html_entity (annotated):
 *
 * aline: current line, NUL-terminated UTF-8.
 *
 * Finds the first '&' on the line and the first ';' at or after it,
 * then walks the characters in between and stops early at a space.
 * The text from '&' to ';' inclusive is copied with g_strndup() and
 * printed as a suspected HTML symbol.
 * NOTE(review): the column is computed from the byte distance
 * (amp-aline), whereas the other checks use
 * g_utf8_pointer_to_offset(); on lines with multi-byte characters
 * before the '&' the two differ - confirm which is intended.  The
 * tests on amp and scolon, the test after the loop and the release of
 * entity are not visible in this listing.
 */
2357 void check_for_html_entity(const char *aline)
2359 const char *s,*amp,*scolon;
2361 amp=strchr(aline,'&');
2364 scolon=strchr(amp,';');
2367 for (s=amp;s<scolon;s=g_utf8_next_char(s))
2368 if (g_utf8_get_char(s)==CHAR_SPACE)
2369 break; /* Don't report "Jones & Son;" */
2372 if (pswit[ECHO_SWITCH])
2373 g_print("\n%s\n",aline);
2374 if (!pswit[OVERVIEW_SWITCH])
2376 entity=g_strndup(amp,scolon-amp+1);
2377 g_print(" Line %ld column %d - HTML symbol? %s \n",
2378 linecnt,(int)(amp-aline)+1,entity);
2391 * If we are in a state of unbalanced quotes, and this line
2392 * doesn't begin with a quote, output the stored error message.
2393 * If the -P switch was used, print the warning even if the
2394 * new para starts with quotes.
/*
 * print_pending (annotated):
 *
 * aline:     current line; NOTE(review): the lines that derive s from
 *            it are not visible in this listing.
 * parastart: line echoed in front of a message when pswit[ECHO_SWITCH]
 *            is set.
 * pending:   stored messages; every non-NULL field visible here
 *            (dquote, squote, rbrack, sbrack, cbrack, unders) is
 *            released with g_free() and reset to NULL.
 *
 * The double-quote message is printed when c, the character read at
 * s, is not a double quote or when pswit[QPARA_SWITCH] is set; the
 * single-quote message follows a similar test whose last part is not
 * visible.  The bracket and underscore messages have no such test in
 * the lines visible here.  Nothing is printed while
 * pswit[OVERVIEW_SWITCH] is set.
 */
2396 void print_pending(const char *aline,const char *parastart,
2397 struct pending *pending)
2404 c=g_utf8_get_char(s);
2405 if (pending->dquote)
2407 if (c!=CHAR_DQUOTE || pswit[QPARA_SWITCH])
2409 if (!pswit[OVERVIEW_SWITCH])
2411 if (pswit[ECHO_SWITCH])
2412 g_print("\n%s\n",parastart);
2413 g_print("%s\n",pending->dquote);
2418 g_free(pending->dquote);
2419 pending->dquote=NULL;
2421 if (pending->squote)
2423 if (c!=CHAR_SQUOTE && c!=CHAR_OPEN_SQUOTE || pswit[QPARA_SWITCH] ||
2426 if (!pswit[OVERVIEW_SWITCH])
2428 if (pswit[ECHO_SWITCH])
2429 g_print("\n%s\n",parastart);
2430 g_print("%s\n",pending->squote);
2435 g_free(pending->squote);
2436 pending->squote=NULL;
2438 if (pending->rbrack)
2440 if (!pswit[OVERVIEW_SWITCH])
2442 if (pswit[ECHO_SWITCH])
2443 g_print("\n%s\n",parastart);
2444 g_print("%s\n",pending->rbrack);
2448 g_free(pending->rbrack);
2449 pending->rbrack=NULL;
2451 if (pending->sbrack)
2453 if (!pswit[OVERVIEW_SWITCH])
2455 if (pswit[ECHO_SWITCH])
2456 g_print("\n%s\n",parastart);
2457 g_print("%s\n",pending->sbrack);
2461 g_free(pending->sbrack);
2462 pending->sbrack=NULL;
2464 if (pending->cbrack)
2466 if (!pswit[OVERVIEW_SWITCH])
2468 if (pswit[ECHO_SWITCH])
2469 g_print("\n%s\n",parastart);
2470 g_print("%s\n",pending->cbrack);
2474 g_free(pending->cbrack);
2475 pending->cbrack=NULL;
2477 if (pending->unders)
2479 if (!pswit[OVERVIEW_SWITCH])
2481 if (pswit[ECHO_SWITCH])
2482 g_print("\n%s\n",parastart);
2483 g_print("%s\n",pending->unders);
2487 g_free(pending->unders);
2488 pending->unders=NULL;
2493 * check_for_mismatched_quotes:
2495 * At end of paragraph, check for mismatched quotes.
2497 * We don't want to report an error immediately, since it is a
2498 * common convention to omit the quotes at end of paragraph if
2499 * the next paragraph is a continuation of the same speaker.
2500 * Where this is the case, the next para should begin with a
2501 * quote, so we store the warning message and only display it
2502 * at the top of the next iteration if the new para doesn't
2503 * start with a quote.
2504 * The -p switch overrides this default, and warns of unclosed
2505 * quotes on _every_ paragraph, whether the next begins with a quote or not.
/*
 * check_for_mismatched_quotes (annotated):
 *
 * counters: per-paragraph tallies; quot, open_single_quote,
 *           close_single_quote, r_brack, s_brack, c_brack and c_unders
 *           are read.
 * pending:  NOTE(review): presumably receives the strings built below
 *           (the left-hand sides of the g_strdup_printf() calls are
 *           not visible in this listing) - confirm.
 *
 * A message is built with g_strdup_printf() when:
 *   - the double-quote count is odd;
 *   - with pswit[SQUOTE_SWITCH], there are opening single quotes and
 *     their number differs from the closing ones;
 *   - a round, square or curly bracket counter is non-zero;
 *   - the underscore count is odd.
 * When the single-quote counts differ by something other than one
 * extra opening quote, a flag is set so the message is shown whatever
 * the next paragraph starts with (that statement is not visible here).
 */
2508 void check_for_mismatched_quotes(const struct counters *counters,
2509 struct pending *pending)
2511 if (counters->quot%2)
2513 g_strdup_printf(" Line %ld - Mismatched quotes",linecnt);
2514 if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&
2515 counters->open_single_quote!=counters->close_single_quote)
2517 g_strdup_printf(" Line %ld - Mismatched singlequotes?",linecnt);
2518 if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&
2519 counters->open_single_quote!=counters->close_single_quote &&
2520 counters->open_single_quote!=counters->close_single_quote+1)
2522 * Flag it to be noted regardless of the
2523 * first char of the next para.
2526 if (counters->r_brack)
2528 g_strdup_printf(" Line %ld - Mismatched round brackets?",linecnt);
2529 if (counters->s_brack)
2531 g_strdup_printf(" Line %ld - Mismatched square brackets?",linecnt);
2532 if (counters->c_brack)
2534 g_strdup_printf(" Line %ld - Mismatched curly brackets?",linecnt);
2535 if (counters->c_unders%2)
2537 g_strdup_printf(" Line %ld - Mismatched underscores?",linecnt);
2541 * check_for_omitted_punctuation:
2543 * Check for omitted punctuation at end of paragraph by working back
2544 * through prevline. DW.
2545 * Need to check this only for "normal" paras.
2546 * So what is a "normal" para?
2547 * Not normal if one-liner (chapter headings, etc.)
2548 * Not normal if doesn't contain at least one locase letter
2549 * Not normal if starts with space
/*
 * check_for_omitted_punctuation (annotated):
 *
 * prevline:        last line of the paragraph that just ended.
 * last:            properties of that line; only last->blen is read in
 *                  the lines visible here.
 * start_para_line: line number on which the paragraph started.
 *
 * The check runs when prevline contains a letter, last->blen exceeds
 * 2, the paragraph started before line linecnt-1 and prevline does not
 * begin with a character at or below CHAR_SPACE.  It then skips
 * trailing double and single quotes and walks backwards: meeting a
 * letter first produces "No punctuation at para end?", while meeting
 * one of - . : ! ( [ { ? } ] ) first is tested separately.
 * NOTE(review): what follows that punctuation test (presumably leaving
 * the loop) is not visible in this listing.
 */
2551 void check_for_omitted_punctuation(const char *prevline,
2552 struct line_properties *last,int start_para_line)
2554 gboolean letter_on_line=FALSE;
2556 for (s=prevline;*s;s=g_utf8_next_char(s))
2557 if (g_unichar_isalpha(g_utf8_get_char(s)))
2559 letter_on_line=TRUE;
2563 * This next "if" is a problem.
2564 * If we say "start_para_line <= linecnt - 1", that includes
2565 * one-line "paragraphs" like chapter heads. Lotsa false positives.
2566 * If we say "start_para_line < linecnt - 1" it doesn't, but then it
2567 * misses genuine one-line paragraphs.
2569 if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
2570 g_utf8_get_char(prevline)>CHAR_SPACE)
/* Start at the last character and step back over closing quotes. */
2572 for (s=g_utf8_prev_char(prevline+strlen(prevline));
2573 (g_utf8_get_char(s)==CHAR_DQUOTE ||
2574 g_utf8_get_char(s)==CHAR_SQUOTE) &&
2575 g_utf8_get_char(s)>CHAR_SPACE && s>prevline;
2576 s=g_utf8_prev_char(s))
2578 for (;s>prevline;s=g_utf8_prev_char(s))
2580 if (g_unichar_isalpha(g_utf8_get_char(s)))
2582 if (pswit[ECHO_SWITCH])
2583 g_print("\n%s\n",prevline);
2584 if (!pswit[OVERVIEW_SWITCH])
2585 g_print(" Line %ld column %ld - "
2586 "No punctuation at para end?\n",
2587 linecnt-1,g_utf8_strlen(prevline,-1));
2592 if (g_utf8_strchr("-.:!([{?}])",-1,g_utf8_get_char(s)))
/*
 * report_duplicate_queries (annotated):
 *
 * key:   the queried word, used as a C string.
 * value: NOTE(review): presumably the occurrence counter stored with
 *        the word - its use is not visible in this listing.
 * data:  not used in the lines visible here.
 *
 * Prints a note saying how many times a queried word was duplicated.
 * NOTE(review): the (key, value, data) signature suggests a tree
 * traversal callback; the return value and the arguments of g_print()
 * are not visible in this listing.
 */
2598 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
2600 const char *word=key;
2603 g_print("\nNote: Queried word %s was duplicated %d times\n",
2608 void print_as_windows_1252(const char *string)
2610 gsize inbytes,outbytes;
2612 GIConv converter=(GIConv)-1;
2615 if (converter!=(GIConv)-1)
2616 g_iconv_close(converter);
2617 converter=(GIConv)-1;
2620 if (converter=(GIConv)-1)
2621 converter=g_iconv_open("WINDOWS-1252","UTF-8");
2622 if (converter!=(GIConv)-1)
2624 inbytes=outbytes=strlen(string);
2625 bp=buf=g_malloc(outbytes+1);
2626 g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
2632 fputs(string,stdout);
/* print_as_utf_8: write string to stdout unchanged; no newline is added. */
2635 void print_as_utf_8(const char *string)
2637 fputs(string,stdout);
/*
 * Outline of procfile(): read the text named by filename, run first_pass()
 * and report_first_pass(), create the qword and qperiod trees, then fetch
 * lines one at a time with flgets() and hand each to the check_*()
 * routines. Afterwards report duplicated queried words (unless the
 * overview switch is set), release both trees, reset the print handler and
 * call print_as_windows_1252(NULL).
 *
 * filename - passed to read_etext(); also shown in the "File:" banner and
 *            in the read-error message.
 */
2645 void procfile(const char *filename)
2648 gchar *parastart=NULL; /* first line of current para */
2649 gchar *etext,*aline;
2652 struct first_pass_results *first_pass_results;
2653 struct warnings *warnings;
2654 struct counters counters={0};
2655 struct line_properties last={0};
2656 struct parities parities={0};
2657 struct pending pending={0};
2658 gboolean isemptyline;
2659 long start_para_line=0;
2660 gboolean isnewpara=FALSE,enddash=FALSE;
2661 last.start=CHAR_SPACE;
2662 linecnt=checked_linecnt=0;
2663 etext=read_etext(filename,&err);
/* The same error message has two destinations; STDOUT_SWITCH selects stdout. */
2666 if (pswit[STDOUT_SWITCH])
2667 fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
2669 fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
2672 g_print("\n\nFile: %s\n\n",filename);
2673 first_pass_results=first_pass(etext);
2674 warnings=report_first_pass(first_pass_results);
/*
 * Both trees order their keys with strcmp and free keys with g_free;
 * only qword also frees its values.
 */
2675 qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
2676 qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
2678 * Here we go with the main pass. Hold onto yer hat!
2682 while ((aline=flgets(&etext_ptr,linecnt+1)))
2687 if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
2688 continue; // skip DP page separators completely
/*
 * Lines before firstline, or after a positive footerline, are not checked.
 * Under HEADER_SWITCH the Title/Author/Release Date/Edition lines among
 * them are echoed.
 */
2689 if (linecnt<first_pass_results->firstline ||
2690 (first_pass_results->footerline>0 &&
2691 linecnt>first_pass_results->footerline))
2693 if (pswit[HEADER_SWITCH])
2695 if (g_str_has_prefix(aline,"Title:"))
2696 g_print(" %s\n",aline);
2697 if (g_str_has_prefix(aline,"Author:"))
2698 g_print(" %s\n",aline);
2699 if (g_str_has_prefix(aline,"Release Date:"))
2700 g_print(" %s\n",aline);
2701 if (g_str_has_prefix(aline,"Edition:"))
2702 g_print(" %s\n\n",aline);
2704 continue; /* skip through the header */
/* Pass the pending data to print_pending(), then clear it for this line. */
2707 print_pending(aline,parastart,&pending);
2708 memset(&pending,0,sizeof(pending));
2709 isemptyline=analyse_quotes(aline,&counters);
2710 if (isnewpara && !isemptyline)
2712 /* This line is the start of a new paragraph. */
2713 start_para_line=linecnt;
2714 /* Capture its first line in case we want to report it later. */
2716 parastart=g_strdup(aline);
2717 memset(&parities,0,sizeof(parities)); /* restart the quote count */
/* Step over leading characters that are neither letters nor digits. */
2719 while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
2720 !g_unichar_isdigit(g_utf8_get_char(s)))
2721 s=g_utf8_next_char(s);
2722 if (g_unichar_islower(g_utf8_get_char(s)))
2724 /* and its first letter is lowercase */
2725 if (pswit[ECHO_SWITCH])
2726 g_print("\n%s\n",aline);
2727 if (!pswit[OVERVIEW_SWITCH])
2728 g_print(" Line %ld column %ld - "
2729 "Paragraph starts with lower-case\n",
2730 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2734 isnewpara=FALSE; /* Signal the end of new para processing. */
2736 /* Check for an em-dash broken at line end. */
2737 if (enddash && g_utf8_get_char(aline)=='-')
2739 if (pswit[ECHO_SWITCH])
2740 g_print("\n%s\n",aline);
2741 if (!pswit[OVERVIEW_SWITCH])
2742 g_print(" Line %ld column 1 - Broken em-dash?\n",linecnt);
/* Walk back over trailing spaces to reach the last significant character. */
2747 for (s=g_utf8_prev_char(aline+strlen(aline));
2748 g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
2750 if (s>=aline && g_utf8_get_char(s)=='-')
2752 check_for_control_characters(aline);
2754 check_for_odd_characters(aline,warnings,isemptyline);
/* The long-line, short-line and end-quote checks run only when enabled in warnings. */
2755 if (warnings->longline)
2756 check_for_long_line(aline);
2757 if (warnings->shortline)
2758 check_for_short_line(aline,&last);
/* Record this line's length (in characters) and first character in last. */
2760 last.len=g_utf8_strlen(aline,-1);
2761 last.start=g_utf8_get_char(aline);
2762 check_for_starting_punctuation(aline);
2765 check_for_spaced_emdash(aline);
2766 check_for_spaced_dash(aline);
2768 check_for_unmarked_paragraphs(aline);
2769 check_for_jeebies(aline);
2770 check_for_mta_from(aline);
2771 check_for_orphan_character(aline);
2772 check_for_pling_scanno(aline);
2773 check_for_extra_period(aline,warnings);
2774 check_for_following_punctuation(aline);
2775 check_for_typos(aline,warnings);
2776 check_for_misspaced_punctuation(aline,&parities,isemptyline);
2777 check_for_double_punctuation(aline,warnings);
2778 check_for_spaced_quotes(aline);
2779 check_for_miscased_genative(aline);
2780 check_end_of_line(aline,warnings);
2781 check_for_unspaced_bracket(aline);
2782 if (warnings->endquote)
2783 check_for_unpunctuated_endquote(aline);
2784 check_for_html_tag(aline);
2785 check_for_html_entity(aline);
2788 check_for_mismatched_quotes(&counters,&pending);
2789 memset(&counters,0,sizeof(counters));
2790 /* let the next iteration know that it's starting a new para */
2793 check_for_omitted_punctuation(prevline,&last,start_para_line);
/* Keep a private copy of this line for use as prevline. */
2796 prevline=g_strdup(aline);
/* End of the main pass: report duplicated queries, then release resources. */
2806 if (!pswit[OVERVIEW_SWITCH])
2807 g_tree_foreach(qword,report_duplicate_queries,NULL);
2808 g_tree_unref(qword);
2809 g_tree_unref(qperiod);
2810 g_set_print_handler(NULL);
/* NOTE(review): the NULL call looks like a "release the converter" request - confirm. */
2811 print_as_windows_1252(NULL);
2812 if (pswit[MARKUP_SWITCH])
2819 * Get one line from the input text, checking for
2820 * the existence of exactly one CR/LF line-end per line.
2822 * Returns: a pointer to the line.
/*
 * etext - address of the current read position; it is advanced past each
 *         character consumed here.
 * lcnt  - line number quoted in the line-ending diagnostics.
 *
 * The diagnostics below are printed only under LINE_END_SWITCH. The line
 * is finally run through postprocess_for_HTML() and/or
 * postprocess_for_DP() when the corresponding switch is set.
 */
2824 char *flgets(char **etext,long lcnt)
2827 gboolean isCR=FALSE;
2828 char *theline=*etext;
/* Decode one character and move the caller's pointer beyond it. */
2833 c=g_utf8_get_char(*etext);
2834 *etext=g_utf8_next_char(*etext);
2837 /* either way, it's end of line */
2844 /* Error - a LF without a preceding CR */
2845 if (pswit[LINE_END_SWITCH])
2847 if (pswit[ECHO_SWITCH])
/* Echo only this line: copy the bytes between theline and eos. */
2849 s=g_strndup(theline,eos-theline);
2850 g_print("\n%s\n",s);
2853 if (!pswit[OVERVIEW_SWITCH])
2854 g_print(" Line %ld - No CR?\n",lcnt);
2865 /* Error - two successive CRs */
2866 if (pswit[LINE_END_SWITCH])
2868 if (pswit[ECHO_SWITCH])
2870 s=g_strndup(theline,eos-theline);
2871 g_print("\n%s\n",s);
2874 if (!pswit[OVERVIEW_SWITCH])
2875 g_print(" Line %ld - Two successive CRs?\n",lcnt);
/* isCR is still set here, so a CR was seen but not followed by a LF. */
2884 if (pswit[LINE_END_SWITCH] && isCR)
2886 if (pswit[ECHO_SWITCH])
2888 s=g_strndup(theline,eos-theline);
2889 g_print("\n%s\n",s);
2892 if (!pswit[OVERVIEW_SWITCH])
2893 g_print(" Line %ld column %ld - CR without LF?\n",
2894 lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
2900 eos=g_utf8_next_char(eos);
/* Optional markup removal, selected by switches. */
2904 if (pswit[MARKUP_SWITCH])
2905 postprocess_for_HTML(theline);
2906 if (pswit[DP_SWITCH])
2907 postprocess_for_DP(theline);
2914 * Takes a "word" as a parameter, and checks whether it
2915 * contains a mixture of alpha and digits. Generally, this is an
2916 * error, but may not be for cases like 4th or L5 12s. 3d.
2918 * Returns: TRUE iff an error is found.
/* checkword - the NUL-terminated UTF-8 word to classify. */
2920 gboolean mixdigit(const char *checkword)
2922 gboolean wehaveadigit,wehavealetter,query;
2923 const char *s,*nondigit;
2924 wehaveadigit=wehavealetter=query=FALSE;
/* Classify every character of the word as alphabetic or digit. */
2925 for (s=checkword;*s;s=g_utf8_next_char(s))
2926 if (g_unichar_isalpha(g_utf8_get_char(s)))
2928 else if (g_unichar_isdigit(g_utf8_get_char(s)))
2930 if (wehaveadigit && wehavealetter)
2932 /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
/* Move nondigit past the leading digits to the suffix that follows them. */
2934 for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
2935 nondigit=g_utf8_next_char(nondigit))
2937 /* digits, ending in st, rd, nd, th of either case */
2938 if (!g_ascii_strcasecmp(nondigit,"st") ||
2939 !g_ascii_strcasecmp(nondigit,"rd") ||
2940 !g_ascii_strcasecmp(nondigit,"nd") ||
2941 !g_ascii_strcasecmp(nondigit,"th"))
/* the same ordinals with a plural s: sts, rds, nds, ths */
2943 if (!g_ascii_strcasecmp(nondigit,"sts") ||
2944 !g_ascii_strcasecmp(nondigit,"rds") ||
2945 !g_ascii_strcasecmp(nondigit,"nds") ||
2946 !g_ascii_strcasecmp(nondigit,"ths"))
/* ordinal adverbs: stly, rdly, ndly, thly */
2948 if (!g_ascii_strcasecmp(nondigit,"stly") ||
2949 !g_ascii_strcasecmp(nondigit,"rdly") ||
2950 !g_ascii_strcasecmp(nondigit,"ndly") ||
2951 !g_ascii_strcasecmp(nondigit,"thly"))
2953 /* digits, ending in l, L, s or d */
/* Note: "l" is compared without regard to case; "s" and "d" must be lower-case. */
2954 if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
2955 !strcmp(nondigit,"d"))
2958 * L at the start of a number, representing Britsh pounds, like L500.
2959 * This is cute. We know the current word is mixed digit. If the first
2960 * letter is L, there must be at least one digit following. If both
2961 * digits and letters follow, we have a genuine error, else we have a
2962 * capital L followed by digits, and we accept that as a non-error.
2964 if (g_utf8_get_char(checkword)=='L' &&
2965 !mixdigit(g_utf8_next_char(checkword)))
2974 * Extracts the first/next "word" from the line, and returns it.
2975 * A word is defined as one English word unit--or at least that's the aim.
2976 * "ptr" is advanced to the position in the line where we will start
2977 * looking for the next word.
2979 * Returns: A newly-allocated string.
/* ptr - address of the scan position within the line; updated in place. */
2981 gchar *getaword(const char **ptr)
2986 word=g_string_new(NULL);
/* Skip anything that is neither a letter nor a digit, stopping at the NUL. */
2987 for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
2988 !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
2989 **ptr;*ptr=g_utf8_next_char(*ptr))
2992 * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
2993 * Especially yucky is the case of L1,000
2994 * This section looks for a pattern of characters including a digit
2995 * followed by a comma or period followed by one or more digits.
2996 * If found, it returns this whole pattern as a word; otherwise we discard
2997 * the results and resume our normal programming.
/* Gather a candidate made of letters, digits, commas and periods. */
3000 for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
3001 g_unichar_isalpha(g_utf8_get_char(s)) ||
3002 g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
3003 g_string_append_unichar(word,g_utf8_get_char(s));
/*
 * Examine the candidate from its second character up to, but not
 * including, its last one, looking for '.' or ',' right after a digit.
 */
3004 for (t=g_utf8_next_char(word->str);*g_utf8_next_char(t);
3005 t=g_utf8_next_char(t))
3007 c=g_utf8_get_char(t);
3008 pc=g_utf8_get_char(g_utf8_prev_char(t));
3009 if ((c=='.' || c==',') && g_unichar_isdigit(pc))
/* FALSE keeps the character data: the caller receives it and must g_free() it. */
3012 return g_string_free(word,FALSE);
3015 /* we didn't find a punctuated number - do the regular getword thing */
3016 g_string_truncate(word,0);
/* A regular word is a run of letters, digits and apostrophes. */
3017 for (;g_unichar_isdigit(g_utf8_get_char(*ptr)) ||
3018 g_unichar_isalpha(g_utf8_get_char(*ptr)) ||
3019 g_utf8_get_char(*ptr)=='\'';*ptr=g_utf8_next_char(*ptr))
3020 g_string_append_unichar(word,g_utf8_get_char(*ptr));
3021 return g_string_free(word,FALSE);
3027 * Is this word a Roman Numeral?
3029 * It doesn't actually validate that the number is a valid Roman Numeral--for
3030 * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
3031 * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
3032 * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
3033 * expressions thereof, except when it came to taxes. Allow any number of M,
3034 * an optional D, an optional CM or CD, any number of optional Cs, an optional
3035 * XL or an optional XC, an optional IX or IV, an optional V and any number
/*
 * t - the word to test. Only lower-case numeral letters are matched below.
 * NOTE(review): callers presumably pass lower-cased text - confirm.
 * The components are tried once each, in the fixed order shown.
 */
3038 gboolean isroman(const char *t)
/* thousands, then the five-hundreds and hundreds group */
3044 while (g_utf8_get_char(t)=='m' && *t)
3046 if (g_utf8_get_char(t)=='d')
3048 if (g_str_has_prefix(t,"cm"))
3050 if (g_str_has_prefix(t,"cd"))
3052 while (g_utf8_get_char(t)=='c' && *t)
/* tens group */
3054 if (g_str_has_prefix(t,"xl"))
3056 if (g_str_has_prefix(t,"xc"))
3058 if (g_utf8_get_char(t)=='l')
3060 while (g_utf8_get_char(t)=='x' && *t)
/* units group */
3062 if (g_str_has_prefix(t,"ix"))
3064 if (g_str_has_prefix(t,"iv"))
3066 if (g_utf8_get_char(t)=='v')
3068 while (g_utf8_get_char(t)=='i' && *t)
3074 * postprocess_for_DP:
3076 * Invoked with the -d switch from flgets().
3077 * It simply "removes" from the line a hard-coded set of common
3078 * DP-specific tags, so that the line passed to the main routine has
3079 * been pre-cleaned of DP markup.
/* theline - the line to clean; it is modified in place. */
3081 void postprocess_for_DP(char *theline)
/* DPmarkup[] is walked until its first empty string. */
3087 for (i=0;*DPmarkup[i];i++)
/* Remove every occurrence of this tag, not only the first. */
3088 while ((s=strstr(theline,DPmarkup[i])))
/* Close the gap: slide the remainder, terminating NUL included, over the tag. */
3090 t=s+strlen(DPmarkup[i]);
3091 memmove(s,t,strlen(t)+1);
3096 * postprocess_for_HTML:
3098 * Invoked with the -m switch from flgets().
3099 * It simply "removes" from the line a hard-coded set of common
3100 * HTML tags and "replaces" a hard-coded set of common HTML
3101 * entities, so that the line passed to the main routine has
3102 * been pre-cleaned of HTML.
/* theline - the line to clean; passed to losemarkup() and loseentities(). */
3104 void postprocess_for_HTML(char *theline)
/* Call losemarkup() repeatedly until it returns a null pointer. */
3106 while (losemarkup(theline))
/* Then deal with the character entities. */
3108 loseentities(theline);
/*
 * losemarkup: find the first '<' in theline and the first '>' after it.
 * When the text following '<' matches an entry of markup[] (as judged by
 * tagcomp()), everything from '<' through '>' is cut out of the line in
 * place.
 *
 * postprocess_for_HTML() keeps calling this until the result is null.
 */
3111 char *losemarkup(char *theline)
3115 s=strchr(theline,'<');
/* Only look for '>' when a '<' was found. */
3116 t=s?strchr(s,'>'):NULL;
/* markup[] is walked until its first empty string. */
3119 for (i=0;*markup[i];i++)
3120 if (tagcomp(g_utf8_next_char(s),markup[i]))
/* Slide the text after '>' (terminating NUL included) down onto '<'. */
3122 t=g_utf8_next_char(t);
3123 memmove(s,t,strlen(t)+1);
3126 /* It's an unrecognized <xxx>. */
3130 void loseentities(char *theline)
3137 GTree *entities=NULL;
3138 GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
3142 g_tree_destroy(entities);
3144 if (translit==(GIConv)-1)
3145 g_iconv_close(translit);
3146 translit=(GIConv)-1;
3147 if (to_utf8==(GIConv)-1)
3148 g_iconv_close(to_utf8);
3156 entities=g_tree_new((GCompareFunc)strcmp);
3157 for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
3158 g_tree_insert(entities,HTMLentities[i].name,
3159 GUINT_TO_POINTER(HTMLentities[i].c));
3161 if (translit==(GIConv)-1)
3162 translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
3163 if (to_utf8==(GIConv)-1)
3164 to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
3165 while((amp=strchr(theline,'&')))
3167 scolon=strchr(amp,';');
3172 if (amp+2+strspn(amp+2,"0123456789")==scolon)
3173 c=strtol(amp+2,NULL,10);
3174 else if (amp[2]=='x' &&
3175 amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
3176 c=strtol(amp+3,NULL,16);
3180 s=g_strndup(amp+1,scolon-(amp+1));
3181 c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
3190 if (c<128 || c>=192 && c<=255) /* An ISO-8859-1 character */
3191 theline+=g_unichar_to_utf8(c,theline);
3195 nb=g_unichar_to_utf8(c,s);
3196 t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
3198 s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
3200 memcpy(theline,s,nb);
3204 memmove(theline,g_utf8_next_char(scolon),
3205 strlen(g_utf8_next_char(scolon))+1);
3208 theline=g_utf8_next_char(amp);
/*
 * tagcomp: case-insensitive test of whether strin begins with basetag.
 * A single leading '/' on strin is skipped first, so a closing tag is
 * compared like its opening counterpart. Because this is a prefix test,
 * any text after the tag name in strin is ignored.
 */
3212 gboolean tagcomp(const char *strin,const char *basetag)
3216 if (g_utf8_get_char(strin)=='/')
3217 t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
3219 t=g_utf8_casefold(strin,-1);
/* Both sides are case-folded copies, so the comparison ignores case. */
3220 s=g_utf8_casefold(basetag,-1);
3221 retval=g_str_has_prefix(t,s);
3227 void proghelp(GOptionContext *context)
3230 fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
3231 fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
3232 fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
3233 fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
3234 "For details, read the file COPYING.\n",stderr);
3235 fputs("This is Free Software; "
3236 "you may redistribute it under certain conditions (GPL);\n",stderr);
3237 fputs("read the file COPYING for details.\n\n",stderr);
3238 help=g_option_context_get_help(context,TRUE,NULL);
3241 fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
3242 fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
3243 "non-ASCII\n",stderr);
3244 fputs("characters like accented letters, "
3245 "lines longer than 75 or shorter than 55,\n",stderr);
3246 fputs("unbalanced quotes or brackets, "
3247 "a variety of badly formatted punctuation, \n",stderr);
3248 fputs("HTML tags, some likely typos. "
3249 "It is NOT a substitute for human judgement.\n",stderr);