1 /*************************************************************************/
2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com> */
5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk> */
7 /* This program is free software; you can redistribute it and/or modify */
8 /* it under the terms of the GNU General Public License as published by */
9 /* the Free Software Foundation; either version 2 of the License, or */
10 /* (at your option) any later version. */
12 /* This program is distributed in the hope that it will be useful, */
13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
15 /* GNU General Public License for more details. */
17 /* You should have received a copy of the GNU General Public License */
18 /* along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 /*************************************************************************/
27 #include "HTMLentities.h"
33 "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
34 "nad", "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa",
35 "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
36 "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
37 "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
38 "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
39 "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
40 "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
41 "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
42 "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
43 "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
44 "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
45 "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
46 "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
47 "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
48 "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
49 "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
50 "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
51 "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
52 "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
53 "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
54 "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
55 "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
56 "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
57 "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
58 "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
59 "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
60 "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
61 "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
67 /* Common abbreviations and other OK words not to query as typos. */
69 "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
70 "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
71 "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
72 "outbid", "outbids", "frostbite", "frostbitten", ""
75 /* Common abbreviations that cause otherwise unexplained periods. */
77 "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
78 "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
82 * Two-Letter combinations that rarely if ever start words,
83 * but are common scannos or otherwise common letter combinations.
86 "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
90 * Two-Letter combinations that rarely if ever end words,
91 * but are common scannos or otherwise common letter combinations.
94 "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
95 "sw", "gr", "sl", "cl", "iy", ""
99 "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
100 "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
101 "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
102 "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
106 "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
110 "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
111 "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
112 "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
113 "during", "let", "toward", "among", ""
117 "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
118 "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
119 "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
120 "among", "those", "into", "whom", "having", "thence", ""
123 /* special characters */
/* The numeric values below are ASCII code points; the glyph is shown beside each. */
124 #define CHAR_SPACE 32 /* ' ' */
128 #define CHAR_DQUOTE 34 /* '"' */
129 #define CHAR_SQUOTE 39 /* '\'' (apostrophe) */
130 #define CHAR_OPEN_SQUOTE 96 /* '`' (treated as an opening single quote by analyse_quotes) */
131 #define CHAR_TILDE 126 /* '~' */
132 #define CHAR_ASTERISK 42 /* '*' */
133 #define CHAR_FORESLASH 47 /* '/' */
134 #define CHAR_CARAT 94 /* '^' (caret) */
136 #define CHAR_UNDERSCORE '_'
137 #define CHAR_OPEN_CBRACK '{' /* C = curly bracket */
138 #define CHAR_CLOSE_CBRACK '}'
139 #define CHAR_OPEN_RBRACK '(' /* R = round bracket */
140 #define CHAR_CLOSE_RBRACK ')'
141 #define CHAR_OPEN_SBRACK '[' /* S = square bracket */
142 #define CHAR_CLOSE_SBRACK ']'
144 /* longest and shortest normal PG line lengths */
145 #define LONGEST_PG_LINE 75 /* lines with more characters than this are "long" */
146 #define WAY_TOO_LONG 80 /* lines with more characters than this are counted as "VERY long" */
147 #define SHORTEST_PG_LINE 55 /* threshold used by the short-line tests */
167 gboolean pswit[SWITNO]; /* program switches */
/*
 * Command-line option table passed to g_option_context_add_main_entries()
 * in parse_options().  Every entry is a plain flag (G_OPTION_ARG_NONE)
 * whose value is written straight into the matching pswit[] slot.
 *
 * Note that parse_options() inverts the PARANOID, LINE_END and ECHO slots
 * after parsing, so --relaxed, --line-end and --noecho each switch a
 * feature OFF rather than on.
 */
169 static GOptionEntry options[]={
170 { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
171 "Ignore DP-specific markup", NULL },
172 { "noecho", 'e', 0, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
173 "Don't echo queried line", NULL },
174 { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
175 "Check single quotes", NULL },
176 { "typo", 't', 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
177 "Check common typos", NULL },
178 { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
179 "Require closure of quotes on every paragraph", NULL },
180 { "relaxed", 'x', 0, G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
181 "Disable paranoid querying of everything", NULL },
182 { "line-end", 'l', 0, G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
183 "Disable line end checking", NULL },
184 { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
185 "Overview: just show counts", NULL },
186 { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
187 "Output errors to stdout instead of stderr", NULL },
188 { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
189 "Echo header fields", NULL },
190 { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
191 "Ignore markup in < >", NULL },
192 { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
193 "Use file of user-defined typos", NULL },
194 { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
195 "Defaults for use on www upload", NULL },
196 { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
197 "Verbose - list everything", NULL },
201 long cnt_dquot; /* for overview mode, count of doublequote queries */
202 long cnt_squot; /* for overview mode, count of singlequote queries */
203 long cnt_brack; /* for overview mode, count of brackets queries */
204 long cnt_bin; /* for overview mode, count of non-ASCII queries */
205 long cnt_odd; /* for overview mode, count of odd character queries */
206 long cnt_long; /* for overview mode, count of long line errors */
207 long cnt_short; /* for overview mode, count of short line queries */
208 long cnt_punct; /* for overview mode,
209 count of punctuation and spacing queries */
210 long cnt_dash; /* for overview mode, count of dash-related queries */
211 long cnt_word; /* for overview mode, count of word queries */
212 long cnt_html; /* for overview mode, count of html queries */
213 long cnt_lineend; /* for overview mode, count of line-end queries */
214 long cnt_spacend; /* count of lines with space at end */
215 long linecnt; /* count of total lines in the file */
216 long checked_linecnt; /* count of lines actually checked */
218 void proghelp(GOptionContext *context);
219 void procfile(const char *);
223 gboolean mixdigit(const char *);
224 gchar *getaword(const char **);
225 char *flgets(char **,long);
226 void postprocess_for_HTML(char *);
227 char *linehasmarkup(char *);
228 char *losemarkup(char *);
229 gboolean tagcomp(const char *,const char *);
230 void loseentities(char *);
231 gboolean isroman(const char *);
232 void postprocess_for_DP(char *);
234 GTree *qword,*qperiod;
236 struct first_pass_results {
237 long firstline,astline;
238 long footerline,totlen,binlen,alphalen,endquote_count,shortline,dotcomma;
239 long fslashline,hyphens,longline,verylongline,htmcount,standalone_digit;
240 long spacedash,emdash,space_emdash,non_PG_space_emdash,PG_space_emdash;
241 int Dutchcount,Frenchcount;
245 int shortline,longline,bin,dash,dotcomma,ast,fslash,digit,hyphen;
247 gboolean isDutch,isFrench;
252 int c_unders,c_brack,s_brack,r_brack;
253 int open_single_quote,close_single_quote;
256 struct line_properties {
257 unsigned int len,blen;
266 char *dquote,*squote,*rbrack,*sbrack,*cbrack,*unders;
/*
 * parse_options:
 *
 * Parse the command line with GOption using the options[] table, which
 * stores each flag directly in pswit[], then normalise the switches:
 * the PARANOID, LINE_END and ECHO slots are inverted (their flags turn
 * the feature off), overview mode suppresses echoing, and the WEB switch
 * replaces the switches with a fixed set of values.
 *
 * argc/argv are passed by address and handed on to
 * g_option_context_parse().  A parse failure is reported with
 * g_printerr() together with a hint to use --help.
 */
270 void parse_options(int *argc,char ***argv)
273 GOptionContext *context;
274 context=g_option_context_new(
275 "file - looks for errors in Project Gutenberg(TM) etexts");
276 g_option_context_add_main_entries(context,options,NULL);
277 if (!g_option_context_parse(context,argc,argv,&err))
279 g_printerr("Bookloupe: %s\n",err->message);
280 g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
283 /* Paranoid checking is turned OFF, not on, by its switch */
284 pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
285 if (pswit[PARANOID_SWITCH])
286 /* if running in paranoid mode, typo checks default to enabled */
287 pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH]; /* so in paranoid mode --typo disables the typo checks */
288 /* Line-end checking is turned OFF, not on, by its switch */
289 pswit[LINE_END_SWITCH]=!pswit[LINE_END_SWITCH];
290 /* Echoing is turned OFF, not on, by its switch */
291 pswit[ECHO_SWITCH]=!pswit[ECHO_SWITCH];
292 if (pswit[OVERVIEW_SWITCH])
293 /* just print summary; don't echo */
294 pswit[ECHO_SWITCH]=FALSE;
296 * Web uploads - for the moment, this is really just a placeholder
297 * until we decide what processing we really want to do on web uploads
299 if (pswit[WEB_SWITCH])
301 /* specific override for web uploads */
/* Each switch below is forced to a fixed value, whatever was given on the command line. */
302 pswit[ECHO_SWITCH]=TRUE;
303 pswit[SQUOTE_SWITCH]=FALSE;
304 pswit[TYPO_SWITCH]=TRUE;
305 pswit[QPARA_SWITCH]=FALSE;
306 pswit[PARANOID_SWITCH]=TRUE;
307 pswit[LINE_END_SWITCH]=FALSE;
308 pswit[OVERVIEW_SWITCH]=FALSE;
309 pswit[STDOUT_SWITCH]=FALSE;
310 pswit[HEADER_SWITCH]=TRUE;
311 pswit[VERBOSE_SWITCH]=FALSE;
312 pswit[MARKUP_SWITCH]=FALSE;
313 pswit[USERTYPO_SWITCH]=FALSE;
314 pswit[DP_SWITCH]=FALSE;
321 g_option_context_free(context);
327 * Read in the user-defined stealth scanno list.
/*
 * read_user_scannos:
 *
 * Load the user typo list into the usertypo tree.  Four locations are
 * tried in order, moving on to the next only when the previous attempt
 * failed with G_FILE_ERROR_NOENT:
 *   1. "bookloupe.typ" (relative path)
 *   2. "bookloupe.typ" in the running_from directory
 *   3. "gutcheck.typ"  (relative path)
 *   4. "gutcheck.typ"  in the running_from directory
 * If the last attempt also reports "no such file", a notice is printed.
 * Another error is reported on stderr together with the name of the file
 * that was being read.
 *
 * The file is converted from WINDOWS-1252 to UTF-8 and split on CR and LF.
 * Each line whose first byte is greater than '!' becomes a key in usertypo.
 */
329 void read_user_scannos(void)
332 gchar *usertypo_file;
336 gchar *contents,*utf8,**lines;
337 usertypo_file=g_strdup("bookloupe.typ");
338 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
339 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
342 g_free(usertypo_file);
343 usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
344 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
346 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
349 g_free(usertypo_file);
/* fall back to the file name used by gutcheck */
350 usertypo_file=g_strdup("gutcheck.typ");
351 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
353 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
356 g_free(usertypo_file);
357 usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
358 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
360 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
362 g_free(usertypo_file);
363 g_print(" --> I couldn't find bookloupe.typ "
364 "-- proceeding without user typos.\n");
369 fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
370 g_free(usertypo_file);
/* NOTE(review): the typo file is always decoded as WINDOWS-1252, and no GError is requested from g_convert(). */
374 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
376 lines=g_strsplit_set(utf8,"\r\n",0);
/* Keys are compared with strcmp() and released with g_free() when removed from the tree; values are not freed. */
378 usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
379 for (i=0;lines[i];i++)
/* '!' is 0x21: this skips empty lines and lines starting with a control character, a space or '!'. */
380 if (*(unsigned char *)lines[i]>'!')
/* The tree takes the split string itself as the key; the value 1 is a placeholder. */
381 g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
390 * Read an etext returning a newly allocated string containing the file
391 * contents or NULL on error.
/*
 * filename: path of the etext to load.
 * err:      passed to g_file_get_contents(), which reports read failures through it.
 */
393 gchar *read_etext(const char *filename,GError **err)
395 gchar *contents,*utf8;
397 if (!g_file_get_contents(filename,&contents,&len,err))
/* NOTE(review): the text is always decoded as WINDOWS-1252; no GError is requested for conversion failures. */
399 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
/*
 * Program entry point: records the directory of argv[0] in running_from,
 * parses the command line into pswit[], prints a banner on stderr and,
 * in overview mode, prints the per-category query counters and their total.
 */
404 int main(int argc,char **argv)
/* running_from is used by read_user_scannos() as a fallback directory for the typo file */
406 running_from=g_path_get_dirname(argv[0]);
407 parse_options(&argc,&argv);
408 if (pswit[USERTYPO_SWITCH])
410 fprintf(stderr,"bookloupe: Check and report on an e-text\n");
412 if (pswit[OVERVIEW_SWITCH])
/* "head+foot" is the number of lines that were read but not checked */
414 g_print(" Checked %ld lines of %ld (head+foot = %ld)\n\n",
415 checked_linecnt,linecnt,linecnt-checked_linecnt);
416 g_print(" --------------- Queries found --------------\n");
418 g_print(" Long lines: %14ld\n",cnt_long);
420 g_print(" Short lines: %14ld\n",cnt_short);
422 g_print(" Line-end problems: %14ld\n",cnt_lineend);
424 g_print(" Common typos: %14ld\n",cnt_word);
426 g_print(" Unmatched quotes: %14ld\n",cnt_dquot);
428 g_print(" Unmatched SingleQuotes: %14ld\n",cnt_squot);
430 g_print(" Unmatched brackets: %14ld\n",cnt_brack);
432 g_print(" Non-ASCII characters: %14ld\n",cnt_bin);
434 g_print(" Proofing characters: %14ld\n",cnt_odd);
436 g_print(" Punctuation & spacing queries: %14ld\n",cnt_punct);
438 g_print(" Non-standard dashes: %14ld\n",cnt_dash);
440 g_print(" Possible HTML tags: %14ld\n",cnt_html);
/* The total adds the twelve query counters; cnt_spacend is not part of it. */
442 g_print(" TOTAL QUERIES %14ld\n",
443 cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+
444 cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend);
446 g_free(running_from);
448 g_tree_unref(usertypo);
455 * Run a first pass - verify that it's a valid PG
456 * file, decide whether to report some things that
457 * occur many times in the text like long or short
458 * lines, non-standard dashes, etc.
460 struct first_pass_results *first_pass(const char *etext)
462 gunichar laststart=CHAR_SPACE;
467 unsigned int lastlen=0,lastblen=0;
468 long spline=0,nspline=0;
469 static struct first_pass_results results={0};
471 lines=g_strsplit(etext,"\n",0);
472 for (j=0;lines[j];j++)
474 lbytes=strlen(lines[j]);
475 while (lines[j][lbytes-1]=='\r')
476 lines[j][--lbytes]='\0';
477 llen=g_utf8_strlen(lines[j],lbytes);
479 if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
480 (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
483 g_print(" --> Duplicate header?\n");
484 spline=linecnt+1; /* first line of non-header text, that is */
486 if (!strncmp(lines[j],"*** START",9) &&
487 strstr(lines[j],"PROJECT GUTENBERG"))
490 g_print(" --> Duplicate header?\n");
491 nspline=linecnt+1; /* first line of non-header text, that is */
493 if (spline || nspline)
495 lc_line=g_utf8_strdown(lines[j],lbytes);
496 if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
498 if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
500 if (results.footerline)
502 /* it's an old-form header - we can detect duplicates */
504 g_print(" --> Duplicate footer?\n");
507 results.footerline=linecnt;
513 results.firstline=spline;
515 results.firstline=nspline; /* override with new */
516 if (results.footerline)
517 continue; /* don't count the boilerplate in the footer */
518 results.totlen+=llen;
519 for (s=lines[j];*s;s=g_utf8_next_char(s))
521 if (g_utf8_get_char(s)>127)
523 if (g_unichar_isalpha(g_utf8_get_char(s)))
525 if (s>lines[j] && g_utf8_get_char(s)==CHAR_DQUOTE &&
526 isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
527 results.endquote_count++;
529 if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
530 lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
533 g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
535 if (strstr(lines[j],".,"))
537 /* only count ast lines for ignoring purposes where there is */
538 /* locase text on the line */
539 if (strchr(lines[j],'*'))
541 for (s=lines[j];*s;s=g_utf8_next_char(s))
542 if (g_unichar_islower(g_utf8_get_char(s)))
547 if (strchr(lines[j],'/'))
548 results.fslashline++;
549 for (s=g_utf8_prev_char(lines[j]+lbytes);
550 s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
552 if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
553 g_utf8_get_char(g_utf8_prev_char(s))!='-')
555 if (llen>LONGEST_PG_LINE)
557 if (llen>WAY_TOO_LONG)
558 results.verylongline++;
559 if (strchr(lines[j],'<') && strchr(lines[j],'>'))
561 i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
564 if (strstr(lines[j],"<i>"))
565 results.htmcount+=4; /* bonus marks! */
567 /* Check for spaced em-dashes */
568 if (lines[j][0] && (s=strstr(g_utf8_next_char(lines[j]),"--")))
571 if (s[-1]==CHAR_SPACE || s[2]==CHAR_SPACE)
572 results.space_emdash++;
573 if (s[-1]==CHAR_SPACE && s[2]==CHAR_SPACE)
574 /* count of em-dashes with spaces both sides */
575 results.non_PG_space_emdash++;
576 if (s[-1]!=CHAR_SPACE && s[2]!=CHAR_SPACE)
577 /* count of PG-type em-dashes with no spaces */
578 results.PG_space_emdash++;
583 if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))
584 results.Dutchcount++;
585 if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))
586 results.Frenchcount++;
587 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
588 results.standalone_digit++;
591 /* Check for spaced dashes */
592 if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
596 laststart=lines[j][0];
605 * Make some snap decisions based on the first pass results.
/*
 * report_first_pass:
 *
 * results: statistics gathered by first_pass(); firstline may be reset
 *          to 0 here (see the end of the function).
 *
 * Turns the first-pass statistics into decisions kept in a
 * function-static struct warnings.  Whenever a kind of query occurs too
 * often in the file, a " --> ..." notice is printed with g_print().
 * May also switch pswit[MARKUP_SWITCH] on when the text looks like HTML.
 */
607 struct warnings *report_first_pass(struct first_pass_results *results)
609 static struct warnings warnings={0};
611 g_print(" --> %ld lines in this file have white space at end\n",
614 if (results->dotcomma>5)
617 g_print(" --> %ld lines in this file contain '.,'. "
618 "Not reporting them.\n",results->dotcomma);
621 * If more than 50 lines, or one-tenth, are short,
622 * don't bother reporting them.
624 warnings.shortline=1;
625 if (results->shortline>50 || results->shortline*10>linecnt)
627 warnings.shortline=0;
628 g_print(" --> %ld lines in this file are short. "
629 "Not reporting short lines.\n",results->shortline);
632 * If more than 50 lines, or one-tenth, are long,
633 * don't bother reporting them.
636 if (results->longline>50 || results->longline*10>linecnt)
639 g_print(" --> %ld lines in this file are long. "
640 "Not reporting long lines.\n",results->longline);
642 /* If more than 10 lines contain asterisks, don't bother reporting them. */
644 if (results->astline>10)
647 g_print(" --> %ld lines in this file contain asterisks. "
648 "Not reporting them.\n",results->astline);
651 * If more than 10 lines contain forward slashes,
652 * don't bother reporting them.
655 if (results->fslashline>10)
658 g_print(" --> %ld lines in this file contain forward slashes. "
659 "Not reporting them.\n",results->fslashline);
662 * If more than 20 lines contain unpunctuated endquotes,
663 * don't bother reporting them.
666 if (results->endquote_count>20)
669 g_print(" --> %ld lines in this file contain unpunctuated endquotes. "
670 "Not reporting them.\n",results->endquote_count);
673 * If more than 15 lines contain standalone digits,
674 * don't bother reporting them.
/* NOTE(review): the test below uses a threshold of 10, not the 15 quoted in the comment above. */
677 if (results->standalone_digit>10)
680 g_print(" --> %ld lines in this file contain standalone 0s and 1s. "
681 "Not reporting them.\n",results->standalone_digit);
684 * If more than 20 lines contain hyphens at end,
685 * don't bother reporting them.
688 if (results->hyphens>20)
691 g_print(" --> %ld lines in this file have hyphens at end. "
692 "Not reporting them.\n",results->hyphens);
/* An htmcount above 20 is taken as evidence of HTML, unless markup mode is already on. */
694 if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
696 g_print(" --> Looks like this is HTML. Switching HTML mode ON.\n");
697 pswit[MARKUP_SWITCH]=1;
699 if (results->verylongline>0)
700 g_print(" --> %ld lines in this file are VERY long!\n",
701 results->verylongline);
703 * If there are more non-PG spaced dashes than PG em-dashes,
704 * assume it's deliberate.
705 * Current PG guidelines say don't use them, but older texts do,
706 * and some people insist on them whatever the guidelines say.
709 if (results->spacedash+results->non_PG_space_emdash>
710 results->PG_space_emdash)
713 g_print(" --> There are %ld spaced dashes and em-dashes. "
714 "Not reporting them.\n",
715 results->spacedash+results->non_PG_space_emdash);
717 /* If more than a quarter of characters are hi-bit, bug out. */
719 if (results->binlen*4>results->totlen)
721 g_print(" --> This file does not appear to be ASCII. "
722 "Terminating. Best of luck with it!\n");
/* Likewise when fewer than a quarter of the characters are alphabetic. */
725 if (results->alphalen*4<results->totlen)
727 g_print(" --> This file does not appear to be text. "
728 "Terminating. Best of luck with it!\n");
/* More than 1% of the characters, or more than 100 in total, are above 127. */
731 if (results->binlen*100>results->totlen || results->binlen>100)
733 g_print(" --> There are a lot of foreign letters here. "
734 "Not reporting them.\n");
737 warnings.isDutch=FALSE;
/* Dutchcount counts occurrences of the words "hij" and "niet" (see first_pass) */
738 if (results->Dutchcount>50)
740 warnings.isDutch=TRUE;
741 g_print(" --> This looks like Dutch - "
742 "switching off dashes and warnings for 's Middags case.\n");
744 warnings.isFrench=FALSE;
/* Frenchcount counts occurrences of the words "dans" and "avec" (see first_pass) */
745 if (results->Frenchcount>50)
747 warnings.isFrench=TRUE;
748 g_print(" --> This looks like French - "
749 "switching off some doublepunct.\n");
751 if (results->firstline && results->footerline)
752 g_print(" The PG header and footer appear to be already on.\n");
755 if (results->firstline)
756 g_print(" The PG header is on - no footer.\n");
757 if (results->footerline)
758 g_print(" The PG footer is on - no header.\n");
761 if (pswit[VERBOSE_SWITCH])
764 warnings.shortline=1;
773 g_print(" *** Verbose output is ON -- you asked for it! ***\n");
775 if (warnings.isDutch)
/* Header and footer were both found, in order, but fewer than 100 lines apart. */
777 if (results->footerline>0 && results->firstline>0 &&
778 results->footerline>results->firstline &&
779 results->footerline-results->firstline<100)
781 g_print(" --> I don't really know where this text starts. \n");
782 g_print(" There are no reference points.\n");
783 g_print(" I'm going to have to report the header and footer "
/* forget the detected header position */
785 results->firstline=0;
793 * Look along the line, accumulate the count of quotes, and see
794 * if this is an empty line - i.e. a line with nothing on it
796 * If line has just spaces, period, * and/or - on it, don't
797 * count it, since empty lines with asterisks or dashes to
798 * separate sections are common.
800 * Returns: TRUE if the line is empty.
/*
 * aline:    the line to examine (UTF-8).
 * counters: running totals; this function increments open_single_quote,
 *           close_single_quote and c_unders, and tests for the curly,
 *           round and square bracket characters.
 *
 * For each ' or ` the neighbouring characters (sprev / snext) decide
 * whether it is an apostrophe inside a word, an opening quote, or a
 * possible closing quote; for the last case clues are accumulated in
 * guessquote.
 */
802 gboolean analyse_quotes(const char *aline,struct counters *counters)
805 /* assume the line is empty until proven otherwise */
806 gboolean isemptyline=TRUE;
807 const char *s=aline,*sprev,*snext;
812 snext=g_utf8_next_char(s);
813 c=g_utf8_get_char(s);
816 if (c==CHAR_SQUOTE || c==CHAR_OPEN_SQUOTE)
821 * At start of line, it can only be an openquote.
822 * Hardcode a very common exception!
/* 'tis and 'Tis start with an apostrophe, not an opening quote */
824 if (!g_str_has_prefix(snext,"tis") &&
825 !g_str_has_prefix(snext,"Tis"))
826 counters->open_single_quote++;
828 else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
829 g_unichar_isalpha(g_utf8_get_char(snext)))
830 /* Do nothing! it's definitely an apostrophe, not a quote */
832 /* it's outside a word - let's check it out */
/* a backtick, or any quote directly followed by a letter, is taken as an opening quote */
833 else if (c==CHAR_OPEN_SQUOTE ||
834 g_unichar_isalpha(g_utf8_get_char(snext)))
836 /* it damwell better BE an openquote */
837 if (!g_str_has_prefix(snext,"tis") &&
838 !g_str_has_prefix(snext,"Tis"))
839 /* hardcode a very common exception! */
840 counters->open_single_quote++;
844 /* now - is it a closequote? */
845 guessquote=0; /* accumulate clues */
846 if (g_unichar_isalpha(g_utf8_get_char(sprev)))
848 /* it follows a letter - could be either */
850 if (g_utf8_get_char(sprev)=='s')
852 /* looks like a plural apostrophe */
854 if (g_utf8_get_char(snext)==CHAR_SPACE)
859 /* it doesn't have a letter either side */
/*
 * NOTE(review): strchr() converts its int argument to char, so a code
 * point above 255 is truncated before the comparison, and a code point
 * of 0 (snext at the end of the line) matches the string terminator.
 */
860 else if (strchr(".?!,;:",g_utf8_get_char(sprev)) &&
861 strchr(".?!,;: ",g_utf8_get_char(snext)))
862 guessquote+=8; /* looks like a closequote */
865 if (counters->open_single_quote>counters->close_single_quote)
867 * Give it the benefit of some doubt,
868 * if a squote is already open.
874 counters->close_single_quote++;
877 if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
879 isemptyline=FALSE; /* ignore lines like * * * as spacers */
880 if (c==CHAR_UNDERSCORE)
881 counters->c_unders++;
/* curly brackets */
882 if (c==CHAR_OPEN_CBRACK)
884 if (c==CHAR_CLOSE_CBRACK)
/* round brackets */
886 if (c==CHAR_OPEN_RBRACK)
888 if (c==CHAR_CLOSE_RBRACK)
/* square brackets */
890 if (c==CHAR_OPEN_SBRACK)
892 if (c==CHAR_CLOSE_SBRACK)
901 * check_for_control_characters:
903 * Check for invalid or questionable characters in the line
904 * Anything above 127 is invalid for plain ASCII, and
905 * non-printable control characters should also be flagged.
906 * Tabs should generally not be there.
/*
 * aline: the line to examine (UTF-8).
 *
 * Walks the line one UTF-8 character at a time and queries any code point
 * below CHAR_SPACE other than CHAR_LF, CHAR_CR and CHAR_TAB.  The
 * offending line is echoed first when pswit[ECHO_SWITCH] is set; the
 * query text is suppressed when pswit[OVERVIEW_SWITCH] is set.
 */
908 void check_for_control_characters(const char *aline)
912 for (s=aline;*s;s=g_utf8_next_char(s))
914 c=g_utf8_get_char(s);
915 if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
917 if (pswit[ECHO_SWITCH])
918 g_print("\n%s\n",aline);
919 if (!pswit[OVERVIEW_SWITCH])
/*
 * g_utf8_pointer_to_offset() takes (str,pos): the start of the line comes
 * first, then the position within it.  With the arguments the other way
 * round the offset is negative and the reported column is wrong.
 */
920 g_print(" Line %ld column %ld - Control character %u\n",
921 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
929 * check_for_odd_characters:
931 * Check for binary and other odd characters.
/*
 * aline:       the line to examine (UTF-8).
 * warnings:    first-pass decisions; the fslash and ast members gate the
 *              forward-slash and asterisk queries.
 * isemptyline: when TRUE the asterisk query is skipped.
 *
 * Queries characters outside plain ASCII, tabs, tildes, carets, forward
 * slashes and asterisks.  Each query echoes the line first when
 * pswit[ECHO_SWITCH] is set and is suppressed when pswit[OVERVIEW_SWITCH]
 * is set.  Columns are 1-based character (not byte) offsets.
 */
933 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
934 gboolean isemptyline)
936 /* Don't repeat multiple warnings on one line. */
/* One flag per kind of query; each test below is skipped once its flag is set. */
937 gboolean eNon_A=FALSE,eTab=FALSE,eTilde=FALSE;
938 gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
941 for (s=aline;*s;s=g_utf8_next_char(s))
943 c=g_utf8_get_char(s);
/* control characters other than tab and newline, or anything above 127 */
944 if (!eNon_A && (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
946 if (pswit[ECHO_SWITCH])
947 g_print("\n%s\n",aline);
948 if (!pswit[OVERVIEW_SWITCH])
/* 128..159 and everything above 255 get the "Non-ISO-8859" wording */
949 if (c>127 && c<160 || c>255)
950 g_print(" Line %ld column %ld - "
951 "Non-ISO-8859 character %u\n",
952 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
954 g_print(" Line %ld column %ld - "
955 "Non-ASCII character %u\n",
956 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
961 if (!eTab && c==CHAR_TAB)
963 if (pswit[ECHO_SWITCH])
964 g_print("\n%s\n",aline);
965 if (!pswit[OVERVIEW_SWITCH])
966 g_print(" Line %ld column %ld - Tab character?\n",
967 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
972 if (!eTilde && c==CHAR_TILDE)
975 * Often used by OCR software to indicate an
976 * unrecognizable character.
978 if (pswit[ECHO_SWITCH])
979 g_print("\n%s\n",aline);
980 if (!pswit[OVERVIEW_SWITCH])
981 g_print(" Line %ld column %ld - Tilde character?\n",
982 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
987 if (!eCarat && c==CHAR_CARAT)
989 if (pswit[ECHO_SWITCH])
990 g_print("\n%s\n",aline);
991 if (!pswit[OVERVIEW_SWITCH])
992 g_print(" Line %ld column %ld - Carat character?\n",
993 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* forward slashes are queried only while warnings->fslash is non-zero */
998 if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
1000 if (pswit[ECHO_SWITCH])
1001 g_print("\n%s\n",aline);
1002 if (!pswit[OVERVIEW_SWITCH])
1003 g_print(" Line %ld column %ld - Forward slash?\n",
1004 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1010 * Report asterisks only in paranoid mode,
1011 * since they're often deliberate.
1013 if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
1016 if (pswit[ECHO_SWITCH])
1017 g_print("\n%s\n",aline);
1018 if (!pswit[OVERVIEW_SWITCH])
1019 g_print(" Line %ld column %ld - Asterisk?\n",
1020 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1029 * check_for_long_line:
1031 * Check for line too long.
/*
 * aline: the line to examine (UTF-8).
 *
 * Queries a line holding more than LONGEST_PG_LINE characters (characters,
 * not bytes).  The line is echoed first when pswit[ECHO_SWITCH] is set;
 * the query text is suppressed when pswit[OVERVIEW_SWITCH] is set.
 */
1033 void check_for_long_line(const char *aline)
1035 if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
1037 if (pswit[ECHO_SWITCH])
1038 g_print("\n%s\n",aline);
1039 if (!pswit[OVERVIEW_SWITCH])
/* the reported column and the reported length are the same value: the character count of the line */
1040 g_print(" Line %ld column %ld - Long line %ld\n",
1041 linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
1048 * check_for_short_line:
1050 * Check for line too short.
1052 * This one is a bit trickier to implement: we don't want to
1053 * flag the last line of a paragraph for being short, so we
1054 * have to wait until we know that our current line is a
1055 * "normal" line, then report the _previous_ line if it was too
1056 * short. We also don't want to report indented lines like
1057 * chapter heads or formatted quotations. We therefore keep
1058 * last->len as the length of the last line examined, and
1059 * last->blen as the length of the last but one, and try to
1060 * suppress unnecessary warnings by checking that both were of
1061 * "normal" length. We keep the first character of the last
1062 * line in last->start, and if it was a space, we assume that
1063 * the formatting is deliberate. I can't figure out a way to
1064 * distinguish something like a quoted verse left-aligned or
1065 * the header or footer of a letter from a paragraph of short
1066 * lines - maybe if I examined the whole paragraph, and if the
1067 * para has less than, say, 8 lines and if all lines are short,
1068 * then just assume it's OK? Need to look at some texts to see
1069 * how often a formula like this would get the right result.
/*
 * aline: the current line (UTF-8); only its length is used.
 * last:  properties of the preceding lines (len, blen and start are read).
 *
 * The query is about the PREVIOUS line: prevline is what gets echoed and
 * measured, and the reported line number is linecnt-1.
 */
1071 void check_for_short_line(const char *aline,const struct line_properties *last)
/*
 * Fires when the current line has more than one character, last->len is
 * between 2 and SHORTEST_PG_LINE-1, last->blen exceeds SHORTEST_PG_LINE,
 * and last->start is not a space.
 */
1073 if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
1074 last->len<SHORTEST_PG_LINE && last->blen>1 &&
1075 last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
1077 if (pswit[ECHO_SWITCH])
1078 g_print("\n%s\n",prevline);
1079 if (!pswit[OVERVIEW_SWITCH])
/* the reported column and the reported length are both the character count of prevline */
1080 g_print(" Line %ld column %ld - Short line %ld?\n",
1081 linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
1088 * check_for_starting_punctuation:
1090 * Look for punctuation other than full ellipses at start of line.
/*
 * aline: the line to examine (UTF-8).
 *
 * Queries a non-empty line whose first character is one of . ? ! , ; :
 * unless the line begins with the spaced ellipsis ". . .".
 */
1092 void check_for_starting_punctuation(const char *aline)
1094 if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
1095 !g_str_has_prefix(aline,". . ."))
1097 if (pswit[ECHO_SWITCH])
1098 g_print("\n%s\n",aline);
1099 if (!pswit[OVERVIEW_SWITCH])
/* the column is always 1 because only the first character is tested */
1100 g_print(" Line %ld column 1 - Begins with punctuation?\n",
1108 * check_for_spaced_emdash:
1110 * Check for spaced em-dashes.
1112 * We must check _all_ occurrences of "--" on the line
1113 * hence the loop - even if the first double-dash is OK
1114 * there may be another that's wrong later on.
/*
 * aline: the line to examine (UTF-8).
 *
 * Visits every "--" on the line and queries those that have a space
 * directly before or directly after them.
 */
1116 void check_for_spaced_emdash(const char *aline)
1118 const char *s,*t,*next;
/* t is the current "--"; the next search resumes at next, just past it */
1119 for (s=aline;t=strstr(s,"--");s=next)
1121 next=g_utf8_next_char(g_utf8_next_char(t));
/* the t>aline test keeps the look-behind inside the line when "--" starts it */
1122 if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
1123 g_utf8_get_char(next)==CHAR_SPACE)
1125 if (pswit[ECHO_SWITCH])
1126 g_print("\n%s\n",aline);
1127 if (!pswit[OVERVIEW_SWITCH])
1128 g_print(" Line %ld column %ld - Spaced em-dash?\n",
1129 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1137 * check_for_spaced_dash:
1139 * Check for spaced dashes.
/*
 * aline: the line to examine (UTF-8).
 *
 * Queries a single dash with a space before it (" -" not followed by
 * another '-') or after it ("- " not preceded by another '-').
 * Only the first occurrence of each pattern is examined.
 */
1141 void check_for_spaced_dash(const char *aline)
1144 if ((s=strstr(aline," -")))
/* " --" is the start of an em-dash, not a spaced dash */
1146 if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
1148 if (pswit[ECHO_SWITCH])
1149 g_print("\n%s\n",aline);
1150 if (!pswit[OVERVIEW_SWITCH])
1151 g_print(" Line %ld column %ld - Spaced dash?\n",
1152 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1157 else if ((s=strstr(aline,"- ")))
/* "-- " is the end of an em-dash; s==aline avoids looking before the start of the line */
1159 if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
1161 if (pswit[ECHO_SWITCH])
1162 g_print("\n%s\n",aline);
1163 if (!pswit[OVERVIEW_SWITCH])
1164 g_print(" Line %ld column %ld - Spaced dash?\n",
1165 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1173 * check_for_unmarked_paragraphs:
1175 * Check for unmarked paragraphs indicated by separate speakers.
1177 * May well be false positive:
1178 * "Bravo!" "Wonderful!" called the crowd.
1179 * but useful all the same.
1181 void check_for_unmarked_paragraphs(const char *aline)
/*
 * Looks for a double quote, a gap and another double quote on one line
 * and queries a missing paragraph break at that position.
 * aline: the line under test. Nothing is returned; findings go to g_print.
 */
1184 s=strstr(aline,"\" \"");
/*
 * NOTE(review): as shown here the second lookup uses the same pattern text
 * as the first. Presumably the two differ in the width of the gap in the
 * repository copy, and the guard between them is not visible - confirm.
 */
1186 s=strstr(aline,"\" \"");
1189 if (pswit[ECHO_SWITCH])
1190 g_print("\n%s\n",aline);
1191 if (!pswit[OVERVIEW_SWITCH])
/* Reported column: character offset of the first quote, counted from 1. */
1192 g_print(" Line %ld column %ld - "
1193 "Query missing paragraph break?\n",
1194 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1201 * check_for_jeebies:
1203 * Check for "to he" and other easy h/b errors.
1205 * This is a very inadequate effort on the h/b problem,
1206 * but the phrase "to he" is always an error, whereas "to
1207 * be" is quite common.
1208 * Similarly, '"Quiet!", be said.' is a non-be error
1209 * "to he" is _not_ always an error!:
1210 * "Where they went to he couldn't say."
1211 * Another false positive:
1212 * What would "Cinderella" be without the . . .
1213 * and another: "If he wants to he can see for himself."
1215 void check_for_jeebies(const char *aline)
/*
 * Searches aline for fixed phrases that usually come from an h/b mix-up
 * and prints one of three queries: he/be, had/bad or hut/but.
 * All patterns are matched with strstr, so they are case-sensitive and
 * depend on the surrounding spaces or punctuation being present.
 * NOTE(review): the statements between successive lookups are not visible
 * in this listing; presumably each later lookup runs only when the earlier
 * ones found nothing - confirm.
 */
/* Group 1: phrases reported as a possible he/be confusion. */
1218 s=strstr(aline," be could ");
1220 s=strstr(aline," be would ");
1222 s=strstr(aline," was be ");
1224 s=strstr(aline," be is ");
1226 s=strstr(aline," is be ");
1228 s=strstr(aline,"\", be ");
/*
 * NOTE(review): the next two patterns read the same in this listing;
 * presumably they differ in spacing in the repository copy - confirm.
 */
1230 s=strstr(aline,"\" be ");
1232 s=strstr(aline,"\" be ");
1234 s=strstr(aline," to he ");
1237 if (pswit[ECHO_SWITCH])
1238 g_print("\n%s\n",aline);
1239 if (!pswit[OVERVIEW_SWITCH])
1240 g_print(" Line %ld column %ld - Query he/be error?\n",
1241 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Group 2: phrases reported as a possible had/bad confusion. */
1245 s=strstr(aline," the had ");
1247 s=strstr(aline," a had ");
1249 s=strstr(aline," they bad ");
1251 s=strstr(aline," she bad ");
1253 s=strstr(aline," he bad ");
1255 s=strstr(aline," you bad ");
1257 s=strstr(aline," i bad ");
1260 if (pswit[ECHO_SWITCH])
1261 g_print("\n%s\n",aline);
1262 if (!pswit[OVERVIEW_SWITCH])
1263 g_print(" Line %ld column %ld - Query had/bad error?\n",
1264 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Group 3: hut after a semicolon or comma, reported as hut/but. */
1268 s=strstr(aline,"; hut ");
1270 s=strstr(aline,", hut ");
1273 if (pswit[ECHO_SWITCH])
1274 g_print("\n%s\n",aline);
1275 if (!pswit[OVERVIEW_SWITCH])
1276 g_print(" Line %ld column %ld - Query hut/but error?\n",
1277 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1284 * check_for_mta_from:
1286 * Special case - angled bracket in front of "From" placed there by an
1287 * MTA when sending an e-mail.
1289 void check_for_mta_from(const char *aline)
/*
 * Queries the first occurrence of a right angle bracket directly followed
 * by the word From anywhere on the line.
 * aline: the line under test. Nothing is returned; findings go to g_print.
 */
1292 s=strstr(aline,">From");
1295 if (pswit[ECHO_SWITCH])
1296 g_print("\n%s\n",aline);
1297 if (!pswit[OVERVIEW_SWITCH])
/* Reported column: character offset of the bracket, counted from 1. */
1298 g_print(" Line %ld column %ld - "
1299 "Query angled bracket with From\n",
1300 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1307 * check_for_orphan_character:
1309 * Check for a single character line -
1310 * often an overflow from bad wrapping.
1312 void check_for_orphan_character(const char *aline)
/*
 * Queries a line that consists of exactly one character, except when that
 * character is I, V, X, L or a digit.
 * aline: the line under test. Nothing is returned; findings go to g_print.
 */
1315 c=g_utf8_get_char(aline);
/* True when there is a first character and the string ends right after it. */
1316 if (c && !*g_utf8_next_char(aline))
1318 if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
1319 ; /* Nothing - ignore numerals alone on a line. */
1322 if (pswit[ECHO_SWITCH])
1323 g_print("\n%s\n",aline);
1324 if (!pswit[OVERVIEW_SWITCH])
1325 g_print(" Line %ld column 1 - Query single character line\n",
1334 * check_for_pling_scanno:
1336 * Check for I" - often should be !
1338 void check_for_pling_scanno(const char *aline)
/*
 * Queries the first occurrence of a space, a capital I and a double quote,
 * which is often a misread exclamation mark before a closing quote.
 * aline: the line under test. Nothing is returned; findings go to g_print.
 */
1341 s=strstr(aline," I\"");
1344 if (pswit[ECHO_SWITCH])
1345 g_print("\n%s\n",aline);
1346 if (!pswit[OVERVIEW_SWITCH])
/*
 * NOTE(review): the column passed here is the raw character offset of the
 * match start, without the +1 that most sibling checks add - confirm that
 * this is intended.
 */
1347 g_print(" Line %ld column %ld - Query I=exclamation mark?\n",
1348 linecnt,g_utf8_pointer_to_offset(aline,s));
1355 * check_for_extra_period:
1357 * Check for period without a capital letter. Cut-down from gutspell.
1358 * Only works when it happens on a single line.
1360 void check_for_extra_period(const char *aline,const struct warnings *warnings)
/*
 * Looks at each period-plus-space on the line and queries it when the text
 * after it continues in lower case and the word before it does not pass
 * the exclusion tests below.
 * aline:    the line under test.
 * warnings: only the isDutch flag is read in the visible part of the body.
 * The whole check is enclosed by the PARANOID_SWITCH test.
 * NOTE(review): several statements (flag updates, continue or break lines,
 * frees) are not visible in this listing; the comments below describe only
 * what the visible lines do.
 */
1362 const char *s,*t,*s1;
1367 gunichar *decomposition;
1368 if (pswit[PARANOID_SWITCH])
/* t walks over the occurrences of a period followed by a space. */
1370 for (t=aline;t=strstr(t,". ");)
1374 t=g_utf8_next_char(t);
1375 /* start of line punctuation is handled elsewhere */
/* Tests whether the character before the period is a letter. */
1378 if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
1380 t=g_utf8_next_char(t);
1383 if (warnings->isDutch)
1385 /* For Frank & Jeroen -- 's Middags case */
1386 gunichar c2,c3,c4,c5;
/* c2..c5 are the characters 2 to 5 positions after t. */
1387 c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
1388 c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
1389 c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
1390 c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
/* Pattern: apostrophe, lower-case letter, space, upper-case letter. */
1391 if (c2==CHAR_SQUOTE && g_unichar_islower(c3) &&
1392 c4==CHAR_SPACE && g_unichar_isupper(c5))
1394 t=g_utf8_next_char(t);
/* Starting two characters after t, skip ahead to the next letter or digit. */
1398 s1=g_utf8_next_char(g_utf8_next_char(t));
1399 while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
1400 !isdigit(g_utf8_get_char(s1)))
1401 s1=g_utf8_next_char(s1);
1402 if (g_unichar_islower(g_utf8_get_char(s1)))
1404 /* we have something to investigate */
1406 /* so let's go back and find out */
/*
 * Walk backwards from the character before t while it is a letter, a
 * digit, or an apostrophe that has a letter on both sides.
 */
1407 for (s1=g_utf8_prev_char(t);s1>=aline &&
1408 (g_unichar_isalpha(g_utf8_get_char(s1)) ||
1409 g_unichar_isdigit(g_utf8_get_char(s1)) ||
1410 g_utf8_get_char(s1)==CHAR_SQUOTE &&
1411 g_unichar_isalpha(g_utf8_get_char(g_utf8_next_char(s1))) &&
1412 g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s1))));
1413 s1=g_utf8_prev_char(s1))
/* The loop stops one character too early, so step forward once. */
1415 s1=g_utf8_next_char(s1);
/*
 * testword is a heap copy of the text starting at s1; one form is cut at
 * s and the other runs to the end of the line. The test choosing between
 * them is not visible here.
 */
1418 testword=g_strndup(s1,s-s1);
1420 testword=g_strdup(s1);
/*
 * Exclusion tests on testword: membership in abbrev[] (a list ended by an
 * empty string), a leading digit, a single character, a Roman numeral.
 * Their effects are on lines not visible in this listing.
 */
1421 for (i=0;*abbrev[i];i++)
1422 if (!strcmp(testword,abbrev[i]))
1424 if (g_unichar_isdigit(g_utf8_get_char(testword)))
1426 if (!*g_utf8_next_char(testword))
1428 if (isroman(testword))
/*
 * Vowel scan: each character is canonically decomposed and the first code
 * point of the result is compared with a, e, i, o, u.
 */
1433 for (s=testword;*s;s=g_utf8_next_char(s))
1435 decomposition=g_unicode_canonical_decomposition(
1436 g_utf8_get_char(s),&len);
1437 if (g_utf8_strchr("aeiou",-1,decomposition[0]))
1439 g_free(decomposition);
/*
 * A word already stored in the qperiod tree is reported again only when
 * VERBOSE_SWITCH is set; the word is stored in the tree on report.
 */
1443 (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
1445 g_tree_insert(qperiod,g_strdup(testword),
1446 GINT_TO_POINTER(1));
1447 if (pswit[ECHO_SWITCH])
1448 g_print("\n%s\n",aline);
1449 if (!pswit[OVERVIEW_SWITCH])
1450 g_print(" Line %ld column %ld - Extra period?\n",
1451 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1457 t=g_utf8_next_char(t);
1463 * check_for_following_punctuation:
1465 * Check for words usually not followed by punctuation.
1467 void check_for_following_punctuation(const char *aline)
/*
 * Queries punctuation that follows a word from one of two lists:
 * nocomma[] (queried before a comma, semicolon or colon) and noperiod[]
 * (queried before a period or exclamation mark).
 * The body is enclosed by the TYPO_SWITCH test.
 * NOTE(review): the lines that split the line into words and set s, t and
 * wordstart are not visible in this listing.
 */
1470 const char *s,*wordstart;
1473 if (pswit[TYPO_SWITCH])
/* inword is a lower-cased copy of the text at t. */
1484 inword=g_utf8_strdown(t,-1);
/* Both lists are walked until an entry that is an empty string. */
1486 for (i=0;*nocomma[i];i++)
1487 if (!strcmp(inword,nocomma[i]))
/* c is the character at s; presumably the one right after the word - confirm. */
1489 c=g_utf8_get_char(s);
1490 if (c==',' || c==';' || c==':')
1492 if (pswit[ECHO_SWITCH])
1493 g_print("\n%s\n",aline);
1494 if (!pswit[OVERVIEW_SWITCH])
1495 g_print(" Line %ld column %ld - "
1496 "Query punctuation after %s?\n",
1497 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
1503 for (i=0;*noperiod[i];i++)
1504 if (!strcmp(inword,noperiod[i]))
1506 c=g_utf8_get_char(s);
1507 if (c=='.' || c=='!')
1509 if (pswit[ECHO_SWITCH])
1510 g_print("\n%s\n",aline);
1511 if (!pswit[OVERVIEW_SWITCH])
1512 g_print(" Line %ld column %ld - "
1513 "Query punctuation after %s?\n",
1514 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
1528 * Check for commonly mistyped words,
1529 * and digits like 0 for O in a word.
1531 void check_for_typos(const char *aline,struct warnings *warnings)
/*
 * Takes the words of aline one at a time (getaword) and queries:
 *   - words that mixdigit() accepts (message: Query digit in ...),
 *   - words that look like typos or scannos, when TYPO_SWITCH or
 *     USERTYPO_SWITCH is set,
 *   - a standalone 0 or 1, when PARANOID_SWITCH and warnings->digit are set.
 * aline:    the line under test.
 * warnings: only the digit flag is read in the visible part of the body.
 * NOTE(review): many short statements (flag assignments, breaks, frees,
 * the enclosing word loop) are not visible in this listing; the comments
 * below describe only what the visible lines do.
 */
1533 const char *s,*t,*nt,*wordstart;
1535 gunichar *decomposition;
1537 int i,vowel,consonant,*dupcnt;
1538 gboolean isdup,istypo,alower;
1541 gsize decomposition_len;
/* getaword receives the address of s, the running position in the line. */
1545 inword=getaword(&s);
1549 continue; /* don't bother with empty lines */
1551 if (mixdigit(inword))
1553 if (pswit[ECHO_SWITCH])
1554 g_print("\n%s\n",aline);
1555 if (!pswit[OVERVIEW_SWITCH])
1556 g_print(" Line %ld column %ld - Query digit in %s\n",
1557 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
1562 * Put the word through a series of tests for likely typos and OCR
1565 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
/* Character scan of the word: c is the current character, nt the next position. */
1569 for (t=inword;*t;t=g_utf8_next_char(t))
1571 c=g_utf8_get_char(t);
1572 nt=g_utf8_next_char(t);
1573 /* lowercase for testing */
1574 if (g_unichar_islower(c))
/* An upper-case or title-case character seen while alower is set. */
1576 if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
1579 * We have an uppercase mid-word. However, there are
1581 * Mac and Mc like McGill
1582 * French contractions like l'Abbe
/* offset is the character index of t within inword. */
1584 offset=g_utf8_pointer_to_offset(inword,t);
1585 if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
1586 offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
1587 g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
1589 g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SQUOTE)
/* The remaining tests run on a case-folded copy of the word. */
1595 testword=g_utf8_casefold(inword,-1);
1597 if (pswit[TYPO_SWITCH])
1600 * Check for certain unlikely two-letter combinations at word
1603 len=g_utf8_strlen(testword,-1);
/* nostart[] holds prefixes and noend[] suffixes; both end with an empty string. */
1606 for (i=0;*nostart[i];i++)
1607 if (g_str_has_prefix(testword,nostart[i]))
1609 for (i=0;*noend[i];i++)
1610 if (g_str_has_suffix(testword,noend[i]))
1613 /* ght is common, gbt never. Like that. */
1614 if (strstr(testword,"cb"))
1616 if (strstr(testword,"gbt"))
1618 if (strstr(testword,"pbt"))
1620 if (strstr(testword,"tbs"))
1622 if (strstr(testword,"mrn"))
1624 if (strstr(testword,"ahle"))
1626 if (strstr(testword,"ihle"))
1629 * "TBE" does happen - like HEARTBEAT - but uncommon.
1630 * Also "TBI" - frostbite, outbid - but uncommon.
1631 * Similarly "ii" like Hawaii, or Pompeii, and in Roman
1632 * numerals, but "ii" is a common scanno.
1634 if (strstr(testword,"tbi"))
1636 if (strstr(testword,"tbe"))
1638 if (strstr(testword,"ii"))
1641 * Check for no vowels or no consonants.
1642 * If none, flag a typo.
1644 if (!istypo && len>1)
/*
 * Each character is canonically decomposed; the first code point of the
 * result is what gets compared with a, e, i, o, u.
 */
1647 for (t=testword;*t;t=g_utf8_next_char(t))
1649 c=g_utf8_get_char(t);
1651 g_unicode_canonical_decomposition(c,&decomposition_len);
1652 if (c=='y' || g_unichar_isdigit(c))
1654 /* Yah, this is loose. */
1658 else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
1662 g_free(decomposition);
1664 if (!vowel || !consonant)
1668 * Now exclude the word from being reported if it's in
/* okword[] and typo[] are string lists ended by an empty string. */
1671 for (i=0;*okword[i];i++)
1672 if (!strcmp(testword,okword[i]))
1675 * What looks like a typo may be a Roman numeral.
1678 if (istypo && isroman(testword))
1680 /* Check the manual list of typos. */
1682 for (i=0;*typo[i];i++)
1683 if (!strcmp(testword,typo[i]))
1686 * Check lowercase s, l, i and m - special cases.
1687 * "j" - often a semi-colon gone wrong.
1688 * "d" for a missing apostrophe - he d
/* The single-letter test reads inword, the word before case folding. */
1691 if (!istypo && len==1 &&
1692 g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
/*
 * qword maps a case-folded word to a heap-allocated int. A word not yet
 * in the tree gets a zeroed counter inserted under a copy of the word.
 */
1696 dupcnt=g_tree_lookup(qword,testword);
1700 isdup=!pswit[VERBOSE_SWITCH];
1704 dupcnt=g_new0(int,1);
1705 g_tree_insert(qword,g_strdup(testword),dupcnt);
1710 if (pswit[ECHO_SWITCH])
1711 g_print("\n%s\n",aline);
1712 if (!pswit[OVERVIEW_SWITCH])
1714 g_print(" Line %ld column %ld - Query word %s",
1715 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
1717 if (!pswit[VERBOSE_SWITCH])
1718 g_print(" - not reporting duplicates");
1726 /* check the user's list of typos */
1727 if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
1729 if (pswit[ECHO_SWITCH])
1730 g_print("\n%s\n",aline);
1731 if (!pswit[OVERVIEW_SWITCH])
/*
 * NOTE(review): this message and the standalone one below add 2 to the
 * word offset, while the other messages in this function add 1 - confirm
 * that this is intended.
 */
1732 g_print(" Line %ld column %ld - Query possible scanno %s\n",
1733 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
1735 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
1737 if (pswit[PARANOID_SWITCH] && warnings->digit)
1739 /* In paranoid mode, query all 0 and 1 standing alone. */
1740 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
1742 if (pswit[ECHO_SWITCH])
1743 g_print("\n%s\n",aline);
1744 if (!pswit[OVERVIEW_SWITCH])
1745 g_print(" Line %ld column %ld - Query standalone %s\n",
1746 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
1757 * check_for_misspaced_punctuation:
1759 * Look for added or missing spaces around punctuation and quotes.
1760 * If there is a punctuation character like ! with no space on
1761 * either side, suspect a missing!space. If there are spaces on
1762 * both sides , assume a typo. If we see a double quote with no
1763 * space or punctuation on either side of it, assume unspaced
1764 * quotes "like"this.
1766 void check_for_misspaced_punctuation(const char *aline,
1767 struct parities *parities,gboolean isemptyline)
/*
 * Runs several independent scans over aline, each printing its own query:
 *   1. punctuation with no space after it, or with spaces on both sides;
 *   2. ? ! , ; : preceded by a space;
 *   3. a period preceded by a space and followed by a letter;
 *   4. unspaced double quotes;
 *   5. double-quote parity, using and updating parities->dquote;
 *   6. a double quote at the start of the line followed by punctuation or space;
 *   7. with SQUOTE_SWITCH, single-quote parity using parities->squote.
 * aline:       the line under test.
 * parities:    quote open/closed state; the dquote and squote members are
 *              toggled here.
 * isemptyline: when TRUE the spaced-punctuation queries of scans 1 and 2
 *              are suppressed.
 * NOTE(review): many short statements (pc=c, c=nc, flag assignments, some
 * conditions, braces) are not visible in this listing; the comments below
 * describe only what the visible lines do.
 */
1769 gboolean isacro,isellipsis;
1771 gunichar c,nc,pc,n2c;
/*
 * Scan set-up used repeatedly below: c starts as the first character, nc
 * as the second (0 for an empty line) and s at the second character. The
 * loop runs while nc is non-zero, and nc is reloaded from the character
 * after s on each pass.
 */
1772 c=g_utf8_get_char(aline);
1773 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1774 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1778 nc=g_utf8_get_char(g_utf8_next_char(s));
1779 /* For each character in the line after the first. */
1780 if (g_utf8_strchr(".?!,;:_",-1,c)) /* if it's punctuation */
1782 /* we need to suppress warnings for acronyms like M.D. */
1784 /* we need to suppress warnings for ellipsis . . . */
1787 * If there are letters on both sides of it or
1788 * if it's strict punctuation followed by an alpha.
1790 if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
1791 g_utf8_strchr("?!,;:",-1,c)))
/* Looks two characters back from s for a period, once s is past offset 2. */
1795 if (g_utf8_pointer_to_offset(aline,s)>2 &&
1796 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
/* n2c is the character two positions after s. */
1798 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
1804 if (pswit[ECHO_SWITCH])
1805 g_print("\n%s\n",aline);
1806 if (!pswit[OVERVIEW_SWITCH])
1807 g_print(" Line %ld column %ld - Missing space?\n",
1808 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1813 if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
1816 * If there are spaces on both sides,
1817 * or space before and end of line.
1821 if (g_utf8_pointer_to_offset(aline,s)>2 &&
1822 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
1824 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
1828 if (!isemptyline && !isellipsis)
1830 if (pswit[ECHO_SWITCH])
1831 g_print("\n%s\n",aline);
1832 if (!pswit[OVERVIEW_SWITCH])
1833 g_print(" Line %ld column %ld - "
1834 "Spaced punctuation?\n",linecnt,
1835 g_utf8_pointer_to_offset(aline,s)+1);
1842 /* Split out the characters that CANNOT be preceded by space. */
1843 c=g_utf8_get_char(aline);
1844 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1845 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1849 nc=g_utf8_get_char(g_utf8_next_char(s));
1850 /* for each character in the line after the first */
1851 if (g_utf8_strchr("?!,;:",-1,c))
1853 /* if it's punctuation that _cannot_ have a space before it */
1854 if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
1857 * If nc DOES == space,
1858 * it was already reported just above.
1860 if (pswit[ECHO_SWITCH])
1861 g_print("\n%s\n",aline);
1862 if (!pswit[OVERVIEW_SWITCH])
1863 g_print(" Line %ld column %ld - Spaced punctuation?\n",
1864 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1871 * Special case " .X" where X is any alpha.
1872 * This plugs a hole in the acronym code above.
1873 * Inelegant, but maintainable.
1875 c=g_utf8_get_char(aline);
1876 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1877 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1881 nc=g_utf8_get_char(g_utf8_next_char(s));
1882 /* for each character in the line after the first */
1885 /* if it's a period */
1886 if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
1889 * If the period follows a space and
1890 * is followed by a letter.
1892 if (pswit[ECHO_SWITCH])
1893 g_print("\n%s\n",aline);
1894 if (!pswit[OVERVIEW_SWITCH])
1895 g_print(" Line %ld column %ld - Spaced punctuation?\n",
1896 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1902 c=g_utf8_get_char(aline);
1903 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1904 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1908 nc=g_utf8_get_char(g_utf8_next_char(s));
1909 /* for each character in the line after the first */
/*
 * Unspaced-quote test. Either neither neighbour is in the separator set
 * and a next character exists, or the previous character is not an
 * opening-type separator and the next character is a letter.
 */
1912 if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
1913 !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
1914 !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
1916 if (pswit[ECHO_SWITCH])
1917 g_print("\n%s\n",aline);
1918 if (!pswit[OVERVIEW_SWITCH])
1919 g_print(" Line %ld column %ld - Unspaced quotes?\n",
1920 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1926 /* Check parity of quotes. */
/* This scan starts at the first character rather than the second. */
1927 nc=g_utf8_get_char(aline);
1928 for (s=aline;*s;s=g_utf8_next_char(s))
1931 nc=g_utf8_get_char(g_utf8_next_char(s));
/* The dquote flag is flipped, then the branch is chosen on its new value. */
1934 parities->dquote=!parities->dquote;
1935 if (!parities->dquote)
/* Flag now clear: the next character is checked against this set. */
1938 if (!g_utf8_strchr("_-.'`/,;:!?)]} ",-1,nc))
1940 if (pswit[ECHO_SWITCH])
1941 g_print("\n%s\n",aline);
1942 if (!pswit[OVERVIEW_SWITCH])
1943 g_print(" Line %ld column %ld - "
1944 "Wrongspaced quotes?\n",
1945 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/*
 * Other case: the next character must be a letter, a digit or one of the
 * listed characters, and must exist.
 */
1953 if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
1954 !g_utf8_strchr("_-/.'`([{$",-1,nc) || !nc)
1956 if (pswit[ECHO_SWITCH])
1957 g_print("\n%s\n",aline);
1958 if (!pswit[OVERVIEW_SWITCH])
1959 g_print(" Line %ld column %ld - "
1960 "Wrongspaced quotes?\n",
1961 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Line starting with a double quote followed by closing punctuation or space. */
1968 if (g_utf8_get_char(aline)==CHAR_DQUOTE)
1970 if (g_utf8_strchr(",;:!?)]} ",-1,
1971 g_utf8_get_char(g_utf8_next_char(aline))))
1973 if (pswit[ECHO_SWITCH])
1974 g_print("\n%s\n",aline);
1975 if (!pswit[OVERVIEW_SWITCH])
1976 g_print(" Line %ld column 1 - Wrongspaced quotes?\n",
/* The single-quote parity scan runs only with SQUOTE_SWITCH. */
1982 if (pswit[SQUOTE_SWITCH])
1984 nc=g_utf8_get_char(aline);
1985 for (s=aline;*s;s=g_utf8_next_char(s))
1988 nc=g_utf8_get_char(g_utf8_next_char(s));
/*
 * A single quote counts when it starts the line or lacks a letter on at
 * least one side; one with letters on both sides is skipped.
 */
1989 if ((c==CHAR_SQUOTE || c==CHAR_OPEN_SQUOTE) && (s==aline ||
1991 !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
1992 !g_unichar_isalpha(nc)))
1994 parities->squote=!parities->squote;
1995 if (!parities->squote)
1998 if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
2000 if (pswit[ECHO_SWITCH])
2001 g_print("\n%s\n",aline);
2002 if (!pswit[OVERVIEW_SWITCH])
2003 g_print(" Line %ld column %ld - "
2004 "Wrongspaced singlequotes?\n",
2005 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2013 if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
2014 !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
2016 if (pswit[ECHO_SWITCH])
2017 g_print("\n%s\n",aline);
2018 if (!pswit[OVERVIEW_SWITCH])
2019 g_print(" Line %ld column %ld - "
2020 "Wrongspaced singlequotes?\n",
2021 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2032 * check_for_double_punctuation:
2034 * Look for double punctuation like ,. or ,,
2035 * Thanks to DW for the suggestion!
2036 * In books with references, ".," and ".;" are common
2037 * e.g. "etc., etc.," and vol. 1.; vol 3.;
2038 * OTOH, from my initial tests, there are also fairly
2039 * common errors. What to do? Make these cases paranoid?
2040 * ".," is the most common, so warnings->dotcomma is used
2041 * to suppress detailed reporting if it occurs often.
2043 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
/*
 * Queries two adjacent characters from the set . ? ! , ; : unless the pair
 * is one of the tolerated combinations tested below.
 * aline:    the line under test.
 * warnings: the dotcomma and isFrench flags are read.
 * NOTE(review): the statements that load c from nc are not visible in this
 * listing.
 */
2047 nc=g_utf8_get_char(aline);
2048 for (s=aline;*s;s=g_utf8_next_char(s))
2051 nc=g_utf8_get_char(g_utf8_next_char(s));
2052 /* for each punctuation character in the line */
2053 if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
2054 g_utf8_strchr(".?!,;:",-1,nc))
2056 /* followed by punctuation, it's a query, unless . . . */
/*
 * Tolerated: a doubled period, question mark or exclamation mark; a
 * period-comma pair while warnings->dotcomma is clear; and, for French
 * texts, three periods next to one of , ; : ! ? on either side.
 */
2057 if (c==nc && (c=='.' || c=='?' || c=='!') ||
2058 !warnings->dotcomma && c=='.' && nc==',' ||
2059 warnings->isFrench && g_str_has_prefix(s,",...") ||
2060 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2061 warnings->isFrench && g_str_has_prefix(s,";...") ||
2062 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2063 warnings->isFrench && g_str_has_prefix(s,":...") ||
2064 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2065 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2066 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2067 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2068 warnings->isFrench && g_str_has_prefix(s,"...?"))
/*
 * The French combinations are tested a second time. Presumably the lines
 * not visible here advance s past the matched text - confirm. nc is then
 * reloaded from the character after s.
 */
2070 if (warnings->isFrench && g_str_has_prefix(s,",...") ||
2071 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2072 warnings->isFrench && g_str_has_prefix(s,";...") ||
2073 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2074 warnings->isFrench && g_str_has_prefix(s,":...") ||
2075 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2076 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2077 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2078 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2079 warnings->isFrench && g_str_has_prefix(s,"...?"))
2082 nc=g_utf8_get_char(g_utf8_next_char(s));
2084 ; /* do nothing for .. !! and ?? which can be legit */
2088 if (pswit[ECHO_SWITCH])
2089 g_print("\n%s\n",aline);
2090 if (!pswit[OVERVIEW_SWITCH])
2091 g_print(" Line %ld column %ld - Double punctuation?\n",
2092 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2101 * check_for_spaced_quotes:
2103 void check_for_spaced_quotes(const char *aline)
/*
 * Queries every quote character that has a space on both sides. Three
 * passes are made: double quote, apostrophe, then backquote. The last two
 * print the same singlequote message.
 * aline: the line under test. Nothing is returned; findings go to g_print.
 * NOTE(review): the statements that reset s before each pass are not
 * visible in this listing.
 */
2107 while ((t=strstr(s," \" ")))
2109 if (pswit[ECHO_SWITCH])
2110 g_print("\n%s\n",aline);
2111 if (!pswit[OVERVIEW_SWITCH])
2112 g_print(" Line %ld column %ld - Spaced doublequote?\n",
2113 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
/*
 * Resume two characters past the match start, that is on the space after
 * the quote. That space can therefore begin the next match.
 */
2116 s=g_utf8_next_char(g_utf8_next_char(t));
2119 while ((t=strstr(s," ' ")))
2121 if (pswit[ECHO_SWITCH])
2122 g_print("\n%s\n",aline);
2123 if (!pswit[OVERVIEW_SWITCH])
2124 g_print(" Line %ld column %ld - Spaced singlequote?\n",
2125 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2128 s=g_utf8_next_char(g_utf8_next_char(t));
2131 while ((t=strstr(s," ` ")))
2133 if (pswit[ECHO_SWITCH])
2134 g_print("\n%s\n",aline);
2135 if (!pswit[OVERVIEW_SWITCH])
2136 g_print(" Line %ld column %ld - Spaced singlequote?\n",
2137 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2140 s=g_utf8_next_char(g_utf8_next_char(t));
2145 * check_for_miscased_genative:
2147 * Check special case of 'S instead of 's at end of word.
2149 void check_for_miscased_genative(const char *aline)
/*
 * Queries an apostrophe that follows a lower-case letter and is itself
 * followed by a capital S.
 * aline: the line under test. Nothing is returned; findings go to g_print.
 * NOTE(review): the statements that shift c into pc and nc into c are not
 * visible in this listing.
 */
/* c starts as the first character, nc as the second (0 on an empty line). */
2155 c=g_utf8_get_char(aline);
2156 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2157 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2161 nc=g_utf8_get_char(g_utf8_next_char(s));
2162 if (c==CHAR_SQUOTE && nc=='S' && g_unichar_islower(pc))
2164 if (pswit[ECHO_SWITCH])
2165 g_print("\n%s\n",aline);
2166 if (!pswit[OVERVIEW_SWITCH])
/* The column is the offset of s plus 2. */
2167 g_print(" Line %ld column %ld - Capital \"S\"?\n",
2168 linecnt,g_utf8_pointer_to_offset(aline,s)+2);
2176 * check_end_of_line:
2178 * Now check special cases - start and end of line -
2179 * for single and double quotes. Start is sometimes [sic]
2180 * but better to query it anyway.
2181 * While we're here, check for dash at end of line.
2183 void check_end_of_line(const char *aline,struct warnings *warnings)
/*
 * Checks the two ends of the line:
 *   - a quote character as the last character (the rest of that condition
 *     is on a line not visible here), reported as Spaced quote;
 *   - a single quote as the first character followed by a space;
 *   - with PARANOID_SWITCH and warnings->hyphen, a single hyphen as the
 *     last character that is above the space character.
 * aline:    the line under test.
 * warnings: only the hyphen flag is read in the visible part of the body.
 */
/* lbytes is the byte length; the first check needs more than one character. */
2188 lbytes=strlen(aline);
2189 if (g_utf8_strlen(aline,lbytes)>1)
/* c1 is the last character of the line, c2 the one before it. */
2191 s=g_utf8_prev_char(aline+lbytes);
2192 c1=g_utf8_get_char(s);
2193 c2=g_utf8_get_char(g_utf8_prev_char(s));
2194 if ((c1==CHAR_DQUOTE || c1==CHAR_SQUOTE || c1==CHAR_OPEN_SQUOTE) &&
2197 if (pswit[ECHO_SWITCH])
2198 g_print("\n%s\n",aline);
2199 if (!pswit[OVERVIEW_SWITCH])
/* The reported column is the character count of the line. */
2200 g_print(" Line %ld column %ld - Spaced quote?\n",linecnt,
2201 g_utf8_strlen(aline,lbytes));
/* Now the first two characters of the line. */
2205 c1=g_utf8_get_char(aline);
2206 c2=g_utf8_get_char(g_utf8_next_char(aline));
2207 if ((c1==CHAR_SQUOTE || c1==CHAR_OPEN_SQUOTE) && c2==CHAR_SPACE)
2209 if (pswit[ECHO_SWITCH])
2210 g_print("\n%s\n",aline);
2211 if (!pswit[OVERVIEW_SWITCH])
2212 g_print(" Line %ld column 1 - Spaced quote?\n",linecnt);
2217 * Dash at end of line may well be legit - paranoid mode only
2218 * and don't report em-dash at line-end.
2220 if (pswit[PARANOID_SWITCH] && warnings->hyphen)
/*
 * Step back from the end over characters at or below the space character.
 * NOTE(review): the start position is taken from aline+lbytes with no
 * visible guard for an empty line - confirm the caller never passes one.
 */
2222 for (s=g_utf8_prev_char(aline+lbytes);
2223 s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
/* A hyphen whose predecessor is also a hyphen is not reported. */
2225 if (g_utf8_get_char(s)=='-' &&
2226 g_utf8_get_char(g_utf8_prev_char(s))!='-')
2228 if (pswit[ECHO_SWITCH])
2229 g_print("\n%s\n",aline);
2230 if (!pswit[OVERVIEW_SWITCH])
2231 g_print(" Line %ld column %ld - "
2232 "Hyphen at end of line?\n",
2233 linecnt,g_utf8_pointer_to_offset(aline,s));
2240 * check_for_unspaced_bracket:
2242 * Brackets are often unspaced, but shouldn't be surrounded by alpha.
2243 * If so, suspect a scanno like "a]most".
2245 void check_for_unspaced_bracket(const char *aline)
/*
 * Queries any bracket character (round, square or curly, opening or
 * closing) that has a letter directly on both sides.
 * aline: the line under test. Nothing is returned; findings go to g_print.
 * NOTE(review): the statements that shift c into pc and nc into c are not
 * visible in this listing.
 */
/* c starts as the first character, nc as the second (0 on an empty line). */
2249 c=g_utf8_get_char(aline);
2250 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2251 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2255 nc=g_utf8_get_char(g_utf8_next_char(s));
2258 /* for each bracket character in the line except 1st & last */
2259 if (g_utf8_strchr("{[()]}",-1,c) &&
2260 g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
2262 if (pswit[ECHO_SWITCH])
2263 g_print("\n%s\n",aline);
2264 if (!pswit[OVERVIEW_SWITCH])
/* The column passed is the raw character offset of s, with nothing added. */
2265 g_print(" Line %ld column %ld - Unspaced bracket?\n",
2266 linecnt,g_utf8_pointer_to_offset(aline,s));
2274 * check_for_unpunctuated_endquote:
2276 void check_for_unpunctuated_endquote(const char *aline)
2280 c=g_utf8_get_char(aline);
2281 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2282 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2286 nc=g_utf8_get_char(g_utf8_next_char(s));
2287 /* for each character in the line except 1st */
2288 if (c==CHAR_DQUOTE && isalpha(pc))
2290 if (pswit[ECHO_SWITCH])
2291 g_print("\n%s\n",aline);
2292 if (!pswit[OVERVIEW_SWITCH])
2293 g_print(" Line %ld column %ld - "
2294 "endquote missing punctuation?\n",
2295 linecnt,g_utf8_pointer_to_offset(aline,s));
2303 * check_for_html_tag:
2305 * Check for <HTML TAG>.
2307 * If there is a < in the line, followed at some point
2308 * by a > then we suspect HTML.
2310 void check_for_html_tag(const char *aline)
/*
 * Queries the first left angle bracket on the line that is followed
 * somewhere later by a right angle bracket, and prints the text between
 * them, brackets included.
 * aline: the line under test. Nothing is returned; findings go to g_print.
 */
2312 const char *open,*close;
2314 open=strchr(aline,'<');
/* The closing bracket is searched for from the character after open. */
2317 close=strchr(g_utf8_next_char(open),'>');
2320 if (pswit[ECHO_SWITCH])
2321 g_print("\n%s\n",aline);
2322 if (!pswit[OVERVIEW_SWITCH])
/* tag is a heap copy spanning both brackets. */
2324 tag=g_strndup(open,close-open+1);
2325 g_print(" Line %ld column %ld - HTML Tag? %s \n",
2326 linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
2336 * check_for_html_entity:
2338 * Check for &symbol; HTML.
2340 * If there is a & in the line, followed at
2341 * some point by a ; then we suspect HTML.
2343 void check_for_html_entity(const char *aline)
2345 const char *s,*amp,*scolon;
2347 amp=strchr(aline,'&');
2350 scolon=strchr(amp,';');
2353 for (s=amp;s<scolon;s=g_utf8_next_char(s))
2354 if (g_utf8_get_char(s)==CHAR_SPACE)
2355 break; /* Don't report "Jones & Son;" */
2358 if (pswit[ECHO_SWITCH])
2359 g_print("\n%s\n",aline);
2360 if (!pswit[OVERVIEW_SWITCH])
2362 entity=g_strndup(amp,scolon-amp+1);
2363 g_print(" Line %ld column %d - HTML symbol? %s \n",
2364 linecnt,(int)(amp-aline)+1,entity);
2377 * If we are in a state of unbalanced quotes, and this line
2378 * doesn't begin with a quote, output the stored error message.
2379 * If the -P switch was used, print the warning even if the
2380 * new para starts with quotes.
2382 void print_pending(const char *aline,const char *parastart,
2383 struct pending *pending)
/*
 * Prints and then discards the warning strings held in pending.
 * aline:     the current line; presumably s is derived from it on lines
 *            not visible here - confirm.
 * parastart: text echoed before a message when ECHO_SWITCH is set.
 * pending:   holds up to six heap strings (dquote, squote, rbrack, sbrack,
 *            cbrack, unders). Each non-NULL one is freed and reset to NULL.
 * The quote messages depend on the character c read below; the bracket
 * and underscore messages do not.
 */
2390 c=g_utf8_get_char(s);
2391 if (pending->dquote)
/* Shown unless c is a double quote; QPARA_SWITCH forces it to be shown. */
2393 if (c!=CHAR_DQUOTE || pswit[QPARA_SWITCH])
2395 if (!pswit[OVERVIEW_SWITCH])
2397 if (pswit[ECHO_SWITCH])
2398 g_print("\n%s\n",parastart);
2399 g_print("%s\n",pending->dquote);
2404 g_free(pending->dquote);
2405 pending->dquote=NULL;
2407 if (pending->squote)
/* Same idea for single quotes; the end of this condition is not visible. */
2409 if (c!=CHAR_SQUOTE && c!=CHAR_OPEN_SQUOTE || pswit[QPARA_SWITCH] ||
2412 if (!pswit[OVERVIEW_SWITCH])
2414 if (pswit[ECHO_SWITCH])
2415 g_print("\n%s\n",parastart);
2416 g_print("%s\n",pending->squote);
2421 g_free(pending->squote);
2422 pending->squote=NULL;
/* The remaining four messages have no visible condition on c. */
2424 if (pending->rbrack)
2426 if (!pswit[OVERVIEW_SWITCH])
2428 if (pswit[ECHO_SWITCH])
2429 g_print("\n%s\n",parastart);
2430 g_print("%s\n",pending->rbrack);
2434 g_free(pending->rbrack);
2435 pending->rbrack=NULL;
2437 if (pending->sbrack)
2439 if (!pswit[OVERVIEW_SWITCH])
2441 if (pswit[ECHO_SWITCH])
2442 g_print("\n%s\n",parastart);
2443 g_print("%s\n",pending->sbrack);
2447 g_free(pending->sbrack);
2448 pending->sbrack=NULL;
2450 if (pending->cbrack)
2452 if (!pswit[OVERVIEW_SWITCH])
2454 if (pswit[ECHO_SWITCH])
2455 g_print("\n%s\n",parastart);
2456 g_print("%s\n",pending->cbrack);
2460 g_free(pending->cbrack);
2461 pending->cbrack=NULL;
2463 if (pending->unders)
2465 if (!pswit[OVERVIEW_SWITCH])
2467 if (pswit[ECHO_SWITCH])
2468 g_print("\n%s\n",parastart);
2469 g_print("%s\n",pending->unders);
2473 g_free(pending->unders);
2474 pending->unders=NULL;
2479 * check_for_mismatched_quotes:
2481 * At end of paragraph, check for mismatched quotes.
2483 * We don't want to report an error immediately, since it is a
2484 * common convention to omit the quotes at end of paragraph if
2485 * the next paragraph is a continuation of the same speaker.
2486 * Where this is the case, the next para should begin with a
2487 * quote, so we store the warning message and only display it
2488 * at the top of the next iteration if the new para doesn't
2489 * start with a quote.
2490 * The -p switch overrides this default, and warns of unclosed
2491 * quotes on _every_ paragraph, whether the next begins with a
2494 void check_for_mismatched_quotes(const struct counters *counters,
2495 struct pending *pending)
/*
 * Builds warning strings from the counters instead of printing them.
 * counters: quot, open_single_quote, close_single_quote, r_brack, s_brack,
 *           c_brack and c_unders are read.
 * pending:  presumably receives each string built below; the assignment
 *           targets are on lines not visible in this listing - confirm.
 * Each string is heap-allocated by g_strdup_printf.
 */
/* An odd number of double quotes. */
2497 if (counters->quot%2)
2499 g_strdup_printf(" Line %ld - Mismatched quotes",linecnt);
/* With SQUOTE_SWITCH: opening single quotes seen, and their count differs from the closing count. */
2500 if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&
2501 counters->open_single_quote!=counters->close_single_quote)
2503 g_strdup_printf(" Line %ld - Mismatched singlequotes?",linecnt);
/* As above, but also the opening count is not exactly one more than the closing count. */
2504 if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&
2505 counters->open_single_quote!=counters->close_single_quote &&
2506 counters->open_single_quote!=counters->close_single_quote+1)
2508 * Flag it to be noted regardless of the
2509 * first char of the next para.
/* Bracket counters are tested for non-zero, the underscore counter for odd. */
2512 if (counters->r_brack)
2514 g_strdup_printf(" Line %ld - Mismatched round brackets?",linecnt);
2515 if (counters->s_brack)
2517 g_strdup_printf(" Line %ld - Mismatched square brackets?",linecnt);
2518 if (counters->c_brack)
2520 g_strdup_printf(" Line %ld - Mismatched curly brackets?",linecnt);
2521 if (counters->c_unders%2)
2523 g_strdup_printf(" Line %ld - Mismatched underscores?",linecnt);
2527 * check_for_omitted_punctuation:
2529 * Check for omitted punctuation at end of paragraph by working back
2530 * through prevline. DW.
2531 * Need to check this only for "normal" paras.
2532 * So what is a "normal" para?
2533 * Not normal if one-liner (chapter headings, etc.)
2534 * Not normal if doesn't contain at least one locase letter
2535 * Not normal if starts with space
2537 void check_for_omitted_punctuation(const char *prevline,
2538 struct line_properties *last,int start_para_line)
/*
 * Queries a paragraph whose last line ends in a letter instead of
 * punctuation.
 * prevline:        the last line of the paragraph just ended.
 * last:            only the blen member is read here.
 * start_para_line: line number on which that paragraph started.
 * The message is attributed to line linecnt-1.
 */
2540 gboolean letter_on_line=FALSE;
/* First find out whether prevline holds any letter at all. */
2542 for (s=prevline;*s;s=g_utf8_next_char(s))
2543 if (g_unichar_isalpha(g_utf8_get_char(s)))
2545 letter_on_line=TRUE;
2549 * This next "if" is a problem.
2550 * If we say "start_para_line <= linecnt - 1", that includes
2551 * one-line "paragraphs" like chapter heads. Lotsa false positives.
2552 * If we say "start_para_line < linecnt - 1" it doesn't, but then it
2553 * misses genuine one-line paragraphs.
/* Needs a letter, last->blen above 2, an earlier paragraph start, and a first character above the space character. */
2555 if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
2556 g_utf8_get_char(prevline)>CHAR_SPACE)
/* From the end of the line, step back over trailing double quotes and apostrophes. */
2558 for (s=g_utf8_prev_char(prevline+strlen(prevline));
2559 (g_utf8_get_char(s)==CHAR_DQUOTE ||
2560 g_utf8_get_char(s)==CHAR_SQUOTE) &&
2561 g_utf8_get_char(s)>CHAR_SPACE && s>prevline;
2562 s=g_utf8_prev_char(s))
/*
 * Keep stepping back. A letter leads to the query below; a character from
 * the punctuation set is tested at the bottom, presumably ending the
 * search on a line not visible here - confirm.
 */
2564 for (;s>prevline;s=g_utf8_prev_char(s))
2566 if (g_unichar_isalpha(g_utf8_get_char(s)))
2568 if (pswit[ECHO_SWITCH])
2569 g_print("\n%s\n",prevline);
2570 if (!pswit[OVERVIEW_SWITCH])
/* The reported column is the character length of prevline. */
2571 g_print(" Line %ld column %ld - "
2572 "No punctuation at para end?\n",
2573 linecnt-1,g_utf8_strlen(prevline,-1));
2578 if (g_utf8_strchr("-.:!([{?}])",-1,g_utf8_get_char(s)))
2584 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
/*
 * Prints a note saying how many times a queried word was duplicated.
 * key:   the word, used as a C string.
 * value: presumably the duplicate counter stored with the word; its use is
 *        on lines not visible in this listing - confirm.
 * data:  not used in the visible lines.
 * NOTE(review): the signature has the shape of a GLib tree traversal
 * callback; the return statement is not visible here.
 */
2586 const char *word=key;
2589 g_print("\nNote: Queried word %s was duplicated %d times\n",
2594 void print_as_windows_1252(const char *string)
2596 gsize inbytes,outbytes;
2598 GIConv converter=(GIConv)-1;
2601 if (converter!=(GIConv)-1)
2602 g_iconv_close(converter);
2603 converter=(GIConv)-1;
2606 if (converter=(GIConv)-1)
2607 converter=g_iconv_open("WINDOWS-1252","UTF-8");
2608 if (converter!=(GIConv)-1)
2610 inbytes=outbytes=strlen(string);
2611 bp=buf=g_malloc(outbytes+1);
2612 g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
2618 fputs(string,stdout);
/*
 * procfile:
 *
 * Check one file. The text is loaded with read_etext(), summarised by
 * first_pass()/report_first_pass(), and then walked line by line with
 * flgets(), applying the individual check_for_*() routines to each line.
 */
2626 void procfile(const char *filename)
2629 gchar *parastart=NULL; /* first line of current para */
2630 gchar *etext,*aline;
2633 struct first_pass_results *first_pass_results;
2634 struct warnings *warnings;
2635 struct counters counters={0};
2636 struct line_properties last={0};
2637 struct parities parities={0};
2638 struct pending pending={0};
2639 gboolean isemptyline;
2640 long start_para_line=0;
2641 gboolean isnewpara=FALSE,enddash=FALSE;
2642 last.start=CHAR_SPACE;
2643 linecnt=checked_linecnt=0;
2644 etext=read_etext(filename,&err);
/* A read failure is reported on stdout when STDOUT_SWITCH is set. */
2647 if (pswit[STDOUT_SWITCH])
2648 fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
2650 fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
/* From here on g_print() output is routed through print_as_windows_1252(). */
2653 g_set_print_handler(print_as_windows_1252);
2654 g_print("\n\nFile: %s\n\n",filename);
2655 first_pass_results=first_pass(etext);
2656 warnings=report_first_pass(first_pass_results);
/*
 * Both trees are ordered with strcmp() and free their keys with g_free();
 * only qword also frees its values.
 */
2657 qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
2658 qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
2660 * Here we go with the main pass. Hold onto yer hat!
/* flgets() is told the number the line it is about to read will have. */
2664 while ((aline=flgets(&etext_ptr,linecnt+1)))
2669 if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
2670 continue; // skip DP page separators completely
/*
 * Lines before firstline, or after a non-zero footerline, are not part of
 * the body text found by the first pass.
 */
2671 if (linecnt<first_pass_results->firstline ||
2672 (first_pass_results->footerline>0 &&
2673 linecnt>first_pass_results->footerline))
/* With HEADER_SWITCH the recognised header fields are echoed. */
2675 if (pswit[HEADER_SWITCH])
2677 if (g_str_has_prefix(aline,"Title:"))
2678 g_print(" %s\n",aline);
2679 if (g_str_has_prefix(aline,"Author:"))
2680 g_print(" %s\n",aline);
2681 if (g_str_has_prefix(aline,"Release Date:"))
2682 g_print(" %s\n",aline);
2683 if (g_str_has_prefix(aline,"Edition:"))
2684 g_print(" %s\n\n",aline);
2686 continue; /* skip through the header */
/* Emit whatever the previous line left pending, then start afresh. */
2689 print_pending(aline,parastart,&pending);
2690 memset(&pending,0,sizeof(pending));
2691 isemptyline=analyse_quotes(aline,&counters);
2692 if (isnewpara && !isemptyline)
2694 /* This line is the start of a new paragraph. */
2695 start_para_line=linecnt;
2696 /* Capture its first line in case we want to report it later. */
2698 parastart=g_strdup(aline);
2699 memset(&parities,0,sizeof(parities)); /* restart the quote count */
/* Advance s to the first letter or digit (or the end of the line). */
2701 while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
2702 !g_unichar_isdigit(g_utf8_get_char(s)))
2703 s=g_utf8_next_char(s);
2704 if (g_unichar_islower(g_utf8_get_char(s)))
2706 /* and its first letter is lowercase */
2707 if (pswit[ECHO_SWITCH])
2708 g_print("\n%s\n",aline);
2709 if (!pswit[OVERVIEW_SWITCH])
2710 g_print(" Line %ld column %ld - "
2711 "Paragraph starts with lower-case\n",
2712 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2716 isnewpara=FALSE; /* Signal the end of new para processing. */
2718 /* Check for an em-dash broken at line end. */
2719 if (enddash && g_utf8_get_char(aline)=='-')
2721 if (pswit[ECHO_SWITCH])
2722 g_print("\n%s\n",aline);
2723 if (!pswit[OVERVIEW_SWITCH])
2724 g_print(" Line %ld column 1 - Broken em-dash?\n",linecnt);
/* Step back over trailing spaces to look at the last visible character. */
2729 for (s=g_utf8_prev_char(aline+strlen(aline));
2730 g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
2732 if (s>=aline && g_utf8_get_char(s)=='-')
2734 check_for_control_characters(aline);
2736 check_for_odd_characters(aline,warnings,isemptyline);
/* Line-length checks run only when the first pass left them enabled. */
2737 if (warnings->longline)
2738 check_for_long_line(aline);
2739 if (warnings->shortline)
2740 check_for_short_line(aline,&last);
/* Record this line's length and first character in "last". */
2742 last.len=g_utf8_strlen(aline,-1);
2743 last.start=g_utf8_get_char(aline);
2744 check_for_starting_punctuation(aline);
2747 check_for_spaced_emdash(aline);
2748 check_for_spaced_dash(aline);
2750 check_for_unmarked_paragraphs(aline);
2751 check_for_jeebies(aline);
2752 check_for_mta_from(aline);
2753 check_for_orphan_character(aline);
2754 check_for_pling_scanno(aline);
2755 check_for_extra_period(aline,warnings);
2756 check_for_following_punctuation(aline);
2757 check_for_typos(aline,warnings);
2758 check_for_misspaced_punctuation(aline,&parities,isemptyline);
2759 check_for_double_punctuation(aline,warnings);
2760 check_for_spaced_quotes(aline);
2761 check_for_miscased_genative(aline);
2762 check_end_of_line(aline,warnings);
2763 check_for_unspaced_bracket(aline);
2764 if (warnings->endquote)
2765 check_for_unpunctuated_endquote(aline);
2766 check_for_html_tag(aline);
2767 check_for_html_entity(aline);
/* Quote counters are checked and then cleared for the next paragraph. */
2770 check_for_mismatched_quotes(&counters,&pending);
2771 memset(&counters,0,sizeof(counters));
2772 /* let the next iteration know that it's starting a new para */
2775 check_for_omitted_punctuation(prevline,&last,start_para_line);
/* Keep a private copy of this line for the next iteration. */
2778 prevline=g_strdup(aline);
/* Duplicate-query notes are suppressed in overview mode. */
2788 if (!pswit[OVERVIEW_SWITCH])
2789 g_tree_foreach(qword,report_duplicate_queries,NULL);
2790 g_tree_unref(qword);
2791 g_tree_unref(qperiod);
/* Restore the default print handler before the final NULL call. */
2792 g_set_print_handler(NULL);
/* NOTE(review): the NULL argument presumably tells it to release its converter - confirm. */
2793 print_as_windows_1252(NULL);
2794 if (pswit[MARKUP_SWITCH])
2801 * Get one line from the input text, checking for
2802 * the existence of exactly one CR/LF line-end per line.
2804 * Returns: a pointer to the line.
/*
 * etext is both input and output: the line starts at *etext, and *etext is
 * stepped forward character by character as the line is consumed.
 * lcnt is only used as the line number in the diagnostics printed below.
 */
2806 char *flgets(char **etext,long lcnt)
2809 gboolean isCR=FALSE;
2810 char *theline=*etext;
/* Fetch the next character and step the caller's pointer past it. */
2815 c=g_utf8_get_char(*etext);
2816 *etext=g_utf8_next_char(*etext);
2819 /* either way, it's end of line */
2826 /* Error - a LF without a preceding CR */
/* Line-end problems are only reported when LINE_END_SWITCH is set. */
2827 if (pswit[LINE_END_SWITCH])
2829 if (pswit[ECHO_SWITCH])
/* Echo just the text of this line, from theline up to eos. */
2831 s=g_strndup(theline,eos-theline);
2832 g_print("\n%s\n",s);
2835 if (!pswit[OVERVIEW_SWITCH])
2836 g_print(" Line %ld - No CR?\n",lcnt);
2847 /* Error - two successive CRs */
2848 if (pswit[LINE_END_SWITCH])
2850 if (pswit[ECHO_SWITCH])
2852 s=g_strndup(theline,eos-theline);
2853 g_print("\n%s\n",s);
2856 if (!pswit[OVERVIEW_SWITCH])
2857 g_print(" Line %ld - Two successive CRs?\n",lcnt);
/* A CR has been seen (isCR) but this character does not complete a CR/LF. */
2866 if (pswit[LINE_END_SWITCH] && isCR)
2868 if (pswit[ECHO_SWITCH])
2870 s=g_strndup(theline,eos-theline);
2871 g_print("\n%s\n",s);
2874 if (!pswit[OVERVIEW_SWITCH])
2875 g_print(" Line %ld column %ld - CR without LF?\n",
2876 lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
2882 eos=g_utf8_next_char(eos);
/* Optional in-place clean-up of the line: HTML first, then DP markup. */
2886 if (pswit[MARKUP_SWITCH])
2887 postprocess_for_HTML(theline);
2888 if (pswit[DP_SWITCH])
2889 postprocess_for_DP(theline);
2896 * Takes a "word" as a parameter, and checks whether it
2897 * contains a mixture of alpha and digits. Generally, this is an
2898 * error, but may not be for cases like 4th or L5 12s. 3d.
2900 * Returns: TRUE iff an error is found.
/*
 * checkword is a NUL-terminated UTF-8 word. The function recurses on the
 * remainder of the word for the leading-'L' (pounds) case at the end.
 */
2902 gboolean mixdigit(const char *checkword)
2904 gboolean wehaveadigit,wehavealetter,query;
2905 const char *s,*nondigit;
2906 wehaveadigit=wehavealetter=query=FALSE;
/* First scan: classify every character of the word as letter or digit. */
2907 for (s=checkword;*s;s=g_utf8_next_char(s))
2908 if (g_unichar_isalpha(g_utf8_get_char(s)))
2910 else if (g_unichar_isdigit(g_utf8_get_char(s)))
2912 if (wehaveadigit && wehavealetter)
2914 /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
/* Step nondigit past the leading run of digits to reach the suffix. */
2916 for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
2917 nondigit=g_utf8_next_char(nondigit))
2919 /* digits, ending in st, rd, nd, th of either case */
2920 if (!g_ascii_strcasecmp(nondigit,"st") ||
2921 !g_ascii_strcasecmp(nondigit,"rd") ||
2922 !g_ascii_strcasecmp(nondigit,"nd") ||
2923 !g_ascii_strcasecmp(nondigit,"th"))
/* The same ordinal suffixes in plural form. */
2925 if (!g_ascii_strcasecmp(nondigit,"sts") ||
2926 !g_ascii_strcasecmp(nondigit,"rds") ||
2927 !g_ascii_strcasecmp(nondigit,"nds") ||
2928 !g_ascii_strcasecmp(nondigit,"ths"))
/* The same ordinal suffixes followed by "ly". */
2930 if (!g_ascii_strcasecmp(nondigit,"stly") ||
2931 !g_ascii_strcasecmp(nondigit,"rdly") ||
2932 !g_ascii_strcasecmp(nondigit,"ndly") ||
2933 !g_ascii_strcasecmp(nondigit,"thly"))
2935 /* digits, ending in l, L, s or d */
/* Only the 'l' comparison ignores case; 's' and 'd' must be lower-case. */
2936 if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
2937 !strcmp(nondigit,"d"))
2940 * L at the start of a number, representing Britsh pounds, like L500.
2941 * This is cute. We know the current word is mixed digit. If the first
2942 * letter is L, there must be at least one digit following. If both
2943 * digits and letters follow, we have a genuine error, else we have a
2944 * capital L followed by digits, and we accept that as a non-error.
/* Recursive call on everything after the leading 'L'. */
2946 if (g_utf8_get_char(checkword)=='L' &&
2947 !mixdigit(g_utf8_next_char(checkword)))
2956 * Extracts the first/next "word" from the line, and returns it.
2957 * A word is defined as one English word unit--or at least that's the aim.
2958 * "ptr" is advanced to the position in the line where we will start
2959 * looking for the next word.
2961 * Returns: A newly-allocated string.
2963 gchar *getaword(const char **ptr)
2968 word=g_string_new(NULL);
2969 for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
2970 !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
2971 **ptr;*ptr=g_utf8_next_char(*ptr))
2974 * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
2975 * Especially yucky is the case of L1,000
2976 * This section looks for a pattern of characters including a digit
2977 * followed by a comma or period followed by one or more digits.
2978 * If found, it returns this whole pattern as a word; otherwise we discard
2979 * the results and resume our normal programming.
2982 for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
2983 g_unichar_isalpha(g_utf8_get_char(s)) ||
2984 g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
2985 g_string_append_unichar(word,g_utf8_get_char(s));
2986 for (t=g_utf8_next_char(word->str);*g_utf8_next_char(t);
2987 t=g_utf8_next_char(t))
2989 c=g_utf8_get_char(t);
2990 pc=g_utf8_get_char(g_utf8_prev_char(t));
2991 if ((c=='.' || c==',') && g_unichar_isdigit(pc))
2994 return g_string_free(word,FALSE);
2997 /* we didn't find a punctuated number - do the regular getword thing */
2998 g_string_truncate(word,0);
2999 for (;g_unichar_isdigit(g_utf8_get_char(*ptr)) ||
3000 g_unichar_isalpha(g_utf8_get_char(*ptr)) ||
3001 g_utf8_get_char(*ptr)=='\'';*ptr=g_utf8_next_char(*ptr))
3002 g_string_append_unichar(word,g_utf8_get_char(*ptr));
3003 return g_string_free(word,FALSE);
3009 * Is this word a Roman Numeral?
3011 * It doesn't actually validate that the number is a valid Roman Numeral--for
3012 * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
3013 * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
3014 * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
3015 * expressions thereof, except when it came to taxes. Allow any number of M,
3016 * an optional D, an optional CM or CD, any number of optional Cs, an optional
3017 * XL or an optional XC, an optional IX or IV, an optional V and any number
/*
 * All comparisons below are against lower-case literals, and each test looks
 * at the front of whatever t currently points to.
 * NOTE(review): presumably the caller passes an already lower-cased word and
 * each successful test advances t - confirm.
 */
3020 gboolean isroman(const char *t)
/* Any number of leading 'm'. */
3026 while (g_utf8_get_char(t)=='m' && *t)
/* An optional 'd'. */
3028 if (g_utf8_get_char(t)=='d')
/* An optional "cm" or "cd". */
3030 if (g_str_has_prefix(t,"cm"))
3032 if (g_str_has_prefix(t,"cd"))
/* Any number of 'c'. */
3034 while (g_utf8_get_char(t)=='c' && *t)
/* An optional "xl" or "xc". */
3036 if (g_str_has_prefix(t,"xl"))
3038 if (g_str_has_prefix(t,"xc"))
/* An optional 'l'. */
3040 if (g_utf8_get_char(t)=='l')
/* Any number of 'x'. */
3042 while (g_utf8_get_char(t)=='x' && *t)
/* An optional "ix" or "iv". */
3044 if (g_str_has_prefix(t,"ix"))
3046 if (g_str_has_prefix(t,"iv"))
/* An optional 'v'. */
3048 if (g_utf8_get_char(t)=='v')
/* Any number of trailing 'i'. */
3050 while (g_utf8_get_char(t)=='i' && *t)
3056 * postprocess_for_DP:
3058 * Invoked with the -d switch from flgets().
3059 * It simply "removes" from the line a hard-coded set of common
3060 * DP-specific tags, so that the line passed to the main routine has
3061 * been pre-cleaned of DP markup.
/* theline is edited in place; it can only get shorter. */
3063 void postprocess_for_DP(char *theline)
/* The DPmarkup table is terminated by an empty string. */
3069 for (i=0;*DPmarkup[i];i++)
/* Remove every occurrence of this tag, not just the first. */
3070 while ((s=strstr(theline,DPmarkup[i])))
3072 t=s+strlen(DPmarkup[i]);
/* Close the gap; the +1 carries the terminating NUL along. */
3073 memmove(s,t,strlen(t)+1);
3078 * postprocess_for_HTML:
3080 * Invoked with the -m switch from flgets().
3081 * It simply "removes" from the line a hard-coded set of common
3082 * HTML tags and "replaces" a hard-coded set of common HTML
3083 * entities, so that the line passed to the main routine has
3084 * been pre-cleaned of HTML.
/* theline is edited in place: tags are handled first, then entities. */
3086 void postprocess_for_HTML(char *theline)
/* Keep calling losemarkup() for as long as it returns non-NULL. */
3088 while (losemarkup(theline))
3090 loseentities(theline);
/*
 * losemarkup:
 *
 * Looks at the first '<' ... '>' pair in theline and compares what follows
 * the '<' against each entry of the markup table using tagcomp(). A
 * recognised tag is cut out of the line in place.
 * NOTE(review): the return statements are not visible here; the caller loops
 * while the result is non-NULL - confirm what is returned in each case.
 */
3093 char *losemarkup(char *theline)
3097 s=strchr(theline,'<');
/* Only look for the closing '>' after an opening '<' was found. */
3098 t=s?strchr(s,'>'):NULL;
/* The markup table is terminated by an empty string. */
3101 for (i=0;*markup[i];i++)
3102 if (tagcomp(g_utf8_next_char(s),markup[i]))
/* Step past the '>' and move the tail (with its NUL) over the tag. */
3104 t=g_utf8_next_char(t);
3105 memmove(s,t,strlen(t)+1);
3108 /* It's an unrecognized <xxx>. */
3112 void loseentities(char *theline)
3119 GTree *entities=NULL;
3120 GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
3124 g_tree_destroy(entities);
3126 if (translit==(GIConv)-1)
3127 g_iconv_close(translit);
3128 translit=(GIConv)-1;
3129 if (to_utf8==(GIConv)-1)
3130 g_iconv_close(to_utf8);
3138 entities=g_tree_new((GCompareFunc)strcmp);
3139 for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
3140 g_tree_insert(entities,HTMLentities[i].name,
3141 GUINT_TO_POINTER(HTMLentities[i].c));
3143 if (translit==(GIConv)-1)
3144 translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
3145 if (to_utf8==(GIConv)-1)
3146 to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
3147 while((amp=strchr(theline,'&')))
3149 scolon=strchr(amp,';');
3154 if (amp+2+strspn(amp+2,"0123456789")==scolon)
3155 c=strtol(amp+2,NULL,10);
3156 else if (amp[2]=='x' &&
3157 amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
3158 c=strtol(amp+3,NULL,16);
3162 s=g_strndup(amp+1,scolon-(amp+1));
3163 c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
3172 if (c<128 || c>=192 && c<=255) /* An ISO-8859-1 character */
3173 theline+=g_unichar_to_utf8(c,theline);
3177 nb=g_unichar_to_utf8(c,s);
3178 t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
3180 s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
3182 memcpy(theline,s,nb);
3186 memmove(theline,g_utf8_next_char(scolon),
3187 strlen(g_utf8_next_char(scolon))+1);
3190 theline=g_utf8_next_char(amp);
/*
 * tagcomp:
 *
 * Case-insensitive test of whether the text at strin begins with basetag.
 * A single leading '/' on strin is ignored. Because this is a prefix test,
 * any text that merely starts with basetag also matches.
 */
3194 gboolean tagcomp(const char *strin,const char *basetag)
3198 if (g_utf8_get_char(strin)=='/')
3199 t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
3201 t=g_utf8_casefold(strin,-1);
/* Both sides are case-folded so the comparison ignores case. */
3202 s=g_utf8_casefold(basetag,-1);
3203 retval=g_str_has_prefix(t,s);
3209 void proghelp(GOptionContext *context)
3212 fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
3213 fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
3214 fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
3215 fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
3216 "For details, read the file COPYING.\n",stderr);
3217 fputs("This is Free Software; "
3218 "you may redistribute it under certain conditions (GPL);\n",stderr);
3219 fputs("read the file COPYING for details.\n\n",stderr);
3220 help=g_option_context_get_help(context,TRUE,NULL);
3223 fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
3224 fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
3225 "non-ASCII\n",stderr);
3226 fputs("characters like accented letters, "
3227 "lines longer than 75 or shorter than 55,\n",stderr);
3228 fputs("unbalanced quotes or brackets, "
3229 "a variety of badly formatted punctuation, \n",stderr);
3230 fputs("HTML tags, some likely typos. "
3231 "It is NOT a substitute for human judgement.\n",stderr);