1 /*************************************************************************/
2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com> */
5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk> */
7 /* This program is free software; you can redistribute it and/or modify */
8 /* it under the terms of the GNU General Public License as published by */
9 /* the Free Software Foundation; either version 2 of the License, or */
10 /* (at your option) any later version. */
12 /* This program is distributed in the hope that it will be useful, */
13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
15 /* GNU General Public License for more details. */
17 /* You should have received a copy of the GNU General Public License */
18 /* along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 /*************************************************************************/
30 #include "bookloupe.h"
33 #include "HTMLentities.h"
39 "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
40 "nad", "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa",
41 "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
42 "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
43 "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
44 "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
45 "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
46 "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
47 "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
48 "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
49 "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
50 "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
51 "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
52 "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
53 "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
54 "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
55 "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
56 "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
57 "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
58 "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
59 "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
60 "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
61 "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
62 "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
63 "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
64 "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
65 "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
66 "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
67 "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
73 /* Common abbreviations and other OK words not to query as typos. */
75 "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
76 "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
77 "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
78 "outbid", "outbids", "frostbite", "frostbitten", ""
81 /* Common abbreviations that cause otherwise unexplained periods. */
83 "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
84 "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
88 * Two-Letter combinations that rarely if ever start words,
89 * but are common scannos or otherwise common letter combinations.
92 "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
96 * Two-Letter combinations that rarely if ever end words,
97 * but are common scannos or otherwise common letter combinations.
100 "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
101 "sw", "gr", "sl", "cl", "iy", ""
105 "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
106 "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
107 "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
108 "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
112 "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
116 "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
117 "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
118 "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
119 "during", "let", "toward", "among", ""
123 "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
124 "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
125 "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
126 "among", "those", "into", "whom", "having", "thence", ""
129 gboolean pswit[SWITNO]; /* program switches */
131 static GOptionEntry options[]={
132 { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
133 "Ignore DP-specific markup", NULL },
134 { "noecho", 'e', 0, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
135 "Don't echo queried line", NULL },
136 { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
137 "Check single quotes", NULL },
138 { "typo", 't', 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
139 "Check common typos", NULL },
140 { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
141 "Require closure of quotes on every paragraph", NULL },
142 { "relaxed", 'x', 0, G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
143 "Disable paranoid querying of everything", NULL },
144 { "line-end", 'l', 0, G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
145 "Disable line end checking", NULL },
146 { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
147 "Overview: just show counts", NULL },
148 { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
149 "Output errors to stdout instead of stderr", NULL },
150 { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
151 "Echo header fields", NULL },
152 { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
153 "Ignore markup in < >", NULL },
154 { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
155 "Use file of user-defined typos", NULL },
156 { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
157 "Defaults for use on www upload", NULL },
158 { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
159 "Verbose - list everything", NULL },
163 long cnt_quote; /* for overview mode, count of quote queries */
164 long cnt_brack; /* for overview mode, count of brackets queries */
165 long cnt_bin; /* for overview mode, count of non-ASCII queries */
166 long cnt_odd; /* for overview mode, count of odd character queries */
167 long cnt_long; /* for overview mode, count of long line errors */
168 long cnt_short; /* for overview mode, count of short line queries */
169 long cnt_punct; /* for overview mode,
170 count of punctuation and spacing queries */
171 long cnt_dash; /* for overview mode, count of dash-related queries */
172 long cnt_word; /* for overview mode, count of word queries */
173 long cnt_html; /* for overview mode, count of html queries */
174 long cnt_lineend; /* for overview mode, count of line-end queries */
175 long cnt_spacend; /* count of lines with space at end */
176 long linecnt; /* count of total lines in the file */
177 long checked_linecnt; /* count of lines actually checked */
179 void proghelp(GOptionContext *context);
180 void procfile(const char *);
184 gboolean mixdigit(const char *);
185 gchar *getaword(const char **);
186 char *flgets(char **,long);
187 void postprocess_for_HTML(char *);
188 char *linehasmarkup(char *);
189 char *losemarkup(char *);
190 gboolean tagcomp(const char *,const char *);
191 void loseentities(char *);
192 gboolean isroman(const char *);
193 void postprocess_for_DP(char *);
194 void print_as_windows_1252(const char *string);
195 void print_as_utf_8(const char *string);
197 GTree *qword,*qperiod;
203 void parse_options(int *argc,char ***argv)
206 GOptionContext *context;
207 context=g_option_context_new(
208 "file - looks for errors in Project Gutenberg(TM) etexts");
209 g_option_context_add_main_entries(context,options,NULL);
210 if (!g_option_context_parse(context,argc,argv,&err))
212 g_printerr("Bookloupe: %s\n",err->message);
213 g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
216 /* Paranoid checking is turned OFF, not on, by its switch */
217 pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
218 if (pswit[PARANOID_SWITCH])
219 /* if running in paranoid mode, typo checks default to enabled */
220 pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
221 /* Line-end checking is turned OFF, not on, by its switch */
222 pswit[LINE_END_SWITCH]=!pswit[LINE_END_SWITCH];
223 /* Echoing is turned OFF, not on, by its switch */
224 pswit[ECHO_SWITCH]=!pswit[ECHO_SWITCH];
225 if (pswit[OVERVIEW_SWITCH])
226 /* just print summary; don't echo */
227 pswit[ECHO_SWITCH]=FALSE;
229 * Web uploads - for the moment, this is really just a placeholder
230 * until we decide what processing we really want to do on web uploads
232 if (pswit[WEB_SWITCH])
234 /* specific override for web uploads */
235 pswit[ECHO_SWITCH]=TRUE;
236 pswit[SQUOTE_SWITCH]=FALSE;
237 pswit[TYPO_SWITCH]=TRUE;
238 pswit[QPARA_SWITCH]=FALSE;
239 pswit[PARANOID_SWITCH]=TRUE;
240 pswit[LINE_END_SWITCH]=FALSE;
241 pswit[OVERVIEW_SWITCH]=FALSE;
242 pswit[STDOUT_SWITCH]=FALSE;
243 pswit[HEADER_SWITCH]=TRUE;
244 pswit[VERBOSE_SWITCH]=FALSE;
245 pswit[MARKUP_SWITCH]=FALSE;
246 pswit[USERTYPO_SWITCH]=FALSE;
247 pswit[DP_SWITCH]=FALSE;
254 g_option_context_free(context);
260 * Read in the user-defined stealth scanno list.
262 void read_user_scannos(void)
265 gchar *usertypo_file;
269 gchar *contents,*utf8,**lines;
270 usertypo_file=g_strdup("bookloupe.typ");
271 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
272 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
275 g_free(usertypo_file);
276 usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
277 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
279 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
282 g_free(usertypo_file);
283 usertypo_file=g_strdup("gutcheck.typ");
284 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
286 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
289 g_free(usertypo_file);
290 usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
291 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
293 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
295 g_free(usertypo_file);
296 g_print(" --> I couldn't find bookloupe.typ "
297 "-- proceeding without user typos.\n");
302 fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
303 g_free(usertypo_file);
307 if (g_utf8_validate(contents,len,NULL))
308 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
310 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
312 lines=g_strsplit_set(utf8,"\r\n",0);
314 usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
315 for (i=0;lines[i];i++)
316 if (*(unsigned char *)lines[i]>'!')
317 g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
326 * Read an etext returning a newly allocated string containing the file
327 * contents or NULL on error.
329 gchar *read_etext(const char *filename,GError **err)
331 GError *tmp_err=NULL;
332 gchar *contents,*utf8;
333 gsize len,bytes_read,bytes_written;
335 if (!g_file_get_contents(filename,&contents,&len,err))
337 if (g_utf8_validate(contents,len,NULL))
339 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
340 g_set_print_handler(print_as_utf_8);
342 SetConsoleOutputCP(CP_UTF8);
347 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,
348 &bytes_written,&tmp_err);
349 if (g_error_matches(tmp_err,G_CONVERT_ERROR,
350 G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
353 for(i=0;i<bytes_read;i++)
354 if (contents[i]=='\n')
359 else if (contents[i]!='\r')
361 g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
362 "Input conversion failed. Byte %d at line %d, column %d is not a "
363 "valid Windows-1252 character",
364 ((unsigned char *)contents)[bytes_read],line,col);
367 g_propagate_error(err,tmp_err);
368 g_set_print_handler(print_as_windows_1252);
370 SetConsoleOutputCP(1252);
377 void cleanup_on_exit(void)
380 SetConsoleOutputCP(saved_cp);
384 int main(int argc,char **argv)
387 atexit(cleanup_on_exit);
388 saved_cp=GetConsoleOutputCP();
390 running_from=g_path_get_dirname(argv[0]);
391 parse_options(&argc,&argv);
392 if (pswit[USERTYPO_SWITCH])
394 fprintf(stderr,"bookloupe: Check and report on an e-text\n");
396 if (pswit[OVERVIEW_SWITCH])
398 g_print(" Checked %ld lines of %ld (head+foot = %ld)\n\n",
399 checked_linecnt,linecnt,linecnt-checked_linecnt);
400 g_print(" --------------- Queries found --------------\n");
402 g_print(" Long lines: %14ld\n",cnt_long);
404 g_print(" Short lines: %14ld\n",cnt_short);
406 g_print(" Line-end problems: %14ld\n",cnt_lineend);
408 g_print(" Common typos: %14ld\n",cnt_word);
410 g_print(" Unmatched quotes: %14ld\n",cnt_quote);
412 g_print(" Unmatched brackets: %14ld\n",cnt_brack);
414 g_print(" Non-ASCII characters: %14ld\n",cnt_bin);
416 g_print(" Proofing characters: %14ld\n",cnt_odd);
418 g_print(" Punctuation & spacing queries: %14ld\n",cnt_punct);
420 g_print(" Non-standard dashes: %14ld\n",cnt_dash);
422 g_print(" Possible HTML tags: %14ld\n",cnt_html);
424 g_print(" TOTAL QUERIES %14ld\n",
425 cnt_quote+cnt_brack+cnt_bin+cnt_odd+cnt_long+cnt_short+cnt_punct+
426 cnt_dash+cnt_word+cnt_html+cnt_lineend);
428 g_free(running_from);
430 g_tree_unref(usertypo);
434 void count_dashes(const char *line,const char *dash,
435 struct dash_results *results) /* split <line> on <dash> and classify each dash by the white space on either side, updating <results> */
440 gboolean spaced=FALSE,unspaced=FALSE,spaced2=FALSE;
443 tokens=g_strsplit(line,dash,0); /* tokens[i-1] and tokens[i] are the text before and after the i-th dash */
446 for(i=1;tokens[i];i++)
448 pc=*tokens[i-1]?g_utf8_get_char(g_utf8_prev_char(tokens[i-1]+strlen(tokens[i-1]))):0; /* an empty token (dash at start of line, or adjacent dashes) has no previous character; stepping back from it would read before the string, so treat it as a non-space NUL */
449 nc=g_utf8_get_char(tokens[i]); /* first character after the dash (NUL if the dash ends the line) */
450 if (g_unichar_isspace(pc) || g_unichar_isspace(nc))
452 if (g_unichar_isspace(pc) && g_unichar_isspace(nc))
454 else if (!g_unichar_isspace(pc) && !g_unichar_isspace(nc))
460 /* count of lines with em-dashes with spaces both sides */
461 results->non_PG_space++;
463 /* count of lines with PG-type em-dashes with no spaces */
471 * Run a first pass - verify that it's a valid PG
472 * file, decide whether to report some things that
473 * occur many times in the text like long or short
474 * lines, non-standard dashes, etc.
476 struct first_pass_results *first_pass(const char *etext)
478 gunichar laststart=CHAR_SPACE;
483 unsigned int lastlen=0,lastblen=0;
484 long spline=0,nspline=0;
485 static struct first_pass_results results={0};
486 struct dash_results tmp_dash_results;
489 lines=g_strsplit(etext,"\n",0);
490 for (j=0;lines[j];j++)
492 lbytes=strlen(lines[j]);
493 while (lbytes>0 && lines[j][lbytes-1]=='\r')
494 lines[j][--lbytes]='\0';
495 llen=g_utf8_strlen(lines[j],lbytes);
497 if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
498 (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
501 g_print(" --> Duplicate header?\n");
502 spline=linecnt+1; /* first line of non-header text, that is */
504 if (!strncmp(lines[j],"*** START",9) &&
505 strstr(lines[j],"PROJECT GUTENBERG"))
508 g_print(" --> Duplicate header?\n");
509 nspline=linecnt+1; /* first line of non-header text, that is */
511 if (spline || nspline)
513 lc_line=g_utf8_strdown(lines[j],lbytes);
514 if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
516 if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
518 if (results.footerline)
520 /* it's an old-form header - we can detect duplicates */
522 g_print(" --> Duplicate footer?\n");
525 results.footerline=linecnt;
531 results.firstline=spline;
533 results.firstline=nspline; /* override with new */
534 if (results.footerline)
535 continue; /* don't count the boilerplate in the footer */
536 results.totlen+=llen;
537 for (s=lines[j];*s;s=g_utf8_next_char(s))
539 if (g_utf8_get_char(s)>127)
541 if (g_unichar_isalpha(g_utf8_get_char(s)))
545 if (CHAR_IS_DQUOTE(g_utf8_get_char(s)))
546 qc=QUOTE_CLASS(g_utf8_get_char(s));
549 if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) &&
550 g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
551 results.endquote_count++;
554 if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
555 lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
558 g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
560 if (strstr(lines[j],".,"))
562 /* only count asterisk lines, for ignoring purposes, where there is */
563 /* lowercase text on the line */
564 if (strchr(lines[j],'*'))
566 for (s=lines[j];*s;s=g_utf8_next_char(s))
567 if (g_unichar_islower(g_utf8_get_char(s)))
572 if (strchr(lines[j],'/'))
573 results.fslashline++;
576 for (s=g_utf8_prev_char(lines[j]+lbytes);
577 s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;
578 s=g_utf8_prev_char(s))
580 if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
581 g_utf8_get_char(g_utf8_prev_char(s))!='-')
584 if (llen>LONGEST_PG_LINE)
586 if (llen>WAY_TOO_LONG)
587 results.verylongline++;
588 if (strchr(lines[j],'<') && strchr(lines[j],'>'))
590 i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
593 if (strstr(lines[j],"<i>"))
594 results.htmcount+=4; /* bonus marks! */
596 /* Check for spaced em-dashes */
597 memset(&tmp_dash_results,0,sizeof(tmp_dash_results));
598 count_dashes(lines[j],"--",&tmp_dash_results);
599 count_dashes(lines[j],"—",&tmp_dash_results);
600 if (tmp_dash_results.base)
601 results.emdash.base++;
602 if (tmp_dash_results.non_PG_space)
603 results.emdash.non_PG_space++;
604 if (tmp_dash_results.PG_space)
605 results.emdash.PG_space++;
609 if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))
610 results.Dutchcount++;
611 if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))
612 results.Frenchcount++;
613 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
614 results.standalone_digit++;
617 /* Check for spaced dashes */
618 if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
622 laststart=lines[j][0];
631 * Make some snap decisions based on the first pass results.
633 struct warnings *report_first_pass(struct first_pass_results *results)
635 static struct warnings warnings={0};
637 g_print(" --> %ld lines in this file have white space at end\n",
640 if (results->dotcomma>5)
643 g_print(" --> %ld lines in this file contain '.,'. "
644 "Not reporting them.\n",results->dotcomma);
647 * If more than 50 lines, or one-tenth, are short,
648 * don't bother reporting them.
650 warnings.shortline=1;
651 if (results->shortline>50 || results->shortline*10>linecnt)
653 warnings.shortline=0;
654 g_print(" --> %ld lines in this file are short. "
655 "Not reporting short lines.\n",results->shortline);
658 * If more than 50 lines, or one-tenth, are long,
659 * don't bother reporting them.
662 if (results->longline>50 || results->longline*10>linecnt)
665 g_print(" --> %ld lines in this file are long. "
666 "Not reporting long lines.\n",results->longline);
668 /* If more than 10 lines contain asterisks, don't bother reporting them. */
670 if (results->astline>10)
673 g_print(" --> %ld lines in this file contain asterisks. "
674 "Not reporting them.\n",results->astline);
677 * If more than 10 lines contain forward slashes,
678 * don't bother reporting them.
681 if (results->fslashline>10)
684 g_print(" --> %ld lines in this file contain forward slashes. "
685 "Not reporting them.\n",results->fslashline);
688 * If more than 20 lines contain unpunctuated endquotes,
689 * don't bother reporting them.
692 if (results->endquote_count>20)
695 g_print(" --> %ld lines in this file contain unpunctuated endquotes. "
696 "Not reporting them.\n",results->endquote_count);
699 * If more than 10 lines contain standalone digits,
700 * don't bother reporting them.
703 if (results->standalone_digit>10)
706 g_print(" --> %ld lines in this file contain standalone 0s and 1s. "
707 "Not reporting them.\n",results->standalone_digit);
710 * If more than 20 lines contain hyphens at end,
711 * don't bother reporting them.
714 if (results->hyphens>20)
717 g_print(" --> %ld lines in this file have hyphens at end. "
718 "Not reporting them.\n",results->hyphens);
720 if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
722 g_print(" --> Looks like this is HTML. Switching HTML mode ON.\n");
723 pswit[MARKUP_SWITCH]=1;
725 if (results->verylongline>0)
726 g_print(" --> %ld lines in this file are VERY long!\n",
727 results->verylongline);
729 * If there are more non-PG spaced dashes than PG em-dashes,
730 * assume it's deliberate.
731 * Current PG guidelines say don't use them, but older texts do,
732 * and some people insist on them whatever the guidelines say.
735 if (results->spacedash+results->emdash.non_PG_space>
736 results->emdash.PG_space)
739 g_print(" --> There are %ld spaced dashes and em-dashes. "
740 "Not reporting them.\n",
741 results->spacedash+results->emdash.non_PG_space);
743 /* If more than a quarter of characters are hi-bit, bug out. */
745 if (results->binlen*4>results->totlen)
747 g_print(" --> This file does not appear to be ASCII. "
748 "Terminating. Best of luck with it!\n");
751 if (results->alphalen*4<results->totlen)
753 g_print(" --> This file does not appear to be text. "
754 "Terminating. Best of luck with it!\n");
757 if (results->binlen*100>results->totlen || results->binlen>100)
759 g_print(" --> There are a lot of foreign letters here. "
760 "Not reporting them.\n");
763 warnings.isDutch=FALSE;
764 if (results->Dutchcount>50)
766 warnings.isDutch=TRUE;
767 g_print(" --> This looks like Dutch - "
768 "switching off dashes and warnings for 's Middags case.\n");
770 warnings.isFrench=FALSE;
771 if (results->Frenchcount>50)
773 warnings.isFrench=TRUE;
774 g_print(" --> This looks like French - "
775 "switching off some doublepunct.\n");
777 if (results->firstline && results->footerline)
778 g_print(" The PG header and footer appear to be already on.\n");
781 if (results->firstline)
782 g_print(" The PG header is on - no footer.\n");
783 if (results->footerline)
784 g_print(" The PG footer is on - no header.\n");
787 if (pswit[VERBOSE_SWITCH])
790 warnings.shortline=1;
799 g_print(" *** Verbose output is ON -- you asked for it! ***\n");
801 if (warnings.isDutch)
803 if (results->footerline>0 && results->firstline>0 &&
804 results->footerline>results->firstline &&
805 results->footerline-results->firstline<100)
807 g_print(" --> I don't really know where this text starts. \n");
808 g_print(" There are no reference points.\n");
809 g_print(" I'm going to have to report the header and footer "
811 results->firstline=0;
819 * Look along the line, accumulate the count of quotes, and see
820 * if this is an empty line - i.e. a line with nothing on it
822 * If line has just spaces, period, * and/or - on it, don't
823 * count it, since empty lines with asterisks or dashes to
824 * separate sections are common.
826 * Returns: TRUE if the line is empty.
828 gboolean analyse_quotes(const char *aline,struct counters *counters) /* walk aline one UTF-8 character at a time, feeding quote, underscore and bracket characters into <counters> */
831 /* assume the line is empty until proven otherwise */
832 gboolean isemptyline=TRUE;
833 const char *s=aline,*sprev,*snext;
836 GError *tmp_err=NULL;
839 snext=g_utf8_next_char(s);
840 c=g_utf8_get_char(s);
841 if (CHAR_IS_DQUOTE(c))
842 (void)count_quote(counters,c,QUOTE_CLASS(c),&tmp_err);
843 else if (CHAR_IS_SQUOTE(c) && pswit[SQUOTE_SWITCH])
848 * At start of line, it can only be a quotation mark.
849 * Hardcode a very common exception!
851 if (!g_str_has_prefix(snext,"tis") &&
852 !g_str_has_prefix(snext,"Tis"))
853 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
855 else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
856 g_unichar_isalpha(g_utf8_get_char(snext)))
857 /* Do nothing! it's definitely an apostrophe, not a quote */
859 /* it's outside a word - let's check it out */
860 else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||
861 g_unichar_isalpha(g_utf8_get_char(snext)))
863 /* certainly looks like a quotation mark */
864 if (!g_str_has_prefix(snext,"tis") &&
865 !g_str_has_prefix(snext,"Tis"))
866 /* hardcode a very common exception! */
868 if (g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(sprev))) /* compare whole code points: strchr() would truncate the gunichar to a char and falsely match some non-ASCII characters */
869 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
871 (void)count_quote(counters,c,OPENING_QUOTE,&tmp_err);
876 /* now - is it a quotation mark? */
877 guessquote=0; /* accumulate clues */
878 if (g_unichar_isalpha(g_utf8_get_char(sprev)))
880 /* it follows a letter - could be either */
882 if (g_utf8_get_char(sprev)=='s')
884 /* looks like a plural apostrophe */
886 if (g_utf8_get_char(snext)==CHAR_SPACE)
890 if (innermost_quote_matches(counters,c))
892 * Give it the benefit of some doubt,
893 * if a squote is already open.
899 (void)count_quote(counters,c,CLOSING_QUOTE,&tmp_err);
902 /* no adjacent letter - it must be a quote of some kind */
903 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
908 if (pswit[ECHO_SWITCH])
909 g_print("\n%s\n",aline);
910 if (!pswit[OVERVIEW_SWITCH])
911 g_print(" Line %ld column %ld - %s\n",
912 linecnt,g_utf8_pointer_to_offset(aline,s)+1,tmp_err->message);
913 g_clear_error(&tmp_err);
915 if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
917 isemptyline=FALSE; /* ignore lines like * * * as spacers */
918 if (c==CHAR_UNDERSCORE)
919 counters->c_unders++;
920 if (c==CHAR_OPEN_SBRACK)
922 if (!matching_difference(counters,COUNTER_ILLUSTRATION) &&
923 !matching_difference(counters,c) && s==aline &&
924 g_str_has_prefix(s,"[Illustration:"))
925 increment_matching(counters,COUNTER_ILLUSTRATION,TRUE);
927 increment_matching(counters,c,TRUE);
929 else if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK)
930 increment_matching(counters,c,TRUE);
931 if (c==CHAR_CLOSE_SBRACK)
933 if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) &&
934 !matching_difference(counters,c) && !*snext)
935 increment_matching(counters,COUNTER_ILLUSTRATION,FALSE);
937 increment_matching(counters,c,FALSE);
939 else if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK)
940 increment_matching(counters,c,FALSE);
948 * check_for_control_characters:
950 * Check for non-printable control characters in the line.
951 * Anything below CHAR_SPACE other than LF, CR and TAB is reported here.
952 * Characters above 127 (invalid for plain ASCII) and tabs are not
953 * flagged here; check_for_odd_characters() reports those.
955 void check_for_control_characters(const char *aline) /* report a control character (below CHAR_SPACE, other than LF, CR or TAB) found in aline */
959 for (s=aline;*s;s=g_utf8_next_char(s))
961 c=g_utf8_get_char(s);
962 if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
964 if (pswit[ECHO_SWITCH])
965 g_print("\n%s\n",aline);
966 if (!pswit[OVERVIEW_SWITCH])
967 g_print(" Line %ld column %ld - Control character %u\n",
968 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c); /* (start,pos) order: 0-based character offset of s within aline, +1 for a 1-based column; the reversed order yields a negative offset */
976 * check_for_odd_characters:
978 * Check for binary and other odd characters.
980 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
981 gboolean isemptyline)
983 /* Don't repeat multiple warnings on one line. */
984 gboolean eNon_A=FALSE,eTab=FALSE,eTilde=FALSE;
985 gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
988 for (s=aline;*s;s=g_utf8_next_char(s))
990 c=g_utf8_get_char(s);
991 if (!eNon_A && (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
993 if (pswit[ECHO_SWITCH])
994 g_print("\n%s\n",aline);
995 if (!pswit[OVERVIEW_SWITCH])
996 if (c>127 && c<160 || c>255)
997 g_print(" Line %ld column %ld - "
998 "Non-ISO-8859 character %u\n",
999 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1001 g_print(" Line %ld column %ld - "
1002 "Non-ASCII character %u\n",
1003 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1008 if (!eTab && c==CHAR_TAB)
1010 if (pswit[ECHO_SWITCH])
1011 g_print("\n%s\n",aline);
1012 if (!pswit[OVERVIEW_SWITCH])
1013 g_print(" Line %ld column %ld - Tab character?\n",
1014 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1019 if (!eTilde && c==CHAR_TILDE)
1022 * Often used by OCR software to indicate an
1023 * unrecognizable character.
1025 if (pswit[ECHO_SWITCH])
1026 g_print("\n%s\n",aline);
1027 if (!pswit[OVERVIEW_SWITCH])
1028 g_print(" Line %ld column %ld - Tilde character?\n",
1029 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1034 if (!eCarat && c==CHAR_CARAT)
1036 if (pswit[ECHO_SWITCH])
1037 g_print("\n%s\n",aline);
1038 if (!pswit[OVERVIEW_SWITCH])
1039 g_print(" Line %ld column %ld - Carat character?\n",
1040 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1045 if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
1047 if (pswit[ECHO_SWITCH])
1048 g_print("\n%s\n",aline);
1049 if (!pswit[OVERVIEW_SWITCH])
1050 g_print(" Line %ld column %ld - Forward slash?\n",
1051 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1057 * Report asterisks only in paranoid mode,
1058 * since they're often deliberate.
1060 if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
1063 if (pswit[ECHO_SWITCH])
1064 g_print("\n%s\n",aline);
1065 if (!pswit[OVERVIEW_SWITCH])
1066 g_print(" Line %ld column %ld - Asterisk?\n",
1067 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1076 * check_for_long_line:
1078 * Check for line too long.
1080 void check_for_long_line(const char *aline) /* query aline if it has more than LONGEST_PG_LINE characters */
1082 if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE) /* length is counted in UTF-8 characters, not bytes */
1084 if (pswit[ECHO_SWITCH])
1085 g_print("\n%s\n",aline);
1086 if (!pswit[OVERVIEW_SWITCH])
1087 g_print(" Line %ld column %ld - Long line %ld\n",
1088 linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1)); /* the column reported is the line length itself, i.e. the position of the last character */
1095 * check_for_short_line:
1097 * Check for line too short.
1099 * This one is a bit trickier to implement: we don't want to
1100 * flag the last line of a paragraph for being short, so we
1101 * have to wait until we know that our current line is a
1102 * "normal" line, then report the _previous_ line if it was too
1103 * short. We also don't want to report indented lines like
1104 * chapter heads or formatted quotations. We therefore keep
1105 * last->len as the length of the last line examined, and
1106 * last->blen as the length of the last but one, and try to
1107 * suppress unnecessary warnings by checking that both were of
1108 * "normal" length. We keep the first character of the last
1109 * line in last->start, and if it was a space, we assume that
1110 * the formatting is deliberate. I can't figure out a way to
1111 * distinguish something like a quoted verse left-aligned or
1112 * the header or footer of a letter from a paragraph of short
1113 * lines - maybe if I examined the whole paragraph, and if the
1114 * para has less than, say, 8 lines and if all lines are short,
1115 * then just assume it's OK? Need to look at some texts to see
1116 * how often a formula like this would get the right result.
1118 void check_for_short_line(const char *aline,const struct line_properties *last)
1120 if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
1121 last->len<SHORTEST_PG_LINE && last->blen>1 &&
1122 last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
1124 if (pswit[ECHO_SWITCH])
1125 g_print("\n%s\n",prevline);
1126 if (!pswit[OVERVIEW_SWITCH])
1127 g_print(" Line %ld column %ld - Short line %ld?\n",
1128 linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
1135 * check_for_starting_punctuation:
1137 * Look for punctuation other than full ellipses at start of line.
1139 void check_for_starting_punctuation(const char *aline)
1141 if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
1142 !g_str_has_prefix(aline,". . ."))
1144 if (pswit[ECHO_SWITCH])
1145 g_print("\n%s\n",aline);
1146 if (!pswit[OVERVIEW_SWITCH])
1147 g_print(" Line %ld column 1 - Begins with punctuation?\n",
1157 * Find the first em-dash, return a pointer to it and set <next> to the
1158 * character following the dash.
/*
 * str_emdash:
 *
 * Per the description above: returns a pointer to the first em-dash in s
 * and stores in *next the position of the character following the dash.
 *
 * NOTE(review): the searching code is not visible here. Of the four *next
 * assignments, two step one character past s2 and two step two characters
 * past s1 - presumably a one-character dash form versus a two-character
 * one; confirm against the full body.
 */
1160 char *str_emdash(const char *s,const char **next)
1168 *next=g_utf8_next_char(s2);
1173 *next=g_utf8_next_char(g_utf8_next_char(s1));
1178 *next=g_utf8_next_char(g_utf8_next_char(s1));
1183 *next=g_utf8_next_char(s2);
1189 * check_for_spaced_emdash:
1191 * Check for spaced em-dashes.
1193 * We must check _all_ occurrences of em-dashes on the line
1194 * hence the loop - even if the first dash is OK
1195 * there may be another that's wrong later on.
/*
 * check_for_spaced_emdash:
 *
 * Walk every em-dash on the line (as located by str_emdash) and query each
 * one that has a space immediately before it or immediately after it.
 * The reported column is the 1-based character position of the dash.
 */
1197 void check_for_spaced_emdash(const char *aline)
1199 const char *s,*t,*next;
/* t is the current dash; the search resumes at next, just past that dash. */
1200 for (s=aline;t=str_emdash(s,&next);s=next)
/*
 * && binds tighter than ||: (dash not at line start AND preceded by a
 * space) OR followed by a space.
 */
1202 if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
1203 g_utf8_get_char(next)==CHAR_SPACE)
1205 if (pswit[ECHO_SWITCH])
1206 g_print("\n%s\n",aline);
1207 if (!pswit[OVERVIEW_SWITCH])
1208 g_print(" Line %ld column %ld - Spaced em-dash?\n",
1209 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1217 * check_for_spaced_dash:
1219 * Check for spaced dashes.
/*
 * check_for_spaced_dash:
 *
 * Query a single dash with a space next to it. Only the first occurrence
 * of each pattern is examined (strstr is called once per pattern).
 */
1221 void check_for_spaced_dash(const char *aline)
/* Space followed by a dash: s points at the space. */
1224 if ((s=strstr(aline," -")))
/* Skip the space and the dash; a second dash there means it is not a lone dash. */
1226 if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
1228 if (pswit[ECHO_SWITCH])
1229 g_print("\n%s\n",aline);
1230 if (!pswit[OVERVIEW_SWITCH])
1231 g_print(" Line %ld column %ld - Spaced dash?\n",
1232 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/*
 * Dash followed by a space: s points at the dash.
 * NOTE(review): this else-if appears to pair with the outer search above,
 * in which case this pattern is only examined when the first pattern does
 * not occur anywhere on the line - confirm the brace structure.
 */
1237 else if ((s=strstr(aline,"- ")))
/* s==aline is tested first so g_utf8_prev_char is never applied at line start. */
1239 if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
1241 if (pswit[ECHO_SWITCH])
1242 g_print("\n%s\n",aline);
1243 if (!pswit[OVERVIEW_SWITCH])
1244 g_print(" Line %ld column %ld - Spaced dash?\n",
1245 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1253 * check_for_unmarked_paragraphs:
1255 * Check for unmarked paragraphs indicated by separate speakers.
1257 * May well be false positive:
1258 * "Bravo!" "Wonderful!" called the crowd.
1259 * but useful all the same.
/*
 * check_for_unmarked_paragraphs:
 *
 * Query a closing double quote followed by whitespace and an opening double
 * quote on the same line, which may mark a change of speaker without a
 * paragraph break. The reported column is the 1-based character position
 * of the first quote of the match.
 */
1261 void check_for_unmarked_paragraphs(const char *aline)
/*
 * NOTE(review): the two searches below show the same literal. If runs of
 * spaces inside the literals have been collapsed, one of them originally
 * used a different number of spaces - confirm before touching either.
 */
1264 s=strstr(aline,"\" \"");
1266 s=strstr(aline,"\" \"");
1269 if (pswit[ECHO_SWITCH])
1270 g_print("\n%s\n",aline);
1271 if (!pswit[OVERVIEW_SWITCH])
1272 g_print(" Line %ld column %ld - "
1273 "Query missing paragraph break?\n",
1274 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1281 * check_for_jeebies:
1283 * Check for "to he" and other easy h/b errors.
1285 * This is a very inadequate effort on the h/b problem,
1286 * but the phrase "to he" is always an error, whereas "to
1287 * be" is quite common.
1288 * Similarly, '"Quiet!", be said.' is a non-be error
1289 * "to he" is _not_ always an error!:
1290 * "Where they went to he couldn't say."
1291 * Another false positive:
1292 * What would "Cinderella" be without the . . .
1293 * and another: "If he wants to he can see for himself."
/*
 * check_for_jeebies:
 *
 * Search the line for fixed phrases that suggest an h/b confusion and
 * print one query per group: he/be, had/bad and hut/but. All searches use
 * strstr, so they are byte-exact and case-sensitive, and most patterns
 * require a space on both sides. The reported column is the 1-based
 * character position where the matched pattern starts.
 *
 * NOTE(review): each group is a run of assignments to s; the statements
 * between them are not visible here and are presumably "if (!s)" guards so
 * that the first matching pattern wins - confirm.
 */
1295 void check_for_jeebies(const char *aline)
/* Group 1: he/be. */
1298 s=strstr(aline," be could ");
1300 s=strstr(aline," be would ");
1302 s=strstr(aline," was be ");
1304 s=strstr(aline," be is ");
1306 s=strstr(aline," is be ");
1308 s=strstr(aline,"\", be ");
/*
 * NOTE(review): the next two literals are identical as shown; one may have
 * had a different number of spaces originally - confirm.
 */
1310 s=strstr(aline,"\" be ");
1312 s=strstr(aline,"\" be ");
1314 s=strstr(aline," to he ");
1317 if (pswit[ECHO_SWITCH])
1318 g_print("\n%s\n",aline);
1319 if (!pswit[OVERVIEW_SWITCH])
1320 g_print(" Line %ld column %ld - Query he/be error?\n",
1321 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Group 2: had/bad. */
1325 s=strstr(aline," the had ");
1327 s=strstr(aline," a had ");
1329 s=strstr(aline," they bad ");
1331 s=strstr(aline," she bad ");
1333 s=strstr(aline," he bad ");
1335 s=strstr(aline," you bad ");
/*
 * NOTE(review): lowercase "i" only matches if the caller passes a
 * case-folded line; with ordinary text this pattern cannot match the
 * pronoun "I" - confirm what aline contains.
 */
1337 s=strstr(aline," i bad ");
1340 if (pswit[ECHO_SWITCH])
1341 g_print("\n%s\n",aline);
1342 if (!pswit[OVERVIEW_SWITCH])
1343 g_print(" Line %ld column %ld - Query had/bad error?\n",
1344 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Group 3: hut/but, only directly after a semicolon or comma. */
1348 s=strstr(aline,"; hut ");
1350 s=strstr(aline,", hut ");
1353 if (pswit[ECHO_SWITCH])
1354 g_print("\n%s\n",aline);
1355 if (!pswit[OVERVIEW_SWITCH])
1356 g_print(" Line %ld column %ld - Query hut/but error?\n",
1357 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1364 * check_for_mta_from:
1366 * Special case - angled bracket in front of "From" placed there by an
1367 * MTA when sending an e-mail.
/*
 * check_for_mta_from:
 *
 * Query the first occurrence of ">From" anywhere on the line (the search is
 * not anchored to the start of the line). The reported column is the
 * 1-based character position of the '>'.
 */
1369 void check_for_mta_from(const char *aline)
1372 s=strstr(aline,">From");
1375 if (pswit[ECHO_SWITCH])
1376 g_print("\n%s\n",aline);
1377 if (!pswit[OVERVIEW_SWITCH])
1378 g_print(" Line %ld column %ld - "
1379 "Query angled bracket with From\n",
1380 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1387 * check_for_orphan_character:
1389 * Check for a single character line -
1390 * often an overflow from bad wrapping.
/*
 * check_for_orphan_character:
 *
 * Query a line that consists of exactly one character, except when that
 * character is a digit or one of the capitals I, V, X, L, which are taken
 * to be numerals standing alone.
 */
1392 void check_for_orphan_character(const char *aline)
1395 c=g_utf8_get_char(aline);
/* Non-empty line whose second character is already the terminating NUL. */
1396 if (c && !*g_utf8_next_char(aline))
1398 if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
1399 ; /* Nothing - ignore numerals alone on a line. */
1402 if (pswit[ECHO_SWITCH])
1403 g_print("\n%s\n",aline);
1404 if (!pswit[OVERVIEW_SWITCH])
1405 g_print(" Line %ld column 1 - Query single character line\n",
1414 * check_for_pling_scanno:
1416 * Check for I" - often should be !
/*
 * check_for_pling_scanno:
 *
 * Query the first occurrence of a space, a capital I and a double quote in
 * sequence; the I is often a misread exclamation mark.
 */
1418 void check_for_pling_scanno(const char *aline)
/* s points at the space that precedes the I. */
1421 s=strstr(aline," I\"");
1424 if (pswit[ECHO_SWITCH])
1425 g_print("\n%s\n",aline);
1426 if (!pswit[OVERVIEW_SWITCH])
/*
 * NOTE(review): unlike the neighbouring checks, the column printed here is
 * the raw zero-based offset of the space with no +1 added - confirm that
 * this is the intended column.
 */
1427 g_print(" Line %ld column %ld - Query I=exclamation mark?\n",
1428 linecnt,g_utf8_pointer_to_offset(aline,s));
1435 * check_for_extra_period:
1437 * Check for period without a capital letter. Cut-down from gutspell.
1438 * Only works when it happens on a single line.
/*
 * check_for_extra_period:
 *
 * In PARANOID mode only, examine each period-plus-space on the line. When
 * the next letter or digit after it is a lowercase letter, the word in
 * front of the period is extracted into testword and put through a series
 * of tests (abbrev[] list, leading digit, single character, Roman numeral,
 * presence of a vowel) before an "Extra period?" query is printed.
 * Words already recorded in qperiod are not queried again unless
 * VERBOSE_SWITCH is set.
 *
 * NOTE(review): the outcome of each individual test is decided on lines
 * that are not visible here; the summary above only lists which tests are
 * made, not how they combine.
 */
1440 void check_for_extra_period(const char *aline,const struct warnings *warnings)
1442 const char *s,*t,*s1,*sprev;
1447 gunichar c,nc,pc,*decomposition;
1448 if (pswit[PARANOID_SWITCH])
/* t is advanced inside the loop body; each pass resumes the search from t. */
1450 for (t=aline;t=strstr(t,". ");)
1454 t=g_utf8_next_char(t);
1455 /* start of line punctuation is handled elsewhere */
/* A period that does not directly follow a letter is not of interest. */
1458 if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
1460 t=g_utf8_next_char(t);
1463 if (warnings->isDutch)
1465 /* For Frank & Jeroen -- 's Middags case */
1466 gunichar c2,c3,c4,c5;
/*
 * NOTE(review): these look up to five characters ahead of t with no
 * visible check that the line is that long; g_utf8_offset_to_pointer does
 * not stop at the terminating NUL - confirm this cannot read past the end.
 */
1467 c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
1468 c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
1469 c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
1470 c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
1471 if (CHAR_IS_APOSTROPHE(c2) &&
1472 g_unichar_islower(c3) && c4==CHAR_SPACE &&
1473 g_unichar_isupper(c5))
1475 t=g_utf8_next_char(t);
/* Skip forward from two characters past t to the next letter or digit. */
1479 s1=g_utf8_next_char(g_utf8_next_char(t));
1480 while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
1481 !g_unichar_isdigit(g_utf8_get_char(s1)))
1482 s1=g_utf8_next_char(s1);
1483 if (g_unichar_islower(g_utf8_get_char(s1)))
1485 /* we have something to investigate */
1487 /* so let's go back and find out */
/* c is the character before t, pc the one before that, nc the one at t. */
1488 nc=g_utf8_get_char(t);
1489 s1=g_utf8_prev_char(t);
1490 c=g_utf8_get_char(s1);
1491 sprev=g_utf8_prev_char(s1);
1492 pc=g_utf8_get_char(sprev);
/* Word characters: letters, digits, or an apostrophe between two letters. */
1494 (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||
1495 g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&
1496 g_unichar_isalpha(nc)))
1501 sprev=g_utf8_prev_char(s1);
1502 pc=g_utf8_get_char(sprev);
1504 s1=g_utf8_next_char(s1);
/* testword is a freshly allocated copy of the word starting at s1. */
1507 testword=g_strndup(s1,s-s1);
1509 testword=g_strdup(s1);
/* abbrev[] is terminated by an empty string. */
1510 for (i=0;*abbrev[i];i++)
1511 if (!strcmp(testword,abbrev[i]))
1513 if (g_unichar_isdigit(g_utf8_get_char(testword)))
1515 if (!*g_utf8_next_char(testword))
1517 if (isroman(testword))
1522 for (s=testword;*s;s=g_utf8_next_char(s))
/* Compare the first code point of the canonical decomposition with a-e-i-o-u. */
1524 decomposition=g_unicode_canonical_decomposition(
1525 g_utf8_get_char(s),&len);
1526 if (g_utf8_strchr("aeiou",-1,decomposition[0]))
1528 g_free(decomposition);
1532 (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
/* Remember the word; the tree stores its own copy of the key. */
1534 g_tree_insert(qperiod,g_strdup(testword),
1535 GINT_TO_POINTER(1));
1536 if (pswit[ECHO_SWITCH])
1537 g_print("\n%s\n",aline);
1538 if (!pswit[OVERVIEW_SWITCH])
1539 g_print(" Line %ld column %ld - Extra period?\n",
1540 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1546 t=g_utf8_next_char(t);
1552 * check_for_following_punctuation:
1554 * Check for words usually not followed by punctuation.
/*
 * check_for_following_punctuation:
 *
 * When TYPO_SWITCH is set, lowercase a word (inword) and compare it with
 * two string tables, each terminated by an empty string:
 *   nocomma[]  - query if the character at s is , ; or :
 *   noperiod[] - query if the character at s is . or !
 * The reported column is the 1-based character position of s.
 *
 * NOTE(review): how words are extracted and where s is left relative to
 * the word is decided on lines not visible here.
 */
1556 void check_for_following_punctuation(const char *aline)
1559 const char *s,*wordstart;
1562 if (pswit[TYPO_SWITCH])
1573 inword=g_utf8_strdown(t,-1);
1575 for (i=0;*nocomma[i];i++)
1576 if (!strcmp(inword,nocomma[i]))
1578 c=g_utf8_get_char(s);
1579 if (c==',' || c==';' || c==':')
1581 if (pswit[ECHO_SWITCH])
1582 g_print("\n%s\n",aline);
1583 if (!pswit[OVERVIEW_SWITCH])
1584 g_print(" Line %ld column %ld - "
1585 "Query punctuation after %s?\n",
1586 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
1592 for (i=0;*noperiod[i];i++)
1593 if (!strcmp(inword,noperiod[i]))
1595 c=g_utf8_get_char(s);
1596 if (c=='.' || c=='!')
1598 if (pswit[ECHO_SWITCH])
1599 g_print("\n%s\n",aline);
1600 if (!pswit[OVERVIEW_SWITCH])
1601 g_print(" Line %ld column %ld - "
1602 "Query punctuation after %s?\n",
1603 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
1617 * Check for commonly mistyped words,
1618 * and digits like 0 for O in a word.
/*
 * check_for_typos:
 *
 * Take the line a word at a time (getaword) and run each word through:
 *   - mixdigit(): query a digit inside a word;
 *   - with TYPO_SWITCH or USERTYPO_SWITCH: a scan for an uppercase letter
 *     in mid-word, then a case-folded copy (testword) for the list tests;
 *   - with TYPO_SWITCH: nostart[]/noend[] prefixes and suffixes, a set of
 *     unlikely letter sequences, a vowel/consonant check, the okword[]
 *     exclusions, the Roman-numeral exclusion, the typo[] list and the
 *     single-letter special cases;
 *   - the user's own list (usertypo), if one is loaded;
 *   - in PARANOID mode with warnings->digit set: a standalone 0 or 1.
 * Queried words are counted in the qword tree so repeats can be reported
 * once.
 *
 * NOTE(review): the statements that set istypo and the loop framing are on
 * lines not visible here; the list above names the tests, not their exact
 * interaction.
 */
1620 void check_for_typos(const char *aline,struct warnings *warnings)
1622 const char *s,*t,*nt,*wordstart;
1624 gunichar *decomposition;
1626 int i,vowel,consonant,*dupcnt;
1627 gboolean isdup,istypo,alower;
1630 gsize decomposition_len;
1634 inword=getaword(&s);
1638 continue; /* don't bother with empty lines */
1640 if (mixdigit(inword))
1642 if (pswit[ECHO_SWITCH])
1643 g_print("\n%s\n",aline);
1644 if (!pswit[OVERVIEW_SWITCH])
1645 g_print(" Line %ld column %ld - Query digit in %s\n",
1646 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
1651 * Put the word through a series of tests for likely typos and OCR
1654 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
1658 for (t=inword;*t;t=g_utf8_next_char(t))
1660 c=g_utf8_get_char(t);
1661 nt=g_utf8_next_char(t);
1662 /* lowercase for testing */
1663 if (g_unichar_islower(c))
/* alower: a lowercase letter has already been seen earlier in this word. */
1665 if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
1668 * We have an uppercase mid-word. However, there are
1670 * Mac and Mc like McGill
1671 * French contractions like l'Abbe
1673 offset=g_utf8_pointer_to_offset(inword,t);
1675 pc=g_utf8_get_char(g_utf8_prev_char(t));
/*
 * NOTE(review): c and nt are compared with lowercase letters here;
 * presumably they are reassigned on lines not visible - confirm.
 */
1678 if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
1679 offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
1680 g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
1681 CHAR_IS_APOSTROPHE(pc))
1687 testword=g_utf8_casefold(inword,-1);
1689 if (pswit[TYPO_SWITCH])
1692 * Check for certain unlikely two-letter combinations at word
1695 len=g_utf8_strlen(testword,-1);
/* nostart[] and noend[] are terminated by an empty string. */
1698 for (i=0;*nostart[i];i++)
1699 if (g_str_has_prefix(testword,nostart[i]))
1701 for (i=0;*noend[i];i++)
1702 if (g_str_has_suffix(testword,noend[i]))
1705 /* ght is common, gbt never. Like that. */
1706 if (strstr(testword,"cb"))
1708 if (strstr(testword,"gbt"))
1710 if (strstr(testword,"pbt"))
1712 if (strstr(testword,"tbs"))
1714 if (strstr(testword,"mrn"))
1716 if (strstr(testword,"ahle"))
1718 if (strstr(testword,"ihle"))
1721 * "TBE" does happen - like HEARTBEAT - but uncommon.
1722 * Also "TBI" - frostbite, outbid - but uncommon.
1723 * Similarly "ii" like Hawaii, or Pompeii, and in Roman
1724 * numerals, but "ii" is a common scanno.
1726 if (strstr(testword,"tbi"))
1728 if (strstr(testword,"tbe"))
1730 if (strstr(testword,"ii"))
1733 * Check for no vowels or no consonants.
1734 * If none, flag a typo.
1736 if (!istypo && len>1)
1739 for (t=testword;*t;t=g_utf8_next_char(t))
1741 c=g_utf8_get_char(t);
/* Accented letters are classified by the first code point of their decomposition. */
1743 g_unicode_canonical_decomposition(c,&decomposition_len);
1744 if (c=='y' || g_unichar_isdigit(c))
1746 /* Yah, this is loose. */
1750 else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
1754 g_free(decomposition);
1756 if (!vowel || !consonant)
1760 * Now exclude the word from being reported if it's in
1763 for (i=0;*okword[i];i++)
1764 if (!strcmp(testword,okword[i]))
1767 * What looks like a typo may be a Roman numeral.
1770 if (istypo && isroman(testword))
1772 /* Check the manual list of typos. */
1774 for (i=0;*typo[i];i++)
1775 if (!strcmp(testword,typo[i]))
1778 * Check lowercase s, l, i and m - special cases.
1779 * "j" - often a semi-colon gone wrong.
1780 * "d" for a missing apostrophe - he d
/* Tested against inword, so only the lowercase forms in the original text match. */
1783 if (!istypo && len==1 &&
1784 g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
/* qword maps a case-folded word to a heap-allocated int counter. */
1788 dupcnt=g_tree_lookup(qword,testword);
1792 isdup=!pswit[VERBOSE_SWITCH];
1796 dupcnt=g_new0(int,1);
1797 g_tree_insert(qword,g_strdup(testword),dupcnt);
1802 if (pswit[ECHO_SWITCH])
1803 g_print("\n%s\n",aline);
1804 if (!pswit[OVERVIEW_SWITCH])
1806 g_print(" Line %ld column %ld - Query word %s",
1807 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
1809 if (!pswit[VERBOSE_SWITCH])
1810 g_print(" - not reporting duplicates");
1818 /* check the user's list of typos */
1819 if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
1821 if (pswit[ECHO_SWITCH])
1822 g_print("\n%s\n",aline);
1823 if (!pswit[OVERVIEW_SWITCH])
/*
 * NOTE(review): this column adds 2 to the word offset where the queries
 * above add 1 - confirm which is intended.
 */
1824 g_print(" Line %ld column %ld - Query possible scanno %s\n",
1825 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
1827 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
1829 if (pswit[PARANOID_SWITCH] && warnings->digit)
1831 /* In paranoid mode, query all 0 and 1 standing alone. */
1832 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
1834 if (pswit[ECHO_SWITCH])
1835 g_print("\n%s\n",aline);
1836 if (!pswit[OVERVIEW_SWITCH])
/* NOTE(review): +2 again here, as in the user-typo query - confirm. */
1837 g_print(" Line %ld column %ld - Query standalone %s\n",
1838 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
1849 * check_for_misspaced_punctuation:
1851 * Look for added or missing spaces around punctuation and quotes.
1852 * If there is a punctuation character like ! with no space on
1853 * either side, suspect a missing!space. If there are spaces on
1854 * both sides , assume a typo. If we see a double quote with no
1855 * space or punctuation on either side of it, assume unspaced
1856 * quotes "like"this.
/*
 * check_for_misspaced_punctuation:
 *
 * Several independent passes over the line, each with a sliding window of
 * previous/current/next character (pc, c, nc):
 *   1. punctuation with no space where one is expected ("Missing space?")
 *      or with a space before and a space or end of line after
 *      ("Spaced punctuation?"), with allowances for acronyms and ellipses;
 *   2. ? ! , ; : preceded by a space and not followed by one;
 *   3. a period preceded by a space and followed by a letter;
 *   4. a double quote jammed between other characters ("Unspaced quotes?");
 *   5. double-quote parity (parities->dquote) to decide whether a quote is
 *      opening or closing, then a check of the character after it;
 *   6. a double quote as first character followed by closing punctuation
 *      or a space;
 *   7. with SQUOTE_SWITCH: the same parity-based check for single quotes
 *      (parities->squote).
 * isemptyline suppresses the "Spaced punctuation?" queries of passes 1
 * and 2.
 *
 * NOTE(review): the statements that shift pc/c inside each loop and the
 * bodies that set isacro/isellipsis are on lines not visible here.
 */
1858 void check_for_misspaced_punctuation(const char *aline,
1859 struct parities *parities,gboolean isemptyline)
1861 gboolean isacro,isellipsis;
1863 gunichar c,nc,pc,n2c;
/* Pass 1: missing or extra space around . ? ! , ; : _ */
1865 c=g_utf8_get_char(aline);
1866 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1867 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1871 nc=g_utf8_get_char(g_utf8_next_char(s));
1872 /* For each character in the line after the first. */
1873 if (g_utf8_strchr(".?!,;:_",-1,c)) /* if it's punctuation */
1875 /* we need to suppress warnings for acronyms like M.D. */
1877 /* we need to suppress warnings for ellipsis . . . */
1880 * If there are letters on both sides of it or
1881 * if it's strict punctuation followed by an alpha.
1883 if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
1884 g_utf8_strchr("?!,;:",-1,c)))
/* Only look two characters back when s is far enough into the line. */
1888 if (g_utf8_pointer_to_offset(aline,s)>2 &&
1889 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
1891 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
1897 if (pswit[ECHO_SWITCH])
1898 g_print("\n%s\n",aline);
1899 if (!pswit[OVERVIEW_SWITCH])
1900 g_print(" Line %ld column %ld - Missing space?\n",
1901 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1906 if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
1909 * If there are spaces on both sides,
1910 * or space before and end of line.
1914 if (g_utf8_pointer_to_offset(aline,s)>2 &&
1915 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
1917 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
1921 if (!isemptyline && !isellipsis)
1923 if (pswit[ECHO_SWITCH])
1924 g_print("\n%s\n",aline);
1925 if (!pswit[OVERVIEW_SWITCH])
1926 g_print(" Line %ld column %ld - "
1927 "Spaced punctuation?\n",linecnt,
1928 g_utf8_pointer_to_offset(aline,s)+1);
/* Pass 2. */
1935 /* Split out the characters that CANNOT be preceded by space. */
1936 c=g_utf8_get_char(aline);
1937 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1938 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1942 nc=g_utf8_get_char(g_utf8_next_char(s));
1943 /* for each character in the line after the first */
1944 if (g_utf8_strchr("?!,;:",-1,c))
1946 /* if it's punctuation that _cannot_ have a space before it */
1947 if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
1950 * If nc DOES == space,
1951 * it was already reported just above.
1953 if (pswit[ECHO_SWITCH])
1954 g_print("\n%s\n",aline);
1955 if (!pswit[OVERVIEW_SWITCH])
1956 g_print(" Line %ld column %ld - Spaced punctuation?\n",
1957 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Pass 3. */
1964 * Special case " .X" where X is any alpha.
1965 * This plugs a hole in the acronym code above.
1966 * Inelegant, but maintainable.
1968 c=g_utf8_get_char(aline);
1969 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1970 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1974 nc=g_utf8_get_char(g_utf8_next_char(s));
1975 /* for each character in the line after the first */
1978 /* if it's a period */
1979 if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
1982 * If the period follows a space and
1983 * is followed by a letter.
1985 if (pswit[ECHO_SWITCH])
1986 g_print("\n%s\n",aline);
1987 if (!pswit[OVERVIEW_SWITCH])
1988 g_print(" Line %ld column %ld - Spaced punctuation?\n",
1989 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Pass 4: double quotes with no separating character on either side. */
1995 c=g_utf8_get_char(aline);
1996 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1997 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2001 nc=g_utf8_get_char(g_utf8_next_char(s));
2002 /* for each character in the line after the first */
2003 if (CHAR_IS_DQUOTE(c))
/*
 * && binds tighter than ||: (no separator before AND no separator after
 * AND not end of line) OR (no opening-context character before AND a
 * letter after).
 */
2005 if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
2006 !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
2007 !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
2009 if (pswit[ECHO_SWITCH])
2010 g_print("\n%s\n",aline);
2011 if (!pswit[OVERVIEW_SWITCH])
2012 g_print(" Line %ld column %ld - Unspaced quotes?\n",
2013 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Pass 5. */
2019 /* Check parity of quotes. */
2020 nc=g_utf8_get_char(aline);
2021 for (s=aline;*s;s=g_utf8_next_char(s))
2024 nc=g_utf8_get_char(g_utf8_next_char(s));
2025 if (CHAR_IS_DQUOTE(c))
/* Each double quote seen flips the stored parity. */
2029 parities->dquote=!parities->dquote;
2030 parity=parities->dquote;
2032 else if (c==CHAR_LD_QUOTE)
/* First form: the following character must be closing punctuation or a space. */
2039 if (!g_utf8_strchr("_-.'`‘’/,;:!?)]} ",-1,nc))
2041 if (pswit[ECHO_SWITCH])
2042 g_print("\n%s\n",aline);
2043 if (!pswit[OVERVIEW_SWITCH])
2044 g_print(" Line %ld column %ld - "
2045 "Wrongspaced quotes?\n",
2046 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Second form: the following character must start a word, or be one of the listed openers. */
2054 if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
2055 !g_utf8_strchr("_-/.'`‘’([{$",-1,nc) || !nc)
2057 if (pswit[ECHO_SWITCH])
2058 g_print("\n%s\n",aline);
2059 if (!pswit[OVERVIEW_SWITCH])
2060 g_print(" Line %ld column %ld - "
2061 "Wrongspaced quotes?\n",
2062 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Pass 6: double quote in column 1. */
2069 c=g_utf8_get_char(aline);
2070 if (CHAR_IS_DQUOTE(c))
2072 if (g_utf8_strchr(",;:!?)]} ",-1,
2073 g_utf8_get_char(g_utf8_next_char(aline))))
2075 if (pswit[ECHO_SWITCH])
2076 g_print("\n%s\n",aline);
2077 if (!pswit[OVERVIEW_SWITCH])
2078 g_print(" Line %ld column 1 - Wrongspaced quotes?\n",
/* Pass 7: single quotes, only on request. */
2084 if (pswit[SQUOTE_SWITCH])
2086 nc=g_utf8_get_char(aline);
2087 for (s=aline;*s;s=g_utf8_next_char(s))
2090 nc=g_utf8_get_char(g_utf8_next_char(s));
/*
 * A single quote counts as a quotation mark (not an apostrophe) when it is
 * at line start, or not preceded by a letter, or not followed by a letter.
 */
2091 if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&
2092 !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
2093 !g_unichar_isalpha(nc)))
2095 parities->squote=!parities->squote;
2096 if (!parities->squote)
2099 if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
2101 if (pswit[ECHO_SWITCH])
2102 g_print("\n%s\n",aline);
2103 if (!pswit[OVERVIEW_SWITCH])
2104 g_print(" Line %ld column %ld - "
2105 "Wrongspaced singlequotes?\n",
2106 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2114 if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
2115 !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
2117 if (pswit[ECHO_SWITCH])
2118 g_print("\n%s\n",aline);
2119 if (!pswit[OVERVIEW_SWITCH])
2120 g_print(" Line %ld column %ld - "
2121 "Wrongspaced singlequotes?\n",
2122 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2135 * Given a position p within a string str, determine whether it follows the
2136 * given word. This is roughly equivalent to the regular expression (?<=\bword)
2137 * but has different boundary conditions.
2139 static gboolean str_follows_word(const char *str,const char *p,const char *word)
2141 int len=strlen(word);
2144 else if (!g_str_has_prefix(p-len,word))
2146 else if (p-len==str)
2149 /* Using non-alpha as a word boundary. See UAX #29 for a better way. */
2150 return !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(p-len)));
2154 * check_for_double_punctuation:
2156 * Look for double punctuation like ,. or ,,
2157 * Thanks to DW for the suggestion!
2158 * In books with references, ".," and ".;" are common
2159 * e.g. "etc., etc.," and vol. 1.; vol 3.;
2160 * OTOH, from my initial tests, there are also fairly
2161 * common errors. What to do? Make these cases paranoid?
2162 * ".," is the most common, so warnings->dotcomma is used
2163 * to suppress detailed reporting if it occurs often.
2164 * Indeed, ".," is so common after "etc" or "&c" that
2165 * we don't warn on these cases at all.
/*
 * check_for_double_punctuation:
 *
 * Query two adjacent characters that are both from the set . ? ! , ; :
 * with these visible exceptions:
 *   - for French texts (warnings->isFrench), an ellipsis next to one of
 *     , ; : ! ? in either order;
 *   - a doubled . ? or !;
 *   - a period followed by a comma when warnings->dotcomma is off, or when
 *     it directly follows "etc" or "&c".
 * The reported column is the 1-based character position of the first of
 * the two characters.
 */
2167 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
2172 nc=g_utf8_get_char(aline);
2173 for (s=aline;*s;s=g_utf8_next_char(s))
2176 nc=g_utf8_get_char(g_utf8_next_char(s));
2177 /* for each punctuation character in the line */
2178 if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
2179 g_utf8_strchr(".?!,;:",-1,nc))
2181 /* followed by punctuation, it's a query, unless . . . */
2183 if (warnings->isFrench &&
2184 (g_str_has_prefix(s,",...") || g_str_has_prefix(s,"...,") ||
2185 g_str_has_prefix(s,";...") || g_str_has_prefix(s,"...;") ||
2186 g_str_has_prefix(s,":...") || g_str_has_prefix(s,"...:") ||
2187 g_str_has_prefix(s,"!...") || g_str_has_prefix(s,"...!") ||
2188 g_str_has_prefix(s,"?...") || g_str_has_prefix(s,"...?")))
/* NOTE(review): s is presumably advanced past the matched run before nc is reloaded - confirm. */
2191 nc=g_utf8_get_char(g_utf8_next_char(s));
2194 else if (c==nc && (c=='.' || c=='?' || c=='!'))
2196 /* do nothing for .. !! and ?? which can be legit */
2199 else if (c=='.' && nc==',')
2201 if (!warnings->dotcomma || str_follows_word(aline,s,"etc") ||
2202 str_follows_word(aline,s,"&c"))
2207 if (pswit[ECHO_SWITCH])
2208 g_print("\n%s\n",aline);
2209 if (!pswit[OVERVIEW_SWITCH])
2210 g_print(" Line %ld column %ld - Double punctuation?\n",
2211 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2220 * check_for_spaced_quotes:
/*
 * check_for_spaced_quotes:
 *
 * Query every quote mark that stands with a space on both sides: first the
 * ASCII double quote, then each code point listed in single_quotes[]. For
 * the single quotes the search pattern (space, quote, space) is rebuilt in
 * a GString for each entry. The reported column is the 1-based character
 * position of the leading space of the match.
 */
2222 void check_for_spaced_quotes(const char *aline)
2226 const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,
2230 while ((t=strstr(s," \" ")))
2232 if (pswit[ECHO_SWITCH])
2233 g_print("\n%s\n",aline);
2234 if (!pswit[OVERVIEW_SWITCH])
2235 g_print(" Line %ld column %ld - Spaced doublequote?\n",
2236 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
/*
 * Resume two characters past the match start, i.e. on the trailing space,
 * so that space can serve as the leading space of the next match.
 */
2239 s=g_utf8_next_char(g_utf8_next_char(t));
2241 pattern=g_string_new(NULL);
2242 for(i=0;i<G_N_ELEMENTS(single_quotes);i++)
2244 g_string_assign(pattern," ");
2245 g_string_append_unichar(pattern,single_quotes[i]);
2246 g_string_append_c(pattern,' ');
/* NOTE(review): s is presumably reset to aline for each pattern on a line not visible - confirm. */
2248 while ((t=strstr(s,pattern->str)))
2250 if (pswit[ECHO_SWITCH])
2251 g_print("\n%s\n",aline);
2252 if (!pswit[OVERVIEW_SWITCH])
2253 g_print(" Line %ld column %ld - Spaced singlequote?\n",
2254 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2257 s=g_utf8_next_char(g_utf8_next_char(t));
/* TRUE also releases the character data held by the GString. */
2260 g_string_free(pattern,TRUE);
2264 * check_for_miscased_genative:
2266 * Check special case of 'S instead of 's at end of word.
/*
 * check_for_miscased_genative:
 *
 * Query an apostrophe that follows a lowercase letter and is followed by a
 * capital S.
 *
 * NOTE(review): pc and c are updated on lines not visible here; presumably
 * c is the character at s and pc the one before it - confirm.
 */
2268 void check_for_miscased_genative(const char *aline)
2274 c=g_utf8_get_char(aline);
2275 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2276 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2280 nc=g_utf8_get_char(g_utf8_next_char(s));
2281 if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))
2283 if (pswit[ECHO_SWITCH])
2284 g_print("\n%s\n",aline);
2285 if (!pswit[OVERVIEW_SWITCH])
/* The column is the character offset of s plus 2. */
2286 g_print(" Line %ld column %ld - Capital \"S\"?\n",
2287 linecnt,g_utf8_pointer_to_offset(aline,s)+2);
2295 * check_end_of_line:
2297 * Now check special cases - start and end of line -
2298 * for single and double quotes. Start is sometimes [sic]
2299 * but better to query it anyway.
2300 * While we're here, check for dash at end of line.
/*
 * check_end_of_line:
 *
 * For lines longer than one character:
 *   - query a double or single quote as last character when preceded by a
 *     space (column = character length of the line);
 *   - query a single quote as first character when followed by a space
 *     (column 1);
 *   - in PARANOID mode with warnings->hyphen set, query a lone hyphen as
 *     the last character before any trailing spaces/control characters.
 */
2302 void check_end_of_line(const char *aline,struct warnings *warnings)
/* lbytes is the byte length; character counts are derived from it below. */
2307 lbytes=strlen(aline);
2308 if (g_utf8_strlen(aline,lbytes)>1)
/* s: last character of the line; c2: the character before it. */
2310 s=g_utf8_prev_char(aline+lbytes);
2311 c1=g_utf8_get_char(s);
2312 c2=g_utf8_get_char(g_utf8_prev_char(s));
2313 if ((CHAR_IS_DQUOTE(c1) || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)
2315 if (pswit[ECHO_SWITCH])
2316 g_print("\n%s\n",aline);
2317 if (!pswit[OVERVIEW_SWITCH])
2318 g_print(" Line %ld column %ld - Spaced quote?\n",linecnt,
2319 g_utf8_strlen(aline,lbytes));
2323 c1=g_utf8_get_char(aline);
2324 c2=g_utf8_get_char(g_utf8_next_char(aline));
2325 if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)
2327 if (pswit[ECHO_SWITCH])
2328 g_print("\n%s\n",aline);
2329 if (!pswit[OVERVIEW_SWITCH])
2330 g_print(" Line %ld column 1 - Spaced quote?\n",linecnt);
2335 * Dash at end of line may well be legit - paranoid mode only
2336 * and don't report em-dash at line-end.
2338 if (pswit[PARANOID_SWITCH] && warnings->hyphen)
/* Walk back over trailing characters at or below CHAR_SPACE. */
2340 for (s=g_utf8_prev_char(aline+lbytes);
2341 s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
/*
 * NOTE(review): the loop can stop with s==aline; if that first character
 * is a hyphen, g_utf8_prev_char(s) below looks before the start of the
 * line - confirm whether such a line can reach this point.
 */
2343 if (g_utf8_get_char(s)=='-' &&
2344 g_utf8_get_char(g_utf8_prev_char(s))!='-')
2346 if (pswit[ECHO_SWITCH])
2347 g_print("\n%s\n",aline);
2348 if (!pswit[OVERVIEW_SWITCH])
/* The column printed is the zero-based character offset of the hyphen (no +1). */
2349 g_print(" Line %ld column %ld - "
2350 "Hyphen at end of line?\n",
2351 linecnt,g_utf8_pointer_to_offset(aline,s));
2358 * check_for_unspaced_bracket:
2360 * Brackets are often unspaced, but shouldn't be surrounded by alpha.
2361 * If so, suspect a scanno like "a]most".
/*
 * check_for_unspaced_bracket:
 *
 * Query any bracket character ({ [ ( ) ] }) that has a letter on both
 * sides. The first and last characters of the line are never candidates.
 *
 * NOTE(review): pc and c are updated on lines not visible here.
 */
2363 void check_for_unspaced_bracket(const char *aline)
2367 c=g_utf8_get_char(aline);
2368 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2369 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2373 nc=g_utf8_get_char(g_utf8_next_char(s));
2376 /* for each bracket character in the line except 1st & last */
2377 if (g_utf8_strchr("{[()]}",-1,c) &&
2378 g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
2380 if (pswit[ECHO_SWITCH])
2381 g_print("\n%s\n",aline);
2382 if (!pswit[OVERVIEW_SWITCH])
/* The column printed is the raw character offset of s (no +1). */
2383 g_print(" Line %ld column %ld - Unspaced bracket?\n",
2384 linecnt,g_utf8_pointer_to_offset(aline,s));
2392 * check_for_unpunctuated_endquote:
/*
 * check_for_unpunctuated_endquote:
 *
 * Query a double quote classed as closing or neutral that directly follows
 * a letter, i.e. quoted speech that ends without punctuation. Characters
 * that are not double quotes get the class INVALID_QUOTE and are ignored.
 *
 * NOTE(review): pc and c are updated on lines not visible here.
 */
2394 void check_for_unpunctuated_endquote(const char *aline)
2399 c=g_utf8_get_char(aline);
2400 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2401 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2405 qc=CHAR_IS_DQUOTE(c)?QUOTE_CLASS(c):INVALID_QUOTE;
2406 nc=g_utf8_get_char(g_utf8_next_char(s));
2407 /* for each character in the line except 1st */
2408 if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) && g_unichar_isalpha(pc))
2410 if (pswit[ECHO_SWITCH])
2411 g_print("\n%s\n",aline);
2412 if (!pswit[OVERVIEW_SWITCH])
/* The column printed is the raw character offset of s (no +1). */
2413 g_print(" Line %ld column %ld - "
2414 "endquote missing punctuation?\n",
2415 linecnt,g_utf8_pointer_to_offset(aline,s));
2423 * check_for_html_tag:
2425 * Check for <HTML TAG>.
2427 * If there is a < in the line, followed at some point
2428 * by a > then we suspect HTML.
/*
 * check_for_html_tag:
 *
 * Query a possible HTML tag: the first '<' on the line together with the
 * first '>' that follows it. Only this one candidate per line is examined.
 * The text from '<' to '>' inclusive is copied and shown in the message;
 * the reported column is the 1-based character position of the '<'.
 */
2430 void check_for_html_tag(const char *aline)
2432 const char *open,*close;
2434 open=strchr(aline,'<');
2437 close=strchr(g_utf8_next_char(open),'>');
2440 if (pswit[ECHO_SWITCH])
2441 g_print("\n%s\n",aline);
2442 if (!pswit[OVERVIEW_SWITCH])
/* close-open+1 bytes: both delimiters are included in the copy. */
2444 tag=g_strndup(open,close-open+1);
2445 g_print(" Line %ld column %ld - HTML Tag? %s \n",
2446 linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
2456 * check_for_html_entity:
2458 * Check for &symbol; HTML.
2460 * If there is a & in the line, followed at
2461 * some point by a ; then we suspect HTML.
2463 void check_for_html_entity(const char *aline)
2465 const char *s,*amp,*scolon;
2467 amp=strchr(aline,'&');
2470 scolon=strchr(amp,';');
2473 for (s=amp;s<scolon;s=g_utf8_next_char(s))
2474 if (g_utf8_get_char(s)==CHAR_SPACE)
2475 break; /* Don't report "Jones & Son;" */
2478 if (pswit[ECHO_SWITCH])
2479 g_print("\n%s\n",aline);
2480 if (!pswit[OVERVIEW_SWITCH])
2482 entity=g_strndup(amp,scolon-amp+1);
2483 g_print(" Line %ld column %d - HTML symbol? %s \n",
2484 linecnt,(int)(amp-aline)+1,entity);
2495 * check_for_omitted_punctuation:
2497 * Check for omitted punctuation at end of paragraph by working back
2498 * through prevline. DW.
2499 * Need to check this only for "normal" paras.
2500 * So what is a "normal" para?
2501 * Not normal if one-liner (chapter headings, etc.)
2502 * Not normal if doesn't contain at least one locase letter
2503 * Not normal if starts with space
/*
 * check_for_omitted_punctuation:
 *
 * Called with the last line of a paragraph (prevline). If the line holds
 * at least one letter, last->blen is greater than 2, the paragraph started
 * before that line (start_para_line<linecnt-1) and the line does not begin
 * with a space or control character, the line is scanned backwards from
 * its end: trailing closing or neutral quotes are skipped, and reaching a
 * letter before any of the accepted punctuation characters produces the
 * "No punctuation at para end?" query for line linecnt-1.
 *
 * NOTE(review): the statements that end the backward scan are on lines not
 * visible here.
 */
2505 void check_for_omitted_punctuation(const char *prevline,
2506 struct line_properties *last,int start_para_line)
2508 gboolean letter_on_line=FALSE;
2511 gboolean closing_quote;
2512 for (s=prevline;*s;s=g_utf8_next_char(s))
2513 if (g_unichar_isalpha(g_utf8_get_char(s)))
2515 letter_on_line=TRUE;
2519 * This next "if" is a problem.
2520 * If we say "start_para_line <= linecnt - 1", that includes
2521 * one-line "paragraphs" like chapter heads. Lotsa false positives.
2522 * If we say "start_para_line < linecnt - 1" it doesn't, but then it
2523 * misses genuine one-line paragraphs.
2525 if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
2526 g_utf8_get_char(prevline)>CHAR_SPACE)
/* Start at the terminating NUL and step backwards. */
2528 s=prevline+strlen(prevline);
2531 s=g_utf8_prev_char(s);
2532 c=g_utf8_get_char(s);
2533 if (QUOTE_CLASS(c)==CLOSING_QUOTE || QUOTE_CLASS(c)==NEUTRAL_QUOTE)
2536 closing_quote=FALSE;
2537 } while (closing_quote && s>prevline);
2538 for (;s>prevline;s=g_utf8_prev_char(s))
2540 if (g_unichar_isalpha(g_utf8_get_char(s)))
2542 if (pswit[ECHO_SWITCH])
2543 g_print("\n%s\n",prevline);
2544 if (!pswit[OVERVIEW_SWITCH])
/* The column is the character length of prevline, i.e. its last character. */
2545 g_print(" Line %ld column %ld - "
2546 "No punctuation at para end?\n",
2547 linecnt-1,g_utf8_strlen(prevline,-1));
/* Punctuation and brackets accepted as a paragraph ending. */
2552 if (g_utf8_strchr("—-.:!([{?}])",-1,g_utf8_get_char(s)))
/*
 * report_duplicate_queries:
 *
 * Print a note saying how many times a queried word (the key) was
 * duplicated.
 *
 * NOTE(review): the (key, value, data) signature suggests a GLib tree
 * traversal callback with value pointing at the duplicate count; the lines
 * that read the count and return are not visible here - confirm.
 */
2558 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
2560 const char *word=key;
2563 g_print("\nNote: Queried word %s was duplicated %d times\n",
/*
 * print_as_windows_1252:
 *
 * Write string to stdout, converting it from UTF-8 to WINDOWS-1252 with a
 * GIConv converter that is opened on first use and kept in a static
 * variable between calls.
 *
 * NOTE(review): the guards around the first and last statements are not
 * visible here. Presumably the converter is closed and reset when string
 * is NULL, and the final fputs is the fallback used when no converter
 * could be opened - confirm.
 */
2568 void print_as_windows_1252(const char *string)
2570 gsize inbytes,outbytes;
/* (GIConv)-1 marks "no converter open". */
2572 static GIConv converter=(GIConv)-1;
2575 if (converter!=(GIConv)-1)
2576 g_iconv_close(converter);
2577 converter=(GIConv)-1;
2580 if (converter==(GIConv)-1)
2581 converter=g_iconv_open("WINDOWS-1252","UTF-8");
2582 if (converter!=(GIConv)-1)
/* The output buffer is sized to the input byte length plus one. */
2584 inbytes=outbytes=strlen(string);
2585 bp=buf=g_malloc(outbytes+1);
/*
 * NOTE(review): the return value of g_iconv is not used on this line, so a
 * character with no WINDOWS-1252 equivalent would stop the conversion
 * early without any report - confirm whether that is handled elsewhere.
 */
2586 g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
2592 fputs(string,stdout);
/*
 * print_as_utf_8:
 *
 * Write string to stdout exactly as given: no conversion is applied and no
 * newline is appended.
 */
void print_as_utf_8(const char *string)
{
    fprintf(stdout,"%s",string);
}
/*
 * procfile:
 *
 * Check one e-text file. The text is loaded with read_etext(), given a
 * first pass (first_pass()/report_first_pass()) and then fetched line by
 * line with flgets(); the individual check_* routines are run on each
 * line that is not skipped as header/footer. Output goes through g_print().
 *
 * filename: file to load; also quoted in the messages.
 *
 * NOTE(review): braces, else branches and other short statements are not
 * visible in this listing, so the comments below describe only the
 * statements that are shown.
 */
2605 void procfile(const char *filename)
2608 gchar *parastart=NULL; /* first line of current para */
2609 gchar *etext,*aline;
2612 struct first_pass_results *first_pass_results;
2613 struct warnings *warnings;
2614 struct counters counters={0};
2615 struct line_properties last={0};
2616 struct parities parities={0};
2617 struct pending pending={0};
2618 gboolean isemptyline;
2619 long start_para_line=0;
2620 gboolean isnewpara=FALSE,enddash=FALSE;
2621 last.start=CHAR_SPACE;
/* Start both line counters afresh for this file. */
2622 linecnt=checked_linecnt=0;
2623 etext=read_etext(filename,&err);
/* Report err->message; stdout is used when pswit[STDOUT_SWITCH] is set. */
2626 if (pswit[STDOUT_SWITCH])
2627 fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
/* NOTE(review): presumably the else branch of the test above. */
2629 fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
2632 g_print("\n\nFile: %s\n\n",filename);
2633 first_pass_results=first_pass(etext);
2634 warnings=report_first_pass(first_pass_results);
/*
 * Both trees compare keys with strcmp() and free keys with g_free();
 * qword also frees its values with g_free(), qperiod does not.
 */
2635 qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
2636 qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
2638 * Here we go with the main pass. Hold onto yer hat!
/* flgets() is told the number the line about to be read will have. */
2642 while ((aline=flgets(&etext_ptr,linecnt+1)))
2647 if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
2648 continue; // skip DP page separators completely
/* Lines before firstline, or after a non-zero footerline, are not checked. */
2649 if (linecnt<first_pass_results->firstline ||
2650 (first_pass_results->footerline>0 &&
2651 linecnt>first_pass_results->footerline))
/* With HEADER_SWITCH the well-known header fields are echoed. */
2653 if (pswit[HEADER_SWITCH])
2655 if (g_str_has_prefix(aline,"Title:"))
2656 g_print(" %s\n",aline);
2657 if (g_str_has_prefix(aline,"Author:"))
2658 g_print(" %s\n",aline);
2659 if (g_str_has_prefix(aline,"Release Date:"))
2660 g_print(" %s\n",aline);
2661 if (g_str_has_prefix(aline,"Edition:"))
2662 g_print(" %s\n\n",aline);
2664 continue; /* skip through the header */
2667 print_pending(aline,parastart,&pending);
/* analyse_quotes() updates counters and reports whether the line is empty. */
2668 isemptyline=analyse_quotes(aline,&counters);
2669 if (isnewpara && !isemptyline)
2671 /* This line is the start of a new paragraph. */
2672 start_para_line=linecnt;
2673 /* Capture its first line in case we want to report it later. */
2675 parastart=g_strdup(aline);
2676 memset(&parities,0,sizeof(parities)); /* restart the quote count */
/* Advance s to the first letter or digit of the line (or its end). */
2678 while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
2679 !g_unichar_isdigit(g_utf8_get_char(s)))
2680 s=g_utf8_next_char(s);
2681 if (g_unichar_islower(g_utf8_get_char(s)))
2683 /* and its first letter is lowercase */
2684 if (pswit[ECHO_SWITCH])
2685 g_print("\n%s\n",aline);
/* The column reported is the 1-based character offset of s in the line. */
2686 if (!pswit[OVERVIEW_SWITCH])
2687 g_print(" Line %ld column %ld - "
2688 "Paragraph starts with lower-case\n",
2689 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2693 isnewpara=FALSE; /* Signal the end of new para processing. */
2695 /* Check for an em-dash broken at line end. */
2696 if (enddash && g_utf8_get_char(aline)=='-')
2698 if (pswit[ECHO_SWITCH])
2699 g_print("\n%s\n",aline);
2700 if (!pswit[OVERVIEW_SWITCH])
2701 g_print(" Line %ld column 1 - Broken em-dash?\n",linecnt);
/*
 * Walk back from the end of the line over trailing spaces, leaving s on
 * the last non-space character (or on the start of the line).
 * NOTE(review): for an empty line s starts before aline and is read by
 * g_utf8_get_char() before the s>aline test is evaluated - confirm this
 * cannot read outside the buffer.
 */
2706 for (s=g_utf8_prev_char(aline+strlen(aline));
2707 g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
/* A trailing '-' is what the em-dash check above looks for on the next line. */
2709 if (s>=aline && g_utf8_get_char(s)=='-')
2711 check_for_control_characters(aline);
2713 check_for_odd_characters(aline,warnings,isemptyline);
/* Several checks are gated on flags in the warnings structure. */
2714 if (warnings->longline)
2715 check_for_long_line(aline);
2716 if (warnings->shortline)
2717 check_for_short_line(aline,&last);
/* Remember this line's length (in characters) and first character. */
2719 last.len=g_utf8_strlen(aline,-1);
2720 last.start=g_utf8_get_char(aline);
2721 check_for_starting_punctuation(aline);
2724 check_for_spaced_emdash(aline);
2725 check_for_spaced_dash(aline);
2727 check_for_unmarked_paragraphs(aline);
2728 check_for_jeebies(aline);
2729 check_for_mta_from(aline);
2730 check_for_orphan_character(aline);
2731 check_for_pling_scanno(aline);
2732 check_for_extra_period(aline,warnings);
2733 check_for_following_punctuation(aline);
2734 check_for_typos(aline,warnings);
2735 check_for_misspaced_punctuation(aline,&parities,isemptyline);
2736 check_for_double_punctuation(aline,warnings);
2737 check_for_spaced_quotes(aline);
2738 check_for_miscased_genative(aline);
2739 check_end_of_line(aline,warnings);
2740 check_for_unspaced_bracket(aline);
2741 if (warnings->endquote)
2742 check_for_unpunctuated_endquote(aline);
2743 check_for_html_tag(aline);
2744 check_for_html_entity(aline);
/* NOTE(review): presumably reached only for an empty line, i.e. at the end of a paragraph. */
2747 check_for_mismatched_quotes(&counters,&pending);
2748 counters_reset(&counters);
2749 /* let the next iteration know that it's starting a new para */
2752 check_for_omitted_punctuation(prevline,&last,start_para_line);
/* Keep a private copy of this line for use on the next iteration. */
2755 prevline=g_strdup(aline);
/* After the last line: final quote check, then emit and clear anything pending. */
2758 check_for_mismatched_quotes(&counters,&pending);
2759 print_pending(NULL,parastart,&pending);
2760 reset_pending(&pending);
2769 if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])
2770 g_tree_foreach(qword,report_duplicate_queries,NULL);
/* Release the per-file state. */
2771 g_tree_unref(qword);
2772 g_tree_unref(qperiod);
2773 counters_destroy(&counters);
/* A NULL handler puts GLib's default print handler back. */
2774 g_set_print_handler(NULL);
/* Called with NULL, presumably to make it release its cached converter. */
2775 print_as_windows_1252(NULL);
2776 if (pswit[MARKUP_SWITCH])
2783 * Get one line from the input text, checking for
2784 * the existence of exactly one CR/LF line-end per line.
2786 * Returns: a pointer to the line.
/*
 * etext: address of the caller's read position in the text; it is moved
 *        forward with g_utf8_next_char() as the line is consumed.
 * lcnt:  line number quoted in the diagnostics below.
 *
 * Line-end diagnostics are produced only under pswit[LINE_END_SWITCH].
 * Each one first echoes the text of the line when pswit[ECHO_SWITCH] is
 * set, and the message itself is suppressed by pswit[OVERVIEW_SWITCH].
 *
 * NOTE(review): the loop and switch structure around these statements is
 * not visible in this listing; the comments describe only what is shown.
 */
2788 char *flgets(char **etext,long lcnt)
2791 gboolean isCR=FALSE;
/* The line handed back starts at the current read position. */
2792 char *theline=*etext;
/* Decode the character at the read position. */
2797 c=g_utf8_get_char(*etext);
/* Distinguishes "nothing has been read for this line yet". */
2800 if (*etext==theline)
2802 else if (pswit[LINE_END_SWITCH])
2804 if (pswit[ECHO_SWITCH])
/* s is a temporary copy of the line text, theline up to eos. */
2806 s=g_strndup(theline,eos-theline);
2807 g_print("\n%s\n",s);
2810 if (!pswit[OVERVIEW_SWITCH])
2811 /* There may, or may not, have been a CR */
2812 g_print(" Line %ld - No LF?\n",lcnt);
/* Step the caller's read position past the character just examined. */
2818 *etext=g_utf8_next_char(*etext);
2819 /* either way, it's end of line */
2826 /* Error - a LF without a preceding CR */
2827 if (pswit[LINE_END_SWITCH])
2829 if (pswit[ECHO_SWITCH])
2831 s=g_strndup(theline,eos-theline);
2832 g_print("\n%s\n",s);
2835 if (!pswit[OVERVIEW_SWITCH])
2836 g_print(" Line %ld - No CR?\n",lcnt);
2847 /* Error - two successive CRs */
2848 if (pswit[LINE_END_SWITCH])
2850 if (pswit[ECHO_SWITCH])
2852 s=g_strndup(theline,eos-theline);
2853 g_print("\n%s\n",s);
2856 if (!pswit[OVERVIEW_SWITCH])
2857 g_print(" Line %ld - Two successive CRs?\n",lcnt);
/* A CR was seen and then followed by something other than a line end. */
2866 if (pswit[LINE_END_SWITCH] && isCR)
2868 if (pswit[ECHO_SWITCH])
2870 s=g_strndup(theline,eos-theline);
2871 g_print("\n%s\n",s);
/* The column is the 1-based character offset of eos within the line. */
2874 if (!pswit[OVERVIEW_SWITCH])
2875 g_print(" Line %ld column %ld - CR without LF?\n",
2876 lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
2882 eos=g_utf8_next_char(eos);
/* Strip HTML and/or DP markup from the line, in place, when requested. */
2886 if (pswit[MARKUP_SWITCH])
2887 postprocess_for_HTML(theline);
2888 if (pswit[DP_SWITCH])
2889 postprocess_for_DP(theline);
2896 * Takes a "word" as a parameter, and checks whether it
2897 * contains a mixture of alpha and digits. Generally, this is an
2898 * error, but may not be for cases like 4th or L5 12s. 3d.
2900 * Returns: TRUE iff an is error found.
2902 gboolean mixdigit(const char *checkword)
2904 gboolean wehaveadigit,wehavealetter,query;
2905 const char *s,*nondigit;
2906 wehaveadigit=wehavealetter=query=FALSE;
2907 for (s=checkword;*s;s=g_utf8_next_char(s))
2908 if (g_unichar_isalpha(g_utf8_get_char(s)))
2910 else if (g_unichar_isdigit(g_utf8_get_char(s)))
2912 if (wehaveadigit && wehavealetter)
2914 /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
2916 for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
2917 nondigit=g_utf8_next_char(nondigit))
2919 /* digits, ending in st, rd, nd, th of either case */
2920 if (!g_ascii_strcasecmp(nondigit,"st") ||
2921 !g_ascii_strcasecmp(nondigit,"rd") ||
2922 !g_ascii_strcasecmp(nondigit,"nd") ||
2923 !g_ascii_strcasecmp(nondigit,"th"))
2925 if (!g_ascii_strcasecmp(nondigit,"sts") ||
2926 !g_ascii_strcasecmp(nondigit,"rds") ||
2927 !g_ascii_strcasecmp(nondigit,"nds") ||
2928 !g_ascii_strcasecmp(nondigit,"ths"))
2930 if (!g_ascii_strcasecmp(nondigit,"stly") ||
2931 !g_ascii_strcasecmp(nondigit,"rdly") ||
2932 !g_ascii_strcasecmp(nondigit,"ndly") ||
2933 !g_ascii_strcasecmp(nondigit,"thly"))
2935 /* digits, ending in l, L, s or d */
2936 if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
2937 !strcmp(nondigit,"d"))
2940 * L at the start of a number, representing Britsh pounds, like L500.
2941 * This is cute. We know the current word is mixed digit. If the first
2942 * letter is L, there must be at least one digit following. If both
2943 * digits and letters follow, we have a genuine error, else we have a
2944 * capital L followed by digits, and we accept that as a non-error.
2946 if (g_utf8_get_char(checkword)=='L' &&
2947 !mixdigit(g_utf8_next_char(checkword)))
2956 * Extracts the first/next "word" from the line, and returns it.
2957 * A word is defined as one English word unit--or at least that's the aim.
2958 * "ptr" is advanced to the position in the line where we will start
2959 * looking for the next word.
2961 * Returns: A newly-allocated string.
2963 gchar *getaword(const char **ptr)
2968 word=g_string_new(NULL);
2969 for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
2970 !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
2971 **ptr;*ptr=g_utf8_next_char(*ptr))
2973 /* Handle exceptions for footnote markers like [1] */
2974 if (g_utf8_get_char(*ptr)=='[')
2976 g_string_append_c(word,'[');
2977 s=g_utf8_next_char(*ptr);
2978 for (;g_unichar_isdigit(g_utf8_get_char(s));s=g_utf8_next_char(s))
2979 g_string_append_unichar(word,g_utf8_get_char(s));
2980 if (g_utf8_get_char(s)==']')
2982 g_string_append_c(word,']');
2983 *ptr=g_utf8_next_char(s);
2984 return g_string_free(word,FALSE);
2987 g_string_truncate(word,0);
2991 * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
2992 * Especially yucky is the case of L1,000
2993 * This section looks for a pattern of characters including a digit
2994 * followed by a comma or period followed by one or more digits.
2995 * If found, it returns this whole pattern as a word; otherwise we discard
2996 * the results and resume our normal programming.
2999 for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
3000 g_unichar_isalpha(g_utf8_get_char(s)) ||
3001 g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
3002 g_string_append_unichar(word,g_utf8_get_char(s));
3005 for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))
3007 c=g_utf8_get_char(t);
3008 pc=g_utf8_get_char(g_utf8_prev_char(t));
3009 if ((c=='.' || c==',') && g_unichar_isdigit(pc))
3012 return g_string_free(word,FALSE);
3016 /* we didn't find a punctuated number - do the regular getword thing */
3017 g_string_truncate(word,0);
3018 c=g_utf8_get_char(*ptr);
3019 for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);
3020 *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))
3021 g_string_append_unichar(word,c);
3022 return g_string_free(word,FALSE);
3028 * Is this word a Roman Numeral?
3030 * It doesn't actually validate that the number is a valid Roman Numeral--for
3031 * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
3032 * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
3033 * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
3034 * expressions thereof, except when it came to taxes. Allow any number of M,
3035 * an optional D, an optional CM or CD, any number of optional Cs, an optional
3036 * XL or an optional XC, an optional IX or IV, an optional V and any number
3039 gboolean isroman(const char *t)
3045 while (g_utf8_get_char(t)=='m' && *t)
3047 if (g_utf8_get_char(t)=='d')
3049 if (g_str_has_prefix(t,"cm"))
3051 if (g_str_has_prefix(t,"cd"))
3053 while (g_utf8_get_char(t)=='c' && *t)
3055 if (g_str_has_prefix(t,"xl"))
3057 if (g_str_has_prefix(t,"xc"))
3059 if (g_utf8_get_char(t)=='l')
3061 while (g_utf8_get_char(t)=='x' && *t)
3063 if (g_str_has_prefix(t,"ix"))
3065 if (g_str_has_prefix(t,"iv"))
3067 if (g_utf8_get_char(t)=='v')
3069 while (g_utf8_get_char(t)=='i' && *t)
3075 * postprocess_for_DP:
3077 * Invoked with the -d switch from flgets().
3078 * It simply "removes" from the line a hard-coded set of common
3079 * DP-specific tags, so that the line passed to the main routine has
3080 * been pre-cleaned of DP markup.
3082 void postprocess_for_DP(char *theline)
3088 for (i=0;*DPmarkup[i];i++)
3089 while ((s=strstr(theline,DPmarkup[i])))
3091 t=s+strlen(DPmarkup[i]);
3092 memmove(s,t,strlen(t)+1);
3097 * postprocess_for_HTML:
3099 * Invoked with the -m switch from flgets().
3100 * It simply "removes" from the line a hard-coded set of common
3101 * HTML tags and "replaces" a hard-coded set of common HTML
3102 * entities, so that the line passed to the main routine has
3103 * been pre-cleaned of HTML.
3105 void postprocess_for_HTML(char *theline)
3107 while (losemarkup(theline))
3109 loseentities(theline);
3112 char *losemarkup(char *theline)
3116 s=strchr(theline,'<');
3117 t=s?strchr(s,'>'):NULL;
3120 for (i=0;*markup[i];i++)
3121 if (tagcomp(g_utf8_next_char(s),markup[i]))
3123 t=g_utf8_next_char(t);
3124 memmove(s,t,strlen(t)+1);
3127 /* It's an unrecognized <xxx>. */
/*
 * loseentities:
 *
 * Replace HTML entities (&name; &#NNN; &#xHHH;) in theline, in place.
 * Named entities are looked up in a tree built from HTMLentities[].
 * procfile() also calls this with NULL; the statements that destroy the
 * tree and close both converters presumably belong to that case.
 *
 * NOTE(review): many short statements are not visible in this listing;
 * the comments describe only the statements that are shown.
 */
3131 void loseentities(char *theline)
/*
 * NOTE(review): unlike the two converters below, entities is not declared
 * static here. If that is accurate, the tree is rebuilt on every call and
 * the g_tree_destroy() below can never see it - confirm.
 */
3138 GTree *entities=NULL;
3139 static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
3143 g_tree_destroy(entities);
/* Close whichever converters are open and mark them closed. */
3145 if (translit!=(GIConv)-1)
3146 g_iconv_close(translit);
3147 translit=(GIConv)-1;
3148 if (to_utf8!=(GIConv)-1)
3149 g_iconv_close(to_utf8);
/* Map entity name -> code point; the code point is stored in the value pointer. */
3157 entities=g_tree_new((GCompareFunc)strcmp);
3158 for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
3159 g_tree_insert(entities,HTMLentities[i].name,
3160 GUINT_TO_POINTER(HTMLentities[i].c));
/* translit: UTF-8 to ISO-8859-1 with transliteration; to_utf8: the way back. */
3162 if (translit==(GIConv)-1)
3163 translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
3164 if (to_utf8==(GIConv)-1)
3165 to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
/* Handle each '&' in turn; theline is moved along as the loop proceeds. */
3166 while((amp=strchr(theline,'&')))
3168 scolon=strchr(amp,';');
/* All digits between amp+2 and the ';': a decimal character reference. */
3173 if (amp+2+strspn(amp+2,"0123456789")==scolon)
3174 c=strtol(amp+2,NULL,10);
/* An 'x' followed only by hex digits up to the ';': a hexadecimal reference. */
3175 else if (amp[2]=='x' &&
3176 amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
3177 c=strtol(amp+3,NULL,16);
/* Otherwise look the name between '&' and ';' up in the tree (0 if absent). */
3181 s=g_strndup(amp+1,scolon-(amp+1));
3182 c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
/* && binds tighter than ||: this is c<128 || (c>=192 && c<=255). */
3191 if (c<128 || c>=192 && c<=255) /* An ISO-8859-1 character */
/* Write the character as UTF-8 at theline and step past it. */
3192 theline+=g_unichar_to_utf8(c,theline);
/* Other characters: transliterate to ISO-8859-1, then convert back to UTF-8. */
3196 nb=g_unichar_to_utf8(c,s);
3197 t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
3199 s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
3201 memcpy(theline,s,nb);
/* Pull the rest of the line, from just after the ';', up to theline. */
3205 memmove(theline,g_utf8_next_char(scolon),
3206 strlen(g_utf8_next_char(scolon))+1);
/* Resume searching just after this '&'. */
3209 theline=g_utf8_next_char(amp);
3213 gboolean tagcomp(const char *strin,const char *basetag)
3217 if (g_utf8_get_char(strin)=='/')
3218 t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
3220 t=g_utf8_casefold(strin,-1);
3221 s=g_utf8_casefold(basetag,-1);
3222 retval=g_str_has_prefix(t,s);
3228 void proghelp(GOptionContext *context)
3231 fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
3232 fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
3233 fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
3234 fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
3235 "For details, read the file COPYING.\n",stderr);
3236 fputs("This is Free Software; "
3237 "you may redistribute it under certain conditions (GPL);\n",stderr);
3238 fputs("read the file COPYING for details.\n\n",stderr);
3239 help=g_option_context_get_help(context,TRUE,NULL);
3242 fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
3243 fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
3244 "non-ASCII\n",stderr);
3245 fputs("characters like accented letters, "
3246 "lines longer than 75 or shorter than 55,\n",stderr);
3247 fputs("unbalanced quotes or brackets, "
3248 "a variety of badly formatted punctuation, \n",stderr);
3249 fputs("HTML tags, some likely typos. "
3250 "It is NOT a substitute for human judgement.\n",stderr);