1 /*************************************************************************/
2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com> */
5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk> */
7 /* This program is free software; you can redistribute it and/or modify */
8 /* it under the terms of the GNU General Public License as published by */
9 /* the Free Software Foundation; either version 2 of the License, or */
10 /* (at your option) any later version. */
12 /* This program is distributed in the hope that it will be useful, */
13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
15 /* GNU General Public License for more details. */
17 /* You should have received a copy of the GNU General Public License */
18 /* along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 /*************************************************************************/
30 #include "bookloupe.h"
32 #include "HTMLentities.h"
38 "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
39 "nad", "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa",
40 "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
41 "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
42 "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
43 "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
44 "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
45 "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
46 "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
47 "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
48 "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
49 "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
50 "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
51 "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
52 "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
53 "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
54 "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
55 "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
56 "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
57 "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
58 "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
59 "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
60 "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
61 "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
62 "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
63 "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
64 "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
65 "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
66 "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
72 /* Common abbreviations and other OK words not to query as typos. */
74 "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
75 "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
76 "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
77 "outbid", "outbids", "frostbite", "frostbitten", ""
80 /* Common abbreviations that cause otherwise unexplained periods. */
82 "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
83 "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
87 * Two-Letter combinations that rarely if ever start words,
88 * but are common scannos or otherwise common letter combinations.
91 "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
95 * Two-Letter combinations that rarely if ever end words,
96 * but are common scannos or otherwise common letter combinations.
99 "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
100 "sw", "gr", "sl", "cl", "iy", ""
104 "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
105 "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
106 "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
107 "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
111 "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
115 "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
116 "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
117 "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
118 "during", "let", "toward", "among", ""
122 "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
123 "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
124 "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
125 "among", "those", "into", "whom", "having", "thence", ""
128 gboolean pswit[SWITNO]; /* program switches */
130 static GOptionEntry options[]={
131 { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
132 "Ignore DP-specific markup", NULL },
133 { "noecho", 'e', 0, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
134 "Don't echo queried line", NULL },
135 { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
136 "Check single quotes", NULL },
137 { "typo", 't', 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
138 "Check common typos", NULL },
139 { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
140 "Require closure of quotes on every paragraph", NULL },
141 { "relaxed", 'x', 0, G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
142 "Disable paranoid querying of everything", NULL },
143 { "line-end", 'l', 0, G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
144 "Disable line end checking", NULL },
145 { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
146 "Overview: just show counts", NULL },
147 { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
148 "Output errors to stdout instead of stderr", NULL },
149 { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
150 "Echo header fields", NULL },
151 { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
152 "Ignore markup in < >", NULL },
153 { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
154 "Use file of user-defined typos", NULL },
155 { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
156 "Defaults for use on www upload", NULL },
157 { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
158 "Verbose - list everything", NULL },
162 long cnt_dquot; /* for overview mode, count of doublequote queries */
163 long cnt_squot; /* for overview mode, count of singlequote queries */
164 long cnt_brack; /* for overview mode, count of brackets queries */
165 long cnt_bin; /* for overview mode, count of non-ASCII queries */
166 long cnt_odd; /* for overview mode, count of odd character queries */
167 long cnt_long; /* for overview mode, count of long line errors */
168 long cnt_short; /* for overview mode, count of short line queries */
169 long cnt_punct; /* for overview mode,
170 count of punctuation and spacing queries */
171 long cnt_dash; /* for overview mode, count of dash-related queries */
172 long cnt_word; /* for overview mode, count of word queries */
173 long cnt_html; /* for overview mode, count of html queries */
174 long cnt_lineend; /* for overview mode, count of line-end queries */
175 long cnt_spacend; /* count of lines with space at end */
176 long linecnt; /* count of total lines in the file */
177 long checked_linecnt; /* count of lines actually checked */
179 void proghelp(GOptionContext *context);
180 void procfile(const char *);
184 gboolean mixdigit(const char *);
185 gchar *getaword(const char **);
186 char *flgets(char **,long);
187 void postprocess_for_HTML(char *);
188 char *linehasmarkup(char *);
189 char *losemarkup(char *);
190 gboolean tagcomp(const char *,const char *);
191 void loseentities(char *);
192 gboolean isroman(const char *);
193 void postprocess_for_DP(char *);
194 void print_as_windows_1252(const char *string);
195 void print_as_utf_8(const char *string);
197 GTree *qword,*qperiod;
203 void parse_options(int *argc,char ***argv)
206 GOptionContext *context;
207 context=g_option_context_new(
208 "file - looks for errors in Project Gutenberg(TM) etexts");
209 g_option_context_add_main_entries(context,options,NULL);
210 if (!g_option_context_parse(context,argc,argv,&err))
212 g_printerr("Bookloupe: %s\n",err->message);
213 g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
216 /* Paranoid checking is turned OFF, not on, by its switch */
217 pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
218 if (pswit[PARANOID_SWITCH])
219 /* if running in paranoid mode, typo checks default to enabled */
220 pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
221 /* Line-end checking is turned OFF, not on, by its switch */
222 pswit[LINE_END_SWITCH]=!pswit[LINE_END_SWITCH];
223 /* Echoing is turned OFF, not on, by its switch */
224 pswit[ECHO_SWITCH]=!pswit[ECHO_SWITCH];
225 if (pswit[OVERVIEW_SWITCH])
226 /* just print summary; don't echo */
227 pswit[ECHO_SWITCH]=FALSE;
229 * Web uploads - for the moment, this is really just a placeholder
230 * until we decide what processing we really want to do on web uploads
232 if (pswit[WEB_SWITCH])
234 /* specific override for web uploads */
235 pswit[ECHO_SWITCH]=TRUE;
236 pswit[SQUOTE_SWITCH]=FALSE;
237 pswit[TYPO_SWITCH]=TRUE;
238 pswit[QPARA_SWITCH]=FALSE;
239 pswit[PARANOID_SWITCH]=TRUE;
240 pswit[LINE_END_SWITCH]=FALSE;
241 pswit[OVERVIEW_SWITCH]=FALSE;
242 pswit[STDOUT_SWITCH]=FALSE;
243 pswit[HEADER_SWITCH]=TRUE;
244 pswit[VERBOSE_SWITCH]=FALSE;
245 pswit[MARKUP_SWITCH]=FALSE;
246 pswit[USERTYPO_SWITCH]=FALSE;
247 pswit[DP_SWITCH]=FALSE;
254 g_option_context_free(context);
260 * Read in the user-defined stealth scanno list.
262 void read_user_scannos(void)
265 gchar *usertypo_file;
269 gchar *contents,*utf8,**lines;
270 usertypo_file=g_strdup("bookloupe.typ");
271 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
272 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
275 g_free(usertypo_file);
276 usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
277 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
279 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
282 g_free(usertypo_file);
283 usertypo_file=g_strdup("gutcheck.typ");
284 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
286 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
289 g_free(usertypo_file);
290 usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
291 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
293 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
295 g_free(usertypo_file);
296 g_print(" --> I couldn't find bookloupe.typ "
297 "-- proceeding without user typos.\n");
302 fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
303 g_free(usertypo_file);
307 if (g_utf8_validate(contents,len,NULL))
308 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
310 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
312 lines=g_strsplit_set(utf8,"\r\n",0);
314 usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
315 for (i=0;lines[i];i++)
316 if (*(unsigned char *)lines[i]>'!')
317 g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
326 * Read an etext returning a newly allocated string containing the file
327 * contents or NULL on error.
329 gchar *read_etext(const char *filename,GError **err)
331 GError *tmp_err=NULL;
332 gchar *contents,*utf8;
333 gsize len,bytes_read,bytes_written;
335 if (!g_file_get_contents(filename,&contents,&len,err))
337 if (g_utf8_validate(contents,len,NULL))
339 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
340 g_set_print_handler(print_as_utf_8);
342 SetConsoleOutputCP(CP_UTF8);
347 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,
348 &bytes_written,&tmp_err);
349 if (g_error_matches(tmp_err,G_CONVERT_ERROR,
350 G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
353 for(i=0;i<bytes_read;i++)
354 if (contents[i]=='\n')
359 else if (contents[i]!='\r')
361 g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
362 "Input conversion failed. Byte %d at line %d, column %d is not a "
363 "valid Windows-1252 character",
364 ((unsigned char *)contents)[bytes_read],line,col);
367 g_propagate_error(err,tmp_err);
368 g_set_print_handler(print_as_windows_1252);
370 SetConsoleOutputCP(1252);
377 void cleanup_on_exit(void)
380 SetConsoleOutputCP(saved_cp);
384 int main(int argc,char **argv)
387 atexit(cleanup_on_exit);
388 saved_cp=GetConsoleOutputCP();
390 running_from=g_path_get_dirname(argv[0]);
391 parse_options(&argc,&argv);
392 if (pswit[USERTYPO_SWITCH])
394 fprintf(stderr,"bookloupe: Check and report on an e-text\n");
396 if (pswit[OVERVIEW_SWITCH])
398 g_print(" Checked %ld lines of %ld (head+foot = %ld)\n\n",
399 checked_linecnt,linecnt,linecnt-checked_linecnt);
400 g_print(" --------------- Queries found --------------\n");
402 g_print(" Long lines: %14ld\n",cnt_long);
404 g_print(" Short lines: %14ld\n",cnt_short);
406 g_print(" Line-end problems: %14ld\n",cnt_lineend);
408 g_print(" Common typos: %14ld\n",cnt_word);
410 g_print(" Unmatched quotes: %14ld\n",cnt_dquot);
412 g_print(" Unmatched SingleQuotes: %14ld\n",cnt_squot);
414 g_print(" Unmatched brackets: %14ld\n",cnt_brack);
416 g_print(" Non-ASCII characters: %14ld\n",cnt_bin);
418 g_print(" Proofing characters: %14ld\n",cnt_odd);
420 g_print(" Punctuation & spacing queries: %14ld\n",cnt_punct);
422 g_print(" Non-standard dashes: %14ld\n",cnt_dash);
424 g_print(" Possible HTML tags: %14ld\n",cnt_html);
426 g_print(" TOTAL QUERIES %14ld\n",
427 cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+
428 cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend);
430 g_free(running_from);
432 g_tree_unref(usertypo);
439 * Run a first pass - verify that it's a valid PG
440 * file, decide whether to report some things that
441 * occur many times in the text like long or short
442 * lines, non-standard dashes, etc.
444 struct first_pass_results *first_pass(const char *etext)
/*
 * Splits etext into lines on '\n' and scans every line once, accumulating
 * statistics in the function-static `results` structure declared below.
 * NOTE(review): this listing omits some short lines of the function
 * (braces, counters, the return); comments here describe only what the
 * visible lines establish.
 */
446 gunichar laststart=CHAR_SPACE;
451 unsigned int lastlen=0,lastblen=0;
452 long spline=0,nspline=0;
/* static: zero-initialised once and kept between calls */
453 static struct first_pass_results results={0};
455 lines=g_strsplit(etext,"\n",0);
456 for (j=0;lines[j];j++)
458 lbytes=strlen(lines[j]);
/* strip trailing carriage returns; lbytes is the byte length afterwards */
459 while (lbytes>0 && lines[j][lbytes-1]=='\r')
460 lines[j][--lbytes]='\0';
/* llen is the length in characters, as opposed to lbytes (bytes) */
461 llen=g_utf8_strlen(lines[j],lbytes);
/* "*END ... SMALL PRINT ..." marker: records the line after it in spline */
463 if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
464 (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
467 g_print(" --> Duplicate header?\n");
468 spline=linecnt+1; /* first line of non-header text, that is */
/* "*** START ... PROJECT GUTENBERG" marker: records the line after it in nspline */
470 if (!strncmp(lines[j],"*** START",9) &&
471 strstr(lines[j],"PROJECT GUTENBERG"))
474 g_print(" --> Duplicate header?\n");
475 nspline=linecnt+1; /* first line of non-header text, that is */
/* footer detection is only attempted once a start marker has been seen */
477 if (spline || nspline)
/* case-insensitive match: work on a lower-cased copy of the line */
479 lc_line=g_utf8_strdown(lines[j],lbytes);
480 if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
/* "end" must come before "project gutenberg" on the line */
482 if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
484 if (results.footerline)
486 /* it's an old-form header - we can detect duplicates */
488 g_print(" --> Duplicate footer?\n");
491 results.footerline=linecnt;
497 results.firstline=spline;
499 results.firstline=nspline; /* override with new */
500 if (results.footerline)
501 continue; /* don't count the boilerplate in the footer */
502 results.totlen+=llen;
/* per-character scan of the line, one UTF-8 character at a time */
503 for (s=lines[j];*s;s=g_utf8_next_char(s))
505 if (g_utf8_get_char(s)>127)
507 if (g_unichar_isalpha(g_utf8_get_char(s)))
/* a double quote directly preceded by a letter */
/*
 * NOTE(review): isalpha() is handed a gunichar here, whereas the test two
 * lines up uses g_unichar_isalpha(). isalpha() is only defined for values
 * representable as unsigned char (or EOF), so a non-ASCII letter before
 * the quote is outside its domain - probably should be g_unichar_isalpha().
 */
509 if (s>lines[j] && g_utf8_get_char(s)==CHAR_DQUOTE &&
510 isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
511 results.endquote_count++;
/*
 * Short-line test on the previous line, using the lengths of the last and
 * last-but-one lines and the first character of the last line.
 */
513 if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
514 lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
/* last character of the line is a space or lower (white space at end) */
517 g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
519 if (strstr(lines[j],".,"))
521 /* only count asterisk lines for ignoring purposes where there is */
522 /* lower-case text on the line */
523 if (strchr(lines[j],'*'))
525 for (s=lines[j];*s;s=g_utf8_next_char(s))
526 if (g_unichar_islower(g_utf8_get_char(s)))
531 if (strchr(lines[j],'/'))
532 results.fslashline++;
/* walk back from the end of the line over trailing characters <= space */
535 for (s=g_utf8_prev_char(lines[j]+lbytes);
536 s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;
537 s=g_utf8_prev_char(s))
/* line ends in a single '-' (not "--") */
539 if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
540 g_utf8_get_char(g_utf8_prev_char(s))!='-')
543 if (llen>LONGEST_PG_LINE)
545 if (llen>WAY_TOO_LONG)
546 results.verylongline++;
547 if (strchr(lines[j],'<') && strchr(lines[j],'>'))
/* i = number of bytes from the first '<' to the first '>' inclusive */
549 i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
552 if (strstr(lines[j],"<i>"))
553 results.htmcount+=4; /* bonus marks! */
555 /* Check for spaced em-dashes */
/* the search starts at the second character, so s[-1] is always in the line */
556 if (lines[j][0] && (s=strstr(g_utf8_next_char(lines[j]),"--")))
559 if (s[-1]==CHAR_SPACE || s[2]==CHAR_SPACE)
560 results.space_emdash++;
561 if (s[-1]==CHAR_SPACE && s[2]==CHAR_SPACE)
562 /* count of em-dashes with spaces both sides */
563 results.non_PG_space_emdash++;
564 if (s[-1]!=CHAR_SPACE && s[2]!=CHAR_SPACE)
565 /* count of PG-type em-dashes with no spaces */
566 results.PG_space_emdash++;
/*
 * Language and digit heuristics on individual words.
 * NOTE(review): inword is set on lines not shown here - presumably the
 * current word of the line; confirm against the full source.
 */
571 if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))
572 results.Dutchcount++;
573 if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))
574 results.Frenchcount++;
575 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
576 results.standalone_digit++;
579 /* Check for spaced dashes */
/* first " -" on the line that is not the start of " --" */
580 if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
/* remember the first byte of this line for the next iteration's short-line test */
584 laststart=lines[j][0];
593 * Make some snap decisions based on the first pass results.
595 struct warnings *report_first_pass(struct first_pass_results *results)
597 static struct warnings warnings={0};
599 g_print(" --> %ld lines in this file have white space at end\n",
602 if (results->dotcomma>5)
605 g_print(" --> %ld lines in this file contain '.,'. "
606 "Not reporting them.\n",results->dotcomma);
609 * If more than 50 lines, or one-tenth, are short,
610 * don't bother reporting them.
612 warnings.shortline=1;
613 if (results->shortline>50 || results->shortline*10>linecnt)
615 warnings.shortline=0;
616 g_print(" --> %ld lines in this file are short. "
617 "Not reporting short lines.\n",results->shortline);
620 * If more than 50 lines, or one-tenth, are long,
621 * don't bother reporting them.
624 if (results->longline>50 || results->longline*10>linecnt)
627 g_print(" --> %ld lines in this file are long. "
628 "Not reporting long lines.\n",results->longline);
630 /* If more than 10 lines contain asterisks, don't bother reporting them. */
632 if (results->astline>10)
635 g_print(" --> %ld lines in this file contain asterisks. "
636 "Not reporting them.\n",results->astline);
639 * If more than 10 lines contain forward slashes,
640 * don't bother reporting them.
643 if (results->fslashline>10)
646 g_print(" --> %ld lines in this file contain forward slashes. "
647 "Not reporting them.\n",results->fslashline);
650 * If more than 20 lines contain unpunctuated endquotes,
651 * don't bother reporting them.
654 if (results->endquote_count>20)
657 g_print(" --> %ld lines in this file contain unpunctuated endquotes. "
658 "Not reporting them.\n",results->endquote_count);
661 * If more than 10 lines contain standalone digits,
662 * don't bother reporting them.
665 if (results->standalone_digit>10)
668 g_print(" --> %ld lines in this file contain standalone 0s and 1s. "
669 "Not reporting them.\n",results->standalone_digit);
672 * If more than 20 lines contain hyphens at end,
673 * don't bother reporting them.
676 if (results->hyphens>20)
679 g_print(" --> %ld lines in this file have hyphens at end. "
680 "Not reporting them.\n",results->hyphens);
682 if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
684 g_print(" --> Looks like this is HTML. Switching HTML mode ON.\n");
685 pswit[MARKUP_SWITCH]=1;
687 if (results->verylongline>0)
688 g_print(" --> %ld lines in this file are VERY long!\n",
689 results->verylongline);
691 * If there are more non-PG spaced dashes than PG em-dashes,
692 * assume it's deliberate.
693 * Current PG guidelines say don't use them, but older texts do,
694 * and some people insist on them whatever the guidelines say.
697 if (results->spacedash+results->non_PG_space_emdash>
698 results->PG_space_emdash)
701 g_print(" --> There are %ld spaced dashes and em-dashes. "
702 "Not reporting them.\n",
703 results->spacedash+results->non_PG_space_emdash);
705 /* If more than a quarter of characters are hi-bit, bug out. */
707 if (results->binlen*4>results->totlen)
709 g_print(" --> This file does not appear to be ASCII. "
710 "Terminating. Best of luck with it!\n");
713 if (results->alphalen*4<results->totlen)
715 g_print(" --> This file does not appear to be text. "
716 "Terminating. Best of luck with it!\n");
719 if (results->binlen*100>results->totlen || results->binlen>100)
721 g_print(" --> There are a lot of foreign letters here. "
722 "Not reporting them.\n");
725 warnings.isDutch=FALSE;
726 if (results->Dutchcount>50)
728 warnings.isDutch=TRUE;
729 g_print(" --> This looks like Dutch - "
730 "switching off dashes and warnings for 's Middags case.\n");
732 warnings.isFrench=FALSE;
733 if (results->Frenchcount>50)
735 warnings.isFrench=TRUE;
736 g_print(" --> This looks like French - "
737 "switching off some doublepunct.\n");
739 if (results->firstline && results->footerline)
740 g_print(" The PG header and footer appear to be already on.\n");
743 if (results->firstline)
744 g_print(" The PG header is on - no footer.\n");
745 if (results->footerline)
746 g_print(" The PG footer is on - no header.\n");
749 if (pswit[VERBOSE_SWITCH])
752 warnings.shortline=1;
761 g_print(" *** Verbose output is ON -- you asked for it! ***\n");
763 if (warnings.isDutch)
765 if (results->footerline>0 && results->firstline>0 &&
766 results->footerline>results->firstline &&
767 results->footerline-results->firstline<100)
769 g_print(" --> I don't really know where this text starts. \n");
770 g_print(" There are no reference points.\n");
771 g_print(" I'm going to have to report the header and footer "
773 results->firstline=0;
781 * Look along the line, accumulate the count of quotes, and see
782 * if this is an empty line - i.e. a line with nothing on it
784 * If line has just spaces, period, * and/or - on it, don't
785 * count it, since empty lines with asterisks or dashes to
786 * separate sections are common.
788 * Returns: TRUE if the line is empty.
790 gboolean analyse_quotes(const char *aline,struct counters *counters)
793 /* assume the line is empty until proven otherwise */
794 gboolean isemptyline=TRUE;
795 const char *s=aline,*sprev,*snext;
800 snext=g_utf8_next_char(s);
801 c=g_utf8_get_char(s);
804 if (CHAR_IS_SQUOTE(c))
809 * At start of line, it can only be an openquote.
810 * Hardcode a very common exception!
812 if (!g_str_has_prefix(snext,"tis") &&
813 !g_str_has_prefix(snext,"Tis"))
814 increment_matching(counters,c,TRUE);
816 else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
817 g_unichar_isalpha(g_utf8_get_char(snext)))
818 /* Do nothing! it's definitely an apostrophe, not a quote */
820 /* it's outside a word - let's check it out */
821 else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||
822 g_unichar_isalpha(g_utf8_get_char(snext)))
824 /* it damwell better BE an openquote */
825 if (!g_str_has_prefix(snext,"tis") &&
826 !g_str_has_prefix(snext,"Tis"))
827 /* hardcode a very common exception! */
828 increment_matching(counters,c,TRUE);
832 /* now - is it a closequote? */
833 guessquote=0; /* accumulate clues */
834 if (g_unichar_isalpha(g_utf8_get_char(sprev)))
836 /* it follows a letter - could be either */
838 if (g_utf8_get_char(sprev)=='s')
840 /* looks like a plural apostrophe */
842 if (g_utf8_get_char(snext)==CHAR_SPACE)
847 /* it doesn't have a letter either side */
848 else if (strchr(".?!,;:",g_utf8_get_char(sprev)) &&
849 strchr(".?!,;: ",g_utf8_get_char(snext)))
850 guessquote+=8; /* looks like a closequote */
853 if (matching_difference(counters,CHAR_SQUOTE)>0)
855 * Give it the benefit of some doubt,
856 * if a squote is already open.
862 increment_matching(counters,c,FALSE);
865 if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
867 isemptyline=FALSE; /* ignore lines like * * * as spacers */
868 if (c==CHAR_UNDERSCORE)
869 counters->c_unders++;
870 if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK || c==CHAR_OPEN_SBRACK)
871 increment_matching(counters,c,TRUE);
872 if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK ||
873 c==CHAR_CLOSE_SBRACK)
874 increment_matching(counters,c,FALSE);
882 * check_for_control_characters:
884 * Check for non-printable control characters in the line.
885 * Characters above 127 are not examined here; they are
886 * queried by check_for_odd_characters, as are tabs.
887 * LF, CR and TAB are never reported by this check.
889 void check_for_control_characters(const char *aline)
893 for (s=aline;*s;s=g_utf8_next_char(s))
895 c=g_utf8_get_char(s);
896 if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
898 if (pswit[ECHO_SWITCH])
899 g_print("\n%s\n",aline);
900 if (!pswit[OVERVIEW_SWITCH])
901 g_print(" Line %ld column %ld - Control character %u\n",
902 linecnt,g_utf8_pointer_to_offset(s,aline)+1,c);
910 * check_for_odd_characters:
912 * Check for binary and other odd characters.
914 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
915 gboolean isemptyline)
917 /* Don't repeat multiple warnings on one line. */
918 gboolean eNon_A=FALSE,eTab=FALSE,eTilde=FALSE;
919 gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
922 for (s=aline;*s;s=g_utf8_next_char(s))
924 c=g_utf8_get_char(s);
925 if (!eNon_A && (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
927 if (pswit[ECHO_SWITCH])
928 g_print("\n%s\n",aline);
929 if (!pswit[OVERVIEW_SWITCH])
930 if (c>127 && c<160 || c>255)
931 g_print(" Line %ld column %ld - "
932 "Non-ISO-8859 character %u\n",
933 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
935 g_print(" Line %ld column %ld - "
936 "Non-ASCII character %u\n",
937 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
942 if (!eTab && c==CHAR_TAB)
944 if (pswit[ECHO_SWITCH])
945 g_print("\n%s\n",aline);
946 if (!pswit[OVERVIEW_SWITCH])
947 g_print(" Line %ld column %ld - Tab character?\n",
948 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
953 if (!eTilde && c==CHAR_TILDE)
956 * Often used by OCR software to indicate an
957 * unrecognizable character.
959 if (pswit[ECHO_SWITCH])
960 g_print("\n%s\n",aline);
961 if (!pswit[OVERVIEW_SWITCH])
962 g_print(" Line %ld column %ld - Tilde character?\n",
963 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
968 if (!eCarat && c==CHAR_CARAT)
970 if (pswit[ECHO_SWITCH])
971 g_print("\n%s\n",aline);
972 if (!pswit[OVERVIEW_SWITCH])
973 g_print(" Line %ld column %ld - Carat character?\n",
974 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
979 if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
981 if (pswit[ECHO_SWITCH])
982 g_print("\n%s\n",aline);
983 if (!pswit[OVERVIEW_SWITCH])
984 g_print(" Line %ld column %ld - Forward slash?\n",
985 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
991 * Report asterisks only in paranoid mode,
992 * since they're often deliberate.
994 if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
997 if (pswit[ECHO_SWITCH])
998 g_print("\n%s\n",aline);
999 if (!pswit[OVERVIEW_SWITCH])
1000 g_print(" Line %ld column %ld - Asterisk?\n",
1001 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1010 * check_for_long_line:
1012 * Check for line too long.
1014 void check_for_long_line(const char *aline)
1016 if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
1018 if (pswit[ECHO_SWITCH])
1019 g_print("\n%s\n",aline);
1020 if (!pswit[OVERVIEW_SWITCH])
1021 g_print(" Line %ld column %ld - Long line %ld\n",
1022 linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
1029 * check_for_short_line:
1031 * Check for line too short.
1033 * This one is a bit trickier to implement: we don't want to
1034 * flag the last line of a paragraph for being short, so we
1035 * have to wait until we know that our current line is a
1036 * "normal" line, then report the _previous_ line if it was too
1037 * short. We also don't want to report indented lines like
1038 * chapter heads or formatted quotations. We therefore keep
1039 * last->len as the length of the last line examined, and
1040 * last->blen as the length of the last but one, and try to
1041 * suppress unnecessary warnings by checking that both were of
1042 * "normal" length. We keep the first character of the last
1043 * line in last->start, and if it was a space, we assume that
1044 * the formatting is deliberate. I can't figure out a way to
1045 * distinguish something like a quoted verse left-aligned or
1046 * the header or footer of a letter from a paragraph of short
1047 * lines - maybe if I examined the whole paragraph, and if the
1048 * para has less than, say, 8 lines and if all lines are short,
1049 * then just assume it's OK? Need to look at some texts to see
1050 * how often a formula like this would get the right result.
1052 void check_for_short_line(const char *aline,const struct line_properties *last)
1054 if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
1055 last->len<SHORTEST_PG_LINE && last->blen>1 &&
1056 last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
1058 if (pswit[ECHO_SWITCH])
1059 g_print("\n%s\n",prevline);
1060 if (!pswit[OVERVIEW_SWITCH])
1061 g_print(" Line %ld column %ld - Short line %ld?\n",
1062 linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
1069 * check_for_starting_punctuation:
1071 * Look for punctuation other than full ellipses at start of line.
1073 void check_for_starting_punctuation(const char *aline)
1075 if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
1076 !g_str_has_prefix(aline,". . ."))
1078 if (pswit[ECHO_SWITCH])
1079 g_print("\n%s\n",aline);
1080 if (!pswit[OVERVIEW_SWITCH])
1081 g_print(" Line %ld column 1 - Begins with punctuation?\n",
1089 * check_for_spaced_emdash:
1091 * Check for spaced em-dashes.
1093 * We must check _all_ occurrences of "--" on the line
1094 * hence the loop - even if the first double-dash is OK
1095 * there may be another that's wrong later on.
1097 void check_for_spaced_emdash(const char *aline)
1099 const char *s,*t,*next;
1100 for (s=aline;t=strstr(s,"--");s=next)
1102 next=g_utf8_next_char(g_utf8_next_char(t));
1103 if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
1104 g_utf8_get_char(next)==CHAR_SPACE)
1106 if (pswit[ECHO_SWITCH])
1107 g_print("\n%s\n",aline);
1108 if (!pswit[OVERVIEW_SWITCH])
1109 g_print(" Line %ld column %ld - Spaced em-dash?\n",
1110 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1118 * check_for_spaced_dash:
1120 * Check for spaced dashes.
1122 void check_for_spaced_dash(const char *aline)
1125 if ((s=strstr(aline," -")))
1127 if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
1129 if (pswit[ECHO_SWITCH])
1130 g_print("\n%s\n",aline);
1131 if (!pswit[OVERVIEW_SWITCH])
1132 g_print(" Line %ld column %ld - Spaced dash?\n",
1133 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1138 else if ((s=strstr(aline,"- ")))
1140 if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
1142 if (pswit[ECHO_SWITCH])
1143 g_print("\n%s\n",aline);
1144 if (!pswit[OVERVIEW_SWITCH])
1145 g_print(" Line %ld column %ld - Spaced dash?\n",
1146 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1154 * check_for_unmarked_paragraphs:
1156 * Check for unmarked paragraphs indicated by separate speakers.
1158 * May well be false positive:
1159 * "Bravo!" "Wonderful!" called the crowd.
1160 * but useful all the same.
/*
 * check_for_unmarked_paragraphs: search the line for a closing double
 * quote followed by a space and an opening double quote, and report
 * "Query missing paragraph break?" at the 1-based character column of
 * the first quote.
 *
 * NOTE(review): the two strstr calls below are textually identical in
 * this listing.  The lines between them are not shown, and whitespace in
 * the second pattern may have been collapsed - verify against the real
 * file before treating the second call as redundant.
 */
1162 void check_for_unmarked_paragraphs(const char *aline)
1165 s=strstr(aline,"\" \"");
1167 s=strstr(aline,"\" \"");
1170 if (pswit[ECHO_SWITCH])
1171 g_print("\n%s\n",aline);
1172 if (!pswit[OVERVIEW_SWITCH])
1173 g_print(" Line %ld column %ld - "
1174 "Query missing paragraph break?\n",
1175 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1182 * check_for_jeebies:
1184 * Check for "to he" and other easy h/b errors.
1186 * This is a very inadequate effort on the h/b problem,
1187 * but the phrase "to he" is always an error, whereas "to
1188 * be" is quite common.
1189 * Similarly, '"Quiet!", be said.' is a non-be error
1190 * "to he" is _not_ always an error!:
1191 * "Where they went to he couldn't say."
1192 * Another false positive:
1193 * What would "Cinderella" be without the . . .
1194 * and another: "If he wants to he can see for himself."
/*
 * check_for_jeebies: substring scan for common OCR confusions between
 * h and b.  Three groups of fixed patterns are searched with strstr, and
 * each group has its own report:
 *
 *   " be could ", " be would ", " was be ", " be is ", " is be ",
 *   quote-then-"be" forms, " to he "      -> "Query he/be error?"
 *   " the had ", " a had ", "<pronoun> bad " -> "Query had/bad error?"
 *   "; hut ", ", hut "                       -> "Query hut/but error?"
 *
 * Each report gives the 1-based character column where the pattern
 * starts.  Matching is case-sensitive on aline exactly as passed in.
 *
 * NOTE(review): the short lines between consecutive strstr calls are not
 * in this listing, so how later searches are skipped once s is set is
 * not visible.  The two "\" be " searches look identical here; they may
 * originally have differed in spacing - verify.
 */
1196 void check_for_jeebies(const char *aline)
/* Group 1: "be" where "he" was meant, or "he" where "be" was meant. */
1199 s=strstr(aline," be could ");
1201 s=strstr(aline," be would ");
1203 s=strstr(aline," was be ");
1205 s=strstr(aline," be is ");
1207 s=strstr(aline," is be ");
1209 s=strstr(aline,"\", be ");
1211 s=strstr(aline,"\" be ");
1213 s=strstr(aline,"\" be ");
1215 s=strstr(aline," to he ");
1218 if (pswit[ECHO_SWITCH])
1219 g_print("\n%s\n",aline);
1220 if (!pswit[OVERVIEW_SWITCH])
1221 g_print(" Line %ld column %ld - Query he/be error?\n",
1222 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Group 2: "had" and "bad" swapped. */
1226 s=strstr(aline," the had ");
1228 s=strstr(aline," a had ");
1230 s=strstr(aline," they bad ");
1232 s=strstr(aline," she bad ");
1234 s=strstr(aline," he bad ");
1236 s=strstr(aline," you bad ");
1238 s=strstr(aline," i bad ");
1241 if (pswit[ECHO_SWITCH])
1242 g_print("\n%s\n",aline);
1243 if (!pswit[OVERVIEW_SWITCH])
1244 g_print(" Line %ld column %ld - Query had/bad error?\n",
1245 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Group 3: "hut" after a comma or semicolon, where "but" is far more likely. */
1249 s=strstr(aline,"; hut ");
1251 s=strstr(aline,", hut ");
1254 if (pswit[ECHO_SWITCH])
1255 g_print("\n%s\n",aline);
1256 if (!pswit[OVERVIEW_SWITCH])
1257 g_print(" Line %ld column %ld - Query hut/but error?\n",
1258 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1265 * check_for_mta_from:
1267 * Special case - angled bracket in front of "From" placed there by an
1268 * MTA when sending an e-mail.
/*
 * check_for_mta_from: search the line for the literal ">From" and report
 * "Query angled bracket with From" at the 1-based character column of
 * the '>'.  The search is not anchored to the start of the line.
 */
1270 void check_for_mta_from(const char *aline)
1273 s=strstr(aline,">From");
1276 if (pswit[ECHO_SWITCH])
1277 g_print("\n%s\n",aline);
1278 if (!pswit[OVERVIEW_SWITCH])
1279 g_print(" Line %ld column %ld - "
1280 "Query angled bracket with From\n",
1281 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1288 * check_for_orphan_character:
1290 * Check for a single character line -
1291 * often an overflow from bad wrapping.
/*
 * check_for_orphan_character: handle a line that consists of exactly one
 * character (first character non-NUL and nothing after it).  The letters
 * I, V, X, L and any Unicode digit are treated as numerals standing
 * alone and ignored; the report "Query single character line" is for the
 * remaining cases.
 *
 * NOTE(review): the line joining the "ignore" arm to the report (short,
 * dropped from this listing) is presumably a plain `else`.
 */
1293 void check_for_orphan_character(const char *aline)
1296 c=g_utf8_get_char(aline);
/* True only when the line holds a single UTF-8 character. */
1297 if (c && !*g_utf8_next_char(aline))
1299 if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
1300 ; /* Nothing - ignore numerals alone on a line. */
1303 if (pswit[ECHO_SWITCH])
1304 g_print("\n%s\n",aline);
1305 if (!pswit[OVERVIEW_SWITCH])
1306 g_print(" Line %ld column 1 - Query single character line\n",
1315 * check_for_pling_scanno:
1317 * Check for I" - often should be !
/*
 * check_for_pling_scanno: search the line for a space, capital I and a
 * double quote in sequence - a frequent OCR misreading of an exclamation
 * mark before a closing quote - and report "Query I=exclamation mark?".
 *
 * NOTE(review): the column is printed as the raw character offset of the
 * match, without the +1 that most sibling checks add.  Confirm this is
 * intended before changing it; existing expected output may rely on it.
 */
1319 void check_for_pling_scanno(const char *aline)
1322 s=strstr(aline," I\"");
1325 if (pswit[ECHO_SWITCH])
1326 g_print("\n%s\n",aline);
1327 if (!pswit[OVERVIEW_SWITCH])
1328 g_print(" Line %ld column %ld - Query I=exclamation mark?\n",
1329 linecnt,g_utf8_pointer_to_offset(aline,s));
1336 * check_for_extra_period:
1338 * Check for period without a capital letter. Cut-down from gutspell.
1339 * Only works when it happens on a single line.
/*
 * check_for_extra_period: paranoid-mode check for a period that is not
 * followed by a capital letter.
 *
 * aline    - the line being checked (UTF-8).
 * warnings - first-pass results; only warnings->isDutch is read in the
 *            visible code.
 *
 * For every ". " on the line:
 *   - skip it when the character before the period is not a letter;
 *   - for Dutch texts, apply a special case for an apostrophe, a
 *     lowercase letter, a space and an uppercase letter at character
 *     offsets 2..5 from t;
 *   - find the next letter or digit after the period; only if it is a
 *     lowercase letter is the period investigated further;
 *   - walk back to isolate the word before the period (testword) and
 *     test it against abbrev[], a leading digit, single-character
 *     length, isroman() and whether it contains a vowel (using the
 *     first code point of each character's canonical decomposition);
 *   - report "Extra period?" at the 1-based column of t when
 *     VERBOSE_SWITCH is set or testword is not yet in qperiod, and
 *     record testword in qperiod.
 *
 * Change from the listed code: the scan for the next letter/digit used
 * isdigit() on a gunichar.  <ctype.h> functions are undefined for values
 * that are neither EOF nor representable as unsigned char, which any
 * non-Latin-1 code point is; g_unichar_isdigit(), already used elsewhere
 * in this function, is used instead.
 *
 * NOTE(review): short lines (braces, some declarations, the outcome of
 * each testword test, the release of testword) are missing from this
 * listing and are therefore not described.
 */
1341 void check_for_extra_period(const char *aline,const struct warnings *warnings)
1343 const char *s,*t,*s1,*sprev;
1348 gunichar c,nc,pc,*decomposition;
1349 if (pswit[PARANOID_SWITCH])
1351 for (t=aline;t=strstr(t,". ");)
1355 t=g_utf8_next_char(t);
1356 /* start of line punctuation is handled elsewhere */
1359 if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
1361 t=g_utf8_next_char(t);
1364 if (warnings->isDutch)
1366 /* For Frank & Jeroen -- 's Middags case */
1367 gunichar c2,c3,c4,c5;
1368 c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
1369 c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
1370 c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
1371 c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
1372 if (CHAR_IS_APOSTROPHE(c2) &&
1373 g_unichar_islower(c3) && c4==CHAR_SPACE &&
1374 g_unichar_isupper(c5))
1376 t=g_utf8_next_char(t);
1380 s1=g_utf8_next_char(g_utf8_next_char(t));
/* Skip forward to the first letter or digit after the period. */
1381 while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
1382 !g_unichar_isdigit(g_utf8_get_char(s1)))
1383 s1=g_utf8_next_char(s1);
1384 if (g_unichar_islower(g_utf8_get_char(s1)))
1386 /* we have something to investigate */
1388 /* so let's go back and find out */
1389 nc=g_utf8_get_char(t);
1390 s1=g_utf8_prev_char(t);
1391 c=g_utf8_get_char(s1);
1392 sprev=g_utf8_prev_char(s1);
1393 pc=g_utf8_get_char(sprev);
1395 (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||
1396 g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&
1397 g_unichar_isalpha(nc)))
1402 sprev=g_utf8_prev_char(s1);
1403 pc=g_utf8_get_char(sprev);
1405 s1=g_utf8_next_char(s1);
1408 testword=g_strndup(s1,s-s1);
1410 testword=g_strdup(s1);
1411 for (i=0;*abbrev[i];i++)
1412 if (!strcmp(testword,abbrev[i]))
1414 if (g_unichar_isdigit(g_utf8_get_char(testword)))
1416 if (!*g_utf8_next_char(testword))
1418 if (isroman(testword))
1423 for (s=testword;*s;s=g_utf8_next_char(s))
1425 decomposition=g_unicode_canonical_decomposition(
1426 g_utf8_get_char(s),&len);
1427 if (g_utf8_strchr("aeiou",-1,decomposition[0]))
1429 g_free(decomposition);
1433 (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
1435 g_tree_insert(qperiod,g_strdup(testword),
1436 GINT_TO_POINTER(1));
1437 if (pswit[ECHO_SWITCH])
1438 g_print("\n%s\n",aline);
1439 if (!pswit[OVERVIEW_SWITCH])
1440 g_print(" Line %ld column %ld - Extra period?\n",
1441 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1447 t=g_utf8_next_char(t);
1453 * check_for_following_punctuation:
1455 * Check for words usually not followed by punctuation.
/*
 * check_for_following_punctuation: only active when TYPO_SWITCH is set.
 * Each word is lowercased with g_utf8_strdown and compared against two
 * lists, both terminated by an empty string:
 *
 *   nocomma[]  - query if the next character is ',' ';' or ':'
 *   noperiod[] - query if the next character is '.' or '!'
 *
 * Both cases print "Query punctuation after %s?" with the 1-based
 * character column of the punctuation (s).
 *
 * NOTE(review): how words are extracted (the source of t and s) and the
 * final g_print argument are on lines missing from this listing.
 * g_utf8_strdown returns newly allocated memory; the matching g_free is
 * not visible here - confirm it exists.
 */
1457 void check_for_following_punctuation(const char *aline)
1460 const char *s,*wordstart;
1463 if (pswit[TYPO_SWITCH])
1474 inword=g_utf8_strdown(t,-1);
/* Words that should not be followed by a comma, semicolon or colon. */
1476 for (i=0;*nocomma[i];i++)
1477 if (!strcmp(inword,nocomma[i]))
1479 c=g_utf8_get_char(s);
1480 if (c==',' || c==';' || c==':')
1482 if (pswit[ECHO_SWITCH])
1483 g_print("\n%s\n",aline);
1484 if (!pswit[OVERVIEW_SWITCH])
1485 g_print(" Line %ld column %ld - "
1486 "Query punctuation after %s?\n",
1487 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
/* Words that should not end a sentence. */
1493 for (i=0;*noperiod[i];i++)
1494 if (!strcmp(inword,noperiod[i]))
1496 c=g_utf8_get_char(s);
1497 if (c=='.' || c=='!')
1499 if (pswit[ECHO_SWITCH])
1500 g_print("\n%s\n",aline);
1501 if (!pswit[OVERVIEW_SWITCH])
1502 g_print(" Line %ld column %ld - "
1503 "Query punctuation after %s?\n",
1504 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
1518 * Check for commonly mistyped words,
1519 * and digits like 0 for O in a word.
/*
 * check_for_typos: word-by-word scan of a line for likely typos and OCR
 * errors.  Words are pulled with getaword().  Visible steps, in order:
 *
 *  1. mixdigit(inword)            -> "Query digit in %s".
 *  2. When TYPO_SWITCH or USERTYPO_SWITCH is set, walk inword looking
 *     for an uppercase or titlecase character that follows a lowercase
 *     one.  The visible exceptions are "mc" at offset 2, "mac" at
 *     offset 3, and a preceding apostrophe.
 *  3. testword = g_utf8_casefold(inword).
 *  4. When TYPO_SWITCH is set, testword is checked against:
 *       - nostart[] prefixes and noend[] suffixes;
 *       - the substrings cb, gbt, pbt, tbs, mrn, ahle, ihle, tbi, tbe
 *         and ii;
 *       - for words longer than one character, having no vowel or no
 *         consonant ('y' and digits are handled separately; vowels are
 *         recognised from the first code point of the canonical
 *         decomposition);
 *       - okword[] (exclusions) and isroman();
 *       - the manual typo[] list;
 *       - single-character words drawn from "slmijdn".
 *     Hits are tracked per word in the qword tree and reported as
 *     "Query word %s".
 *  5. A word found in the usertypo tree is reported as
 *     "Query possible scanno %s".
 *  6. With PARANOID_SWITCH and warnings->digit, a standalone "0" or "1"
 *     is reported as "Query standalone %s".
 *
 * NOTE(review): many short lines (braces, assignments to istypo, loop
 * set-up, frees) are missing from this listing, so the result of each
 * individual test is not shown.  The "possible scanno" and "standalone"
 * reports add 2 to the word offset where the other reports add 1 -
 * confirm whether that difference is intentional.
 */
1521 void check_for_typos(const char *aline,struct warnings *warnings)
1523 const char *s,*t,*nt,*wordstart;
1525 gunichar *decomposition;
1527 int i,vowel,consonant,*dupcnt;
1528 gboolean isdup,istypo,alower;
1531 gsize decomposition_len;
1535 inword=getaword(&s);
1539 continue; /* don't bother with empty lines */
1541 if (mixdigit(inword))
1543 if (pswit[ECHO_SWITCH])
1544 g_print("\n%s\n",aline);
1545 if (!pswit[OVERVIEW_SWITCH])
1546 g_print(" Line %ld column %ld - Query digit in %s\n",
1547 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
1552 * Put the word through a series of tests for likely typos and OCR
1555 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
1559 for (t=inword;*t;t=g_utf8_next_char(t))
1561 c=g_utf8_get_char(t);
1562 nt=g_utf8_next_char(t);
1563 /* lowercase for testing */
1564 if (g_unichar_islower(c))
1566 if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
1569 * We have an uppercase mid-word. However, there are
1571 * Mac and Mc like McGill
1572 * French contractions like l'Abbe
1574 offset=g_utf8_pointer_to_offset(inword,t);
1576 pc=g_utf8_get_char(g_utf8_prev_char(t));
1579 if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
1580 offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
1581 g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
1582 CHAR_IS_APOSTROPHE(pc))
1588 testword=g_utf8_casefold(inword,-1);
1590 if (pswit[TYPO_SWITCH])
1593 * Check for certain unlikely two-letter combinations at word
1596 len=g_utf8_strlen(testword,-1);
1599 for (i=0;*nostart[i];i++)
1600 if (g_str_has_prefix(testword,nostart[i]))
1602 for (i=0;*noend[i];i++)
1603 if (g_str_has_suffix(testword,noend[i]))
1606 /* ght is common, gbt never. Like that. */
1607 if (strstr(testword,"cb"))
1609 if (strstr(testword,"gbt"))
1611 if (strstr(testword,"pbt"))
1613 if (strstr(testword,"tbs"))
1615 if (strstr(testword,"mrn"))
1617 if (strstr(testword,"ahle"))
1619 if (strstr(testword,"ihle"))
1622 * "TBE" does happen - like HEARTBEAT - but uncommon.
1623 * Also "TBI" - frostbite, outbid - but uncommon.
1624 * Similarly "ii" like Hawaii, or Pompeii, and in Roman
1625 * numerals, but "ii" is a common scanno.
1627 if (strstr(testword,"tbi"))
1629 if (strstr(testword,"tbe"))
1631 if (strstr(testword,"ii"))
1634 * Check for no vowels or no consonants.
1635 * If none, flag a typo.
1637 if (!istypo && len>1)
1640 for (t=testword;*t;t=g_utf8_next_char(t))
1642 c=g_utf8_get_char(t);
1644 g_unicode_canonical_decomposition(c,&decomposition_len);
1645 if (c=='y' || g_unichar_isdigit(c))
1647 /* Yah, this is loose. */
1651 else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
1655 g_free(decomposition);
1657 if (!vowel || !consonant)
1661 * Now exclude the word from being reported if it's in
1664 for (i=0;*okword[i];i++)
1665 if (!strcmp(testword,okword[i]))
1668 * What looks like a typo may be a Roman numeral.
1671 if (istypo && isroman(testword))
1673 /* Check the manual list of typos. */
1675 for (i=0;*typo[i];i++)
1676 if (!strcmp(testword,typo[i]))
1679 * Check lowercase s, l, i and m - special cases.
1680 * "j" - often a semi-colon gone wrong.
1681 * "d" for a missing apostrophe - he d
1684 if (!istypo && len==1 &&
1685 g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
1689 dupcnt=g_tree_lookup(qword,testword);
1693 isdup=!pswit[VERBOSE_SWITCH];
1697 dupcnt=g_new0(int,1);
1698 g_tree_insert(qword,g_strdup(testword),dupcnt);
1703 if (pswit[ECHO_SWITCH])
1704 g_print("\n%s\n",aline);
1705 if (!pswit[OVERVIEW_SWITCH])
1707 g_print(" Line %ld column %ld - Query word %s",
1708 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
1710 if (!pswit[VERBOSE_SWITCH])
1711 g_print(" - not reporting duplicates");
1719 /* check the user's list of typos */
1720 if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
1722 if (pswit[ECHO_SWITCH])
1723 g_print("\n%s\n",aline);
1724 if (!pswit[OVERVIEW_SWITCH])
1725 g_print(" Line %ld column %ld - Query possible scanno %s\n",
1726 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
1728 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
1730 if (pswit[PARANOID_SWITCH] && warnings->digit)
1732 /* In paranoid mode, query all 0 and 1 standing alone. */
1733 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
1735 if (pswit[ECHO_SWITCH])
1736 g_print("\n%s\n",aline);
1737 if (!pswit[OVERVIEW_SWITCH])
1738 g_print(" Line %ld column %ld - Query standalone %s\n",
1739 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
1750 * check_for_misspaced_punctuation:
1752 * Look for added or missing spaces around punctuation and quotes.
1753 * If there is a punctuation character like ! with no space on
1754 * either side, suspect a missing!space. If there are spaces on
1755 * both sides , assume a typo. If we see a double quote with no
1756 * space or punctuation on either side of it, assume unspaced
1757 * quotes "like"this.
/*
 * check_for_misspaced_punctuation: look for added or missing spaces
 * around punctuation and quotes.
 *
 * aline       - the line being checked (UTF-8).
 * parities    - running open/close state for double and single quotes;
 *               parities->dquote and parities->squote are toggled here.
 * isemptyline - suppresses the "Spaced punctuation?" reports when set.
 *
 * The function makes several independent passes over the line, each
 * keeping a previous/current/next character window (pc, c, nc):
 *
 *  1. . ? ! , ; : _  with a letter on both sides, or strict punctuation
 *     followed by a letter           -> "Missing space?"
 *     space before and space/end after -> "Spaced punctuation?"
 *     (acronyms and ellipses are meant to be exempt)
 *  2. ? ! , ; :  preceded by a space and not followed by one
 *                                     -> "Spaced punctuation?"
 *  3. a period preceded by a space and followed by a letter
 *                                     -> "Spaced punctuation?"
 *  4. a quote with no space or punctuation on either side
 *                                     -> "Unspaced quotes?"
 *  5. double-quote parity, with the character after the quote checked
 *     against the sets allowed after a closing or opening quote
 *                                     -> "Wrongspaced quotes?"
 *  6. a double quote in column 1 followed by closing-type punctuation
 *     or a space                      -> "Wrongspaced quotes?"
 *  7. when SQUOTE_SWITCH is set, the same parity check for single
 *     quotes that are not word-internal apostrophes
 *                                     -> "Wrongspaced singlequotes?"
 *
 * Change from the listed code: both opening-quote checks called
 * isdigit() on a gunichar.  <ctype.h> functions are undefined for values
 * that are neither EOF nor representable as unsigned char;
 * g_unichar_isdigit() is used instead, matching the neighbouring
 * g_unichar_isalpha() calls.
 *
 * NOTE(review): short lines (braces, the pc/c shifting at the top of
 * each loop, the tests selecting the quote and period characters, the
 * overview branches) are missing from this listing and not described.
 */
1759 void check_for_misspaced_punctuation(const char *aline,
1760 struct parities *parities,gboolean isemptyline)
1762 gboolean isacro,isellipsis;
1764 gunichar c,nc,pc,n2c;
1765 c=g_utf8_get_char(aline);
1766 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1767 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1771 nc=g_utf8_get_char(g_utf8_next_char(s));
1772 /* For each character in the line after the first. */
1773 if (g_utf8_strchr(".?!,;:_",-1,c)) /* if it's punctuation */
1775 /* we need to suppress warnings for acronyms like M.D. */
1777 /* we need to suppress warnings for ellipsis . . . */
1780 * If there are letters on both sides of it or
1781 * if it's strict punctuation followed by an alpha.
1783 if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
1784 g_utf8_strchr("?!,;:",-1,c)))
1788 if (g_utf8_pointer_to_offset(aline,s)>2 &&
1789 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
1791 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
1797 if (pswit[ECHO_SWITCH])
1798 g_print("\n%s\n",aline);
1799 if (!pswit[OVERVIEW_SWITCH])
1800 g_print(" Line %ld column %ld - Missing space?\n",
1801 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1806 if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
1809 * If there are spaces on both sides,
1810 * or space before and end of line.
1814 if (g_utf8_pointer_to_offset(aline,s)>2 &&
1815 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
1817 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
1821 if (!isemptyline && !isellipsis)
1823 if (pswit[ECHO_SWITCH])
1824 g_print("\n%s\n",aline);
1825 if (!pswit[OVERVIEW_SWITCH])
1826 g_print(" Line %ld column %ld - "
1827 "Spaced punctuation?\n",linecnt,
1828 g_utf8_pointer_to_offset(aline,s)+1);
1835 /* Split out the characters that CANNOT be preceded by space. */
1836 c=g_utf8_get_char(aline);
1837 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1838 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1842 nc=g_utf8_get_char(g_utf8_next_char(s));
1843 /* for each character in the line after the first */
1844 if (g_utf8_strchr("?!,;:",-1,c))
1846 /* if it's punctuation that _cannot_ have a space before it */
1847 if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
1850 * If nc DOES == space,
1851 * it was already reported just above.
1853 if (pswit[ECHO_SWITCH])
1854 g_print("\n%s\n",aline);
1855 if (!pswit[OVERVIEW_SWITCH])
1856 g_print(" Line %ld column %ld - Spaced punctuation?\n",
1857 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1864 * Special case " .X" where X is any alpha.
1865 * This plugs a hole in the acronym code above.
1866 * Inelegant, but maintainable.
1868 c=g_utf8_get_char(aline);
1869 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1870 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1874 nc=g_utf8_get_char(g_utf8_next_char(s));
1875 /* for each character in the line after the first */
1878 /* if it's a period */
1879 if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
1882 * If the period follows a space and
1883 * is followed by a letter.
1885 if (pswit[ECHO_SWITCH])
1886 g_print("\n%s\n",aline);
1887 if (!pswit[OVERVIEW_SWITCH])
1888 g_print(" Line %ld column %ld - Spaced punctuation?\n",
1889 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1895 c=g_utf8_get_char(aline);
1896 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1897 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1901 nc=g_utf8_get_char(g_utf8_next_char(s));
1902 /* for each character in the line after the first */
1905 if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
1906 !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
1907 !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
1909 if (pswit[ECHO_SWITCH])
1910 g_print("\n%s\n",aline);
1911 if (!pswit[OVERVIEW_SWITCH])
1912 g_print(" Line %ld column %ld - Unspaced quotes?\n",
1913 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1919 /* Check parity of quotes. */
1920 nc=g_utf8_get_char(aline);
1921 for (s=aline;*s;s=g_utf8_next_char(s))
1924 nc=g_utf8_get_char(g_utf8_next_char(s));
1927 parities->dquote=!parities->dquote;
1928 if (!parities->dquote)
1931 if (!g_utf8_strchr("_-.'`/,;:!?)]} ",-1,nc))
1933 if (pswit[ECHO_SWITCH])
1934 g_print("\n%s\n",aline);
1935 if (!pswit[OVERVIEW_SWITCH])
1936 g_print(" Line %ld column %ld - "
1937 "Wrongspaced quotes?\n",
1938 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1946 if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
1947 !g_utf8_strchr("_-/.'`([{$",-1,nc) || !nc)
1949 if (pswit[ECHO_SWITCH])
1950 g_print("\n%s\n",aline);
1951 if (!pswit[OVERVIEW_SWITCH])
1952 g_print(" Line %ld column %ld - "
1953 "Wrongspaced quotes?\n",
1954 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1961 if (g_utf8_get_char(aline)==CHAR_DQUOTE)
1963 if (g_utf8_strchr(",;:!?)]} ",-1,
1964 g_utf8_get_char(g_utf8_next_char(aline))))
1966 if (pswit[ECHO_SWITCH])
1967 g_print("\n%s\n",aline);
1968 if (!pswit[OVERVIEW_SWITCH])
1969 g_print(" Line %ld column 1 - Wrongspaced quotes?\n",
1975 if (pswit[SQUOTE_SWITCH])
1977 nc=g_utf8_get_char(aline);
1978 for (s=aline;*s;s=g_utf8_next_char(s))
1981 nc=g_utf8_get_char(g_utf8_next_char(s));
1982 if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&
1983 !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
1984 !g_unichar_isalpha(nc)))
1986 parities->squote=!parities->squote;
1987 if (!parities->squote)
1990 if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
1992 if (pswit[ECHO_SWITCH])
1993 g_print("\n%s\n",aline);
1994 if (!pswit[OVERVIEW_SWITCH])
1995 g_print(" Line %ld column %ld - "
1996 "Wrongspaced singlequotes?\n",
1997 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2005 if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
2006 !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
2008 if (pswit[ECHO_SWITCH])
2009 g_print("\n%s\n",aline);
2010 if (!pswit[OVERVIEW_SWITCH])
2011 g_print(" Line %ld column %ld - "
2012 "Wrongspaced singlequotes?\n",
2013 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2024 * check_for_double_punctuation:
2026 * Look for double punctuation like ,. or ,,
2027 * Thanks to DW for the suggestion!
2028 * In books with references, ".," and ".;" are common
2029 * e.g. "etc., etc.," and vol. 1.; vol 3.;
2030 * OTOH, from my initial tests, there are also fairly
2031 * common errors. What to do? Make these cases paranoid?
2032 * ".," is the most common, so warnings->dotcomma is used
2033 * to suppress detailed reporting if it occurs often.
/*
 * check_for_double_punctuation: report two adjacent characters that both
 * come from ".?!,;:" as "Double punctuation?", at the 1-based character
 * column of the first of the pair.
 *
 * aline    - the line being checked (UTF-8).
 * warnings - first-pass results; warnings->dotcomma and
 *            warnings->isFrench are read.
 *
 * The visible exemption test covers:
 *   - a doubled '.', '?' or '!';
 *   - "., " when warnings->dotcomma is not set;
 *   - for French texts, an ellipsis directly before or after one of
 *     , ; : ! ?
 * The second French test repeats the same ten prefixes; the only
 * statement visible after it re-reads nc from the character after s.
 *
 * NOTE(review): short lines (braces, the c/nc shifting, pointer
 * adjustments after the second French test) are missing from this
 * listing and not described.
 */
2035 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
2039 nc=g_utf8_get_char(aline);
2040 for (s=aline;*s;s=g_utf8_next_char(s))
2043 nc=g_utf8_get_char(g_utf8_next_char(s));
2044 /* for each punctuation character in the line */
2045 if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
2046 g_utf8_strchr(".?!,;:",-1,nc))
2048 /* followed by punctuation, it's a query, unless . . . */
2049 if (c==nc && (c=='.' || c=='?' || c=='!') ||
2050 !warnings->dotcomma && c=='.' && nc==',' ||
2051 warnings->isFrench && g_str_has_prefix(s,",...") ||
2052 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2053 warnings->isFrench && g_str_has_prefix(s,";...") ||
2054 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2055 warnings->isFrench && g_str_has_prefix(s,":...") ||
2056 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2057 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2058 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2059 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2060 warnings->isFrench && g_str_has_prefix(s,"...?"))
2062 if (warnings->isFrench && g_str_has_prefix(s,",...") ||
2063 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2064 warnings->isFrench && g_str_has_prefix(s,";...") ||
2065 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2066 warnings->isFrench && g_str_has_prefix(s,":...") ||
2067 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2068 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2069 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2070 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2071 warnings->isFrench && g_str_has_prefix(s,"...?"))
2074 nc=g_utf8_get_char(g_utf8_next_char(s));
2076 ; /* do nothing for .. !! and ?? which can be legit */
2080 if (pswit[ECHO_SWITCH])
2081 g_print("\n%s\n",aline);
2082 if (!pswit[OVERVIEW_SWITCH])
2083 g_print(" Line %ld column %ld - Double punctuation?\n",
2084 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2093 * check_for_spaced_quotes:
/*
 * check_for_spaced_quotes: report every quote mark that has a space on
 * both sides.
 *
 * First every space, double quote, space sequence is reported as
 * "Spaced doublequote?".  Then, for each code point in single_quotes[],
 * a pattern of space, that quote, space is built in a GString and every
 * occurrence is reported as "Spaced singlequote?".  Columns are the
 * 1-based character offset of the leading space.  After a hit the search
 * resumes two characters further on.  The GString is freed before
 * returning.
 *
 * NOTE(review): the tail of the single_quotes[] initialiser and the
 * statements that (re)set s before each search are on lines missing
 * from this listing.
 */
2095 void check_for_spaced_quotes(const char *aline)
2099 const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,
2103 while ((t=strstr(s," \" ")))
2105 if (pswit[ECHO_SWITCH])
2106 g_print("\n%s\n",aline);
2107 if (!pswit[OVERVIEW_SWITCH])
2108 g_print(" Line %ld column %ld - Spaced doublequote?\n",
2109 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2112 s=g_utf8_next_char(g_utf8_next_char(t));
2114 pattern=g_string_new(NULL);
2115 for(i=0;i<G_N_ELEMENTS(single_quotes);i++)
/* Rebuild the pattern " <quote> " for this quote character. */
2117 g_string_assign(pattern," ");
2118 g_string_append_unichar(pattern,single_quotes[i]);
2119 g_string_append_c(pattern,' ');
2121 while ((t=strstr(s,pattern->str)))
2123 if (pswit[ECHO_SWITCH])
2124 g_print("\n%s\n",aline);
2125 if (!pswit[OVERVIEW_SWITCH])
2126 g_print(" Line %ld column %ld - Spaced singlequote?\n",
2127 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2130 s=g_utf8_next_char(g_utf8_next_char(t));
2133 g_string_free(pattern,TRUE);
2137 * check_for_miscased_genative:
2139 * Check special case of 'S instead of 's at end of word.
/*
 * check_for_miscased_genative: scan the line with a previous/current/
 * next character window and report 'Capital "S"?' wherever an apostrophe
 * is followed by a capital S and preceded by a lowercase letter.  The
 * column printed is the character offset of s plus 2.
 *
 * NOTE(review): the lines that shift pc and c at the top of the loop are
 * missing from this listing; presumably c is the character at s, which
 * would make offset+2 the 1-based column of the S - confirm.
 */
2141 void check_for_miscased_genative(const char *aline)
2147 c=g_utf8_get_char(aline);
2148 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2149 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2153 nc=g_utf8_get_char(g_utf8_next_char(s));
2154 if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))
2156 if (pswit[ECHO_SWITCH])
2157 g_print("\n%s\n",aline);
2158 if (!pswit[OVERVIEW_SWITCH])
2159 g_print(" Line %ld column %ld - Capital \"S\"?\n",
2160 linecnt,g_utf8_pointer_to_offset(aline,s)+2);
2168 * check_end_of_line:
2170 * Now check special cases - start and end of line -
2171 * for single and double quotes. Start is sometimes [sic]
2172 * but better to query it anyway.
2173 * While we're here, check for dash at end of line.
/*
 * check_end_of_line: start-of-line and end-of-line special cases.
 *
 * aline    - the line being checked (UTF-8).
 * warnings - first-pass results; only warnings->hyphen is read here.
 *
 * For lines longer than one character:
 *   - the last character is a double or single quote and the one before
 *     it is a space -> "Spaced quote?", column = line length in
 *     characters;
 *   - the first character is a single quote and the second is a space
 *     -> "Spaced quote?" at column 1;
 *   - with PARANOID_SWITCH and warnings->hyphen, trailing characters at
 *     or below CHAR_SPACE are skipped backwards, then a '-' that is not
 *     preceded by another '-' -> "Hyphen at end of line?".
 *
 * NOTE(review): the hyphen report prints the character offset of s
 * without the +1 that most sibling checks add - confirm this is
 * intended.  Short lines (braces, declarations) are missing from this
 * listing, so exact nesting is not shown.
 */
2175 void check_end_of_line(const char *aline,struct warnings *warnings)
2180 lbytes=strlen(aline);
2181 if (g_utf8_strlen(aline,lbytes)>1)
/* s -> last character of the line; c1 is that character, c2 the one before it. */
2183 s=g_utf8_prev_char(aline+lbytes);
2184 c1=g_utf8_get_char(s);
2185 c2=g_utf8_get_char(g_utf8_prev_char(s));
2186 if ((c1==CHAR_DQUOTE || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)
2188 if (pswit[ECHO_SWITCH])
2189 g_print("\n%s\n",aline);
2190 if (!pswit[OVERVIEW_SWITCH])
2191 g_print(" Line %ld column %ld - Spaced quote?\n",linecnt,
2192 g_utf8_strlen(aline,lbytes));
/* Now the first two characters of the line. */
2196 c1=g_utf8_get_char(aline);
2197 c2=g_utf8_get_char(g_utf8_next_char(aline));
2198 if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)
2200 if (pswit[ECHO_SWITCH])
2201 g_print("\n%s\n",aline);
2202 if (!pswit[OVERVIEW_SWITCH])
2203 g_print(" Line %ld column 1 - Spaced quote?\n",linecnt);
2208 * Dash at end of line may well be legit - paranoid mode only
2209 * and don't report em-dash at line-end.
2211 if (pswit[PARANOID_SWITCH] && warnings->hyphen)
2213 for (s=g_utf8_prev_char(aline+lbytes);
2214 s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
2216 if (g_utf8_get_char(s)=='-' &&
2217 g_utf8_get_char(g_utf8_prev_char(s))!='-')
2219 if (pswit[ECHO_SWITCH])
2220 g_print("\n%s\n",aline);
2221 if (!pswit[OVERVIEW_SWITCH])
2222 g_print(" Line %ld column %ld - "
2223 "Hyphen at end of line?\n",
2224 linecnt,g_utf8_pointer_to_offset(aline,s));
2231 * check_for_unspaced_bracket:
2233 * Brackets are often unspaced, but shouldn't be surrounded by alpha.
2234 * If so, suspect a scanno like "a]most".
/*
 * check_for_unspaced_bracket: scan the line with a previous/current/next
 * character window and report "Unspaced bracket?" for any of { [ ( ) ] }
 * that has a letter immediately before and immediately after it.
 *
 * NOTE(review): the column is printed as the character offset of s
 * without the +1 that most sibling checks add - confirm this is
 * intended.  The lines that shift pc and c are missing from this
 * listing.
 */
2236 void check_for_unspaced_bracket(const char *aline)
2240 c=g_utf8_get_char(aline);
2241 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2242 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2246 nc=g_utf8_get_char(g_utf8_next_char(s));
2249 /* for each bracket character in the line except 1st & last */
2250 if (g_utf8_strchr("{[()]}",-1,c) &&
2251 g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
2253 if (pswit[ECHO_SWITCH])
2254 g_print("\n%s\n",aline);
2255 if (!pswit[OVERVIEW_SWITCH])
2256 g_print(" Line %ld column %ld - Unspaced bracket?\n",
2257 linecnt,g_utf8_pointer_to_offset(aline,s));
2265 * check_for_unpunctuated_endquote:
/*
 * check_for_unpunctuated_endquote: scan the line with a previous/
 * current/next character window and report "endquote missing
 * punctuation?" for a double quote whose preceding character is a
 * letter.
 *
 * Change from the listed code: the letter test used isalpha() on a
 * gunichar.  <ctype.h> functions are undefined for values that are
 * neither EOF nor representable as unsigned char, and at best recognise
 * only single-byte letters.  g_unichar_isalpha(), which the rest of the
 * file uses, is applied instead, so a quote directly after an accented
 * or non-Latin letter is now queried as well.
 *
 * NOTE(review): the column is printed as the character offset of s
 * without the +1 that most sibling checks add; that is left unchanged.
 * The lines that shift pc and c are missing from this listing.
 */
2267 void check_for_unpunctuated_endquote(const char *aline)
2271 c=g_utf8_get_char(aline);
2272 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2273 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2277 nc=g_utf8_get_char(g_utf8_next_char(s));
2278 /* for each character in the line except 1st */
2279 if (c==CHAR_DQUOTE && g_unichar_isalpha(pc))
2281 if (pswit[ECHO_SWITCH])
2282 g_print("\n%s\n",aline);
2283 if (!pswit[OVERVIEW_SWITCH])
2284 g_print(" Line %ld column %ld - "
2285 "endquote missing punctuation?\n",
2286 linecnt,g_utf8_pointer_to_offset(aline,s));
2294 * check_for_html_tag:
2296 * Check for <HTML TAG>.
2298 * If there is a < in the line, followed at some point
2299 * by a > then we suspect HTML.
/*
 * check_for_html_tag: find the first '<' on the line and the first '>'
 * after it.  The text from '<' to '>' inclusive is copied with
 * g_strndup and reported as "HTML Tag? %s" at the 1-based character
 * column of the '<'.
 *
 * NOTE(review): the NULL tests on open/close and the release of tag are
 * on short lines missing from this listing - confirm tag is freed after
 * printing.
 */
2301 void check_for_html_tag(const char *aline)
2303 const char *open,*close;
2305 open=strchr(aline,'<');
2308 close=strchr(g_utf8_next_char(open),'>');
2311 if (pswit[ECHO_SWITCH])
2312 g_print("\n%s\n",aline);
2313 if (!pswit[OVERVIEW_SWITCH])
/* Copy the tag including both angle brackets. */
2315 tag=g_strndup(open,close-open+1);
2316 g_print(" Line %ld column %ld - HTML Tag? %s \n",
2317 linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
2327 * check_for_html_entity:
2329 * Check for &symbol; HTML.
2331 * If there is a & in the line, followed at
2332 * some point by a ; then we suspect HTML.
/*
 * check_for_html_entity: find the first '&' on the line and the first
 * ';' after it.  The span between them is scanned for a space, so that
 * text such as "Jones & Son;" is not reported.  Otherwise the text from
 * '&' to ';' inclusive is copied with g_strndup and reported as
 * "HTML symbol? %s".
 *
 * Change from the listed code: the column was computed as the byte
 * distance (amp-aline)+1 cast to int, so any multi-byte UTF-8 character
 * before the '&' inflated the reported column.  It is now the 1-based
 * character offset from g_utf8_pointer_to_offset(), printed with %ld,
 * as in check_for_html_tag and the other checks in this file.
 *
 * NOTE(review): the NULL tests on amp/scolon, the test on the result of
 * the space scan and the release of entity are on short lines missing
 * from this listing.
 */
2334 void check_for_html_entity(const char *aline)
2336 const char *s,*amp,*scolon;
2338 amp=strchr(aline,'&');
2341 scolon=strchr(amp,';');
2344 for (s=amp;s<scolon;s=g_utf8_next_char(s))
2345 if (g_utf8_get_char(s)==CHAR_SPACE)
2346 break; /* Don't report "Jones & Son;" */
2349 if (pswit[ECHO_SWITCH])
2350 g_print("\n%s\n",aline);
2351 if (!pswit[OVERVIEW_SWITCH])
2353 entity=g_strndup(amp,scolon-amp+1);
2354 g_print(" Line %ld column %ld - HTML symbol? %s \n",
2355 linecnt,g_utf8_pointer_to_offset(aline,amp)+1,entity);
2368 * If we are in a state of unbalanced quotes, and this line
2369 * doesn't begin with a quote, output the stored error message.
2370 * If the -P switch was used, print the warning even if the
2371 * new para starts with quotes.
/*
 * print_pending: emit and clear the warnings stored in *pending.
 *
 * aline     - the current line; c is read from a pointer s whose set-up
 *             is on lines missing from this listing (presumably the
 *             first significant character of aline - confirm).
 * parastart - first line of the paragraph, echoed before each message
 *             when ECHO_SWITCH is set.
 * pending   - stored message strings; each non-NULL member is handled,
 *             freed with g_free and reset to NULL.
 *
 * Conditions visible for printing (all suppressed by OVERVIEW_SWITCH):
 *   dquote - c is not a double quote, or QPARA_SWITCH is set;
 *   squote - c is not a single quote, or QPARA_SWITCH is set, or
 *            pending->squot is set;
 *   rbrack, sbrack, cbrack, unders - no extra condition is visible.
 */
2373 void print_pending(const char *aline,const char *parastart,
2374 struct pending *pending)
2381 c=g_utf8_get_char(s);
2382 if (pending->dquote)
2384 if (c!=CHAR_DQUOTE || pswit[QPARA_SWITCH])
2386 if (!pswit[OVERVIEW_SWITCH])
2388 if (pswit[ECHO_SWITCH])
2389 g_print("\n%s\n",parastart);
2390 g_print("%s\n",pending->dquote);
2395 g_free(pending->dquote);
2396 pending->dquote=NULL;
2398 if (pending->squote)
2400 if (!CHAR_IS_SQUOTE(c) || pswit[QPARA_SWITCH] || pending->squot)
2402 if (!pswit[OVERVIEW_SWITCH])
2404 if (pswit[ECHO_SWITCH])
2405 g_print("\n%s\n",parastart);
2406 g_print("%s\n",pending->squote);
2411 g_free(pending->squote);
2412 pending->squote=NULL;
2414 if (pending->rbrack)
2416 if (!pswit[OVERVIEW_SWITCH])
2418 if (pswit[ECHO_SWITCH])
2419 g_print("\n%s\n",parastart);
2420 g_print("%s\n",pending->rbrack);
2424 g_free(pending->rbrack);
2425 pending->rbrack=NULL;
2427 if (pending->sbrack)
2429 if (!pswit[OVERVIEW_SWITCH])
2431 if (pswit[ECHO_SWITCH])
2432 g_print("\n%s\n",parastart);
2433 g_print("%s\n",pending->sbrack);
2437 g_free(pending->sbrack);
2438 pending->sbrack=NULL;
2440 if (pending->cbrack)
2442 if (!pswit[OVERVIEW_SWITCH])
2444 if (pswit[ECHO_SWITCH])
2445 g_print("\n%s\n",parastart);
2446 g_print("%s\n",pending->cbrack);
2450 g_free(pending->cbrack);
2451 pending->cbrack=NULL;
2453 if (pending->unders)
2455 if (!pswit[OVERVIEW_SWITCH])
2457 if (pswit[ECHO_SWITCH])
2458 g_print("\n%s\n",parastart);
2459 g_print("%s\n",pending->unders);
2463 g_free(pending->unders);
2464 pending->unders=NULL;
2469 * check_for_mismatched_quotes:
2471 * At end of paragraph, check for mismatched quotes.
2473 * We don't want to report an error immediately, since it is a
2474 * common convention to omit the quotes at end of paragraph if
2475 * the next paragraph is a continuation of the same speaker.
2476 * Where this is the case, the next para should begin with a
2477 * quote, so we store the warning message and only display it
2478 * at the top of the next iteration if the new para doesn't
2479 * start with a quote.
2480 * The -p switch overrides this default, and warns of unclosed
2481 * quotes on _every_ paragraph, whether the next begins with a quote or not.
/*
 * check_for_mismatched_quotes: at the end of a paragraph, build warning
 * messages for unbalanced quotes, brackets and underscores.  Messages
 * are created with g_strdup_printf and include the current linecnt.
 *
 * counters - per-paragraph character counts (read only).
 * pending  - receives the messages.  NOTE(review): the assignment
 *            targets are on short lines missing from this listing;
 *            presumably the pending members of matching name.
 *
 * Visible conditions:
 *   - counters->quot is odd            -> "Mismatched quotes"
 *   - with SQUOTE_SWITCH: the straight and curved single-quote
 *     differences are taken from matching_difference() when
 *     matching_count() is non-zero; a non-zero difference gives
 *     "Mismatched singlequotes?", and a non-zero difference other than
 *     1 is flagged to be noted regardless of how the next paragraph
 *     starts
 *   - non-zero matching_difference() for round, square or curly
 *     brackets -> the corresponding "Mismatched ... brackets?"
 *   - counters->c_unders is odd        -> "Mismatched underscores?"
 *
 * NOTE(review): the values given to squote_straight and squote_curved
 * when matching_count() returns zero are not visible here - confirm
 * both are always initialised before being tested.
 */
2484 void check_for_mismatched_quotes(const struct counters *counters,
2485 struct pending *pending)
2487 int squote_straight,squote_curved;
2488 if (counters->quot%2)
2490 g_strdup_printf(" Line %ld - Mismatched quotes",linecnt);
2491 if (pswit[SQUOTE_SWITCH])
2493 if (matching_count(counters,CHAR_SQUOTE,TRUE))
2494 squote_straight=matching_difference(counters,CHAR_SQUOTE);
2497 if (matching_count(counters,CHAR_LS_QUOTE,TRUE))
2498 squote_curved=matching_difference(counters,CHAR_LS_QUOTE);
2501 if (squote_straight || squote_curved)
2503 g_strdup_printf(" Line %ld - Mismatched singlequotes?",
2505 if (squote_straight && squote_straight!=1 ||
2506 squote_curved && squote_curved!=1)
2508 * Flag it to be noted regardless of the
2509 * first char of the next para.
2513 if (matching_difference(counters,CHAR_OPEN_RBRACK))
2515 g_strdup_printf(" Line %ld - Mismatched round brackets?",linecnt);
2516 if (matching_difference(counters,CHAR_OPEN_SBRACK))
2518 g_strdup_printf(" Line %ld - Mismatched square brackets?",linecnt);
2519 if (matching_difference(counters,CHAR_OPEN_CBRACK))
2521 g_strdup_printf(" Line %ld - Mismatched curly brackets?",linecnt);
2522 if (counters->c_unders%2)
2524 g_strdup_printf(" Line %ld - Mismatched underscores?",linecnt);
2528 * check_for_omitted_punctuation:
2530 * Check for omitted punctuation at end of paragraph by working back
2531 * through prevline. DW.
2532 * Need to check this only for "normal" paras.
2533 * So what is a "normal" para?
2534 * Not normal if one-liner (chapter headings, etc.)
2535 * Not normal if it doesn't contain at least one lowercase letter
2536 * Not normal if starts with space
/*
 * check_for_omitted_punctuation: look for a paragraph that ends without
 * punctuation, by working backwards through its last line.
 *
 * prevline        - the last line of the paragraph just ended (UTF-8).
 * last            - properties of that line; only last->blen is read.
 * start_para_line - line number on which the paragraph started.
 *
 * The check runs only when prevline contains at least one letter
 * (g_unichar_isalpha), last->blen is greater than 2, the paragraph
 * started before linecnt-1, and prevline's first character is above
 * CHAR_SPACE.  Trailing closing quotes are skipped first, then the line
 * is walked backwards.  Reaching a letter gives "No punctuation at para
 * end?", reported for line linecnt-1 with the line's character length
 * as the column.  Characters from "-.:!([{?}])" are tested separately;
 * the action taken for them is on a line missing from this listing.
 *
 * NOTE(review): the letter test accepts any alphabetic character, not
 * only lowercase ones.
 */
2538 void check_for_omitted_punctuation(const char *prevline,
2539 struct line_properties *last,int start_para_line)
2541 gboolean letter_on_line=FALSE;
2544 for (s=prevline;*s;s=g_utf8_next_char(s))
2545 if (g_unichar_isalpha(g_utf8_get_char(s)))
2547 letter_on_line=TRUE;
2551 * This next "if" is a problem.
2552 * If we say "start_para_line <= linecnt - 1", that includes
2553 * one-line "paragraphs" like chapter heads. Lotsa false positives.
2554 * If we say "start_para_line < linecnt - 1" it doesn't, but then it
2555 * misses genuine one-line paragraphs.
2557 if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
2558 g_utf8_get_char(prevline)>CHAR_SPACE)
2560 s=prevline+strlen(prevline);
2563 s=g_utf8_prev_char(s);
2564 c=g_utf8_get_char(s);
2565 } while (CHAR_IS_CLOSING_QUOTE(c) && c>CHAR_SPACE && s>prevline);
2566 for (;s>prevline;s=g_utf8_prev_char(s))
2568 if (g_unichar_isalpha(g_utf8_get_char(s)))
2570 if (pswit[ECHO_SWITCH])
2571 g_print("\n%s\n",prevline);
2572 if (!pswit[OVERVIEW_SWITCH])
2573 g_print(" Line %ld column %ld - "
2574 "No punctuation at para end?\n",
2575 linecnt-1,g_utf8_strlen(prevline,-1));
2580 if (g_utf8_strchr("-.:!([{?}])",-1,g_utf8_get_char(s)))
/*
 * report_duplicate_queries: callback taking a key/value/data triple,
 * with the key treated as the queried word.  Prints a note saying how
 * many times that word was duplicated.
 *
 * NOTE(review): the g_print arguments, any condition on the count and
 * the return value are on lines missing from this listing.  Presumably
 * this is a GTraverseFunc applied to the qword tree - confirm.
 */
2586 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
2588 const char *word=key;
2591 g_print("\nNote: Queried word %s was duplicated %d times\n",
/*
 * print_as_windows_1252: write a UTF-8 string to stdout converted to
 * WINDOWS-1252.
 *
 * The GIConv converter is kept in a function-local static and opened on
 * first use.  When a converter is available, an output buffer of
 * strlen(string)+1 bytes is allocated and g_iconv converts into it.
 * The unconverted string is written with fputs on a path whose
 * condition is not visible here (presumably when no converter could be
 * opened).
 *
 * NOTE(review): the condition guarding the close-and-reset of the
 * converter, the handling of g_iconv's return value, the output of buf
 * and its release are on short lines missing from this listing.
 */
2596 void print_as_windows_1252(const char *string)
2598 gsize inbytes,outbytes;
/* (GIConv)-1 marks "no converter open". */
2600 static GIConv converter=(GIConv)-1;
2603 if (converter!=(GIConv)-1)
2604 g_iconv_close(converter);
2605 converter=(GIConv)-1;
2608 if (converter==(GIConv)-1)
2609 converter=g_iconv_open("WINDOWS-1252","UTF-8");
2610 if (converter!=(GIConv)-1)
2612 inbytes=outbytes=strlen(string);
2613 bp=buf=g_malloc(outbytes+1);
2614 g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
2620 fputs(string,stdout);
/*
 * print_as_utf_8:
 *
 * Write a string to stdout exactly as given, with no re-encoding.
 * It has the same shape as print_as_windows_1252(), presumably so that
 * either can be installed as the print handler.
 */
void print_as_utf_8(const char *string)
{
    fputs(string,stdout);
}
/*
 * procfile: run the checks over one e-text file.
 *
 * filename - path handed to read_etext() and echoed in the output.
 *
 * Outline of the visible code:
 *   1. read the file, reporting err->message if that fails;
 *   2. run first_pass() and turn its results into the warnings that
 *      enable or disable some of the per-line checks;
 *   3. fetch the text line by line with flgets() and apply the
 *      individual check_for_*() routines to each line of the body;
 *   4. report duplicated queries and release the per-file resources.
 *
 * The globals linecnt, checked_linecnt, qword, qperiod and pswit are
 * used, and linecnt and checked_linecnt are reset to zero on entry.
 *
 * NOTE(review): this listing omits the short lines of the function
 * (braces, some declarations, else branches and brief statements), so
 * the comments below describe only what the visible lines establish.
 */
2633 void procfile(const char *filename)
2636 gchar *parastart=NULL; /* first line of current para */
2637 gchar *etext,*aline;
2640 struct first_pass_results *first_pass_results;
2641 struct warnings *warnings;
2642 struct counters counters={0};
2643 struct line_properties last={0};
2644 struct parities parities={0};
2645 struct pending pending={0};
2646 gboolean isemptyline;
2647 long start_para_line=0;
2648 gboolean isnewpara=FALSE,enddash=FALSE;
2649 last.start=CHAR_SPACE;
2650 linecnt=checked_linecnt=0;
2651 etext=read_etext(filename,&err);
/*
 * Report a read failure: on stdout when STDOUT_SWITCH is set; the stderr
 * variant presumably belongs to an else branch not visible here.
 */
2654 if (pswit[STDOUT_SWITCH])
2655 fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
2657 fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
2660 g_print("\n\nFile: %s\n\n",filename);
2661 first_pass_results=first_pass(etext);
2662 warnings=report_first_pass(first_pass_results);
/*
 * Per-file lookup trees with keys compared by strcmp: qword releases
 * both key and value with g_free, qperiod releases only the key.
 */
2663 qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
2664 qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
2666 * Here we go with the main pass. Hold onto yer hat!
/* flgets is given linecnt+1 as the number of the line it is fetching. */
2670 while ((aline=flgets(&etext_ptr,linecnt+1)))
2675 if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
2676 continue; // skip DP page separators completely
/*
 * Lines before firstline, or after footerline when one was found
 * (footerline>0), are not checked. With HEADER_SWITCH set, the Title,
 * Author, Release Date and Edition lines among them are echoed.
 */
2677 if (linecnt<first_pass_results->firstline ||
2678 (first_pass_results->footerline>0 &&
2679 linecnt>first_pass_results->footerline))
2681 if (pswit[HEADER_SWITCH])
2683 if (g_str_has_prefix(aline,"Title:"))
2684 g_print(" %s\n",aline);
2685 if (g_str_has_prefix(aline,"Author:"))
2686 g_print(" %s\n",aline);
2687 if (g_str_has_prefix(aline,"Release Date:"))
2688 g_print(" %s\n",aline);
2689 if (g_str_has_prefix(aline,"Edition:"))
2690 g_print(" %s\n\n",aline);
2692 continue; /* skip through the header */
/*
 * Hand the pending state left by the previous iteration to
 * print_pending(), then clear it before analysing this line.
 */
2695 print_pending(aline,parastart,&pending);
2696 memset(&pending,0,sizeof(pending));
2697 isemptyline=analyse_quotes(aline,&counters);
2698 if (isnewpara && !isemptyline)
2700 /* This line is the start of a new paragraph. */
2701 start_para_line=linecnt;
2702 /* Capture its first line in case we want to report it later. */
2704 parastart=g_strdup(aline);
2705 memset(&parities,0,sizeof(parities)); /* restart the quote count */
/* Advance s to the first letter or digit, or to the end of the line. */
2707 while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
2708 !g_unichar_isdigit(g_utf8_get_char(s)))
2709 s=g_utf8_next_char(s);
2710 if (g_unichar_islower(g_utf8_get_char(s)))
2712 /* and its first letter is lowercase */
2713 if (pswit[ECHO_SWITCH])
2714 g_print("\n%s\n",aline);
2715 if (!pswit[OVERVIEW_SWITCH])
2716 g_print(" Line %ld column %ld - "
2717 "Paragraph starts with lower-case\n",
2718 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2722 isnewpara=FALSE; /* Signal the end of new para processing. */
2724 /* Check for an em-dash broken at line end. */
2725 if (enddash && g_utf8_get_char(aline)=='-')
2727 if (pswit[ECHO_SWITCH])
2728 g_print("\n%s\n",aline);
2729 if (!pswit[OVERVIEW_SWITCH])
2730 g_print(" Line %ld column 1 - Broken em-dash?\n",linecnt);
/*
 * Step back from the end of the line over trailing spaces and test for a
 * hyphen there. The result presumably sets enddash for the next line;
 * the assignment itself is not visible here.
 */
2735 for (s=g_utf8_prev_char(aline+strlen(aline));
2736 g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
2738 if (s>=aline && g_utf8_get_char(s)=='-')
2740 check_for_control_characters(aline);
2742 check_for_odd_characters(aline,warnings,isemptyline);
/* The long and short line checks run only when the warnings allow. */
2743 if (warnings->longline)
2744 check_for_long_line(aline);
2745 if (warnings->shortline)
2746 check_for_short_line(aline,&last);
/* Record this line: its length in characters and its first character. */
2748 last.len=g_utf8_strlen(aline,-1);
2749 last.start=g_utf8_get_char(aline);
2750 check_for_starting_punctuation(aline);
2753 check_for_spaced_emdash(aline);
2754 check_for_spaced_dash(aline);
2756 check_for_unmarked_paragraphs(aline);
2757 check_for_jeebies(aline);
2758 check_for_mta_from(aline);
2759 check_for_orphan_character(aline);
2760 check_for_pling_scanno(aline);
2761 check_for_extra_period(aline,warnings);
2762 check_for_following_punctuation(aline);
2763 check_for_typos(aline,warnings);
2764 check_for_misspaced_punctuation(aline,&parities,isemptyline);
2765 check_for_double_punctuation(aline,warnings);
2766 check_for_spaced_quotes(aline);
2767 check_for_miscased_genative(aline);
2768 check_end_of_line(aline,warnings);
2769 check_for_unspaced_bracket(aline);
2770 if (warnings->endquote)
2771 check_for_unpunctuated_endquote(aline);
2772 check_for_html_tag(aline);
2773 check_for_html_entity(aline);
/*
 * Paragraph-level checks: the quote counters gathered so far are checked
 * and then cleared. NOTE(review): the condition guarding this part is
 * not visible; presumably it is reached when the line is empty.
 */
2776 check_for_mismatched_quotes(&counters,&pending);
2777 memset(&counters,0,sizeof(counters));
2778 /* let the next iteration know that it's starting a new para */
2781 check_for_omitted_punctuation(prevline,&last,start_para_line);
/* Keep a copy of this line; it is prevline on the next iteration. */
2784 prevline=g_strdup(aline);
/*
 * After the last line: list the duplicated queries unless OVERVIEW_SWITCH
 * or VERBOSE_SWITCH is set, then drop the trees and counters, restore
 * the default print handler and close the WINDOWS-1252 converter.
 */
2794 if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])
2795 g_tree_foreach(qword,report_duplicate_queries,NULL);
2796 g_tree_unref(qword);
2797 g_tree_unref(qperiod);
2798 counters_destroy(&counters);
2799 g_set_print_handler(NULL);
2800 print_as_windows_1252(NULL);
2801 if (pswit[MARKUP_SWITCH])
2808 * Get one line from the input text, checking for
2809 * the existence of exactly one CR/LF line-end per line.
2811 * Returns: a pointer to the line.
/*
 * flgets: take the next line from the text at *etext.
 *
 * etext - in/out cursor into the text. theline is set to the incoming
 *         *etext, and *etext moves forward one UTF-8 character for each
 *         character examined.
 * lcnt  - line number quoted in the line-end reports.
 *
 * Three line-end problems are reported, and only when LINE_END_SWITCH is
 * set: a LF with no CR before it, two successive CRs, and a CR that is
 * not followed by a LF. Each report echoes the text collected so far
 * (theline up to eos) when ECHO_SWITCH is set and prints its message
 * unless OVERVIEW_SWITCH is set.
 *
 * Before the line is handed back, HTML markup is removed when
 * MARKUP_SWITCH is set and DP markup when DP_SWITCH is set.
 *
 * NOTE(review): this listing omits the short lines of the function (the
 * loop header, the tests on c, braces, break and return statements), so
 * the branch structure is not fully visible; the comments describe only
 * what the visible lines establish.
 */
2813 char *flgets(char **etext,long lcnt)
2816 gboolean isCR=FALSE;
2817 char *theline=*etext;
/* Fetch the next character and move the caller's cursor past it. */
2822 c=g_utf8_get_char(*etext);
2823 *etext=g_utf8_next_char(*etext);
2826 /* either way, it's end of line */
2833 /* Error - a LF without a preceding CR */
2834 if (pswit[LINE_END_SWITCH])
2836 if (pswit[ECHO_SWITCH])
2838 s=g_strndup(theline,eos-theline);
2839 g_print("\n%s\n",s);
2842 if (!pswit[OVERVIEW_SWITCH])
2843 g_print(" Line %ld - No CR?\n",lcnt);
2854 /* Error - two successive CRs */
2855 if (pswit[LINE_END_SWITCH])
2857 if (pswit[ECHO_SWITCH])
2859 s=g_strndup(theline,eos-theline);
2860 g_print("\n%s\n",s);
2863 if (!pswit[OVERVIEW_SWITCH])
2864 g_print(" Line %ld - Two successive CRs?\n",lcnt);
/* An ordinary character arriving while a CR is still outstanding. */
2873 if (pswit[LINE_END_SWITCH] && isCR)
2875 if (pswit[ECHO_SWITCH])
2877 s=g_strndup(theline,eos-theline);
2878 g_print("\n%s\n",s);
2881 if (!pswit[OVERVIEW_SWITCH])
2882 g_print(" Line %ld column %ld - CR without LF?\n",
2883 lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
/* eos follows the end of the text collected for this line. */
2889 eos=g_utf8_next_char(eos);
/* Optional clean-up of the finished line, done in place. */
2893 if (pswit[MARKUP_SWITCH])
2894 postprocess_for_HTML(theline);
2895 if (pswit[DP_SWITCH])
2896 postprocess_for_DP(theline);
2903 * Takes a "word" as a parameter, and checks whether it
2904 * contains a mixture of alpha and digits. Generally, this is an
2905 * error, but may not be for cases like 4th or L5 12s. 3d.
2907 * Returns: TRUE iff an is error found.
2909 gboolean mixdigit(const char *checkword)
2911 gboolean wehaveadigit,wehavealetter,query;
2912 const char *s,*nondigit;
2913 wehaveadigit=wehavealetter=query=FALSE;
2914 for (s=checkword;*s;s=g_utf8_next_char(s))
2915 if (g_unichar_isalpha(g_utf8_get_char(s)))
2917 else if (g_unichar_isdigit(g_utf8_get_char(s)))
2919 if (wehaveadigit && wehavealetter)
2921 /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
2923 for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
2924 nondigit=g_utf8_next_char(nondigit))
2926 /* digits, ending in st, rd, nd, th of either case */
2927 if (!g_ascii_strcasecmp(nondigit,"st") ||
2928 !g_ascii_strcasecmp(nondigit,"rd") ||
2929 !g_ascii_strcasecmp(nondigit,"nd") ||
2930 !g_ascii_strcasecmp(nondigit,"th"))
2932 if (!g_ascii_strcasecmp(nondigit,"sts") ||
2933 !g_ascii_strcasecmp(nondigit,"rds") ||
2934 !g_ascii_strcasecmp(nondigit,"nds") ||
2935 !g_ascii_strcasecmp(nondigit,"ths"))
2937 if (!g_ascii_strcasecmp(nondigit,"stly") ||
2938 !g_ascii_strcasecmp(nondigit,"rdly") ||
2939 !g_ascii_strcasecmp(nondigit,"ndly") ||
2940 !g_ascii_strcasecmp(nondigit,"thly"))
2942 /* digits, ending in l, L, s or d */
2943 if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
2944 !strcmp(nondigit,"d"))
2947 * L at the start of a number, representing Britsh pounds, like L500.
2948 * This is cute. We know the current word is mixed digit. If the first
2949 * letter is L, there must be at least one digit following. If both
2950 * digits and letters follow, we have a genuine error, else we have a
2951 * capital L followed by digits, and we accept that as a non-error.
2953 if (g_utf8_get_char(checkword)=='L' &&
2954 !mixdigit(g_utf8_next_char(checkword)))
2963 * Extracts the first/next "word" from the line, and returns it.
2964 * A word is defined as one English word unit--or at least that's the aim.
2965 * "ptr" is advanced to the position in the line where we will start
2966 * looking for the next word.
2968 * Returns: A newly-allocated string.
/*
 * getaword: pull the next word out of the line at *ptr.
 *
 * ptr - in/out position in the line. Leading characters that are neither
 *       digits nor letters are skipped by advancing *ptr, and the final
 *       loop advances *ptr past the word it collects.
 *
 * The text is collected in a GString and returned through
 * g_string_free(word,FALSE).
 *
 * Two ways of collecting are tried in turn:
 *   1. a look-ahead, through the separate pointer s, gathers digits,
 *      letters, commas and periods; if a comma or period in that run
 *      comes straight after a digit, the whole run is returned;
 *   2. otherwise the buffer is emptied and digits, letters and
 *      apostrophes are gathered from *ptr.
 *
 * NOTE(review): the test in step 1 looks only at the character before
 * the comma or period; the visible lines do not check that a digit
 * follows it. How s is initialised and whether *ptr is updated before
 * the early return are on lines this listing omits.
 */
2970 gchar *getaword(const char **ptr)
2975 word=g_string_new(NULL);
/* Skip whatever precedes the word; stops at the end of the line. */
2976 for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
2977 !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
2978 **ptr;*ptr=g_utf8_next_char(*ptr))
2981 * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
2982 * Especially yucky is the case of L1,000
2983 * This section looks for a pattern of characters including a digit
2984 * followed by a comma or period followed by one or more digits.
2985 * If found, it returns this whole pattern as a word; otherwise we discard
2986 * the results and resume our normal programming.
2989 for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
2990 g_unichar_isalpha(g_utf8_get_char(s)) ||
2991 g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
2992 g_string_append_unichar(word,g_utf8_get_char(s));
/* Scan the collected run from its second character onwards. */
2995 for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))
2997 c=g_utf8_get_char(t);
2998 pc=g_utf8_get_char(g_utf8_prev_char(t));
2999 if ((c=='.' || c==',') && g_unichar_isdigit(pc))
3002 return g_string_free(word,FALSE);
3006 /* we didn't find a punctuated number - do the regular getword thing */
3007 g_string_truncate(word,0);
3008 for (;g_unichar_isdigit(g_utf8_get_char(*ptr)) ||
3009 g_unichar_isalpha(g_utf8_get_char(*ptr)) ||
3010 g_utf8_get_char(*ptr)=='\'';*ptr=g_utf8_next_char(*ptr))
3011 g_string_append_unichar(word,g_utf8_get_char(*ptr));
3012 return g_string_free(word,FALSE);
3018 * Is this word a Roman Numeral?
3020 * It doesn't actually validate that the number is a valid Roman Numeral--for
3021 * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
3022 * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
3023 * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
3024 * expressions thereof, except when it came to taxes. Allow any number of M,
3025 * an optional D, an optional CM or CD, any number of optional Cs, an optional
3026 * XL or an optional XC, an optional IX or IV, an optional V and any number
3029 gboolean isroman(const char *t)
3035 while (g_utf8_get_char(t)=='m' && *t)
3037 if (g_utf8_get_char(t)=='d')
3039 if (g_str_has_prefix(t,"cm"))
3041 if (g_str_has_prefix(t,"cd"))
3043 while (g_utf8_get_char(t)=='c' && *t)
3045 if (g_str_has_prefix(t,"xl"))
3047 if (g_str_has_prefix(t,"xc"))
3049 if (g_utf8_get_char(t)=='l')
3051 while (g_utf8_get_char(t)=='x' && *t)
3053 if (g_str_has_prefix(t,"ix"))
3055 if (g_str_has_prefix(t,"iv"))
3057 if (g_utf8_get_char(t)=='v')
3059 while (g_utf8_get_char(t)=='i' && *t)
3065 * postprocess_for_DP:
3067 * Invoked with the -d switch from flgets().
3068 * It simply "removes" from the line a hard-coded set of common
3069 * DP-specific tags, so that the line passed to the main routine has
3070 * been pre-cleaned of DP markup.
3072 void postprocess_for_DP(char *theline)
3078 for (i=0;*DPmarkup[i];i++)
3079 while ((s=strstr(theline,DPmarkup[i])))
3081 t=s+strlen(DPmarkup[i]);
3082 memmove(s,t,strlen(t)+1);
/*
 * postprocess_for_HTML:
 *
 * Invoked with the -m switch from flgets().
 * Calls losemarkup() on the line until it reports that nothing more was
 * removed, then hands the line to loseentities(), so that the line
 * passed to the main routine has been pre-cleaned of HTML.
 */
void postprocess_for_HTML(char *theline)
{
    for (;;)
    {
        if (!losemarkup(theline))
            break;
    }
    loseentities(theline);
}
3102 char *losemarkup(char *theline)
3106 s=strchr(theline,'<');
3107 t=s?strchr(s,'>'):NULL;
3110 for (i=0;*markup[i];i++)
3111 if (tagcomp(g_utf8_next_char(s),markup[i]))
3113 t=g_utf8_next_char(t);
3114 memmove(s,t,strlen(t)+1);
3117 /* It's an unrecognized <xxx>. */
3121 void loseentities(char *theline)
3128 GTree *entities=NULL;
3129 static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
3133 g_tree_destroy(entities);
3135 if (translit!=(GIConv)-1)
3136 g_iconv_close(translit);
3137 translit=(GIConv)-1;
3138 if (to_utf8!=(GIConv)-1)
3139 g_iconv_close(to_utf8);
3147 entities=g_tree_new((GCompareFunc)strcmp);
3148 for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
3149 g_tree_insert(entities,HTMLentities[i].name,
3150 GUINT_TO_POINTER(HTMLentities[i].c));
3152 if (translit==(GIConv)-1)
3153 translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
3154 if (to_utf8==(GIConv)-1)
3155 to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
3156 while((amp=strchr(theline,'&')))
3158 scolon=strchr(amp,';');
3163 if (amp+2+strspn(amp+2,"0123456789")==scolon)
3164 c=strtol(amp+2,NULL,10);
3165 else if (amp[2]=='x' &&
3166 amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
3167 c=strtol(amp+3,NULL,16);
3171 s=g_strndup(amp+1,scolon-(amp+1));
3172 c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
3181 if (c<128 || c>=192 && c<=255) /* An ISO-8859-1 character */
3182 theline+=g_unichar_to_utf8(c,theline);
3186 nb=g_unichar_to_utf8(c,s);
3187 t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
3189 s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
3191 memcpy(theline,s,nb);
3195 memmove(theline,g_utf8_next_char(scolon),
3196 strlen(g_utf8_next_char(scolon))+1);
3199 theline=g_utf8_next_char(amp);
3203 gboolean tagcomp(const char *strin,const char *basetag)
3207 if (g_utf8_get_char(strin)=='/')
3208 t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
3210 t=g_utf8_casefold(strin,-1);
3211 s=g_utf8_casefold(basetag,-1);
3212 retval=g_str_has_prefix(t,s);
3218 void proghelp(GOptionContext *context)
3221 fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
3222 fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
3223 fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
3224 fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
3225 "For details, read the file COPYING.\n",stderr);
3226 fputs("This is Free Software; "
3227 "you may redistribute it under certain conditions (GPL);\n",stderr);
3228 fputs("read the file COPYING for details.\n\n",stderr);
3229 help=g_option_context_get_help(context,TRUE,NULL);
3232 fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
3233 fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
3234 "non-ASCII\n",stderr);
3235 fputs("characters like accented letters, "
3236 "lines longer than 75 or shorter than 55,\n",stderr);
3237 fputs("unbalanced quotes or brackets, "
3238 "a variety of badly formatted punctuation, \n",stderr);
3239 fputs("HTML tags, some likely typos. "
3240 "It is NOT a substitute for human judgement.\n",stderr);