1 /*************************************************************************/
2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com> */
5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk> */
7 /* This program is free software; you can redistribute it and/or modify */
8 /* it under the terms of the GNU General Public License as published by */
9 /* the Free Software Foundation; either version 2 of the License, or */
10 /* (at your option) any later version. */
12 /* This program is distributed in the hope that it will be useful, */
13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
15 /* GNU General Public License for more details. */
17 /* You should have received a copy of the GNU General Public License */
18 /* along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 /*************************************************************************/
30 #include "HTMLentities.h"
36 "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
37 "nad", "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa",
38 "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
39 "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
40 "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
41 "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
42 "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
43 "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
44 "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
45 "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
46 "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
47 "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
48 "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
49 "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
50 "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
51 "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
52 "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
53 "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
54 "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
55 "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
56 "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
57 "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
58 "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
59 "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
60 "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
61 "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
62 "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
63 "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
64 "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
70 /* Common abbreviations and other OK words not to query as typos. */
72 "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
73 "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
74 "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
75 "outbid", "outbids", "frostbite", "frostbitten", ""
78 /* Common abbreviations that cause otherwise unexplained periods. */
80 "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
81 "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
85 * Two-Letter combinations that rarely if ever start words,
86 * but are common scannos or otherwise common letter combinations.
89 "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
93 * Two-Letter combinations that rarely if ever end words,
94 * but are common scannos or otherwise common letter combinations.
97 "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
98 "sw", "gr", "sl", "cl", "iy", ""
102 "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
103 "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
104 "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
105 "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
109 "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
113 "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
114 "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
115 "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
116 "during", "let", "toward", "among", ""
120 "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
121 "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
122 "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
123 "among", "those", "into", "whom", "having", "thence", ""
126 /* special characters */
/*
 * The numeric values below are the ASCII codes of the characters the
 * names describe: 32 space, 34 '"', 39 ''', 96 '`', 126 '~', 42 '*',
 * 47 '/' and 94 '^'.  The bracket and underscore macros use character
 * constants instead.
 * NOTE(review): CHAR_TAB, CHAR_LF and CHAR_CR are used by
 * check_for_control_characters() but are not defined in the lines
 * visible here - confirm they are defined alongside these.
 */
127 #define CHAR_SPACE 32
131 #define CHAR_DQUOTE 34
132 #define CHAR_SQUOTE 39
133 #define CHAR_OPEN_SQUOTE 96
134 #define CHAR_TILDE 126
135 #define CHAR_ASTERISK 42
136 #define CHAR_FORESLASH 47
137 #define CHAR_CARAT 94
139 #define CHAR_UNDERSCORE '_'
140 #define CHAR_OPEN_CBRACK '{'
141 #define CHAR_CLOSE_CBRACK '}'
142 #define CHAR_OPEN_RBRACK '('
143 #define CHAR_CLOSE_RBRACK ')'
144 #define CHAR_OPEN_SBRACK '['
145 #define CHAR_CLOSE_SBRACK ']'
147 /* longest and shortest normal PG line lengths */
/*
 * Lengths are compared against character counts (g_utf8_strlen), not
 * byte counts: more than LONGEST_PG_LINE is "long", more than
 * WAY_TOO_LONG is "very long", fewer than SHORTEST_PG_LINE is "short".
 */
148 #define LONGEST_PG_LINE 75
149 #define WAY_TOO_LONG 80
150 #define SHORTEST_PG_LINE 55
170 gboolean pswit[SWITNO]; /* program switches */
172 static GOptionEntry options[]={
/*
 * Command-line option table handed to GOption by parse_options().
 * Every entry is a plain flag (G_OPTION_ARG_NONE) whose gboolean
 * result is written straight into the matching pswit[] element.
 *
 * parse_options() inverts PARANOID_SWITCH, LINE_END_SWITCH and
 * ECHO_SWITCH after parsing, so for --relaxed, --line-end and --noecho
 * a TRUE stored here means the feature is being switched OFF.
 *
 * --usertypo takes no file name; read_user_scannos() looks for fixed
 * file names instead.
 */
173 { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
174 "Ignore DP-specific markup", NULL },
175 { "noecho", 'e', 0, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
176 "Don't echo queried line", NULL },
177 { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
178 "Check single quotes", NULL },
179 { "typo", 't', 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
180 "Check common typos", NULL },
181 { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
182 "Require closure of quotes on every paragraph", NULL },
183 { "relaxed", 'x', 0, G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
184 "Disable paranoid querying of everything", NULL },
185 { "line-end", 'l', 0, G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
186 "Disable line end checking", NULL },
187 { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
188 "Overview: just show counts", NULL },
189 { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
190 "Output errors to stdout instead of stderr", NULL },
191 { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
192 "Echo header fields", NULL },
193 { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
194 "Ignore markup in < >", NULL },
195 { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
196 "Use file of user-defined typos", NULL },
197 { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
198 "Defaults for use on www upload", NULL },
199 { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
200 "Verbose - list everything", NULL },
/*
 * File-scope tallies.  main() prints the cnt_* values in overview mode
 * and adds twelve of them into "TOTAL QUERIES"; cnt_spacend, linecnt
 * and checked_linecnt are not part of that total.
 */
204 long cnt_dquot; /* for overview mode, count of doublequote queries */
205 long cnt_squot; /* for overview mode, count of singlequote queries */
206 long cnt_brack; /* for overview mode, count of brackets queries */
207 long cnt_bin; /* for overview mode, count of non-ASCII queries */
208 long cnt_odd; /* for overview mode, count of odd character queries */
209 long cnt_long; /* for overview mode, count of long line errors */
210 long cnt_short; /* for overview mode, count of short line queries */
211 long cnt_punct; /* for overview mode,
212 count of punctuation and spacing queries */
213 long cnt_dash; /* for overview mode, count of dash-related queries */
214 long cnt_word; /* for overview mode, count of word queries */
215 long cnt_html; /* for overview mode, count of html queries */
216 long cnt_lineend; /* for overview mode, count of line-end queries */
217 long cnt_spacend; /* count of lines with space at end */
218 long linecnt; /* count of total lines in the file */
219 long checked_linecnt; /* count of lines actually checked */
221 void proghelp(GOptionContext *context);
222 void procfile(const char *);
226 gboolean mixdigit(const char *);
227 gchar *getaword(const char **);
228 char *flgets(char **,long);
229 void postprocess_for_HTML(char *);
230 char *linehasmarkup(char *);
231 char *losemarkup(char *);
232 gboolean tagcomp(const char *,const char *);
233 void loseentities(char *);
234 gboolean isroman(const char *);
235 void postprocess_for_DP(char *);
236 void print_as_windows_1252(const char *string);
237 void print_as_utf_8(const char *string);
239 GTree *qword,*qperiod;
245 struct first_pass_results {
246 long firstline,astline;
247 long footerline,totlen,binlen,alphalen,endquote_count,shortline,dotcomma;
248 long fslashline,hyphens,longline,verylongline,htmcount,standalone_digit;
249 long spacedash,emdash,space_emdash,non_PG_space_emdash,PG_space_emdash;
250 int Dutchcount,Frenchcount;
254 int shortline,longline,bin,dash,dotcomma,ast,fslash,digit,hyphen;
256 gboolean isDutch,isFrench;
261 int c_unders,c_brack,s_brack,r_brack;
262 int open_single_quote,close_single_quote;
265 struct line_properties {
266 unsigned int len,blen;
275 char *dquote,*squote,*rbrack,*sbrack,*cbrack,*unders;
279 void parse_options(int *argc,char ***argv)
/*
 * parse_options:
 *
 * Parse the command line with GOption, storing each flag in pswit[]
 * through the `options' table, then post-process the switches:
 * PARANOID, LINE_END and ECHO are inverted (their options turn the
 * feature off), TYPO is inverted when paranoid mode is in effect,
 * OVERVIEW forces ECHO off, and WEB overwrites every switch assigned
 * below with a fixed value.  The option context is freed at the end.
 *
 * argc, argv: addresses of main()'s arguments, handed on unchanged to
 * g_option_context_parse().
 */
282 GOptionContext *context;
283 context=g_option_context_new(
284 "file - looks for errors in Project Gutenberg(TM) etexts");
285 g_option_context_add_main_entries(context,options,NULL);
286 if (!g_option_context_parse(context,argc,argv,&err))
/* Parsing failed: report GLib's message and a usage hint on stderr. */
288 g_printerr("Bookloupe: %s\n",err->message);
289 g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
292 /* Paranoid checking is turned OFF, not on, by its switch */
293 pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
/*
 * NOTE(review): in paranoid mode (the default) the typo flag is
 * inverted, so passing --typo ("Check common typos") switches typo
 * checking OFF unless --relaxed is also given - confirm this is the
 * intended meaning of the option.
 */
294 if (pswit[PARANOID_SWITCH])
295 /* if running in paranoid mode, typo checks default to enabled */
296 pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
297 /* Line-end checking is turned OFF, not on, by its switch */
298 pswit[LINE_END_SWITCH]=!pswit[LINE_END_SWITCH];
299 /* Echoing is turned OFF, not on, by its switch */
300 pswit[ECHO_SWITCH]=!pswit[ECHO_SWITCH];
301 if (pswit[OVERVIEW_SWITCH])
302 /* just print summary; don't echo */
303 pswit[ECHO_SWITCH]=FALSE;
305 * Web uploads - for the moment, this is really just a placeholder
306 * until we decide what processing we really want to do on web uploads
308 if (pswit[WEB_SWITCH])
310 /* specific override for web uploads */
311 pswit[ECHO_SWITCH]=TRUE;
312 pswit[SQUOTE_SWITCH]=FALSE;
313 pswit[TYPO_SWITCH]=TRUE;
314 pswit[QPARA_SWITCH]=FALSE;
315 pswit[PARANOID_SWITCH]=TRUE;
316 pswit[LINE_END_SWITCH]=FALSE;
317 pswit[OVERVIEW_SWITCH]=FALSE;
318 pswit[STDOUT_SWITCH]=FALSE;
319 pswit[HEADER_SWITCH]=TRUE;
320 pswit[VERBOSE_SWITCH]=FALSE;
321 pswit[MARKUP_SWITCH]=FALSE;
322 pswit[USERTYPO_SWITCH]=FALSE;
323 pswit[DP_SWITCH]=FALSE;
330 g_option_context_free(context);
336 * Read in the user-defined stealth scanno list.
338 void read_user_scannos(void)
/*
 * read_user_scannos:
 *
 * Load the user typo list into the global `usertypo' tree.  The file
 * is looked for under four names, each tried only when the previous
 * attempt failed with G_FILE_ERROR_NOENT:
 *   "bookloupe.typ", <running_from>/bookloupe.typ,
 *   "gutcheck.typ",  <running_from>/gutcheck.typ.
 * If the last is missing too, a notice is printed; any other error is
 * written to stderr with the file name.
 *
 * The text is used as-is (normalised) when it is valid UTF-8, and is
 * otherwise converted from WINDOWS-1252.  It is split on CR and LF,
 * and every piece whose first byte is greater than '!' becomes a key
 * of the tree, which frees its keys with g_free().
 */
341 gchar *usertypo_file;
345 gchar *contents,*utf8,**lines;
346 usertypo_file=g_strdup("bookloupe.typ");
347 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
/*
 * NOTE(review): `err' is passed again to each retry below.  GLib
 * requires the GError pointer to be NULL on entry, so it must be
 * cleared (g_clear_error) after every NOENT result; that reset is not
 * visible in these lines - confirm it is present.
 */
348 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
351 g_free(usertypo_file);
352 usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
353 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
355 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
358 g_free(usertypo_file);
359 usertypo_file=g_strdup("gutcheck.typ");
360 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
362 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
365 g_free(usertypo_file);
366 usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
367 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
369 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
371 g_free(usertypo_file);
372 g_print(" --> I couldn't find bookloupe.typ "
373 "-- proceeding without user typos.\n");
378 fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
379 g_free(usertypo_file);
383 if (g_utf8_validate(contents,len,NULL))
384 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
/*
 * NOTE(review): g_convert() returns NULL on failure and is called here
 * with a NULL error pointer; no check of `utf8' is visible before it
 * is split - confirm.
 */
386 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
388 lines=g_strsplit_set(utf8,"\r\n",0);
/*
 * NOTE(review): strcmp takes two arguments but is cast to the
 * three-argument GCompareDataFunc type; calling through a mismatched
 * function pointer type is undefined behaviour in ISO C even though it
 * works with common calling conventions.  A small wrapper would be
 * safer.  Also, inserted strings are owned by the tree afterwards,
 * while pieces that are skipped stay owned by `lines' - confirm the
 * clean-up that follows frees only those.
 */
390 usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
391 for (i=0;lines[i];i++)
392 if (*(unsigned char *)lines[i]>'!')
393 g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
402 * Read an etext returning a newly allocated string containing the file
403 * contents or NULL on error.
405 gchar *read_etext(const char *filename,GError **err)
/*
 * filename: path handed to g_file_get_contents().
 * err:      receives the GLib error if the file cannot be read.
 *
 * Valid UTF-8 input is normalised (default-compose) and output is
 * routed through print_as_utf_8; anything else is converted from
 * WINDOWS-1252 and output is routed through print_as_windows_1252.
 * The console code page is switched to match in each case.
 *
 * NOTE(review): SetConsoleOutputCP() is a Win32 call, presumably
 * inside a platform conditional that is not visible here.  The
 * g_convert() call is given a NULL error pointer, so a conversion
 * failure would not be reported through `err' - confirm how a NULL
 * result is handled.
 */
407 gchar *contents,*utf8;
409 if (!g_file_get_contents(filename,&contents,&len,err))
411 if (g_utf8_validate(contents,len,NULL))
413 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
414 g_set_print_handler(print_as_utf_8);
416 SetConsoleOutputCP(CP_UTF8);
421 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
422 g_set_print_handler(print_as_windows_1252);
424 SetConsoleOutputCP(1252);
431 void cleanup_on_exit(void)
/*
 * atexit() handler registered by main(): puts the console output code
 * page back to the value main() stored in saved_cp.
 */
434 SetConsoleOutputCP(saved_cp);
438 int main(int argc,char **argv)
/*
 * Program entry point.  Registers cleanup_on_exit(), remembers the
 * console code page and the directory of argv[0] (running_from),
 * parses the options, prints a banner on stderr and, in overview
 * mode, prints the per-category query counts and their total.
 * running_from and the usertypo tree are released before returning.
 */
441 atexit(cleanup_on_exit);
442 saved_cp=GetConsoleOutputCP();
444 running_from=g_path_get_dirname(argv[0]);
445 parse_options(&argc,&argv);
446 if (pswit[USERTYPO_SWITCH])
448 fprintf(stderr,"bookloupe: Check and report on an e-text\n");
450 if (pswit[OVERVIEW_SWITCH])
452 g_print(" Checked %ld lines of %ld (head+foot = %ld)\n\n",
453 checked_linecnt,linecnt,linecnt-checked_linecnt);
454 g_print(" --------------- Queries found --------------\n");
456 g_print(" Long lines: %14ld\n",cnt_long);
458 g_print(" Short lines: %14ld\n",cnt_short);
460 g_print(" Line-end problems: %14ld\n",cnt_lineend);
462 g_print(" Common typos: %14ld\n",cnt_word);
464 g_print(" Unmatched quotes: %14ld\n",cnt_dquot);
466 g_print(" Unmatched SingleQuotes: %14ld\n",cnt_squot);
468 g_print(" Unmatched brackets: %14ld\n",cnt_brack);
470 g_print(" Non-ASCII characters: %14ld\n",cnt_bin);
472 g_print(" Proofing characters: %14ld\n",cnt_odd);
474 g_print(" Punctuation & spacing queries: %14ld\n",cnt_punct);
476 g_print(" Non-standard dashes: %14ld\n",cnt_dash);
478 g_print(" Possible HTML tags: %14ld\n",cnt_html);
/* The total adds all twelve query counters; cnt_spacend is left out. */
480 g_print(" TOTAL QUERIES %14ld\n",
481 cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+
482 cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend);
484 g_free(running_from);
486 g_tree_unref(usertypo);
493 * Run a first pass - verify that it's a valid PG
494 * file, decide whether to report some things that
495 * occur many times in the text like long or short
496 * lines, non-standard dashes, etc.
498 struct first_pass_results *first_pass(const char *etext)
/*
 * etext: the whole text as one NUL-terminated UTF-8 string.
 *
 * The text is split on "\n" and trailing '\r' bytes are stripped from
 * each line.  Per line the function looks for PG header and footer
 * markers and accumulates statistics (lengths, hi-bit and alphabetic
 * characters, dashes, slashes, markup-like text, a few Dutch/French
 * marker words, ...) in `results'.
 *
 * `results' has static storage and is initialised only once, so the
 * figures persist between calls.
 * NOTE(review): the return statement is not visible here; presumably
 * the address of `results' is returned - confirm.
 */
500 gunichar laststart=CHAR_SPACE;
505 unsigned int lastlen=0,lastblen=0;
506 long spline=0,nspline=0;
507 static struct first_pass_results results={0};
509 lines=g_strsplit(etext,"\n",0);
510 for (j=0;lines[j];j++)
512 lbytes=strlen(lines[j]);
/*
 * NOTE(review): for an empty line lbytes is 0 and the test below reads
 * lines[j][lbytes-1], i.e. one byte before the string (out of bounds).
 * The condition should be guarded:
 *     while (lbytes>0 && lines[j][lbytes-1]=='\r')
 */
513 while (lines[j][lbytes-1]=='\r')
514 lines[j][--lbytes]='\0';
515 llen=g_utf8_strlen(lines[j],lbytes);
517 if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
518 (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
521 g_print(" --> Duplicate header?\n");
522 spline=linecnt+1; /* first line of non-header text, that is */
524 if (!strncmp(lines[j],"*** START",9) &&
525 strstr(lines[j],"PROJECT GUTENBERG"))
528 g_print(" --> Duplicate header?\n");
529 nspline=linecnt+1; /* first line of non-header text, that is */
531 if (spline || nspline)
533 lc_line=g_utf8_strdown(lines[j],lbytes);
534 if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
536 if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
538 if (results.footerline)
540 /* it's an old-form header - we can detect duplicates */
542 g_print(" --> Duplicate footer?\n");
545 results.footerline=linecnt;
551 results.firstline=spline;
553 results.firstline=nspline; /* override with new */
554 if (results.footerline)
555 continue; /* don't count the boilerplate in the footer */
556 results.totlen+=llen;
557 for (s=lines[j];*s;s=g_utf8_next_char(s))
559 if (g_utf8_get_char(s)>127)
561 if (g_unichar_isalpha(g_utf8_get_char(s)))
/*
 * NOTE(review): the test below passes a gunichar to isalpha().  The
 * <ctype.h> functions are only defined for values representable as
 * unsigned char (or EOF), so any preceding character above 255 is
 * undefined behaviour.  g_unichar_isalpha(), used just above, is the
 * matching call.
 */
563 if (s>lines[j] && g_utf8_get_char(s)==CHAR_DQUOTE &&
564 isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
565 results.endquote_count++;
567 if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
568 lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
571 g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
573 if (strstr(lines[j],".,"))
575 /* only count ast lines for ignoring purposes where there is */
576 /* locase text on the line */
577 if (strchr(lines[j],'*'))
579 for (s=lines[j];*s;s=g_utf8_next_char(s))
580 if (g_unichar_islower(g_utf8_get_char(s)))
585 if (strchr(lines[j],'/'))
586 results.fslashline++;
/*
 * Walk back from the end of the line over trailing characters that
 * are <= space, to find the last significant character.
 * NOTE(review): g_utf8_prev_char() does not check for the start of the
 * string; when the line is empty (lbytes==0) the first call looks at
 * the byte before lines[j].  No guard for that case is visible here -
 * confirm.
 */
587 for (s=g_utf8_prev_char(lines[j]+lbytes);
588 s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
590 if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
591 g_utf8_get_char(g_utf8_prev_char(s))!='-')
593 if (llen>LONGEST_PG_LINE)
595 if (llen>WAY_TOO_LONG)
596 results.verylongline++;
597 if (strchr(lines[j],'<') && strchr(lines[j],'>'))
/* i = byte distance from the first '<' to the first '>', inclusive. */
599 i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
602 if (strstr(lines[j],"<i>"))
603 results.htmcount+=4; /* bonus marks! */
605 /* Check for spaced em-dashes */
/*
 * The search starts at the second character, so s[-1] below is always
 * inside the line; s[2] is at worst the terminating NUL.
 */
606 if (lines[j][0] && (s=strstr(g_utf8_next_char(lines[j]),"--")))
609 if (s[-1]==CHAR_SPACE || s[2]==CHAR_SPACE)
610 results.space_emdash++;
611 if (s[-1]==CHAR_SPACE && s[2]==CHAR_SPACE)
612 /* count of em-dashes with spaces both sides */
613 results.non_PG_space_emdash++;
614 if (s[-1]!=CHAR_SPACE && s[2]!=CHAR_SPACE)
615 /* count of PG-type em-dashes with no spaces */
616 results.PG_space_emdash++;
621 if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))
622 results.Dutchcount++;
623 if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))
624 results.Frenchcount++;
625 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
626 results.standalone_digit++;
629 /* Check for spaced dashes */
630 if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
/* Only the first byte of the line is kept, not a decoded character. */
634 laststart=lines[j][0];
643 * Make some snap decisions based on the first pass results.
645 struct warnings *report_first_pass(struct first_pass_results *results)
/*
 * results: figures gathered by first_pass(); firstline may be reset to
 *          0 at the end of this function.
 *
 * Prints advisory "-->" messages and fills the function-static
 * `warnings' structure.  Visible thresholds: '.,' on more than 5
 * lines; short or long lines on more than 50 lines or more than a
 * tenth of linecnt; asterisks or forward slashes on more than 10
 * lines; unpunctuated endquotes or line-end hyphens on more than 20;
 * standalone digits on more than 10; htmcount above 20 turns
 * MARKUP_SWITCH on; Dutchcount or Frenchcount above 50 sets isDutch or
 * isFrench.
 *
 * NOTE(review): several branch bodies and the return statement are
 * not visible here; presumably the address of `warnings' is returned -
 * confirm.
 */
647 static struct warnings warnings={0};
649 g_print(" --> %ld lines in this file have white space at end\n",
652 if (results->dotcomma>5)
655 g_print(" --> %ld lines in this file contain '.,'. "
656 "Not reporting them.\n",results->dotcomma);
659 * If more than 50 lines, or one-tenth, are short,
660 * don't bother reporting them.
662 warnings.shortline=1;
663 if (results->shortline>50 || results->shortline*10>linecnt)
665 warnings.shortline=0;
666 g_print(" --> %ld lines in this file are short. "
667 "Not reporting short lines.\n",results->shortline);
670 * If more than 50 lines, or one-tenth, are long,
671 * don't bother reporting them.
674 if (results->longline>50 || results->longline*10>linecnt)
677 g_print(" --> %ld lines in this file are long. "
678 "Not reporting long lines.\n",results->longline);
680 /* If more than 10 lines contain asterisks, don't bother reporting them. */
682 if (results->astline>10)
685 g_print(" --> %ld lines in this file contain asterisks. "
686 "Not reporting them.\n",results->astline);
689 * If more than 10 lines contain forward slashes,
690 * don't bother reporting them.
693 if (results->fslashline>10)
696 g_print(" --> %ld lines in this file contain forward slashes. "
697 "Not reporting them.\n",results->fslashline);
700 * If more than 20 lines contain unpunctuated endquotes,
701 * don't bother reporting them.
704 if (results->endquote_count>20)
707 g_print(" --> %ld lines in this file contain unpunctuated endquotes. "
708 "Not reporting them.\n",results->endquote_count);
711 * If more than 15 lines contain standalone digits,
712 * don't bother reporting them.
715 if (results->standalone_digit>10)
/*
 * NOTE(review): the accompanying comment says "more than 15 lines" but
 * the test above uses 10.  Decide which threshold is intended and make
 * the comment and the code agree.
 */
718 g_print(" --> %ld lines in this file contain standalone 0s and 1s. "
719 "Not reporting them.\n",results->standalone_digit);
722 * If more than 20 lines contain hyphens at end,
723 * don't bother reporting them.
726 if (results->hyphens>20)
729 g_print(" --> %ld lines in this file have hyphens at end. "
730 "Not reporting them.\n",results->hyphens);
732 if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
734 g_print(" --> Looks like this is HTML. Switching HTML mode ON.\n");
735 pswit[MARKUP_SWITCH]=1;
737 if (results->verylongline>0)
738 g_print(" --> %ld lines in this file are VERY long!\n",
739 results->verylongline);
741 * If there are more non-PG spaced dashes than PG em-dashes,
742 * assume it's deliberate.
743 * Current PG guidelines say don't use them, but older texts do,
744 * and some people insist on them whatever the guidelines say.
747 if (results->spacedash+results->non_PG_space_emdash>
748 results->PG_space_emdash)
751 g_print(" --> There are %ld spaced dashes and em-dashes. "
752 "Not reporting them.\n",
753 results->spacedash+results->non_PG_space_emdash);
755 /* If more than a quarter of characters are hi-bit, bug out. */
757 if (results->binlen*4>results->totlen)
759 g_print(" --> This file does not appear to be ASCII. "
760 "Terminating. Best of luck with it!\n");
/* Fewer than a quarter of the characters are alphabetic. */
763 if (results->alphalen*4<results->totlen)
765 g_print(" --> This file does not appear to be text. "
766 "Terminating. Best of luck with it!\n");
/* More than 1% hi-bit characters, or more than 100 of them in total. */
769 if (results->binlen*100>results->totlen || results->binlen>100)
771 g_print(" --> There are a lot of foreign letters here. "
772 "Not reporting them.\n");
775 warnings.isDutch=FALSE;
776 if (results->Dutchcount>50)
778 warnings.isDutch=TRUE;
779 g_print(" --> This looks like Dutch - "
780 "switching off dashes and warnings for 's Middags case.\n");
782 warnings.isFrench=FALSE;
783 if (results->Frenchcount>50)
785 warnings.isFrench=TRUE;
786 g_print(" --> This looks like French - "
787 "switching off some doublepunct.\n");
789 if (results->firstline && results->footerline)
790 g_print(" The PG header and footer appear to be already on.\n");
793 if (results->firstline)
794 g_print(" The PG header is on - no footer.\n");
795 if (results->footerline)
796 g_print(" The PG footer is on - no header.\n");
799 if (pswit[VERBOSE_SWITCH])
802 warnings.shortline=1;
811 g_print(" *** Verbose output is ON -- you asked for it! ***\n");
813 if (warnings.isDutch)
/*
 * Header and footer both found, in order, but fewer than 100 lines
 * apart: the start marker is discarded (firstline reset to 0).
 */
815 if (results->footerline>0 && results->firstline>0 &&
816 results->footerline>results->firstline &&
817 results->footerline-results->firstline<100)
819 g_print(" --> I don't really know where this text starts. \n");
820 g_print(" There are no reference points.\n");
821 g_print(" I'm going to have to report the header and footer "
823 results->firstline=0;
831 * Look along the line, accumulate the count of quotes, and see
832 * if this is an empty line - i.e. a line with nothing on it
834 * If line has just spaces, period, * and/or - on it, don't
835 * count it, since empty lines with asterisks or dashes to
836 * separate sections are common.
838 * Returns: TRUE if the line is empty.
840 gboolean analyse_quotes(const char *aline,struct counters *counters)
/*
 * aline:    one line of UTF-8 text.
 * counters: running tallies; this function increments
 *           open_single_quote, close_single_quote and c_unders in the
 *           lines visible here (the bracket tests presumably update
 *           the corresponding bracket counters - bodies not visible).
 *
 * For ' and ` the surrounding characters decide whether the mark is
 * counted as an opening quote, treated as an apostrophe, or scored via
 * `guessquote' as a possible closing quote.  "'tis" and "'Tis" are
 * never counted as opening quotes.
 */
843 /* assume the line is empty until proven otherwise */
844 gboolean isemptyline=TRUE;
845 const char *s=aline,*sprev,*snext;
850 snext=g_utf8_next_char(s);
851 c=g_utf8_get_char(s);
854 if (c==CHAR_SQUOTE || c==CHAR_OPEN_SQUOTE)
859 * At start of line, it can only be an openquote.
860 * Hardcode a very common exception!
862 if (!g_str_has_prefix(snext,"tis") &&
863 !g_str_has_prefix(snext,"Tis"))
864 counters->open_single_quote++;
866 else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
867 g_unichar_isalpha(g_utf8_get_char(snext)))
868 /* Do nothing! it's definitely an apostrophe, not a quote */
870 /* it's outside a word - let's check it out */
871 else if (c==CHAR_OPEN_SQUOTE ||
872 g_unichar_isalpha(g_utf8_get_char(snext)))
874 /* it damwell better BE an openquote */
875 if (!g_str_has_prefix(snext,"tis") &&
876 !g_str_has_prefix(snext,"Tis"))
877 /* hardcode a very common exception! */
878 counters->open_single_quote++;
882 /* now - is it a closequote? */
883 guessquote=0; /* accumulate clues */
884 if (g_unichar_isalpha(g_utf8_get_char(sprev)))
886 /* it follows a letter - could be either */
888 if (g_utf8_get_char(sprev)=='s')
890 /* looks like a plural apostrophe */
892 if (g_utf8_get_char(snext)==CHAR_SPACE)
897 /* it doesn't have a letter either side */
/*
 * NOTE(review): strchr() converts its second argument to char, so a
 * gunichar above 0xFF is truncated before the comparison and can match
 * by accident; a value of 0 (end of line) matches the terminator of
 * the punctuation string, so end of line is treated as a match.
 * g_utf8_strchr(), as used in check_for_starting_punctuation(), avoids
 * the truncation - confirm whether the end-of-line match is wanted.
 */
898 else if (strchr(".?!,;:",g_utf8_get_char(sprev)) &&
899 strchr(".?!,;: ",g_utf8_get_char(snext)))
900 guessquote+=8; /* looks like a closequote */
903 if (counters->open_single_quote>counters->close_single_quote)
905 * Give it the benefit of some doubt,
906 * if a squote is already open.
912 counters->close_single_quote++;
915 if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
917 isemptyline=FALSE; /* ignore lines like * * * as spacers */
918 if (c==CHAR_UNDERSCORE)
919 counters->c_unders++;
920 if (c==CHAR_OPEN_CBRACK)
922 if (c==CHAR_CLOSE_CBRACK)
924 if (c==CHAR_OPEN_RBRACK)
926 if (c==CHAR_CLOSE_RBRACK)
928 if (c==CHAR_OPEN_SBRACK)
930 if (c==CHAR_CLOSE_SBRACK)
939 * check_for_control_characters:
941 * Check for control characters in the line.
942 * Any character below 32 (space) other than line feed,
943 * carriage return and tab is queried.  Characters above 127
944 * and tabs are not examined by this function.
946 void check_for_control_characters(const char *aline)
/*
 * aline: one line of UTF-8 text.
 *
 * Scans the line character by character and queries any character
 * below CHAR_SPACE that is not LF, CR or TAB.  The line is echoed
 * first when ECHO_SWITCH is set; the query itself is printed unless
 * OVERVIEW_SWITCH is set.
 */
950 for (s=aline;*s;s=g_utf8_next_char(s))
952 c=g_utf8_get_char(s);
953 if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
955 if (pswit[ECHO_SWITCH])
956 g_print("\n%s\n",aline);
/*
 * NOTE(review): the arguments of g_utf8_pointer_to_offset() are
 * swapped in the call below.  The function takes (str, pos); every
 * other caller in this file passes (aline,s).  With (s,aline) the
 * offset is negative, so the reported column is wrong for anything
 * past column 1.  It should read
 *     g_utf8_pointer_to_offset(aline,s)+1
 */
957 if (!pswit[OVERVIEW_SWITCH])
958 g_print(" Line %ld column %ld - Control character %u\n",
959 linecnt,g_utf8_pointer_to_offset(s,aline)+1,c);
967 * check_for_odd_characters:
969 * Check for binary and other odd characters.
971 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
972 gboolean isemptyline)
/*
 * aline:       one line of UTF-8 text.
 * warnings:    fslash and ast enable the forward-slash and asterisk
 *              queries respectively.
 * isemptyline: suppresses the asterisk query when TRUE.
 *
 * Queries non-ASCII or control characters, tabs, tildes, '^', forward
 * slashes and (in paranoid mode only) asterisks.  The e* flags are
 * tested so that each kind is queried at most once per line.
 * NOTE(review): the statements that set the e* flags and update the
 * overview counters are not visible here.
 */
974 /* Don't repeat multiple warnings on one line. */
975 gboolean eNon_A=FALSE,eTab=FALSE,eTilde=FALSE;
976 gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
979 for (s=aline;*s;s=g_utf8_next_char(s))
981 c=g_utf8_get_char(s);
/*
 * && binds tighter than ||, so this reads: (a control character other
 * than tab or newline) OR (anything above 127).
 */
982 if (!eNon_A && (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
984 if (pswit[ECHO_SWITCH])
985 g_print("\n%s\n",aline);
986 if (!pswit[OVERVIEW_SWITCH])
/* 128-159 or above 255: not a printable ISO-8859 code point. */
987 if (c>127 && c<160 || c>255)
988 g_print(" Line %ld column %ld - "
989 "Non-ISO-8859 character %u\n",
990 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
992 g_print(" Line %ld column %ld - "
993 "Non-ASCII character %u\n",
994 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
999 if (!eTab && c==CHAR_TAB)
1001 if (pswit[ECHO_SWITCH])
1002 g_print("\n%s\n",aline);
1003 if (!pswit[OVERVIEW_SWITCH])
1004 g_print(" Line %ld column %ld - Tab character?\n",
1005 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1010 if (!eTilde && c==CHAR_TILDE)
1013 * Often used by OCR software to indicate an
1014 * unrecognizable character.
1016 if (pswit[ECHO_SWITCH])
1017 g_print("\n%s\n",aline);
1018 if (!pswit[OVERVIEW_SWITCH])
1019 g_print(" Line %ld column %ld - Tilde character?\n",
1020 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1025 if (!eCarat && c==CHAR_CARAT)
1027 if (pswit[ECHO_SWITCH])
1028 g_print("\n%s\n",aline);
1029 if (!pswit[OVERVIEW_SWITCH])
1030 g_print(" Line %ld column %ld - Carat character?\n",
1031 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1036 if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
1038 if (pswit[ECHO_SWITCH])
1039 g_print("\n%s\n",aline);
1040 if (!pswit[OVERVIEW_SWITCH])
1041 g_print(" Line %ld column %ld - Forward slash?\n",
1042 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1048 * Report asterisks only in paranoid mode,
1049 * since they're often deliberate.
1051 if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
1054 if (pswit[ECHO_SWITCH])
1055 g_print("\n%s\n",aline);
1056 if (!pswit[OVERVIEW_SWITCH])
1057 g_print(" Line %ld column %ld - Asterisk?\n",
1058 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1067 * check_for_long_line:
1069 * Check for line too long.
1071 void check_for_long_line(const char *aline)
/*
 * aline: one line of UTF-8 text.
 *
 * Queries the line when it holds more than LONGEST_PG_LINE characters
 * (characters, not bytes).  The column reported is the line length,
 * i.e. the query points at the end of the line.
 */
1073 if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
1075 if (pswit[ECHO_SWITCH])
1076 g_print("\n%s\n",aline);
1077 if (!pswit[OVERVIEW_SWITCH])
1078 g_print(" Line %ld column %ld - Long line %ld\n",
1079 linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
1086 * check_for_short_line:
1088 * Check for line too short.
1090 * This one is a bit trickier to implement: we don't want to
1091 * flag the last line of a paragraph for being short, so we
1092 * have to wait until we know that our current line is a
1093 * "normal" line, then report the _previous_ line if it was too
1094 * short. We also don't want to report indented lines like
1095 * chapter heads or formatted quotations. We therefore keep
1096 * last->len as the length of the last line examined, and
1097 * last->blen as the length of the last but one, and try to
1098 * suppress unnecessary warnings by checking that both were of
1099 * "normal" length. We keep the first character of the last
1100 * line in last->start, and if it was a space, we assume that
1101 * the formatting is deliberate. I can't figure out a way to
1102 * distinguish something like a quoted verse left-aligned or
1103 * the header or footer of a letter from a paragraph of short
1104 * lines - maybe if I examined the whole paragraph, and if the
1105 * para has less than, say, 8 lines and if all lines are short,
1106 * then just assume it's OK? Need to look at some texts to see
1107 * how often a formula like this would get the right result.
1109 void check_for_short_line(const char *aline,const struct line_properties *last)
/*
 * aline: the current line.
 * last:  len and start describe the previous line, blen the length of
 *        the line before that.
 *
 * The query is about the PREVIOUS line: it is printed from the global
 * `prevline' with line number linecnt-1, when the current line has
 * more than one character, the previous line is shorter than
 * SHORTEST_PG_LINE (but longer than 1) and did not start with a space,
 * and the line before it was longer than SHORTEST_PG_LINE.
 * The `last->blen>1' test is implied by `last->blen>SHORTEST_PG_LINE'.
 */
1111 if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
1112 last->len<SHORTEST_PG_LINE && last->blen>1 &&
1113 last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
1115 if (pswit[ECHO_SWITCH])
1116 g_print("\n%s\n",prevline);
1117 if (!pswit[OVERVIEW_SWITCH])
1118 g_print(" Line %ld column %ld - Short line %ld?\n",
1119 linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
1126 * check_for_starting_punctuation:
1128 * Look for punctuation other than full ellipses at start of line.
1130 void check_for_starting_punctuation(const char *aline)
/*
 * aline: one line of UTF-8 text.
 *
 * Queries a line whose first character is one of . ? ! , ; : unless
 * the line begins with the spaced ellipsis ". . .".  The `*aline'
 * test keeps an empty line from matching the terminator of the
 * punctuation string.
 */
1132 if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
1133 !g_str_has_prefix(aline,". . ."))
1135 if (pswit[ECHO_SWITCH])
1136 g_print("\n%s\n",aline);
1137 if (!pswit[OVERVIEW_SWITCH])
1138 g_print(" Line %ld column 1 - Begins with punctuation?\n",
1146 * check_for_spaced_emdash:
1148 * Check for spaced em-dashes.
1150 * We must check _all_ occurrences of "--" on the line
1151 * hence the loop - even if the first double-dash is OK
1152 * there may be another that's wrong later on.
1154 void check_for_spaced_emdash(const char *aline)
/*
 * aline: one line of UTF-8 text.
 *
 * Visits every "--" on the line and queries those with a space
 * directly before or directly after.  `next' is the position just
 * past the two hyphens; the search resumes from there.
 */
1156 const char *s,*t,*next;
1157 for (s=aline;t=strstr(s,"--");s=next)
1159 next=g_utf8_next_char(g_utf8_next_char(t));
/*
 * && binds tighter than ||: (not at line start AND preceded by a
 * space) OR followed by a space.
 */
1160 if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
1161 g_utf8_get_char(next)==CHAR_SPACE)
1163 if (pswit[ECHO_SWITCH])
1164 g_print("\n%s\n",aline);
1165 if (!pswit[OVERVIEW_SWITCH])
1166 g_print(" Line %ld column %ld - Spaced em-dash?\n",
1167 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1175 * check_for_spaced_dash:
1177 * Check for spaced dashes.
1179 void check_for_spaced_dash(const char *aline)
/*
 * aline: one line of UTF-8 text.
 *
 * Queries a single hyphen with a space before it (" -" not followed
 * by another '-') or, only when the line contains no " -" at all, a
 * single hyphen with a space after it ("- " not preceded by '-').
 * Unlike check_for_spaced_emdash() only the FIRST occurrence of each
 * pattern is examined.
 */
1182 if ((s=strstr(aline," -")))
1184 if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
1186 if (pswit[ECHO_SWITCH])
1187 g_print("\n%s\n",aline);
1188 if (!pswit[OVERVIEW_SWITCH])
1189 g_print(" Line %ld column %ld - Spaced dash?\n",
1190 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1195 else if ((s=strstr(aline,"- ")))
1197 if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
1199 if (pswit[ECHO_SWITCH])
1200 g_print("\n%s\n",aline);
1201 if (!pswit[OVERVIEW_SWITCH])
1202 g_print(" Line %ld column %ld - Spaced dash?\n",
1203 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1211 * check_for_unmarked_paragraphs:
1213 * Check for unmarked paragraphs indicated by separate speakers.
1215 * May well be false positive:
1216 * "Bravo!" "Wonderful!" called the crowd.
1217 * but useful all the same.
1219 void check_for_unmarked_paragraphs(const char *aline)
1222 s=strstr(aline,"\" \"");
1224 s=strstr(aline,"\" \"");
1227 if (pswit[ECHO_SWITCH])
1228 g_print("\n%s\n",aline);
1229 if (!pswit[OVERVIEW_SWITCH])
1230 g_print(" Line %ld column %ld - "
1231 "Query missing paragraph break?\n",
1232 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1239 * check_for_jeebies:
1241 * Check for "to he" and other easy h/b errors.
1243 * This is a very inadequate effort on the h/b problem,
1244 * but the phrase "to he" is always an error, whereas "to
1245 * be" is quite common.
1246 * Similarly, '"Quiet!", be said.' is a non-be error
1247 * "to he" is _not_ always an error!:
1248 * "Where they went to he couldn't say."
1249 * Another false positive:
1250 * What would "Cinderella" be without the . . .
1251 * and another: "If he wants to he can see for himself."
1253 void check_for_jeebies(const char *aline)
1256 s=strstr(aline," be could ");
1258 s=strstr(aline," be would ");
1260 s=strstr(aline," was be ");
1262 s=strstr(aline," be is ");
1264 s=strstr(aline," is be ");
1266 s=strstr(aline,"\", be ");
1268 s=strstr(aline,"\" be ");
1270 s=strstr(aline,"\" be ");
1272 s=strstr(aline," to he ");
1275 if (pswit[ECHO_SWITCH])
1276 g_print("\n%s\n",aline);
1277 if (!pswit[OVERVIEW_SWITCH])
1278 g_print(" Line %ld column %ld - Query he/be error?\n",
1279 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1283 s=strstr(aline," the had ");
1285 s=strstr(aline," a had ");
1287 s=strstr(aline," they bad ");
1289 s=strstr(aline," she bad ");
1291 s=strstr(aline," he bad ");
1293 s=strstr(aline," you bad ");
1295 s=strstr(aline," i bad ");
1298 if (pswit[ECHO_SWITCH])
1299 g_print("\n%s\n",aline);
1300 if (!pswit[OVERVIEW_SWITCH])
1301 g_print(" Line %ld column %ld - Query had/bad error?\n",
1302 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1306 s=strstr(aline,"; hut ");
1308 s=strstr(aline,", hut ");
1311 if (pswit[ECHO_SWITCH])
1312 g_print("\n%s\n",aline);
1313 if (!pswit[OVERVIEW_SWITCH])
1314 g_print(" Line %ld column %ld - Query hut/but error?\n",
1315 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1322 * check_for_mta_from:
1324 * Special case - angled bracket in front of "From" placed there by an
1325 * MTA when sending an e-mail.
1327 void check_for_mta_from(const char *aline)
1330 s=strstr(aline,">From");
1333 if (pswit[ECHO_SWITCH])
1334 g_print("\n%s\n",aline);
1335 if (!pswit[OVERVIEW_SWITCH])
1336 g_print(" Line %ld column %ld - "
1337 "Query angled bracket with From\n",
1338 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1345 * check_for_orphan_character:
1347 * Check for a single character line -
1348 * often an overflow from bad wrapping.
1350 void check_for_orphan_character(const char *aline)
1353 c=g_utf8_get_char(aline);
1354 if (c && !*g_utf8_next_char(aline))
1356 if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
1357 ; /* Nothing - ignore numerals alone on a line. */
1360 if (pswit[ECHO_SWITCH])
1361 g_print("\n%s\n",aline);
1362 if (!pswit[OVERVIEW_SWITCH])
1363 g_print(" Line %ld column 1 - Query single character line\n",
1372 * check_for_pling_scanno:
1374 * Check for I" - often should be !
1376 void check_for_pling_scanno(const char *aline)
1379 s=strstr(aline," I\"");
1382 if (pswit[ECHO_SWITCH])
1383 g_print("\n%s\n",aline);
1384 if (!pswit[OVERVIEW_SWITCH])
1385 g_print(" Line %ld column %ld - Query I=exclamation mark?\n",
1386 linecnt,g_utf8_pointer_to_offset(aline,s));
1393 * check_for_extra_period:
1395 * Check for period without a capital letter. Cut-down from gutspell.
1396 * Only works when it happens on a single line.
1398 void check_for_extra_period(const char *aline,const struct warnings *warnings)
1400 const char *s,*t,*s1;
1405 gunichar *decomposition;
1406 if (pswit[PARANOID_SWITCH])
1408 for (t=aline;t=strstr(t,". ");)
1412 t=g_utf8_next_char(t);
1413 /* start of line punctuation is handled elsewhere */
1416 if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
1418 t=g_utf8_next_char(t);
1421 if (warnings->isDutch)
1423 /* For Frank & Jeroen -- 's Middags case */
1424 gunichar c2,c3,c4,c5;
1425 c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
1426 c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
1427 c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
1428 c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
1429 if (c2==CHAR_SQUOTE && g_unichar_islower(c3) &&
1430 c4==CHAR_SPACE && g_unichar_isupper(c5))
1432 t=g_utf8_next_char(t);
1436 s1=g_utf8_next_char(g_utf8_next_char(t));
1437 while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
1438 !isdigit(g_utf8_get_char(s1)))
1439 s1=g_utf8_next_char(s1);
1440 if (g_unichar_islower(g_utf8_get_char(s1)))
1442 /* we have something to investigate */
1444 /* so let's go back and find out */
1445 for (s1=g_utf8_prev_char(t);s1>=aline &&
1446 (g_unichar_isalpha(g_utf8_get_char(s1)) ||
1447 g_unichar_isdigit(g_utf8_get_char(s1)) ||
1448 g_utf8_get_char(s1)==CHAR_SQUOTE &&
1449 g_unichar_isalpha(g_utf8_get_char(g_utf8_next_char(s1))) &&
1450 g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s1))));
1451 s1=g_utf8_prev_char(s1))
1453 s1=g_utf8_next_char(s1);
1456 testword=g_strndup(s1,s-s1);
1458 testword=g_strdup(s1);
1459 for (i=0;*abbrev[i];i++)
1460 if (!strcmp(testword,abbrev[i]))
1462 if (g_unichar_isdigit(g_utf8_get_char(testword)))
1464 if (!*g_utf8_next_char(testword))
1466 if (isroman(testword))
1471 for (s=testword;*s;s=g_utf8_next_char(s))
1473 decomposition=g_unicode_canonical_decomposition(
1474 g_utf8_get_char(s),&len);
1475 if (g_utf8_strchr("aeiou",-1,decomposition[0]))
1477 g_free(decomposition);
1481 (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
1483 g_tree_insert(qperiod,g_strdup(testword),
1484 GINT_TO_POINTER(1));
1485 if (pswit[ECHO_SWITCH])
1486 g_print("\n%s\n",aline);
1487 if (!pswit[OVERVIEW_SWITCH])
1488 g_print(" Line %ld column %ld - Extra period?\n",
1489 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1495 t=g_utf8_next_char(t);
1501 * check_for_following_punctuation:
1503 * Check for words usually not followed by punctuation.
1505 void check_for_following_punctuation(const char *aline)
1508 const char *s,*wordstart;
1511 if (pswit[TYPO_SWITCH])
1522 inword=g_utf8_strdown(t,-1);
1524 for (i=0;*nocomma[i];i++)
1525 if (!strcmp(inword,nocomma[i]))
1527 c=g_utf8_get_char(s);
1528 if (c==',' || c==';' || c==':')
1530 if (pswit[ECHO_SWITCH])
1531 g_print("\n%s\n",aline);
1532 if (!pswit[OVERVIEW_SWITCH])
1533 g_print(" Line %ld column %ld - "
1534 "Query punctuation after %s?\n",
1535 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
1541 for (i=0;*noperiod[i];i++)
1542 if (!strcmp(inword,noperiod[i]))
1544 c=g_utf8_get_char(s);
1545 if (c=='.' || c=='!')
1547 if (pswit[ECHO_SWITCH])
1548 g_print("\n%s\n",aline);
1549 if (!pswit[OVERVIEW_SWITCH])
1550 g_print(" Line %ld column %ld - "
1551 "Query punctuation after %s?\n",
1552 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
1566 * Check for commonly mistyped words,
1567 * and digits like 0 for O in a word.
1569 void check_for_typos(const char *aline,struct warnings *warnings)
1571 const char *s,*t,*nt,*wordstart;
1573 gunichar *decomposition;
1575 int i,vowel,consonant,*dupcnt;
1576 gboolean isdup,istypo,alower;
1579 gsize decomposition_len;
1583 inword=getaword(&s);
1587 continue; /* don't bother with empty lines */
1589 if (mixdigit(inword))
1591 if (pswit[ECHO_SWITCH])
1592 g_print("\n%s\n",aline);
1593 if (!pswit[OVERVIEW_SWITCH])
1594 g_print(" Line %ld column %ld - Query digit in %s\n",
1595 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
1600 * Put the word through a series of tests for likely typos and OCR
1603 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
1607 for (t=inword;*t;t=g_utf8_next_char(t))
1609 c=g_utf8_get_char(t);
1610 nt=g_utf8_next_char(t);
1611 /* lowercase for testing */
1612 if (g_unichar_islower(c))
1614 if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
1617 * We have an uppercase mid-word. However, there are
1619 * Mac and Mc like McGill
1620 * French contractions like l'Abbe
1622 offset=g_utf8_pointer_to_offset(inword,t);
1623 if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
1624 offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
1625 g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
1627 g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SQUOTE)
1633 testword=g_utf8_casefold(inword,-1);
1635 if (pswit[TYPO_SWITCH])
1638 * Check for certain unlikely two-letter combinations at word
1641 len=g_utf8_strlen(testword,-1);
1644 for (i=0;*nostart[i];i++)
1645 if (g_str_has_prefix(testword,nostart[i]))
1647 for (i=0;*noend[i];i++)
1648 if (g_str_has_suffix(testword,noend[i]))
1651 /* ght is common, gbt never. Like that. */
1652 if (strstr(testword,"cb"))
1654 if (strstr(testword,"gbt"))
1656 if (strstr(testword,"pbt"))
1658 if (strstr(testword,"tbs"))
1660 if (strstr(testword,"mrn"))
1662 if (strstr(testword,"ahle"))
1664 if (strstr(testword,"ihle"))
1667 * "TBE" does happen - like HEARTBEAT - but uncommon.
1668 * Also "TBI" - frostbite, outbid - but uncommon.
1669 * Similarly "ii" like Hawaii, or Pompeii, and in Roman
1670 * numerals, but "ii" is a common scanno.
1672 if (strstr(testword,"tbi"))
1674 if (strstr(testword,"tbe"))
1676 if (strstr(testword,"ii"))
1679 * Check for no vowels or no consonants.
1680 * If none, flag a typo.
1682 if (!istypo && len>1)
1685 for (t=testword;*t;t=g_utf8_next_char(t))
1687 c=g_utf8_get_char(t);
1689 g_unicode_canonical_decomposition(c,&decomposition_len);
1690 if (c=='y' || g_unichar_isdigit(c))
1692 /* Yah, this is loose. */
1696 else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
1700 g_free(decomposition);
1702 if (!vowel || !consonant)
1706 * Now exclude the word from being reported if it's in
1709 for (i=0;*okword[i];i++)
1710 if (!strcmp(testword,okword[i]))
1713 * What looks like a typo may be a Roman numeral.
1716 if (istypo && isroman(testword))
1718 /* Check the manual list of typos. */
1720 for (i=0;*typo[i];i++)
1721 if (!strcmp(testword,typo[i]))
1724 * Check lowercase s, l, i and m - special cases.
1725 * "j" - often a semi-colon gone wrong.
1726 * "d" for a missing apostrophe - he d
1729 if (!istypo && len==1 &&
1730 g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
1734 dupcnt=g_tree_lookup(qword,testword);
1738 isdup=!pswit[VERBOSE_SWITCH];
1742 dupcnt=g_new0(int,1);
1743 g_tree_insert(qword,g_strdup(testword),dupcnt);
1748 if (pswit[ECHO_SWITCH])
1749 g_print("\n%s\n",aline);
1750 if (!pswit[OVERVIEW_SWITCH])
1752 g_print(" Line %ld column %ld - Query word %s",
1753 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
1755 if (!pswit[VERBOSE_SWITCH])
1756 g_print(" - not reporting duplicates");
1764 /* check the user's list of typos */
1765 if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
1767 if (pswit[ECHO_SWITCH])
1768 g_print("\n%s\n",aline);
1769 if (!pswit[OVERVIEW_SWITCH])
1770 g_print(" Line %ld column %ld - Query possible scanno %s\n",
1771 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
1773 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
1775 if (pswit[PARANOID_SWITCH] && warnings->digit)
1777 /* In paranoid mode, query all 0 and 1 standing alone. */
1778 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
1780 if (pswit[ECHO_SWITCH])
1781 g_print("\n%s\n",aline);
1782 if (!pswit[OVERVIEW_SWITCH])
1783 g_print(" Line %ld column %ld - Query standalone %s\n",
1784 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
1795 * check_for_misspaced_punctuation:
1797 * Look for added or missing spaces around punctuation and quotes.
1798 * If there is a punctuation character like ! with no space on
1799 * either side, suspect a missing!space. If there are spaces on
1800 * both sides , assume a typo. If we see a double quote with no
1801 * space or punctuation on either side of it, assume unspaced
1802 * quotes "like"this.
1804 void check_for_misspaced_punctuation(const char *aline,
1805 struct parities *parities,gboolean isemptyline)
1807 gboolean isacro,isellipsis;
1809 gunichar c,nc,pc,n2c;
1810 c=g_utf8_get_char(aline);
1811 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1812 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1816 nc=g_utf8_get_char(g_utf8_next_char(s));
1817 /* For each character in the line after the first. */
1818 if (g_utf8_strchr(".?!,;:_",-1,c)) /* if it's punctuation */
1820 /* we need to suppress warnings for acronyms like M.D. */
1822 /* we need to suppress warnings for ellipsis . . . */
1825 * If there are letters on both sides of it or
1826 * if it's strict punctuation followed by an alpha.
1828 if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
1829 g_utf8_strchr("?!,;:",-1,c)))
1833 if (g_utf8_pointer_to_offset(aline,s)>2 &&
1834 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
1836 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
1842 if (pswit[ECHO_SWITCH])
1843 g_print("\n%s\n",aline);
1844 if (!pswit[OVERVIEW_SWITCH])
1845 g_print(" Line %ld column %ld - Missing space?\n",
1846 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1851 if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
1854 * If there are spaces on both sides,
1855 * or space before and end of line.
1859 if (g_utf8_pointer_to_offset(aline,s)>2 &&
1860 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
1862 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
1866 if (!isemptyline && !isellipsis)
1868 if (pswit[ECHO_SWITCH])
1869 g_print("\n%s\n",aline);
1870 if (!pswit[OVERVIEW_SWITCH])
1871 g_print(" Line %ld column %ld - "
1872 "Spaced punctuation?\n",linecnt,
1873 g_utf8_pointer_to_offset(aline,s)+1);
1880 /* Split out the characters that CANNOT be preceded by space. */
1881 c=g_utf8_get_char(aline);
1882 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1883 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1887 nc=g_utf8_get_char(g_utf8_next_char(s));
1888 /* for each character in the line after the first */
1889 if (g_utf8_strchr("?!,;:",-1,c))
1891 /* if it's punctuation that _cannot_ have a space before it */
1892 if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
1895 * If nc DOES == space,
1896 * it was already reported just above.
1898 if (pswit[ECHO_SWITCH])
1899 g_print("\n%s\n",aline);
1900 if (!pswit[OVERVIEW_SWITCH])
1901 g_print(" Line %ld column %ld - Spaced punctuation?\n",
1902 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1909 * Special case " .X" where X is any alpha.
1910 * This plugs a hole in the acronym code above.
1911 * Inelegant, but maintainable.
1913 c=g_utf8_get_char(aline);
1914 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1915 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1919 nc=g_utf8_get_char(g_utf8_next_char(s));
1920 /* for each character in the line after the first */
1923 /* if it's a period */
1924 if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
1927 * If the period follows a space and
1928 * is followed by a letter.
1930 if (pswit[ECHO_SWITCH])
1931 g_print("\n%s\n",aline);
1932 if (!pswit[OVERVIEW_SWITCH])
1933 g_print(" Line %ld column %ld - Spaced punctuation?\n",
1934 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1940 c=g_utf8_get_char(aline);
1941 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1942 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1946 nc=g_utf8_get_char(g_utf8_next_char(s));
1947 /* for each character in the line after the first */
1950 if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
1951 !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
1952 !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
1954 if (pswit[ECHO_SWITCH])
1955 g_print("\n%s\n",aline);
1956 if (!pswit[OVERVIEW_SWITCH])
1957 g_print(" Line %ld column %ld - Unspaced quotes?\n",
1958 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1964 /* Check parity of quotes. */
1965 nc=g_utf8_get_char(aline);
1966 for (s=aline;*s;s=g_utf8_next_char(s))
1969 nc=g_utf8_get_char(g_utf8_next_char(s));
1972 parities->dquote=!parities->dquote;
1973 if (!parities->dquote)
1976 if (!g_utf8_strchr("_-.'`/,;:!?)]} ",-1,nc))
1978 if (pswit[ECHO_SWITCH])
1979 g_print("\n%s\n",aline);
1980 if (!pswit[OVERVIEW_SWITCH])
1981 g_print(" Line %ld column %ld - "
1982 "Wrongspaced quotes?\n",
1983 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1991 if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
1992 !g_utf8_strchr("_-/.'`([{$",-1,nc) || !nc)
1994 if (pswit[ECHO_SWITCH])
1995 g_print("\n%s\n",aline);
1996 if (!pswit[OVERVIEW_SWITCH])
1997 g_print(" Line %ld column %ld - "
1998 "Wrongspaced quotes?\n",
1999 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2006 if (g_utf8_get_char(aline)==CHAR_DQUOTE)
2008 if (g_utf8_strchr(",;:!?)]} ",-1,
2009 g_utf8_get_char(g_utf8_next_char(aline))))
2011 if (pswit[ECHO_SWITCH])
2012 g_print("\n%s\n",aline);
2013 if (!pswit[OVERVIEW_SWITCH])
2014 g_print(" Line %ld column 1 - Wrongspaced quotes?\n",
2020 if (pswit[SQUOTE_SWITCH])
2022 nc=g_utf8_get_char(aline);
2023 for (s=aline;*s;s=g_utf8_next_char(s))
2026 nc=g_utf8_get_char(g_utf8_next_char(s));
2027 if ((c==CHAR_SQUOTE || c==CHAR_OPEN_SQUOTE) && (s==aline ||
2029 !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
2030 !g_unichar_isalpha(nc)))
2032 parities->squote=!parities->squote;
2033 if (!parities->squote)
2036 if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
2038 if (pswit[ECHO_SWITCH])
2039 g_print("\n%s\n",aline);
2040 if (!pswit[OVERVIEW_SWITCH])
2041 g_print(" Line %ld column %ld - "
2042 "Wrongspaced singlequotes?\n",
2043 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2051 if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
2052 !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
2054 if (pswit[ECHO_SWITCH])
2055 g_print("\n%s\n",aline);
2056 if (!pswit[OVERVIEW_SWITCH])
2057 g_print(" Line %ld column %ld - "
2058 "Wrongspaced singlequotes?\n",
2059 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2070 * check_for_double_punctuation:
2072 * Look for double punctuation like ,. or ,,
2073 * Thanks to DW for the suggestion!
2074 * In books with references, ".," and ".;" are common
2075 * e.g. "etc., etc.," and vol. 1.; vol 3.;
2076 * OTOH, from my initial tests, there are also fairly
2077 * common errors. What to do? Make these cases paranoid?
2078 * ".," is the most common, so warnings->dotcomma is used
2079 * to suppress detailed reporting if it occurs often.
2081 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
2085 nc=g_utf8_get_char(aline);
2086 for (s=aline;*s;s=g_utf8_next_char(s))
2089 nc=g_utf8_get_char(g_utf8_next_char(s));
2090 /* for each punctuation character in the line */
2091 if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
2092 g_utf8_strchr(".?!,;:",-1,nc))
2094 /* followed by punctuation, it's a query, unless . . . */
2095 if (c==nc && (c=='.' || c=='?' || c=='!') ||
2096 !warnings->dotcomma && c=='.' && nc==',' ||
2097 warnings->isFrench && g_str_has_prefix(s,",...") ||
2098 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2099 warnings->isFrench && g_str_has_prefix(s,";...") ||
2100 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2101 warnings->isFrench && g_str_has_prefix(s,":...") ||
2102 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2103 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2104 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2105 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2106 warnings->isFrench && g_str_has_prefix(s,"...?"))
2108 if (warnings->isFrench && g_str_has_prefix(s,",...") ||
2109 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2110 warnings->isFrench && g_str_has_prefix(s,";...") ||
2111 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2112 warnings->isFrench && g_str_has_prefix(s,":...") ||
2113 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2114 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2115 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2116 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2117 warnings->isFrench && g_str_has_prefix(s,"...?"))
2120 nc=g_utf8_get_char(g_utf8_next_char(s));
2122 ; /* do nothing for .. !! and ?? which can be legit */
2126 if (pswit[ECHO_SWITCH])
2127 g_print("\n%s\n",aline);
2128 if (!pswit[OVERVIEW_SWITCH])
2129 g_print(" Line %ld column %ld - Double punctuation?\n",
2130 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2139 * check_for_spaced_quotes:
2141 void check_for_spaced_quotes(const char *aline)
2145 while ((t=strstr(s," \" ")))
2147 if (pswit[ECHO_SWITCH])
2148 g_print("\n%s\n",aline);
2149 if (!pswit[OVERVIEW_SWITCH])
2150 g_print(" Line %ld column %ld - Spaced doublequote?\n",
2151 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2154 s=g_utf8_next_char(g_utf8_next_char(t));
2157 while ((t=strstr(s," ' ")))
2159 if (pswit[ECHO_SWITCH])
2160 g_print("\n%s\n",aline);
2161 if (!pswit[OVERVIEW_SWITCH])
2162 g_print(" Line %ld column %ld - Spaced singlequote?\n",
2163 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2166 s=g_utf8_next_char(g_utf8_next_char(t));
2169 while ((t=strstr(s," ` ")))
2171 if (pswit[ECHO_SWITCH])
2172 g_print("\n%s\n",aline);
2173 if (!pswit[OVERVIEW_SWITCH])
2174 g_print(" Line %ld column %ld - Spaced singlequote?\n",
2175 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2178 s=g_utf8_next_char(g_utf8_next_char(t));
2183 * check_for_miscased_genative:
2185 * Check special case of 'S instead of 's at end of word.
2187 void check_for_miscased_genative(const char *aline)
2193 c=g_utf8_get_char(aline);
2194 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2195 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2199 nc=g_utf8_get_char(g_utf8_next_char(s));
2200 if (c==CHAR_SQUOTE && nc=='S' && g_unichar_islower(pc))
2202 if (pswit[ECHO_SWITCH])
2203 g_print("\n%s\n",aline);
2204 if (!pswit[OVERVIEW_SWITCH])
2205 g_print(" Line %ld column %ld - Capital \"S\"?\n",
2206 linecnt,g_utf8_pointer_to_offset(aline,s)+2);
2214 * check_end_of_line:
2216 * Now check special cases - start and end of line -
2217 * for single and double quotes. Start is sometimes [sic]
2218 * but better to query it anyway.
2219 * While we're here, check for dash at end of line.
2221 void check_end_of_line(const char *aline,struct warnings *warnings)
2226 lbytes=strlen(aline);
2227 if (g_utf8_strlen(aline,lbytes)>1)
2229 s=g_utf8_prev_char(aline+lbytes);
2230 c1=g_utf8_get_char(s);
2231 c2=g_utf8_get_char(g_utf8_prev_char(s));
2232 if ((c1==CHAR_DQUOTE || c1==CHAR_SQUOTE || c1==CHAR_OPEN_SQUOTE) &&
2235 if (pswit[ECHO_SWITCH])
2236 g_print("\n%s\n",aline);
2237 if (!pswit[OVERVIEW_SWITCH])
2238 g_print(" Line %ld column %ld - Spaced quote?\n",linecnt,
2239 g_utf8_strlen(aline,lbytes));
2243 c1=g_utf8_get_char(aline);
2244 c2=g_utf8_get_char(g_utf8_next_char(aline));
2245 if ((c1==CHAR_SQUOTE || c1==CHAR_OPEN_SQUOTE) && c2==CHAR_SPACE)
2247 if (pswit[ECHO_SWITCH])
2248 g_print("\n%s\n",aline);
2249 if (!pswit[OVERVIEW_SWITCH])
2250 g_print(" Line %ld column 1 - Spaced quote?\n",linecnt);
2255 * Dash at end of line may well be legit - paranoid mode only
2256 * and don't report em-dash at line-end.
2258 if (pswit[PARANOID_SWITCH] && warnings->hyphen)
2260 for (s=g_utf8_prev_char(aline+lbytes);
2261 s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
2263 if (g_utf8_get_char(s)=='-' &&
2264 g_utf8_get_char(g_utf8_prev_char(s))!='-')
2266 if (pswit[ECHO_SWITCH])
2267 g_print("\n%s\n",aline);
2268 if (!pswit[OVERVIEW_SWITCH])
2269 g_print(" Line %ld column %ld - "
2270 "Hyphen at end of line?\n",
2271 linecnt,g_utf8_pointer_to_offset(aline,s));
2278 * check_for_unspaced_bracket:
2280 * Brackets are often unspaced, but shouldn't be surrounded by alpha.
2281 * If so, suspect a scanno like "a]most".
2283 void check_for_unspaced_bracket(const char *aline)
2287 c=g_utf8_get_char(aline);
2288 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2289 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2293 nc=g_utf8_get_char(g_utf8_next_char(s));
2296 /* for each bracket character in the line except 1st & last */
2297 if (g_utf8_strchr("{[()]}",-1,c) &&
2298 g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
2300 if (pswit[ECHO_SWITCH])
2301 g_print("\n%s\n",aline);
2302 if (!pswit[OVERVIEW_SWITCH])
2303 g_print(" Line %ld column %ld - Unspaced bracket?\n",
2304 linecnt,g_utf8_pointer_to_offset(aline,s));
2312 * check_for_unpunctuated_endquote:
2314 void check_for_unpunctuated_endquote(const char *aline)
2318 c=g_utf8_get_char(aline);
2319 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2320 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2324 nc=g_utf8_get_char(g_utf8_next_char(s));
2325 /* for each character in the line except 1st */
2326 if (c==CHAR_DQUOTE && isalpha(pc))
2328 if (pswit[ECHO_SWITCH])
2329 g_print("\n%s\n",aline);
2330 if (!pswit[OVERVIEW_SWITCH])
2331 g_print(" Line %ld column %ld - "
2332 "endquote missing punctuation?\n",
2333 linecnt,g_utf8_pointer_to_offset(aline,s));
2341 * check_for_html_tag:
2343 * Check for <HTML TAG>.
2345 * If there is a < in the line, followed at some point
2346 * by a > then we suspect HTML.
2348 void check_for_html_tag(const char *aline)
2350 const char *open,*close;
2352 open=strchr(aline,'<');
2355 close=strchr(g_utf8_next_char(open),'>');
2358 if (pswit[ECHO_SWITCH])
2359 g_print("\n%s\n",aline);
2360 if (!pswit[OVERVIEW_SWITCH])
2362 tag=g_strndup(open,close-open+1);
2363 g_print(" Line %ld column %ld - HTML Tag? %s \n",
2364 linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
2374 * check_for_html_entity:
2376 * Check for &symbol; HTML.
2378 * If there is a & in the line, followed at
2379 * some point by a ; then we suspect HTML.
2381 void check_for_html_entity(const char *aline)
2383 const char *s,*amp,*scolon;
2385 amp=strchr(aline,'&');
2388 scolon=strchr(amp,';');
2391 for (s=amp;s<scolon;s=g_utf8_next_char(s))
2392 if (g_utf8_get_char(s)==CHAR_SPACE)
2393 break; /* Don't report "Jones & Son;" */
2396 if (pswit[ECHO_SWITCH])
2397 g_print("\n%s\n",aline);
2398 if (!pswit[OVERVIEW_SWITCH])
2400 entity=g_strndup(amp,scolon-amp+1);
2401 g_print(" Line %ld column %d - HTML symbol? %s \n",
2402 linecnt,(int)(amp-aline)+1,entity);
2415 * If we are in a state of unbalanced quotes, and this line
2416 * doesn't begin with a quote, output the stored error message.
2417 * If the -P switch was used, print the warning even if the
2418 * new para starts with quotes.
2420 void print_pending(const char *aline,const char *parastart,
2421 struct pending *pending)
2428 c=g_utf8_get_char(s);
2429 if (pending->dquote)
2431 if (c!=CHAR_DQUOTE || pswit[QPARA_SWITCH])
2433 if (!pswit[OVERVIEW_SWITCH])
2435 if (pswit[ECHO_SWITCH])
2436 g_print("\n%s\n",parastart);
2437 g_print("%s\n",pending->dquote);
2442 g_free(pending->dquote);
2443 pending->dquote=NULL;
2445 if (pending->squote)
2447 if (c!=CHAR_SQUOTE && c!=CHAR_OPEN_SQUOTE || pswit[QPARA_SWITCH] ||
2450 if (!pswit[OVERVIEW_SWITCH])
2452 if (pswit[ECHO_SWITCH])
2453 g_print("\n%s\n",parastart);
2454 g_print("%s\n",pending->squote);
2459 g_free(pending->squote);
2460 pending->squote=NULL;
2462 if (pending->rbrack)
2464 if (!pswit[OVERVIEW_SWITCH])
2466 if (pswit[ECHO_SWITCH])
2467 g_print("\n%s\n",parastart);
2468 g_print("%s\n",pending->rbrack);
2472 g_free(pending->rbrack);
2473 pending->rbrack=NULL;
2475 if (pending->sbrack)
2477 if (!pswit[OVERVIEW_SWITCH])
2479 if (pswit[ECHO_SWITCH])
2480 g_print("\n%s\n",parastart);
2481 g_print("%s\n",pending->sbrack);
2485 g_free(pending->sbrack);
2486 pending->sbrack=NULL;
2488 if (pending->cbrack)
2490 if (!pswit[OVERVIEW_SWITCH])
2492 if (pswit[ECHO_SWITCH])
2493 g_print("\n%s\n",parastart);
2494 g_print("%s\n",pending->cbrack);
2498 g_free(pending->cbrack);
2499 pending->cbrack=NULL;
2501 if (pending->unders)
2503 if (!pswit[OVERVIEW_SWITCH])
2505 if (pswit[ECHO_SWITCH])
2506 g_print("\n%s\n",parastart);
2507 g_print("%s\n",pending->unders);
2511 g_free(pending->unders);
2512 pending->unders=NULL;
2517 * check_for_mismatched_quotes:
2519 * At end of paragraph, check for mismatched quotes.
2521 * We don't want to report an error immediately, since it is a
2522 * common convention to omit the quotes at end of paragraph if
2523 * the next paragraph is a continuation of the same speaker.
2524 * Where this is the case, the next para should begin with a
2525 * quote, so we store the warning message and only display it
2526 * at the top of the next iteration if the new para doesn't
2527 * start with a quote.
2528 * The -p switch overrides this default, and warns of unclosed
2529 * quotes on _every_ paragraph, whether the next begins with a
2532 void check_for_mismatched_quotes(const struct counters *counters,
2533 struct pending *pending)
2535 if (counters->quot%2)
2537 g_strdup_printf(" Line %ld - Mismatched quotes",linecnt);
2538 if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&
2539 counters->open_single_quote!=counters->close_single_quote)
2541 g_strdup_printf(" Line %ld - Mismatched singlequotes?",linecnt);
2542 if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&
2543 counters->open_single_quote!=counters->close_single_quote &&
2544 counters->open_single_quote!=counters->close_single_quote+1)
2546 * Flag it to be noted regardless of the
2547 * first char of the next para.
2550 if (counters->r_brack)
2552 g_strdup_printf(" Line %ld - Mismatched round brackets?",linecnt);
2553 if (counters->s_brack)
2555 g_strdup_printf(" Line %ld - Mismatched square brackets?",linecnt);
2556 if (counters->c_brack)
2558 g_strdup_printf(" Line %ld - Mismatched curly brackets?",linecnt);
2559 if (counters->c_unders%2)
2561 g_strdup_printf(" Line %ld - Mismatched underscores?",linecnt);
2565 * check_for_omitted_punctuation:
2567 * Check for omitted punctuation at end of paragraph by working back
2568 * through prevline. DW.
2569 * Need to check this only for "normal" paras.
2570 * So what is a "normal" para?
2571 * Not normal if one-liner (chapter headings, etc.)
 * Not normal if it doesn't contain at least one lowercase letter
2573 * Not normal if starts with space
2575 void check_for_omitted_punctuation(const char *prevline,
2576 struct line_properties *last,int start_para_line)
2578 gboolean letter_on_line=FALSE;
2580 for (s=prevline;*s;s=g_utf8_next_char(s))
2581 if (g_unichar_isalpha(g_utf8_get_char(s)))
2583 letter_on_line=TRUE;
2587 * This next "if" is a problem.
2588 * If we say "start_para_line <= linecnt - 1", that includes
2589 * one-line "paragraphs" like chapter heads. Lotsa false positives.
2590 * If we say "start_para_line < linecnt - 1" it doesn't, but then it
2591 * misses genuine one-line paragraphs.
2593 if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
2594 g_utf8_get_char(prevline)>CHAR_SPACE)
2596 for (s=g_utf8_prev_char(prevline+strlen(prevline));
2597 (g_utf8_get_char(s)==CHAR_DQUOTE ||
2598 g_utf8_get_char(s)==CHAR_SQUOTE) &&
2599 g_utf8_get_char(s)>CHAR_SPACE && s>prevline;
2600 s=g_utf8_prev_char(s))
2602 for (;s>prevline;s=g_utf8_prev_char(s))
2604 if (g_unichar_isalpha(g_utf8_get_char(s)))
2606 if (pswit[ECHO_SWITCH])
2607 g_print("\n%s\n",prevline);
2608 if (!pswit[OVERVIEW_SWITCH])
2609 g_print(" Line %ld column %ld - "
2610 "No punctuation at para end?\n",
2611 linecnt-1,g_utf8_strlen(prevline,-1));
2616 if (g_utf8_strchr("-.:!([{?}])",-1,g_utf8_get_char(s)))
2622 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
2624 const char *word=key;
2627 g_print("\nNote: Queried word %s was duplicated %d times\n",
2632 void print_as_windows_1252(const char *string)
2634 gsize inbytes,outbytes;
2636 GIConv converter=(GIConv)-1;
2639 if (converter!=(GIConv)-1)
2640 g_iconv_close(converter);
2641 converter=(GIConv)-1;
2644 if (converter=(GIConv)-1)
2645 converter=g_iconv_open("WINDOWS-1252","UTF-8");
2646 if (converter!=(GIConv)-1)
2648 inbytes=outbytes=strlen(string);
2649 bp=buf=g_malloc(outbytes+1);
2650 g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
2656 fputs(string,stdout);
/* Writes string to stdout unchanged, using fputs(). */
2659 void print_as_utf_8(const char *string)
2661 fputs(string,stdout);
/*
 * procfile: checks one text file.
 *
 * Reads the text named by filename with read_etext(), runs first_pass()
 * and report_first_pass() over it, then fetches the text line by line
 * with flgets() and applies the per-line check_for_*() routines to every
 * line that lies between first_pass_results->firstline and the footer
 * line. Findings are written with g_print().
 */
2669 void procfile(const char *filename)
2672 gchar *parastart=NULL; /* first line of current para */
2673 gchar *etext,*aline;
2676 struct first_pass_results *first_pass_results;
2677 struct warnings *warnings;
2678 struct counters counters={0};
2679 struct line_properties last={0};
2680 struct parities parities={0};
2681 struct pending pending={0};
2682 gboolean isemptyline;
2683 long start_para_line=0;
2684 gboolean isnewpara=FALSE,enddash=FALSE;
2685 last.start=CHAR_SPACE;
2686 linecnt=checked_linecnt=0;
/* Load the text; err supplies the message printed below. */
2687 etext=read_etext(filename,&err);
/* The message is written to stdout under STDOUT_SWITCH; a stderr variant follows. */
2690 if (pswit[STDOUT_SWITCH])
2691 fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
2693 fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
2696 g_print("\n\nFile: %s\n\n",filename);
2697 first_pass_results=first_pass(etext);
2698 warnings=report_first_pass(first_pass_results);
/* Both trees compare keys with strcmp and free keys with g_free; only qword also frees its values. */
2699 qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
2700 qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
2702 * Here we go with the main pass. Hold onto yer hat!
2706 while ((aline=flgets(&etext_ptr,linecnt+1)))
2711 if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
2712 continue; // skip DP page separators completely
/* Lines before firstline, or after a non-zero footerline, are not checked; under HEADER_SWITCH a few header fields are echoed. */
2713 if (linecnt<first_pass_results->firstline ||
2714 (first_pass_results->footerline>0 &&
2715 linecnt>first_pass_results->footerline))
2717 if (pswit[HEADER_SWITCH])
2719 if (g_str_has_prefix(aline,"Title:"))
2720 g_print(" %s\n",aline);
2721 if (g_str_has_prefix(aline,"Author:"))
2722 g_print(" %s\n",aline);
2723 if (g_str_has_prefix(aline,"Release Date:"))
2724 g_print(" %s\n",aline);
2725 if (g_str_has_prefix(aline,"Edition:"))
2726 g_print(" %s\n\n",aline);
2728 continue; /* skip through the header */
/* The pending record is handed to print_pending() and then zeroed for this line. */
2731 print_pending(aline,parastart,&pending);
2732 memset(&pending,0,sizeof(pending));
2733 isemptyline=analyse_quotes(aline,&counters);
2734 if (isnewpara && !isemptyline)
2736 /* This line is the start of a new paragraph. */
2737 start_para_line=linecnt;
2738 /* Capture its first line in case we want to report it later. */
2740 parastart=g_strdup(aline);
2741 memset(&parities,0,sizeof(parities)); /* restart the quote count */
/* Advance s to the first letter or digit (or to the terminating NUL). */
2743 while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
2744 !g_unichar_isdigit(g_utf8_get_char(s)))
2745 s=g_utf8_next_char(s);
2746 if (g_unichar_islower(g_utf8_get_char(s)))
2748 /* and its first letter is lowercase */
2749 if (pswit[ECHO_SWITCH])
2750 g_print("\n%s\n",aline);
2751 if (!pswit[OVERVIEW_SWITCH])
2752 g_print(" Line %ld column %ld - "
2753 "Paragraph starts with lower-case\n",
2754 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2758 isnewpara=FALSE; /* Signal the end of new para processing. */
2760 /* Check for an em-dash broken at line end. */
2761 if (enddash && g_utf8_get_char(aline)=='-')
2763 if (pswit[ECHO_SWITCH])
2764 g_print("\n%s\n",aline);
2765 if (!pswit[OVERVIEW_SWITCH])
2766 g_print(" Line %ld column 1 - Broken em-dash?\n",linecnt);
/* Back up from the end of the line over trailing spaces. */
2771 for (s=g_utf8_prev_char(aline+strlen(aline));
2772 g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
/* NOTE(review): presumably records a line-final '-' in enddash for the next iteration - the assignment is not visible here; confirm. */
2774 if (s>=aline && g_utf8_get_char(s)=='-')
2776 check_for_control_characters(aline);
2778 check_for_odd_characters(aline,warnings,isemptyline);
2779 if (warnings->longline)
2780 check_for_long_line(aline);
2781 if (warnings->shortline)
2782 check_for_short_line(aline,&last);
/* Remember this line's character length and first character in last. */
2784 last.len=g_utf8_strlen(aline,-1);
2785 last.start=g_utf8_get_char(aline);
2786 check_for_starting_punctuation(aline);
2789 check_for_spaced_emdash(aline);
2790 check_for_spaced_dash(aline);
2792 check_for_unmarked_paragraphs(aline);
2793 check_for_jeebies(aline);
2794 check_for_mta_from(aline);
2795 check_for_orphan_character(aline);
2796 check_for_pling_scanno(aline);
2797 check_for_extra_period(aline,warnings);
2798 check_for_following_punctuation(aline);
2799 check_for_typos(aline,warnings);
2800 check_for_misspaced_punctuation(aline,&parities,isemptyline);
2801 check_for_double_punctuation(aline,warnings);
2802 check_for_spaced_quotes(aline);
2803 check_for_miscased_genative(aline);
2804 check_end_of_line(aline,warnings);
2805 check_for_unspaced_bracket(aline);
2806 if (warnings->endquote)
2807 check_for_unpunctuated_endquote(aline);
2808 check_for_html_tag(aline);
2809 check_for_html_entity(aline);
/* NOTE(review): the statements from here to the omitted-punctuation check look like end-of-paragraph handling; the controlling condition is not visible here - confirm. */
2812 check_for_mismatched_quotes(&counters,&pending);
2813 memset(&counters,0,sizeof(counters));
2814 /* let the next iteration know that it's starting a new para */
2817 check_for_omitted_punctuation(prevline,&last,start_para_line);
/* Keep a private copy of this line as prevline. */
2820 prevline=g_strdup(aline);
/* Unless in overview mode, walk qword and report duplicated queried words. */
2830 if (!pswit[OVERVIEW_SWITCH])
2831 g_tree_foreach(qword,report_duplicate_queries,NULL);
2832 g_tree_unref(qword);
2833 g_tree_unref(qperiod);
/* Reset the GLib print handler. NOTE(review): the NULL passed to print_as_windows_1252() looks like a request to release its converter - confirm. */
2834 g_set_print_handler(NULL);
2835 print_as_windows_1252(NULL);
2836 if (pswit[MARKUP_SWITCH])
2843 * Get one line from the input text, checking for
2844 * the existence of exactly one CR/LF line-end per line.
2846 * Returns: a pointer to the line.
/*
 * flgets: takes the line that begins at *etext, advancing *etext as
 * characters are consumed. lcnt is the line number quoted in the
 * line-ending diagnostics ("No CR?", "Two successive CRs?",
 * "CR without LF?"), all of which are conditional on LINE_END_SWITCH.
 * With MARKUP_SWITCH and/or DP_SWITCH set, the line is also passed
 * through postprocess_for_HTML() and/or postprocess_for_DP().
 */
2848 char *flgets(char **etext,long lcnt)
2851 gboolean isCR=FALSE;
2852 char *theline=*etext;
/* Decode the character at *etext, then step *etext past it. */
2857 c=g_utf8_get_char(*etext);
2858 *etext=g_utf8_next_char(*etext);
2861 /* either way, it's end of line */
2868 /* Error - a LF without a preceding CR */
2869 if (pswit[LINE_END_SWITCH])
2871 if (pswit[ECHO_SWITCH])
2873 s=g_strndup(theline,eos-theline);
2874 g_print("\n%s\n",s);
2877 if (!pswit[OVERVIEW_SWITCH])
2878 g_print(" Line %ld - No CR?\n",lcnt);
2889 /* Error - two successive CRs */
2890 if (pswit[LINE_END_SWITCH])
2892 if (pswit[ECHO_SWITCH])
2894 s=g_strndup(theline,eos-theline);
2895 g_print("\n%s\n",s);
2898 if (!pswit[OVERVIEW_SWITCH])
2899 g_print(" Line %ld - Two successive CRs?\n",lcnt);
/* A CR was seen (isCR) but the character now being examined is not part of a CR/LF pair. */
2908 if (pswit[LINE_END_SWITCH] && isCR)
2910 if (pswit[ECHO_SWITCH])
2912 s=g_strndup(theline,eos-theline);
2913 g_print("\n%s\n",s);
2916 if (!pswit[OVERVIEW_SWITCH])
2917 g_print(" Line %ld column %ld - CR without LF?\n",
2918 lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
2924 eos=g_utf8_next_char(eos);
/* Optional in-place clean-up of the extracted line. */
2928 if (pswit[MARKUP_SWITCH])
2929 postprocess_for_HTML(theline);
2930 if (pswit[DP_SWITCH])
2931 postprocess_for_DP(theline);
2938 * Takes a "word" as a parameter, and checks whether it
2939 * contains a mixture of alpha and digits. Generally, this is an
2940 * error, but may not be for cases like 4th or L5 12s. 3d.
2942 * Returns: TRUE iff an error is found.
/*
 * mixdigit: scans checkword for alphabetic and digit characters. When
 * both kinds are present, the text that follows the leading run of digits
 * is compared against a list of accepted suffixes (ordinals, their
 * plural and "-ly" forms, and l/L/s/d), and a word starting with a
 * capital 'L' is re-examined without that first character.
 */
2944 gboolean mixdigit(const char *checkword)
2946 gboolean wehaveadigit,wehavealetter,query;
2947 const char *s,*nondigit;
2948 wehaveadigit=wehavealetter=query=FALSE;
/* Classify every character of the word as letter, digit or neither. */
2949 for (s=checkword;*s;s=g_utf8_next_char(s))
2950 if (g_unichar_isalpha(g_utf8_get_char(s)))
2952 else if (g_unichar_isdigit(g_utf8_get_char(s)))
2954 if (wehaveadigit && wehavealetter)
2956 /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
/* Leave nondigit pointing at the first character that is not a digit. */
2958 for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
2959 nondigit=g_utf8_next_char(nondigit))
2961 /* digits, ending in st, rd, nd, th of either case */
2962 if (!g_ascii_strcasecmp(nondigit,"st") ||
2963 !g_ascii_strcasecmp(nondigit,"rd") ||
2964 !g_ascii_strcasecmp(nondigit,"nd") ||
2965 !g_ascii_strcasecmp(nondigit,"th"))
/* The same ordinal endings followed by "s", in either case. */
2967 if (!g_ascii_strcasecmp(nondigit,"sts") ||
2968 !g_ascii_strcasecmp(nondigit,"rds") ||
2969 !g_ascii_strcasecmp(nondigit,"nds") ||
2970 !g_ascii_strcasecmp(nondigit,"ths"))
/* The same ordinal endings followed by "ly", in either case. */
2972 if (!g_ascii_strcasecmp(nondigit,"stly") ||
2973 !g_ascii_strcasecmp(nondigit,"rdly") ||
2974 !g_ascii_strcasecmp(nondigit,"ndly") ||
2975 !g_ascii_strcasecmp(nondigit,"thly"))
2977 /* digits, ending in l, L, s or d */
2978 if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
2979 !strcmp(nondigit,"d"))
2982 * L at the start of a number, representing Britsh pounds, like L500.
2983 * This is cute. We know the current word is mixed digit. If the first
2984 * letter is L, there must be at least one digit following. If both
2985 * digits and letters follow, we have a genuine error, else we have a
2986 * capital L followed by digits, and we accept that as a non-error.
/* Recursive call on the remainder of the word after the leading 'L'. */
2988 if (g_utf8_get_char(checkword)=='L' &&
2989 !mixdigit(g_utf8_next_char(checkword)))
2998 * Extracts the first/next "word" from the line, and returns it.
2999 * A word is defined as one English word unit--or at least that's the aim.
3000 * "ptr" is advanced to the position in the line where we will start
3001 * looking for the next word.
3003 * Returns: A newly-allocated string.
/*
 * getaword: collects the next word found at *ptr into a GString and
 * returns the string's character data (g_string_free() is called with
 * FALSE, so the buffer is kept and handed to the caller).
 * A look-ahead first tries to capture a punctuated number; if that fails
 * the word is rebuilt from letters, digits and apostrophes only, and
 * *ptr is advanced past the characters taken.
 */
3005 gchar *getaword(const char **ptr)
3010 word=g_string_new(NULL);
/* Skip leading characters that are neither digits nor letters. */
3011 for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
3012 !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
3013 **ptr;*ptr=g_utf8_next_char(*ptr))
3016 * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
3017 * Especially yucky is the case of L1,000
3018 * This section looks for a pattern of characters including a digit
3019 * followed by a comma or period followed by one or more digits.
3020 * If found, it returns this whole pattern as a word; otherwise we discard
3021 * the results and resume our normal programming.
3024 for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
3025 g_unichar_isalpha(g_utf8_get_char(s)) ||
3026 g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
3027 g_string_append_unichar(word,g_utf8_get_char(s));
/* Examine the look-ahead text from its second character up to, but not including, its last: a '.' or ',' directly after a digit marks a punctuated number. */
3028 for (t=g_utf8_next_char(word->str);*g_utf8_next_char(t);
3029 t=g_utf8_next_char(t))
3031 c=g_utf8_get_char(t);
3032 pc=g_utf8_get_char(g_utf8_prev_char(t));
3033 if ((c=='.' || c==',') && g_unichar_isdigit(pc))
3036 return g_string_free(word,FALSE);
3039 /* we didn't find a punctuated number - do the regular getword thing */
3040 g_string_truncate(word,0);
3041 for (;g_unichar_isdigit(g_utf8_get_char(*ptr)) ||
3042 g_unichar_isalpha(g_utf8_get_char(*ptr)) ||
3043 g_utf8_get_char(*ptr)=='\'';*ptr=g_utf8_next_char(*ptr))
3044 g_string_append_unichar(word,g_utf8_get_char(*ptr));
3045 return g_string_free(word,FALSE);
3051 * Is this word a Roman Numeral?
3053 * It doesn't actually validate that the number is a valid Roman Numeral--for
3054 * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
3055 * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
3056 * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
3057 * expressions thereof, except when it came to taxes. Allow any number of M,
3058 * an optional D, an optional CM or CD, any number of optional Cs, an optional
3059 * XL or an optional XC, an optional IX or IV, an optional V and any number
/*
 * isroman: tests t, in this order, for a run of 'm', a 'd', "cm" or "cd",
 * a run of 'c', "xl" or "xc", an 'l', a run of 'x', "ix" or "iv", a 'v'
 * and a run of 'i'. Only lower-case letters are compared.
 * NOTE(review): the statements that consume each match and the final
 * return are not visible here; presumably the result is whether the whole
 * word was consumed - confirm.
 */
3062 gboolean isroman(const char *t)
3068 while (g_utf8_get_char(t)=='m' && *t)
3070 if (g_utf8_get_char(t)=='d')
3072 if (g_str_has_prefix(t,"cm"))
3074 if (g_str_has_prefix(t,"cd"))
3076 while (g_utf8_get_char(t)=='c' && *t)
3078 if (g_str_has_prefix(t,"xl"))
3080 if (g_str_has_prefix(t,"xc"))
3082 if (g_utf8_get_char(t)=='l')
3084 while (g_utf8_get_char(t)=='x' && *t)
3086 if (g_str_has_prefix(t,"ix"))
3088 if (g_str_has_prefix(t,"iv"))
3090 if (g_utf8_get_char(t)=='v')
3092 while (g_utf8_get_char(t)=='i' && *t)
3098 * postprocess_for_DP:
3100 * Invoked with the -d switch from flgets().
3101 * It simply "removes" from the line a hard-coded set of common
3102 * DP-specific tags, so that the line passed to the main routine has
3103 * been pre-cleaned of DP markup.
/*
 * For each entry of DPmarkup[] (the list ends at the first empty string),
 * deletes every occurrence of that entry from theline. The edit is done
 * in place, so theline only ever gets shorter.
 */
3105 void postprocess_for_DP(char *theline)
3111 for (i=0;*DPmarkup[i];i++)
3112 while ((s=strstr(theline,DPmarkup[i])))
/* t marks the end of the match; shift the tail (including its NUL) down over it. */
3114 t=s+strlen(DPmarkup[i]);
3115 memmove(s,t,strlen(t)+1);
3120 * postprocess_for_HTML:
3122 * Invoked with the -m switch from flgets().
3123 * It simply "removes" from the line a hard-coded set of common
3124 * HTML tags and "replaces" a hard-coded set of common HTML
3125 * entities, so that the line passed to the main routine has
3126 * been pre-cleaned of HTML.
/*
 * Calls losemarkup() on theline repeatedly for as long as it returns
 * non-NULL, then calls loseentities() on the same buffer.
 */
3128 void postprocess_for_HTML(char *theline)
3130 while (losemarkup(theline))
3132 loseentities(theline);
/*
 * Locates the first '<' in theline and the first '>' at or after it, then
 * compares the text following '<' against each entry of markup[] (the
 * list ends at the first empty string) using tagcomp().
 * NOTE(review): apparently, on a match the span from '<' through '>' is
 * deleted in place; the return statements are not visible here - confirm.
 */
3135 char *losemarkup(char *theline)
3139 s=strchr(theline,'<');
3140 t=s?strchr(s,'>'):NULL;
3143 for (i=0;*markup[i];i++)
3144 if (tagcomp(g_utf8_next_char(s),markup[i]))
/* Step t past the '>' and shift the tail (including its NUL) down onto the '<'. */
3146 t=g_utf8_next_char(t);
3147 memmove(s,t,strlen(t)+1);
3150 /* It's an unrecognized <xxx>. */
3154 void loseentities(char *theline)
3161 GTree *entities=NULL;
3162 GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
3166 g_tree_destroy(entities);
3168 if (translit==(GIConv)-1)
3169 g_iconv_close(translit);
3170 translit=(GIConv)-1;
3171 if (to_utf8==(GIConv)-1)
3172 g_iconv_close(to_utf8);
3180 entities=g_tree_new((GCompareFunc)strcmp);
3181 for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
3182 g_tree_insert(entities,HTMLentities[i].name,
3183 GUINT_TO_POINTER(HTMLentities[i].c));
3185 if (translit==(GIConv)-1)
3186 translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
3187 if (to_utf8==(GIConv)-1)
3188 to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
3189 while((amp=strchr(theline,'&')))
3191 scolon=strchr(amp,';');
3196 if (amp+2+strspn(amp+2,"0123456789")==scolon)
3197 c=strtol(amp+2,NULL,10);
3198 else if (amp[2]=='x' &&
3199 amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
3200 c=strtol(amp+3,NULL,16);
3204 s=g_strndup(amp+1,scolon-(amp+1));
3205 c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
3214 if (c<128 || c>=192 && c<=255) /* An ISO-8859-1 character */
3215 theline+=g_unichar_to_utf8(c,theline);
3219 nb=g_unichar_to_utf8(c,s);
3220 t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
3222 s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
3224 memcpy(theline,s,nb);
3228 memmove(theline,g_utf8_next_char(scolon),
3229 strlen(g_utf8_next_char(scolon))+1);
3232 theline=g_utf8_next_char(amp);
/*
 * Case-insensitive prefix test: strin (with one leading '/' skipped, if
 * present) and basetag are both passed through g_utf8_casefold(), and
 * retval records whether the folded input starts with the folded tag.
 */
3236 gboolean tagcomp(const char *strin,const char *basetag)
3240 if (g_utf8_get_char(strin)=='/')
3241 t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
3243 t=g_utf8_casefold(strin,-1);
3244 s=g_utf8_casefold(basetag,-1);
3245 retval=g_str_has_prefix(t,s);
3251 void proghelp(GOptionContext *context)
3254 fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
3255 fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
3256 fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
3257 fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
3258 "For details, read the file COPYING.\n",stderr);
3259 fputs("This is Free Software; "
3260 "you may redistribute it under certain conditions (GPL);\n",stderr);
3261 fputs("read the file COPYING for details.\n\n",stderr);
3262 help=g_option_context_get_help(context,TRUE,NULL);
3265 fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
3266 fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
3267 "non-ASCII\n",stderr);
3268 fputs("characters like accented letters, "
3269 "lines longer than 75 or shorter than 55,\n",stderr);
3270 fputs("unbalanced quotes or brackets, "
3271 "a variety of badly formatted punctuation, \n",stderr);
3272 fputs("HTML tags, some likely typos. "
3273 "It is NOT a substitute for human judgement.\n",stderr);