1 /*************************************************************************/
2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com> */
5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk> */
7 /* This program is free software; you can redistribute it and/or modify */
8 /* it under the terms of the GNU General Public License as published by */
9 /* the Free Software Foundation; either version 2 of the License, or */
10 /* (at your option) any later version. */
12 /* This program is distributed in the hope that it will be useful, */
13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
15 /* GNU General Public License for more details. */
17 /* You should have received a copy of the GNU General Public License */
18 /* along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 /*************************************************************************/
30 #include "HTMLentities.h"
36 "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
37 "nad", "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa",
38 "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
39 "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
40 "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
41 "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
42 "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
43 "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
44 "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
45 "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
46 "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
47 "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
48 "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
49 "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
50 "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
51 "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
52 "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
53 "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
54 "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
55 "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
56 "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
57 "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
58 "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
59 "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
60 "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
61 "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
62 "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
63 "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
64 "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
70 /* Common abbreviations and other OK words not to query as typos. */
72 "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
73 "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
74 "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
75 "outbid", "outbids", "frostbite", "frostbitten", ""
78 /* Common abbreviations that cause otherwise unexplained periods. */
80 "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
81 "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
85 * Two-Letter combinations that rarely if ever start words,
86 * but are common scannos or otherwise common letter combinations.
89 "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
93 * Two-Letter combinations that rarely if ever end words,
94 * but are common scannos or otherwise common letter combinations.
97 "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
98 "sw", "gr", "sl", "cl", "iy", ""
102 "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
103 "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
104 "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
105 "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
109 "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
113 "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
114 "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
115 "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
116 "during", "let", "toward", "among", ""
120 "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
121 "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
122 "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
123 "among", "those", "into", "whom", "having", "thence", ""
126 /* special characters */
/* The first group is written as numeric ASCII codes (32 is space, 34 is '"',
   39 is '\'', 96 is '`', 126 is '~', 42 is '*', 47 is '/', 94 is '^'); the
   second group uses character literals. */
127 #define CHAR_SPACE 32
131 #define CHAR_DQUOTE 34
132 #define CHAR_SQUOTE 39
133 #define CHAR_OPEN_SQUOTE 96
134 #define CHAR_TILDE 126
135 #define CHAR_ASTERISK 42
136 #define CHAR_FORESLASH 47
137 #define CHAR_CARAT 94
139 #define CHAR_UNDERSCORE '_'
140 #define CHAR_OPEN_CBRACK '{'
141 #define CHAR_CLOSE_CBRACK '}'
142 #define CHAR_OPEN_RBRACK '('
143 #define CHAR_CLOSE_RBRACK ')'
144 #define CHAR_OPEN_SBRACK '['
145 #define CHAR_CLOSE_SBRACK ']'
147 /* longest and shortest normal PG line lengths */
/* first_pass() counts a line as long above LONGEST_PG_LINE characters and as
   very long above WAY_TOO_LONG. */
148 #define LONGEST_PG_LINE 75
149 #define WAY_TOO_LONG 80
150 #define SHORTEST_PG_LINE 55
/* One flag per command-line switch.  Each options[] entry below makes the
   GLib option parser store its boolean result directly at pswit+<index>.
   parse_options() inverts the PARANOID, LINE_END and ECHO flags after
   parsing, because those switches turn a feature OFF rather than on. */
170 gboolean pswit[SWITNO]; /* program switches */
/* Table of accepted options: long name, short name, flags, argument kind,
   destination, help text, argument description. */
172 static GOptionEntry options[]={
173 { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
174 "Ignore DP-specific markup", NULL },
175 { "noecho", 'e', 0, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
176 "Don't echo queried line", NULL },
177 { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
178 "Check single quotes", NULL },
179 { "typo", 't', 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
180 "Check common typos", NULL },
181 { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
182 "Require closure of quotes on every paragraph", NULL },
183 { "relaxed", 'x', 0, G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
184 "Disable paranoid querying of everything", NULL },
185 { "line-end", 'l', 0, G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
186 "Disable line end checking", NULL },
187 { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
188 "Overview: just show counts", NULL },
189 { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
190 "Output errors to stdout instead of stderr", NULL },
191 { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
192 "Echo header fields", NULL },
193 { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
194 "Ignore markup in < >", NULL },
195 { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
196 "Use file of user-defined typos", NULL },
197 { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
198 "Defaults for use on www upload", NULL },
199 { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
200 "Verbose - list everything", NULL },
/* Global tallies.  main() prints the cnt_* values and their sum as the
   overview report when the overview switch is set; linecnt is also used by
   the individual checks as the line number they report. */
204 long cnt_dquot; /* for overview mode, count of doublequote queries */
205 long cnt_squot; /* for overview mode, count of singlequote queries */
206 long cnt_brack; /* for overview mode, count of brackets queries */
207 long cnt_bin; /* for overview mode, count of non-ASCII queries */
208 long cnt_odd; /* for overview mode, count of odd character queries */
209 long cnt_long; /* for overview mode, count of long line errors */
210 long cnt_short; /* for overview mode, count of short line queries */
211 long cnt_punct; /* for overview mode,
212 count of punctuation and spacing queries */
213 long cnt_dash; /* for overview mode, count of dash-related queries */
214 long cnt_word; /* for overview mode, count of word queries */
215 long cnt_html; /* for overview mode, count of html queries */
216 long cnt_lineend; /* for overview mode, count of line-end queries */
217 long cnt_spacend; /* count of lines with space at end */
218 long linecnt; /* count of total lines in the file */
219 long checked_linecnt; /* count of lines actually checked */
/* Function prototypes.  Most of the corresponding definitions are not
   visible in this part of the file. */
221 void proghelp(GOptionContext *context);
222 void procfile(const char *);
226 gboolean mixdigit(const char *);
227 gchar *getaword(const char **);
228 char *flgets(char **,long);
229 void postprocess_for_HTML(char *);
230 char *linehasmarkup(char *);
231 char *losemarkup(char *);
232 gboolean tagcomp(const char *,const char *);
233 void loseentities(char *);
234 gboolean isroman(const char *);
235 void postprocess_for_DP(char *);
236 void print_as_windows_1252(const char *string);
237 void print_as_utf_8(const char *string);
/* NOTE(review): two GLib trees; they are neither filled nor read on the
   lines visible here - confirm their purpose against the rest of the file. */
239 GTree *qword,*qperiod;
/* Whole-file tallies filled in by first_pass() and read by
   report_first_pass(). */
245 struct first_pass_results {
246 long firstline,astline;
247 long footerline,totlen,binlen,alphalen,endquote_count,shortline,dotcomma;
248 long fslashline,hyphens,longline,verylongline,htmcount,standalone_digit;
249 long spacedash,emdash,space_emdash,non_PG_space_emdash,PG_space_emdash;
250 int Dutchcount,Frenchcount;
/* NOTE(review): the next two lines look like members of struct warnings
   (report_first_pass() sets warnings.shortline and warnings.isDutch); the
   struct header itself is not visible here - confirm. */
254 int shortline,longline,bin,dash,dotcomma,ast,fslash,digit,hyphen;
256 gboolean isDutch,isFrench;
/* NOTE(review): presumably members of struct counters (analyse_quotes()
   updates counters->c_unders and counters->open_single_quote) - confirm. */
261 int c_unders,c_brack,s_brack,r_brack;
262 int open_single_quote,close_single_quote;
/* Properties of a previously examined line; check_for_short_line() reads
   last->len, last->blen and last->start. */
265 struct line_properties {
266 unsigned int len,blen;
/* NOTE(review): the enclosing struct of the pointers below is not visible. */
275 char *dquote,*squote,*rbrack,*sbrack,*cbrack,*unders;
/*
 * parse_options: run the GLib option parser over the command line (argc and
 * argv are passed by address so the parser can update them), storing each
 * switch in pswit[] via the options[] table.  A parse failure is reported
 * with g_printerr().  Afterwards the flags are normalised:
 *   - PARANOID, LINE_END and ECHO are inverted, since their switches
 *     disable the feature;
 *   - when paranoid mode ends up enabled, TYPO is inverted too, so typo
 *     checking defaults to on;
 *   - OVERVIEW forces ECHO off;
 *   - WEB overrides every switch with a fixed set of values.
 */
279 void parse_options(int *argc,char ***argv)
282 GOptionContext *context;
283 context=g_option_context_new(
284 "file - looks for errors in Project Gutenberg(TM) etexts");
285 g_option_context_add_main_entries(context,options,NULL);
286 if (!g_option_context_parse(context,argc,argv,&err))
288 g_printerr("Bookloupe: %s\n",err->message);
289 g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
292 /* Paranoid checking is turned OFF, not on, by its switch */
293 pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
294 if (pswit[PARANOID_SWITCH])
295 /* if running in paranoid mode, typo checks default to enabled */
296 pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
297 /* Line-end checking is turned OFF, not on, by its switch */
298 pswit[LINE_END_SWITCH]=!pswit[LINE_END_SWITCH];
299 /* Echoing is turned OFF, not on, by its switch */
300 pswit[ECHO_SWITCH]=!pswit[ECHO_SWITCH];
301 if (pswit[OVERVIEW_SWITCH])
302 /* just print summary; don't echo */
303 pswit[ECHO_SWITCH]=FALSE;
305 * Web uploads - for the moment, this is really just a placeholder
306 * until we decide what processing we really want to do on web uploads
308 if (pswit[WEB_SWITCH])
310 /* specific override for web uploads */
311 pswit[ECHO_SWITCH]=TRUE;
312 pswit[SQUOTE_SWITCH]=FALSE;
313 pswit[TYPO_SWITCH]=TRUE;
314 pswit[QPARA_SWITCH]=FALSE;
315 pswit[PARANOID_SWITCH]=TRUE;
316 pswit[LINE_END_SWITCH]=FALSE;
317 pswit[OVERVIEW_SWITCH]=FALSE;
318 pswit[STDOUT_SWITCH]=FALSE;
319 pswit[HEADER_SWITCH]=TRUE;
320 pswit[VERBOSE_SWITCH]=FALSE;
321 pswit[MARKUP_SWITCH]=FALSE;
322 pswit[USERTYPO_SWITCH]=FALSE;
323 pswit[DP_SWITCH]=FALSE;
330 g_option_context_free(context);
336 * Read in the user-defined stealth scanno list.
/*
 * read_user_scannos: load the user typo list into the usertypo tree.
 *
 * Four locations are tried in turn, moving on only when the previous attempt
 * failed with "no such file": the bare name "bookloupe.typ", the same name
 * inside the running_from directory, then "gutcheck.typ" in the same two
 * places.  If none exists a notice is printed; any other error is reported
 * on stderr together with the file name.
 *
 * File contents that validate as UTF-8 are normalised to composed form;
 * anything else is converted from Windows-1252.  The text is split on CR and
 * LF, and every line whose first byte is greater than '!' becomes a key in
 * the tree (the tree frees its keys with g_free).
 *
 * NOTE(review): the result of g_convert() is not checked on the lines
 * visible here; a failed conversion would hand NULL to g_strsplit_set().
 */
338 void read_user_scannos(void)
341 gchar *usertypo_file;
345 gchar *contents,*utf8,**lines;
346 usertypo_file=g_strdup("bookloupe.typ");
347 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
348 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
351 g_free(usertypo_file);
352 usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
353 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
355 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
358 g_free(usertypo_file);
359 usertypo_file=g_strdup("gutcheck.typ");
360 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
362 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
365 g_free(usertypo_file);
366 usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
367 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
369 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
371 g_free(usertypo_file);
372 g_print(" --> I couldn't find bookloupe.typ "
373 "-- proceeding without user typos.\n");
378 fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
379 g_free(usertypo_file);
383 if (g_utf8_validate(contents,len,NULL))
384 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
386 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
388 lines=g_strsplit_set(utf8,"\r\n",0);
/* keys are compared bytewise with strcmp; the stored value is a dummy 1 */
390 usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
391 for (i=0;lines[i];i++)
392 if (*(unsigned char *)lines[i]>'!')
393 g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
402 * Read an etext returning a newly allocated string containing the file
403 * contents or NULL on error.
/*
 * read_etext: load filename and return its text as UTF-8.
 *
 * Contents that validate as UTF-8 are normalised to composed form and the
 * GLib print handler is set to print_as_utf_8.  Otherwise the bytes are
 * converted from Windows-1252 and the print handler is set to
 * print_as_windows_1252.  If that conversion fails with an illegal-sequence
 * error, the bytes up to the failure point (bytes_read) are scanned so the
 * error stored in *err can name the offending byte value with a line and
 * column; other conversion errors are propagated unchanged.
 *
 * NOTE(review): the line/column bookkeeping inside the scan loop and the
 * platform guards around SetConsoleOutputCP() are not visible here.
 */
405 gchar *read_etext(const char *filename,GError **err)
407 GError *tmp_err=NULL;
408 gchar *contents,*utf8;
409 gsize len,bytes_read,bytes_written;
411 if (!g_file_get_contents(filename,&contents,&len,err))
413 if (g_utf8_validate(contents,len,NULL))
415 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
416 g_set_print_handler(print_as_utf_8);
418 SetConsoleOutputCP(CP_UTF8);
423 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,
424 &bytes_written,&tmp_err);
425 if (g_error_matches(tmp_err,G_CONVERT_ERROR,
426 G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
429 for(i=0;i<bytes_read;i++)
430 if (contents[i]=='\n')
435 else if (contents[i]!='\r')
437 g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
438 "Input conversion failed. Byte %d at line %d, column %d is not a "
439 "valid Windows-1252 character",
440 ((unsigned char *)contents)[bytes_read],line,col);
443 g_propagate_error(err,tmp_err);
444 g_set_print_handler(print_as_windows_1252);
446 SetConsoleOutputCP(1252);
/* Registered with atexit() in main(): puts back the console output code
   page that main() saved in saved_cp before read_etext() could change it. */
453 void cleanup_on_exit(void)
456 SetConsoleOutputCP(saved_cp);
/*
 * main: program entry point.  Registers cleanup_on_exit(), records the
 * directory the program was started from (used when looking for the user
 * typo file), parses the options and prints a banner on stderr.  In overview
 * mode it prints the per-category query counts and their total once
 * processing is done, then releases running_from and the usertypo tree.
 *
 * NOTE(review): the calls that actually read the typo file and process the
 * input are not visible in this copy of the function.
 */
460 int main(int argc,char **argv)
463 atexit(cleanup_on_exit);
464 saved_cp=GetConsoleOutputCP();
466 running_from=g_path_get_dirname(argv[0]);
467 parse_options(&argc,&argv);
468 if (pswit[USERTYPO_SWITCH])
470 fprintf(stderr,"bookloupe: Check and report on an e-text\n");
472 if (pswit[OVERVIEW_SWITCH])
474 g_print(" Checked %ld lines of %ld (head+foot = %ld)\n\n",
475 checked_linecnt,linecnt,linecnt-checked_linecnt);
476 g_print(" --------------- Queries found --------------\n");
478 g_print(" Long lines: %14ld\n",cnt_long);
480 g_print(" Short lines: %14ld\n",cnt_short);
482 g_print(" Line-end problems: %14ld\n",cnt_lineend);
484 g_print(" Common typos: %14ld\n",cnt_word);
486 g_print(" Unmatched quotes: %14ld\n",cnt_dquot);
488 g_print(" Unmatched SingleQuotes: %14ld\n",cnt_squot);
490 g_print(" Unmatched brackets: %14ld\n",cnt_brack);
492 g_print(" Non-ASCII characters: %14ld\n",cnt_bin);
494 g_print(" Proofing characters: %14ld\n",cnt_odd);
496 g_print(" Punctuation & spacing queries: %14ld\n",cnt_punct);
498 g_print(" Non-standard dashes: %14ld\n",cnt_dash);
500 g_print(" Possible HTML tags: %14ld\n",cnt_html);
/* the total is the sum of the twelve category counters printed above */
502 g_print(" TOTAL QUERIES %14ld\n",
503 cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+
504 cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend);
506 g_free(running_from);
508 g_tree_unref(usertypo);
515 * Run a first pass - verify that it's a valid PG
516 * file, decide whether to report some things that
517 * occur many times in the text like long or short
518 * lines, non-standard dashes, etc.
/*
 * first_pass: split etext on '\n', strip trailing '\r' from every line and
 * accumulate whole-file statistics in a function-static first_pass_results:
 * where the PG header ends and the footer starts, character totals
 * (all / above 127 / alphabetic), unpunctuated end-quotes, short, long and
 * very long lines, lines containing ".,", '*' or '/', trailing hyphens,
 * HTML-looking markup, spaced and unspaced dashes, and marker words used to
 * guess Dutch or French text.  Lines after a detected footer are not
 * counted.
 *
 * The end-quote test uses g_unichar_isalpha(): the value tested is a full
 * Unicode code point, for which the <ctype.h> isalpha() is undefined outside
 * the unsigned char range.
 */
520 struct first_pass_results *first_pass(const char *etext)
522 gunichar laststart=CHAR_SPACE;
527 unsigned int lastlen=0,lastblen=0;
528 long spline=0,nspline=0;
529 static struct first_pass_results results={0};
531 lines=g_strsplit(etext,"\n",0);
532 for (j=0;lines[j];j++)
/* lbytes is the byte length, llen the length in characters */
534 lbytes=strlen(lines[j]);
535 while (lbytes>0 && lines[j][lbytes-1]=='\r')
536 lines[j][--lbytes]='\0';
537 llen=g_utf8_strlen(lines[j],lbytes);
539 if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
540 (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
543 g_print(" --> Duplicate header?\n");
544 spline=linecnt+1; /* first line of non-header text, that is */
546 if (!strncmp(lines[j],"*** START",9) &&
547 strstr(lines[j],"PROJECT GUTENBERG"))
550 g_print(" --> Duplicate header?\n");
551 nspline=linecnt+1; /* first line of non-header text, that is */
553 if (spline || nspline)
555 lc_line=g_utf8_strdown(lines[j],lbytes);
556 if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
558 if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
560 if (results.footerline)
562 /* it's an old-form header - we can detect duplicates */
564 g_print(" --> Duplicate footer?\n");
567 results.footerline=linecnt;
573 results.firstline=spline;
575 results.firstline=nspline; /* override with new */
576 if (results.footerline)
577 continue; /* don't count the boilerplate in the footer */
578 results.totlen+=llen;
579 for (s=lines[j];*s;s=g_utf8_next_char(s))
581 if (g_utf8_get_char(s)>127)
583 if (g_unichar_isalpha(g_utf8_get_char(s)))
/* a double quote directly after a letter: the preceding character may be
   any code point, so use the Unicode-aware classifier */
585 if (s>lines[j] && g_utf8_get_char(s)==CHAR_DQUOTE &&
586 g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
587 results.endquote_count++;
589 if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
590 lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
593 g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
595 if (strstr(lines[j],".,"))
597 /* only count ast lines for ignoring purposes where there is */
598 /* locase text on the line */
599 if (strchr(lines[j],'*'))
601 for (s=lines[j];*s;s=g_utf8_next_char(s))
602 if (g_unichar_islower(g_utf8_get_char(s)))
607 if (strchr(lines[j],'/'))
608 results.fslashline++;
611 for (s=g_utf8_prev_char(lines[j]+lbytes);
612 s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;
613 s=g_utf8_prev_char(s))
615 if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
616 g_utf8_get_char(g_utf8_prev_char(s))!='-')
619 if (llen>LONGEST_PG_LINE)
621 if (llen>WAY_TOO_LONG)
622 results.verylongline++;
623 if (strchr(lines[j],'<') && strchr(lines[j],'>'))
625 i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
628 if (strstr(lines[j],"<i>"))
629 results.htmcount+=4; /* bonus marks! */
631 /* Check for spaced em-dashes */
632 if (lines[j][0] && (s=strstr(g_utf8_next_char(lines[j]),"--")))
635 if (s[-1]==CHAR_SPACE || s[2]==CHAR_SPACE)
636 results.space_emdash++;
637 if (s[-1]==CHAR_SPACE && s[2]==CHAR_SPACE)
638 /* count of em-dashes with spaces both sides */
639 results.non_PG_space_emdash++;
640 if (s[-1]!=CHAR_SPACE && s[2]!=CHAR_SPACE)
641 /* count of PG-type em-dashes with no spaces */
642 results.PG_space_emdash++;
647 if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))
648 results.Dutchcount++;
649 if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))
650 results.Frenchcount++;
651 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
652 results.standalone_digit++;
655 /* Check for spaced dashes */
656 if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
660 laststart=lines[j][0];
669 * Make some snap decisions based on the first pass results.
/*
 * report_first_pass: turn the first-pass tallies into a function-static
 * struct warnings.  For each kind of query that occurs very often in the
 * text, a note is printed saying that the query will not be reported.
 * The function also:
 *   - switches MARKUP mode on when the text looks like HTML;
 *   - announces termination when the text looks non-ASCII or non-textual;
 *   - sets isDutch / isFrench from the marker-word counts;
 *   - reports which of the PG header and footer were found;
 *   - resets results->firstline when header and footer are fewer than 100
 *     lines apart.
 * The comment on the standalone-digit threshold says 10, matching the test
 * below it.
 */
671 struct warnings *report_first_pass(struct first_pass_results *results)
673 static struct warnings warnings={0};
675 g_print(" --> %ld lines in this file have white space at end\n",
678 if (results->dotcomma>5)
681 g_print(" --> %ld lines in this file contain '.,'. "
682 "Not reporting them.\n",results->dotcomma);
685 * If more than 50 lines, or one-tenth, are short,
686 * don't bother reporting them.
688 warnings.shortline=1;
689 if (results->shortline>50 || results->shortline*10>linecnt)
691 warnings.shortline=0;
692 g_print(" --> %ld lines in this file are short. "
693 "Not reporting short lines.\n",results->shortline);
696 * If more than 50 lines, or one-tenth, are long,
697 * don't bother reporting them.
700 if (results->longline>50 || results->longline*10>linecnt)
703 g_print(" --> %ld lines in this file are long. "
704 "Not reporting long lines.\n",results->longline);
706 /* If more than 10 lines contain asterisks, don't bother reporting them. */
708 if (results->astline>10)
711 g_print(" --> %ld lines in this file contain asterisks. "
712 "Not reporting them.\n",results->astline);
715 * If more than 10 lines contain forward slashes,
716 * don't bother reporting them.
719 if (results->fslashline>10)
722 g_print(" --> %ld lines in this file contain forward slashes. "
723 "Not reporting them.\n",results->fslashline);
726 * If more than 20 lines contain unpunctuated endquotes,
727 * don't bother reporting them.
730 if (results->endquote_count>20)
733 g_print(" --> %ld lines in this file contain unpunctuated endquotes. "
734 "Not reporting them.\n",results->endquote_count);
737 * If more than 10 lines contain standalone digits,
738 * don't bother reporting them.
741 if (results->standalone_digit>10)
744 g_print(" --> %ld lines in this file contain standalone 0s and 1s. "
745 "Not reporting them.\n",results->standalone_digit);
748 * If more than 20 lines contain hyphens at end,
749 * don't bother reporting them.
752 if (results->hyphens>20)
755 g_print(" --> %ld lines in this file have hyphens at end. "
756 "Not reporting them.\n",results->hyphens);
758 if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
760 g_print(" --> Looks like this is HTML. Switching HTML mode ON.\n");
761 pswit[MARKUP_SWITCH]=1;
763 if (results->verylongline>0)
764 g_print(" --> %ld lines in this file are VERY long!\n",
765 results->verylongline);
767 * If there are more non-PG spaced dashes than PG em-dashes,
768 * assume it's deliberate.
769 * Current PG guidelines say don't use them, but older texts do,
770 * and some people insist on them whatever the guidelines say.
773 if (results->spacedash+results->non_PG_space_emdash>
774 results->PG_space_emdash)
777 g_print(" --> There are %ld spaced dashes and em-dashes. "
778 "Not reporting them.\n",
779 results->spacedash+results->non_PG_space_emdash);
781 /* If more than a quarter of characters are hi-bit, bug out. */
783 if (results->binlen*4>results->totlen)
785 g_print(" --> This file does not appear to be ASCII. "
786 "Terminating. Best of luck with it!\n");
789 if (results->alphalen*4<results->totlen)
791 g_print(" --> This file does not appear to be text. "
792 "Terminating. Best of luck with it!\n");
795 if (results->binlen*100>results->totlen || results->binlen>100)
797 g_print(" --> There are a lot of foreign letters here. "
798 "Not reporting them.\n");
801 warnings.isDutch=FALSE;
802 if (results->Dutchcount>50)
804 warnings.isDutch=TRUE;
805 g_print(" --> This looks like Dutch - "
806 "switching off dashes and warnings for 's Middags case.\n");
808 warnings.isFrench=FALSE;
809 if (results->Frenchcount>50)
811 warnings.isFrench=TRUE;
812 g_print(" --> This looks like French - "
813 "switching off some doublepunct.\n");
815 if (results->firstline && results->footerline)
816 g_print(" The PG header and footer appear to be already on.\n");
819 if (results->firstline)
820 g_print(" The PG header is on - no footer.\n");
821 if (results->footerline)
822 g_print(" The PG footer is on - no header.\n");
825 if (pswit[VERBOSE_SWITCH])
828 warnings.shortline=1;
837 g_print(" *** Verbose output is ON -- you asked for it! ***\n");
839 if (warnings.isDutch)
841 if (results->footerline>0 && results->firstline>0 &&
842 results->footerline>results->firstline &&
843 results->footerline-results->firstline<100)
845 g_print(" --> I don't really know where this text starts. \n");
846 g_print(" There are no reference points.\n");
847 g_print(" I'm going to have to report the header and footer "
849 results->firstline=0;
857 * Look along the line, accumulate the count of quotes, and see
858 * if this is an empty line - i.e. a line with nothing on it
860 * If line has just spaces, period, * and/or - on it, don't
861 * count it, since empty lines with asterisks or dashes to
862 * separate sections are common.
864 * Returns: TRUE if the line is empty.
/*
 * analyse_quotes: walk aline one UTF-8 character at a time.
 *
 * For every ' or ` the neighbouring characters decide whether it counts as
 * an opening single quote (counters->open_single_quote), a closing one
 * (counters->close_single_quote) or a plain apostrophe.  A quote followed by
 * "tis"/"Tis" is never counted as an opening quote.  Underscores and the
 * three bracket kinds are tallied as well.  isemptyline starts out TRUE and
 * is cleared by characters other than the spacer characters tested below.
 *
 * NOTE(review): strchr() converts its second argument to char, so a code
 * point above 255 taken from g_utf8_get_char() is truncated before the
 * punctuation comparison - confirm whether g_utf8_strchr() was intended.
 */
866 gboolean analyse_quotes(const char *aline,struct counters *counters)
869 /* assume the line is empty until proven otherwise */
870 gboolean isemptyline=TRUE;
871 const char *s=aline,*sprev,*snext;
876 snext=g_utf8_next_char(s);
877 c=g_utf8_get_char(s);
880 if (c==CHAR_SQUOTE || c==CHAR_OPEN_SQUOTE)
885 * At start of line, it can only be an openquote.
886 * Hardcode a very common exception!
888 if (!g_str_has_prefix(snext,"tis") &&
889 !g_str_has_prefix(snext,"Tis"))
890 counters->open_single_quote++;
892 else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
893 g_unichar_isalpha(g_utf8_get_char(snext)))
894 /* Do nothing! it's definitely an apostrophe, not a quote */
896 /* it's outside a word - let's check it out */
897 else if (c==CHAR_OPEN_SQUOTE ||
898 g_unichar_isalpha(g_utf8_get_char(snext)))
900 /* it damwell better BE an openquote */
901 if (!g_str_has_prefix(snext,"tis") &&
902 !g_str_has_prefix(snext,"Tis"))
903 /* hardcode a very common exception! */
904 counters->open_single_quote++;
908 /* now - is it a closequote? */
909 guessquote=0; /* accumulate clues */
910 if (g_unichar_isalpha(g_utf8_get_char(sprev)))
912 /* it follows a letter - could be either */
914 if (g_utf8_get_char(sprev)=='s')
916 /* looks like a plural apostrophe */
918 if (g_utf8_get_char(snext)==CHAR_SPACE)
923 /* it doesn't have a letter either side */
924 else if (strchr(".?!,;:",g_utf8_get_char(sprev)) &&
925 strchr(".?!,;: ",g_utf8_get_char(snext)))
926 guessquote+=8; /* looks like a closequote */
929 if (counters->open_single_quote>counters->close_single_quote)
931 * Give it the benefit of some doubt,
932 * if a squote is already open.
938 counters->close_single_quote++;
941 if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
943 isemptyline=FALSE; /* ignore lines like * * * as spacers */
944 if (c==CHAR_UNDERSCORE)
945 counters->c_unders++;
946 if (c==CHAR_OPEN_CBRACK)
948 if (c==CHAR_CLOSE_CBRACK)
950 if (c==CHAR_OPEN_RBRACK)
952 if (c==CHAR_CLOSE_RBRACK)
954 if (c==CHAR_OPEN_SBRACK)
956 if (c==CHAR_CLOSE_SBRACK)
965 * check_for_control_characters:
967 * Check for invalid or questionable characters in the line
968 * Anything above 127 is invalid for plain ASCII, and
969 * non-printable control characters should also be flagged.
970 * Tabs should generally not be there.
/*
 * check_for_control_characters: scan aline character by character and query
 * every code point below space other than LF, CR and TAB.  The line is
 * echoed first when echoing is on, and the query itself is printed unless
 * overview mode is on.
 *
 * The column is the character offset of s from the start of aline, so
 * g_utf8_pointer_to_offset() must be given aline first and s second; with
 * the arguments the other way round the offset comes out negative.
 */
972 void check_for_control_characters(const char *aline)
976 for (s=aline;*s;s=g_utf8_next_char(s))
978 c=g_utf8_get_char(s);
979 if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
981 if (pswit[ECHO_SWITCH])
982 g_print("\n%s\n",aline);
983 if (!pswit[OVERVIEW_SWITCH])
984 g_print(" Line %ld column %ld - Control character %u\n",
985 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
993 * check_for_odd_characters:
995 * Check for binary and other odd characters.
/*
 * check_for_odd_characters: scan aline for characters that rarely belong in
 * a finished text: control or non-ASCII characters, tabs, tildes, carets,
 * forward slashes (only when warnings->fslash is set) and asterisks (only in
 * paranoid mode, when warnings->ast is set and the line is not an "empty"
 * spacer line).  The e* flags exist so that each kind is queried at most
 * once per line.  Each query echoes the line when echoing is on and prints
 * its message unless overview mode is on.
 */
997 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
998 gboolean isemptyline)
1000 /* Don't repeat multiple warnings on one line. */
1001 gboolean eNon_A=FALSE,eTab=FALSE,eTilde=FALSE;
1002 gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
1005 for (s=aline;*s;s=g_utf8_next_char(s))
1007 c=g_utf8_get_char(s);
1008 if (!eNon_A && (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
1010 if (pswit[ECHO_SWITCH])
1011 g_print("\n%s\n",aline);
1012 if (!pswit[OVERVIEW_SWITCH])
/* code points 128-159 or above 255 are worded as non-ISO-8859; any other
   match is worded as non-ASCII */
1013 if (c>127 && c<160 || c>255)
1014 g_print(" Line %ld column %ld - "
1015 "Non-ISO-8859 character %u\n",
1016 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1018 g_print(" Line %ld column %ld - "
1019 "Non-ASCII character %u\n",
1020 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1025 if (!eTab && c==CHAR_TAB)
1027 if (pswit[ECHO_SWITCH])
1028 g_print("\n%s\n",aline);
1029 if (!pswit[OVERVIEW_SWITCH])
1030 g_print(" Line %ld column %ld - Tab character?\n",
1031 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1036 if (!eTilde && c==CHAR_TILDE)
1039 * Often used by OCR software to indicate an
1040 * unrecognizable character.
1042 if (pswit[ECHO_SWITCH])
1043 g_print("\n%s\n",aline);
1044 if (!pswit[OVERVIEW_SWITCH])
1045 g_print(" Line %ld column %ld - Tilde character?\n",
1046 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1051 if (!eCarat && c==CHAR_CARAT)
1053 if (pswit[ECHO_SWITCH])
1054 g_print("\n%s\n",aline);
1055 if (!pswit[OVERVIEW_SWITCH])
1056 g_print(" Line %ld column %ld - Carat character?\n",
1057 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1062 if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
1064 if (pswit[ECHO_SWITCH])
1065 g_print("\n%s\n",aline);
1066 if (!pswit[OVERVIEW_SWITCH])
1067 g_print(" Line %ld column %ld - Forward slash?\n",
1068 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1074 * Report asterisks only in paranoid mode,
1075 * since they're often deliberate.
1077 if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
1080 if (pswit[ECHO_SWITCH])
1081 g_print("\n%s\n",aline);
1082 if (!pswit[OVERVIEW_SWITCH])
1083 g_print(" Line %ld column %ld - Asterisk?\n",
1084 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1093 * check_for_long_line:
1095 * Check for line too long.
/* Query aline when it holds more than LONGEST_PG_LINE characters (counted
   as UTF-8 characters, not bytes).  The line length is printed twice: once
   as the column and once as the reported length. */
1097 void check_for_long_line(const char *aline)
1099 if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
1101 if (pswit[ECHO_SWITCH])
1102 g_print("\n%s\n",aline);
1103 if (!pswit[OVERVIEW_SWITCH])
1104 g_print(" Line %ld column %ld - Long line %ld\n",
1105 linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
1112 * check_for_short_line:
1114 * Check for line too short.
1116 * This one is a bit trickier to implement: we don't want to
1117 * flag the last line of a paragraph for being short, so we
1118 * have to wait until we know that our current line is a
1119 * "normal" line, then report the _previous_ line if it was too
1120 * short. We also don't want to report indented lines like
1121 * chapter heads or formatted quotations. We therefore keep
1122 * last->len as the length of the last line examined, and
1123 * last->blen as the length of the last but one, and try to
1124 * suppress unnecessary warnings by checking that both were of
1125 * "normal" length. We keep the first character of the last
1126 * line in last->start, and if it was a space, we assume that
1127 * the formatting is deliberate. I can't figure out a way to
1128 * distinguish something like a quoted verse left-aligned or
1129 * the header or footer of a letter from a paragraph of short
1130 * lines - maybe if I examined the whole paragraph, and if the
1131 * para has less than, say, 8 lines and if all lines are short,
1132 * then just assume it's OK? Need to look at some texts to see
1133 * how often a formula like this would get the right result.
/* The query is raised against the PREVIOUS line: the text echoed and
   measured is the file-level prevline (not a parameter) and the line number
   printed is linecnt-1.  It fires only when the current line has more than
   one character, the last line was shorter than SHORTEST_PG_LINE, the one
   before it was longer, and the last line did not start with a space. */
1135 void check_for_short_line(const char *aline,const struct line_properties *last)
1137 if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
1138 last->len<SHORTEST_PG_LINE && last->blen>1 &&
1139 last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
1141 if (pswit[ECHO_SWITCH])
1142 g_print("\n%s\n",prevline);
1143 if (!pswit[OVERVIEW_SWITCH])
1144 g_print(" Line %ld column %ld - Short line %ld?\n",
1145 linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
1152 * check_for_starting_punctuation:
1154 * Look for punctuation other than full ellipses at start of line.
/* Query a non-empty line whose first character is one of . ? ! , ; :
   unless the line begins with the spaced ellipsis ". . .". */
1156 void check_for_starting_punctuation(const char *aline)
1158 if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
1159 !g_str_has_prefix(aline,". . ."))
1161 if (pswit[ECHO_SWITCH])
1162 g_print("\n%s\n",aline);
1163 if (!pswit[OVERVIEW_SWITCH])
1164 g_print(" Line %ld column 1 - Begins with punctuation?\n",
1172 * check_for_spaced_emdash:
1174 * Check for spaced em-dashes.
1176 * We must check _all_ occurrences of "--" on the line
1177 * hence the loop - even if the first double-dash is OK
1178 * there may be another that's wrong later on.
/* For every "--" in aline, query it when the character just before it
   (if any) or the character just after it is a space.  The search resumes
   immediately after each "--" found, and the reported column is that of the
   first dash. */
1180 void check_for_spaced_emdash(const char *aline)
1182 const char *s,*t,*next;
1183 for (s=aline;t=strstr(s,"--");s=next)
/* next points just past the two dashes */
1185 next=g_utf8_next_char(g_utf8_next_char(t));
1186 if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
1187 g_utf8_get_char(next)==CHAR_SPACE)
1189 if (pswit[ECHO_SWITCH])
1190 g_print("\n%s\n",aline);
1191 if (!pswit[OVERVIEW_SWITCH])
1192 g_print(" Line %ld column %ld - Spaced em-dash?\n",
1193 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1201 * check_for_spaced_dash:
1203 * Check for spaced dashes.
/*
 * Looks for a single dash with a space on one side. The first " -" is
 * reported unless the character after the dash is another '-'. Only when
 * the line contains no " -" at all is the first "- " considered; it is
 * reported unless it is preceded by '-'.
 * NOTE(review): only the first occurrence of each pattern is examined, and
 * the "- " search is an else-branch. A line whose first " -" belongs to
 * " --" is therefore never checked for a later spaced dash. Confirm
 * whether that is intended (compare the loop in check_for_spaced_emdash).
 * NOTE(review): the declaration of s is not visible in this listing.
 */
1205 void check_for_spaced_dash(const char *aline)
1208 if ((s=strstr(aline," -")))
1210 if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
1212 if (pswit[ECHO_SWITCH])
1213 g_print("\n%s\n",aline);
1214 if (!pswit[OVERVIEW_SWITCH])
1215 g_print(" Line %ld column %ld - Spaced dash?\n",
1216 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1221 else if ((s=strstr(aline,"- ")))
1223 if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
1225 if (pswit[ECHO_SWITCH])
1226 g_print("\n%s\n",aline);
1227 if (!pswit[OVERVIEW_SWITCH])
1228 g_print(" Line %ld column %ld - Spaced dash?\n",
1229 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1237 * check_for_unmarked_paragraphs:
1239 * Check for unmarked paragraphs indicated by separate speakers.
1241 * May well be false positive:
1242 * "Bravo!" "Wonderful!" called the crowd.
1243 * but useful all the same.
/*
 * Reports a closing double quote followed by a space and an opening double
 * quote on the same line, which may mean a paragraph break between two
 * speakers was lost. The column is the 1-based position of the first quote.
 * NOTE(review): as listed, both strstr calls search for the very same
 * literal, so the second can never find anything the first missed.
 * Presumably one of them was meant to match two spaces between the quotes -
 * confirm that whitespace inside the literal has not been collapsed.
 * NOTE(review): the declaration of s and the tests on s are not visible in
 * this listing.
 */
1245 void check_for_unmarked_paragraphs(const char *aline)
1248 s=strstr(aline,"\" \"");
1250 s=strstr(aline,"\" \"");
1253 if (pswit[ECHO_SWITCH])
1254 g_print("\n%s\n",aline);
1255 if (!pswit[OVERVIEW_SWITCH])
1256 g_print(" Line %ld column %ld - "
1257 "Query missing paragraph break?\n",
1258 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1265 * check_for_jeebies:
1267 * Check for "to he" and other easy h/b errors.
1269 * This is a very inadequate effort on the h/b problem,
1270 * but the phrase "to he" is always an error, whereas "to
1271 * be" is quite common.
1272 * Similarly, '"Quiet!", be said.' is a non-be error
1273 * "to he" is _not_ always an error!:
1274 * "Where they went to he couldn't say."
1275 * Another false positive:
1276 * What would "Cinderella" be without the . . .
1277 * and another: "If he wants to he can see for himself."
/*
 * Searches the line for fixed phrases that usually indicate an h/b scanning
 * error, in three groups with three different messages: he/be, had/bad and
 * hut/but. Every pattern is a plain strstr match, and most require a space
 * on both sides, so a phrase at the very start or end of the line is not
 * matched. The reported column is that of the first character of the
 * matched pattern (normally the leading space or punctuation).
 * NOTE(review): the patterns on original lines 1294 and 1296 are identical
 * as listed; presumably one had two spaces after the quote - confirm that
 * whitespace in the literal has not been collapsed.
 * NOTE(review): all patterns are lower case, including " i bad ". Whether
 * the caller passes a lower-cased copy of the line is not visible here.
 * NOTE(review): the declaration of s and the tests that chain the searches
 * are not visible in this listing.
 */
1279 void check_for_jeebies(const char *aline)
1282 s=strstr(aline," be could ");
1284 s=strstr(aline," be would ");
1286 s=strstr(aline," was be ");
1288 s=strstr(aline," be is ");
1290 s=strstr(aline," is be ");
1292 s=strstr(aline,"\", be ");
1294 s=strstr(aline,"\" be ");
1296 s=strstr(aline,"\" be ");
1298 s=strstr(aline," to he ");
1301 if (pswit[ECHO_SWITCH])
1302 g_print("\n%s\n",aline);
1303 if (!pswit[OVERVIEW_SWITCH])
1304 g_print(" Line %ld column %ld - Query he/be error?\n",
1305 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1309 s=strstr(aline," the had ");
1311 s=strstr(aline," a had ");
1313 s=strstr(aline," they bad ");
1315 s=strstr(aline," she bad ");
1317 s=strstr(aline," he bad ");
1319 s=strstr(aline," you bad ");
1321 s=strstr(aline," i bad ");
1324 if (pswit[ECHO_SWITCH])
1325 g_print("\n%s\n",aline);
1326 if (!pswit[OVERVIEW_SWITCH])
1327 g_print(" Line %ld column %ld - Query had/bad error?\n",
1328 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1332 s=strstr(aline,"; hut ");
1334 s=strstr(aline,", hut ");
1337 if (pswit[ECHO_SWITCH])
1338 g_print("\n%s\n",aline);
1339 if (!pswit[OVERVIEW_SWITCH])
1340 g_print(" Line %ld column %ld - Query hut/but error?\n",
1341 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1348 * check_for_mta_from:
1350 * Special case - angled bracket in front of "From" placed there by an
1351 * MTA when sending an e-mail.
/*
 * Reports the first ">From" found anywhere on the line (the match is not
 * restricted to the start of the line). The column is the 1-based
 * character position of the '>'.
 * NOTE(review): the declaration of s and the test on s are not visible in
 * this listing.
 */
1353 void check_for_mta_from(const char *aline)
1356 s=strstr(aline,">From");
1359 if (pswit[ECHO_SWITCH])
1360 g_print("\n%s\n",aline);
1361 if (!pswit[OVERVIEW_SWITCH])
1362 g_print(" Line %ld column %ld - "
1363 "Query angled bracket with From\n",
1364 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1371 * check_for_orphan_character:
1373 * Check for a single character line -
1374 * often an overflow from bad wrapping.
/*
 * Handles a line that consists of exactly one character: c is non-zero and
 * the character after it is the terminating NUL. The capitals I, V, X and L
 * and any Unicode digit are accepted silently as numerals; anything else is
 * reported at column 1.
 * NOTE(review): the declaration of c, the else keyword and the last
 * argument line of the g_print call are not visible in this listing.
 */
1376 void check_for_orphan_character(const char *aline)
1379 c=g_utf8_get_char(aline);
1380 if (c && !*g_utf8_next_char(aline))
1382 if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
1383 ; /* Nothing - ignore numerals alone on a line. */
1386 if (pswit[ECHO_SWITCH])
1387 g_print("\n%s\n",aline);
1388 if (!pswit[OVERVIEW_SWITCH])
1389 g_print(" Line %ld column 1 - Query single character line\n",
1398 * check_for_pling_scanno:
1400 * Check for I" - often should be !
/*
 * Reports the first occurrence of a space, a capital I and a double quote
 * in sequence; the I is often a misread exclamation mark.
 * NOTE(review): s points at the leading space of the match and the column
 * is printed as g_utf8_pointer_to_offset(aline,s) with no +1, whereas the
 * neighbouring checks add 1. Confirm which character the column is meant
 * to identify.
 * NOTE(review): the declaration of s and the test on s are not visible in
 * this listing.
 */
1402 void check_for_pling_scanno(const char *aline)
1405 s=strstr(aline," I\"");
1408 if (pswit[ECHO_SWITCH])
1409 g_print("\n%s\n",aline);
1410 if (!pswit[OVERVIEW_SWITCH])
1411 g_print(" Line %ld column %ld - Query I=exclamation mark?\n",
1412 linecnt,g_utf8_pointer_to_offset(aline,s));
1419 * check_for_extra_period:
1421 * Check for period without a capital letter. Cut-down from gutspell.
1422 * Only works when it happens on a single line.
/*
 * Runs only when PARANOID_SWITCH is set. For each ". " on the line that
 * follows a letter, it skips forward to the next letter or digit; if that
 * is a lower-case letter the word before the period is extracted into
 * testword and examined. The word is compared with the abbrev list, tested
 * for a leading digit, for being a single character and with isroman(),
 * and its characters are canonically decomposed to look for one of the
 * vowels a e i o u. A word that is still suspect is recorded in the qperiod
 * tree and reported as "Extra period?"; unless VERBOSE_SWITCH is set, a
 * word already present in qperiod is not reported again.
 * When warnings->isDutch is set, a period followed by an apostrophe, a
 * lower-case letter, a space and an upper-case letter is skipped.
 * NOTE(review): original line 1464 passes a gunichar to isdigit(). The
 * ctype functions are only defined for values representable as unsigned
 * char (or EOF); line 1473 uses g_unichar_isdigit() for the same purpose.
 * NOTE(review): the backward scan starting on original line 1471 tests
 * "s1>=aline" after g_utf8_prev_char(), so s1 can be moved to, and read
 * from, a position before aline when the word starts the line.
 * NOTE(review): several statements (the results of the abbrev, digit,
 * length and roman tests, and the release of testword) are not visible in
 * this listing because of numbering gaps.
 */
1424 void check_for_extra_period(const char *aline,const struct warnings *warnings)
1426 const char *s,*t,*s1;
1431 gunichar *decomposition;
1432 if (pswit[PARANOID_SWITCH])
1434 for (t=aline;t=strstr(t,". ");)
1438 t=g_utf8_next_char(t);
1439 /* start of line punctuation is handled elsewhere */
1442 if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
1444 t=g_utf8_next_char(t);
1447 if (warnings->isDutch)
1449 /* For Frank & Jeroen -- 's Middags case */
1450 gunichar c2,c3,c4,c5;
1451 c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
1452 c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
1453 c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
1454 c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
1455 if (c2==CHAR_SQUOTE && g_unichar_islower(c3) &&
1456 c4==CHAR_SPACE && g_unichar_isupper(c5))
1458 t=g_utf8_next_char(t);
1462 s1=g_utf8_next_char(g_utf8_next_char(t));
1463 while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
1464 !isdigit(g_utf8_get_char(s1)))
1465 s1=g_utf8_next_char(s1);
1466 if (g_unichar_islower(g_utf8_get_char(s1)))
1468 /* we have something to investigate */
1470 /* so let's go back and find out */
1471 for (s1=g_utf8_prev_char(t);s1>=aline &&
1472 (g_unichar_isalpha(g_utf8_get_char(s1)) ||
1473 g_unichar_isdigit(g_utf8_get_char(s1)) ||
1474 g_utf8_get_char(s1)==CHAR_SQUOTE &&
1475 g_unichar_isalpha(g_utf8_get_char(g_utf8_next_char(s1))) &&
1476 g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s1))));
1477 s1=g_utf8_prev_char(s1))
1479 s1=g_utf8_next_char(s1);
1482 testword=g_strndup(s1,s-s1);
1484 testword=g_strdup(s1);
1485 for (i=0;*abbrev[i];i++)
1486 if (!strcmp(testword,abbrev[i]))
1488 if (g_unichar_isdigit(g_utf8_get_char(testword)))
1490 if (!*g_utf8_next_char(testword))
1492 if (isroman(testword))
1497 for (s=testword;*s;s=g_utf8_next_char(s))
1499 decomposition=g_unicode_canonical_decomposition(
1500 g_utf8_get_char(s),&len);
1501 if (g_utf8_strchr("aeiou",-1,decomposition[0]))
1503 g_free(decomposition);
1507 (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
1509 g_tree_insert(qperiod,g_strdup(testword),
1510 GINT_TO_POINTER(1));
1511 if (pswit[ECHO_SWITCH])
1512 g_print("\n%s\n",aline);
1513 if (!pswit[OVERVIEW_SWITCH])
1514 g_print(" Line %ld column %ld - Extra period?\n",
1515 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1521 t=g_utf8_next_char(t);
1527 * check_for_following_punctuation:
1529 * Check for words usually not followed by punctuation.
/*
 * Runs only when TYPO_SWITCH is set. Each word is lower-cased with
 * g_utf8_strdown() into inword and compared with two lists. A word found in
 * nocomma is reported when the character at s is , ; or : and a word found
 * in noperiod is reported when the character at s is . or !. Both cases
 * print "Query punctuation after %s?" with the 1-based column of s.
 * Both lists are terminated by an empty string (the loops stop when the
 * first byte of an entry is NUL).
 * NOTE(review): the word-extraction loop, the remaining declarations, the
 * final g_print arguments and the release of inword are not visible in this
 * listing; presumably s points just past the word - verify.
 */
1531 void check_for_following_punctuation(const char *aline)
1534 const char *s,*wordstart;
1537 if (pswit[TYPO_SWITCH])
1548 inword=g_utf8_strdown(t,-1);
1550 for (i=0;*nocomma[i];i++)
1551 if (!strcmp(inword,nocomma[i]))
1553 c=g_utf8_get_char(s);
1554 if (c==',' || c==';' || c==':')
1556 if (pswit[ECHO_SWITCH])
1557 g_print("\n%s\n",aline);
1558 if (!pswit[OVERVIEW_SWITCH])
1559 g_print(" Line %ld column %ld - "
1560 "Query punctuation after %s?\n",
1561 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
1567 for (i=0;*noperiod[i];i++)
1568 if (!strcmp(inword,noperiod[i]))
1570 c=g_utf8_get_char(s);
1571 if (c=='.' || c=='!')
1573 if (pswit[ECHO_SWITCH])
1574 g_print("\n%s\n",aline);
1575 if (!pswit[OVERVIEW_SWITCH])
1576 g_print(" Line %ld column %ld - "
1577 "Query punctuation after %s?\n",
1578 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
1592 * Check for commonly mistyped words,
1593 * and digits like 0 for O in a word.
/*
 * Splits the line into words with getaword() and applies, per word:
 *  - mixdigit(): reports "Query digit in %s";
 *  - when TYPO_SWITCH or USERTYPO_SWITCH is set: detection of an upper-case
 *    or title-case letter after a lower-case one, then tests on a
 *    case-folded copy (testword);
 *  - when TYPO_SWITCH is set: unlikely prefixes (nostart) and suffixes
 *    (noend), unlikely letter groups such as "cb" or "gbt", words with no
 *    vowel or no consonant (y and digits are handled separately), the
 *    okword exclusion list, isroman(), the built-in typo list and the
 *    single letters s l m i j d n;
 *  - reported words are counted per word in the qword tree so duplicates
 *    can be suppressed unless VERBOSE_SWITCH is set;
 *  - the user-supplied usertypo tree: "Query possible scanno %s";
 *  - when PARANOID_SWITCH and warnings->digit are set: a standalone "0" or
 *    "1" is reported.
 * NOTE(review): in the mid-word capital test c has just been found to be
 * upper case or title case, yet the Mc/Mac exception compares that same c
 * with the lower-case letter 'm' (and the following characters with 'c' /
 * 'a','c'). As listed those two alternatives can never be true, leaving
 * only the apostrophe exception effective. Presumably the first letters of
 * the word were meant to be tested - verify.
 * NOTE(review): the columns for "Query digit" and "Query word" use
 * offset+1, while "Query possible scanno" and "Query standalone" use
 * offset+2 for the same wordstart pointer.
 * NOTE(review): many statements (flag assignments, the loop header, the
 * release of inword and testword) are not visible in this listing because
 * of numbering gaps.
 */
1595 void check_for_typos(const char *aline,struct warnings *warnings)
1597 const char *s,*t,*nt,*wordstart;
1599 gunichar *decomposition;
1601 int i,vowel,consonant,*dupcnt;
1602 gboolean isdup,istypo,alower;
1605 gsize decomposition_len;
1609 inword=getaword(&s);
1613 continue; /* don't bother with empty lines */
1615 if (mixdigit(inword))
1617 if (pswit[ECHO_SWITCH])
1618 g_print("\n%s\n",aline);
1619 if (!pswit[OVERVIEW_SWITCH])
1620 g_print(" Line %ld column %ld - Query digit in %s\n",
1621 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
1626 * Put the word through a series of tests for likely typos and OCR
1629 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
1633 for (t=inword;*t;t=g_utf8_next_char(t))
1635 c=g_utf8_get_char(t);
1636 nt=g_utf8_next_char(t);
1637 /* lowercase for testing */
1638 if (g_unichar_islower(c))
1640 if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
1643 * We have an uppercase mid-word. However, there are
1645 * Mac and Mc like McGill
1646 * French contractions like l'Abbe
1648 offset=g_utf8_pointer_to_offset(inword,t);
1649 if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
1650 offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
1651 g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
1653 g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SQUOTE)
1659 testword=g_utf8_casefold(inword,-1);
1661 if (pswit[TYPO_SWITCH])
1664 * Check for certain unlikely two-letter combinations at word
1667 len=g_utf8_strlen(testword,-1);
1670 for (i=0;*nostart[i];i++)
1671 if (g_str_has_prefix(testword,nostart[i]))
1673 for (i=0;*noend[i];i++)
1674 if (g_str_has_suffix(testword,noend[i]))
1677 /* ght is common, gbt never. Like that. */
1678 if (strstr(testword,"cb"))
1680 if (strstr(testword,"gbt"))
1682 if (strstr(testword,"pbt"))
1684 if (strstr(testword,"tbs"))
1686 if (strstr(testword,"mrn"))
1688 if (strstr(testword,"ahle"))
1690 if (strstr(testword,"ihle"))
1693 * "TBE" does happen - like HEARTBEAT - but uncommon.
1694 * Also "TBI" - frostbite, outbid - but uncommon.
1695 * Similarly "ii" like Hawaii, or Pompeii, and in Roman
1696 * numerals, but "ii" is a common scanno.
1698 if (strstr(testword,"tbi"))
1700 if (strstr(testword,"tbe"))
1702 if (strstr(testword,"ii"))
1705 * Check for no vowels or no consonants.
1706 * If none, flag a typo.
1708 if (!istypo && len>1)
1711 for (t=testword;*t;t=g_utf8_next_char(t))
1713 c=g_utf8_get_char(t);
1715 g_unicode_canonical_decomposition(c,&decomposition_len);
1716 if (c=='y' || g_unichar_isdigit(c))
1718 /* Yah, this is loose. */
1722 else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
1726 g_free(decomposition);
1728 if (!vowel || !consonant)
1732 * Now exclude the word from being reported if it's in
1735 for (i=0;*okword[i];i++)
1736 if (!strcmp(testword,okword[i]))
1739 * What looks like a typo may be a Roman numeral.
1742 if (istypo && isroman(testword))
1744 /* Check the manual list of typos. */
1746 for (i=0;*typo[i];i++)
1747 if (!strcmp(testword,typo[i]))
1750 * Check lowercase s, l, i and m - special cases.
1751 * "j" - often a semi-colon gone wrong.
1752 * "d" for a missing apostrophe - he d
1755 if (!istypo && len==1 &&
1756 g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
1760 dupcnt=g_tree_lookup(qword,testword);
1764 isdup=!pswit[VERBOSE_SWITCH];
1768 dupcnt=g_new0(int,1);
1769 g_tree_insert(qword,g_strdup(testword),dupcnt);
1774 if (pswit[ECHO_SWITCH])
1775 g_print("\n%s\n",aline);
1776 if (!pswit[OVERVIEW_SWITCH])
1778 g_print(" Line %ld column %ld - Query word %s",
1779 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
1781 if (!pswit[VERBOSE_SWITCH])
1782 g_print(" - not reporting duplicates");
1790 /* check the user's list of typos */
1791 if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
1793 if (pswit[ECHO_SWITCH])
1794 g_print("\n%s\n",aline);
1795 if (!pswit[OVERVIEW_SWITCH])
1796 g_print(" Line %ld column %ld - Query possible scanno %s\n",
1797 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
1799 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
1801 if (pswit[PARANOID_SWITCH] && warnings->digit)
1803 /* In paranoid mode, query all 0 and 1 standing alone. */
1804 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
1806 if (pswit[ECHO_SWITCH])
1807 g_print("\n%s\n",aline);
1808 if (!pswit[OVERVIEW_SWITCH])
1809 g_print(" Line %ld column %ld - Query standalone %s\n",
1810 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
1821 * check_for_misspaced_punctuation:
1823 * Look for added or missing spaces around punctuation and quotes.
1824 * If there is a punctuation character like ! with no space on
1825 * either side, suspect a missing!space. If there are spaces on
1826 * both sides , assume a typo. If we see a double quote with no
1827 * space or punctuation on either side of it, assume unspaced
1828 * quotes "like"this.
/*
 * Makes several independent passes over the line, each using the previous,
 * current and next character (pc, c, nc):
 *  1. punctuation from . ? ! , ; : _ with a letter after it and either a
 *     letter before it or strict punctuation: "Missing space?"; the same
 *     characters with a space before and a space or end of line after:
 *     "Spaced punctuation?" (suppressed for ellipses and when isemptyline);
 *  2. ? ! , ; : preceded by a space and not followed by one:
 *     "Spaced punctuation?";
 *  3. a period preceded by a space and followed by a letter:
 *     "Spaced punctuation?";
 *  4. a quote with no space or punctuation on either side:
 *     "Unspaced quotes?";
 *  5. double-quote parity, toggled in parities->dquote, used to decide
 *     which neighbour an opening or closing quote may have:
 *     "Wrongspaced quotes?", plus a special case for a quote at column 1;
 *  6. when SQUOTE_SWITCH is set, the same for single quotes using
 *     parities->squote: "Wrongspaced singlequotes?".
 * parities is updated in place; presumably it is carried from line to line
 * by the caller - verify.
 * NOTE(review): original lines 2017 and 2077 pass a gunichar to isdigit().
 * The ctype functions are only defined for values representable as
 * unsigned char (or EOF); g_unichar_isdigit() is used elsewhere in this
 * file.
 * NOTE(review): the statements that shift pc/c/nc, the acronym and
 * ellipsis flag assignments and the tests selecting '.' and the quote
 * character are not visible in this listing because of numbering gaps.
 */
1830 void check_for_misspaced_punctuation(const char *aline,
1831 struct parities *parities,gboolean isemptyline)
1833 gboolean isacro,isellipsis;
1835 gunichar c,nc,pc,n2c;
1836 c=g_utf8_get_char(aline);
1837 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1838 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1842 nc=g_utf8_get_char(g_utf8_next_char(s));
1843 /* For each character in the line after the first. */
1844 if (g_utf8_strchr(".?!,;:_",-1,c)) /* if it's punctuation */
1846 /* we need to suppress warnings for acronyms like M.D. */
1848 /* we need to suppress warnings for ellipsis . . . */
1851 * If there are letters on both sides of it or
1852 * if it's strict punctuation followed by an alpha.
1854 if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
1855 g_utf8_strchr("?!,;:",-1,c)))
1859 if (g_utf8_pointer_to_offset(aline,s)>2 &&
1860 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
1862 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
1868 if (pswit[ECHO_SWITCH])
1869 g_print("\n%s\n",aline);
1870 if (!pswit[OVERVIEW_SWITCH])
1871 g_print(" Line %ld column %ld - Missing space?\n",
1872 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1877 if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
1880 * If there are spaces on both sides,
1881 * or space before and end of line.
1885 if (g_utf8_pointer_to_offset(aline,s)>2 &&
1886 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
1888 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
1892 if (!isemptyline && !isellipsis)
1894 if (pswit[ECHO_SWITCH])
1895 g_print("\n%s\n",aline);
1896 if (!pswit[OVERVIEW_SWITCH])
1897 g_print(" Line %ld column %ld - "
1898 "Spaced punctuation?\n",linecnt,
1899 g_utf8_pointer_to_offset(aline,s)+1);
1906 /* Split out the characters that CANNOT be preceded by space. */
1907 c=g_utf8_get_char(aline);
1908 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1909 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1913 nc=g_utf8_get_char(g_utf8_next_char(s));
1914 /* for each character in the line after the first */
1915 if (g_utf8_strchr("?!,;:",-1,c))
1917 /* if it's punctuation that _cannot_ have a space before it */
1918 if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
1921 * If nc DOES == space,
1922 * it was already reported just above.
1924 if (pswit[ECHO_SWITCH])
1925 g_print("\n%s\n",aline);
1926 if (!pswit[OVERVIEW_SWITCH])
1927 g_print(" Line %ld column %ld - Spaced punctuation?\n",
1928 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1935 * Special case " .X" where X is any alpha.
1936 * This plugs a hole in the acronym code above.
1937 * Inelegant, but maintainable.
1939 c=g_utf8_get_char(aline);
1940 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1941 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1945 nc=g_utf8_get_char(g_utf8_next_char(s));
1946 /* for each character in the line after the first */
1949 /* if it's a period */
1950 if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
1953 * If the period follows a space and
1954 * is followed by a letter.
1956 if (pswit[ECHO_SWITCH])
1957 g_print("\n%s\n",aline);
1958 if (!pswit[OVERVIEW_SWITCH])
1959 g_print(" Line %ld column %ld - Spaced punctuation?\n",
1960 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1966 c=g_utf8_get_char(aline);
1967 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1968 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1972 nc=g_utf8_get_char(g_utf8_next_char(s));
1973 /* for each character in the line after the first */
1976 if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
1977 !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
1978 !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
1980 if (pswit[ECHO_SWITCH])
1981 g_print("\n%s\n",aline);
1982 if (!pswit[OVERVIEW_SWITCH])
1983 g_print(" Line %ld column %ld - Unspaced quotes?\n",
1984 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1990 /* Check parity of quotes. */
1991 nc=g_utf8_get_char(aline);
1992 for (s=aline;*s;s=g_utf8_next_char(s))
1995 nc=g_utf8_get_char(g_utf8_next_char(s));
1998 parities->dquote=!parities->dquote;
1999 if (!parities->dquote)
2002 if (!g_utf8_strchr("_-.'`/,;:!?)]} ",-1,nc))
2004 if (pswit[ECHO_SWITCH])
2005 g_print("\n%s\n",aline);
2006 if (!pswit[OVERVIEW_SWITCH])
2007 g_print(" Line %ld column %ld - "
2008 "Wrongspaced quotes?\n",
2009 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2017 if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
2018 !g_utf8_strchr("_-/.'`([{$",-1,nc) || !nc)
2020 if (pswit[ECHO_SWITCH])
2021 g_print("\n%s\n",aline);
2022 if (!pswit[OVERVIEW_SWITCH])
2023 g_print(" Line %ld column %ld - "
2024 "Wrongspaced quotes?\n",
2025 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2032 if (g_utf8_get_char(aline)==CHAR_DQUOTE)
2034 if (g_utf8_strchr(",;:!?)]} ",-1,
2035 g_utf8_get_char(g_utf8_next_char(aline))))
2037 if (pswit[ECHO_SWITCH])
2038 g_print("\n%s\n",aline);
2039 if (!pswit[OVERVIEW_SWITCH])
2040 g_print(" Line %ld column 1 - Wrongspaced quotes?\n",
2046 if (pswit[SQUOTE_SWITCH])
2048 nc=g_utf8_get_char(aline);
2049 for (s=aline;*s;s=g_utf8_next_char(s))
2052 nc=g_utf8_get_char(g_utf8_next_char(s));
2053 if ((c==CHAR_SQUOTE || c==CHAR_OPEN_SQUOTE) && (s==aline ||
2055 !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
2056 !g_unichar_isalpha(nc)))
2058 parities->squote=!parities->squote;
2059 if (!parities->squote)
2062 if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
2064 if (pswit[ECHO_SWITCH])
2065 g_print("\n%s\n",aline);
2066 if (!pswit[OVERVIEW_SWITCH])
2067 g_print(" Line %ld column %ld - "
2068 "Wrongspaced singlequotes?\n",
2069 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2077 if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
2078 !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
2080 if (pswit[ECHO_SWITCH])
2081 g_print("\n%s\n",aline);
2082 if (!pswit[OVERVIEW_SWITCH])
2083 g_print(" Line %ld column %ld - "
2084 "Wrongspaced singlequotes?\n",
2085 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2096 * check_for_double_punctuation:
2098 * Look for double punctuation like ,. or ,,
2099 * Thanks to DW for the suggestion!
2100 * In books with references, ".," and ".;" are common
2101 * e.g. "etc., etc.," and vol. 1.; vol 3.;
2102 * OTOH, from my initial tests, there are also fairly
2103 * common errors. What to do? Make these cases paranoid?
2104 * ".," is the most common, so warnings->dotcomma is used
2105 * to suppress detailed reporting if it occurs often.
/*
 * Walks the line and looks at each pair of adjacent characters c and nc
 * that are both in the set . ? ! , ; : . A pair is tolerated when
 *  - both are the same and are one of . ? ! ;
 *  - the pair is "., " style (c is '.', nc is ',') and warnings->dotcomma
 *    is not set;
 *  - warnings->isFrench is set and the text at s starts with an ellipsis
 *    joined to one of , ; : ! ? in either order.
 * Any other pair is reported as "Double punctuation?".
 * NOTE(review): the ten French ellipsis prefixes are listed twice (outer
 * and inner condition); the two lists must be kept identical by hand.
 * NOTE(review): the declarations, the statement that copies nc into c and
 * whatever the inner French branch does before re-reading nc are not
 * visible in this listing because of numbering gaps.
 */
2107 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
2111 nc=g_utf8_get_char(aline);
2112 for (s=aline;*s;s=g_utf8_next_char(s))
2115 nc=g_utf8_get_char(g_utf8_next_char(s));
2116 /* for each punctuation character in the line */
2117 if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
2118 g_utf8_strchr(".?!,;:",-1,nc))
2120 /* followed by punctuation, it's a query, unless . . . */
2121 if (c==nc && (c=='.' || c=='?' || c=='!') ||
2122 !warnings->dotcomma && c=='.' && nc==',' ||
2123 warnings->isFrench && g_str_has_prefix(s,",...") ||
2124 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2125 warnings->isFrench && g_str_has_prefix(s,";...") ||
2126 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2127 warnings->isFrench && g_str_has_prefix(s,":...") ||
2128 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2129 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2130 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2131 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2132 warnings->isFrench && g_str_has_prefix(s,"...?"))
2134 if (warnings->isFrench && g_str_has_prefix(s,",...") ||
2135 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2136 warnings->isFrench && g_str_has_prefix(s,";...") ||
2137 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2138 warnings->isFrench && g_str_has_prefix(s,":...") ||
2139 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2140 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2141 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2142 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2143 warnings->isFrench && g_str_has_prefix(s,"...?"))
2146 nc=g_utf8_get_char(g_utf8_next_char(s));
2148 ; /* do nothing for .. !! and ?? which can be legit */
2152 if (pswit[ECHO_SWITCH])
2153 g_print("\n%s\n",aline);
2154 if (!pswit[OVERVIEW_SWITCH])
2155 g_print(" Line %ld column %ld - Double punctuation?\n",
2156 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2165 * check_for_spaced_quotes:
/*
 * Reports every quote character that has a space on both sides. Three
 * successive loops search for a double quote, an apostrophe and a backquote
 * respectively; after each hit the search resumes two characters past the
 * start of the match, i.e. at the trailing space, so that space can also
 * serve as the leading space of the next match.
 * The apostrophe and backquote loops print the same "Spaced singlequote?"
 * message. The column is the 1-based position of the leading space.
 * NOTE(review): the declarations and the statements that reset s before
 * each loop are not visible in this listing.
 */
2167 void check_for_spaced_quotes(const char *aline)
2171 while ((t=strstr(s," \" ")))
2173 if (pswit[ECHO_SWITCH])
2174 g_print("\n%s\n",aline);
2175 if (!pswit[OVERVIEW_SWITCH])
2176 g_print(" Line %ld column %ld - Spaced doublequote?\n",
2177 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2180 s=g_utf8_next_char(g_utf8_next_char(t));
2183 while ((t=strstr(s," ' ")))
2185 if (pswit[ECHO_SWITCH])
2186 g_print("\n%s\n",aline);
2187 if (!pswit[OVERVIEW_SWITCH])
2188 g_print(" Line %ld column %ld - Spaced singlequote?\n",
2189 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2192 s=g_utf8_next_char(g_utf8_next_char(t));
2195 while ((t=strstr(s," ` ")))
2197 if (pswit[ECHO_SWITCH])
2198 g_print("\n%s\n",aline);
2199 if (!pswit[OVERVIEW_SWITCH])
2200 g_print(" Line %ld column %ld - Spaced singlequote?\n",
2201 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2204 s=g_utf8_next_char(g_utf8_next_char(t));
2209 * check_for_miscased_genative:
2211 * Check special case of 'S instead of 's at end of word.
/*
 * Reports an apostrophe that follows a lower-case letter and is followed by
 * a capital S. The scan starts at the second character of the line and
 * stops when the next character is NUL. The column adds 2 to the offset of
 * the apostrophe, so it identifies the S rather than the apostrophe.
 * NOTE(review): the declarations and the statements that shift c into pc
 * and nc into c are not visible in this listing.
 */
2213 void check_for_miscased_genative(const char *aline)
2219 c=g_utf8_get_char(aline);
2220 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2221 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2225 nc=g_utf8_get_char(g_utf8_next_char(s));
2226 if (c==CHAR_SQUOTE && nc=='S' && g_unichar_islower(pc))
2228 if (pswit[ECHO_SWITCH])
2229 g_print("\n%s\n",aline);
2230 if (!pswit[OVERVIEW_SWITCH])
2231 g_print(" Line %ld column %ld - Capital \"S\"?\n",
2232 linecnt,g_utf8_pointer_to_offset(aline,s)+2);
2240 * check_end_of_line:
2242 * Now check special cases - start and end of line -
2243 * for single and double quotes. Start is sometimes [sic]
2244 * but better to query it anyway.
2245 * While we're here, check for dash at end of line.
/*
 * Checks the two ends of the line:
 *  - for a line of more than one character, the last character c1 and the
 *    one before it c2 are examined for a quote at the end of the line
 *    ("Spaced quote?", column = length of the line);
 *  - an apostrophe or opening single quote at column 1 followed by a space
 *    is reported as "Spaced quote?";
 *  - when PARANOID_SWITCH and warnings->hyphen are set, trailing characters
 *    not greater than a space are skipped and a single '-' (not preceded by
 *    another '-') is reported as "Hyphen at end of line?".
 * NOTE(review): the second half of the end-quote condition is not visible
 * in this listing; presumably it tests c2 against a space - verify.
 * NOTE(review): the hyphen test calls g_utf8_prev_char(s) even when the
 * skip loop has stopped with s equal to aline, which would read before the
 * start of the line. Whether an enclosing length test prevents this cannot
 * be determined from this listing.
 * NOTE(review): the hyphen column is printed without +1, unlike most other
 * checks in this file.
 */
2247 void check_end_of_line(const char *aline,struct warnings *warnings)
2252 lbytes=strlen(aline);
2253 if (g_utf8_strlen(aline,lbytes)>1)
2255 s=g_utf8_prev_char(aline+lbytes);
2256 c1=g_utf8_get_char(s);
2257 c2=g_utf8_get_char(g_utf8_prev_char(s));
2258 if ((c1==CHAR_DQUOTE || c1==CHAR_SQUOTE || c1==CHAR_OPEN_SQUOTE) &&
2261 if (pswit[ECHO_SWITCH])
2262 g_print("\n%s\n",aline);
2263 if (!pswit[OVERVIEW_SWITCH])
2264 g_print(" Line %ld column %ld - Spaced quote?\n",linecnt,
2265 g_utf8_strlen(aline,lbytes));
2269 c1=g_utf8_get_char(aline);
2270 c2=g_utf8_get_char(g_utf8_next_char(aline));
2271 if ((c1==CHAR_SQUOTE || c1==CHAR_OPEN_SQUOTE) && c2==CHAR_SPACE)
2273 if (pswit[ECHO_SWITCH])
2274 g_print("\n%s\n",aline);
2275 if (!pswit[OVERVIEW_SWITCH])
2276 g_print(" Line %ld column 1 - Spaced quote?\n",linecnt);
2281 * Dash at end of line may well be legit - paranoid mode only
2282 * and don't report em-dash at line-end.
2284 if (pswit[PARANOID_SWITCH] && warnings->hyphen)
2286 for (s=g_utf8_prev_char(aline+lbytes);
2287 s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
2289 if (g_utf8_get_char(s)=='-' &&
2290 g_utf8_get_char(g_utf8_prev_char(s))!='-')
2292 if (pswit[ECHO_SWITCH])
2293 g_print("\n%s\n",aline);
2294 if (!pswit[OVERVIEW_SWITCH])
2295 g_print(" Line %ld column %ld - "
2296 "Hyphen at end of line?\n",
2297 linecnt,g_utf8_pointer_to_offset(aline,s));
2304 * check_for_unspaced_bracket:
2306 * Brackets are often unspaced, but shouldn't be surrounded by alpha.
2307 * If so, suspect a scanno like "a]most".
/*
 * Reports any bracket character from { [ ( ) ] } that has a letter
 * immediately before and immediately after it. The scan starts at the
 * second character and stops when the next character is NUL, so the first
 * and last characters of the line are never the bracket being tested.
 * NOTE(review): the column is printed as the offset of s without +1,
 * unlike most other checks in this file - confirm which character the
 * column is meant to identify.
 * NOTE(review): the declarations and the statements that shift c into pc
 * and nc into c are not visible in this listing.
 */
2309 void check_for_unspaced_bracket(const char *aline)
2313 c=g_utf8_get_char(aline);
2314 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2315 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2319 nc=g_utf8_get_char(g_utf8_next_char(s));
2322 /* for each bracket character in the line except 1st & last */
2323 if (g_utf8_strchr("{[()]}",-1,c) &&
2324 g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
2326 if (pswit[ECHO_SWITCH])
2327 g_print("\n%s\n",aline);
2328 if (!pswit[OVERVIEW_SWITCH])
2329 g_print(" Line %ld column %ld - Unspaced bracket?\n",
2330 linecnt,g_utf8_pointer_to_offset(aline,s));
2338 * check_for_unpunctuated_endquote:
/*
 * Reports a double quote that directly follows a letter, i.e. a closing
 * quote with no punctuation before it. The scan starts at the second
 * character and stops when the next character is NUL.
 * NOTE(review): pc is a gunichar but is passed to isalpha(). The ctype
 * functions are only defined for values representable as unsigned char (or
 * EOF), so a non-ASCII previous character is outside their domain; the
 * other checks in this file use g_unichar_isalpha() for this purpose.
 * NOTE(review): the column is printed as the offset of s without +1.
 * NOTE(review): the declarations and the statements that shift c into pc
 * and nc into c are not visible in this listing.
 */
2340 void check_for_unpunctuated_endquote(const char *aline)
2344 c=g_utf8_get_char(aline);
2345 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2346 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2350 nc=g_utf8_get_char(g_utf8_next_char(s));
2351 /* for each character in the line except 1st */
2352 if (c==CHAR_DQUOTE && isalpha(pc))
2354 if (pswit[ECHO_SWITCH])
2355 g_print("\n%s\n",aline);
2356 if (!pswit[OVERVIEW_SWITCH])
2357 g_print(" Line %ld column %ld - "
2358 "endquote missing punctuation?\n",
2359 linecnt,g_utf8_pointer_to_offset(aline,s));
2367 * check_for_html_tag:
2369 * Check for <HTML TAG>.
2371 * If there is a < in the line, followed at some point
2372 * by a > then we suspect HTML.
/*
 * Finds the first '<' on the line and the first '>' after it. The text
 * from '<' to '>' inclusive is copied with g_strndup() and printed in the
 * "HTML Tag?" warning, with the 1-based column of the '<'. Only the first
 * such pair on a line is considered.
 * NOTE(review): the tests on open and close, the declaration of tag and the
 * release of the copy are not visible in this listing; confirm the copy is
 * freed.
 */
2374 void check_for_html_tag(const char *aline)
2376 const char *open,*close;
2378 open=strchr(aline,'<');
2381 close=strchr(g_utf8_next_char(open),'>');
2384 if (pswit[ECHO_SWITCH])
2385 g_print("\n%s\n",aline);
2386 if (!pswit[OVERVIEW_SWITCH])
2388 tag=g_strndup(open,close-open+1);
2389 g_print(" Line %ld column %ld - HTML Tag? %s \n",
2390 linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
2400 * check_for_html_entity:
2402 * Check for &symbol; HTML.
2404 * If there is a & in the line, followed at
2405 * some point by a ; then we suspect HTML.
/*
 * Finds the first '&' on the line and the first ';' after it. The span
 * between them is scanned for a space, which marks ordinary text such as
 * "Jones & Son;" rather than an entity. Otherwise the text from '&' to ';'
 * inclusive is copied with g_strndup() and printed in the "HTML symbol?"
 * warning.
 * NOTE(review): the column is computed as the byte difference
 * (amp-aline)+1 and printed with %d, whereas check_for_html_tag and the
 * other checks report g_utf8_pointer_to_offset(...)+1 with %ld. The two
 * differ whenever a multi-byte character precedes the '&'.
 * NOTE(review): the tests on amp and scolon, the test made after the space
 * scan, the declaration of entity and the release of the copy are not
 * visible in this listing.
 */
2407 void check_for_html_entity(const char *aline)
2409 const char *s,*amp,*scolon;
2411 amp=strchr(aline,'&');
2414 scolon=strchr(amp,';');
2417 for (s=amp;s<scolon;s=g_utf8_next_char(s))
2418 if (g_utf8_get_char(s)==CHAR_SPACE)
2419 break; /* Don't report "Jones & Son;" */
2422 if (pswit[ECHO_SWITCH])
2423 g_print("\n%s\n",aline);
2424 if (!pswit[OVERVIEW_SWITCH])
2426 entity=g_strndup(amp,scolon-amp+1);
2427 g_print(" Line %ld column %d - HTML symbol? %s \n",
2428 linecnt,(int)(amp-aline)+1,entity);
2441 * If we are in a state of unbalanced quotes, and this line
2442 * doesn't begin with a quote, output the stored error message.
2443 * If the -P switch was used, print the warning even if the
2444 * new para starts with quotes.
/*
 * Emits and clears the warnings stored in *pending. c is the character at
 * s, compared below with the quote characters.
 *  - pending->dquote is printed only when c is not a double quote or
 *    QPARA_SWITCH is set;
 *  - pending->squote is printed when c is neither kind of single quote, or
 *    QPARA_SWITCH is set, or a further condition holds;
 *  - pending->rbrack, sbrack, cbrack and unders are printed
 *    unconditionally.
 * Printing is skipped when OVERVIEW_SWITCH is set; parastart is echoed
 * first when ECHO_SWITCH is set. Each stored message is released with
 * g_free() and its pointer reset to NULL.
 * NOTE(review): how s is derived from aline and the last term of the
 * single-quote condition are not visible in this listing.
 */
2446 void print_pending(const char *aline,const char *parastart,
2447 struct pending *pending)
2454 c=g_utf8_get_char(s);
2455 if (pending->dquote)
2457 if (c!=CHAR_DQUOTE || pswit[QPARA_SWITCH])
2459 if (!pswit[OVERVIEW_SWITCH])
2461 if (pswit[ECHO_SWITCH])
2462 g_print("\n%s\n",parastart);
2463 g_print("%s\n",pending->dquote);
2468 g_free(pending->dquote);
2469 pending->dquote=NULL;
2471 if (pending->squote)
2473 if (c!=CHAR_SQUOTE && c!=CHAR_OPEN_SQUOTE || pswit[QPARA_SWITCH] ||
2476 if (!pswit[OVERVIEW_SWITCH])
2478 if (pswit[ECHO_SWITCH])
2479 g_print("\n%s\n",parastart);
2480 g_print("%s\n",pending->squote);
2485 g_free(pending->squote);
2486 pending->squote=NULL;
2488 if (pending->rbrack)
2490 if (!pswit[OVERVIEW_SWITCH])
2492 if (pswit[ECHO_SWITCH])
2493 g_print("\n%s\n",parastart);
2494 g_print("%s\n",pending->rbrack);
2498 g_free(pending->rbrack);
2499 pending->rbrack=NULL;
2501 if (pending->sbrack)
2503 if (!pswit[OVERVIEW_SWITCH])
2505 if (pswit[ECHO_SWITCH])
2506 g_print("\n%s\n",parastart);
2507 g_print("%s\n",pending->sbrack);
2511 g_free(pending->sbrack);
2512 pending->sbrack=NULL;
2514 if (pending->cbrack)
2516 if (!pswit[OVERVIEW_SWITCH])
2518 if (pswit[ECHO_SWITCH])
2519 g_print("\n%s\n",parastart);
2520 g_print("%s\n",pending->cbrack);
2524 g_free(pending->cbrack);
2525 pending->cbrack=NULL;
2527 if (pending->unders)
2529 if (!pswit[OVERVIEW_SWITCH])
2531 if (pswit[ECHO_SWITCH])
2532 g_print("\n%s\n",parastart);
2533 g_print("%s\n",pending->unders);
2537 g_free(pending->unders);
2538 pending->unders=NULL;
2543 * check_for_mismatched_quotes:
2545 * At end of paragraph, check for mismatched quotes.
2547 * We don't want to report an error immediately, since it is a
2548 * common convention to omit the quotes at end of paragraph if
2549 * the next paragraph is a continuation of the same speaker.
2550 * Where this is the case, the next para should begin with a
2551 * quote, so we store the warning message and only display it
2552 * at the top of the next iteration if the new para doesn't
2553 * start with a quote.
2554 * The -p switch overrides this default, and warns of unclosed
2555 * quotes on _every_ paragraph, whether the next begins with a
/*
 * Builds the end-of-paragraph warning texts from the paragraph counters:
 *  - an odd counters->quot gives "Mismatched quotes";
 *  - with SQUOTE_SWITCH, a non-zero open_single_quote that differs from
 *    close_single_quote gives "Mismatched singlequotes?"; when it also
 *    differs from close_single_quote+1 the case is additionally flagged;
 *  - non-zero r_brack, s_brack and c_brack give the round, square and
 *    curly bracket messages;
 *  - an odd c_unders gives "Mismatched underscores?".
 * Each text is created with g_strdup_printf() and carries the current
 * linecnt.
 * NOTE(review): the left-hand sides of the assignments are not visible in
 * this listing; presumably the strings are stored in the matching members
 * of *pending for print_pending() to print and free - verify.
 */
2558 void check_for_mismatched_quotes(const struct counters *counters,
2559 struct pending *pending)
2561 if (counters->quot%2)
2563 g_strdup_printf(" Line %ld - Mismatched quotes",linecnt);
2564 if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&
2565 counters->open_single_quote!=counters->close_single_quote)
2567 g_strdup_printf(" Line %ld - Mismatched singlequotes?",linecnt);
2568 if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&
2569 counters->open_single_quote!=counters->close_single_quote &&
2570 counters->open_single_quote!=counters->close_single_quote+1)
2572 * Flag it to be noted regardless of the
2573 * first char of the next para.
2576 if (counters->r_brack)
2578 g_strdup_printf(" Line %ld - Mismatched round brackets?",linecnt);
2579 if (counters->s_brack)
2581 g_strdup_printf(" Line %ld - Mismatched square brackets?",linecnt);
2582 if (counters->c_brack)
2584 g_strdup_printf(" Line %ld - Mismatched curly brackets?",linecnt);
2585 if (counters->c_unders%2)
2587 g_strdup_printf(" Line %ld - Mismatched underscores?",linecnt);
2591 * check_for_omitted_punctuation:
2593 * Check for omitted punctuation at end of paragraph by working back
2594 * through prevline. DW.
2595 * Need to check this only for "normal" paras.
2596 * So what is a "normal" para?
2597 * Not normal if one-liner (chapter headings, etc.)
2598 * Not normal if doesn't contain at least one locase letter
2599 * Not normal if starts with space
/*
 * Checks the last line of a paragraph (prevline) for missing closing
 * punctuation. The check applies only when the line contains a letter,
 * last->blen is greater than 2, the paragraph started before the previous
 * line (start_para_line<linecnt-1) and the line does not begin with a
 * space or control character. Trailing quote characters are skipped, then
 * the line is scanned backwards: reaching a letter first reports
 * "No punctuation at para end?", while reaching one of - . : ! ( [ { ? } ] )
 * first is tested separately.
 * NOTE(review): the parameter prevline has the same name as the prevline
 * used by check_for_short_line, which is not a parameter there; inside this
 * function the parameter hides it.
 * NOTE(review): start_para_line is an int here, but procfile declares its
 * start_para_line as long, so the argument is narrowed at the call.
 * NOTE(review): in the quote-skipping loop the ">CHAR_SPACE" term looks
 * redundant once the character equals one of the quote constants - confirm
 * against the CHAR_ definitions.
 * NOTE(review): the declaration of s and the statements ending the loops
 * are not visible in this listing.
 */
2601 void check_for_omitted_punctuation(const char *prevline,
2602 struct line_properties *last,int start_para_line)
2604 gboolean letter_on_line=FALSE;
2606 for (s=prevline;*s;s=g_utf8_next_char(s))
2607 if (g_unichar_isalpha(g_utf8_get_char(s)))
2609 letter_on_line=TRUE;
2613 * This next "if" is a problem.
2614 * If we say "start_para_line <= linecnt - 1", that includes
2615 * one-line "paragraphs" like chapter heads. Lotsa false positives.
2616 * If we say "start_para_line < linecnt - 1" it doesn't, but then it
2617 * misses genuine one-line paragraphs.
2619 if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
2620 g_utf8_get_char(prevline)>CHAR_SPACE)
2622 for (s=g_utf8_prev_char(prevline+strlen(prevline));
2623 (g_utf8_get_char(s)==CHAR_DQUOTE ||
2624 g_utf8_get_char(s)==CHAR_SQUOTE) &&
2625 g_utf8_get_char(s)>CHAR_SPACE && s>prevline;
2626 s=g_utf8_prev_char(s))
2628 for (;s>prevline;s=g_utf8_prev_char(s))
2630 if (g_unichar_isalpha(g_utf8_get_char(s)))
2632 if (pswit[ECHO_SWITCH])
2633 g_print("\n%s\n",prevline);
2634 if (!pswit[OVERVIEW_SWITCH])
2635 g_print(" Line %ld column %ld - "
2636 "No punctuation at para end?\n",
2637 linecnt-1,g_utf8_strlen(prevline,-1));
2642 if (g_utf8_strchr("-.:!([{?}])",-1,g_utf8_get_char(s)))
/*
 * report_duplicate_queries: per-node callback that procfile() hands to
 * g_tree_foreach() for the qword tree.
 *
 * key   - the tree key, read here as the queried word (a C string).
 * value - the tree value stored against that word.
 * data  - user data; procfile() passes NULL.
 *
 * Prints a note naming the word and a duplicate count.
 *
 * NOTE(review): the arguments of the g_print call and the return statement
 * are not visible in this listing - confirm how the count is derived from
 * value and what is returned to g_tree_foreach().
 */
2648 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
2650 const char *word=key;
2653 g_print("\nNote: Queried word %s was duplicated %d times\n",
2658 void print_as_windows_1252(const char *string)
2660 gsize inbytes,outbytes;
2662 GIConv converter=(GIConv)-1;
2665 if (converter!=(GIConv)-1)
2666 g_iconv_close(converter);
2667 converter=(GIConv)-1;
2670 if (converter=(GIConv)-1)
2671 converter=g_iconv_open("WINDOWS-1252","UTF-8");
2672 if (converter!=(GIConv)-1)
2674 inbytes=outbytes=strlen(string);
2675 bp=buf=g_malloc(outbytes+1);
2676 g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
2682 fputs(string,stdout);
/*
 * print_as_utf_8: write string to stdout unchanged, using fputs.
 *
 * NOTE(review): the signature matches print_as_windows_1252, so this looks
 * like the alternative print handler for UTF-8 output - confirm where it
 * is installed.
 */
2685 void print_as_utf_8(const char *string)
2687 fputs(string,stdout);
/*
 * procfile: run the checks on one e-text file and print the findings.
 *
 * filename - name of the file; handed to read_etext() and echoed in the
 *            output.
 *
 * Visible stages:
 *   1. Read the file with read_etext(); err->message is written to stdout
 *      or stderr depending on the STDOUT switch when reading fails.
 *   2. Run first_pass() and report_first_pass() to obtain the warnings
 *      structure that gates several of the later checks.
 *   3. Create the qword and qperiod trees.
 *   4. Loop over the lines returned by flgets(), skipping header and footer
 *      lines and running the per-line checks.
 *   5. Report duplicated queried words, release both trees, reset the print
 *      handler and release the WINDOWS-1252 converter.
 *
 * NOTE(review): several short lines (braces, else branches, some
 * declarations and assignments) are not visible in this listing, so the
 * exact nesting of the statements below should be confirmed.
 */
2695 void procfile(const char *filename)
2698 gchar *parastart=NULL; /* first line of current para */
2699 gchar *etext,*aline;
2702 struct first_pass_results *first_pass_results;
2703 struct warnings *warnings;
2704 struct counters counters={0};
2705 struct line_properties last={0};
2706 struct parities parities={0};
2707 struct pending pending={0};
2708 gboolean isemptyline;
2709 long start_para_line=0;
2710 gboolean isnewpara=FALSE,enddash=FALSE;
/* Reset the per-file state, including the global line counters. */
2711 last.start=CHAR_SPACE;
2712 linecnt=checked_linecnt=0;
2713 etext=read_etext(filename,&err);
/* The same message goes to stdout or stderr according to the STDOUT switch. */
2716 if (pswit[STDOUT_SWITCH])
2717 fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
2719 fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
2722 g_print("\n\nFile: %s\n\n",filename);
2723 first_pass_results=first_pass(etext);
2724 warnings=report_first_pass(first_pass_results);
/*
 * Both trees are ordered with strcmp and free their keys with g_free;
 * qword also frees its values, qperiod has no value destructor.
 */
2725 qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
2726 qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
2728 * Here we go with the main pass. Hold onto yer hat!
/* flgets() is given linecnt+1 as the number to use in its own messages. */
2732 while ((aline=flgets(&etext_ptr,linecnt+1)))
2737 if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
2738 continue; // skip DP page separators completely
/*
 * Lines before firstline, or after a positive footerline, lie outside the
 * body text. With the HEADER switch the Title, Author, Release Date and
 * Edition lines found there are echoed.
 */
2739 if (linecnt<first_pass_results->firstline ||
2740 (first_pass_results->footerline>0 &&
2741 linecnt>first_pass_results->footerline))
2743 if (pswit[HEADER_SWITCH])
2745 if (g_str_has_prefix(aline,"Title:"))
2746 g_print(" %s\n",aline);
2747 if (g_str_has_prefix(aline,"Author:"))
2748 g_print(" %s\n",aline);
2749 if (g_str_has_prefix(aline,"Release Date:"))
2750 g_print(" %s\n",aline);
2751 if (g_str_has_prefix(aline,"Edition:"))
2752 g_print(" %s\n\n",aline);
2754 continue; /* skip through the header */
/*
 * pending is handed to print_pending() and then cleared; it is refilled
 * further down by check_for_mismatched_quotes().
 */
2757 print_pending(aline,parastart,&pending);
2758 memset(&pending,0,sizeof(pending));
/* The result of analyse_quotes() is kept as the empty-line flag. */
2759 isemptyline=analyse_quotes(aline,&counters);
2760 if (isnewpara && !isemptyline)
2762 /* This line is the start of a new paragraph. */
2763 start_para_line=linecnt;
2764 /* Capture its first line in case we want to report it later. */
2766 parastart=g_strdup(aline);
2767 memset(&parities,0,sizeof(parities)); /* restart the quote count */
/* Advance s to the first letter or digit on the line. */
2769 while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
2770 !g_unichar_isdigit(g_utf8_get_char(s)))
2771 s=g_utf8_next_char(s);
2772 if (g_unichar_islower(g_utf8_get_char(s)))
2774 /* and its first letter is lowercase */
2775 if (pswit[ECHO_SWITCH])
2776 g_print("\n%s\n",aline);
2777 if (!pswit[OVERVIEW_SWITCH])
2778 g_print(" Line %ld column %ld - "
2779 "Paragraph starts with lower-case\n",
2780 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2784 isnewpara=FALSE; /* Signal the end of new para processing. */
2786 /* Check for an em-dash broken at line end. */
2787 if (enddash && g_utf8_get_char(aline)=='-')
2789 if (pswit[ECHO_SWITCH])
2790 g_print("\n%s\n",aline);
2791 if (!pswit[OVERVIEW_SWITCH])
2792 g_print(" Line %ld column 1 - Broken em-dash?\n",linecnt);
/*
 * Walk back over trailing spaces to the last character of the line and
 * test it for a hyphen; presumably this sets enddash for the next
 * iteration - the assignment is not visible here, confirm.
 */
2797 for (s=g_utf8_prev_char(aline+strlen(aline));
2798 g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
2800 if (s>=aline && g_utf8_get_char(s)=='-')
2802 check_for_control_characters(aline);
2804 check_for_odd_characters(aline,warnings,isemptyline);
/* The long-line and short-line checks run only when enabled in warnings. */
2805 if (warnings->longline)
2806 check_for_long_line(aline);
2807 if (warnings->shortline)
2808 check_for_short_line(aline,&last);
/* Record the length in characters and the first character of this line. */
2810 last.len=g_utf8_strlen(aline,-1);
2811 last.start=g_utf8_get_char(aline);
2812 check_for_starting_punctuation(aline);
2815 check_for_spaced_emdash(aline);
2816 check_for_spaced_dash(aline);
2818 check_for_unmarked_paragraphs(aline);
2819 check_for_jeebies(aline);
2820 check_for_mta_from(aline);
2821 check_for_orphan_character(aline);
2822 check_for_pling_scanno(aline);
2823 check_for_extra_period(aline,warnings);
2824 check_for_following_punctuation(aline);
2825 check_for_typos(aline,warnings);
2826 check_for_misspaced_punctuation(aline,&parities,isemptyline);
2827 check_for_double_punctuation(aline,warnings);
2828 check_for_spaced_quotes(aline);
2829 check_for_miscased_genative(aline);
2830 check_end_of_line(aline,warnings);
2831 check_for_unspaced_bracket(aline);
2832 if (warnings->endquote)
2833 check_for_unpunctuated_endquote(aline);
2834 check_for_html_tag(aline);
2835 check_for_html_entity(aline);
/*
 * The accumulated counters are checked and then zeroed, and the previous
 * line is examined for missing end punctuation.
 * NOTE(review): the condition guarding these calls is not visible here.
 */
2838 check_for_mismatched_quotes(&counters,&pending);
2839 memset(&counters,0,sizeof(counters));
2840 /* let the next iteration know that it's starting a new para */
2843 check_for_omitted_punctuation(prevline,&last,start_para_line);
/* Keep a private copy of this line for the next iteration. */
2846 prevline=g_strdup(aline);
/* Duplicate-word notes are printed only without OVERVIEW and VERBOSE. */
2856 if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])
2857 g_tree_foreach(qword,report_duplicate_queries,NULL);
2858 g_tree_unref(qword);
2859 g_tree_unref(qperiod);
/* Passing NULL asks print_as_windows_1252() to release its converter. */
2860 g_set_print_handler(NULL);
2861 print_as_windows_1252(NULL);
2862 if (pswit[MARKUP_SWITCH])
2869 * Get one line from the input text, checking for
2870 * the existence of exactly one CR/LF line-end per line.
2872 * Returns: a pointer to the line.
/*
 * flgets: take the next line from the in-memory text at *etext.
 *
 * etext - in/out cursor into the text; it is advanced one UTF-8 character
 *         at a time as the line is consumed.
 * lcnt  - line number, used only in the diagnostic messages.
 *
 * With the LINE_END switch set, three line-ending problems are queried:
 * a LF without a preceding CR, two successive CRs, and a CR without a LF.
 * Each query echoes the line when the ECHO switch is set and is suppressed
 * by the OVERVIEW switch. When the MARKUP or DP switches are set the line
 * is passed through postprocess_for_HTML() or postprocess_for_DP().
 *
 * NOTE(review): the loop structure, the tests on c and the return
 * statements are not visible in this listing - confirm against the full
 * file.
 */
2874 char *flgets(char **etext,long lcnt)
2877 gboolean isCR=FALSE;
/* theline marks where this line starts within the text. */
2878 char *theline=*etext;
/* Fetch the next character and step the caller cursor past it. */
2883 c=g_utf8_get_char(*etext);
2884 *etext=g_utf8_next_char(*etext);
2887 /* either way, it's end of line */
2894 /* Error - a LF without a preceding CR */
2895 if (pswit[LINE_END_SWITCH])
2897 if (pswit[ECHO_SWITCH])
/* Echo a temporary copy of the text between theline and eos. */
2899 s=g_strndup(theline,eos-theline);
2900 g_print("\n%s\n",s);
2903 if (!pswit[OVERVIEW_SWITCH])
2904 g_print(" Line %ld - No CR?\n",lcnt);
2915 /* Error - two successive CRs */
2916 if (pswit[LINE_END_SWITCH])
2918 if (pswit[ECHO_SWITCH])
2920 s=g_strndup(theline,eos-theline);
2921 g_print("\n%s\n",s);
2924 if (!pswit[OVERVIEW_SWITCH])
2925 g_print(" Line %ld - Two successive CRs?\n",lcnt);
/* A CR was seen but the character after it is not a LF. */
2934 if (pswit[LINE_END_SWITCH] && isCR)
2936 if (pswit[ECHO_SWITCH])
2938 s=g_strndup(theline,eos-theline);
2939 g_print("\n%s\n",s);
2942 if (!pswit[OVERVIEW_SWITCH])
2943 g_print(" Line %ld column %ld - CR without LF?\n",
2944 lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
/* eos follows the text one character at a time. */
2950 eos=g_utf8_next_char(eos);
/* Optional in-place cleaning of HTML and DP markup. */
2954 if (pswit[MARKUP_SWITCH])
2955 postprocess_for_HTML(theline);
2956 if (pswit[DP_SWITCH])
2957 postprocess_for_DP(theline);
2964 * Takes a "word" as a parameter, and checks whether it
2965 * contains a mixture of alpha and digits. Generally, this is an
2966 * error, but may not be for cases like 4th or L5 12s. 3d.
2968 * Returns: TRUE iff an error is found.
/*
 * mixdigit: decide whether a word that mixes letters and digits should be
 * queried.
 *
 * checkword - NUL-terminated UTF-8 word to examine.
 *
 * The word is scanned once for letters and digits. When both are present,
 * nondigit is moved past the leading run of digits and the remainder is
 * compared, as a whole string, with a list of accepted suffixes. A word
 * that starts with a capital L is handled by calling mixdigit() again on
 * the text after the L.
 *
 * NOTE(review): the assignments to the flags and to query, and the return
 * statement, are not visible in this listing - confirm against the full
 * file.
 */
2970 gboolean mixdigit(const char *checkword)
2972 gboolean wehaveadigit,wehavealetter,query;
2973 const char *s,*nondigit;
2974 wehaveadigit=wehavealetter=query=FALSE;
/* Classify every character of the word as a letter or a digit. */
2975 for (s=checkword;*s;s=g_utf8_next_char(s))
2976 if (g_unichar_isalpha(g_utf8_get_char(s)))
2978 else if (g_unichar_isdigit(g_utf8_get_char(s)))
2980 if (wehaveadigit && wehavealetter)
2982 /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
/* Leave nondigit on the first character that is not a digit. */
2984 for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
2985 nondigit=g_utf8_next_char(nondigit))
2987 /* digits, ending in st, rd, nd, th of either case */
2988 if (!g_ascii_strcasecmp(nondigit,"st") ||
2989 !g_ascii_strcasecmp(nondigit,"rd") ||
2990 !g_ascii_strcasecmp(nondigit,"nd") ||
2991 !g_ascii_strcasecmp(nondigit,"th"))
/* The same ordinal suffixes followed by s, again ignoring case. */
2993 if (!g_ascii_strcasecmp(nondigit,"sts") ||
2994 !g_ascii_strcasecmp(nondigit,"rds") ||
2995 !g_ascii_strcasecmp(nondigit,"nds") ||
2996 !g_ascii_strcasecmp(nondigit,"ths"))
/* The same ordinal suffixes followed by ly, again ignoring case. */
2998 if (!g_ascii_strcasecmp(nondigit,"stly") ||
2999 !g_ascii_strcasecmp(nondigit,"rdly") ||
3000 !g_ascii_strcasecmp(nondigit,"ndly") ||
3001 !g_ascii_strcasecmp(nondigit,"thly"))
3003 /* digits, ending in l, L, s or d */
/* Only the l is matched in either case; s and d must be lower-case. */
3004 if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
3005 !strcmp(nondigit,"d"))
3008 * L at the start of a number, representing Britsh pounds, like L500.
3009 * This is cute. We know the current word is mixed digit. If the first
3010 * letter is L, there must be at least one digit following. If both
3011 * digits and letters follow, we have a genuine error, else we have a
3012 * capital L followed by digits, and we accept that as a non-error.
/* Recursive call on the text after the leading L. */
3014 if (g_utf8_get_char(checkword)=='L' &&
3015 !mixdigit(g_utf8_next_char(checkword)))
3024 * Extracts the first/next "word" from the line, and returns it.
3025 * A word is defined as one English word unit--or at least that's the aim.
3026 * "ptr" is advanced to the position in the line where we will start
3027 * looking for the next word.
3029 * Returns: A newly-allocated string.
/*
 * getaword: extract the next word from the text at *ptr.
 *
 * ptr - in/out cursor; it is first advanced past anything that is neither
 *       a letter nor a digit, and later past the characters consumed.
 *
 * Returns the word as a newly allocated string obtained from
 * g_string_free(word,FALSE); the caller owns it.
 *
 * Two scans are made. The look-ahead scan collects letters, digits, commas
 * and periods and then searches the collected text for a comma or period
 * that directly follows a digit. If that fails the buffer is emptied and
 * the regular scan collects letters, digits and apostrophes only.
 *
 * NOTE(review): the declarations, the start value of s and the statements
 * between the punctuation test and the first return are not visible in
 * this listing - confirm against the full file.
 */
3031 gchar *getaword(const char **ptr)
3036 word=g_string_new(NULL);
/* Skip leading characters that cannot start a word; stop at end of text. */
3037 for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
3038 !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
3039 **ptr;*ptr=g_utf8_next_char(*ptr))
3042 * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
3043 * Especially yucky is the case of L1,000
3044 * This section looks for a pattern of characters including a digit
3045 * followed by a comma or period followed by one or more digits.
3046 * If found, it returns this whole pattern as a word; otherwise we discard
3047 * the results and resume our normal programming.
/* Look-ahead scan: this loop moves s and leaves *ptr alone. */
3050 for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
3051 g_unichar_isalpha(g_utf8_get_char(s)) ||
3052 g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
3053 g_string_append_unichar(word,g_utf8_get_char(s));
/* Starting at the second character, compare each one with its predecessor. */
3056 for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))
3058 c=g_utf8_get_char(t);
3059 pc=g_utf8_get_char(g_utf8_prev_char(t));
3060 if ((c=='.' || c==',') && g_unichar_isdigit(pc))
3063 return g_string_free(word,FALSE);
3067 /* we didn't find a punctuated number - do the regular getword thing */
3068 g_string_truncate(word,0);
/* Regular scan: this one advances the caller cursor as it collects. */
3069 for (;g_unichar_isdigit(g_utf8_get_char(*ptr)) ||
3070 g_unichar_isalpha(g_utf8_get_char(*ptr)) ||
3071 g_utf8_get_char(*ptr)=='\'';*ptr=g_utf8_next_char(*ptr))
3072 g_string_append_unichar(word,g_utf8_get_char(*ptr));
3073 return g_string_free(word,FALSE);
3079 * Is this word a Roman Numeral?
3081 * It doesn't actually validate that the number is a valid Roman Numeral--for
3082 * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
3083 * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
3084 * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
3085 * expressions thereof, except when it came to taxes. Allow any number of M,
3086 * an optional D, an optional CM or CD, any number of optional Cs, an optional
3087 * XL or an optional XC, an optional IX or IV, an optional V and any number of optional Is.
/*
 * isroman: test whether a word has the general shape of a Roman numeral.
 *
 * t - NUL-terminated word. Every comparison below is against a lower-case
 *     letter, so callers presumably pass a lower-cased word - confirm.
 *
 * The tests run in a fixed order: any number of m, one d, the pair cm, the
 * pair cd, any number of c, the pair xl, the pair xc, one l, any number of
 * x, the pair ix, the pair iv, one v and finally any number of i.
 * The extra test on *t in the while conditions can never change the
 * outcome, because a NUL never equals the letter being matched.
 *
 * NOTE(review): the statements that advance t after each match and the
 * return statement are not visible in this listing - presumably the
 * result is TRUE when the whole word has been consumed; confirm.
 */
3090 gboolean isroman(const char *t)
3096 while (g_utf8_get_char(t)=='m' && *t)
3098 if (g_utf8_get_char(t)=='d')
3100 if (g_str_has_prefix(t,"cm"))
3102 if (g_str_has_prefix(t,"cd"))
3104 while (g_utf8_get_char(t)=='c' && *t)
3106 if (g_str_has_prefix(t,"xl"))
3108 if (g_str_has_prefix(t,"xc"))
3110 if (g_utf8_get_char(t)=='l')
3112 while (g_utf8_get_char(t)=='x' && *t)
3114 if (g_str_has_prefix(t,"ix"))
3116 if (g_str_has_prefix(t,"iv"))
3118 if (g_utf8_get_char(t)=='v')
3120 while (g_utf8_get_char(t)=='i' && *t)
3126 * postprocess_for_DP:
3128 * Invoked with the -d switch from flgets().
3129 * It simply "removes" from the line a hard-coded set of common
3130 * DP-specific tags, so that the line passed to the main routine has
3131 * been pre-cleaned of DP markup.
/*
 * postprocess_for_DP: delete every occurrence of each DPmarkup entry from
 * theline, in place.
 *
 * theline - writable NUL-terminated line; it can only get shorter.
 *
 * DPmarkup is walked until an entry whose first byte is NUL, which acts as
 * the end marker of the table.
 */
3133 void postprocess_for_DP(char *theline)
3139 for (i=0;*DPmarkup[i];i++)
/* Search again from the start of the line after every removal. */
3140 while ((s=strstr(theline,DPmarkup[i])))
/* t is the text after the tag; the move closes the gap, NUL included. */
3142 t=s+strlen(DPmarkup[i]);
3143 memmove(s,t,strlen(t)+1);
3148 * postprocess_for_HTML:
3150 * Invoked with the -m switch from flgets().
3151 * It simply "removes" from the line a hard-coded set of common
3152 * HTML tags and "replaces" a hard-coded set of common HTML
3153 * entities, so that the line passed to the main routine has
3154 * been pre-cleaned of HTML.
/*
 * postprocess_for_HTML: clean HTML out of theline, in place.
 *
 * theline - writable NUL-terminated line.
 *
 * losemarkup() is called repeatedly for as long as it returns non-NULL,
 * then loseentities() is run once over the result.
 */
3156 void postprocess_for_HTML(char *theline)
3158 while (losemarkup(theline))
3160 loseentities(theline);
/*
 * losemarkup: remove one recognized HTML tag from theline, in place.
 *
 * theline - writable NUL-terminated line.
 *
 * The first less-than sign and the first greater-than sign after it mark
 * the candidate tag. The text after the less-than sign is compared with
 * each entry of the markup table using tagcomp(); the table ends at an
 * entry whose first byte is NUL. On a match the tag, including its closing
 * greater-than sign, is cut out with memmove.
 *
 * postprocess_for_HTML() keeps calling this function while the result is
 * non-NULL.
 *
 * NOTE(review): the handling of a missing bracket and the return
 * statements are not visible in this listing - confirm against the full
 * file.
 */
3163 char *losemarkup(char *theline)
3167 s=strchr(theline,'<');
3168 t=s?strchr(s,'>'):NULL;
3171 for (i=0;*markup[i];i++)
3172 if (tagcomp(g_utf8_next_char(s),markup[i]))
/* Step t past the closing bracket, then close the gap, NUL included. */
3174 t=g_utf8_next_char(t);
3175 memmove(s,t,strlen(t)+1);
3178 /* It's an unrecognized <xxx>. */
3182 void loseentities(char *theline)
3189 GTree *entities=NULL;
3190 GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
3194 g_tree_destroy(entities);
3196 if (translit==(GIConv)-1)
3197 g_iconv_close(translit);
3198 translit=(GIConv)-1;
3199 if (to_utf8==(GIConv)-1)
3200 g_iconv_close(to_utf8);
3208 entities=g_tree_new((GCompareFunc)strcmp);
3209 for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
3210 g_tree_insert(entities,HTMLentities[i].name,
3211 GUINT_TO_POINTER(HTMLentities[i].c));
3213 if (translit==(GIConv)-1)
3214 translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
3215 if (to_utf8==(GIConv)-1)
3216 to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
3217 while((amp=strchr(theline,'&')))
3219 scolon=strchr(amp,';');
3224 if (amp+2+strspn(amp+2,"0123456789")==scolon)
3225 c=strtol(amp+2,NULL,10);
3226 else if (amp[2]=='x' &&
3227 amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
3228 c=strtol(amp+3,NULL,16);
3232 s=g_strndup(amp+1,scolon-(amp+1));
3233 c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
3242 if (c<128 || c>=192 && c<=255) /* An ISO-8859-1 character */
3243 theline+=g_unichar_to_utf8(c,theline);
3247 nb=g_unichar_to_utf8(c,s);
3248 t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
3250 s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
3252 memcpy(theline,s,nb);
3256 memmove(theline,g_utf8_next_char(scolon),
3257 strlen(g_utf8_next_char(scolon))+1);
3260 theline=g_utf8_next_char(amp);
/*
 * tagcomp: case-insensitive test of whether the text of a tag begins with
 * a given tag name.
 *
 * strin   - text that follows the opening bracket of a tag; one leading
 *           slash is skipped so that closing tags compare the same way.
 * basetag - tag name to compare against.
 *
 * Both strings are case-folded with g_utf8_casefold() and the folded input
 * is tested for the folded tag name as a prefix; the result is stored in
 * retval.
 *
 * NOTE(review): g_utf8_casefold() returns newly allocated strings; the
 * lines that release s and t and return retval are not visible in this
 * listing - confirm against the full file.
 */
3264 gboolean tagcomp(const char *strin,const char *basetag)
3268 if (g_utf8_get_char(strin)=='/')
3269 t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
3271 t=g_utf8_casefold(strin,-1);
3272 s=g_utf8_casefold(basetag,-1);
3273 retval=g_str_has_prefix(t,s);
3279 void proghelp(GOptionContext *context)
3282 fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
3283 fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
3284 fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
3285 fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
3286 "For details, read the file COPYING.\n",stderr);
3287 fputs("This is Free Software; "
3288 "you may redistribute it under certain conditions (GPL);\n",stderr);
3289 fputs("read the file COPYING for details.\n\n",stderr);
3290 help=g_option_context_get_help(context,TRUE,NULL);
3293 fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
3294 fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
3295 "non-ASCII\n",stderr);
3296 fputs("characters like accented letters, "
3297 "lines longer than 75 or shorter than 55,\n",stderr);
3298 fputs("unbalanced quotes or brackets, "
3299 "a variety of badly formatted punctuation, \n",stderr);
3300 fputs("HTML tags, some likely typos. "
3301 "It is NOT a substitute for human judgement.\n",stderr);