In verbose mode each instance of a questionable word is queried so don't give a count
1 /*************************************************************************/
2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com> */
5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk> */
7 /* This program is free software; you can redistribute it and/or modify */
8 /* it under the terms of the GNU General Public License as published by */
9 /* the Free Software Foundation; either version 2 of the License, or */
10 /* (at your option) any later version. */
12 /* This program is distributed in the hope that it will be useful, */
13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
15 /* GNU General Public License for more details. */
17 /* You should have received a copy of the GNU General Public License */
18 /* along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 /*************************************************************************/
30 #include "HTMLentities.h"
36 "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
37 "nad", "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa",
38 "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
39 "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
40 "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
41 "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
42 "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
43 "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
44 "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
45 "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
46 "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
47 "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
48 "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
49 "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
50 "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
51 "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
52 "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
53 "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
54 "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
55 "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
56 "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
57 "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
58 "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
59 "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
60 "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
61 "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
62 "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
63 "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
64 "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
70 /* Common abbreviations and other OK words not to query as typos. */
72 "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
73 "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
74 "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
75 "outbid", "outbids", "frostbite", "frostbitten", ""
78 /* Common abbreviations that cause otherwise unexplained periods. */
80 "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
81 "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
85 * Two-Letter combinations that rarely if ever start words,
86 * but are common scannos or otherwise common letter combinations.
89 "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
93 * Two-Letter combinations that rarely if ever end words,
94 * but are common scannos or otherwise common letter combinations.
97 "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
98 "sw", "gr", "sl", "cl", "iy", ""
102 "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
103 "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
104 "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
105 "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
109 "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
113 "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
114 "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
115 "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
116 "during", "let", "toward", "among", ""
120 "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
121 "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
122 "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
123 "among", "those", "into", "whom", "having", "thence", ""
126 /* special characters */
/* The numeric values below are ASCII codes; the bracket macros use literals. */
127 #define CHAR_SPACE 32 /* ' ' */
131 #define CHAR_DQUOTE 34 /* '"' */
132 #define CHAR_SQUOTE 39 /* '\'' */
133 #define CHAR_OPEN_SQUOTE 96 /* '`' */
134 #define CHAR_TILDE 126 /* '~' */
135 #define CHAR_ASTERISK 42 /* '*' */
136 #define CHAR_FORESLASH 47 /* '/' */
137 #define CHAR_CARAT 94 /* '^' (caret) */
139 #define CHAR_UNDERSCORE '_'
140 #define CHAR_OPEN_CBRACK '{'
141 #define CHAR_CLOSE_CBRACK '}'
142 #define CHAR_OPEN_RBRACK '('
143 #define CHAR_CLOSE_RBRACK ')'
144 #define CHAR_OPEN_SBRACK '['
145 #define CHAR_CLOSE_SBRACK ']'
147 /* longest and shortest normal PG line lengths */
/*
 * Lengths are compared in characters (g_utf8_strlen), not bytes: lines longer
 * than LONGEST_PG_LINE are treated as long, lines longer than WAY_TOO_LONG
 * are additionally counted as very long, and SHORTEST_PG_LINE is the
 * threshold used by the short-line tests.
 */
148 #define LONGEST_PG_LINE 75
149 #define WAY_TOO_LONG 80
150 #define SHORTEST_PG_LINE 55
170 gboolean pswit[SWITNO]; /* program switches */
172 static GOptionEntry options[]={
173 { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
174 "Ignore DP-specific markup", NULL },
175 { "noecho", 'e', 0, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
176 "Don't echo queried line", NULL },
177 { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
178 "Check single quotes", NULL },
179 { "typo", 't', 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
180 "Check common typos", NULL },
181 { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
182 "Require closure of quotes on every paragraph", NULL },
183 { "relaxed", 'x', 0, G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
184 "Disable paranoid querying of everything", NULL },
185 { "line-end", 'l', 0, G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
186 "Disable line end checking", NULL },
187 { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
188 "Overview: just show counts", NULL },
189 { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
190 "Output errors to stdout instead of stderr", NULL },
191 { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
192 "Echo header fields", NULL },
193 { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
194 "Ignore markup in < >", NULL },
195 { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
196 "Use file of user-defined typos", NULL },
197 { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
198 "Defaults for use on www upload", NULL },
199 { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
200 "Verbose - list everything", NULL },
204 long cnt_dquot; /* for overview mode, count of doublequote queries */
205 long cnt_squot; /* for overview mode, count of singlequote queries */
206 long cnt_brack; /* for overview mode, count of brackets queries */
207 long cnt_bin; /* for overview mode, count of non-ASCII queries */
208 long cnt_odd; /* for overview mode, count of odd character queries */
209 long cnt_long; /* for overview mode, count of long line errors */
210 long cnt_short; /* for overview mode, count of short line queries */
211 long cnt_punct; /* for overview mode,
212 count of punctuation and spacing queries */
213 long cnt_dash; /* for overview mode, count of dash-related queries */
214 long cnt_word; /* for overview mode, count of word queries */
215 long cnt_html; /* for overview mode, count of html queries */
216 long cnt_lineend; /* for overview mode, count of line-end queries */
217 long cnt_spacend; /* count of lines with space at end */
218 long linecnt; /* count of total lines in the file */
219 long checked_linecnt; /* count of lines actually checked */
221 void proghelp(GOptionContext *context);
222 void procfile(const char *);
226 gboolean mixdigit(const char *);
227 gchar *getaword(const char **);
228 char *flgets(char **,long);
229 void postprocess_for_HTML(char *);
230 char *linehasmarkup(char *);
231 char *losemarkup(char *);
232 gboolean tagcomp(const char *,const char *);
233 void loseentities(char *);
234 gboolean isroman(const char *);
235 void postprocess_for_DP(char *);
236 void print_as_windows_1252(const char *string);
237 void print_as_utf_8(const char *string);
239 GTree *qword,*qperiod;
245 struct first_pass_results {
246 long firstline,astline;
247 long footerline,totlen,binlen,alphalen,endquote_count,shortline,dotcomma;
248 long fslashline,hyphens,longline,verylongline,htmcount,standalone_digit;
249 long spacedash,emdash,space_emdash,non_PG_space_emdash,PG_space_emdash;
250 int Dutchcount,Frenchcount;
254 int shortline,longline,bin,dash,dotcomma,ast,fslash,digit,hyphen;
256 gboolean isDutch,isFrench;
261 int c_unders,c_brack,s_brack,r_brack;
262 int open_single_quote,close_single_quote;
265 struct line_properties {
266 unsigned int len,blen;
275 char *dquote,*squote,*rbrack,*sbrack,*cbrack,*unders;
/*
 * parse_options:
 *
 * Parse the command line with GOption, storing each flag in pswit[], and
 * then normalise the switches whose command-line sense is inverted.
 * argc/argv are passed by address so g_option_context_parse() can update
 * them. A parse failure is reported with g_printerr() together with a
 * hint to use --help.
 */
279 void parse_options(int *argc,char ***argv)
282 GOptionContext *context;
283 context=g_option_context_new(
284 "file - looks for errors in Project Gutenberg(TM) etexts");
285 g_option_context_add_main_entries(context,options,NULL);
286 if (!g_option_context_parse(context,argc,argv,&err))
288 g_printerr("Bookloupe: %s\n",err->message);
289 g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
292 /* Paranoid checking is turned OFF, not on, by its switch */
293 pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
294 if (pswit[PARANOID_SWITCH])
295 /* if running in paranoid mode, typo checks default to enabled */
/* The switch is inverted here, so giving --typo in paranoid mode turns typo checks off. */
296 pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
297 /* Line-end checking is turned OFF, not on, by its switch */
298 pswit[LINE_END_SWITCH]=!pswit[LINE_END_SWITCH];
299 /* Echoing is turned OFF, not on, by its switch */
300 pswit[ECHO_SWITCH]=!pswit[ECHO_SWITCH];
301 if (pswit[OVERVIEW_SWITCH])
302 /* just print summary; don't echo */
303 pswit[ECHO_SWITCH]=FALSE;
305 * Web uploads - for the moment, this is really just a placeholder
306 * until we decide what processing we really want to do on web uploads
308 if (pswit[WEB_SWITCH])
310 /* specific override for web uploads */
/* Every other switch is forced to a fixed value, whatever was given on the command line. */
311 pswit[ECHO_SWITCH]=TRUE;
312 pswit[SQUOTE_SWITCH]=FALSE;
313 pswit[TYPO_SWITCH]=TRUE;
314 pswit[QPARA_SWITCH]=FALSE;
315 pswit[PARANOID_SWITCH]=TRUE;
316 pswit[LINE_END_SWITCH]=FALSE;
317 pswit[OVERVIEW_SWITCH]=FALSE;
318 pswit[STDOUT_SWITCH]=FALSE;
319 pswit[HEADER_SWITCH]=TRUE;
320 pswit[VERBOSE_SWITCH]=FALSE;
321 pswit[MARKUP_SWITCH]=FALSE;
322 pswit[USERTYPO_SWITCH]=FALSE;
323 pswit[DP_SWITCH]=FALSE;
330 g_option_context_free(context);
336 * Read in the user-defined stealth scanno list.
/*
 * read_user_scannos:
 *
 * Load the user typo list into the usertypo tree. Four locations are tried
 * in order: "bookloupe.typ" as a relative name, "bookloupe.typ" in the
 * running_from directory, then "gutcheck.typ" in the same two places. Each
 * further attempt follows a test for G_FILE_ERROR_NOENT on the previous one.
 * The file text is accepted as UTF-8 if it validates, otherwise it is
 * converted from Windows-1252.
 */
338 void read_user_scannos(void)
341 gchar *usertypo_file;
345 gchar *contents,*utf8,**lines;
346 usertypo_file=g_strdup("bookloupe.typ");
347 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
348 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
351 g_free(usertypo_file);
352 usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
353 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
355 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
358 g_free(usertypo_file);
/* Fall back to the file name used by gutcheck. */
359 usertypo_file=g_strdup("gutcheck.typ");
360 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
362 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
365 g_free(usertypo_file);
366 usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
367 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
369 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
371 g_free(usertypo_file);
/* The message names only bookloupe.typ although gutcheck.typ was also tried. */
372 g_print(" --> I couldn't find bookloupe.typ "
373 "-- proceeding without user typos.\n");
378 fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
379 g_free(usertypo_file);
383 if (g_utf8_validate(contents,len,NULL))
384 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
/* Conversion errors are not collected here: the GError argument is NULL. */
386 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
/* Split on either CR or LF, so CRLF files yield empty strings between lines. */
388 lines=g_strsplit_set(utf8,"\r\n",0);
/* Keys are freed by the tree (g_free); values are the constant 1. */
390 usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
391 for (i=0;lines[i];i++)
/* Skip empty strings and lines whose first byte is '!' or below (space, controls). */
392 if (*(unsigned char *)lines[i]>'!')
393 g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
402 * Read an etext returning a newly allocated string containing the file
403 * contents or NULL on error.
/*
 * read_etext:
 *
 * filename: path of the etext to load.
 * err:      receives the error when loading or conversion fails.
 *
 * The file is read whole. If it validates as UTF-8 it is normalised and
 * output is routed through print_as_utf_8; otherwise it is converted from
 * Windows-1252 and output is routed through print_as_windows_1252.
 */
405 gchar *read_etext(const char *filename,GError **err)
407 GError *tmp_err=NULL;
408 gchar *contents,*utf8;
409 gsize len,bytes_read,bytes_written;
411 if (!g_file_get_contents(filename,&contents,&len,err))
413 if (g_utf8_validate(contents,len,NULL))
415 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
416 g_set_print_handler(print_as_utf_8);
418 SetConsoleOutputCP(CP_UTF8);
423 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,
424 &bytes_written,&tmp_err);
425 if (g_error_matches(tmp_err,G_CONVERT_ERROR,
426 G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
/*
 * Walk the bytes that were consumed before the failure to work out the
 * line and column of the offending byte, which is contents[bytes_read].
 */
429 for(i=0;i<bytes_read;i++)
430 if (contents[i]=='\n')
435 else if (contents[i]!='\r')
437 g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
438 "Input conversion failed. Byte %d at line %d, column %d is not a "
439 "valid Windows-1252 character",
440 ((unsigned char *)contents)[bytes_read],line,col);
/* Any other conversion error is handed to the caller unchanged. */
443 g_propagate_error(err,tmp_err);
444 g_set_print_handler(print_as_windows_1252);
446 SetConsoleOutputCP(1252);
/*
 * cleanup_on_exit:
 *
 * Registered with atexit() in main(); puts back the console output code
 * page that main() stored in saved_cp.
 */
453 void cleanup_on_exit(void)
456 SetConsoleOutputCP(saved_cp);
/*
 * main:
 *
 * Program entry point: records the directory the program was started from,
 * parses the options and, in overview mode, prints the per-category query
 * counters and their total.
 */
460 int main(int argc,char **argv)
/* cleanup_on_exit() restores the console code page saved on the next line. */
463 atexit(cleanup_on_exit);
464 saved_cp=GetConsoleOutputCP();
/* running_from is used by read_user_scannos() to locate the typo file. */
466 running_from=g_path_get_dirname(argv[0]);
467 parse_options(&argc,&argv);
468 if (pswit[USERTYPO_SWITCH])
470 fprintf(stderr,"bookloupe: Check and report on an e-text\n");
472 if (pswit[OVERVIEW_SWITCH])
/* head+foot is the number of lines that were read but not checked. */
474 g_print(" Checked %ld lines of %ld (head+foot = %ld)\n\n",
475 checked_linecnt,linecnt,linecnt-checked_linecnt);
476 g_print(" --------------- Queries found --------------\n");
478 g_print(" Long lines: %14ld\n",cnt_long);
480 g_print(" Short lines: %14ld\n",cnt_short);
482 g_print(" Line-end problems: %14ld\n",cnt_lineend);
484 g_print(" Common typos: %14ld\n",cnt_word);
486 g_print(" Unmatched quotes: %14ld\n",cnt_dquot);
488 g_print(" Unmatched SingleQuotes: %14ld\n",cnt_squot);
490 g_print(" Unmatched brackets: %14ld\n",cnt_brack);
492 g_print(" Non-ASCII characters: %14ld\n",cnt_bin);
494 g_print(" Proofing characters: %14ld\n",cnt_odd);
496 g_print(" Punctuation & spacing queries: %14ld\n",cnt_punct);
498 g_print(" Non-standard dashes: %14ld\n",cnt_dash);
500 g_print(" Possible HTML tags: %14ld\n",cnt_html);
/* The total sums all twelve cnt_* query counters. */
502 g_print(" TOTAL QUERIES %14ld\n",
503 cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+
504 cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend);
506 g_free(running_from);
508 g_tree_unref(usertypo);
515 * Run a first pass - verify that it's a valid PG
516 * file, decide whether to report some things that
517 * occur many times in the text like long or short
518 * lines, non-standard dashes, etc.
520 struct first_pass_results *first_pass(const char *etext)
522 gunichar laststart=CHAR_SPACE;
527 unsigned int lastlen=0,lastblen=0;
528 long spline=0,nspline=0;
529 static struct first_pass_results results={0};
531 lines=g_strsplit(etext,"\n",0);
532 for (j=0;lines[j];j++)
534 lbytes=strlen(lines[j]);
535 while (lines[j][lbytes-1]=='\r')
536 lines[j][--lbytes]='\0';
537 llen=g_utf8_strlen(lines[j],lbytes);
539 if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
540 (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
543 g_print(" --> Duplicate header?\n");
544 spline=linecnt+1; /* first line of non-header text, that is */
546 if (!strncmp(lines[j],"*** START",9) &&
547 strstr(lines[j],"PROJECT GUTENBERG"))
550 g_print(" --> Duplicate header?\n");
551 nspline=linecnt+1; /* first line of non-header text, that is */
553 if (spline || nspline)
555 lc_line=g_utf8_strdown(lines[j],lbytes);
556 if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
558 if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
560 if (results.footerline)
562 /* it's an old-form header - we can detect duplicates */
564 g_print(" --> Duplicate footer?\n");
567 results.footerline=linecnt;
573 results.firstline=spline;
575 results.firstline=nspline; /* override with new */
576 if (results.footerline)
577 continue; /* don't count the boilerplate in the footer */
578 results.totlen+=llen;
579 for (s=lines[j];*s;s=g_utf8_next_char(s))
581 if (g_utf8_get_char(s)>127)
583 if (g_unichar_isalpha(g_utf8_get_char(s)))
585 if (s>lines[j] && g_utf8_get_char(s)==CHAR_DQUOTE &&
586 isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
587 results.endquote_count++;
589 if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
590 lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
593 g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
595 if (strstr(lines[j],".,"))
597 /* only count ast lines for ignoring purposes where there is */
598 /* locase text on the line */
599 if (strchr(lines[j],'*'))
601 for (s=lines[j];*s;s=g_utf8_next_char(s))
602 if (g_unichar_islower(g_utf8_get_char(s)))
607 if (strchr(lines[j],'/'))
608 results.fslashline++;
609 for (s=g_utf8_prev_char(lines[j]+lbytes);
610 s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
612 if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
613 g_utf8_get_char(g_utf8_prev_char(s))!='-')
615 if (llen>LONGEST_PG_LINE)
617 if (llen>WAY_TOO_LONG)
618 results.verylongline++;
619 if (strchr(lines[j],'<') && strchr(lines[j],'>'))
621 i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
624 if (strstr(lines[j],"<i>"))
625 results.htmcount+=4; /* bonus marks! */
627 /* Check for spaced em-dashes */
628 if (lines[j][0] && (s=strstr(g_utf8_next_char(lines[j]),"--")))
631 if (s[-1]==CHAR_SPACE || s[2]==CHAR_SPACE)
632 results.space_emdash++;
633 if (s[-1]==CHAR_SPACE && s[2]==CHAR_SPACE)
634 /* count of em-dashes with spaces both sides */
635 results.non_PG_space_emdash++;
636 if (s[-1]!=CHAR_SPACE && s[2]!=CHAR_SPACE)
637 /* count of PG-type em-dashes with no spaces */
638 results.PG_space_emdash++;
643 if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))
644 results.Dutchcount++;
645 if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))
646 results.Frenchcount++;
647 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
648 results.standalone_digit++;
651 /* Check for spaced dashes */
652 if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
656 laststart=lines[j][0];
665 * Make some snap decisions based on the first pass results.
/*
 * report_first_pass:
 *
 * results: the statistics gathered by first_pass().
 *
 * Decides from the first-pass counts which kinds of query are worth
 * reporting, records the decisions in a function-static struct warnings,
 * and prints a " --> " note for each kind that is switched off. It may also
 * turn on pswit[MARKUP_SWITCH] and reset results->firstline.
 */
667 struct warnings *report_first_pass(struct first_pass_results *results)
669 static struct warnings warnings={0};
671 g_print(" --> %ld lines in this file have white space at end\n",
674 if (results->dotcomma>5)
677 g_print(" --> %ld lines in this file contain '.,'. "
678 "Not reporting them.\n",results->dotcomma);
681 * If more than 50 lines, or one-tenth, are short,
682 * don't bother reporting them.
684 warnings.shortline=1;
685 if (results->shortline>50 || results->shortline*10>linecnt)
687 warnings.shortline=0;
688 g_print(" --> %ld lines in this file are short. "
689 "Not reporting short lines.\n",results->shortline);
692 * If more than 50 lines, or one-tenth, are long,
693 * don't bother reporting them.
696 if (results->longline>50 || results->longline*10>linecnt)
699 g_print(" --> %ld lines in this file are long. "
700 "Not reporting long lines.\n",results->longline);
702 /* If more than 10 lines contain asterisks, don't bother reporting them. */
704 if (results->astline>10)
707 g_print(" --> %ld lines in this file contain asterisks. "
708 "Not reporting them.\n",results->astline);
711 * If more than 10 lines contain forward slashes,
712 * don't bother reporting them.
715 if (results->fslashline>10)
718 g_print(" --> %ld lines in this file contain forward slashes. "
719 "Not reporting them.\n",results->fslashline);
722 * If more than 20 lines contain unpunctuated endquotes,
723 * don't bother reporting them.
726 if (results->endquote_count>20)
729 g_print(" --> %ld lines in this file contain unpunctuated endquotes. "
730 "Not reporting them.\n",results->endquote_count);
733 * If more than 15 lines contain standalone digits,
734 * don't bother reporting them.
/* NOTE(review): the test below uses a threshold of 10, not the 15 stated in the comment. */
737 if (results->standalone_digit>10)
740 g_print(" --> %ld lines in this file contain standalone 0s and 1s. "
741 "Not reporting them.\n",results->standalone_digit);
744 * If more than 20 lines contain hyphens at end,
745 * don't bother reporting them.
748 if (results->hyphens>20)
751 g_print(" --> %ld lines in this file have hyphens at end. "
752 "Not reporting them.\n",results->hyphens);
/* htmcount is the first-pass markup score; above 20 the text is treated as HTML. */
754 if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
756 g_print(" --> Looks like this is HTML. Switching HTML mode ON.\n");
757 pswit[MARKUP_SWITCH]=1;
759 if (results->verylongline>0)
760 g_print(" --> %ld lines in this file are VERY long!\n",
761 results->verylongline);
763 * If there are more non-PG spaced dashes than PG em-dashes,
764 * assume it's deliberate.
765 * Current PG guidelines say don't use them, but older texts do,
766 * and some people insist on them whatever the guidelines say.
769 if (results->spacedash+results->non_PG_space_emdash>
770 results->PG_space_emdash)
773 g_print(" --> There are %ld spaced dashes and em-dashes. "
774 "Not reporting them.\n",
775 results->spacedash+results->non_PG_space_emdash);
777 /* If more than a quarter of characters are hi-bit, bug out. */
779 if (results->binlen*4>results->totlen)
781 g_print(" --> This file does not appear to be ASCII. "
782 "Terminating. Best of luck with it!\n");
/* Likewise if fewer than a quarter of the characters are letters. */
785 if (results->alphalen*4<results->totlen)
787 g_print(" --> This file does not appear to be text. "
788 "Terminating. Best of luck with it!\n");
/* More than 1% non-ASCII characters, or more than 100 in total. */
791 if (results->binlen*100>results->totlen || results->binlen>100)
793 g_print(" --> There are a lot of foreign letters here. "
794 "Not reporting them.\n");
797 warnings.isDutch=FALSE;
798 if (results->Dutchcount>50)
800 warnings.isDutch=TRUE;
801 g_print(" --> This looks like Dutch - "
802 "switching off dashes and warnings for 's Middags case.\n");
804 warnings.isFrench=FALSE;
805 if (results->Frenchcount>50)
807 warnings.isFrench=TRUE;
808 g_print(" --> This looks like French - "
809 "switching off some doublepunct.\n");
811 if (results->firstline && results->footerline)
812 g_print(" The PG header and footer appear to be already on.\n");
815 if (results->firstline)
816 g_print(" The PG header is on - no footer.\n");
817 if (results->footerline)
818 g_print(" The PG footer is on - no header.\n");
821 if (pswit[VERBOSE_SWITCH])
824 warnings.shortline=1;
833 g_print(" *** Verbose output is ON -- you asked for it! ***\n");
835 if (warnings.isDutch)
/* Header and footer fewer than 100 lines apart: too little text between them to trust. */
837 if (results->footerline>0 && results->firstline>0 &&
838 results->footerline>results->firstline &&
839 results->footerline-results->firstline<100)
841 g_print(" --> I don't really know where this text starts. \n");
842 g_print(" There are no reference points.\n");
843 g_print(" I'm going to have to report the header and footer "
845 results->firstline=0;
853 * Look along the line, accumulate the count of quotes, and see
854 * if this is an empty line - i.e. a line with nothing on it
856 * If line has just spaces, period, * and/or - on it, don't
857 * count it, since empty lines with asterisks or dashes to
858 * separate sections are common.
860 * Returns: TRUE if the line is empty.
/*
 * analyse_quotes:
 *
 * aline:    the line to examine (UTF-8).
 * counters: running quote, underscore and bracket counts, updated in place.
 *
 * Walks the line one character at a time. Single quotes and backquotes
 * are classified as opening quotes, apostrophes or closing quotes from
 * the characters on either side; underscores and brackets are counted.
 */
862 gboolean analyse_quotes(const char *aline,struct counters *counters)
865 /* assume the line is empty until proven otherwise */
866 gboolean isemptyline=TRUE;
867 const char *s=aline,*sprev,*snext;
872 snext=g_utf8_next_char(s);
873 c=g_utf8_get_char(s);
876 if (c==CHAR_SQUOTE || c==CHAR_OPEN_SQUOTE)
881 * At start of line, it can only be an openquote.
882 * Hardcode a very common exception!
/* 'tis and 'Tis begin with an apostrophe, not an opening quote. */
884 if (!g_str_has_prefix(snext,"tis") &&
885 !g_str_has_prefix(snext,"Tis"))
886 counters->open_single_quote++;
888 else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
889 g_unichar_isalpha(g_utf8_get_char(snext)))
890 /* Do nothing! it's definitely an apostrophe, not a quote */
892 /* it's outside a word - let's check it out */
893 else if (c==CHAR_OPEN_SQUOTE ||
894 g_unichar_isalpha(g_utf8_get_char(snext)))
896 /* it damwell better BE an openquote */
897 if (!g_str_has_prefix(snext,"tis") &&
898 !g_str_has_prefix(snext,"Tis"))
899 /* hardcode a very common exception! */
900 counters->open_single_quote++;
904 /* now - is it a closequote? */
905 guessquote=0; /* accumulate clues */
906 if (g_unichar_isalpha(g_utf8_get_char(sprev)))
908 /* it follows a letter - could be either */
910 if (g_utf8_get_char(sprev)=='s')
912 /* looks like a plural apostrophe */
914 if (g_utf8_get_char(snext)==CHAR_SPACE)
919 /* it doesn't have a letter either side */
/*
 * NOTE(review): strchr() converts its search character to char, so a code
 * point above 255 is truncated before the comparison, and a NUL (quote at
 * end of line) matches the string terminator. The equivalent test in
 * check_for_starting_punctuation uses g_utf8_strchr() - confirm which
 * behaviour is wanted here.
 */
920 else if (strchr(".?!,;:",g_utf8_get_char(sprev)) &&
921 strchr(".?!,;: ",g_utf8_get_char(snext)))
922 guessquote+=8; /* looks like a closequote */
925 if (counters->open_single_quote>counters->close_single_quote)
927 * Give it the benefit of some doubt,
928 * if a squote is already open.
934 counters->close_single_quote++;
937 if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
939 isemptyline=FALSE; /* ignore lines like * * * as spacers */
940 if (c==CHAR_UNDERSCORE)
941 counters->c_unders++;
/* Curly, round and square brackets are each tested for separately. */
942 if (c==CHAR_OPEN_CBRACK)
944 if (c==CHAR_CLOSE_CBRACK)
946 if (c==CHAR_OPEN_RBRACK)
948 if (c==CHAR_CLOSE_RBRACK)
950 if (c==CHAR_OPEN_SBRACK)
952 if (c==CHAR_CLOSE_SBRACK)
961 * check_for_control_characters:
963 * Check for invalid or questionable characters in the line
964 * Anything above 127 is invalid for plain ASCII, and
965 * non-printable control characters should also be flagged.
966 * Tabs should generally not be there.
/*
 * aline: the line to examine (UTF-8).
 *
 * Every character below CHAR_SPACE other than LF, CR and TAB is queried.
 * The line is echoed when pswit[ECHO_SWITCH] is set, and the query itself
 * is printed unless pswit[OVERVIEW_SWITCH] is set.
 */
968 void check_for_control_characters(const char *aline)
972 for (s=aline;*s;s=g_utf8_next_char(s))
974 c=g_utf8_get_char(s);
975 if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
977 if (pswit[ECHO_SWITCH])
978 g_print("\n%s\n",aline);
979 if (!pswit[OVERVIEW_SWITCH])
/*
 * g_utf8_pointer_to_offset() takes the string start first and the
 * position second; with the arguments the other way round the offset
 * is negative and the reported column is wrong past column 1.
 */
980 g_print(" Line %ld column %ld - Control character %u\n",
981 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
989 * check_for_odd_characters:
991 * Check for binary and other odd characters.
/*
 * aline:       the line to examine (UTF-8).
 * warnings:    first-pass decisions; the fslash and ast members gate the
 *              forward-slash and asterisk queries.
 * isemptyline: TRUE suppresses the asterisk query.
 *
 * Each kind of query is guarded by its own flag (eNon_A, eTab, ...) so
 * that it is not repeated for later occurrences on the same line.
 */
993 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
994 gboolean isemptyline)
996 /* Don't repeat multiple warnings on one line. */
997 gboolean eNon_A=FALSE,eTab=FALSE,eTilde=FALSE;
998 gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
1001 for (s=aline;*s;s=g_utf8_next_char(s))
1003 c=g_utf8_get_char(s);
/* Control characters other than tab and newline, or anything above 127. */
1004 if (!eNon_A && (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
1006 if (pswit[ECHO_SWITCH])
1007 g_print("\n%s\n",aline);
1008 if (!pswit[OVERVIEW_SWITCH])
/* 128-159 and everything above 255 are reported as non-ISO-8859. */
1009 if (c>127 && c<160 || c>255)
1010 g_print(" Line %ld column %ld - "
1011 "Non-ISO-8859 character %u\n",
1012 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1014 g_print(" Line %ld column %ld - "
1015 "Non-ASCII character %u\n",
1016 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1021 if (!eTab && c==CHAR_TAB)
1023 if (pswit[ECHO_SWITCH])
1024 g_print("\n%s\n",aline);
1025 if (!pswit[OVERVIEW_SWITCH])
1026 g_print(" Line %ld column %ld - Tab character?\n",
1027 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1032 if (!eTilde && c==CHAR_TILDE)
1035 * Often used by OCR software to indicate an
1036 * unrecognizable character.
1038 if (pswit[ECHO_SWITCH])
1039 g_print("\n%s\n",aline);
1040 if (!pswit[OVERVIEW_SWITCH])
1041 g_print(" Line %ld column %ld - Tilde character?\n",
1042 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1047 if (!eCarat && c==CHAR_CARAT)
1049 if (pswit[ECHO_SWITCH])
1050 g_print("\n%s\n",aline);
1051 if (!pswit[OVERVIEW_SWITCH])
1052 g_print(" Line %ld column %ld - Carat character?\n",
1053 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Forward slashes are queried only when the first pass left warnings->fslash set. */
1058 if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
1060 if (pswit[ECHO_SWITCH])
1061 g_print("\n%s\n",aline);
1062 if (!pswit[OVERVIEW_SWITCH])
1063 g_print(" Line %ld column %ld - Forward slash?\n",
1064 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1070 * Report asterisks only in paranoid mode,
1071 * since they're often deliberate.
1073 if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
1076 if (pswit[ECHO_SWITCH])
1077 g_print("\n%s\n",aline);
1078 if (!pswit[OVERVIEW_SWITCH])
1079 g_print(" Line %ld column %ld - Asterisk?\n",
1080 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1089 * check_for_long_line:
1091 * Check for line too long.
/*
 * aline: the line to examine (UTF-8).
 *
 * A line is long when it has more than LONGEST_PG_LINE characters
 * (characters, not bytes).
 */
1093 void check_for_long_line(const char *aline)
1095 if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
1097 if (pswit[ECHO_SWITCH])
1098 g_print("\n%s\n",aline);
1099 if (!pswit[OVERVIEW_SWITCH])
/* The column reported is the end of the line, so it equals the length. */
1100 g_print(" Line %ld column %ld - Long line %ld\n",
1101 linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
1108 * check_for_short_line:
1110 * Check for line too short.
1112 * This one is a bit trickier to implement: we don't want to
1113 * flag the last line of a paragraph for being short, so we
1114 * have to wait until we know that our current line is a
1115 * "normal" line, then report the _previous_ line if it was too
1116 * short. We also don't want to report indented lines like
1117 * chapter heads or formatted quotations. We therefore keep
1118 * last->len as the length of the last line examined, and
1119 * last->blen as the length of the last but one, and try to
1120 * suppress unnecessary warnings by checking that both were of
1121 * "normal" length. We keep the first character of the last
1122 * line in last->start, and if it was a space, we assume that
1123 * the formatting is deliberate. I can't figure out a way to
1124 * distinguish something like a quoted verse left-aligned or
1125 * the header or footer of a letter from a paragraph of short
1126 * lines - maybe if I examined the whole paragraph, and if the
1127 * para has less than, say, 8 lines and if all lines are short,
1128 * then just assume it's OK? Need to look at some texts to see
1129 * how often a formula like this would get the right result.
/*
 * aline: the current line (UTF-8).
 * last:  properties of earlier lines; len, blen and start are read here.
 *
 * The query is about the previous line (global prevline, line number
 * linecnt-1), not about aline.
 */
1131 void check_for_short_line(const char *aline,const struct line_properties *last)
/*
 * Report when the current line is not blank, the previous line is shorter
 * than SHORTEST_PG_LINE, the one before it was longer, and the line did
 * not start with a space.
 */
1133 if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
1134 last->len<SHORTEST_PG_LINE && last->blen>1 &&
1135 last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
1137 if (pswit[ECHO_SWITCH])
1138 g_print("\n%s\n",prevline);
1139 if (!pswit[OVERVIEW_SWITCH])
1140 g_print(" Line %ld column %ld - Short line %ld?\n",
1141 linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
1148 * check_for_starting_punctuation:
1150 * Look for punctuation other than full ellipses at start of line.
/*
 * aline: the line to examine (UTF-8).
 *
 * Queries a non-empty line whose first character is one of . ? ! , ; :
 * unless the line starts with the spaced ellipsis ". . .".
 */
1152 void check_for_starting_punctuation(const char *aline)
1154 if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
1155 !g_str_has_prefix(aline,". . ."))
1157 if (pswit[ECHO_SWITCH])
1158 g_print("\n%s\n",aline);
1159 if (!pswit[OVERVIEW_SWITCH])
/* The column is always 1 because only the first character is tested. */
1160 g_print(" Line %ld column 1 - Begins with punctuation?\n",
1168 * check_for_spaced_emdash:
1170 * Check for spaced em-dashes.
1172 * We must check _all_ occurrences of "--" on the line
1173 * hence the loop - even if the first double-dash is OK
1174 * there may be another that's wrong later on.
/*
 * aline: the line to examine (UTF-8).
 *
 * Visits every "--" on the line and queries it when there is a space
 * immediately before it or immediately after it.
 */
1176 void check_for_spaced_emdash(const char *aline)
1178 const char *s,*t,*next;
/* t is the current "--"; the next search resumes just past it. */
1179 for (s=aline;t=strstr(s,"--");s=next)
1181 next=g_utf8_next_char(g_utf8_next_char(t));
/* t>aline ensures there is a character before the dash to inspect. */
1182 if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
1183 g_utf8_get_char(next)==CHAR_SPACE)
1185 if (pswit[ECHO_SWITCH])
1186 g_print("\n%s\n",aline);
1187 if (!pswit[OVERVIEW_SWITCH])
1188 g_print(" Line %ld column %ld - Spaced em-dash?\n",
1189 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1197 * check_for_spaced_dash:
1199 * Check for spaced dashes.
1201 void check_for_spaced_dash(const char *aline)
1204 if ((s=strstr(aline," -")))
1206 if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
1208 if (pswit[ECHO_SWITCH])
1209 g_print("\n%s\n",aline);
1210 if (!pswit[OVERVIEW_SWITCH])
1211 g_print(" Line %ld column %ld - Spaced dash?\n",
1212 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1217 else if ((s=strstr(aline,"- ")))
1219 if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
1221 if (pswit[ECHO_SWITCH])
1222 g_print("\n%s\n",aline);
1223 if (!pswit[OVERVIEW_SWITCH])
1224 g_print(" Line %ld column %ld - Spaced dash?\n",
1225 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1233 * check_for_unmarked_paragraphs:
1235 * Check for unmarked paragraphs indicated by separate speakers.
1237 * May well be false positive:
1238 * "Bravo!" "Wonderful!" called the crowd.
1239 * but useful all the same.
1241 void check_for_unmarked_paragraphs(const char *aline)
1244 s=strstr(aline,"\" \"");
1246 s=strstr(aline,"\" \"");
1249 if (pswit[ECHO_SWITCH])
1250 g_print("\n%s\n",aline);
1251 if (!pswit[OVERVIEW_SWITCH])
1252 g_print(" Line %ld column %ld - "
1253 "Query missing paragraph break?\n",
1254 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1261 * check_for_jeebies:
1263 * Check for "to he" and other easy h/b errors.
1265 * This is a very inadequate effort on the h/b problem,
1266 * but the phrase "to he" is always an error, whereas "to
1267 * be" is quite common.
1268 * Similarly, '"Quiet!", be said.' is a non-be error
1269 * "to he" is _not_ always an error!:
1270 * "Where they went to he couldn't say."
1271 * Another false positive:
1272 * What would "Cinderella" be without the . . .
1273 * and another: "If he wants to he can see for himself."
1275 void check_for_jeebies(const char *aline)
1278 s=strstr(aline," be could ");
1280 s=strstr(aline," be would ");
1282 s=strstr(aline," was be ");
1284 s=strstr(aline," be is ");
1286 s=strstr(aline," is be ");
1288 s=strstr(aline,"\", be ");
1290 s=strstr(aline,"\" be ");
1292 s=strstr(aline,"\" be ");
1294 s=strstr(aline," to he ");
1297 if (pswit[ECHO_SWITCH])
1298 g_print("\n%s\n",aline);
1299 if (!pswit[OVERVIEW_SWITCH])
1300 g_print(" Line %ld column %ld - Query he/be error?\n",
1301 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1305 s=strstr(aline," the had ");
1307 s=strstr(aline," a had ");
1309 s=strstr(aline," they bad ");
1311 s=strstr(aline," she bad ");
1313 s=strstr(aline," he bad ");
1315 s=strstr(aline," you bad ");
1317 s=strstr(aline," i bad ");
1320 if (pswit[ECHO_SWITCH])
1321 g_print("\n%s\n",aline);
1322 if (!pswit[OVERVIEW_SWITCH])
1323 g_print(" Line %ld column %ld - Query had/bad error?\n",
1324 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1328 s=strstr(aline,"; hut ");
1330 s=strstr(aline,", hut ");
1333 if (pswit[ECHO_SWITCH])
1334 g_print("\n%s\n",aline);
1335 if (!pswit[OVERVIEW_SWITCH])
1336 g_print(" Line %ld column %ld - Query hut/but error?\n",
1337 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1344 * check_for_mta_from:
1346 * Special case - angled bracket in front of "From" placed there by an
1347 * MTA when sending an e-mail.
1349 void check_for_mta_from(const char *aline)
1352 s=strstr(aline,">From");
1355 if (pswit[ECHO_SWITCH])
1356 g_print("\n%s\n",aline);
1357 if (!pswit[OVERVIEW_SWITCH])
1358 g_print(" Line %ld column %ld - "
1359 "Query angled bracket with From\n",
1360 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1367 * check_for_orphan_character:
1369 * Check for a single character line -
1370 * often an overflow from bad wrapping.
1372 void check_for_orphan_character(const char *aline)
1375 c=g_utf8_get_char(aline);
1376 if (c && !*g_utf8_next_char(aline))
1378 if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
1379 ; /* Nothing - ignore numerals alone on a line. */
1382 if (pswit[ECHO_SWITCH])
1383 g_print("\n%s\n",aline);
1384 if (!pswit[OVERVIEW_SWITCH])
1385 g_print(" Line %ld column 1 - Query single character line\n",
1394 * check_for_pling_scanno:
1396 * Check for I" - often should be !
1398 void check_for_pling_scanno(const char *aline)
1401 s=strstr(aline," I\"");
1404 if (pswit[ECHO_SWITCH])
1405 g_print("\n%s\n",aline);
1406 if (!pswit[OVERVIEW_SWITCH])
1407 g_print(" Line %ld column %ld - Query I=exclamation mark?\n",
1408 linecnt,g_utf8_pointer_to_offset(aline,s));
1415 * check_for_extra_period:
1417 * Check for period without a capital letter. Cut-down from gutspell.
1418 * Only works when it happens on a single line.
/*
 * Query a period that is followed by a lower-case letter ("Extra period?").
 * The scan runs only when pswit[PARANOID_SWITCH] is set. For each ". "
 * found in aline, the word in front of the period is copied into testword;
 * testword is then compared with abbrev[], checked for a leading digit,
 * for being a single character and with isroman(), and its characters
 * are checked for a base vowel after canonical decomposition. Words that
 * are queried are recorded in the qperiod tree.
 * NOTE(review): this listing appears to omit short lines (braces and
 * brief statements), so the control flow joining these steps is not
 * fully visible here.
 */
1420 void check_for_extra_period(const char *aline,const struct warnings *warnings)
1422 const char *s,*t,*s1;
1427 gunichar *decomposition;
1428 if (pswit[PARANOID_SWITCH])
1430 for (t=aline;t=strstr(t,". ");)
1434 t=g_utf8_next_char(t);
1435 /* start of line punctuation is handled elsewhere */
1438 if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
1440 t=g_utf8_next_char(t);
1443 if (warnings->isDutch)
1445 /* For Frank & Jeroen -- 's Middags case */
1446 gunichar c2,c3,c4,c5;
1447 c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
1448 c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
1449 c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
1450 c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
1451 if (c2==CHAR_SQUOTE && g_unichar_islower(c3) &&
1452 c4==CHAR_SPACE && g_unichar_isupper(c5))
1454 t=g_utf8_next_char(t);
/* Skip forward from the period to the next letter or digit. */
/* NOTE(review): isdigit() is handed a gunichar here, whereas
 * g_unichar_isdigit() is used a few lines below - confirm intent. */
1458 s1=g_utf8_next_char(g_utf8_next_char(t));
1459 while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
1460 !isdigit(g_utf8_get_char(s1)))
1461 s1=g_utf8_next_char(s1);
1462 if (g_unichar_islower(g_utf8_get_char(s1)))
1464 /* we have something to investigate */
1466 /* so let's go back and find out */
/* Walk back over letters, digits and apostrophes set between letters. */
1467 for (s1=g_utf8_prev_char(t);s1>=aline &&
1468 (g_unichar_isalpha(g_utf8_get_char(s1)) ||
1469 g_unichar_isdigit(g_utf8_get_char(s1)) ||
1470 g_utf8_get_char(s1)==CHAR_SQUOTE &&
1471 g_unichar_isalpha(g_utf8_get_char(g_utf8_next_char(s1))) &&
1472 g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s1))));
1473 s1=g_utf8_prev_char(s1))
1475 s1=g_utf8_next_char(s1);
1478 testword=g_strndup(s1,s-s1);
1480 testword=g_strdup(s1);
1481 for (i=0;*abbrev[i];i++)
1482 if (!strcmp(testword,abbrev[i]))
1484 if (g_unichar_isdigit(g_utf8_get_char(testword)))
1486 if (!*g_utf8_next_char(testword))
1488 if (isroman(testword))
1493 for (s=testword;*s;s=g_utf8_next_char(s))
1495 decomposition=g_unicode_canonical_decomposition(
1496 g_utf8_get_char(s),&len);
1497 if (g_utf8_strchr("aeiou",-1,decomposition[0]))
1499 g_free(decomposition);
/* Unless verbose, a word already present in qperiod is not queried again. */
1503 (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
1505 g_tree_insert(qperiod,g_strdup(testword),
1506 GINT_TO_POINTER(1));
1507 if (pswit[ECHO_SWITCH])
1508 g_print("\n%s\n",aline);
1509 if (!pswit[OVERVIEW_SWITCH])
1510 g_print(" Line %ld column %ld - Extra period?\n",
1511 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1517 t=g_utf8_next_char(t);
1523 * check_for_following_punctuation:
1525 * Check for words usually not followed by punctuation.
/*
 * Query words that are not usually followed by punctuation. Active only
 * when pswit[TYPO_SWITCH] is set. A lower-cased word (inword) is
 * compared with nocomma[] and with noperiod[]; a nocomma match whose
 * character at s is ',', ';' or ':', or a noperiod match whose character
 * at s is '.' or '!', is reported as "Query punctuation after %s?".
 * NOTE(review): this listing appears to omit short lines; how each word
 * and the position s are obtained is not visible here.
 */
1527 void check_for_following_punctuation(const char *aline)
1530 const char *s,*wordstart;
1533 if (pswit[TYPO_SWITCH])
1544 inword=g_utf8_strdown(t,-1);
1546 for (i=0;*nocomma[i];i++)
1547 if (!strcmp(inword,nocomma[i]))
1549 c=g_utf8_get_char(s);
1550 if (c==',' || c==';' || c==':')
1552 if (pswit[ECHO_SWITCH])
1553 g_print("\n%s\n",aline);
1554 if (!pswit[OVERVIEW_SWITCH])
1555 g_print(" Line %ld column %ld - "
1556 "Query punctuation after %s?\n",
1557 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
1563 for (i=0;*noperiod[i];i++)
1564 if (!strcmp(inword,noperiod[i]))
1566 c=g_utf8_get_char(s);
1567 if (c=='.' || c=='!')
1569 if (pswit[ECHO_SWITCH])
1570 g_print("\n%s\n",aline);
1571 if (!pswit[OVERVIEW_SWITCH])
1572 g_print(" Line %ld column %ld - "
1573 "Query punctuation after %s?\n",
1574 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
1588 * Check for commonly mistyped words,
1589 * and digits like 0 for O in a word.
/*
 * Check the words of aline for commonly mistyped words and for digits
 * inside words. Words are fetched with getaword(). The visible steps are:
 *   - words for which mixdigit() is true: "Query digit in %s";
 *   - with TYPO_SWITCH or USERTYPO_SWITCH: a per-character pass over the
 *     word (upper case in mid-word, with exceptions for Mc/Mac and a
 *     preceding apostrophe), then testword = case-folded inword;
 *   - with TYPO_SWITCH: nostart[]/noend[] prefixes and suffixes, unlikely
 *     letter groups, a vowel/consonant test, the okword[] exclusions,
 *     isroman(), the typo[] list and single-letter words; duplicates are
 *     tracked through the qword tree and reported as "Query word %s";
 *   - words found in the usertypo tree: "Query possible scanno %s";
 *   - with PARANOID_SWITCH and warnings->digit: a standalone "0" or "1".
 * NOTE(review): this listing appears to omit short lines (braces and
 * brief statements such as flag updates), so the exact control flow is
 * not fully visible here.
 */
1591 void check_for_typos(const char *aline,struct warnings *warnings)
1593 const char *s,*t,*nt,*wordstart;
1595 gunichar *decomposition;
1597 int i,vowel,consonant,*dupcnt;
1598 gboolean isdup,istypo,alower;
1601 gsize decomposition_len;
1605 inword=getaword(&s);
1609 continue; /* don't bother with empty lines */
1611 if (mixdigit(inword))
1613 if (pswit[ECHO_SWITCH])
1614 g_print("\n%s\n",aline);
1615 if (!pswit[OVERVIEW_SWITCH])
1616 g_print(" Line %ld column %ld - Query digit in %s\n",
1617 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
1622 * Put the word through a series of tests for likely typos and OCR
1625 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
1629 for (t=inword;*t;t=g_utf8_next_char(t))
1631 c=g_utf8_get_char(t);
1632 nt=g_utf8_next_char(t);
1633 /* lowercase for testing */
1634 if (g_unichar_islower(c))
1636 if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
1639 * We have an uppercase mid-word. However, there are
1641 * Mac and Mc like McGill
1642 * French contractions like l'Abbe
1644 offset=g_utf8_pointer_to_offset(inword,t);
1645 if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
1646 offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
1647 g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
1649 g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SQUOTE)
1655 testword=g_utf8_casefold(inword,-1);
1657 if (pswit[TYPO_SWITCH])
1660 * Check for certain unlikely two-letter combinations at word
1663 len=g_utf8_strlen(testword,-1);
1666 for (i=0;*nostart[i];i++)
1667 if (g_str_has_prefix(testword,nostart[i]))
1669 for (i=0;*noend[i];i++)
1670 if (g_str_has_suffix(testword,noend[i]))
1673 /* ght is common, gbt never. Like that. */
1674 if (strstr(testword,"cb"))
1676 if (strstr(testword,"gbt"))
1678 if (strstr(testword,"pbt"))
1680 if (strstr(testword,"tbs"))
1682 if (strstr(testword,"mrn"))
1684 if (strstr(testword,"ahle"))
1686 if (strstr(testword,"ihle"))
1689 * "TBE" does happen - like HEARTBEAT - but uncommon.
1690 * Also "TBI" - frostbite, outbid - but uncommon.
1691 * Similarly "ii" like Hawaii, or Pompeii, and in Roman
1692 * numerals, but "ii" is a common scanno.
1694 if (strstr(testword,"tbi"))
1696 if (strstr(testword,"tbe"))
1698 if (strstr(testword,"ii"))
1701 * Check for no vowels or no consonants.
1702 * If none, flag a typo.
1704 if (!istypo && len>1)
1707 for (t=testword;*t;t=g_utf8_next_char(t))
1709 c=g_utf8_get_char(t);
1711 g_unicode_canonical_decomposition(c,&decomposition_len);
1712 if (c=='y' || g_unichar_isdigit(c))
1714 /* Yah, this is loose. */
1718 else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
1722 g_free(decomposition);
1724 if (!vowel || !consonant)
1728 * Now exclude the word from being reported if it's in
1731 for (i=0;*okword[i];i++)
1732 if (!strcmp(testword,okword[i]))
1735 * What looks like a typo may be a Roman numeral.
1738 if (istypo && isroman(testword))
1740 /* Check the manual list of typos. */
1742 for (i=0;*typo[i];i++)
1743 if (!strcmp(testword,typo[i]))
1746 * Check lowercase s, l, i and m - special cases.
1747 * "j" - often a semi-colon gone wrong.
1748 * "d" for a missing apostrophe - he d
1751 if (!istypo && len==1 &&
1752 g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
/* qword maps each queried word to a heap-allocated int (dupcnt). */
1756 dupcnt=g_tree_lookup(qword,testword);
1760 isdup=!pswit[VERBOSE_SWITCH];
1764 dupcnt=g_new0(int,1);
1765 g_tree_insert(qword,g_strdup(testword),dupcnt);
1770 if (pswit[ECHO_SWITCH])
1771 g_print("\n%s\n",aline);
1772 if (!pswit[OVERVIEW_SWITCH])
1774 g_print(" Line %ld column %ld - Query word %s",
1775 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
1777 if (!pswit[VERBOSE_SWITCH])
1778 g_print(" - not reporting duplicates");
1786 /* check the user's list of typos */
1787 if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
/* NOTE(review): this report and the standalone 0/1 report below add 2
 * to the word offset, while the two reports above add 1 - confirm. */
1789 if (pswit[ECHO_SWITCH])
1790 g_print("\n%s\n",aline);
1791 if (!pswit[OVERVIEW_SWITCH])
1792 g_print(" Line %ld column %ld - Query possible scanno %s\n",
1793 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
1795 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
1797 if (pswit[PARANOID_SWITCH] && warnings->digit)
1799 /* In paranoid mode, query all 0 and 1 standing alone. */
1800 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
1802 if (pswit[ECHO_SWITCH])
1803 g_print("\n%s\n",aline);
1804 if (!pswit[OVERVIEW_SWITCH])
1805 g_print(" Line %ld column %ld - Query standalone %s\n",
1806 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
1817 * check_for_misspaced_punctuation:
1819 * Look for added or missing spaces around punctuation and quotes.
1820 * If there is a punctuation character like ! with no space on
1821 * either side, suspect a missing!space. If there are spaces on
1822 * both sides , assume a typo. If we see a double quote with no
1823 * space or punctuation on either side of it, assume unspaced
1824 * quotes "like"this.
/*
 * Look for added or missing spaces around punctuation and quotes in
 * aline. Several passes walk the line with a previous/current/next
 * character window (pc, c, nc):
 *   1. punctuation from ".?!,;:_": "Missing space?" when letters adjoin
 *      it, and "Spaced punctuation?" when a space precedes it and a
 *      space or the end of line follows (not reported when isemptyline
 *      is set or the ellipsis flag is raised);
 *   2. punctuation from "?!,;:" preceded by a space and not followed by
 *      one: "Spaced punctuation?";
 *   3. a period preceded by a space and followed by a letter:
 *      "Spaced punctuation?";
 *   4. "Unspaced quotes?" based on the characters on either side;
 *   5. double-quote parity (parities->dquote toggled per quote):
 *      "Wrongspaced quotes?";
 *   6. a double quote in column 1 followed by closing punctuation or a
 *      space: "Wrongspaced quotes?";
 *   7. with SQUOTE_SWITCH, single-quote parity (parities->squote):
 *      "Wrongspaced singlequotes?".
 * NOTE(review): this listing appears to omit short lines (braces, the
 * window updates and flag assignments), so the exact conditions guarding
 * some reports are not fully visible here.
 */
1826 void check_for_misspaced_punctuation(const char *aline,
1827 struct parities *parities,gboolean isemptyline)
1829 gboolean isacro,isellipsis;
1831 gunichar c,nc,pc,n2c;
1832 c=g_utf8_get_char(aline);
1833 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1834 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1838 nc=g_utf8_get_char(g_utf8_next_char(s));
1839 /* For each character in the line after the first. */
1840 if (g_utf8_strchr(".?!,;:_",-1,c)) /* if it's punctuation */
1842 /* we need to suppress warnings for acronyms like M.D. */
1844 /* we need to suppress warnings for ellipsis . . . */
1847 * If there are letters on both sides of it or
1848 * if it's strict punctuation followed by an alpha.
1850 if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
1851 g_utf8_strchr("?!,;:",-1,c)))
1855 if (g_utf8_pointer_to_offset(aline,s)>2 &&
1856 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
1858 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
1864 if (pswit[ECHO_SWITCH])
1865 g_print("\n%s\n",aline);
1866 if (!pswit[OVERVIEW_SWITCH])
1867 g_print(" Line %ld column %ld - Missing space?\n",
1868 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1873 if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
1876 * If there are spaces on both sides,
1877 * or space before and end of line.
1881 if (g_utf8_pointer_to_offset(aline,s)>2 &&
1882 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
1884 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
1888 if (!isemptyline && !isellipsis)
1890 if (pswit[ECHO_SWITCH])
1891 g_print("\n%s\n",aline);
1892 if (!pswit[OVERVIEW_SWITCH])
1893 g_print(" Line %ld column %ld - "
1894 "Spaced punctuation?\n",linecnt,
1895 g_utf8_pointer_to_offset(aline,s)+1);
1902 /* Split out the characters that CANNOT be preceded by space. */
1903 c=g_utf8_get_char(aline);
1904 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1905 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1909 nc=g_utf8_get_char(g_utf8_next_char(s));
1910 /* for each character in the line after the first */
1911 if (g_utf8_strchr("?!,;:",-1,c))
1913 /* if it's punctuation that _cannot_ have a space before it */
1914 if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
1917 * If nc DOES == space,
1918 * it was already reported just above.
1920 if (pswit[ECHO_SWITCH])
1921 g_print("\n%s\n",aline);
1922 if (!pswit[OVERVIEW_SWITCH])
1923 g_print(" Line %ld column %ld - Spaced punctuation?\n",
1924 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1931 * Special case " .X" where X is any alpha.
1932 * This plugs a hole in the acronym code above.
1933 * Inelegant, but maintainable.
1935 c=g_utf8_get_char(aline);
1936 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1937 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1941 nc=g_utf8_get_char(g_utf8_next_char(s));
1942 /* for each character in the line after the first */
1945 /* if it's a period */
1946 if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
1949 * If the period follows a space and
1950 * is followed by a letter.
1952 if (pswit[ECHO_SWITCH])
1953 g_print("\n%s\n",aline);
1954 if (!pswit[OVERVIEW_SWITCH])
1955 g_print(" Line %ld column %ld - Spaced punctuation?\n",
1956 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1962 c=g_utf8_get_char(aline);
1963 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1964 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1968 nc=g_utf8_get_char(g_utf8_next_char(s));
1969 /* for each character in the line after the first */
1972 if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
1973 !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
1974 !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
1976 if (pswit[ECHO_SWITCH])
1977 g_print("\n%s\n",aline);
1978 if (!pswit[OVERVIEW_SWITCH])
1979 g_print(" Line %ld column %ld - Unspaced quotes?\n",
1980 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1986 /* Check parity of quotes. */
1987 nc=g_utf8_get_char(aline);
1988 for (s=aline;*s;s=g_utf8_next_char(s))
1991 nc=g_utf8_get_char(g_utf8_next_char(s));
1994 parities->dquote=!parities->dquote;
1995 if (!parities->dquote)
1998 if (!g_utf8_strchr("_-.'`/,;:!?)]} ",-1,nc))
2000 if (pswit[ECHO_SWITCH])
2001 g_print("\n%s\n",aline);
2002 if (!pswit[OVERVIEW_SWITCH])
2003 g_print(" Line %ld column %ld - "
2004 "Wrongspaced quotes?\n",
2005 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* NOTE(review): isdigit() is handed a gunichar (nc) here and in the
 * single-quote pass below; g_unichar_isalpha() is used alongside it. */
2013 if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
2014 !g_utf8_strchr("_-/.'`([{$",-1,nc) || !nc)
2016 if (pswit[ECHO_SWITCH])
2017 g_print("\n%s\n",aline);
2018 if (!pswit[OVERVIEW_SWITCH])
2019 g_print(" Line %ld column %ld - "
2020 "Wrongspaced quotes?\n",
2021 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2028 if (g_utf8_get_char(aline)==CHAR_DQUOTE)
2030 if (g_utf8_strchr(",;:!?)]} ",-1,
2031 g_utf8_get_char(g_utf8_next_char(aline))))
2033 if (pswit[ECHO_SWITCH])
2034 g_print("\n%s\n",aline);
2035 if (!pswit[OVERVIEW_SWITCH])
2036 g_print(" Line %ld column 1 - Wrongspaced quotes?\n",
2042 if (pswit[SQUOTE_SWITCH])
2044 nc=g_utf8_get_char(aline);
2045 for (s=aline;*s;s=g_utf8_next_char(s))
2048 nc=g_utf8_get_char(g_utf8_next_char(s));
2049 if ((c==CHAR_SQUOTE || c==CHAR_OPEN_SQUOTE) && (s==aline ||
2051 !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
2052 !g_unichar_isalpha(nc)))
2054 parities->squote=!parities->squote;
2055 if (!parities->squote)
2058 if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
2060 if (pswit[ECHO_SWITCH])
2061 g_print("\n%s\n",aline);
2062 if (!pswit[OVERVIEW_SWITCH])
2063 g_print(" Line %ld column %ld - "
2064 "Wrongspaced singlequotes?\n",
2065 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2073 if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
2074 !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
2076 if (pswit[ECHO_SWITCH])
2077 g_print("\n%s\n",aline);
2078 if (!pswit[OVERVIEW_SWITCH])
2079 g_print(" Line %ld column %ld - "
2080 "Wrongspaced singlequotes?\n",
2081 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2092 * check_for_double_punctuation:
2094 * Look for double punctuation like ,. or ,,
2095 * Thanks to DW for the suggestion!
2096 * In books with references, ".," and ".;" are common
2097 * e.g. "etc., etc.," and vol. 1.; vol 3.;
2098 * OTOH, from my initial tests, there are also fairly
2099 * common errors. What to do? Make these cases paranoid?
2100 * ".," is the most common, so warnings->dotcomma is used
2101 * to suppress detailed reporting if it occurs often.
/*
 * Look for two adjacent punctuation characters from ".?!,;:" in aline
 * and report them as "Double punctuation?". The visible exemption test
 * covers a doubled '.', '?' or '!', the pair ".," while
 * warnings->dotcomma is clear, and - when warnings->isFrench is set -
 * a three-dot ellipsis directly before or after one of , ; : ! ?
 * NOTE(review): the French ellipsis test appears twice in succession
 * with identical operands; the statements between and after the two
 * tests are not visible in this listing, which appears to omit short
 * lines.
 */
2103 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
2107 nc=g_utf8_get_char(aline);
2108 for (s=aline;*s;s=g_utf8_next_char(s))
2111 nc=g_utf8_get_char(g_utf8_next_char(s));
2112 /* for each punctuation character in the line */
2113 if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
2114 g_utf8_strchr(".?!,;:",-1,nc))
2116 /* followed by punctuation, it's a query, unless . . . */
2117 if (c==nc && (c=='.' || c=='?' || c=='!') ||
2118 !warnings->dotcomma && c=='.' && nc==',' ||
2119 warnings->isFrench && g_str_has_prefix(s,",...") ||
2120 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2121 warnings->isFrench && g_str_has_prefix(s,";...") ||
2122 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2123 warnings->isFrench && g_str_has_prefix(s,":...") ||
2124 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2125 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2126 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2127 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2128 warnings->isFrench && g_str_has_prefix(s,"...?"))
2130 if (warnings->isFrench && g_str_has_prefix(s,",...") ||
2131 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2132 warnings->isFrench && g_str_has_prefix(s,";...") ||
2133 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2134 warnings->isFrench && g_str_has_prefix(s,":...") ||
2135 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2136 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2137 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2138 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2139 warnings->isFrench && g_str_has_prefix(s,"...?"))
2142 nc=g_utf8_get_char(g_utf8_next_char(s));
2144 ; /* do nothing for .. !! and ?? which can be legit */
2148 if (pswit[ECHO_SWITCH])
2149 g_print("\n%s\n",aline);
2150 if (!pswit[OVERVIEW_SWITCH])
2151 g_print(" Line %ld column %ld - Double punctuation?\n",
2152 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2161 * check_for_spaced_quotes:
2163 void check_for_spaced_quotes(const char *aline)
2167 while ((t=strstr(s," \" ")))
2169 if (pswit[ECHO_SWITCH])
2170 g_print("\n%s\n",aline);
2171 if (!pswit[OVERVIEW_SWITCH])
2172 g_print(" Line %ld column %ld - Spaced doublequote?\n",
2173 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2176 s=g_utf8_next_char(g_utf8_next_char(t));
2179 while ((t=strstr(s," ' ")))
2181 if (pswit[ECHO_SWITCH])
2182 g_print("\n%s\n",aline);
2183 if (!pswit[OVERVIEW_SWITCH])
2184 g_print(" Line %ld column %ld - Spaced singlequote?\n",
2185 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2188 s=g_utf8_next_char(g_utf8_next_char(t));
2191 while ((t=strstr(s," ` ")))
2193 if (pswit[ECHO_SWITCH])
2194 g_print("\n%s\n",aline);
2195 if (!pswit[OVERVIEW_SWITCH])
2196 g_print(" Line %ld column %ld - Spaced singlequote?\n",
2197 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2200 s=g_utf8_next_char(g_utf8_next_char(t));
2205 * check_for_miscased_genative:
2207 * Check special case of 'S instead of 's at end of word.
2209 void check_for_miscased_genative(const char *aline)
2215 c=g_utf8_get_char(aline);
2216 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2217 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2221 nc=g_utf8_get_char(g_utf8_next_char(s));
2222 if (c==CHAR_SQUOTE && nc=='S' && g_unichar_islower(pc))
2224 if (pswit[ECHO_SWITCH])
2225 g_print("\n%s\n",aline);
2226 if (!pswit[OVERVIEW_SWITCH])
2227 g_print(" Line %ld column %ld - Capital \"S\"?\n",
2228 linecnt,g_utf8_pointer_to_offset(aline,s)+2);
2236 * check_end_of_line:
2238 * Now check special cases - start and end of line -
2239 * for single and double quotes. Start is sometimes [sic]
2240 * but better to query it anyway.
2241 * While we're here, check for dash at end of line.
2243 void check_end_of_line(const char *aline,struct warnings *warnings)
2248 lbytes=strlen(aline);
2249 if (g_utf8_strlen(aline,lbytes)>1)
2251 s=g_utf8_prev_char(aline+lbytes);
2252 c1=g_utf8_get_char(s);
2253 c2=g_utf8_get_char(g_utf8_prev_char(s));
2254 if ((c1==CHAR_DQUOTE || c1==CHAR_SQUOTE || c1==CHAR_OPEN_SQUOTE) &&
2257 if (pswit[ECHO_SWITCH])
2258 g_print("\n%s\n",aline);
2259 if (!pswit[OVERVIEW_SWITCH])
2260 g_print(" Line %ld column %ld - Spaced quote?\n",linecnt,
2261 g_utf8_strlen(aline,lbytes));
2265 c1=g_utf8_get_char(aline);
2266 c2=g_utf8_get_char(g_utf8_next_char(aline));
2267 if ((c1==CHAR_SQUOTE || c1==CHAR_OPEN_SQUOTE) && c2==CHAR_SPACE)
2269 if (pswit[ECHO_SWITCH])
2270 g_print("\n%s\n",aline);
2271 if (!pswit[OVERVIEW_SWITCH])
2272 g_print(" Line %ld column 1 - Spaced quote?\n",linecnt);
2277 * Dash at end of line may well be legit - paranoid mode only
2278 * and don't report em-dash at line-end.
2280 if (pswit[PARANOID_SWITCH] && warnings->hyphen)
2282 for (s=g_utf8_prev_char(aline+lbytes);
2283 s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
2285 if (g_utf8_get_char(s)=='-' &&
2286 g_utf8_get_char(g_utf8_prev_char(s))!='-')
2288 if (pswit[ECHO_SWITCH])
2289 g_print("\n%s\n",aline);
2290 if (!pswit[OVERVIEW_SWITCH])
2291 g_print(" Line %ld column %ld - "
2292 "Hyphen at end of line?\n",
2293 linecnt,g_utf8_pointer_to_offset(aline,s));
2300 * check_for_unspaced_bracket:
2302 * Brackets are often unspaced, but shouldn't be surrounded by alpha.
2303 * If so, suspect a scanno like "a]most".
2305 void check_for_unspaced_bracket(const char *aline)
2309 c=g_utf8_get_char(aline);
2310 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2311 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2315 nc=g_utf8_get_char(g_utf8_next_char(s));
2318 /* for each bracket character in the line except 1st & last */
2319 if (g_utf8_strchr("{[()]}",-1,c) &&
2320 g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
2322 if (pswit[ECHO_SWITCH])
2323 g_print("\n%s\n",aline);
2324 if (!pswit[OVERVIEW_SWITCH])
2325 g_print(" Line %ld column %ld - Unspaced bracket?\n",
2326 linecnt,g_utf8_pointer_to_offset(aline,s));
2334 * check_for_unpunctuated_endquote:
2336 void check_for_unpunctuated_endquote(const char *aline)
2340 c=g_utf8_get_char(aline);
2341 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2342 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2346 nc=g_utf8_get_char(g_utf8_next_char(s));
2347 /* for each character in the line except 1st */
2348 if (c==CHAR_DQUOTE && isalpha(pc))
2350 if (pswit[ECHO_SWITCH])
2351 g_print("\n%s\n",aline);
2352 if (!pswit[OVERVIEW_SWITCH])
2353 g_print(" Line %ld column %ld - "
2354 "endquote missing punctuation?\n",
2355 linecnt,g_utf8_pointer_to_offset(aline,s));
2363 * check_for_html_tag:
2365 * Check for <HTML TAG>.
2367 * If there is a < in the line, followed at some point
2368 * by a > then we suspect HTML.
2370 void check_for_html_tag(const char *aline)
2372 const char *open,*close;
2374 open=strchr(aline,'<');
2377 close=strchr(g_utf8_next_char(open),'>');
2380 if (pswit[ECHO_SWITCH])
2381 g_print("\n%s\n",aline);
2382 if (!pswit[OVERVIEW_SWITCH])
2384 tag=g_strndup(open,close-open+1);
2385 g_print(" Line %ld column %ld - HTML Tag? %s \n",
2386 linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
2396 * check_for_html_entity:
2398 * Check for &symbol; HTML.
2400 * If there is a & in the line, followed at
2401 * some point by a ; then we suspect HTML.
2403 void check_for_html_entity(const char *aline)
2405 const char *s,*amp,*scolon;
2407 amp=strchr(aline,'&');
2410 scolon=strchr(amp,';');
2413 for (s=amp;s<scolon;s=g_utf8_next_char(s))
2414 if (g_utf8_get_char(s)==CHAR_SPACE)
2415 break; /* Don't report "Jones & Son;" */
2418 if (pswit[ECHO_SWITCH])
2419 g_print("\n%s\n",aline);
2420 if (!pswit[OVERVIEW_SWITCH])
2422 entity=g_strndup(amp,scolon-amp+1);
2423 g_print(" Line %ld column %d - HTML symbol? %s \n",
2424 linecnt,(int)(amp-aline)+1,entity);
2437 * If we are in a state of unbalanced quotes, and this line
2438 * doesn't begin with a quote, output the stored error message.
2439 * If the -P switch was used, print the warning even if the
2440 * new para starts with quotes.
/*
 * Output warnings stored in pending. Each of the fields dquote, squote,
 * rbrack, sbrack, cbrack and unders holds a stored message when set;
 * outside overview mode the message is printed (preceded by parastart
 * when ECHO_SWITCH is on), and each field is freed with g_free() and
 * reset to NULL. For dquote the visible condition requires that c is
 * not a double quote or that QPARA_SWITCH is set; squote has a similar
 * test on single quotes.
 * NOTE(review): this listing appears to omit short lines; how s and c
 * are derived from aline and the last operand of the squote condition
 * are not visible here.
 */
2442 void print_pending(const char *aline,const char *parastart,
2443 struct pending *pending)
2450 c=g_utf8_get_char(s);
2451 if (pending->dquote)
2453 if (c!=CHAR_DQUOTE || pswit[QPARA_SWITCH])
2455 if (!pswit[OVERVIEW_SWITCH])
2457 if (pswit[ECHO_SWITCH])
2458 g_print("\n%s\n",parastart);
2459 g_print("%s\n",pending->dquote);
2464 g_free(pending->dquote);
2465 pending->dquote=NULL;
2467 if (pending->squote)
2469 if (c!=CHAR_SQUOTE && c!=CHAR_OPEN_SQUOTE || pswit[QPARA_SWITCH] ||
2472 if (!pswit[OVERVIEW_SWITCH])
2474 if (pswit[ECHO_SWITCH])
2475 g_print("\n%s\n",parastart);
2476 g_print("%s\n",pending->squote);
2481 g_free(pending->squote);
2482 pending->squote=NULL;
2484 if (pending->rbrack)
2486 if (!pswit[OVERVIEW_SWITCH])
2488 if (pswit[ECHO_SWITCH])
2489 g_print("\n%s\n",parastart);
2490 g_print("%s\n",pending->rbrack);
2494 g_free(pending->rbrack);
2495 pending->rbrack=NULL;
2497 if (pending->sbrack)
2499 if (!pswit[OVERVIEW_SWITCH])
2501 if (pswit[ECHO_SWITCH])
2502 g_print("\n%s\n",parastart);
2503 g_print("%s\n",pending->sbrack);
2507 g_free(pending->sbrack);
2508 pending->sbrack=NULL;
2510 if (pending->cbrack)
2512 if (!pswit[OVERVIEW_SWITCH])
2514 if (pswit[ECHO_SWITCH])
2515 g_print("\n%s\n",parastart);
2516 g_print("%s\n",pending->cbrack);
2520 g_free(pending->cbrack);
2521 pending->cbrack=NULL;
2523 if (pending->unders)
2525 if (!pswit[OVERVIEW_SWITCH])
2527 if (pswit[ECHO_SWITCH])
2528 g_print("\n%s\n",parastart);
2529 g_print("%s\n",pending->unders);
2533 g_free(pending->unders);
2534 pending->unders=NULL;
2539 * check_for_mismatched_quotes:
2541 * At end of paragraph, check for mismatched quotes.
2543 * We don't want to report an error immediately, since it is a
2544 * common convention to omit the quotes at end of paragraph if
2545 * the next paragraph is a continuation of the same speaker.
2546 * Where this is the case, the next para should begin with a
2547 * quote, so we store the warning message and only display it
2548 * at the top of the next iteration if the new para doesn't
2549 * start with a quote.
2550 * The -p switch overrides this default, and warns of unclosed
2551 * quotes on _every_ paragraph, whether the next begins with a
/*
 * At end of paragraph, build warning messages for unbalanced items
 * recorded in counters: an odd counters->quot, differing open/close
 * single-quote counts (only with SQUOTE_SWITCH and a non-zero open
 * count), non-zero r_brack, s_brack or c_brack, and an odd c_unders.
 * Each message is created with g_strdup_printf() and names linecnt.
 * NOTE(review): this listing appears to omit short lines; the
 * assignments that receive these strings are not visible here -
 * presumably the matching fields of pending, confirm. The statement
 * guarded by the second single-quote test is not visible either.
 */
2554 void check_for_mismatched_quotes(const struct counters *counters,
2555 struct pending *pending)
2557 if (counters->quot%2)
2559 g_strdup_printf(" Line %ld - Mismatched quotes",linecnt);
2560 if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&
2561 counters->open_single_quote!=counters->close_single_quote)
2563 g_strdup_printf(" Line %ld - Mismatched singlequotes?",linecnt);
2564 if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&
2565 counters->open_single_quote!=counters->close_single_quote &&
2566 counters->open_single_quote!=counters->close_single_quote+1)
2568 * Flag it to be noted regardless of the
2569 * first char of the next para.
2572 if (counters->r_brack)
2574 g_strdup_printf(" Line %ld - Mismatched round brackets?",linecnt);
2575 if (counters->s_brack)
2577 g_strdup_printf(" Line %ld - Mismatched square brackets?",linecnt);
2578 if (counters->c_brack)
2580 g_strdup_printf(" Line %ld - Mismatched curly brackets?",linecnt);
2581 if (counters->c_unders%2)
2583 g_strdup_printf(" Line %ld - Mismatched underscores?",linecnt);
2587 * check_for_omitted_punctuation:
2589 * Check for omitted punctuation at end of paragraph by working back
2590 * through prevline. DW.
2591 * Need to check this only for "normal" paras.
2592 * So what is a "normal" para?
2593 * Not normal if one-liner (chapter headings, etc.)
2594 * Not normal if doesn't contain at least one locase letter
2595 * Not normal if starts with space
/*
 * Check for omitted punctuation at the end of a paragraph by working
 * back through prevline. The check applies only when prevline contains
 * a letter, last->blen is greater than 2, the paragraph started before
 * the previous line (start_para_line<linecnt-1) and prevline begins
 * with a character above CHAR_SPACE. Trailing quote characters are
 * stepped over first; a letter found while scanning backwards is
 * reported as "No punctuation at para end?", and the characters
 * "-.:!([{?}])" are tested for separately.
 * NOTE(review): this listing appears to omit short lines, so what
 * follows each test (e.g. leaving the loop) is not visible here.
 * NOTE(review): start_para_line is an int here while procfile declares
 * its start_para_line as long - confirm the intended type.
 */
2597 void check_for_omitted_punctuation(const char *prevline,
2598 struct line_properties *last,int start_para_line)
2600 gboolean letter_on_line=FALSE;
2602 for (s=prevline;*s;s=g_utf8_next_char(s))
2603 if (g_unichar_isalpha(g_utf8_get_char(s)))
2605 letter_on_line=TRUE;
2609 * This next "if" is a problem.
2610 * If we say "start_para_line <= linecnt - 1", that includes
2611 * one-line "paragraphs" like chapter heads. Lotsa false positives.
2612 * If we say "start_para_line < linecnt - 1" it doesn't, but then it
2613 * misses genuine one-line paragraphs.
2615 if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
2616 g_utf8_get_char(prevline)>CHAR_SPACE)
2618 for (s=g_utf8_prev_char(prevline+strlen(prevline));
2619 (g_utf8_get_char(s)==CHAR_DQUOTE ||
2620 g_utf8_get_char(s)==CHAR_SQUOTE) &&
2621 g_utf8_get_char(s)>CHAR_SPACE && s>prevline;
2622 s=g_utf8_prev_char(s))
2624 for (;s>prevline;s=g_utf8_prev_char(s))
2626 if (g_unichar_isalpha(g_utf8_get_char(s)))
2628 if (pswit[ECHO_SWITCH])
2629 g_print("\n%s\n",prevline);
2630 if (!pswit[OVERVIEW_SWITCH])
2631 g_print(" Line %ld column %ld - "
2632 "No punctuation at para end?\n",
2633 linecnt-1,g_utf8_strlen(prevline,-1));
2638 if (g_utf8_strchr("-.:!([{?}])",-1,g_utf8_get_char(s)))
/*
 * Traversal callback: procfile() hands this to g_tree_foreach() over the
 * qword tree. key is the queried word; a note naming it is printed.
 * NOTE(review): value presumably carries the occurrence count printed
 * with %d, and data is presumably unused - confirm.
 */
2644 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
2646 const char *word=key;
2649 g_print("\nNote: Queried word %s was duplicated %d times\n",
2654 void print_as_windows_1252(const char *string)
2656 gsize inbytes,outbytes;
2658 GIConv converter=(GIConv)-1;
2661 if (converter!=(GIConv)-1)
2662 g_iconv_close(converter);
2663 converter=(GIConv)-1;
2666 if (converter=(GIConv)-1)
2667 converter=g_iconv_open("WINDOWS-1252","UTF-8");
2668 if (converter!=(GIConv)-1)
2670 inbytes=outbytes=strlen(string);
2671 bp=buf=g_malloc(outbytes+1);
2672 g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
2678 fputs(string,stdout);
/* Write string to stdout unchanged. */
2681 void print_as_utf_8(const char *string)
2683 fputs(string,stdout);
/*
 * Check one file: read it with read_etext(), run first_pass() and
 * report_first_pass() over the whole text, then fetch lines one at a
 * time with flgets() and apply the per-line checks to each.
 *
 * filename: path of the text to check; also used in the messages printed.
 */
2691 void procfile(const char *filename)
2694 gchar *parastart=NULL; /* first line of current para */
2695 gchar *etext,*aline;
2698 struct first_pass_results *first_pass_results;
2699 struct warnings *warnings;
2700 struct counters counters={0};
2701 struct line_properties last={0};
2702 struct parities parities={0};
2703 struct pending pending={0};
2704 gboolean isemptyline;
2705 long start_para_line=0;
2706 gboolean isnewpara=FALSE,enddash=FALSE;
2707 last.start=CHAR_SPACE;
2708 linecnt=checked_linecnt=0;
2709 etext=read_etext(filename,&err);
/* A read failure is reported on stdout or stderr depending on STDOUT_SWITCH. */
2712 if (pswit[STDOUT_SWITCH])
2713 fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
2715 fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
2718 g_print("\n\nFile: %s\n\n",filename);
2719 first_pass_results=first_pass(etext);
2720 warnings=report_first_pass(first_pass_results);
/* Both trees are keyed by strings compared with strcmp; keys are freed with g_free. */
2721 qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
2722 qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
2724 * Here we go with the main pass. Hold onto yer hat!
2728 while ((aline=flgets(&etext_ptr,linecnt+1)))
2733 if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
2734 continue; // skip DP page separators completely
/* Lines before firstline, or after a non-zero footerline, get header treatment only. */
2735 if (linecnt<first_pass_results->firstline ||
2736 (first_pass_results->footerline>0 &&
2737 linecnt>first_pass_results->footerline))
2739 if (pswit[HEADER_SWITCH])
2741 if (g_str_has_prefix(aline,"Title:"))
2742 g_print(" %s\n",aline);
2743 if (g_str_has_prefix(aline,"Author:"))
2744 g_print(" %s\n",aline);
2745 if (g_str_has_prefix(aline,"Release Date:"))
2746 g_print(" %s\n",aline);
2747 if (g_str_has_prefix(aline,"Edition:"))
2748 g_print(" %s\n\n",aline);
2750 continue; /* skip through the header */
/* Emit anything left pending, then clear the pending record for this line. */
2753 print_pending(aline,parastart,&pending);
2754 memset(&pending,0,sizeof(pending));
2755 isemptyline=analyse_quotes(aline,&counters);
2756 if (isnewpara && !isemptyline)
2758 /* This line is the start of a new paragraph. */
2759 start_para_line=linecnt;
2760 /* Capture its first line in case we want to report it later. */
2762 parastart=g_strdup(aline);
2763 memset(&parities,0,sizeof(parities)); /* restart the quote count */
/* Advance s to the first letter or digit, or to the end of the line. */
2765 while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
2766 !g_unichar_isdigit(g_utf8_get_char(s)))
2767 s=g_utf8_next_char(s);
2768 if (g_unichar_islower(g_utf8_get_char(s)))
2770 /* and its first letter is lowercase */
2771 if (pswit[ECHO_SWITCH])
2772 g_print("\n%s\n",aline);
2773 if (!pswit[OVERVIEW_SWITCH])
2774 g_print(" Line %ld column %ld - "
2775 "Paragraph starts with lower-case\n",
2776 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2780 isnewpara=FALSE; /* Signal the end of new para processing. */
2782 /* Check for an em-dash broken at line end. */
2783 if (enddash && g_utf8_get_char(aline)=='-')
2785 if (pswit[ECHO_SWITCH])
2786 g_print("\n%s\n",aline);
2787 if (!pswit[OVERVIEW_SWITCH])
2788 g_print(" Line %ld column 1 - Broken em-dash?\n",linecnt);
/* Step back over trailing spaces to see whether this line ends in a dash. */
2793 for (s=g_utf8_prev_char(aline+strlen(aline));
2794 g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
2796 if (s>=aline && g_utf8_get_char(s)=='-')
/* The per-line checks; several run only when the first pass enabled the matching warning. */
2798 check_for_control_characters(aline);
2800 check_for_odd_characters(aline,warnings,isemptyline);
2801 if (warnings->longline)
2802 check_for_long_line(aline);
2803 if (warnings->shortline)
2804 check_for_short_line(aline,&last);
2806 last.len=g_utf8_strlen(aline,-1);
2807 last.start=g_utf8_get_char(aline);
2808 check_for_starting_punctuation(aline);
2811 check_for_spaced_emdash(aline);
2812 check_for_spaced_dash(aline);
2814 check_for_unmarked_paragraphs(aline);
2815 check_for_jeebies(aline);
2816 check_for_mta_from(aline);
2817 check_for_orphan_character(aline);
2818 check_for_pling_scanno(aline);
2819 check_for_extra_period(aline,warnings);
2820 check_for_following_punctuation(aline);
2821 check_for_typos(aline,warnings);
2822 check_for_misspaced_punctuation(aline,&parities,isemptyline);
2823 check_for_double_punctuation(aline,warnings);
2824 check_for_spaced_quotes(aline);
2825 check_for_miscased_genative(aline);
2826 check_end_of_line(aline,warnings);
2827 check_for_unspaced_bracket(aline);
2828 if (warnings->endquote)
2829 check_for_unpunctuated_endquote(aline);
2830 check_for_html_tag(aline);
2831 check_for_html_entity(aline);
/* NOTE(review): the paragraph-end checks below presumably run only on an empty line - confirm. */
2834 check_for_mismatched_quotes(&counters,&pending);
2835 memset(&counters,0,sizeof(counters));
2836 /* let the next iteration know that it's starting a new para */
2839 check_for_omitted_punctuation(prevline,&last,start_para_line);
2842 prevline=g_strdup(aline);
/* Duplicate-query notes are printed only outside overview and verbose modes. */
2852 if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])
2853 g_tree_foreach(qword,report_duplicate_queries,NULL);
2854 g_tree_unref(qword);
2855 g_tree_unref(qperiod);
/* Reset the print handler, then let print_as_windows_1252() release its converter. */
2856 g_set_print_handler(NULL);
2857 print_as_windows_1252(NULL);
2858 if (pswit[MARKUP_SWITCH])
2865 * Get one line from the input text, checking for
2866 * the existence of exactly one CR/LF line-end per line.
2868 * Returns: a pointer to the line.
/*
 * Fetch the next line of the text at *etext, advancing *etext one
 * character at a time as it is consumed.
 *
 * etext: in/out cursor into the text being read.
 * lcnt: line number to quote in any line-ending message.
 *
 * With LINE_END_SWITCH set, irregular line ends are queried ("No CR?",
 * "Two successive CRs?", "CR without LF?"). Before returning, the line is
 * passed through postprocess_for_HTML() and/or postprocess_for_DP() when
 * the matching switches are set.
 * NOTE(review): theline starts as *etext, so the result presumably points
 * into the caller's buffer rather than at a copy - confirm.
 */
2870 char *flgets(char **etext,long lcnt)
2873 gboolean isCR=FALSE;
2874 char *theline=*etext;
2879 c=g_utf8_get_char(*etext);
2880 *etext=g_utf8_next_char(*etext);
2883 /* either way, it's end of line */
2890 /* Error - a LF without a preceding CR */
2891 if (pswit[LINE_END_SWITCH])
2893 if (pswit[ECHO_SWITCH])
/* Echo a temporary copy holding just the text from theline up to eos. */
2895 s=g_strndup(theline,eos-theline);
2896 g_print("\n%s\n",s);
2899 if (!pswit[OVERVIEW_SWITCH])
2900 g_print(" Line %ld - No CR?\n",lcnt);
2911 /* Error - two successive CRs */
2912 if (pswit[LINE_END_SWITCH])
2914 if (pswit[ECHO_SWITCH])
2916 s=g_strndup(theline,eos-theline);
2917 g_print("\n%s\n",s);
2920 if (!pswit[OVERVIEW_SWITCH])
2921 g_print(" Line %ld - Two successive CRs?\n",lcnt);
/* A CR was seen but the character now being handled is not a line end. */
2930 if (pswit[LINE_END_SWITCH] && isCR)
2932 if (pswit[ECHO_SWITCH])
2934 s=g_strndup(theline,eos-theline);
2935 g_print("\n%s\n",s);
2938 if (!pswit[OVERVIEW_SWITCH])
2939 g_print(" Line %ld column %ld - CR without LF?\n",
2940 lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
2946 eos=g_utf8_next_char(eos);
/* Strip markup in place before the line is handed back. */
2950 if (pswit[MARKUP_SWITCH])
2951 postprocess_for_HTML(theline);
2952 if (pswit[DP_SWITCH])
2953 postprocess_for_DP(theline);
2960 * Takes a "word" as a parameter, and checks whether it
2961 * contains a mixture of alpha and digits. Generally, this is an
2962 * error, but may not be for cases like 4th or L5 12s. 3d.
2964 * Returns: TRUE iff an error is found.
/*
 * Decide whether checkword mixes letters and digits in a way worth
 * querying.
 *
 * checkword: NUL-terminated UTF-8 word.
 *
 * The word is first scanned for letters and digits. Words containing both
 * are then compared against accepted shapes: leading digits followed by an
 * ordinal suffix (st/rd/nd/th, optionally with "s" or "ly"), leading digits
 * followed by l, L, s or d, and a capital L followed by digits only.
 */
2966 gboolean mixdigit(const char *checkword)
2968 gboolean wehaveadigit,wehavealetter,query;
2969 const char *s,*nondigit;
2970 wehaveadigit=wehavealetter=query=FALSE;
/* Record whether the word contains any letter and any digit. */
2971 for (s=checkword;*s;s=g_utf8_next_char(s))
2972 if (g_unichar_isalpha(g_utf8_get_char(s)))
2974 else if (g_unichar_isdigit(g_utf8_get_char(s)))
2976 if (wehaveadigit && wehavealetter)
2978 /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
/* Leave nondigit pointing at the first character that is not a digit. */
2980 for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
2981 nondigit=g_utf8_next_char(nondigit))
2983 /* digits, ending in st, rd, nd, th of either case */
2984 if (!g_ascii_strcasecmp(nondigit,"st") ||
2985 !g_ascii_strcasecmp(nondigit,"rd") ||
2986 !g_ascii_strcasecmp(nondigit,"nd") ||
2987 !g_ascii_strcasecmp(nondigit,"th"))
/* The same suffixes in the plural ... */
2989 if (!g_ascii_strcasecmp(nondigit,"sts") ||
2990 !g_ascii_strcasecmp(nondigit,"rds") ||
2991 !g_ascii_strcasecmp(nondigit,"nds") ||
2992 !g_ascii_strcasecmp(nondigit,"ths"))
/* ... and in adverb form, e.g. "1stly". */
2994 if (!g_ascii_strcasecmp(nondigit,"stly") ||
2995 !g_ascii_strcasecmp(nondigit,"rdly") ||
2996 !g_ascii_strcasecmp(nondigit,"ndly") ||
2997 !g_ascii_strcasecmp(nondigit,"thly"))
2999 /* digits, ending in l, L, s or d */
/* Only the "l" test ignores case; "s" and "d" must be lower-case. */
3000 if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
3001 !strcmp(nondigit,"d"))
3004 * L at the start of a number, representing Britsh pounds, like L500.
3005 * This is cute. We know the current word is mixed digit. If the first
3006 * letter is L, there must be at least one digit following. If both
3007 * digits and letters follow, we have a genuine error, else we have a
3008 * capital L followed by digits, and we accept that as a non-error.
/* Recurse on the remainder after the L: it is acceptable when that part is not itself mixed. */
3010 if (g_utf8_get_char(checkword)=='L' &&
3011 !mixdigit(g_utf8_next_char(checkword)))
3020 * Extracts the first/next "word" from the line, and returns it.
3021 * A word is defined as one English word unit--or at least that's the aim.
3022 * "ptr" is advanced to the position in the line where we will start
3023 * looking for the next word.
3025 * Returns: A newly-allocated string.
3027 gchar *getaword(const char **ptr)
3032 word=g_string_new(NULL);
3033 for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
3034 !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
3035 **ptr;*ptr=g_utf8_next_char(*ptr))
3038 * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
3039 * Especially yucky is the case of L1,000
3040 * This section looks for a pattern of characters including a digit
3041 * followed by a comma or period followed by one or more digits.
3042 * If found, it returns this whole pattern as a word; otherwise we discard
3043 * the results and resume our normal programming.
3046 for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
3047 g_unichar_isalpha(g_utf8_get_char(s)) ||
3048 g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
3049 g_string_append_unichar(word,g_utf8_get_char(s));
3050 for (t=g_utf8_next_char(word->str);*g_utf8_next_char(t);
3051 t=g_utf8_next_char(t))
3053 c=g_utf8_get_char(t);
3054 pc=g_utf8_get_char(g_utf8_prev_char(t));
3055 if ((c=='.' || c==',') && g_unichar_isdigit(pc))
3058 return g_string_free(word,FALSE);
3061 /* we didn't find a punctuated number - do the regular getword thing */
3062 g_string_truncate(word,0);
3063 for (;g_unichar_isdigit(g_utf8_get_char(*ptr)) ||
3064 g_unichar_isalpha(g_utf8_get_char(*ptr)) ||
3065 g_utf8_get_char(*ptr)=='\'';*ptr=g_utf8_next_char(*ptr))
3066 g_string_append_unichar(word,g_utf8_get_char(*ptr));
3067 return g_string_free(word,FALSE);
3073 * Is this word a Roman Numeral?
3075 * It doesn't actually validate that the number is a valid Roman Numeral--for
3076 * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
3077 * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
3078 * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
3079 * expressions thereof, except when it came to taxes. Allow any number of M,
3080 * an optional D, an optional CM or CD, any number of optional Cs, an optional
3081 * XL or an optional XC, an optional IX or IV, an optional V and any number
/*
 * Test whether t has the shape of a Roman numeral by consuming, in order,
 * the optional groups matched below.
 *
 * t: NUL-terminated word. Every comparison is against a lower-case letter.
 * NOTE(review): presumably callers fold the word to lower case first - confirm.
 * NOTE(review): the result presumably depends on whether the whole word
 * was consumed - confirm.
 */
3084 gboolean isroman(const char *t)
/* Thousands: any run of m. */
3090 while (g_utf8_get_char(t)=='m' && *t)
/* Hundreds: d, cm, cd, then any run of c. */
3092 if (g_utf8_get_char(t)=='d')
3094 if (g_str_has_prefix(t,"cm"))
3096 if (g_str_has_prefix(t,"cd"))
3098 while (g_utf8_get_char(t)=='c' && *t)
/* Tens: xl, xc, l, then any run of x. */
3100 if (g_str_has_prefix(t,"xl"))
3102 if (g_str_has_prefix(t,"xc"))
3104 if (g_utf8_get_char(t)=='l')
3106 while (g_utf8_get_char(t)=='x' && *t)
/* Units: ix, iv, v, then any run of i. */
3108 if (g_str_has_prefix(t,"ix"))
3110 if (g_str_has_prefix(t,"iv"))
3112 if (g_utf8_get_char(t)=='v')
3114 while (g_utf8_get_char(t)=='i' && *t)
3120 * postprocess_for_DP:
3122 * Invoked with the -d switch from flgets().
3123 * It simply "removes" from the line a hard-coded set of common
3124 * DP-specific tags, so that the line passed to the main routine has
3125 * been pre-cleaned of DP markup.
/*
 * Delete, in place, every occurrence in theline of each string listed in
 * the DPmarkup[] table.
 *
 * theline: writable NUL-terminated line; it only ever gets shorter.
 */
3127 void postprocess_for_DP(char *theline)
/* The table is walked until an entry that is an empty string. */
3133 for (i=0;*DPmarkup[i];i++)
3134 while ((s=strstr(theline,DPmarkup[i])))
/* Close the gap by sliding the rest of the line, terminator included, over the match. */
3136 t=s+strlen(DPmarkup[i]);
3137 memmove(s,t,strlen(t)+1);
3142 * postprocess_for_HTML:
3144 * Invoked with the -m switch from flgets().
3145 * It simply "removes" from the line a hard-coded set of common
3146 * HTML tags and "replaces" a hard-coded set of common HTML
3147 * entities, so that the line passed to the main routine has
3148 * been pre-cleaned of HTML.
/*
 * Clean theline of HTML in place: call losemarkup() repeatedly for as long
 * as it returns non-NULL, then call loseentities() once.
 */
3150 void postprocess_for_HTML(char *theline)
3152 while (losemarkup(theline))
3154 loseentities(theline);
/*
 * Find the first '<' in theline and the first '>' after it, and compare
 * the text following the '<' against each entry of the markup[] table
 * using tagcomp(). On a match the text from '<' through '>' is removed in
 * place.
 * NOTE(review): the return value is not visible in this block; the caller
 * loops while it is non-NULL, so it presumably signals "a tag was handled,
 * call again" - confirm.
 */
3157 char *losemarkup(char *theline)
3161 s=strchr(theline,'<');
/* Only look for '>' when a '<' was found. */
3162 t=s?strchr(s,'>'):NULL;
/* The table is walked until an entry that is an empty string. */
3165 for (i=0;*markup[i];i++)
3166 if (tagcomp(g_utf8_next_char(s),markup[i]))
/* Step past the '>' and slide the rest of the line down over the tag. */
3168 t=g_utf8_next_char(t);
3169 memmove(s,t,strlen(t)+1);
3172 /* It's an unrecognized <xxx>. */
3176 void loseentities(char *theline)
3183 GTree *entities=NULL;
3184 GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
3188 g_tree_destroy(entities);
3190 if (translit==(GIConv)-1)
3191 g_iconv_close(translit);
3192 translit=(GIConv)-1;
3193 if (to_utf8==(GIConv)-1)
3194 g_iconv_close(to_utf8);
3202 entities=g_tree_new((GCompareFunc)strcmp);
3203 for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
3204 g_tree_insert(entities,HTMLentities[i].name,
3205 GUINT_TO_POINTER(HTMLentities[i].c));
3207 if (translit==(GIConv)-1)
3208 translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
3209 if (to_utf8==(GIConv)-1)
3210 to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
3211 while((amp=strchr(theline,'&')))
3213 scolon=strchr(amp,';');
3218 if (amp+2+strspn(amp+2,"0123456789")==scolon)
3219 c=strtol(amp+2,NULL,10);
3220 else if (amp[2]=='x' &&
3221 amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
3222 c=strtol(amp+3,NULL,16);
3226 s=g_strndup(amp+1,scolon-(amp+1));
3227 c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
3236 if (c<128 || c>=192 && c<=255) /* An ISO-8859-1 character */
3237 theline+=g_unichar_to_utf8(c,theline);
3241 nb=g_unichar_to_utf8(c,s);
3242 t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
3244 s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
3246 memcpy(theline,s,nb);
3250 memmove(theline,g_utf8_next_char(scolon),
3251 strlen(g_utf8_next_char(scolon))+1);
3254 theline=g_utf8_next_char(amp);
/*
 * Case-insensitive test of whether strin begins with basetag.
 *
 * strin: text following a '<'; one leading '/' is skipped so that closing
 * tags compare like opening ones.
 * basetag: tag name to compare against.
 *
 * Both strings are case-folded with g_utf8_casefold() and compared with
 * g_str_has_prefix().
 * NOTE(review): presumably the folded copies are freed and retval is
 * returned - confirm.
 */
3258 gboolean tagcomp(const char *strin,const char *basetag)
3262 if (g_utf8_get_char(strin)=='/')
3263 t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
3265 t=g_utf8_casefold(strin,-1);
3266 s=g_utf8_casefold(basetag,-1);
3267 retval=g_str_has_prefix(t,s);
3273 void proghelp(GOptionContext *context)
3276 fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
3277 fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
3278 fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
3279 fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
3280 "For details, read the file COPYING.\n",stderr);
3281 fputs("This is Free Software; "
3282 "you may redistribute it under certain conditions (GPL);\n",stderr);
3283 fputs("read the file COPYING for details.\n\n",stderr);
3284 help=g_option_context_get_help(context,TRUE,NULL);
3287 fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
3288 fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
3289 "non-ASCII\n",stderr);
3290 fputs("characters like accented letters, "
3291 "lines longer than 75 or shorter than 55,\n",stderr);
3292 fputs("unbalanced quotes or brackets, "
3293 "a variety of badly formatted punctuation, \n",stderr);
3294 fputs("HTML tags, some likely typos. "
3295 "It is NOT a substitute for human judgement.\n",stderr);