Fix bug #21: False positive: Opening slanted double-quote, followed by single slanted quote, should be accepted by BL
1 /*************************************************************************/
2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com> */
5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk> */
7 /* This program is free software; you can redistribute it and/or modify */
8 /* it under the terms of the GNU General Public License as published by */
9 /* the Free Software Foundation; either version 2 of the License, or */
10 /* (at your option) any later version. */
12 /* This program is distributed in the hope that it will be useful, */
13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
15 /* GNU General Public License for more details. */
17 /* You should have received a copy of the GNU General Public License */
18 /* along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 /*************************************************************************/
30 #include "bookloupe.h"
33 #include "HTMLentities.h"
39 "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
40 "nad", "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa",
41 "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
42 "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
43 "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
44 "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
45 "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
46 "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
47 "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
48 "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
49 "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
50 "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
51 "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
52 "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
53 "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
54 "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
55 "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
56 "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
57 "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
58 "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
59 "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
60 "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
61 "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
62 "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
63 "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
64 "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
65 "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
66 "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
67 "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
73 /* Common abbreviations and other OK words not to query as typos. */
75 "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
76 "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
77 "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
78 "outbid", "outbids", "frostbite", "frostbitten", ""
81 /* Common abbreviations that cause otherwise unexplained periods. */
83 "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
84 "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
88 * Two-Letter combinations that rarely if ever start words,
89 * but are common scannos or otherwise common letter combinations.
92 "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
96 * Two-Letter combinations that rarely if ever end words,
97 * but are common scannos or otherwise common letter combinations.
100 "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
101 "sw", "gr", "sl", "cl", "iy", ""
105 "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
106 "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
107 "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
108 "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
112 "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
116 "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
117 "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
118 "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
119 "during", "let", "toward", "among", ""
123 "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
124 "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
125 "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
126 "among", "those", "into", "whom", "having", "thence", ""
129 gboolean pswit[SWITNO]; /* program switches */
131 static GOptionEntry options[]={
132 { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
133 "Ignore DP-specific markup", NULL },
134 { "noecho", 'e', 0, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
135 "Don't echo queried line", NULL },
136 { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
137 "Check single quotes", NULL },
138 { "typo", 't', 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
139 "Check common typos", NULL },
140 { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
141 "Require closure of quotes on every paragraph", NULL },
142 { "relaxed", 'x', 0, G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
143 "Disable paranoid querying of everything", NULL },
144 { "line-end", 'l', 0, G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
145 "Disable line end checking", NULL },
146 { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
147 "Overview: just show counts", NULL },
148 { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
149 "Output errors to stdout instead of stderr", NULL },
150 { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
151 "Echo header fields", NULL },
152 { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
153 "Ignore markup in < >", NULL },
154 { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
155 "Use file of user-defined typos", NULL },
156 { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
157 "Defaults for use on www upload", NULL },
158 { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
159 "Verbose - list everything", NULL },
163 long cnt_quote; /* for overview mode, count of quote queries */
164 long cnt_brack; /* for overview mode, count of brackets queries */
165 long cnt_bin; /* for overview mode, count of non-ASCII queries */
166 long cnt_odd; /* for overview mode, count of odd character queries */
167 long cnt_long; /* for overview mode, count of long line errors */
168 long cnt_short; /* for overview mode, count of short line queries */
169 long cnt_punct; /* for overview mode,
170 count of punctuation and spacing queries */
171 long cnt_dash; /* for overview mode, count of dash-related queries */
172 long cnt_word; /* for overview mode, count of word queries */
173 long cnt_html; /* for overview mode, count of html queries */
174 long cnt_lineend; /* for overview mode, count of line-end queries */
175 long cnt_spacend; /* count of lines with space at end */
176 long linecnt; /* count of total lines in the file */
177 long checked_linecnt; /* count of lines actually checked */
179 void proghelp(GOptionContext *context);
180 void procfile(const char *);
184 gboolean mixdigit(const char *);
185 gchar *getaword(const char **);
186 char *flgets(char **,long);
187 void postprocess_for_HTML(char *);
188 char *linehasmarkup(char *);
189 char *losemarkup(char *);
190 gboolean tagcomp(const char *,const char *);
191 void loseentities(char *);
192 gboolean isroman(const char *);
193 void postprocess_for_DP(char *);
194 void print_as_windows_1252(const char *string);
195 void print_as_utf_8(const char *string);
197 GTree *qword,*qperiod;
/*
 * parse_options: parse the command line into pswit[] with GOption and
 * then normalise the switch values.
 *
 * argc and argv are passed by address and handed straight to
 * g_option_context_parse(). A parse failure is reported with
 * g_printerr() together with a hint to use --help.
 */
203 void parse_options(int *argc,char ***argv)
206 GOptionContext *context;
207 context=g_option_context_new(
208 "file - looks for errors in Project Gutenberg(TM) etexts");
209 g_option_context_add_main_entries(context,options,NULL);
210 if (!g_option_context_parse(context,argc,argv,&err))
212 g_printerr("Bookloupe: %s\n",err->message);
213 g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
216 /* Paranoid checking is turned OFF, not on, by its switch */
217 pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
/* The typo switch is toggled, not set: in paranoid mode --typo turns typo checks off. */
218 if (pswit[PARANOID_SWITCH])
219 /* if running in paranoid mode, typo checks default to enabled */
220 pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
221 /* Line-end checking is turned OFF, not on, by its switch */
222 pswit[LINE_END_SWITCH]=!pswit[LINE_END_SWITCH];
223 /* Echoing is turned OFF, not on, by its switch */
224 pswit[ECHO_SWITCH]=!pswit[ECHO_SWITCH];
225 if (pswit[OVERVIEW_SWITCH])
226 /* just print summary; don't echo */
227 pswit[ECHO_SWITCH]=FALSE;
229 * Web uploads - for the moment, this is really just a placeholder
230 * until we decide what processing we really want to do on web uploads
/* --web overwrites every switch below with a fixed value, discarding whatever was set above. */
232 if (pswit[WEB_SWITCH])
234 /* specific override for web uploads */
235 pswit[ECHO_SWITCH]=TRUE;
236 pswit[SQUOTE_SWITCH]=FALSE;
237 pswit[TYPO_SWITCH]=TRUE;
238 pswit[QPARA_SWITCH]=FALSE;
239 pswit[PARANOID_SWITCH]=TRUE;
240 pswit[LINE_END_SWITCH]=FALSE;
241 pswit[OVERVIEW_SWITCH]=FALSE;
242 pswit[STDOUT_SWITCH]=FALSE;
243 pswit[HEADER_SWITCH]=TRUE;
244 pswit[VERBOSE_SWITCH]=FALSE;
245 pswit[MARKUP_SWITCH]=FALSE;
246 pswit[USERTYPO_SWITCH]=FALSE;
247 pswit[DP_SWITCH]=FALSE;
254 g_option_context_free(context);
260 * Read in the user-defined stealth scanno list.
/*
 * read_user_scannos: load the user typo list into the usertypo tree.
 *
 * Candidate files are tried in this order, moving on only when the
 * previous attempt failed with G_FILE_ERROR_NOENT:
 *   bookloupe.typ, <running_from>/bookloupe.typ,
 *   gutcheck.typ,  <running_from>/gutcheck.typ.
 * If none exists a notice is printed; any other error is written to
 * stderr with the file name.
 *
 * Valid UTF-8 content is normalised (default compose); anything else is
 * converted from WINDOWS-1252. The text is split on '\r' and '\n', and
 * each piece whose first byte is greater than '!' becomes a tree key.
 */
262 void read_user_scannos(void)
265 gchar *usertypo_file;
269 gchar *contents,*utf8,**lines;
270 usertypo_file=g_strdup("bookloupe.typ");
271 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
272 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
275 g_free(usertypo_file);
276 usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
277 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
279 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
282 g_free(usertypo_file);
283 usertypo_file=g_strdup("gutcheck.typ");
284 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
286 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
289 g_free(usertypo_file);
290 usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
291 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
293 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
295 g_free(usertypo_file);
296 g_print(" --> I couldn't find bookloupe.typ "
297 "-- proceeding without user typos.\n");
302 fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
303 g_free(usertypo_file);
307 if (g_utf8_validate(contents,len,NULL))
308 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
310 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
312 lines=g_strsplit_set(utf8,"\r\n",0);
/*
 * The tree takes ownership of each inserted key (g_free is its key
 * destroy function), so inserted entries of lines[] must not be freed
 * again by this function.
 * NOTE(review): strcmp is cast to GCompareDataFunc, which takes a third
 * user_data argument; calling through the mismatched type is formally
 * undefined - consider a small wrapper.
 */
314 usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
315 for (i=0;lines[i];i++)
316 if (*(unsigned char *)lines[i]>'!')
317 g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
326 * Read an etext returning a newly allocated string containing the file
327 * contents or NULL on error.
/*
 * read_etext: load filename and hand back its text as UTF-8.
 *
 * filename - path passed to g_file_get_contents().
 * err      - receives the GError on failure.
 *
 * Content that validates as UTF-8 is normalised (default compose) and
 * the print handler is switched to print_as_utf_8. Otherwise it is
 * converted from WINDOWS-1252 and the print handler becomes
 * print_as_windows_1252. On an illegal byte sequence the offending
 * position is located by scanning the first bytes_read bytes for
 * newlines, and a new error naming the byte, line and column is set.
 */
329 gchar *read_etext(const char *filename,GError **err)
331 GError *tmp_err=NULL;
332 gchar *contents,*utf8;
333 gsize len,bytes_read,bytes_written;
335 if (!g_file_get_contents(filename,&contents,&len,err))
337 if (g_utf8_validate(contents,len,NULL))
339 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
340 g_set_print_handler(print_as_utf_8);
342 SetConsoleOutputCP(CP_UTF8);
347 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,
348 &bytes_written,&tmp_err);
349 if (g_error_matches(tmp_err,G_CONVERT_ERROR,
350 G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
/* bytes_read is the offset of the first byte that could not be converted. */
353 for(i=0;i<bytes_read;i++)
354 if (contents[i]=='\n')
359 else if (contents[i]!='\r')
/* NOTE(review): line and col are printed with %d; their declarations are not visible here - confirm they are int. */
361 g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
362 "Input conversion failed. Byte %d at line %d, column %d is not a "
363 "valid Windows-1252 character",
364 ((unsigned char *)contents)[bytes_read],line,col);
367 g_propagate_error(err,tmp_err);
368 g_set_print_handler(print_as_windows_1252);
370 SetConsoleOutputCP(1252);
/*
 * cleanup_on_exit: restore the console output code page saved in
 * saved_cp. Registered with atexit() by main().
 * NOTE(review): SetConsoleOutputCP is a Windows console call; any
 * platform guard around it is not visible in this view.
 */
377 void cleanup_on_exit(void)
380 SetConsoleOutputCP(saved_cp);
/*
 * main: program entry point.
 *
 * Registers cleanup_on_exit, records the current console code page and
 * the directory the program was started from (running_from), parses the
 * options, and prints a banner to stderr. In overview mode it prints the
 * per-category query counters and their total instead of per-line
 * reports. running_from and the usertypo tree are released before exit.
 */
384 int main(int argc,char **argv)
387 atexit(cleanup_on_exit);
388 saved_cp=GetConsoleOutputCP();
390 running_from=g_path_get_dirname(argv[0]);
391 parse_options(&argc,&argv);
392 if (pswit[USERTYPO_SWITCH])
394 fprintf(stderr,"bookloupe: Check and report on an e-text\n");
396 if (pswit[OVERVIEW_SWITCH])
398 g_print(" Checked %ld lines of %ld (head+foot = %ld)\n\n",
399 checked_linecnt,linecnt,linecnt-checked_linecnt);
400 g_print(" --------------- Queries found --------------\n");
402 g_print(" Long lines: %14ld\n",cnt_long);
404 g_print(" Short lines: %14ld\n",cnt_short);
406 g_print(" Line-end problems: %14ld\n",cnt_lineend);
408 g_print(" Common typos: %14ld\n",cnt_word);
410 g_print(" Unmatched quotes: %14ld\n",cnt_quote);
412 g_print(" Unmatched brackets: %14ld\n",cnt_brack);
414 g_print(" Non-ASCII characters: %14ld\n",cnt_bin);
416 g_print(" Proofing characters: %14ld\n",cnt_odd);
418 g_print(" Punctuation & spacing queries: %14ld\n",cnt_punct);
420 g_print(" Non-standard dashes: %14ld\n",cnt_dash);
422 g_print(" Possible HTML tags: %14ld\n",cnt_html);
/* The total is the sum of the eleven counters printed above; cnt_spacend is not included. */
424 g_print(" TOTAL QUERIES %14ld\n",
425 cnt_quote+cnt_brack+cnt_bin+cnt_odd+cnt_long+cnt_short+cnt_punct+
426 cnt_dash+cnt_word+cnt_html+cnt_lineend);
428 g_free(running_from);
430 g_tree_unref(usertypo);
437 * Run a first pass - verify that it's a valid PG
438 * file, decide whether to report some things that
439 * occur many times in the text like long or short
440 * lines, non-standard dashes, etc.
/*
 * first_pass: split etext on '\n' and gather whole-file statistics.
 *
 * etext - the complete text; each line has trailing '\r' stripped
 *         before it is measured.
 *
 * The statistics live in a function-static struct first_pass_results,
 * so they accumulate across calls. Header and footer lines are detected
 * from their marker strings, and lines after a detected footer are not
 * counted.
 *
 * Review fix: the character before a double quote was classified with
 * isalpha(), which is undefined for values outside the unsigned char
 * range; it now uses g_unichar_isalpha() like the other code point
 * tests in this loop.
 */
442 struct first_pass_results *first_pass(const char *etext)
444 gunichar laststart=CHAR_SPACE;
449 unsigned int lastlen=0,lastblen=0;
450 long spline=0,nspline=0;
451 static struct first_pass_results results={0};
454 lines=g_strsplit(etext,"\n",0);
455 for (j=0;lines[j];j++)
457 lbytes=strlen(lines[j]);
458 while (lbytes>0 && lines[j][lbytes-1]=='\r')
459 lines[j][--lbytes]='\0';
460 llen=g_utf8_strlen(lines[j],lbytes);
/* Old-style header: "*END ... SMALL PRINT ... PUBLIC DOMAIN|COPYRIGHT". */
462 if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
463 (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
466 g_print(" --> Duplicate header?\n");
467 spline=linecnt+1; /* first line of non-header text, that is */
/* New-style header: a line starting "*** START" that mentions PROJECT GUTENBERG. */
469 if (!strncmp(lines[j],"*** START",9) &&
470 strstr(lines[j],"PROJECT GUTENBERG"))
473 g_print(" --> Duplicate header?\n");
474 nspline=linecnt+1; /* first line of non-header text, that is */
/* A footer is looked for only after a header was seen: "end" must precede "project gutenberg". */
476 if (spline || nspline)
478 lc_line=g_utf8_strdown(lines[j],lbytes);
479 if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
481 if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
483 if (results.footerline)
485 /* it's an old-form header - we can detect duplicates */
487 g_print(" --> Duplicate footer?\n");
490 results.footerline=linecnt;
496 results.firstline=spline;
498 results.firstline=nspline; /* override with new */
499 if (results.footerline)
500 continue; /* don't count the boilerplate in the footer */
501 results.totlen+=llen;
502 for (s=lines[j];*s;s=g_utf8_next_char(s))
504 if (g_utf8_get_char(s)>127)
506 if (g_unichar_isalpha(g_utf8_get_char(s)))
510 if (CHAR_IS_DQUOTE(g_utf8_get_char(s)))
511 qc=QUOTE_CLASS(g_utf8_get_char(s));
/*
 * NOTE(review): g_utf8_prev_char(s) steps before the buffer when the
 * quote is the first character of the line; no guard is visible in
 * this view - confirm.
 */
514 if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) &&
515 g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
516 results.endquote_count++;
/* Same short-line rule as check_for_short_line(): the previous line is short and the one before it is not. */
519 if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
520 lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
523 g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
525 if (strstr(lines[j],".,"))
527 /* only count ast lines for ignoring purposes where there is */
528 /* locase text on the line */
529 if (strchr(lines[j],'*'))
531 for (s=lines[j];*s;s=g_utf8_next_char(s))
532 if (g_unichar_islower(g_utf8_get_char(s)))
537 if (strchr(lines[j],'/'))
538 results.fslashline++;
/* Walk back over trailing white space, then test for a single (not double) hyphen at line end. */
541 for (s=g_utf8_prev_char(lines[j]+lbytes);
542 s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;
543 s=g_utf8_prev_char(s))
545 if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
546 g_utf8_get_char(g_utf8_prev_char(s))!='-')
549 if (llen>LONGEST_PG_LINE)
551 if (llen>WAY_TOO_LONG)
552 results.verylongline++;
553 if (strchr(lines[j],'<') && strchr(lines[j],'>'))
/* i = distance from the first '<' to the first '>', inclusive. */
555 i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
558 if (strstr(lines[j],"<i>"))
559 results.htmcount+=4; /* bonus marks! */
561 /* Check for spaced em-dashes */
/* The search starts at the second character, so s[-1] is always inside the line. */
562 if (lines[j][0] && (s=strstr(g_utf8_next_char(lines[j]),"--")))
565 if (s[-1]==CHAR_SPACE || s[2]==CHAR_SPACE)
566 results.space_emdash++;
567 if (s[-1]==CHAR_SPACE && s[2]==CHAR_SPACE)
568 /* count of em-dashes with spaces both sides */
569 results.non_PG_space_emdash++;
570 if (s[-1]!=CHAR_SPACE && s[2]!=CHAR_SPACE)
571 /* count of PG-type em-dashes with no spaces */
572 results.PG_space_emdash++;
577 if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))
578 results.Dutchcount++;
579 if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))
580 results.Frenchcount++;
581 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
582 results.standalone_digit++;
585 /* Check for spaced dashes */
586 if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
/* Only the first byte is kept; it is compared against CHAR_SPACE on the next iteration. */
590 laststart=lines[j][0];
599 * Make some snap decisions based on the first pass results.
/*
 * report_first_pass: turn the first-pass statistics into reporting
 * decisions and print a note for each category that will be suppressed.
 *
 * results - statistics from first_pass(); firstline is reset to 0 when
 *           header and footer are fewer than 100 lines apart.
 *
 * The decisions are stored in a function-static struct warnings.
 *
 * Review fixes: the final assignment used "results.firstline" although
 * results is a pointer, and now uses "->" like every other access here.
 * The standalone-digit comment said 15 while the code tests >10; the
 * comment now matches the code.
 */
601 struct warnings *report_first_pass(struct first_pass_results *results)
603 static struct warnings warnings={0};
605 g_print(" --> %ld lines in this file have white space at end\n",
608 if (results->dotcomma>5)
611 g_print(" --> %ld lines in this file contain '.,'. "
612 "Not reporting them.\n",results->dotcomma);
615 * If more than 50 lines, or one-tenth, are short,
616 * don't bother reporting them.
618 warnings.shortline=1;
619 if (results->shortline>50 || results->shortline*10>linecnt)
621 warnings.shortline=0;
622 g_print(" --> %ld lines in this file are short. "
623 "Not reporting short lines.\n",results->shortline);
626 * If more than 50 lines, or one-tenth, are long,
627 * don't bother reporting them.
630 if (results->longline>50 || results->longline*10>linecnt)
633 g_print(" --> %ld lines in this file are long. "
634 "Not reporting long lines.\n",results->longline);
636 /* If more than 10 lines contain asterisks, don't bother reporting them. */
638 if (results->astline>10)
641 g_print(" --> %ld lines in this file contain asterisks. "
642 "Not reporting them.\n",results->astline);
645 * If more than 10 lines contain forward slashes,
646 * don't bother reporting them.
649 if (results->fslashline>10)
652 g_print(" --> %ld lines in this file contain forward slashes. "
653 "Not reporting them.\n",results->fslashline);
656 * If more than 20 lines contain unpunctuated endquotes,
657 * don't bother reporting them.
660 if (results->endquote_count>20)
663 g_print(" --> %ld lines in this file contain unpunctuated endquotes. "
664 "Not reporting them.\n",results->endquote_count);
667 * If more than 10 lines contain standalone digits,
668 * don't bother reporting them.
671 if (results->standalone_digit>10)
674 g_print(" --> %ld lines in this file contain standalone 0s and 1s. "
675 "Not reporting them.\n",results->standalone_digit);
678 * If more than 20 lines contain hyphens at end,
679 * don't bother reporting them.
682 if (results->hyphens>20)
685 g_print(" --> %ld lines in this file have hyphens at end. "
686 "Not reporting them.\n",results->hyphens);
/* A high HTML score switches markup mode on globally, unless it is already on. */
688 if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
690 g_print(" --> Looks like this is HTML. Switching HTML mode ON.\n");
691 pswit[MARKUP_SWITCH]=1;
693 if (results->verylongline>0)
694 g_print(" --> %ld lines in this file are VERY long!\n",
695 results->verylongline);
697 * If there are more non-PG spaced dashes than PG em-dashes,
698 * assume it's deliberate.
699 * Current PG guidelines say don't use them, but older texts do,
700 * and some people insist on them whatever the guidelines say.
703 if (results->spacedash+results->non_PG_space_emdash>
704 results->PG_space_emdash)
707 g_print(" --> There are %ld spaced dashes and em-dashes. "
708 "Not reporting them.\n",
709 results->spacedash+results->non_PG_space_emdash);
711 /* If more than a quarter of characters are hi-bit, bug out. */
713 if (results->binlen*4>results->totlen)
715 g_print(" --> This file does not appear to be ASCII. "
716 "Terminating. Best of luck with it!\n");
/* Fewer than a quarter alphabetic characters: treated as not being text. */
719 if (results->alphalen*4<results->totlen)
721 g_print(" --> This file does not appear to be text. "
722 "Terminating. Best of luck with it!\n");
725 if (results->binlen*100>results->totlen || results->binlen>100)
727 g_print(" --> There are a lot of foreign letters here. "
728 "Not reporting them.\n");
731 warnings.isDutch=FALSE;
732 if (results->Dutchcount>50)
734 warnings.isDutch=TRUE;
735 g_print(" --> This looks like Dutch - "
736 "switching off dashes and warnings for 's Middags case.\n");
738 warnings.isFrench=FALSE;
739 if (results->Frenchcount>50)
741 warnings.isFrench=TRUE;
742 g_print(" --> This looks like French - "
743 "switching off some doublepunct.\n");
745 if (results->firstline && results->footerline)
746 g_print(" The PG header and footer appear to be already on.\n");
749 if (results->firstline)
750 g_print(" The PG header is on - no footer.\n");
751 if (results->footerline)
752 g_print(" The PG footer is on - no header.\n");
755 if (pswit[VERBOSE_SWITCH])
758 warnings.shortline=1;
767 g_print(" *** Verbose output is ON -- you asked for it! ***\n");
769 if (warnings.isDutch)
/* Header and footer under 100 lines apart: there is no real body, so the header is reported too. */
771 if (results->footerline>0 && results->firstline>0 &&
772 results->footerline>results->firstline &&
773 results->footerline-results->firstline<100)
775 g_print(" --> I don't really know where this text starts. \n");
776 g_print(" There are no reference points.\n");
777 g_print(" I'm going to have to report the header and footer "
779 results->firstline=0;
787 * Look along the line, accumulate the count of quotes, and see
788 * if this is an empty line - i.e. a line with nothing on it
790 * If line has just spaces, period, * and/or - on it, don't
791 * count it, since empty lines with asterisks or dashes to
792 * separate sections are common.
794 * Returns: TRUE if the line is empty.
/*
 * analyse_quotes: scan one line, feeding quotes, underscores and
 * brackets into counters.
 *
 * aline    - the line to scan (UTF-8).
 * linecnt  - line number used in messages; this int parameter shadows
 *            the global long of the same name.
 * counters - running quote and bracket state, updated in place.
 *
 * Double quotes are always counted with their own class. Single quotes
 * are examined only when pswit[SQUOTE_SWITCH] is set, and are classed
 * as apostrophe, opening, closing or neutral from the neighbouring
 * characters. An error raised by count_quote() is printed with its line
 * and column, then cleared.
 *
 * Review fixes: linecnt is an int but was printed with %ld, so it is
 * now cast to long at the call. The punctuation test passed a gunichar
 * to strchr(), which truncates it to char; it now uses g_utf8_strchr()
 * as check_for_starting_punctuation() does.
 */
796 gboolean analyse_quotes(const char *aline,int linecnt,struct counters *counters)
799 /* assume the line is empty until proven otherwise */
800 gboolean isemptyline=TRUE;
801 const char *s=aline,*sprev,*snext;
804 GError *tmp_err=NULL;
807 snext=g_utf8_next_char(s);
808 c=g_utf8_get_char(s);
809 if (CHAR_IS_DQUOTE(c))
810 (void)count_quote(counters,c,QUOTE_CLASS(c),&tmp_err);
811 else if (CHAR_IS_SQUOTE(c) && pswit[SQUOTE_SWITCH])
816 * At start of line, it can only be a quotation mark.
817 * Hardcode a very common exception!
819 if (!g_str_has_prefix(snext,"tis") &&
820 !g_str_has_prefix(snext,"Tis"))
821 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
823 else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
824 g_unichar_isalpha(g_utf8_get_char(snext)))
825 /* Do nothing! it's definitely an apostrophe, not a quote */
827 /* it's outside a word - let's check it out */
828 else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||
829 g_unichar_isalpha(g_utf8_get_char(snext)))
831 /* certainly looks like a quotation mark */
832 if (!g_str_has_prefix(snext,"tis") &&
833 !g_str_has_prefix(snext,"Tis"))
834 /* hardcode a very common exception! */
/* Directly after punctuation the direction is ambiguous, so it is counted as neutral. */
836 if (g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(sprev)))
837 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
839 (void)count_quote(counters,c,OPENING_QUOTE,&tmp_err);
844 /* now - is it a quotation mark? */
845 guessquote=0; /* accumulate clues */
846 if (g_unichar_isalpha(g_utf8_get_char(sprev)))
848 /* it follows a letter - could be either */
850 if (g_utf8_get_char(sprev)=='s')
852 /* looks like a plural apostrophe */
854 if (g_utf8_get_char(snext)==CHAR_SPACE)
858 if (innermost_quote_matches(counters,c))
860 * Give it the benefit of some doubt,
861 * if a squote is already open.
867 (void)count_quote(counters,c,CLOSING_QUOTE,&tmp_err);
870 /* no adjacent letter - it must be a quote of some kind */
871 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
876 if (pswit[ECHO_SWITCH])
877 g_print("\n%s\n",aline);
878 if (!pswit[OVERVIEW_SWITCH])
879 g_print(" Line %ld column %ld - %s\n",
880 (long)linecnt,g_utf8_pointer_to_offset(aline,s)+1,tmp_err->message);
881 g_clear_error(&tmp_err);
883 if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
885 isemptyline=FALSE; /* ignore lines like * * * as spacers */
886 if (c==CHAR_UNDERSCORE)
887 counters->c_unders++;
888 if (c==CHAR_OPEN_SBRACK)
/* A "[Illustration:" at the very start of the line, with no bracket pending, is tracked on its own counter. */
890 if (!matching_difference(counters,COUNTER_ILLUSTRATION) &&
891 !matching_difference(counters,c) && s==aline &&
892 g_str_has_prefix(s,"[Illustration:"))
893 increment_matching(counters,COUNTER_ILLUSTRATION,TRUE);
895 increment_matching(counters,c,TRUE);
897 else if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK)
898 increment_matching(counters,c,TRUE);
899 if (c==CHAR_CLOSE_SBRACK)
/* A ']' that ends the line (snext is the terminator) may close the illustration counter instead. */
901 if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) &&
902 !matching_difference(counters,c) && !*snext)
903 increment_matching(counters,COUNTER_ILLUSTRATION,FALSE);
905 increment_matching(counters,c,FALSE);
907 else if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK)
908 increment_matching(counters,c,FALSE);
916 * check_for_control_characters:
918 * Check for invalid or questionable characters in the line
919 * Anything above 127 is invalid for plain ASCII, and
920 * non-printable control characters should also be flagged.
921 * Tabs should generally not be there.
/*
 * aline - the line to scan (UTF-8).
 *
 * Flags any code point below CHAR_SPACE other than LF, CR and TAB.
 *
 * Review fix: the column was computed with
 * g_utf8_pointer_to_offset(s,aline), i.e. with the arguments reversed,
 * which gives a negative offset and so a wrong column. The line start
 * now comes first, as in every other check in this file.
 */
923 void check_for_control_characters(const char *aline)
927 for (s=aline;*s;s=g_utf8_next_char(s))
929 c=g_utf8_get_char(s);
930 if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
932 if (pswit[ECHO_SWITCH])
933 g_print("\n%s\n",aline);
934 if (!pswit[OVERVIEW_SWITCH])
935 g_print(" Line %ld column %ld - Control character %u\n",
936 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
944 * check_for_odd_characters:
946 * Check for binary and other odd characters.
/*
 * aline       - the line to scan (UTF-8).
 * warnings    - first-pass decisions; the fslash and ast members gate
 *               the forward-slash and asterisk reports.
 * isemptyline - TRUE suppresses the asterisk report.
 *
 * Each category has its own e* flag, initialised FALSE and tested
 * before reporting, so a category is reported at most once per line.
 */
948 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
949 gboolean isemptyline)
951 /* Don't repeat multiple warnings on one line. */
952 gboolean eNon_A=FALSE,eTab=FALSE,eTilde=FALSE;
953 gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
956 for (s=aline;*s;s=g_utf8_next_char(s))
958 c=g_utf8_get_char(s);
/* && binds tighter than ||: (control char other than tab/newline) OR (code point above 127). */
959 if (!eNon_A && (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
961 if (pswit[ECHO_SWITCH])
962 g_print("\n%s\n",aline);
963 if (!pswit[OVERVIEW_SWITCH])
/* 128-159 and anything above 255 gets the Non-ISO-8859 message; the rest is Non-ASCII. */
964 if (c>127 && c<160 || c>255)
965 g_print(" Line %ld column %ld - "
966 "Non-ISO-8859 character %u\n",
967 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
969 g_print(" Line %ld column %ld - "
970 "Non-ASCII character %u\n",
971 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
976 if (!eTab && c==CHAR_TAB)
978 if (pswit[ECHO_SWITCH])
979 g_print("\n%s\n",aline);
980 if (!pswit[OVERVIEW_SWITCH])
981 g_print(" Line %ld column %ld - Tab character?\n",
982 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
987 if (!eTilde && c==CHAR_TILDE)
990 * Often used by OCR software to indicate an
991 * unrecognizable character.
993 if (pswit[ECHO_SWITCH])
994 g_print("\n%s\n",aline);
995 if (!pswit[OVERVIEW_SWITCH])
996 g_print(" Line %ld column %ld - Tilde character?\n",
997 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1002 if (!eCarat && c==CHAR_CARAT)
1004 if (pswit[ECHO_SWITCH])
1005 g_print("\n%s\n",aline);
1006 if (!pswit[OVERVIEW_SWITCH])
1007 g_print(" Line %ld column %ld - Carat character?\n",
1008 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1013 if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
1015 if (pswit[ECHO_SWITCH])
1016 g_print("\n%s\n",aline);
1017 if (!pswit[OVERVIEW_SWITCH])
1018 g_print(" Line %ld column %ld - Forward slash?\n",
1019 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1025 * Report asterisks only in paranoid mode,
1026 * since they're often deliberate.
1028 if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
1031 if (pswit[ECHO_SWITCH])
1032 g_print("\n%s\n",aline);
1033 if (!pswit[OVERVIEW_SWITCH])
1034 g_print(" Line %ld column %ld - Asterisk?\n",
1035 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1044 * check_for_long_line:
1046 * Check for line too long.
/*
 * aline - the line to measure (UTF-8); its length is counted in
 *         characters, not bytes.
 *
 * The reported column is the line length itself, i.e. the report points
 * at the end of the line.
 */
1048 void check_for_long_line(const char *aline)
1050 if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
1052 if (pswit[ECHO_SWITCH])
1053 g_print("\n%s\n",aline);
1054 if (!pswit[OVERVIEW_SWITCH])
1055 g_print(" Line %ld column %ld - Long line %ld\n",
1056 linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
1063 * check_for_short_line:
1065 * Check for line too short.
1067 * This one is a bit trickier to implement: we don't want to
1068 * flag the last line of a paragraph for being short, so we
1069 * have to wait until we know that our current line is a
1070 * "normal" line, then report the _previous_ line if it was too
1071 * short. We also don't want to report indented lines like
1072 * chapter heads or formatted quotations. We therefore keep
1073 * last->len as the length of the last line examined, and
1074 * last->blen as the length of the last but one, and try to
1075 * suppress unnecessary warnings by checking that both were of
1076 * "normal" length. We keep the first character of the last
1077 * line in last->start, and if it was a space, we assume that
1078 * the formatting is deliberate. I can't figure out a way to
1079 * distinguish something like a quoted verse left-aligned or
1080 * the header or footer of a letter from a paragraph of short
1081 * lines - maybe if I examined the whole paragraph, and if the
1082 * para has less than, say, 8 lines and if all lines are short,
1083 * then just assume it's OK? Need to look at some texts to see
1084 * how often a formula like this would get the right result.
/*
 * aline - the current line; it only needs to be longer than one
 *         character.
 * last  - len, blen and start of the preceding lines, as described
 *         above.
 *
 * The report is about the previous line: it echoes prevline, uses
 * linecnt-1, and gives prevline's length as both column and length.
 */
1086 void check_for_short_line(const char *aline,const struct line_properties *last)
1088 if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
1089 last->len<SHORTEST_PG_LINE && last->blen>1 &&
1090 last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
1092 if (pswit[ECHO_SWITCH])
1093 g_print("\n%s\n",prevline);
1094 if (!pswit[OVERVIEW_SWITCH])
1095 g_print(" Line %ld column %ld - Short line %ld?\n",
1096 linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
1103 * check_for_starting_punctuation:
1105 * Look for punctuation other than full ellipses at start of line.
/*
 * aline - the line to test (UTF-8); an empty line is never reported.
 *
 * The first code point is looked up in ".?!,;:" with g_utf8_strchr().
 * A line that starts with the spaced ellipsis ". . ." is exempt.
 */
1107 void check_for_starting_punctuation(const char *aline)
1109 if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
1110 !g_str_has_prefix(aline,". . ."))
1112 if (pswit[ECHO_SWITCH])
1113 g_print("\n%s\n",aline);
1114 if (!pswit[OVERVIEW_SWITCH])
1115 g_print(" Line %ld column 1 - Begins with punctuation?\n",
1123 * check_for_spaced_emdash:
1125 * Check for spaced em-dashes.
1127 * We must check _all_ occurrences of "--" on the line
1128 * hence the loop - even if the first double-dash is OK
1129 * there may be another that's wrong later on.
/*
 * aline - the line to scan.
 *
 * For each "--" found, t points at the first dash and next at the
 * character after the second one. The search then resumes from next.
 */
1131 void check_for_spaced_emdash(const char *aline)
1133 const char *s,*t,*next;
1134 for (s=aline;t=strstr(s,"--");s=next)
1136 next=g_utf8_next_char(g_utf8_next_char(t));
/* Reported when a space precedes the dashes (the t>aline guard applies to this half only) or follows them. */
1137 if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
1138 g_utf8_get_char(next)==CHAR_SPACE)
1140 if (pswit[ECHO_SWITCH])
1141 g_print("\n%s\n",aline);
1142 if (!pswit[OVERVIEW_SWITCH])
1143 g_print(" Line %ld column %ld - Spaced em-dash?\n",
1144 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1152 * check_for_spaced_dash:
1154 * Check for spaced dashes.
/*
 * check_for_spaced_dash: query a single dash with a space beside it.
 *
 * aline - the UTF-8 line to examine.
 *
 * Two patterns are searched from the start of the line, so only the
 * first occurrence of each is examined:
 *   " -"  reported when the character after the dash is not another '-';
 *   "- "  reported when the dash is at the start of the line or the
 *         character before it is not another '-'.
 * The reported column is the character offset of the match plus one.
 *
 * NOTE(review): the "- " search is an else-branch; presumably it pairs
 * with the " -" strstr() test, so it is skipped whenever " -" occurs
 * anywhere in the line - braces are not visible here, confirm.
 */
1156 void check_for_spaced_dash(const char *aline)
1159 if ((s=strstr(aline," -")))
1161 if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
1163 if (pswit[ECHO_SWITCH])
1164 g_print("\n%s\n",aline);
1165 if (!pswit[OVERVIEW_SWITCH])
1166 g_print(" Line %ld column %ld - Spaced dash?\n",
1167 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1172 else if ((s=strstr(aline,"- ")))
1174 if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
1176 if (pswit[ECHO_SWITCH])
1177 g_print("\n%s\n",aline);
1178 if (!pswit[OVERVIEW_SWITCH])
1179 g_print(" Line %ld column %ld - Spaced dash?\n",
1180 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1188 * check_for_unmarked_paragraphs:
1190 * Check for unmarked paragraphs indicated by separate speakers.
1192 * May well be false positive:
1193 * "Bravo!" "Wonderful!" called the crowd.
1194 * but useful all the same.
/*
 * check_for_unmarked_paragraphs: query a closing double quote followed,
 * after white space, by an opening double quote on the same line.
 *
 * aline - the line to examine (searched bytewise with strstr()).
 *
 * The reported column is the character offset of the match plus one.
 *
 * NOTE(review): the two strstr() patterns below read identically in this
 * listing; presumably they differ in the amount of white space and a
 * test between them (not visible) makes the second a fallback - confirm
 * against the file before touching either literal.
 */
1196 void check_for_unmarked_paragraphs(const char *aline)
1199 s=strstr(aline,"\" \"");
1201 s=strstr(aline,"\" \"");
1204 if (pswit[ECHO_SWITCH])
1205 g_print("\n%s\n",aline);
1206 if (!pswit[OVERVIEW_SWITCH])
1207 g_print(" Line %ld column %ld - "
1208 "Query missing paragraph break?\n",
1209 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1216 * check_for_jeebies:
1218 * Check for "to he" and other easy h/b errors.
1220 * This is a very inadequate effort on the h/b problem,
1221 * but the phrase "to he" is always an error, whereas "to
1222 * be" is quite common.
1223 * Similarly, '"Quiet!", be said.' is a non-be error
1224 * "to he" is _not_ always an error!:
1225 * "Where they went to he couldn't say."
1226 * Another false positive:
1227 * What would "Cinderella" be without the . . .
1228 * and another: "If he wants to he can see for himself."
/*
 * check_for_jeebies: query fixed phrases that suggest an OCR confusion
 * between similar-looking letters.
 *
 * aline - the line to examine (searched bytewise with strstr()).
 *
 * Three groups of literal phrases are searched, each followed by its
 * own report:
 *   he/be   - " be could ", " be would ", " was be ", " be is ",
 *             " is be ", quote-then-" be ", " to he "
 *   had/bad - " the had ", " a had ", "<pronoun> bad "
 *   hut/but - "; hut ", ", hut "
 * The reported column is the character offset of the match plus one.
 *
 * NOTE(review): the tests that chain the searches (so that a later
 * strstr() only runs when the earlier ones found nothing) are not
 * visible in this listing - confirm before reordering.
 * NOTE(review): the two searches for quote-space-"be" read identically
 * here; presumably they differ in white space in the file - confirm.
 */
1230 void check_for_jeebies(const char *aline)
/* Group 1: he/be confusions. */
1233 s=strstr(aline," be could ");
1235 s=strstr(aline," be would ");
1237 s=strstr(aline," was be ");
1239 s=strstr(aline," be is ");
1241 s=strstr(aline," is be ");
1243 s=strstr(aline,"\", be ");
1245 s=strstr(aline,"\" be ");
1247 s=strstr(aline,"\" be ");
1249 s=strstr(aline," to he ");
1252 if (pswit[ECHO_SWITCH])
1253 g_print("\n%s\n",aline);
1254 if (!pswit[OVERVIEW_SWITCH])
1255 g_print(" Line %ld column %ld - Query he/be error?\n",
1256 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Group 2: had/bad confusions. */
1260 s=strstr(aline," the had ");
1262 s=strstr(aline," a had ");
1264 s=strstr(aline," they bad ");
1266 s=strstr(aline," she bad ");
1268 s=strstr(aline," he bad ");
1270 s=strstr(aline," you bad ");
1272 s=strstr(aline," i bad ");
1275 if (pswit[ECHO_SWITCH])
1276 g_print("\n%s\n",aline);
1277 if (!pswit[OVERVIEW_SWITCH])
1278 g_print(" Line %ld column %ld - Query had/bad error?\n",
1279 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Group 3: hut/but confusions after a semicolon or comma. */
1283 s=strstr(aline,"; hut ");
1285 s=strstr(aline,", hut ");
1288 if (pswit[ECHO_SWITCH])
1289 g_print("\n%s\n",aline);
1290 if (!pswit[OVERVIEW_SWITCH])
1291 g_print(" Line %ld column %ld - Query hut/but error?\n",
1292 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1299 * check_for_mta_from:
1301 * Special case - angled bracket in front of "From" placed there by an
1302 * MTA when sending an e-mail.
/*
 * check_for_mta_from: query the literal text ">From" anywhere in a line.
 *
 * aline - the line to examine (searched bytewise with strstr()).
 *
 * Only the first occurrence is located. The reported column is the
 * character offset of the '>' plus one.
 *
 * NOTE(review): the test on the search result is not visible in this
 * listing; presumably nothing is reported when strstr() returns NULL.
 */
1304 void check_for_mta_from(const char *aline)
1307 s=strstr(aline,">From");
1310 if (pswit[ECHO_SWITCH])
1311 g_print("\n%s\n",aline);
1312 if (!pswit[OVERVIEW_SWITCH])
1313 g_print(" Line %ld column %ld - "
1314 "Query angled bracket with From\n",
1315 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1322 * check_for_orphan_character:
1324 * Check for a single character line -
1325 * often an overflow from bad wrapping.
/*
 * check_for_orphan_character: query a line that consists of exactly one
 * character.
 *
 * aline - the UTF-8 line to examine.
 *
 * A line qualifies when its first character is non-zero and nothing
 * follows it. The capital letters I, V, X and L and any Unicode digit
 * are deliberately ignored, since a numeral alone on a line is common.
 * Any other lone character is reported at column 1.
 *
 * NOTE(review): the else that separates the "ignore" arm from the report
 * is not visible in this listing.
 */
1327 void check_for_orphan_character(const char *aline)
1330 c=g_utf8_get_char(aline);
1331 if (c && !*g_utf8_next_char(aline))
1333 if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
1334 ; /* Nothing - ignore numerals alone on a line. */
1337 if (pswit[ECHO_SWITCH])
1338 g_print("\n%s\n",aline);
1339 if (!pswit[OVERVIEW_SWITCH])
1340 g_print(" Line %ld column 1 - Query single character line\n",
1349 * check_for_pling_scanno:
1351 * Check for I" - often should be !
/*
 * check_for_pling_scanno: query a space, capital I and double quote in
 * sequence, where the I is often a misread exclamation mark.
 *
 * aline - the line to examine (searched bytewise with strstr()).
 *
 * Only the first occurrence is located.
 *
 * NOTE(review): unlike most sibling checks the column is the raw
 * character offset of the match with no "+1"; since the match begins at
 * the space, the value is the 1-based column of the character before the
 * space. Confirm whether this is intended before changing it - existing
 * expected outputs may depend on it.
 */
1353 void check_for_pling_scanno(const char *aline)
1356 s=strstr(aline," I\"");
1359 if (pswit[ECHO_SWITCH])
1360 g_print("\n%s\n",aline);
1361 if (!pswit[OVERVIEW_SWITCH])
1362 g_print(" Line %ld column %ld - Query I=exclamation mark?\n",
1363 linecnt,g_utf8_pointer_to_offset(aline,s));
1370 * check_for_extra_period:
1372 * Check for period without a capital letter. Cut-down from gutspell.
1373 * Only works when it happens on a single line.
/*
 * check_for_extra_period: query a period that is followed by a space and
 * then by a lower-case word, unless the word before the period looks
 * like an abbreviation.
 *
 * aline    - the UTF-8 line to examine.
 * warnings - read for isDutch, which enables a special case for an
 *            apostrophe + lower-case letter + space + upper-case letter
 *            following the period.
 *
 * Outline of what the visible lines do:
 *   - each ". " is located with strstr();
 *   - a period whose preceding character is not a letter is skipped;
 *   - the first letter or digit after the period is found, and the
 *     period is investigated only if that character is lower case;
 *   - the word in front of the period is copied into testword by
 *     walking backwards over letters, digits and in-word apostrophes;
 *   - testword is compared with abbrev[], tested for a leading digit,
 *     for being a single character, for being a Roman numeral and for
 *     containing a vowel (after canonical decomposition);
 *   - a surviving word is recorded in qperiod and reported, with the
 *     column being the character offset of the period plus one. Words
 *     already in qperiod are reported again only with VERBOSE_SWITCH.
 *
 * NOTE(review): the scan is introduced by a test of
 * pswit[PARANOID_SWITCH]; presumably the whole check runs only in
 * paranoid mode - the braces are not visible here, confirm.
 * NOTE(review): the outcome of each testword comparison (presumably a
 * flag that suppresses the report) is on lines not shown.
 */
1375 void check_for_extra_period(const char *aline,const struct warnings *warnings)
1377 const char *s,*t,*s1,*sprev;
1382 gunichar c,nc,pc,*decomposition;
1383 if (pswit[PARANOID_SWITCH])
1385 for (t=aline;t=strstr(t,". ");)
1389 t=g_utf8_next_char(t);
1390 /* start of line punctuation is handled elsewhere */
1393 if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
1395 t=g_utf8_next_char(t);
1398 if (warnings->isDutch)
1400 /* For Frank & Jeroen -- 's Middags case */
1401 gunichar c2,c3,c4,c5;
/*
 * NOTE(review): these look up to five characters past the period;
 * nothing visible here checks that the line is that long - confirm the
 * reads cannot run past the terminator.
 */
1402 c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
1403 c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
1404 c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
1405 c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
1406 if (CHAR_IS_APOSTROPHE(c2) &&
1407 g_unichar_islower(c3) && c4==CHAR_SPACE &&
1408 g_unichar_isupper(c5))
1410 t=g_utf8_next_char(t);
/* Skip the period and the space, then any non-alphanumeric run. */
1414 s1=g_utf8_next_char(g_utf8_next_char(t));
1415 while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
1416 !g_unichar_isdigit(g_utf8_get_char(s1)))
1417 s1=g_utf8_next_char(s1);
1418 if (g_unichar_islower(g_utf8_get_char(s1)))
1420 /* we have something to investigate */
1422 /* so let's go back and find out */
1423 nc=g_utf8_get_char(t);
1424 s1=g_utf8_prev_char(t);
1425 c=g_utf8_get_char(s1);
1426 sprev=g_utf8_prev_char(s1);
1427 pc=g_utf8_get_char(sprev);
1429 (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||
1430 g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&
1431 g_unichar_isalpha(nc)))
1436 sprev=g_utf8_prev_char(s1);
1437 pc=g_utf8_get_char(sprev);
1439 s1=g_utf8_next_char(s1);
1442 testword=g_strndup(s1,s-s1);
1444 testword=g_strdup(s1);
1445 for (i=0;*abbrev[i];i++)
1446 if (!strcmp(testword,abbrev[i]))
1448 if (g_unichar_isdigit(g_utf8_get_char(testword)))
1450 if (!*g_utf8_next_char(testword))
1452 if (isroman(testword))
1457 for (s=testword;*s;s=g_utf8_next_char(s))
1459 decomposition=g_unicode_canonical_decomposition(
1460 g_utf8_get_char(s),&len);
1461 if (g_utf8_strchr("aeiou",-1,decomposition[0]))
1463 g_free(decomposition);
1467 (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
1469 g_tree_insert(qperiod,g_strdup(testword),
1470 GINT_TO_POINTER(1));
1471 if (pswit[ECHO_SWITCH])
1472 g_print("\n%s\n",aline);
1473 if (!pswit[OVERVIEW_SWITCH])
1474 g_print(" Line %ld column %ld - Extra period?\n",
1475 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1481 t=g_utf8_next_char(t);
1487 * check_for_following_punctuation:
1489 * Check for words usually not followed by punctuation.
/*
 * check_for_following_punctuation: query punctuation that directly
 * follows a word which is not normally followed by it.
 *
 * aline - the UTF-8 line to examine.
 *
 * Each word is lower-cased with g_utf8_strdown() and compared with two
 * tables:
 *   nocomma[]  - reported when the character at s is , ; or :
 *   noperiod[] - reported when the character at s is . or !
 * The reported column is the character offset of s plus one.
 *
 * NOTE(review): the check is introduced by a test of
 * pswit[TYPO_SWITCH]; presumably it runs only when that switch is set.
 * The word-extraction loop that sets t, s and wordstart is on lines not
 * visible in this listing, as is the last g_print argument.
 */
1491 void check_for_following_punctuation(const char *aline)
1494 const char *s,*wordstart;
1497 if (pswit[TYPO_SWITCH])
1508 inword=g_utf8_strdown(t,-1);
/* Words that should not be followed by a comma, semicolon or colon. */
1510 for (i=0;*nocomma[i];i++)
1511 if (!strcmp(inword,nocomma[i]))
1513 c=g_utf8_get_char(s);
1514 if (c==',' || c==';' || c==':')
1516 if (pswit[ECHO_SWITCH])
1517 g_print("\n%s\n",aline);
1518 if (!pswit[OVERVIEW_SWITCH])
1519 g_print(" Line %ld column %ld - "
1520 "Query punctuation after %s?\n",
1521 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
/* Words that should not be followed by a period or exclamation mark. */
1527 for (i=0;*noperiod[i];i++)
1528 if (!strcmp(inword,noperiod[i]))
1530 c=g_utf8_get_char(s);
1531 if (c=='.' || c=='!')
1533 if (pswit[ECHO_SWITCH])
1534 g_print("\n%s\n",aline);
1535 if (!pswit[OVERVIEW_SWITCH])
1536 g_print(" Line %ld column %ld - "
1537 "Query punctuation after %s?\n",
1538 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
1552 * Check for commonly mistyped words,
1553 * and digits like 0 for O in a word.
/*
 * check_for_typos: run every word of a line through the typo and scanno
 * tests.
 *
 * aline    - the UTF-8 line to examine.
 * warnings - read for the digit flag, which gates the paranoid query of
 *            a standalone "0" or "1".
 *
 * Words are taken from the line with getaword(). The visible tests are:
 *   - mixdigit(): a word mixing digits and letters ("Query digit in");
 *   - an upper-case or title-case character after a lower-case one,
 *     with exceptions for a leading "mc"/"mac" and for a character that
 *     follows an apostrophe;
 *   - with TYPO_SWITCH: unlikely word starts (nostart[]) and endings
 *     (noend[]), a set of unlikely letter runs ("cb", "gbt", "pbt",
 *     "tbs", "mrn", "ahle", "ihle", "tbi", "tbe", "ii"), words with no
 *     vowel or no consonant, the typo[] table, and the single letters
 *     s, l, m, i, j, d, n; okword[] and Roman numerals are excluded;
 *   - the user's own list (usertypo tree): "Query possible scanno";
 *   - with PARANOID_SWITCH and warnings->digit: a standalone "0" or "1".
 * Tests compare against testword, the case-folded copy of the word.
 * qword maps each queried word to an int counter so that repeats can be
 * suppressed unless VERBOSE_SWITCH is set.
 *
 * NOTE(review): most assignments to istypo, vowel, consonant and alower
 * are on lines not visible in this listing; the summary above infers
 * their role from the surrounding tests and comments - confirm.
 * NOTE(review): the "Query word" report uses offset+1 for the column,
 * while the scanno and standalone-digit reports use offset+2 - confirm
 * that the difference is intentional.
 */
1555 void check_for_typos(const char *aline,struct warnings *warnings)
1557 const char *s,*t,*nt,*wordstart;
1559 gunichar *decomposition;
1561 int i,vowel,consonant,*dupcnt;
1562 gboolean isdup,istypo,alower;
1565 gsize decomposition_len;
1569 inword=getaword(&s);
1573 continue; /* don't bother with empty lines */
1575 if (mixdigit(inword))
1577 if (pswit[ECHO_SWITCH])
1578 g_print("\n%s\n",aline);
1579 if (!pswit[OVERVIEW_SWITCH])
1580 g_print(" Line %ld column %ld - Query digit in %s\n",
1581 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
1586 * Put the word through a series of tests for likely typos and OCR
1589 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
1593 for (t=inword;*t;t=g_utf8_next_char(t))
1595 c=g_utf8_get_char(t);
1596 nt=g_utf8_next_char(t);
1597 /* lowercase for testing */
1598 if (g_unichar_islower(c))
1600 if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
1603 * We have an uppercase mid-word. However, there are
1605 * Mac and Mc like McGill
1606 * French contractions like l'Abbe
1608 offset=g_utf8_pointer_to_offset(inword,t);
1610 pc=g_utf8_get_char(g_utf8_prev_char(t));
1613 if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
1614 offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
1615 g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
1616 CHAR_IS_APOSTROPHE(pc))
1622 testword=g_utf8_casefold(inword,-1);
1624 if (pswit[TYPO_SWITCH])
1627 * Check for certain unlikely two-letter combinations at word
1630 len=g_utf8_strlen(testword,-1);
1633 for (i=0;*nostart[i];i++)
1634 if (g_str_has_prefix(testword,nostart[i]))
1636 for (i=0;*noend[i];i++)
1637 if (g_str_has_suffix(testword,noend[i]))
1640 /* ght is common, gbt never. Like that. */
1641 if (strstr(testword,"cb"))
1643 if (strstr(testword,"gbt"))
1645 if (strstr(testword,"pbt"))
1647 if (strstr(testword,"tbs"))
1649 if (strstr(testword,"mrn"))
1651 if (strstr(testword,"ahle"))
1653 if (strstr(testword,"ihle"))
1656 * "TBE" does happen - like HEARTBEAT - but uncommon.
1657 * Also "TBI" - frostbite, outbid - but uncommon.
1658 * Similarly "ii" like Hawaii, or Pompeii, and in Roman
1659 * numerals, but "ii" is a common scanno.
1661 if (strstr(testword,"tbi"))
1663 if (strstr(testword,"tbe"))
1665 if (strstr(testword,"ii"))
1668 * Check for no vowels or no consonants.
1669 * If none, flag a typo.
1671 if (!istypo && len>1)
1674 for (t=testword;*t;t=g_utf8_next_char(t))
1676 c=g_utf8_get_char(t);
1678 g_unicode_canonical_decomposition(c,&decomposition_len);
1679 if (c=='y' || g_unichar_isdigit(c))
1681 /* Yah, this is loose. */
1685 else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
1689 g_free(decomposition);
1691 if (!vowel || !consonant)
1695 * Now exclude the word from being reported if it's in
1698 for (i=0;*okword[i];i++)
1699 if (!strcmp(testword,okword[i]))
1702 * What looks like a typo may be a Roman numeral.
1705 if (istypo && isroman(testword))
1707 /* Check the manual list of typos. */
1709 for (i=0;*typo[i];i++)
1710 if (!strcmp(testword,typo[i]))
1713 * Check lowercase s, l, i and m - special cases.
1714 * "j" - often a semi-colon gone wrong.
1715 * "d" for a missing apostrophe - he d
1718 if (!istypo && len==1 &&
1719 g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
/* Duplicate tracking: qword maps a queried word to its int counter. */
1723 dupcnt=g_tree_lookup(qword,testword);
1727 isdup=!pswit[VERBOSE_SWITCH];
1731 dupcnt=g_new0(int,1);
1732 g_tree_insert(qword,g_strdup(testword),dupcnt);
1737 if (pswit[ECHO_SWITCH])
1738 g_print("\n%s\n",aline);
1739 if (!pswit[OVERVIEW_SWITCH])
1741 g_print(" Line %ld column %ld - Query word %s",
1742 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
1744 if (!pswit[VERBOSE_SWITCH])
1745 g_print(" - not reporting duplicates");
1753 /* check the user's list of typos */
1754 if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
1756 if (pswit[ECHO_SWITCH])
1757 g_print("\n%s\n",aline);
1758 if (!pswit[OVERVIEW_SWITCH])
1759 g_print(" Line %ld column %ld - Query possible scanno %s\n",
1760 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
1762 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
1764 if (pswit[PARANOID_SWITCH] && warnings->digit)
1766 /* In paranoid mode, query all 0 and 1 standing alone. */
1767 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
1769 if (pswit[ECHO_SWITCH])
1770 g_print("\n%s\n",aline);
1771 if (!pswit[OVERVIEW_SWITCH])
1772 g_print(" Line %ld column %ld - Query standalone %s\n",
1773 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
1784 * check_for_misspaced_punctuation:
1786 * Look for added or missing spaces around punctuation and quotes.
1787 * If there is a punctuation character like ! with no space on
1788 * either side, suspect a missing!space. If there are spaces on
1789 * both sides , assume a typo. If we see a double quote with no
1790 * space or punctuation on either side of it, assume unspaced
1791 * quotes "like"this.
/*
 * check_for_misspaced_punctuation: query added or missing spaces around
 * punctuation and quotes.
 *
 * aline       - the UTF-8 line to examine.
 * parities    - running open/close state for quotes; dquote and squote
 *               are toggled each time a matching quote is counted.
 * isemptyline - suppresses the "Spaced punctuation?" queries when set.
 *
 * The line is scanned several times; every scan except the quote-parity
 * ones starts at the second character and keeps the previous, current
 * and next characters in pc, c and nc:
 *   1. . ? ! , ; : _ with letters on both sides, or strict punctuation
 *      directly followed by a letter   -> "Missing space?"
 *      the same set with a space before and a space or end of line
 *      after                           -> "Spaced punctuation?"
 *   2. ? ! , ; : preceded by a space and not followed by one
 *                                      -> "Spaced punctuation?"
 *   3. a period preceded by a space and followed by a letter
 *                                      -> "Spaced punctuation?"
 *   4. a double quote with no space or punctuation on either side
 *                                      -> "Unspaced quotes?"
 *   5. double-quote parity: each quote is tested against the set of
 *      characters allowed to follow it -> "Wrongspaced quotes?"
 *   6. a double quote at the start of the line followed by closing
 *      punctuation or a space          -> "Wrongspaced quotes?"
 *   7. with SQUOTE_SWITCH, the same parity test for single quotes that
 *      are not between two letters     -> "Wrongspaced singlequotes?"
 *
 * NOTE(review): the "pc=c; c=nc;" style updates, the isacro/isellipsis
 * assignments and the branches choosing between the closing-quote and
 * opening-quote character sets are on lines not visible in this
 * listing; the summary infers them from the visible tests - confirm.
 */
1793 void check_for_misspaced_punctuation(const char *aline,
1794 struct parities *parities,gboolean isemptyline)
1796 gboolean isacro,isellipsis;
1798 gunichar c,nc,pc,n2c;
/* Scan 1: missing space after, or spaces around, general punctuation. */
1800 c=g_utf8_get_char(aline);
1801 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1802 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1806 nc=g_utf8_get_char(g_utf8_next_char(s));
1807 /* For each character in the line after the first. */
1808 if (g_utf8_strchr(".?!,;:_",-1,c)) /* if it's punctuation */
1810 /* we need to suppress warnings for acronyms like M.D. */
1812 /* we need to suppress warnings for ellipsis . . . */
1815 * If there are letters on both sides of it or
1816 * if it's strict punctuation followed by an alpha.
1818 if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
1819 g_utf8_strchr("?!,;:",-1,c)))
1823 if (g_utf8_pointer_to_offset(aline,s)>2 &&
1824 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
1826 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
1832 if (pswit[ECHO_SWITCH])
1833 g_print("\n%s\n",aline);
1834 if (!pswit[OVERVIEW_SWITCH])
1835 g_print(" Line %ld column %ld - Missing space?\n",
1836 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1841 if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
1844 * If there are spaces on both sides,
1845 * or space before and end of line.
1849 if (g_utf8_pointer_to_offset(aline,s)>2 &&
1850 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
/*
 * NOTE(review): this branch is also entered when nc is 0 (end of line);
 * the look-ahead below then steps over the terminator - confirm that
 * the read stays inside the buffer.
 */
1852 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
1856 if (!isemptyline && !isellipsis)
1858 if (pswit[ECHO_SWITCH])
1859 g_print("\n%s\n",aline);
1860 if (!pswit[OVERVIEW_SWITCH])
1861 g_print(" Line %ld column %ld - "
1862 "Spaced punctuation?\n",linecnt,
1863 g_utf8_pointer_to_offset(aline,s)+1);
/* Scan 2: punctuation that must never have a space in front of it. */
1870 /* Split out the characters that CANNOT be preceded by space. */
1871 c=g_utf8_get_char(aline);
1872 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1873 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1877 nc=g_utf8_get_char(g_utf8_next_char(s));
1878 /* for each character in the line after the first */
1879 if (g_utf8_strchr("?!,;:",-1,c))
1881 /* if it's punctuation that _cannot_ have a space before it */
1882 if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
1885 * If nc DOES == space,
1886 * it was already reported just above.
1888 if (pswit[ECHO_SWITCH])
1889 g_print("\n%s\n",aline);
1890 if (!pswit[OVERVIEW_SWITCH])
1891 g_print(" Line %ld column %ld - Spaced punctuation?\n",
1892 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Scan 3: a period with a space before it and a letter after it. */
1899 * Special case " .X" where X is any alpha.
1900 * This plugs a hole in the acronym code above.
1901 * Inelegant, but maintainable.
1903 c=g_utf8_get_char(aline);
1904 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1905 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1909 nc=g_utf8_get_char(g_utf8_next_char(s));
1910 /* for each character in the line after the first */
1913 /* if it's a period */
1914 if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
1917 * If the period follows a space and
1918 * is followed by a letter.
1920 if (pswit[ECHO_SWITCH])
1921 g_print("\n%s\n",aline);
1922 if (!pswit[OVERVIEW_SWITCH])
1923 g_print(" Line %ld column %ld - Spaced punctuation?\n",
1924 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Scan 4: double quotes jammed between two words. */
1930 c=g_utf8_get_char(aline);
1931 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1932 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1936 nc=g_utf8_get_char(g_utf8_next_char(s));
1937 /* for each character in the line after the first */
1938 if (CHAR_IS_DQUOTE(c))
1940 if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
1941 !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
1942 !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
1944 if (pswit[ECHO_SWITCH])
1945 g_print("\n%s\n",aline);
1946 if (!pswit[OVERVIEW_SWITCH])
1947 g_print(" Line %ld column %ld - Unspaced quotes?\n",
1948 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Scan 5: double-quote parity, starting from the first character. */
1954 /* Check parity of quotes. */
1955 nc=g_utf8_get_char(aline);
1956 for (s=aline;*s;s=g_utf8_next_char(s))
1959 nc=g_utf8_get_char(g_utf8_next_char(s));
1960 if (CHAR_IS_DQUOTE(c))
1964 parities->dquote=!parities->dquote;
1965 parity=parities->dquote;
1967 else if (c==CHAR_LD_QUOTE)
/* Characters accepted after what is taken to be a closing quote. */
1974 if (!g_utf8_strchr("_-.'`‘’/,;:!?)]} ",-1,nc))
1976 if (pswit[ECHO_SWITCH])
1977 g_print("\n%s\n",aline);
1978 if (!pswit[OVERVIEW_SWITCH])
1979 g_print(" Line %ld column %ld - "
1980 "Wrongspaced quotes?\n",
1981 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/*
 * Characters accepted after what is taken to be an opening quote:
 * letters, digits and the listed punctuation, which includes the
 * curly single quotes.
 */
1989 if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
1990 !g_utf8_strchr("_-/.'`‘’([{$",-1,nc) || !nc)
1992 if (pswit[ECHO_SWITCH])
1993 g_print("\n%s\n",aline);
1994 if (!pswit[OVERVIEW_SWITCH])
1995 g_print(" Line %ld column %ld - "
1996 "Wrongspaced quotes?\n",
1997 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Scan 6: a double quote in column 1 followed by closing punctuation. */
2004 c=g_utf8_get_char(aline);
2005 if (CHAR_IS_DQUOTE(c))
2007 if (g_utf8_strchr(",;:!?)]} ",-1,
2008 g_utf8_get_char(g_utf8_next_char(aline))))
2010 if (pswit[ECHO_SWITCH])
2011 g_print("\n%s\n",aline);
2012 if (!pswit[OVERVIEW_SWITCH])
2013 g_print(" Line %ld column 1 - Wrongspaced quotes?\n",
/* Scan 7: single-quote parity, only when SQUOTE_SWITCH is set. */
2019 if (pswit[SQUOTE_SWITCH])
2021 nc=g_utf8_get_char(aline);
2022 for (s=aline;*s;s=g_utf8_next_char(s))
2025 nc=g_utf8_get_char(g_utf8_next_char(s));
/* A single quote with letters on both sides is not counted. */
2026 if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&
2027 !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
2028 !g_unichar_isalpha(nc)))
2030 parities->squote=!parities->squote;
2031 if (!parities->squote)
2034 if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
2036 if (pswit[ECHO_SWITCH])
2037 g_print("\n%s\n",aline);
2038 if (!pswit[OVERVIEW_SWITCH])
2039 g_print(" Line %ld column %ld - "
2040 "Wrongspaced singlequotes?\n",
2041 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2049 if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
2050 !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
2052 if (pswit[ECHO_SWITCH])
2053 g_print("\n%s\n",aline);
2054 if (!pswit[OVERVIEW_SWITCH])
2055 g_print(" Line %ld column %ld - "
2056 "Wrongspaced singlequotes?\n",
2057 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2068 * check_for_double_punctuation:
2070 * Look for double punctuation like ,. or ,,
2071 * Thanks to DW for the suggestion!
2072 * In books with references, ".," and ".;" are common
2073 * e.g. "etc., etc.," and vol. 1.; vol 3.;
2074 * OTOH, from my initial tests, there are also fairly
2075 * common errors. What to do? Make these cases paranoid?
2076 * ".," is the most common, so warnings->dotcomma is used
2077 * to suppress detailed reporting if it occurs often.
/*
 * check_for_double_punctuation: query two punctuation characters in a
 * row, such as ",." or ",,".
 *
 * aline    - the UTF-8 line to examine.
 * warnings - dotcomma and isFrench are read.
 *
 * For every pair of adjacent characters that are both in ". ? ! , ; :"
 * the pair is accepted without a query when:
 *   - both are the same and are one of . ? ! (so "..", "!!", "??");
 *   - the pair is ".," and warnings->dotcomma is not set;
 *   - warnings->isFrench is set and the text at s starts with an
 *     ellipsis joined to one of , ; : ! ? in either order.
 * Anything else is reported, the column being the character offset of
 * the first character of the pair plus one.
 *
 * NOTE(review): the French test is written out twice; the second copy
 * is followed by a re-read of nc, which presumably steps over the
 * ellipsis - the statements in between are not visible in this
 * listing, confirm.
 */
2079 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
2083 nc=g_utf8_get_char(aline);
2084 for (s=aline;*s;s=g_utf8_next_char(s))
2087 nc=g_utf8_get_char(g_utf8_next_char(s));
2088 /* for each punctuation character in the line */
2089 if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
2090 g_utf8_strchr(".?!,;:",-1,nc))
2092 /* followed by punctuation, it's a query, unless . . . */
2093 if (c==nc && (c=='.' || c=='?' || c=='!') ||
2094 !warnings->dotcomma && c=='.' && nc==',' ||
2095 warnings->isFrench && g_str_has_prefix(s,",...") ||
2096 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2097 warnings->isFrench && g_str_has_prefix(s,";...") ||
2098 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2099 warnings->isFrench && g_str_has_prefix(s,":...") ||
2100 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2101 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2102 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2103 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2104 warnings->isFrench && g_str_has_prefix(s,"...?"))
2106 if (warnings->isFrench && g_str_has_prefix(s,",...") ||
2107 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2108 warnings->isFrench && g_str_has_prefix(s,";...") ||
2109 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2110 warnings->isFrench && g_str_has_prefix(s,":...") ||
2111 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2112 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2113 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2114 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2115 warnings->isFrench && g_str_has_prefix(s,"...?"))
2118 nc=g_utf8_get_char(g_utf8_next_char(s));
2120 ; /* do nothing for .. !! and ?? which can be legit */
2124 if (pswit[ECHO_SWITCH])
2125 g_print("\n%s\n",aline);
2126 if (!pswit[OVERVIEW_SWITCH])
2127 g_print(" Line %ld column %ld - Double punctuation?\n",
2128 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2137 * check_for_spaced_quotes:
/*
 * check_for_spaced_quotes: query a quote mark that has a space on both
 * sides.
 *
 * aline - the line to examine (searched bytewise with strstr()).
 *
 * First every occurrence of space, straight double quote, space is
 * reported as "Spaced doublequote?". Then, for each entry of the local
 * single_quotes table, a three-character pattern (space, that quote,
 * space) is built in a GString and every occurrence is reported as
 * "Spaced singlequote?". After each hit the search resumes two
 * characters past the start of the match, so the trailing space can
 * begin the next match. The reported column is the character offset of
 * the leading space plus one.
 *
 * NOTE(review): the rest of the single_quotes initialiser and the
 * statements that reset s before each search are on lines not visible
 * in this listing.
 */
2139 void check_for_spaced_quotes(const char *aline)
2143 const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,
2147 while ((t=strstr(s," \" ")))
2149 if (pswit[ECHO_SWITCH])
2150 g_print("\n%s\n",aline);
2151 if (!pswit[OVERVIEW_SWITCH])
2152 g_print(" Line %ld column %ld - Spaced doublequote?\n",
2153 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2156 s=g_utf8_next_char(g_utf8_next_char(t));
/* One reusable buffer holds the pattern for each single-quote variant. */
2158 pattern=g_string_new(NULL);
2159 for(i=0;i<G_N_ELEMENTS(single_quotes);i++)
2161 g_string_assign(pattern," ");
2162 g_string_append_unichar(pattern,single_quotes[i]);
2163 g_string_append_c(pattern,' ');
2165 while ((t=strstr(s,pattern->str)))
2167 if (pswit[ECHO_SWITCH])
2168 g_print("\n%s\n",aline);
2169 if (!pswit[OVERVIEW_SWITCH])
2170 g_print(" Line %ld column %ld - Spaced singlequote?\n",
2171 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2174 s=g_utf8_next_char(g_utf8_next_char(t));
2177 g_string_free(pattern,TRUE);
2181 * check_for_miscased_genative:
2183 * Check special case of 'S instead of 's at end of word.
/*
 * check_for_miscased_genative: query a capital S after an apostrophe
 * that follows a lower-case letter, as in "John'S".
 *
 * aline - the UTF-8 line to examine; scanning starts at the second
 *         character so that a previous character is always available.
 *
 * pc, c and nc hold the previous, current and next characters. The
 * reported column is the character offset of the apostrophe plus two,
 * i.e. the column of the S.
 *
 * NOTE(review): the "pc=c; c=nc;" updates at the top of the loop are on
 * lines not visible in this listing.
 */
2185 void check_for_miscased_genative(const char *aline)
2191 c=g_utf8_get_char(aline);
2192 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2193 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2197 nc=g_utf8_get_char(g_utf8_next_char(s));
2198 if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))
2200 if (pswit[ECHO_SWITCH])
2201 g_print("\n%s\n",aline);
2202 if (!pswit[OVERVIEW_SWITCH])
2203 g_print(" Line %ld column %ld - Capital \"S\"?\n",
2204 linecnt,g_utf8_pointer_to_offset(aline,s)+2);
2212 * check_end_of_line:
2214 * Now check special cases - start and end of line -
2215 * for single and double quotes. Start is sometimes [sic]
2216 * but better to query it anyway.
2217 * While we're here, check for dash at end of line.
/*
 * check_end_of_line: query quotes separated by a space from the text at
 * either end of the line, and a lone hyphen at the end of the line.
 *
 * aline    - the UTF-8 line to examine; lines of one character or less
 *            fail the length test on the second visible line.
 * warnings - hyphen is read and, together with PARANOID_SWITCH, gates
 *            the end-of-line hyphen query.
 *
 * Visible tests:
 *   - the last character is a double or single quote and the one before
 *     it is a space -> "Spaced quote?" at the last column;
 *   - the first character is a single quote and the second is a space
 *     -> "Spaced quote?" at column 1;
 *   - walking back from the end over characters no greater than a
 *     space, the first other character is '-' and is not preceded by
 *     another '-' -> "Hyphen at end of line?".
 *
 * NOTE(review): the braces are not visible in this listing, so it
 * cannot be confirmed here whether the second and third tests sit
 * inside the length guard.
 * NOTE(review): the hyphen report passes the raw character offset with
 * no "+1", unlike the quote reports - confirm that this is intended.
 */
2219 void check_end_of_line(const char *aline,struct warnings *warnings)
2224 lbytes=strlen(aline);
2225 if (g_utf8_strlen(aline,lbytes)>1)
2227 s=g_utf8_prev_char(aline+lbytes);
2228 c1=g_utf8_get_char(s);
2229 c2=g_utf8_get_char(g_utf8_prev_char(s));
2230 if ((CHAR_IS_DQUOTE(c1) || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)
2232 if (pswit[ECHO_SWITCH])
2233 g_print("\n%s\n",aline);
2234 if (!pswit[OVERVIEW_SWITCH])
2235 g_print(" Line %ld column %ld - Spaced quote?\n",linecnt,
2236 g_utf8_strlen(aline,lbytes));
2240 c1=g_utf8_get_char(aline);
2241 c2=g_utf8_get_char(g_utf8_next_char(aline));
2242 if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)
2244 if (pswit[ECHO_SWITCH])
2245 g_print("\n%s\n",aline);
2246 if (!pswit[OVERVIEW_SWITCH])
2247 g_print(" Line %ld column 1 - Spaced quote?\n",linecnt);
2252 * Dash at end of line may well be legit - paranoid mode only
2253 * and don't report em-dash at line-end.
2255 if (pswit[PARANOID_SWITCH] && warnings->hyphen)
/* Step back over trailing spaces and control characters. */
2257 for (s=g_utf8_prev_char(aline+lbytes);
2258 s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
2260 if (g_utf8_get_char(s)=='-' &&
2261 g_utf8_get_char(g_utf8_prev_char(s))!='-')
2263 if (pswit[ECHO_SWITCH])
2264 g_print("\n%s\n",aline);
2265 if (!pswit[OVERVIEW_SWITCH])
2266 g_print(" Line %ld column %ld - "
2267 "Hyphen at end of line?\n",
2268 linecnt,g_utf8_pointer_to_offset(aline,s));
2275 * check_for_unspaced_bracket:
2277 * Brackets are often unspaced, but shouldn't be surrounded by alpha.
2278 * If so, suspect a scanno like "a]most".
/*
 * check_for_unspaced_bracket: query a bracket that has a letter on both
 * sides, which suggests a scanno such as "a]most".
 *
 * aline - the UTF-8 line to examine; scanning starts at the second
 *         character and stops before the last, so a bracket at either
 *         end of the line is never tested.
 *
 * pc, c and nc hold the previous, current and next characters; c is
 * tested against the set { [ ( ) ] }.
 *
 * NOTE(review): the reported column is the raw character offset of the
 * bracket with no "+1", i.e. the column of the letter in front of it -
 * confirm that this is intended.
 * NOTE(review): the "pc=c; c=nc;" updates at the top of the loop are on
 * lines not visible in this listing.
 */
2280 void check_for_unspaced_bracket(const char *aline)
2284 c=g_utf8_get_char(aline);
2285 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2286 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2290 nc=g_utf8_get_char(g_utf8_next_char(s));
2293 /* for each bracket character in the line except 1st & last */
2294 if (g_utf8_strchr("{[()]}",-1,c) &&
2295 g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
2297 if (pswit[ECHO_SWITCH])
2298 g_print("\n%s\n",aline);
2299 if (!pswit[OVERVIEW_SWITCH])
2300 g_print(" Line %ld column %ld - Unspaced bracket?\n",
2301 linecnt,g_utf8_pointer_to_offset(aline,s));
2309 * check_for_unpunctuated_endquote:
2311 void check_for_unpunctuated_endquote(const char *aline)
2316 c=g_utf8_get_char(aline);
2317 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2318 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2322 qc=CHAR_IS_DQUOTE(c)?QUOTE_CLASS(c):INVALID_QUOTE;
2323 nc=g_utf8_get_char(g_utf8_next_char(s));
2324 /* for each character in the line except 1st */
2325 if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) && isalpha(pc))
2327 if (pswit[ECHO_SWITCH])
2328 g_print("\n%s\n",aline);
2329 if (!pswit[OVERVIEW_SWITCH])
2330 g_print(" Line %ld column %ld - "
2331 "endquote missing punctuation?\n",
2332 linecnt,g_utf8_pointer_to_offset(aline,s));
2340 * check_for_html_tag:
2342 * Check for <HTML TAG>.
2344 * If there is a < in the line, followed at some point
2345 * by a > then we suspect HTML.
/*
 * check_for_html_tag: query text that looks like an HTML tag.
 *
 * aline - the line to examine (searched bytewise with strchr()).
 *
 * The first '<' on the line is located, then the first '>' after it.
 * The text from '<' to '>' inclusive is copied with g_strndup() and
 * printed in the query. The reported column is the character offset of
 * the '<' plus one.
 *
 * NOTE(review): the tests on the two search results and the release of
 * the copied tag are on lines not visible in this listing.
 */
2347 void check_for_html_tag(const char *aline)
2349 const char *open,*close;
2351 open=strchr(aline,'<');
2354 close=strchr(g_utf8_next_char(open),'>');
2357 if (pswit[ECHO_SWITCH])
2358 g_print("\n%s\n",aline);
2359 if (!pswit[OVERVIEW_SWITCH])
2361 tag=g_strndup(open,close-open+1);
2362 g_print(" Line %ld column %ld - HTML Tag? %s \n",
2363 linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
2373 * check_for_html_entity:
2375 * Check for &symbol; HTML.
2377 * If there is a & in the line, followed at
2378 * some point by a ; then we suspect HTML.
2380 void check_for_html_entity(const char *aline)
2382 const char *s,*amp,*scolon;
2384 amp=strchr(aline,'&');
2387 scolon=strchr(amp,';');
2390 for (s=amp;s<scolon;s=g_utf8_next_char(s))
2391 if (g_utf8_get_char(s)==CHAR_SPACE)
2392 break; /* Don't report "Jones & Son;" */
2395 if (pswit[ECHO_SWITCH])
2396 g_print("\n%s\n",aline);
2397 if (!pswit[OVERVIEW_SWITCH])
2399 entity=g_strndup(amp,scolon-amp+1);
2400 g_print(" Line %ld column %d - HTML symbol? %s \n",
2401 linecnt,(int)(amp-aline)+1,entity);
2412 * check_for_omitted_punctuation:
2414 * Check for omitted punctuation at end of paragraph by working back
2415 * through prevline. DW.
2416 * Need to check this only for "normal" paras.
2417 * So what is a "normal" para?
2418 * Not normal if one-liner (chapter headings, etc.)
2419 * Not normal if doesn't contain at least one locase letter
2420 * Not normal if starts with space
/*
 * check_for_omitted_punctuation: query a paragraph whose last line ends
 * in a letter rather than in punctuation.
 *
 * prevline        - the last line of the paragraph just ended.
 * last            - properties of that line; blen is read.
 * start_para_line - line number on which the paragraph started.
 *
 * The check applies only when prevline contains at least one letter,
 * last->blen is greater than 2, the paragraph started before line
 * linecnt-1 (so one-line paragraphs are excluded) and prevline does not
 * begin with a space or control character.
 *
 * Working back from the end of prevline, trailing closing or neutral
 * quotes are stepped over first. The scan then continues backwards: a
 * letter produces the "No punctuation at para end?" query against line
 * linecnt-1, the column being the character length of prevline, while
 * one of - . : ! ( [ { ? } ] ) is tested for separately.
 *
 * NOTE(review): the statements that set closing_quote to TRUE and those
 * that leave the backward scan are on lines not visible in this
 * listing; presumably the scan stops at the first letter or listed
 * punctuation character - confirm.
 */
2422 void check_for_omitted_punctuation(const char *prevline,
2423 struct line_properties *last,int start_para_line)
2425 gboolean letter_on_line=FALSE;
2428 gboolean closing_quote;
2429 for (s=prevline;*s;s=g_utf8_next_char(s))
2430 if (g_unichar_isalpha(g_utf8_get_char(s)))
2432 letter_on_line=TRUE;
2436 * This next "if" is a problem.
2437 * If we say "start_para_line <= linecnt - 1", that includes
2438 * one-line "paragraphs" like chapter heads. Lotsa false positives.
2439 * If we say "start_para_line < linecnt - 1" it doesn't, but then it
2440 * misses genuine one-line paragraphs.
2442 if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
2443 g_utf8_get_char(prevline)>CHAR_SPACE)
2445 s=prevline+strlen(prevline);
2448 s=g_utf8_prev_char(s);
2449 c=g_utf8_get_char(s);
2450 if (QUOTE_CLASS(c)==CLOSING_QUOTE || QUOTE_CLASS(c)==NEUTRAL_QUOTE)
2453 closing_quote=FALSE;
2454 } while (closing_quote && s>prevline);
2455 for (;s>prevline;s=g_utf8_prev_char(s))
2457 if (g_unichar_isalpha(g_utf8_get_char(s)))
2459 if (pswit[ECHO_SWITCH])
2460 g_print("\n%s\n",prevline);
2461 if (!pswit[OVERVIEW_SWITCH])
2462 g_print(" Line %ld column %ld - "
2463 "No punctuation at para end?\n",
2464 linecnt-1,g_utf8_strlen(prevline,-1));
2469 if (g_utf8_strchr("-.:!([{?}])",-1,g_utf8_get_char(s)))
/*
 * report_duplicate_queries: print a note saying how many times a queried
 * word was duplicated.
 *
 * key   - the queried word, used as a C string.
 * value - not used on the visible lines.
 * data  - not used on the visible lines.
 *
 * NOTE(review): the (key, value, data) signature returning gboolean
 * matches a GLib tree traversal callback; presumably value is the int
 * counter stored in qword and the return value tells the traversal to
 * continue. The count argument and the return statement are on lines
 * not visible in this listing - confirm.
 */
2475 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
2477 const char *word=key;
2480 g_print("\nNote: Queried word %s was duplicated %d times\n",
/*
 * print_as_windows_1252: write a UTF-8 string to stdout converted to
 * the WINDOWS-1252 character set.
 *
 * string - the UTF-8 text to print.
 *
 * The iconv converter is kept in a function-level static and opened on
 * first use. When it cannot be opened, the string is written unchanged
 * with fputs(). The output buffer is sized from the input byte length
 * plus one.
 *
 * NOTE(review): the block that closes the converter and resets it to
 * (GIConv)-1 is presumably entered when string is NULL, giving callers
 * a way to release it - the guarding test is on a line not visible in
 * this listing, confirm.
 * NOTE(review): the result of g_iconv() is not checked on the visible
 * lines, so input that cannot be represented in WINDOWS-1252 may end
 * the conversion early without any diagnostic - confirm the intended
 * behaviour.
 */
2485 void print_as_windows_1252(const char *string)
2487 gsize inbytes,outbytes;
2489 static GIConv converter=(GIConv)-1;
2492 if (converter!=(GIConv)-1)
2493 g_iconv_close(converter);
2494 converter=(GIConv)-1;
2497 if (converter==(GIConv)-1)
2498 converter=g_iconv_open("WINDOWS-1252","UTF-8");
2499 if (converter!=(GIConv)-1)
2501 inbytes=outbytes=strlen(string);
2502 bp=buf=g_malloc(outbytes+1);
2503 g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
2509 fputs(string,stdout);
/*
 * print_as_utf_8: write a string to stdout unchanged.
 *
 * string - the text to print; it is passed straight to fputs(), so no
 *          conversion and no trailing newline are added.
 */
2512 void print_as_utf_8(const char *string)
2514 fputs(string,stdout);
2522 void procfile(const char *filename)
2525 gchar *parastart=NULL; /* first line of current para */
2526 gchar *etext,*aline;
2529 struct first_pass_results *first_pass_results;
2530 struct warnings *warnings;
2531 struct counters counters={0};
2532 struct line_properties last={0};
2533 struct parities parities={0};
2534 struct pending pending={0};
2535 gboolean isemptyline;
2536 long start_para_line=0;
2537 gboolean isnewpara=FALSE,enddash=FALSE;
2538 last.start=CHAR_SPACE;
2539 linecnt=checked_linecnt=0;
2540 etext=read_etext(filename,&err);
2543 if (pswit[STDOUT_SWITCH])
2544 fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
2546 fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
2549 g_print("\n\nFile: %s\n\n",filename);
2550 first_pass_results=first_pass(etext);
2551 warnings=report_first_pass(first_pass_results);
2552 qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
2553 qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
2555 * Here we go with the main pass. Hold onto yer hat!
2559 while ((aline=flgets(&etext_ptr,linecnt+1)))
2564 if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
2565 continue; // skip DP page separators completely
2566 if (linecnt<first_pass_results->firstline ||
2567 (first_pass_results->footerline>0 &&
2568 linecnt>first_pass_results->footerline))
2570 if (pswit[HEADER_SWITCH])
2572 if (g_str_has_prefix(aline,"Title:"))
2573 g_print(" %s\n",aline);
2574 if (g_str_has_prefix(aline,"Author:"))
2575 g_print(" %s\n",aline);
2576 if (g_str_has_prefix(aline,"Release Date:"))
2577 g_print(" %s\n",aline);
2578 if (g_str_has_prefix(aline,"Edition:"))
2579 g_print(" %s\n\n",aline);
2581 continue; /* skip through the header */
2584 print_pending(aline,parastart,&pending);
2585 isemptyline=analyse_quotes(aline,linecnt,&counters);
2586 if (isnewpara && !isemptyline)
2588 /* This line is the start of a new paragraph. */
2589 start_para_line=linecnt;
2590 /* Capture its first line in case we want to report it later. */
2592 parastart=g_strdup(aline);
2593 memset(&parities,0,sizeof(parities)); /* restart the quote count */
2595 while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
2596 !g_unichar_isdigit(g_utf8_get_char(s)))
2597 s=g_utf8_next_char(s);
2598 if (g_unichar_islower(g_utf8_get_char(s)))
2600 /* and its first letter is lowercase */
2601 if (pswit[ECHO_SWITCH])
2602 g_print("\n%s\n",aline);
2603 if (!pswit[OVERVIEW_SWITCH])
2604 g_print(" Line %ld column %ld - "
2605 "Paragraph starts with lower-case\n",
2606 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2610 isnewpara=FALSE; /* Signal the end of new para processing. */
2612 /* Check for an em-dash broken at line end. */
2613 if (enddash && g_utf8_get_char(aline)=='-')
2615 if (pswit[ECHO_SWITCH])
2616 g_print("\n%s\n",aline);
2617 if (!pswit[OVERVIEW_SWITCH])
2618 g_print(" Line %ld column 1 - Broken em-dash?\n",linecnt);
2623 for (s=g_utf8_prev_char(aline+strlen(aline));
2624 g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
2626 if (s>=aline && g_utf8_get_char(s)=='-')
2628 check_for_control_characters(aline);
2630 check_for_odd_characters(aline,warnings,isemptyline);
2631 if (warnings->longline)
2632 check_for_long_line(aline);
2633 if (warnings->shortline)
2634 check_for_short_line(aline,&last);
2636 last.len=g_utf8_strlen(aline,-1);
2637 last.start=g_utf8_get_char(aline);
2638 check_for_starting_punctuation(aline);
2641 check_for_spaced_emdash(aline);
2642 check_for_spaced_dash(aline);
2644 check_for_unmarked_paragraphs(aline);
2645 check_for_jeebies(aline);
2646 check_for_mta_from(aline);
2647 check_for_orphan_character(aline);
2648 check_for_pling_scanno(aline);
2649 check_for_extra_period(aline,warnings);
2650 check_for_following_punctuation(aline);
2651 check_for_typos(aline,warnings);
2652 check_for_misspaced_punctuation(aline,&parities,isemptyline);
2653 check_for_double_punctuation(aline,warnings);
2654 check_for_spaced_quotes(aline);
2655 check_for_miscased_genative(aline);
2656 check_end_of_line(aline,warnings);
2657 check_for_unspaced_bracket(aline);
2658 if (warnings->endquote)
2659 check_for_unpunctuated_endquote(aline);
2660 check_for_html_tag(aline);
2661 check_for_html_entity(aline);
2664 check_for_mismatched_quotes(&counters,&pending);
2665 counters_reset(&counters);
2666 /* let the next iteration know that it's starting a new para */
2669 check_for_omitted_punctuation(prevline,&last,start_para_line);
2672 prevline=g_strdup(aline);
2675 check_for_mismatched_quotes(&counters,&pending);
2676 print_pending(NULL,parastart,&pending);
2677 reset_pending(&pending);
2686 if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])
2687 g_tree_foreach(qword,report_duplicate_queries,NULL);
2688 g_tree_unref(qword);
2689 g_tree_unref(qperiod);
2690 counters_destroy(&counters);
2691 g_set_print_handler(NULL);
2692 print_as_windows_1252(NULL);
2693 if (pswit[MARKUP_SWITCH])
2700 * Get one line from the input text, checking for
2701 * the existence of exactly one CR/LF line-end per line.
2703 * Returns: a pointer to the line.
/*
 * flgets: take the next line from the in-memory text.
 *
 * etext - in/out cursor into the text; it is stepped forward with
 *         g_utf8_next_char() as characters are consumed.
 * lcnt  - line number quoted in the diagnostics printed below.
 *
 * NOTE(review): the loop, the braces and the return statement of this
 * function are not visible in this extract; the comments below describe
 * only the statements that are shown.
 */
2705 char *flgets(char **etext,long lcnt)
2708 gboolean isCR=FALSE;
/* Remember where this line starts; diagnostics echo from here up to eos. */
2709 char *theline=*etext;
2714 c=g_utf8_get_char(*etext);
/* True when the cursor has not moved, i.e. nothing was consumed. */
2717 if (*etext==theline)
/* Line-end checking is enabled: query a line that has no LF. */
2719 else if (pswit[LINE_END_SWITCH])
/* With echo on, print a copy of the line text before the query. */
2721 if (pswit[ECHO_SWITCH])
2723 s=g_strndup(theline,eos-theline);
2724 g_print("\n%s\n",s);
/* The query itself is suppressed in overview mode. */
2727 if (!pswit[OVERVIEW_SWITCH])
2728 /* There may, or may not, have been a CR */
2729 g_print(" Line %ld - No LF?\n",lcnt);
/* Step the caller's cursor past the current character. */
2735 *etext=g_utf8_next_char(*etext);
2736 /* either way, it's end of line */
2743 /* Error - a LF without a preceding CR */
2744 if (pswit[LINE_END_SWITCH])
2746 if (pswit[ECHO_SWITCH])
2748 s=g_strndup(theline,eos-theline);
2749 g_print("\n%s\n",s);
2752 if (!pswit[OVERVIEW_SWITCH])
2753 g_print(" Line %ld - No CR?\n",lcnt);
2764 /* Error - two successive CRs */
2765 if (pswit[LINE_END_SWITCH])
2767 if (pswit[ECHO_SWITCH])
2769 s=g_strndup(theline,eos-theline);
2770 g_print("\n%s\n",s);
2773 if (!pswit[OVERVIEW_SWITCH])
2774 g_print(" Line %ld - Two successive CRs?\n",lcnt);
/* A CR was noted (isCR) but this character does not complete a CR/LF. */
2783 if (pswit[LINE_END_SWITCH] && isCR)
2785 if (pswit[ECHO_SWITCH])
2787 s=g_strndup(theline,eos-theline);
2788 g_print("\n%s\n",s);
2791 if (!pswit[OVERVIEW_SWITCH])
/* The column is the 1-based character offset of eos within the line. */
2792 g_print(" Line %ld column %ld - CR without LF?\n",
2793 lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
/* Move the end-of-line marker on by one UTF-8 character. */
2799 eos=g_utf8_next_char(eos);
/* Optional in-place clean-up of the line, selected by the switches. */
2803 if (pswit[MARKUP_SWITCH])
2804 postprocess_for_HTML(theline);
2805 if (pswit[DP_SWITCH])
2806 postprocess_for_DP(theline);
2813 * Takes a "word" as a parameter, and checks whether it
2814 * contains a mixture of alpha and digits. Generally, this is an
2815 * error, but may not be for cases like 4th or L5 12s. 3d.
2817 * Returns: TRUE iff an error is found.
/*
 * mixdigit: examine one word for a mixture of letters and digits.
 *
 * checkword - NUL-terminated UTF-8 text of the word to examine.
 *
 * NOTE(review): the assignments to the flags and the return statements
 * are not visible in this extract; the comments below describe only the
 * tests that are shown.
 */
2819 gboolean mixdigit(const char *checkword)
2821 gboolean wehaveadigit,wehavealetter,query;
2822 const char *s,*nondigit;
2823 wehaveadigit=wehavealetter=query=FALSE;
/* Walk the word one UTF-8 character at a time, classifying each one. */
2824 for (s=checkword;*s;s=g_utf8_next_char(s))
2825 if (g_unichar_isalpha(g_utf8_get_char(s)))
2827 else if (g_unichar_isdigit(g_utf8_get_char(s)))
/* Only a word holding both kinds of character needs a closer look. */
2829 if (wehaveadigit && wehavealetter)
2831 /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
/* Skip the leading run of digits; nondigit stops at the first non-digit. */
2833 for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
2834 nondigit=g_utf8_next_char(nondigit))
2836 /* digits, ending in st, rd, nd, th of either case */
2837 if (!g_ascii_strcasecmp(nondigit,"st") ||
2838 !g_ascii_strcasecmp(nondigit,"rd") ||
2839 !g_ascii_strcasecmp(nondigit,"nd") ||
2840 !g_ascii_strcasecmp(nondigit,"th"))
/* The same suffixes followed by "s", again ignoring case. */
2842 if (!g_ascii_strcasecmp(nondigit,"sts") ||
2843 !g_ascii_strcasecmp(nondigit,"rds") ||
2844 !g_ascii_strcasecmp(nondigit,"nds") ||
2845 !g_ascii_strcasecmp(nondigit,"ths"))
/* The same suffixes followed by "ly", again ignoring case. */
2847 if (!g_ascii_strcasecmp(nondigit,"stly") ||
2848 !g_ascii_strcasecmp(nondigit,"rdly") ||
2849 !g_ascii_strcasecmp(nondigit,"ndly") ||
2850 !g_ascii_strcasecmp(nondigit,"thly"))
2852 /* digits, ending in l, L, s or d */
/* "l" is compared ignoring case; "s" and "d" must be lower case (strcmp). */
2853 if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
2854 !strcmp(nondigit,"d"))
2857 * L at the start of a number, representing Britsh pounds, like L500.
2858 * This is cute. We know the current word is mixed digit. If the first
2859 * letter is L, there must be at least one digit following. If both
2860 * digits and letters follow, we have a genuine error, else we have a
2861 * capital L followed by digits, and we accept that as a non-error.
/* Recurse on the text after the leading 'L' to see whether it is clean. */
2863 if (g_utf8_get_char(checkword)=='L' &&
2864 !mixdigit(g_utf8_next_char(checkword)))
2873 * Extracts the first/next "word" from the line, and returns it.
2874 * A word is defined as one English word unit--or at least that's the aim.
2875 * "ptr" is advanced to the position in the line where we will start
2876 * looking for the next word.
2878 * Returns: A newly-allocated string.
/*
 * getaword: return the next word found at or after *ptr.
 *
 * ptr - in/out position in the line; advanced as characters are skipped
 *       or consumed.
 *
 * The result is the character data handed back by
 * g_string_free(word,FALSE).
 *
 * NOTE(review): the initialisation of the look-ahead pointer s and some
 * surrounding statements are not visible in this extract.
 */
2880 gchar *getaword(const char **ptr)
2885 word=g_string_new(NULL);
/* Skip anything that is neither digit nor letter, stopping at the NUL. */
2886 for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
2887 !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
2888 **ptr;*ptr=g_utf8_next_char(*ptr))
2891 * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
2892 * Especially yucky is the case of L1,000
2893 * This section looks for a pattern of characters including a digit
2894 * followed by a comma or period followed by one or more digits.
2895 * If found, it returns this whole pattern as a word; otherwise we discard
2896 * the results and resume our normal programming.
/* Look-ahead: gather digits, letters, commas and periods into word. */
2899 for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
2900 g_unichar_isalpha(g_utf8_get_char(s)) ||
2901 g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
2902 g_string_append_unichar(word,g_utf8_get_char(s));
/* From the second character on, look for '.' or ',' right after a digit. */
2905 for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))
2907 c=g_utf8_get_char(t);
2908 pc=g_utf8_get_char(g_utf8_prev_char(t));
2909 if ((c=='.' || c==',') && g_unichar_isdigit(pc))
/* Hand the gathered text to the caller; FALSE keeps the character data. */
2912 return g_string_free(word,FALSE);
2916 /* we didn't find a punctuated number - do the regular getword thing */
/* Throw away the look-ahead text and start again from *ptr. */
2917 g_string_truncate(word,0);
2918 c=g_utf8_get_char(*ptr);
/* Gather digits, letters and apostrophes, advancing *ptr as we go. */
2919 for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);
2920 *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))
2921 g_string_append_unichar(word,c);
2922 return g_string_free(word,FALSE);
2928 * Is this word a Roman Numeral?
2930 * It doesn't actually validate that the number is a valid Roman Numeral--for
2931 * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
2932 * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
2933 * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
2934 * expressions thereof, except when it came to taxes. Allow any number of M,
2935 * an optional D, an optional CM or CD, any number of optional Cs, an optional
2936 * XL or an optional XC, an optional IX or IV, an optional V and any number
/*
 * isroman: test whether the text at t has the shape of a Roman numeral.
 *
 * t - NUL-terminated text; every comparison below uses lower-case
 *     letters only.
 *     NOTE(review): presumably the caller passes lower-cased text - confirm.
 *
 * The tests run in a fixed order: m's, d, cm, cd, c's, xl, xc, l, x's,
 * ix, iv, v, i's.
 *
 * NOTE(review): the statements that advance t after each test and the
 * return statement are not visible in this extract.
 */
2939 gboolean isroman(const char *t)
/* Thousands: any run of 'm'. */
2945 while (g_utf8_get_char(t)=='m' && *t)
/* Hundreds: 'd', then the "cm" and "cd" pairs, then any run of 'c'. */
2947 if (g_utf8_get_char(t)=='d')
2949 if (g_str_has_prefix(t,"cm"))
2951 if (g_str_has_prefix(t,"cd"))
2953 while (g_utf8_get_char(t)=='c' && *t)
/* Tens: the "xl" and "xc" pairs, 'l', then any run of 'x'. */
2955 if (g_str_has_prefix(t,"xl"))
2957 if (g_str_has_prefix(t,"xc"))
2959 if (g_utf8_get_char(t)=='l')
2961 while (g_utf8_get_char(t)=='x' && *t)
/* Units: the "ix" and "iv" pairs, 'v', then any run of 'i'. */
2963 if (g_str_has_prefix(t,"ix"))
2965 if (g_str_has_prefix(t,"iv"))
2967 if (g_utf8_get_char(t)=='v')
2969 while (g_utf8_get_char(t)=='i' && *t)
2975 * postprocess_for_DP:
2977 * Invoked with the -d switch from flgets().
2978 * It simply "removes" from the line a hard-coded set of common
2979 * DP-specific tags, so that the line passed to the main routine has
2980 * been pre-cleaned of DP markup.
/*
 * postprocess_for_DP: delete every occurrence of each DPmarkup[] string
 * from theline, editing the buffer in place.
 *
 * theline - NUL-terminated, writable line buffer.
 */
2982 void postprocess_for_DP(char *theline)
/* The DPmarkup[] list ends at the first empty string. */
2988 for (i=0;*DPmarkup[i];i++)
/* Keep searching from the start until no occurrence of this tag is left. */
2989 while ((s=strstr(theline,DPmarkup[i])))
/* t is the text just after the tag; slide it, NUL included, over the tag. */
2991 t=s+strlen(DPmarkup[i]);
2992 memmove(s,t,strlen(t)+1);
2997 * postprocess_for_HTML:
2999 * Invoked with the -m switch from flgets().
3000 * It simply "removes" from the line a hard-coded set of common
3001 * HTML tags and "replaces" a hard-coded set of common HTML
3002 * entities, so that the line passed to the main routine has
3003 * been pre-cleaned of HTML.
/*
 * postprocess_for_HTML: clean HTML out of theline in place, tags first
 * and entities second.
 *
 * theline - NUL-terminated, writable line buffer.
 */
3005 void postprocess_for_HTML(char *theline)
/* Call losemarkup() repeatedly for as long as it returns non-NULL. */
3007 while (losemarkup(theline))
/* Then handle the '&...;' entities in what remains. */
3009 loseentities(theline);
/*
 * losemarkup: look at the first '<'...'>' pair in theline and, when the
 * text after '<' matches an entry of markup[], cut the whole tag out of
 * the buffer in place.
 *
 * theline - NUL-terminated, writable line buffer.
 *
 * NOTE(review): the return statements are not visible in this extract.
 */
3012 char *losemarkup(char *theline)
/* s is the first '<'; t is the first '>' at or after it, or NULL. */
3016 s=strchr(theline,'<');
3017 t=s?strchr(s,'>'):NULL;
/* The markup[] list ends at the first empty string. */
3020 for (i=0;*markup[i];i++)
/* Compare the text following '<' with this known tag name. */
3021 if (tagcomp(g_utf8_next_char(s),markup[i]))
/* Step past '>' and slide the rest of the line, NUL included, onto '<'. */
3023 t=g_utf8_next_char(t);
3024 memmove(s,t,strlen(t)+1);
3027 /* It's an unrecognized <xxx>. */
/*
 * loseentities: replace '&...;' entities found in theline, editing the
 * buffer in place. Named entities are looked up in a tree built from
 * the HTMLentities[] table; numeric ones are parsed with strtol().
 *
 * theline - NUL-terminated, writable line buffer.
 *
 * NOTE(review): many statements (conditions, else branches, frees) are
 * not visible in this extract; the comments below describe only the
 * lines that are shown.
 */
3031 void loseentities(char *theline)
/*
 * NOTE(review): as shown, entities is an ordinary local that starts as
 * NULL on every call, whereas the two iconv handles below are static.
 * If no "static" was lost in extraction, the tree is rebuilt on each
 * call and the g_tree_destroy() path below never sees it - confirm.
 */
3038 GTree *entities=NULL;
/* (GIConv)-1 marks a converter that has not been opened. */
3039 static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
/*
 * Tear-down path: destroy the tree and close any open converter.
 * The condition that selects this path is not visible in this extract.
 */
3043 g_tree_destroy(entities);
3045 if (translit!=(GIConv)-1)
3046 g_iconv_close(translit);
3047 translit=(GIConv)-1;
3048 if (to_utf8!=(GIConv)-1)
3049 g_iconv_close(to_utf8);
/* Build the name -> code point lookup, keyed and ordered by strcmp(). */
3057 entities=g_tree_new((GCompareFunc)strcmp);
3058 for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
3059 g_tree_insert(entities,HTMLentities[i].name,
3060 GUINT_TO_POINTER(HTMLentities[i].c));
/* Open the converters on first use: UTF-8 to Latin-1 (translit) and back. */
3062 if (translit==(GIConv)-1)
3063 translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
3064 if (to_utf8==(GIConv)-1)
3065 to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
/* Handle each '&' in turn; theline moves forward as entities are handled. */
3066 while((amp=strchr(theline,'&')))
3068 scolon=strchr(amp,';');
/* Decimal form: only digits between amp+2 and the ';'. */
3073 if (amp+2+strspn(amp+2,"0123456789")==scolon)
3074 c=strtol(amp+2,NULL,10);
/* Hexadecimal form: an 'x' at amp[2], then only hex digits up to the ';'. */
3075 else if (amp[2]=='x' &&
3076 amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
3077 c=strtol(amp+3,NULL,16);
/* Named form: look up the text between '&' and ';' in the tree. */
3081 s=g_strndup(amp+1,scolon-(amp+1));
3082 c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
/* && binds tighter than ||, so this reads c<128 || (c>=192 && c<=255). */
3091 if (c<128 || c>=192 && c<=255) /* An ISO-8859-1 character */
/* Write c as UTF-8 at theline and step theline past the bytes written. */
3092 theline+=g_unichar_to_utf8(c,theline);
/* Other code points: encode as UTF-8, run through translit, then back. */
3096 nb=g_unichar_to_utf8(c,s);
3097 t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
3099 s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
/* Copy the nb converted bytes into the line at theline. */
3101 memcpy(theline,s,nb);
/* Slide the text after ';' (NUL included) down to theline. */
3105 memmove(theline,g_utf8_next_char(scolon),
3106 strlen(g_utf8_next_char(scolon))+1);
/* Resume the search just past this '&'. */
3109 theline=g_utf8_next_char(amp);
/*
 * tagcomp: case-insensitive test of whether the tag text at strin begins
 * with basetag.
 *
 * strin   - text following a '<'.
 * basetag - tag name to compare against.
 *
 * NOTE(review): the release of the folded copies and the return
 * statement are not visible in this extract.
 */
3113 gboolean tagcomp(const char *strin,const char *basetag)
/* A leading '/' is skipped, so closing tags compare like opening ones. */
3117 if (g_utf8_get_char(strin)=='/')
3118 t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
3120 t=g_utf8_casefold(strin,-1);
/* Case-fold the reference tag as well, then do a plain prefix comparison. */
3121 s=g_utf8_casefold(basetag,-1);
3122 retval=g_str_has_prefix(t,s);
3128 void proghelp(GOptionContext *context)
3131 fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
3132 fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
3133 fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
3134 fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
3135 "For details, read the file COPYING.\n",stderr);
3136 fputs("This is Free Software; "
3137 "you may redistribute it under certain conditions (GPL);\n",stderr);
3138 fputs("read the file COPYING for details.\n\n",stderr);
3139 help=g_option_context_get_help(context,TRUE,NULL);
3142 fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
3143 fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
3144 "non-ASCII\n",stderr);
3145 fputs("characters like accented letters, "
3146 "lines longer than 75 or shorter than 55,\n",stderr);
3147 fputs("unbalanced quotes or brackets, "
3148 "a variety of badly formatted punctuation, \n",stderr);
3149 fputs("HTML tags, some likely typos. "
3150 "It is NOT a substitute for human judgement.\n",stderr);