1 /*************************************************************************/
2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com> */
5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk> */
7 /* This program is free software; you can redistribute it and/or modify */
8 /* it under the terms of the GNU General Public License as published by */
9 /* the Free Software Foundation; either version 2 of the License, or */
10 /* (at your option) any later version. */
12 /* This program is distributed in the hope that it will be useful, */
13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
15 /* GNU General Public License for more details. */
17 /* You should have received a copy of the GNU General Public License */
18 /* along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 /*************************************************************************/
30 #include "bookloupe.h"
33 #include "HTMLentities.h"
39 "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
40 "nad", "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa",
41 "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
42 "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
43 "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
44 "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
45 "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
46 "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
47 "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
48 "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
49 "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
50 "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
51 "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
52 "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
53 "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
54 "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
55 "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
56 "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
57 "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
58 "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
59 "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
60 "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
61 "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
62 "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
63 "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
64 "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
65 "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
66 "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
67 "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
73 /* Common abbreviations and other OK words not to query as typos. */
75 "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
76 "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
77 "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
78 "outbid", "outbids", "frostbite", "frostbitten", ""
81 /* Common abbreviations that cause otherwise unexplained periods. */
83 "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
84 "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
88 * Two-Letter combinations that rarely if ever start words,
89 * but are common scannos or otherwise common letter combinations.
92 "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
96 * Two-Letter combinations that rarely if ever end words,
97 * but are common scannos or otherwise common letter combinations.
100 "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
101 "sw", "gr", "sl", "cl", "iy", ""
105 "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
106 "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
107 "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
108 "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
112 "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
116 "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
117 "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
118 "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
119 "during", "let", "toward", "among", ""
123 "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
124 "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
125 "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
126 "among", "those", "into", "whom", "having", "thence", ""
129 gboolean pswit[SWITNO]; /* program switches */
131 static GOptionEntry options[]={
132 { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
133 "Ignore DP-specific markup", NULL },
134 { "noecho", 'e', 0, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
135 "Don't echo queried line", NULL },
136 { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
137 "Check single quotes", NULL },
138 { "typo", 't', 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
139 "Check common typos", NULL },
140 { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
141 "Require closure of quotes on every paragraph", NULL },
142 { "relaxed", 'x', 0, G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
143 "Disable paranoid querying of everything", NULL },
144 { "line-end", 'l', 0, G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
145 "Disable line end checking", NULL },
146 { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
147 "Overview: just show counts", NULL },
148 { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
149 "Output errors to stdout instead of stderr", NULL },
150 { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
151 "Echo header fields", NULL },
152 { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
153 "Ignore markup in < >", NULL },
154 { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
155 "Use file of user-defined typos", NULL },
156 { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
157 "Defaults for use on www upload", NULL },
158 { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
159 "Verbose - list everything", NULL },
163 long cnt_dquot; /* for overview mode, count of doublequote queries */
164 long cnt_squot; /* for overview mode, count of singlequote queries */
165 long cnt_brack; /* for overview mode, count of brackets queries */
166 long cnt_bin; /* for overview mode, count of non-ASCII queries */
167 long cnt_odd; /* for overview mode, count of odd character queries */
168 long cnt_long; /* for overview mode, count of long line errors */
169 long cnt_short; /* for overview mode, count of short line queries */
170 long cnt_punct; /* for overview mode,
171 count of punctuation and spacing queries */
172 long cnt_dash; /* for overview mode, count of dash-related queries */
173 long cnt_word; /* for overview mode, count of word queries */
174 long cnt_html; /* for overview mode, count of html queries */
175 long cnt_lineend; /* for overview mode, count of line-end queries */
176 long cnt_spacend; /* count of lines with space at end */
177 long linecnt; /* count of total lines in the file */
178 long checked_linecnt; /* count of lines actually checked */
180 void proghelp(GOptionContext *context);
181 void procfile(const char *);
185 gboolean mixdigit(const char *);
186 gchar *getaword(const char **);
187 char *flgets(char **,long);
188 void postprocess_for_HTML(char *);
189 char *linehasmarkup(char *);
190 char *losemarkup(char *);
191 gboolean tagcomp(const char *,const char *);
192 void loseentities(char *);
193 gboolean isroman(const char *);
194 void postprocess_for_DP(char *);
195 void print_as_windows_1252(const char *string);
196 void print_as_utf_8(const char *string);
198 GTree *qword,*qperiod;
/*
 * parse_options: parse the command line with GOption. The options[] table
 * stores each flag directly in pswit[]; afterwards the switches are adjusted
 * to their effective values. A parse failure is reported with g_printerr().
 */
204 void parse_options(int *argc,char ***argv)
207 GOptionContext *context;
208 context=g_option_context_new(
209 "file - looks for errors in Project Gutenberg(TM) etexts");
210 g_option_context_add_main_entries(context,options,NULL);
211 if (!g_option_context_parse(context,argc,argv,&err))
213 g_printerr("Bookloupe: %s\n",err->message);
214 g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
217 /* Paranoid checking is turned OFF, not on, by its switch */
218 pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
219 if (pswit[PARANOID_SWITCH])
220 /* if running in paranoid mode, typo checks default to enabled */
/* (because of the inversion, --typo switches them off in this mode) */
221 pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
222 /* Line-end checking is turned OFF, not on, by its switch */
223 pswit[LINE_END_SWITCH]=!pswit[LINE_END_SWITCH];
224 /* Echoing is turned OFF, not on, by its switch */
225 pswit[ECHO_SWITCH]=!pswit[ECHO_SWITCH];
226 if (pswit[OVERVIEW_SWITCH])
227 /* just print summary; don't echo */
228 pswit[ECHO_SWITCH]=FALSE;
230 * Web uploads - for the moment, this is really just a placeholder
231 * until we decide what processing we really want to do on web uploads
233 if (pswit[WEB_SWITCH])
235 /* specific override for web uploads */
/* each switch listed here is forced, whatever the command line said */
236 pswit[ECHO_SWITCH]=TRUE;
237 pswit[SQUOTE_SWITCH]=FALSE;
238 pswit[TYPO_SWITCH]=TRUE;
239 pswit[QPARA_SWITCH]=FALSE;
240 pswit[PARANOID_SWITCH]=TRUE;
241 pswit[LINE_END_SWITCH]=FALSE;
242 pswit[OVERVIEW_SWITCH]=FALSE;
243 pswit[STDOUT_SWITCH]=FALSE;
244 pswit[HEADER_SWITCH]=TRUE;
245 pswit[VERBOSE_SWITCH]=FALSE;
246 pswit[MARKUP_SWITCH]=FALSE;
247 pswit[USERTYPO_SWITCH]=FALSE;
248 pswit[DP_SWITCH]=FALSE;
255 g_option_context_free(context);
261 * Read in the user-defined stealth scanno list.
/*
 * read_user_scannos: load the user typo list into the usertypo tree.
 *
 * Candidate files are tried in this order, moving on only when the previous
 * attempt failed with G_FILE_ERROR_NOENT: "bookloupe.typ" by relative name,
 * "bookloupe.typ" under running_from, "gutcheck.typ" by relative name,
 * "gutcheck.typ" under running_from.
 */
263 void read_user_scannos(void)
266 gchar *usertypo_file;
270 gchar *contents,*utf8,**lines;
271 usertypo_file=g_strdup("bookloupe.typ");
272 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
273 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
276 g_free(usertypo_file);
277 usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
278 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
280 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
283 g_free(usertypo_file);
284 usertypo_file=g_strdup("gutcheck.typ");
285 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
287 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
290 g_free(usertypo_file);
291 usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
292 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
294 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
296 g_free(usertypo_file);
297 g_print(" --> I couldn't find bookloupe.typ "
298 "-- proceeding without user typos.\n");
303 fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
304 g_free(usertypo_file);
/* valid UTF-8 is normalized; anything else is treated as Windows-1252 */
308 if (g_utf8_validate(contents,len,NULL))
309 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
311 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
/* split on either CR or LF, so CRLF input yields empty entries too */
313 lines=g_strsplit_set(utf8,"\r\n",0);
/* the tree owns its keys: g_free is registered as the key destructor */
315 usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
316 for (i=0;lines[i];i++)
/* only entries whose first byte is above '!' are kept (skips empty ones) */
317 if (*(unsigned char *)lines[i]>'!')
318 g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
327 * Read an etext returning a newly allocated string containing the file
328 * contents or NULL on error.
/*
 * read_etext: load filename with g_file_get_contents() and produce UTF-8.
 *
 * Input that validates as UTF-8 is normalized and output is routed through
 * print_as_utf_8. Otherwise the bytes are converted from Windows-1252 and
 * output is routed through print_as_windows_1252; an illegal byte sequence
 * is reported through err with the offending byte value, line and column.
 */
330 gchar *read_etext(const char *filename,GError **err)
332 GError *tmp_err=NULL;
333 gchar *contents,*utf8;
334 gsize len,bytes_read,bytes_written;
336 if (!g_file_get_contents(filename,&contents,&len,err))
338 if (g_utf8_validate(contents,len,NULL))
340 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
341 g_set_print_handler(print_as_utf_8);
343 SetConsoleOutputCP(CP_UTF8);
348 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,
349 &bytes_written,&tmp_err);
350 if (g_error_matches(tmp_err,G_CONVERT_ERROR,
351 G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
/* walk the bytes consumed before the failure to locate it in the text */
354 for(i=0;i<bytes_read;i++)
355 if (contents[i]=='\n')
360 else if (contents[i]!='\r')
362 g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
363 "Input conversion failed. Byte %d at line %d, column %d is not a "
364 "valid Windows-1252 character",
/* contents[bytes_read] is the byte reported as unconvertible */
365 ((unsigned char *)contents)[bytes_read],line,col);
368 g_propagate_error(err,tmp_err);
369 g_set_print_handler(print_as_windows_1252);
371 SetConsoleOutputCP(1252);
378 void cleanup_on_exit(void)
381 SetConsoleOutputCP(saved_cp);
385 int main(int argc,char **argv)
388 atexit(cleanup_on_exit);
389 saved_cp=GetConsoleOutputCP();
391 running_from=g_path_get_dirname(argv[0]);
392 parse_options(&argc,&argv);
393 if (pswit[USERTYPO_SWITCH])
395 fprintf(stderr,"bookloupe: Check and report on an e-text\n");
397 if (pswit[OVERVIEW_SWITCH])
399 g_print(" Checked %ld lines of %ld (head+foot = %ld)\n\n",
400 checked_linecnt,linecnt,linecnt-checked_linecnt);
401 g_print(" --------------- Queries found --------------\n");
403 g_print(" Long lines: %14ld\n",cnt_long);
405 g_print(" Short lines: %14ld\n",cnt_short);
407 g_print(" Line-end problems: %14ld\n",cnt_lineend);
409 g_print(" Common typos: %14ld\n",cnt_word);
411 g_print(" Unmatched quotes: %14ld\n",cnt_dquot);
413 g_print(" Unmatched SingleQuotes: %14ld\n",cnt_squot);
415 g_print(" Unmatched brackets: %14ld\n",cnt_brack);
417 g_print(" Non-ASCII characters: %14ld\n",cnt_bin);
419 g_print(" Proofing characters: %14ld\n",cnt_odd);
421 g_print(" Punctuation & spacing queries: %14ld\n",cnt_punct);
423 g_print(" Non-standard dashes: %14ld\n",cnt_dash);
425 g_print(" Possible HTML tags: %14ld\n",cnt_html);
427 g_print(" TOTAL QUERIES %14ld\n",
428 cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+
429 cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend);
431 g_free(running_from);
433 g_tree_unref(usertypo);
440 * Run a first pass - verify that it's a valid PG
441 * file, decide whether to report some things that
442 * occur many times in the text like long or short
443 * lines, non-standard dashes, etc.
/*
 * first_pass: split etext into lines and gather whole-file statistics in a
 * static struct first_pass_results (header/footer positions, character
 * totals, and counts of long lines, slashes, dashes, markup and so on).
 * Trailing CR characters are stripped from each line before it is measured.
 */
445 struct first_pass_results *first_pass(const char *etext)
447 gunichar laststart=CHAR_SPACE;
452 unsigned int lastlen=0,lastblen=0;
453 long spline=0,nspline=0;
454 static struct first_pass_results results={0};
456 lines=g_strsplit(etext,"\n",0);
457 for (j=0;lines[j];j++)
459 lbytes=strlen(lines[j]);
460 while (lbytes>0 && lines[j][lbytes-1]=='\r')
461 lines[j][--lbytes]='\0';
462 llen=g_utf8_strlen(lines[j],lbytes);
464 if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
465 (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
468 g_print(" --> Duplicate header?\n");
469 spline=linecnt+1; /* first line of non-header text, that is */
471 if (!strncmp(lines[j],"*** START",9) &&
472 strstr(lines[j],"PROJECT GUTENBERG"))
475 g_print(" --> Duplicate header?\n");
476 nspline=linecnt+1; /* first line of non-header text, that is */
478 if (spline || nspline)
480 lc_line=g_utf8_strdown(lines[j],lbytes);
481 if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
483 if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
485 if (results.footerline)
487 /* it's an old-form header - we can detect duplicates */
489 g_print(" --> Duplicate footer?\n");
492 results.footerline=linecnt;
498 results.firstline=spline;
500 results.firstline=nspline; /* override with new */
501 if (results.footerline)
502 continue; /* don't count the boilerplate in the footer */
503 results.totlen+=llen;
504 for (s=lines[j];*s;s=g_utf8_next_char(s))
506 if (g_utf8_get_char(s)>127)
508 if (g_unichar_isalpha(g_utf8_get_char(s)))
/*
 * The previous character is a gunichar, so it must be classified with
 * g_unichar_isalpha(): <ctype.h> isalpha() is only defined for values
 * representable as unsigned char (or EOF).
 */
510 if (s>lines[j] && g_utf8_get_char(s)==CHAR_DQUOTE &&
511 g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
512 results.endquote_count++;
514 if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
515 lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
518 g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
520 if (strstr(lines[j],".,"))
522 /* only count ast lines for ignoring purposes where there is */
523 /* locase text on the line */
524 if (strchr(lines[j],'*'))
526 for (s=lines[j];*s;s=g_utf8_next_char(s))
527 if (g_unichar_islower(g_utf8_get_char(s)))
532 if (strchr(lines[j],'/'))
533 results.fslashline++;
536 for (s=g_utf8_prev_char(lines[j]+lbytes);
537 s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;
538 s=g_utf8_prev_char(s))
540 if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
541 g_utf8_get_char(g_utf8_prev_char(s))!='-')
544 if (llen>LONGEST_PG_LINE)
546 if (llen>WAY_TOO_LONG)
547 results.verylongline++;
548 if (strchr(lines[j],'<') && strchr(lines[j],'>'))
550 i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
553 if (strstr(lines[j],"<i>"))
554 results.htmcount+=4; /* bonus marks! */
556 /* Check for spaced em-dashes */
/* the search starts at the second character, so s[-1] below is in range */
557 if (lines[j][0] && (s=strstr(g_utf8_next_char(lines[j]),"--")))
560 if (s[-1]==CHAR_SPACE || s[2]==CHAR_SPACE)
561 results.space_emdash++;
562 if (s[-1]==CHAR_SPACE && s[2]==CHAR_SPACE)
563 /* count of em-dashes with spaces both sides */
564 results.non_PG_space_emdash++;
565 if (s[-1]!=CHAR_SPACE && s[2]!=CHAR_SPACE)
566 /* count of PG-type em-dashes with no spaces */
567 results.PG_space_emdash++;
572 if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))
573 results.Dutchcount++;
574 if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))
575 results.Frenchcount++;
576 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
577 results.standalone_digit++;
580 /* Check for spaced dashes */
581 if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
585 laststart=lines[j][0];
594 * Make some snap decisions based on the first pass results.
596 struct warnings *report_first_pass(struct first_pass_results *results)
598 static struct warnings warnings={0};
600 g_print(" --> %ld lines in this file have white space at end\n",
603 if (results->dotcomma>5)
606 g_print(" --> %ld lines in this file contain '.,'. "
607 "Not reporting them.\n",results->dotcomma);
610 * If more than 50 lines, or one-tenth, are short,
611 * don't bother reporting them.
613 warnings.shortline=1;
614 if (results->shortline>50 || results->shortline*10>linecnt)
616 warnings.shortline=0;
617 g_print(" --> %ld lines in this file are short. "
618 "Not reporting short lines.\n",results->shortline);
621 * If more than 50 lines, or one-tenth, are long,
622 * don't bother reporting them.
625 if (results->longline>50 || results->longline*10>linecnt)
628 g_print(" --> %ld lines in this file are long. "
629 "Not reporting long lines.\n",results->longline);
631 /* If more than 10 lines contain asterisks, don't bother reporting them. */
633 if (results->astline>10)
636 g_print(" --> %ld lines in this file contain asterisks. "
637 "Not reporting them.\n",results->astline);
640 * If more than 10 lines contain forward slashes,
641 * don't bother reporting them.
644 if (results->fslashline>10)
647 g_print(" --> %ld lines in this file contain forward slashes. "
648 "Not reporting them.\n",results->fslashline);
651 * If more than 20 lines contain unpunctuated endquotes,
652 * don't bother reporting them.
655 if (results->endquote_count>20)
658 g_print(" --> %ld lines in this file contain unpunctuated endquotes. "
659 "Not reporting them.\n",results->endquote_count);
662 * If more than 10 lines contain standalone digits,
663 * don't bother reporting them.
666 if (results->standalone_digit>10)
669 g_print(" --> %ld lines in this file contain standalone 0s and 1s. "
670 "Not reporting them.\n",results->standalone_digit);
673 * If more than 20 lines contain hyphens at end,
674 * don't bother reporting them.
677 if (results->hyphens>20)
680 g_print(" --> %ld lines in this file have hyphens at end. "
681 "Not reporting them.\n",results->hyphens);
683 if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
685 g_print(" --> Looks like this is HTML. Switching HTML mode ON.\n");
686 pswit[MARKUP_SWITCH]=1;
688 if (results->verylongline>0)
689 g_print(" --> %ld lines in this file are VERY long!\n",
690 results->verylongline);
692 * If there are more non-PG spaced dashes than PG em-dashes,
693 * assume it's deliberate.
694 * Current PG guidelines say don't use them, but older texts do,
695 * and some people insist on them whatever the guidelines say.
698 if (results->spacedash+results->non_PG_space_emdash>
699 results->PG_space_emdash)
702 g_print(" --> There are %ld spaced dashes and em-dashes. "
703 "Not reporting them.\n",
704 results->spacedash+results->non_PG_space_emdash);
706 /* If more than a quarter of characters are hi-bit, bug out. */
708 if (results->binlen*4>results->totlen)
710 g_print(" --> This file does not appear to be ASCII. "
711 "Terminating. Best of luck with it!\n");
714 if (results->alphalen*4<results->totlen)
716 g_print(" --> This file does not appear to be text. "
717 "Terminating. Best of luck with it!\n");
720 if (results->binlen*100>results->totlen || results->binlen>100)
722 g_print(" --> There are a lot of foreign letters here. "
723 "Not reporting them.\n");
726 warnings.isDutch=FALSE;
727 if (results->Dutchcount>50)
729 warnings.isDutch=TRUE;
730 g_print(" --> This looks like Dutch - "
731 "switching off dashes and warnings for 's Middags case.\n");
733 warnings.isFrench=FALSE;
734 if (results->Frenchcount>50)
736 warnings.isFrench=TRUE;
737 g_print(" --> This looks like French - "
738 "switching off some doublepunct.\n");
740 if (results->firstline && results->footerline)
741 g_print(" The PG header and footer appear to be already on.\n");
744 if (results->firstline)
745 g_print(" The PG header is on - no footer.\n");
746 if (results->footerline)
747 g_print(" The PG footer is on - no header.\n");
750 if (pswit[VERBOSE_SWITCH])
753 warnings.shortline=1;
762 g_print(" *** Verbose output is ON -- you asked for it! ***\n");
764 if (warnings.isDutch)
766 if (results->footerline>0 && results->firstline>0 &&
767 results->footerline>results->firstline &&
768 results->footerline-results->firstline<100)
770 g_print(" --> I don't really know where this text starts. \n");
771 g_print(" There are no reference points.\n");
772 g_print(" I'm going to have to report the header and footer "
774 results->firstline=0;
782 * Look along the line, accumulate the count of quotes, and see
783 * if this is an empty line - i.e. a line with nothing on it
785 * If line has just spaces, period, * and/or - on it, don't
786 * count it, since empty lines with asterisks or dashes to
787 * separate sections are common.
789 * Returns: TRUE if the line is empty.
791 gboolean analyse_quotes(const char *aline,struct counters *counters)
794 /* assume the line is empty until proven otherwise */
795 gboolean isemptyline=TRUE;
796 const char *s=aline,*sprev,*snext;
801 snext=g_utf8_next_char(s);
802 c=g_utf8_get_char(s);
804 increment_matching(counters,c,!matching_difference(counters,c));
805 else if (CHAR_IS_DQUOTE(c))
806 increment_matching(counters,c,!CHAR_IS_CLOSING_QUOTE(c));
807 else if (CHAR_IS_SQUOTE(c))
812 * At start of line, it can only be an openquote.
813 * Hardcode a very common exception!
815 if (!g_str_has_prefix(snext,"tis") &&
816 !g_str_has_prefix(snext,"Tis"))
817 increment_matching(counters,c,TRUE);
819 else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
820 g_unichar_isalpha(g_utf8_get_char(snext)))
821 /* Do nothing! it's definitely an apostrophe, not a quote */
823 /* it's outside a word - let's check it out */
824 else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||
825 g_unichar_isalpha(g_utf8_get_char(snext)))
827 /* it damwell better BE an openquote */
828 if (!g_str_has_prefix(snext,"tis") &&
829 !g_str_has_prefix(snext,"Tis"))
830 /* hardcode a very common exception! */
831 increment_matching(counters,c,TRUE);
835 /* now - is it a closequote? */
836 guessquote=0; /* accumulate clues */
837 if (g_unichar_isalpha(g_utf8_get_char(sprev)))
839 /* it follows a letter - could be either */
841 if (g_utf8_get_char(sprev)=='s')
843 /* looks like a plural apostrophe */
845 if (g_utf8_get_char(snext)==CHAR_SPACE)
850 /* it doesn't have a letter either side */
851 else if (strchr(".?!,;:",g_utf8_get_char(sprev)) &&
852 strchr(".?!,;: ",g_utf8_get_char(snext)))
853 guessquote+=8; /* looks like a closequote */
856 if (matching_difference(counters,CHAR_SQUOTE)>0)
858 * Give it the benefit of some doubt,
859 * if a squote is already open.
865 increment_matching(counters,c,FALSE);
868 if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
870 isemptyline=FALSE; /* ignore lines like * * * as spacers */
871 if (c==CHAR_UNDERSCORE)
872 counters->c_unders++;
873 if (c==CHAR_OPEN_SBRACK)
875 if (!matching_difference(counters,COUNTER_ILLUSTRATION) &&
876 !matching_difference(counters,c) && s==aline &&
877 g_str_has_prefix(s,"[Illustration:"))
878 increment_matching(counters,COUNTER_ILLUSTRATION,TRUE);
880 increment_matching(counters,c,TRUE);
882 else if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK)
883 increment_matching(counters,c,TRUE);
884 if (c==CHAR_CLOSE_SBRACK)
886 if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) &&
887 !matching_difference(counters,c) && !*snext)
888 increment_matching(counters,COUNTER_ILLUSTRATION,FALSE);
890 increment_matching(counters,c,FALSE);
892 else if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK)
893 increment_matching(counters,c,FALSE);
901 * check_for_control_characters:
903 * Check for control characters in the line: any character below
904 * CHAR_SPACE other than LF, CR and tab is flagged.
905 * Characters above 127 and tabs are not reported by this function;
906 * they are queried in check_for_odd_characters.
/*
 * Scan aline for control characters: any character below CHAR_SPACE other
 * than LF, CR or tab. A hit is echoed when ECHO_SWITCH is set and, unless
 * OVERVIEW_SWITCH is set, reported with its 1-based character column.
 */
908 void check_for_control_characters(const char *aline)
912 for (s=aline;*s;s=g_utf8_next_char(s))
914 c=g_utf8_get_char(s);
915 if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
917 if (pswit[ECHO_SWITCH])
918 g_print("\n%s\n",aline);
919 if (!pswit[OVERVIEW_SWITCH])
/*
 * g_utf8_pointer_to_offset() takes (start of string, position); with the
 * arguments swapped it yields a zero or negative offset, not the column.
 */
920 g_print(" Line %ld column %ld - Control character %u\n",
921 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
929 * check_for_odd_characters:
931 * Check for binary and other odd characters.
/*
 * Walk aline one UTF-8 character at a time and query: control or non-ASCII
 * characters, tabs, tildes, carets, forward slashes (only when
 * warnings->fslash is set) and asterisks. Each query echoes the line when
 * ECHO_SWITCH is set and prints its message unless OVERVIEW_SWITCH is set.
 * Columns are 1-based character offsets.
 */
933 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
934 gboolean isemptyline)
936 /* Don't repeat multiple warnings on one line. */
937 gboolean eNon_A=FALSE,eTab=FALSE,eTilde=FALSE;
938 gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
941 for (s=aline;*s;s=g_utf8_next_char(s))
943 c=g_utf8_get_char(s);
/* && binds tighter than ||: (control char other than tab/LF) or (c>127) */
944 if (!eNon_A && (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
946 if (pswit[ECHO_SWITCH])
947 g_print("\n%s\n",aline);
948 if (!pswit[OVERVIEW_SWITCH])
/* 128..159 and anything above 255 get the "Non-ISO-8859" wording */
949 if (c>127 && c<160 || c>255)
950 g_print(" Line %ld column %ld - "
951 "Non-ISO-8859 character %u\n",
952 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
954 g_print(" Line %ld column %ld - "
955 "Non-ASCII character %u\n",
956 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
961 if (!eTab && c==CHAR_TAB)
963 if (pswit[ECHO_SWITCH])
964 g_print("\n%s\n",aline);
965 if (!pswit[OVERVIEW_SWITCH])
966 g_print(" Line %ld column %ld - Tab character?\n",
967 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
972 if (!eTilde && c==CHAR_TILDE)
975 * Often used by OCR software to indicate an
976 * unrecognizable character.
978 if (pswit[ECHO_SWITCH])
979 g_print("\n%s\n",aline);
980 if (!pswit[OVERVIEW_SWITCH])
981 g_print(" Line %ld column %ld - Tilde character?\n",
982 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
987 if (!eCarat && c==CHAR_CARAT)
989 if (pswit[ECHO_SWITCH])
990 g_print("\n%s\n",aline);
991 if (!pswit[OVERVIEW_SWITCH])
992 g_print(" Line %ld column %ld - Carat character?\n",
993 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
998 if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
1000 if (pswit[ECHO_SWITCH])
1001 g_print("\n%s\n",aline);
1002 if (!pswit[OVERVIEW_SWITCH])
1003 g_print(" Line %ld column %ld - Forward slash?\n",
1004 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1010 * Report asterisks only in paranoid mode,
1011 * since they're often deliberate.
/* also requires warnings->ast and a line not judged empty by the caller */
1013 if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
1016 if (pswit[ECHO_SWITCH])
1017 g_print("\n%s\n",aline);
1018 if (!pswit[OVERVIEW_SWITCH])
1019 g_print(" Line %ld column %ld - Asterisk?\n",
1020 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1029 * check_for_long_line:
1031 * Check for line too long.
/*
 * Query aline when it has more than LONGEST_PG_LINE characters (counted as
 * UTF-8 characters, not bytes).
 */
1033 void check_for_long_line(const char *aline)
1035 if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
1037 if (pswit[ECHO_SWITCH])
1038 g_print("\n%s\n",aline);
1039 if (!pswit[OVERVIEW_SWITCH])
/* the reported column is the line length, i.e. the last character */
1040 g_print(" Line %ld column %ld - Long line %ld\n",
1041 linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
1048 * check_for_short_line:
1050 * Check for line too short.
1052 * This one is a bit trickier to implement: we don't want to
1053 * flag the last line of a paragraph for being short, so we
1054 * have to wait until we know that our current line is a
1055 * "normal" line, then report the _previous_ line if it was too
1056 * short. We also don't want to report indented lines like
1057 * chapter heads or formatted quotations. We therefore keep
1058 * last->len as the length of the last line examined, and
1059 * last->blen as the length of the last but one, and try to
1060 * suppress unnecessary warnings by checking that both were of
1061 * "normal" length. We keep the first character of the last
1062 * line in last->start, and if it was a space, we assume that
1063 * the formatting is deliberate. I can't figure out a way to
1064 * distinguish something like a quoted verse left-aligned or
1065 * the header or footer of a letter from a paragraph of short
1066 * lines - maybe if I examined the whole paragraph, and if the
1067 * para has less than, say, 8 lines and if all lines are short,
1068 * then just assume it's OK? Need to look at some texts to see
1069 * how often a formula like this would get the right result.
/*
 * Query the PREVIOUS line (prevline, numbered linecnt-1) as short. This
 * needs the current line to have more than one character, last->len to lie
 * strictly between 1 and SHORTEST_PG_LINE, last->blen to exceed
 * SHORTEST_PG_LINE, and last->start not to be a space.
 */
1071 void check_for_short_line(const char *aline,const struct line_properties *last)
1073 if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
1074 last->len<SHORTEST_PG_LINE && last->blen>1 &&
1075 last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
1077 if (pswit[ECHO_SWITCH])
/* echo and measure prevline, not aline: the query is about the line before */
1078 g_print("\n%s\n",prevline);
1079 if (!pswit[OVERVIEW_SWITCH])
1080 g_print(" Line %ld column %ld - Short line %ld?\n",
1081 linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
1088 * check_for_starting_punctuation:
1090 * Look for punctuation other than full ellipses at start of line.
/*
 * Query a non-empty line whose first character is one of . ? ! , ; :
 * unless the line begins with the spaced ellipsis ". . .".
 */
1092 void check_for_starting_punctuation(const char *aline)
/* the *aline test keeps the NUL of an empty line out of the strchr lookup */
1094 if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
1095 !g_str_has_prefix(aline,". . ."))
1097 if (pswit[ECHO_SWITCH])
1098 g_print("\n%s\n",aline);
1099 if (!pswit[OVERVIEW_SWITCH])
1100 g_print(" Line %ld column 1 - Begins with punctuation?\n",
1108 * check_for_spaced_emdash:
1110 * Check for spaced em-dashes.
1112 * We must check _all_ occurrences of "--" on the line
1113 * hence the loop - even if the first double-dash is OK
1114 * there may be another that's wrong later on.
/*
 * Visit every "--" in aline and query those with a space immediately
 * before (when not at the start of the line) or immediately after.
 */
1116 void check_for_spaced_emdash(const char *aline)
1118 const char *s,*t,*next;
/* t is the current "--"; the next search resumes just past it */
1119 for (s=aline;t=strstr(s,"--");s=next)
1121 next=g_utf8_next_char(g_utf8_next_char(t));
/* t>aline guards the look-behind so it never reads before the line start */
1122 if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
1123 g_utf8_get_char(next)==CHAR_SPACE)
1125 if (pswit[ECHO_SWITCH])
1126 g_print("\n%s\n",aline);
1127 if (!pswit[OVERVIEW_SWITCH])
1128 g_print(" Line %ld column %ld - Spaced em-dash?\n",
1129 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1137 * check_for_spaced_dash:
1139 * Check for spaced dashes.
/*
 * Query a single dash with a space on one side. Only the first " -" is
 * examined; "- " is looked for only when the line contains no " -" at all.
 * A dash that is part of "--" is not queried here.
 */
1141 void check_for_spaced_dash(const char *aline)
1144 if ((s=strstr(aline," -")))
/* skip " --": the character after the dash must not be another dash */
1146 if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
1148 if (pswit[ECHO_SWITCH])
1149 g_print("\n%s\n",aline);
1150 if (!pswit[OVERVIEW_SWITCH])
1151 g_print(" Line %ld column %ld - Spaced dash?\n",
1152 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1157 else if ((s=strstr(aline,"- ")))
/* skip "-- ": the character before the dash must not be another dash */
1159 if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
1161 if (pswit[ECHO_SWITCH])
1162 g_print("\n%s\n",aline);
1163 if (!pswit[OVERVIEW_SWITCH])
1164 g_print(" Line %ld column %ld - Spaced dash?\n",
1165 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1173 * check_for_unmarked_paragraphs:
1175 * Check for unmarked paragraphs indicated by separate speakers.
1177 * May well be false positive:
1178 * "Bravo!" "Wonderful!" called the crowd.
1179 * but useful all the same.
/*
 * check_for_unmarked_paragraphs:
 *
 * Searches the line for a double quote, white space and another double
 * quote - a sign that two speakers share one paragraph - and queries a
 * missing paragraph break.
 *
 * aline: the current line, as UTF-8.
 *
 * Under ECHO_SWITCH the line is echoed; unless OVERVIEW_SWITCH is set a
 * query is printed with the 1-based character column of the match.
 *
 * NOTE(review): the two strstr() calls read identically in this
 * listing; presumably they differ in the amount of white space between
 * the quotes - TODO confirm against the full file. The tests guarding
 * the second search and the report are not visible here.
 */
1181 void check_for_unmarked_paragraphs(const char *aline)
1184 s=strstr(aline,"\" \"");
1186 s=strstr(aline,"\" \"");
1189 if (pswit[ECHO_SWITCH])
1190 g_print("\n%s\n",aline);
1191 if (!pswit[OVERVIEW_SWITCH])
1192 g_print(" Line %ld column %ld - "
1193 "Query missing paragraph break?\n",
1194 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1201 * check_for_jeebies:
1203 * Check for "to he" and other easy h/b errors.
1205 * This is a very inadequate effort on the h/b problem,
1206 * but the phrase "to he" is usually an error, whereas "to
1207 * be" is quite common.
1208 * Similarly, '"Quiet!", be said.' is a non-be error.
1209 * Note that "to he" is _not_ always an error:
1210 * "Where they went to he couldn't say."
1211 * Another false positive:
1212 * What would "Cinderella" be without the . . .
1213 * and another: "If he wants to he can see for himself."
/*
 * check_for_jeebies:
 *
 * Searches the line for fixed phrases, in three groups that each have
 * their own query:
 *   - " be could ", " be would ", " was be ", " be is ", " is be ",
 *     a double quote (optionally with a comma) followed by " be ",
 *     and " to he ": he/be error;
 *   - " the had ", " a had " and "they/she/he/you/i bad": had/bad
 *     error;
 *   - "; hut " and ", hut ": hut/but error.
 * The phrases are matched with strstr(), so matching is case-sensitive
 * and needs the surrounding spaces or punctuation to be present.
 *
 * aline: the current line, as UTF-8.
 *
 * Under ECHO_SWITCH the line is echoed; unless OVERVIEW_SWITCH is set a
 * query is printed with the 1-based character column of the match
 * held in s.
 *
 * NOTE(review): every search assigns to s; the conditions that stop a
 * later search overwriting an earlier hit are on lines missing from
 * this listing - presumably each search is guarded by a test on s.
 * NOTE(review): the two searches for a quote followed by " be " read
 * identically here; confirm against the full file.
 */
1215 void check_for_jeebies(const char *aline)
1218 s=strstr(aline," be could ");
1220 s=strstr(aline," be would ");
1222 s=strstr(aline," was be ");
1224 s=strstr(aline," be is ");
1226 s=strstr(aline," is be ");
1228 s=strstr(aline,"\", be ");
1230 s=strstr(aline,"\" be ");
1232 s=strstr(aline,"\" be ");
1234 s=strstr(aline," to he ");
1237 if (pswit[ECHO_SWITCH])
1238 g_print("\n%s\n",aline);
1239 if (!pswit[OVERVIEW_SWITCH])
1240 g_print(" Line %ld column %ld - Query he/be error?\n",
1241 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1245 s=strstr(aline," the had ");
1247 s=strstr(aline," a had ");
1249 s=strstr(aline," they bad ");
1251 s=strstr(aline," she bad ");
1253 s=strstr(aline," he bad ");
1255 s=strstr(aline," you bad ");
1257 s=strstr(aline," i bad ");
1260 if (pswit[ECHO_SWITCH])
1261 g_print("\n%s\n",aline);
1262 if (!pswit[OVERVIEW_SWITCH])
1263 g_print(" Line %ld column %ld - Query had/bad error?\n",
1264 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1268 s=strstr(aline,"; hut ");
1270 s=strstr(aline,", hut ");
1273 if (pswit[ECHO_SWITCH])
1274 g_print("\n%s\n",aline);
1275 if (!pswit[OVERVIEW_SWITCH])
1276 g_print(" Line %ld column %ld - Query hut/but error?\n",
1277 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1284 * check_for_mta_from:
1286 * Special case - angled bracket in front of "From" placed there by an
1287 * MTA when sending an e-mail.
/*
 * check_for_mta_from:
 *
 * Searches the line for the text ">From" and queries it.
 *
 * aline: the current line, as UTF-8.
 *
 * Under ECHO_SWITCH the line is echoed; unless OVERVIEW_SWITCH is set a
 * query is printed with the 1-based character column of the '>'.
 *
 * NOTE(review): the test on s that guards the report is on a line
 * missing from this listing.
 */
1289 void check_for_mta_from(const char *aline)
1292 s=strstr(aline,">From");
1295 if (pswit[ECHO_SWITCH])
1296 g_print("\n%s\n",aline);
1297 if (!pswit[OVERVIEW_SWITCH])
1298 g_print(" Line %ld column %ld - "
1299 "Query angled bracket with From\n",
1300 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1307 * check_for_orphan_character:
1309 * Check for a single character line -
1310 * often an overflow from bad wrapping.
/*
 * check_for_orphan_character:
 *
 * Deals with a line made up of exactly one character: c is the first
 * character, and the test requires it to be non-zero with the string
 * ending straight after it. The letters I, V, X and L and any digit
 * are ignored as numerals standing alone; a "single character line"
 * query, always for column 1, is present for the remaining case.
 *
 * aline: the current line, as UTF-8.
 *
 * NOTE(review): the "else" separating the ignore branch from the
 * report, and the last g_print() argument line, are missing from this
 * listing.
 */
1312 void check_for_orphan_character(const char *aline)
1315 c=g_utf8_get_char(aline);
1316 if (c && !*g_utf8_next_char(aline))
1318 if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
1319 ; /* Nothing - ignore numerals alone on a line. */
1322 if (pswit[ECHO_SWITCH])
1323 g_print("\n%s\n",aline);
1324 if (!pswit[OVERVIEW_SWITCH])
1325 g_print(" Line %ld column 1 - Query single character line\n",
1334 * check_for_pling_scanno:
1336 * Check for I" - often should be !
/*
 * check_for_pling_scanno:
 *
 * Searches the line for a space, a capital I and a double quote - the
 * I is often a misread exclamation mark - and queries the first match.
 *
 * aline: the current line, as UTF-8.
 *
 * Under ECHO_SWITCH the line is echoed; unless OVERVIEW_SWITCH is set a
 * query is printed.
 *
 * NOTE(review): the column passed to g_print() is the 0-based
 * character offset of the match (no +1), unlike most checks in this
 * file; that may be deliberate - confirm before changing.
 */
1338 void check_for_pling_scanno(const char *aline)
1341 s=strstr(aline," I\"");
1344 if (pswit[ECHO_SWITCH])
1345 g_print("\n%s\n",aline);
1346 if (!pswit[OVERVIEW_SWITCH])
1347 g_print(" Line %ld column %ld - Query I=exclamation mark?\n",
1348 linecnt,g_utf8_pointer_to_offset(aline,s));
1355 * check_for_extra_period:
1357 * Check for period without a capital letter. Cut-down from gutspell.
1358 * Only works when it happens on a single line.
/*
 * check_for_extra_period:
 *
 * Scans the line for each ". " and asks whether the period is
 * spurious because the text after it carries on in lower case. The
 * steps visible in this listing are:
 *   - the work sits under a test of PARANOID_SWITCH;
 *   - a period whose preceding character is not a letter appears to be
 *     passed over;
 *   - when warnings->isDutch is set, a following apostrophe,
 *     lower-case letter, space and upper-case letter (the 's Middags
 *     form) appears to be passed over;
 *   - s1 skips forward from two characters past t to the next letter
 *     or digit, and only a lower-case letter there is investigated;
 *   - walking back from t, a word is copied into testword, which is
 *     compared with the abbrev list, tested for a leading digit, for
 *     being a single character, with isroman(), and for containing one
 *     of a e i o u (taken from the first code point of each
 *     character's canonical decomposition, so accented vowels count);
 *   - the report is made when VERBOSE_SWITCH is set or testword is not
 *     yet in the qperiod tree; testword is then recorded in qperiod.
 *
 * aline: the current line, as UTF-8.
 * warnings: first-pass results; isDutch is the only member read in the
 *           lines shown.
 *
 * Under ECHO_SWITCH the line is echoed; unless OVERVIEW_SWITCH is set a
 * query is printed with the 1-based character column of t.
 *
 * NOTE(review): what each test on testword does with its result
 * (presumably clearing a flag that drives the report) is on lines
 * missing from this listing, as is the release of testword.
 * NOTE(review): isdigit() is applied to a gunichar here; arguments
 * outside the unsigned char range are undefined for the ctype
 * functions, so g_unichar_isdigit() looks safer - TODO confirm.
 */
1360 void check_for_extra_period(const char *aline,const struct warnings *warnings)
1362 const char *s,*t,*s1,*sprev;
1367 gunichar c,nc,pc,*decomposition;
1368 if (pswit[PARANOID_SWITCH])
1370 for (t=aline;t=strstr(t,". ");)
1374 t=g_utf8_next_char(t);
1375 /* start of line punctuation is handled elsewhere */
1378 if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
1380 t=g_utf8_next_char(t);
1383 if (warnings->isDutch)
1385 /* For Frank & Jeroen -- 's Middags case */
1386 gunichar c2,c3,c4,c5;
1387 c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
1388 c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
1389 c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
1390 c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
1391 if (CHAR_IS_APOSTROPHE(c2) &&
1392 g_unichar_islower(c3) && c4==CHAR_SPACE &&
1393 g_unichar_isupper(c5))
1395 t=g_utf8_next_char(t);
1399 s1=g_utf8_next_char(g_utf8_next_char(t));
1400 while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
1401 !isdigit(g_utf8_get_char(s1)))
1402 s1=g_utf8_next_char(s1);
1403 if (g_unichar_islower(g_utf8_get_char(s1)))
1405 /* we have something to investigate */
1407 /* so let's go back and find out */
1408 nc=g_utf8_get_char(t);
1409 s1=g_utf8_prev_char(t);
1410 c=g_utf8_get_char(s1);
1411 sprev=g_utf8_prev_char(s1);
1412 pc=g_utf8_get_char(sprev);
1414 (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||
1415 g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&
1416 g_unichar_isalpha(nc)))
1421 sprev=g_utf8_prev_char(s1);
1422 pc=g_utf8_get_char(sprev);
1424 s1=g_utf8_next_char(s1);
1427 testword=g_strndup(s1,s-s1);
1429 testword=g_strdup(s1);
1430 for (i=0;*abbrev[i];i++)
1431 if (!strcmp(testword,abbrev[i]))
1433 if (g_unichar_isdigit(g_utf8_get_char(testword)))
1435 if (!*g_utf8_next_char(testword))
1437 if (isroman(testword))
1442 for (s=testword;*s;s=g_utf8_next_char(s))
1444 decomposition=g_unicode_canonical_decomposition(
1445 g_utf8_get_char(s),&len);
1446 if (g_utf8_strchr("aeiou",-1,decomposition[0]))
1448 g_free(decomposition);
1452 (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
1454 g_tree_insert(qperiod,g_strdup(testword),
1455 GINT_TO_POINTER(1));
1456 if (pswit[ECHO_SWITCH])
1457 g_print("\n%s\n",aline);
1458 if (!pswit[OVERVIEW_SWITCH])
1459 g_print(" Line %ld column %ld - Extra period?\n",
1460 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1466 t=g_utf8_next_char(t);
1472 * check_for_following_punctuation:
1474 * Check for words usually not followed by punctuation.
/*
 * check_for_following_punctuation:
 *
 * Under TYPO_SWITCH, compares a lower-cased copy of a word (inword)
 * with two lists of words that are not usually followed by
 * punctuation:
 *   - for a match in nocomma, a ',', ';' or ':' at s is queried;
 *   - for a match in noperiod, a '.' or '!' at s is queried.
 * Each list is walked until an entry that is an empty string.
 *
 * aline: the current line, as UTF-8.
 *
 * Under ECHO_SWITCH the line is echoed; unless OVERVIEW_SWITCH is set a
 * query is printed with the 1-based character column of s.
 *
 * NOTE(review): how words are taken from the line and where s is left
 * are on lines missing from this listing; presumably s points just
 * past the current word. The last g_print() argument (the word shown
 * for %s) is not visible either.
 */
1476 void check_for_following_punctuation(const char *aline)
1479 const char *s,*wordstart;
1482 if (pswit[TYPO_SWITCH])
1493 inword=g_utf8_strdown(t,-1);
1495 for (i=0;*nocomma[i];i++)
1496 if (!strcmp(inword,nocomma[i]))
1498 c=g_utf8_get_char(s);
1499 if (c==',' || c==';' || c==':')
1501 if (pswit[ECHO_SWITCH])
1502 g_print("\n%s\n",aline);
1503 if (!pswit[OVERVIEW_SWITCH])
1504 g_print(" Line %ld column %ld - "
1505 "Query punctuation after %s?\n",
1506 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
1512 for (i=0;*noperiod[i];i++)
1513 if (!strcmp(inword,noperiod[i]))
1515 c=g_utf8_get_char(s);
1516 if (c=='.' || c=='!')
1518 if (pswit[ECHO_SWITCH])
1519 g_print("\n%s\n",aline);
1520 if (!pswit[OVERVIEW_SWITCH])
1521 g_print(" Line %ld column %ld - "
1522 "Query punctuation after %s?\n",
1523 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
1537 * Check for commonly mistyped words,
1538 * and digits like 0 for O in a word.
/*
 * check_for_typos:
 *
 * Takes words from the line with getaword() and runs each through the
 * tests visible in this listing:
 *   - a word for which mixdigit() is true gets a "Query digit in"
 *     report (presumably digits mixed with letters - TODO confirm);
 *   - under TYPO_SWITCH or USERTYPO_SWITCH the word is scanned for an
 *     upper-case or title-case letter after a lower-case one, with
 *     exceptions commented for Mc/Mac names and for a preceding
 *     apostrophe, and a case-folded copy (testword) is made;
 *   - under TYPO_SWITCH, testword is checked against the nostart
 *     prefixes and noend suffixes, a set of unlikely letter groups
 *     (cb, gbt, pbt, tbs, mrn, ahle, ihle, tbi, tbe, ii), for having
 *     no vowels or no consonants when longer than one character
 *     (vowels are recognised from the first code point of the
 *     canonical decomposition; 'y' and digits take their own branch),
 *     against the okword list, with isroman(), against the typo list,
 *     and, for one-character words, against the letters s l m i j d n;
 *   - the qword tree holds an int counter per testword, and the
 *     "Query word" report gains " - not reporting duplicates" unless
 *     VERBOSE_SWITCH is set;
 *   - a word found in the usertypo tree, when not already a typo, gets
 *     a "Query possible scanno" report;
 *   - under PARANOID_SWITCH with warnings->digit set, a word that is
 *     exactly "0" or "1" gets a "Query standalone" report.
 *
 * aline: the current line, as UTF-8.
 * warnings: first-pass results; digit is the only member read in the
 *           lines shown.
 *
 * Each report echoes the line under ECHO_SWITCH and prints its query
 * unless OVERVIEW_SWITCH is set.
 *
 * NOTE(review): the statements that set or clear istypo after each
 * test, and the loop that drives getaword(), are on lines missing from
 * this listing.
 * NOTE(review): the "possible scanno" and "standalone" reports add 2
 * to the offset of wordstart while the other reports add 1; confirm
 * whether that difference is intended.
 */
1540 void check_for_typos(const char *aline,struct warnings *warnings)
1542 const char *s,*t,*nt,*wordstart;
1544 gunichar *decomposition;
1546 int i,vowel,consonant,*dupcnt;
1547 gboolean isdup,istypo,alower;
1550 gsize decomposition_len;
1554 inword=getaword(&s);
1558 continue; /* don't bother with empty lines */
1560 if (mixdigit(inword))
1562 if (pswit[ECHO_SWITCH])
1563 g_print("\n%s\n",aline);
1564 if (!pswit[OVERVIEW_SWITCH])
1565 g_print(" Line %ld column %ld - Query digit in %s\n",
1566 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
1571 * Put the word through a series of tests for likely typos and OCR
1574 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
1578 for (t=inword;*t;t=g_utf8_next_char(t))
1580 c=g_utf8_get_char(t);
1581 nt=g_utf8_next_char(t);
1582 /* lowercase for testing */
1583 if (g_unichar_islower(c))
1585 if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
1588 * We have an uppercase mid-word. However, there are
1590 * Mac and Mc like McGill
1591 * French contractions like l'Abbe
1593 offset=g_utf8_pointer_to_offset(inword,t);
1595 pc=g_utf8_get_char(g_utf8_prev_char(t));
1598 if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
1599 offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
1600 g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
1601 CHAR_IS_APOSTROPHE(pc))
1607 testword=g_utf8_casefold(inword,-1);
1609 if (pswit[TYPO_SWITCH])
1612 * Check for certain unlikely two-letter combinations at word
1615 len=g_utf8_strlen(testword,-1);
1618 for (i=0;*nostart[i];i++)
1619 if (g_str_has_prefix(testword,nostart[i]))
1621 for (i=0;*noend[i];i++)
1622 if (g_str_has_suffix(testword,noend[i]))
1625 /* ght is common, gbt never. Like that. */
1626 if (strstr(testword,"cb"))
1628 if (strstr(testword,"gbt"))
1630 if (strstr(testword,"pbt"))
1632 if (strstr(testword,"tbs"))
1634 if (strstr(testword,"mrn"))
1636 if (strstr(testword,"ahle"))
1638 if (strstr(testword,"ihle"))
1641 * "TBE" does happen - like HEARTBEAT - but uncommon.
1642 * Also "TBI" - frostbite, outbid - but uncommon.
1643 * Similarly "ii" like Hawaii, or Pompeii, and in Roman
1644 * numerals, but "ii" is a common scanno.
1646 if (strstr(testword,"tbi"))
1648 if (strstr(testword,"tbe"))
1650 if (strstr(testword,"ii"))
1653 * Check for no vowels or no consonants.
1654 * If none, flag a typo.
1656 if (!istypo && len>1)
1659 for (t=testword;*t;t=g_utf8_next_char(t))
1661 c=g_utf8_get_char(t);
1663 g_unicode_canonical_decomposition(c,&decomposition_len);
1664 if (c=='y' || g_unichar_isdigit(c))
1666 /* Yah, this is loose. */
1670 else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
1674 g_free(decomposition);
1676 if (!vowel || !consonant)
1680 * Now exclude the word from being reported if it's in
1683 for (i=0;*okword[i];i++)
1684 if (!strcmp(testword,okword[i]))
1687 * What looks like a typo may be a Roman numeral.
1690 if (istypo && isroman(testword))
1692 /* Check the manual list of typos. */
1694 for (i=0;*typo[i];i++)
1695 if (!strcmp(testword,typo[i]))
1698 * Check lowercase s, l, i and m - special cases.
1699 * "j" - often a semi-colon gone wrong.
1700 * "d" for a missing apostrophe - he d
1703 if (!istypo && len==1 &&
1704 g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
1708 dupcnt=g_tree_lookup(qword,testword);
1712 isdup=!pswit[VERBOSE_SWITCH];
1716 dupcnt=g_new0(int,1);
1717 g_tree_insert(qword,g_strdup(testword),dupcnt);
1722 if (pswit[ECHO_SWITCH])
1723 g_print("\n%s\n",aline);
1724 if (!pswit[OVERVIEW_SWITCH])
1726 g_print(" Line %ld column %ld - Query word %s",
1727 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
1729 if (!pswit[VERBOSE_SWITCH])
1730 g_print(" - not reporting duplicates");
1738 /* check the user's list of typos */
1739 if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
1741 if (pswit[ECHO_SWITCH])
1742 g_print("\n%s\n",aline);
1743 if (!pswit[OVERVIEW_SWITCH])
1744 g_print(" Line %ld column %ld - Query possible scanno %s\n",
1745 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
1747 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
1749 if (pswit[PARANOID_SWITCH] && warnings->digit)
1751 /* In paranoid mode, query all 0 and 1 standing alone. */
1752 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
1754 if (pswit[ECHO_SWITCH])
1755 g_print("\n%s\n",aline);
1756 if (!pswit[OVERVIEW_SWITCH])
1757 g_print(" Line %ld column %ld - Query standalone %s\n",
1758 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
1769 * check_for_misspaced_punctuation:
1771 * Look for added or missing spaces around punctuation and quotes.
1772 * If there is a punctuation character like ! with no space on
1773 * either side, suspect a missing!space. If there are spaces on
1774 * both sides , assume a typo. If we see a double quote with no
1775 * space or punctuation on either side of it, assume unspaced
1776 * quotes "like"this.
/*
 * check_for_misspaced_punctuation:
 *
 * Makes several passes over the line, each looking at a character
 * together with its neighbours (pc before, nc after):
 *   1. For the characters . ? ! , ; : _ a "Missing space?" query when
 *      there are letters on both sides, or when one of ? ! , ; : is
 *      followed by a letter; and a "Spaced punctuation?" query when
 *      there is a space before and a space or end of line after,
 *      unless isemptyline is set or the text looks like an ellipsis.
 *      Acronym and ellipsis detection look two characters back and
 *      two characters ahead for a '.'.
 *   2. For ? ! , ; : a "Spaced punctuation?" query when preceded by a
 *      space and not followed by one (the both-sides case having been
 *      reported by pass 1), unless isemptyline is set.
 *   3. A "Spaced punctuation?" query for a period that follows a space
 *      and is followed by a letter.
 *   4. An "Unspaced quotes?" query based on pc and nc not belonging to
 *      the listed sets of spacing and punctuation characters.
 *   5. A parity walk that flips parities->dquote and issues
 *      "Wrongspaced quotes?" depending on nc, using one set of
 *      acceptable characters when the flag has just become false and
 *      another (letters, digits and opening punctuation, and not end
 *      of line) otherwise.
 *   6. A "Wrongspaced quotes?" query for column 1 when the line starts
 *      with a double quote followed by one of , ; : ! ? ) ] } or a
 *      space.
 *   7. Under SQUOTE_SWITCH, the same parity walk for single quotes
 *      using parities->squote; a quote only counts when it is at the
 *      start of the line, not preceded by a letter, or not followed by
 *      a letter.
 *
 * aline: the current line, as UTF-8.
 * parities: open/closed state of double and single quotes; updated
 *           here.
 * isemptyline: suppresses the spaced-punctuation queries of passes 1
 *              and 2 when set.
 *
 * Each query echoes the line under ECHO_SWITCH and is printed unless
 * OVERVIEW_SWITCH is set; columns are 1-based character offsets of s.
 *
 * NOTE(review): the statements that advance pc and c in each loop, the
 * tests selecting the period (pass 3) and the quote characters (passes
 * 4 and 5), and the lines setting isacro/isellipsis are missing from
 * this listing.
 * NOTE(review): isdigit() is applied to the gunichar nc in passes 5
 * and 7; arguments outside the unsigned char range are undefined for
 * the ctype functions, so g_unichar_isdigit() looks safer - TODO
 * confirm.
 */
1778 void check_for_misspaced_punctuation(const char *aline,
1779 struct parities *parities,gboolean isemptyline)
1781 gboolean isacro,isellipsis;
1783 gunichar c,nc,pc,n2c;
1784 c=g_utf8_get_char(aline);
1785 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1786 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1790 nc=g_utf8_get_char(g_utf8_next_char(s));
1791 /* For each character in the line after the first. */
1792 if (g_utf8_strchr(".?!,;:_",-1,c)) /* if it's punctuation */
1794 /* we need to suppress warnings for acronyms like M.D. */
1796 /* we need to suppress warnings for ellipsis . . . */
1799 * If there are letters on both sides of it or
1800 * if it's strict punctuation followed by an alpha.
1802 if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
1803 g_utf8_strchr("?!,;:",-1,c)))
1807 if (g_utf8_pointer_to_offset(aline,s)>2 &&
1808 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
1810 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
1816 if (pswit[ECHO_SWITCH])
1817 g_print("\n%s\n",aline);
1818 if (!pswit[OVERVIEW_SWITCH])
1819 g_print(" Line %ld column %ld - Missing space?\n",
1820 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1825 if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
1828 * If there are spaces on both sides,
1829 * or space before and end of line.
1833 if (g_utf8_pointer_to_offset(aline,s)>2 &&
1834 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
1836 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
1840 if (!isemptyline && !isellipsis)
1842 if (pswit[ECHO_SWITCH])
1843 g_print("\n%s\n",aline);
1844 if (!pswit[OVERVIEW_SWITCH])
1845 g_print(" Line %ld column %ld - "
1846 "Spaced punctuation?\n",linecnt,
1847 g_utf8_pointer_to_offset(aline,s)+1);
1854 /* Split out the characters that CANNOT be preceded by space. */
1855 c=g_utf8_get_char(aline);
1856 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1857 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1861 nc=g_utf8_get_char(g_utf8_next_char(s));
1862 /* for each character in the line after the first */
1863 if (g_utf8_strchr("?!,;:",-1,c))
1865 /* if it's punctuation that _cannot_ have a space before it */
1866 if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
1869 * If nc DOES == space,
1870 * it was already reported just above.
1872 if (pswit[ECHO_SWITCH])
1873 g_print("\n%s\n",aline);
1874 if (!pswit[OVERVIEW_SWITCH])
1875 g_print(" Line %ld column %ld - Spaced punctuation?\n",
1876 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1883 * Special case " .X" where X is any alpha.
1884 * This plugs a hole in the acronym code above.
1885 * Inelegant, but maintainable.
1887 c=g_utf8_get_char(aline);
1888 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1889 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1893 nc=g_utf8_get_char(g_utf8_next_char(s));
1894 /* for each character in the line after the first */
1897 /* if it's a period */
1898 if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
1901 * If the period follows a space and
1902 * is followed by a letter.
1904 if (pswit[ECHO_SWITCH])
1905 g_print("\n%s\n",aline);
1906 if (!pswit[OVERVIEW_SWITCH])
1907 g_print(" Line %ld column %ld - Spaced punctuation?\n",
1908 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1914 c=g_utf8_get_char(aline);
1915 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1916 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1920 nc=g_utf8_get_char(g_utf8_next_char(s));
1921 /* for each character in the line after the first */
1924 if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
1925 !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
1926 !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
1928 if (pswit[ECHO_SWITCH])
1929 g_print("\n%s\n",aline);
1930 if (!pswit[OVERVIEW_SWITCH])
1931 g_print(" Line %ld column %ld - Unspaced quotes?\n",
1932 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1938 /* Check parity of quotes. */
1939 nc=g_utf8_get_char(aline);
1940 for (s=aline;*s;s=g_utf8_next_char(s))
1943 nc=g_utf8_get_char(g_utf8_next_char(s));
1946 parities->dquote=!parities->dquote;
1947 if (!parities->dquote)
1950 if (!g_utf8_strchr("_-.'`/,;:!?)]} ",-1,nc))
1952 if (pswit[ECHO_SWITCH])
1953 g_print("\n%s\n",aline);
1954 if (!pswit[OVERVIEW_SWITCH])
1955 g_print(" Line %ld column %ld - "
1956 "Wrongspaced quotes?\n",
1957 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1965 if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
1966 !g_utf8_strchr("_-/.'`([{$",-1,nc) || !nc)
1968 if (pswit[ECHO_SWITCH])
1969 g_print("\n%s\n",aline);
1970 if (!pswit[OVERVIEW_SWITCH])
1971 g_print(" Line %ld column %ld - "
1972 "Wrongspaced quotes?\n",
1973 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1980 if (g_utf8_get_char(aline)==CHAR_DQUOTE)
1982 if (g_utf8_strchr(",;:!?)]} ",-1,
1983 g_utf8_get_char(g_utf8_next_char(aline))))
1985 if (pswit[ECHO_SWITCH])
1986 g_print("\n%s\n",aline);
1987 if (!pswit[OVERVIEW_SWITCH])
1988 g_print(" Line %ld column 1 - Wrongspaced quotes?\n",
1994 if (pswit[SQUOTE_SWITCH])
1996 nc=g_utf8_get_char(aline);
1997 for (s=aline;*s;s=g_utf8_next_char(s))
2000 nc=g_utf8_get_char(g_utf8_next_char(s));
2001 if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&
2002 !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
2003 !g_unichar_isalpha(nc)))
2005 parities->squote=!parities->squote;
2006 if (!parities->squote)
2009 if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
2011 if (pswit[ECHO_SWITCH])
2012 g_print("\n%s\n",aline);
2013 if (!pswit[OVERVIEW_SWITCH])
2014 g_print(" Line %ld column %ld - "
2015 "Wrongspaced singlequotes?\n",
2016 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2024 if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
2025 !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
2027 if (pswit[ECHO_SWITCH])
2028 g_print("\n%s\n",aline);
2029 if (!pswit[OVERVIEW_SWITCH])
2030 g_print(" Line %ld column %ld - "
2031 "Wrongspaced singlequotes?\n",
2032 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2043 * check_for_double_punctuation:
2045 * Look for double punctuation like ,. or ,,
2046 * Thanks to DW for the suggestion!
2047 * In books with references, ".," and ".;" are common
2048 * e.g. "etc., etc.," and vol. 1.; vol 3.;
2049 * OTOH, from my initial tests, there are also fairly
2050 * common errors. What to do? Make these cases paranoid?
2051 * ".," is the most common, so warnings->dotcomma is used
2052 * to suppress detailed reporting if it occurs often.
/*
 * check_for_double_punctuation:
 *
 * Walks the line looking for a character from . ? ! , ; : that is
 * directly followed by another character from the same set. Such a
 * pair is left alone when:
 *   - both characters are the same and are '.', '?' or '!';
 *   - the pair is "., " style dot-comma and warnings->dotcomma is not
 *     set;
 *   - warnings->isFrench is set and the text at s starts with an
 *     ellipsis joined to one of , ; : ! ? (either order).
 * Any other pair gets a "Double punctuation?" query.
 *
 * aline: the current line, as UTF-8.
 * warnings: first-pass results; dotcomma and isFrench are the members
 *           read in the lines shown.
 *
 * Under ECHO_SWITCH the line is echoed; unless OVERVIEW_SWITCH is set
 * the query is printed with the 1-based character column of s.
 *
 * NOTE(review): the French ellipsis test is written out twice; the
 * second copy presumably guards extra stepping of s past the ellipsis,
 * but those statements and the updates of c are on lines missing from
 * this listing.
 */
2054 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
2058 nc=g_utf8_get_char(aline);
2059 for (s=aline;*s;s=g_utf8_next_char(s))
2062 nc=g_utf8_get_char(g_utf8_next_char(s));
2063 /* for each punctuation character in the line */
2064 if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
2065 g_utf8_strchr(".?!,;:",-1,nc))
2067 /* followed by punctuation, it's a query, unless . . . */
2068 if (c==nc && (c=='.' || c=='?' || c=='!') ||
2069 !warnings->dotcomma && c=='.' && nc==',' ||
2070 warnings->isFrench && g_str_has_prefix(s,",...") ||
2071 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2072 warnings->isFrench && g_str_has_prefix(s,";...") ||
2073 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2074 warnings->isFrench && g_str_has_prefix(s,":...") ||
2075 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2076 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2077 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2078 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2079 warnings->isFrench && g_str_has_prefix(s,"...?"))
2081 if (warnings->isFrench && g_str_has_prefix(s,",...") ||
2082 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2083 warnings->isFrench && g_str_has_prefix(s,";...") ||
2084 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2085 warnings->isFrench && g_str_has_prefix(s,":...") ||
2086 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2087 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2088 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2089 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2090 warnings->isFrench && g_str_has_prefix(s,"...?"))
2093 nc=g_utf8_get_char(g_utf8_next_char(s));
2095 ; /* do nothing for .. !! and ?? which can be legit */
2099 if (pswit[ECHO_SWITCH])
2100 g_print("\n%s\n",aline);
2101 if (!pswit[OVERVIEW_SWITCH])
2102 g_print(" Line %ld column %ld - Double punctuation?\n",
2103 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2112 * check_for_spaced_quotes:
/*
 * check_for_spaced_quotes:
 *
 * Queries quote marks that have a space on both sides.
 *   - Every space, double quote, space sequence found from s onwards
 *     gets a "Spaced doublequote?" query; the search resumes two
 *     characters after each match.
 *   - For each entry of the single_quotes array a pattern of space,
 *     that quote character and space is built in a GString, and every
 *     match gets a "Spaced singlequote?" query in the same way.
 * The GString is freed, including its character data, once all quote
 * characters have been tried.
 *
 * aline: the current line, as UTF-8.
 *
 * Under ECHO_SWITCH the line is echoed; unless OVERVIEW_SWITCH is set a
 * query is printed with the 1-based character column of the match.
 *
 * NOTE(review): the initialisation of s before each search and the
 * rest of the single_quotes initialiser are on lines missing from this
 * listing; presumably s is reset to aline for every pattern.
 */
2114 void check_for_spaced_quotes(const char *aline)
2118 const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,
2122 while ((t=strstr(s," \" ")))
2124 if (pswit[ECHO_SWITCH])
2125 g_print("\n%s\n",aline);
2126 if (!pswit[OVERVIEW_SWITCH])
2127 g_print(" Line %ld column %ld - Spaced doublequote?\n",
2128 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2131 s=g_utf8_next_char(g_utf8_next_char(t));
2133 pattern=g_string_new(NULL);
2134 for(i=0;i<G_N_ELEMENTS(single_quotes);i++)
2136 g_string_assign(pattern," ");
2137 g_string_append_unichar(pattern,single_quotes[i]);
2138 g_string_append_c(pattern,' ');
2140 while ((t=strstr(s,pattern->str)))
2142 if (pswit[ECHO_SWITCH])
2143 g_print("\n%s\n",aline);
2144 if (!pswit[OVERVIEW_SWITCH])
2145 g_print(" Line %ld column %ld - Spaced singlequote?\n",
2146 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2149 s=g_utf8_next_char(g_utf8_next_char(t));
2152 g_string_free(pattern,TRUE);
2156 * check_for_miscased_genative:
2158 * Check special case of 'S instead of 's at end of word.
/*
 * check_for_miscased_genative:
 *
 * Walks the line from its second character and queries an apostrophe
 * (c) that follows a lower-case letter (pc) and is followed by a
 * capital S (nc).
 *
 * aline: the current line, as UTF-8.
 *
 * Under ECHO_SWITCH the line is echoed; unless OVERVIEW_SWITCH is set a
 * query is printed. The column is the character offset of s plus 2,
 * which is presumably the 1-based column of the S rather than of the
 * apostrophe.
 *
 * NOTE(review): the statements that move c into pc and nc into c on
 * each step are on lines missing from this listing.
 */
2160 void check_for_miscased_genative(const char *aline)
2166 c=g_utf8_get_char(aline);
2167 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2168 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2172 nc=g_utf8_get_char(g_utf8_next_char(s));
2173 if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))
2175 if (pswit[ECHO_SWITCH])
2176 g_print("\n%s\n",aline);
2177 if (!pswit[OVERVIEW_SWITCH])
2178 g_print(" Line %ld column %ld - Capital \"S\"?\n",
2179 linecnt,g_utf8_pointer_to_offset(aline,s)+2);
2187 * check_end_of_line:
2189 * Now check special cases - start and end of line -
2190 * for single and double quotes. Start is sometimes [sic]
2191 * but better to query it anyway.
2192 * While we're here, check for dash at end of line.
/*
 * check_end_of_line:
 *
 * Checks the two ends of a line that holds more than one character:
 *   - a double or single quote as the last character with a space
 *     before it gets a "Spaced quote?" query whose column is the
 *     character length of the line;
 *   - a single quote as the first character with a space after it
 *     gets a "Spaced quote?" query for column 1;
 *   - under PARANOID_SWITCH with warnings->hyphen set, s is moved back
 *     from the last character over anything not above CHAR_SPACE, and
 *     a '-' found there that is not preceded by another '-' gets a
 *     "Hyphen at end of line?" query.
 *
 * aline: the current line, as UTF-8.
 * warnings: first-pass results; hyphen is the only member read in the
 *           lines shown.
 *
 * Each query echoes the line under ECHO_SWITCH and is printed unless
 * OVERVIEW_SWITCH is set.
 *
 * NOTE(review): the hyphen query prints the 0-based character offset
 * of s (no +1); confirm whether that is intended.
 * NOTE(review): if the backward walk stops at the first character and
 * that character is '-', g_utf8_prev_char(s) would look before the
 * start of the line; no guard is visible in this listing - TODO
 * confirm.
 */
2194 void check_end_of_line(const char *aline,struct warnings *warnings)
2199 lbytes=strlen(aline);
2200 if (g_utf8_strlen(aline,lbytes)>1)
2202 s=g_utf8_prev_char(aline+lbytes);
2203 c1=g_utf8_get_char(s);
2204 c2=g_utf8_get_char(g_utf8_prev_char(s));
2205 if ((c1==CHAR_DQUOTE || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)
2207 if (pswit[ECHO_SWITCH])
2208 g_print("\n%s\n",aline);
2209 if (!pswit[OVERVIEW_SWITCH])
2210 g_print(" Line %ld column %ld - Spaced quote?\n",linecnt,
2211 g_utf8_strlen(aline,lbytes));
2215 c1=g_utf8_get_char(aline);
2216 c2=g_utf8_get_char(g_utf8_next_char(aline));
2217 if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)
2219 if (pswit[ECHO_SWITCH])
2220 g_print("\n%s\n",aline);
2221 if (!pswit[OVERVIEW_SWITCH])
2222 g_print(" Line %ld column 1 - Spaced quote?\n",linecnt);
2227 * Dash at end of line may well be legit - paranoid mode only
2228 * and don't report em-dash at line-end.
2230 if (pswit[PARANOID_SWITCH] && warnings->hyphen)
2232 for (s=g_utf8_prev_char(aline+lbytes);
2233 s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
2235 if (g_utf8_get_char(s)=='-' &&
2236 g_utf8_get_char(g_utf8_prev_char(s))!='-')
2238 if (pswit[ECHO_SWITCH])
2239 g_print("\n%s\n",aline);
2240 if (!pswit[OVERVIEW_SWITCH])
2241 g_print(" Line %ld column %ld - "
2242 "Hyphen at end of line?\n",
2243 linecnt,g_utf8_pointer_to_offset(aline,s));
2250 * check_for_unspaced_bracket:
2252 * Brackets are often unspaced, but shouldn't be surrounded by alpha.
2253 * If so, suspect a scanno like "a]most".
/*
 * check_for_unspaced_bracket:
 *
 * Walks the line from its second character and queries any of the
 * bracket characters { [ ( ) ] } that has a letter directly before it
 * (pc) and a letter directly after it (nc). The loop runs only while
 * nc is non-zero, so the first and last characters of the line are
 * never the bracket under test.
 *
 * aline: the current line, as UTF-8.
 *
 * Under ECHO_SWITCH the line is echoed; unless OVERVIEW_SWITCH is set a
 * query is printed.
 *
 * NOTE(review): the column printed is the 0-based character offset of
 * s (no +1); confirm whether that is intended.
 * NOTE(review): the statements that move c into pc and nc into c on
 * each step are on lines missing from this listing.
 */
2255 void check_for_unspaced_bracket(const char *aline)
2259 c=g_utf8_get_char(aline);
2260 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2261 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2265 nc=g_utf8_get_char(g_utf8_next_char(s));
2268 /* for each bracket character in the line except 1st & last */
2269 if (g_utf8_strchr("{[()]}",-1,c) &&
2270 g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
2272 if (pswit[ECHO_SWITCH])
2273 g_print("\n%s\n",aline);
2274 if (!pswit[OVERVIEW_SWITCH])
2275 g_print(" Line %ld column %ld - Unspaced bracket?\n",
2276 linecnt,g_utf8_pointer_to_offset(aline,s));
2284 * check_for_unpunctuated_endquote:
/*
 * check_for_unpunctuated_endquote:
 *
 * Walks the line from its second character and queries a double quote
 * (c) whose preceding character (pc) is a letter, i.e. a quote that
 * closes straight after a word with no punctuation in between.
 *
 * aline: the current line, as UTF-8.
 *
 * Under ECHO_SWITCH the line is echoed; unless OVERVIEW_SWITCH is set a
 * query is printed with the 0-based character offset of s as the
 * column.
 *
 * NOTE(review): isalpha() is applied to the gunichar pc; arguments
 * outside the unsigned char range are undefined for the ctype
 * functions, and the other checks here use g_unichar_isalpha() - TODO
 * confirm and align.
 * NOTE(review): the statements that move c into pc and nc into c on
 * each step are on lines missing from this listing.
 */
2286 void check_for_unpunctuated_endquote(const char *aline)
2290 c=g_utf8_get_char(aline);
2291 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2292 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2296 nc=g_utf8_get_char(g_utf8_next_char(s));
2297 /* for each character in the line except 1st */
2298 if (c==CHAR_DQUOTE && isalpha(pc))
2300 if (pswit[ECHO_SWITCH])
2301 g_print("\n%s\n",aline);
2302 if (!pswit[OVERVIEW_SWITCH])
2303 g_print(" Line %ld column %ld - "
2304 "endquote missing punctuation?\n",
2305 linecnt,g_utf8_pointer_to_offset(aline,s));
2313 * check_for_html_tag:
2315 * Check for <HTML TAG>.
2317 * If there is a < in the line, followed at some point
2318 * by a > then we suspect HTML.
/*
 * check_for_html_tag:
 *
 * Finds the first '<' on the line and then the first '>' after it.
 * The text from '<' to '>' inclusive is copied with g_strndup() and
 * shown in an "HTML Tag?" query.
 *
 * aline: the current line, as UTF-8.
 *
 * Under ECHO_SWITCH the line is echoed; unless OVERVIEW_SWITCH is set
 * the query is printed with the 1-based character column of the '<'.
 *
 * NOTE(review): the tests on open and close that guard the report, and
 * the release of tag, are on lines missing from this listing.
 */
2320 void check_for_html_tag(const char *aline)
2322 const char *open,*close;
2324 open=strchr(aline,'<');
2327 close=strchr(g_utf8_next_char(open),'>');
2330 if (pswit[ECHO_SWITCH])
2331 g_print("\n%s\n",aline);
2332 if (!pswit[OVERVIEW_SWITCH])
2334 tag=g_strndup(open,close-open+1);
2335 g_print(" Line %ld column %ld - HTML Tag? %s \n",
2336 linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
2346 * check_for_html_entity:
2348 * Check for &symbol; HTML.
2350 * If there is a & in the line, followed at
2351 * some point by a ; then we suspect HTML.
/*
 * check_for_html_entity:
 *
 * Finds the first '&' on the line and then the first ';' at or after
 * it. The characters between them are scanned for a space; the scan
 * stops at the first one so that text such as "Jones & Son;" is not
 * reported. The text from '&' to ';' inclusive is copied with
 * g_strndup() and shown in an "HTML symbol?" query.
 *
 * aline: the current line, as UTF-8.
 *
 * Under ECHO_SWITCH the line is echoed; unless OVERVIEW_SWITCH is set
 * the query is printed.
 *
 * NOTE(review): the column is computed from the byte distance
 * amp-aline, whereas check_for_html_tag() uses
 * g_utf8_pointer_to_offset(); on a line with multi-byte characters
 * before the '&' the two would disagree - TODO confirm and align.
 * NOTE(review): the tests on amp, scolon and the scan result that
 * guard the report, and the release of entity, are on lines missing
 * from this listing.
 */
2353 void check_for_html_entity(const char *aline)
2355 const char *s,*amp,*scolon;
2357 amp=strchr(aline,'&');
2360 scolon=strchr(amp,';');
2363 for (s=amp;s<scolon;s=g_utf8_next_char(s))
2364 if (g_utf8_get_char(s)==CHAR_SPACE)
2365 break; /* Don't report "Jones & Son;" */
2368 if (pswit[ECHO_SWITCH])
2369 g_print("\n%s\n",aline);
2370 if (!pswit[OVERVIEW_SWITCH])
2372 entity=g_strndup(amp,scolon-amp+1);
2373 g_print(" Line %ld column %d - HTML symbol? %s \n",
2374 linecnt,(int)(amp-aline)+1,entity);
2385 * check_for_omitted_punctuation:
2387 * Check for omitted punctuation at end of paragraph by working back
2388 * through prevline. DW.
2389 * Need to check this only for "normal" paras.
2390 * So what is a "normal" para?
2391 * Not normal if one-liner (chapter headings, etc.)
2392 * Not normal if it doesn't contain at least one lowercase letter
2393 * Not normal if starts with space
/*
 * check_for_omitted_punctuation:
 *
 * Decides whether the paragraph that ended on prevline finished
 * without punctuation.
 *   - prevline is first scanned for a letter (letter_on_line).
 *   - The check proceeds only when a letter was found, last->blen is
 *     above 2, the paragraph started before line linecnt-1 and the
 *     first character of prevline is above CHAR_SPACE.
 *   - s is moved back from the end of prevline past closing quotes.
 *   - s is then walked back towards the start of the line: reaching a
 *     letter leads to the "No punctuation at para end?" report, and
 *     reaching one of - . : ! ( [ { ? } ] ) is tested separately
 *     (presumably ending the walk without a report).
 *
 * prevline: the last line of the paragraph, as UTF-8; this parameter
 *           has the same name as the prevline used by
 *           check_for_short_line().
 * last: properties recorded for earlier lines; blen is the only member
 *       read in the lines shown.
 * start_para_line: line number on which the paragraph started.
 *
 * Under ECHO_SWITCH prevline is echoed; unless OVERVIEW_SWITCH is set a
 * query is printed for line linecnt-1 with the character length of
 * prevline as the column.
 *
 * NOTE(review): the statements following each test inside the two
 * loops (presumably "break") are on lines missing from this listing.
 */
2395 void check_for_omitted_punctuation(const char *prevline,
2396 struct line_properties *last,int start_para_line)
2398 gboolean letter_on_line=FALSE;
2401 for (s=prevline;*s;s=g_utf8_next_char(s))
2402 if (g_unichar_isalpha(g_utf8_get_char(s)))
2404 letter_on_line=TRUE;
2408 * This next "if" is a problem.
2409 * If we say "start_para_line <= linecnt - 1", that includes
2410 * one-line "paragraphs" like chapter heads. Lotsa false positives.
2411 * If we say "start_para_line < linecnt - 1" it doesn't, but then it
2412 * misses genuine one-line paragraphs.
2414 if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
2415 g_utf8_get_char(prevline)>CHAR_SPACE)
2417 s=prevline+strlen(prevline);
2420 s=g_utf8_prev_char(s);
2421 c=g_utf8_get_char(s);
2422 } while (CHAR_IS_CLOSING_QUOTE(c) && c>CHAR_SPACE && s>prevline);
2423 for (;s>prevline;s=g_utf8_prev_char(s))
2425 if (g_unichar_isalpha(g_utf8_get_char(s)))
2427 if (pswit[ECHO_SWITCH])
2428 g_print("\n%s\n",prevline);
2429 if (!pswit[OVERVIEW_SWITCH])
2430 g_print(" Line %ld column %ld - "
2431 "No punctuation at para end?\n",
2432 linecnt-1,g_utf8_strlen(prevline,-1));
2437 if (g_utf8_strchr("-.:!([{?}])",-1,g_utf8_get_char(s)))
/*
 * report_duplicate_queries:
 *
 * Callback taking a key, a value and a user-data pointer, and
 * returning gboolean. The key is treated as the queried word and a
 * note saying how many times it was duplicated is printed with
 * g_print().
 *
 * key: the queried word (a string).
 * value: presumably the int counter stored alongside the word - the
 *        line that reads it is missing from this listing.
 * data: not used in the lines shown.
 *
 * NOTE(review): the condition under which the note is printed, the
 * g_print() arguments and the return value are on lines missing from
 * this listing.
 */
2443 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
2445 const char *word=key;
2448 g_print("\nNote: Queried word %s was duplicated %d times\n",
/*
 * print_as_windows_1252:
 *
 * Converts a UTF-8 string to WINDOWS-1252 with g_iconv() and writes it
 * to stdout. The conversion descriptor is kept in a static variable,
 * opened on first use and reused afterwards; (GIConv)-1 marks it as
 * not open. One branch closes an open descriptor and resets the
 * marker (presumably the branch taken for a NULL string - TODO
 * confirm). The output buffer is allocated with strlen(string)+1
 * bytes. A plain fputs() of the string is also present, presumably as
 * the fallback when the converter cannot be opened.
 *
 * string: UTF-8 text to print.
 *
 * NOTE(review): the tests that select between these branches, the
 * write of the converted buffer and its release are on lines missing
 * from this listing; no check of the g_iconv() result is visible.
 */
2453 void print_as_windows_1252(const char *string)
2455 gsize inbytes,outbytes;
2457 static GIConv converter=(GIConv)-1;
2460 if (converter!=(GIConv)-1)
2461 g_iconv_close(converter);
2462 converter=(GIConv)-1;
2465 if (converter==(GIConv)-1)
2466 converter=g_iconv_open("WINDOWS-1252","UTF-8");
2467 if (converter!=(GIConv)-1)
2469 inbytes=outbytes=strlen(string);
2470 bp=buf=g_malloc(outbytes+1);
2471 g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
2477 fputs(string,stdout);
/*
 * print_as_utf_8:
 *
 * Writes the string to stdout unchanged with fputs().
 *
 * string: text to print; it is passed straight to fputs(), so it must
 *         not be NULL.
 */
2480 void print_as_utf_8(const char *string)
2482 fputs(string,stdout);
2490 void procfile(const char *filename)
2493 gchar *parastart=NULL; /* first line of current para */
2494 gchar *etext,*aline;
2497 struct first_pass_results *first_pass_results;
2498 struct warnings *warnings;
2499 struct counters counters={0};
2500 struct line_properties last={0};
2501 struct parities parities={0};
2502 struct pending pending={0};
2503 gboolean isemptyline;
2504 long start_para_line=0;
2505 gboolean isnewpara=FALSE,enddash=FALSE;
2506 last.start=CHAR_SPACE;
2507 linecnt=checked_linecnt=0;
2508 etext=read_etext(filename,&err);
2511 if (pswit[STDOUT_SWITCH])
2512 fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
2514 fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
2517 if (g_path_is_absolute(filename))
2518 g_print("\n\nFile: %s\n\n",filename);
2522 cwd=g_get_current_dir();
2523 path=g_build_filename(cwd,filename,NULL);
2525 g_print("\n\nFile: %s\n\n",path);
2528 first_pass_results=first_pass(etext);
2529 warnings=report_first_pass(first_pass_results);
2530 qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
2531 qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
2533 * Here we go with the main pass. Hold onto yer hat!
2537 while ((aline=flgets(&etext_ptr,linecnt+1)))
2542 if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
2543 continue; // skip DP page separators completely
2544 if (linecnt<first_pass_results->firstline ||
2545 (first_pass_results->footerline>0 &&
2546 linecnt>first_pass_results->footerline))
2548 if (pswit[HEADER_SWITCH])
2550 if (g_str_has_prefix(aline,"Title:"))
2551 g_print(" %s\n",aline);
2552 if (g_str_has_prefix(aline,"Author:"))
2553 g_print(" %s\n",aline);
2554 if (g_str_has_prefix(aline,"Release Date:"))
2555 g_print(" %s\n",aline);
2556 if (g_str_has_prefix(aline,"Edition:"))
2557 g_print(" %s\n\n",aline);
2559 continue; /* skip through the header */
2562 print_pending(aline,parastart,&pending);
2563 isemptyline=analyse_quotes(aline,&counters);
2564 if (isnewpara && !isemptyline)
2566 /* This line is the start of a new paragraph. */
2567 start_para_line=linecnt;
2568 /* Capture its first line in case we want to report it later. */
2570 parastart=g_strdup(aline);
2571 memset(&parities,0,sizeof(parities)); /* restart the quote count */
2573 while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
2574 !g_unichar_isdigit(g_utf8_get_char(s)))
2575 s=g_utf8_next_char(s);
2576 if (g_unichar_islower(g_utf8_get_char(s)))
2578 /* and its first letter is lowercase */
2579 if (pswit[ECHO_SWITCH])
2580 g_print("\n%s\n",aline);
2581 if (!pswit[OVERVIEW_SWITCH])
2582 g_print(" Line %ld column %ld - "
2583 "Paragraph starts with lower-case\n",
2584 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2588 isnewpara=FALSE; /* Signal the end of new para processing. */
2590 /* Check for an em-dash broken at line end. */
2591 if (enddash && g_utf8_get_char(aline)=='-')
2593 if (pswit[ECHO_SWITCH])
2594 g_print("\n%s\n",aline);
2595 if (!pswit[OVERVIEW_SWITCH])
2596 g_print(" Line %ld column 1 - Broken em-dash?\n",linecnt);
2601 for (s=g_utf8_prev_char(aline+strlen(aline));
2602 g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
2604 if (s>=aline && g_utf8_get_char(s)=='-')
2606 check_for_control_characters(aline);
2608 check_for_odd_characters(aline,warnings,isemptyline);
2609 if (warnings->longline)
2610 check_for_long_line(aline);
2611 if (warnings->shortline)
2612 check_for_short_line(aline,&last);
2614 last.len=g_utf8_strlen(aline,-1);
2615 last.start=g_utf8_get_char(aline);
2616 check_for_starting_punctuation(aline);
2619 check_for_spaced_emdash(aline);
2620 check_for_spaced_dash(aline);
2622 check_for_unmarked_paragraphs(aline);
2623 check_for_jeebies(aline);
2624 check_for_mta_from(aline);
2625 check_for_orphan_character(aline);
2626 check_for_pling_scanno(aline);
2627 check_for_extra_period(aline,warnings);
2628 check_for_following_punctuation(aline);
2629 check_for_typos(aline,warnings);
2630 check_for_misspaced_punctuation(aline,&parities,isemptyline);
2631 check_for_double_punctuation(aline,warnings);
2632 check_for_spaced_quotes(aline);
2633 check_for_miscased_genative(aline);
2634 check_end_of_line(aline,warnings);
2635 check_for_unspaced_bracket(aline);
2636 if (warnings->endquote)
2637 check_for_unpunctuated_endquote(aline);
2638 check_for_html_tag(aline);
2639 check_for_html_entity(aline);
2642 check_for_mismatched_quotes(&counters,&pending);
2643 counters_reset(&counters);
2644 /* let the next iteration know that it's starting a new para */
2647 check_for_omitted_punctuation(prevline,&last,start_para_line);
2650 prevline=g_strdup(aline);
2653 check_for_mismatched_quotes(&counters,&pending);
2654 print_pending(NULL,parastart,&pending);
2655 reset_pending(&pending);
2664 if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])
2665 g_tree_foreach(qword,report_duplicate_queries,NULL);
2666 g_tree_unref(qword);
2667 g_tree_unref(qperiod);
2668 counters_destroy(&counters);
2669 g_set_print_handler(NULL);
2670 print_as_windows_1252(NULL);
2671 if (pswit[MARKUP_SWITCH])
2678 * Get one line from the input text, checking for
2679 * the existence of exactly one CR/LF line-end per line.
2681 * Returns: a pointer to the line.
/*
 * Reads characters starting at *etext, advancing *etext past each one;
 * theline keeps the address at which this line began. lcnt is used only
 * as the line number shown in the diagnostics printed below.
 * NOTE(review): several short lines of this function (braces and brief
 * statements) are not visible in this listing, so the control flow joining
 * the fragments below is deliberately not described -- confirm against the
 * complete source before relying on these notes.
 */
2683 char *flgets(char **etext,long lcnt)
2686 gboolean isCR=FALSE;
2687 char *theline=*etext;
/* Fetch the character at the read position, then step over it. */
2692 c=g_utf8_get_char(*etext);
2693 *etext=g_utf8_next_char(*etext);
2696 /* either way, it's end of line */
/*
 * Each of the three line-end reports that follow is made only when
 * pswit[LINE_END_SWITCH] is set. With pswit[ECHO_SWITCH] the text between
 * theline and eos is copied and echoed first; the message itself is
 * printed unless pswit[OVERVIEW_SWITCH] is set.
 */
2703 /* Error - a LF without a preceding CR */
2704 if (pswit[LINE_END_SWITCH])
2706 if (pswit[ECHO_SWITCH])
2708 s=g_strndup(theline,eos-theline);
2709 g_print("\n%s\n",s);
2712 if (!pswit[OVERVIEW_SWITCH])
2713 g_print(" Line %ld - No CR?\n",lcnt);
2724 /* Error - two successive CRs */
2725 if (pswit[LINE_END_SWITCH])
2727 if (pswit[ECHO_SWITCH])
2729 s=g_strndup(theline,eos-theline);
2730 g_print("\n%s\n",s);
2733 if (!pswit[OVERVIEW_SWITCH])
2734 g_print(" Line %ld - Two successive CRs?\n",lcnt);
/* This report additionally requires isCR to be set. */
2743 if (pswit[LINE_END_SWITCH] && isCR)
2745 if (pswit[ECHO_SWITCH])
2747 s=g_strndup(theline,eos-theline);
2748 g_print("\n%s\n",s);
2751 if (!pswit[OVERVIEW_SWITCH])
/* The column is the 1-based character (not byte) offset of eos in the line. */
2752 g_print(" Line %ld column %ld - CR without LF?\n",
2753 lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
2759 eos=g_utf8_next_char(eos);
/* Optional in-place clean-up of the finished line before it is handed back. */
2763 if (pswit[MARKUP_SWITCH])
2764 postprocess_for_HTML(theline);
2765 if (pswit[DP_SWITCH])
2766 postprocess_for_DP(theline);
2773 * Takes a "word" as a parameter, and checks whether it
2774 * contains a mixture of alpha and digits. Generally, this is an
2775 * error, but may not be for cases like 4th or L5 12s. 3d.
2777 * Returns: TRUE iff an is error found.
2779 gboolean mixdigit(const char *checkword)
2781 gboolean wehaveadigit,wehavealetter,query;
2782 const char *s,*nondigit;
2783 wehaveadigit=wehavealetter=query=FALSE;
2784 for (s=checkword;*s;s=g_utf8_next_char(s))
2785 if (g_unichar_isalpha(g_utf8_get_char(s)))
2787 else if (g_unichar_isdigit(g_utf8_get_char(s)))
2789 if (wehaveadigit && wehavealetter)
2791 /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
2793 for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
2794 nondigit=g_utf8_next_char(nondigit))
2796 /* digits, ending in st, rd, nd, th of either case */
2797 if (!g_ascii_strcasecmp(nondigit,"st") ||
2798 !g_ascii_strcasecmp(nondigit,"rd") ||
2799 !g_ascii_strcasecmp(nondigit,"nd") ||
2800 !g_ascii_strcasecmp(nondigit,"th"))
2802 if (!g_ascii_strcasecmp(nondigit,"sts") ||
2803 !g_ascii_strcasecmp(nondigit,"rds") ||
2804 !g_ascii_strcasecmp(nondigit,"nds") ||
2805 !g_ascii_strcasecmp(nondigit,"ths"))
2807 if (!g_ascii_strcasecmp(nondigit,"stly") ||
2808 !g_ascii_strcasecmp(nondigit,"rdly") ||
2809 !g_ascii_strcasecmp(nondigit,"ndly") ||
2810 !g_ascii_strcasecmp(nondigit,"thly"))
2812 /* digits, ending in l, L, s or d */
2813 if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
2814 !strcmp(nondigit,"d"))
2817 * L at the start of a number, representing Britsh pounds, like L500.
2818 * This is cute. We know the current word is mixed digit. If the first
2819 * letter is L, there must be at least one digit following. If both
2820 * digits and letters follow, we have a genuine error, else we have a
2821 * capital L followed by digits, and we accept that as a non-error.
2823 if (g_utf8_get_char(checkword)=='L' &&
2824 !mixdigit(g_utf8_next_char(checkword)))
2833 * Extracts the first/next "word" from the line, and returns it.
2834 * A word is defined as one English word unit--or at least that's the aim.
2835 * "ptr" is advanced to the position in the line where we will start
2836 * looking for the next word.
2838 * Returns: A newly-allocated string.
2840 gchar *getaword(const char **ptr)
2845 word=g_string_new(NULL);
2846 for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
2847 !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
2848 **ptr;*ptr=g_utf8_next_char(*ptr))
2851 * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
2852 * Especially yucky is the case of L1,000
2853 * This section looks for a pattern of characters including a digit
2854 * followed by a comma or period followed by one or more digits.
2855 * If found, it returns this whole pattern as a word; otherwise we discard
2856 * the results and resume our normal programming.
2859 for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
2860 g_unichar_isalpha(g_utf8_get_char(s)) ||
2861 g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
2862 g_string_append_unichar(word,g_utf8_get_char(s));
2865 for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))
2867 c=g_utf8_get_char(t);
2868 pc=g_utf8_get_char(g_utf8_prev_char(t));
2869 if ((c=='.' || c==',') && g_unichar_isdigit(pc))
2872 return g_string_free(word,FALSE);
2876 /* we didn't find a punctuated number - do the regular getword thing */
2877 g_string_truncate(word,0);
2878 c=g_utf8_get_char(*ptr);
2879 for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);
2880 *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))
2881 g_string_append_unichar(word,c);
2882 return g_string_free(word,FALSE);
2888 * Is this word a Roman Numeral?
2890 * It doesn't actually validate that the number is a valid Roman Numeral--for
2891 * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
2892 * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
2893 * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
2894 * expressions thereof, except when it came to taxes. Allow any number of M,
2895 * an optional D, an optional CM or CD, any number of optional Cs, an optional
2896 * XL or an optional XC, an optional IX or IV, an optional V and any number
2899 gboolean isroman(const char *t)
2905 while (g_utf8_get_char(t)=='m' && *t)
2907 if (g_utf8_get_char(t)=='d')
2909 if (g_str_has_prefix(t,"cm"))
2911 if (g_str_has_prefix(t,"cd"))
2913 while (g_utf8_get_char(t)=='c' && *t)
2915 if (g_str_has_prefix(t,"xl"))
2917 if (g_str_has_prefix(t,"xc"))
2919 if (g_utf8_get_char(t)=='l')
2921 while (g_utf8_get_char(t)=='x' && *t)
2923 if (g_str_has_prefix(t,"ix"))
2925 if (g_str_has_prefix(t,"iv"))
2927 if (g_utf8_get_char(t)=='v')
2929 while (g_utf8_get_char(t)=='i' && *t)
2935 * postprocess_for_DP:
2937 * Invoked with the -d switch from flgets().
2938 * It simply "removes" from the line a hard-coded set of common
2939 * DP-specific tags, so that the line passed to the main routine has
2940 * been pre-cleaned of DP markup.
2942 void postprocess_for_DP(char *theline)
2948 for (i=0;*DPmarkup[i];i++)
2949 while ((s=strstr(theline,DPmarkup[i])))
2951 t=s+strlen(DPmarkup[i]);
2952 memmove(s,t,strlen(t)+1);
/*
 * postprocess_for_HTML:
 *
 * Invoked with the -m switch from flgets().
 * Calls losemarkup() on the line until it returns NULL, then calls
 * loseentities() once, so that the line passed to the main routine has
 * been pre-cleaned of HTML.
 */
void postprocess_for_HTML(char *theline)
{
    char *removed;

    do
        removed=losemarkup(theline);
    while (removed);
    loseentities(theline);
}
2972 char *losemarkup(char *theline)
2976 s=strchr(theline,'<');
2977 t=s?strchr(s,'>'):NULL;
2980 for (i=0;*markup[i];i++)
2981 if (tagcomp(g_utf8_next_char(s),markup[i]))
2983 t=g_utf8_next_char(t);
2984 memmove(s,t,strlen(t)+1);
2987 /* It's an unrecognized <xxx>. */
2991 void loseentities(char *theline)
2998 GTree *entities=NULL;
2999 static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
3003 g_tree_destroy(entities);
3005 if (translit!=(GIConv)-1)
3006 g_iconv_close(translit);
3007 translit=(GIConv)-1;
3008 if (to_utf8!=(GIConv)-1)
3009 g_iconv_close(to_utf8);
3017 entities=g_tree_new((GCompareFunc)strcmp);
3018 for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
3019 g_tree_insert(entities,HTMLentities[i].name,
3020 GUINT_TO_POINTER(HTMLentities[i].c));
3022 if (translit==(GIConv)-1)
3023 translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
3024 if (to_utf8==(GIConv)-1)
3025 to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
3026 while((amp=strchr(theline,'&')))
3028 scolon=strchr(amp,';');
3033 if (amp+2+strspn(amp+2,"0123456789")==scolon)
3034 c=strtol(amp+2,NULL,10);
3035 else if (amp[2]=='x' &&
3036 amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
3037 c=strtol(amp+3,NULL,16);
3041 s=g_strndup(amp+1,scolon-(amp+1));
3042 c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
3051 if (c<128 || c>=192 && c<=255) /* An ISO-8859-1 character */
3052 theline+=g_unichar_to_utf8(c,theline);
3056 nb=g_unichar_to_utf8(c,s);
3057 t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
3059 s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
3061 memcpy(theline,s,nb);
3065 memmove(theline,g_utf8_next_char(scolon),
3066 strlen(g_utf8_next_char(scolon))+1);
3069 theline=g_utf8_next_char(amp);
3073 gboolean tagcomp(const char *strin,const char *basetag)
3077 if (g_utf8_get_char(strin)=='/')
3078 t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
3080 t=g_utf8_casefold(strin,-1);
3081 s=g_utf8_casefold(basetag,-1);
3082 retval=g_str_has_prefix(t,s);
3088 void proghelp(GOptionContext *context)
3091 fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
3092 fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
3093 fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
3094 fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
3095 "For details, read the file COPYING.\n",stderr);
3096 fputs("This is Free Software; "
3097 "you may redistribute it under certain conditions (GPL);\n",stderr);
3098 fputs("read the file COPYING for details.\n\n",stderr);
3099 help=g_option_context_get_help(context,TRUE,NULL);
3102 fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
3103 fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
3104 "non-ASCII\n",stderr);
3105 fputs("characters like accented letters, "
3106 "lines longer than 75 or shorter than 55,\n",stderr);
3107 fputs("unbalanced quotes or brackets, "
3108 "a variety of badly formatted punctuation, \n",stderr);
3109 fputs("HTML tags, some likely typos. "
3110 "It is NOT a substitute for human judgement.\n",stderr);