1 /*************************************************************************/
2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com> */
5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk> */
7 /* This program is free software; you can redistribute it and/or modify */
8 /* it under the terms of the GNU General Public License as published by */
9 /* the Free Software Foundation; either version 2 of the License, or */
10 /* (at your option) any later version. */
12 /* This program is distributed in the hope that it will be useful, */
13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
15 /* GNU General Public License for more details. */
17 /* You should have received a copy of the GNU General Public License */
18 /* along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 /*************************************************************************/
30 #include "bookloupe.h"
33 #include "HTMLentities.h"
39 "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
40 "nad", "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa",
41 "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
42 "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
43 "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
44 "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
45 "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
46 "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
47 "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
48 "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
49 "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
50 "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
51 "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
52 "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
53 "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
54 "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
55 "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
56 "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
57 "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
58 "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
59 "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
60 "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
61 "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
62 "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
63 "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
64 "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
65 "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
66 "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
67 "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
73 /* Common abbreviations and other OK words not to query as typos. */
75 "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
76 "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
77 "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
78 "outbid", "outbids", "frostbite", "frostbitten", ""
81 /* Common abbreviations that cause otherwise unexplained periods. */
83 "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
84 "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
88 * Two-Letter combinations that rarely if ever start words,
89 * but are common scannos or otherwise common letter combinations.
92 "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
96 * Two-Letter combinations that rarely if ever end words,
97 * but are common scannos or otherwise common letter combinations.
100 "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
101 "sw", "gr", "sl", "cl", "iy", ""
105 "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
106 "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
107 "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
108 "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
112 "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
116 "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
117 "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
118 "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
119 "during", "let", "toward", "among", ""
123 "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
124 "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
125 "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
126 "among", "those", "into", "whom", "having", "thence", ""
/* Switch table indexed by the *_SWITCH constants; the option parser writes
 * into it directly through the pswit+N pointers in the table below. */
129 gboolean pswit[SWITNO]; /* program switches */
/* Command-line option table passed to g_option_context_add_main_entries().
 * Every entry is a plain flag (G_OPTION_ARG_NONE) stored in pswit[].
 * The --relaxed, --line-end and --noecho flags are stored "as given" here
 * and have their sense inverted afterwards in parse_options(). */
131 static GOptionEntry options[]={
132 { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
133 "Ignore DP-specific markup", NULL },
134 { "noecho", 'e', 0, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
135 "Don't echo queried line", NULL },
136 { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
137 "Check single quotes", NULL },
138 { "typo", 't', 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
139 "Check common typos", NULL },
140 { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
141 "Require closure of quotes on every paragraph", NULL },
142 { "relaxed", 'x', 0, G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
143 "Disable paranoid querying of everything", NULL },
144 { "line-end", 'l', 0, G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
145 "Disable line end checking", NULL },
146 { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
147 "Overview: just show counts", NULL },
148 { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
149 "Output errors to stdout instead of stderr", NULL },
150 { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
151 "Echo header fields", NULL },
152 { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
153 "Ignore markup in < >", NULL },
154 { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
155 "Use file of user-defined typos", NULL },
156 { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
157 "Defaults for use on www upload", NULL },
158 { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
159 "Verbose - list everything", NULL },
/* Per-category query counters. main() prints them (and their sum) when the
 * overview switch is set. */
163 long cnt_dquot; /* for overview mode, count of doublequote queries */
164 long cnt_squot; /* for overview mode, count of singlequote queries */
165 long cnt_brack; /* for overview mode, count of brackets queries */
166 long cnt_bin; /* for overview mode, count of non-ASCII queries */
167 long cnt_odd; /* for overview mode, count of odd character queries */
168 long cnt_long; /* for overview mode, count of long line errors */
169 long cnt_short; /* for overview mode, count of short line queries */
170 long cnt_punct; /* for overview mode,
171 count of punctuation and spacing queries */
172 long cnt_dash; /* for overview mode, count of dash-related queries */
173 long cnt_word; /* for overview mode, count of word queries */
174 long cnt_html; /* for overview mode, count of html queries */
175 long cnt_lineend; /* for overview mode, count of line-end queries */
176 long cnt_spacend; /* count of lines with space at end */
/* linecnt doubles as the "current line number" printed in every query. */
177 long linecnt; /* count of total lines in the file */
178 long checked_linecnt; /* count of lines actually checked */
/* Prototypes for routines used by this file; not all of their definitions
 * are visible in this part of the source. */
180 void proghelp(GOptionContext *context);
181 void procfile(const char *);
185 gboolean mixdigit(const char *);
186 gchar *getaword(const char **);
187 char *flgets(char **,long);
188 void postprocess_for_HTML(char *);
189 char *linehasmarkup(char *);
190 char *losemarkup(char *);
191 gboolean tagcomp(const char *,const char *);
192 void loseentities(char *);
193 gboolean isroman(const char *);
194 void postprocess_for_DP(char *);
195 void print_as_windows_1252(const char *string);
196 void print_as_utf_8(const char *string);
/* Balanced trees; how they are populated is not visible here
 * (presumably queried words and queried periods -- TODO confirm). */
198 GTree *qword,*qperiod;
/*
 * parse_options: fill pswit[] from the command line using the options[]
 * table, then normalise the switches: several are stored inverted, and
 * --overview and --web override others.
 * argc and argv are passed straight through to g_option_context_parse().
 */
204 void parse_options(int *argc,char ***argv)
207 GOptionContext *context;
208 context=g_option_context_new(
209 "file - looks for errors in Project Gutenberg(TM) etexts");
210 g_option_context_add_main_entries(context,options,NULL);
/* On a parse failure, print GLib's message and a hint to use --help. */
211 if (!g_option_context_parse(context,argc,argv,&err))
213 g_printerr("Bookloupe: %s\n",err->message);
214 g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
217 /* Paranoid checking is turned OFF, not on, by its switch */
218 pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
/* With paranoid mode on, -t toggles typo checking off instead of on. */
219 if (pswit[PARANOID_SWITCH])
220 /* if running in paranoid mode, typo checks default to enabled */
221 pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
222 /* Line-end checking is turned OFF, not on, by its switch */
223 pswit[LINE_END_SWITCH]=!pswit[LINE_END_SWITCH];
224 /* Echoing is turned OFF, not on, by its switch */
225 pswit[ECHO_SWITCH]=!pswit[ECHO_SWITCH];
226 if (pswit[OVERVIEW_SWITCH])
227 /* just print summary; don't echo */
228 pswit[ECHO_SWITCH]=FALSE;
230 * Web uploads - for the moment, this is really just a placeholder
231 * until we decide what processing we really want to do on web uploads
/* --web replaces every other switch with a fixed profile. */
233 if (pswit[WEB_SWITCH])
235 /* specific override for web uploads */
236 pswit[ECHO_SWITCH]=TRUE;
237 pswit[SQUOTE_SWITCH]=FALSE;
238 pswit[TYPO_SWITCH]=TRUE;
239 pswit[QPARA_SWITCH]=FALSE;
240 pswit[PARANOID_SWITCH]=TRUE;
241 pswit[LINE_END_SWITCH]=FALSE;
242 pswit[OVERVIEW_SWITCH]=FALSE;
243 pswit[STDOUT_SWITCH]=FALSE;
244 pswit[HEADER_SWITCH]=TRUE;
245 pswit[VERBOSE_SWITCH]=FALSE;
246 pswit[MARKUP_SWITCH]=FALSE;
247 pswit[USERTYPO_SWITCH]=FALSE;
248 pswit[DP_SWITCH]=FALSE;
255 g_option_context_free(context);
261 * Read in the user-defined stealth scanno list.
/*
 * read_user_scannos: load the user's typo list into the usertypo tree.
 * Candidate files are tried in order: "bookloupe.typ", then the same name
 * under running_from, then "gutcheck.typ", then that name under
 * running_from. The next candidate is tried only when the previous attempt
 * failed with G_FILE_ERROR_NOENT.
 */
263 void read_user_scannos(void)
266 gchar *usertypo_file;
270 gchar *contents,*utf8,**lines;
271 usertypo_file=g_strdup("bookloupe.typ");
272 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
273 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
276 g_free(usertypo_file);
277 usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
278 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
280 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
283 g_free(usertypo_file);
284 usertypo_file=g_strdup("gutcheck.typ");
285 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
287 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
290 g_free(usertypo_file);
291 usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
292 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
/* None of the four candidates exists: say so on the normal output. */
294 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
296 g_free(usertypo_file);
297 g_print(" --> I couldn't find bookloupe.typ "
298 "-- proceeding without user typos.\n");
/* The error message for the file that failed is written to stderr. */
303 fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
304 g_free(usertypo_file);
/* Valid UTF-8 is normalised; anything else is converted from
 * Windows-1252. */
308 if (g_utf8_validate(contents,len,NULL))
309 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
311 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
/* Split on CR or LF, one entry per line. */
313 lines=g_strsplit_set(utf8,"\r\n",0);
/* Keys are compared with strcmp and freed with g_free by the tree. */
315 usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
/* Only lines whose first byte is greater than '!' are inserted, which
 * leaves out empty lines and lines starting with a space or control
 * character. */
316 for (i=0;lines[i];i++)
317 if (*(unsigned char *)lines[i]>'!')
318 g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
327 * Read an etext returning a newly allocated string containing the file
328 * contents or NULL on error.
/*
 * read_etext: read filename and obtain its contents as UTF-8.
 * Valid UTF-8 input is normalised and the UTF-8 print handler is installed.
 * Any other input is converted from Windows-1252 and the Windows-1252 print
 * handler is installed. Failures are reported through err.
 */
330 gchar *read_etext(const char *filename,GError **err)
332 GError *tmp_err=NULL;
333 gchar *contents,*utf8;
334 gsize len,bytes_read,bytes_written;
336 if (!g_file_get_contents(filename,&contents,&len,err))
338 if (g_utf8_validate(contents,len,NULL))
340 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
341 g_set_print_handler(print_as_utf_8);
343 SetConsoleOutputCP(CP_UTF8);
348 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,
349 &bytes_written,&tmp_err);
/* For an illegal input byte, walk the bytes already consumed to work out a
 * line/column position for the error message. */
350 if (g_error_matches(tmp_err,G_CONVERT_ERROR,
351 G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
354 for(i=0;i<bytes_read;i++)
355 if (contents[i]=='\n')
360 else if (contents[i]!='\r')
/* contents[bytes_read] is the byte at which the conversion stopped. */
362 g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
363 "Input conversion failed. Byte %d at line %d, column %d is not a "
364 "valid Windows-1252 character",
365 ((unsigned char *)contents)[bytes_read],line,col);
368 g_propagate_error(err,tmp_err);
369 g_set_print_handler(print_as_windows_1252);
371 SetConsoleOutputCP(1252);
/* atexit handler registered by main(): restores the console output code page
 * that main() saved in saved_cp. */
378 void cleanup_on_exit(void)
381 SetConsoleOutputCP(saved_cp);
/*
 * main: program entry point. Records the directory of argv[0] in
 * running_from, parses the options, and in overview mode prints the
 * per-category query counts and their total.
 */
385 int main(int argc,char **argv)
388 atexit(cleanup_on_exit);
389 saved_cp=GetConsoleOutputCP();
/* running_from is used by read_user_scannos() to locate the typo files. */
391 running_from=g_path_get_dirname(argv[0]);
392 parse_options(&argc,&argv);
393 if (pswit[USERTYPO_SWITCH])
395 fprintf(stderr,"bookloupe: Check and report on an e-text\n");
397 if (pswit[OVERVIEW_SWITCH])
/* The "head+foot" figure is the number of lines that were not checked. */
399 g_print(" Checked %ld lines of %ld (head+foot = %ld)\n\n",
400 checked_linecnt,linecnt,linecnt-checked_linecnt);
401 g_print(" --------------- Queries found --------------\n");
403 g_print(" Long lines: %14ld\n",cnt_long);
405 g_print(" Short lines: %14ld\n",cnt_short);
407 g_print(" Line-end problems: %14ld\n",cnt_lineend);
409 g_print(" Common typos: %14ld\n",cnt_word);
411 g_print(" Unmatched quotes: %14ld\n",cnt_dquot);
413 g_print(" Unmatched SingleQuotes: %14ld\n",cnt_squot);
415 g_print(" Unmatched brackets: %14ld\n",cnt_brack);
417 g_print(" Non-ASCII characters: %14ld\n",cnt_bin);
419 g_print(" Proofing characters: %14ld\n",cnt_odd);
421 g_print(" Punctuation & spacing queries: %14ld\n",cnt_punct);
423 g_print(" Non-standard dashes: %14ld\n",cnt_dash);
425 g_print(" Possible HTML tags: %14ld\n",cnt_html);
/* The total covers the twelve query counters; cnt_spacend is not included. */
427 g_print(" TOTAL QUERIES %14ld\n",
428 cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+
429 cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend);
431 g_free(running_from);
433 g_tree_unref(usertypo);
440 * Run a first pass - verify that it's a valid PG
441 * file, decide whether to report some things that
442 * occur many times in the text like long or short
443 * lines, non-standard dashes, etc.
/*
 * first_pass: scan the whole text once, line by line, and accumulate
 * file-wide statistics in a static first_pass_results structure.
 * The statistics are header/footer positions, character-class totals and
 * counts of lines showing various oddities.
 * etext is split on "\n" and trailing '\r' bytes are stripped from each line.
 * Presumably returns the address of the static results -- the return
 * statement is not visible in this part of the source.
 */
445 struct first_pass_results *first_pass(const char *etext)
447 gunichar laststart=CHAR_SPACE;
452 unsigned int lastlen=0,lastblen=0;
453 long spline=0,nspline=0;
454 static struct first_pass_results results={0};
456 lines=g_strsplit(etext,"\n",0);
457 for (j=0;lines[j];j++)
/* lbytes is the line length in bytes, llen the length in characters. */
459 lbytes=strlen(lines[j]);
460 while (lbytes>0 && lines[j][lbytes-1]=='\r')
461 lines[j][--lbytes]='\0';
462 llen=g_utf8_strlen(lines[j],lbytes);
/* Old-style header terminator: "*END ... SMALL PRINT" line. */
464 if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
465 (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
468 g_print(" --> Duplicate header?\n");
469 spline=linecnt+1; /* first line of non-header text, that is */
/* New-style header marker: "*** START ... PROJECT GUTENBERG". */
471 if (!strncmp(lines[j],"*** START",9) &&
472 strstr(lines[j],"PROJECT GUTENBERG"))
475 g_print(" --> Duplicate header?\n");
476 nspline=linecnt+1; /* first line of non-header text, that is */
/* A footer is only looked for once a header has been seen: a line in which
 * "end" appears before "project gutenberg", compared in lower case. */
478 if (spline || nspline)
480 lc_line=g_utf8_strdown(lines[j],lbytes);
481 if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
483 if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
485 if (results.footerline)
487 /* it's an old-form header - we can detect duplicates */
489 g_print(" --> Duplicate footer?\n");
492 results.footerline=linecnt;
498 results.firstline=spline;
500 results.firstline=nspline; /* override with new */
501 if (results.footerline)
502 continue; /* don't count the boilerplate in the footer */
503 results.totlen+=llen;
/* Per-character classification of the line. */
504 for (s=lines[j];*s;s=g_utf8_next_char(s))
506 if (g_utf8_get_char(s)>127)
508 if (g_unichar_isalpha(g_utf8_get_char(s)))
/* A double quote directly after a letter counts as an unpunctuated
 * endquote. The preceding character is a full Unicode code point, so it is
 * tested with g_unichar_isalpha(); isalpha() is undefined for values that
 * do not fit in an unsigned char. */
510 if (s>lines[j] && g_utf8_get_char(s)==CHAR_DQUOTE &&
511 g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
512 results.endquote_count++;
/* Same short-line condition as check_for_short_line(), with a threshold of
 * 2 instead of 1. */
514 if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
515 lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
518 g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
520 if (strstr(lines[j],".,"))
522 /* only count ast lines for ignoring purposes where there is */
523 /* locase text on the line */
524 if (strchr(lines[j],'*'))
526 for (s=lines[j];*s;s=g_utf8_next_char(s))
527 if (g_unichar_islower(g_utf8_get_char(s)))
532 if (strchr(lines[j],'/'))
533 results.fslashline++;
/* Walk back over trailing white space, then test for a single hyphen (not
 * the second half of "--") at the end of the line. */
536 for (s=g_utf8_prev_char(lines[j]+lbytes);
537 s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;
538 s=g_utf8_prev_char(s))
540 if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
541 g_utf8_get_char(g_utf8_prev_char(s))!='-')
544 if (llen>LONGEST_PG_LINE)
546 if (llen>WAY_TOO_LONG)
547 results.verylongline++;
/* A '<' and a '>' on the same line hint at markup; i is the span from the
 * first '<' to the first '>' inclusive. */
548 if (strchr(lines[j],'<') && strchr(lines[j],'>'))
550 i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
553 if (strstr(lines[j],"<i>"))
554 results.htmcount+=4; /* bonus marks! */
556 /* Check for spaced em-dashes */
/* The search starts at the second character, so s[-1] is always inside the
 * line. */
557 if (lines[j][0] && (s=strstr(g_utf8_next_char(lines[j]),"--")))
560 if (s[-1]==CHAR_SPACE || s[2]==CHAR_SPACE)
561 results.space_emdash++;
562 if (s[-1]==CHAR_SPACE && s[2]==CHAR_SPACE)
563 /* count of em-dashes with spaces both sides */
564 results.non_PG_space_emdash++;
565 if (s[-1]!=CHAR_SPACE && s[2]!=CHAR_SPACE)
566 /* count of PG-type em-dashes with no spaces */
567 results.PG_space_emdash++;
/* Marker words feed the Dutch/French language guesses. */
572 if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))
573 results.Dutchcount++;
574 if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))
575 results.Frenchcount++;
576 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
577 results.standalone_digit++;
580 /* Check for spaced dashes */
581 if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
/* Remember the first byte of this line for the next short-line test. */
585 laststart=lines[j][0];
594 * Make some snap decisions based on the first pass results.
/*
 * report_first_pass: turn the first-pass statistics into a static warnings
 * structure, printing a note whenever a whole category of query is
 * suppressed because it occurs too often.
 * Also switches markup mode on when the text looks like HTML, and clears
 * results->firstline when header and footer are less than 100 lines apart.
 * Presumably returns the address of the static warnings -- the return
 * statement is not visible in this part of the source.
 */
596 struct warnings *report_first_pass(struct first_pass_results *results)
598 static struct warnings warnings={0};
600 g_print(" --> %ld lines in this file have white space at end\n",
603 if (results->dotcomma>5)
606 g_print(" --> %ld lines in this file contain '.,'. "
607 "Not reporting them.\n",results->dotcomma);
610 * If more than 50 lines, or one-tenth, are short,
611 * don't bother reporting them.
613 warnings.shortline=1;
614 if (results->shortline>50 || results->shortline*10>linecnt)
616 warnings.shortline=0;
617 g_print(" --> %ld lines in this file are short. "
618 "Not reporting short lines.\n",results->shortline);
621 * If more than 50 lines, or one-tenth, are long,
622 * don't bother reporting them.
625 if (results->longline>50 || results->longline*10>linecnt)
628 g_print(" --> %ld lines in this file are long. "
629 "Not reporting long lines.\n",results->longline);
631 /* If more than 10 lines contain asterisks, don't bother reporting them. */
633 if (results->astline>10)
636 g_print(" --> %ld lines in this file contain asterisks. "
637 "Not reporting them.\n",results->astline);
640 * If more than 10 lines contain forward slashes,
641 * don't bother reporting them.
644 if (results->fslashline>10)
647 g_print(" --> %ld lines in this file contain forward slashes. "
648 "Not reporting them.\n",results->fslashline);
651 * If more than 20 lines contain unpunctuated endquotes,
652 * don't bother reporting them.
655 if (results->endquote_count>20)
658 g_print(" --> %ld lines in this file contain unpunctuated endquotes. "
659 "Not reporting them.\n",results->endquote_count);
/* NOTE(review): the comment below says 15, but the test uses a threshold
 * of 10. */
662 * If more than 15 lines contain standalone digits,
663 * don't bother reporting them.
666 if (results->standalone_digit>10)
669 g_print(" --> %ld lines in this file contain standalone 0s and 1s. "
670 "Not reporting them.\n",results->standalone_digit);
673 * If more than 20 lines contain hyphens at end,
674 * don't bother reporting them.
677 if (results->hyphens>20)
680 g_print(" --> %ld lines in this file have hyphens at end. "
681 "Not reporting them.\n",results->hyphens);
/* Side effect: markup mode is forced on for the rest of the run. */
683 if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
685 g_print(" --> Looks like this is HTML. Switching HTML mode ON.\n");
686 pswit[MARKUP_SWITCH]=1;
688 if (results->verylongline>0)
689 g_print(" --> %ld lines in this file are VERY long!\n",
690 results->verylongline);
692 * If there are more non-PG spaced dashes than PG em-dashes,
693 * assume it's deliberate.
694 * Current PG guidelines say don't use them, but older texts do,
695 * and some people insist on them whatever the guidelines say.
698 if (results->spacedash+results->non_PG_space_emdash>
699 results->PG_space_emdash)
702 g_print(" --> There are %ld spaced dashes and em-dashes. "
703 "Not reporting them.\n",
704 results->spacedash+results->non_PG_space_emdash);
706 /* If more than a quarter of characters are hi-bit, bug out. */
708 if (results->binlen*4>results->totlen)
710 g_print(" --> This file does not appear to be ASCII. "
711 "Terminating. Best of luck with it!\n");
/* Likewise when fewer than a quarter of the characters are alphabetic. */
714 if (results->alphalen*4<results->totlen)
716 g_print(" --> This file does not appear to be text. "
717 "Terminating. Best of luck with it!\n");
/* More than 1% hi-bit characters, or more than 100 of them in total. */
720 if (results->binlen*100>results->totlen || results->binlen>100)
722 g_print(" --> There are a lot of foreign letters here. "
723 "Not reporting them.\n");
726 warnings.isDutch=FALSE;
727 if (results->Dutchcount>50)
729 warnings.isDutch=TRUE;
730 g_print(" --> This looks like Dutch - "
731 "switching off dashes and warnings for 's Middags case.\n");
733 warnings.isFrench=FALSE;
734 if (results->Frenchcount>50)
736 warnings.isFrench=TRUE;
737 g_print(" --> This looks like French - "
738 "switching off some doublepunct.\n");
740 if (results->firstline && results->footerline)
741 g_print(" The PG header and footer appear to be already on.\n");
744 if (results->firstline)
745 g_print(" The PG header is on - no footer.\n");
746 if (results->footerline)
747 g_print(" The PG footer is on - no header.\n");
/* Verbose mode switches warnings back on; only the shortline assignment is
 * visible in this part of the source. */
750 if (pswit[VERBOSE_SWITCH])
753 warnings.shortline=1;
762 g_print(" *** Verbose output is ON -- you asked for it! ***\n");
764 if (warnings.isDutch)
/* Header and footer fewer than 100 lines apart: the markers are not
 * trusted, so firstline is cleared. */
766 if (results->footerline>0 && results->firstline>0 &&
767 results->footerline>results->firstline &&
768 results->footerline-results->firstline<100)
770 g_print(" --> I don't really know where this text starts. \n");
771 g_print(" There are no reference points.\n");
772 g_print(" I'm going to have to report the header and footer "
774 results->firstline=0;
782 * Look along the line, accumulate the count of quotes, and see
783 * if this is an empty line - i.e. a line with nothing on it
785 * If line has just spaces, period, * and/or - on it, don't
786 * count it, since empty lines with asterisks or dashes to
787 * separate sections are common.
789 * Returns: TRUE if the line is empty.
/*
 * analyse_quotes: walk aline one UTF-8 character at a time, updating the
 * quote, bracket and underscore tallies in counters, and work out whether
 * the line counts as empty.
 * A single quote between two letters is treated as an apostrophe and left
 * uncounted. Other single quotes are classed as opening or closing from
 * the characters around them.
 */
791 gboolean analyse_quotes(const char *aline,struct counters *counters)
794 /* assume the line is empty until proven otherwise */
795 gboolean isemptyline=TRUE;
796 const char *s=aline,*sprev,*snext;
801 snext=g_utf8_next_char(s);
802 c=g_utf8_get_char(s);
805 if (CHAR_IS_SQUOTE(c))
810 * At start of line, it can only be an openquote.
811 * Hardcode a very common exception!
813 if (!g_str_has_prefix(snext,"tis") &&
814 !g_str_has_prefix(snext,"Tis"))
815 increment_matching(counters,c,TRUE);
817 else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
818 g_unichar_isalpha(g_utf8_get_char(snext)))
819 /* Do nothing! it's definitely an apostrophe, not a quote */
821 /* it's outside a word - let's check it out */
822 else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||
823 g_unichar_isalpha(g_utf8_get_char(snext)))
825 /* it damwell better BE an openquote */
826 if (!g_str_has_prefix(snext,"tis") &&
827 !g_str_has_prefix(snext,"Tis"))
828 /* hardcode a very common exception! */
829 increment_matching(counters,c,TRUE);
833 /* now - is it a closequote? */
834 guessquote=0; /* accumulate clues */
835 if (g_unichar_isalpha(g_utf8_get_char(sprev)))
837 /* it follows a letter - could be either */
839 if (g_utf8_get_char(sprev)=='s')
841 /* looks like a plural apostrophe */
843 if (g_utf8_get_char(snext)==CHAR_SPACE)
848 /* it doesn't have a letter either side */
/* NOTE(review): strchr() is handed a gunichar here, so code points above
 * 255 are narrowed before the search. check_for_starting_punctuation()
 * uses g_utf8_strchr() for the same kind of test -- confirm whether this
 * should too. */
849 else if (strchr(".?!,;:",g_utf8_get_char(sprev)) &&
850 strchr(".?!,;: ",g_utf8_get_char(snext)))
851 guessquote+=8; /* looks like a closequote */
854 if (matching_difference(counters,CHAR_SQUOTE)>0)
856 * Give it the benefit of some doubt,
857 * if a squote is already open.
863 increment_matching(counters,c,FALSE);
866 if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
868 isemptyline=FALSE; /* ignore lines like * * * as spacers */
869 if (c==CHAR_UNDERSCORE)
870 counters->c_unders++;
/* A '[' that starts the line with "[Illustration:" is tallied under
 * COUNTER_ILLUSTRATION, but only when no illustration or square bracket is
 * currently open. */
871 if (c==CHAR_OPEN_SBRACK)
873 if (!matching_difference(counters,COUNTER_ILLUSTRATION) &&
874 !matching_difference(counters,c) && s==aline &&
875 g_str_has_prefix(s,"[Illustration:"))
876 increment_matching(counters,COUNTER_ILLUSTRATION,TRUE);
878 increment_matching(counters,c,TRUE);
880 else if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK)
881 increment_matching(counters,c,TRUE);
/* A ']' can close an illustration only as the last character of the
 * line. */
882 if (c==CHAR_CLOSE_SBRACK)
884 if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) &&
885 !matching_difference(counters,c) && !*snext)
886 increment_matching(counters,COUNTER_ILLUSTRATION,FALSE);
888 increment_matching(counters,c,FALSE);
890 else if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK)
891 increment_matching(counters,c,FALSE);
899 * check_for_control_characters:
901 * Check for invalid or questionable characters in the line
902 * Anything above 127 is invalid for plain ASCII, and
903 * non-printable control characters should also be flagged.
904 * Tabs should generally not be there.
/*
 * check_for_control_characters: report a character below CHAR_SPACE that is
 * not LF, CR or TAB. The line is echoed when echoing is on, and the query is
 * printed unless overview mode is on.
 */
906 void check_for_control_characters(const char *aline)
910 for (s=aline;*s;s=g_utf8_next_char(s))
912 c=g_utf8_get_char(s);
913 if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
915 if (pswit[ECHO_SWITCH])
916 g_print("\n%s\n",aline);
917 if (!pswit[OVERVIEW_SWITCH])
/* g_utf8_pointer_to_offset() takes the start of the string first and the
 * position second. The arguments were previously reversed, which gave a
 * negative offset and therefore a wrong column. */
918 g_print(" Line %ld column %ld - Control character %u\n",
919 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
927 * check_for_odd_characters:
929 * Check for binary and other odd characters.
/*
 * check_for_odd_characters: scan aline for non-ASCII characters, tabs,
 * tildes, carats, forward slashes and asterisks.
 * Each category is guarded by its own flag so that it is queried at most
 * once per line. Forward slashes are reported only when warnings->fslash is
 * set. Asterisks need paranoid mode, warnings->ast and a non-empty line.
 */
931 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
932 gboolean isemptyline)
934 /* Don't repeat multiple warnings on one line. */
935 gboolean eNon_A=FALSE,eTab=FALSE,eTilde=FALSE;
936 gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
939 for (s=aline;*s;s=g_utf8_next_char(s))
941 c=g_utf8_get_char(s);
/* Control characters other than tab and newline, or anything above 127. */
942 if (!eNon_A && (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
944 if (pswit[ECHO_SWITCH])
945 g_print("\n%s\n",aline);
946 if (!pswit[OVERVIEW_SWITCH])
/* Code points 128-159 and those above 255 get the "Non-ISO-8859" wording;
 * the rest get "Non-ASCII". */
947 if (c>127 && c<160 || c>255)
948 g_print(" Line %ld column %ld - "
949 "Non-ISO-8859 character %u\n",
950 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
952 g_print(" Line %ld column %ld - "
953 "Non-ASCII character %u\n",
954 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
959 if (!eTab && c==CHAR_TAB)
961 if (pswit[ECHO_SWITCH])
962 g_print("\n%s\n",aline);
963 if (!pswit[OVERVIEW_SWITCH])
964 g_print(" Line %ld column %ld - Tab character?\n",
965 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
970 if (!eTilde && c==CHAR_TILDE)
973 * Often used by OCR software to indicate an
974 * unrecognizable character.
976 if (pswit[ECHO_SWITCH])
977 g_print("\n%s\n",aline);
978 if (!pswit[OVERVIEW_SWITCH])
979 g_print(" Line %ld column %ld - Tilde character?\n",
980 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
985 if (!eCarat && c==CHAR_CARAT)
987 if (pswit[ECHO_SWITCH])
988 g_print("\n%s\n",aline);
989 if (!pswit[OVERVIEW_SWITCH])
990 g_print(" Line %ld column %ld - Carat character?\n",
991 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
996 if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
998 if (pswit[ECHO_SWITCH])
999 g_print("\n%s\n",aline);
1000 if (!pswit[OVERVIEW_SWITCH])
1001 g_print(" Line %ld column %ld - Forward slash?\n",
1002 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1008 * Report asterisks only in paranoid mode,
1009 * since they're often deliberate.
1011 if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
1014 if (pswit[ECHO_SWITCH])
1015 g_print("\n%s\n",aline);
1016 if (!pswit[OVERVIEW_SWITCH])
1017 g_print(" Line %ld column %ld - Asterisk?\n",
1018 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1027 * check_for_long_line:
1029 * Check for line too long.
/*
 * check_for_long_line: query a line with more than LONGEST_PG_LINE
 * characters. The length is counted in characters, not bytes, and is
 * printed as both the column and the length.
 */
1031 void check_for_long_line(const char *aline)
1033 if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
1035 if (pswit[ECHO_SWITCH])
1036 g_print("\n%s\n",aline);
1037 if (!pswit[OVERVIEW_SWITCH])
1038 g_print(" Line %ld column %ld - Long line %ld\n",
1039 linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
1046 * check_for_short_line:
1048 * Check for line too short.
1050 * This one is a bit trickier to implement: we don't want to
1051 * flag the last line of a paragraph for being short, so we
1052 * have to wait until we know that our current line is a
1053 * "normal" line, then report the _previous_ line if it was too
1054 * short. We also don't want to report indented lines like
1055 * chapter heads or formatted quotations. We therefore keep
1056 * last->len as the length of the last line examined, and
1057 * last->blen as the length of the last but one, and try to
1058 * suppress unnecessary warnings by checking that both were of
1059 * "normal" length. We keep the first character of the last
1060 * line in last->start, and if it was a space, we assume that
1061 * the formatting is deliberate. I can't figure out a way to
1062 * distinguish something like a quoted verse left-aligned or
1063 * the header or footer of a letter from a paragraph of short
1064 * lines - maybe if I examined the whole paragraph, and if the
1065 * para has less than, say, 8 lines and if all lines are short,
1066 * then just assume it's OK? Need to look at some texts to see
1067 * how often a formula like this would get the right result.
/*
 * check_for_short_line: query the PREVIOUS line as too short. This happens
 * when the current line has more than one character, the previous line
 * (last->len) is shorter than SHORTEST_PG_LINE, the line before that
 * (last->blen) is longer than SHORTEST_PG_LINE, and the previous line did
 * not start with a space.
 * prevline is not a parameter: it is declared outside this function and is
 * presumably the text of the previous line -- TODO confirm.
 */
1069 void check_for_short_line(const char *aline,const struct line_properties *last)
1071 if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
1072 last->len<SHORTEST_PG_LINE && last->blen>1 &&
1073 last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
1075 if (pswit[ECHO_SWITCH])
1076 g_print("\n%s\n",prevline);
/* The query is reported against line linecnt-1, the previous line. */
1077 if (!pswit[OVERVIEW_SWITCH])
1078 g_print(" Line %ld column %ld - Short line %ld?\n",
1079 linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
1086 * check_for_starting_punctuation:
1088 * Look for punctuation other than full ellipses at start of line.
/*
 * check_for_starting_punctuation: query a non-empty line whose first
 * character is one of . ? ! , ; : unless the line begins with the spaced
 * ellipsis ". . .".
 */
1090 void check_for_starting_punctuation(const char *aline)
1092 if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
1093 !g_str_has_prefix(aline,". . ."))
1095 if (pswit[ECHO_SWITCH])
1096 g_print("\n%s\n",aline);
1097 if (!pswit[OVERVIEW_SWITCH])
1098 g_print(" Line %ld column 1 - Begins with punctuation?\n",
1106 * check_for_spaced_emdash:
1108 * Check for spaced em-dashes.
1110 * We must check _all_ occurrences of "--" on the line
1111 * hence the loop - even if the first double-dash is OK
1112 * there may be another that's wrong later on.
/*
 * check_for_spaced_emdash: for every "--" on the line, query it when a
 * space comes immediately before it or immediately after it.
 */
1114 void check_for_spaced_emdash(const char *aline)
1116 const char *s,*t,*next;
/* t is the current "--"; the next search resumes just past it. */
1117 for (s=aline;t=strstr(s,"--");s=next)
1119 next=g_utf8_next_char(g_utf8_next_char(t));
/* The t>aline guard keeps the look-behind inside the line. */
1120 if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
1121 g_utf8_get_char(next)==CHAR_SPACE)
1123 if (pswit[ECHO_SWITCH])
1124 g_print("\n%s\n",aline);
1125 if (!pswit[OVERVIEW_SWITCH])
1126 g_print(" Line %ld column %ld - Spaced em-dash?\n",
1127 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1135 * check_for_spaced_dash:
1137 * Check for spaced dashes.
/*
 * check_for_spaced_dash: query a single dash with a space beside it.
 * Only the first " -" on the line is examined. "- " is looked for only when
 * the line has no " -" at all. A dash that is part of "--" is not queried.
 */
1139 void check_for_spaced_dash(const char *aline)
1142 if ((s=strstr(aline," -")))
/* Skip the case where the character after " -" is another dash. */
1144 if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
1146 if (pswit[ECHO_SWITCH])
1147 g_print("\n%s\n",aline);
1148 if (!pswit[OVERVIEW_SWITCH])
1149 g_print(" Line %ld column %ld - Spaced dash?\n",
1150 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1155 else if ((s=strstr(aline,"- ")))
/* Skip the case where the character before "- " is another dash. */
1157 if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
1159 if (pswit[ECHO_SWITCH])
1160 g_print("\n%s\n",aline);
1161 if (!pswit[OVERVIEW_SWITCH])
1162 g_print(" Line %ld column %ld - Spaced dash?\n",
1163 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1171 * check_for_unmarked_paragraphs:
1173 * Check for unmarked paragraphs indicated by separate speakers.
1175 * May well be false positive:
1176 * "Bravo!" "Wonderful!" called the crowd.
1177 * but useful all the same.
/*
 * aline: the line being checked (UTF-8).
 *
 * Searches with strstr for a double quote, a space and another double
 * quote, and prints a query at the 1-based character column of the first
 * of those quotes.
 */
1179 void check_for_unmarked_paragraphs(const char *aline)
1182 s=strstr(aline,"\" \"");
1184 s=strstr(aline,"\" \"");
1187 if (pswit[ECHO_SWITCH])
1188 g_print("\n%s\n",aline);
1189 if (!pswit[OVERVIEW_SWITCH])
1190 g_print(" Line %ld column %ld - "
1191 "Query missing paragraph break?\n",
1192 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1199 * check_for_jeebies:
1201 * Check for "to he" and other easy h/b errors.
1203 * This is a very inadequate effort on the h/b problem,
1204 * but the phrase "to he" is always an error, whereas "to
1205 * be" is quite common.
1206 * Similarly, '"Quiet!", be said.' is a non-be error
1207 * "to he" is _not_ always an error!:
1208 * "Where they went to he couldn't say."
1209 * Another false positive:
1210 * What would "Cinderella" be without the . . .
1211 * and another: "If he wants to he can see for himself."
/*
 * aline: the line being checked (UTF-8).
 *
 * Three families of literal phrases are looked up with strstr, so matching
 * is byte-wise and case-sensitive: he/be confusions, had/bad confusions and
 * hut/but confusions. Each family prints its own query, giving the 1-based
 * character column of the start of the matched phrase (that is, of the
 * leading space or punctuation that is part of the pattern).
 */
1213 void check_for_jeebies(const char *aline)
/* Family 1: "be" where "he" was probably meant, plus "to he". */
1216 s=strstr(aline," be could ");
1218 s=strstr(aline," be would ");
1220 s=strstr(aline," was be ");
1222 s=strstr(aline," be is ");
1224 s=strstr(aline," is be ");
1226 s=strstr(aline,"\", be ");
1228 s=strstr(aline,"\" be ");
1230 s=strstr(aline,"\" be ");
1232 s=strstr(aline," to he ");
1235 if (pswit[ECHO_SWITCH])
1236 g_print("\n%s\n",aline);
1237 if (!pswit[OVERVIEW_SWITCH])
1238 g_print(" Line %ld column %ld - Query he/be error?\n",
1239 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Family 2: "had" and "bad" swapped after an article or pronoun. */
1243 s=strstr(aline," the had ");
1245 s=strstr(aline," a had ");
1247 s=strstr(aline," they bad ");
1249 s=strstr(aline," she bad ");
1251 s=strstr(aline," he bad ");
1253 s=strstr(aline," you bad ");
1255 s=strstr(aline," i bad ");
1258 if (pswit[ECHO_SWITCH])
1259 g_print("\n%s\n",aline);
1260 if (!pswit[OVERVIEW_SWITCH])
1261 g_print(" Line %ld column %ld - Query had/bad error?\n",
1262 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Family 3: "hut" directly after a semicolon or comma. */
1266 s=strstr(aline,"; hut ");
1268 s=strstr(aline,", hut ");
1271 if (pswit[ECHO_SWITCH])
1272 g_print("\n%s\n",aline);
1273 if (!pswit[OVERVIEW_SWITCH])
1274 g_print(" Line %ld column %ld - Query hut/but error?\n",
1275 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1282 * check_for_mta_from:
1284 * Special case - angled bracket in front of "From" placed there by an
1285 * MTA when sending an e-mail.
/*
 * aline: the line being checked (UTF-8).
 *
 * Prints a query at the 1-based character column of the first ">From"
 * located by strstr (the match is case-sensitive and may be anywhere on
 * the line).
 */
1287 void check_for_mta_from(const char *aline)
1290 s=strstr(aline,">From");
1293 if (pswit[ECHO_SWITCH])
1294 g_print("\n%s\n",aline);
1295 if (!pswit[OVERVIEW_SWITCH])
1296 g_print(" Line %ld column %ld - "
1297 "Query angled bracket with From\n",
1298 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1305 * check_for_orphan_character:
1307 * Check for a single character line -
1308 * often an overflow from bad wrapping.
/*
 * aline: the line being checked (UTF-8).
 *
 * A line that consists of exactly one character is queried at column 1,
 * except when that character is I, V, X, L or a digit.
 */
1310 void check_for_orphan_character(const char *aline)
1313 c=g_utf8_get_char(aline);
/* True only when the line holds one character and nothing after it. */
1314 if (c && !*g_utf8_next_char(aline))
1316 if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
1317 ; /* Nothing - ignore numerals alone on a line. */
1320 if (pswit[ECHO_SWITCH])
1321 g_print("\n%s\n",aline);
1322 if (!pswit[OVERVIEW_SWITCH])
1323 g_print(" Line %ld column 1 - Query single character line\n",
1332 * check_for_pling_scanno:
1334 * Check for I" - often should be !
/*
 * aline: the line being checked (UTF-8).
 *
 * Finds the first sequence of space, capital I and double quote with
 * strstr and prints a query for it.
 * NOTE(review): the column printed is the raw character offset of the
 * match, without the +1 that most other checks in this file add - confirm
 * that this is intended.
 */
1336 void check_for_pling_scanno(const char *aline)
1339 s=strstr(aline," I\"");
1342 if (pswit[ECHO_SWITCH])
1343 g_print("\n%s\n",aline);
1344 if (!pswit[OVERVIEW_SWITCH])
1345 g_print(" Line %ld column %ld - Query I=exclamation mark?\n",
1346 linecnt,g_utf8_pointer_to_offset(aline,s));
1353 * check_for_extra_period:
1355 * Check for period without a capital letter. Cut-down from gutspell.
1356 * Only works when it happens on a single line.
/*
 * aline:    the line being checked (UTF-8).
 * warnings: flags produced by the first pass; isDutch is read here.
 *
 * Guarded by the paranoid switch. Visits every ". " on the line, finds the
 * next alphanumeric character after it and, when that is a lower-case
 * letter, examines the word in front of the period. That word is compared
 * with abbrev[], and tested for a leading digit, for being a single
 * character, for being a Roman numeral and for containing a vowel, before
 * an "Extra period?" query is printed at the 1-based character column of
 * the period. qperiod records words that have already been queried.
 */
1358 void check_for_extra_period(const char *aline,const struct warnings *warnings)
1360 const char *s,*t,*s1,*sprev;
1365 gunichar c,nc,pc,*decomposition;
1366 if (pswit[PARANOID_SWITCH])
/* Visit each period that is followed by a space. */
1368 for (t=aline;t=strstr(t,". ");)
1372 t=g_utf8_next_char(t);
1373 /* start of line punctuation is handled elsewhere */
1376 if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
1378 t=g_utf8_next_char(t);
1381 if (warnings->isDutch)
1383 /* For Frank & Jeroen -- 's Middags case */
1384 gunichar c2,c3,c4,c5;
/* Characters 2 to 5 places beyond t: apostrophe, lower, space, upper. */
1385 c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
1386 c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
1387 c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
1388 c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
1389 if (CHAR_IS_APOSTROPHE(c2) &&
1390 g_unichar_islower(c3) && c4==CHAR_SPACE &&
1391 g_unichar_isupper(c5))
1393 t=g_utf8_next_char(t);
/* Step over the period and the space, then over any non-alphanumerics. */
1397 s1=g_utf8_next_char(g_utf8_next_char(t));
1398 while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
1399 !isdigit(g_utf8_get_char(s1)))
1400 s1=g_utf8_next_char(s1);
1401 if (g_unichar_islower(g_utf8_get_char(s1)))
1403 /* we have something to investigate */
1405 /* so let's go back and find out */
1406 nc=g_utf8_get_char(t);
1407 s1=g_utf8_prev_char(t);
1408 c=g_utf8_get_char(s1);
1409 sprev=g_utf8_prev_char(s1);
1410 pc=g_utf8_get_char(sprev);
1412 (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||
1413 g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&
1414 g_unichar_isalpha(nc)))
1419 sprev=g_utf8_prev_char(s1);
1420 pc=g_utf8_get_char(sprev);
1422 s1=g_utf8_next_char(s1);
1425 testword=g_strndup(s1,s-s1);
1427 testword=g_strdup(s1);
/* abbrev[] is terminated by an empty string. */
1428 for (i=0;*abbrev[i];i++)
1429 if (!strcmp(testword,abbrev[i]))
1431 if (g_unichar_isdigit(g_utf8_get_char(testword)))
1433 if (!*g_utf8_next_char(testword))
1435 if (isroman(testword))
/* Test the first decomposed code point so accented vowels count too. */
1440 for (s=testword;*s;s=g_utf8_next_char(s))
1442 decomposition=g_unicode_canonical_decomposition(
1443 g_utf8_get_char(s),&len);
1444 if (g_utf8_strchr("aeiou",-1,decomposition[0]))
1446 g_free(decomposition);
1450 (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
/* The tree keeps its own copy of the word as the key. */
1452 g_tree_insert(qperiod,g_strdup(testword),
1453 GINT_TO_POINTER(1));
1454 if (pswit[ECHO_SWITCH])
1455 g_print("\n%s\n",aline);
1456 if (!pswit[OVERVIEW_SWITCH])
1457 g_print(" Line %ld column %ld - Extra period?\n",
1458 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1464 t=g_utf8_next_char(t);
1470 * check_for_following_punctuation:
1472 * Check for words usually not followed by punctuation.
/*
 * aline: the line being checked (UTF-8).
 *
 * Guarded by the typo switch. The word under test is lower-cased with
 * g_utf8_strdown and compared with two tables:
 *   nocomma[]  - queried when the character at s is , ; or :
 *   noperiod[] - queried when the character at s is . or !
 * The column printed is the 1-based character position of s.
 */
1474 void check_for_following_punctuation(const char *aline)
1477 const char *s,*wordstart;
1480 if (pswit[TYPO_SWITCH])
1491 inword=g_utf8_strdown(t,-1);
/* Both tables are terminated by an empty string. */
1493 for (i=0;*nocomma[i];i++)
1494 if (!strcmp(inword,nocomma[i]))
1496 c=g_utf8_get_char(s);
1497 if (c==',' || c==';' || c==':')
1499 if (pswit[ECHO_SWITCH])
1500 g_print("\n%s\n",aline);
1501 if (!pswit[OVERVIEW_SWITCH])
1502 g_print(" Line %ld column %ld - "
1503 "Query punctuation after %s?\n",
1504 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
1510 for (i=0;*noperiod[i];i++)
1511 if (!strcmp(inword,noperiod[i]))
1513 c=g_utf8_get_char(s);
1514 if (c=='.' || c=='!')
1516 if (pswit[ECHO_SWITCH])
1517 g_print("\n%s\n",aline);
1518 if (!pswit[OVERVIEW_SWITCH])
1519 g_print(" Line %ld column %ld - "
1520 "Query punctuation after %s?\n",
1521 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
1535 * Check for commonly mistyped words,
1536 * and digits like 0 for O in a word.
/*
 * aline:    the line being checked (UTF-8).
 * warnings: flags produced by the first pass; the digit flag is read here.
 *
 * Words are taken from the line with getaword(). A word for which
 * mixdigit() is true is queried as "digit in" that word. With the typo or
 * user-typo switch on, a case-folded copy of the word is then examined:
 * prefixes from nostart[], suffixes from noend[], a fixed set of unlikely
 * letter groups, words lacking vowels or consonants, the typo[] table and
 * certain single letters are tested, with okword[] and isroman() also
 * consulted. qword maps each queried word to an int allocated here.
 * The usertypo tree, when present, is looked up separately, and in
 * paranoid mode with warnings->digit set a lone 0 or 1 is queried too.
 * NOTE(review): the user-typo and standalone-digit queries print the word
 * start offset plus 2 while the other queries here print it plus 1 -
 * confirm that the difference is intended.
 */
1538 void check_for_typos(const char *aline,struct warnings *warnings)
1540 const char *s,*t,*nt,*wordstart;
1542 gunichar *decomposition;
1544 int i,vowel,consonant,*dupcnt;
1545 gboolean isdup,istypo,alower;
1548 gsize decomposition_len;
/* getaword() is given &s so that it can move the scan position. */
1552 inword=getaword(&s);
1556 continue; /* don't bother with empty lines */
1558 if (mixdigit(inword))
1560 if (pswit[ECHO_SWITCH])
1561 g_print("\n%s\n",aline);
1562 if (!pswit[OVERVIEW_SWITCH])
1563 g_print(" Line %ld column %ld - Query digit in %s\n",
1564 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
1569 * Put the word through a series of tests for likely typos and OCR
1572 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
1576 for (t=inword;*t;t=g_utf8_next_char(t))
1578 c=g_utf8_get_char(t);
1579 nt=g_utf8_next_char(t);
1580 /* lowercase for testing */
1581 if (g_unichar_islower(c))
1583 if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
1586 * We have an uppercase mid-word. However, there are
1588 * Mac and Mc like McGill
1589 * French contractions like l'Abbe
1591 offset=g_utf8_pointer_to_offset(inword,t);
1593 pc=g_utf8_get_char(g_utf8_prev_char(t));
1596 if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
1597 offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
1598 g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
1599 CHAR_IS_APOSTROPHE(pc))
1605 testword=g_utf8_casefold(inword,-1);
1607 if (pswit[TYPO_SWITCH])
1610 * Check for certain unlikely two-letter combinations at word
1613 len=g_utf8_strlen(testword,-1);
1616 for (i=0;*nostart[i];i++)
1617 if (g_str_has_prefix(testword,nostart[i]))
1619 for (i=0;*noend[i];i++)
1620 if (g_str_has_suffix(testword,noend[i]))
1623 /* ght is common, gbt never. Like that. */
1624 if (strstr(testword,"cb"))
1626 if (strstr(testword,"gbt"))
1628 if (strstr(testword,"pbt"))
1630 if (strstr(testword,"tbs"))
1632 if (strstr(testword,"mrn"))
1634 if (strstr(testword,"ahle"))
1636 if (strstr(testword,"ihle"))
1639 * "TBE" does happen - like HEARTBEAT - but uncommon.
1640 * Also "TBI" - frostbite, outbid - but uncommon.
1641 * Similarly "ii" like Hawaii, or Pompeii, and in Roman
1642 * numerals, but "ii" is a common scanno.
1644 if (strstr(testword,"tbi"))
1646 if (strstr(testword,"tbe"))
1648 if (strstr(testword,"ii"))
1651 * Check for no vowels or no consonants.
1652 * If none, flag a typo.
1654 if (!istypo && len>1)
1657 for (t=testword;*t;t=g_utf8_next_char(t))
1659 c=g_utf8_get_char(t);
1661 g_unicode_canonical_decomposition(c,&decomposition_len);
1662 if (c=='y' || g_unichar_isdigit(c))
1664 /* Yah, this is loose. */
1668 else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
1672 g_free(decomposition);
1674 if (!vowel || !consonant)
1678 * Now exclude the word from being reported if it's in
1681 for (i=0;*okword[i];i++)
1682 if (!strcmp(testword,okword[i]))
1685 * What looks like a typo may be a Roman numeral.
1688 if (istypo && isroman(testword))
1690 /* Check the manual list of typos. */
1692 for (i=0;*typo[i];i++)
1693 if (!strcmp(testword,typo[i]))
1696 * Check lowercase s, l, i and m - special cases.
1697 * "j" - often a semi-colon gone wrong.
1698 * "d" for a missing apostrophe - he d
1701 if (!istypo && len==1 &&
1702 g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
1706 dupcnt=g_tree_lookup(qword,testword);
1710 isdup=!pswit[VERBOSE_SWITCH];
1714 dupcnt=g_new0(int,1);
1715 g_tree_insert(qword,g_strdup(testword),dupcnt);
1720 if (pswit[ECHO_SWITCH])
1721 g_print("\n%s\n",aline);
1722 if (!pswit[OVERVIEW_SWITCH])
1724 g_print(" Line %ld column %ld - Query word %s",
1725 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
1727 if (!pswit[VERBOSE_SWITCH])
1728 g_print(" - not reporting duplicates");
1736 /* check the user's list of typos */
1737 if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
1739 if (pswit[ECHO_SWITCH])
1740 g_print("\n%s\n",aline);
1741 if (!pswit[OVERVIEW_SWITCH])
1742 g_print(" Line %ld column %ld - Query possible scanno %s\n",
1743 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
1745 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
1747 if (pswit[PARANOID_SWITCH] && warnings->digit)
1749 /* In paranoid mode, query all 0 and 1 standing alone. */
1750 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
1752 if (pswit[ECHO_SWITCH])
1753 g_print("\n%s\n",aline);
1754 if (!pswit[OVERVIEW_SWITCH])
1755 g_print(" Line %ld column %ld - Query standalone %s\n",
1756 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
1767 * check_for_misspaced_punctuation:
1769 * Look for added or missing spaces around punctuation and quotes.
1770 * If there is a punctuation character like ! with no space on
1771 * either side, suspect a missing!space. If there are spaces on
1772 * both sides , assume a typo. If we see a double quote with no
1773 * space or punctuation on either side of it, assume unspaced
1774 * quotes "like"this.
/*
 * aline:       the line being checked (UTF-8).
 * parities:    running open/closed state of quotes; dquote and squote are
 *              toggled here each time a quote is counted.
 * isemptyline: when TRUE the "Spaced punctuation?" queries are suppressed.
 *
 * The line is scanned several times, in this order:
 *   1. . ? ! , ; : _ with a letter after it (and a letter before it, or a
 *      strict mark ? ! , ; :) gives "Missing space?"; a space before it
 *      and a space or end of line after it gives "Spaced punctuation?".
 *      isacro and isellipsis hold the acronym and ellipsis exemptions.
 *   2. ? ! , ; : preceded by a space and not followed by one gives
 *      "Spaced punctuation?".
 *   3. A character preceded by a space and followed by a letter (the
 *      " .X" case) gives "Spaced punctuation?".
 *   4. "Unspaced quotes?" when the neighbours of a character are not in
 *      the permitted sets listed in the test.
 *   5. Quote parity: after toggling parities->dquote, the character that
 *      follows is checked against the set allowed after a closing quote or
 *      the set allowed after an opening quote; failures give
 *      "Wrongspaced quotes?".
 *   6. A line starting with a double quote followed by one of
 *      , ; : ! ? ) ] } or space gives "Wrongspaced quotes?" at column 1.
 *   7. With the single-quote switch on, the same parity logic is applied to
 *      single quotes that are at the line start, or not preceded by a
 *      letter, or not followed by a letter.
 * Columns are 1-based character positions.
 */
1776 void check_for_misspaced_punctuation(const char *aline,
1777 struct parities *parities,gboolean isemptyline)
1779 gboolean isacro,isellipsis;
1781 gunichar c,nc,pc,n2c;
/* Scan 1 starts here; nc is 0 for an empty line so the loop is skipped. */
1782 c=g_utf8_get_char(aline);
1783 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1784 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1788 nc=g_utf8_get_char(g_utf8_next_char(s));
1789 /* For each character in the line after the first. */
1790 if (g_utf8_strchr(".?!,;:_",-1,c)) /* if it's punctuation */
1792 /* we need to suppress warnings for acronyms like M.D. */
1794 /* we need to suppress warnings for ellipsis . . . */
1797 * If there are letters on both sides of it or
1798 * if it's strict punctuation followed by an alpha.
1800 if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
1801 g_utf8_strchr("?!,;:",-1,c)))
1805 if (g_utf8_pointer_to_offset(aline,s)>2 &&
1806 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
1808 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
1814 if (pswit[ECHO_SWITCH])
1815 g_print("\n%s\n",aline);
1816 if (!pswit[OVERVIEW_SWITCH])
1817 g_print(" Line %ld column %ld - Missing space?\n",
1818 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1823 if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
1826 * If there are spaces on both sides,
1827 * or space before and end of line.
1831 if (g_utf8_pointer_to_offset(aline,s)>2 &&
1832 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
1834 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
1838 if (!isemptyline && !isellipsis)
1840 if (pswit[ECHO_SWITCH])
1841 g_print("\n%s\n",aline);
1842 if (!pswit[OVERVIEW_SWITCH])
1843 g_print(" Line %ld column %ld - "
1844 "Spaced punctuation?\n",linecnt,
1845 g_utf8_pointer_to_offset(aline,s)+1);
1852 /* Split out the characters that CANNOT be preceded by space. */
1853 c=g_utf8_get_char(aline);
1854 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1855 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1859 nc=g_utf8_get_char(g_utf8_next_char(s));
1860 /* for each character in the line after the first */
1861 if (g_utf8_strchr("?!,;:",-1,c))
1863 /* if it's punctuation that _cannot_ have a space before it */
1864 if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
1867 * If nc DOES == space,
1868 * it was already reported just above.
1870 if (pswit[ECHO_SWITCH])
1871 g_print("\n%s\n",aline);
1872 if (!pswit[OVERVIEW_SWITCH])
1873 g_print(" Line %ld column %ld - Spaced punctuation?\n",
1874 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1881 * Special case " .X" where X is any alpha.
1882 * This plugs a hole in the acronym code above.
1883 * Inelegant, but maintainable.
1885 c=g_utf8_get_char(aline);
1886 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1887 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1891 nc=g_utf8_get_char(g_utf8_next_char(s));
1892 /* for each character in the line after the first */
1895 /* if it's a period */
1896 if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
1899 * If the period follows a space and
1900 * is followed by a letter.
1902 if (pswit[ECHO_SWITCH])
1903 g_print("\n%s\n",aline);
1904 if (!pswit[OVERVIEW_SWITCH])
1905 g_print(" Line %ld column %ld - Spaced punctuation?\n",
1906 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1912 c=g_utf8_get_char(aline);
1913 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1914 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1918 nc=g_utf8_get_char(g_utf8_next_char(s));
1919 /* for each character in the line after the first */
1922 if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
1923 !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
1924 !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
1926 if (pswit[ECHO_SWITCH])
1927 g_print("\n%s\n",aline);
1928 if (!pswit[OVERVIEW_SWITCH])
1929 g_print(" Line %ld column %ld - Unspaced quotes?\n",
1930 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1936 /* Check parity of quotes. */
1937 nc=g_utf8_get_char(aline);
1938 for (s=aline;*s;s=g_utf8_next_char(s))
1941 nc=g_utf8_get_char(g_utf8_next_char(s));
1944 parities->dquote=!parities->dquote;
1945 if (!parities->dquote)
1948 if (!g_utf8_strchr("_-.'`/,;:!?)]} ",-1,nc))
1950 if (pswit[ECHO_SWITCH])
1951 g_print("\n%s\n",aline);
1952 if (!pswit[OVERVIEW_SWITCH])
1953 g_print(" Line %ld column %ld - "
1954 "Wrongspaced quotes?\n",
1955 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1963 if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
1964 !g_utf8_strchr("_-/.'`([{$",-1,nc) || !nc)
1966 if (pswit[ECHO_SWITCH])
1967 g_print("\n%s\n",aline);
1968 if (!pswit[OVERVIEW_SWITCH])
1969 g_print(" Line %ld column %ld - "
1970 "Wrongspaced quotes?\n",
1971 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1978 if (g_utf8_get_char(aline)==CHAR_DQUOTE)
1980 if (g_utf8_strchr(",;:!?)]} ",-1,
1981 g_utf8_get_char(g_utf8_next_char(aline))))
1983 if (pswit[ECHO_SWITCH])
1984 g_print("\n%s\n",aline);
1985 if (!pswit[OVERVIEW_SWITCH])
1986 g_print(" Line %ld column 1 - Wrongspaced quotes?\n",
1992 if (pswit[SQUOTE_SWITCH])
1994 nc=g_utf8_get_char(aline);
1995 for (s=aline;*s;s=g_utf8_next_char(s))
1998 nc=g_utf8_get_char(g_utf8_next_char(s));
1999 if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&
2000 !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
2001 !g_unichar_isalpha(nc)))
2003 parities->squote=!parities->squote;
2004 if (!parities->squote)
2007 if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
2009 if (pswit[ECHO_SWITCH])
2010 g_print("\n%s\n",aline);
2011 if (!pswit[OVERVIEW_SWITCH])
2012 g_print(" Line %ld column %ld - "
2013 "Wrongspaced singlequotes?\n",
2014 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2022 if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
2023 !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
2025 if (pswit[ECHO_SWITCH])
2026 g_print("\n%s\n",aline);
2027 if (!pswit[OVERVIEW_SWITCH])
2028 g_print(" Line %ld column %ld - "
2029 "Wrongspaced singlequotes?\n",
2030 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2041 * check_for_double_punctuation:
2043 * Look for double punctuation like ,. or ,,
2044 * Thanks to DW for the suggestion!
2045 * In books with references, ".," and ".;" are common
2046 * e.g. "etc., etc.," and vol. 1.; vol 3.;
2047 * OTOH, from my initial tests, there are also fairly
2048 * common errors. What to do? Make these cases paranoid?
2049 * ".," is the most common, so warnings->dotcomma is used
2050 * to suppress detailed reporting if it occurs often.
/*
 * aline:    the line being checked (UTF-8).
 * warnings: flags produced by the first pass; dotcomma and isFrench are
 *           read here.
 *
 * Looks for two adjacent characters that are both in the set . ? ! , ; :
 * and prints "Double punctuation?" at the 1-based character column of the
 * first one. The exemption test covers: a doubled . ? or !, the pair .,
 * while warnings->dotcomma is clear, and (when warnings->isFrench is set)
 * three periods directly before or after one of , ; : ! ?
 */
2052 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
2056 nc=g_utf8_get_char(aline);
2057 for (s=aline;*s;s=g_utf8_next_char(s))
2060 nc=g_utf8_get_char(g_utf8_next_char(s));
2061 /* for each punctuation character in the line */
2062 if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
2063 g_utf8_strchr(".?!,;:",-1,nc))
2065 /* followed by punctuation, it's a query, unless . . . */
2066 if (c==nc && (c=='.' || c=='?' || c=='!') ||
2067 !warnings->dotcomma && c=='.' && nc==',' ||
2068 warnings->isFrench && g_str_has_prefix(s,",...") ||
2069 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2070 warnings->isFrench && g_str_has_prefix(s,";...") ||
2071 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2072 warnings->isFrench && g_str_has_prefix(s,":...") ||
2073 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2074 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2075 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2076 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2077 warnings->isFrench && g_str_has_prefix(s,"...?"))
/* The French ellipsis forms are singled out again for extra handling. */
2079 if (warnings->isFrench && g_str_has_prefix(s,",...") ||
2080 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2081 warnings->isFrench && g_str_has_prefix(s,";...") ||
2082 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2083 warnings->isFrench && g_str_has_prefix(s,":...") ||
2084 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2085 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2086 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2087 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2088 warnings->isFrench && g_str_has_prefix(s,"...?"))
2091 nc=g_utf8_get_char(g_utf8_next_char(s));
2093 ; /* do nothing for .. !! and ?? which can be legit */
2097 if (pswit[ECHO_SWITCH])
2098 g_print("\n%s\n",aline);
2099 if (!pswit[OVERVIEW_SWITCH])
2100 g_print(" Line %ld column %ld - Double punctuation?\n",
2101 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2110 * check_for_spaced_quotes:
/*
 * aline: the line being checked (UTF-8).
 *
 * Reports every double quote that has a space on both sides, then does the
 * same for each character listed in single_quotes[], using a GString to
 * build the space, quote, space pattern for strstr. Columns are the
 * 1-based character positions of the leading space of each match.
 */
2112 void check_for_spaced_quotes(const char *aline)
2116 const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,
2120 while ((t=strstr(s," \" ")))
2122 if (pswit[ECHO_SWITCH])
2123 g_print("\n%s\n",aline);
2124 if (!pswit[OVERVIEW_SWITCH])
2125 g_print(" Line %ld column %ld - Spaced doublequote?\n",
2126 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
/* Resume at the space after the quote so it can begin another match. */
2129 s=g_utf8_next_char(g_utf8_next_char(t));
/* One pattern buffer is reused for every single-quote variant. */
2131 pattern=g_string_new(NULL);
2132 for(i=0;i<G_N_ELEMENTS(single_quotes);i++)
2134 g_string_assign(pattern," ");
2135 g_string_append_unichar(pattern,single_quotes[i]);
2136 g_string_append_c(pattern,' ');
2138 while ((t=strstr(s,pattern->str)))
2140 if (pswit[ECHO_SWITCH])
2141 g_print("\n%s\n",aline);
2142 if (!pswit[OVERVIEW_SWITCH])
2143 g_print(" Line %ld column %ld - Spaced singlequote?\n",
2144 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2147 s=g_utf8_next_char(g_utf8_next_char(t));
/* TRUE also releases the character data held by the GString. */
2150 g_string_free(pattern,TRUE);
2154 * check_for_miscased_genative:
2156 * Check special case of 'S instead of 's at end of word.
/*
 * aline: the line being checked (UTF-8).
 *
 * Queries an apostrophe that comes after a lower-case letter and is
 * followed by a capital S. The column printed is the character offset of
 * s plus 2.
 * NOTE(review): presumably s addresses the apostrophe, which would make
 * the column that of the S - confirm.
 */
2158 void check_for_miscased_genative(const char *aline)
2164 c=g_utf8_get_char(aline);
2165 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2166 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2170 nc=g_utf8_get_char(g_utf8_next_char(s));
2171 if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))
2173 if (pswit[ECHO_SWITCH])
2174 g_print("\n%s\n",aline);
2175 if (!pswit[OVERVIEW_SWITCH])
2176 g_print(" Line %ld column %ld - Capital \"S\"?\n",
2177 linecnt,g_utf8_pointer_to_offset(aline,s)+2);
2185 * check_end_of_line:
2187 * Now check special cases - start and end of line -
2188 * for single and double quotes. Start is sometimes [sic]
2189 * but better to query it anyway.
2190 * While we're here, check for dash at end of line.
/*
 * aline:    the line being checked (UTF-8).
 * warnings: flags produced by the first pass; the hyphen flag is read here.
 *
 * Tests made:
 *   - on a line of more than one character, a final single or double quote
 *     preceded by a space is queried, with the character length of the
 *     line as the column;
 *   - a leading single quote followed by a space is queried at column 1;
 *   - in paranoid mode with warnings->hyphen set, the line is scanned
 *     backwards over characters at or below CHAR_SPACE, and a dash that is
 *     not preceded by another dash is queried.
 * NOTE(review): the hyphen query prints the raw character offset of the
 * dash, without the +1 used by most other checks - confirm intended.
 */
2192 void check_end_of_line(const char *aline,struct warnings *warnings)
2197 lbytes=strlen(aline);
2198 if (g_utf8_strlen(aline,lbytes)>1)
/* s addresses the last character of the line, c2 the one before it. */
2200 s=g_utf8_prev_char(aline+lbytes);
2201 c1=g_utf8_get_char(s);
2202 c2=g_utf8_get_char(g_utf8_prev_char(s));
2203 if ((c1==CHAR_DQUOTE || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)
2205 if (pswit[ECHO_SWITCH])
2206 g_print("\n%s\n",aline);
2207 if (!pswit[OVERVIEW_SWITCH])
2208 g_print(" Line %ld column %ld - Spaced quote?\n",linecnt,
2209 g_utf8_strlen(aline,lbytes));
/* c1 and c2 are reused for the first two characters of the line. */
2213 c1=g_utf8_get_char(aline);
2214 c2=g_utf8_get_char(g_utf8_next_char(aline));
2215 if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)
2217 if (pswit[ECHO_SWITCH])
2218 g_print("\n%s\n",aline);
2219 if (!pswit[OVERVIEW_SWITCH])
2220 g_print(" Line %ld column 1 - Spaced quote?\n",linecnt);
2225 * Dash at end of line may well be legit - paranoid mode only
2226 * and don't report em-dash at line-end.
2228 if (pswit[PARANOID_SWITCH] && warnings->hyphen)
2230 for (s=g_utf8_prev_char(aline+lbytes);
2231 s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
2233 if (g_utf8_get_char(s)=='-' &&
2234 g_utf8_get_char(g_utf8_prev_char(s))!='-')
2236 if (pswit[ECHO_SWITCH])
2237 g_print("\n%s\n",aline);
2238 if (!pswit[OVERVIEW_SWITCH])
2239 g_print(" Line %ld column %ld - "
2240 "Hyphen at end of line?\n",
2241 linecnt,g_utf8_pointer_to_offset(aline,s));
2248 * check_for_unspaced_bracket:
2250 * Brackets are often unspaced, but shouldn't be surrounded by alpha.
2251 * If so, suspect a scanno like "a]most".
/*
 * aline: the line being checked (UTF-8).
 *
 * Queries any of { [ ( ) ] } that has a letter immediately before it and a
 * letter immediately after it.
 * NOTE(review): the column printed is the raw character offset of s,
 * without the +1 used by most other checks in this file - confirm intended.
 */
2253 void check_for_unspaced_bracket(const char *aline)
2257 c=g_utf8_get_char(aline);
2258 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2259 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2263 nc=g_utf8_get_char(g_utf8_next_char(s));
2266 /* for each bracket character in the line except 1st & last */
2267 if (g_utf8_strchr("{[()]}",-1,c) &&
2268 g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
2270 if (pswit[ECHO_SWITCH])
2271 g_print("\n%s\n",aline);
2272 if (!pswit[OVERVIEW_SWITCH])
2273 g_print(" Line %ld column %ld - Unspaced bracket?\n",
2274 linecnt,g_utf8_pointer_to_offset(aline,s));
2282 * check_for_unpunctuated_endquote:
/*
 * aline: the line being checked (UTF-8).
 *
 * Queries a double quote whose preceding character is a letter, printing
 * the raw character offset of s as the column.
 * NOTE(review): the letter test uses isalpha() from the C library rather
 * than g_unichar_isalpha() as elsewhere in this file; pc is presumably a
 * gunichar, so confirm the behaviour for non-ASCII characters.
 */
2284 void check_for_unpunctuated_endquote(const char *aline)
2288 c=g_utf8_get_char(aline);
2289 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2290 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2294 nc=g_utf8_get_char(g_utf8_next_char(s));
2295 /* for each character in the line except 1st */
2296 if (c==CHAR_DQUOTE && isalpha(pc))
2298 if (pswit[ECHO_SWITCH])
2299 g_print("\n%s\n",aline);
2300 if (!pswit[OVERVIEW_SWITCH])
2301 g_print(" Line %ld column %ld - "
2302 "endquote missing punctuation?\n",
2303 linecnt,g_utf8_pointer_to_offset(aline,s));
2311 * check_for_html_tag:
2313 * Check for <HTML TAG>.
2315 * If there is a < in the line, followed at some point
2316 * by a > then we suspect HTML.
/*
 * aline: the line being checked (UTF-8).
 *
 * Locates the first < on the line and the first > after it. For the
 * detailed report the text from < to > inclusive is copied with g_strndup
 * and printed, together with the 1-based character column of the <.
 */
2318 void check_for_html_tag(const char *aline)
2320 const char *open,*close;
2322 open=strchr(aline,'<');
2325 close=strchr(g_utf8_next_char(open),'>');
2328 if (pswit[ECHO_SWITCH])
2329 g_print("\n%s\n",aline);
2330 if (!pswit[OVERVIEW_SWITCH])
/* close-open+1 bytes so that the closing > is part of the copy. */
2332 tag=g_strndup(open,close-open+1);
2333 g_print(" Line %ld column %ld - HTML Tag? %s \n",
2334 linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
2344 * check_for_html_entity:
2346 * Check for &symbol; HTML.
2348 * If there is a & in the line, followed at
2349 * some point by a ; then we suspect HTML.
/*
 * aline: the line being checked (UTF-8).
 *
 * Locates the first & on the line and the first ; at or after it, then
 * looks for a space between the two. For the detailed report the text from
 * & to ; inclusive is copied with g_strndup and printed.
 * NOTE(review): the column here is the BYTE offset of the & plus 1
 * (amp-aline), whereas the other checks in this file use the character
 * offset from g_utf8_pointer_to_offset - confirm intended for lines that
 * contain multi-byte characters before the &.
 */
2351 void check_for_html_entity(const char *aline)
2353 const char *s,*amp,*scolon;
2355 amp=strchr(aline,'&');
2358 scolon=strchr(amp,';');
2361 for (s=amp;s<scolon;s=g_utf8_next_char(s))
2362 if (g_utf8_get_char(s)==CHAR_SPACE)
2363 break; /* Don't report "Jones & Son;" */
2366 if (pswit[ECHO_SWITCH])
2367 g_print("\n%s\n",aline);
2368 if (!pswit[OVERVIEW_SWITCH])
/* scolon-amp+1 bytes so that the semicolon is part of the copy. */
2370 entity=g_strndup(amp,scolon-amp+1);
2371 g_print(" Line %ld column %d - HTML symbol? %s \n",
2372 linecnt,(int)(amp-aline)+1,entity);
2383 * check_for_omitted_punctuation:
2385 * Check for omitted punctuation at end of paragraph by working back
2386 * through prevline. DW.
2387 * Need to check this only for "normal" paras.
2388 * So what is a "normal" para?
2389 * Not normal if one-liner (chapter headings, etc.)
1390 * Not normal if it doesn't contain at least one lowercase letter
2391 * Not normal if starts with space
/*
 * prevline:        the line to inspect (UTF-8); the caller passes the line
 *                  that came before the current one.
 * last:            properties of that line; blen is read here.
 * start_para_line: line number at which the paragraph began.
 *
 * The check runs only when prevline contains a letter, last->blen is
 * greater than 2, the paragraph began before line linecnt-1 and prevline
 * starts with a character above CHAR_SPACE. It steps back from the end of
 * prevline over closing quotes, then scans backwards: a letter leads to
 * the "No punctuation at para end?" query for line linecnt-1, with the
 * character length of prevline as the column. The characters
 * - . : ! ( [ { ? } ] ) are tested for in the same backward scan.
 */
2393 void check_for_omitted_punctuation(const char *prevline,
2394 struct line_properties *last,int start_para_line)
2396 gboolean letter_on_line=FALSE;
/* Any alphabetic character at all is enough to set letter_on_line. */
2399 for (s=prevline;*s;s=g_utf8_next_char(s))
2400 if (g_unichar_isalpha(g_utf8_get_char(s)))
2402 letter_on_line=TRUE;
2406 * This next "if" is a problem.
2407 * If we say "start_para_line <= linecnt - 1", that includes
2408 * one-line "paragraphs" like chapter heads. Lotsa false positives.
2409 * If we say "start_para_line < linecnt - 1" it doesn't, but then it
2410 * misses genuine one-line paragraphs.
2412 if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
2413 g_utf8_get_char(prevline)>CHAR_SPACE)
2415 s=prevline+strlen(prevline);
2418 s=g_utf8_prev_char(s);
2419 c=g_utf8_get_char(s);
2420 } while (CHAR_IS_CLOSING_QUOTE(c) && c>CHAR_SPACE && s>prevline);
2421 for (;s>prevline;s=g_utf8_prev_char(s))
2423 if (g_unichar_isalpha(g_utf8_get_char(s)))
2425 if (pswit[ECHO_SWITCH])
2426 g_print("\n%s\n",prevline);
2427 if (!pswit[OVERVIEW_SWITCH])
2428 g_print(" Line %ld column %ld - "
2429 "No punctuation at para end?\n",
2430 linecnt-1,g_utf8_strlen(prevline,-1));
2435 if (g_utf8_strchr("-.:!([{?}])",-1,g_utf8_get_char(s)))
/*
 * Callback with the (key, value, data) signature used by g_tree_foreach;
 * procfile passes it when traversing qword.
 *
 * key:   the queried word, used as a C string.
 * value: presumably the int counter stored for the word by
 *        check_for_typos - confirm.
 * data:  user data from g_tree_foreach; procfile passes NULL.
 *
 * Prints a note naming the word and a duplicate count.
 */
2441 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
2443 const char *word=key;
2446 g_print("\nNote: Queried word %s was duplicated %d times\n",
/*
 * string: UTF-8 text to print. procfile also calls this with NULL once a
 *         file has been processed.
 *
 * A single iconv converter from UTF-8 to WINDOWS-1252 is kept in a static
 * variable and opened on first use. When it is available the text is
 * converted into a freshly allocated buffer of strlen(string)+1 bytes.
 * The first branch closes the converter and resets it to (GIConv)-1.
 * NOTE(review): presumably that branch handles the NULL call and the final
 * fputs is the fallback when no converter could be opened - confirm.
 */
2451 void print_as_windows_1252(const char *string)
2453 gsize inbytes,outbytes;
2455 static GIConv converter=(GIConv)-1;
2458 if (converter!=(GIConv)-1)
2459 g_iconv_close(converter);
2460 converter=(GIConv)-1;
/* Open lazily; (GIConv)-1 marks a converter that is not open. */
2463 if (converter==(GIConv)-1)
2464 converter=g_iconv_open("WINDOWS-1252","UTF-8");
2465 if (converter!=(GIConv)-1)
/* The output buffer is sized to the input byte count plus a terminator. */
2467 inbytes=outbytes=strlen(string);
2468 bp=buf=g_malloc(outbytes+1);
2469 g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
2475 fputs(string,stdout);
/*
 * string: text to print; it is written to stdout unchanged with fputs.
 */
2478 void print_as_utf_8(const char *string)
2480 fputs(string,stdout);
2488 void procfile(const char *filename)
2491 gchar *parastart=NULL; /* first line of current para */
2492 gchar *etext,*aline;
2495 struct first_pass_results *first_pass_results;
2496 struct warnings *warnings;
2497 struct counters counters={0};
2498 struct line_properties last={0};
2499 struct parities parities={0};
2500 struct pending pending={0};
2501 gboolean isemptyline;
2502 long start_para_line=0;
2503 gboolean isnewpara=FALSE,enddash=FALSE;
2504 last.start=CHAR_SPACE;
2505 linecnt=checked_linecnt=0;
2506 etext=read_etext(filename,&err);
2509 if (pswit[STDOUT_SWITCH])
2510 fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
2512 fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
2515 g_print("\n\nFile: %s\n\n",filename);
2516 first_pass_results=first_pass(etext);
2517 warnings=report_first_pass(first_pass_results);
2518 qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
2519 qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
2521 * Here we go with the main pass. Hold onto yer hat!
2525 if (g_path_is_absolute(filename))
2526 g_print("\nPath: %s\n",filename);
2530 cwd=g_get_current_dir();
2531 path=g_build_filename(cwd,filename,NULL);
2533 g_print("\nPath: %s\n",path);
2536 g_print(" Line 1 - Path to ebook printed\n");
2537 while ((aline=flgets(&etext_ptr,linecnt+1)))
2542 if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
2543 continue; // skip DP page separators completely
2544 if (linecnt<first_pass_results->firstline ||
2545 (first_pass_results->footerline>0 &&
2546 linecnt>first_pass_results->footerline))
2548 if (pswit[HEADER_SWITCH])
2550 if (g_str_has_prefix(aline,"Title:"))
2551 g_print(" %s\n",aline);
2552 if (g_str_has_prefix(aline,"Author:"))
2553 g_print(" %s\n",aline);
2554 if (g_str_has_prefix(aline,"Release Date:"))
2555 g_print(" %s\n",aline);
2556 if (g_str_has_prefix(aline,"Edition:"))
2557 g_print(" %s\n\n",aline);
2559 continue; /* skip through the header */
2562 print_pending(aline,parastart,&pending);
2563 isemptyline=analyse_quotes(aline,&counters);
2564 if (isnewpara && !isemptyline)
2566 /* This line is the start of a new paragraph. */
2567 start_para_line=linecnt;
2568 /* Capture its first line in case we want to report it later. */
2570 parastart=g_strdup(aline);
2571 memset(&parities,0,sizeof(parities)); /* restart the quote count */
2573 while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
2574 !g_unichar_isdigit(g_utf8_get_char(s)))
2575 s=g_utf8_next_char(s);
2576 if (g_unichar_islower(g_utf8_get_char(s)))
2578 /* and its first letter is lowercase */
2579 if (pswit[ECHO_SWITCH])
2580 g_print("\n%s\n",aline);
2581 if (!pswit[OVERVIEW_SWITCH])
2582 g_print(" Line %ld column %ld - "
2583 "Paragraph starts with lower-case\n",
2584 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2588 isnewpara=FALSE; /* Signal the end of new para processing. */
2590 /* Check for an em-dash broken at line end. */
2591 if (enddash && g_utf8_get_char(aline)=='-')
2593 if (pswit[ECHO_SWITCH])
2594 g_print("\n%s\n",aline);
2595 if (!pswit[OVERVIEW_SWITCH])
2596 g_print(" Line %ld column 1 - Broken em-dash?\n",linecnt);
2601 for (s=g_utf8_prev_char(aline+strlen(aline));
2602 g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
2604 if (s>=aline && g_utf8_get_char(s)=='-')
2606 check_for_control_characters(aline);
2608 check_for_odd_characters(aline,warnings,isemptyline);
2609 if (warnings->longline)
2610 check_for_long_line(aline);
2611 if (warnings->shortline)
2612 check_for_short_line(aline,&last);
2614 last.len=g_utf8_strlen(aline,-1);
2615 last.start=g_utf8_get_char(aline);
2616 check_for_starting_punctuation(aline);
2619 check_for_spaced_emdash(aline);
2620 check_for_spaced_dash(aline);
2622 check_for_unmarked_paragraphs(aline);
2623 check_for_jeebies(aline);
2624 check_for_mta_from(aline);
2625 check_for_orphan_character(aline);
2626 check_for_pling_scanno(aline);
2627 check_for_extra_period(aline,warnings);
2628 check_for_following_punctuation(aline);
2629 check_for_typos(aline,warnings);
2630 check_for_misspaced_punctuation(aline,&parities,isemptyline);
2631 check_for_double_punctuation(aline,warnings);
2632 check_for_spaced_quotes(aline);
2633 check_for_miscased_genative(aline);
2634 check_end_of_line(aline,warnings);
2635 check_for_unspaced_bracket(aline);
2636 if (warnings->endquote)
2637 check_for_unpunctuated_endquote(aline);
2638 check_for_html_tag(aline);
2639 check_for_html_entity(aline);
2642 check_for_mismatched_quotes(&counters,&pending);
2643 counters_reset(&counters);
2644 /* let the next iteration know that it's starting a new para */
2647 check_for_omitted_punctuation(prevline,&last,start_para_line);
2650 prevline=g_strdup(aline);
2653 check_for_mismatched_quotes(&counters,&pending);
2654 print_pending(NULL,parastart,&pending);
2655 reset_pending(&pending);
2664 if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])
2665 g_tree_foreach(qword,report_duplicate_queries,NULL);
2666 g_tree_unref(qword);
2667 g_tree_unref(qperiod);
2668 counters_destroy(&counters);
2669 g_set_print_handler(NULL);
2670 print_as_windows_1252(NULL);
2671 if (pswit[MARKUP_SWITCH])
2678 * Get one line from the input text, checking for
2679 * the existence of exactly one CR/LF line-end per line.
2681 * Returns: a pointer to the line.
/*
 * etext is an in/out cursor into the text being read: the line begins
 * where *etext points on entry, and *etext is moved forward one UTF-8
 * character at a time as the text is consumed. lcnt is the line number
 * quoted in the line-end diagnostics printed below.
 */
2683 char *flgets(char **etext,long lcnt)
2686 gboolean isCR=FALSE;
2687 char *theline=*etext;
/* Decode the character under the cursor, then step the cursor past it. */
2692 c=g_utf8_get_char(*etext);
2693 *etext=g_utf8_next_char(*etext);
2696 /* either way, it's end of line */
2703 /* Error - a LF without a preceding CR */
/* Line-end problems are only reported when LINE_END_SWITCH is set. */
2704 if (pswit[LINE_END_SWITCH])
2706 if (pswit[ECHO_SWITCH])
/* Echo a temporary copy of the bytes from the start of the line up to eos. */
2708 s=g_strndup(theline,eos-theline);
2709 g_print("\n%s\n",s);
2712 if (!pswit[OVERVIEW_SWITCH])
2713 g_print(" Line %ld - No CR?\n",lcnt);
2724 /* Error - two successive CRs */
2725 if (pswit[LINE_END_SWITCH])
2727 if (pswit[ECHO_SWITCH])
2729 s=g_strndup(theline,eos-theline);
2730 g_print("\n%s\n",s);
2733 if (!pswit[OVERVIEW_SWITCH])
2734 g_print(" Line %ld - Two successive CRs?\n",lcnt);
/* This report additionally requires isCR to be set. */
2743 if (pswit[LINE_END_SWITCH] && isCR)
2745 if (pswit[ECHO_SWITCH])
2747 s=g_strndup(theline,eos-theline);
2748 g_print("\n%s\n",s);
2751 if (!pswit[OVERVIEW_SWITCH])
/* The column is the 1-based character (not byte) offset of eos in the line. */
2752 g_print(" Line %ld column %ld - CR without LF?\n",
2753 lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
/* Move the end-of-string marker forward by one UTF-8 character. */
2759 eos=g_utf8_next_char(eos);
/*
 * Optional clean-up passes over the line: HTML stripping when
 * MARKUP_SWITCH is set, DP markup stripping when DP_SWITCH is set.
 * Both helpers receive the line itself rather than a copy.
 */
2763 if (pswit[MARKUP_SWITCH])
2764 postprocess_for_HTML(theline);
2765 if (pswit[DP_SWITCH])
2766 postprocess_for_DP(theline);
2773 * Takes a "word" as a parameter, and checks whether it
2774 * contains a mixture of alpha and digits. Generally, this is an
2775 * error, but may not be for cases like 4th or L5 12s. 3d.
2777 * Returns: TRUE iff an error is found.
/*
 * checkword is a NUL-terminated UTF-8 string; it is only read, never
 * modified.
 */
2779 gboolean mixdigit(const char *checkword)
2781 gboolean wehaveadigit,wehavealetter,query;
2782 const char *s,*nondigit;
2783 wehaveadigit=wehavealetter=query=FALSE;
/* Classify every character of the word as a letter or a digit. */
2784 for (s=checkword;*s;s=g_utf8_next_char(s))
2785 if (g_unichar_isalpha(g_utf8_get_char(s)))
2787 else if (g_unichar_isdigit(g_utf8_get_char(s)))
/* Only a word containing both kinds of character needs a closer look. */
2789 if (wehaveadigit && wehavealetter)
2791 /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
/*
 * Skip the leading run of digits: afterwards nondigit points at the first
 * character that is not a digit (possibly the terminating NUL).
 */
2793 for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
2794 nondigit=g_utf8_next_char(nondigit))
2796 /* digits, ending in st, rd, nd, th of either case */
/*
 * Each comparison below is against the whole remainder of the word, so
 * the suffix must match exactly, ignoring ASCII case.
 */
2797 if (!g_ascii_strcasecmp(nondigit,"st") ||
2798 !g_ascii_strcasecmp(nondigit,"rd") ||
2799 !g_ascii_strcasecmp(nondigit,"nd") ||
2800 !g_ascii_strcasecmp(nondigit,"th"))
/* Same suffixes with a trailing "s". */
2802 if (!g_ascii_strcasecmp(nondigit,"sts") ||
2803 !g_ascii_strcasecmp(nondigit,"rds") ||
2804 !g_ascii_strcasecmp(nondigit,"nds") ||
2805 !g_ascii_strcasecmp(nondigit,"ths"))
/* Same suffixes with a trailing "ly". */
2807 if (!g_ascii_strcasecmp(nondigit,"stly") ||
2808 !g_ascii_strcasecmp(nondigit,"rdly") ||
2809 !g_ascii_strcasecmp(nondigit,"ndly") ||
2810 !g_ascii_strcasecmp(nondigit,"thly"))
2812 /* digits, ending in l, L, s or d */
/*
 * Only "l" is matched in either case; "s" and "d" go through strcmp and
 * therefore match in lower case only.
 */
2813 if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
2814 !strcmp(nondigit,"d"))
2817 * L at the start of a number, representing Britsh pounds, like L500.
2818 * This is cute. We know the current word is mixed digit. If the first
2819 * letter is L, there must be at least one digit following. If both
2820 * digits and letters follow, we have a genuine error, else we have a
2821 * capital L followed by digits, and we accept that as a non-error.
/* The recursive call examines the word with its first character removed. */
2823 if (g_utf8_get_char(checkword)=='L' &&
2824 !mixdigit(g_utf8_next_char(checkword)))
2833 * Extracts the first/next "word" from the line, and returns it.
2834 * A word is defined as one English word unit--or at least that's the aim.
2835 * "ptr" is advanced to the position in the line where we will start
2836 * looking for the next word.
2838 * Returns: A newly-allocated string.
/*
 * ptr is an in/out cursor into a NUL-terminated UTF-8 line. Both return
 * statements hand back the character data of the GString (g_string_free
 * with FALSE), so ownership of the returned buffer passes to the caller.
 */
2840 gchar *getaword(const char **ptr)
2845 word=g_string_new(NULL);
/*
 * Skip leading characters that are neither digits nor letters, stopping
 * at the end of the string.
 */
2846 for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
2847 !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
2848 **ptr;*ptr=g_utf8_next_char(*ptr))
2851 * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
2852 * Especially yucky is the case of L1,000
2853 * This section looks for a pattern of characters including a digit
2854 * followed by a comma or period followed by one or more digits.
2855 * If found, it returns this whole pattern as a word; otherwise we discard
2856 * the results and resume our normal programming.
/*
 * Collect the candidate into word through the look-ahead pointer s:
 * a run of digits, letters, commas and periods.
 */
2859 for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
2860 g_unichar_isalpha(g_utf8_get_char(s)) ||
2861 g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
2862 g_string_append_unichar(word,g_utf8_get_char(s));
/*
 * Walk the candidate from its second character onwards, pairing each
 * character (c) with the one before it (pc).
 */
2865 for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))
2867 c=g_utf8_get_char(t);
2868 pc=g_utf8_get_char(g_utf8_prev_char(t));
/* A period or comma that directly follows a digit. */
2869 if ((c=='.' || c==',') && g_unichar_isdigit(pc))
2872 return g_string_free(word,FALSE);
2876 /* we didn't find a punctuated number - do the regular getword thing */
/* Throw the candidate away and rebuild the word starting from *ptr. */
2877 g_string_truncate(word,0);
2878 c=g_utf8_get_char(*ptr);
/*
 * Accumulate digits, letters and anything CHAR_IS_APOSTROPHE accepts,
 * advancing *ptr past each character taken.
 */
2879 for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);
2880 *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))
2881 g_string_append_unichar(word,c);
2882 return g_string_free(word,FALSE);
2888 * Is this word a Roman Numeral?
2890 * It doesn't actually validate that the number is a valid Roman Numeral--for
2891 * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
2892 * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
2893 * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
2894 * expressions thereof, except when it came to taxes. Allow any number of M,
2895 * an optional D, an optional CM or CD, any number of optional Cs, an optional
2896 * XL or an optional XC, an optional IX or IV, an optional V and any number
/*
 * t is a NUL-terminated string. The tests are applied once each, in a
 * fixed order from the largest numeral to the smallest:
 * m, d, cm, cd, c, xl, xc, l, x, ix, iv, v, i.
 * NOTE(review): every literal tested is lower-case, so the text being
 * matched is presumably case-folded before these tests - confirm.
 */
2899 gboolean isroman(const char *t)
/*
 * In the repeat loops the trailing "&& *t" test is redundant: when the
 * character equals the letter being looked for it cannot be NUL.
 */
2905 while (g_utf8_get_char(t)=='m' && *t)
2907 if (g_utf8_get_char(t)=='d')
/* Two-letter subtractive forms are recognised with a prefix match. */
2909 if (g_str_has_prefix(t,"cm"))
2911 if (g_str_has_prefix(t,"cd"))
2913 while (g_utf8_get_char(t)=='c' && *t)
2915 if (g_str_has_prefix(t,"xl"))
2917 if (g_str_has_prefix(t,"xc"))
2919 if (g_utf8_get_char(t)=='l')
2921 while (g_utf8_get_char(t)=='x' && *t)
2923 if (g_str_has_prefix(t,"ix"))
2925 if (g_str_has_prefix(t,"iv"))
2927 if (g_utf8_get_char(t)=='v')
2929 while (g_utf8_get_char(t)=='i' && *t)
2935 * postprocess_for_DP:
2937 * Invoked with the -d switch from flgets().
2938 * It simply "removes" from the line a hard-coded set of common
2939 * DP-specific tags, so that the line passed to the main routine has
2940 * been pre-cleaned of DP markup.
/*
 * theline is a NUL-terminated string that is edited in place; it can only
 * get shorter.
 */
2942 void postprocess_for_DP(char *theline)
/* DPmarkup is walked until an entry whose first byte is NUL (empty string). */
2948 for (i=0;*DPmarkup[i];i++)
/* Repeat until no occurrence of this tag is left in the line. */
2949 while ((s=strstr(theline,DPmarkup[i])))
/*
 * t points just past the tag. Shifting the tail down, terminating NUL
 * included (hence the +1), overwrites the tag.
 */
2951 t=s+strlen(DPmarkup[i]);
2952 memmove(s,t,strlen(t)+1);
2957 * postprocess_for_HTML:
2959 * Invoked with the -m switch from flgets().
2960 * It simply "removes" from the line a hard-coded set of common
2961 * HTML tags and "replaces" a hard-coded set of common HTML
2962 * entities, so that the line passed to the main routine has
2963 * been pre-cleaned of HTML.
/* theline is passed straight through to both helpers. */
2965 void postprocess_for_HTML(char *theline)
/* Keep calling losemarkup() for as long as it returns a non-NULL pointer. */
2967 while (losemarkup(theline))
/* Entity handling is delegated to loseentities(). */
2969 loseentities(theline);
/*
 * losemarkup:
 *
 * Looks for a <...> pair in theline and compares the text after the '<'
 * with the entries of the markup table; a matching tag is cut out of the
 * line in place.
 */
2972 char *losemarkup(char *theline)
/*
 * s is the first '<' in the line; t is the first '>' at or after it, or
 * NULL when the line has no '<'.
 */
2976 s=strchr(theline,'<');
2977 t=s?strchr(s,'>'):NULL;
/* The markup table ends with an entry whose first byte is NUL. */
2980 for (i=0;*markup[i];i++)
/* Compare the text immediately after the '<' with this table entry. */
2981 if (tagcomp(g_utf8_next_char(s),markup[i]))
/*
 * Step t past the '>' and shift the rest of the line, terminating NUL
 * included, down over the tag.
 */
2983 t=g_utf8_next_char(t);
2984 memmove(s,t,strlen(t)+1);
2987 /* It's an unrecognized <xxx>. */
/*
 * loseentities:
 *
 * Rewrites &...; references found in theline, in place. Numeric
 * references are parsed as decimal or hexadecimal; named ones are looked
 * up in a tree built from the HTMLentities table.
 */
2991 void loseentities(char *theline)
/*
 * NOTE(review): the two converters below are static and so keep their
 * value between calls, while entities is declared without static and
 * starts as NULL on every call. If the tree is meant to be kept between
 * calls this looks unintended - confirm.
 */
2998 GTree *entities=NULL;
2999 static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
/*
 * Tear-down of the lookup tree and of any converter that is open.
 * NOTE(review): confirm which kind of call triggers this path.
 */
3003 g_tree_destroy(entities);
3005 if (translit!=(GIConv)-1)
3006 g_iconv_close(translit);
3007 translit=(GIConv)-1;
3008 if (to_utf8!=(GIConv)-1)
3009 g_iconv_close(to_utf8);
/*
 * Build the lookup tree: keys are entity names compared with strcmp,
 * values are the code points packed into pointers.
 */
3017 entities=g_tree_new((GCompareFunc)strcmp);
3018 for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
3019 g_tree_insert(entities,HTMLentities[i].name,
3020 GUINT_TO_POINTER(HTMLentities[i].c));
/*
 * Open the converters only if they are not open yet: translit goes from
 * UTF-8 to ISO_8859-1 with transliteration, to_utf8 goes back again.
 */
3022 if (translit==(GIConv)-1)
3023 translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
3024 if (to_utf8==(GIConv)-1)
3025 to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
/* Handle each '&' in turn; theline is moved along as the line is processed. */
3026 while((amp=strchr(theline,'&')))
3028 scolon=strchr(amp,';');
/*
 * Numeric reference: everything from amp+2 up to the ';' is a decimal
 * digit. NOTE(review): presumably amp[1] has been checked to be '#'
 * before this point - confirm.
 */
3033 if (amp+2+strspn(amp+2,"0123456789")==scolon)
3034 c=strtol(amp+2,NULL,10);
/* Hexadecimal form: an 'x' at amp[2], then only hex digits up to the ';'. */
3035 else if (amp[2]=='x' &&
3036 amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
3037 c=strtol(amp+3,NULL,16);
/* Named reference: copy the text between '&' and ';' and look it up. */
3041 s=g_strndup(amp+1,scolon-(amp+1));
3042 c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
/*
 * && binds tighter than ||, so this reads c<128 || (c>=192 && c<=255).
 * For such a value the UTF-8 encoding of c is written at theline, and
 * theline advances by the number of bytes written.
 */
3051 if (c<128 || c>=192 && c<=255) /* An ISO-8859-1 character */
3052 theline+=g_unichar_to_utf8(c,theline);
/*
 * Otherwise encode c into s, convert it to ISO_8859-1 with
 * transliteration and back to UTF-8, then copy the nb resulting bytes
 * into the line.
 */
3056 nb=g_unichar_to_utf8(c,s);
3057 t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
3059 s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
3061 memcpy(theline,s,nb);
/*
 * Move the text that follows the ';' (terminating NUL included) down to
 * theline.
 */
3065 memmove(theline,g_utf8_next_char(scolon),
3066 strlen(g_utf8_next_char(scolon))+1);
/* Resume the search just after this '&'. */
3069 theline=g_utf8_next_char(amp);
/*
 * tagcomp:
 *
 * Case-insensitive prefix test: does strin, after an optional leading
 * '/', begin with basetag? Neither argument is modified; the comparison
 * is made on case-folded copies.
 * NOTE(review): because this is a prefix match, a longer tag name that
 * merely starts with basetag also matches - confirm that is intended.
 */
3073 gboolean tagcomp(const char *strin,const char *basetag)
3077 if (g_utf8_get_char(strin)=='/')
3078 t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
3080 t=g_utf8_casefold(strin,-1);
3081 s=g_utf8_casefold(basetag,-1);
/* The result is whether the folded input starts with the folded tag. */
3082 retval=g_str_has_prefix(t,s);
3088 void proghelp(GOptionContext *context)
3091 fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
3092 fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
3093 fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
3094 fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
3095 "For details, read the file COPYING.\n",stderr);
3096 fputs("This is Free Software; "
3097 "you may redistribute it under certain conditions (GPL);\n",stderr);
3098 fputs("read the file COPYING for details.\n\n",stderr);
3099 help=g_option_context_get_help(context,TRUE,NULL);
3102 fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
3103 fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
3104 "non-ASCII\n",stderr);
3105 fputs("characters like accented letters, "
3106 "lines longer than 75 or shorter than 55,\n",stderr);
3107 fputs("unbalanced quotes or brackets, "
3108 "a variety of badly formatted punctuation, \n",stderr);
3109 fputs("HTML tags, some likely typos. "
3110 "It is NOT a substitute for human judgement.\n",stderr);