1 /*************************************************************************/
2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com> */
5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk> */
7 /* This program is free software; you can redistribute it and/or modify */
8 /* it under the terms of the GNU General Public License as published by */
9 /* the Free Software Foundation; either version 2 of the License, or */
10 /* (at your option) any later version. */
12 /* This program is distributed in the hope that it will be useful, */
13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
15 /* GNU General Public License for more details. */
17 /* You should have received a copy of the GNU General Public License */
18 /* along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 /*************************************************************************/
30 #include "bookloupe.h"
33 #include "HTMLentities.h"
39 "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
40 "nad", "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa",
41 "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
42 "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
43 "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
44 "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
45 "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
46 "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
47 "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
48 "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
49 "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
50 "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
51 "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
52 "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
53 "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
54 "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
55 "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
56 "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
57 "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
58 "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
59 "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
60 "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
61 "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
62 "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
63 "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
64 "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
65 "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
66 "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
67 "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
73 /* Common abbreviations and other OK words not to query as typos. */
75 "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
76 "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
77 "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
78 "outbid", "outbids", "frostbite", "frostbitten", ""
81 /* Common abbreviations that cause otherwise unexplained periods. */
83 "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
84 "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
88 * Two-Letter combinations that rarely if ever start words,
89 * but are common scannos or otherwise common letter combinations.
92 "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
96 * Two-Letter combinations that rarely if ever end words,
97 * but are common scannos or otherwise common letter combinations.
100 "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
101 "sw", "gr", "sl", "cl", "iy", ""
105 "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
106 "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
107 "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
108 "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
112 "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
116 "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
117 "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
118 "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
119 "during", "let", "toward", "among", ""
123 "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
124 "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
125 "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
126 "among", "those", "into", "whom", "having", "thence", ""
129 gboolean pswit[SWITNO]; /* program switches */
/*
 * Command-line option table passed to g_option_context_add_main_entries()
 * in parse_options().  Every entry shown is a G_OPTION_ARG_NONE flag whose
 * target is one slot of the pswit[] switch array.
 * Note that "relaxed", "line-end" and "noecho" are inverted afterwards by
 * parse_options(): giving the flag turns the corresponding check OFF.
 */
131 static GOptionEntry options[]={
132 { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
133 "Ignore DP-specific markup", NULL },
134 { "noecho", 'e', 0, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
135 "Don't echo queried line", NULL },
136 { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
137 "Check single quotes", NULL },
138 { "typo", 't', 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
139 "Check common typos", NULL },
140 { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
141 "Require closure of quotes on every paragraph", NULL },
142 { "relaxed", 'x', 0, G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
143 "Disable paranoid querying of everything", NULL },
144 { "line-end", 'l', 0, G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
145 "Disable line end checking", NULL },
146 { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
147 "Overview: just show counts", NULL },
148 { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
149 "Output errors to stdout instead of stderr", NULL },
150 { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
151 "Echo header fields", NULL },
152 { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
153 "Ignore markup in < >", NULL },
154 { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
155 "Use file of user-defined typos", NULL },
156 { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
157 "Defaults for use on www upload", NULL },
158 { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
159 "Verbose - list everything", NULL },
163 long cnt_quote; /* for overview mode, count of quote queries */
164 long cnt_brack; /* for overview mode, count of brackets queries */
165 long cnt_bin; /* for overview mode, count of non-ASCII queries */
166 long cnt_odd; /* for overview mode, count of odd character queries */
167 long cnt_long; /* for overview mode, count of long line errors */
168 long cnt_short; /* for overview mode, count of short line queries */
169 long cnt_punct; /* for overview mode,
170 count of punctuation and spacing queries */
171 long cnt_dash; /* for overview mode, count of dash-related queries */
172 long cnt_word; /* for overview mode, count of word queries */
173 long cnt_html; /* for overview mode, count of html queries */
174 long cnt_lineend; /* for overview mode, count of line-end queries */
175 long cnt_spacend; /* count of lines with space at end */
176 long linecnt; /* count of total lines in the file */
177 long checked_linecnt; /* count of lines actually checked */
179 void proghelp(GOptionContext *context);
180 void procfile(const char *);
184 gboolean mixdigit(const char *);
185 gchar *getaword(const char **);
186 char *flgets(char **,long);
187 void postprocess_for_HTML(char *);
188 char *linehasmarkup(char *);
189 char *losemarkup(char *);
190 gboolean tagcomp(const char *,const char *);
191 void loseentities(char *);
192 gboolean isroman(const char *);
193 void postprocess_for_DP(char *);
194 void print_as_windows_1252(const char *string);
195 void print_as_utf_8(const char *string);
197 GTree *qword,*qperiod;
/*
 * Parse the command line with GOption, filling pswit[] from the options
 * table, then post-process the switches:
 *  - PARANOID, LINE_END and ECHO are negated because their command-line
 *    flags mean "turn this off";
 *  - in paranoid mode the TYPO switch is negated as well, so typo checking
 *    is on unless the flag was given;
 *  - overview mode forces echo off;
 *  - the web switch overrides every other switch with fixed values.
 * argc/argv are passed by address so g_option_context_parse() can update
 * them.  A parse failure is reported with g_printerr().
 */
203 void parse_options(int *argc,char ***argv)
206 GOptionContext *context;
207 context=g_option_context_new(
208 "file - looks for errors in Project Gutenberg(TM) etexts");
209 g_option_context_add_main_entries(context,options,NULL);
210 if (!g_option_context_parse(context,argc,argv,&err))
212 g_printerr("Bookloupe: %s\n",err->message);
213 g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
216 /* Paranoid checking is turned OFF, not on, by its switch */
217 pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
218 if (pswit[PARANOID_SWITCH])
219 /* if running in paranoid mode, typo checks default to enabled */
220 pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
221 /* Line-end checking is turned OFF, not on, by its switch */
222 pswit[LINE_END_SWITCH]=!pswit[LINE_END_SWITCH];
223 /* Echoing is turned OFF, not on, by its switch */
224 pswit[ECHO_SWITCH]=!pswit[ECHO_SWITCH];
225 if (pswit[OVERVIEW_SWITCH])
226 /* just print summary; don't echo */
227 pswit[ECHO_SWITCH]=FALSE;
229 * Web uploads - for the moment, this is really just a placeholder
230 * until we decide what processing we really want to do on web uploads
232 if (pswit[WEB_SWITCH])
234 /* specific override for web uploads */
235 pswit[ECHO_SWITCH]=TRUE;
236 pswit[SQUOTE_SWITCH]=FALSE;
237 pswit[TYPO_SWITCH]=TRUE;
238 pswit[QPARA_SWITCH]=FALSE;
239 pswit[PARANOID_SWITCH]=TRUE;
240 pswit[LINE_END_SWITCH]=FALSE;
241 pswit[OVERVIEW_SWITCH]=FALSE;
242 pswit[STDOUT_SWITCH]=FALSE;
243 pswit[HEADER_SWITCH]=TRUE;
244 pswit[VERBOSE_SWITCH]=FALSE;
245 pswit[MARKUP_SWITCH]=FALSE;
246 pswit[USERTYPO_SWITCH]=FALSE;
247 pswit[DP_SWITCH]=FALSE;
254 g_option_context_free(context);
260 * Read in the user-defined stealth scanno list.
/*
 * Load the user typo list into the usertypo tree.
 * Candidate files are tried in this order, moving to the next one only
 * when the previous attempt failed with G_FILE_ERROR_NOENT:
 *   1. "bookloupe.typ" (relative path)
 *   2. running_from/"bookloupe.typ"
 *   3. "gutcheck.typ" (relative path)
 *   4. running_from/"gutcheck.typ"
 * If none exists a notice is printed; any other error is written to stderr.
 * Contents that validate as UTF-8 are normalized, otherwise they are
 * converted from Windows-1252.  The text is split on CR/LF and each line
 * whose first byte is greater than '!' is inserted as a key.
 */
262 void read_user_scannos(void)
265 gchar *usertypo_file;
269 gchar *contents,*utf8,**lines;
270 usertypo_file=g_strdup("bookloupe.typ");
271 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
272 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
275 g_free(usertypo_file);
276 usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
277 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
279 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
282 g_free(usertypo_file);
283 usertypo_file=g_strdup("gutcheck.typ");
284 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
286 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
289 g_free(usertypo_file);
290 usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
291 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
293 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
295 g_free(usertypo_file);
296 g_print(" --> I couldn't find bookloupe.typ "
297 "-- proceeding without user typos.\n");
302 fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
303 g_free(usertypo_file);
307 if (g_utf8_validate(contents,len,NULL))
308 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
310 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
312 lines=g_strsplit_set(utf8,"\r\n",0);
/* Keys are the split strings themselves; the tree frees them with g_free. */
314 usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
315 for (i=0;lines[i];i++)
316 if (*(unsigned char *)lines[i]>'!')
317 g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
326 * Read an etext returning a newly allocated string containing the file
327 * contents or NULL on error.
/*
 * Load filename and produce its text as UTF-8.
 * Input that validates as UTF-8 is normalized (default-compose) and the
 * UTF-8 print handler is installed.  Anything else is treated as
 * Windows-1252, converted with g_convert(), and the Windows-1252 print
 * handler is installed instead.
 * When the conversion fails on an illegal byte sequence, the bytes before
 * the failure are scanned for '\n' / '\r' (presumably to work out the
 * line and column - the counting statements are not visible here) and a
 * descriptive G_CONVERT_ERROR is stored in *err; other conversion errors
 * are propagated unchanged.
 */
329 gchar *read_etext(const char *filename,GError **err)
331 GError *tmp_err=NULL;
332 gchar *contents,*utf8;
333 gsize len,bytes_read,bytes_written;
335 if (!g_file_get_contents(filename,&contents,&len,err))
337 if (g_utf8_validate(contents,len,NULL))
339 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
340 g_set_print_handler(print_as_utf_8);
342 SetConsoleOutputCP(CP_UTF8);
347 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,
348 &bytes_written,&tmp_err);
349 if (g_error_matches(tmp_err,G_CONVERT_ERROR,
350 G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
353 for(i=0;i<bytes_read;i++)
354 if (contents[i]=='\n')
359 else if (contents[i]!='\r')
/* contents[bytes_read] is the first byte g_convert() could not convert. */
361 g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
362 "Input conversion failed. Byte %d at line %d, column %d is not a "
363 "valid Windows-1252 character",
364 ((unsigned char *)contents)[bytes_read],line,col);
367 g_propagate_error(err,tmp_err);
368 g_set_print_handler(print_as_windows_1252);
370 SetConsoleOutputCP(1252);
/*
 * atexit() handler registered by main(): puts the console output code
 * page back to saved_cp, the value main() read with GetConsoleOutputCP().
 */
377 void cleanup_on_exit(void)
380 SetConsoleOutputCP(saved_cp);
/*
 * Program entry point.
 * Registers cleanup_on_exit(), remembers the console code page and the
 * directory of argv[0] (running_from), parses the options and prints a
 * banner on stderr.  In overview mode it prints the checked/total line
 * counts followed by one line per query category and a grand total that
 * sums all eleven cnt_* counters.  running_from and the usertypo tree
 * are released before returning.
 */
384 int main(int argc,char **argv)
387 atexit(cleanup_on_exit);
388 saved_cp=GetConsoleOutputCP();
390 running_from=g_path_get_dirname(argv[0]);
391 parse_options(&argc,&argv);
392 if (pswit[USERTYPO_SWITCH])
394 fprintf(stderr,"bookloupe: Check and report on an e-text\n");
396 if (pswit[OVERVIEW_SWITCH])
398 g_print(" Checked %ld lines of %ld (head+foot = %ld)\n\n",
399 checked_linecnt,linecnt,linecnt-checked_linecnt);
400 g_print(" --------------- Queries found --------------\n");
402 g_print(" Long lines: %14ld\n",cnt_long);
404 g_print(" Short lines: %14ld\n",cnt_short);
406 g_print(" Line-end problems: %14ld\n",cnt_lineend);
408 g_print(" Common typos: %14ld\n",cnt_word);
410 g_print(" Unmatched quotes: %14ld\n",cnt_quote);
412 g_print(" Unmatched brackets: %14ld\n",cnt_brack);
414 g_print(" Non-ASCII characters: %14ld\n",cnt_bin);
416 g_print(" Proofing characters: %14ld\n",cnt_odd);
418 g_print(" Punctuation & spacing queries: %14ld\n",cnt_punct);
420 g_print(" Non-standard dashes: %14ld\n",cnt_dash);
422 g_print(" Possible HTML tags: %14ld\n",cnt_html);
424 g_print(" TOTAL QUERIES %14ld\n",
425 cnt_quote+cnt_brack+cnt_bin+cnt_odd+cnt_long+cnt_short+cnt_punct+
426 cnt_dash+cnt_word+cnt_html+cnt_lineend);
428 g_free(running_from);
430 g_tree_unref(usertypo);
437 * Run a first pass - verify that it's a valid PG
438 * file, decide whether to report some things that
439 * occur many times in the text like long or short
440 * lines, non-standard dashes, etc.
/*
 * First pass over the whole etext.
 * Splits the text on '\n', strips trailing '\r' from each line and
 * gathers statistics in a function-static first_pass_results: header and
 * footer positions, total / high-bit / alphabetic character counts,
 * unpunctuated end-quotes, short, long and very long lines, ".," lines,
 * asterisk and forward-slash lines, end-of-line hyphens, HTML-looking
 * markup, spaced dashes and em-dashes, and Dutch / French marker words.
 * Lines after a detected footer are not counted.
 *
 * The character before a double quote is classified with
 * g_unichar_isalpha() rather than isalpha(): the value is a gunichar and
 * isalpha() is only defined for unsigned char values and EOF.
 */
442 struct first_pass_results *first_pass(const char *etext)
444 gunichar laststart=CHAR_SPACE;
449 unsigned int lastlen=0,lastblen=0;
450 long spline=0,nspline=0;
451 static struct first_pass_results results={0};
453 lines=g_strsplit(etext,"\n",0);
454 for (j=0;lines[j];j++)
456 lbytes=strlen(lines[j]);
457 while (lbytes>0 && lines[j][lbytes-1]=='\r')
458 lines[j][--lbytes]='\0';
459 llen=g_utf8_strlen(lines[j],lbytes);
461 if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
462 (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
465 g_print(" --> Duplicate header?\n");
466 spline=linecnt+1; /* first line of non-header text, that is */
468 if (!strncmp(lines[j],"*** START",9) &&
469 strstr(lines[j],"PROJECT GUTENBERG"))
472 g_print(" --> Duplicate header?\n");
473 nspline=linecnt+1; /* first line of non-header text, that is */
475 if (spline || nspline)
477 lc_line=g_utf8_strdown(lines[j],lbytes);
478 if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
480 if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
482 if (results.footerline)
484 /* it's an old-form header - we can detect duplicates */
486 g_print(" --> Duplicate footer?\n");
489 results.footerline=linecnt;
495 results.firstline=spline;
497 results.firstline=nspline; /* override with new */
498 if (results.footerline)
499 continue; /* don't count the boilerplate in the footer */
500 results.totlen+=llen;
501 for (s=lines[j];*s;s=g_utf8_next_char(s))
503 if (g_utf8_get_char(s)>127)
505 if (g_unichar_isalpha(g_utf8_get_char(s)))
/* a double quote directly after a letter: unpunctuated end-quote */
507 if (s>lines[j] && g_utf8_get_char(s)==CHAR_DQUOTE &&
508 g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
509 results.endquote_count++;
511 if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
512 lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
515 g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
517 if (strstr(lines[j],".,"))
519 /* only count asterisk lines for ignoring purposes where there is */
520 /* lowercase text on the line */
521 if (strchr(lines[j],'*'))
523 for (s=lines[j];*s;s=g_utf8_next_char(s))
524 if (g_unichar_islower(g_utf8_get_char(s)))
529 if (strchr(lines[j],'/'))
530 results.fslashline++;
/* walk back over trailing white space to the last visible character */
533 for (s=g_utf8_prev_char(lines[j]+lbytes);
534 s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;
535 s=g_utf8_prev_char(s))
537 if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
538 g_utf8_get_char(g_utf8_prev_char(s))!='-')
541 if (llen>LONGEST_PG_LINE)
543 if (llen>WAY_TOO_LONG)
544 results.verylongline++;
545 if (strchr(lines[j],'<') && strchr(lines[j],'>'))
547 i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
550 if (strstr(lines[j],"<i>"))
551 results.htmcount+=4; /* bonus marks! */
553 /* Check for spaced em-dashes */
554 if (lines[j][0] && (s=strstr(g_utf8_next_char(lines[j]),"--")))
557 if (s[-1]==CHAR_SPACE || s[2]==CHAR_SPACE)
558 results.space_emdash++;
559 if (s[-1]==CHAR_SPACE && s[2]==CHAR_SPACE)
560 /* count of em-dashes with spaces both sides */
561 results.non_PG_space_emdash++;
562 if (s[-1]!=CHAR_SPACE && s[2]!=CHAR_SPACE)
563 /* count of PG-type em-dashes with no spaces */
564 results.PG_space_emdash++;
569 if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))
570 results.Dutchcount++;
571 if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))
572 results.Frenchcount++;
573 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
574 results.standalone_digit++;
577 /* Check for spaced dashes */
578 if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
582 laststart=lines[j][0];
591 * Make some snap decisions based on the first pass results.
593 struct warnings *report_first_pass(struct first_pass_results *results)
595 static struct warnings warnings={0};
597 g_print(" --> %ld lines in this file have white space at end\n",
600 if (results->dotcomma>5)
603 g_print(" --> %ld lines in this file contain '.,'. "
604 "Not reporting them.\n",results->dotcomma);
607 * If more than 50 lines, or one-tenth, are short,
608 * don't bother reporting them.
610 warnings.shortline=1;
611 if (results->shortline>50 || results->shortline*10>linecnt)
613 warnings.shortline=0;
614 g_print(" --> %ld lines in this file are short. "
615 "Not reporting short lines.\n",results->shortline);
618 * If more than 50 lines, or one-tenth, are long,
619 * don't bother reporting them.
622 if (results->longline>50 || results->longline*10>linecnt)
625 g_print(" --> %ld lines in this file are long. "
626 "Not reporting long lines.\n",results->longline);
628 /* If more than 10 lines contain asterisks, don't bother reporting them. */
630 if (results->astline>10)
633 g_print(" --> %ld lines in this file contain asterisks. "
634 "Not reporting them.\n",results->astline);
637 * If more than 10 lines contain forward slashes,
638 * don't bother reporting them.
641 if (results->fslashline>10)
644 g_print(" --> %ld lines in this file contain forward slashes. "
645 "Not reporting them.\n",results->fslashline);
648 * If more than 20 lines contain unpunctuated endquotes,
649 * don't bother reporting them.
652 if (results->endquote_count>20)
655 g_print(" --> %ld lines in this file contain unpunctuated endquotes. "
656 "Not reporting them.\n",results->endquote_count);
659 * If more than 10 lines contain standalone digits,
660 * don't bother reporting them.
663 if (results->standalone_digit>10)
666 g_print(" --> %ld lines in this file contain standalone 0s and 1s. "
667 "Not reporting them.\n",results->standalone_digit);
670 * If more than 20 lines contain hyphens at end,
671 * don't bother reporting them.
674 if (results->hyphens>20)
677 g_print(" --> %ld lines in this file have hyphens at end. "
678 "Not reporting them.\n",results->hyphens);
680 if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
682 g_print(" --> Looks like this is HTML. Switching HTML mode ON.\n");
683 pswit[MARKUP_SWITCH]=1;
685 if (results->verylongline>0)
686 g_print(" --> %ld lines in this file are VERY long!\n",
687 results->verylongline);
689 * If there are more non-PG spaced dashes than PG em-dashes,
690 * assume it's deliberate.
691 * Current PG guidelines say don't use them, but older texts do,
692 * and some people insist on them whatever the guidelines say.
695 if (results->spacedash+results->non_PG_space_emdash>
696 results->PG_space_emdash)
699 g_print(" --> There are %ld spaced dashes and em-dashes. "
700 "Not reporting them.\n",
701 results->spacedash+results->non_PG_space_emdash);
703 /* If more than a quarter of characters are hi-bit, bug out. */
705 if (results->binlen*4>results->totlen)
707 g_print(" --> This file does not appear to be ASCII. "
708 "Terminating. Best of luck with it!\n");
711 if (results->alphalen*4<results->totlen)
713 g_print(" --> This file does not appear to be text. "
714 "Terminating. Best of luck with it!\n");
717 if (results->binlen*100>results->totlen || results->binlen>100)
719 g_print(" --> There are a lot of foreign letters here. "
720 "Not reporting them.\n");
723 warnings.isDutch=FALSE;
724 if (results->Dutchcount>50)
726 warnings.isDutch=TRUE;
727 g_print(" --> This looks like Dutch - "
728 "switching off dashes and warnings for 's Middags case.\n");
730 warnings.isFrench=FALSE;
731 if (results->Frenchcount>50)
733 warnings.isFrench=TRUE;
734 g_print(" --> This looks like French - "
735 "switching off some doublepunct.\n");
737 if (results->firstline && results->footerline)
738 g_print(" The PG header and footer appear to be already on.\n");
741 if (results->firstline)
742 g_print(" The PG header is on - no footer.\n");
743 if (results->footerline)
744 g_print(" The PG footer is on - no header.\n");
747 if (pswit[VERBOSE_SWITCH])
750 warnings.shortline=1;
759 g_print(" *** Verbose output is ON -- you asked for it! ***\n");
761 if (warnings.isDutch)
763 if (results->footerline>0 && results->firstline>0 &&
764 results->footerline>results->firstline &&
765 results->footerline-results->firstline<100)
767 g_print(" --> I don't really know where this text starts. \n");
768 g_print(" There are no reference points.\n");
769 g_print(" I'm going to have to report the header and footer "
771 results->firstline=0;
779 * Look along the line, accumulate the count of quotes, and see
780 * if this is an empty line - i.e. a line with nothing on it
782 * If line has just spaces, period, * and/or - on it, don't
783 * count it, since empty lines with asterisks or dashes to
784 * separate sections are common.
786 * Returns: TRUE if the line is empty.
/*
 * Scan one line, updating the quote / bracket / underscore counters, and
 * report whether the line counts as "empty".
 *
 * aline:    the line to scan (UTF-8).
 * linecnt:  line number used in diagnostics.  It is an int here, so it is
 *           widened to long explicitly before being printed with %ld.
 * counters: running quote and bracket state updated in place.
 *
 * Double quotes are always counted.  Single quotes are only examined when
 * SQUOTE_SWITCH is set, and are classified as apostrophe, opening,
 * closing or neutral quote from the neighbouring characters.  The
 * "preceded by punctuation" test uses g_utf8_strchr() because the
 * argument is a full Unicode code point; strchr() would truncate it to a
 * byte and could match an unrelated character.
 *
 * Returns: TRUE if the line is empty.
 */
788 gboolean analyse_quotes(const char *aline,int linecnt,struct counters *counters)
791 /* assume the line is empty until proven otherwise */
792 gboolean isemptyline=TRUE;
793 const char *s=aline,*sprev,*snext;
796 GError *tmp_err=NULL;
799 snext=g_utf8_next_char(s);
800 c=g_utf8_get_char(s);
801 if (CHAR_IS_DQUOTE(c))
802 (void)count_quote(counters,c,QUOTE_CLASS(c),&tmp_err);
803 else if (CHAR_IS_SQUOTE(c) && pswit[SQUOTE_SWITCH])
808 * At start of line, it can only be a quotation mark.
809 * Hardcode a very common exception!
811 if (!g_str_has_prefix(snext,"tis") &&
812 !g_str_has_prefix(snext,"Tis"))
813 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
815 else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
816 g_unichar_isalpha(g_utf8_get_char(snext)))
817 /* Do nothing! it's definitely an apostrophe, not a quote */
819 /* it's outside a word - let's check it out */
820 else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||
821 g_unichar_isalpha(g_utf8_get_char(snext)))
823 /* certainly looks like a quotation mark */
824 if (!g_str_has_prefix(snext,"tis") &&
825 !g_str_has_prefix(snext,"Tis"))
826 /* hardcode a very common exception! */
828 if (g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(sprev)))
829 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
831 (void)count_quote(counters,c,OPENING_QUOTE,&tmp_err);
836 /* now - is it a quotation mark? */
837 guessquote=0; /* accumulate clues */
838 if (g_unichar_isalpha(g_utf8_get_char(sprev)))
840 /* it follows a letter - could be either */
842 if (g_utf8_get_char(sprev)=='s')
844 /* looks like a plural apostrophe */
846 if (g_utf8_get_char(snext)==CHAR_SPACE)
850 if (innermost_quote_matches(counters,c))
852 * Give it the benefit of some doubt,
853 * if a squote is already open.
859 (void)count_quote(counters,c,CLOSING_QUOTE,&tmp_err);
862 /* no adjacent letter - it must be a quote of some kind */
863 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
868 if (pswit[ECHO_SWITCH])
869 g_print("\n%s\n",aline);
870 if (!pswit[OVERVIEW_SWITCH])
871 g_print(" Line %ld column %ld - %s\n",
872 (long)linecnt,g_utf8_pointer_to_offset(aline,s)+1,tmp_err->message);
873 g_clear_error(&tmp_err);
875 if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
877 isemptyline=FALSE; /* ignore lines like * * * as spacers */
878 if (c==CHAR_UNDERSCORE)
879 counters->c_unders++;
880 if (c==CHAR_OPEN_SBRACK)
/* "[Illustration:" at the very start of a line gets its own counter */
882 if (!matching_difference(counters,COUNTER_ILLUSTRATION) &&
883 !matching_difference(counters,c) && s==aline &&
884 g_str_has_prefix(s,"[Illustration:"))
885 increment_matching(counters,COUNTER_ILLUSTRATION,TRUE);
887 increment_matching(counters,c,TRUE);
889 else if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK)
890 increment_matching(counters,c,TRUE);
891 if (c==CHAR_CLOSE_SBRACK)
893 if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) &&
894 !matching_difference(counters,c) && !*snext)
895 increment_matching(counters,COUNTER_ILLUSTRATION,FALSE);
897 increment_matching(counters,c,FALSE);
899 else if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK)
900 increment_matching(counters,c,FALSE);
908 * check_for_control_characters:
910 * Check for invalid or questionable characters in the line
911 * Anything above 127 is invalid for plain ASCII, and
912 * non-printable control characters should also be flagged.
913 * Tabs should generally not be there.
/*
 * Report every control character (below CHAR_SPACE) on the line other
 * than LF, CR and TAB.
 *
 * aline: the line to examine (UTF-8).
 *
 * The column is the 1-based character offset of the offending character
 * from the start of the line, so g_utf8_pointer_to_offset() must be given
 * the line start first and the current position second.
 */
915 void check_for_control_characters(const char *aline)
919 for (s=aline;*s;s=g_utf8_next_char(s))
921 c=g_utf8_get_char(s);
922 if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
924 if (pswit[ECHO_SWITCH])
925 g_print("\n%s\n",aline);
926 if (!pswit[OVERVIEW_SWITCH])
927 g_print(" Line %ld column %ld - Control character %u\n",
928 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
936 * check_for_odd_characters:
938 * Check for binary and other odd characters.
/*
 * Walk the line once and query "odd" characters: control or non-ASCII
 * characters, tabs, tildes, carats, forward slashes (only when
 * warnings->fslash is set) and asterisks (only in paranoid mode, when
 * warnings->ast is set and the line is not an "empty" spacer line).
 * Each category is guarded by its own e* flag; presumably the flag is set
 * after the first report so a category is queried once per line - the
 * assignments are not visible here.
 * Characters above 127 are labelled "Non-ISO-8859" when they fall in
 * 128..159 or above 255, and "Non-ASCII" otherwise.
 */
940 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
941 gboolean isemptyline)
943 /* Don't repeat multiple warnings on one line. */
944 gboolean eNon_A=FALSE,eTab=FALSE,eTilde=FALSE;
945 gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
948 for (s=aline;*s;s=g_utf8_next_char(s))
950 c=g_utf8_get_char(s);
951 if (!eNon_A && (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
953 if (pswit[ECHO_SWITCH])
954 g_print("\n%s\n",aline);
955 if (!pswit[OVERVIEW_SWITCH])
956 if (c>127 && c<160 || c>255)
957 g_print(" Line %ld column %ld - "
958 "Non-ISO-8859 character %u\n",
959 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
961 g_print(" Line %ld column %ld - "
962 "Non-ASCII character %u\n",
963 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
968 if (!eTab && c==CHAR_TAB)
970 if (pswit[ECHO_SWITCH])
971 g_print("\n%s\n",aline);
972 if (!pswit[OVERVIEW_SWITCH])
973 g_print(" Line %ld column %ld - Tab character?\n",
974 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
979 if (!eTilde && c==CHAR_TILDE)
982 * Often used by OCR software to indicate an
983 * unrecognizable character.
985 if (pswit[ECHO_SWITCH])
986 g_print("\n%s\n",aline);
987 if (!pswit[OVERVIEW_SWITCH])
988 g_print(" Line %ld column %ld - Tilde character?\n",
989 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
994 if (!eCarat && c==CHAR_CARAT)
996 if (pswit[ECHO_SWITCH])
997 g_print("\n%s\n",aline);
998 if (!pswit[OVERVIEW_SWITCH])
999 g_print(" Line %ld column %ld - Carat character?\n",
1000 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1005 if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
1007 if (pswit[ECHO_SWITCH])
1008 g_print("\n%s\n",aline);
1009 if (!pswit[OVERVIEW_SWITCH])
1010 g_print(" Line %ld column %ld - Forward slash?\n",
1011 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1017 * Report asterisks only in paranoid mode,
1018 * since they're often deliberate.
1020 if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
1023 if (pswit[ECHO_SWITCH])
1024 g_print("\n%s\n",aline);
1025 if (!pswit[OVERVIEW_SWITCH])
1026 g_print(" Line %ld column %ld - Asterisk?\n",
1027 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1036 * check_for_long_line:
1038 * Check for line too long.
/*
 * Query a line whose length in characters (not bytes) exceeds
 * LONGEST_PG_LINE.  The reported column and the reported length are the
 * same value: the character count of the line.
 */
1040 void check_for_long_line(const char *aline)
1042 if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
1044 if (pswit[ECHO_SWITCH])
1045 g_print("\n%s\n",aline);
1046 if (!pswit[OVERVIEW_SWITCH])
1047 g_print(" Line %ld column %ld - Long line %ld\n",
1048 linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
1055 * check_for_short_line:
1057 * Check for line too short.
1059 * This one is a bit trickier to implement: we don't want to
1060 * flag the last line of a paragraph for being short, so we
1061 * have to wait until we know that our current line is a
1062 * "normal" line, then report the _previous_ line if it was too
1063 * short. We also don't want to report indented lines like
1064 * chapter heads or formatted quotations. We therefore keep
1065 * last->len as the length of the last line examined, and
1066 * last->blen as the length of the last but one, and try to
1067 * suppress unnecessary warnings by checking that both were of
1068 * "normal" length. We keep the first character of the last
1069 * line in last->start, and if it was a space, we assume that
1070 * the formatting is deliberate. I can't figure out a way to
1071 * distinguish something like a quoted verse left-aligned or
1072 * the header or footer of a letter from a paragraph of short
1073 * lines - maybe if I examined the whole paragraph, and if the
1074 * para has less than, say, 8 lines and if all lines are short,
1075 * then just assume it's OK? Need to look at some texts to see
1076 * how often a formula like this would get the right result.
/*
 * Query the PREVIOUS line as short once the current line is known to be
 * an ordinary line.
 * Condition: the current line has more than one character, the previous
 * line (last->len) is longer than 1 but shorter than SHORTEST_PG_LINE,
 * the line before that (last->blen) is longer than SHORTEST_PG_LINE, and
 * the previous line did not start with a space.
 * The echo and the message use prevline and linecnt-1, not aline.
 */
1078 void check_for_short_line(const char *aline,const struct line_properties *last)
1080 if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
1081 last->len<SHORTEST_PG_LINE && last->blen>1 &&
1082 last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
1084 if (pswit[ECHO_SWITCH])
1085 g_print("\n%s\n",prevline);
1086 if (!pswit[OVERVIEW_SWITCH])
1087 g_print(" Line %ld column %ld - Short line %ld?\n",
1088 linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
1095 * check_for_starting_punctuation:
1097 * Look for punctuation other than full ellipses at start of line.
/*
 * Query a non-empty line whose first character is one of . ? ! , ; :
 * unless the line begins with the spaced ellipsis ". . .".
 */
1099 void check_for_starting_punctuation(const char *aline)
1101 if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
1102 !g_str_has_prefix(aline,". . ."))
1104 if (pswit[ECHO_SWITCH])
1105 g_print("\n%s\n",aline);
1106 if (!pswit[OVERVIEW_SWITCH])
1107 g_print(" Line %ld column 1 - Begins with punctuation?\n",
1115 * check_for_spaced_emdash:
1117 * Check for spaced em-dashes.
1119 * We must check _all_ occurrences of "--" on the line
1120 * hence the loop - even if the first double-dash is OK
1121 * there may be another that's wrong later on.
/*
 * Query every "--" on the line that has a space immediately before it
 * (when it is not at the start of the line) or immediately after it.
 * t points at the current "--"; next points just past it and is where the
 * following search resumes.
 */
1123 void check_for_spaced_emdash(const char *aline)
1125 const char *s,*t,*next;
1126 for (s=aline;t=strstr(s,"--");s=next)
1128 next=g_utf8_next_char(g_utf8_next_char(t));
1129 if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
1130 g_utf8_get_char(next)==CHAR_SPACE)
1132 if (pswit[ECHO_SWITCH])
1133 g_print("\n%s\n",aline);
1134 if (!pswit[OVERVIEW_SWITCH])
1135 g_print(" Line %ld column %ld - Spaced em-dash?\n",
1136 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1144 * check_for_spaced_dash:
1146 * Check for spaced dashes.
/*
 * Query a single dash with a space next to it.
 * Only the first " -" on the line is examined, and it is queried when the
 * character after the dash is not another dash.  "- " is looked for only
 * when the line contains no " -" at all; it is queried when it starts the
 * line or is not preceded by another dash.
 */
1148 void check_for_spaced_dash(const char *aline)
1151 if ((s=strstr(aline," -")))
1153 if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
1155 if (pswit[ECHO_SWITCH])
1156 g_print("\n%s\n",aline);
1157 if (!pswit[OVERVIEW_SWITCH])
1158 g_print(" Line %ld column %ld - Spaced dash?\n",
1159 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1164 else if ((s=strstr(aline,"- ")))
1166 if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
1168 if (pswit[ECHO_SWITCH])
1169 g_print("\n%s\n",aline);
1170 if (!pswit[OVERVIEW_SWITCH])
1171 g_print(" Line %ld column %ld - Spaced dash?\n",
1172 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1180 * check_for_unmarked_paragraphs:
1182 * Check for unmarked paragraphs indicated by separate speakers.
1184 * May well be false positive:
1185 * "Bravo!" "Wonderful!" called the crowd.
1186 * but useful all the same.
/*
 * check_for_unmarked_paragraphs:
 *
 * Look for a double quote, a space and another double quote on one
 * line and query a possibly missing paragraph break. The reported
 * column is the 1-based character offset of the match.
 *
 * aline: the line to examine.
 *
 * NOTE(review): the two strstr() calls below look identical in this
 * listing. Runs of spaces inside the literals may have been collapsed
 * by the extraction, and the statements between and after the calls
 * are not visible - confirm against the pristine file before changing
 * either literal.
 */
1188 void check_for_unmarked_paragraphs(const char *aline)
1191 s=strstr(aline,"\" \"");
1193 s=strstr(aline,"\" \"");
1196 if (pswit[ECHO_SWITCH])
1197 g_print("\n%s\n",aline);
1198 if (!pswit[OVERVIEW_SWITCH])
1199 g_print(" Line %ld column %ld - "
1200 "Query missing paragraph break?\n",
1201 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1208 * check_for_jeebies:
1210 * Check for "to he" and other easy h/b errors.
1212 * This is a very inadequate effort on the h/b problem,
1213 * but the phrase "to he" is always an error, whereas "to
1214 * be" is quite common.
1215 * Similarly, '"Quiet!", be said.' is a non-be error
1216 * "to he" is _not_ always an error!:
1217 * "Where they went to he couldn't say."
1218 * Another false positive:
1219 * What would "Cinderella" be without the . . .
1220 * and another: "If he wants to he can see for himself."
/*
 * check_for_jeebies:
 *
 * Search aline for a fixed set of phrases that typically come from h/b
 * confusion and issue one of three queries: he/be, had/bad or hut/but.
 * Each group assigns s from a series of strstr() calls and then reports
 * the 1-based character column of s.
 *
 * aline: the line to examine.
 *
 * NOTE(review): the statements between consecutive strstr() calls are
 * not visible in this listing (presumably a test that s is still NULL,
 * so that the first matching phrase wins) - confirm.
 * NOTE(review): two of the he/be literals look identical here, possibly
 * because runs of spaces were collapsed by the extraction.
 * NOTE(review): strstr() is case-sensitive, so the phrase written with
 * a lower-case i cannot match a capital I unless the caller lower-cases
 * the line first - confirm whether that is intended.
 */
1222 void check_for_jeebies(const char *aline)
/* Group 1: he/be confusions. */
1225 s=strstr(aline," be could ");
1227 s=strstr(aline," be would ");
1229 s=strstr(aline," was be ");
1231 s=strstr(aline," be is ");
1233 s=strstr(aline," is be ");
1235 s=strstr(aline,"\", be ");
1237 s=strstr(aline,"\" be ");
1239 s=strstr(aline,"\" be ");
1241 s=strstr(aline," to he ");
1244 if (pswit[ECHO_SWITCH])
1245 g_print("\n%s\n",aline);
1246 if (!pswit[OVERVIEW_SWITCH])
1247 g_print(" Line %ld column %ld - Query he/be error?\n",
1248 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Group 2: had/bad confusions. */
1252 s=strstr(aline," the had ");
1254 s=strstr(aline," a had ");
1256 s=strstr(aline," they bad ");
1258 s=strstr(aline," she bad ");
1260 s=strstr(aline," he bad ");
1262 s=strstr(aline," you bad ");
1264 s=strstr(aline," i bad ");
1267 if (pswit[ECHO_SWITCH])
1268 g_print("\n%s\n",aline);
1269 if (!pswit[OVERVIEW_SWITCH])
1270 g_print(" Line %ld column %ld - Query had/bad error?\n",
1271 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Group 3: hut/but confusions after a semicolon or comma. */
1275 s=strstr(aline,"; hut ");
1277 s=strstr(aline,", hut ");
1280 if (pswit[ECHO_SWITCH])
1281 g_print("\n%s\n",aline);
1282 if (!pswit[OVERVIEW_SWITCH])
1283 g_print(" Line %ld column %ld - Query hut/but error?\n",
1284 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1291 * check_for_mta_from:
1293 * Special case - angled bracket in front of "From" placed there by an
1294 * MTA when sending an e-mail.
/*
 * check_for_mta_from:
 *
 * Query the first occurrence of a closing angle bracket directly
 * followed by the word From. The reported column is the 1-based
 * character offset of the angle bracket.
 *
 * aline: the line to examine.
 *
 * NOTE(review): the test that s is non-NULL is not visible in this
 * listing; presumably it sits between the search and the report.
 */
1296 void check_for_mta_from(const char *aline)
1299 s=strstr(aline,">From");
1302 if (pswit[ECHO_SWITCH])
1303 g_print("\n%s\n",aline);
1304 if (!pswit[OVERVIEW_SWITCH])
1305 g_print(" Line %ld column %ld - "
1306 "Query angled bracket with From\n",
1307 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1314 * check_for_orphan_character:
1316 * Check for a single character line -
1317 * often an overflow from bad wrapping.
/*
 * check_for_orphan_character:
 *
 * Query a line that consists of exactly one character, unless that
 * character is I, V, X, L or a digit, which is accepted as a numeral
 * standing alone on a line. The report always names column 1.
 *
 * aline: the line to examine.
 *
 * NOTE(review): the argument line of the final g_print() call is not
 * visible in this listing.
 */
1319 void check_for_orphan_character(const char *aline)
1322 c=g_utf8_get_char(aline);
/* True when there is a first character and nothing follows it. */
1323 if (c && !*g_utf8_next_char(aline))
1325 if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
1326 ; /* Nothing - ignore numerals alone on a line. */
1329 if (pswit[ECHO_SWITCH])
1330 g_print("\n%s\n",aline);
1331 if (!pswit[OVERVIEW_SWITCH])
1332 g_print(" Line %ld column 1 - Query single character line\n",
1341 * check_for_pling_scanno:
1343 * Check for I" - often should be !
/*
 * check_for_pling_scanno:
 *
 * Query the first occurrence of a space, a capital I and a double
 * quote, where the I is often a misread exclamation mark.
 *
 * aline: the line to examine.
 *
 * NOTE(review): s addresses the space that starts the match, and the
 * column printed is g_utf8_pointer_to_offset() without the +1 that most
 * other checks in this file add. Confirm which character the column is
 * meant to name.
 * NOTE(review): the test that s is non-NULL is not visible in this
 * listing.
 */
1345 void check_for_pling_scanno(const char *aline)
1348 s=strstr(aline," I\"");
1351 if (pswit[ECHO_SWITCH])
1352 g_print("\n%s\n",aline);
1353 if (!pswit[OVERVIEW_SWITCH])
1354 g_print(" Line %ld column %ld - Query I=exclamation mark?\n",
1355 linecnt,g_utf8_pointer_to_offset(aline,s));
1362 * check_for_extra_period:
1364 * Check for period without a capital letter. Cut-down from gutspell.
1365 * Only works when it happens on a single line.
/*
 * check_for_extra_period:
 *
 * Look for a period that is followed by a lower-case word, which
 * suggests the period does not end a sentence. Works on one line only.
 *
 * Visible stages for each period-space pair found by strstr():
 *  - a pair at the start of the line is skipped (handled elsewhere);
 *  - a pair not preceded by a letter is skipped;
 *  - for Dutch texts (warnings->isDutch) a pattern of apostrophe,
 *    lower-case letter, space, upper-case letter after the period is
 *    recognised as a special case;
 *  - the next letter or digit after the pair is located, and only when
 *    it is lower-case does the investigation continue;
 *  - the scan then walks back from the period over letters, digits and
 *    apostrophes between letters to isolate the preceding word into
 *    testword;
 *  - testword is compared with the abbrev list, tested for a leading
 *    digit, for being a single character, for being a Roman numeral,
 *    and for containing a vowel (judged on the first code point of
 *    each canonical decomposition);
 *  - a surviving word is recorded in qperiod and queried; a word
 *    already in qperiod is queried again only under VERBOSE_SWITCH.
 *
 * The whole scan appears to run only when PARANOID_SWITCH is set.
 *
 * aline:    the line to examine.
 * warnings: first-pass results; only isDutch is read in the lines
 *           visible here.
 *
 * NOTE(review): the effect of each test (presumably clearing a flag
 * that gates the report) and the release of testword are on lines not
 * visible in this listing.
 * NOTE(review): one scan loop below passes a gunichar to isdigit()
 * while the rest of the function uses g_unichar_isdigit(). The C
 * library only defines isdigit() for unsigned char values and EOF, so
 * this looks like an oversight - confirm and align.
 */
1367 void check_for_extra_period(const char *aline,const struct warnings *warnings)
1369 const char *s,*t,*s1,*sprev;
1374 gunichar c,nc,pc,*decomposition;
1375 if (pswit[PARANOID_SWITCH])
1377 for (t=aline;t=strstr(t,". ");)
1381 t=g_utf8_next_char(t);
1382 /* start of line punctuation is handled elsewhere */
1385 if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
1387 t=g_utf8_next_char(t);
1390 if (warnings->isDutch)
1392 /* For Frank & Jeroen -- 's Middags case */
1393 gunichar c2,c3,c4,c5;
1394 c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
1395 c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
1396 c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
1397 c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
1398 if (CHAR_IS_APOSTROPHE(c2) &&
1399 g_unichar_islower(c3) && c4==CHAR_SPACE &&
1400 g_unichar_isupper(c5))
1402 t=g_utf8_next_char(t);
1406 s1=g_utf8_next_char(g_utf8_next_char(t));
1407 while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
1408 !isdigit(g_utf8_get_char(s1)))
1409 s1=g_utf8_next_char(s1);
1410 if (g_unichar_islower(g_utf8_get_char(s1)))
1412 /* we have something to investigate */
1414 /* so let's go back and find out */
1415 nc=g_utf8_get_char(t);
1416 s1=g_utf8_prev_char(t);
1417 c=g_utf8_get_char(s1);
1418 sprev=g_utf8_prev_char(s1);
1419 pc=g_utf8_get_char(sprev);
1421 (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||
1422 g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&
1423 g_unichar_isalpha(nc)))
1428 sprev=g_utf8_prev_char(s1);
1429 pc=g_utf8_get_char(sprev);
1431 s1=g_utf8_next_char(s1);
1434 testword=g_strndup(s1,s-s1);
1436 testword=g_strdup(s1);
1437 for (i=0;*abbrev[i];i++)
1438 if (!strcmp(testword,abbrev[i]))
1440 if (g_unichar_isdigit(g_utf8_get_char(testword)))
1442 if (!*g_utf8_next_char(testword))
1444 if (isroman(testword))
1449 for (s=testword;*s;s=g_utf8_next_char(s))
1451 decomposition=g_unicode_canonical_decomposition(
1452 g_utf8_get_char(s),&len);
1453 if (g_utf8_strchr("aeiou",-1,decomposition[0]))
1455 g_free(decomposition);
1459 (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
1461 g_tree_insert(qperiod,g_strdup(testword),
1462 GINT_TO_POINTER(1));
1463 if (pswit[ECHO_SWITCH])
1464 g_print("\n%s\n",aline);
1465 if (!pswit[OVERVIEW_SWITCH])
1466 g_print(" Line %ld column %ld - Extra period?\n",
1467 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1473 t=g_utf8_next_char(t);
1479 * check_for_following_punctuation:
1481 * Check for words usually not followed by punctuation.
/*
 * check_for_following_punctuation:
 *
 * Query words that are not normally followed by punctuation. The check
 * is gated by TYPO_SWITCH.
 *
 * Each word is lower-cased with g_utf8_strdown() into inword and then
 * compared with two lists:
 *  - nocomma:  queried when the character at s is a comma, semicolon
 *              or colon;
 *  - noperiod: queried when the character at s is a period or an
 *              exclamation mark.
 * The reported column is the 1-based character offset of s.
 *
 * aline: the line to examine.
 *
 * NOTE(review): the loop that extracts each word, and therefore the
 * exact meaning of s, t and wordstart, is on lines not visible in this
 * listing; presumably s addresses the character just after the word.
 * The last argument of each g_print() call is not visible either.
 */
1483 void check_for_following_punctuation(const char *aline)
1486 const char *s,*wordstart;
1489 if (pswit[TYPO_SWITCH])
1500 inword=g_utf8_strdown(t,-1);
/* Both word lists end with an empty string, which stops these loops. */
1502 for (i=0;*nocomma[i];i++)
1503 if (!strcmp(inword,nocomma[i]))
1505 c=g_utf8_get_char(s);
1506 if (c==',' || c==';' || c==':')
1508 if (pswit[ECHO_SWITCH])
1509 g_print("\n%s\n",aline);
1510 if (!pswit[OVERVIEW_SWITCH])
1511 g_print(" Line %ld column %ld - "
1512 "Query punctuation after %s?\n",
1513 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
1519 for (i=0;*noperiod[i];i++)
1520 if (!strcmp(inword,noperiod[i]))
1522 c=g_utf8_get_char(s);
1523 if (c=='.' || c=='!')
1525 if (pswit[ECHO_SWITCH])
1526 g_print("\n%s\n",aline);
1527 if (!pswit[OVERVIEW_SWITCH])
1528 g_print(" Line %ld column %ld - "
1529 "Query punctuation after %s?\n",
1530 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
1544 * Check for commonly mistyped words,
1545 * and digits like 0 for O in a word.
/*
 * check_for_typos:
 *
 * Run each word returned by getaword() through a series of tests for
 * likely typos and OCR errors, and print a query for every suspect.
 *
 * Visible stages, in order:
 *  - a word for which mixdigit() is true is queried (Query digit in);
 *  - when TYPO_SWITCH or USERTYPO_SWITCH is set, the word is scanned
 *    for an upper-case or title-case letter that follows a lower-case
 *    one; an m followed by c, or by a and c, near the start of the word
 *    and a preceding apostrophe are recognised as exceptions, and a
 *    case-folded copy (testword) is made;
 *  - under TYPO_SWITCH the copy is tested against the nostart prefixes,
 *    the noend suffixes and a set of unlikely letter groups (cb, gbt,
 *    pbt, tbs, mrn, ahle, ihle, tbi, tbe, ii); words longer than one
 *    character are checked for having no vowel or no consonant, judged
 *    on the first code point of each canonical decomposition; the
 *    okword list, isroman() and the typo list are consulted; and a
 *    one-character word is compared with the letters s l m i j d n;
 *  - a suspect word is looked up in qword, where a per-word counter is
 *    kept, so that repeats are not reported again unless VERBOSE_SWITCH
 *    is set;
 *  - a word found in the usertypo tree is queried as a possible scanno;
 *  - under PARANOID_SWITCH with warnings->digit set, a standalone 0 or
 *    1 is queried.
 *
 * aline:    the line to examine.
 * warnings: first-pass results; only digit is read in the lines
 *           visible here.
 *
 * NOTE(review): the assignments that set or clear the typo flag after
 * each test, and the release of inword and testword, are on lines not
 * visible in this listing.
 * NOTE(review): the scanno and standalone-digit reports add 2 to the
 * character offset of wordstart, whereas the digit-in-word and
 * query-word reports add 1. Confirm which column is intended.
 */
1547 void check_for_typos(const char *aline,struct warnings *warnings)
1549 const char *s,*t,*nt,*wordstart;
1551 gunichar *decomposition;
1553 int i,vowel,consonant,*dupcnt;
1554 gboolean isdup,istypo,alower;
1557 gsize decomposition_len;
1561 inword=getaword(&s);
1565 continue; /* don't bother with empty lines */
1567 if (mixdigit(inword))
1569 if (pswit[ECHO_SWITCH])
1570 g_print("\n%s\n",aline);
1571 if (!pswit[OVERVIEW_SWITCH])
1572 g_print(" Line %ld column %ld - Query digit in %s\n",
1573 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
1578 * Put the word through a series of tests for likely typos and OCR
1581 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
1585 for (t=inword;*t;t=g_utf8_next_char(t))
1587 c=g_utf8_get_char(t);
1588 nt=g_utf8_next_char(t);
1589 /* lowercase for testing */
1590 if (g_unichar_islower(c))
1592 if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
1595 * We have an uppercase mid-word. However, there are
1597 * Mac and Mc like McGill
1598 * French contractions like l'Abbe
1600 offset=g_utf8_pointer_to_offset(inword,t);
1602 pc=g_utf8_get_char(g_utf8_prev_char(t));
1605 if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
1606 offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
1607 g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
1608 CHAR_IS_APOSTROPHE(pc))
1614 testword=g_utf8_casefold(inword,-1);
1616 if (pswit[TYPO_SWITCH])
1619 * Check for certain unlikely two-letter combinations at word
1622 len=g_utf8_strlen(testword,-1);
1625 for (i=0;*nostart[i];i++)
1626 if (g_str_has_prefix(testword,nostart[i]))
1628 for (i=0;*noend[i];i++)
1629 if (g_str_has_suffix(testword,noend[i]))
1632 /* ght is common, gbt never. Like that. */
1633 if (strstr(testword,"cb"))
1635 if (strstr(testword,"gbt"))
1637 if (strstr(testword,"pbt"))
1639 if (strstr(testword,"tbs"))
1641 if (strstr(testword,"mrn"))
1643 if (strstr(testword,"ahle"))
1645 if (strstr(testword,"ihle"))
1648 * "TBE" does happen - like HEARTBEAT - but uncommon.
1649 * Also "TBI" - frostbite, outbid - but uncommon.
1650 * Similarly "ii" like Hawaii, or Pompeii, and in Roman
1651 * numerals, but "ii" is a common scanno.
1653 if (strstr(testword,"tbi"))
1655 if (strstr(testword,"tbe"))
1657 if (strstr(testword,"ii"))
1660 * Check for no vowels or no consonants.
1661 * If none, flag a typo.
1663 if (!istypo && len>1)
1666 for (t=testword;*t;t=g_utf8_next_char(t))
1668 c=g_utf8_get_char(t);
1670 g_unicode_canonical_decomposition(c,&decomposition_len);
1671 if (c=='y' || g_unichar_isdigit(c))
1673 /* Yah, this is loose. */
1677 else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
1681 g_free(decomposition);
1683 if (!vowel || !consonant)
1687 * Now exclude the word from being reported if it's in
1690 for (i=0;*okword[i];i++)
1691 if (!strcmp(testword,okword[i]))
1694 * What looks like a typo may be a Roman numeral.
1697 if (istypo && isroman(testword))
1699 /* Check the manual list of typos. */
1701 for (i=0;*typo[i];i++)
1702 if (!strcmp(testword,typo[i]))
1705 * Check lowercase s, l, i and m - special cases.
1706 * "j" - often a semi-colon gone wrong.
1707 * "d" for a missing apostrophe - he d
1710 if (!istypo && len==1 &&
1711 g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
1715 dupcnt=g_tree_lookup(qword,testword);
1719 isdup=!pswit[VERBOSE_SWITCH];
1723 dupcnt=g_new0(int,1);
1724 g_tree_insert(qword,g_strdup(testword),dupcnt);
1729 if (pswit[ECHO_SWITCH])
1730 g_print("\n%s\n",aline);
1731 if (!pswit[OVERVIEW_SWITCH])
1733 g_print(" Line %ld column %ld - Query word %s",
1734 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
1736 if (!pswit[VERBOSE_SWITCH])
1737 g_print(" - not reporting duplicates");
1745 /* check the user's list of typos */
1746 if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
1748 if (pswit[ECHO_SWITCH])
1749 g_print("\n%s\n",aline);
1750 if (!pswit[OVERVIEW_SWITCH])
1751 g_print(" Line %ld column %ld - Query possible scanno %s\n",
1752 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
1754 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
1756 if (pswit[PARANOID_SWITCH] && warnings->digit)
1758 /* In paranoid mode, query all 0 and 1 standing alone. */
1759 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
1761 if (pswit[ECHO_SWITCH])
1762 g_print("\n%s\n",aline);
1763 if (!pswit[OVERVIEW_SWITCH])
1764 g_print(" Line %ld column %ld - Query standalone %s\n",
1765 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
1776 * check_for_misspaced_punctuation:
1778 * Look for added or missing spaces around punctuation and quotes.
1779 * If there is a punctuation character like ! with no space on
1780 * either side, suspect a missing!space. If there are spaces on
1781 * both sides , assume a typo. If we see a double quote with no
1782 * space or punctuation on either side of it, assume unspaced
1783 * quotes "like"this.
/*
 * check_for_misspaced_punctuation:
 *
 * Look for added or missing spaces around punctuation and quotes. The
 * line is walked several times, each pass keeping the previous, current
 * and next characters in pc, c and nc.
 *
 * Visible passes, in order:
 *  1. For each of . ? ! , ; : _ after the first character:
 *     - Missing space: letters on both sides, or one of ? ! , ; :
 *       followed by a letter; acronyms and ellipses are recognised as
 *       exceptions.
 *     - Spaced punctuation: a space before and a space or the end of
 *       the line after, unless isemptyline is set or the character is
 *       part of an ellipsis.
 *  2. For each of ? ! , ; : preceded by a space and not followed by
 *     one (and isemptyline clear): Spaced punctuation.
 *  3. A character preceded by a space and followed by a letter
 *     (presumably a period, as the in-line remark says): Spaced
 *     punctuation.
 *  4. Unspaced quotes: a character (presumably a double quote; the test
 *     is not visible) whose neighbours are both outside a set of
 *     spacing and punctuation characters, or which is followed by a
 *     letter and not preceded by one of a smaller opening set.
 *  5. Quote parity: parities->dquote is toggled, apparently at each
 *     double quote. When the toggle leaves it clear the following
 *     character must belong to a closing set; otherwise it must be a
 *     letter, a digit or belong to an opening set. Failures are queried
 *     as Wrongspaced quotes.
 *  6. A line that starts with a double quote followed by closing
 *     punctuation or a space: Wrongspaced quotes at column 1.
 *  7. Under SQUOTE_SWITCH the parity test is repeated for single
 *     quotes that are not between two letters, using parities->squote.
 *
 * aline:       the line to examine.
 * parities:    quote open/close state; dquote and squote are updated in
 *              place.
 * isemptyline: when set, suppresses the Spaced punctuation reports of
 *              passes 1 and 2.
 *
 * NOTE(review): the statements that shift pc and c at the top of each
 * loop, the acronym and ellipsis flag assignments and several tests on
 * c are on lines not visible in this listing.
 * NOTE(review): the opening-quote tests pass a gunichar to isdigit()
 * while the letter test beside them uses g_unichar_isalpha(). The C
 * library only defines isdigit() for unsigned char values and EOF -
 * confirm and align with g_unichar_isdigit().
 */
1785 void check_for_misspaced_punctuation(const char *aline,
1786 struct parities *parities,gboolean isemptyline)
1788 gboolean isacro,isellipsis;
1790 gunichar c,nc,pc,n2c;
1791 c=g_utf8_get_char(aline);
1792 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1793 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1797 nc=g_utf8_get_char(g_utf8_next_char(s));
1798 /* For each character in the line after the first. */
1799 if (g_utf8_strchr(".?!,;:_",-1,c)) /* if it's punctuation */
1801 /* we need to suppress warnings for acronyms like M.D. */
1803 /* we need to suppress warnings for ellipsis . . . */
1806 * If there are letters on both sides of it or
1807 * if it's strict punctuation followed by an alpha.
1809 if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
1810 g_utf8_strchr("?!,;:",-1,c)))
1814 if (g_utf8_pointer_to_offset(aline,s)>2 &&
1815 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
1817 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
1823 if (pswit[ECHO_SWITCH])
1824 g_print("\n%s\n",aline);
1825 if (!pswit[OVERVIEW_SWITCH])
1826 g_print(" Line %ld column %ld - Missing space?\n",
1827 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1832 if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
1835 * If there are spaces on both sides,
1836 * or space before and end of line.
1840 if (g_utf8_pointer_to_offset(aline,s)>2 &&
1841 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
1843 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
1847 if (!isemptyline && !isellipsis)
1849 if (pswit[ECHO_SWITCH])
1850 g_print("\n%s\n",aline);
1851 if (!pswit[OVERVIEW_SWITCH])
1852 g_print(" Line %ld column %ld - "
1853 "Spaced punctuation?\n",linecnt,
1854 g_utf8_pointer_to_offset(aline,s)+1);
1861 /* Split out the characters that CANNOT be preceded by space. */
1862 c=g_utf8_get_char(aline);
1863 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1864 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1868 nc=g_utf8_get_char(g_utf8_next_char(s));
1869 /* for each character in the line after the first */
1870 if (g_utf8_strchr("?!,;:",-1,c))
1872 /* if it's punctuation that _cannot_ have a space before it */
1873 if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
1876 * If nc DOES == space,
1877 * it was already reported just above.
1879 if (pswit[ECHO_SWITCH])
1880 g_print("\n%s\n",aline);
1881 if (!pswit[OVERVIEW_SWITCH])
1882 g_print(" Line %ld column %ld - Spaced punctuation?\n",
1883 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1890 * Special case " .X" where X is any alpha.
1891 * This plugs a hole in the acronym code above.
1892 * Inelegant, but maintainable.
1894 c=g_utf8_get_char(aline);
1895 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1896 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1900 nc=g_utf8_get_char(g_utf8_next_char(s));
1901 /* for each character in the line after the first */
1904 /* if it's a period */
1905 if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
1908 * If the period follows a space and
1909 * is followed by a letter.
1911 if (pswit[ECHO_SWITCH])
1912 g_print("\n%s\n",aline);
1913 if (!pswit[OVERVIEW_SWITCH])
1914 g_print(" Line %ld column %ld - Spaced punctuation?\n",
1915 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1921 c=g_utf8_get_char(aline);
1922 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1923 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1927 nc=g_utf8_get_char(g_utf8_next_char(s));
1928 /* for each character in the line after the first */
1931 if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
1932 !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
1933 !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
1935 if (pswit[ECHO_SWITCH])
1936 g_print("\n%s\n",aline);
1937 if (!pswit[OVERVIEW_SWITCH])
1938 g_print(" Line %ld column %ld - Unspaced quotes?\n",
1939 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1945 /* Check parity of quotes. */
1946 nc=g_utf8_get_char(aline);
1947 for (s=aline;*s;s=g_utf8_next_char(s))
1950 nc=g_utf8_get_char(g_utf8_next_char(s));
1953 parities->dquote=!parities->dquote;
1954 if (!parities->dquote)
1957 if (!g_utf8_strchr("_-.'`/,;:!?)]} ",-1,nc))
1959 if (pswit[ECHO_SWITCH])
1960 g_print("\n%s\n",aline);
1961 if (!pswit[OVERVIEW_SWITCH])
1962 g_print(" Line %ld column %ld - "
1963 "Wrongspaced quotes?\n",
1964 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1972 if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
1973 !g_utf8_strchr("_-/.'`([{$",-1,nc) || !nc)
1975 if (pswit[ECHO_SWITCH])
1976 g_print("\n%s\n",aline);
1977 if (!pswit[OVERVIEW_SWITCH])
1978 g_print(" Line %ld column %ld - "
1979 "Wrongspaced quotes?\n",
1980 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1987 if (g_utf8_get_char(aline)==CHAR_DQUOTE)
1989 if (g_utf8_strchr(",;:!?)]} ",-1,
1990 g_utf8_get_char(g_utf8_next_char(aline))))
1992 if (pswit[ECHO_SWITCH])
1993 g_print("\n%s\n",aline);
1994 if (!pswit[OVERVIEW_SWITCH])
1995 g_print(" Line %ld column 1 - Wrongspaced quotes?\n",
2001 if (pswit[SQUOTE_SWITCH])
2003 nc=g_utf8_get_char(aline);
2004 for (s=aline;*s;s=g_utf8_next_char(s))
2007 nc=g_utf8_get_char(g_utf8_next_char(s));
2008 if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&
2009 !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
2010 !g_unichar_isalpha(nc)))
2012 parities->squote=!parities->squote;
2013 if (!parities->squote)
2016 if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
2018 if (pswit[ECHO_SWITCH])
2019 g_print("\n%s\n",aline);
2020 if (!pswit[OVERVIEW_SWITCH])
2021 g_print(" Line %ld column %ld - "
2022 "Wrongspaced singlequotes?\n",
2023 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2031 if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
2032 !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
2034 if (pswit[ECHO_SWITCH])
2035 g_print("\n%s\n",aline);
2036 if (!pswit[OVERVIEW_SWITCH])
2037 g_print(" Line %ld column %ld - "
2038 "Wrongspaced singlequotes?\n",
2039 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2050 * check_for_double_punctuation:
2052 * Look for double punctuation like ,. or ,,
2053 * Thanks to DW for the suggestion!
2054 * In books with references, ".," and ".;" are common
2055 * e.g. "etc., etc.," and vol. 1.; vol 3.;
2056 * OTOH, from my initial tests, there are also fairly
2057 * common errors. What to do? Make these cases paranoid?
2058 * ".," is the most common, so warnings->dotcomma is used
2059 * to suppress detailed reporting if it occurs often.
/*
 * check_for_double_punctuation:
 *
 * Query two adjacent punctuation characters, each drawn from
 * . ? ! , ; : and report the 1-based character column of the first.
 *
 * A pair is not queried when any of these holds:
 *  - both characters are the same and are a period, question mark or
 *    exclamation mark;
 *  - the pair is a period followed by a comma and warnings->dotcomma is
 *    clear;
 *  - warnings->isFrench is set and the text at s starts with a
 *    three-period ellipsis joined to one of , ; : ! ? on either side.
 *
 * aline:    the line to examine.
 * warnings: first-pass results; dotcomma and isFrench are read here.
 *
 * NOTE(review): the French ellipsis test appears twice. The second
 * copy is followed by a fresh read of nc, so presumably it advances s
 * past the ellipsis on lines not visible in this listing - confirm
 * before merging the two tests.
 */
2061 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
2065 nc=g_utf8_get_char(aline);
2066 for (s=aline;*s;s=g_utf8_next_char(s))
2069 nc=g_utf8_get_char(g_utf8_next_char(s));
2070 /* for each punctuation character in the line */
2071 if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
2072 g_utf8_strchr(".?!,;:",-1,nc))
2074 /* followed by punctuation, it's a query, unless . . . */
2075 if (c==nc && (c=='.' || c=='?' || c=='!') ||
2076 !warnings->dotcomma && c=='.' && nc==',' ||
2077 warnings->isFrench && g_str_has_prefix(s,",...") ||
2078 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2079 warnings->isFrench && g_str_has_prefix(s,";...") ||
2080 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2081 warnings->isFrench && g_str_has_prefix(s,":...") ||
2082 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2083 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2084 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2085 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2086 warnings->isFrench && g_str_has_prefix(s,"...?"))
2088 if (warnings->isFrench && g_str_has_prefix(s,",...") ||
2089 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2090 warnings->isFrench && g_str_has_prefix(s,";...") ||
2091 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2092 warnings->isFrench && g_str_has_prefix(s,":...") ||
2093 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2094 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2095 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2096 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2097 warnings->isFrench && g_str_has_prefix(s,"...?"))
2100 nc=g_utf8_get_char(g_utf8_next_char(s));
2102 ; /* do nothing for .. !! and ?? which can be legit */
2106 if (pswit[ECHO_SWITCH])
2107 g_print("\n%s\n",aline);
2108 if (!pswit[OVERVIEW_SWITCH])
2109 g_print(" Line %ld column %ld - Double punctuation?\n",
2110 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2119 * check_for_spaced_quotes:
/*
 * check_for_spaced_quotes:
 *
 * Query every quote mark that has a space on both sides.
 *
 *  - Double quotes: each occurrence of space, double quote, space is
 *    reported as Spaced doublequote.
 *  - Single quotes: for every entry of the single_quotes table a
 *    pattern of space, that quote character, space is built in a
 *    GString and each occurrence is reported as Spaced singlequote.
 *
 * After a match the search resumes two characters past its start, and
 * the reported column is the 1-based character offset of the leading
 * space. The GString is freed before returning.
 *
 * aline: the line to examine.
 *
 * NOTE(review): the initialisation of s before each search loop and
 * the tail of the single_quotes initialiser are on lines not visible
 * in this listing.
 */
2121 void check_for_spaced_quotes(const char *aline)
2125 const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,
2129 while ((t=strstr(s," \" ")))
2131 if (pswit[ECHO_SWITCH])
2132 g_print("\n%s\n",aline);
2133 if (!pswit[OVERVIEW_SWITCH])
2134 g_print(" Line %ld column %ld - Spaced doublequote?\n",
2135 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2138 s=g_utf8_next_char(g_utf8_next_char(t));
2140 pattern=g_string_new(NULL);
2141 for(i=0;i<G_N_ELEMENTS(single_quotes);i++)
/* Rebuild the three-character search pattern for this quote character. */
2143 g_string_assign(pattern," ");
2144 g_string_append_unichar(pattern,single_quotes[i]);
2145 g_string_append_c(pattern,' ');
2147 while ((t=strstr(s,pattern->str)))
2149 if (pswit[ECHO_SWITCH])
2150 g_print("\n%s\n",aline);
2151 if (!pswit[OVERVIEW_SWITCH])
2152 g_print(" Line %ld column %ld - Spaced singlequote?\n",
2153 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2156 s=g_utf8_next_char(g_utf8_next_char(t));
2159 g_string_free(pattern,TRUE);
2163 * check_for_miscased_genative:
2165 * Check special case of 'S instead of 's at end of word.
/*
 * check_for_miscased_genative:
 *
 * Query an apostrophe that follows a lower-case letter and is itself
 * followed by a capital S, i.e. a possessive ending typed in the wrong
 * case.
 *
 * aline: the line to examine.
 *
 * NOTE(review): the statements that shift pc and c at the top of the
 * loop are not visible in this listing. Assuming s addresses c (the
 * apostrophe), the +2 in the report makes the column name the capital
 * S instead of the apostrophe - confirm.
 */
2167 void check_for_miscased_genative(const char *aline)
2173 c=g_utf8_get_char(aline);
2174 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
/* The loop ends when there is no next character, so the last character is never c. */
2175 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2179 nc=g_utf8_get_char(g_utf8_next_char(s));
2180 if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))
2182 if (pswit[ECHO_SWITCH])
2183 g_print("\n%s\n",aline);
2184 if (!pswit[OVERVIEW_SWITCH])
2185 g_print(" Line %ld column %ld - Capital \"S\"?\n",
2186 linecnt,g_utf8_pointer_to_offset(aline,s)+2);
2194 * check_end_of_line:
2196 * Now check special cases - start and end of line -
2197 * for single and double quotes. Start is sometimes [sic]
2198 * but better to query it anyway.
2199 * While we're here, check for dash at end of line.
/*
 * check_end_of_line:
 *
 * Query quotes set off by a space at either edge of the line and, in
 * paranoid mode, a single hyphen at the end of the line.
 *
 *  - Last character is a double or single quote and the one before it
 *    is a space: Spaced quote, with the character length of the line
 *    as the column.
 *  - First character is a single quote followed by a space: Spaced
 *    quote at column 1.
 *  - With PARANOID_SWITCH and warnings->hyphen both set: step back from
 *    the end over characters not greater than CHAR_SPACE, then query a
 *    hyphen that is not preceded by another hyphen.
 *
 * aline:    the line to examine.
 * warnings: first-pass results; only hyphen is read here.
 *
 * NOTE(review): the first test runs only when the line has more than
 * one character, but it is not visible in this listing whether the
 * later tests sit inside the same guard. g_utf8_prev_char(aline+lbytes)
 * on an empty line, and g_utf8_prev_char(s) once s has reached aline,
 * would read before the start of the buffer - confirm they are guarded.
 * NOTE(review): the hyphen column is g_utf8_pointer_to_offset() without
 * the +1 that most other checks in this file add - confirm.
 */
2201 void check_end_of_line(const char *aline,struct warnings *warnings)
2206 lbytes=strlen(aline);
2207 if (g_utf8_strlen(aline,lbytes)>1)
/* s is the last character of the line, c2 the one before it. */
2209 s=g_utf8_prev_char(aline+lbytes);
2210 c1=g_utf8_get_char(s);
2211 c2=g_utf8_get_char(g_utf8_prev_char(s));
2212 if ((c1==CHAR_DQUOTE || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)
2214 if (pswit[ECHO_SWITCH])
2215 g_print("\n%s\n",aline);
2216 if (!pswit[OVERVIEW_SWITCH])
2217 g_print(" Line %ld column %ld - Spaced quote?\n",linecnt,
2218 g_utf8_strlen(aline,lbytes));
/* Now the first two characters of the line. */
2222 c1=g_utf8_get_char(aline);
2223 c2=g_utf8_get_char(g_utf8_next_char(aline));
2224 if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)
2226 if (pswit[ECHO_SWITCH])
2227 g_print("\n%s\n",aline);
2228 if (!pswit[OVERVIEW_SWITCH])
2229 g_print(" Line %ld column 1 - Spaced quote?\n",linecnt);
2234 * Dash at end of line may well be legit - paranoid mode only
2235 * and don't report em-dash at line-end.
2237 if (pswit[PARANOID_SWITCH] && warnings->hyphen)
2239 for (s=g_utf8_prev_char(aline+lbytes);
2240 s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
2242 if (g_utf8_get_char(s)=='-' &&
2243 g_utf8_get_char(g_utf8_prev_char(s))!='-')
2245 if (pswit[ECHO_SWITCH])
2246 g_print("\n%s\n",aline);
2247 if (!pswit[OVERVIEW_SWITCH])
2248 g_print(" Line %ld column %ld - "
2249 "Hyphen at end of line?\n",
2250 linecnt,g_utf8_pointer_to_offset(aline,s));
2257 * check_for_unspaced_bracket:
2259 * Brackets are often unspaced, but shouldn't be surrounded by alpha.
2260 * If so, suspect a scanno like "a]most".
/*
 * check_for_unspaced_bracket:
 *
 * Query any bracket character from the set { [ ( ) ] } that has a
 * letter on both sides. The first and last characters of the line are
 * never treated as the bracket.
 *
 * aline: the line to examine.
 *
 * NOTE(review): the statements that shift pc and c at the top of the
 * loop are not visible in this listing.
 * NOTE(review): the column is g_utf8_pointer_to_offset() without the +1
 * that most other checks in this file add - confirm which character
 * the column is meant to name.
 */
2262 void check_for_unspaced_bracket(const char *aline)
2266 c=g_utf8_get_char(aline);
2267 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2268 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2272 nc=g_utf8_get_char(g_utf8_next_char(s));
2275 /* for each bracket character in the line except 1st & last */
2276 if (g_utf8_strchr("{[()]}",-1,c) &&
2277 g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
2279 if (pswit[ECHO_SWITCH])
2280 g_print("\n%s\n",aline);
2281 if (!pswit[OVERVIEW_SWITCH])
2282 g_print(" Line %ld column %ld - Unspaced bracket?\n",
2283 linecnt,g_utf8_pointer_to_offset(aline,s));
2291 * check_for_unpunctuated_endquote:
/*
 * check_for_unpunctuated_endquote:
 *
 * Query a double quote that directly follows a letter, i.e. a closing
 * quote with no punctuation before it. The first character of the line
 * is never treated as the quote.
 *
 * aline: the line to examine.
 *
 * NOTE(review): the statements that shift pc and c at the top of the
 * loop are not visible in this listing.
 * NOTE(review): pc is a gunichar but is tested with isalpha(), whereas
 * the other checks in this file use g_unichar_isalpha(). The C library
 * only defines isalpha() for unsigned char values and EOF, so this
 * looks like an oversight - confirm and align.
 * NOTE(review): the column is g_utf8_pointer_to_offset() without the +1
 * that most other checks add - confirm.
 */
2293 void check_for_unpunctuated_endquote(const char *aline)
2297 c=g_utf8_get_char(aline);
2298 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2299 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2303 nc=g_utf8_get_char(g_utf8_next_char(s));
2304 /* for each character in the line except 1st */
2305 if (c==CHAR_DQUOTE && isalpha(pc))
2307 if (pswit[ECHO_SWITCH])
2308 g_print("\n%s\n",aline);
2309 if (!pswit[OVERVIEW_SWITCH])
2310 g_print(" Line %ld column %ld - "
2311 "endquote missing punctuation?\n",
2312 linecnt,g_utf8_pointer_to_offset(aline,s));
2320 * check_for_html_tag:
2322 * Check for <HTML TAG>.
2324 * If there is a < in the line, followed at some point
2325 * by a > then we suspect HTML.
/*
 * check_for_html_tag:
 *
 * Query a suspected HTML tag: the first opening angle bracket in the
 * line together with the first closing angle bracket after it. The text
 * from one bracket to the other, inclusive, is copied with g_strndup()
 * and printed with the 1-based character column of the opening bracket.
 *
 * aline: the line to examine.
 *
 * NOTE(review): the tests that open and close are non-NULL and the
 * release of tag are on lines not visible in this listing.
 */
2327 void check_for_html_tag(const char *aline)
2329 const char *open,*close;
2331 open=strchr(aline,'<');
2334 close=strchr(g_utf8_next_char(open),'>');
2337 if (pswit[ECHO_SWITCH])
2338 g_print("\n%s\n",aline);
2339 if (!pswit[OVERVIEW_SWITCH])
/* close-open+1 bytes: both brackets and everything between them. */
2341 tag=g_strndup(open,close-open+1);
2342 g_print(" Line %ld column %ld - HTML Tag? %s \n",
2343 linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
2353 * check_for_html_entity:
2355 * Check for &symbol; HTML.
2357 * If there is a & in the line, followed at
2358 * some point by a ; then we suspect HTML.
/*
 * check_for_html_entity:
 *
 * Query a suspected HTML entity: the first ampersand in the line
 * together with the first semicolon after it. The span between them is
 * scanned for a space, which marks ordinary prose such as a company
 * name, not an entity. The text from ampersand to semicolon,
 * inclusive, is copied with g_strndup() and printed.
 *
 * aline: the line to examine.
 *
 * NOTE(review): the tests on amp and scolon, the test made after the
 * space scan and the release of entity are on lines not visible in
 * this listing.
 * NOTE(review): the column is computed as the byte distance amp-aline
 * and printed with %d, whereas check_for_html_tag() uses
 * g_utf8_pointer_to_offset(). The two differ whenever a multi-byte
 * character precedes the ampersand - confirm which is intended.
 */
2360 void check_for_html_entity(const char *aline)
2362 const char *s,*amp,*scolon;
2364 amp=strchr(aline,'&');
2367 scolon=strchr(amp,';');
2370 for (s=amp;s<scolon;s=g_utf8_next_char(s))
2371 if (g_utf8_get_char(s)==CHAR_SPACE)
2372 break; /* Don't report "Jones & Son;" */
2375 if (pswit[ECHO_SWITCH])
2376 g_print("\n%s\n",aline);
2377 if (!pswit[OVERVIEW_SWITCH])
2379 entity=g_strndup(amp,scolon-amp+1);
2380 g_print(" Line %ld column %d - HTML symbol? %s \n",
2381 linecnt,(int)(amp-aline)+1,entity);
2392 * check_for_omitted_punctuation:
2394 * Check for omitted punctuation at end of paragraph by working back
2395 * through prevline. DW.
2396 * Need to check this only for "normal" paras.
2397 * So what is a "normal" para?
2398 * Not normal if one-liner (chapter headings, etc.)
2399 * Not normal if doesn't contain at least one locase letter
2400 * Not normal if starts with space
/*
 * check_for_omitted_punctuation:
 *
 * Query a paragraph whose last line ends without punctuation. The line
 * is only examined when it contains at least one letter, last->blen is
 * greater than 2, the paragraph started before line linecnt-1 and the
 * line does not begin with a space or control character.
 *
 * The scan starts at the end of the line, first stepping back over
 * characters whose QUOTE_CLASS() is closing or neutral, then walking
 * back towards the start: reaching a letter first produces the query
 * (line linecnt-1, column equal to the character length of the line),
 * while reaching one of - . : ! ( [ { ? } ] ) first is the other
 * outcome tested.
 *
 * prevline:        the last line of the paragraph just ended. This
 *                  parameter has the same name as the file-level
 *                  prevline that check_for_short_line() reads.
 * last:            properties recorded for earlier lines; only blen is
 *                  read here.
 * start_para_line: line number on which the paragraph began.
 *
 * NOTE(review): the statements that end each loop (presumably break)
 * and the assignment that sets closing_quote are on lines not visible
 * in this listing.
 */
2402 void check_for_omitted_punctuation(const char *prevline,
2403 struct line_properties *last,int start_para_line)
2405 gboolean letter_on_line=FALSE;
2408 gboolean closing_quote;
2409 for (s=prevline;*s;s=g_utf8_next_char(s))
2410 if (g_unichar_isalpha(g_utf8_get_char(s)))
2412 letter_on_line=TRUE;
2416 * This next "if" is a problem.
2417 * If we say "start_para_line <= linecnt - 1", that includes
2418 * one-line "paragraphs" like chapter heads. Lotsa false positives.
2419 * If we say "start_para_line < linecnt - 1" it doesn't, but then it
2420 * misses genuine one-line paragraphs.
2422 if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
2423 g_utf8_get_char(prevline)>CHAR_SPACE)
2425 s=prevline+strlen(prevline);
2428 s=g_utf8_prev_char(s);
2429 c=g_utf8_get_char(s);
2430 if (QUOTE_CLASS(c)==CLOSING_QUOTE || QUOTE_CLASS(c)==NEUTRAL_QUOTE)
2433 closing_quote=FALSE;
2434 } while (closing_quote && s>prevline);
2435 for (;s>prevline;s=g_utf8_prev_char(s))
2437 if (g_unichar_isalpha(g_utf8_get_char(s)))
2439 if (pswit[ECHO_SWITCH])
2440 g_print("\n%s\n",prevline);
2441 if (!pswit[OVERVIEW_SWITCH])
2442 g_print(" Line %ld column %ld - "
2443 "No punctuation at para end?\n",
2444 linecnt-1,g_utf8_strlen(prevline,-1));
2449 if (g_utf8_strchr("-.:!([{?}])",-1,g_utf8_get_char(s)))
/*
 * report_duplicate_queries:
 *
 * Print a note saying how many times a queried word was duplicated.
 * The signature matches a GTraverseFunc, and procfile() passes this
 * function to g_tree_foreach() over the qword tree.
 *
 * key:   the queried word, used as a C string.
 * value: presumably the int counter stored beside the word by
 *        check_for_typos() - TODO confirm, the line that reads it is
 *        not visible in this listing.
 * data:  not used in the lines visible here.
 *
 * NOTE(review): the arguments of the g_print() call and the return
 * statement are on lines not visible in this listing.
 */
2455 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
2457 const char *word=key;
2460 g_print("\nNote: Queried word %s was duplicated %d times\n",
/*
 * print_as_windows_1252:
 *
 * Convert a UTF-8 string to WINDOWS-1252 and write it to stdout.
 *
 * The iconv converter is kept in a function-level static and opened on
 * first use. One branch closes the converter and resets it to the
 * unopened value; its condition is not visible in this listing
 * (presumably a NULL string is the request to shut down - confirm).
 *
 * For a conversion the output buffer is allocated with the byte length
 * of the input plus one. The final fputs(string,stdout) presumably
 * serves as the fallback when the converter could not be opened.
 *
 * string: the UTF-8 text to print.
 *
 * NOTE(review): the result of g_iconv() is not checked in the lines
 * visible here, so a character that cannot be represented would stop
 * the conversion without any report - confirm whether that is
 * acceptable.
 */
2465 void print_as_windows_1252(const char *string)
2467 gsize inbytes,outbytes;
2469 static GIConv converter=(GIConv)-1;
2472 if (converter!=(GIConv)-1)
2473 g_iconv_close(converter);
2474 converter=(GIConv)-1;
/* Open the converter lazily; (GIConv)-1 marks it as not open. */
2477 if (converter==(GIConv)-1)
2478 converter=g_iconv_open("WINDOWS-1252","UTF-8");
2479 if (converter!=(GIConv)-1)
2481 inbytes=outbytes=strlen(string);
2482 bp=buf=g_malloc(outbytes+1);
2483 g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
2489 fputs(string,stdout);
/*
 * print_as_utf_8:
 *
 * Write string to standard output exactly as given. No conversion is
 * applied and no newline is appended.
 */
void print_as_utf_8(const char *string)
{
    FILE *out=stdout;

    fputs(string,out);
}
2502 void procfile(const char *filename)
2505 gchar *parastart=NULL; /* first line of current para */
2506 gchar *etext,*aline;
2509 struct first_pass_results *first_pass_results;
2510 struct warnings *warnings;
2511 struct counters counters={0};
2512 struct line_properties last={0};
2513 struct parities parities={0};
2514 struct pending pending={0};
2515 gboolean isemptyline;
2516 long start_para_line=0;
2517 gboolean isnewpara=FALSE,enddash=FALSE;
2518 last.start=CHAR_SPACE;
2519 linecnt=checked_linecnt=0;
2520 etext=read_etext(filename,&err);
2523 if (pswit[STDOUT_SWITCH])
2524 fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
2526 fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
2529 g_print("\n\nFile: %s\n\n",filename);
2530 first_pass_results=first_pass(etext);
2531 warnings=report_first_pass(first_pass_results);
2532 qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
2533 qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
2535 * Here we go with the main pass. Hold onto yer hat!
2539 while ((aline=flgets(&etext_ptr,linecnt+1)))
2544 if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
2545 continue; // skip DP page separators completely
2546 if (linecnt<first_pass_results->firstline ||
2547 (first_pass_results->footerline>0 &&
2548 linecnt>first_pass_results->footerline))
2550 if (pswit[HEADER_SWITCH])
2552 if (g_str_has_prefix(aline,"Title:"))
2553 g_print(" %s\n",aline);
2554 if (g_str_has_prefix(aline,"Author:"))
2555 g_print(" %s\n",aline);
2556 if (g_str_has_prefix(aline,"Release Date:"))
2557 g_print(" %s\n",aline);
2558 if (g_str_has_prefix(aline,"Edition:"))
2559 g_print(" %s\n\n",aline);
2561 continue; /* skip through the header */
2564 print_pending(aline,parastart,&pending);
2565 isemptyline=analyse_quotes(aline,linecnt,&counters);
2566 if (isnewpara && !isemptyline)
2568 /* This line is the start of a new paragraph. */
2569 start_para_line=linecnt;
2570 /* Capture its first line in case we want to report it later. */
2572 parastart=g_strdup(aline);
2573 memset(&parities,0,sizeof(parities)); /* restart the quote count */
2575 while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
2576 !g_unichar_isdigit(g_utf8_get_char(s)))
2577 s=g_utf8_next_char(s);
2578 if (g_unichar_islower(g_utf8_get_char(s)))
2580 /* and its first letter is lowercase */
2581 if (pswit[ECHO_SWITCH])
2582 g_print("\n%s\n",aline);
2583 if (!pswit[OVERVIEW_SWITCH])
2584 g_print(" Line %ld column %ld - "
2585 "Paragraph starts with lower-case\n",
2586 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2590 isnewpara=FALSE; /* Signal the end of new para processing. */
2592 /* Check for an em-dash broken at line end. */
2593 if (enddash && g_utf8_get_char(aline)=='-')
2595 if (pswit[ECHO_SWITCH])
2596 g_print("\n%s\n",aline);
2597 if (!pswit[OVERVIEW_SWITCH])
2598 g_print(" Line %ld column 1 - Broken em-dash?\n",linecnt);
2603 for (s=g_utf8_prev_char(aline+strlen(aline));
2604 g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
2606 if (s>=aline && g_utf8_get_char(s)=='-')
2608 check_for_control_characters(aline);
2610 check_for_odd_characters(aline,warnings,isemptyline);
2611 if (warnings->longline)
2612 check_for_long_line(aline);
2613 if (warnings->shortline)
2614 check_for_short_line(aline,&last);
2616 last.len=g_utf8_strlen(aline,-1);
2617 last.start=g_utf8_get_char(aline);
2618 check_for_starting_punctuation(aline);
2621 check_for_spaced_emdash(aline);
2622 check_for_spaced_dash(aline);
2624 check_for_unmarked_paragraphs(aline);
2625 check_for_jeebies(aline);
2626 check_for_mta_from(aline);
2627 check_for_orphan_character(aline);
2628 check_for_pling_scanno(aline);
2629 check_for_extra_period(aline,warnings);
2630 check_for_following_punctuation(aline);
2631 check_for_typos(aline,warnings);
2632 check_for_misspaced_punctuation(aline,&parities,isemptyline);
2633 check_for_double_punctuation(aline,warnings);
2634 check_for_spaced_quotes(aline);
2635 check_for_miscased_genative(aline);
2636 check_end_of_line(aline,warnings);
2637 check_for_unspaced_bracket(aline);
2638 if (warnings->endquote)
2639 check_for_unpunctuated_endquote(aline);
2640 check_for_html_tag(aline);
2641 check_for_html_entity(aline);
2644 check_for_mismatched_quotes(&counters,&pending);
2645 counters_reset(&counters);
2646 /* let the next iteration know that it's starting a new para */
2649 check_for_omitted_punctuation(prevline,&last,start_para_line);
2652 prevline=g_strdup(aline);
2655 check_for_mismatched_quotes(&counters,&pending);
2656 print_pending(NULL,parastart,&pending);
2657 reset_pending(&pending);
2666 if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])
2667 g_tree_foreach(qword,report_duplicate_queries,NULL);
2668 g_tree_unref(qword);
2669 g_tree_unref(qperiod);
2670 counters_destroy(&counters);
2671 g_set_print_handler(NULL);
2672 print_as_windows_1252(NULL);
2673 if (pswit[MARKUP_SWITCH])
2680 * Get one line from the input text, checking for
2681 * the existence of exactly one CR/LF line-end per line.
2683 * Returns: a pointer to the line.
2685 char *flgets(char **etext,long lcnt)
/* etext is the caller's read cursor; lcnt is the line number quoted in the diagnostics below. */
2688 gboolean isCR=FALSE;
/* The line begins wherever the caller's cursor currently points. */
2689 char *theline=*etext;
/* Decode the code point under the cursor, then step the caller's cursor past it. */
2694 c=g_utf8_get_char(*etext);
2695 *etext=g_utf8_next_char(*etext);
2698 /* either way, it's end of line */
2705 /* Error - a LF without a preceding CR */
/* Line-ending diagnostics are reported only when LINE_END_SWITCH is set. */
2706 if (pswit[LINE_END_SWITCH])
/* With ECHO_SWITCH, first print a copy of the text between theline and eos. */
2708 if (pswit[ECHO_SWITCH])
2710 s=g_strndup(theline,eos-theline);
2711 g_print("\n%s\n",s);
/* The per-line message is suppressed when OVERVIEW_SWITCH is set. */
2714 if (!pswit[OVERVIEW_SWITCH])
2715 g_print(" Line %ld - No CR?\n",lcnt);
2726 /* Error - two successive CRs */
2727 if (pswit[LINE_END_SWITCH])
2729 if (pswit[ECHO_SWITCH])
2731 s=g_strndup(theline,eos-theline);
2732 g_print("\n%s\n",s);
2735 if (!pswit[OVERVIEW_SWITCH])
2736 g_print(" Line %ld - Two successive CRs?\n",lcnt);
/* A pending CR (isCR still set) combined with this test reports a CR that was not followed by LF. */
2745 if (pswit[LINE_END_SWITCH] && isCR)
2747 if (pswit[ECHO_SWITCH])
2749 s=g_strndup(theline,eos-theline);
2750 g_print("\n%s\n",s);
2753 if (!pswit[OVERVIEW_SWITCH])
/* The column is the 1-based character offset of eos within the line. */
2754 g_print(" Line %ld column %ld - CR without LF?\n",
2755 lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
/* Move eos forward by one UTF-8 character. */
2761 eos=g_utf8_next_char(eos);
/*
 * Optional in-place clean-up of the line: HTML when MARKUP_SWITCH is set,
 * DP markup when DP_SWITCH is set.
 */
2765 if (pswit[MARKUP_SWITCH])
2766 postprocess_for_HTML(theline);
2767 if (pswit[DP_SWITCH])
2768 postprocess_for_DP(theline);
2775 * Takes a "word" as a parameter, and checks whether it
2776 * contains a mixture of alpha and digits. Generally, this is an
2777 * error, but may not be for cases like 4th or L5 12s. 3d.
2779 * Returns: TRUE iff an error is found.
2781 gboolean mixdigit(const char *checkword)
2783 gboolean wehaveadigit,wehavealetter,query;
2784 const char *s,*nondigit;
2785 wehaveadigit=wehavealetter=query=FALSE;
/* Walk the word one UTF-8 character at a time, classifying each as letter or digit. */
2786 for (s=checkword;*s;s=g_utf8_next_char(s))
2787 if (g_unichar_isalpha(g_utf8_get_char(s)))
2789 else if (g_unichar_isdigit(g_utf8_get_char(s)))
/* Test for the mixed case: at least one digit and at least one letter. */
2791 if (wehaveadigit && wehavealetter)
2793 /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
/* Skip the leading run of digits; nondigit ends up at the first non-digit character. */
2795 for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
2796 nondigit=g_utf8_next_char(nondigit))
2798 /* digits, ending in st, rd, nd, th of either case */
/* g_ascii_strcasecmp compares the whole remaining suffix, ignoring ASCII case. */
2799 if (!g_ascii_strcasecmp(nondigit,"st") ||
2800 !g_ascii_strcasecmp(nondigit,"rd") ||
2801 !g_ascii_strcasecmp(nondigit,"nd") ||
2802 !g_ascii_strcasecmp(nondigit,"th"))
/* The same ordinal suffixes followed by s. */
2804 if (!g_ascii_strcasecmp(nondigit,"sts") ||
2805 !g_ascii_strcasecmp(nondigit,"rds") ||
2806 !g_ascii_strcasecmp(nondigit,"nds") ||
2807 !g_ascii_strcasecmp(nondigit,"ths"))
/* The same ordinal suffixes followed by ly. */
2809 if (!g_ascii_strcasecmp(nondigit,"stly") ||
2810 !g_ascii_strcasecmp(nondigit,"rdly") ||
2811 !g_ascii_strcasecmp(nondigit,"ndly") ||
2812 !g_ascii_strcasecmp(nondigit,"thly"))
2814 /* digits, ending in l, L, s or d */
/* Only l is matched in either case; s and d go through strcmp and so must be lower case. */
2815 if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
2816 !strcmp(nondigit,"d"))
2819 * L at the start of a number, representing Britsh pounds, like L500.
2820 * This is cute. We know the current word is mixed digit. If the first
2821 * letter is L, there must be at least one digit following. If both
2822 * digits and letters follow, we have a genuine error, else we have a
2823 * capital L followed by digits, and we accept that as a non-error.
/* Re-run this same check on the text that follows a leading capital L. */
2825 if (g_utf8_get_char(checkword)=='L' &&
2826 !mixdigit(g_utf8_next_char(checkword)))
2835 * Extracts the first/next "word" from the line, and returns it.
2836 * A word is defined as one English word unit--or at least that's the aim.
2837 * "ptr" is advanced to the position in the line where we will start
2838 * looking for the next word.
2840 * Returns: A newly-allocated string.
2842 gchar *getaword(const char **ptr)
/* The word is accumulated in a growable GString. */
2847 word=g_string_new(NULL);
/*
 * Advance *ptr past anything that is neither a digit nor a letter,
 * stopping at the terminating NUL.
 */
2848 for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
2849 !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
2850 **ptr;*ptr=g_utf8_next_char(*ptr))
2853 * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
2854 * Especially yucky is the case of L1,000
2855 * This section looks for a pattern of characters including a digit
2856 * followed by a comma or period followed by one or more digits.
2857 * If found, it returns this whole pattern as a word; otherwise we discard
2858 * the results and resume our normal programming.
/* Look-ahead with the local cursor s: gather a run of digits, letters, commas and periods. */
2861 for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
2862 g_unichar_isalpha(g_utf8_get_char(s)) ||
2863 g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
2864 g_string_append_unichar(word,g_utf8_get_char(s));
/* From the second character on, look for a period or comma whose preceding character is a digit. */
2867 for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))
2869 c=g_utf8_get_char(t);
2870 pc=g_utf8_get_char(g_utf8_prev_char(t));
2871 if ((c=='.' || c==',') && g_unichar_isdigit(pc))
/* Passing FALSE frees only the GString wrapper and returns its character data. */
2874 return g_string_free(word,FALSE);
2878 /* we didn't find a punctuated number - do the regular getword thing */
/* Empty the buffer and rebuild the word starting from *ptr. */
2879 g_string_truncate(word,0);
2880 c=g_utf8_get_char(*ptr);
/* Take digits, letters and apostrophes, advancing *ptr as each one is consumed. */
2881 for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);
2882 *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))
2883 g_string_append_unichar(word,c);
2884 return g_string_free(word,FALSE);
2890 * Is this word a Roman Numeral?
2892 * It doesn't actually validate that the number is a valid Roman Numeral--for
2893 * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
2894 * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
2895 * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
2896 * expressions thereof, except when it came to taxes. Allow any number of M,
2897 * an optional D, an optional CM or CD, any number of optional Cs, an optional
2898 * XL or an optional XC, an optional IX or IV, an optional V and any number
2901 gboolean isroman(const char *t)
/*
 * The tests below are applied in order to the text at t. Every literal is
 * lower case. NOTE(review): presumably callers pass lower-cased text - confirm.
 * In each while condition the trailing test of the current byte is redundant,
 * since a NUL never equals the letter being matched; it is harmless.
 */
/* Any number of m. */
2907 while (g_utf8_get_char(t)=='m' && *t)
/* A single d. */
2909 if (g_utf8_get_char(t)=='d')
/* The two-letter pairs cm and cd. */
2911 if (g_str_has_prefix(t,"cm"))
2913 if (g_str_has_prefix(t,"cd"))
/* Any number of c. */
2915 while (g_utf8_get_char(t)=='c' && *t)
/* The two-letter pairs xl and xc. */
2917 if (g_str_has_prefix(t,"xl"))
2919 if (g_str_has_prefix(t,"xc"))
/* A single l. */
2921 if (g_utf8_get_char(t)=='l')
/* Any number of x. */
2923 while (g_utf8_get_char(t)=='x' && *t)
/* The two-letter pairs ix and iv. */
2925 if (g_str_has_prefix(t,"ix"))
2927 if (g_str_has_prefix(t,"iv"))
/* A single v. */
2929 if (g_utf8_get_char(t)=='v')
/* Any number of i. */
2931 while (g_utf8_get_char(t)=='i' && *t)
2937 * postprocess_for_DP:
2939 * Invoked with the -d switch from flgets().
2940 * It simply "removes" from the line a hard-coded set of common
2941 * DP-specific tags, so that the line passed to the main routine has
2942 * been pre-cleaned of DP markup.
2944 void postprocess_for_DP(char *theline)
/* DPmarkup is scanned up to its first empty string, which acts as the end marker. */
2950 for (i=0;*DPmarkup[i];i++)
/* Repeat until the current tag no longer occurs anywhere in the line. */
2951 while ((s=strstr(theline,DPmarkup[i])))
/*
 * t is the text just after the tag; shift it, together with its NUL, down
 * over the tag. memmove is used because the two ranges overlap.
 */
2953 t=s+strlen(DPmarkup[i]);
2954 memmove(s,t,strlen(t)+1);
2959 * postprocess_for_HTML:
2961 * Invoked with the -m switch from flgets().
2962 * It simply "removes" from the line a hard-coded set of common
2963 * HTML tags and "replaces" a hard-coded set of common HTML
2964 * entities, so that the line passed to the main routine has
2965 * been pre-cleaned of HTML.
2967 void postprocess_for_HTML(char *theline)
/* Keep calling losemarkup for as long as it returns a non-NULL pointer. */
2969 while (losemarkup(theline))
/* loseentities deals with the HTML entities in the line. */
2971 loseentities(theline);
2974 char *losemarkup(char *theline)
/* s is the first < in the line; t is the first > at or after it, or NULL when there is no <. */
2978 s=strchr(theline,'<');
2979 t=s?strchr(s,'>'):NULL;
/* Compare the text after the < with each entry of markup, up to its first empty string. */
2982 for (i=0;*markup[i];i++)
2983 if (tagcomp(g_utf8_next_char(s),markup[i]))
/* Step t past the > and shift the rest of the line, with its NUL, down over the tag. */
2985 t=g_utf8_next_char(t);
2986 memmove(s,t,strlen(t)+1);
2989 /* It's an unrecognized <xxx>. */
2993 void loseentities(char *theline)
/*
 * NOTE(review): entities is a plain local while the two converters on the
 * next line are static, so the tree does not persist between calls. It looks
 * as if static was intended here (otherwise the tree is rebuilt on each call
 * and the destroy call below never sees an earlier tree) - confirm.
 */
3000 GTree *entities=NULL;
/* The converters are kept across calls; (GIConv)-1 marks a converter that is not open. */
3001 static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
/* Release path: destroy the tree and close any open converter, marking it not open again. */
3005 g_tree_destroy(entities);
3007 if (translit!=(GIConv)-1)
3008 g_iconv_close(translit);
3009 translit=(GIConv)-1;
3010 if (to_utf8!=(GIConv)-1)
3011 g_iconv_close(to_utf8);
/* Build the lookup tree from the HTMLentities table: entity name to code point. */
3019 entities=g_tree_new((GCompareFunc)strcmp);
3020 for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
3021 g_tree_insert(entities,HTMLentities[i].name,
3022 GUINT_TO_POINTER(HTMLentities[i].c));
/*
 * Open each converter on first use: translit goes from UTF-8 to ISO_8859-1
 * with transliteration, to_utf8 goes from ISO_8859-1 back to UTF-8.
 */
3024 if (translit==(GIConv)-1)
3025 translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
3026 if (to_utf8==(GIConv)-1)
3027 to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
/* Handle each & found at or after theline; scolon is the first ; at or after it. */
3028 while((amp=strchr(theline,'&')))
3030 scolon=strchr(amp,';');
/*
 * Numeric forms: when everything from amp+2 up to the semicolon is decimal
 * digits it is parsed as base 10; when amp[2] is x and the rest is hex
 * digits it is parsed as base 16.
 */
3035 if (amp+2+strspn(amp+2,"0123456789")==scolon)
3036 c=strtol(amp+2,NULL,10);
3037 else if (amp[2]=='x' &&
3038 amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
3039 c=strtol(amp+3,NULL,16);
/* Named form: look up the text between & and ; in the tree; a miss yields 0. */
3043 s=g_strndup(amp+1,scolon-(amp+1));
3044 c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
/* Values below 128 or from 192 to 255 are written straight into the line as UTF-8. */
3053 if (c<128 || c>=192 && c<=255) /* An ISO-8859-1 character */
3054 theline+=g_unichar_to_utf8(c,theline);
/*
 * Encode c as UTF-8, transliterate it into ISO_8859-1, convert that back
 * to UTF-8 and copy the nb resulting bytes into the line.
 */
3058 nb=g_unichar_to_utf8(c,s);
3059 t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
3061 s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
3063 memcpy(theline,s,nb);
/* Close the gap: move the text after the semicolon, with its NUL, up to theline. */
3067 memmove(theline,g_utf8_next_char(scolon),
3068 strlen(g_utf8_next_char(scolon))+1);
/* Resume the search just past this &. */
3071 theline=g_utf8_next_char(amp);
3075 gboolean tagcomp(const char *strin,const char *basetag)
/* A leading slash is skipped, so a closing tag is compared like its opening tag. */
3079 if (g_utf8_get_char(strin)=='/')
3080 t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
3082 t=g_utf8_casefold(strin,-1);
/* Both sides are case-folded, so the prefix test below ignores case. */
3083 s=g_utf8_casefold(basetag,-1);
/* TRUE when the folded tag text starts with the folded basetag. */
3084 retval=g_str_has_prefix(t,s);
3090 void proghelp(GOptionContext *context)
3093 fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
3094 fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
3095 fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
3096 fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
3097 "For details, read the file COPYING.\n",stderr);
3098 fputs("This is Free Software; "
3099 "you may redistribute it under certain conditions (GPL);\n",stderr);
3100 fputs("read the file COPYING for details.\n\n",stderr);
3101 help=g_option_context_get_help(context,TRUE,NULL);
3104 fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
3105 fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
3106 "non-ASCII\n",stderr);
3107 fputs("characters like accented letters, "
3108 "lines longer than 75 or shorter than 55,\n",stderr);
3109 fputs("unbalanced quotes or brackets, "
3110 "a variety of badly formatted punctuation, \n",stderr);
3111 fputs("HTML tags, some likely typos. "
3112 "It is NOT a substitute for human judgement.\n",stderr);