Fix bug #29: analyse_quotes() shadows the linecnt global variable for no good reason
1 /*************************************************************************/
2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com> */
5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk> */
7 /* This program is free software; you can redistribute it and/or modify */
8 /* it under the terms of the GNU General Public License as published by */
9 /* the Free Software Foundation; either version 2 of the License, or */
10 /* (at your option) any later version. */
12 /* This program is distributed in the hope that it will be useful, */
13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
15 /* GNU General Public License for more details. */
17 /* You should have received a copy of the GNU General Public License */
18 /* along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 /*************************************************************************/
30 #include "bookloupe.h"
33 #include "HTMLentities.h"
39 "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
40 "nad", "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa",
41 "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
42 "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
43 "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
44 "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
45 "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
46 "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
47 "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
48 "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
49 "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
50 "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
51 "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
52 "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
53 "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
54 "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
55 "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
56 "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
57 "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
58 "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
59 "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
60 "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
61 "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
62 "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
63 "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
64 "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
65 "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
66 "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
67 "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
73 /* Common abbreviations and other OK words not to query as typos. */
75 "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
76 "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
77 "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
78 "outbid", "outbids", "frostbite", "frostbitten", ""
81 /* Common abbreviations that cause otherwise unexplained periods. */
83 "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
84 "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
88 * Two-Letter combinations that rarely if ever start words,
89 * but are common scannos or otherwise common letter combinations.
92 "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
96 * Two-Letter combinations that rarely if ever end words,
97 * but are common scannos or otherwise common letter combinations.
100 "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
101 "sw", "gr", "sl", "cl", "iy", ""
105 "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
106 "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
107 "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
108 "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
112 "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
116 "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
117 "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
118 "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
119 "during", "let", "toward", "among", ""
123 "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
124 "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
125 "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
126 "among", "those", "into", "whom", "having", "thence", ""
129 gboolean pswit[SWITNO]; /* program switches */
131 static GOptionEntry options[]={
132 { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
133 "Ignore DP-specific markup", NULL },
134 { "noecho", 'e', 0, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
135 "Don't echo queried line", NULL },
136 { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
137 "Check single quotes", NULL },
138 { "typo", 't', 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
139 "Check common typos", NULL },
140 { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
141 "Require closure of quotes on every paragraph", NULL },
142 { "relaxed", 'x', 0, G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
143 "Disable paranoid querying of everything", NULL },
144 { "line-end", 'l', 0, G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
145 "Disable line end checking", NULL },
146 { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
147 "Overview: just show counts", NULL },
148 { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
149 "Output errors to stdout instead of stderr", NULL },
150 { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
151 "Echo header fields", NULL },
152 { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
153 "Ignore markup in < >", NULL },
154 { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
155 "Use file of user-defined typos", NULL },
156 { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
157 "Defaults for use on www upload", NULL },
158 { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
159 "Verbose - list everything", NULL },
163 long cnt_quote; /* for overview mode, count of quote queries */
164 long cnt_brack; /* for overview mode, count of brackets queries */
165 long cnt_bin; /* for overview mode, count of non-ASCII queries */
166 long cnt_odd; /* for overview mode, count of odd character queries */
167 long cnt_long; /* for overview mode, count of long line errors */
168 long cnt_short; /* for overview mode, count of short line queries */
169 long cnt_punct; /* for overview mode,
170 count of punctuation and spacing queries */
171 long cnt_dash; /* for overview mode, count of dash-related queries */
172 long cnt_word; /* for overview mode, count of word queries */
173 long cnt_html; /* for overview mode, count of html queries */
174 long cnt_lineend; /* for overview mode, count of line-end queries */
175 long cnt_spacend; /* count of lines with space at end */
176 long linecnt; /* count of total lines in the file */
177 long checked_linecnt; /* count of lines actually checked */
179 void proghelp(GOptionContext *context);
180 void procfile(const char *);
184 gboolean mixdigit(const char *);
185 gchar *getaword(const char **);
186 char *flgets(char **,long);
187 void postprocess_for_HTML(char *);
188 char *linehasmarkup(char *);
189 char *losemarkup(char *);
190 gboolean tagcomp(const char *,const char *);
191 void loseentities(char *);
192 gboolean isroman(const char *);
193 void postprocess_for_DP(char *);
194 void print_as_windows_1252(const char *string);
195 void print_as_utf_8(const char *string);
197 GTree *qword,*qperiod;
/*
 * parse_options: fill pswit[] from the command line.
 *
 * Builds a GOptionContext from the options[] table and hands argc/argv
 * to g_option_context_parse(). On a parse failure the GLib error message
 * and a "--help" hint are written with g_printerr().
 * NOTE(review): what happens after that report (presumably an exit) is on
 * lines this listing does not show - confirm in the full file.
 *
 * After parsing, the switches whose command-line flag means "turn this
 * off" (paranoid, line-end, echo) are inverted so that pswit[] holds the
 * effective setting; overview mode then forces echo off, and the web
 * switch replaces every setting with a fixed set of values.
 * The option context is released before returning.
 */
203 void parse_options(int *argc,char ***argv)
206 GOptionContext *context;
207 context=g_option_context_new(
208 "file - looks for errors in Project Gutenberg(TM) etexts");
209 g_option_context_add_main_entries(context,options,NULL);
210 if (!g_option_context_parse(context,argc,argv,&err))
212 g_printerr("Bookloupe: %s\n",err->message);
213 g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
216 /* Paranoid checking is turned OFF, not on, by its switch */
217 pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
218 if (pswit[PARANOID_SWITCH])
219 /* if running in paranoid mode, typo checks default to enabled */
220 pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
221 /* Line-end checking is turned OFF, not on, by its switch */
222 pswit[LINE_END_SWITCH]=!pswit[LINE_END_SWITCH];
223 /* Echoing is turned OFF, not on, by its switch */
224 pswit[ECHO_SWITCH]=!pswit[ECHO_SWITCH];
225 if (pswit[OVERVIEW_SWITCH])
226 /* just print summary; don't echo */
227 pswit[ECHO_SWITCH]=FALSE;
229 * Web uploads - for the moment, this is really just a placeholder
230 * until we decide what processing we really want to do on web uploads
232 if (pswit[WEB_SWITCH])
234 /* specific override for web uploads */
235 pswit[ECHO_SWITCH]=TRUE;
236 pswit[SQUOTE_SWITCH]=FALSE;
237 pswit[TYPO_SWITCH]=TRUE;
238 pswit[QPARA_SWITCH]=FALSE;
239 pswit[PARANOID_SWITCH]=TRUE;
240 pswit[LINE_END_SWITCH]=FALSE;
241 pswit[OVERVIEW_SWITCH]=FALSE;
242 pswit[STDOUT_SWITCH]=FALSE;
243 pswit[HEADER_SWITCH]=TRUE;
244 pswit[VERBOSE_SWITCH]=FALSE;
245 pswit[MARKUP_SWITCH]=FALSE;
246 pswit[USERTYPO_SWITCH]=FALSE;
247 pswit[DP_SWITCH]=FALSE;
254 g_option_context_free(context);
260 * Read in the user-defined stealth scanno list.
262 void read_user_scannos(void)
265 gchar *usertypo_file;
269 gchar *contents,*utf8,**lines;
270 usertypo_file=g_strdup("bookloupe.typ");
271 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
272 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
275 g_free(usertypo_file);
276 usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
277 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
279 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
282 g_free(usertypo_file);
283 usertypo_file=g_strdup("gutcheck.typ");
284 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
286 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
289 g_free(usertypo_file);
290 usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
291 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
293 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
295 g_free(usertypo_file);
296 g_print(" --> I couldn't find bookloupe.typ "
297 "-- proceeding without user typos.\n");
302 fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
303 g_free(usertypo_file);
307 if (g_utf8_validate(contents,len,NULL))
308 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
310 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
312 lines=g_strsplit_set(utf8,"\r\n",0);
314 usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
315 for (i=0;lines[i];i++)
316 if (*(unsigned char *)lines[i]>'!')
317 g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
326 * Read an etext returning a newly allocated string containing the file
327 * contents or NULL on error.
329 gchar *read_etext(const char *filename,GError **err)
331 GError *tmp_err=NULL;
332 gchar *contents,*utf8;
333 gsize len,bytes_read,bytes_written;
335 if (!g_file_get_contents(filename,&contents,&len,err))
337 if (g_utf8_validate(contents,len,NULL))
339 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
340 g_set_print_handler(print_as_utf_8);
342 SetConsoleOutputCP(CP_UTF8);
347 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,
348 &bytes_written,&tmp_err);
349 if (g_error_matches(tmp_err,G_CONVERT_ERROR,
350 G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
353 for(i=0;i<bytes_read;i++)
354 if (contents[i]=='\n')
359 else if (contents[i]!='\r')
361 g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
362 "Input conversion failed. Byte %d at line %d, column %d is not a "
363 "valid Windows-1252 character",
364 ((unsigned char *)contents)[bytes_read],line,col);
367 g_propagate_error(err,tmp_err);
368 g_set_print_handler(print_as_windows_1252);
370 SetConsoleOutputCP(1252);
/*
 * cleanup_on_exit: put the console output code page back to saved_cp.
 * main() registers this function with atexit() and fills saved_cp from
 * GetConsoleOutputCP().
 * NOTE(review): any conditional compilation guarding this Windows console
 * call is on lines this listing does not show - confirm in the full file.
 */
377 void cleanup_on_exit(void)
380 SetConsoleOutputCP(saved_cp);
384 int main(int argc,char **argv)
387 atexit(cleanup_on_exit);
388 saved_cp=GetConsoleOutputCP();
390 running_from=g_path_get_dirname(argv[0]);
391 parse_options(&argc,&argv);
392 if (pswit[USERTYPO_SWITCH])
394 fprintf(stderr,"bookloupe: Check and report on an e-text\n");
396 if (pswit[OVERVIEW_SWITCH])
398 g_print(" Checked %ld lines of %ld (head+foot = %ld)\n\n",
399 checked_linecnt,linecnt,linecnt-checked_linecnt);
400 g_print(" --------------- Queries found --------------\n");
402 g_print(" Long lines: %14ld\n",cnt_long);
404 g_print(" Short lines: %14ld\n",cnt_short);
406 g_print(" Line-end problems: %14ld\n",cnt_lineend);
408 g_print(" Common typos: %14ld\n",cnt_word);
410 g_print(" Unmatched quotes: %14ld\n",cnt_quote);
412 g_print(" Unmatched brackets: %14ld\n",cnt_brack);
414 g_print(" Non-ASCII characters: %14ld\n",cnt_bin);
416 g_print(" Proofing characters: %14ld\n",cnt_odd);
418 g_print(" Punctuation & spacing queries: %14ld\n",cnt_punct);
420 g_print(" Non-standard dashes: %14ld\n",cnt_dash);
422 g_print(" Possible HTML tags: %14ld\n",cnt_html);
424 g_print(" TOTAL QUERIES %14ld\n",
425 cnt_quote+cnt_brack+cnt_bin+cnt_odd+cnt_long+cnt_short+cnt_punct+
426 cnt_dash+cnt_word+cnt_html+cnt_lineend);
428 g_free(running_from);
430 g_tree_unref(usertypo);
434 void count_dashes(const char *line,const char *dash,
435 struct dash_results *results)
440 gboolean spaced=FALSE,unspaced=FALSE,spaced2=FALSE;
443 tokens=g_strsplit(line,dash,0);
446 for(i=1;tokens[i];i++)
448 pc=g_utf8_get_char(g_utf8_prev_char(tokens[i-1]+strlen(tokens[i-1])));
449 nc=g_utf8_get_char(tokens[i]);
450 if (g_unichar_isspace(pc) || g_unichar_isspace(nc))
452 if (g_unichar_isspace(pc) && g_unichar_isspace(nc))
454 else if (!g_unichar_isspace(pc) && !g_unichar_isspace(nc))
460 /* count of lines with em-dashes with spaces both sides */
461 results->non_PG_space++;
463 /* count of lines with PG-type em-dashes with no spaces */
471 * Run a first pass - verify that it's a valid PG
472 * file, decide whether to report some things that
473 * occur many times in the text like long or short
474 * lines, non-standard dashes, etc.
476 struct first_pass_results *first_pass(const char *etext)
478 gunichar laststart=CHAR_SPACE;
483 unsigned int lastlen=0,lastblen=0;
484 long spline=0,nspline=0;
485 static struct first_pass_results results={0};
486 struct dash_results tmp_dash_results;
489 lines=g_strsplit(etext,"\n",0);
490 for (j=0;lines[j];j++)
492 lbytes=strlen(lines[j]);
493 while (lbytes>0 && lines[j][lbytes-1]=='\r')
494 lines[j][--lbytes]='\0';
495 llen=g_utf8_strlen(lines[j],lbytes);
497 if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
498 (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
501 g_print(" --> Duplicate header?\n");
502 spline=linecnt+1; /* first line of non-header text, that is */
504 if (!strncmp(lines[j],"*** START",9) &&
505 strstr(lines[j],"PROJECT GUTENBERG"))
508 g_print(" --> Duplicate header?\n");
509 nspline=linecnt+1; /* first line of non-header text, that is */
511 if (spline || nspline)
513 lc_line=g_utf8_strdown(lines[j],lbytes);
514 if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
516 if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
518 if (results.footerline)
520 /* it's an old-form header - we can detect duplicates */
522 g_print(" --> Duplicate footer?\n");
525 results.footerline=linecnt;
531 results.firstline=spline;
533 results.firstline=nspline; /* override with new */
534 if (results.footerline)
535 continue; /* don't count the boilerplate in the footer */
536 results.totlen+=llen;
537 for (s=lines[j];*s;s=g_utf8_next_char(s))
539 if (g_utf8_get_char(s)>127)
541 if (g_unichar_isalpha(g_utf8_get_char(s)))
545 if (CHAR_IS_DQUOTE(g_utf8_get_char(s)))
546 qc=QUOTE_CLASS(g_utf8_get_char(s));
549 if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) &&
550 g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
551 results.endquote_count++;
554 if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
555 lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
558 g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
560 if (strstr(lines[j],".,"))
562 /* Only count asterisk lines (for deciding whether to ignore them) */
563 /* where there is lowercase text on the line. */
564 if (strchr(lines[j],'*'))
566 for (s=lines[j];*s;s=g_utf8_next_char(s))
567 if (g_unichar_islower(g_utf8_get_char(s)))
572 if (strchr(lines[j],'/'))
573 results.fslashline++;
576 for (s=g_utf8_prev_char(lines[j]+lbytes);
577 s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;
578 s=g_utf8_prev_char(s))
580 if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
581 g_utf8_get_char(g_utf8_prev_char(s))!='-')
584 if (llen>LONGEST_PG_LINE)
586 if (llen>WAY_TOO_LONG)
587 results.verylongline++;
588 if (strchr(lines[j],'<') && strchr(lines[j],'>'))
590 i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
593 if (strstr(lines[j],"<i>"))
594 results.htmcount+=4; /* bonus marks! */
596 /* Check for spaced em-dashes */
597 memset(&tmp_dash_results,0,sizeof(tmp_dash_results));
598 count_dashes(lines[j],"--",&tmp_dash_results);
599 count_dashes(lines[j],"—",&tmp_dash_results);
600 if (tmp_dash_results.base)
601 results.emdash.base++;
602 if (tmp_dash_results.non_PG_space)
603 results.emdash.non_PG_space++;
604 if (tmp_dash_results.PG_space)
605 results.emdash.PG_space++;
609 if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))
610 results.Dutchcount++;
611 if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))
612 results.Frenchcount++;
613 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
614 results.standalone_digit++;
617 /* Check for spaced dashes */
618 if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
622 laststart=lines[j][0];
631 * Make some snap decisions based on the first pass results.
633 struct warnings *report_first_pass(struct first_pass_results *results)
635 static struct warnings warnings={0};
637 g_print(" --> %ld lines in this file have white space at end\n",
640 if (results->dotcomma>5)
643 g_print(" --> %ld lines in this file contain '.,'. "
644 "Not reporting them.\n",results->dotcomma);
647 * If more than 50 lines, or one-tenth, are short,
648 * don't bother reporting them.
650 warnings.shortline=1;
651 if (results->shortline>50 || results->shortline*10>linecnt)
653 warnings.shortline=0;
654 g_print(" --> %ld lines in this file are short. "
655 "Not reporting short lines.\n",results->shortline);
658 * If more than 50 lines, or one-tenth, are long,
659 * don't bother reporting them.
662 if (results->longline>50 || results->longline*10>linecnt)
665 g_print(" --> %ld lines in this file are long. "
666 "Not reporting long lines.\n",results->longline);
668 /* If more than 10 lines contain asterisks, don't bother reporting them. */
670 if (results->astline>10)
673 g_print(" --> %ld lines in this file contain asterisks. "
674 "Not reporting them.\n",results->astline);
677 * If more than 10 lines contain forward slashes,
678 * don't bother reporting them.
681 if (results->fslashline>10)
684 g_print(" --> %ld lines in this file contain forward slashes. "
685 "Not reporting them.\n",results->fslashline);
688 * If more than 20 lines contain unpunctuated endquotes,
689 * don't bother reporting them.
692 if (results->endquote_count>20)
695 g_print(" --> %ld lines in this file contain unpunctuated endquotes. "
696 "Not reporting them.\n",results->endquote_count);
699 * If more than 10 lines contain standalone digits,
700 * don't bother reporting them.
703 if (results->standalone_digit>10)
706 g_print(" --> %ld lines in this file contain standalone 0s and 1s. "
707 "Not reporting them.\n",results->standalone_digit);
710 * If more than 20 lines contain hyphens at end,
711 * don't bother reporting them.
714 if (results->hyphens>20)
717 g_print(" --> %ld lines in this file have hyphens at end. "
718 "Not reporting them.\n",results->hyphens);
720 if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
722 g_print(" --> Looks like this is HTML. Switching HTML mode ON.\n");
723 pswit[MARKUP_SWITCH]=1;
725 if (results->verylongline>0)
726 g_print(" --> %ld lines in this file are VERY long!\n",
727 results->verylongline);
729 * If there are more non-PG spaced dashes than PG em-dashes,
730 * assume it's deliberate.
731 * Current PG guidelines say don't use them, but older texts do,
732 * and some people insist on them whatever the guidelines say.
735 if (results->spacedash+results->emdash.non_PG_space>
736 results->emdash.PG_space)
739 g_print(" --> There are %ld spaced dashes and em-dashes. "
740 "Not reporting them.\n",
741 results->spacedash+results->emdash.non_PG_space);
743 /* If more than a quarter of characters are hi-bit, bug out. */
745 if (results->binlen*4>results->totlen)
747 g_print(" --> This file does not appear to be ASCII. "
748 "Terminating. Best of luck with it!\n");
751 if (results->alphalen*4<results->totlen)
753 g_print(" --> This file does not appear to be text. "
754 "Terminating. Best of luck with it!\n");
757 if (results->binlen*100>results->totlen || results->binlen>100)
759 g_print(" --> There are a lot of foreign letters here. "
760 "Not reporting them.\n");
763 warnings.isDutch=FALSE;
764 if (results->Dutchcount>50)
766 warnings.isDutch=TRUE;
767 g_print(" --> This looks like Dutch - "
768 "switching off dashes and warnings for 's Middags case.\n");
770 warnings.isFrench=FALSE;
771 if (results->Frenchcount>50)
773 warnings.isFrench=TRUE;
774 g_print(" --> This looks like French - "
775 "switching off some doublepunct.\n");
777 if (results->firstline && results->footerline)
778 g_print(" The PG header and footer appear to be already on.\n");
781 if (results->firstline)
782 g_print(" The PG header is on - no footer.\n");
783 if (results->footerline)
784 g_print(" The PG footer is on - no header.\n");
787 if (pswit[VERBOSE_SWITCH])
790 warnings.shortline=1;
799 g_print(" *** Verbose output is ON -- you asked for it! ***\n");
801 if (warnings.isDutch)
803 if (results->footerline>0 && results->firstline>0 &&
804 results->footerline>results->firstline &&
805 results->footerline-results->firstline<100)
807 g_print(" --> I don't really know where this text starts. \n");
808 g_print(" There are no reference points.\n");
809 g_print(" I'm going to have to report the header and footer "
811 results->firstline=0;
819 * Look along the line, accumulate the count of quotes, and see
820 * if this is an empty line - i.e. a line with nothing on it
822 * If line has just spaces, period, * and/or - on it, don't
823 * count it, since empty lines with asterisks or dashes to
824 * separate sections are common.
826 * Returns: TRUE if the line is empty.
828 gboolean analyse_quotes(const char *aline,struct counters *counters)
831 /* assume the line is empty until proven otherwise */
832 gboolean isemptyline=TRUE;
833 const char *s=aline,*sprev,*snext;
836 GError *tmp_err=NULL;
839 snext=g_utf8_next_char(s);
840 c=g_utf8_get_char(s);
841 if (CHAR_IS_DQUOTE(c))
842 (void)count_quote(counters,c,QUOTE_CLASS(c),&tmp_err);
843 else if (CHAR_IS_SQUOTE(c) && pswit[SQUOTE_SWITCH])
848 * At start of line, it can only be a quotation mark.
849 * Hardcode a very common exception!
851 if (!g_str_has_prefix(snext,"tis") &&
852 !g_str_has_prefix(snext,"Tis"))
853 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
855 else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
856 g_unichar_isalpha(g_utf8_get_char(snext)))
857 /* Do nothing! it's definitely an apostrophe, not a quote */
859 /* it's outside a word - let's check it out */
860 else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||
861 g_unichar_isalpha(g_utf8_get_char(snext)))
863 /* certainly looks like a quotation mark */
864 if (!g_str_has_prefix(snext,"tis") &&
865 !g_str_has_prefix(snext,"Tis"))
866 /* hardcode a very common exception! */
868 if (strchr(".?!,;:",g_utf8_get_char(sprev)))
869 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
871 (void)count_quote(counters,c,OPENING_QUOTE,&tmp_err);
876 /* now - is it a quotation mark? */
877 guessquote=0; /* accumulate clues */
878 if (g_unichar_isalpha(g_utf8_get_char(sprev)))
880 /* it follows a letter - could be either */
882 if (g_utf8_get_char(sprev)=='s')
884 /* looks like a plural apostrophe */
886 if (g_utf8_get_char(snext)==CHAR_SPACE)
890 if (innermost_quote_matches(counters,c))
892 * Give it the benefit of some doubt,
893 * if a squote is already open.
899 (void)count_quote(counters,c,CLOSING_QUOTE,&tmp_err);
902 /* no adjacent letter - it must be a quote of some kind */
903 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
908 if (pswit[ECHO_SWITCH])
909 g_print("\n%s\n",aline);
910 if (!pswit[OVERVIEW_SWITCH])
911 g_print(" Line %ld column %ld - %s\n",
912 linecnt,g_utf8_pointer_to_offset(aline,s)+1,tmp_err->message);
913 g_clear_error(&tmp_err);
915 if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
917 isemptyline=FALSE; /* ignore lines like * * * as spacers */
918 if (c==CHAR_UNDERSCORE)
919 counters->c_unders++;
920 if (c==CHAR_OPEN_SBRACK)
922 if (!matching_difference(counters,COUNTER_ILLUSTRATION) &&
923 !matching_difference(counters,c) && s==aline &&
924 g_str_has_prefix(s,"[Illustration:"))
925 increment_matching(counters,COUNTER_ILLUSTRATION,TRUE);
927 increment_matching(counters,c,TRUE);
929 else if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK)
930 increment_matching(counters,c,TRUE);
931 if (c==CHAR_CLOSE_SBRACK)
933 if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) &&
934 !matching_difference(counters,c) && !*snext)
935 increment_matching(counters,COUNTER_ILLUSTRATION,FALSE);
937 increment_matching(counters,c,FALSE);
939 else if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK)
940 increment_matching(counters,c,FALSE);
948 * check_for_control_characters:
950 * Check for non-printable control characters in the line,
951 * i.e. anything below a space other than LF, CR and tab.
952 * Characters above 127 and tabs are not queried here;
953 * check_for_odd_characters() reports those.
/*
 * Walk aline one UTF-8 character at a time and query every code point
 * below CHAR_SPACE other than CHAR_LF, CHAR_CR and CHAR_TAB.
 * The offending line is echoed when ECHO_SWITCH is set, and the query
 * itself is printed unless OVERVIEW_SWITCH is set.
 * NOTE(review): the lines this listing omits (declarations, braces and
 * whatever follows the g_print call) are not visible here and are
 * deliberately left untouched.
 */
955 void check_for_control_characters(const char *aline)
959 for (s=aline;*s;s=g_utf8_next_char(s))
961 c=g_utf8_get_char(s);
962 if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
964 if (pswit[ECHO_SWITCH])
965 g_print("\n%s\n",aline);
966 if (!pswit[OVERVIEW_SWITCH])
/*
 * The column is the 1-based character offset of s from the start of
 * aline. The start of the string must be the first argument to
 * g_utf8_pointer_to_offset(); with the arguments reversed the offset
 * comes back negative and the reported column is wrong.
 */
967 g_print(" Line %ld column %ld - Control character %u\n",
968 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
976 * check_for_odd_characters:
978 * Check for binary and other odd characters.
980 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
981 gboolean isemptyline)
983 /* Don't repeat multiple warnings on one line. */
984 gboolean eNon_A=FALSE,eTab=FALSE,eTilde=FALSE;
985 gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
988 for (s=aline;*s;s=g_utf8_next_char(s))
990 c=g_utf8_get_char(s);
991 if (!eNon_A && (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
993 if (pswit[ECHO_SWITCH])
994 g_print("\n%s\n",aline);
995 if (!pswit[OVERVIEW_SWITCH])
996 if (c>127 && c<160 || c>255)
997 g_print(" Line %ld column %ld - "
998 "Non-ISO-8859 character %u\n",
999 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1001 g_print(" Line %ld column %ld - "
1002 "Non-ASCII character %u\n",
1003 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1008 if (!eTab && c==CHAR_TAB)
1010 if (pswit[ECHO_SWITCH])
1011 g_print("\n%s\n",aline);
1012 if (!pswit[OVERVIEW_SWITCH])
1013 g_print(" Line %ld column %ld - Tab character?\n",
1014 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1019 if (!eTilde && c==CHAR_TILDE)
1022 * Often used by OCR software to indicate an
1023 * unrecognizable character.
1025 if (pswit[ECHO_SWITCH])
1026 g_print("\n%s\n",aline);
1027 if (!pswit[OVERVIEW_SWITCH])
1028 g_print(" Line %ld column %ld - Tilde character?\n",
1029 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1034 if (!eCarat && c==CHAR_CARAT)
1036 if (pswit[ECHO_SWITCH])
1037 g_print("\n%s\n",aline);
1038 if (!pswit[OVERVIEW_SWITCH])
1039 g_print(" Line %ld column %ld - Carat character?\n",
1040 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1045 if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
1047 if (pswit[ECHO_SWITCH])
1048 g_print("\n%s\n",aline);
1049 if (!pswit[OVERVIEW_SWITCH])
1050 g_print(" Line %ld column %ld - Forward slash?\n",
1051 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1057 * Report asterisks only in paranoid mode,
1058 * since they're often deliberate.
1060 if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
1063 if (pswit[ECHO_SWITCH])
1064 g_print("\n%s\n",aline);
1065 if (!pswit[OVERVIEW_SWITCH])
1066 g_print(" Line %ld column %ld - Asterisk?\n",
1067 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1076 * check_for_long_line:
1078 * Check for line too long.
/*
 * Query aline when its length in UTF-8 characters (not bytes) exceeds
 * LONGEST_PG_LINE. The line is echoed when ECHO_SWITCH is set and the
 * query is printed unless OVERVIEW_SWITCH is set. The character count
 * is printed twice: once as the column and once as the line length.
 */
1080 void check_for_long_line(const char *aline)
1082 if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
1084 if (pswit[ECHO_SWITCH])
1085 g_print("\n%s\n",aline);
1086 if (!pswit[OVERVIEW_SWITCH])
1087 g_print(" Line %ld column %ld - Long line %ld\n",
1088 linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
1095 * check_for_short_line:
1097 * Check for line too short.
1099 * This one is a bit trickier to implement: we don't want to
1100 * flag the last line of a paragraph for being short, so we
1101 * have to wait until we know that our current line is a
1102 * "normal" line, then report the _previous_ line if it was too
1103 * short. We also don't want to report indented lines like
1104 * chapter heads or formatted quotations. We therefore keep
1105 * last->len as the length of the last line examined, and
1106 * last->blen as the length of the last but one, and try to
1107 * suppress unnecessary warnings by checking that both were of
1108 * "normal" length. We keep the first character of the last
1109 * line in last->start, and if it was a space, we assume that
1110 * the formatting is deliberate. I can't figure out a way to
1111 * distinguish something like a quoted verse left-aligned or
1112 * the header or footer of a letter from a paragraph of short
1113 * lines - maybe if I examined the whole paragraph, and if the
1114 * para has less than, say, 8 lines and if all lines are short,
1115 * then just assume it's OK? Need to look at some texts to see
1116 * how often a formula like this would get the right result.
/*
 * aline is the current line; last carries the character lengths of the
 * previous line (len) and the one before it (blen), plus the first
 * character of the previous line (start).
 * The query is raised against the PREVIOUS line: it is prevline that is
 * echoed and measured, and the line number printed is linecnt-1.
 * It fires only when the current line has more than one character, the
 * previous line is longer than one character but shorter than
 * SHORTEST_PG_LINE, the line before that is longer than
 * SHORTEST_PG_LINE, and the previous line did not start with a space.
 */
1118 void check_for_short_line(const char *aline,const struct line_properties *last)
1120 if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
1121 last->len<SHORTEST_PG_LINE && last->blen>1 &&
1122 last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
1124 if (pswit[ECHO_SWITCH])
1125 g_print("\n%s\n",prevline);
1126 if (!pswit[OVERVIEW_SWITCH])
1127 g_print(" Line %ld column %ld - Short line %ld?\n",
1128 linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
1135 * check_for_starting_punctuation:
1137 * Look for punctuation other than full ellipses at start of line.
/*
 * Query a non-empty line whose first character is one of . ? ! , ; :
 * unless the line begins with the spaced ellipsis ". . .".
 * The column is always reported as 1, as fixed in the message text.
 * NOTE(review): the remaining argument line of the g_print call below is
 * not shown in this listing.
 */
1139 void check_for_starting_punctuation(const char *aline)
1141 if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
1142 !g_str_has_prefix(aline,". . ."))
1144 if (pswit[ECHO_SWITCH])
1145 g_print("\n%s\n",aline);
1146 if (!pswit[OVERVIEW_SWITCH])
1147 g_print(" Line %ld column 1 - Begins with punctuation?\n",
1157 * Find the first em-dash, return a pointer to it and set <next> to the
1158 * character following the dash.
/*
 * Returns a pointer into s and stores a continuation pointer in *next.
 * Only the signature and the four stores to *next are visible in this
 * listing; the search that selects between them is not shown.
 * Two stores step one character past s2 and two step two characters past
 * s1 - presumably a single-character dash versus a two-character dash
 * sequence. TODO confirm against the complete source.
 */
1160 char *str_emdash(const char *s,const char **next)
1168 *next=g_utf8_next_char(s2);
1173 *next=g_utf8_next_char(g_utf8_next_char(s1));
1178 *next=g_utf8_next_char(g_utf8_next_char(s1));
1183 *next=g_utf8_next_char(s2);
1189 * check_for_spaced_emdash:
1191 * Check for spaced em-dashes.
1193 * We must check _all_ occurrences of em-dashes on the line
1194 * hence the loop - even if the first dash is OK
1195 * there may be another that's wrong later on.
/*
 * Visit every em-dash that str_emdash() finds in aline, resuming each
 * search at the <next> pointer it hands back.
 * A dash is queried when the character before it is a space (only tested
 * when the dash is not at the start of the line) or the character at
 * <next> is a space. The column is the 1-based character position of the
 * dash.
 */
1197 void check_for_spaced_emdash(const char *aline)
1199 const char *s,*t,*next;
1200 for (s=aline;t=str_emdash(s,&next);s=next)
1202 if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
1203 g_utf8_get_char(next)==CHAR_SPACE)
1205 if (pswit[ECHO_SWITCH])
1206 g_print("\n%s\n",aline);
1207 if (!pswit[OVERVIEW_SWITCH])
1208 g_print(" Line %ld column %ld - Spaced em-dash?\n",
1209 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1217 * check_for_spaced_dash:
1219 * Check for spaced dashes.
/*
 * Query a single dash with a space beside it.
 * Only the first " -" in the line is examined; the "- " form is examined
 * only when the line contains no " -" at all (the else-if below).
 * The column is the 1-based position of the first character of the match,
 * i.e. the space for " -" and the dash for "- ".
 */
1221 void check_for_spaced_dash(const char *aline)
1224 if ((s=strstr(aline," -")))
/* Two characters past s is whatever follows the dash; "--" is not queried. */
1226 if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
1228 if (pswit[ECHO_SWITCH])
1229 g_print("\n%s\n",aline);
1230 if (!pswit[OVERVIEW_SWITCH])
1231 g_print(" Line %ld column %ld - Spaced dash?\n",
1232 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1237 else if ((s=strstr(aline,"- ")))
/* s==aline is tested first so nothing before the buffer is read. */
1239 if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
1241 if (pswit[ECHO_SWITCH])
1242 g_print("\n%s\n",aline);
1243 if (!pswit[OVERVIEW_SWITCH])
1244 g_print(" Line %ld column %ld - Spaced dash?\n",
1245 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1253 * check_for_unmarked_paragraphs:
1255 * Check for unmarked paragraphs indicated by separate speakers.
1257 * May well be false positive:
1258 * "Bravo!" "Wonderful!" called the crowd.
1259 * but useful all the same.
/*
 * Query a closing double quote followed by whitespace and an opening
 * double quote on the same line. The column is the 1-based position of
 * the first quote of the match.
 * NOTE(review): the two searches below read identically in this listing;
 * the literals may differ only in the amount of whitespace between the
 * quotes, and the statement between them is not visible - confirm against
 * the complete source.
 */
1261 void check_for_unmarked_paragraphs(const char *aline)
1264 s=strstr(aline,"\" \"");
1266 s=strstr(aline,"\" \"");
1269 if (pswit[ECHO_SWITCH])
1270 g_print("\n%s\n",aline);
1271 if (!pswit[OVERVIEW_SWITCH])
1272 g_print(" Line %ld column %ld - "
1273 "Query missing paragraph break?\n",
1274 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1281 * check_for_jeebies:
1283 * Check for "to he" and other easy h/b errors.
1285 * This is a very inadequate effort on the h/b problem,
1286 * but the phrase "to he" is always an error, whereas "to
1287 * be" is quite common.
1288 * Similarly, '"Quiet!", be said.' is a non-be error
1289 * "to he" is _not_ always an error!:
1290 * "Where they went to he couldn't say."
1291 * Another false positive:
1292 * What would "Cinderella" be without the . . .
1293 * and another: "If he wants to he can see for himself."
/*
 * Three groups of fixed-phrase searches, each ending in its own query:
 * he/be, had/bad and hut/but. All searches are plain byte-wise strstr()
 * calls on aline, so they are case-sensitive and need the surrounding
 * spaces or punctuation shown in each literal.
 * The statements between successive searches are not visible in this
 * listing; presumably each search runs only when the earlier ones in its
 * group found nothing - TODO confirm.
 * Each query's column is the 1-based position of the start of the match.
 */
1295 void check_for_jeebies(const char *aline)
/* Group 1: "be" where "he" was meant, and "to he" where "to be" was meant. */
1298 s=strstr(aline," be could ");
1300 s=strstr(aline," be would ");
1302 s=strstr(aline," was be ");
1304 s=strstr(aline," be is ");
1306 s=strstr(aline," is be ");
1308 s=strstr(aline,"\", be ");
/*
 * NOTE(review): the next two literals read identically here; they may
 * differ only in whitespace in the complete source.
 */
1310 s=strstr(aline,"\" be ");
1312 s=strstr(aline,"\" be ");
1314 s=strstr(aline," to he ");
1317 if (pswit[ECHO_SWITCH])
1318 g_print("\n%s\n",aline);
1319 if (!pswit[OVERVIEW_SWITCH])
1320 g_print(" Line %ld column %ld - Query he/be error?\n",
1321 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Group 2: "had" and "bad" confused after an article or pronoun. */
1325 s=strstr(aline," the had ");
1327 s=strstr(aline," a had ");
1329 s=strstr(aline," they bad ");
1331 s=strstr(aline," she bad ");
1333 s=strstr(aline," he bad ");
1335 s=strstr(aline," you bad ");
1337 s=strstr(aline," i bad ");
1340 if (pswit[ECHO_SWITCH])
1341 g_print("\n%s\n",aline);
1342 if (!pswit[OVERVIEW_SWITCH])
1343 g_print(" Line %ld column %ld - Query had/bad error?\n",
1344 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Group 3: "hut" for "but" straight after a semicolon or comma. */
1348 s=strstr(aline,"; hut ");
1350 s=strstr(aline,", hut ");
1353 if (pswit[ECHO_SWITCH])
1354 g_print("\n%s\n",aline);
1355 if (!pswit[OVERVIEW_SWITCH])
1356 g_print(" Line %ld column %ld - Query hut/but error?\n",
1357 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1364 * check_for_mta_from:
1366 * Special case - angled bracket in front of "From" placed there by an
1367 * MTA when sending an e-mail.
/*
 * Query the first occurrence of the literal ">From" anywhere in aline.
 * The column is the 1-based position of the ">" character.
 */
1369 void check_for_mta_from(const char *aline)
1372 s=strstr(aline,">From");
1375 if (pswit[ECHO_SWITCH])
1376 g_print("\n%s\n",aline);
1377 if (!pswit[OVERVIEW_SWITCH])
1378 g_print(" Line %ld column %ld - "
1379 "Query angled bracket with From\n",
1380 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1387 * check_for_orphan_character:
1389 * Check for a single character line -
1390 * often an overflow from bad wrapping.
/*
 * Query a line that consists of exactly one character, except when that
 * character is I, V, X, L or a digit (treated as a numeral on its own).
 */
1392 void check_for_orphan_character(const char *aline)
1395 c=g_utf8_get_char(aline);
/* Non-empty, and the character after the first one is the terminator. */
1396 if (c && !*g_utf8_next_char(aline))
1398 if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
1399 ; /* Nothing - ignore numerals alone on a line. */
1402 if (pswit[ECHO_SWITCH])
1403 g_print("\n%s\n",aline);
1404 if (!pswit[OVERVIEW_SWITCH])
1405 g_print(" Line %ld column 1 - Query single character line\n",
1414 * check_for_pling_scanno:
1416 * Check for I" - often should be !
/*
 * Query the first occurrence of a space, a capital I and a double quote
 * in sequence, where the I may be a misread exclamation mark.
 * NOTE(review): unlike the neighbouring checks, the column printed here is
 * the raw character offset of the match with no +1 added - confirm that
 * this is intended before changing it, since it alters program output.
 */
1418 void check_for_pling_scanno(const char *aline)
1421 s=strstr(aline," I\"");
1424 if (pswit[ECHO_SWITCH])
1425 g_print("\n%s\n",aline);
1426 if (!pswit[OVERVIEW_SWITCH])
1427 g_print(" Line %ld column %ld - Query I=exclamation mark?\n",
1428 linecnt,g_utf8_pointer_to_offset(aline,s));
1435 * check_for_extra_period:
1437 * Check for period without a capital letter. Cut-down from gutspell.
1438 * Only works when it happens on a single line.
/*
 * Active only under PARANOID_SWITCH. Walks each ". " in aline and, when
 * the first letter or digit after it is a lower-case letter, copies the
 * word before the period into testword and decides whether to query it.
 * testword is compared with abbrev[], tested for a leading digit, for
 * being a single character and for being a Roman numeral, and scanned for
 * vowels; the outcome of each test sits on lines not visible in this
 * listing (presumably they suppress the query - TODO confirm).
 * qperiod records words already queried, so a word is reported once
 * unless VERBOSE_SWITCH is set. The column is the 1-based position of t.
 */
1440 void check_for_extra_period(const char *aline,const struct warnings *warnings)
1442 const char *s,*t,*s1,*sprev;
1447 gunichar c,nc,pc,*decomposition;
1448 if (pswit[PARANOID_SWITCH])
1450 for (t=aline;t=strstr(t,". ");)
1454 t=g_utf8_next_char(t);
1455 /* start of line punctuation is handled elsewhere */
/* Only a period that directly follows a letter is of interest. */
1458 if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
1460 t=g_utf8_next_char(t);
1463 if (warnings->isDutch)
1465 /* For Frank & Jeroen -- 's Middags case */
1466 gunichar c2,c3,c4,c5;
/* Peek at the four characters starting two positions after t. */
1467 c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
1468 c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
1469 c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
1470 c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
1471 if (CHAR_IS_APOSTROPHE(c2) &&
1472 g_unichar_islower(c3) && c4==CHAR_SPACE &&
1473 g_unichar_isupper(c5))
1475 t=g_utf8_next_char(t);
/* Skip ahead to the first letter or digit after the period and space. */
1479 s1=g_utf8_next_char(g_utf8_next_char(t));
/*
 * NOTE(review): isdigit() receives a gunichar below, while the rest of
 * this function uses g_unichar_isdigit(); <ctype.h> functions are only
 * defined for unsigned char values and EOF - consider g_unichar_isdigit.
 */
1480 while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
1481 !isdigit(g_utf8_get_char(s1)))
1482 s1=g_utf8_next_char(s1);
1483 if (g_unichar_islower(g_utf8_get_char(s1)))
1485 /* we have something to investigate */
1487 /* so let's go back and find out */
1488 nc=g_utf8_get_char(t);
1489 s1=g_utf8_prev_char(t);
1490 c=g_utf8_get_char(s1);
1491 sprev=g_utf8_prev_char(s1);
1492 pc=g_utf8_get_char(sprev);
1494 (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||
1495 g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&
1496 g_unichar_isalpha(nc)))
1501 sprev=g_utf8_prev_char(s1);
1502 pc=g_utf8_get_char(sprev);
1504 s1=g_utf8_next_char(s1);
/* testword is a private copy of the word, either bounded by s or to the end. */
1507 testword=g_strndup(s1,s-s1);
1509 testword=g_strdup(s1);
1510 for (i=0;*abbrev[i];i++)
1511 if (!strcmp(testword,abbrev[i]))
1513 if (g_unichar_isdigit(g_utf8_get_char(testword)))
1515 if (!*g_utf8_next_char(testword))
1517 if (isroman(testword))
1522 for (s=testword;*s;s=g_utf8_next_char(s))
/*
 * The vowel test looks at the first code point of the canonical
 * decomposition, presumably so that accented vowels match "aeiou".
 */
1524 decomposition=g_unicode_canonical_decomposition(
1525 g_utf8_get_char(s),&len);
1526 if (g_utf8_strchr("aeiou",-1,decomposition[0]))
1528 g_free(decomposition);
1532 (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
/* Remember the word; the tree stores its own copy of the key. */
1534 g_tree_insert(qperiod,g_strdup(testword),
1535 GINT_TO_POINTER(1));
1536 if (pswit[ECHO_SWITCH])
1537 g_print("\n%s\n",aline);
1538 if (!pswit[OVERVIEW_SWITCH])
1539 g_print(" Line %ld column %ld - Extra period?\n",
1540 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1546 t=g_utf8_next_char(t);
1552 * check_for_following_punctuation:
1554 * Check for words usually not followed by punctuation.
/*
 * Active only under TYPO_SWITCH. Each word is lower-cased into inword and
 * looked up in two lists: a nocomma[] word followed by , ; or : is
 * queried, and a noperiod[] word followed by . or ! is queried.
 * Both lists are walked until an empty-string entry. The column is the
 * 1-based position of s, the character tested as the punctuation.
 * How the words are split out of aline is on lines not visible here.
 */
1556 void check_for_following_punctuation(const char *aline)
1559 const char *s,*wordstart;
1562 if (pswit[TYPO_SWITCH])
1573 inword=g_utf8_strdown(t,-1);
1575 for (i=0;*nocomma[i];i++)
1576 if (!strcmp(inword,nocomma[i]))
1578 c=g_utf8_get_char(s);
1579 if (c==',' || c==';' || c==':')
1581 if (pswit[ECHO_SWITCH])
1582 g_print("\n%s\n",aline);
1583 if (!pswit[OVERVIEW_SWITCH])
1584 g_print(" Line %ld column %ld - "
1585 "Query punctuation after %s?\n",
1586 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
1592 for (i=0;*noperiod[i];i++)
1593 if (!strcmp(inword,noperiod[i]))
1595 c=g_utf8_get_char(s);
1596 if (c=='.' || c=='!')
1598 if (pswit[ECHO_SWITCH])
1599 g_print("\n%s\n",aline);
1600 if (!pswit[OVERVIEW_SWITCH])
1601 g_print(" Line %ld column %ld - "
1602 "Query punctuation after %s?\n",
1603 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
1617 * Check for commonly mistyped words,
1618 * and digits like 0 for O in a word.
/*
 * Word-by-word scan of aline; each word is fetched with getaword().
 * Stages visible in this listing, in order:
 *  - a word for which mixdigit() is true is queried as "digit in" word;
 *  - under TYPO_SWITCH or USERTYPO_SWITCH, an upper-case or title-case
 *    letter following a lower-case one is examined, with visible
 *    exceptions keyed on 'm','c' / 'm','a','c' at offsets 2 and 3 and on
 *    a preceding apostrophe; the word is then case-folded into testword;
 *  - under TYPO_SWITCH: nostart[] prefixes, noend[] suffixes, unlikely
 *    letter groups, words lacking vowels or consonants, the okword[]
 *    list, Roman numerals, the typo[] list and certain single letters;
 *    qword keeps a per-word duplicate counter;
 *  - a lookup in the user's own list (usertypo);
 *  - under PARANOID_SWITCH with warnings->digit, a standalone "0" or "1".
 * What each individual test does to istypo is on lines not visible here.
 * NOTE(review): "Query possible scanno" and "Query standalone" add 2 to
 * the word offset while the other two queries add 1 - confirm whether the
 * difference is deliberate before touching it, as it changes output.
 */
1620 void check_for_typos(const char *aline,struct warnings *warnings)
1622 const char *s,*t,*nt,*wordstart;
1624 gunichar *decomposition;
1626 int i,vowel,consonant,*dupcnt;
1627 gboolean isdup,istypo,alower;
1630 gsize decomposition_len;
1634 inword=getaword(&s);
1638 continue; /* don't bother with empty lines */
1640 if (mixdigit(inword))
1642 if (pswit[ECHO_SWITCH])
1643 g_print("\n%s\n",aline);
1644 if (!pswit[OVERVIEW_SWITCH])
1645 g_print(" Line %ld column %ld - Query digit in %s\n",
1646 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
1651 * Put the word through a series of tests for likely typos and OCR
1654 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
1658 for (t=inword;*t;t=g_utf8_next_char(t))
1660 c=g_utf8_get_char(t);
1661 nt=g_utf8_next_char(t);
1662 /* lowercase for testing */
1663 if (g_unichar_islower(c))
1665 if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
1668 * We have an uppercase mid-word. However, there are
1670 * Mac and Mc like McGill
1671 * French contractions like l'Abbe
1673 offset=g_utf8_pointer_to_offset(inword,t);
1675 pc=g_utf8_get_char(g_utf8_prev_char(t));
1678 if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
1679 offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
1680 g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
1681 CHAR_IS_APOSTROPHE(pc))
/* All list comparisons from here on use the case-folded copy. */
1687 testword=g_utf8_casefold(inword,-1);
1689 if (pswit[TYPO_SWITCH])
1692 * Check for certain unlikely two-letter combinations at word
1695 len=g_utf8_strlen(testword,-1);
1698 for (i=0;*nostart[i];i++)
1699 if (g_str_has_prefix(testword,nostart[i]))
1701 for (i=0;*noend[i];i++)
1702 if (g_str_has_suffix(testword,noend[i]))
1705 /* ght is common, gbt never. Like that. */
1706 if (strstr(testword,"cb"))
1708 if (strstr(testword,"gbt"))
1710 if (strstr(testword,"pbt"))
1712 if (strstr(testword,"tbs"))
1714 if (strstr(testword,"mrn"))
1716 if (strstr(testword,"ahle"))
1718 if (strstr(testword,"ihle"))
1721 * "TBE" does happen - like HEARTBEAT - but uncommon.
1722 * Also "TBI" - frostbite, outbid - but uncommon.
1723 * Similarly "ii" like Hawaii, or Pompeii, and in Roman
1724 * numerals, but "ii" is a common scanno.
1726 if (strstr(testword,"tbi"))
1728 if (strstr(testword,"tbe"))
1730 if (strstr(testword,"ii"))
1733 * Check for no vowels or no consonants.
1734 * If none, flag a typo.
1736 if (!istypo && len>1)
1739 for (t=testword;*t;t=g_utf8_next_char(t))
1741 c=g_utf8_get_char(t);
1743 g_unicode_canonical_decomposition(c,&decomposition_len);
1744 if (c=='y' || g_unichar_isdigit(c))
1746 /* Yah, this is loose. */
1750 else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
1754 g_free(decomposition);
1756 if (!vowel || !consonant)
1760 * Now exclude the word from being reported if it's in
1763 for (i=0;*okword[i];i++)
1764 if (!strcmp(testword,okword[i]))
1767 * What looks like a typo may be a Roman numeral.
1770 if (istypo && isroman(testword))
1772 /* Check the manual list of typos. */
1774 for (i=0;*typo[i];i++)
1775 if (!strcmp(testword,typo[i]))
1778 * Check lowercase s, l, i and m - special cases.
1779 * "j" - often a semi-colon gone wrong.
1780 * "d" for a missing apostrophe - he d
1783 if (!istypo && len==1 &&
1784 g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
/* qword maps each queried word to a heap-allocated repeat counter. */
1788 dupcnt=g_tree_lookup(qword,testword);
1792 isdup=!pswit[VERBOSE_SWITCH];
1796 dupcnt=g_new0(int,1);
1797 g_tree_insert(qword,g_strdup(testword),dupcnt);
1802 if (pswit[ECHO_SWITCH])
1803 g_print("\n%s\n",aline);
1804 if (!pswit[OVERVIEW_SWITCH])
1806 g_print(" Line %ld column %ld - Query word %s",
1807 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
1809 if (!pswit[VERBOSE_SWITCH])
1810 g_print(" - not reporting duplicates");
1818 /* check the user's list of typos */
1819 if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
1821 if (pswit[ECHO_SWITCH])
1822 g_print("\n%s\n",aline);
1823 if (!pswit[OVERVIEW_SWITCH])
1824 g_print(" Line %ld column %ld - Query possible scanno %s\n",
1825 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
1827 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
1829 if (pswit[PARANOID_SWITCH] && warnings->digit)
1831 /* In paranoid mode, query all 0 and 1 standing alone. */
1832 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
1834 if (pswit[ECHO_SWITCH])
1835 g_print("\n%s\n",aline);
1836 if (!pswit[OVERVIEW_SWITCH])
1837 g_print(" Line %ld column %ld - Query standalone %s\n",
1838 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
1849 * check_for_misspaced_punctuation:
1851 * Look for added or missing spaces around punctuation and quotes.
1852 * If there is a punctuation character like ! with no space on
1853 * either side, suspect a missing!space. If there are spaces on
1854 * both sides , assume a typo. If we see a double quote with no
1855 * space or punctuation on either side of it, assume unspaced
1856 * quotes "like"this.
/*
 * Several independent passes over aline, each sliding a previous/current/
 * next character window (pc, c, nc) along the line:
 *  1. punctuation from ".?!,;:_" with letters on both sides, or strict
 *     punctuation followed by a letter ("Missing space?"), and
 *     punctuation with a space before and a space or end of line after
 *     ("Spaced punctuation?", skipped for ellipses and when isemptyline);
 *  2. "?!,;:" preceded by a space and not followed by one;
 *  3. a character preceded by a space and followed by a letter (per the
 *     visible comment this is the period case; the test on c itself is
 *     not visible);
 *  4. a double quote with no space or punctuation on either side;
 *  5. double-quote parity, tracked across calls in parities->dquote;
 *  6. a double quote in column 1 followed by closing punctuation or space;
 *  7. under SQUOTE_SWITCH, the same parity logic for single quotes in
 *     parities->squote.
 * The statements that advance pc and c, and the acronym/ellipsis flags,
 * are on lines not visible in this listing.
 */
1858 void check_for_misspaced_punctuation(const char *aline,
1859 struct parities *parities,gboolean isemptyline)
1861 gboolean isacro,isellipsis;
1863 gunichar c,nc,pc,n2c;
1865 c=g_utf8_get_char(aline);
/* nc is only fetched when the line is non-empty, so empty lines skip the loop. */
1866 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1867 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1871 nc=g_utf8_get_char(g_utf8_next_char(s));
1872 /* For each character in the line after the first. */
1873 if (g_utf8_strchr(".?!,;:_",-1,c)) /* if it's punctuation */
1875 /* we need to suppress warnings for acronyms like M.D. */
1877 /* we need to suppress warnings for ellipsis . . . */
1880 * If there are letters on both sides of it or
1881 * if it's strict punctuation followed by an alpha.
1883 if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
1884 g_utf8_strchr("?!,;:",-1,c)))
/* Looking two characters back is only attempted past offset 2. */
1888 if (g_utf8_pointer_to_offset(aline,s)>2 &&
1889 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
1891 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
1897 if (pswit[ECHO_SWITCH])
1898 g_print("\n%s\n",aline);
1899 if (!pswit[OVERVIEW_SWITCH])
1900 g_print(" Line %ld column %ld - Missing space?\n",
1901 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1906 if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
1909 * If there are spaces on both sides,
1910 * or space before and end of line.
1914 if (g_utf8_pointer_to_offset(aline,s)>2 &&
1915 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
1917 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
1921 if (!isemptyline && !isellipsis)
1923 if (pswit[ECHO_SWITCH])
1924 g_print("\n%s\n",aline);
1925 if (!pswit[OVERVIEW_SWITCH])
1926 g_print(" Line %ld column %ld - "
1927 "Spaced punctuation?\n",linecnt,
1928 g_utf8_pointer_to_offset(aline,s)+1);
1935 /* Split out the characters that CANNOT be preceded by space. */
1936 c=g_utf8_get_char(aline);
1937 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1938 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1942 nc=g_utf8_get_char(g_utf8_next_char(s));
1943 /* for each character in the line after the first */
1944 if (g_utf8_strchr("?!,;:",-1,c))
1946 /* if it's punctuation that _cannot_ have a space before it */
1947 if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
1950 * If nc DOES == space,
1951 * it was already reported just above.
1953 if (pswit[ECHO_SWITCH])
1954 g_print("\n%s\n",aline);
1955 if (!pswit[OVERVIEW_SWITCH])
1956 g_print(" Line %ld column %ld - Spaced punctuation?\n",
1957 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1964 * Special case " .X" where X is any alpha.
1965 * This plugs a hole in the acronym code above.
1966 * Inelegant, but maintainable.
1968 c=g_utf8_get_char(aline);
1969 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1970 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1974 nc=g_utf8_get_char(g_utf8_next_char(s));
1975 /* for each character in the line after the first */
1978 /* if it's a period */
1979 if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
1982 * If the period follows a space and
1983 * is followed by a letter.
1985 if (pswit[ECHO_SWITCH])
1986 g_print("\n%s\n",aline);
1987 if (!pswit[OVERVIEW_SWITCH])
1988 g_print(" Line %ld column %ld - Spaced punctuation?\n",
1989 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1995 c=g_utf8_get_char(aline);
1996 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1997 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2001 nc=g_utf8_get_char(g_utf8_next_char(s));
2002 /* for each character in the line after the first */
2003 if (CHAR_IS_DQUOTE(c))
2005 if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
2006 !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
2007 !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
2009 if (pswit[ECHO_SWITCH])
2010 g_print("\n%s\n",aline);
2011 if (!pswit[OVERVIEW_SWITCH])
2012 g_print(" Line %ld column %ld - Unspaced quotes?\n",
2013 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2019 /* Check parity of quotes. */
2020 nc=g_utf8_get_char(aline);
2021 for (s=aline;*s;s=g_utf8_next_char(s))
2024 nc=g_utf8_get_char(g_utf8_next_char(s));
2025 if (CHAR_IS_DQUOTE(c))
/* Each double quote flips the running open/closed state. */
2029 parities->dquote=!parities->dquote;
2030 parity=parities->dquote;
2032 else if (c==CHAR_LD_QUOTE)
/*
 * Two follow-character tests come next; which applies to an opening
 * and which to a closing quote is decided on lines not visible here.
 */
2039 if (!g_utf8_strchr("_-.'`/,;:!?)]} ",-1,nc))
2041 if (pswit[ECHO_SWITCH])
2042 g_print("\n%s\n",aline);
2043 if (!pswit[OVERVIEW_SWITCH])
2044 g_print(" Line %ld column %ld - "
2045 "Wrongspaced quotes?\n",
2046 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/*
 * NOTE(review): isdigit() receives a gunichar below; <ctype.h> functions
 * are only defined for unsigned char values and EOF - consider
 * g_unichar_isdigit, as used elsewhere in this file.
 */
2054 if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
2055 !g_utf8_strchr("_-/.'`([{$",-1,nc) || !nc)
2057 if (pswit[ECHO_SWITCH])
2058 g_print("\n%s\n",aline);
2059 if (!pswit[OVERVIEW_SWITCH])
2060 g_print(" Line %ld column %ld - "
2061 "Wrongspaced quotes?\n",
2062 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2069 c=g_utf8_get_char(aline);
2070 if (CHAR_IS_DQUOTE(c))
2072 if (g_utf8_strchr(",;:!?)]} ",-1,
2073 g_utf8_get_char(g_utf8_next_char(aline))))
2075 if (pswit[ECHO_SWITCH])
2076 g_print("\n%s\n",aline);
2077 if (!pswit[OVERVIEW_SWITCH])
2078 g_print(" Line %ld column 1 - Wrongspaced quotes?\n",
2084 if (pswit[SQUOTE_SWITCH])
2086 nc=g_utf8_get_char(aline);
2087 for (s=aline;*s;s=g_utf8_next_char(s))
2090 nc=g_utf8_get_char(g_utf8_next_char(s));
/* A single quote with a letter on both sides is left out of the parity count. */
2091 if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&
2092 !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
2093 !g_unichar_isalpha(nc)))
2095 parities->squote=!parities->squote;
2096 if (!parities->squote)
2099 if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
2101 if (pswit[ECHO_SWITCH])
2102 g_print("\n%s\n",aline);
2103 if (!pswit[OVERVIEW_SWITCH])
2104 g_print(" Line %ld column %ld - "
2105 "Wrongspaced singlequotes?\n",
2106 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2114 if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
2115 !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
2117 if (pswit[ECHO_SWITCH])
2118 g_print("\n%s\n",aline);
2119 if (!pswit[OVERVIEW_SWITCH])
2120 g_print(" Line %ld column %ld - "
2121 "Wrongspaced singlequotes?\n",
2122 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2133 * check_for_double_punctuation:
2135 * Look for double punctuation like ,. or ,,
2136 * Thanks to DW for the suggestion!
2137 * In books with references, ".," and ".;" are common
2138 * e.g. "etc., etc.," and vol. 1.; vol 3.;
2139 * OTOH, from my initial tests, there are also fairly
2140 * common errors. What to do? Make these cases paranoid?
2141 * ".," is the most common, so warnings->dotcomma is used
2142 * to suppress detailed reporting if it occurs often.
/*
 * Query two adjacent characters that both come from ".?!,;:".
 * The first long condition lists the pairs handled specially: a doubled
 * . ? or !, the pair ".," while warnings->dotcomma is unset, and - for
 * French texts only - an ellipsis adjacent to , ; : ! or ?.
 * The second, nested condition repeats the French ellipsis cases; the
 * statements it guards are mostly not visible in this listing
 * (presumably they step over the ellipsis before nc is re-read - TODO
 * confirm). Everything else is reported with the 1-based column of the
 * first character of the pair.
 */
2144 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
2148 nc=g_utf8_get_char(aline);
2149 for (s=aline;*s;s=g_utf8_next_char(s))
2152 nc=g_utf8_get_char(g_utf8_next_char(s));
2153 /* for each punctuation character in the line */
2154 if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
2155 g_utf8_strchr(".?!,;:",-1,nc))
2157 /* followed by punctuation, it's a query, unless . . . */
2158 if (c==nc && (c=='.' || c=='?' || c=='!') ||
2159 !warnings->dotcomma && c=='.' && nc==',' ||
2160 warnings->isFrench && g_str_has_prefix(s,",...") ||
2161 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2162 warnings->isFrench && g_str_has_prefix(s,";...") ||
2163 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2164 warnings->isFrench && g_str_has_prefix(s,":...") ||
2165 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2166 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2167 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2168 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2169 warnings->isFrench && g_str_has_prefix(s,"...?"))
2171 if (warnings->isFrench && g_str_has_prefix(s,",...") ||
2172 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2173 warnings->isFrench && g_str_has_prefix(s,";...") ||
2174 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2175 warnings->isFrench && g_str_has_prefix(s,":...") ||
2176 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2177 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2178 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2179 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2180 warnings->isFrench && g_str_has_prefix(s,"...?"))
2183 nc=g_utf8_get_char(g_utf8_next_char(s));
2185 ; /* do nothing for .. !! and ?? which can be legit */
2189 if (pswit[ECHO_SWITCH])
2190 g_print("\n%s\n",aline);
2191 if (!pswit[OVERVIEW_SWITCH])
2192 g_print(" Line %ld column %ld - Double punctuation?\n",
2193 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2202 * check_for_spaced_quotes:
/*
 * Query every quote mark that has a space on both sides.
 * First pass: the ASCII double quote. Second pass: each code point in
 * single_quotes[], for which a " <quote> " search pattern is built in a
 * GString that is reused across iterations and freed at the end.
 * After a hit the search resumes two characters past the match start.
 * The column is the 1-based position of the leading space.
 */
2204 void check_for_spaced_quotes(const char *aline)
2208 const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,
2212 while ((t=strstr(s," \" ")))
2214 if (pswit[ECHO_SWITCH])
2215 g_print("\n%s\n",aline);
2216 if (!pswit[OVERVIEW_SWITCH])
2217 g_print(" Line %ld column %ld - Spaced doublequote?\n",
2218 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2221 s=g_utf8_next_char(g_utf8_next_char(t));
2223 pattern=g_string_new(NULL);
2224 for(i=0;i<G_N_ELEMENTS(single_quotes);i++)
/* Rebuild the pattern as space, quote code point (UTF-8 encoded), space. */
2226 g_string_assign(pattern," ");
2227 g_string_append_unichar(pattern,single_quotes[i]);
2228 g_string_append_c(pattern,' ');
2230 while ((t=strstr(s,pattern->str)))
2232 if (pswit[ECHO_SWITCH])
2233 g_print("\n%s\n",aline);
2234 if (!pswit[OVERVIEW_SWITCH])
2235 g_print(" Line %ld column %ld - Spaced singlequote?\n",
2236 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2239 s=g_utf8_next_char(g_utf8_next_char(t));
2242 g_string_free(pattern,TRUE);
2246 * check_for_miscased_genative:
2248 * Check special case of 'S instead of 's at end of word.
/*
 * Query an apostrophe that follows a lower-case letter and is itself
 * followed by a capital S.
 * The statements that advance pc and c are not visible in this listing.
 * The column adds 2 to the offset of s; presumably s is at the apostrophe
 * so the column points at the S - TODO confirm.
 */
2250 void check_for_miscased_genative(const char *aline)
2256 c=g_utf8_get_char(aline);
2257 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2258 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2262 nc=g_utf8_get_char(g_utf8_next_char(s));
2263 if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))
2265 if (pswit[ECHO_SWITCH])
2266 g_print("\n%s\n",aline);
2267 if (!pswit[OVERVIEW_SWITCH])
2268 g_print(" Line %ld column %ld - Capital \"S\"?\n",
2269 linecnt,g_utf8_pointer_to_offset(aline,s)+2);
2277 * check_end_of_line:
2279 * Now check special cases - start and end of line -
2280 * for single and double quotes. Start is sometimes [sic]
2281 * but better to query it anyway.
2282 * While we're here, check for dash at end of line.
/*
 * Start-of-line and end-of-line special cases, applied to lines of more
 * than one character:
 *  - last character is a single or double quote preceded by a space;
 *  - first character is a single quote followed by a space;
 *  - under PARANOID_SWITCH with warnings->hyphen, a lone '-' (not "--")
 *    as the last character after skipping trailing characters at or
 *    below CHAR_SPACE.
 * lbytes is the byte length; character counts come from g_utf8_strlen().
 * NOTE(review): the closing braces are not visible in this listing, so
 * whether the later checks sit inside the length test is not shown.
 */
2284 void check_end_of_line(const char *aline,struct warnings *warnings)
2289 lbytes=strlen(aline);
2290 if (g_utf8_strlen(aline,lbytes)>1)
/* s is the last character, c2 the one before it. */
2292 s=g_utf8_prev_char(aline+lbytes);
2293 c1=g_utf8_get_char(s);
2294 c2=g_utf8_get_char(g_utf8_prev_char(s));
2295 if ((CHAR_IS_DQUOTE(c1) || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)
2297 if (pswit[ECHO_SWITCH])
2298 g_print("\n%s\n",aline);
2299 if (!pswit[OVERVIEW_SWITCH])
2300 g_print(" Line %ld column %ld - Spaced quote?\n",linecnt,
2301 g_utf8_strlen(aline,lbytes));
2305 c1=g_utf8_get_char(aline);
2306 c2=g_utf8_get_char(g_utf8_next_char(aline));
2307 if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)
2309 if (pswit[ECHO_SWITCH])
2310 g_print("\n%s\n",aline);
2311 if (!pswit[OVERVIEW_SWITCH])
2312 g_print(" Line %ld column 1 - Spaced quote?\n",linecnt);
2317 * Dash at end of line may well be legit - paranoid mode only
2318 * and don't report em-dash at line-end.
2320 if (pswit[PARANOID_SWITCH] && warnings->hyphen)
2322 for (s=g_utf8_prev_char(aline+lbytes);
2323 s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
2325 if (g_utf8_get_char(s)=='-' &&
2326 g_utf8_get_char(g_utf8_prev_char(s))!='-')
2328 if (pswit[ECHO_SWITCH])
2329 g_print("\n%s\n",aline);
2330 if (!pswit[OVERVIEW_SWITCH])
2331 g_print(" Line %ld column %ld - "
2332 "Hyphen at end of line?\n",
2333 linecnt,g_utf8_pointer_to_offset(aline,s));
2340 * check_for_unspaced_bracket:
2342 * Brackets are often unspaced, but shouldn't be surrounded by alpha.
2343 * If so, suspect a scanno like "a]most".
/*
 * Query any bracket character from "{[()]}" that has a letter directly
 * before and directly after it.
 * The statements that advance pc and c are not visible in this listing.
 * NOTE(review): the column is the raw character offset of s with no +1,
 * unlike most other checks - confirm that this is intended.
 */
2345 void check_for_unspaced_bracket(const char *aline)
2349 c=g_utf8_get_char(aline);
2350 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2351 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2355 nc=g_utf8_get_char(g_utf8_next_char(s));
2358 /* for each bracket character in the line except 1st & last */
2359 if (g_utf8_strchr("{[()]}",-1,c) &&
2360 g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
2362 if (pswit[ECHO_SWITCH])
2363 g_print("\n%s\n",aline);
2364 if (!pswit[OVERVIEW_SWITCH])
2365 g_print(" Line %ld column %ld - Unspaced bracket?\n",
2366 linecnt,g_utf8_pointer_to_offset(aline,s));
2374 * check_for_unpunctuated_endquote:
/*
 * Query a double quote classed as closing or neutral that directly
 * follows a letter, i.e. quoted speech that ends without punctuation.
 * Characters that are not double quotes are given INVALID_QUOTE so they
 * can never match. The statements that advance pc and c are not visible
 * in this listing.
 * NOTE(review): the column is the raw character offset of s with no +1 -
 * confirm that this is intended.
 */
2376 void check_for_unpunctuated_endquote(const char *aline)
2381 c=g_utf8_get_char(aline);
2382 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2383 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2387 qc=CHAR_IS_DQUOTE(c)?QUOTE_CLASS(c):INVALID_QUOTE;
2388 nc=g_utf8_get_char(g_utf8_next_char(s));
2389 /* for each character in the line except 1st */
2390 if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) && g_unichar_isalpha(pc))
2392 if (pswit[ECHO_SWITCH])
2393 g_print("\n%s\n",aline);
2394 if (!pswit[OVERVIEW_SWITCH])
2395 g_print(" Line %ld column %ld - "
2396 "endquote missing punctuation?\n",
2397 linecnt,g_utf8_pointer_to_offset(aline,s));
2405 * check_for_html_tag:
2407 * Check for <HTML TAG>.
2409 * If there is a < in the line, followed at some point
2410 * by a > then we suspect HTML.
/*
 * Query the first '<' in aline when a '>' appears somewhere after it.
 * The text from '<' to '>' inclusive is copied into tag for the message.
 * The column is the 1-based character position of the '<'.
 */
2412 void check_for_html_tag(const char *aline)
2414 const char *open,*close;
2416 open=strchr(aline,'<');
/* The search for '>' starts one character after the '<'. */
2419 close=strchr(g_utf8_next_char(open),'>');
2422 if (pswit[ECHO_SWITCH])
2423 g_print("\n%s\n",aline);
2424 if (!pswit[OVERVIEW_SWITCH])
2426 tag=g_strndup(open,close-open+1);
2427 g_print(" Line %ld column %ld - HTML Tag? %s \n",
2428 linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
2438 * check_for_html_entity:
2440 * Check for &symbol; HTML.
2442 * If there is a & in the line, followed at
2443 * some point by a ; then we suspect HTML.
2445 void check_for_html_entity(const char *aline)
2447 const char *s,*amp,*scolon;
2449 amp=strchr(aline,'&');
2452 scolon=strchr(amp,';');
2455 for (s=amp;s<scolon;s=g_utf8_next_char(s))
2456 if (g_utf8_get_char(s)==CHAR_SPACE)
2457 break; /* Don't report "Jones & Son;" */
2460 if (pswit[ECHO_SWITCH])
2461 g_print("\n%s\n",aline);
2462 if (!pswit[OVERVIEW_SWITCH])
2464 entity=g_strndup(amp,scolon-amp+1);
2465 g_print(" Line %ld column %d - HTML symbol? %s \n",
2466 linecnt,(int)(amp-aline)+1,entity);
2477 * check_for_omitted_punctuation:
2479 * Check for omitted punctuation at end of paragraph by working back
2480 * through prevline. DW.
2481 * Need to check this only for "normal" paras.
2482 * So what is a "normal" para?
2483 * Not normal if one-liner (chapter headings, etc.)
2484 * Not normal if doesn't contain at least one locase letter
2485 * Not normal if starts with space
/*
 * Query a paragraph whose last line (prevline) ends without punctuation.
 * The check only runs when prevline contains at least one letter,
 * last->blen is greater than 2, the paragraph started before line
 * linecnt-1, and prevline does not begin with a space or control
 * character.
 * Trailing quote marks classed as closing or neutral are stepped over
 * first; then the line is walked backwards. Meeting a letter produces the
 * query, cited at line linecnt-1 with the length of prevline as column.
 * What happens on meeting one of the listed punctuation marks is on lines
 * not visible in this listing (presumably the walk stops - TODO confirm).
 * NOTE(review): start_para_line is declared int here; confirm it matches
 * the type used by the caller.
 */
2487 void check_for_omitted_punctuation(const char *prevline,
2488 struct line_properties *last,int start_para_line)
2490 gboolean letter_on_line=FALSE;
2493 gboolean closing_quote;
2494 for (s=prevline;*s;s=g_utf8_next_char(s))
2495 if (g_unichar_isalpha(g_utf8_get_char(s)))
2497 letter_on_line=TRUE;
2501 * This next "if" is a problem.
2502 * If we say "start_para_line <= linecnt - 1", that includes
2503 * one-line "paragraphs" like chapter heads. Lotsa false positives.
2504 * If we say "start_para_line < linecnt - 1" it doesn't, but then it
2505 * misses genuine one-line paragraphs.
2507 if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
2508 g_utf8_get_char(prevline)>CHAR_SPACE)
2510 s=prevline+strlen(prevline);
2513 s=g_utf8_prev_char(s);
2514 c=g_utf8_get_char(s);
2515 if (QUOTE_CLASS(c)==CLOSING_QUOTE || QUOTE_CLASS(c)==NEUTRAL_QUOTE)
2518 closing_quote=FALSE;
2519 } while (closing_quote && s>prevline);
2520 for (;s>prevline;s=g_utf8_prev_char(s))
2522 if (g_unichar_isalpha(g_utf8_get_char(s)))
2524 if (pswit[ECHO_SWITCH])
2525 g_print("\n%s\n",prevline);
2526 if (!pswit[OVERVIEW_SWITCH])
2527 g_print(" Line %ld column %ld - "
2528 "No punctuation at para end?\n",
2529 linecnt-1,g_utf8_strlen(prevline,-1));
2534 if (g_utf8_strchr("—-.:!([{?}])",-1,g_utf8_get_char(s)))
/*
 * Prints a note naming a queried word and how many times it was
 * duplicated. key is read as the word; the remaining arguments and the
 * return value are on lines not visible in this listing.
 * The (key, value, data) signature returning gboolean has the shape of a
 * GLib tree traversal callback - presumably it is run over the duplicate
 * counters; TODO confirm at the call site.
 */
2540 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
2542 const char *word=key;
2545 g_print("\nNote: Queried word %s was duplicated %d times\n",
/*
 * Output routine that converts string from UTF-8 to WINDOWS-1252 with
 * g_iconv() before printing.
 * The conversion descriptor is a function-local static; (GIConv)-1 marks
 * it as not open. It is opened on first use and kept for later calls.
 * The first visible branch closes and resets the descriptor; its guarding
 * condition is not visible in this listing (presumably a NULL string
 * requests cleanup - TODO confirm).
 * The output buffer is sized to the input byte length plus one.
 * The final fputs() writes string unconverted; its guard is not visible
 * (presumably the fallback when no converter could be opened).
 */
2550 void print_as_windows_1252(const char *string)
2552 gsize inbytes,outbytes;
2554 static GIConv converter=(GIConv)-1;
2557 if (converter!=(GIConv)-1)
2558 g_iconv_close(converter);
2559 converter=(GIConv)-1;
2562 if (converter==(GIConv)-1)
2563 converter=g_iconv_open("WINDOWS-1252","UTF-8");
2564 if (converter!=(GIConv)-1)
2566 inbytes=outbytes=strlen(string);
2567 bp=buf=g_malloc(outbytes+1);
2568 g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
2574 fputs(string,stdout);
/*
 * Output routine for UTF-8: writes string to stdout unchanged.
 */
2577 void print_as_utf_8(const char *string)
2579 fputs(string,stdout);
2587 void procfile(const char *filename)
/*
 * Checks one e-text file. Reads it with read_etext(), runs first_pass()
 * and report_first_pass(), then feeds each line returned by flgets()
 * through the per-line check_* routines, and finally reports duplicate
 * queries and releases the query trees and the counters.
 * NOTE(review): this listing omits many lines (braces, short statements,
 * some conditions); the comments below describe only what the visible
 * lines show.
 */
2590 gchar *parastart=NULL; /* first line of current para */
2591 gchar *etext,*aline;
2594 struct first_pass_results *first_pass_results;
2595 struct warnings *warnings;
2596 struct counters counters={0};
2597 struct line_properties last={0};
2598 struct parities parities={0};
2599 struct pending pending={0};
2600 gboolean isemptyline;
2601 long start_para_line=0;
2602 gboolean isnewpara=FALSE,enddash=FALSE;
2603 last.start=CHAR_SPACE;
/* Both line counters start from zero for this file. */
2604 linecnt=checked_linecnt=0;
2605 etext=read_etext(filename,&err);
/*
 * Print the read_etext() error message: to stdout when STDOUT_SWITCH is
 * set; the stderr call is presumably the else branch (not shown).
 */
2608 if (pswit[STDOUT_SWITCH])
2609 fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
2611 fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
2614 g_print("\n\nFile: %s\n\n",filename);
2615 first_pass_results=first_pass(etext);
2616 warnings=report_first_pass(first_pass_results);
/* qword frees both keys and values with g_free; qperiod frees only keys. */
2617 qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
2618 qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
2620 * Here we go with the main pass. Hold onto yer hat!
2624 while ((aline=flgets(&etext_ptr,linecnt+1)))
/*
 * flgets() was passed linecnt+1 as the line number to print in its own
 * line-ending messages.
 */
2629 if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
2630 continue; // skip DP page separators completely
/*
 * Lines before firstline, or after a positive footerline, are outside
 * the body text.
 */
2631 if (linecnt<first_pass_results->firstline ||
2632 (first_pass_results->footerline>0 &&
2633 linecnt>first_pass_results->footerline))
/* With HEADER_SWITCH, echo the Title/Author/Release Date/Edition lines. */
2635 if (pswit[HEADER_SWITCH])
2637 if (g_str_has_prefix(aline,"Title:"))
2638 g_print(" %s\n",aline);
2639 if (g_str_has_prefix(aline,"Author:"))
2640 g_print(" %s\n",aline);
2641 if (g_str_has_prefix(aline,"Release Date:"))
2642 g_print(" %s\n",aline);
2643 if (g_str_has_prefix(aline,"Edition:"))
2644 g_print(" %s\n\n",aline);
2646 continue; /* skip through the header */
2649 print_pending(aline,parastart,&pending);
/* The value returned by analyse_quotes() is used as the empty-line flag. */
2650 isemptyline=analyse_quotes(aline,&counters);
/* Runs on the first non-empty line seen while isnewpara is set. */
2651 if (isnewpara && !isemptyline)
2653 /* This line is the start of a new paragraph. */
2654 start_para_line=linecnt;
2655 /* Capture its first line in case we want to report it later. */
2657 parastart=g_strdup(aline);
2658 memset(&parities,0,sizeof(parities)); /* restart the quote count */
/* Skip forward over characters that are neither letters nor digits. */
2660 while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
2661 !g_unichar_isdigit(g_utf8_get_char(s)))
2662 s=g_utf8_next_char(s);
2663 if (g_unichar_islower(g_utf8_get_char(s)))
2665 /* and its first letter is lowercase */
2666 if (pswit[ECHO_SWITCH])
2667 g_print("\n%s\n",aline);
2668 if (!pswit[OVERVIEW_SWITCH])
2669 g_print(" Line %ld column %ld - "
2670 "Paragraph starts with lower-case\n",
2671 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2675 isnewpara=FALSE; /* Signal the end of new para processing. */
2677 /* Check for an em-dash broken at line end. */
2678 if (enddash && g_utf8_get_char(aline)=='-')
2680 if (pswit[ECHO_SWITCH])
2681 g_print("\n%s\n",aline);
2682 if (!pswit[OVERVIEW_SWITCH])
2683 g_print(" Line %ld column 1 - Broken em-dash?\n",linecnt);
/*
 * Step s back from the last character over trailing spaces, then test
 * for a '-' at that position.
 * NOTE(review): for an empty aline the starting g_utf8_prev_char() would
 * look before the buffer; whether empty lines can reach this point is not
 * visible here - confirm.
 */
2688 for (s=g_utf8_prev_char(aline+strlen(aline));
2689 g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
2691 if (s>=aline && g_utf8_get_char(s)=='-')
2693 check_for_control_characters(aline);
2695 check_for_odd_characters(aline,warnings,isemptyline);
/* Long-line and short-line checks run only when first-pass warnings enable them. */
2696 if (warnings->longline)
2697 check_for_long_line(aline);
2698 if (warnings->shortline)
2699 check_for_short_line(aline,&last);
/* Record this line's length in characters and its first character in last. */
2701 last.len=g_utf8_strlen(aline,-1);
2702 last.start=g_utf8_get_char(aline);
2703 check_for_starting_punctuation(aline);
2706 check_for_spaced_emdash(aline);
2707 check_for_spaced_dash(aline);
2709 check_for_unmarked_paragraphs(aline);
2710 check_for_jeebies(aline);
2711 check_for_mta_from(aline);
2712 check_for_orphan_character(aline);
2713 check_for_pling_scanno(aline);
2714 check_for_extra_period(aline,warnings);
2715 check_for_following_punctuation(aline);
2716 check_for_typos(aline,warnings);
2717 check_for_misspaced_punctuation(aline,&parities,isemptyline);
2718 check_for_double_punctuation(aline,warnings);
2719 check_for_spaced_quotes(aline);
2720 check_for_miscased_genative(aline);
2721 check_end_of_line(aline,warnings);
2722 check_for_unspaced_bracket(aline);
2723 if (warnings->endquote)
2724 check_for_unpunctuated_endquote(aline);
2725 check_for_html_tag(aline);
2726 check_for_html_entity(aline);
/*
 * Quote counts are checked and then reset here; the condition guarding
 * this (presumably "line is empty", i.e. end of paragraph) is not shown.
 */
2729 check_for_mismatched_quotes(&counters,&pending);
2730 counters_reset(&counters);
2731 /* let the next iteration know that it's starting a new para */
2734 check_for_omitted_punctuation(prevline,&last,start_para_line);
/* prevline takes a fresh copy of the current line. */
2737 prevline=g_strdup(aline);
/*
 * NOTE(review): the next three calls presumably run once after the main
 * loop has ended (the loop's closing brace is not shown) - confirm.
 */
2740 check_for_mismatched_quotes(&counters,&pending);
2741 print_pending(NULL,parastart,&pending);
2742 reset_pending(&pending);
/* The duplicate-query report is skipped in overview and verbose modes. */
2751 if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])
2752 g_tree_foreach(qword,report_duplicate_queries,NULL);
2753 g_tree_unref(qword);
2754 g_tree_unref(qperiod);
2755 counters_destroy(&counters);
/*
 * Reset the GLib print handler and call print_as_windows_1252() with
 * NULL (presumably so it releases its converter - confirm).
 */
2756 g_set_print_handler(NULL);
2757 print_as_windows_1252(NULL);
2758 if (pswit[MARKUP_SWITCH])
2765 * Get one line from the input text, checking for
2766 * the existence of exactly one CR/LF line-end per line.
2768 * Returns: a pointer to the line.
2770 char *flgets(char **etext,long lcnt)
/*
 * etext: in/out cursor into the text; it is advanced one UTF-8 character
 *        at a time while the line is scanned.
 * lcnt:  line number printed in the line-ending messages below.
 * theline remembers where the line started and is what gets
 * post-processed at the end.
 * NOTE(review): the loop structure, the tests on c and the return
 * statements are not visible in this listing.
 */
2773 gboolean isCR=FALSE;
2774 char *theline=*etext;
2779 c=g_utf8_get_char(*etext);
2780 *etext=g_utf8_next_char(*etext);
2783 /* either way, it's end of line */
2790 /* Error - a LF without a preceding CR */
2791 if (pswit[LINE_END_SWITCH])
2793 if (pswit[ECHO_SWITCH])
/* Echo the text of the line (theline up to eos) before the message. */
2795 s=g_strndup(theline,eos-theline);
2796 g_print("\n%s\n",s);
2799 if (!pswit[OVERVIEW_SWITCH])
2800 g_print(" Line %ld - No CR?\n",lcnt);
2811 /* Error - two successive CRs */
2812 if (pswit[LINE_END_SWITCH])
2814 if (pswit[ECHO_SWITCH])
2816 s=g_strndup(theline,eos-theline);
2817 g_print("\n%s\n",s);
2820 if (!pswit[OVERVIEW_SWITCH])
2821 g_print(" Line %ld - Two successive CRs?\n",lcnt);
/* Reported only when line-end checking is on and isCR is set. */
2830 if (pswit[LINE_END_SWITCH] && isCR)
2832 if (pswit[ECHO_SWITCH])
2834 s=g_strndup(theline,eos-theline);
2835 g_print("\n%s\n",s);
2838 if (!pswit[OVERVIEW_SWITCH])
2839 g_print(" Line %ld column %ld - CR without LF?\n",
2840 lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
2846 eos=g_utf8_next_char(eos);
/*
 * Optional in-place clean-up of the line: HTML markup and entities with
 * MARKUP_SWITCH, DP tags with DP_SWITCH.
 */
2850 if (pswit[MARKUP_SWITCH])
2851 postprocess_for_HTML(theline);
2852 if (pswit[DP_SWITCH])
2853 postprocess_for_DP(theline);
2860 * Takes a "word" as a parameter, and checks whether it
2861 * contains a mixture of alpha and digits. Generally, this is an
2862 * error, but may not be for cases like 4th or L5 12s. 3d.
2864 * Returns: TRUE iff an error is found.
2866 gboolean mixdigit(const char *checkword)
/*
 * checkword: NUL-terminated UTF-8 word to classify.
 * The word is first scanned for letters and digits; when both are
 * present, a list of accepted forms is tested.
 * NOTE(review): the flag assignments and the return statement are not
 * visible in this listing.
 */
2868 gboolean wehaveadigit,wehavealetter,query;
2869 const char *s,*nondigit;
2870 wehaveadigit=wehavealetter=query=FALSE;
/* Scan every character, testing for letters and for digits. */
2871 for (s=checkword;*s;s=g_utf8_next_char(s))
2872 if (g_unichar_isalpha(g_utf8_get_char(s)))
2874 else if (g_unichar_isdigit(g_utf8_get_char(s)))
2876 if (wehaveadigit && wehavealetter)
2878 /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
/* Move nondigit past the leading run of digits. */
2880 for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
2881 nondigit=g_utf8_next_char(nondigit))
2883 /* digits, ending in st, rd, nd, th of either case */
2884 if (!g_ascii_strcasecmp(nondigit,"st") ||
2885 !g_ascii_strcasecmp(nondigit,"rd") ||
2886 !g_ascii_strcasecmp(nondigit,"nd") ||
2887 !g_ascii_strcasecmp(nondigit,"th"))
/* The same ordinal suffixes followed by "s". */
2889 if (!g_ascii_strcasecmp(nondigit,"sts") ||
2890 !g_ascii_strcasecmp(nondigit,"rds") ||
2891 !g_ascii_strcasecmp(nondigit,"nds") ||
2892 !g_ascii_strcasecmp(nondigit,"ths"))
/* The same ordinal suffixes followed by "ly". */
2894 if (!g_ascii_strcasecmp(nondigit,"stly") ||
2895 !g_ascii_strcasecmp(nondigit,"rdly") ||
2896 !g_ascii_strcasecmp(nondigit,"ndly") ||
2897 !g_ascii_strcasecmp(nondigit,"thly"))
2899 /* digits, ending in l, L, s or d */
/* "l" is compared ignoring case; "s" and "d" are compared exactly. */
2900 if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
2901 !strcmp(nondigit,"d"))
2904 * L at the start of a number, representing Britsh pounds, like L500.
2905 * This is cute. We know the current word is mixed digit. If the first
2906 * letter is L, there must be at least one digit following. If both
2907 * digits and letters follow, we have a genuine error, else we have a
2908 * capital L followed by digits, and we accept that as a non-error.
2910 if (g_utf8_get_char(checkword)=='L' &&
2911 !mixdigit(g_utf8_next_char(checkword)))
/* The test above recurses on the word with its first character removed. */
2920 * Extracts the first/next "word" from the line, and returns it.
2921 * A word is defined as one English word unit--or at least that's the aim.
2922 * "ptr" is advanced to the position in the line where we will start
2923 * looking for the next word.
2925 * Returns: A newly-allocated string.
2927 gchar *getaword(const char **ptr)
/*
 * ptr: in/out; *ptr is advanced as characters are consumed.
 * The word is built in a GString and handed back through
 * g_string_free(word,FALSE), so the caller receives the character data.
 * NOTE(review): some statements (e.g. how s is initialised and how *ptr
 * is updated on the look-ahead path) are not visible in this listing.
 */
2932 word=g_string_new(NULL);
/* Skip leading characters that are neither digits nor letters, stopping at the NUL. */
2933 for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
2934 !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
2935 **ptr;*ptr=g_utf8_next_char(*ptr))
2938 * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
2939 * Especially yucky is the case of L1,000
2940 * This section looks for a pattern of characters including a digit
2941 * followed by a comma or period followed by one or more digits.
2942 * If found, it returns this whole pattern as a word; otherwise we discard
2943 * the results and resume our normal programming.
2946 for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
2947 g_unichar_isalpha(g_utf8_get_char(s)) ||
2948 g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
2949 g_string_append_unichar(word,g_utf8_get_char(s));
/*
 * word now holds the look-ahead run of digits, letters, ',' and '.'.
 * Starting at its second character, look for a '.' or ',' that directly
 * follows a digit.
 */
2952 for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))
2954 c=g_utf8_get_char(t);
2955 pc=g_utf8_get_char(g_utf8_prev_char(t));
2956 if ((c=='.' || c==',') && g_unichar_isdigit(pc))
/* Return the look-ahead text as the word. */
2959 return g_string_free(word,FALSE);
2963 /* we didn't find a punctuated number - do the regular getword thing */
2964 g_string_truncate(word,0);
/* Regular word: a run of digits, letters and apostrophes (CHAR_IS_APOSTROPHE). */
2965 c=g_utf8_get_char(*ptr);
2966 for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);
2967 *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))
2968 g_string_append_unichar(word,c);
2969 return g_string_free(word,FALSE);
2975 * Is this word a Roman Numeral?
2977 * It doesn't actually validate that the number is a valid Roman Numeral--for
2978 * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
2979 * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
2980 * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
2981 * expressions thereof, except when it came to taxes. Allow any number of M,
2982 * an optional D, an optional CM or CD, any number of optional Cs, an optional
2983 * XL or an optional XC, an optional IX or IV, an optional V and any number
2986 gboolean isroman(const char *t)
/*
 * t: the word to test. Every literal compared against below is
 * lower-case. Each visible test matches one optional component of the
 * numeral, from the thousands down to the units.
 * NOTE(review): the statements that advance t after a match and the
 * return statement are not visible in this listing.
 */
/* Any number of 'm'. */
2992 while (g_utf8_get_char(t)=='m' && *t)
/* Optional 'd', then optional "cm" or "cd". */
2994 if (g_utf8_get_char(t)=='d')
2996 if (g_str_has_prefix(t,"cm"))
2998 if (g_str_has_prefix(t,"cd"))
/* Any number of 'c'. */
3000 while (g_utf8_get_char(t)=='c' && *t)
/* Optional "xl" or "xc", then optional 'l'. */
3002 if (g_str_has_prefix(t,"xl"))
3004 if (g_str_has_prefix(t,"xc"))
3006 if (g_utf8_get_char(t)=='l')
/* Any number of 'x'. */
3008 while (g_utf8_get_char(t)=='x' && *t)
/* Optional "ix" or "iv", then optional 'v'. */
3010 if (g_str_has_prefix(t,"ix"))
3012 if (g_str_has_prefix(t,"iv"))
3014 if (g_utf8_get_char(t)=='v')
/* Any number of 'i'. */
3016 while (g_utf8_get_char(t)=='i' && *t)
3022 * postprocess_for_DP:
3024 * Invoked with the -d switch from flgets().
3025 * It simply "removes" from the line a hard-coded set of common
3026 * DP-specific tags, so that the line passed to the main routine has
3027 * been pre-cleaned of DP markup.
3029 void postprocess_for_DP(char *theline)
/*
 * theline is edited in place. DPmarkup is walked until its first empty
 * string; every occurrence of each entry is cut out of the line.
 */
3035 for (i=0;*DPmarkup[i];i++)
3036 while ((s=strstr(theline,DPmarkup[i])))
/* t is the text just after the tag; move it (with its NUL) over the tag. */
3038 t=s+strlen(DPmarkup[i]);
3039 memmove(s,t,strlen(t)+1);
3044 * postprocess_for_HTML:
3046 * Invoked with the -m switch from flgets().
3047 * It simply "removes" from the line a hard-coded set of common
3048 * HTML tags and "replaces" a hard-coded set of common HTML
3049 * entities, so that the line passed to the main routine has
3050 * been pre-cleaned of HTML.
3052 void postprocess_for_HTML(char *theline)
/*
 * Calls losemarkup() on the line for as long as it returns non-NULL,
 * then calls loseentities() on the line once.
 */
3054 while (losemarkup(theline))
3056 loseentities(theline);
3059 char *losemarkup(char *theline)
/*
 * Looks for the first '<' in theline and the first '>' at or after it.
 * When the text following '<' matches an entry of markup (as judged by
 * tagcomp()), the span from '<' through '>' is removed in place.
 * NOTE(review): the return statements are not visible in this listing.
 */
3063 s=strchr(theline,'<');
3064 t=s?strchr(s,'>'):NULL;
/* markup is walked until its first empty string. */
3067 for (i=0;*markup[i];i++)
3068 if (tagcomp(g_utf8_next_char(s),markup[i]))
/* Step t past '>' and move the rest of the line (with its NUL) down to s. */
3070 t=g_utf8_next_char(t);
3071 memmove(s,t,strlen(t)+1);
3074 /* It's an unrecognized <xxx>. */
3078 void loseentities(char *theline)
/*
 * Replaces HTML entities in theline in place. Numeric forms are parsed
 * from amp+2 (decimal) or from amp+3 after an 'x' (hexadecimal); other
 * names are looked up in a tree built from HTMLentities.
 * NOTE(review): many lines are absent from this listing (including the
 * tests that select between the branches below); confirm details against
 * the full file.
 * NOTE(review): entities is declared without `static` on the visible
 * line, while the two converters next to it are static - confirm whether
 * the tree is meant to persist between calls.
 */
3085 GTree *entities=NULL;
3086 static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
/* Tear-down: destroy the tree and close whichever converters are open. */
3090 g_tree_destroy(entities);
3092 if (translit!=(GIConv)-1)
3093 g_iconv_close(translit);
3094 translit=(GIConv)-1;
3095 if (to_utf8!=(GIConv)-1)
3096 g_iconv_close(to_utf8);
/* Build the name -> code point lookup from the HTMLentities table. */
3104 entities=g_tree_new((GCompareFunc)strcmp);
3105 for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
3106 g_tree_insert(entities,HTMLentities[i].name,
3107 GUINT_TO_POINTER(HTMLentities[i].c));
/* Open the two converters if they are not open yet. */
3109 if (translit==(GIConv)-1)
3110 translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
3111 if (to_utf8==(GIConv)-1)
3112 to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
/* Handle each '&' found in the (remaining) line. */
3113 while((amp=strchr(theline,'&')))
3115 scolon=strchr(amp,';');
/* Decimal: everything from amp+2 up to the ';' must be digits. */
3120 if (amp+2+strspn(amp+2,"0123456789")==scolon)
3121 c=strtol(amp+2,NULL,10);
/* Hexadecimal: 'x' at amp[2], then hex digits up to the ';'. */
3122 else if (amp[2]=='x' &&
3123 amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
3124 c=strtol(amp+3,NULL,16);
/* Named entity: look up the text between '&' and ';'. */
3128 s=g_strndup(amp+1,scolon-(amp+1));
3129 c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
/* && binds tighter than ||: true for c<128 or for 192..255. */
3138 if (c<128 || c>=192 && c<=255) /* An ISO-8859-1 character */
3139 theline+=g_unichar_to_utf8(c,theline);
/*
 * Other code points: encode as UTF-8, convert through the translit
 * converter and back through to_utf8, then copy the result into the line.
 */
3143 nb=g_unichar_to_utf8(c,s);
3144 t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
3146 s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
3148 memcpy(theline,s,nb);
/* Move the text after the ';' (with its NUL) up to the write position. */
3152 memmove(theline,g_utf8_next_char(scolon),
3153 strlen(g_utf8_next_char(scolon))+1);
/* Continue the search just after this '&'. */
3156 theline=g_utf8_next_char(amp);
3160 gboolean tagcomp(const char *strin,const char *basetag)
/*
 * Case-insensitive prefix test: both strings are casefolded and the
 * result of "strin starts with basetag" is stored in retval. A leading
 * '/' in strin is skipped first.
 * NOTE(review): the release of s and t and the return statement are not
 * visible in this listing.
 */
3164 if (g_utf8_get_char(strin)=='/')
3165 t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
3167 t=g_utf8_casefold(strin,-1);
3168 s=g_utf8_casefold(basetag,-1);
3169 retval=g_str_has_prefix(t,s);
3175 void proghelp(GOptionContext *context)
3178 fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
3179 fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
3180 fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
3181 fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
3182 "For details, read the file COPYING.\n",stderr);
3183 fputs("This is Free Software; "
3184 "you may redistribute it under certain conditions (GPL);\n",stderr);
3185 fputs("read the file COPYING for details.\n\n",stderr);
3186 help=g_option_context_get_help(context,TRUE,NULL);
3189 fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
3190 fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
3191 "non-ASCII\n",stderr);
3192 fputs("characters like accented letters, "
3193 "lines longer than 75 or shorter than 55,\n",stderr);
3194 fputs("unbalanced quotes or brackets, "
3195 "a variety of badly formatted punctuation, \n",stderr);
3196 fputs("HTML tags, some likely typos. "
3197 "It is NOT a substitute for human judgement.\n",stderr);