1 /*************************************************************************/
2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com> */
5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk> */
7 /* This program is free software; you can redistribute it and/or modify */
8 /* it under the terms of the GNU General Public License as published by */
9 /* the Free Software Foundation; either version 2 of the License, or */
10 /* (at your option) any later version. */
12 /* This program is distributed in the hope that it will be useful, */
13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
15 /* GNU General Public License for more details. */
17 /* You should have received a copy of the GNU General Public License */
18 /* along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 /*************************************************************************/
30 #include "bookloupe.h"
33 #include "HTMLentities.h"
39 "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
40 "nad", "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa",
41 "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
42 "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
43 "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
44 "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
45 "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
46 "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
47 "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
48 "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
49 "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
50 "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
51 "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
52 "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
53 "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
54 "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
55 "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
56 "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
57 "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
58 "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
59 "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
60 "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
61 "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
62 "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
63 "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
64 "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
65 "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
66 "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
67 "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
73 /* Common abbreviations and other OK words not to query as typos. */
75 "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
76 "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
77 "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
78 "outbid", "outbids", "frostbite", "frostbitten", ""
81 /* Common abbreviations that cause otherwise unexplained periods. */
83 "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
84 "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
88 * Two-Letter combinations that rarely if ever start words,
89 * but are common scannos or otherwise common letter combinations.
92 "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
96 * Two-Letter combinations that rarely if ever end words,
97 * but are common scannos or otherwise common letter combinations.
100 "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
101 "sw", "gr", "sl", "cl", "iy", ""
105 "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
106 "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
107 "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
108 "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
112 "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
116 "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
117 "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
118 "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
119 "during", "let", "toward", "among", ""
123 "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
124 "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
125 "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
126 "among", "those", "into", "whom", "having", "thence", ""
/*
 * One flag per command-line switch, indexed by the *_SWITCH constants.
 * The option table below stores its parse results directly in this array.
 */
129 gboolean pswit[SWITNO]; /* program switches */
/*
 * Command-line option table handed to g_option_context_add_main_entries()
 * by parse_options().  Every entry is a plain flag (G_OPTION_ARG_NONE)
 * whose storage target is a slot of pswit[].  parse_options() inverts the
 * "relaxed", "line-end" and "noecho" flags after parsing, so giving one of
 * those options switches the corresponding behaviour OFF.
 */
131 static GOptionEntry options[]={
132 { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
133 "Ignore DP-specific markup", NULL },
134 { "noecho", 'e', 0, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
135 "Don't echo queried line", NULL },
136 { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
137 "Check single quotes", NULL },
138 { "typo", 't', 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
139 "Check common typos", NULL },
140 { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
141 "Require closure of quotes on every paragraph", NULL },
142 { "relaxed", 'x', 0, G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
143 "Disable paranoid querying of everything", NULL },
144 { "line-end", 'l', 0, G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
145 "Disable line end checking", NULL },
146 { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
147 "Overview: just show counts", NULL },
148 { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
149 "Output errors to stdout instead of stderr", NULL },
150 { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
151 "Echo header fields", NULL },
152 { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
153 "Ignore markup in < >", NULL },
154 { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
155 "Use file of user-defined typos", NULL },
156 { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
157 "Defaults for use on www upload", NULL },
158 { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
159 "Verbose - list everything", NULL },
/*
 * Running totals kept at file scope.  When OVERVIEW_SWITCH is set, main()
 * prints linecnt, checked_linecnt and most of the cnt_* counters (plus
 * their sum) as the overview summary instead of individual queries.
 */
163 long cnt_quote; /* for overview mode, count of quote queries */
164 long cnt_brack; /* for overview mode, count of brackets queries */
165 long cnt_bin; /* for overview mode, count of non-ASCII queries */
166 long cnt_odd; /* for overview mode, count of odd character queries */
167 long cnt_long; /* for overview mode, count of long line errors */
168 long cnt_short; /* for overview mode, count of short line queries */
169 long cnt_punct; /* for overview mode,
170 count of punctuation and spacing queries */
171 long cnt_dash; /* for overview mode, count of dash-related queries */
172 long cnt_word; /* for overview mode, count of word queries */
173 long cnt_html; /* for overview mode, count of html queries */
174 long cnt_lineend; /* for overview mode, count of line-end queries */
175 long cnt_spacend; /* count of lines with space at end */
176 long linecnt; /* count of total lines in the file */
177 long checked_linecnt; /* count of lines actually checked */
/*
 * Forward declarations.  The definitions of these functions are not part
 * of the portion of the file shown here.
 */
179 void proghelp(GOptionContext *context);
180 void procfile(const char *);
184 gboolean mixdigit(const char *);
185 gchar *getaword(const char **);
186 char *flgets(char **,long,int);
187 void postprocess_for_HTML(char *);
188 char *linehasmarkup(char *);
189 char *losemarkup(char *);
190 gboolean tagcomp(const char *,const char *);
191 void loseentities(char *);
192 gboolean isroman(const char *);
193 void postprocess_for_DP(char *);
194 void print_as_windows_1252(const char *string);
195 void print_as_utf_8(const char *string);
/*
 * NOTE(review): presumably trees of words / periods that have already been
 * queried; they are not referenced in the visible code -- confirm.
 */
197 GTree *qword,*qperiod;
/*
 * parse_options:
 *
 * Parse the command line into pswit[] using the options[] table, then
 * post-process the switches: several of them are "negative" options whose
 * stored value has to be inverted, and some switches override others.
 *
 * argc/argv are passed by address because g_option_context_parse()
 * removes the options it recognises from them.
 */
203 void parse_options(int *argc,char ***argv)
206 GOptionContext *context;
207 context=g_option_context_new(
208 "file - looks for errors in Project Gutenberg(TM) etexts");
209 g_option_context_add_main_entries(context,options,NULL);
/* On a parse failure report GLib's message and point the user at --help. */
210 if (!g_option_context_parse(context,argc,argv,&err))
212 g_printerr("Bookloupe: %s\n",err->message);
213 g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
216 /* Paranoid checking is turned OFF, not on, by its switch */
217 pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
218 if (pswit[PARANOID_SWITCH])
219 /* if running in paranoid mode, typo checks default to enabled */
/* (so in paranoid mode giving --typo switches the typo checks off) */
220 pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
221 /* Line-end checking is turned OFF, not on, by its switch */
222 pswit[LINE_END_SWITCH]=!pswit[LINE_END_SWITCH];
223 /* Echoing is turned OFF, not on, by its switch */
224 pswit[ECHO_SWITCH]=!pswit[ECHO_SWITCH];
225 if (pswit[OVERVIEW_SWITCH])
226 /* just print summary; don't echo */
227 pswit[ECHO_SWITCH]=FALSE;
229 * Web uploads - for the moment, this is really just a placeholder
230 * until we decide what processing we really want to do on web uploads
/* --web replaces every other switch with a fixed set of values. */
232 if (pswit[WEB_SWITCH])
234 /* specific override for web uploads */
235 pswit[ECHO_SWITCH]=TRUE;
236 pswit[SQUOTE_SWITCH]=FALSE;
237 pswit[TYPO_SWITCH]=TRUE;
238 pswit[QPARA_SWITCH]=FALSE;
239 pswit[PARANOID_SWITCH]=TRUE;
240 pswit[LINE_END_SWITCH]=FALSE;
241 pswit[OVERVIEW_SWITCH]=FALSE;
242 pswit[STDOUT_SWITCH]=FALSE;
243 pswit[HEADER_SWITCH]=TRUE;
244 pswit[VERBOSE_SWITCH]=FALSE;
245 pswit[MARKUP_SWITCH]=FALSE;
246 pswit[USERTYPO_SWITCH]=FALSE;
247 pswit[DP_SWITCH]=FALSE;
254 g_option_context_free(context);
260 * Read in the user-defined stealth scanno list.
/*
 * The list is looked for, in this order, as:
 *   bookloupe.typ                (relative to the current directory)
 *   <running_from>/bookloupe.typ
 *   gutcheck.typ                 (relative to the current directory)
 *   <running_from>/gutcheck.typ
 * The next candidate is tried only when the previous attempt failed with
 * G_FILE_ERROR_NOENT.  The file contents are then brought to UTF-8, split
 * into lines, and each line is stored as a key of the usertypo tree.
 */
262 void read_user_scannos(void)
265 gchar *usertypo_file;
269 gchar *contents,*utf8,**lines;
270 usertypo_file=g_strdup("bookloupe.typ");
271 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
272 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
275 g_free(usertypo_file);
276 usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
277 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
279 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
282 g_free(usertypo_file);
283 usertypo_file=g_strdup("gutcheck.typ");
284 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
286 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
289 g_free(usertypo_file);
290 usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
291 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
/* None of the four candidates exists: tell the user and carry on. */
293 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
295 g_free(usertypo_file);
296 g_print(" --> I couldn't find bookloupe.typ "
297 "-- proceeding without user typos.\n");
/* Any other read error is reported together with the file name tried. */
302 fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
303 g_free(usertypo_file);
/*
 * Input that is already valid UTF-8 is only normalized; anything else is
 * converted on the assumption that it is Windows-1252.
 * NOTE(review): no check of the g_convert() result is visible here -- a
 * failed conversion would hand NULL to g_strsplit_set(); confirm.
 */
307 if (g_utf8_validate(contents,len,NULL))
308 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
310 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
/* Splitting on either CR or LF copes with any line-ending convention. */
312 lines=g_strsplit_set(utf8,"\r\n",0);
/*
 * The tree takes ownership of the inserted line strings (g_free is the
 * key destroy function); the stored value is just a non-NULL marker.
 * NOTE(review): strcmp is cast to GCompareDataFunc and will therefore be
 * called with an extra user_data argument -- works on common ABIs but is
 * not strictly conforming C.
 */
314 usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
315 for (i=0;lines[i];i++)
/* Only lines whose first byte is greater than '!' are stored. */
316 if (*(unsigned char *)lines[i]>'!')
317 g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
326 * Read an etext returning a newly allocated string containing the file
327 * contents or NULL on error.
/*
 * The text is returned as UTF-8.  Input that validates as UTF-8 is
 * normalized; anything else is converted from Windows-1252.  As a side
 * effect the GLib print handler (and the console output code page) is
 * switched to match the encoding detected in the input.
 *
 * filename: path of the etext to load.
 * err:      receives the GError describing a read or conversion failure.
 */
329 gchar *read_etext(const char *filename,GError **err)
331 GError *tmp_err=NULL;
332 gchar *contents,*utf8;
333 gsize len,bytes_read,bytes_written;
335 if (!g_file_get_contents(filename,&contents,&len,err))
337 if (g_utf8_validate(contents,len,NULL))
339 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
340 g_set_print_handler(print_as_utf_8);
/* NOTE(review): SetConsoleOutputCP is a Windows console call; the guard
 * around it is presumably a platform #ifdef not visible here -- confirm. */
342 SetConsoleOutputCP(CP_UTF8);
347 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,
348 &bytes_written,&tmp_err);
/*
 * On an illegal input sequence bytes_read is the offset of the offending
 * byte; the bytes before it are scanned for LF (and non-CR bytes) so that
 * the error can be reported in terms of line and column.
 */
349 if (g_error_matches(tmp_err,G_CONVERT_ERROR,
350 G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
353 for(i=0;i<bytes_read;i++)
354 if (contents[i]=='\n')
359 else if (contents[i]!='\r')
361 g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
362 "Input conversion failed. Byte %d at line %d, column %d is not a "
363 "valid Windows-1252 character",
364 ((unsigned char *)contents)[bytes_read],line,col);
/* Hands a conversion error held in tmp_err on to the caller unchanged. */
367 g_propagate_error(err,tmp_err);
368 g_set_print_handler(print_as_windows_1252);
370 SetConsoleOutputCP(1252);
/*
 * cleanup_on_exit:
 *
 * atexit() handler registered by main().  Puts the console output code
 * page back to saved_cp, the value main() obtained from
 * GetConsoleOutputCP() at start-up, undoing the changes made by
 * read_etext().
 */
377 void cleanup_on_exit(void)
380 SetConsoleOutputCP(saved_cp);
/*
 * main:
 *
 * Program entry point: sets up console clean-up, records the directory
 * the program was started from (used to locate the user typo file),
 * parses the command line and, in overview mode, prints the summary of
 * query counts.
 */
384 int main(int argc,char **argv)
/* Remember the console code page so cleanup_on_exit() can restore it. */
387 atexit(cleanup_on_exit);
388 saved_cp=GetConsoleOutputCP();
390 running_from=g_path_get_dirname(argv[0]);
391 parse_options(&argc,&argv);
/* NOTE(review): the statement controlled by this test is not visible
 * here; presumably it calls read_user_scannos() -- confirm. */
392 if (pswit[USERTYPO_SWITCH])
394 fprintf(stderr,"bookloupe: Check and report on an e-text\n");
/* Overview mode: print totals rather than individual queries. */
396 if (pswit[OVERVIEW_SWITCH])
398 g_print(" Checked %ld lines of %ld (head+foot = %ld)\n\n",
399 checked_linecnt,linecnt,linecnt-checked_linecnt);
400 g_print(" --------------- Queries found --------------\n");
402 g_print(" Long lines: %14ld\n",cnt_long);
404 g_print(" Short lines: %14ld\n",cnt_short);
406 g_print(" Line-end problems: %14ld\n",cnt_lineend);
408 g_print(" Common typos: %14ld\n",cnt_word);
410 g_print(" Unmatched quotes: %14ld\n",cnt_quote);
412 g_print(" Unmatched brackets: %14ld\n",cnt_brack);
414 g_print(" Non-ASCII characters: %14ld\n",cnt_bin);
416 g_print(" Proofing characters: %14ld\n",cnt_odd);
418 g_print(" Punctuation & spacing queries: %14ld\n",cnt_punct);
420 g_print(" Non-standard dashes: %14ld\n",cnt_dash);
422 g_print(" Possible HTML tags: %14ld\n",cnt_html);
/* cnt_spacend is deliberately not part of the total. */
424 g_print(" TOTAL QUERIES %14ld\n",
425 cnt_quote+cnt_brack+cnt_bin+cnt_odd+cnt_long+cnt_short+cnt_punct+
426 cnt_dash+cnt_word+cnt_html+cnt_lineend);
428 g_free(running_from);
430 g_tree_unref(usertypo);
/*
 * count_dashes:
 *
 * Classify every occurrence of <dash> in <line> by the characters on
 * either side of it and accumulate the outcome in <results>.
 *
 * line:    the line to examine (UTF-8).
 * dash:    the dash string to look for (first_pass() passes "--" and the
 *          Unicode em-dash).
 * results: counters updated in place; the caller zeroes them beforehand.
 *
 * The line is split on <dash>, so dash number i sits between tokens[i-1]
 * and tokens[i]: pc is the last character before it and nc the first
 * character after it.
 */
434 void count_dashes(const char *line,const char *dash,
435 struct dash_results *results)
440 gboolean spaced=FALSE,unspaced=FALSE,spaced2=FALSE;
443 tokens=g_strsplit(line,dash,0);
446 for(i=1;tokens[i];i++)
/*
 * A dash at the start of the line, or one directly following another
 * dash, leaves tokens[i-1] empty.  There is then no previous character:
 * stepping back from the end of an empty token would read the byte in
 * front of the allocation, so use 0 (which is what nc becomes for an
 * empty following token).
 */
448 pc=*tokens[i-1]?g_utf8_get_char(g_utf8_prev_char(tokens[i-1]+strlen(tokens[i-1]))):0;
449 nc=g_utf8_get_char(tokens[i]);
/* Space on at least one side ... */
450 if (g_unichar_isspace(pc) || g_unichar_isspace(nc))
/* ... on both sides ... */
452 if (g_unichar_isspace(pc) && g_unichar_isspace(nc))
/* ... or on neither side. */
454 else if (!g_unichar_isspace(pc) && !g_unichar_isspace(nc))
460 /* count of lines with em-dashes with spaces both sides */
461 results->non_PG_space++;
463 /* count of lines with PG-type em-dashes with no spaces */
471 * Run a first pass - verify that it's a valid PG
472 * file, decide whether to report some things that
473 * occur many times in the text like long or short
474 * lines, non-standard dashes, etc.
476 struct first_pass_results *first_pass(const char *etext)
478 gunichar laststart=CHAR_SPACE;
483 unsigned int lastlen=0,lastblen=0;
484 long spline=0,nspline=0;
485 static struct first_pass_results results={0};
486 struct dash_results tmp_dash_results;
489 lines=g_strsplit(etext,"\n",0);
492 /* An empty etext has no terminators */
493 results.newlines=DOS_NEWLINES;
498 * If there are no LFs, we don't have UNIX-style
499 * terminators, but we might have OS9-style ones.
501 results.newlines=OS9_NEWLINES;
503 lines=g_strsplit(etext,"\r",0);
504 if (!lines[0] || !lines[1])
505 /* Looks like we don't have any terminators at all */
506 results.newlines=DOS_NEWLINES;
510 /* We might have UNIX-style terminators */
511 results.newlines=UNIX_NEWLINES;
513 for (j=0;lines[j];j++)
515 lbytes=strlen(lines[j]);
516 if (lbytes>0 && lines[j][lbytes-1]=='\r')
518 results.newlines=DOS_NEWLINES;
521 lines[j][--lbytes]='\0';
522 } while (lbytes>0 && lines[j][lbytes-1]=='\r');
524 llen=g_utf8_strlen(lines[j],lbytes);
526 if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
527 (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
530 g_print(" --> Duplicate header?\n");
531 spline=linecnt+1; /* first line of non-header text, that is */
533 if (!strncmp(lines[j],"*** START",9) &&
534 strstr(lines[j],"PROJECT GUTENBERG"))
537 g_print(" --> Duplicate header?\n");
538 nspline=linecnt+1; /* first line of non-header text, that is */
540 if (spline || nspline)
542 lc_line=g_utf8_strdown(lines[j],lbytes);
543 if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
545 if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
547 if (results.footerline)
549 /* it's an old-form header - we can detect duplicates */
551 g_print(" --> Duplicate footer?\n");
554 results.footerline=linecnt;
560 results.firstline=spline;
562 results.firstline=nspline; /* override with new */
563 if (results.footerline)
564 continue; /* don't count the boilerplate in the footer */
565 results.totlen+=llen;
566 for (s=lines[j];*s;s=g_utf8_next_char(s))
568 if (g_utf8_get_char(s)>127)
570 if (g_unichar_isalpha(g_utf8_get_char(s)))
574 if (CHAR_IS_DQUOTE(g_utf8_get_char(s)))
575 qc=QUOTE_CLASS(g_utf8_get_char(s));
578 if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) &&
579 g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
580 results.endquote_count++;
583 if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
584 lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
587 g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
589 if (strstr(lines[j],".,"))
591 /* only count ast lines for ignoring purposes where there is */
592 /* locase text on the line */
593 if (strchr(lines[j],'*'))
595 for (s=lines[j];*s;s=g_utf8_next_char(s))
596 if (g_unichar_islower(g_utf8_get_char(s)))
601 if (strchr(lines[j],'/'))
602 results.fslashline++;
605 for (s=g_utf8_prev_char(lines[j]+lbytes);
606 s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;
607 s=g_utf8_prev_char(s))
609 if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
610 g_utf8_get_char(g_utf8_prev_char(s))!='-')
613 if (llen>LONGEST_PG_LINE)
615 if (llen>WAY_TOO_LONG)
616 results.verylongline++;
617 if (strchr(lines[j],'<') && strchr(lines[j],'>'))
619 i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
622 if (strstr(lines[j],"<i>"))
623 results.htmcount+=4; /* bonus marks! */
625 /* Check for spaced em-dashes */
626 memset(&tmp_dash_results,0,sizeof(tmp_dash_results));
627 count_dashes(lines[j],"--",&tmp_dash_results);
628 count_dashes(lines[j],"—",&tmp_dash_results);
629 if (tmp_dash_results.base)
630 results.emdash.base++;
631 if (tmp_dash_results.non_PG_space)
632 results.emdash.non_PG_space++;
633 if (tmp_dash_results.PG_space)
634 results.emdash.PG_space++;
638 if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))
639 results.Dutchcount++;
640 if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))
641 results.Frenchcount++;
642 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
643 results.standalone_digit++;
646 /* Check for spaced dashes */
647 if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
651 laststart=lines[j][0];
660 * Make some snap decisions based on the first pass results.
/*
 * results: statistics gathered by first_pass().
 *
 * Returns a pointer to a function-static warnings structure saying which
 * classes of query are worth reporting individually.  For every class
 * that is switched off because it occurs too often, one summary line is
 * printed instead.  May also switch pswit[MARKUP_SWITCH] on when the
 * text looks like HTML, and clears results->firstline when the header
 * and footer positions are implausibly close together.
 */
662 struct warnings *report_first_pass(struct first_pass_results *results)
664 static struct warnings warnings={0};
665 warnings.newlines=results->newlines;
666 if (warnings.newlines==UNIX_NEWLINES)
667 g_print(" --> No lines in this file have a CR. Not reporting them. "
668 "Project Gutenberg requires that all lineends be CR-LF.\n");
669 else if (warnings.newlines==OS9_NEWLINES)
670 g_print(" --> No lines in this file have a LF. Not reporting them. "
671 "Project Gutenberg requires that all lineends be CR-LF.\n");
673 g_print(" --> %ld lines in this file have white space at end\n",
676 if (results->dotcomma>5)
679 g_print(" --> %ld lines in this file contain '.,'. "
680 "Not reporting them.\n",results->dotcomma);
683 * If more than 50 lines, or one-tenth, are short,
684 * don't bother reporting them.
686 warnings.shortline=1;
687 if (results->shortline>50 || results->shortline*10>linecnt)
689 warnings.shortline=0;
690 g_print(" --> %ld lines in this file are short. "
691 "Not reporting short lines.\n",results->shortline);
694 * If more than 50 lines, or one-tenth, are long,
695 * don't bother reporting them.
698 if (results->longline>50 || results->longline*10>linecnt)
701 g_print(" --> %ld lines in this file are long. "
702 "Not reporting long lines.\n",results->longline);
704 /* If more than 10 lines contain asterisks, don't bother reporting them. */
706 if (results->astline>10)
709 g_print(" --> %ld lines in this file contain asterisks. "
710 "Not reporting them.\n",results->astline);
713 * If more than 10 lines contain forward slashes,
714 * don't bother reporting them.
717 if (results->fslashline>10)
720 g_print(" --> %ld lines in this file contain forward slashes. "
721 "Not reporting them.\n",results->fslashline);
724 * If more than 20 lines contain unpunctuated endquotes,
725 * don't bother reporting them.
728 if (results->endquote_count>20)
731 g_print(" --> %ld lines in this file contain unpunctuated endquotes. "
732 "Not reporting them.\n",results->endquote_count);
735 * If more than 10 lines contain standalone digits,
736 * don't bother reporting them.
739 if (results->standalone_digit>10)
742 g_print(" --> %ld lines in this file contain standalone 0s and 1s. "
743 "Not reporting them.\n",results->standalone_digit);
746 * If more than 20 lines contain hyphens at end,
747 * don't bother reporting them.
750 if (results->hyphens>20)
753 g_print(" --> %ld lines in this file have hyphens at end. "
754 "Not reporting them.\n",results->hyphens);
/* Enough markup-like lines: treat the text as HTML from here on. */
756 if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
758 g_print(" --> Looks like this is HTML. Switching HTML mode ON.\n");
759 pswit[MARKUP_SWITCH]=1;
761 if (results->verylongline>0)
762 g_print(" --> %ld lines in this file are VERY long!\n",
763 results->verylongline);
765 * If there are more non-PG spaced dashes than PG em-dashes,
766 * assume it's deliberate.
767 * Current PG guidelines say don't use them, but older texts do,
768 * and some people insist on them whatever the guidelines say.
771 if (results->spacedash+results->emdash.non_PG_space>
772 results->emdash.PG_space)
775 g_print(" --> There are %ld spaced dashes and em-dashes. "
776 "Not reporting them.\n",
777 results->spacedash+results->emdash.non_PG_space);
779 /* If more than a quarter of characters are hi-bit, bug out. */
781 if (results->binlen*4>results->totlen)
783 g_print(" --> This file does not appear to be ASCII. "
784 "Terminating. Best of luck with it!\n");
/* Likewise if fewer than a quarter of the characters are letters. */
787 if (results->alphalen*4<results->totlen)
789 g_print(" --> This file does not appear to be text. "
790 "Terminating. Best of luck with it!\n");
/* More than 1% (or more than 100) hi-bit characters: stop querying them. */
793 if (results->binlen*100>results->totlen || results->binlen>100)
795 g_print(" --> There are a lot of foreign letters here. "
796 "Not reporting them.\n");
799 warnings.isDutch=FALSE;
800 if (results->Dutchcount>50)
802 warnings.isDutch=TRUE;
803 g_print(" --> This looks like Dutch - "
804 "switching off dashes and warnings for 's Middags case.\n");
806 warnings.isFrench=FALSE;
807 if (results->Frenchcount>50)
809 warnings.isFrench=TRUE;
810 g_print(" --> This looks like French - "
811 "switching off some doublepunct.\n");
813 if (results->firstline && results->footerline)
814 g_print(" The PG header and footer appear to be already on.\n");
817 if (results->firstline)
818 g_print(" The PG header is on - no footer.\n");
819 if (results->footerline)
820 g_print(" The PG footer is on - no header.\n");
/* Verbose mode switches the suppressed warnings back on. */
823 if (pswit[VERBOSE_SWITCH])
826 warnings.shortline=1;
835 g_print(" *** Verbose output is ON -- you asked for it! ***\n");
837 if (warnings.isDutch)
/* Fewer than 100 lines between header and footer is not credible as the
 * body of a book, so forget the header position and check everything. */
839 if (results->footerline>0 && results->firstline>0 &&
840 results->footerline>results->firstline &&
841 results->footerline-results->firstline<100)
843 g_print(" --> I don't really know where this text starts. \n");
844 g_print(" There are no reference points.\n");
845 g_print(" I'm going to have to report the header and footer "
847 results->firstline=0;
855 * Look along the line, accumulate the count of quotes, and see
856 * if this is an empty line - i.e. a line with nothing on it
858 * If line has just spaces, period, * and/or - on it, don't
859 * count it, since empty lines with asterisks or dashes to
860 * separate sections are common.
862 * Returns: TRUE if the line is empty.
/*
 * aline:    the line to analyse (UTF-8).
 * counters: quote, underscore and bracket counters, updated in place.
 *
 * Double quotes are always counted.  Single quotes are counted only when
 * SQUOTE_SWITCH is set, and then only when the surrounding characters
 * suggest a quotation mark rather than an apostrophe.  Errors raised by
 * count_quote() are reported against the current line and column.
 */
864 gboolean analyse_quotes(const char *aline,struct counters *counters)
867 /* assume the line is empty until proven otherwise */
868 gboolean isemptyline=TRUE;
869 const char *s=aline,*sprev,*snext;
872 GError *tmp_err=NULL;
875 snext=g_utf8_next_char(s);
876 c=g_utf8_get_char(s);
877 if (CHAR_IS_DQUOTE(c))
878 (void)count_quote(counters,c,QUOTE_CLASS(c),&tmp_err);
879 else if (CHAR_IS_SQUOTE(c) && pswit[SQUOTE_SWITCH])
884 * At start of line, it can only be a quotation mark.
885 * Hardcode a very common exception!
887 if (!g_str_has_prefix(snext,"tis") &&
888 !g_str_has_prefix(snext,"Tis"))
889 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
891 else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
892 g_unichar_isalpha(g_utf8_get_char(snext)))
893 /* Do nothing! it's definitely an apostrophe, not a quote */
895 /* it's outside a word - let's check it out */
896 else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||
897 g_unichar_isalpha(g_utf8_get_char(snext)))
899 /* certainly looks like a quotation mark */
900 if (!g_str_has_prefix(snext,"tis") &&
901 !g_str_has_prefix(snext,"Tis"))
902 /* hardcode a very common exception! */
/*
 * The previous character is a full Unicode code point, so it has to be
 * looked up with g_utf8_strchr(); strchr() would reduce it to a single
 * char and wrongly match code points whose low byte happens to be one of
 * these punctuation marks.
 */
904 if (g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(sprev)))
905 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
907 (void)count_quote(counters,c,OPENING_QUOTE,&tmp_err);
912 /* now - is it a quotation mark? */
913 guessquote=0; /* accumulate clues */
914 if (g_unichar_isalpha(g_utf8_get_char(sprev)))
916 /* it follows a letter - could be either */
918 if (g_utf8_get_char(sprev)=='s')
920 /* looks like a plural apostrophe */
922 if (g_utf8_get_char(snext)==CHAR_SPACE)
926 if (innermost_quote_matches(counters,c))
928 * Give it the benefit of some doubt,
929 * if a squote is already open.
935 (void)count_quote(counters,c,CLOSING_QUOTE,&tmp_err);
938 /* no adjacent letter - it must be a quote of some kind */
939 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
/* Report whatever count_quote() complained about, then reset the error. */
944 if (pswit[ECHO_SWITCH])
945 g_print("\n%s\n",aline);
946 if (!pswit[OVERVIEW_SWITCH])
947 g_print(" Line %ld column %ld - %s\n",
948 linecnt,g_utf8_pointer_to_offset(aline,s)+1,tmp_err->message);
949 g_clear_error(&tmp_err);
951 if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
953 isemptyline=FALSE; /* ignore lines like * * * as spacers */
954 if (c==CHAR_UNDERSCORE)
955 counters->c_unders++;
/*
 * "[Illustration:" at the start of a line, met while no other square
 * bracket or illustration is open, is counted separately from ordinary
 * square brackets.
 */
956 if (c==CHAR_OPEN_SBRACK)
958 if (!matching_difference(counters,COUNTER_ILLUSTRATION) &&
959 !matching_difference(counters,c) && s==aline &&
960 g_str_has_prefix(s,"[Illustration:"))
961 increment_matching(counters,COUNTER_ILLUSTRATION,TRUE);
963 increment_matching(counters,c,TRUE);
965 else if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK)
966 increment_matching(counters,c,TRUE);
/* A ']' that ends the line may be what closes an illustration. */
967 if (c==CHAR_CLOSE_SBRACK)
969 if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) &&
970 !matching_difference(counters,c) && !*snext)
971 increment_matching(counters,COUNTER_ILLUSTRATION,FALSE);
973 increment_matching(counters,c,FALSE);
975 else if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK)
976 increment_matching(counters,c,FALSE);
984 * check_for_control_characters:
986 * Check for invalid or questionable characters in the line
987 * Anything above 127 is invalid for plain ASCII, and
988 * non-printable control characters should also be flagged.
989 * Tabs should generally not be there.
/*
 * aline: the line to check (UTF-8).
 *
 * This function itself only queries control characters: code points
 * below CHAR_SPACE other than LF, CR and TAB.  The query goes to the
 * normal output unless overview mode is on.
 */
991 void check_for_control_characters(const char *aline)
995 for (s=aline;*s;s=g_utf8_next_char(s))
997 c=g_utf8_get_char(s);
998 if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
1000 if (pswit[ECHO_SWITCH])
1001 g_print("\n%s\n",aline);
1002 if (!pswit[OVERVIEW_SWITCH])
/*
 * g_utf8_pointer_to_offset() takes the start of the string first and the
 * position second; with the arguments the other way round the reported
 * column was zero or negative.
 */
1003 g_print(" Line %ld column %ld - Control character %u\n",
1004 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1012 * check_for_odd_characters:
1014 * Check for binary and other odd characters.
/*
 * aline:       the line to check (UTF-8).
 * warnings:    first-pass decisions; the fslash and ast members gate the
 *              forward-slash and asterisk queries.
 * isemptyline: TRUE for a spacer line, which suppresses the asterisk
 *              query.
 *
 * Each kind of query is raised at most once per line; the e* flags
 * record which kinds have already been reported.
 */
1016 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
1017 gboolean isemptyline)
1019 /* Don't repeat multiple warnings on one line. */
1020 gboolean eNon_A=FALSE,eTab=FALSE,eTilde=FALSE;
1021 gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
1024 for (s=aline;*s;s=g_utf8_next_char(s))
1026 c=g_utf8_get_char(s);
/* Control characters (other than tab and LF) and anything above 127. */
1027 if (!eNon_A && (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
1029 if (pswit[ECHO_SWITCH])
1030 g_print("\n%s\n",aline);
1031 if (!pswit[OVERVIEW_SWITCH])
/* 128-159 and everything above 255 lie outside printable ISO-8859-1. */
1032 if (c>127 && c<160 || c>255)
1033 g_print(" Line %ld column %ld - "
1034 "Non-ISO-8859 character %u\n",
1035 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1037 g_print(" Line %ld column %ld - "
1038 "Non-ASCII character %u\n",
1039 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1044 if (!eTab && c==CHAR_TAB)
1046 if (pswit[ECHO_SWITCH])
1047 g_print("\n%s\n",aline);
1048 if (!pswit[OVERVIEW_SWITCH])
1049 g_print(" Line %ld column %ld - Tab character?\n",
1050 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1055 if (!eTilde && c==CHAR_TILDE)
1058 * Often used by OCR software to indicate an
1059 * unrecognizable character.
1061 if (pswit[ECHO_SWITCH])
1062 g_print("\n%s\n",aline);
1063 if (!pswit[OVERVIEW_SWITCH])
1064 g_print(" Line %ld column %ld - Tilde character?\n",
1065 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1070 if (!eCarat && c==CHAR_CARAT)
1072 if (pswit[ECHO_SWITCH])
1073 g_print("\n%s\n",aline);
1074 if (!pswit[OVERVIEW_SWITCH])
1075 g_print(" Line %ld column %ld - Carat character?\n",
1076 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Only queried when the first pass left warnings->fslash set. */
1081 if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
1083 if (pswit[ECHO_SWITCH])
1084 g_print("\n%s\n",aline);
1085 if (!pswit[OVERVIEW_SWITCH])
1086 g_print(" Line %ld column %ld - Forward slash?\n",
1087 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1093 * Report asterisks only in paranoid mode,
1094 * since they're often deliberate.
1096 if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
1099 if (pswit[ECHO_SWITCH])
1100 g_print("\n%s\n",aline);
1101 if (!pswit[OVERVIEW_SWITCH])
1102 g_print(" Line %ld column %ld - Asterisk?\n",
1103 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1112 * check_for_long_line:
1114 * Check for line too long.
/*
 * aline: the line to check (UTF-8).
 *
 * The length is measured in characters, not bytes.  The query reports
 * the line length both as the column and as the length itself.
 */
1116 void check_for_long_line(const char *aline)
1118 if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
1120 if (pswit[ECHO_SWITCH])
1121 g_print("\n%s\n",aline);
1122 if (!pswit[OVERVIEW_SWITCH])
1123 g_print(" Line %ld column %ld - Long line %ld\n",
1124 linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
1131 * check_for_short_line:
1133 * Check for line too short.
1135 * This one is a bit trickier to implement: we don't want to
1136 * flag the last line of a paragraph for being short, so we
1137 * have to wait until we know that our current line is a
1138 * "normal" line, then report the _previous_ line if it was too
1139 * short. We also don't want to report indented lines like
1140 * chapter heads or formatted quotations. We therefore keep
1141 * last->len as the length of the last line examined, and
1142 * last->blen as the length of the last but one, and try to
1143 * suppress unnecessary warnings by checking that both were of
1144 * "normal" length. We keep the first character of the last
1145 * line in last->start, and if it was a space, we assume that
1146 * the formatting is deliberate. I can't figure out a way to
1147 * distinguish something like a quoted verse left-aligned or
1148 * the header or footer of a letter from a paragraph of short
1149 * lines - maybe if I examined the whole paragraph, and if the
1150 * para has less than, say, 8 lines and if all lines are short,
1151 * then just assume it's OK? Need to look at some texts to see
1152 * how often a formula like this would get the right result.
/*
 * Queries the PREVIOUS line as too short. The test passes only when the
 * current line has more than one character, last->len lies strictly
 * between 1 and SHORTEST_PG_LINE, last->blen exceeds SHORTEST_PG_LINE
 * and last->start is not a space. The report is made against prevline
 * and line number linecnt-1; prevline is not a parameter of this
 * function.
 * NOTE(review): the last->blen>1 test looks redundant beside
 * last->blen>SHORTEST_PG_LINE unless SHORTEST_PG_LINE can be below 1.
 */
1154 void check_for_short_line(const char *aline,const struct line_properties *last)
1156 if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
1157 last->len<SHORTEST_PG_LINE && last->blen>1 &&
1158 last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
1160 if (pswit[ECHO_SWITCH])
1161 g_print("\n%s\n",prevline);
1162 if (!pswit[OVERVIEW_SWITCH])
1163 g_print(" Line %ld column %ld - Short line %ld?\n",
1164 linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
1171 * check_for_starting_punctuation:
1173 * Look for punctuation other than full ellipses at start of line.
/*
 * Queries a non-empty line whose first character is one of . ? ! , ; :
 * unless the line begins with a spaced ellipsis. The message always
 * names column 1.
 */
1175 void check_for_starting_punctuation(const char *aline)
1177 if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
1178 !g_str_has_prefix(aline,". . ."))
1180 if (pswit[ECHO_SWITCH])
1181 g_print("\n%s\n",aline);
1182 if (!pswit[OVERVIEW_SWITCH])
1183 g_print(" Line %ld column 1 - Begins with punctuation?\n",
1193 * Find the first em-dash, return a pointer to it and set <next> to the
1194 * character following the dash.
/*
 * Locates an em-dash in s and stores in *next the position just past
 * it: one character beyond s2 in two of the branches, two characters
 * beyond s1 in the other two.
 * NOTE(review): presumably s1 marks a dash written as two characters
 * and s2 one written as a single character, with the earlier of the
 * two being returned - confirm against the search code.
 */
1196 char *str_emdash(const char *s,const char **next)
1204 *next=g_utf8_next_char(s2);
1209 *next=g_utf8_next_char(g_utf8_next_char(s1));
1214 *next=g_utf8_next_char(g_utf8_next_char(s1));
1219 *next=g_utf8_next_char(s2);
1225 * check_for_spaced_emdash:
1227 * Check for spaced em-dashes.
1229 * We must check _all_ occurrences of em-dashes on the line
1230 * hence the loop - even if the first dash is OK
1231 * there may be another that's wrong later on.
/*
 * Visits every em-dash that str_emdash() finds on the line, restarting
 * each search at the position str_emdash() hands back in next. A dash
 * is queried when the character after it is a space, or when it is not
 * at the start of the line and the character before it is a space. The
 * column reported is the 1-based character offset of the dash.
 */
1233 void check_for_spaced_emdash(const char *aline)
1235 const char *s,*t,*next;
1236 for (s=aline;t=str_emdash(s,&next);s=next)
1238 if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
1239 g_utf8_get_char(next)==CHAR_SPACE)
1241 if (pswit[ECHO_SWITCH])
1242 g_print("\n%s\n",aline);
1243 if (!pswit[OVERVIEW_SWITCH])
1244 g_print(" Line %ld column %ld - Spaced em-dash?\n",
1245 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1253 * check_for_spaced_dash:
1255 * Check for spaced dashes.
/*
 * Queries a hyphen that has a space beside it. Only the first match of
 * each pattern is looked at (strstr is called once per pattern):
 *  - space-then-hyphen is queried unless the character after the hyphen
 *    is another '-';
 *  - hyphen-then-space is queried when it starts the line or the
 *    character before it is not '-'.
 * The second pattern is reached through an "else", so it is presumably
 * tried only when the first pattern is absent from the line - confirm.
 * Both messages give the 1-based character offset of the match.
 */
1257 void check_for_spaced_dash(const char *aline)
1260 if ((s=strstr(aline," -")))
1262 if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
1264 if (pswit[ECHO_SWITCH])
1265 g_print("\n%s\n",aline);
1266 if (!pswit[OVERVIEW_SWITCH])
1267 g_print(" Line %ld column %ld - Spaced dash?\n",
1268 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1273 else if ((s=strstr(aline,"- ")))
1275 if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
1277 if (pswit[ECHO_SWITCH])
1278 g_print("\n%s\n",aline);
1279 if (!pswit[OVERVIEW_SWITCH])
1280 g_print(" Line %ld column %ld - Spaced dash?\n",
1281 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1289 * check_for_unmarked_paragraphs:
1291 * Check for unmarked paragraphs indicated by separate speakers.
1293 * May well be false positive:
1294 * "Bravo!" "Wonderful!" called the crowd.
1295 * but useful all the same.
/*
 * Queries a line in which one double quote is followed, after
 * whitespace, by another - a hint that two speakers share a paragraph.
 * The column reported is the 1-based character offset of the match.
 * NOTE(review): the two strstr() patterns read identically here;
 * presumably they differ in the number of spaces between the quotes -
 * confirm.
 */
1297 void check_for_unmarked_paragraphs(const char *aline)
1300 s=strstr(aline,"\" \"");
1302 s=strstr(aline,"\" \"");
1305 if (pswit[ECHO_SWITCH])
1306 g_print("\n%s\n",aline);
1307 if (!pswit[OVERVIEW_SWITCH])
1308 g_print(" Line %ld column %ld - "
1309 "Query missing paragraph break?\n",
1310 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1317 * check_for_jeebies:
1319 * Check for "to he" and other easy h/b errors.
1321 * This is a very inadequate effort on the h/b problem,
1322 * but the phrase "to he" is nearly always an error, whereas "to
1323 * be" is quite common.
1324 * Similarly, '"Quiet!", be said.' is a non-be error
1325 * "to he" is _not_ always an error!:
1326 * "Where they went to he couldn't say."
1327 * Another false positive:
1328 * What would "Cinderella" be without the . . .
1329 * and another: "If he wants to he can see for himself."
/*
 * Looks for fixed phrases that usually come from an OCR confusion
 * between similar letters, in three groups, each with its own message:
 * he/be, had/bad and hut/but. Every probe is a plain strstr() on aline,
 * so matching is case-sensitive, and most phrases carry a leading and
 * trailing space so they cannot match at the very start or end of the
 * line. The column reported is the 1-based character offset of the
 * match, i.e. of its leading space or punctuation.
 * NOTE(review): the phrase with a lowercase "i" before "bad" cannot
 * match the pronoun "I" under a case-sensitive search - confirm whether
 * that is intended. Two of the quote-then-"be" patterns also read
 * identically here; presumably they differ in spacing.
 */
1331 void check_for_jeebies(const char *aline)
/* he/be confusions */
1334 s=strstr(aline," be could ");
1336 s=strstr(aline," be would ");
1338 s=strstr(aline," was be ");
1340 s=strstr(aline," be is ");
1342 s=strstr(aline," is be ");
1344 s=strstr(aline,"\", be ");
1346 s=strstr(aline,"\" be ");
1348 s=strstr(aline,"\" be ");
1350 s=strstr(aline," to he ");
1353 if (pswit[ECHO_SWITCH])
1354 g_print("\n%s\n",aline);
1355 if (!pswit[OVERVIEW_SWITCH])
1356 g_print(" Line %ld column %ld - Query he/be error?\n",
1357 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* had/bad confusions */
1361 s=strstr(aline," the had ");
1363 s=strstr(aline," a had ");
1365 s=strstr(aline," they bad ");
1367 s=strstr(aline," she bad ");
1369 s=strstr(aline," he bad ");
1371 s=strstr(aline," you bad ");
1373 s=strstr(aline," i bad ");
1376 if (pswit[ECHO_SWITCH])
1377 g_print("\n%s\n",aline);
1378 if (!pswit[OVERVIEW_SWITCH])
1379 g_print(" Line %ld column %ld - Query had/bad error?\n",
1380 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* hut/but confusions, only directly after a semicolon or comma */
1384 s=strstr(aline,"; hut ");
1386 s=strstr(aline,", hut ");
1389 if (pswit[ECHO_SWITCH])
1390 g_print("\n%s\n",aline);
1391 if (!pswit[OVERVIEW_SWITCH])
1392 g_print(" Line %ld column %ld - Query hut/but error?\n",
1393 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1400 * check_for_mta_from:
1402 * Special case - angled bracket in front of "From" placed there by an
1403 * MTA when sending an e-mail.
/*
 * Queries the text ">From" wherever it occurs on the line (the search
 * is not anchored to the start of the line). The column reported is
 * the 1-based character offset of the '>'.
 */
1405 void check_for_mta_from(const char *aline)
1408 s=strstr(aline,">From");
1411 if (pswit[ECHO_SWITCH])
1412 g_print("\n%s\n",aline);
1413 if (!pswit[OVERVIEW_SWITCH])
1414 g_print(" Line %ld column %ld - "
1415 "Query angled bracket with From\n",
1416 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1423 * check_for_orphan_character:
1425 * Check for a single character line -
1426 * often an overflow from bad wrapping.
/*
 * Queries a line that consists of exactly one UTF-8 character. The
 * letters I, V, X and L and any Unicode digit are let through, since a
 * numeral alone on a line is normal. The message always names column 1.
 */
1428 void check_for_orphan_character(const char *aline)
1431 c=g_utf8_get_char(aline);
1432 if (c && !*g_utf8_next_char(aline))
1434 if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
1435 ; /* Nothing - ignore numerals alone on a line. */
1438 if (pswit[ECHO_SWITCH])
1439 g_print("\n%s\n",aline);
1440 if (!pswit[OVERVIEW_SWITCH])
1441 g_print(" Line %ld column 1 - Query single character line\n",
1450 * check_for_pling_scanno:
1452 * Check for I" - often should be !
/*
 * Queries a space, capital I and double quote in sequence, where the I
 * is often a misread exclamation mark. Only the first occurrence on
 * the line is found.
 * NOTE(review): the column is the raw character offset of the match
 * with nothing added, whereas the sibling checks add 1 to get a 1-based
 * column; as written it names the character before the matched space -
 * confirm which column is intended.
 */
1454 void check_for_pling_scanno(const char *aline)
1457 s=strstr(aline," I\"");
1460 if (pswit[ECHO_SWITCH])
1461 g_print("\n%s\n",aline);
1462 if (!pswit[OVERVIEW_SWITCH])
1463 g_print(" Line %ld column %ld - Query I=exclamation mark?\n",
1464 linecnt,g_utf8_pointer_to_offset(aline,s));
1471 * check_for_extra_period:
1473 * Check for period without a capital letter. Cut-down from gutspell.
1474 * Only works when it happens on a single line.
/*
 * Runs only when PARANOID_SWITCH is set. Steps through each period
 * followed by a space on the line; a period whose preceding character
 * is not a letter is passed over. When the first letter or digit after
 * the period is lowercase, the word in front of the period is copied
 * into testword and screened against the abbrev[] list (terminated by
 * an empty string), a leading digit, a one-character word, isroman()
 * and a scan of its characters for a, e, i, o or u. The vowel scan
 * compares the first code point of each character's canonical
 * decomposition, so an accented vowel is treated as its base letter.
 * The word is looked up in, and added to, the qperiod tree: with
 * VERBOSE_SWITCH clear a word already in the tree is not queried again.
 * For Dutch texts (warnings->isDutch) the characters at offsets 2 to 5
 * from the period are tested for apostrophe, lowercase letter, space
 * and uppercase letter.
 * The column reported is the 1-based character offset of t.
 */
1476 void check_for_extra_period(const char *aline,const struct warnings *warnings)
1478 const char *s,*t,*s1,*sprev;
1483 gunichar c,nc,pc,*decomposition;
1484 if (pswit[PARANOID_SWITCH])
1486 for (t=aline;t=strstr(t,". ");)
1490 t=g_utf8_next_char(t);
1491 /* start of line punctuation is handled elsewhere */
1494 if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
1496 t=g_utf8_next_char(t);
1499 if (warnings->isDutch)
1501 /* For Frank & Jeroen -- 's Middags case */
1502 gunichar c2,c3,c4,c5;
1503 c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
1504 c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
1505 c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
1506 c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
1507 if (CHAR_IS_APOSTROPHE(c2) &&
1508 g_unichar_islower(c3) && c4==CHAR_SPACE &&
1509 g_unichar_isupper(c5))
1511 t=g_utf8_next_char(t);
/* find the first letter or digit after the period and space */
1515 s1=g_utf8_next_char(g_utf8_next_char(t));
1516 while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
1517 !g_unichar_isdigit(g_utf8_get_char(s1)))
1518 s1=g_utf8_next_char(s1);
1519 if (g_unichar_islower(g_utf8_get_char(s1)))
1521 /* we have something to investigate */
1523 /* so let's go back and find out */
1524 nc=g_utf8_get_char(t);
1525 s1=g_utf8_prev_char(t);
1526 c=g_utf8_get_char(s1);
1527 sprev=g_utf8_prev_char(s1);
1528 pc=g_utf8_get_char(sprev);
1530 (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||
1531 g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&
1532 g_unichar_isalpha(nc)))
1537 sprev=g_utf8_prev_char(s1);
1538 pc=g_utf8_get_char(sprev);
1540 s1=g_utf8_next_char(s1);
1543 testword=g_strndup(s1,s-s1);
1545 testword=g_strdup(s1);
1546 for (i=0;*abbrev[i];i++)
1547 if (!strcmp(testword,abbrev[i]))
1549 if (g_unichar_isdigit(g_utf8_get_char(testword)))
1551 if (!*g_utf8_next_char(testword))
1553 if (isroman(testword))
1558 for (s=testword;*s;s=g_utf8_next_char(s))
1560 decomposition=g_unicode_canonical_decomposition(
1561 g_utf8_get_char(s),&len);
1562 if (g_utf8_strchr("aeiou",-1,decomposition[0]))
1564 g_free(decomposition);
1568 (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
1570 g_tree_insert(qperiod,g_strdup(testword),
1571 GINT_TO_POINTER(1));
1572 if (pswit[ECHO_SWITCH])
1573 g_print("\n%s\n",aline);
1574 if (!pswit[OVERVIEW_SWITCH])
1575 g_print(" Line %ld column %ld - Extra period?\n",
1576 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1582 t=g_utf8_next_char(t);
1588 * check_for_following_punctuation:
1590 * Check for words usually not followed by punctuation.
/*
 * Runs only when TYPO_SWITCH is set. Each word is lowercased with
 * g_utf8_strdown() and compared against two lists, both terminated by
 * an empty string:
 *  - nocomma[]: queried when the character at s is , ; or :
 *  - noperiod[]: queried when the character at s is . or !
 * Both cases print the same "Query punctuation after" message with the
 * 1-based character offset of s as the column.
 */
1592 void check_for_following_punctuation(const char *aline)
1595 const char *s,*wordstart;
1598 if (pswit[TYPO_SWITCH])
1609 inword=g_utf8_strdown(t,-1);
1611 for (i=0;*nocomma[i];i++)
1612 if (!strcmp(inword,nocomma[i]))
1614 c=g_utf8_get_char(s);
1615 if (c==',' || c==';' || c==':')
1617 if (pswit[ECHO_SWITCH])
1618 g_print("\n%s\n",aline);
1619 if (!pswit[OVERVIEW_SWITCH])
1620 g_print(" Line %ld column %ld - "
1621 "Query punctuation after %s?\n",
1622 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
1628 for (i=0;*noperiod[i];i++)
1629 if (!strcmp(inword,noperiod[i]))
1631 c=g_utf8_get_char(s);
1632 if (c=='.' || c=='!')
1634 if (pswit[ECHO_SWITCH])
1635 g_print("\n%s\n",aline);
1636 if (!pswit[OVERVIEW_SWITCH])
1637 g_print(" Line %ld column %ld - "
1638 "Query punctuation after %s?\n",
1639 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
1653 * Check for commonly mistyped words,
1654 * and digits like 0 for O in a word.
/*
 * Examines the line word by word; words are obtained from getaword().
 *  - A word for which mixdigit() is true gets a "Query digit in"
 *    message.
 *  - With TYPO_SWITCH or USERTYPO_SWITCH set, the word is scanned for
 *    an uppercase or titlecase letter that follows a lowercase one;
 *    an "mc" or "mac" prefix and a preceding apostrophe are tested for
 *    at that point. The word is then case-folded into testword.
 *  - With TYPO_SWITCH set, testword is tested against the nostart[]
 *    and noend[] lists, a set of letter runs typical of OCR errors
 *    (cb, gbt, pbt, tbs, mrn, ahle, ihle, tbi, tbe, ii), a count of
 *    vowels and consonants, the okword[] and typo[] lists, isroman(),
 *    and the single letters s, l, m, i, j, d and n. The qword tree
 *    holds a per-word counter used for duplicate handling.
 *  - A word found in the usertypo tree (when one exists) is queried as
 *    a possible scanno.
 *  - With PARANOID_SWITCH set and warnings->digit true, a word that is
 *    exactly "0" or "1" is queried.
 * NOTE(review): the "possible scanno" and "standalone" messages add 2
 * to the character offset of wordstart, while the "digit in" and
 * "Query word" messages add 1 - confirm which column is intended.
 */
1656 void check_for_typos(const char *aline,struct warnings *warnings)
1658 const char *s,*t,*nt,*wordstart;
1660 gunichar *decomposition;
1662 int i,vowel,consonant,*dupcnt;
1663 gboolean isdup,istypo,alower;
1666 gsize decomposition_len;
1670 inword=getaword(&s);
1674 continue; /* don't bother with empty lines */
1676 if (mixdigit(inword))
1678 if (pswit[ECHO_SWITCH])
1679 g_print("\n%s\n",aline);
1680 if (!pswit[OVERVIEW_SWITCH])
1681 g_print(" Line %ld column %ld - Query digit in %s\n",
1682 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
1687 * Put the word through a series of tests for likely typos and OCR
1690 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
1694 for (t=inword;*t;t=g_utf8_next_char(t))
1696 c=g_utf8_get_char(t);
1697 nt=g_utf8_next_char(t);
1698 /* lowercase for testing */
1699 if (g_unichar_islower(c))
1701 if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
1704 * We have an uppercase mid-word. However, there are
1706 * Mac and Mc like McGill
1707 * French contractions like l'Abbe
1709 offset=g_utf8_pointer_to_offset(inword,t);
1711 pc=g_utf8_get_char(g_utf8_prev_char(t));
1714 if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
1715 offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
1716 g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
1717 CHAR_IS_APOSTROPHE(pc))
1723 testword=g_utf8_casefold(inword,-1);
1725 if (pswit[TYPO_SWITCH])
1728 * Check for certain unlikely two-letter combinations at word
1731 len=g_utf8_strlen(testword,-1);
1734 for (i=0;*nostart[i];i++)
1735 if (g_str_has_prefix(testword,nostart[i]))
1737 for (i=0;*noend[i];i++)
1738 if (g_str_has_suffix(testword,noend[i]))
1741 /* ght is common, gbt never. Like that. */
1742 if (strstr(testword,"cb"))
1744 if (strstr(testword,"gbt"))
1746 if (strstr(testword,"pbt"))
1748 if (strstr(testword,"tbs"))
1750 if (strstr(testword,"mrn"))
1752 if (strstr(testword,"ahle"))
1754 if (strstr(testword,"ihle"))
1757 * "TBE" does happen - like HEARTBEAT - but uncommon.
1758 * Also "TBI" - frostbite, outbid - but uncommon.
1759 * Similarly "ii" like Hawaii, or Pompeii, and in Roman
1760 * numerals, but "ii" is a common scanno.
1762 if (strstr(testword,"tbi"))
1764 if (strstr(testword,"tbe"))
1766 if (strstr(testword,"ii"))
1769 * Check for no vowels or no consonants.
1770 * If none, flag a typo.
1772 if (!istypo && len>1)
1775 for (t=testword;*t;t=g_utf8_next_char(t))
1777 c=g_utf8_get_char(t);
1779 g_unicode_canonical_decomposition(c,&decomposition_len);
1780 if (c=='y' || g_unichar_isdigit(c))
1782 /* Yah, this is loose. */
1786 else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
1790 g_free(decomposition);
1792 if (!vowel || !consonant)
1796 * Now exclude the word from being reported if it's in
1799 for (i=0;*okword[i];i++)
1800 if (!strcmp(testword,okword[i]))
1803 * What looks like a typo may be a Roman numeral.
1806 if (istypo && isroman(testword))
1808 /* Check the manual list of typos. */
1810 for (i=0;*typo[i];i++)
1811 if (!strcmp(testword,typo[i]))
1814 * Check lowercase s, l, i and m - special cases.
1815 * "j" - often a semi-colon gone wrong.
1816 * "d" for a missing apostrophe - he d
1819 if (!istypo && len==1 &&
1820 g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
1824 dupcnt=g_tree_lookup(qword,testword);
1828 isdup=!pswit[VERBOSE_SWITCH];
1832 dupcnt=g_new0(int,1);
1833 g_tree_insert(qword,g_strdup(testword),dupcnt);
1838 if (pswit[ECHO_SWITCH])
1839 g_print("\n%s\n",aline);
1840 if (!pswit[OVERVIEW_SWITCH])
1842 g_print(" Line %ld column %ld - Query word %s",
1843 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
1845 if (!pswit[VERBOSE_SWITCH])
1846 g_print(" - not reporting duplicates");
1854 /* check the user's list of typos */
1855 if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
1857 if (pswit[ECHO_SWITCH])
1858 g_print("\n%s\n",aline);
1859 if (!pswit[OVERVIEW_SWITCH])
1860 g_print(" Line %ld column %ld - Query possible scanno %s\n",
1861 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
1863 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
1865 if (pswit[PARANOID_SWITCH] && warnings->digit)
1867 /* In paranoid mode, query all 0 and 1 standing alone. */
1868 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
1870 if (pswit[ECHO_SWITCH])
1871 g_print("\n%s\n",aline);
1872 if (!pswit[OVERVIEW_SWITCH])
1873 g_print(" Line %ld column %ld - Query standalone %s\n",
1874 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
1885 * check_for_misspaced_punctuation:
1887 * Look for added or missing spaces around punctuation and quotes.
1888 * If there is a punctuation character like ! with no space on
1889 * either side, suspect a missing!space. If there are spaces on
1890 * both sides , assume a typo. If we see a double quote with no
1891 * space or punctuation on either side of it, assume unspaced
1892 * quotes "like"this.
/*
 * Makes several independent passes over the line, each looking at a
 * character together with its neighbours (pc before, nc after):
 *  1. Punctuation from . ? ! , ; : _  - "Missing space?" when a letter
 *     follows and either a letter precedes or the mark is one of
 *     ? ! , ; :  - and "Spaced punctuation?" when a space precedes and
 *     a space or end of line follows. Acronym and ellipsis flags are
 *     worked out from the characters two positions either side, and
 *     the spaced case is reported only when neither isemptyline nor
 *     isellipsis is set.
 *  2. ? ! , ; : with a space before and a non-space after, unless
 *     isemptyline - "Spaced punctuation?".
 *  3. A period with a space before and a letter after - "Spaced
 *     punctuation?".
 *  4. A double quote with no separator on either side, or with a
 *     letter after and no opening-type separator before - "Unspaced
 *     quotes?".
 *  5. Double-quote parity: parities->dquote is toggled at each neutral
 *     double quote and the following character is checked against the
 *     set allowed after a closing or opening quote - "Wrongspaced
 *     quotes?".
 *  6. A double quote as the first character followed by closing
 *     punctuation or a space - "Wrongspaced quotes?" at column 1.
 *  7. With SQUOTE_SWITCH set, the same parity idea for single quotes
 *     that are not flanked by letters on both sides, using
 *     parities->squote - "Wrongspaced singlequotes?".
 * The parities structure is updated in place, so quote state carries
 * from one call to the next until the caller resets it.
 */
1894 void check_for_misspaced_punctuation(const char *aline,
1895 struct parities *parities,gboolean isemptyline)
1897 gboolean isacro,isellipsis;
1899 gunichar c,nc,pc,n2c;
1901 c=g_utf8_get_char(aline);
1902 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1903 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1907 nc=g_utf8_get_char(g_utf8_next_char(s));
1908 /* For each character in the line after the first. */
1909 if (g_utf8_strchr(".?!,;:_",-1,c)) /* if it's punctuation */
1911 /* we need to suppress warnings for acronyms like M.D. */
1913 /* we need to suppress warnings for ellipsis . . . */
1916 * If there are letters on both sides of it or
1917 * if it's strict punctuation followed by an alpha.
1919 if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
1920 g_utf8_strchr("?!,;:",-1,c)))
1924 if (g_utf8_pointer_to_offset(aline,s)>2 &&
1925 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
1927 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
1933 if (pswit[ECHO_SWITCH])
1934 g_print("\n%s\n",aline);
1935 if (!pswit[OVERVIEW_SWITCH])
1936 g_print(" Line %ld column %ld - Missing space?\n",
1937 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1942 if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
1945 * If there are spaces on both sides,
1946 * or space before and end of line.
1950 if (g_utf8_pointer_to_offset(aline,s)>2 &&
1951 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
1953 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
1957 if (!isemptyline && !isellipsis)
1959 if (pswit[ECHO_SWITCH])
1960 g_print("\n%s\n",aline);
1961 if (!pswit[OVERVIEW_SWITCH])
1962 g_print(" Line %ld column %ld - "
1963 "Spaced punctuation?\n",linecnt,
1964 g_utf8_pointer_to_offset(aline,s)+1);
1971 /* Split out the characters that CANNOT be preceded by space. */
1972 c=g_utf8_get_char(aline);
1973 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1974 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1978 nc=g_utf8_get_char(g_utf8_next_char(s));
1979 /* for each character in the line after the first */
1980 if (g_utf8_strchr("?!,;:",-1,c))
1982 /* if it's punctuation that _cannot_ have a space before it */
1983 if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
1986 * If nc DOES == space,
1987 * it was already reported just above.
1989 if (pswit[ECHO_SWITCH])
1990 g_print("\n%s\n",aline);
1991 if (!pswit[OVERVIEW_SWITCH])
1992 g_print(" Line %ld column %ld - Spaced punctuation?\n",
1993 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2000 * Special case " .X" where X is any alpha.
2001 * This plugs a hole in the acronym code above.
2002 * Inelegant, but maintainable.
2004 c=g_utf8_get_char(aline);
2005 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2006 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2010 nc=g_utf8_get_char(g_utf8_next_char(s));
2011 /* for each character in the line after the first */
2014 /* if it's a period */
2015 if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
2018 * If the period follows a space and
2019 * is followed by a letter.
2021 if (pswit[ECHO_SWITCH])
2022 g_print("\n%s\n",aline);
2023 if (!pswit[OVERVIEW_SWITCH])
2024 g_print(" Line %ld column %ld - Spaced punctuation?\n",
2025 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Unspaced double quotes: inspect the characters either side. */
2031 c=g_utf8_get_char(aline);
2032 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2033 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2037 nc=g_utf8_get_char(g_utf8_next_char(s));
2038 /* for each character in the line after the first */
2039 if (CHAR_IS_DQUOTE(c))
2041 if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
2042 !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
2043 !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
2045 if (pswit[ECHO_SWITCH])
2046 g_print("\n%s\n",aline);
2047 if (!pswit[OVERVIEW_SWITCH])
2048 g_print(" Line %ld column %ld - Unspaced quotes?\n",
2049 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2055 /* Check parity of quotes. */
2056 nc=g_utf8_get_char(aline);
2057 for (s=aline;*s;s=g_utf8_next_char(s))
2060 nc=g_utf8_get_char(g_utf8_next_char(s));
2061 if (CHAR_IS_DQUOTE(c))
2065 parities->dquote=!parities->dquote;
2066 parity=parities->dquote;
2068 else if (c==CHAR_LD_QUOTE)
2075 if (!g_utf8_strchr("_-.'`‘’/,;:!?)]} ",-1,nc))
2077 if (pswit[ECHO_SWITCH])
2078 g_print("\n%s\n",aline);
2079 if (!pswit[OVERVIEW_SWITCH])
2080 g_print(" Line %ld column %ld - "
2081 "Wrongspaced quotes?\n",
2082 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2090 if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
2091 !g_utf8_strchr("_-/.'`‘’([{$",-1,nc) || !nc)
2093 if (pswit[ECHO_SWITCH])
2094 g_print("\n%s\n",aline);
2095 if (!pswit[OVERVIEW_SWITCH])
2096 g_print(" Line %ld column %ld - "
2097 "Wrongspaced quotes?\n",
2098 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* A double quote opening the line must not be followed by closing punctuation. */
2105 c=g_utf8_get_char(aline);
2106 if (CHAR_IS_DQUOTE(c))
2108 if (g_utf8_strchr(",;:!?)]} ",-1,
2109 g_utf8_get_char(g_utf8_next_char(aline))))
2111 if (pswit[ECHO_SWITCH])
2112 g_print("\n%s\n",aline);
2113 if (!pswit[OVERVIEW_SWITCH])
2114 g_print(" Line %ld column 1 - Wrongspaced quotes?\n",
/* Single quotes are checked only on request. */
2120 if (pswit[SQUOTE_SWITCH])
2122 nc=g_utf8_get_char(aline);
2123 for (s=aline;*s;s=g_utf8_next_char(s))
2126 nc=g_utf8_get_char(g_utf8_next_char(s));
2127 if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&
2128 !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
2129 !g_unichar_isalpha(nc)))
2131 parities->squote=!parities->squote;
2132 if (!parities->squote)
2135 if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
2137 if (pswit[ECHO_SWITCH])
2138 g_print("\n%s\n",aline);
2139 if (!pswit[OVERVIEW_SWITCH])
2140 g_print(" Line %ld column %ld - "
2141 "Wrongspaced singlequotes?\n",
2142 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2150 if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
2151 !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
2153 if (pswit[ECHO_SWITCH])
2154 g_print("\n%s\n",aline);
2155 if (!pswit[OVERVIEW_SWITCH])
2156 g_print(" Line %ld column %ld - "
2157 "Wrongspaced singlequotes?\n",
2158 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2169 * check_for_double_punctuation:
2171 * Look for double punctuation like ,. or ,,
2172 * Thanks to DW for the suggestion!
2173 * In books with references, ".," and ".;" are common
2174 * e.g. "etc., etc.," and vol. 1.; vol 3.;
2175 * OTOH, from my initial tests, there are also fairly
2176 * common errors. What to do? Make these cases paranoid?
2177 * ".," is the most common, so warnings->dotcomma is used
2178 * to suppress detailed reporting if it occurs often.
/*
 * Looks at each pair of adjacent characters that are both drawn from
 * . ? ! , ; :  and queries the pair unless one of these holds:
 *  - both characters are the same and are . ? or !
 *  - the pair is a period followed by a comma and warnings->dotcomma
 *    is false
 *  - warnings->isFrench is set and the text at s starts with a
 *    three-period ellipsis directly before or after one of , ; : ! ?
 * In the French ellipsis case nc is re-read after the match, presumably
 * once s has been moved past the ellipsis - confirm.
 * The column reported is the 1-based character offset of the first
 * character of the pair.
 */
2180 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
2184 nc=g_utf8_get_char(aline);
2185 for (s=aline;*s;s=g_utf8_next_char(s))
2188 nc=g_utf8_get_char(g_utf8_next_char(s));
2189 /* for each punctuation character in the line */
2190 if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
2191 g_utf8_strchr(".?!,;:",-1,nc))
2193 /* followed by punctuation, it's a query, unless . . . */
2194 if (c==nc && (c=='.' || c=='?' || c=='!') ||
2195 !warnings->dotcomma && c=='.' && nc==',' ||
2196 warnings->isFrench && g_str_has_prefix(s,",...") ||
2197 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2198 warnings->isFrench && g_str_has_prefix(s,";...") ||
2199 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2200 warnings->isFrench && g_str_has_prefix(s,":...") ||
2201 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2202 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2203 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2204 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2205 warnings->isFrench && g_str_has_prefix(s,"...?"))
2207 if (warnings->isFrench && g_str_has_prefix(s,",...") ||
2208 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2209 warnings->isFrench && g_str_has_prefix(s,";...") ||
2210 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2211 warnings->isFrench && g_str_has_prefix(s,":...") ||
2212 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2213 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2214 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2215 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2216 warnings->isFrench && g_str_has_prefix(s,"...?"))
2219 nc=g_utf8_get_char(g_utf8_next_char(s));
2221 ; /* do nothing for .. !! and ?? which can be legit */
2225 if (pswit[ECHO_SWITCH])
2226 g_print("\n%s\n",aline);
2227 if (!pswit[OVERVIEW_SWITCH])
2228 g_print(" Line %ld column %ld - Double punctuation?\n",
2229 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2238 * check_for_spaced_quotes:
/*
 * Queries every quote mark that has a space on both sides. The ASCII
 * double quote is searched for with a literal pattern; for each
 * character in single_quotes[] a space-quote-space pattern is built in
 * a GString and searched for in the same way. After a hit the search
 * resumes two characters past the start of the match, i.e. at the
 * trailing space, so that space can also open the next match. The
 * GString is freed, together with its character data, before
 * returning. Columns are 1-based character offsets of the match.
 */
2240 void check_for_spaced_quotes(const char *aline)
2244 const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,
2248 while ((t=strstr(s," \" ")))
2250 if (pswit[ECHO_SWITCH])
2251 g_print("\n%s\n",aline);
2252 if (!pswit[OVERVIEW_SWITCH])
2253 g_print(" Line %ld column %ld - Spaced doublequote?\n",
2254 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2257 s=g_utf8_next_char(g_utf8_next_char(t));
2259 pattern=g_string_new(NULL);
2260 for(i=0;i<G_N_ELEMENTS(single_quotes);i++)
/* rebuild the pattern as: space, this quote character, space */
2262 g_string_assign(pattern," ");
2263 g_string_append_unichar(pattern,single_quotes[i]);
2264 g_string_append_c(pattern,' ');
2266 while ((t=strstr(s,pattern->str)))
2268 if (pswit[ECHO_SWITCH])
2269 g_print("\n%s\n",aline);
2270 if (!pswit[OVERVIEW_SWITCH])
2271 g_print(" Line %ld column %ld - Spaced singlequote?\n",
2272 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2275 s=g_utf8_next_char(g_utf8_next_char(t));
2278 g_string_free(pattern,TRUE);
2282 * check_for_miscased_genative:
2284 * Check special case of 'S instead of 's at end of word.
/*
 * Queries an apostrophe that follows a lowercase letter and is itself
 * followed by a capital S. The scan starts at the second character of
 * the line and stops when the look-ahead character nc is the
 * terminator. The column is the character offset of s plus 2;
 * presumably s points at the apostrophe, which makes that the 1-based
 * column of the S - confirm.
 */
2286 void check_for_miscased_genative(const char *aline)
2292 c=g_utf8_get_char(aline);
2293 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2294 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2298 nc=g_utf8_get_char(g_utf8_next_char(s));
2299 if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))
2301 if (pswit[ECHO_SWITCH])
2302 g_print("\n%s\n",aline);
2303 if (!pswit[OVERVIEW_SWITCH])
2304 g_print(" Line %ld column %ld - Capital \"S\"?\n",
2305 linecnt,g_utf8_pointer_to_offset(aline,s)+2);
2313 * check_end_of_line:
2315 * Now check special cases - start and end of line -
2316 * for single and double quotes. Start is sometimes [sic]
2317 * but better to query it anyway.
2318 * While we're here, check for dash at end of line.
/*
 * Start- and end-of-line checks, applied to lines of more than one
 * character:
 *  - a double or single quote as the last character with a space just
 *    before it - "Spaced quote?", column = character count of the line;
 *  - a single quote as the first character with a space just after it
 *    - "Spaced quote?", column 1;
 *  - with PARANOID_SWITCH set and warnings->hyphen true: trailing
 *    characters at or below CHAR_SPACE are stepped over, and a '-' not
 *    preceded by another '-' is queried as "Hyphen at end of line?".
 * lbytes is the byte length of the line and is used to reach its last
 * character.
 * NOTE(review): the hyphen message uses the character offset of s with
 * nothing added, unlike the 1-based columns elsewhere - confirm.
 */
2320 void check_end_of_line(const char *aline,struct warnings *warnings)
2325 lbytes=strlen(aline);
2326 if (g_utf8_strlen(aline,lbytes)>1)
2328 s=g_utf8_prev_char(aline+lbytes);
2329 c1=g_utf8_get_char(s);
2330 c2=g_utf8_get_char(g_utf8_prev_char(s));
2331 if ((CHAR_IS_DQUOTE(c1) || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)
2333 if (pswit[ECHO_SWITCH])
2334 g_print("\n%s\n",aline);
2335 if (!pswit[OVERVIEW_SWITCH])
2336 g_print(" Line %ld column %ld - Spaced quote?\n",linecnt,
2337 g_utf8_strlen(aline,lbytes));
2341 c1=g_utf8_get_char(aline);
2342 c2=g_utf8_get_char(g_utf8_next_char(aline));
2343 if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)
2345 if (pswit[ECHO_SWITCH])
2346 g_print("\n%s\n",aline);
2347 if (!pswit[OVERVIEW_SWITCH])
2348 g_print(" Line %ld column 1 - Spaced quote?\n",linecnt);
2353 * Dash at end of line may well be legit - paranoid mode only
2354 * and don't report em-dash at line-end.
2356 if (pswit[PARANOID_SWITCH] && warnings->hyphen)
2358 for (s=g_utf8_prev_char(aline+lbytes);
2359 s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
2361 if (g_utf8_get_char(s)=='-' &&
2362 g_utf8_get_char(g_utf8_prev_char(s))!='-')
2364 if (pswit[ECHO_SWITCH])
2365 g_print("\n%s\n",aline);
2366 if (!pswit[OVERVIEW_SWITCH])
2367 g_print(" Line %ld column %ld - "
2368 "Hyphen at end of line?\n",
2369 linecnt,g_utf8_pointer_to_offset(aline,s));
2376 * check_for_unspaced_bracket:
2378 * Brackets are often unspaced, but shouldn't be surrounded by alpha.
2379 * If so, suspect a scanno like "a]most".
/*
 * Queries any of { [ ( ) ] } that has a letter immediately before it
 * (pc) and immediately after it (nc). The scan starts at the second
 * character and stops when nc is the terminator, so the first and last
 * characters of the line are never the bracket under test.
 * NOTE(review): the column is the character offset of s with nothing
 * added, unlike the 1-based columns used by most sibling checks -
 * confirm.
 */
2381 void check_for_unspaced_bracket(const char *aline)
2385 c=g_utf8_get_char(aline);
2386 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2387 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2391 nc=g_utf8_get_char(g_utf8_next_char(s));
2394 /* for each bracket character in the line except 1st & last */
2395 if (g_utf8_strchr("{[()]}",-1,c) &&
2396 g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
2398 if (pswit[ECHO_SWITCH])
2399 g_print("\n%s\n",aline);
2400 if (!pswit[OVERVIEW_SWITCH])
2401 g_print(" Line %ld column %ld - Unspaced bracket?\n",
2402 linecnt,g_utf8_pointer_to_offset(aline,s));
2410 * check_for_unpunctuated_endquote:
/*
 * Queries a double quote that QUOTE_CLASS() rates as closing or neutral
 * when the character before it is a letter, i.e. speech that ends
 * without punctuation. Characters that are not double quotes are given
 * the class INVALID_QUOTE and so never match.
 * NOTE(review): the column is the character offset of s with nothing
 * added, unlike the 1-based columns used by most sibling checks -
 * confirm.
 */
2412 void check_for_unpunctuated_endquote(const char *aline)
2417 c=g_utf8_get_char(aline);
2418 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2419 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2423 qc=CHAR_IS_DQUOTE(c)?QUOTE_CLASS(c):INVALID_QUOTE;
2424 nc=g_utf8_get_char(g_utf8_next_char(s));
2425 /* for each character in the line except 1st */
2426 if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) && g_unichar_isalpha(pc))
2428 if (pswit[ECHO_SWITCH])
2429 g_print("\n%s\n",aline);
2430 if (!pswit[OVERVIEW_SWITCH])
2431 g_print(" Line %ld column %ld - "
2432 "endquote missing punctuation?\n",
2433 linecnt,g_utf8_pointer_to_offset(aline,s));
2441 * check_for_html_tag:
2443 * Check for <HTML TAG>.
2445 * If there is a < in the line, followed at some point
2446 * by a > then we suspect HTML.
/*
 * Queries the first '<' on the line when a '>' occurs somewhere after
 * it. The text from '<' through '>' inclusive is copied with
 * g_strndup() and shown in the message; the column is the 1-based
 * character offset of the '<'. Only one tag per line is reported.
 */
2448 void check_for_html_tag(const char *aline)
2450 const char *open,*close;
2452 open=strchr(aline,'<');
2455 close=strchr(g_utf8_next_char(open),'>');
2458 if (pswit[ECHO_SWITCH])
2459 g_print("\n%s\n",aline);
2460 if (!pswit[OVERVIEW_SWITCH])
2462 tag=g_strndup(open,close-open+1);
2463 g_print(" Line %ld column %ld - HTML Tag? %s \n",
2464 linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
2474 * check_for_html_entity:
2476 * Check for &symbol; HTML.
2478 * If there is a & in the line, followed at
2479 * some point by a ; then we suspect HTML.
/*
 * Queries the first '&' on the line when a ';' occurs somewhere after
 * it; the span between them is scanned for a space so that ordinary
 * text containing an ampersand is not mistaken for an entity. The text
 * from '&' through ';' inclusive is copied with g_strndup() and shown
 * in the message.
 * NOTE(review): the column is computed from the byte distance
 * amp-aline, whereas check_for_html_tag() uses
 * g_utf8_pointer_to_offset(); the two differ whenever a multi-byte
 * character precedes the '&' - confirm which is intended.
 */
2481 void check_for_html_entity(const char *aline)
2483 const char *s,*amp,*scolon;
2485 amp=strchr(aline,'&');
2488 scolon=strchr(amp,';');
2491 for (s=amp;s<scolon;s=g_utf8_next_char(s))
2492 if (g_utf8_get_char(s)==CHAR_SPACE)
2493 break; /* Don't report "Jones & Son;" */
2496 if (pswit[ECHO_SWITCH])
2497 g_print("\n%s\n",aline);
2498 if (!pswit[OVERVIEW_SWITCH])
2500 entity=g_strndup(amp,scolon-amp+1);
2501 g_print(" Line %ld column %d - HTML symbol? %s \n",
2502 linecnt,(int)(amp-aline)+1,entity);
2513 * check_for_omitted_punctuation:
2515 * Check for omitted punctuation at end of paragraph by working back
2516 * through prevline. DW.
2517 * Need to check this only for "normal" paras.
2518 * So what is a "normal" para?
2519 * Not normal if one-liner (chapter headings, etc.)
2520 * Not normal if it doesn't contain at least one lowercase letter
2521 * Not normal if starts with space
/*
 * Checks the last line of a paragraph (passed in as prevline) for a
 * missing final punctuation mark. The check applies only when prevline
 * contains at least one letter, last->blen is above 2, the paragraph
 * began before line linecnt-1, and prevline's first character is above
 * CHAR_SPACE. Starting from the end of prevline the scan first steps
 * back over characters that QUOTE_CLASS() rates as closing or neutral
 * quotes, then keeps walking backwards: meeting a letter produces the
 * "No punctuation at para end?" query against line linecnt-1, with the
 * character count of prevline as the column; the characters in the
 * punctuation set at the end of the loop are also tested for,
 * presumably to end the scan without a query - confirm.
 */
2523 void check_for_omitted_punctuation(const char *prevline,
2524 struct line_properties *last,int start_para_line)
2526 gboolean letter_on_line=FALSE;
2529 gboolean closing_quote;
2530 for (s=prevline;*s;s=g_utf8_next_char(s))
2531 if (g_unichar_isalpha(g_utf8_get_char(s)))
2533 letter_on_line=TRUE;
2537 * This next "if" is a problem.
2538 * If we say "start_para_line <= linecnt - 1", that includes
2539 * one-line "paragraphs" like chapter heads. Lotsa false positives.
2540 * If we say "start_para_line < linecnt - 1" it doesn't, but then it
2541 * misses genuine one-line paragraphs.
2543 if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
2544 g_utf8_get_char(prevline)>CHAR_SPACE)
2546 s=prevline+strlen(prevline);
2549 s=g_utf8_prev_char(s);
2550 c=g_utf8_get_char(s);
2551 if (QUOTE_CLASS(c)==CLOSING_QUOTE || QUOTE_CLASS(c)==NEUTRAL_QUOTE)
2554 closing_quote=FALSE;
2555 } while (closing_quote && s>prevline);
2556 for (;s>prevline;s=g_utf8_prev_char(s))
2558 if (g_unichar_isalpha(g_utf8_get_char(s)))
2560 if (pswit[ECHO_SWITCH])
2561 g_print("\n%s\n",prevline);
2562 if (!pswit[OVERVIEW_SWITCH])
2563 g_print(" Line %ld column %ld - "
2564 "No punctuation at para end?\n",
2565 linecnt-1,g_utf8_strlen(prevline,-1));
2570 if (g_utf8_strchr("—-.:!([{?}])",-1,g_utf8_get_char(s)))
/*
 * Prints a note giving a queried word and how many times it was
 * duplicated. key is used as the word text. The (key, value, data)
 * parameter list suggests this is a tree traversal callback, presumably
 * run over the tree of queried words with value holding the count -
 * confirm against the caller.
 */
2576 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
2578 const char *word=key;
2581 g_print("\nNote: Queried word %s was duplicated %d times\n",
/*
 * Writes string to stdout, converted from UTF-8 to WINDOWS-1252 where
 * possible. The converter is held in a function-level static so it is
 * opened once, on first need, and reused by later calls; there is also
 * a path that closes it and resets it to (GIConv)-1, presumably
 * selected by a special argument value - confirm. The output buffer is
 * allocated at the byte length of the input plus one. When no
 * converter is available the string appears to be written as it is.
 * NOTE(review): the result of g_iconv() is not checked on the line
 * that calls it.
 */
2586 void print_as_windows_1252(const char *string)
2588 gsize inbytes,outbytes;
2590 static GIConv converter=(GIConv)-1;
2593 if (converter!=(GIConv)-1)
2594 g_iconv_close(converter);
2595 converter=(GIConv)-1;
2598 if (converter==(GIConv)-1)
2599 converter=g_iconv_open("WINDOWS-1252","UTF-8");
2600 if (converter!=(GIConv)-1)
2602 inbytes=outbytes=strlen(string);
2603 bp=buf=g_malloc(outbytes+1);
2604 g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
2610 fputs(string,stdout);
/*
 * Writes string to stdout unchanged, with no character set conversion
 * and no added newline.
 */
2613 void print_as_utf_8(const char *string)
2615 fputs(string,stdout);
/*
 * procfile: check one e-text file.
 *
 * Reads the file with read_etext(), runs first_pass() and
 * report_first_pass() over it, then fetches the text one line at a time
 * with flgets() and hands each line to the individual check_for_*()
 * routines, finally releasing the per-file state.
 *
 * NOTE(review): this listing omits a number of original lines (braces,
 * "else" branches, some declarations and assignments - see the gaps in
 * the embedded line numbers), so the nesting of the statements below is
 * not fully visible here.
 */
2623 void procfile(const char *filename)
2626 gchar *parastart=NULL; /* first line of current para */
2627 gchar *etext,*aline;
2630 struct first_pass_results *first_pass_results;
2631 struct warnings *warnings;
2632 struct counters counters={0};
2633 struct line_properties last={0};
2634 struct parities parities={0};
2635 struct pending pending={0};
2636 gboolean isemptyline;
2637 long start_para_line=0;
2638 gboolean isnewpara=FALSE,enddash=FALSE;
2639 last.start=CHAR_SPACE;
2640 linecnt=checked_linecnt=0;
2641 etext=read_etext(filename,&err);
/*
 * Print the error message from read_etext() on stdout or stderr depending
 * on STDOUT_SWITCH. NOTE(review): presumably only reached when the read
 * failed; the guarding test is not visible in this listing.
 */
2644 if (pswit[STDOUT_SWITCH])
2645 fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
2647 fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
2650 g_print("\n\nFile: %s\n\n",filename);
2651 first_pass_results=first_pass(etext);
2652 warnings=report_first_pass(first_pass_results);
/*
 * Both trees are ordered with strcmp() and free their keys with g_free();
 * qword additionally frees its values with g_free(), qperiod does not.
 */
2653 qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
2654 qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
2656 * Here we go with the main pass. Hold onto yer hat!
/* One iteration per line; the loop ends when flgets() returns NULL. */
2660 while ((aline=flgets(&etext_ptr,linecnt+1,warnings->newlines)))
2665 if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
2666 continue; // skip DP page separators completely
/*
 * Lines before firstline, or after footerline when that is positive, are
 * not checked. With HEADER_SWITCH set, the Title/Author/Release Date/
 * Edition lines among them are echoed.
 */
2667 if (linecnt<first_pass_results->firstline ||
2668 (first_pass_results->footerline>0 &&
2669 linecnt>first_pass_results->footerline))
2671 if (pswit[HEADER_SWITCH])
2673 if (g_str_has_prefix(aline,"Title:"))
2674 g_print(" %s\n",aline);
2675 if (g_str_has_prefix(aline,"Author:"))
2676 g_print(" %s\n",aline);
2677 if (g_str_has_prefix(aline,"Release Date:"))
2678 g_print(" %s\n",aline);
2679 if (g_str_has_prefix(aline,"Edition:"))
2680 g_print(" %s\n\n",aline);
2682 continue; /* skip through the header */
2685 print_pending(aline,parastart,&pending);
2686 isemptyline=analyse_quotes(aline,&counters);
2687 if (isnewpara && !isemptyline)
2689 /* This line is the start of a new paragraph. */
2690 start_para_line=linecnt;
2691 /* Capture its first line in case we want to report it later. */
2693 parastart=g_strdup(aline);
2694 memset(&parities,0,sizeof(parities)); /* restart the quote count */
/* Advance s to the first letter or digit on the line (or to its end). */
2696 while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
2697 !g_unichar_isdigit(g_utf8_get_char(s)))
2698 s=g_utf8_next_char(s);
2699 if (g_unichar_islower(g_utf8_get_char(s)))
2701 /* and its first letter is lowercase */
2702 if (pswit[ECHO_SWITCH])
2703 g_print("\n%s\n",aline);
2704 if (!pswit[OVERVIEW_SWITCH])
2705 g_print(" Line %ld column %ld - "
2706 "Paragraph starts with lower-case\n",
2707 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2711 isnewpara=FALSE; /* Signal the end of new para processing. */
2713 /* Check for an em-dash broken at line end. */
2714 if (enddash && g_utf8_get_char(aline)=='-')
2716 if (pswit[ECHO_SWITCH])
2717 g_print("\n%s\n",aline);
2718 if (!pswit[OVERVIEW_SWITCH])
2719 g_print(" Line %ld column 1 - Broken em-dash?\n",linecnt);
/*
 * Walk s back from the last character over any trailing spaces, then test
 * whether the character it stops on is a hyphen. NOTE(review): the
 * statement run when it is a hyphen is not visible here; presumably it
 * sets enddash for the next line.
 */
2724 for (s=g_utf8_prev_char(aline+strlen(aline));
2725 g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
2727 if (s>=aline && g_utf8_get_char(s)=='-')
2729 check_for_control_characters(aline);
2731 check_for_odd_characters(aline,warnings,isemptyline);
2732 if (warnings->longline)
2733 check_for_long_line(aline);
2734 if (warnings->shortline)
2735 check_for_short_line(aline,&last);
/* Record this line's length in characters and its first character. */
2737 last.len=g_utf8_strlen(aline,-1);
2738 last.start=g_utf8_get_char(aline);
2739 check_for_starting_punctuation(aline);
2742 check_for_spaced_emdash(aline);
2743 check_for_spaced_dash(aline);
2745 check_for_unmarked_paragraphs(aline);
2746 check_for_jeebies(aline);
2747 check_for_mta_from(aline);
2748 check_for_orphan_character(aline);
2749 check_for_pling_scanno(aline);
2750 check_for_extra_period(aline,warnings);
2751 check_for_following_punctuation(aline);
2752 check_for_typos(aline,warnings);
2753 check_for_misspaced_punctuation(aline,&parities,isemptyline);
2754 check_for_double_punctuation(aline,warnings);
2755 check_for_spaced_quotes(aline);
2756 check_for_miscased_genative(aline);
2757 check_end_of_line(aline,warnings);
2758 check_for_unspaced_bracket(aline);
2759 if (warnings->endquote)
2760 check_for_unpunctuated_endquote(aline);
2761 check_for_html_tag(aline);
2762 check_for_html_entity(aline);
/*
 * NOTE(review): the next four statements look like the end-of-paragraph
 * handling for an empty line; the enclosing test is not visible here.
 */
2765 check_for_mismatched_quotes(&counters,&pending);
2766 counters_reset(&counters);
2767 /* let the next iteration know that it's starting a new para */
2770 check_for_omitted_punctuation(prevline,&last,start_para_line);
/* Keep a copy of this line as "prevline". */
2773 prevline=g_strdup(aline);
/* Flush the quote checks and any pending reports. */
2776 check_for_mismatched_quotes(&counters,&pending);
2777 print_pending(NULL,parastart,&pending);
2778 reset_pending(&pending);
/*
 * Per-file tear-down. Duplicate-query notes are printed only when neither
 * OVERVIEW_SWITCH nor VERBOSE_SWITCH is set. print_as_windows_1252(NULL)
 * closes that function's cached converter.
 */
2787 if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])
2788 g_tree_foreach(qword,report_duplicate_queries,NULL);
2789 g_tree_unref(qword);
2790 g_tree_unref(qperiod);
2791 counters_destroy(&counters);
2792 g_set_print_handler(NULL);
2793 print_as_windows_1252(NULL);
2794 if (pswit[MARKUP_SWITCH])
2801 * Get one line from the input text. The setting of newlines has the following
2804 * DOS_NEWLINES: Check for the existence of exactly one CR-LF line-end per line.
2806 * OS9_NEWLINES: Asserts that etext contains no LFs. CR is used as
2807 * the newline character.
2809 * UNIX_NEWLINES: Check for the presence of CRs.
2811 * In all cases, check that the last line is correctly terminated.
2813 * Returns: a pointer to the line.
/*
 * flgets: fetch the next line from the text at *etext.
 *
 * The line starts at the current *etext, which is advanced one character
 * at a time as the line is scanned. "lcnt" is only used as the line number
 * in diagnostics. "newlines" (DOS_NEWLINES, OS9_NEWLINES or UNIX_NEWLINES)
 * selects which line-end anomalies are reported. Diagnostics are printed
 * only when LINE_END_SWITCH is set, the offending line is echoed first
 * when ECHO_SWITCH is set, and the message itself is suppressed by
 * OVERVIEW_SWITCH. Before returning, the line is passed through
 * postprocess_for_HTML() and/or postprocess_for_DP() when MARKUP_SWITCH
 * and/or DP_SWITCH are set.
 *
 * NOTE(review): this listing omits some original lines (braces, the tests
 * on the character "c", breaks and returns - see the gaps in the embedded
 * line numbers), so the branch structure is only partly visible here.
 */
2815 char *flgets(char **etext,long lcnt,int newlines)
2818 gboolean isCR=FALSE;
2819 char *theline=*etext;
2824 c=g_utf8_get_char(*etext);
/*
 * NOTE(review): the branch below appears to handle reaching the end of the
 * text. *etext==theline means no character of a new line was consumed.
 * Otherwise the final line has no terminator, which is reported as a
 * missing CR (OS9) or a missing LF (other modes).
 */
2827 if (*etext==theline)
2829 else if (pswit[LINE_END_SWITCH])
2831 if (pswit[ECHO_SWITCH])
2833 s=g_strndup(theline,eos-theline);
2834 g_print("\n%s\n",s);
2837 if (!pswit[OVERVIEW_SWITCH])
2839 if (newlines==OS9_NEWLINES)
2840 g_print(" Line %ld - No CR?\n",lcnt);
2843 /* There may, or may not, have been a CR */
2844 g_print(" Line %ld - No LF?\n",lcnt);
/* Consume the character just examined. */
2852 *etext=g_utf8_next_char(*etext);
2853 /* either way, it's end of line */
2856 if (newlines==DOS_NEWLINES && !isCR)
2858 /* Error - a LF without a preceding CR */
2859 if (pswit[LINE_END_SWITCH])
2861 if (pswit[ECHO_SWITCH])
2863 s=g_strndup(theline,eos-theline);
2864 g_print("\n%s\n",s);
2867 if (!pswit[OVERVIEW_SWITCH])
2868 g_print(" Line %ld - No CR?\n",lcnt);
/*
 * NOTE(review): the branch below appears to handle a CR character. A CR is
 * reported when one was already seen on this line (isCR) or when the file
 * uses UNIX newlines.
 */
2877 if (newlines==OS9_NEWLINES)
2879 if (isCR || newlines==UNIX_NEWLINES)
2881 if (pswit[LINE_END_SWITCH])
2883 if (pswit[ECHO_SWITCH])
2885 s=g_strndup(theline,eos-theline);
2886 g_print("\n%s\n",s);
2889 if (!pswit[OVERVIEW_SWITCH])
2891 if (newlines==UNIX_NEWLINES)
2892 g_print(" Line %ld column %ld - Embedded CR?\n",
2893 lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
2895 g_print(" Line %ld - Two successive CRs?\n",
2901 if (newlines==UNIX_NEWLINES)
2904 if (newlines==DOS_NEWLINES)
/*
 * NOTE(review): the branch below appears to handle an ordinary character.
 * If a CR was pending (isCR), that CR was not followed by a LF.
 */
2909 if (pswit[LINE_END_SWITCH] && isCR)
2911 if (pswit[ECHO_SWITCH])
2913 s=g_strndup(theline,eos-theline);
2914 g_print("\n%s\n",s);
2917 if (!pswit[OVERVIEW_SWITCH])
2918 g_print(" Line %ld column %ld - CR without LF?\n",
2919 lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
/* eos tracks the end of the line text collected so far. */
2925 eos=g_utf8_next_char(eos);
/* Optional in-place clean-up of the finished line. */
2929 if (pswit[MARKUP_SWITCH])
2930 postprocess_for_HTML(theline);
2931 if (pswit[DP_SWITCH])
2932 postprocess_for_DP(theline);
2939 * Takes a "word" as a parameter, and checks whether it
2940 * contains a mixture of alpha and digits. Generally, this is an
2941 * error, but may not be for cases like 4th or L5 12s. 3d.
2943 * Returns: TRUE iff an is error found.
2945 gboolean mixdigit(const char *checkword)
2947 gboolean wehaveadigit,wehavealetter,query;
2948 const char *s,*nondigit;
2949 wehaveadigit=wehavealetter=query=FALSE;
2950 for (s=checkword;*s;s=g_utf8_next_char(s))
2951 if (g_unichar_isalpha(g_utf8_get_char(s)))
2953 else if (g_unichar_isdigit(g_utf8_get_char(s)))
2955 if (wehaveadigit && wehavealetter)
2957 /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
2959 for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
2960 nondigit=g_utf8_next_char(nondigit))
2962 /* digits, ending in st, rd, nd, th of either case */
2963 if (!g_ascii_strcasecmp(nondigit,"st") ||
2964 !g_ascii_strcasecmp(nondigit,"rd") ||
2965 !g_ascii_strcasecmp(nondigit,"nd") ||
2966 !g_ascii_strcasecmp(nondigit,"th"))
2968 if (!g_ascii_strcasecmp(nondigit,"sts") ||
2969 !g_ascii_strcasecmp(nondigit,"rds") ||
2970 !g_ascii_strcasecmp(nondigit,"nds") ||
2971 !g_ascii_strcasecmp(nondigit,"ths"))
2973 if (!g_ascii_strcasecmp(nondigit,"stly") ||
2974 !g_ascii_strcasecmp(nondigit,"rdly") ||
2975 !g_ascii_strcasecmp(nondigit,"ndly") ||
2976 !g_ascii_strcasecmp(nondigit,"thly"))
2978 /* digits, ending in l, L, s or d */
2979 if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
2980 !strcmp(nondigit,"d"))
2983 * L at the start of a number, representing Britsh pounds, like L500.
2984 * This is cute. We know the current word is mixed digit. If the first
2985 * letter is L, there must be at least one digit following. If both
2986 * digits and letters follow, we have a genuine error, else we have a
2987 * capital L followed by digits, and we accept that as a non-error.
2989 if (g_utf8_get_char(checkword)=='L' &&
2990 !mixdigit(g_utf8_next_char(checkword)))
2999 * Extracts the first/next "word" from the line, and returns it.
3000 * A word is defined as one English word unit--or at least that's the aim.
3001 * "ptr" is advanced to the position in the line where we will start
3002 * looking for the next word.
3004 * Returns: A newly-allocated string.
3006 gchar *getaword(const char **ptr)
3011 word=g_string_new(NULL);
3012 for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
3013 !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
3014 **ptr;*ptr=g_utf8_next_char(*ptr))
3016 /* Handle exceptions for footnote markers like [1] */
3017 if (g_utf8_get_char(*ptr)=='[')
3019 g_string_append_c(word,'[');
3020 s=g_utf8_next_char(*ptr);
3021 for (;g_unichar_isdigit(g_utf8_get_char(s));s=g_utf8_next_char(s))
3022 g_string_append_unichar(word,g_utf8_get_char(s));
3023 if (g_utf8_get_char(s)==']')
3025 g_string_append_c(word,']');
3026 *ptr=g_utf8_next_char(s);
3027 return g_string_free(word,FALSE);
3030 g_string_truncate(word,0);
3034 * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
3035 * Especially yucky is the case of L1,000
3036 * This section looks for a pattern of characters including a digit
3037 * followed by a comma or period followed by one or more digits.
3038 * If found, it returns this whole pattern as a word; otherwise we discard
3039 * the results and resume our normal programming.
3042 for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
3043 g_unichar_isalpha(g_utf8_get_char(s)) ||
3044 g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
3045 g_string_append_unichar(word,g_utf8_get_char(s));
3048 for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))
3050 c=g_utf8_get_char(t);
3051 pc=g_utf8_get_char(g_utf8_prev_char(t));
3052 if ((c=='.' || c==',') && g_unichar_isdigit(pc))
3055 return g_string_free(word,FALSE);
3059 /* we didn't find a punctuated number - do the regular getword thing */
3060 g_string_truncate(word,0);
3061 c=g_utf8_get_char(*ptr);
3062 for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);
3063 *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))
3064 g_string_append_unichar(word,c);
3065 return g_string_free(word,FALSE);
3071 * Is this word a Roman Numeral?
3073 * It doesn't actually validate that the number is a valid Roman Numeral--for
3074 * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
3075 * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
3076 * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
3077 * expressions thereof, except when it came to taxes. Allow any number of M,
3078 * an optional D, an optional CM or CD, any number of optional Cs, an optional
3079 * XL or an optional XC, an optional IX or IV, an optional V and any number
3082 gboolean isroman(const char *t)
3088 while (g_utf8_get_char(t)=='m' && *t)
3090 if (g_utf8_get_char(t)=='d')
3092 if (g_str_has_prefix(t,"cm"))
3094 if (g_str_has_prefix(t,"cd"))
3096 while (g_utf8_get_char(t)=='c' && *t)
3098 if (g_str_has_prefix(t,"xl"))
3100 if (g_str_has_prefix(t,"xc"))
3102 if (g_utf8_get_char(t)=='l')
3104 while (g_utf8_get_char(t)=='x' && *t)
3106 if (g_str_has_prefix(t,"ix"))
3108 if (g_str_has_prefix(t,"iv"))
3110 if (g_utf8_get_char(t)=='v')
3112 while (g_utf8_get_char(t)=='i' && *t)
3118 * postprocess_for_DP:
3120 * Invoked with the -d switch from flgets().
3121 * It simply "removes" from the line a hard-coded set of common
3122 * DP-specific tags, so that the line passed to the main routine has
3123 * been pre-cleaned of DP markup.
3125 void postprocess_for_DP(char *theline)
3131 for (i=0;*DPmarkup[i];i++)
3132 while ((s=strstr(theline,DPmarkup[i])))
3134 t=s+strlen(DPmarkup[i]);
3135 memmove(s,t,strlen(t)+1);
/*
 * postprocess_for_HTML:
 *
 * Invoked with the -m switch from flgets().
 * It simply "removes" from the line a hard-coded set of common HTML
 * tags and "replaces" a hard-coded set of common HTML entities, so that
 * the line passed to the main routine has been pre-cleaned of HTML.
 *
 * losemarkup() is called until it reports that it removed nothing, then
 * loseentities() is run once over what is left.
 */
void postprocess_for_HTML(char *theline)
{
    char *removed;
    do
        removed=losemarkup(theline);
    while (removed);
    loseentities(theline);
}
3155 char *losemarkup(char *theline)
3159 s=strchr(theline,'<');
3160 t=s?strchr(s,'>'):NULL;
3163 for (i=0;*markup[i];i++)
3164 if (tagcomp(g_utf8_next_char(s),markup[i]))
3166 t=g_utf8_next_char(t);
3167 memmove(s,t,strlen(t)+1);
3170 /* It's an unrecognized <xxx>. */
/*
 * loseentities: replace HTML entities in the line, in place.
 *
 * Each "&...;" sequence is resolved to a character code: decimal digits
 * via strtol() base 10, hexadecimal digits after an 'x' via strtol() base
 * 16, or a named entity looked up in a tree built from the HTMLentities
 * table. The character is written into the line and the text after the
 * ';' is moved down to follow it.
 *
 * NOTE(review): this listing omits many original lines (declarations,
 * braces, tests and "else" branches - see the gaps in the embedded line
 * numbers), so the conditions that guard several statements below are not
 * visible here.
 */
3174 void loseentities(char *theline)
3181 GTree *entities=NULL;
3182 static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
/*
 * Tear-down: destroy the entity tree and close both converters.
 * NOTE(review): presumably reached when theline is NULL; the guarding test
 * is not visible in this listing.
 */
3186 g_tree_destroy(entities);
3188 if (translit!=(GIConv)-1)
3189 g_iconv_close(translit);
3190 translit=(GIConv)-1;
3191 if (to_utf8!=(GIConv)-1)
3192 g_iconv_close(to_utf8);
/* Build the name-to-character lookup tree from the HTMLentities table. */
3200 entities=g_tree_new((GCompareFunc)strcmp);
3201 for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
3202 g_tree_insert(entities,HTMLentities[i].name,
3203 GUINT_TO_POINTER(HTMLentities[i].c));
/*
 * Open the converters if they are not already open: UTF-8 to ISO 8859-1
 * with transliteration, and ISO 8859-1 back to UTF-8.
 */
3205 if (translit==(GIConv)-1)
3206 translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
3207 if (to_utf8==(GIConv)-1)
3208 to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
/* One iteration per '&' found at or after the current position. */
3209 while((amp=strchr(theline,'&')))
3211 scolon=strchr(amp,';');
/*
 * The numeric forms are tested from amp+2, i.e. skipping "&" and one more
 * character. NOTE(review): presumably that character has already been
 * checked to be '#'; the test is not visible in this listing.
 */
3216 if (amp+2+strspn(amp+2,"0123456789")==scolon)
3217 c=strtol(amp+2,NULL,10);
3218 else if (amp[2]=='x' &&
3219 amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
3220 c=strtol(amp+3,NULL,16);
/* Named entity: look up the text between '&' and ';'. */
3224 s=g_strndup(amp+1,scolon-(amp+1));
3225 c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
/* && binds tighter than ||, so this reads: c<128 || (c>=192 && c<=255). */
3234 if (c<128 || c>=192 && c<=255) /* An ISO-8859-1 character */
3235 theline+=g_unichar_to_utf8(c,theline);
/*
 * Otherwise the character is encoded as UTF-8, converted to ISO 8859-1
 * with transliteration, converted back to UTF-8, and the resulting nb
 * bytes are copied into the line.
 */
3239 nb=g_unichar_to_utf8(c,s);
3240 t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
3242 s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
3244 memcpy(theline,s,nb);
/* Move the text after the ';' down, terminator included. */
3248 memmove(theline,g_utf8_next_char(scolon),
3249 strlen(g_utf8_next_char(scolon))+1);
/* Resume the search just after this '&'. */
3252 theline=g_utf8_next_char(amp);
3256 gboolean tagcomp(const char *strin,const char *basetag)
3260 if (g_utf8_get_char(strin)=='/')
3261 t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
3263 t=g_utf8_casefold(strin,-1);
3264 s=g_utf8_casefold(basetag,-1);
3265 retval=g_str_has_prefix(t,s);
3271 void proghelp(GOptionContext *context)
3274 fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
3275 fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
3276 fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
3277 fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
3278 "For details, read the file COPYING.\n",stderr);
3279 fputs("This is Free Software; "
3280 "you may redistribute it under certain conditions (GPL);\n",stderr);
3281 fputs("read the file COPYING for details.\n\n",stderr);
3282 help=g_option_context_get_help(context,TRUE,NULL);
3285 fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
3286 fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
3287 "non-ASCII\n",stderr);
3288 fputs("characters like accented letters, "
3289 "lines longer than 75 or shorter than 55,\n",stderr);
3290 fputs("unbalanced quotes or brackets, "
3291 "a variety of badly formatted punctuation, \n",stderr);
3292 fputs("HTML tags, some likely typos. "
3293 "It is NOT a substitute for human judgement.\n",stderr);