1 /*************************************************************************/
2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com> */
5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk> */
7 /* This program is free software; you can redistribute it and/or modify */
8 /* it under the terms of the GNU General Public License as published by */
9 /* the Free Software Foundation; either version 2 of the License, or */
10 /* (at your option) any later version. */
12 /* This program is distributed in the hope that it will be useful, */
13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
15 /* GNU General Public License for more details. */
17 /* You should have received a copy of the GNU General Public License */
18 /* along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 /*************************************************************************/
30 #include "bookloupe.h"
33 #include "HTMLentities.h"
/*
 * Name of the character set the e-text is validated against.
 * set_charset() stores "UTF-8" here for any of the UNICODE aliases.
 */
35 gchar *charset; /* Or NULL for auto (ISO_8859-1/ASCII or UNICODE) */
/*
 * iconv converter opened by set_charset() from UTF-8 to charset;
 * (GIConv)-1 when no converter is open (charset is auto or UTF-8).
 */
36 GIConv charset_validator=(GIConv)-1;
42 "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
43 "nad", "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa",
44 "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
45 "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
46 "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
47 "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
48 "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
49 "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
50 "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
51 "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
52 "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
53 "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
54 "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
55 "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
56 "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
57 "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
58 "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
59 "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
60 "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
61 "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
62 "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
63 "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
64 "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
65 "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
66 "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
67 "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
68 "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
69 "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
70 "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
76 /* Common abbreviations and other OK words not to query as typos. */
78 "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
79 "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
80 "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
81 "outbid", "outbids", "frostbite", "frostbitten", ""
84 /* Common abbreviations that cause otherwise unexplained periods. */
86 "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
87 "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
91 * Two-Letter combinations that rarely if ever start words,
92 * but are common scannos or otherwise common letter combinations.
95 "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
99 * Two-Letter combinations that rarely if ever end words,
100 * but are common scannos or otherwise common letter combinations.
103 "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
104 "sw", "gr", "sl", "cl", "iy", ""
108 "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
109 "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
110 "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
111 "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
115 "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
119 "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
120 "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
121 "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
122 "during", "let", "toward", "among", ""
126 "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
127 "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
128 "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
129 "among", "those", "into", "whom", "having", "thence", ""
132 gboolean pswit[SWITNO]; /* program switches */
/*
 * Command-line option table handed to GOption by parse_options().
 * Each flag option stores directly into its pswit[] slot; --charset
 * stores its argument in opt_charset.
 * The "relaxed", "line-end" and "noecho" flags are inverted by
 * parse_options() after parsing, so giving them turns the feature OFF.
 */
135 static GOptionEntry options[]={
136 { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
137 "Ignore DP-specific markup", NULL },
138 { "noecho", 'e', 0, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
139 "Don't echo queried line", NULL },
140 { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
141 "Check single quotes", NULL },
142 { "typo", 't', 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
143 "Check common typos", NULL },
144 { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
145 "Require closure of quotes on every paragraph", NULL },
146 { "relaxed", 'x', 0, G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
147 "Disable paranoid querying of everything", NULL },
148 { "line-end", 'l', 0, G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
149 "Disable line end checking", NULL },
150 { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
151 "Overview: just show counts", NULL },
152 { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
153 "Output errors to stdout instead of stderr", NULL },
154 { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
155 "Echo header fields", NULL },
156 { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
157 "Ignore markup in < >", NULL },
158 { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
159 "Use file of user-defined typos", NULL },
160 { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
161 "Defaults for use on www upload", NULL },
162 { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
163 "Verbose - list everything", NULL },
164 { "charset", 0, 0, G_OPTION_ARG_STRING, &opt_charset,
165 "Set of characters valid for this ebook", "NAME" },
/*
 * Running totals for the whole run. The cnt_* query counters are summed
 * and printed by main() when the overview switch is set.
 */
169 long cnt_quote; /* for overview mode, count of quote queries */
170 long cnt_brack; /* for overview mode, count of brackets queries */
171 long cnt_bin; /* for overview mode, count of non-ASCII queries */
172 long cnt_odd; /* for overview mode, count of odd character queries */
173 long cnt_long; /* for overview mode, count of long line errors */
174 long cnt_short; /* for overview mode, count of short line queries */
175 long cnt_punct; /* for overview mode,
176 count of punctuation and spacing queries */
177 long cnt_dash; /* for overview mode, count of dash-related queries */
178 long cnt_word; /* for overview mode, count of word queries */
179 long cnt_html; /* for overview mode, count of html queries */
180 long cnt_lineend; /* for overview mode, count of line-end queries */
181 long cnt_spacend; /* count of lines with space at end */
182 long linecnt; /* count of total lines in the file */
183 long checked_linecnt; /* count of lines actually checked */
/* Forward declarations for functions defined later in this file. */
185 void proghelp(GOptionContext *context);
186 void procfile(const char *);
190 gboolean mixdigit(const char *);
191 gchar *getaword(const char **);
192 char *flgets(char **,long);
193 void postprocess_for_HTML(char *);
194 char *linehasmarkup(char *);
195 char *losemarkup(char *);
196 gboolean tagcomp(const char *,const char *);
197 void loseentities(char *);
198 gboolean isroman(const char *);
199 void postprocess_for_DP(char *);
200 void print_as_windows_1252(const char *string);
201 void print_as_utf_8(const char *string);
/* NOTE(review): presumably the per-file sets of queried words/periods - confirm. */
203 GTree *qword,*qperiod;
209 gboolean set_charset(const char *name,GError **err)
211 /* The various UNICODE encodings all share the same character set. */
212 const char *unicode_aliases[]={ "UCS-2", "UCS-2BE", "UCS-2LE", "UCS-4",
213 "UCS-4BE", "UCS-4LE", "UCS2", "UCS4", "UNICODE", "UNICODEBIG",
214 "UNICODELITTLE", "UTF-7", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE",
215 "UTF-32", "UTF-32BE", "UTF-32LE", "UTF7", "UTF8", "UTF16", "UTF16BE",
216 "UTF16LE", "UTF32", "UTF32BE", "UTF32LE" };
220 if (charset_validator==(GIConv)-1)
221 g_iconv_close(charset_validator);
222 if (!name || !g_strcasecmp(name,"auto"))
225 charset_validator=(GIConv)-1;
229 charset=g_strdup(name);
230 for(i=0;i<G_N_ELEMENTS(unicode_aliases);i++)
231 if (!g_strcasecmp(charset,unicode_aliases[i]))
234 charset=g_strdup("UTF-8");
237 if (!strcmp(charset,"UTF-8"))
238 charset_validator=(GIConv)-1;
241 charset_validator=g_iconv_open(charset,"UTF-8");
242 if (charset_validator==(GIConv)-1)
244 g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_NO_CONVERSION,
245 "Unknown character set \"%s\"",charset);
/*
 * parse_options:
 *
 * Parse the command line with GOption using the options[] table, then
 * post-process pswit[]: invert the switches that turn a feature off,
 * silence echoing in overview mode, apply the fixed web-upload preset,
 * and finally apply --charset through set_charset().
 * Parse and charset errors are reported with g_printerr().
 */
252 void parse_options(int *argc,char ***argv)
255 GOptionContext *context;
256 context=g_option_context_new(
257 "file - looks for errors in Project Gutenberg(TM) etexts");
258 g_option_context_add_main_entries(context,options,NULL);
259 if (!g_option_context_parse(context,argc,argv,&err))
261 g_printerr("Bookloupe: %s\n",err->message);
262 g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
265 /* Paranoid checking is turned OFF, not on, by its switch */
266 pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
267 if (pswit[PARANOID_SWITCH])
268 /* if running in paranoid mode, typo checks default to enabled */
/* The toggle below means -t in paranoid mode turns typo checks OFF. */
269 pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
270 /* Line-end checking is turned OFF, not on, by its switch */
271 pswit[LINE_END_SWITCH]=!pswit[LINE_END_SWITCH];
272 /* Echoing is turned OFF, not on, by its switch */
273 pswit[ECHO_SWITCH]=!pswit[ECHO_SWITCH];
274 if (pswit[OVERVIEW_SWITCH])
275 /* just print summary; don't echo */
276 pswit[ECHO_SWITCH]=FALSE;
278 * Web uploads - for the moment, this is really just a placeholder
279 * until we decide what processing we really want to do on web uploads
281 if (pswit[WEB_SWITCH])
283 /* specific override for web uploads */
284 pswit[ECHO_SWITCH]=TRUE;
285 pswit[SQUOTE_SWITCH]=FALSE;
286 pswit[TYPO_SWITCH]=TRUE;
287 pswit[QPARA_SWITCH]=FALSE;
288 pswit[PARANOID_SWITCH]=TRUE;
289 pswit[LINE_END_SWITCH]=FALSE;
290 pswit[OVERVIEW_SWITCH]=FALSE;
291 pswit[STDOUT_SWITCH]=FALSE;
292 pswit[HEADER_SWITCH]=TRUE;
293 pswit[VERBOSE_SWITCH]=FALSE;
294 pswit[MARKUP_SWITCH]=FALSE;
295 pswit[USERTYPO_SWITCH]=FALSE;
296 pswit[DP_SWITCH]=FALSE;
/* An unknown --charset name is reported with the message set_charset() produced. */
298 if (opt_charset && !set_charset(opt_charset,&err))
300 g_printerr("%s\n",err->message);
310 g_option_context_free(context);
316 * Read in the user-defined stealth scanno list.
/*
 * read_user_scannos:
 *
 * Load the user typo list into the usertypo tree.
 * The file is looked for, in order, as "bookloupe.typ" in the current
 * directory, "bookloupe.typ" in running_from, "gutcheck.typ" in the
 * current directory and "gutcheck.typ" in running_from; each later
 * attempt is made only when the previous one failed with
 * G_FILE_ERROR_NOENT. If none exists a notice is printed; any other
 * read error is written to stderr.
 * Contents that are valid UTF-8 are normalized (and the charset is set
 * to UNICODE); anything else is converted from WINDOWS-1252.
 */
318 void read_user_scannos(void)
321 gchar *usertypo_file;
325 gchar *contents,*utf8,**lines;
326 usertypo_file=g_strdup("bookloupe.typ");
327 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
328 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
331 g_free(usertypo_file);
332 usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
333 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
335 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
338 g_free(usertypo_file);
339 usertypo_file=g_strdup("gutcheck.typ");
340 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
342 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
345 g_free(usertypo_file);
346 usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
347 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
349 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
351 g_free(usertypo_file);
352 g_print(" --> I couldn't find bookloupe.typ "
353 "-- proceeding without user typos.\n");
358 fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
359 g_free(usertypo_file);
363 if (g_utf8_validate(contents,len,NULL))
365 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
367 (void)set_charset("UNICODE",NULL);
370 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
/* One typo per line; both CR and LF act as separators. */
372 lines=g_strsplit_set(utf8,"\r\n",0);
/* The tree is created with g_free as the key destroy function. */
374 usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
375 for (i=0;lines[i];i++)
/* Only lines whose first byte is above '!' are inserted as typos. */
376 if (*(unsigned char *)lines[i]>'!')
377 g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
386 * Read an etext returning a newly allocated string containing the file
387 * contents or NULL on error.
/*
 * read_etext:
 *
 * filename: path of the e-text to load.
 * err:      return location for a GError describing a read or
 *           conversion failure.
 *
 * The file is read whole. Valid UTF-8 input is normalized and output is
 * switched to UTF-8; any other input is converted from Windows-1252 and
 * output is switched to Windows-1252. When the conversion stops on an
 * illegal sequence, the bytes before the failure are scanned for '\n'
 * and '\r' so that the error can name the line and column of the
 * offending byte.
 */
389 gchar *read_etext(const char *filename,GError **err)
391 GError *tmp_err=NULL;
392 gchar *contents,*utf8;
393 gsize len,bytes_read,bytes_written;
395 if (!g_file_get_contents(filename,&contents,&len,err))
397 if (g_utf8_validate(contents,len,NULL))
399 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
400 g_set_print_handler(print_as_utf_8);
402 SetConsoleOutputCP(CP_UTF8);
407 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,
408 &bytes_written,&tmp_err);
409 if (g_error_matches(tmp_err,G_CONVERT_ERROR,
410 G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
/* bytes_read is used as the index of the first byte that failed to convert. */
413 for(i=0;i<bytes_read;i++)
414 if (contents[i]=='\n')
419 else if (contents[i]!='\r')
421 g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
422 "Input conversion failed. Byte %d at line %d, column %d is not a "
423 "valid Windows-1252 character",
424 ((unsigned char *)contents)[bytes_read],line,col);
/* Any other conversion failure is passed to the caller unchanged. */
427 g_propagate_error(err,tmp_err);
428 g_set_print_handler(print_as_windows_1252);
430 SetConsoleOutputCP(1252);
/*
 * cleanup_on_exit:
 *
 * atexit() handler registered by main(): restores the console output
 * code page that main() stored in saved_cp.
 */
437 void cleanup_on_exit(void)
440 SetConsoleOutputCP(saved_cp);
/*
 * main:
 *
 * Program entry point. Registers the exit handler, records the console
 * code page and the directory of argv[0], parses the options, prints
 * the banner to stderr and, in overview mode, prints the table of query
 * counts and their total. Global resources are released before
 * returning.
 */
444 int main(int argc,char **argv)
447 atexit(cleanup_on_exit);
448 saved_cp=GetConsoleOutputCP();
/* running_from is where read_user_scannos() looks for the typo file as a fallback. */
450 running_from=g_path_get_dirname(argv[0]);
451 parse_options(&argc,&argv);
452 if (pswit[USERTYPO_SWITCH])
454 fprintf(stderr,"bookloupe: Check and report on an e-text\n");
456 if (pswit[OVERVIEW_SWITCH])
458 g_print(" Checked %ld lines of %ld (head+foot = %ld)\n\n",
459 checked_linecnt,linecnt,linecnt-checked_linecnt);
460 g_print(" --------------- Queries found --------------\n");
462 g_print(" Long lines: %14ld\n",cnt_long);
464 g_print(" Short lines: %14ld\n",cnt_short);
466 g_print(" Line-end problems: %14ld\n",cnt_lineend);
468 g_print(" Common typos: %14ld\n",cnt_word);
470 g_print(" Unmatched quotes: %14ld\n",cnt_quote);
472 g_print(" Unmatched brackets: %14ld\n",cnt_brack);
474 g_print(" Non-ASCII characters: %14ld\n",cnt_bin);
476 g_print(" Proofing characters: %14ld\n",cnt_odd);
478 g_print(" Punctuation & spacing queries: %14ld\n",cnt_punct);
480 g_print(" Non-standard dashes: %14ld\n",cnt_dash);
482 g_print(" Possible HTML tags: %14ld\n",cnt_html);
484 g_print(" TOTAL QUERIES %14ld\n",
485 cnt_quote+cnt_brack+cnt_bin+cnt_odd+cnt_long+cnt_short+cnt_punct+
486 cnt_dash+cnt_word+cnt_html+cnt_lineend);
488 g_free(running_from);
490 g_tree_unref(usertypo);
/* Passing NULL puts the charset back to auto. */
491 set_charset(NULL,NULL);
498 * Run a first pass - verify that it's a valid PG
499 * file, decide whether to report some things that
500 * occur many times in the text like long or short
501 * lines, non-standard dashes, etc.
/*
 * first_pass:
 *
 * etext: the whole e-text as one string; it is split on '\n' and any
 *        trailing '\r' is stripped from each line.
 *
 * Scans every line once, looking for the PG header and footer markers
 * and gathering the statistics (character totals, long/short lines,
 * asterisks, slashes, hyphens, HTML-like tags, spaced dashes, language
 * hints, ...) that report_first_pass() bases its decisions on.
 *
 * The results live in a function-local static structure.
 */
503 struct first_pass_results *first_pass(const char *etext)
505 gunichar laststart=CHAR_SPACE;
510 unsigned int lastlen=0,lastblen=0;
511 long spline=0,nspline=0;
512 static struct first_pass_results results={0};
514 lines=g_strsplit(etext,"\n",0);
515 for (j=0;lines[j];j++)
/* lbytes is the length in bytes, llen the length in characters. */
517 lbytes=strlen(lines[j]);
518 while (lbytes>0 && lines[j][lbytes-1]=='\r')
519 lines[j][--lbytes]='\0';
520 llen=g_utf8_strlen(lines[j],lbytes);
/* Old-style "small print" header end marker. */
522 if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
523 (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
526 g_print(" --> Duplicate header?\n");
527 spline=linecnt+1; /* first line of non-header text, that is */
/* New-style "*** START ... PROJECT GUTENBERG" header marker. */
529 if (!strncmp(lines[j],"*** START",9) &&
530 strstr(lines[j],"PROJECT GUTENBERG"))
533 g_print(" --> Duplicate header?\n");
534 nspline=linecnt+1; /* first line of non-header text, that is */
/* A footer is only looked for once a header has been seen. */
536 if (spline || nspline)
538 lc_line=g_utf8_strdown(lines[j],lbytes);
539 if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
541 if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
543 if (results.footerline)
545 /* it's an old-form header - we can detect duplicates */
547 g_print(" --> Duplicate footer?\n");
550 results.footerline=linecnt;
556 results.firstline=spline;
558 results.firstline=nspline; /* override with new */
559 if (results.footerline)
560 continue; /* don't count the boilerplate in the footer */
561 results.totlen+=llen;
562 for (s=lines[j];*s;s=g_utf8_next_char(s))
564 if (g_utf8_get_char(s)>127)
566 if (g_unichar_isalpha(g_utf8_get_char(s)))
/*
 * NOTE(review): isalpha() is given a gunichar here; values that do not
 * fit in unsigned char are outside its defined domain.
 * g_unichar_isalpha() is used elsewhere in this loop - confirm which
 * is intended.
 */
568 if (s>lines[j] && g_utf8_get_char(s)==CHAR_DQUOTE &&
569 isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
570 results.endquote_count++;
/* Same short-line test as check_for_short_line(), with a threshold of 2 instead of 1. */
572 if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
573 lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
576 g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
578 if (strstr(lines[j],".,"))
580 /* only count ast lines for ignoring purposes where there is */
581 /* locase text on the line */
582 if (strchr(lines[j],'*'))
584 for (s=lines[j];*s;s=g_utf8_next_char(s))
585 if (g_unichar_islower(g_utf8_get_char(s)))
590 if (strchr(lines[j],'/'))
591 results.fslashline++;
/* Step back over trailing white space to find the last visible character. */
594 for (s=g_utf8_prev_char(lines[j]+lbytes);
595 s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;
596 s=g_utf8_prev_char(s))
598 if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
599 g_utf8_get_char(g_utf8_prev_char(s))!='-')
602 if (llen>LONGEST_PG_LINE)
604 if (llen>WAY_TOO_LONG)
605 results.verylongline++;
/* i is the byte span from the first '<' to the first '>' inclusive. */
606 if (strchr(lines[j],'<') && strchr(lines[j],'>'))
608 i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
611 if (strstr(lines[j],"<i>"))
612 results.htmcount+=4; /* bonus marks! */
614 /* Check for spaced em-dashes */
/* The search starts at the second character, so s[-1] below is always inside the line. */
615 if (lines[j][0] && (s=strstr(g_utf8_next_char(lines[j]),"--")))
618 if (s[-1]==CHAR_SPACE || s[2]==CHAR_SPACE)
619 results.space_emdash++;
620 if (s[-1]==CHAR_SPACE && s[2]==CHAR_SPACE)
621 /* count of em-dashes with spaces both sides */
622 results.non_PG_space_emdash++;
623 if (s[-1]!=CHAR_SPACE && s[2]!=CHAR_SPACE)
624 /* count of PG-type em-dashes with no spaces */
625 results.PG_space_emdash++;
/* Marker words used as language hints, plus standalone 0/1 digits. */
630 if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))
631 results.Dutchcount++;
632 if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))
633 results.Frenchcount++;
634 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
635 results.standalone_digit++;
638 /* Check for spaced dashes */
639 if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
/* Only the first byte of the line is stored in laststart. */
643 laststart=lines[j][0];
652 * Make some snap decisions based on the first pass results.
/*
 * report_first_pass:
 *
 * results: statistics gathered by first_pass().
 *
 * Prints a "-->" notice for each finding that is frequent enough to be
 * treated as deliberate and records in a function-local static
 * struct warnings which checks remain worth reporting. It may also
 * switch HTML mode on (pswit[MARKUP_SWITCH]) and reset
 * results->firstline when the header and footer are too close together
 * to be trusted.
 */
654 struct warnings *report_first_pass(struct first_pass_results *results)
656 static struct warnings warnings={0};
658 g_print(" --> %ld lines in this file have white space at end\n",
661 if (results->dotcomma>5)
664 g_print(" --> %ld lines in this file contain '.,'. "
665 "Not reporting them.\n",results->dotcomma);
668 * If more than 50 lines, or one-tenth, are short,
669 * don't bother reporting them.
671 warnings.shortline=1;
672 if (results->shortline>50 || results->shortline*10>linecnt)
674 warnings.shortline=0;
675 g_print(" --> %ld lines in this file are short. "
676 "Not reporting short lines.\n",results->shortline);
679 * If more than 50 lines, or one-tenth, are long,
680 * don't bother reporting them.
683 if (results->longline>50 || results->longline*10>linecnt)
686 g_print(" --> %ld lines in this file are long. "
687 "Not reporting long lines.\n",results->longline);
689 /* If more than 10 lines contain asterisks, don't bother reporting them. */
691 if (results->astline>10)
694 g_print(" --> %ld lines in this file contain asterisks. "
695 "Not reporting them.\n",results->astline);
698 * If more than 10 lines contain forward slashes,
699 * don't bother reporting them.
702 if (results->fslashline>10)
705 g_print(" --> %ld lines in this file contain forward slashes. "
706 "Not reporting them.\n",results->fslashline);
709 * If more than 20 lines contain unpunctuated endquotes,
710 * don't bother reporting them.
713 if (results->endquote_count>20)
716 g_print(" --> %ld lines in this file contain unpunctuated endquotes. "
717 "Not reporting them.\n",results->endquote_count);
/*
 * NOTE(review): the comment below says 15, but the test that follows
 * uses >10. One of the two is out of date.
 */
720 * If more than 15 lines contain standalone digits,
721 * don't bother reporting them.
724 if (results->standalone_digit>10)
727 g_print(" --> %ld lines in this file contain standalone 0s and 1s. "
728 "Not reporting them.\n",results->standalone_digit);
731 * If more than 20 lines contain hyphens at end,
732 * don't bother reporting them.
735 if (results->hyphens>20)
738 g_print(" --> %ld lines in this file have hyphens at end. "
739 "Not reporting them.\n",results->hyphens);
/* htmcount is the first pass's tally of tag-like "<...>" spans. */
741 if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
743 g_print(" --> Looks like this is HTML. Switching HTML mode ON.\n");
744 pswit[MARKUP_SWITCH]=1;
746 if (results->verylongline>0)
747 g_print(" --> %ld lines in this file are VERY long!\n",
748 results->verylongline);
750 * If there are more non-PG spaced dashes than PG em-dashes,
751 * assume it's deliberate.
752 * Current PG guidelines say don't use them, but older texts do,
753 * and some people insist on them whatever the guidelines say.
756 if (results->spacedash+results->non_PG_space_emdash>
757 results->PG_space_emdash)
760 g_print(" --> There are %ld spaced dashes and em-dashes. "
761 "Not reporting them.\n",
762 results->spacedash+results->non_PG_space_emdash);
768 /* Charset ISO_8859-1/ASCII checks for compatibility with gutcheck */
770 /* If more than a quarter of characters are hi-bit, bug out. */
771 if (results->binlen*4>results->totlen)
773 g_print(" --> This file does not appear to be ASCII. "
774 "Terminating. Best of luck with it!\n");
/* Fewer than a quarter alphabetic characters: not treated as text. */
777 if (results->alphalen*4<results->totlen)
779 g_print(" --> This file does not appear to be text. "
780 "Terminating. Best of luck with it!\n");
/* More than 1% hi-bit characters, or more than 100 in total. */
783 if (results->binlen*100>results->totlen || results->binlen>100)
785 g_print(" --> There are a lot of foreign letters here. "
786 "Not reporting them.\n");
787 if (!pswit[VERBOSE_SWITCH])
791 warnings.isDutch=FALSE;
792 if (results->Dutchcount>50)
794 warnings.isDutch=TRUE;
795 g_print(" --> This looks like Dutch - "
796 "switching off dashes and warnings for 's Middags case.\n");
798 warnings.isFrench=FALSE;
799 if (results->Frenchcount>50)
801 warnings.isFrench=TRUE;
802 g_print(" --> This looks like French - "
803 "switching off some doublepunct.\n");
805 if (results->firstline && results->footerline)
806 g_print(" The PG header and footer appear to be already on.\n");
809 if (results->firstline)
810 g_print(" The PG header is on - no footer.\n");
811 if (results->footerline)
812 g_print(" The PG footer is on - no header.\n");
/* Verbose mode turns suppressed reports back on. */
815 if (pswit[VERBOSE_SWITCH])
817 warnings.shortline=1;
826 g_print(" *** Verbose output is ON -- you asked for it! ***\n");
828 if (warnings.isDutch)
/* Header and footer fewer than 100 lines apart: the markers are not trusted. */
830 if (results->footerline>0 && results->firstline>0 &&
831 results->footerline>results->firstline &&
832 results->footerline-results->firstline<100)
834 g_print(" --> I don't really know where this text starts. \n");
835 g_print(" There are no reference points.\n");
836 g_print(" I'm going to have to report the header and footer "
838 results->firstline=0;
846 * Look along the line, accumulate the count of quotes, and see
847 * if this is an empty line - i.e. a line with nothing on it
849 * If line has just spaces, period, * and/or - on it, don't
850 * count it, since empty lines with asterisks or dashes to
851 * separate sections are common.
853 * Returns: TRUE if the line is empty.
/*
 * analyse_quotes:
 *
 * aline:    the line to examine (UTF-8).
 * linecnt:  line number used in diagnostics.
 * counters: quote, underscore and bracket tallies updated as the line
 *           is walked.
 *
 * Double quotes are always counted. Single quotes are counted only when
 * the squote switch is set, and only when the surrounding characters
 * suggest a quotation mark rather than an apostrophe. Underscores and
 * round, square and curly brackets are tallied as well; a line-initial
 * "[Illustration:" and its closing bracket at end of line use a
 * separate counter.
 *
 * NOTE(review): linecnt is declared int here but printed with %ld
 * below; the file-scope linecnt is long - confirm the intended type.
 */
855 gboolean analyse_quotes(const char *aline,int linecnt,struct counters *counters)
858 /* assume the line is empty until proven otherwise */
859 gboolean isemptyline=TRUE;
860 const char *s=aline,*sprev,*snext;
863 GError *tmp_err=NULL;
866 snext=g_utf8_next_char(s);
867 c=g_utf8_get_char(s);
868 if (CHAR_IS_DQUOTE(c))
869 (void)count_quote(counters,c,QUOTE_CLASS(c),&tmp_err);
870 else if (CHAR_IS_SQUOTE(c) && pswit[SQUOTE_SWITCH])
875 * At start of line, it can only be a quotation mark.
876 * Hardcode a very common exception!
878 if (!g_str_has_prefix(snext,"tis") &&
879 !g_str_has_prefix(snext,"Tis"))
880 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
882 else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
883 g_unichar_isalpha(g_utf8_get_char(snext)))
884 /* Do nothing! it's definitely an apostrophe, not a quote */
886 /* it's outside a word - let's check it out */
887 else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||
888 g_unichar_isalpha(g_utf8_get_char(snext)))
890 /* certainly looks like a quotation mark */
891 if (!g_str_has_prefix(snext,"tis") &&
892 !g_str_has_prefix(snext,"Tis"))
893 /* hardcode a very common exception! */
/* After sentence punctuation the quote is counted as neutral, otherwise as opening. */
895 if (strchr(".?!,;:",g_utf8_get_char(sprev)))
896 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
898 (void)count_quote(counters,c,OPENING_QUOTE,&tmp_err);
903 /* now - is it a quotation mark? */
904 guessquote=0; /* accumulate clues */
905 if (g_unichar_isalpha(g_utf8_get_char(sprev)))
907 /* it follows a letter - could be either */
909 if (g_utf8_get_char(sprev)=='s')
911 /* looks like a plural apostrophe */
913 if (g_utf8_get_char(snext)==CHAR_SPACE)
917 if (innermost_quote_matches(counters,c))
919 * Give it the benefit of some doubt,
920 * if a squote is already open.
926 (void)count_quote(counters,c,CLOSING_QUOTE,&tmp_err);
929 /* no adjacent letter - it must be a quote of some kind */
930 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
/* Any error left by count_quote() is reported against this column, then cleared. */
935 if (pswit[ECHO_SWITCH])
936 g_print("\n%s\n",aline);
937 if (!pswit[OVERVIEW_SWITCH])
938 g_print(" Line %ld column %ld - %s\n",
939 linecnt,g_utf8_pointer_to_offset(aline,s)+1,tmp_err->message);
940 g_clear_error(&tmp_err);
942 if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
944 isemptyline=FALSE; /* ignore lines like * * * as spacers */
945 if (c==CHAR_UNDERSCORE)
946 counters->c_unders++;
947 if (c==CHAR_OPEN_SBRACK)
/* A line-initial "[Illustration:" with no brackets pending uses its own counter. */
949 if (!matching_difference(counters,COUNTER_ILLUSTRATION) &&
950 !matching_difference(counters,c) && s==aline &&
951 g_str_has_prefix(s,"[Illustration:"))
952 increment_matching(counters,COUNTER_ILLUSTRATION,TRUE);
954 increment_matching(counters,c,TRUE);
956 else if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK)
957 increment_matching(counters,c,TRUE);
958 if (c==CHAR_CLOSE_SBRACK)
/* !*snext means the bracket is the last character on the line. */
960 if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) &&
961 !matching_difference(counters,c) && !*snext)
962 increment_matching(counters,COUNTER_ILLUSTRATION,FALSE);
964 increment_matching(counters,c,FALSE);
966 else if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK)
967 increment_matching(counters,c,FALSE);
975 * check_for_control_characters:
977 * Check for invalid or questionable characters in the line
978 * Anything above 127 is invalid for plain ASCII, and
979 * non-printable control characters should also be flagged.
980 * Tabs should generally not be there.
982 void check_for_control_characters(const char *aline)
986 for (s=aline;*s;s=g_utf8_next_char(s))
988 c=g_utf8_get_char(s);
989 if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
991 if (pswit[ECHO_SWITCH])
992 g_print("\n%s\n",aline);
993 if (!pswit[OVERVIEW_SWITCH])
994 g_print(" Line %ld column %ld - Control character %u\n",
995 linecnt,g_utf8_pointer_to_offset(s,aline)+1,c);
1003 * check_for_odd_characters:
1005 * Check for binary and other odd characters.
1007 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
1008 gboolean isemptyline)
1010 /* Don't repeat multiple warnings on one line. */
1011 gboolean eInvalidChar=FALSE,eTab=FALSE,eTilde=FALSE;
1012 gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
1017 for (s=aline;*s;s=g_utf8_next_char(s))
1019 c=g_utf8_get_char(s);
1020 if (warnings->bin && !eInvalidChar &&
1021 (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
1023 if (pswit[ECHO_SWITCH])
1024 g_print("\n%s\n",aline);
1025 if (!pswit[OVERVIEW_SWITCH])
1026 if (c>127 && c<160 || c>255)
1027 g_print(" Line %ld column %ld - "
1028 "Non-ISO-8859 character %u\n",
1029 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1031 g_print(" Line %ld column %ld - "
1032 "Non-ASCII character %u\n",
1033 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1038 if (!eInvalidChar && charset)
1040 if (charset_validator==(GIConv)-1)
1042 if (!g_unichar_isdefined(c))
1044 if (pswit[ECHO_SWITCH])
1045 g_print("\n%s\n",aline);
1046 if (!pswit[OVERVIEW_SWITCH])
1047 g_print(" Line %ld column %ld - Unassigned UNICODE "
1048 "code point U+%04" G_GINT32_MODIFIER "X\n",
1049 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1054 else if (c>=0xE000 && c<=0xF8FF || c>=0xF0000 && c<=0xFFFFD ||
1055 c>=100000 && c<=0x10FFFD)
1057 if (pswit[ECHO_SWITCH])
1058 g_print("\n%s\n",aline);
1059 if (!pswit[OVERVIEW_SWITCH])
1060 g_print(" Line %ld column %ld - Private Use "
1061 "character U+%04" G_GINT32_MODIFIER "X\n",
1062 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1070 t=g_convert_with_iconv(s,g_utf8_next_char(s)-s,
1071 charset_validator,NULL,&nb,NULL);
1076 if (pswit[ECHO_SWITCH])
1077 g_print("\n%s\n",aline);
1078 if (!pswit[OVERVIEW_SWITCH])
1079 g_print(" Line %ld column %ld - Non-%s "
1080 "character %u\n",linecnt,
1081 g_utf8_pointer_to_offset(aline,s)+1,charset,c);
1088 if (!eTab && c==CHAR_TAB)
1090 if (pswit[ECHO_SWITCH])
1091 g_print("\n%s\n",aline);
1092 if (!pswit[OVERVIEW_SWITCH])
1093 g_print(" Line %ld column %ld - Tab character?\n",
1094 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1099 if (!eTilde && c==CHAR_TILDE)
1102 * Often used by OCR software to indicate an
1103 * unrecognizable character.
1105 if (pswit[ECHO_SWITCH])
1106 g_print("\n%s\n",aline);
1107 if (!pswit[OVERVIEW_SWITCH])
1108 g_print(" Line %ld column %ld - Tilde character?\n",
1109 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1114 if (!eCarat && c==CHAR_CARAT)
1116 if (pswit[ECHO_SWITCH])
1117 g_print("\n%s\n",aline);
1118 if (!pswit[OVERVIEW_SWITCH])
1119 g_print(" Line %ld column %ld - Carat character?\n",
1120 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1125 if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
1127 if (pswit[ECHO_SWITCH])
1128 g_print("\n%s\n",aline);
1129 if (!pswit[OVERVIEW_SWITCH])
1130 g_print(" Line %ld column %ld - Forward slash?\n",
1131 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1137 * Report asterisks only in paranoid mode,
1138 * since they're often deliberate.
1140 if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
1143 if (pswit[ECHO_SWITCH])
1144 g_print("\n%s\n",aline);
1145 if (!pswit[OVERVIEW_SWITCH])
1146 g_print(" Line %ld column %ld - Asterisk?\n",
1147 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1156 * check_for_long_line:
1158 * Check for line too long.
/*
 * Query a line whose length in characters (not bytes) exceeds
 * LONGEST_PG_LINE. The line length is printed both as the column and
 * as the length.
 */
1160 void check_for_long_line(const char *aline)
1162 if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
1164 if (pswit[ECHO_SWITCH])
1165 g_print("\n%s\n",aline);
1166 if (!pswit[OVERVIEW_SWITCH])
1167 g_print(" Line %ld column %ld - Long line %ld\n",
1168 linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
1175 * check_for_short_line:
1177 * Check for line too short.
1179 * This one is a bit trickier to implement: we don't want to
1180 * flag the last line of a paragraph for being short, so we
1181 * have to wait until we know that our current line is a
1182 * "normal" line, then report the _previous_ line if it was too
1183 * short. We also don't want to report indented lines like
1184 * chapter heads or formatted quotations. We therefore keep
1185 * last->len as the length of the last line examined, and
1186 * last->blen as the length of the last but one, and try to
1187 * suppress unnecessary warnings by checking that both were of
1188 * "normal" length. We keep the first character of the last
1189 * line in last->start, and if it was a space, we assume that
1190 * the formatting is deliberate. I can't figure out a way to
1191 * distinguish something like a quoted verse left-aligned or
1192 * the header or footer of a letter from a paragraph of short
1193 * lines - maybe if I examined the whole paragraph, and if the
1194 * para has less than, say, 8 lines and if all lines are short,
1195 * then just assume it's OK? Need to look at some texts to see
1196 * how often a formula like this would get the right result.
/*
 * Reports the PREVIOUS line as too short, once the current line shows that
 * the previous one was not the end of a paragraph.
 *
 * aline: the current line; it must be longer than one character.
 * last:  properties of earlier lines. The visible test requires
 *        1 < last->len < SHORTEST_PG_LINE, last->blen > SHORTEST_PG_LINE
 *        and last->start not a space.
 *
 * The text echoed and measured is prevline, which is not a parameter of
 * this function, and the message is attributed to linecnt-1.
 */
1198 void check_for_short_line(const char *aline,const struct line_properties *last)
1200 if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
1201 last->len<SHORTEST_PG_LINE && last->blen>1 &&
1202 last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
1204 if (pswit[ECHO_SWITCH])
1205 g_print("\n%s\n",prevline);
1206 if (!pswit[OVERVIEW_SWITCH])
1207 g_print(" Line %ld column %ld - Short line %ld?\n",
1208 linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
1215 * check_for_starting_punctuation:
1217 * Look for punctuation other than full ellipses at start of line.
/*
 * Queries a non-empty line whose first character is one of . ? ! , ; :
 * unless the line begins with the spaced ellipsis ". . .".
 * The message always names column 1.
 */
1219 void check_for_starting_punctuation(const char *aline)
1221 if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
1222 !g_str_has_prefix(aline,". . ."))
1224 if (pswit[ECHO_SWITCH])
1225 g_print("\n%s\n",aline);
1226 if (!pswit[OVERVIEW_SWITCH])
1227 g_print(" Line %ld column 1 - Begins with punctuation?\n",
1235 * check_for_spaced_emdash:
1237 * Check for spaced em-dashes.
1239 * We must check _all_ occurrences of "--" on the line
1240 * hence the loop - even if the first double-dash is OK
1241 * there may be another that's wrong later on.
/*
 * Walks every "--" on the line and queries those with a space immediately
 * before (when not at the start of the line) or immediately after.
 *
 * t    points at the current "--";
 * next points just past it and is where the following search resumes.
 * The reported column is the 1-based character position of the "--".
 */
1243 void check_for_spaced_emdash(const char *aline)
1245 const char *s,*t,*next;
1246 for (s=aline;t=strstr(s,"--");s=next)
1248 next=g_utf8_next_char(g_utf8_next_char(t));
1249 if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
1250 g_utf8_get_char(next)==CHAR_SPACE)
1252 if (pswit[ECHO_SWITCH])
1253 g_print("\n%s\n",aline);
1254 if (!pswit[OVERVIEW_SWITCH])
1255 g_print(" Line %ld column %ld - Spaced em-dash?\n",
1256 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1264 * check_for_spaced_dash:
1266 * Check for spaced dashes.
/*
 * Queries a single dash that has a space next to it.
 *
 * Only the first " -" on the line is examined; it is reported unless the
 * character after the dash is another dash. The "- " search is an
 * else-branch, so it runs only when the line has no " -" at all; that match
 * is reported unless the character before it is a dash.
 */
1268 void check_for_spaced_dash(const char *aline)
1271 if ((s=strstr(aline," -")))
1273 if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
1275 if (pswit[ECHO_SWITCH])
1276 g_print("\n%s\n",aline);
1277 if (!pswit[OVERVIEW_SWITCH])
1278 g_print(" Line %ld column %ld - Spaced dash?\n",
1279 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1284 else if ((s=strstr(aline,"- ")))
/* s==aline guards the look-behind when the match is at the line start. */
1286 if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
1288 if (pswit[ECHO_SWITCH])
1289 g_print("\n%s\n",aline);
1290 if (!pswit[OVERVIEW_SWITCH])
1291 g_print(" Line %ld column %ld - Spaced dash?\n",
1292 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1300 * check_for_unmarked_paragraphs:
1302 * Check for unmarked paragraphs indicated by separate speakers.
1304 * May well be false positive:
1305 * "Bravo!" "Wonderful!" called the crowd.
1306 * but useful all the same.
/*
 * Looks for a closing double quote followed by whitespace and an opening
 * double quote on the same line, and queries a missing paragraph break at
 * the 1-based character column of the match.
 *
 * NOTE(review): the two visible strstr() calls search for the same pattern
 * in this listing; confirm whether one of them was meant to match a
 * different amount of spacing between the quotes.
 */
1308 void check_for_unmarked_paragraphs(const char *aline)
1311 s=strstr(aline,"\" \"");
1313 s=strstr(aline,"\" \"");
1316 if (pswit[ECHO_SWITCH])
1317 g_print("\n%s\n",aline);
1318 if (!pswit[OVERVIEW_SWITCH])
1319 g_print(" Line %ld column %ld - "
1320 "Query missing paragraph break?\n",
1321 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1328 * check_for_jeebies:
1330 * Check for "to he" and other easy h/b errors.
1332 * This is a very inadequate effort on the h/b problem,
1333 * but the phrase "to he" is almost always an error, whereas "to
1334 * be" is quite common.
1335 * Similarly, '"Quiet!", be said.' is a non-be error
1336 * "to he" is _not_ always an error!:
1337 * "Where they went to he couldn't say."
1338 * Another false positive:
1339 * What would "Cinderella" be without the . . .
1340 * and another: "If he wants to he can see for himself."
/*
 * Searches aline for fixed, case-sensitive phrases that typically result
 * from OCR confusing "h" and "b", in three groups with separate messages:
 *   - he/be   (" be could ", " to he ", ...)
 *   - had/bad (" the had ", " he bad ", ...)
 *   - hut/but ("; hut ", ", hut ")
 * Each report gives the 1-based character column of the matched phrase.
 *
 * NOTE(review): the statements that choose between the successive strstr()
 * results are not visible in this listing; presumably each search is only
 * tried when the previous ones found nothing - confirm.
 */
1342 void check_for_jeebies(const char *aline)
/* Group 1: he/be confusions. */
1345 s=strstr(aline," be could ");
1347 s=strstr(aline," be would ");
1349 s=strstr(aline," was be ");
1351 s=strstr(aline," be is ");
1353 s=strstr(aline," is be ");
1355 s=strstr(aline,"\", be ");
1357 s=strstr(aline,"\" be ");
1359 s=strstr(aline,"\" be ");
1361 s=strstr(aline," to he ");
1364 if (pswit[ECHO_SWITCH])
1365 g_print("\n%s\n",aline);
1366 if (!pswit[OVERVIEW_SWITCH])
1367 g_print(" Line %ld column %ld - Query he/be error?\n",
1368 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Group 2: had/bad confusions. */
1372 s=strstr(aline," the had ");
1374 s=strstr(aline," a had ");
1376 s=strstr(aline," they bad ");
1378 s=strstr(aline," she bad ");
1380 s=strstr(aline," he bad ");
1382 s=strstr(aline," you bad ");
1384 s=strstr(aline," i bad ");
1387 if (pswit[ECHO_SWITCH])
1388 g_print("\n%s\n",aline);
1389 if (!pswit[OVERVIEW_SWITCH])
1390 g_print(" Line %ld column %ld - Query had/bad error?\n",
1391 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Group 3: hut/but confusions after a semicolon or comma. */
1395 s=strstr(aline,"; hut ");
1397 s=strstr(aline,", hut ");
1400 if (pswit[ECHO_SWITCH])
1401 g_print("\n%s\n",aline);
1402 if (!pswit[OVERVIEW_SWITCH])
1403 g_print(" Line %ld column %ld - Query hut/but error?\n",
1404 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1411 * check_for_mta_from:
1413 * Special case - angled bracket in front of "From" placed there by an
1414 * MTA when sending an e-mail.
/*
 * Queries the first ">From" found anywhere on the line, reporting the
 * 1-based character column of the '>'.
 */
1416 void check_for_mta_from(const char *aline)
1419 s=strstr(aline,">From");
1422 if (pswit[ECHO_SWITCH])
1423 g_print("\n%s\n",aline);
1424 if (!pswit[OVERVIEW_SWITCH])
1425 g_print(" Line %ld column %ld - "
1426 "Query angled bracket with From\n",
1427 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1434 * check_for_orphan_character:
1436 * Check for a single character line -
1437 * often an overflow from bad wrapping.
/*
 * Queries a line that consists of exactly one character.
 *
 * A lone 'I', 'V', 'X' or 'L', or any Unicode digit, is taken to be a
 * numeral and is not reported. The message always names column 1.
 */
1439 void check_for_orphan_character(const char *aline)
1442 c=g_utf8_get_char(aline);
/* True when the line is non-empty and nothing follows its first character. */
1443 if (c && !*g_utf8_next_char(aline))
1445 if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
1446 ; /* Nothing - ignore numerals alone on a line. */
1449 if (pswit[ECHO_SWITCH])
1450 g_print("\n%s\n",aline);
1451 if (!pswit[OVERVIEW_SWITCH])
1452 g_print(" Line %ld column 1 - Query single character line\n",
1461 * check_for_pling_scanno:
1463 * Check for I" - often should be !
/*
 * Queries the first occurrence of a space, a capital I and a double quote
 * in sequence, where the I is often a misread exclamation mark.
 *
 * NOTE(review): the column printed is the 0-based character offset of the
 * leading space, with no +1 adjustment, unlike most sibling checks -
 * confirm that this is intended.
 */
1465 void check_for_pling_scanno(const char *aline)
1468 s=strstr(aline," I\"");
1471 if (pswit[ECHO_SWITCH])
1472 g_print("\n%s\n",aline);
1473 if (!pswit[OVERVIEW_SWITCH])
1474 g_print(" Line %ld column %ld - Query I=exclamation mark?\n",
1475 linecnt,g_utf8_pointer_to_offset(aline,s));
1482 * check_for_extra_period:
1484 * Check for period without a capital letter. Cut-down from gutspell.
1485 * Only works when it happens on a single line.
/*
 * In paranoid mode, examines every ". " on the line and queries a period
 * that is followed by a lowercase word.
 *
 * aline:    the line being checked.
 * warnings: only warnings->isDutch is read here, to let the "'s Middags"
 *           pattern through (apostrophe, lowercase letter, space, capital).
 *
 * For each candidate the word ending at the period is extracted into
 * testword and compared against the abbrev[] list, and isroman() and a
 * vowel scan over the canonical decomposition are applied. Words are
 * remembered in the qperiod tree; the visible condition reports a word
 * when VERBOSE_SWITCH is set or it is not yet in that tree.
 *
 * The character after the period is classified with g_unichar_isdigit()
 * rather than isdigit(): the value is a gunichar, and the <ctype.h>
 * functions are undefined for arguments outside the unsigned char range.
 */
1487 void check_for_extra_period(const char *aline,const struct warnings *warnings)
1489 const char *s,*t,*s1,*sprev;
1494 gunichar c,nc,pc,*decomposition;
1495 if (pswit[PARANOID_SWITCH])
1497 for (t=aline;t=strstr(t,". ");)
1501 t=g_utf8_next_char(t);
1502 /* start of line punctuation is handled elsewhere */
/* Only a period directly after a letter is of interest. */
1505 if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
1507 t=g_utf8_next_char(t);
1510 if (warnings->isDutch)
1512 /* For Frank & Jeroen -- 's Middags case */
1513 gunichar c2,c3,c4,c5;
1514 c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
1515 c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
1516 c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
1517 c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
1518 if (CHAR_IS_APOSTROPHE(c2) &&
1519 g_unichar_islower(c3) && c4==CHAR_SPACE &&
1520 g_unichar_isupper(c5))
1522 t=g_utf8_next_char(t);
/* Skip past ". " and any further non-alphanumerics to the next word. */
1526 s1=g_utf8_next_char(g_utf8_next_char(t));
1527 while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
1528 !g_unichar_isdigit(g_utf8_get_char(s1)))
1529 s1=g_utf8_next_char(s1);
1530 if (g_unichar_islower(g_utf8_get_char(s1)))
1532 /* we have something to investigate */
1534 /* so let's go back and find out */
1535 nc=g_utf8_get_char(t);
1536 s1=g_utf8_prev_char(t);
1537 c=g_utf8_get_char(s1);
1538 sprev=g_utf8_prev_char(s1);
1539 pc=g_utf8_get_char(sprev);
1541 (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||
1542 g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&
1543 g_unichar_isalpha(nc)))
1548 sprev=g_utf8_prev_char(s1);
1549 pc=g_utf8_get_char(sprev);
1551 s1=g_utf8_next_char(s1);
1554 testword=g_strndup(s1,s-s1);
1556 testword=g_strdup(s1);
1557 for (i=0;*abbrev[i];i++)
1558 if (!strcmp(testword,abbrev[i]))
1560 if (g_unichar_isdigit(g_utf8_get_char(testword)))
1562 if (!*g_utf8_next_char(testword))
1564 if (isroman(testword))
1569 for (s=testword;*s;s=g_utf8_next_char(s))
/* Decompose so that an accented vowel is compared by its base letter. */
1571 decomposition=g_unicode_canonical_decomposition(
1572 g_utf8_get_char(s),&len);
1573 if (g_utf8_strchr("aeiou",-1,decomposition[0]))
1575 g_free(decomposition);
1579 (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
1581 g_tree_insert(qperiod,g_strdup(testword),
1582 GINT_TO_POINTER(1));
1583 if (pswit[ECHO_SWITCH])
1584 g_print("\n%s\n",aline);
1585 if (!pswit[OVERVIEW_SWITCH])
1586 g_print(" Line %ld column %ld - Extra period?\n",
1587 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1593 t=g_utf8_next_char(t);
1599 * check_for_following_punctuation:
1601 * Check for words usually not followed by punctuation.
/*
 * With TYPO_SWITCH set, checks words against two lists of words that are
 * not normally followed by punctuation:
 *   - nocomma[]:  queried when the character at s is , ; or :
 *   - noperiod[]: queried when the character at s is . or !
 * Words are lowercased with g_utf8_strdown() before comparison, and the
 * reported column is the 1-based character position of s.
 *
 * NOTE(review): the word-extraction loop is not visible in this listing;
 * s is presumably positioned just after the current word - confirm.
 */
1603 void check_for_following_punctuation(const char *aline)
1606 const char *s,*wordstart;
1609 if (pswit[TYPO_SWITCH])
1620 inword=g_utf8_strdown(t,-1);
1622 for (i=0;*nocomma[i];i++)
1623 if (!strcmp(inword,nocomma[i]))
1625 c=g_utf8_get_char(s);
1626 if (c==',' || c==';' || c==':')
1628 if (pswit[ECHO_SWITCH])
1629 g_print("\n%s\n",aline);
1630 if (!pswit[OVERVIEW_SWITCH])
1631 g_print(" Line %ld column %ld - "
1632 "Query punctuation after %s?\n",
1633 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
1639 for (i=0;*noperiod[i];i++)
1640 if (!strcmp(inword,noperiod[i]))
1642 c=g_utf8_get_char(s);
1643 if (c=='.' || c=='!')
1645 if (pswit[ECHO_SWITCH])
1646 g_print("\n%s\n",aline);
1647 if (!pswit[OVERVIEW_SWITCH])
1648 g_print(" Line %ld column %ld - "
1649 "Query punctuation after %s?\n",
1650 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
1664 * Check for commonly mistyped words,
1665 * and digits like 0 for O in a word.
/*
 * Splits aline into words with getaword() and applies a series of checks
 * to each word:
 *   - mixdigit(): query digits mixed into a word;
 *   - with TYPO_SWITCH or USERTYPO_SWITCH: detect an uppercase letter in
 *     mid-word (allowing the Mc/Mac and apostrophe cases), then casefold
 *     the word into testword;
 *   - with TYPO_SWITCH: match against nostart[]/noend[] prefixes and
 *     suffixes, a set of unlikely letter sequences, a no-vowel or
 *     no-consonant test, the okword[] exclusions, isroman(), the typo[]
 *     list and a small set of single-letter words;
 *   - the usertypo tree, when present;
 *   - in paranoid mode with warnings->digit: a standalone "0" or "1".
 *
 * Queried words are recorded in the qword tree along with an int counter
 * allocated by g_new0().
 */
1667 void check_for_typos(const char *aline,struct warnings *warnings)
1669 const char *s,*t,*nt,*wordstart;
1671 gunichar *decomposition;
1673 int i,vowel,consonant,*dupcnt;
1674 gboolean isdup,istypo,alower;
1677 gsize decomposition_len;
1681 inword=getaword(&s);
1685 continue; /* don't bother with empty lines */
1687 if (mixdigit(inword))
1689 if (pswit[ECHO_SWITCH])
1690 g_print("\n%s\n",aline);
1691 if (!pswit[OVERVIEW_SWITCH])
1692 g_print(" Line %ld column %ld - Query digit in %s\n",
1693 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
1698 * Put the word through a series of tests for likely typos and OCR
1701 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
1705 for (t=inword;*t;t=g_utf8_next_char(t))
1707 c=g_utf8_get_char(t);
1708 nt=g_utf8_next_char(t);
1709 /* lowercase for testing */
1710 if (g_unichar_islower(c))
1712 if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
1715 * We have an uppercase mid-word. However, there are
1717 * Mac and Mc like McGill
1718 * French contractions like l'Abbe
1720 offset=g_utf8_pointer_to_offset(inword,t);
1722 pc=g_utf8_get_char(g_utf8_prev_char(t));
1725 if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
1726 offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
1727 g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
1728 CHAR_IS_APOSTROPHE(pc))
/* All list comparisons below are made against the casefolded word. */
1734 testword=g_utf8_casefold(inword,-1);
1736 if (pswit[TYPO_SWITCH])
1739 * Check for certain unlikely two-letter combinations at word
1742 len=g_utf8_strlen(testword,-1);
1745 for (i=0;*nostart[i];i++)
1746 if (g_str_has_prefix(testword,nostart[i]))
1748 for (i=0;*noend[i];i++)
1749 if (g_str_has_suffix(testword,noend[i]))
1752 /* ght is common, gbt never. Like that. */
1753 if (strstr(testword,"cb"))
1755 if (strstr(testword,"gbt"))
1757 if (strstr(testword,"pbt"))
1759 if (strstr(testword,"tbs"))
1761 if (strstr(testword,"mrn"))
1763 if (strstr(testword,"ahle"))
1765 if (strstr(testword,"ihle"))
1768 * "TBE" does happen - like HEARTBEAT - but uncommon.
1769 * Also "TBI" - frostbite, outbid - but uncommon.
1770 * Similarly "ii" like Hawaii, or Pompeii, and in Roman
1771 * numerals, but "ii" is a common scanno.
1773 if (strstr(testword,"tbi"))
1775 if (strstr(testword,"tbe"))
1777 if (strstr(testword,"ii"))
1780 * Check for no vowels or no consonants.
1781 * If none, flag a typo.
1783 if (!istypo && len>1)
1786 for (t=testword;*t;t=g_utf8_next_char(t))
1788 c=g_utf8_get_char(t);
1790 g_unicode_canonical_decomposition(c,&decomposition_len);
1791 if (c=='y' || g_unichar_isdigit(c))
1793 /* Yah, this is loose. */
1797 else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
1801 g_free(decomposition);
1803 if (!vowel || !consonant)
1807 * Now exclude the word from being reported if it's in
1810 for (i=0;*okword[i];i++)
1811 if (!strcmp(testword,okword[i]))
1814 * What looks like a typo may be a Roman numeral.
1817 if (istypo && isroman(testword))
1819 /* Check the manual list of typos. */
1821 for (i=0;*typo[i];i++)
1822 if (!strcmp(testword,typo[i]))
1825 * Check lowercase s, l, i and m - special cases.
1826 * "j" - often a semi-colon gone wrong.
1827 * "d" for a missing apostrophe - he d
1830 if (!istypo && len==1 &&
1831 g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
/* qword maps each queried word to a heap-allocated int counter. */
1835 dupcnt=g_tree_lookup(qword,testword);
1839 isdup=!pswit[VERBOSE_SWITCH];
1843 dupcnt=g_new0(int,1);
1844 g_tree_insert(qword,g_strdup(testword),dupcnt);
1849 if (pswit[ECHO_SWITCH])
1850 g_print("\n%s\n",aline);
1851 if (!pswit[OVERVIEW_SWITCH])
1853 g_print(" Line %ld column %ld - Query word %s",
1854 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
1856 if (!pswit[VERBOSE_SWITCH])
1857 g_print(" - not reporting duplicates");
1865 /* check the user's list of typos */
1866 if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
1868 if (pswit[ECHO_SWITCH])
1869 g_print("\n%s\n",aline);
1870 if (!pswit[OVERVIEW_SWITCH])
/*
 * NOTE(review): this report and the "standalone" one below use offset+2
 * for wordstart, whereas the "Query digit" and "Query word" reports above
 * use offset+1 - confirm which is intended.
 */
1871 g_print(" Line %ld column %ld - Query possible scanno %s\n",
1872 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
1874 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
1876 if (pswit[PARANOID_SWITCH] && warnings->digit)
1878 /* In paranoid mode, query all 0 and 1 standing alone. */
1879 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
1881 if (pswit[ECHO_SWITCH])
1882 g_print("\n%s\n",aline);
1883 if (!pswit[OVERVIEW_SWITCH])
1884 g_print(" Line %ld column %ld - Query standalone %s\n",
1885 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
1896 * check_for_misspaced_punctuation:
1898 * Look for added or missing spaces around punctuation and quotes.
1899 * If there is a punctuation character like ! with no space on
1900 * either side, suspect a missing!space. If there are spaces on
1901 * both sides , assume a typo. If we see a double quote with no
1902 * space or punctuation on either side of it, assume unspaced
1903 * quotes "like"this.
/*
 * Looks for missing or superfluous spaces around punctuation and quotes.
 *
 * aline:       the line being checked.
 * parities:    open/close state of double and single quotes; dquote and
 *              squote are toggled as quotes are met, so the state carries
 *              over from one call to the next.
 * isemptyline: suppresses the "Spaced punctuation?" reports when set.
 *
 * The work is done in separate scans over the line, each tracking the
 * previous (pc), current (c) and next (nc) characters:
 *   1. . ? ! , ; : _  with a letter after and either a letter before or
 *      strict punctuation ("Missing space?"), or with a space before and
 *      a space or end of line after ("Spaced punctuation?");
 *   2. ? ! , ; :  preceded by a space but not followed by one;
 *   3. a period preceded by a space and followed by a letter;
 *   4. quotes with no space or punctuation on either side;
 *   5. double-quote parity, querying wrongly spaced quotes;
 *   6. a double quote at the start of the line followed by closing
 *      punctuation or a space;
 *   7. with SQUOTE_SWITCH, single-quote parity in the same manner.
 *
 * Digits are tested with g_unichar_isdigit() because nc is a gunichar:
 * the <ctype.h> isdigit() is undefined for values outside unsigned char.
 */
1905 void check_for_misspaced_punctuation(const char *aline,
1906 struct parities *parities,gboolean isemptyline)
1908 gboolean isacro,isellipsis;
1910 gunichar c,nc,pc,n2c;
1911 c=g_utf8_get_char(aline);
1912 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1913 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1917 nc=g_utf8_get_char(g_utf8_next_char(s));
1918 /* For each character in the line after the first. */
1919 if (g_utf8_strchr(".?!,;:_",-1,c)) /* if it's punctuation */
1921 /* we need to suppress warnings for acronyms like M.D. */
1923 /* we need to suppress warnings for ellipsis . . . */
1926 * If there are letters on both sides of it or
1927 * if it's strict punctuation followed by an alpha.
1929 if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
1930 g_utf8_strchr("?!,;:",-1,c)))
1934 if (g_utf8_pointer_to_offset(aline,s)>2 &&
1935 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
1937 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
1943 if (pswit[ECHO_SWITCH])
1944 g_print("\n%s\n",aline);
1945 if (!pswit[OVERVIEW_SWITCH])
1946 g_print(" Line %ld column %ld - Missing space?\n",
1947 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1952 if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
1955 * If there are spaces on both sides,
1956 * or space before and end of line.
1960 if (g_utf8_pointer_to_offset(aline,s)>2 &&
1961 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
1963 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
1967 if (!isemptyline && !isellipsis)
1969 if (pswit[ECHO_SWITCH])
1970 g_print("\n%s\n",aline);
1971 if (!pswit[OVERVIEW_SWITCH])
1972 g_print(" Line %ld column %ld - "
1973 "Spaced punctuation?\n",linecnt,
1974 g_utf8_pointer_to_offset(aline,s)+1);
1981 /* Split out the characters that CANNOT be preceded by space. */
1982 c=g_utf8_get_char(aline);
1983 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1984 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1988 nc=g_utf8_get_char(g_utf8_next_char(s));
1989 /* for each character in the line after the first */
1990 if (g_utf8_strchr("?!,;:",-1,c))
1992 /* if it's punctuation that _cannot_ have a space before it */
1993 if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
1996 * If nc DOES == space,
1997 * it was already reported just above.
1999 if (pswit[ECHO_SWITCH])
2000 g_print("\n%s\n",aline);
2001 if (!pswit[OVERVIEW_SWITCH])
2002 g_print(" Line %ld column %ld - Spaced punctuation?\n",
2003 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2010 * Special case " .X" where X is any alpha.
2011 * This plugs a hole in the acronym code above.
2012 * Inelegant, but maintainable.
2014 c=g_utf8_get_char(aline);
2015 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2016 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2020 nc=g_utf8_get_char(g_utf8_next_char(s));
2021 /* for each character in the line after the first */
2024 /* if it's a period */
2025 if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
2028 * If the period follows a space and
2029 * is followed by a letter.
2031 if (pswit[ECHO_SWITCH])
2032 g_print("\n%s\n",aline);
2033 if (!pswit[OVERVIEW_SWITCH])
2034 g_print(" Line %ld column %ld - Spaced punctuation?\n",
2035 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2041 c=g_utf8_get_char(aline);
2042 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2043 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2047 nc=g_utf8_get_char(g_utf8_next_char(s));
2048 /* for each character in the line after the first */
2051 if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
2052 !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
2053 !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
2055 if (pswit[ECHO_SWITCH])
2056 g_print("\n%s\n",aline);
2057 if (!pswit[OVERVIEW_SWITCH])
2058 g_print(" Line %ld column %ld - Unspaced quotes?\n",
2059 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2065 /* Check parity of quotes. */
2066 nc=g_utf8_get_char(aline);
2067 for (s=aline;*s;s=g_utf8_next_char(s))
2070 nc=g_utf8_get_char(g_utf8_next_char(s));
2073 parities->dquote=!parities->dquote;
2074 if (!parities->dquote)
2077 if (!g_utf8_strchr("_-.'`/,;:!?)]} ",-1,nc))
2079 if (pswit[ECHO_SWITCH])
2080 g_print("\n%s\n",aline);
2081 if (!pswit[OVERVIEW_SWITCH])
2082 g_print(" Line %ld column %ld - "
2083 "Wrongspaced quotes?\n",
2084 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2092 if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
2093 !g_utf8_strchr("_-/.'`([{$",-1,nc) || !nc)
2095 if (pswit[ECHO_SWITCH])
2096 g_print("\n%s\n",aline);
2097 if (!pswit[OVERVIEW_SWITCH])
2098 g_print(" Line %ld column %ld - "
2099 "Wrongspaced quotes?\n",
2100 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2107 if (g_utf8_get_char(aline)==CHAR_DQUOTE)
2109 if (g_utf8_strchr(",;:!?)]} ",-1,
2110 g_utf8_get_char(g_utf8_next_char(aline))))
2112 if (pswit[ECHO_SWITCH])
2113 g_print("\n%s\n",aline);
2114 if (!pswit[OVERVIEW_SWITCH])
2115 g_print(" Line %ld column 1 - Wrongspaced quotes?\n",
2121 if (pswit[SQUOTE_SWITCH])
2123 nc=g_utf8_get_char(aline);
2124 for (s=aline;*s;s=g_utf8_next_char(s))
2127 nc=g_utf8_get_char(g_utf8_next_char(s));
2128 if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&
2129 !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
2130 !g_unichar_isalpha(nc)))
2132 parities->squote=!parities->squote;
2133 if (!parities->squote)
2136 if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
2138 if (pswit[ECHO_SWITCH])
2139 g_print("\n%s\n",aline);
2140 if (!pswit[OVERVIEW_SWITCH])
2141 g_print(" Line %ld column %ld - "
2142 "Wrongspaced singlequotes?\n",
2143 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2151 if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
2152 !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
2154 if (pswit[ECHO_SWITCH])
2155 g_print("\n%s\n",aline);
2156 if (!pswit[OVERVIEW_SWITCH])
2157 g_print(" Line %ld column %ld - "
2158 "Wrongspaced singlequotes?\n",
2159 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2170 * check_for_double_punctuation:
2172 * Look for double punctuation like ,. or ,,
2173 * Thanks to DW for the suggestion!
2174 * In books with references, ".," and ".;" are common
2175 * e.g. "etc., etc.," and vol. 1.; vol 3.;
2176 * OTOH, from my initial tests, there are also fairly
2177 * common errors. What to do? Make these cases paranoid?
2178 * ".," is the most common, so warnings->dotcomma is used
2179 * to suppress detailed reporting if it occurs often.
/*
 * Queries two adjacent characters that both belong to the set . ? ! , ; :
 *
 * aline:    the line being checked.
 * warnings: warnings->dotcomma and warnings->isFrench are read by the
 *           exemption test.
 *
 * Exempt from reporting:
 *   - a doubled . ? or ! (see the "can be legit" remark in the body);
 *   - ".," when warnings->dotcomma is not set;
 *   - for French texts, an ellipsis "..." directly before or after one of
 *     , ; : ! ?
 * The reported column is the 1-based character position of the first
 * character of the pair.
 */
2181 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
2185 nc=g_utf8_get_char(aline);
2186 for (s=aline;*s;s=g_utf8_next_char(s))
2189 nc=g_utf8_get_char(g_utf8_next_char(s));
2190 /* for each punctuation character in the line */
2191 if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
2192 g_utf8_strchr(".?!,;:",-1,nc))
2194 /* followed by punctuation, it's a query, unless . . . */
2195 if (c==nc && (c=='.' || c=='?' || c=='!') ||
2196 !warnings->dotcomma && c=='.' && nc==',' ||
2197 warnings->isFrench && g_str_has_prefix(s,",...") ||
2198 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2199 warnings->isFrench && g_str_has_prefix(s,";...") ||
2200 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2201 warnings->isFrench && g_str_has_prefix(s,":...") ||
2202 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2203 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2204 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2205 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2206 warnings->isFrench && g_str_has_prefix(s,"...?"))
/*
 * The French ellipsis patterns are tested a second time; the statements
 * this inner test guards are only partly visible in this listing.
 */
2208 if (warnings->isFrench && g_str_has_prefix(s,",...") ||
2209 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2210 warnings->isFrench && g_str_has_prefix(s,";...") ||
2211 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2212 warnings->isFrench && g_str_has_prefix(s,":...") ||
2213 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2214 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2215 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2216 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2217 warnings->isFrench && g_str_has_prefix(s,"...?"))
2220 nc=g_utf8_get_char(g_utf8_next_char(s));
2222 ; /* do nothing for .. !! and ?? which can be legit */
2226 if (pswit[ECHO_SWITCH])
2227 g_print("\n%s\n",aline);
2228 if (!pswit[OVERVIEW_SWITCH])
2229 g_print(" Line %ld column %ld - Double punctuation?\n",
2230 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2239 * check_for_spaced_quotes:
/*
 * Queries quote marks that have a space on both sides.
 *
 * Every space-doublequote-space sequence is reported first. Then, for each
 * character in single_quotes[], the pattern space-quote-space is built in
 * a GString and every occurrence is reported. After a match the search
 * resumes two characters past the start of the match, so the trailing
 * space can also begin the next match. The GString is freed before return.
 */
2241 void check_for_spaced_quotes(const char *aline)
2245 const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,
2249 while ((t=strstr(s," \" ")))
2251 if (pswit[ECHO_SWITCH])
2252 g_print("\n%s\n",aline);
2253 if (!pswit[OVERVIEW_SWITCH])
2254 g_print(" Line %ld column %ld - Spaced doublequote?\n",
2255 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2258 s=g_utf8_next_char(g_utf8_next_char(t));
2260 pattern=g_string_new(NULL);
2261 for(i=0;i<G_N_ELEMENTS(single_quotes);i++)
/* Rebuild the pattern as: space, i-th single quote character, space. */
2263 g_string_assign(pattern," ");
2264 g_string_append_unichar(pattern,single_quotes[i]);
2265 g_string_append_c(pattern,' ');
2267 while ((t=strstr(s,pattern->str)))
2269 if (pswit[ECHO_SWITCH])
2270 g_print("\n%s\n",aline);
2271 if (!pswit[OVERVIEW_SWITCH])
2272 g_print(" Line %ld column %ld - Spaced singlequote?\n",
2273 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2276 s=g_utf8_next_char(g_utf8_next_char(t));
2279 g_string_free(pattern,TRUE);
2283 * check_for_miscased_genative:
2285 * Check special case of 'S instead of 's at end of word.
/*
 * Queries an apostrophe that follows a lowercase letter and is itself
 * followed by a capital 'S'.
 *
 * The scan starts at the second character of the line. The column printed
 * is the character offset of s plus 2.
 */
2287 void check_for_miscased_genative(const char *aline)
2293 c=g_utf8_get_char(aline);
2294 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2295 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2299 nc=g_utf8_get_char(g_utf8_next_char(s));
2300 if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))
2302 if (pswit[ECHO_SWITCH])
2303 g_print("\n%s\n",aline);
2304 if (!pswit[OVERVIEW_SWITCH])
2305 g_print(" Line %ld column %ld - Capital \"S\"?\n",
2306 linecnt,g_utf8_pointer_to_offset(aline,s)+2);
2314 * check_end_of_line:
2316 * Now check special cases - start and end of line -
2317 * for single and double quotes. Start is sometimes [sic]
2318 * but better to query it anyway.
2319 * While we're here, check for dash at end of line.
/*
 * Checks the two ends of a line that is longer than one character:
 *   - a double or single quote as the last character, preceded by a space;
 *   - a single quote as the first character, followed by a space;
 *   - in paranoid mode with warnings->hyphen set: a single hyphen (not
 *     preceded by another) as the last character above CHAR_SPACE.
 *
 * lbytes holds the length of the line in bytes, so aline+lbytes is the
 * terminating NUL and g_utf8_prev_char() of it is the last character.
 */
2321 void check_end_of_line(const char *aline,struct warnings *warnings)
2326 lbytes=strlen(aline);
2327 if (g_utf8_strlen(aline,lbytes)>1)
2329 s=g_utf8_prev_char(aline+lbytes);
2330 c1=g_utf8_get_char(s);
2331 c2=g_utf8_get_char(g_utf8_prev_char(s));
2332 if ((c1==CHAR_DQUOTE || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)
2334 if (pswit[ECHO_SWITCH])
2335 g_print("\n%s\n",aline);
2336 if (!pswit[OVERVIEW_SWITCH])
2337 g_print(" Line %ld column %ld - Spaced quote?\n",linecnt,
2338 g_utf8_strlen(aline,lbytes));
/* c1 and c2 are reused for the first two characters of the line. */
2342 c1=g_utf8_get_char(aline);
2343 c2=g_utf8_get_char(g_utf8_next_char(aline));
2344 if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)
2346 if (pswit[ECHO_SWITCH])
2347 g_print("\n%s\n",aline);
2348 if (!pswit[OVERVIEW_SWITCH])
2349 g_print(" Line %ld column 1 - Spaced quote?\n",linecnt);
2354 * Dash at end of line may well be legit - paranoid mode only
2355 * and don't report em-dash at line-end.
2357 if (pswit[PARANOID_SWITCH] && warnings->hyphen)
/* Step back over trailing characters at or below CHAR_SPACE. */
2359 for (s=g_utf8_prev_char(aline+lbytes);
2360 s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
2362 if (g_utf8_get_char(s)=='-' &&
2363 g_utf8_get_char(g_utf8_prev_char(s))!='-')
2365 if (pswit[ECHO_SWITCH])
2366 g_print("\n%s\n",aline);
2367 if (!pswit[OVERVIEW_SWITCH])
2368 g_print(" Line %ld column %ld - "
2369 "Hyphen at end of line?\n",
2370 linecnt,g_utf8_pointer_to_offset(aline,s));
2377 * check_for_unspaced_bracket:
2379 * Brackets are often unspaced, but shouldn't be surrounded by alpha.
2380 * If so, suspect a scanno like "a]most".
/*
 * Queries any bracket character { [ ( ) ] } that has a letter immediately
 * before and immediately after it.
 *
 * The scan starts at the second character and stops before the last, so
 * brackets at either end of the line are never reported. The column
 * printed is the 0-based character offset of the bracket, with no +1.
 */
2382 void check_for_unspaced_bracket(const char *aline)
2386 c=g_utf8_get_char(aline);
2387 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2388 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2392 nc=g_utf8_get_char(g_utf8_next_char(s));
2395 /* for each bracket character in the line except 1st & last */
2396 if (g_utf8_strchr("{[()]}",-1,c) &&
2397 g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
2399 if (pswit[ECHO_SWITCH])
2400 g_print("\n%s\n",aline);
2401 if (!pswit[OVERVIEW_SWITCH])
2402 g_print(" Line %ld column %ld - Unspaced bracket?\n",
2403 linecnt,g_utf8_pointer_to_offset(aline,s));
2411 * check_for_unpunctuated_endquote:
/*
 * Queries a double quote that directly follows a letter, i.e. a closing
 * quote with no punctuation before it.
 *
 * The scan starts at the second character of the line. The previous
 * character is classified with g_unichar_isalpha() because it is a
 * gunichar: the <ctype.h> isalpha() is undefined for values outside the
 * unsigned char range, and would also miss non-ASCII letters.
 * The column printed is the 0-based character offset of the quote.
 */
2413 void check_for_unpunctuated_endquote(const char *aline)
2417 c=g_utf8_get_char(aline);
2418 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2419 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2423 nc=g_utf8_get_char(g_utf8_next_char(s));
2424 /* for each character in the line except 1st */
2425 if (c==CHAR_DQUOTE && g_unichar_isalpha(pc))
2427 if (pswit[ECHO_SWITCH])
2428 g_print("\n%s\n",aline);
2429 if (!pswit[OVERVIEW_SWITCH])
2430 g_print(" Line %ld column %ld - "
2431 "endquote missing punctuation?\n",
2432 linecnt,g_utf8_pointer_to_offset(aline,s));
2440 * check_for_html_tag:
2442 * Check for <HTML TAG>.
2444 * If there is a < in the line, followed at some point
2445 * by a > then we suspect HTML.
/*
 * Queries a suspected HTML tag: the first '<' on the line together with
 * the first '>' that follows it.
 *
 * The text from '<' to '>' inclusive is copied with g_strndup() and
 * printed with the 1-based character column of the '<'.
 */
2447 void check_for_html_tag(const char *aline)
2449 const char *open,*close;
2451 open=strchr(aline,'<');
2454 close=strchr(g_utf8_next_char(open),'>');
2457 if (pswit[ECHO_SWITCH])
2458 g_print("\n%s\n",aline);
2459 if (!pswit[OVERVIEW_SWITCH])
2461 tag=g_strndup(open,close-open+1);
2462 g_print(" Line %ld column %ld - HTML Tag? %s \n",
2463 linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
2473 * check_for_html_entity:
2475 * Check for &symbol; HTML.
2477 * If there is a & in the line, followed at
2478 * some point by a ; then we suspect HTML.
/*
 * Queries a suspected HTML entity: the first '&' on the line together with
 * the first ';' that follows it.
 *
 * The span between them is scanned for a space, so that ordinary text such
 * as "Jones & Son;" can be told apart from an entity. The text from '&' to
 * ';' inclusive is copied with g_strndup() for the message.
 *
 * The column is the 1-based CHARACTER position of the '&', computed with
 * g_utf8_pointer_to_offset() as in check_for_html_tag(); a byte difference
 * would over-count when multi-byte characters precede the '&'.
 */
2480 void check_for_html_entity(const char *aline)
2482 const char *s,*amp,*scolon;
2484 amp=strchr(aline,'&');
2487 scolon=strchr(amp,';');
2490 for (s=amp;s<scolon;s=g_utf8_next_char(s))
2491 if (g_utf8_get_char(s)==CHAR_SPACE)
2492 break; /* Don't report "Jones & Son;" */
2495 if (pswit[ECHO_SWITCH])
2496 g_print("\n%s\n",aline);
2497 if (!pswit[OVERVIEW_SWITCH])
2499 entity=g_strndup(amp,scolon-amp+1);
2500 g_print(" Line %ld column %ld - HTML symbol? %s \n",
2501 linecnt,g_utf8_pointer_to_offset(aline,amp)+1,entity);
2512 * check_for_omitted_punctuation:
2514 * Check for omitted punctuation at end of paragraph by working back
2515 * through prevline. DW.
2516 * Need to check this only for "normal" paras.
2517 * So what is a "normal" para?
2518 * Not normal if one-liner (chapter headings, etc.)
2519 * Not normal if doesn't contain at least one lowercase letter
2520 * Not normal if starts with space
/*
 * Queries a paragraph whose last line ends without punctuation.
 *
 * prevline:        the last line of the paragraph just ended.
 * last:            only last->blen is read; it must exceed 2.
 * start_para_line: line number at which the paragraph began; it must be
 *                  less than linecnt-1.
 *
 * The check applies only when prevline contains at least one letter and
 * its first character is above CHAR_SPACE. Working back from the end of
 * prevline, closing or neutral quotes are handled by the do/while loop;
 * the following loop then reports when it meets a letter and tests each
 * character against the set - . : ! ( [ { ? } ] ).
 * The message is attributed to linecnt-1, with the length of prevline in
 * characters as the column.
 */
2522 void check_for_omitted_punctuation(const char *prevline,
2523 struct line_properties *last,int start_para_line)
2525 gboolean letter_on_line=FALSE;
2528 gboolean closing_quote;
2529 for (s=prevline;*s;s=g_utf8_next_char(s))
2530 if (g_unichar_isalpha(g_utf8_get_char(s)))
2532 letter_on_line=TRUE;
2536 * This next "if" is a problem.
2537 * If we say "start_para_line <= linecnt - 1", that includes
2538 * one-line "paragraphs" like chapter heads. Lotsa false positives.
2539 * If we say "start_para_line < linecnt - 1" it doesn't, but then it
2540 * misses genuine one-line paragraphs.
2542 if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
2543 g_utf8_get_char(prevline)>CHAR_SPACE)
2545 s=prevline+strlen(prevline);
2548 s=g_utf8_prev_char(s);
2549 c=g_utf8_get_char(s);
2550 if (QUOTE_CLASS(c)==CLOSING_QUOTE || QUOTE_CLASS(c)==NEUTRAL_QUOTE)
2553 closing_quote=FALSE;
2554 } while (closing_quote && s>prevline);
2555 for (;s>prevline;s=g_utf8_prev_char(s))
2557 if (g_unichar_isalpha(g_utf8_get_char(s)))
2559 if (pswit[ECHO_SWITCH])
2560 g_print("\n%s\n",prevline);
2561 if (!pswit[OVERVIEW_SWITCH])
2562 g_print(" Line %ld column %ld - "
2563 "No punctuation at para end?\n",
2564 linecnt-1,g_utf8_strlen(prevline,-1));
2569 if (g_utf8_strchr("-.:!([{?}])",-1,g_utf8_get_char(s)))
/*
 * Prints a note saying how many times a queried word was duplicated.
 *
 * key is used as the word (a C string). The (key, value, data) signature
 * returning gboolean has the shape of a GLib tree traversal callback.
 * NOTE(review): presumably used with the qword tree, with value being the
 * int counter stored there - confirm against the caller.
 */
2575 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
2577 const char *word=key;
2580 g_print("\nNote: Queried word %s was duplicated %d times\n",
/*
 * Writes string to stdout after converting it from UTF-8 to WINDOWS-1252.
 *
 * The GIConv converter is a function-level static, opened on first use and
 * kept between calls; (GIConv)-1 marks "not open". The output buffer is
 * sized from strlen(string) plus one byte.
 *
 * NOTE(review): the conditions guarding the g_iconv_close() branch and the
 * final fputs() are not visible in this listing; presumably the former is
 * a reset request and the latter a fallback that prints the string
 * unconverted - confirm. The return value of g_iconv() is not used on the
 * visible line.
 */
2585 void print_as_windows_1252(const char *string)
2587 gsize inbytes,outbytes;
2589 static GIConv converter=(GIConv)-1;
2592 if (converter!=(GIConv)-1)
2593 g_iconv_close(converter);
2594 converter=(GIConv)-1;
2597 if (converter==(GIConv)-1)
2598 converter=g_iconv_open("WINDOWS-1252","UTF-8");
2599 if (converter!=(GIConv)-1)
2601 inbytes=outbytes=strlen(string);
2602 bp=buf=g_malloc(outbytes+1);
2603 g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
2609 fputs(string,stdout);
/*
 * Writes string to stdout unchanged, with no character-set conversion.
 */
2612 void print_as_utf_8(const char *string)
2614 fputs(string,stdout);
/*
 * procfile:
 *
 * Checks one file. The text is loaded with read_etext(), first_pass() and
 * report_first_pass() produce the results and warnings structures that
 * gate some of the later checks, and then every line returned by flgets()
 * is run through the per-line checks. Afterwards outstanding quote
 * mismatches and duplicated queries are reported and the two query trees
 * are released.
 */
2622 void procfile(const char *filename)
2625 gchar *parastart=NULL; /* first line of current para */
2626 gchar *etext,*aline;
2629 struct first_pass_results *first_pass_results;
2630 struct warnings *warnings;
2631 struct counters counters={0};
2632 struct line_properties last={0};
2633 struct parities parities={0};
2634 struct pending pending={0};
2635 gboolean isemptyline;
2636 long start_para_line=0;
2637 gboolean isnewpara=FALSE,enddash=FALSE;
2638 last.start=CHAR_SPACE;
/* Both line counters restart at zero for each file. */
2639 linecnt=checked_linecnt=0;
2640 etext=read_etext(filename,&err);
/* A read error is reported on stdout or on stderr, depending on STDOUT_SWITCH. */
2643 if (pswit[STDOUT_SWITCH])
2644 fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
2646 fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
2649 g_print("\n\nFile: %s\n\n",filename);
2650 first_pass_results=first_pass(etext);
2651 warnings=report_first_pass(first_pass_results);
/*
 * Both trees compare their keys with strcmp. qword releases keys and
 * values with g_free; qperiod releases only its keys.
 */
2652 qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
2653 qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
2655 * Here we go with the main pass. Hold onto yer hat!
/* flgets() is told the number the line about to be read will have. */
2659 while ((aline=flgets(&etext_ptr,linecnt+1)))
2664 if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
2665 continue; // skip DP page separators completely
/*
 * Lines before firstline, or beyond a positive footerline, are not
 * checked; with HEADER_SWITCH a few recognised fields are echoed.
 */
2666 if (linecnt<first_pass_results->firstline ||
2667 (first_pass_results->footerline>0 &&
2668 linecnt>first_pass_results->footerline))
2670 if (pswit[HEADER_SWITCH])
2672 if (g_str_has_prefix(aline,"Title:"))
2673 g_print(" %s\n",aline);
2674 if (g_str_has_prefix(aline,"Author:"))
2675 g_print(" %s\n",aline);
2676 if (g_str_has_prefix(aline,"Release Date:"))
2677 g_print(" %s\n",aline);
2678 if (g_str_has_prefix(aline,"Edition:"))
2679 g_print(" %s\n\n",aline);
2681 continue; /* skip through the header */
2684 print_pending(aline,parastart,&pending);
2685 isemptyline=analyse_quotes(aline,linecnt,&counters);
2686 if (isnewpara && !isemptyline)
2688 /* This line is the start of a new paragraph. */
2689 start_para_line=linecnt;
2690 /* Capture its first line in case we want to report it later. */
2692 parastart=g_strdup(aline);
2693 memset(&parities,0,sizeof(parities)); /* restart the quote count */
/* Step over leading characters that are neither letters nor digits. */
2695 while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
2696 !g_unichar_isdigit(g_utf8_get_char(s)))
2697 s=g_utf8_next_char(s);
2698 if (g_unichar_islower(g_utf8_get_char(s)))
2700 /* and its first letter is lowercase */
2701 if (pswit[ECHO_SWITCH])
2702 g_print("\n%s\n",aline);
/* The column is the 1-based character offset of s within aline. */
2703 if (!pswit[OVERVIEW_SWITCH])
2704 g_print(" Line %ld column %ld - "
2705 "Paragraph starts with lower-case\n",
2706 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2710 isnewpara=FALSE; /* Signal the end of new para processing. */
2712 /* Check for an em-dash broken at line end. */
2713 if (enddash && g_utf8_get_char(aline)=='-')
2715 if (pswit[ECHO_SWITCH])
2716 g_print("\n%s\n",aline);
2717 if (!pswit[OVERVIEW_SWITCH])
2718 g_print(" Line %ld column 1 - Broken em-dash?\n",linecnt);
/*
 * Walk back over trailing spaces to reach the last non-space character,
 * then test whether that character is a dash.
 * NOTE(review): for an empty aline, s starts before aline, so
 * g_utf8_prev_char() and the first g_utf8_get_char() in the loop
 * condition look at bytes ahead of the line; only the test after the
 * loop checks s>=aline. Confirm this is safe at the start of the buffer.
 */
2723 for (s=g_utf8_prev_char(aline+strlen(aline));
2724 g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
2726 if (s>=aline && g_utf8_get_char(s)=='-')
2728 check_for_control_characters(aline);
2729 check_for_odd_characters(aline,warnings,isemptyline);
/* The long-line and short-line checks run only when the first pass enabled them. */
2730 if (warnings->longline)
2731 check_for_long_line(aline);
2732 if (warnings->shortline)
2733 check_for_short_line(aline,&last);
/* Record this line's length in characters and its first character in last. */
2735 last.len=g_utf8_strlen(aline,-1);
2736 last.start=g_utf8_get_char(aline);
2737 check_for_starting_punctuation(aline);
2740 check_for_spaced_emdash(aline);
2741 check_for_spaced_dash(aline);
2743 check_for_unmarked_paragraphs(aline);
2744 check_for_jeebies(aline);
2745 check_for_mta_from(aline);
2746 check_for_orphan_character(aline);
2747 check_for_pling_scanno(aline);
2748 check_for_extra_period(aline,warnings);
2749 check_for_following_punctuation(aline);
2750 check_for_typos(aline,warnings);
2751 check_for_misspaced_punctuation(aline,&parities,isemptyline);
2752 check_for_double_punctuation(aline,warnings);
2753 check_for_spaced_quotes(aline);
2754 check_for_miscased_genative(aline);
2755 check_end_of_line(aline,warnings);
2756 check_for_unspaced_bracket(aline);
2757 if (warnings->endquote)
2758 check_for_unpunctuated_endquote(aline);
2759 check_for_html_tag(aline);
2760 check_for_html_entity(aline);
/* The quote counters are checked and then cleared. */
2763 check_for_mismatched_quotes(&counters,&pending);
2764 counters_reset(&counters);
2765 /* let the next iteration know that it's starting a new para */
2768 check_for_omitted_punctuation(prevline,&last,start_para_line);
/* Keep a separate copy of this line as prevline. */
2771 prevline=g_strdup(aline);
/* After the loop: final quote check, then flush and clear pending reports. */
2774 check_for_mismatched_quotes(&counters,&pending);
2775 print_pending(NULL,parastart,&pending);
2776 reset_pending(&pending);
/* Duplicated queries are listed only when neither OVERVIEW_SWITCH nor VERBOSE_SWITCH is set. */
2785 if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])
2786 g_tree_foreach(qword,report_duplicate_queries,NULL);
2787 g_tree_unref(qword);
2788 g_tree_unref(qperiod);
2789 counters_destroy(&counters);
/* Restore GLib's default print handler, then call the Windows-1252 printer with NULL. */
2790 g_set_print_handler(NULL);
2791 print_as_windows_1252(NULL);
2792 if (pswit[MARKUP_SWITCH])
2799 * Get one line from the input text, checking for
2800 * the existence of exactly one CR/LF line-end per line.
2802 * Returns: a pointer to the line.
/*
 * etext is the caller's cursor into the text and is advanced as
 * characters are consumed. lcnt is used only as the line number printed
 * in the line-end diagnostics.
 */
2804 char *flgets(char **etext,long lcnt)
2807 gboolean isCR=FALSE;
/* The line starts wherever the caller's cursor currently points. */
2808 char *theline=*etext;
/* Decode one character and move the caller's cursor past it. */
2813 c=g_utf8_get_char(*etext);
2814 *etext=g_utf8_next_char(*etext);
2817 /* either way, it's end of line */
2824 /* Error - a LF without a preceding CR */
/*
 * Each line-end diagnostic below has the same shape: a LINE_END_SWITCH
 * test, an optional echo of the line under ECHO_SWITCH, and a message
 * that OVERVIEW_SWITCH suppresses. g_strndup() copies the bytes from
 * theline up to eos so the line can be printed as a terminated string.
 */
2825 if (pswit[LINE_END_SWITCH])
2827 if (pswit[ECHO_SWITCH])
2829 s=g_strndup(theline,eos-theline);
2830 g_print("\n%s\n",s);
2833 if (!pswit[OVERVIEW_SWITCH])
2834 g_print(" Line %ld - No CR?\n",lcnt);
2845 /* Error - two successive CRs */
2846 if (pswit[LINE_END_SWITCH])
2848 if (pswit[ECHO_SWITCH])
2850 s=g_strndup(theline,eos-theline);
2851 g_print("\n%s\n",s);
2854 if (!pswit[OVERVIEW_SWITCH])
2855 g_print(" Line %ld - Two successive CRs?\n",lcnt);
/* Here the report additionally requires that isCR is set. */
2864 if (pswit[LINE_END_SWITCH] && isCR)
2866 if (pswit[ECHO_SWITCH])
2868 s=g_strndup(theline,eos-theline);
2869 g_print("\n%s\n",s);
/* The column is the 1-based character offset of eos within the line. */
2872 if (!pswit[OVERVIEW_SWITCH])
2873 g_print(" Line %ld column %ld - CR without LF?\n",
2874 lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
2880 eos=g_utf8_next_char(eos);
/* Optional in-place clean-up of the line: HTML markup with MARKUP_SWITCH, DP markup with DP_SWITCH. */
2884 if (pswit[MARKUP_SWITCH])
2885 postprocess_for_HTML(theline);
2886 if (pswit[DP_SWITCH])
2887 postprocess_for_DP(theline);
2894 * Takes a "word" as a parameter, and checks whether it
2895 * contains a mixture of alpha and digits. Generally, this is an
2896 * error, but may not be for cases like 4th or L5 12s. 3d.
2898 * Returns: TRUE iff an error is found.
2900 gboolean mixdigit(const char *checkword)
2902 gboolean wehaveadigit,wehavealetter,query;
2903 const char *s,*nondigit;
2904 wehaveadigit=wehavealetter=query=FALSE;
/* Classify every character of the word as a letter, a digit, or neither. */
2905 for (s=checkword;*s;s=g_utf8_next_char(s))
2906 if (g_unichar_isalpha(g_utf8_get_char(s)))
2908 else if (g_unichar_isdigit(g_utf8_get_char(s)))
2910 if (wehaveadigit && wehavealetter)
2912 /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
/* Skip the leading digits so that nondigit points at the first non-digit character. */
2914 for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
2915 nondigit=g_utf8_next_char(nondigit))
2917 /* digits, ending in st, rd, nd, th of either case */
/* Each comparison is against the entire remainder after the leading digits. */
2918 if (!g_ascii_strcasecmp(nondigit,"st") ||
2919 !g_ascii_strcasecmp(nondigit,"rd") ||
2920 !g_ascii_strcasecmp(nondigit,"nd") ||
2921 !g_ascii_strcasecmp(nondigit,"th"))
/* The same ordinal suffixes followed by "s". */
2923 if (!g_ascii_strcasecmp(nondigit,"sts") ||
2924 !g_ascii_strcasecmp(nondigit,"rds") ||
2925 !g_ascii_strcasecmp(nondigit,"nds") ||
2926 !g_ascii_strcasecmp(nondigit,"ths"))
/* The same ordinal suffixes followed by "ly". */
2928 if (!g_ascii_strcasecmp(nondigit,"stly") ||
2929 !g_ascii_strcasecmp(nondigit,"rdly") ||
2930 !g_ascii_strcasecmp(nondigit,"ndly") ||
2931 !g_ascii_strcasecmp(nondigit,"thly"))
2933 /* digits, ending in l, L, s or d */
/* Only the "l" comparison ignores case; "s" and "d" must be lower-case. */
2934 if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
2935 !strcmp(nondigit,"d"))
2938 * L at the start of a number, representing Britsh pounds, like L500.
2939 * This is cute. We know the current word is mixed digit. If the first
2940 * letter is L, there must be at least one digit following. If both
2941 * digits and letters follow, we have a genuine error, else we have a
2942 * capital L followed by digits, and we accept that as a non-error.
/* Recursive call on the remainder of the word after the leading 'L'. */
2944 if (g_utf8_get_char(checkword)=='L' &&
2945 !mixdigit(g_utf8_next_char(checkword)))
2954 * Extracts the first/next "word" from the line, and returns it.
2955 * A word is defined as one English word unit--or at least that's the aim.
2956 * "ptr" is advanced to the position in the line where we will start
2957 * looking for the next word.
2959 * Returns: A newly-allocated string.
2961 gchar *getaword(const char **ptr)
/* The word is accumulated in a GString. */
2966 word=g_string_new(NULL);
/* Advance *ptr to the first letter or digit, or to the end of the string. */
2967 for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
2968 !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
2969 **ptr;*ptr=g_utf8_next_char(*ptr))
2972 * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
2973 * Especially yucky is the case of L1,000
2974 * This section looks for a pattern of characters including a digit
2975 * followed by a comma or period followed by one or more digits.
2976 * If found, it returns this whole pattern as a word; otherwise we discard
2977 * the results and resume our normal programming.
/* Collect the run of letters, digits, commas and periods starting at s. */
2980 for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
2981 g_unichar_isalpha(g_utf8_get_char(s)) ||
2982 g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
2983 g_string_append_unichar(word,g_utf8_get_char(s));
/*
 * Scan the candidate from its second character on, looking at each
 * character (c) together with the one before it (pc).
 */
2986 for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))
2988 c=g_utf8_get_char(t);
2989 pc=g_utf8_get_char(g_utf8_prev_char(t));
2990 if ((c=='.' || c==',') && g_unichar_isdigit(pc))
/* g_string_free(...,FALSE) discards the GString wrapper and hands its character data to the caller. */
2993 return g_string_free(word,FALSE);
2997 /* we didn't find a punctuated number - do the regular getword thing */
2998 g_string_truncate(word,0);
/* The regular word is the run of letters, digits and apostrophes at *ptr; *ptr ends up just past it. */
2999 c=g_utf8_get_char(*ptr);
3000 for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);
3001 *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))
3002 g_string_append_unichar(word,c);
3003 return g_string_free(word,FALSE);
3009 * Is this word a Roman Numeral?
3011 * It doesn't actually validate that the number is a valid Roman Numeral--for
3012 * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
3013 * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
3014 * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
3015 * expressions thereof, except when it came to taxes. Allow any number of M,
3016 * an optional D, an optional CM or CD, any number of optional Cs, an optional
3017 * XL or an optional XC, an optional IX or IV, an optional V and any number
/*
 * The components are tested in this order: m's, d, cm, cd, c's, xl, xc,
 * l, x's, ix, iv, v and i's. Every comparison is against a lower-case
 * letter.
 * NOTE(review): presumably callers pass an already lower-cased word -- confirm.
 */
3020 gboolean isroman(const char *t)
/* In the loops the "&& *t" test is redundant: a NUL never equals the letter. */
3026 while (g_utf8_get_char(t)=='m' && *t)
3028 if (g_utf8_get_char(t)=='d')
3030 if (g_str_has_prefix(t,"cm"))
3032 if (g_str_has_prefix(t,"cd"))
3034 while (g_utf8_get_char(t)=='c' && *t)
3036 if (g_str_has_prefix(t,"xl"))
3038 if (g_str_has_prefix(t,"xc"))
3040 if (g_utf8_get_char(t)=='l')
3042 while (g_utf8_get_char(t)=='x' && *t)
3044 if (g_str_has_prefix(t,"ix"))
3046 if (g_str_has_prefix(t,"iv"))
3048 if (g_utf8_get_char(t)=='v')
3050 while (g_utf8_get_char(t)=='i' && *t)
3056 * postprocess_for_DP:
3058 * Invoked with the -d switch from flgets().
3059 * It simply "removes" from the line a hard-coded set of common
3060 * DP-specific tags, so that the line passed to the main routine has
3061 * been pre-cleaned of DP markup.
3063 void postprocess_for_DP(char *theline)
/* DPmarkup is walked up to its first empty string, which acts as the terminator. */
3069 for (i=0;*DPmarkup[i];i++)
/* Remove every occurrence of this tag, not just the first one. */
3070 while ((s=strstr(theline,DPmarkup[i])))
/* t is the text after the tag; slide it, with its terminating NUL, down over the tag. */
3072 t=s+strlen(DPmarkup[i]);
3073 memmove(s,t,strlen(t)+1);
3078 * postprocess_for_HTML:
3080 * Invoked with the -m switch from flgets().
3081 * It simply "removes" from the line a hard-coded set of common
3082 * HTML tags and "replaces" a hard-coded set of common HTML
3083 * entities, so that the line passed to the main routine has
3084 * been pre-cleaned of HTML.
3086 void postprocess_for_HTML(char *theline)
/*
 * losemarkup() is called repeatedly for as long as it returns non-NULL.
 * NOTE(review): presumably the loop body is empty and loseentities()
 * runs once afterwards -- confirm.
 */
3088 while (losemarkup(theline))
3090 loseentities(theline);
/*
 * losemarkup:
 *
 * Looks for a '<' in theline and a '>' after it, and compares the text
 * following the '<' with each entry of the markup table using tagcomp().
 * For a recognised tag, the text after the '>' is moved down over it.
 */
3093 char *losemarkup(char *theline)
/* t stays NULL when there is no '<', or no '>' after it. */
3097 s=strchr(theline,'<');
3098 t=s?strchr(s,'>'):NULL;
/* markup is walked up to its first empty string, which acts as the terminator. */
3101 for (i=0;*markup[i];i++)
3102 if (tagcomp(g_utf8_next_char(s),markup[i]))
/* Step past the '>' and close the gap, copying the terminating NUL as well. */
3104 t=g_utf8_next_char(t);
3105 memmove(s,t,strlen(t)+1);
3108 /* It's an unrecognized <xxx>. */
/*
 * loseentities:
 *
 * Rewrites character entities in theline in place. Each '&' is paired
 * with the following ';'. The text between them is read as a decimal or
 * hexadecimal number, or looked up by name in a tree built from the
 * HTMLentities table. The resulting character is written back as UTF-8
 * and the rest of the line is moved down after it.
 */
3112 void loseentities(char *theline)
/*
 * NOTE(review): entities is an automatic variable that starts as NULL on
 * every call, whereas the two converters below are static. As written on
 * these lines, the tree created further down is not kept for a later
 * call, and the g_tree_destroy() in the clean-up code can never see it.
 * Confirm whether "static" was intended here.
 */
3119 GTree *entities=NULL;
/* Cached across calls; (GIConv)-1 means that the converter is not open. */
3120 static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
/* Clean-up: drop the lookup tree and close any open converters. */
3124 g_tree_destroy(entities);
3126 if (translit!=(GIConv)-1)
3127 g_iconv_close(translit);
3128 translit=(GIConv)-1;
3129 if (to_utf8!=(GIConv)-1)
3130 g_iconv_close(to_utf8);
/*
 * Build the name lookup: keys are the entity names from HTMLentities,
 * compared with strcmp; each value is the code point stored in the
 * pointer itself via GUINT_TO_POINTER.
 */
3138 entities=g_tree_new((GCompareFunc)strcmp);
3139 for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
3140 g_tree_insert(entities,HTMLentities[i].name,
3141 GUINT_TO_POINTER(HTMLentities[i].c));
/* Open the converters lazily: UTF-8 to ISO_8859-1 with transliteration, and ISO_8859-1 back to UTF-8. */
3143 if (translit==(GIConv)-1)
3144 translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
3145 if (to_utf8==(GIConv)-1)
3146 to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
3147 while((amp=strchr(theline,'&')))
3149 scolon=strchr(amp,';');
/* Numeric forms: everything from amp+2 up to the ';' must be decimal digits ... */
3154 if (amp+2+strspn(amp+2,"0123456789")==scolon)
3155 c=strtol(amp+2,NULL,10);
/* ... or an 'x' at amp[2] followed only by hexadecimal digits up to the ';'. */
3156 else if (amp[2]=='x' &&
3157 amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
3158 c=strtol(amp+3,NULL,16);
/* Named form: the text between '&' and ';' is copied and looked up in the tree. */
3162 s=g_strndup(amp+1,scolon-(amp+1));
3163 c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
/* && binds tighter than ||, so this reads: c<128 || (c>=192 && c<=255). */
3172 if (c<128 || c>=192 && c<=255) /* An ISO-8859-1 character */
3173 theline+=g_unichar_to_utf8(c,theline);
/*
 * Any other character is encoded as UTF-8, transliterated to ISO_8859-1,
 * converted back to UTF-8, and the nb resulting bytes are copied into
 * the line.
 */
3177 nb=g_unichar_to_utf8(c,s);
3178 t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
3180 s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
3182 memcpy(theline,s,nb);
/* Move the text after the ';' (with its terminating NUL) down to theline. */
3186 memmove(theline,g_utf8_next_char(scolon),
3187 strlen(g_utf8_next_char(scolon))+1);
/* Resume the search just past this '&'. */
3190 theline=g_utf8_next_char(amp);
/*
 * tagcomp:
 *
 * Case-insensitive prefix test used by losemarkup(). Both strings are
 * case-folded, a leading '/' in strin is skipped first, and the result
 * says whether the folded strin begins with the folded basetag.
 */
3194 gboolean tagcomp(const char *strin,const char *basetag)
3198 if (g_utf8_get_char(strin)=='/')
3199 t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
3201 t=g_utf8_casefold(strin,-1);
3202 s=g_utf8_casefold(basetag,-1);
/* g_utf8_casefold() returns newly allocated strings, so s and t are owned by this function. */
3203 retval=g_str_has_prefix(t,s);
3209 void proghelp(GOptionContext *context)
3212 fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
3213 fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
3214 fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
3215 fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
3216 "For details, read the file COPYING.\n",stderr);
3217 fputs("This is Free Software; "
3218 "you may redistribute it under certain conditions (GPL);\n",stderr);
3219 fputs("read the file COPYING for details.\n\n",stderr);
3220 help=g_option_context_get_help(context,TRUE,NULL);
3223 fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
3224 fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
3225 "non-ASCII\n",stderr);
3226 fputs("characters like accented letters, "
3227 "lines longer than 75 or shorter than 55,\n",stderr);
3228 fputs("unbalanced quotes or brackets, "
3229 "a variety of badly formatted punctuation, \n",stderr);
3230 fputs("HTML tags, some likely typos. "
3231 "It is NOT a substitute for human judgement.\n",stderr);