1 /*************************************************************************/
2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com> */
5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk> */
7 /* This program is free software; you can redistribute it and/or modify */
8 /* it under the terms of the GNU General Public License as published by */
9 /* the Free Software Foundation; either version 2 of the License, or */
10 /* (at your option) any later version. */
12 /* This program is distributed in the hope that it will be useful, */
13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
15 /* GNU General Public License for more details. */
17 /* You should have received a copy of the GNU General Public License */
18 /* along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 /*************************************************************************/
30 #include "bookloupe.h"
33 #include "HTMLentities.h"
39 "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
40 "nad", "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa",
41 "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
42 "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
43 "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
44 "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
45 "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
46 "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
47 "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
48 "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
49 "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
50 "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
51 "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
52 "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
53 "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
54 "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
55 "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
56 "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
57 "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
58 "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
59 "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
60 "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
61 "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
62 "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
63 "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
64 "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
65 "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
66 "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
67 "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
73 /* Common abbreviations and other OK words not to query as typos. */
75 "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
76 "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
77 "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
78 "outbid", "outbids", "frostbite", "frostbitten", ""
81 /* Common abbreviations that cause otherwise unexplained periods. */
83 "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
84 "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
88 * Two-Letter combinations that rarely if ever start words,
89 * but are common scannos or otherwise common letter combinations.
92 "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
96 * Two-Letter combinations that rarely if ever end words,
97 * but are common scannos or otherwise common letter combinations.
100 "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
101 "sw", "gr", "sl", "cl", "iy", ""
105 "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
106 "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
107 "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
108 "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
112 "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
116 "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
117 "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
118 "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
119 "during", "let", "toward", "among", ""
123 "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
124 "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
125 "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
126 "among", "those", "into", "whom", "having", "thence", ""
/*
 * Command-line switch table.
 *
 * Every entry is a G_OPTION_ARG_NONE flag whose data pointer is one slot of
 * pswit[], so giving the flag on the command line sets that slot.
 * parse_options() inverts the PARANOID_SWITCH, LINE_END_SWITCH and
 * ECHO_SWITCH slots after parsing, because --relaxed, --line-end and --noecho
 * turn a check OFF rather than on.
 *
 * NOTE(review): the numbers embedded in this listing jump from 159 to 163, so
 * the terminating entry and the closing brace of the table are presumably on
 * lines that are not shown here.
 */
129 gboolean pswit[SWITNO]; /* program switches */
131 static GOptionEntry options[]={
132 { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
133 "Ignore DP-specific markup", NULL },
134 { "noecho", 'e', 0, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
135 "Don't echo queried line", NULL },
136 { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
137 "Check single quotes", NULL },
138 { "typo", 't', 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
139 "Check common typos", NULL },
140 { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
141 "Require closure of quotes on every paragraph", NULL },
142 { "relaxed", 'x', 0, G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
143 "Disable paranoid querying of everything", NULL },
144 { "line-end", 'l', 0, G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
145 "Disable line end checking", NULL },
146 { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
147 "Overview: just show counts", NULL },
148 { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
149 "Output errors to stdout instead of stderr", NULL },
150 { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
151 "Echo header fields", NULL },
152 { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
153 "Ignore markup in < >", NULL },
154 { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
155 "Use file of user-defined typos", NULL },
156 { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
157 "Defaults for use on www upload", NULL },
158 { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
159 "Verbose - list everything", NULL },
/*
 * File-wide tallies.
 *
 * main() prints the eleven query counters from cnt_quote to cnt_lineend and
 * their sum in its overview summary; cnt_spacend is not part of that sum.
 * linecnt and checked_linecnt feed the "Checked %ld lines of %ld" line there,
 * and linecnt is also the line number quoted in every individual query.
 */
163 long cnt_quote; /* for overview mode, count of quote queries */
164 long cnt_brack; /* for overview mode, count of brackets queries */
165 long cnt_bin; /* for overview mode, count of non-ASCII queries */
166 long cnt_odd; /* for overview mode, count of odd character queries */
167 long cnt_long; /* for overview mode, count of long line errors */
168 long cnt_short; /* for overview mode, count of short line queries */
169 long cnt_punct; /* for overview mode,
170 count of punctuation and spacing queries */
171 long cnt_dash; /* for overview mode, count of dash-related queries */
172 long cnt_word; /* for overview mode, count of word queries */
173 long cnt_html; /* for overview mode, count of html queries */
174 long cnt_lineend; /* for overview mode, count of line-end queries */
175 long cnt_spacend; /* count of lines with space at end */
176 long linecnt; /* count of total lines in the file */
177 long checked_linecnt; /* count of lines actually checked */
/*
 * Forward declarations.  print_as_windows_1252 and print_as_utf_8 are the
 * print handlers that read_etext() installs with g_set_print_handler(); the
 * definitions of the functions declared here are not visible in this part of
 * the file.
 * NOTE(review): the embedded listing numbers skip 181-183 and 196, so further
 * declarations may exist on lines that are not shown.
 */
179 void proghelp(GOptionContext *context);
180 void procfile(const char *);
184 gboolean mixdigit(const char *);
185 gchar *getaword(const char **);
186 char *flgets(char **,long,gboolean);
187 void postprocess_for_HTML(char *);
188 char *linehasmarkup(char *);
189 char *losemarkup(char *);
190 gboolean tagcomp(const char *,const char *);
191 void loseentities(char *);
192 gboolean isroman(const char *);
193 void postprocess_for_DP(char *);
194 void print_as_windows_1252(const char *string);
195 void print_as_utf_8(const char *string);
/* Two GTree handles; what they hold is not shown in this part of the file. */
197 GTree *qword,*qperiod;
/*
 * parse_options:
 *
 * Parse the command line into pswit[] through the options[] table and then
 * post-process the switches.
 *
 * argc, argv: passed by address straight to g_option_context_parse().
 *
 * On a parse failure the GLib error message and a "--help" hint are written
 * with g_printerr().  After parsing, the three "disable" switches (paranoid,
 * line-end, echo) are inverted so that the rest of the program can treat a
 * TRUE slot as "check enabled"; overview mode forces echo off; web mode
 * overwrites every switch listed below with fixed values.
 *
 * NOTE(review): the numbers embedded in this listing skip lines (204-205,
 * 211, 214-215, 248-253, ...), so the declaration of err, whatever follows
 * the two error messages, and anything between the web overrides and the
 * final free are not visible here.
 */
203 void parse_options(int *argc,char ***argv)
206 GOptionContext *context;
207 context=g_option_context_new(
208 "file - looks for errors in Project Gutenberg(TM) etexts");
209 g_option_context_add_main_entries(context,options,NULL);
210 if (!g_option_context_parse(context,argc,argv,&err))
212 g_printerr("Bookloupe: %s\n",err->message);
213 g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
216 /* Paranoid checking is turned OFF, not on, by its switch */
217 pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
218 if (pswit[PARANOID_SWITCH])
219 /* if running in paranoid mode, typo checks default to enabled */
/*
 * The typo slot is toggled, not set: with paranoid mode on, passing --typo
 * leaves typo checking disabled and omitting it leaves it enabled.
 */
220 pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
221 /* Line-end checking is turned OFF, not on, by its switch */
222 pswit[LINE_END_SWITCH]=!pswit[LINE_END_SWITCH];
223 /* Echoing is turned OFF, not on, by its switch */
224 pswit[ECHO_SWITCH]=!pswit[ECHO_SWITCH];
225 if (pswit[OVERVIEW_SWITCH])
226 /* just print summary; don't echo */
227 pswit[ECHO_SWITCH]=FALSE;
229 * Web uploads - for the moment, this is really just a placeholder
230 * until we decide what processing we really want to do on web uploads
232 if (pswit[WEB_SWITCH])
234 /* specific override for web uploads */
235 pswit[ECHO_SWITCH]=TRUE;
236 pswit[SQUOTE_SWITCH]=FALSE;
237 pswit[TYPO_SWITCH]=TRUE;
238 pswit[QPARA_SWITCH]=FALSE;
239 pswit[PARANOID_SWITCH]=TRUE;
240 pswit[LINE_END_SWITCH]=FALSE;
241 pswit[OVERVIEW_SWITCH]=FALSE;
242 pswit[STDOUT_SWITCH]=FALSE;
243 pswit[HEADER_SWITCH]=TRUE;
244 pswit[VERBOSE_SWITCH]=FALSE;
245 pswit[MARKUP_SWITCH]=FALSE;
246 pswit[USERTYPO_SWITCH]=FALSE;
247 pswit[DP_SWITCH]=FALSE;
254 g_option_context_free(context);
260 * Read in the user-defined stealth scanno list.
/*
 * read_user_scannos:
 *
 * Load the user typo file and store its lines as keys of the usertypo tree.
 *
 * Four names are tried in order: "bookloupe.typ" as a bare relative name,
 * "bookloupe.typ" under running_from, "gutcheck.typ" as a bare relative name
 * and "gutcheck.typ" under running_from.  Each later attempt follows a
 * G_FILE_ERROR_NOENT test on err.  When the last attempt also reports
 * "no such file" a notice is printed with g_print(); the fprintf(stderr)
 * call reports a file name together with err->message.
 *
 * Contents that pass g_utf8_validate() are normalized; the g_convert() call
 * from WINDOWS-1252 is presumably the alternative branch (the line between
 * them is not shown).  The text is split on CR and LF, and every resulting
 * line whose first byte is greater than '!' becomes a tree key with the
 * value 1.
 *
 * NOTE(review): g_convert() is called with a NULL error pointer and no check
 * of its result is visible before g_strsplit_set(); confirm that a failed
 * conversion cannot reach the loop over lines[].
 * NOTE(review): strcmp is cast to GCompareDataFunc although it takes two
 * arguments, not three.
 * NOTE(review): the embedded listing numbers skip lines throughout (263-264,
 * 266-268, 273-274, 298-301, 304-306, 318-...), so declarations, braces,
 * early returns and the release of contents/utf8/lines are not visible.
 */
262 void read_user_scannos(void)
265 gchar *usertypo_file;
269 gchar *contents,*utf8,**lines;
270 usertypo_file=g_strdup("bookloupe.typ");
271 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
272 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
275 g_free(usertypo_file);
276 usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
277 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
279 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
282 g_free(usertypo_file);
283 usertypo_file=g_strdup("gutcheck.typ");
284 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
286 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
289 g_free(usertypo_file);
290 usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
291 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
293 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
295 g_free(usertypo_file);
296 g_print(" --> I couldn't find bookloupe.typ "
297 "-- proceeding without user typos.\n");
302 fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
303 g_free(usertypo_file);
307 if (g_utf8_validate(contents,len,NULL))
308 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
310 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
312 lines=g_strsplit_set(utf8,"\r\n",0);
/* The tree is created with g_free as its key-destroy function. */
314 usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
315 for (i=0;lines[i];i++)
316 if (*(unsigned char *)lines[i]>'!')
317 g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
326 * Read an etext returning a newly allocated string containing the file
327 * contents or NULL on error.
/*
 * read_etext:
 *
 * Load filename with g_file_get_contents() and produce UTF-8 text from it.
 *
 * filename: path of the etext to read.
 * err:      receives the GLib error when reading or conversion fails.
 *
 * Contents that pass g_utf8_validate() are normalized with
 * G_NORMALIZE_DEFAULT_COMPOSE and print_as_utf_8 is installed as the GLib
 * print handler.  The g_convert() call from WINDOWS-1252 is presumably the
 * alternative branch (the lines between the two paths are not shown); after
 * it print_as_windows_1252 is installed.  When the conversion stops with
 * G_CONVERT_ERROR_ILLEGAL_SEQUENCE, the bytes before the failure are scanned
 * for newlines and non-CR bytes and an error naming the byte value, line and
 * column is stored in err; the updates to line and col are on lines that are
 * not shown.
 *
 * NOTE(review): the SetConsoleOutputCP() calls are presumably wrapped in a
 * platform conditional on lines that are not shown (341, 343, 369, 371).
 * NOTE(review): no return statement is visible in this listing; the comment
 * preceding the function states the return contract.
 */
329 gchar *read_etext(const char *filename,GError **err)
331 GError *tmp_err=NULL;
332 gchar *contents,*utf8;
333 gsize len,bytes_read,bytes_written;
335 if (!g_file_get_contents(filename,&contents,&len,err))
337 if (g_utf8_validate(contents,len,NULL))
339 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
340 g_set_print_handler(print_as_utf_8);
342 SetConsoleOutputCP(CP_UTF8);
347 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,
348 &bytes_written,&tmp_err);
349 if (g_error_matches(tmp_err,G_CONVERT_ERROR,
350 G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
/* bytes_read is where g_convert() stopped, i.e. the index of the bad byte. */
353 for(i=0;i<bytes_read;i++)
354 if (contents[i]=='\n')
359 else if (contents[i]!='\r')
361 g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
362 "Input conversion failed. Byte %d at line %d, column %d is not a "
363 "valid Windows-1252 character",
364 ((unsigned char *)contents)[bytes_read],line,col);
367 g_propagate_error(err,tmp_err);
368 g_set_print_handler(print_as_windows_1252);
370 SetConsoleOutputCP(1252);
/*
 * cleanup_on_exit:
 *
 * Exit handler registered by main() with atexit().  Puts the console output
 * code page back to saved_cp, the value main() obtained from
 * GetConsoleOutputCP() at start-up.
 * NOTE(review): the embedded listing numbers skip 378-379 and 381-382, so any
 * platform conditional around the call is not visible.
 */
377 void cleanup_on_exit(void)
380 SetConsoleOutputCP(saved_cp);
/*
 * main:
 *
 * Program entry point.  Registers cleanup_on_exit with atexit(), records the
 * console code page in saved_cp and the directory part of argv[0] in
 * running_from, then parses the options.  A banner is written to stderr.
 * Under OVERVIEW_SWITCH the checked-line summary, the per-category query
 * counters and their total are printed with g_print().  running_from and the
 * usertypo tree are released at the end.
 *
 * NOTE(review): the numbers embedded in this listing skip lines (385-386,
 * 389, 393, 395, 397, 401, 403, ...), so the statement executed under
 * USERTYPO_SWITCH, the call that processes the input file, any guards on the
 * individual count lines and the return statement are not visible here.
 */
384 int main(int argc,char **argv)
387 atexit(cleanup_on_exit);
388 saved_cp=GetConsoleOutputCP();
390 running_from=g_path_get_dirname(argv[0]);
391 parse_options(&argc,&argv);
392 if (pswit[USERTYPO_SWITCH])
394 fprintf(stderr,"bookloupe: Check and report on an e-text\n");
396 if (pswit[OVERVIEW_SWITCH])
398 g_print(" Checked %ld lines of %ld (head+foot = %ld)\n\n",
399 checked_linecnt,linecnt,linecnt-checked_linecnt);
400 g_print(" --------------- Queries found --------------\n");
402 g_print(" Long lines: %14ld\n",cnt_long);
404 g_print(" Short lines: %14ld\n",cnt_short);
406 g_print(" Line-end problems: %14ld\n",cnt_lineend);
408 g_print(" Common typos: %14ld\n",cnt_word);
410 g_print(" Unmatched quotes: %14ld\n",cnt_quote);
412 g_print(" Unmatched brackets: %14ld\n",cnt_brack);
414 g_print(" Non-ASCII characters: %14ld\n",cnt_bin);
416 g_print(" Proofing characters: %14ld\n",cnt_odd);
418 g_print(" Punctuation & spacing queries: %14ld\n",cnt_punct);
420 g_print(" Non-standard dashes: %14ld\n",cnt_dash);
422 g_print(" Possible HTML tags: %14ld\n",cnt_html);
/* The total adds the same eleven counters that are listed above. */
424 g_print(" TOTAL QUERIES %14ld\n",
425 cnt_quote+cnt_brack+cnt_bin+cnt_odd+cnt_long+cnt_short+cnt_punct+
426 cnt_dash+cnt_word+cnt_html+cnt_lineend);
428 g_free(running_from);
430 g_tree_unref(usertypo);
/*
 * count_dashes:
 *
 * Split line on every occurrence of dash and inspect the characters on either
 * side of each occurrence: pc is the last character of the text before it,
 * nc the first character of the text after it.  Three tests are made per
 * occurrence: white space on at least one side, on both sides, or on neither
 * side.  The flags spaced, spaced2 and unspaced presumably record those
 * results (the assignments are on lines not shown) before results is updated.
 *
 * line:    UTF-8 text of one line.
 * dash:    the dash string to split on; first_pass() calls this once with
 *          "--" and once with an em-dash.
 * results: tallies to update.  Only the non_PG_space increment is visible in
 *          this listing.
 *
 * NOTE(review): the numbers embedded in this listing skip lines (436-439,
 * 441-442, 444-445, 447, 451, 453, 455-459, 462, 464-...), so declarations,
 * braces, the flag assignments, the other results updates and the release of
 * tokens are not visible here.
 */
434 void count_dashes(const char *line,const char *dash,
435 struct dash_results *results)
440 gboolean spaced=FALSE,unspaced=FALSE,spaced2=FALSE;
443 tokens=g_strsplit(line,dash,0);
446 for(i=1;tokens[i];i++)
/*
 * An empty token means nothing precedes this dash (the line starts with it,
 * or two dashes are adjacent).  Stepping back from the end of an empty string
 * would read memory in front of the token, so use 0 for "no character"; that
 * is also what nc evaluates to when nothing follows the dash.
 */
448 pc=*tokens[i-1]?g_utf8_get_char(g_utf8_prev_char(tokens[i-1]+strlen(tokens[i-1]))):0;
449 nc=g_utf8_get_char(tokens[i]);
450 if (g_unichar_isspace(pc) || g_unichar_isspace(nc))
452 if (g_unichar_isspace(pc) && g_unichar_isspace(nc))
454 else if (!g_unichar_isspace(pc) && !g_unichar_isspace(nc))
460 /* count of lines with em-dashes with spaces both sides */
461 results->non_PG_space++;
463 /* count of lines with PG-type em-dashes with no spaces */
471 * Run a first pass - verify that it's a valid PG
472 * file, decide whether to report some things that
473 * occur many times in the text like long or short
474 * lines, non-standard dashes, etc.
/*
 * first_pass:
 *
 * Split etext on "\n" and gather whole-file statistics, line by line, into a
 * function-static first_pass_results (so the totals are not reset between
 * calls).
 *
 * etext: the complete text as one NUL-terminated UTF-8 string.
 *
 * Fields visibly written here: unix_lineends, footerline, firstline, totlen,
 * endquote_count, fslashline, verylongline, htmcount, emdash.base,
 * emdash.non_PG_space, emdash.PG_space, Dutchcount, Frenchcount and
 * standalone_digit.  Other counters that report_first_pass() reads
 * (shortline, longline, dotcomma, astline, hyphens, spacedash, binlen,
 * alphalen) are presumably updated on lines that are not shown.
 *
 * NOTE(review): the numbers embedded in this listing skip many lines (477,
 * 479-482, 487-488, 490, 494, 497, 499-500, ...), so declarations, braces,
 * several increments, the word loop that fills inword, the bookkeeping of
 * lastlen/lastblen and the return statement are not visible here.
 */
476 struct first_pass_results *first_pass(const char *etext)
478 gunichar laststart=CHAR_SPACE;
483 unsigned int lastlen=0,lastblen=0;
484 long spline=0,nspline=0;
485 static struct first_pass_results results={0};
486 struct dash_results tmp_dash_results;
489 lines=g_strsplit(etext,"\n",0);
491 /* If there's at least one line, we might have UNIX-style terminators */
492 results.unix_lineends=TRUE;
493 for (j=0;lines[j];j++)
495 lbytes=strlen(lines[j]);
/* Any trailing CR marks the file as not UNIX-style; all trailing CRs are cut. */
496 if (lbytes>0 && lines[j][lbytes-1]=='\r')
498 results.unix_lineends=FALSE;
501 lines[j][--lbytes]='\0';
502 } while (lbytes>0 && lines[j][lbytes-1]=='\r');
/* llen counts characters, lbytes counts bytes. */
504 llen=g_utf8_strlen(lines[j],lbytes);
/* Marker line containing "*END" and "SMALL PRINT": sets spline. */
506 if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
507 (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
510 g_print(" --> Duplicate header?\n");
511 spline=linecnt+1; /* first line of non-header text, that is */
/* Marker line starting "*** START" and naming PROJECT GUTENBERG: sets nspline. */
513 if (!strncmp(lines[j],"*** START",9) &&
514 strstr(lines[j],"PROJECT GUTENBERG"))
517 g_print(" --> Duplicate header?\n");
518 nspline=linecnt+1; /* first line of non-header text, that is */
/* Once a header marker has been seen, look for the footer line. */
520 if (spline || nspline)
522 lc_line=g_utf8_strdown(lines[j],lbytes);
523 if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
525 if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
527 if (results.footerline)
529 /* it's an old-form header - we can detect duplicates */
531 g_print(" --> Duplicate footer?\n");
534 results.footerline=linecnt;
540 results.firstline=spline;
542 results.firstline=nspline; /* override with new */
543 if (results.footerline)
544 continue; /* don't count the boilerplate in the footer */
545 results.totlen+=llen;
546 for (s=lines[j];*s;s=g_utf8_next_char(s))
548 if (g_utf8_get_char(s)>127)
550 if (g_unichar_isalpha(g_utf8_get_char(s)))
554 if (CHAR_IS_DQUOTE(g_utf8_get_char(s)))
555 qc=QUOTE_CLASS(g_utf8_get_char(s));
/* A closing or neutral double quote directly after a letter has no punctuation before it. */
558 if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) &&
559 g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
560 results.endquote_count++;
563 if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
564 lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
567 g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
569 if (strstr(lines[j],".,"))
571 /* only count ast lines for ignoring purposes where there is */
572 /* locase text on the line */
573 if (strchr(lines[j],'*'))
575 for (s=lines[j];*s;s=g_utf8_next_char(s))
576 if (g_unichar_islower(g_utf8_get_char(s)))
581 if (strchr(lines[j],'/'))
582 results.fslashline++;
/* Walk back over trailing white space to reach the last visible character. */
585 for (s=g_utf8_prev_char(lines[j]+lbytes);
586 s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;
587 s=g_utf8_prev_char(s))
/* A single trailing hyphen, i.e. one not preceded by another hyphen. */
589 if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
590 g_utf8_get_char(g_utf8_prev_char(s))!='-')
593 if (llen>LONGEST_PG_LINE)
595 if (llen>WAY_TOO_LONG)
596 results.verylongline++;
597 if (strchr(lines[j],'<') && strchr(lines[j],'>'))
/* i is the byte distance from the first "<" to the first ">", inclusive. */
599 i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
602 if (strstr(lines[j],"<i>"))
603 results.htmcount+=4; /* bonus marks! */
605 /* Check for spaced em-dashes */
/* Both dash spellings feed one scratch struct, so each emdash field rises by at most one per line. */
606 memset(&tmp_dash_results,0,sizeof(tmp_dash_results));
607 count_dashes(lines[j],"--",&tmp_dash_results);
608 count_dashes(lines[j],"—",&tmp_dash_results);
609 if (tmp_dash_results.base)
610 results.emdash.base++;
611 if (tmp_dash_results.non_PG_space)
612 results.emdash.non_PG_space++;
613 if (tmp_dash_results.PG_space)
614 results.emdash.PG_space++;
/* Language and digit heuristics on the current word, inword. */
618 if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))
619 results.Dutchcount++;
620 if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))
621 results.Frenchcount++;
622 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
623 results.standalone_digit++;
626 /* Check for spaced dashes */
/* Only the first " -" on the line is examined; " --" is excluded. */
627 if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
631 laststart=lines[j][0];
640 * Make some snap decisions based on the first pass results.
/*
 * report_first_pass:
 *
 * Turn the first-pass statistics into reporting decisions, printing a "-->"
 * note with g_print() for each category that is found to be too frequent.
 * The decisions are stored in a function-static struct warnings.
 *
 * results: statistics gathered by first_pass().  results->firstline is reset
 *          to 0 here when a header and a footer were both found, in that
 *          order, fewer than 100 lines apart.
 *
 * Thresholds visible in this listing:
 *   dotcomma > 5; shortline and longline > 50 or more than a tenth of
 *   linecnt; astline > 10; fslashline > 10; endquote_count > 20;
 *   standalone_digit > 10; hyphens > 20; htmcount > 20 (forces MARKUP_SWITCH
 *   on unless it is already set); Dutchcount and Frenchcount > 50.
 *   Hi-bit characters above a quarter of totlen, or alphabetic characters
 *   below a quarter of totlen, produce a "Terminating" message.
 *
 * Visible assignments to the result: warnings.shortline (1 by default, 0 when
 * too many lines are short, back to 1 in verbose mode), warnings.isDutch and
 * warnings.isFrench.  The assignments for the other categories are presumably
 * on lines that are not shown.
 *
 * NOTE(review): the comment in front of the standalone-digit test speaks of
 * "more than 15 lines" but the test itself is standalone_digit>10; one of the
 * two should be brought into line with the other.
 * NOTE(review): the numbers embedded in this listing skip many lines (643,
 * 645, 647-648, 651-652, 654-655, ...), so braces, else branches, the
 * statements that act on the "Terminating" messages, most warnings.*
 * assignments and the return statement are not visible here.
 */
642 struct warnings *report_first_pass(struct first_pass_results *results)
644 static struct warnings warnings={0};
646 if (results->unix_lineends)
649 g_print(" --> No lines in this file have a CR. Not reporting them. "
650 "Project Gutenberg requires that all lineends be CR-LF.\n");
653 g_print(" --> %ld lines in this file have white space at end\n",
656 if (results->dotcomma>5)
659 g_print(" --> %ld lines in this file contain '.,'. "
660 "Not reporting them.\n",results->dotcomma);
663 * If more than 50 lines, or one-tenth, are short,
664 * don't bother reporting them.
666 warnings.shortline=1;
667 if (results->shortline>50 || results->shortline*10>linecnt)
669 warnings.shortline=0;
670 g_print(" --> %ld lines in this file are short. "
671 "Not reporting short lines.\n",results->shortline);
674 * If more than 50 lines, or one-tenth, are long,
675 * don't bother reporting them.
678 if (results->longline>50 || results->longline*10>linecnt)
681 g_print(" --> %ld lines in this file are long. "
682 "Not reporting long lines.\n",results->longline);
684 /* If more than 10 lines contain asterisks, don't bother reporting them. */
686 if (results->astline>10)
689 g_print(" --> %ld lines in this file contain asterisks. "
690 "Not reporting them.\n",results->astline);
693 * If more than 10 lines contain forward slashes,
694 * don't bother reporting them.
697 if (results->fslashline>10)
700 g_print(" --> %ld lines in this file contain forward slashes. "
701 "Not reporting them.\n",results->fslashline);
704 * If more than 20 lines contain unpunctuated endquotes,
705 * don't bother reporting them.
708 if (results->endquote_count>20)
711 g_print(" --> %ld lines in this file contain unpunctuated endquotes. "
712 "Not reporting them.\n",results->endquote_count);
715 * If more than 15 lines contain standalone digits,
716 * don't bother reporting them.
719 if (results->standalone_digit>10)
722 g_print(" --> %ld lines in this file contain standalone 0s and 1s. "
723 "Not reporting them.\n",results->standalone_digit);
726 * If more than 20 lines contain hyphens at end,
727 * don't bother reporting them.
730 if (results->hyphens>20)
733 g_print(" --> %ld lines in this file have hyphens at end. "
734 "Not reporting them.\n",results->hyphens);
736 if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
738 g_print(" --> Looks like this is HTML. Switching HTML mode ON.\n");
739 pswit[MARKUP_SWITCH]=1;
741 if (results->verylongline>0)
742 g_print(" --> %ld lines in this file are VERY long!\n",
743 results->verylongline);
745 * If there are more non-PG spaced dashes than PG em-dashes,
746 * assume it's deliberate.
747 * Current PG guidelines say don't use them, but older texts do,
748 * and some people insist on them whatever the guidelines say.
751 if (results->spacedash+results->emdash.non_PG_space>
752 results->emdash.PG_space)
755 g_print(" --> There are %ld spaced dashes and em-dashes. "
756 "Not reporting them.\n",
757 results->spacedash+results->emdash.non_PG_space);
759 /* If more than a quarter of characters are hi-bit, bug out. */
761 if (results->binlen*4>results->totlen)
763 g_print(" --> This file does not appear to be ASCII. "
764 "Terminating. Best of luck with it!\n");
767 if (results->alphalen*4<results->totlen)
769 g_print(" --> This file does not appear to be text. "
770 "Terminating. Best of luck with it!\n");
773 if (results->binlen*100>results->totlen || results->binlen>100)
775 g_print(" --> There are a lot of foreign letters here. "
776 "Not reporting them.\n");
779 warnings.isDutch=FALSE;
780 if (results->Dutchcount>50)
782 warnings.isDutch=TRUE;
783 g_print(" --> This looks like Dutch - "
784 "switching off dashes and warnings for 's Middags case.\n");
786 warnings.isFrench=FALSE;
787 if (results->Frenchcount>50)
789 warnings.isFrench=TRUE;
790 g_print(" --> This looks like French - "
791 "switching off some doublepunct.\n");
793 if (results->firstline && results->footerline)
794 g_print(" The PG header and footer appear to be already on.\n");
797 if (results->firstline)
798 g_print(" The PG header is on - no footer.\n");
799 if (results->footerline)
800 g_print(" The PG footer is on - no header.\n");
803 if (pswit[VERBOSE_SWITCH])
806 warnings.shortline=1;
815 g_print(" *** Verbose output is ON -- you asked for it! ***\n");
817 if (warnings.isDutch)
819 if (results->footerline>0 && results->firstline>0 &&
820 results->footerline>results->firstline &&
821 results->footerline-results->firstline<100)
823 g_print(" --> I don't really know where this text starts. \n");
824 g_print(" There are no reference points.\n");
825 g_print(" I'm going to have to report the header and footer "
827 results->firstline=0;
835 * Look along the line, accumulate the count of quotes, and see
836 * if this is an empty line - i.e. a line with nothing on it
838 * If line has just spaces, period, * and/or - on it, don't
839 * count it, since empty lines with asterisks or dashes to
840 * separate sections are common.
842 * Returns: TRUE if the line is empty.
/*
 * analyse_quotes:
 *
 * Walk aline one UTF-8 character at a time, feed quotes and brackets into
 * counters, and work out whether the line counts as empty.
 *
 * aline:    UTF-8 text of the line.
 * counters: quote, bracket and underscore tallies to update.
 *
 * Double quotes always go to count_quote() with their QUOTE_CLASS.  Single
 * quotes are examined only when SQUOTE_SWITCH is set: one that sits between
 * two letters is treated as an apostrophe and ignored, "'tis"/"'Tis" is a
 * hard-coded exception, and the remaining cases are classified as opening,
 * closing or neutral from the neighbouring characters.  An error raised by
 * count_quote() is printed with its line and column and then cleared.
 * Underscores increment counters->c_unders.  Brackets go through
 * increment_matching(); a "[" at the very start of a line beginning
 * "[Illustration:" and a "]" that ends the line may be counted under
 * COUNTER_ILLUSTRATION instead, depending on the current counter state.
 *
 * Returns: according to the comment preceding this function, TRUE if the
 * line is empty; the return statement itself is on a line not shown.
 *
 * NOTE(review): the numbers embedded in this listing skip many lines (845-846,
 * 850-851, 853-854, 860-863, 866, 870, ...), so declarations, braces, the
 * loop header, the start-of-line test, the guessquote arithmetic and the
 * update of sprev are not visible here.
 */
844 gboolean analyse_quotes(const char *aline,struct counters *counters)
847 /* assume the line is empty until proven otherwise */
848 gboolean isemptyline=TRUE;
849 const char *s=aline,*sprev,*snext;
852 GError *tmp_err=NULL;
855 snext=g_utf8_next_char(s);
856 c=g_utf8_get_char(s);
857 if (CHAR_IS_DQUOTE(c))
858 (void)count_quote(counters,c,QUOTE_CLASS(c),&tmp_err);
859 else if (CHAR_IS_SQUOTE(c) && pswit[SQUOTE_SWITCH])
864 * At start of line, it can only be a quotation mark.
865 * Hardcode a very common exception!
867 if (!g_str_has_prefix(snext,"tis") &&
868 !g_str_has_prefix(snext,"Tis"))
869 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
871 else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
872 g_unichar_isalpha(g_utf8_get_char(snext)))
873 /* Do nothing! it's definitely an apostrophe, not a quote */
875 /* it's outside a word - let's check it out */
876 else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||
877 g_unichar_isalpha(g_utf8_get_char(snext)))
879 /* certainly looks like a quotation mark */
880 if (!g_str_has_prefix(snext,"tis") &&
881 !g_str_has_prefix(snext,"Tis"))
882 /* hardcode a very common exception! */
/*
 * Compare whole code points, as check_for_starting_punctuation() does.
 * strchr() converts its argument to char, so a non-ASCII character whose low
 * byte equals one of these marks (for example U+203A, low byte 0x3A) could be
 * mistaken for that mark.
 */
884 if (g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(sprev)))
885 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
887 (void)count_quote(counters,c,OPENING_QUOTE,&tmp_err);
892 /* now - is it a quotation mark? */
893 guessquote=0; /* accumulate clues */
894 if (g_unichar_isalpha(g_utf8_get_char(sprev)))
896 /* it follows a letter - could be either */
898 if (g_utf8_get_char(sprev)=='s')
900 /* looks like a plural apostrophe */
902 if (g_utf8_get_char(snext)==CHAR_SPACE)
906 if (innermost_quote_matches(counters,c))
908 * Give it the benefit of some doubt,
909 * if a squote is already open.
915 (void)count_quote(counters,c,CLOSING_QUOTE,&tmp_err);
918 /* no adjacent letter - it must be a quote of some kind */
919 (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
/* Report whatever count_quote() stored in tmp_err, then reset it. */
924 if (pswit[ECHO_SWITCH])
925 g_print("\n%s\n",aline);
926 if (!pswit[OVERVIEW_SWITCH])
927 g_print(" Line %ld column %ld - %s\n",
928 linecnt,g_utf8_pointer_to_offset(aline,s)+1,tmp_err->message);
929 g_clear_error(&tmp_err);
931 if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
933 isemptyline=FALSE; /* ignore lines like * * * as spacers */
934 if (c==CHAR_UNDERSCORE)
935 counters->c_unders++;
936 if (c==CHAR_OPEN_SBRACK)
938 if (!matching_difference(counters,COUNTER_ILLUSTRATION) &&
939 !matching_difference(counters,c) && s==aline &&
940 g_str_has_prefix(s,"[Illustration:"))
941 increment_matching(counters,COUNTER_ILLUSTRATION,TRUE);
943 increment_matching(counters,c,TRUE);
945 else if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK)
946 increment_matching(counters,c,TRUE);
947 if (c==CHAR_CLOSE_SBRACK)
949 if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) &&
950 !matching_difference(counters,c) && !*snext)
951 increment_matching(counters,COUNTER_ILLUSTRATION,FALSE);
953 increment_matching(counters,c,FALSE);
955 else if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK)
956 increment_matching(counters,c,FALSE);
964 * check_for_control_characters:
966 * Check for invalid or questionable characters in the line
967 * Anything above 127 is invalid for plain ASCII, and
968 * non-printable control characters should also be flagged.
969 * Tabs should generally not be there.
/*
 * check_for_control_characters:
 *
 * Scan aline one UTF-8 character at a time and query any code point below
 * CHAR_SPACE other than LF, CR and TAB.  The line is echoed when ECHO_SWITCH
 * is set, and the query itself (line, 1-based column, code point) is printed
 * unless OVERVIEW_SWITCH is set.
 *
 * aline: UTF-8 text of the line.
 *
 * NOTE(review): the numbers embedded in this listing skip lines (972-974,
 * 976, 979, 985-...), so declarations, braces and whatever follows the query
 * are not visible here.
 */
971 void check_for_control_characters(const char *aline)
975 for (s=aline;*s;s=g_utf8_next_char(s))
977 c=g_utf8_get_char(s);
978 if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
980 if (pswit[ECHO_SWITCH])
981 g_print("\n%s\n",aline);
982 if (!pswit[OVERVIEW_SWITCH])
983 g_print(" Line %ld column %ld - Control character %u\n",
/*
 * g_utf8_pointer_to_offset() takes the start of the string first and the
 * position second.  With the two swapped the offset comes back negative, so
 * the reported column was wrong for every character after the first.
 */
984 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
992 * check_for_odd_characters:
994 * Check for binary and other odd characters.
/*
 * check_for_odd_characters:
 *
 * Scan aline one UTF-8 character at a time and query characters that look
 * out of place: control or non-ASCII characters, tabs, tildes, carets,
 * forward slashes (only when warnings->fslash is set) and asterisks (only in
 * paranoid mode, when warnings->ast is set and the line is not empty).
 * Each query echoes the line when ECHO_SWITCH is set and prints line and
 * 1-based column unless OVERVIEW_SWITCH is set.  For the first category the
 * message reads "Non-ISO-8859" when the code point is in 128..159 or above
 * 255, and "Non-ASCII" otherwise.
 *
 * aline:       UTF-8 text of the line.
 * warnings:    reporting decisions; fslash and ast are read here.
 * isemptyline: TRUE when the caller considers the line empty; suppresses the
 *              asterisk query.
 *
 * The six e* flags exist so that each category is queried at most once per
 * line; the statements that set them are presumably on lines not shown.
 *
 * NOTE(review): the numbers embedded in this listing skip lines (998,
 * 1002-1003, 1005, 1008, 1016, 1020-1023, ...), so declarations, braces,
 * else branches, counter updates, the flag assignments and the end of the
 * asterisk condition are not visible here.
 */
996 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
997 gboolean isemptyline)
999 /* Don't repeat multiple warnings on one line. */
1000 gboolean eNon_A=FALSE,eTab=FALSE,eTilde=FALSE;
1001 gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
1004 for (s=aline;*s;s=g_utf8_next_char(s))
1006 c=g_utf8_get_char(s);
1007 if (!eNon_A && (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
1009 if (pswit[ECHO_SWITCH])
1010 g_print("\n%s\n",aline);
1011 if (!pswit[OVERVIEW_SWITCH])
1012 if (c>127 && c<160 || c>255)
1013 g_print(" Line %ld column %ld - "
1014 "Non-ISO-8859 character %u\n",
1015 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1017 g_print(" Line %ld column %ld - "
1018 "Non-ASCII character %u\n",
1019 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1024 if (!eTab && c==CHAR_TAB)
1026 if (pswit[ECHO_SWITCH])
1027 g_print("\n%s\n",aline);
1028 if (!pswit[OVERVIEW_SWITCH])
1029 g_print(" Line %ld column %ld - Tab character?\n",
1030 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1035 if (!eTilde && c==CHAR_TILDE)
1038 * Often used by OCR software to indicate an
1039 * unrecognizable character.
1041 if (pswit[ECHO_SWITCH])
1042 g_print("\n%s\n",aline);
1043 if (!pswit[OVERVIEW_SWITCH])
1044 g_print(" Line %ld column %ld - Tilde character?\n",
1045 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1050 if (!eCarat && c==CHAR_CARAT)
1052 if (pswit[ECHO_SWITCH])
1053 g_print("\n%s\n",aline);
1054 if (!pswit[OVERVIEW_SWITCH])
1055 g_print(" Line %ld column %ld - Carat character?\n",
1056 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1061 if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
1063 if (pswit[ECHO_SWITCH])
1064 g_print("\n%s\n",aline);
1065 if (!pswit[OVERVIEW_SWITCH])
1066 g_print(" Line %ld column %ld - Forward slash?\n",
1067 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1073 * Report asterisks only in paranoid mode,
1074 * since they're often deliberate.
1076 if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
1079 if (pswit[ECHO_SWITCH])
1080 g_print("\n%s\n",aline);
1081 if (!pswit[OVERVIEW_SWITCH])
1082 g_print(" Line %ld column %ld - Asterisk?\n",
1083 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1092 * check_for_long_line:
1094 * Check for line too long.
/*
 * check_for_long_line:
 *
 * Query aline when it holds more than LONGEST_PG_LINE characters (characters,
 * not bytes).  The line is echoed when ECHO_SWITCH is set and the query is
 * printed unless OVERVIEW_SWITCH is set.  The character count is printed
 * twice: once as the column and once as the reported length.
 *
 * aline: UTF-8 text of the line.
 *
 * NOTE(review): the embedded listing numbers skip 1097, 1099 and 1105-...,
 * so braces and whatever follows the query are not visible here.
 */
1096 void check_for_long_line(const char *aline)
1098 if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
1100 if (pswit[ECHO_SWITCH])
1101 g_print("\n%s\n",aline);
1102 if (!pswit[OVERVIEW_SWITCH])
1103 g_print(" Line %ld column %ld - Long line %ld\n",
1104 linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
1111 * check_for_short_line:
1113 * Check for line too short.
1115 * This one is a bit trickier to implement: we don't want to
1116 * flag the last line of a paragraph for being short, so we
1117 * have to wait until we know that our current line is a
1118 * "normal" line, then report the _previous_ line if it was too
1119 * short. We also don't want to report indented lines like
1120 * chapter heads or formatted quotations. We therefore keep
1121 * last->len as the length of the last line examined, and
1122 * last->blen as the length of the last but one, and try to
1123 * suppress unnecessary warnings by checking that both were of
1124 * "normal" length. We keep the first character of the last
1125 * line in last->start, and if it was a space, we assume that
1126 * the formatting is deliberate. I can't figure out a way to
1127 * distinguish something like a quoted verse left-aligned or
1128 * the header or footer of a letter from a paragraph of short
1129 * lines - maybe if I examined the whole paragraph, and if the
1130 * para has less than, say, 8 lines and if all lines are short,
1131 * then just assume it's OK? Need to look at some texts to see
1132 * how often a formula like this would get the right result.
/*
 * check_for_short_line:
 *
 * Query the PREVIOUS line as too short.  The query fires when the current
 * line has more than one character, the previous line (last->len) has more
 * than one character but fewer than SHORTEST_PG_LINE, the line before that
 * (last->blen) is longer than SHORTEST_PG_LINE, and the previous line did not
 * start with a space (last->start).  What is echoed and measured is the
 * global prevline, and the line number printed is linecnt-1.
 *
 * aline: UTF-8 text of the current line; only its length is used.
 * last:  properties of the preceding lines (len, blen, start).
 *
 * NOTE(review): the embedded listing numbers skip 1135, 1139 and 1145-...,
 * so braces and whatever follows the query are not visible here.
 */
1134 void check_for_short_line(const char *aline,const struct line_properties *last)
1136 if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
1137 last->len<SHORTEST_PG_LINE && last->blen>1 &&
1138 last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
1140 if (pswit[ECHO_SWITCH])
1141 g_print("\n%s\n",prevline);
1142 if (!pswit[OVERVIEW_SWITCH])
1143 g_print(" Line %ld column %ld - Short line %ld?\n",
1144 linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
1151 * check_for_starting_punctuation:
1153 * Look for punctuation other than full ellipses at start of line.
/*
 * check_for_starting_punctuation:
 *
 * Query a non-empty line whose first character is one of . ? ! , ; : unless
 * the line begins with the spaced ellipsis ". . .".  The line is echoed when
 * ECHO_SWITCH is set and the query, always at column 1, is printed unless
 * OVERVIEW_SWITCH is set.
 *
 * aline: UTF-8 text of the line.
 *
 * NOTE(review): the embedded listing numbers skip 1156, 1159 and 1164-...,
 * so braces, the argument list of the final g_print() and whatever follows
 * it are not visible here.
 */
1155 void check_for_starting_punctuation(const char *aline)
1157 if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
1158 !g_str_has_prefix(aline,". . ."))
1160 if (pswit[ECHO_SWITCH])
1161 g_print("\n%s\n",aline);
1162 if (!pswit[OVERVIEW_SWITCH])
1163 g_print(" Line %ld column 1 - Begins with punctuation?\n",
1173 * Find the first em-dash, return a pointer to it and set <next> to the
1174 * character following the dash.
/*
 * Takes the string s to search and an out-parameter next.  The visible
 * assignments set *next either one character past s2 or two characters
 * past s1.
 * NOTE(review): the searches that produce s1 and s2, the choice between
 * them and the return statements are not visible in this excerpt.
 * Presumably s1 marks a two-character dash and s2 a single-character
 * em-dash - confirm against the full file.
 */
1176 char *str_emdash(const char *s,const char **next)
1184 *next=g_utf8_next_char(s2);
1189 *next=g_utf8_next_char(g_utf8_next_char(s1));
1194 *next=g_utf8_next_char(g_utf8_next_char(s1));
1199 *next=g_utf8_next_char(s2);
1205 * check_for_spaced_emdash:
1207 * Check for spaced em-dashes.
1209 * We must check _all_ occurrences of em-dashes on the line
1210 * hence the loop - even if the first dash is OK
1211 * there may be another that's wrong later on.
/*
 * Walks every em-dash on the line using str_emdash(), resuming each
 * search at the position str_emdash() stored in next.  A dash is queried
 * when the character before it, or the character at next, is CHAR_SPACE.
 * The reported column is the 1-based character offset of the dash.
 */
1213 void check_for_spaced_emdash(const char *aline)
1215 const char *s,*t,*next;
1216 for (s=aline;t=str_emdash(s,&next);s=next)
/* The t>aline test keeps g_utf8_prev_char() from stepping before the line. */
1218 if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
1219 g_utf8_get_char(next)==CHAR_SPACE)
1221 if (pswit[ECHO_SWITCH])
1222 g_print("\n%s\n",aline);
1223 if (!pswit[OVERVIEW_SWITCH])
1224 g_print(" Line %ld column %ld - Spaced em-dash?\n",
1225 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1233 * check_for_spaced_dash:
1235 * Check for spaced dashes.
/*
 * Queries a hyphen with a space beside it.  Only the first occurrence of
 * each pattern is examined, because both searches start at aline.
 *  - space followed by hyphen: queried unless the character after the
 *    hyphen is another hyphen;
 *  - hyphen followed by space: tried only when the first pattern is
 *    absent from the line, and queried unless the hyphen is preceded by
 *    another hyphen.
 * The reported column is the 1-based character offset of the match start,
 * so for the first pattern it is the column of the space.
 */
1237 void check_for_spaced_dash(const char *aline)
1240 if ((s=strstr(aline," -")))
/* Look two characters past the space, i.e. at what follows the hyphen. */
1242 if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
1244 if (pswit[ECHO_SWITCH])
1245 g_print("\n%s\n",aline);
1246 if (!pswit[OVERVIEW_SWITCH])
1247 g_print(" Line %ld column %ld - Spaced dash?\n",
1248 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1253 else if ((s=strstr(aline,"- ")))
/* s==aline is tested first so g_utf8_prev_char() is never used at the line start. */
1255 if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
1257 if (pswit[ECHO_SWITCH])
1258 g_print("\n%s\n",aline);
1259 if (!pswit[OVERVIEW_SWITCH])
1260 g_print(" Line %ld column %ld - Spaced dash?\n",
1261 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1269 * check_for_unmarked_paragraphs:
1271 * Check for unmarked paragraphs indicated by separate speakers.
1273 * May well be false positive:
1274 * "Bravo!" "Wonderful!" called the crowd.
1275 * but useful all the same.
/*
 * Searches aline for a closing double quote, whitespace and an opening
 * double quote, and queries a possible missing paragraph break at the
 * 1-based character column of the match.
 * NOTE(review): the two strstr() patterns look identical in this excerpt
 * and the test between them is not visible.  Presumably the second
 * pattern differs only in whitespace and is tried when the first search
 * fails - confirm against the full file.
 */
1277 void check_for_unmarked_paragraphs(const char *aline)
1280 s=strstr(aline,"\" \"");
1282 s=strstr(aline,"\" \"");
1285 if (pswit[ECHO_SWITCH])
1286 g_print("\n%s\n",aline);
1287 if (!pswit[OVERVIEW_SWITCH])
1288 g_print(" Line %ld column %ld - "
1289 "Query missing paragraph break?\n",
1290 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1297 * check_for_jeebies:
1299 * Check for "to he" and other easy h/b errors.
1301 * This is a very inadequate effort on the h/b problem,
1302 * but the phrase "to he" is always an error, whereas "to
1303 * be" is quite common.
1304 * Similarly, '"Quiet!", be said.' is a non-be error
1305 * "to he" is _not_ always an error!:
1306 * "Where they went to he couldn't say."
1307 * Another false positive:
1308 * What would "Cinderella" be without the . . .
1309 * and another: "If he wants to he can see for himself."
/*
 * Searches aline for three families of fixed phrases with strstr(), which
 * is case-sensitive, and prints a query per family at the 1-based
 * character column where the matched phrase starts (its leading space,
 * quote or punctuation mark):
 *  - he/be confusions, for example " be could ", " was be ", " to he ";
 *  - had/bad confusions, for example " the had ", " she bad ";
 *  - hut/but confusions after a semicolon or comma.
 * NOTE(review): the tests between successive strstr() calls are not
 * visible in this excerpt; presumably each later search runs only when
 * the earlier ones found nothing - confirm.  The two consecutive
 * searches for a quote followed by " be " look identical here and may
 * differ only in whitespace in the full file - confirm.
 */
1311 void check_for_jeebies(const char *aline)
/* Family 1: he/be. */
1314 s=strstr(aline," be could ");
1316 s=strstr(aline," be would ");
1318 s=strstr(aline," was be ");
1320 s=strstr(aline," be is ");
1322 s=strstr(aline," is be ");
1324 s=strstr(aline,"\", be ");
1326 s=strstr(aline,"\" be ");
1328 s=strstr(aline,"\" be ");
1330 s=strstr(aline," to he ");
1333 if (pswit[ECHO_SWITCH])
1334 g_print("\n%s\n",aline);
1335 if (!pswit[OVERVIEW_SWITCH])
1336 g_print(" Line %ld column %ld - Query he/be error?\n",
1337 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Family 2: had/bad. */
1341 s=strstr(aline," the had ");
1343 s=strstr(aline," a had ");
1345 s=strstr(aline," they bad ");
1347 s=strstr(aline," she bad ");
1349 s=strstr(aline," he bad ");
1351 s=strstr(aline," you bad ");
1353 s=strstr(aline," i bad ");
1356 if (pswit[ECHO_SWITCH])
1357 g_print("\n%s\n",aline);
1358 if (!pswit[OVERVIEW_SWITCH])
1359 g_print(" Line %ld column %ld - Query had/bad error?\n",
1360 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
/* Family 3: hut/but. */
1364 s=strstr(aline,"; hut ");
1366 s=strstr(aline,", hut ");
1369 if (pswit[ECHO_SWITCH])
1370 g_print("\n%s\n",aline);
1371 if (!pswit[OVERVIEW_SWITCH])
1372 g_print(" Line %ld column %ld - Query hut/but error?\n",
1373 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1380 * check_for_mta_from:
1382 * Special case - angled bracket in front of "From" placed there by an
1383 * MTA when sending an e-mail.
/*
 * Searches aline for the first occurrence of the text >From anywhere on
 * the line and queries it at the 1-based character column of the angled
 * bracket.
 * NOTE(review): the test on the search result is not visible in this
 * excerpt; presumably the report is made only when a match exists.
 */
1385 void check_for_mta_from(const char *aline)
1388 s=strstr(aline,">From");
1391 if (pswit[ECHO_SWITCH])
1392 g_print("\n%s\n",aline);
1393 if (!pswit[OVERVIEW_SWITCH])
1394 g_print(" Line %ld column %ld - "
1395 "Query angled bracket with From\n",
1396 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1403 * check_for_orphan_character:
1405 * Check for a single character line -
1406 * often an overflow from bad wrapping.
/*
 * Handles a line that consists of exactly one character.  The capitals
 * I, V, X and L and any Unicode digit are deliberately ignored as
 * numerals; the report text names column 1.
 * NOTE(review): the else that presumably separates the ignore branch
 * from the report is not visible in this excerpt.
 */
1408 void check_for_orphan_character(const char *aline)
1411 c=g_utf8_get_char(aline);
/* Non-empty line whose second character is the terminator. */
1412 if (c && !*g_utf8_next_char(aline))
1414 if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
1415 ; /* Nothing - ignore numerals alone on a line. */
1418 if (pswit[ECHO_SWITCH])
1419 g_print("\n%s\n",aline);
1420 if (!pswit[OVERVIEW_SWITCH])
1421 g_print(" Line %ld column 1 - Query single character line\n",
1430 * check_for_pling_scanno:
1432 * Check for I" - often should be !
/*
 * Searches aline for a space, a capital I and a double quote in sequence
 * and queries whether the I should be an exclamation mark.
 * NOTE(review): the column is the raw character offset of the match with
 * no +1, unlike the neighbouring checks.  Since s points at the leading
 * space, a match at the start of the line would be reported as column 0.
 * Confirm whether this is intended.
 */
1434 void check_for_pling_scanno(const char *aline)
1437 s=strstr(aline," I\"");
1440 if (pswit[ECHO_SWITCH])
1441 g_print("\n%s\n",aline);
1442 if (!pswit[OVERVIEW_SWITCH])
1443 g_print(" Line %ld column %ld - Query I=exclamation mark?\n",
1444 linecnt,g_utf8_pointer_to_offset(aline,s));
1451 * check_for_extra_period:
1453 * Check for period without a capital letter. Cut-down from gutspell.
1454 * Only works when it happens on a single line.
/*
 * Runs only when pswit[PARANOID_SWITCH] is set.  Loops over every period
 * followed by a space on the line and queries an extra period when:
 *  - the character before the period is alphabetic;
 *  - the first letter or digit found after the period is lower case;
 *  - the word before the period (copied into testword) is not excused.
 * The visible tests on testword are: membership of the abbrev list, a
 * leading digit, a one-character word, isroman(), and a scan of the
 * canonical decomposition of each character against "aeiou".  A word is
 * recorded in the qperiod tree when it is reported; later hits on the
 * same word are reported only when pswit[VERBOSE_SWITCH] is set.  The
 * column is the 1-based character offset of the period.
 * NOTE(review): the flag assignments made by each test and the first
 * half of the final condition are not visible in this excerpt.
 * NOTE(review): the Dutch branch reads the characters 2 to 5 positions
 * after t with no visible check that the line is that long - confirm
 * this cannot read past the terminator.
 */
1456 void check_for_extra_period(const char *aline,const struct warnings *warnings)
1458 const char *s,*t,*s1,*sprev;
1463 gunichar c,nc,pc,*decomposition;
1464 if (pswit[PARANOID_SWITCH])
1466 for (t=aline;t=strstr(t,". ");)
1470 t=g_utf8_next_char(t);
1471 /* start of line punctuation is handled elsewhere */
1474 if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
1476 t=g_utf8_next_char(t);
1479 if (warnings->isDutch)
1481 /* For Frank & Jeroen -- 's Middags case */
1482 gunichar c2,c3,c4,c5;
1483 c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
1484 c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
1485 c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
1486 c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
1487 if (CHAR_IS_APOSTROPHE(c2) &&
1488 g_unichar_islower(c3) && c4==CHAR_SPACE &&
1489 g_unichar_isupper(c5))
1491 t=g_utf8_next_char(t);
1495 s1=g_utf8_next_char(g_utf8_next_char(t));
1496 while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
1497 !g_unichar_isdigit(g_utf8_get_char(s1)))
1498 s1=g_utf8_next_char(s1);
1499 if (g_unichar_islower(g_utf8_get_char(s1)))
1501 /* we have something to investigate */
1503 /* so let's go back and find out */
1504 nc=g_utf8_get_char(t);
1505 s1=g_utf8_prev_char(t);
1506 c=g_utf8_get_char(s1);
1507 sprev=g_utf8_prev_char(s1);
1508 pc=g_utf8_get_char(sprev);
1510 (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||
1511 g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&
1512 g_unichar_isalpha(nc)))
1517 sprev=g_utf8_prev_char(s1);
1518 pc=g_utf8_get_char(sprev);
1520 s1=g_utf8_next_char(s1);
1523 testword=g_strndup(s1,s-s1);
1525 testword=g_strdup(s1);
1526 for (i=0;*abbrev[i];i++)
1527 if (!strcmp(testword,abbrev[i]))
1529 if (g_unichar_isdigit(g_utf8_get_char(testword)))
1531 if (!*g_utf8_next_char(testword))
1533 if (isroman(testword))
1538 for (s=testword;*s;s=g_utf8_next_char(s))
1540 decomposition=g_unicode_canonical_decomposition(
1541 g_utf8_get_char(s),&len);
1542 if (g_utf8_strchr("aeiou",-1,decomposition[0]))
1544 g_free(decomposition);
1548 (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
1550 g_tree_insert(qperiod,g_strdup(testword),
1551 GINT_TO_POINTER(1));
1552 if (pswit[ECHO_SWITCH])
1553 g_print("\n%s\n",aline);
1554 if (!pswit[OVERVIEW_SWITCH])
1555 g_print(" Line %ld column %ld - Extra period?\n",
1556 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1562 t=g_utf8_next_char(t);
1568 * check_for_following_punctuation:
1570 * Check for words usually not followed by punctuation.
/*
 * Runs only when pswit[TYPO_SWITCH] is set.  Each word is lower-cased
 * into inword with g_utf8_strdown() and compared against two lists:
 *  - a word found in nocomma is queried when the character at s is a
 *    comma, semicolon or colon;
 *  - a word found in noperiod is queried when the character at s is a
 *    period or exclamation mark.
 * The reported column is the 1-based character offset of s.
 * NOTE(review): the word-extraction loop that sets t, s and wordstart is
 * not visible in this excerpt; presumably s points just past the word.
 */
1572 void check_for_following_punctuation(const char *aline)
1575 const char *s,*wordstart;
1578 if (pswit[TYPO_SWITCH])
1589 inword=g_utf8_strdown(t,-1);
1591 for (i=0;*nocomma[i];i++)
1592 if (!strcmp(inword,nocomma[i]))
1594 c=g_utf8_get_char(s);
1595 if (c==',' || c==';' || c==':')
1597 if (pswit[ECHO_SWITCH])
1598 g_print("\n%s\n",aline);
1599 if (!pswit[OVERVIEW_SWITCH])
1600 g_print(" Line %ld column %ld - "
1601 "Query punctuation after %s?\n",
1602 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
1608 for (i=0;*noperiod[i];i++)
1609 if (!strcmp(inword,noperiod[i]))
1611 c=g_utf8_get_char(s);
1612 if (c=='.' || c=='!')
1614 if (pswit[ECHO_SWITCH])
1615 g_print("\n%s\n",aline);
1616 if (!pswit[OVERVIEW_SWITCH])
1617 g_print(" Line %ld column %ld - "
1618 "Query punctuation after %s?\n",
1619 linecnt,g_utf8_pointer_to_offset(aline,s)+1,
1633 * Check for commonly mistyped words,
1634 * and digits like 0 for O in a word.
/*
 * Splits aline into words with getaword() and applies these checks to
 * each word:
 *  1. mixdigit(inword): queries a digit inside a word.
 *  2. When pswit[TYPO_SWITCH] or pswit[USERTYPO_SWITCH] is set, looks
 *     for an upper-case or title-case letter after a lower-case one,
 *     with visible exceptions for an "mc" or "mac" prefix and for a
 *     preceding apostrophe.  The word is then case-folded into testword.
 *  3. When pswit[TYPO_SWITCH] is set: prefixes from nostart, suffixes
 *     from noend, a set of letter groups such as "cb", "gbt", "tbi" and
 *     "ii", and a vowel/consonant scan over canonical decompositions for
 *     words longer than one character.  The okword list and isroman()
 *     act as exclusions; the typo list and the single letters in
 *     "slmijdn" act as inclusions.
 *  4. The qword tree holds a per-word counter; isdup is set from
 *     !pswit[VERBOSE_SWITCH], and the report adds
 *     " - not reporting duplicates" when VERBOSE_SWITCH is off.
 *  5. A word found in the usertypo tree is queried as a possible scanno.
 *  6. When pswit[PARANOID_SWITCH] and warnings->digit are both set, a
 *     standalone "0" or "1" is queried.
 * NOTE(review): the reports in steps 1 and 4 use the offset of
 * wordstart plus 1 as the column, while steps 5 and 6 use plus 2 -
 * confirm which is intended.
 * NOTE(review): many flag assignments and branch bodies are not visible
 * in this excerpt, so the exact effect of each test on istypo cannot be
 * stated from here.
 */
1636 void check_for_typos(const char *aline,struct warnings *warnings)
1638 const char *s,*t,*nt,*wordstart;
1640 gunichar *decomposition;
1642 int i,vowel,consonant,*dupcnt;
1643 gboolean isdup,istypo,alower;
1646 gsize decomposition_len;
1650 inword=getaword(&s);
1654 continue; /* don't bother with empty lines */
1656 if (mixdigit(inword))
1658 if (pswit[ECHO_SWITCH])
1659 g_print("\n%s\n",aline);
1660 if (!pswit[OVERVIEW_SWITCH])
1661 g_print(" Line %ld column %ld - Query digit in %s\n",
1662 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
1667 * Put the word through a series of tests for likely typos and OCR
1670 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
1674 for (t=inword;*t;t=g_utf8_next_char(t))
1676 c=g_utf8_get_char(t);
1677 nt=g_utf8_next_char(t);
1678 /* lowercase for testing */
1679 if (g_unichar_islower(c))
1681 if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
1684 * We have an uppercase mid-word. However, there are
1686 * Mac and Mc like McGill
1687 * French contractions like l'Abbe
1689 offset=g_utf8_pointer_to_offset(inword,t);
1691 pc=g_utf8_get_char(g_utf8_prev_char(t));
1694 if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
1695 offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
1696 g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
1697 CHAR_IS_APOSTROPHE(pc))
1703 testword=g_utf8_casefold(inword,-1);
1705 if (pswit[TYPO_SWITCH])
1708 * Check for certain unlikely two-letter combinations at word
1711 len=g_utf8_strlen(testword,-1);
1714 for (i=0;*nostart[i];i++)
1715 if (g_str_has_prefix(testword,nostart[i]))
1717 for (i=0;*noend[i];i++)
1718 if (g_str_has_suffix(testword,noend[i]))
1721 /* ght is common, gbt never. Like that. */
1722 if (strstr(testword,"cb"))
1724 if (strstr(testword,"gbt"))
1726 if (strstr(testword,"pbt"))
1728 if (strstr(testword,"tbs"))
1730 if (strstr(testword,"mrn"))
1732 if (strstr(testword,"ahle"))
1734 if (strstr(testword,"ihle"))
1737 * "TBE" does happen - like HEARTBEAT - but uncommon.
1738 * Also "TBI" - frostbite, outbid - but uncommon.
1739 * Similarly "ii" like Hawaii, or Pompeii, and in Roman
1740 * numerals, but "ii" is a common scanno.
1742 if (strstr(testword,"tbi"))
1744 if (strstr(testword,"tbe"))
1746 if (strstr(testword,"ii"))
1749 * Check for no vowels or no consonants.
1750 * If none, flag a typo.
1752 if (!istypo && len>1)
1755 for (t=testword;*t;t=g_utf8_next_char(t))
1757 c=g_utf8_get_char(t);
1759 g_unicode_canonical_decomposition(c,&decomposition_len);
1760 if (c=='y' || g_unichar_isdigit(c))
1762 /* Yah, this is loose. */
1766 else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
1770 g_free(decomposition);
1772 if (!vowel || !consonant)
1776 * Now exclude the word from being reported if it's in
1779 for (i=0;*okword[i];i++)
1780 if (!strcmp(testword,okword[i]))
1783 * What looks like a typo may be a Roman numeral.
1786 if (istypo && isroman(testword))
1788 /* Check the manual list of typos. */
1790 for (i=0;*typo[i];i++)
1791 if (!strcmp(testword,typo[i]))
1794 * Check lowercase s, l, i and m - special cases.
1795 * "j" - often a semi-colon gone wrong.
1796 * "d" for a missing apostrophe - he d
1799 if (!istypo && len==1 &&
1800 g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
1804 dupcnt=g_tree_lookup(qword,testword);
1808 isdup=!pswit[VERBOSE_SWITCH];
1812 dupcnt=g_new0(int,1);
1813 g_tree_insert(qword,g_strdup(testword),dupcnt);
1818 if (pswit[ECHO_SWITCH])
1819 g_print("\n%s\n",aline);
1820 if (!pswit[OVERVIEW_SWITCH])
1822 g_print(" Line %ld column %ld - Query word %s",
1823 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
1825 if (!pswit[VERBOSE_SWITCH])
1826 g_print(" - not reporting duplicates");
1834 /* check the user's list of typos */
1835 if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
1837 if (pswit[ECHO_SWITCH])
1838 g_print("\n%s\n",aline);
1839 if (!pswit[OVERVIEW_SWITCH])
1840 g_print(" Line %ld column %ld - Query possible scanno %s\n",
1841 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
1843 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
1845 if (pswit[PARANOID_SWITCH] && warnings->digit)
1847 /* In paranoid mode, query all 0 and 1 standing alone. */
1848 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
1850 if (pswit[ECHO_SWITCH])
1851 g_print("\n%s\n",aline);
1852 if (!pswit[OVERVIEW_SWITCH])
1853 g_print(" Line %ld column %ld - Query standalone %s\n",
1854 linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
1865 * check_for_misspaced_punctuation:
1867 * Look for added or missing spaces around punctuation and quotes.
1868 * If there is a punctuation character like ! with no space on
1869 * either side, suspect a missing!space. If there are spaces on
1870 * both sides , assume a typo. If we see a double quote with no
1871 * space or punctuation on either side of it, assume unspaced
1872 * quotes "like"this.
/*
 * Runs several independent scans over aline.  Unless stated otherwise
 * each query is reported at the 1-based character offset of s:
 *  - a character from ".?!,;:_" with letters on both sides, or a
 *    character from "?!,;:" followed by a letter ("Missing space?");
 *  - the same characters with a space before and a space or the end of
 *    the line after ("Spaced punctuation?"), suppressed when isemptyline
 *    or isellipsis is set;
 *  - a character from "?!,;:" preceded by a space and not followed by
 *    one, skipped when isemptyline is set;
 *  - a period preceded by a space and followed by a letter;
 *  - a double quote with no space or listed punctuation on either side
 *    ("Unspaced quotes?");
 *  - double-quote parity, toggling parities->dquote, with
 *    "Wrongspaced quotes?" queries that depend on the following
 *    character;
 *  - a double quote as the first character of the line followed by one
 *    of ",;:!?)]} ", reported at column 1;
 *  - when pswit[SQUOTE_SWITCH] is set, a single-quote scan that toggles
 *    parities->squote and emits "Wrongspaced singlequotes?".
 * NOTE(review): the statements that advance c and pc inside each loop,
 * the acronym and ellipsis flag assignments and several branch
 * conditions are not visible in this excerpt - confirm details against
 * the full file.
 */
1874 void check_for_misspaced_punctuation(const char *aline,
1875 struct parities *parities,gboolean isemptyline)
1877 gboolean isacro,isellipsis;
1879 gunichar c,nc,pc,n2c;
1881 c=g_utf8_get_char(aline);
1882 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1883 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1887 nc=g_utf8_get_char(g_utf8_next_char(s));
1888 /* For each character in the line after the first. */
1889 if (g_utf8_strchr(".?!,;:_",-1,c)) /* if it's punctuation */
1891 /* we need to suppress warnings for acronyms like M.D. */
1893 /* we need to suppress warnings for ellipsis . . . */
1896 * If there are letters on both sides of it or
1897 * if it's strict punctuation followed by an alpha.
1899 if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
1900 g_utf8_strchr("?!,;:",-1,c)))
1904 if (g_utf8_pointer_to_offset(aline,s)>2 &&
1905 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
1907 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
1913 if (pswit[ECHO_SWITCH])
1914 g_print("\n%s\n",aline);
1915 if (!pswit[OVERVIEW_SWITCH])
1916 g_print(" Line %ld column %ld - Missing space?\n",
1917 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1922 if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
1925 * If there are spaces on both sides,
1926 * or space before and end of line.
1930 if (g_utf8_pointer_to_offset(aline,s)>2 &&
1931 g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
1933 n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
1937 if (!isemptyline && !isellipsis)
1939 if (pswit[ECHO_SWITCH])
1940 g_print("\n%s\n",aline);
1941 if (!pswit[OVERVIEW_SWITCH])
1942 g_print(" Line %ld column %ld - "
1943 "Spaced punctuation?\n",linecnt,
1944 g_utf8_pointer_to_offset(aline,s)+1);
1951 /* Split out the characters that CANNOT be preceded by space. */
1952 c=g_utf8_get_char(aline);
1953 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1954 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1958 nc=g_utf8_get_char(g_utf8_next_char(s));
1959 /* for each character in the line after the first */
1960 if (g_utf8_strchr("?!,;:",-1,c))
1962 /* if it's punctuation that _cannot_ have a space before it */
1963 if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
1966 * If nc DOES == space,
1967 * it was already reported just above.
1969 if (pswit[ECHO_SWITCH])
1970 g_print("\n%s\n",aline);
1971 if (!pswit[OVERVIEW_SWITCH])
1972 g_print(" Line %ld column %ld - Spaced punctuation?\n",
1973 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1980 * Special case " .X" where X is any alpha.
1981 * This plugs a hole in the acronym code above.
1982 * Inelegant, but maintainable.
1984 c=g_utf8_get_char(aline);
1985 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1986 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1990 nc=g_utf8_get_char(g_utf8_next_char(s));
1991 /* for each character in the line after the first */
1994 /* if it's a period */
1995 if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
1998 * If the period follows a space and
1999 * is followed by a letter.
2001 if (pswit[ECHO_SWITCH])
2002 g_print("\n%s\n",aline);
2003 if (!pswit[OVERVIEW_SWITCH])
2004 g_print(" Line %ld column %ld - Spaced punctuation?\n",
2005 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2011 c=g_utf8_get_char(aline);
2012 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2013 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2017 nc=g_utf8_get_char(g_utf8_next_char(s));
2018 /* for each character in the line after the first */
2019 if (CHAR_IS_DQUOTE(c))
2021 if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
2022 !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
2023 !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
2025 if (pswit[ECHO_SWITCH])
2026 g_print("\n%s\n",aline);
2027 if (!pswit[OVERVIEW_SWITCH])
2028 g_print(" Line %ld column %ld - Unspaced quotes?\n",
2029 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2035 /* Check parity of quotes. */
2036 nc=g_utf8_get_char(aline);
2037 for (s=aline;*s;s=g_utf8_next_char(s))
2040 nc=g_utf8_get_char(g_utf8_next_char(s));
2041 if (CHAR_IS_DQUOTE(c))
2045 parities->dquote=!parities->dquote;
2046 parity=parities->dquote;
2048 else if (c==CHAR_LD_QUOTE)
2055 if (!g_utf8_strchr("_-.'`‘’/,;:!?)]} ",-1,nc))
2057 if (pswit[ECHO_SWITCH])
2058 g_print("\n%s\n",aline);
2059 if (!pswit[OVERVIEW_SWITCH])
2060 g_print(" Line %ld column %ld - "
2061 "Wrongspaced quotes?\n",
2062 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2070 if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
2071 !g_utf8_strchr("_-/.'`‘’([{$",-1,nc) || !nc)
2073 if (pswit[ECHO_SWITCH])
2074 g_print("\n%s\n",aline);
2075 if (!pswit[OVERVIEW_SWITCH])
2076 g_print(" Line %ld column %ld - "
2077 "Wrongspaced quotes?\n",
2078 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2085 c=g_utf8_get_char(aline);
2086 if (CHAR_IS_DQUOTE(c))
2088 if (g_utf8_strchr(",;:!?)]} ",-1,
2089 g_utf8_get_char(g_utf8_next_char(aline))))
2091 if (pswit[ECHO_SWITCH])
2092 g_print("\n%s\n",aline);
2093 if (!pswit[OVERVIEW_SWITCH])
2094 g_print(" Line %ld column 1 - Wrongspaced quotes?\n",
2100 if (pswit[SQUOTE_SWITCH])
2102 nc=g_utf8_get_char(aline);
2103 for (s=aline;*s;s=g_utf8_next_char(s))
2106 nc=g_utf8_get_char(g_utf8_next_char(s));
2107 if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&
2108 !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
2109 !g_unichar_isalpha(nc)))
2111 parities->squote=!parities->squote;
2112 if (!parities->squote)
2115 if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
2117 if (pswit[ECHO_SWITCH])
2118 g_print("\n%s\n",aline);
2119 if (!pswit[OVERVIEW_SWITCH])
2120 g_print(" Line %ld column %ld - "
2121 "Wrongspaced singlequotes?\n",
2122 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2130 if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
2131 !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
2133 if (pswit[ECHO_SWITCH])
2134 g_print("\n%s\n",aline);
2135 if (!pswit[OVERVIEW_SWITCH])
2136 g_print(" Line %ld column %ld - "
2137 "Wrongspaced singlequotes?\n",
2138 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2149 * check_for_double_punctuation:
2151 * Look for double punctuation like ,. or ,,
2152 * Thanks to DW for the suggestion!
2153 * In books with references, ".," and ".;" are common
2154 * e.g. "etc., etc.," and vol. 1.; vol 3.;
2155 * OTOH, from my initial tests, there are also fairly
2156 * common errors. What to do? Make these cases paranoid?
2157 * ".," is the most common, so warnings->dotcomma is used
2158 * to suppress detailed reporting if it occurs often.
/*
 * Scans aline for a character from ".?!,;:" immediately followed by
 * another character from the same set.  The visible exemptions are:
 *  - the same character repeated, for period, question mark and
 *    exclamation mark;
 *  - a period followed by a comma while warnings->dotcomma is false;
 *  - when warnings->isFrench is set, a three-period ellipsis directly
 *    before or after one of , ; : ! ?
 * Anything else is queried as "Double punctuation?" at the 1-based
 * character offset of the first of the two characters.
 * NOTE(review): the statements inside the French branch, which
 * presumably step s past the ellipsis before nc is re-read, are not
 * visible in this excerpt - confirm.
 */
2160 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
2164 nc=g_utf8_get_char(aline);
2165 for (s=aline;*s;s=g_utf8_next_char(s))
2168 nc=g_utf8_get_char(g_utf8_next_char(s));
2169 /* for each punctuation character in the line */
2170 if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
2171 g_utf8_strchr(".?!,;:",-1,nc))
2173 /* followed by punctuation, it's a query, unless . . . */
2174 if (c==nc && (c=='.' || c=='?' || c=='!') ||
2175 !warnings->dotcomma && c=='.' && nc==',' ||
2176 warnings->isFrench && g_str_has_prefix(s,",...") ||
2177 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2178 warnings->isFrench && g_str_has_prefix(s,";...") ||
2179 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2180 warnings->isFrench && g_str_has_prefix(s,":...") ||
2181 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2182 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2183 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2184 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2185 warnings->isFrench && g_str_has_prefix(s,"...?"))
2187 if (warnings->isFrench && g_str_has_prefix(s,",...") ||
2188 warnings->isFrench && g_str_has_prefix(s,"...,") ||
2189 warnings->isFrench && g_str_has_prefix(s,";...") ||
2190 warnings->isFrench && g_str_has_prefix(s,"...;") ||
2191 warnings->isFrench && g_str_has_prefix(s,":...") ||
2192 warnings->isFrench && g_str_has_prefix(s,"...:") ||
2193 warnings->isFrench && g_str_has_prefix(s,"!...") ||
2194 warnings->isFrench && g_str_has_prefix(s,"...!") ||
2195 warnings->isFrench && g_str_has_prefix(s,"?...") ||
2196 warnings->isFrench && g_str_has_prefix(s,"...?"))
2199 nc=g_utf8_get_char(g_utf8_next_char(s));
2201 ; /* do nothing for .. !! and ?? which can be legit */
2205 if (pswit[ECHO_SWITCH])
2206 g_print("\n%s\n",aline);
2207 if (!pswit[OVERVIEW_SWITCH])
2208 g_print(" Line %ld column %ld - Double punctuation?\n",
2209 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2218 * check_for_spaced_quotes:
/*
 * Queries every quote mark that has a space on both sides.
 * First pass: each occurrence of space, double quote, space is reported
 * as "Spaced doublequote?".  Second pass: for each entry of the
 * single_quotes table a pattern of space, that quote, space is built in
 * a GString and each occurrence is reported as "Spaced singlequote?".
 * After a hit the search resumes two characters past the match start, so
 * the trailing space can begin the next match.  Columns are the 1-based
 * character offset of the leading space.  The GString is freed at the
 * end.
 * NOTE(review): the initial value of s for each pass is not visible in
 * this excerpt; presumably it is reset to aline.
 */
2220 void check_for_spaced_quotes(const char *aline)
2224 const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,
2228 while ((t=strstr(s," \" ")))
2230 if (pswit[ECHO_SWITCH])
2231 g_print("\n%s\n",aline);
2232 if (!pswit[OVERVIEW_SWITCH])
2233 g_print(" Line %ld column %ld - Spaced doublequote?\n",
2234 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2237 s=g_utf8_next_char(g_utf8_next_char(t));
2239 pattern=g_string_new(NULL);
2240 for(i=0;i<G_N_ELEMENTS(single_quotes);i++)
/* Rebuild the three-character search pattern for this quote variant. */
2242 g_string_assign(pattern," ");
2243 g_string_append_unichar(pattern,single_quotes[i]);
2244 g_string_append_c(pattern,' ');
2246 while ((t=strstr(s,pattern->str)))
2248 if (pswit[ECHO_SWITCH])
2249 g_print("\n%s\n",aline);
2250 if (!pswit[OVERVIEW_SWITCH])
2251 g_print(" Line %ld column %ld - Spaced singlequote?\n",
2252 linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2255 s=g_utf8_next_char(g_utf8_next_char(t));
2258 g_string_free(pattern,TRUE);
2262 * check_for_miscased_genative:
2264 * Check special case of 'S instead of 's at end of word.
/*
 * Scans aline for an apostrophe that follows a lower-case letter and is
 * followed by a capital S, and queries it as "Capital S?".
 * The column is the character offset of s plus 2.
 * NOTE(review): the statements that shift nc into c and c into pc are
 * not visible in this excerpt.  If s points at the apostrophe, plus 2 is
 * the 1-based column of the S - confirm.
 */
2266 void check_for_miscased_genative(const char *aline)
2272 c=g_utf8_get_char(aline);
2273 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2274 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2278 nc=g_utf8_get_char(g_utf8_next_char(s));
2279 if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))
2281 if (pswit[ECHO_SWITCH])
2282 g_print("\n%s\n",aline);
2283 if (!pswit[OVERVIEW_SWITCH])
2284 g_print(" Line %ld column %ld - Capital \"S\"?\n",
2285 linecnt,g_utf8_pointer_to_offset(aline,s)+2);
2293 * check_end_of_line:
2295 * Now check special cases - start and end of line -
2296 * for single and double quotes. Start is sometimes [sic]
2297 * but better to query it anyway.
2298 * While we're here, check for dash at end of line.
/*
 * Applies three checks to the ends of a line:
 *  - for lines longer than one character, a double or single quote as
 *    the last character with a space before it is queried at the
 *    column equal to the character length of the line;
 *  - a single quote as the first character followed by a space is
 *    queried at column 1;
 *  - when pswit[PARANOID_SWITCH] and warnings->hyphen are both set, the
 *    scan steps back over trailing characters that are not greater than
 *    CHAR_SPACE and queries a hyphen that is not preceded by another
 *    hyphen.
 * NOTE(review): the hyphen report prints the character offset of s with
 * no +1, unlike most sibling checks - confirm whether intended.
 * NOTE(review): the nesting of the second and third checks relative to
 * the length test is not visible in this excerpt.
 */
2300 void check_end_of_line(const char *aline,struct warnings *warnings)
2305 lbytes=strlen(aline);
2306 if (g_utf8_strlen(aline,lbytes)>1)
/* c1 is the last character of the line, c2 the one before it. */
2308 s=g_utf8_prev_char(aline+lbytes);
2309 c1=g_utf8_get_char(s);
2310 c2=g_utf8_get_char(g_utf8_prev_char(s));
2311 if ((CHAR_IS_DQUOTE(c1) || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)
2313 if (pswit[ECHO_SWITCH])
2314 g_print("\n%s\n",aline);
2315 if (!pswit[OVERVIEW_SWITCH])
2316 g_print(" Line %ld column %ld - Spaced quote?\n",linecnt,
2317 g_utf8_strlen(aline,lbytes));
/* From here c1 and c2 are the first and second characters of the line. */
2321 c1=g_utf8_get_char(aline);
2322 c2=g_utf8_get_char(g_utf8_next_char(aline));
2323 if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)
2325 if (pswit[ECHO_SWITCH])
2326 g_print("\n%s\n",aline);
2327 if (!pswit[OVERVIEW_SWITCH])
2328 g_print(" Line %ld column 1 - Spaced quote?\n",linecnt);
2333 * Dash at end of line may well be legit - paranoid mode only
2334 * and don't report em-dash at line-end.
2336 if (pswit[PARANOID_SWITCH] && warnings->hyphen)
2338 for (s=g_utf8_prev_char(aline+lbytes);
2339 s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
2341 if (g_utf8_get_char(s)=='-' &&
2342 g_utf8_get_char(g_utf8_prev_char(s))!='-')
2344 if (pswit[ECHO_SWITCH])
2345 g_print("\n%s\n",aline);
2346 if (!pswit[OVERVIEW_SWITCH])
2347 g_print(" Line %ld column %ld - "
2348 "Hyphen at end of line?\n",
2349 linecnt,g_utf8_pointer_to_offset(aline,s));
2356 * check_for_unspaced_bracket:
2358 * Brackets are often unspaced, but shouldn't be surrounded by alpha.
2359 * If so, suspect a scanno like "a]most".
/*
 * Scans aline for a bracket character from "{[()]}" that has a letter
 * directly before it and a letter directly after it, and queries it as
 * "Unspaced bracket?".  The loop starts at the second character and
 * stops when there is no following character, so the first and last
 * characters are never tested.
 * NOTE(review): the column is the raw character offset of s with no +1,
 * unlike most sibling checks - confirm whether intended.  The statements
 * that shift nc into c and c into pc are not visible in this excerpt.
 */
2361 void check_for_unspaced_bracket(const char *aline)
2365 c=g_utf8_get_char(aline);
2366 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2367 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2371 nc=g_utf8_get_char(g_utf8_next_char(s));
2374 /* for each bracket character in the line except 1st & last */
2375 if (g_utf8_strchr("{[()]}",-1,c) &&
2376 g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
2378 if (pswit[ECHO_SWITCH])
2379 g_print("\n%s\n",aline);
2380 if (!pswit[OVERVIEW_SWITCH])
2381 g_print(" Line %ld column %ld - Unspaced bracket?\n",
2382 linecnt,g_utf8_pointer_to_offset(aline,s));
2390 * check_for_unpunctuated_endquote:
/*
 * Scans aline from its second character for a double quote whose
 * QUOTE_CLASS() is CLOSING_QUOTE or NEUTRAL_QUOTE and whose preceding
 * character is a letter, and queries it as
 * "endquote missing punctuation?".  Characters that are not double
 * quotes are given the class INVALID_QUOTE and never match.
 * NOTE(review): the column is the raw character offset of s with no +1,
 * unlike most sibling checks - confirm whether intended.  The statements
 * that shift nc into c and c into pc are not visible in this excerpt.
 */
2392 void check_for_unpunctuated_endquote(const char *aline)
2397 c=g_utf8_get_char(aline);
2398 nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
2399 for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
2403 qc=CHAR_IS_DQUOTE(c)?QUOTE_CLASS(c):INVALID_QUOTE;
2404 nc=g_utf8_get_char(g_utf8_next_char(s));
2405 /* for each character in the line except 1st */
2406 if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) && g_unichar_isalpha(pc))
2408 if (pswit[ECHO_SWITCH])
2409 g_print("\n%s\n",aline);
2410 if (!pswit[OVERVIEW_SWITCH])
2411 g_print(" Line %ld column %ld - "
2412 "endquote missing punctuation?\n",
2413 linecnt,g_utf8_pointer_to_offset(aline,s));
2421 * check_for_html_tag:
2423 * Check for <HTML TAG>.
2425 * If there is a < in the line, followed at some point
2426 * by a > then we suspect HTML.
/*
 * Finds the first < on the line and the first > after it, and queries
 * the text between them (both brackets included) as a possible HTML tag.
 * The tag text is copied with g_strndup() for printing, and the column
 * is the 1-based character offset of the opening bracket.
 * NOTE(review): the null tests on open and close and the release of tag
 * are not visible in this excerpt.
 */
2428 void check_for_html_tag(const char *aline)
2430 const char *open,*close;
2432 open=strchr(aline,'<');
2435 close=strchr(g_utf8_next_char(open),'>');
2438 if (pswit[ECHO_SWITCH])
2439 g_print("\n%s\n",aline);
2440 if (!pswit[OVERVIEW_SWITCH])
/* close-open+1 bytes: from the opening bracket through the closing one. */
2442 tag=g_strndup(open,close-open+1);
2443 g_print(" Line %ld column %ld - HTML Tag? %s \n",
2444 linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
2454 * check_for_html_entity:
2456 * Check for &symbol; HTML.
2458 * If there is a & in the line, followed at
2459 * some point by a ; then we suspect HTML.
/*
 * Finds the first ampersand on the line and the first semicolon after
 * it.  The span between them is scanned for CHAR_SPACE; a space ends the
 * scan early so that ordinary text containing both marks is not
 * reported.  Otherwise the span, both marks included, is copied with
 * g_strndup() and queried as a possible HTML symbol.
 * NOTE(review): the column is computed as the byte distance amp-aline
 * plus 1, whereas check_for_html_tag uses g_utf8_pointer_to_offset().
 * On a line with multi-byte characters before the ampersand the two
 * would disagree - confirm and consider the character offset here.
 * NOTE(review): the null tests on amp and scolon, the test that follows
 * the scan and the release of entity are not visible in this excerpt.
 */
2461 void check_for_html_entity(const char *aline)
2463 const char *s,*amp,*scolon;
2465 amp=strchr(aline,'&');
2468 scolon=strchr(amp,';');
2471 for (s=amp;s<scolon;s=g_utf8_next_char(s))
2472 if (g_utf8_get_char(s)==CHAR_SPACE)
2473 break; /* Don't report "Jones & Son;" */
2476 if (pswit[ECHO_SWITCH])
2477 g_print("\n%s\n",aline);
2478 if (!pswit[OVERVIEW_SWITCH])
2480 entity=g_strndup(amp,scolon-amp+1);
2481 g_print(" Line %ld column %d - HTML symbol? %s \n",
2482 linecnt,(int)(amp-aline)+1,entity);
2493 * check_for_omitted_punctuation:
2495 * Check for omitted punctuation at end of paragraph by working back
2496 * through prevline. DW.
2497 * Need to check this only for "normal" paras.
2498 * So what is a "normal" para?
2499 * Not normal if one-liner (chapter headings, etc.)
2500 * Not normal if doesn't contain at least one locase letter
2501 * Not normal if starts with space
/*
 * Inspects prevline, the last line of the paragraph that just ended, for
 * missing final punctuation.  The check runs only when prevline contains
 * at least one letter, last->blen is greater than 2, the paragraph began
 * before line linecnt-1 and the first character of prevline is greater
 * than CHAR_SPACE.  The scan starts at the end of prevline, steps back
 * over trailing quotes whose QUOTE_CLASS() is CLOSING_QUOTE or
 * NEUTRAL_QUOTE, and then walks backwards: reaching a letter produces
 * the query "No punctuation at para end?" at line linecnt-1 with the
 * character length of prevline as the column.  A separate test looks
 * for a character from the listed punctuation set.
 * NOTE(review): the statements that end the backward walk, including
 * what happens when a listed punctuation character is found, are not
 * visible in this excerpt - presumably that ends the walk without a
 * query.
 */
2503 void check_for_omitted_punctuation(const char *prevline,
2504 struct line_properties *last,int start_para_line)
2506 gboolean letter_on_line=FALSE;
2509 gboolean closing_quote;
2510 for (s=prevline;*s;s=g_utf8_next_char(s))
2511 if (g_unichar_isalpha(g_utf8_get_char(s)))
2513 letter_on_line=TRUE;
2517 * This next "if" is a problem.
2518 * If we say "start_para_line <= linecnt - 1", that includes
2519 * one-line "paragraphs" like chapter heads. Lotsa false positives.
2520 * If we say "start_para_line < linecnt - 1" it doesn't, but then it
2521 * misses genuine one-line paragraphs.
2523 if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
2524 g_utf8_get_char(prevline)>CHAR_SPACE)
2526 s=prevline+strlen(prevline);
2529 s=g_utf8_prev_char(s);
2530 c=g_utf8_get_char(s);
2531 if (QUOTE_CLASS(c)==CLOSING_QUOTE || QUOTE_CLASS(c)==NEUTRAL_QUOTE)
2534 closing_quote=FALSE;
2535 } while (closing_quote && s>prevline);
2536 for (;s>prevline;s=g_utf8_prev_char(s))
2538 if (g_unichar_isalpha(g_utf8_get_char(s)))
2540 if (pswit[ECHO_SWITCH])
2541 g_print("\n%s\n",prevline);
2542 if (!pswit[OVERVIEW_SWITCH])
2543 g_print(" Line %ld column %ld - "
2544 "No punctuation at para end?\n",
2545 linecnt-1,g_utf8_strlen(prevline,-1));
2550 if (g_utf8_strchr("—-.:!([{?}])",-1,g_utf8_get_char(s)))
/*
 * Callback taking a key, a value and a user-data pointer.  The key is
 * treated as the queried word string and a note is printed saying how
 * many times that word was duplicated.
 * NOTE(review): how the count is derived from value, any threshold on
 * it and the return value are not visible in this excerpt.  Presumably
 * this is used to traverse the qword tree - confirm against the caller.
 */
2556 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
2558 const char *word=key;
2561 g_print("\nNote: Queried word %s was duplicated %d times\n",
/*
 * Writes string to stdout, converting it from UTF-8 to WINDOWS-1252 with
 * a GIConv converter held in a function-local static.  The converter is
 * opened the first time it is needed and reused afterwards; (GIConv)-1
 * marks it as not open.  The output buffer is allocated with one byte
 * more than the byte length of the input.
 * NOTE(review): one visible path closes the converter and resets it to
 * (GIConv)-1; its condition is not visible here - presumably a null
 * string is the signal to release it.  The result of g_iconv() is not
 * checked in the visible lines, and the statements that print and free
 * the buffer are not visible either.  The final fputs() of the original
 * string is presumably the fallback when no converter could be opened -
 * confirm.
 */
2566 void print_as_windows_1252(const char *string)
2568 gsize inbytes,outbytes;
2570 static GIConv converter=(GIConv)-1;
2573 if (converter!=(GIConv)-1)
2574 g_iconv_close(converter);
2575 converter=(GIConv)-1;
2578 if (converter==(GIConv)-1)
2579 converter=g_iconv_open("WINDOWS-1252","UTF-8");
2580 if (converter!=(GIConv)-1)
2582 inbytes=outbytes=strlen(string);
2583 bp=buf=g_malloc(outbytes+1);
2584 g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
2590 fputs(string,stdout);
/*
 * Writes string to stdout unchanged with fputs().
 * NOTE(review): any guard on a null string is not visible in this
 * excerpt.
 */
2593 void print_as_utf_8(const char *string)
2595 fputs(string,stdout);
/*
 * procfile:
 *
 * Run the checks over a single file. The text is loaded with
 * read_etext(), examined by first_pass()/report_first_pass() (whose
 * result gates some of the later checks), and then walked line by
 * line via flgets() with the individual check routines applied.
 */
2603 void procfile(const char *filename)
2606 gchar *parastart=NULL; /* first line of current para */
2607 gchar *etext,*aline;
2610 struct first_pass_results *first_pass_results;
2611 struct warnings *warnings;
2612 struct counters counters={0};
2613 struct line_properties last={0};
2614 struct parities parities={0};
2615 struct pending pending={0};
2616 gboolean isemptyline;
2617 long start_para_line=0;
2618 gboolean isnewpara=FALSE,enddash=FALSE;
2619 last.start=CHAR_SPACE;
2620 linecnt=checked_linecnt=0;
/* Load the text; err->message is what gets reported below. */
2621 etext=read_etext(filename,&err);
/* The message goes to stdout when STDOUT_SWITCH is set; a stderr form follows. */
2624 if (pswit[STDOUT_SWITCH])
2625 fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
2627 fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
2630 g_print("\n\nFile: %s\n\n",filename);
2631 first_pass_results=first_pass(etext);
2632 warnings=report_first_pass(first_pass_results);
/*
 * Both trees order their string keys with strcmp(). qword releases its
 * keys and values with g_free; qperiod releases only its keys.
 */
2633 qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
2634 qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
2636 * Here we go with the main pass. Hold onto yer hat!
2640 while ((aline=flgets(&etext_ptr,linecnt+1,warnings->nocr)))
2645 if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
2646 continue; // skip DP page separators completely
/*
 * Lines before firstline, or after footerline (only when footerline>0),
 * are not checked as body text.
 */
2647 if (linecnt<first_pass_results->firstline ||
2648 (first_pass_results->footerline>0 &&
2649 linecnt>first_pass_results->footerline))
/* With HEADER_SWITCH set, echo the Title/Author/Release Date/Edition lines. */
2651 if (pswit[HEADER_SWITCH])
2653 if (g_str_has_prefix(aline,"Title:"))
2654 g_print(" %s\n",aline);
2655 if (g_str_has_prefix(aline,"Author:"))
2656 g_print(" %s\n",aline);
2657 if (g_str_has_prefix(aline,"Release Date:"))
2658 g_print(" %s\n",aline);
2659 if (g_str_has_prefix(aline,"Edition:"))
2660 g_print(" %s\n\n",aline);
2662 continue; /* skip through the header */
2665 print_pending(aline,parastart,&pending);
/* analyse_quotes() also supplies the "line is empty" flag used below. */
2666 isemptyline=analyse_quotes(aline,&counters);
2667 if (isnewpara && !isemptyline)
2669 /* This line is the start of a new paragraph. */
2670 start_para_line=linecnt;
2671 /* Capture its first line in case we want to report it later. */
2673 parastart=g_strdup(aline);
2674 memset(&parities,0,sizeof(parities)); /* restart the quote count */
/* Move forward to the first letter or digit on the line (or its end). */
2676 while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
2677 !g_unichar_isdigit(g_utf8_get_char(s)))
2678 s=g_utf8_next_char(s);
2679 if (g_unichar_islower(g_utf8_get_char(s)))
2681 /* and its first letter is lowercase */
2682 if (pswit[ECHO_SWITCH])
2683 g_print("\n%s\n",aline);
2684 if (!pswit[OVERVIEW_SWITCH])
2685 g_print(" Line %ld column %ld - "
2686 "Paragraph starts with lower-case\n",
2687 linecnt,g_utf8_pointer_to_offset(aline,s)+1);
2691 isnewpara=FALSE; /* Signal the end of new para processing. */
2693 /* Check for an em-dash broken at line end. */
2694 if (enddash && g_utf8_get_char(aline)=='-')
2696 if (pswit[ECHO_SWITCH])
2697 g_print("\n%s\n",aline);
2698 if (!pswit[OVERVIEW_SWITCH])
2699 g_print(" Line %ld column 1 - Broken em-dash?\n",linecnt);
/*
 * Walk back from the end of the line over trailing spaces, then test
 * the character reached for a hyphen.
 */
2704 for (s=g_utf8_prev_char(aline+strlen(aline));
2705 g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
2707 if (s>=aline && g_utf8_get_char(s)=='-')
2709 check_for_control_characters(aline);
2711 check_for_odd_characters(aline,warnings,isemptyline);
/* The long/short line checks run only when the first pass enabled them. */
2712 if (warnings->longline)
2713 check_for_long_line(aline);
2714 if (warnings->shortline)
2715 check_for_short_line(aline,&last);
/* Record this line's length and first character in 'last' for later checks. */
2717 last.len=g_utf8_strlen(aline,-1);
2718 last.start=g_utf8_get_char(aline);
2719 check_for_starting_punctuation(aline);
2722 check_for_spaced_emdash(aline);
2723 check_for_spaced_dash(aline);
2725 check_for_unmarked_paragraphs(aline);
2726 check_for_jeebies(aline);
2727 check_for_mta_from(aline);
2728 check_for_orphan_character(aline);
2729 check_for_pling_scanno(aline);
2730 check_for_extra_period(aline,warnings);
2731 check_for_following_punctuation(aline);
2732 check_for_typos(aline,warnings);
2733 check_for_misspaced_punctuation(aline,&parities,isemptyline);
2734 check_for_double_punctuation(aline,warnings);
2735 check_for_spaced_quotes(aline);
2736 check_for_miscased_genative(aline);
2737 check_end_of_line(aline,warnings);
2738 check_for_unspaced_bracket(aline);
2739 if (warnings->endquote)
2740 check_for_unpunctuated_endquote(aline);
2741 check_for_html_tag(aline);
2742 check_for_html_entity(aline);
/*
 * NOTE(review): the condition guarding this paragraph-end handling is
 * not visible here; presumably it runs for an empty line - confirm.
 */
2745 check_for_mismatched_quotes(&counters,&pending);
2746 counters_reset(&counters);
2747 /* let the next iteration know that it's starting a new para */
2750 check_for_omitted_punctuation(prevline,&last,start_para_line);
/* Keep a private copy of this line as the "previous line". */
2753 prevline=g_strdup(aline);
/* NOTE(review): presumably reached once the line loop has ended - confirm. */
2756 check_for_mismatched_quotes(&counters,&pending);
2757 print_pending(NULL,parastart,&pending);
2758 reset_pending(&pending);
/* Duplicate-query notes are printed only when neither switch is set. */
2767 if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])
2768 g_tree_foreach(qword,report_duplicate_queries,NULL);
2769 g_tree_unref(qword);
2770 g_tree_unref(qperiod);
2771 counters_destroy(&counters);
2772 g_set_print_handler(NULL);
/*
 * NOTE(review): a NULL string presumably asks print_as_windows_1252()
 * to close its cached converter - confirm against that function.
 */
2773 print_as_windows_1252(NULL);
2774 if (pswit[MARKUP_SWITCH])
2781 * Get one line from the input text, checking for
2782 * the existence of exactly one CR/LF line-end per line.
2784 * Returns: a pointer to the line.
/*
 * Parameters, as far as they are used in the visible code:
 *   etext     - in/out cursor into the text; moved on as characters
 *               are consumed.
 *   lcnt      - the line number quoted in the line-end warnings.
 *   warn_nocr - when FALSE, the "No CR?" warning is suppressed.
 * All four line-end warnings are issued only when LINE_END_SWITCH is set.
 */
2786 char *flgets(char **etext,long lcnt,gboolean warn_nocr)
2789 gboolean isCR=FALSE;
/* The line starts wherever the cursor currently points. */
2790 char *theline=*etext;
2795 c=g_utf8_get_char(*etext);
2798 if (*etext==theline)
2800 else if (pswit[LINE_END_SWITCH])
2802 if (pswit[ECHO_SWITCH])
/* Echo a copy of the text from the start of the line up to eos. */
2804 s=g_strndup(theline,eos-theline);
2805 g_print("\n%s\n",s);
2808 if (!pswit[OVERVIEW_SWITCH])
2809 /* There may, or may not, have been a CR */
2810 g_print(" Line %ld - No LF?\n",lcnt);
2816 *etext=g_utf8_next_char(*etext);
2817 /* either way, it's end of line */
2824 /* Error - a LF without a preceding CR */
2825 if (pswit[LINE_END_SWITCH] && warn_nocr)
2827 if (pswit[ECHO_SWITCH])
2829 s=g_strndup(theline,eos-theline);
2830 g_print("\n%s\n",s);
2833 if (!pswit[OVERVIEW_SWITCH])
2834 g_print(" Line %ld - No CR?\n",lcnt);
2845 /* Error - two successive CRs */
2846 if (pswit[LINE_END_SWITCH])
2848 if (pswit[ECHO_SWITCH])
2850 s=g_strndup(theline,eos-theline);
2851 g_print("\n%s\n",s);
2854 if (!pswit[OVERVIEW_SWITCH])
2855 g_print(" Line %ld - Two successive CRs?\n",lcnt);
/* A CR was seen (isCR) but the character now being handled is not its LF. */
2864 if (pswit[LINE_END_SWITCH] && isCR)
2866 if (pswit[ECHO_SWITCH])
2868 s=g_strndup(theline,eos-theline);
2869 g_print("\n%s\n",s);
2872 if (!pswit[OVERVIEW_SWITCH])
2873 g_print(" Line %ld column %ld - CR without LF?\n",
2874 lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
2880 eos=g_utf8_next_char(eos);
/* Strip HTML and/or DP markup from the line when the matching switch is set. */
2884 if (pswit[MARKUP_SWITCH])
2885 postprocess_for_HTML(theline);
2886 if (pswit[DP_SWITCH])
2887 postprocess_for_DP(theline);
2894 * Takes a "word" as a parameter, and checks whether it
2895 * contains a mixture of alpha and digits. Generally, this is an
2896 * error, but may not be for cases like 4th or L5 12s. 3d.
2898 * Returns: TRUE iff an is error found.
2900 gboolean mixdigit(const char *checkword)
2902 gboolean wehaveadigit,wehavealetter,query;
2903 const char *s,*nondigit;
2904 wehaveadigit=wehavealetter=query=FALSE;
2905 for (s=checkword;*s;s=g_utf8_next_char(s))
2906 if (g_unichar_isalpha(g_utf8_get_char(s)))
2908 else if (g_unichar_isdigit(g_utf8_get_char(s)))
2910 if (wehaveadigit && wehavealetter)
2912 /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
2914 for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
2915 nondigit=g_utf8_next_char(nondigit))
2917 /* digits, ending in st, rd, nd, th of either case */
2918 if (!g_ascii_strcasecmp(nondigit,"st") ||
2919 !g_ascii_strcasecmp(nondigit,"rd") ||
2920 !g_ascii_strcasecmp(nondigit,"nd") ||
2921 !g_ascii_strcasecmp(nondigit,"th"))
2923 if (!g_ascii_strcasecmp(nondigit,"sts") ||
2924 !g_ascii_strcasecmp(nondigit,"rds") ||
2925 !g_ascii_strcasecmp(nondigit,"nds") ||
2926 !g_ascii_strcasecmp(nondigit,"ths"))
2928 if (!g_ascii_strcasecmp(nondigit,"stly") ||
2929 !g_ascii_strcasecmp(nondigit,"rdly") ||
2930 !g_ascii_strcasecmp(nondigit,"ndly") ||
2931 !g_ascii_strcasecmp(nondigit,"thly"))
2933 /* digits, ending in l, L, s or d */
2934 if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
2935 !strcmp(nondigit,"d"))
2938 * L at the start of a number, representing Britsh pounds, like L500.
2939 * This is cute. We know the current word is mixed digit. If the first
2940 * letter is L, there must be at least one digit following. If both
2941 * digits and letters follow, we have a genuine error, else we have a
2942 * capital L followed by digits, and we accept that as a non-error.
2944 if (g_utf8_get_char(checkword)=='L' &&
2945 !mixdigit(g_utf8_next_char(checkword)))
2954 * Extracts the first/next "word" from the line, and returns it.
2955 * A word is defined as one English word unit--or at least that's the aim.
2956 * "ptr" is advanced to the position in the line where we will start
2957 * looking for the next word.
2959 * Returns: A newly-allocated string.
/*
 * The result is built in a GString and handed back with
 * g_string_free(word,FALSE), which returns the character data to the
 * caller instead of freeing it.
 */
2961 gchar *getaword(const char **ptr)
2966 word=g_string_new(NULL);
/*
 * Step over characters that are neither digits nor letters, stopping
 * at the end of the string.
 */
2967 for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
2968 !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
2969 **ptr;*ptr=g_utf8_next_char(*ptr))
2971 /* Handle exceptions for footnote markers like [1] */
2972 if (g_utf8_get_char(*ptr)=='[')
2974 g_string_append_c(word,'[');
2975 s=g_utf8_next_char(*ptr);
2976 for (;g_unichar_isdigit(g_utf8_get_char(s));s=g_utf8_next_char(s))
2977 g_string_append_unichar(word,g_utf8_get_char(s));
2978 if (g_utf8_get_char(s)==']')
2980 g_string_append_c(word,']');
2981 *ptr=g_utf8_next_char(s);
2982 return g_string_free(word,FALSE);
2985 g_string_truncate(word,0);
2989 * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
2990 * Especially yucky is the case of L1,000
2991 * This section looks for a pattern of characters including a digit
2992 * followed by a comma or period followed by one or more digits.
2993 * If found, it returns this whole pattern as a word; otherwise we discard
2994 * the results and resume our normal programming.
2997 for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
2998 g_unichar_isalpha(g_utf8_get_char(s)) ||
2999 g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
3000 g_string_append_unichar(word,g_utf8_get_char(s));
/*
 * Scan the collected text from its second character on, looking for a
 * '.' or ',' whose preceding character is a digit.
 */
3003 for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))
3005 c=g_utf8_get_char(t);
3006 pc=g_utf8_get_char(g_utf8_prev_char(t));
3007 if ((c=='.' || c==',') && g_unichar_isdigit(pc))
3010 return g_string_free(word,FALSE);
3014 /* we didn't find a punctuated number - do the regular getword thing */
3015 g_string_truncate(word,0);
3016 c=g_utf8_get_char(*ptr);
3017 for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);
3018 *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))
3019 g_string_append_unichar(word,c);
3020 return g_string_free(word,FALSE);
3026 * Is this word a Roman Numeral?
3028 * It doesn't actually validate that the number is a valid Roman Numeral--for
3029 * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
3030 * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
3031 * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
3032 * expressions thereof, except when it came to taxes. Allow any number of M,
3033 * an optional D, an optional CM or CD, any number of optional Cs, an optional
3034 * XL or an optional XC, an optional IX or IV, an optional V and any number
/*
 * Every test below compares against lower-case letters only ('m', 'd',
 * "cm", "cd", 'c', "xl", "xc", 'l', 'x', "ix", "iv", 'v', 'i'), taken
 * in that fixed order.
 * NOTE(review): the "&& *t" in the while conditions cannot change the
 * outcome, because a character equal to a letter is never NUL.
 */
3037 gboolean isroman(const char *t)
3043 while (g_utf8_get_char(t)=='m' && *t)
3045 if (g_utf8_get_char(t)=='d')
3047 if (g_str_has_prefix(t,"cm"))
3049 if (g_str_has_prefix(t,"cd"))
3051 while (g_utf8_get_char(t)=='c' && *t)
3053 if (g_str_has_prefix(t,"xl"))
3055 if (g_str_has_prefix(t,"xc"))
3057 if (g_utf8_get_char(t)=='l')
3059 while (g_utf8_get_char(t)=='x' && *t)
3061 if (g_str_has_prefix(t,"ix"))
3063 if (g_str_has_prefix(t,"iv"))
3065 if (g_utf8_get_char(t)=='v')
3067 while (g_utf8_get_char(t)=='i' && *t)
3073 * postprocess_for_DP:
3075 * Invoked with the -d switch from flgets().
3076 * It simply "removes" from the line a hard-coded set of common
3077 * DP-specific tags, so that the line passed to the main routine has
3078 * been pre-cleaned of DP markup.
3080 void postprocess_for_DP(char *theline)
3086 for (i=0;*DPmarkup[i];i++)
3087 while ((s=strstr(theline,DPmarkup[i])))
3089 t=s+strlen(DPmarkup[i]);
3090 memmove(s,t,strlen(t)+1);
/*
 * postprocess_for_HTML:
 *
 * Invoked with the -m switch from flgets().
 * Calls losemarkup() on the line until it reports nothing more to
 * remove, then lets loseentities() deal with the HTML entities, so
 * that the line passed to the main routine has been pre-cleaned of HTML.
 */
void postprocess_for_HTML(char *theline)
{
    char *removed;
    do
        removed=losemarkup(theline);
    while (removed);
    loseentities(theline);
}
3110 char *losemarkup(char *theline)
3114 s=strchr(theline,'<');
3115 t=s?strchr(s,'>'):NULL;
3118 for (i=0;*markup[i];i++)
3119 if (tagcomp(g_utf8_next_char(s),markup[i]))
3121 t=g_utf8_next_char(t);
3122 memmove(s,t,strlen(t)+1);
3125 /* It's an unrecognized <xxx>. */
3129 void loseentities(char *theline)
3136 GTree *entities=NULL;
3137 static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
3141 g_tree_destroy(entities);
3143 if (translit!=(GIConv)-1)
3144 g_iconv_close(translit);
3145 translit=(GIConv)-1;
3146 if (to_utf8!=(GIConv)-1)
3147 g_iconv_close(to_utf8);
3155 entities=g_tree_new((GCompareFunc)strcmp);
3156 for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
3157 g_tree_insert(entities,HTMLentities[i].name,
3158 GUINT_TO_POINTER(HTMLentities[i].c));
3160 if (translit==(GIConv)-1)
3161 translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
3162 if (to_utf8==(GIConv)-1)
3163 to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
3164 while((amp=strchr(theline,'&')))
3166 scolon=strchr(amp,';');
3171 if (amp+2+strspn(amp+2,"0123456789")==scolon)
3172 c=strtol(amp+2,NULL,10);
3173 else if (amp[2]=='x' &&
3174 amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
3175 c=strtol(amp+3,NULL,16);
3179 s=g_strndup(amp+1,scolon-(amp+1));
3180 c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
3189 if (c<128 || c>=192 && c<=255) /* An ISO-8859-1 character */
3190 theline+=g_unichar_to_utf8(c,theline);
3194 nb=g_unichar_to_utf8(c,s);
3195 t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
3197 s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
3199 memcpy(theline,s,nb);
3203 memmove(theline,g_utf8_next_char(scolon),
3204 strlen(g_utf8_next_char(scolon))+1);
3207 theline=g_utf8_next_char(amp);
3211 gboolean tagcomp(const char *strin,const char *basetag)
3215 if (g_utf8_get_char(strin)=='/')
3216 t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
3218 t=g_utf8_casefold(strin,-1);
3219 s=g_utf8_casefold(basetag,-1);
3220 retval=g_str_has_prefix(t,s);
3226 void proghelp(GOptionContext *context)
3229 fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
3230 fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
3231 fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
3232 fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
3233 "For details, read the file COPYING.\n",stderr);
3234 fputs("This is Free Software; "
3235 "you may redistribute it under certain conditions (GPL);\n",stderr);
3236 fputs("read the file COPYING for details.\n\n",stderr);
3237 help=g_option_context_get_help(context,TRUE,NULL);
3240 fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
3241 fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
3242 "non-ASCII\n",stderr);
3243 fputs("characters like accented letters, "
3244 "lines longer than 75 or shorter than 55,\n",stderr);
3245 fputs("unbalanced quotes or brackets, "
3246 "a variety of badly formatted punctuation, \n",stderr);
3247 fputs("HTML tags, some likely typos. "
3248 "It is NOT a substitute for human judgement.\n",stderr);