bookloupe/bookloupe.c
author ali <ali@juiblex.co.uk>
Mon Sep 30 08:18:42 2013 +0100 (2013-09-30)
changeset 138 5e27fa988c5c
parent 137 b6358ed2548d
child 139 c130152c4a57
child 144 d7a97f077f9e
permissions -rw-r--r--
Bugs #13+14: charsets in configuration files
     1 /*************************************************************************/
     2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
     3 /*									 */
     4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>			 */
     5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>			 */
     6 /*									 */
     7 /* This program is free software; you can redistribute it and/or modify  */
     8 /* it under the terms of the GNU General Public License as published by  */
     9 /* the Free Software Foundation; either version 2 of the License, or     */
    10 /* (at your option) any later version.					 */
    11 /*									 */
    12 /* This program is distributed in the hope that it will be useful,       */
    13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of	 */
    14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the		 */
    15 /* GNU General Public License for more details.				 */
    16 /*									 */
    17 /* You should have received a copy of the GNU General Public License	 */
    18 /* along with this program. If not, see <http://www.gnu.org/licenses/>.	 */
    19 /*************************************************************************/
    20 
    21 #include <stdio.h>
    22 #include <stdlib.h>
    23 #include <string.h>
    24 #include <ctype.h>
    25 #ifdef __WIN32__
    26 #include <windows.h>
    27 #endif
    28 #include <glib.h>
    29 #include <bl/bl.h>
    30 #include "bookloupe.h"
    31 #include "counters.h"
    32 #include "pending.h"
    33 #include "HTMLentities.h"
    34 
/*
 * Name of the character set selected through set_charset(). Every UNICODE
 * encoding alias accepted by set_charset() is stored here as "UTF-8".
 */
gchar *charset;		/* Or NULL for auto (ISO_8859-1/ASCII or UNICODE) */
/*
 * Converter from UTF-8 to charset as opened by set_charset(), or (GIConv)-1
 * when charset is NULL or "UTF-8" and no converter has been opened.
 */
GIConv charset_validator=(GIConv)-1;
    37 
    38 gchar *prevline;
    39 
    40 /* Common typos. */
    41 char *typo[] = {
    42     "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
    43     "nad", "te", "ig", "acn",  "ahve", "alot", "anbd", "andt", "awya", "aywa",
    44     "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
    45     "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
    46     "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
    47     "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
    48     "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
    49     "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
    50     "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
    51     "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
    52     "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
    53     "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
    54     "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
    55     "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
    56     "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
    57     "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
    58     "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
    59     "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
    60     "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
    61     "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
    62     "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
    63     "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
    64     "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
    65     "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
    66     "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
    67     "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
    68     "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
    69     "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
    70     "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
    71     "se", ""
    72 };
    73 
    74 GTree *usertypo;
    75 
    76 /* Common abbreviations and other OK words not to query as typos. */
    77 char *okword[] = {
    78     "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
    79     "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
    80     "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
    81     "outbid", "outbids", "frostbite", "frostbitten", ""
    82 };
    83 
    84 /* Common abbreviations that cause otherwise unexplained periods. */
    85 char *abbrev[] = {
    86     "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
    87     "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
    88 };
    89 
    90 /*
    91  * Two-Letter combinations that rarely if ever start words,
    92  * but are common scannos or otherwise common letter combinations.
    93  */
    94 char *nostart[] = {
    95     "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
    96 };
    97 
    98 /*
    99  * Two-Letter combinations that rarely if ever end words,
   100  * but are common scannos or otherwise common letter combinations.
   101  */
   102 char *noend[] = {
   103     "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
   104     "sw", "gr", "sl", "cl", "iy", ""
   105 };
   106 
/*
 * Lower-case HTML element names. The list is terminated by an empty string.
 * NOTE(review): presumably consulted by the markup handling (--markup);
 * the code using this list is not in this part of the file -- confirm.
 */
char *markup[] = {
    "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
    "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
    "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
    "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
};
   113 
/*
 * Markup tokens, terminated by an empty string.
 * NOTE(review): presumably the DP-specific markup which the --dp switch
 * asks to be ignored; the code using this list is not in this part of the
 * file -- confirm.
 */
char *DPmarkup[] = {
    "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
};
   117 
/*
 * Lower-case words, terminated by an empty string.
 * NOTE(review): presumably words which are queried when directly followed
 * by a comma; the code using this list is not in this part of the file --
 * confirm.
 * NOTE(review): "mrs" is listed twice.
 */
char *nocomma[] = {
    "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
    "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
    "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
    "during", "let", "toward", "among", ""
};
   124 
/*
 * Lower-case words, terminated by an empty string.
 * NOTE(review): presumably words which are queried when directly followed
 * by a period; the code using this list is not in this part of the file --
 * confirm.
 */
char *noperiod[] = {
    "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
    "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
    "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
    "among", "those", "into", "whom", "having", "thence", ""
};
   131 
   132 gboolean pswit[SWITNO];  /* program switches */
   133 gchar *opt_charset;
   134 
   135 gboolean typo_compat,paranoid_compat;
   136 
   137 static GOptionEntry options[]={
   138     { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
   139       "Ignore DP-specific markup", NULL },
   140     { "no-dp", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   141       G_OPTION_ARG_NONE, pswit+DP_SWITCH,
   142       "Don't ignore DP-specific markup", NULL },
   143     { "echo", 0, G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
   144       "Echo queried line", NULL },
   145     { "no-echo", 'e', G_OPTION_FLAG_REVERSE,
   146       G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
   147       "Don't echo queried line", NULL },
   148     { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
   149       "Check single quotes", NULL },
   150     { "no-squote", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   151       G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
   152       "Don't check single quotes", NULL },
   153     { "typo", 0, 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
   154       "Check common typos", NULL },
   155     { "no-typo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   156       G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
   157       "Don't check common typos", NULL },
   158     { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
   159       "Require closure of quotes on every paragraph", NULL },
   160     { "no-qpara", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   161       G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
   162       "Don't require closure of quotes on every paragraph", NULL },
   163     { "paranoid", 0, G_OPTION_FLAG_HIDDEN,
   164       G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
   165       "Enable paranoid querying of everything", NULL },
   166     { "no-paranoid", 0, G_OPTION_FLAG_REVERSE,
   167       G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
   168       "Disable paranoid querying of everything", NULL },
   169     { "line-end", 0, G_OPTION_FLAG_HIDDEN,
   170       G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
   171       "Enable line end checking", NULL },
   172     { "no-line-end", 'l', G_OPTION_FLAG_REVERSE,
   173       G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
   174       "Diable line end checking", NULL },
   175     { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
   176       "Overview: just show counts", NULL },
   177     { "no-overview", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   178       G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
   179       "Show individual warnings", NULL },
   180     { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
   181       "Output errors to stdout instead of stderr", NULL },
   182     { "no-stdout", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   183       G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
   184       "Output errors to stderr instead of stdout", NULL },
   185     { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
   186       "Echo header fields", NULL },
   187     { "no-header", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   188       G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
   189       "Don't echo header fields", NULL },
   190     { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
   191       "Ignore markup in < >", NULL },
   192     { "no-markup", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   193       G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
   194       "No special handling for markup in < >", NULL },
   195     { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
   196       "Use file of user-defined typos", NULL },
   197     { "no-usertypo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   198       G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
   199       "Ignore file of user-defined typos", NULL },
   200     { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
   201       "Verbose - list everything", NULL },
   202     { "no-verbose", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   203       G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
   204       "Switch off verbose mode", NULL },
   205     { "charset", 0, 0, G_OPTION_ARG_STRING, &opt_charset,
   206       "Set of characters valid for this ebook", "NAME" },
   207     { NULL }
   208 };
   209 
   210 /*
   211  * Options relating to configuration which make no sense from inside
   212  * a configuration file.
   213  */
   214 
   215 static GOptionEntry config_options[]={
   216     { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
   217       "Defaults for use on www upload", NULL },
   218     { "dump-config", 0, 0, G_OPTION_ARG_NONE, pswit+DUMP_CONFIG_SWITCH,
   219       "Dump current config settings", NULL },
   220     { NULL }
   221 };
   222 
   223 static GOptionEntry compatibility_options[]={
   224     { "toggle-typo", 't', 0, G_OPTION_ARG_NONE, &typo_compat,
   225       "Toggle checking for common typos", NULL },
   226     { "toggle-relaxed", 'x', 0, G_OPTION_ARG_NONE, &paranoid_compat,
   227       "Toggle both paranoid mode and common typos", NULL },
   228     { NULL }
   229 };
   230 
   231 long cnt_dquot;		/* for overview mode, count of doublequote queries */
   232 long cnt_squot;		/* for overview mode, count of singlequote queries */
   233 long cnt_brack;		/* for overview mode, count of brackets queries */
   234 long cnt_bin;		/* for overview mode, count of non-ASCII queries */
   235 long cnt_odd;		/* for overview mode, count of odd character queries */
   236 long cnt_long;		/* for overview mode, count of long line errors */
   237 long cnt_short;		/* for overview mode, count of short line queries */
   238 long cnt_punct;		/* for overview mode,
   239 			   count of punctuation and spacing queries */
   240 long cnt_dash;		/* for overview mode, count of dash-related queries */
   241 long cnt_word;		/* for overview mode, count of word queries */
   242 long cnt_html;		/* for overview mode, count of html queries */
   243 long cnt_lineend;	/* for overview mode, count of line-end queries */
   244 long cnt_spacend;	/* count of lines with space at end */
   245 long linecnt;		/* count of total lines in the file */
   246 long checked_linecnt;	/* count of lines actually checked */
   247 
   248 void proghelp(GOptionContext *context);
   249 void procfile(const char *);
   250 
   251 gchar *running_from;
   252 
   253 gboolean mixdigit(const char *);
   254 gchar *getaword(const char **);
   255 char *flgets(char **,long);
   256 void postprocess_for_HTML(char *);
   257 char *linehasmarkup(char *);
   258 char *losemarkup(char *);
   259 gboolean tagcomp(const char *,const char *);
   260 void loseentities(char *);
   261 gboolean isroman(const char *);
   262 void postprocess_for_DP(char *);
   263 void print_as_windows_1252(const char *string);
   264 void print_as_utf_8(const char *string);
   265 
   266 GTree *qword,*qperiod;
   267 
   268 #ifdef __WIN32__
   269 UINT saved_cp;
   270 #endif
   271 
   272 gboolean set_charset(const char *name,GError **err)
   273 {
   274     /* The various UNICODE encodings all share the same character set. */
   275     const char *unicode_aliases[]={ "UCS-2", "UCS-2BE", "UCS-2LE", "UCS-4",
   276       "UCS-4BE", "UCS-4LE", "UCS2", "UCS4", "UNICODE", "UNICODEBIG",
   277       "UNICODELITTLE", "UTF-7", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE",
   278       "UTF-32", "UTF-32BE", "UTF-32LE", "UTF7", "UTF8", "UTF16", "UTF16BE",
   279       "UTF16LE", "UTF32", "UTF32BE", "UTF32LE" };
   280     int i;
   281     if (charset)
   282 	g_free(charset);
   283     if (charset_validator!=(GIConv)-1)
   284 	g_iconv_close(charset_validator);
   285     if (!name || !g_strcasecmp(name,"auto"))
   286     {
   287 	charset=NULL;
   288 	charset_validator=(GIConv)-1;
   289 	return TRUE;
   290     }
   291     else
   292 	charset=g_strdup(name);
   293     for(i=0;i<G_N_ELEMENTS(unicode_aliases);i++)
   294 	if (!g_strcasecmp(charset,unicode_aliases[i]))
   295 	{
   296 	    g_free(charset);
   297 	    charset=g_strdup("UTF-8");
   298 	    break;
   299 	}
   300     if (!strcmp(charset,"UTF-8"))
   301 	charset_validator=(GIConv)-1;
   302     else
   303     {
   304 	charset_validator=g_iconv_open(charset,"UTF-8");
   305 	if (charset_validator==(GIConv)-1)
   306 	{
   307 	    g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_NO_CONVERSION,
   308 	      "Unknown character set \"%s\"",charset);
   309 	    return FALSE;
   310 	}
   311     }
   312     return TRUE;
   313 }
   314 
   315 GKeyFile *config;
   316 
   317 void config_file_update(GKeyFile *kf)
   318 {
   319     int i;
   320     const char *s;
   321     gboolean sw;
   322     for(i=0;options[i].long_name;i++)
   323     {
   324 	if (g_str_has_prefix(options[i].long_name,"no-"))
   325 	    continue;
   326 	if (options[i].arg==G_OPTION_ARG_NONE)
   327 	{
   328 	    sw=*(gboolean *)options[i].arg_data;
   329 	    if (options[i].flags&G_OPTION_FLAG_REVERSE)
   330 		sw=!sw;
   331 	    g_key_file_set_boolean(kf,"options",options[i].long_name,sw);
   332 	}
   333 	else if (options[i].arg==G_OPTION_ARG_STRING)
   334 	{
   335 	    s=*(gchar **)options[i].arg_data;
   336 	    if (!s)
   337 		s="auto";
   338 	    g_key_file_set_string(kf,"options",options[i].long_name,s);
   339 	}
   340 	else
   341 	    g_assert_not_reached();
   342     }
   343 }
   344 
   345 void config_file_add_comments(GKeyFile *kf)
   346 {
   347     int i;
   348     gchar *comment;
   349     g_key_file_set_comment(kf,NULL,NULL," Default configuration for bookloupe",
   350       NULL);
   351     for(i=0;options[i].long_name;i++)
   352     {
   353 	if (g_str_has_prefix(options[i].long_name,"no-"))
   354 	    continue;
   355 	comment=g_strconcat(" ",options[i].description,NULL);
   356 	g_key_file_set_comment(kf,"options",options[i].long_name,comment,NULL);
   357 	g_free(comment);
   358     }
   359 }
   360 
   361 void dump_config(void)
   362 {
   363     gchar *s;
   364     if (config)
   365 	config_file_update(config);
   366     else
   367     {
   368 	config=g_key_file_new();
   369 	config_file_update(config);
   370 	config_file_add_comments(config);
   371     }
   372     s=g_key_file_to_data(config,NULL,NULL);
   373     if (s)
   374 	g_print("%s",s);
   375     g_free(s);
   376 }
   377 
   378 GKeyFile *read_config_file(gchar **full_path)
   379 {
   380     int i;
   381     GError *err=NULL;
   382     gchar **search_dirs;
   383     gchar *path;
   384     const char *search_path;
   385     GKeyFile *kf;
   386     kf=g_key_file_new();
   387     search_path=g_getenv("BOOKLOUPE_CONFIG_PATH");
   388     if (search_path)
   389     {
   390 #ifdef __WIN32__
   391 	search_dirs=g_strsplit(search_path,";",0);
   392 #else
   393 	search_dirs=g_strsplit(search_path,":",0);
   394 #endif
   395     }
   396     else
   397     {
   398 	search_dirs=g_new(gchar *,4);
   399 	search_dirs[0]=g_get_current_dir();
   400 	search_dirs[1]=g_strdup(running_from);
   401 	search_dirs[2]=g_strdup(g_get_user_config_dir());
   402 	search_dirs[3]=NULL;
   403     }
   404     for(i=0;search_dirs[i];i++)
   405     {
   406 	path=g_build_filename(search_dirs[i],"bookloupe.ini",NULL);
   407 	if (g_key_file_load_from_file(kf,path,
   408 	  G_KEY_FILE_KEEP_COMMENTS|G_KEY_FILE_KEEP_TRANSLATIONS,&err))
   409 	    break;
   410 	if (!g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   411 	{
   412 	    g_printerr("Bookloupe: Error reading %s\n",path);
   413 	    g_printerr("%s\n",err->message);
   414 	    exit(1);
   415 	}
   416 	g_clear_error(&err);
   417 	g_free(path);
   418 	path=NULL;
   419     }
   420     if (!search_dirs[i])
   421     {
   422 	g_key_file_free(kf);
   423 	kf=NULL;
   424     }
   425     g_strfreev(search_dirs);
   426     if (full_path && kf)
   427 	*full_path=path;
   428     else
   429 	g_free(path);
   430     return kf;
   431 }
   432 
   433 void parse_config_file(void)
   434 {
   435     int i,j;
   436     gchar *path,*s;
   437     gchar **keys;
   438     gboolean sw;
   439     GError *err=NULL;
   440     config=read_config_file(&path);
   441     if (config)
   442 	keys=g_key_file_get_keys(config,"options",NULL,NULL);
   443     else
   444 	keys=NULL;
   445     if (keys)
   446     {
   447 	for(i=0;keys[i];i++)
   448 	{
   449 	    for(j=0;options[j].long_name;j++)
   450 	    {
   451 		if (g_str_has_prefix(options[j].long_name,"no-"))
   452 		    continue;
   453 		else if (!strcmp(keys[i],options[j].long_name))
   454 		{
   455 		    if (options[j].arg==G_OPTION_ARG_NONE)
   456 		    {
   457 			sw=g_key_file_get_boolean(config,"options",keys[i],
   458 			  &err);
   459 			if (err)
   460 			{
   461 			    g_printerr("Bookloupe: %s: options.%s: %s\n",
   462 			      path,keys[i],err->message);
   463 			    g_clear_error(&err);
   464 			}
   465 			else
   466 			{
   467 			    if (options[j].flags&G_OPTION_FLAG_REVERSE)
   468 				sw=!sw;
   469 			    *(gboolean *)options[j].arg_data=sw;
   470 			}
   471 			break;
   472 		    }
   473 		    else if (options[j].arg==G_OPTION_ARG_STRING)
   474 		    {
   475 			s=g_key_file_get_string(config,"options",keys[i],
   476 			  &err);
   477 			if (err)
   478 			{
   479 			    g_printerr("Bookloupe: %s: options.%s: %s\n",
   480 			      path,keys[i],err->message);
   481 			    g_clear_error(&err);
   482 			}
   483 			else
   484 			{
   485 			    g_free(*(gchar **)options[j].arg_data);
   486 			    if (!g_strcmp0(s,"auto"))
   487 			    {
   488 				*(gchar **)options[j].arg_data=NULL;
   489 				g_free(s);
   490 			    }
   491 			    else
   492 				*(gchar **)options[j].arg_data=s;
   493 			}
   494 			break;
   495 		    }
   496 		    else
   497 			g_assert_not_reached();
   498 		}
   499 	    }
   500 	    if (!options[j].long_name)
   501 		g_printerr("Bookloupe: %s: Unknown option \"%s\" ignored\n",
   502 		  path,keys[i]);
   503 	}
   504 	g_strfreev(keys);
   505     }
   506     if (config)
   507 	g_free(path);
   508 }
   509 
   510 void parse_options(int *argc,char ***argv)
   511 {
   512     GError *err=NULL;
   513     GOptionContext *context;
   514     GOptionGroup *compatibility;
   515     context=g_option_context_new(
   516       "file - look for errors in Project Gutenberg(TM) etexts");
   517     g_option_context_add_main_entries(context,options,NULL);
   518     g_option_context_add_main_entries(context,config_options,NULL);
   519     compatibility=g_option_group_new("compatibility",
   520       "Options for Compatibility with Gutcheck:",
   521       "Show compatibility options",NULL,NULL);
   522     g_option_group_add_entries(compatibility,compatibility_options);
   523     g_option_context_add_group(context,compatibility);
   524     g_option_context_set_description(context,
   525       "For simplicity, only the switch options which reverse the\n"
   526       "default configuration are listed. In most cases, both vanilla\n"
   527       "and \"no-\" prefixed versions are available for use.");
   528     if (!g_option_context_parse(context,argc,argv,&err))
   529     {
   530 	g_printerr("Bookloupe: %s\n",err->message);
   531 	g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
   532 	exit(1);
   533     }
   534     if (typo_compat)
   535 	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
   536     if (paranoid_compat)
   537     {
   538 	pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
   539 	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
   540     }
   541     /*
   542      * Web uploads - for the moment, this is really just a placeholder
   543      * until we decide what processing we really want to do on web uploads
   544      */
   545     if (pswit[WEB_SWITCH])
   546     {
   547 	/* specific override for web uploads */
   548 	pswit[ECHO_SWITCH]=TRUE;
   549 	pswit[SQUOTE_SWITCH]=FALSE;
   550 	pswit[TYPO_SWITCH]=TRUE;
   551 	pswit[QPARA_SWITCH]=FALSE;
   552 	pswit[PARANOID_SWITCH]=TRUE;
   553 	pswit[LINE_END_SWITCH]=FALSE;
   554 	pswit[OVERVIEW_SWITCH]=FALSE;
   555 	pswit[STDOUT_SWITCH]=FALSE;
   556 	pswit[HEADER_SWITCH]=TRUE;
   557 	pswit[VERBOSE_SWITCH]=FALSE;
   558 	pswit[MARKUP_SWITCH]=FALSE;
   559 	pswit[USERTYPO_SWITCH]=FALSE;
   560 	pswit[DP_SWITCH]=FALSE;
   561     }
   562     if (opt_charset && !set_charset(opt_charset,&err))
   563     {
   564 	g_printerr("%s\n",err->message);
   565 	exit(1);
   566     }
   567     if (pswit[DUMP_CONFIG_SWITCH])
   568     {
   569 	dump_config();
   570 	exit(0);
   571     }
   572     g_free(opt_charset);
   573     opt_charset=NULL;
   574     if (pswit[OVERVIEW_SWITCH])
   575 	/* just print summary; don't echo */
   576 	pswit[ECHO_SWITCH]=FALSE;
   577     if (*argc<2)
   578     {
   579 	proghelp(context);
   580 	exit(1);
   581     }
   582     g_option_context_free(context);
   583 }
   584 
   585 /*
   586  * read_user_scannos:
   587  *
   588  * Read in the user-defined stealth scanno list.
   589  */
   590 void read_user_scannos(void)
   591 {
   592     GError *err=NULL;
   593     gchar *usertypo_file;
   594     gboolean okay;
   595     int i;
   596     gsize len,nb;
   597     gchar *contents,*utf8,**lines;
   598     usertypo_file=g_strdup("bookloupe.typ");
   599     okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   600     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   601     {
   602 	g_clear_error(&err);
   603 	g_free(usertypo_file);
   604 	usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
   605 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   606     }
   607     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   608     {
   609 	g_clear_error(&err);
   610 	g_free(usertypo_file);
   611 	usertypo_file=g_strdup("gutcheck.typ");
   612 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   613     }
   614     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   615     {
   616 	g_clear_error(&err);
   617 	g_free(usertypo_file);
   618 	usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
   619 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   620     }
   621     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   622     {
   623 	g_free(usertypo_file);
   624 	g_print("   --> I couldn't find bookloupe.typ "
   625 	  "-- proceeding without user typos.\n");
   626 	return;
   627     }
   628     else if (!okay)
   629     {
   630 	fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
   631 	g_free(usertypo_file);
   632 	g_clear_error(&err);
   633 	exit(1);
   634     }
   635     if (g_utf8_validate(contents,len,NULL))
   636     {
   637 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
   638 	if (!charset)
   639 	    (void)set_charset("UNICODE",NULL);
   640     }
   641     else
   642 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
   643     g_free(contents);
   644     lines=g_strsplit_set(utf8,"\r\n",0);
   645     g_free(utf8);
   646     usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
   647     for (i=0;lines[i];i++)
   648 	if (*(unsigned char *)lines[i]>'!')
   649 	    g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
   650 	else
   651 	    g_free(lines[i]);
   652     g_free(lines);
   653 }
   654 
   655 /*
   656  * read_etext:
   657  *
   658  * Read an etext returning a newly allocated string containing the file
   659  * contents or NULL on error.
   660  */
   661 gchar *read_etext(const char *filename,GError **err)
   662 {
   663     GError *tmp_err=NULL;
   664     gchar *contents,*utf8;
   665     gsize len,bytes_read,bytes_written;
   666     int i,line,col;
   667     if (!g_file_get_contents(filename,&contents,&len,err))
   668 	return NULL;
   669     if (g_utf8_validate(contents,len,NULL))
   670     {
   671 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
   672 	g_set_print_handler(print_as_utf_8);
   673 #ifdef __WIN32__
   674 	SetConsoleOutputCP(CP_UTF8);
   675 #endif
   676     }
   677     else
   678     {
   679 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,
   680 	  &bytes_written,&tmp_err);
   681 	if (g_error_matches(tmp_err,G_CONVERT_ERROR,
   682 	  G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
   683 	{
   684 	    line=col=1;
   685 	    for(i=0;i<bytes_read;i++)
   686 		if (contents[i]=='\n')
   687 		{
   688 		    line++;
   689 		    col=1;
   690 		}
   691 		else if (contents[i]!='\r')
   692 		    col++;
   693 	    g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
   694 	      "Input conversion failed. Byte %d at line %d, column %d is not a "
   695 	      "valid Windows-1252 character",
   696 	      ((unsigned char *)contents)[bytes_read],line,col);
   697 	}
   698 	else if (tmp_err)
   699 	    g_propagate_error(err,tmp_err);
   700 	g_set_print_handler(print_as_windows_1252);
   701 #ifdef __WIN32__
   702 	SetConsoleOutputCP(1252);
   703 #endif
   704     }
   705     g_free(contents);
   706     return utf8;
   707 }
   708 
   709 void cleanup_on_exit(void)
   710 {
   711 #ifdef __WIN32__
   712     SetConsoleOutputCP(saved_cp);
   713 #endif
   714 }
   715 
   716 int main(int argc,char **argv)
   717 {
   718 #ifdef __WIN32__
   719     atexit(cleanup_on_exit);
   720     saved_cp=GetConsoleOutputCP();
   721 #endif
   722     running_from=g_path_get_dirname(argv[0]);
   723     /* Paranoid checking is turned OFF, not on, by its switch */
   724     pswit[PARANOID_SWITCH]=TRUE;
   725     /* if running in paranoid mode, typo checks default to enabled */
   726     pswit[TYPO_SWITCH]=TRUE;
   727     /* Line-end checking is turned OFF, not on, by its switch */
   728     pswit[LINE_END_SWITCH]=TRUE;
   729     /* Echoing is turned OFF, not on, by its switch */
   730     pswit[ECHO_SWITCH]=TRUE;
   731     parse_config_file();
   732     parse_options(&argc,&argv);
   733     if (pswit[USERTYPO_SWITCH])
   734 	read_user_scannos();
   735     fprintf(stderr,"bookloupe: Check and report on an e-text\n");
   736     procfile(argv[1]);
   737     if (pswit[OVERVIEW_SWITCH])
   738     {
   739 	g_print("    Checked %ld lines of %ld (head+foot = %ld)\n\n",
   740 	  checked_linecnt,linecnt,linecnt-checked_linecnt);
   741 	g_print("    --------------- Queries found --------------\n");
   742 	if (cnt_long)
   743 	    g_print("    Long lines:		    %14ld\n",cnt_long);
   744 	if (cnt_short)
   745 	    g_print("    Short lines:		   %14ld\n",cnt_short);
   746 	if (cnt_lineend)
   747 	    g_print("    Line-end problems:	     %14ld\n",cnt_lineend);
   748 	if (cnt_word)
   749 	    g_print("    Common typos:		  %14ld\n",cnt_word);
   750 	if (cnt_dquot)
   751 	    g_print("    Unmatched quotes:	      %14ld\n",cnt_dquot);
   752 	if (cnt_squot)
   753 	    g_print("    Unmatched SingleQuotes:	%14ld\n",cnt_squot);
   754 	if (cnt_brack)
   755 	    g_print("    Unmatched brackets:	    %14ld\n",cnt_brack);
   756 	if (cnt_bin)
   757 	    g_print("    Non-ASCII characters:	  %14ld\n",cnt_bin);
   758 	if (cnt_odd)
   759 	    g_print("    Proofing characters:	   %14ld\n",cnt_odd);
   760 	if (cnt_punct)
   761 	    g_print("    Punctuation & spacing queries: %14ld\n",cnt_punct);
   762 	if (cnt_dash)
   763 	    g_print("    Non-standard dashes:	   %14ld\n",cnt_dash);
   764 	if (cnt_html)
   765 	    g_print("    Possible HTML tags:	    %14ld\n",cnt_html);
   766 	g_print("\n");
   767 	g_print("    TOTAL QUERIES		  %14ld\n",
   768 	  cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+
   769 	  cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend);
   770     }
   771     g_free(running_from);
   772     if (usertypo)
   773 	g_tree_unref(usertypo);
   774     set_charset(NULL,NULL);
   775     if (config)
   776 	g_key_file_free(config);
   777     return 0;
   778 }
   779 
   780 /*
   781  * first_pass:
   782  *
   783  * Run a first pass - verify that it's a valid PG
   784  * file, decide whether to report some things that
   785  * occur many times in the text like long or short
   786  * lines, non-standard dashes, etc.
   787  */
   788 struct first_pass_results *first_pass(const char *etext)
   789 {
   790     gunichar laststart=CHAR_SPACE;
   791     const char *s;
   792     gchar *lc_line;
   793     int i,j,lbytes,llen;
   794     gchar **lines;
   795     unsigned int lastlen=0,lastblen=0;
   796     long spline=0,nspline=0;
   797     static struct first_pass_results results={0};
   798     gchar *inword;
   799     lines=g_strsplit(etext,"\n",0);
   800     for (j=0;lines[j];j++)
   801     {
   802 	lbytes=strlen(lines[j]);
   803 	while (lbytes>0 && lines[j][lbytes-1]=='\r')
   804 	    lines[j][--lbytes]='\0';
   805 	llen=g_utf8_strlen(lines[j],lbytes);
   806 	linecnt++;
   807 	if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
   808 	  (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
   809 	{
   810 	    if (spline)
   811 		g_print("   --> Duplicate header?\n");
   812 	    spline=linecnt+1;   /* first line of non-header text, that is */
   813 	}
   814 	if (!strncmp(lines[j],"*** START",9) &&
   815 	  strstr(lines[j],"PROJECT GUTENBERG"))
   816 	{
   817 	    if (nspline)
   818 		g_print("   --> Duplicate header?\n");
   819 	    nspline=linecnt+1;   /* first line of non-header text, that is */
   820 	}
   821 	if (spline || nspline)
   822 	{
   823 	    lc_line=g_utf8_strdown(lines[j],lbytes);
   824 	    if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
   825 	    {
   826 		if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
   827 		{
   828 		    if (results.footerline)
   829 		    {
   830 			/* it's an old-form header - we can detect duplicates */
   831 			if (!nspline)
   832 			    g_print("   --> Duplicate footer?\n");
   833 		    }
   834 		    else
   835 			results.footerline=linecnt;
   836 		}
   837 	    }
   838 	    g_free(lc_line);
   839 	}
   840 	if (spline)
   841 	    results.firstline=spline;
   842 	if (nspline)
   843 	    results.firstline=nspline;  /* override with new */
   844 	if (results.footerline)
   845 	    continue;    /* don't count the boilerplate in the footer */
   846 	results.totlen+=llen;
   847 	for (s=lines[j];*s;s=g_utf8_next_char(s))
   848 	{
   849 	    if (g_utf8_get_char(s)>127)
   850 		results.binlen++;
   851 	    if (g_unichar_isalpha(g_utf8_get_char(s)))
   852 		results.alphalen++;
   853 	    if (s>lines[j] && g_utf8_get_char(s)==CHAR_DQUOTE &&
   854 	      isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
   855 		results.endquote_count++;
   856 	}
   857 	if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
   858 	  lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
   859 	    results.shortline++;
   860 	if (lbytes>0 &&
   861 	  g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
   862 	    cnt_spacend++;
   863 	if (strstr(lines[j],".,"))
   864 	    results.dotcomma++;
   865 	/* only count ast lines for ignoring purposes where there is */
   866 	/* locase text on the line */
   867 	if (strchr(lines[j],'*'))
   868 	{
   869 	    for (s=lines[j];*s;s=g_utf8_next_char(s))
   870 		if (g_unichar_islower(g_utf8_get_char(s)))
   871 		    break;
   872 	    if (*s)
   873 		results.astline++;
   874 	}
   875 	if (strchr(lines[j],'/'))
   876 	    results.fslashline++;
   877 	if (lbytes>0)
   878 	{
   879 	    for (s=g_utf8_prev_char(lines[j]+lbytes);
   880 	      s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;
   881 	      s=g_utf8_prev_char(s))
   882 		;
   883 	    if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
   884 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')
   885 		results.hyphens++;
   886 	}
   887 	if (llen>LONGEST_PG_LINE)
   888 	    results.longline++;
   889 	if (llen>WAY_TOO_LONG)
   890 	    results.verylongline++;
   891 	if (strchr(lines[j],'<') && strchr(lines[j],'>'))
   892 	{
   893 	    i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
   894 	    if (i>0)
   895 		results.htmcount++;
   896 	    if (strstr(lines[j],"<i>"))
   897 		results.htmcount+=4; /* bonus marks! */
   898 	}
   899 	/* Check for spaced em-dashes */
   900 	if (lines[j][0] && (s=strstr(g_utf8_next_char(lines[j]),"--")))
   901 	{
   902 	    results.emdash++;
   903 	    if (s[-1]==CHAR_SPACE || s[2]==CHAR_SPACE)
   904 		results.space_emdash++;
   905 	    if (s[-1]==CHAR_SPACE && s[2]==CHAR_SPACE)
   906 		/* count of em-dashes with spaces both sides */
   907 		results.non_PG_space_emdash++;
   908 	    if (s[-1]!=CHAR_SPACE && s[2]!=CHAR_SPACE)
   909 		/* count of PG-type em-dashes with no spaces */
   910 		results.PG_space_emdash++;
   911 	}
   912 	for (s=lines[j];*s;)
   913 	{
   914 	    inword=getaword(&s);
   915 	    if (!strcmp(inword,"hij") || !strcmp(inword,"niet")) 
   916 		results.Dutchcount++;
   917 	    if (!strcmp(inword,"dans") || !strcmp(inword,"avec")) 
   918 		results.Frenchcount++;
   919 	    if (!strcmp(inword,"0") || !strcmp(inword,"1")) 
   920 		results.standalone_digit++;
   921 	    g_free(inword);
   922 	}
   923 	/* Check for spaced dashes */
   924 	if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
   925 	    results.spacedash++;
   926 	lastblen=lastlen;
   927 	lastlen=llen;
   928 	laststart=lines[j][0];
   929     }
   930     g_strfreev(lines);
   931     return &results;
   932 }
   933 
   934 /*
   935  * report_first_pass:
   936  *
   937  * Make some snap decisions based on the first pass results.
   938  */
   939 struct warnings *report_first_pass(struct first_pass_results *results)
   940 {
   941     static struct warnings warnings={0};
   942     if (cnt_spacend>0)
   943 	g_print("   --> %ld lines in this file have white space at end\n",
   944 	  cnt_spacend);
   945     warnings.dotcomma=1;
   946     if (results->dotcomma>5)
   947     {
   948 	warnings.dotcomma=0;
   949 	g_print("   --> %ld lines in this file contain '.,'. "
   950 	  "Not reporting them.\n",results->dotcomma);
   951     }
   952     /*
   953      * If more than 50 lines, or one-tenth, are short,
   954      * don't bother reporting them.
   955      */
   956     warnings.shortline=1;
   957     if (results->shortline>50 || results->shortline*10>linecnt)
   958     {
   959 	warnings.shortline=0;
   960 	g_print("   --> %ld lines in this file are short. "
   961 	  "Not reporting short lines.\n",results->shortline);
   962     }
   963     /*
   964      * If more than 50 lines, or one-tenth, are long,
   965      * don't bother reporting them.
   966      */
   967     warnings.longline=1;
   968     if (results->longline>50 || results->longline*10>linecnt)
   969     {
   970 	warnings.longline=0;
   971 	g_print("   --> %ld lines in this file are long. "
   972 	  "Not reporting long lines.\n",results->longline);
   973     }
   974     /* If more than 10 lines contain asterisks, don't bother reporting them. */
   975     warnings.ast=1;
   976     if (results->astline>10)
   977     {
   978 	warnings.ast=0;
   979 	g_print("   --> %ld lines in this file contain asterisks. "
   980 	  "Not reporting them.\n",results->astline);
   981     }
   982     /*
   983      * If more than 10 lines contain forward slashes,
   984      * don't bother reporting them.
   985      */
   986     warnings.fslash=1;
   987     if (results->fslashline>10)
   988     {
   989 	warnings.fslash=0;
   990 	g_print("   --> %ld lines in this file contain forward slashes. "
   991 	  "Not reporting them.\n",results->fslashline);
   992     }
   993     /*
   994      * If more than 20 lines contain unpunctuated endquotes,
   995      * don't bother reporting them.
   996      */
   997     warnings.endquote=1;
   998     if (results->endquote_count>20)
   999     {
  1000 	warnings.endquote=0;
  1001 	g_print("   --> %ld lines in this file contain unpunctuated endquotes. "
  1002 	  "Not reporting them.\n",results->endquote_count);
  1003     }
  1004     /*
  1005      * If more than 15 lines contain standalone digits,
  1006      * don't bother reporting them.
  1007      */
  1008     warnings.digit=1;
  1009     if (results->standalone_digit>10)
  1010     {
  1011 	warnings.digit=0;
  1012 	g_print("   --> %ld lines in this file contain standalone 0s and 1s. "
  1013 	  "Not reporting them.\n",results->standalone_digit);
  1014     }
  1015     /*
  1016      * If more than 20 lines contain hyphens at end,
  1017      * don't bother reporting them.
  1018      */
  1019     warnings.hyphen=1;
  1020     if (results->hyphens>20)
  1021     {
  1022 	warnings.hyphen=0;
  1023 	g_print("   --> %ld lines in this file have hyphens at end. "
  1024 	  "Not reporting them.\n",results->hyphens);
  1025     }
  1026     if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
  1027     {
  1028 	g_print("   --> Looks like this is HTML. Switching HTML mode ON.\n");
  1029 	pswit[MARKUP_SWITCH]=1;
  1030     }
  1031     if (results->verylongline>0)
  1032 	g_print("   --> %ld lines in this file are VERY long!\n",
  1033 	  results->verylongline);
  1034     /*
  1035      * If there are more non-PG spaced dashes than PG em-dashes,
  1036      * assume it's deliberate.
  1037      * Current PG guidelines say don't use them, but older texts do,
  1038      * and some people insist on them whatever the guidelines say.
  1039      */
  1040     warnings.dash=1;
  1041     if (results->spacedash+results->non_PG_space_emdash>
  1042       results->PG_space_emdash)
  1043     {
  1044 	warnings.dash=0;
  1045 	g_print("   --> There are %ld spaced dashes and em-dashes. "
  1046 	  "Not reporting them.\n",
  1047 	  results->spacedash+results->non_PG_space_emdash);
  1048     }
  1049     if (charset)
  1050 	warnings.bin=0;
  1051     else
  1052     {
  1053 	/* Charset ISO_8859-1/ASCII checks for compatibility with gutcheck */
  1054 	warnings.bin=1;
  1055 	/* If more than a quarter of characters are hi-bit, bug out. */
  1056 	if (results->binlen*4>results->totlen)
  1057 	{
  1058 	    g_print("   --> This file does not appear to be ASCII. "
  1059 	      "Terminating. Best of luck with it!\n");
  1060 	    exit(1);
  1061 	}
  1062 	if (results->alphalen*4<results->totlen)
  1063 	{
  1064 	    g_print("   --> This file does not appear to be text. "
  1065 	      "Terminating. Best of luck with it!\n");
  1066 	    exit(1);
  1067 	}
  1068 	if (results->binlen*100>results->totlen || results->binlen>100)
  1069 	{
  1070 	    g_print("   --> There are a lot of foreign letters here. "
  1071 	      "Not reporting them.\n");
  1072 	    if (!pswit[VERBOSE_SWITCH])
  1073 		warnings.bin=0;
  1074 	}
  1075     }
  1076     warnings.isDutch=FALSE;
  1077     if (results->Dutchcount>50)
  1078     {
  1079 	warnings.isDutch=TRUE;
  1080 	g_print("   --> This looks like Dutch - "
  1081 	  "switching off dashes and warnings for 's Middags case.\n");
  1082     }
  1083     warnings.isFrench=FALSE;
  1084     if (results->Frenchcount>50)
  1085     {
  1086 	warnings.isFrench=TRUE;
  1087 	g_print("   --> This looks like French - "
  1088 	  "switching off some doublepunct.\n");
  1089     }
  1090     if (results->firstline && results->footerline)
  1091 	g_print("    The PG header and footer appear to be already on.\n");
  1092     else
  1093     {
  1094 	if (results->firstline)
  1095 	    g_print("    The PG header is on - no footer.\n");
  1096 	if (results->footerline)
  1097 	    g_print("    The PG footer is on - no header.\n");
  1098     }
  1099     g_print("\n");
  1100     if (pswit[VERBOSE_SWITCH])
  1101     {
  1102 	warnings.shortline=1;
  1103 	warnings.dotcomma=1;
  1104 	warnings.longline=1;
  1105 	warnings.dash=1;
  1106 	warnings.digit=1;
  1107 	warnings.ast=1;
  1108 	warnings.fslash=1;
  1109 	warnings.hyphen=1;
  1110 	warnings.endquote=1;
  1111 	g_print("   *** Verbose output is ON -- you asked for it! ***\n");
  1112     }
  1113     if (warnings.isDutch)
  1114 	warnings.dash=0;
  1115     if (results->footerline>0 && results->firstline>0 &&
  1116       results->footerline>results->firstline &&
  1117       results->footerline-results->firstline<100)
  1118     {
  1119 	g_print("   --> I don't really know where this text starts. \n");
  1120 	g_print("       There are no reference points.\n");
  1121 	g_print("       I'm going to have to report the header and footer "
  1122 	  "as well.\n");
  1123 	results->firstline=0;
  1124     }
  1125     return &warnings;
  1126 }
  1127 
  1128 /*
  1129  * analyse_quotes:
  1130  *
  1131  * Look along the line, accumulate the count of quotes, and see
  1132  * if this is an empty line - i.e. a line with nothing on it
  1133  * but spaces.
  1134  * If line has just spaces, period, * and/or - on it, don't
  1135  * count it, since empty lines with asterisks or dashes to
  1136  * separate sections are common.
  1137  *
  1138  * Returns: TRUE if the line is empty.
  1139  */
  1140 gboolean analyse_quotes(const char *aline,struct counters *counters)
  1141 {
  1142     int guessquote=0;
  1143     /* assume the line is empty until proven otherwise */
  1144     gboolean isemptyline=TRUE;
  1145     const char *s=aline,*sprev,*snext;
  1146     gunichar c;
  1147     sprev=NULL;
  1148     while (*s)
  1149     {
  1150 	snext=g_utf8_next_char(s);
  1151 	c=g_utf8_get_char(s);
  1152 	if (c==CHAR_DQUOTE)
  1153 	    counters->quot++;
  1154 	if (CHAR_IS_SQUOTE(c))
  1155 	{
  1156 	    if (s==aline)
  1157 	    {
  1158 		/*
  1159 		 * At start of line, it can only be an openquote.
  1160 		 * Hardcode a very common exception!
  1161 		 */
  1162 		if (!g_str_has_prefix(snext,"tis") &&
  1163 		  !g_str_has_prefix(snext,"Tis"))
  1164 		    increment_matching(counters,c,TRUE);
  1165 	    }
  1166 	    else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
  1167 	      g_unichar_isalpha(g_utf8_get_char(snext)))
  1168 		/* Do nothing! it's definitely an apostrophe, not a quote */
  1169 		;
  1170 	    /* it's outside a word - let's check it out */
  1171 	    else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||
  1172 	      g_unichar_isalpha(g_utf8_get_char(snext)))
  1173 	    {
  1174 		/* it damwell better BE an openquote */
  1175 		if (!g_str_has_prefix(snext,"tis") &&
  1176 		  !g_str_has_prefix(snext,"Tis"))
  1177 		    /* hardcode a very common exception! */
  1178 		    increment_matching(counters,c,TRUE);
  1179 	    }
  1180 	    else
  1181 	    {
  1182 		/* now - is it a closequote? */
  1183 		guessquote=0;   /* accumulate clues */
  1184 		if (g_unichar_isalpha(g_utf8_get_char(sprev)))
  1185 		{
  1186 		    /* it follows a letter - could be either */
  1187 		    guessquote++;
  1188 		    if (g_utf8_get_char(sprev)=='s')
  1189 		    {
  1190 			/* looks like a plural apostrophe */
  1191 			guessquote-=3;
  1192 			if (g_utf8_get_char(snext)==CHAR_SPACE)
  1193 			    /* bonus marks! */
  1194 			    guessquote-=2;
  1195 		    }
  1196 		}
  1197 		/* it doesn't have a letter either side */
  1198 		else if (strchr(".?!,;:",g_utf8_get_char(sprev)) &&
  1199 		  strchr(".?!,;: ",g_utf8_get_char(snext)))
  1200 		    guessquote+=8; /* looks like a closequote */
  1201 		else
  1202 		    guessquote++;
  1203 		if (matching_difference(counters,CHAR_SQUOTE)>0)
  1204 		    /*
  1205 		     * Give it the benefit of some doubt,
  1206 		     * if a squote is already open.
  1207 		     */
  1208 		    guessquote++;
  1209 		else
  1210 		    guessquote--;
  1211 		if (guessquote>=0)
  1212 		    increment_matching(counters,c,FALSE);
  1213 	    }
  1214 	}
  1215 	if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
  1216 	  c!='\r' && c!='\n')
  1217 	    isemptyline=FALSE;  /* ignore lines like  *  *  *  as spacers */
  1218 	if (c==CHAR_UNDERSCORE)
  1219 	    counters->c_unders++;
  1220 	if (c==CHAR_OPEN_SBRACK)
  1221 	{
  1222 	    if (!matching_difference(counters,COUNTER_ILLUSTRATION) &&
  1223 	      !matching_difference(counters,c) && s==aline &&
  1224 	      g_str_has_prefix(s,"[Illustration:"))
  1225 		increment_matching(counters,COUNTER_ILLUSTRATION,TRUE);
  1226 	    else
  1227 		increment_matching(counters,c,TRUE);
  1228 	}
  1229 	else if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK)
  1230 	    increment_matching(counters,c,TRUE);
  1231 	if (c==CHAR_CLOSE_SBRACK)
  1232 	{
  1233 	    if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) &&
  1234 	      !matching_difference(counters,c) && !*snext)
  1235 		increment_matching(counters,COUNTER_ILLUSTRATION,FALSE);
  1236 	    else
  1237 		increment_matching(counters,c,FALSE);
  1238 	}
  1239 	else if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK)
  1240 	    increment_matching(counters,c,FALSE);
  1241 	sprev=s;
  1242 	s=snext;
  1243     }
  1244     return isemptyline;
  1245 }
  1246 
  1247 /*
  1248  * check_for_control_characters:
  1249  *
  1250  * Check for invalid or questionable characters in the line
  1251  * Anything above 127 is invalid for plain ASCII, and
  1252  * non-printable control characters should also be flagged.
  1253  * Tabs should generally not be there.
  1254  */
  1255 void check_for_control_characters(const char *aline)
  1256 {
  1257     gunichar c;
  1258     const char *s;
  1259     for (s=aline;*s;s=g_utf8_next_char(s))
  1260     {
  1261 	c=g_utf8_get_char(s);
  1262 	if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
  1263 	{
  1264 	    if (pswit[ECHO_SWITCH])
  1265 		g_print("\n%s\n",aline);
  1266 	    if (!pswit[OVERVIEW_SWITCH])
  1267 		g_print("    Line %ld column %ld - Control character %u\n",
  1268 		  linecnt,g_utf8_pointer_to_offset(s,aline)+1,c);
  1269 	    else
  1270 		cnt_bin++;
  1271 	}
  1272     }
  1273 }
  1274 
  1275 /*
  1276  * check_for_odd_characters:
  1277  *
  1278  * Check for binary and other odd characters.
  1279  */
  1280 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
  1281   gboolean isemptyline)
  1282 {
  1283     /* Don't repeat multiple warnings on one line. */
  1284     gboolean eInvalidChar=FALSE,eTab=FALSE,eTilde=FALSE;
  1285     gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
  1286     const char *s;
  1287     gunichar c;
  1288     gsize nb;
  1289     gchar *t;
  1290     for (s=aline;*s;s=g_utf8_next_char(s))
  1291     {
  1292 	c=g_utf8_get_char(s);
  1293 	if (warnings->bin && !eInvalidChar &&
  1294 	  (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
  1295 	{
  1296 	    if (pswit[ECHO_SWITCH])
  1297 		g_print("\n%s\n",aline);
  1298 	    if (!pswit[OVERVIEW_SWITCH])
  1299 		if (c>127 && c<160 || c>255)
  1300 		    g_print("    Line %ld column %ld - "
  1301 		      "Non-ISO-8859 character %u\n",
  1302 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1303 		else
  1304 		    g_print("    Line %ld column %ld - "
  1305 		      "Non-ASCII character %u\n",
  1306 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1307 	    else
  1308 		cnt_bin++;
  1309 	    eInvalidChar=TRUE;
  1310 	}
  1311 	if (!eInvalidChar && charset)
  1312 	{
  1313 	    if (charset_validator==(GIConv)-1)
  1314 	    {
  1315 		if (!g_unichar_isdefined(c))
  1316 		{
  1317 		    if (pswit[ECHO_SWITCH])
  1318 			g_print("\n%s\n",aline);
  1319 		    if (!pswit[OVERVIEW_SWITCH])
  1320 			g_print("    Line %ld column %ld - Unassigned UNICODE "
  1321 			  "code point U+%04" G_GINT32_MODIFIER "X\n",
  1322 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1323 		    else
  1324 			cnt_bin++;
  1325 		    eInvalidChar=TRUE;
  1326 		}
  1327 		else if (c>=0xE000 && c<=0xF8FF || c>=0xF0000 && c<=0xFFFFD ||
  1328 		  c>=100000 && c<=0x10FFFD)
  1329 		{
  1330 		    if (pswit[ECHO_SWITCH])
  1331 			g_print("\n%s\n",aline);
  1332 		    if (!pswit[OVERVIEW_SWITCH])
  1333 			g_print("    Line %ld column %ld - Private Use "
  1334 			  "character U+%04" G_GINT32_MODIFIER "X\n",
  1335 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1336 		    else
  1337 			cnt_bin++;
  1338 		    eInvalidChar=TRUE;
  1339 		}
  1340 	    }
  1341 	    else
  1342 	    {
  1343 		t=g_convert_with_iconv(s,g_utf8_next_char(s)-s,
  1344 		  charset_validator,NULL,&nb,NULL);
  1345 		if (t)
  1346 		    g_free(t);
  1347 		else
  1348 		{
  1349 		    if (pswit[ECHO_SWITCH])
  1350 			g_print("\n%s\n",aline);
  1351 		    if (!pswit[OVERVIEW_SWITCH])
  1352 			g_print("    Line %ld column %ld - Non-%s "
  1353 			  "character %u\n",linecnt,
  1354 			  g_utf8_pointer_to_offset(aline,s)+1,charset,c);
  1355 		    else
  1356 			cnt_bin++;
  1357 		    eInvalidChar=TRUE;
  1358 		}
  1359 	    }
  1360 	}
  1361 	if (!eTab && c==CHAR_TAB)
  1362 	{
  1363 	    if (pswit[ECHO_SWITCH])
  1364 		g_print("\n%s\n",aline);
  1365 	    if (!pswit[OVERVIEW_SWITCH])
  1366 		g_print("    Line %ld column %ld - Tab character?\n",
  1367 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1368 	    else
  1369 		cnt_odd++;
  1370 	    eTab=TRUE;
  1371 	}
  1372 	if (!eTilde && c==CHAR_TILDE)
  1373 	{
  1374 	    /*
  1375 	     * Often used by OCR software to indicate an
  1376 	     * unrecognizable character.
  1377 	     */
  1378 	    if (pswit[ECHO_SWITCH])
  1379 		g_print("\n%s\n",aline);
  1380 	    if (!pswit[OVERVIEW_SWITCH])
  1381 		g_print("    Line %ld column %ld - Tilde character?\n",
  1382 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1383 	    else
  1384 		cnt_odd++;
  1385 	    eTilde=TRUE;
  1386 	}
  1387 	if (!eCarat && c==CHAR_CARAT)
  1388 	{  
  1389 	    if (pswit[ECHO_SWITCH])
  1390 		g_print("\n%s\n",aline);
  1391 	    if (!pswit[OVERVIEW_SWITCH])
  1392 		g_print("    Line %ld column %ld - Carat character?\n",
  1393 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1394 	    else
  1395 		cnt_odd++;
  1396 	    eCarat=TRUE;
  1397 	}
  1398 	if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
  1399 	{  
  1400 	    if (pswit[ECHO_SWITCH])
  1401 		g_print("\n%s\n",aline);
  1402 	    if (!pswit[OVERVIEW_SWITCH])
  1403 		g_print("    Line %ld column %ld - Forward slash?\n",
  1404 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1405 	    else
  1406 		cnt_odd++;
  1407 	    eFSlash=TRUE;
  1408 	}
  1409 	/*
  1410 	 * Report asterisks only in paranoid mode,
  1411 	 * since they're often deliberate.
  1412 	 */
  1413 	if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
  1414 	  c==CHAR_ASTERISK)
  1415 	{
  1416 	    if (pswit[ECHO_SWITCH])
  1417 		g_print("\n%s\n",aline);
  1418 	    if (!pswit[OVERVIEW_SWITCH])
  1419 		g_print("    Line %ld column %ld - Asterisk?\n",
  1420 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1421 	    else
  1422 		cnt_odd++;
  1423 	    eAst=TRUE;
  1424 	}
  1425     }
  1426 }
  1427 
  1428 /*
  1429  * check_for_long_line:
  1430  *
  1431  * Check for line too long.
  1432  */
  1433 void check_for_long_line(const char *aline)
  1434 {
  1435     if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
  1436     {
  1437 	if (pswit[ECHO_SWITCH])
  1438 	    g_print("\n%s\n",aline);
  1439 	if (!pswit[OVERVIEW_SWITCH])
  1440 	    g_print("    Line %ld column %ld - Long line %ld\n",
  1441 	      linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
  1442 	else
  1443 	    cnt_long++;
  1444     }
  1445 }
  1446 
  1447 /*
  1448  * check_for_short_line:
  1449  *
  1450  * Check for line too short.
  1451  *
  1452  * This one is a bit trickier to implement: we don't want to
  1453  * flag the last line of a paragraph for being short, so we
  1454  * have to wait until we know that our current line is a
  1455  * "normal" line, then report the _previous_ line if it was too
  1456  * short. We also don't want to report indented lines like
  1457  * chapter heads or formatted quotations. We therefore keep
  1458  * last->len as the length of the last line examined, and
  1459  * last->blen as the length of the last but one, and try to
  1460  * suppress unnecessary warnings by checking that both were of
  1461  * "normal" length. We keep the first character of the last
  1462  * line in last->start, and if it was a space, we assume that
  1463  * the formatting is deliberate. I can't figure out a way to
  1464  * distinguish something like a quoted verse left-aligned or
  1465  * the header or footer of a letter from a paragraph of short
  1466  * lines - maybe if I examined the whole paragraph, and if the
  1467  * para has less than, say, 8 lines and if all lines are short,
  1468  * then just assume it's OK? Need to look at some texts to see
  1469  * how often a formula like this would get the right result.
  1470  */
  1471 void check_for_short_line(const char *aline,const struct line_properties *last)
  1472 {
  1473     if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
  1474       last->len<SHORTEST_PG_LINE && last->blen>1 &&
  1475       last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
  1476     {
  1477 	if (pswit[ECHO_SWITCH])
  1478 	    g_print("\n%s\n",prevline);
  1479 	if (!pswit[OVERVIEW_SWITCH])
  1480 	    g_print("    Line %ld column %ld - Short line %ld?\n",
  1481 	      linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
  1482 	else
  1483 	    cnt_short++;
  1484     }
  1485 }
  1486 
  1487 /*
  1488  * check_for_starting_punctuation:
  1489  *
  1490  * Look for punctuation other than full ellipses at start of line.
  1491  */
  1492 void check_for_starting_punctuation(const char *aline)
  1493 {
  1494     if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
  1495       !g_str_has_prefix(aline,". . ."))
  1496     {
  1497 	if (pswit[ECHO_SWITCH])
  1498 	    g_print("\n%s\n",aline);
  1499 	if (!pswit[OVERVIEW_SWITCH])
  1500 	    g_print("    Line %ld column 1 - Begins with punctuation?\n",
  1501 	      linecnt);
  1502 	else
  1503 	    cnt_punct++;
  1504     }
  1505 }
  1506 
  1507 /*
  1508  * check_for_spaced_emdash:
  1509  *
  1510  * Check for spaced em-dashes.
  1511  *
  1512  * We must check _all_ occurrences of "--" on the line
  1513  * hence the loop - even if the first double-dash is OK
  1514  * there may be another that's wrong later on.
  1515  */
  1516 void check_for_spaced_emdash(const char *aline)
  1517 {
  1518     const char *s,*t,*next;
  1519     for (s=aline;t=strstr(s,"--");s=next)
  1520     {
  1521 	next=g_utf8_next_char(g_utf8_next_char(t));
  1522 	if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
  1523 	  g_utf8_get_char(next)==CHAR_SPACE)
  1524 	{
  1525 	    if (pswit[ECHO_SWITCH])
  1526 		g_print("\n%s\n",aline);
  1527 	    if (!pswit[OVERVIEW_SWITCH])
  1528 		g_print("    Line %ld column %ld - Spaced em-dash?\n",
  1529 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  1530 	    else
  1531 		cnt_dash++;
  1532 	}
  1533     }
  1534 }
  1535 
  1536 /*
  1537  * check_for_spaced_dash:
  1538  *
  1539  * Check for spaced dashes.
  1540  */
  1541 void check_for_spaced_dash(const char *aline)
  1542 {
  1543     const char *s;
  1544     if ((s=strstr(aline," -")))
  1545     {
  1546 	if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
  1547 	{
  1548 	    if (pswit[ECHO_SWITCH])
  1549 		g_print("\n%s\n",aline);
  1550 	    if (!pswit[OVERVIEW_SWITCH])
  1551 		g_print("    Line %ld column %ld - Spaced dash?\n",
  1552 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1553 	    else
  1554 		cnt_dash++;
  1555 	}
  1556     }
  1557     else if ((s=strstr(aline,"- ")))
  1558     {
  1559 	if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
  1560 	{
  1561 	    if (pswit[ECHO_SWITCH])
  1562 		g_print("\n%s\n",aline);
  1563 	    if (!pswit[OVERVIEW_SWITCH])
  1564 		g_print("    Line %ld column %ld - Spaced dash?\n",
  1565 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1566 	    else
  1567 		cnt_dash++;
  1568 	}
  1569     }
  1570 }
  1571 
  1572 /*
  1573  * check_for_unmarked_paragraphs:
  1574  *
  1575  * Check for unmarked paragraphs indicated by separate speakers.
  1576  *
  1577  * May well be false positive:
  1578  * "Bravo!" "Wonderful!" called the crowd.
  1579  * but useful all the same.
  1580  */
  1581 void check_for_unmarked_paragraphs(const char *aline)
  1582 {
  1583     const char *s;
  1584     s=strstr(aline,"\"  \"");
  1585     if (!s)
  1586 	s=strstr(aline,"\" \"");
  1587     if (s)
  1588     {
  1589 	if (pswit[ECHO_SWITCH])
  1590 	    g_print("\n%s\n",aline);
  1591 	if (!pswit[OVERVIEW_SWITCH])
  1592 	    g_print("    Line %ld column %ld - "
  1593 	      "Query missing paragraph break?\n",
  1594 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1595 	else
  1596 	    cnt_punct++;
  1597     }
  1598 }
  1599 
  1600 /*
  1601  * check_for_jeebies:
  1602  *
  1603  * Check for "to he" and other easy h/b errors.
  1604  *
  1605  * This is a very inadequate effort on the h/b problem,
  1606  * but the phrase "to he" is always an error, whereas "to
  1607  * be" is quite common.
  1608  * Similarly, '"Quiet!", be said.' is a non-be error
  1609  * "to he" is _not_ always an error!:
  1610  *       "Where they went to he couldn't say."
  1611  * Another false positive:
  1612  *       What would "Cinderella" be without the . . .
  1613  * and another: "If he wants to he can see for himself."
  1614  */
  1615 void check_for_jeebies(const char *aline)
  1616 {
  1617     const char *s;
  1618     s=strstr(aline," be could ");
  1619     if (!s)
  1620 	s=strstr(aline," be would ");
  1621     if (!s)
  1622 	s=strstr(aline," was be ");
  1623     if (!s)
  1624 	s=strstr(aline," be is ");
  1625     if (!s)
  1626 	s=strstr(aline," is be ");
  1627     if (!s)
  1628 	s=strstr(aline,"\", be ");
  1629     if (!s)
  1630 	s=strstr(aline,"\" be ");
  1631     if (!s)
  1632 	s=strstr(aline,"\" be ");
  1633     if (!s)
  1634 	s=strstr(aline," to he ");
  1635     if (s)
  1636     {
  1637 	if (pswit[ECHO_SWITCH])
  1638 	    g_print("\n%s\n",aline);
  1639 	if (!pswit[OVERVIEW_SWITCH])
  1640 	    g_print("    Line %ld column %ld - Query he/be error?\n",
  1641 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1642 	else
  1643 	    cnt_word++;
  1644     }
  1645     s=strstr(aline," the had ");
  1646     if (!s)
  1647 	s=strstr(aline," a had ");
  1648     if (!s)
  1649 	s=strstr(aline," they bad ");
  1650     if (!s)
  1651 	s=strstr(aline," she bad ");
  1652     if (!s)
  1653 	s=strstr(aline," he bad ");
  1654     if (!s)
  1655 	s=strstr(aline," you bad ");
  1656     if (!s)
  1657 	s=strstr(aline," i bad ");
  1658     if (s)
  1659     {
  1660 	if (pswit[ECHO_SWITCH])
  1661 	    g_print("\n%s\n",aline);
  1662 	if (!pswit[OVERVIEW_SWITCH])
  1663 	    g_print("    Line %ld column %ld - Query had/bad error?\n",
  1664 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1665 	else
  1666 	    cnt_word++;
  1667     }
  1668     s=strstr(aline,"; hut ");
  1669     if (!s)
  1670 	s=strstr(aline,", hut ");
  1671     if (s)
  1672     {
  1673 	if (pswit[ECHO_SWITCH])
  1674 	    g_print("\n%s\n",aline);
  1675 	if (!pswit[OVERVIEW_SWITCH])
  1676 	    g_print("    Line %ld column %ld - Query hut/but error?\n",
  1677 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1678 	else
  1679 	    cnt_word++;
  1680     }
  1681 }
  1682 
  1683 /*
  1684  * check_for_mta_from:
  1685  *
  1686  * Special case - angled bracket in front of "From" placed there by an
  1687  * MTA when sending an e-mail.
  1688  */
  1689 void check_for_mta_from(const char *aline)
  1690 {
  1691     const char *s;
  1692     s=strstr(aline,">From");
  1693     if (s)
  1694     {
  1695 	if (pswit[ECHO_SWITCH])
  1696 	    g_print("\n%s\n",aline);
  1697 	if (!pswit[OVERVIEW_SWITCH])
  1698 	    g_print("    Line %ld column %ld - "
  1699 	      "Query angled bracket with From\n",
  1700 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1701 	else
  1702 	    cnt_punct++;
  1703     }
  1704 }
  1705 
  1706 /*
  1707  * check_for_orphan_character:
  1708  *
  1709  * Check for a single character line -
  1710  * often an overflow from bad wrapping.
  1711  */
  1712 void check_for_orphan_character(const char *aline)
  1713 {
  1714     gunichar c;
  1715     c=g_utf8_get_char(aline);
  1716     if (c && !*g_utf8_next_char(aline))
  1717     {
  1718 	if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
  1719 	    ; /* Nothing - ignore numerals alone on a line. */
  1720 	else
  1721 	{
  1722 	    if (pswit[ECHO_SWITCH])
  1723 		g_print("\n%s\n",aline);
  1724 	    if (!pswit[OVERVIEW_SWITCH])
  1725 		g_print("    Line %ld column 1 - Query single character line\n",
  1726 		  linecnt);
  1727 	    else
  1728 		cnt_punct++;
  1729 	}
  1730     }
  1731 }
  1732 
  1733 /*
  1734  * check_for_pling_scanno:
  1735  *
  1736  * Check for I" - often should be !
  1737  */
  1738 void check_for_pling_scanno(const char *aline)
  1739 {
  1740     const char *s;
  1741     s=strstr(aline," I\"");
  1742     if (s)
  1743     {
  1744 	if (pswit[ECHO_SWITCH])
  1745 	    g_print("\n%s\n",aline);
  1746 	if (!pswit[OVERVIEW_SWITCH])
  1747 	    g_print("    Line %ld column %ld - Query I=exclamation mark?\n",
  1748 	      linecnt,g_utf8_pointer_to_offset(aline,s));
  1749 	else
  1750 	    cnt_punct++;
  1751     }
  1752 }
  1753 
  1754 /*
  1755  * check_for_extra_period:
  1756  *
  1757  * Check for period without a capital letter. Cut-down from gutspell.
  1758  * Only works when it happens on a single line.
  1759  */
  1760 void check_for_extra_period(const char *aline,const struct warnings *warnings)
  1761 {
  1762     const char *s,*t,*s1,*sprev;
  1763     int i;
  1764     gsize len;
  1765     gboolean istypo;
  1766     gchar *testword;
  1767     gunichar c,nc,pc,*decomposition;
  1768     if (pswit[PARANOID_SWITCH])
  1769     {
  1770 	for (t=aline;t=strstr(t,". ");)
  1771 	{
  1772 	    if (t==aline)
  1773 	    {
  1774 		t=g_utf8_next_char(t);
  1775 		/* start of line punctuation is handled elsewhere */
  1776 		continue;
  1777 	    }
  1778 	    if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
  1779 	    {
  1780 		t=g_utf8_next_char(t);
  1781 		continue;
  1782 	    }
  1783 	    if (warnings->isDutch)
  1784 	    {
  1785 		/* For Frank & Jeroen -- 's Middags case */
  1786 		gunichar c2,c3,c4,c5;
  1787 		c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
  1788 		c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
  1789 		c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
  1790 		c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
  1791 		if (CHAR_IS_APOSTROPHE(c2) &&
  1792 		  g_unichar_islower(c3) && c4==CHAR_SPACE &&
  1793 		  g_unichar_isupper(c5))
  1794 		{
  1795 		    t=g_utf8_next_char(t);
  1796 		    continue;
  1797 		}
  1798 	    }
  1799 	    s1=g_utf8_next_char(g_utf8_next_char(t));
  1800 	    while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
  1801 	      !isdigit(g_utf8_get_char(s1)))
  1802 		s1=g_utf8_next_char(s1);
  1803 	    if (g_unichar_islower(g_utf8_get_char(s1)))
  1804 	    {
  1805 		/* we have something to investigate */
  1806 		istypo=TRUE;
  1807 		/* so let's go back and find out */
  1808 		nc=g_utf8_get_char(t);
  1809 		s1=g_utf8_prev_char(t);
  1810 		c=g_utf8_get_char(s1);
  1811 		sprev=g_utf8_prev_char(s1);
  1812 		pc=g_utf8_get_char(sprev);
  1813 		while (s1>=aline &&
  1814 		  (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||
  1815 		  g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&
  1816 		  g_unichar_isalpha(nc)))
  1817 		{
  1818 		    nc=c;
  1819 		    s1=sprev;
  1820 		    c=pc;
  1821 		    sprev=g_utf8_prev_char(s1);
  1822 		    pc=g_utf8_get_char(sprev);
  1823 		}
  1824 		s1=g_utf8_next_char(s1);
  1825 		s=strchr(s1,'.');
  1826 		if (s)
  1827 		    testword=g_strndup(s1,s-s1);
  1828 		else
  1829 		    testword=g_strdup(s1);
  1830 		for (i=0;*abbrev[i];i++)
  1831 		    if (!strcmp(testword,abbrev[i]))
  1832 			istypo=FALSE;
  1833 		if (g_unichar_isdigit(g_utf8_get_char(testword)))
  1834 		    istypo=FALSE;
  1835 		if (!*g_utf8_next_char(testword))
  1836 		    istypo=FALSE;
  1837 		if (isroman(testword))
  1838 		    istypo=FALSE;
  1839 		if (istypo)
  1840 		{
  1841 		    istypo=FALSE;
  1842 		    for (s=testword;*s;s=g_utf8_next_char(s))
  1843 		    {
  1844 			decomposition=g_unicode_canonical_decomposition(
  1845 			  g_utf8_get_char(s),&len);
  1846 			if (g_utf8_strchr("aeiou",-1,decomposition[0]))
  1847 			    istypo=TRUE;
  1848 			g_free(decomposition);
  1849 		    }
  1850 		}
  1851 		if (istypo &&
  1852 		  (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
  1853 		{
  1854 		    g_tree_insert(qperiod,g_strdup(testword),
  1855 		      GINT_TO_POINTER(1));
  1856 		    if (pswit[ECHO_SWITCH])
  1857 			g_print("\n%s\n",aline);
  1858 		    if (!pswit[OVERVIEW_SWITCH])
  1859 			g_print("    Line %ld column %ld - Extra period?\n",
  1860 			  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  1861 		    else
  1862 			cnt_punct++;
  1863 		}
  1864 		g_free(testword);
  1865 	    }
  1866 	    t=g_utf8_next_char(t);
  1867 	}
  1868     }
  1869 }
  1870 
  1871 /*
  1872  * check_for_following_punctuation:
  1873  *
  1874  * Check for words usually not followed by punctuation.
  1875  */
  1876 void check_for_following_punctuation(const char *aline)
  1877 {
  1878     int i;
  1879     const char *s,*wordstart;
  1880     gunichar c;
  1881     gchar *inword,*t;
  1882     if (pswit[TYPO_SWITCH])
  1883     {
  1884 	for (s=aline;*s;)
  1885 	{
  1886 	    wordstart=s;
  1887 	    t=getaword(&s);
  1888 	    if (!*t)
  1889 	    {
  1890 		g_free(t);
  1891 		continue;
  1892 	    }
  1893 	    inword=g_utf8_strdown(t,-1);
  1894 	    g_free(t);
  1895 	    for (i=0;*nocomma[i];i++)
  1896 		if (!strcmp(inword,nocomma[i]))
  1897 		{
  1898 		    c=g_utf8_get_char(s);
  1899 		    if (c==',' || c==';' || c==':')
  1900 		    {
  1901 			if (pswit[ECHO_SWITCH])
  1902 			    g_print("\n%s\n",aline);
  1903 			if (!pswit[OVERVIEW_SWITCH])
  1904 			    g_print("    Line %ld column %ld - "
  1905 			      "Query punctuation after %s?\n",
  1906 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,
  1907 			      inword);
  1908 			else
  1909 			    cnt_punct++;
  1910 		    }
  1911 		}
  1912 	    for (i=0;*noperiod[i];i++)
  1913 		if (!strcmp(inword,noperiod[i]))
  1914 		{
  1915 		    c=g_utf8_get_char(s);
  1916 		    if (c=='.' || c=='!')
  1917 		    {
  1918 			if (pswit[ECHO_SWITCH])
  1919 			    g_print("\n%s\n",aline);
  1920 			if (!pswit[OVERVIEW_SWITCH])
  1921 			    g_print("    Line %ld column %ld - "
  1922 			      "Query punctuation after %s?\n",
  1923 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,
  1924 			      inword);
  1925 			else
  1926 			    cnt_punct++;
  1927 		    }
  1928 		}
  1929 	    g_free(inword);
  1930 	}
  1931     }
  1932 }
  1933 
  1934 /*
  1935  * check_for_typos:
  1936  *
  1937  * Check for commonly mistyped words,
  1938  * and digits like 0 for O in a word.
  1939  */
  1940 void check_for_typos(const char *aline,struct warnings *warnings)
  1941 {
  1942     const char *s,*t,*nt,*wordstart;
  1943     gchar *inword;
  1944     gunichar *decomposition;
  1945     gchar *testword;
  1946     int i,vowel,consonant,*dupcnt;
  1947     gboolean isdup,istypo,alower;
  1948     gunichar c,pc;
  1949     long offset,len;
  1950     gsize decomposition_len;
  1951     for (s=aline;*s;)
  1952     {
  1953 	wordstart=s;
  1954 	inword=getaword(&s);
  1955 	if (!*inword)
  1956 	{
  1957 	    g_free(inword);
  1958 	    continue; /* don't bother with empty lines */
  1959 	}
  1960 	if (mixdigit(inword))
  1961 	{
  1962 	    if (pswit[ECHO_SWITCH])
  1963 		g_print("\n%s\n",aline);
  1964 	    if (!pswit[OVERVIEW_SWITCH])
  1965 		g_print("    Line %ld column %ld - Query digit in %s\n",
  1966 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
  1967 	    else
  1968 		cnt_word++;
  1969 	}
  1970 	/*
  1971 	 * Put the word through a series of tests for likely typos and OCR
  1972 	 * errors.
  1973 	 */
  1974 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
  1975 	{
  1976 	    istypo=FALSE;
  1977 	    alower=FALSE;
  1978 	    for (t=inword;*t;t=g_utf8_next_char(t))
  1979 	    {
  1980 		c=g_utf8_get_char(t);
  1981 		nt=g_utf8_next_char(t);
  1982 		/* lowercase for testing */
  1983 		if (g_unichar_islower(c))
  1984 		    alower=TRUE;
  1985 		if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
  1986 		{
  1987 		    /*
  1988 		     * We have an uppercase mid-word. However, there are
  1989 		     * common cases:
  1990 		     *   Mac and Mc like McGill
  1991 		     *   French contractions like l'Abbe
  1992 		     */
  1993 		    offset=g_utf8_pointer_to_offset(inword,t);
  1994 		    if (offset>0)
  1995 			pc=g_utf8_get_char(g_utf8_prev_char(t));
  1996 		    else
  1997 			pc='\0';
  1998 		    if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
  1999 		      offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
  2000 		      g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
  2001 		      CHAR_IS_APOSTROPHE(pc))
  2002 			; /* do nothing! */
  2003 		    else
  2004 			istypo=TRUE;
  2005 		}
  2006 	    }
  2007 	    testword=g_utf8_casefold(inword,-1);
  2008 	}
  2009 	if (pswit[TYPO_SWITCH])
  2010 	{
  2011 	    /*
  2012 	     * Check for certain unlikely two-letter combinations at word
  2013 	     * start and end.
  2014 	     */
  2015 	    len=g_utf8_strlen(testword,-1);
  2016 	    if (len>1)
  2017 	    {
  2018 		for (i=0;*nostart[i];i++)
  2019 		    if (g_str_has_prefix(testword,nostart[i]))
  2020 			istypo=TRUE;
  2021 		for (i=0;*noend[i];i++)
  2022 		    if (g_str_has_suffix(testword,noend[i]))
  2023 			istypo=TRUE;
  2024 	    }
  2025 	    /* ght is common, gbt never. Like that. */
  2026 	    if (strstr(testword,"cb"))
  2027 		istypo=TRUE;
  2028 	    if (strstr(testword,"gbt"))
  2029 		istypo=TRUE;
  2030 	    if (strstr(testword,"pbt"))
  2031 		istypo=TRUE;
  2032 	    if (strstr(testword,"tbs"))
  2033 		istypo=TRUE;
  2034 	    if (strstr(testword,"mrn"))
  2035 		istypo=TRUE;
  2036 	    if (strstr(testword,"ahle"))
  2037 		istypo=TRUE;
  2038 	    if (strstr(testword,"ihle"))
  2039 		istypo=TRUE;
  2040 	    /*
  2041 	     * "TBE" does happen - like HEARTBEAT - but uncommon.
  2042 	     * Also "TBI" - frostbite, outbid - but uncommon.
  2043 	     * Similarly "ii" like Hawaii, or Pompeii, and in Roman
  2044 	     * numerals, but "ii" is a common scanno.
  2045 	     */
  2046 	    if (strstr(testword,"tbi"))
  2047 		istypo=TRUE;
  2048 	    if (strstr(testword,"tbe"))
  2049 		istypo=TRUE;
  2050 	    if (strstr(testword,"ii"))
  2051 		istypo=TRUE;
  2052 	    /*
  2053 	     * Check for no vowels or no consonants.
  2054 	     * If none, flag a typo.
  2055 	     */
  2056 	    if (!istypo && len>1)
  2057 	    {
  2058 		vowel=consonant=0;
  2059 		for (t=testword;*t;t=g_utf8_next_char(t))
  2060 		{
  2061 		    c=g_utf8_get_char(t);
  2062 		    decomposition=
  2063 		      g_unicode_canonical_decomposition(c,&decomposition_len);
  2064 		    if (c=='y' || g_unichar_isdigit(c))
  2065 		    {
  2066 			/* Yah, this is loose. */
  2067 			vowel++;
  2068 			consonant++;
  2069 		    }
  2070 		    else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
  2071 			vowel++;
  2072 		    else
  2073 			consonant++;
  2074 		    g_free(decomposition);
  2075 		}
  2076 		if (!vowel || !consonant)
  2077 		    istypo=TRUE;
  2078 	    }
  2079 	    /*
  2080 	     * Now exclude the word from being reported if it's in
  2081 	     * the okword list.
  2082 	     */
  2083 	    for (i=0;*okword[i];i++)
  2084 		if (!strcmp(testword,okword[i]))
  2085 		    istypo=FALSE;
  2086 	    /*
  2087 	     * What looks like a typo may be a Roman numeral.
  2088 	     * Exclude these.
  2089 	     */
  2090 	    if (istypo && isroman(testword))
  2091 		istypo=FALSE;
  2092 	    /* Check the manual list of typos. */
  2093 	    if (!istypo)
  2094 		for (i=0;*typo[i];i++)
  2095 		    if (!strcmp(testword,typo[i]))
  2096 			istypo=TRUE;
  2097 	    /*
  2098 	     * Check lowercase s, l, i and m - special cases.
  2099 	     *   "j" - often a semi-colon gone wrong.
  2100 	     *   "d" for a missing apostrophe - he d
  2101 	     *   "n" for "in"
  2102 	     */
  2103 	    if (!istypo && len==1 &&
  2104 	      g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
  2105 		istypo=TRUE;
  2106 	    if (istypo)
  2107 	    {
  2108 		dupcnt=g_tree_lookup(qword,testword);
  2109 		if (dupcnt)
  2110 		{
  2111 		    (*dupcnt)++;
  2112 		    isdup=!pswit[VERBOSE_SWITCH];
  2113 		}
  2114 		else
  2115 		{
  2116 		    dupcnt=g_new0(int,1);
  2117 		    g_tree_insert(qword,g_strdup(testword),dupcnt);
  2118 		    isdup=FALSE;
  2119 		}
  2120 		if (!isdup)
  2121 		{
  2122 		    if (pswit[ECHO_SWITCH])
  2123 			g_print("\n%s\n",aline);
  2124 		    if (!pswit[OVERVIEW_SWITCH])
  2125 		    {
  2126 			g_print("    Line %ld column %ld - Query word %s",
  2127 			  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
  2128 			  inword);
  2129 			if (!pswit[VERBOSE_SWITCH])
  2130 			    g_print(" - not reporting duplicates");
  2131 			g_print("\n");
  2132 		    }
  2133 		    else
  2134 			cnt_word++;
  2135 		}
  2136 	    }
  2137 	}
  2138 	/* check the user's list of typos */
  2139 	if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
  2140 	{
  2141 	    if (pswit[ECHO_SWITCH])
  2142 		g_print("\n%s\n",aline);
  2143 	    if (!pswit[OVERVIEW_SWITCH])  
  2144 		g_print("    Line %ld column %ld - Query possible scanno %s\n",
  2145 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
  2146 	}
  2147 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
  2148 	    g_free(testword);
  2149 	if (pswit[PARANOID_SWITCH] && warnings->digit)
  2150 	{
  2151 	    /* In paranoid mode, query all 0 and 1 standing alone. */
  2152 	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))
  2153 	    {
  2154 		if (pswit[ECHO_SWITCH])
  2155 		    g_print("\n%s\n",aline);
  2156 		if (!pswit[OVERVIEW_SWITCH])
  2157 		    g_print("    Line %ld column %ld - Query standalone %s\n",
  2158 		      linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
  2159 		      inword);
  2160 		else
  2161 		    cnt_word++;
  2162 	    }
  2163 	}
  2164 	g_free(inword);
  2165     }
  2166 }
  2167 
  2168 /*
  2169  * check_for_misspaced_punctuation:
  2170  *
  2171  * Look for added or missing spaces around punctuation and quotes.
  2172  * If there is a punctuation character like ! with no space on
  2173  * either side, suspect a missing!space. If there are spaces on
  2174  * both sides , assume a typo. If we see a double quote with no
  2175  * space or punctuation on either side of it, assume unspaced
  2176  * quotes "like"this.
  2177  */
  2178 void check_for_misspaced_punctuation(const char *aline,
  2179   struct parities *parities,gboolean isemptyline)
  2180 {
  2181     gboolean isacro,isellipsis;
  2182     const char *s;
  2183     gunichar c,nc,pc,n2c;
  2184     c=g_utf8_get_char(aline);
  2185     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2186     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2187     {
  2188 	pc=c;
  2189 	c=nc;
  2190 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2191 	/* For each character in the line after the first. */
  2192 	if (g_utf8_strchr(".?!,;:_",-1,c))  /* if it's punctuation */
  2193 	{
  2194 	    /* we need to suppress warnings for acronyms like M.D. */
  2195 	    isacro=FALSE;
  2196 	    /* we need to suppress warnings for ellipsis . . . */
  2197 	    isellipsis=FALSE;
  2198 	    /*
  2199 	     * If there are letters on both sides of it or
  2200 	     * if it's strict punctuation followed by an alpha.
  2201 	     */
  2202 	    if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
  2203 	      g_utf8_strchr("?!,;:",-1,c)))
  2204 	    {
  2205 		if (c=='.')
  2206 		{
  2207 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&
  2208 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
  2209 			isacro=TRUE;
  2210 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
  2211 		    if (nc && n2c=='.')
  2212 			isacro=TRUE;
  2213 		}
  2214 		if (!isacro)
  2215 		{
  2216 		    if (pswit[ECHO_SWITCH])
  2217 			g_print("\n%s\n",aline);
  2218 		    if (!pswit[OVERVIEW_SWITCH])
  2219 			g_print("    Line %ld column %ld - Missing space?\n",
  2220 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2221 		    else
  2222 			cnt_punct++;
  2223 		}
  2224 	    }
  2225 	    if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
  2226 	    {
  2227 		/*
  2228 		 * If there are spaces on both sides,
  2229 		 * or space before and end of line.
  2230 		 */
  2231 		if (c=='.')
  2232 		{
  2233 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&
  2234 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
  2235 			isellipsis=TRUE;
  2236 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
  2237 		    if (nc && n2c=='.')
  2238 			isellipsis=TRUE;
  2239 		}
  2240 		if (!isemptyline && !isellipsis)
  2241 		{
  2242 		    if (pswit[ECHO_SWITCH])
  2243 			g_print("\n%s\n",aline);
  2244 		    if (!pswit[OVERVIEW_SWITCH])
  2245 			g_print("    Line %ld column %ld - "
  2246 			  "Spaced punctuation?\n",linecnt,
  2247 			  g_utf8_pointer_to_offset(aline,s)+1);
  2248 		    else
  2249 			cnt_punct++;
  2250 		}
  2251 	    }
  2252 	}
  2253     }
  2254     /* Split out the characters that CANNOT be preceded by space. */
  2255     c=g_utf8_get_char(aline);
  2256     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2257     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2258     {
  2259 	pc=c;
  2260 	c=nc;
  2261 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2262 	/* for each character in the line after the first */
  2263 	if (g_utf8_strchr("?!,;:",-1,c))
  2264 	{
  2265 	    /* if it's punctuation that _cannot_ have a space before it */
  2266 	    if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
  2267 	    {
  2268 		/*
  2269 		 * If nc DOES == space,
  2270 		 * it was already reported just above.
  2271 		 */
  2272 		if (pswit[ECHO_SWITCH])
  2273 		    g_print("\n%s\n",aline);
  2274 		if (!pswit[OVERVIEW_SWITCH])
  2275 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",
  2276 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2277 		else
  2278 		    cnt_punct++;
  2279 	    }
  2280 	}
  2281     }
  2282     /*
  2283      * Special case " .X" where X is any alpha.
  2284      * This plugs a hole in the acronym code above.
  2285      * Inelegant, but maintainable.
  2286      */
  2287     c=g_utf8_get_char(aline);
  2288     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2289     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2290     {
  2291 	pc=c;
  2292 	c=nc;
  2293 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2294 	/* for each character in the line after the first */
  2295 	if (c=='.')
  2296 	{
  2297 	    /* if it's a period */
  2298 	    if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
  2299 	    {
  2300 		/*
  2301 		 * If the period follows a space and
  2302 		 * is followed by a letter.
  2303 		 */
  2304 		if (pswit[ECHO_SWITCH])
  2305 		    g_print("\n%s\n",aline);
  2306 		if (!pswit[OVERVIEW_SWITCH])
  2307 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",
  2308 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2309 		else
  2310 		    cnt_punct++;
  2311 	    }
  2312 	}
  2313     }
  2314     c=g_utf8_get_char(aline);
  2315     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2316     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2317     {
  2318 	pc=c;
  2319 	c=nc;
  2320 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2321 	/* for each character in the line after the first */
  2322 	if (c==CHAR_DQUOTE)
  2323 	{
  2324 	    if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
  2325 	      !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
  2326 	      !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
  2327 	    {
  2328 		if (pswit[ECHO_SWITCH])
  2329 		    g_print("\n%s\n",aline);
  2330 		if (!pswit[OVERVIEW_SWITCH])
  2331 		    g_print("    Line %ld column %ld - Unspaced quotes?\n",
  2332 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2333 		else
  2334 		    cnt_punct++;
  2335 	    }
  2336 	}
  2337     }
  2338     /* Check parity of quotes. */
  2339     nc=g_utf8_get_char(aline);
  2340     for (s=aline;*s;s=g_utf8_next_char(s))
  2341     {
  2342 	c=nc;
  2343 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2344 	if (c==CHAR_DQUOTE)
  2345 	{
  2346 	    parities->dquote=!parities->dquote;
  2347 	    if (!parities->dquote)
  2348 	    {
  2349 		/* parity even */
  2350 		if (!g_utf8_strchr("_-.'`/,;:!?)]} ",-1,nc))
  2351 		{
  2352 		    if (pswit[ECHO_SWITCH])
  2353 			g_print("\n%s\n",aline);
  2354 		    if (!pswit[OVERVIEW_SWITCH])
  2355 			g_print("    Line %ld column %ld - "
  2356 			  "Wrongspaced quotes?\n",
  2357 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2358 		    else
  2359 			cnt_punct++;
  2360 		}
  2361 	    }
  2362 	    else
  2363 	    {
  2364 		/* parity odd */
  2365 		if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
  2366 		  !g_utf8_strchr("_-/.'`([{$",-1,nc) || !nc)
  2367 		{
  2368 		    if (pswit[ECHO_SWITCH])
  2369 			g_print("\n%s\n",aline);
  2370 		    if (!pswit[OVERVIEW_SWITCH])
  2371 			g_print("    Line %ld column %ld - "
  2372 			  "Wrongspaced quotes?\n",
  2373 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2374 		    else
  2375 			cnt_punct++;
  2376 		}
  2377 	    }
  2378 	}
  2379     }
  2380     if (g_utf8_get_char(aline)==CHAR_DQUOTE)
  2381     {
  2382 	if (g_utf8_strchr(",;:!?)]} ",-1,
  2383 	  g_utf8_get_char(g_utf8_next_char(aline))))
  2384 	{
  2385 	    if (pswit[ECHO_SWITCH])
  2386 		g_print("\n%s\n",aline);
  2387 	    if (!pswit[OVERVIEW_SWITCH])
  2388 		g_print("    Line %ld column 1 - Wrongspaced quotes?\n",
  2389 		  linecnt);
  2390 	    else
  2391 		cnt_punct++;
  2392 	}
  2393     }
  2394     if (pswit[SQUOTE_SWITCH])
  2395     {
  2396 	nc=g_utf8_get_char(aline);
  2397 	for (s=aline;*s;s=g_utf8_next_char(s))
  2398 	{
  2399 	    c=nc;
  2400 	    nc=g_utf8_get_char(g_utf8_next_char(s));
  2401 	    if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&
  2402 	      !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
  2403 	      !g_unichar_isalpha(nc)))
  2404 	    {
  2405 		parities->squote=!parities->squote;
  2406 		if (!parities->squote)
  2407 		{
  2408 		    /* parity even */
  2409 		    if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
  2410 		    {
  2411 			if (pswit[ECHO_SWITCH])
  2412 			    g_print("\n%s\n",aline);
  2413 			if (!pswit[OVERVIEW_SWITCH])
  2414 			    g_print("    Line %ld column %ld - "
  2415 			      "Wrongspaced singlequotes?\n",
  2416 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2417 			else
  2418 			    cnt_punct++;
  2419 		    }
  2420 		}
  2421 		else
  2422 		{
  2423 		    /* parity odd */
  2424 		    if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
  2425 		      !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
  2426 		    {
  2427 			if (pswit[ECHO_SWITCH])
  2428 			    g_print("\n%s\n",aline);
  2429 			if (!pswit[OVERVIEW_SWITCH])
  2430 			    g_print("    Line %ld column %ld - "
  2431 			      "Wrongspaced singlequotes?\n",
  2432 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2433 			else
  2434 			    cnt_punct++;
  2435 		    }
  2436 		}
  2437 	    }
  2438 	}
  2439     }
  2440 }
  2441 
  2442 /*
  2443  * check_for_double_punctuation:
  2444  *
  2445  * Look for double punctuation like ,. or ,,
  2446  * Thanks to DW for the suggestion!
  2447  * In books with references, ".," and ".;" are common
  2448  * e.g. "etc., etc.," and vol. 1.; vol 3.;
  2449  * OTOH, from my initial tests, there are also fairly
  2450  * common errors. What to do? Make these cases paranoid?
  2451  * ".," is the most common, so warnings->dotcomma is used
  2452  * to suppress detailed reporting if it occurs often.
  2453  */
  2454 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
  2455 {
  2456     const char *s;
  2457     gunichar c,nc;
  2458     nc=g_utf8_get_char(aline);
  2459     for (s=aline;*s;s=g_utf8_next_char(s))
  2460     {
  2461 	c=nc;
  2462 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2463 	/* for each punctuation character in the line */
  2464 	if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
  2465 	  g_utf8_strchr(".?!,;:",-1,nc))
  2466 	{
  2467 	    /* followed by punctuation, it's a query, unless . . . */
  2468 	    if (c==nc && (c=='.' || c=='?' || c=='!') ||
  2469 	      !warnings->dotcomma && c=='.' && nc==',' ||
  2470 	      warnings->isFrench && g_str_has_prefix(s,",...") ||
  2471 	      warnings->isFrench && g_str_has_prefix(s,"...,") ||
  2472 	      warnings->isFrench && g_str_has_prefix(s,";...") ||
  2473 	      warnings->isFrench && g_str_has_prefix(s,"...;") ||
  2474 	      warnings->isFrench && g_str_has_prefix(s,":...") ||
  2475 	      warnings->isFrench && g_str_has_prefix(s,"...:") ||
  2476 	      warnings->isFrench && g_str_has_prefix(s,"!...") ||
  2477 	      warnings->isFrench && g_str_has_prefix(s,"...!") ||
  2478 	      warnings->isFrench && g_str_has_prefix(s,"?...") ||
  2479 	      warnings->isFrench && g_str_has_prefix(s,"...?"))
  2480 	    {
  2481 		if (warnings->isFrench && g_str_has_prefix(s,",...") ||
  2482 		  warnings->isFrench && g_str_has_prefix(s,"...,") ||
  2483 		  warnings->isFrench && g_str_has_prefix(s,";...") ||
  2484 		  warnings->isFrench && g_str_has_prefix(s,"...;") ||
  2485 		  warnings->isFrench && g_str_has_prefix(s,":...") ||
  2486 		  warnings->isFrench && g_str_has_prefix(s,"...:") ||
  2487 		  warnings->isFrench && g_str_has_prefix(s,"!...") ||
  2488 		  warnings->isFrench && g_str_has_prefix(s,"...!") ||
  2489 		  warnings->isFrench && g_str_has_prefix(s,"?...") ||
  2490 		  warnings->isFrench && g_str_has_prefix(s,"...?"))
  2491 		{
  2492 		    s+=4;
  2493 		    nc=g_utf8_get_char(g_utf8_next_char(s));
  2494 		}
  2495 		; /* do nothing for .. !! and ?? which can be legit */
  2496 	    }
  2497 	    else
  2498 	    {
  2499 		if (pswit[ECHO_SWITCH])
  2500 		    g_print("\n%s\n",aline);
  2501 		if (!pswit[OVERVIEW_SWITCH])
  2502 		    g_print("    Line %ld column %ld - Double punctuation?\n",
  2503 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2504 		else
  2505 		    cnt_punct++;
  2506 	    }
  2507 	}
  2508     }
  2509 }
  2510 
  2511 /*
  2512  * check_for_spaced_quotes:
  2513  */
  2514 void check_for_spaced_quotes(const char *aline)
  2515 {
  2516     int i;
  2517     const char *s,*t;
  2518     const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,
  2519       CHAR_RS_QUOTE};
  2520     GString *pattern;
  2521     s=aline;
  2522     while ((t=strstr(s," \" ")))
  2523     {
  2524 	if (pswit[ECHO_SWITCH])
  2525 	    g_print("\n%s\n",aline);
  2526 	if (!pswit[OVERVIEW_SWITCH])
  2527 	    g_print("    Line %ld column %ld - Spaced doublequote?\n",
  2528 	      linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  2529 	else
  2530 	    cnt_punct++;
  2531 	s=g_utf8_next_char(g_utf8_next_char(t));
  2532     }
  2533     pattern=g_string_new(NULL);
  2534     for(i=0;i<G_N_ELEMENTS(single_quotes);i++)
  2535     {
  2536 	g_string_assign(pattern," ");
  2537 	g_string_append_unichar(pattern,single_quotes[i]);
  2538 	g_string_append_c(pattern,' ');
  2539 	s=aline;
  2540 	while ((t=strstr(s,pattern->str)))
  2541 	{
  2542 	    if (pswit[ECHO_SWITCH])
  2543 		g_print("\n%s\n",aline);
  2544 	    if (!pswit[OVERVIEW_SWITCH])
  2545 		g_print("    Line %ld column %ld - Spaced singlequote?\n",
  2546 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  2547 	    else
  2548 		cnt_punct++;
  2549 	    s=g_utf8_next_char(g_utf8_next_char(t));
  2550 	}
  2551     }
  2552     g_string_free(pattern,TRUE);
  2553 }
  2554 
  2555 /*
  2556  * check_for_miscased_genative:
  2557  *
  2558  * Check special case of 'S instead of 's at end of word.
  2559  */
  2560 void check_for_miscased_genative(const char *aline)
  2561 {
  2562     const char *s;
  2563     gunichar c,nc,pc;
  2564     if (!*aline)
  2565 	return;
  2566     c=g_utf8_get_char(aline);
  2567     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2568     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2569     {
  2570 	pc=c;
  2571 	c=nc;
  2572 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2573 	if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))
  2574 	{
  2575 	    if (pswit[ECHO_SWITCH])
  2576 		g_print("\n%s\n",aline);
  2577 	    if (!pswit[OVERVIEW_SWITCH])
  2578 		g_print("    Line %ld column %ld - Capital \"S\"?\n",
  2579 		  linecnt,g_utf8_pointer_to_offset(aline,s)+2);
  2580 	    else
  2581 		cnt_punct++;
  2582 	}
  2583     }
  2584 }
  2585 
  2586 /*
  2587  * check_end_of_line:
  2588  *
  2589  * Now check special cases - start and end of line -
  2590  * for single and double quotes. Start is sometimes [sic]
  2591  * but better to query it anyway.
  2592  * While we're here, check for dash at end of line.
  2593  */
  2594 void check_end_of_line(const char *aline,struct warnings *warnings)
  2595 {
  2596     int lbytes;
  2597     const char *s;
  2598     gunichar c1,c2;
  2599     lbytes=strlen(aline);
  2600     if (g_utf8_strlen(aline,lbytes)>1)
  2601     {
  2602 	s=g_utf8_prev_char(aline+lbytes);
  2603 	c1=g_utf8_get_char(s);
  2604 	c2=g_utf8_get_char(g_utf8_prev_char(s));
  2605 	if ((c1==CHAR_DQUOTE || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)
  2606 	{
  2607 	    if (pswit[ECHO_SWITCH])
  2608 		g_print("\n%s\n",aline);
  2609 	    if (!pswit[OVERVIEW_SWITCH])
  2610 		g_print("    Line %ld column %ld - Spaced quote?\n",linecnt,
  2611 		  g_utf8_strlen(aline,lbytes));
  2612 	    else
  2613 		cnt_punct++;
  2614 	}
  2615 	c1=g_utf8_get_char(aline);
  2616 	c2=g_utf8_get_char(g_utf8_next_char(aline));
  2617 	if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)
  2618 	{
  2619 	    if (pswit[ECHO_SWITCH])
  2620 		g_print("\n%s\n",aline);
  2621 	    if (!pswit[OVERVIEW_SWITCH])
  2622 		g_print("    Line %ld column 1 - Spaced quote?\n",linecnt);
  2623 	    else
  2624 		cnt_punct++;
  2625 	}
  2626 	/*
  2627 	 * Dash at end of line may well be legit - paranoid mode only
  2628 	 * and don't report em-dash at line-end.
  2629 	 */
  2630 	if (pswit[PARANOID_SWITCH] && warnings->hyphen)
  2631 	{
  2632 	    for (s=g_utf8_prev_char(aline+lbytes);
  2633 	      s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
  2634 		;
  2635 	    if (g_utf8_get_char(s)=='-' &&
  2636 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')
  2637 	    {
  2638 		if (pswit[ECHO_SWITCH])
  2639 		    g_print("\n%s\n",aline);
  2640 		if (!pswit[OVERVIEW_SWITCH])
  2641 		    g_print("    Line %ld column %ld - "
  2642 		      "Hyphen at end of line?\n",
  2643 		      linecnt,g_utf8_pointer_to_offset(aline,s));
  2644 	    }
  2645 	}
  2646     }
  2647 }
  2648 
  2649 /*
  2650  * check_for_unspaced_bracket:
  2651  *
  2652  * Brackets are often unspaced, but shouldn't be surrounded by alpha.
  2653  * If so, suspect a scanno like "a]most".
  2654  */
  2655 void check_for_unspaced_bracket(const char *aline)
  2656 {
  2657     const char *s;
  2658     gunichar c,nc,pc;
  2659     c=g_utf8_get_char(aline);
  2660     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2661     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2662     {
  2663 	pc=c;
  2664 	c=nc;
  2665 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2666 	if (!nc)
  2667 	    break;
  2668 	/* for each bracket character in the line except 1st & last */
  2669 	if (g_utf8_strchr("{[()]}",-1,c) &&
  2670 	  g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
  2671 	{
  2672 	    if (pswit[ECHO_SWITCH])
  2673 		g_print("\n%s\n",aline);
  2674 	    if (!pswit[OVERVIEW_SWITCH])
  2675 		g_print("    Line %ld column %ld - Unspaced bracket?\n",
  2676 		  linecnt,g_utf8_pointer_to_offset(aline,s));
  2677 	    else
  2678 		cnt_punct++;
  2679 	}
  2680     }
  2681 }
  2682 
  2683 /*
  2684  * check_for_unpunctuated_endquote:
  2685  */
  2686 void check_for_unpunctuated_endquote(const char *aline)
  2687 {
  2688     const char *s;
  2689     gunichar c,nc,pc;
  2690     c=g_utf8_get_char(aline);
  2691     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2692     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2693     {
  2694 	pc=c;
  2695 	c=nc;
  2696 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2697 	/* for each character in the line except 1st */
  2698 	if (c==CHAR_DQUOTE && isalpha(pc))
  2699 	{
  2700 	    if (pswit[ECHO_SWITCH])
  2701 		g_print("\n%s\n",aline);
  2702 	    if (!pswit[OVERVIEW_SWITCH])
  2703 		g_print("    Line %ld column %ld - "
  2704 		  "endquote missing punctuation?\n",
  2705 		  linecnt,g_utf8_pointer_to_offset(aline,s));
  2706 	    else
  2707 		cnt_punct++;
  2708 	}
  2709     }
  2710 }
  2711 
  2712 /*
  2713  * check_for_html_tag:
  2714  *
  2715  * Check for <HTML TAG>.
  2716  *
  2717  * If there is a < in the line, followed at some point
  2718  * by a > then we suspect HTML.
  2719  */
  2720 void check_for_html_tag(const char *aline)
  2721 {
  2722     const char *open,*close;
  2723     gchar *tag;
  2724     open=strchr(aline,'<');
  2725     if (open)
  2726     {
  2727 	close=strchr(g_utf8_next_char(open),'>');
  2728 	if (close)
  2729 	{
  2730 	    if (pswit[ECHO_SWITCH])
  2731 		g_print("\n%s\n",aline);
  2732 	    if (!pswit[OVERVIEW_SWITCH])
  2733 	    {
  2734 		tag=g_strndup(open,close-open+1);
  2735 		g_print("    Line %ld column %ld - HTML Tag? %s \n",
  2736 		  linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
  2737 		g_free(tag);
  2738 	    }
  2739 	    else
  2740 		cnt_html++;
  2741 	}
  2742     }
  2743 }
  2744 
  2745 /*
  2746  * check_for_html_entity:
  2747  *
  2748  * Check for &symbol; HTML.
  2749  *
  2750  * If there is a & in the line, followed at
  2751  * some point by a ; then we suspect HTML.
  2752  */
  2753 void check_for_html_entity(const char *aline)
  2754 {
  2755     const char *s,*amp,*scolon;
  2756     gchar *entity;
  2757     amp=strchr(aline,'&');
  2758     if (amp)
  2759     {
  2760 	scolon=strchr(amp,';');
  2761 	if (scolon)
  2762 	{
  2763 	    for (s=amp;s<scolon;s=g_utf8_next_char(s))   
  2764 		if (g_utf8_get_char(s)==CHAR_SPACE)
  2765 		    break;		/* Don't report "Jones & Son;" */
  2766 	    if (s>=scolon)
  2767 	    {
  2768 		if (pswit[ECHO_SWITCH])
  2769 		    g_print("\n%s\n",aline);
  2770 		if (!pswit[OVERVIEW_SWITCH])
  2771 		{
  2772 		    entity=g_strndup(amp,scolon-amp+1);
  2773 		    g_print("    Line %ld column %d - HTML symbol? %s \n",
  2774 		      linecnt,(int)(amp-aline)+1,entity);
  2775 		    g_free(entity);
  2776 		}
  2777 		else
  2778 		    cnt_html++;
  2779 	    }
  2780 	}
  2781     }
  2782 }
  2783 
  2784 /*
  2785  * check_for_omitted_punctuation:
  2786  *
  2787  * Check for omitted punctuation at end of paragraph by working back
  2788  * through prevline. DW.
  2789  * Need to check this only for "normal" paras.
  2790  * So what is a "normal" para?
  2791  *    Not normal if one-liner (chapter headings, etc.)
  2792  *    Not normal if doesn't contain at least one locase letter
  2793  *    Not normal if starts with space
  2794  */
  2795 void check_for_omitted_punctuation(const char *prevline,
  2796   struct line_properties *last,int start_para_line)
  2797 {
  2798     gboolean letter_on_line=FALSE;
  2799     const char *s;
  2800     gunichar c;
  2801     for (s=prevline;*s;s=g_utf8_next_char(s))
  2802 	if (g_unichar_isalpha(g_utf8_get_char(s)))
  2803 	{
  2804 	    letter_on_line=TRUE;
  2805 	    break;
  2806 	}
  2807     /*
  2808      * This next "if" is a problem.
  2809      * If we say "start_para_line <= linecnt - 1", that includes
  2810      * one-line "paragraphs" like chapter heads. Lotsa false positives.
  2811      * If we say "start_para_line < linecnt - 1" it doesn't, but then it
  2812      * misses genuine one-line paragraphs.
  2813      */
  2814     if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
  2815       g_utf8_get_char(prevline)>CHAR_SPACE)
  2816     {
  2817 	s=prevline+strlen(prevline);
  2818 	do
  2819 	{
  2820 	    s=g_utf8_prev_char(s);
  2821 	    c=g_utf8_get_char(s);
  2822 	} while (CHAR_IS_CLOSING_QUOTE(c) && c>CHAR_SPACE && s>prevline);
  2823 	for (;s>prevline;s=g_utf8_prev_char(s))
  2824 	{
  2825 	    if (g_unichar_isalpha(g_utf8_get_char(s)))
  2826 	    {
  2827 		if (pswit[ECHO_SWITCH])
  2828 		    g_print("\n%s\n",prevline);
  2829 		if (!pswit[OVERVIEW_SWITCH])
  2830 		    g_print("    Line %ld column %ld - "
  2831 		      "No punctuation at para end?\n",
  2832 		      linecnt-1,g_utf8_strlen(prevline,-1));
  2833 		else
  2834 		    cnt_punct++;
  2835 		break;
  2836 	    }
  2837 	    if (g_utf8_strchr("-.:!([{?}])",-1,g_utf8_get_char(s)))
  2838 		break;
  2839 	}
  2840     }
  2841 }
  2842 
  2843 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
  2844 {
  2845     const char *word=key;
  2846     int *dupcnt=value;
  2847     if (*dupcnt)
  2848 	g_print("\nNote: Queried word %s was duplicated %d times\n",
  2849 	  word,*dupcnt);
  2850     return FALSE;
  2851 }
  2852 
  2853 void print_as_windows_1252(const char *string)
  2854 {
  2855     gsize inbytes,outbytes;
  2856     gchar *buf,*bp;
  2857     static GIConv converter=(GIConv)-1;
  2858     if (!string)
  2859     {
  2860 	if (converter!=(GIConv)-1)
  2861 	    g_iconv_close(converter);
  2862 	converter=(GIConv)-1;
  2863 	return;
  2864     }
  2865     if (converter==(GIConv)-1)
  2866 	converter=g_iconv_open("WINDOWS-1252","UTF-8");
  2867     if (converter!=(GIConv)-1)
  2868     {
  2869 	inbytes=outbytes=strlen(string);
  2870 	bp=buf=g_malloc(outbytes+1);
  2871 	g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
  2872 	*bp='\0';
  2873 	fputs(buf,stdout);
  2874 	g_free(buf);
  2875     }
  2876     else
  2877 	fputs(string,stdout);
  2878 }
  2879 
  2880 void print_as_utf_8(const char *string)
  2881 {
  2882     fputs(string,stdout);
  2883 }
  2884 
  2885 /*
  2886  * procfile:
  2887  *
  2888  * Process one file.
  2889  */
  2890 void procfile(const char *filename)
  2891 {
  2892     const char *s;
  2893     gchar *parastart=NULL;	/* first line of current para */
  2894     gchar *etext,*aline;
  2895     gchar *etext_ptr;
  2896     GError *err=NULL;
  2897     struct first_pass_results *first_pass_results;
  2898     struct warnings *warnings;
  2899     struct counters counters={0};
  2900     struct line_properties last={0};
  2901     struct parities parities={0};
  2902     struct pending pending={0};
  2903     gboolean isemptyline;
  2904     long start_para_line=0;
  2905     gboolean isnewpara=FALSE,enddash=FALSE;
  2906     last.start=CHAR_SPACE;
  2907     linecnt=checked_linecnt=0;
  2908     etext=read_etext(filename,&err);
  2909     if (!etext)
  2910     {
  2911 	if (pswit[STDOUT_SWITCH])
  2912 	    fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
  2913 	else
  2914 	    fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
  2915 	exit(1);
  2916     }
  2917     g_print("\n\nFile: %s\n\n",filename);
  2918     first_pass_results=first_pass(etext);
  2919     warnings=report_first_pass(first_pass_results);
  2920     qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
  2921     qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
  2922     /*
  2923      * Here we go with the main pass. Hold onto yer hat!
  2924      */
  2925     linecnt=0;
  2926     etext_ptr=etext;
  2927     while ((aline=flgets(&etext_ptr,linecnt+1)))
  2928     {
  2929 	linecnt++;
  2930 	if (linecnt==1)
  2931 	    isnewpara=TRUE;
  2932 	if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
  2933 	    continue;    // skip DP page separators completely
  2934 	if (linecnt<first_pass_results->firstline ||
  2935 	  (first_pass_results->footerline>0 &&
  2936 	  linecnt>first_pass_results->footerline))
  2937 	{
  2938 	    if (pswit[HEADER_SWITCH])
  2939 	    {
  2940 		if (g_str_has_prefix(aline,"Title:"))
  2941 		    g_print("    %s\n",aline);
  2942 		if (g_str_has_prefix(aline,"Author:"))
  2943 		    g_print("    %s\n",aline);
  2944 		if (g_str_has_prefix(aline,"Release Date:"))
  2945 		    g_print("    %s\n",aline);
  2946 		if (g_str_has_prefix(aline,"Edition:"))
  2947 		    g_print("    %s\n\n",aline);
  2948 	    }
  2949 	    continue;		/* skip through the header */
  2950 	}
  2951 	checked_linecnt++;
  2952 	print_pending(aline,parastart,&pending);
  2953 	isemptyline=analyse_quotes(aline,&counters);
  2954 	if (isnewpara && !isemptyline)
  2955 	{
  2956 	    /* This line is the start of a new paragraph. */
  2957 	    start_para_line=linecnt;
  2958 	    /* Capture its first line in case we want to report it later. */
  2959 	    g_free(parastart);
  2960 	    parastart=g_strdup(aline);
  2961 	    memset(&parities,0,sizeof(parities));  /* restart the quote count */
  2962 	    s=aline;
  2963 	    while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
  2964 	      !g_unichar_isdigit(g_utf8_get_char(s)))
  2965 		s=g_utf8_next_char(s);
  2966 	    if (g_unichar_islower(g_utf8_get_char(s)))
  2967 	    {
  2968 		/* and its first letter is lowercase */
  2969 		if (pswit[ECHO_SWITCH])
  2970 		    g_print("\n%s\n",aline);
  2971 		if (!pswit[OVERVIEW_SWITCH])
  2972 		    g_print("    Line %ld column %ld - "
  2973 		      "Paragraph starts with lower-case\n",
  2974 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2975 		else
  2976 		    cnt_punct++;
  2977 	    }
  2978 	    isnewpara=FALSE; /* Signal the end of new para processing. */
  2979 	}
  2980 	/* Check for an em-dash broken at line end. */
  2981 	if (enddash && g_utf8_get_char(aline)=='-')
  2982 	{
  2983 	    if (pswit[ECHO_SWITCH])
  2984 		g_print("\n%s\n",aline);
  2985 	    if (!pswit[OVERVIEW_SWITCH])
  2986 		g_print("    Line %ld column 1 - Broken em-dash?\n",linecnt);
  2987 	    else
  2988 		cnt_punct++;
  2989 	}
  2990 	enddash=FALSE;
  2991 	for (s=g_utf8_prev_char(aline+strlen(aline));
  2992 	  g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
  2993 	    ;
  2994 	if (s>=aline && g_utf8_get_char(s)=='-')
  2995 	    enddash=TRUE;
  2996 	check_for_control_characters(aline);
  2997 	check_for_odd_characters(aline,warnings,isemptyline);
  2998 	if (warnings->longline)
  2999 	    check_for_long_line(aline);
  3000 	if (warnings->shortline)
  3001 	    check_for_short_line(aline,&last);
  3002 	last.blen=last.len;
  3003 	last.len=g_utf8_strlen(aline,-1);
  3004 	last.start=g_utf8_get_char(aline);
  3005 	check_for_starting_punctuation(aline);
  3006 	if (warnings->dash)
  3007 	{
  3008 	    check_for_spaced_emdash(aline);
  3009 	    check_for_spaced_dash(aline);
  3010 	}
  3011 	check_for_unmarked_paragraphs(aline);
  3012 	check_for_jeebies(aline);
  3013 	check_for_mta_from(aline);
  3014 	check_for_orphan_character(aline);
  3015 	check_for_pling_scanno(aline);
  3016 	check_for_extra_period(aline,warnings);
  3017 	check_for_following_punctuation(aline);
  3018 	check_for_typos(aline,warnings);
  3019 	check_for_misspaced_punctuation(aline,&parities,isemptyline);
  3020 	check_for_double_punctuation(aline,warnings);
  3021 	check_for_spaced_quotes(aline);
  3022 	check_for_miscased_genative(aline);
  3023 	check_end_of_line(aline,warnings);
  3024 	check_for_unspaced_bracket(aline);
  3025 	if (warnings->endquote)
  3026 	    check_for_unpunctuated_endquote(aline);
  3027 	check_for_html_tag(aline);
  3028 	check_for_html_entity(aline);
  3029 	if (isemptyline)
  3030 	{
  3031 	    check_for_mismatched_quotes(&counters,&pending);
  3032 	    counters_reset(&counters);
  3033 	    /* let the next iteration know that it's starting a new para */
  3034 	    isnewpara=TRUE;
  3035 	    if (prevline)
  3036 		check_for_omitted_punctuation(prevline,&last,start_para_line);
  3037 	}
  3038 	g_free(prevline);
  3039 	prevline=g_strdup(aline);
  3040     }
  3041     linecnt++;
  3042     check_for_mismatched_quotes(&counters,&pending);
  3043     print_pending(NULL,parastart,&pending);
  3044     reset_pending(&pending);
  3045     if (prevline)
  3046     {
  3047 	g_free(prevline);
  3048 	prevline=NULL;
  3049     }
  3050     g_free(parastart);
  3051     g_free(prevline);
  3052     g_free(etext);
  3053     if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])
  3054 	g_tree_foreach(qword,report_duplicate_queries,NULL);
  3055     g_tree_unref(qword);
  3056     g_tree_unref(qperiod);
  3057     counters_destroy(&counters);
  3058     g_set_print_handler(NULL);
  3059     print_as_windows_1252(NULL);
  3060     if (pswit[MARKUP_SWITCH])  
  3061 	loseentities(NULL);
  3062 }
  3063 
  3064 /*
  3065  * flgets:
  3066  *
  3067  * Get one line from the input text, checking for
  3068  * the existence of exactly one CR/LF line-end per line.
  3069  *
  3070  * Returns: a pointer to the line.
  3071  */
  3072 char *flgets(char **etext,long lcnt)
  3073 {
  3074     gunichar c;
  3075     gboolean isCR=FALSE;
  3076     char *theline=*etext;
  3077     char *eos=theline;
  3078     gchar *s;
  3079     for (;;)
  3080     {
  3081 	c=g_utf8_get_char(*etext);
  3082 	*etext=g_utf8_next_char(*etext);
  3083 	if (!c)
  3084 	    return NULL;
  3085 	/* either way, it's end of line */
  3086 	if (c=='\n')
  3087 	{
  3088 	    if (isCR)
  3089 		break;
  3090 	    else
  3091 	    {
  3092 		/* Error - a LF without a preceding CR */
  3093 		if (pswit[LINE_END_SWITCH])
  3094 		{
  3095 		    if (pswit[ECHO_SWITCH])
  3096 		    {
  3097 			s=g_strndup(theline,eos-theline);
  3098 			g_print("\n%s\n",s);
  3099 			g_free(s);
  3100 		    }
  3101 		    if (!pswit[OVERVIEW_SWITCH])
  3102 			g_print("    Line %ld - No CR?\n",lcnt);
  3103 		    else
  3104 			cnt_lineend++;
  3105 		}
  3106 		break;
  3107 	    }
  3108 	}
  3109 	if (c=='\r')
  3110 	{
  3111 	    if (isCR)
  3112 	    {
  3113 		/* Error - two successive CRs */
  3114 		if (pswit[LINE_END_SWITCH])
  3115 		{
  3116 		    if (pswit[ECHO_SWITCH])
  3117 		    {
  3118 			s=g_strndup(theline,eos-theline);
  3119 			g_print("\n%s\n",s);
  3120 			g_free(s);
  3121 		    }
  3122 		    if (!pswit[OVERVIEW_SWITCH])
  3123 			g_print("    Line %ld - Two successive CRs?\n",lcnt);
  3124 		    else
  3125 			cnt_lineend++;
  3126 		}
  3127 	    }
  3128 	    isCR=TRUE;
  3129 	}
  3130 	else
  3131 	{
  3132 	    if (pswit[LINE_END_SWITCH] && isCR)
  3133 	    {
  3134 		if (pswit[ECHO_SWITCH])
  3135 		{
  3136 		    s=g_strndup(theline,eos-theline);
  3137 		    g_print("\n%s\n",s);
  3138 		    g_free(s);
  3139 		}
  3140 		if (!pswit[OVERVIEW_SWITCH])
  3141 		    g_print("    Line %ld column %ld - CR without LF?\n",
  3142 		      lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
  3143 		else
  3144 		    cnt_lineend++;
  3145 		*eos=' ';
  3146 	    }
  3147 	    isCR=FALSE;
  3148 	    eos=g_utf8_next_char(eos);
  3149 	}
  3150     }
  3151     *eos='\0';
  3152     if (pswit[MARKUP_SWITCH])  
  3153 	postprocess_for_HTML(theline);
  3154     if (pswit[DP_SWITCH])  
  3155 	postprocess_for_DP(theline);
  3156     return theline;
  3157 }
  3158 
  3159 /*
  3160  * mixdigit:
  3161  *
  3162  * Takes a "word" as a parameter, and checks whether it
  3163  * contains a mixture of alpha and digits. Generally, this is an
  3164  * error, but may not be for cases like 4th or L5 12s. 3d.
  3165  *
  3166  * Returns: TRUE iff an is error found.
  3167  */
  3168 gboolean mixdigit(const char *checkword)
  3169 {
  3170     gboolean wehaveadigit,wehavealetter,query;
  3171     const char *s,*nondigit;
  3172     wehaveadigit=wehavealetter=query=FALSE;
  3173     for (s=checkword;*s;s=g_utf8_next_char(s))
  3174 	if (g_unichar_isalpha(g_utf8_get_char(s)))
  3175 	    wehavealetter=TRUE;
  3176 	else if (g_unichar_isdigit(g_utf8_get_char(s)))
  3177 	    wehaveadigit=TRUE;
  3178     if (wehaveadigit && wehavealetter)
  3179     {
  3180 	/* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
  3181 	query=TRUE;
  3182 	for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
  3183 	  nondigit=g_utf8_next_char(nondigit))
  3184 	    ;
  3185 	/* digits, ending in st, rd, nd, th of either case */
  3186 	if (!g_ascii_strcasecmp(nondigit,"st") ||
  3187 	  !g_ascii_strcasecmp(nondigit,"rd") ||
  3188 	  !g_ascii_strcasecmp(nondigit,"nd") ||
  3189 	  !g_ascii_strcasecmp(nondigit,"th"))
  3190 	    query=FALSE;
  3191 	if (!g_ascii_strcasecmp(nondigit,"sts") ||
  3192 	  !g_ascii_strcasecmp(nondigit,"rds") ||
  3193 	  !g_ascii_strcasecmp(nondigit,"nds") ||
  3194 	  !g_ascii_strcasecmp(nondigit,"ths"))
  3195 	    query=FALSE;
  3196 	if (!g_ascii_strcasecmp(nondigit,"stly") ||
  3197 	  !g_ascii_strcasecmp(nondigit,"rdly") ||
  3198 	  !g_ascii_strcasecmp(nondigit,"ndly") ||
  3199 	  !g_ascii_strcasecmp(nondigit,"thly"))
  3200 	    query=FALSE;
  3201 	/* digits, ending in l, L, s or d */
  3202 	if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
  3203 	  !strcmp(nondigit,"d"))
  3204 	    query=FALSE;
  3205 	/*
  3206 	 * L at the start of a number, representing Britsh pounds, like L500.
  3207 	 * This is cute. We know the current word is mixed digit. If the first
  3208 	 * letter is L, there must be at least one digit following. If both
  3209 	 * digits and letters follow, we have a genuine error, else we have a
  3210 	 * capital L followed by digits, and we accept that as a non-error.
  3211 	 */
  3212 	if (g_utf8_get_char(checkword)=='L' &&
  3213 	  !mixdigit(g_utf8_next_char(checkword)))
  3214 	    query=FALSE;
  3215     }
  3216     return query;
  3217 }
  3218 
  3219 /*
  3220  * getaword:
  3221  *
  3222  * Extracts the first/next "word" from the line, and returns it.
  3223  * A word is defined as one English word unit--or at least that's the aim.
  3224  * "ptr" is advanced to the position in the line where we will start
  3225  * looking for the next word.
  3226  *
  3227  * Returns: A newly-allocated string.
  3228  */
  3229 gchar *getaword(const char **ptr)
  3230 {
  3231     const char *s,*t;
  3232     GString *word;
  3233     gunichar c,pc;
  3234     word=g_string_new(NULL);
  3235     for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
  3236       !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
  3237       **ptr;*ptr=g_utf8_next_char(*ptr))
  3238 	;
  3239     /*
  3240      * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
  3241      * Especially yucky is the case of L1,000
  3242      * This section looks for a pattern of characters including a digit
  3243      * followed by a comma or period followed by one or more digits.
  3244      * If found, it returns this whole pattern as a word; otherwise we discard
  3245      * the results and resume our normal programming.
  3246      */
  3247     s=*ptr;
  3248     for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
  3249       g_unichar_isalpha(g_utf8_get_char(s)) ||
  3250       g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
  3251 	g_string_append_unichar(word,g_utf8_get_char(s));
  3252     if (word->len)
  3253     {
  3254 	for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))
  3255 	{
  3256 	    c=g_utf8_get_char(t);
  3257 	    pc=g_utf8_get_char(g_utf8_prev_char(t));
  3258 	    if ((c=='.' || c==',') && g_unichar_isdigit(pc))
  3259 	    {
  3260 		*ptr=s;
  3261 		return g_string_free(word,FALSE);
  3262 	    }
  3263 	}
  3264     }
  3265     /* we didn't find a punctuated number - do the regular getword thing */
  3266     g_string_truncate(word,0);
  3267     c=g_utf8_get_char(*ptr);
  3268     for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);
  3269       *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))
  3270 	g_string_append_unichar(word,c);
  3271     return g_string_free(word,FALSE);
  3272 }
  3273 
  3274 /*
  3275  * isroman:
  3276  *
  3277  * Is this word a Roman Numeral?
  3278  *
  3279  * It doesn't actually validate that the number is a valid Roman Numeral--for
  3280  * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
  3281  * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
  3282  * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
  3283  * expressions thereof, except when it came to taxes. Allow any number of M,
  3284  * an optional D, an optional CM or CD, any number of optional Cs, an optional
  3285  * XL or an optional XC, an optional IX or IV, an optional V and any number
  3286  * of optional Is.
  3287  */
  3288 gboolean isroman(const char *t)
  3289 {
  3290     const char *s;
  3291     if (!t || !*t)
  3292 	return FALSE;
  3293     s=t;
  3294     while (g_utf8_get_char(t)=='m' && *t)
  3295 	t++;
  3296     if (g_utf8_get_char(t)=='d')
  3297 	t++;
  3298     if (g_str_has_prefix(t,"cm"))
  3299 	t+=2;
  3300     if (g_str_has_prefix(t,"cd"))
  3301 	t+=2;
  3302     while (g_utf8_get_char(t)=='c' && *t)
  3303 	t++;
  3304     if (g_str_has_prefix(t,"xl"))
  3305 	t+=2;
  3306     if (g_str_has_prefix(t,"xc"))
  3307 	t+=2;
  3308     if (g_utf8_get_char(t)=='l')
  3309 	t++;
  3310     while (g_utf8_get_char(t)=='x' && *t)
  3311 	t++;
  3312     if (g_str_has_prefix(t,"ix"))
  3313 	t+=2;
  3314     if (g_str_has_prefix(t,"iv"))
  3315 	t+=2;
  3316     if (g_utf8_get_char(t)=='v')
  3317 	t++;
  3318     while (g_utf8_get_char(t)=='i' && *t)
  3319 	t++;
  3320     return !*t;
  3321 }
  3322 
  3323 /*
  3324  * postprocess_for_DP:
  3325  *
  3326  * Invoked with the -d switch from flgets().
  3327  * It simply "removes" from the line a hard-coded set of common
  3328  * DP-specific tags, so that the line passed to the main routine has
  3329  * been pre-cleaned of DP markup.
  3330  */
  3331 void postprocess_for_DP(char *theline)
  3332 {
  3333     char *s,*t;
  3334     int i;
  3335     if (!*theline) 
  3336 	return;
  3337     for (i=0;*DPmarkup[i];i++)
  3338 	while ((s=strstr(theline,DPmarkup[i])))
  3339 	{
  3340 	    t=s+strlen(DPmarkup[i]);
  3341 	    memmove(s,t,strlen(t)+1);
  3342 	}
  3343 }
  3344 
  3345 /*
  3346  * postprocess_for_HTML:
  3347  *
  3348  * Invoked with the -m switch from flgets().
  3349  * It simply "removes" from the line a hard-coded set of common
  3350  * HTML tags and "replaces" a hard-coded set of common HTML
  3351  * entities, so that the line passed to the main routine has
  3352  * been pre-cleaned of HTML.
  3353  */
  3354 void postprocess_for_HTML(char *theline)
  3355 {
  3356     while (losemarkup(theline))
  3357 	;
  3358     loseentities(theline);
  3359 }
  3360 
  3361 char *losemarkup(char *theline)
  3362 {
  3363     char *s,*t;
  3364     int i;
  3365     s=strchr(theline,'<');
  3366     t=s?strchr(s,'>'):NULL;
  3367     if (!s || !t)
  3368 	return NULL;
  3369     for (i=0;*markup[i];i++)
  3370 	if (tagcomp(g_utf8_next_char(s),markup[i]))
  3371 	{
  3372 	    t=g_utf8_next_char(t);
  3373 	    memmove(s,t,strlen(t)+1);
  3374 	    return s;
  3375 	}
  3376     /* It's an unrecognized <xxx>. */
  3377     return NULL;
  3378 }
  3379 
  3380 void loseentities(char *theline)
  3381 {
  3382     int i;
  3383     gsize nb;
  3384     char *amp,*scolon;
  3385     gchar *s,*t;
  3386     gunichar c;
  3387     GTree *entities=NULL;
  3388     static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
  3389     if (!theline)
  3390     {
  3391 	if (entities)
  3392 	    g_tree_destroy(entities);
  3393 	entities=NULL;
  3394 	if (translit!=(GIConv)-1)
  3395 	    g_iconv_close(translit);
  3396 	translit=(GIConv)-1;
  3397 	if (to_utf8!=(GIConv)-1)
  3398 	    g_iconv_close(to_utf8);
  3399 	to_utf8=(GIConv)-1;
  3400 	return;
  3401     }
  3402     if (!*theline)
  3403 	return;
  3404     if (!entities)
  3405     {
  3406 	entities=g_tree_new((GCompareFunc)strcmp);
  3407 	for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
  3408 	    g_tree_insert(entities,HTMLentities[i].name,
  3409 	      GUINT_TO_POINTER(HTMLentities[i].c));
  3410     }
  3411     if (translit==(GIConv)-1)
  3412 	translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
  3413     if (to_utf8==(GIConv)-1)
  3414 	to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
  3415     while((amp=strchr(theline,'&')))
  3416     {
  3417 	scolon=strchr(amp,';');
  3418 	if (scolon)
  3419 	{
  3420 	    if (amp[1]=='#')
  3421 	    {
  3422 		if (amp+2+strspn(amp+2,"0123456789")==scolon)
  3423 		    c=strtol(amp+2,NULL,10);
  3424 		else if (amp[2]=='x' &&
  3425 		  amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
  3426 		    c=strtol(amp+3,NULL,16);
  3427 	    }
  3428 	    else
  3429 	    {
  3430 		s=g_strndup(amp+1,scolon-(amp+1));
  3431 	        c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
  3432 		g_free(s);
  3433 	    }
  3434 	}
  3435 	else
  3436 	    c=0;
  3437 	if (c)
  3438 	{
  3439 	    theline=amp;
  3440 	    if (c<128 || c>=192 && c<=255)	/* An ISO-8859-1 character */
  3441 		theline+=g_unichar_to_utf8(c,theline);
  3442 	    else
  3443 	    {
  3444 		s=g_malloc(6);
  3445 		nb=g_unichar_to_utf8(c,s);
  3446 		t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
  3447 		g_free(s);
  3448 		s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
  3449 		g_free(t);
  3450 		memcpy(theline,s,nb);
  3451 		g_free(s);
  3452 		theline+=nb;
  3453 	    }
  3454 	    memmove(theline,g_utf8_next_char(scolon),
  3455 	      strlen(g_utf8_next_char(scolon))+1);
  3456 	}
  3457 	else
  3458 	    theline=g_utf8_next_char(amp);
  3459     }
  3460 }
  3461 
  3462 gboolean tagcomp(const char *strin,const char *basetag)
  3463 {
  3464     gboolean retval;
  3465     gchar *s,*t;
  3466     if (g_utf8_get_char(strin)=='/')
  3467 	t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
  3468     else
  3469 	t=g_utf8_casefold(strin,-1);
  3470     s=g_utf8_casefold(basetag,-1);
  3471     retval=g_str_has_prefix(t,s);
  3472     g_free(s);
  3473     g_free(t);
  3474     return retval;
  3475 }
  3476 
  3477 void proghelp(GOptionContext *context)
  3478 {
  3479     gchar *help;
  3480     fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
  3481     fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
  3482     fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
  3483     fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
  3484       "For details, read the file COPYING.\n",stderr);
  3485     fputs("This is Free Software; "
  3486       "you may redistribute it under certain conditions (GPL);\n",stderr);
  3487     fputs("read the file COPYING for details.\n\n",stderr);
  3488     help=g_option_context_get_help(context,TRUE,NULL);
  3489     fputs(help,stderr);
  3490     g_free(help);
  3491     fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
  3492     fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
  3493       "non-ASCII\n",stderr);
  3494     fputs("characters like accented letters, "
  3495       "lines longer than 75 or shorter than 55,\n",stderr);
  3496     fputs("unbalanced quotes or brackets, "
  3497       "a variety of badly formatted punctuation, \n",stderr);
  3498     fputs("HTML tags, some likely typos. "
  3499       "It is NOT a substitute for human judgement.\n",stderr);
  3500     fputs("\n",stderr);
  3501 }