bookloupe/bookloupe.c
author ali <ali@juiblex.co.uk>
Mon Sep 30 07:37:36 2013 +0100 (2013-09-30)
changeset 137 b6358ed2548d
parent 135 a5d0e3c0af60
parent 136 2f3762ff90d8
child 138 5e27fa988c5c
permissions -rw-r--r--
Merge bug #14: Add a configuration file
     1 /*************************************************************************/
     2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
     3 /*									 */
     4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>			 */
     5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>			 */
     6 /*									 */
     7 /* This program is free software; you can redistribute it and/or modify  */
     8 /* it under the terms of the GNU General Public License as published by  */
     9 /* the Free Software Foundation; either version 2 of the License, or     */
    10 /* (at your option) any later version.					 */
    11 /*									 */
    12 /* This program is distributed in the hope that it will be useful,       */
    13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of	 */
    14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the		 */
    15 /* GNU General Public License for more details.				 */
    16 /*									 */
    17 /* You should have received a copy of the GNU General Public License	 */
    18 /* along with this program. If not, see <http://www.gnu.org/licenses/>.	 */
    19 /*************************************************************************/
    20 
    21 #include <stdio.h>
    22 #include <stdlib.h>
    23 #include <string.h>
    24 #include <ctype.h>
    25 #ifdef __WIN32__
    26 #include <windows.h>
    27 #endif
    28 #include <glib.h>
    29 #include <bl/bl.h>
    30 #include "bookloupe.h"
    31 #include "counters.h"
    32 #include "pending.h"
    33 #include "HTMLentities.h"
    34 
    35 gchar *charset;		/* Or NULL for auto (ISO_8859-1/ASCII or UNICODE) */
    36 GIConv charset_validator=(GIConv)-1;
    37 
    38 gchar *prevline;
    39 
/*
 * Common typos.
 *
 * Every entry is lower case and the list is terminated by an empty string.
 */
char *typo[] = {
    "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
    "nad", "te", "ig", "acn",  "ahve", "alot", "anbd", "andt", "awya", "aywa",
    "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
    "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
    "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
    "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
    "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
    "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
    "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
    "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
    "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
    "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
    "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
    "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
    "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
    "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
    "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
    "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
    "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
    "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
    "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
    "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
    "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
    "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
    "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
    "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
    "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
    "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
    "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
    "se", ""
};
    73 
    74 GTree *usertypo;
    75 
/*
 * Common abbreviations and other OK words not to query as typos.
 *
 * Every entry is lower case and the list is terminated by an empty string.
 */
char *okword[] = {
    "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
    "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
    "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
    "outbid", "outbids", "frostbite", "frostbitten", ""
};
    83 
/*
 * Common abbreviations that cause otherwise unexplained periods.
 *
 * Entries are given without the trailing period; the list is terminated
 * by an empty string.
 */
char *abbrev[] = {
    "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
    "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
};
    89 
/*
 * Two-Letter combinations that rarely if ever start words,
 * but are common scannos or otherwise common letter combinations.
 *
 * The list is terminated by an empty string.
 */
char *nostart[] = {
    "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
};
    97 
/*
 * Two-Letter combinations that rarely if ever end words,
 * but are common scannos or otherwise common letter combinations.
 *
 * The list is terminated by an empty string.
 */
char *noend[] = {
    "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
    "sw", "gr", "sl", "cl", "iy", ""
};
   106 
/*
 * Names of HTML elements, in lower case and without the angle brackets.
 * The list is terminated by an empty string.
 *
 * NOTE(review): presumably the table which the HTML tag checks compare
 * candidate tags against -- confirm against the code that reads it.
 */
char *markup[] = {
    "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
    "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
    "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
    "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
};
   113 
/*
 * Literal markup sequences, stored complete with their delimiters.
 * The list is terminated by an empty string.
 *
 * NOTE(review): presumably the DP-specific markup which the --dp switch
 * ("Ignore DP-specific markup") is concerned with -- confirm against the
 * code that reads this table.
 */
char *DPmarkup[] = {
    "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
};
   117 
/*
 * Lower case words, terminated by an empty string.
 *
 * NOTE(review): going by the name, presumably words which should not be
 * followed by a comma -- confirm against the code that reads this table.
 * NOTE(review): "mrs" appears twice; whether the duplicate is harmless
 * depends on how the table is consumed.
 */
char *nocomma[] = {
    "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
    "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
    "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
    "during", "let", "toward", "among", ""
};
   124 
/*
 * Lower case words, terminated by an empty string.
 *
 * NOTE(review): going by the name, presumably words which should not be
 * followed by a period -- confirm against the code that reads this table.
 */
char *noperiod[] = {
    "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
    "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
    "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
    "among", "those", "into", "whom", "having", "thence", ""
}; 
   131 
   132 gboolean pswit[SWITNO];  /* program switches */
   133 gchar *opt_charset;
   134 
   135 gboolean typo_compat,paranoid_compat;
   136 
   137 static GOptionEntry options[]={
   138     { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
   139       "Ignore DP-specific markup", NULL },
   140     { "no-dp", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   141       G_OPTION_ARG_NONE, pswit+DP_SWITCH,
   142       "Don't ignore DP-specific markup", NULL },
   143     { "echo", 0, G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
   144       "Echo queried line", NULL },
   145     { "no-echo", 'e', G_OPTION_FLAG_REVERSE,
   146       G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
   147       "Don't echo queried line", NULL },
   148     { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
   149       "Check single quotes", NULL },
   150     { "no-squote", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   151       G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
   152       "Don't check single quotes", NULL },
   153     { "typo", 0, 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
   154       "Check common typos", NULL },
   155     { "no-typo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   156       G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
   157       "Don't check common typos", NULL },
   158     { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
   159       "Require closure of quotes on every paragraph", NULL },
   160     { "no-qpara", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   161       G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
   162       "Don't require closure of quotes on every paragraph", NULL },
   163     { "paranoid", 0, G_OPTION_FLAG_HIDDEN,
   164       G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
   165       "Enable paranoid querying of everything", NULL },
   166     { "no-paranoid", 0, G_OPTION_FLAG_REVERSE,
   167       G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
   168       "Disable paranoid querying of everything", NULL },
   169     { "line-end", 0, G_OPTION_FLAG_HIDDEN,
   170       G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
   171       "Enable line end checking", NULL },
   172     { "no-line-end", 'l', G_OPTION_FLAG_REVERSE,
   173       G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
   174       "Diable line end checking", NULL },
   175     { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
   176       "Overview: just show counts", NULL },
   177     { "no-overview", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   178       G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
   179       "Show individual warnings", NULL },
   180     { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
   181       "Output errors to stdout instead of stderr", NULL },
   182     { "no-stdout", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   183       G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
   184       "Output errors to stderr instead of stdout", NULL },
   185     { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
   186       "Echo header fields", NULL },
   187     { "no-header", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   188       G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
   189       "Don't echo header fields", NULL },
   190     { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
   191       "Ignore markup in < >", NULL },
   192     { "no-markup", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   193       G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
   194       "No special handling for markup in < >", NULL },
   195     { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
   196       "Use file of user-defined typos", NULL },
   197     { "no-usertypo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   198       G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
   199       "Ignore file of user-defined typos", NULL },
   200     { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
   201       "Verbose - list everything", NULL },
   202     { "no-verbose", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   203       G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
   204       "Switch off verbose mode", NULL },
   205     { "charset", 0, 0, G_OPTION_ARG_STRING, &opt_charset,
   206       "Set of characters valid for this ebook", "NAME" },
   207     { NULL }
   208 };
   209 
/*
 * Options relating to configuration which make no sense from inside
 * a configuration file.
 *
 * They are kept in a table of their own, so the loops over options[] in
 * config_file_update(), config_file_add_comments() and
 * parse_config_file() never see them. parse_options() adds both tables
 * to the main option group.
 */

static GOptionEntry config_options[]={
    { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
      "Defaults for use on www upload", NULL },
    { "dump-config", 0, 0, G_OPTION_ARG_NONE, pswit+DUMP_CONFIG_SWITCH,
      "Dump current config settings", NULL },
    { NULL }
};
   222 
/*
 * Switches offered in the "compatibility" option group, which
 * parse_options() describes as "Options for Compatibility with Gutcheck".
 *
 * Rather than writing to pswit[] directly these set typo_compat and
 * paranoid_compat; parse_options() then toggles the affected switches
 * once the whole command line has been parsed.
 */
static GOptionEntry compatibility_options[]={
    { "toggle-typo", 't', 0, G_OPTION_ARG_NONE, &typo_compat,
      "Toggle checking for common typos", NULL },
    { "toggle-relaxed", 'x', 0, G_OPTION_ARG_NONE, &paranoid_compat,
      "Toggle both paranoid mode and common typos", NULL },
    { NULL }
};
   230 
   231 long cnt_dquot;		/* for overview mode, count of doublequote queries */
   232 long cnt_squot;		/* for overview mode, count of singlequote queries */
   233 long cnt_brack;		/* for overview mode, count of brackets queries */
   234 long cnt_bin;		/* for overview mode, count of non-ASCII queries */
   235 long cnt_odd;		/* for overview mode, count of odd character queries */
   236 long cnt_long;		/* for overview mode, count of long line errors */
   237 long cnt_short;		/* for overview mode, count of short line queries */
   238 long cnt_punct;		/* for overview mode,
   239 			   count of punctuation and spacing queries */
   240 long cnt_dash;		/* for overview mode, count of dash-related queries */
   241 long cnt_word;		/* for overview mode, count of word queries */
   242 long cnt_html;		/* for overview mode, count of html queries */
   243 long cnt_lineend;	/* for overview mode, count of line-end queries */
   244 long cnt_spacend;	/* count of lines with space at end */
   245 long linecnt;		/* count of total lines in the file */
   246 long checked_linecnt;	/* count of lines actually checked */
   247 
   248 void proghelp(GOptionContext *context);
   249 void procfile(const char *);
   250 
   251 gchar *running_from;
   252 
   253 gboolean mixdigit(const char *);
   254 gchar *getaword(const char **);
   255 char *flgets(char **,long);
   256 void postprocess_for_HTML(char *);
   257 char *linehasmarkup(char *);
   258 char *losemarkup(char *);
   259 gboolean tagcomp(const char *,const char *);
   260 void loseentities(char *);
   261 gboolean isroman(const char *);
   262 void postprocess_for_DP(char *);
   263 void print_as_windows_1252(const char *string);
   264 void print_as_utf_8(const char *string);
   265 
   266 GTree *qword,*qperiod;
   267 
   268 #ifdef __WIN32__
   269 UINT saved_cp;
   270 #endif
   271 
   272 gboolean set_charset(const char *name,GError **err)
   273 {
   274     /* The various UNICODE encodings all share the same character set. */
   275     const char *unicode_aliases[]={ "UCS-2", "UCS-2BE", "UCS-2LE", "UCS-4",
   276       "UCS-4BE", "UCS-4LE", "UCS2", "UCS4", "UNICODE", "UNICODEBIG",
   277       "UNICODELITTLE", "UTF-7", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE",
   278       "UTF-32", "UTF-32BE", "UTF-32LE", "UTF7", "UTF8", "UTF16", "UTF16BE",
   279       "UTF16LE", "UTF32", "UTF32BE", "UTF32LE" };
   280     int i;
   281     if (charset)
   282 	g_free(charset);
   283     if (charset_validator!=(GIConv)-1)
   284 	g_iconv_close(charset_validator);
   285     if (!name || !g_strcasecmp(name,"auto"))
   286     {
   287 	charset=NULL;
   288 	charset_validator=(GIConv)-1;
   289 	return TRUE;
   290     }
   291     else
   292 	charset=g_strdup(name);
   293     for(i=0;i<G_N_ELEMENTS(unicode_aliases);i++)
   294 	if (!g_strcasecmp(charset,unicode_aliases[i]))
   295 	{
   296 	    g_free(charset);
   297 	    charset=g_strdup("UTF-8");
   298 	    break;
   299 	}
   300     if (!strcmp(charset,"UTF-8"))
   301 	charset_validator=(GIConv)-1;
   302     else
   303     {
   304 	charset_validator=g_iconv_open(charset,"UTF-8");
   305 	if (charset_validator==(GIConv)-1)
   306 	{
   307 	    g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_NO_CONVERSION,
   308 	      "Unknown character set \"%s\"",charset);
   309 	    return FALSE;
   310 	}
   311     }
   312     return TRUE;
   313 }
   314 
   315 GKeyFile *config;
   316 
   317 void config_file_update(GKeyFile *kf)
   318 {
   319     int i;
   320     gboolean sw;
   321     for(i=0;options[i].long_name;i++)
   322     {
   323 	if (g_str_has_prefix(options[i].long_name,"no-"))
   324 	    continue;
   325 	if (options[i].arg==G_OPTION_ARG_NONE)
   326 	{
   327 	    sw=*(gboolean *)options[i].arg_data;
   328 	    if (options[i].flags&G_OPTION_FLAG_REVERSE)
   329 		sw=!sw;
   330 	    g_key_file_set_boolean(kf,"options",options[i].long_name,sw);
   331 	}
   332 	else
   333 	    g_assert_not_reached();
   334     }
   335 }
   336 
   337 void config_file_add_comments(GKeyFile *kf)
   338 {
   339     int i;
   340     gchar *comment;
   341     g_key_file_set_comment(kf,NULL,NULL," Default configuration for bookloupe",
   342       NULL);
   343     for(i=0;options[i].long_name;i++)
   344     {
   345 	if (g_str_has_prefix(options[i].long_name,"no-"))
   346 	    continue;
   347 	comment=g_strconcat(" ",options[i].description,NULL);
   348 	g_key_file_set_comment(kf,"options",options[i].long_name,comment,NULL);
   349 	g_free(comment);
   350     }
   351 }
   352 
   353 void dump_config(void)
   354 {
   355     gchar *s;
   356     if (config)
   357 	config_file_update(config);
   358     else
   359     {
   360 	config=g_key_file_new();
   361 	config_file_update(config);
   362 	config_file_add_comments(config);
   363     }
   364     s=g_key_file_to_data(config,NULL,NULL);
   365     if (s)
   366 	g_print("%s",s);
   367     g_free(s);
   368 }
   369 
   370 GKeyFile *read_config_file(gchar **full_path)
   371 {
   372     int i;
   373     GError *err=NULL;
   374     gchar **search_dirs;
   375     gchar *path;
   376     const char *search_path;
   377     GKeyFile *kf;
   378     kf=g_key_file_new();
   379     search_path=g_getenv("BOOKLOUPE_CONFIG_PATH");
   380     if (search_path)
   381     {
   382 #ifdef __WIN32__
   383 	search_dirs=g_strsplit(search_path,";",0);
   384 #else
   385 	search_dirs=g_strsplit(search_path,":",0);
   386 #endif
   387     }
   388     else
   389     {
   390 	search_dirs=g_new(gchar *,4);
   391 	search_dirs[0]=g_get_current_dir();
   392 	search_dirs[1]=g_strdup(running_from);
   393 	search_dirs[2]=g_strdup(g_get_user_config_dir());
   394 	search_dirs[3]=NULL;
   395     }
   396     for(i=0;search_dirs[i];i++)
   397     {
   398 	path=g_build_filename(search_dirs[i],"bookloupe.ini",NULL);
   399 	if (g_key_file_load_from_file(kf,path,
   400 	  G_KEY_FILE_KEEP_COMMENTS|G_KEY_FILE_KEEP_TRANSLATIONS,&err))
   401 	    break;
   402 	if (!g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   403 	{
   404 	    g_printerr("Bookloupe: Error reading %s\n",path);
   405 	    g_printerr("%s\n",err->message);
   406 	    exit(1);
   407 	}
   408 	g_clear_error(&err);
   409 	g_free(path);
   410 	path=NULL;
   411     }
   412     if (!search_dirs[i])
   413     {
   414 	g_key_file_free(kf);
   415 	kf=NULL;
   416     }
   417     g_strfreev(search_dirs);
   418     if (full_path && kf)
   419 	*full_path=path;
   420     else
   421 	g_free(path);
   422     return kf;
   423 }
   424 
   425 void parse_config_file(void)
   426 {
   427     int i,j;
   428     gchar *path;
   429     gchar **keys;
   430     gboolean sw;
   431     GError *err=NULL;
   432     config=read_config_file(&path);
   433     if (config)
   434 	keys=g_key_file_get_keys(config,"options",NULL,NULL);
   435     else
   436 	keys=NULL;
   437     if (keys)
   438     {
   439 	for(i=0;keys[i];i++)
   440 	{
   441 	    for(j=0;options[j].long_name;j++)
   442 	    {
   443 		if (g_str_has_prefix(options[j].long_name,"no-"))
   444 		    continue;
   445 		else if (!strcmp(keys[i],options[j].long_name))
   446 		{
   447 		    if (options[j].arg==G_OPTION_ARG_NONE)
   448 		    {
   449 			sw=g_key_file_get_boolean(config,"options",keys[i],
   450 			  &err);
   451 			if (err)
   452 			{
   453 			    g_printerr("Bookloupe: %s: options.%s: %s\n",
   454 			      path,keys[i],err->message);
   455 			    g_clear_error(&err);
   456 			}
   457 			if (options[j].flags&G_OPTION_FLAG_REVERSE)
   458 			    sw=!sw;
   459 			*(gboolean *)options[j].arg_data=sw;
   460 			break;
   461 		    }
   462 		    else
   463 			g_assert_not_reached();
   464 		}
   465 	    }
   466 	    if (!options[j].long_name)
   467 		g_printerr("Bookloupe: %s: Unknown option \"%s\" ignored\n",
   468 		  path,keys[i]);
   469 	}
   470 	g_strfreev(keys);
   471     }
   472     if (config)
   473 	g_free(path);
   474 }
   475 
   476 void parse_options(int *argc,char ***argv)
   477 {
   478     GError *err=NULL;
   479     GOptionContext *context;
   480     GOptionGroup *compatibility;
   481     context=g_option_context_new(
   482       "file - look for errors in Project Gutenberg(TM) etexts");
   483     g_option_context_add_main_entries(context,options,NULL);
   484     g_option_context_add_main_entries(context,config_options,NULL);
   485     compatibility=g_option_group_new("compatibility",
   486       "Options for Compatibility with Gutcheck:",
   487       "Show compatibility options",NULL,NULL);
   488     g_option_group_add_entries(compatibility,compatibility_options);
   489     g_option_context_add_group(context,compatibility);
   490     g_option_context_set_description(context,
   491       "For simplicity, only the switch options which reverse the\n"
   492       "default configuration are listed. In most cases, both vanilla\n"
   493       "and \"no-\" prefixed versions are available for use.");
   494     if (!g_option_context_parse(context,argc,argv,&err))
   495     {
   496 	g_printerr("Bookloupe: %s\n",err->message);
   497 	g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
   498 	exit(1);
   499     }
   500     if (typo_compat)
   501 	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
   502     if (paranoid_compat)
   503     {
   504 	pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
   505 	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
   506     }
   507     /*
   508      * Web uploads - for the moment, this is really just a placeholder
   509      * until we decide what processing we really want to do on web uploads
   510      */
   511     if (pswit[WEB_SWITCH])
   512     {
   513 	/* specific override for web uploads */
   514 	pswit[ECHO_SWITCH]=TRUE;
   515 	pswit[SQUOTE_SWITCH]=FALSE;
   516 	pswit[TYPO_SWITCH]=TRUE;
   517 	pswit[QPARA_SWITCH]=FALSE;
   518 	pswit[PARANOID_SWITCH]=TRUE;
   519 	pswit[LINE_END_SWITCH]=FALSE;
   520 	pswit[OVERVIEW_SWITCH]=FALSE;
   521 	pswit[STDOUT_SWITCH]=FALSE;
   522 	pswit[HEADER_SWITCH]=TRUE;
   523 	pswit[VERBOSE_SWITCH]=FALSE;
   524 	pswit[MARKUP_SWITCH]=FALSE;
   525 	pswit[USERTYPO_SWITCH]=FALSE;
   526 	pswit[DP_SWITCH]=FALSE;
   527     }
   528     if (opt_charset && !set_charset(opt_charset,&err))
   529     {
   530 	g_printerr("%s\n",err->message);
   531 	exit(1);
   532     }
   533     g_free(opt_charset);
   534     opt_charset=NULL;
   535     if (pswit[DUMP_CONFIG_SWITCH])
   536     {
   537 	dump_config();
   538 	exit(0);
   539     }
   540     if (pswit[OVERVIEW_SWITCH])
   541 	/* just print summary; don't echo */
   542 	pswit[ECHO_SWITCH]=FALSE;
   543     if (*argc<2)
   544     {
   545 	proghelp(context);
   546 	exit(1);
   547     }
   548     g_option_context_free(context);
   549 }
   550 
   551 /*
   552  * read_user_scannos:
   553  *
   554  * Read in the user-defined stealth scanno list.
   555  */
   556 void read_user_scannos(void)
   557 {
   558     GError *err=NULL;
   559     gchar *usertypo_file;
   560     gboolean okay;
   561     int i;
   562     gsize len,nb;
   563     gchar *contents,*utf8,**lines;
   564     usertypo_file=g_strdup("bookloupe.typ");
   565     okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   566     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   567     {
   568 	g_clear_error(&err);
   569 	g_free(usertypo_file);
   570 	usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
   571 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   572     }
   573     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   574     {
   575 	g_clear_error(&err);
   576 	g_free(usertypo_file);
   577 	usertypo_file=g_strdup("gutcheck.typ");
   578 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   579     }
   580     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   581     {
   582 	g_clear_error(&err);
   583 	g_free(usertypo_file);
   584 	usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
   585 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   586     }
   587     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   588     {
   589 	g_free(usertypo_file);
   590 	g_print("   --> I couldn't find bookloupe.typ "
   591 	  "-- proceeding without user typos.\n");
   592 	return;
   593     }
   594     else if (!okay)
   595     {
   596 	fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
   597 	g_free(usertypo_file);
   598 	g_clear_error(&err);
   599 	exit(1);
   600     }
   601     if (g_utf8_validate(contents,len,NULL))
   602     {
   603 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
   604 	if (!charset)
   605 	    (void)set_charset("UNICODE",NULL);
   606     }
   607     else
   608 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
   609     g_free(contents);
   610     lines=g_strsplit_set(utf8,"\r\n",0);
   611     g_free(utf8);
   612     usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
   613     for (i=0;lines[i];i++)
   614 	if (*(unsigned char *)lines[i]>'!')
   615 	    g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
   616 	else
   617 	    g_free(lines[i]);
   618     g_free(lines);
   619 }
   620 
   621 /*
   622  * read_etext:
   623  *
   624  * Read an etext returning a newly allocated string containing the file
   625  * contents or NULL on error.
   626  */
   627 gchar *read_etext(const char *filename,GError **err)
   628 {
   629     GError *tmp_err=NULL;
   630     gchar *contents,*utf8;
   631     gsize len,bytes_read,bytes_written;
   632     int i,line,col;
   633     if (!g_file_get_contents(filename,&contents,&len,err))
   634 	return NULL;
   635     if (g_utf8_validate(contents,len,NULL))
   636     {
   637 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
   638 	g_set_print_handler(print_as_utf_8);
   639 #ifdef __WIN32__
   640 	SetConsoleOutputCP(CP_UTF8);
   641 #endif
   642     }
   643     else
   644     {
   645 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,
   646 	  &bytes_written,&tmp_err);
   647 	if (g_error_matches(tmp_err,G_CONVERT_ERROR,
   648 	  G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
   649 	{
   650 	    line=col=1;
   651 	    for(i=0;i<bytes_read;i++)
   652 		if (contents[i]=='\n')
   653 		{
   654 		    line++;
   655 		    col=1;
   656 		}
   657 		else if (contents[i]!='\r')
   658 		    col++;
   659 	    g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
   660 	      "Input conversion failed. Byte %d at line %d, column %d is not a "
   661 	      "valid Windows-1252 character",
   662 	      ((unsigned char *)contents)[bytes_read],line,col);
   663 	}
   664 	else if (tmp_err)
   665 	    g_propagate_error(err,tmp_err);
   666 	g_set_print_handler(print_as_windows_1252);
   667 #ifdef __WIN32__
   668 	SetConsoleOutputCP(1252);
   669 #endif
   670     }
   671     g_free(contents);
   672     return utf8;
   673 }
   674 
   675 void cleanup_on_exit(void)
   676 {
   677 #ifdef __WIN32__
   678     SetConsoleOutputCP(saved_cp);
   679 #endif
   680 }
   681 
   682 int main(int argc,char **argv)
   683 {
   684 #ifdef __WIN32__
   685     atexit(cleanup_on_exit);
   686     saved_cp=GetConsoleOutputCP();
   687 #endif
   688     running_from=g_path_get_dirname(argv[0]);
   689     /* Paranoid checking is turned OFF, not on, by its switch */
   690     pswit[PARANOID_SWITCH]=TRUE;
   691     /* if running in paranoid mode, typo checks default to enabled */
   692     pswit[TYPO_SWITCH]=TRUE;
   693     /* Line-end checking is turned OFF, not on, by its switch */
   694     pswit[LINE_END_SWITCH]=TRUE;
   695     /* Echoing is turned OFF, not on, by its switch */
   696     pswit[ECHO_SWITCH]=TRUE;
   697     parse_config_file();
   698     parse_options(&argc,&argv);
   699     if (pswit[USERTYPO_SWITCH])
   700 	read_user_scannos();
   701     fprintf(stderr,"bookloupe: Check and report on an e-text\n");
   702     procfile(argv[1]);
   703     if (pswit[OVERVIEW_SWITCH])
   704     {
   705 	g_print("    Checked %ld lines of %ld (head+foot = %ld)\n\n",
   706 	  checked_linecnt,linecnt,linecnt-checked_linecnt);
   707 	g_print("    --------------- Queries found --------------\n");
   708 	if (cnt_long)
   709 	    g_print("    Long lines:		    %14ld\n",cnt_long);
   710 	if (cnt_short)
   711 	    g_print("    Short lines:		   %14ld\n",cnt_short);
   712 	if (cnt_lineend)
   713 	    g_print("    Line-end problems:	     %14ld\n",cnt_lineend);
   714 	if (cnt_word)
   715 	    g_print("    Common typos:		  %14ld\n",cnt_word);
   716 	if (cnt_dquot)
   717 	    g_print("    Unmatched quotes:	      %14ld\n",cnt_dquot);
   718 	if (cnt_squot)
   719 	    g_print("    Unmatched SingleQuotes:	%14ld\n",cnt_squot);
   720 	if (cnt_brack)
   721 	    g_print("    Unmatched brackets:	    %14ld\n",cnt_brack);
   722 	if (cnt_bin)
   723 	    g_print("    Non-ASCII characters:	  %14ld\n",cnt_bin);
   724 	if (cnt_odd)
   725 	    g_print("    Proofing characters:	   %14ld\n",cnt_odd);
   726 	if (cnt_punct)
   727 	    g_print("    Punctuation & spacing queries: %14ld\n",cnt_punct);
   728 	if (cnt_dash)
   729 	    g_print("    Non-standard dashes:	   %14ld\n",cnt_dash);
   730 	if (cnt_html)
   731 	    g_print("    Possible HTML tags:	    %14ld\n",cnt_html);
   732 	g_print("\n");
   733 	g_print("    TOTAL QUERIES		  %14ld\n",
   734 	  cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+
   735 	  cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend);
   736     }
   737     g_free(running_from);
   738     if (usertypo)
   739 	g_tree_unref(usertypo);
   740     set_charset(NULL,NULL);
   741     if (config)
   742 	g_key_file_free(config);
   743     return 0;
   744 }
   745 
   746 /*
   747  * first_pass:
   748  *
   749  * Run a first pass - verify that it's a valid PG
   750  * file, decide whether to report some things that
   751  * occur many times in the text like long or short
   752  * lines, non-standard dashes, etc.
   753  */
   754 struct first_pass_results *first_pass(const char *etext)
   755 {
   756     gunichar laststart=CHAR_SPACE;
   757     const char *s;
   758     gchar *lc_line;
   759     int i,j,lbytes,llen;
   760     gchar **lines;
   761     unsigned int lastlen=0,lastblen=0;
   762     long spline=0,nspline=0;
   763     static struct first_pass_results results={0};
   764     gchar *inword;
   765     lines=g_strsplit(etext,"\n",0);
   766     for (j=0;lines[j];j++)
   767     {
   768 	lbytes=strlen(lines[j]);
   769 	while (lbytes>0 && lines[j][lbytes-1]=='\r')
   770 	    lines[j][--lbytes]='\0';
   771 	llen=g_utf8_strlen(lines[j],lbytes);
   772 	linecnt++;
   773 	if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
   774 	  (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
   775 	{
   776 	    if (spline)
   777 		g_print("   --> Duplicate header?\n");
   778 	    spline=linecnt+1;   /* first line of non-header text, that is */
   779 	}
   780 	if (!strncmp(lines[j],"*** START",9) &&
   781 	  strstr(lines[j],"PROJECT GUTENBERG"))
   782 	{
   783 	    if (nspline)
   784 		g_print("   --> Duplicate header?\n");
   785 	    nspline=linecnt+1;   /* first line of non-header text, that is */
   786 	}
   787 	if (spline || nspline)
   788 	{
   789 	    lc_line=g_utf8_strdown(lines[j],lbytes);
   790 	    if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
   791 	    {
   792 		if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
   793 		{
   794 		    if (results.footerline)
   795 		    {
   796 			/* it's an old-form header - we can detect duplicates */
   797 			if (!nspline)
   798 			    g_print("   --> Duplicate footer?\n");
   799 		    }
   800 		    else
   801 			results.footerline=linecnt;
   802 		}
   803 	    }
   804 	    g_free(lc_line);
   805 	}
   806 	if (spline)
   807 	    results.firstline=spline;
   808 	if (nspline)
   809 	    results.firstline=nspline;  /* override with new */
   810 	if (results.footerline)
   811 	    continue;    /* don't count the boilerplate in the footer */
   812 	results.totlen+=llen;
   813 	for (s=lines[j];*s;s=g_utf8_next_char(s))
   814 	{
   815 	    if (g_utf8_get_char(s)>127)
   816 		results.binlen++;
   817 	    if (g_unichar_isalpha(g_utf8_get_char(s)))
   818 		results.alphalen++;
   819 	    if (s>lines[j] && g_utf8_get_char(s)==CHAR_DQUOTE &&
   820 	      isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
   821 		results.endquote_count++;
   822 	}
   823 	if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
   824 	  lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
   825 	    results.shortline++;
   826 	if (lbytes>0 &&
   827 	  g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
   828 	    cnt_spacend++;
   829 	if (strstr(lines[j],".,"))
   830 	    results.dotcomma++;
   831 	/* only count ast lines for ignoring purposes where there is */
   832 	/* locase text on the line */
   833 	if (strchr(lines[j],'*'))
   834 	{
   835 	    for (s=lines[j];*s;s=g_utf8_next_char(s))
   836 		if (g_unichar_islower(g_utf8_get_char(s)))
   837 		    break;
   838 	    if (*s)
   839 		results.astline++;
   840 	}
   841 	if (strchr(lines[j],'/'))
   842 	    results.fslashline++;
   843 	if (lbytes>0)
   844 	{
   845 	    for (s=g_utf8_prev_char(lines[j]+lbytes);
   846 	      s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;
   847 	      s=g_utf8_prev_char(s))
   848 		;
   849 	    if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
   850 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')
   851 		results.hyphens++;
   852 	}
   853 	if (llen>LONGEST_PG_LINE)
   854 	    results.longline++;
   855 	if (llen>WAY_TOO_LONG)
   856 	    results.verylongline++;
   857 	if (strchr(lines[j],'<') && strchr(lines[j],'>'))
   858 	{
   859 	    i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
   860 	    if (i>0)
   861 		results.htmcount++;
   862 	    if (strstr(lines[j],"<i>"))
   863 		results.htmcount+=4; /* bonus marks! */
   864 	}
   865 	/* Check for spaced em-dashes */
   866 	if (lines[j][0] && (s=strstr(g_utf8_next_char(lines[j]),"--")))
   867 	{
   868 	    results.emdash++;
   869 	    if (s[-1]==CHAR_SPACE || s[2]==CHAR_SPACE)
   870 		results.space_emdash++;
   871 	    if (s[-1]==CHAR_SPACE && s[2]==CHAR_SPACE)
   872 		/* count of em-dashes with spaces both sides */
   873 		results.non_PG_space_emdash++;
   874 	    if (s[-1]!=CHAR_SPACE && s[2]!=CHAR_SPACE)
   875 		/* count of PG-type em-dashes with no spaces */
   876 		results.PG_space_emdash++;
   877 	}
   878 	for (s=lines[j];*s;)
   879 	{
   880 	    inword=getaword(&s);
   881 	    if (!strcmp(inword,"hij") || !strcmp(inword,"niet")) 
   882 		results.Dutchcount++;
   883 	    if (!strcmp(inword,"dans") || !strcmp(inword,"avec")) 
   884 		results.Frenchcount++;
   885 	    if (!strcmp(inword,"0") || !strcmp(inword,"1")) 
   886 		results.standalone_digit++;
   887 	    g_free(inword);
   888 	}
   889 	/* Check for spaced dashes */
   890 	if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
   891 	    results.spacedash++;
   892 	lastblen=lastlen;
   893 	lastlen=llen;
   894 	laststart=lines[j][0];
   895     }
   896     g_strfreev(lines);
   897     return &results;
   898 }
   899 
   900 /*
   901  * report_first_pass:
   902  *
   903  * Make some snap decisions based on the first pass results.
   904  */
   905 struct warnings *report_first_pass(struct first_pass_results *results)
   906 {
   907     static struct warnings warnings={0};
   908     if (cnt_spacend>0)
   909 	g_print("   --> %ld lines in this file have white space at end\n",
   910 	  cnt_spacend);
   911     warnings.dotcomma=1;
   912     if (results->dotcomma>5)
   913     {
   914 	warnings.dotcomma=0;
   915 	g_print("   --> %ld lines in this file contain '.,'. "
   916 	  "Not reporting them.\n",results->dotcomma);
   917     }
   918     /*
   919      * If more than 50 lines, or one-tenth, are short,
   920      * don't bother reporting them.
   921      */
   922     warnings.shortline=1;
   923     if (results->shortline>50 || results->shortline*10>linecnt)
   924     {
   925 	warnings.shortline=0;
   926 	g_print("   --> %ld lines in this file are short. "
   927 	  "Not reporting short lines.\n",results->shortline);
   928     }
   929     /*
   930      * If more than 50 lines, or one-tenth, are long,
   931      * don't bother reporting them.
   932      */
   933     warnings.longline=1;
   934     if (results->longline>50 || results->longline*10>linecnt)
   935     {
   936 	warnings.longline=0;
   937 	g_print("   --> %ld lines in this file are long. "
   938 	  "Not reporting long lines.\n",results->longline);
   939     }
   940     /* If more than 10 lines contain asterisks, don't bother reporting them. */
   941     warnings.ast=1;
   942     if (results->astline>10)
   943     {
   944 	warnings.ast=0;
   945 	g_print("   --> %ld lines in this file contain asterisks. "
   946 	  "Not reporting them.\n",results->astline);
   947     }
   948     /*
   949      * If more than 10 lines contain forward slashes,
   950      * don't bother reporting them.
   951      */
   952     warnings.fslash=1;
   953     if (results->fslashline>10)
   954     {
   955 	warnings.fslash=0;
   956 	g_print("   --> %ld lines in this file contain forward slashes. "
   957 	  "Not reporting them.\n",results->fslashline);
   958     }
   959     /*
   960      * If more than 20 lines contain unpunctuated endquotes,
   961      * don't bother reporting them.
   962      */
   963     warnings.endquote=1;
   964     if (results->endquote_count>20)
   965     {
   966 	warnings.endquote=0;
   967 	g_print("   --> %ld lines in this file contain unpunctuated endquotes. "
   968 	  "Not reporting them.\n",results->endquote_count);
   969     }
   970     /*
   971      * If more than 15 lines contain standalone digits,
   972      * don't bother reporting them.
   973      */
   974     warnings.digit=1;
   975     if (results->standalone_digit>10)
   976     {
   977 	warnings.digit=0;
   978 	g_print("   --> %ld lines in this file contain standalone 0s and 1s. "
   979 	  "Not reporting them.\n",results->standalone_digit);
   980     }
   981     /*
   982      * If more than 20 lines contain hyphens at end,
   983      * don't bother reporting them.
   984      */
   985     warnings.hyphen=1;
   986     if (results->hyphens>20)
   987     {
   988 	warnings.hyphen=0;
   989 	g_print("   --> %ld lines in this file have hyphens at end. "
   990 	  "Not reporting them.\n",results->hyphens);
   991     }
   992     if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
   993     {
   994 	g_print("   --> Looks like this is HTML. Switching HTML mode ON.\n");
   995 	pswit[MARKUP_SWITCH]=1;
   996     }
   997     if (results->verylongline>0)
   998 	g_print("   --> %ld lines in this file are VERY long!\n",
   999 	  results->verylongline);
  1000     /*
  1001      * If there are more non-PG spaced dashes than PG em-dashes,
  1002      * assume it's deliberate.
  1003      * Current PG guidelines say don't use them, but older texts do,
  1004      * and some people insist on them whatever the guidelines say.
  1005      */
  1006     warnings.dash=1;
  1007     if (results->spacedash+results->non_PG_space_emdash>
  1008       results->PG_space_emdash)
  1009     {
  1010 	warnings.dash=0;
  1011 	g_print("   --> There are %ld spaced dashes and em-dashes. "
  1012 	  "Not reporting them.\n",
  1013 	  results->spacedash+results->non_PG_space_emdash);
  1014     }
  1015     if (charset)
  1016 	warnings.bin=0;
  1017     else
  1018     {
  1019 	/* Charset ISO_8859-1/ASCII checks for compatibility with gutcheck */
  1020 	warnings.bin=1;
  1021 	/* If more than a quarter of characters are hi-bit, bug out. */
  1022 	if (results->binlen*4>results->totlen)
  1023 	{
  1024 	    g_print("   --> This file does not appear to be ASCII. "
  1025 	      "Terminating. Best of luck with it!\n");
  1026 	    exit(1);
  1027 	}
  1028 	if (results->alphalen*4<results->totlen)
  1029 	{
  1030 	    g_print("   --> This file does not appear to be text. "
  1031 	      "Terminating. Best of luck with it!\n");
  1032 	    exit(1);
  1033 	}
  1034 	if (results->binlen*100>results->totlen || results->binlen>100)
  1035 	{
  1036 	    g_print("   --> There are a lot of foreign letters here. "
  1037 	      "Not reporting them.\n");
  1038 	    if (!pswit[VERBOSE_SWITCH])
  1039 		warnings.bin=0;
  1040 	}
  1041     }
  1042     warnings.isDutch=FALSE;
  1043     if (results->Dutchcount>50)
  1044     {
  1045 	warnings.isDutch=TRUE;
  1046 	g_print("   --> This looks like Dutch - "
  1047 	  "switching off dashes and warnings for 's Middags case.\n");
  1048     }
  1049     warnings.isFrench=FALSE;
  1050     if (results->Frenchcount>50)
  1051     {
  1052 	warnings.isFrench=TRUE;
  1053 	g_print("   --> This looks like French - "
  1054 	  "switching off some doublepunct.\n");
  1055     }
  1056     if (results->firstline && results->footerline)
  1057 	g_print("    The PG header and footer appear to be already on.\n");
  1058     else
  1059     {
  1060 	if (results->firstline)
  1061 	    g_print("    The PG header is on - no footer.\n");
  1062 	if (results->footerline)
  1063 	    g_print("    The PG footer is on - no header.\n");
  1064     }
  1065     g_print("\n");
  1066     if (pswit[VERBOSE_SWITCH])
  1067     {
  1068 	warnings.shortline=1;
  1069 	warnings.dotcomma=1;
  1070 	warnings.longline=1;
  1071 	warnings.dash=1;
  1072 	warnings.digit=1;
  1073 	warnings.ast=1;
  1074 	warnings.fslash=1;
  1075 	warnings.hyphen=1;
  1076 	warnings.endquote=1;
  1077 	g_print("   *** Verbose output is ON -- you asked for it! ***\n");
  1078     }
  1079     if (warnings.isDutch)
  1080 	warnings.dash=0;
  1081     if (results->footerline>0 && results->firstline>0 &&
  1082       results->footerline>results->firstline &&
  1083       results->footerline-results->firstline<100)
  1084     {
  1085 	g_print("   --> I don't really know where this text starts. \n");
  1086 	g_print("       There are no reference points.\n");
  1087 	g_print("       I'm going to have to report the header and footer "
  1088 	  "as well.\n");
  1089 	results->firstline=0;
  1090     }
  1091     return &warnings;
  1092 }
  1093 
  1094 /*
  1095  * analyse_quotes:
  1096  *
  1097  * Look along the line, accumulate the count of quotes, and see
  1098  * if this is an empty line - i.e. a line with nothing on it
  1099  * but spaces.
  1100  * If line has just spaces, period, * and/or - on it, don't
  1101  * count it, since empty lines with asterisks or dashes to
  1102  * separate sections are common.
  1103  *
  1104  * Returns: TRUE if the line is empty.
  1105  */
  1106 gboolean analyse_quotes(const char *aline,struct counters *counters)
  1107 {
  1108     int guessquote=0;
  1109     /* assume the line is empty until proven otherwise */
  1110     gboolean isemptyline=TRUE;
  1111     const char *s=aline,*sprev,*snext;
  1112     gunichar c;
  1113     sprev=NULL;
  1114     while (*s)
  1115     {
  1116 	snext=g_utf8_next_char(s);
  1117 	c=g_utf8_get_char(s);
  1118 	if (c==CHAR_DQUOTE)
  1119 	    counters->quot++;
  1120 	if (CHAR_IS_SQUOTE(c))
  1121 	{
  1122 	    if (s==aline)
  1123 	    {
  1124 		/*
  1125 		 * At start of line, it can only be an openquote.
  1126 		 * Hardcode a very common exception!
  1127 		 */
  1128 		if (!g_str_has_prefix(snext,"tis") &&
  1129 		  !g_str_has_prefix(snext,"Tis"))
  1130 		    increment_matching(counters,c,TRUE);
  1131 	    }
  1132 	    else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
  1133 	      g_unichar_isalpha(g_utf8_get_char(snext)))
  1134 		/* Do nothing! it's definitely an apostrophe, not a quote */
  1135 		;
  1136 	    /* it's outside a word - let's check it out */
  1137 	    else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||
  1138 	      g_unichar_isalpha(g_utf8_get_char(snext)))
  1139 	    {
  1140 		/* it damwell better BE an openquote */
  1141 		if (!g_str_has_prefix(snext,"tis") &&
  1142 		  !g_str_has_prefix(snext,"Tis"))
  1143 		    /* hardcode a very common exception! */
  1144 		    increment_matching(counters,c,TRUE);
  1145 	    }
  1146 	    else
  1147 	    {
  1148 		/* now - is it a closequote? */
  1149 		guessquote=0;   /* accumulate clues */
  1150 		if (g_unichar_isalpha(g_utf8_get_char(sprev)))
  1151 		{
  1152 		    /* it follows a letter - could be either */
  1153 		    guessquote++;
  1154 		    if (g_utf8_get_char(sprev)=='s')
  1155 		    {
  1156 			/* looks like a plural apostrophe */
  1157 			guessquote-=3;
  1158 			if (g_utf8_get_char(snext)==CHAR_SPACE)
  1159 			    /* bonus marks! */
  1160 			    guessquote-=2;
  1161 		    }
  1162 		}
  1163 		/* it doesn't have a letter either side */
  1164 		else if (strchr(".?!,;:",g_utf8_get_char(sprev)) &&
  1165 		  strchr(".?!,;: ",g_utf8_get_char(snext)))
  1166 		    guessquote+=8; /* looks like a closequote */
  1167 		else
  1168 		    guessquote++;
  1169 		if (matching_difference(counters,CHAR_SQUOTE)>0)
  1170 		    /*
  1171 		     * Give it the benefit of some doubt,
  1172 		     * if a squote is already open.
  1173 		     */
  1174 		    guessquote++;
  1175 		else
  1176 		    guessquote--;
  1177 		if (guessquote>=0)
  1178 		    increment_matching(counters,c,FALSE);
  1179 	    }
  1180 	}
  1181 	if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
  1182 	  c!='\r' && c!='\n')
  1183 	    isemptyline=FALSE;  /* ignore lines like  *  *  *  as spacers */
  1184 	if (c==CHAR_UNDERSCORE)
  1185 	    counters->c_unders++;
  1186 	if (c==CHAR_OPEN_SBRACK)
  1187 	{
  1188 	    if (!matching_difference(counters,COUNTER_ILLUSTRATION) &&
  1189 	      !matching_difference(counters,c) && s==aline &&
  1190 	      g_str_has_prefix(s,"[Illustration:"))
  1191 		increment_matching(counters,COUNTER_ILLUSTRATION,TRUE);
  1192 	    else
  1193 		increment_matching(counters,c,TRUE);
  1194 	}
  1195 	else if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK)
  1196 	    increment_matching(counters,c,TRUE);
  1197 	if (c==CHAR_CLOSE_SBRACK)
  1198 	{
  1199 	    if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) &&
  1200 	      !matching_difference(counters,c) && !*snext)
  1201 		increment_matching(counters,COUNTER_ILLUSTRATION,FALSE);
  1202 	    else
  1203 		increment_matching(counters,c,FALSE);
  1204 	}
  1205 	else if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK)
  1206 	    increment_matching(counters,c,FALSE);
  1207 	sprev=s;
  1208 	s=snext;
  1209     }
  1210     return isemptyline;
  1211 }
  1212 
  1213 /*
  1214  * check_for_control_characters:
  1215  *
  1216  * Check for invalid or questionable characters in the line
  1217  * Anything above 127 is invalid for plain ASCII, and
  1218  * non-printable control characters should also be flagged.
  1219  * Tabs should generally not be there.
  1220  */
  1221 void check_for_control_characters(const char *aline)
  1222 {
  1223     gunichar c;
  1224     const char *s;
  1225     for (s=aline;*s;s=g_utf8_next_char(s))
  1226     {
  1227 	c=g_utf8_get_char(s);
  1228 	if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
  1229 	{
  1230 	    if (pswit[ECHO_SWITCH])
  1231 		g_print("\n%s\n",aline);
  1232 	    if (!pswit[OVERVIEW_SWITCH])
  1233 		g_print("    Line %ld column %ld - Control character %u\n",
  1234 		  linecnt,g_utf8_pointer_to_offset(s,aline)+1,c);
  1235 	    else
  1236 		cnt_bin++;
  1237 	}
  1238     }
  1239 }
  1240 
  1241 /*
  1242  * check_for_odd_characters:
  1243  *
  1244  * Check for binary and other odd characters.
  1245  */
  1246 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
  1247   gboolean isemptyline)
  1248 {
  1249     /* Don't repeat multiple warnings on one line. */
  1250     gboolean eInvalidChar=FALSE,eTab=FALSE,eTilde=FALSE;
  1251     gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
  1252     const char *s;
  1253     gunichar c;
  1254     gsize nb;
  1255     gchar *t;
  1256     for (s=aline;*s;s=g_utf8_next_char(s))
  1257     {
  1258 	c=g_utf8_get_char(s);
  1259 	if (warnings->bin && !eInvalidChar &&
  1260 	  (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
  1261 	{
  1262 	    if (pswit[ECHO_SWITCH])
  1263 		g_print("\n%s\n",aline);
  1264 	    if (!pswit[OVERVIEW_SWITCH])
  1265 		if (c>127 && c<160 || c>255)
  1266 		    g_print("    Line %ld column %ld - "
  1267 		      "Non-ISO-8859 character %u\n",
  1268 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1269 		else
  1270 		    g_print("    Line %ld column %ld - "
  1271 		      "Non-ASCII character %u\n",
  1272 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1273 	    else
  1274 		cnt_bin++;
  1275 	    eInvalidChar=TRUE;
  1276 	}
  1277 	if (!eInvalidChar && charset)
  1278 	{
  1279 	    if (charset_validator==(GIConv)-1)
  1280 	    {
  1281 		if (!g_unichar_isdefined(c))
  1282 		{
  1283 		    if (pswit[ECHO_SWITCH])
  1284 			g_print("\n%s\n",aline);
  1285 		    if (!pswit[OVERVIEW_SWITCH])
  1286 			g_print("    Line %ld column %ld - Unassigned UNICODE "
  1287 			  "code point U+%04" G_GINT32_MODIFIER "X\n",
  1288 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1289 		    else
  1290 			cnt_bin++;
  1291 		    eInvalidChar=TRUE;
  1292 		}
  1293 		else if (c>=0xE000 && c<=0xF8FF || c>=0xF0000 && c<=0xFFFFD ||
  1294 		  c>=100000 && c<=0x10FFFD)
  1295 		{
  1296 		    if (pswit[ECHO_SWITCH])
  1297 			g_print("\n%s\n",aline);
  1298 		    if (!pswit[OVERVIEW_SWITCH])
  1299 			g_print("    Line %ld column %ld - Private Use "
  1300 			  "character U+%04" G_GINT32_MODIFIER "X\n",
  1301 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1302 		    else
  1303 			cnt_bin++;
  1304 		    eInvalidChar=TRUE;
  1305 		}
  1306 	    }
  1307 	    else
  1308 	    {
  1309 		t=g_convert_with_iconv(s,g_utf8_next_char(s)-s,
  1310 		  charset_validator,NULL,&nb,NULL);
  1311 		if (t)
  1312 		    g_free(t);
  1313 		else
  1314 		{
  1315 		    if (pswit[ECHO_SWITCH])
  1316 			g_print("\n%s\n",aline);
  1317 		    if (!pswit[OVERVIEW_SWITCH])
  1318 			g_print("    Line %ld column %ld - Non-%s "
  1319 			  "character %u\n",linecnt,
  1320 			  g_utf8_pointer_to_offset(aline,s)+1,charset,c);
  1321 		    else
  1322 			cnt_bin++;
  1323 		    eInvalidChar=TRUE;
  1324 		}
  1325 	    }
  1326 	}
  1327 	if (!eTab && c==CHAR_TAB)
  1328 	{
  1329 	    if (pswit[ECHO_SWITCH])
  1330 		g_print("\n%s\n",aline);
  1331 	    if (!pswit[OVERVIEW_SWITCH])
  1332 		g_print("    Line %ld column %ld - Tab character?\n",
  1333 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1334 	    else
  1335 		cnt_odd++;
  1336 	    eTab=TRUE;
  1337 	}
  1338 	if (!eTilde && c==CHAR_TILDE)
  1339 	{
  1340 	    /*
  1341 	     * Often used by OCR software to indicate an
  1342 	     * unrecognizable character.
  1343 	     */
  1344 	    if (pswit[ECHO_SWITCH])
  1345 		g_print("\n%s\n",aline);
  1346 	    if (!pswit[OVERVIEW_SWITCH])
  1347 		g_print("    Line %ld column %ld - Tilde character?\n",
  1348 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1349 	    else
  1350 		cnt_odd++;
  1351 	    eTilde=TRUE;
  1352 	}
  1353 	if (!eCarat && c==CHAR_CARAT)
  1354 	{  
  1355 	    if (pswit[ECHO_SWITCH])
  1356 		g_print("\n%s\n",aline);
  1357 	    if (!pswit[OVERVIEW_SWITCH])
  1358 		g_print("    Line %ld column %ld - Carat character?\n",
  1359 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1360 	    else
  1361 		cnt_odd++;
  1362 	    eCarat=TRUE;
  1363 	}
  1364 	if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
  1365 	{  
  1366 	    if (pswit[ECHO_SWITCH])
  1367 		g_print("\n%s\n",aline);
  1368 	    if (!pswit[OVERVIEW_SWITCH])
  1369 		g_print("    Line %ld column %ld - Forward slash?\n",
  1370 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1371 	    else
  1372 		cnt_odd++;
  1373 	    eFSlash=TRUE;
  1374 	}
  1375 	/*
  1376 	 * Report asterisks only in paranoid mode,
  1377 	 * since they're often deliberate.
  1378 	 */
  1379 	if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
  1380 	  c==CHAR_ASTERISK)
  1381 	{
  1382 	    if (pswit[ECHO_SWITCH])
  1383 		g_print("\n%s\n",aline);
  1384 	    if (!pswit[OVERVIEW_SWITCH])
  1385 		g_print("    Line %ld column %ld - Asterisk?\n",
  1386 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1387 	    else
  1388 		cnt_odd++;
  1389 	    eAst=TRUE;
  1390 	}
  1391     }
  1392 }
  1393 
  1394 /*
  1395  * check_for_long_line:
  1396  *
  1397  * Check for line too long.
  1398  */
  1399 void check_for_long_line(const char *aline)
  1400 {
  1401     if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
  1402     {
  1403 	if (pswit[ECHO_SWITCH])
  1404 	    g_print("\n%s\n",aline);
  1405 	if (!pswit[OVERVIEW_SWITCH])
  1406 	    g_print("    Line %ld column %ld - Long line %ld\n",
  1407 	      linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
  1408 	else
  1409 	    cnt_long++;
  1410     }
  1411 }
  1412 
  1413 /*
  1414  * check_for_short_line:
  1415  *
  1416  * Check for line too short.
  1417  *
  1418  * This one is a bit trickier to implement: we don't want to
  1419  * flag the last line of a paragraph for being short, so we
  1420  * have to wait until we know that our current line is a
  1421  * "normal" line, then report the _previous_ line if it was too
  1422  * short. We also don't want to report indented lines like
  1423  * chapter heads or formatted quotations. We therefore keep
  1424  * last->len as the length of the last line examined, and
  1425  * last->blen as the length of the last but one, and try to
  1426  * suppress unnecessary warnings by checking that both were of
  1427  * "normal" length. We keep the first character of the last
  1428  * line in last->start, and if it was a space, we assume that
  1429  * the formatting is deliberate. I can't figure out a way to
  1430  * distinguish something like a quoted verse left-aligned or
  1431  * the header or footer of a letter from a paragraph of short
  1432  * lines - maybe if I examined the whole paragraph, and if the
  1433  * para has less than, say, 8 lines and if all lines are short,
  1434  * then just assume it's OK? Need to look at some texts to see
  1435  * how often a formula like this would get the right result.
  1436  */
  1437 void check_for_short_line(const char *aline,const struct line_properties *last)
  1438 {
  1439     if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
  1440       last->len<SHORTEST_PG_LINE && last->blen>1 &&
  1441       last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
  1442     {
  1443 	if (pswit[ECHO_SWITCH])
  1444 	    g_print("\n%s\n",prevline);
  1445 	if (!pswit[OVERVIEW_SWITCH])
  1446 	    g_print("    Line %ld column %ld - Short line %ld?\n",
  1447 	      linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
  1448 	else
  1449 	    cnt_short++;
  1450     }
  1451 }
  1452 
  1453 /*
  1454  * check_for_starting_punctuation:
  1455  *
  1456  * Look for punctuation other than full ellipses at start of line.
  1457  */
  1458 void check_for_starting_punctuation(const char *aline)
  1459 {
  1460     if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
  1461       !g_str_has_prefix(aline,". . ."))
  1462     {
  1463 	if (pswit[ECHO_SWITCH])
  1464 	    g_print("\n%s\n",aline);
  1465 	if (!pswit[OVERVIEW_SWITCH])
  1466 	    g_print("    Line %ld column 1 - Begins with punctuation?\n",
  1467 	      linecnt);
  1468 	else
  1469 	    cnt_punct++;
  1470     }
  1471 }
  1472 
  1473 /*
  1474  * check_for_spaced_emdash:
  1475  *
  1476  * Check for spaced em-dashes.
  1477  *
  1478  * We must check _all_ occurrences of "--" on the line
  1479  * hence the loop - even if the first double-dash is OK
  1480  * there may be another that's wrong later on.
  1481  */
  1482 void check_for_spaced_emdash(const char *aline)
  1483 {
  1484     const char *s,*t,*next;
  1485     for (s=aline;t=strstr(s,"--");s=next)
  1486     {
  1487 	next=g_utf8_next_char(g_utf8_next_char(t));
  1488 	if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
  1489 	  g_utf8_get_char(next)==CHAR_SPACE)
  1490 	{
  1491 	    if (pswit[ECHO_SWITCH])
  1492 		g_print("\n%s\n",aline);
  1493 	    if (!pswit[OVERVIEW_SWITCH])
  1494 		g_print("    Line %ld column %ld - Spaced em-dash?\n",
  1495 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  1496 	    else
  1497 		cnt_dash++;
  1498 	}
  1499     }
  1500 }
  1501 
  1502 /*
  1503  * check_for_spaced_dash:
  1504  *
  1505  * Check for spaced dashes.
  1506  */
  1507 void check_for_spaced_dash(const char *aline)
  1508 {
  1509     const char *s;
  1510     if ((s=strstr(aline," -")))
  1511     {
  1512 	if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
  1513 	{
  1514 	    if (pswit[ECHO_SWITCH])
  1515 		g_print("\n%s\n",aline);
  1516 	    if (!pswit[OVERVIEW_SWITCH])
  1517 		g_print("    Line %ld column %ld - Spaced dash?\n",
  1518 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1519 	    else
  1520 		cnt_dash++;
  1521 	}
  1522     }
  1523     else if ((s=strstr(aline,"- ")))
  1524     {
  1525 	if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
  1526 	{
  1527 	    if (pswit[ECHO_SWITCH])
  1528 		g_print("\n%s\n",aline);
  1529 	    if (!pswit[OVERVIEW_SWITCH])
  1530 		g_print("    Line %ld column %ld - Spaced dash?\n",
  1531 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1532 	    else
  1533 		cnt_dash++;
  1534 	}
  1535     }
  1536 }
  1537 
  1538 /*
  1539  * check_for_unmarked_paragraphs:
  1540  *
  1541  * Check for unmarked paragraphs indicated by separate speakers.
  1542  *
  1543  * May well be false positive:
  1544  * "Bravo!" "Wonderful!" called the crowd.
  1545  * but useful all the same.
  1546  */
  1547 void check_for_unmarked_paragraphs(const char *aline)
  1548 {
  1549     const char *s;
  1550     s=strstr(aline,"\"  \"");
  1551     if (!s)
  1552 	s=strstr(aline,"\" \"");
  1553     if (s)
  1554     {
  1555 	if (pswit[ECHO_SWITCH])
  1556 	    g_print("\n%s\n",aline);
  1557 	if (!pswit[OVERVIEW_SWITCH])
  1558 	    g_print("    Line %ld column %ld - "
  1559 	      "Query missing paragraph break?\n",
  1560 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1561 	else
  1562 	    cnt_punct++;
  1563     }
  1564 }
  1565 
  1566 /*
  1567  * check_for_jeebies:
  1568  *
  1569  * Check for "to he" and other easy h/b errors.
  1570  *
  1571  * This is a very inadequate effort on the h/b problem,
  1572  * but the phrase "to he" is always an error, whereas "to
  1573  * be" is quite common.
  1574  * Similarly, '"Quiet!", be said.' is a non-be error
  1575  * "to he" is _not_ always an error!:
  1576  *       "Where they went to he couldn't say."
  1577  * Another false positive:
  1578  *       What would "Cinderella" be without the . . .
  1579  * and another: "If he wants to he can see for himself."
  1580  */
  1581 void check_for_jeebies(const char *aline)
  1582 {
  1583     const char *s;
  1584     s=strstr(aline," be could ");
  1585     if (!s)
  1586 	s=strstr(aline," be would ");
  1587     if (!s)
  1588 	s=strstr(aline," was be ");
  1589     if (!s)
  1590 	s=strstr(aline," be is ");
  1591     if (!s)
  1592 	s=strstr(aline," is be ");
  1593     if (!s)
  1594 	s=strstr(aline,"\", be ");
  1595     if (!s)
  1596 	s=strstr(aline,"\" be ");
  1597     if (!s)
  1598 	s=strstr(aline,"\" be ");
  1599     if (!s)
  1600 	s=strstr(aline," to he ");
  1601     if (s)
  1602     {
  1603 	if (pswit[ECHO_SWITCH])
  1604 	    g_print("\n%s\n",aline);
  1605 	if (!pswit[OVERVIEW_SWITCH])
  1606 	    g_print("    Line %ld column %ld - Query he/be error?\n",
  1607 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1608 	else
  1609 	    cnt_word++;
  1610     }
  1611     s=strstr(aline," the had ");
  1612     if (!s)
  1613 	s=strstr(aline," a had ");
  1614     if (!s)
  1615 	s=strstr(aline," they bad ");
  1616     if (!s)
  1617 	s=strstr(aline," she bad ");
  1618     if (!s)
  1619 	s=strstr(aline," he bad ");
  1620     if (!s)
  1621 	s=strstr(aline," you bad ");
  1622     if (!s)
  1623 	s=strstr(aline," i bad ");
  1624     if (s)
  1625     {
  1626 	if (pswit[ECHO_SWITCH])
  1627 	    g_print("\n%s\n",aline);
  1628 	if (!pswit[OVERVIEW_SWITCH])
  1629 	    g_print("    Line %ld column %ld - Query had/bad error?\n",
  1630 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1631 	else
  1632 	    cnt_word++;
  1633     }
  1634     s=strstr(aline,"; hut ");
  1635     if (!s)
  1636 	s=strstr(aline,", hut ");
  1637     if (s)
  1638     {
  1639 	if (pswit[ECHO_SWITCH])
  1640 	    g_print("\n%s\n",aline);
  1641 	if (!pswit[OVERVIEW_SWITCH])
  1642 	    g_print("    Line %ld column %ld - Query hut/but error?\n",
  1643 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1644 	else
  1645 	    cnt_word++;
  1646     }
  1647 }
  1648 
  1649 /*
  1650  * check_for_mta_from:
  1651  *
  1652  * Special case - angled bracket in front of "From" placed there by an
  1653  * MTA when sending an e-mail.
  1654  */
  1655 void check_for_mta_from(const char *aline)
  1656 {
  1657     const char *s;
  1658     s=strstr(aline,">From");
  1659     if (s)
  1660     {
  1661 	if (pswit[ECHO_SWITCH])
  1662 	    g_print("\n%s\n",aline);
  1663 	if (!pswit[OVERVIEW_SWITCH])
  1664 	    g_print("    Line %ld column %ld - "
  1665 	      "Query angled bracket with From\n",
  1666 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1667 	else
  1668 	    cnt_punct++;
  1669     }
  1670 }
  1671 
  1672 /*
  1673  * check_for_orphan_character:
  1674  *
  1675  * Check for a single character line -
  1676  * often an overflow from bad wrapping.
  1677  */
  1678 void check_for_orphan_character(const char *aline)
  1679 {
  1680     gunichar c;
  1681     c=g_utf8_get_char(aline);
  1682     if (c && !*g_utf8_next_char(aline))
  1683     {
  1684 	if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
  1685 	    ; /* Nothing - ignore numerals alone on a line. */
  1686 	else
  1687 	{
  1688 	    if (pswit[ECHO_SWITCH])
  1689 		g_print("\n%s\n",aline);
  1690 	    if (!pswit[OVERVIEW_SWITCH])
  1691 		g_print("    Line %ld column 1 - Query single character line\n",
  1692 		  linecnt);
  1693 	    else
  1694 		cnt_punct++;
  1695 	}
  1696     }
  1697 }
  1698 
  1699 /*
  1700  * check_for_pling_scanno:
  1701  *
  1702  * Check for I" - often should be !
  1703  */
  1704 void check_for_pling_scanno(const char *aline)
  1705 {
  1706     const char *s;
  1707     s=strstr(aline," I\"");
  1708     if (s)
  1709     {
  1710 	if (pswit[ECHO_SWITCH])
  1711 	    g_print("\n%s\n",aline);
  1712 	if (!pswit[OVERVIEW_SWITCH])
  1713 	    g_print("    Line %ld column %ld - Query I=exclamation mark?\n",
  1714 	      linecnt,g_utf8_pointer_to_offset(aline,s));
  1715 	else
  1716 	    cnt_punct++;
  1717     }
  1718 }
  1719 
  1720 /*
  1721  * check_for_extra_period:
  1722  *
  1723  * Check for period without a capital letter. Cut-down from gutspell.
  1724  * Only works when it happens on a single line.
  1725  */
  1726 void check_for_extra_period(const char *aline,const struct warnings *warnings)
  1727 {
  1728     const char *s,*t,*s1,*sprev;
  1729     int i;
  1730     gsize len;
  1731     gboolean istypo;
  1732     gchar *testword;
  1733     gunichar c,nc,pc,*decomposition;
  1734     if (pswit[PARANOID_SWITCH])
  1735     {
  1736 	for (t=aline;t=strstr(t,". ");)
  1737 	{
  1738 	    if (t==aline)
  1739 	    {
  1740 		t=g_utf8_next_char(t);
  1741 		/* start of line punctuation is handled elsewhere */
  1742 		continue;
  1743 	    }
  1744 	    if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
  1745 	    {
  1746 		t=g_utf8_next_char(t);
  1747 		continue;
  1748 	    }
  1749 	    if (warnings->isDutch)
  1750 	    {
  1751 		/* For Frank & Jeroen -- 's Middags case */
  1752 		gunichar c2,c3,c4,c5;
  1753 		c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
  1754 		c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
  1755 		c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
  1756 		c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
  1757 		if (CHAR_IS_APOSTROPHE(c2) &&
  1758 		  g_unichar_islower(c3) && c4==CHAR_SPACE &&
  1759 		  g_unichar_isupper(c5))
  1760 		{
  1761 		    t=g_utf8_next_char(t);
  1762 		    continue;
  1763 		}
  1764 	    }
  1765 	    s1=g_utf8_next_char(g_utf8_next_char(t));
  1766 	    while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
  1767 	      !isdigit(g_utf8_get_char(s1)))
  1768 		s1=g_utf8_next_char(s1);
  1769 	    if (g_unichar_islower(g_utf8_get_char(s1)))
  1770 	    {
  1771 		/* we have something to investigate */
  1772 		istypo=TRUE;
  1773 		/* so let's go back and find out */
  1774 		nc=g_utf8_get_char(t);
  1775 		s1=g_utf8_prev_char(t);
  1776 		c=g_utf8_get_char(s1);
  1777 		sprev=g_utf8_prev_char(s1);
  1778 		pc=g_utf8_get_char(sprev);
  1779 		while (s1>=aline &&
  1780 		  (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||
  1781 		  g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&
  1782 		  g_unichar_isalpha(nc)))
  1783 		{
  1784 		    nc=c;
  1785 		    s1=sprev;
  1786 		    c=pc;
  1787 		    sprev=g_utf8_prev_char(s1);
  1788 		    pc=g_utf8_get_char(sprev);
  1789 		}
  1790 		s1=g_utf8_next_char(s1);
  1791 		s=strchr(s1,'.');
  1792 		if (s)
  1793 		    testword=g_strndup(s1,s-s1);
  1794 		else
  1795 		    testword=g_strdup(s1);
  1796 		for (i=0;*abbrev[i];i++)
  1797 		    if (!strcmp(testword,abbrev[i]))
  1798 			istypo=FALSE;
  1799 		if (g_unichar_isdigit(g_utf8_get_char(testword)))
  1800 		    istypo=FALSE;
  1801 		if (!*g_utf8_next_char(testword))
  1802 		    istypo=FALSE;
  1803 		if (isroman(testword))
  1804 		    istypo=FALSE;
  1805 		if (istypo)
  1806 		{
  1807 		    istypo=FALSE;
  1808 		    for (s=testword;*s;s=g_utf8_next_char(s))
  1809 		    {
  1810 			decomposition=g_unicode_canonical_decomposition(
  1811 			  g_utf8_get_char(s),&len);
  1812 			if (g_utf8_strchr("aeiou",-1,decomposition[0]))
  1813 			    istypo=TRUE;
  1814 			g_free(decomposition);
  1815 		    }
  1816 		}
  1817 		if (istypo &&
  1818 		  (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
  1819 		{
  1820 		    g_tree_insert(qperiod,g_strdup(testword),
  1821 		      GINT_TO_POINTER(1));
  1822 		    if (pswit[ECHO_SWITCH])
  1823 			g_print("\n%s\n",aline);
  1824 		    if (!pswit[OVERVIEW_SWITCH])
  1825 			g_print("    Line %ld column %ld - Extra period?\n",
  1826 			  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  1827 		    else
  1828 			cnt_punct++;
  1829 		}
  1830 		g_free(testword);
  1831 	    }
  1832 	    t=g_utf8_next_char(t);
  1833 	}
  1834     }
  1835 }
  1836 
  1837 /*
  1838  * check_for_following_punctuation:
  1839  *
  1840  * Check for words usually not followed by punctuation.
  1841  */
  1842 void check_for_following_punctuation(const char *aline)
  1843 {
  1844     int i;
  1845     const char *s,*wordstart;
  1846     gunichar c;
  1847     gchar *inword,*t;
  1848     if (pswit[TYPO_SWITCH])
  1849     {
  1850 	for (s=aline;*s;)
  1851 	{
  1852 	    wordstart=s;
  1853 	    t=getaword(&s);
  1854 	    if (!*t)
  1855 	    {
  1856 		g_free(t);
  1857 		continue;
  1858 	    }
  1859 	    inword=g_utf8_strdown(t,-1);
  1860 	    g_free(t);
  1861 	    for (i=0;*nocomma[i];i++)
  1862 		if (!strcmp(inword,nocomma[i]))
  1863 		{
  1864 		    c=g_utf8_get_char(s);
  1865 		    if (c==',' || c==';' || c==':')
  1866 		    {
  1867 			if (pswit[ECHO_SWITCH])
  1868 			    g_print("\n%s\n",aline);
  1869 			if (!pswit[OVERVIEW_SWITCH])
  1870 			    g_print("    Line %ld column %ld - "
  1871 			      "Query punctuation after %s?\n",
  1872 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,
  1873 			      inword);
  1874 			else
  1875 			    cnt_punct++;
  1876 		    }
  1877 		}
  1878 	    for (i=0;*noperiod[i];i++)
  1879 		if (!strcmp(inword,noperiod[i]))
  1880 		{
  1881 		    c=g_utf8_get_char(s);
  1882 		    if (c=='.' || c=='!')
  1883 		    {
  1884 			if (pswit[ECHO_SWITCH])
  1885 			    g_print("\n%s\n",aline);
  1886 			if (!pswit[OVERVIEW_SWITCH])
  1887 			    g_print("    Line %ld column %ld - "
  1888 			      "Query punctuation after %s?\n",
  1889 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,
  1890 			      inword);
  1891 			else
  1892 			    cnt_punct++;
  1893 		    }
  1894 		}
  1895 	    g_free(inword);
  1896 	}
  1897     }
  1898 }
  1899 
  1900 /*
  1901  * check_for_typos:
  1902  *
  1903  * Check for commonly mistyped words,
  1904  * and digits like 0 for O in a word.
  1905  */
  1906 void check_for_typos(const char *aline,struct warnings *warnings)
  1907 {
  1908     const char *s,*t,*nt,*wordstart;
  1909     gchar *inword;
  1910     gunichar *decomposition;
  1911     gchar *testword;
  1912     int i,vowel,consonant,*dupcnt;
  1913     gboolean isdup,istypo,alower;
  1914     gunichar c,pc;
  1915     long offset,len;
  1916     gsize decomposition_len;
  1917     for (s=aline;*s;)
  1918     {
  1919 	wordstart=s;
  1920 	inword=getaword(&s);
  1921 	if (!*inword)
  1922 	{
  1923 	    g_free(inword);
  1924 	    continue; /* don't bother with empty lines */
  1925 	}
  1926 	if (mixdigit(inword))
  1927 	{
  1928 	    if (pswit[ECHO_SWITCH])
  1929 		g_print("\n%s\n",aline);
  1930 	    if (!pswit[OVERVIEW_SWITCH])
  1931 		g_print("    Line %ld column %ld - Query digit in %s\n",
  1932 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
  1933 	    else
  1934 		cnt_word++;
  1935 	}
  1936 	/*
  1937 	 * Put the word through a series of tests for likely typos and OCR
  1938 	 * errors.
  1939 	 */
  1940 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
  1941 	{
  1942 	    istypo=FALSE;
  1943 	    alower=FALSE;
  1944 	    for (t=inword;*t;t=g_utf8_next_char(t))
  1945 	    {
  1946 		c=g_utf8_get_char(t);
  1947 		nt=g_utf8_next_char(t);
  1948 		/* lowercase for testing */
  1949 		if (g_unichar_islower(c))
  1950 		    alower=TRUE;
  1951 		if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
  1952 		{
  1953 		    /*
  1954 		     * We have an uppercase mid-word. However, there are
  1955 		     * common cases:
  1956 		     *   Mac and Mc like McGill
  1957 		     *   French contractions like l'Abbe
  1958 		     */
  1959 		    offset=g_utf8_pointer_to_offset(inword,t);
  1960 		    if (offset>0)
  1961 			pc=g_utf8_get_char(g_utf8_prev_char(t));
  1962 		    else
  1963 			pc='\0';
  1964 		    if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
  1965 		      offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
  1966 		      g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
  1967 		      CHAR_IS_APOSTROPHE(pc))
  1968 			; /* do nothing! */
  1969 		    else
  1970 			istypo=TRUE;
  1971 		}
  1972 	    }
  1973 	    testword=g_utf8_casefold(inword,-1);
  1974 	}
  1975 	if (pswit[TYPO_SWITCH])
  1976 	{
  1977 	    /*
  1978 	     * Check for certain unlikely two-letter combinations at word
  1979 	     * start and end.
  1980 	     */
  1981 	    len=g_utf8_strlen(testword,-1);
  1982 	    if (len>1)
  1983 	    {
  1984 		for (i=0;*nostart[i];i++)
  1985 		    if (g_str_has_prefix(testword,nostart[i]))
  1986 			istypo=TRUE;
  1987 		for (i=0;*noend[i];i++)
  1988 		    if (g_str_has_suffix(testword,noend[i]))
  1989 			istypo=TRUE;
  1990 	    }
  1991 	    /* ght is common, gbt never. Like that. */
  1992 	    if (strstr(testword,"cb"))
  1993 		istypo=TRUE;
  1994 	    if (strstr(testword,"gbt"))
  1995 		istypo=TRUE;
  1996 	    if (strstr(testword,"pbt"))
  1997 		istypo=TRUE;
  1998 	    if (strstr(testword,"tbs"))
  1999 		istypo=TRUE;
  2000 	    if (strstr(testword,"mrn"))
  2001 		istypo=TRUE;
  2002 	    if (strstr(testword,"ahle"))
  2003 		istypo=TRUE;
  2004 	    if (strstr(testword,"ihle"))
  2005 		istypo=TRUE;
  2006 	    /*
  2007 	     * "TBE" does happen - like HEARTBEAT - but uncommon.
  2008 	     * Also "TBI" - frostbite, outbid - but uncommon.
  2009 	     * Similarly "ii" like Hawaii, or Pompeii, and in Roman
  2010 	     * numerals, but "ii" is a common scanno.
  2011 	     */
  2012 	    if (strstr(testword,"tbi"))
  2013 		istypo=TRUE;
  2014 	    if (strstr(testword,"tbe"))
  2015 		istypo=TRUE;
  2016 	    if (strstr(testword,"ii"))
  2017 		istypo=TRUE;
  2018 	    /*
  2019 	     * Check for no vowels or no consonants.
  2020 	     * If none, flag a typo.
  2021 	     */
  2022 	    if (!istypo && len>1)
  2023 	    {
  2024 		vowel=consonant=0;
  2025 		for (t=testword;*t;t=g_utf8_next_char(t))
  2026 		{
  2027 		    c=g_utf8_get_char(t);
  2028 		    decomposition=
  2029 		      g_unicode_canonical_decomposition(c,&decomposition_len);
  2030 		    if (c=='y' || g_unichar_isdigit(c))
  2031 		    {
  2032 			/* Yah, this is loose. */
  2033 			vowel++;
  2034 			consonant++;
  2035 		    }
  2036 		    else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
  2037 			vowel++;
  2038 		    else
  2039 			consonant++;
  2040 		    g_free(decomposition);
  2041 		}
  2042 		if (!vowel || !consonant)
  2043 		    istypo=TRUE;
  2044 	    }
  2045 	    /*
  2046 	     * Now exclude the word from being reported if it's in
  2047 	     * the okword list.
  2048 	     */
  2049 	    for (i=0;*okword[i];i++)
  2050 		if (!strcmp(testword,okword[i]))
  2051 		    istypo=FALSE;
  2052 	    /*
  2053 	     * What looks like a typo may be a Roman numeral.
  2054 	     * Exclude these.
  2055 	     */
  2056 	    if (istypo && isroman(testword))
  2057 		istypo=FALSE;
  2058 	    /* Check the manual list of typos. */
  2059 	    if (!istypo)
  2060 		for (i=0;*typo[i];i++)
  2061 		    if (!strcmp(testword,typo[i]))
  2062 			istypo=TRUE;
  2063 	    /*
  2064 	     * Check lowercase s, l, i and m - special cases.
  2065 	     *   "j" - often a semi-colon gone wrong.
  2066 	     *   "d" for a missing apostrophe - he d
  2067 	     *   "n" for "in"
  2068 	     */
  2069 	    if (!istypo && len==1 &&
  2070 	      g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
  2071 		istypo=TRUE;
  2072 	    if (istypo)
  2073 	    {
  2074 		dupcnt=g_tree_lookup(qword,testword);
  2075 		if (dupcnt)
  2076 		{
  2077 		    (*dupcnt)++;
  2078 		    isdup=!pswit[VERBOSE_SWITCH];
  2079 		}
  2080 		else
  2081 		{
  2082 		    dupcnt=g_new0(int,1);
  2083 		    g_tree_insert(qword,g_strdup(testword),dupcnt);
  2084 		    isdup=FALSE;
  2085 		}
  2086 		if (!isdup)
  2087 		{
  2088 		    if (pswit[ECHO_SWITCH])
  2089 			g_print("\n%s\n",aline);
  2090 		    if (!pswit[OVERVIEW_SWITCH])
  2091 		    {
  2092 			g_print("    Line %ld column %ld - Query word %s",
  2093 			  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
  2094 			  inword);
  2095 			if (!pswit[VERBOSE_SWITCH])
  2096 			    g_print(" - not reporting duplicates");
  2097 			g_print("\n");
  2098 		    }
  2099 		    else
  2100 			cnt_word++;
  2101 		}
  2102 	    }
  2103 	}
  2104 	/* check the user's list of typos */
  2105 	if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
  2106 	{
  2107 	    if (pswit[ECHO_SWITCH])
  2108 		g_print("\n%s\n",aline);
  2109 	    if (!pswit[OVERVIEW_SWITCH])  
  2110 		g_print("    Line %ld column %ld - Query possible scanno %s\n",
  2111 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
  2112 	}
  2113 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
  2114 	    g_free(testword);
  2115 	if (pswit[PARANOID_SWITCH] && warnings->digit)
  2116 	{
  2117 	    /* In paranoid mode, query all 0 and 1 standing alone. */
  2118 	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))
  2119 	    {
  2120 		if (pswit[ECHO_SWITCH])
  2121 		    g_print("\n%s\n",aline);
  2122 		if (!pswit[OVERVIEW_SWITCH])
  2123 		    g_print("    Line %ld column %ld - Query standalone %s\n",
  2124 		      linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
  2125 		      inword);
  2126 		else
  2127 		    cnt_word++;
  2128 	    }
  2129 	}
  2130 	g_free(inword);
  2131     }
  2132 }
  2133 
  2134 /*
  2135  * check_for_misspaced_punctuation:
  2136  *
  2137  * Look for added or missing spaces around punctuation and quotes.
  2138  * If there is a punctuation character like ! with no space on
  2139  * either side, suspect a missing!space. If there are spaces on
  2140  * both sides , assume a typo. If we see a double quote with no
  2141  * space or punctuation on either side of it, assume unspaced
  2142  * quotes "like"this.
  2143  */
  2144 void check_for_misspaced_punctuation(const char *aline,
  2145   struct parities *parities,gboolean isemptyline)
  2146 {
  2147     gboolean isacro,isellipsis;
  2148     const char *s;
  2149     gunichar c,nc,pc,n2c;
  2150     c=g_utf8_get_char(aline);
  2151     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2152     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2153     {
  2154 	pc=c;
  2155 	c=nc;
  2156 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2157 	/* For each character in the line after the first. */
  2158 	if (g_utf8_strchr(".?!,;:_",-1,c))  /* if it's punctuation */
  2159 	{
  2160 	    /* we need to suppress warnings for acronyms like M.D. */
  2161 	    isacro=FALSE;
  2162 	    /* we need to suppress warnings for ellipsis . . . */
  2163 	    isellipsis=FALSE;
  2164 	    /*
  2165 	     * If there are letters on both sides of it or
  2166 	     * if it's strict punctuation followed by an alpha.
  2167 	     */
  2168 	    if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
  2169 	      g_utf8_strchr("?!,;:",-1,c)))
  2170 	    {
  2171 		if (c=='.')
  2172 		{
  2173 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&
  2174 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
  2175 			isacro=TRUE;
  2176 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
  2177 		    if (nc && n2c=='.')
  2178 			isacro=TRUE;
  2179 		}
  2180 		if (!isacro)
  2181 		{
  2182 		    if (pswit[ECHO_SWITCH])
  2183 			g_print("\n%s\n",aline);
  2184 		    if (!pswit[OVERVIEW_SWITCH])
  2185 			g_print("    Line %ld column %ld - Missing space?\n",
  2186 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2187 		    else
  2188 			cnt_punct++;
  2189 		}
  2190 	    }
  2191 	    if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
  2192 	    {
  2193 		/*
  2194 		 * If there are spaces on both sides,
  2195 		 * or space before and end of line.
  2196 		 */
  2197 		if (c=='.')
  2198 		{
  2199 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&
  2200 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
  2201 			isellipsis=TRUE;
  2202 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
  2203 		    if (nc && n2c=='.')
  2204 			isellipsis=TRUE;
  2205 		}
  2206 		if (!isemptyline && !isellipsis)
  2207 		{
  2208 		    if (pswit[ECHO_SWITCH])
  2209 			g_print("\n%s\n",aline);
  2210 		    if (!pswit[OVERVIEW_SWITCH])
  2211 			g_print("    Line %ld column %ld - "
  2212 			  "Spaced punctuation?\n",linecnt,
  2213 			  g_utf8_pointer_to_offset(aline,s)+1);
  2214 		    else
  2215 			cnt_punct++;
  2216 		}
  2217 	    }
  2218 	}
  2219     }
  2220     /* Split out the characters that CANNOT be preceded by space. */
  2221     c=g_utf8_get_char(aline);
  2222     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2223     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2224     {
  2225 	pc=c;
  2226 	c=nc;
  2227 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2228 	/* for each character in the line after the first */
  2229 	if (g_utf8_strchr("?!,;:",-1,c))
  2230 	{
  2231 	    /* if it's punctuation that _cannot_ have a space before it */
  2232 	    if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
  2233 	    {
  2234 		/*
  2235 		 * If nc DOES == space,
  2236 		 * it was already reported just above.
  2237 		 */
  2238 		if (pswit[ECHO_SWITCH])
  2239 		    g_print("\n%s\n",aline);
  2240 		if (!pswit[OVERVIEW_SWITCH])
  2241 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",
  2242 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2243 		else
  2244 		    cnt_punct++;
  2245 	    }
  2246 	}
  2247     }
  2248     /*
  2249      * Special case " .X" where X is any alpha.
  2250      * This plugs a hole in the acronym code above.
  2251      * Inelegant, but maintainable.
  2252      */
  2253     c=g_utf8_get_char(aline);
  2254     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2255     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2256     {
  2257 	pc=c;
  2258 	c=nc;
  2259 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2260 	/* for each character in the line after the first */
  2261 	if (c=='.')
  2262 	{
  2263 	    /* if it's a period */
  2264 	    if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
  2265 	    {
  2266 		/*
  2267 		 * If the period follows a space and
  2268 		 * is followed by a letter.
  2269 		 */
  2270 		if (pswit[ECHO_SWITCH])
  2271 		    g_print("\n%s\n",aline);
  2272 		if (!pswit[OVERVIEW_SWITCH])
  2273 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",
  2274 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2275 		else
  2276 		    cnt_punct++;
  2277 	    }
  2278 	}
  2279     }
  2280     c=g_utf8_get_char(aline);
  2281     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2282     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2283     {
  2284 	pc=c;
  2285 	c=nc;
  2286 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2287 	/* for each character in the line after the first */
  2288 	if (c==CHAR_DQUOTE)
  2289 	{
  2290 	    if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
  2291 	      !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
  2292 	      !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
  2293 	    {
  2294 		if (pswit[ECHO_SWITCH])
  2295 		    g_print("\n%s\n",aline);
  2296 		if (!pswit[OVERVIEW_SWITCH])
  2297 		    g_print("    Line %ld column %ld - Unspaced quotes?\n",
  2298 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2299 		else
  2300 		    cnt_punct++;
  2301 	    }
  2302 	}
  2303     }
  2304     /* Check parity of quotes. */
  2305     nc=g_utf8_get_char(aline);
  2306     for (s=aline;*s;s=g_utf8_next_char(s))
  2307     {
  2308 	c=nc;
  2309 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2310 	if (c==CHAR_DQUOTE)
  2311 	{
  2312 	    parities->dquote=!parities->dquote;
  2313 	    if (!parities->dquote)
  2314 	    {
  2315 		/* parity even */
  2316 		if (!g_utf8_strchr("_-.'`/,;:!?)]} ",-1,nc))
  2317 		{
  2318 		    if (pswit[ECHO_SWITCH])
  2319 			g_print("\n%s\n",aline);
  2320 		    if (!pswit[OVERVIEW_SWITCH])
  2321 			g_print("    Line %ld column %ld - "
  2322 			  "Wrongspaced quotes?\n",
  2323 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2324 		    else
  2325 			cnt_punct++;
  2326 		}
  2327 	    }
  2328 	    else
  2329 	    {
  2330 		/* parity odd */
  2331 		if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
  2332 		  !g_utf8_strchr("_-/.'`([{$",-1,nc) || !nc)
  2333 		{
  2334 		    if (pswit[ECHO_SWITCH])
  2335 			g_print("\n%s\n",aline);
  2336 		    if (!pswit[OVERVIEW_SWITCH])
  2337 			g_print("    Line %ld column %ld - "
  2338 			  "Wrongspaced quotes?\n",
  2339 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2340 		    else
  2341 			cnt_punct++;
  2342 		}
  2343 	    }
  2344 	}
  2345     }
  2346     if (g_utf8_get_char(aline)==CHAR_DQUOTE)
  2347     {
  2348 	if (g_utf8_strchr(",;:!?)]} ",-1,
  2349 	  g_utf8_get_char(g_utf8_next_char(aline))))
  2350 	{
  2351 	    if (pswit[ECHO_SWITCH])
  2352 		g_print("\n%s\n",aline);
  2353 	    if (!pswit[OVERVIEW_SWITCH])
  2354 		g_print("    Line %ld column 1 - Wrongspaced quotes?\n",
  2355 		  linecnt);
  2356 	    else
  2357 		cnt_punct++;
  2358 	}
  2359     }
  2360     if (pswit[SQUOTE_SWITCH])
  2361     {
  2362 	nc=g_utf8_get_char(aline);
  2363 	for (s=aline;*s;s=g_utf8_next_char(s))
  2364 	{
  2365 	    c=nc;
  2366 	    nc=g_utf8_get_char(g_utf8_next_char(s));
  2367 	    if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&
  2368 	      !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
  2369 	      !g_unichar_isalpha(nc)))
  2370 	    {
  2371 		parities->squote=!parities->squote;
  2372 		if (!parities->squote)
  2373 		{
  2374 		    /* parity even */
  2375 		    if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
  2376 		    {
  2377 			if (pswit[ECHO_SWITCH])
  2378 			    g_print("\n%s\n",aline);
  2379 			if (!pswit[OVERVIEW_SWITCH])
  2380 			    g_print("    Line %ld column %ld - "
  2381 			      "Wrongspaced singlequotes?\n",
  2382 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2383 			else
  2384 			    cnt_punct++;
  2385 		    }
  2386 		}
  2387 		else
  2388 		{
  2389 		    /* parity odd */
  2390 		    if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
  2391 		      !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
  2392 		    {
  2393 			if (pswit[ECHO_SWITCH])
  2394 			    g_print("\n%s\n",aline);
  2395 			if (!pswit[OVERVIEW_SWITCH])
  2396 			    g_print("    Line %ld column %ld - "
  2397 			      "Wrongspaced singlequotes?\n",
  2398 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2399 			else
  2400 			    cnt_punct++;
  2401 		    }
  2402 		}
  2403 	    }
  2404 	}
  2405     }
  2406 }
  2407 
  2408 /*
  2409  * check_for_double_punctuation:
  2410  *
  2411  * Look for double punctuation like ,. or ,,
  2412  * Thanks to DW for the suggestion!
  2413  * In books with references, ".," and ".;" are common
  2414  * e.g. "etc., etc.," and vol. 1.; vol 3.;
  2415  * OTOH, from my initial tests, there are also fairly
  2416  * common errors. What to do? Make these cases paranoid?
  2417  * ".," is the most common, so warnings->dotcomma is used
  2418  * to suppress detailed reporting if it occurs often.
  2419  */
  2420 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
  2421 {
  2422     const char *s;
  2423     gunichar c,nc;
  2424     nc=g_utf8_get_char(aline);
  2425     for (s=aline;*s;s=g_utf8_next_char(s))
  2426     {
  2427 	c=nc;
  2428 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2429 	/* for each punctuation character in the line */
  2430 	if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
  2431 	  g_utf8_strchr(".?!,;:",-1,nc))
  2432 	{
  2433 	    /* followed by punctuation, it's a query, unless . . . */
  2434 	    if (c==nc && (c=='.' || c=='?' || c=='!') ||
  2435 	      !warnings->dotcomma && c=='.' && nc==',' ||
  2436 	      warnings->isFrench && g_str_has_prefix(s,",...") ||
  2437 	      warnings->isFrench && g_str_has_prefix(s,"...,") ||
  2438 	      warnings->isFrench && g_str_has_prefix(s,";...") ||
  2439 	      warnings->isFrench && g_str_has_prefix(s,"...;") ||
  2440 	      warnings->isFrench && g_str_has_prefix(s,":...") ||
  2441 	      warnings->isFrench && g_str_has_prefix(s,"...:") ||
  2442 	      warnings->isFrench && g_str_has_prefix(s,"!...") ||
  2443 	      warnings->isFrench && g_str_has_prefix(s,"...!") ||
  2444 	      warnings->isFrench && g_str_has_prefix(s,"?...") ||
  2445 	      warnings->isFrench && g_str_has_prefix(s,"...?"))
  2446 	    {
  2447 		if (warnings->isFrench && g_str_has_prefix(s,",...") ||
  2448 		  warnings->isFrench && g_str_has_prefix(s,"...,") ||
  2449 		  warnings->isFrench && g_str_has_prefix(s,";...") ||
  2450 		  warnings->isFrench && g_str_has_prefix(s,"...;") ||
  2451 		  warnings->isFrench && g_str_has_prefix(s,":...") ||
  2452 		  warnings->isFrench && g_str_has_prefix(s,"...:") ||
  2453 		  warnings->isFrench && g_str_has_prefix(s,"!...") ||
  2454 		  warnings->isFrench && g_str_has_prefix(s,"...!") ||
  2455 		  warnings->isFrench && g_str_has_prefix(s,"?...") ||
  2456 		  warnings->isFrench && g_str_has_prefix(s,"...?"))
  2457 		{
  2458 		    s+=4;
  2459 		    nc=g_utf8_get_char(g_utf8_next_char(s));
  2460 		}
  2461 		; /* do nothing for .. !! and ?? which can be legit */
  2462 	    }
  2463 	    else
  2464 	    {
  2465 		if (pswit[ECHO_SWITCH])
  2466 		    g_print("\n%s\n",aline);
  2467 		if (!pswit[OVERVIEW_SWITCH])
  2468 		    g_print("    Line %ld column %ld - Double punctuation?\n",
  2469 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2470 		else
  2471 		    cnt_punct++;
  2472 	    }
  2473 	}
  2474     }
  2475 }
  2476 
  2477 /*
  2478  * check_for_spaced_quotes:
  2479  */
  2480 void check_for_spaced_quotes(const char *aline)
  2481 {
  2482     int i;
  2483     const char *s,*t;
  2484     const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,
  2485       CHAR_RS_QUOTE};
  2486     GString *pattern;
  2487     s=aline;
  2488     while ((t=strstr(s," \" ")))
  2489     {
  2490 	if (pswit[ECHO_SWITCH])
  2491 	    g_print("\n%s\n",aline);
  2492 	if (!pswit[OVERVIEW_SWITCH])
  2493 	    g_print("    Line %ld column %ld - Spaced doublequote?\n",
  2494 	      linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  2495 	else
  2496 	    cnt_punct++;
  2497 	s=g_utf8_next_char(g_utf8_next_char(t));
  2498     }
  2499     pattern=g_string_new(NULL);
  2500     for(i=0;i<G_N_ELEMENTS(single_quotes);i++)
  2501     {
  2502 	g_string_assign(pattern," ");
  2503 	g_string_append_unichar(pattern,single_quotes[i]);
  2504 	g_string_append_c(pattern,' ');
  2505 	s=aline;
  2506 	while ((t=strstr(s,pattern->str)))
  2507 	{
  2508 	    if (pswit[ECHO_SWITCH])
  2509 		g_print("\n%s\n",aline);
  2510 	    if (!pswit[OVERVIEW_SWITCH])
  2511 		g_print("    Line %ld column %ld - Spaced singlequote?\n",
  2512 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  2513 	    else
  2514 		cnt_punct++;
  2515 	    s=g_utf8_next_char(g_utf8_next_char(t));
  2516 	}
  2517     }
  2518     g_string_free(pattern,TRUE);
  2519 }
  2520 
  2521 /*
  2522  * check_for_miscased_genative:
  2523  *
  2524  * Check special case of 'S instead of 's at end of word.
  2525  */
  2526 void check_for_miscased_genative(const char *aline)
  2527 {
  2528     const char *s;
  2529     gunichar c,nc,pc;
  2530     if (!*aline)
  2531 	return;
  2532     c=g_utf8_get_char(aline);
  2533     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2534     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2535     {
  2536 	pc=c;
  2537 	c=nc;
  2538 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2539 	if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))
  2540 	{
  2541 	    if (pswit[ECHO_SWITCH])
  2542 		g_print("\n%s\n",aline);
  2543 	    if (!pswit[OVERVIEW_SWITCH])
  2544 		g_print("    Line %ld column %ld - Capital \"S\"?\n",
  2545 		  linecnt,g_utf8_pointer_to_offset(aline,s)+2);
  2546 	    else
  2547 		cnt_punct++;
  2548 	}
  2549     }
  2550 }
  2551 
  2552 /*
  2553  * check_end_of_line:
  2554  *
  2555  * Now check special cases - start and end of line -
  2556  * for single and double quotes. Start is sometimes [sic]
  2557  * but better to query it anyway.
  2558  * While we're here, check for dash at end of line.
  2559  */
  2560 void check_end_of_line(const char *aline,struct warnings *warnings)
  2561 {
  2562     int lbytes;
  2563     const char *s;
  2564     gunichar c1,c2;
  2565     lbytes=strlen(aline);
  2566     if (g_utf8_strlen(aline,lbytes)>1)
  2567     {
  2568 	s=g_utf8_prev_char(aline+lbytes);
  2569 	c1=g_utf8_get_char(s);
  2570 	c2=g_utf8_get_char(g_utf8_prev_char(s));
  2571 	if ((c1==CHAR_DQUOTE || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)
  2572 	{
  2573 	    if (pswit[ECHO_SWITCH])
  2574 		g_print("\n%s\n",aline);
  2575 	    if (!pswit[OVERVIEW_SWITCH])
  2576 		g_print("    Line %ld column %ld - Spaced quote?\n",linecnt,
  2577 		  g_utf8_strlen(aline,lbytes));
  2578 	    else
  2579 		cnt_punct++;
  2580 	}
  2581 	c1=g_utf8_get_char(aline);
  2582 	c2=g_utf8_get_char(g_utf8_next_char(aline));
  2583 	if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)
  2584 	{
  2585 	    if (pswit[ECHO_SWITCH])
  2586 		g_print("\n%s\n",aline);
  2587 	    if (!pswit[OVERVIEW_SWITCH])
  2588 		g_print("    Line %ld column 1 - Spaced quote?\n",linecnt);
  2589 	    else
  2590 		cnt_punct++;
  2591 	}
  2592 	/*
  2593 	 * Dash at end of line may well be legit - paranoid mode only
  2594 	 * and don't report em-dash at line-end.
  2595 	 */
  2596 	if (pswit[PARANOID_SWITCH] && warnings->hyphen)
  2597 	{
  2598 	    for (s=g_utf8_prev_char(aline+lbytes);
  2599 	      s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
  2600 		;
  2601 	    if (g_utf8_get_char(s)=='-' &&
  2602 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')
  2603 	    {
  2604 		if (pswit[ECHO_SWITCH])
  2605 		    g_print("\n%s\n",aline);
  2606 		if (!pswit[OVERVIEW_SWITCH])
  2607 		    g_print("    Line %ld column %ld - "
  2608 		      "Hyphen at end of line?\n",
  2609 		      linecnt,g_utf8_pointer_to_offset(aline,s));
  2610 	    }
  2611 	}
  2612     }
  2613 }
  2614 
  2615 /*
  2616  * check_for_unspaced_bracket:
  2617  *
  2618  * Brackets are often unspaced, but shouldn't be surrounded by alpha.
  2619  * If so, suspect a scanno like "a]most".
  2620  */
  2621 void check_for_unspaced_bracket(const char *aline)
  2622 {
  2623     const char *s;
  2624     gunichar c,nc,pc;
  2625     c=g_utf8_get_char(aline);
  2626     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2627     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2628     {
  2629 	pc=c;
  2630 	c=nc;
  2631 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2632 	if (!nc)
  2633 	    break;
  2634 	/* for each bracket character in the line except 1st & last */
  2635 	if (g_utf8_strchr("{[()]}",-1,c) &&
  2636 	  g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
  2637 	{
  2638 	    if (pswit[ECHO_SWITCH])
  2639 		g_print("\n%s\n",aline);
  2640 	    if (!pswit[OVERVIEW_SWITCH])
  2641 		g_print("    Line %ld column %ld - Unspaced bracket?\n",
  2642 		  linecnt,g_utf8_pointer_to_offset(aline,s));
  2643 	    else
  2644 		cnt_punct++;
  2645 	}
  2646     }
  2647 }
  2648 
  2649 /*
  2650  * check_for_unpunctuated_endquote:
  2651  */
  2652 void check_for_unpunctuated_endquote(const char *aline)
  2653 {
  2654     const char *s;
  2655     gunichar c,nc,pc;
  2656     c=g_utf8_get_char(aline);
  2657     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2658     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2659     {
  2660 	pc=c;
  2661 	c=nc;
  2662 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2663 	/* for each character in the line except 1st */
  2664 	if (c==CHAR_DQUOTE && isalpha(pc))
  2665 	{
  2666 	    if (pswit[ECHO_SWITCH])
  2667 		g_print("\n%s\n",aline);
  2668 	    if (!pswit[OVERVIEW_SWITCH])
  2669 		g_print("    Line %ld column %ld - "
  2670 		  "endquote missing punctuation?\n",
  2671 		  linecnt,g_utf8_pointer_to_offset(aline,s));
  2672 	    else
  2673 		cnt_punct++;
  2674 	}
  2675     }
  2676 }
  2677 
  2678 /*
  2679  * check_for_html_tag:
  2680  *
  2681  * Check for <HTML TAG>.
  2682  *
  2683  * If there is a < in the line, followed at some point
  2684  * by a > then we suspect HTML.
  2685  */
  2686 void check_for_html_tag(const char *aline)
  2687 {
  2688     const char *open,*close;
  2689     gchar *tag;
  2690     open=strchr(aline,'<');
  2691     if (open)
  2692     {
  2693 	close=strchr(g_utf8_next_char(open),'>');
  2694 	if (close)
  2695 	{
  2696 	    if (pswit[ECHO_SWITCH])
  2697 		g_print("\n%s\n",aline);
  2698 	    if (!pswit[OVERVIEW_SWITCH])
  2699 	    {
  2700 		tag=g_strndup(open,close-open+1);
  2701 		g_print("    Line %ld column %ld - HTML Tag? %s \n",
  2702 		  linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
  2703 		g_free(tag);
  2704 	    }
  2705 	    else
  2706 		cnt_html++;
  2707 	}
  2708     }
  2709 }
  2710 
  2711 /*
  2712  * check_for_html_entity:
  2713  *
  2714  * Check for &symbol; HTML.
  2715  *
  2716  * If there is a & in the line, followed at
  2717  * some point by a ; then we suspect HTML.
  2718  */
  2719 void check_for_html_entity(const char *aline)
  2720 {
  2721     const char *s,*amp,*scolon;
  2722     gchar *entity;
  2723     amp=strchr(aline,'&');
  2724     if (amp)
  2725     {
  2726 	scolon=strchr(amp,';');
  2727 	if (scolon)
  2728 	{
  2729 	    for (s=amp;s<scolon;s=g_utf8_next_char(s))   
  2730 		if (g_utf8_get_char(s)==CHAR_SPACE)
  2731 		    break;		/* Don't report "Jones & Son;" */
  2732 	    if (s>=scolon)
  2733 	    {
  2734 		if (pswit[ECHO_SWITCH])
  2735 		    g_print("\n%s\n",aline);
  2736 		if (!pswit[OVERVIEW_SWITCH])
  2737 		{
  2738 		    entity=g_strndup(amp,scolon-amp+1);
  2739 		    g_print("    Line %ld column %d - HTML symbol? %s \n",
  2740 		      linecnt,(int)(amp-aline)+1,entity);
  2741 		    g_free(entity);
  2742 		}
  2743 		else
  2744 		    cnt_html++;
  2745 	    }
  2746 	}
  2747     }
  2748 }
  2749 
  2750 /*
  2751  * check_for_omitted_punctuation:
  2752  *
  2753  * Check for omitted punctuation at end of paragraph by working back
  2754  * through prevline. DW.
  2755  * Need to check this only for "normal" paras.
  2756  * So what is a "normal" para?
  2757  *    Not normal if one-liner (chapter headings, etc.)
  2758  *    Not normal if doesn't contain at least one locase letter
  2759  *    Not normal if starts with space
  2760  */
  2761 void check_for_omitted_punctuation(const char *prevline,
  2762   struct line_properties *last,int start_para_line)
  2763 {
  2764     gboolean letter_on_line=FALSE;
  2765     const char *s;
  2766     gunichar c;
  2767     for (s=prevline;*s;s=g_utf8_next_char(s))
  2768 	if (g_unichar_isalpha(g_utf8_get_char(s)))
  2769 	{
  2770 	    letter_on_line=TRUE;
  2771 	    break;
  2772 	}
  2773     /*
  2774      * This next "if" is a problem.
  2775      * If we say "start_para_line <= linecnt - 1", that includes
  2776      * one-line "paragraphs" like chapter heads. Lotsa false positives.
  2777      * If we say "start_para_line < linecnt - 1" it doesn't, but then it
  2778      * misses genuine one-line paragraphs.
  2779      */
  2780     if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
  2781       g_utf8_get_char(prevline)>CHAR_SPACE)
  2782     {
  2783 	s=prevline+strlen(prevline);
  2784 	do
  2785 	{
  2786 	    s=g_utf8_prev_char(s);
  2787 	    c=g_utf8_get_char(s);
  2788 	} while (CHAR_IS_CLOSING_QUOTE(c) && c>CHAR_SPACE && s>prevline);
  2789 	for (;s>prevline;s=g_utf8_prev_char(s))
  2790 	{
  2791 	    if (g_unichar_isalpha(g_utf8_get_char(s)))
  2792 	    {
  2793 		if (pswit[ECHO_SWITCH])
  2794 		    g_print("\n%s\n",prevline);
  2795 		if (!pswit[OVERVIEW_SWITCH])
  2796 		    g_print("    Line %ld column %ld - "
  2797 		      "No punctuation at para end?\n",
  2798 		      linecnt-1,g_utf8_strlen(prevline,-1));
  2799 		else
  2800 		    cnt_punct++;
  2801 		break;
  2802 	    }
  2803 	    if (g_utf8_strchr("-.:!([{?}])",-1,g_utf8_get_char(s)))
  2804 		break;
  2805 	}
  2806     }
  2807 }
  2808 
  2809 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
  2810 {
  2811     const char *word=key;
  2812     int *dupcnt=value;
  2813     if (*dupcnt)
  2814 	g_print("\nNote: Queried word %s was duplicated %d times\n",
  2815 	  word,*dupcnt);
  2816     return FALSE;
  2817 }
  2818 
  2819 void print_as_windows_1252(const char *string)
  2820 {
  2821     gsize inbytes,outbytes;
  2822     gchar *buf,*bp;
  2823     static GIConv converter=(GIConv)-1;
  2824     if (!string)
  2825     {
  2826 	if (converter!=(GIConv)-1)
  2827 	    g_iconv_close(converter);
  2828 	converter=(GIConv)-1;
  2829 	return;
  2830     }
  2831     if (converter==(GIConv)-1)
  2832 	converter=g_iconv_open("WINDOWS-1252","UTF-8");
  2833     if (converter!=(GIConv)-1)
  2834     {
  2835 	inbytes=outbytes=strlen(string);
  2836 	bp=buf=g_malloc(outbytes+1);
  2837 	g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
  2838 	*bp='\0';
  2839 	fputs(buf,stdout);
  2840 	g_free(buf);
  2841     }
  2842     else
  2843 	fputs(string,stdout);
  2844 }
  2845 
  2846 void print_as_utf_8(const char *string)
  2847 {
  2848     fputs(string,stdout);
  2849 }
  2850 
  2851 /*
  2852  * procfile:
  2853  *
  2854  * Process one file.
  2855  */
  2856 void procfile(const char *filename)
  2857 {
  2858     const char *s;
  2859     gchar *parastart=NULL;	/* first line of current para */
  2860     gchar *etext,*aline;
  2861     gchar *etext_ptr;
  2862     GError *err=NULL;
  2863     struct first_pass_results *first_pass_results;
  2864     struct warnings *warnings;
  2865     struct counters counters={0};
  2866     struct line_properties last={0};
  2867     struct parities parities={0};
  2868     struct pending pending={0};
  2869     gboolean isemptyline;
  2870     long start_para_line=0;
  2871     gboolean isnewpara=FALSE,enddash=FALSE;
  2872     last.start=CHAR_SPACE;
  2873     linecnt=checked_linecnt=0;
  2874     etext=read_etext(filename,&err);
  2875     if (!etext)
  2876     {
  2877 	if (pswit[STDOUT_SWITCH])
  2878 	    fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
  2879 	else
  2880 	    fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
  2881 	exit(1);
  2882     }
  2883     g_print("\n\nFile: %s\n\n",filename);
  2884     first_pass_results=first_pass(etext);
  2885     warnings=report_first_pass(first_pass_results);
  2886     qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
  2887     qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
  2888     /*
  2889      * Here we go with the main pass. Hold onto yer hat!
  2890      */
  2891     linecnt=0;
  2892     etext_ptr=etext;
  2893     while ((aline=flgets(&etext_ptr,linecnt+1)))
  2894     {
  2895 	linecnt++;
  2896 	if (linecnt==1)
  2897 	    isnewpara=TRUE;
  2898 	if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
  2899 	    continue;    // skip DP page separators completely
  2900 	if (linecnt<first_pass_results->firstline ||
  2901 	  (first_pass_results->footerline>0 &&
  2902 	  linecnt>first_pass_results->footerline))
  2903 	{
  2904 	    if (pswit[HEADER_SWITCH])
  2905 	    {
  2906 		if (g_str_has_prefix(aline,"Title:"))
  2907 		    g_print("    %s\n",aline);
  2908 		if (g_str_has_prefix(aline,"Author:"))
  2909 		    g_print("    %s\n",aline);
  2910 		if (g_str_has_prefix(aline,"Release Date:"))
  2911 		    g_print("    %s\n",aline);
  2912 		if (g_str_has_prefix(aline,"Edition:"))
  2913 		    g_print("    %s\n\n",aline);
  2914 	    }
  2915 	    continue;		/* skip through the header */
  2916 	}
  2917 	checked_linecnt++;
  2918 	print_pending(aline,parastart,&pending);
  2919 	isemptyline=analyse_quotes(aline,&counters);
  2920 	if (isnewpara && !isemptyline)
  2921 	{
  2922 	    /* This line is the start of a new paragraph. */
  2923 	    start_para_line=linecnt;
  2924 	    /* Capture its first line in case we want to report it later. */
  2925 	    g_free(parastart);
  2926 	    parastart=g_strdup(aline);
  2927 	    memset(&parities,0,sizeof(parities));  /* restart the quote count */
  2928 	    s=aline;
  2929 	    while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
  2930 	      !g_unichar_isdigit(g_utf8_get_char(s)))
  2931 		s=g_utf8_next_char(s);
  2932 	    if (g_unichar_islower(g_utf8_get_char(s)))
  2933 	    {
  2934 		/* and its first letter is lowercase */
  2935 		if (pswit[ECHO_SWITCH])
  2936 		    g_print("\n%s\n",aline);
  2937 		if (!pswit[OVERVIEW_SWITCH])
  2938 		    g_print("    Line %ld column %ld - "
  2939 		      "Paragraph starts with lower-case\n",
  2940 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2941 		else
  2942 		    cnt_punct++;
  2943 	    }
  2944 	    isnewpara=FALSE; /* Signal the end of new para processing. */
  2945 	}
  2946 	/* Check for an em-dash broken at line end. */
  2947 	if (enddash && g_utf8_get_char(aline)=='-')
  2948 	{
  2949 	    if (pswit[ECHO_SWITCH])
  2950 		g_print("\n%s\n",aline);
  2951 	    if (!pswit[OVERVIEW_SWITCH])
  2952 		g_print("    Line %ld column 1 - Broken em-dash?\n",linecnt);
  2953 	    else
  2954 		cnt_punct++;
  2955 	}
  2956 	enddash=FALSE;
  2957 	for (s=g_utf8_prev_char(aline+strlen(aline));
  2958 	  g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
  2959 	    ;
  2960 	if (s>=aline && g_utf8_get_char(s)=='-')
  2961 	    enddash=TRUE;
  2962 	check_for_control_characters(aline);
  2963 	check_for_odd_characters(aline,warnings,isemptyline);
  2964 	if (warnings->longline)
  2965 	    check_for_long_line(aline);
  2966 	if (warnings->shortline)
  2967 	    check_for_short_line(aline,&last);
  2968 	last.blen=last.len;
  2969 	last.len=g_utf8_strlen(aline,-1);
  2970 	last.start=g_utf8_get_char(aline);
  2971 	check_for_starting_punctuation(aline);
  2972 	if (warnings->dash)
  2973 	{
  2974 	    check_for_spaced_emdash(aline);
  2975 	    check_for_spaced_dash(aline);
  2976 	}
  2977 	check_for_unmarked_paragraphs(aline);
  2978 	check_for_jeebies(aline);
  2979 	check_for_mta_from(aline);
  2980 	check_for_orphan_character(aline);
  2981 	check_for_pling_scanno(aline);
  2982 	check_for_extra_period(aline,warnings);
  2983 	check_for_following_punctuation(aline);
  2984 	check_for_typos(aline,warnings);
  2985 	check_for_misspaced_punctuation(aline,&parities,isemptyline);
  2986 	check_for_double_punctuation(aline,warnings);
  2987 	check_for_spaced_quotes(aline);
  2988 	check_for_miscased_genative(aline);
  2989 	check_end_of_line(aline,warnings);
  2990 	check_for_unspaced_bracket(aline);
  2991 	if (warnings->endquote)
  2992 	    check_for_unpunctuated_endquote(aline);
  2993 	check_for_html_tag(aline);
  2994 	check_for_html_entity(aline);
  2995 	if (isemptyline)
  2996 	{
  2997 	    check_for_mismatched_quotes(&counters,&pending);
  2998 	    counters_reset(&counters);
  2999 	    /* let the next iteration know that it's starting a new para */
  3000 	    isnewpara=TRUE;
  3001 	    if (prevline)
  3002 		check_for_omitted_punctuation(prevline,&last,start_para_line);
  3003 	}
  3004 	g_free(prevline);
  3005 	prevline=g_strdup(aline);
  3006     }
  3007     linecnt++;
  3008     check_for_mismatched_quotes(&counters,&pending);
  3009     print_pending(NULL,parastart,&pending);
  3010     reset_pending(&pending);
  3011     if (prevline)
  3012     {
  3013 	g_free(prevline);
  3014 	prevline=NULL;
  3015     }
  3016     g_free(parastart);
  3017     g_free(prevline);
  3018     g_free(etext);
  3019     if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])
  3020 	g_tree_foreach(qword,report_duplicate_queries,NULL);
  3021     g_tree_unref(qword);
  3022     g_tree_unref(qperiod);
  3023     counters_destroy(&counters);
  3024     g_set_print_handler(NULL);
  3025     print_as_windows_1252(NULL);
  3026     if (pswit[MARKUP_SWITCH])  
  3027 	loseentities(NULL);
  3028 }
  3029 
  3030 /*
  3031  * flgets:
  3032  *
  3033  * Get one line from the input text, checking for
  3034  * the existence of exactly one CR/LF line-end per line.
  3035  *
  3036  * Returns: a pointer to the line.
  3037  */
  3038 char *flgets(char **etext,long lcnt)
  3039 {
  3040     gunichar c;
  3041     gboolean isCR=FALSE;
  3042     char *theline=*etext;
  3043     char *eos=theline;
  3044     gchar *s;
  3045     for (;;)
  3046     {
  3047 	c=g_utf8_get_char(*etext);
  3048 	*etext=g_utf8_next_char(*etext);
  3049 	if (!c)
  3050 	    return NULL;
  3051 	/* either way, it's end of line */
  3052 	if (c=='\n')
  3053 	{
  3054 	    if (isCR)
  3055 		break;
  3056 	    else
  3057 	    {
  3058 		/* Error - a LF without a preceding CR */
  3059 		if (pswit[LINE_END_SWITCH])
  3060 		{
  3061 		    if (pswit[ECHO_SWITCH])
  3062 		    {
  3063 			s=g_strndup(theline,eos-theline);
  3064 			g_print("\n%s\n",s);
  3065 			g_free(s);
  3066 		    }
  3067 		    if (!pswit[OVERVIEW_SWITCH])
  3068 			g_print("    Line %ld - No CR?\n",lcnt);
  3069 		    else
  3070 			cnt_lineend++;
  3071 		}
  3072 		break;
  3073 	    }
  3074 	}
  3075 	if (c=='\r')
  3076 	{
  3077 	    if (isCR)
  3078 	    {
  3079 		/* Error - two successive CRs */
  3080 		if (pswit[LINE_END_SWITCH])
  3081 		{
  3082 		    if (pswit[ECHO_SWITCH])
  3083 		    {
  3084 			s=g_strndup(theline,eos-theline);
  3085 			g_print("\n%s\n",s);
  3086 			g_free(s);
  3087 		    }
  3088 		    if (!pswit[OVERVIEW_SWITCH])
  3089 			g_print("    Line %ld - Two successive CRs?\n",lcnt);
  3090 		    else
  3091 			cnt_lineend++;
  3092 		}
  3093 	    }
  3094 	    isCR=TRUE;
  3095 	}
  3096 	else
  3097 	{
  3098 	    if (pswit[LINE_END_SWITCH] && isCR)
  3099 	    {
  3100 		if (pswit[ECHO_SWITCH])
  3101 		{
  3102 		    s=g_strndup(theline,eos-theline);
  3103 		    g_print("\n%s\n",s);
  3104 		    g_free(s);
  3105 		}
  3106 		if (!pswit[OVERVIEW_SWITCH])
  3107 		    g_print("    Line %ld column %ld - CR without LF?\n",
  3108 		      lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
  3109 		else
  3110 		    cnt_lineend++;
  3111 		*eos=' ';
  3112 	    }
  3113 	    isCR=FALSE;
  3114 	    eos=g_utf8_next_char(eos);
  3115 	}
  3116     }
  3117     *eos='\0';
  3118     if (pswit[MARKUP_SWITCH])  
  3119 	postprocess_for_HTML(theline);
  3120     if (pswit[DP_SWITCH])  
  3121 	postprocess_for_DP(theline);
  3122     return theline;
  3123 }
  3124 
  3125 /*
  3126  * mixdigit:
  3127  *
  3128  * Takes a "word" as a parameter, and checks whether it
  3129  * contains a mixture of alpha and digits. Generally, this is an
  3130  * error, but may not be for cases like 4th or L5 12s. 3d.
  3131  *
  3132  * Returns: TRUE iff an is error found.
  3133  */
  3134 gboolean mixdigit(const char *checkword)
  3135 {
  3136     gboolean wehaveadigit,wehavealetter,query;
  3137     const char *s,*nondigit;
  3138     wehaveadigit=wehavealetter=query=FALSE;
  3139     for (s=checkword;*s;s=g_utf8_next_char(s))
  3140 	if (g_unichar_isalpha(g_utf8_get_char(s)))
  3141 	    wehavealetter=TRUE;
  3142 	else if (g_unichar_isdigit(g_utf8_get_char(s)))
  3143 	    wehaveadigit=TRUE;
  3144     if (wehaveadigit && wehavealetter)
  3145     {
  3146 	/* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
  3147 	query=TRUE;
  3148 	for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
  3149 	  nondigit=g_utf8_next_char(nondigit))
  3150 	    ;
  3151 	/* digits, ending in st, rd, nd, th of either case */
  3152 	if (!g_ascii_strcasecmp(nondigit,"st") ||
  3153 	  !g_ascii_strcasecmp(nondigit,"rd") ||
  3154 	  !g_ascii_strcasecmp(nondigit,"nd") ||
  3155 	  !g_ascii_strcasecmp(nondigit,"th"))
  3156 	    query=FALSE;
  3157 	if (!g_ascii_strcasecmp(nondigit,"sts") ||
  3158 	  !g_ascii_strcasecmp(nondigit,"rds") ||
  3159 	  !g_ascii_strcasecmp(nondigit,"nds") ||
  3160 	  !g_ascii_strcasecmp(nondigit,"ths"))
  3161 	    query=FALSE;
  3162 	if (!g_ascii_strcasecmp(nondigit,"stly") ||
  3163 	  !g_ascii_strcasecmp(nondigit,"rdly") ||
  3164 	  !g_ascii_strcasecmp(nondigit,"ndly") ||
  3165 	  !g_ascii_strcasecmp(nondigit,"thly"))
  3166 	    query=FALSE;
  3167 	/* digits, ending in l, L, s or d */
  3168 	if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
  3169 	  !strcmp(nondigit,"d"))
  3170 	    query=FALSE;
  3171 	/*
  3172 	 * L at the start of a number, representing Britsh pounds, like L500.
  3173 	 * This is cute. We know the current word is mixed digit. If the first
  3174 	 * letter is L, there must be at least one digit following. If both
  3175 	 * digits and letters follow, we have a genuine error, else we have a
  3176 	 * capital L followed by digits, and we accept that as a non-error.
  3177 	 */
  3178 	if (g_utf8_get_char(checkword)=='L' &&
  3179 	  !mixdigit(g_utf8_next_char(checkword)))
  3180 	    query=FALSE;
  3181     }
  3182     return query;
  3183 }
  3184 
  3185 /*
  3186  * getaword:
  3187  *
  3188  * Extracts the first/next "word" from the line, and returns it.
  3189  * A word is defined as one English word unit--or at least that's the aim.
  3190  * "ptr" is advanced to the position in the line where we will start
  3191  * looking for the next word.
  3192  *
  3193  * Returns: A newly-allocated string.
  3194  */
  3195 gchar *getaword(const char **ptr)
  3196 {
  3197     const char *s,*t;
  3198     GString *word;
  3199     gunichar c,pc;
  3200     word=g_string_new(NULL);
  3201     for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
  3202       !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
  3203       **ptr;*ptr=g_utf8_next_char(*ptr))
  3204 	;
  3205     /*
  3206      * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
  3207      * Especially yucky is the case of L1,000
  3208      * This section looks for a pattern of characters including a digit
  3209      * followed by a comma or period followed by one or more digits.
  3210      * If found, it returns this whole pattern as a word; otherwise we discard
  3211      * the results and resume our normal programming.
  3212      */
  3213     s=*ptr;
  3214     for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
  3215       g_unichar_isalpha(g_utf8_get_char(s)) ||
  3216       g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
  3217 	g_string_append_unichar(word,g_utf8_get_char(s));
  3218     if (word->len)
  3219     {
  3220 	for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))
  3221 	{
  3222 	    c=g_utf8_get_char(t);
  3223 	    pc=g_utf8_get_char(g_utf8_prev_char(t));
  3224 	    if ((c=='.' || c==',') && g_unichar_isdigit(pc))
  3225 	    {
  3226 		*ptr=s;
  3227 		return g_string_free(word,FALSE);
  3228 	    }
  3229 	}
  3230     }
  3231     /* we didn't find a punctuated number - do the regular getword thing */
  3232     g_string_truncate(word,0);
  3233     c=g_utf8_get_char(*ptr);
  3234     for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);
  3235       *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))
  3236 	g_string_append_unichar(word,c);
  3237     return g_string_free(word,FALSE);
  3238 }
  3239 
  3240 /*
  3241  * isroman:
  3242  *
  3243  * Is this word a Roman Numeral?
  3244  *
  3245  * It doesn't actually validate that the number is a valid Roman Numeral--for
  3246  * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
  3247  * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
  3248  * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
  3249  * expressions thereof, except when it came to taxes. Allow any number of M,
  3250  * an optional D, an optional CM or CD, any number of optional Cs, an optional
  3251  * XL or an optional XC, an optional IX or IV, an optional V and any number
  3252  * of optional Is.
  3253  */
  3254 gboolean isroman(const char *t)
  3255 {
  3256     const char *s;
  3257     if (!t || !*t)
  3258 	return FALSE;
  3259     s=t;
  3260     while (g_utf8_get_char(t)=='m' && *t)
  3261 	t++;
  3262     if (g_utf8_get_char(t)=='d')
  3263 	t++;
  3264     if (g_str_has_prefix(t,"cm"))
  3265 	t+=2;
  3266     if (g_str_has_prefix(t,"cd"))
  3267 	t+=2;
  3268     while (g_utf8_get_char(t)=='c' && *t)
  3269 	t++;
  3270     if (g_str_has_prefix(t,"xl"))
  3271 	t+=2;
  3272     if (g_str_has_prefix(t,"xc"))
  3273 	t+=2;
  3274     if (g_utf8_get_char(t)=='l')
  3275 	t++;
  3276     while (g_utf8_get_char(t)=='x' && *t)
  3277 	t++;
  3278     if (g_str_has_prefix(t,"ix"))
  3279 	t+=2;
  3280     if (g_str_has_prefix(t,"iv"))
  3281 	t+=2;
  3282     if (g_utf8_get_char(t)=='v')
  3283 	t++;
  3284     while (g_utf8_get_char(t)=='i' && *t)
  3285 	t++;
  3286     return !*t;
  3287 }
  3288 
  3289 /*
  3290  * postprocess_for_DP:
  3291  *
  3292  * Invoked with the -d switch from flgets().
  3293  * It simply "removes" from the line a hard-coded set of common
  3294  * DP-specific tags, so that the line passed to the main routine has
  3295  * been pre-cleaned of DP markup.
  3296  */
  3297 void postprocess_for_DP(char *theline)
  3298 {
  3299     char *s,*t;
  3300     int i;
  3301     if (!*theline) 
  3302 	return;
  3303     for (i=0;*DPmarkup[i];i++)
  3304 	while ((s=strstr(theline,DPmarkup[i])))
  3305 	{
  3306 	    t=s+strlen(DPmarkup[i]);
  3307 	    memmove(s,t,strlen(t)+1);
  3308 	}
  3309 }
  3310 
  3311 /*
  3312  * postprocess_for_HTML:
  3313  *
  3314  * Invoked with the -m switch from flgets().
  3315  * It simply "removes" from the line a hard-coded set of common
  3316  * HTML tags and "replaces" a hard-coded set of common HTML
  3317  * entities, so that the line passed to the main routine has
  3318  * been pre-cleaned of HTML.
  3319  */
  3320 void postprocess_for_HTML(char *theline)
  3321 {
  3322     while (losemarkup(theline))
  3323 	;
  3324     loseentities(theline);
  3325 }
  3326 
  3327 char *losemarkup(char *theline)
  3328 {
  3329     char *s,*t;
  3330     int i;
  3331     s=strchr(theline,'<');
  3332     t=s?strchr(s,'>'):NULL;
  3333     if (!s || !t)
  3334 	return NULL;
  3335     for (i=0;*markup[i];i++)
  3336 	if (tagcomp(g_utf8_next_char(s),markup[i]))
  3337 	{
  3338 	    t=g_utf8_next_char(t);
  3339 	    memmove(s,t,strlen(t)+1);
  3340 	    return s;
  3341 	}
  3342     /* It's an unrecognized <xxx>. */
  3343     return NULL;
  3344 }
  3345 
  3346 void loseentities(char *theline)
  3347 {
  3348     int i;
  3349     gsize nb;
  3350     char *amp,*scolon;
  3351     gchar *s,*t;
  3352     gunichar c;
  3353     GTree *entities=NULL;
  3354     static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
  3355     if (!theline)
  3356     {
  3357 	if (entities)
  3358 	    g_tree_destroy(entities);
  3359 	entities=NULL;
  3360 	if (translit!=(GIConv)-1)
  3361 	    g_iconv_close(translit);
  3362 	translit=(GIConv)-1;
  3363 	if (to_utf8!=(GIConv)-1)
  3364 	    g_iconv_close(to_utf8);
  3365 	to_utf8=(GIConv)-1;
  3366 	return;
  3367     }
  3368     if (!*theline)
  3369 	return;
  3370     if (!entities)
  3371     {
  3372 	entities=g_tree_new((GCompareFunc)strcmp);
  3373 	for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
  3374 	    g_tree_insert(entities,HTMLentities[i].name,
  3375 	      GUINT_TO_POINTER(HTMLentities[i].c));
  3376     }
  3377     if (translit==(GIConv)-1)
  3378 	translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
  3379     if (to_utf8==(GIConv)-1)
  3380 	to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
  3381     while((amp=strchr(theline,'&')))
  3382     {
  3383 	scolon=strchr(amp,';');
  3384 	if (scolon)
  3385 	{
  3386 	    if (amp[1]=='#')
  3387 	    {
  3388 		if (amp+2+strspn(amp+2,"0123456789")==scolon)
  3389 		    c=strtol(amp+2,NULL,10);
  3390 		else if (amp[2]=='x' &&
  3391 		  amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
  3392 		    c=strtol(amp+3,NULL,16);
  3393 	    }
  3394 	    else
  3395 	    {
  3396 		s=g_strndup(amp+1,scolon-(amp+1));
  3397 	        c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
  3398 		g_free(s);
  3399 	    }
  3400 	}
  3401 	else
  3402 	    c=0;
  3403 	if (c)
  3404 	{
  3405 	    theline=amp;
  3406 	    if (c<128 || c>=192 && c<=255)	/* An ISO-8859-1 character */
  3407 		theline+=g_unichar_to_utf8(c,theline);
  3408 	    else
  3409 	    {
  3410 		s=g_malloc(6);
  3411 		nb=g_unichar_to_utf8(c,s);
  3412 		t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
  3413 		g_free(s);
  3414 		s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
  3415 		g_free(t);
  3416 		memcpy(theline,s,nb);
  3417 		g_free(s);
  3418 		theline+=nb;
  3419 	    }
  3420 	    memmove(theline,g_utf8_next_char(scolon),
  3421 	      strlen(g_utf8_next_char(scolon))+1);
  3422 	}
  3423 	else
  3424 	    theline=g_utf8_next_char(amp);
  3425     }
  3426 }
  3427 
  3428 gboolean tagcomp(const char *strin,const char *basetag)
  3429 {
  3430     gboolean retval;
  3431     gchar *s,*t;
  3432     if (g_utf8_get_char(strin)=='/')
  3433 	t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
  3434     else
  3435 	t=g_utf8_casefold(strin,-1);
  3436     s=g_utf8_casefold(basetag,-1);
  3437     retval=g_str_has_prefix(t,s);
  3438     g_free(s);
  3439     g_free(t);
  3440     return retval;
  3441 }
  3442 
  3443 void proghelp(GOptionContext *context)
  3444 {
  3445     gchar *help;
  3446     fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
  3447     fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
  3448     fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
  3449     fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
  3450       "For details, read the file COPYING.\n",stderr);
  3451     fputs("This is Free Software; "
  3452       "you may redistribute it under certain conditions (GPL);\n",stderr);
  3453     fputs("read the file COPYING for details.\n\n",stderr);
  3454     help=g_option_context_get_help(context,TRUE,NULL);
  3455     fputs(help,stderr);
  3456     g_free(help);
  3457     fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
  3458     fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
  3459       "non-ASCII\n",stderr);
  3460     fputs("characters like accented letters, "
  3461       "lines longer than 75 or shorter than 55,\n",stderr);
  3462     fputs("unbalanced quotes or brackets, "
  3463       "a variety of badly formatted punctuation, \n",stderr);
  3464     fputs("HTML tags, some likely typos. "
  3465       "It is NOT a substitute for human judgement.\n",stderr);
  3466     fputs("\n",stderr);
  3467 }