bookloupe/bookloupe.c
author ali <ali@juiblex.co.uk>
Mon Oct 21 23:36:40 2013 +0100 (2013-10-21)
changeset 191 189183b37598
parent 186 4912234d80be
parent 184 cd3068704d3a
child 192 1aeda7fe17ca
permissions -rw-r--r--
Merge bug #24: Accept alternate form of newline
     1 /*************************************************************************/
     2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
     3 /*									 */
     4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>			 */
     5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>			 */
     6 /*									 */
     7 /* This program is free software; you can redistribute it and/or modify  */
     8 /* it under the terms of the GNU General Public License as published by  */
     9 /* the Free Software Foundation; either version 2 of the License, or     */
    10 /* (at your option) any later version.					 */
    11 /*									 */
    12 /* This program is distributed in the hope that it will be useful,       */
    13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of	 */
    14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the		 */
    15 /* GNU General Public License for more details.				 */
    16 /*									 */
    17 /* You should have received a copy of the GNU General Public License	 */
    18 /* along with this program. If not, see <http://www.gnu.org/licenses/>.	 */
    19 /*************************************************************************/
    20 
    21 #include <stdio.h>
    22 #include <stdlib.h>
    23 #include <string.h>
    24 #include <ctype.h>
    25 #ifdef __WIN32__
    26 #include <windows.h>
    27 #endif
    28 #include <glib.h>
    29 #include <bl/bl.h>
    30 #include "bookloupe.h"
    31 #include "counters.h"
    32 #include "pending.h"
    33 #include "HTMLentities.h"
    34 
    35 gchar *prevline;
    36 
    37 /* Common typos. */
    38 char *typo[] = {
    39     "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
    40     "nad", "te", "ig", "acn",  "ahve", "alot", "anbd", "andt", "awya", "aywa",
    41     "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
    42     "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
    43     "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
    44     "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
    45     "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
    46     "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
    47     "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
    48     "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
    49     "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
    50     "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
    51     "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
    52     "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
    53     "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
    54     "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
    55     "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
    56     "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
    57     "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
    58     "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
    59     "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
    60     "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
    61     "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
    62     "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
    63     "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
    64     "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
    65     "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
    66     "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
    67     "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
    68     "se", ""
    69 };
    70 
    71 GTree *usertypo;
    72 
    73 /* Common abbreviations and other OK words not to query as typos. */
    74 char *okword[] = {
    75     "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
    76     "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
    77     "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
    78     "outbid", "outbids", "frostbite", "frostbitten", ""
    79 };
    80 
    81 /* Common abbreviations that cause otherwise unexplained periods. */
    82 char *abbrev[] = {
    83     "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
    84     "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
    85 };
    86 
    87 /*
    88  * Two-Letter combinations that rarely if ever start words,
    89  * but are common scannos or otherwise common letter combinations.
    90  */
    91 char *nostart[] = {
    92     "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
    93 };
    94 
    95 /*
    96  * Two-Letter combinations that rarely if ever end words,
    97  * but are common scannos or otherwise common letter combinations.
    98  */
    99 char *noend[] = {
   100     "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
   101     "sw", "gr", "sl", "cl", "iy", ""
   102 };
   103 
   104 char *markup[] = {
   105     "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
   106     "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
   107     "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
   108     "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
   109 };
   110 
   111 char *DPmarkup[] = {
   112     "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
   113 };
   114 
   115 char *nocomma[] = {
   116     "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
   117     "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
   118     "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
   119     "during", "let", "toward", "among", ""
   120 };
   121 
   122 char *noperiod[] = {
   123     "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
   124     "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
   125     "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
   126     "among", "those", "into", "whom", "having", "thence", ""
   127 }; 
   128 
   129 gboolean pswit[SWITNO];  /* program switches */
   130 
   131 gboolean typo_compat,paranoid_compat;
   132 
   133 static GOptionEntry options[]={
   134     { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
   135       "Ignore DP-specific markup", NULL },
   136     { "no-dp", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   137       G_OPTION_ARG_NONE, pswit+DP_SWITCH,
   138       "Don't ignore DP-specific markup", NULL },
   139     { "echo", 0, G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
   140       "Echo queried line", NULL },
   141     { "no-echo", 'e', G_OPTION_FLAG_REVERSE,
   142       G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
   143       "Don't echo queried line", NULL },
   144     { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
   145       "Check single quotes", NULL },
   146     { "no-squote", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   147       G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
   148       "Don't check single quotes", NULL },
   149     { "typo", 0, 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
   150       "Check common typos", NULL },
   151     { "no-typo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   152       G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
   153       "Don't check common typos", NULL },
   154     { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
   155       "Require closure of quotes on every paragraph", NULL },
   156     { "no-qpara", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   157       G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
   158       "Don't require closure of quotes on every paragraph", NULL },
   159     { "paranoid", 0, G_OPTION_FLAG_HIDDEN,
   160       G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
   161       "Enable paranoid querying of everything", NULL },
   162     { "no-paranoid", 0, G_OPTION_FLAG_REVERSE,
   163       G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
   164       "Disable paranoid querying of everything", NULL },
   165     { "line-end", 0, G_OPTION_FLAG_HIDDEN,
   166       G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
   167       "Enable line end checking", NULL },
   168     { "no-line-end", 'l', G_OPTION_FLAG_REVERSE,
   169       G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
   170       "Diable line end checking", NULL },
   171     { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
   172       "Overview: just show counts", NULL },
   173     { "no-overview", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   174       G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
   175       "Show individual warnings", NULL },
   176     { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
   177       "Output errors to stdout instead of stderr", NULL },
   178     { "no-stdout", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   179       G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
   180       "Output errors to stderr instead of stdout", NULL },
   181     { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
   182       "Echo header fields", NULL },
   183     { "no-header", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   184       G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
   185       "Don't echo header fields", NULL },
   186     { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
   187       "Ignore markup in < >", NULL },
   188     { "no-markup", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   189       G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
   190       "No special handling for markup in < >", NULL },
   191     { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
   192       "Use file of user-defined typos", NULL },
   193     { "no-usertypo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   194       G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
   195       "Ignore file of user-defined typos", NULL },
   196     { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
   197       "Verbose - list everything", NULL },
   198     { "no-verbose", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   199       G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
   200       "Switch off verbose mode", NULL },
   201     { NULL }
   202 };
   203 
   204 /*
   205  * Options relating to configuration which make no sense from inside
   206  * a configuration file.
   207  */
   208 
   209 static GOptionEntry config_options[]={
   210     { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
   211       "Defaults for use on www upload", NULL },
   212     { "dump-config", 0, 0, G_OPTION_ARG_NONE, pswit+DUMP_CONFIG_SWITCH,
   213       "Dump current config settings", NULL },
   214     { NULL }
   215 };
   216 
   217 static GOptionEntry compatibility_options[]={
   218     { "toggle-typo", 't', 0, G_OPTION_ARG_NONE, &typo_compat,
   219       "Toggle checking for common typos", NULL },
   220     { "toggle-relaxed", 'x', 0, G_OPTION_ARG_NONE, &paranoid_compat,
   221       "Toggle both paranoid mode and common typos", NULL },
   222     { NULL }
   223 };
   224 
   225 long cnt_quote;		/* for overview mode, count of quote queries */
   226 long cnt_brack;		/* for overview mode, count of brackets queries */
   227 long cnt_bin;		/* for overview mode, count of non-ASCII queries */
   228 long cnt_odd;		/* for overview mode, count of odd character queries */
   229 long cnt_long;		/* for overview mode, count of long line errors */
   230 long cnt_short;		/* for overview mode, count of short line queries */
   231 long cnt_punct;		/* for overview mode,
   232 			   count of punctuation and spacing queries */
   233 long cnt_dash;		/* for overview mode, count of dash-related queries */
   234 long cnt_word;		/* for overview mode, count of word queries */
   235 long cnt_html;		/* for overview mode, count of html queries */
   236 long cnt_lineend;	/* for overview mode, count of line-end queries */
   237 long cnt_spacend;	/* count of lines with space at end */
   238 long linecnt;		/* count of total lines in the file */
   239 long checked_linecnt;	/* count of lines actually checked */
   240 
   241 void proghelp(GOptionContext *context);
   242 void procfile(const char *);
   243 
   244 gchar *running_from;
   245 
   246 gboolean mixdigit(const char *);
   247 gchar *getaword(const char **);
   248 char *flgets(char **,long,gboolean);
   249 void postprocess_for_HTML(char *);
   250 char *linehasmarkup(char *);
   251 char *losemarkup(char *);
   252 gboolean tagcomp(const char *,const char *);
   253 void loseentities(char *);
   254 gboolean isroman(const char *);
   255 void postprocess_for_DP(char *);
   256 void print_as_windows_1252(const char *string);
   257 void print_as_utf_8(const char *string);
   258 
   259 GTree *qword,*qperiod;
   260 
   261 #ifdef __WIN32__
   262 UINT saved_cp;
   263 #endif
   264 
   265 GKeyFile *config;
   266 
   267 void config_file_update(GKeyFile *kf)
   268 {
   269     int i;
   270     gboolean sw;
   271     for(i=0;options[i].long_name;i++)
   272     {
   273 	if (g_str_has_prefix(options[i].long_name,"no-"))
   274 	    continue;
   275 	if (options[i].arg==G_OPTION_ARG_NONE)
   276 	{
   277 	    sw=*(gboolean *)options[i].arg_data;
   278 	    if (options[i].flags&G_OPTION_FLAG_REVERSE)
   279 		sw=!sw;
   280 	    g_key_file_set_boolean(kf,"options",options[i].long_name,sw);
   281 	}
   282 	else
   283 	    g_assert_not_reached();
   284     }
   285 }
   286 
   287 void config_file_add_comments(GKeyFile *kf)
   288 {
   289     int i;
   290     gchar *comment;
   291     g_key_file_set_comment(kf,NULL,NULL," Default configuration for bookloupe",
   292       NULL);
   293     for(i=0;options[i].long_name;i++)
   294     {
   295 	if (g_str_has_prefix(options[i].long_name,"no-"))
   296 	    continue;
   297 	comment=g_strconcat(" ",options[i].description,NULL);
   298 	g_key_file_set_comment(kf,"options",options[i].long_name,comment,NULL);
   299 	g_free(comment);
   300     }
   301 }
   302 
   303 void dump_config(void)
   304 {
   305     gchar *s;
   306     if (config)
   307 	config_file_update(config);
   308     else
   309     {
   310 	config=g_key_file_new();
   311 	config_file_update(config);
   312 	config_file_add_comments(config);
   313     }
   314     s=g_key_file_to_data(config,NULL,NULL);
   315     if (s)
   316 	g_print("%s",s);
   317     g_free(s);
   318 }
   319 
   320 GKeyFile *read_config_file(gchar **full_path)
   321 {
   322     int i;
   323     GError *err=NULL;
   324     gchar **search_dirs;
   325     gchar *path;
   326     const char *search_path;
   327     GKeyFile *kf;
   328     kf=g_key_file_new();
   329     search_path=g_getenv("BOOKLOUPE_CONFIG_PATH");
   330     if (search_path)
   331     {
   332 #ifdef __WIN32__
   333 	search_dirs=g_strsplit(search_path,";",0);
   334 #else
   335 	search_dirs=g_strsplit(search_path,":",0);
   336 #endif
   337     }
   338     else
   339     {
   340 	search_dirs=g_new(gchar *,4);
   341 	search_dirs[0]=g_get_current_dir();
   342 	search_dirs[1]=g_strdup(running_from);
   343 	search_dirs[2]=g_strdup(g_get_user_config_dir());
   344 	search_dirs[3]=NULL;
   345     }
   346     for(i=0;search_dirs[i];i++)
   347     {
   348 	path=g_build_filename(search_dirs[i],"bookloupe.ini",NULL);
   349 	if (g_key_file_load_from_file(kf,path,
   350 	  G_KEY_FILE_KEEP_COMMENTS|G_KEY_FILE_KEEP_TRANSLATIONS,&err))
   351 	    break;
   352 	if (!g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   353 	{
   354 	    g_printerr("Bookloupe: Error reading %s\n",path);
   355 	    g_printerr("%s\n",err->message);
   356 	    exit(1);
   357 	}
   358 	g_clear_error(&err);
   359 	g_free(path);
   360 	path=NULL;
   361     }
   362     if (!search_dirs[i])
   363     {
   364 	g_key_file_free(kf);
   365 	kf=NULL;
   366     }
   367     g_strfreev(search_dirs);
   368     if (full_path && kf)
   369 	*full_path=path;
   370     else
   371 	g_free(path);
   372     return kf;
   373 }
   374 
   375 void parse_config_file(void)
   376 {
   377     int i,j;
   378     gchar *path;
   379     gchar **keys;
   380     gboolean sw;
   381     GError *err=NULL;
   382     config=read_config_file(&path);
   383     if (config)
   384 	keys=g_key_file_get_keys(config,"options",NULL,NULL);
   385     else
   386 	keys=NULL;
   387     if (keys)
   388     {
   389 	for(i=0;keys[i];i++)
   390 	{
   391 	    for(j=0;options[j].long_name;j++)
   392 	    {
   393 		if (g_str_has_prefix(options[j].long_name,"no-"))
   394 		    continue;
   395 		else if (!strcmp(keys[i],options[j].long_name))
   396 		{
   397 		    if (options[j].arg==G_OPTION_ARG_NONE)
   398 		    {
   399 			sw=g_key_file_get_boolean(config,"options",keys[i],
   400 			  &err);
   401 			if (err)
   402 			{
   403 			    g_printerr("Bookloupe: %s: options.%s: %s\n",
   404 			      path,keys[i],err->message);
   405 			    g_clear_error(&err);
   406 			}
   407 			if (options[j].flags&G_OPTION_FLAG_REVERSE)
   408 			    sw=!sw;
   409 			*(gboolean *)options[j].arg_data=sw;
   410 			break;
   411 		    }
   412 		    else
   413 			g_assert_not_reached();
   414 		}
   415 	    }
   416 	    if (!options[j].long_name)
   417 		g_printerr("Bookloupe: %s: Unknown option \"%s\" ignored\n",
   418 		  path,keys[i]);
   419 	}
   420 	g_strfreev(keys);
   421     }
   422     if (config)
   423 	g_free(path);
   424 }
   425 
   426 void parse_options(int *argc,char ***argv)
   427 {
   428     GError *err=NULL;
   429     GOptionContext *context;
   430     GOptionGroup *compatibility;
   431     context=g_option_context_new(
   432       "file - look for errors in Project Gutenberg(TM) etexts");
   433     g_option_context_add_main_entries(context,options,NULL);
   434     g_option_context_add_main_entries(context,config_options,NULL);
   435     compatibility=g_option_group_new("compatibility",
   436       "Options for Compatibility with Gutcheck:",
   437       "Show compatibility options",NULL,NULL);
   438     g_option_group_add_entries(compatibility,compatibility_options);
   439     g_option_context_add_group(context,compatibility);
   440     g_option_context_set_description(context,
   441       "For simplicity, only the switch options which reverse the\n"
   442       "default configuration are listed. In most cases, both vanilla\n"
   443       "and \"no-\" prefixed versions are available for use.");
   444     if (!g_option_context_parse(context,argc,argv,&err))
   445     {
   446 	g_printerr("Bookloupe: %s\n",err->message);
   447 	g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
   448 	exit(1);
   449     }
   450     if (typo_compat)
   451 	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
   452     if (paranoid_compat)
   453     {
   454 	pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
   455 	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
   456     }
   457     /*
   458      * Web uploads - for the moment, this is really just a placeholder
   459      * until we decide what processing we really want to do on web uploads
   460      */
   461     if (pswit[WEB_SWITCH])
   462     {
   463 	/* specific override for web uploads */
   464 	pswit[ECHO_SWITCH]=TRUE;
   465 	pswit[SQUOTE_SWITCH]=FALSE;
   466 	pswit[TYPO_SWITCH]=TRUE;
   467 	pswit[QPARA_SWITCH]=FALSE;
   468 	pswit[PARANOID_SWITCH]=TRUE;
   469 	pswit[LINE_END_SWITCH]=FALSE;
   470 	pswit[OVERVIEW_SWITCH]=FALSE;
   471 	pswit[STDOUT_SWITCH]=FALSE;
   472 	pswit[HEADER_SWITCH]=TRUE;
   473 	pswit[VERBOSE_SWITCH]=FALSE;
   474 	pswit[MARKUP_SWITCH]=FALSE;
   475 	pswit[USERTYPO_SWITCH]=FALSE;
   476 	pswit[DP_SWITCH]=FALSE;
   477     }
   478     if (pswit[DUMP_CONFIG_SWITCH])
   479     {
   480 	dump_config();
   481 	exit(0);
   482     }
   483     if (pswit[OVERVIEW_SWITCH])
   484 	/* just print summary; don't echo */
   485 	pswit[ECHO_SWITCH]=FALSE;
   486     if (*argc<2)
   487     {
   488 	proghelp(context);
   489 	exit(1);
   490     }
   491     g_option_context_free(context);
   492 }
   493 
   494 /*
   495  * read_user_scannos:
   496  *
   497  * Read in the user-defined stealth scanno list.
   498  */
   499 void read_user_scannos(void)
   500 {
   501     GError *err=NULL;
   502     gchar *usertypo_file;
   503     gboolean okay;
   504     int i;
   505     gsize len,nb;
   506     gchar *contents,*utf8,**lines;
   507     usertypo_file=g_strdup("bookloupe.typ");
   508     okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   509     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   510     {
   511 	g_clear_error(&err);
   512 	g_free(usertypo_file);
   513 	usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
   514 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   515     }
   516     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   517     {
   518 	g_clear_error(&err);
   519 	g_free(usertypo_file);
   520 	usertypo_file=g_strdup("gutcheck.typ");
   521 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   522     }
   523     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   524     {
   525 	g_clear_error(&err);
   526 	g_free(usertypo_file);
   527 	usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
   528 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   529     }
   530     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   531     {
   532 	g_free(usertypo_file);
   533 	g_print("   --> I couldn't find bookloupe.typ "
   534 	  "-- proceeding without user typos.\n");
   535 	return;
   536     }
   537     else if (!okay)
   538     {
   539 	fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
   540 	g_free(usertypo_file);
   541 	g_clear_error(&err);
   542 	exit(1);
   543     }
   544     if (g_utf8_validate(contents,len,NULL))
   545 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
   546     else
   547 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
   548     g_free(contents);
   549     lines=g_strsplit_set(utf8,"\r\n",0);
   550     g_free(utf8);
   551     usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
   552     for (i=0;lines[i];i++)
   553 	if (*(unsigned char *)lines[i]>'!')
   554 	    g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
   555 	else
   556 	    g_free(lines[i]);
   557     g_free(lines);
   558 }
   559 
   560 /*
   561  * read_etext:
   562  *
   563  * Read an etext returning a newly allocated string containing the file
   564  * contents or NULL on error.
   565  */
   566 gchar *read_etext(const char *filename,GError **err)
   567 {
   568     GError *tmp_err=NULL;
   569     gchar *contents,*utf8;
   570     gsize len,bytes_read,bytes_written;
   571     int i,line,col;
   572     if (!g_file_get_contents(filename,&contents,&len,err))
   573 	return NULL;
   574     if (g_utf8_validate(contents,len,NULL))
   575     {
   576 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
   577 	g_set_print_handler(print_as_utf_8);
   578 #ifdef __WIN32__
   579 	SetConsoleOutputCP(CP_UTF8);
   580 #endif
   581     }
   582     else
   583     {
   584 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,
   585 	  &bytes_written,&tmp_err);
   586 	if (g_error_matches(tmp_err,G_CONVERT_ERROR,
   587 	  G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
   588 	{
   589 	    line=col=1;
   590 	    for(i=0;i<bytes_read;i++)
   591 		if (contents[i]=='\n')
   592 		{
   593 		    line++;
   594 		    col=1;
   595 		}
   596 		else if (contents[i]!='\r')
   597 		    col++;
   598 	    g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
   599 	      "Input conversion failed. Byte %d at line %d, column %d is not a "
   600 	      "valid Windows-1252 character",
   601 	      ((unsigned char *)contents)[bytes_read],line,col);
   602 	}
   603 	else if (tmp_err)
   604 	    g_propagate_error(err,tmp_err);
   605 	g_set_print_handler(print_as_windows_1252);
   606 #ifdef __WIN32__
   607 	SetConsoleOutputCP(1252);
   608 #endif
   609     }
   610     g_free(contents);
   611     return utf8;
   612 }
   613 
   614 void cleanup_on_exit(void)
   615 {
   616 #ifdef __WIN32__
   617     SetConsoleOutputCP(saved_cp);
   618 #endif
   619 }
   620 
   621 int main(int argc,char **argv)
   622 {
   623 #ifdef __WIN32__
   624     atexit(cleanup_on_exit);
   625     saved_cp=GetConsoleOutputCP();
   626 #endif
   627     running_from=g_path_get_dirname(argv[0]);
   628     /* Paranoid checking is turned OFF, not on, by its switch */
   629     pswit[PARANOID_SWITCH]=TRUE;
   630     /* if running in paranoid mode, typo checks default to enabled */
   631     pswit[TYPO_SWITCH]=TRUE;
   632     /* Line-end checking is turned OFF, not on, by its switch */
   633     pswit[LINE_END_SWITCH]=TRUE;
   634     /* Echoing is turned OFF, not on, by its switch */
   635     pswit[ECHO_SWITCH]=TRUE;
   636     parse_config_file();
   637     parse_options(&argc,&argv);
   638     if (pswit[USERTYPO_SWITCH])
   639 	read_user_scannos();
   640     fprintf(stderr,"bookloupe: Check and report on an e-text\n");
   641     procfile(argv[1]);
   642     if (pswit[OVERVIEW_SWITCH])
   643     {
   644 	g_print("    Checked %ld lines of %ld (head+foot = %ld)\n\n",
   645 	  checked_linecnt,linecnt,linecnt-checked_linecnt);
   646 	g_print("    --------------- Queries found --------------\n");
   647 	if (cnt_long)
   648 	    g_print("    Long lines:		    %14ld\n",cnt_long);
   649 	if (cnt_short)
   650 	    g_print("    Short lines:		   %14ld\n",cnt_short);
   651 	if (cnt_lineend)
   652 	    g_print("    Line-end problems:	     %14ld\n",cnt_lineend);
   653 	if (cnt_word)
   654 	    g_print("    Common typos:		  %14ld\n",cnt_word);
   655 	if (cnt_quote)
   656 	    g_print("    Unmatched quotes:	      %14ld\n",cnt_quote);
   657 	if (cnt_brack)
   658 	    g_print("    Unmatched brackets:	    %14ld\n",cnt_brack);
   659 	if (cnt_bin)
   660 	    g_print("    Non-ASCII characters:	  %14ld\n",cnt_bin);
   661 	if (cnt_odd)
   662 	    g_print("    Proofing characters:	   %14ld\n",cnt_odd);
   663 	if (cnt_punct)
   664 	    g_print("    Punctuation & spacing queries: %14ld\n",cnt_punct);
   665 	if (cnt_dash)
   666 	    g_print("    Non-standard dashes:	   %14ld\n",cnt_dash);
   667 	if (cnt_html)
   668 	    g_print("    Possible HTML tags:	    %14ld\n",cnt_html);
   669 	g_print("\n");
   670 	g_print("    TOTAL QUERIES		  %14ld\n",
   671 	  cnt_quote+cnt_brack+cnt_bin+cnt_odd+cnt_long+cnt_short+cnt_punct+
   672 	  cnt_dash+cnt_word+cnt_html+cnt_lineend);
   673     }
   674     g_free(running_from);
   675     if (usertypo)
   676 	g_tree_unref(usertypo);
   677     if (config)
   678 	g_key_file_free(config);
   679     return 0;
   680 }
   681 
   682 void count_dashes(const char *line,const char *dash,
   683   struct dash_results *results)
   684 {
   685     int i;
   686     gchar **tokens;
   687     gunichar pc,nc;
   688     gboolean spaced=FALSE,unspaced=FALSE,spaced2=FALSE;
   689     if (!*line)
   690 	return;
   691     tokens=g_strsplit(line,dash,0);
   692     if (tokens[1])
   693 	results->base++;
   694     for(i=1;tokens[i];i++)
   695     {
   696 	pc=g_utf8_get_char(g_utf8_prev_char(tokens[i-1]+strlen(tokens[i-1])));
   697 	nc=g_utf8_get_char(tokens[i]);
   698 	if (g_unichar_isspace(pc) || g_unichar_isspace(nc))
   699 	    spaced=TRUE;
   700 	if (g_unichar_isspace(pc) && g_unichar_isspace(nc))
   701 	    spaced2=TRUE;
   702 	else if (!g_unichar_isspace(pc) && !g_unichar_isspace(nc))
   703 	    unspaced=TRUE;
   704     }
   705     if (spaced)
   706 	results->space++;
   707     if (spaced2)
   708 	/* count of lines with em-dashes with spaces both sides */
   709 	results->non_PG_space++;
   710     if (unspaced)
   711 	/* count of lines with PG-type em-dashes with no spaces */
   712 	results->PG_space++;
   713     g_strfreev(tokens);
   714 }
   715 
   716 /*
   717  * first_pass:
   718  *
   719  * Run a first pass - verify that it's a valid PG
   720  * file, decide whether to report some things that
   721  * occur many times in the text like long or short
   722  * lines, non-standard dashes, etc.
   723  */
   724 struct first_pass_results *first_pass(const char *etext)
   725 {
   726     gunichar laststart=CHAR_SPACE;
   727     const char *s;
   728     gchar *lc_line;
   729     int i,j,lbytes,llen;
   730     gchar **lines;
   731     unsigned int lastlen=0,lastblen=0;
   732     long spline=0,nspline=0;
   733     static struct first_pass_results results={0};
   734     struct dash_results tmp_dash_results;
   735     gchar *inword;
   736     QuoteClass qc;
   737     lines=g_strsplit(etext,"\n",0);
   738     if (lines[0])
   739 	/* If there's at least one line, we might have UNIX-style terminators */
   740 	results.unix_lineends=TRUE;
   741     for (j=0;lines[j];j++)
   742     {
   743 	lbytes=strlen(lines[j]);
   744 	if (lbytes>0 && lines[j][lbytes-1]=='\r')
   745 	{
   746 	    results.unix_lineends=FALSE;
   747 	    do
   748 	    {
   749 		lines[j][--lbytes]='\0';
   750 	    } while (lbytes>0 && lines[j][lbytes-1]=='\r');
   751 	}
   752 	llen=g_utf8_strlen(lines[j],lbytes);
   753 	linecnt++;
   754 	if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
   755 	  (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
   756 	{
   757 	    if (spline)
   758 		g_print("   --> Duplicate header?\n");
   759 	    spline=linecnt+1;   /* first line of non-header text, that is */
   760 	}
   761 	if (!strncmp(lines[j],"*** START",9) &&
   762 	  strstr(lines[j],"PROJECT GUTENBERG"))
   763 	{
   764 	    if (nspline)
   765 		g_print("   --> Duplicate header?\n");
   766 	    nspline=linecnt+1;   /* first line of non-header text, that is */
   767 	}
   768 	if (spline || nspline)
   769 	{
   770 	    lc_line=g_utf8_strdown(lines[j],lbytes);
   771 	    if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
   772 	    {
   773 		if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
   774 		{
   775 		    if (results.footerline)
   776 		    {
   777 			/* it's an old-form header - we can detect duplicates */
   778 			if (!nspline)
   779 			    g_print("   --> Duplicate footer?\n");
   780 		    }
   781 		    else
   782 			results.footerline=linecnt;
   783 		}
   784 	    }
   785 	    g_free(lc_line);
   786 	}
   787 	if (spline)
   788 	    results.firstline=spline;
   789 	if (nspline)
   790 	    results.firstline=nspline;  /* override with new */
   791 	if (results.footerline)
   792 	    continue;    /* don't count the boilerplate in the footer */
   793 	results.totlen+=llen;
   794 	for (s=lines[j];*s;s=g_utf8_next_char(s))
   795 	{
   796 	    if (g_utf8_get_char(s)>127)
   797 		results.binlen++;
   798 	    if (g_unichar_isalpha(g_utf8_get_char(s)))
   799 		results.alphalen++;
   800 	    if (s>lines[j])
   801 	    {
   802 		if (CHAR_IS_DQUOTE(g_utf8_get_char(s)))
   803 		    qc=QUOTE_CLASS(g_utf8_get_char(s));
   804 		else
   805 		    qc=INVALID_QUOTE;
   806 		if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) &&
   807 		  g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
   808 		    results.endquote_count++;
   809 	    }
   810 	}
   811 	if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
   812 	  lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
   813 	    results.shortline++;
   814 	if (lbytes>0 &&
   815 	  g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
   816 	    cnt_spacend++;
   817 	if (strstr(lines[j],".,"))
   818 	    results.dotcomma++;
   819 	/* only count ast lines for ignoring purposes where there is */
   820 	/* locase text on the line */
   821 	if (strchr(lines[j],'*'))
   822 	{
   823 	    for (s=lines[j];*s;s=g_utf8_next_char(s))
   824 		if (g_unichar_islower(g_utf8_get_char(s)))
   825 		    break;
   826 	    if (*s)
   827 		results.astline++;
   828 	}
   829 	if (strchr(lines[j],'/'))
   830 	    results.fslashline++;
   831 	if (lbytes>0)
   832 	{
   833 	    for (s=g_utf8_prev_char(lines[j]+lbytes);
   834 	      s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;
   835 	      s=g_utf8_prev_char(s))
   836 		;
   837 	    if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
   838 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')
   839 		results.hyphens++;
   840 	}
   841 	if (llen>LONGEST_PG_LINE)
   842 	    results.longline++;
   843 	if (llen>WAY_TOO_LONG)
   844 	    results.verylongline++;
   845 	if (strchr(lines[j],'<') && strchr(lines[j],'>'))
   846 	{
   847 	    i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
   848 	    if (i>0)
   849 		results.htmcount++;
   850 	    if (strstr(lines[j],"<i>"))
   851 		results.htmcount+=4; /* bonus marks! */
   852 	}
   853 	/* Check for spaced em-dashes */
   854 	memset(&tmp_dash_results,0,sizeof(tmp_dash_results));
   855 	count_dashes(lines[j],"--",&tmp_dash_results);
   856 	count_dashes(lines[j],"—",&tmp_dash_results);
   857 	if (tmp_dash_results.base)
   858 	    results.emdash.base++;
   859 	if (tmp_dash_results.non_PG_space)
   860 	    results.emdash.non_PG_space++;
   861 	if (tmp_dash_results.PG_space)
   862 	    results.emdash.PG_space++;
   863 	for (s=lines[j];*s;)
   864 	{
   865 	    inword=getaword(&s);
   866 	    if (!strcmp(inword,"hij") || !strcmp(inword,"niet")) 
   867 		results.Dutchcount++;
   868 	    if (!strcmp(inword,"dans") || !strcmp(inword,"avec")) 
   869 		results.Frenchcount++;
   870 	    if (!strcmp(inword,"0") || !strcmp(inword,"1")) 
   871 		results.standalone_digit++;
   872 	    g_free(inword);
   873 	}
   874 	/* Check for spaced dashes */
   875 	if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
   876 	    results.spacedash++;
   877 	lastblen=lastlen;
   878 	lastlen=llen;
   879 	laststart=lines[j][0];
   880     }
   881     g_strfreev(lines);
   882     return &results;
   883 }
   884 
   885 /*
   886  * report_first_pass:
   887  *
   888  * Make some snap decisions based on the first pass results.
   889  */
   890 struct warnings *report_first_pass(struct first_pass_results *results)
   891 {
   892     static struct warnings warnings={0};
   893     warnings.nocr=1;
   894     if (results->unix_lineends)
   895     {
   896 	warnings.nocr=0;
   897 	g_print("   --> No lines in this file have a CR. Not reporting them. "
   898 	  "Project Gutenberg requires that all lineends be CR-LF.\n");
   899     }
   900     if (cnt_spacend>0)
   901 	g_print("   --> %ld lines in this file have white space at end\n",
   902 	  cnt_spacend);
   903     warnings.dotcomma=1;
   904     if (results->dotcomma>5)
   905     {
   906 	warnings.dotcomma=0;
   907 	g_print("   --> %ld lines in this file contain '.,'. "
   908 	  "Not reporting them.\n",results->dotcomma);
   909     }
   910     /*
   911      * If more than 50 lines, or one-tenth, are short,
   912      * don't bother reporting them.
   913      */
   914     warnings.shortline=1;
   915     if (results->shortline>50 || results->shortline*10>linecnt)
   916     {
   917 	warnings.shortline=0;
   918 	g_print("   --> %ld lines in this file are short. "
   919 	  "Not reporting short lines.\n",results->shortline);
   920     }
   921     /*
   922      * If more than 50 lines, or one-tenth, are long,
   923      * don't bother reporting them.
   924      */
   925     warnings.longline=1;
   926     if (results->longline>50 || results->longline*10>linecnt)
   927     {
   928 	warnings.longline=0;
   929 	g_print("   --> %ld lines in this file are long. "
   930 	  "Not reporting long lines.\n",results->longline);
   931     }
   932     /* If more than 10 lines contain asterisks, don't bother reporting them. */
   933     warnings.ast=1;
   934     if (results->astline>10)
   935     {
   936 	warnings.ast=0;
   937 	g_print("   --> %ld lines in this file contain asterisks. "
   938 	  "Not reporting them.\n",results->astline);
   939     }
   940     /*
   941      * If more than 10 lines contain forward slashes,
   942      * don't bother reporting them.
   943      */
   944     warnings.fslash=1;
   945     if (results->fslashline>10)
   946     {
   947 	warnings.fslash=0;
   948 	g_print("   --> %ld lines in this file contain forward slashes. "
   949 	  "Not reporting them.\n",results->fslashline);
   950     }
   951     /*
   952      * If more than 20 lines contain unpunctuated endquotes,
   953      * don't bother reporting them.
   954      */
   955     warnings.endquote=1;
   956     if (results->endquote_count>20)
   957     {
   958 	warnings.endquote=0;
   959 	g_print("   --> %ld lines in this file contain unpunctuated endquotes. "
   960 	  "Not reporting them.\n",results->endquote_count);
   961     }
   962     /*
   963      * If more than 15 lines contain standalone digits,
   964      * don't bother reporting them.
   965      */
   966     warnings.digit=1;
   967     if (results->standalone_digit>10)
   968     {
   969 	warnings.digit=0;
   970 	g_print("   --> %ld lines in this file contain standalone 0s and 1s. "
   971 	  "Not reporting them.\n",results->standalone_digit);
   972     }
   973     /*
   974      * If more than 20 lines contain hyphens at end,
   975      * don't bother reporting them.
   976      */
   977     warnings.hyphen=1;
   978     if (results->hyphens>20)
   979     {
   980 	warnings.hyphen=0;
   981 	g_print("   --> %ld lines in this file have hyphens at end. "
   982 	  "Not reporting them.\n",results->hyphens);
   983     }
   984     if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
   985     {
   986 	g_print("   --> Looks like this is HTML. Switching HTML mode ON.\n");
   987 	pswit[MARKUP_SWITCH]=1;
   988     }
   989     if (results->verylongline>0)
   990 	g_print("   --> %ld lines in this file are VERY long!\n",
   991 	  results->verylongline);
   992     /*
   993      * If there are more non-PG spaced dashes than PG em-dashes,
   994      * assume it's deliberate.
   995      * Current PG guidelines say don't use them, but older texts do,
   996      * and some people insist on them whatever the guidelines say.
   997      */
   998     warnings.dash=1;
   999     if (results->spacedash+results->emdash.non_PG_space>
  1000       results->emdash.PG_space)
  1001     {
  1002 	warnings.dash=0;
  1003 	g_print("   --> There are %ld spaced dashes and em-dashes. "
  1004 	  "Not reporting them.\n",
  1005 	  results->spacedash+results->emdash.non_PG_space);
  1006     }
  1007     /* If more than a quarter of characters are hi-bit, bug out. */
  1008     warnings.bin=1;
  1009     if (results->binlen*4>results->totlen)
  1010     {
  1011 	g_print("   --> This file does not appear to be ASCII. "
  1012 	  "Terminating. Best of luck with it!\n");
  1013 	exit(1);
  1014     }
  1015     if (results->alphalen*4<results->totlen)
  1016     {
  1017 	g_print("   --> This file does not appear to be text. "
  1018 	  "Terminating. Best of luck with it!\n");
  1019 	exit(1);
  1020     }
  1021     if (results->binlen*100>results->totlen || results->binlen>100)
  1022     {
  1023 	g_print("   --> There are a lot of foreign letters here. "
  1024 	  "Not reporting them.\n");
  1025 	warnings.bin=0;
  1026     }
  1027     warnings.isDutch=FALSE;
  1028     if (results->Dutchcount>50)
  1029     {
  1030 	warnings.isDutch=TRUE;
  1031 	g_print("   --> This looks like Dutch - "
  1032 	  "switching off dashes and warnings for 's Middags case.\n");
  1033     }
  1034     warnings.isFrench=FALSE;
  1035     if (results->Frenchcount>50)
  1036     {
  1037 	warnings.isFrench=TRUE;
  1038 	g_print("   --> This looks like French - "
  1039 	  "switching off some doublepunct.\n");
  1040     }
  1041     if (results->firstline && results->footerline)
  1042 	g_print("    The PG header and footer appear to be already on.\n");
  1043     else
  1044     {
  1045 	if (results->firstline)
  1046 	    g_print("    The PG header is on - no footer.\n");
  1047 	if (results->footerline)
  1048 	    g_print("    The PG footer is on - no header.\n");
  1049     }
  1050     g_print("\n");
  1051     if (pswit[VERBOSE_SWITCH])
  1052     {
  1053 	warnings.bin=1;
  1054 	warnings.shortline=1;
  1055 	warnings.dotcomma=1;
  1056 	warnings.longline=1;
  1057 	warnings.dash=1;
  1058 	warnings.digit=1;
  1059 	warnings.ast=1;
  1060 	warnings.fslash=1;
  1061 	warnings.hyphen=1;
  1062 	warnings.endquote=1;
  1063 	g_print("   *** Verbose output is ON -- you asked for it! ***\n");
  1064     }
  1065     if (warnings.isDutch)
  1066 	warnings.dash=0;
  1067     if (results->footerline>0 && results->firstline>0 &&
  1068       results->footerline>results->firstline &&
  1069       results->footerline-results->firstline<100)
  1070     {
  1071 	g_print("   --> I don't really know where this text starts. \n");
  1072 	g_print("       There are no reference points.\n");
  1073 	g_print("       I'm going to have to report the header and footer "
  1074 	  "as well.\n");
  1075 	results->firstline=0;
  1076     }
  1077     return &warnings;
  1078 }
  1079 
  1080 /*
  1081  * analyse_quotes:
  1082  *
  1083  * Look along the line, accumulate the count of quotes, and see
  1084  * if this is an empty line - i.e. a line with nothing on it
  1085  * but spaces.
  1086  * If line has just spaces, period, * and/or - on it, don't
  1087  * count it, since empty lines with asterisks or dashes to
  1088  * separate sections are common.
  1089  *
  1090  * Returns: TRUE if the line is empty.
  1091  */
  1092 gboolean analyse_quotes(const char *aline,struct counters *counters)
  1093 {
  1094     int guessquote=0;
  1095     /* assume the line is empty until proven otherwise */
  1096     gboolean isemptyline=TRUE;
  1097     const char *s=aline,*sprev,*snext;
  1098     gunichar c;
  1099     sprev=NULL;
  1100     GError *tmp_err=NULL;
  1101     while (*s)
  1102     {
  1103 	snext=g_utf8_next_char(s);
  1104 	c=g_utf8_get_char(s);
  1105 	if (CHAR_IS_DQUOTE(c))
  1106 	    (void)count_quote(counters,c,QUOTE_CLASS(c),&tmp_err);
  1107 	else if (CHAR_IS_SQUOTE(c) && pswit[SQUOTE_SWITCH])
  1108 	{
  1109 	    if (s==aline)
  1110 	    {
  1111 		/*
  1112 		 * At start of line, it can only be a quotation mark.
  1113 		 * Hardcode a very common exception!
  1114 		 */
  1115 		if (!g_str_has_prefix(snext,"tis") &&
  1116 		  !g_str_has_prefix(snext,"Tis"))
  1117 		    (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
  1118 	    }
  1119 	    else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
  1120 	      g_unichar_isalpha(g_utf8_get_char(snext)))
  1121 		/* Do nothing! it's definitely an apostrophe, not a quote */
  1122 		;
  1123 	    /* it's outside a word - let's check it out */
  1124 	    else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||
  1125 	      g_unichar_isalpha(g_utf8_get_char(snext)))
  1126 	    {
  1127 		/* certainly looks like a quotation mark */
  1128 		if (!g_str_has_prefix(snext,"tis") &&
  1129 		  !g_str_has_prefix(snext,"Tis"))
  1130 		    /* hardcode a very common exception! */
  1131 		{
  1132 		    if (strchr(".?!,;:",g_utf8_get_char(sprev)))
  1133 			(void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
  1134 		    else
  1135 			(void)count_quote(counters,c,OPENING_QUOTE,&tmp_err);
  1136 		}
  1137 	    }
  1138 	    else
  1139 	    {
  1140 		/* now - is it a quotation mark? */
  1141 		guessquote=0;   /* accumulate clues */
  1142 		if (g_unichar_isalpha(g_utf8_get_char(sprev)))
  1143 		{
  1144 		    /* it follows a letter - could be either */
  1145 		    guessquote++;
  1146 		    if (g_utf8_get_char(sprev)=='s')
  1147 		    {
  1148 			/* looks like a plural apostrophe */
  1149 			guessquote-=3;
  1150 			if (g_utf8_get_char(snext)==CHAR_SPACE)
  1151 			    /* bonus marks! */
  1152 			    guessquote-=2;
  1153 		    }
  1154 		    if (innermost_quote_matches(counters,c))
  1155 			/*
  1156 			 * Give it the benefit of some doubt,
  1157 			 * if a squote is already open.
  1158 			 */
  1159 			guessquote++;
  1160 		    else
  1161 			guessquote--;
  1162 		    if (guessquote>=0)
  1163 			(void)count_quote(counters,c,CLOSING_QUOTE,&tmp_err);
  1164 		}
  1165 		else
  1166 		    /* no adjacent letter - it must be a quote of some kind */
  1167 		    (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
  1168 	    }
  1169 	}
  1170 	if (tmp_err)
  1171 	{
  1172 	    if (pswit[ECHO_SWITCH])
  1173 		g_print("\n%s\n",aline);
  1174 	    if (!pswit[OVERVIEW_SWITCH])
  1175 		g_print("    Line %ld column %ld - %s\n",
  1176 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1,tmp_err->message);
  1177 	    g_clear_error(&tmp_err);
  1178 	}
  1179 	if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
  1180 	  c!='\r' && c!='\n')
  1181 	    isemptyline=FALSE;  /* ignore lines like  *  *  *  as spacers */
  1182 	if (c==CHAR_UNDERSCORE)
  1183 	    counters->c_unders++;
  1184 	if (c==CHAR_OPEN_SBRACK)
  1185 	{
  1186 	    if (!matching_difference(counters,COUNTER_ILLUSTRATION) &&
  1187 	      !matching_difference(counters,c) && s==aline &&
  1188 	      g_str_has_prefix(s,"[Illustration:"))
  1189 		increment_matching(counters,COUNTER_ILLUSTRATION,TRUE);
  1190 	    else
  1191 		increment_matching(counters,c,TRUE);
  1192 	}
  1193 	else if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK)
  1194 	    increment_matching(counters,c,TRUE);
  1195 	if (c==CHAR_CLOSE_SBRACK)
  1196 	{
  1197 	    if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) &&
  1198 	      !matching_difference(counters,c) && !*snext)
  1199 		increment_matching(counters,COUNTER_ILLUSTRATION,FALSE);
  1200 	    else
  1201 		increment_matching(counters,c,FALSE);
  1202 	}
  1203 	else if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK)
  1204 	    increment_matching(counters,c,FALSE);
  1205 	sprev=s;
  1206 	s=snext;
  1207     }
  1208     return isemptyline;
  1209 }
  1210 
  1211 /*
  1212  * check_for_control_characters:
  1213  *
  1214  * Check for invalid or questionable characters in the line
  1215  * Anything above 127 is invalid for plain ASCII, and
  1216  * non-printable control characters should also be flagged.
  1217  * Tabs should generally not be there.
  1218  */
  1219 void check_for_control_characters(const char *aline)
  1220 {
  1221     gunichar c;
  1222     const char *s;
  1223     for (s=aline;*s;s=g_utf8_next_char(s))
  1224     {
  1225 	c=g_utf8_get_char(s);
  1226 	if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
  1227 	{
  1228 	    if (pswit[ECHO_SWITCH])
  1229 		g_print("\n%s\n",aline);
  1230 	    if (!pswit[OVERVIEW_SWITCH])
  1231 		g_print("    Line %ld column %ld - Control character %u\n",
  1232 		  linecnt,g_utf8_pointer_to_offset(s,aline)+1,c);
  1233 	    else
  1234 		cnt_bin++;
  1235 	}
  1236     }
  1237 }
  1238 
  1239 /*
  1240  * check_for_odd_characters:
  1241  *
  1242  * Check for binary and other odd characters.
  1243  */
  1244 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
  1245   gboolean isemptyline)
  1246 {
  1247     /* Don't repeat multiple warnings on one line. */
  1248     gboolean eNon_A=FALSE,eTab=FALSE,eTilde=FALSE;
  1249     gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
  1250     const char *s;
  1251     gunichar c;
  1252     for (s=aline;*s;s=g_utf8_next_char(s))
  1253     {
  1254 	c=g_utf8_get_char(s);
  1255 	if (!eNon_A && (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
  1256 	{
  1257 	    if (pswit[ECHO_SWITCH])
  1258 		g_print("\n%s\n",aline);
  1259 	    if (!pswit[OVERVIEW_SWITCH])
  1260 		if (c>127 && c<160 || c>255)
  1261 		    g_print("    Line %ld column %ld - "
  1262 		      "Non-ISO-8859 character %u\n",
  1263 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1264 		else
  1265 		    g_print("    Line %ld column %ld - "
  1266 		      "Non-ASCII character %u\n",
  1267 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1268 	    else
  1269 		cnt_bin++;
  1270 	    eNon_A=TRUE;
  1271 	}
  1272 	if (!eTab && c==CHAR_TAB)
  1273 	{
  1274 	    if (pswit[ECHO_SWITCH])
  1275 		g_print("\n%s\n",aline);
  1276 	    if (!pswit[OVERVIEW_SWITCH])
  1277 		g_print("    Line %ld column %ld - Tab character?\n",
  1278 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1279 	    else
  1280 		cnt_odd++;
  1281 	    eTab=TRUE;
  1282 	}
  1283 	if (!eTilde && c==CHAR_TILDE)
  1284 	{
  1285 	    /*
  1286 	     * Often used by OCR software to indicate an
  1287 	     * unrecognizable character.
  1288 	     */
  1289 	    if (pswit[ECHO_SWITCH])
  1290 		g_print("\n%s\n",aline);
  1291 	    if (!pswit[OVERVIEW_SWITCH])
  1292 		g_print("    Line %ld column %ld - Tilde character?\n",
  1293 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1294 	    else
  1295 		cnt_odd++;
  1296 	    eTilde=TRUE;
  1297 	}
  1298 	if (!eCarat && c==CHAR_CARAT)
  1299 	{  
  1300 	    if (pswit[ECHO_SWITCH])
  1301 		g_print("\n%s\n",aline);
  1302 	    if (!pswit[OVERVIEW_SWITCH])
  1303 		g_print("    Line %ld column %ld - Carat character?\n",
  1304 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1305 	    else
  1306 		cnt_odd++;
  1307 	    eCarat=TRUE;
  1308 	}
  1309 	if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
  1310 	{  
  1311 	    if (pswit[ECHO_SWITCH])
  1312 		g_print("\n%s\n",aline);
  1313 	    if (!pswit[OVERVIEW_SWITCH])
  1314 		g_print("    Line %ld column %ld - Forward slash?\n",
  1315 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1316 	    else
  1317 		cnt_odd++;
  1318 	    eFSlash=TRUE;
  1319 	}
  1320 	/*
  1321 	 * Report asterisks only in paranoid mode,
  1322 	 * since they're often deliberate.
  1323 	 */
  1324 	if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
  1325 	  c==CHAR_ASTERISK)
  1326 	{
  1327 	    if (pswit[ECHO_SWITCH])
  1328 		g_print("\n%s\n",aline);
  1329 	    if (!pswit[OVERVIEW_SWITCH])
  1330 		g_print("    Line %ld column %ld - Asterisk?\n",
  1331 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1332 	    else
  1333 		cnt_odd++;
  1334 	    eAst=TRUE;
  1335 	}
  1336     }
  1337 }
  1338 
  1339 /*
  1340  * check_for_long_line:
  1341  *
  1342  * Check for line too long.
  1343  */
  1344 void check_for_long_line(const char *aline)
  1345 {
  1346     if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
  1347     {
  1348 	if (pswit[ECHO_SWITCH])
  1349 	    g_print("\n%s\n",aline);
  1350 	if (!pswit[OVERVIEW_SWITCH])
  1351 	    g_print("    Line %ld column %ld - Long line %ld\n",
  1352 	      linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
  1353 	else
  1354 	    cnt_long++;
  1355     }
  1356 }
  1357 
  1358 /*
  1359  * check_for_short_line:
  1360  *
  1361  * Check for line too short.
  1362  *
  1363  * This one is a bit trickier to implement: we don't want to
  1364  * flag the last line of a paragraph for being short, so we
  1365  * have to wait until we know that our current line is a
  1366  * "normal" line, then report the _previous_ line if it was too
  1367  * short. We also don't want to report indented lines like
  1368  * chapter heads or formatted quotations. We therefore keep
  1369  * last->len as the length of the last line examined, and
  1370  * last->blen as the length of the last but one, and try to
  1371  * suppress unnecessary warnings by checking that both were of
  1372  * "normal" length. We keep the first character of the last
  1373  * line in last->start, and if it was a space, we assume that
  1374  * the formatting is deliberate. I can't figure out a way to
  1375  * distinguish something like a quoted verse left-aligned or
  1376  * the header or footer of a letter from a paragraph of short
  1377  * lines - maybe if I examined the whole paragraph, and if the
  1378  * para has less than, say, 8 lines and if all lines are short,
  1379  * then just assume it's OK? Need to look at some texts to see
  1380  * how often a formula like this would get the right result.
  1381  */
  1382 void check_for_short_line(const char *aline,const struct line_properties *last)
  1383 {
  1384     if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
  1385       last->len<SHORTEST_PG_LINE && last->blen>1 &&
  1386       last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
  1387     {
  1388 	if (pswit[ECHO_SWITCH])
  1389 	    g_print("\n%s\n",prevline);
  1390 	if (!pswit[OVERVIEW_SWITCH])
  1391 	    g_print("    Line %ld column %ld - Short line %ld?\n",
  1392 	      linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
  1393 	else
  1394 	    cnt_short++;
  1395     }
  1396 }
  1397 
  1398 /*
  1399  * check_for_starting_punctuation:
  1400  *
  1401  * Look for punctuation other than full ellipses at start of line.
  1402  */
  1403 void check_for_starting_punctuation(const char *aline)
  1404 {
  1405     if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
  1406       !g_str_has_prefix(aline,". . ."))
  1407     {
  1408 	if (pswit[ECHO_SWITCH])
  1409 	    g_print("\n%s\n",aline);
  1410 	if (!pswit[OVERVIEW_SWITCH])
  1411 	    g_print("    Line %ld column 1 - Begins with punctuation?\n",
  1412 	      linecnt);
  1413 	else
  1414 	    cnt_punct++;
  1415     }
  1416 }
  1417 
  1418 /*
  1419  * str_emdash:
  1420  *
  1421  * Find the first em-dash, return a pointer to it and set <next> to the
  1422  * character following the dash.
  1423  */
  1424 char *str_emdash(const char *s,const char **next)
  1425 {
  1426     const char *s1,*s2;
  1427     s1=strstr(s,"--");
  1428     s2=strstr(s,"—");
  1429     if (!s1)
  1430     {
  1431 	if (s2)
  1432 	    *next=g_utf8_next_char(s2);
  1433 	return (char *)s2;
  1434     }
  1435     else if (!s2)
  1436     {
  1437 	*next=g_utf8_next_char(g_utf8_next_char(s1));
  1438 	return (char *)s1;
  1439     }
  1440     else if (s1<s2)
  1441     {
  1442 	*next=g_utf8_next_char(g_utf8_next_char(s1));
  1443 	return (char *)s1;
  1444     }
  1445     else
  1446     {
  1447 	*next=g_utf8_next_char(s2);
  1448 	return (char *)s2;
  1449     }
  1450 }
  1451 
  1452 /*
  1453  * check_for_spaced_emdash:
  1454  *
  1455  * Check for spaced em-dashes.
  1456  *
  1457  * We must check _all_ occurrences of em-dashes on the line
  1458  * hence the loop - even if the first dash is OK
  1459  * there may be another that's wrong later on.
  1460  */
  1461 void check_for_spaced_emdash(const char *aline)
  1462 {
  1463     const char *s,*t,*next;
  1464     for (s=aline;t=str_emdash(s,&next);s=next)
  1465     {
  1466 	if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
  1467 	  g_utf8_get_char(next)==CHAR_SPACE)
  1468 	{
  1469 	    if (pswit[ECHO_SWITCH])
  1470 		g_print("\n%s\n",aline);
  1471 	    if (!pswit[OVERVIEW_SWITCH])
  1472 		g_print("    Line %ld column %ld - Spaced em-dash?\n",
  1473 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  1474 	    else
  1475 		cnt_dash++;
  1476 	}
  1477     }
  1478 }
  1479 
  1480 /*
  1481  * check_for_spaced_dash:
  1482  *
  1483  * Check for spaced dashes.
  1484  */
  1485 void check_for_spaced_dash(const char *aline)
  1486 {
  1487     const char *s;
  1488     if ((s=strstr(aline," -")))
  1489     {
  1490 	if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
  1491 	{
  1492 	    if (pswit[ECHO_SWITCH])
  1493 		g_print("\n%s\n",aline);
  1494 	    if (!pswit[OVERVIEW_SWITCH])
  1495 		g_print("    Line %ld column %ld - Spaced dash?\n",
  1496 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1497 	    else
  1498 		cnt_dash++;
  1499 	}
  1500     }
  1501     else if ((s=strstr(aline,"- ")))
  1502     {
  1503 	if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
  1504 	{
  1505 	    if (pswit[ECHO_SWITCH])
  1506 		g_print("\n%s\n",aline);
  1507 	    if (!pswit[OVERVIEW_SWITCH])
  1508 		g_print("    Line %ld column %ld - Spaced dash?\n",
  1509 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1510 	    else
  1511 		cnt_dash++;
  1512 	}
  1513     }
  1514 }
  1515 
  1516 /*
  1517  * check_for_unmarked_paragraphs:
  1518  *
  1519  * Check for unmarked paragraphs indicated by separate speakers.
  1520  *
  1521  * May well be false positive:
  1522  * "Bravo!" "Wonderful!" called the crowd.
  1523  * but useful all the same.
  1524  */
  1525 void check_for_unmarked_paragraphs(const char *aline)
  1526 {
  1527     const char *s;
  1528     s=strstr(aline,"\"  \"");
  1529     if (!s)
  1530 	s=strstr(aline,"\" \"");
  1531     if (s)
  1532     {
  1533 	if (pswit[ECHO_SWITCH])
  1534 	    g_print("\n%s\n",aline);
  1535 	if (!pswit[OVERVIEW_SWITCH])
  1536 	    g_print("    Line %ld column %ld - "
  1537 	      "Query missing paragraph break?\n",
  1538 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1539 	else
  1540 	    cnt_punct++;
  1541     }
  1542 }
  1543 
  1544 /*
  1545  * check_for_jeebies:
  1546  *
  1547  * Check for "to he" and other easy h/b errors.
  1548  *
  1549  * This is a very inadequate effort on the h/b problem,
  1550  * but the phrase "to he" is always an error, whereas "to
  1551  * be" is quite common.
  1552  * Similarly, '"Quiet!", be said.' is a non-be error
  1553  * "to he" is _not_ always an error!:
  1554  *       "Where they went to he couldn't say."
  1555  * Another false positive:
  1556  *       What would "Cinderella" be without the . . .
  1557  * and another: "If he wants to he can see for himself."
  1558  */
  1559 void check_for_jeebies(const char *aline)
  1560 {
  1561     const char *s;
  1562     s=strstr(aline," be could ");
  1563     if (!s)
  1564 	s=strstr(aline," be would ");
  1565     if (!s)
  1566 	s=strstr(aline," was be ");
  1567     if (!s)
  1568 	s=strstr(aline," be is ");
  1569     if (!s)
  1570 	s=strstr(aline," is be ");
  1571     if (!s)
  1572 	s=strstr(aline,"\", be ");
  1573     if (!s)
  1574 	s=strstr(aline,"\" be ");
  1575     if (!s)
  1576 	s=strstr(aline,"\" be ");
  1577     if (!s)
  1578 	s=strstr(aline," to he ");
  1579     if (s)
  1580     {
  1581 	if (pswit[ECHO_SWITCH])
  1582 	    g_print("\n%s\n",aline);
  1583 	if (!pswit[OVERVIEW_SWITCH])
  1584 	    g_print("    Line %ld column %ld - Query he/be error?\n",
  1585 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1586 	else
  1587 	    cnt_word++;
  1588     }
  1589     s=strstr(aline," the had ");
  1590     if (!s)
  1591 	s=strstr(aline," a had ");
  1592     if (!s)
  1593 	s=strstr(aline," they bad ");
  1594     if (!s)
  1595 	s=strstr(aline," she bad ");
  1596     if (!s)
  1597 	s=strstr(aline," he bad ");
  1598     if (!s)
  1599 	s=strstr(aline," you bad ");
  1600     if (!s)
  1601 	s=strstr(aline," i bad ");
  1602     if (s)
  1603     {
  1604 	if (pswit[ECHO_SWITCH])
  1605 	    g_print("\n%s\n",aline);
  1606 	if (!pswit[OVERVIEW_SWITCH])
  1607 	    g_print("    Line %ld column %ld - Query had/bad error?\n",
  1608 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1609 	else
  1610 	    cnt_word++;
  1611     }
  1612     s=strstr(aline,"; hut ");
  1613     if (!s)
  1614 	s=strstr(aline,", hut ");
  1615     if (s)
  1616     {
  1617 	if (pswit[ECHO_SWITCH])
  1618 	    g_print("\n%s\n",aline);
  1619 	if (!pswit[OVERVIEW_SWITCH])
  1620 	    g_print("    Line %ld column %ld - Query hut/but error?\n",
  1621 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1622 	else
  1623 	    cnt_word++;
  1624     }
  1625 }
  1626 
  1627 /*
  1628  * check_for_mta_from:
  1629  *
  1630  * Special case - angled bracket in front of "From" placed there by an
  1631  * MTA when sending an e-mail.
  1632  */
  1633 void check_for_mta_from(const char *aline)
  1634 {
  1635     const char *s;
  1636     s=strstr(aline,">From");
  1637     if (s)
  1638     {
  1639 	if (pswit[ECHO_SWITCH])
  1640 	    g_print("\n%s\n",aline);
  1641 	if (!pswit[OVERVIEW_SWITCH])
  1642 	    g_print("    Line %ld column %ld - "
  1643 	      "Query angled bracket with From\n",
  1644 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1645 	else
  1646 	    cnt_punct++;
  1647     }
  1648 }
  1649 
  1650 /*
  1651  * check_for_orphan_character:
  1652  *
  1653  * Check for a single character line -
  1654  * often an overflow from bad wrapping.
  1655  */
  1656 void check_for_orphan_character(const char *aline)
  1657 {
  1658     gunichar c;
  1659     c=g_utf8_get_char(aline);
  1660     if (c && !*g_utf8_next_char(aline))
  1661     {
  1662 	if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
  1663 	    ; /* Nothing - ignore numerals alone on a line. */
  1664 	else
  1665 	{
  1666 	    if (pswit[ECHO_SWITCH])
  1667 		g_print("\n%s\n",aline);
  1668 	    if (!pswit[OVERVIEW_SWITCH])
  1669 		g_print("    Line %ld column 1 - Query single character line\n",
  1670 		  linecnt);
  1671 	    else
  1672 		cnt_punct++;
  1673 	}
  1674     }
  1675 }
  1676 
  1677 /*
  1678  * check_for_pling_scanno:
  1679  *
  1680  * Check for I" - often should be !
  1681  */
  1682 void check_for_pling_scanno(const char *aline)
  1683 {
  1684     const char *s;
  1685     s=strstr(aline," I\"");
  1686     if (s)
  1687     {
  1688 	if (pswit[ECHO_SWITCH])
  1689 	    g_print("\n%s\n",aline);
  1690 	if (!pswit[OVERVIEW_SWITCH])
  1691 	    g_print("    Line %ld column %ld - Query I=exclamation mark?\n",
  1692 	      linecnt,g_utf8_pointer_to_offset(aline,s));
  1693 	else
  1694 	    cnt_punct++;
  1695     }
  1696 }
  1697 
  1698 /*
  1699  * check_for_extra_period:
  1700  *
  1701  * Check for period without a capital letter. Cut-down from gutspell.
  1702  * Only works when it happens on a single line.
  1703  */
  1704 void check_for_extra_period(const char *aline,const struct warnings *warnings)
  1705 {
  1706     const char *s,*t,*s1,*sprev;
  1707     int i;
  1708     gsize len;
  1709     gboolean istypo;
  1710     gchar *testword;
  1711     gunichar c,nc,pc,*decomposition;
  1712     if (pswit[PARANOID_SWITCH])
  1713     {
  1714 	for (t=aline;t=strstr(t,". ");)
  1715 	{
  1716 	    if (t==aline)
  1717 	    {
  1718 		t=g_utf8_next_char(t);
  1719 		/* start of line punctuation is handled elsewhere */
  1720 		continue;
  1721 	    }
  1722 	    if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
  1723 	    {
  1724 		t=g_utf8_next_char(t);
  1725 		continue;
  1726 	    }
  1727 	    if (warnings->isDutch)
  1728 	    {
  1729 		/* For Frank & Jeroen -- 's Middags case */
  1730 		gunichar c2,c3,c4,c5;
  1731 		c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
  1732 		c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
  1733 		c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
  1734 		c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
  1735 		if (CHAR_IS_APOSTROPHE(c2) &&
  1736 		  g_unichar_islower(c3) && c4==CHAR_SPACE &&
  1737 		  g_unichar_isupper(c5))
  1738 		{
  1739 		    t=g_utf8_next_char(t);
  1740 		    continue;
  1741 		}
  1742 	    }
  1743 	    s1=g_utf8_next_char(g_utf8_next_char(t));
  1744 	    while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
  1745 	      !g_unichar_isdigit(g_utf8_get_char(s1)))
  1746 		s1=g_utf8_next_char(s1);
  1747 	    if (g_unichar_islower(g_utf8_get_char(s1)))
  1748 	    {
  1749 		/* we have something to investigate */
  1750 		istypo=TRUE;
  1751 		/* so let's go back and find out */
  1752 		nc=g_utf8_get_char(t);
  1753 		s1=g_utf8_prev_char(t);
  1754 		c=g_utf8_get_char(s1);
  1755 		sprev=g_utf8_prev_char(s1);
  1756 		pc=g_utf8_get_char(sprev);
  1757 		while (s1>=aline &&
  1758 		  (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||
  1759 		  g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&
  1760 		  g_unichar_isalpha(nc)))
  1761 		{
  1762 		    nc=c;
  1763 		    s1=sprev;
  1764 		    c=pc;
  1765 		    sprev=g_utf8_prev_char(s1);
  1766 		    pc=g_utf8_get_char(sprev);
  1767 		}
  1768 		s1=g_utf8_next_char(s1);
  1769 		s=strchr(s1,'.');
  1770 		if (s)
  1771 		    testword=g_strndup(s1,s-s1);
  1772 		else
  1773 		    testword=g_strdup(s1);
  1774 		for (i=0;*abbrev[i];i++)
  1775 		    if (!strcmp(testword,abbrev[i]))
  1776 			istypo=FALSE;
  1777 		if (g_unichar_isdigit(g_utf8_get_char(testword)))
  1778 		    istypo=FALSE;
  1779 		if (!*g_utf8_next_char(testword))
  1780 		    istypo=FALSE;
  1781 		if (isroman(testword))
  1782 		    istypo=FALSE;
  1783 		if (istypo)
  1784 		{
  1785 		    istypo=FALSE;
  1786 		    for (s=testword;*s;s=g_utf8_next_char(s))
  1787 		    {
  1788 			decomposition=g_unicode_canonical_decomposition(
  1789 			  g_utf8_get_char(s),&len);
  1790 			if (g_utf8_strchr("aeiou",-1,decomposition[0]))
  1791 			    istypo=TRUE;
  1792 			g_free(decomposition);
  1793 		    }
  1794 		}
  1795 		if (istypo &&
  1796 		  (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
  1797 		{
  1798 		    g_tree_insert(qperiod,g_strdup(testword),
  1799 		      GINT_TO_POINTER(1));
  1800 		    if (pswit[ECHO_SWITCH])
  1801 			g_print("\n%s\n",aline);
  1802 		    if (!pswit[OVERVIEW_SWITCH])
  1803 			g_print("    Line %ld column %ld - Extra period?\n",
  1804 			  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  1805 		    else
  1806 			cnt_punct++;
  1807 		}
  1808 		g_free(testword);
  1809 	    }
  1810 	    t=g_utf8_next_char(t);
  1811 	}
  1812     }
  1813 }
  1814 
  1815 /*
  1816  * check_for_following_punctuation:
  1817  *
  1818  * Check for words usually not followed by punctuation.
  1819  */
  1820 void check_for_following_punctuation(const char *aline)
  1821 {
  1822     int i;
  1823     const char *s,*wordstart;
  1824     gunichar c;
  1825     gchar *inword,*t;
  1826     if (pswit[TYPO_SWITCH])
  1827     {
  1828 	for (s=aline;*s;)
  1829 	{
  1830 	    wordstart=s;
  1831 	    t=getaword(&s);
  1832 	    if (!*t)
  1833 	    {
  1834 		g_free(t);
  1835 		continue;
  1836 	    }
  1837 	    inword=g_utf8_strdown(t,-1);
  1838 	    g_free(t);
  1839 	    for (i=0;*nocomma[i];i++)
  1840 		if (!strcmp(inword,nocomma[i]))
  1841 		{
  1842 		    c=g_utf8_get_char(s);
  1843 		    if (c==',' || c==';' || c==':')
  1844 		    {
  1845 			if (pswit[ECHO_SWITCH])
  1846 			    g_print("\n%s\n",aline);
  1847 			if (!pswit[OVERVIEW_SWITCH])
  1848 			    g_print("    Line %ld column %ld - "
  1849 			      "Query punctuation after %s?\n",
  1850 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,
  1851 			      inword);
  1852 			else
  1853 			    cnt_punct++;
  1854 		    }
  1855 		}
  1856 	    for (i=0;*noperiod[i];i++)
  1857 		if (!strcmp(inword,noperiod[i]))
  1858 		{
  1859 		    c=g_utf8_get_char(s);
  1860 		    if (c=='.' || c=='!')
  1861 		    {
  1862 			if (pswit[ECHO_SWITCH])
  1863 			    g_print("\n%s\n",aline);
  1864 			if (!pswit[OVERVIEW_SWITCH])
  1865 			    g_print("    Line %ld column %ld - "
  1866 			      "Query punctuation after %s?\n",
  1867 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,
  1868 			      inword);
  1869 			else
  1870 			    cnt_punct++;
  1871 		    }
  1872 		}
  1873 	    g_free(inword);
  1874 	}
  1875     }
  1876 }
  1877 
  1878 /*
  1879  * check_for_typos:
  1880  *
  1881  * Check for commonly mistyped words,
  1882  * and digits like 0 for O in a word.
  1883  */
  1884 void check_for_typos(const char *aline,struct warnings *warnings)
  1885 {
  1886     const char *s,*t,*nt,*wordstart;
  1887     gchar *inword;
  1888     gunichar *decomposition;
  1889     gchar *testword;
  1890     int i,vowel,consonant,*dupcnt;
  1891     gboolean isdup,istypo,alower;
  1892     gunichar c,pc;
  1893     long offset,len;
  1894     gsize decomposition_len;
  1895     for (s=aline;*s;)
  1896     {
  1897 	wordstart=s;
  1898 	inword=getaword(&s);
  1899 	if (!*inword)
  1900 	{
  1901 	    g_free(inword);
  1902 	    continue; /* don't bother with empty lines */
  1903 	}
  1904 	if (mixdigit(inword))
  1905 	{
  1906 	    if (pswit[ECHO_SWITCH])
  1907 		g_print("\n%s\n",aline);
  1908 	    if (!pswit[OVERVIEW_SWITCH])
  1909 		g_print("    Line %ld column %ld - Query digit in %s\n",
  1910 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
  1911 	    else
  1912 		cnt_word++;
  1913 	}
  1914 	/*
  1915 	 * Put the word through a series of tests for likely typos and OCR
  1916 	 * errors.
  1917 	 */
  1918 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
  1919 	{
  1920 	    istypo=FALSE;
  1921 	    alower=FALSE;
  1922 	    for (t=inword;*t;t=g_utf8_next_char(t))
  1923 	    {
  1924 		c=g_utf8_get_char(t);
  1925 		nt=g_utf8_next_char(t);
  1926 		/* lowercase for testing */
  1927 		if (g_unichar_islower(c))
  1928 		    alower=TRUE;
  1929 		if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
  1930 		{
  1931 		    /*
  1932 		     * We have an uppercase mid-word. However, there are
  1933 		     * common cases:
  1934 		     *   Mac and Mc like McGill
  1935 		     *   French contractions like l'Abbe
  1936 		     */
  1937 		    offset=g_utf8_pointer_to_offset(inword,t);
  1938 		    if (offset>0)
  1939 			pc=g_utf8_get_char(g_utf8_prev_char(t));
  1940 		    else
  1941 			pc='\0';
  1942 		    if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
  1943 		      offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
  1944 		      g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
  1945 		      CHAR_IS_APOSTROPHE(pc))
  1946 			; /* do nothing! */
  1947 		    else
  1948 			istypo=TRUE;
  1949 		}
  1950 	    }
  1951 	    testword=g_utf8_casefold(inword,-1);
  1952 	}
  1953 	if (pswit[TYPO_SWITCH])
  1954 	{
  1955 	    /*
  1956 	     * Check for certain unlikely two-letter combinations at word
  1957 	     * start and end.
  1958 	     */
  1959 	    len=g_utf8_strlen(testword,-1);
  1960 	    if (len>1)
  1961 	    {
  1962 		for (i=0;*nostart[i];i++)
  1963 		    if (g_str_has_prefix(testword,nostart[i]))
  1964 			istypo=TRUE;
  1965 		for (i=0;*noend[i];i++)
  1966 		    if (g_str_has_suffix(testword,noend[i]))
  1967 			istypo=TRUE;
  1968 	    }
  1969 	    /* ght is common, gbt never. Like that. */
  1970 	    if (strstr(testword,"cb"))
  1971 		istypo=TRUE;
  1972 	    if (strstr(testword,"gbt"))
  1973 		istypo=TRUE;
  1974 	    if (strstr(testword,"pbt"))
  1975 		istypo=TRUE;
  1976 	    if (strstr(testword,"tbs"))
  1977 		istypo=TRUE;
  1978 	    if (strstr(testword,"mrn"))
  1979 		istypo=TRUE;
  1980 	    if (strstr(testword,"ahle"))
  1981 		istypo=TRUE;
  1982 	    if (strstr(testword,"ihle"))
  1983 		istypo=TRUE;
  1984 	    /*
  1985 	     * "TBE" does happen - like HEARTBEAT - but uncommon.
  1986 	     * Also "TBI" - frostbite, outbid - but uncommon.
  1987 	     * Similarly "ii" like Hawaii, or Pompeii, and in Roman
  1988 	     * numerals, but "ii" is a common scanno.
  1989 	     */
  1990 	    if (strstr(testword,"tbi"))
  1991 		istypo=TRUE;
  1992 	    if (strstr(testword,"tbe"))
  1993 		istypo=TRUE;
  1994 	    if (strstr(testword,"ii"))
  1995 		istypo=TRUE;
  1996 	    /*
  1997 	     * Check for no vowels or no consonants.
  1998 	     * If none, flag a typo.
  1999 	     */
  2000 	    if (!istypo && len>1)
  2001 	    {
  2002 		vowel=consonant=0;
  2003 		for (t=testword;*t;t=g_utf8_next_char(t))
  2004 		{
  2005 		    c=g_utf8_get_char(t);
  2006 		    decomposition=
  2007 		      g_unicode_canonical_decomposition(c,&decomposition_len);
  2008 		    if (c=='y' || g_unichar_isdigit(c))
  2009 		    {
  2010 			/* Yah, this is loose. */
  2011 			vowel++;
  2012 			consonant++;
  2013 		    }
  2014 		    else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
  2015 			vowel++;
  2016 		    else
  2017 			consonant++;
  2018 		    g_free(decomposition);
  2019 		}
  2020 		if (!vowel || !consonant)
  2021 		    istypo=TRUE;
  2022 	    }
  2023 	    /*
  2024 	     * Now exclude the word from being reported if it's in
  2025 	     * the okword list.
  2026 	     */
  2027 	    for (i=0;*okword[i];i++)
  2028 		if (!strcmp(testword,okword[i]))
  2029 		    istypo=FALSE;
  2030 	    /*
  2031 	     * What looks like a typo may be a Roman numeral.
  2032 	     * Exclude these.
  2033 	     */
  2034 	    if (istypo && isroman(testword))
  2035 		istypo=FALSE;
  2036 	    /* Check the manual list of typos. */
  2037 	    if (!istypo)
  2038 		for (i=0;*typo[i];i++)
  2039 		    if (!strcmp(testword,typo[i]))
  2040 			istypo=TRUE;
  2041 	    /*
  2042 	     * Check lowercase s, l, i and m - special cases.
  2043 	     *   "j" - often a semi-colon gone wrong.
  2044 	     *   "d" for a missing apostrophe - he d
  2045 	     *   "n" for "in"
  2046 	     */
  2047 	    if (!istypo && len==1 &&
  2048 	      g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
  2049 		istypo=TRUE;
  2050 	    if (istypo)
  2051 	    {
  2052 		dupcnt=g_tree_lookup(qword,testword);
  2053 		if (dupcnt)
  2054 		{
  2055 		    (*dupcnt)++;
  2056 		    isdup=!pswit[VERBOSE_SWITCH];
  2057 		}
  2058 		else
  2059 		{
  2060 		    dupcnt=g_new0(int,1);
  2061 		    g_tree_insert(qword,g_strdup(testword),dupcnt);
  2062 		    isdup=FALSE;
  2063 		}
  2064 		if (!isdup)
  2065 		{
  2066 		    if (pswit[ECHO_SWITCH])
  2067 			g_print("\n%s\n",aline);
  2068 		    if (!pswit[OVERVIEW_SWITCH])
  2069 		    {
  2070 			g_print("    Line %ld column %ld - Query word %s",
  2071 			  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
  2072 			  inword);
  2073 			if (!pswit[VERBOSE_SWITCH])
  2074 			    g_print(" - not reporting duplicates");
  2075 			g_print("\n");
  2076 		    }
  2077 		    else
  2078 			cnt_word++;
  2079 		}
  2080 	    }
  2081 	}
  2082 	/* check the user's list of typos */
  2083 	if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
  2084 	{
  2085 	    if (pswit[ECHO_SWITCH])
  2086 		g_print("\n%s\n",aline);
  2087 	    if (!pswit[OVERVIEW_SWITCH])  
  2088 		g_print("    Line %ld column %ld - Query possible scanno %s\n",
  2089 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
  2090 	}
  2091 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
  2092 	    g_free(testword);
  2093 	if (pswit[PARANOID_SWITCH] && warnings->digit)
  2094 	{
  2095 	    /* In paranoid mode, query all 0 and 1 standing alone. */
  2096 	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))
  2097 	    {
  2098 		if (pswit[ECHO_SWITCH])
  2099 		    g_print("\n%s\n",aline);
  2100 		if (!pswit[OVERVIEW_SWITCH])
  2101 		    g_print("    Line %ld column %ld - Query standalone %s\n",
  2102 		      linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
  2103 		      inword);
  2104 		else
  2105 		    cnt_word++;
  2106 	    }
  2107 	}
  2108 	g_free(inword);
  2109     }
  2110 }
  2111 
  2112 /*
  2113  * check_for_misspaced_punctuation:
  2114  *
  2115  * Look for added or missing spaces around punctuation and quotes.
  2116  * If there is a punctuation character like ! with no space on
  2117  * either side, suspect a missing!space. If there are spaces on
  2118  * both sides , assume a typo. If we see a double quote with no
  2119  * space or punctuation on either side of it, assume unspaced
  2120  * quotes "like"this.
  2121  */
  2122 void check_for_misspaced_punctuation(const char *aline,
  2123   struct parities *parities,gboolean isemptyline)
  2124 {
  2125     gboolean isacro,isellipsis;
  2126     const char *s;
  2127     gunichar c,nc,pc,n2c;
  2128     int parity;
  2129     c=g_utf8_get_char(aline);
  2130     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2131     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2132     {
  2133 	pc=c;
  2134 	c=nc;
  2135 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2136 	/* For each character in the line after the first. */
  2137 	if (g_utf8_strchr(".?!,;:_",-1,c))  /* if it's punctuation */
  2138 	{
  2139 	    /* we need to suppress warnings for acronyms like M.D. */
  2140 	    isacro=FALSE;
  2141 	    /* we need to suppress warnings for ellipsis . . . */
  2142 	    isellipsis=FALSE;
  2143 	    /*
  2144 	     * If there are letters on both sides of it or
  2145 	     * if it's strict punctuation followed by an alpha.
  2146 	     */
  2147 	    if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
  2148 	      g_utf8_strchr("?!,;:",-1,c)))
  2149 	    {
  2150 		if (c=='.')
  2151 		{
  2152 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&
  2153 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
  2154 			isacro=TRUE;
  2155 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
  2156 		    if (nc && n2c=='.')
  2157 			isacro=TRUE;
  2158 		}
  2159 		if (!isacro)
  2160 		{
  2161 		    if (pswit[ECHO_SWITCH])
  2162 			g_print("\n%s\n",aline);
  2163 		    if (!pswit[OVERVIEW_SWITCH])
  2164 			g_print("    Line %ld column %ld - Missing space?\n",
  2165 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2166 		    else
  2167 			cnt_punct++;
  2168 		}
  2169 	    }
  2170 	    if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
  2171 	    {
  2172 		/*
  2173 		 * If there are spaces on both sides,
  2174 		 * or space before and end of line.
  2175 		 */
  2176 		if (c=='.')
  2177 		{
  2178 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&
  2179 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
  2180 			isellipsis=TRUE;
  2181 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
  2182 		    if (nc && n2c=='.')
  2183 			isellipsis=TRUE;
  2184 		}
  2185 		if (!isemptyline && !isellipsis)
  2186 		{
  2187 		    if (pswit[ECHO_SWITCH])
  2188 			g_print("\n%s\n",aline);
  2189 		    if (!pswit[OVERVIEW_SWITCH])
  2190 			g_print("    Line %ld column %ld - "
  2191 			  "Spaced punctuation?\n",linecnt,
  2192 			  g_utf8_pointer_to_offset(aline,s)+1);
  2193 		    else
  2194 			cnt_punct++;
  2195 		}
  2196 	    }
  2197 	}
  2198     }
  2199     /* Split out the characters that CANNOT be preceded by space. */
  2200     c=g_utf8_get_char(aline);
  2201     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2202     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2203     {
  2204 	pc=c;
  2205 	c=nc;
  2206 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2207 	/* for each character in the line after the first */
  2208 	if (g_utf8_strchr("?!,;:",-1,c))
  2209 	{
  2210 	    /* if it's punctuation that _cannot_ have a space before it */
  2211 	    if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
  2212 	    {
  2213 		/*
  2214 		 * If nc DOES == space,
  2215 		 * it was already reported just above.
  2216 		 */
  2217 		if (pswit[ECHO_SWITCH])
  2218 		    g_print("\n%s\n",aline);
  2219 		if (!pswit[OVERVIEW_SWITCH])
  2220 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",
  2221 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2222 		else
  2223 		    cnt_punct++;
  2224 	    }
  2225 	}
  2226     }
  2227     /*
  2228      * Special case " .X" where X is any alpha.
  2229      * This plugs a hole in the acronym code above.
  2230      * Inelegant, but maintainable.
  2231      */
  2232     c=g_utf8_get_char(aline);
  2233     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2234     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2235     {
  2236 	pc=c;
  2237 	c=nc;
  2238 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2239 	/* for each character in the line after the first */
  2240 	if (c=='.')
  2241 	{
  2242 	    /* if it's a period */
  2243 	    if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
  2244 	    {
  2245 		/*
  2246 		 * If the period follows a space and
  2247 		 * is followed by a letter.
  2248 		 */
  2249 		if (pswit[ECHO_SWITCH])
  2250 		    g_print("\n%s\n",aline);
  2251 		if (!pswit[OVERVIEW_SWITCH])
  2252 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",
  2253 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2254 		else
  2255 		    cnt_punct++;
  2256 	    }
  2257 	}
  2258     }
  2259     c=g_utf8_get_char(aline);
  2260     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2261     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2262     {
  2263 	pc=c;
  2264 	c=nc;
  2265 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2266 	/* for each character in the line after the first */
  2267 	if (CHAR_IS_DQUOTE(c))
  2268 	{
  2269 	    if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
  2270 	      !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
  2271 	      !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
  2272 	    {
  2273 		if (pswit[ECHO_SWITCH])
  2274 		    g_print("\n%s\n",aline);
  2275 		if (!pswit[OVERVIEW_SWITCH])
  2276 		    g_print("    Line %ld column %ld - Unspaced quotes?\n",
  2277 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2278 		else
  2279 		    cnt_punct++;
  2280 	    }
  2281 	}
  2282     }
  2283     /* Check parity of quotes. */
  2284     nc=g_utf8_get_char(aline);
  2285     for (s=aline;*s;s=g_utf8_next_char(s))
  2286     {
  2287 	c=nc;
  2288 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2289 	if (CHAR_IS_DQUOTE(c))
  2290 	{
  2291 	    if (c==CHAR_DQUOTE)
  2292 	    {
  2293 		parities->dquote=!parities->dquote;
  2294 		parity=parities->dquote;
  2295 	    }
  2296 	    else if (c==CHAR_LD_QUOTE)
  2297 		parity=1;
  2298 	    else
  2299 		parity=0;
  2300 	    if (!parity)
  2301 	    {
  2302 		/* parity even */
  2303 		if (!g_utf8_strchr("_-.'`‘’/,;:!?)]} ",-1,nc))
  2304 		{
  2305 		    if (pswit[ECHO_SWITCH])
  2306 			g_print("\n%s\n",aline);
  2307 		    if (!pswit[OVERVIEW_SWITCH])
  2308 			g_print("    Line %ld column %ld - "
  2309 			  "Wrongspaced quotes?\n",
  2310 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2311 		    else
  2312 			cnt_punct++;
  2313 		}
  2314 	    }
  2315 	    else
  2316 	    {
  2317 		/* parity odd */
  2318 		if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
  2319 		  !g_utf8_strchr("_-/.'`‘’([{$",-1,nc) || !nc)
  2320 		{
  2321 		    if (pswit[ECHO_SWITCH])
  2322 			g_print("\n%s\n",aline);
  2323 		    if (!pswit[OVERVIEW_SWITCH])
  2324 			g_print("    Line %ld column %ld - "
  2325 			  "Wrongspaced quotes?\n",
  2326 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2327 		    else
  2328 			cnt_punct++;
  2329 		}
  2330 	    }
  2331 	}
  2332     }
  2333     c=g_utf8_get_char(aline);
  2334     if (CHAR_IS_DQUOTE(c))
  2335     {
  2336 	if (g_utf8_strchr(",;:!?)]} ",-1,
  2337 	  g_utf8_get_char(g_utf8_next_char(aline))))
  2338 	{
  2339 	    if (pswit[ECHO_SWITCH])
  2340 		g_print("\n%s\n",aline);
  2341 	    if (!pswit[OVERVIEW_SWITCH])
  2342 		g_print("    Line %ld column 1 - Wrongspaced quotes?\n",
  2343 		  linecnt);
  2344 	    else
  2345 		cnt_punct++;
  2346 	}
  2347     }
  2348     if (pswit[SQUOTE_SWITCH])
  2349     {
  2350 	nc=g_utf8_get_char(aline);
  2351 	for (s=aline;*s;s=g_utf8_next_char(s))
  2352 	{
  2353 	    c=nc;
  2354 	    nc=g_utf8_get_char(g_utf8_next_char(s));
  2355 	    if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&
  2356 	      !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
  2357 	      !g_unichar_isalpha(nc)))
  2358 	    {
  2359 		parities->squote=!parities->squote;
  2360 		if (!parities->squote)
  2361 		{
  2362 		    /* parity even */
  2363 		    if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
  2364 		    {
  2365 			if (pswit[ECHO_SWITCH])
  2366 			    g_print("\n%s\n",aline);
  2367 			if (!pswit[OVERVIEW_SWITCH])
  2368 			    g_print("    Line %ld column %ld - "
  2369 			      "Wrongspaced singlequotes?\n",
  2370 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2371 			else
  2372 			    cnt_punct++;
  2373 		    }
  2374 		}
  2375 		else
  2376 		{
  2377 		    /* parity odd */
  2378 		    if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
  2379 		      !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
  2380 		    {
  2381 			if (pswit[ECHO_SWITCH])
  2382 			    g_print("\n%s\n",aline);
  2383 			if (!pswit[OVERVIEW_SWITCH])
  2384 			    g_print("    Line %ld column %ld - "
  2385 			      "Wrongspaced singlequotes?\n",
  2386 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2387 			else
  2388 			    cnt_punct++;
  2389 		    }
  2390 		}
  2391 	    }
  2392 	}
  2393     }
  2394 }
  2395 
  2396 /*
  2397  * check_for_double_punctuation:
  2398  *
  2399  * Look for double punctuation like ,. or ,,
  2400  * Thanks to DW for the suggestion!
  2401  * In books with references, ".," and ".;" are common
  2402  * e.g. "etc., etc.," and vol. 1.; vol 3.;
  2403  * OTOH, from my initial tests, there are also fairly
  2404  * common errors. What to do? Make these cases paranoid?
  2405  * ".," is the most common, so warnings->dotcomma is used
  2406  * to suppress detailed reporting if it occurs often.
  2407  */
  2408 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
  2409 {
  2410     const char *s;
  2411     gunichar c,nc;
  2412     nc=g_utf8_get_char(aline);
  2413     for (s=aline;*s;s=g_utf8_next_char(s))
  2414     {
  2415 	c=nc;
  2416 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2417 	/* for each punctuation character in the line */
  2418 	if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
  2419 	  g_utf8_strchr(".?!,;:",-1,nc))
  2420 	{
  2421 	    /* followed by punctuation, it's a query, unless . . . */
  2422 	    if (c==nc && (c=='.' || c=='?' || c=='!') ||
  2423 	      !warnings->dotcomma && c=='.' && nc==',' ||
  2424 	      warnings->isFrench && g_str_has_prefix(s,",...") ||
  2425 	      warnings->isFrench && g_str_has_prefix(s,"...,") ||
  2426 	      warnings->isFrench && g_str_has_prefix(s,";...") ||
  2427 	      warnings->isFrench && g_str_has_prefix(s,"...;") ||
  2428 	      warnings->isFrench && g_str_has_prefix(s,":...") ||
  2429 	      warnings->isFrench && g_str_has_prefix(s,"...:") ||
  2430 	      warnings->isFrench && g_str_has_prefix(s,"!...") ||
  2431 	      warnings->isFrench && g_str_has_prefix(s,"...!") ||
  2432 	      warnings->isFrench && g_str_has_prefix(s,"?...") ||
  2433 	      warnings->isFrench && g_str_has_prefix(s,"...?"))
  2434 	    {
  2435 		if (warnings->isFrench && g_str_has_prefix(s,",...") ||
  2436 		  warnings->isFrench && g_str_has_prefix(s,"...,") ||
  2437 		  warnings->isFrench && g_str_has_prefix(s,";...") ||
  2438 		  warnings->isFrench && g_str_has_prefix(s,"...;") ||
  2439 		  warnings->isFrench && g_str_has_prefix(s,":...") ||
  2440 		  warnings->isFrench && g_str_has_prefix(s,"...:") ||
  2441 		  warnings->isFrench && g_str_has_prefix(s,"!...") ||
  2442 		  warnings->isFrench && g_str_has_prefix(s,"...!") ||
  2443 		  warnings->isFrench && g_str_has_prefix(s,"?...") ||
  2444 		  warnings->isFrench && g_str_has_prefix(s,"...?"))
  2445 		{
  2446 		    s+=4;
  2447 		    nc=g_utf8_get_char(g_utf8_next_char(s));
  2448 		}
  2449 		; /* do nothing for .. !! and ?? which can be legit */
  2450 	    }
  2451 	    else
  2452 	    {
  2453 		if (pswit[ECHO_SWITCH])
  2454 		    g_print("\n%s\n",aline);
  2455 		if (!pswit[OVERVIEW_SWITCH])
  2456 		    g_print("    Line %ld column %ld - Double punctuation?\n",
  2457 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2458 		else
  2459 		    cnt_punct++;
  2460 	    }
  2461 	}
  2462     }
  2463 }
  2464 
  2465 /*
  2466  * check_for_spaced_quotes:
  2467  */
  2468 void check_for_spaced_quotes(const char *aline)
  2469 {
  2470     int i;
  2471     const char *s,*t;
  2472     const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,
  2473       CHAR_RS_QUOTE};
  2474     GString *pattern;
  2475     s=aline;
  2476     while ((t=strstr(s," \" ")))
  2477     {
  2478 	if (pswit[ECHO_SWITCH])
  2479 	    g_print("\n%s\n",aline);
  2480 	if (!pswit[OVERVIEW_SWITCH])
  2481 	    g_print("    Line %ld column %ld - Spaced doublequote?\n",
  2482 	      linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  2483 	else
  2484 	    cnt_punct++;
  2485 	s=g_utf8_next_char(g_utf8_next_char(t));
  2486     }
  2487     pattern=g_string_new(NULL);
  2488     for(i=0;i<G_N_ELEMENTS(single_quotes);i++)
  2489     {
  2490 	g_string_assign(pattern," ");
  2491 	g_string_append_unichar(pattern,single_quotes[i]);
  2492 	g_string_append_c(pattern,' ');
  2493 	s=aline;
  2494 	while ((t=strstr(s,pattern->str)))
  2495 	{
  2496 	    if (pswit[ECHO_SWITCH])
  2497 		g_print("\n%s\n",aline);
  2498 	    if (!pswit[OVERVIEW_SWITCH])
  2499 		g_print("    Line %ld column %ld - Spaced singlequote?\n",
  2500 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  2501 	    else
  2502 		cnt_punct++;
  2503 	    s=g_utf8_next_char(g_utf8_next_char(t));
  2504 	}
  2505     }
  2506     g_string_free(pattern,TRUE);
  2507 }
  2508 
  2509 /*
  2510  * check_for_miscased_genative:
  2511  *
  2512  * Check special case of 'S instead of 's at end of word.
  2513  */
  2514 void check_for_miscased_genative(const char *aline)
  2515 {
  2516     const char *s;
  2517     gunichar c,nc,pc;
  2518     if (!*aline)
  2519 	return;
  2520     c=g_utf8_get_char(aline);
  2521     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2522     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2523     {
  2524 	pc=c;
  2525 	c=nc;
  2526 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2527 	if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))
  2528 	{
  2529 	    if (pswit[ECHO_SWITCH])
  2530 		g_print("\n%s\n",aline);
  2531 	    if (!pswit[OVERVIEW_SWITCH])
  2532 		g_print("    Line %ld column %ld - Capital \"S\"?\n",
  2533 		  linecnt,g_utf8_pointer_to_offset(aline,s)+2);
  2534 	    else
  2535 		cnt_punct++;
  2536 	}
  2537     }
  2538 }
  2539 
  2540 /*
  2541  * check_end_of_line:
  2542  *
  2543  * Now check special cases - start and end of line -
  2544  * for single and double quotes. Start is sometimes [sic]
  2545  * but better to query it anyway.
  2546  * While we're here, check for dash at end of line.
  2547  */
  2548 void check_end_of_line(const char *aline,struct warnings *warnings)
  2549 {
  2550     int lbytes;
  2551     const char *s;
  2552     gunichar c1,c2;
  2553     lbytes=strlen(aline);
  2554     if (g_utf8_strlen(aline,lbytes)>1)
  2555     {
  2556 	s=g_utf8_prev_char(aline+lbytes);
  2557 	c1=g_utf8_get_char(s);
  2558 	c2=g_utf8_get_char(g_utf8_prev_char(s));
  2559 	if ((CHAR_IS_DQUOTE(c1) || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)
  2560 	{
  2561 	    if (pswit[ECHO_SWITCH])
  2562 		g_print("\n%s\n",aline);
  2563 	    if (!pswit[OVERVIEW_SWITCH])
  2564 		g_print("    Line %ld column %ld - Spaced quote?\n",linecnt,
  2565 		  g_utf8_strlen(aline,lbytes));
  2566 	    else
  2567 		cnt_punct++;
  2568 	}
  2569 	c1=g_utf8_get_char(aline);
  2570 	c2=g_utf8_get_char(g_utf8_next_char(aline));
  2571 	if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)
  2572 	{
  2573 	    if (pswit[ECHO_SWITCH])
  2574 		g_print("\n%s\n",aline);
  2575 	    if (!pswit[OVERVIEW_SWITCH])
  2576 		g_print("    Line %ld column 1 - Spaced quote?\n",linecnt);
  2577 	    else
  2578 		cnt_punct++;
  2579 	}
  2580 	/*
  2581 	 * Dash at end of line may well be legit - paranoid mode only
  2582 	 * and don't report em-dash at line-end.
  2583 	 */
  2584 	if (pswit[PARANOID_SWITCH] && warnings->hyphen)
  2585 	{
  2586 	    for (s=g_utf8_prev_char(aline+lbytes);
  2587 	      s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
  2588 		;
  2589 	    if (g_utf8_get_char(s)=='-' &&
  2590 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')
  2591 	    {
  2592 		if (pswit[ECHO_SWITCH])
  2593 		    g_print("\n%s\n",aline);
  2594 		if (!pswit[OVERVIEW_SWITCH])
  2595 		    g_print("    Line %ld column %ld - "
  2596 		      "Hyphen at end of line?\n",
  2597 		      linecnt,g_utf8_pointer_to_offset(aline,s));
  2598 	    }
  2599 	}
  2600     }
  2601 }
  2602 
  2603 /*
  2604  * check_for_unspaced_bracket:
  2605  *
  2606  * Brackets are often unspaced, but shouldn't be surrounded by alpha.
  2607  * If so, suspect a scanno like "a]most".
  2608  */
  2609 void check_for_unspaced_bracket(const char *aline)
  2610 {
  2611     const char *s;
  2612     gunichar c,nc,pc;
  2613     c=g_utf8_get_char(aline);
  2614     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2615     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2616     {
  2617 	pc=c;
  2618 	c=nc;
  2619 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2620 	if (!nc)
  2621 	    break;
  2622 	/* for each bracket character in the line except 1st & last */
  2623 	if (g_utf8_strchr("{[()]}",-1,c) &&
  2624 	  g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
  2625 	{
  2626 	    if (pswit[ECHO_SWITCH])
  2627 		g_print("\n%s\n",aline);
  2628 	    if (!pswit[OVERVIEW_SWITCH])
  2629 		g_print("    Line %ld column %ld - Unspaced bracket?\n",
  2630 		  linecnt,g_utf8_pointer_to_offset(aline,s));
  2631 	    else
  2632 		cnt_punct++;
  2633 	}
  2634     }
  2635 }
  2636 
  2637 /*
  2638  * check_for_unpunctuated_endquote:
  2639  */
  2640 void check_for_unpunctuated_endquote(const char *aline)
  2641 {
  2642     const char *s;
  2643     gunichar c,nc,pc;
  2644     QuoteClass qc;
  2645     c=g_utf8_get_char(aline);
  2646     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2647     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2648     {
  2649 	pc=c;
  2650 	c=nc;
  2651 	qc=CHAR_IS_DQUOTE(c)?QUOTE_CLASS(c):INVALID_QUOTE;
  2652 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2653 	/* for each character in the line except 1st */
  2654 	if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) && g_unichar_isalpha(pc))
  2655 	{
  2656 	    if (pswit[ECHO_SWITCH])
  2657 		g_print("\n%s\n",aline);
  2658 	    if (!pswit[OVERVIEW_SWITCH])
  2659 		g_print("    Line %ld column %ld - "
  2660 		  "endquote missing punctuation?\n",
  2661 		  linecnt,g_utf8_pointer_to_offset(aline,s));
  2662 	    else
  2663 		cnt_punct++;
  2664 	}
  2665     }
  2666 }
  2667 
  2668 /*
  2669  * check_for_html_tag:
  2670  *
  2671  * Check for <HTML TAG>.
  2672  *
  2673  * If there is a < in the line, followed at some point
  2674  * by a > then we suspect HTML.
  2675  */
  2676 void check_for_html_tag(const char *aline)
  2677 {
  2678     const char *open,*close;
  2679     gchar *tag;
  2680     open=strchr(aline,'<');
  2681     if (open)
  2682     {
  2683 	close=strchr(g_utf8_next_char(open),'>');
  2684 	if (close)
  2685 	{
  2686 	    if (pswit[ECHO_SWITCH])
  2687 		g_print("\n%s\n",aline);
  2688 	    if (!pswit[OVERVIEW_SWITCH])
  2689 	    {
  2690 		tag=g_strndup(open,close-open+1);
  2691 		g_print("    Line %ld column %ld - HTML Tag? %s \n",
  2692 		  linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
  2693 		g_free(tag);
  2694 	    }
  2695 	    else
  2696 		cnt_html++;
  2697 	}
  2698     }
  2699 }
  2700 
  2701 /*
  2702  * check_for_html_entity:
  2703  *
  2704  * Check for &symbol; HTML.
  2705  *
  2706  * If there is a & in the line, followed at
  2707  * some point by a ; then we suspect HTML.
  2708  */
  2709 void check_for_html_entity(const char *aline)
  2710 {
  2711     const char *s,*amp,*scolon;
  2712     gchar *entity;
  2713     amp=strchr(aline,'&');
  2714     if (amp)
  2715     {
  2716 	scolon=strchr(amp,';');
  2717 	if (scolon)
  2718 	{
  2719 	    for (s=amp;s<scolon;s=g_utf8_next_char(s))   
  2720 		if (g_utf8_get_char(s)==CHAR_SPACE)
  2721 		    break;		/* Don't report "Jones & Son;" */
  2722 	    if (s>=scolon)
  2723 	    {
  2724 		if (pswit[ECHO_SWITCH])
  2725 		    g_print("\n%s\n",aline);
  2726 		if (!pswit[OVERVIEW_SWITCH])
  2727 		{
  2728 		    entity=g_strndup(amp,scolon-amp+1);
  2729 		    g_print("    Line %ld column %d - HTML symbol? %s \n",
  2730 		      linecnt,(int)(amp-aline)+1,entity);
  2731 		    g_free(entity);
  2732 		}
  2733 		else
  2734 		    cnt_html++;
  2735 	    }
  2736 	}
  2737     }
  2738 }
  2739 
  2740 /*
  2741  * check_for_omitted_punctuation:
  2742  *
  2743  * Check for omitted punctuation at end of paragraph by working back
  2744  * through prevline. DW.
  2745  * Need to check this only for "normal" paras.
  2746  * So what is a "normal" para?
  2747  *    Not normal if one-liner (chapter headings, etc.)
  2748  *    Not normal if doesn't contain at least one locase letter
  2749  *    Not normal if starts with space
  2750  */
  2751 void check_for_omitted_punctuation(const char *prevline,
  2752   struct line_properties *last,int start_para_line)
  2753 {
  2754     gboolean letter_on_line=FALSE;
  2755     const char *s;
  2756     gunichar c;
  2757     gboolean closing_quote;
  2758     for (s=prevline;*s;s=g_utf8_next_char(s))
  2759 	if (g_unichar_isalpha(g_utf8_get_char(s)))
  2760 	{
  2761 	    letter_on_line=TRUE;
  2762 	    break;
  2763 	}
  2764     /*
  2765      * This next "if" is a problem.
  2766      * If we say "start_para_line <= linecnt - 1", that includes
  2767      * one-line "paragraphs" like chapter heads. Lotsa false positives.
  2768      * If we say "start_para_line < linecnt - 1" it doesn't, but then it
  2769      * misses genuine one-line paragraphs.
  2770      */
  2771     if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
  2772       g_utf8_get_char(prevline)>CHAR_SPACE)
  2773     {
  2774 	s=prevline+strlen(prevline);
  2775 	do
  2776 	{
  2777 	    s=g_utf8_prev_char(s);
  2778 	    c=g_utf8_get_char(s);
  2779 	    if (QUOTE_CLASS(c)==CLOSING_QUOTE || QUOTE_CLASS(c)==NEUTRAL_QUOTE)
  2780 		closing_quote=TRUE;
  2781 	    else
  2782 		closing_quote=FALSE;
  2783 	} while (closing_quote && s>prevline);
  2784 	for (;s>prevline;s=g_utf8_prev_char(s))
  2785 	{
  2786 	    if (g_unichar_isalpha(g_utf8_get_char(s)))
  2787 	    {
  2788 		if (pswit[ECHO_SWITCH])
  2789 		    g_print("\n%s\n",prevline);
  2790 		if (!pswit[OVERVIEW_SWITCH])
  2791 		    g_print("    Line %ld column %ld - "
  2792 		      "No punctuation at para end?\n",
  2793 		      linecnt-1,g_utf8_strlen(prevline,-1));
  2794 		else
  2795 		    cnt_punct++;
  2796 		break;
  2797 	    }
  2798 	    if (g_utf8_strchr("—-.:!([{?}])",-1,g_utf8_get_char(s)))
  2799 		break;
  2800 	}
  2801     }
  2802 }
  2803 
  2804 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
  2805 {
  2806     const char *word=key;
  2807     int *dupcnt=value;
  2808     if (*dupcnt)
  2809 	g_print("\nNote: Queried word %s was duplicated %d times\n",
  2810 	  word,*dupcnt);
  2811     return FALSE;
  2812 }
  2813 
  2814 void print_as_windows_1252(const char *string)
  2815 {
  2816     gsize inbytes,outbytes;
  2817     gchar *buf,*bp;
  2818     static GIConv converter=(GIConv)-1;
  2819     if (!string)
  2820     {
  2821 	if (converter!=(GIConv)-1)
  2822 	    g_iconv_close(converter);
  2823 	converter=(GIConv)-1;
  2824 	return;
  2825     }
  2826     if (converter==(GIConv)-1)
  2827 	converter=g_iconv_open("WINDOWS-1252","UTF-8");
  2828     if (converter!=(GIConv)-1)
  2829     {
  2830 	inbytes=outbytes=strlen(string);
  2831 	bp=buf=g_malloc(outbytes+1);
  2832 	g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
  2833 	*bp='\0';
  2834 	fputs(buf,stdout);
  2835 	g_free(buf);
  2836     }
  2837     else
  2838 	fputs(string,stdout);
  2839 }
  2840 
  2841 void print_as_utf_8(const char *string)
  2842 {
  2843     fputs(string,stdout);
  2844 }
  2845 
  2846 /*
  2847  * procfile:
  2848  *
  2849  * Process one file.
  2850  */
  2851 void procfile(const char *filename)
  2852 {
  2853     const char *s;
  2854     gchar *parastart=NULL;	/* first line of current para */
  2855     gchar *etext,*aline;
  2856     gchar *etext_ptr;
  2857     GError *err=NULL;
  2858     struct first_pass_results *first_pass_results;
  2859     struct warnings *warnings;
  2860     struct counters counters={0};
  2861     struct line_properties last={0};
  2862     struct parities parities={0};
  2863     struct pending pending={0};
  2864     gboolean isemptyline;
  2865     long start_para_line=0;
  2866     gboolean isnewpara=FALSE,enddash=FALSE;
  2867     last.start=CHAR_SPACE;
  2868     linecnt=checked_linecnt=0;
  2869     etext=read_etext(filename,&err);
  2870     if (!etext)
  2871     {
  2872 	if (pswit[STDOUT_SWITCH])
  2873 	    fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
  2874 	else
  2875 	    fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
  2876 	exit(1);
  2877     }
  2878     g_print("\n\nFile: %s\n\n",filename);
  2879     first_pass_results=first_pass(etext);
  2880     warnings=report_first_pass(first_pass_results);
  2881     qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
  2882     qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
  2883     /*
  2884      * Here we go with the main pass. Hold onto yer hat!
  2885      */
  2886     linecnt=0;
  2887     etext_ptr=etext;
  2888     while ((aline=flgets(&etext_ptr,linecnt+1,warnings->nocr)))
  2889     {
  2890 	linecnt++;
  2891 	if (linecnt==1)
  2892 	    isnewpara=TRUE;
  2893 	if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
  2894 	    continue;    // skip DP page separators completely
  2895 	if (linecnt<first_pass_results->firstline ||
  2896 	  (first_pass_results->footerline>0 &&
  2897 	  linecnt>first_pass_results->footerline))
  2898 	{
  2899 	    if (pswit[HEADER_SWITCH])
  2900 	    {
  2901 		if (g_str_has_prefix(aline,"Title:"))
  2902 		    g_print("    %s\n",aline);
  2903 		if (g_str_has_prefix(aline,"Author:"))
  2904 		    g_print("    %s\n",aline);
  2905 		if (g_str_has_prefix(aline,"Release Date:"))
  2906 		    g_print("    %s\n",aline);
  2907 		if (g_str_has_prefix(aline,"Edition:"))
  2908 		    g_print("    %s\n\n",aline);
  2909 	    }
  2910 	    continue;		/* skip through the header */
  2911 	}
  2912 	checked_linecnt++;
  2913 	print_pending(aline,parastart,&pending);
  2914 	isemptyline=analyse_quotes(aline,&counters);
  2915 	if (isnewpara && !isemptyline)
  2916 	{
  2917 	    /* This line is the start of a new paragraph. */
  2918 	    start_para_line=linecnt;
  2919 	    /* Capture its first line in case we want to report it later. */
  2920 	    g_free(parastart);
  2921 	    parastart=g_strdup(aline);
  2922 	    memset(&parities,0,sizeof(parities));  /* restart the quote count */
  2923 	    s=aline;
  2924 	    while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
  2925 	      !g_unichar_isdigit(g_utf8_get_char(s)))
  2926 		s=g_utf8_next_char(s);
  2927 	    if (g_unichar_islower(g_utf8_get_char(s)))
  2928 	    {
  2929 		/* and its first letter is lowercase */
  2930 		if (pswit[ECHO_SWITCH])
  2931 		    g_print("\n%s\n",aline);
  2932 		if (!pswit[OVERVIEW_SWITCH])
  2933 		    g_print("    Line %ld column %ld - "
  2934 		      "Paragraph starts with lower-case\n",
  2935 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2936 		else
  2937 		    cnt_punct++;
  2938 	    }
  2939 	    isnewpara=FALSE; /* Signal the end of new para processing. */
  2940 	}
  2941 	/* Check for an em-dash broken at line end. */
  2942 	if (enddash && g_utf8_get_char(aline)=='-')
  2943 	{
  2944 	    if (pswit[ECHO_SWITCH])
  2945 		g_print("\n%s\n",aline);
  2946 	    if (!pswit[OVERVIEW_SWITCH])
  2947 		g_print("    Line %ld column 1 - Broken em-dash?\n",linecnt);
  2948 	    else
  2949 		cnt_punct++;
  2950 	}
  2951 	enddash=FALSE;
  2952 	for (s=g_utf8_prev_char(aline+strlen(aline));
  2953 	  g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
  2954 	    ;
  2955 	if (s>=aline && g_utf8_get_char(s)=='-')
  2956 	    enddash=TRUE;
  2957 	check_for_control_characters(aline);
  2958 	if (warnings->bin)
  2959 	    check_for_odd_characters(aline,warnings,isemptyline);
  2960 	if (warnings->longline)
  2961 	    check_for_long_line(aline);
  2962 	if (warnings->shortline)
  2963 	    check_for_short_line(aline,&last);
  2964 	last.blen=last.len;
  2965 	last.len=g_utf8_strlen(aline,-1);
  2966 	last.start=g_utf8_get_char(aline);
  2967 	check_for_starting_punctuation(aline);
  2968 	if (warnings->dash)
  2969 	{
  2970 	    check_for_spaced_emdash(aline);
  2971 	    check_for_spaced_dash(aline);
  2972 	}
  2973 	check_for_unmarked_paragraphs(aline);
  2974 	check_for_jeebies(aline);
  2975 	check_for_mta_from(aline);
  2976 	check_for_orphan_character(aline);
  2977 	check_for_pling_scanno(aline);
  2978 	check_for_extra_period(aline,warnings);
  2979 	check_for_following_punctuation(aline);
  2980 	check_for_typos(aline,warnings);
  2981 	check_for_misspaced_punctuation(aline,&parities,isemptyline);
  2982 	check_for_double_punctuation(aline,warnings);
  2983 	check_for_spaced_quotes(aline);
  2984 	check_for_miscased_genative(aline);
  2985 	check_end_of_line(aline,warnings);
  2986 	check_for_unspaced_bracket(aline);
  2987 	if (warnings->endquote)
  2988 	    check_for_unpunctuated_endquote(aline);
  2989 	check_for_html_tag(aline);
  2990 	check_for_html_entity(aline);
  2991 	if (isemptyline)
  2992 	{
  2993 	    check_for_mismatched_quotes(&counters,&pending);
  2994 	    counters_reset(&counters);
  2995 	    /* let the next iteration know that it's starting a new para */
  2996 	    isnewpara=TRUE;
  2997 	    if (prevline)
  2998 		check_for_omitted_punctuation(prevline,&last,start_para_line);
  2999 	}
  3000 	g_free(prevline);
  3001 	prevline=g_strdup(aline);
  3002     }
  3003     linecnt++;
  3004     check_for_mismatched_quotes(&counters,&pending);
  3005     print_pending(NULL,parastart,&pending);
  3006     reset_pending(&pending);
  3007     if (prevline)
  3008     {
  3009 	g_free(prevline);
  3010 	prevline=NULL;
  3011     }
  3012     g_free(parastart);
  3013     g_free(prevline);
  3014     g_free(etext);
  3015     if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])
  3016 	g_tree_foreach(qword,report_duplicate_queries,NULL);
  3017     g_tree_unref(qword);
  3018     g_tree_unref(qperiod);
  3019     counters_destroy(&counters);
  3020     g_set_print_handler(NULL);
  3021     print_as_windows_1252(NULL);
  3022     if (pswit[MARKUP_SWITCH])  
  3023 	loseentities(NULL);
  3024 }
  3025 
  3026 /*
  3027  * flgets:
  3028  *
  3029  * Get one line from the input text, checking for
  3030  * the existence of exactly one CR/LF line-end per line.
  3031  *
  3032  * Returns: a pointer to the line.
  3033  */
  3034 char *flgets(char **etext,long lcnt,gboolean warn_nocr)
  3035 {
  3036     gunichar c;
  3037     gboolean isCR=FALSE;
  3038     char *theline=*etext;
  3039     char *eos=theline;
  3040     gchar *s;
  3041     for (;;)
  3042     {
  3043 	c=g_utf8_get_char(*etext);
  3044 	if (!c)
  3045 	{
  3046 	    if (*etext==theline)
  3047 		return NULL;
  3048 	    else if (pswit[LINE_END_SWITCH])
  3049 	    {
  3050 		if (pswit[ECHO_SWITCH])
  3051 		{
  3052 		    s=g_strndup(theline,eos-theline);
  3053 		    g_print("\n%s\n",s);
  3054 		    g_free(s);
  3055 		}
  3056 		if (!pswit[OVERVIEW_SWITCH])
  3057 		    /* There may, or may not, have been a CR */
  3058 		    g_print("    Line %ld - No LF?\n",lcnt);
  3059 		else
  3060 		    cnt_lineend++;
  3061 	    }
  3062 	    break;
  3063 	}
  3064 	*etext=g_utf8_next_char(*etext);
  3065 	/* either way, it's end of line */
  3066 	if (c=='\n')
  3067 	{
  3068 	    if (isCR)
  3069 		break;
  3070 	    else
  3071 	    {
  3072 		/* Error - a LF without a preceding CR */
  3073 		if (pswit[LINE_END_SWITCH] && warn_nocr)
  3074 		{
  3075 		    if (pswit[ECHO_SWITCH])
  3076 		    {
  3077 			s=g_strndup(theline,eos-theline);
  3078 			g_print("\n%s\n",s);
  3079 			g_free(s);
  3080 		    }
  3081 		    if (!pswit[OVERVIEW_SWITCH])
  3082 			g_print("    Line %ld - No CR?\n",lcnt);
  3083 		    else
  3084 			cnt_lineend++;
  3085 		}
  3086 		break;
  3087 	    }
  3088 	}
  3089 	if (c=='\r')
  3090 	{
  3091 	    if (isCR)
  3092 	    {
  3093 		/* Error - two successive CRs */
  3094 		if (pswit[LINE_END_SWITCH])
  3095 		{
  3096 		    if (pswit[ECHO_SWITCH])
  3097 		    {
  3098 			s=g_strndup(theline,eos-theline);
  3099 			g_print("\n%s\n",s);
  3100 			g_free(s);
  3101 		    }
  3102 		    if (!pswit[OVERVIEW_SWITCH])
  3103 			g_print("    Line %ld - Two successive CRs?\n",lcnt);
  3104 		    else
  3105 			cnt_lineend++;
  3106 		}
  3107 	    }
  3108 	    isCR=TRUE;
  3109 	}
  3110 	else
  3111 	{
  3112 	    if (pswit[LINE_END_SWITCH] && isCR)
  3113 	    {
  3114 		if (pswit[ECHO_SWITCH])
  3115 		{
  3116 		    s=g_strndup(theline,eos-theline);
  3117 		    g_print("\n%s\n",s);
  3118 		    g_free(s);
  3119 		}
  3120 		if (!pswit[OVERVIEW_SWITCH])
  3121 		    g_print("    Line %ld column %ld - CR without LF?\n",
  3122 		      lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
  3123 		else
  3124 		    cnt_lineend++;
  3125 		*eos=' ';
  3126 	    }
  3127 	    isCR=FALSE;
  3128 	    eos=g_utf8_next_char(eos);
  3129 	}
  3130     }
  3131     *eos='\0';
  3132     if (pswit[MARKUP_SWITCH])  
  3133 	postprocess_for_HTML(theline);
  3134     if (pswit[DP_SWITCH])  
  3135 	postprocess_for_DP(theline);
  3136     return theline;
  3137 }
  3138 
  3139 /*
  3140  * mixdigit:
  3141  *
  3142  * Takes a "word" as a parameter, and checks whether it
  3143  * contains a mixture of alpha and digits. Generally, this is an
  3144  * error, but may not be for cases like 4th or L5 12s. 3d.
  3145  *
  3146  * Returns: TRUE iff an is error found.
  3147  */
  3148 gboolean mixdigit(const char *checkword)
  3149 {
  3150     gboolean wehaveadigit,wehavealetter,query;
  3151     const char *s,*nondigit;
  3152     wehaveadigit=wehavealetter=query=FALSE;
  3153     for (s=checkword;*s;s=g_utf8_next_char(s))
  3154 	if (g_unichar_isalpha(g_utf8_get_char(s)))
  3155 	    wehavealetter=TRUE;
  3156 	else if (g_unichar_isdigit(g_utf8_get_char(s)))
  3157 	    wehaveadigit=TRUE;
  3158     if (wehaveadigit && wehavealetter)
  3159     {
  3160 	/* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
  3161 	query=TRUE;
  3162 	for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
  3163 	  nondigit=g_utf8_next_char(nondigit))
  3164 	    ;
  3165 	/* digits, ending in st, rd, nd, th of either case */
  3166 	if (!g_ascii_strcasecmp(nondigit,"st") ||
  3167 	  !g_ascii_strcasecmp(nondigit,"rd") ||
  3168 	  !g_ascii_strcasecmp(nondigit,"nd") ||
  3169 	  !g_ascii_strcasecmp(nondigit,"th"))
  3170 	    query=FALSE;
  3171 	if (!g_ascii_strcasecmp(nondigit,"sts") ||
  3172 	  !g_ascii_strcasecmp(nondigit,"rds") ||
  3173 	  !g_ascii_strcasecmp(nondigit,"nds") ||
  3174 	  !g_ascii_strcasecmp(nondigit,"ths"))
  3175 	    query=FALSE;
  3176 	if (!g_ascii_strcasecmp(nondigit,"stly") ||
  3177 	  !g_ascii_strcasecmp(nondigit,"rdly") ||
  3178 	  !g_ascii_strcasecmp(nondigit,"ndly") ||
  3179 	  !g_ascii_strcasecmp(nondigit,"thly"))
  3180 	    query=FALSE;
  3181 	/* digits, ending in l, L, s or d */
  3182 	if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
  3183 	  !strcmp(nondigit,"d"))
  3184 	    query=FALSE;
  3185 	/*
  3186 	 * L at the start of a number, representing Britsh pounds, like L500.
  3187 	 * This is cute. We know the current word is mixed digit. If the first
  3188 	 * letter is L, there must be at least one digit following. If both
  3189 	 * digits and letters follow, we have a genuine error, else we have a
  3190 	 * capital L followed by digits, and we accept that as a non-error.
  3191 	 */
  3192 	if (g_utf8_get_char(checkword)=='L' &&
  3193 	  !mixdigit(g_utf8_next_char(checkword)))
  3194 	    query=FALSE;
  3195     }
  3196     return query;
  3197 }
  3198 
  3199 /*
  3200  * getaword:
  3201  *
  3202  * Extracts the first/next "word" from the line, and returns it.
  3203  * A word is defined as one English word unit--or at least that's the aim.
  3204  * "ptr" is advanced to the position in the line where we will start
  3205  * looking for the next word.
  3206  *
  3207  * Returns: A newly-allocated string.
  3208  */
  3209 gchar *getaword(const char **ptr)
  3210 {
  3211     const char *s,*t;
  3212     GString *word;
  3213     gunichar c,pc;
  3214     word=g_string_new(NULL);
  3215     for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
  3216       !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
  3217       **ptr;*ptr=g_utf8_next_char(*ptr))
  3218     {
  3219 	/* Handle exceptions for footnote markers like [1] */
  3220 	if (g_utf8_get_char(*ptr)=='[')
  3221 	{
  3222 	    g_string_append_c(word,'[');
  3223 	    s=g_utf8_next_char(*ptr);
  3224 	    for (;g_unichar_isdigit(g_utf8_get_char(s));s=g_utf8_next_char(s))
  3225 		g_string_append_unichar(word,g_utf8_get_char(s));
  3226 	    if (g_utf8_get_char(s)==']')
  3227 	    {
  3228 		g_string_append_c(word,']');
  3229 		*ptr=g_utf8_next_char(s);
  3230 		return g_string_free(word,FALSE);
  3231 	    }
  3232 	    else
  3233 		g_string_truncate(word,0);
  3234 	}
  3235     }
  3236     /*
  3237      * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
  3238      * Especially yucky is the case of L1,000
  3239      * This section looks for a pattern of characters including a digit
  3240      * followed by a comma or period followed by one or more digits.
  3241      * If found, it returns this whole pattern as a word; otherwise we discard
  3242      * the results and resume our normal programming.
  3243      */
  3244     s=*ptr;
  3245     for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
  3246       g_unichar_isalpha(g_utf8_get_char(s)) ||
  3247       g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
  3248 	g_string_append_unichar(word,g_utf8_get_char(s));
  3249     if (word->len)
  3250     {
  3251 	for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))
  3252 	{
  3253 	    c=g_utf8_get_char(t);
  3254 	    pc=g_utf8_get_char(g_utf8_prev_char(t));
  3255 	    if ((c=='.' || c==',') && g_unichar_isdigit(pc))
  3256 	    {
  3257 		*ptr=s;
  3258 		return g_string_free(word,FALSE);
  3259 	    }
  3260 	}
  3261     }
  3262     /* we didn't find a punctuated number - do the regular getword thing */
  3263     g_string_truncate(word,0);
  3264     c=g_utf8_get_char(*ptr);
  3265     for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);
  3266       *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))
  3267 	g_string_append_unichar(word,c);
  3268     return g_string_free(word,FALSE);
  3269 }
  3270 
  3271 /*
  3272  * isroman:
  3273  *
  3274  * Is this word a Roman Numeral?
  3275  *
  3276  * It doesn't actually validate that the number is a valid Roman Numeral--for
  3277  * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
  3278  * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
  3279  * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
  3280  * expressions thereof, except when it came to taxes. Allow any number of M,
  3281  * an optional D, an optional CM or CD, any number of optional Cs, an optional
  3282  * XL or an optional XC, an optional IX or IV, an optional V and any number
  3283  * of optional Is.
  3284  */
  3285 gboolean isroman(const char *t)
  3286 {
  3287     const char *s;
  3288     if (!t || !*t)
  3289 	return FALSE;
  3290     s=t;
  3291     while (g_utf8_get_char(t)=='m' && *t)
  3292 	t++;
  3293     if (g_utf8_get_char(t)=='d')
  3294 	t++;
  3295     if (g_str_has_prefix(t,"cm"))
  3296 	t+=2;
  3297     if (g_str_has_prefix(t,"cd"))
  3298 	t+=2;
  3299     while (g_utf8_get_char(t)=='c' && *t)
  3300 	t++;
  3301     if (g_str_has_prefix(t,"xl"))
  3302 	t+=2;
  3303     if (g_str_has_prefix(t,"xc"))
  3304 	t+=2;
  3305     if (g_utf8_get_char(t)=='l')
  3306 	t++;
  3307     while (g_utf8_get_char(t)=='x' && *t)
  3308 	t++;
  3309     if (g_str_has_prefix(t,"ix"))
  3310 	t+=2;
  3311     if (g_str_has_prefix(t,"iv"))
  3312 	t+=2;
  3313     if (g_utf8_get_char(t)=='v')
  3314 	t++;
  3315     while (g_utf8_get_char(t)=='i' && *t)
  3316 	t++;
  3317     return !*t;
  3318 }
  3319 
  3320 /*
  3321  * postprocess_for_DP:
  3322  *
  3323  * Invoked with the -d switch from flgets().
  3324  * It simply "removes" from the line a hard-coded set of common
  3325  * DP-specific tags, so that the line passed to the main routine has
  3326  * been pre-cleaned of DP markup.
  3327  */
  3328 void postprocess_for_DP(char *theline)
  3329 {
  3330     char *s,*t;
  3331     int i;
  3332     if (!*theline) 
  3333 	return;
  3334     for (i=0;*DPmarkup[i];i++)
  3335 	while ((s=strstr(theline,DPmarkup[i])))
  3336 	{
  3337 	    t=s+strlen(DPmarkup[i]);
  3338 	    memmove(s,t,strlen(t)+1);
  3339 	}
  3340 }
  3341 
  3342 /*
  3343  * postprocess_for_HTML:
  3344  *
  3345  * Invoked with the -m switch from flgets().
  3346  * It simply "removes" from the line a hard-coded set of common
  3347  * HTML tags and "replaces" a hard-coded set of common HTML
  3348  * entities, so that the line passed to the main routine has
  3349  * been pre-cleaned of HTML.
  3350  */
  3351 void postprocess_for_HTML(char *theline)
  3352 {
  3353     while (losemarkup(theline))
  3354 	;
  3355     loseentities(theline);
  3356 }
  3357 
  3358 char *losemarkup(char *theline)
  3359 {
  3360     char *s,*t;
  3361     int i;
  3362     s=strchr(theline,'<');
  3363     t=s?strchr(s,'>'):NULL;
  3364     if (!s || !t)
  3365 	return NULL;
  3366     for (i=0;*markup[i];i++)
  3367 	if (tagcomp(g_utf8_next_char(s),markup[i]))
  3368 	{
  3369 	    t=g_utf8_next_char(t);
  3370 	    memmove(s,t,strlen(t)+1);
  3371 	    return s;
  3372 	}
  3373     /* It's an unrecognized <xxx>. */
  3374     return NULL;
  3375 }
  3376 
  3377 void loseentities(char *theline)
  3378 {
  3379     int i;
  3380     gsize nb;
  3381     char *amp,*scolon;
  3382     gchar *s,*t;
  3383     gunichar c;
  3384     GTree *entities=NULL;
  3385     static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
  3386     if (!theline)
  3387     {
  3388 	if (entities)
  3389 	    g_tree_destroy(entities);
  3390 	entities=NULL;
  3391 	if (translit!=(GIConv)-1)
  3392 	    g_iconv_close(translit);
  3393 	translit=(GIConv)-1;
  3394 	if (to_utf8!=(GIConv)-1)
  3395 	    g_iconv_close(to_utf8);
  3396 	to_utf8=(GIConv)-1;
  3397 	return;
  3398     }
  3399     if (!*theline)
  3400 	return;
  3401     if (!entities)
  3402     {
  3403 	entities=g_tree_new((GCompareFunc)strcmp);
  3404 	for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
  3405 	    g_tree_insert(entities,HTMLentities[i].name,
  3406 	      GUINT_TO_POINTER(HTMLentities[i].c));
  3407     }
  3408     if (translit==(GIConv)-1)
  3409 	translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
  3410     if (to_utf8==(GIConv)-1)
  3411 	to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
  3412     while((amp=strchr(theline,'&')))
  3413     {
  3414 	scolon=strchr(amp,';');
  3415 	if (scolon)
  3416 	{
  3417 	    if (amp[1]=='#')
  3418 	    {
  3419 		if (amp+2+strspn(amp+2,"0123456789")==scolon)
  3420 		    c=strtol(amp+2,NULL,10);
  3421 		else if (amp[2]=='x' &&
  3422 		  amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
  3423 		    c=strtol(amp+3,NULL,16);
  3424 	    }
  3425 	    else
  3426 	    {
  3427 		s=g_strndup(amp+1,scolon-(amp+1));
  3428 	        c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
  3429 		g_free(s);
  3430 	    }
  3431 	}
  3432 	else
  3433 	    c=0;
  3434 	if (c)
  3435 	{
  3436 	    theline=amp;
  3437 	    if (c<128 || c>=192 && c<=255)	/* An ISO-8859-1 character */
  3438 		theline+=g_unichar_to_utf8(c,theline);
  3439 	    else
  3440 	    {
  3441 		s=g_malloc(6);
  3442 		nb=g_unichar_to_utf8(c,s);
  3443 		t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
  3444 		g_free(s);
  3445 		s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
  3446 		g_free(t);
  3447 		memcpy(theline,s,nb);
  3448 		g_free(s);
  3449 		theline+=nb;
  3450 	    }
  3451 	    memmove(theline,g_utf8_next_char(scolon),
  3452 	      strlen(g_utf8_next_char(scolon))+1);
  3453 	}
  3454 	else
  3455 	    theline=g_utf8_next_char(amp);
  3456     }
  3457 }
  3458 
  3459 gboolean tagcomp(const char *strin,const char *basetag)
  3460 {
  3461     gboolean retval;
  3462     gchar *s,*t;
  3463     if (g_utf8_get_char(strin)=='/')
  3464 	t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
  3465     else
  3466 	t=g_utf8_casefold(strin,-1);
  3467     s=g_utf8_casefold(basetag,-1);
  3468     retval=g_str_has_prefix(t,s);
  3469     g_free(s);
  3470     g_free(t);
  3471     return retval;
  3472 }
  3473 
  3474 void proghelp(GOptionContext *context)
  3475 {
  3476     gchar *help;
  3477     fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
  3478     fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
  3479     fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
  3480     fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
  3481       "For details, read the file COPYING.\n",stderr);
  3482     fputs("This is Free Software; "
  3483       "you may redistribute it under certain conditions (GPL);\n",stderr);
  3484     fputs("read the file COPYING for details.\n\n",stderr);
  3485     help=g_option_context_get_help(context,TRUE,NULL);
  3486     fputs(help,stderr);
  3487     g_free(help);
  3488     fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
  3489     fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
  3490       "non-ASCII\n",stderr);
  3491     fputs("characters like accented letters, "
  3492       "lines longer than 75 or shorter than 55,\n",stderr);
  3493     fputs("unbalanced quotes or brackets, "
  3494       "a variety of badly formatted punctuation, \n",stderr);
  3495     fputs("HTML tags, some likely typos. "
  3496       "It is NOT a substitute for human judgement.\n",stderr);
  3497     fputs("\n",stderr);
  3498 }