bookloupe/bookloupe.c
author ali <ali@juiblex.co.uk>
Sun Sep 29 22:51:27 2013 +0100 (2013-09-29)
changeset 136 2f3762ff90d8
parent 103 adc06e9e8470
child 137 b6358ed2548d
permissions -rw-r--r--
Fix bug #14: Add a configuration file
     1 /*************************************************************************/
     2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
     3 /*									 */
     4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>			 */
     5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>			 */
     6 /*									 */
     7 /* This program is free software; you can redistribute it and/or modify  */
     8 /* it under the terms of the GNU General Public License as published by  */
     9 /* the Free Software Foundation; either version 2 of the License, or     */
    10 /* (at your option) any later version.					 */
    11 /*									 */
    12 /* This program is distributed in the hope that it will be useful,       */
    13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of	 */
    14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the		 */
    15 /* GNU General Public License for more details.				 */
    16 /*									 */
    17 /* You should have received a copy of the GNU General Public License	 */
    18 /* along with this program. If not, see <http://www.gnu.org/licenses/>.	 */
    19 /*************************************************************************/
    20 
    21 #include <stdio.h>
    22 #include <stdlib.h>
    23 #include <string.h>
    24 #include <ctype.h>
    25 #ifdef __WIN32__
    26 #include <windows.h>
    27 #endif
    28 #include <glib.h>
    29 #include <bl/bl.h>
    30 #include "bookloupe.h"
    31 #include "counters.h"
    32 #include "pending.h"
    33 #include "HTMLentities.h"
    34 
/*
 * NOTE(review): prevline is not referenced in the part of the file reviewed
 * here; presumably it holds the previously processed line -- confirm
 * against its users.
 */
gchar *prevline;
    36 
/*
 * Common typos.
 *
 * The list is terminated by an empty string. Entries such as "bave",
 * "hecause" and "tirne" suggest that it covers typical scanning errors
 * (h/b and m/rn confusion) as well as keyboard slips.
 */
char *typo[] = {
    "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
    "nad", "te", "ig", "acn",  "ahve", "alot", "anbd", "andt", "awya", "aywa",
    "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
    "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
    "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
    "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
    "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
    "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
    "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
    "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
    "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
    "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
    "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
    "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
    "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
    "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
    "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
    "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
    "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
    "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
    "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
    "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
    "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
    "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
    "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
    "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
    "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
    "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
    "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
    "se", ""
};
    70 
/*
 * User-defined typos, built by read_user_scannos(): each key is one line of
 * the user typo file and every value is 1. NULL until that function has
 * successfully read a file.
 */
GTree *usertypo;
    72 
/*
 * Common abbreviations and other OK words not to query as typos.
 * The list is terminated by an empty string.
 */
char *okword[] = {
    "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
    "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
    "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
    "outbid", "outbids", "frostbite", "frostbitten", ""
};
    80 
/*
 * Common abbreviations that cause otherwise unexplained periods.
 * The list is terminated by an empty string.
 */
char *abbrev[] = {
    "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
    "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
};
    86 
/*
 * Two-Letter combinations that rarely if ever start words,
 * but are common scannos or otherwise common letter combinations.
 * The list is terminated by an empty string.
 */
char *nostart[] = {
    "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
};
    94 
/*
 * Two-Letter combinations that rarely if ever end words,
 * but are common scannos or otherwise common letter combinations.
 * The list is terminated by an empty string.
 */
char *noend[] = {
    "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
    "sw", "gr", "sl", "cl", "iy", ""
};
   103 
/*
 * HTML element names, in lower case and without angle brackets.
 * The list is terminated by an empty string.
 * NOTE(review): the code that consults this table is outside the section
 * reviewed here; presumably it is used to recognise markup in the text --
 * confirm.
 */
char *markup[] = {
    "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
    "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
    "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
    "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
};
   110 
/*
 * Markup tokens, listed in full (including their delimiters) and terminated
 * by an empty string.
 * NOTE(review): presumably this is the "DP-specific markup" that the --dp
 * switch ignores; the code using the table is outside the section reviewed
 * here -- confirm.
 */
char *DPmarkup[] = {
    "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
};
   114 
/*
 * Lower-case words, terminated by an empty string.
 * NOTE(review): judging by the name, presumably words that are not expected
 * to be followed by a comma; the lookup code is outside the section
 * reviewed here -- confirm.
 * "mrs" is listed twice; whether that matters depends on how the table is
 * searched.
 */
char *nocomma[] = {
    "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
    "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
    "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
    "during", "let", "toward", "among", ""
};
   121 
/*
 * Lower-case words, terminated by an empty string.
 * NOTE(review): judging by the name, presumably words that are not expected
 * to be followed by a period; the lookup code is outside the section
 * reviewed here -- confirm.
 */
char *noperiod[] = {
    "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
    "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
    "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
    "among", "those", "into", "whom", "having", "thence", ""
};
   128 
/*
 * Program switches, indexed by the *_SWITCH constants. main() sets the
 * defaults, after which the configuration file and then the command line
 * may change them.
 */
gboolean pswit[SWITNO];  /* program switches */

/*
 * Set by the Gutcheck compatibility options (--toggle-typo and
 * --toggle-relaxed); parse_options() applies them as toggles on pswit[].
 */
gboolean typo_compat,paranoid_compat;
   132 
   133 static GOptionEntry options[]={
   134     { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
   135       "Ignore DP-specific markup", NULL },
   136     { "no-dp", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   137       G_OPTION_ARG_NONE, pswit+DP_SWITCH,
   138       "Don't ignore DP-specific markup", NULL },
   139     { "echo", 0, G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
   140       "Echo queried line", NULL },
   141     { "no-echo", 'e', G_OPTION_FLAG_REVERSE,
   142       G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
   143       "Don't echo queried line", NULL },
   144     { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
   145       "Check single quotes", NULL },
   146     { "no-squote", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   147       G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
   148       "Don't check single quotes", NULL },
   149     { "typo", 0, 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
   150       "Check common typos", NULL },
   151     { "no-typo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   152       G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
   153       "Don't check common typos", NULL },
   154     { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
   155       "Require closure of quotes on every paragraph", NULL },
   156     { "no-qpara", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   157       G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
   158       "Don't require closure of quotes on every paragraph", NULL },
   159     { "paranoid", 0, G_OPTION_FLAG_HIDDEN,
   160       G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
   161       "Enable paranoid querying of everything", NULL },
   162     { "no-paranoid", 0, G_OPTION_FLAG_REVERSE,
   163       G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
   164       "Disable paranoid querying of everything", NULL },
   165     { "line-end", 0, G_OPTION_FLAG_HIDDEN,
   166       G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
   167       "Enable line end checking", NULL },
   168     { "no-line-end", 'l', G_OPTION_FLAG_REVERSE,
   169       G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
   170       "Diable line end checking", NULL },
   171     { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
   172       "Overview: just show counts", NULL },
   173     { "no-overview", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   174       G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
   175       "Show individual warnings", NULL },
   176     { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
   177       "Output errors to stdout instead of stderr", NULL },
   178     { "no-stdout", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   179       G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
   180       "Output errors to stderr instead of stdout", NULL },
   181     { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
   182       "Echo header fields", NULL },
   183     { "no-header", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   184       G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
   185       "Don't echo header fields", NULL },
   186     { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
   187       "Ignore markup in < >", NULL },
   188     { "no-markup", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   189       G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
   190       "No special handling for markup in < >", NULL },
   191     { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
   192       "Use file of user-defined typos", NULL },
   193     { "no-usertypo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   194       G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
   195       "Ignore file of user-defined typos", NULL },
   196     { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
   197       "Verbose - list everything", NULL },
   198     { "no-verbose", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   199       G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
   200       "Switch off verbose mode", NULL },
   201     { NULL }
   202 };
   203 
/*
 * Options relating to configuration which make no sense from inside
 * a configuration file.
 *
 * They are registered for the command line by parse_options() but, being
 * kept out of options[], are never read from or written to the
 * configuration file.
 */

static GOptionEntry config_options[]={
    { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
      "Defaults for use on www upload", NULL },
    { "dump-config", 0, 0, G_OPTION_ARG_NONE, pswit+DUMP_CONFIG_SWITCH,
      "Dump current config settings", NULL },
    { NULL }
};
   216 
/*
 * Gutcheck-compatible toggle options, registered by parse_options() in
 * their own "compatibility" option group. They only set typo_compat and
 * paranoid_compat; parse_options() then flips the matching pswit[] entries.
 */
static GOptionEntry compatibility_options[]={
    { "toggle-typo", 't', 0, G_OPTION_ARG_NONE, &typo_compat,
      "Toggle checking for common typos", NULL },
    { "toggle-relaxed", 'x', 0, G_OPTION_ARG_NONE, &paranoid_compat,
      "Toggle both paranoid mode and common typos", NULL },
    { NULL }
};
   224 
/*
 * Query counters. In overview mode main() prints each non-zero cnt_*
 * value listed in its summary and their total; cnt_spacend is reported
 * separately and is not part of that total.
 */
long cnt_dquot;		/* for overview mode, count of doublequote queries */
long cnt_squot;		/* for overview mode, count of singlequote queries */
long cnt_brack;		/* for overview mode, count of brackets queries */
long cnt_bin;		/* for overview mode, count of non-ASCII queries */
long cnt_odd;		/* for overview mode, count of odd character queries */
long cnt_long;		/* for overview mode, count of long line errors */
long cnt_short;		/* for overview mode, count of short line queries */
long cnt_punct;		/* for overview mode,
			   count of punctuation and spacing queries */
long cnt_dash;		/* for overview mode, count of dash-related queries */
long cnt_word;		/* for overview mode, count of word queries */
long cnt_html;		/* for overview mode, count of html queries */
long cnt_lineend;	/* for overview mode, count of line-end queries */
long cnt_spacend;	/* count of lines with space at end */
long linecnt;		/* count of total lines in the file */
long checked_linecnt;	/* count of lines actually checked */
   241 
   242 void proghelp(GOptionContext *context);
   243 void procfile(const char *);
   244 
/*
 * Directory part of argv[0], set at the start of main() and freed at its
 * end. Used as a fallback location when searching for bookloupe.ini and
 * for the user typo files.
 */
gchar *running_from;
   246 
   247 gboolean mixdigit(const char *);
   248 gchar *getaword(const char **);
   249 char *flgets(char **,long);
   250 void postprocess_for_HTML(char *);
   251 char *linehasmarkup(char *);
   252 char *losemarkup(char *);
   253 gboolean tagcomp(const char *,const char *);
   254 void loseentities(char *);
   255 gboolean isroman(const char *);
   256 void postprocess_for_DP(char *);
   257 void print_as_windows_1252(const char *string);
   258 void print_as_utf_8(const char *string);
   259 
/*
 * NOTE(review): qword and qperiod are not referenced in the part of the
 * file reviewed here; presumably they record words and periods that have
 * already been queried -- confirm against their users.
 */
GTree *qword,*qperiod;

#ifdef __WIN32__
/* Console output code page saved by main() for cleanup_on_exit(). */
UINT saved_cp;
#endif

/*
 * Configuration loaded by parse_config_file(), or NULL when no
 * configuration file was found. dump_config() creates one on demand.
 */
GKeyFile *config;
   267 
   268 void config_file_update(GKeyFile *kf)
   269 {
   270     int i;
   271     gboolean sw;
   272     for(i=0;options[i].long_name;i++)
   273     {
   274 	if (g_str_has_prefix(options[i].long_name,"no-"))
   275 	    continue;
   276 	if (options[i].arg==G_OPTION_ARG_NONE)
   277 	{
   278 	    sw=*(gboolean *)options[i].arg_data;
   279 	    if (options[i].flags&G_OPTION_FLAG_REVERSE)
   280 		sw=!sw;
   281 	    g_key_file_set_boolean(kf,"options",options[i].long_name,sw);
   282 	}
   283 	else
   284 	    g_assert_not_reached();
   285     }
   286 }
   287 
   288 void config_file_add_comments(GKeyFile *kf)
   289 {
   290     int i;
   291     gchar *comment;
   292     g_key_file_set_comment(kf,NULL,NULL," Default configuration for bookloupe",
   293       NULL);
   294     for(i=0;options[i].long_name;i++)
   295     {
   296 	if (g_str_has_prefix(options[i].long_name,"no-"))
   297 	    continue;
   298 	comment=g_strconcat(" ",options[i].description,NULL);
   299 	g_key_file_set_comment(kf,"options",options[i].long_name,comment,NULL);
   300 	g_free(comment);
   301     }
   302 }
   303 
   304 void dump_config(void)
   305 {
   306     gchar *s;
   307     if (config)
   308 	config_file_update(config);
   309     else
   310     {
   311 	config=g_key_file_new();
   312 	config_file_update(config);
   313 	config_file_add_comments(config);
   314     }
   315     s=g_key_file_to_data(config,NULL,NULL);
   316     if (s)
   317 	g_print("%s",s);
   318     g_free(s);
   319 }
   320 
   321 GKeyFile *read_config_file(gchar **full_path)
   322 {
   323     int i;
   324     GError *err=NULL;
   325     gchar **search_dirs;
   326     gchar *path;
   327     const char *search_path;
   328     GKeyFile *kf;
   329     kf=g_key_file_new();
   330     search_path=g_getenv("BOOKLOUPE_CONFIG_PATH");
   331     if (search_path)
   332     {
   333 #ifdef __WIN32__
   334 	search_dirs=g_strsplit(search_path,";",0);
   335 #else
   336 	search_dirs=g_strsplit(search_path,":",0);
   337 #endif
   338     }
   339     else
   340     {
   341 	search_dirs=g_new(gchar *,4);
   342 	search_dirs[0]=g_get_current_dir();
   343 	search_dirs[1]=g_strdup(running_from);
   344 	search_dirs[2]=g_strdup(g_get_user_config_dir());
   345 	search_dirs[3]=NULL;
   346     }
   347     for(i=0;search_dirs[i];i++)
   348     {
   349 	path=g_build_filename(search_dirs[i],"bookloupe.ini",NULL);
   350 	if (g_key_file_load_from_file(kf,path,
   351 	  G_KEY_FILE_KEEP_COMMENTS|G_KEY_FILE_KEEP_TRANSLATIONS,&err))
   352 	    break;
   353 	if (!g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   354 	{
   355 	    g_printerr("Bookloupe: Error reading %s\n",path);
   356 	    g_printerr("%s\n",err->message);
   357 	    exit(1);
   358 	}
   359 	g_clear_error(&err);
   360 	g_free(path);
   361 	path=NULL;
   362     }
   363     if (!search_dirs[i])
   364     {
   365 	g_key_file_free(kf);
   366 	kf=NULL;
   367     }
   368     g_strfreev(search_dirs);
   369     if (full_path && kf)
   370 	*full_path=path;
   371     else
   372 	g_free(path);
   373     return kf;
   374 }
   375 
   376 void parse_config_file(void)
   377 {
   378     int i,j;
   379     gchar *path;
   380     gchar **keys;
   381     gboolean sw;
   382     GError *err=NULL;
   383     config=read_config_file(&path);
   384     if (config)
   385 	keys=g_key_file_get_keys(config,"options",NULL,NULL);
   386     else
   387 	keys=NULL;
   388     if (keys)
   389     {
   390 	for(i=0;keys[i];i++)
   391 	{
   392 	    for(j=0;options[j].long_name;j++)
   393 	    {
   394 		if (g_str_has_prefix(options[j].long_name,"no-"))
   395 		    continue;
   396 		else if (!strcmp(keys[i],options[j].long_name))
   397 		{
   398 		    if (options[j].arg==G_OPTION_ARG_NONE)
   399 		    {
   400 			sw=g_key_file_get_boolean(config,"options",keys[i],
   401 			  &err);
   402 			if (err)
   403 			{
   404 			    g_printerr("Bookloupe: %s: options.%s: %s\n",
   405 			      path,keys[i],err->message);
   406 			    g_clear_error(&err);
   407 			}
   408 			if (options[j].flags&G_OPTION_FLAG_REVERSE)
   409 			    sw=!sw;
   410 			*(gboolean *)options[j].arg_data=sw;
   411 			break;
   412 		    }
   413 		    else
   414 			g_assert_not_reached();
   415 		}
   416 	    }
   417 	    if (!options[j].long_name)
   418 		g_printerr("Bookloupe: %s: Unknown option \"%s\" ignored\n",
   419 		  path,keys[i]);
   420 	}
   421 	g_strfreev(keys);
   422     }
   423     if (config)
   424 	g_free(path);
   425 }
   426 
   427 void parse_options(int *argc,char ***argv)
   428 {
   429     GError *err=NULL;
   430     GOptionContext *context;
   431     GOptionGroup *compatibility;
   432     context=g_option_context_new(
   433       "file - look for errors in Project Gutenberg(TM) etexts");
   434     g_option_context_add_main_entries(context,options,NULL);
   435     g_option_context_add_main_entries(context,config_options,NULL);
   436     compatibility=g_option_group_new("compatibility",
   437       "Options for Compatibility with Gutcheck:",
   438       "Show compatibility options",NULL,NULL);
   439     g_option_group_add_entries(compatibility,compatibility_options);
   440     g_option_context_add_group(context,compatibility);
   441     g_option_context_set_description(context,
   442       "For simplicity, only the switch options which reverse the\n"
   443       "default configuration are listed. In most cases, both vanilla\n"
   444       "and \"no-\" prefixed versions are available for use.");
   445     if (!g_option_context_parse(context,argc,argv,&err))
   446     {
   447 	g_printerr("Bookloupe: %s\n",err->message);
   448 	g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
   449 	exit(1);
   450     }
   451     if (typo_compat)
   452 	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
   453     if (paranoid_compat)
   454     {
   455 	pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
   456 	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
   457     }
   458     /*
   459      * Web uploads - for the moment, this is really just a placeholder
   460      * until we decide what processing we really want to do on web uploads
   461      */
   462     if (pswit[WEB_SWITCH])
   463     {
   464 	/* specific override for web uploads */
   465 	pswit[ECHO_SWITCH]=TRUE;
   466 	pswit[SQUOTE_SWITCH]=FALSE;
   467 	pswit[TYPO_SWITCH]=TRUE;
   468 	pswit[QPARA_SWITCH]=FALSE;
   469 	pswit[PARANOID_SWITCH]=TRUE;
   470 	pswit[LINE_END_SWITCH]=FALSE;
   471 	pswit[OVERVIEW_SWITCH]=FALSE;
   472 	pswit[STDOUT_SWITCH]=FALSE;
   473 	pswit[HEADER_SWITCH]=TRUE;
   474 	pswit[VERBOSE_SWITCH]=FALSE;
   475 	pswit[MARKUP_SWITCH]=FALSE;
   476 	pswit[USERTYPO_SWITCH]=FALSE;
   477 	pswit[DP_SWITCH]=FALSE;
   478     }
   479     if (pswit[DUMP_CONFIG_SWITCH])
   480     {
   481 	dump_config();
   482 	exit(0);
   483     }
   484     if (pswit[OVERVIEW_SWITCH])
   485 	/* just print summary; don't echo */
   486 	pswit[ECHO_SWITCH]=FALSE;
   487     if (*argc<2)
   488     {
   489 	proghelp(context);
   490 	exit(1);
   491     }
   492     g_option_context_free(context);
   493 }
   494 
   495 /*
   496  * read_user_scannos:
   497  *
   498  * Read in the user-defined stealth scanno list.
   499  */
   500 void read_user_scannos(void)
   501 {
   502     GError *err=NULL;
   503     gchar *usertypo_file;
   504     gboolean okay;
   505     int i;
   506     gsize len,nb;
   507     gchar *contents,*utf8,**lines;
   508     usertypo_file=g_strdup("bookloupe.typ");
   509     okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   510     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   511     {
   512 	g_clear_error(&err);
   513 	g_free(usertypo_file);
   514 	usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
   515 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   516     }
   517     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   518     {
   519 	g_clear_error(&err);
   520 	g_free(usertypo_file);
   521 	usertypo_file=g_strdup("gutcheck.typ");
   522 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   523     }
   524     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   525     {
   526 	g_clear_error(&err);
   527 	g_free(usertypo_file);
   528 	usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
   529 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   530     }
   531     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   532     {
   533 	g_free(usertypo_file);
   534 	g_print("   --> I couldn't find bookloupe.typ "
   535 	  "-- proceeding without user typos.\n");
   536 	return;
   537     }
   538     else if (!okay)
   539     {
   540 	fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
   541 	g_free(usertypo_file);
   542 	g_clear_error(&err);
   543 	exit(1);
   544     }
   545     if (g_utf8_validate(contents,len,NULL))
   546 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
   547     else
   548 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
   549     g_free(contents);
   550     lines=g_strsplit_set(utf8,"\r\n",0);
   551     g_free(utf8);
   552     usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
   553     for (i=0;lines[i];i++)
   554 	if (*(unsigned char *)lines[i]>'!')
   555 	    g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
   556 	else
   557 	    g_free(lines[i]);
   558     g_free(lines);
   559 }
   560 
   561 /*
   562  * read_etext:
   563  *
   564  * Read an etext returning a newly allocated string containing the file
   565  * contents or NULL on error.
   566  */
   567 gchar *read_etext(const char *filename,GError **err)
   568 {
   569     GError *tmp_err=NULL;
   570     gchar *contents,*utf8;
   571     gsize len,bytes_read,bytes_written;
   572     int i,line,col;
   573     if (!g_file_get_contents(filename,&contents,&len,err))
   574 	return NULL;
   575     if (g_utf8_validate(contents,len,NULL))
   576     {
   577 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
   578 	g_set_print_handler(print_as_utf_8);
   579 #ifdef __WIN32__
   580 	SetConsoleOutputCP(CP_UTF8);
   581 #endif
   582     }
   583     else
   584     {
   585 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,
   586 	  &bytes_written,&tmp_err);
   587 	if (g_error_matches(tmp_err,G_CONVERT_ERROR,
   588 	  G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
   589 	{
   590 	    line=col=1;
   591 	    for(i=0;i<bytes_read;i++)
   592 		if (contents[i]=='\n')
   593 		{
   594 		    line++;
   595 		    col=1;
   596 		}
   597 		else if (contents[i]!='\r')
   598 		    col++;
   599 	    g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
   600 	      "Input conversion failed. Byte %d at line %d, column %d is not a "
   601 	      "valid Windows-1252 character",
   602 	      ((unsigned char *)contents)[bytes_read],line,col);
   603 	}
   604 	else if (tmp_err)
   605 	    g_propagate_error(err,tmp_err);
   606 	g_set_print_handler(print_as_windows_1252);
   607 #ifdef __WIN32__
   608 	SetConsoleOutputCP(1252);
   609 #endif
   610     }
   611     g_free(contents);
   612     return utf8;
   613 }
   614 
   615 void cleanup_on_exit(void)
   616 {
   617 #ifdef __WIN32__
   618     SetConsoleOutputCP(saved_cp);
   619 #endif
   620 }
   621 
   622 int main(int argc,char **argv)
   623 {
   624 #ifdef __WIN32__
   625     atexit(cleanup_on_exit);
   626     saved_cp=GetConsoleOutputCP();
   627 #endif
   628     running_from=g_path_get_dirname(argv[0]);
   629     /* Paranoid checking is turned OFF, not on, by its switch */
   630     pswit[PARANOID_SWITCH]=TRUE;
   631     /* if running in paranoid mode, typo checks default to enabled */
   632     pswit[TYPO_SWITCH]=TRUE;
   633     /* Line-end checking is turned OFF, not on, by its switch */
   634     pswit[LINE_END_SWITCH]=TRUE;
   635     /* Echoing is turned OFF, not on, by its switch */
   636     pswit[ECHO_SWITCH]=TRUE;
   637     parse_config_file();
   638     parse_options(&argc,&argv);
   639     if (pswit[USERTYPO_SWITCH])
   640 	read_user_scannos();
   641     fprintf(stderr,"bookloupe: Check and report on an e-text\n");
   642     procfile(argv[1]);
   643     if (pswit[OVERVIEW_SWITCH])
   644     {
   645 	g_print("    Checked %ld lines of %ld (head+foot = %ld)\n\n",
   646 	  checked_linecnt,linecnt,linecnt-checked_linecnt);
   647 	g_print("    --------------- Queries found --------------\n");
   648 	if (cnt_long)
   649 	    g_print("    Long lines:		    %14ld\n",cnt_long);
   650 	if (cnt_short)
   651 	    g_print("    Short lines:		   %14ld\n",cnt_short);
   652 	if (cnt_lineend)
   653 	    g_print("    Line-end problems:	     %14ld\n",cnt_lineend);
   654 	if (cnt_word)
   655 	    g_print("    Common typos:		  %14ld\n",cnt_word);
   656 	if (cnt_dquot)
   657 	    g_print("    Unmatched quotes:	      %14ld\n",cnt_dquot);
   658 	if (cnt_squot)
   659 	    g_print("    Unmatched SingleQuotes:	%14ld\n",cnt_squot);
   660 	if (cnt_brack)
   661 	    g_print("    Unmatched brackets:	    %14ld\n",cnt_brack);
   662 	if (cnt_bin)
   663 	    g_print("    Non-ASCII characters:	  %14ld\n",cnt_bin);
   664 	if (cnt_odd)
   665 	    g_print("    Proofing characters:	   %14ld\n",cnt_odd);
   666 	if (cnt_punct)
   667 	    g_print("    Punctuation & spacing queries: %14ld\n",cnt_punct);
   668 	if (cnt_dash)
   669 	    g_print("    Non-standard dashes:	   %14ld\n",cnt_dash);
   670 	if (cnt_html)
   671 	    g_print("    Possible HTML tags:	    %14ld\n",cnt_html);
   672 	g_print("\n");
   673 	g_print("    TOTAL QUERIES		  %14ld\n",
   674 	  cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+
   675 	  cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend);
   676     }
   677     g_free(running_from);
   678     if (usertypo)
   679 	g_tree_unref(usertypo);
   680     if (config)
   681 	g_key_file_free(config);
   682     return 0;
   683 }
   684 
   685 /*
   686  * first_pass:
   687  *
   688  * Run a first pass - verify that it's a valid PG
   689  * file, decide whether to report some things that
   690  * occur many times in the text like long or short
   691  * lines, non-standard dashes, etc.
   692  */
   693 struct first_pass_results *first_pass(const char *etext)
   694 {
   695     gunichar laststart=CHAR_SPACE;
   696     const char *s;
   697     gchar *lc_line;
   698     int i,j,lbytes,llen;
   699     gchar **lines;
   700     unsigned int lastlen=0,lastblen=0;
   701     long spline=0,nspline=0;
   702     static struct first_pass_results results={0};
   703     gchar *inword;
   704     lines=g_strsplit(etext,"\n",0);
   705     for (j=0;lines[j];j++)
   706     {
   707 	lbytes=strlen(lines[j]);
   708 	while (lbytes>0 && lines[j][lbytes-1]=='\r')
   709 	    lines[j][--lbytes]='\0';
   710 	llen=g_utf8_strlen(lines[j],lbytes);
   711 	linecnt++;
   712 	if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
   713 	  (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
   714 	{
   715 	    if (spline)
   716 		g_print("   --> Duplicate header?\n");
   717 	    spline=linecnt+1;   /* first line of non-header text, that is */
   718 	}
   719 	if (!strncmp(lines[j],"*** START",9) &&
   720 	  strstr(lines[j],"PROJECT GUTENBERG"))
   721 	{
   722 	    if (nspline)
   723 		g_print("   --> Duplicate header?\n");
   724 	    nspline=linecnt+1;   /* first line of non-header text, that is */
   725 	}
   726 	if (spline || nspline)
   727 	{
   728 	    lc_line=g_utf8_strdown(lines[j],lbytes);
   729 	    if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
   730 	    {
   731 		if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
   732 		{
   733 		    if (results.footerline)
   734 		    {
   735 			/* it's an old-form header - we can detect duplicates */
   736 			if (!nspline)
   737 			    g_print("   --> Duplicate footer?\n");
   738 		    }
   739 		    else
   740 			results.footerline=linecnt;
   741 		}
   742 	    }
   743 	    g_free(lc_line);
   744 	}
   745 	if (spline)
   746 	    results.firstline=spline;
   747 	if (nspline)
   748 	    results.firstline=nspline;  /* override with new */
   749 	if (results.footerline)
   750 	    continue;    /* don't count the boilerplate in the footer */
   751 	results.totlen+=llen;
   752 	for (s=lines[j];*s;s=g_utf8_next_char(s))
   753 	{
   754 	    if (g_utf8_get_char(s)>127)
   755 		results.binlen++;
   756 	    if (g_unichar_isalpha(g_utf8_get_char(s)))
   757 		results.alphalen++;
   758 	    if (s>lines[j] && g_utf8_get_char(s)==CHAR_DQUOTE &&
   759 	      isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
   760 		results.endquote_count++;
   761 	}
   762 	if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
   763 	  lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
   764 	    results.shortline++;
   765 	if (lbytes>0 &&
   766 	  g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
   767 	    cnt_spacend++;
   768 	if (strstr(lines[j],".,"))
   769 	    results.dotcomma++;
   770 	/* only count ast lines for ignoring purposes where there is */
   771 	/* locase text on the line */
   772 	if (strchr(lines[j],'*'))
   773 	{
   774 	    for (s=lines[j];*s;s=g_utf8_next_char(s))
   775 		if (g_unichar_islower(g_utf8_get_char(s)))
   776 		    break;
   777 	    if (*s)
   778 		results.astline++;
   779 	}
   780 	if (strchr(lines[j],'/'))
   781 	    results.fslashline++;
   782 	if (lbytes>0)
   783 	{
   784 	    for (s=g_utf8_prev_char(lines[j]+lbytes);
   785 	      s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;
   786 	      s=g_utf8_prev_char(s))
   787 		;
   788 	    if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
   789 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')
   790 		results.hyphens++;
   791 	}
   792 	if (llen>LONGEST_PG_LINE)
   793 	    results.longline++;
   794 	if (llen>WAY_TOO_LONG)
   795 	    results.verylongline++;
   796 	if (strchr(lines[j],'<') && strchr(lines[j],'>'))
   797 	{
   798 	    i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
   799 	    if (i>0)
   800 		results.htmcount++;
   801 	    if (strstr(lines[j],"<i>"))
   802 		results.htmcount+=4; /* bonus marks! */
   803 	}
   804 	/* Check for spaced em-dashes */
   805 	if (lines[j][0] && (s=strstr(g_utf8_next_char(lines[j]),"--")))
   806 	{
   807 	    results.emdash++;
   808 	    if (s[-1]==CHAR_SPACE || s[2]==CHAR_SPACE)
   809 		results.space_emdash++;
   810 	    if (s[-1]==CHAR_SPACE && s[2]==CHAR_SPACE)
   811 		/* count of em-dashes with spaces both sides */
   812 		results.non_PG_space_emdash++;
   813 	    if (s[-1]!=CHAR_SPACE && s[2]!=CHAR_SPACE)
   814 		/* count of PG-type em-dashes with no spaces */
   815 		results.PG_space_emdash++;
   816 	}
   817 	for (s=lines[j];*s;)
   818 	{
   819 	    inword=getaword(&s);
   820 	    if (!strcmp(inword,"hij") || !strcmp(inword,"niet")) 
   821 		results.Dutchcount++;
   822 	    if (!strcmp(inword,"dans") || !strcmp(inword,"avec")) 
   823 		results.Frenchcount++;
   824 	    if (!strcmp(inword,"0") || !strcmp(inword,"1")) 
   825 		results.standalone_digit++;
   826 	    g_free(inword);
   827 	}
   828 	/* Check for spaced dashes */
   829 	if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
   830 	    results.spacedash++;
   831 	lastblen=lastlen;
   832 	lastlen=llen;
   833 	laststart=lines[j][0];
   834     }
   835     g_strfreev(lines);
   836     return &results;
   837 }
   838 
   839 /*
   840  * report_first_pass:
   841  *
   842  * Make some snap decisions based on the first pass results.
   843  */
   844 struct warnings *report_first_pass(struct first_pass_results *results)
   845 {
   846     static struct warnings warnings={0};
   847     if (cnt_spacend>0)
   848 	g_print("   --> %ld lines in this file have white space at end\n",
   849 	  cnt_spacend);
   850     warnings.dotcomma=1;
   851     if (results->dotcomma>5)
   852     {
   853 	warnings.dotcomma=0;
   854 	g_print("   --> %ld lines in this file contain '.,'. "
   855 	  "Not reporting them.\n",results->dotcomma);
   856     }
   857     /*
   858      * If more than 50 lines, or one-tenth, are short,
   859      * don't bother reporting them.
   860      */
   861     warnings.shortline=1;
   862     if (results->shortline>50 || results->shortline*10>linecnt)
   863     {
   864 	warnings.shortline=0;
   865 	g_print("   --> %ld lines in this file are short. "
   866 	  "Not reporting short lines.\n",results->shortline);
   867     }
   868     /*
   869      * If more than 50 lines, or one-tenth, are long,
   870      * don't bother reporting them.
   871      */
   872     warnings.longline=1;
   873     if (results->longline>50 || results->longline*10>linecnt)
   874     {
   875 	warnings.longline=0;
   876 	g_print("   --> %ld lines in this file are long. "
   877 	  "Not reporting long lines.\n",results->longline);
   878     }
   879     /* If more than 10 lines contain asterisks, don't bother reporting them. */
   880     warnings.ast=1;
   881     if (results->astline>10)
   882     {
   883 	warnings.ast=0;
   884 	g_print("   --> %ld lines in this file contain asterisks. "
   885 	  "Not reporting them.\n",results->astline);
   886     }
   887     /*
   888      * If more than 10 lines contain forward slashes,
   889      * don't bother reporting them.
   890      */
   891     warnings.fslash=1;
   892     if (results->fslashline>10)
   893     {
   894 	warnings.fslash=0;
   895 	g_print("   --> %ld lines in this file contain forward slashes. "
   896 	  "Not reporting them.\n",results->fslashline);
   897     }
   898     /*
   899      * If more than 20 lines contain unpunctuated endquotes,
   900      * don't bother reporting them.
   901      */
   902     warnings.endquote=1;
   903     if (results->endquote_count>20)
   904     {
   905 	warnings.endquote=0;
   906 	g_print("   --> %ld lines in this file contain unpunctuated endquotes. "
   907 	  "Not reporting them.\n",results->endquote_count);
   908     }
   909     /*
   910      * If more than 15 lines contain standalone digits,
   911      * don't bother reporting them.
   912      */
   913     warnings.digit=1;
   914     if (results->standalone_digit>10)
   915     {
   916 	warnings.digit=0;
   917 	g_print("   --> %ld lines in this file contain standalone 0s and 1s. "
   918 	  "Not reporting them.\n",results->standalone_digit);
   919     }
   920     /*
   921      * If more than 20 lines contain hyphens at end,
   922      * don't bother reporting them.
   923      */
   924     warnings.hyphen=1;
   925     if (results->hyphens>20)
   926     {
   927 	warnings.hyphen=0;
   928 	g_print("   --> %ld lines in this file have hyphens at end. "
   929 	  "Not reporting them.\n",results->hyphens);
   930     }
   931     if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
   932     {
   933 	g_print("   --> Looks like this is HTML. Switching HTML mode ON.\n");
   934 	pswit[MARKUP_SWITCH]=1;
   935     }
   936     if (results->verylongline>0)
   937 	g_print("   --> %ld lines in this file are VERY long!\n",
   938 	  results->verylongline);
   939     /*
   940      * If there are more non-PG spaced dashes than PG em-dashes,
   941      * assume it's deliberate.
   942      * Current PG guidelines say don't use them, but older texts do,
   943      * and some people insist on them whatever the guidelines say.
   944      */
   945     warnings.dash=1;
   946     if (results->spacedash+results->non_PG_space_emdash>
   947       results->PG_space_emdash)
   948     {
   949 	warnings.dash=0;
   950 	g_print("   --> There are %ld spaced dashes and em-dashes. "
   951 	  "Not reporting them.\n",
   952 	  results->spacedash+results->non_PG_space_emdash);
   953     }
   954     /* If more than a quarter of characters are hi-bit, bug out. */
   955     warnings.bin=1;
   956     if (results->binlen*4>results->totlen)
   957     {
   958 	g_print("   --> This file does not appear to be ASCII. "
   959 	  "Terminating. Best of luck with it!\n");
   960 	exit(1);
   961     }
   962     if (results->alphalen*4<results->totlen)
   963     {
   964 	g_print("   --> This file does not appear to be text. "
   965 	  "Terminating. Best of luck with it!\n");
   966 	exit(1);
   967     }
   968     if (results->binlen*100>results->totlen || results->binlen>100)
   969     {
   970 	g_print("   --> There are a lot of foreign letters here. "
   971 	  "Not reporting them.\n");
   972 	warnings.bin=0;
   973     }
   974     warnings.isDutch=FALSE;
   975     if (results->Dutchcount>50)
   976     {
   977 	warnings.isDutch=TRUE;
   978 	g_print("   --> This looks like Dutch - "
   979 	  "switching off dashes and warnings for 's Middags case.\n");
   980     }
   981     warnings.isFrench=FALSE;
   982     if (results->Frenchcount>50)
   983     {
   984 	warnings.isFrench=TRUE;
   985 	g_print("   --> This looks like French - "
   986 	  "switching off some doublepunct.\n");
   987     }
   988     if (results->firstline && results->footerline)
   989 	g_print("    The PG header and footer appear to be already on.\n");
   990     else
   991     {
   992 	if (results->firstline)
   993 	    g_print("    The PG header is on - no footer.\n");
   994 	if (results->footerline)
   995 	    g_print("    The PG footer is on - no header.\n");
   996     }
   997     g_print("\n");
   998     if (pswit[VERBOSE_SWITCH])
   999     {
  1000 	warnings.bin=1;
  1001 	warnings.shortline=1;
  1002 	warnings.dotcomma=1;
  1003 	warnings.longline=1;
  1004 	warnings.dash=1;
  1005 	warnings.digit=1;
  1006 	warnings.ast=1;
  1007 	warnings.fslash=1;
  1008 	warnings.hyphen=1;
  1009 	warnings.endquote=1;
  1010 	g_print("   *** Verbose output is ON -- you asked for it! ***\n");
  1011     }
  1012     if (warnings.isDutch)
  1013 	warnings.dash=0;
  1014     if (results->footerline>0 && results->firstline>0 &&
  1015       results->footerline>results->firstline &&
  1016       results->footerline-results->firstline<100)
  1017     {
  1018 	g_print("   --> I don't really know where this text starts. \n");
  1019 	g_print("       There are no reference points.\n");
  1020 	g_print("       I'm going to have to report the header and footer "
  1021 	  "as well.\n");
  1022 	results->firstline=0;
  1023     }
  1024     return &warnings;
  1025 }
  1026 
  1027 /*
  1028  * analyse_quotes:
  1029  *
  1030  * Look along the line, accumulate the count of quotes, and see
  1031  * if this is an empty line - i.e. a line with nothing on it
  1032  * but spaces.
  1033  * If line has just spaces, period, * and/or - on it, don't
  1034  * count it, since empty lines with asterisks or dashes to
  1035  * separate sections are common.
  1036  *
  1037  * Returns: TRUE if the line is empty.
  1038  */
  1039 gboolean analyse_quotes(const char *aline,struct counters *counters)
  1040 {
  1041     int guessquote=0;
  1042     /* assume the line is empty until proven otherwise */
  1043     gboolean isemptyline=TRUE;
  1044     const char *s=aline,*sprev,*snext;
  1045     gunichar c;
  1046     sprev=NULL;
  1047     while (*s)
  1048     {
  1049 	snext=g_utf8_next_char(s);
  1050 	c=g_utf8_get_char(s);
  1051 	if (c==CHAR_DQUOTE)
  1052 	    counters->quot++;
  1053 	if (CHAR_IS_SQUOTE(c))
  1054 	{
  1055 	    if (s==aline)
  1056 	    {
  1057 		/*
  1058 		 * At start of line, it can only be an openquote.
  1059 		 * Hardcode a very common exception!
  1060 		 */
  1061 		if (!g_str_has_prefix(snext,"tis") &&
  1062 		  !g_str_has_prefix(snext,"Tis"))
  1063 		    increment_matching(counters,c,TRUE);
  1064 	    }
  1065 	    else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
  1066 	      g_unichar_isalpha(g_utf8_get_char(snext)))
  1067 		/* Do nothing! it's definitely an apostrophe, not a quote */
  1068 		;
  1069 	    /* it's outside a word - let's check it out */
  1070 	    else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||
  1071 	      g_unichar_isalpha(g_utf8_get_char(snext)))
  1072 	    {
  1073 		/* it damwell better BE an openquote */
  1074 		if (!g_str_has_prefix(snext,"tis") &&
  1075 		  !g_str_has_prefix(snext,"Tis"))
  1076 		    /* hardcode a very common exception! */
  1077 		    increment_matching(counters,c,TRUE);
  1078 	    }
  1079 	    else
  1080 	    {
  1081 		/* now - is it a closequote? */
  1082 		guessquote=0;   /* accumulate clues */
  1083 		if (g_unichar_isalpha(g_utf8_get_char(sprev)))
  1084 		{
  1085 		    /* it follows a letter - could be either */
  1086 		    guessquote++;
  1087 		    if (g_utf8_get_char(sprev)=='s')
  1088 		    {
  1089 			/* looks like a plural apostrophe */
  1090 			guessquote-=3;
  1091 			if (g_utf8_get_char(snext)==CHAR_SPACE)
  1092 			    /* bonus marks! */
  1093 			    guessquote-=2;
  1094 		    }
  1095 		}
  1096 		/* it doesn't have a letter either side */
  1097 		else if (strchr(".?!,;:",g_utf8_get_char(sprev)) &&
  1098 		  strchr(".?!,;: ",g_utf8_get_char(snext)))
  1099 		    guessquote+=8; /* looks like a closequote */
  1100 		else
  1101 		    guessquote++;
  1102 		if (matching_difference(counters,CHAR_SQUOTE)>0)
  1103 		    /*
  1104 		     * Give it the benefit of some doubt,
  1105 		     * if a squote is already open.
  1106 		     */
  1107 		    guessquote++;
  1108 		else
  1109 		    guessquote--;
  1110 		if (guessquote>=0)
  1111 		    increment_matching(counters,c,FALSE);
  1112 	    }
  1113 	}
  1114 	if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
  1115 	  c!='\r' && c!='\n')
  1116 	    isemptyline=FALSE;  /* ignore lines like  *  *  *  as spacers */
  1117 	if (c==CHAR_UNDERSCORE)
  1118 	    counters->c_unders++;
  1119 	if (c==CHAR_OPEN_SBRACK)
  1120 	{
  1121 	    if (!matching_difference(counters,COUNTER_ILLUSTRATION) &&
  1122 	      !matching_difference(counters,c) && s==aline &&
  1123 	      g_str_has_prefix(s,"[Illustration:"))
  1124 		increment_matching(counters,COUNTER_ILLUSTRATION,TRUE);
  1125 	    else
  1126 		increment_matching(counters,c,TRUE);
  1127 	}
  1128 	else if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK)
  1129 	    increment_matching(counters,c,TRUE);
  1130 	if (c==CHAR_CLOSE_SBRACK)
  1131 	{
  1132 	    if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) &&
  1133 	      !matching_difference(counters,c) && !*snext)
  1134 		increment_matching(counters,COUNTER_ILLUSTRATION,FALSE);
  1135 	    else
  1136 		increment_matching(counters,c,FALSE);
  1137 	}
  1138 	else if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK)
  1139 	    increment_matching(counters,c,FALSE);
  1140 	sprev=s;
  1141 	s=snext;
  1142     }
  1143     return isemptyline;
  1144 }
  1145 
  1146 /*
  1147  * check_for_control_characters:
  1148  *
  1149  * Check for invalid or questionable characters in the line
  1150  * Anything above 127 is invalid for plain ASCII, and
  1151  * non-printable control characters should also be flagged.
  1152  * Tabs should generally not be there.
  1153  */
  1154 void check_for_control_characters(const char *aline)
  1155 {
  1156     gunichar c;
  1157     const char *s;
  1158     for (s=aline;*s;s=g_utf8_next_char(s))
  1159     {
  1160 	c=g_utf8_get_char(s);
  1161 	if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
  1162 	{
  1163 	    if (pswit[ECHO_SWITCH])
  1164 		g_print("\n%s\n",aline);
  1165 	    if (!pswit[OVERVIEW_SWITCH])
  1166 		g_print("    Line %ld column %ld - Control character %u\n",
  1167 		  linecnt,g_utf8_pointer_to_offset(s,aline)+1,c);
  1168 	    else
  1169 		cnt_bin++;
  1170 	}
  1171     }
  1172 }
  1173 
  1174 /*
  1175  * check_for_odd_characters:
  1176  *
  1177  * Check for binary and other odd characters.
  1178  */
  1179 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
  1180   gboolean isemptyline)
  1181 {
  1182     /* Don't repeat multiple warnings on one line. */
  1183     gboolean eNon_A=FALSE,eTab=FALSE,eTilde=FALSE;
  1184     gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
  1185     const char *s;
  1186     gunichar c;
  1187     for (s=aline;*s;s=g_utf8_next_char(s))
  1188     {
  1189 	c=g_utf8_get_char(s);
  1190 	if (!eNon_A && (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
  1191 	{
  1192 	    if (pswit[ECHO_SWITCH])
  1193 		g_print("\n%s\n",aline);
  1194 	    if (!pswit[OVERVIEW_SWITCH])
  1195 		if (c>127 && c<160 || c>255)
  1196 		    g_print("    Line %ld column %ld - "
  1197 		      "Non-ISO-8859 character %u\n",
  1198 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1199 		else
  1200 		    g_print("    Line %ld column %ld - "
  1201 		      "Non-ASCII character %u\n",
  1202 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1203 	    else
  1204 		cnt_bin++;
  1205 	    eNon_A=TRUE;
  1206 	}
  1207 	if (!eTab && c==CHAR_TAB)
  1208 	{
  1209 	    if (pswit[ECHO_SWITCH])
  1210 		g_print("\n%s\n",aline);
  1211 	    if (!pswit[OVERVIEW_SWITCH])
  1212 		g_print("    Line %ld column %ld - Tab character?\n",
  1213 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1214 	    else
  1215 		cnt_odd++;
  1216 	    eTab=TRUE;
  1217 	}
  1218 	if (!eTilde && c==CHAR_TILDE)
  1219 	{
  1220 	    /*
  1221 	     * Often used by OCR software to indicate an
  1222 	     * unrecognizable character.
  1223 	     */
  1224 	    if (pswit[ECHO_SWITCH])
  1225 		g_print("\n%s\n",aline);
  1226 	    if (!pswit[OVERVIEW_SWITCH])
  1227 		g_print("    Line %ld column %ld - Tilde character?\n",
  1228 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1229 	    else
  1230 		cnt_odd++;
  1231 	    eTilde=TRUE;
  1232 	}
  1233 	if (!eCarat && c==CHAR_CARAT)
  1234 	{  
  1235 	    if (pswit[ECHO_SWITCH])
  1236 		g_print("\n%s\n",aline);
  1237 	    if (!pswit[OVERVIEW_SWITCH])
  1238 		g_print("    Line %ld column %ld - Carat character?\n",
  1239 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1240 	    else
  1241 		cnt_odd++;
  1242 	    eCarat=TRUE;
  1243 	}
  1244 	if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
  1245 	{  
  1246 	    if (pswit[ECHO_SWITCH])
  1247 		g_print("\n%s\n",aline);
  1248 	    if (!pswit[OVERVIEW_SWITCH])
  1249 		g_print("    Line %ld column %ld - Forward slash?\n",
  1250 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1251 	    else
  1252 		cnt_odd++;
  1253 	    eFSlash=TRUE;
  1254 	}
  1255 	/*
  1256 	 * Report asterisks only in paranoid mode,
  1257 	 * since they're often deliberate.
  1258 	 */
  1259 	if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
  1260 	  c==CHAR_ASTERISK)
  1261 	{
  1262 	    if (pswit[ECHO_SWITCH])
  1263 		g_print("\n%s\n",aline);
  1264 	    if (!pswit[OVERVIEW_SWITCH])
  1265 		g_print("    Line %ld column %ld - Asterisk?\n",
  1266 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1267 	    else
  1268 		cnt_odd++;
  1269 	    eAst=TRUE;
  1270 	}
  1271     }
  1272 }
  1273 
  1274 /*
  1275  * check_for_long_line:
  1276  *
  1277  * Check for line too long.
  1278  */
  1279 void check_for_long_line(const char *aline)
  1280 {
  1281     if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
  1282     {
  1283 	if (pswit[ECHO_SWITCH])
  1284 	    g_print("\n%s\n",aline);
  1285 	if (!pswit[OVERVIEW_SWITCH])
  1286 	    g_print("    Line %ld column %ld - Long line %ld\n",
  1287 	      linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
  1288 	else
  1289 	    cnt_long++;
  1290     }
  1291 }
  1292 
  1293 /*
  1294  * check_for_short_line:
  1295  *
  1296  * Check for line too short.
  1297  *
  1298  * This one is a bit trickier to implement: we don't want to
  1299  * flag the last line of a paragraph for being short, so we
  1300  * have to wait until we know that our current line is a
  1301  * "normal" line, then report the _previous_ line if it was too
  1302  * short. We also don't want to report indented lines like
  1303  * chapter heads or formatted quotations. We therefore keep
  1304  * last->len as the length of the last line examined, and
  1305  * last->blen as the length of the last but one, and try to
  1306  * suppress unnecessary warnings by checking that both were of
  1307  * "normal" length. We keep the first character of the last
  1308  * line in last->start, and if it was a space, we assume that
  1309  * the formatting is deliberate. I can't figure out a way to
  1310  * distinguish something like a quoted verse left-aligned or
  1311  * the header or footer of a letter from a paragraph of short
  1312  * lines - maybe if I examined the whole paragraph, and if the
  1313  * para has less than, say, 8 lines and if all lines are short,
  1314  * then just assume it's OK? Need to look at some texts to see
  1315  * how often a formula like this would get the right result.
  1316  */
  1317 void check_for_short_line(const char *aline,const struct line_properties *last)
  1318 {
  1319     if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
  1320       last->len<SHORTEST_PG_LINE && last->blen>1 &&
  1321       last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
  1322     {
  1323 	if (pswit[ECHO_SWITCH])
  1324 	    g_print("\n%s\n",prevline);
  1325 	if (!pswit[OVERVIEW_SWITCH])
  1326 	    g_print("    Line %ld column %ld - Short line %ld?\n",
  1327 	      linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
  1328 	else
  1329 	    cnt_short++;
  1330     }
  1331 }
  1332 
  1333 /*
  1334  * check_for_starting_punctuation:
  1335  *
  1336  * Look for punctuation other than full ellipses at start of line.
  1337  */
  1338 void check_for_starting_punctuation(const char *aline)
  1339 {
  1340     if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
  1341       !g_str_has_prefix(aline,". . ."))
  1342     {
  1343 	if (pswit[ECHO_SWITCH])
  1344 	    g_print("\n%s\n",aline);
  1345 	if (!pswit[OVERVIEW_SWITCH])
  1346 	    g_print("    Line %ld column 1 - Begins with punctuation?\n",
  1347 	      linecnt);
  1348 	else
  1349 	    cnt_punct++;
  1350     }
  1351 }
  1352 
  1353 /*
  1354  * check_for_spaced_emdash:
  1355  *
  1356  * Check for spaced em-dashes.
  1357  *
  1358  * We must check _all_ occurrences of "--" on the line
  1359  * hence the loop - even if the first double-dash is OK
  1360  * there may be another that's wrong later on.
  1361  */
  1362 void check_for_spaced_emdash(const char *aline)
  1363 {
  1364     const char *s,*t,*next;
  1365     for (s=aline;t=strstr(s,"--");s=next)
  1366     {
  1367 	next=g_utf8_next_char(g_utf8_next_char(t));
  1368 	if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
  1369 	  g_utf8_get_char(next)==CHAR_SPACE)
  1370 	{
  1371 	    if (pswit[ECHO_SWITCH])
  1372 		g_print("\n%s\n",aline);
  1373 	    if (!pswit[OVERVIEW_SWITCH])
  1374 		g_print("    Line %ld column %ld - Spaced em-dash?\n",
  1375 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  1376 	    else
  1377 		cnt_dash++;
  1378 	}
  1379     }
  1380 }
  1381 
  1382 /*
  1383  * check_for_spaced_dash:
  1384  *
  1385  * Check for spaced dashes.
  1386  */
  1387 void check_for_spaced_dash(const char *aline)
  1388 {
  1389     const char *s;
  1390     if ((s=strstr(aline," -")))
  1391     {
  1392 	if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
  1393 	{
  1394 	    if (pswit[ECHO_SWITCH])
  1395 		g_print("\n%s\n",aline);
  1396 	    if (!pswit[OVERVIEW_SWITCH])
  1397 		g_print("    Line %ld column %ld - Spaced dash?\n",
  1398 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1399 	    else
  1400 		cnt_dash++;
  1401 	}
  1402     }
  1403     else if ((s=strstr(aline,"- ")))
  1404     {
  1405 	if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
  1406 	{
  1407 	    if (pswit[ECHO_SWITCH])
  1408 		g_print("\n%s\n",aline);
  1409 	    if (!pswit[OVERVIEW_SWITCH])
  1410 		g_print("    Line %ld column %ld - Spaced dash?\n",
  1411 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1412 	    else
  1413 		cnt_dash++;
  1414 	}
  1415     }
  1416 }
  1417 
  1418 /*
  1419  * check_for_unmarked_paragraphs:
  1420  *
  1421  * Check for unmarked paragraphs indicated by separate speakers.
  1422  *
  1423  * May well be false positive:
  1424  * "Bravo!" "Wonderful!" called the crowd.
  1425  * but useful all the same.
  1426  */
  1427 void check_for_unmarked_paragraphs(const char *aline)
  1428 {
  1429     const char *s;
  1430     s=strstr(aline,"\"  \"");
  1431     if (!s)
  1432 	s=strstr(aline,"\" \"");
  1433     if (s)
  1434     {
  1435 	if (pswit[ECHO_SWITCH])
  1436 	    g_print("\n%s\n",aline);
  1437 	if (!pswit[OVERVIEW_SWITCH])
  1438 	    g_print("    Line %ld column %ld - "
  1439 	      "Query missing paragraph break?\n",
  1440 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1441 	else
  1442 	    cnt_punct++;
  1443     }
  1444 }
  1445 
  1446 /*
  1447  * check_for_jeebies:
  1448  *
  1449  * Check for "to he" and other easy h/b errors.
  1450  *
  1451  * This is a very inadequate effort on the h/b problem,
  1452  * but the phrase "to he" is always an error, whereas "to
  1453  * be" is quite common.
  1454  * Similarly, '"Quiet!", be said.' is a non-be error
  1455  * "to he" is _not_ always an error!:
  1456  *       "Where they went to he couldn't say."
  1457  * Another false positive:
  1458  *       What would "Cinderella" be without the . . .
  1459  * and another: "If he wants to he can see for himself."
  1460  */
  1461 void check_for_jeebies(const char *aline)
  1462 {
  1463     const char *s;
  1464     s=strstr(aline," be could ");
  1465     if (!s)
  1466 	s=strstr(aline," be would ");
  1467     if (!s)
  1468 	s=strstr(aline," was be ");
  1469     if (!s)
  1470 	s=strstr(aline," be is ");
  1471     if (!s)
  1472 	s=strstr(aline," is be ");
  1473     if (!s)
  1474 	s=strstr(aline,"\", be ");
  1475     if (!s)
  1476 	s=strstr(aline,"\" be ");
  1477     if (!s)
  1478 	s=strstr(aline,"\" be ");
  1479     if (!s)
  1480 	s=strstr(aline," to he ");
  1481     if (s)
  1482     {
  1483 	if (pswit[ECHO_SWITCH])
  1484 	    g_print("\n%s\n",aline);
  1485 	if (!pswit[OVERVIEW_SWITCH])
  1486 	    g_print("    Line %ld column %ld - Query he/be error?\n",
  1487 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1488 	else
  1489 	    cnt_word++;
  1490     }
  1491     s=strstr(aline," the had ");
  1492     if (!s)
  1493 	s=strstr(aline," a had ");
  1494     if (!s)
  1495 	s=strstr(aline," they bad ");
  1496     if (!s)
  1497 	s=strstr(aline," she bad ");
  1498     if (!s)
  1499 	s=strstr(aline," he bad ");
  1500     if (!s)
  1501 	s=strstr(aline," you bad ");
  1502     if (!s)
  1503 	s=strstr(aline," i bad ");
  1504     if (s)
  1505     {
  1506 	if (pswit[ECHO_SWITCH])
  1507 	    g_print("\n%s\n",aline);
  1508 	if (!pswit[OVERVIEW_SWITCH])
  1509 	    g_print("    Line %ld column %ld - Query had/bad error?\n",
  1510 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1511 	else
  1512 	    cnt_word++;
  1513     }
  1514     s=strstr(aline,"; hut ");
  1515     if (!s)
  1516 	s=strstr(aline,", hut ");
  1517     if (s)
  1518     {
  1519 	if (pswit[ECHO_SWITCH])
  1520 	    g_print("\n%s\n",aline);
  1521 	if (!pswit[OVERVIEW_SWITCH])
  1522 	    g_print("    Line %ld column %ld - Query hut/but error?\n",
  1523 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1524 	else
  1525 	    cnt_word++;
  1526     }
  1527 }
  1528 
  1529 /*
  1530  * check_for_mta_from:
  1531  *
  1532  * Special case - angled bracket in front of "From" placed there by an
  1533  * MTA when sending an e-mail.
  1534  */
  1535 void check_for_mta_from(const char *aline)
  1536 {
  1537     const char *s;
  1538     s=strstr(aline,">From");
  1539     if (s)
  1540     {
  1541 	if (pswit[ECHO_SWITCH])
  1542 	    g_print("\n%s\n",aline);
  1543 	if (!pswit[OVERVIEW_SWITCH])
  1544 	    g_print("    Line %ld column %ld - "
  1545 	      "Query angled bracket with From\n",
  1546 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1547 	else
  1548 	    cnt_punct++;
  1549     }
  1550 }
  1551 
  1552 /*
  1553  * check_for_orphan_character:
  1554  *
  1555  * Check for a single character line -
  1556  * often an overflow from bad wrapping.
  1557  */
  1558 void check_for_orphan_character(const char *aline)
  1559 {
  1560     gunichar c;
  1561     c=g_utf8_get_char(aline);
  1562     if (c && !*g_utf8_next_char(aline))
  1563     {
  1564 	if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
  1565 	    ; /* Nothing - ignore numerals alone on a line. */
  1566 	else
  1567 	{
  1568 	    if (pswit[ECHO_SWITCH])
  1569 		g_print("\n%s\n",aline);
  1570 	    if (!pswit[OVERVIEW_SWITCH])
  1571 		g_print("    Line %ld column 1 - Query single character line\n",
  1572 		  linecnt);
  1573 	    else
  1574 		cnt_punct++;
  1575 	}
  1576     }
  1577 }
  1578 
  1579 /*
  1580  * check_for_pling_scanno:
  1581  *
  1582  * Check for I" - often should be !
  1583  */
  1584 void check_for_pling_scanno(const char *aline)
  1585 {
  1586     const char *s;
  1587     s=strstr(aline," I\"");
  1588     if (s)
  1589     {
  1590 	if (pswit[ECHO_SWITCH])
  1591 	    g_print("\n%s\n",aline);
  1592 	if (!pswit[OVERVIEW_SWITCH])
  1593 	    g_print("    Line %ld column %ld - Query I=exclamation mark?\n",
  1594 	      linecnt,g_utf8_pointer_to_offset(aline,s));
  1595 	else
  1596 	    cnt_punct++;
  1597     }
  1598 }
  1599 
  1600 /*
  1601  * check_for_extra_period:
  1602  *
  1603  * Check for period without a capital letter. Cut-down from gutspell.
  1604  * Only works when it happens on a single line.
  1605  */
  1606 void check_for_extra_period(const char *aline,const struct warnings *warnings)
  1607 {
  1608     const char *s,*t,*s1,*sprev;
  1609     int i;
  1610     gsize len;
  1611     gboolean istypo;
  1612     gchar *testword;
  1613     gunichar c,nc,pc,*decomposition;
  1614     if (pswit[PARANOID_SWITCH])
  1615     {
  1616 	for (t=aline;t=strstr(t,". ");)
  1617 	{
  1618 	    if (t==aline)
  1619 	    {
  1620 		t=g_utf8_next_char(t);
  1621 		/* start of line punctuation is handled elsewhere */
  1622 		continue;
  1623 	    }
  1624 	    if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
  1625 	    {
  1626 		t=g_utf8_next_char(t);
  1627 		continue;
  1628 	    }
  1629 	    if (warnings->isDutch)
  1630 	    {
  1631 		/* For Frank & Jeroen -- 's Middags case */
  1632 		gunichar c2,c3,c4,c5;
  1633 		c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
  1634 		c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
  1635 		c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
  1636 		c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
  1637 		if (CHAR_IS_APOSTROPHE(c2) &&
  1638 		  g_unichar_islower(c3) && c4==CHAR_SPACE &&
  1639 		  g_unichar_isupper(c5))
  1640 		{
  1641 		    t=g_utf8_next_char(t);
  1642 		    continue;
  1643 		}
  1644 	    }
  1645 	    s1=g_utf8_next_char(g_utf8_next_char(t));
  1646 	    while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
  1647 	      !isdigit(g_utf8_get_char(s1)))
  1648 		s1=g_utf8_next_char(s1);
  1649 	    if (g_unichar_islower(g_utf8_get_char(s1)))
  1650 	    {
  1651 		/* we have something to investigate */
  1652 		istypo=TRUE;
  1653 		/* so let's go back and find out */
  1654 		nc=g_utf8_get_char(t);
  1655 		s1=g_utf8_prev_char(t);
  1656 		c=g_utf8_get_char(s1);
  1657 		sprev=g_utf8_prev_char(s1);
  1658 		pc=g_utf8_get_char(sprev);
  1659 		while (s1>=aline &&
  1660 		  (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||
  1661 		  g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&
  1662 		  g_unichar_isalpha(nc)))
  1663 		{
  1664 		    nc=c;
  1665 		    s1=sprev;
  1666 		    c=pc;
  1667 		    sprev=g_utf8_prev_char(s1);
  1668 		    pc=g_utf8_get_char(sprev);
  1669 		}
  1670 		s1=g_utf8_next_char(s1);
  1671 		s=strchr(s1,'.');
  1672 		if (s)
  1673 		    testword=g_strndup(s1,s-s1);
  1674 		else
  1675 		    testword=g_strdup(s1);
  1676 		for (i=0;*abbrev[i];i++)
  1677 		    if (!strcmp(testword,abbrev[i]))
  1678 			istypo=FALSE;
  1679 		if (g_unichar_isdigit(g_utf8_get_char(testword)))
  1680 		    istypo=FALSE;
  1681 		if (!*g_utf8_next_char(testword))
  1682 		    istypo=FALSE;
  1683 		if (isroman(testword))
  1684 		    istypo=FALSE;
  1685 		if (istypo)
  1686 		{
  1687 		    istypo=FALSE;
  1688 		    for (s=testword;*s;s=g_utf8_next_char(s))
  1689 		    {
  1690 			decomposition=g_unicode_canonical_decomposition(
  1691 			  g_utf8_get_char(s),&len);
  1692 			if (g_utf8_strchr("aeiou",-1,decomposition[0]))
  1693 			    istypo=TRUE;
  1694 			g_free(decomposition);
  1695 		    }
  1696 		}
  1697 		if (istypo &&
  1698 		  (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
  1699 		{
  1700 		    g_tree_insert(qperiod,g_strdup(testword),
  1701 		      GINT_TO_POINTER(1));
  1702 		    if (pswit[ECHO_SWITCH])
  1703 			g_print("\n%s\n",aline);
  1704 		    if (!pswit[OVERVIEW_SWITCH])
  1705 			g_print("    Line %ld column %ld - Extra period?\n",
  1706 			  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  1707 		    else
  1708 			cnt_punct++;
  1709 		}
  1710 		g_free(testword);
  1711 	    }
  1712 	    t=g_utf8_next_char(t);
  1713 	}
  1714     }
  1715 }
  1716 
  1717 /*
  1718  * check_for_following_punctuation:
  1719  *
  1720  * Check for words usually not followed by punctuation.
  1721  */
  1722 void check_for_following_punctuation(const char *aline)
  1723 {
  1724     int i;
  1725     const char *s,*wordstart;
  1726     gunichar c;
  1727     gchar *inword,*t;
  1728     if (pswit[TYPO_SWITCH])
  1729     {
  1730 	for (s=aline;*s;)
  1731 	{
  1732 	    wordstart=s;
  1733 	    t=getaword(&s);
  1734 	    if (!*t)
  1735 	    {
  1736 		g_free(t);
  1737 		continue;
  1738 	    }
  1739 	    inword=g_utf8_strdown(t,-1);
  1740 	    g_free(t);
  1741 	    for (i=0;*nocomma[i];i++)
  1742 		if (!strcmp(inword,nocomma[i]))
  1743 		{
  1744 		    c=g_utf8_get_char(s);
  1745 		    if (c==',' || c==';' || c==':')
  1746 		    {
  1747 			if (pswit[ECHO_SWITCH])
  1748 			    g_print("\n%s\n",aline);
  1749 			if (!pswit[OVERVIEW_SWITCH])
  1750 			    g_print("    Line %ld column %ld - "
  1751 			      "Query punctuation after %s?\n",
  1752 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,
  1753 			      inword);
  1754 			else
  1755 			    cnt_punct++;
  1756 		    }
  1757 		}
  1758 	    for (i=0;*noperiod[i];i++)
  1759 		if (!strcmp(inword,noperiod[i]))
  1760 		{
  1761 		    c=g_utf8_get_char(s);
  1762 		    if (c=='.' || c=='!')
  1763 		    {
  1764 			if (pswit[ECHO_SWITCH])
  1765 			    g_print("\n%s\n",aline);
  1766 			if (!pswit[OVERVIEW_SWITCH])
  1767 			    g_print("    Line %ld column %ld - "
  1768 			      "Query punctuation after %s?\n",
  1769 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,
  1770 			      inword);
  1771 			else
  1772 			    cnt_punct++;
  1773 		    }
  1774 		}
  1775 	    g_free(inword);
  1776 	}
  1777     }
  1778 }
  1779 
  1780 /*
  1781  * check_for_typos:
  1782  *
  1783  * Check for commonly mistyped words,
  1784  * and digits like 0 for O in a word.
  1785  */
  1786 void check_for_typos(const char *aline,struct warnings *warnings)
  1787 {
  1788     const char *s,*t,*nt,*wordstart;
  1789     gchar *inword;
  1790     gunichar *decomposition;
  1791     gchar *testword;
  1792     int i,vowel,consonant,*dupcnt;
  1793     gboolean isdup,istypo,alower;
  1794     gunichar c,pc;
  1795     long offset,len;
  1796     gsize decomposition_len;
  1797     for (s=aline;*s;)
  1798     {
  1799 	wordstart=s;
  1800 	inword=getaword(&s);
  1801 	if (!*inword)
  1802 	{
  1803 	    g_free(inword);
  1804 	    continue; /* don't bother with empty lines */
  1805 	}
  1806 	if (mixdigit(inword))
  1807 	{
  1808 	    if (pswit[ECHO_SWITCH])
  1809 		g_print("\n%s\n",aline);
  1810 	    if (!pswit[OVERVIEW_SWITCH])
  1811 		g_print("    Line %ld column %ld - Query digit in %s\n",
  1812 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
  1813 	    else
  1814 		cnt_word++;
  1815 	}
  1816 	/*
  1817 	 * Put the word through a series of tests for likely typos and OCR
  1818 	 * errors.
  1819 	 */
  1820 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
  1821 	{
  1822 	    istypo=FALSE;
  1823 	    alower=FALSE;
  1824 	    for (t=inword;*t;t=g_utf8_next_char(t))
  1825 	    {
  1826 		c=g_utf8_get_char(t);
  1827 		nt=g_utf8_next_char(t);
  1828 		/* lowercase for testing */
  1829 		if (g_unichar_islower(c))
  1830 		    alower=TRUE;
  1831 		if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
  1832 		{
  1833 		    /*
  1834 		     * We have an uppercase mid-word. However, there are
  1835 		     * common cases:
  1836 		     *   Mac and Mc like McGill
  1837 		     *   French contractions like l'Abbe
  1838 		     */
  1839 		    offset=g_utf8_pointer_to_offset(inword,t);
  1840 		    if (offset>0)
  1841 			pc=g_utf8_get_char(g_utf8_prev_char(t));
  1842 		    else
  1843 			pc='\0';
  1844 		    if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
  1845 		      offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
  1846 		      g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
  1847 		      CHAR_IS_APOSTROPHE(pc))
  1848 			; /* do nothing! */
  1849 		    else
  1850 			istypo=TRUE;
  1851 		}
  1852 	    }
  1853 	    testword=g_utf8_casefold(inword,-1);
  1854 	}
  1855 	if (pswit[TYPO_SWITCH])
  1856 	{
  1857 	    /*
  1858 	     * Check for certain unlikely two-letter combinations at word
  1859 	     * start and end.
  1860 	     */
  1861 	    len=g_utf8_strlen(testword,-1);
  1862 	    if (len>1)
  1863 	    {
  1864 		for (i=0;*nostart[i];i++)
  1865 		    if (g_str_has_prefix(testword,nostart[i]))
  1866 			istypo=TRUE;
  1867 		for (i=0;*noend[i];i++)
  1868 		    if (g_str_has_suffix(testword,noend[i]))
  1869 			istypo=TRUE;
  1870 	    }
  1871 	    /* ght is common, gbt never. Like that. */
  1872 	    if (strstr(testword,"cb"))
  1873 		istypo=TRUE;
  1874 	    if (strstr(testword,"gbt"))
  1875 		istypo=TRUE;
  1876 	    if (strstr(testword,"pbt"))
  1877 		istypo=TRUE;
  1878 	    if (strstr(testword,"tbs"))
  1879 		istypo=TRUE;
  1880 	    if (strstr(testword,"mrn"))
  1881 		istypo=TRUE;
  1882 	    if (strstr(testword,"ahle"))
  1883 		istypo=TRUE;
  1884 	    if (strstr(testword,"ihle"))
  1885 		istypo=TRUE;
  1886 	    /*
  1887 	     * "TBE" does happen - like HEARTBEAT - but uncommon.
  1888 	     * Also "TBI" - frostbite, outbid - but uncommon.
  1889 	     * Similarly "ii" like Hawaii, or Pompeii, and in Roman
  1890 	     * numerals, but "ii" is a common scanno.
  1891 	     */
  1892 	    if (strstr(testword,"tbi"))
  1893 		istypo=TRUE;
  1894 	    if (strstr(testword,"tbe"))
  1895 		istypo=TRUE;
  1896 	    if (strstr(testword,"ii"))
  1897 		istypo=TRUE;
  1898 	    /*
  1899 	     * Check for no vowels or no consonants.
  1900 	     * If none, flag a typo.
  1901 	     */
  1902 	    if (!istypo && len>1)
  1903 	    {
  1904 		vowel=consonant=0;
  1905 		for (t=testword;*t;t=g_utf8_next_char(t))
  1906 		{
  1907 		    c=g_utf8_get_char(t);
  1908 		    decomposition=
  1909 		      g_unicode_canonical_decomposition(c,&decomposition_len);
  1910 		    if (c=='y' || g_unichar_isdigit(c))
  1911 		    {
  1912 			/* Yah, this is loose. */
  1913 			vowel++;
  1914 			consonant++;
  1915 		    }
  1916 		    else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
  1917 			vowel++;
  1918 		    else
  1919 			consonant++;
  1920 		    g_free(decomposition);
  1921 		}
  1922 		if (!vowel || !consonant)
  1923 		    istypo=TRUE;
  1924 	    }
  1925 	    /*
  1926 	     * Now exclude the word from being reported if it's in
  1927 	     * the okword list.
  1928 	     */
  1929 	    for (i=0;*okword[i];i++)
  1930 		if (!strcmp(testword,okword[i]))
  1931 		    istypo=FALSE;
  1932 	    /*
  1933 	     * What looks like a typo may be a Roman numeral.
  1934 	     * Exclude these.
  1935 	     */
  1936 	    if (istypo && isroman(testword))
  1937 		istypo=FALSE;
  1938 	    /* Check the manual list of typos. */
  1939 	    if (!istypo)
  1940 		for (i=0;*typo[i];i++)
  1941 		    if (!strcmp(testword,typo[i]))
  1942 			istypo=TRUE;
  1943 	    /*
  1944 	     * Check lowercase s, l, i and m - special cases.
  1945 	     *   "j" - often a semi-colon gone wrong.
  1946 	     *   "d" for a missing apostrophe - he d
  1947 	     *   "n" for "in"
  1948 	     */
  1949 	    if (!istypo && len==1 &&
  1950 	      g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
  1951 		istypo=TRUE;
  1952 	    if (istypo)
  1953 	    {
  1954 		dupcnt=g_tree_lookup(qword,testword);
  1955 		if (dupcnt)
  1956 		{
  1957 		    (*dupcnt)++;
  1958 		    isdup=!pswit[VERBOSE_SWITCH];
  1959 		}
  1960 		else
  1961 		{
  1962 		    dupcnt=g_new0(int,1);
  1963 		    g_tree_insert(qword,g_strdup(testword),dupcnt);
  1964 		    isdup=FALSE;
  1965 		}
  1966 		if (!isdup)
  1967 		{
  1968 		    if (pswit[ECHO_SWITCH])
  1969 			g_print("\n%s\n",aline);
  1970 		    if (!pswit[OVERVIEW_SWITCH])
  1971 		    {
  1972 			g_print("    Line %ld column %ld - Query word %s",
  1973 			  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
  1974 			  inword);
  1975 			if (!pswit[VERBOSE_SWITCH])
  1976 			    g_print(" - not reporting duplicates");
  1977 			g_print("\n");
  1978 		    }
  1979 		    else
  1980 			cnt_word++;
  1981 		}
  1982 	    }
  1983 	}
  1984 	/* check the user's list of typos */
  1985 	if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
  1986 	{
  1987 	    if (pswit[ECHO_SWITCH])
  1988 		g_print("\n%s\n",aline);
  1989 	    if (!pswit[OVERVIEW_SWITCH])  
  1990 		g_print("    Line %ld column %ld - Query possible scanno %s\n",
  1991 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
  1992 	}
  1993 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
  1994 	    g_free(testword);
  1995 	if (pswit[PARANOID_SWITCH] && warnings->digit)
  1996 	{
  1997 	    /* In paranoid mode, query all 0 and 1 standing alone. */
  1998 	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))
  1999 	    {
  2000 		if (pswit[ECHO_SWITCH])
  2001 		    g_print("\n%s\n",aline);
  2002 		if (!pswit[OVERVIEW_SWITCH])
  2003 		    g_print("    Line %ld column %ld - Query standalone %s\n",
  2004 		      linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
  2005 		      inword);
  2006 		else
  2007 		    cnt_word++;
  2008 	    }
  2009 	}
  2010 	g_free(inword);
  2011     }
  2012 }
  2013 
  2014 /*
  2015  * check_for_misspaced_punctuation:
  2016  *
  2017  * Look for added or missing spaces around punctuation and quotes.
  2018  * If there is a punctuation character like ! with no space on
  2019  * either side, suspect a missing!space. If there are spaces on
  2020  * both sides , assume a typo. If we see a double quote with no
  2021  * space or punctuation on either side of it, assume unspaced
  2022  * quotes "like"this.
  2023  */
  2024 void check_for_misspaced_punctuation(const char *aline,
  2025   struct parities *parities,gboolean isemptyline)
  2026 {
  2027     gboolean isacro,isellipsis;
  2028     const char *s;
  2029     gunichar c,nc,pc,n2c;
  2030     c=g_utf8_get_char(aline);
  2031     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2032     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2033     {
  2034 	pc=c;
  2035 	c=nc;
  2036 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2037 	/* For each character in the line after the first. */
  2038 	if (g_utf8_strchr(".?!,;:_",-1,c))  /* if it's punctuation */
  2039 	{
  2040 	    /* we need to suppress warnings for acronyms like M.D. */
  2041 	    isacro=FALSE;
  2042 	    /* we need to suppress warnings for ellipsis . . . */
  2043 	    isellipsis=FALSE;
  2044 	    /*
  2045 	     * If there are letters on both sides of it or
  2046 	     * if it's strict punctuation followed by an alpha.
  2047 	     */
  2048 	    if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
  2049 	      g_utf8_strchr("?!,;:",-1,c)))
  2050 	    {
  2051 		if (c=='.')
  2052 		{
  2053 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&
  2054 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
  2055 			isacro=TRUE;
  2056 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
  2057 		    if (nc && n2c=='.')
  2058 			isacro=TRUE;
  2059 		}
  2060 		if (!isacro)
  2061 		{
  2062 		    if (pswit[ECHO_SWITCH])
  2063 			g_print("\n%s\n",aline);
  2064 		    if (!pswit[OVERVIEW_SWITCH])
  2065 			g_print("    Line %ld column %ld - Missing space?\n",
  2066 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2067 		    else
  2068 			cnt_punct++;
  2069 		}
  2070 	    }
  2071 	    if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
  2072 	    {
  2073 		/*
  2074 		 * If there are spaces on both sides,
  2075 		 * or space before and end of line.
  2076 		 */
  2077 		if (c=='.')
  2078 		{
  2079 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&
  2080 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
  2081 			isellipsis=TRUE;
  2082 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
  2083 		    if (nc && n2c=='.')
  2084 			isellipsis=TRUE;
  2085 		}
  2086 		if (!isemptyline && !isellipsis)
  2087 		{
  2088 		    if (pswit[ECHO_SWITCH])
  2089 			g_print("\n%s\n",aline);
  2090 		    if (!pswit[OVERVIEW_SWITCH])
  2091 			g_print("    Line %ld column %ld - "
  2092 			  "Spaced punctuation?\n",linecnt,
  2093 			  g_utf8_pointer_to_offset(aline,s)+1);
  2094 		    else
  2095 			cnt_punct++;
  2096 		}
  2097 	    }
  2098 	}
  2099     }
  2100     /* Split out the characters that CANNOT be preceded by space. */
  2101     c=g_utf8_get_char(aline);
  2102     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2103     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2104     {
  2105 	pc=c;
  2106 	c=nc;
  2107 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2108 	/* for each character in the line after the first */
  2109 	if (g_utf8_strchr("?!,;:",-1,c))
  2110 	{
  2111 	    /* if it's punctuation that _cannot_ have a space before it */
  2112 	    if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
  2113 	    {
  2114 		/*
  2115 		 * If nc DOES == space,
  2116 		 * it was already reported just above.
  2117 		 */
  2118 		if (pswit[ECHO_SWITCH])
  2119 		    g_print("\n%s\n",aline);
  2120 		if (!pswit[OVERVIEW_SWITCH])
  2121 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",
  2122 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2123 		else
  2124 		    cnt_punct++;
  2125 	    }
  2126 	}
  2127     }
  2128     /*
  2129      * Special case " .X" where X is any alpha.
  2130      * This plugs a hole in the acronym code above.
  2131      * Inelegant, but maintainable.
  2132      */
  2133     c=g_utf8_get_char(aline);
  2134     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2135     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2136     {
  2137 	pc=c;
  2138 	c=nc;
  2139 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2140 	/* for each character in the line after the first */
  2141 	if (c=='.')
  2142 	{
  2143 	    /* if it's a period */
  2144 	    if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
  2145 	    {
  2146 		/*
  2147 		 * If the period follows a space and
  2148 		 * is followed by a letter.
  2149 		 */
  2150 		if (pswit[ECHO_SWITCH])
  2151 		    g_print("\n%s\n",aline);
  2152 		if (!pswit[OVERVIEW_SWITCH])
  2153 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",
  2154 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2155 		else
  2156 		    cnt_punct++;
  2157 	    }
  2158 	}
  2159     }
  2160     c=g_utf8_get_char(aline);
  2161     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2162     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2163     {
  2164 	pc=c;
  2165 	c=nc;
  2166 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2167 	/* for each character in the line after the first */
  2168 	if (c==CHAR_DQUOTE)
  2169 	{
  2170 	    if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
  2171 	      !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
  2172 	      !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
  2173 	    {
  2174 		if (pswit[ECHO_SWITCH])
  2175 		    g_print("\n%s\n",aline);
  2176 		if (!pswit[OVERVIEW_SWITCH])
  2177 		    g_print("    Line %ld column %ld - Unspaced quotes?\n",
  2178 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2179 		else
  2180 		    cnt_punct++;
  2181 	    }
  2182 	}
  2183     }
  2184     /* Check parity of quotes. */
  2185     nc=g_utf8_get_char(aline);
  2186     for (s=aline;*s;s=g_utf8_next_char(s))
  2187     {
  2188 	c=nc;
  2189 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2190 	if (c==CHAR_DQUOTE)
  2191 	{
  2192 	    parities->dquote=!parities->dquote;
  2193 	    if (!parities->dquote)
  2194 	    {
  2195 		/* parity even */
  2196 		if (!g_utf8_strchr("_-.'`/,;:!?)]} ",-1,nc))
  2197 		{
  2198 		    if (pswit[ECHO_SWITCH])
  2199 			g_print("\n%s\n",aline);
  2200 		    if (!pswit[OVERVIEW_SWITCH])
  2201 			g_print("    Line %ld column %ld - "
  2202 			  "Wrongspaced quotes?\n",
  2203 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2204 		    else
  2205 			cnt_punct++;
  2206 		}
  2207 	    }
  2208 	    else
  2209 	    {
  2210 		/* parity odd */
  2211 		if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
  2212 		  !g_utf8_strchr("_-/.'`([{$",-1,nc) || !nc)
  2213 		{
  2214 		    if (pswit[ECHO_SWITCH])
  2215 			g_print("\n%s\n",aline);
  2216 		    if (!pswit[OVERVIEW_SWITCH])
  2217 			g_print("    Line %ld column %ld - "
  2218 			  "Wrongspaced quotes?\n",
  2219 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2220 		    else
  2221 			cnt_punct++;
  2222 		}
  2223 	    }
  2224 	}
  2225     }
  2226     if (g_utf8_get_char(aline)==CHAR_DQUOTE)
  2227     {
  2228 	if (g_utf8_strchr(",;:!?)]} ",-1,
  2229 	  g_utf8_get_char(g_utf8_next_char(aline))))
  2230 	{
  2231 	    if (pswit[ECHO_SWITCH])
  2232 		g_print("\n%s\n",aline);
  2233 	    if (!pswit[OVERVIEW_SWITCH])
  2234 		g_print("    Line %ld column 1 - Wrongspaced quotes?\n",
  2235 		  linecnt);
  2236 	    else
  2237 		cnt_punct++;
  2238 	}
  2239     }
  2240     if (pswit[SQUOTE_SWITCH])
  2241     {
  2242 	nc=g_utf8_get_char(aline);
  2243 	for (s=aline;*s;s=g_utf8_next_char(s))
  2244 	{
  2245 	    c=nc;
  2246 	    nc=g_utf8_get_char(g_utf8_next_char(s));
  2247 	    if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&
  2248 	      !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
  2249 	      !g_unichar_isalpha(nc)))
  2250 	    {
  2251 		parities->squote=!parities->squote;
  2252 		if (!parities->squote)
  2253 		{
  2254 		    /* parity even */
  2255 		    if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
  2256 		    {
  2257 			if (pswit[ECHO_SWITCH])
  2258 			    g_print("\n%s\n",aline);
  2259 			if (!pswit[OVERVIEW_SWITCH])
  2260 			    g_print("    Line %ld column %ld - "
  2261 			      "Wrongspaced singlequotes?\n",
  2262 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2263 			else
  2264 			    cnt_punct++;
  2265 		    }
  2266 		}
  2267 		else
  2268 		{
  2269 		    /* parity odd */
  2270 		    if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
  2271 		      !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
  2272 		    {
  2273 			if (pswit[ECHO_SWITCH])
  2274 			    g_print("\n%s\n",aline);
  2275 			if (!pswit[OVERVIEW_SWITCH])
  2276 			    g_print("    Line %ld column %ld - "
  2277 			      "Wrongspaced singlequotes?\n",
  2278 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2279 			else
  2280 			    cnt_punct++;
  2281 		    }
  2282 		}
  2283 	    }
  2284 	}
  2285     }
  2286 }
  2287 
  2288 /*
  2289  * check_for_double_punctuation:
  2290  *
  2291  * Look for double punctuation like ,. or ,,
  2292  * Thanks to DW for the suggestion!
  2293  * In books with references, ".," and ".;" are common
  2294  * e.g. "etc., etc.," and vol. 1.; vol 3.;
  2295  * OTOH, from my initial tests, there are also fairly
  2296  * common errors. What to do? Make these cases paranoid?
  2297  * ".," is the most common, so warnings->dotcomma is used
  2298  * to suppress detailed reporting if it occurs often.
  2299  */
  2300 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
  2301 {
  2302     const char *s;
  2303     gunichar c,nc;
  2304     nc=g_utf8_get_char(aline);
  2305     for (s=aline;*s;s=g_utf8_next_char(s))
  2306     {
  2307 	c=nc;
  2308 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2309 	/* for each punctuation character in the line */
  2310 	if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
  2311 	  g_utf8_strchr(".?!,;:",-1,nc))
  2312 	{
  2313 	    /* followed by punctuation, it's a query, unless . . . */
  2314 	    if (c==nc && (c=='.' || c=='?' || c=='!') ||
  2315 	      !warnings->dotcomma && c=='.' && nc==',' ||
  2316 	      warnings->isFrench && g_str_has_prefix(s,",...") ||
  2317 	      warnings->isFrench && g_str_has_prefix(s,"...,") ||
  2318 	      warnings->isFrench && g_str_has_prefix(s,";...") ||
  2319 	      warnings->isFrench && g_str_has_prefix(s,"...;") ||
  2320 	      warnings->isFrench && g_str_has_prefix(s,":...") ||
  2321 	      warnings->isFrench && g_str_has_prefix(s,"...:") ||
  2322 	      warnings->isFrench && g_str_has_prefix(s,"!...") ||
  2323 	      warnings->isFrench && g_str_has_prefix(s,"...!") ||
  2324 	      warnings->isFrench && g_str_has_prefix(s,"?...") ||
  2325 	      warnings->isFrench && g_str_has_prefix(s,"...?"))
  2326 	    {
  2327 		if (warnings->isFrench && g_str_has_prefix(s,",...") ||
  2328 		  warnings->isFrench && g_str_has_prefix(s,"...,") ||
  2329 		  warnings->isFrench && g_str_has_prefix(s,";...") ||
  2330 		  warnings->isFrench && g_str_has_prefix(s,"...;") ||
  2331 		  warnings->isFrench && g_str_has_prefix(s,":...") ||
  2332 		  warnings->isFrench && g_str_has_prefix(s,"...:") ||
  2333 		  warnings->isFrench && g_str_has_prefix(s,"!...") ||
  2334 		  warnings->isFrench && g_str_has_prefix(s,"...!") ||
  2335 		  warnings->isFrench && g_str_has_prefix(s,"?...") ||
  2336 		  warnings->isFrench && g_str_has_prefix(s,"...?"))
  2337 		{
  2338 		    s+=4;
  2339 		    nc=g_utf8_get_char(g_utf8_next_char(s));
  2340 		}
  2341 		; /* do nothing for .. !! and ?? which can be legit */
  2342 	    }
  2343 	    else
  2344 	    {
  2345 		if (pswit[ECHO_SWITCH])
  2346 		    g_print("\n%s\n",aline);
  2347 		if (!pswit[OVERVIEW_SWITCH])
  2348 		    g_print("    Line %ld column %ld - Double punctuation?\n",
  2349 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2350 		else
  2351 		    cnt_punct++;
  2352 	    }
  2353 	}
  2354     }
  2355 }
  2356 
  2357 /*
  2358  * check_for_spaced_quotes:
  2359  */
  2360 void check_for_spaced_quotes(const char *aline)
  2361 {
  2362     int i;
  2363     const char *s,*t;
  2364     const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,
  2365       CHAR_RS_QUOTE};
  2366     GString *pattern;
  2367     s=aline;
  2368     while ((t=strstr(s," \" ")))
  2369     {
  2370 	if (pswit[ECHO_SWITCH])
  2371 	    g_print("\n%s\n",aline);
  2372 	if (!pswit[OVERVIEW_SWITCH])
  2373 	    g_print("    Line %ld column %ld - Spaced doublequote?\n",
  2374 	      linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  2375 	else
  2376 	    cnt_punct++;
  2377 	s=g_utf8_next_char(g_utf8_next_char(t));
  2378     }
  2379     pattern=g_string_new(NULL);
  2380     for(i=0;i<G_N_ELEMENTS(single_quotes);i++)
  2381     {
  2382 	g_string_assign(pattern," ");
  2383 	g_string_append_unichar(pattern,single_quotes[i]);
  2384 	g_string_append_c(pattern,' ');
  2385 	s=aline;
  2386 	while ((t=strstr(s,pattern->str)))
  2387 	{
  2388 	    if (pswit[ECHO_SWITCH])
  2389 		g_print("\n%s\n",aline);
  2390 	    if (!pswit[OVERVIEW_SWITCH])
  2391 		g_print("    Line %ld column %ld - Spaced singlequote?\n",
  2392 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  2393 	    else
  2394 		cnt_punct++;
  2395 	    s=g_utf8_next_char(g_utf8_next_char(t));
  2396 	}
  2397     }
  2398     g_string_free(pattern,TRUE);
  2399 }
  2400 
  2401 /*
  2402  * check_for_miscased_genative:
  2403  *
  2404  * Check special case of 'S instead of 's at end of word.
  2405  */
  2406 void check_for_miscased_genative(const char *aline)
  2407 {
  2408     const char *s;
  2409     gunichar c,nc,pc;
  2410     if (!*aline)
  2411 	return;
  2412     c=g_utf8_get_char(aline);
  2413     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2414     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2415     {
  2416 	pc=c;
  2417 	c=nc;
  2418 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2419 	if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))
  2420 	{
  2421 	    if (pswit[ECHO_SWITCH])
  2422 		g_print("\n%s\n",aline);
  2423 	    if (!pswit[OVERVIEW_SWITCH])
  2424 		g_print("    Line %ld column %ld - Capital \"S\"?\n",
  2425 		  linecnt,g_utf8_pointer_to_offset(aline,s)+2);
  2426 	    else
  2427 		cnt_punct++;
  2428 	}
  2429     }
  2430 }
  2431 
  2432 /*
  2433  * check_end_of_line:
  2434  *
  2435  * Now check special cases - start and end of line -
  2436  * for single and double quotes. Start is sometimes [sic]
  2437  * but better to query it anyway.
  2438  * While we're here, check for dash at end of line.
  2439  */
  2440 void check_end_of_line(const char *aline,struct warnings *warnings)
  2441 {
  2442     int lbytes;
  2443     const char *s;
  2444     gunichar c1,c2;
  2445     lbytes=strlen(aline);
  2446     if (g_utf8_strlen(aline,lbytes)>1)
  2447     {
  2448 	s=g_utf8_prev_char(aline+lbytes);
  2449 	c1=g_utf8_get_char(s);
  2450 	c2=g_utf8_get_char(g_utf8_prev_char(s));
  2451 	if ((c1==CHAR_DQUOTE || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)
  2452 	{
  2453 	    if (pswit[ECHO_SWITCH])
  2454 		g_print("\n%s\n",aline);
  2455 	    if (!pswit[OVERVIEW_SWITCH])
  2456 		g_print("    Line %ld column %ld - Spaced quote?\n",linecnt,
  2457 		  g_utf8_strlen(aline,lbytes));
  2458 	    else
  2459 		cnt_punct++;
  2460 	}
  2461 	c1=g_utf8_get_char(aline);
  2462 	c2=g_utf8_get_char(g_utf8_next_char(aline));
  2463 	if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)
  2464 	{
  2465 	    if (pswit[ECHO_SWITCH])
  2466 		g_print("\n%s\n",aline);
  2467 	    if (!pswit[OVERVIEW_SWITCH])
  2468 		g_print("    Line %ld column 1 - Spaced quote?\n",linecnt);
  2469 	    else
  2470 		cnt_punct++;
  2471 	}
  2472 	/*
  2473 	 * Dash at end of line may well be legit - paranoid mode only
  2474 	 * and don't report em-dash at line-end.
  2475 	 */
  2476 	if (pswit[PARANOID_SWITCH] && warnings->hyphen)
  2477 	{
  2478 	    for (s=g_utf8_prev_char(aline+lbytes);
  2479 	      s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
  2480 		;
  2481 	    if (g_utf8_get_char(s)=='-' &&
  2482 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')
  2483 	    {
  2484 		if (pswit[ECHO_SWITCH])
  2485 		    g_print("\n%s\n",aline);
  2486 		if (!pswit[OVERVIEW_SWITCH])
  2487 		    g_print("    Line %ld column %ld - "
  2488 		      "Hyphen at end of line?\n",
  2489 		      linecnt,g_utf8_pointer_to_offset(aline,s));
  2490 	    }
  2491 	}
  2492     }
  2493 }
  2494 
  2495 /*
  2496  * check_for_unspaced_bracket:
  2497  *
  2498  * Brackets are often unspaced, but shouldn't be surrounded by alpha.
  2499  * If so, suspect a scanno like "a]most".
  2500  */
  2501 void check_for_unspaced_bracket(const char *aline)
  2502 {
  2503     const char *s;
  2504     gunichar c,nc,pc;
  2505     c=g_utf8_get_char(aline);
  2506     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2507     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2508     {
  2509 	pc=c;
  2510 	c=nc;
  2511 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2512 	if (!nc)
  2513 	    break;
  2514 	/* for each bracket character in the line except 1st & last */
  2515 	if (g_utf8_strchr("{[()]}",-1,c) &&
  2516 	  g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
  2517 	{
  2518 	    if (pswit[ECHO_SWITCH])
  2519 		g_print("\n%s\n",aline);
  2520 	    if (!pswit[OVERVIEW_SWITCH])
  2521 		g_print("    Line %ld column %ld - Unspaced bracket?\n",
  2522 		  linecnt,g_utf8_pointer_to_offset(aline,s));
  2523 	    else
  2524 		cnt_punct++;
  2525 	}
  2526     }
  2527 }
  2528 
  2529 /*
  2530  * check_for_unpunctuated_endquote:
  2531  */
  2532 void check_for_unpunctuated_endquote(const char *aline)
  2533 {
  2534     const char *s;
  2535     gunichar c,nc,pc;
  2536     c=g_utf8_get_char(aline);
  2537     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2538     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2539     {
  2540 	pc=c;
  2541 	c=nc;
  2542 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2543 	/* for each character in the line except 1st */
  2544 	if (c==CHAR_DQUOTE && isalpha(pc))
  2545 	{
  2546 	    if (pswit[ECHO_SWITCH])
  2547 		g_print("\n%s\n",aline);
  2548 	    if (!pswit[OVERVIEW_SWITCH])
  2549 		g_print("    Line %ld column %ld - "
  2550 		  "endquote missing punctuation?\n",
  2551 		  linecnt,g_utf8_pointer_to_offset(aline,s));
  2552 	    else
  2553 		cnt_punct++;
  2554 	}
  2555     }
  2556 }
  2557 
  2558 /*
  2559  * check_for_html_tag:
  2560  *
  2561  * Check for <HTML TAG>.
  2562  *
  2563  * If there is a < in the line, followed at some point
  2564  * by a > then we suspect HTML.
  2565  */
  2566 void check_for_html_tag(const char *aline)
  2567 {
  2568     const char *open,*close;
  2569     gchar *tag;
  2570     open=strchr(aline,'<');
  2571     if (open)
  2572     {
  2573 	close=strchr(g_utf8_next_char(open),'>');
  2574 	if (close)
  2575 	{
  2576 	    if (pswit[ECHO_SWITCH])
  2577 		g_print("\n%s\n",aline);
  2578 	    if (!pswit[OVERVIEW_SWITCH])
  2579 	    {
  2580 		tag=g_strndup(open,close-open+1);
  2581 		g_print("    Line %ld column %ld - HTML Tag? %s \n",
  2582 		  linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
  2583 		g_free(tag);
  2584 	    }
  2585 	    else
  2586 		cnt_html++;
  2587 	}
  2588     }
  2589 }
  2590 
  2591 /*
  2592  * check_for_html_entity:
  2593  *
  2594  * Check for &symbol; HTML.
  2595  *
  2596  * If there is a & in the line, followed at
  2597  * some point by a ; then we suspect HTML.
  2598  */
  2599 void check_for_html_entity(const char *aline)
  2600 {
  2601     const char *s,*amp,*scolon;
  2602     gchar *entity;
  2603     amp=strchr(aline,'&');
  2604     if (amp)
  2605     {
  2606 	scolon=strchr(amp,';');
  2607 	if (scolon)
  2608 	{
  2609 	    for (s=amp;s<scolon;s=g_utf8_next_char(s))   
  2610 		if (g_utf8_get_char(s)==CHAR_SPACE)
  2611 		    break;		/* Don't report "Jones & Son;" */
  2612 	    if (s>=scolon)
  2613 	    {
  2614 		if (pswit[ECHO_SWITCH])
  2615 		    g_print("\n%s\n",aline);
  2616 		if (!pswit[OVERVIEW_SWITCH])
  2617 		{
  2618 		    entity=g_strndup(amp,scolon-amp+1);
  2619 		    g_print("    Line %ld column %d - HTML symbol? %s \n",
  2620 		      linecnt,(int)(amp-aline)+1,entity);
  2621 		    g_free(entity);
  2622 		}
  2623 		else
  2624 		    cnt_html++;
  2625 	    }
  2626 	}
  2627     }
  2628 }
  2629 
  2630 /*
  2631  * check_for_omitted_punctuation:
  2632  *
  2633  * Check for omitted punctuation at end of paragraph by working back
  2634  * through prevline. DW.
  2635  * Need to check this only for "normal" paras.
  2636  * So what is a "normal" para?
  2637  *    Not normal if one-liner (chapter headings, etc.)
  2638  *    Not normal if doesn't contain at least one locase letter
  2639  *    Not normal if starts with space
  2640  */
  2641 void check_for_omitted_punctuation(const char *prevline,
  2642   struct line_properties *last,int start_para_line)
  2643 {
  2644     gboolean letter_on_line=FALSE;
  2645     const char *s;
  2646     gunichar c;
  2647     for (s=prevline;*s;s=g_utf8_next_char(s))
  2648 	if (g_unichar_isalpha(g_utf8_get_char(s)))
  2649 	{
  2650 	    letter_on_line=TRUE;
  2651 	    break;
  2652 	}
  2653     /*
  2654      * This next "if" is a problem.
  2655      * If we say "start_para_line <= linecnt - 1", that includes
  2656      * one-line "paragraphs" like chapter heads. Lotsa false positives.
  2657      * If we say "start_para_line < linecnt - 1" it doesn't, but then it
  2658      * misses genuine one-line paragraphs.
  2659      */
  2660     if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
  2661       g_utf8_get_char(prevline)>CHAR_SPACE)
  2662     {
  2663 	s=prevline+strlen(prevline);
  2664 	do
  2665 	{
  2666 	    s=g_utf8_prev_char(s);
  2667 	    c=g_utf8_get_char(s);
  2668 	} while (CHAR_IS_CLOSING_QUOTE(c) && c>CHAR_SPACE && s>prevline);
  2669 	for (;s>prevline;s=g_utf8_prev_char(s))
  2670 	{
  2671 	    if (g_unichar_isalpha(g_utf8_get_char(s)))
  2672 	    {
  2673 		if (pswit[ECHO_SWITCH])
  2674 		    g_print("\n%s\n",prevline);
  2675 		if (!pswit[OVERVIEW_SWITCH])
  2676 		    g_print("    Line %ld column %ld - "
  2677 		      "No punctuation at para end?\n",
  2678 		      linecnt-1,g_utf8_strlen(prevline,-1));
  2679 		else
  2680 		    cnt_punct++;
  2681 		break;
  2682 	    }
  2683 	    if (g_utf8_strchr("-.:!([{?}])",-1,g_utf8_get_char(s)))
  2684 		break;
  2685 	}
  2686     }
  2687 }
  2688 
  2689 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
  2690 {
  2691     const char *word=key;
  2692     int *dupcnt=value;
  2693     if (*dupcnt)
  2694 	g_print("\nNote: Queried word %s was duplicated %d times\n",
  2695 	  word,*dupcnt);
  2696     return FALSE;
  2697 }
  2698 
  2699 void print_as_windows_1252(const char *string)
  2700 {
  2701     gsize inbytes,outbytes;
  2702     gchar *buf,*bp;
  2703     static GIConv converter=(GIConv)-1;
  2704     if (!string)
  2705     {
  2706 	if (converter!=(GIConv)-1)
  2707 	    g_iconv_close(converter);
  2708 	converter=(GIConv)-1;
  2709 	return;
  2710     }
  2711     if (converter==(GIConv)-1)
  2712 	converter=g_iconv_open("WINDOWS-1252","UTF-8");
  2713     if (converter!=(GIConv)-1)
  2714     {
  2715 	inbytes=outbytes=strlen(string);
  2716 	bp=buf=g_malloc(outbytes+1);
  2717 	g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
  2718 	*bp='\0';
  2719 	fputs(buf,stdout);
  2720 	g_free(buf);
  2721     }
  2722     else
  2723 	fputs(string,stdout);
  2724 }
  2725 
  /*
   * print_as_utf_8:
   *
   * Write string to stdout exactly as it is; the counterpart of
   * print_as_windows_1252() for when no conversion is wanted.
   */
  void print_as_utf_8(const char *string)
  {
      FILE *out=stdout;
      fputs(string,out);
  }
  2730 
  2731 /*
  2732  * procfile:
  2733  *
  2734  * Process one file.
  2735  */
  2736 void procfile(const char *filename)
  2737 {
  2738     const char *s;
  2739     gchar *parastart=NULL;	/* first line of current para */
  2740     gchar *etext,*aline;
  2741     gchar *etext_ptr;
  2742     GError *err=NULL;
  2743     struct first_pass_results *first_pass_results;
  2744     struct warnings *warnings;
  2745     struct counters counters={0};
  2746     struct line_properties last={0};
  2747     struct parities parities={0};
  2748     struct pending pending={0};
  2749     gboolean isemptyline;
  2750     long start_para_line=0;
  2751     gboolean isnewpara=FALSE,enddash=FALSE;
  2752     last.start=CHAR_SPACE;
  2753     linecnt=checked_linecnt=0;
  2754     etext=read_etext(filename,&err);
  2755     if (!etext)
  2756     {
  2757 	if (pswit[STDOUT_SWITCH])
  2758 	    fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
  2759 	else
  2760 	    fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
  2761 	exit(1);
  2762     }
  2763     g_print("\n\nFile: %s\n\n",filename);
  2764     first_pass_results=first_pass(etext);
  2765     warnings=report_first_pass(first_pass_results);
  2766     qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
  2767     qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
  2768     /*
  2769      * Here we go with the main pass. Hold onto yer hat!
  2770      */
  2771     linecnt=0;
  2772     etext_ptr=etext;
  2773     while ((aline=flgets(&etext_ptr,linecnt+1)))
  2774     {
  2775 	linecnt++;
  2776 	if (linecnt==1)
  2777 	    isnewpara=TRUE;
  2778 	if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
  2779 	    continue;    // skip DP page separators completely
  2780 	if (linecnt<first_pass_results->firstline ||
  2781 	  (first_pass_results->footerline>0 &&
  2782 	  linecnt>first_pass_results->footerline))
  2783 	{
  2784 	    if (pswit[HEADER_SWITCH])
  2785 	    {
  2786 		if (g_str_has_prefix(aline,"Title:"))
  2787 		    g_print("    %s\n",aline);
  2788 		if (g_str_has_prefix(aline,"Author:"))
  2789 		    g_print("    %s\n",aline);
  2790 		if (g_str_has_prefix(aline,"Release Date:"))
  2791 		    g_print("    %s\n",aline);
  2792 		if (g_str_has_prefix(aline,"Edition:"))
  2793 		    g_print("    %s\n\n",aline);
  2794 	    }
  2795 	    continue;		/* skip through the header */
  2796 	}
  2797 	checked_linecnt++;
  2798 	print_pending(aline,parastart,&pending);
  2799 	isemptyline=analyse_quotes(aline,&counters);
  2800 	if (isnewpara && !isemptyline)
  2801 	{
  2802 	    /* This line is the start of a new paragraph. */
  2803 	    start_para_line=linecnt;
  2804 	    /* Capture its first line in case we want to report it later. */
  2805 	    g_free(parastart);
  2806 	    parastart=g_strdup(aline);
  2807 	    memset(&parities,0,sizeof(parities));  /* restart the quote count */
  2808 	    s=aline;
  2809 	    while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
  2810 	      !g_unichar_isdigit(g_utf8_get_char(s)))
  2811 		s=g_utf8_next_char(s);
  2812 	    if (g_unichar_islower(g_utf8_get_char(s)))
  2813 	    {
  2814 		/* and its first letter is lowercase */
  2815 		if (pswit[ECHO_SWITCH])
  2816 		    g_print("\n%s\n",aline);
  2817 		if (!pswit[OVERVIEW_SWITCH])
  2818 		    g_print("    Line %ld column %ld - "
  2819 		      "Paragraph starts with lower-case\n",
  2820 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2821 		else
  2822 		    cnt_punct++;
  2823 	    }
  2824 	    isnewpara=FALSE; /* Signal the end of new para processing. */
  2825 	}
  2826 	/* Check for an em-dash broken at line end. */
  2827 	if (enddash && g_utf8_get_char(aline)=='-')
  2828 	{
  2829 	    if (pswit[ECHO_SWITCH])
  2830 		g_print("\n%s\n",aline);
  2831 	    if (!pswit[OVERVIEW_SWITCH])
  2832 		g_print("    Line %ld column 1 - Broken em-dash?\n",linecnt);
  2833 	    else
  2834 		cnt_punct++;
  2835 	}
  2836 	enddash=FALSE;
  2837 	for (s=g_utf8_prev_char(aline+strlen(aline));
  2838 	  g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
  2839 	    ;
  2840 	if (s>=aline && g_utf8_get_char(s)=='-')
  2841 	    enddash=TRUE;
  2842 	check_for_control_characters(aline);
  2843 	if (warnings->bin)
  2844 	    check_for_odd_characters(aline,warnings,isemptyline);
  2845 	if (warnings->longline)
  2846 	    check_for_long_line(aline);
  2847 	if (warnings->shortline)
  2848 	    check_for_short_line(aline,&last);
  2849 	last.blen=last.len;
  2850 	last.len=g_utf8_strlen(aline,-1);
  2851 	last.start=g_utf8_get_char(aline);
  2852 	check_for_starting_punctuation(aline);
  2853 	if (warnings->dash)
  2854 	{
  2855 	    check_for_spaced_emdash(aline);
  2856 	    check_for_spaced_dash(aline);
  2857 	}
  2858 	check_for_unmarked_paragraphs(aline);
  2859 	check_for_jeebies(aline);
  2860 	check_for_mta_from(aline);
  2861 	check_for_orphan_character(aline);
  2862 	check_for_pling_scanno(aline);
  2863 	check_for_extra_period(aline,warnings);
  2864 	check_for_following_punctuation(aline);
  2865 	check_for_typos(aline,warnings);
  2866 	check_for_misspaced_punctuation(aline,&parities,isemptyline);
  2867 	check_for_double_punctuation(aline,warnings);
  2868 	check_for_spaced_quotes(aline);
  2869 	check_for_miscased_genative(aline);
  2870 	check_end_of_line(aline,warnings);
  2871 	check_for_unspaced_bracket(aline);
  2872 	if (warnings->endquote)
  2873 	    check_for_unpunctuated_endquote(aline);
  2874 	check_for_html_tag(aline);
  2875 	check_for_html_entity(aline);
  2876 	if (isemptyline)
  2877 	{
  2878 	    check_for_mismatched_quotes(&counters,&pending);
  2879 	    counters_reset(&counters);
  2880 	    /* let the next iteration know that it's starting a new para */
  2881 	    isnewpara=TRUE;
  2882 	    if (prevline)
  2883 		check_for_omitted_punctuation(prevline,&last,start_para_line);
  2884 	}
  2885 	g_free(prevline);
  2886 	prevline=g_strdup(aline);
  2887     }
  2888     linecnt++;
  2889     check_for_mismatched_quotes(&counters,&pending);
  2890     print_pending(NULL,parastart,&pending);
  2891     reset_pending(&pending);
  2892     if (prevline)
  2893     {
  2894 	g_free(prevline);
  2895 	prevline=NULL;
  2896     }
  2897     g_free(parastart);
  2898     g_free(prevline);
  2899     g_free(etext);
  2900     if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])
  2901 	g_tree_foreach(qword,report_duplicate_queries,NULL);
  2902     g_tree_unref(qword);
  2903     g_tree_unref(qperiod);
  2904     counters_destroy(&counters);
  2905     g_set_print_handler(NULL);
  2906     print_as_windows_1252(NULL);
  2907     if (pswit[MARKUP_SWITCH])  
  2908 	loseentities(NULL);
  2909 }
  2910 
  2911 /*
  2912  * flgets:
  2913  *
  2914  * Get one line from the input text, checking for
  2915  * the existence of exactly one CR/LF line-end per line.
  2916  *
  2917  * Returns: a pointer to the line.
  2918  */
  2919 char *flgets(char **etext,long lcnt)
  2920 {
  2921     gunichar c;
  2922     gboolean isCR=FALSE;
  2923     char *theline=*etext;
  2924     char *eos=theline;
  2925     gchar *s;
  2926     for (;;)
  2927     {
  2928 	c=g_utf8_get_char(*etext);
  2929 	*etext=g_utf8_next_char(*etext);
  2930 	if (!c)
  2931 	    return NULL;
  2932 	/* either way, it's end of line */
  2933 	if (c=='\n')
  2934 	{
  2935 	    if (isCR)
  2936 		break;
  2937 	    else
  2938 	    {
  2939 		/* Error - a LF without a preceding CR */
  2940 		if (pswit[LINE_END_SWITCH])
  2941 		{
  2942 		    if (pswit[ECHO_SWITCH])
  2943 		    {
  2944 			s=g_strndup(theline,eos-theline);
  2945 			g_print("\n%s\n",s);
  2946 			g_free(s);
  2947 		    }
  2948 		    if (!pswit[OVERVIEW_SWITCH])
  2949 			g_print("    Line %ld - No CR?\n",lcnt);
  2950 		    else
  2951 			cnt_lineend++;
  2952 		}
  2953 		break;
  2954 	    }
  2955 	}
  2956 	if (c=='\r')
  2957 	{
  2958 	    if (isCR)
  2959 	    {
  2960 		/* Error - two successive CRs */
  2961 		if (pswit[LINE_END_SWITCH])
  2962 		{
  2963 		    if (pswit[ECHO_SWITCH])
  2964 		    {
  2965 			s=g_strndup(theline,eos-theline);
  2966 			g_print("\n%s\n",s);
  2967 			g_free(s);
  2968 		    }
  2969 		    if (!pswit[OVERVIEW_SWITCH])
  2970 			g_print("    Line %ld - Two successive CRs?\n",lcnt);
  2971 		    else
  2972 			cnt_lineend++;
  2973 		}
  2974 	    }
  2975 	    isCR=TRUE;
  2976 	}
  2977 	else
  2978 	{
  2979 	    if (pswit[LINE_END_SWITCH] && isCR)
  2980 	    {
  2981 		if (pswit[ECHO_SWITCH])
  2982 		{
  2983 		    s=g_strndup(theline,eos-theline);
  2984 		    g_print("\n%s\n",s);
  2985 		    g_free(s);
  2986 		}
  2987 		if (!pswit[OVERVIEW_SWITCH])
  2988 		    g_print("    Line %ld column %ld - CR without LF?\n",
  2989 		      lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
  2990 		else
  2991 		    cnt_lineend++;
  2992 		*eos=' ';
  2993 	    }
  2994 	    isCR=FALSE;
  2995 	    eos=g_utf8_next_char(eos);
  2996 	}
  2997     }
  2998     *eos='\0';
  2999     if (pswit[MARKUP_SWITCH])  
  3000 	postprocess_for_HTML(theline);
  3001     if (pswit[DP_SWITCH])  
  3002 	postprocess_for_DP(theline);
  3003     return theline;
  3004 }
  3005 
  3006 /*
  3007  * mixdigit:
  3008  *
  3009  * Takes a "word" as a parameter, and checks whether it
  3010  * contains a mixture of alpha and digits. Generally, this is an
  3011  * error, but may not be for cases like 4th or L5 12s. 3d.
  3012  *
  3013  * Returns: TRUE iff an error is found.
  3014  */
  3015 gboolean mixdigit(const char *checkword)
  3016 {
  3017     gboolean wehaveadigit,wehavealetter,query;
  3018     const char *s,*nondigit;
  3019     wehaveadigit=wehavealetter=query=FALSE;
  3020     for (s=checkword;*s;s=g_utf8_next_char(s))
  3021 	if (g_unichar_isalpha(g_utf8_get_char(s)))
  3022 	    wehavealetter=TRUE;
  3023 	else if (g_unichar_isdigit(g_utf8_get_char(s)))
  3024 	    wehaveadigit=TRUE;
  3025     if (wehaveadigit && wehavealetter)
  3026     {
  3027 	/* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
  3028 	query=TRUE;
  3029 	for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
  3030 	  nondigit=g_utf8_next_char(nondigit))
  3031 	    ;
  3032 	/* digits, ending in st, rd, nd, th of either case */
  3033 	if (!g_ascii_strcasecmp(nondigit,"st") ||
  3034 	  !g_ascii_strcasecmp(nondigit,"rd") ||
  3035 	  !g_ascii_strcasecmp(nondigit,"nd") ||
  3036 	  !g_ascii_strcasecmp(nondigit,"th"))
  3037 	    query=FALSE;
  3038 	if (!g_ascii_strcasecmp(nondigit,"sts") ||
  3039 	  !g_ascii_strcasecmp(nondigit,"rds") ||
  3040 	  !g_ascii_strcasecmp(nondigit,"nds") ||
  3041 	  !g_ascii_strcasecmp(nondigit,"ths"))
  3042 	    query=FALSE;
  3043 	if (!g_ascii_strcasecmp(nondigit,"stly") ||
  3044 	  !g_ascii_strcasecmp(nondigit,"rdly") ||
  3045 	  !g_ascii_strcasecmp(nondigit,"ndly") ||
  3046 	  !g_ascii_strcasecmp(nondigit,"thly"))
  3047 	    query=FALSE;
  3048 	/* digits, ending in l, L, s or d */
  3049 	if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
  3050 	  !strcmp(nondigit,"d"))
  3051 	    query=FALSE;
  3052 	/*
  3053 	 * L at the start of a number, representing Britsh pounds, like L500.
  3054 	 * This is cute. We know the current word is mixed digit. If the first
  3055 	 * letter is L, there must be at least one digit following. If both
  3056 	 * digits and letters follow, we have a genuine error, else we have a
  3057 	 * capital L followed by digits, and we accept that as a non-error.
  3058 	 */
  3059 	if (g_utf8_get_char(checkword)=='L' &&
  3060 	  !mixdigit(g_utf8_next_char(checkword)))
  3061 	    query=FALSE;
  3062     }
  3063     return query;
  3064 }
  3065 
  3066 /*
  3067  * getaword:
  3068  *
  3069  * Extracts the first/next "word" from the line, and returns it.
  3070  * A word is defined as one English word unit--or at least that's the aim.
  3071  * "ptr" is advanced to the position in the line where we will start
  3072  * looking for the next word.
  3073  *
  3074  * Returns: A newly-allocated string.
  3075  */
  3076 gchar *getaword(const char **ptr)
  3077 {
  3078     const char *s,*t;
  3079     GString *word;
  3080     gunichar c,pc;
  3081     word=g_string_new(NULL);
  3082     for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
  3083       !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
  3084       **ptr;*ptr=g_utf8_next_char(*ptr))
  3085 	;
  3086     /*
  3087      * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
  3088      * Especially yucky is the case of L1,000
  3089      * This section looks for a pattern of characters including a digit
  3090      * followed by a comma or period followed by one or more digits.
  3091      * If found, it returns this whole pattern as a word; otherwise we discard
  3092      * the results and resume our normal programming.
  3093      */
  3094     s=*ptr;
  3095     for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
  3096       g_unichar_isalpha(g_utf8_get_char(s)) ||
  3097       g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
  3098 	g_string_append_unichar(word,g_utf8_get_char(s));
  3099     if (word->len)
  3100     {
  3101 	for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))
  3102 	{
  3103 	    c=g_utf8_get_char(t);
  3104 	    pc=g_utf8_get_char(g_utf8_prev_char(t));
  3105 	    if ((c=='.' || c==',') && g_unichar_isdigit(pc))
  3106 	    {
  3107 		*ptr=s;
  3108 		return g_string_free(word,FALSE);
  3109 	    }
  3110 	}
  3111     }
  3112     /* we didn't find a punctuated number - do the regular getword thing */
  3113     g_string_truncate(word,0);
  3114     c=g_utf8_get_char(*ptr);
  3115     for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);
  3116       *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))
  3117 	g_string_append_unichar(word,c);
  3118     return g_string_free(word,FALSE);
  3119 }
  3120 
  3121 /*
  3122  * isroman:
  3123  *
  3124  * Is this word a Roman Numeral?
  3125  *
  3126  * It doesn't actually validate that the number is a valid Roman Numeral--for
  3127  * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
  3128  * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
  3129  * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
  3130  * expressions thereof, except when it came to taxes. Allow any number of M,
  3131  * an optional D, an optional CM or CD, any number of optional Cs, an optional
  3132  * XL or an optional XC, an optional IX or IV, an optional V and any number
  3133  * of optional Is.
  3134  */
  3135 gboolean isroman(const char *t)
  3136 {
  3137     const char *s;
  3138     if (!t || !*t)
  3139 	return FALSE;
  3140     s=t;
  3141     while (g_utf8_get_char(t)=='m' && *t)
  3142 	t++;
  3143     if (g_utf8_get_char(t)=='d')
  3144 	t++;
  3145     if (g_str_has_prefix(t,"cm"))
  3146 	t+=2;
  3147     if (g_str_has_prefix(t,"cd"))
  3148 	t+=2;
  3149     while (g_utf8_get_char(t)=='c' && *t)
  3150 	t++;
  3151     if (g_str_has_prefix(t,"xl"))
  3152 	t+=2;
  3153     if (g_str_has_prefix(t,"xc"))
  3154 	t+=2;
  3155     if (g_utf8_get_char(t)=='l')
  3156 	t++;
  3157     while (g_utf8_get_char(t)=='x' && *t)
  3158 	t++;
  3159     if (g_str_has_prefix(t,"ix"))
  3160 	t+=2;
  3161     if (g_str_has_prefix(t,"iv"))
  3162 	t+=2;
  3163     if (g_utf8_get_char(t)=='v')
  3164 	t++;
  3165     while (g_utf8_get_char(t)=='i' && *t)
  3166 	t++;
  3167     return !*t;
  3168 }
  3169 
  3170 /*
  3171  * postprocess_for_DP:
  3172  *
  3173  * Invoked with the -d switch from flgets().
  3174  * It simply "removes" from the line a hard-coded set of common
  3175  * DP-specific tags, so that the line passed to the main routine has
  3176  * been pre-cleaned of DP markup.
  3177  */
  3178 void postprocess_for_DP(char *theline)
  3179 {
  3180     char *s,*t;
  3181     int i;
  3182     if (!*theline) 
  3183 	return;
  3184     for (i=0;*DPmarkup[i];i++)
  3185 	while ((s=strstr(theline,DPmarkup[i])))
  3186 	{
  3187 	    t=s+strlen(DPmarkup[i]);
  3188 	    memmove(s,t,strlen(t)+1);
  3189 	}
  3190 }
  3191 
  3192 /*
  3193  * postprocess_for_HTML:
  3194  *
  3195  * Invoked with the -m switch from flgets().
  3196  * It simply "removes" from the line a hard-coded set of common
  3197  * HTML tags and "replaces" a hard-coded set of common HTML
  3198  * entities, so that the line passed to the main routine has
  3199  * been pre-cleaned of HTML.
  3200  */
  void postprocess_for_HTML(char *theline)
  {
      char *stripped;
      /* losemarkup() takes out one tag per call; NULL means it is done. */
      do
          stripped=losemarkup(theline);
      while (stripped);
      loseentities(theline);
  }
  3207 
  3208 char *losemarkup(char *theline)
  3209 {
  3210     char *s,*t;
  3211     int i;
  3212     s=strchr(theline,'<');
  3213     t=s?strchr(s,'>'):NULL;
  3214     if (!s || !t)
  3215 	return NULL;
  3216     for (i=0;*markup[i];i++)
  3217 	if (tagcomp(g_utf8_next_char(s),markup[i]))
  3218 	{
  3219 	    t=g_utf8_next_char(t);
  3220 	    memmove(s,t,strlen(t)+1);
  3221 	    return s;
  3222 	}
  3223     /* It's an unrecognized <xxx>. */
  3224     return NULL;
  3225 }
  3226 
  3227 void loseentities(char *theline)
  3228 {
  3229     int i;
  3230     gsize nb;
  3231     char *amp,*scolon;
  3232     gchar *s,*t;
  3233     gunichar c;
  3234     GTree *entities=NULL;
  3235     static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
  3236     if (!theline)
  3237     {
  3238 	if (entities)
  3239 	    g_tree_destroy(entities);
  3240 	entities=NULL;
  3241 	if (translit!=(GIConv)-1)
  3242 	    g_iconv_close(translit);
  3243 	translit=(GIConv)-1;
  3244 	if (to_utf8!=(GIConv)-1)
  3245 	    g_iconv_close(to_utf8);
  3246 	to_utf8=(GIConv)-1;
  3247 	return;
  3248     }
  3249     if (!*theline)
  3250 	return;
  3251     if (!entities)
  3252     {
  3253 	entities=g_tree_new((GCompareFunc)strcmp);
  3254 	for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
  3255 	    g_tree_insert(entities,HTMLentities[i].name,
  3256 	      GUINT_TO_POINTER(HTMLentities[i].c));
  3257     }
  3258     if (translit==(GIConv)-1)
  3259 	translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
  3260     if (to_utf8==(GIConv)-1)
  3261 	to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
  3262     while((amp=strchr(theline,'&')))
  3263     {
  3264 	scolon=strchr(amp,';');
  3265 	if (scolon)
  3266 	{
  3267 	    if (amp[1]=='#')
  3268 	    {
  3269 		if (amp+2+strspn(amp+2,"0123456789")==scolon)
  3270 		    c=strtol(amp+2,NULL,10);
  3271 		else if (amp[2]=='x' &&
  3272 		  amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
  3273 		    c=strtol(amp+3,NULL,16);
  3274 	    }
  3275 	    else
  3276 	    {
  3277 		s=g_strndup(amp+1,scolon-(amp+1));
  3278 	        c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
  3279 		g_free(s);
  3280 	    }
  3281 	}
  3282 	else
  3283 	    c=0;
  3284 	if (c)
  3285 	{
  3286 	    theline=amp;
  3287 	    if (c<128 || c>=192 && c<=255)	/* An ISO-8859-1 character */
  3288 		theline+=g_unichar_to_utf8(c,theline);
  3289 	    else
  3290 	    {
  3291 		s=g_malloc(6);
  3292 		nb=g_unichar_to_utf8(c,s);
  3293 		t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
  3294 		g_free(s);
  3295 		s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
  3296 		g_free(t);
  3297 		memcpy(theline,s,nb);
  3298 		g_free(s);
  3299 		theline+=nb;
  3300 	    }
  3301 	    memmove(theline,g_utf8_next_char(scolon),
  3302 	      strlen(g_utf8_next_char(scolon))+1);
  3303 	}
  3304 	else
  3305 	    theline=g_utf8_next_char(amp);
  3306     }
  3307 }
  3308 
  3309 gboolean tagcomp(const char *strin,const char *basetag)
  3310 {
  3311     gboolean retval;
  3312     gchar *s,*t;
  3313     if (g_utf8_get_char(strin)=='/')
  3314 	t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
  3315     else
  3316 	t=g_utf8_casefold(strin,-1);
  3317     s=g_utf8_casefold(basetag,-1);
  3318     retval=g_str_has_prefix(t,s);
  3319     g_free(s);
  3320     g_free(t);
  3321     return retval;
  3322 }
  3323 
  3324 void proghelp(GOptionContext *context)
  3325 {
  3326     gchar *help;
  3327     fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
  3328     fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
  3329     fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
  3330     fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
  3331       "For details, read the file COPYING.\n",stderr);
  3332     fputs("This is Free Software; "
  3333       "you may redistribute it under certain conditions (GPL);\n",stderr);
  3334     fputs("read the file COPYING for details.\n\n",stderr);
  3335     help=g_option_context_get_help(context,TRUE,NULL);
  3336     fputs(help,stderr);
  3337     g_free(help);
  3338     fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
  3339     fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
  3340       "non-ASCII\n",stderr);
  3341     fputs("characters like accented letters, "
  3342       "lines longer than 75 or shorter than 55,\n",stderr);
  3343     fputs("unbalanced quotes or brackets, "
  3344       "a variety of badly formatted punctuation, \n",stderr);
  3345     fputs("HTML tags, some likely typos. "
  3346       "It is NOT a substitute for human judgement.\n",stderr);
  3347     fputs("\n",stderr);
  3348 }