bookloupe/bookloupe.c
author ali <ali@juiblex.co.uk>
Sun Sep 29 22:51:27 2013 +0100 (2013-09-29)
changeset 186 4912234d80be
parent 174 ad92d11d59b8
child 187 6ed7afd99ea9
child 191 189183b37598
permissions -rw-r--r--
Fix bug #14: Add a configuration file
     1 /*************************************************************************/
     2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
     3 /*									 */
     4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>			 */
     5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>			 */
     6 /*									 */
     7 /* This program is free software; you can redistribute it and/or modify  */
     8 /* it under the terms of the GNU General Public License as published by  */
     9 /* the Free Software Foundation; either version 2 of the License, or     */
    10 /* (at your option) any later version.					 */
    11 /*									 */
    12 /* This program is distributed in the hope that it will be useful,       */
    13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of	 */
    14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the		 */
    15 /* GNU General Public License for more details.				 */
    16 /*									 */
    17 /* You should have received a copy of the GNU General Public License	 */
    18 /* along with this program. If not, see <http://www.gnu.org/licenses/>.	 */
    19 /*************************************************************************/
    20 
    21 #include <stdio.h>
    22 #include <stdlib.h>
    23 #include <string.h>
    24 #include <ctype.h>
    25 #ifdef __WIN32__
    26 #include <windows.h>
    27 #endif
    28 #include <glib.h>
    29 #include <bl/bl.h>
    30 #include "bookloupe.h"
    31 #include "counters.h"
    32 #include "pending.h"
    33 #include "HTMLentities.h"
    34 
/*
 * NOTE(review): not referenced in the part of the file reviewed here;
 * presumably holds the text of the previously processed line -- confirm
 * against the code which uses it.
 */
gchar *prevline;
    36 
/*
 * Common typos and scannos, all in lower case.
 * The list is terminated by an empty string.
 */
char *typo[] = {
    "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
    "nad", "te", "ig", "acn",  "ahve", "alot", "anbd", "andt", "awya", "aywa",
    "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
    "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
    "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
    "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
    "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
    "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
    "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
    "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
    "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
    "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
    "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
    "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
    "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
    "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
    "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
    "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
    "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
    "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
    "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
    "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
    "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
    "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
    "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
    "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
    "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
    "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
    "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
    "se", ""
};
    70 
/*
 * User-defined typos: created by read_user_scannos() with one key per
 * accepted line of the user typo file and released in main().
 * Remains NULL if no list has been loaded.
 */
GTree *usertypo;
    72 
    73 /* Common abbreviations and other OK words not to query as typos. */
    74 char *okword[] = {
    75     "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
    76     "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
    77     "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
    78     "outbid", "outbids", "frostbite", "frostbitten", ""
    79 };
    80 
    81 /* Common abbreviations that cause otherwise unexplained periods. */
    82 char *abbrev[] = {
    83     "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
    84     "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
    85 };
    86 
    87 /*
    88  * Two-Letter combinations that rarely if ever start words,
    89  * but are common scannos or otherwise common letter combinations.
    90  */
    91 char *nostart[] = {
    92     "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
    93 };
    94 
    95 /*
    96  * Two-Letter combinations that rarely if ever end words,
    97  * but are common scannos or otherwise common letter combinations.
    98  */
    99 char *noend[] = {
   100     "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
   101     "sw", "gr", "sl", "cl", "iy", ""
   102 };
   103 
   104 char *markup[] = {
   105     "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
   106     "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
   107     "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
   108     "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
   109 };
   110 
   111 char *DPmarkup[] = {
   112     "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
   113 };
   114 
   115 char *nocomma[] = {
   116     "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
   117     "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
   118     "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
   119     "during", "let", "toward", "among", ""
   120 };
   121 
   122 char *noperiod[] = {
   123     "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
   124     "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
   125     "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
   126     "among", "those", "into", "whom", "having", "thence", ""
   127 }; 
   128 
/* Program switches, indexed by the ..._SWITCH constants. */
gboolean pswit[SWITNO];  /* program switches */

/*
 * Set by the Gutcheck compatibility options "toggle-typo" and
 * "toggle-relaxed"; parse_options() acts on them by inverting the
 * corresponding pswit[] entries once the command line has been parsed.
 */
gboolean typo_compat,paranoid_compat;
   132 
   133 static GOptionEntry options[]={
   134     { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
   135       "Ignore DP-specific markup", NULL },
   136     { "no-dp", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   137       G_OPTION_ARG_NONE, pswit+DP_SWITCH,
   138       "Don't ignore DP-specific markup", NULL },
   139     { "echo", 0, G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
   140       "Echo queried line", NULL },
   141     { "no-echo", 'e', G_OPTION_FLAG_REVERSE,
   142       G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
   143       "Don't echo queried line", NULL },
   144     { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
   145       "Check single quotes", NULL },
   146     { "no-squote", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   147       G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
   148       "Don't check single quotes", NULL },
   149     { "typo", 0, 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
   150       "Check common typos", NULL },
   151     { "no-typo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   152       G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
   153       "Don't check common typos", NULL },
   154     { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
   155       "Require closure of quotes on every paragraph", NULL },
   156     { "no-qpara", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   157       G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
   158       "Don't require closure of quotes on every paragraph", NULL },
   159     { "paranoid", 0, G_OPTION_FLAG_HIDDEN,
   160       G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
   161       "Enable paranoid querying of everything", NULL },
   162     { "no-paranoid", 0, G_OPTION_FLAG_REVERSE,
   163       G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
   164       "Disable paranoid querying of everything", NULL },
   165     { "line-end", 0, G_OPTION_FLAG_HIDDEN,
   166       G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
   167       "Enable line end checking", NULL },
   168     { "no-line-end", 'l', G_OPTION_FLAG_REVERSE,
   169       G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
   170       "Diable line end checking", NULL },
   171     { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
   172       "Overview: just show counts", NULL },
   173     { "no-overview", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   174       G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
   175       "Show individual warnings", NULL },
   176     { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
   177       "Output errors to stdout instead of stderr", NULL },
   178     { "no-stdout", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   179       G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
   180       "Output errors to stderr instead of stdout", NULL },
   181     { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
   182       "Echo header fields", NULL },
   183     { "no-header", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   184       G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
   185       "Don't echo header fields", NULL },
   186     { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
   187       "Ignore markup in < >", NULL },
   188     { "no-markup", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   189       G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
   190       "No special handling for markup in < >", NULL },
   191     { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
   192       "Use file of user-defined typos", NULL },
   193     { "no-usertypo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   194       G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
   195       "Ignore file of user-defined typos", NULL },
   196     { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
   197       "Verbose - list everything", NULL },
   198     { "no-verbose", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   199       G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
   200       "Switch off verbose mode", NULL },
   201     { NULL }
   202 };
   203 
   204 /*
   205  * Options relating to configuration which make no sense from inside
   206  * a configuration file.
   207  */
   208 
   209 static GOptionEntry config_options[]={
   210     { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
   211       "Defaults for use on www upload", NULL },
   212     { "dump-config", 0, 0, G_OPTION_ARG_NONE, pswit+DUMP_CONFIG_SWITCH,
   213       "Dump current config settings", NULL },
   214     { NULL }
   215 };
   216 
   217 static GOptionEntry compatibility_options[]={
   218     { "toggle-typo", 't', 0, G_OPTION_ARG_NONE, &typo_compat,
   219       "Toggle checking for common typos", NULL },
   220     { "toggle-relaxed", 'x', 0, G_OPTION_ARG_NONE, &paranoid_compat,
   221       "Toggle both paranoid mode and common typos", NULL },
   222     { NULL }
   223 };
   224 
   225 long cnt_quote;		/* for overview mode, count of quote queries */
   226 long cnt_brack;		/* for overview mode, count of brackets queries */
   227 long cnt_bin;		/* for overview mode, count of non-ASCII queries */
   228 long cnt_odd;		/* for overview mode, count of odd character queries */
   229 long cnt_long;		/* for overview mode, count of long line errors */
   230 long cnt_short;		/* for overview mode, count of short line queries */
   231 long cnt_punct;		/* for overview mode,
   232 			   count of punctuation and spacing queries */
   233 long cnt_dash;		/* for overview mode, count of dash-related queries */
   234 long cnt_word;		/* for overview mode, count of word queries */
   235 long cnt_html;		/* for overview mode, count of html queries */
   236 long cnt_lineend;	/* for overview mode, count of line-end queries */
   237 long cnt_spacend;	/* count of lines with space at end */
   238 long linecnt;		/* count of total lines in the file */
   239 long checked_linecnt;	/* count of lines actually checked */
   240 
/*
 * Forward declarations. proghelp() is called from parse_options() when no
 * file argument is left; procfile() is called from main() with the name of
 * the file to check.
 */
void proghelp(GOptionContext *context);
void procfile(const char *);

/*
 * Directory part of argv[0], set up by main(). Used as one of the places
 * to look for the configuration file and the user typo file.
 */
gchar *running_from;

/*
 * Forward declarations of helper routines.
 * NOTE(review): their definitions are not part of the code reviewed here,
 * so their contracts are not described -- see the definitions.
 */
gboolean mixdigit(const char *);
gchar *getaword(const char **);
char *flgets(char **,long);
void postprocess_for_HTML(char *);
char *linehasmarkup(char *);
char *losemarkup(char *);
gboolean tagcomp(const char *,const char *);
void loseentities(char *);
gboolean isroman(const char *);
void postprocess_for_DP(char *);
void print_as_windows_1252(const char *string);
void print_as_utf_8(const char *string);
   258 
/*
 * NOTE(review): not referenced in the code reviewed here; presumably
 * trees of queried words and periods -- confirm against their users.
 */
GTree *qword,*qperiod;

#ifdef __WIN32__
/*
 * Console output code page in force when the program started, saved by
 * main() and put back by cleanup_on_exit().
 */
UINT saved_cp;
#endif

/*
 * Configuration loaded by parse_config_file(), or NULL if no
 * configuration file was found. Released in main().
 */
GKeyFile *config;
   266 
   267 void config_file_update(GKeyFile *kf)
   268 {
   269     int i;
   270     gboolean sw;
   271     for(i=0;options[i].long_name;i++)
   272     {
   273 	if (g_str_has_prefix(options[i].long_name,"no-"))
   274 	    continue;
   275 	if (options[i].arg==G_OPTION_ARG_NONE)
   276 	{
   277 	    sw=*(gboolean *)options[i].arg_data;
   278 	    if (options[i].flags&G_OPTION_FLAG_REVERSE)
   279 		sw=!sw;
   280 	    g_key_file_set_boolean(kf,"options",options[i].long_name,sw);
   281 	}
   282 	else
   283 	    g_assert_not_reached();
   284     }
   285 }
   286 
   287 void config_file_add_comments(GKeyFile *kf)
   288 {
   289     int i;
   290     gchar *comment;
   291     g_key_file_set_comment(kf,NULL,NULL," Default configuration for bookloupe",
   292       NULL);
   293     for(i=0;options[i].long_name;i++)
   294     {
   295 	if (g_str_has_prefix(options[i].long_name,"no-"))
   296 	    continue;
   297 	comment=g_strconcat(" ",options[i].description,NULL);
   298 	g_key_file_set_comment(kf,"options",options[i].long_name,comment,NULL);
   299 	g_free(comment);
   300     }
   301 }
   302 
   303 void dump_config(void)
   304 {
   305     gchar *s;
   306     if (config)
   307 	config_file_update(config);
   308     else
   309     {
   310 	config=g_key_file_new();
   311 	config_file_update(config);
   312 	config_file_add_comments(config);
   313     }
   314     s=g_key_file_to_data(config,NULL,NULL);
   315     if (s)
   316 	g_print("%s",s);
   317     g_free(s);
   318 }
   319 
   320 GKeyFile *read_config_file(gchar **full_path)
   321 {
   322     int i;
   323     GError *err=NULL;
   324     gchar **search_dirs;
   325     gchar *path;
   326     const char *search_path;
   327     GKeyFile *kf;
   328     kf=g_key_file_new();
   329     search_path=g_getenv("BOOKLOUPE_CONFIG_PATH");
   330     if (search_path)
   331     {
   332 #ifdef __WIN32__
   333 	search_dirs=g_strsplit(search_path,";",0);
   334 #else
   335 	search_dirs=g_strsplit(search_path,":",0);
   336 #endif
   337     }
   338     else
   339     {
   340 	search_dirs=g_new(gchar *,4);
   341 	search_dirs[0]=g_get_current_dir();
   342 	search_dirs[1]=g_strdup(running_from);
   343 	search_dirs[2]=g_strdup(g_get_user_config_dir());
   344 	search_dirs[3]=NULL;
   345     }
   346     for(i=0;search_dirs[i];i++)
   347     {
   348 	path=g_build_filename(search_dirs[i],"bookloupe.ini",NULL);
   349 	if (g_key_file_load_from_file(kf,path,
   350 	  G_KEY_FILE_KEEP_COMMENTS|G_KEY_FILE_KEEP_TRANSLATIONS,&err))
   351 	    break;
   352 	if (!g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   353 	{
   354 	    g_printerr("Bookloupe: Error reading %s\n",path);
   355 	    g_printerr("%s\n",err->message);
   356 	    exit(1);
   357 	}
   358 	g_clear_error(&err);
   359 	g_free(path);
   360 	path=NULL;
   361     }
   362     if (!search_dirs[i])
   363     {
   364 	g_key_file_free(kf);
   365 	kf=NULL;
   366     }
   367     g_strfreev(search_dirs);
   368     if (full_path && kf)
   369 	*full_path=path;
   370     else
   371 	g_free(path);
   372     return kf;
   373 }
   374 
   375 void parse_config_file(void)
   376 {
   377     int i,j;
   378     gchar *path;
   379     gchar **keys;
   380     gboolean sw;
   381     GError *err=NULL;
   382     config=read_config_file(&path);
   383     if (config)
   384 	keys=g_key_file_get_keys(config,"options",NULL,NULL);
   385     else
   386 	keys=NULL;
   387     if (keys)
   388     {
   389 	for(i=0;keys[i];i++)
   390 	{
   391 	    for(j=0;options[j].long_name;j++)
   392 	    {
   393 		if (g_str_has_prefix(options[j].long_name,"no-"))
   394 		    continue;
   395 		else if (!strcmp(keys[i],options[j].long_name))
   396 		{
   397 		    if (options[j].arg==G_OPTION_ARG_NONE)
   398 		    {
   399 			sw=g_key_file_get_boolean(config,"options",keys[i],
   400 			  &err);
   401 			if (err)
   402 			{
   403 			    g_printerr("Bookloupe: %s: options.%s: %s\n",
   404 			      path,keys[i],err->message);
   405 			    g_clear_error(&err);
   406 			}
   407 			if (options[j].flags&G_OPTION_FLAG_REVERSE)
   408 			    sw=!sw;
   409 			*(gboolean *)options[j].arg_data=sw;
   410 			break;
   411 		    }
   412 		    else
   413 			g_assert_not_reached();
   414 		}
   415 	    }
   416 	    if (!options[j].long_name)
   417 		g_printerr("Bookloupe: %s: Unknown option \"%s\" ignored\n",
   418 		  path,keys[i]);
   419 	}
   420 	g_strfreev(keys);
   421     }
   422     if (config)
   423 	g_free(path);
   424 }
   425 
   426 void parse_options(int *argc,char ***argv)
   427 {
   428     GError *err=NULL;
   429     GOptionContext *context;
   430     GOptionGroup *compatibility;
   431     context=g_option_context_new(
   432       "file - look for errors in Project Gutenberg(TM) etexts");
   433     g_option_context_add_main_entries(context,options,NULL);
   434     g_option_context_add_main_entries(context,config_options,NULL);
   435     compatibility=g_option_group_new("compatibility",
   436       "Options for Compatibility with Gutcheck:",
   437       "Show compatibility options",NULL,NULL);
   438     g_option_group_add_entries(compatibility,compatibility_options);
   439     g_option_context_add_group(context,compatibility);
   440     g_option_context_set_description(context,
   441       "For simplicity, only the switch options which reverse the\n"
   442       "default configuration are listed. In most cases, both vanilla\n"
   443       "and \"no-\" prefixed versions are available for use.");
   444     if (!g_option_context_parse(context,argc,argv,&err))
   445     {
   446 	g_printerr("Bookloupe: %s\n",err->message);
   447 	g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
   448 	exit(1);
   449     }
   450     if (typo_compat)
   451 	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
   452     if (paranoid_compat)
   453     {
   454 	pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
   455 	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
   456     }
   457     /*
   458      * Web uploads - for the moment, this is really just a placeholder
   459      * until we decide what processing we really want to do on web uploads
   460      */
   461     if (pswit[WEB_SWITCH])
   462     {
   463 	/* specific override for web uploads */
   464 	pswit[ECHO_SWITCH]=TRUE;
   465 	pswit[SQUOTE_SWITCH]=FALSE;
   466 	pswit[TYPO_SWITCH]=TRUE;
   467 	pswit[QPARA_SWITCH]=FALSE;
   468 	pswit[PARANOID_SWITCH]=TRUE;
   469 	pswit[LINE_END_SWITCH]=FALSE;
   470 	pswit[OVERVIEW_SWITCH]=FALSE;
   471 	pswit[STDOUT_SWITCH]=FALSE;
   472 	pswit[HEADER_SWITCH]=TRUE;
   473 	pswit[VERBOSE_SWITCH]=FALSE;
   474 	pswit[MARKUP_SWITCH]=FALSE;
   475 	pswit[USERTYPO_SWITCH]=FALSE;
   476 	pswit[DP_SWITCH]=FALSE;
   477     }
   478     if (pswit[DUMP_CONFIG_SWITCH])
   479     {
   480 	dump_config();
   481 	exit(0);
   482     }
   483     if (pswit[OVERVIEW_SWITCH])
   484 	/* just print summary; don't echo */
   485 	pswit[ECHO_SWITCH]=FALSE;
   486     if (*argc<2)
   487     {
   488 	proghelp(context);
   489 	exit(1);
   490     }
   491     g_option_context_free(context);
   492 }
   493 
   494 /*
   495  * read_user_scannos:
   496  *
   497  * Read in the user-defined stealth scanno list.
   498  */
   499 void read_user_scannos(void)
   500 {
   501     GError *err=NULL;
   502     gchar *usertypo_file;
   503     gboolean okay;
   504     int i;
   505     gsize len,nb;
   506     gchar *contents,*utf8,**lines;
   507     usertypo_file=g_strdup("bookloupe.typ");
   508     okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   509     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   510     {
   511 	g_clear_error(&err);
   512 	g_free(usertypo_file);
   513 	usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
   514 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   515     }
   516     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   517     {
   518 	g_clear_error(&err);
   519 	g_free(usertypo_file);
   520 	usertypo_file=g_strdup("gutcheck.typ");
   521 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   522     }
   523     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   524     {
   525 	g_clear_error(&err);
   526 	g_free(usertypo_file);
   527 	usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
   528 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   529     }
   530     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   531     {
   532 	g_free(usertypo_file);
   533 	g_print("   --> I couldn't find bookloupe.typ "
   534 	  "-- proceeding without user typos.\n");
   535 	return;
   536     }
   537     else if (!okay)
   538     {
   539 	fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
   540 	g_free(usertypo_file);
   541 	g_clear_error(&err);
   542 	exit(1);
   543     }
   544     if (g_utf8_validate(contents,len,NULL))
   545 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
   546     else
   547 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
   548     g_free(contents);
   549     lines=g_strsplit_set(utf8,"\r\n",0);
   550     g_free(utf8);
   551     usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
   552     for (i=0;lines[i];i++)
   553 	if (*(unsigned char *)lines[i]>'!')
   554 	    g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
   555 	else
   556 	    g_free(lines[i]);
   557     g_free(lines);
   558 }
   559 
   560 /*
   561  * read_etext:
   562  *
   563  * Read an etext returning a newly allocated string containing the file
   564  * contents or NULL on error.
   565  */
   566 gchar *read_etext(const char *filename,GError **err)
   567 {
   568     GError *tmp_err=NULL;
   569     gchar *contents,*utf8;
   570     gsize len,bytes_read,bytes_written;
   571     int i,line,col;
   572     if (!g_file_get_contents(filename,&contents,&len,err))
   573 	return NULL;
   574     if (g_utf8_validate(contents,len,NULL))
   575     {
   576 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
   577 	g_set_print_handler(print_as_utf_8);
   578 #ifdef __WIN32__
   579 	SetConsoleOutputCP(CP_UTF8);
   580 #endif
   581     }
   582     else
   583     {
   584 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,
   585 	  &bytes_written,&tmp_err);
   586 	if (g_error_matches(tmp_err,G_CONVERT_ERROR,
   587 	  G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
   588 	{
   589 	    line=col=1;
   590 	    for(i=0;i<bytes_read;i++)
   591 		if (contents[i]=='\n')
   592 		{
   593 		    line++;
   594 		    col=1;
   595 		}
   596 		else if (contents[i]!='\r')
   597 		    col++;
   598 	    g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
   599 	      "Input conversion failed. Byte %d at line %d, column %d is not a "
   600 	      "valid Windows-1252 character",
   601 	      ((unsigned char *)contents)[bytes_read],line,col);
   602 	}
   603 	else if (tmp_err)
   604 	    g_propagate_error(err,tmp_err);
   605 	g_set_print_handler(print_as_windows_1252);
   606 #ifdef __WIN32__
   607 	SetConsoleOutputCP(1252);
   608 #endif
   609     }
   610     g_free(contents);
   611     return utf8;
   612 }
   613 
   614 void cleanup_on_exit(void)
   615 {
   616 #ifdef __WIN32__
   617     SetConsoleOutputCP(saved_cp);
   618 #endif
   619 }
   620 
   621 int main(int argc,char **argv)
   622 {
   623 #ifdef __WIN32__
   624     atexit(cleanup_on_exit);
   625     saved_cp=GetConsoleOutputCP();
   626 #endif
   627     running_from=g_path_get_dirname(argv[0]);
   628     /* Paranoid checking is turned OFF, not on, by its switch */
   629     pswit[PARANOID_SWITCH]=TRUE;
   630     /* if running in paranoid mode, typo checks default to enabled */
   631     pswit[TYPO_SWITCH]=TRUE;
   632     /* Line-end checking is turned OFF, not on, by its switch */
   633     pswit[LINE_END_SWITCH]=TRUE;
   634     /* Echoing is turned OFF, not on, by its switch */
   635     pswit[ECHO_SWITCH]=TRUE;
   636     parse_config_file();
   637     parse_options(&argc,&argv);
   638     if (pswit[USERTYPO_SWITCH])
   639 	read_user_scannos();
   640     fprintf(stderr,"bookloupe: Check and report on an e-text\n");
   641     procfile(argv[1]);
   642     if (pswit[OVERVIEW_SWITCH])
   643     {
   644 	g_print("    Checked %ld lines of %ld (head+foot = %ld)\n\n",
   645 	  checked_linecnt,linecnt,linecnt-checked_linecnt);
   646 	g_print("    --------------- Queries found --------------\n");
   647 	if (cnt_long)
   648 	    g_print("    Long lines:		    %14ld\n",cnt_long);
   649 	if (cnt_short)
   650 	    g_print("    Short lines:		   %14ld\n",cnt_short);
   651 	if (cnt_lineend)
   652 	    g_print("    Line-end problems:	     %14ld\n",cnt_lineend);
   653 	if (cnt_word)
   654 	    g_print("    Common typos:		  %14ld\n",cnt_word);
   655 	if (cnt_quote)
   656 	    g_print("    Unmatched quotes:	      %14ld\n",cnt_quote);
   657 	if (cnt_brack)
   658 	    g_print("    Unmatched brackets:	    %14ld\n",cnt_brack);
   659 	if (cnt_bin)
   660 	    g_print("    Non-ASCII characters:	  %14ld\n",cnt_bin);
   661 	if (cnt_odd)
   662 	    g_print("    Proofing characters:	   %14ld\n",cnt_odd);
   663 	if (cnt_punct)
   664 	    g_print("    Punctuation & spacing queries: %14ld\n",cnt_punct);
   665 	if (cnt_dash)
   666 	    g_print("    Non-standard dashes:	   %14ld\n",cnt_dash);
   667 	if (cnt_html)
   668 	    g_print("    Possible HTML tags:	    %14ld\n",cnt_html);
   669 	g_print("\n");
   670 	g_print("    TOTAL QUERIES		  %14ld\n",
   671 	  cnt_quote+cnt_brack+cnt_bin+cnt_odd+cnt_long+cnt_short+cnt_punct+
   672 	  cnt_dash+cnt_word+cnt_html+cnt_lineend);
   673     }
   674     g_free(running_from);
   675     if (usertypo)
   676 	g_tree_unref(usertypo);
   677     if (config)
   678 	g_key_file_free(config);
   679     return 0;
   680 }
   681 
   682 void count_dashes(const char *line,const char *dash,
   683   struct dash_results *results)
   684 {
   685     int i;
   686     gchar **tokens;
   687     gunichar pc,nc;
   688     gboolean spaced=FALSE,unspaced=FALSE,spaced2=FALSE;
   689     if (!*line)
   690 	return;
   691     tokens=g_strsplit(line,dash,0);
   692     if (tokens[1])
   693 	results->base++;
   694     for(i=1;tokens[i];i++)
   695     {
   696 	pc=g_utf8_get_char(g_utf8_prev_char(tokens[i-1]+strlen(tokens[i-1])));
   697 	nc=g_utf8_get_char(tokens[i]);
   698 	if (g_unichar_isspace(pc) || g_unichar_isspace(nc))
   699 	    spaced=TRUE;
   700 	if (g_unichar_isspace(pc) && g_unichar_isspace(nc))
   701 	    spaced2=TRUE;
   702 	else if (!g_unichar_isspace(pc) && !g_unichar_isspace(nc))
   703 	    unspaced=TRUE;
   704     }
   705     if (spaced)
   706 	results->space++;
   707     if (spaced2)
   708 	/* count of lines with em-dashes with spaces both sides */
   709 	results->non_PG_space++;
   710     if (unspaced)
   711 	/* count of lines with PG-type em-dashes with no spaces */
   712 	results->PG_space++;
   713     g_strfreev(tokens);
   714 }
   715 
   716 /*
   717  * first_pass:
   718  *
   719  * Run a first pass - verify that it's a valid PG
   720  * file, decide whether to report some things that
   721  * occur many times in the text like long or short
   722  * lines, non-standard dashes, etc.
   723  */
   724 struct first_pass_results *first_pass(const char *etext)
   725 {
   726     gunichar laststart=CHAR_SPACE;
   727     const char *s;
   728     gchar *lc_line;
   729     int i,j,lbytes,llen;
   730     gchar **lines;
   731     unsigned int lastlen=0,lastblen=0;
   732     long spline=0,nspline=0;
   733     static struct first_pass_results results={0};
   734     struct dash_results tmp_dash_results;
   735     gchar *inword;
   736     QuoteClass qc;
   737     lines=g_strsplit(etext,"\n",0);
   738     for (j=0;lines[j];j++)
   739     {
   740 	lbytes=strlen(lines[j]);
   741 	while (lbytes>0 && lines[j][lbytes-1]=='\r')
   742 	    lines[j][--lbytes]='\0';
   743 	llen=g_utf8_strlen(lines[j],lbytes);
   744 	linecnt++;
   745 	if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
   746 	  (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
   747 	{
   748 	    if (spline)
   749 		g_print("   --> Duplicate header?\n");
   750 	    spline=linecnt+1;   /* first line of non-header text, that is */
   751 	}
   752 	if (!strncmp(lines[j],"*** START",9) &&
   753 	  strstr(lines[j],"PROJECT GUTENBERG"))
   754 	{
   755 	    if (nspline)
   756 		g_print("   --> Duplicate header?\n");
   757 	    nspline=linecnt+1;   /* first line of non-header text, that is */
   758 	}
   759 	if (spline || nspline)
   760 	{
   761 	    lc_line=g_utf8_strdown(lines[j],lbytes);
   762 	    if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
   763 	    {
   764 		if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
   765 		{
   766 		    if (results.footerline)
   767 		    {
   768 			/* it's an old-form header - we can detect duplicates */
   769 			if (!nspline)
   770 			    g_print("   --> Duplicate footer?\n");
   771 		    }
   772 		    else
   773 			results.footerline=linecnt;
   774 		}
   775 	    }
   776 	    g_free(lc_line);
   777 	}
   778 	if (spline)
   779 	    results.firstline=spline;
   780 	if (nspline)
   781 	    results.firstline=nspline;  /* override with new */
   782 	if (results.footerline)
   783 	    continue;    /* don't count the boilerplate in the footer */
   784 	results.totlen+=llen;
   785 	for (s=lines[j];*s;s=g_utf8_next_char(s))
   786 	{
   787 	    if (g_utf8_get_char(s)>127)
   788 		results.binlen++;
   789 	    if (g_unichar_isalpha(g_utf8_get_char(s)))
   790 		results.alphalen++;
   791 	    if (s>lines[j])
   792 	    {
   793 		if (CHAR_IS_DQUOTE(g_utf8_get_char(s)))
   794 		    qc=QUOTE_CLASS(g_utf8_get_char(s));
   795 		else
   796 		    qc=INVALID_QUOTE;
   797 		if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) &&
   798 		  g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
   799 		    results.endquote_count++;
   800 	    }
   801 	}
   802 	if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
   803 	  lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
   804 	    results.shortline++;
   805 	if (lbytes>0 &&
   806 	  g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
   807 	    cnt_spacend++;
   808 	if (strstr(lines[j],".,"))
   809 	    results.dotcomma++;
   810 	/* only count ast lines for ignoring purposes where there is */
   811 	/* locase text on the line */
   812 	if (strchr(lines[j],'*'))
   813 	{
   814 	    for (s=lines[j];*s;s=g_utf8_next_char(s))
   815 		if (g_unichar_islower(g_utf8_get_char(s)))
   816 		    break;
   817 	    if (*s)
   818 		results.astline++;
   819 	}
   820 	if (strchr(lines[j],'/'))
   821 	    results.fslashline++;
   822 	if (lbytes>0)
   823 	{
   824 	    for (s=g_utf8_prev_char(lines[j]+lbytes);
   825 	      s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;
   826 	      s=g_utf8_prev_char(s))
   827 		;
   828 	    if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
   829 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')
   830 		results.hyphens++;
   831 	}
   832 	if (llen>LONGEST_PG_LINE)
   833 	    results.longline++;
   834 	if (llen>WAY_TOO_LONG)
   835 	    results.verylongline++;
   836 	if (strchr(lines[j],'<') && strchr(lines[j],'>'))
   837 	{
   838 	    i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
   839 	    if (i>0)
   840 		results.htmcount++;
   841 	    if (strstr(lines[j],"<i>"))
   842 		results.htmcount+=4; /* bonus marks! */
   843 	}
   844 	/* Check for spaced em-dashes */
   845 	memset(&tmp_dash_results,0,sizeof(tmp_dash_results));
   846 	count_dashes(lines[j],"--",&tmp_dash_results);
   847 	count_dashes(lines[j],"—",&tmp_dash_results);
   848 	if (tmp_dash_results.base)
   849 	    results.emdash.base++;
   850 	if (tmp_dash_results.non_PG_space)
   851 	    results.emdash.non_PG_space++;
   852 	if (tmp_dash_results.PG_space)
   853 	    results.emdash.PG_space++;
   854 	for (s=lines[j];*s;)
   855 	{
   856 	    inword=getaword(&s);
   857 	    if (!strcmp(inword,"hij") || !strcmp(inword,"niet")) 
   858 		results.Dutchcount++;
   859 	    if (!strcmp(inword,"dans") || !strcmp(inword,"avec")) 
   860 		results.Frenchcount++;
   861 	    if (!strcmp(inword,"0") || !strcmp(inword,"1")) 
   862 		results.standalone_digit++;
   863 	    g_free(inword);
   864 	}
   865 	/* Check for spaced dashes */
   866 	if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
   867 	    results.spacedash++;
   868 	lastblen=lastlen;
   869 	lastlen=llen;
   870 	laststart=lines[j][0];
   871     }
   872     g_strfreev(lines);
   873     return &results;
   874 }
   875 
   876 /*
   877  * report_first_pass:
   878  *
   879  * Make some snap decisions based on the first pass results.
   880  */
   881 struct warnings *report_first_pass(struct first_pass_results *results)
   882 {
   883     static struct warnings warnings={0};
   884     if (cnt_spacend>0)
   885 	g_print("   --> %ld lines in this file have white space at end\n",
   886 	  cnt_spacend);
   887     warnings.dotcomma=1;
   888     if (results->dotcomma>5)
   889     {
   890 	warnings.dotcomma=0;
   891 	g_print("   --> %ld lines in this file contain '.,'. "
   892 	  "Not reporting them.\n",results->dotcomma);
   893     }
   894     /*
   895      * If more than 50 lines, or one-tenth, are short,
   896      * don't bother reporting them.
   897      */
   898     warnings.shortline=1;
   899     if (results->shortline>50 || results->shortline*10>linecnt)
   900     {
   901 	warnings.shortline=0;
   902 	g_print("   --> %ld lines in this file are short. "
   903 	  "Not reporting short lines.\n",results->shortline);
   904     }
   905     /*
   906      * If more than 50 lines, or one-tenth, are long,
   907      * don't bother reporting them.
   908      */
   909     warnings.longline=1;
   910     if (results->longline>50 || results->longline*10>linecnt)
   911     {
   912 	warnings.longline=0;
   913 	g_print("   --> %ld lines in this file are long. "
   914 	  "Not reporting long lines.\n",results->longline);
   915     }
   916     /* If more than 10 lines contain asterisks, don't bother reporting them. */
   917     warnings.ast=1;
   918     if (results->astline>10)
   919     {
   920 	warnings.ast=0;
   921 	g_print("   --> %ld lines in this file contain asterisks. "
   922 	  "Not reporting them.\n",results->astline);
   923     }
   924     /*
   925      * If more than 10 lines contain forward slashes,
   926      * don't bother reporting them.
   927      */
   928     warnings.fslash=1;
   929     if (results->fslashline>10)
   930     {
   931 	warnings.fslash=0;
   932 	g_print("   --> %ld lines in this file contain forward slashes. "
   933 	  "Not reporting them.\n",results->fslashline);
   934     }
   935     /*
   936      * If more than 20 lines contain unpunctuated endquotes,
   937      * don't bother reporting them.
   938      */
   939     warnings.endquote=1;
   940     if (results->endquote_count>20)
   941     {
   942 	warnings.endquote=0;
   943 	g_print("   --> %ld lines in this file contain unpunctuated endquotes. "
   944 	  "Not reporting them.\n",results->endquote_count);
   945     }
   946     /*
   947      * If more than 15 lines contain standalone digits,
   948      * don't bother reporting them.
   949      */
   950     warnings.digit=1;
   951     if (results->standalone_digit>10)
   952     {
   953 	warnings.digit=0;
   954 	g_print("   --> %ld lines in this file contain standalone 0s and 1s. "
   955 	  "Not reporting them.\n",results->standalone_digit);
   956     }
   957     /*
   958      * If more than 20 lines contain hyphens at end,
   959      * don't bother reporting them.
   960      */
   961     warnings.hyphen=1;
   962     if (results->hyphens>20)
   963     {
   964 	warnings.hyphen=0;
   965 	g_print("   --> %ld lines in this file have hyphens at end. "
   966 	  "Not reporting them.\n",results->hyphens);
   967     }
   968     if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
   969     {
   970 	g_print("   --> Looks like this is HTML. Switching HTML mode ON.\n");
   971 	pswit[MARKUP_SWITCH]=1;
   972     }
   973     if (results->verylongline>0)
   974 	g_print("   --> %ld lines in this file are VERY long!\n",
   975 	  results->verylongline);
   976     /*
   977      * If there are more non-PG spaced dashes than PG em-dashes,
   978      * assume it's deliberate.
   979      * Current PG guidelines say don't use them, but older texts do,
   980      * and some people insist on them whatever the guidelines say.
   981      */
   982     warnings.dash=1;
   983     if (results->spacedash+results->emdash.non_PG_space>
   984       results->emdash.PG_space)
   985     {
   986 	warnings.dash=0;
   987 	g_print("   --> There are %ld spaced dashes and em-dashes. "
   988 	  "Not reporting them.\n",
   989 	  results->spacedash+results->emdash.non_PG_space);
   990     }
   991     /* If more than a quarter of characters are hi-bit, bug out. */
   992     warnings.bin=1;
   993     if (results->binlen*4>results->totlen)
   994     {
   995 	g_print("   --> This file does not appear to be ASCII. "
   996 	  "Terminating. Best of luck with it!\n");
   997 	exit(1);
   998     }
   999     if (results->alphalen*4<results->totlen)
  1000     {
  1001 	g_print("   --> This file does not appear to be text. "
  1002 	  "Terminating. Best of luck with it!\n");
  1003 	exit(1);
  1004     }
  1005     if (results->binlen*100>results->totlen || results->binlen>100)
  1006     {
  1007 	g_print("   --> There are a lot of foreign letters here. "
  1008 	  "Not reporting them.\n");
  1009 	warnings.bin=0;
  1010     }
  1011     warnings.isDutch=FALSE;
  1012     if (results->Dutchcount>50)
  1013     {
  1014 	warnings.isDutch=TRUE;
  1015 	g_print("   --> This looks like Dutch - "
  1016 	  "switching off dashes and warnings for 's Middags case.\n");
  1017     }
  1018     warnings.isFrench=FALSE;
  1019     if (results->Frenchcount>50)
  1020     {
  1021 	warnings.isFrench=TRUE;
  1022 	g_print("   --> This looks like French - "
  1023 	  "switching off some doublepunct.\n");
  1024     }
  1025     if (results->firstline && results->footerline)
  1026 	g_print("    The PG header and footer appear to be already on.\n");
  1027     else
  1028     {
  1029 	if (results->firstline)
  1030 	    g_print("    The PG header is on - no footer.\n");
  1031 	if (results->footerline)
  1032 	    g_print("    The PG footer is on - no header.\n");
  1033     }
  1034     g_print("\n");
  1035     if (pswit[VERBOSE_SWITCH])
  1036     {
  1037 	warnings.bin=1;
  1038 	warnings.shortline=1;
  1039 	warnings.dotcomma=1;
  1040 	warnings.longline=1;
  1041 	warnings.dash=1;
  1042 	warnings.digit=1;
  1043 	warnings.ast=1;
  1044 	warnings.fslash=1;
  1045 	warnings.hyphen=1;
  1046 	warnings.endquote=1;
  1047 	g_print("   *** Verbose output is ON -- you asked for it! ***\n");
  1048     }
  1049     if (warnings.isDutch)
  1050 	warnings.dash=0;
  1051     if (results->footerline>0 && results->firstline>0 &&
  1052       results->footerline>results->firstline &&
  1053       results->footerline-results->firstline<100)
  1054     {
  1055 	g_print("   --> I don't really know where this text starts. \n");
  1056 	g_print("       There are no reference points.\n");
  1057 	g_print("       I'm going to have to report the header and footer "
  1058 	  "as well.\n");
  1059 	results->firstline=0;
  1060     }
  1061     return &warnings;
  1062 }
  1063 
  1064 /*
  1065  * analyse_quotes:
  1066  *
  1067  * Look along the line, accumulate the count of quotes, and see
  1068  * if this is an empty line - i.e. a line with nothing on it
  1069  * but spaces.
  1070  * If line has just spaces, period, * and/or - on it, don't
  1071  * count it, since empty lines with asterisks or dashes to
  1072  * separate sections are common.
  1073  *
  1074  * Returns: TRUE if the line is empty.
  1075  */
  1076 gboolean analyse_quotes(const char *aline,struct counters *counters)
  1077 {
  1078     int guessquote=0;
  1079     /* assume the line is empty until proven otherwise */
  1080     gboolean isemptyline=TRUE;
  1081     const char *s=aline,*sprev,*snext;
  1082     gunichar c;
  1083     sprev=NULL;
  1084     GError *tmp_err=NULL;
  1085     while (*s)
  1086     {
  1087 	snext=g_utf8_next_char(s);
  1088 	c=g_utf8_get_char(s);
  1089 	if (CHAR_IS_DQUOTE(c))
  1090 	    (void)count_quote(counters,c,QUOTE_CLASS(c),&tmp_err);
  1091 	else if (CHAR_IS_SQUOTE(c) && pswit[SQUOTE_SWITCH])
  1092 	{
  1093 	    if (s==aline)
  1094 	    {
  1095 		/*
  1096 		 * At start of line, it can only be a quotation mark.
  1097 		 * Hardcode a very common exception!
  1098 		 */
  1099 		if (!g_str_has_prefix(snext,"tis") &&
  1100 		  !g_str_has_prefix(snext,"Tis"))
  1101 		    (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
  1102 	    }
  1103 	    else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
  1104 	      g_unichar_isalpha(g_utf8_get_char(snext)))
  1105 		/* Do nothing! it's definitely an apostrophe, not a quote */
  1106 		;
  1107 	    /* it's outside a word - let's check it out */
  1108 	    else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||
  1109 	      g_unichar_isalpha(g_utf8_get_char(snext)))
  1110 	    {
  1111 		/* certainly looks like a quotation mark */
  1112 		if (!g_str_has_prefix(snext,"tis") &&
  1113 		  !g_str_has_prefix(snext,"Tis"))
  1114 		    /* hardcode a very common exception! */
  1115 		{
  1116 		    if (strchr(".?!,;:",g_utf8_get_char(sprev)))
  1117 			(void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
  1118 		    else
  1119 			(void)count_quote(counters,c,OPENING_QUOTE,&tmp_err);
  1120 		}
  1121 	    }
  1122 	    else
  1123 	    {
  1124 		/* now - is it a quotation mark? */
  1125 		guessquote=0;   /* accumulate clues */
  1126 		if (g_unichar_isalpha(g_utf8_get_char(sprev)))
  1127 		{
  1128 		    /* it follows a letter - could be either */
  1129 		    guessquote++;
  1130 		    if (g_utf8_get_char(sprev)=='s')
  1131 		    {
  1132 			/* looks like a plural apostrophe */
  1133 			guessquote-=3;
  1134 			if (g_utf8_get_char(snext)==CHAR_SPACE)
  1135 			    /* bonus marks! */
  1136 			    guessquote-=2;
  1137 		    }
  1138 		    if (innermost_quote_matches(counters,c))
  1139 			/*
  1140 			 * Give it the benefit of some doubt,
  1141 			 * if a squote is already open.
  1142 			 */
  1143 			guessquote++;
  1144 		    else
  1145 			guessquote--;
  1146 		    if (guessquote>=0)
  1147 			(void)count_quote(counters,c,CLOSING_QUOTE,&tmp_err);
  1148 		}
  1149 		else
  1150 		    /* no adjacent letter - it must be a quote of some kind */
  1151 		    (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
  1152 	    }
  1153 	}
  1154 	if (tmp_err)
  1155 	{
  1156 	    if (pswit[ECHO_SWITCH])
  1157 		g_print("\n%s\n",aline);
  1158 	    if (!pswit[OVERVIEW_SWITCH])
  1159 		g_print("    Line %ld column %ld - %s\n",
  1160 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1,tmp_err->message);
  1161 	    g_clear_error(&tmp_err);
  1162 	}
  1163 	if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
  1164 	  c!='\r' && c!='\n')
  1165 	    isemptyline=FALSE;  /* ignore lines like  *  *  *  as spacers */
  1166 	if (c==CHAR_UNDERSCORE)
  1167 	    counters->c_unders++;
  1168 	if (c==CHAR_OPEN_SBRACK)
  1169 	{
  1170 	    if (!matching_difference(counters,COUNTER_ILLUSTRATION) &&
  1171 	      !matching_difference(counters,c) && s==aline &&
  1172 	      g_str_has_prefix(s,"[Illustration:"))
  1173 		increment_matching(counters,COUNTER_ILLUSTRATION,TRUE);
  1174 	    else
  1175 		increment_matching(counters,c,TRUE);
  1176 	}
  1177 	else if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK)
  1178 	    increment_matching(counters,c,TRUE);
  1179 	if (c==CHAR_CLOSE_SBRACK)
  1180 	{
  1181 	    if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) &&
  1182 	      !matching_difference(counters,c) && !*snext)
  1183 		increment_matching(counters,COUNTER_ILLUSTRATION,FALSE);
  1184 	    else
  1185 		increment_matching(counters,c,FALSE);
  1186 	}
  1187 	else if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK)
  1188 	    increment_matching(counters,c,FALSE);
  1189 	sprev=s;
  1190 	s=snext;
  1191     }
  1192     return isemptyline;
  1193 }
  1194 
  1195 /*
  1196  * check_for_control_characters:
  1197  *
  1198  * Check for invalid or questionable characters in the line
  1199  * Anything above 127 is invalid for plain ASCII, and
  1200  * non-printable control characters should also be flagged.
  1201  * Tabs should generally not be there.
  1202  */
  1203 void check_for_control_characters(const char *aline)
  1204 {
  1205     gunichar c;
  1206     const char *s;
  1207     for (s=aline;*s;s=g_utf8_next_char(s))
  1208     {
  1209 	c=g_utf8_get_char(s);
  1210 	if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
  1211 	{
  1212 	    if (pswit[ECHO_SWITCH])
  1213 		g_print("\n%s\n",aline);
  1214 	    if (!pswit[OVERVIEW_SWITCH])
  1215 		g_print("    Line %ld column %ld - Control character %u\n",
  1216 		  linecnt,g_utf8_pointer_to_offset(s,aline)+1,c);
  1217 	    else
  1218 		cnt_bin++;
  1219 	}
  1220     }
  1221 }
  1222 
  1223 /*
  1224  * check_for_odd_characters:
  1225  *
  1226  * Check for binary and other odd characters.
  1227  */
  1228 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
  1229   gboolean isemptyline)
  1230 {
  1231     /* Don't repeat multiple warnings on one line. */
  1232     gboolean eNon_A=FALSE,eTab=FALSE,eTilde=FALSE;
  1233     gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
  1234     const char *s;
  1235     gunichar c;
  1236     for (s=aline;*s;s=g_utf8_next_char(s))
  1237     {
  1238 	c=g_utf8_get_char(s);
  1239 	if (!eNon_A && (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
  1240 	{
  1241 	    if (pswit[ECHO_SWITCH])
  1242 		g_print("\n%s\n",aline);
  1243 	    if (!pswit[OVERVIEW_SWITCH])
  1244 		if (c>127 && c<160 || c>255)
  1245 		    g_print("    Line %ld column %ld - "
  1246 		      "Non-ISO-8859 character %u\n",
  1247 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1248 		else
  1249 		    g_print("    Line %ld column %ld - "
  1250 		      "Non-ASCII character %u\n",
  1251 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1252 	    else
  1253 		cnt_bin++;
  1254 	    eNon_A=TRUE;
  1255 	}
  1256 	if (!eTab && c==CHAR_TAB)
  1257 	{
  1258 	    if (pswit[ECHO_SWITCH])
  1259 		g_print("\n%s\n",aline);
  1260 	    if (!pswit[OVERVIEW_SWITCH])
  1261 		g_print("    Line %ld column %ld - Tab character?\n",
  1262 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1263 	    else
  1264 		cnt_odd++;
  1265 	    eTab=TRUE;
  1266 	}
  1267 	if (!eTilde && c==CHAR_TILDE)
  1268 	{
  1269 	    /*
  1270 	     * Often used by OCR software to indicate an
  1271 	     * unrecognizable character.
  1272 	     */
  1273 	    if (pswit[ECHO_SWITCH])
  1274 		g_print("\n%s\n",aline);
  1275 	    if (!pswit[OVERVIEW_SWITCH])
  1276 		g_print("    Line %ld column %ld - Tilde character?\n",
  1277 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1278 	    else
  1279 		cnt_odd++;
  1280 	    eTilde=TRUE;
  1281 	}
  1282 	if (!eCarat && c==CHAR_CARAT)
  1283 	{  
  1284 	    if (pswit[ECHO_SWITCH])
  1285 		g_print("\n%s\n",aline);
  1286 	    if (!pswit[OVERVIEW_SWITCH])
  1287 		g_print("    Line %ld column %ld - Carat character?\n",
  1288 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1289 	    else
  1290 		cnt_odd++;
  1291 	    eCarat=TRUE;
  1292 	}
  1293 	if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
  1294 	{  
  1295 	    if (pswit[ECHO_SWITCH])
  1296 		g_print("\n%s\n",aline);
  1297 	    if (!pswit[OVERVIEW_SWITCH])
  1298 		g_print("    Line %ld column %ld - Forward slash?\n",
  1299 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1300 	    else
  1301 		cnt_odd++;
  1302 	    eFSlash=TRUE;
  1303 	}
  1304 	/*
  1305 	 * Report asterisks only in paranoid mode,
  1306 	 * since they're often deliberate.
  1307 	 */
  1308 	if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
  1309 	  c==CHAR_ASTERISK)
  1310 	{
  1311 	    if (pswit[ECHO_SWITCH])
  1312 		g_print("\n%s\n",aline);
  1313 	    if (!pswit[OVERVIEW_SWITCH])
  1314 		g_print("    Line %ld column %ld - Asterisk?\n",
  1315 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1316 	    else
  1317 		cnt_odd++;
  1318 	    eAst=TRUE;
  1319 	}
  1320     }
  1321 }
  1322 
  1323 /*
  1324  * check_for_long_line:
  1325  *
  1326  * Check for line too long.
  1327  */
  1328 void check_for_long_line(const char *aline)
  1329 {
  1330     if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
  1331     {
  1332 	if (pswit[ECHO_SWITCH])
  1333 	    g_print("\n%s\n",aline);
  1334 	if (!pswit[OVERVIEW_SWITCH])
  1335 	    g_print("    Line %ld column %ld - Long line %ld\n",
  1336 	      linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
  1337 	else
  1338 	    cnt_long++;
  1339     }
  1340 }
  1341 
  1342 /*
  1343  * check_for_short_line:
  1344  *
  1345  * Check for line too short.
  1346  *
  1347  * This one is a bit trickier to implement: we don't want to
  1348  * flag the last line of a paragraph for being short, so we
  1349  * have to wait until we know that our current line is a
  1350  * "normal" line, then report the _previous_ line if it was too
  1351  * short. We also don't want to report indented lines like
  1352  * chapter heads or formatted quotations. We therefore keep
  1353  * last->len as the length of the last line examined, and
  1354  * last->blen as the length of the last but one, and try to
  1355  * suppress unnecessary warnings by checking that both were of
  1356  * "normal" length. We keep the first character of the last
  1357  * line in last->start, and if it was a space, we assume that
  1358  * the formatting is deliberate. I can't figure out a way to
  1359  * distinguish something like a quoted verse left-aligned or
  1360  * the header or footer of a letter from a paragraph of short
  1361  * lines - maybe if I examined the whole paragraph, and if the
  1362  * para has less than, say, 8 lines and if all lines are short,
  1363  * then just assume it's OK? Need to look at some texts to see
  1364  * how often a formula like this would get the right result.
  1365  */
  1366 void check_for_short_line(const char *aline,const struct line_properties *last)
  1367 {
  1368     if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
  1369       last->len<SHORTEST_PG_LINE && last->blen>1 &&
  1370       last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
  1371     {
  1372 	if (pswit[ECHO_SWITCH])
  1373 	    g_print("\n%s\n",prevline);
  1374 	if (!pswit[OVERVIEW_SWITCH])
  1375 	    g_print("    Line %ld column %ld - Short line %ld?\n",
  1376 	      linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
  1377 	else
  1378 	    cnt_short++;
  1379     }
  1380 }
  1381 
  1382 /*
  1383  * check_for_starting_punctuation:
  1384  *
  1385  * Look for punctuation other than full ellipses at start of line.
  1386  */
  1387 void check_for_starting_punctuation(const char *aline)
  1388 {
  1389     if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
  1390       !g_str_has_prefix(aline,". . ."))
  1391     {
  1392 	if (pswit[ECHO_SWITCH])
  1393 	    g_print("\n%s\n",aline);
  1394 	if (!pswit[OVERVIEW_SWITCH])
  1395 	    g_print("    Line %ld column 1 - Begins with punctuation?\n",
  1396 	      linecnt);
  1397 	else
  1398 	    cnt_punct++;
  1399     }
  1400 }
  1401 
  1402 /*
  1403  * str_emdash:
  1404  *
  1405  * Find the first em-dash, return a pointer to it and set <next> to the
  1406  * character following the dash.
  1407  */
  1408 char *str_emdash(const char *s,const char **next)
  1409 {
  1410     const char *s1,*s2;
  1411     s1=strstr(s,"--");
  1412     s2=strstr(s,"—");
  1413     if (!s1)
  1414     {
  1415 	if (s2)
  1416 	    *next=g_utf8_next_char(s2);
  1417 	return (char *)s2;
  1418     }
  1419     else if (!s2)
  1420     {
  1421 	*next=g_utf8_next_char(g_utf8_next_char(s1));
  1422 	return (char *)s1;
  1423     }
  1424     else if (s1<s2)
  1425     {
  1426 	*next=g_utf8_next_char(g_utf8_next_char(s1));
  1427 	return (char *)s1;
  1428     }
  1429     else
  1430     {
  1431 	*next=g_utf8_next_char(s2);
  1432 	return (char *)s2;
  1433     }
  1434 }
  1435 
  1436 /*
  1437  * check_for_spaced_emdash:
  1438  *
  1439  * Check for spaced em-dashes.
  1440  *
  1441  * We must check _all_ occurrences of em-dashes on the line
  1442  * hence the loop - even if the first dash is OK
  1443  * there may be another that's wrong later on.
  1444  */
  1445 void check_for_spaced_emdash(const char *aline)
  1446 {
  1447     const char *s,*t,*next;
  1448     for (s=aline;t=str_emdash(s,&next);s=next)
  1449     {
  1450 	if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
  1451 	  g_utf8_get_char(next)==CHAR_SPACE)
  1452 	{
  1453 	    if (pswit[ECHO_SWITCH])
  1454 		g_print("\n%s\n",aline);
  1455 	    if (!pswit[OVERVIEW_SWITCH])
  1456 		g_print("    Line %ld column %ld - Spaced em-dash?\n",
  1457 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  1458 	    else
  1459 		cnt_dash++;
  1460 	}
  1461     }
  1462 }
  1463 
  1464 /*
  1465  * check_for_spaced_dash:
  1466  *
  1467  * Check for spaced dashes.
  1468  */
  1469 void check_for_spaced_dash(const char *aline)
  1470 {
  1471     const char *s;
  1472     if ((s=strstr(aline," -")))
  1473     {
  1474 	if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
  1475 	{
  1476 	    if (pswit[ECHO_SWITCH])
  1477 		g_print("\n%s\n",aline);
  1478 	    if (!pswit[OVERVIEW_SWITCH])
  1479 		g_print("    Line %ld column %ld - Spaced dash?\n",
  1480 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1481 	    else
  1482 		cnt_dash++;
  1483 	}
  1484     }
  1485     else if ((s=strstr(aline,"- ")))
  1486     {
  1487 	if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
  1488 	{
  1489 	    if (pswit[ECHO_SWITCH])
  1490 		g_print("\n%s\n",aline);
  1491 	    if (!pswit[OVERVIEW_SWITCH])
  1492 		g_print("    Line %ld column %ld - Spaced dash?\n",
  1493 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1494 	    else
  1495 		cnt_dash++;
  1496 	}
  1497     }
  1498 }
  1499 
  1500 /*
  1501  * check_for_unmarked_paragraphs:
  1502  *
  1503  * Check for unmarked paragraphs indicated by separate speakers.
  1504  *
  1505  * May well be false positive:
  1506  * "Bravo!" "Wonderful!" called the crowd.
  1507  * but useful all the same.
  1508  */
  1509 void check_for_unmarked_paragraphs(const char *aline)
  1510 {
  1511     const char *s;
  1512     s=strstr(aline,"\"  \"");
  1513     if (!s)
  1514 	s=strstr(aline,"\" \"");
  1515     if (s)
  1516     {
  1517 	if (pswit[ECHO_SWITCH])
  1518 	    g_print("\n%s\n",aline);
  1519 	if (!pswit[OVERVIEW_SWITCH])
  1520 	    g_print("    Line %ld column %ld - "
  1521 	      "Query missing paragraph break?\n",
  1522 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1523 	else
  1524 	    cnt_punct++;
  1525     }
  1526 }
  1527 
  1528 /*
  1529  * check_for_jeebies:
  1530  *
  1531  * Check for "to he" and other easy h/b errors.
  1532  *
  1533  * This is a very inadequate effort on the h/b problem,
  1534  * but the phrase "to he" is always an error, whereas "to
  1535  * be" is quite common.
  1536  * Similarly, '"Quiet!", be said.' is a non-be error
  1537  * "to he" is _not_ always an error!:
  1538  *       "Where they went to he couldn't say."
  1539  * Another false positive:
  1540  *       What would "Cinderella" be without the . . .
  1541  * and another: "If he wants to he can see for himself."
  1542  */
  1543 void check_for_jeebies(const char *aline)
  1544 {
  1545     const char *s;
  1546     s=strstr(aline," be could ");
  1547     if (!s)
  1548 	s=strstr(aline," be would ");
  1549     if (!s)
  1550 	s=strstr(aline," was be ");
  1551     if (!s)
  1552 	s=strstr(aline," be is ");
  1553     if (!s)
  1554 	s=strstr(aline," is be ");
  1555     if (!s)
  1556 	s=strstr(aline,"\", be ");
  1557     if (!s)
  1558 	s=strstr(aline,"\" be ");
  1559     if (!s)
  1560 	s=strstr(aline,"\" be ");
  1561     if (!s)
  1562 	s=strstr(aline," to he ");
  1563     if (s)
  1564     {
  1565 	if (pswit[ECHO_SWITCH])
  1566 	    g_print("\n%s\n",aline);
  1567 	if (!pswit[OVERVIEW_SWITCH])
  1568 	    g_print("    Line %ld column %ld - Query he/be error?\n",
  1569 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1570 	else
  1571 	    cnt_word++;
  1572     }
  1573     s=strstr(aline," the had ");
  1574     if (!s)
  1575 	s=strstr(aline," a had ");
  1576     if (!s)
  1577 	s=strstr(aline," they bad ");
  1578     if (!s)
  1579 	s=strstr(aline," she bad ");
  1580     if (!s)
  1581 	s=strstr(aline," he bad ");
  1582     if (!s)
  1583 	s=strstr(aline," you bad ");
  1584     if (!s)
  1585 	s=strstr(aline," i bad ");
  1586     if (s)
  1587     {
  1588 	if (pswit[ECHO_SWITCH])
  1589 	    g_print("\n%s\n",aline);
  1590 	if (!pswit[OVERVIEW_SWITCH])
  1591 	    g_print("    Line %ld column %ld - Query had/bad error?\n",
  1592 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1593 	else
  1594 	    cnt_word++;
  1595     }
  1596     s=strstr(aline,"; hut ");
  1597     if (!s)
  1598 	s=strstr(aline,", hut ");
  1599     if (s)
  1600     {
  1601 	if (pswit[ECHO_SWITCH])
  1602 	    g_print("\n%s\n",aline);
  1603 	if (!pswit[OVERVIEW_SWITCH])
  1604 	    g_print("    Line %ld column %ld - Query hut/but error?\n",
  1605 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1606 	else
  1607 	    cnt_word++;
  1608     }
  1609 }
  1610 
  1611 /*
  1612  * check_for_mta_from:
  1613  *
  1614  * Special case - angled bracket in front of "From" placed there by an
  1615  * MTA when sending an e-mail.
  1616  */
  1617 void check_for_mta_from(const char *aline)
  1618 {
  1619     const char *s;
  1620     s=strstr(aline,">From");
  1621     if (s)
  1622     {
  1623 	if (pswit[ECHO_SWITCH])
  1624 	    g_print("\n%s\n",aline);
  1625 	if (!pswit[OVERVIEW_SWITCH])
  1626 	    g_print("    Line %ld column %ld - "
  1627 	      "Query angled bracket with From\n",
  1628 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1629 	else
  1630 	    cnt_punct++;
  1631     }
  1632 }
  1633 
  1634 /*
  1635  * check_for_orphan_character:
  1636  *
  1637  * Check for a single character line -
  1638  * often an overflow from bad wrapping.
  1639  */
  1640 void check_for_orphan_character(const char *aline)
  1641 {
  1642     gunichar c;
  1643     c=g_utf8_get_char(aline);
  1644     if (c && !*g_utf8_next_char(aline))
  1645     {
  1646 	if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
  1647 	    ; /* Nothing - ignore numerals alone on a line. */
  1648 	else
  1649 	{
  1650 	    if (pswit[ECHO_SWITCH])
  1651 		g_print("\n%s\n",aline);
  1652 	    if (!pswit[OVERVIEW_SWITCH])
  1653 		g_print("    Line %ld column 1 - Query single character line\n",
  1654 		  linecnt);
  1655 	    else
  1656 		cnt_punct++;
  1657 	}
  1658     }
  1659 }
  1660 
  1661 /*
  1662  * check_for_pling_scanno:
  1663  *
  1664  * Check for I" - often should be !
  1665  */
  1666 void check_for_pling_scanno(const char *aline)
  1667 {
  1668     const char *s;
  1669     s=strstr(aline," I\"");
  1670     if (s)
  1671     {
  1672 	if (pswit[ECHO_SWITCH])
  1673 	    g_print("\n%s\n",aline);
  1674 	if (!pswit[OVERVIEW_SWITCH])
  1675 	    g_print("    Line %ld column %ld - Query I=exclamation mark?\n",
  1676 	      linecnt,g_utf8_pointer_to_offset(aline,s));
  1677 	else
  1678 	    cnt_punct++;
  1679     }
  1680 }
  1681 
  1682 /*
  1683  * check_for_extra_period:
  1684  *
  1685  * Check for period without a capital letter. Cut-down from gutspell.
  1686  * Only works when it happens on a single line.
  1687  */
  1688 void check_for_extra_period(const char *aline,const struct warnings *warnings)
  1689 {
  1690     const char *s,*t,*s1,*sprev;
  1691     int i;
  1692     gsize len;
  1693     gboolean istypo;
  1694     gchar *testword;
  1695     gunichar c,nc,pc,*decomposition;
  1696     if (pswit[PARANOID_SWITCH])
  1697     {
  1698 	for (t=aline;t=strstr(t,". ");)
  1699 	{
  1700 	    if (t==aline)
  1701 	    {
  1702 		t=g_utf8_next_char(t);
  1703 		/* start of line punctuation is handled elsewhere */
  1704 		continue;
  1705 	    }
  1706 	    if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
  1707 	    {
  1708 		t=g_utf8_next_char(t);
  1709 		continue;
  1710 	    }
  1711 	    if (warnings->isDutch)
  1712 	    {
  1713 		/* For Frank & Jeroen -- 's Middags case */
  1714 		gunichar c2,c3,c4,c5;
  1715 		c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
  1716 		c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
  1717 		c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
  1718 		c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
  1719 		if (CHAR_IS_APOSTROPHE(c2) &&
  1720 		  g_unichar_islower(c3) && c4==CHAR_SPACE &&
  1721 		  g_unichar_isupper(c5))
  1722 		{
  1723 		    t=g_utf8_next_char(t);
  1724 		    continue;
  1725 		}
  1726 	    }
  1727 	    s1=g_utf8_next_char(g_utf8_next_char(t));
  1728 	    while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
  1729 	      !g_unichar_isdigit(g_utf8_get_char(s1)))
  1730 		s1=g_utf8_next_char(s1);
  1731 	    if (g_unichar_islower(g_utf8_get_char(s1)))
  1732 	    {
  1733 		/* we have something to investigate */
  1734 		istypo=TRUE;
  1735 		/* so let's go back and find out */
  1736 		nc=g_utf8_get_char(t);
  1737 		s1=g_utf8_prev_char(t);
  1738 		c=g_utf8_get_char(s1);
  1739 		sprev=g_utf8_prev_char(s1);
  1740 		pc=g_utf8_get_char(sprev);
  1741 		while (s1>=aline &&
  1742 		  (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||
  1743 		  g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&
  1744 		  g_unichar_isalpha(nc)))
  1745 		{
  1746 		    nc=c;
  1747 		    s1=sprev;
  1748 		    c=pc;
  1749 		    sprev=g_utf8_prev_char(s1);
  1750 		    pc=g_utf8_get_char(sprev);
  1751 		}
  1752 		s1=g_utf8_next_char(s1);
  1753 		s=strchr(s1,'.');
  1754 		if (s)
  1755 		    testword=g_strndup(s1,s-s1);
  1756 		else
  1757 		    testword=g_strdup(s1);
  1758 		for (i=0;*abbrev[i];i++)
  1759 		    if (!strcmp(testword,abbrev[i]))
  1760 			istypo=FALSE;
  1761 		if (g_unichar_isdigit(g_utf8_get_char(testword)))
  1762 		    istypo=FALSE;
  1763 		if (!*g_utf8_next_char(testword))
  1764 		    istypo=FALSE;
  1765 		if (isroman(testword))
  1766 		    istypo=FALSE;
  1767 		if (istypo)
  1768 		{
  1769 		    istypo=FALSE;
  1770 		    for (s=testword;*s;s=g_utf8_next_char(s))
  1771 		    {
  1772 			decomposition=g_unicode_canonical_decomposition(
  1773 			  g_utf8_get_char(s),&len);
  1774 			if (g_utf8_strchr("aeiou",-1,decomposition[0]))
  1775 			    istypo=TRUE;
  1776 			g_free(decomposition);
  1777 		    }
  1778 		}
  1779 		if (istypo &&
  1780 		  (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
  1781 		{
  1782 		    g_tree_insert(qperiod,g_strdup(testword),
  1783 		      GINT_TO_POINTER(1));
  1784 		    if (pswit[ECHO_SWITCH])
  1785 			g_print("\n%s\n",aline);
  1786 		    if (!pswit[OVERVIEW_SWITCH])
  1787 			g_print("    Line %ld column %ld - Extra period?\n",
  1788 			  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  1789 		    else
  1790 			cnt_punct++;
  1791 		}
  1792 		g_free(testword);
  1793 	    }
  1794 	    t=g_utf8_next_char(t);
  1795 	}
  1796     }
  1797 }
  1798 
  1799 /*
  1800  * check_for_following_punctuation:
  1801  *
  1802  * Check for words usually not followed by punctuation.
  1803  */
  1804 void check_for_following_punctuation(const char *aline)
  1805 {
  1806     int i;
  1807     const char *s,*wordstart;
  1808     gunichar c;
  1809     gchar *inword,*t;
  1810     if (pswit[TYPO_SWITCH])
  1811     {
  1812 	for (s=aline;*s;)
  1813 	{
  1814 	    wordstart=s;
  1815 	    t=getaword(&s);
  1816 	    if (!*t)
  1817 	    {
  1818 		g_free(t);
  1819 		continue;
  1820 	    }
  1821 	    inword=g_utf8_strdown(t,-1);
  1822 	    g_free(t);
  1823 	    for (i=0;*nocomma[i];i++)
  1824 		if (!strcmp(inword,nocomma[i]))
  1825 		{
  1826 		    c=g_utf8_get_char(s);
  1827 		    if (c==',' || c==';' || c==':')
  1828 		    {
  1829 			if (pswit[ECHO_SWITCH])
  1830 			    g_print("\n%s\n",aline);
  1831 			if (!pswit[OVERVIEW_SWITCH])
  1832 			    g_print("    Line %ld column %ld - "
  1833 			      "Query punctuation after %s?\n",
  1834 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,
  1835 			      inword);
  1836 			else
  1837 			    cnt_punct++;
  1838 		    }
  1839 		}
  1840 	    for (i=0;*noperiod[i];i++)
  1841 		if (!strcmp(inword,noperiod[i]))
  1842 		{
  1843 		    c=g_utf8_get_char(s);
  1844 		    if (c=='.' || c=='!')
  1845 		    {
  1846 			if (pswit[ECHO_SWITCH])
  1847 			    g_print("\n%s\n",aline);
  1848 			if (!pswit[OVERVIEW_SWITCH])
  1849 			    g_print("    Line %ld column %ld - "
  1850 			      "Query punctuation after %s?\n",
  1851 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,
  1852 			      inword);
  1853 			else
  1854 			    cnt_punct++;
  1855 		    }
  1856 		}
  1857 	    g_free(inword);
  1858 	}
  1859     }
  1860 }
  1861 
  1862 /*
  1863  * check_for_typos:
  1864  *
  1865  * Check for commonly mistyped words,
  1866  * and digits like 0 for O in a word.
  1867  */
  1868 void check_for_typos(const char *aline,struct warnings *warnings)
  1869 {
  1870     const char *s,*t,*nt,*wordstart;
  1871     gchar *inword;
  1872     gunichar *decomposition;
  1873     gchar *testword;
  1874     int i,vowel,consonant,*dupcnt;
  1875     gboolean isdup,istypo,alower;
  1876     gunichar c,pc;
  1877     long offset,len;
  1878     gsize decomposition_len;
  1879     for (s=aline;*s;)
  1880     {
  1881 	wordstart=s;
  1882 	inword=getaword(&s);
  1883 	if (!*inword)
  1884 	{
  1885 	    g_free(inword);
  1886 	    continue; /* don't bother with empty lines */
  1887 	}
  1888 	if (mixdigit(inword))
  1889 	{
  1890 	    if (pswit[ECHO_SWITCH])
  1891 		g_print("\n%s\n",aline);
  1892 	    if (!pswit[OVERVIEW_SWITCH])
  1893 		g_print("    Line %ld column %ld - Query digit in %s\n",
  1894 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
  1895 	    else
  1896 		cnt_word++;
  1897 	}
  1898 	/*
  1899 	 * Put the word through a series of tests for likely typos and OCR
  1900 	 * errors.
  1901 	 */
  1902 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
  1903 	{
  1904 	    istypo=FALSE;
  1905 	    alower=FALSE;
  1906 	    for (t=inword;*t;t=g_utf8_next_char(t))
  1907 	    {
  1908 		c=g_utf8_get_char(t);
  1909 		nt=g_utf8_next_char(t);
  1910 		/* lowercase for testing */
  1911 		if (g_unichar_islower(c))
  1912 		    alower=TRUE;
  1913 		if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
  1914 		{
  1915 		    /*
  1916 		     * We have an uppercase mid-word. However, there are
  1917 		     * common cases:
  1918 		     *   Mac and Mc like McGill
  1919 		     *   French contractions like l'Abbe
  1920 		     */
  1921 		    offset=g_utf8_pointer_to_offset(inword,t);
  1922 		    if (offset>0)
  1923 			pc=g_utf8_get_char(g_utf8_prev_char(t));
  1924 		    else
  1925 			pc='\0';
  1926 		    if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
  1927 		      offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
  1928 		      g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
  1929 		      CHAR_IS_APOSTROPHE(pc))
  1930 			; /* do nothing! */
  1931 		    else
  1932 			istypo=TRUE;
  1933 		}
  1934 	    }
  1935 	    testword=g_utf8_casefold(inword,-1);
  1936 	}
  1937 	if (pswit[TYPO_SWITCH])
  1938 	{
  1939 	    /*
  1940 	     * Check for certain unlikely two-letter combinations at word
  1941 	     * start and end.
  1942 	     */
  1943 	    len=g_utf8_strlen(testword,-1);
  1944 	    if (len>1)
  1945 	    {
  1946 		for (i=0;*nostart[i];i++)
  1947 		    if (g_str_has_prefix(testword,nostart[i]))
  1948 			istypo=TRUE;
  1949 		for (i=0;*noend[i];i++)
  1950 		    if (g_str_has_suffix(testword,noend[i]))
  1951 			istypo=TRUE;
  1952 	    }
  1953 	    /* ght is common, gbt never. Like that. */
  1954 	    if (strstr(testword,"cb"))
  1955 		istypo=TRUE;
  1956 	    if (strstr(testword,"gbt"))
  1957 		istypo=TRUE;
  1958 	    if (strstr(testword,"pbt"))
  1959 		istypo=TRUE;
  1960 	    if (strstr(testword,"tbs"))
  1961 		istypo=TRUE;
  1962 	    if (strstr(testword,"mrn"))
  1963 		istypo=TRUE;
  1964 	    if (strstr(testword,"ahle"))
  1965 		istypo=TRUE;
  1966 	    if (strstr(testword,"ihle"))
  1967 		istypo=TRUE;
  1968 	    /*
  1969 	     * "TBE" does happen - like HEARTBEAT - but uncommon.
  1970 	     * Also "TBI" - frostbite, outbid - but uncommon.
  1971 	     * Similarly "ii" like Hawaii, or Pompeii, and in Roman
  1972 	     * numerals, but "ii" is a common scanno.
  1973 	     */
  1974 	    if (strstr(testword,"tbi"))
  1975 		istypo=TRUE;
  1976 	    if (strstr(testword,"tbe"))
  1977 		istypo=TRUE;
  1978 	    if (strstr(testword,"ii"))
  1979 		istypo=TRUE;
  1980 	    /*
  1981 	     * Check for no vowels or no consonants.
  1982 	     * If none, flag a typo.
  1983 	     */
  1984 	    if (!istypo && len>1)
  1985 	    {
  1986 		vowel=consonant=0;
  1987 		for (t=testword;*t;t=g_utf8_next_char(t))
  1988 		{
  1989 		    c=g_utf8_get_char(t);
  1990 		    decomposition=
  1991 		      g_unicode_canonical_decomposition(c,&decomposition_len);
  1992 		    if (c=='y' || g_unichar_isdigit(c))
  1993 		    {
  1994 			/* Yah, this is loose. */
  1995 			vowel++;
  1996 			consonant++;
  1997 		    }
  1998 		    else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
  1999 			vowel++;
  2000 		    else
  2001 			consonant++;
  2002 		    g_free(decomposition);
  2003 		}
  2004 		if (!vowel || !consonant)
  2005 		    istypo=TRUE;
  2006 	    }
  2007 	    /*
  2008 	     * Now exclude the word from being reported if it's in
  2009 	     * the okword list.
  2010 	     */
  2011 	    for (i=0;*okword[i];i++)
  2012 		if (!strcmp(testword,okword[i]))
  2013 		    istypo=FALSE;
  2014 	    /*
  2015 	     * What looks like a typo may be a Roman numeral.
  2016 	     * Exclude these.
  2017 	     */
  2018 	    if (istypo && isroman(testword))
  2019 		istypo=FALSE;
  2020 	    /* Check the manual list of typos. */
  2021 	    if (!istypo)
  2022 		for (i=0;*typo[i];i++)
  2023 		    if (!strcmp(testword,typo[i]))
  2024 			istypo=TRUE;
  2025 	    /*
  2026 	     * Check lowercase s, l, i and m - special cases.
  2027 	     *   "j" - often a semi-colon gone wrong.
  2028 	     *   "d" for a missing apostrophe - he d
  2029 	     *   "n" for "in"
  2030 	     */
  2031 	    if (!istypo && len==1 &&
  2032 	      g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
  2033 		istypo=TRUE;
  2034 	    if (istypo)
  2035 	    {
  2036 		dupcnt=g_tree_lookup(qword,testword);
  2037 		if (dupcnt)
  2038 		{
  2039 		    (*dupcnt)++;
  2040 		    isdup=!pswit[VERBOSE_SWITCH];
  2041 		}
  2042 		else
  2043 		{
  2044 		    dupcnt=g_new0(int,1);
  2045 		    g_tree_insert(qword,g_strdup(testword),dupcnt);
  2046 		    isdup=FALSE;
  2047 		}
  2048 		if (!isdup)
  2049 		{
  2050 		    if (pswit[ECHO_SWITCH])
  2051 			g_print("\n%s\n",aline);
  2052 		    if (!pswit[OVERVIEW_SWITCH])
  2053 		    {
  2054 			g_print("    Line %ld column %ld - Query word %s",
  2055 			  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
  2056 			  inword);
  2057 			if (!pswit[VERBOSE_SWITCH])
  2058 			    g_print(" - not reporting duplicates");
  2059 			g_print("\n");
  2060 		    }
  2061 		    else
  2062 			cnt_word++;
  2063 		}
  2064 	    }
  2065 	}
  2066 	/* check the user's list of typos */
  2067 	if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
  2068 	{
  2069 	    if (pswit[ECHO_SWITCH])
  2070 		g_print("\n%s\n",aline);
  2071 	    if (!pswit[OVERVIEW_SWITCH])  
  2072 		g_print("    Line %ld column %ld - Query possible scanno %s\n",
  2073 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
  2074 	}
  2075 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
  2076 	    g_free(testword);
  2077 	if (pswit[PARANOID_SWITCH] && warnings->digit)
  2078 	{
  2079 	    /* In paranoid mode, query all 0 and 1 standing alone. */
  2080 	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))
  2081 	    {
  2082 		if (pswit[ECHO_SWITCH])
  2083 		    g_print("\n%s\n",aline);
  2084 		if (!pswit[OVERVIEW_SWITCH])
  2085 		    g_print("    Line %ld column %ld - Query standalone %s\n",
  2086 		      linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
  2087 		      inword);
  2088 		else
  2089 		    cnt_word++;
  2090 	    }
  2091 	}
  2092 	g_free(inword);
  2093     }
  2094 }
  2095 
  2096 /*
  2097  * check_for_misspaced_punctuation:
  2098  *
  2099  * Look for added or missing spaces around punctuation and quotes.
  2100  * If there is a punctuation character like ! with no space on
  2101  * either side, suspect a missing!space. If there are spaces on
  2102  * both sides , assume a typo. If we see a double quote with no
  2103  * space or punctuation on either side of it, assume unspaced
  2104  * quotes "like"this.
  2105  */
  2106 void check_for_misspaced_punctuation(const char *aline,
  2107   struct parities *parities,gboolean isemptyline)
  2108 {
  2109     gboolean isacro,isellipsis;
  2110     const char *s;
  2111     gunichar c,nc,pc,n2c;
  2112     int parity;
  2113     c=g_utf8_get_char(aline);
  2114     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2115     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2116     {
  2117 	pc=c;
  2118 	c=nc;
  2119 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2120 	/* For each character in the line after the first. */
  2121 	if (g_utf8_strchr(".?!,;:_",-1,c))  /* if it's punctuation */
  2122 	{
  2123 	    /* we need to suppress warnings for acronyms like M.D. */
  2124 	    isacro=FALSE;
  2125 	    /* we need to suppress warnings for ellipsis . . . */
  2126 	    isellipsis=FALSE;
  2127 	    /*
  2128 	     * If there are letters on both sides of it or
  2129 	     * if it's strict punctuation followed by an alpha.
  2130 	     */
  2131 	    if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
  2132 	      g_utf8_strchr("?!,;:",-1,c)))
  2133 	    {
  2134 		if (c=='.')
  2135 		{
  2136 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&
  2137 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
  2138 			isacro=TRUE;
  2139 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
  2140 		    if (nc && n2c=='.')
  2141 			isacro=TRUE;
  2142 		}
  2143 		if (!isacro)
  2144 		{
  2145 		    if (pswit[ECHO_SWITCH])
  2146 			g_print("\n%s\n",aline);
  2147 		    if (!pswit[OVERVIEW_SWITCH])
  2148 			g_print("    Line %ld column %ld - Missing space?\n",
  2149 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2150 		    else
  2151 			cnt_punct++;
  2152 		}
  2153 	    }
  2154 	    if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
  2155 	    {
  2156 		/*
  2157 		 * If there are spaces on both sides,
  2158 		 * or space before and end of line.
  2159 		 */
  2160 		if (c=='.')
  2161 		{
  2162 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&
  2163 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
  2164 			isellipsis=TRUE;
  2165 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
  2166 		    if (nc && n2c=='.')
  2167 			isellipsis=TRUE;
  2168 		}
  2169 		if (!isemptyline && !isellipsis)
  2170 		{
  2171 		    if (pswit[ECHO_SWITCH])
  2172 			g_print("\n%s\n",aline);
  2173 		    if (!pswit[OVERVIEW_SWITCH])
  2174 			g_print("    Line %ld column %ld - "
  2175 			  "Spaced punctuation?\n",linecnt,
  2176 			  g_utf8_pointer_to_offset(aline,s)+1);
  2177 		    else
  2178 			cnt_punct++;
  2179 		}
  2180 	    }
  2181 	}
  2182     }
  2183     /* Split out the characters that CANNOT be preceded by space. */
  2184     c=g_utf8_get_char(aline);
  2185     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2186     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2187     {
  2188 	pc=c;
  2189 	c=nc;
  2190 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2191 	/* for each character in the line after the first */
  2192 	if (g_utf8_strchr("?!,;:",-1,c))
  2193 	{
  2194 	    /* if it's punctuation that _cannot_ have a space before it */
  2195 	    if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
  2196 	    {
  2197 		/*
  2198 		 * If nc DOES == space,
  2199 		 * it was already reported just above.
  2200 		 */
  2201 		if (pswit[ECHO_SWITCH])
  2202 		    g_print("\n%s\n",aline);
  2203 		if (!pswit[OVERVIEW_SWITCH])
  2204 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",
  2205 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2206 		else
  2207 		    cnt_punct++;
  2208 	    }
  2209 	}
  2210     }
  2211     /*
  2212      * Special case " .X" where X is any alpha.
  2213      * This plugs a hole in the acronym code above.
  2214      * Inelegant, but maintainable.
  2215      */
  2216     c=g_utf8_get_char(aline);
  2217     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2218     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2219     {
  2220 	pc=c;
  2221 	c=nc;
  2222 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2223 	/* for each character in the line after the first */
  2224 	if (c=='.')
  2225 	{
  2226 	    /* if it's a period */
  2227 	    if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
  2228 	    {
  2229 		/*
  2230 		 * If the period follows a space and
  2231 		 * is followed by a letter.
  2232 		 */
  2233 		if (pswit[ECHO_SWITCH])
  2234 		    g_print("\n%s\n",aline);
  2235 		if (!pswit[OVERVIEW_SWITCH])
  2236 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",
  2237 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2238 		else
  2239 		    cnt_punct++;
  2240 	    }
  2241 	}
  2242     }
  2243     c=g_utf8_get_char(aline);
  2244     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2245     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2246     {
  2247 	pc=c;
  2248 	c=nc;
  2249 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2250 	/* for each character in the line after the first */
  2251 	if (CHAR_IS_DQUOTE(c))
  2252 	{
  2253 	    if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
  2254 	      !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
  2255 	      !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
  2256 	    {
  2257 		if (pswit[ECHO_SWITCH])
  2258 		    g_print("\n%s\n",aline);
  2259 		if (!pswit[OVERVIEW_SWITCH])
  2260 		    g_print("    Line %ld column %ld - Unspaced quotes?\n",
  2261 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2262 		else
  2263 		    cnt_punct++;
  2264 	    }
  2265 	}
  2266     }
  2267     /* Check parity of quotes. */
  2268     nc=g_utf8_get_char(aline);
  2269     for (s=aline;*s;s=g_utf8_next_char(s))
  2270     {
  2271 	c=nc;
  2272 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2273 	if (CHAR_IS_DQUOTE(c))
  2274 	{
  2275 	    if (c==CHAR_DQUOTE)
  2276 	    {
  2277 		parities->dquote=!parities->dquote;
  2278 		parity=parities->dquote;
  2279 	    }
  2280 	    else if (c==CHAR_LD_QUOTE)
  2281 		parity=1;
  2282 	    else
  2283 		parity=0;
  2284 	    if (!parity)
  2285 	    {
  2286 		/* parity even */
  2287 		if (!g_utf8_strchr("_-.'`‘’/,;:!?)]} ",-1,nc))
  2288 		{
  2289 		    if (pswit[ECHO_SWITCH])
  2290 			g_print("\n%s\n",aline);
  2291 		    if (!pswit[OVERVIEW_SWITCH])
  2292 			g_print("    Line %ld column %ld - "
  2293 			  "Wrongspaced quotes?\n",
  2294 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2295 		    else
  2296 			cnt_punct++;
  2297 		}
  2298 	    }
  2299 	    else
  2300 	    {
  2301 		/* parity odd */
  2302 		if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
  2303 		  !g_utf8_strchr("_-/.'`‘’([{$",-1,nc) || !nc)
  2304 		{
  2305 		    if (pswit[ECHO_SWITCH])
  2306 			g_print("\n%s\n",aline);
  2307 		    if (!pswit[OVERVIEW_SWITCH])
  2308 			g_print("    Line %ld column %ld - "
  2309 			  "Wrongspaced quotes?\n",
  2310 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2311 		    else
  2312 			cnt_punct++;
  2313 		}
  2314 	    }
  2315 	}
  2316     }
  2317     c=g_utf8_get_char(aline);
  2318     if (CHAR_IS_DQUOTE(c))
  2319     {
  2320 	if (g_utf8_strchr(",;:!?)]} ",-1,
  2321 	  g_utf8_get_char(g_utf8_next_char(aline))))
  2322 	{
  2323 	    if (pswit[ECHO_SWITCH])
  2324 		g_print("\n%s\n",aline);
  2325 	    if (!pswit[OVERVIEW_SWITCH])
  2326 		g_print("    Line %ld column 1 - Wrongspaced quotes?\n",
  2327 		  linecnt);
  2328 	    else
  2329 		cnt_punct++;
  2330 	}
  2331     }
  2332     if (pswit[SQUOTE_SWITCH])
  2333     {
  2334 	nc=g_utf8_get_char(aline);
  2335 	for (s=aline;*s;s=g_utf8_next_char(s))
  2336 	{
  2337 	    c=nc;
  2338 	    nc=g_utf8_get_char(g_utf8_next_char(s));
  2339 	    if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&
  2340 	      !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
  2341 	      !g_unichar_isalpha(nc)))
  2342 	    {
  2343 		parities->squote=!parities->squote;
  2344 		if (!parities->squote)
  2345 		{
  2346 		    /* parity even */
  2347 		    if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
  2348 		    {
  2349 			if (pswit[ECHO_SWITCH])
  2350 			    g_print("\n%s\n",aline);
  2351 			if (!pswit[OVERVIEW_SWITCH])
  2352 			    g_print("    Line %ld column %ld - "
  2353 			      "Wrongspaced singlequotes?\n",
  2354 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2355 			else
  2356 			    cnt_punct++;
  2357 		    }
  2358 		}
  2359 		else
  2360 		{
  2361 		    /* parity odd */
  2362 		    if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
  2363 		      !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
  2364 		    {
  2365 			if (pswit[ECHO_SWITCH])
  2366 			    g_print("\n%s\n",aline);
  2367 			if (!pswit[OVERVIEW_SWITCH])
  2368 			    g_print("    Line %ld column %ld - "
  2369 			      "Wrongspaced singlequotes?\n",
  2370 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2371 			else
  2372 			    cnt_punct++;
  2373 		    }
  2374 		}
  2375 	    }
  2376 	}
  2377     }
  2378 }
  2379 
  2380 /*
  2381  * check_for_double_punctuation:
  2382  *
  2383  * Look for double punctuation like ,. or ,,
  2384  * Thanks to DW for the suggestion!
  2385  * In books with references, ".," and ".;" are common
  2386  * e.g. "etc., etc.," and vol. 1.; vol 3.;
  2387  * OTOH, from my initial tests, there are also fairly
  2388  * common errors. What to do? Make these cases paranoid?
  2389  * ".," is the most common, so warnings->dotcomma is used
  2390  * to suppress detailed reporting if it occurs often.
  2391  */
  2392 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
  2393 {
  2394     const char *s;
  2395     gunichar c,nc;
  2396     nc=g_utf8_get_char(aline);
  2397     for (s=aline;*s;s=g_utf8_next_char(s))
  2398     {
  2399 	c=nc;
  2400 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2401 	/* for each punctuation character in the line */
  2402 	if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
  2403 	  g_utf8_strchr(".?!,;:",-1,nc))
  2404 	{
  2405 	    /* followed by punctuation, it's a query, unless . . . */
  2406 	    if (c==nc && (c=='.' || c=='?' || c=='!') ||
  2407 	      !warnings->dotcomma && c=='.' && nc==',' ||
  2408 	      warnings->isFrench && g_str_has_prefix(s,",...") ||
  2409 	      warnings->isFrench && g_str_has_prefix(s,"...,") ||
  2410 	      warnings->isFrench && g_str_has_prefix(s,";...") ||
  2411 	      warnings->isFrench && g_str_has_prefix(s,"...;") ||
  2412 	      warnings->isFrench && g_str_has_prefix(s,":...") ||
  2413 	      warnings->isFrench && g_str_has_prefix(s,"...:") ||
  2414 	      warnings->isFrench && g_str_has_prefix(s,"!...") ||
  2415 	      warnings->isFrench && g_str_has_prefix(s,"...!") ||
  2416 	      warnings->isFrench && g_str_has_prefix(s,"?...") ||
  2417 	      warnings->isFrench && g_str_has_prefix(s,"...?"))
  2418 	    {
  2419 		if (warnings->isFrench && g_str_has_prefix(s,",...") ||
  2420 		  warnings->isFrench && g_str_has_prefix(s,"...,") ||
  2421 		  warnings->isFrench && g_str_has_prefix(s,";...") ||
  2422 		  warnings->isFrench && g_str_has_prefix(s,"...;") ||
  2423 		  warnings->isFrench && g_str_has_prefix(s,":...") ||
  2424 		  warnings->isFrench && g_str_has_prefix(s,"...:") ||
  2425 		  warnings->isFrench && g_str_has_prefix(s,"!...") ||
  2426 		  warnings->isFrench && g_str_has_prefix(s,"...!") ||
  2427 		  warnings->isFrench && g_str_has_prefix(s,"?...") ||
  2428 		  warnings->isFrench && g_str_has_prefix(s,"...?"))
  2429 		{
  2430 		    s+=4;
  2431 		    nc=g_utf8_get_char(g_utf8_next_char(s));
  2432 		}
  2433 		; /* do nothing for .. !! and ?? which can be legit */
  2434 	    }
  2435 	    else
  2436 	    {
  2437 		if (pswit[ECHO_SWITCH])
  2438 		    g_print("\n%s\n",aline);
  2439 		if (!pswit[OVERVIEW_SWITCH])
  2440 		    g_print("    Line %ld column %ld - Double punctuation?\n",
  2441 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2442 		else
  2443 		    cnt_punct++;
  2444 	    }
  2445 	}
  2446     }
  2447 }
  2448 
  2449 /*
  2450  * check_for_spaced_quotes:
  2451  */
  2452 void check_for_spaced_quotes(const char *aline)
  2453 {
  2454     int i;
  2455     const char *s,*t;
  2456     const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,
  2457       CHAR_RS_QUOTE};
  2458     GString *pattern;
  2459     s=aline;
  2460     while ((t=strstr(s," \" ")))
  2461     {
  2462 	if (pswit[ECHO_SWITCH])
  2463 	    g_print("\n%s\n",aline);
  2464 	if (!pswit[OVERVIEW_SWITCH])
  2465 	    g_print("    Line %ld column %ld - Spaced doublequote?\n",
  2466 	      linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  2467 	else
  2468 	    cnt_punct++;
  2469 	s=g_utf8_next_char(g_utf8_next_char(t));
  2470     }
  2471     pattern=g_string_new(NULL);
  2472     for(i=0;i<G_N_ELEMENTS(single_quotes);i++)
  2473     {
  2474 	g_string_assign(pattern," ");
  2475 	g_string_append_unichar(pattern,single_quotes[i]);
  2476 	g_string_append_c(pattern,' ');
  2477 	s=aline;
  2478 	while ((t=strstr(s,pattern->str)))
  2479 	{
  2480 	    if (pswit[ECHO_SWITCH])
  2481 		g_print("\n%s\n",aline);
  2482 	    if (!pswit[OVERVIEW_SWITCH])
  2483 		g_print("    Line %ld column %ld - Spaced singlequote?\n",
  2484 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  2485 	    else
  2486 		cnt_punct++;
  2487 	    s=g_utf8_next_char(g_utf8_next_char(t));
  2488 	}
  2489     }
  2490     g_string_free(pattern,TRUE);
  2491 }
  2492 
  2493 /*
  2494  * check_for_miscased_genative:
  2495  *
  2496  * Check special case of 'S instead of 's at end of word.
  2497  */
  2498 void check_for_miscased_genative(const char *aline)
  2499 {
  2500     const char *s;
  2501     gunichar c,nc,pc;
  2502     if (!*aline)
  2503 	return;
  2504     c=g_utf8_get_char(aline);
  2505     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2506     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2507     {
  2508 	pc=c;
  2509 	c=nc;
  2510 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2511 	if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))
  2512 	{
  2513 	    if (pswit[ECHO_SWITCH])
  2514 		g_print("\n%s\n",aline);
  2515 	    if (!pswit[OVERVIEW_SWITCH])
  2516 		g_print("    Line %ld column %ld - Capital \"S\"?\n",
  2517 		  linecnt,g_utf8_pointer_to_offset(aline,s)+2);
  2518 	    else
  2519 		cnt_punct++;
  2520 	}
  2521     }
  2522 }
  2523 
  2524 /*
  2525  * check_end_of_line:
  2526  *
  2527  * Now check special cases - start and end of line -
  2528  * for single and double quotes. Start is sometimes [sic]
  2529  * but better to query it anyway.
  2530  * While we're here, check for dash at end of line.
  2531  */
  2532 void check_end_of_line(const char *aline,struct warnings *warnings)
  2533 {
  2534     int lbytes;
  2535     const char *s;
  2536     gunichar c1,c2;
  2537     lbytes=strlen(aline);
  2538     if (g_utf8_strlen(aline,lbytes)>1)
  2539     {
  2540 	s=g_utf8_prev_char(aline+lbytes);
  2541 	c1=g_utf8_get_char(s);
  2542 	c2=g_utf8_get_char(g_utf8_prev_char(s));
  2543 	if ((CHAR_IS_DQUOTE(c1) || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)
  2544 	{
  2545 	    if (pswit[ECHO_SWITCH])
  2546 		g_print("\n%s\n",aline);
  2547 	    if (!pswit[OVERVIEW_SWITCH])
  2548 		g_print("    Line %ld column %ld - Spaced quote?\n",linecnt,
  2549 		  g_utf8_strlen(aline,lbytes));
  2550 	    else
  2551 		cnt_punct++;
  2552 	}
  2553 	c1=g_utf8_get_char(aline);
  2554 	c2=g_utf8_get_char(g_utf8_next_char(aline));
  2555 	if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)
  2556 	{
  2557 	    if (pswit[ECHO_SWITCH])
  2558 		g_print("\n%s\n",aline);
  2559 	    if (!pswit[OVERVIEW_SWITCH])
  2560 		g_print("    Line %ld column 1 - Spaced quote?\n",linecnt);
  2561 	    else
  2562 		cnt_punct++;
  2563 	}
  2564 	/*
  2565 	 * Dash at end of line may well be legit - paranoid mode only
  2566 	 * and don't report em-dash at line-end.
  2567 	 */
  2568 	if (pswit[PARANOID_SWITCH] && warnings->hyphen)
  2569 	{
  2570 	    for (s=g_utf8_prev_char(aline+lbytes);
  2571 	      s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
  2572 		;
  2573 	    if (g_utf8_get_char(s)=='-' &&
  2574 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')
  2575 	    {
  2576 		if (pswit[ECHO_SWITCH])
  2577 		    g_print("\n%s\n",aline);
  2578 		if (!pswit[OVERVIEW_SWITCH])
  2579 		    g_print("    Line %ld column %ld - "
  2580 		      "Hyphen at end of line?\n",
  2581 		      linecnt,g_utf8_pointer_to_offset(aline,s));
  2582 	    }
  2583 	}
  2584     }
  2585 }
  2586 
  2587 /*
  2588  * check_for_unspaced_bracket:
  2589  *
  2590  * Brackets are often unspaced, but shouldn't be surrounded by alpha.
  2591  * If so, suspect a scanno like "a]most".
  2592  */
  2593 void check_for_unspaced_bracket(const char *aline)
  2594 {
  2595     const char *s;
  2596     gunichar c,nc,pc;
  2597     c=g_utf8_get_char(aline);
  2598     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2599     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2600     {
  2601 	pc=c;
  2602 	c=nc;
  2603 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2604 	if (!nc)
  2605 	    break;
  2606 	/* for each bracket character in the line except 1st & last */
  2607 	if (g_utf8_strchr("{[()]}",-1,c) &&
  2608 	  g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
  2609 	{
  2610 	    if (pswit[ECHO_SWITCH])
  2611 		g_print("\n%s\n",aline);
  2612 	    if (!pswit[OVERVIEW_SWITCH])
  2613 		g_print("    Line %ld column %ld - Unspaced bracket?\n",
  2614 		  linecnt,g_utf8_pointer_to_offset(aline,s));
  2615 	    else
  2616 		cnt_punct++;
  2617 	}
  2618     }
  2619 }
  2620 
  2621 /*
  2622  * check_for_unpunctuated_endquote:
  2623  */
  2624 void check_for_unpunctuated_endquote(const char *aline)
  2625 {
  2626     const char *s;
  2627     gunichar c,nc,pc;
  2628     QuoteClass qc;
  2629     c=g_utf8_get_char(aline);
  2630     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2631     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2632     {
  2633 	pc=c;
  2634 	c=nc;
  2635 	qc=CHAR_IS_DQUOTE(c)?QUOTE_CLASS(c):INVALID_QUOTE;
  2636 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2637 	/* for each character in the line except 1st */
  2638 	if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) && g_unichar_isalpha(pc))
  2639 	{
  2640 	    if (pswit[ECHO_SWITCH])
  2641 		g_print("\n%s\n",aline);
  2642 	    if (!pswit[OVERVIEW_SWITCH])
  2643 		g_print("    Line %ld column %ld - "
  2644 		  "endquote missing punctuation?\n",
  2645 		  linecnt,g_utf8_pointer_to_offset(aline,s));
  2646 	    else
  2647 		cnt_punct++;
  2648 	}
  2649     }
  2650 }
  2651 
  2652 /*
  2653  * check_for_html_tag:
  2654  *
  2655  * Check for <HTML TAG>.
  2656  *
  2657  * If there is a < in the line, followed at some point
  2658  * by a > then we suspect HTML.
  2659  */
  2660 void check_for_html_tag(const char *aline)
  2661 {
  2662     const char *open,*close;
  2663     gchar *tag;
  2664     open=strchr(aline,'<');
  2665     if (open)
  2666     {
  2667 	close=strchr(g_utf8_next_char(open),'>');
  2668 	if (close)
  2669 	{
  2670 	    if (pswit[ECHO_SWITCH])
  2671 		g_print("\n%s\n",aline);
  2672 	    if (!pswit[OVERVIEW_SWITCH])
  2673 	    {
  2674 		tag=g_strndup(open,close-open+1);
  2675 		g_print("    Line %ld column %ld - HTML Tag? %s \n",
  2676 		  linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
  2677 		g_free(tag);
  2678 	    }
  2679 	    else
  2680 		cnt_html++;
  2681 	}
  2682     }
  2683 }
  2684 
  2685 /*
  2686  * check_for_html_entity:
  2687  *
  2688  * Check for &symbol; HTML.
  2689  *
  2690  * If there is a & in the line, followed at
  2691  * some point by a ; then we suspect HTML.
  2692  */
  2693 void check_for_html_entity(const char *aline)
  2694 {
  2695     const char *s,*amp,*scolon;
  2696     gchar *entity;
  2697     amp=strchr(aline,'&');
  2698     if (amp)
  2699     {
  2700 	scolon=strchr(amp,';');
  2701 	if (scolon)
  2702 	{
  2703 	    for (s=amp;s<scolon;s=g_utf8_next_char(s))   
  2704 		if (g_utf8_get_char(s)==CHAR_SPACE)
  2705 		    break;		/* Don't report "Jones & Son;" */
  2706 	    if (s>=scolon)
  2707 	    {
  2708 		if (pswit[ECHO_SWITCH])
  2709 		    g_print("\n%s\n",aline);
  2710 		if (!pswit[OVERVIEW_SWITCH])
  2711 		{
  2712 		    entity=g_strndup(amp,scolon-amp+1);
  2713 		    g_print("    Line %ld column %d - HTML symbol? %s \n",
  2714 		      linecnt,(int)(amp-aline)+1,entity);
  2715 		    g_free(entity);
  2716 		}
  2717 		else
  2718 		    cnt_html++;
  2719 	    }
  2720 	}
  2721     }
  2722 }
  2723 
  2724 /*
  2725  * check_for_omitted_punctuation:
  2726  *
  2727  * Check for omitted punctuation at end of paragraph by working back
  2728  * through prevline. DW.
  2729  * Need to check this only for "normal" paras.
  2730  * So what is a "normal" para?
  2731  *    Not normal if one-liner (chapter headings, etc.)
  2732  *    Not normal if doesn't contain at least one locase letter
  2733  *    Not normal if starts with space
  2734  */
  2735 void check_for_omitted_punctuation(const char *prevline,
  2736   struct line_properties *last,int start_para_line)
  2737 {
  2738     gboolean letter_on_line=FALSE;
  2739     const char *s;
  2740     gunichar c;
  2741     gboolean closing_quote;
  2742     for (s=prevline;*s;s=g_utf8_next_char(s))
  2743 	if (g_unichar_isalpha(g_utf8_get_char(s)))
  2744 	{
  2745 	    letter_on_line=TRUE;
  2746 	    break;
  2747 	}
  2748     /*
  2749      * This next "if" is a problem.
  2750      * If we say "start_para_line <= linecnt - 1", that includes
  2751      * one-line "paragraphs" like chapter heads. Lotsa false positives.
  2752      * If we say "start_para_line < linecnt - 1" it doesn't, but then it
  2753      * misses genuine one-line paragraphs.
  2754      */
  2755     if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
  2756       g_utf8_get_char(prevline)>CHAR_SPACE)
  2757     {
  2758 	s=prevline+strlen(prevline);
  2759 	do
  2760 	{
  2761 	    s=g_utf8_prev_char(s);
  2762 	    c=g_utf8_get_char(s);
  2763 	    if (QUOTE_CLASS(c)==CLOSING_QUOTE || QUOTE_CLASS(c)==NEUTRAL_QUOTE)
  2764 		closing_quote=TRUE;
  2765 	    else
  2766 		closing_quote=FALSE;
  2767 	} while (closing_quote && s>prevline);
  2768 	for (;s>prevline;s=g_utf8_prev_char(s))
  2769 	{
  2770 	    if (g_unichar_isalpha(g_utf8_get_char(s)))
  2771 	    {
  2772 		if (pswit[ECHO_SWITCH])
  2773 		    g_print("\n%s\n",prevline);
  2774 		if (!pswit[OVERVIEW_SWITCH])
  2775 		    g_print("    Line %ld column %ld - "
  2776 		      "No punctuation at para end?\n",
  2777 		      linecnt-1,g_utf8_strlen(prevline,-1));
  2778 		else
  2779 		    cnt_punct++;
  2780 		break;
  2781 	    }
  2782 	    if (g_utf8_strchr("—-.:!([{?}])",-1,g_utf8_get_char(s)))
  2783 		break;
  2784 	}
  2785     }
  2786 }
  2787 
  2788 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
  2789 {
  2790     const char *word=key;
  2791     int *dupcnt=value;
  2792     if (*dupcnt)
  2793 	g_print("\nNote: Queried word %s was duplicated %d times\n",
  2794 	  word,*dupcnt);
  2795     return FALSE;
  2796 }
  2797 
  2798 void print_as_windows_1252(const char *string)
  2799 {
  2800     gsize inbytes,outbytes;
  2801     gchar *buf,*bp;
  2802     static GIConv converter=(GIConv)-1;
  2803     if (!string)
  2804     {
  2805 	if (converter!=(GIConv)-1)
  2806 	    g_iconv_close(converter);
  2807 	converter=(GIConv)-1;
  2808 	return;
  2809     }
  2810     if (converter==(GIConv)-1)
  2811 	converter=g_iconv_open("WINDOWS-1252","UTF-8");
  2812     if (converter!=(GIConv)-1)
  2813     {
  2814 	inbytes=outbytes=strlen(string);
  2815 	bp=buf=g_malloc(outbytes+1);
  2816 	g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
  2817 	*bp='\0';
  2818 	fputs(buf,stdout);
  2819 	g_free(buf);
  2820     }
  2821     else
  2822 	fputs(string,stdout);
  2823 }
  2824 
  2825 void print_as_utf_8(const char *string)
  2826 {
  2827     fputs(string,stdout);
  2828 }
  2829 
  2830 /*
  2831  * procfile:
  2832  *
  2833  * Process one file.
  2834  */
  2835 void procfile(const char *filename)
  2836 {
  2837     const char *s;
  2838     gchar *parastart=NULL;	/* first line of current para */
  2839     gchar *etext,*aline;
  2840     gchar *etext_ptr;
  2841     GError *err=NULL;
  2842     struct first_pass_results *first_pass_results;
  2843     struct warnings *warnings;
  2844     struct counters counters={0};
  2845     struct line_properties last={0};
  2846     struct parities parities={0};
  2847     struct pending pending={0};
  2848     gboolean isemptyline;
  2849     long start_para_line=0;
  2850     gboolean isnewpara=FALSE,enddash=FALSE;
  2851     last.start=CHAR_SPACE;
  2852     linecnt=checked_linecnt=0;
  2853     etext=read_etext(filename,&err);
  2854     if (!etext)
  2855     {
  2856 	if (pswit[STDOUT_SWITCH])
  2857 	    fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
  2858 	else
  2859 	    fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
  2860 	exit(1);
  2861     }
  2862     g_print("\n\nFile: %s\n\n",filename);
  2863     first_pass_results=first_pass(etext);
  2864     warnings=report_first_pass(first_pass_results);
  2865     qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
  2866     qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
  2867     /*
  2868      * Here we go with the main pass. Hold onto yer hat!
  2869      */
  2870     linecnt=0;
  2871     etext_ptr=etext;
  2872     while ((aline=flgets(&etext_ptr,linecnt+1)))
  2873     {
  2874 	linecnt++;
  2875 	if (linecnt==1)
  2876 	    isnewpara=TRUE;
  2877 	if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
  2878 	    continue;    // skip DP page separators completely
  2879 	if (linecnt<first_pass_results->firstline ||
  2880 	  (first_pass_results->footerline>0 &&
  2881 	  linecnt>first_pass_results->footerline))
  2882 	{
  2883 	    if (pswit[HEADER_SWITCH])
  2884 	    {
  2885 		if (g_str_has_prefix(aline,"Title:"))
  2886 		    g_print("    %s\n",aline);
  2887 		if (g_str_has_prefix(aline,"Author:"))
  2888 		    g_print("    %s\n",aline);
  2889 		if (g_str_has_prefix(aline,"Release Date:"))
  2890 		    g_print("    %s\n",aline);
  2891 		if (g_str_has_prefix(aline,"Edition:"))
  2892 		    g_print("    %s\n\n",aline);
  2893 	    }
  2894 	    continue;		/* skip through the header */
  2895 	}
  2896 	checked_linecnt++;
  2897 	print_pending(aline,parastart,&pending);
  2898 	isemptyline=analyse_quotes(aline,&counters);
  2899 	if (isnewpara && !isemptyline)
  2900 	{
  2901 	    /* This line is the start of a new paragraph. */
  2902 	    start_para_line=linecnt;
  2903 	    /* Capture its first line in case we want to report it later. */
  2904 	    g_free(parastart);
  2905 	    parastart=g_strdup(aline);
  2906 	    memset(&parities,0,sizeof(parities));  /* restart the quote count */
  2907 	    s=aline;
  2908 	    while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
  2909 	      !g_unichar_isdigit(g_utf8_get_char(s)))
  2910 		s=g_utf8_next_char(s);
  2911 	    if (g_unichar_islower(g_utf8_get_char(s)))
  2912 	    {
  2913 		/* and its first letter is lowercase */
  2914 		if (pswit[ECHO_SWITCH])
  2915 		    g_print("\n%s\n",aline);
  2916 		if (!pswit[OVERVIEW_SWITCH])
  2917 		    g_print("    Line %ld column %ld - "
  2918 		      "Paragraph starts with lower-case\n",
  2919 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2920 		else
  2921 		    cnt_punct++;
  2922 	    }
  2923 	    isnewpara=FALSE; /* Signal the end of new para processing. */
  2924 	}
  2925 	/* Check for an em-dash broken at line end. */
  2926 	if (enddash && g_utf8_get_char(aline)=='-')
  2927 	{
  2928 	    if (pswit[ECHO_SWITCH])
  2929 		g_print("\n%s\n",aline);
  2930 	    if (!pswit[OVERVIEW_SWITCH])
  2931 		g_print("    Line %ld column 1 - Broken em-dash?\n",linecnt);
  2932 	    else
  2933 		cnt_punct++;
  2934 	}
  2935 	enddash=FALSE;
  2936 	for (s=g_utf8_prev_char(aline+strlen(aline));
  2937 	  g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
  2938 	    ;
  2939 	if (s>=aline && g_utf8_get_char(s)=='-')
  2940 	    enddash=TRUE;
  2941 	check_for_control_characters(aline);
  2942 	if (warnings->bin)
  2943 	    check_for_odd_characters(aline,warnings,isemptyline);
  2944 	if (warnings->longline)
  2945 	    check_for_long_line(aline);
  2946 	if (warnings->shortline)
  2947 	    check_for_short_line(aline,&last);
  2948 	last.blen=last.len;
  2949 	last.len=g_utf8_strlen(aline,-1);
  2950 	last.start=g_utf8_get_char(aline);
  2951 	check_for_starting_punctuation(aline);
  2952 	if (warnings->dash)
  2953 	{
  2954 	    check_for_spaced_emdash(aline);
  2955 	    check_for_spaced_dash(aline);
  2956 	}
  2957 	check_for_unmarked_paragraphs(aline);
  2958 	check_for_jeebies(aline);
  2959 	check_for_mta_from(aline);
  2960 	check_for_orphan_character(aline);
  2961 	check_for_pling_scanno(aline);
  2962 	check_for_extra_period(aline,warnings);
  2963 	check_for_following_punctuation(aline);
  2964 	check_for_typos(aline,warnings);
  2965 	check_for_misspaced_punctuation(aline,&parities,isemptyline);
  2966 	check_for_double_punctuation(aline,warnings);
  2967 	check_for_spaced_quotes(aline);
  2968 	check_for_miscased_genative(aline);
  2969 	check_end_of_line(aline,warnings);
  2970 	check_for_unspaced_bracket(aline);
  2971 	if (warnings->endquote)
  2972 	    check_for_unpunctuated_endquote(aline);
  2973 	check_for_html_tag(aline);
  2974 	check_for_html_entity(aline);
  2975 	if (isemptyline)
  2976 	{
  2977 	    check_for_mismatched_quotes(&counters,&pending);
  2978 	    counters_reset(&counters);
  2979 	    /* let the next iteration know that it's starting a new para */
  2980 	    isnewpara=TRUE;
  2981 	    if (prevline)
  2982 		check_for_omitted_punctuation(prevline,&last,start_para_line);
  2983 	}
  2984 	g_free(prevline);
  2985 	prevline=g_strdup(aline);
  2986     }
  2987     linecnt++;
  2988     check_for_mismatched_quotes(&counters,&pending);
  2989     print_pending(NULL,parastart,&pending);
  2990     reset_pending(&pending);
  2991     if (prevline)
  2992     {
  2993 	g_free(prevline);
  2994 	prevline=NULL;
  2995     }
  2996     g_free(parastart);
  2997     g_free(prevline);
  2998     g_free(etext);
  2999     if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])
  3000 	g_tree_foreach(qword,report_duplicate_queries,NULL);
  3001     g_tree_unref(qword);
  3002     g_tree_unref(qperiod);
  3003     counters_destroy(&counters);
  3004     g_set_print_handler(NULL);
  3005     print_as_windows_1252(NULL);
  3006     if (pswit[MARKUP_SWITCH])  
  3007 	loseentities(NULL);
  3008 }
  3009 
  3010 /*
  3011  * flgets:
  3012  *
  3013  * Get one line from the input text, checking for
  3014  * the existence of exactly one CR/LF line-end per line.
  3015  *
  3016  * Returns: a pointer to the line.
  3017  */
  3018 char *flgets(char **etext,long lcnt)
  3019 {
  3020     gunichar c;
  3021     gboolean isCR=FALSE;
  3022     char *theline=*etext;
  3023     char *eos=theline;
  3024     gchar *s;
  3025     for (;;)
  3026     {
  3027 	c=g_utf8_get_char(*etext);
  3028 	if (!c)
  3029 	{
  3030 	    if (*etext==theline)
  3031 		return NULL;
  3032 	    else if (pswit[LINE_END_SWITCH])
  3033 	    {
  3034 		if (pswit[ECHO_SWITCH])
  3035 		{
  3036 		    s=g_strndup(theline,eos-theline);
  3037 		    g_print("\n%s\n",s);
  3038 		    g_free(s);
  3039 		}
  3040 		if (!pswit[OVERVIEW_SWITCH])
  3041 		    /* There may, or may not, have been a CR */
  3042 		    g_print("    Line %ld - No LF?\n",lcnt);
  3043 		else
  3044 		    cnt_lineend++;
  3045 	    }
  3046 	    break;
  3047 	}
  3048 	*etext=g_utf8_next_char(*etext);
  3049 	/* either way, it's end of line */
  3050 	if (c=='\n')
  3051 	{
  3052 	    if (isCR)
  3053 		break;
  3054 	    else
  3055 	    {
  3056 		/* Error - a LF without a preceding CR */
  3057 		if (pswit[LINE_END_SWITCH])
  3058 		{
  3059 		    if (pswit[ECHO_SWITCH])
  3060 		    {
  3061 			s=g_strndup(theline,eos-theline);
  3062 			g_print("\n%s\n",s);
  3063 			g_free(s);
  3064 		    }
  3065 		    if (!pswit[OVERVIEW_SWITCH])
  3066 			g_print("    Line %ld - No CR?\n",lcnt);
  3067 		    else
  3068 			cnt_lineend++;
  3069 		}
  3070 		break;
  3071 	    }
  3072 	}
  3073 	if (c=='\r')
  3074 	{
  3075 	    if (isCR)
  3076 	    {
  3077 		/* Error - two successive CRs */
  3078 		if (pswit[LINE_END_SWITCH])
  3079 		{
  3080 		    if (pswit[ECHO_SWITCH])
  3081 		    {
  3082 			s=g_strndup(theline,eos-theline);
  3083 			g_print("\n%s\n",s);
  3084 			g_free(s);
  3085 		    }
  3086 		    if (!pswit[OVERVIEW_SWITCH])
  3087 			g_print("    Line %ld - Two successive CRs?\n",lcnt);
  3088 		    else
  3089 			cnt_lineend++;
  3090 		}
  3091 	    }
  3092 	    isCR=TRUE;
  3093 	}
  3094 	else
  3095 	{
  3096 	    if (pswit[LINE_END_SWITCH] && isCR)
  3097 	    {
  3098 		if (pswit[ECHO_SWITCH])
  3099 		{
  3100 		    s=g_strndup(theline,eos-theline);
  3101 		    g_print("\n%s\n",s);
  3102 		    g_free(s);
  3103 		}
  3104 		if (!pswit[OVERVIEW_SWITCH])
  3105 		    g_print("    Line %ld column %ld - CR without LF?\n",
  3106 		      lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
  3107 		else
  3108 		    cnt_lineend++;
  3109 		*eos=' ';
  3110 	    }
  3111 	    isCR=FALSE;
  3112 	    eos=g_utf8_next_char(eos);
  3113 	}
  3114     }
  3115     *eos='\0';
  3116     if (pswit[MARKUP_SWITCH])  
  3117 	postprocess_for_HTML(theline);
  3118     if (pswit[DP_SWITCH])  
  3119 	postprocess_for_DP(theline);
  3120     return theline;
  3121 }
  3122 
  3123 /*
  3124  * mixdigit:
  3125  *
  3126  * Takes a "word" as a parameter, and checks whether it
  3127  * contains a mixture of alpha and digits. Generally, this is an
  3128  * error, but may not be for cases like 4th or L5 12s. 3d.
  3129  *
  3130  * Returns: TRUE iff an is error found.
  3131  */
  3132 gboolean mixdigit(const char *checkword)
  3133 {
  3134     gboolean wehaveadigit,wehavealetter,query;
  3135     const char *s,*nondigit;
  3136     wehaveadigit=wehavealetter=query=FALSE;
  3137     for (s=checkword;*s;s=g_utf8_next_char(s))
  3138 	if (g_unichar_isalpha(g_utf8_get_char(s)))
  3139 	    wehavealetter=TRUE;
  3140 	else if (g_unichar_isdigit(g_utf8_get_char(s)))
  3141 	    wehaveadigit=TRUE;
  3142     if (wehaveadigit && wehavealetter)
  3143     {
  3144 	/* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
  3145 	query=TRUE;
  3146 	for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
  3147 	  nondigit=g_utf8_next_char(nondigit))
  3148 	    ;
  3149 	/* digits, ending in st, rd, nd, th of either case */
  3150 	if (!g_ascii_strcasecmp(nondigit,"st") ||
  3151 	  !g_ascii_strcasecmp(nondigit,"rd") ||
  3152 	  !g_ascii_strcasecmp(nondigit,"nd") ||
  3153 	  !g_ascii_strcasecmp(nondigit,"th"))
  3154 	    query=FALSE;
  3155 	if (!g_ascii_strcasecmp(nondigit,"sts") ||
  3156 	  !g_ascii_strcasecmp(nondigit,"rds") ||
  3157 	  !g_ascii_strcasecmp(nondigit,"nds") ||
  3158 	  !g_ascii_strcasecmp(nondigit,"ths"))
  3159 	    query=FALSE;
  3160 	if (!g_ascii_strcasecmp(nondigit,"stly") ||
  3161 	  !g_ascii_strcasecmp(nondigit,"rdly") ||
  3162 	  !g_ascii_strcasecmp(nondigit,"ndly") ||
  3163 	  !g_ascii_strcasecmp(nondigit,"thly"))
  3164 	    query=FALSE;
  3165 	/* digits, ending in l, L, s or d */
  3166 	if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
  3167 	  !strcmp(nondigit,"d"))
  3168 	    query=FALSE;
  3169 	/*
  3170 	 * L at the start of a number, representing Britsh pounds, like L500.
  3171 	 * This is cute. We know the current word is mixed digit. If the first
  3172 	 * letter is L, there must be at least one digit following. If both
  3173 	 * digits and letters follow, we have a genuine error, else we have a
  3174 	 * capital L followed by digits, and we accept that as a non-error.
  3175 	 */
  3176 	if (g_utf8_get_char(checkword)=='L' &&
  3177 	  !mixdigit(g_utf8_next_char(checkword)))
  3178 	    query=FALSE;
  3179     }
  3180     return query;
  3181 }
  3182 
  3183 /*
  3184  * getaword:
  3185  *
  3186  * Extracts the first/next "word" from the line, and returns it.
  3187  * A word is defined as one English word unit--or at least that's the aim.
  3188  * "ptr" is advanced to the position in the line where we will start
  3189  * looking for the next word.
  3190  *
  3191  * Returns: A newly-allocated string.
  3192  */
  3193 gchar *getaword(const char **ptr)
  3194 {
  3195     const char *s,*t;
  3196     GString *word;
  3197     gunichar c,pc;
  3198     word=g_string_new(NULL);
  3199     for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
  3200       !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
  3201       **ptr;*ptr=g_utf8_next_char(*ptr))
  3202     {
  3203 	/* Handle exceptions for footnote markers like [1] */
  3204 	if (g_utf8_get_char(*ptr)=='[')
  3205 	{
  3206 	    g_string_append_c(word,'[');
  3207 	    s=g_utf8_next_char(*ptr);
  3208 	    for (;g_unichar_isdigit(g_utf8_get_char(s));s=g_utf8_next_char(s))
  3209 		g_string_append_unichar(word,g_utf8_get_char(s));
  3210 	    if (g_utf8_get_char(s)==']')
  3211 	    {
  3212 		g_string_append_c(word,']');
  3213 		*ptr=g_utf8_next_char(s);
  3214 		return g_string_free(word,FALSE);
  3215 	    }
  3216 	    else
  3217 		g_string_truncate(word,0);
  3218 	}
  3219     }
  3220     /*
  3221      * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
  3222      * Especially yucky is the case of L1,000
  3223      * This section looks for a pattern of characters including a digit
  3224      * followed by a comma or period followed by one or more digits.
  3225      * If found, it returns this whole pattern as a word; otherwise we discard
  3226      * the results and resume our normal programming.
  3227      */
  3228     s=*ptr;
  3229     for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
  3230       g_unichar_isalpha(g_utf8_get_char(s)) ||
  3231       g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
  3232 	g_string_append_unichar(word,g_utf8_get_char(s));
  3233     if (word->len)
  3234     {
  3235 	for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))
  3236 	{
  3237 	    c=g_utf8_get_char(t);
  3238 	    pc=g_utf8_get_char(g_utf8_prev_char(t));
  3239 	    if ((c=='.' || c==',') && g_unichar_isdigit(pc))
  3240 	    {
  3241 		*ptr=s;
  3242 		return g_string_free(word,FALSE);
  3243 	    }
  3244 	}
  3245     }
  3246     /* we didn't find a punctuated number - do the regular getword thing */
  3247     g_string_truncate(word,0);
  3248     c=g_utf8_get_char(*ptr);
  3249     for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);
  3250       *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))
  3251 	g_string_append_unichar(word,c);
  3252     return g_string_free(word,FALSE);
  3253 }
  3254 
  3255 /*
  3256  * isroman:
  3257  *
  3258  * Is this word a Roman Numeral?
  3259  *
  3260  * It doesn't actually validate that the number is a valid Roman Numeral--for
  3261  * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
  3262  * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
  3263  * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
  3264  * expressions thereof, except when it came to taxes. Allow any number of M,
  3265  * an optional D, an optional CM or CD, any number of optional Cs, an optional
  3266  * XL or an optional XC, an optional IX or IV, an optional V and any number
  3267  * of optional Is.
  3268  */
  3269 gboolean isroman(const char *t)
  3270 {
  3271     const char *s;
  3272     if (!t || !*t)
  3273 	return FALSE;
  3274     s=t;
  3275     while (g_utf8_get_char(t)=='m' && *t)
  3276 	t++;
  3277     if (g_utf8_get_char(t)=='d')
  3278 	t++;
  3279     if (g_str_has_prefix(t,"cm"))
  3280 	t+=2;
  3281     if (g_str_has_prefix(t,"cd"))
  3282 	t+=2;
  3283     while (g_utf8_get_char(t)=='c' && *t)
  3284 	t++;
  3285     if (g_str_has_prefix(t,"xl"))
  3286 	t+=2;
  3287     if (g_str_has_prefix(t,"xc"))
  3288 	t+=2;
  3289     if (g_utf8_get_char(t)=='l')
  3290 	t++;
  3291     while (g_utf8_get_char(t)=='x' && *t)
  3292 	t++;
  3293     if (g_str_has_prefix(t,"ix"))
  3294 	t+=2;
  3295     if (g_str_has_prefix(t,"iv"))
  3296 	t+=2;
  3297     if (g_utf8_get_char(t)=='v')
  3298 	t++;
  3299     while (g_utf8_get_char(t)=='i' && *t)
  3300 	t++;
  3301     return !*t;
  3302 }
  3303 
  3304 /*
  3305  * postprocess_for_DP:
  3306  *
  3307  * Invoked with the -d switch from flgets().
  3308  * It simply "removes" from the line a hard-coded set of common
  3309  * DP-specific tags, so that the line passed to the main routine has
  3310  * been pre-cleaned of DP markup.
  3311  */
  3312 void postprocess_for_DP(char *theline)
  3313 {
  3314     char *s,*t;
  3315     int i;
  3316     if (!*theline) 
  3317 	return;
  3318     for (i=0;*DPmarkup[i];i++)
  3319 	while ((s=strstr(theline,DPmarkup[i])))
  3320 	{
  3321 	    t=s+strlen(DPmarkup[i]);
  3322 	    memmove(s,t,strlen(t)+1);
  3323 	}
  3324 }
  3325 
  3326 /*
  3327  * postprocess_for_HTML:
  3328  *
  3329  * Invoked with the -m switch from flgets().
  3330  * It simply "removes" from the line a hard-coded set of common
  3331  * HTML tags and "replaces" a hard-coded set of common HTML
  3332  * entities, so that the line passed to the main routine has
  3333  * been pre-cleaned of HTML.
  3334  */
  3335 void postprocess_for_HTML(char *theline)
  3336 {
  3337     while (losemarkup(theline))
  3338 	;
  3339     loseentities(theline);
  3340 }
  3341 
  3342 char *losemarkup(char *theline)
  3343 {
  3344     char *s,*t;
  3345     int i;
  3346     s=strchr(theline,'<');
  3347     t=s?strchr(s,'>'):NULL;
  3348     if (!s || !t)
  3349 	return NULL;
  3350     for (i=0;*markup[i];i++)
  3351 	if (tagcomp(g_utf8_next_char(s),markup[i]))
  3352 	{
  3353 	    t=g_utf8_next_char(t);
  3354 	    memmove(s,t,strlen(t)+1);
  3355 	    return s;
  3356 	}
  3357     /* It's an unrecognized <xxx>. */
  3358     return NULL;
  3359 }
  3360 
  3361 void loseentities(char *theline)
  3362 {
  3363     int i;
  3364     gsize nb;
  3365     char *amp,*scolon;
  3366     gchar *s,*t;
  3367     gunichar c;
  3368     GTree *entities=NULL;
  3369     static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
  3370     if (!theline)
  3371     {
  3372 	if (entities)
  3373 	    g_tree_destroy(entities);
  3374 	entities=NULL;
  3375 	if (translit!=(GIConv)-1)
  3376 	    g_iconv_close(translit);
  3377 	translit=(GIConv)-1;
  3378 	if (to_utf8!=(GIConv)-1)
  3379 	    g_iconv_close(to_utf8);
  3380 	to_utf8=(GIConv)-1;
  3381 	return;
  3382     }
  3383     if (!*theline)
  3384 	return;
  3385     if (!entities)
  3386     {
  3387 	entities=g_tree_new((GCompareFunc)strcmp);
  3388 	for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
  3389 	    g_tree_insert(entities,HTMLentities[i].name,
  3390 	      GUINT_TO_POINTER(HTMLentities[i].c));
  3391     }
  3392     if (translit==(GIConv)-1)
  3393 	translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
  3394     if (to_utf8==(GIConv)-1)
  3395 	to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
  3396     while((amp=strchr(theline,'&')))
  3397     {
  3398 	scolon=strchr(amp,';');
  3399 	if (scolon)
  3400 	{
  3401 	    if (amp[1]=='#')
  3402 	    {
  3403 		if (amp+2+strspn(amp+2,"0123456789")==scolon)
  3404 		    c=strtol(amp+2,NULL,10);
  3405 		else if (amp[2]=='x' &&
  3406 		  amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
  3407 		    c=strtol(amp+3,NULL,16);
  3408 	    }
  3409 	    else
  3410 	    {
  3411 		s=g_strndup(amp+1,scolon-(amp+1));
  3412 	        c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
  3413 		g_free(s);
  3414 	    }
  3415 	}
  3416 	else
  3417 	    c=0;
  3418 	if (c)
  3419 	{
  3420 	    theline=amp;
  3421 	    if (c<128 || c>=192 && c<=255)	/* An ISO-8859-1 character */
  3422 		theline+=g_unichar_to_utf8(c,theline);
  3423 	    else
  3424 	    {
  3425 		s=g_malloc(6);
  3426 		nb=g_unichar_to_utf8(c,s);
  3427 		t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
  3428 		g_free(s);
  3429 		s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
  3430 		g_free(t);
  3431 		memcpy(theline,s,nb);
  3432 		g_free(s);
  3433 		theline+=nb;
  3434 	    }
  3435 	    memmove(theline,g_utf8_next_char(scolon),
  3436 	      strlen(g_utf8_next_char(scolon))+1);
  3437 	}
  3438 	else
  3439 	    theline=g_utf8_next_char(amp);
  3440     }
  3441 }
  3442 
  3443 gboolean tagcomp(const char *strin,const char *basetag)
  3444 {
  3445     gboolean retval;
  3446     gchar *s,*t;
  3447     if (g_utf8_get_char(strin)=='/')
  3448 	t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
  3449     else
  3450 	t=g_utf8_casefold(strin,-1);
  3451     s=g_utf8_casefold(basetag,-1);
  3452     retval=g_str_has_prefix(t,s);
  3453     g_free(s);
  3454     g_free(t);
  3455     return retval;
  3456 }
  3457 
  3458 void proghelp(GOptionContext *context)
  3459 {
  3460     gchar *help;
  3461     fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
  3462     fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
  3463     fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
  3464     fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
  3465       "For details, read the file COPYING.\n",stderr);
  3466     fputs("This is Free Software; "
  3467       "you may redistribute it under certain conditions (GPL);\n",stderr);
  3468     fputs("read the file COPYING for details.\n\n",stderr);
  3469     help=g_option_context_get_help(context,TRUE,NULL);
  3470     fputs(help,stderr);
  3471     g_free(help);
  3472     fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
  3473     fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
  3474       "non-ASCII\n",stderr);
  3475     fputs("characters like accented letters, "
  3476       "lines longer than 75 or shorter than 55,\n",stderr);
  3477     fputs("unbalanced quotes or brackets, "
  3478       "a variety of badly formatted punctuation, \n",stderr);
  3479     fputs("HTML tags, some likely typos. "
  3480       "It is NOT a substitute for human judgement.\n",stderr);
  3481     fputs("\n",stderr);
  3482 }