bookloupe/bookloupe.c
author ali <ali@juiblex.co.uk>
Mon Oct 21 23:39:54 2013 +0100 (2013-10-21)
changeset 192 1aeda7fe17ca
parent 191 189183b37598
parent 185 a6d93c9932ac
child 193 7fdf168fb748
permissions -rw-r--r--
Merge bug #13: Character sets
     1 /*************************************************************************/
     2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
     3 /*									 */
     4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>			 */
     5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>			 */
     6 /*									 */
     7 /* This program is free software; you can redistribute it and/or modify  */
     8 /* it under the terms of the GNU General Public License as published by  */
     9 /* the Free Software Foundation; either version 2 of the License, or     */
    10 /* (at your option) any later version.					 */
    11 /*									 */
    12 /* This program is distributed in the hope that it will be useful,       */
    13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of	 */
    14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the		 */
    15 /* GNU General Public License for more details.				 */
    16 /*									 */
    17 /* You should have received a copy of the GNU General Public License	 */
    18 /* along with this program. If not, see <http://www.gnu.org/licenses/>.	 */
    19 /*************************************************************************/
    20 
    21 #include <stdio.h>
    22 #include <stdlib.h>
    23 #include <string.h>
    24 #include <ctype.h>
    25 #ifdef __WIN32__
    26 #include <windows.h>
    27 #endif
    28 #include <glib.h>
    29 #include <bl/bl.h>
    30 #include "bookloupe.h"
    31 #include "counters.h"
    32 #include "pending.h"
    33 #include "HTMLentities.h"
    34 
/*
 * Name of the character set the etext is restricted to, as stored by
 * set_charset(), or NULL for auto (ISO_8859-1/ASCII or UNICODE).
 */
gchar *charset;
/*
 * Converter from UTF-8 to charset opened by set_charset(), or (GIConv)-1
 * when none is open (charset is NULL or "UTF-8").
 */
GIConv charset_validator=(GIConv)-1;

/* NOTE(review): presumably the previously processed line - confirm at use. */
gchar *prevline;
    39 
/*
 * Common typos, all in lower case.
 * The list is terminated by an empty string.
 */
char *typo[] = {
    "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
    "nad", "te", "ig", "acn",  "ahve", "alot", "anbd", "andt", "awya", "aywa",
    "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
    "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
    "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
    "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
    "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
    "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
    "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
    "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
    "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
    "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
    "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
    "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
    "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
    "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
    "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
    "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
    "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
    "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
    "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
    "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
    "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
    "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
    "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
    "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
    "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
    "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
    "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
    "se", ""
};
    73 
/*
 * User-defined typos, built by read_user_scannos(). Keys are the typo
 * strings (owned by the tree); every value is the dummy pointer
 * GINT_TO_POINTER(1). Stays NULL when no user typo list has been loaded.
 */
GTree *usertypo;
    75 
/*
 * Common abbreviations and other OK words not to query as typos.
 * All entries are lower case; the list is terminated by an empty string.
 */
char *okword[] = {
    "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
    "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
    "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
    "outbid", "outbids", "frostbite", "frostbitten", ""
};
    83 
/*
 * Common abbreviations that cause otherwise unexplained periods.
 * Entries are given without the trailing period; the list is terminated
 * by an empty string.
 */
char *abbrev[] = {
    "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
    "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
};
    89 
/*
 * Two-Letter combinations that rarely if ever start words,
 * but are common scannos or otherwise common letter combinations.
 * The list is terminated by an empty string.
 */
char *nostart[] = {
    "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
};
    97 
/*
 * Two-Letter combinations that rarely if ever end words,
 * but are common scannos or otherwise common letter combinations.
 * The list is terminated by an empty string.
 */
char *noend[] = {
    "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
    "sw", "gr", "sl", "cl", "iy", ""
};
   106 
/*
 * HTML element names, lower case and without angle brackets.
 * The list is terminated by an empty string.
 * NOTE(review): presumably used to recognise HTML tags in the text
 * (see the "markup" switch) - confirm against the code that reads it.
 */
char *markup[] = {
    "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
    "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
    "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
    "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
};
   113 
/*
 * Literal markup sequences, terminated by an empty string.
 * NOTE(review): presumably the DP-specific markup that the "dp" switch
 * asks to be ignored - confirm against postprocess_for_DP().
 */
char *DPmarkup[] = {
    "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
};
   117 
   118 char *nocomma[] = {
   119     "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
   120     "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
   121     "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
   122     "during", "let", "toward", "among", ""
   123 };
   124 
/*
 * Lower-case word list terminated by an empty string.
 * NOTE(review): presumably words that are queried when directly followed
 * by a period - confirm against the code that reads this list.
 */
char *noperiod[] = {
    "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
    "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
    "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
    "among", "those", "into", "whom", "having", "thence", ""
};
   131 
/* Program switches, indexed by the *_SWITCH constants. */
gboolean pswit[SWITNO];
/*
 * Argument of the "charset" option, or NULL. parse_options() hands it to
 * set_charset() and then frees it.
 */
gchar *opt_charset;

/*
 * Set by the compatibility options -t and -x. parse_options() applies them
 * as toggles of the typo and paranoid switches after parsing.
 */
gboolean typo_compat,paranoid_compat;
   136 
   137 static GOptionEntry options[]={
   138     { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
   139       "Ignore DP-specific markup", NULL },
   140     { "no-dp", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   141       G_OPTION_ARG_NONE, pswit+DP_SWITCH,
   142       "Don't ignore DP-specific markup", NULL },
   143     { "echo", 0, G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
   144       "Echo queried line", NULL },
   145     { "no-echo", 'e', G_OPTION_FLAG_REVERSE,
   146       G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
   147       "Don't echo queried line", NULL },
   148     { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
   149       "Check single quotes", NULL },
   150     { "no-squote", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   151       G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
   152       "Don't check single quotes", NULL },
   153     { "typo", 0, 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
   154       "Check common typos", NULL },
   155     { "no-typo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   156       G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
   157       "Don't check common typos", NULL },
   158     { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
   159       "Require closure of quotes on every paragraph", NULL },
   160     { "no-qpara", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   161       G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
   162       "Don't require closure of quotes on every paragraph", NULL },
   163     { "paranoid", 0, G_OPTION_FLAG_HIDDEN,
   164       G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
   165       "Enable paranoid querying of everything", NULL },
   166     { "no-paranoid", 0, G_OPTION_FLAG_REVERSE,
   167       G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
   168       "Disable paranoid querying of everything", NULL },
   169     { "line-end", 0, G_OPTION_FLAG_HIDDEN,
   170       G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
   171       "Enable line end checking", NULL },
   172     { "no-line-end", 'l', G_OPTION_FLAG_REVERSE,
   173       G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
   174       "Diable line end checking", NULL },
   175     { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
   176       "Overview: just show counts", NULL },
   177     { "no-overview", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   178       G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
   179       "Show individual warnings", NULL },
   180     { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
   181       "Output errors to stdout instead of stderr", NULL },
   182     { "no-stdout", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   183       G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
   184       "Output errors to stderr instead of stdout", NULL },
   185     { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
   186       "Echo header fields", NULL },
   187     { "no-header", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   188       G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
   189       "Don't echo header fields", NULL },
   190     { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
   191       "Ignore markup in < >", NULL },
   192     { "no-markup", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   193       G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
   194       "No special handling for markup in < >", NULL },
   195     { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
   196       "Use file of user-defined typos", NULL },
   197     { "no-usertypo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   198       G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
   199       "Ignore file of user-defined typos", NULL },
   200     { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
   201       "Verbose - list everything", NULL },
   202     { "no-verbose", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   203       G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
   204       "Switch off verbose mode", NULL },
   205     { "charset", 0, 0, G_OPTION_ARG_STRING, &opt_charset,
   206       "Set of characters valid for this ebook", "NAME" },
   207     { NULL }
   208 };
   209 
/*
 * Options relating to configuration which make no sense from inside
 * a configuration file. They are accepted on the command line only:
 * config_file_update() and parse_config_file() walk options[] and never
 * look at this table.
 */

static GOptionEntry config_options[]={
    { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
      "Defaults for use on www upload", NULL },
    { "dump-config", 0, 0, G_OPTION_ARG_NONE, pswit+DUMP_CONFIG_SWITCH,
      "Dump current config settings", NULL },
    { NULL }
};
   222 
/*
 * Options kept for compatibility with gutcheck. They are registered by
 * parse_options() in a separate "compatibility" option group. Unlike the
 * entries of options[] they only record that they were seen;
 * parse_options() then toggles the affected switches.
 */
static GOptionEntry compatibility_options[]={
    { "toggle-typo", 't', 0, G_OPTION_ARG_NONE, &typo_compat,
      "Toggle checking for common typos", NULL },
    { "toggle-relaxed", 'x', 0, G_OPTION_ARG_NONE, &paranoid_compat,
      "Toggle both paranoid mode and common typos", NULL },
    { NULL }
};
   230 
/*
 * Query counters. In overview mode main() prints each non-zero cnt_*
 * value (except cnt_spacend) and their total.
 */
long cnt_quote;		/* for overview mode, count of quote queries */
long cnt_brack;		/* for overview mode, count of brackets queries */
long cnt_bin;		/* for overview mode, count of non-ASCII queries */
long cnt_odd;		/* for overview mode, count of odd character queries */
long cnt_long;		/* for overview mode, count of long line errors */
long cnt_short;		/* for overview mode, count of short line queries */
long cnt_punct;		/* for overview mode,
			   count of punctuation and spacing queries */
long cnt_dash;		/* for overview mode, count of dash-related queries */
long cnt_word;		/* for overview mode, count of word queries */
long cnt_html;		/* for overview mode, count of html queries */
long cnt_lineend;	/* for overview mode, count of line-end queries */
long cnt_spacend;	/* count of lines with space at end;
			   not part of the overview total */
long linecnt;		/* count of total lines in the file */
long checked_linecnt;	/* count of lines actually checked */
   246 
/* Forward declarations. */
void proghelp(GOptionContext *context);
void procfile(const char *);

/* Directory part of argv[0]; set and freed by main(). */
gchar *running_from;

gboolean mixdigit(const char *);
gchar *getaword(const char **);
char *flgets(char **,long);
void postprocess_for_HTML(char *);
char *linehasmarkup(char *);
char *losemarkup(char *);
gboolean tagcomp(const char *,const char *);
void loseentities(char *);
gboolean isroman(const char *);
void postprocess_for_DP(char *);
/* Print handlers; read_etext() installs one with g_set_print_handler(). */
void print_as_windows_1252(const char *string);
void print_as_utf_8(const char *string);

/* NOTE(review): presumably sets of already-queried words - confirm at use. */
GTree *qword,*qperiod;

#ifdef __WIN32__
/* Console output code page at startup; restored by cleanup_on_exit(). */
UINT saved_cp;
#endif

/*
 * Configuration loaded by parse_config_file(), or NULL if no configuration
 * file was found. dump_config() creates it on demand; main() frees it.
 */
GKeyFile *config;
   272 
   273 void config_file_update(GKeyFile *kf)
   274 {
   275     int i;
   276     gboolean sw;
   277     for(i=0;options[i].long_name;i++)
   278     {
   279 	if (g_str_has_prefix(options[i].long_name,"no-"))
   280 	    continue;
   281 	if (options[i].arg==G_OPTION_ARG_NONE)
   282 	{
   283 	    sw=*(gboolean *)options[i].arg_data;
   284 	    if (options[i].flags&G_OPTION_FLAG_REVERSE)
   285 		sw=!sw;
   286 	    g_key_file_set_boolean(kf,"options",options[i].long_name,sw);
   287 	}
   288 	else
   289 	    g_assert_not_reached();
   290     }
   291 }
   292 
   293 void config_file_add_comments(GKeyFile *kf)
   294 {
   295     int i;
   296     gchar *comment;
   297     g_key_file_set_comment(kf,NULL,NULL," Default configuration for bookloupe",
   298       NULL);
   299     for(i=0;options[i].long_name;i++)
   300     {
   301 	if (g_str_has_prefix(options[i].long_name,"no-"))
   302 	    continue;
   303 	comment=g_strconcat(" ",options[i].description,NULL);
   304 	g_key_file_set_comment(kf,"options",options[i].long_name,comment,NULL);
   305 	g_free(comment);
   306     }
   307 }
   308 
   309 void dump_config(void)
   310 {
   311     gchar *s;
   312     if (config)
   313 	config_file_update(config);
   314     else
   315     {
   316 	config=g_key_file_new();
   317 	config_file_update(config);
   318 	config_file_add_comments(config);
   319     }
   320     s=g_key_file_to_data(config,NULL,NULL);
   321     if (s)
   322 	g_print("%s",s);
   323     g_free(s);
   324 }
   325 
   326 GKeyFile *read_config_file(gchar **full_path)
   327 {
   328     int i;
   329     GError *err=NULL;
   330     gchar **search_dirs;
   331     gchar *path;
   332     const char *search_path;
   333     GKeyFile *kf;
   334     kf=g_key_file_new();
   335     search_path=g_getenv("BOOKLOUPE_CONFIG_PATH");
   336     if (search_path)
   337     {
   338 #ifdef __WIN32__
   339 	search_dirs=g_strsplit(search_path,";",0);
   340 #else
   341 	search_dirs=g_strsplit(search_path,":",0);
   342 #endif
   343     }
   344     else
   345     {
   346 	search_dirs=g_new(gchar *,4);
   347 	search_dirs[0]=g_get_current_dir();
   348 	search_dirs[1]=g_strdup(running_from);
   349 	search_dirs[2]=g_strdup(g_get_user_config_dir());
   350 	search_dirs[3]=NULL;
   351     }
   352     for(i=0;search_dirs[i];i++)
   353     {
   354 	path=g_build_filename(search_dirs[i],"bookloupe.ini",NULL);
   355 	if (g_key_file_load_from_file(kf,path,
   356 	  G_KEY_FILE_KEEP_COMMENTS|G_KEY_FILE_KEEP_TRANSLATIONS,&err))
   357 	    break;
   358 	if (!g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   359 	{
   360 	    g_printerr("Bookloupe: Error reading %s\n",path);
   361 	    g_printerr("%s\n",err->message);
   362 	    exit(1);
   363 	}
   364 	g_clear_error(&err);
   365 	g_free(path);
   366 	path=NULL;
   367     }
   368     if (!search_dirs[i])
   369     {
   370 	g_key_file_free(kf);
   371 	kf=NULL;
   372     }
   373     g_strfreev(search_dirs);
   374     if (full_path && kf)
   375 	*full_path=path;
   376     else
   377 	g_free(path);
   378     return kf;
   379 }
   380 
   381 void parse_config_file(void)
   382 {
   383     int i,j;
   384     gchar *path;
   385     gchar **keys;
   386     gboolean sw;
   387     GError *err=NULL;
   388     config=read_config_file(&path);
   389     if (config)
   390 	keys=g_key_file_get_keys(config,"options",NULL,NULL);
   391     else
   392 	keys=NULL;
   393     if (keys)
   394     {
   395 	for(i=0;keys[i];i++)
   396 	{
   397 	    for(j=0;options[j].long_name;j++)
   398 	    {
   399 		if (g_str_has_prefix(options[j].long_name,"no-"))
   400 		    continue;
   401 		else if (!strcmp(keys[i],options[j].long_name))
   402 		{
   403 		    if (options[j].arg==G_OPTION_ARG_NONE)
   404 		    {
   405 			sw=g_key_file_get_boolean(config,"options",keys[i],
   406 			  &err);
   407 			if (err)
   408 			{
   409 			    g_printerr("Bookloupe: %s: options.%s: %s\n",
   410 			      path,keys[i],err->message);
   411 			    g_clear_error(&err);
   412 			}
   413 			if (options[j].flags&G_OPTION_FLAG_REVERSE)
   414 			    sw=!sw;
   415 			*(gboolean *)options[j].arg_data=sw;
   416 			break;
   417 		    }
   418 		    else
   419 			g_assert_not_reached();
   420 		}
   421 	    }
   422 	    if (!options[j].long_name)
   423 		g_printerr("Bookloupe: %s: Unknown option \"%s\" ignored\n",
   424 		  path,keys[i]);
   425 	}
   426 	g_strfreev(keys);
   427     }
   428     if (config)
   429 	g_free(path);
   430 }
   431 
   432 gboolean set_charset(const char *name,GError **err)
   433 {
   434     /* The various UNICODE encodings all share the same character set. */
   435     const char *unicode_aliases[]={ "UCS-2", "UCS-2BE", "UCS-2LE", "UCS-4",
   436       "UCS-4BE", "UCS-4LE", "UCS2", "UCS4", "UNICODE", "UNICODEBIG",
   437       "UNICODELITTLE", "UTF-7", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE",
   438       "UTF-32", "UTF-32BE", "UTF-32LE", "UTF7", "UTF8", "UTF16", "UTF16BE",
   439       "UTF16LE", "UTF32", "UTF32BE", "UTF32LE" };
   440     int i;
   441     if (charset)
   442 	g_free(charset);
   443     if (charset_validator!=(GIConv)-1)
   444 	g_iconv_close(charset_validator);
   445     if (!name || !g_strcasecmp(name,"auto"))
   446     {
   447 	charset=NULL;
   448 	charset_validator=(GIConv)-1;
   449 	return TRUE;
   450     }
   451     else
   452 	charset=g_strdup(name);
   453     for(i=0;i<G_N_ELEMENTS(unicode_aliases);i++)
   454 	if (!g_strcasecmp(charset,unicode_aliases[i]))
   455 	{
   456 	    g_free(charset);
   457 	    charset=g_strdup("UTF-8");
   458 	    break;
   459 	}
   460     if (!strcmp(charset,"UTF-8"))
   461 	charset_validator=(GIConv)-1;
   462     else
   463     {
   464 	charset_validator=g_iconv_open(charset,"UTF-8");
   465 	if (charset_validator==(GIConv)-1)
   466 	{
   467 	    g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_NO_CONVERSION,
   468 	      "Unknown character set \"%s\"",charset);
   469 	    return FALSE;
   470 	}
   471     }
   472     return TRUE;
   473 }
   474 
   475 void parse_options(int *argc,char ***argv)
   476 {
   477     GError *err=NULL;
   478     GOptionContext *context;
   479     GOptionGroup *compatibility;
   480     context=g_option_context_new(
   481       "file - look for errors in Project Gutenberg(TM) etexts");
   482     g_option_context_add_main_entries(context,options,NULL);
   483     g_option_context_add_main_entries(context,config_options,NULL);
   484     compatibility=g_option_group_new("compatibility",
   485       "Options for Compatibility with Gutcheck:",
   486       "Show compatibility options",NULL,NULL);
   487     g_option_group_add_entries(compatibility,compatibility_options);
   488     g_option_context_add_group(context,compatibility);
   489     g_option_context_set_description(context,
   490       "For simplicity, only the switch options which reverse the\n"
   491       "default configuration are listed. In most cases, both vanilla\n"
   492       "and \"no-\" prefixed versions are available for use.");
   493     if (!g_option_context_parse(context,argc,argv,&err))
   494     {
   495 	g_printerr("Bookloupe: %s\n",err->message);
   496 	g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
   497 	exit(1);
   498     }
   499     if (typo_compat)
   500 	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
   501     if (paranoid_compat)
   502     {
   503 	pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
   504 	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
   505     }
   506     /*
   507      * Web uploads - for the moment, this is really just a placeholder
   508      * until we decide what processing we really want to do on web uploads
   509      */
   510     if (pswit[WEB_SWITCH])
   511     {
   512 	/* specific override for web uploads */
   513 	pswit[ECHO_SWITCH]=TRUE;
   514 	pswit[SQUOTE_SWITCH]=FALSE;
   515 	pswit[TYPO_SWITCH]=TRUE;
   516 	pswit[QPARA_SWITCH]=FALSE;
   517 	pswit[PARANOID_SWITCH]=TRUE;
   518 	pswit[LINE_END_SWITCH]=FALSE;
   519 	pswit[OVERVIEW_SWITCH]=FALSE;
   520 	pswit[STDOUT_SWITCH]=FALSE;
   521 	pswit[HEADER_SWITCH]=TRUE;
   522 	pswit[VERBOSE_SWITCH]=FALSE;
   523 	pswit[MARKUP_SWITCH]=FALSE;
   524 	pswit[USERTYPO_SWITCH]=FALSE;
   525 	pswit[DP_SWITCH]=FALSE;
   526     }
   527     if (opt_charset && !set_charset(opt_charset,&err))
   528     {
   529 	g_printerr("%s\n",err->message);
   530 	exit(1);
   531     }
   532     if (pswit[DUMP_CONFIG_SWITCH])
   533     {
   534 	dump_config();
   535 	exit(0);
   536     }
   537     g_free(opt_charset);
   538     opt_charset=NULL;
   539     if (pswit[OVERVIEW_SWITCH])
   540 	/* just print summary; don't echo */
   541 	pswit[ECHO_SWITCH]=FALSE;
   542     if (*argc<2)
   543     {
   544 	proghelp(context);
   545 	exit(1);
   546     }
   547     g_option_context_free(context);
   548 }
   549 
   550 /*
   551  * read_user_scannos:
   552  *
   553  * Read in the user-defined stealth scanno list.
   554  */
   555 void read_user_scannos(void)
   556 {
   557     GError *err=NULL;
   558     gchar *usertypo_file;
   559     gboolean okay;
   560     int i;
   561     gsize len,nb;
   562     gchar *contents,*utf8,**lines;
   563     usertypo_file=g_strdup("bookloupe.typ");
   564     okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   565     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   566     {
   567 	g_clear_error(&err);
   568 	g_free(usertypo_file);
   569 	usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
   570 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   571     }
   572     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   573     {
   574 	g_clear_error(&err);
   575 	g_free(usertypo_file);
   576 	usertypo_file=g_strdup("gutcheck.typ");
   577 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   578     }
   579     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   580     {
   581 	g_clear_error(&err);
   582 	g_free(usertypo_file);
   583 	usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
   584 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   585     }
   586     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   587     {
   588 	g_free(usertypo_file);
   589 	g_print("   --> I couldn't find bookloupe.typ "
   590 	  "-- proceeding without user typos.\n");
   591 	return;
   592     }
   593     else if (!okay)
   594     {
   595 	fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
   596 	g_free(usertypo_file);
   597 	g_clear_error(&err);
   598 	exit(1);
   599     }
   600     if (g_utf8_validate(contents,len,NULL))
   601     {
   602 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
   603 	if (!charset)
   604 	    (void)set_charset("UNICODE",NULL);
   605     }
   606     else
   607 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
   608     g_free(contents);
   609     lines=g_strsplit_set(utf8,"\r\n",0);
   610     g_free(utf8);
   611     usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
   612     for (i=0;lines[i];i++)
   613 	if (*(unsigned char *)lines[i]>'!')
   614 	    g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
   615 	else
   616 	    g_free(lines[i]);
   617     g_free(lines);
   618 }
   619 
   620 /*
   621  * read_etext:
   622  *
   623  * Read an etext returning a newly allocated string containing the file
   624  * contents or NULL on error.
   625  */
   626 gchar *read_etext(const char *filename,GError **err)
   627 {
   628     GError *tmp_err=NULL;
   629     gchar *contents,*utf8;
   630     gsize len,bytes_read,bytes_written;
   631     int i,line,col;
   632     if (!g_file_get_contents(filename,&contents,&len,err))
   633 	return NULL;
   634     if (g_utf8_validate(contents,len,NULL))
   635     {
   636 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
   637 	g_set_print_handler(print_as_utf_8);
   638 #ifdef __WIN32__
   639 	SetConsoleOutputCP(CP_UTF8);
   640 #endif
   641     }
   642     else
   643     {
   644 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,
   645 	  &bytes_written,&tmp_err);
   646 	if (g_error_matches(tmp_err,G_CONVERT_ERROR,
   647 	  G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
   648 	{
   649 	    line=col=1;
   650 	    for(i=0;i<bytes_read;i++)
   651 		if (contents[i]=='\n')
   652 		{
   653 		    line++;
   654 		    col=1;
   655 		}
   656 		else if (contents[i]!='\r')
   657 		    col++;
   658 	    g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
   659 	      "Input conversion failed. Byte %d at line %d, column %d is not a "
   660 	      "valid Windows-1252 character",
   661 	      ((unsigned char *)contents)[bytes_read],line,col);
   662 	}
   663 	else if (tmp_err)
   664 	    g_propagate_error(err,tmp_err);
   665 	g_set_print_handler(print_as_windows_1252);
   666 #ifdef __WIN32__
   667 	SetConsoleOutputCP(1252);
   668 #endif
   669     }
   670     g_free(contents);
   671     return utf8;
   672 }
   673 
/*
 * cleanup_on_exit:
 *
 * Exit handler, registered by main() with atexit() on Windows. Restores
 * the console output code page saved in saved_cp, which read_etext() may
 * have changed. Does nothing on other platforms.
 */
void cleanup_on_exit(void)
{
#ifdef __WIN32__
    SetConsoleOutputCP(saved_cp);
#endif
}
   680 
   681 int main(int argc,char **argv)
   682 {
   683 #ifdef __WIN32__
   684     atexit(cleanup_on_exit);
   685     saved_cp=GetConsoleOutputCP();
   686 #endif
   687     running_from=g_path_get_dirname(argv[0]);
   688     /* Paranoid checking is turned OFF, not on, by its switch */
   689     pswit[PARANOID_SWITCH]=TRUE;
   690     /* if running in paranoid mode, typo checks default to enabled */
   691     pswit[TYPO_SWITCH]=TRUE;
   692     /* Line-end checking is turned OFF, not on, by its switch */
   693     pswit[LINE_END_SWITCH]=TRUE;
   694     /* Echoing is turned OFF, not on, by its switch */
   695     pswit[ECHO_SWITCH]=TRUE;
   696     parse_config_file();
   697     parse_options(&argc,&argv);
   698     if (pswit[USERTYPO_SWITCH])
   699 	read_user_scannos();
   700     fprintf(stderr,"bookloupe: Check and report on an e-text\n");
   701     procfile(argv[1]);
   702     if (pswit[OVERVIEW_SWITCH])
   703     {
   704 	g_print("    Checked %ld lines of %ld (head+foot = %ld)\n\n",
   705 	  checked_linecnt,linecnt,linecnt-checked_linecnt);
   706 	g_print("    --------------- Queries found --------------\n");
   707 	if (cnt_long)
   708 	    g_print("    Long lines:		    %14ld\n",cnt_long);
   709 	if (cnt_short)
   710 	    g_print("    Short lines:		   %14ld\n",cnt_short);
   711 	if (cnt_lineend)
   712 	    g_print("    Line-end problems:	     %14ld\n",cnt_lineend);
   713 	if (cnt_word)
   714 	    g_print("    Common typos:		  %14ld\n",cnt_word);
   715 	if (cnt_quote)
   716 	    g_print("    Unmatched quotes:	      %14ld\n",cnt_quote);
   717 	if (cnt_brack)
   718 	    g_print("    Unmatched brackets:	    %14ld\n",cnt_brack);
   719 	if (cnt_bin)
   720 	    g_print("    Non-ASCII characters:	  %14ld\n",cnt_bin);
   721 	if (cnt_odd)
   722 	    g_print("    Proofing characters:	   %14ld\n",cnt_odd);
   723 	if (cnt_punct)
   724 	    g_print("    Punctuation & spacing queries: %14ld\n",cnt_punct);
   725 	if (cnt_dash)
   726 	    g_print("    Non-standard dashes:	   %14ld\n",cnt_dash);
   727 	if (cnt_html)
   728 	    g_print("    Possible HTML tags:	    %14ld\n",cnt_html);
   729 	g_print("\n");
   730 	g_print("    TOTAL QUERIES		  %14ld\n",
   731 	  cnt_quote+cnt_brack+cnt_bin+cnt_odd+cnt_long+cnt_short+cnt_punct+
   732 	  cnt_dash+cnt_word+cnt_html+cnt_lineend);
   733     }
   734     g_free(running_from);
   735     if (usertypo)
   736 	g_tree_unref(usertypo);
   737     set_charset(NULL,NULL);
   738     if (config)
   739 	g_key_file_free(config);
   740     return 0;
   741 }
   742 
   743 void count_dashes(const char *line,const char *dash,
   744   struct dash_results *results)
   745 {
   746     int i;
   747     gchar **tokens;
   748     gunichar pc,nc;
   749     gboolean spaced=FALSE,unspaced=FALSE,spaced2=FALSE;
   750     if (!*line)
   751 	return;
   752     tokens=g_strsplit(line,dash,0);
   753     if (tokens[1])
   754 	results->base++;
   755     for(i=1;tokens[i];i++)
   756     {
   757 	pc=g_utf8_get_char(g_utf8_prev_char(tokens[i-1]+strlen(tokens[i-1])));
   758 	nc=g_utf8_get_char(tokens[i]);
   759 	if (g_unichar_isspace(pc) || g_unichar_isspace(nc))
   760 	    spaced=TRUE;
   761 	if (g_unichar_isspace(pc) && g_unichar_isspace(nc))
   762 	    spaced2=TRUE;
   763 	else if (!g_unichar_isspace(pc) && !g_unichar_isspace(nc))
   764 	    unspaced=TRUE;
   765     }
   766     if (spaced)
   767 	results->space++;
   768     if (spaced2)
   769 	/* count of lines with em-dashes with spaces both sides */
   770 	results->non_PG_space++;
   771     if (unspaced)
   772 	/* count of lines with PG-type em-dashes with no spaces */
   773 	results->PG_space++;
   774     g_strfreev(tokens);
   775 }
   776 
   777 /*
   778  * first_pass:
   779  *
   780  * Run a first pass - verify that it's a valid PG
   781  * file, decide whether to report some things that
   782  * occur many times in the text like long or short
   783  * lines, non-standard dashes, etc.
   784  */
   785 struct first_pass_results *first_pass(const char *etext)
   786 {
   787     gunichar laststart=CHAR_SPACE;
   788     const char *s;
   789     gchar *lc_line;
   790     int i,j,lbytes,llen;
   791     gchar **lines;
   792     unsigned int lastlen=0,lastblen=0;
   793     long spline=0,nspline=0;
   794     static struct first_pass_results results={0};
   795     struct dash_results tmp_dash_results;
   796     gchar *inword;
   797     QuoteClass qc;
   798     lines=g_strsplit(etext,"\n",0);
   799     for (j=0;lines[j];j++)
   800     {
   801 	lbytes=strlen(lines[j]);
   802 	while (lbytes>0 && lines[j][lbytes-1]=='\r')
   803 	    lines[j][--lbytes]='\0';
   804 	llen=g_utf8_strlen(lines[j],lbytes);
   805 	linecnt++;
   806 	if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
   807 	  (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
   808 	{
   809 	    if (spline)
   810 		g_print("   --> Duplicate header?\n");
   811 	    spline=linecnt+1;   /* first line of non-header text, that is */
   812 	}
   813 	if (!strncmp(lines[j],"*** START",9) &&
   814 	  strstr(lines[j],"PROJECT GUTENBERG"))
   815 	{
   816 	    if (nspline)
   817 		g_print("   --> Duplicate header?\n");
   818 	    nspline=linecnt+1;   /* first line of non-header text, that is */
   819 	}
   820 	if (spline || nspline)
   821 	{
   822 	    lc_line=g_utf8_strdown(lines[j],lbytes);
   823 	    if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
   824 	    {
   825 		if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
   826 		{
   827 		    if (results.footerline)
   828 		    {
   829 			/* it's an old-form header - we can detect duplicates */
   830 			if (!nspline)
   831 			    g_print("   --> Duplicate footer?\n");
   832 		    }
   833 		    else
   834 			results.footerline=linecnt;
   835 		}
   836 	    }
   837 	    g_free(lc_line);
   838 	}
   839 	if (spline)
   840 	    results.firstline=spline;
   841 	if (nspline)
   842 	    results.firstline=nspline;  /* override with new */
   843 	if (results.footerline)
   844 	    continue;    /* don't count the boilerplate in the footer */
   845 	results.totlen+=llen;
   846 	for (s=lines[j];*s;s=g_utf8_next_char(s))
   847 	{
   848 	    if (g_utf8_get_char(s)>127)
   849 		results.binlen++;
   850 	    if (g_unichar_isalpha(g_utf8_get_char(s)))
   851 		results.alphalen++;
   852 	    if (s>lines[j])
   853 	    {
   854 		if (CHAR_IS_DQUOTE(g_utf8_get_char(s)))
   855 		    qc=QUOTE_CLASS(g_utf8_get_char(s));
   856 		else
   857 		    qc=INVALID_QUOTE;
   858 		if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) &&
   859 		  g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
   860 		    results.endquote_count++;
   861 	    }
   862 	}
   863 	if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
   864 	  lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
   865 	    results.shortline++;
   866 	if (lbytes>0 &&
   867 	  g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
   868 	    cnt_spacend++;
   869 	if (strstr(lines[j],".,"))
   870 	    results.dotcomma++;
   871 	/* only count ast lines for ignoring purposes where there is */
   872 	/* locase text on the line */
   873 	if (strchr(lines[j],'*'))
   874 	{
   875 	    for (s=lines[j];*s;s=g_utf8_next_char(s))
   876 		if (g_unichar_islower(g_utf8_get_char(s)))
   877 		    break;
   878 	    if (*s)
   879 		results.astline++;
   880 	}
   881 	if (strchr(lines[j],'/'))
   882 	    results.fslashline++;
   883 	if (lbytes>0)
   884 	{
   885 	    for (s=g_utf8_prev_char(lines[j]+lbytes);
   886 	      s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;
   887 	      s=g_utf8_prev_char(s))
   888 		;
   889 	    if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
   890 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')
   891 		results.hyphens++;
   892 	}
   893 	if (llen>LONGEST_PG_LINE)
   894 	    results.longline++;
   895 	if (llen>WAY_TOO_LONG)
   896 	    results.verylongline++;
   897 	if (strchr(lines[j],'<') && strchr(lines[j],'>'))
   898 	{
   899 	    i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
   900 	    if (i>0)
   901 		results.htmcount++;
   902 	    if (strstr(lines[j],"<i>"))
   903 		results.htmcount+=4; /* bonus marks! */
   904 	}
   905 	/* Check for spaced em-dashes */
   906 	memset(&tmp_dash_results,0,sizeof(tmp_dash_results));
   907 	count_dashes(lines[j],"--",&tmp_dash_results);
   908 	count_dashes(lines[j],"—",&tmp_dash_results);
   909 	if (tmp_dash_results.base)
   910 	    results.emdash.base++;
   911 	if (tmp_dash_results.non_PG_space)
   912 	    results.emdash.non_PG_space++;
   913 	if (tmp_dash_results.PG_space)
   914 	    results.emdash.PG_space++;
   915 	for (s=lines[j];*s;)
   916 	{
   917 	    inword=getaword(&s);
   918 	    if (!strcmp(inword,"hij") || !strcmp(inword,"niet")) 
   919 		results.Dutchcount++;
   920 	    if (!strcmp(inword,"dans") || !strcmp(inword,"avec")) 
   921 		results.Frenchcount++;
   922 	    if (!strcmp(inword,"0") || !strcmp(inword,"1")) 
   923 		results.standalone_digit++;
   924 	    g_free(inword);
   925 	}
   926 	/* Check for spaced dashes */
   927 	if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
   928 	    results.spacedash++;
   929 	lastblen=lastlen;
   930 	lastlen=llen;
   931 	laststart=lines[j][0];
   932     }
   933     g_strfreev(lines);
   934     return &results;
   935 }
   936 
   937 /*
   938  * report_first_pass:
   939  *
   940  * Make some snap decisions based on the first pass results.
   941  */
   942 struct warnings *report_first_pass(struct first_pass_results *results)
   943 {
   944     static struct warnings warnings={0};
   945     if (cnt_spacend>0)
   946 	g_print("   --> %ld lines in this file have white space at end\n",
   947 	  cnt_spacend);
   948     warnings.dotcomma=1;
   949     if (results->dotcomma>5)
   950     {
   951 	warnings.dotcomma=0;
   952 	g_print("   --> %ld lines in this file contain '.,'. "
   953 	  "Not reporting them.\n",results->dotcomma);
   954     }
   955     /*
   956      * If more than 50 lines, or one-tenth, are short,
   957      * don't bother reporting them.
   958      */
   959     warnings.shortline=1;
   960     if (results->shortline>50 || results->shortline*10>linecnt)
   961     {
   962 	warnings.shortline=0;
   963 	g_print("   --> %ld lines in this file are short. "
   964 	  "Not reporting short lines.\n",results->shortline);
   965     }
   966     /*
   967      * If more than 50 lines, or one-tenth, are long,
   968      * don't bother reporting them.
   969      */
   970     warnings.longline=1;
   971     if (results->longline>50 || results->longline*10>linecnt)
   972     {
   973 	warnings.longline=0;
   974 	g_print("   --> %ld lines in this file are long. "
   975 	  "Not reporting long lines.\n",results->longline);
   976     }
   977     /* If more than 10 lines contain asterisks, don't bother reporting them. */
   978     warnings.ast=1;
   979     if (results->astline>10)
   980     {
   981 	warnings.ast=0;
   982 	g_print("   --> %ld lines in this file contain asterisks. "
   983 	  "Not reporting them.\n",results->astline);
   984     }
   985     /*
   986      * If more than 10 lines contain forward slashes,
   987      * don't bother reporting them.
   988      */
   989     warnings.fslash=1;
   990     if (results->fslashline>10)
   991     {
   992 	warnings.fslash=0;
   993 	g_print("   --> %ld lines in this file contain forward slashes. "
   994 	  "Not reporting them.\n",results->fslashline);
   995     }
   996     /*
   997      * If more than 20 lines contain unpunctuated endquotes,
   998      * don't bother reporting them.
   999      */
  1000     warnings.endquote=1;
  1001     if (results->endquote_count>20)
  1002     {
  1003 	warnings.endquote=0;
  1004 	g_print("   --> %ld lines in this file contain unpunctuated endquotes. "
  1005 	  "Not reporting them.\n",results->endquote_count);
  1006     }
  1007     /*
  1008      * If more than 15 lines contain standalone digits,
  1009      * don't bother reporting them.
  1010      */
  1011     warnings.digit=1;
  1012     if (results->standalone_digit>10)
  1013     {
  1014 	warnings.digit=0;
  1015 	g_print("   --> %ld lines in this file contain standalone 0s and 1s. "
  1016 	  "Not reporting them.\n",results->standalone_digit);
  1017     }
  1018     /*
  1019      * If more than 20 lines contain hyphens at end,
  1020      * don't bother reporting them.
  1021      */
  1022     warnings.hyphen=1;
  1023     if (results->hyphens>20)
  1024     {
  1025 	warnings.hyphen=0;
  1026 	g_print("   --> %ld lines in this file have hyphens at end. "
  1027 	  "Not reporting them.\n",results->hyphens);
  1028     }
  1029     if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
  1030     {
  1031 	g_print("   --> Looks like this is HTML. Switching HTML mode ON.\n");
  1032 	pswit[MARKUP_SWITCH]=1;
  1033     }
  1034     if (results->verylongline>0)
  1035 	g_print("   --> %ld lines in this file are VERY long!\n",
  1036 	  results->verylongline);
  1037     /*
  1038      * If there are more non-PG spaced dashes than PG em-dashes,
  1039      * assume it's deliberate.
  1040      * Current PG guidelines say don't use them, but older texts do,
  1041      * and some people insist on them whatever the guidelines say.
  1042      */
  1043     warnings.dash=1;
  1044     if (results->spacedash+results->emdash.non_PG_space>
  1045       results->emdash.PG_space)
  1046     {
  1047 	warnings.dash=0;
  1048 	g_print("   --> There are %ld spaced dashes and em-dashes. "
  1049 	  "Not reporting them.\n",
  1050 	  results->spacedash+results->emdash.non_PG_space);
  1051     }
  1052     if (charset)
  1053 	warnings.bin=0;
  1054     else
  1055     {
  1056 	/* Charset ISO_8859-1/ASCII checks for compatibility with gutcheck */
  1057 	warnings.bin=1;
  1058 	/* If more than a quarter of characters are hi-bit, bug out. */
  1059 	if (results->binlen*4>results->totlen)
  1060 	{
  1061 	    g_print("   --> This file does not appear to be ASCII. "
  1062 	      "Terminating. Best of luck with it!\n");
  1063 	    exit(1);
  1064 	}
  1065 	if (results->alphalen*4<results->totlen)
  1066 	{
  1067 	    g_print("   --> This file does not appear to be text. "
  1068 	      "Terminating. Best of luck with it!\n");
  1069 	    exit(1);
  1070 	}
  1071 	if (results->binlen*100>results->totlen || results->binlen>100)
  1072 	{
  1073 	    g_print("   --> There are a lot of foreign letters here. "
  1074 	      "Not reporting them.\n");
  1075 	    if (!pswit[VERBOSE_SWITCH])
  1076 		warnings.bin=0;
  1077 	}
  1078     }
  1079     warnings.isDutch=FALSE;
  1080     if (results->Dutchcount>50)
  1081     {
  1082 	warnings.isDutch=TRUE;
  1083 	g_print("   --> This looks like Dutch - "
  1084 	  "switching off dashes and warnings for 's Middags case.\n");
  1085     }
  1086     warnings.isFrench=FALSE;
  1087     if (results->Frenchcount>50)
  1088     {
  1089 	warnings.isFrench=TRUE;
  1090 	g_print("   --> This looks like French - "
  1091 	  "switching off some doublepunct.\n");
  1092     }
  1093     if (results->firstline && results->footerline)
  1094 	g_print("    The PG header and footer appear to be already on.\n");
  1095     else
  1096     {
  1097 	if (results->firstline)
  1098 	    g_print("    The PG header is on - no footer.\n");
  1099 	if (results->footerline)
  1100 	    g_print("    The PG footer is on - no header.\n");
  1101     }
  1102     g_print("\n");
  1103     if (pswit[VERBOSE_SWITCH])
  1104     {
  1105 	warnings.shortline=1;
  1106 	warnings.dotcomma=1;
  1107 	warnings.longline=1;
  1108 	warnings.dash=1;
  1109 	warnings.digit=1;
  1110 	warnings.ast=1;
  1111 	warnings.fslash=1;
  1112 	warnings.hyphen=1;
  1113 	warnings.endquote=1;
  1114 	g_print("   *** Verbose output is ON -- you asked for it! ***\n");
  1115     }
  1116     if (warnings.isDutch)
  1117 	warnings.dash=0;
  1118     if (results->footerline>0 && results->firstline>0 &&
  1119       results->footerline>results->firstline &&
  1120       results->footerline-results->firstline<100)
  1121     {
  1122 	g_print("   --> I don't really know where this text starts. \n");
  1123 	g_print("       There are no reference points.\n");
  1124 	g_print("       I'm going to have to report the header and footer "
  1125 	  "as well.\n");
  1126 	results->firstline=0;
  1127     }
  1128     return &warnings;
  1129 }
  1130 
  1131 /*
  1132  * analyse_quotes:
  1133  *
  1134  * Look along the line, accumulate the count of quotes, and see
  1135  * if this is an empty line - i.e. a line with nothing on it
  1136  * but spaces.
  1137  * If line has just spaces, period, * and/or - on it, don't
  1138  * count it, since empty lines with asterisks or dashes to
  1139  * separate sections are common.
  1140  *
  1141  * Returns: TRUE if the line is empty.
  1142  */
  1143 gboolean analyse_quotes(const char *aline,struct counters *counters)
  1144 {
  1145     int guessquote=0;
  1146     /* assume the line is empty until proven otherwise */
  1147     gboolean isemptyline=TRUE;
  1148     const char *s=aline,*sprev,*snext;
  1149     gunichar c;
  1150     sprev=NULL;
  1151     GError *tmp_err=NULL;
  1152     while (*s)
  1153     {
  1154 	snext=g_utf8_next_char(s);
  1155 	c=g_utf8_get_char(s);
  1156 	if (CHAR_IS_DQUOTE(c))
  1157 	    (void)count_quote(counters,c,QUOTE_CLASS(c),&tmp_err);
  1158 	else if (CHAR_IS_SQUOTE(c) && pswit[SQUOTE_SWITCH])
  1159 	{
  1160 	    if (s==aline)
  1161 	    {
  1162 		/*
  1163 		 * At start of line, it can only be a quotation mark.
  1164 		 * Hardcode a very common exception!
  1165 		 */
  1166 		if (!g_str_has_prefix(snext,"tis") &&
  1167 		  !g_str_has_prefix(snext,"Tis"))
  1168 		    (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
  1169 	    }
  1170 	    else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
  1171 	      g_unichar_isalpha(g_utf8_get_char(snext)))
  1172 		/* Do nothing! it's definitely an apostrophe, not a quote */
  1173 		;
  1174 	    /* it's outside a word - let's check it out */
  1175 	    else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||
  1176 	      g_unichar_isalpha(g_utf8_get_char(snext)))
  1177 	    {
  1178 		/* certainly looks like a quotation mark */
  1179 		if (!g_str_has_prefix(snext,"tis") &&
  1180 		  !g_str_has_prefix(snext,"Tis"))
  1181 		    /* hardcode a very common exception! */
  1182 		{
  1183 		    if (strchr(".?!,;:",g_utf8_get_char(sprev)))
  1184 			(void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
  1185 		    else
  1186 			(void)count_quote(counters,c,OPENING_QUOTE,&tmp_err);
  1187 		}
  1188 	    }
  1189 	    else
  1190 	    {
  1191 		/* now - is it a quotation mark? */
  1192 		guessquote=0;   /* accumulate clues */
  1193 		if (g_unichar_isalpha(g_utf8_get_char(sprev)))
  1194 		{
  1195 		    /* it follows a letter - could be either */
  1196 		    guessquote++;
  1197 		    if (g_utf8_get_char(sprev)=='s')
  1198 		    {
  1199 			/* looks like a plural apostrophe */
  1200 			guessquote-=3;
  1201 			if (g_utf8_get_char(snext)==CHAR_SPACE)
  1202 			    /* bonus marks! */
  1203 			    guessquote-=2;
  1204 		    }
  1205 		    if (innermost_quote_matches(counters,c))
  1206 			/*
  1207 			 * Give it the benefit of some doubt,
  1208 			 * if a squote is already open.
  1209 			 */
  1210 			guessquote++;
  1211 		    else
  1212 			guessquote--;
  1213 		    if (guessquote>=0)
  1214 			(void)count_quote(counters,c,CLOSING_QUOTE,&tmp_err);
  1215 		}
  1216 		else
  1217 		    /* no adjacent letter - it must be a quote of some kind */
  1218 		    (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
  1219 	    }
  1220 	}
  1221 	if (tmp_err)
  1222 	{
  1223 	    if (pswit[ECHO_SWITCH])
  1224 		g_print("\n%s\n",aline);
  1225 	    if (!pswit[OVERVIEW_SWITCH])
  1226 		g_print("    Line %ld column %ld - %s\n",
  1227 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1,tmp_err->message);
  1228 	    g_clear_error(&tmp_err);
  1229 	}
  1230 	if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
  1231 	  c!='\r' && c!='\n')
  1232 	    isemptyline=FALSE;  /* ignore lines like  *  *  *  as spacers */
  1233 	if (c==CHAR_UNDERSCORE)
  1234 	    counters->c_unders++;
  1235 	if (c==CHAR_OPEN_SBRACK)
  1236 	{
  1237 	    if (!matching_difference(counters,COUNTER_ILLUSTRATION) &&
  1238 	      !matching_difference(counters,c) && s==aline &&
  1239 	      g_str_has_prefix(s,"[Illustration:"))
  1240 		increment_matching(counters,COUNTER_ILLUSTRATION,TRUE);
  1241 	    else
  1242 		increment_matching(counters,c,TRUE);
  1243 	}
  1244 	else if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK)
  1245 	    increment_matching(counters,c,TRUE);
  1246 	if (c==CHAR_CLOSE_SBRACK)
  1247 	{
  1248 	    if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) &&
  1249 	      !matching_difference(counters,c) && !*snext)
  1250 		increment_matching(counters,COUNTER_ILLUSTRATION,FALSE);
  1251 	    else
  1252 		increment_matching(counters,c,FALSE);
  1253 	}
  1254 	else if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK)
  1255 	    increment_matching(counters,c,FALSE);
  1256 	sprev=s;
  1257 	s=snext;
  1258     }
  1259     return isemptyline;
  1260 }
  1261 
  1262 /*
  1263  * check_for_control_characters:
  1264  *
  1265  * Check for invalid or questionable characters in the line
  1266  * Anything above 127 is invalid for plain ASCII, and
  1267  * non-printable control characters should also be flagged.
  1268  * Tabs should generally not be there.
  1269  */
  1270 void check_for_control_characters(const char *aline)
  1271 {
  1272     gunichar c;
  1273     const char *s;
  1274     for (s=aline;*s;s=g_utf8_next_char(s))
  1275     {
  1276 	c=g_utf8_get_char(s);
  1277 	if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
  1278 	{
  1279 	    if (pswit[ECHO_SWITCH])
  1280 		g_print("\n%s\n",aline);
  1281 	    if (!pswit[OVERVIEW_SWITCH])
  1282 		g_print("    Line %ld column %ld - Control character %u\n",
  1283 		  linecnt,g_utf8_pointer_to_offset(s,aline)+1,c);
  1284 	    else
  1285 		cnt_bin++;
  1286 	}
  1287     }
  1288 }
  1289 
  1290 /*
  1291  * check_for_odd_characters:
  1292  *
  1293  * Check for binary and other odd characters.
  1294  */
  1295 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
  1296   gboolean isemptyline)
  1297 {
  1298     /* Don't repeat multiple warnings on one line. */
  1299     gboolean eInvalidChar=FALSE,eTab=FALSE,eTilde=FALSE;
  1300     gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
  1301     const char *s;
  1302     gunichar c;
  1303     gsize nb;
  1304     gchar *t;
  1305     for (s=aline;*s;s=g_utf8_next_char(s))
  1306     {
  1307 	c=g_utf8_get_char(s);
  1308 	if (warnings->bin && !eInvalidChar &&
  1309 	  (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
  1310 	{
  1311 	    if (pswit[ECHO_SWITCH])
  1312 		g_print("\n%s\n",aline);
  1313 	    if (!pswit[OVERVIEW_SWITCH])
  1314 		if (c>127 && c<160 || c>255)
  1315 		    g_print("    Line %ld column %ld - "
  1316 		      "Non-ISO-8859 character %u\n",
  1317 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1318 		else
  1319 		    g_print("    Line %ld column %ld - "
  1320 		      "Non-ASCII character %u\n",
  1321 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1322 	    else
  1323 		cnt_bin++;
  1324 	    eInvalidChar=TRUE;
  1325 	}
  1326 	if (!eInvalidChar && charset)
  1327 	{
  1328 	    if (charset_validator==(GIConv)-1)
  1329 	    {
  1330 		if (!g_unichar_isdefined(c))
  1331 		{
  1332 		    if (pswit[ECHO_SWITCH])
  1333 			g_print("\n%s\n",aline);
  1334 		    if (!pswit[OVERVIEW_SWITCH])
  1335 			g_print("    Line %ld column %ld - Unassigned UNICODE "
  1336 			  "code point U+%04" G_GINT32_MODIFIER "X\n",
  1337 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1338 		    else
  1339 			cnt_bin++;
  1340 		    eInvalidChar=TRUE;
  1341 		}
  1342 		else if (c>=0xE000 && c<=0xF8FF || c>=0xF0000 && c<=0xFFFFD ||
  1343 		  c>=100000 && c<=0x10FFFD)
  1344 		{
  1345 		    if (pswit[ECHO_SWITCH])
  1346 			g_print("\n%s\n",aline);
  1347 		    if (!pswit[OVERVIEW_SWITCH])
  1348 			g_print("    Line %ld column %ld - Private Use "
  1349 			  "character U+%04" G_GINT32_MODIFIER "X\n",
  1350 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1351 		    else
  1352 			cnt_bin++;
  1353 		    eInvalidChar=TRUE;
  1354 		}
  1355 	    }
  1356 	    else
  1357 	    {
  1358 		t=g_convert_with_iconv(s,g_utf8_next_char(s)-s,
  1359 		  charset_validator,NULL,&nb,NULL);
  1360 		if (t)
  1361 		    g_free(t);
  1362 		else
  1363 		{
  1364 		    if (pswit[ECHO_SWITCH])
  1365 			g_print("\n%s\n",aline);
  1366 		    if (!pswit[OVERVIEW_SWITCH])
  1367 			g_print("    Line %ld column %ld - Non-%s "
  1368 			  "character %u\n",linecnt,
  1369 			  g_utf8_pointer_to_offset(aline,s)+1,charset,c);
  1370 		    else
  1371 			cnt_bin++;
  1372 		    eInvalidChar=TRUE;
  1373 		}
  1374 	    }
  1375 	}
  1376 	if (!eTab && c==CHAR_TAB)
  1377 	{
  1378 	    if (pswit[ECHO_SWITCH])
  1379 		g_print("\n%s\n",aline);
  1380 	    if (!pswit[OVERVIEW_SWITCH])
  1381 		g_print("    Line %ld column %ld - Tab character?\n",
  1382 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1383 	    else
  1384 		cnt_odd++;
  1385 	    eTab=TRUE;
  1386 	}
  1387 	if (!eTilde && c==CHAR_TILDE)
  1388 	{
  1389 	    /*
  1390 	     * Often used by OCR software to indicate an
  1391 	     * unrecognizable character.
  1392 	     */
  1393 	    if (pswit[ECHO_SWITCH])
  1394 		g_print("\n%s\n",aline);
  1395 	    if (!pswit[OVERVIEW_SWITCH])
  1396 		g_print("    Line %ld column %ld - Tilde character?\n",
  1397 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1398 	    else
  1399 		cnt_odd++;
  1400 	    eTilde=TRUE;
  1401 	}
  1402 	if (!eCarat && c==CHAR_CARAT)
  1403 	{  
  1404 	    if (pswit[ECHO_SWITCH])
  1405 		g_print("\n%s\n",aline);
  1406 	    if (!pswit[OVERVIEW_SWITCH])
  1407 		g_print("    Line %ld column %ld - Carat character?\n",
  1408 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1409 	    else
  1410 		cnt_odd++;
  1411 	    eCarat=TRUE;
  1412 	}
  1413 	if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
  1414 	{  
  1415 	    if (pswit[ECHO_SWITCH])
  1416 		g_print("\n%s\n",aline);
  1417 	    if (!pswit[OVERVIEW_SWITCH])
  1418 		g_print("    Line %ld column %ld - Forward slash?\n",
  1419 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1420 	    else
  1421 		cnt_odd++;
  1422 	    eFSlash=TRUE;
  1423 	}
  1424 	/*
  1425 	 * Report asterisks only in paranoid mode,
  1426 	 * since they're often deliberate.
  1427 	 */
  1428 	if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
  1429 	  c==CHAR_ASTERISK)
  1430 	{
  1431 	    if (pswit[ECHO_SWITCH])
  1432 		g_print("\n%s\n",aline);
  1433 	    if (!pswit[OVERVIEW_SWITCH])
  1434 		g_print("    Line %ld column %ld - Asterisk?\n",
  1435 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1436 	    else
  1437 		cnt_odd++;
  1438 	    eAst=TRUE;
  1439 	}
  1440     }
  1441 }
  1442 
  1443 /*
  1444  * check_for_long_line:
  1445  *
  1446  * Check for line too long.
  1447  */
  1448 void check_for_long_line(const char *aline)
  1449 {
  1450     if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
  1451     {
  1452 	if (pswit[ECHO_SWITCH])
  1453 	    g_print("\n%s\n",aline);
  1454 	if (!pswit[OVERVIEW_SWITCH])
  1455 	    g_print("    Line %ld column %ld - Long line %ld\n",
  1456 	      linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
  1457 	else
  1458 	    cnt_long++;
  1459     }
  1460 }
  1461 
  1462 /*
  1463  * check_for_short_line:
  1464  *
  1465  * Check for line too short.
  1466  *
  1467  * This one is a bit trickier to implement: we don't want to
  1468  * flag the last line of a paragraph for being short, so we
  1469  * have to wait until we know that our current line is a
  1470  * "normal" line, then report the _previous_ line if it was too
  1471  * short. We also don't want to report indented lines like
  1472  * chapter heads or formatted quotations. We therefore keep
  1473  * last->len as the length of the last line examined, and
  1474  * last->blen as the length of the last but one, and try to
  1475  * suppress unnecessary warnings by checking that both were of
  1476  * "normal" length. We keep the first character of the last
  1477  * line in last->start, and if it was a space, we assume that
  1478  * the formatting is deliberate. I can't figure out a way to
  1479  * distinguish something like a quoted verse left-aligned or
  1480  * the header or footer of a letter from a paragraph of short
  1481  * lines - maybe if I examined the whole paragraph, and if the
  1482  * para has less than, say, 8 lines and if all lines are short,
  1483  * then just assume it's OK? Need to look at some texts to see
  1484  * how often a formula like this would get the right result.
  1485  */
  1486 void check_for_short_line(const char *aline,const struct line_properties *last)
  1487 {
  1488     if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
  1489       last->len<SHORTEST_PG_LINE && last->blen>1 &&
  1490       last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
  1491     {
  1492 	if (pswit[ECHO_SWITCH])
  1493 	    g_print("\n%s\n",prevline);
  1494 	if (!pswit[OVERVIEW_SWITCH])
  1495 	    g_print("    Line %ld column %ld - Short line %ld?\n",
  1496 	      linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
  1497 	else
  1498 	    cnt_short++;
  1499     }
  1500 }
  1501 
  1502 /*
  1503  * check_for_starting_punctuation:
  1504  *
  1505  * Look for punctuation other than full ellipses at start of line.
  1506  */
  1507 void check_for_starting_punctuation(const char *aline)
  1508 {
  1509     if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
  1510       !g_str_has_prefix(aline,". . ."))
  1511     {
  1512 	if (pswit[ECHO_SWITCH])
  1513 	    g_print("\n%s\n",aline);
  1514 	if (!pswit[OVERVIEW_SWITCH])
  1515 	    g_print("    Line %ld column 1 - Begins with punctuation?\n",
  1516 	      linecnt);
  1517 	else
  1518 	    cnt_punct++;
  1519     }
  1520 }
  1521 
  1522 /*
  1523  * str_emdash:
  1524  *
  1525  * Find the first em-dash, return a pointer to it and set <next> to the
  1526  * character following the dash.
  1527  */
  1528 char *str_emdash(const char *s,const char **next)
  1529 {
  1530     const char *s1,*s2;
  1531     s1=strstr(s,"--");
  1532     s2=strstr(s,"—");
  1533     if (!s1)
  1534     {
  1535 	if (s2)
  1536 	    *next=g_utf8_next_char(s2);
  1537 	return (char *)s2;
  1538     }
  1539     else if (!s2)
  1540     {
  1541 	*next=g_utf8_next_char(g_utf8_next_char(s1));
  1542 	return (char *)s1;
  1543     }
  1544     else if (s1<s2)
  1545     {
  1546 	*next=g_utf8_next_char(g_utf8_next_char(s1));
  1547 	return (char *)s1;
  1548     }
  1549     else
  1550     {
  1551 	*next=g_utf8_next_char(s2);
  1552 	return (char *)s2;
  1553     }
  1554 }
  1555 
  1556 /*
  1557  * check_for_spaced_emdash:
  1558  *
  1559  * Check for spaced em-dashes.
  1560  *
  1561  * We must check _all_ occurrences of em-dashes on the line
  1562  * hence the loop - even if the first dash is OK
  1563  * there may be another that's wrong later on.
  1564  */
  1565 void check_for_spaced_emdash(const char *aline)
  1566 {
  1567     const char *s,*t,*next;
  1568     for (s=aline;t=str_emdash(s,&next);s=next)
  1569     {
  1570 	if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
  1571 	  g_utf8_get_char(next)==CHAR_SPACE)
  1572 	{
  1573 	    if (pswit[ECHO_SWITCH])
  1574 		g_print("\n%s\n",aline);
  1575 	    if (!pswit[OVERVIEW_SWITCH])
  1576 		g_print("    Line %ld column %ld - Spaced em-dash?\n",
  1577 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  1578 	    else
  1579 		cnt_dash++;
  1580 	}
  1581     }
  1582 }
  1583 
  1584 /*
  1585  * check_for_spaced_dash:
  1586  *
  1587  * Check for spaced dashes.
  1588  */
  1589 void check_for_spaced_dash(const char *aline)
  1590 {
  1591     const char *s;
  1592     if ((s=strstr(aline," -")))
  1593     {
  1594 	if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
  1595 	{
  1596 	    if (pswit[ECHO_SWITCH])
  1597 		g_print("\n%s\n",aline);
  1598 	    if (!pswit[OVERVIEW_SWITCH])
  1599 		g_print("    Line %ld column %ld - Spaced dash?\n",
  1600 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1601 	    else
  1602 		cnt_dash++;
  1603 	}
  1604     }
  1605     else if ((s=strstr(aline,"- ")))
  1606     {
  1607 	if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
  1608 	{
  1609 	    if (pswit[ECHO_SWITCH])
  1610 		g_print("\n%s\n",aline);
  1611 	    if (!pswit[OVERVIEW_SWITCH])
  1612 		g_print("    Line %ld column %ld - Spaced dash?\n",
  1613 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1614 	    else
  1615 		cnt_dash++;
  1616 	}
  1617     }
  1618 }
  1619 
  1620 /*
  1621  * check_for_unmarked_paragraphs:
  1622  *
  1623  * Check for unmarked paragraphs indicated by separate speakers.
  1624  *
  1625  * May well be false positive:
  1626  * "Bravo!" "Wonderful!" called the crowd.
  1627  * but useful all the same.
  1628  */
  1629 void check_for_unmarked_paragraphs(const char *aline)
  1630 {
  1631     const char *s;
  1632     s=strstr(aline,"\"  \"");
  1633     if (!s)
  1634 	s=strstr(aline,"\" \"");
  1635     if (s)
  1636     {
  1637 	if (pswit[ECHO_SWITCH])
  1638 	    g_print("\n%s\n",aline);
  1639 	if (!pswit[OVERVIEW_SWITCH])
  1640 	    g_print("    Line %ld column %ld - "
  1641 	      "Query missing paragraph break?\n",
  1642 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1643 	else
  1644 	    cnt_punct++;
  1645     }
  1646 }
  1647 
  1648 /*
  1649  * check_for_jeebies:
  1650  *
  1651  * Check for "to he" and other easy h/b errors.
  1652  *
  1653  * This is a very inadequate effort on the h/b problem,
  1654  * but the phrase "to he" is always an error, whereas "to
  1655  * be" is quite common.
  1656  * Similarly, '"Quiet!", be said.' is a non-be error
  1657  * "to he" is _not_ always an error!:
  1658  *       "Where they went to he couldn't say."
  1659  * Another false positive:
  1660  *       What would "Cinderella" be without the . . .
  1661  * and another: "If he wants to he can see for himself."
  1662  */
  1663 void check_for_jeebies(const char *aline)
  1664 {
  1665     const char *s;
  1666     s=strstr(aline," be could ");
  1667     if (!s)
  1668 	s=strstr(aline," be would ");
  1669     if (!s)
  1670 	s=strstr(aline," was be ");
  1671     if (!s)
  1672 	s=strstr(aline," be is ");
  1673     if (!s)
  1674 	s=strstr(aline," is be ");
  1675     if (!s)
  1676 	s=strstr(aline,"\", be ");
  1677     if (!s)
  1678 	s=strstr(aline,"\" be ");
  1679     if (!s)
  1680 	s=strstr(aline,"\" be ");
  1681     if (!s)
  1682 	s=strstr(aline," to he ");
  1683     if (s)
  1684     {
  1685 	if (pswit[ECHO_SWITCH])
  1686 	    g_print("\n%s\n",aline);
  1687 	if (!pswit[OVERVIEW_SWITCH])
  1688 	    g_print("    Line %ld column %ld - Query he/be error?\n",
  1689 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1690 	else
  1691 	    cnt_word++;
  1692     }
  1693     s=strstr(aline," the had ");
  1694     if (!s)
  1695 	s=strstr(aline," a had ");
  1696     if (!s)
  1697 	s=strstr(aline," they bad ");
  1698     if (!s)
  1699 	s=strstr(aline," she bad ");
  1700     if (!s)
  1701 	s=strstr(aline," he bad ");
  1702     if (!s)
  1703 	s=strstr(aline," you bad ");
  1704     if (!s)
  1705 	s=strstr(aline," i bad ");
  1706     if (s)
  1707     {
  1708 	if (pswit[ECHO_SWITCH])
  1709 	    g_print("\n%s\n",aline);
  1710 	if (!pswit[OVERVIEW_SWITCH])
  1711 	    g_print("    Line %ld column %ld - Query had/bad error?\n",
  1712 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1713 	else
  1714 	    cnt_word++;
  1715     }
  1716     s=strstr(aline,"; hut ");
  1717     if (!s)
  1718 	s=strstr(aline,", hut ");
  1719     if (s)
  1720     {
  1721 	if (pswit[ECHO_SWITCH])
  1722 	    g_print("\n%s\n",aline);
  1723 	if (!pswit[OVERVIEW_SWITCH])
  1724 	    g_print("    Line %ld column %ld - Query hut/but error?\n",
  1725 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1726 	else
  1727 	    cnt_word++;
  1728     }
  1729 }
  1730 
  1731 /*
  1732  * check_for_mta_from:
  1733  *
  1734  * Special case - angled bracket in front of "From" placed there by an
  1735  * MTA when sending an e-mail.
  1736  */
  1737 void check_for_mta_from(const char *aline)
  1738 {
  1739     const char *s;
  1740     s=strstr(aline,">From");
  1741     if (s)
  1742     {
  1743 	if (pswit[ECHO_SWITCH])
  1744 	    g_print("\n%s\n",aline);
  1745 	if (!pswit[OVERVIEW_SWITCH])
  1746 	    g_print("    Line %ld column %ld - "
  1747 	      "Query angled bracket with From\n",
  1748 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1749 	else
  1750 	    cnt_punct++;
  1751     }
  1752 }
  1753 
  1754 /*
  1755  * check_for_orphan_character:
  1756  *
  1757  * Check for a single character line -
  1758  * often an overflow from bad wrapping.
  1759  */
  1760 void check_for_orphan_character(const char *aline)
  1761 {
  1762     gunichar c;
  1763     c=g_utf8_get_char(aline);
  1764     if (c && !*g_utf8_next_char(aline))
  1765     {
  1766 	if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
  1767 	    ; /* Nothing - ignore numerals alone on a line. */
  1768 	else
  1769 	{
  1770 	    if (pswit[ECHO_SWITCH])
  1771 		g_print("\n%s\n",aline);
  1772 	    if (!pswit[OVERVIEW_SWITCH])
  1773 		g_print("    Line %ld column 1 - Query single character line\n",
  1774 		  linecnt);
  1775 	    else
  1776 		cnt_punct++;
  1777 	}
  1778     }
  1779 }
  1780 
  1781 /*
  1782  * check_for_pling_scanno:
  1783  *
  1784  * Check for I" - often should be !
  1785  */
  1786 void check_for_pling_scanno(const char *aline)
  1787 {
  1788     const char *s;
  1789     s=strstr(aline," I\"");
  1790     if (s)
  1791     {
  1792 	if (pswit[ECHO_SWITCH])
  1793 	    g_print("\n%s\n",aline);
  1794 	if (!pswit[OVERVIEW_SWITCH])
  1795 	    g_print("    Line %ld column %ld - Query I=exclamation mark?\n",
  1796 	      linecnt,g_utf8_pointer_to_offset(aline,s));
  1797 	else
  1798 	    cnt_punct++;
  1799     }
  1800 }
  1801 
  1802 /*
  1803  * check_for_extra_period:
  1804  *
  1805  * Check for period without a capital letter. Cut-down from gutspell.
  1806  * Only works when it happens on a single line.
  1807  */
  1808 void check_for_extra_period(const char *aline,const struct warnings *warnings)
  1809 {
  1810     const char *s,*t,*s1,*sprev;
  1811     int i;
  1812     gsize len;
  1813     gboolean istypo;
  1814     gchar *testword;
  1815     gunichar c,nc,pc,*decomposition;
  1816     if (pswit[PARANOID_SWITCH])
  1817     {
  1818 	for (t=aline;t=strstr(t,". ");)
  1819 	{
  1820 	    if (t==aline)
  1821 	    {
  1822 		t=g_utf8_next_char(t);
  1823 		/* start of line punctuation is handled elsewhere */
  1824 		continue;
  1825 	    }
  1826 	    if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
  1827 	    {
  1828 		t=g_utf8_next_char(t);
  1829 		continue;
  1830 	    }
  1831 	    if (warnings->isDutch)
  1832 	    {
  1833 		/* For Frank & Jeroen -- 's Middags case */
  1834 		gunichar c2,c3,c4,c5;
  1835 		c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
  1836 		c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
  1837 		c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
  1838 		c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
  1839 		if (CHAR_IS_APOSTROPHE(c2) &&
  1840 		  g_unichar_islower(c3) && c4==CHAR_SPACE &&
  1841 		  g_unichar_isupper(c5))
  1842 		{
  1843 		    t=g_utf8_next_char(t);
  1844 		    continue;
  1845 		}
  1846 	    }
  1847 	    s1=g_utf8_next_char(g_utf8_next_char(t));
  1848 	    while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
  1849 	      !g_unichar_isdigit(g_utf8_get_char(s1)))
  1850 		s1=g_utf8_next_char(s1);
  1851 	    if (g_unichar_islower(g_utf8_get_char(s1)))
  1852 	    {
  1853 		/* we have something to investigate */
  1854 		istypo=TRUE;
  1855 		/* so let's go back and find out */
  1856 		nc=g_utf8_get_char(t);
  1857 		s1=g_utf8_prev_char(t);
  1858 		c=g_utf8_get_char(s1);
  1859 		sprev=g_utf8_prev_char(s1);
  1860 		pc=g_utf8_get_char(sprev);
  1861 		while (s1>=aline &&
  1862 		  (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||
  1863 		  g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&
  1864 		  g_unichar_isalpha(nc)))
  1865 		{
  1866 		    nc=c;
  1867 		    s1=sprev;
  1868 		    c=pc;
  1869 		    sprev=g_utf8_prev_char(s1);
  1870 		    pc=g_utf8_get_char(sprev);
  1871 		}
  1872 		s1=g_utf8_next_char(s1);
  1873 		s=strchr(s1,'.');
  1874 		if (s)
  1875 		    testword=g_strndup(s1,s-s1);
  1876 		else
  1877 		    testword=g_strdup(s1);
  1878 		for (i=0;*abbrev[i];i++)
  1879 		    if (!strcmp(testword,abbrev[i]))
  1880 			istypo=FALSE;
  1881 		if (g_unichar_isdigit(g_utf8_get_char(testword)))
  1882 		    istypo=FALSE;
  1883 		if (!*g_utf8_next_char(testword))
  1884 		    istypo=FALSE;
  1885 		if (isroman(testword))
  1886 		    istypo=FALSE;
  1887 		if (istypo)
  1888 		{
  1889 		    istypo=FALSE;
  1890 		    for (s=testword;*s;s=g_utf8_next_char(s))
  1891 		    {
  1892 			decomposition=g_unicode_canonical_decomposition(
  1893 			  g_utf8_get_char(s),&len);
  1894 			if (g_utf8_strchr("aeiou",-1,decomposition[0]))
  1895 			    istypo=TRUE;
  1896 			g_free(decomposition);
  1897 		    }
  1898 		}
  1899 		if (istypo &&
  1900 		  (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
  1901 		{
  1902 		    g_tree_insert(qperiod,g_strdup(testword),
  1903 		      GINT_TO_POINTER(1));
  1904 		    if (pswit[ECHO_SWITCH])
  1905 			g_print("\n%s\n",aline);
  1906 		    if (!pswit[OVERVIEW_SWITCH])
  1907 			g_print("    Line %ld column %ld - Extra period?\n",
  1908 			  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  1909 		    else
  1910 			cnt_punct++;
  1911 		}
  1912 		g_free(testword);
  1913 	    }
  1914 	    t=g_utf8_next_char(t);
  1915 	}
  1916     }
  1917 }
  1918 
  1919 /*
  1920  * check_for_following_punctuation:
  1921  *
  1922  * Check for words usually not followed by punctuation.
  1923  */
  1924 void check_for_following_punctuation(const char *aline)
  1925 {
  1926     int i;
  1927     const char *s,*wordstart;
  1928     gunichar c;
  1929     gchar *inword,*t;
  1930     if (pswit[TYPO_SWITCH])
  1931     {
  1932 	for (s=aline;*s;)
  1933 	{
  1934 	    wordstart=s;
  1935 	    t=getaword(&s);
  1936 	    if (!*t)
  1937 	    {
  1938 		g_free(t);
  1939 		continue;
  1940 	    }
  1941 	    inword=g_utf8_strdown(t,-1);
  1942 	    g_free(t);
  1943 	    for (i=0;*nocomma[i];i++)
  1944 		if (!strcmp(inword,nocomma[i]))
  1945 		{
  1946 		    c=g_utf8_get_char(s);
  1947 		    if (c==',' || c==';' || c==':')
  1948 		    {
  1949 			if (pswit[ECHO_SWITCH])
  1950 			    g_print("\n%s\n",aline);
  1951 			if (!pswit[OVERVIEW_SWITCH])
  1952 			    g_print("    Line %ld column %ld - "
  1953 			      "Query punctuation after %s?\n",
  1954 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,
  1955 			      inword);
  1956 			else
  1957 			    cnt_punct++;
  1958 		    }
  1959 		}
  1960 	    for (i=0;*noperiod[i];i++)
  1961 		if (!strcmp(inword,noperiod[i]))
  1962 		{
  1963 		    c=g_utf8_get_char(s);
  1964 		    if (c=='.' || c=='!')
  1965 		    {
  1966 			if (pswit[ECHO_SWITCH])
  1967 			    g_print("\n%s\n",aline);
  1968 			if (!pswit[OVERVIEW_SWITCH])
  1969 			    g_print("    Line %ld column %ld - "
  1970 			      "Query punctuation after %s?\n",
  1971 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,
  1972 			      inword);
  1973 			else
  1974 			    cnt_punct++;
  1975 		    }
  1976 		}
  1977 	    g_free(inword);
  1978 	}
  1979     }
  1980 }
  1981 
  1982 /*
  1983  * check_for_typos:
  1984  *
  1985  * Check for commonly mistyped words,
  1986  * and digits like 0 for O in a word.
  1987  */
  1988 void check_for_typos(const char *aline,struct warnings *warnings)
  1989 {
  1990     const char *s,*t,*nt,*wordstart;
  1991     gchar *inword;
  1992     gunichar *decomposition;
  1993     gchar *testword;
  1994     int i,vowel,consonant,*dupcnt;
  1995     gboolean isdup,istypo,alower;
  1996     gunichar c,pc;
  1997     long offset,len;
  1998     gsize decomposition_len;
  1999     for (s=aline;*s;)
  2000     {
  2001 	wordstart=s;
  2002 	inword=getaword(&s);
  2003 	if (!*inword)
  2004 	{
  2005 	    g_free(inword);
  2006 	    continue; /* don't bother with empty lines */
  2007 	}
  2008 	if (mixdigit(inword))
  2009 	{
  2010 	    if (pswit[ECHO_SWITCH])
  2011 		g_print("\n%s\n",aline);
  2012 	    if (!pswit[OVERVIEW_SWITCH])
  2013 		g_print("    Line %ld column %ld - Query digit in %s\n",
  2014 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
  2015 	    else
  2016 		cnt_word++;
  2017 	}
  2018 	/*
  2019 	 * Put the word through a series of tests for likely typos and OCR
  2020 	 * errors.
  2021 	 */
  2022 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
  2023 	{
  2024 	    istypo=FALSE;
  2025 	    alower=FALSE;
  2026 	    for (t=inword;*t;t=g_utf8_next_char(t))
  2027 	    {
  2028 		c=g_utf8_get_char(t);
  2029 		nt=g_utf8_next_char(t);
  2030 		/* lowercase for testing */
  2031 		if (g_unichar_islower(c))
  2032 		    alower=TRUE;
  2033 		if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
  2034 		{
  2035 		    /*
  2036 		     * We have an uppercase mid-word. However, there are
  2037 		     * common cases:
  2038 		     *   Mac and Mc like McGill
  2039 		     *   French contractions like l'Abbe
  2040 		     */
  2041 		    offset=g_utf8_pointer_to_offset(inword,t);
  2042 		    if (offset>0)
  2043 			pc=g_utf8_get_char(g_utf8_prev_char(t));
  2044 		    else
  2045 			pc='\0';
  2046 		    if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
  2047 		      offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
  2048 		      g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
  2049 		      CHAR_IS_APOSTROPHE(pc))
  2050 			; /* do nothing! */
  2051 		    else
  2052 			istypo=TRUE;
  2053 		}
  2054 	    }
  2055 	    testword=g_utf8_casefold(inword,-1);
  2056 	}
  2057 	if (pswit[TYPO_SWITCH])
  2058 	{
  2059 	    /*
  2060 	     * Check for certain unlikely two-letter combinations at word
  2061 	     * start and end.
  2062 	     */
  2063 	    len=g_utf8_strlen(testword,-1);
  2064 	    if (len>1)
  2065 	    {
  2066 		for (i=0;*nostart[i];i++)
  2067 		    if (g_str_has_prefix(testword,nostart[i]))
  2068 			istypo=TRUE;
  2069 		for (i=0;*noend[i];i++)
  2070 		    if (g_str_has_suffix(testword,noend[i]))
  2071 			istypo=TRUE;
  2072 	    }
  2073 	    /* ght is common, gbt never. Like that. */
  2074 	    if (strstr(testword,"cb"))
  2075 		istypo=TRUE;
  2076 	    if (strstr(testword,"gbt"))
  2077 		istypo=TRUE;
  2078 	    if (strstr(testword,"pbt"))
  2079 		istypo=TRUE;
  2080 	    if (strstr(testword,"tbs"))
  2081 		istypo=TRUE;
  2082 	    if (strstr(testword,"mrn"))
  2083 		istypo=TRUE;
  2084 	    if (strstr(testword,"ahle"))
  2085 		istypo=TRUE;
  2086 	    if (strstr(testword,"ihle"))
  2087 		istypo=TRUE;
  2088 	    /*
  2089 	     * "TBE" does happen - like HEARTBEAT - but uncommon.
  2090 	     * Also "TBI" - frostbite, outbid - but uncommon.
  2091 	     * Similarly "ii" like Hawaii, or Pompeii, and in Roman
  2092 	     * numerals, but "ii" is a common scanno.
  2093 	     */
  2094 	    if (strstr(testword,"tbi"))
  2095 		istypo=TRUE;
  2096 	    if (strstr(testword,"tbe"))
  2097 		istypo=TRUE;
  2098 	    if (strstr(testword,"ii"))
  2099 		istypo=TRUE;
  2100 	    /*
  2101 	     * Check for no vowels or no consonants.
  2102 	     * If none, flag a typo.
  2103 	     */
  2104 	    if (!istypo && len>1)
  2105 	    {
  2106 		vowel=consonant=0;
  2107 		for (t=testword;*t;t=g_utf8_next_char(t))
  2108 		{
  2109 		    c=g_utf8_get_char(t);
  2110 		    decomposition=
  2111 		      g_unicode_canonical_decomposition(c,&decomposition_len);
  2112 		    if (c=='y' || g_unichar_isdigit(c))
  2113 		    {
  2114 			/* Yah, this is loose. */
  2115 			vowel++;
  2116 			consonant++;
  2117 		    }
  2118 		    else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
  2119 			vowel++;
  2120 		    else
  2121 			consonant++;
  2122 		    g_free(decomposition);
  2123 		}
  2124 		if (!vowel || !consonant)
  2125 		    istypo=TRUE;
  2126 	    }
  2127 	    /*
  2128 	     * Now exclude the word from being reported if it's in
  2129 	     * the okword list.
  2130 	     */
  2131 	    for (i=0;*okword[i];i++)
  2132 		if (!strcmp(testword,okword[i]))
  2133 		    istypo=FALSE;
  2134 	    /*
  2135 	     * What looks like a typo may be a Roman numeral.
  2136 	     * Exclude these.
  2137 	     */
  2138 	    if (istypo && isroman(testword))
  2139 		istypo=FALSE;
  2140 	    /* Check the manual list of typos. */
  2141 	    if (!istypo)
  2142 		for (i=0;*typo[i];i++)
  2143 		    if (!strcmp(testword,typo[i]))
  2144 			istypo=TRUE;
  2145 	    /*
  2146 	     * Check lowercase s, l, i and m - special cases.
  2147 	     *   "j" - often a semi-colon gone wrong.
  2148 	     *   "d" for a missing apostrophe - he d
  2149 	     *   "n" for "in"
  2150 	     */
  2151 	    if (!istypo && len==1 &&
  2152 	      g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
  2153 		istypo=TRUE;
  2154 	    if (istypo)
  2155 	    {
  2156 		dupcnt=g_tree_lookup(qword,testword);
  2157 		if (dupcnt)
  2158 		{
  2159 		    (*dupcnt)++;
  2160 		    isdup=!pswit[VERBOSE_SWITCH];
  2161 		}
  2162 		else
  2163 		{
  2164 		    dupcnt=g_new0(int,1);
  2165 		    g_tree_insert(qword,g_strdup(testword),dupcnt);
  2166 		    isdup=FALSE;
  2167 		}
  2168 		if (!isdup)
  2169 		{
  2170 		    if (pswit[ECHO_SWITCH])
  2171 			g_print("\n%s\n",aline);
  2172 		    if (!pswit[OVERVIEW_SWITCH])
  2173 		    {
  2174 			g_print("    Line %ld column %ld - Query word %s",
  2175 			  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
  2176 			  inword);
  2177 			if (!pswit[VERBOSE_SWITCH])
  2178 			    g_print(" - not reporting duplicates");
  2179 			g_print("\n");
  2180 		    }
  2181 		    else
  2182 			cnt_word++;
  2183 		}
  2184 	    }
  2185 	}
  2186 	/* check the user's list of typos */
  2187 	if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
  2188 	{
  2189 	    if (pswit[ECHO_SWITCH])
  2190 		g_print("\n%s\n",aline);
  2191 	    if (!pswit[OVERVIEW_SWITCH])  
  2192 		g_print("    Line %ld column %ld - Query possible scanno %s\n",
  2193 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
  2194 	}
  2195 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
  2196 	    g_free(testword);
  2197 	if (pswit[PARANOID_SWITCH] && warnings->digit)
  2198 	{
  2199 	    /* In paranoid mode, query all 0 and 1 standing alone. */
  2200 	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))
  2201 	    {
  2202 		if (pswit[ECHO_SWITCH])
  2203 		    g_print("\n%s\n",aline);
  2204 		if (!pswit[OVERVIEW_SWITCH])
  2205 		    g_print("    Line %ld column %ld - Query standalone %s\n",
  2206 		      linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
  2207 		      inword);
  2208 		else
  2209 		    cnt_word++;
  2210 	    }
  2211 	}
  2212 	g_free(inword);
  2213     }
  2214 }
  2215 
  2216 /*
  2217  * check_for_misspaced_punctuation:
  2218  *
  2219  * Look for added or missing spaces around punctuation and quotes.
  2220  * If there is a punctuation character like ! with no space on
  2221  * either side, suspect a missing!space. If there are spaces on
  2222  * both sides , assume a typo. If we see a double quote with no
  2223  * space or punctuation on either side of it, assume unspaced
  2224  * quotes "like"this.
  2225  */
  2226 void check_for_misspaced_punctuation(const char *aline,
  2227   struct parities *parities,gboolean isemptyline)
  2228 {
  2229     gboolean isacro,isellipsis;
  2230     const char *s;
  2231     gunichar c,nc,pc,n2c;
  2232     int parity;
  2233     c=g_utf8_get_char(aline);
  2234     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2235     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2236     {
  2237 	pc=c;
  2238 	c=nc;
  2239 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2240 	/* For each character in the line after the first. */
  2241 	if (g_utf8_strchr(".?!,;:_",-1,c))  /* if it's punctuation */
  2242 	{
  2243 	    /* we need to suppress warnings for acronyms like M.D. */
  2244 	    isacro=FALSE;
  2245 	    /* we need to suppress warnings for ellipsis . . . */
  2246 	    isellipsis=FALSE;
  2247 	    /*
  2248 	     * If there are letters on both sides of it or
  2249 	     * if it's strict punctuation followed by an alpha.
  2250 	     */
  2251 	    if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
  2252 	      g_utf8_strchr("?!,;:",-1,c)))
  2253 	    {
  2254 		if (c=='.')
  2255 		{
  2256 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&
  2257 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
  2258 			isacro=TRUE;
  2259 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
  2260 		    if (nc && n2c=='.')
  2261 			isacro=TRUE;
  2262 		}
  2263 		if (!isacro)
  2264 		{
  2265 		    if (pswit[ECHO_SWITCH])
  2266 			g_print("\n%s\n",aline);
  2267 		    if (!pswit[OVERVIEW_SWITCH])
  2268 			g_print("    Line %ld column %ld - Missing space?\n",
  2269 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2270 		    else
  2271 			cnt_punct++;
  2272 		}
  2273 	    }
  2274 	    if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
  2275 	    {
  2276 		/*
  2277 		 * If there are spaces on both sides,
  2278 		 * or space before and end of line.
  2279 		 */
  2280 		if (c=='.')
  2281 		{
  2282 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&
  2283 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
  2284 			isellipsis=TRUE;
  2285 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
  2286 		    if (nc && n2c=='.')
  2287 			isellipsis=TRUE;
  2288 		}
  2289 		if (!isemptyline && !isellipsis)
  2290 		{
  2291 		    if (pswit[ECHO_SWITCH])
  2292 			g_print("\n%s\n",aline);
  2293 		    if (!pswit[OVERVIEW_SWITCH])
  2294 			g_print("    Line %ld column %ld - "
  2295 			  "Spaced punctuation?\n",linecnt,
  2296 			  g_utf8_pointer_to_offset(aline,s)+1);
  2297 		    else
  2298 			cnt_punct++;
  2299 		}
  2300 	    }
  2301 	}
  2302     }
  2303     /* Split out the characters that CANNOT be preceded by space. */
  2304     c=g_utf8_get_char(aline);
  2305     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2306     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2307     {
  2308 	pc=c;
  2309 	c=nc;
  2310 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2311 	/* for each character in the line after the first */
  2312 	if (g_utf8_strchr("?!,;:",-1,c))
  2313 	{
  2314 	    /* if it's punctuation that _cannot_ have a space before it */
  2315 	    if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
  2316 	    {
  2317 		/*
  2318 		 * If nc DOES == space,
  2319 		 * it was already reported just above.
  2320 		 */
  2321 		if (pswit[ECHO_SWITCH])
  2322 		    g_print("\n%s\n",aline);
  2323 		if (!pswit[OVERVIEW_SWITCH])
  2324 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",
  2325 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2326 		else
  2327 		    cnt_punct++;
  2328 	    }
  2329 	}
  2330     }
  2331     /*
  2332      * Special case " .X" where X is any alpha.
  2333      * This plugs a hole in the acronym code above.
  2334      * Inelegant, but maintainable.
  2335      */
  2336     c=g_utf8_get_char(aline);
  2337     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2338     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2339     {
  2340 	pc=c;
  2341 	c=nc;
  2342 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2343 	/* for each character in the line after the first */
  2344 	if (c=='.')
  2345 	{
  2346 	    /* if it's a period */
  2347 	    if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
  2348 	    {
  2349 		/*
  2350 		 * If the period follows a space and
  2351 		 * is followed by a letter.
  2352 		 */
  2353 		if (pswit[ECHO_SWITCH])
  2354 		    g_print("\n%s\n",aline);
  2355 		if (!pswit[OVERVIEW_SWITCH])
  2356 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",
  2357 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2358 		else
  2359 		    cnt_punct++;
  2360 	    }
  2361 	}
  2362     }
  2363     c=g_utf8_get_char(aline);
  2364     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2365     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2366     {
  2367 	pc=c;
  2368 	c=nc;
  2369 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2370 	/* for each character in the line after the first */
  2371 	if (CHAR_IS_DQUOTE(c))
  2372 	{
  2373 	    if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
  2374 	      !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
  2375 	      !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
  2376 	    {
  2377 		if (pswit[ECHO_SWITCH])
  2378 		    g_print("\n%s\n",aline);
  2379 		if (!pswit[OVERVIEW_SWITCH])
  2380 		    g_print("    Line %ld column %ld - Unspaced quotes?\n",
  2381 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2382 		else
  2383 		    cnt_punct++;
  2384 	    }
  2385 	}
  2386     }
  2387     /* Check parity of quotes. */
  2388     nc=g_utf8_get_char(aline);
  2389     for (s=aline;*s;s=g_utf8_next_char(s))
  2390     {
  2391 	c=nc;
  2392 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2393 	if (CHAR_IS_DQUOTE(c))
  2394 	{
  2395 	    if (c==CHAR_DQUOTE)
  2396 	    {
  2397 		parities->dquote=!parities->dquote;
  2398 		parity=parities->dquote;
  2399 	    }
  2400 	    else if (c==CHAR_LD_QUOTE)
  2401 		parity=1;
  2402 	    else
  2403 		parity=0;
  2404 	    if (!parity)
  2405 	    {
  2406 		/* parity even */
  2407 		if (!g_utf8_strchr("_-.'`‘’/,;:!?)]} ",-1,nc))
  2408 		{
  2409 		    if (pswit[ECHO_SWITCH])
  2410 			g_print("\n%s\n",aline);
  2411 		    if (!pswit[OVERVIEW_SWITCH])
  2412 			g_print("    Line %ld column %ld - "
  2413 			  "Wrongspaced quotes?\n",
  2414 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2415 		    else
  2416 			cnt_punct++;
  2417 		}
  2418 	    }
  2419 	    else
  2420 	    {
  2421 		/* parity odd */
  2422 		if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
  2423 		  !g_utf8_strchr("_-/.'`‘’([{$",-1,nc) || !nc)
  2424 		{
  2425 		    if (pswit[ECHO_SWITCH])
  2426 			g_print("\n%s\n",aline);
  2427 		    if (!pswit[OVERVIEW_SWITCH])
  2428 			g_print("    Line %ld column %ld - "
  2429 			  "Wrongspaced quotes?\n",
  2430 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2431 		    else
  2432 			cnt_punct++;
  2433 		}
  2434 	    }
  2435 	}
  2436     }
  2437     c=g_utf8_get_char(aline);
  2438     if (CHAR_IS_DQUOTE(c))
  2439     {
  2440 	if (g_utf8_strchr(",;:!?)]} ",-1,
  2441 	  g_utf8_get_char(g_utf8_next_char(aline))))
  2442 	{
  2443 	    if (pswit[ECHO_SWITCH])
  2444 		g_print("\n%s\n",aline);
  2445 	    if (!pswit[OVERVIEW_SWITCH])
  2446 		g_print("    Line %ld column 1 - Wrongspaced quotes?\n",
  2447 		  linecnt);
  2448 	    else
  2449 		cnt_punct++;
  2450 	}
  2451     }
  2452     if (pswit[SQUOTE_SWITCH])
  2453     {
  2454 	nc=g_utf8_get_char(aline);
  2455 	for (s=aline;*s;s=g_utf8_next_char(s))
  2456 	{
  2457 	    c=nc;
  2458 	    nc=g_utf8_get_char(g_utf8_next_char(s));
  2459 	    if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&
  2460 	      !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
  2461 	      !g_unichar_isalpha(nc)))
  2462 	    {
  2463 		parities->squote=!parities->squote;
  2464 		if (!parities->squote)
  2465 		{
  2466 		    /* parity even */
  2467 		    if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
  2468 		    {
  2469 			if (pswit[ECHO_SWITCH])
  2470 			    g_print("\n%s\n",aline);
  2471 			if (!pswit[OVERVIEW_SWITCH])
  2472 			    g_print("    Line %ld column %ld - "
  2473 			      "Wrongspaced singlequotes?\n",
  2474 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2475 			else
  2476 			    cnt_punct++;
  2477 		    }
  2478 		}
  2479 		else
  2480 		{
  2481 		    /* parity odd */
  2482 		    if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
  2483 		      !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
  2484 		    {
  2485 			if (pswit[ECHO_SWITCH])
  2486 			    g_print("\n%s\n",aline);
  2487 			if (!pswit[OVERVIEW_SWITCH])
  2488 			    g_print("    Line %ld column %ld - "
  2489 			      "Wrongspaced singlequotes?\n",
  2490 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2491 			else
  2492 			    cnt_punct++;
  2493 		    }
  2494 		}
  2495 	    }
  2496 	}
  2497     }
  2498 }
  2499 
  2500 /*
  2501  * check_for_double_punctuation:
  2502  *
  2503  * Look for double punctuation like ,. or ,,
  2504  * Thanks to DW for the suggestion!
  2505  * In books with references, ".," and ".;" are common
  2506  * e.g. "etc., etc.," and vol. 1.; vol 3.;
  2507  * OTOH, from my initial tests, there are also fairly
  2508  * common errors. What to do? Make these cases paranoid?
  2509  * ".," is the most common, so warnings->dotcomma is used
  2510  * to suppress detailed reporting if it occurs often.
  2511  */
  2512 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
  2513 {
  2514     const char *s;
  2515     gunichar c,nc;
  2516     nc=g_utf8_get_char(aline);
  2517     for (s=aline;*s;s=g_utf8_next_char(s))
  2518     {
  2519 	c=nc;
  2520 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2521 	/* for each punctuation character in the line */
  2522 	if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
  2523 	  g_utf8_strchr(".?!,;:",-1,nc))
  2524 	{
  2525 	    /* followed by punctuation, it's a query, unless . . . */
  2526 	    if (c==nc && (c=='.' || c=='?' || c=='!') ||
  2527 	      !warnings->dotcomma && c=='.' && nc==',' ||
  2528 	      warnings->isFrench && g_str_has_prefix(s,",...") ||
  2529 	      warnings->isFrench && g_str_has_prefix(s,"...,") ||
  2530 	      warnings->isFrench && g_str_has_prefix(s,";...") ||
  2531 	      warnings->isFrench && g_str_has_prefix(s,"...;") ||
  2532 	      warnings->isFrench && g_str_has_prefix(s,":...") ||
  2533 	      warnings->isFrench && g_str_has_prefix(s,"...:") ||
  2534 	      warnings->isFrench && g_str_has_prefix(s,"!...") ||
  2535 	      warnings->isFrench && g_str_has_prefix(s,"...!") ||
  2536 	      warnings->isFrench && g_str_has_prefix(s,"?...") ||
  2537 	      warnings->isFrench && g_str_has_prefix(s,"...?"))
  2538 	    {
  2539 		if (warnings->isFrench && g_str_has_prefix(s,",...") ||
  2540 		  warnings->isFrench && g_str_has_prefix(s,"...,") ||
  2541 		  warnings->isFrench && g_str_has_prefix(s,";...") ||
  2542 		  warnings->isFrench && g_str_has_prefix(s,"...;") ||
  2543 		  warnings->isFrench && g_str_has_prefix(s,":...") ||
  2544 		  warnings->isFrench && g_str_has_prefix(s,"...:") ||
  2545 		  warnings->isFrench && g_str_has_prefix(s,"!...") ||
  2546 		  warnings->isFrench && g_str_has_prefix(s,"...!") ||
  2547 		  warnings->isFrench && g_str_has_prefix(s,"?...") ||
  2548 		  warnings->isFrench && g_str_has_prefix(s,"...?"))
  2549 		{
  2550 		    s+=4;
  2551 		    nc=g_utf8_get_char(g_utf8_next_char(s));
  2552 		}
  2553 		; /* do nothing for .. !! and ?? which can be legit */
  2554 	    }
  2555 	    else
  2556 	    {
  2557 		if (pswit[ECHO_SWITCH])
  2558 		    g_print("\n%s\n",aline);
  2559 		if (!pswit[OVERVIEW_SWITCH])
  2560 		    g_print("    Line %ld column %ld - Double punctuation?\n",
  2561 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2562 		else
  2563 		    cnt_punct++;
  2564 	    }
  2565 	}
  2566     }
  2567 }
  2568 
  2569 /*
  2570  * check_for_spaced_quotes:
  2571  */
  2572 void check_for_spaced_quotes(const char *aline)
  2573 {
  2574     int i;
  2575     const char *s,*t;
  2576     const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,
  2577       CHAR_RS_QUOTE};
  2578     GString *pattern;
  2579     s=aline;
  2580     while ((t=strstr(s," \" ")))
  2581     {
  2582 	if (pswit[ECHO_SWITCH])
  2583 	    g_print("\n%s\n",aline);
  2584 	if (!pswit[OVERVIEW_SWITCH])
  2585 	    g_print("    Line %ld column %ld - Spaced doublequote?\n",
  2586 	      linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  2587 	else
  2588 	    cnt_punct++;
  2589 	s=g_utf8_next_char(g_utf8_next_char(t));
  2590     }
  2591     pattern=g_string_new(NULL);
  2592     for(i=0;i<G_N_ELEMENTS(single_quotes);i++)
  2593     {
  2594 	g_string_assign(pattern," ");
  2595 	g_string_append_unichar(pattern,single_quotes[i]);
  2596 	g_string_append_c(pattern,' ');
  2597 	s=aline;
  2598 	while ((t=strstr(s,pattern->str)))
  2599 	{
  2600 	    if (pswit[ECHO_SWITCH])
  2601 		g_print("\n%s\n",aline);
  2602 	    if (!pswit[OVERVIEW_SWITCH])
  2603 		g_print("    Line %ld column %ld - Spaced singlequote?\n",
  2604 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  2605 	    else
  2606 		cnt_punct++;
  2607 	    s=g_utf8_next_char(g_utf8_next_char(t));
  2608 	}
  2609     }
  2610     g_string_free(pattern,TRUE);
  2611 }
  2612 
  2613 /*
  2614  * check_for_miscased_genative:
  2615  *
  2616  * Check special case of 'S instead of 's at end of word.
  2617  */
  2618 void check_for_miscased_genative(const char *aline)
  2619 {
  2620     const char *s;
  2621     gunichar c,nc,pc;
  2622     if (!*aline)
  2623 	return;
  2624     c=g_utf8_get_char(aline);
  2625     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2626     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2627     {
  2628 	pc=c;
  2629 	c=nc;
  2630 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2631 	if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))
  2632 	{
  2633 	    if (pswit[ECHO_SWITCH])
  2634 		g_print("\n%s\n",aline);
  2635 	    if (!pswit[OVERVIEW_SWITCH])
  2636 		g_print("    Line %ld column %ld - Capital \"S\"?\n",
  2637 		  linecnt,g_utf8_pointer_to_offset(aline,s)+2);
  2638 	    else
  2639 		cnt_punct++;
  2640 	}
  2641     }
  2642 }
  2643 
  2644 /*
  2645  * check_end_of_line:
  2646  *
  2647  * Now check special cases - start and end of line -
  2648  * for single and double quotes. Start is sometimes [sic]
  2649  * but better to query it anyway.
  2650  * While we're here, check for dash at end of line.
  2651  */
  2652 void check_end_of_line(const char *aline,struct warnings *warnings)
  2653 {
  2654     int lbytes;
  2655     const char *s;
  2656     gunichar c1,c2;
  2657     lbytes=strlen(aline);
  2658     if (g_utf8_strlen(aline,lbytes)>1)
  2659     {
  2660 	s=g_utf8_prev_char(aline+lbytes);
  2661 	c1=g_utf8_get_char(s);
  2662 	c2=g_utf8_get_char(g_utf8_prev_char(s));
  2663 	if ((CHAR_IS_DQUOTE(c1) || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)
  2664 	{
  2665 	    if (pswit[ECHO_SWITCH])
  2666 		g_print("\n%s\n",aline);
  2667 	    if (!pswit[OVERVIEW_SWITCH])
  2668 		g_print("    Line %ld column %ld - Spaced quote?\n",linecnt,
  2669 		  g_utf8_strlen(aline,lbytes));
  2670 	    else
  2671 		cnt_punct++;
  2672 	}
  2673 	c1=g_utf8_get_char(aline);
  2674 	c2=g_utf8_get_char(g_utf8_next_char(aline));
  2675 	if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)
  2676 	{
  2677 	    if (pswit[ECHO_SWITCH])
  2678 		g_print("\n%s\n",aline);
  2679 	    if (!pswit[OVERVIEW_SWITCH])
  2680 		g_print("    Line %ld column 1 - Spaced quote?\n",linecnt);
  2681 	    else
  2682 		cnt_punct++;
  2683 	}
  2684 	/*
  2685 	 * Dash at end of line may well be legit - paranoid mode only
  2686 	 * and don't report em-dash at line-end.
  2687 	 */
  2688 	if (pswit[PARANOID_SWITCH] && warnings->hyphen)
  2689 	{
  2690 	    for (s=g_utf8_prev_char(aline+lbytes);
  2691 	      s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
  2692 		;
  2693 	    if (g_utf8_get_char(s)=='-' &&
  2694 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')
  2695 	    {
  2696 		if (pswit[ECHO_SWITCH])
  2697 		    g_print("\n%s\n",aline);
  2698 		if (!pswit[OVERVIEW_SWITCH])
  2699 		    g_print("    Line %ld column %ld - "
  2700 		      "Hyphen at end of line?\n",
  2701 		      linecnt,g_utf8_pointer_to_offset(aline,s));
  2702 	    }
  2703 	}
  2704     }
  2705 }
  2706 
  2707 /*
  2708  * check_for_unspaced_bracket:
  2709  *
  2710  * Brackets are often unspaced, but shouldn't be surrounded by alpha.
  2711  * If so, suspect a scanno like "a]most".
  2712  */
  2713 void check_for_unspaced_bracket(const char *aline)
  2714 {
  2715     const char *s;
  2716     gunichar c,nc,pc;
  2717     c=g_utf8_get_char(aline);
  2718     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2719     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2720     {
  2721 	pc=c;
  2722 	c=nc;
  2723 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2724 	if (!nc)
  2725 	    break;
  2726 	/* for each bracket character in the line except 1st & last */
  2727 	if (g_utf8_strchr("{[()]}",-1,c) &&
  2728 	  g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
  2729 	{
  2730 	    if (pswit[ECHO_SWITCH])
  2731 		g_print("\n%s\n",aline);
  2732 	    if (!pswit[OVERVIEW_SWITCH])
  2733 		g_print("    Line %ld column %ld - Unspaced bracket?\n",
  2734 		  linecnt,g_utf8_pointer_to_offset(aline,s));
  2735 	    else
  2736 		cnt_punct++;
  2737 	}
  2738     }
  2739 }
  2740 
  2741 /*
  2742  * check_for_unpunctuated_endquote:
  2743  */
  2744 void check_for_unpunctuated_endquote(const char *aline)
  2745 {
  2746     const char *s;
  2747     gunichar c,nc,pc;
  2748     QuoteClass qc;
  2749     c=g_utf8_get_char(aline);
  2750     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2751     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2752     {
  2753 	pc=c;
  2754 	c=nc;
  2755 	qc=CHAR_IS_DQUOTE(c)?QUOTE_CLASS(c):INVALID_QUOTE;
  2756 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2757 	/* for each character in the line except 1st */
  2758 	if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) && g_unichar_isalpha(pc))
  2759 	{
  2760 	    if (pswit[ECHO_SWITCH])
  2761 		g_print("\n%s\n",aline);
  2762 	    if (!pswit[OVERVIEW_SWITCH])
  2763 		g_print("    Line %ld column %ld - "
  2764 		  "endquote missing punctuation?\n",
  2765 		  linecnt,g_utf8_pointer_to_offset(aline,s));
  2766 	    else
  2767 		cnt_punct++;
  2768 	}
  2769     }
  2770 }
  2771 
  2772 /*
  2773  * check_for_html_tag:
  2774  *
  2775  * Check for <HTML TAG>.
  2776  *
  2777  * If there is a < in the line, followed at some point
  2778  * by a > then we suspect HTML.
  2779  */
  2780 void check_for_html_tag(const char *aline)
  2781 {
  2782     const char *open,*close;
  2783     gchar *tag;
  2784     open=strchr(aline,'<');
  2785     if (open)
  2786     {
  2787 	close=strchr(g_utf8_next_char(open),'>');
  2788 	if (close)
  2789 	{
  2790 	    if (pswit[ECHO_SWITCH])
  2791 		g_print("\n%s\n",aline);
  2792 	    if (!pswit[OVERVIEW_SWITCH])
  2793 	    {
  2794 		tag=g_strndup(open,close-open+1);
  2795 		g_print("    Line %ld column %ld - HTML Tag? %s \n",
  2796 		  linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
  2797 		g_free(tag);
  2798 	    }
  2799 	    else
  2800 		cnt_html++;
  2801 	}
  2802     }
  2803 }
  2804 
  2805 /*
  2806  * check_for_html_entity:
  2807  *
  2808  * Check for &symbol; HTML.
  2809  *
  2810  * If there is a & in the line, followed at
  2811  * some point by a ; then we suspect HTML.
  2812  */
  2813 void check_for_html_entity(const char *aline)
  2814 {
  2815     const char *s,*amp,*scolon;
  2816     gchar *entity;
  2817     amp=strchr(aline,'&');
  2818     if (amp)
  2819     {
  2820 	scolon=strchr(amp,';');
  2821 	if (scolon)
  2822 	{
  2823 	    for (s=amp;s<scolon;s=g_utf8_next_char(s))   
  2824 		if (g_utf8_get_char(s)==CHAR_SPACE)
  2825 		    break;		/* Don't report "Jones & Son;" */
  2826 	    if (s>=scolon)
  2827 	    {
  2828 		if (pswit[ECHO_SWITCH])
  2829 		    g_print("\n%s\n",aline);
  2830 		if (!pswit[OVERVIEW_SWITCH])
  2831 		{
  2832 		    entity=g_strndup(amp,scolon-amp+1);
  2833 		    g_print("    Line %ld column %d - HTML symbol? %s \n",
  2834 		      linecnt,(int)(amp-aline)+1,entity);
  2835 		    g_free(entity);
  2836 		}
  2837 		else
  2838 		    cnt_html++;
  2839 	    }
  2840 	}
  2841     }
  2842 }
  2843 
  2844 /*
  2845  * check_for_omitted_punctuation:
  2846  *
  2847  * Check for omitted punctuation at end of paragraph by working back
  2848  * through prevline. DW.
  2849  * Need to check this only for "normal" paras.
  2850  * So what is a "normal" para?
  2851  *    Not normal if one-liner (chapter headings, etc.)
  2852  *    Not normal if doesn't contain at least one locase letter
  2853  *    Not normal if starts with space
  2854  */
  2855 void check_for_omitted_punctuation(const char *prevline,
  2856   struct line_properties *last,int start_para_line)
  2857 {
  2858     gboolean letter_on_line=FALSE;
  2859     const char *s;
  2860     gunichar c;
  2861     gboolean closing_quote;
  2862     for (s=prevline;*s;s=g_utf8_next_char(s))
  2863 	if (g_unichar_isalpha(g_utf8_get_char(s)))
  2864 	{
  2865 	    letter_on_line=TRUE;
  2866 	    break;
  2867 	}
  2868     /*
  2869      * This next "if" is a problem.
  2870      * If we say "start_para_line <= linecnt - 1", that includes
  2871      * one-line "paragraphs" like chapter heads. Lotsa false positives.
  2872      * If we say "start_para_line < linecnt - 1" it doesn't, but then it
  2873      * misses genuine one-line paragraphs.
  2874      */
  2875     if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
  2876       g_utf8_get_char(prevline)>CHAR_SPACE)
  2877     {
  2878 	s=prevline+strlen(prevline);
  2879 	do
  2880 	{
  2881 	    s=g_utf8_prev_char(s);
  2882 	    c=g_utf8_get_char(s);
  2883 	    if (QUOTE_CLASS(c)==CLOSING_QUOTE || QUOTE_CLASS(c)==NEUTRAL_QUOTE)
  2884 		closing_quote=TRUE;
  2885 	    else
  2886 		closing_quote=FALSE;
  2887 	} while (closing_quote && s>prevline);
  2888 	for (;s>prevline;s=g_utf8_prev_char(s))
  2889 	{
  2890 	    if (g_unichar_isalpha(g_utf8_get_char(s)))
  2891 	    {
  2892 		if (pswit[ECHO_SWITCH])
  2893 		    g_print("\n%s\n",prevline);
  2894 		if (!pswit[OVERVIEW_SWITCH])
  2895 		    g_print("    Line %ld column %ld - "
  2896 		      "No punctuation at para end?\n",
  2897 		      linecnt-1,g_utf8_strlen(prevline,-1));
  2898 		else
  2899 		    cnt_punct++;
  2900 		break;
  2901 	    }
  2902 	    if (g_utf8_strchr("—-.:!([{?}])",-1,g_utf8_get_char(s)))
  2903 		break;
  2904 	}
  2905     }
  2906 }
  2907 
  2908 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
  2909 {
  2910     const char *word=key;
  2911     int *dupcnt=value;
  2912     if (*dupcnt)
  2913 	g_print("\nNote: Queried word %s was duplicated %d times\n",
  2914 	  word,*dupcnt);
  2915     return FALSE;
  2916 }
  2917 
  2918 void print_as_windows_1252(const char *string)
  2919 {
  2920     gsize inbytes,outbytes;
  2921     gchar *buf,*bp;
  2922     static GIConv converter=(GIConv)-1;
  2923     if (!string)
  2924     {
  2925 	if (converter!=(GIConv)-1)
  2926 	    g_iconv_close(converter);
  2927 	converter=(GIConv)-1;
  2928 	return;
  2929     }
  2930     if (converter==(GIConv)-1)
  2931 	converter=g_iconv_open("WINDOWS-1252","UTF-8");
  2932     if (converter!=(GIConv)-1)
  2933     {
  2934 	inbytes=outbytes=strlen(string);
  2935 	bp=buf=g_malloc(outbytes+1);
  2936 	g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
  2937 	*bp='\0';
  2938 	fputs(buf,stdout);
  2939 	g_free(buf);
  2940     }
  2941     else
  2942 	fputs(string,stdout);
  2943 }
  2944 
  2945 void print_as_utf_8(const char *string)
  2946 {
  2947     fputs(string,stdout);
  2948 }
  2949 
  2950 /*
  2951  * procfile:
  2952  *
  2953  * Process one file.
  2954  */
  2955 void procfile(const char *filename)
  2956 {
  2957     const char *s;
  2958     gchar *parastart=NULL;	/* first line of current para */
  2959     gchar *etext,*aline;
  2960     gchar *etext_ptr;
  2961     GError *err=NULL;
  2962     struct first_pass_results *first_pass_results;
  2963     struct warnings *warnings;
  2964     struct counters counters={0};
  2965     struct line_properties last={0};
  2966     struct parities parities={0};
  2967     struct pending pending={0};
  2968     gboolean isemptyline;
  2969     long start_para_line=0;
  2970     gboolean isnewpara=FALSE,enddash=FALSE;
  2971     last.start=CHAR_SPACE;
  2972     linecnt=checked_linecnt=0;
  2973     etext=read_etext(filename,&err);
  2974     if (!etext)
  2975     {
  2976 	if (pswit[STDOUT_SWITCH])
  2977 	    fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
  2978 	else
  2979 	    fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
  2980 	exit(1);
  2981     }
  2982     g_print("\n\nFile: %s\n\n",filename);
  2983     first_pass_results=first_pass(etext);
  2984     warnings=report_first_pass(first_pass_results);
  2985     qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
  2986     qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
  2987     /*
  2988      * Here we go with the main pass. Hold onto yer hat!
  2989      */
  2990     linecnt=0;
  2991     etext_ptr=etext;
  2992     while ((aline=flgets(&etext_ptr,linecnt+1)))
  2993     {
  2994 	linecnt++;
  2995 	if (linecnt==1)
  2996 	    isnewpara=TRUE;
  2997 	if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
  2998 	    continue;    // skip DP page separators completely
  2999 	if (linecnt<first_pass_results->firstline ||
  3000 	  (first_pass_results->footerline>0 &&
  3001 	  linecnt>first_pass_results->footerline))
  3002 	{
  3003 	    if (pswit[HEADER_SWITCH])
  3004 	    {
  3005 		if (g_str_has_prefix(aline,"Title:"))
  3006 		    g_print("    %s\n",aline);
  3007 		if (g_str_has_prefix(aline,"Author:"))
  3008 		    g_print("    %s\n",aline);
  3009 		if (g_str_has_prefix(aline,"Release Date:"))
  3010 		    g_print("    %s\n",aline);
  3011 		if (g_str_has_prefix(aline,"Edition:"))
  3012 		    g_print("    %s\n\n",aline);
  3013 	    }
  3014 	    continue;		/* skip through the header */
  3015 	}
  3016 	checked_linecnt++;
  3017 	print_pending(aline,parastart,&pending);
  3018 	isemptyline=analyse_quotes(aline,&counters);
  3019 	if (isnewpara && !isemptyline)
  3020 	{
  3021 	    /* This line is the start of a new paragraph. */
  3022 	    start_para_line=linecnt;
  3023 	    /* Capture its first line in case we want to report it later. */
  3024 	    g_free(parastart);
  3025 	    parastart=g_strdup(aline);
  3026 	    memset(&parities,0,sizeof(parities));  /* restart the quote count */
  3027 	    s=aline;
  3028 	    while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
  3029 	      !g_unichar_isdigit(g_utf8_get_char(s)))
  3030 		s=g_utf8_next_char(s);
  3031 	    if (g_unichar_islower(g_utf8_get_char(s)))
  3032 	    {
  3033 		/* and its first letter is lowercase */
  3034 		if (pswit[ECHO_SWITCH])
  3035 		    g_print("\n%s\n",aline);
  3036 		if (!pswit[OVERVIEW_SWITCH])
  3037 		    g_print("    Line %ld column %ld - "
  3038 		      "Paragraph starts with lower-case\n",
  3039 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  3040 		else
  3041 		    cnt_punct++;
  3042 	    }
  3043 	    isnewpara=FALSE; /* Signal the end of new para processing. */
  3044 	}
  3045 	/* Check for an em-dash broken at line end. */
  3046 	if (enddash && g_utf8_get_char(aline)=='-')
  3047 	{
  3048 	    if (pswit[ECHO_SWITCH])
  3049 		g_print("\n%s\n",aline);
  3050 	    if (!pswit[OVERVIEW_SWITCH])
  3051 		g_print("    Line %ld column 1 - Broken em-dash?\n",linecnt);
  3052 	    else
  3053 		cnt_punct++;
  3054 	}
  3055 	enddash=FALSE;
  3056 	for (s=g_utf8_prev_char(aline+strlen(aline));
  3057 	  g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
  3058 	    ;
  3059 	if (s>=aline && g_utf8_get_char(s)=='-')
  3060 	    enddash=TRUE;
  3061 	check_for_control_characters(aline);
  3062 	check_for_odd_characters(aline,warnings,isemptyline);
  3063 	if (warnings->longline)
  3064 	    check_for_long_line(aline);
  3065 	if (warnings->shortline)
  3066 	    check_for_short_line(aline,&last);
  3067 	last.blen=last.len;
  3068 	last.len=g_utf8_strlen(aline,-1);
  3069 	last.start=g_utf8_get_char(aline);
  3070 	check_for_starting_punctuation(aline);
  3071 	if (warnings->dash)
  3072 	{
  3073 	    check_for_spaced_emdash(aline);
  3074 	    check_for_spaced_dash(aline);
  3075 	}
  3076 	check_for_unmarked_paragraphs(aline);
  3077 	check_for_jeebies(aline);
  3078 	check_for_mta_from(aline);
  3079 	check_for_orphan_character(aline);
  3080 	check_for_pling_scanno(aline);
  3081 	check_for_extra_period(aline,warnings);
  3082 	check_for_following_punctuation(aline);
  3083 	check_for_typos(aline,warnings);
  3084 	check_for_misspaced_punctuation(aline,&parities,isemptyline);
  3085 	check_for_double_punctuation(aline,warnings);
  3086 	check_for_spaced_quotes(aline);
  3087 	check_for_miscased_genative(aline);
  3088 	check_end_of_line(aline,warnings);
  3089 	check_for_unspaced_bracket(aline);
  3090 	if (warnings->endquote)
  3091 	    check_for_unpunctuated_endquote(aline);
  3092 	check_for_html_tag(aline);
  3093 	check_for_html_entity(aline);
  3094 	if (isemptyline)
  3095 	{
  3096 	    check_for_mismatched_quotes(&counters,&pending);
  3097 	    counters_reset(&counters);
  3098 	    /* let the next iteration know that it's starting a new para */
  3099 	    isnewpara=TRUE;
  3100 	    if (prevline)
  3101 		check_for_omitted_punctuation(prevline,&last,start_para_line);
  3102 	}
  3103 	g_free(prevline);
  3104 	prevline=g_strdup(aline);
  3105     }
  3106     linecnt++;
  3107     check_for_mismatched_quotes(&counters,&pending);
  3108     print_pending(NULL,parastart,&pending);
  3109     reset_pending(&pending);
  3110     if (prevline)
  3111     {
  3112 	g_free(prevline);
  3113 	prevline=NULL;
  3114     }
  3115     g_free(parastart);
  3116     g_free(prevline);
  3117     g_free(etext);
  3118     if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])
  3119 	g_tree_foreach(qword,report_duplicate_queries,NULL);
  3120     g_tree_unref(qword);
  3121     g_tree_unref(qperiod);
  3122     counters_destroy(&counters);
  3123     g_set_print_handler(NULL);
  3124     print_as_windows_1252(NULL);
  3125     if (pswit[MARKUP_SWITCH])  
  3126 	loseentities(NULL);
  3127 }
  3128 
  3129 /*
  3130  * flgets:
  3131  *
  3132  * Get one line from the input text, checking for
  3133  * the existence of exactly one CR/LF line-end per line.
  3134  *
  3135  * Returns: a pointer to the line.
  3136  */
  3137 char *flgets(char **etext,long lcnt)
  3138 {
  3139     gunichar c;
  3140     gboolean isCR=FALSE;
  3141     char *theline=*etext;
  3142     char *eos=theline;
  3143     gchar *s;
  3144     for (;;)
  3145     {
  3146 	c=g_utf8_get_char(*etext);
  3147 	if (!c)
  3148 	{
  3149 	    if (*etext==theline)
  3150 		return NULL;
  3151 	    else if (pswit[LINE_END_SWITCH])
  3152 	    {
  3153 		if (pswit[ECHO_SWITCH])
  3154 		{
  3155 		    s=g_strndup(theline,eos-theline);
  3156 		    g_print("\n%s\n",s);
  3157 		    g_free(s);
  3158 		}
  3159 		if (!pswit[OVERVIEW_SWITCH])
  3160 		    /* There may, or may not, have been a CR */
  3161 		    g_print("    Line %ld - No LF?\n",lcnt);
  3162 		else
  3163 		    cnt_lineend++;
  3164 	    }
  3165 	    break;
  3166 	}
  3167 	*etext=g_utf8_next_char(*etext);
  3168 	/* either way, it's end of line */
  3169 	if (c=='\n')
  3170 	{
  3171 	    if (isCR)
  3172 		break;
  3173 	    else
  3174 	    {
  3175 		/* Error - a LF without a preceding CR */
  3176 		if (pswit[LINE_END_SWITCH])
  3177 		{
  3178 		    if (pswit[ECHO_SWITCH])
  3179 		    {
  3180 			s=g_strndup(theline,eos-theline);
  3181 			g_print("\n%s\n",s);
  3182 			g_free(s);
  3183 		    }
  3184 		    if (!pswit[OVERVIEW_SWITCH])
  3185 			g_print("    Line %ld - No CR?\n",lcnt);
  3186 		    else
  3187 			cnt_lineend++;
  3188 		}
  3189 		break;
  3190 	    }
  3191 	}
  3192 	if (c=='\r')
  3193 	{
  3194 	    if (isCR)
  3195 	    {
  3196 		/* Error - two successive CRs */
  3197 		if (pswit[LINE_END_SWITCH])
  3198 		{
  3199 		    if (pswit[ECHO_SWITCH])
  3200 		    {
  3201 			s=g_strndup(theline,eos-theline);
  3202 			g_print("\n%s\n",s);
  3203 			g_free(s);
  3204 		    }
  3205 		    if (!pswit[OVERVIEW_SWITCH])
  3206 			g_print("    Line %ld - Two successive CRs?\n",lcnt);
  3207 		    else
  3208 			cnt_lineend++;
  3209 		}
  3210 	    }
  3211 	    isCR=TRUE;
  3212 	}
  3213 	else
  3214 	{
  3215 	    if (pswit[LINE_END_SWITCH] && isCR)
  3216 	    {
  3217 		if (pswit[ECHO_SWITCH])
  3218 		{
  3219 		    s=g_strndup(theline,eos-theline);
  3220 		    g_print("\n%s\n",s);
  3221 		    g_free(s);
  3222 		}
  3223 		if (!pswit[OVERVIEW_SWITCH])
  3224 		    g_print("    Line %ld column %ld - CR without LF?\n",
  3225 		      lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
  3226 		else
  3227 		    cnt_lineend++;
  3228 		*eos=' ';
  3229 	    }
  3230 	    isCR=FALSE;
  3231 	    eos=g_utf8_next_char(eos);
  3232 	}
  3233     }
  3234     *eos='\0';
  3235     if (pswit[MARKUP_SWITCH])  
  3236 	postprocess_for_HTML(theline);
  3237     if (pswit[DP_SWITCH])  
  3238 	postprocess_for_DP(theline);
  3239     return theline;
  3240 }
  3241 
  3242 /*
  3243  * mixdigit:
  3244  *
  3245  * Takes a "word" as a parameter, and checks whether it
  3246  * contains a mixture of alpha and digits. Generally, this is an
  3247  * error, but may not be for cases like 4th or L5 12s. 3d.
  3248  *
  3249  * Returns: TRUE iff an is error found.
  3250  */
  3251 gboolean mixdigit(const char *checkword)
  3252 {
  3253     gboolean wehaveadigit,wehavealetter,query;
  3254     const char *s,*nondigit;
  3255     wehaveadigit=wehavealetter=query=FALSE;
  3256     for (s=checkword;*s;s=g_utf8_next_char(s))
  3257 	if (g_unichar_isalpha(g_utf8_get_char(s)))
  3258 	    wehavealetter=TRUE;
  3259 	else if (g_unichar_isdigit(g_utf8_get_char(s)))
  3260 	    wehaveadigit=TRUE;
  3261     if (wehaveadigit && wehavealetter)
  3262     {
  3263 	/* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
  3264 	query=TRUE;
  3265 	for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
  3266 	  nondigit=g_utf8_next_char(nondigit))
  3267 	    ;
  3268 	/* digits, ending in st, rd, nd, th of either case */
  3269 	if (!g_ascii_strcasecmp(nondigit,"st") ||
  3270 	  !g_ascii_strcasecmp(nondigit,"rd") ||
  3271 	  !g_ascii_strcasecmp(nondigit,"nd") ||
  3272 	  !g_ascii_strcasecmp(nondigit,"th"))
  3273 	    query=FALSE;
  3274 	if (!g_ascii_strcasecmp(nondigit,"sts") ||
  3275 	  !g_ascii_strcasecmp(nondigit,"rds") ||
  3276 	  !g_ascii_strcasecmp(nondigit,"nds") ||
  3277 	  !g_ascii_strcasecmp(nondigit,"ths"))
  3278 	    query=FALSE;
  3279 	if (!g_ascii_strcasecmp(nondigit,"stly") ||
  3280 	  !g_ascii_strcasecmp(nondigit,"rdly") ||
  3281 	  !g_ascii_strcasecmp(nondigit,"ndly") ||
  3282 	  !g_ascii_strcasecmp(nondigit,"thly"))
  3283 	    query=FALSE;
  3284 	/* digits, ending in l, L, s or d */
  3285 	if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
  3286 	  !strcmp(nondigit,"d"))
  3287 	    query=FALSE;
  3288 	/*
  3289 	 * L at the start of a number, representing Britsh pounds, like L500.
  3290 	 * This is cute. We know the current word is mixed digit. If the first
  3291 	 * letter is L, there must be at least one digit following. If both
  3292 	 * digits and letters follow, we have a genuine error, else we have a
  3293 	 * capital L followed by digits, and we accept that as a non-error.
  3294 	 */
  3295 	if (g_utf8_get_char(checkword)=='L' &&
  3296 	  !mixdigit(g_utf8_next_char(checkword)))
  3297 	    query=FALSE;
  3298     }
  3299     return query;
  3300 }
  3301 
  3302 /*
  3303  * getaword:
  3304  *
  3305  * Extracts the first/next "word" from the line, and returns it.
  3306  * A word is defined as one English word unit--or at least that's the aim.
  3307  * "ptr" is advanced to the position in the line where we will start
  3308  * looking for the next word.
  3309  *
  3310  * Returns: A newly-allocated string.
  3311  */
  3312 gchar *getaword(const char **ptr)
  3313 {
  3314     const char *s,*t;
  3315     GString *word;
  3316     gunichar c,pc;
  3317     word=g_string_new(NULL);
  3318     for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
  3319       !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
  3320       **ptr;*ptr=g_utf8_next_char(*ptr))
  3321     {
  3322 	/* Handle exceptions for footnote markers like [1] */
  3323 	if (g_utf8_get_char(*ptr)=='[')
  3324 	{
  3325 	    g_string_append_c(word,'[');
  3326 	    s=g_utf8_next_char(*ptr);
  3327 	    for (;g_unichar_isdigit(g_utf8_get_char(s));s=g_utf8_next_char(s))
  3328 		g_string_append_unichar(word,g_utf8_get_char(s));
  3329 	    if (g_utf8_get_char(s)==']')
  3330 	    {
  3331 		g_string_append_c(word,']');
  3332 		*ptr=g_utf8_next_char(s);
  3333 		return g_string_free(word,FALSE);
  3334 	    }
  3335 	    else
  3336 		g_string_truncate(word,0);
  3337 	}
  3338     }
  3339     /*
  3340      * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
  3341      * Especially yucky is the case of L1,000
  3342      * This section looks for a pattern of characters including a digit
  3343      * followed by a comma or period followed by one or more digits.
  3344      * If found, it returns this whole pattern as a word; otherwise we discard
  3345      * the results and resume our normal programming.
  3346      */
  3347     s=*ptr;
  3348     for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
  3349       g_unichar_isalpha(g_utf8_get_char(s)) ||
  3350       g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
  3351 	g_string_append_unichar(word,g_utf8_get_char(s));
  3352     if (word->len)
  3353     {
  3354 	for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))
  3355 	{
  3356 	    c=g_utf8_get_char(t);
  3357 	    pc=g_utf8_get_char(g_utf8_prev_char(t));
  3358 	    if ((c=='.' || c==',') && g_unichar_isdigit(pc))
  3359 	    {
  3360 		*ptr=s;
  3361 		return g_string_free(word,FALSE);
  3362 	    }
  3363 	}
  3364     }
  3365     /* we didn't find a punctuated number - do the regular getword thing */
  3366     g_string_truncate(word,0);
  3367     c=g_utf8_get_char(*ptr);
  3368     for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);
  3369       *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))
  3370 	g_string_append_unichar(word,c);
  3371     return g_string_free(word,FALSE);
  3372 }
  3373 
  3374 /*
  3375  * isroman:
  3376  *
  3377  * Is this word a Roman Numeral?
  3378  *
  3379  * It doesn't actually validate that the number is a valid Roman Numeral--for
  3380  * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
  3381  * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
  3382  * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
  3383  * expressions thereof, except when it came to taxes. Allow any number of M,
  3384  * an optional D, an optional CM or CD, any number of optional Cs, an optional
  3385  * XL or an optional XC, an optional IX or IV, an optional V and any number
  3386  * of optional Is.
  3387  */
  3388 gboolean isroman(const char *t)
  3389 {
  3390     const char *s;
  3391     if (!t || !*t)
  3392 	return FALSE;
  3393     s=t;
  3394     while (g_utf8_get_char(t)=='m' && *t)
  3395 	t++;
  3396     if (g_utf8_get_char(t)=='d')
  3397 	t++;
  3398     if (g_str_has_prefix(t,"cm"))
  3399 	t+=2;
  3400     if (g_str_has_prefix(t,"cd"))
  3401 	t+=2;
  3402     while (g_utf8_get_char(t)=='c' && *t)
  3403 	t++;
  3404     if (g_str_has_prefix(t,"xl"))
  3405 	t+=2;
  3406     if (g_str_has_prefix(t,"xc"))
  3407 	t+=2;
  3408     if (g_utf8_get_char(t)=='l')
  3409 	t++;
  3410     while (g_utf8_get_char(t)=='x' && *t)
  3411 	t++;
  3412     if (g_str_has_prefix(t,"ix"))
  3413 	t+=2;
  3414     if (g_str_has_prefix(t,"iv"))
  3415 	t+=2;
  3416     if (g_utf8_get_char(t)=='v')
  3417 	t++;
  3418     while (g_utf8_get_char(t)=='i' && *t)
  3419 	t++;
  3420     return !*t;
  3421 }
  3422 
  3423 /*
  3424  * postprocess_for_DP:
  3425  *
  3426  * Invoked with the -d switch from flgets().
  3427  * It simply "removes" from the line a hard-coded set of common
  3428  * DP-specific tags, so that the line passed to the main routine has
  3429  * been pre-cleaned of DP markup.
  3430  */
  3431 void postprocess_for_DP(char *theline)
  3432 {
  3433     char *s,*t;
  3434     int i;
  3435     if (!*theline) 
  3436 	return;
  3437     for (i=0;*DPmarkup[i];i++)
  3438 	while ((s=strstr(theline,DPmarkup[i])))
  3439 	{
  3440 	    t=s+strlen(DPmarkup[i]);
  3441 	    memmove(s,t,strlen(t)+1);
  3442 	}
  3443 }
  3444 
  3445 /*
  3446  * postprocess_for_HTML:
  3447  *
  3448  * Invoked with the -m switch from flgets().
  3449  * It simply "removes" from the line a hard-coded set of common
  3450  * HTML tags and "replaces" a hard-coded set of common HTML
  3451  * entities, so that the line passed to the main routine has
  3452  * been pre-cleaned of HTML.
  3453  */
  3454 void postprocess_for_HTML(char *theline)
  3455 {
  3456     while (losemarkup(theline))
  3457 	;
  3458     loseentities(theline);
  3459 }
  3460 
  3461 char *losemarkup(char *theline)
  3462 {
  3463     char *s,*t;
  3464     int i;
  3465     s=strchr(theline,'<');
  3466     t=s?strchr(s,'>'):NULL;
  3467     if (!s || !t)
  3468 	return NULL;
  3469     for (i=0;*markup[i];i++)
  3470 	if (tagcomp(g_utf8_next_char(s),markup[i]))
  3471 	{
  3472 	    t=g_utf8_next_char(t);
  3473 	    memmove(s,t,strlen(t)+1);
  3474 	    return s;
  3475 	}
  3476     /* It's an unrecognized <xxx>. */
  3477     return NULL;
  3478 }
  3479 
  3480 void loseentities(char *theline)
  3481 {
  3482     int i;
  3483     gsize nb;
  3484     char *amp,*scolon;
  3485     gchar *s,*t;
  3486     gunichar c;
  3487     GTree *entities=NULL;
  3488     static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
  3489     if (!theline)
  3490     {
  3491 	if (entities)
  3492 	    g_tree_destroy(entities);
  3493 	entities=NULL;
  3494 	if (translit!=(GIConv)-1)
  3495 	    g_iconv_close(translit);
  3496 	translit=(GIConv)-1;
  3497 	if (to_utf8!=(GIConv)-1)
  3498 	    g_iconv_close(to_utf8);
  3499 	to_utf8=(GIConv)-1;
  3500 	return;
  3501     }
  3502     if (!*theline)
  3503 	return;
  3504     if (!entities)
  3505     {
  3506 	entities=g_tree_new((GCompareFunc)strcmp);
  3507 	for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
  3508 	    g_tree_insert(entities,HTMLentities[i].name,
  3509 	      GUINT_TO_POINTER(HTMLentities[i].c));
  3510     }
  3511     if (translit==(GIConv)-1)
  3512 	translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
  3513     if (to_utf8==(GIConv)-1)
  3514 	to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
  3515     while((amp=strchr(theline,'&')))
  3516     {
  3517 	scolon=strchr(amp,';');
  3518 	if (scolon)
  3519 	{
  3520 	    if (amp[1]=='#')
  3521 	    {
  3522 		if (amp+2+strspn(amp+2,"0123456789")==scolon)
  3523 		    c=strtol(amp+2,NULL,10);
  3524 		else if (amp[2]=='x' &&
  3525 		  amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
  3526 		    c=strtol(amp+3,NULL,16);
  3527 	    }
  3528 	    else
  3529 	    {
  3530 		s=g_strndup(amp+1,scolon-(amp+1));
  3531 	        c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
  3532 		g_free(s);
  3533 	    }
  3534 	}
  3535 	else
  3536 	    c=0;
  3537 	if (c)
  3538 	{
  3539 	    theline=amp;
  3540 	    if (c<128 || c>=192 && c<=255)	/* An ISO-8859-1 character */
  3541 		theline+=g_unichar_to_utf8(c,theline);
  3542 	    else
  3543 	    {
  3544 		s=g_malloc(6);
  3545 		nb=g_unichar_to_utf8(c,s);
  3546 		t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
  3547 		g_free(s);
  3548 		s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
  3549 		g_free(t);
  3550 		memcpy(theline,s,nb);
  3551 		g_free(s);
  3552 		theline+=nb;
  3553 	    }
  3554 	    memmove(theline,g_utf8_next_char(scolon),
  3555 	      strlen(g_utf8_next_char(scolon))+1);
  3556 	}
  3557 	else
  3558 	    theline=g_utf8_next_char(amp);
  3559     }
  3560 }
  3561 
  3562 gboolean tagcomp(const char *strin,const char *basetag)
  3563 {
  3564     gboolean retval;
  3565     gchar *s,*t;
  3566     if (g_utf8_get_char(strin)=='/')
  3567 	t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
  3568     else
  3569 	t=g_utf8_casefold(strin,-1);
  3570     s=g_utf8_casefold(basetag,-1);
  3571     retval=g_str_has_prefix(t,s);
  3572     g_free(s);
  3573     g_free(t);
  3574     return retval;
  3575 }
  3576 
  3577 void proghelp(GOptionContext *context)
  3578 {
  3579     gchar *help;
  3580     fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
  3581     fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
  3582     fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
  3583     fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
  3584       "For details, read the file COPYING.\n",stderr);
  3585     fputs("This is Free Software; "
  3586       "you may redistribute it under certain conditions (GPL);\n",stderr);
  3587     fputs("read the file COPYING for details.\n\n",stderr);
  3588     help=g_option_context_get_help(context,TRUE,NULL);
  3589     fputs(help,stderr);
  3590     g_free(help);
  3591     fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
  3592     fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
  3593       "non-ASCII\n",stderr);
  3594     fputs("characters like accented letters, "
  3595       "lines longer than 75 or shorter than 55,\n",stderr);
  3596     fputs("unbalanced quotes or brackets, "
  3597       "a variety of badly formatted punctuation, \n",stderr);
  3598     fputs("HTML tags, some likely typos. "
  3599       "It is NOT a substitute for human judgement.\n",stderr);
  3600     fputs("\n",stderr);
  3601 }