bookloupe/bookloupe.c
author ali <ali@juiblex.co.uk>
Mon Sep 30 08:31:45 2013 +0100 (2013-09-30)
changeset 139 c130152c4a57
parent 132 237b058061f2
parent 138 5e27fa988c5c
permissions -rw-r--r--
Merge bug #14: Add a configuration file
     1 /*************************************************************************/
     2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
     3 /*									 */
     4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>			 */
     5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>			 */
     6 /*									 */
     7 /* This program is free software; you can redistribute it and/or modify  */
     8 /* it under the terms of the GNU General Public License as published by  */
     9 /* the Free Software Foundation; either version 2 of the License, or     */
    10 /* (at your option) any later version.					 */
    11 /*									 */
    12 /* This program is distributed in the hope that it will be useful,       */
    13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of	 */
    14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the		 */
    15 /* GNU General Public License for more details.				 */
    16 /*									 */
    17 /* You should have received a copy of the GNU General Public License	 */
    18 /* along with this program. If not, see <http://www.gnu.org/licenses/>.	 */
    19 /*************************************************************************/
    20 
    21 #include <stdio.h>
    22 #include <stdlib.h>
    23 #include <string.h>
    24 #include <ctype.h>
    25 #ifdef __WIN32__
    26 #include <windows.h>
    27 #endif
    28 #include <glib.h>
    29 #include <bl/bl.h>
    30 #include "bookloupe.h"
    31 #include "counters.h"
    32 #include "pending.h"
    33 #include "HTMLentities.h"
    34 
    35 gchar *charset;		/* Or NULL for auto (ISO_8859-1/ASCII or UNICODE) */
    36 GIConv charset_validator=(GIConv)-1;
    37 
    38 gchar *prevline;
    39 
    40 /* Common typos. */
    41 char *typo[] = {
    42     "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
    43     "nad", "te", "ig", "acn",  "ahve", "alot", "anbd", "andt", "awya", "aywa",
    44     "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
    45     "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
    46     "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
    47     "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
    48     "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
    49     "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
    50     "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
    51     "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
    52     "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
    53     "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
    54     "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
    55     "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
    56     "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
    57     "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
    58     "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
    59     "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
    60     "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
    61     "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
    62     "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
    63     "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
    64     "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
    65     "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
    66     "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
    67     "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
    68     "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
    69     "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
    70     "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
    71     "se", ""
    72 };
    73 
    74 GTree *usertypo;
    75 
    76 /* Common abbreviations and other OK words not to query as typos. */
    77 char *okword[] = {
    78     "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
    79     "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
    80     "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
    81     "outbid", "outbids", "frostbite", "frostbitten", ""
    82 };
    83 
    84 /* Common abbreviations that cause otherwise unexplained periods. */
    85 char *abbrev[] = {
    86     "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
    87     "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
    88 };
    89 
    90 /*
    91  * Two-Letter combinations that rarely if ever start words,
    92  * but are common scannos or otherwise common letter combinations.
    93  */
    94 char *nostart[] = {
    95     "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
    96 };
    97 
    98 /*
    99  * Two-Letter combinations that rarely if ever end words,
   100  * but are common scannos or otherwise common letter combinations.
   101  */
   102 char *noend[] = {
   103     "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
   104     "sw", "gr", "sl", "cl", "iy", ""
   105 };
   106 
   107 char *markup[] = {
   108     "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
   109     "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
   110     "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
   111     "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
   112 };
   113 
   114 char *DPmarkup[] = {
   115     "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
   116 };
   117 
   118 char *nocomma[] = {
   119     "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
   120     "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
   121     "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
   122     "during", "let", "toward", "among", ""
   123 };
   124 
   125 char *noperiod[] = {
   126     "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
   127     "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
   128     "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
   129     "among", "those", "into", "whom", "having", "thence", ""
   130 }; 
   131 
   132 gboolean pswit[SWITNO];  /* program switches */
   133 gchar *opt_charset;
   134 gboolean typo_compat,paranoid_compat;
   135 
   136 static GOptionEntry options[]={
   137     { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
   138       "Ignore DP-specific markup", NULL },
   139     { "no-dp", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   140       G_OPTION_ARG_NONE, pswit+DP_SWITCH,
   141       "Don't ignore DP-specific markup", NULL },
   142     { "echo", 0, G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
   143       "Echo queried line", NULL },
   144     { "no-echo", 'e', G_OPTION_FLAG_REVERSE,
   145       G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
   146       "Don't echo queried line", NULL },
   147     { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
   148       "Check single quotes", NULL },
   149     { "no-squote", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   150       G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
   151       "Don't check single quotes", NULL },
   152     { "typo", 0, 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
   153       "Check common typos", NULL },
   154     { "no-typo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   155       G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
   156       "Don't check common typos", NULL },
   157     { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
   158       "Require closure of quotes on every paragraph", NULL },
   159     { "no-qpara", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   160       G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
   161       "Don't require closure of quotes on every paragraph", NULL },
   162     { "paranoid", 0, G_OPTION_FLAG_HIDDEN,
   163       G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
   164       "Enable paranoid querying of everything", NULL },
   165     { "no-paranoid", 0, G_OPTION_FLAG_REVERSE,
   166       G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
   167       "Disable paranoid querying of everything", NULL },
   168     { "line-end", 0, G_OPTION_FLAG_HIDDEN,
   169       G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
   170       "Enable line end checking", NULL },
   171     { "no-line-end", 'l', G_OPTION_FLAG_REVERSE,
   172       G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
   173       "Diable line end checking", NULL },
   174     { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
   175       "Overview: just show counts", NULL },
   176     { "no-overview", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   177       G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
   178       "Show individual warnings", NULL },
   179     { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
   180       "Output errors to stdout instead of stderr", NULL },
   181     { "no-stdout", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   182       G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
   183       "Output errors to stderr instead of stdout", NULL },
   184     { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
   185       "Echo header fields", NULL },
   186     { "no-header", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   187       G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
   188       "Don't echo header fields", NULL },
   189     { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
   190       "Ignore markup in < >", NULL },
   191     { "no-markup", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   192       G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
   193       "No special handling for markup in < >", NULL },
   194     { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
   195       "Use file of user-defined typos", NULL },
   196     { "no-usertypo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   197       G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
   198       "Ignore file of user-defined typos", NULL },
   199     { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
   200       "Verbose - list everything", NULL },
   201     { "no-verbose", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   202       G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
   203       "Switch off verbose mode", NULL },
   204     { "charset", 0, 0, G_OPTION_ARG_STRING, &opt_charset,
   205       "Set of characters valid for this ebook", "NAME" },
   206     { NULL }
   207 };
   208 
   209 /*
   210  * Options relating to configuration which make no sense from inside
   211  * a configuration file.
   212  */
   213 
   214 static GOptionEntry config_options[]={
   215     { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
   216       "Defaults for use on www upload", NULL },
   217     { "charset", 0, 0, G_OPTION_ARG_STRING, &opt_charset,
   218       "Set of characters valid for this ebook", "NAME" },
   219     { "dump-config", 0, 0, G_OPTION_ARG_NONE, pswit+DUMP_CONFIG_SWITCH,
   220       "Dump current config settings", NULL },
   221     { NULL }
   222 };
   223 
   224 static GOptionEntry compatibility_options[]={
   225     { "toggle-typo", 't', 0, G_OPTION_ARG_NONE, &typo_compat,
   226       "Toggle checking for common typos", NULL },
   227     { "toggle-relaxed", 'x', 0, G_OPTION_ARG_NONE, &paranoid_compat,
   228       "Toggle both paranoid mode and common typos", NULL },
   229     { NULL }
   230 };
   231 
   232 long cnt_quote;		/* for overview mode, count of quote queries */
   233 long cnt_brack;		/* for overview mode, count of brackets queries */
   234 long cnt_bin;		/* for overview mode, count of non-ASCII queries */
   235 long cnt_odd;		/* for overview mode, count of odd character queries */
   236 long cnt_long;		/* for overview mode, count of long line errors */
   237 long cnt_short;		/* for overview mode, count of short line queries */
   238 long cnt_punct;		/* for overview mode,
   239 			   count of punctuation and spacing queries */
   240 long cnt_dash;		/* for overview mode, count of dash-related queries */
   241 long cnt_word;		/* for overview mode, count of word queries */
   242 long cnt_html;		/* for overview mode, count of html queries */
   243 long cnt_lineend;	/* for overview mode, count of line-end queries */
   244 long cnt_spacend;	/* count of lines with space at end */
   245 long linecnt;		/* count of total lines in the file */
   246 long checked_linecnt;	/* count of lines actually checked */
   247 
   248 void proghelp(GOptionContext *context);
   249 void procfile(const char *);
   250 
   251 gchar *running_from;
   252 
   253 gboolean mixdigit(const char *);
   254 gchar *getaword(const char **);
   255 char *flgets(char **,long);
   256 void postprocess_for_HTML(char *);
   257 char *linehasmarkup(char *);
   258 char *losemarkup(char *);
   259 gboolean tagcomp(const char *,const char *);
   260 void loseentities(char *);
   261 gboolean isroman(const char *);
   262 void postprocess_for_DP(char *);
   263 void print_as_windows_1252(const char *string);
   264 void print_as_utf_8(const char *string);
   265 
   266 GTree *qword,*qperiod;
   267 
   268 #ifdef __WIN32__
   269 UINT saved_cp;
   270 #endif
   271 
   272 gboolean set_charset(const char *name,GError **err)
   273 {
   274     /* The various UNICODE encodings all share the same character set. */
   275     const char *unicode_aliases[]={ "UCS-2", "UCS-2BE", "UCS-2LE", "UCS-4",
   276       "UCS-4BE", "UCS-4LE", "UCS2", "UCS4", "UNICODE", "UNICODEBIG",
   277       "UNICODELITTLE", "UTF-7", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE",
   278       "UTF-32", "UTF-32BE", "UTF-32LE", "UTF7", "UTF8", "UTF16", "UTF16BE",
   279       "UTF16LE", "UTF32", "UTF32BE", "UTF32LE" };
   280     int i;
   281     if (charset)
   282 	g_free(charset);
   283     if (charset_validator!=(GIConv)-1)
   284 	g_iconv_close(charset_validator);
   285     if (!name || !g_strcasecmp(name,"auto"))
   286     {
   287 	charset=NULL;
   288 	charset_validator=(GIConv)-1;
   289 	return TRUE;
   290     }
   291     else
   292 	charset=g_strdup(name);
   293     for(i=0;i<G_N_ELEMENTS(unicode_aliases);i++)
   294 	if (!g_strcasecmp(charset,unicode_aliases[i]))
   295 	{
   296 	    g_free(charset);
   297 	    charset=g_strdup("UTF-8");
   298 	    break;
   299 	}
   300     if (!strcmp(charset,"UTF-8"))
   301 	charset_validator=(GIConv)-1;
   302     else
   303     {
   304 	charset_validator=g_iconv_open(charset,"UTF-8");
   305 	if (charset_validator==(GIConv)-1)
   306 	{
   307 	    g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_NO_CONVERSION,
   308 	      "Unknown character set \"%s\"",charset);
   309 	    return FALSE;
   310 	}
   311     }
   312     return TRUE;
   313 }
   314 
   315 GKeyFile *config;
   316 
   317 void config_file_update(GKeyFile *kf)
   318 {
   319     int i;
   320     const char *s;
   321     gboolean sw;
   322     for(i=0;options[i].long_name;i++)
   323     {
   324 	if (g_str_has_prefix(options[i].long_name,"no-"))
   325 	    continue;
   326 	if (options[i].arg==G_OPTION_ARG_NONE)
   327 	{
   328 	    sw=*(gboolean *)options[i].arg_data;
   329 	    if (options[i].flags&G_OPTION_FLAG_REVERSE)
   330 		sw=!sw;
   331 	    g_key_file_set_boolean(kf,"options",options[i].long_name,sw);
   332 	}
   333 	else if (options[i].arg==G_OPTION_ARG_STRING)
   334 	{
   335 	    s=*(gchar **)options[i].arg_data;
   336 	    if (!s)
   337 		s="auto";
   338 	    g_key_file_set_string(kf,"options",options[i].long_name,s);
   339 	}
   340 	else
   341 	    g_assert_not_reached();
   342     }
   343 }
   344 
   345 void config_file_add_comments(GKeyFile *kf)
   346 {
   347     int i;
   348     gchar *comment;
   349     g_key_file_set_comment(kf,NULL,NULL," Default configuration for bookloupe",
   350       NULL);
   351     for(i=0;options[i].long_name;i++)
   352     {
   353 	if (g_str_has_prefix(options[i].long_name,"no-"))
   354 	    continue;
   355 	comment=g_strconcat(" ",options[i].description,NULL);
   356 	g_key_file_set_comment(kf,"options",options[i].long_name,comment,NULL);
   357 	g_free(comment);
   358     }
   359 }
   360 
   361 void dump_config(void)
   362 {
   363     gchar *s;
   364     if (config)
   365 	config_file_update(config);
   366     else
   367     {
   368 	config=g_key_file_new();
   369 	config_file_update(config);
   370 	config_file_add_comments(config);
   371     }
   372     s=g_key_file_to_data(config,NULL,NULL);
   373     if (s)
   374 	g_print("%s",s);
   375     g_free(s);
   376 }
   377 
   378 GKeyFile *read_config_file(gchar **full_path)
   379 {
   380     int i;
   381     GError *err=NULL;
   382     gchar **search_dirs;
   383     gchar *path;
   384     const char *search_path;
   385     GKeyFile *kf;
   386     kf=g_key_file_new();
   387     search_path=g_getenv("BOOKLOUPE_CONFIG_PATH");
   388     if (search_path)
   389     {
   390 #ifdef __WIN32__
   391 	search_dirs=g_strsplit(search_path,";",0);
   392 #else
   393 	search_dirs=g_strsplit(search_path,":",0);
   394 #endif
   395     }
   396     else
   397     {
   398 	search_dirs=g_new(gchar *,4);
   399 	search_dirs[0]=g_get_current_dir();
   400 	search_dirs[1]=g_strdup(running_from);
   401 	search_dirs[2]=g_strdup(g_get_user_config_dir());
   402 	search_dirs[3]=NULL;
   403     }
   404     for(i=0;search_dirs[i];i++)
   405     {
   406 	path=g_build_filename(search_dirs[i],"bookloupe.ini",NULL);
   407 	if (g_key_file_load_from_file(kf,path,
   408 	  G_KEY_FILE_KEEP_COMMENTS|G_KEY_FILE_KEEP_TRANSLATIONS,&err))
   409 	    break;
   410 	if (!g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   411 	{
   412 	    g_printerr("Bookloupe: Error reading %s\n",path);
   413 	    g_printerr("%s\n",err->message);
   414 	    exit(1);
   415 	}
   416 	g_clear_error(&err);
   417 	g_free(path);
   418 	path=NULL;
   419     }
   420     if (!search_dirs[i])
   421     {
   422 	g_key_file_free(kf);
   423 	kf=NULL;
   424     }
   425     g_strfreev(search_dirs);
   426     if (full_path && kf)
   427 	*full_path=path;
   428     else
   429 	g_free(path);
   430     return kf;
   431 }
   432 
   433 void parse_config_file(void)
   434 {
   435     int i,j;
   436     gchar *path,*s;
   437     gchar **keys;
   438     gboolean sw;
   439     GError *err=NULL;
   440     config=read_config_file(&path);
   441     if (config)
   442 	keys=g_key_file_get_keys(config,"options",NULL,NULL);
   443     else
   444 	keys=NULL;
   445     if (keys)
   446     {
   447 	for(i=0;keys[i];i++)
   448 	{
   449 	    for(j=0;options[j].long_name;j++)
   450 	    {
   451 		if (g_str_has_prefix(options[j].long_name,"no-"))
   452 		    continue;
   453 		else if (!strcmp(keys[i],options[j].long_name))
   454 		{
   455 		    if (options[j].arg==G_OPTION_ARG_NONE)
   456 		    {
   457 			sw=g_key_file_get_boolean(config,"options",keys[i],
   458 			  &err);
   459 			if (err)
   460 			{
   461 			    g_printerr("Bookloupe: %s: options.%s: %s\n",
   462 			      path,keys[i],err->message);
   463 			    g_clear_error(&err);
   464 			}
   465 			else
   466 			{
   467 			    if (options[j].flags&G_OPTION_FLAG_REVERSE)
   468 				sw=!sw;
   469 			    *(gboolean *)options[j].arg_data=sw;
   470 			}
   471 			break;
   472 		    }
   473 		    else if (options[j].arg==G_OPTION_ARG_STRING)
   474 		    {
   475 			s=g_key_file_get_string(config,"options",keys[i],
   476 			  &err);
   477 			if (err)
   478 			{
   479 			    g_printerr("Bookloupe: %s: options.%s: %s\n",
   480 			      path,keys[i],err->message);
   481 			    g_clear_error(&err);
   482 			}
   483 			else
   484 			{
   485 			    g_free(*(gchar **)options[j].arg_data);
   486 			    if (!g_strcmp0(s,"auto"))
   487 			    {
   488 				*(gchar **)options[j].arg_data=NULL;
   489 				g_free(s);
   490 			    }
   491 			    else
   492 				*(gchar **)options[j].arg_data=s;
   493 			}
   494 			break;
   495 		    }
   496 		    else
   497 			g_assert_not_reached();
   498 		}
   499 	    }
   500 	    if (!options[j].long_name)
   501 		g_printerr("Bookloupe: %s: Unknown option \"%s\" ignored\n",
   502 		  path,keys[i]);
   503 	}
   504 	g_strfreev(keys);
   505     }
   506     if (config)
   507 	g_free(path);
   508 }
   509 
   510 void parse_options(int *argc,char ***argv)
   511 {
   512     GError *err=NULL;
   513     GOptionContext *context;
   514     GOptionGroup *compatibility;
   515     context=g_option_context_new(
   516       "file - look for errors in Project Gutenberg(TM) etexts");
   517     g_option_context_add_main_entries(context,options,NULL);
   518     g_option_context_add_main_entries(context,config_options,NULL);
   519     compatibility=g_option_group_new("compatibility",
   520       "Options for Compatibility with Gutcheck:",
   521       "Show compatibility options",NULL,NULL);
   522     g_option_group_add_entries(compatibility,compatibility_options);
   523     g_option_context_add_group(context,compatibility);
   524     g_option_context_set_description(context,
   525       "For simplicity, only the switch options which reverse the\n"
   526       "default configuration are listed. In most cases, both vanilla\n"
   527       "and \"no-\" prefixed versions are available for use.");
   528     if (!g_option_context_parse(context,argc,argv,&err))
   529     {
   530 	g_printerr("Bookloupe: %s\n",err->message);
   531 	g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
   532 	exit(1);
   533     }
   534     if (typo_compat)
   535 	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
   536     if (paranoid_compat)
   537     {
   538 	pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
   539 	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
   540     }
   541     /*
   542      * Web uploads - for the moment, this is really just a placeholder
   543      * until we decide what processing we really want to do on web uploads
   544      */
   545     if (pswit[WEB_SWITCH])
   546     {
   547 	/* specific override for web uploads */
   548 	pswit[ECHO_SWITCH]=TRUE;
   549 	pswit[SQUOTE_SWITCH]=FALSE;
   550 	pswit[TYPO_SWITCH]=TRUE;
   551 	pswit[QPARA_SWITCH]=FALSE;
   552 	pswit[PARANOID_SWITCH]=TRUE;
   553 	pswit[LINE_END_SWITCH]=FALSE;
   554 	pswit[OVERVIEW_SWITCH]=FALSE;
   555 	pswit[STDOUT_SWITCH]=FALSE;
   556 	pswit[HEADER_SWITCH]=TRUE;
   557 	pswit[VERBOSE_SWITCH]=FALSE;
   558 	pswit[MARKUP_SWITCH]=FALSE;
   559 	pswit[USERTYPO_SWITCH]=FALSE;
   560 	pswit[DP_SWITCH]=FALSE;
   561     }
   562     if (opt_charset && !set_charset(opt_charset,&err))
   563     {
   564 	g_printerr("%s\n",err->message);
   565 	exit(1);
   566     }
   567     if (pswit[DUMP_CONFIG_SWITCH])
   568     {
   569 	dump_config();
   570 	exit(0);
   571     }
   572     g_free(opt_charset);
   573     opt_charset=NULL;
   574     if (pswit[OVERVIEW_SWITCH])
   575 	/* just print summary; don't echo */
   576 	pswit[ECHO_SWITCH]=FALSE;
   577     if (*argc<2)
   578     {
   579 	proghelp(context);
   580 	exit(1);
   581     }
   582     g_option_context_free(context);
   583 }
   584 
   585 /*
   586  * read_user_scannos:
   587  *
   588  * Read in the user-defined stealth scanno list.
   589  */
   590 void read_user_scannos(void)
   591 {
   592     GError *err=NULL;
   593     gchar *usertypo_file;
   594     gboolean okay;
   595     int i;
   596     gsize len,nb;
   597     gchar *contents,*utf8,**lines;
   598     usertypo_file=g_strdup("bookloupe.typ");
   599     okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   600     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   601     {
   602 	g_clear_error(&err);
   603 	g_free(usertypo_file);
   604 	usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
   605 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   606     }
   607     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   608     {
   609 	g_clear_error(&err);
   610 	g_free(usertypo_file);
   611 	usertypo_file=g_strdup("gutcheck.typ");
   612 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   613     }
   614     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   615     {
   616 	g_clear_error(&err);
   617 	g_free(usertypo_file);
   618 	usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
   619 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   620     }
   621     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   622     {
   623 	g_free(usertypo_file);
   624 	g_print("   --> I couldn't find bookloupe.typ "
   625 	  "-- proceeding without user typos.\n");
   626 	return;
   627     }
   628     else if (!okay)
   629     {
   630 	fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
   631 	g_free(usertypo_file);
   632 	g_clear_error(&err);
   633 	exit(1);
   634     }
   635     if (g_utf8_validate(contents,len,NULL))
   636     {
   637 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
   638 	if (!charset)
   639 	    (void)set_charset("UNICODE",NULL);
   640     }
   641     else
   642 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
   643     g_free(contents);
   644     lines=g_strsplit_set(utf8,"\r\n",0);
   645     g_free(utf8);
   646     usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
   647     for (i=0;lines[i];i++)
   648 	if (*(unsigned char *)lines[i]>'!')
   649 	    g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
   650 	else
   651 	    g_free(lines[i]);
   652     g_free(lines);
   653 }
   654 
   655 /*
   656  * read_etext:
   657  *
   658  * Read an etext returning a newly allocated string containing the file
   659  * contents or NULL on error.
   660  */
   661 gchar *read_etext(const char *filename,GError **err)
   662 {
   663     GError *tmp_err=NULL;
   664     gchar *contents,*utf8;
   665     gsize len,bytes_read,bytes_written;
   666     int i,line,col;
   667     if (!g_file_get_contents(filename,&contents,&len,err))
   668 	return NULL;
   669     if (g_utf8_validate(contents,len,NULL))
   670     {
   671 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
   672 	g_set_print_handler(print_as_utf_8);
   673 #ifdef __WIN32__
   674 	SetConsoleOutputCP(CP_UTF8);
   675 #endif
   676     }
   677     else
   678     {
   679 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,
   680 	  &bytes_written,&tmp_err);
   681 	if (g_error_matches(tmp_err,G_CONVERT_ERROR,
   682 	  G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
   683 	{
   684 	    line=col=1;
   685 	    for(i=0;i<bytes_read;i++)
   686 		if (contents[i]=='\n')
   687 		{
   688 		    line++;
   689 		    col=1;
   690 		}
   691 		else if (contents[i]!='\r')
   692 		    col++;
   693 	    g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
   694 	      "Input conversion failed. Byte %d at line %d, column %d is not a "
   695 	      "valid Windows-1252 character",
   696 	      ((unsigned char *)contents)[bytes_read],line,col);
   697 	}
   698 	else if (tmp_err)
   699 	    g_propagate_error(err,tmp_err);
   700 	g_set_print_handler(print_as_windows_1252);
   701 #ifdef __WIN32__
   702 	SetConsoleOutputCP(1252);
   703 #endif
   704     }
   705     g_free(contents);
   706     return utf8;
   707 }
   708 
   709 void cleanup_on_exit(void)
   710 {
   711 #ifdef __WIN32__
   712     SetConsoleOutputCP(saved_cp);
   713 #endif
   714 }
   715 
   716 int main(int argc,char **argv)
   717 {
   718 #ifdef __WIN32__
   719     atexit(cleanup_on_exit);
   720     saved_cp=GetConsoleOutputCP();
   721 #endif
   722     running_from=g_path_get_dirname(argv[0]);
   723     /* Paranoid checking is turned OFF, not on, by its switch */
   724     pswit[PARANOID_SWITCH]=TRUE;
   725     /* if running in paranoid mode, typo checks default to enabled */
   726     pswit[TYPO_SWITCH]=TRUE;
   727     /* Line-end checking is turned OFF, not on, by its switch */
   728     pswit[LINE_END_SWITCH]=TRUE;
   729     /* Echoing is turned OFF, not on, by its switch */
   730     pswit[ECHO_SWITCH]=TRUE;
   731     parse_config_file();
   732     parse_options(&argc,&argv);
   733     if (pswit[USERTYPO_SWITCH])
   734 	read_user_scannos();
   735     fprintf(stderr,"bookloupe: Check and report on an e-text\n");
   736     procfile(argv[1]);
   737     if (pswit[OVERVIEW_SWITCH])
   738     {
   739 	g_print("    Checked %ld lines of %ld (head+foot = %ld)\n\n",
   740 	  checked_linecnt,linecnt,linecnt-checked_linecnt);
   741 	g_print("    --------------- Queries found --------------\n");
   742 	if (cnt_long)
   743 	    g_print("    Long lines:		    %14ld\n",cnt_long);
   744 	if (cnt_short)
   745 	    g_print("    Short lines:		   %14ld\n",cnt_short);
   746 	if (cnt_lineend)
   747 	    g_print("    Line-end problems:	     %14ld\n",cnt_lineend);
   748 	if (cnt_word)
   749 	    g_print("    Common typos:		  %14ld\n",cnt_word);
   750 	if (cnt_quote)
   751 	    g_print("    Unmatched quotes:	      %14ld\n",cnt_quote);
   752 	if (cnt_brack)
   753 	    g_print("    Unmatched brackets:	    %14ld\n",cnt_brack);
   754 	if (cnt_bin)
   755 	    g_print("    Non-ASCII characters:	  %14ld\n",cnt_bin);
   756 	if (cnt_odd)
   757 	    g_print("    Proofing characters:	   %14ld\n",cnt_odd);
   758 	if (cnt_punct)
   759 	    g_print("    Punctuation & spacing queries: %14ld\n",cnt_punct);
   760 	if (cnt_dash)
   761 	    g_print("    Non-standard dashes:	   %14ld\n",cnt_dash);
   762 	if (cnt_html)
   763 	    g_print("    Possible HTML tags:	    %14ld\n",cnt_html);
   764 	g_print("\n");
   765 	g_print("    TOTAL QUERIES		  %14ld\n",
   766 	  cnt_quote+cnt_brack+cnt_bin+cnt_odd+cnt_long+cnt_short+cnt_punct+
   767 	  cnt_dash+cnt_word+cnt_html+cnt_lineend);
   768     }
   769     g_free(running_from);
   770     if (usertypo)
   771 	g_tree_unref(usertypo);
   772     set_charset(NULL,NULL);
   773     if (config)
   774 	g_key_file_free(config);
   775     return 0;
   776 }
   777 
   778 /*
   779  * first_pass:
   780  *
   781  * Run a first pass - verify that it's a valid PG
   782  * file, decide whether to report some things that
   783  * occur many times in the text like long or short
   784  * lines, non-standard dashes, etc.
   785  */
   786 struct first_pass_results *first_pass(const char *etext)
   787 {
   788     gunichar laststart=CHAR_SPACE;
   789     const char *s;
   790     gchar *lc_line;
   791     int i,j,lbytes,llen;
   792     gchar **lines;
   793     unsigned int lastlen=0,lastblen=0;
   794     long spline=0,nspline=0;
   795     static struct first_pass_results results={0};
   796     gchar *inword;
   797     lines=g_strsplit(etext,"\n",0);
   798     for (j=0;lines[j];j++)
   799     {
   800 	lbytes=strlen(lines[j]);
   801 	while (lbytes>0 && lines[j][lbytes-1]=='\r')
   802 	    lines[j][--lbytes]='\0';
   803 	llen=g_utf8_strlen(lines[j],lbytes);
   804 	linecnt++;
   805 	if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
   806 	  (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
   807 	{
   808 	    if (spline)
   809 		g_print("   --> Duplicate header?\n");
   810 	    spline=linecnt+1;   /* first line of non-header text, that is */
   811 	}
   812 	if (!strncmp(lines[j],"*** START",9) &&
   813 	  strstr(lines[j],"PROJECT GUTENBERG"))
   814 	{
   815 	    if (nspline)
   816 		g_print("   --> Duplicate header?\n");
   817 	    nspline=linecnt+1;   /* first line of non-header text, that is */
   818 	}
   819 	if (spline || nspline)
   820 	{
   821 	    lc_line=g_utf8_strdown(lines[j],lbytes);
   822 	    if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
   823 	    {
   824 		if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
   825 		{
   826 		    if (results.footerline)
   827 		    {
   828 			/* it's an old-form header - we can detect duplicates */
   829 			if (!nspline)
   830 			    g_print("   --> Duplicate footer?\n");
   831 		    }
   832 		    else
   833 			results.footerline=linecnt;
   834 		}
   835 	    }
   836 	    g_free(lc_line);
   837 	}
   838 	if (spline)
   839 	    results.firstline=spline;
   840 	if (nspline)
   841 	    results.firstline=nspline;  /* override with new */
   842 	if (results.footerline)
   843 	    continue;    /* don't count the boilerplate in the footer */
   844 	results.totlen+=llen;
   845 	for (s=lines[j];*s;s=g_utf8_next_char(s))
   846 	{
   847 	    if (g_utf8_get_char(s)>127)
   848 		results.binlen++;
   849 	    if (g_unichar_isalpha(g_utf8_get_char(s)))
   850 		results.alphalen++;
   851 	    if (s>lines[j] && g_utf8_get_char(s)==CHAR_DQUOTE &&
   852 	      isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
   853 		results.endquote_count++;
   854 	}
   855 	if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
   856 	  lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
   857 	    results.shortline++;
   858 	if (lbytes>0 &&
   859 	  g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
   860 	    cnt_spacend++;
   861 	if (strstr(lines[j],".,"))
   862 	    results.dotcomma++;
   863 	/* only count ast lines for ignoring purposes where there is */
   864 	/* locase text on the line */
   865 	if (strchr(lines[j],'*'))
   866 	{
   867 	    for (s=lines[j];*s;s=g_utf8_next_char(s))
   868 		if (g_unichar_islower(g_utf8_get_char(s)))
   869 		    break;
   870 	    if (*s)
   871 		results.astline++;
   872 	}
   873 	if (strchr(lines[j],'/'))
   874 	    results.fslashline++;
   875 	if (lbytes>0)
   876 	{
   877 	    for (s=g_utf8_prev_char(lines[j]+lbytes);
   878 	      s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;
   879 	      s=g_utf8_prev_char(s))
   880 		;
   881 	    if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
   882 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')
   883 		results.hyphens++;
   884 	}
   885 	if (llen>LONGEST_PG_LINE)
   886 	    results.longline++;
   887 	if (llen>WAY_TOO_LONG)
   888 	    results.verylongline++;
   889 	if (strchr(lines[j],'<') && strchr(lines[j],'>'))
   890 	{
   891 	    i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
   892 	    if (i>0)
   893 		results.htmcount++;
   894 	    if (strstr(lines[j],"<i>"))
   895 		results.htmcount+=4; /* bonus marks! */
   896 	}
   897 	/* Check for spaced em-dashes */
   898 	if (lines[j][0] && (s=strstr(g_utf8_next_char(lines[j]),"--")))
   899 	{
   900 	    results.emdash++;
   901 	    if (s[-1]==CHAR_SPACE || s[2]==CHAR_SPACE)
   902 		results.space_emdash++;
   903 	    if (s[-1]==CHAR_SPACE && s[2]==CHAR_SPACE)
   904 		/* count of em-dashes with spaces both sides */
   905 		results.non_PG_space_emdash++;
   906 	    if (s[-1]!=CHAR_SPACE && s[2]!=CHAR_SPACE)
   907 		/* count of PG-type em-dashes with no spaces */
   908 		results.PG_space_emdash++;
   909 	}
   910 	for (s=lines[j];*s;)
   911 	{
   912 	    inword=getaword(&s);
   913 	    if (!strcmp(inword,"hij") || !strcmp(inword,"niet")) 
   914 		results.Dutchcount++;
   915 	    if (!strcmp(inword,"dans") || !strcmp(inword,"avec")) 
   916 		results.Frenchcount++;
   917 	    if (!strcmp(inword,"0") || !strcmp(inword,"1")) 
   918 		results.standalone_digit++;
   919 	    g_free(inword);
   920 	}
   921 	/* Check for spaced dashes */
   922 	if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
   923 	    results.spacedash++;
   924 	lastblen=lastlen;
   925 	lastlen=llen;
   926 	laststart=lines[j][0];
   927     }
   928     g_strfreev(lines);
   929     return &results;
   930 }
   931 
   932 /*
   933  * report_first_pass:
   934  *
   935  * Make some snap decisions based on the first pass results.
   936  */
   937 struct warnings *report_first_pass(struct first_pass_results *results)
   938 {
   939     static struct warnings warnings={0};
   940     if (cnt_spacend>0)
   941 	g_print("   --> %ld lines in this file have white space at end\n",
   942 	  cnt_spacend);
   943     warnings.dotcomma=1;
   944     if (results->dotcomma>5)
   945     {
   946 	warnings.dotcomma=0;
   947 	g_print("   --> %ld lines in this file contain '.,'. "
   948 	  "Not reporting them.\n",results->dotcomma);
   949     }
   950     /*
   951      * If more than 50 lines, or one-tenth, are short,
   952      * don't bother reporting them.
   953      */
   954     warnings.shortline=1;
   955     if (results->shortline>50 || results->shortline*10>linecnt)
   956     {
   957 	warnings.shortline=0;
   958 	g_print("   --> %ld lines in this file are short. "
   959 	  "Not reporting short lines.\n",results->shortline);
   960     }
   961     /*
   962      * If more than 50 lines, or one-tenth, are long,
   963      * don't bother reporting them.
   964      */
   965     warnings.longline=1;
   966     if (results->longline>50 || results->longline*10>linecnt)
   967     {
   968 	warnings.longline=0;
   969 	g_print("   --> %ld lines in this file are long. "
   970 	  "Not reporting long lines.\n",results->longline);
   971     }
   972     /* If more than 10 lines contain asterisks, don't bother reporting them. */
   973     warnings.ast=1;
   974     if (results->astline>10)
   975     {
   976 	warnings.ast=0;
   977 	g_print("   --> %ld lines in this file contain asterisks. "
   978 	  "Not reporting them.\n",results->astline);
   979     }
   980     /*
   981      * If more than 10 lines contain forward slashes,
   982      * don't bother reporting them.
   983      */
   984     warnings.fslash=1;
   985     if (results->fslashline>10)
   986     {
   987 	warnings.fslash=0;
   988 	g_print("   --> %ld lines in this file contain forward slashes. "
   989 	  "Not reporting them.\n",results->fslashline);
   990     }
   991     /*
   992      * If more than 20 lines contain unpunctuated endquotes,
   993      * don't bother reporting them.
   994      */
   995     warnings.endquote=1;
   996     if (results->endquote_count>20)
   997     {
   998 	warnings.endquote=0;
   999 	g_print("   --> %ld lines in this file contain unpunctuated endquotes. "
  1000 	  "Not reporting them.\n",results->endquote_count);
  1001     }
  1002     /*
  1003      * If more than 15 lines contain standalone digits,
  1004      * don't bother reporting them.
  1005      */
  1006     warnings.digit=1;
  1007     if (results->standalone_digit>10)
  1008     {
  1009 	warnings.digit=0;
  1010 	g_print("   --> %ld lines in this file contain standalone 0s and 1s. "
  1011 	  "Not reporting them.\n",results->standalone_digit);
  1012     }
  1013     /*
  1014      * If more than 20 lines contain hyphens at end,
  1015      * don't bother reporting them.
  1016      */
  1017     warnings.hyphen=1;
  1018     if (results->hyphens>20)
  1019     {
  1020 	warnings.hyphen=0;
  1021 	g_print("   --> %ld lines in this file have hyphens at end. "
  1022 	  "Not reporting them.\n",results->hyphens);
  1023     }
  1024     if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
  1025     {
  1026 	g_print("   --> Looks like this is HTML. Switching HTML mode ON.\n");
  1027 	pswit[MARKUP_SWITCH]=1;
  1028     }
  1029     if (results->verylongline>0)
  1030 	g_print("   --> %ld lines in this file are VERY long!\n",
  1031 	  results->verylongline);
  1032     /*
  1033      * If there are more non-PG spaced dashes than PG em-dashes,
  1034      * assume it's deliberate.
  1035      * Current PG guidelines say don't use them, but older texts do,
  1036      * and some people insist on them whatever the guidelines say.
  1037      */
  1038     warnings.dash=1;
  1039     if (results->spacedash+results->non_PG_space_emdash>
  1040       results->PG_space_emdash)
  1041     {
  1042 	warnings.dash=0;
  1043 	g_print("   --> There are %ld spaced dashes and em-dashes. "
  1044 	  "Not reporting them.\n",
  1045 	  results->spacedash+results->non_PG_space_emdash);
  1046     }
  1047     if (charset)
  1048 	warnings.bin=0;
  1049     else
  1050     {
  1051 	/* Charset ISO_8859-1/ASCII checks for compatibility with gutcheck */
  1052 	warnings.bin=1;
  1053 	/* If more than a quarter of characters are hi-bit, bug out. */
  1054 	if (results->binlen*4>results->totlen)
  1055 	{
  1056 	    g_print("   --> This file does not appear to be ASCII. "
  1057 	      "Terminating. Best of luck with it!\n");
  1058 	    exit(1);
  1059 	}
  1060 	if (results->alphalen*4<results->totlen)
  1061 	{
  1062 	    g_print("   --> This file does not appear to be text. "
  1063 	      "Terminating. Best of luck with it!\n");
  1064 	    exit(1);
  1065 	}
  1066 	if (results->binlen*100>results->totlen || results->binlen>100)
  1067 	{
  1068 	    g_print("   --> There are a lot of foreign letters here. "
  1069 	      "Not reporting them.\n");
  1070 	    if (!pswit[VERBOSE_SWITCH])
  1071 		warnings.bin=0;
  1072 	}
  1073     }
  1074     warnings.isDutch=FALSE;
  1075     if (results->Dutchcount>50)
  1076     {
  1077 	warnings.isDutch=TRUE;
  1078 	g_print("   --> This looks like Dutch - "
  1079 	  "switching off dashes and warnings for 's Middags case.\n");
  1080     }
  1081     warnings.isFrench=FALSE;
  1082     if (results->Frenchcount>50)
  1083     {
  1084 	warnings.isFrench=TRUE;
  1085 	g_print("   --> This looks like French - "
  1086 	  "switching off some doublepunct.\n");
  1087     }
  1088     if (results->firstline && results->footerline)
  1089 	g_print("    The PG header and footer appear to be already on.\n");
  1090     else
  1091     {
  1092 	if (results->firstline)
  1093 	    g_print("    The PG header is on - no footer.\n");
  1094 	if (results->footerline)
  1095 	    g_print("    The PG footer is on - no header.\n");
  1096     }
  1097     g_print("\n");
  1098     if (pswit[VERBOSE_SWITCH])
  1099     {
  1100 	warnings.shortline=1;
  1101 	warnings.dotcomma=1;
  1102 	warnings.longline=1;
  1103 	warnings.dash=1;
  1104 	warnings.digit=1;
  1105 	warnings.ast=1;
  1106 	warnings.fslash=1;
  1107 	warnings.hyphen=1;
  1108 	warnings.endquote=1;
  1109 	g_print("   *** Verbose output is ON -- you asked for it! ***\n");
  1110     }
  1111     if (warnings.isDutch)
  1112 	warnings.dash=0;
  1113     if (results->footerline>0 && results->firstline>0 &&
  1114       results->footerline>results->firstline &&
  1115       results->footerline-results->firstline<100)
  1116     {
  1117 	g_print("   --> I don't really know where this text starts. \n");
  1118 	g_print("       There are no reference points.\n");
  1119 	g_print("       I'm going to have to report the header and footer "
  1120 	  "as well.\n");
  1121 	results->firstline=0;
  1122     }
  1123     return &warnings;
  1124 }
  1125 
  1126 /*
  1127  * analyse_quotes:
  1128  *
  1129  * Look along the line, accumulate the count of quotes, and see
  1130  * if this is an empty line - i.e. a line with nothing on it
  1131  * but spaces.
  1132  * If line has just spaces, period, * and/or - on it, don't
  1133  * count it, since empty lines with asterisks or dashes to
  1134  * separate sections are common.
  1135  *
  1136  * Returns: TRUE if the line is empty.
  1137  */
  1138 gboolean analyse_quotes(const char *aline,int linecnt,struct counters *counters)
  1139 {
  1140     int guessquote=0;
  1141     /* assume the line is empty until proven otherwise */
  1142     gboolean isemptyline=TRUE;
  1143     const char *s=aline,*sprev,*snext;
  1144     gunichar c;
  1145     sprev=NULL;
  1146     GError *tmp_err=NULL;
  1147     while (*s)
  1148     {
  1149 	snext=g_utf8_next_char(s);
  1150 	c=g_utf8_get_char(s);
  1151 	if (CHAR_IS_DQUOTE(c))
  1152 	    (void)count_quote(counters,c,QUOTE_CLASS(c),&tmp_err);
  1153 	else if (CHAR_IS_SQUOTE(c) && pswit[SQUOTE_SWITCH])
  1154 	{
  1155 	    if (s==aline)
  1156 	    {
  1157 		/*
  1158 		 * At start of line, it can only be a quotation mark.
  1159 		 * Hardcode a very common exception!
  1160 		 */
  1161 		if (!g_str_has_prefix(snext,"tis") &&
  1162 		  !g_str_has_prefix(snext,"Tis"))
  1163 		    (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
  1164 	    }
  1165 	    else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
  1166 	      g_unichar_isalpha(g_utf8_get_char(snext)))
  1167 		/* Do nothing! it's definitely an apostrophe, not a quote */
  1168 		;
  1169 	    /* it's outside a word - let's check it out */
  1170 	    else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||
  1171 	      g_unichar_isalpha(g_utf8_get_char(snext)))
  1172 	    {
  1173 		/* certainly looks like a quotation mark */
  1174 		if (!g_str_has_prefix(snext,"tis") &&
  1175 		  !g_str_has_prefix(snext,"Tis"))
  1176 		    /* hardcode a very common exception! */
  1177 		{
  1178 		    if (strchr(".?!,;:",g_utf8_get_char(sprev)))
  1179 			(void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
  1180 		    else
  1181 			(void)count_quote(counters,c,OPENING_QUOTE,&tmp_err);
  1182 		}
  1183 	    }
  1184 	    else
  1185 	    {
  1186 		/* now - is it a quotation mark? */
  1187 		guessquote=0;   /* accumulate clues */
  1188 		if (g_unichar_isalpha(g_utf8_get_char(sprev)))
  1189 		{
  1190 		    /* it follows a letter - could be either */
  1191 		    guessquote++;
  1192 		    if (g_utf8_get_char(sprev)=='s')
  1193 		    {
  1194 			/* looks like a plural apostrophe */
  1195 			guessquote-=3;
  1196 			if (g_utf8_get_char(snext)==CHAR_SPACE)
  1197 			    /* bonus marks! */
  1198 			    guessquote-=2;
  1199 		    }
  1200 		    if (innermost_quote_matches(counters,c))
  1201 			/*
  1202 			 * Give it the benefit of some doubt,
  1203 			 * if a squote is already open.
  1204 			 */
  1205 			guessquote++;
  1206 		    else
  1207 			guessquote--;
  1208 		    if (guessquote>=0)
  1209 			(void)count_quote(counters,c,CLOSING_QUOTE,&tmp_err);
  1210 		}
  1211 		else
  1212 		    /* no adjacent letter - it must be a quote of some kind */
  1213 		    (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
  1214 	    }
  1215 	}
  1216 	if (tmp_err)
  1217 	{
  1218 	    if (pswit[ECHO_SWITCH])
  1219 		g_print("\n%s\n",aline);
  1220 	    if (!pswit[OVERVIEW_SWITCH])
  1221 		g_print("    Line %ld column %ld - %s\n",
  1222 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1,tmp_err->message);
  1223 	    g_clear_error(&tmp_err);
  1224 	}
  1225 	if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
  1226 	  c!='\r' && c!='\n')
  1227 	    isemptyline=FALSE;  /* ignore lines like  *  *  *  as spacers */
  1228 	if (c==CHAR_UNDERSCORE)
  1229 	    counters->c_unders++;
  1230 	if (c==CHAR_OPEN_SBRACK)
  1231 	{
  1232 	    if (!matching_difference(counters,COUNTER_ILLUSTRATION) &&
  1233 	      !matching_difference(counters,c) && s==aline &&
  1234 	      g_str_has_prefix(s,"[Illustration:"))
  1235 		increment_matching(counters,COUNTER_ILLUSTRATION,TRUE);
  1236 	    else
  1237 		increment_matching(counters,c,TRUE);
  1238 	}
  1239 	else if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK)
  1240 	    increment_matching(counters,c,TRUE);
  1241 	if (c==CHAR_CLOSE_SBRACK)
  1242 	{
  1243 	    if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) &&
  1244 	      !matching_difference(counters,c) && !*snext)
  1245 		increment_matching(counters,COUNTER_ILLUSTRATION,FALSE);
  1246 	    else
  1247 		increment_matching(counters,c,FALSE);
  1248 	}
  1249 	else if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK)
  1250 	    increment_matching(counters,c,FALSE);
  1251 	sprev=s;
  1252 	s=snext;
  1253     }
  1254     return isemptyline;
  1255 }
  1256 
  1257 /*
  1258  * check_for_control_characters:
  1259  *
  1260  * Check for invalid or questionable characters in the line
  1261  * Anything above 127 is invalid for plain ASCII, and
  1262  * non-printable control characters should also be flagged.
  1263  * Tabs should generally not be there.
  1264  */
  1265 void check_for_control_characters(const char *aline)
  1266 {
  1267     gunichar c;
  1268     const char *s;
  1269     for (s=aline;*s;s=g_utf8_next_char(s))
  1270     {
  1271 	c=g_utf8_get_char(s);
  1272 	if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
  1273 	{
  1274 	    if (pswit[ECHO_SWITCH])
  1275 		g_print("\n%s\n",aline);
  1276 	    if (!pswit[OVERVIEW_SWITCH])
  1277 		g_print("    Line %ld column %ld - Control character %u\n",
  1278 		  linecnt,g_utf8_pointer_to_offset(s,aline)+1,c);
  1279 	    else
  1280 		cnt_bin++;
  1281 	}
  1282     }
  1283 }
  1284 
  1285 /*
  1286  * check_for_odd_characters:
  1287  *
  1288  * Check for binary and other odd characters.
  1289  */
  1290 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
  1291   gboolean isemptyline)
  1292 {
  1293     /* Don't repeat multiple warnings on one line. */
  1294     gboolean eInvalidChar=FALSE,eTab=FALSE,eTilde=FALSE;
  1295     gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
  1296     const char *s;
  1297     gunichar c;
  1298     gsize nb;
  1299     gchar *t;
  1300     for (s=aline;*s;s=g_utf8_next_char(s))
  1301     {
  1302 	c=g_utf8_get_char(s);
  1303 	if (warnings->bin && !eInvalidChar &&
  1304 	  (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
  1305 	{
  1306 	    if (pswit[ECHO_SWITCH])
  1307 		g_print("\n%s\n",aline);
  1308 	    if (!pswit[OVERVIEW_SWITCH])
  1309 		if (c>127 && c<160 || c>255)
  1310 		    g_print("    Line %ld column %ld - "
  1311 		      "Non-ISO-8859 character %u\n",
  1312 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1313 		else
  1314 		    g_print("    Line %ld column %ld - "
  1315 		      "Non-ASCII character %u\n",
  1316 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1317 	    else
  1318 		cnt_bin++;
  1319 	    eInvalidChar=TRUE;
  1320 	}
  1321 	if (!eInvalidChar && charset)
  1322 	{
  1323 	    if (charset_validator==(GIConv)-1)
  1324 	    {
  1325 		if (!g_unichar_isdefined(c))
  1326 		{
  1327 		    if (pswit[ECHO_SWITCH])
  1328 			g_print("\n%s\n",aline);
  1329 		    if (!pswit[OVERVIEW_SWITCH])
  1330 			g_print("    Line %ld column %ld - Unassigned UNICODE "
  1331 			  "code point U+%04" G_GINT32_MODIFIER "X\n",
  1332 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1333 		    else
  1334 			cnt_bin++;
  1335 		    eInvalidChar=TRUE;
  1336 		}
  1337 		else if (c>=0xE000 && c<=0xF8FF || c>=0xF0000 && c<=0xFFFFD ||
  1338 		  c>=100000 && c<=0x10FFFD)
  1339 		{
  1340 		    if (pswit[ECHO_SWITCH])
  1341 			g_print("\n%s\n",aline);
  1342 		    if (!pswit[OVERVIEW_SWITCH])
  1343 			g_print("    Line %ld column %ld - Private Use "
  1344 			  "character U+%04" G_GINT32_MODIFIER "X\n",
  1345 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1346 		    else
  1347 			cnt_bin++;
  1348 		    eInvalidChar=TRUE;
  1349 		}
  1350 	    }
  1351 	    else
  1352 	    {
  1353 		t=g_convert_with_iconv(s,g_utf8_next_char(s)-s,
  1354 		  charset_validator,NULL,&nb,NULL);
  1355 		if (t)
  1356 		    g_free(t);
  1357 		else
  1358 		{
  1359 		    if (pswit[ECHO_SWITCH])
  1360 			g_print("\n%s\n",aline);
  1361 		    if (!pswit[OVERVIEW_SWITCH])
  1362 			g_print("    Line %ld column %ld - Non-%s "
  1363 			  "character %u\n",linecnt,
  1364 			  g_utf8_pointer_to_offset(aline,s)+1,charset,c);
  1365 		    else
  1366 			cnt_bin++;
  1367 		    eInvalidChar=TRUE;
  1368 		}
  1369 	    }
  1370 	}
  1371 	if (!eTab && c==CHAR_TAB)
  1372 	{
  1373 	    if (pswit[ECHO_SWITCH])
  1374 		g_print("\n%s\n",aline);
  1375 	    if (!pswit[OVERVIEW_SWITCH])
  1376 		g_print("    Line %ld column %ld - Tab character?\n",
  1377 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1378 	    else
  1379 		cnt_odd++;
  1380 	    eTab=TRUE;
  1381 	}
  1382 	if (!eTilde && c==CHAR_TILDE)
  1383 	{
  1384 	    /*
  1385 	     * Often used by OCR software to indicate an
  1386 	     * unrecognizable character.
  1387 	     */
  1388 	    if (pswit[ECHO_SWITCH])
  1389 		g_print("\n%s\n",aline);
  1390 	    if (!pswit[OVERVIEW_SWITCH])
  1391 		g_print("    Line %ld column %ld - Tilde character?\n",
  1392 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1393 	    else
  1394 		cnt_odd++;
  1395 	    eTilde=TRUE;
  1396 	}
  1397 	if (!eCarat && c==CHAR_CARAT)
  1398 	{  
  1399 	    if (pswit[ECHO_SWITCH])
  1400 		g_print("\n%s\n",aline);
  1401 	    if (!pswit[OVERVIEW_SWITCH])
  1402 		g_print("    Line %ld column %ld - Carat character?\n",
  1403 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1404 	    else
  1405 		cnt_odd++;
  1406 	    eCarat=TRUE;
  1407 	}
  1408 	if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
  1409 	{  
  1410 	    if (pswit[ECHO_SWITCH])
  1411 		g_print("\n%s\n",aline);
  1412 	    if (!pswit[OVERVIEW_SWITCH])
  1413 		g_print("    Line %ld column %ld - Forward slash?\n",
  1414 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1415 	    else
  1416 		cnt_odd++;
  1417 	    eFSlash=TRUE;
  1418 	}
  1419 	/*
  1420 	 * Report asterisks only in paranoid mode,
  1421 	 * since they're often deliberate.
  1422 	 */
  1423 	if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
  1424 	  c==CHAR_ASTERISK)
  1425 	{
  1426 	    if (pswit[ECHO_SWITCH])
  1427 		g_print("\n%s\n",aline);
  1428 	    if (!pswit[OVERVIEW_SWITCH])
  1429 		g_print("    Line %ld column %ld - Asterisk?\n",
  1430 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1431 	    else
  1432 		cnt_odd++;
  1433 	    eAst=TRUE;
  1434 	}
  1435     }
  1436 }
  1437 
  1438 /*
  1439  * check_for_long_line:
  1440  *
  1441  * Check for line too long.
  1442  */
  1443 void check_for_long_line(const char *aline)
  1444 {
  1445     if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
  1446     {
  1447 	if (pswit[ECHO_SWITCH])
  1448 	    g_print("\n%s\n",aline);
  1449 	if (!pswit[OVERVIEW_SWITCH])
  1450 	    g_print("    Line %ld column %ld - Long line %ld\n",
  1451 	      linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
  1452 	else
  1453 	    cnt_long++;
  1454     }
  1455 }
  1456 
  1457 /*
  1458  * check_for_short_line:
  1459  *
  1460  * Check for line too short.
  1461  *
  1462  * This one is a bit trickier to implement: we don't want to
  1463  * flag the last line of a paragraph for being short, so we
  1464  * have to wait until we know that our current line is a
  1465  * "normal" line, then report the _previous_ line if it was too
  1466  * short. We also don't want to report indented lines like
  1467  * chapter heads or formatted quotations. We therefore keep
  1468  * last->len as the length of the last line examined, and
  1469  * last->blen as the length of the last but one, and try to
  1470  * suppress unnecessary warnings by checking that both were of
  1471  * "normal" length. We keep the first character of the last
  1472  * line in last->start, and if it was a space, we assume that
  1473  * the formatting is deliberate. I can't figure out a way to
  1474  * distinguish something like a quoted verse left-aligned or
  1475  * the header or footer of a letter from a paragraph of short
  1476  * lines - maybe if I examined the whole paragraph, and if the
  1477  * para has less than, say, 8 lines and if all lines are short,
  1478  * then just assume it's OK? Need to look at some texts to see
  1479  * how often a formula like this would get the right result.
  1480  */
  1481 void check_for_short_line(const char *aline,const struct line_properties *last)
  1482 {
  1483     if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
  1484       last->len<SHORTEST_PG_LINE && last->blen>1 &&
  1485       last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
  1486     {
  1487 	if (pswit[ECHO_SWITCH])
  1488 	    g_print("\n%s\n",prevline);
  1489 	if (!pswit[OVERVIEW_SWITCH])
  1490 	    g_print("    Line %ld column %ld - Short line %ld?\n",
  1491 	      linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
  1492 	else
  1493 	    cnt_short++;
  1494     }
  1495 }
  1496 
  1497 /*
  1498  * check_for_starting_punctuation:
  1499  *
  1500  * Look for punctuation other than full ellipses at start of line.
  1501  */
  1502 void check_for_starting_punctuation(const char *aline)
  1503 {
  1504     if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
  1505       !g_str_has_prefix(aline,". . ."))
  1506     {
  1507 	if (pswit[ECHO_SWITCH])
  1508 	    g_print("\n%s\n",aline);
  1509 	if (!pswit[OVERVIEW_SWITCH])
  1510 	    g_print("    Line %ld column 1 - Begins with punctuation?\n",
  1511 	      linecnt);
  1512 	else
  1513 	    cnt_punct++;
  1514     }
  1515 }
  1516 
  1517 /*
  1518  * check_for_spaced_emdash:
  1519  *
  1520  * Check for spaced em-dashes.
  1521  *
  1522  * We must check _all_ occurrences of "--" on the line
  1523  * hence the loop - even if the first double-dash is OK
  1524  * there may be another that's wrong later on.
  1525  */
  1526 void check_for_spaced_emdash(const char *aline)
  1527 {
  1528     const char *s,*t,*next;
  1529     for (s=aline;t=strstr(s,"--");s=next)
  1530     {
  1531 	next=g_utf8_next_char(g_utf8_next_char(t));
  1532 	if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
  1533 	  g_utf8_get_char(next)==CHAR_SPACE)
  1534 	{
  1535 	    if (pswit[ECHO_SWITCH])
  1536 		g_print("\n%s\n",aline);
  1537 	    if (!pswit[OVERVIEW_SWITCH])
  1538 		g_print("    Line %ld column %ld - Spaced em-dash?\n",
  1539 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  1540 	    else
  1541 		cnt_dash++;
  1542 	}
  1543     }
  1544 }
  1545 
  1546 /*
  1547  * check_for_spaced_dash:
  1548  *
  1549  * Check for spaced dashes.
  1550  */
  1551 void check_for_spaced_dash(const char *aline)
  1552 {
  1553     const char *s;
  1554     if ((s=strstr(aline," -")))
  1555     {
  1556 	if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
  1557 	{
  1558 	    if (pswit[ECHO_SWITCH])
  1559 		g_print("\n%s\n",aline);
  1560 	    if (!pswit[OVERVIEW_SWITCH])
  1561 		g_print("    Line %ld column %ld - Spaced dash?\n",
  1562 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1563 	    else
  1564 		cnt_dash++;
  1565 	}
  1566     }
  1567     else if ((s=strstr(aline,"- ")))
  1568     {
  1569 	if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
  1570 	{
  1571 	    if (pswit[ECHO_SWITCH])
  1572 		g_print("\n%s\n",aline);
  1573 	    if (!pswit[OVERVIEW_SWITCH])
  1574 		g_print("    Line %ld column %ld - Spaced dash?\n",
  1575 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1576 	    else
  1577 		cnt_dash++;
  1578 	}
  1579     }
  1580 }
  1581 
  1582 /*
  1583  * check_for_unmarked_paragraphs:
  1584  *
  1585  * Check for unmarked paragraphs indicated by separate speakers.
  1586  *
  1587  * May well be false positive:
  1588  * "Bravo!" "Wonderful!" called the crowd.
  1589  * but useful all the same.
  1590  */
  1591 void check_for_unmarked_paragraphs(const char *aline)
  1592 {
  1593     const char *s;
  1594     s=strstr(aline,"\"  \"");
  1595     if (!s)
  1596 	s=strstr(aline,"\" \"");
  1597     if (s)
  1598     {
  1599 	if (pswit[ECHO_SWITCH])
  1600 	    g_print("\n%s\n",aline);
  1601 	if (!pswit[OVERVIEW_SWITCH])
  1602 	    g_print("    Line %ld column %ld - "
  1603 	      "Query missing paragraph break?\n",
  1604 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1605 	else
  1606 	    cnt_punct++;
  1607     }
  1608 }
  1609 
  1610 /*
  1611  * check_for_jeebies:
  1612  *
  1613  * Check for "to he" and other easy h/b errors.
  1614  *
  1615  * This is a very inadequate effort on the h/b problem,
  1616  * but the phrase "to he" is always an error, whereas "to
  1617  * be" is quite common.
  1618  * Similarly, '"Quiet!", be said.' is a non-be error
  1619  * "to he" is _not_ always an error!:
  1620  *       "Where they went to he couldn't say."
  1621  * Another false positive:
  1622  *       What would "Cinderella" be without the . . .
  1623  * and another: "If he wants to he can see for himself."
  1624  */
  1625 void check_for_jeebies(const char *aline)
  1626 {
  1627     const char *s;
  1628     s=strstr(aline," be could ");
  1629     if (!s)
  1630 	s=strstr(aline," be would ");
  1631     if (!s)
  1632 	s=strstr(aline," was be ");
  1633     if (!s)
  1634 	s=strstr(aline," be is ");
  1635     if (!s)
  1636 	s=strstr(aline," is be ");
  1637     if (!s)
  1638 	s=strstr(aline,"\", be ");
  1639     if (!s)
  1640 	s=strstr(aline,"\" be ");
  1641     if (!s)
  1642 	s=strstr(aline,"\" be ");
  1643     if (!s)
  1644 	s=strstr(aline," to he ");
  1645     if (s)
  1646     {
  1647 	if (pswit[ECHO_SWITCH])
  1648 	    g_print("\n%s\n",aline);
  1649 	if (!pswit[OVERVIEW_SWITCH])
  1650 	    g_print("    Line %ld column %ld - Query he/be error?\n",
  1651 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1652 	else
  1653 	    cnt_word++;
  1654     }
  1655     s=strstr(aline," the had ");
  1656     if (!s)
  1657 	s=strstr(aline," a had ");
  1658     if (!s)
  1659 	s=strstr(aline," they bad ");
  1660     if (!s)
  1661 	s=strstr(aline," she bad ");
  1662     if (!s)
  1663 	s=strstr(aline," he bad ");
  1664     if (!s)
  1665 	s=strstr(aline," you bad ");
  1666     if (!s)
  1667 	s=strstr(aline," i bad ");
  1668     if (s)
  1669     {
  1670 	if (pswit[ECHO_SWITCH])
  1671 	    g_print("\n%s\n",aline);
  1672 	if (!pswit[OVERVIEW_SWITCH])
  1673 	    g_print("    Line %ld column %ld - Query had/bad error?\n",
  1674 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1675 	else
  1676 	    cnt_word++;
  1677     }
  1678     s=strstr(aline,"; hut ");
  1679     if (!s)
  1680 	s=strstr(aline,", hut ");
  1681     if (s)
  1682     {
  1683 	if (pswit[ECHO_SWITCH])
  1684 	    g_print("\n%s\n",aline);
  1685 	if (!pswit[OVERVIEW_SWITCH])
  1686 	    g_print("    Line %ld column %ld - Query hut/but error?\n",
  1687 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1688 	else
  1689 	    cnt_word++;
  1690     }
  1691 }
  1692 
  1693 /*
  1694  * check_for_mta_from:
  1695  *
  1696  * Special case - angled bracket in front of "From" placed there by an
  1697  * MTA when sending an e-mail.
  1698  */
  1699 void check_for_mta_from(const char *aline)
  1700 {
  1701     const char *s;
  1702     s=strstr(aline,">From");
  1703     if (s)
  1704     {
  1705 	if (pswit[ECHO_SWITCH])
  1706 	    g_print("\n%s\n",aline);
  1707 	if (!pswit[OVERVIEW_SWITCH])
  1708 	    g_print("    Line %ld column %ld - "
  1709 	      "Query angled bracket with From\n",
  1710 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1711 	else
  1712 	    cnt_punct++;
  1713     }
  1714 }
  1715 
  1716 /*
  1717  * check_for_orphan_character:
  1718  *
  1719  * Check for a single character line -
  1720  * often an overflow from bad wrapping.
  1721  */
  1722 void check_for_orphan_character(const char *aline)
  1723 {
  1724     gunichar c;
  1725     c=g_utf8_get_char(aline);
  1726     if (c && !*g_utf8_next_char(aline))
  1727     {
  1728 	if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
  1729 	    ; /* Nothing - ignore numerals alone on a line. */
  1730 	else
  1731 	{
  1732 	    if (pswit[ECHO_SWITCH])
  1733 		g_print("\n%s\n",aline);
  1734 	    if (!pswit[OVERVIEW_SWITCH])
  1735 		g_print("    Line %ld column 1 - Query single character line\n",
  1736 		  linecnt);
  1737 	    else
  1738 		cnt_punct++;
  1739 	}
  1740     }
  1741 }
  1742 
  1743 /*
  1744  * check_for_pling_scanno:
  1745  *
  1746  * Check for I" - often should be !
  1747  */
  1748 void check_for_pling_scanno(const char *aline)
  1749 {
  1750     const char *s;
  1751     s=strstr(aline," I\"");
  1752     if (s)
  1753     {
  1754 	if (pswit[ECHO_SWITCH])
  1755 	    g_print("\n%s\n",aline);
  1756 	if (!pswit[OVERVIEW_SWITCH])
  1757 	    g_print("    Line %ld column %ld - Query I=exclamation mark?\n",
  1758 	      linecnt,g_utf8_pointer_to_offset(aline,s));
  1759 	else
  1760 	    cnt_punct++;
  1761     }
  1762 }
  1763 
  1764 /*
  1765  * check_for_extra_period:
  1766  *
  1767  * Check for period without a capital letter. Cut-down from gutspell.
  1768  * Only works when it happens on a single line.
  1769  */
  1770 void check_for_extra_period(const char *aline,const struct warnings *warnings)
  1771 {
  1772     const char *s,*t,*s1,*sprev;
  1773     int i;
  1774     gsize len;
  1775     gboolean istypo;
  1776     gchar *testword;
  1777     gunichar c,nc,pc,*decomposition;
  1778     if (pswit[PARANOID_SWITCH])
  1779     {
  1780 	for (t=aline;t=strstr(t,". ");)
  1781 	{
  1782 	    if (t==aline)
  1783 	    {
  1784 		t=g_utf8_next_char(t);
  1785 		/* start of line punctuation is handled elsewhere */
  1786 		continue;
  1787 	    }
  1788 	    if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
  1789 	    {
  1790 		t=g_utf8_next_char(t);
  1791 		continue;
  1792 	    }
  1793 	    if (warnings->isDutch)
  1794 	    {
  1795 		/* For Frank & Jeroen -- 's Middags case */
  1796 		gunichar c2,c3,c4,c5;
  1797 		c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
  1798 		c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
  1799 		c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
  1800 		c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
  1801 		if (CHAR_IS_APOSTROPHE(c2) &&
  1802 		  g_unichar_islower(c3) && c4==CHAR_SPACE &&
  1803 		  g_unichar_isupper(c5))
  1804 		{
  1805 		    t=g_utf8_next_char(t);
  1806 		    continue;
  1807 		}
  1808 	    }
  1809 	    s1=g_utf8_next_char(g_utf8_next_char(t));
  1810 	    while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
  1811 	      !isdigit(g_utf8_get_char(s1)))
  1812 		s1=g_utf8_next_char(s1);
  1813 	    if (g_unichar_islower(g_utf8_get_char(s1)))
  1814 	    {
  1815 		/* we have something to investigate */
  1816 		istypo=TRUE;
  1817 		/* so let's go back and find out */
  1818 		nc=g_utf8_get_char(t);
  1819 		s1=g_utf8_prev_char(t);
  1820 		c=g_utf8_get_char(s1);
  1821 		sprev=g_utf8_prev_char(s1);
  1822 		pc=g_utf8_get_char(sprev);
  1823 		while (s1>=aline &&
  1824 		  (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||
  1825 		  g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&
  1826 		  g_unichar_isalpha(nc)))
  1827 		{
  1828 		    nc=c;
  1829 		    s1=sprev;
  1830 		    c=pc;
  1831 		    sprev=g_utf8_prev_char(s1);
  1832 		    pc=g_utf8_get_char(sprev);
  1833 		}
  1834 		s1=g_utf8_next_char(s1);
  1835 		s=strchr(s1,'.');
  1836 		if (s)
  1837 		    testword=g_strndup(s1,s-s1);
  1838 		else
  1839 		    testword=g_strdup(s1);
  1840 		for (i=0;*abbrev[i];i++)
  1841 		    if (!strcmp(testword,abbrev[i]))
  1842 			istypo=FALSE;
  1843 		if (g_unichar_isdigit(g_utf8_get_char(testword)))
  1844 		    istypo=FALSE;
  1845 		if (!*g_utf8_next_char(testword))
  1846 		    istypo=FALSE;
  1847 		if (isroman(testword))
  1848 		    istypo=FALSE;
  1849 		if (istypo)
  1850 		{
  1851 		    istypo=FALSE;
  1852 		    for (s=testword;*s;s=g_utf8_next_char(s))
  1853 		    {
  1854 			decomposition=g_unicode_canonical_decomposition(
  1855 			  g_utf8_get_char(s),&len);
  1856 			if (g_utf8_strchr("aeiou",-1,decomposition[0]))
  1857 			    istypo=TRUE;
  1858 			g_free(decomposition);
  1859 		    }
  1860 		}
  1861 		if (istypo &&
  1862 		  (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
  1863 		{
  1864 		    g_tree_insert(qperiod,g_strdup(testword),
  1865 		      GINT_TO_POINTER(1));
  1866 		    if (pswit[ECHO_SWITCH])
  1867 			g_print("\n%s\n",aline);
  1868 		    if (!pswit[OVERVIEW_SWITCH])
  1869 			g_print("    Line %ld column %ld - Extra period?\n",
  1870 			  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  1871 		    else
  1872 			cnt_punct++;
  1873 		}
  1874 		g_free(testword);
  1875 	    }
  1876 	    t=g_utf8_next_char(t);
  1877 	}
  1878     }
  1879 }
  1880 
  1881 /*
  1882  * check_for_following_punctuation:
  1883  *
  1884  * Check for words usually not followed by punctuation.
  1885  */
  1886 void check_for_following_punctuation(const char *aline)
  1887 {
  1888     int i;
  1889     const char *s,*wordstart;
  1890     gunichar c;
  1891     gchar *inword,*t;
  1892     if (pswit[TYPO_SWITCH])
  1893     {
  1894 	for (s=aline;*s;)
  1895 	{
  1896 	    wordstart=s;
  1897 	    t=getaword(&s);
  1898 	    if (!*t)
  1899 	    {
  1900 		g_free(t);
  1901 		continue;
  1902 	    }
  1903 	    inword=g_utf8_strdown(t,-1);
  1904 	    g_free(t);
  1905 	    for (i=0;*nocomma[i];i++)
  1906 		if (!strcmp(inword,nocomma[i]))
  1907 		{
  1908 		    c=g_utf8_get_char(s);
  1909 		    if (c==',' || c==';' || c==':')
  1910 		    {
  1911 			if (pswit[ECHO_SWITCH])
  1912 			    g_print("\n%s\n",aline);
  1913 			if (!pswit[OVERVIEW_SWITCH])
  1914 			    g_print("    Line %ld column %ld - "
  1915 			      "Query punctuation after %s?\n",
  1916 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,
  1917 			      inword);
  1918 			else
  1919 			    cnt_punct++;
  1920 		    }
  1921 		}
  1922 	    for (i=0;*noperiod[i];i++)
  1923 		if (!strcmp(inword,noperiod[i]))
  1924 		{
  1925 		    c=g_utf8_get_char(s);
  1926 		    if (c=='.' || c=='!')
  1927 		    {
  1928 			if (pswit[ECHO_SWITCH])
  1929 			    g_print("\n%s\n",aline);
  1930 			if (!pswit[OVERVIEW_SWITCH])
  1931 			    g_print("    Line %ld column %ld - "
  1932 			      "Query punctuation after %s?\n",
  1933 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,
  1934 			      inword);
  1935 			else
  1936 			    cnt_punct++;
  1937 		    }
  1938 		}
  1939 	    g_free(inword);
  1940 	}
  1941     }
  1942 }
  1943 
  1944 /*
  1945  * check_for_typos:
  1946  *
  1947  * Check for commonly mistyped words,
  1948  * and digits like 0 for O in a word.
  1949  */
  1950 void check_for_typos(const char *aline,struct warnings *warnings)
  1951 {
  1952     const char *s,*t,*nt,*wordstart;
  1953     gchar *inword;
  1954     gunichar *decomposition;
  1955     gchar *testword;
  1956     int i,vowel,consonant,*dupcnt;
  1957     gboolean isdup,istypo,alower;
  1958     gunichar c,pc;
  1959     long offset,len;
  1960     gsize decomposition_len;
  1961     for (s=aline;*s;)
  1962     {
  1963 	wordstart=s;
  1964 	inword=getaword(&s);
  1965 	if (!*inword)
  1966 	{
  1967 	    g_free(inword);
  1968 	    continue; /* don't bother with empty lines */
  1969 	}
  1970 	if (mixdigit(inword))
  1971 	{
  1972 	    if (pswit[ECHO_SWITCH])
  1973 		g_print("\n%s\n",aline);
  1974 	    if (!pswit[OVERVIEW_SWITCH])
  1975 		g_print("    Line %ld column %ld - Query digit in %s\n",
  1976 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
  1977 	    else
  1978 		cnt_word++;
  1979 	}
  1980 	/*
  1981 	 * Put the word through a series of tests for likely typos and OCR
  1982 	 * errors.
  1983 	 */
  1984 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
  1985 	{
  1986 	    istypo=FALSE;
  1987 	    alower=FALSE;
  1988 	    for (t=inword;*t;t=g_utf8_next_char(t))
  1989 	    {
  1990 		c=g_utf8_get_char(t);
  1991 		nt=g_utf8_next_char(t);
  1992 		/* lowercase for testing */
  1993 		if (g_unichar_islower(c))
  1994 		    alower=TRUE;
  1995 		if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
  1996 		{
  1997 		    /*
  1998 		     * We have an uppercase mid-word. However, there are
  1999 		     * common cases:
  2000 		     *   Mac and Mc like McGill
  2001 		     *   French contractions like l'Abbe
  2002 		     */
  2003 		    offset=g_utf8_pointer_to_offset(inword,t);
  2004 		    if (offset>0)
  2005 			pc=g_utf8_get_char(g_utf8_prev_char(t));
  2006 		    else
  2007 			pc='\0';
  2008 		    if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
  2009 		      offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
  2010 		      g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
  2011 		      CHAR_IS_APOSTROPHE(pc))
  2012 			; /* do nothing! */
  2013 		    else
  2014 			istypo=TRUE;
  2015 		}
  2016 	    }
  2017 	    testword=g_utf8_casefold(inword,-1);
  2018 	}
  2019 	if (pswit[TYPO_SWITCH])
  2020 	{
  2021 	    /*
  2022 	     * Check for certain unlikely two-letter combinations at word
  2023 	     * start and end.
  2024 	     */
  2025 	    len=g_utf8_strlen(testword,-1);
  2026 	    if (len>1)
  2027 	    {
  2028 		for (i=0;*nostart[i];i++)
  2029 		    if (g_str_has_prefix(testword,nostart[i]))
  2030 			istypo=TRUE;
  2031 		for (i=0;*noend[i];i++)
  2032 		    if (g_str_has_suffix(testword,noend[i]))
  2033 			istypo=TRUE;
  2034 	    }
  2035 	    /* ght is common, gbt never. Like that. */
  2036 	    if (strstr(testword,"cb"))
  2037 		istypo=TRUE;
  2038 	    if (strstr(testword,"gbt"))
  2039 		istypo=TRUE;
  2040 	    if (strstr(testword,"pbt"))
  2041 		istypo=TRUE;
  2042 	    if (strstr(testword,"tbs"))
  2043 		istypo=TRUE;
  2044 	    if (strstr(testword,"mrn"))
  2045 		istypo=TRUE;
  2046 	    if (strstr(testword,"ahle"))
  2047 		istypo=TRUE;
  2048 	    if (strstr(testword,"ihle"))
  2049 		istypo=TRUE;
  2050 	    /*
  2051 	     * "TBE" does happen - like HEARTBEAT - but uncommon.
  2052 	     * Also "TBI" - frostbite, outbid - but uncommon.
  2053 	     * Similarly "ii" like Hawaii, or Pompeii, and in Roman
  2054 	     * numerals, but "ii" is a common scanno.
  2055 	     */
  2056 	    if (strstr(testword,"tbi"))
  2057 		istypo=TRUE;
  2058 	    if (strstr(testword,"tbe"))
  2059 		istypo=TRUE;
  2060 	    if (strstr(testword,"ii"))
  2061 		istypo=TRUE;
  2062 	    /*
  2063 	     * Check for no vowels or no consonants.
  2064 	     * If none, flag a typo.
  2065 	     */
  2066 	    if (!istypo && len>1)
  2067 	    {
  2068 		vowel=consonant=0;
  2069 		for (t=testword;*t;t=g_utf8_next_char(t))
  2070 		{
  2071 		    c=g_utf8_get_char(t);
  2072 		    decomposition=
  2073 		      g_unicode_canonical_decomposition(c,&decomposition_len);
  2074 		    if (c=='y' || g_unichar_isdigit(c))
  2075 		    {
  2076 			/* Yah, this is loose. */
  2077 			vowel++;
  2078 			consonant++;
  2079 		    }
  2080 		    else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
  2081 			vowel++;
  2082 		    else
  2083 			consonant++;
  2084 		    g_free(decomposition);
  2085 		}
  2086 		if (!vowel || !consonant)
  2087 		    istypo=TRUE;
  2088 	    }
  2089 	    /*
  2090 	     * Now exclude the word from being reported if it's in
  2091 	     * the okword list.
  2092 	     */
  2093 	    for (i=0;*okword[i];i++)
  2094 		if (!strcmp(testword,okword[i]))
  2095 		    istypo=FALSE;
  2096 	    /*
  2097 	     * What looks like a typo may be a Roman numeral.
  2098 	     * Exclude these.
  2099 	     */
  2100 	    if (istypo && isroman(testword))
  2101 		istypo=FALSE;
  2102 	    /* Check the manual list of typos. */
  2103 	    if (!istypo)
  2104 		for (i=0;*typo[i];i++)
  2105 		    if (!strcmp(testword,typo[i]))
  2106 			istypo=TRUE;
  2107 	    /*
  2108 	     * Check lowercase s, l, i and m - special cases.
  2109 	     *   "j" - often a semi-colon gone wrong.
  2110 	     *   "d" for a missing apostrophe - he d
  2111 	     *   "n" for "in"
  2112 	     */
  2113 	    if (!istypo && len==1 &&
  2114 	      g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
  2115 		istypo=TRUE;
  2116 	    if (istypo)
  2117 	    {
  2118 		dupcnt=g_tree_lookup(qword,testword);
  2119 		if (dupcnt)
  2120 		{
  2121 		    (*dupcnt)++;
  2122 		    isdup=!pswit[VERBOSE_SWITCH];
  2123 		}
  2124 		else
  2125 		{
  2126 		    dupcnt=g_new0(int,1);
  2127 		    g_tree_insert(qword,g_strdup(testword),dupcnt);
  2128 		    isdup=FALSE;
  2129 		}
  2130 		if (!isdup)
  2131 		{
  2132 		    if (pswit[ECHO_SWITCH])
  2133 			g_print("\n%s\n",aline);
  2134 		    if (!pswit[OVERVIEW_SWITCH])
  2135 		    {
  2136 			g_print("    Line %ld column %ld - Query word %s",
  2137 			  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
  2138 			  inword);
  2139 			if (!pswit[VERBOSE_SWITCH])
  2140 			    g_print(" - not reporting duplicates");
  2141 			g_print("\n");
  2142 		    }
  2143 		    else
  2144 			cnt_word++;
  2145 		}
  2146 	    }
  2147 	}
  2148 	/* check the user's list of typos */
  2149 	if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
  2150 	{
  2151 	    if (pswit[ECHO_SWITCH])
  2152 		g_print("\n%s\n",aline);
  2153 	    if (!pswit[OVERVIEW_SWITCH])  
  2154 		g_print("    Line %ld column %ld - Query possible scanno %s\n",
  2155 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
  2156 	}
  2157 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
  2158 	    g_free(testword);
  2159 	if (pswit[PARANOID_SWITCH] && warnings->digit)
  2160 	{
  2161 	    /* In paranoid mode, query all 0 and 1 standing alone. */
  2162 	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))
  2163 	    {
  2164 		if (pswit[ECHO_SWITCH])
  2165 		    g_print("\n%s\n",aline);
  2166 		if (!pswit[OVERVIEW_SWITCH])
  2167 		    g_print("    Line %ld column %ld - Query standalone %s\n",
  2168 		      linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
  2169 		      inword);
  2170 		else
  2171 		    cnt_word++;
  2172 	    }
  2173 	}
  2174 	g_free(inword);
  2175     }
  2176 }
  2177 
  2178 /*
  2179  * check_for_misspaced_punctuation:
  2180  *
  2181  * Look for added or missing spaces around punctuation and quotes.
  2182  * If there is a punctuation character like ! with no space on
  2183  * either side, suspect a missing!space. If there are spaces on
  2184  * both sides , assume a typo. If we see a double quote with no
  2185  * space or punctuation on either side of it, assume unspaced
  2186  * quotes "like"this.
  2187  */
  2188 void check_for_misspaced_punctuation(const char *aline,
  2189   struct parities *parities,gboolean isemptyline)
  2190 {
  2191     gboolean isacro,isellipsis;
  2192     const char *s;
  2193     gunichar c,nc,pc,n2c;
  2194     c=g_utf8_get_char(aline);
  2195     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2196     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2197     {
  2198 	pc=c;
  2199 	c=nc;
  2200 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2201 	/* For each character in the line after the first. */
  2202 	if (g_utf8_strchr(".?!,;:_",-1,c))  /* if it's punctuation */
  2203 	{
  2204 	    /* we need to suppress warnings for acronyms like M.D. */
  2205 	    isacro=FALSE;
  2206 	    /* we need to suppress warnings for ellipsis . . . */
  2207 	    isellipsis=FALSE;
  2208 	    /*
  2209 	     * If there are letters on both sides of it or
  2210 	     * if it's strict punctuation followed by an alpha.
  2211 	     */
  2212 	    if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
  2213 	      g_utf8_strchr("?!,;:",-1,c)))
  2214 	    {
  2215 		if (c=='.')
  2216 		{
  2217 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&
  2218 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
  2219 			isacro=TRUE;
  2220 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
  2221 		    if (nc && n2c=='.')
  2222 			isacro=TRUE;
  2223 		}
  2224 		if (!isacro)
  2225 		{
  2226 		    if (pswit[ECHO_SWITCH])
  2227 			g_print("\n%s\n",aline);
  2228 		    if (!pswit[OVERVIEW_SWITCH])
  2229 			g_print("    Line %ld column %ld - Missing space?\n",
  2230 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2231 		    else
  2232 			cnt_punct++;
  2233 		}
  2234 	    }
  2235 	    if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
  2236 	    {
  2237 		/*
  2238 		 * If there are spaces on both sides,
  2239 		 * or space before and end of line.
  2240 		 */
  2241 		if (c=='.')
  2242 		{
  2243 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&
  2244 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
  2245 			isellipsis=TRUE;
  2246 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
  2247 		    if (nc && n2c=='.')
  2248 			isellipsis=TRUE;
  2249 		}
  2250 		if (!isemptyline && !isellipsis)
  2251 		{
  2252 		    if (pswit[ECHO_SWITCH])
  2253 			g_print("\n%s\n",aline);
  2254 		    if (!pswit[OVERVIEW_SWITCH])
  2255 			g_print("    Line %ld column %ld - "
  2256 			  "Spaced punctuation?\n",linecnt,
  2257 			  g_utf8_pointer_to_offset(aline,s)+1);
  2258 		    else
  2259 			cnt_punct++;
  2260 		}
  2261 	    }
  2262 	}
  2263     }
  2264     /* Split out the characters that CANNOT be preceded by space. */
  2265     c=g_utf8_get_char(aline);
  2266     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2267     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2268     {
  2269 	pc=c;
  2270 	c=nc;
  2271 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2272 	/* for each character in the line after the first */
  2273 	if (g_utf8_strchr("?!,;:",-1,c))
  2274 	{
  2275 	    /* if it's punctuation that _cannot_ have a space before it */
  2276 	    if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
  2277 	    {
  2278 		/*
  2279 		 * If nc DOES == space,
  2280 		 * it was already reported just above.
  2281 		 */
  2282 		if (pswit[ECHO_SWITCH])
  2283 		    g_print("\n%s\n",aline);
  2284 		if (!pswit[OVERVIEW_SWITCH])
  2285 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",
  2286 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2287 		else
  2288 		    cnt_punct++;
  2289 	    }
  2290 	}
  2291     }
  2292     /*
  2293      * Special case " .X" where X is any alpha.
  2294      * This plugs a hole in the acronym code above.
  2295      * Inelegant, but maintainable.
  2296      */
  2297     c=g_utf8_get_char(aline);
  2298     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2299     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2300     {
  2301 	pc=c;
  2302 	c=nc;
  2303 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2304 	/* for each character in the line after the first */
  2305 	if (c=='.')
  2306 	{
  2307 	    /* if it's a period */
  2308 	    if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
  2309 	    {
  2310 		/*
  2311 		 * If the period follows a space and
  2312 		 * is followed by a letter.
  2313 		 */
  2314 		if (pswit[ECHO_SWITCH])
  2315 		    g_print("\n%s\n",aline);
  2316 		if (!pswit[OVERVIEW_SWITCH])
  2317 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",
  2318 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2319 		else
  2320 		    cnt_punct++;
  2321 	    }
  2322 	}
  2323     }
  2324     c=g_utf8_get_char(aline);
  2325     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2326     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2327     {
  2328 	pc=c;
  2329 	c=nc;
  2330 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2331 	/* for each character in the line after the first */
  2332 	if (c==CHAR_DQUOTE)
  2333 	{
  2334 	    if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
  2335 	      !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
  2336 	      !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
  2337 	    {
  2338 		if (pswit[ECHO_SWITCH])
  2339 		    g_print("\n%s\n",aline);
  2340 		if (!pswit[OVERVIEW_SWITCH])
  2341 		    g_print("    Line %ld column %ld - Unspaced quotes?\n",
  2342 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2343 		else
  2344 		    cnt_punct++;
  2345 	    }
  2346 	}
  2347     }
  2348     /* Check parity of quotes. */
  2349     nc=g_utf8_get_char(aline);
  2350     for (s=aline;*s;s=g_utf8_next_char(s))
  2351     {
  2352 	c=nc;
  2353 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2354 	if (c==CHAR_DQUOTE)
  2355 	{
  2356 	    parities->dquote=!parities->dquote;
  2357 	    if (!parities->dquote)
  2358 	    {
  2359 		/* parity even */
  2360 		if (!g_utf8_strchr("_-.'`/,;:!?)]} ",-1,nc))
  2361 		{
  2362 		    if (pswit[ECHO_SWITCH])
  2363 			g_print("\n%s\n",aline);
  2364 		    if (!pswit[OVERVIEW_SWITCH])
  2365 			g_print("    Line %ld column %ld - "
  2366 			  "Wrongspaced quotes?\n",
  2367 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2368 		    else
  2369 			cnt_punct++;
  2370 		}
  2371 	    }
  2372 	    else
  2373 	    {
  2374 		/* parity odd */
  2375 		if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
  2376 		  !g_utf8_strchr("_-/.'`([{$",-1,nc) || !nc)
  2377 		{
  2378 		    if (pswit[ECHO_SWITCH])
  2379 			g_print("\n%s\n",aline);
  2380 		    if (!pswit[OVERVIEW_SWITCH])
  2381 			g_print("    Line %ld column %ld - "
  2382 			  "Wrongspaced quotes?\n",
  2383 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2384 		    else
  2385 			cnt_punct++;
  2386 		}
  2387 	    }
  2388 	}
  2389     }
  2390     if (g_utf8_get_char(aline)==CHAR_DQUOTE)
  2391     {
  2392 	if (g_utf8_strchr(",;:!?)]} ",-1,
  2393 	  g_utf8_get_char(g_utf8_next_char(aline))))
  2394 	{
  2395 	    if (pswit[ECHO_SWITCH])
  2396 		g_print("\n%s\n",aline);
  2397 	    if (!pswit[OVERVIEW_SWITCH])
  2398 		g_print("    Line %ld column 1 - Wrongspaced quotes?\n",
  2399 		  linecnt);
  2400 	    else
  2401 		cnt_punct++;
  2402 	}
  2403     }
  2404     if (pswit[SQUOTE_SWITCH])
  2405     {
  2406 	nc=g_utf8_get_char(aline);
  2407 	for (s=aline;*s;s=g_utf8_next_char(s))
  2408 	{
  2409 	    c=nc;
  2410 	    nc=g_utf8_get_char(g_utf8_next_char(s));
  2411 	    if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&
  2412 	      !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
  2413 	      !g_unichar_isalpha(nc)))
  2414 	    {
  2415 		parities->squote=!parities->squote;
  2416 		if (!parities->squote)
  2417 		{
  2418 		    /* parity even */
  2419 		    if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
  2420 		    {
  2421 			if (pswit[ECHO_SWITCH])
  2422 			    g_print("\n%s\n",aline);
  2423 			if (!pswit[OVERVIEW_SWITCH])
  2424 			    g_print("    Line %ld column %ld - "
  2425 			      "Wrongspaced singlequotes?\n",
  2426 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2427 			else
  2428 			    cnt_punct++;
  2429 		    }
  2430 		}
  2431 		else
  2432 		{
  2433 		    /* parity odd */
  2434 		    if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
  2435 		      !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
  2436 		    {
  2437 			if (pswit[ECHO_SWITCH])
  2438 			    g_print("\n%s\n",aline);
  2439 			if (!pswit[OVERVIEW_SWITCH])
  2440 			    g_print("    Line %ld column %ld - "
  2441 			      "Wrongspaced singlequotes?\n",
  2442 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2443 			else
  2444 			    cnt_punct++;
  2445 		    }
  2446 		}
  2447 	    }
  2448 	}
  2449     }
  2450 }
  2451 
  2452 /*
  2453  * check_for_double_punctuation:
  2454  *
  2455  * Look for double punctuation like ,. or ,,
  2456  * Thanks to DW for the suggestion!
  2457  * In books with references, ".," and ".;" are common
  2458  * e.g. "etc., etc.," and vol. 1.; vol 3.;
  2459  * OTOH, from my initial tests, there are also fairly
  2460  * common errors. What to do? Make these cases paranoid?
  2461  * ".," is the most common, so warnings->dotcomma is used
  2462  * to suppress detailed reporting if it occurs often.
  2463  */
  2464 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
  2465 {
  2466     const char *s;
  2467     gunichar c,nc;
  2468     nc=g_utf8_get_char(aline);
  2469     for (s=aline;*s;s=g_utf8_next_char(s))
  2470     {
  2471 	c=nc;
  2472 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2473 	/* for each punctuation character in the line */
  2474 	if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
  2475 	  g_utf8_strchr(".?!,;:",-1,nc))
  2476 	{
  2477 	    /* followed by punctuation, it's a query, unless . . . */
  2478 	    if (c==nc && (c=='.' || c=='?' || c=='!') ||
  2479 	      !warnings->dotcomma && c=='.' && nc==',' ||
  2480 	      warnings->isFrench && g_str_has_prefix(s,",...") ||
  2481 	      warnings->isFrench && g_str_has_prefix(s,"...,") ||
  2482 	      warnings->isFrench && g_str_has_prefix(s,";...") ||
  2483 	      warnings->isFrench && g_str_has_prefix(s,"...;") ||
  2484 	      warnings->isFrench && g_str_has_prefix(s,":...") ||
  2485 	      warnings->isFrench && g_str_has_prefix(s,"...:") ||
  2486 	      warnings->isFrench && g_str_has_prefix(s,"!...") ||
  2487 	      warnings->isFrench && g_str_has_prefix(s,"...!") ||
  2488 	      warnings->isFrench && g_str_has_prefix(s,"?...") ||
  2489 	      warnings->isFrench && g_str_has_prefix(s,"...?"))
  2490 	    {
  2491 		if (warnings->isFrench && g_str_has_prefix(s,",...") ||
  2492 		  warnings->isFrench && g_str_has_prefix(s,"...,") ||
  2493 		  warnings->isFrench && g_str_has_prefix(s,";...") ||
  2494 		  warnings->isFrench && g_str_has_prefix(s,"...;") ||
  2495 		  warnings->isFrench && g_str_has_prefix(s,":...") ||
  2496 		  warnings->isFrench && g_str_has_prefix(s,"...:") ||
  2497 		  warnings->isFrench && g_str_has_prefix(s,"!...") ||
  2498 		  warnings->isFrench && g_str_has_prefix(s,"...!") ||
  2499 		  warnings->isFrench && g_str_has_prefix(s,"?...") ||
  2500 		  warnings->isFrench && g_str_has_prefix(s,"...?"))
  2501 		{
  2502 		    s+=4;
  2503 		    nc=g_utf8_get_char(g_utf8_next_char(s));
  2504 		}
  2505 		; /* do nothing for .. !! and ?? which can be legit */
  2506 	    }
  2507 	    else
  2508 	    {
  2509 		if (pswit[ECHO_SWITCH])
  2510 		    g_print("\n%s\n",aline);
  2511 		if (!pswit[OVERVIEW_SWITCH])
  2512 		    g_print("    Line %ld column %ld - Double punctuation?\n",
  2513 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2514 		else
  2515 		    cnt_punct++;
  2516 	    }
  2517 	}
  2518     }
  2519 }
  2520 
  2521 /*
  2522  * check_for_spaced_quotes:
  2523  */
  2524 void check_for_spaced_quotes(const char *aline)
  2525 {
  2526     int i;
  2527     const char *s,*t;
  2528     const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,
  2529       CHAR_RS_QUOTE};
  2530     GString *pattern;
  2531     s=aline;
  2532     while ((t=strstr(s," \" ")))
  2533     {
  2534 	if (pswit[ECHO_SWITCH])
  2535 	    g_print("\n%s\n",aline);
  2536 	if (!pswit[OVERVIEW_SWITCH])
  2537 	    g_print("    Line %ld column %ld - Spaced doublequote?\n",
  2538 	      linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  2539 	else
  2540 	    cnt_punct++;
  2541 	s=g_utf8_next_char(g_utf8_next_char(t));
  2542     }
  2543     pattern=g_string_new(NULL);
  2544     for(i=0;i<G_N_ELEMENTS(single_quotes);i++)
  2545     {
  2546 	g_string_assign(pattern," ");
  2547 	g_string_append_unichar(pattern,single_quotes[i]);
  2548 	g_string_append_c(pattern,' ');
  2549 	s=aline;
  2550 	while ((t=strstr(s,pattern->str)))
  2551 	{
  2552 	    if (pswit[ECHO_SWITCH])
  2553 		g_print("\n%s\n",aline);
  2554 	    if (!pswit[OVERVIEW_SWITCH])
  2555 		g_print("    Line %ld column %ld - Spaced singlequote?\n",
  2556 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  2557 	    else
  2558 		cnt_punct++;
  2559 	    s=g_utf8_next_char(g_utf8_next_char(t));
  2560 	}
  2561     }
  2562     g_string_free(pattern,TRUE);
  2563 }
  2564 
  2565 /*
  2566  * check_for_miscased_genative:
  2567  *
  2568  * Check special case of 'S instead of 's at end of word.
  2569  */
  2570 void check_for_miscased_genative(const char *aline)
  2571 {
  2572     const char *s;
  2573     gunichar c,nc,pc;
  2574     if (!*aline)
  2575 	return;
  2576     c=g_utf8_get_char(aline);
  2577     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2578     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2579     {
  2580 	pc=c;
  2581 	c=nc;
  2582 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2583 	if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))
  2584 	{
  2585 	    if (pswit[ECHO_SWITCH])
  2586 		g_print("\n%s\n",aline);
  2587 	    if (!pswit[OVERVIEW_SWITCH])
  2588 		g_print("    Line %ld column %ld - Capital \"S\"?\n",
  2589 		  linecnt,g_utf8_pointer_to_offset(aline,s)+2);
  2590 	    else
  2591 		cnt_punct++;
  2592 	}
  2593     }
  2594 }
  2595 
  2596 /*
  2597  * check_end_of_line:
  2598  *
  2599  * Now check special cases - start and end of line -
  2600  * for single and double quotes. Start is sometimes [sic]
  2601  * but better to query it anyway.
  2602  * While we're here, check for dash at end of line.
  2603  */
  2604 void check_end_of_line(const char *aline,struct warnings *warnings)
  2605 {
  2606     int lbytes;
  2607     const char *s;
  2608     gunichar c1,c2;
  2609     lbytes=strlen(aline);
  2610     if (g_utf8_strlen(aline,lbytes)>1)
  2611     {
  2612 	s=g_utf8_prev_char(aline+lbytes);
  2613 	c1=g_utf8_get_char(s);
  2614 	c2=g_utf8_get_char(g_utf8_prev_char(s));
  2615 	if ((c1==CHAR_DQUOTE || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)
  2616 	{
  2617 	    if (pswit[ECHO_SWITCH])
  2618 		g_print("\n%s\n",aline);
  2619 	    if (!pswit[OVERVIEW_SWITCH])
  2620 		g_print("    Line %ld column %ld - Spaced quote?\n",linecnt,
  2621 		  g_utf8_strlen(aline,lbytes));
  2622 	    else
  2623 		cnt_punct++;
  2624 	}
  2625 	c1=g_utf8_get_char(aline);
  2626 	c2=g_utf8_get_char(g_utf8_next_char(aline));
  2627 	if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)
  2628 	{
  2629 	    if (pswit[ECHO_SWITCH])
  2630 		g_print("\n%s\n",aline);
  2631 	    if (!pswit[OVERVIEW_SWITCH])
  2632 		g_print("    Line %ld column 1 - Spaced quote?\n",linecnt);
  2633 	    else
  2634 		cnt_punct++;
  2635 	}
  2636 	/*
  2637 	 * Dash at end of line may well be legit - paranoid mode only
  2638 	 * and don't report em-dash at line-end.
  2639 	 */
  2640 	if (pswit[PARANOID_SWITCH] && warnings->hyphen)
  2641 	{
  2642 	    for (s=g_utf8_prev_char(aline+lbytes);
  2643 	      s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
  2644 		;
  2645 	    if (g_utf8_get_char(s)=='-' &&
  2646 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')
  2647 	    {
  2648 		if (pswit[ECHO_SWITCH])
  2649 		    g_print("\n%s\n",aline);
  2650 		if (!pswit[OVERVIEW_SWITCH])
  2651 		    g_print("    Line %ld column %ld - "
  2652 		      "Hyphen at end of line?\n",
  2653 		      linecnt,g_utf8_pointer_to_offset(aline,s));
  2654 	    }
  2655 	}
  2656     }
  2657 }
  2658 
  2659 /*
  2660  * check_for_unspaced_bracket:
  2661  *
  2662  * Brackets are often unspaced, but shouldn't be surrounded by alpha.
  2663  * If so, suspect a scanno like "a]most".
  2664  */
  2665 void check_for_unspaced_bracket(const char *aline)
  2666 {
  2667     const char *s;
  2668     gunichar c,nc,pc;
  2669     c=g_utf8_get_char(aline);
  2670     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2671     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2672     {
  2673 	pc=c;
  2674 	c=nc;
  2675 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2676 	if (!nc)
  2677 	    break;
  2678 	/* for each bracket character in the line except 1st & last */
  2679 	if (g_utf8_strchr("{[()]}",-1,c) &&
  2680 	  g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
  2681 	{
  2682 	    if (pswit[ECHO_SWITCH])
  2683 		g_print("\n%s\n",aline);
  2684 	    if (!pswit[OVERVIEW_SWITCH])
  2685 		g_print("    Line %ld column %ld - Unspaced bracket?\n",
  2686 		  linecnt,g_utf8_pointer_to_offset(aline,s));
  2687 	    else
  2688 		cnt_punct++;
  2689 	}
  2690     }
  2691 }
  2692 
  2693 /*
  2694  * check_for_unpunctuated_endquote:
  2695  */
  2696 void check_for_unpunctuated_endquote(const char *aline)
  2697 {
  2698     const char *s;
  2699     gunichar c,nc,pc;
  2700     c=g_utf8_get_char(aline);
  2701     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2702     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2703     {
  2704 	pc=c;
  2705 	c=nc;
  2706 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2707 	/* for each character in the line except 1st */
  2708 	if (c==CHAR_DQUOTE && isalpha(pc))
  2709 	{
  2710 	    if (pswit[ECHO_SWITCH])
  2711 		g_print("\n%s\n",aline);
  2712 	    if (!pswit[OVERVIEW_SWITCH])
  2713 		g_print("    Line %ld column %ld - "
  2714 		  "endquote missing punctuation?\n",
  2715 		  linecnt,g_utf8_pointer_to_offset(aline,s));
  2716 	    else
  2717 		cnt_punct++;
  2718 	}
  2719     }
  2720 }
  2721 
  2722 /*
  2723  * check_for_html_tag:
  2724  *
  2725  * Check for <HTML TAG>.
  2726  *
  2727  * If there is a < in the line, followed at some point
  2728  * by a > then we suspect HTML.
  2729  */
  2730 void check_for_html_tag(const char *aline)
  2731 {
  2732     const char *open,*close;
  2733     gchar *tag;
  2734     open=strchr(aline,'<');
  2735     if (open)
  2736     {
  2737 	close=strchr(g_utf8_next_char(open),'>');
  2738 	if (close)
  2739 	{
  2740 	    if (pswit[ECHO_SWITCH])
  2741 		g_print("\n%s\n",aline);
  2742 	    if (!pswit[OVERVIEW_SWITCH])
  2743 	    {
  2744 		tag=g_strndup(open,close-open+1);
  2745 		g_print("    Line %ld column %ld - HTML Tag? %s \n",
  2746 		  linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
  2747 		g_free(tag);
  2748 	    }
  2749 	    else
  2750 		cnt_html++;
  2751 	}
  2752     }
  2753 }
  2754 
  2755 /*
  2756  * check_for_html_entity:
  2757  *
  2758  * Check for &symbol; HTML.
  2759  *
  2760  * If there is a & in the line, followed at
  2761  * some point by a ; then we suspect HTML.
  2762  */
  2763 void check_for_html_entity(const char *aline)
  2764 {
  2765     const char *s,*amp,*scolon;
  2766     gchar *entity;
  2767     amp=strchr(aline,'&');
  2768     if (amp)
  2769     {
  2770 	scolon=strchr(amp,';');
  2771 	if (scolon)
  2772 	{
  2773 	    for (s=amp;s<scolon;s=g_utf8_next_char(s))   
  2774 		if (g_utf8_get_char(s)==CHAR_SPACE)
  2775 		    break;		/* Don't report "Jones & Son;" */
  2776 	    if (s>=scolon)
  2777 	    {
  2778 		if (pswit[ECHO_SWITCH])
  2779 		    g_print("\n%s\n",aline);
  2780 		if (!pswit[OVERVIEW_SWITCH])
  2781 		{
  2782 		    entity=g_strndup(amp,scolon-amp+1);
  2783 		    g_print("    Line %ld column %d - HTML symbol? %s \n",
  2784 		      linecnt,(int)(amp-aline)+1,entity);
  2785 		    g_free(entity);
  2786 		}
  2787 		else
  2788 		    cnt_html++;
  2789 	    }
  2790 	}
  2791     }
  2792 }
  2793 
  2794 /*
  2795  * check_for_omitted_punctuation:
  2796  *
  2797  * Check for omitted punctuation at end of paragraph by working back
  2798  * through prevline. DW.
  2799  * Need to check this only for "normal" paras.
  2800  * So what is a "normal" para?
  2801  *    Not normal if one-liner (chapter headings, etc.)
  2802  *    Not normal if doesn't contain at least one locase letter
  2803  *    Not normal if starts with space
  2804  */
  2805 void check_for_omitted_punctuation(const char *prevline,
  2806   struct line_properties *last,int start_para_line)
  2807 {
  2808     gboolean letter_on_line=FALSE;
  2809     const char *s;
  2810     gunichar c;
  2811     gboolean closing_quote;
  2812     for (s=prevline;*s;s=g_utf8_next_char(s))
  2813 	if (g_unichar_isalpha(g_utf8_get_char(s)))
  2814 	{
  2815 	    letter_on_line=TRUE;
  2816 	    break;
  2817 	}
  2818     /*
  2819      * This next "if" is a problem.
  2820      * If we say "start_para_line <= linecnt - 1", that includes
  2821      * one-line "paragraphs" like chapter heads. Lotsa false positives.
  2822      * If we say "start_para_line < linecnt - 1" it doesn't, but then it
  2823      * misses genuine one-line paragraphs.
  2824      */
  2825     if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
  2826       g_utf8_get_char(prevline)>CHAR_SPACE)
  2827     {
  2828 	s=prevline+strlen(prevline);
  2829 	do
  2830 	{
  2831 	    s=g_utf8_prev_char(s);
  2832 	    c=g_utf8_get_char(s);
  2833 	    if (QUOTE_CLASS(c)==CLOSING_QUOTE || QUOTE_CLASS(c)==NEUTRAL_QUOTE)
  2834 		closing_quote=TRUE;
  2835 	    else
  2836 		closing_quote=FALSE;
  2837 	} while (closing_quote && s>prevline);
  2838 	for (;s>prevline;s=g_utf8_prev_char(s))
  2839 	{
  2840 	    if (g_unichar_isalpha(g_utf8_get_char(s)))
  2841 	    {
  2842 		if (pswit[ECHO_SWITCH])
  2843 		    g_print("\n%s\n",prevline);
  2844 		if (!pswit[OVERVIEW_SWITCH])
  2845 		    g_print("    Line %ld column %ld - "
  2846 		      "No punctuation at para end?\n",
  2847 		      linecnt-1,g_utf8_strlen(prevline,-1));
  2848 		else
  2849 		    cnt_punct++;
  2850 		break;
  2851 	    }
  2852 	    if (g_utf8_strchr("-.:!([{?}])",-1,g_utf8_get_char(s)))
  2853 		break;
  2854 	}
  2855     }
  2856 }
  2857 
  2858 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
  2859 {
  2860     const char *word=key;
  2861     int *dupcnt=value;
  2862     if (*dupcnt)
  2863 	g_print("\nNote: Queried word %s was duplicated %d times\n",
  2864 	  word,*dupcnt);
  2865     return FALSE;
  2866 }
  2867 
  2868 void print_as_windows_1252(const char *string)
  2869 {
  2870     gsize inbytes,outbytes;
  2871     gchar *buf,*bp;
  2872     static GIConv converter=(GIConv)-1;
  2873     if (!string)
  2874     {
  2875 	if (converter!=(GIConv)-1)
  2876 	    g_iconv_close(converter);
  2877 	converter=(GIConv)-1;
  2878 	return;
  2879     }
  2880     if (converter==(GIConv)-1)
  2881 	converter=g_iconv_open("WINDOWS-1252","UTF-8");
  2882     if (converter!=(GIConv)-1)
  2883     {
  2884 	inbytes=outbytes=strlen(string);
  2885 	bp=buf=g_malloc(outbytes+1);
  2886 	g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
  2887 	*bp='\0';
  2888 	fputs(buf,stdout);
  2889 	g_free(buf);
  2890     }
  2891     else
  2892 	fputs(string,stdout);
  2893 }
  2894 
  2895 void print_as_utf_8(const char *string)
  2896 {
  2897     fputs(string,stdout);
  2898 }
  2899 
  2900 /*
  2901  * procfile:
  2902  *
  2903  * Process one file.
  2904  */
  2905 void procfile(const char *filename)
  2906 {
  2907     const char *s;
  2908     gchar *parastart=NULL;	/* first line of current para */
  2909     gchar *etext,*aline;
  2910     gchar *etext_ptr;
  2911     GError *err=NULL;
  2912     struct first_pass_results *first_pass_results;
  2913     struct warnings *warnings;
  2914     struct counters counters={0};
  2915     struct line_properties last={0};
  2916     struct parities parities={0};
  2917     struct pending pending={0};
  2918     gboolean isemptyline;
  2919     long start_para_line=0;
  2920     gboolean isnewpara=FALSE,enddash=FALSE;
  2921     last.start=CHAR_SPACE;
  2922     linecnt=checked_linecnt=0;
  2923     etext=read_etext(filename,&err);
  2924     if (!etext)
  2925     {
  2926 	if (pswit[STDOUT_SWITCH])
  2927 	    fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
  2928 	else
  2929 	    fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
  2930 	exit(1);
  2931     }
  2932     g_print("\n\nFile: %s\n\n",filename);
  2933     first_pass_results=first_pass(etext);
  2934     warnings=report_first_pass(first_pass_results);
  2935     qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
  2936     qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
  2937     /*
  2938      * Here we go with the main pass. Hold onto yer hat!
  2939      */
  2940     linecnt=0;
  2941     etext_ptr=etext;
  2942     while ((aline=flgets(&etext_ptr,linecnt+1)))
  2943     {
  2944 	linecnt++;
  2945 	if (linecnt==1)
  2946 	    isnewpara=TRUE;
  2947 	if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
  2948 	    continue;    // skip DP page separators completely
  2949 	if (linecnt<first_pass_results->firstline ||
  2950 	  (first_pass_results->footerline>0 &&
  2951 	  linecnt>first_pass_results->footerline))
  2952 	{
  2953 	    if (pswit[HEADER_SWITCH])
  2954 	    {
  2955 		if (g_str_has_prefix(aline,"Title:"))
  2956 		    g_print("    %s\n",aline);
  2957 		if (g_str_has_prefix(aline,"Author:"))
  2958 		    g_print("    %s\n",aline);
  2959 		if (g_str_has_prefix(aline,"Release Date:"))
  2960 		    g_print("    %s\n",aline);
  2961 		if (g_str_has_prefix(aline,"Edition:"))
  2962 		    g_print("    %s\n\n",aline);
  2963 	    }
  2964 	    continue;		/* skip through the header */
  2965 	}
  2966 	checked_linecnt++;
  2967 	print_pending(aline,parastart,&pending);
  2968 	isemptyline=analyse_quotes(aline,linecnt,&counters);
  2969 	if (isnewpara && !isemptyline)
  2970 	{
  2971 	    /* This line is the start of a new paragraph. */
  2972 	    start_para_line=linecnt;
  2973 	    /* Capture its first line in case we want to report it later. */
  2974 	    g_free(parastart);
  2975 	    parastart=g_strdup(aline);
  2976 	    memset(&parities,0,sizeof(parities));  /* restart the quote count */
  2977 	    s=aline;
  2978 	    while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
  2979 	      !g_unichar_isdigit(g_utf8_get_char(s)))
  2980 		s=g_utf8_next_char(s);
  2981 	    if (g_unichar_islower(g_utf8_get_char(s)))
  2982 	    {
  2983 		/* and its first letter is lowercase */
  2984 		if (pswit[ECHO_SWITCH])
  2985 		    g_print("\n%s\n",aline);
  2986 		if (!pswit[OVERVIEW_SWITCH])
  2987 		    g_print("    Line %ld column %ld - "
  2988 		      "Paragraph starts with lower-case\n",
  2989 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2990 		else
  2991 		    cnt_punct++;
  2992 	    }
  2993 	    isnewpara=FALSE; /* Signal the end of new para processing. */
  2994 	}
  2995 	/* Check for an em-dash broken at line end. */
  2996 	if (enddash && g_utf8_get_char(aline)=='-')
  2997 	{
  2998 	    if (pswit[ECHO_SWITCH])
  2999 		g_print("\n%s\n",aline);
  3000 	    if (!pswit[OVERVIEW_SWITCH])
  3001 		g_print("    Line %ld column 1 - Broken em-dash?\n",linecnt);
  3002 	    else
  3003 		cnt_punct++;
  3004 	}
  3005 	enddash=FALSE;
  3006 	for (s=g_utf8_prev_char(aline+strlen(aline));
  3007 	  g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
  3008 	    ;
  3009 	if (s>=aline && g_utf8_get_char(s)=='-')
  3010 	    enddash=TRUE;
  3011 	check_for_control_characters(aline);
  3012 	check_for_odd_characters(aline,warnings,isemptyline);
  3013 	if (warnings->longline)
  3014 	    check_for_long_line(aline);
  3015 	if (warnings->shortline)
  3016 	    check_for_short_line(aline,&last);
  3017 	last.blen=last.len;
  3018 	last.len=g_utf8_strlen(aline,-1);
  3019 	last.start=g_utf8_get_char(aline);
  3020 	check_for_starting_punctuation(aline);
  3021 	if (warnings->dash)
  3022 	{
  3023 	    check_for_spaced_emdash(aline);
  3024 	    check_for_spaced_dash(aline);
  3025 	}
  3026 	check_for_unmarked_paragraphs(aline);
  3027 	check_for_jeebies(aline);
  3028 	check_for_mta_from(aline);
  3029 	check_for_orphan_character(aline);
  3030 	check_for_pling_scanno(aline);
  3031 	check_for_extra_period(aline,warnings);
  3032 	check_for_following_punctuation(aline);
  3033 	check_for_typos(aline,warnings);
  3034 	check_for_misspaced_punctuation(aline,&parities,isemptyline);
  3035 	check_for_double_punctuation(aline,warnings);
  3036 	check_for_spaced_quotes(aline);
  3037 	check_for_miscased_genative(aline);
  3038 	check_end_of_line(aline,warnings);
  3039 	check_for_unspaced_bracket(aline);
  3040 	if (warnings->endquote)
  3041 	    check_for_unpunctuated_endquote(aline);
  3042 	check_for_html_tag(aline);
  3043 	check_for_html_entity(aline);
  3044 	if (isemptyline)
  3045 	{
  3046 	    check_for_mismatched_quotes(&counters,&pending);
  3047 	    counters_reset(&counters);
  3048 	    /* let the next iteration know that it's starting a new para */
  3049 	    isnewpara=TRUE;
  3050 	    if (prevline)
  3051 		check_for_omitted_punctuation(prevline,&last,start_para_line);
  3052 	}
  3053 	g_free(prevline);
  3054 	prevline=g_strdup(aline);
  3055     }
  3056     linecnt++;
  3057     check_for_mismatched_quotes(&counters,&pending);
  3058     print_pending(NULL,parastart,&pending);
  3059     reset_pending(&pending);
  3060     if (prevline)
  3061     {
  3062 	g_free(prevline);
  3063 	prevline=NULL;
  3064     }
  3065     g_free(parastart);
  3066     g_free(prevline);
  3067     g_free(etext);
  3068     if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])
  3069 	g_tree_foreach(qword,report_duplicate_queries,NULL);
  3070     g_tree_unref(qword);
  3071     g_tree_unref(qperiod);
  3072     counters_destroy(&counters);
  3073     g_set_print_handler(NULL);
  3074     print_as_windows_1252(NULL);
  3075     if (pswit[MARKUP_SWITCH])  
  3076 	loseentities(NULL);
  3077 }
  3078 
  3079 /*
  3080  * flgets:
  3081  *
  3082  * Get one line from the input text, checking for
  3083  * the existence of exactly one CR/LF line-end per line.
  3084  *
  3085  * Returns: a pointer to the line.
  3086  */
  3087 char *flgets(char **etext,long lcnt)
  3088 {
  3089     gunichar c;
  3090     gboolean isCR=FALSE;
  3091     char *theline=*etext;
  3092     char *eos=theline;
  3093     gchar *s;
  3094     for (;;)
  3095     {
  3096 	c=g_utf8_get_char(*etext);
  3097 	*etext=g_utf8_next_char(*etext);
  3098 	if (!c)
  3099 	    return NULL;
  3100 	/* either way, it's end of line */
  3101 	if (c=='\n')
  3102 	{
  3103 	    if (isCR)
  3104 		break;
  3105 	    else
  3106 	    {
  3107 		/* Error - a LF without a preceding CR */
  3108 		if (pswit[LINE_END_SWITCH])
  3109 		{
  3110 		    if (pswit[ECHO_SWITCH])
  3111 		    {
  3112 			s=g_strndup(theline,eos-theline);
  3113 			g_print("\n%s\n",s);
  3114 			g_free(s);
  3115 		    }
  3116 		    if (!pswit[OVERVIEW_SWITCH])
  3117 			g_print("    Line %ld - No CR?\n",lcnt);
  3118 		    else
  3119 			cnt_lineend++;
  3120 		}
  3121 		break;
  3122 	    }
  3123 	}
  3124 	if (c=='\r')
  3125 	{
  3126 	    if (isCR)
  3127 	    {
  3128 		/* Error - two successive CRs */
  3129 		if (pswit[LINE_END_SWITCH])
  3130 		{
  3131 		    if (pswit[ECHO_SWITCH])
  3132 		    {
  3133 			s=g_strndup(theline,eos-theline);
  3134 			g_print("\n%s\n",s);
  3135 			g_free(s);
  3136 		    }
  3137 		    if (!pswit[OVERVIEW_SWITCH])
  3138 			g_print("    Line %ld - Two successive CRs?\n",lcnt);
  3139 		    else
  3140 			cnt_lineend++;
  3141 		}
  3142 	    }
  3143 	    isCR=TRUE;
  3144 	}
  3145 	else
  3146 	{
  3147 	    if (pswit[LINE_END_SWITCH] && isCR)
  3148 	    {
  3149 		if (pswit[ECHO_SWITCH])
  3150 		{
  3151 		    s=g_strndup(theline,eos-theline);
  3152 		    g_print("\n%s\n",s);
  3153 		    g_free(s);
  3154 		}
  3155 		if (!pswit[OVERVIEW_SWITCH])
  3156 		    g_print("    Line %ld column %ld - CR without LF?\n",
  3157 		      lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
  3158 		else
  3159 		    cnt_lineend++;
  3160 		*eos=' ';
  3161 	    }
  3162 	    isCR=FALSE;
  3163 	    eos=g_utf8_next_char(eos);
  3164 	}
  3165     }
  3166     *eos='\0';
  3167     if (pswit[MARKUP_SWITCH])  
  3168 	postprocess_for_HTML(theline);
  3169     if (pswit[DP_SWITCH])  
  3170 	postprocess_for_DP(theline);
  3171     return theline;
  3172 }
  3173 
  3174 /*
  3175  * mixdigit:
  3176  *
  3177  * Takes a "word" as a parameter, and checks whether it
  3178  * contains a mixture of alpha and digits. Generally, this is an
  3179  * error, but may not be for cases like 4th or L5 12s. 3d.
  3180  *
  3181  * Returns: TRUE iff an is error found.
  3182  */
  3183 gboolean mixdigit(const char *checkword)
  3184 {
  3185     gboolean wehaveadigit,wehavealetter,query;
  3186     const char *s,*nondigit;
  3187     wehaveadigit=wehavealetter=query=FALSE;
  3188     for (s=checkword;*s;s=g_utf8_next_char(s))
  3189 	if (g_unichar_isalpha(g_utf8_get_char(s)))
  3190 	    wehavealetter=TRUE;
  3191 	else if (g_unichar_isdigit(g_utf8_get_char(s)))
  3192 	    wehaveadigit=TRUE;
  3193     if (wehaveadigit && wehavealetter)
  3194     {
  3195 	/* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
  3196 	query=TRUE;
  3197 	for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
  3198 	  nondigit=g_utf8_next_char(nondigit))
  3199 	    ;
  3200 	/* digits, ending in st, rd, nd, th of either case */
  3201 	if (!g_ascii_strcasecmp(nondigit,"st") ||
  3202 	  !g_ascii_strcasecmp(nondigit,"rd") ||
  3203 	  !g_ascii_strcasecmp(nondigit,"nd") ||
  3204 	  !g_ascii_strcasecmp(nondigit,"th"))
  3205 	    query=FALSE;
  3206 	if (!g_ascii_strcasecmp(nondigit,"sts") ||
  3207 	  !g_ascii_strcasecmp(nondigit,"rds") ||
  3208 	  !g_ascii_strcasecmp(nondigit,"nds") ||
  3209 	  !g_ascii_strcasecmp(nondigit,"ths"))
  3210 	    query=FALSE;
  3211 	if (!g_ascii_strcasecmp(nondigit,"stly") ||
  3212 	  !g_ascii_strcasecmp(nondigit,"rdly") ||
  3213 	  !g_ascii_strcasecmp(nondigit,"ndly") ||
  3214 	  !g_ascii_strcasecmp(nondigit,"thly"))
  3215 	    query=FALSE;
  3216 	/* digits, ending in l, L, s or d */
  3217 	if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
  3218 	  !strcmp(nondigit,"d"))
  3219 	    query=FALSE;
  3220 	/*
  3221 	 * L at the start of a number, representing Britsh pounds, like L500.
  3222 	 * This is cute. We know the current word is mixed digit. If the first
  3223 	 * letter is L, there must be at least one digit following. If both
  3224 	 * digits and letters follow, we have a genuine error, else we have a
  3225 	 * capital L followed by digits, and we accept that as a non-error.
  3226 	 */
  3227 	if (g_utf8_get_char(checkword)=='L' &&
  3228 	  !mixdigit(g_utf8_next_char(checkword)))
  3229 	    query=FALSE;
  3230     }
  3231     return query;
  3232 }
  3233 
  3234 /*
  3235  * getaword:
  3236  *
  3237  * Extracts the first/next "word" from the line, and returns it.
  3238  * A word is defined as one English word unit--or at least that's the aim.
  3239  * "ptr" is advanced to the position in the line where we will start
  3240  * looking for the next word.
  3241  *
  3242  * Returns: A newly-allocated string.
  3243  */
  3244 gchar *getaword(const char **ptr)
  3245 {
  3246     const char *s,*t;
  3247     GString *word;
  3248     gunichar c,pc;
  3249     word=g_string_new(NULL);
  3250     for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
  3251       !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
  3252       **ptr;*ptr=g_utf8_next_char(*ptr))
  3253 	;
  3254     /*
  3255      * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
  3256      * Especially yucky is the case of L1,000
  3257      * This section looks for a pattern of characters including a digit
  3258      * followed by a comma or period followed by one or more digits.
  3259      * If found, it returns this whole pattern as a word; otherwise we discard
  3260      * the results and resume our normal programming.
  3261      */
  3262     s=*ptr;
  3263     for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
  3264       g_unichar_isalpha(g_utf8_get_char(s)) ||
  3265       g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
  3266 	g_string_append_unichar(word,g_utf8_get_char(s));
  3267     if (word->len)
  3268     {
  3269 	for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))
  3270 	{
  3271 	    c=g_utf8_get_char(t);
  3272 	    pc=g_utf8_get_char(g_utf8_prev_char(t));
  3273 	    if ((c=='.' || c==',') && g_unichar_isdigit(pc))
  3274 	    {
  3275 		*ptr=s;
  3276 		return g_string_free(word,FALSE);
  3277 	    }
  3278 	}
  3279     }
  3280     /* we didn't find a punctuated number - do the regular getword thing */
  3281     g_string_truncate(word,0);
  3282     c=g_utf8_get_char(*ptr);
  3283     for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);
  3284       *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))
  3285 	g_string_append_unichar(word,c);
  3286     return g_string_free(word,FALSE);
  3287 }
  3288 
  3289 /*
  3290  * isroman:
  3291  *
  3292  * Is this word a Roman Numeral?
  3293  *
  3294  * It doesn't actually validate that the number is a valid Roman Numeral--for
  3295  * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
  3296  * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
  3297  * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
  3298  * expressions thereof, except when it came to taxes. Allow any number of M,
  3299  * an optional D, an optional CM or CD, any number of optional Cs, an optional
  3300  * XL or an optional XC, an optional IX or IV, an optional V and any number
  3301  * of optional Is.
  3302  */
  3303 gboolean isroman(const char *t)
  3304 {
  3305     const char *s;
  3306     if (!t || !*t)
  3307 	return FALSE;
  3308     s=t;
  3309     while (g_utf8_get_char(t)=='m' && *t)
  3310 	t++;
  3311     if (g_utf8_get_char(t)=='d')
  3312 	t++;
  3313     if (g_str_has_prefix(t,"cm"))
  3314 	t+=2;
  3315     if (g_str_has_prefix(t,"cd"))
  3316 	t+=2;
  3317     while (g_utf8_get_char(t)=='c' && *t)
  3318 	t++;
  3319     if (g_str_has_prefix(t,"xl"))
  3320 	t+=2;
  3321     if (g_str_has_prefix(t,"xc"))
  3322 	t+=2;
  3323     if (g_utf8_get_char(t)=='l')
  3324 	t++;
  3325     while (g_utf8_get_char(t)=='x' && *t)
  3326 	t++;
  3327     if (g_str_has_prefix(t,"ix"))
  3328 	t+=2;
  3329     if (g_str_has_prefix(t,"iv"))
  3330 	t+=2;
  3331     if (g_utf8_get_char(t)=='v')
  3332 	t++;
  3333     while (g_utf8_get_char(t)=='i' && *t)
  3334 	t++;
  3335     return !*t;
  3336 }
  3337 
  3338 /*
  3339  * postprocess_for_DP:
  3340  *
  3341  * Invoked with the -d switch from flgets().
  3342  * It simply "removes" from the line a hard-coded set of common
  3343  * DP-specific tags, so that the line passed to the main routine has
  3344  * been pre-cleaned of DP markup.
  3345  */
  3346 void postprocess_for_DP(char *theline)
  3347 {
  3348     char *s,*t;
  3349     int i;
  3350     if (!*theline) 
  3351 	return;
  3352     for (i=0;*DPmarkup[i];i++)
  3353 	while ((s=strstr(theline,DPmarkup[i])))
  3354 	{
  3355 	    t=s+strlen(DPmarkup[i]);
  3356 	    memmove(s,t,strlen(t)+1);
  3357 	}
  3358 }
  3359 
  3360 /*
  3361  * postprocess_for_HTML:
  3362  *
  3363  * Invoked with the -m switch from flgets().
  3364  * It simply "removes" from the line a hard-coded set of common
  3365  * HTML tags and "replaces" a hard-coded set of common HTML
  3366  * entities, so that the line passed to the main routine has
  3367  * been pre-cleaned of HTML.
  3368  */
  3369 void postprocess_for_HTML(char *theline)
  3370 {
  3371     while (losemarkup(theline))
  3372 	;
  3373     loseentities(theline);
  3374 }
  3375 
  3376 char *losemarkup(char *theline)
  3377 {
  3378     char *s,*t;
  3379     int i;
  3380     s=strchr(theline,'<');
  3381     t=s?strchr(s,'>'):NULL;
  3382     if (!s || !t)
  3383 	return NULL;
  3384     for (i=0;*markup[i];i++)
  3385 	if (tagcomp(g_utf8_next_char(s),markup[i]))
  3386 	{
  3387 	    t=g_utf8_next_char(t);
  3388 	    memmove(s,t,strlen(t)+1);
  3389 	    return s;
  3390 	}
  3391     /* It's an unrecognized <xxx>. */
  3392     return NULL;
  3393 }
  3394 
  3395 void loseentities(char *theline)
  3396 {
  3397     int i;
  3398     gsize nb;
  3399     char *amp,*scolon;
  3400     gchar *s,*t;
  3401     gunichar c;
  3402     GTree *entities=NULL;
  3403     static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
  3404     if (!theline)
  3405     {
  3406 	if (entities)
  3407 	    g_tree_destroy(entities);
  3408 	entities=NULL;
  3409 	if (translit!=(GIConv)-1)
  3410 	    g_iconv_close(translit);
  3411 	translit=(GIConv)-1;
  3412 	if (to_utf8!=(GIConv)-1)
  3413 	    g_iconv_close(to_utf8);
  3414 	to_utf8=(GIConv)-1;
  3415 	return;
  3416     }
  3417     if (!*theline)
  3418 	return;
  3419     if (!entities)
  3420     {
  3421 	entities=g_tree_new((GCompareFunc)strcmp);
  3422 	for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
  3423 	    g_tree_insert(entities,HTMLentities[i].name,
  3424 	      GUINT_TO_POINTER(HTMLentities[i].c));
  3425     }
  3426     if (translit==(GIConv)-1)
  3427 	translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
  3428     if (to_utf8==(GIConv)-1)
  3429 	to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
  3430     while((amp=strchr(theline,'&')))
  3431     {
  3432 	scolon=strchr(amp,';');
  3433 	if (scolon)
  3434 	{
  3435 	    if (amp[1]=='#')
  3436 	    {
  3437 		if (amp+2+strspn(amp+2,"0123456789")==scolon)
  3438 		    c=strtol(amp+2,NULL,10);
  3439 		else if (amp[2]=='x' &&
  3440 		  amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
  3441 		    c=strtol(amp+3,NULL,16);
  3442 	    }
  3443 	    else
  3444 	    {
  3445 		s=g_strndup(amp+1,scolon-(amp+1));
  3446 	        c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
  3447 		g_free(s);
  3448 	    }
  3449 	}
  3450 	else
  3451 	    c=0;
  3452 	if (c)
  3453 	{
  3454 	    theline=amp;
  3455 	    if (c<128 || c>=192 && c<=255)	/* An ISO-8859-1 character */
  3456 		theline+=g_unichar_to_utf8(c,theline);
  3457 	    else
  3458 	    {
  3459 		s=g_malloc(6);
  3460 		nb=g_unichar_to_utf8(c,s);
  3461 		t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
  3462 		g_free(s);
  3463 		s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
  3464 		g_free(t);
  3465 		memcpy(theline,s,nb);
  3466 		g_free(s);
  3467 		theline+=nb;
  3468 	    }
  3469 	    memmove(theline,g_utf8_next_char(scolon),
  3470 	      strlen(g_utf8_next_char(scolon))+1);
  3471 	}
  3472 	else
  3473 	    theline=g_utf8_next_char(amp);
  3474     }
  3475 }
  3476 
  3477 gboolean tagcomp(const char *strin,const char *basetag)
  3478 {
  3479     gboolean retval;
  3480     gchar *s,*t;
  3481     if (g_utf8_get_char(strin)=='/')
  3482 	t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
  3483     else
  3484 	t=g_utf8_casefold(strin,-1);
  3485     s=g_utf8_casefold(basetag,-1);
  3486     retval=g_str_has_prefix(t,s);
  3487     g_free(s);
  3488     g_free(t);
  3489     return retval;
  3490 }
  3491 
  3492 void proghelp(GOptionContext *context)
  3493 {
  3494     gchar *help;
  3495     fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
  3496     fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
  3497     fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
  3498     fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
  3499       "For details, read the file COPYING.\n",stderr);
  3500     fputs("This is Free Software; "
  3501       "you may redistribute it under certain conditions (GPL);\n",stderr);
  3502     fputs("read the file COPYING for details.\n\n",stderr);
  3503     help=g_option_context_get_help(context,TRUE,NULL);
  3504     fputs(help,stderr);
  3505     g_free(help);
  3506     fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
  3507     fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
  3508       "non-ASCII\n",stderr);
  3509     fputs("characters like accented letters, "
  3510       "lines longer than 75 or shorter than 55,\n",stderr);
  3511     fputs("unbalanced quotes or brackets, "
  3512       "a variety of badly formatted punctuation, \n",stderr);
  3513     fputs("HTML tags, some likely typos. "
  3514       "It is NOT a substitute for human judgement.\n",stderr);
  3515     fputs("\n",stderr);
  3516 }