bookloupe/bookloupe.c
author ali <ali@juiblex.co.uk>
Sun Oct 27 16:58:50 2013 +0000 (2013-10-27)
changeset 201 f1d85b36e188
parent 200 8e0ba1a088c4
parent 185 a6d93c9932ac
child 202 c25e023cb9fe
permissions -rw-r--r--
Merge bug #13: Character sets
     1 /*************************************************************************/
     2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
     3 /*									 */
     4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>			 */
     5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>			 */
     6 /*									 */
     7 /* This program is free software; you can redistribute it and/or modify  */
     8 /* it under the terms of the GNU General Public License as published by  */
     9 /* the Free Software Foundation; either version 2 of the License, or     */
    10 /* (at your option) any later version.					 */
    11 /*									 */
    12 /* This program is distributed in the hope that it will be useful,       */
    13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of	 */
    14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the		 */
    15 /* GNU General Public License for more details.				 */
    16 /*									 */
    17 /* You should have received a copy of the GNU General Public License	 */
    18 /* along with this program. If not, see <http://www.gnu.org/licenses/>.	 */
    19 /*************************************************************************/
    20 
    21 #include <stdio.h>
    22 #include <stdlib.h>
    23 #include <string.h>
    24 #include <ctype.h>
    25 #ifdef __WIN32__
    26 #include <windows.h>
    27 #endif
    28 #include <glib.h>
    29 #include <bl/bl.h>
    30 #include "bookloupe.h"
    31 #include "counters.h"
    32 #include "pending.h"
    33 #include "HTMLentities.h"
    34 
     35 gchar *charset;		/* Name of the selected character set, or NULL for auto (ISO_8859-1/ASCII or UNICODE) */
/*
 * Converter from UTF-8 to charset as opened by set_charset(), or
 * (GIConv)-1 when no converter is open (auto mode or UTF-8).
 */
     36 GIConv charset_validator=(GIConv)-1;
     37 
/* NOTE(review): not referenced in the visible part of this file - confirm its use. */
     38 gchar *prevline;
    39 
     40 /* Common typos. The list is terminated by an empty string. */
     41 char *typo[] = {
     42     "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
     43     "nad", "te", "ig", "acn",  "ahve", "alot", "anbd", "andt", "awya", "aywa",
     44     "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
     45     "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
     46     "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
     47     "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
     48     "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
     49     "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
     50     "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
     51     "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
     52     "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
     53     "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
     54     "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
     55     "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
     56     "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
     57     "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
     58     "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
     59     "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
     60     "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
     61     "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
     62     "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
     63     "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
     64     "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
     65     "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
     66     "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
     67     "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
     68     "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
     69     "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
     70     "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
     71     "se", ""
     72 };
    73 
/*
 * User-defined typos, keyed by word. Created and filled by
 * read_user_scannos(); stays NULL when no user typo file was loaded.
 */
     74 GTree *usertypo;
    75 
     76 /* Common abbreviations and other OK words not to query as typos. */
/* The list is terminated by an empty string. */
     77 char *okword[] = {
     78     "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
     79     "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
     80     "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
     81     "outbid", "outbids", "frostbite", "frostbitten", ""
     82 };
    83 
     84 /* Common abbreviations that cause otherwise unexplained periods. */
/* The list is terminated by an empty string. */
     85 char *abbrev[] = {
     86     "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
     87     "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
     88 };
    89 
     90 /*
     91  * Two-Letter combinations that rarely if ever start words,
     92  * but are common scannos or otherwise common letter combinations.
 * The list is terminated by an empty string.
     93  */
     94 char *nostart[] = {
     95     "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
     96 };
    97 
     98 /*
     99  * Two-Letter combinations that rarely if ever end words,
    100  * but are common scannos or otherwise common letter combinations.
 * The list is terminated by an empty string.
    101  */
    102 char *noend[] = {
    103     "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
    104     "sw", "gr", "sl", "cl", "iy", ""
    105 };
   106 
/*
 * HTML element names, in lower case and without angle brackets.
 * The list is terminated by an empty string.
 */
    107 char *markup[] = {
    108     "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
    109     "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
    110     "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
    111     "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
    112 };
   113 
/*
 * Literal markup sequences, presumably the DP-specific markup that the
 * --dp switch ignores (confirm against postprocess_for_DP()).
 * The list is terminated by an empty string.
 */
    114 char *DPmarkup[] = {
    115     "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
    116 };
   117 
   118 char *nocomma[] = {
   119     "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
   120     "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
   121     "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
   122     "during", "let", "toward", "among", ""
   123 };
   124 
/*
 * Lower-case words held in the "no period" table; presumably words that
 * should not be directly followed by a period (confirm against the code
 * that consults this table). The list is terminated by an empty string.
 */
    125 char *noperiod[] = {
    126     "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
    127     "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
    128     "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
    129     "among", "those", "into", "whom", "having", "thence", ""
    130 }; 
   131 
    132 gboolean pswit[SWITNO];  /* program switches, indexed by the *_SWITCH constants */
/* Value given to --charset; consumed and freed by parse_options(). */
    133 gchar *opt_charset;
    134 
/*
 * Set by the gutcheck compatibility options --toggle-typo and
 * --toggle-relaxed; parse_options() turns them into toggles of pswit[].
 */
    135 gboolean typo_compat,paranoid_compat;
   136 
/*
 * Boolean switches accepted on the command line and, under their positive
 * names, in the "options" group of the configuration file. Every entry
 * stores into pswit[]. Each switch comes as a pair: the plain name and a
 * "no-" prefixed name using G_OPTION_FLAG_REVERSE, with one of the pair
 * hidden from the help output.
 */
    137 static GOptionEntry options[]={
    138     { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
    139       "Ignore DP-specific markup", NULL },
    140     { "no-dp", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
    141       G_OPTION_ARG_NONE, pswit+DP_SWITCH,
    142       "Don't ignore DP-specific markup", NULL },
    143     { "echo", 0, G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
    144       "Echo queried line", NULL },
    145     { "no-echo", 'e', G_OPTION_FLAG_REVERSE,
    146       G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
    147       "Don't echo queried line", NULL },
    148     { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
    149       "Check single quotes", NULL },
    150     { "no-squote", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
    151       G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
    152       "Don't check single quotes", NULL },
    153     { "typo", 0, 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
    154       "Check common typos", NULL },
    155     { "no-typo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
    156       G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
    157       "Don't check common typos", NULL },
    158     { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
    159       "Require closure of quotes on every paragraph", NULL },
    160     { "no-qpara", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
    161       G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
    162       "Don't require closure of quotes on every paragraph", NULL },
    163     { "paranoid", 0, G_OPTION_FLAG_HIDDEN,
    164       G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
    165       "Enable paranoid querying of everything", NULL },
    166     { "no-paranoid", 0, G_OPTION_FLAG_REVERSE,
    167       G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
    168       "Disable paranoid querying of everything", NULL },
    169     { "line-end", 0, G_OPTION_FLAG_HIDDEN,
    170       G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
    171       "Enable line end checking", NULL },
    172     { "no-line-end", 'l', G_OPTION_FLAG_REVERSE,
    173       G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
    174       "Disable line end checking", NULL },
    175     { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
    176       "Overview: just show counts", NULL },
    177     { "no-overview", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
    178       G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
    179       "Show individual warnings", NULL },
    180     { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
    181       "Output errors to stdout instead of stderr", NULL },
    182     { "no-stdout", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
    183       G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
    184       "Output errors to stderr instead of stdout", NULL },
    185     { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
    186       "Echo header fields", NULL },
    187     { "no-header", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
    188       G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
    189       "Don't echo header fields", NULL },
    190     { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
    191       "Ignore markup in < >", NULL },
    192     { "no-markup", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
    193       G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
    194       "No special handling for markup in < >", NULL },
    195     { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
    196       "Use file of user-defined typos", NULL },
    197     { "no-usertypo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
    198       G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
    199       "Ignore file of user-defined typos", NULL },
    200     { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
    201       "Verbose - list everything", NULL },
    202     { "no-verbose", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
    203       G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
    204       "Switch off verbose mode", NULL },
    205     { NULL }
    206 };
   207 
    208 /*
    209  * Options relating to configuration which make no sense from inside
    210  * a configuration file.
 * They are registered for command-line parsing by parse_options() but are
 * not consulted when the configuration file is read or dumped.
    211  */
    212 
    213 static GOptionEntry config_options[]={
    214     { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
    215       "Defaults for use on www upload", NULL },
    216     { "dump-config", 0, 0, G_OPTION_ARG_NONE, pswit+DUMP_CONFIG_SWITCH,
    217       "Dump current config settings", NULL },
    218     { "charset", 0, 0, G_OPTION_ARG_STRING, &opt_charset,
    219       "Set of characters valid for this ebook", "NAME" },
    220     { NULL }
    221 };
   222 
/*
 * Gutcheck-compatible toggle switches. They only record that the switch
 * was given; parse_options() applies the toggles to pswit[] after parsing.
 */
    223 static GOptionEntry compatibility_options[]={
    224     { "toggle-typo", 't', 0, G_OPTION_ARG_NONE, &typo_compat,
    225       "Toggle checking for common typos", NULL },
    226     { "toggle-relaxed", 'x', 0, G_OPTION_ARG_NONE, &paranoid_compat,
    227       "Toggle both paranoid mode and common typos", NULL },
    228     { NULL }
    229 };
   230 
/*
 * Query counters. main() prints the non-zero cnt_* values and their total
 * when the overview switch is set.
 */
    231 long cnt_quote;		/* for overview mode, count of quote queries */
    232 long cnt_brack;		/* for overview mode, count of brackets queries */
    233 long cnt_bin;		/* for overview mode, count of non-ASCII queries */
    234 long cnt_odd;		/* for overview mode, count of odd character queries */
    235 long cnt_long;		/* for overview mode, count of long line errors */
    236 long cnt_short;		/* for overview mode, count of short line queries */
    237 long cnt_punct;		/* for overview mode,
    238 			   count of punctuation and spacing queries */
    239 long cnt_dash;		/* for overview mode, count of dash-related queries */
    240 long cnt_word;		/* for overview mode, count of word queries */
    241 long cnt_html;		/* for overview mode, count of html queries */
    242 long cnt_lineend;	/* for overview mode, count of line-end queries */
    243 long cnt_spacend;	/* count of lines with space at end */
    244 long linecnt;		/* count of total lines in the file */
    245 long checked_linecnt;	/* count of lines actually checked */
   246 
   247 void proghelp(GOptionContext *context);
   248 void procfile(const char *);
   249 
/*
 * Directory part of argv[0], set by main(). Used as a fallback location
 * for bookloupe.ini and the user typo files.
 */
    250 gchar *running_from;
   251 
   252 gboolean mixdigit(const char *);
   253 gchar *getaword(const char **);
   254 char *flgets(char **,long,int);
   255 void postprocess_for_HTML(char *);
   256 char *linehasmarkup(char *);
   257 char *losemarkup(char *);
   258 gboolean tagcomp(const char *,const char *);
   259 void loseentities(char *);
   260 gboolean isroman(const char *);
   261 void postprocess_for_DP(char *);
   262 void print_as_windows_1252(const char *string);
   263 void print_as_utf_8(const char *string);
   264 
   265 GTree *qword,*qperiod;
   266 
    267 #ifdef __WIN32__
/* Console output code page saved by main() and restored by cleanup_on_exit(). */
    268 UINT saved_cp;
    269 #endif
   270 
/*
 * Configuration loaded by parse_config_file(), or NULL when no
 * bookloupe.ini was found. dump_config() creates one on demand.
 */
    271 GKeyFile *config;
   272 
   273 void config_file_update(GKeyFile *kf)
   274 {
   275     int i;
   276     gboolean sw;
   277     for(i=0;options[i].long_name;i++)
   278     {
   279 	if (g_str_has_prefix(options[i].long_name,"no-"))
   280 	    continue;
   281 	if (options[i].arg==G_OPTION_ARG_NONE)
   282 	{
   283 	    sw=*(gboolean *)options[i].arg_data;
   284 	    if (options[i].flags&G_OPTION_FLAG_REVERSE)
   285 		sw=!sw;
   286 	    g_key_file_set_boolean(kf,"options",options[i].long_name,sw);
   287 	}
   288 	else
   289 	    g_assert_not_reached();
   290     }
   291 }
   292 
   293 void config_file_add_comments(GKeyFile *kf)
   294 {
   295     int i;
   296     gchar *comment;
   297     g_key_file_set_comment(kf,NULL,NULL," Default configuration for bookloupe",
   298       NULL);
   299     for(i=0;options[i].long_name;i++)
   300     {
   301 	if (g_str_has_prefix(options[i].long_name,"no-"))
   302 	    continue;
   303 	comment=g_strconcat(" ",options[i].description,NULL);
   304 	g_key_file_set_comment(kf,"options",options[i].long_name,comment,NULL);
   305 	g_free(comment);
   306     }
   307 }
   308 
   309 void dump_config(void)
   310 {
   311     gchar *s;
   312     if (config)
   313 	config_file_update(config);
   314     else
   315     {
   316 	config=g_key_file_new();
   317 	config_file_update(config);
   318 	config_file_add_comments(config);
   319     }
   320     s=g_key_file_to_data(config,NULL,NULL);
   321     if (s)
   322 	g_print("%s",s);
   323     g_free(s);
   324 }
   325 
   326 GKeyFile *read_config_file(gchar **full_path)
   327 {
   328     int i;
   329     GError *err=NULL;
   330     gchar **search_dirs;
   331     gchar *path;
   332     const char *search_path;
   333     GKeyFile *kf;
   334     kf=g_key_file_new();
   335     search_path=g_getenv("BOOKLOUPE_CONFIG_PATH");
   336     if (search_path)
   337     {
   338 #ifdef __WIN32__
   339 	search_dirs=g_strsplit(search_path,";",0);
   340 #else
   341 	search_dirs=g_strsplit(search_path,":",0);
   342 #endif
   343     }
   344     else
   345     {
   346 	search_dirs=g_new(gchar *,4);
   347 	search_dirs[0]=g_get_current_dir();
   348 	search_dirs[1]=g_strdup(running_from);
   349 	search_dirs[2]=g_strdup(g_get_user_config_dir());
   350 	search_dirs[3]=NULL;
   351     }
   352     for(i=0;search_dirs[i];i++)
   353     {
   354 	path=g_build_filename(search_dirs[i],"bookloupe.ini",NULL);
   355 	if (g_key_file_load_from_file(kf,path,
   356 	  G_KEY_FILE_KEEP_COMMENTS|G_KEY_FILE_KEEP_TRANSLATIONS,&err))
   357 	    break;
   358 	if (!g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   359 	{
   360 	    g_printerr("Bookloupe: Error reading %s\n",path);
   361 	    g_printerr("%s\n",err->message);
   362 	    exit(1);
   363 	}
   364 	g_clear_error(&err);
   365 	g_free(path);
   366 	path=NULL;
   367     }
   368     if (!search_dirs[i])
   369     {
   370 	g_key_file_free(kf);
   371 	kf=NULL;
   372     }
   373     g_strfreev(search_dirs);
   374     if (full_path && kf)
   375 	*full_path=path;
   376     else
   377 	g_free(path);
   378     return kf;
   379 }
   380 
   381 void parse_config_file(void)
   382 {
   383     int i,j;
   384     gchar *path;
   385     gchar **keys;
   386     gboolean sw;
   387     GError *err=NULL;
   388     config=read_config_file(&path);
   389     if (config)
   390 	keys=g_key_file_get_keys(config,"options",NULL,NULL);
   391     else
   392 	keys=NULL;
   393     if (keys)
   394     {
   395 	for(i=0;keys[i];i++)
   396 	{
   397 	    for(j=0;options[j].long_name;j++)
   398 	    {
   399 		if (g_str_has_prefix(options[j].long_name,"no-"))
   400 		    continue;
   401 		else if (!strcmp(keys[i],options[j].long_name))
   402 		{
   403 		    if (options[j].arg==G_OPTION_ARG_NONE)
   404 		    {
   405 			sw=g_key_file_get_boolean(config,"options",keys[i],
   406 			  &err);
   407 			if (err)
   408 			{
   409 			    g_printerr("Bookloupe: %s: options.%s: %s\n",
   410 			      path,keys[i],err->message);
   411 			    g_clear_error(&err);
   412 			}
   413 			if (options[j].flags&G_OPTION_FLAG_REVERSE)
   414 			    sw=!sw;
   415 			*(gboolean *)options[j].arg_data=sw;
   416 			break;
   417 		    }
   418 		    else
   419 			g_assert_not_reached();
   420 		}
   421 	    }
   422 	    if (!options[j].long_name)
   423 		g_printerr("Bookloupe: %s: Unknown option \"%s\" ignored\n",
   424 		  path,keys[i]);
   425 	}
   426 	g_strfreev(keys);
   427     }
   428     if (config)
   429 	g_free(path);
   430 }
   431 
   432 gboolean set_charset(const char *name,GError **err)
   433 {
   434     /* The various UNICODE encodings all share the same character set. */
   435     const char *unicode_aliases[]={ "UCS-2", "UCS-2BE", "UCS-2LE", "UCS-4",
   436       "UCS-4BE", "UCS-4LE", "UCS2", "UCS4", "UNICODE", "UNICODEBIG",
   437       "UNICODELITTLE", "UTF-7", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE",
   438       "UTF-32", "UTF-32BE", "UTF-32LE", "UTF7", "UTF8", "UTF16", "UTF16BE",
   439       "UTF16LE", "UTF32", "UTF32BE", "UTF32LE" };
   440     int i;
   441     if (charset)
   442 	g_free(charset);
   443     if (charset_validator!=(GIConv)-1)
   444 	g_iconv_close(charset_validator);
   445     if (!name || !g_strcasecmp(name,"auto"))
   446     {
   447 	charset=NULL;
   448 	charset_validator=(GIConv)-1;
   449 	return TRUE;
   450     }
   451     else
   452 	charset=g_strdup(name);
   453     for(i=0;i<G_N_ELEMENTS(unicode_aliases);i++)
   454 	if (!g_strcasecmp(charset,unicode_aliases[i]))
   455 	{
   456 	    g_free(charset);
   457 	    charset=g_strdup("UTF-8");
   458 	    break;
   459 	}
   460     if (!strcmp(charset,"UTF-8"))
   461 	charset_validator=(GIConv)-1;
   462     else
   463     {
   464 	charset_validator=g_iconv_open(charset,"UTF-8");
   465 	if (charset_validator==(GIConv)-1)
   466 	{
   467 	    g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_NO_CONVERSION,
   468 	      "Unknown character set \"%s\"",charset);
   469 	    return FALSE;
   470 	}
   471     }
   472     return TRUE;
   473 }
   474 
   475 void parse_options(int *argc,char ***argv)
   476 {
   477     GError *err=NULL;
   478     GOptionContext *context;
   479     GOptionGroup *compatibility;
   480     context=g_option_context_new(
   481       "file - look for errors in Project Gutenberg(TM) etexts");
   482     g_option_context_add_main_entries(context,options,NULL);
   483     g_option_context_add_main_entries(context,config_options,NULL);
   484     compatibility=g_option_group_new("compatibility",
   485       "Options for Compatibility with Gutcheck:",
   486       "Show compatibility options",NULL,NULL);
   487     g_option_group_add_entries(compatibility,compatibility_options);
   488     g_option_context_add_group(context,compatibility);
   489     g_option_context_set_description(context,
   490       "For simplicity, only the switch options which reverse the\n"
   491       "default configuration are listed. In most cases, both vanilla\n"
   492       "and \"no-\" prefixed versions are available for use.");
   493     if (!g_option_context_parse(context,argc,argv,&err))
   494     {
   495 	g_printerr("Bookloupe: %s\n",err->message);
   496 	g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
   497 	exit(1);
   498     }
   499     if (typo_compat)
   500 	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
   501     if (paranoid_compat)
   502     {
   503 	pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
   504 	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
   505     }
   506     /*
   507      * Web uploads - for the moment, this is really just a placeholder
   508      * until we decide what processing we really want to do on web uploads
   509      */
   510     if (pswit[WEB_SWITCH])
   511     {
   512 	/* specific override for web uploads */
   513 	pswit[ECHO_SWITCH]=TRUE;
   514 	pswit[SQUOTE_SWITCH]=FALSE;
   515 	pswit[TYPO_SWITCH]=TRUE;
   516 	pswit[QPARA_SWITCH]=FALSE;
   517 	pswit[PARANOID_SWITCH]=TRUE;
   518 	pswit[LINE_END_SWITCH]=FALSE;
   519 	pswit[OVERVIEW_SWITCH]=FALSE;
   520 	pswit[STDOUT_SWITCH]=FALSE;
   521 	pswit[HEADER_SWITCH]=TRUE;
   522 	pswit[VERBOSE_SWITCH]=FALSE;
   523 	pswit[MARKUP_SWITCH]=FALSE;
   524 	pswit[USERTYPO_SWITCH]=FALSE;
   525 	pswit[DP_SWITCH]=FALSE;
   526     }
   527     if (opt_charset && !set_charset(opt_charset,&err))
   528     {
   529 	g_printerr("%s\n",err->message);
   530 	exit(1);
   531     }
   532     if (pswit[DUMP_CONFIG_SWITCH])
   533     {
   534 	dump_config();
   535 	exit(0);
   536     }
   537     g_free(opt_charset);
   538     opt_charset=NULL;
   539     if (pswit[OVERVIEW_SWITCH])
   540 	/* just print summary; don't echo */
   541 	pswit[ECHO_SWITCH]=FALSE;
   542     if (*argc<2)
   543     {
   544 	proghelp(context);
   545 	exit(1);
   546     }
   547     g_option_context_free(context);
   548 }
   549 
   550 /*
   551  * read_user_scannos:
   552  *
   553  * Read in the user-defined stealth scanno list.
   554  */
   555 void read_user_scannos(void)
   556 {
   557     GError *err=NULL;
   558     gchar *usertypo_file;
   559     gboolean okay;
   560     int i;
   561     gsize len,nb;
   562     gchar *contents,*utf8,**lines;
   563     usertypo_file=g_strdup("bookloupe.typ");
   564     okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   565     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   566     {
   567 	g_clear_error(&err);
   568 	g_free(usertypo_file);
   569 	usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
   570 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   571     }
   572     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   573     {
   574 	g_clear_error(&err);
   575 	g_free(usertypo_file);
   576 	usertypo_file=g_strdup("gutcheck.typ");
   577 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   578     }
   579     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   580     {
   581 	g_clear_error(&err);
   582 	g_free(usertypo_file);
   583 	usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
   584 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   585     }
   586     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   587     {
   588 	g_free(usertypo_file);
   589 	g_print("   --> I couldn't find bookloupe.typ "
   590 	  "-- proceeding without user typos.\n");
   591 	return;
   592     }
   593     else if (!okay)
   594     {
   595 	fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
   596 	g_free(usertypo_file);
   597 	g_clear_error(&err);
   598 	exit(1);
   599     }
   600     if (g_utf8_validate(contents,len,NULL))
   601     {
   602 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
   603 	if (!charset)
   604 	    (void)set_charset("UNICODE",NULL);
   605     }
   606     else
   607 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
   608     g_free(contents);
   609     lines=g_strsplit_set(utf8,"\r\n",0);
   610     g_free(utf8);
   611     usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
   612     for (i=0;lines[i];i++)
   613 	if (*(unsigned char *)lines[i]>'!')
   614 	    g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
   615 	else
   616 	    g_free(lines[i]);
   617     g_free(lines);
   618 }
   619 
   620 /*
   621  * read_etext:
   622  *
   623  * Read an etext returning a newly allocated string containing the file
   624  * contents or NULL on error.
   625  */
   626 gchar *read_etext(const char *filename,GError **err)
   627 {
   628     GError *tmp_err=NULL;
   629     gchar *contents,*utf8;
   630     gsize len,bytes_read,bytes_written;
   631     int i,line,col;
   632     if (!g_file_get_contents(filename,&contents,&len,err))
   633 	return NULL;
   634     if (g_utf8_validate(contents,len,NULL))
   635     {
   636 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
   637 	g_set_print_handler(print_as_utf_8);
   638 #ifdef __WIN32__
   639 	SetConsoleOutputCP(CP_UTF8);
   640 #endif
   641     }
   642     else
   643     {
   644 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,
   645 	  &bytes_written,&tmp_err);
   646 	if (g_error_matches(tmp_err,G_CONVERT_ERROR,
   647 	  G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
   648 	{
   649 	    line=col=1;
   650 	    for(i=0;i<bytes_read;i++)
   651 		if (contents[i]=='\n')
   652 		{
   653 		    line++;
   654 		    col=1;
   655 		}
   656 		else if (contents[i]!='\r')
   657 		    col++;
   658 	    g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
   659 	      "Input conversion failed. Byte %d at line %d, column %d is not a "
   660 	      "valid Windows-1252 character",
   661 	      ((unsigned char *)contents)[bytes_read],line,col);
   662 	}
   663 	else if (tmp_err)
   664 	    g_propagate_error(err,tmp_err);
   665 	g_set_print_handler(print_as_windows_1252);
   666 #ifdef __WIN32__
   667 	SetConsoleOutputCP(1252);
   668 #endif
   669     }
   670     g_free(contents);
   671     return utf8;
   672 }
   673 
   674 void cleanup_on_exit(void)
   675 {
   676 #ifdef __WIN32__
   677     SetConsoleOutputCP(saved_cp);
   678 #endif
   679 }
   680 
/*
 * main:
 *
 * Program entry point. Sets the switch defaults, applies the configuration
 * file and then the command line, loads the user typo list if requested
 * and checks the file named by the first remaining argument with
 * procfile(). In overview mode the non-zero query counts and their total
 * are printed afterwards. Returns 0; the option helpers call exit()
 * themselves on failure.
 */
    681 int main(int argc,char **argv)
    682 {
    683 #ifdef __WIN32__
    684     atexit(cleanup_on_exit);
    685     saved_cp=GetConsoleOutputCP();
    686 #endif
    687     running_from=g_path_get_dirname(argv[0]);
    688     /* Paranoid checking is turned OFF, not on, by its switch */
    689     pswit[PARANOID_SWITCH]=TRUE;
    690     /* if running in paranoid mode, typo checks default to enabled */
    691     pswit[TYPO_SWITCH]=TRUE;
    692     /* Line-end checking is turned OFF, not on, by its switch */
    693     pswit[LINE_END_SWITCH]=TRUE;
    694     /* Echoing is turned OFF, not on, by its switch */
    695     pswit[ECHO_SWITCH]=TRUE;
    /* The configuration file is applied first so the command line can override it */
    696     parse_config_file();
    697     parse_options(&argc,&argv);
    698     if (pswit[USERTYPO_SWITCH])
    699 	read_user_scannos();
    700     fprintf(stderr,"bookloupe: Check and report on an e-text\n");
    /* parse_options() has already exited if no file argument was left */
    701     procfile(argv[1]);
    702     if (pswit[OVERVIEW_SWITCH])
    703     {
    704 	g_print("    Checked %ld lines of %ld (head+foot = %ld)\n\n",
    705 	  checked_linecnt,linecnt,linecnt-checked_linecnt);
    706 	g_print("    --------------- Queries found --------------\n");
    707 	if (cnt_long)
    708 	    g_print("    Long lines:		    %14ld\n",cnt_long);
    709 	if (cnt_short)
    710 	    g_print("    Short lines:		   %14ld\n",cnt_short);
    711 	if (cnt_lineend)
    712 	    g_print("    Line-end problems:	     %14ld\n",cnt_lineend);
    713 	if (cnt_word)
    714 	    g_print("    Common typos:		  %14ld\n",cnt_word);
    715 	if (cnt_quote)
    716 	    g_print("    Unmatched quotes:	      %14ld\n",cnt_quote);
    717 	if (cnt_brack)
    718 	    g_print("    Unmatched brackets:	    %14ld\n",cnt_brack);
    719 	if (cnt_bin)
    720 	    g_print("    Non-ASCII characters:	  %14ld\n",cnt_bin);
    721 	if (cnt_odd)
    722 	    g_print("    Proofing characters:	   %14ld\n",cnt_odd);
    723 	if (cnt_punct)
    724 	    g_print("    Punctuation & spacing queries: %14ld\n",cnt_punct);
    725 	if (cnt_dash)
    726 	    g_print("    Non-standard dashes:	   %14ld\n",cnt_dash);
    727 	if (cnt_html)
    728 	    g_print("    Possible HTML tags:	    %14ld\n",cnt_html);
    729 	g_print("\n");
    730 	g_print("    TOTAL QUERIES		  %14ld\n",
    731 	  cnt_quote+cnt_brack+cnt_bin+cnt_odd+cnt_long+cnt_short+cnt_punct+
    732 	  cnt_dash+cnt_word+cnt_html+cnt_lineend);
    733     }
    734     g_free(running_from);
    735     if (usertypo)
    736 	g_tree_unref(usertypo);
    /* Releases the character set name and any open converter */
    737     set_charset(NULL,NULL);
    738     if (config)
    739 	g_key_file_free(config);
    740     return 0;
    741 }
   742 
   743 void count_dashes(const char *line,const char *dash,
   744   struct dash_results *results)
   745 {
   746     int i;
   747     gchar **tokens;
   748     gunichar pc,nc;
   749     gboolean spaced=FALSE,unspaced=FALSE,spaced2=FALSE;
   750     if (!*line)
   751 	return;
   752     tokens=g_strsplit(line,dash,0);
   753     if (tokens[1])
   754 	results->base++;
   755     for(i=1;tokens[i];i++)
   756     {
   757 	pc=g_utf8_get_char(g_utf8_prev_char(tokens[i-1]+strlen(tokens[i-1])));
   758 	nc=g_utf8_get_char(tokens[i]);
   759 	if (g_unichar_isspace(pc) || g_unichar_isspace(nc))
   760 	    spaced=TRUE;
   761 	if (g_unichar_isspace(pc) && g_unichar_isspace(nc))
   762 	    spaced2=TRUE;
   763 	else if (!g_unichar_isspace(pc) && !g_unichar_isspace(nc))
   764 	    unspaced=TRUE;
   765     }
   766     if (spaced)
   767 	results->space++;
   768     if (spaced2)
   769 	/* count of lines with em-dashes with spaces both sides */
   770 	results->non_PG_space++;
   771     if (unspaced)
   772 	/* count of lines with PG-type em-dashes with no spaces */
   773 	results->PG_space++;
   774     g_strfreev(tokens);
   775 }
   776 
   777 /*
   778  * first_pass:
   779  *
   780  * Run a first pass - verify that it's a valid PG
   781  * file, decide whether to report some things that
   782  * occur many times in the text like long or short
   783  * lines, non-standard dashes, etc.
   784  */
   785 struct first_pass_results *first_pass(const char *etext)
   786 {
   787     gunichar laststart=CHAR_SPACE;
   788     const char *s;
   789     gchar *lc_line;
   790     int i,j,lbytes,llen;
   791     gchar **lines;
   792     unsigned int lastlen=0,lastblen=0;
   793     long spline=0,nspline=0;
   794     static struct first_pass_results results={0};
   795     struct dash_results tmp_dash_results;
   796     gchar *inword;
   797     QuoteClass qc;
   798     lines=g_strsplit(etext,"\n",0);
   799     if (!lines[0])
   800     {
   801 	/* An empty etext has no terminators */
   802 	results.newlines=DOS_NEWLINES;
   803     }
   804     else if (!lines[1])
   805     {
   806 	/*
   807 	 * If there are no LFs, we don't have UNIX-style
   808 	 * terminators, but we might have OS9-style ones.
   809 	 */
   810 	results.newlines=OS9_NEWLINES;
   811 	g_strfreev(lines);
   812 	lines=g_strsplit(etext,"\r",0);
   813 	if (!lines[0] || !lines[1])
   814 	    /* Looks like we don't have any terminators at all */
   815 	    results.newlines=DOS_NEWLINES;
   816     }
   817     else
   818     {
   819 	/* We might have UNIX-style terminators */
   820 	results.newlines=UNIX_NEWLINES;
   821     }
   822     for (j=0;lines[j];j++)
   823     {
   824 	lbytes=strlen(lines[j]);
   825 	if (lbytes>0 && lines[j][lbytes-1]=='\r')
   826 	{
   827 	    results.newlines=DOS_NEWLINES;
   828 	    do
   829 	    {
   830 		lines[j][--lbytes]='\0';
   831 	    } while (lbytes>0 && lines[j][lbytes-1]=='\r');
   832 	}
   833 	llen=g_utf8_strlen(lines[j],lbytes);
   834 	linecnt++;
   835 	if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
   836 	  (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
   837 	{
   838 	    if (spline)
   839 		g_print("   --> Duplicate header?\n");
   840 	    spline=linecnt+1;   /* first line of non-header text, that is */
   841 	}
   842 	if (!strncmp(lines[j],"*** START",9) &&
   843 	  strstr(lines[j],"PROJECT GUTENBERG"))
   844 	{
   845 	    if (nspline)
   846 		g_print("   --> Duplicate header?\n");
   847 	    nspline=linecnt+1;   /* first line of non-header text, that is */
   848 	}
   849 	if (spline || nspline)
   850 	{
   851 	    lc_line=g_utf8_strdown(lines[j],lbytes);
   852 	    if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
   853 	    {
   854 		if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
   855 		{
   856 		    if (results.footerline)
   857 		    {
   858 			/* it's an old-form header - we can detect duplicates */
   859 			if (!nspline)
   860 			    g_print("   --> Duplicate footer?\n");
   861 		    }
   862 		    else
   863 			results.footerline=linecnt;
   864 		}
   865 	    }
   866 	    g_free(lc_line);
   867 	}
   868 	if (spline)
   869 	    results.firstline=spline;
   870 	if (nspline)
   871 	    results.firstline=nspline;  /* override with new */
   872 	if (results.footerline)
   873 	    continue;    /* don't count the boilerplate in the footer */
   874 	results.totlen+=llen;
   875 	for (s=lines[j];*s;s=g_utf8_next_char(s))
   876 	{
   877 	    if (g_utf8_get_char(s)>127)
   878 		results.binlen++;
   879 	    if (g_unichar_isalpha(g_utf8_get_char(s)))
   880 		results.alphalen++;
   881 	    if (s>lines[j])
   882 	    {
   883 		if (CHAR_IS_DQUOTE(g_utf8_get_char(s)))
   884 		    qc=QUOTE_CLASS(g_utf8_get_char(s));
   885 		else
   886 		    qc=INVALID_QUOTE;
   887 		if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) &&
   888 		  g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
   889 		    results.endquote_count++;
   890 	    }
   891 	}
   892 	if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
   893 	  lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
   894 	    results.shortline++;
   895 	if (lbytes>0 &&
   896 	  g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
   897 	    cnt_spacend++;
   898 	if (strstr(lines[j],".,"))
   899 	    results.dotcomma++;
   900 	/* only count ast lines for ignoring purposes where there is */
   901 	/* locase text on the line */
   902 	if (strchr(lines[j],'*'))
   903 	{
   904 	    for (s=lines[j];*s;s=g_utf8_next_char(s))
   905 		if (g_unichar_islower(g_utf8_get_char(s)))
   906 		    break;
   907 	    if (*s)
   908 		results.astline++;
   909 	}
   910 	if (strchr(lines[j],'/'))
   911 	    results.fslashline++;
   912 	if (lbytes>0)
   913 	{
   914 	    for (s=g_utf8_prev_char(lines[j]+lbytes);
   915 	      s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;
   916 	      s=g_utf8_prev_char(s))
   917 		;
   918 	    if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
   919 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')
   920 		results.hyphens++;
   921 	}
   922 	if (llen>LONGEST_PG_LINE)
   923 	    results.longline++;
   924 	if (llen>WAY_TOO_LONG)
   925 	    results.verylongline++;
   926 	if (strchr(lines[j],'<') && strchr(lines[j],'>'))
   927 	{
   928 	    i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
   929 	    if (i>0)
   930 		results.htmcount++;
   931 	    if (strstr(lines[j],"<i>"))
   932 		results.htmcount+=4; /* bonus marks! */
   933 	}
   934 	/* Check for spaced em-dashes */
   935 	memset(&tmp_dash_results,0,sizeof(tmp_dash_results));
   936 	count_dashes(lines[j],"--",&tmp_dash_results);
   937 	count_dashes(lines[j],"—",&tmp_dash_results);
   938 	if (tmp_dash_results.base)
   939 	    results.emdash.base++;
   940 	if (tmp_dash_results.non_PG_space)
   941 	    results.emdash.non_PG_space++;
   942 	if (tmp_dash_results.PG_space)
   943 	    results.emdash.PG_space++;
   944 	for (s=lines[j];*s;)
   945 	{
   946 	    inword=getaword(&s);
   947 	    if (!strcmp(inword,"hij") || !strcmp(inword,"niet")) 
   948 		results.Dutchcount++;
   949 	    if (!strcmp(inword,"dans") || !strcmp(inword,"avec")) 
   950 		results.Frenchcount++;
   951 	    if (!strcmp(inword,"0") || !strcmp(inword,"1")) 
   952 		results.standalone_digit++;
   953 	    g_free(inword);
   954 	}
   955 	/* Check for spaced dashes */
   956 	if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
   957 	    results.spacedash++;
   958 	lastblen=lastlen;
   959 	lastlen=llen;
   960 	laststart=lines[j][0];
   961     }
   962     g_strfreev(lines);
   963     return &results;
   964 }
   965 
   966 /*
   967  * report_first_pass:
   968  *
   969  * Make some snap decisions based on the first pass results.
   970  */
   971 struct warnings *report_first_pass(struct first_pass_results *results)
   972 {
   973     static struct warnings warnings={0};
   974     warnings.newlines=results->newlines;
   975     if (warnings.newlines==UNIX_NEWLINES)
   976 	g_print("   --> No lines in this file have a CR. Not reporting them. "
   977 	  "Project Gutenberg requires that all lineends be CR-LF.\n");
   978     else if (warnings.newlines==OS9_NEWLINES)
   979 	g_print("   --> No lines in this file have a LF. Not reporting them. "
   980 	  "Project Gutenberg requires that all lineends be CR-LF.\n");
   981     if (cnt_spacend>0)
   982 	g_print("   --> %ld lines in this file have white space at end\n",
   983 	  cnt_spacend);
   984     warnings.dotcomma=1;
   985     if (results->dotcomma>5)
   986     {
   987 	warnings.dotcomma=0;
   988 	g_print("   --> %ld lines in this file contain '.,'. "
   989 	  "Not reporting them.\n",results->dotcomma);
   990     }
   991     /*
   992      * If more than 50 lines, or one-tenth, are short,
   993      * don't bother reporting them.
   994      */
   995     warnings.shortline=1;
   996     if (results->shortline>50 || results->shortline*10>linecnt)
   997     {
   998 	warnings.shortline=0;
   999 	g_print("   --> %ld lines in this file are short. "
  1000 	  "Not reporting short lines.\n",results->shortline);
  1001     }
  1002     /*
  1003      * If more than 50 lines, or one-tenth, are long,
  1004      * don't bother reporting them.
  1005      */
  1006     warnings.longline=1;
  1007     if (results->longline>50 || results->longline*10>linecnt)
  1008     {
  1009 	warnings.longline=0;
  1010 	g_print("   --> %ld lines in this file are long. "
  1011 	  "Not reporting long lines.\n",results->longline);
  1012     }
  1013     /* If more than 10 lines contain asterisks, don't bother reporting them. */
  1014     warnings.ast=1;
  1015     if (results->astline>10)
  1016     {
  1017 	warnings.ast=0;
  1018 	g_print("   --> %ld lines in this file contain asterisks. "
  1019 	  "Not reporting them.\n",results->astline);
  1020     }
  1021     /*
  1022      * If more than 10 lines contain forward slashes,
  1023      * don't bother reporting them.
  1024      */
  1025     warnings.fslash=1;
  1026     if (results->fslashline>10)
  1027     {
  1028 	warnings.fslash=0;
  1029 	g_print("   --> %ld lines in this file contain forward slashes. "
  1030 	  "Not reporting them.\n",results->fslashline);
  1031     }
  1032     /*
  1033      * If more than 20 lines contain unpunctuated endquotes,
  1034      * don't bother reporting them.
  1035      */
  1036     warnings.endquote=1;
  1037     if (results->endquote_count>20)
  1038     {
  1039 	warnings.endquote=0;
  1040 	g_print("   --> %ld lines in this file contain unpunctuated endquotes. "
  1041 	  "Not reporting them.\n",results->endquote_count);
  1042     }
  1043     /*
  1044      * If more than 15 lines contain standalone digits,
  1045      * don't bother reporting them.
  1046      */
  1047     warnings.digit=1;
  1048     if (results->standalone_digit>10)
  1049     {
  1050 	warnings.digit=0;
  1051 	g_print("   --> %ld lines in this file contain standalone 0s and 1s. "
  1052 	  "Not reporting them.\n",results->standalone_digit);
  1053     }
  1054     /*
  1055      * If more than 20 lines contain hyphens at end,
  1056      * don't bother reporting them.
  1057      */
  1058     warnings.hyphen=1;
  1059     if (results->hyphens>20)
  1060     {
  1061 	warnings.hyphen=0;
  1062 	g_print("   --> %ld lines in this file have hyphens at end. "
  1063 	  "Not reporting them.\n",results->hyphens);
  1064     }
  1065     if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
  1066     {
  1067 	g_print("   --> Looks like this is HTML. Switching HTML mode ON.\n");
  1068 	pswit[MARKUP_SWITCH]=1;
  1069     }
  1070     if (results->verylongline>0)
  1071 	g_print("   --> %ld lines in this file are VERY long!\n",
  1072 	  results->verylongline);
  1073     /*
  1074      * If there are more non-PG spaced dashes than PG em-dashes,
  1075      * assume it's deliberate.
  1076      * Current PG guidelines say don't use them, but older texts do,
  1077      * and some people insist on them whatever the guidelines say.
  1078      */
  1079     warnings.dash=1;
  1080     if (results->spacedash+results->emdash.non_PG_space>
  1081       results->emdash.PG_space)
  1082     {
  1083 	warnings.dash=0;
  1084 	g_print("   --> There are %ld spaced dashes and em-dashes. "
  1085 	  "Not reporting them.\n",
  1086 	  results->spacedash+results->emdash.non_PG_space);
  1087     }
  1088     if (charset)
  1089 	warnings.bin=0;
  1090     else
  1091     {
  1092 	/* Charset ISO_8859-1/ASCII checks for compatibility with gutcheck */
  1093 	warnings.bin=1;
  1094 	/* If more than a quarter of characters are hi-bit, bug out. */
  1095 	if (results->binlen*4>results->totlen)
  1096 	{
  1097 	    g_print("   --> This file does not appear to be ASCII. "
  1098 	      "Terminating. Best of luck with it!\n");
  1099 	    exit(1);
  1100 	}
  1101 	if (results->alphalen*4<results->totlen)
  1102 	{
  1103 	    g_print("   --> This file does not appear to be text. "
  1104 	      "Terminating. Best of luck with it!\n");
  1105 	    exit(1);
  1106 	}
  1107 	if (results->binlen*100>results->totlen || results->binlen>100)
  1108 	{
  1109 	    g_print("   --> There are a lot of foreign letters here. "
  1110 	      "Not reporting them.\n");
  1111 	    if (!pswit[VERBOSE_SWITCH])
  1112 		warnings.bin=0;
  1113 	}
  1114     }
  1115     warnings.isDutch=FALSE;
  1116     if (results->Dutchcount>50)
  1117     {
  1118 	warnings.isDutch=TRUE;
  1119 	g_print("   --> This looks like Dutch - "
  1120 	  "switching off dashes and warnings for 's Middags case.\n");
  1121     }
  1122     warnings.isFrench=FALSE;
  1123     if (results->Frenchcount>50)
  1124     {
  1125 	warnings.isFrench=TRUE;
  1126 	g_print("   --> This looks like French - "
  1127 	  "switching off some doublepunct.\n");
  1128     }
  1129     if (results->firstline && results->footerline)
  1130 	g_print("    The PG header and footer appear to be already on.\n");
  1131     else
  1132     {
  1133 	if (results->firstline)
  1134 	    g_print("    The PG header is on - no footer.\n");
  1135 	if (results->footerline)
  1136 	    g_print("    The PG footer is on - no header.\n");
  1137     }
  1138     g_print("\n");
  1139     if (pswit[VERBOSE_SWITCH])
  1140     {
  1141 	warnings.shortline=1;
  1142 	warnings.dotcomma=1;
  1143 	warnings.longline=1;
  1144 	warnings.dash=1;
  1145 	warnings.digit=1;
  1146 	warnings.ast=1;
  1147 	warnings.fslash=1;
  1148 	warnings.hyphen=1;
  1149 	warnings.endquote=1;
  1150 	g_print("   *** Verbose output is ON -- you asked for it! ***\n");
  1151     }
  1152     if (warnings.isDutch)
  1153 	warnings.dash=0;
  1154     if (results->footerline>0 && results->firstline>0 &&
  1155       results->footerline>results->firstline &&
  1156       results->footerline-results->firstline<100)
  1157     {
  1158 	g_print("   --> I don't really know where this text starts. \n");
  1159 	g_print("       There are no reference points.\n");
  1160 	g_print("       I'm going to have to report the header and footer "
  1161 	  "as well.\n");
  1162 	results->firstline=0;
  1163     }
  1164     return &warnings;
  1165 }
  1166 
  1167 /*
  1168  * analyse_quotes:
  1169  *
  1170  * Look along the line, accumulate the count of quotes, and see
  1171  * if this is an empty line - i.e. a line with nothing on it
  1172  * but spaces.
  1173  * If line has just spaces, period, * and/or - on it, don't
  1174  * count it, since empty lines with asterisks or dashes to
  1175  * separate sections are common.
  1176  *
  1177  * Returns: TRUE if the line is empty.
  1178  */
  1179 gboolean analyse_quotes(const char *aline,struct counters *counters)
  1180 {
  1181     int guessquote=0;
  1182     /* assume the line is empty until proven otherwise */
  1183     gboolean isemptyline=TRUE;
  1184     const char *s=aline,*sprev,*snext;
  1185     gunichar c;
  1186     sprev=NULL;
  1187     GError *tmp_err=NULL;
  1188     while (*s)
  1189     {
  1190 	snext=g_utf8_next_char(s);
  1191 	c=g_utf8_get_char(s);
  1192 	if (CHAR_IS_DQUOTE(c))
  1193 	    (void)count_quote(counters,c,QUOTE_CLASS(c),&tmp_err);
  1194 	else if (CHAR_IS_SQUOTE(c) && pswit[SQUOTE_SWITCH])
  1195 	{
  1196 	    if (s==aline)
  1197 	    {
  1198 		/*
  1199 		 * At start of line, it can only be a quotation mark.
  1200 		 * Hardcode a very common exception!
  1201 		 */
  1202 		if (!g_str_has_prefix(snext,"tis") &&
  1203 		  !g_str_has_prefix(snext,"Tis"))
  1204 		    (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
  1205 	    }
  1206 	    else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
  1207 	      g_unichar_isalpha(g_utf8_get_char(snext)))
  1208 		/* Do nothing! it's definitely an apostrophe, not a quote */
  1209 		;
  1210 	    /* it's outside a word - let's check it out */
  1211 	    else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||
  1212 	      g_unichar_isalpha(g_utf8_get_char(snext)))
  1213 	    {
  1214 		/* certainly looks like a quotation mark */
  1215 		if (!g_str_has_prefix(snext,"tis") &&
  1216 		  !g_str_has_prefix(snext,"Tis"))
  1217 		    /* hardcode a very common exception! */
  1218 		{
  1219 		    if (strchr(".?!,;:",g_utf8_get_char(sprev)))
  1220 			(void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
  1221 		    else
  1222 			(void)count_quote(counters,c,OPENING_QUOTE,&tmp_err);
  1223 		}
  1224 	    }
  1225 	    else
  1226 	    {
  1227 		/* now - is it a quotation mark? */
  1228 		guessquote=0;   /* accumulate clues */
  1229 		if (g_unichar_isalpha(g_utf8_get_char(sprev)))
  1230 		{
  1231 		    /* it follows a letter - could be either */
  1232 		    guessquote++;
  1233 		    if (g_utf8_get_char(sprev)=='s')
  1234 		    {
  1235 			/* looks like a plural apostrophe */
  1236 			guessquote-=3;
  1237 			if (g_utf8_get_char(snext)==CHAR_SPACE)
  1238 			    /* bonus marks! */
  1239 			    guessquote-=2;
  1240 		    }
  1241 		    if (innermost_quote_matches(counters,c))
  1242 			/*
  1243 			 * Give it the benefit of some doubt,
  1244 			 * if a squote is already open.
  1245 			 */
  1246 			guessquote++;
  1247 		    else
  1248 			guessquote--;
  1249 		    if (guessquote>=0)
  1250 			(void)count_quote(counters,c,CLOSING_QUOTE,&tmp_err);
  1251 		}
  1252 		else
  1253 		    /* no adjacent letter - it must be a quote of some kind */
  1254 		    (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
  1255 	    }
  1256 	}
  1257 	if (tmp_err)
  1258 	{
  1259 	    if (pswit[ECHO_SWITCH])
  1260 		g_print("\n%s\n",aline);
  1261 	    if (!pswit[OVERVIEW_SWITCH])
  1262 		g_print("    Line %ld column %ld - %s\n",
  1263 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1,tmp_err->message);
  1264 	    g_clear_error(&tmp_err);
  1265 	}
  1266 	if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
  1267 	  c!='\r' && c!='\n')
  1268 	    isemptyline=FALSE;  /* ignore lines like  *  *  *  as spacers */
  1269 	if (c==CHAR_UNDERSCORE)
  1270 	    counters->c_unders++;
  1271 	if (c==CHAR_OPEN_SBRACK)
  1272 	{
  1273 	    if (!matching_difference(counters,COUNTER_ILLUSTRATION) &&
  1274 	      !matching_difference(counters,c) && s==aline &&
  1275 	      g_str_has_prefix(s,"[Illustration:"))
  1276 		increment_matching(counters,COUNTER_ILLUSTRATION,TRUE);
  1277 	    else
  1278 		increment_matching(counters,c,TRUE);
  1279 	}
  1280 	else if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK)
  1281 	    increment_matching(counters,c,TRUE);
  1282 	if (c==CHAR_CLOSE_SBRACK)
  1283 	{
  1284 	    if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) &&
  1285 	      !matching_difference(counters,c) && !*snext)
  1286 		increment_matching(counters,COUNTER_ILLUSTRATION,FALSE);
  1287 	    else
  1288 		increment_matching(counters,c,FALSE);
  1289 	}
  1290 	else if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK)
  1291 	    increment_matching(counters,c,FALSE);
  1292 	sprev=s;
  1293 	s=snext;
  1294     }
  1295     return isemptyline;
  1296 }
  1297 
  1298 /*
  1299  * check_for_control_characters:
  1300  *
  1301  * Check for invalid or questionable characters in the line
  1302  * Anything above 127 is invalid for plain ASCII, and
  1303  * non-printable control characters should also be flagged.
  1304  * Tabs should generally not be there.
  1305  */
  1306 void check_for_control_characters(const char *aline)
  1307 {
  1308     gunichar c;
  1309     const char *s;
  1310     for (s=aline;*s;s=g_utf8_next_char(s))
  1311     {
  1312 	c=g_utf8_get_char(s);
  1313 	if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
  1314 	{
  1315 	    if (pswit[ECHO_SWITCH])
  1316 		g_print("\n%s\n",aline);
  1317 	    if (!pswit[OVERVIEW_SWITCH])
  1318 		g_print("    Line %ld column %ld - Control character %u\n",
  1319 		  linecnt,g_utf8_pointer_to_offset(s,aline)+1,c);
  1320 	    else
  1321 		cnt_bin++;
  1322 	}
  1323     }
  1324 }
  1325 
  1326 /*
  1327  * check_for_odd_characters:
  1328  *
  1329  * Check for binary and other odd characters.
  1330  */
  1331 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
  1332   gboolean isemptyline)
  1333 {
  1334     /* Don't repeat multiple warnings on one line. */
  1335     gboolean eInvalidChar=FALSE,eTab=FALSE,eTilde=FALSE;
  1336     gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
  1337     const char *s;
  1338     gunichar c;
  1339     gsize nb;
  1340     gchar *t;
  1341     for (s=aline;*s;s=g_utf8_next_char(s))
  1342     {
  1343 	c=g_utf8_get_char(s);
  1344 	if (warnings->bin && !eInvalidChar &&
  1345 	  (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
  1346 	{
  1347 	    if (pswit[ECHO_SWITCH])
  1348 		g_print("\n%s\n",aline);
  1349 	    if (!pswit[OVERVIEW_SWITCH])
  1350 		if (c>127 && c<160 || c>255)
  1351 		    g_print("    Line %ld column %ld - "
  1352 		      "Non-ISO-8859 character %u\n",
  1353 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1354 		else
  1355 		    g_print("    Line %ld column %ld - "
  1356 		      "Non-ASCII character %u\n",
  1357 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1358 	    else
  1359 		cnt_bin++;
  1360 	    eInvalidChar=TRUE;
  1361 	}
  1362 	if (!eInvalidChar && charset)
  1363 	{
  1364 	    if (charset_validator==(GIConv)-1)
  1365 	    {
  1366 		if (!g_unichar_isdefined(c))
  1367 		{
  1368 		    if (pswit[ECHO_SWITCH])
  1369 			g_print("\n%s\n",aline);
  1370 		    if (!pswit[OVERVIEW_SWITCH])
  1371 			g_print("    Line %ld column %ld - Unassigned UNICODE "
  1372 			  "code point U+%04" G_GINT32_MODIFIER "X\n",
  1373 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1374 		    else
  1375 			cnt_bin++;
  1376 		    eInvalidChar=TRUE;
  1377 		}
  1378 		else if (c>=0xE000 && c<=0xF8FF || c>=0xF0000 && c<=0xFFFFD ||
  1379 		  c>=100000 && c<=0x10FFFD)
  1380 		{
  1381 		    if (pswit[ECHO_SWITCH])
  1382 			g_print("\n%s\n",aline);
  1383 		    if (!pswit[OVERVIEW_SWITCH])
  1384 			g_print("    Line %ld column %ld - Private Use "
  1385 			  "character U+%04" G_GINT32_MODIFIER "X\n",
  1386 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1387 		    else
  1388 			cnt_bin++;
  1389 		    eInvalidChar=TRUE;
  1390 		}
  1391 	    }
  1392 	    else
  1393 	    {
  1394 		t=g_convert_with_iconv(s,g_utf8_next_char(s)-s,
  1395 		  charset_validator,NULL,&nb,NULL);
  1396 		if (t)
  1397 		    g_free(t);
  1398 		else
  1399 		{
  1400 		    if (pswit[ECHO_SWITCH])
  1401 			g_print("\n%s\n",aline);
  1402 		    if (!pswit[OVERVIEW_SWITCH])
  1403 			g_print("    Line %ld column %ld - Non-%s "
  1404 			  "character %u\n",linecnt,
  1405 			  g_utf8_pointer_to_offset(aline,s)+1,charset,c);
  1406 		    else
  1407 			cnt_bin++;
  1408 		    eInvalidChar=TRUE;
  1409 		}
  1410 	    }
  1411 	}
  1412 	if (!eTab && c==CHAR_TAB)
  1413 	{
  1414 	    if (pswit[ECHO_SWITCH])
  1415 		g_print("\n%s\n",aline);
  1416 	    if (!pswit[OVERVIEW_SWITCH])
  1417 		g_print("    Line %ld column %ld - Tab character?\n",
  1418 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1419 	    else
  1420 		cnt_odd++;
  1421 	    eTab=TRUE;
  1422 	}
  1423 	if (!eTilde && c==CHAR_TILDE)
  1424 	{
  1425 	    /*
  1426 	     * Often used by OCR software to indicate an
  1427 	     * unrecognizable character.
  1428 	     */
  1429 	    if (pswit[ECHO_SWITCH])
  1430 		g_print("\n%s\n",aline);
  1431 	    if (!pswit[OVERVIEW_SWITCH])
  1432 		g_print("    Line %ld column %ld - Tilde character?\n",
  1433 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1434 	    else
  1435 		cnt_odd++;
  1436 	    eTilde=TRUE;
  1437 	}
  1438 	if (!eCarat && c==CHAR_CARAT)
  1439 	{  
  1440 	    if (pswit[ECHO_SWITCH])
  1441 		g_print("\n%s\n",aline);
  1442 	    if (!pswit[OVERVIEW_SWITCH])
  1443 		g_print("    Line %ld column %ld - Carat character?\n",
  1444 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1445 	    else
  1446 		cnt_odd++;
  1447 	    eCarat=TRUE;
  1448 	}
  1449 	if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
  1450 	{  
  1451 	    if (pswit[ECHO_SWITCH])
  1452 		g_print("\n%s\n",aline);
  1453 	    if (!pswit[OVERVIEW_SWITCH])
  1454 		g_print("    Line %ld column %ld - Forward slash?\n",
  1455 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1456 	    else
  1457 		cnt_odd++;
  1458 	    eFSlash=TRUE;
  1459 	}
  1460 	/*
  1461 	 * Report asterisks only in paranoid mode,
  1462 	 * since they're often deliberate.
  1463 	 */
  1464 	if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
  1465 	  c==CHAR_ASTERISK)
  1466 	{
  1467 	    if (pswit[ECHO_SWITCH])
  1468 		g_print("\n%s\n",aline);
  1469 	    if (!pswit[OVERVIEW_SWITCH])
  1470 		g_print("    Line %ld column %ld - Asterisk?\n",
  1471 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1472 	    else
  1473 		cnt_odd++;
  1474 	    eAst=TRUE;
  1475 	}
  1476     }
  1477 }
  1478 
  1479 /*
  1480  * check_for_long_line:
  1481  *
  1482  * Check for line too long.
  1483  */
  1484 void check_for_long_line(const char *aline)
  1485 {
  1486     if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
  1487     {
  1488 	if (pswit[ECHO_SWITCH])
  1489 	    g_print("\n%s\n",aline);
  1490 	if (!pswit[OVERVIEW_SWITCH])
  1491 	    g_print("    Line %ld column %ld - Long line %ld\n",
  1492 	      linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
  1493 	else
  1494 	    cnt_long++;
  1495     }
  1496 }
  1497 
  1498 /*
  1499  * check_for_short_line:
  1500  *
  1501  * Check for line too short.
  1502  *
  1503  * This one is a bit trickier to implement: we don't want to
  1504  * flag the last line of a paragraph for being short, so we
  1505  * have to wait until we know that our current line is a
  1506  * "normal" line, then report the _previous_ line if it was too
  1507  * short. We also don't want to report indented lines like
  1508  * chapter heads or formatted quotations. We therefore keep
  1509  * last->len as the length of the last line examined, and
  1510  * last->blen as the length of the last but one, and try to
  1511  * suppress unnecessary warnings by checking that both were of
  1512  * "normal" length. We keep the first character of the last
  1513  * line in last->start, and if it was a space, we assume that
  1514  * the formatting is deliberate. I can't figure out a way to
  1515  * distinguish something like a quoted verse left-aligned or
  1516  * the header or footer of a letter from a paragraph of short
  1517  * lines - maybe if I examined the whole paragraph, and if the
  1518  * para has less than, say, 8 lines and if all lines are short,
  1519  * then just assume it's OK? Need to look at some texts to see
  1520  * how often a formula like this would get the right result.
  1521  */
  1522 void check_for_short_line(const char *aline,const struct line_properties *last)
  1523 {
  1524     if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
  1525       last->len<SHORTEST_PG_LINE && last->blen>1 &&
  1526       last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
  1527     {
  1528 	if (pswit[ECHO_SWITCH])
  1529 	    g_print("\n%s\n",prevline);
  1530 	if (!pswit[OVERVIEW_SWITCH])
  1531 	    g_print("    Line %ld column %ld - Short line %ld?\n",
  1532 	      linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
  1533 	else
  1534 	    cnt_short++;
  1535     }
  1536 }
  1537 
  1538 /*
  1539  * check_for_starting_punctuation:
  1540  *
  1541  * Look for punctuation other than full ellipses at start of line.
  1542  */
  1543 void check_for_starting_punctuation(const char *aline)
  1544 {
  1545     if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
  1546       !g_str_has_prefix(aline,". . ."))
  1547     {
  1548 	if (pswit[ECHO_SWITCH])
  1549 	    g_print("\n%s\n",aline);
  1550 	if (!pswit[OVERVIEW_SWITCH])
  1551 	    g_print("    Line %ld column 1 - Begins with punctuation?\n",
  1552 	      linecnt);
  1553 	else
  1554 	    cnt_punct++;
  1555     }
  1556 }
  1557 
  1558 /*
  1559  * str_emdash:
  1560  *
  1561  * Find the first em-dash, return a pointer to it and set <next> to the
  1562  * character following the dash.
  1563  */
  1564 char *str_emdash(const char *s,const char **next)
  1565 {
  1566     const char *s1,*s2;
  1567     s1=strstr(s,"--");
  1568     s2=strstr(s,"—");
  1569     if (!s1)
  1570     {
  1571 	if (s2)
  1572 	    *next=g_utf8_next_char(s2);
  1573 	return (char *)s2;
  1574     }
  1575     else if (!s2)
  1576     {
  1577 	*next=g_utf8_next_char(g_utf8_next_char(s1));
  1578 	return (char *)s1;
  1579     }
  1580     else if (s1<s2)
  1581     {
  1582 	*next=g_utf8_next_char(g_utf8_next_char(s1));
  1583 	return (char *)s1;
  1584     }
  1585     else
  1586     {
  1587 	*next=g_utf8_next_char(s2);
  1588 	return (char *)s2;
  1589     }
  1590 }
  1591 
  1592 /*
  1593  * check_for_spaced_emdash:
  1594  *
  1595  * Check for spaced em-dashes.
  1596  *
  1597  * We must check _all_ occurrences of em-dashes on the line
  1598  * hence the loop - even if the first dash is OK
  1599  * there may be another that's wrong later on.
  1600  */
  1601 void check_for_spaced_emdash(const char *aline)
  1602 {
  1603     const char *s,*t,*next;
  1604     for (s=aline;t=str_emdash(s,&next);s=next)
  1605     {
  1606 	if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
  1607 	  g_utf8_get_char(next)==CHAR_SPACE)
  1608 	{
  1609 	    if (pswit[ECHO_SWITCH])
  1610 		g_print("\n%s\n",aline);
  1611 	    if (!pswit[OVERVIEW_SWITCH])
  1612 		g_print("    Line %ld column %ld - Spaced em-dash?\n",
  1613 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  1614 	    else
  1615 		cnt_dash++;
  1616 	}
  1617     }
  1618 }
  1619 
  1620 /*
  1621  * check_for_spaced_dash:
  1622  *
  1623  * Check for spaced dashes.
  1624  */
  1625 void check_for_spaced_dash(const char *aline)
  1626 {
  1627     const char *s;
  1628     if ((s=strstr(aline," -")))
  1629     {
  1630 	if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
  1631 	{
  1632 	    if (pswit[ECHO_SWITCH])
  1633 		g_print("\n%s\n",aline);
  1634 	    if (!pswit[OVERVIEW_SWITCH])
  1635 		g_print("    Line %ld column %ld - Spaced dash?\n",
  1636 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1637 	    else
  1638 		cnt_dash++;
  1639 	}
  1640     }
  1641     else if ((s=strstr(aline,"- ")))
  1642     {
  1643 	if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
  1644 	{
  1645 	    if (pswit[ECHO_SWITCH])
  1646 		g_print("\n%s\n",aline);
  1647 	    if (!pswit[OVERVIEW_SWITCH])
  1648 		g_print("    Line %ld column %ld - Spaced dash?\n",
  1649 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1650 	    else
  1651 		cnt_dash++;
  1652 	}
  1653     }
  1654 }
  1655 
  1656 /*
  1657  * check_for_unmarked_paragraphs:
  1658  *
  1659  * Check for unmarked paragraphs indicated by separate speakers.
  1660  *
  1661  * May well be false positive:
  1662  * "Bravo!" "Wonderful!" called the crowd.
  1663  * but useful all the same.
  1664  */
  1665 void check_for_unmarked_paragraphs(const char *aline)
  1666 {
  1667     const char *s;
  1668     s=strstr(aline,"\"  \"");
  1669     if (!s)
  1670 	s=strstr(aline,"\" \"");
  1671     if (s)
  1672     {
  1673 	if (pswit[ECHO_SWITCH])
  1674 	    g_print("\n%s\n",aline);
  1675 	if (!pswit[OVERVIEW_SWITCH])
  1676 	    g_print("    Line %ld column %ld - "
  1677 	      "Query missing paragraph break?\n",
  1678 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1679 	else
  1680 	    cnt_punct++;
  1681     }
  1682 }
  1683 
  1684 /*
  1685  * check_for_jeebies:
  1686  *
  1687  * Check for "to he" and other easy h/b errors.
  1688  *
  1689  * This is a very inadequate effort on the h/b problem,
  1690  * but the phrase "to he" is always an error, whereas "to
  1691  * be" is quite common.
  1692  * Similarly, '"Quiet!", be said.' is a non-be error
  1693  * "to he" is _not_ always an error!:
  1694  *       "Where they went to he couldn't say."
  1695  * Another false positive:
  1696  *       What would "Cinderella" be without the . . .
  1697  * and another: "If he wants to he can see for himself."
  1698  */
  1699 void check_for_jeebies(const char *aline)
  1700 {
  1701     const char *s;
  1702     s=strstr(aline," be could ");
  1703     if (!s)
  1704 	s=strstr(aline," be would ");
  1705     if (!s)
  1706 	s=strstr(aline," was be ");
  1707     if (!s)
  1708 	s=strstr(aline," be is ");
  1709     if (!s)
  1710 	s=strstr(aline," is be ");
  1711     if (!s)
  1712 	s=strstr(aline,"\", be ");
  1713     if (!s)
  1714 	s=strstr(aline,"\" be ");
  1715     if (!s)
  1716 	s=strstr(aline,"\" be ");
  1717     if (!s)
  1718 	s=strstr(aline," to he ");
  1719     if (s)
  1720     {
  1721 	if (pswit[ECHO_SWITCH])
  1722 	    g_print("\n%s\n",aline);
  1723 	if (!pswit[OVERVIEW_SWITCH])
  1724 	    g_print("    Line %ld column %ld - Query he/be error?\n",
  1725 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1726 	else
  1727 	    cnt_word++;
  1728     }
  1729     s=strstr(aline," the had ");
  1730     if (!s)
  1731 	s=strstr(aline," a had ");
  1732     if (!s)
  1733 	s=strstr(aline," they bad ");
  1734     if (!s)
  1735 	s=strstr(aline," she bad ");
  1736     if (!s)
  1737 	s=strstr(aline," he bad ");
  1738     if (!s)
  1739 	s=strstr(aline," you bad ");
  1740     if (!s)
  1741 	s=strstr(aline," i bad ");
  1742     if (s)
  1743     {
  1744 	if (pswit[ECHO_SWITCH])
  1745 	    g_print("\n%s\n",aline);
  1746 	if (!pswit[OVERVIEW_SWITCH])
  1747 	    g_print("    Line %ld column %ld - Query had/bad error?\n",
  1748 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1749 	else
  1750 	    cnt_word++;
  1751     }
  1752     s=strstr(aline,"; hut ");
  1753     if (!s)
  1754 	s=strstr(aline,", hut ");
  1755     if (s)
  1756     {
  1757 	if (pswit[ECHO_SWITCH])
  1758 	    g_print("\n%s\n",aline);
  1759 	if (!pswit[OVERVIEW_SWITCH])
  1760 	    g_print("    Line %ld column %ld - Query hut/but error?\n",
  1761 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1762 	else
  1763 	    cnt_word++;
  1764     }
  1765 }
  1766 
  1767 /*
  1768  * check_for_mta_from:
  1769  *
  1770  * Special case - angled bracket in front of "From" placed there by an
  1771  * MTA when sending an e-mail.
  1772  */
  1773 void check_for_mta_from(const char *aline)
  1774 {
  1775     const char *s;
  1776     s=strstr(aline,">From");
  1777     if (s)
  1778     {
  1779 	if (pswit[ECHO_SWITCH])
  1780 	    g_print("\n%s\n",aline);
  1781 	if (!pswit[OVERVIEW_SWITCH])
  1782 	    g_print("    Line %ld column %ld - "
  1783 	      "Query angled bracket with From\n",
  1784 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1785 	else
  1786 	    cnt_punct++;
  1787     }
  1788 }
  1789 
  1790 /*
  1791  * check_for_orphan_character:
  1792  *
  1793  * Check for a single character line -
  1794  * often an overflow from bad wrapping.
  1795  */
  1796 void check_for_orphan_character(const char *aline)
  1797 {
  1798     gunichar c;
  1799     c=g_utf8_get_char(aline);
  1800     if (c && !*g_utf8_next_char(aline))
  1801     {
  1802 	if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
  1803 	    ; /* Nothing - ignore numerals alone on a line. */
  1804 	else
  1805 	{
  1806 	    if (pswit[ECHO_SWITCH])
  1807 		g_print("\n%s\n",aline);
  1808 	    if (!pswit[OVERVIEW_SWITCH])
  1809 		g_print("    Line %ld column 1 - Query single character line\n",
  1810 		  linecnt);
  1811 	    else
  1812 		cnt_punct++;
  1813 	}
  1814     }
  1815 }
  1816 
  1817 /*
  1818  * check_for_pling_scanno:
  1819  *
  1820  * Check for I" - often should be !
  1821  */
  1822 void check_for_pling_scanno(const char *aline)
  1823 {
  1824     const char *s;
  1825     s=strstr(aline," I\"");
  1826     if (s)
  1827     {
  1828 	if (pswit[ECHO_SWITCH])
  1829 	    g_print("\n%s\n",aline);
  1830 	if (!pswit[OVERVIEW_SWITCH])
  1831 	    g_print("    Line %ld column %ld - Query I=exclamation mark?\n",
  1832 	      linecnt,g_utf8_pointer_to_offset(aline,s));
  1833 	else
  1834 	    cnt_punct++;
  1835     }
  1836 }
  1837 
  1838 /*
  1839  * check_for_extra_period:
  1840  *
  1841  * Check for period without a capital letter. Cut-down from gutspell.
  1842  * Only works when it happens on a single line.
  1843  */
  1844 void check_for_extra_period(const char *aline,const struct warnings *warnings)
  1845 {
  1846     const char *s,*t,*s1,*sprev;
  1847     int i;
  1848     gsize len;
  1849     gboolean istypo;
  1850     gchar *testword;
  1851     gunichar c,nc,pc,*decomposition;
  1852     if (pswit[PARANOID_SWITCH])
  1853     {
  1854 	for (t=aline;t=strstr(t,". ");)
  1855 	{
  1856 	    if (t==aline)
  1857 	    {
  1858 		t=g_utf8_next_char(t);
  1859 		/* start of line punctuation is handled elsewhere */
  1860 		continue;
  1861 	    }
  1862 	    if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
  1863 	    {
  1864 		t=g_utf8_next_char(t);
  1865 		continue;
  1866 	    }
  1867 	    if (warnings->isDutch)
  1868 	    {
  1869 		/* For Frank & Jeroen -- 's Middags case */
  1870 		gunichar c2,c3,c4,c5;
  1871 		c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
  1872 		c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
  1873 		c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
  1874 		c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
  1875 		if (CHAR_IS_APOSTROPHE(c2) &&
  1876 		  g_unichar_islower(c3) && c4==CHAR_SPACE &&
  1877 		  g_unichar_isupper(c5))
  1878 		{
  1879 		    t=g_utf8_next_char(t);
  1880 		    continue;
  1881 		}
  1882 	    }
  1883 	    s1=g_utf8_next_char(g_utf8_next_char(t));
  1884 	    while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
  1885 	      !g_unichar_isdigit(g_utf8_get_char(s1)))
  1886 		s1=g_utf8_next_char(s1);
  1887 	    if (g_unichar_islower(g_utf8_get_char(s1)))
  1888 	    {
  1889 		/* we have something to investigate */
  1890 		istypo=TRUE;
  1891 		/* so let's go back and find out */
  1892 		nc=g_utf8_get_char(t);
  1893 		s1=g_utf8_prev_char(t);
  1894 		c=g_utf8_get_char(s1);
  1895 		sprev=g_utf8_prev_char(s1);
  1896 		pc=g_utf8_get_char(sprev);
  1897 		while (s1>=aline &&
  1898 		  (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||
  1899 		  g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&
  1900 		  g_unichar_isalpha(nc)))
  1901 		{
  1902 		    nc=c;
  1903 		    s1=sprev;
  1904 		    c=pc;
  1905 		    sprev=g_utf8_prev_char(s1);
  1906 		    pc=g_utf8_get_char(sprev);
  1907 		}
  1908 		s1=g_utf8_next_char(s1);
  1909 		s=strchr(s1,'.');
  1910 		if (s)
  1911 		    testword=g_strndup(s1,s-s1);
  1912 		else
  1913 		    testword=g_strdup(s1);
  1914 		for (i=0;*abbrev[i];i++)
  1915 		    if (!strcmp(testword,abbrev[i]))
  1916 			istypo=FALSE;
  1917 		if (g_unichar_isdigit(g_utf8_get_char(testword)))
  1918 		    istypo=FALSE;
  1919 		if (!*g_utf8_next_char(testword))
  1920 		    istypo=FALSE;
  1921 		if (isroman(testword))
  1922 		    istypo=FALSE;
  1923 		if (istypo)
  1924 		{
  1925 		    istypo=FALSE;
  1926 		    for (s=testword;*s;s=g_utf8_next_char(s))
  1927 		    {
  1928 			decomposition=g_unicode_canonical_decomposition(
  1929 			  g_utf8_get_char(s),&len);
  1930 			if (g_utf8_strchr("aeiou",-1,decomposition[0]))
  1931 			    istypo=TRUE;
  1932 			g_free(decomposition);
  1933 		    }
  1934 		}
  1935 		if (istypo &&
  1936 		  (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
  1937 		{
  1938 		    g_tree_insert(qperiod,g_strdup(testword),
  1939 		      GINT_TO_POINTER(1));
  1940 		    if (pswit[ECHO_SWITCH])
  1941 			g_print("\n%s\n",aline);
  1942 		    if (!pswit[OVERVIEW_SWITCH])
  1943 			g_print("    Line %ld column %ld - Extra period?\n",
  1944 			  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  1945 		    else
  1946 			cnt_punct++;
  1947 		}
  1948 		g_free(testword);
  1949 	    }
  1950 	    t=g_utf8_next_char(t);
  1951 	}
  1952     }
  1953 }
  1954 
  1955 /*
  1956  * check_for_following_punctuation:
  1957  *
  1958  * Check for words usually not followed by punctuation.
  1959  */
  1960 void check_for_following_punctuation(const char *aline)
  1961 {
  1962     int i;
  1963     const char *s,*wordstart;
  1964     gunichar c;
  1965     gchar *inword,*t;
  1966     if (pswit[TYPO_SWITCH])
  1967     {
  1968 	for (s=aline;*s;)
  1969 	{
  1970 	    wordstart=s;
  1971 	    t=getaword(&s);
  1972 	    if (!*t)
  1973 	    {
  1974 		g_free(t);
  1975 		continue;
  1976 	    }
  1977 	    inword=g_utf8_strdown(t,-1);
  1978 	    g_free(t);
  1979 	    for (i=0;*nocomma[i];i++)
  1980 		if (!strcmp(inword,nocomma[i]))
  1981 		{
  1982 		    c=g_utf8_get_char(s);
  1983 		    if (c==',' || c==';' || c==':')
  1984 		    {
  1985 			if (pswit[ECHO_SWITCH])
  1986 			    g_print("\n%s\n",aline);
  1987 			if (!pswit[OVERVIEW_SWITCH])
  1988 			    g_print("    Line %ld column %ld - "
  1989 			      "Query punctuation after %s?\n",
  1990 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,
  1991 			      inword);
  1992 			else
  1993 			    cnt_punct++;
  1994 		    }
  1995 		}
  1996 	    for (i=0;*noperiod[i];i++)
  1997 		if (!strcmp(inword,noperiod[i]))
  1998 		{
  1999 		    c=g_utf8_get_char(s);
  2000 		    if (c=='.' || c=='!')
  2001 		    {
  2002 			if (pswit[ECHO_SWITCH])
  2003 			    g_print("\n%s\n",aline);
  2004 			if (!pswit[OVERVIEW_SWITCH])
  2005 			    g_print("    Line %ld column %ld - "
  2006 			      "Query punctuation after %s?\n",
  2007 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,
  2008 			      inword);
  2009 			else
  2010 			    cnt_punct++;
  2011 		    }
  2012 		}
  2013 	    g_free(inword);
  2014 	}
  2015     }
  2016 }
  2017 
  2018 /*
  2019  * check_for_typos:
  2020  *
  2021  * Check for commonly mistyped words,
  2022  * and digits like 0 for O in a word.
  2023  */
  2024 void check_for_typos(const char *aline,struct warnings *warnings)
  2025 {
  2026     const char *s,*t,*nt,*wordstart;
  2027     gchar *inword;
  2028     gunichar *decomposition;
  2029     gchar *testword;
  2030     int i,vowel,consonant,*dupcnt;
  2031     gboolean isdup,istypo,alower;
  2032     gunichar c,pc;
  2033     long offset,len;
  2034     gsize decomposition_len;
  2035     for (s=aline;*s;)
  2036     {
  2037 	wordstart=s;
  2038 	inword=getaword(&s);
  2039 	if (!*inword)
  2040 	{
  2041 	    g_free(inword);
  2042 	    continue; /* don't bother with empty lines */
  2043 	}
  2044 	if (mixdigit(inword))
  2045 	{
  2046 	    if (pswit[ECHO_SWITCH])
  2047 		g_print("\n%s\n",aline);
  2048 	    if (!pswit[OVERVIEW_SWITCH])
  2049 		g_print("    Line %ld column %ld - Query digit in %s\n",
  2050 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
  2051 	    else
  2052 		cnt_word++;
  2053 	}
  2054 	/*
  2055 	 * Put the word through a series of tests for likely typos and OCR
  2056 	 * errors.
  2057 	 */
  2058 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
  2059 	{
  2060 	    istypo=FALSE;
  2061 	    alower=FALSE;
  2062 	    for (t=inword;*t;t=g_utf8_next_char(t))
  2063 	    {
  2064 		c=g_utf8_get_char(t);
  2065 		nt=g_utf8_next_char(t);
  2066 		/* lowercase for testing */
  2067 		if (g_unichar_islower(c))
  2068 		    alower=TRUE;
  2069 		if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
  2070 		{
  2071 		    /*
  2072 		     * We have an uppercase mid-word. However, there are
  2073 		     * common cases:
  2074 		     *   Mac and Mc like McGill
  2075 		     *   French contractions like l'Abbe
  2076 		     */
  2077 		    offset=g_utf8_pointer_to_offset(inword,t);
  2078 		    if (offset>0)
  2079 			pc=g_utf8_get_char(g_utf8_prev_char(t));
  2080 		    else
  2081 			pc='\0';
  2082 		    if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
  2083 		      offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
  2084 		      g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
  2085 		      CHAR_IS_APOSTROPHE(pc))
  2086 			; /* do nothing! */
  2087 		    else
  2088 			istypo=TRUE;
  2089 		}
  2090 	    }
  2091 	    testword=g_utf8_casefold(inword,-1);
  2092 	}
  2093 	if (pswit[TYPO_SWITCH])
  2094 	{
  2095 	    /*
  2096 	     * Check for certain unlikely two-letter combinations at word
  2097 	     * start and end.
  2098 	     */
  2099 	    len=g_utf8_strlen(testword,-1);
  2100 	    if (len>1)
  2101 	    {
  2102 		for (i=0;*nostart[i];i++)
  2103 		    if (g_str_has_prefix(testword,nostart[i]))
  2104 			istypo=TRUE;
  2105 		for (i=0;*noend[i];i++)
  2106 		    if (g_str_has_suffix(testword,noend[i]))
  2107 			istypo=TRUE;
  2108 	    }
  2109 	    /* ght is common, gbt never. Like that. */
  2110 	    if (strstr(testword,"cb"))
  2111 		istypo=TRUE;
  2112 	    if (strstr(testword,"gbt"))
  2113 		istypo=TRUE;
  2114 	    if (strstr(testword,"pbt"))
  2115 		istypo=TRUE;
  2116 	    if (strstr(testword,"tbs"))
  2117 		istypo=TRUE;
  2118 	    if (strstr(testword,"mrn"))
  2119 		istypo=TRUE;
  2120 	    if (strstr(testword,"ahle"))
  2121 		istypo=TRUE;
  2122 	    if (strstr(testword,"ihle"))
  2123 		istypo=TRUE;
  2124 	    /*
  2125 	     * "TBE" does happen - like HEARTBEAT - but uncommon.
  2126 	     * Also "TBI" - frostbite, outbid - but uncommon.
  2127 	     * Similarly "ii" like Hawaii, or Pompeii, and in Roman
  2128 	     * numerals, but "ii" is a common scanno.
  2129 	     */
  2130 	    if (strstr(testword,"tbi"))
  2131 		istypo=TRUE;
  2132 	    if (strstr(testword,"tbe"))
  2133 		istypo=TRUE;
  2134 	    if (strstr(testword,"ii"))
  2135 		istypo=TRUE;
  2136 	    /*
  2137 	     * Check for no vowels or no consonants.
  2138 	     * If none, flag a typo.
  2139 	     */
  2140 	    if (!istypo && len>1)
  2141 	    {
  2142 		vowel=consonant=0;
  2143 		for (t=testword;*t;t=g_utf8_next_char(t))
  2144 		{
  2145 		    c=g_utf8_get_char(t);
  2146 		    decomposition=
  2147 		      g_unicode_canonical_decomposition(c,&decomposition_len);
  2148 		    if (c=='y' || g_unichar_isdigit(c))
  2149 		    {
  2150 			/* Yah, this is loose. */
  2151 			vowel++;
  2152 			consonant++;
  2153 		    }
  2154 		    else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
  2155 			vowel++;
  2156 		    else
  2157 			consonant++;
  2158 		    g_free(decomposition);
  2159 		}
  2160 		if (!vowel || !consonant)
  2161 		    istypo=TRUE;
  2162 	    }
  2163 	    /*
  2164 	     * Now exclude the word from being reported if it's in
  2165 	     * the okword list.
  2166 	     */
  2167 	    for (i=0;*okword[i];i++)
  2168 		if (!strcmp(testword,okword[i]))
  2169 		    istypo=FALSE;
  2170 	    /*
  2171 	     * What looks like a typo may be a Roman numeral.
  2172 	     * Exclude these.
  2173 	     */
  2174 	    if (istypo && isroman(testword))
  2175 		istypo=FALSE;
  2176 	    /* Check the manual list of typos. */
  2177 	    if (!istypo)
  2178 		for (i=0;*typo[i];i++)
  2179 		    if (!strcmp(testword,typo[i]))
  2180 			istypo=TRUE;
  2181 	    /*
  2182 	     * Check lowercase s, l, i and m - special cases.
  2183 	     *   "j" - often a semi-colon gone wrong.
  2184 	     *   "d" for a missing apostrophe - he d
  2185 	     *   "n" for "in"
  2186 	     */
  2187 	    if (!istypo && len==1 &&
  2188 	      g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
  2189 		istypo=TRUE;
  2190 	    if (istypo)
  2191 	    {
  2192 		dupcnt=g_tree_lookup(qword,testword);
  2193 		if (dupcnt)
  2194 		{
  2195 		    (*dupcnt)++;
  2196 		    isdup=!pswit[VERBOSE_SWITCH];
  2197 		}
  2198 		else
  2199 		{
  2200 		    dupcnt=g_new0(int,1);
  2201 		    g_tree_insert(qword,g_strdup(testword),dupcnt);
  2202 		    isdup=FALSE;
  2203 		}
  2204 		if (!isdup)
  2205 		{
  2206 		    if (pswit[ECHO_SWITCH])
  2207 			g_print("\n%s\n",aline);
  2208 		    if (!pswit[OVERVIEW_SWITCH])
  2209 		    {
  2210 			g_print("    Line %ld column %ld - Query word %s",
  2211 			  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
  2212 			  inword);
  2213 			if (!pswit[VERBOSE_SWITCH])
  2214 			    g_print(" - not reporting duplicates");
  2215 			g_print("\n");
  2216 		    }
  2217 		    else
  2218 			cnt_word++;
  2219 		}
  2220 	    }
  2221 	}
  2222 	/* check the user's list of typos */
  2223 	if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
  2224 	{
  2225 	    if (pswit[ECHO_SWITCH])
  2226 		g_print("\n%s\n",aline);
  2227 	    if (!pswit[OVERVIEW_SWITCH])  
  2228 		g_print("    Line %ld column %ld - Query possible scanno %s\n",
  2229 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
  2230 	}
  2231 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
  2232 	    g_free(testword);
  2233 	if (pswit[PARANOID_SWITCH] && warnings->digit)
  2234 	{
  2235 	    /* In paranoid mode, query all 0 and 1 standing alone. */
  2236 	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))
  2237 	    {
  2238 		if (pswit[ECHO_SWITCH])
  2239 		    g_print("\n%s\n",aline);
  2240 		if (!pswit[OVERVIEW_SWITCH])
  2241 		    g_print("    Line %ld column %ld - Query standalone %s\n",
  2242 		      linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
  2243 		      inword);
  2244 		else
  2245 		    cnt_word++;
  2246 	    }
  2247 	}
  2248 	g_free(inword);
  2249     }
  2250 }
  2251 
  2252 /*
  2253  * check_for_misspaced_punctuation:
  2254  *
  2255  * Look for added or missing spaces around punctuation and quotes.
  2256  * If there is a punctuation character like ! with no space on
  2257  * either side, suspect a missing!space. If there are spaces on
  2258  * both sides , assume a typo. If we see a double quote with no
  2259  * space or punctuation on either side of it, assume unspaced
  2260  * quotes "like"this.
  2261  */
  2262 void check_for_misspaced_punctuation(const char *aline,
  2263   struct parities *parities,gboolean isemptyline)
  2264 {
  2265     gboolean isacro,isellipsis;
  2266     const char *s;
  2267     gunichar c,nc,pc,n2c;
  2268     int parity;
  2269     c=g_utf8_get_char(aline);
  2270     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2271     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2272     {
  2273 	pc=c;
  2274 	c=nc;
  2275 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2276 	/* For each character in the line after the first. */
  2277 	if (g_utf8_strchr(".?!,;:_",-1,c))  /* if it's punctuation */
  2278 	{
  2279 	    /* we need to suppress warnings for acronyms like M.D. */
  2280 	    isacro=FALSE;
  2281 	    /* we need to suppress warnings for ellipsis . . . */
  2282 	    isellipsis=FALSE;
  2283 	    /*
  2284 	     * If there are letters on both sides of it or
  2285 	     * if it's strict punctuation followed by an alpha.
  2286 	     */
  2287 	    if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
  2288 	      g_utf8_strchr("?!,;:",-1,c)))
  2289 	    {
  2290 		if (c=='.')
  2291 		{
  2292 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&
  2293 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
  2294 			isacro=TRUE;
  2295 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
  2296 		    if (nc && n2c=='.')
  2297 			isacro=TRUE;
  2298 		}
  2299 		if (!isacro)
  2300 		{
  2301 		    if (pswit[ECHO_SWITCH])
  2302 			g_print("\n%s\n",aline);
  2303 		    if (!pswit[OVERVIEW_SWITCH])
  2304 			g_print("    Line %ld column %ld - Missing space?\n",
  2305 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2306 		    else
  2307 			cnt_punct++;
  2308 		}
  2309 	    }
  2310 	    if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
  2311 	    {
  2312 		/*
  2313 		 * If there are spaces on both sides,
  2314 		 * or space before and end of line.
  2315 		 */
  2316 		if (c=='.')
  2317 		{
  2318 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&
  2319 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
  2320 			isellipsis=TRUE;
  2321 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
  2322 		    if (nc && n2c=='.')
  2323 			isellipsis=TRUE;
  2324 		}
  2325 		if (!isemptyline && !isellipsis)
  2326 		{
  2327 		    if (pswit[ECHO_SWITCH])
  2328 			g_print("\n%s\n",aline);
  2329 		    if (!pswit[OVERVIEW_SWITCH])
  2330 			g_print("    Line %ld column %ld - "
  2331 			  "Spaced punctuation?\n",linecnt,
  2332 			  g_utf8_pointer_to_offset(aline,s)+1);
  2333 		    else
  2334 			cnt_punct++;
  2335 		}
  2336 	    }
  2337 	}
  2338     }
  2339     /* Split out the characters that CANNOT be preceded by space. */
  2340     c=g_utf8_get_char(aline);
  2341     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2342     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2343     {
  2344 	pc=c;
  2345 	c=nc;
  2346 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2347 	/* for each character in the line after the first */
  2348 	if (g_utf8_strchr("?!,;:",-1,c))
  2349 	{
  2350 	    /* if it's punctuation that _cannot_ have a space before it */
  2351 	    if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
  2352 	    {
  2353 		/*
  2354 		 * If nc DOES == space,
  2355 		 * it was already reported just above.
  2356 		 */
  2357 		if (pswit[ECHO_SWITCH])
  2358 		    g_print("\n%s\n",aline);
  2359 		if (!pswit[OVERVIEW_SWITCH])
  2360 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",
  2361 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2362 		else
  2363 		    cnt_punct++;
  2364 	    }
  2365 	}
  2366     }
  2367     /*
  2368      * Special case " .X" where X is any alpha.
  2369      * This plugs a hole in the acronym code above.
  2370      * Inelegant, but maintainable.
  2371      */
  2372     c=g_utf8_get_char(aline);
  2373     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2374     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2375     {
  2376 	pc=c;
  2377 	c=nc;
  2378 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2379 	/* for each character in the line after the first */
  2380 	if (c=='.')
  2381 	{
  2382 	    /* if it's a period */
  2383 	    if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
  2384 	    {
  2385 		/*
  2386 		 * If the period follows a space and
  2387 		 * is followed by a letter.
  2388 		 */
  2389 		if (pswit[ECHO_SWITCH])
  2390 		    g_print("\n%s\n",aline);
  2391 		if (!pswit[OVERVIEW_SWITCH])
  2392 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",
  2393 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2394 		else
  2395 		    cnt_punct++;
  2396 	    }
  2397 	}
  2398     }
  2399     c=g_utf8_get_char(aline);
  2400     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2401     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2402     {
  2403 	pc=c;
  2404 	c=nc;
  2405 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2406 	/* for each character in the line after the first */
  2407 	if (CHAR_IS_DQUOTE(c))
  2408 	{
  2409 	    if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
  2410 	      !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
  2411 	      !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
  2412 	    {
  2413 		if (pswit[ECHO_SWITCH])
  2414 		    g_print("\n%s\n",aline);
  2415 		if (!pswit[OVERVIEW_SWITCH])
  2416 		    g_print("    Line %ld column %ld - Unspaced quotes?\n",
  2417 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2418 		else
  2419 		    cnt_punct++;
  2420 	    }
  2421 	}
  2422     }
  2423     /* Check parity of quotes. */
  2424     nc=g_utf8_get_char(aline);
  2425     for (s=aline;*s;s=g_utf8_next_char(s))
  2426     {
  2427 	c=nc;
  2428 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2429 	if (CHAR_IS_DQUOTE(c))
  2430 	{
  2431 	    if (c==CHAR_DQUOTE)
  2432 	    {
  2433 		parities->dquote=!parities->dquote;
  2434 		parity=parities->dquote;
  2435 	    }
  2436 	    else if (c==CHAR_LD_QUOTE)
  2437 		parity=1;
  2438 	    else
  2439 		parity=0;
  2440 	    if (!parity)
  2441 	    {
  2442 		/* parity even */
  2443 		if (!g_utf8_strchr("_-.'`‘’/,;:!?)]} ",-1,nc))
  2444 		{
  2445 		    if (pswit[ECHO_SWITCH])
  2446 			g_print("\n%s\n",aline);
  2447 		    if (!pswit[OVERVIEW_SWITCH])
  2448 			g_print("    Line %ld column %ld - "
  2449 			  "Wrongspaced quotes?\n",
  2450 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2451 		    else
  2452 			cnt_punct++;
  2453 		}
  2454 	    }
  2455 	    else
  2456 	    {
  2457 		/* parity odd */
  2458 		if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
  2459 		  !g_utf8_strchr("_-/.'`‘’([{$",-1,nc) || !nc)
  2460 		{
  2461 		    if (pswit[ECHO_SWITCH])
  2462 			g_print("\n%s\n",aline);
  2463 		    if (!pswit[OVERVIEW_SWITCH])
  2464 			g_print("    Line %ld column %ld - "
  2465 			  "Wrongspaced quotes?\n",
  2466 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2467 		    else
  2468 			cnt_punct++;
  2469 		}
  2470 	    }
  2471 	}
  2472     }
  2473     c=g_utf8_get_char(aline);
  2474     if (CHAR_IS_DQUOTE(c))
  2475     {
  2476 	if (g_utf8_strchr(",;:!?)]} ",-1,
  2477 	  g_utf8_get_char(g_utf8_next_char(aline))))
  2478 	{
  2479 	    if (pswit[ECHO_SWITCH])
  2480 		g_print("\n%s\n",aline);
  2481 	    if (!pswit[OVERVIEW_SWITCH])
  2482 		g_print("    Line %ld column 1 - Wrongspaced quotes?\n",
  2483 		  linecnt);
  2484 	    else
  2485 		cnt_punct++;
  2486 	}
  2487     }
  2488     if (pswit[SQUOTE_SWITCH])
  2489     {
  2490 	nc=g_utf8_get_char(aline);
  2491 	for (s=aline;*s;s=g_utf8_next_char(s))
  2492 	{
  2493 	    c=nc;
  2494 	    nc=g_utf8_get_char(g_utf8_next_char(s));
  2495 	    if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&
  2496 	      !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
  2497 	      !g_unichar_isalpha(nc)))
  2498 	    {
  2499 		parities->squote=!parities->squote;
  2500 		if (!parities->squote)
  2501 		{
  2502 		    /* parity even */
  2503 		    if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
  2504 		    {
  2505 			if (pswit[ECHO_SWITCH])
  2506 			    g_print("\n%s\n",aline);
  2507 			if (!pswit[OVERVIEW_SWITCH])
  2508 			    g_print("    Line %ld column %ld - "
  2509 			      "Wrongspaced singlequotes?\n",
  2510 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2511 			else
  2512 			    cnt_punct++;
  2513 		    }
  2514 		}
  2515 		else
  2516 		{
  2517 		    /* parity odd */
  2518 		    if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
  2519 		      !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
  2520 		    {
  2521 			if (pswit[ECHO_SWITCH])
  2522 			    g_print("\n%s\n",aline);
  2523 			if (!pswit[OVERVIEW_SWITCH])
  2524 			    g_print("    Line %ld column %ld - "
  2525 			      "Wrongspaced singlequotes?\n",
  2526 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2527 			else
  2528 			    cnt_punct++;
  2529 		    }
  2530 		}
  2531 	    }
  2532 	}
  2533     }
  2534 }
  2535 
  2536 /*
  2537  * check_for_double_punctuation:
  2538  *
  2539  * Look for double punctuation like ,. or ,,
  2540  * Thanks to DW for the suggestion!
  2541  * In books with references, ".," and ".;" are common
  2542  * e.g. "etc., etc.," and vol. 1.; vol 3.;
  2543  * OTOH, from my initial tests, there are also fairly
  2544  * common errors. What to do? Make these cases paranoid?
  2545  * ".," is the most common, so warnings->dotcomma is used
  2546  * to suppress detailed reporting if it occurs often.
  2547  */
  2548 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
  2549 {
  2550     const char *s;
  2551     gunichar c,nc;
  2552     nc=g_utf8_get_char(aline);
  2553     for (s=aline;*s;s=g_utf8_next_char(s))
  2554     {
  2555 	c=nc;
  2556 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2557 	/* for each punctuation character in the line */
  2558 	if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
  2559 	  g_utf8_strchr(".?!,;:",-1,nc))
  2560 	{
  2561 	    /* followed by punctuation, it's a query, unless . . . */
  2562 	    if (c==nc && (c=='.' || c=='?' || c=='!') ||
  2563 	      !warnings->dotcomma && c=='.' && nc==',' ||
  2564 	      warnings->isFrench && g_str_has_prefix(s,",...") ||
  2565 	      warnings->isFrench && g_str_has_prefix(s,"...,") ||
  2566 	      warnings->isFrench && g_str_has_prefix(s,";...") ||
  2567 	      warnings->isFrench && g_str_has_prefix(s,"...;") ||
  2568 	      warnings->isFrench && g_str_has_prefix(s,":...") ||
  2569 	      warnings->isFrench && g_str_has_prefix(s,"...:") ||
  2570 	      warnings->isFrench && g_str_has_prefix(s,"!...") ||
  2571 	      warnings->isFrench && g_str_has_prefix(s,"...!") ||
  2572 	      warnings->isFrench && g_str_has_prefix(s,"?...") ||
  2573 	      warnings->isFrench && g_str_has_prefix(s,"...?"))
  2574 	    {
  2575 		if (warnings->isFrench && g_str_has_prefix(s,",...") ||
  2576 		  warnings->isFrench && g_str_has_prefix(s,"...,") ||
  2577 		  warnings->isFrench && g_str_has_prefix(s,";...") ||
  2578 		  warnings->isFrench && g_str_has_prefix(s,"...;") ||
  2579 		  warnings->isFrench && g_str_has_prefix(s,":...") ||
  2580 		  warnings->isFrench && g_str_has_prefix(s,"...:") ||
  2581 		  warnings->isFrench && g_str_has_prefix(s,"!...") ||
  2582 		  warnings->isFrench && g_str_has_prefix(s,"...!") ||
  2583 		  warnings->isFrench && g_str_has_prefix(s,"?...") ||
  2584 		  warnings->isFrench && g_str_has_prefix(s,"...?"))
  2585 		{
  2586 		    s+=4;
  2587 		    nc=g_utf8_get_char(g_utf8_next_char(s));
  2588 		}
  2589 		; /* do nothing for .. !! and ?? which can be legit */
  2590 	    }
  2591 	    else
  2592 	    {
  2593 		if (pswit[ECHO_SWITCH])
  2594 		    g_print("\n%s\n",aline);
  2595 		if (!pswit[OVERVIEW_SWITCH])
  2596 		    g_print("    Line %ld column %ld - Double punctuation?\n",
  2597 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2598 		else
  2599 		    cnt_punct++;
  2600 	    }
  2601 	}
  2602     }
  2603 }
  2604 
  2605 /*
  2606  * check_for_spaced_quotes:
  2607  */
  2608 void check_for_spaced_quotes(const char *aline)
  2609 {
  2610     int i;
  2611     const char *s,*t;
  2612     const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,
  2613       CHAR_RS_QUOTE};
  2614     GString *pattern;
  2615     s=aline;
  2616     while ((t=strstr(s," \" ")))
  2617     {
  2618 	if (pswit[ECHO_SWITCH])
  2619 	    g_print("\n%s\n",aline);
  2620 	if (!pswit[OVERVIEW_SWITCH])
  2621 	    g_print("    Line %ld column %ld - Spaced doublequote?\n",
  2622 	      linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  2623 	else
  2624 	    cnt_punct++;
  2625 	s=g_utf8_next_char(g_utf8_next_char(t));
  2626     }
  2627     pattern=g_string_new(NULL);
  2628     for(i=0;i<G_N_ELEMENTS(single_quotes);i++)
  2629     {
  2630 	g_string_assign(pattern," ");
  2631 	g_string_append_unichar(pattern,single_quotes[i]);
  2632 	g_string_append_c(pattern,' ');
  2633 	s=aline;
  2634 	while ((t=strstr(s,pattern->str)))
  2635 	{
  2636 	    if (pswit[ECHO_SWITCH])
  2637 		g_print("\n%s\n",aline);
  2638 	    if (!pswit[OVERVIEW_SWITCH])
  2639 		g_print("    Line %ld column %ld - Spaced singlequote?\n",
  2640 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  2641 	    else
  2642 		cnt_punct++;
  2643 	    s=g_utf8_next_char(g_utf8_next_char(t));
  2644 	}
  2645     }
  2646     g_string_free(pattern,TRUE);
  2647 }
  2648 
  2649 /*
  2650  * check_for_miscased_genative:
  2651  *
  2652  * Check special case of 'S instead of 's at end of word.
  2653  */
  2654 void check_for_miscased_genative(const char *aline)
  2655 {
  2656     const char *s;
  2657     gunichar c,nc,pc;
  2658     if (!*aline)
  2659 	return;
  2660     c=g_utf8_get_char(aline);
  2661     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2662     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2663     {
  2664 	pc=c;
  2665 	c=nc;
  2666 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2667 	if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))
  2668 	{
  2669 	    if (pswit[ECHO_SWITCH])
  2670 		g_print("\n%s\n",aline);
  2671 	    if (!pswit[OVERVIEW_SWITCH])
  2672 		g_print("    Line %ld column %ld - Capital \"S\"?\n",
  2673 		  linecnt,g_utf8_pointer_to_offset(aline,s)+2);
  2674 	    else
  2675 		cnt_punct++;
  2676 	}
  2677     }
  2678 }
  2679 
  2680 /*
  2681  * check_end_of_line:
  2682  *
  2683  * Now check special cases - start and end of line -
  2684  * for single and double quotes. Start is sometimes [sic]
  2685  * but better to query it anyway.
  2686  * While we're here, check for dash at end of line.
  2687  */
  2688 void check_end_of_line(const char *aline,struct warnings *warnings)
  2689 {
  2690     int lbytes;
  2691     const char *s;
  2692     gunichar c1,c2;
  2693     lbytes=strlen(aline);
  2694     if (g_utf8_strlen(aline,lbytes)>1)
  2695     {
  2696 	s=g_utf8_prev_char(aline+lbytes);
  2697 	c1=g_utf8_get_char(s);
  2698 	c2=g_utf8_get_char(g_utf8_prev_char(s));
  2699 	if ((CHAR_IS_DQUOTE(c1) || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)
  2700 	{
  2701 	    if (pswit[ECHO_SWITCH])
  2702 		g_print("\n%s\n",aline);
  2703 	    if (!pswit[OVERVIEW_SWITCH])
  2704 		g_print("    Line %ld column %ld - Spaced quote?\n",linecnt,
  2705 		  g_utf8_strlen(aline,lbytes));
  2706 	    else
  2707 		cnt_punct++;
  2708 	}
  2709 	c1=g_utf8_get_char(aline);
  2710 	c2=g_utf8_get_char(g_utf8_next_char(aline));
  2711 	if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)
  2712 	{
  2713 	    if (pswit[ECHO_SWITCH])
  2714 		g_print("\n%s\n",aline);
  2715 	    if (!pswit[OVERVIEW_SWITCH])
  2716 		g_print("    Line %ld column 1 - Spaced quote?\n",linecnt);
  2717 	    else
  2718 		cnt_punct++;
  2719 	}
  2720 	/*
  2721 	 * Dash at end of line may well be legit - paranoid mode only
  2722 	 * and don't report em-dash at line-end.
  2723 	 */
  2724 	if (pswit[PARANOID_SWITCH] && warnings->hyphen)
  2725 	{
  2726 	    for (s=g_utf8_prev_char(aline+lbytes);
  2727 	      s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
  2728 		;
  2729 	    if (g_utf8_get_char(s)=='-' &&
  2730 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')
  2731 	    {
  2732 		if (pswit[ECHO_SWITCH])
  2733 		    g_print("\n%s\n",aline);
  2734 		if (!pswit[OVERVIEW_SWITCH])
  2735 		    g_print("    Line %ld column %ld - "
  2736 		      "Hyphen at end of line?\n",
  2737 		      linecnt,g_utf8_pointer_to_offset(aline,s));
  2738 	    }
  2739 	}
  2740     }
  2741 }
  2742 
  2743 /*
  2744  * check_for_unspaced_bracket:
  2745  *
  2746  * Brackets are often unspaced, but shouldn't be surrounded by alpha.
  2747  * If so, suspect a scanno like "a]most".
  2748  */
  2749 void check_for_unspaced_bracket(const char *aline)
  2750 {
  2751     const char *s;
  2752     gunichar c,nc,pc;
  2753     c=g_utf8_get_char(aline);
  2754     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2755     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2756     {
  2757 	pc=c;
  2758 	c=nc;
  2759 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2760 	if (!nc)
  2761 	    break;
  2762 	/* for each bracket character in the line except 1st & last */
  2763 	if (g_utf8_strchr("{[()]}",-1,c) &&
  2764 	  g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
  2765 	{
  2766 	    if (pswit[ECHO_SWITCH])
  2767 		g_print("\n%s\n",aline);
  2768 	    if (!pswit[OVERVIEW_SWITCH])
  2769 		g_print("    Line %ld column %ld - Unspaced bracket?\n",
  2770 		  linecnt,g_utf8_pointer_to_offset(aline,s));
  2771 	    else
  2772 		cnt_punct++;
  2773 	}
  2774     }
  2775 }
  2776 
  2777 /*
  2778  * check_for_unpunctuated_endquote:
  2779  */
  2780 void check_for_unpunctuated_endquote(const char *aline)
  2781 {
  2782     const char *s;
  2783     gunichar c,nc,pc;
  2784     QuoteClass qc;
  2785     c=g_utf8_get_char(aline);
  2786     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2787     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2788     {
  2789 	pc=c;
  2790 	c=nc;
  2791 	qc=CHAR_IS_DQUOTE(c)?QUOTE_CLASS(c):INVALID_QUOTE;
  2792 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2793 	/* for each character in the line except 1st */
  2794 	if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) && g_unichar_isalpha(pc))
  2795 	{
  2796 	    if (pswit[ECHO_SWITCH])
  2797 		g_print("\n%s\n",aline);
  2798 	    if (!pswit[OVERVIEW_SWITCH])
  2799 		g_print("    Line %ld column %ld - "
  2800 		  "endquote missing punctuation?\n",
  2801 		  linecnt,g_utf8_pointer_to_offset(aline,s));
  2802 	    else
  2803 		cnt_punct++;
  2804 	}
  2805     }
  2806 }
  2807 
  2808 /*
  2809  * check_for_html_tag:
  2810  *
  2811  * Check for <HTML TAG>.
  2812  *
  2813  * If there is a < in the line, followed at some point
  2814  * by a > then we suspect HTML.
  2815  */
  2816 void check_for_html_tag(const char *aline)
  2817 {
  2818     const char *open,*close;
  2819     gchar *tag;
  2820     open=strchr(aline,'<');
  2821     if (open)
  2822     {
  2823 	close=strchr(g_utf8_next_char(open),'>');
  2824 	if (close)
  2825 	{
  2826 	    if (pswit[ECHO_SWITCH])
  2827 		g_print("\n%s\n",aline);
  2828 	    if (!pswit[OVERVIEW_SWITCH])
  2829 	    {
  2830 		tag=g_strndup(open,close-open+1);
  2831 		g_print("    Line %ld column %ld - HTML Tag? %s \n",
  2832 		  linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
  2833 		g_free(tag);
  2834 	    }
  2835 	    else
  2836 		cnt_html++;
  2837 	}
  2838     }
  2839 }
  2840 
  2841 /*
  2842  * check_for_html_entity:
  2843  *
  2844  * Check for &symbol; HTML.
  2845  *
  2846  * If there is a & in the line, followed at
  2847  * some point by a ; then we suspect HTML.
  2848  */
  2849 void check_for_html_entity(const char *aline)
  2850 {
  2851     const char *s,*amp,*scolon;
  2852     gchar *entity;
  2853     amp=strchr(aline,'&');
  2854     if (amp)
  2855     {
  2856 	scolon=strchr(amp,';');
  2857 	if (scolon)
  2858 	{
  2859 	    for (s=amp;s<scolon;s=g_utf8_next_char(s))   
  2860 		if (g_utf8_get_char(s)==CHAR_SPACE)
  2861 		    break;		/* Don't report "Jones & Son;" */
  2862 	    if (s>=scolon)
  2863 	    {
  2864 		if (pswit[ECHO_SWITCH])
  2865 		    g_print("\n%s\n",aline);
  2866 		if (!pswit[OVERVIEW_SWITCH])
  2867 		{
  2868 		    entity=g_strndup(amp,scolon-amp+1);
  2869 		    g_print("    Line %ld column %d - HTML symbol? %s \n",
  2870 		      linecnt,(int)(amp-aline)+1,entity);
  2871 		    g_free(entity);
  2872 		}
  2873 		else
  2874 		    cnt_html++;
  2875 	    }
  2876 	}
  2877     }
  2878 }
  2879 
  2880 /*
  2881  * check_for_omitted_punctuation:
  2882  *
  2883  * Check for omitted punctuation at end of paragraph by working back
  2884  * through prevline. DW.
  2885  * Need to check this only for "normal" paras.
  2886  * So what is a "normal" para?
  2887  *    Not normal if one-liner (chapter headings, etc.)
  2888  *    Not normal if doesn't contain at least one locase letter
  2889  *    Not normal if starts with space
  2890  */
  2891 void check_for_omitted_punctuation(const char *prevline,
  2892   struct line_properties *last,int start_para_line)
  2893 {
  2894     gboolean letter_on_line=FALSE;
  2895     const char *s;
  2896     gunichar c;
  2897     gboolean closing_quote;
  2898     for (s=prevline;*s;s=g_utf8_next_char(s))
  2899 	if (g_unichar_isalpha(g_utf8_get_char(s)))
  2900 	{
  2901 	    letter_on_line=TRUE;
  2902 	    break;
  2903 	}
  2904     /*
  2905      * This next "if" is a problem.
  2906      * If we say "start_para_line <= linecnt - 1", that includes
  2907      * one-line "paragraphs" like chapter heads. Lotsa false positives.
  2908      * If we say "start_para_line < linecnt - 1" it doesn't, but then it
  2909      * misses genuine one-line paragraphs.
  2910      */
  2911     if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
  2912       g_utf8_get_char(prevline)>CHAR_SPACE)
  2913     {
  2914 	s=prevline+strlen(prevline);
  2915 	do
  2916 	{
  2917 	    s=g_utf8_prev_char(s);
  2918 	    c=g_utf8_get_char(s);
  2919 	    if (QUOTE_CLASS(c)==CLOSING_QUOTE || QUOTE_CLASS(c)==NEUTRAL_QUOTE)
  2920 		closing_quote=TRUE;
  2921 	    else
  2922 		closing_quote=FALSE;
  2923 	} while (closing_quote && s>prevline);
  2924 	for (;s>prevline;s=g_utf8_prev_char(s))
  2925 	{
  2926 	    if (g_unichar_isalpha(g_utf8_get_char(s)))
  2927 	    {
  2928 		if (pswit[ECHO_SWITCH])
  2929 		    g_print("\n%s\n",prevline);
  2930 		if (!pswit[OVERVIEW_SWITCH])
  2931 		    g_print("    Line %ld column %ld - "
  2932 		      "No punctuation at para end?\n",
  2933 		      linecnt-1,g_utf8_strlen(prevline,-1));
  2934 		else
  2935 		    cnt_punct++;
  2936 		break;
  2937 	    }
  2938 	    if (g_utf8_strchr("—-.:!([{?}])",-1,g_utf8_get_char(s)))
  2939 		break;
  2940 	}
  2941     }
  2942 }
  2943 
  2944 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
  2945 {
  2946     const char *word=key;
  2947     int *dupcnt=value;
  2948     if (*dupcnt)
  2949 	g_print("\nNote: Queried word %s was duplicated %d times\n",
  2950 	  word,*dupcnt);
  2951     return FALSE;
  2952 }
  2953 
  2954 void print_as_windows_1252(const char *string)
  2955 {
  2956     gsize inbytes,outbytes;
  2957     gchar *buf,*bp;
  2958     static GIConv converter=(GIConv)-1;
  2959     if (!string)
  2960     {
  2961 	if (converter!=(GIConv)-1)
  2962 	    g_iconv_close(converter);
  2963 	converter=(GIConv)-1;
  2964 	return;
  2965     }
  2966     if (converter==(GIConv)-1)
  2967 	converter=g_iconv_open("WINDOWS-1252","UTF-8");
  2968     if (converter!=(GIConv)-1)
  2969     {
  2970 	inbytes=outbytes=strlen(string);
  2971 	bp=buf=g_malloc(outbytes+1);
  2972 	g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
  2973 	*bp='\0';
  2974 	fputs(buf,stdout);
  2975 	g_free(buf);
  2976     }
  2977     else
  2978 	fputs(string,stdout);
  2979 }
  2980 
  2981 void print_as_utf_8(const char *string)
  2982 {
  2983     fputs(string,stdout);
  2984 }
  2985 
  2986 /*
  2987  * procfile:
  2988  *
  2989  * Process one file.
  2990  */
  2991 void procfile(const char *filename)
  2992 {
  2993     const char *s;
  2994     gchar *parastart=NULL;	/* first line of current para */
  2995     gchar *etext,*aline;
  2996     gchar *etext_ptr;
  2997     GError *err=NULL;
  2998     struct first_pass_results *first_pass_results;
  2999     struct warnings *warnings;
  3000     struct counters counters={0};
  3001     struct line_properties last={0};
  3002     struct parities parities={0};
  3003     struct pending pending={0};
  3004     gboolean isemptyline;
  3005     long start_para_line=0;
  3006     gboolean isnewpara=FALSE,enddash=FALSE;
  3007     last.start=CHAR_SPACE;
  3008     linecnt=checked_linecnt=0;
  3009     etext=read_etext(filename,&err);
  3010     if (!etext)
  3011     {
  3012 	if (pswit[STDOUT_SWITCH])
  3013 	    fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
  3014 	else
  3015 	    fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
  3016 	exit(1);
  3017     }
  3018     g_print("\n\nFile: %s\n\n",filename);
  3019     first_pass_results=first_pass(etext);
  3020     warnings=report_first_pass(first_pass_results);
  3021     qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
  3022     qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
  3023     /*
  3024      * Here we go with the main pass. Hold onto yer hat!
  3025      */
  3026     linecnt=0;
  3027     etext_ptr=etext;
  3028     while ((aline=flgets(&etext_ptr,linecnt+1,warnings->newlines)))
  3029     {
  3030 	linecnt++;
  3031 	if (linecnt==1)
  3032 	    isnewpara=TRUE;
  3033 	if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
  3034 	    continue;    // skip DP page separators completely
  3035 	if (linecnt<first_pass_results->firstline ||
  3036 	  (first_pass_results->footerline>0 &&
  3037 	  linecnt>first_pass_results->footerline))
  3038 	{
  3039 	    if (pswit[HEADER_SWITCH])
  3040 	    {
  3041 		if (g_str_has_prefix(aline,"Title:"))
  3042 		    g_print("    %s\n",aline);
  3043 		if (g_str_has_prefix(aline,"Author:"))
  3044 		    g_print("    %s\n",aline);
  3045 		if (g_str_has_prefix(aline,"Release Date:"))
  3046 		    g_print("    %s\n",aline);
  3047 		if (g_str_has_prefix(aline,"Edition:"))
  3048 		    g_print("    %s\n\n",aline);
  3049 	    }
  3050 	    continue;		/* skip through the header */
  3051 	}
  3052 	checked_linecnt++;
  3053 	print_pending(aline,parastart,&pending);
  3054 	isemptyline=analyse_quotes(aline,&counters);
  3055 	if (isnewpara && !isemptyline)
  3056 	{
  3057 	    /* This line is the start of a new paragraph. */
  3058 	    start_para_line=linecnt;
  3059 	    /* Capture its first line in case we want to report it later. */
  3060 	    g_free(parastart);
  3061 	    parastart=g_strdup(aline);
  3062 	    memset(&parities,0,sizeof(parities));  /* restart the quote count */
  3063 	    s=aline;
  3064 	    while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
  3065 	      !g_unichar_isdigit(g_utf8_get_char(s)))
  3066 		s=g_utf8_next_char(s);
  3067 	    if (g_unichar_islower(g_utf8_get_char(s)))
  3068 	    {
  3069 		/* and its first letter is lowercase */
  3070 		if (pswit[ECHO_SWITCH])
  3071 		    g_print("\n%s\n",aline);
  3072 		if (!pswit[OVERVIEW_SWITCH])
  3073 		    g_print("    Line %ld column %ld - "
  3074 		      "Paragraph starts with lower-case\n",
  3075 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  3076 		else
  3077 		    cnt_punct++;
  3078 	    }
  3079 	    isnewpara=FALSE; /* Signal the end of new para processing. */
  3080 	}
  3081 	/* Check for an em-dash broken at line end. */
  3082 	if (enddash && g_utf8_get_char(aline)=='-')
  3083 	{
  3084 	    if (pswit[ECHO_SWITCH])
  3085 		g_print("\n%s\n",aline);
  3086 	    if (!pswit[OVERVIEW_SWITCH])
  3087 		g_print("    Line %ld column 1 - Broken em-dash?\n",linecnt);
  3088 	    else
  3089 		cnt_punct++;
  3090 	}
  3091 	enddash=FALSE;
  3092 	for (s=g_utf8_prev_char(aline+strlen(aline));
  3093 	  g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
  3094 	    ;
  3095 	if (s>=aline && g_utf8_get_char(s)=='-')
  3096 	    enddash=TRUE;
  3097 	check_for_control_characters(aline);
  3098 	check_for_odd_characters(aline,warnings,isemptyline);
  3099 	if (warnings->longline)
  3100 	    check_for_long_line(aline);
  3101 	if (warnings->shortline)
  3102 	    check_for_short_line(aline,&last);
  3103 	last.blen=last.len;
  3104 	last.len=g_utf8_strlen(aline,-1);
  3105 	last.start=g_utf8_get_char(aline);
  3106 	check_for_starting_punctuation(aline);
  3107 	if (warnings->dash)
  3108 	{
  3109 	    check_for_spaced_emdash(aline);
  3110 	    check_for_spaced_dash(aline);
  3111 	}
  3112 	check_for_unmarked_paragraphs(aline);
  3113 	check_for_jeebies(aline);
  3114 	check_for_mta_from(aline);
  3115 	check_for_orphan_character(aline);
  3116 	check_for_pling_scanno(aline);
  3117 	check_for_extra_period(aline,warnings);
  3118 	check_for_following_punctuation(aline);
  3119 	check_for_typos(aline,warnings);
  3120 	check_for_misspaced_punctuation(aline,&parities,isemptyline);
  3121 	check_for_double_punctuation(aline,warnings);
  3122 	check_for_spaced_quotes(aline);
  3123 	check_for_miscased_genative(aline);
  3124 	check_end_of_line(aline,warnings);
  3125 	check_for_unspaced_bracket(aline);
  3126 	if (warnings->endquote)
  3127 	    check_for_unpunctuated_endquote(aline);
  3128 	check_for_html_tag(aline);
  3129 	check_for_html_entity(aline);
  3130 	if (isemptyline)
  3131 	{
  3132 	    check_for_mismatched_quotes(&counters,&pending);
  3133 	    counters_reset(&counters);
  3134 	    /* let the next iteration know that it's starting a new para */
  3135 	    isnewpara=TRUE;
  3136 	    if (prevline)
  3137 		check_for_omitted_punctuation(prevline,&last,start_para_line);
  3138 	}
  3139 	g_free(prevline);
  3140 	prevline=g_strdup(aline);
  3141     }
  3142     linecnt++;
  3143     check_for_mismatched_quotes(&counters,&pending);
  3144     print_pending(NULL,parastart,&pending);
  3145     reset_pending(&pending);
  3146     if (prevline)
  3147     {
  3148 	g_free(prevline);
  3149 	prevline=NULL;
  3150     }
  3151     g_free(parastart);
  3152     g_free(prevline);
  3153     g_free(etext);
  3154     if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])
  3155 	g_tree_foreach(qword,report_duplicate_queries,NULL);
  3156     g_tree_unref(qword);
  3157     g_tree_unref(qperiod);
  3158     counters_destroy(&counters);
  3159     g_set_print_handler(NULL);
  3160     print_as_windows_1252(NULL);
  3161     if (pswit[MARKUP_SWITCH])  
  3162 	loseentities(NULL);
  3163 }
  3164 
  3165 /*
  3166  * flgets:
  3167  *
  3168  * Get one line from the input text. The setting of newlines has the following
  3169  * effect:
  3170  *
  3171  * DOS_NEWLINES: Check for the existence of exactly one CR-LF line-end per line.
  3172  *
  3173  * OS9_NEWLINES: Asserts that etext contains no LFs. CR is used as
  3174  *		 the newline character.
  3175  *
  3176  * UNIX_NEWLINES: Check for the presence of CRs.
  3177  *
  3178  * In all cases, check that the last line is correctly terminated.
  3179  *
  3180  * Returns: a pointer to the line.
  3181  */
  3182 char *flgets(char **etext,long lcnt,int newlines)
  3183 {
  3184     gunichar c;
  3185     gboolean isCR=FALSE;
  3186     char *theline=*etext;
  3187     char *eos=theline;
  3188     gchar *s;
  3189     for (;;)
  3190     {
  3191 	c=g_utf8_get_char(*etext);
  3192 	if (!c)
  3193 	{
  3194 	    if (*etext==theline)
  3195 		return NULL;
  3196 	    else if (pswit[LINE_END_SWITCH])
  3197 	    {
  3198 		if (pswit[ECHO_SWITCH])
  3199 		{
  3200 		    s=g_strndup(theline,eos-theline);
  3201 		    g_print("\n%s\n",s);
  3202 		    g_free(s);
  3203 		}
  3204 		if (!pswit[OVERVIEW_SWITCH])
  3205 		{
  3206 		    if (newlines==OS9_NEWLINES)
  3207 			g_print("    Line %ld - No CR?\n",lcnt);
  3208 		    else
  3209 		    {
  3210 			/* There may, or may not, have been a CR */
  3211 			g_print("    Line %ld - No LF?\n",lcnt);
  3212 		    }
  3213 		}
  3214 		else
  3215 		    cnt_lineend++;
  3216 	    }
  3217 	    break;
  3218 	}
  3219 	*etext=g_utf8_next_char(*etext);
  3220 	/* either way, it's end of line */
  3221 	if (c=='\n')
  3222 	{
  3223 	    if (newlines==DOS_NEWLINES && !isCR)
  3224 	    {
  3225 		/* Error - a LF without a preceding CR */
  3226 		if (pswit[LINE_END_SWITCH])
  3227 		{
  3228 		    if (pswit[ECHO_SWITCH])
  3229 		    {
  3230 			s=g_strndup(theline,eos-theline);
  3231 			g_print("\n%s\n",s);
  3232 			g_free(s);
  3233 		    }
  3234 		    if (!pswit[OVERVIEW_SWITCH])
  3235 			g_print("    Line %ld - No CR?\n",lcnt);
  3236 		    else
  3237 			cnt_lineend++;
  3238 		}
  3239 	    }
  3240 	    break;
  3241 	}
  3242 	if (c=='\r')
  3243 	{
  3244 	    if (newlines==OS9_NEWLINES)
  3245 		break;
  3246 	    if (isCR || newlines==UNIX_NEWLINES)
  3247 	    {
  3248 		if (pswit[LINE_END_SWITCH])
  3249 		{
  3250 		    if (pswit[ECHO_SWITCH])
  3251 		    {
  3252 			s=g_strndup(theline,eos-theline);
  3253 			g_print("\n%s\n",s);
  3254 			g_free(s);
  3255 		    }
  3256 		    if (!pswit[OVERVIEW_SWITCH])
  3257 		    {
  3258 			if (newlines==UNIX_NEWLINES)
  3259 			    g_print("    Line %ld column %ld - Embedded CR?\n",
  3260 			      lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
  3261 			else
  3262 			    g_print("    Line %ld - Two successive CRs?\n",
  3263 			      lcnt);
  3264 		    }
  3265 		    else
  3266 			cnt_lineend++;
  3267 		}
  3268 		if (newlines==UNIX_NEWLINES)
  3269 		    *eos=' ';
  3270 	    }
  3271 	    if (newlines==DOS_NEWLINES)
  3272 		isCR=TRUE;
  3273 	}
  3274 	else
  3275 	{
  3276 	    if (pswit[LINE_END_SWITCH] && isCR)
  3277 	    {
  3278 		if (pswit[ECHO_SWITCH])
  3279 		{
  3280 		    s=g_strndup(theline,eos-theline);
  3281 		    g_print("\n%s\n",s);
  3282 		    g_free(s);
  3283 		}
  3284 		if (!pswit[OVERVIEW_SWITCH])
  3285 		    g_print("    Line %ld column %ld - CR without LF?\n",
  3286 		      lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
  3287 		else
  3288 		    cnt_lineend++;
  3289 		*eos=' ';
  3290 	    }
  3291 	    isCR=FALSE;
  3292 	    eos=g_utf8_next_char(eos);
  3293 	}
  3294     }
  3295     *eos='\0';
  3296     if (pswit[MARKUP_SWITCH])  
  3297 	postprocess_for_HTML(theline);
  3298     if (pswit[DP_SWITCH])  
  3299 	postprocess_for_DP(theline);
  3300     return theline;
  3301 }
  3302 
  3303 /*
  3304  * mixdigit:
  3305  *
  3306  * Takes a "word" as a parameter, and checks whether it
  3307  * contains a mixture of alpha and digits. Generally, this is an
  3308  * error, but may not be for cases like 4th or L5 12s. 3d.
  3309  *
  3310  * Returns: TRUE iff an is error found.
  3311  */
  3312 gboolean mixdigit(const char *checkword)
  3313 {
  3314     gboolean wehaveadigit,wehavealetter,query;
  3315     const char *s,*nondigit;
  3316     wehaveadigit=wehavealetter=query=FALSE;
  3317     for (s=checkword;*s;s=g_utf8_next_char(s))
  3318 	if (g_unichar_isalpha(g_utf8_get_char(s)))
  3319 	    wehavealetter=TRUE;
  3320 	else if (g_unichar_isdigit(g_utf8_get_char(s)))
  3321 	    wehaveadigit=TRUE;
  3322     if (wehaveadigit && wehavealetter)
  3323     {
  3324 	/* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
  3325 	query=TRUE;
  3326 	for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
  3327 	  nondigit=g_utf8_next_char(nondigit))
  3328 	    ;
  3329 	/* digits, ending in st, rd, nd, th of either case */
  3330 	if (!g_ascii_strcasecmp(nondigit,"st") ||
  3331 	  !g_ascii_strcasecmp(nondigit,"rd") ||
  3332 	  !g_ascii_strcasecmp(nondigit,"nd") ||
  3333 	  !g_ascii_strcasecmp(nondigit,"th"))
  3334 	    query=FALSE;
  3335 	if (!g_ascii_strcasecmp(nondigit,"sts") ||
  3336 	  !g_ascii_strcasecmp(nondigit,"rds") ||
  3337 	  !g_ascii_strcasecmp(nondigit,"nds") ||
  3338 	  !g_ascii_strcasecmp(nondigit,"ths"))
  3339 	    query=FALSE;
  3340 	if (!g_ascii_strcasecmp(nondigit,"stly") ||
  3341 	  !g_ascii_strcasecmp(nondigit,"rdly") ||
  3342 	  !g_ascii_strcasecmp(nondigit,"ndly") ||
  3343 	  !g_ascii_strcasecmp(nondigit,"thly"))
  3344 	    query=FALSE;
  3345 	/* digits, ending in l, L, s or d */
  3346 	if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
  3347 	  !strcmp(nondigit,"d"))
  3348 	    query=FALSE;
  3349 	/*
  3350 	 * L at the start of a number, representing Britsh pounds, like L500.
  3351 	 * This is cute. We know the current word is mixed digit. If the first
  3352 	 * letter is L, there must be at least one digit following. If both
  3353 	 * digits and letters follow, we have a genuine error, else we have a
  3354 	 * capital L followed by digits, and we accept that as a non-error.
  3355 	 */
  3356 	if (g_utf8_get_char(checkword)=='L' &&
  3357 	  !mixdigit(g_utf8_next_char(checkword)))
  3358 	    query=FALSE;
  3359     }
  3360     return query;
  3361 }
  3362 
  3363 /*
  3364  * getaword:
  3365  *
  3366  * Extracts the first/next "word" from the line, and returns it.
  3367  * A word is defined as one English word unit--or at least that's the aim.
  3368  * "ptr" is advanced to the position in the line where we will start
  3369  * looking for the next word.
  3370  *
  3371  * Returns: A newly-allocated string.
  3372  */
  3373 gchar *getaword(const char **ptr)
  3374 {
  3375     const char *s,*t;
  3376     GString *word;
  3377     gunichar c,pc;
  3378     word=g_string_new(NULL);
  3379     for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
  3380       !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
  3381       **ptr;*ptr=g_utf8_next_char(*ptr))
  3382     {
  3383 	/* Handle exceptions for footnote markers like [1] */
  3384 	if (g_utf8_get_char(*ptr)=='[')
  3385 	{
  3386 	    g_string_append_c(word,'[');
  3387 	    s=g_utf8_next_char(*ptr);
  3388 	    for (;g_unichar_isdigit(g_utf8_get_char(s));s=g_utf8_next_char(s))
  3389 		g_string_append_unichar(word,g_utf8_get_char(s));
  3390 	    if (g_utf8_get_char(s)==']')
  3391 	    {
  3392 		g_string_append_c(word,']');
  3393 		*ptr=g_utf8_next_char(s);
  3394 		return g_string_free(word,FALSE);
  3395 	    }
  3396 	    else
  3397 		g_string_truncate(word,0);
  3398 	}
  3399     }
  3400     /*
  3401      * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
  3402      * Especially yucky is the case of L1,000
  3403      * This section looks for a pattern of characters including a digit
  3404      * followed by a comma or period followed by one or more digits.
  3405      * If found, it returns this whole pattern as a word; otherwise we discard
  3406      * the results and resume our normal programming.
  3407      */
  3408     s=*ptr;
  3409     for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
  3410       g_unichar_isalpha(g_utf8_get_char(s)) ||
  3411       g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
  3412 	g_string_append_unichar(word,g_utf8_get_char(s));
  3413     if (word->len)
  3414     {
  3415 	for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))
  3416 	{
  3417 	    c=g_utf8_get_char(t);
  3418 	    pc=g_utf8_get_char(g_utf8_prev_char(t));
  3419 	    if ((c=='.' || c==',') && g_unichar_isdigit(pc))
  3420 	    {
  3421 		*ptr=s;
  3422 		return g_string_free(word,FALSE);
  3423 	    }
  3424 	}
  3425     }
  3426     /* we didn't find a punctuated number - do the regular getword thing */
  3427     g_string_truncate(word,0);
  3428     c=g_utf8_get_char(*ptr);
  3429     for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);
  3430       *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))
  3431 	g_string_append_unichar(word,c);
  3432     return g_string_free(word,FALSE);
  3433 }
  3434 
  3435 /*
  3436  * isroman:
  3437  *
  3438  * Is this word a Roman Numeral?
  3439  *
  3440  * It doesn't actually validate that the number is a valid Roman Numeral--for
  3441  * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
  3442  * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
  3443  * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
  3444  * expressions thereof, except when it came to taxes. Allow any number of M,
  3445  * an optional D, an optional CM or CD, any number of optional Cs, an optional
  3446  * XL or an optional XC, an optional IX or IV, an optional V and any number
  3447  * of optional Is.
  3448  */
  3449 gboolean isroman(const char *t)
  3450 {
  3451     const char *s;
  3452     if (!t || !*t)
  3453 	return FALSE;
  3454     s=t;
  3455     while (g_utf8_get_char(t)=='m' && *t)
  3456 	t++;
  3457     if (g_utf8_get_char(t)=='d')
  3458 	t++;
  3459     if (g_str_has_prefix(t,"cm"))
  3460 	t+=2;
  3461     if (g_str_has_prefix(t,"cd"))
  3462 	t+=2;
  3463     while (g_utf8_get_char(t)=='c' && *t)
  3464 	t++;
  3465     if (g_str_has_prefix(t,"xl"))
  3466 	t+=2;
  3467     if (g_str_has_prefix(t,"xc"))
  3468 	t+=2;
  3469     if (g_utf8_get_char(t)=='l')
  3470 	t++;
  3471     while (g_utf8_get_char(t)=='x' && *t)
  3472 	t++;
  3473     if (g_str_has_prefix(t,"ix"))
  3474 	t+=2;
  3475     if (g_str_has_prefix(t,"iv"))
  3476 	t+=2;
  3477     if (g_utf8_get_char(t)=='v')
  3478 	t++;
  3479     while (g_utf8_get_char(t)=='i' && *t)
  3480 	t++;
  3481     return !*t;
  3482 }
  3483 
  3484 /*
  3485  * postprocess_for_DP:
  3486  *
  3487  * Invoked with the -d switch from flgets().
  3488  * It simply "removes" from the line a hard-coded set of common
  3489  * DP-specific tags, so that the line passed to the main routine has
  3490  * been pre-cleaned of DP markup.
  3491  */
  3492 void postprocess_for_DP(char *theline)
  3493 {
  3494     char *s,*t;
  3495     int i;
  3496     if (!*theline) 
  3497 	return;
  3498     for (i=0;*DPmarkup[i];i++)
  3499 	while ((s=strstr(theline,DPmarkup[i])))
  3500 	{
  3501 	    t=s+strlen(DPmarkup[i]);
  3502 	    memmove(s,t,strlen(t)+1);
  3503 	}
  3504 }
  3505 
  3506 /*
  3507  * postprocess_for_HTML:
  3508  *
  3509  * Invoked with the -m switch from flgets().
  3510  * It simply "removes" from the line a hard-coded set of common
  3511  * HTML tags and "replaces" a hard-coded set of common HTML
  3512  * entities, so that the line passed to the main routine has
  3513  * been pre-cleaned of HTML.
  3514  */
  3515 void postprocess_for_HTML(char *theline)
  3516 {
  3517     while (losemarkup(theline))
  3518 	;
  3519     loseentities(theline);
  3520 }
  3521 
  3522 char *losemarkup(char *theline)
  3523 {
  3524     char *s,*t;
  3525     int i;
  3526     s=strchr(theline,'<');
  3527     t=s?strchr(s,'>'):NULL;
  3528     if (!s || !t)
  3529 	return NULL;
  3530     for (i=0;*markup[i];i++)
  3531 	if (tagcomp(g_utf8_next_char(s),markup[i]))
  3532 	{
  3533 	    t=g_utf8_next_char(t);
  3534 	    memmove(s,t,strlen(t)+1);
  3535 	    return s;
  3536 	}
  3537     /* It's an unrecognized <xxx>. */
  3538     return NULL;
  3539 }
  3540 
  3541 void loseentities(char *theline)
  3542 {
  3543     int i;
  3544     gsize nb;
  3545     char *amp,*scolon;
  3546     gchar *s,*t;
  3547     gunichar c;
  3548     GTree *entities=NULL;
  3549     static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
  3550     if (!theline)
  3551     {
  3552 	if (entities)
  3553 	    g_tree_destroy(entities);
  3554 	entities=NULL;
  3555 	if (translit!=(GIConv)-1)
  3556 	    g_iconv_close(translit);
  3557 	translit=(GIConv)-1;
  3558 	if (to_utf8!=(GIConv)-1)
  3559 	    g_iconv_close(to_utf8);
  3560 	to_utf8=(GIConv)-1;
  3561 	return;
  3562     }
  3563     if (!*theline)
  3564 	return;
  3565     if (!entities)
  3566     {
  3567 	entities=g_tree_new((GCompareFunc)strcmp);
  3568 	for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
  3569 	    g_tree_insert(entities,HTMLentities[i].name,
  3570 	      GUINT_TO_POINTER(HTMLentities[i].c));
  3571     }
  3572     if (translit==(GIConv)-1)
  3573 	translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
  3574     if (to_utf8==(GIConv)-1)
  3575 	to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
  3576     while((amp=strchr(theline,'&')))
  3577     {
  3578 	scolon=strchr(amp,';');
  3579 	if (scolon)
  3580 	{
  3581 	    if (amp[1]=='#')
  3582 	    {
  3583 		if (amp+2+strspn(amp+2,"0123456789")==scolon)
  3584 		    c=strtol(amp+2,NULL,10);
  3585 		else if (amp[2]=='x' &&
  3586 		  amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
  3587 		    c=strtol(amp+3,NULL,16);
  3588 	    }
  3589 	    else
  3590 	    {
  3591 		s=g_strndup(amp+1,scolon-(amp+1));
  3592 	        c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
  3593 		g_free(s);
  3594 	    }
  3595 	}
  3596 	else
  3597 	    c=0;
  3598 	if (c)
  3599 	{
  3600 	    theline=amp;
  3601 	    if (c<128 || c>=192 && c<=255)	/* An ISO-8859-1 character */
  3602 		theline+=g_unichar_to_utf8(c,theline);
  3603 	    else
  3604 	    {
  3605 		s=g_malloc(6);
  3606 		nb=g_unichar_to_utf8(c,s);
  3607 		t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
  3608 		g_free(s);
  3609 		s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
  3610 		g_free(t);
  3611 		memcpy(theline,s,nb);
  3612 		g_free(s);
  3613 		theline+=nb;
  3614 	    }
  3615 	    memmove(theline,g_utf8_next_char(scolon),
  3616 	      strlen(g_utf8_next_char(scolon))+1);
  3617 	}
  3618 	else
  3619 	    theline=g_utf8_next_char(amp);
  3620     }
  3621 }
  3622 
  3623 gboolean tagcomp(const char *strin,const char *basetag)
  3624 {
  3625     gboolean retval;
  3626     gchar *s,*t;
  3627     if (g_utf8_get_char(strin)=='/')
  3628 	t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
  3629     else
  3630 	t=g_utf8_casefold(strin,-1);
  3631     s=g_utf8_casefold(basetag,-1);
  3632     retval=g_str_has_prefix(t,s);
  3633     g_free(s);
  3634     g_free(t);
  3635     return retval;
  3636 }
  3637 
  3638 void proghelp(GOptionContext *context)
  3639 {
  3640     gchar *help;
  3641     fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
  3642     fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
  3643     fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
  3644     fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
  3645       "For details, read the file COPYING.\n",stderr);
  3646     fputs("This is Free Software; "
  3647       "you may redistribute it under certain conditions (GPL);\n",stderr);
  3648     fputs("read the file COPYING for details.\n\n",stderr);
  3649     help=g_option_context_get_help(context,TRUE,NULL);
  3650     fputs(help,stderr);
  3651     g_free(help);
  3652     fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
  3653     fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
  3654       "non-ASCII\n",stderr);
  3655     fputs("characters like accented letters, "
  3656       "lines longer than 75 or shorter than 55,\n",stderr);
  3657     fputs("unbalanced quotes or brackets, "
  3658       "a variety of badly formatted punctuation, \n",stderr);
  3659     fputs("HTML tags, some likely typos. "
  3660       "It is NOT a substitute for human judgement.\n",stderr);
  3661     fputs("\n",stderr);
  3662 }