bookloupe/bookloupe.c
author ali <ali@juiblex.co.uk>
Thu Oct 17 08:07:48 2013 +0100 (2013-10-17)
changeset 179 589d5af2c38d
parent 178 db7b24d83bed
child 180 fd54c7cb276b
permissions -rw-r--r--
Bugs #13+14: charsets in configuration files
     1 /*************************************************************************/
     2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
     3 /*									 */
     4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>			 */
     5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>			 */
     6 /*									 */
     7 /* This program is free software; you can redistribute it and/or modify  */
     8 /* it under the terms of the GNU General Public License as published by  */
     9 /* the Free Software Foundation; either version 2 of the License, or     */
    10 /* (at your option) any later version.					 */
    11 /*									 */
    12 /* This program is distributed in the hope that it will be useful,       */
    13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of	 */
    14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the		 */
    15 /* GNU General Public License for more details.				 */
    16 /*									 */
    17 /* You should have received a copy of the GNU General Public License	 */
    18 /* along with this program. If not, see <http://www.gnu.org/licenses/>.	 */
    19 /*************************************************************************/
    20 
    21 #include <stdio.h>
    22 #include <stdlib.h>
    23 #include <string.h>
    24 #include <ctype.h>
    25 #ifdef __WIN32__
    26 #include <windows.h>
    27 #endif
    28 #include <glib.h>
    29 #include <bl/bl.h>
    30 #include "bookloupe.h"
    31 #include "counters.h"
    32 #include "pending.h"
    33 #include "HTMLentities.h"
    34 
/*
 * Character set the etext is expected to be restricted to. Maintained by
 * set_charset(), which stores an owned copy of the requested name (any of
 * the Unicode encoding names is stored as "UTF-8").
 */
gchar *charset;		/* Or NULL for auto (ISO_8859-1/ASCII or UNICODE) */
/*
 * Converter from UTF-8 to charset, opened by set_charset(). Stays at
 * (GIConv)-1 while charset is NULL or "UTF-8".
 */
GIConv charset_validator=(GIConv)-1;

/* NOTE(review): not referenced in this part of the file - confirm usage. */
gchar *prevline;
    39 
/*
 * Common typos.
 *
 * Every entry is lower case and the final entry is an empty string, which
 * presumably acts as the end-of-list marker (there is no NULL terminator).
 * Besides keyboard slips ("teh", "adn") the list holds what look like OCR
 * misreads, e.g. h/b swaps ("bave", "hefore") and "rn" for "m" ("hirn").
 */
char *typo[] = {
    "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
    "nad", "te", "ig", "acn",  "ahve", "alot", "anbd", "andt", "awya", "aywa",
    "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
    "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
    "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
    "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
    "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
    "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
    "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
    "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
    "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
    "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
    "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
    "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
    "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
    "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
    "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
    "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
    "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
    "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
    "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
    "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
    "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
    "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
    "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
    "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
    "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
    "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
    "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
    "se", ""
};
    73 
/*
 * User-defined typos, built by read_user_scannos(). Keys are lines of the
 * .typ file (owned by the tree and released with g_free); every value is
 * GINT_TO_POINTER(1). Stays NULL until that function has loaded a file;
 * main() checks for NULL before dropping its reference.
 */
GTree *usertypo;
    75 
/*
 * Common abbreviations and other OK words not to query as typos.
 * Entries are lower case; the list ends with an empty string.
 */
char *okword[] = {
    "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
    "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
    "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
    "outbid", "outbids", "frostbite", "frostbitten", ""
};
    83 
/*
 * Common abbreviations that cause otherwise unexplained periods.
 * Entries are lower case and written without the trailing period; the
 * list ends with an empty string.
 */
char *abbrev[] = {
    "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
    "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
};
    89 
/*
 * Two-Letter combinations that rarely if ever start words,
 * but are common scannos or otherwise common letter combinations.
 * The list ends with an empty string.
 */
char *nostart[] = {
    "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
};
    97 
/*
 * Two-Letter combinations that rarely if ever end words,
 * but are common scannos or otherwise common letter combinations.
 * The list ends with an empty string.
 */
char *noend[] = {
    "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
    "sw", "gr", "sl", "cl", "iy", ""
};
   106 
/*
 * HTML element names, lower case and without angle brackets, in
 * alphabetical order. The list ends with an empty string.
 * NOTE(review): presumably the tags recognised when looking for stray
 * HTML markup in the text - confirm against the code that scans it.
 */
char *markup[] = {
    "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
    "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
    "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
    "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
};
   113 
/*
 * Literal markup sequences, stored complete with their delimiters (unlike
 * markup[] above). The list ends with an empty string.
 * NOTE(review): presumably the DP-specific markup that the --dp option
 * ("Ignore DP-specific markup") refers to - confirm against
 * postprocess_for_DP().
 */
char *DPmarkup[] = {
    "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
};
   117 
   118 char *nocomma[] = {
   119     "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
   120     "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
   121     "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
   122     "during", "let", "toward", "among", ""
   123 };
   124 
/*
 * Lower-case words used by the period checks; the list ends with an empty
 * string. NOTE(review): judging by the name these are words that should
 * not normally be followed by a period - confirm against the code that
 * consults the list.
 */
char *noperiod[] = {
    "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
    "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
    "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
    "among", "those", "into", "whom", "having", "thence", ""
}; 
   131 
/*
 * Indexed by the *_SWITCH constants. Filled in by main() (defaults),
 * parse_config_file() and parse_options(), in that order.
 */
gboolean pswit[SWITNO];  /* program switches */
/*
 * Value of the "charset" option from the configuration file or command
 * line; NULL stands for "auto". parse_options() hands it to set_charset()
 * and then frees it.
 */
gchar *opt_charset;

/* Set by the Gutcheck compatibility options -t and -x respectively. */
gboolean typo_compat,paranoid_compat;
   136 
   137 static GOptionEntry options[]={
   138     { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
   139       "Ignore DP-specific markup", NULL },
   140     { "no-dp", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   141       G_OPTION_ARG_NONE, pswit+DP_SWITCH,
   142       "Don't ignore DP-specific markup", NULL },
   143     { "echo", 0, G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
   144       "Echo queried line", NULL },
   145     { "no-echo", 'e', G_OPTION_FLAG_REVERSE,
   146       G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
   147       "Don't echo queried line", NULL },
   148     { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
   149       "Check single quotes", NULL },
   150     { "no-squote", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   151       G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
   152       "Don't check single quotes", NULL },
   153     { "typo", 0, 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
   154       "Check common typos", NULL },
   155     { "no-typo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   156       G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
   157       "Don't check common typos", NULL },
   158     { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
   159       "Require closure of quotes on every paragraph", NULL },
   160     { "no-qpara", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   161       G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
   162       "Don't require closure of quotes on every paragraph", NULL },
   163     { "paranoid", 0, G_OPTION_FLAG_HIDDEN,
   164       G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
   165       "Enable paranoid querying of everything", NULL },
   166     { "no-paranoid", 0, G_OPTION_FLAG_REVERSE,
   167       G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
   168       "Disable paranoid querying of everything", NULL },
   169     { "line-end", 0, G_OPTION_FLAG_HIDDEN,
   170       G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
   171       "Enable line end checking", NULL },
   172     { "no-line-end", 'l', G_OPTION_FLAG_REVERSE,
   173       G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
   174       "Diable line end checking", NULL },
   175     { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
   176       "Overview: just show counts", NULL },
   177     { "no-overview", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   178       G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
   179       "Show individual warnings", NULL },
   180     { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
   181       "Output errors to stdout instead of stderr", NULL },
   182     { "no-stdout", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   183       G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
   184       "Output errors to stderr instead of stdout", NULL },
   185     { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
   186       "Echo header fields", NULL },
   187     { "no-header", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   188       G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
   189       "Don't echo header fields", NULL },
   190     { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
   191       "Ignore markup in < >", NULL },
   192     { "no-markup", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   193       G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
   194       "No special handling for markup in < >", NULL },
   195     { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
   196       "Use file of user-defined typos", NULL },
   197     { "no-usertypo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   198       G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
   199       "Ignore file of user-defined typos", NULL },
   200     { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
   201       "Verbose - list everything", NULL },
   202     { "no-verbose", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   203       G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
   204       "Switch off verbose mode", NULL },
   205     { "charset", 0, 0, G_OPTION_ARG_STRING, &opt_charset,
   206       "Set of characters valid for this ebook", "NAME" },
   207     { NULL }
   208 };
   209 
/*
 * Options relating to configuration which make no sense from inside
 * a configuration file.
 *
 * parse_options() adds these to the main option group alongside options[],
 * but the configuration file code only ever walks options[], so they are
 * neither read from nor written to bookloupe.ini.
 */

static GOptionEntry config_options[]={
    { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
      "Defaults for use on www upload", NULL },
    { "dump-config", 0, 0, G_OPTION_ARG_NONE, pswit+DUMP_CONFIG_SWITCH,
      "Dump current config settings", NULL },
    { NULL }
};
   222 
/*
 * Toggle-style switches kept for compatibility with Gutcheck. They are
 * registered as a separate "compatibility" option group and only record
 * that they were given; parse_options() applies the toggles to pswit[]
 * once the whole command line has been parsed.
 */
static GOptionEntry compatibility_options[]={
    { "toggle-typo", 't', 0, G_OPTION_ARG_NONE, &typo_compat,
      "Toggle checking for common typos", NULL },
    { "toggle-relaxed", 'x', 0, G_OPTION_ARG_NONE, &paranoid_compat,
      "Toggle both paranoid mode and common typos", NULL },
    { NULL }
};
   230 
/*
 * Running totals. main() prints the non-zero query counts, their sum and
 * the two line counts when OVERVIEW_SWITCH is set; cnt_spacend is not part
 * of that report.
 */
long cnt_quote;		/* for overview mode, count of quote queries */
long cnt_brack;		/* for overview mode, count of bracket queries */
long cnt_bin;		/* for overview mode, count of non-ASCII queries */
long cnt_odd;		/* for overview mode, count of odd character queries */
long cnt_long;		/* for overview mode, count of long line errors */
long cnt_short;		/* for overview mode, count of short line queries */
long cnt_punct;		/* for overview mode,
			   count of punctuation and spacing queries */
long cnt_dash;		/* for overview mode, count of dash-related queries */
long cnt_word;		/* for overview mode, count of word queries */
long cnt_html;		/* for overview mode, count of html queries */
long cnt_lineend;	/* for overview mode, count of line-end queries */
long cnt_spacend;	/* count of lines with space at end */
long linecnt;		/* count of total lines in the file */
long checked_linecnt;	/* count of lines actually checked */
   246 
/* Forward declarations; the definitions are not in this part of the file. */
void proghelp(GOptionContext *context);
void procfile(const char *);

/*
 * Directory part of argv[0], set by main(). Used as a fallback location
 * for bookloupe.ini and for the user typo files, and freed at the end of
 * main().
 */
gchar *running_from;

gboolean mixdigit(const char *);
gchar *getaword(const char **);
char *flgets(char **,long);
void postprocess_for_HTML(char *);
char *linehasmarkup(char *);
char *losemarkup(char *);
gboolean tagcomp(const char *,const char *);
void loseentities(char *);
gboolean isroman(const char *);
void postprocess_for_DP(char *);
/* Installed by read_etext() as GLib print handlers. */
void print_as_windows_1252(const char *string);
void print_as_utf_8(const char *string);

/* NOTE(review): not referenced in this part of the file - confirm usage. */
GTree *qword,*qperiod;
   266 
#ifdef __WIN32__
/*
 * Console output code page in force at startup. Recorded by main() and
 * put back by cleanup_on_exit(), since read_etext() changes the console
 * code page to match the etext.
 */
UINT saved_cp;
#endif
   270 
   271 gboolean set_charset(const char *name,GError **err)
   272 {
   273     /* The various UNICODE encodings all share the same character set. */
   274     const char *unicode_aliases[]={ "UCS-2", "UCS-2BE", "UCS-2LE", "UCS-4",
   275       "UCS-4BE", "UCS-4LE", "UCS2", "UCS4", "UNICODE", "UNICODEBIG",
   276       "UNICODELITTLE", "UTF-7", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE",
   277       "UTF-32", "UTF-32BE", "UTF-32LE", "UTF7", "UTF8", "UTF16", "UTF16BE",
   278       "UTF16LE", "UTF32", "UTF32BE", "UTF32LE" };
   279     int i;
   280     if (charset)
   281 	g_free(charset);
   282     if (charset_validator!=(GIConv)-1)
   283 	g_iconv_close(charset_validator);
   284     if (!name || !g_strcasecmp(name,"auto"))
   285     {
   286 	charset=NULL;
   287 	charset_validator=(GIConv)-1;
   288 	return TRUE;
   289     }
   290     else
   291 	charset=g_strdup(name);
   292     for(i=0;i<G_N_ELEMENTS(unicode_aliases);i++)
   293 	if (!g_strcasecmp(charset,unicode_aliases[i]))
   294 	{
   295 	    g_free(charset);
   296 	    charset=g_strdup("UTF-8");
   297 	    break;
   298 	}
   299     if (!strcmp(charset,"UTF-8"))
   300 	charset_validator=(GIConv)-1;
   301     else
   302     {
   303 	charset_validator=g_iconv_open(charset,"UTF-8");
   304 	if (charset_validator==(GIConv)-1)
   305 	{
   306 	    g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_NO_CONVERSION,
   307 	      "Unknown character set \"%s\"",charset);
   308 	    return FALSE;
   309 	}
   310     }
   311     return TRUE;
   312 }
   313 
/*
 * The configuration file as loaded by parse_config_file(), or NULL if no
 * bookloupe.ini was found. dump_config() creates an empty one on demand;
 * main() frees it before returning.
 */
GKeyFile *config;
   315 
   316 void config_file_update(GKeyFile *kf)
   317 {
   318     int i;
   319     const char *s;
   320     gboolean sw;
   321     for(i=0;options[i].long_name;i++)
   322     {
   323 	if (g_str_has_prefix(options[i].long_name,"no-"))
   324 	    continue;
   325 	if (options[i].arg==G_OPTION_ARG_NONE)
   326 	{
   327 	    sw=*(gboolean *)options[i].arg_data;
   328 	    if (options[i].flags&G_OPTION_FLAG_REVERSE)
   329 		sw=!sw;
   330 	    g_key_file_set_boolean(kf,"options",options[i].long_name,sw);
   331 	}
   332 	else if (options[i].arg==G_OPTION_ARG_STRING)
   333 	{
   334 	    s=*(gchar **)options[i].arg_data;
   335 	    if (!s)
   336 		s="auto";
   337 	    g_key_file_set_string(kf,"options",options[i].long_name,s);
   338 	}
   339 	else
   340 	    g_assert_not_reached();
   341     }
   342 }
   343 
   344 void config_file_add_comments(GKeyFile *kf)
   345 {
   346     int i;
   347     gchar *comment;
   348     g_key_file_set_comment(kf,NULL,NULL," Default configuration for bookloupe",
   349       NULL);
   350     for(i=0;options[i].long_name;i++)
   351     {
   352 	if (g_str_has_prefix(options[i].long_name,"no-"))
   353 	    continue;
   354 	comment=g_strconcat(" ",options[i].description,NULL);
   355 	g_key_file_set_comment(kf,"options",options[i].long_name,comment,NULL);
   356 	g_free(comment);
   357     }
   358 }
   359 
   360 void dump_config(void)
   361 {
   362     gchar *s;
   363     if (config)
   364 	config_file_update(config);
   365     else
   366     {
   367 	config=g_key_file_new();
   368 	config_file_update(config);
   369 	config_file_add_comments(config);
   370     }
   371     s=g_key_file_to_data(config,NULL,NULL);
   372     if (s)
   373 	g_print("%s",s);
   374     g_free(s);
   375 }
   376 
   377 GKeyFile *read_config_file(gchar **full_path)
   378 {
   379     int i;
   380     GError *err=NULL;
   381     gchar **search_dirs;
   382     gchar *path;
   383     const char *search_path;
   384     GKeyFile *kf;
   385     kf=g_key_file_new();
   386     search_path=g_getenv("BOOKLOUPE_CONFIG_PATH");
   387     if (search_path)
   388     {
   389 #ifdef __WIN32__
   390 	search_dirs=g_strsplit(search_path,";",0);
   391 #else
   392 	search_dirs=g_strsplit(search_path,":",0);
   393 #endif
   394     }
   395     else
   396     {
   397 	search_dirs=g_new(gchar *,4);
   398 	search_dirs[0]=g_get_current_dir();
   399 	search_dirs[1]=g_strdup(running_from);
   400 	search_dirs[2]=g_strdup(g_get_user_config_dir());
   401 	search_dirs[3]=NULL;
   402     }
   403     for(i=0;search_dirs[i];i++)
   404     {
   405 	path=g_build_filename(search_dirs[i],"bookloupe.ini",NULL);
   406 	if (g_key_file_load_from_file(kf,path,
   407 	  G_KEY_FILE_KEEP_COMMENTS|G_KEY_FILE_KEEP_TRANSLATIONS,&err))
   408 	    break;
   409 	if (!g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   410 	{
   411 	    g_printerr("Bookloupe: Error reading %s\n",path);
   412 	    g_printerr("%s\n",err->message);
   413 	    exit(1);
   414 	}
   415 	g_clear_error(&err);
   416 	g_free(path);
   417 	path=NULL;
   418     }
   419     if (!search_dirs[i])
   420     {
   421 	g_key_file_free(kf);
   422 	kf=NULL;
   423     }
   424     g_strfreev(search_dirs);
   425     if (full_path && kf)
   426 	*full_path=path;
   427     else
   428 	g_free(path);
   429     return kf;
   430 }
   431 
   432 void parse_config_file(void)
   433 {
   434     int i,j;
   435     gchar *path,*s;
   436     gchar **keys;
   437     gboolean sw;
   438     GError *err=NULL;
   439     config=read_config_file(&path);
   440     if (config)
   441 	keys=g_key_file_get_keys(config,"options",NULL,NULL);
   442     else
   443 	keys=NULL;
   444     if (keys)
   445     {
   446 	for(i=0;keys[i];i++)
   447 	{
   448 	    for(j=0;options[j].long_name;j++)
   449 	    {
   450 		if (g_str_has_prefix(options[j].long_name,"no-"))
   451 		    continue;
   452 		else if (!strcmp(keys[i],options[j].long_name))
   453 		{
   454 		    if (options[j].arg==G_OPTION_ARG_NONE)
   455 		    {
   456 			sw=g_key_file_get_boolean(config,"options",keys[i],
   457 			  &err);
   458 			if (err)
   459 			{
   460 			    g_printerr("Bookloupe: %s: options.%s: %s\n",
   461 			      path,keys[i],err->message);
   462 			    g_clear_error(&err);
   463 			}
   464 			else
   465 			{
   466 			    if (options[j].flags&G_OPTION_FLAG_REVERSE)
   467 				sw=!sw;
   468 			    *(gboolean *)options[j].arg_data=sw;
   469 			}
   470 			break;
   471 		    }
   472 		    else if (options[j].arg==G_OPTION_ARG_STRING)
   473 		    {
   474 			s=g_key_file_get_string(config,"options",keys[i],
   475 			  &err);
   476 			if (err)
   477 			{
   478 			    g_printerr("Bookloupe: %s: options.%s: %s\n",
   479 			      path,keys[i],err->message);
   480 			    g_clear_error(&err);
   481 			}
   482 			else
   483 			{
   484 			    g_free(*(gchar **)options[j].arg_data);
   485 			    if (!g_strcmp0(s,"auto"))
   486 			    {
   487 				*(gchar **)options[j].arg_data=NULL;
   488 				g_free(s);
   489 			    }
   490 			    else
   491 				*(gchar **)options[j].arg_data=s;
   492 			}
   493 			break;
   494 		    }
   495 		    else
   496 			g_assert_not_reached();
   497 		}
   498 	    }
   499 	    if (!options[j].long_name)
   500 		g_printerr("Bookloupe: %s: Unknown option \"%s\" ignored\n",
   501 		  path,keys[i]);
   502 	}
   503 	g_strfreev(keys);
   504     }
   505     if (config)
   506 	g_free(path);
   507 }
   508 
   509 void parse_options(int *argc,char ***argv)
   510 {
   511     GError *err=NULL;
   512     GOptionContext *context;
   513     GOptionGroup *compatibility;
   514     context=g_option_context_new(
   515       "file - look for errors in Project Gutenberg(TM) etexts");
   516     g_option_context_add_main_entries(context,options,NULL);
   517     g_option_context_add_main_entries(context,config_options,NULL);
   518     compatibility=g_option_group_new("compatibility",
   519       "Options for Compatibility with Gutcheck:",
   520       "Show compatibility options",NULL,NULL);
   521     g_option_group_add_entries(compatibility,compatibility_options);
   522     g_option_context_add_group(context,compatibility);
   523     g_option_context_set_description(context,
   524       "For simplicity, only the switch options which reverse the\n"
   525       "default configuration are listed. In most cases, both vanilla\n"
   526       "and \"no-\" prefixed versions are available for use.");
   527     if (!g_option_context_parse(context,argc,argv,&err))
   528     {
   529 	g_printerr("Bookloupe: %s\n",err->message);
   530 	g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
   531 	exit(1);
   532     }
   533     if (typo_compat)
   534 	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
   535     if (paranoid_compat)
   536     {
   537 	pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
   538 	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
   539     }
   540     /*
   541      * Web uploads - for the moment, this is really just a placeholder
   542      * until we decide what processing we really want to do on web uploads
   543      */
   544     if (pswit[WEB_SWITCH])
   545     {
   546 	/* specific override for web uploads */
   547 	pswit[ECHO_SWITCH]=TRUE;
   548 	pswit[SQUOTE_SWITCH]=FALSE;
   549 	pswit[TYPO_SWITCH]=TRUE;
   550 	pswit[QPARA_SWITCH]=FALSE;
   551 	pswit[PARANOID_SWITCH]=TRUE;
   552 	pswit[LINE_END_SWITCH]=FALSE;
   553 	pswit[OVERVIEW_SWITCH]=FALSE;
   554 	pswit[STDOUT_SWITCH]=FALSE;
   555 	pswit[HEADER_SWITCH]=TRUE;
   556 	pswit[VERBOSE_SWITCH]=FALSE;
   557 	pswit[MARKUP_SWITCH]=FALSE;
   558 	pswit[USERTYPO_SWITCH]=FALSE;
   559 	pswit[DP_SWITCH]=FALSE;
   560     }
   561     if (opt_charset && !set_charset(opt_charset,&err))
   562     {
   563 	g_printerr("%s\n",err->message);
   564 	exit(1);
   565     }
   566     if (pswit[DUMP_CONFIG_SWITCH])
   567     {
   568 	dump_config();
   569 	exit(0);
   570     }
   571     g_free(opt_charset);
   572     opt_charset=NULL;
   573     if (pswit[OVERVIEW_SWITCH])
   574 	/* just print summary; don't echo */
   575 	pswit[ECHO_SWITCH]=FALSE;
   576     if (*argc<2)
   577     {
   578 	proghelp(context);
   579 	exit(1);
   580     }
   581     g_option_context_free(context);
   582 }
   583 
   584 /*
   585  * read_user_scannos:
   586  *
   587  * Read in the user-defined stealth scanno list.
   588  */
   589 void read_user_scannos(void)
   590 {
   591     GError *err=NULL;
   592     gchar *usertypo_file;
   593     gboolean okay;
   594     int i;
   595     gsize len,nb;
   596     gchar *contents,*utf8,**lines;
   597     usertypo_file=g_strdup("bookloupe.typ");
   598     okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   599     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   600     {
   601 	g_clear_error(&err);
   602 	g_free(usertypo_file);
   603 	usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
   604 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   605     }
   606     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   607     {
   608 	g_clear_error(&err);
   609 	g_free(usertypo_file);
   610 	usertypo_file=g_strdup("gutcheck.typ");
   611 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   612     }
   613     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   614     {
   615 	g_clear_error(&err);
   616 	g_free(usertypo_file);
   617 	usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
   618 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   619     }
   620     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   621     {
   622 	g_free(usertypo_file);
   623 	g_print("   --> I couldn't find bookloupe.typ "
   624 	  "-- proceeding without user typos.\n");
   625 	return;
   626     }
   627     else if (!okay)
   628     {
   629 	fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
   630 	g_free(usertypo_file);
   631 	g_clear_error(&err);
   632 	exit(1);
   633     }
   634     if (g_utf8_validate(contents,len,NULL))
   635     {
   636 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
   637 	if (!charset)
   638 	    (void)set_charset("UNICODE",NULL);
   639     }
   640     else
   641 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
   642     g_free(contents);
   643     lines=g_strsplit_set(utf8,"\r\n",0);
   644     g_free(utf8);
   645     usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
   646     for (i=0;lines[i];i++)
   647 	if (*(unsigned char *)lines[i]>'!')
   648 	    g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
   649 	else
   650 	    g_free(lines[i]);
   651     g_free(lines);
   652 }
   653 
   654 /*
   655  * read_etext:
   656  *
   657  * Read an etext returning a newly allocated string containing the file
   658  * contents or NULL on error.
   659  */
   660 gchar *read_etext(const char *filename,GError **err)
   661 {
   662     GError *tmp_err=NULL;
   663     gchar *contents,*utf8;
   664     gsize len,bytes_read,bytes_written;
   665     int i,line,col;
   666     if (!g_file_get_contents(filename,&contents,&len,err))
   667 	return NULL;
   668     if (g_utf8_validate(contents,len,NULL))
   669     {
   670 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
   671 	g_set_print_handler(print_as_utf_8);
   672 #ifdef __WIN32__
   673 	SetConsoleOutputCP(CP_UTF8);
   674 #endif
   675     }
   676     else
   677     {
   678 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,
   679 	  &bytes_written,&tmp_err);
   680 	if (g_error_matches(tmp_err,G_CONVERT_ERROR,
   681 	  G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
   682 	{
   683 	    line=col=1;
   684 	    for(i=0;i<bytes_read;i++)
   685 		if (contents[i]=='\n')
   686 		{
   687 		    line++;
   688 		    col=1;
   689 		}
   690 		else if (contents[i]!='\r')
   691 		    col++;
   692 	    g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
   693 	      "Input conversion failed. Byte %d at line %d, column %d is not a "
   694 	      "valid Windows-1252 character",
   695 	      ((unsigned char *)contents)[bytes_read],line,col);
   696 	}
   697 	else if (tmp_err)
   698 	    g_propagate_error(err,tmp_err);
   699 	g_set_print_handler(print_as_windows_1252);
   700 #ifdef __WIN32__
   701 	SetConsoleOutputCP(1252);
   702 #endif
   703     }
   704     g_free(contents);
   705     return utf8;
   706 }
   707 
/*
 * cleanup_on_exit:
 *
 * Registered with atexit() by main() on Windows: puts the console output
 * code page back to what it was at startup. Does nothing elsewhere.
 */
void cleanup_on_exit(void)
{
#ifdef __WIN32__
    SetConsoleOutputCP(saved_cp);
#endif
}
   714 
   715 int main(int argc,char **argv)
   716 {
   717 #ifdef __WIN32__
   718     atexit(cleanup_on_exit);
   719     saved_cp=GetConsoleOutputCP();
   720 #endif
   721     running_from=g_path_get_dirname(argv[0]);
   722     /* Paranoid checking is turned OFF, not on, by its switch */
   723     pswit[PARANOID_SWITCH]=TRUE;
   724     /* if running in paranoid mode, typo checks default to enabled */
   725     pswit[TYPO_SWITCH]=TRUE;
   726     /* Line-end checking is turned OFF, not on, by its switch */
   727     pswit[LINE_END_SWITCH]=TRUE;
   728     /* Echoing is turned OFF, not on, by its switch */
   729     pswit[ECHO_SWITCH]=TRUE;
   730     parse_config_file();
   731     parse_options(&argc,&argv);
   732     if (pswit[USERTYPO_SWITCH])
   733 	read_user_scannos();
   734     fprintf(stderr,"bookloupe: Check and report on an e-text\n");
   735     procfile(argv[1]);
   736     if (pswit[OVERVIEW_SWITCH])
   737     {
   738 	g_print("    Checked %ld lines of %ld (head+foot = %ld)\n\n",
   739 	  checked_linecnt,linecnt,linecnt-checked_linecnt);
   740 	g_print("    --------------- Queries found --------------\n");
   741 	if (cnt_long)
   742 	    g_print("    Long lines:		    %14ld\n",cnt_long);
   743 	if (cnt_short)
   744 	    g_print("    Short lines:		   %14ld\n",cnt_short);
   745 	if (cnt_lineend)
   746 	    g_print("    Line-end problems:	     %14ld\n",cnt_lineend);
   747 	if (cnt_word)
   748 	    g_print("    Common typos:		  %14ld\n",cnt_word);
   749 	if (cnt_quote)
   750 	    g_print("    Unmatched quotes:	      %14ld\n",cnt_quote);
   751 	if (cnt_brack)
   752 	    g_print("    Unmatched brackets:	    %14ld\n",cnt_brack);
   753 	if (cnt_bin)
   754 	    g_print("    Non-ASCII characters:	  %14ld\n",cnt_bin);
   755 	if (cnt_odd)
   756 	    g_print("    Proofing characters:	   %14ld\n",cnt_odd);
   757 	if (cnt_punct)
   758 	    g_print("    Punctuation & spacing queries: %14ld\n",cnt_punct);
   759 	if (cnt_dash)
   760 	    g_print("    Non-standard dashes:	   %14ld\n",cnt_dash);
   761 	if (cnt_html)
   762 	    g_print("    Possible HTML tags:	    %14ld\n",cnt_html);
   763 	g_print("\n");
   764 	g_print("    TOTAL QUERIES		  %14ld\n",
   765 	  cnt_quote+cnt_brack+cnt_bin+cnt_odd+cnt_long+cnt_short+cnt_punct+
   766 	  cnt_dash+cnt_word+cnt_html+cnt_lineend);
   767     }
   768     g_free(running_from);
   769     if (usertypo)
   770 	g_tree_unref(usertypo);
   771     set_charset(NULL,NULL);
   772     if (config)
   773 	g_key_file_free(config);
   774     return 0;
   775 }
   776 
   777 void count_dashes(const char *line,const char *dash,
   778   struct dash_results *results)
   779 {
   780     int i;
   781     gchar **tokens;
   782     gunichar pc,nc;
   783     gboolean spaced=FALSE,unspaced=FALSE,spaced2=FALSE;
   784     if (!*line)
   785 	return;
   786     tokens=g_strsplit(line,dash,0);
   787     if (tokens[1])
   788 	results->base++;
   789     for(i=1;tokens[i];i++)
   790     {
   791 	pc=g_utf8_get_char(g_utf8_prev_char(tokens[i-1]+strlen(tokens[i-1])));
   792 	nc=g_utf8_get_char(tokens[i]);
   793 	if (g_unichar_isspace(pc) || g_unichar_isspace(nc))
   794 	    spaced=TRUE;
   795 	if (g_unichar_isspace(pc) && g_unichar_isspace(nc))
   796 	    spaced2=TRUE;
   797 	else if (!g_unichar_isspace(pc) && !g_unichar_isspace(nc))
   798 	    unspaced=TRUE;
   799     }
   800     if (spaced)
   801 	results->space++;
   802     if (spaced2)
   803 	/* count of lines with em-dashes with spaces both sides */
   804 	results->non_PG_space++;
   805     if (unspaced)
   806 	/* count of lines with PG-type em-dashes with no spaces */
   807 	results->PG_space++;
   808     g_strfreev(tokens);
   809 }
   810 
/*
 * first_pass:
 *
 * Run a first pass - verify that it's a valid PG
 * file, decide whether to report some things that
 * occur many times in the text like long or short
 * lines, non-standard dashes, etc.
 *
 * <etext> is the whole text as one NUL-terminated string; it is split
 * into lines on '\n' and any trailing '\r' is stripped from each line.
 *
 * Side effects: the global linecnt is incremented for every line and
 * cnt_spacend for every line ending in a character <= CHAR_SPACE.
 *
 * Returns: a pointer to a static results structure. The structure is
 * zero-initialised once and is not reset on entry, so the statistics
 * accumulate if the function is called more than once.
 */
struct first_pass_results *first_pass(const char *etext)
{
    gunichar laststart=CHAR_SPACE;
    const char *s;
    gchar *lc_line;
    int i,j,lbytes,llen;
    gchar **lines;
    /* lengths (in characters) of the previous line and the one before it */
    unsigned int lastlen=0,lastblen=0;
    /*
     * Line number of the first line after an old-style ("SMALL PRINT")
     * header and after a new-style ("*** START") header respectively;
     * 0 while no such header has been seen.
     */
    long spline=0,nspline=0;
    static struct first_pass_results results={0};
    struct dash_results tmp_dash_results;
    gchar *inword;
    QuoteClass qc;
    lines=g_strsplit(etext,"\n",0);
    for (j=0;lines[j];j++)
    {
	/* lbytes is the length in bytes, llen the length in characters */
	lbytes=strlen(lines[j]);
	while (lbytes>0 && lines[j][lbytes-1]=='\r')
	    lines[j][--lbytes]='\0';
	llen=g_utf8_strlen(lines[j],lbytes);
	linecnt++;
	/* Old-style header: ends with a "*END ... SMALL PRINT ..." line */
	if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
	  (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
	{
	    if (spline)
		g_print("   --> Duplicate header?\n");
	    spline=linecnt+1;   /* first line of non-header text, that is */
	}
	/* New-style header: "*** START ... PROJECT GUTENBERG ..." */
	if (!strncmp(lines[j],"*** START",9) &&
	  strstr(lines[j],"PROJECT GUTENBERG"))
	{
	    if (nspline)
		g_print("   --> Duplicate header?\n");
	    nspline=linecnt+1;   /* first line of non-header text, that is */
	}
	/*
	 * Only once a header has been seen do we look for the footer:
	 * a line in which (case-insensitively) "end" comes before
	 * "project gutenberg".
	 */
	if (spline || nspline)
	{
	    lc_line=g_utf8_strdown(lines[j],lbytes);
	    if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
	    {
		if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
		{
		    if (results.footerline)
		    {
			/* it's an old-form header - we can detect duplicates */
			if (!nspline)
			    g_print("   --> Duplicate footer?\n");
		    }
		    else
			results.footerline=linecnt;
		}
	    }
	    g_free(lc_line);
	}
	if (spline)
	    results.firstline=spline;
	if (nspline)
	    results.firstline=nspline;  /* override with new */
	/*
	 * From the footer line onwards (the footer line included) nothing
	 * below is counted.
	 */
	if (results.footerline)
	    continue;    /* don't count the boilerplate in the footer */
	results.totlen+=llen;
	/*
	 * Per-character statistics: characters above 127, alphabetic
	 * characters, and closing or neutral double quotes that directly
	 * follow a letter (not tested for the first character of a line,
	 * which has no predecessor).
	 */
	for (s=lines[j];*s;s=g_utf8_next_char(s))
	{
	    if (g_utf8_get_char(s)>127)
		results.binlen++;
	    if (g_unichar_isalpha(g_utf8_get_char(s)))
		results.alphalen++;
	    if (s>lines[j])
	    {
		if (CHAR_IS_DQUOTE(g_utf8_get_char(s)))
		    qc=QUOTE_CLASS(g_utf8_get_char(s));
		else
		    qc=INVALID_QUOTE;
		if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) &&
		  g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
		    results.endquote_count++;
	    }
	}
	/*
	 * The previous line was short, the one before it was not, and the
	 * previous line did not start with a space: count a short line.
	 */
	if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
	  lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
	    results.shortline++;
	/* trailing white space (or control character) at end of line */
	if (lbytes>0 &&
	  g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
	    cnt_spacend++;
	if (strstr(lines[j],".,"))
	    results.dotcomma++;
	/* only count ast lines for ignoring purposes where there is */
	/* locase text on the line */
	if (strchr(lines[j],'*'))
	{
	    for (s=lines[j];*s;s=g_utf8_next_char(s))
		if (g_unichar_islower(g_utf8_get_char(s)))
		    break;
	    if (*s)
		results.astline++;
	}
	if (strchr(lines[j],'/'))
	    results.fslashline++;
	/*
	 * Hyphen at end of line: skip back over trailing characters
	 * <= CHAR_SPACE, then count the line if what remains ends in a
	 * single '-' (not "--") that lies beyond the second character.
	 */
	if (lbytes>0)
	{
	    for (s=g_utf8_prev_char(lines[j]+lbytes);
	      s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;
	      s=g_utf8_prev_char(s))
		;
	    if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
	      g_utf8_get_char(g_utf8_prev_char(s))!='-')
		results.hyphens++;
	}
	if (llen>LONGEST_PG_LINE)
	    results.longline++;
	if (llen>WAY_TOO_LONG)
	    results.verylongline++;
	/*
	 * Crude HTML detection: the first '>' on the line comes after the
	 * first '<'. A literal "<i>" adds extra weight.
	 */
	if (strchr(lines[j],'<') && strchr(lines[j],'>'))
	{
	    i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
	    if (i>0)
		results.htmcount++;
	    if (strstr(lines[j],"<i>"))
		results.htmcount+=4; /* bonus marks! */
	}
	/* Check for spaced em-dashes */
	/*
	 * Both the ASCII "--" and the real em-dash are fed into the same
	 * temporary counters, so that each line contributes at most one
	 * to each of the per-file em-dash totals.
	 */
	memset(&tmp_dash_results,0,sizeof(tmp_dash_results));
	count_dashes(lines[j],"--",&tmp_dash_results);
	count_dashes(lines[j],"—",&tmp_dash_results);
	if (tmp_dash_results.base)
	    results.emdash.base++;
	if (tmp_dash_results.non_PG_space)
	    results.emdash.non_PG_space++;
	if (tmp_dash_results.PG_space)
	    results.emdash.PG_space++;
	/*
	 * Word-by-word scan for language hints and standalone digits.
	 * The loop has no step of its own: getaword() is handed &s and
	 * is relied upon to advance it. The word it returns is freed here.
	 */
	for (s=lines[j];*s;)
	{
	    inword=getaword(&s);
	    if (!strcmp(inword,"hij") || !strcmp(inword,"niet")) 
		results.Dutchcount++;
	    if (!strcmp(inword,"dans") || !strcmp(inword,"avec")) 
		results.Frenchcount++;
	    if (!strcmp(inword,"0") || !strcmp(inword,"1")) 
		results.standalone_digit++;
	    g_free(inword);
	}
	/* Check for spaced dashes */
	/* only the first " -" on the line is examined; " --" doesn't count */
	if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
	    results.spacedash++;
	/* remember this line's properties for the short line test above */
	lastblen=lastlen;
	lastlen=llen;
	/* first byte of the line; only ever compared with CHAR_SPACE */
	laststart=lines[j][0];
    }
    g_strfreev(lines);
    return &results;
}
   970 
   971 /*
   972  * report_first_pass:
   973  *
   974  * Make some snap decisions based on the first pass results.
   975  */
   976 struct warnings *report_first_pass(struct first_pass_results *results)
   977 {
   978     static struct warnings warnings={0};
   979     if (cnt_spacend>0)
   980 	g_print("   --> %ld lines in this file have white space at end\n",
   981 	  cnt_spacend);
   982     warnings.dotcomma=1;
   983     if (results->dotcomma>5)
   984     {
   985 	warnings.dotcomma=0;
   986 	g_print("   --> %ld lines in this file contain '.,'. "
   987 	  "Not reporting them.\n",results->dotcomma);
   988     }
   989     /*
   990      * If more than 50 lines, or one-tenth, are short,
   991      * don't bother reporting them.
   992      */
   993     warnings.shortline=1;
   994     if (results->shortline>50 || results->shortline*10>linecnt)
   995     {
   996 	warnings.shortline=0;
   997 	g_print("   --> %ld lines in this file are short. "
   998 	  "Not reporting short lines.\n",results->shortline);
   999     }
  1000     /*
  1001      * If more than 50 lines, or one-tenth, are long,
  1002      * don't bother reporting them.
  1003      */
  1004     warnings.longline=1;
  1005     if (results->longline>50 || results->longline*10>linecnt)
  1006     {
  1007 	warnings.longline=0;
  1008 	g_print("   --> %ld lines in this file are long. "
  1009 	  "Not reporting long lines.\n",results->longline);
  1010     }
  1011     /* If more than 10 lines contain asterisks, don't bother reporting them. */
  1012     warnings.ast=1;
  1013     if (results->astline>10)
  1014     {
  1015 	warnings.ast=0;
  1016 	g_print("   --> %ld lines in this file contain asterisks. "
  1017 	  "Not reporting them.\n",results->astline);
  1018     }
  1019     /*
  1020      * If more than 10 lines contain forward slashes,
  1021      * don't bother reporting them.
  1022      */
  1023     warnings.fslash=1;
  1024     if (results->fslashline>10)
  1025     {
  1026 	warnings.fslash=0;
  1027 	g_print("   --> %ld lines in this file contain forward slashes. "
  1028 	  "Not reporting them.\n",results->fslashline);
  1029     }
  1030     /*
  1031      * If more than 20 lines contain unpunctuated endquotes,
  1032      * don't bother reporting them.
  1033      */
  1034     warnings.endquote=1;
  1035     if (results->endquote_count>20)
  1036     {
  1037 	warnings.endquote=0;
  1038 	g_print("   --> %ld lines in this file contain unpunctuated endquotes. "
  1039 	  "Not reporting them.\n",results->endquote_count);
  1040     }
  1041     /*
  1042      * If more than 15 lines contain standalone digits,
  1043      * don't bother reporting them.
  1044      */
  1045     warnings.digit=1;
  1046     if (results->standalone_digit>10)
  1047     {
  1048 	warnings.digit=0;
  1049 	g_print("   --> %ld lines in this file contain standalone 0s and 1s. "
  1050 	  "Not reporting them.\n",results->standalone_digit);
  1051     }
  1052     /*
  1053      * If more than 20 lines contain hyphens at end,
  1054      * don't bother reporting them.
  1055      */
  1056     warnings.hyphen=1;
  1057     if (results->hyphens>20)
  1058     {
  1059 	warnings.hyphen=0;
  1060 	g_print("   --> %ld lines in this file have hyphens at end. "
  1061 	  "Not reporting them.\n",results->hyphens);
  1062     }
  1063     if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
  1064     {
  1065 	g_print("   --> Looks like this is HTML. Switching HTML mode ON.\n");
  1066 	pswit[MARKUP_SWITCH]=1;
  1067     }
  1068     if (results->verylongline>0)
  1069 	g_print("   --> %ld lines in this file are VERY long!\n",
  1070 	  results->verylongline);
  1071     /*
  1072      * If there are more non-PG spaced dashes than PG em-dashes,
  1073      * assume it's deliberate.
  1074      * Current PG guidelines say don't use them, but older texts do,
  1075      * and some people insist on them whatever the guidelines say.
  1076      */
  1077     warnings.dash=1;
  1078     if (results->spacedash+results->emdash.non_PG_space>
  1079       results->emdash.PG_space)
  1080     {
  1081 	warnings.dash=0;
  1082 	g_print("   --> There are %ld spaced dashes and em-dashes. "
  1083 	  "Not reporting them.\n",
  1084 	  results->spacedash+results->emdash.non_PG_space);
  1085     }
  1086     if (charset)
  1087 	warnings.bin=0;
  1088     else
  1089     {
  1090 	/* Charset ISO_8859-1/ASCII checks for compatibility with gutcheck */
  1091 	warnings.bin=1;
  1092 	/* If more than a quarter of characters are hi-bit, bug out. */
  1093 	if (results->binlen*4>results->totlen)
  1094 	{
  1095 	    g_print("   --> This file does not appear to be ASCII. "
  1096 	      "Terminating. Best of luck with it!\n");
  1097 	    exit(1);
  1098 	}
  1099 	if (results->alphalen*4<results->totlen)
  1100 	{
  1101 	    g_print("   --> This file does not appear to be text. "
  1102 	      "Terminating. Best of luck with it!\n");
  1103 	    exit(1);
  1104 	}
  1105 	if (results->binlen*100>results->totlen || results->binlen>100)
  1106 	{
  1107 	    g_print("   --> There are a lot of foreign letters here. "
  1108 	      "Not reporting them.\n");
  1109 	    if (!pswit[VERBOSE_SWITCH])
  1110 		warnings.bin=0;
  1111 	}
  1112     }
  1113     warnings.isDutch=FALSE;
  1114     if (results->Dutchcount>50)
  1115     {
  1116 	warnings.isDutch=TRUE;
  1117 	g_print("   --> This looks like Dutch - "
  1118 	  "switching off dashes and warnings for 's Middags case.\n");
  1119     }
  1120     warnings.isFrench=FALSE;
  1121     if (results->Frenchcount>50)
  1122     {
  1123 	warnings.isFrench=TRUE;
  1124 	g_print("   --> This looks like French - "
  1125 	  "switching off some doublepunct.\n");
  1126     }
  1127     if (results->firstline && results->footerline)
  1128 	g_print("    The PG header and footer appear to be already on.\n");
  1129     else
  1130     {
  1131 	if (results->firstline)
  1132 	    g_print("    The PG header is on - no footer.\n");
  1133 	if (results->footerline)
  1134 	    g_print("    The PG footer is on - no header.\n");
  1135     }
  1136     g_print("\n");
  1137     if (pswit[VERBOSE_SWITCH])
  1138     {
  1139 	warnings.shortline=1;
  1140 	warnings.dotcomma=1;
  1141 	warnings.longline=1;
  1142 	warnings.dash=1;
  1143 	warnings.digit=1;
  1144 	warnings.ast=1;
  1145 	warnings.fslash=1;
  1146 	warnings.hyphen=1;
  1147 	warnings.endquote=1;
  1148 	g_print("   *** Verbose output is ON -- you asked for it! ***\n");
  1149     }
  1150     if (warnings.isDutch)
  1151 	warnings.dash=0;
  1152     if (results->footerline>0 && results->firstline>0 &&
  1153       results->footerline>results->firstline &&
  1154       results->footerline-results->firstline<100)
  1155     {
  1156 	g_print("   --> I don't really know where this text starts. \n");
  1157 	g_print("       There are no reference points.\n");
  1158 	g_print("       I'm going to have to report the header and footer "
  1159 	  "as well.\n");
  1160 	results->firstline=0;
  1161     }
  1162     return &warnings;
  1163 }
  1164 
  1165 /*
  1166  * analyse_quotes:
  1167  *
  1168  * Look along the line, accumulate the count of quotes, and see
  1169  * if this is an empty line - i.e. a line with nothing on it
  1170  * but spaces.
  1171  * If line has just spaces, period, * and/or - on it, don't
  1172  * count it, since empty lines with asterisks or dashes to
  1173  * separate sections are common.
  1174  *
  1175  * Returns: TRUE if the line is empty.
  1176  */
  1177 gboolean analyse_quotes(const char *aline,struct counters *counters)
  1178 {
  1179     int guessquote=0;
  1180     /* assume the line is empty until proven otherwise */
  1181     gboolean isemptyline=TRUE;
  1182     const char *s=aline,*sprev,*snext;
  1183     gunichar c;
  1184     sprev=NULL;
  1185     GError *tmp_err=NULL;
  1186     while (*s)
  1187     {
  1188 	snext=g_utf8_next_char(s);
  1189 	c=g_utf8_get_char(s);
  1190 	if (CHAR_IS_DQUOTE(c))
  1191 	    (void)count_quote(counters,c,QUOTE_CLASS(c),&tmp_err);
  1192 	else if (CHAR_IS_SQUOTE(c) && pswit[SQUOTE_SWITCH])
  1193 	{
  1194 	    if (s==aline)
  1195 	    {
  1196 		/*
  1197 		 * At start of line, it can only be a quotation mark.
  1198 		 * Hardcode a very common exception!
  1199 		 */
  1200 		if (!g_str_has_prefix(snext,"tis") &&
  1201 		  !g_str_has_prefix(snext,"Tis"))
  1202 		    (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
  1203 	    }
  1204 	    else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
  1205 	      g_unichar_isalpha(g_utf8_get_char(snext)))
  1206 		/* Do nothing! it's definitely an apostrophe, not a quote */
  1207 		;
  1208 	    /* it's outside a word - let's check it out */
  1209 	    else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||
  1210 	      g_unichar_isalpha(g_utf8_get_char(snext)))
  1211 	    {
  1212 		/* certainly looks like a quotation mark */
  1213 		if (!g_str_has_prefix(snext,"tis") &&
  1214 		  !g_str_has_prefix(snext,"Tis"))
  1215 		    /* hardcode a very common exception! */
  1216 		{
  1217 		    if (strchr(".?!,;:",g_utf8_get_char(sprev)))
  1218 			(void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
  1219 		    else
  1220 			(void)count_quote(counters,c,OPENING_QUOTE,&tmp_err);
  1221 		}
  1222 	    }
  1223 	    else
  1224 	    {
  1225 		/* now - is it a quotation mark? */
  1226 		guessquote=0;   /* accumulate clues */
  1227 		if (g_unichar_isalpha(g_utf8_get_char(sprev)))
  1228 		{
  1229 		    /* it follows a letter - could be either */
  1230 		    guessquote++;
  1231 		    if (g_utf8_get_char(sprev)=='s')
  1232 		    {
  1233 			/* looks like a plural apostrophe */
  1234 			guessquote-=3;
  1235 			if (g_utf8_get_char(snext)==CHAR_SPACE)
  1236 			    /* bonus marks! */
  1237 			    guessquote-=2;
  1238 		    }
  1239 		    if (innermost_quote_matches(counters,c))
  1240 			/*
  1241 			 * Give it the benefit of some doubt,
  1242 			 * if a squote is already open.
  1243 			 */
  1244 			guessquote++;
  1245 		    else
  1246 			guessquote--;
  1247 		    if (guessquote>=0)
  1248 			(void)count_quote(counters,c,CLOSING_QUOTE,&tmp_err);
  1249 		}
  1250 		else
  1251 		    /* no adjacent letter - it must be a quote of some kind */
  1252 		    (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
  1253 	    }
  1254 	}
  1255 	if (tmp_err)
  1256 	{
  1257 	    if (pswit[ECHO_SWITCH])
  1258 		g_print("\n%s\n",aline);
  1259 	    if (!pswit[OVERVIEW_SWITCH])
  1260 		g_print("    Line %ld column %ld - %s\n",
  1261 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1,tmp_err->message);
  1262 	    g_clear_error(&tmp_err);
  1263 	}
  1264 	if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
  1265 	  c!='\r' && c!='\n')
  1266 	    isemptyline=FALSE;  /* ignore lines like  *  *  *  as spacers */
  1267 	if (c==CHAR_UNDERSCORE)
  1268 	    counters->c_unders++;
  1269 	if (c==CHAR_OPEN_SBRACK)
  1270 	{
  1271 	    if (!matching_difference(counters,COUNTER_ILLUSTRATION) &&
  1272 	      !matching_difference(counters,c) && s==aline &&
  1273 	      g_str_has_prefix(s,"[Illustration:"))
  1274 		increment_matching(counters,COUNTER_ILLUSTRATION,TRUE);
  1275 	    else
  1276 		increment_matching(counters,c,TRUE);
  1277 	}
  1278 	else if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK)
  1279 	    increment_matching(counters,c,TRUE);
  1280 	if (c==CHAR_CLOSE_SBRACK)
  1281 	{
  1282 	    if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) &&
  1283 	      !matching_difference(counters,c) && !*snext)
  1284 		increment_matching(counters,COUNTER_ILLUSTRATION,FALSE);
  1285 	    else
  1286 		increment_matching(counters,c,FALSE);
  1287 	}
  1288 	else if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK)
  1289 	    increment_matching(counters,c,FALSE);
  1290 	sprev=s;
  1291 	s=snext;
  1292     }
  1293     return isemptyline;
  1294 }
  1295 
  1296 /*
  1297  * check_for_control_characters:
  1298  *
  1299  * Check for invalid or questionable characters in the line
  1300  * Anything above 127 is invalid for plain ASCII, and
  1301  * non-printable control characters should also be flagged.
  1302  * Tabs should generally not be there.
  1303  */
  1304 void check_for_control_characters(const char *aline)
  1305 {
  1306     gunichar c;
  1307     const char *s;
  1308     for (s=aline;*s;s=g_utf8_next_char(s))
  1309     {
  1310 	c=g_utf8_get_char(s);
  1311 	if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
  1312 	{
  1313 	    if (pswit[ECHO_SWITCH])
  1314 		g_print("\n%s\n",aline);
  1315 	    if (!pswit[OVERVIEW_SWITCH])
  1316 		g_print("    Line %ld column %ld - Control character %u\n",
  1317 		  linecnt,g_utf8_pointer_to_offset(s,aline)+1,c);
  1318 	    else
  1319 		cnt_bin++;
  1320 	}
  1321     }
  1322 }
  1323 
  1324 /*
  1325  * check_for_odd_characters:
  1326  *
  1327  * Check for binary and other odd characters.
  1328  */
  1329 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
  1330   gboolean isemptyline)
  1331 {
  1332     /* Don't repeat multiple warnings on one line. */
  1333     gboolean eInvalidChar=FALSE,eTab=FALSE,eTilde=FALSE;
  1334     gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
  1335     const char *s;
  1336     gunichar c;
  1337     gsize nb;
  1338     gchar *t;
  1339     for (s=aline;*s;s=g_utf8_next_char(s))
  1340     {
  1341 	c=g_utf8_get_char(s);
  1342 	if (warnings->bin && !eInvalidChar &&
  1343 	  (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
  1344 	{
  1345 	    if (pswit[ECHO_SWITCH])
  1346 		g_print("\n%s\n",aline);
  1347 	    if (!pswit[OVERVIEW_SWITCH])
  1348 		if (c>127 && c<160 || c>255)
  1349 		    g_print("    Line %ld column %ld - "
  1350 		      "Non-ISO-8859 character %u\n",
  1351 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1352 		else
  1353 		    g_print("    Line %ld column %ld - "
  1354 		      "Non-ASCII character %u\n",
  1355 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1356 	    else
  1357 		cnt_bin++;
  1358 	    eInvalidChar=TRUE;
  1359 	}
  1360 	if (!eInvalidChar && charset)
  1361 	{
  1362 	    if (charset_validator==(GIConv)-1)
  1363 	    {
  1364 		if (!g_unichar_isdefined(c))
  1365 		{
  1366 		    if (pswit[ECHO_SWITCH])
  1367 			g_print("\n%s\n",aline);
  1368 		    if (!pswit[OVERVIEW_SWITCH])
  1369 			g_print("    Line %ld column %ld - Unassigned UNICODE "
  1370 			  "code point U+%04" G_GINT32_MODIFIER "X\n",
  1371 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1372 		    else
  1373 			cnt_bin++;
  1374 		    eInvalidChar=TRUE;
  1375 		}
  1376 		else if (c>=0xE000 && c<=0xF8FF || c>=0xF0000 && c<=0xFFFFD ||
  1377 		  c>=100000 && c<=0x10FFFD)
  1378 		{
  1379 		    if (pswit[ECHO_SWITCH])
  1380 			g_print("\n%s\n",aline);
  1381 		    if (!pswit[OVERVIEW_SWITCH])
  1382 			g_print("    Line %ld column %ld - Private Use "
  1383 			  "character U+%04" G_GINT32_MODIFIER "X\n",
  1384 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1385 		    else
  1386 			cnt_bin++;
  1387 		    eInvalidChar=TRUE;
  1388 		}
  1389 	    }
  1390 	    else
  1391 	    {
  1392 		t=g_convert_with_iconv(s,g_utf8_next_char(s)-s,
  1393 		  charset_validator,NULL,&nb,NULL);
  1394 		if (t)
  1395 		    g_free(t);
  1396 		else
  1397 		{
  1398 		    if (pswit[ECHO_SWITCH])
  1399 			g_print("\n%s\n",aline);
  1400 		    if (!pswit[OVERVIEW_SWITCH])
  1401 			g_print("    Line %ld column %ld - Non-%s "
  1402 			  "character %u\n",linecnt,
  1403 			  g_utf8_pointer_to_offset(aline,s)+1,charset,c);
  1404 		    else
  1405 			cnt_bin++;
  1406 		    eInvalidChar=TRUE;
  1407 		}
  1408 	    }
  1409 	}
  1410 	if (!eTab && c==CHAR_TAB)
  1411 	{
  1412 	    if (pswit[ECHO_SWITCH])
  1413 		g_print("\n%s\n",aline);
  1414 	    if (!pswit[OVERVIEW_SWITCH])
  1415 		g_print("    Line %ld column %ld - Tab character?\n",
  1416 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1417 	    else
  1418 		cnt_odd++;
  1419 	    eTab=TRUE;
  1420 	}
  1421 	if (!eTilde && c==CHAR_TILDE)
  1422 	{
  1423 	    /*
  1424 	     * Often used by OCR software to indicate an
  1425 	     * unrecognizable character.
  1426 	     */
  1427 	    if (pswit[ECHO_SWITCH])
  1428 		g_print("\n%s\n",aline);
  1429 	    if (!pswit[OVERVIEW_SWITCH])
  1430 		g_print("    Line %ld column %ld - Tilde character?\n",
  1431 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1432 	    else
  1433 		cnt_odd++;
  1434 	    eTilde=TRUE;
  1435 	}
  1436 	if (!eCarat && c==CHAR_CARAT)
  1437 	{  
  1438 	    if (pswit[ECHO_SWITCH])
  1439 		g_print("\n%s\n",aline);
  1440 	    if (!pswit[OVERVIEW_SWITCH])
  1441 		g_print("    Line %ld column %ld - Carat character?\n",
  1442 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1443 	    else
  1444 		cnt_odd++;
  1445 	    eCarat=TRUE;
  1446 	}
  1447 	if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
  1448 	{  
  1449 	    if (pswit[ECHO_SWITCH])
  1450 		g_print("\n%s\n",aline);
  1451 	    if (!pswit[OVERVIEW_SWITCH])
  1452 		g_print("    Line %ld column %ld - Forward slash?\n",
  1453 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1454 	    else
  1455 		cnt_odd++;
  1456 	    eFSlash=TRUE;
  1457 	}
  1458 	/*
  1459 	 * Report asterisks only in paranoid mode,
  1460 	 * since they're often deliberate.
  1461 	 */
  1462 	if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
  1463 	  c==CHAR_ASTERISK)
  1464 	{
  1465 	    if (pswit[ECHO_SWITCH])
  1466 		g_print("\n%s\n",aline);
  1467 	    if (!pswit[OVERVIEW_SWITCH])
  1468 		g_print("    Line %ld column %ld - Asterisk?\n",
  1469 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1470 	    else
  1471 		cnt_odd++;
  1472 	    eAst=TRUE;
  1473 	}
  1474     }
  1475 }
  1476 
  1477 /*
  1478  * check_for_long_line:
  1479  *
  1480  * Check for line too long.
  1481  */
  1482 void check_for_long_line(const char *aline)
  1483 {
  1484     if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
  1485     {
  1486 	if (pswit[ECHO_SWITCH])
  1487 	    g_print("\n%s\n",aline);
  1488 	if (!pswit[OVERVIEW_SWITCH])
  1489 	    g_print("    Line %ld column %ld - Long line %ld\n",
  1490 	      linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
  1491 	else
  1492 	    cnt_long++;
  1493     }
  1494 }
  1495 
  1496 /*
  1497  * check_for_short_line:
  1498  *
  1499  * Check for line too short.
  1500  *
  1501  * This one is a bit trickier to implement: we don't want to
  1502  * flag the last line of a paragraph for being short, so we
  1503  * have to wait until we know that our current line is a
  1504  * "normal" line, then report the _previous_ line if it was too
  1505  * short. We also don't want to report indented lines like
  1506  * chapter heads or formatted quotations. We therefore keep
  1507  * last->len as the length of the last line examined, and
  1508  * last->blen as the length of the last but one, and try to
  1509  * suppress unnecessary warnings by checking that both were of
  1510  * "normal" length. We keep the first character of the last
  1511  * line in last->start, and if it was a space, we assume that
  1512  * the formatting is deliberate. I can't figure out a way to
  1513  * distinguish something like a quoted verse left-aligned or
  1514  * the header or footer of a letter from a paragraph of short
  1515  * lines - maybe if I examined the whole paragraph, and if the
  1516  * para has less than, say, 8 lines and if all lines are short,
  1517  * then just assume it's OK? Need to look at some texts to see
  1518  * how often a formula like this would get the right result.
  1519  */
  1520 void check_for_short_line(const char *aline,const struct line_properties *last)
  1521 {
  1522     if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
  1523       last->len<SHORTEST_PG_LINE && last->blen>1 &&
  1524       last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
  1525     {
  1526 	if (pswit[ECHO_SWITCH])
  1527 	    g_print("\n%s\n",prevline);
  1528 	if (!pswit[OVERVIEW_SWITCH])
  1529 	    g_print("    Line %ld column %ld - Short line %ld?\n",
  1530 	      linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
  1531 	else
  1532 	    cnt_short++;
  1533     }
  1534 }
  1535 
  1536 /*
  1537  * check_for_starting_punctuation:
  1538  *
  1539  * Look for punctuation other than full ellipses at start of line.
  1540  */
  1541 void check_for_starting_punctuation(const char *aline)
  1542 {
  1543     if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
  1544       !g_str_has_prefix(aline,". . ."))
  1545     {
  1546 	if (pswit[ECHO_SWITCH])
  1547 	    g_print("\n%s\n",aline);
  1548 	if (!pswit[OVERVIEW_SWITCH])
  1549 	    g_print("    Line %ld column 1 - Begins with punctuation?\n",
  1550 	      linecnt);
  1551 	else
  1552 	    cnt_punct++;
  1553     }
  1554 }
  1555 
  1556 /*
  1557  * str_emdash:
  1558  *
  1559  * Find the first em-dash, return a pointer to it and set <next> to the
  1560  * character following the dash.
  1561  */
  1562 char *str_emdash(const char *s,const char **next)
  1563 {
  1564     const char *s1,*s2;
  1565     s1=strstr(s,"--");
  1566     s2=strstr(s,"—");
  1567     if (!s1)
  1568     {
  1569 	if (s2)
  1570 	    *next=g_utf8_next_char(s2);
  1571 	return (char *)s2;
  1572     }
  1573     else if (!s2)
  1574     {
  1575 	*next=g_utf8_next_char(g_utf8_next_char(s1));
  1576 	return (char *)s1;
  1577     }
  1578     else if (s1<s2)
  1579     {
  1580 	*next=g_utf8_next_char(g_utf8_next_char(s1));
  1581 	return (char *)s1;
  1582     }
  1583     else
  1584     {
  1585 	*next=g_utf8_next_char(s2);
  1586 	return (char *)s2;
  1587     }
  1588 }
  1589 
  1590 /*
  1591  * check_for_spaced_emdash:
  1592  *
  1593  * Check for spaced em-dashes.
  1594  *
  1595  * We must check _all_ occurrences of em-dashes on the line
  1596  * hence the loop - even if the first dash is OK
  1597  * there may be another that's wrong later on.
  1598  */
  1599 void check_for_spaced_emdash(const char *aline)
  1600 {
  1601     const char *s,*t,*next;
  1602     for (s=aline;t=str_emdash(s,&next);s=next)
  1603     {
  1604 	if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
  1605 	  g_utf8_get_char(next)==CHAR_SPACE)
  1606 	{
  1607 	    if (pswit[ECHO_SWITCH])
  1608 		g_print("\n%s\n",aline);
  1609 	    if (!pswit[OVERVIEW_SWITCH])
  1610 		g_print("    Line %ld column %ld - Spaced em-dash?\n",
  1611 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  1612 	    else
  1613 		cnt_dash++;
  1614 	}
  1615     }
  1616 }
  1617 
  1618 /*
  1619  * check_for_spaced_dash:
  1620  *
  1621  * Check for spaced dashes.
  1622  */
  1623 void check_for_spaced_dash(const char *aline)
  1624 {
  1625     const char *s;
  1626     if ((s=strstr(aline," -")))
  1627     {
  1628 	if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
  1629 	{
  1630 	    if (pswit[ECHO_SWITCH])
  1631 		g_print("\n%s\n",aline);
  1632 	    if (!pswit[OVERVIEW_SWITCH])
  1633 		g_print("    Line %ld column %ld - Spaced dash?\n",
  1634 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1635 	    else
  1636 		cnt_dash++;
  1637 	}
  1638     }
  1639     else if ((s=strstr(aline,"- ")))
  1640     {
  1641 	if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
  1642 	{
  1643 	    if (pswit[ECHO_SWITCH])
  1644 		g_print("\n%s\n",aline);
  1645 	    if (!pswit[OVERVIEW_SWITCH])
  1646 		g_print("    Line %ld column %ld - Spaced dash?\n",
  1647 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1648 	    else
  1649 		cnt_dash++;
  1650 	}
  1651     }
  1652 }
  1653 
  1654 /*
  1655  * check_for_unmarked_paragraphs:
  1656  *
  1657  * Check for unmarked paragraphs indicated by separate speakers.
  1658  *
  1659  * May well be false positive:
  1660  * "Bravo!" "Wonderful!" called the crowd.
  1661  * but useful all the same.
  1662  */
  1663 void check_for_unmarked_paragraphs(const char *aline)
  1664 {
  1665     const char *s;
  1666     s=strstr(aline,"\"  \"");
  1667     if (!s)
  1668 	s=strstr(aline,"\" \"");
  1669     if (s)
  1670     {
  1671 	if (pswit[ECHO_SWITCH])
  1672 	    g_print("\n%s\n",aline);
  1673 	if (!pswit[OVERVIEW_SWITCH])
  1674 	    g_print("    Line %ld column %ld - "
  1675 	      "Query missing paragraph break?\n",
  1676 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1677 	else
  1678 	    cnt_punct++;
  1679     }
  1680 }
  1681 
  1682 /*
  1683  * check_for_jeebies:
  1684  *
  1685  * Check for "to he" and other easy h/b errors.
  1686  *
  1687  * This is a very inadequate effort on the h/b problem,
  1688  * but the phrase "to he" is always an error, whereas "to
  1689  * be" is quite common.
  1690  * Similarly, '"Quiet!", be said.' is a non-be error
  1691  * "to he" is _not_ always an error!:
  1692  *       "Where they went to he couldn't say."
  1693  * Another false positive:
  1694  *       What would "Cinderella" be without the . . .
  1695  * and another: "If he wants to he can see for himself."
  1696  */
  1697 void check_for_jeebies(const char *aline)
  1698 {
  1699     const char *s;
  1700     s=strstr(aline," be could ");
  1701     if (!s)
  1702 	s=strstr(aline," be would ");
  1703     if (!s)
  1704 	s=strstr(aline," was be ");
  1705     if (!s)
  1706 	s=strstr(aline," be is ");
  1707     if (!s)
  1708 	s=strstr(aline," is be ");
  1709     if (!s)
  1710 	s=strstr(aline,"\", be ");
  1711     if (!s)
  1712 	s=strstr(aline,"\" be ");
  1713     if (!s)
  1714 	s=strstr(aline,"\" be ");
  1715     if (!s)
  1716 	s=strstr(aline," to he ");
  1717     if (s)
  1718     {
  1719 	if (pswit[ECHO_SWITCH])
  1720 	    g_print("\n%s\n",aline);
  1721 	if (!pswit[OVERVIEW_SWITCH])
  1722 	    g_print("    Line %ld column %ld - Query he/be error?\n",
  1723 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1724 	else
  1725 	    cnt_word++;
  1726     }
  1727     s=strstr(aline," the had ");
  1728     if (!s)
  1729 	s=strstr(aline," a had ");
  1730     if (!s)
  1731 	s=strstr(aline," they bad ");
  1732     if (!s)
  1733 	s=strstr(aline," she bad ");
  1734     if (!s)
  1735 	s=strstr(aline," he bad ");
  1736     if (!s)
  1737 	s=strstr(aline," you bad ");
  1738     if (!s)
  1739 	s=strstr(aline," i bad ");
  1740     if (s)
  1741     {
  1742 	if (pswit[ECHO_SWITCH])
  1743 	    g_print("\n%s\n",aline);
  1744 	if (!pswit[OVERVIEW_SWITCH])
  1745 	    g_print("    Line %ld column %ld - Query had/bad error?\n",
  1746 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1747 	else
  1748 	    cnt_word++;
  1749     }
  1750     s=strstr(aline,"; hut ");
  1751     if (!s)
  1752 	s=strstr(aline,", hut ");
  1753     if (s)
  1754     {
  1755 	if (pswit[ECHO_SWITCH])
  1756 	    g_print("\n%s\n",aline);
  1757 	if (!pswit[OVERVIEW_SWITCH])
  1758 	    g_print("    Line %ld column %ld - Query hut/but error?\n",
  1759 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1760 	else
  1761 	    cnt_word++;
  1762     }
  1763 }
  1764 
  1765 /*
  1766  * check_for_mta_from:
  1767  *
  1768  * Special case - angled bracket in front of "From" placed there by an
  1769  * MTA when sending an e-mail.
  1770  */
  1771 void check_for_mta_from(const char *aline)
  1772 {
  1773     const char *s;
  1774     s=strstr(aline,">From");
  1775     if (s)
  1776     {
  1777 	if (pswit[ECHO_SWITCH])
  1778 	    g_print("\n%s\n",aline);
  1779 	if (!pswit[OVERVIEW_SWITCH])
  1780 	    g_print("    Line %ld column %ld - "
  1781 	      "Query angled bracket with From\n",
  1782 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1783 	else
  1784 	    cnt_punct++;
  1785     }
  1786 }
  1787 
  1788 /*
  1789  * check_for_orphan_character:
  1790  *
  1791  * Check for a single character line -
  1792  * often an overflow from bad wrapping.
  1793  */
  1794 void check_for_orphan_character(const char *aline)
  1795 {
  1796     gunichar c;
  1797     c=g_utf8_get_char(aline);
  1798     if (c && !*g_utf8_next_char(aline))
  1799     {
  1800 	if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
  1801 	    ; /* Nothing - ignore numerals alone on a line. */
  1802 	else
  1803 	{
  1804 	    if (pswit[ECHO_SWITCH])
  1805 		g_print("\n%s\n",aline);
  1806 	    if (!pswit[OVERVIEW_SWITCH])
  1807 		g_print("    Line %ld column 1 - Query single character line\n",
  1808 		  linecnt);
  1809 	    else
  1810 		cnt_punct++;
  1811 	}
  1812     }
  1813 }
  1814 
  1815 /*
  1816  * check_for_pling_scanno:
  1817  *
  1818  * Check for I" - often should be !
  1819  */
  1820 void check_for_pling_scanno(const char *aline)
  1821 {
  1822     const char *s;
  1823     s=strstr(aline," I\"");
  1824     if (s)
  1825     {
  1826 	if (pswit[ECHO_SWITCH])
  1827 	    g_print("\n%s\n",aline);
  1828 	if (!pswit[OVERVIEW_SWITCH])
  1829 	    g_print("    Line %ld column %ld - Query I=exclamation mark?\n",
  1830 	      linecnt,g_utf8_pointer_to_offset(aline,s));
  1831 	else
  1832 	    cnt_punct++;
  1833     }
  1834 }
  1835 
  1836 /*
  1837  * check_for_extra_period:
  1838  *
  1839  * Check for period without a capital letter. Cut-down from gutspell.
  1840  * Only works when it happens on a single line.
  1841  */
  1842 void check_for_extra_period(const char *aline,const struct warnings *warnings)
  1843 {
  1844     const char *s,*t,*s1,*sprev;
  1845     int i;
  1846     gsize len;
  1847     gboolean istypo;
  1848     gchar *testword;
  1849     gunichar c,nc,pc,*decomposition;
  1850     if (pswit[PARANOID_SWITCH])
  1851     {
  1852 	for (t=aline;t=strstr(t,". ");)
  1853 	{
  1854 	    if (t==aline)
  1855 	    {
  1856 		t=g_utf8_next_char(t);
  1857 		/* start of line punctuation is handled elsewhere */
  1858 		continue;
  1859 	    }
  1860 	    if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
  1861 	    {
  1862 		t=g_utf8_next_char(t);
  1863 		continue;
  1864 	    }
  1865 	    if (warnings->isDutch)
  1866 	    {
  1867 		/* For Frank & Jeroen -- 's Middags case */
  1868 		gunichar c2,c3,c4,c5;
  1869 		c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
  1870 		c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
  1871 		c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
  1872 		c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
  1873 		if (CHAR_IS_APOSTROPHE(c2) &&
  1874 		  g_unichar_islower(c3) && c4==CHAR_SPACE &&
  1875 		  g_unichar_isupper(c5))
  1876 		{
  1877 		    t=g_utf8_next_char(t);
  1878 		    continue;
  1879 		}
  1880 	    }
  1881 	    s1=g_utf8_next_char(g_utf8_next_char(t));
  1882 	    while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
  1883 	      !g_unichar_isdigit(g_utf8_get_char(s1)))
  1884 		s1=g_utf8_next_char(s1);
  1885 	    if (g_unichar_islower(g_utf8_get_char(s1)))
  1886 	    {
  1887 		/* we have something to investigate */
  1888 		istypo=TRUE;
  1889 		/* so let's go back and find out */
  1890 		nc=g_utf8_get_char(t);
  1891 		s1=g_utf8_prev_char(t);
  1892 		c=g_utf8_get_char(s1);
  1893 		sprev=g_utf8_prev_char(s1);
  1894 		pc=g_utf8_get_char(sprev);
  1895 		while (s1>=aline &&
  1896 		  (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||
  1897 		  g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&
  1898 		  g_unichar_isalpha(nc)))
  1899 		{
  1900 		    nc=c;
  1901 		    s1=sprev;
  1902 		    c=pc;
  1903 		    sprev=g_utf8_prev_char(s1);
  1904 		    pc=g_utf8_get_char(sprev);
  1905 		}
  1906 		s1=g_utf8_next_char(s1);
  1907 		s=strchr(s1,'.');
  1908 		if (s)
  1909 		    testword=g_strndup(s1,s-s1);
  1910 		else
  1911 		    testword=g_strdup(s1);
  1912 		for (i=0;*abbrev[i];i++)
  1913 		    if (!strcmp(testword,abbrev[i]))
  1914 			istypo=FALSE;
  1915 		if (g_unichar_isdigit(g_utf8_get_char(testword)))
  1916 		    istypo=FALSE;
  1917 		if (!*g_utf8_next_char(testword))
  1918 		    istypo=FALSE;
  1919 		if (isroman(testword))
  1920 		    istypo=FALSE;
  1921 		if (istypo)
  1922 		{
  1923 		    istypo=FALSE;
  1924 		    for (s=testword;*s;s=g_utf8_next_char(s))
  1925 		    {
  1926 			decomposition=g_unicode_canonical_decomposition(
  1927 			  g_utf8_get_char(s),&len);
  1928 			if (g_utf8_strchr("aeiou",-1,decomposition[0]))
  1929 			    istypo=TRUE;
  1930 			g_free(decomposition);
  1931 		    }
  1932 		}
  1933 		if (istypo &&
  1934 		  (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
  1935 		{
  1936 		    g_tree_insert(qperiod,g_strdup(testword),
  1937 		      GINT_TO_POINTER(1));
  1938 		    if (pswit[ECHO_SWITCH])
  1939 			g_print("\n%s\n",aline);
  1940 		    if (!pswit[OVERVIEW_SWITCH])
  1941 			g_print("    Line %ld column %ld - Extra period?\n",
  1942 			  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  1943 		    else
  1944 			cnt_punct++;
  1945 		}
  1946 		g_free(testword);
  1947 	    }
  1948 	    t=g_utf8_next_char(t);
  1949 	}
  1950     }
  1951 }
  1952 
  1953 /*
  1954  * check_for_following_punctuation:
  1955  *
  1956  * Check for words usually not followed by punctuation.
  1957  */
  1958 void check_for_following_punctuation(const char *aline)
  1959 {
  1960     int i;
  1961     const char *s,*wordstart;
  1962     gunichar c;
  1963     gchar *inword,*t;
  1964     if (pswit[TYPO_SWITCH])
  1965     {
  1966 	for (s=aline;*s;)
  1967 	{
  1968 	    wordstart=s;
  1969 	    t=getaword(&s);
  1970 	    if (!*t)
  1971 	    {
  1972 		g_free(t);
  1973 		continue;
  1974 	    }
  1975 	    inword=g_utf8_strdown(t,-1);
  1976 	    g_free(t);
  1977 	    for (i=0;*nocomma[i];i++)
  1978 		if (!strcmp(inword,nocomma[i]))
  1979 		{
  1980 		    c=g_utf8_get_char(s);
  1981 		    if (c==',' || c==';' || c==':')
  1982 		    {
  1983 			if (pswit[ECHO_SWITCH])
  1984 			    g_print("\n%s\n",aline);
  1985 			if (!pswit[OVERVIEW_SWITCH])
  1986 			    g_print("    Line %ld column %ld - "
  1987 			      "Query punctuation after %s?\n",
  1988 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,
  1989 			      inword);
  1990 			else
  1991 			    cnt_punct++;
  1992 		    }
  1993 		}
  1994 	    for (i=0;*noperiod[i];i++)
  1995 		if (!strcmp(inword,noperiod[i]))
  1996 		{
  1997 		    c=g_utf8_get_char(s);
  1998 		    if (c=='.' || c=='!')
  1999 		    {
  2000 			if (pswit[ECHO_SWITCH])
  2001 			    g_print("\n%s\n",aline);
  2002 			if (!pswit[OVERVIEW_SWITCH])
  2003 			    g_print("    Line %ld column %ld - "
  2004 			      "Query punctuation after %s?\n",
  2005 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,
  2006 			      inword);
  2007 			else
  2008 			    cnt_punct++;
  2009 		    }
  2010 		}
  2011 	    g_free(inword);
  2012 	}
  2013     }
  2014 }
  2015 
  2016 /*
  2017  * check_for_typos:
  2018  *
  2019  * Check for commonly mistyped words,
  2020  * and digits like 0 for O in a word.
  2021  */
  2022 void check_for_typos(const char *aline,struct warnings *warnings)
  2023 {
  2024     const char *s,*t,*nt,*wordstart;
  2025     gchar *inword;
  2026     gunichar *decomposition;
  2027     gchar *testword;
  2028     int i,vowel,consonant,*dupcnt;
  2029     gboolean isdup,istypo,alower;
  2030     gunichar c,pc;
  2031     long offset,len;
  2032     gsize decomposition_len;
  2033     for (s=aline;*s;)
  2034     {
  2035 	wordstart=s;
  2036 	inword=getaword(&s);
  2037 	if (!*inword)
  2038 	{
  2039 	    g_free(inword);
  2040 	    continue; /* don't bother with empty lines */
  2041 	}
  2042 	if (mixdigit(inword))
  2043 	{
  2044 	    if (pswit[ECHO_SWITCH])
  2045 		g_print("\n%s\n",aline);
  2046 	    if (!pswit[OVERVIEW_SWITCH])
  2047 		g_print("    Line %ld column %ld - Query digit in %s\n",
  2048 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
  2049 	    else
  2050 		cnt_word++;
  2051 	}
  2052 	/*
  2053 	 * Put the word through a series of tests for likely typos and OCR
  2054 	 * errors.
  2055 	 */
  2056 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
  2057 	{
  2058 	    istypo=FALSE;
  2059 	    alower=FALSE;
  2060 	    for (t=inword;*t;t=g_utf8_next_char(t))
  2061 	    {
  2062 		c=g_utf8_get_char(t);
  2063 		nt=g_utf8_next_char(t);
  2064 		/* lowercase for testing */
  2065 		if (g_unichar_islower(c))
  2066 		    alower=TRUE;
  2067 		if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
  2068 		{
  2069 		    /*
  2070 		     * We have an uppercase mid-word. However, there are
  2071 		     * common cases:
  2072 		     *   Mac and Mc like McGill
  2073 		     *   French contractions like l'Abbe
  2074 		     */
  2075 		    offset=g_utf8_pointer_to_offset(inword,t);
  2076 		    if (offset>0)
  2077 			pc=g_utf8_get_char(g_utf8_prev_char(t));
  2078 		    else
  2079 			pc='\0';
  2080 		    if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
  2081 		      offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
  2082 		      g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
  2083 		      CHAR_IS_APOSTROPHE(pc))
  2084 			; /* do nothing! */
  2085 		    else
  2086 			istypo=TRUE;
  2087 		}
  2088 	    }
  2089 	    testword=g_utf8_casefold(inword,-1);
  2090 	}
  2091 	if (pswit[TYPO_SWITCH])
  2092 	{
  2093 	    /*
  2094 	     * Check for certain unlikely two-letter combinations at word
  2095 	     * start and end.
  2096 	     */
  2097 	    len=g_utf8_strlen(testword,-1);
  2098 	    if (len>1)
  2099 	    {
  2100 		for (i=0;*nostart[i];i++)
  2101 		    if (g_str_has_prefix(testword,nostart[i]))
  2102 			istypo=TRUE;
  2103 		for (i=0;*noend[i];i++)
  2104 		    if (g_str_has_suffix(testword,noend[i]))
  2105 			istypo=TRUE;
  2106 	    }
  2107 	    /* ght is common, gbt never. Like that. */
  2108 	    if (strstr(testword,"cb"))
  2109 		istypo=TRUE;
  2110 	    if (strstr(testword,"gbt"))
  2111 		istypo=TRUE;
  2112 	    if (strstr(testword,"pbt"))
  2113 		istypo=TRUE;
  2114 	    if (strstr(testword,"tbs"))
  2115 		istypo=TRUE;
  2116 	    if (strstr(testword,"mrn"))
  2117 		istypo=TRUE;
  2118 	    if (strstr(testword,"ahle"))
  2119 		istypo=TRUE;
  2120 	    if (strstr(testword,"ihle"))
  2121 		istypo=TRUE;
  2122 	    /*
  2123 	     * "TBE" does happen - like HEARTBEAT - but uncommon.
  2124 	     * Also "TBI" - frostbite, outbid - but uncommon.
  2125 	     * Similarly "ii" like Hawaii, or Pompeii, and in Roman
  2126 	     * numerals, but "ii" is a common scanno.
  2127 	     */
  2128 	    if (strstr(testword,"tbi"))
  2129 		istypo=TRUE;
  2130 	    if (strstr(testword,"tbe"))
  2131 		istypo=TRUE;
  2132 	    if (strstr(testword,"ii"))
  2133 		istypo=TRUE;
  2134 	    /*
  2135 	     * Check for no vowels or no consonants.
  2136 	     * If none, flag a typo.
  2137 	     */
  2138 	    if (!istypo && len>1)
  2139 	    {
  2140 		vowel=consonant=0;
  2141 		for (t=testword;*t;t=g_utf8_next_char(t))
  2142 		{
  2143 		    c=g_utf8_get_char(t);
  2144 		    decomposition=
  2145 		      g_unicode_canonical_decomposition(c,&decomposition_len);
  2146 		    if (c=='y' || g_unichar_isdigit(c))
  2147 		    {
  2148 			/* Yah, this is loose. */
  2149 			vowel++;
  2150 			consonant++;
  2151 		    }
  2152 		    else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
  2153 			vowel++;
  2154 		    else
  2155 			consonant++;
  2156 		    g_free(decomposition);
  2157 		}
  2158 		if (!vowel || !consonant)
  2159 		    istypo=TRUE;
  2160 	    }
  2161 	    /*
  2162 	     * Now exclude the word from being reported if it's in
  2163 	     * the okword list.
  2164 	     */
  2165 	    for (i=0;*okword[i];i++)
  2166 		if (!strcmp(testword,okword[i]))
  2167 		    istypo=FALSE;
  2168 	    /*
  2169 	     * What looks like a typo may be a Roman numeral.
  2170 	     * Exclude these.
  2171 	     */
  2172 	    if (istypo && isroman(testword))
  2173 		istypo=FALSE;
  2174 	    /* Check the manual list of typos. */
  2175 	    if (!istypo)
  2176 		for (i=0;*typo[i];i++)
  2177 		    if (!strcmp(testword,typo[i]))
  2178 			istypo=TRUE;
  2179 	    /*
  2180 	     * Check lowercase s, l, i and m - special cases.
  2181 	     *   "j" - often a semi-colon gone wrong.
  2182 	     *   "d" for a missing apostrophe - he d
  2183 	     *   "n" for "in"
  2184 	     */
  2185 	    if (!istypo && len==1 &&
  2186 	      g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
  2187 		istypo=TRUE;
  2188 	    if (istypo)
  2189 	    {
  2190 		dupcnt=g_tree_lookup(qword,testword);
  2191 		if (dupcnt)
  2192 		{
  2193 		    (*dupcnt)++;
  2194 		    isdup=!pswit[VERBOSE_SWITCH];
  2195 		}
  2196 		else
  2197 		{
  2198 		    dupcnt=g_new0(int,1);
  2199 		    g_tree_insert(qword,g_strdup(testword),dupcnt);
  2200 		    isdup=FALSE;
  2201 		}
  2202 		if (!isdup)
  2203 		{
  2204 		    if (pswit[ECHO_SWITCH])
  2205 			g_print("\n%s\n",aline);
  2206 		    if (!pswit[OVERVIEW_SWITCH])
  2207 		    {
  2208 			g_print("    Line %ld column %ld - Query word %s",
  2209 			  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
  2210 			  inword);
  2211 			if (!pswit[VERBOSE_SWITCH])
  2212 			    g_print(" - not reporting duplicates");
  2213 			g_print("\n");
  2214 		    }
  2215 		    else
  2216 			cnt_word++;
  2217 		}
  2218 	    }
  2219 	}
  2220 	/* check the user's list of typos */
  2221 	if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
  2222 	{
  2223 	    if (pswit[ECHO_SWITCH])
  2224 		g_print("\n%s\n",aline);
  2225 	    if (!pswit[OVERVIEW_SWITCH])  
  2226 		g_print("    Line %ld column %ld - Query possible scanno %s\n",
  2227 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
  2228 	}
  2229 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
  2230 	    g_free(testword);
  2231 	if (pswit[PARANOID_SWITCH] && warnings->digit)
  2232 	{
  2233 	    /* In paranoid mode, query all 0 and 1 standing alone. */
  2234 	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))
  2235 	    {
  2236 		if (pswit[ECHO_SWITCH])
  2237 		    g_print("\n%s\n",aline);
  2238 		if (!pswit[OVERVIEW_SWITCH])
  2239 		    g_print("    Line %ld column %ld - Query standalone %s\n",
  2240 		      linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
  2241 		      inword);
  2242 		else
  2243 		    cnt_word++;
  2244 	    }
  2245 	}
  2246 	g_free(inword);
  2247     }
  2248 }
  2249 
  2250 /*
  2251  * check_for_misspaced_punctuation:
  2252  *
  2253  * Look for added or missing spaces around punctuation and quotes.
  2254  * If there is a punctuation character like ! with no space on
  2255  * either side, suspect a missing!space. If there are spaces on
  2256  * both sides , assume a typo. If we see a double quote with no
  2257  * space or punctuation on either side of it, assume unspaced
  2258  * quotes "like"this.
  2259  */
  2260 void check_for_misspaced_punctuation(const char *aline,
  2261   struct parities *parities,gboolean isemptyline)
  2262 {
  2263     gboolean isacro,isellipsis;
  2264     const char *s;
  2265     gunichar c,nc,pc,n2c;
  2266     int parity;
  2267     c=g_utf8_get_char(aline);
  2268     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2269     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2270     {
  2271 	pc=c;
  2272 	c=nc;
  2273 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2274 	/* For each character in the line after the first. */
  2275 	if (g_utf8_strchr(".?!,;:_",-1,c))  /* if it's punctuation */
  2276 	{
  2277 	    /* we need to suppress warnings for acronyms like M.D. */
  2278 	    isacro=FALSE;
  2279 	    /* we need to suppress warnings for ellipsis . . . */
  2280 	    isellipsis=FALSE;
  2281 	    /*
  2282 	     * If there are letters on both sides of it or
  2283 	     * if it's strict punctuation followed by an alpha.
  2284 	     */
  2285 	    if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
  2286 	      g_utf8_strchr("?!,;:",-1,c)))
  2287 	    {
  2288 		if (c=='.')
  2289 		{
  2290 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&
  2291 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
  2292 			isacro=TRUE;
  2293 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
  2294 		    if (nc && n2c=='.')
  2295 			isacro=TRUE;
  2296 		}
  2297 		if (!isacro)
  2298 		{
  2299 		    if (pswit[ECHO_SWITCH])
  2300 			g_print("\n%s\n",aline);
  2301 		    if (!pswit[OVERVIEW_SWITCH])
  2302 			g_print("    Line %ld column %ld - Missing space?\n",
  2303 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2304 		    else
  2305 			cnt_punct++;
  2306 		}
  2307 	    }
  2308 	    if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
  2309 	    {
  2310 		/*
  2311 		 * If there are spaces on both sides,
  2312 		 * or space before and end of line.
  2313 		 */
  2314 		if (c=='.')
  2315 		{
  2316 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&
  2317 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
  2318 			isellipsis=TRUE;
  2319 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
  2320 		    if (nc && n2c=='.')
  2321 			isellipsis=TRUE;
  2322 		}
  2323 		if (!isemptyline && !isellipsis)
  2324 		{
  2325 		    if (pswit[ECHO_SWITCH])
  2326 			g_print("\n%s\n",aline);
  2327 		    if (!pswit[OVERVIEW_SWITCH])
  2328 			g_print("    Line %ld column %ld - "
  2329 			  "Spaced punctuation?\n",linecnt,
  2330 			  g_utf8_pointer_to_offset(aline,s)+1);
  2331 		    else
  2332 			cnt_punct++;
  2333 		}
  2334 	    }
  2335 	}
  2336     }
  2337     /* Split out the characters that CANNOT be preceded by space. */
  2338     c=g_utf8_get_char(aline);
  2339     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2340     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2341     {
  2342 	pc=c;
  2343 	c=nc;
  2344 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2345 	/* for each character in the line after the first */
  2346 	if (g_utf8_strchr("?!,;:",-1,c))
  2347 	{
  2348 	    /* if it's punctuation that _cannot_ have a space before it */
  2349 	    if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
  2350 	    {
  2351 		/*
  2352 		 * If nc DOES == space,
  2353 		 * it was already reported just above.
  2354 		 */
  2355 		if (pswit[ECHO_SWITCH])
  2356 		    g_print("\n%s\n",aline);
  2357 		if (!pswit[OVERVIEW_SWITCH])
  2358 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",
  2359 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2360 		else
  2361 		    cnt_punct++;
  2362 	    }
  2363 	}
  2364     }
  2365     /*
  2366      * Special case " .X" where X is any alpha.
  2367      * This plugs a hole in the acronym code above.
  2368      * Inelegant, but maintainable.
  2369      */
  2370     c=g_utf8_get_char(aline);
  2371     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2372     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2373     {
  2374 	pc=c;
  2375 	c=nc;
  2376 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2377 	/* for each character in the line after the first */
  2378 	if (c=='.')
  2379 	{
  2380 	    /* if it's a period */
  2381 	    if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
  2382 	    {
  2383 		/*
  2384 		 * If the period follows a space and
  2385 		 * is followed by a letter.
  2386 		 */
  2387 		if (pswit[ECHO_SWITCH])
  2388 		    g_print("\n%s\n",aline);
  2389 		if (!pswit[OVERVIEW_SWITCH])
  2390 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",
  2391 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2392 		else
  2393 		    cnt_punct++;
  2394 	    }
  2395 	}
  2396     }
  2397     c=g_utf8_get_char(aline);
  2398     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2399     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2400     {
  2401 	pc=c;
  2402 	c=nc;
  2403 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2404 	/* for each character in the line after the first */
  2405 	if (CHAR_IS_DQUOTE(c))
  2406 	{
  2407 	    if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
  2408 	      !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
  2409 	      !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
  2410 	    {
  2411 		if (pswit[ECHO_SWITCH])
  2412 		    g_print("\n%s\n",aline);
  2413 		if (!pswit[OVERVIEW_SWITCH])
  2414 		    g_print("    Line %ld column %ld - Unspaced quotes?\n",
  2415 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2416 		else
  2417 		    cnt_punct++;
  2418 	    }
  2419 	}
  2420     }
  2421     /* Check parity of quotes. */
  2422     nc=g_utf8_get_char(aline);
  2423     for (s=aline;*s;s=g_utf8_next_char(s))
  2424     {
  2425 	c=nc;
  2426 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2427 	if (CHAR_IS_DQUOTE(c))
  2428 	{
  2429 	    if (c==CHAR_DQUOTE)
  2430 	    {
  2431 		parities->dquote=!parities->dquote;
  2432 		parity=parities->dquote;
  2433 	    }
  2434 	    else if (c==CHAR_LD_QUOTE)
  2435 		parity=1;
  2436 	    else
  2437 		parity=0;
  2438 	    if (!parity)
  2439 	    {
  2440 		/* parity even */
  2441 		if (!g_utf8_strchr("_-.'`‘’/,;:!?)]} ",-1,nc))
  2442 		{
  2443 		    if (pswit[ECHO_SWITCH])
  2444 			g_print("\n%s\n",aline);
  2445 		    if (!pswit[OVERVIEW_SWITCH])
  2446 			g_print("    Line %ld column %ld - "
  2447 			  "Wrongspaced quotes?\n",
  2448 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2449 		    else
  2450 			cnt_punct++;
  2451 		}
  2452 	    }
  2453 	    else
  2454 	    {
  2455 		/* parity odd */
  2456 		if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
  2457 		  !g_utf8_strchr("_-/.'`‘’([{$",-1,nc) || !nc)
  2458 		{
  2459 		    if (pswit[ECHO_SWITCH])
  2460 			g_print("\n%s\n",aline);
  2461 		    if (!pswit[OVERVIEW_SWITCH])
  2462 			g_print("    Line %ld column %ld - "
  2463 			  "Wrongspaced quotes?\n",
  2464 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2465 		    else
  2466 			cnt_punct++;
  2467 		}
  2468 	    }
  2469 	}
  2470     }
  2471     c=g_utf8_get_char(aline);
  2472     if (CHAR_IS_DQUOTE(c))
  2473     {
  2474 	if (g_utf8_strchr(",;:!?)]} ",-1,
  2475 	  g_utf8_get_char(g_utf8_next_char(aline))))
  2476 	{
  2477 	    if (pswit[ECHO_SWITCH])
  2478 		g_print("\n%s\n",aline);
  2479 	    if (!pswit[OVERVIEW_SWITCH])
  2480 		g_print("    Line %ld column 1 - Wrongspaced quotes?\n",
  2481 		  linecnt);
  2482 	    else
  2483 		cnt_punct++;
  2484 	}
  2485     }
  2486     if (pswit[SQUOTE_SWITCH])
  2487     {
  2488 	nc=g_utf8_get_char(aline);
  2489 	for (s=aline;*s;s=g_utf8_next_char(s))
  2490 	{
  2491 	    c=nc;
  2492 	    nc=g_utf8_get_char(g_utf8_next_char(s));
  2493 	    if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&
  2494 	      !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
  2495 	      !g_unichar_isalpha(nc)))
  2496 	    {
  2497 		parities->squote=!parities->squote;
  2498 		if (!parities->squote)
  2499 		{
  2500 		    /* parity even */
  2501 		    if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
  2502 		    {
  2503 			if (pswit[ECHO_SWITCH])
  2504 			    g_print("\n%s\n",aline);
  2505 			if (!pswit[OVERVIEW_SWITCH])
  2506 			    g_print("    Line %ld column %ld - "
  2507 			      "Wrongspaced singlequotes?\n",
  2508 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2509 			else
  2510 			    cnt_punct++;
  2511 		    }
  2512 		}
  2513 		else
  2514 		{
  2515 		    /* parity odd */
  2516 		    if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
  2517 		      !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
  2518 		    {
  2519 			if (pswit[ECHO_SWITCH])
  2520 			    g_print("\n%s\n",aline);
  2521 			if (!pswit[OVERVIEW_SWITCH])
  2522 			    g_print("    Line %ld column %ld - "
  2523 			      "Wrongspaced singlequotes?\n",
  2524 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2525 			else
  2526 			    cnt_punct++;
  2527 		    }
  2528 		}
  2529 	    }
  2530 	}
  2531     }
  2532 }
  2533 
  2534 /*
  2535  * check_for_double_punctuation:
  2536  *
  2537  * Look for double punctuation like ,. or ,,
  2538  * Thanks to DW for the suggestion!
  2539  * In books with references, ".," and ".;" are common
  2540  * e.g. "etc., etc.," and vol. 1.; vol 3.;
  2541  * OTOH, from my initial tests, there are also fairly
  2542  * common errors. What to do? Make these cases paranoid?
  2543  * ".," is the most common, so warnings->dotcomma is used
  2544  * to suppress detailed reporting if it occurs often.
  2545  */
  2546 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
  2547 {
  2548     const char *s;
  2549     gunichar c,nc;
  2550     nc=g_utf8_get_char(aline);
  2551     for (s=aline;*s;s=g_utf8_next_char(s))
  2552     {
  2553 	c=nc;
  2554 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2555 	/* for each punctuation character in the line */
  2556 	if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
  2557 	  g_utf8_strchr(".?!,;:",-1,nc))
  2558 	{
  2559 	    /* followed by punctuation, it's a query, unless . . . */
  2560 	    if (c==nc && (c=='.' || c=='?' || c=='!') ||
  2561 	      !warnings->dotcomma && c=='.' && nc==',' ||
  2562 	      warnings->isFrench && g_str_has_prefix(s,",...") ||
  2563 	      warnings->isFrench && g_str_has_prefix(s,"...,") ||
  2564 	      warnings->isFrench && g_str_has_prefix(s,";...") ||
  2565 	      warnings->isFrench && g_str_has_prefix(s,"...;") ||
  2566 	      warnings->isFrench && g_str_has_prefix(s,":...") ||
  2567 	      warnings->isFrench && g_str_has_prefix(s,"...:") ||
  2568 	      warnings->isFrench && g_str_has_prefix(s,"!...") ||
  2569 	      warnings->isFrench && g_str_has_prefix(s,"...!") ||
  2570 	      warnings->isFrench && g_str_has_prefix(s,"?...") ||
  2571 	      warnings->isFrench && g_str_has_prefix(s,"...?"))
  2572 	    {
  2573 		if (warnings->isFrench && g_str_has_prefix(s,",...") ||
  2574 		  warnings->isFrench && g_str_has_prefix(s,"...,") ||
  2575 		  warnings->isFrench && g_str_has_prefix(s,";...") ||
  2576 		  warnings->isFrench && g_str_has_prefix(s,"...;") ||
  2577 		  warnings->isFrench && g_str_has_prefix(s,":...") ||
  2578 		  warnings->isFrench && g_str_has_prefix(s,"...:") ||
  2579 		  warnings->isFrench && g_str_has_prefix(s,"!...") ||
  2580 		  warnings->isFrench && g_str_has_prefix(s,"...!") ||
  2581 		  warnings->isFrench && g_str_has_prefix(s,"?...") ||
  2582 		  warnings->isFrench && g_str_has_prefix(s,"...?"))
  2583 		{
  2584 		    s+=4;
  2585 		    nc=g_utf8_get_char(g_utf8_next_char(s));
  2586 		}
  2587 		; /* do nothing for .. !! and ?? which can be legit */
  2588 	    }
  2589 	    else
  2590 	    {
  2591 		if (pswit[ECHO_SWITCH])
  2592 		    g_print("\n%s\n",aline);
  2593 		if (!pswit[OVERVIEW_SWITCH])
  2594 		    g_print("    Line %ld column %ld - Double punctuation?\n",
  2595 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2596 		else
  2597 		    cnt_punct++;
  2598 	    }
  2599 	}
  2600     }
  2601 }
  2602 
  2603 /*
  2604  * check_for_spaced_quotes:
  2605  */
  2606 void check_for_spaced_quotes(const char *aline)
  2607 {
  2608     int i;
  2609     const char *s,*t;
  2610     const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,
  2611       CHAR_RS_QUOTE};
  2612     GString *pattern;
  2613     s=aline;
  2614     while ((t=strstr(s," \" ")))
  2615     {
  2616 	if (pswit[ECHO_SWITCH])
  2617 	    g_print("\n%s\n",aline);
  2618 	if (!pswit[OVERVIEW_SWITCH])
  2619 	    g_print("    Line %ld column %ld - Spaced doublequote?\n",
  2620 	      linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  2621 	else
  2622 	    cnt_punct++;
  2623 	s=g_utf8_next_char(g_utf8_next_char(t));
  2624     }
  2625     pattern=g_string_new(NULL);
  2626     for(i=0;i<G_N_ELEMENTS(single_quotes);i++)
  2627     {
  2628 	g_string_assign(pattern," ");
  2629 	g_string_append_unichar(pattern,single_quotes[i]);
  2630 	g_string_append_c(pattern,' ');
  2631 	s=aline;
  2632 	while ((t=strstr(s,pattern->str)))
  2633 	{
  2634 	    if (pswit[ECHO_SWITCH])
  2635 		g_print("\n%s\n",aline);
  2636 	    if (!pswit[OVERVIEW_SWITCH])
  2637 		g_print("    Line %ld column %ld - Spaced singlequote?\n",
  2638 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  2639 	    else
  2640 		cnt_punct++;
  2641 	    s=g_utf8_next_char(g_utf8_next_char(t));
  2642 	}
  2643     }
  2644     g_string_free(pattern,TRUE);
  2645 }
  2646 
  2647 /*
  2648  * check_for_miscased_genative:
  2649  *
  2650  * Check special case of 'S instead of 's at end of word.
  2651  */
  2652 void check_for_miscased_genative(const char *aline)
  2653 {
  2654     const char *s;
  2655     gunichar c,nc,pc;
  2656     if (!*aline)
  2657 	return;
  2658     c=g_utf8_get_char(aline);
  2659     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2660     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2661     {
  2662 	pc=c;
  2663 	c=nc;
  2664 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2665 	if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))
  2666 	{
  2667 	    if (pswit[ECHO_SWITCH])
  2668 		g_print("\n%s\n",aline);
  2669 	    if (!pswit[OVERVIEW_SWITCH])
  2670 		g_print("    Line %ld column %ld - Capital \"S\"?\n",
  2671 		  linecnt,g_utf8_pointer_to_offset(aline,s)+2);
  2672 	    else
  2673 		cnt_punct++;
  2674 	}
  2675     }
  2676 }
  2677 
  2678 /*
  2679  * check_end_of_line:
  2680  *
  2681  * Now check special cases - start and end of line -
  2682  * for single and double quotes. Start is sometimes [sic]
  2683  * but better to query it anyway.
  2684  * While we're here, check for dash at end of line.
  2685  */
  2686 void check_end_of_line(const char *aline,struct warnings *warnings)
  2687 {
  2688     int lbytes;
  2689     const char *s;
  2690     gunichar c1,c2;
  2691     lbytes=strlen(aline);
  2692     if (g_utf8_strlen(aline,lbytes)>1)
  2693     {
  2694 	s=g_utf8_prev_char(aline+lbytes);
  2695 	c1=g_utf8_get_char(s);
  2696 	c2=g_utf8_get_char(g_utf8_prev_char(s));
  2697 	if ((CHAR_IS_DQUOTE(c1) || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)
  2698 	{
  2699 	    if (pswit[ECHO_SWITCH])
  2700 		g_print("\n%s\n",aline);
  2701 	    if (!pswit[OVERVIEW_SWITCH])
  2702 		g_print("    Line %ld column %ld - Spaced quote?\n",linecnt,
  2703 		  g_utf8_strlen(aline,lbytes));
  2704 	    else
  2705 		cnt_punct++;
  2706 	}
  2707 	c1=g_utf8_get_char(aline);
  2708 	c2=g_utf8_get_char(g_utf8_next_char(aline));
  2709 	if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)
  2710 	{
  2711 	    if (pswit[ECHO_SWITCH])
  2712 		g_print("\n%s\n",aline);
  2713 	    if (!pswit[OVERVIEW_SWITCH])
  2714 		g_print("    Line %ld column 1 - Spaced quote?\n",linecnt);
  2715 	    else
  2716 		cnt_punct++;
  2717 	}
  2718 	/*
  2719 	 * Dash at end of line may well be legit - paranoid mode only
  2720 	 * and don't report em-dash at line-end.
  2721 	 */
  2722 	if (pswit[PARANOID_SWITCH] && warnings->hyphen)
  2723 	{
  2724 	    for (s=g_utf8_prev_char(aline+lbytes);
  2725 	      s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
  2726 		;
  2727 	    if (g_utf8_get_char(s)=='-' &&
  2728 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')
  2729 	    {
  2730 		if (pswit[ECHO_SWITCH])
  2731 		    g_print("\n%s\n",aline);
  2732 		if (!pswit[OVERVIEW_SWITCH])
  2733 		    g_print("    Line %ld column %ld - "
  2734 		      "Hyphen at end of line?\n",
  2735 		      linecnt,g_utf8_pointer_to_offset(aline,s));
  2736 	    }
  2737 	}
  2738     }
  2739 }
  2740 
  2741 /*
  2742  * check_for_unspaced_bracket:
  2743  *
  2744  * Brackets are often unspaced, but shouldn't be surrounded by alpha.
  2745  * If so, suspect a scanno like "a]most".
  2746  */
  2747 void check_for_unspaced_bracket(const char *aline)
  2748 {
  2749     const char *s;
  2750     gunichar c,nc,pc;
  2751     c=g_utf8_get_char(aline);
  2752     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2753     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2754     {
  2755 	pc=c;
  2756 	c=nc;
  2757 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2758 	if (!nc)
  2759 	    break;
  2760 	/* for each bracket character in the line except 1st & last */
  2761 	if (g_utf8_strchr("{[()]}",-1,c) &&
  2762 	  g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
  2763 	{
  2764 	    if (pswit[ECHO_SWITCH])
  2765 		g_print("\n%s\n",aline);
  2766 	    if (!pswit[OVERVIEW_SWITCH])
  2767 		g_print("    Line %ld column %ld - Unspaced bracket?\n",
  2768 		  linecnt,g_utf8_pointer_to_offset(aline,s));
  2769 	    else
  2770 		cnt_punct++;
  2771 	}
  2772     }
  2773 }
  2774 
  2775 /*
  2776  * check_for_unpunctuated_endquote:
  2777  */
  2778 void check_for_unpunctuated_endquote(const char *aline)
  2779 {
  2780     const char *s;
  2781     gunichar c,nc,pc;
  2782     QuoteClass qc;
  2783     c=g_utf8_get_char(aline);
  2784     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2785     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2786     {
  2787 	pc=c;
  2788 	c=nc;
  2789 	qc=CHAR_IS_DQUOTE(c)?QUOTE_CLASS(c):INVALID_QUOTE;
  2790 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2791 	/* for each character in the line except 1st */
  2792 	if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) && g_unichar_isalpha(pc))
  2793 	{
  2794 	    if (pswit[ECHO_SWITCH])
  2795 		g_print("\n%s\n",aline);
  2796 	    if (!pswit[OVERVIEW_SWITCH])
  2797 		g_print("    Line %ld column %ld - "
  2798 		  "endquote missing punctuation?\n",
  2799 		  linecnt,g_utf8_pointer_to_offset(aline,s));
  2800 	    else
  2801 		cnt_punct++;
  2802 	}
  2803     }
  2804 }
  2805 
  2806 /*
  2807  * check_for_html_tag:
  2808  *
  2809  * Check for <HTML TAG>.
  2810  *
  2811  * If there is a < in the line, followed at some point
  2812  * by a > then we suspect HTML.
  2813  */
  2814 void check_for_html_tag(const char *aline)
  2815 {
  2816     const char *open,*close;
  2817     gchar *tag;
  2818     open=strchr(aline,'<');
  2819     if (open)
  2820     {
  2821 	close=strchr(g_utf8_next_char(open),'>');
  2822 	if (close)
  2823 	{
  2824 	    if (pswit[ECHO_SWITCH])
  2825 		g_print("\n%s\n",aline);
  2826 	    if (!pswit[OVERVIEW_SWITCH])
  2827 	    {
  2828 		tag=g_strndup(open,close-open+1);
  2829 		g_print("    Line %ld column %ld - HTML Tag? %s \n",
  2830 		  linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
  2831 		g_free(tag);
  2832 	    }
  2833 	    else
  2834 		cnt_html++;
  2835 	}
  2836     }
  2837 }
  2838 
  2839 /*
  2840  * check_for_html_entity:
  2841  *
  2842  * Check for &symbol; HTML.
  2843  *
  2844  * If there is a & in the line, followed at
  2845  * some point by a ; then we suspect HTML.
  2846  */
  2847 void check_for_html_entity(const char *aline)
  2848 {
  2849     const char *s,*amp,*scolon;
  2850     gchar *entity;
  2851     amp=strchr(aline,'&');
  2852     if (amp)
  2853     {
  2854 	scolon=strchr(amp,';');
  2855 	if (scolon)
  2856 	{
  2857 	    for (s=amp;s<scolon;s=g_utf8_next_char(s))   
  2858 		if (g_utf8_get_char(s)==CHAR_SPACE)
  2859 		    break;		/* Don't report "Jones & Son;" */
  2860 	    if (s>=scolon)
  2861 	    {
  2862 		if (pswit[ECHO_SWITCH])
  2863 		    g_print("\n%s\n",aline);
  2864 		if (!pswit[OVERVIEW_SWITCH])
  2865 		{
  2866 		    entity=g_strndup(amp,scolon-amp+1);
  2867 		    g_print("    Line %ld column %d - HTML symbol? %s \n",
  2868 		      linecnt,(int)(amp-aline)+1,entity);
  2869 		    g_free(entity);
  2870 		}
  2871 		else
  2872 		    cnt_html++;
  2873 	    }
  2874 	}
  2875     }
  2876 }
  2877 
  2878 /*
  2879  * check_for_omitted_punctuation:
  2880  *
  2881  * Check for omitted punctuation at end of paragraph by working back
  2882  * through prevline. DW.
  2883  * Need to check this only for "normal" paras.
  2884  * So what is a "normal" para?
  2885  *    Not normal if one-liner (chapter headings, etc.)
  2886  *    Not normal if doesn't contain at least one locase letter
  2887  *    Not normal if starts with space
  2888  */
  2889 void check_for_omitted_punctuation(const char *prevline,
  2890   struct line_properties *last,int start_para_line)
  2891 {
  2892     gboolean letter_on_line=FALSE;
  2893     const char *s;
  2894     gunichar c;
  2895     gboolean closing_quote;
  2896     for (s=prevline;*s;s=g_utf8_next_char(s))
  2897 	if (g_unichar_isalpha(g_utf8_get_char(s)))
  2898 	{
  2899 	    letter_on_line=TRUE;
  2900 	    break;
  2901 	}
  2902     /*
  2903      * This next "if" is a problem.
  2904      * If we say "start_para_line <= linecnt - 1", that includes
  2905      * one-line "paragraphs" like chapter heads. Lotsa false positives.
  2906      * If we say "start_para_line < linecnt - 1" it doesn't, but then it
  2907      * misses genuine one-line paragraphs.
  2908      */
  2909     if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
  2910       g_utf8_get_char(prevline)>CHAR_SPACE)
  2911     {
  2912 	s=prevline+strlen(prevline);
  2913 	do
  2914 	{
  2915 	    s=g_utf8_prev_char(s);
  2916 	    c=g_utf8_get_char(s);
  2917 	    if (QUOTE_CLASS(c)==CLOSING_QUOTE || QUOTE_CLASS(c)==NEUTRAL_QUOTE)
  2918 		closing_quote=TRUE;
  2919 	    else
  2920 		closing_quote=FALSE;
  2921 	} while (closing_quote && s>prevline);
  2922 	for (;s>prevline;s=g_utf8_prev_char(s))
  2923 	{
  2924 	    if (g_unichar_isalpha(g_utf8_get_char(s)))
  2925 	    {
  2926 		if (pswit[ECHO_SWITCH])
  2927 		    g_print("\n%s\n",prevline);
  2928 		if (!pswit[OVERVIEW_SWITCH])
  2929 		    g_print("    Line %ld column %ld - "
  2930 		      "No punctuation at para end?\n",
  2931 		      linecnt-1,g_utf8_strlen(prevline,-1));
  2932 		else
  2933 		    cnt_punct++;
  2934 		break;
  2935 	    }
  2936 	    if (g_utf8_strchr("—-.:!([{?}])",-1,g_utf8_get_char(s)))
  2937 		break;
  2938 	}
  2939     }
  2940 }
  2941 
  2942 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
  2943 {
  2944     const char *word=key;
  2945     int *dupcnt=value;
  2946     if (*dupcnt)
  2947 	g_print("\nNote: Queried word %s was duplicated %d times\n",
  2948 	  word,*dupcnt);
  2949     return FALSE;
  2950 }
  2951 
  2952 void print_as_windows_1252(const char *string)
  2953 {
  2954     gsize inbytes,outbytes;
  2955     gchar *buf,*bp;
  2956     static GIConv converter=(GIConv)-1;
  2957     if (!string)
  2958     {
  2959 	if (converter!=(GIConv)-1)
  2960 	    g_iconv_close(converter);
  2961 	converter=(GIConv)-1;
  2962 	return;
  2963     }
  2964     if (converter==(GIConv)-1)
  2965 	converter=g_iconv_open("WINDOWS-1252","UTF-8");
  2966     if (converter!=(GIConv)-1)
  2967     {
  2968 	inbytes=outbytes=strlen(string);
  2969 	bp=buf=g_malloc(outbytes+1);
  2970 	g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
  2971 	*bp='\0';
  2972 	fputs(buf,stdout);
  2973 	g_free(buf);
  2974     }
  2975     else
  2976 	fputs(string,stdout);
  2977 }
  2978 
  2979 void print_as_utf_8(const char *string)
  2980 {
  2981     fputs(string,stdout);
  2982 }
  2983 
  2984 /*
  2985  * procfile:
  2986  *
  2987  * Process one file.
  2988  */
  2989 void procfile(const char *filename)
  2990 {
  2991     const char *s;
  2992     gchar *parastart=NULL;	/* first line of current para */
  2993     gchar *etext,*aline;
  2994     gchar *etext_ptr;
  2995     GError *err=NULL;
  2996     struct first_pass_results *first_pass_results;
  2997     struct warnings *warnings;
  2998     struct counters counters={0};
  2999     struct line_properties last={0};
  3000     struct parities parities={0};
  3001     struct pending pending={0};
  3002     gboolean isemptyline;
  3003     long start_para_line=0;
  3004     gboolean isnewpara=FALSE,enddash=FALSE;
  3005     last.start=CHAR_SPACE;
  3006     linecnt=checked_linecnt=0;
  3007     etext=read_etext(filename,&err);
  3008     if (!etext)
  3009     {
  3010 	if (pswit[STDOUT_SWITCH])
  3011 	    fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
  3012 	else
  3013 	    fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
  3014 	exit(1);
  3015     }
  3016     g_print("\n\nFile: %s\n\n",filename);
  3017     first_pass_results=first_pass(etext);
  3018     warnings=report_first_pass(first_pass_results);
  3019     qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
  3020     qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
  3021     /*
  3022      * Here we go with the main pass. Hold onto yer hat!
  3023      */
  3024     linecnt=0;
  3025     etext_ptr=etext;
  3026     while ((aline=flgets(&etext_ptr,linecnt+1)))
  3027     {
  3028 	linecnt++;
  3029 	if (linecnt==1)
  3030 	    isnewpara=TRUE;
  3031 	if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
  3032 	    continue;    // skip DP page separators completely
  3033 	if (linecnt<first_pass_results->firstline ||
  3034 	  (first_pass_results->footerline>0 &&
  3035 	  linecnt>first_pass_results->footerline))
  3036 	{
  3037 	    if (pswit[HEADER_SWITCH])
  3038 	    {
  3039 		if (g_str_has_prefix(aline,"Title:"))
  3040 		    g_print("    %s\n",aline);
  3041 		if (g_str_has_prefix(aline,"Author:"))
  3042 		    g_print("    %s\n",aline);
  3043 		if (g_str_has_prefix(aline,"Release Date:"))
  3044 		    g_print("    %s\n",aline);
  3045 		if (g_str_has_prefix(aline,"Edition:"))
  3046 		    g_print("    %s\n\n",aline);
  3047 	    }
  3048 	    continue;		/* skip through the header */
  3049 	}
  3050 	checked_linecnt++;
  3051 	print_pending(aline,parastart,&pending);
  3052 	isemptyline=analyse_quotes(aline,&counters);
  3053 	if (isnewpara && !isemptyline)
  3054 	{
  3055 	    /* This line is the start of a new paragraph. */
  3056 	    start_para_line=linecnt;
  3057 	    /* Capture its first line in case we want to report it later. */
  3058 	    g_free(parastart);
  3059 	    parastart=g_strdup(aline);
  3060 	    memset(&parities,0,sizeof(parities));  /* restart the quote count */
  3061 	    s=aline;
  3062 	    while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
  3063 	      !g_unichar_isdigit(g_utf8_get_char(s)))
  3064 		s=g_utf8_next_char(s);
  3065 	    if (g_unichar_islower(g_utf8_get_char(s)))
  3066 	    {
  3067 		/* and its first letter is lowercase */
  3068 		if (pswit[ECHO_SWITCH])
  3069 		    g_print("\n%s\n",aline);
  3070 		if (!pswit[OVERVIEW_SWITCH])
  3071 		    g_print("    Line %ld column %ld - "
  3072 		      "Paragraph starts with lower-case\n",
  3073 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  3074 		else
  3075 		    cnt_punct++;
  3076 	    }
  3077 	    isnewpara=FALSE; /* Signal the end of new para processing. */
  3078 	}
  3079 	/* Check for an em-dash broken at line end. */
  3080 	if (enddash && g_utf8_get_char(aline)=='-')
  3081 	{
  3082 	    if (pswit[ECHO_SWITCH])
  3083 		g_print("\n%s\n",aline);
  3084 	    if (!pswit[OVERVIEW_SWITCH])
  3085 		g_print("    Line %ld column 1 - Broken em-dash?\n",linecnt);
  3086 	    else
  3087 		cnt_punct++;
  3088 	}
  3089 	enddash=FALSE;
  3090 	for (s=g_utf8_prev_char(aline+strlen(aline));
  3091 	  g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
  3092 	    ;
  3093 	if (s>=aline && g_utf8_get_char(s)=='-')
  3094 	    enddash=TRUE;
  3095 	check_for_control_characters(aline);
  3096 	check_for_odd_characters(aline,warnings,isemptyline);
  3097 	if (warnings->longline)
  3098 	    check_for_long_line(aline);
  3099 	if (warnings->shortline)
  3100 	    check_for_short_line(aline,&last);
  3101 	last.blen=last.len;
  3102 	last.len=g_utf8_strlen(aline,-1);
  3103 	last.start=g_utf8_get_char(aline);
  3104 	check_for_starting_punctuation(aline);
  3105 	if (warnings->dash)
  3106 	{
  3107 	    check_for_spaced_emdash(aline);
  3108 	    check_for_spaced_dash(aline);
  3109 	}
  3110 	check_for_unmarked_paragraphs(aline);
  3111 	check_for_jeebies(aline);
  3112 	check_for_mta_from(aline);
  3113 	check_for_orphan_character(aline);
  3114 	check_for_pling_scanno(aline);
  3115 	check_for_extra_period(aline,warnings);
  3116 	check_for_following_punctuation(aline);
  3117 	check_for_typos(aline,warnings);
  3118 	check_for_misspaced_punctuation(aline,&parities,isemptyline);
  3119 	check_for_double_punctuation(aline,warnings);
  3120 	check_for_spaced_quotes(aline);
  3121 	check_for_miscased_genative(aline);
  3122 	check_end_of_line(aline,warnings);
  3123 	check_for_unspaced_bracket(aline);
  3124 	if (warnings->endquote)
  3125 	    check_for_unpunctuated_endquote(aline);
  3126 	check_for_html_tag(aline);
  3127 	check_for_html_entity(aline);
  3128 	if (isemptyline)
  3129 	{
  3130 	    check_for_mismatched_quotes(&counters,&pending);
  3131 	    counters_reset(&counters);
  3132 	    /* let the next iteration know that it's starting a new para */
  3133 	    isnewpara=TRUE;
  3134 	    if (prevline)
  3135 		check_for_omitted_punctuation(prevline,&last,start_para_line);
  3136 	}
  3137 	g_free(prevline);
  3138 	prevline=g_strdup(aline);
  3139     }
  3140     linecnt++;
  3141     check_for_mismatched_quotes(&counters,&pending);
  3142     print_pending(NULL,parastart,&pending);
  3143     reset_pending(&pending);
  3144     if (prevline)
  3145     {
  3146 	g_free(prevline);
  3147 	prevline=NULL;
  3148     }
  3149     g_free(parastart);
  3150     g_free(prevline);
  3151     g_free(etext);
  3152     if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])
  3153 	g_tree_foreach(qword,report_duplicate_queries,NULL);
  3154     g_tree_unref(qword);
  3155     g_tree_unref(qperiod);
  3156     counters_destroy(&counters);
  3157     g_set_print_handler(NULL);
  3158     print_as_windows_1252(NULL);
  3159     if (pswit[MARKUP_SWITCH])  
  3160 	loseentities(NULL);
  3161 }
  3162 
  3163 /*
  3164  * flgets:
  3165  *
  3166  * Get one line from the input text, checking for
  3167  * the existence of exactly one CR/LF line-end per line.
  3168  *
  3169  * Returns: a pointer to the line.
  3170  */
  3171 char *flgets(char **etext,long lcnt)
  3172 {
  3173     gunichar c;
  3174     gboolean isCR=FALSE;
  3175     char *theline=*etext;
  3176     char *eos=theline;
  3177     gchar *s;
  3178     for (;;)
  3179     {
  3180 	c=g_utf8_get_char(*etext);
  3181 	if (!c)
  3182 	{
  3183 	    if (*etext==theline)
  3184 		return NULL;
  3185 	    else if (pswit[LINE_END_SWITCH])
  3186 	    {
  3187 		if (pswit[ECHO_SWITCH])
  3188 		{
  3189 		    s=g_strndup(theline,eos-theline);
  3190 		    g_print("\n%s\n",s);
  3191 		    g_free(s);
  3192 		}
  3193 		if (!pswit[OVERVIEW_SWITCH])
  3194 		    /* There may, or may not, have been a CR */
  3195 		    g_print("    Line %ld - No LF?\n",lcnt);
  3196 		else
  3197 		    cnt_lineend++;
  3198 	    }
  3199 	    break;
  3200 	}
  3201 	*etext=g_utf8_next_char(*etext);
  3202 	/* either way, it's end of line */
  3203 	if (c=='\n')
  3204 	{
  3205 	    if (isCR)
  3206 		break;
  3207 	    else
  3208 	    {
  3209 		/* Error - a LF without a preceding CR */
  3210 		if (pswit[LINE_END_SWITCH])
  3211 		{
  3212 		    if (pswit[ECHO_SWITCH])
  3213 		    {
  3214 			s=g_strndup(theline,eos-theline);
  3215 			g_print("\n%s\n",s);
  3216 			g_free(s);
  3217 		    }
  3218 		    if (!pswit[OVERVIEW_SWITCH])
  3219 			g_print("    Line %ld - No CR?\n",lcnt);
  3220 		    else
  3221 			cnt_lineend++;
  3222 		}
  3223 		break;
  3224 	    }
  3225 	}
  3226 	if (c=='\r')
  3227 	{
  3228 	    if (isCR)
  3229 	    {
  3230 		/* Error - two successive CRs */
  3231 		if (pswit[LINE_END_SWITCH])
  3232 		{
  3233 		    if (pswit[ECHO_SWITCH])
  3234 		    {
  3235 			s=g_strndup(theline,eos-theline);
  3236 			g_print("\n%s\n",s);
  3237 			g_free(s);
  3238 		    }
  3239 		    if (!pswit[OVERVIEW_SWITCH])
  3240 			g_print("    Line %ld - Two successive CRs?\n",lcnt);
  3241 		    else
  3242 			cnt_lineend++;
  3243 		}
  3244 	    }
  3245 	    isCR=TRUE;
  3246 	}
  3247 	else
  3248 	{
  3249 	    if (pswit[LINE_END_SWITCH] && isCR)
  3250 	    {
  3251 		if (pswit[ECHO_SWITCH])
  3252 		{
  3253 		    s=g_strndup(theline,eos-theline);
  3254 		    g_print("\n%s\n",s);
  3255 		    g_free(s);
  3256 		}
  3257 		if (!pswit[OVERVIEW_SWITCH])
  3258 		    g_print("    Line %ld column %ld - CR without LF?\n",
  3259 		      lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
  3260 		else
  3261 		    cnt_lineend++;
  3262 		*eos=' ';
  3263 	    }
  3264 	    isCR=FALSE;
  3265 	    eos=g_utf8_next_char(eos);
  3266 	}
  3267     }
  3268     *eos='\0';
  3269     if (pswit[MARKUP_SWITCH])  
  3270 	postprocess_for_HTML(theline);
  3271     if (pswit[DP_SWITCH])  
  3272 	postprocess_for_DP(theline);
  3273     return theline;
  3274 }
  3275 
  3276 /*
  3277  * mixdigit:
  3278  *
  3279  * Takes a "word" as a parameter, and checks whether it
  3280  * contains a mixture of alpha and digits. Generally, this is an
  3281  * error, but may not be for cases like 4th or L5 12s. 3d.
  3282  *
  3283  * Returns: TRUE iff an is error found.
  3284  */
  3285 gboolean mixdigit(const char *checkword)
  3286 {
  3287     gboolean wehaveadigit,wehavealetter,query;
  3288     const char *s,*nondigit;
  3289     wehaveadigit=wehavealetter=query=FALSE;
  3290     for (s=checkword;*s;s=g_utf8_next_char(s))
  3291 	if (g_unichar_isalpha(g_utf8_get_char(s)))
  3292 	    wehavealetter=TRUE;
  3293 	else if (g_unichar_isdigit(g_utf8_get_char(s)))
  3294 	    wehaveadigit=TRUE;
  3295     if (wehaveadigit && wehavealetter)
  3296     {
  3297 	/* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
  3298 	query=TRUE;
  3299 	for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
  3300 	  nondigit=g_utf8_next_char(nondigit))
  3301 	    ;
  3302 	/* digits, ending in st, rd, nd, th of either case */
  3303 	if (!g_ascii_strcasecmp(nondigit,"st") ||
  3304 	  !g_ascii_strcasecmp(nondigit,"rd") ||
  3305 	  !g_ascii_strcasecmp(nondigit,"nd") ||
  3306 	  !g_ascii_strcasecmp(nondigit,"th"))
  3307 	    query=FALSE;
  3308 	if (!g_ascii_strcasecmp(nondigit,"sts") ||
  3309 	  !g_ascii_strcasecmp(nondigit,"rds") ||
  3310 	  !g_ascii_strcasecmp(nondigit,"nds") ||
  3311 	  !g_ascii_strcasecmp(nondigit,"ths"))
  3312 	    query=FALSE;
  3313 	if (!g_ascii_strcasecmp(nondigit,"stly") ||
  3314 	  !g_ascii_strcasecmp(nondigit,"rdly") ||
  3315 	  !g_ascii_strcasecmp(nondigit,"ndly") ||
  3316 	  !g_ascii_strcasecmp(nondigit,"thly"))
  3317 	    query=FALSE;
  3318 	/* digits, ending in l, L, s or d */
  3319 	if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
  3320 	  !strcmp(nondigit,"d"))
  3321 	    query=FALSE;
  3322 	/*
  3323 	 * L at the start of a number, representing Britsh pounds, like L500.
  3324 	 * This is cute. We know the current word is mixed digit. If the first
  3325 	 * letter is L, there must be at least one digit following. If both
  3326 	 * digits and letters follow, we have a genuine error, else we have a
  3327 	 * capital L followed by digits, and we accept that as a non-error.
  3328 	 */
  3329 	if (g_utf8_get_char(checkword)=='L' &&
  3330 	  !mixdigit(g_utf8_next_char(checkword)))
  3331 	    query=FALSE;
  3332     }
  3333     return query;
  3334 }
  3335 
  3336 /*
  3337  * getaword:
  3338  *
  3339  * Extracts the first/next "word" from the line, and returns it.
  3340  * A word is defined as one English word unit--or at least that's the aim.
  3341  * "ptr" is advanced to the position in the line where we will start
  3342  * looking for the next word.
  3343  *
  3344  * Returns: A newly-allocated string.
  3345  */
  3346 gchar *getaword(const char **ptr)
  3347 {
  3348     const char *s,*t;
  3349     GString *word;
  3350     gunichar c,pc;
  3351     word=g_string_new(NULL);
  3352     for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
  3353       !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
  3354       **ptr;*ptr=g_utf8_next_char(*ptr))
  3355 	;
  3356     /*
  3357      * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
  3358      * Especially yucky is the case of L1,000
  3359      * This section looks for a pattern of characters including a digit
  3360      * followed by a comma or period followed by one or more digits.
  3361      * If found, it returns this whole pattern as a word; otherwise we discard
  3362      * the results and resume our normal programming.
  3363      */
  3364     s=*ptr;
  3365     for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
  3366       g_unichar_isalpha(g_utf8_get_char(s)) ||
  3367       g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
  3368 	g_string_append_unichar(word,g_utf8_get_char(s));
  3369     if (word->len)
  3370     {
  3371 	for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))
  3372 	{
  3373 	    c=g_utf8_get_char(t);
  3374 	    pc=g_utf8_get_char(g_utf8_prev_char(t));
  3375 	    if ((c=='.' || c==',') && g_unichar_isdigit(pc))
  3376 	    {
  3377 		*ptr=s;
  3378 		return g_string_free(word,FALSE);
  3379 	    }
  3380 	}
  3381     }
  3382     /* we didn't find a punctuated number - do the regular getword thing */
  3383     g_string_truncate(word,0);
  3384     c=g_utf8_get_char(*ptr);
  3385     for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);
  3386       *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))
  3387 	g_string_append_unichar(word,c);
  3388     return g_string_free(word,FALSE);
  3389 }
  3390 
  3391 /*
  3392  * isroman:
  3393  *
  3394  * Is this word a Roman Numeral?
  3395  *
  3396  * It doesn't actually validate that the number is a valid Roman Numeral--for
  3397  * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
  3398  * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
  3399  * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
  3400  * expressions thereof, except when it came to taxes. Allow any number of M,
  3401  * an optional D, an optional CM or CD, any number of optional Cs, an optional
  3402  * XL or an optional XC, an optional IX or IV, an optional V and any number
  3403  * of optional Is.
  3404  */
  3405 gboolean isroman(const char *t)
  3406 {
  3407     const char *s;
  3408     if (!t || !*t)
  3409 	return FALSE;
  3410     s=t;
  3411     while (g_utf8_get_char(t)=='m' && *t)
  3412 	t++;
  3413     if (g_utf8_get_char(t)=='d')
  3414 	t++;
  3415     if (g_str_has_prefix(t,"cm"))
  3416 	t+=2;
  3417     if (g_str_has_prefix(t,"cd"))
  3418 	t+=2;
  3419     while (g_utf8_get_char(t)=='c' && *t)
  3420 	t++;
  3421     if (g_str_has_prefix(t,"xl"))
  3422 	t+=2;
  3423     if (g_str_has_prefix(t,"xc"))
  3424 	t+=2;
  3425     if (g_utf8_get_char(t)=='l')
  3426 	t++;
  3427     while (g_utf8_get_char(t)=='x' && *t)
  3428 	t++;
  3429     if (g_str_has_prefix(t,"ix"))
  3430 	t+=2;
  3431     if (g_str_has_prefix(t,"iv"))
  3432 	t+=2;
  3433     if (g_utf8_get_char(t)=='v')
  3434 	t++;
  3435     while (g_utf8_get_char(t)=='i' && *t)
  3436 	t++;
  3437     return !*t;
  3438 }
  3439 
  3440 /*
  3441  * postprocess_for_DP:
  3442  *
  3443  * Invoked with the -d switch from flgets().
  3444  * It simply "removes" from the line a hard-coded set of common
  3445  * DP-specific tags, so that the line passed to the main routine has
  3446  * been pre-cleaned of DP markup.
  3447  */
  3448 void postprocess_for_DP(char *theline)
  3449 {
  3450     char *s,*t;
  3451     int i;
  3452     if (!*theline) 
  3453 	return;
  3454     for (i=0;*DPmarkup[i];i++)
  3455 	while ((s=strstr(theline,DPmarkup[i])))
  3456 	{
  3457 	    t=s+strlen(DPmarkup[i]);
  3458 	    memmove(s,t,strlen(t)+1);
  3459 	}
  3460 }
  3461 
  3462 /*
  3463  * postprocess_for_HTML:
  3464  *
  3465  * Invoked with the -m switch from flgets().
  3466  * It simply "removes" from the line a hard-coded set of common
  3467  * HTML tags and "replaces" a hard-coded set of common HTML
  3468  * entities, so that the line passed to the main routine has
  3469  * been pre-cleaned of HTML.
  3470  */
  3471 void postprocess_for_HTML(char *theline)
  3472 {
  3473     while (losemarkup(theline))
  3474 	;
  3475     loseentities(theline);
  3476 }
  3477 
  3478 char *losemarkup(char *theline)
  3479 {
  3480     char *s,*t;
  3481     int i;
  3482     s=strchr(theline,'<');
  3483     t=s?strchr(s,'>'):NULL;
  3484     if (!s || !t)
  3485 	return NULL;
  3486     for (i=0;*markup[i];i++)
  3487 	if (tagcomp(g_utf8_next_char(s),markup[i]))
  3488 	{
  3489 	    t=g_utf8_next_char(t);
  3490 	    memmove(s,t,strlen(t)+1);
  3491 	    return s;
  3492 	}
  3493     /* It's an unrecognized <xxx>. */
  3494     return NULL;
  3495 }
  3496 
  3497 void loseentities(char *theline)
  3498 {
  3499     int i;
  3500     gsize nb;
  3501     char *amp,*scolon;
  3502     gchar *s,*t;
  3503     gunichar c;
  3504     GTree *entities=NULL;
  3505     static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
  3506     if (!theline)
  3507     {
  3508 	if (entities)
  3509 	    g_tree_destroy(entities);
  3510 	entities=NULL;
  3511 	if (translit!=(GIConv)-1)
  3512 	    g_iconv_close(translit);
  3513 	translit=(GIConv)-1;
  3514 	if (to_utf8!=(GIConv)-1)
  3515 	    g_iconv_close(to_utf8);
  3516 	to_utf8=(GIConv)-1;
  3517 	return;
  3518     }
  3519     if (!*theline)
  3520 	return;
  3521     if (!entities)
  3522     {
  3523 	entities=g_tree_new((GCompareFunc)strcmp);
  3524 	for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
  3525 	    g_tree_insert(entities,HTMLentities[i].name,
  3526 	      GUINT_TO_POINTER(HTMLentities[i].c));
  3527     }
  3528     if (translit==(GIConv)-1)
  3529 	translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
  3530     if (to_utf8==(GIConv)-1)
  3531 	to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
  3532     while((amp=strchr(theline,'&')))
  3533     {
  3534 	scolon=strchr(amp,';');
  3535 	if (scolon)
  3536 	{
  3537 	    if (amp[1]=='#')
  3538 	    {
  3539 		if (amp+2+strspn(amp+2,"0123456789")==scolon)
  3540 		    c=strtol(amp+2,NULL,10);
  3541 		else if (amp[2]=='x' &&
  3542 		  amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
  3543 		    c=strtol(amp+3,NULL,16);
  3544 	    }
  3545 	    else
  3546 	    {
  3547 		s=g_strndup(amp+1,scolon-(amp+1));
  3548 	        c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
  3549 		g_free(s);
  3550 	    }
  3551 	}
  3552 	else
  3553 	    c=0;
  3554 	if (c)
  3555 	{
  3556 	    theline=amp;
  3557 	    if (c<128 || c>=192 && c<=255)	/* An ISO-8859-1 character */
  3558 		theline+=g_unichar_to_utf8(c,theline);
  3559 	    else
  3560 	    {
  3561 		s=g_malloc(6);
  3562 		nb=g_unichar_to_utf8(c,s);
  3563 		t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
  3564 		g_free(s);
  3565 		s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
  3566 		g_free(t);
  3567 		memcpy(theline,s,nb);
  3568 		g_free(s);
  3569 		theline+=nb;
  3570 	    }
  3571 	    memmove(theline,g_utf8_next_char(scolon),
  3572 	      strlen(g_utf8_next_char(scolon))+1);
  3573 	}
  3574 	else
  3575 	    theline=g_utf8_next_char(amp);
  3576     }
  3577 }
  3578 
  3579 gboolean tagcomp(const char *strin,const char *basetag)
  3580 {
  3581     gboolean retval;
  3582     gchar *s,*t;
  3583     if (g_utf8_get_char(strin)=='/')
  3584 	t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
  3585     else
  3586 	t=g_utf8_casefold(strin,-1);
  3587     s=g_utf8_casefold(basetag,-1);
  3588     retval=g_str_has_prefix(t,s);
  3589     g_free(s);
  3590     g_free(t);
  3591     return retval;
  3592 }
  3593 
  3594 void proghelp(GOptionContext *context)
  3595 {
  3596     gchar *help;
  3597     fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
  3598     fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
  3599     fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
  3600     fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
  3601       "For details, read the file COPYING.\n",stderr);
  3602     fputs("This is Free Software; "
  3603       "you may redistribute it under certain conditions (GPL);\n",stderr);
  3604     fputs("read the file COPYING for details.\n\n",stderr);
  3605     help=g_option_context_get_help(context,TRUE,NULL);
  3606     fputs(help,stderr);
  3607     g_free(help);
  3608     fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
  3609     fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
  3610       "non-ASCII\n",stderr);
  3611     fputs("characters like accented letters, "
  3612       "lines longer than 75 or shorter than 55,\n",stderr);
  3613     fputs("unbalanced quotes or brackets, "
  3614       "a variety of badly formatted punctuation, \n",stderr);
  3615     fputs("HTML tags, some likely typos. "
  3616       "It is NOT a substitute for human judgement.\n",stderr);
  3617     fputs("\n",stderr);
  3618 }