bookloupe/bookloupe.c
author ali <ali@juiblex.co.uk>
Mon Oct 21 23:51:21 2013 +0100 (2013-10-21)
changeset 195 cea17274ce55
parent 193 7fdf168fb748
parent 189 43b8447c9ea7
permissions -rw-r--r--
Merge bug #19: Update documentation for 2.1
     1 /*************************************************************************/
     2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
     3 /*									 */
     4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>			 */
     5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>			 */
     6 /*									 */
     7 /* This program is free software; you can redistribute it and/or modify  */
     8 /* it under the terms of the GNU General Public License as published by  */
     9 /* the Free Software Foundation; either version 2 of the License, or     */
    10 /* (at your option) any later version.					 */
    11 /*									 */
    12 /* This program is distributed in the hope that it will be useful,       */
    13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of	 */
    14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the		 */
    15 /* GNU General Public License for more details.				 */
    16 /*									 */
    17 /* You should have received a copy of the GNU General Public License	 */
    18 /* along with this program. If not, see <http://www.gnu.org/licenses/>.	 */
    19 /*************************************************************************/
    20 
    21 #include <stdio.h>
    22 #include <stdlib.h>
    23 #include <string.h>
    24 #include <ctype.h>
    25 #ifdef __WIN32__
    26 #include <windows.h>
    27 #endif
    28 #include <glib.h>
    29 #include <bl/bl.h>
    30 #include "bookloupe.h"
    31 #include "counters.h"
    32 #include "pending.h"
    33 #include "HTMLentities.h"
    34 
/*
 * Name of the character set selected by set_charset(), held as a
 * privately allocated string.
 */
gchar *charset;		/* Or NULL for auto (ISO_8859-1/ASCII or UNICODE) */
/*
 * Converter from UTF-8 to charset as opened by set_charset(), or
 * (GIConv)-1 when no converter is open (for example when charset is
 * NULL or "UTF-8").
 */
GIConv charset_validator=(GIConv)-1;

/*
 * NOTE(review): not referenced in the part of the file visible here;
 * presumably the previously processed line -- confirm against its users.
 */
gchar *prevline;
    39 
/*
 * Common typos.
 *
 * Every entry is lower case and the table ends with an empty string
 * entry (presumably the end-of-table marker for the code that walks it;
 * the lookup code is not in this part of the file).
 */
char *typo[] = {
    "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
    "nad", "te", "ig", "acn",  "ahve", "alot", "anbd", "andt", "awya", "aywa",
    "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
    "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
    "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
    "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
    "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
    "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
    "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
    "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
    "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
    "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
    "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
    "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
    "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
    "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
    "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
    "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
    "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
    "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
    "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
    "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
    "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
    "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
    "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
    "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
    "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
    "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
    "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
    "se", ""
};

/*
 * User-defined typos, keyed by word. Created and filled by
 * read_user_scannos() and released by main(); NULL until then.
 */
GTree *usertypo;
    75 
/*
 * Common abbreviations and other OK words not to query as typos.
 * The table ends with an empty string entry.
 */
char *okword[] = {
    "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
    "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
    "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
    "outbid", "outbids", "frostbite", "frostbitten", ""
};

/*
 * Common abbreviations that cause otherwise unexplained periods.
 * The table ends with an empty string entry.
 */
char *abbrev[] = {
    "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
    "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
};

/*
 * Two-Letter combinations that rarely if ever start words,
 * but are common scannos or otherwise common letter combinations.
 * The table ends with an empty string entry.
 */
char *nostart[] = {
    "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
};

/*
 * Two-Letter combinations that rarely if ever end words,
 * but are common scannos or otherwise common letter combinations.
 * The table ends with an empty string entry.
 */
char *noend[] = {
    "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
    "sw", "gr", "sl", "cl", "iy", ""
};
   106 
/*
 * Lower-case HTML element names, without angle brackets. The table ends
 * with an empty string entry.
 * NOTE(review): presumably the tags recognised by the markup handling
 * (see the "markup" option); the code using it is not visible here.
 */
char *markup[] = {
    "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
    "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
    "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
    "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
};

/*
 * Literal markup sequences, ending with an empty string entry.
 * NOTE(review): presumably the DP-specific markup that the "dp" option
 * ignores; confirm against the code that uses this table.
 */
char *DPmarkup[] = {
    "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
};

/*
 * Lower-case words, ending with an empty string entry. "mrs" is listed
 * twice.
 * NOTE(review): judging by the name, words that are not expected to be
 * followed by a comma; confirm against the code that uses this table.
 */
char *nocomma[] = {
    "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
    "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
    "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
    "during", "let", "toward", "among", ""
};

/*
 * Lower-case words, ending with an empty string entry.
 * NOTE(review): judging by the name, words that are not expected to be
 * followed by a period; confirm against the code that uses this table.
 */
char *noperiod[] = {
    "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
    "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
    "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
    "among", "those", "into", "whom", "having", "thence", ""
}; 
   131 
   132 gboolean pswit[SWITNO];  /* program switches */
   133 gchar *opt_charset;
   134 
   135 gboolean typo_compat,paranoid_compat;
   136 
   137 static GOptionEntry options[]={
   138     { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
   139       "Ignore DP-specific markup", NULL },
   140     { "no-dp", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   141       G_OPTION_ARG_NONE, pswit+DP_SWITCH,
   142       "Don't ignore DP-specific markup", NULL },
   143     { "echo", 0, G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
   144       "Echo queried line", NULL },
   145     { "no-echo", 'e', G_OPTION_FLAG_REVERSE,
   146       G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
   147       "Don't echo queried line", NULL },
   148     { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
   149       "Check single quotes", NULL },
   150     { "no-squote", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   151       G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
   152       "Don't check single quotes", NULL },
   153     { "typo", 0, 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
   154       "Check common typos", NULL },
   155     { "no-typo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   156       G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
   157       "Don't check common typos", NULL },
   158     { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
   159       "Require closure of quotes on every paragraph", NULL },
   160     { "no-qpara", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   161       G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
   162       "Don't require closure of quotes on every paragraph", NULL },
   163     { "paranoid", 0, G_OPTION_FLAG_HIDDEN,
   164       G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
   165       "Enable paranoid querying of everything", NULL },
   166     { "no-paranoid", 0, G_OPTION_FLAG_REVERSE,
   167       G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
   168       "Disable paranoid querying of everything", NULL },
   169     { "line-end", 0, G_OPTION_FLAG_HIDDEN,
   170       G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
   171       "Enable line end checking", NULL },
   172     { "no-line-end", 'l', G_OPTION_FLAG_REVERSE,
   173       G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
   174       "Diable line end checking", NULL },
   175     { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
   176       "Overview: just show counts", NULL },
   177     { "no-overview", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   178       G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
   179       "Show individual warnings", NULL },
   180     { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
   181       "Output errors to stdout instead of stderr", NULL },
   182     { "no-stdout", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   183       G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
   184       "Output errors to stderr instead of stdout", NULL },
   185     { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
   186       "Echo header fields", NULL },
   187     { "no-header", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   188       G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
   189       "Don't echo header fields", NULL },
   190     { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
   191       "Ignore markup in < >", NULL },
   192     { "no-markup", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   193       G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
   194       "No special handling for markup in < >", NULL },
   195     { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
   196       "Use file of user-defined typos", NULL },
   197     { "no-usertypo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   198       G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
   199       "Ignore file of user-defined typos", NULL },
   200     { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
   201       "Verbose - list everything", NULL },
   202     { "no-verbose", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   203       G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
   204       "Switch off verbose mode", NULL },
   205     { "charset", 0, 0, G_OPTION_ARG_STRING, &opt_charset,
   206       "Set of characters valid for this ebook", "NAME" },
   207     { NULL }
   208 };
   209 
/*
 * Options relating to configuration which make no sense from inside
 * a configuration file.
 *
 * These are registered with the option parser by parse_options() but,
 * being in a separate table, are neither read from nor written to the
 * configuration file.
 */

static GOptionEntry config_options[]={
    { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
      "Defaults for use on www upload", NULL },
    { "dump-config", 0, 0, G_OPTION_ARG_NONE, pswit+DUMP_CONFIG_SWITCH,
      "Dump current config settings", NULL },
    { NULL }
};

/*
 * Toggle options kept for compatibility with gutcheck. They only record
 * that they were given; parse_options() inverts the affected switches
 * once the command line has been parsed.
 */
static GOptionEntry compatibility_options[]={
    { "toggle-typo", 't', 0, G_OPTION_ARG_NONE, &typo_compat,
      "Toggle checking for common typos", NULL },
    { "toggle-relaxed", 'x', 0, G_OPTION_ARG_NONE, &paranoid_compat,
      "Toggle both paranoid mode and common typos", NULL },
    { NULL }
};
   230 
/*
 * Query and line counters. main() prints the cnt_* query counters (all
 * except cnt_spacend) and the two line counts when the overview switch
 * is set.
 */
long cnt_quote;		/* for overview mode, count of quote queries */
long cnt_brack;		/* for overview mode, count of brackets queries */
long cnt_bin;		/* for overview mode, count of non-ASCII queries */
long cnt_odd;		/* for overview mode, count of odd character queries */
long cnt_long;		/* for overview mode, count of long line errors */
long cnt_short;		/* for overview mode, count of short line queries */
long cnt_punct;		/* for overview mode,
			   count of punctuation and spacing queries */
long cnt_dash;		/* for overview mode, count of dash-related queries */
long cnt_word;		/* for overview mode, count of word queries */
long cnt_html;		/* for overview mode, count of html queries */
long cnt_lineend;	/* for overview mode, count of line-end queries */
long cnt_spacend;	/* count of lines with space at end */
long linecnt;		/* count of total lines in the file */
long checked_linecnt;	/* count of lines actually checked */
   246 
/* Forward declarations of functions defined later in this file. */
void proghelp(GOptionContext *context);
void procfile(const char *);

/* Directory part of argv[0], set by main(); used to locate support files. */
gchar *running_from;

gboolean mixdigit(const char *);
gchar *getaword(const char **);
char *flgets(char **,long);
void postprocess_for_HTML(char *);
char *linehasmarkup(char *);
char *losemarkup(char *);
gboolean tagcomp(const char *,const char *);
void loseentities(char *);
gboolean isroman(const char *);
void postprocess_for_DP(char *);
void print_as_windows_1252(const char *string);
void print_as_utf_8(const char *string);

/*
 * NOTE(review): not used in the part of the file visible here; presumably
 * per-word bookkeeping for the comma/period checks -- confirm.
 */
GTree *qword,*qperiod;

#ifdef __WIN32__
/* Console output code page at startup; restored by cleanup_on_exit(). */
UINT saved_cp;
#endif
   270 
   271 gboolean set_charset(const char *name,GError **err)
   272 {
   273     /* The various UNICODE encodings all share the same character set. */
   274     const char *unicode_aliases[]={ "UCS-2", "UCS-2BE", "UCS-2LE", "UCS-4",
   275       "UCS-4BE", "UCS-4LE", "UCS2", "UCS4", "UNICODE", "UNICODEBIG",
   276       "UNICODELITTLE", "UTF-7", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE",
   277       "UTF-32", "UTF-32BE", "UTF-32LE", "UTF7", "UTF8", "UTF16", "UTF16BE",
   278       "UTF16LE", "UTF32", "UTF32BE", "UTF32LE" };
   279     int i;
   280     if (charset)
   281 	g_free(charset);
   282     if (charset_validator!=(GIConv)-1)
   283 	g_iconv_close(charset_validator);
   284     if (!name || !g_strcasecmp(name,"auto"))
   285     {
   286 	charset=NULL;
   287 	charset_validator=(GIConv)-1;
   288 	return TRUE;
   289     }
   290     else
   291 	charset=g_strdup(name);
   292     for(i=0;i<G_N_ELEMENTS(unicode_aliases);i++)
   293 	if (!g_strcasecmp(charset,unicode_aliases[i]))
   294 	{
   295 	    g_free(charset);
   296 	    charset=g_strdup("UTF-8");
   297 	    break;
   298 	}
   299     if (!strcmp(charset,"UTF-8"))
   300 	charset_validator=(GIConv)-1;
   301     else
   302     {
   303 	charset_validator=g_iconv_open(charset,"UTF-8");
   304 	if (charset_validator==(GIConv)-1)
   305 	{
   306 	    g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_NO_CONVERSION,
   307 	      "Unknown character set \"%s\"",charset);
   308 	    return FALSE;
   309 	}
   310     }
   311     return TRUE;
   312 }
   313 
   314 GKeyFile *config;
   315 
   316 void config_file_update(GKeyFile *kf)
   317 {
   318     int i;
   319     const char *s;
   320     gboolean sw;
   321     for(i=0;options[i].long_name;i++)
   322     {
   323 	if (g_str_has_prefix(options[i].long_name,"no-"))
   324 	    continue;
   325 	if (options[i].arg==G_OPTION_ARG_NONE)
   326 	{
   327 	    sw=*(gboolean *)options[i].arg_data;
   328 	    if (options[i].flags&G_OPTION_FLAG_REVERSE)
   329 		sw=!sw;
   330 	    g_key_file_set_boolean(kf,"options",options[i].long_name,sw);
   331 	}
   332 	else if (options[i].arg==G_OPTION_ARG_STRING)
   333 	{
   334 	    s=*(gchar **)options[i].arg_data;
   335 	    if (!s)
   336 		s="auto";
   337 	    g_key_file_set_string(kf,"options",options[i].long_name,s);
   338 	}
   339 	else
   340 	    g_assert_not_reached();
   341     }
   342 }
   343 
   344 void config_file_add_comments(GKeyFile *kf)
   345 {
   346     int i;
   347     gchar *comment;
   348     g_key_file_set_comment(kf,NULL,NULL," Default configuration for bookloupe",
   349       NULL);
   350     for(i=0;options[i].long_name;i++)
   351     {
   352 	if (g_str_has_prefix(options[i].long_name,"no-"))
   353 	    continue;
   354 	comment=g_strconcat(" ",options[i].description,NULL);
   355 	g_key_file_set_comment(kf,"options",options[i].long_name,comment,NULL);
   356 	g_free(comment);
   357     }
   358 }
   359 
   360 void dump_config(void)
   361 {
   362     gchar *s;
   363     if (config)
   364 	config_file_update(config);
   365     else
   366     {
   367 	config=g_key_file_new();
   368 	config_file_update(config);
   369 	config_file_add_comments(config);
   370     }
   371     s=g_key_file_to_data(config,NULL,NULL);
   372     if (s)
   373 	g_print("%s",s);
   374     g_free(s);
   375 }
   376 
   377 GKeyFile *read_config_file(gchar **full_path)
   378 {
   379     int i;
   380     GError *err=NULL;
   381     gchar **search_dirs;
   382     gchar *path;
   383     const char *search_path;
   384     GKeyFile *kf;
   385     kf=g_key_file_new();
   386     search_path=g_getenv("BOOKLOUPE_CONFIG_PATH");
   387     if (search_path)
   388     {
   389 #ifdef __WIN32__
   390 	search_dirs=g_strsplit(search_path,";",0);
   391 #else
   392 	search_dirs=g_strsplit(search_path,":",0);
   393 #endif
   394     }
   395     else
   396     {
   397 	search_dirs=g_new(gchar *,4);
   398 	search_dirs[0]=g_get_current_dir();
   399 	search_dirs[1]=g_strdup(running_from);
   400 	search_dirs[2]=g_strdup(g_get_user_config_dir());
   401 	search_dirs[3]=NULL;
   402     }
   403     for(i=0;search_dirs[i];i++)
   404     {
   405 	path=g_build_filename(search_dirs[i],"bookloupe.ini",NULL);
   406 	if (g_key_file_load_from_file(kf,path,
   407 	  G_KEY_FILE_KEEP_COMMENTS|G_KEY_FILE_KEEP_TRANSLATIONS,&err))
   408 	    break;
   409 	if (!g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   410 	{
   411 	    g_printerr("Bookloupe: Error reading %s\n",path);
   412 	    g_printerr("%s\n",err->message);
   413 	    exit(1);
   414 	}
   415 	g_clear_error(&err);
   416 	g_free(path);
   417 	path=NULL;
   418     }
   419     if (!search_dirs[i])
   420     {
   421 	g_key_file_free(kf);
   422 	kf=NULL;
   423     }
   424     g_strfreev(search_dirs);
   425     if (full_path && kf)
   426 	*full_path=path;
   427     else
   428 	g_free(path);
   429     return kf;
   430 }
   431 
   432 void parse_config_file(void)
   433 {
   434     int i,j;
   435     gchar *path,*s;
   436     gchar **keys;
   437     gboolean sw;
   438     GError *err=NULL;
   439     config=read_config_file(&path);
   440     if (config)
   441 	keys=g_key_file_get_keys(config,"options",NULL,NULL);
   442     else
   443 	keys=NULL;
   444     if (keys)
   445     {
   446 	for(i=0;keys[i];i++)
   447 	{
   448 	    for(j=0;options[j].long_name;j++)
   449 	    {
   450 		if (g_str_has_prefix(options[j].long_name,"no-"))
   451 		    continue;
   452 		else if (!strcmp(keys[i],options[j].long_name))
   453 		{
   454 		    if (options[j].arg==G_OPTION_ARG_NONE)
   455 		    {
   456 			sw=g_key_file_get_boolean(config,"options",keys[i],
   457 			  &err);
   458 			if (err)
   459 			{
   460 			    g_printerr("Bookloupe: %s: options.%s: %s\n",
   461 			      path,keys[i],err->message);
   462 			    g_clear_error(&err);
   463 			}
   464 			else
   465 			{
   466 			    if (options[j].flags&G_OPTION_FLAG_REVERSE)
   467 				sw=!sw;
   468 			    *(gboolean *)options[j].arg_data=sw;
   469 			}
   470 			break;
   471 		    }
   472 		    else if (options[j].arg==G_OPTION_ARG_STRING)
   473 		    {
   474 			s=g_key_file_get_string(config,"options",keys[i],
   475 			  &err);
   476 			if (err)
   477 			{
   478 			    g_printerr("Bookloupe: %s: options.%s: %s\n",
   479 			      path,keys[i],err->message);
   480 			    g_clear_error(&err);
   481 			}
   482 			else
   483 			{
   484 			    g_free(*(gchar **)options[j].arg_data);
   485 			    if (!g_strcmp0(s,"auto"))
   486 			    {
   487 				*(gchar **)options[j].arg_data=NULL;
   488 				g_free(s);
   489 			    }
   490 			    else
   491 				*(gchar **)options[j].arg_data=s;
   492 			}
   493 			break;
   494 		    }
   495 		    else
   496 			g_assert_not_reached();
   497 		}
   498 	    }
   499 	    if (!options[j].long_name)
   500 		g_printerr("Bookloupe: %s: Unknown option \"%s\" ignored\n",
   501 		  path,keys[i]);
   502 	}
   503 	g_strfreev(keys);
   504     }
   505     if (config)
   506 	g_free(path);
   507 }
   508 
   509 void parse_options(int *argc,char ***argv)
   510 {
   511     GError *err=NULL;
   512     GOptionContext *context;
   513     GOptionGroup *compatibility;
   514     context=g_option_context_new(
   515       "file - look for errors in Project Gutenberg(TM) etexts");
   516     g_option_context_add_main_entries(context,options,NULL);
   517     g_option_context_add_main_entries(context,config_options,NULL);
   518     compatibility=g_option_group_new("compatibility",
   519       "Options for Compatibility with Gutcheck:",
   520       "Show compatibility options",NULL,NULL);
   521     g_option_group_add_entries(compatibility,compatibility_options);
   522     g_option_context_add_group(context,compatibility);
   523     g_option_context_set_description(context,
   524       "For simplicity, only the switch options which reverse the\n"
   525       "default configuration are listed. In most cases, both vanilla\n"
   526       "and \"no-\" prefixed versions are available for use.");
   527     if (!g_option_context_parse(context,argc,argv,&err))
   528     {
   529 	g_printerr("Bookloupe: %s\n",err->message);
   530 	g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
   531 	exit(1);
   532     }
   533     if (typo_compat)
   534 	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
   535     if (paranoid_compat)
   536     {
   537 	pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
   538 	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
   539     }
   540     /*
   541      * Web uploads - for the moment, this is really just a placeholder
   542      * until we decide what processing we really want to do on web uploads
   543      */
   544     if (pswit[WEB_SWITCH])
   545     {
   546 	/* specific override for web uploads */
   547 	pswit[ECHO_SWITCH]=TRUE;
   548 	pswit[SQUOTE_SWITCH]=FALSE;
   549 	pswit[TYPO_SWITCH]=TRUE;
   550 	pswit[QPARA_SWITCH]=FALSE;
   551 	pswit[PARANOID_SWITCH]=TRUE;
   552 	pswit[LINE_END_SWITCH]=FALSE;
   553 	pswit[OVERVIEW_SWITCH]=FALSE;
   554 	pswit[STDOUT_SWITCH]=FALSE;
   555 	pswit[HEADER_SWITCH]=TRUE;
   556 	pswit[VERBOSE_SWITCH]=FALSE;
   557 	pswit[MARKUP_SWITCH]=FALSE;
   558 	pswit[USERTYPO_SWITCH]=FALSE;
   559 	pswit[DP_SWITCH]=FALSE;
   560     }
   561     if (opt_charset && !set_charset(opt_charset,&err))
   562     {
   563 	g_printerr("%s\n",err->message);
   564 	exit(1);
   565     }
   566     if (pswit[DUMP_CONFIG_SWITCH])
   567     {
   568 	dump_config();
   569 	exit(0);
   570     }
   571     g_free(opt_charset);
   572     opt_charset=NULL;
   573     if (pswit[OVERVIEW_SWITCH])
   574 	/* just print summary; don't echo */
   575 	pswit[ECHO_SWITCH]=FALSE;
   576     if (*argc<2)
   577     {
   578 	proghelp(context);
   579 	exit(1);
   580     }
   581     g_option_context_free(context);
   582 }
   583 
   584 /*
   585  * read_user_scannos:
   586  *
   587  * Read in the user-defined stealth scanno list.
   588  */
   589 void read_user_scannos(void)
   590 {
   591     GError *err=NULL;
   592     gchar *usertypo_file;
   593     gboolean okay;
   594     int i;
   595     gsize len,nb;
   596     gchar *contents,*utf8,**lines;
   597     usertypo_file=g_strdup("bookloupe.typ");
   598     okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   599     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   600     {
   601 	g_clear_error(&err);
   602 	g_free(usertypo_file);
   603 	usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
   604 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   605     }
   606     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   607     {
   608 	g_clear_error(&err);
   609 	g_free(usertypo_file);
   610 	usertypo_file=g_strdup("gutcheck.typ");
   611 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   612     }
   613     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   614     {
   615 	g_clear_error(&err);
   616 	g_free(usertypo_file);
   617 	usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
   618 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   619     }
   620     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   621     {
   622 	g_free(usertypo_file);
   623 	g_print("   --> I couldn't find bookloupe.typ "
   624 	  "-- proceeding without user typos.\n");
   625 	return;
   626     }
   627     else if (!okay)
   628     {
   629 	fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
   630 	g_free(usertypo_file);
   631 	g_clear_error(&err);
   632 	exit(1);
   633     }
   634     if (g_utf8_validate(contents,len,NULL))
   635     {
   636 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
   637 	if (!charset)
   638 	    (void)set_charset("UNICODE",NULL);
   639     }
   640     else
   641 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
   642     g_free(contents);
   643     lines=g_strsplit_set(utf8,"\r\n",0);
   644     g_free(utf8);
   645     usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
   646     for (i=0;lines[i];i++)
   647 	if (*(unsigned char *)lines[i]>'!')
   648 	    g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
   649 	else
   650 	    g_free(lines[i]);
   651     g_free(lines);
   652 }
   653 
   654 /*
   655  * read_etext:
   656  *
   657  * Read an etext returning a newly allocated string containing the file
   658  * contents or NULL on error.
   659  */
   660 gchar *read_etext(const char *filename,GError **err)
   661 {
   662     GError *tmp_err=NULL;
   663     gchar *contents,*utf8;
   664     gsize len,bytes_read,bytes_written;
   665     int i,line,col;
   666     if (!g_file_get_contents(filename,&contents,&len,err))
   667 	return NULL;
   668     if (g_utf8_validate(contents,len,NULL))
   669     {
   670 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
   671 	g_set_print_handler(print_as_utf_8);
   672 #ifdef __WIN32__
   673 	SetConsoleOutputCP(CP_UTF8);
   674 #endif
   675     }
   676     else
   677     {
   678 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,
   679 	  &bytes_written,&tmp_err);
   680 	if (g_error_matches(tmp_err,G_CONVERT_ERROR,
   681 	  G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
   682 	{
   683 	    line=col=1;
   684 	    for(i=0;i<bytes_read;i++)
   685 		if (contents[i]=='\n')
   686 		{
   687 		    line++;
   688 		    col=1;
   689 		}
   690 		else if (contents[i]!='\r')
   691 		    col++;
   692 	    g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
   693 	      "Input conversion failed. Byte %d at line %d, column %d is not a "
   694 	      "valid Windows-1252 character",
   695 	      ((unsigned char *)contents)[bytes_read],line,col);
   696 	}
   697 	else if (tmp_err)
   698 	    g_propagate_error(err,tmp_err);
   699 	g_set_print_handler(print_as_windows_1252);
   700 #ifdef __WIN32__
   701 	SetConsoleOutputCP(1252);
   702 #endif
   703     }
   704     g_free(contents);
   705     return utf8;
   706 }
   707 
/*
 * cleanup_on_exit:
 *
 * Exit handler which main() registers with atexit() on Windows builds.
 * It restores the console output code page saved in saved_cp; on other
 * builds the body is empty.
 */
void cleanup_on_exit(void)
{
#ifdef __WIN32__
    SetConsoleOutputCP(saved_cp);
#endif
}
   714 
/*
 * main:
 *
 * Program entry point. Sets the built-in switch defaults, applies the
 * configuration file and then the command line on top of them, reads
 * the user typo list if requested and checks the file named by the
 * first remaining argument. When the overview switch is set a summary
 * of the query counters is printed afterwards. Always returns 0; fatal
 * errors exit from within the functions called.
 */
int main(int argc,char **argv)
{
#ifdef __WIN32__
    /* Restore the console code page however the program ends. */
    atexit(cleanup_on_exit);
    saved_cp=GetConsoleOutputCP();
#endif
    running_from=g_path_get_dirname(argv[0]);
    /*
     * Defaults must be in place before the configuration file and the
     * command line are processed, since both only override them.
     */
    /* Paranoid checking is turned OFF, not on, by its switch */
    pswit[PARANOID_SWITCH]=TRUE;
    /* if running in paranoid mode, typo checks default to enabled */
    pswit[TYPO_SWITCH]=TRUE;
    /* Line-end checking is turned OFF, not on, by its switch */
    pswit[LINE_END_SWITCH]=TRUE;
    /* Echoing is turned OFF, not on, by its switch */
    pswit[ECHO_SWITCH]=TRUE;
    parse_config_file();
    parse_options(&argc,&argv);
    if (pswit[USERTYPO_SWITCH])
	read_user_scannos();
    fprintf(stderr,"bookloupe: Check and report on an e-text\n");
    /* parse_options() has already checked that a file name is present. */
    procfile(argv[1]);
    if (pswit[OVERVIEW_SWITCH])
    {
	g_print("    Checked %ld lines of %ld (head+foot = %ld)\n\n",
	  checked_linecnt,linecnt,linecnt-checked_linecnt);
	g_print("    --------------- Queries found --------------\n");
	/* Only categories with at least one query are listed. */
	if (cnt_long)
	    g_print("    Long lines:		    %14ld\n",cnt_long);
	if (cnt_short)
	    g_print("    Short lines:		   %14ld\n",cnt_short);
	if (cnt_lineend)
	    g_print("    Line-end problems:	     %14ld\n",cnt_lineend);
	if (cnt_word)
	    g_print("    Common typos:		  %14ld\n",cnt_word);
	if (cnt_quote)
	    g_print("    Unmatched quotes:	      %14ld\n",cnt_quote);
	if (cnt_brack)
	    g_print("    Unmatched brackets:	    %14ld\n",cnt_brack);
	if (cnt_bin)
	    g_print("    Non-ASCII characters:	  %14ld\n",cnt_bin);
	if (cnt_odd)
	    g_print("    Proofing characters:	   %14ld\n",cnt_odd);
	if (cnt_punct)
	    g_print("    Punctuation & spacing queries: %14ld\n",cnt_punct);
	if (cnt_dash)
	    g_print("    Non-standard dashes:	   %14ld\n",cnt_dash);
	if (cnt_html)
	    g_print("    Possible HTML tags:	    %14ld\n",cnt_html);
	g_print("\n");
	g_print("    TOTAL QUERIES		  %14ld\n",
	  cnt_quote+cnt_brack+cnt_bin+cnt_odd+cnt_long+cnt_short+cnt_punct+
	  cnt_dash+cnt_word+cnt_html+cnt_lineend);
    }
    /* Release everything allocated during start-up. */
    g_free(running_from);
    if (usertypo)
	g_tree_unref(usertypo);
    /* Selecting "auto" frees the charset name and closes the validator. */
    set_charset(NULL,NULL);
    if (config)
	g_key_file_free(config);
    return 0;
}
   776 
   777 void count_dashes(const char *line,const char *dash,
   778   struct dash_results *results)
   779 {
   780     int i;
   781     gchar **tokens;
   782     gunichar pc,nc;
   783     gboolean spaced=FALSE,unspaced=FALSE,spaced2=FALSE;
   784     if (!*line)
   785 	return;
   786     tokens=g_strsplit(line,dash,0);
   787     if (tokens[1])
   788 	results->base++;
   789     for(i=1;tokens[i];i++)
   790     {
   791 	pc=g_utf8_get_char(g_utf8_prev_char(tokens[i-1]+strlen(tokens[i-1])));
   792 	nc=g_utf8_get_char(tokens[i]);
   793 	if (g_unichar_isspace(pc) || g_unichar_isspace(nc))
   794 	    spaced=TRUE;
   795 	if (g_unichar_isspace(pc) && g_unichar_isspace(nc))
   796 	    spaced2=TRUE;
   797 	else if (!g_unichar_isspace(pc) && !g_unichar_isspace(nc))
   798 	    unspaced=TRUE;
   799     }
   800     if (spaced)
   801 	results->space++;
   802     if (spaced2)
   803 	/* count of lines with em-dashes with spaces both sides */
   804 	results->non_PG_space++;
   805     if (unspaced)
   806 	/* count of lines with PG-type em-dashes with no spaces */
   807 	results->PG_space++;
   808     g_strfreev(tokens);
   809 }
   810 
   811 /*
   812  * first_pass:
   813  *
   814  * Run a first pass - verify that it's a valid PG
   815  * file, decide whether to report some things that
   816  * occur many times in the text like long or short
   817  * lines, non-standard dashes, etc.
   818  */
   819 struct first_pass_results *first_pass(const char *etext)
   820 {
   821     gunichar laststart=CHAR_SPACE;
   822     const char *s;
   823     gchar *lc_line;
   824     int i,j,lbytes,llen;
   825     gchar **lines;
   826     unsigned int lastlen=0,lastblen=0;
   827     long spline=0,nspline=0;
   828     static struct first_pass_results results={0};
   829     struct dash_results tmp_dash_results;
   830     gchar *inword;
   831     QuoteClass qc;
   832     lines=g_strsplit(etext,"\n",0);
   833     for (j=0;lines[j];j++)
   834     {
   835 	lbytes=strlen(lines[j]);
   836 	while (lbytes>0 && lines[j][lbytes-1]=='\r')
   837 	    lines[j][--lbytes]='\0';
   838 	llen=g_utf8_strlen(lines[j],lbytes);
   839 	linecnt++;
   840 	if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
   841 	  (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
   842 	{
   843 	    if (spline)
   844 		g_print("   --> Duplicate header?\n");
   845 	    spline=linecnt+1;   /* first line of non-header text, that is */
   846 	}
   847 	if (!strncmp(lines[j],"*** START",9) &&
   848 	  strstr(lines[j],"PROJECT GUTENBERG"))
   849 	{
   850 	    if (nspline)
   851 		g_print("   --> Duplicate header?\n");
   852 	    nspline=linecnt+1;   /* first line of non-header text, that is */
   853 	}
   854 	if (spline || nspline)
   855 	{
   856 	    lc_line=g_utf8_strdown(lines[j],lbytes);
   857 	    if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
   858 	    {
   859 		if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
   860 		{
   861 		    if (results.footerline)
   862 		    {
   863 			/* it's an old-form header - we can detect duplicates */
   864 			if (!nspline)
   865 			    g_print("   --> Duplicate footer?\n");
   866 		    }
   867 		    else
   868 			results.footerline=linecnt;
   869 		}
   870 	    }
   871 	    g_free(lc_line);
   872 	}
   873 	if (spline)
   874 	    results.firstline=spline;
   875 	if (nspline)
   876 	    results.firstline=nspline;  /* override with new */
   877 	if (results.footerline)
   878 	    continue;    /* don't count the boilerplate in the footer */
   879 	results.totlen+=llen;
   880 	for (s=lines[j];*s;s=g_utf8_next_char(s))
   881 	{
   882 	    if (g_utf8_get_char(s)>127)
   883 		results.binlen++;
   884 	    if (g_unichar_isalpha(g_utf8_get_char(s)))
   885 		results.alphalen++;
   886 	    if (s>lines[j])
   887 	    {
   888 		if (CHAR_IS_DQUOTE(g_utf8_get_char(s)))
   889 		    qc=QUOTE_CLASS(g_utf8_get_char(s));
   890 		else
   891 		    qc=INVALID_QUOTE;
   892 		if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) &&
   893 		  g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
   894 		    results.endquote_count++;
   895 	    }
   896 	}
   897 	if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
   898 	  lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
   899 	    results.shortline++;
   900 	if (lbytes>0 &&
   901 	  g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
   902 	    cnt_spacend++;
   903 	if (strstr(lines[j],".,"))
   904 	    results.dotcomma++;
   905 	/* only count ast lines for ignoring purposes where there is */
   906 	/* locase text on the line */
   907 	if (strchr(lines[j],'*'))
   908 	{
   909 	    for (s=lines[j];*s;s=g_utf8_next_char(s))
   910 		if (g_unichar_islower(g_utf8_get_char(s)))
   911 		    break;
   912 	    if (*s)
   913 		results.astline++;
   914 	}
   915 	if (strchr(lines[j],'/'))
   916 	    results.fslashline++;
   917 	if (lbytes>0)
   918 	{
   919 	    for (s=g_utf8_prev_char(lines[j]+lbytes);
   920 	      s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;
   921 	      s=g_utf8_prev_char(s))
   922 		;
   923 	    if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
   924 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')
   925 		results.hyphens++;
   926 	}
   927 	if (llen>LONGEST_PG_LINE)
   928 	    results.longline++;
   929 	if (llen>WAY_TOO_LONG)
   930 	    results.verylongline++;
   931 	if (strchr(lines[j],'<') && strchr(lines[j],'>'))
   932 	{
   933 	    i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
   934 	    if (i>0)
   935 		results.htmcount++;
   936 	    if (strstr(lines[j],"<i>"))
   937 		results.htmcount+=4; /* bonus marks! */
   938 	}
   939 	/* Check for spaced em-dashes */
   940 	memset(&tmp_dash_results,0,sizeof(tmp_dash_results));
   941 	count_dashes(lines[j],"--",&tmp_dash_results);
   942 	count_dashes(lines[j],"—",&tmp_dash_results);
   943 	if (tmp_dash_results.base)
   944 	    results.emdash.base++;
   945 	if (tmp_dash_results.non_PG_space)
   946 	    results.emdash.non_PG_space++;
   947 	if (tmp_dash_results.PG_space)
   948 	    results.emdash.PG_space++;
   949 	for (s=lines[j];*s;)
   950 	{
   951 	    inword=getaword(&s);
   952 	    if (!strcmp(inword,"hij") || !strcmp(inword,"niet")) 
   953 		results.Dutchcount++;
   954 	    if (!strcmp(inword,"dans") || !strcmp(inword,"avec")) 
   955 		results.Frenchcount++;
   956 	    if (!strcmp(inword,"0") || !strcmp(inword,"1")) 
   957 		results.standalone_digit++;
   958 	    g_free(inword);
   959 	}
   960 	/* Check for spaced dashes */
   961 	if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
   962 	    results.spacedash++;
   963 	lastblen=lastlen;
   964 	lastlen=llen;
   965 	laststart=lines[j][0];
   966     }
   967     g_strfreev(lines);
   968     return &results;
   969 }
   970 
   971 /*
   972  * report_first_pass:
   973  *
   974  * Make some snap decisions based on the first pass results.
   975  */
   976 struct warnings *report_first_pass(struct first_pass_results *results)
   977 {
   978     static struct warnings warnings={0};
   979     if (cnt_spacend>0)
   980 	g_print("   --> %ld lines in this file have white space at end\n",
   981 	  cnt_spacend);
   982     warnings.dotcomma=1;
   983     if (results->dotcomma>5)
   984     {
   985 	warnings.dotcomma=0;
   986 	g_print("   --> %ld lines in this file contain '.,'. "
   987 	  "Not reporting them.\n",results->dotcomma);
   988     }
   989     /*
   990      * If more than 50 lines, or one-tenth, are short,
   991      * don't bother reporting them.
   992      */
   993     warnings.shortline=1;
   994     if (results->shortline>50 || results->shortline*10>linecnt)
   995     {
   996 	warnings.shortline=0;
   997 	g_print("   --> %ld lines in this file are short. "
   998 	  "Not reporting short lines.\n",results->shortline);
   999     }
  1000     /*
  1001      * If more than 50 lines, or one-tenth, are long,
  1002      * don't bother reporting them.
  1003      */
  1004     warnings.longline=1;
  1005     if (results->longline>50 || results->longline*10>linecnt)
  1006     {
  1007 	warnings.longline=0;
  1008 	g_print("   --> %ld lines in this file are long. "
  1009 	  "Not reporting long lines.\n",results->longline);
  1010     }
  1011     /* If more than 10 lines contain asterisks, don't bother reporting them. */
  1012     warnings.ast=1;
  1013     if (results->astline>10)
  1014     {
  1015 	warnings.ast=0;
  1016 	g_print("   --> %ld lines in this file contain asterisks. "
  1017 	  "Not reporting them.\n",results->astline);
  1018     }
  1019     /*
  1020      * If more than 10 lines contain forward slashes,
  1021      * don't bother reporting them.
  1022      */
  1023     warnings.fslash=1;
  1024     if (results->fslashline>10)
  1025     {
  1026 	warnings.fslash=0;
  1027 	g_print("   --> %ld lines in this file contain forward slashes. "
  1028 	  "Not reporting them.\n",results->fslashline);
  1029     }
  1030     /*
  1031      * If more than 20 lines contain unpunctuated endquotes,
  1032      * don't bother reporting them.
  1033      */
  1034     warnings.endquote=1;
  1035     if (results->endquote_count>20)
  1036     {
  1037 	warnings.endquote=0;
  1038 	g_print("   --> %ld lines in this file contain unpunctuated endquotes. "
  1039 	  "Not reporting them.\n",results->endquote_count);
  1040     }
  1041     /*
  1042      * If more than 15 lines contain standalone digits,
  1043      * don't bother reporting them.
  1044      */
  1045     warnings.digit=1;
  1046     if (results->standalone_digit>10)
  1047     {
  1048 	warnings.digit=0;
  1049 	g_print("   --> %ld lines in this file contain standalone 0s and 1s. "
  1050 	  "Not reporting them.\n",results->standalone_digit);
  1051     }
  1052     /*
  1053      * If more than 20 lines contain hyphens at end,
  1054      * don't bother reporting them.
  1055      */
  1056     warnings.hyphen=1;
  1057     if (results->hyphens>20)
  1058     {
  1059 	warnings.hyphen=0;
  1060 	g_print("   --> %ld lines in this file have hyphens at end. "
  1061 	  "Not reporting them.\n",results->hyphens);
  1062     }
  1063     if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
  1064     {
  1065 	g_print("   --> Looks like this is HTML. Switching HTML mode ON.\n");
  1066 	pswit[MARKUP_SWITCH]=1;
  1067     }
  1068     if (results->verylongline>0)
  1069 	g_print("   --> %ld lines in this file are VERY long!\n",
  1070 	  results->verylongline);
  1071     /*
  1072      * If there are more non-PG spaced dashes than PG em-dashes,
  1073      * assume it's deliberate.
  1074      * Current PG guidelines say don't use them, but older texts do,
  1075      * and some people insist on them whatever the guidelines say.
  1076      */
  1077     warnings.dash=1;
  1078     if (results->spacedash+results->emdash.non_PG_space>
  1079       results->emdash.PG_space)
  1080     {
  1081 	warnings.dash=0;
  1082 	g_print("   --> There are %ld spaced dashes and em-dashes. "
  1083 	  "Not reporting them.\n",
  1084 	  results->spacedash+results->emdash.non_PG_space);
  1085     }
  1086     if (charset)
  1087 	warnings.bin=0;
  1088     else
  1089     {
  1090 	/* Charset ISO_8859-1/ASCII checks for compatibility with gutcheck */
  1091 	warnings.bin=1;
  1092 	/* If more than a quarter of characters are hi-bit, bug out. */
  1093 	if (results->binlen*4>results->totlen)
  1094 	{
  1095 	    g_print("   --> This file does not appear to be ASCII. "
  1096 	      "Terminating. Best of luck with it!\n");
  1097 	    exit(1);
  1098 	}
  1099 	if (results->alphalen*4<results->totlen)
  1100 	{
  1101 	    g_print("   --> This file does not appear to be text. "
  1102 	      "Terminating. Best of luck with it!\n");
  1103 	    exit(1);
  1104 	}
  1105 	if (results->binlen*100>results->totlen || results->binlen>100)
  1106 	{
  1107 	    g_print("   --> There are a lot of foreign letters here. "
  1108 	      "Not reporting them.\n");
  1109 	    if (!pswit[VERBOSE_SWITCH])
  1110 		warnings.bin=0;
  1111 	}
  1112     }
  1113     warnings.isDutch=FALSE;
  1114     if (results->Dutchcount>50)
  1115     {
  1116 	warnings.isDutch=TRUE;
  1117 	g_print("   --> This looks like Dutch - "
  1118 	  "switching off dashes and warnings for 's Middags case.\n");
  1119     }
  1120     warnings.isFrench=FALSE;
  1121     if (results->Frenchcount>50)
  1122     {
  1123 	warnings.isFrench=TRUE;
  1124 	g_print("   --> This looks like French - "
  1125 	  "switching off some doublepunct.\n");
  1126     }
  1127     if (results->firstline && results->footerline)
  1128 	g_print("    The PG header and footer appear to be already on.\n");
  1129     else
  1130     {
  1131 	if (results->firstline)
  1132 	    g_print("    The PG header is on - no footer.\n");
  1133 	if (results->footerline)
  1134 	    g_print("    The PG footer is on - no header.\n");
  1135     }
  1136     g_print("\n");
  1137     if (pswit[VERBOSE_SWITCH])
  1138     {
  1139 	warnings.shortline=1;
  1140 	warnings.dotcomma=1;
  1141 	warnings.longline=1;
  1142 	warnings.dash=1;
  1143 	warnings.digit=1;
  1144 	warnings.ast=1;
  1145 	warnings.fslash=1;
  1146 	warnings.hyphen=1;
  1147 	warnings.endquote=1;
  1148 	g_print("   *** Verbose output is ON -- you asked for it! ***\n");
  1149     }
  1150     if (warnings.isDutch)
  1151 	warnings.dash=0;
  1152     if (results->footerline>0 && results->firstline>0 &&
  1153       results->footerline>results->firstline &&
  1154       results->footerline-results->firstline<100)
  1155     {
  1156 	g_print("   --> I don't really know where this text starts. \n");
  1157 	g_print("       There are no reference points.\n");
  1158 	g_print("       I'm going to have to report the header and footer "
  1159 	  "as well.\n");
  1160 	results->firstline=0;
  1161     }
  1162     return &warnings;
  1163 }
  1164 
  1165 /*
  1166  * analyse_quotes:
  1167  *
  1168  * Look along the line, accumulate the count of quotes, and see
  1169  * if this is an empty line - i.e. a line with nothing on it
  1170  * but spaces.
  1171  * If line has just spaces, period, * and/or - on it, don't
  1172  * count it, since empty lines with asterisks or dashes to
  1173  * separate sections are common.
  1174  *
  1175  * Returns: TRUE if the line is empty.
  1176  */
  1177 gboolean analyse_quotes(const char *aline,struct counters *counters)
  1178 {
  1179     int guessquote=0;
  1180     /* assume the line is empty until proven otherwise */
  1181     gboolean isemptyline=TRUE;
  1182     const char *s=aline,*sprev,*snext;
  1183     gunichar c;
  1184     sprev=NULL;
  1185     GError *tmp_err=NULL;
  1186     while (*s)
  1187     {
  1188 	snext=g_utf8_next_char(s);
  1189 	c=g_utf8_get_char(s);
  1190 	if (CHAR_IS_DQUOTE(c))
  1191 	    (void)count_quote(counters,c,QUOTE_CLASS(c),&tmp_err);
  1192 	else if (CHAR_IS_SQUOTE(c) && pswit[SQUOTE_SWITCH])
  1193 	{
  1194 	    if (s==aline)
  1195 	    {
  1196 		/*
  1197 		 * At start of line, it can only be a quotation mark.
  1198 		 * Hardcode a very common exception!
  1199 		 */
  1200 		if (!g_str_has_prefix(snext,"tis") &&
  1201 		  !g_str_has_prefix(snext,"Tis"))
  1202 		    (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
  1203 	    }
  1204 	    else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
  1205 	      g_unichar_isalpha(g_utf8_get_char(snext)))
  1206 		/* Do nothing! it's definitely an apostrophe, not a quote */
  1207 		;
  1208 	    /* it's outside a word - let's check it out */
  1209 	    else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||
  1210 	      g_unichar_isalpha(g_utf8_get_char(snext)))
  1211 	    {
  1212 		/* certainly looks like a quotation mark */
  1213 		if (!g_str_has_prefix(snext,"tis") &&
  1214 		  !g_str_has_prefix(snext,"Tis"))
  1215 		    /* hardcode a very common exception! */
  1216 		{
  1217 		    if (strchr(".?!,;:",g_utf8_get_char(sprev)))
  1218 			(void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
  1219 		    else
  1220 			(void)count_quote(counters,c,OPENING_QUOTE,&tmp_err);
  1221 		}
  1222 	    }
  1223 	    else
  1224 	    {
  1225 		/* now - is it a quotation mark? */
  1226 		guessquote=0;   /* accumulate clues */
  1227 		if (g_unichar_isalpha(g_utf8_get_char(sprev)))
  1228 		{
  1229 		    /* it follows a letter - could be either */
  1230 		    guessquote++;
  1231 		    if (g_utf8_get_char(sprev)=='s')
  1232 		    {
  1233 			/* looks like a plural apostrophe */
  1234 			guessquote-=3;
  1235 			if (g_utf8_get_char(snext)==CHAR_SPACE)
  1236 			    /* bonus marks! */
  1237 			    guessquote-=2;
  1238 		    }
  1239 		    if (innermost_quote_matches(counters,c))
  1240 			/*
  1241 			 * Give it the benefit of some doubt,
  1242 			 * if a squote is already open.
  1243 			 */
  1244 			guessquote++;
  1245 		    else
  1246 			guessquote--;
  1247 		    if (guessquote>=0)
  1248 			(void)count_quote(counters,c,CLOSING_QUOTE,&tmp_err);
  1249 		}
  1250 		else
  1251 		    /* no adjacent letter - it must be a quote of some kind */
  1252 		    (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
  1253 	    }
  1254 	}
  1255 	if (tmp_err)
  1256 	{
  1257 	    if (pswit[ECHO_SWITCH])
  1258 		g_print("\n%s\n",aline);
  1259 	    if (!pswit[OVERVIEW_SWITCH])
  1260 		g_print("    Line %ld column %ld - %s\n",
  1261 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1,tmp_err->message);
  1262 	    g_clear_error(&tmp_err);
  1263 	}
  1264 	if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
  1265 	  c!='\r' && c!='\n')
  1266 	    isemptyline=FALSE;  /* ignore lines like  *  *  *  as spacers */
  1267 	if (c==CHAR_UNDERSCORE)
  1268 	    counters->c_unders++;
  1269 	if (c==CHAR_OPEN_SBRACK)
  1270 	{
  1271 	    if (!matching_difference(counters,COUNTER_ILLUSTRATION) &&
  1272 	      !matching_difference(counters,c) && s==aline &&
  1273 	      g_str_has_prefix(s,"[Illustration:"))
  1274 		increment_matching(counters,COUNTER_ILLUSTRATION,TRUE);
  1275 	    else
  1276 		increment_matching(counters,c,TRUE);
  1277 	}
  1278 	else if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK)
  1279 	    increment_matching(counters,c,TRUE);
  1280 	if (c==CHAR_CLOSE_SBRACK)
  1281 	{
  1282 	    if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) &&
  1283 	      !matching_difference(counters,c) && !*snext)
  1284 		increment_matching(counters,COUNTER_ILLUSTRATION,FALSE);
  1285 	    else
  1286 		increment_matching(counters,c,FALSE);
  1287 	}
  1288 	else if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK)
  1289 	    increment_matching(counters,c,FALSE);
  1290 	sprev=s;
  1291 	s=snext;
  1292     }
  1293     return isemptyline;
  1294 }
  1295 
  1296 /*
  1297  * check_for_control_characters:
  1298  *
  1299  * Check for invalid or questionable characters in the line
  1300  * Anything above 127 is invalid for plain ASCII, and
  1301  * non-printable control characters should also be flagged.
  1302  * Tabs should generally not be there.
  1303  */
  1304 void check_for_control_characters(const char *aline)
  1305 {
  1306     gunichar c;
  1307     const char *s;
  1308     for (s=aline;*s;s=g_utf8_next_char(s))
  1309     {
  1310 	c=g_utf8_get_char(s);
  1311 	if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
  1312 	{
  1313 	    if (pswit[ECHO_SWITCH])
  1314 		g_print("\n%s\n",aline);
  1315 	    if (!pswit[OVERVIEW_SWITCH])
  1316 		g_print("    Line %ld column %ld - Control character %u\n",
  1317 		  linecnt,g_utf8_pointer_to_offset(s,aline)+1,c);
  1318 	    else
  1319 		cnt_bin++;
  1320 	}
  1321     }
  1322 }
  1323 
  1324 /*
  1325  * check_for_odd_characters:
  1326  *
  1327  * Check for binary and other odd characters.
  1328  */
  1329 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
  1330   gboolean isemptyline)
  1331 {
  1332     /* Don't repeat multiple warnings on one line. */
  1333     gboolean eInvalidChar=FALSE,eTab=FALSE,eTilde=FALSE;
  1334     gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
  1335     const char *s;
  1336     gunichar c;
  1337     gsize nb;
  1338     gchar *t;
  1339     for (s=aline;*s;s=g_utf8_next_char(s))
  1340     {
  1341 	c=g_utf8_get_char(s);
  1342 	if (warnings->bin && !eInvalidChar &&
  1343 	  (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
  1344 	{
  1345 	    if (pswit[ECHO_SWITCH])
  1346 		g_print("\n%s\n",aline);
  1347 	    if (!pswit[OVERVIEW_SWITCH])
  1348 		if (c>127 && c<160 || c>255)
  1349 		    g_print("    Line %ld column %ld - "
  1350 		      "Non-ISO-8859 character %u\n",
  1351 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1352 		else
  1353 		    g_print("    Line %ld column %ld - "
  1354 		      "Non-ASCII character %u\n",
  1355 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1356 	    else
  1357 		cnt_bin++;
  1358 	    eInvalidChar=TRUE;
  1359 	}
  1360 	if (!eInvalidChar && charset)
  1361 	{
  1362 	    if (charset_validator==(GIConv)-1)
  1363 	    {
  1364 		if (!g_unichar_isdefined(c))
  1365 		{
  1366 		    if (pswit[ECHO_SWITCH])
  1367 			g_print("\n%s\n",aline);
  1368 		    if (!pswit[OVERVIEW_SWITCH])
  1369 			g_print("    Line %ld column %ld - Unassigned UNICODE "
  1370 			  "code point U+%04" G_GINT32_MODIFIER "X\n",
  1371 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1372 		    else
  1373 			cnt_bin++;
  1374 		    eInvalidChar=TRUE;
  1375 		}
  1376 		else if (c>=0xE000 && c<=0xF8FF || c>=0xF0000 && c<=0xFFFFD ||
  1377 		  c>=100000 && c<=0x10FFFD)
  1378 		{
  1379 		    if (pswit[ECHO_SWITCH])
  1380 			g_print("\n%s\n",aline);
  1381 		    if (!pswit[OVERVIEW_SWITCH])
  1382 			g_print("    Line %ld column %ld - Private Use "
  1383 			  "character U+%04" G_GINT32_MODIFIER "X\n",
  1384 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1385 		    else
  1386 			cnt_bin++;
  1387 		    eInvalidChar=TRUE;
  1388 		}
  1389 	    }
  1390 	    else
  1391 	    {
  1392 		t=g_convert_with_iconv(s,g_utf8_next_char(s)-s,
  1393 		  charset_validator,NULL,&nb,NULL);
  1394 		if (t)
  1395 		    g_free(t);
  1396 		else
  1397 		{
  1398 		    if (pswit[ECHO_SWITCH])
  1399 			g_print("\n%s\n",aline);
  1400 		    if (!pswit[OVERVIEW_SWITCH])
  1401 			g_print("    Line %ld column %ld - Non-%s "
  1402 			  "character %u\n",linecnt,
  1403 			  g_utf8_pointer_to_offset(aline,s)+1,charset,c);
  1404 		    else
  1405 			cnt_bin++;
  1406 		    eInvalidChar=TRUE;
  1407 		}
  1408 	    }
  1409 	}
  1410 	if (!eTab && c==CHAR_TAB)
  1411 	{
  1412 	    if (pswit[ECHO_SWITCH])
  1413 		g_print("\n%s\n",aline);
  1414 	    if (!pswit[OVERVIEW_SWITCH])
  1415 		g_print("    Line %ld column %ld - Tab character?\n",
  1416 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1417 	    else
  1418 		cnt_odd++;
  1419 	    eTab=TRUE;
  1420 	}
  1421 	if (!eTilde && c==CHAR_TILDE)
  1422 	{
  1423 	    /*
  1424 	     * Often used by OCR software to indicate an
  1425 	     * unrecognizable character.
  1426 	     */
  1427 	    if (pswit[ECHO_SWITCH])
  1428 		g_print("\n%s\n",aline);
  1429 	    if (!pswit[OVERVIEW_SWITCH])
  1430 		g_print("    Line %ld column %ld - Tilde character?\n",
  1431 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1432 	    else
  1433 		cnt_odd++;
  1434 	    eTilde=TRUE;
  1435 	}
  1436 	if (!eCarat && c==CHAR_CARAT)
  1437 	{  
  1438 	    if (pswit[ECHO_SWITCH])
  1439 		g_print("\n%s\n",aline);
  1440 	    if (!pswit[OVERVIEW_SWITCH])
  1441 		g_print("    Line %ld column %ld - Carat character?\n",
  1442 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1443 	    else
  1444 		cnt_odd++;
  1445 	    eCarat=TRUE;
  1446 	}
  1447 	if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
  1448 	{  
  1449 	    if (pswit[ECHO_SWITCH])
  1450 		g_print("\n%s\n",aline);
  1451 	    if (!pswit[OVERVIEW_SWITCH])
  1452 		g_print("    Line %ld column %ld - Forward slash?\n",
  1453 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1454 	    else
  1455 		cnt_odd++;
  1456 	    eFSlash=TRUE;
  1457 	}
  1458 	/*
  1459 	 * Report asterisks only in paranoid mode,
  1460 	 * since they're often deliberate.
  1461 	 */
  1462 	if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
  1463 	  c==CHAR_ASTERISK)
  1464 	{
  1465 	    if (pswit[ECHO_SWITCH])
  1466 		g_print("\n%s\n",aline);
  1467 	    if (!pswit[OVERVIEW_SWITCH])
  1468 		g_print("    Line %ld column %ld - Asterisk?\n",
  1469 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1470 	    else
  1471 		cnt_odd++;
  1472 	    eAst=TRUE;
  1473 	}
  1474     }
  1475 }
  1476 
  1477 /*
  1478  * check_for_long_line:
  1479  *
  1480  * Check for line too long.
  1481  */
  1482 void check_for_long_line(const char *aline)
  1483 {
  1484     if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
  1485     {
  1486 	if (pswit[ECHO_SWITCH])
  1487 	    g_print("\n%s\n",aline);
  1488 	if (!pswit[OVERVIEW_SWITCH])
  1489 	    g_print("    Line %ld column %ld - Long line %ld\n",
  1490 	      linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
  1491 	else
  1492 	    cnt_long++;
  1493     }
  1494 }
  1495 
  1496 /*
  1497  * check_for_short_line:
  1498  *
  1499  * Check for line too short.
  1500  *
  1501  * This one is a bit trickier to implement: we don't want to
  1502  * flag the last line of a paragraph for being short, so we
  1503  * have to wait until we know that our current line is a
  1504  * "normal" line, then report the _previous_ line if it was too
  1505  * short. We also don't want to report indented lines like
  1506  * chapter heads or formatted quotations. We therefore keep
  1507  * last->len as the length of the last line examined, and
  1508  * last->blen as the length of the last but one, and try to
  1509  * suppress unnecessary warnings by checking that both were of
  1510  * "normal" length. We keep the first character of the last
  1511  * line in last->start, and if it was a space, we assume that
  1512  * the formatting is deliberate. I can't figure out a way to
  1513  * distinguish something like a quoted verse left-aligned or
  1514  * the header or footer of a letter from a paragraph of short
  1515  * lines - maybe if I examined the whole paragraph, and if the
  1516  * para has less than, say, 8 lines and if all lines are short,
  1517  * then just assume it's OK? Need to look at some texts to see
  1518  * how often a formula like this would get the right result.
  1519  */
  1520 void check_for_short_line(const char *aline,const struct line_properties *last)
  1521 {
  1522     if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
  1523       last->len<SHORTEST_PG_LINE && last->blen>1 &&
  1524       last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
  1525     {
  1526 	if (pswit[ECHO_SWITCH])
  1527 	    g_print("\n%s\n",prevline);
  1528 	if (!pswit[OVERVIEW_SWITCH])
  1529 	    g_print("    Line %ld column %ld - Short line %ld?\n",
  1530 	      linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
  1531 	else
  1532 	    cnt_short++;
  1533     }
  1534 }
  1535 
  1536 /*
  1537  * check_for_starting_punctuation:
  1538  *
  1539  * Look for punctuation other than full ellipses at start of line.
  1540  */
  1541 void check_for_starting_punctuation(const char *aline)
  1542 {
  1543     if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
  1544       !g_str_has_prefix(aline,". . ."))
  1545     {
  1546 	if (pswit[ECHO_SWITCH])
  1547 	    g_print("\n%s\n",aline);
  1548 	if (!pswit[OVERVIEW_SWITCH])
  1549 	    g_print("    Line %ld column 1 - Begins with punctuation?\n",
  1550 	      linecnt);
  1551 	else
  1552 	    cnt_punct++;
  1553     }
  1554 }
  1555 
  1556 /*
  1557  * str_emdash:
  1558  *
  1559  * Find the first em-dash, return a pointer to it and set <next> to the
  1560  * character following the dash.
  1561  */
  1562 char *str_emdash(const char *s,const char **next)
  1563 {
  1564     const char *s1,*s2;
  1565     s1=strstr(s,"--");
  1566     s2=strstr(s,"—");
  1567     if (!s1)
  1568     {
  1569 	if (s2)
  1570 	    *next=g_utf8_next_char(s2);
  1571 	return (char *)s2;
  1572     }
  1573     else if (!s2)
  1574     {
  1575 	*next=g_utf8_next_char(g_utf8_next_char(s1));
  1576 	return (char *)s1;
  1577     }
  1578     else if (s1<s2)
  1579     {
  1580 	*next=g_utf8_next_char(g_utf8_next_char(s1));
  1581 	return (char *)s1;
  1582     }
  1583     else
  1584     {
  1585 	*next=g_utf8_next_char(s2);
  1586 	return (char *)s2;
  1587     }
  1588 }
  1589 
  1590 /*
  1591  * check_for_spaced_emdash:
  1592  *
  1593  * Check for spaced em-dashes.
  1594  *
  1595  * We must check _all_ occurrences of em-dashes on the line
  1596  * hence the loop - even if the first dash is OK
  1597  * there may be another that's wrong later on.
  1598  */
  1599 void check_for_spaced_emdash(const char *aline)
  1600 {
  1601     const char *s,*t,*next;
  1602     for (s=aline;t=str_emdash(s,&next);s=next)
  1603     {
  1604 	if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
  1605 	  g_utf8_get_char(next)==CHAR_SPACE)
  1606 	{
  1607 	    if (pswit[ECHO_SWITCH])
  1608 		g_print("\n%s\n",aline);
  1609 	    if (!pswit[OVERVIEW_SWITCH])
  1610 		g_print("    Line %ld column %ld - Spaced em-dash?\n",
  1611 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  1612 	    else
  1613 		cnt_dash++;
  1614 	}
  1615     }
  1616 }
  1617 
  1618 /*
  1619  * check_for_spaced_dash:
  1620  *
  1621  * Check for spaced dashes.
  1622  */
  1623 void check_for_spaced_dash(const char *aline)
  1624 {
  1625     const char *s;
  1626     if ((s=strstr(aline," -")))
  1627     {
  1628 	if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
  1629 	{
  1630 	    if (pswit[ECHO_SWITCH])
  1631 		g_print("\n%s\n",aline);
  1632 	    if (!pswit[OVERVIEW_SWITCH])
  1633 		g_print("    Line %ld column %ld - Spaced dash?\n",
  1634 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1635 	    else
  1636 		cnt_dash++;
  1637 	}
  1638     }
  1639     else if ((s=strstr(aline,"- ")))
  1640     {
  1641 	if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
  1642 	{
  1643 	    if (pswit[ECHO_SWITCH])
  1644 		g_print("\n%s\n",aline);
  1645 	    if (!pswit[OVERVIEW_SWITCH])
  1646 		g_print("    Line %ld column %ld - Spaced dash?\n",
  1647 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1648 	    else
  1649 		cnt_dash++;
  1650 	}
  1651     }
  1652 }
  1653 
  1654 /*
  1655  * check_for_unmarked_paragraphs:
  1656  *
  1657  * Check for unmarked paragraphs indicated by separate speakers.
  1658  *
  1659  * May well be false positive:
  1660  * "Bravo!" "Wonderful!" called the crowd.
  1661  * but useful all the same.
  1662  */
  1663 void check_for_unmarked_paragraphs(const char *aline)
  1664 {
  1665     const char *s;
  1666     s=strstr(aline,"\"  \"");
  1667     if (!s)
  1668 	s=strstr(aline,"\" \"");
  1669     if (s)
  1670     {
  1671 	if (pswit[ECHO_SWITCH])
  1672 	    g_print("\n%s\n",aline);
  1673 	if (!pswit[OVERVIEW_SWITCH])
  1674 	    g_print("    Line %ld column %ld - "
  1675 	      "Query missing paragraph break?\n",
  1676 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1677 	else
  1678 	    cnt_punct++;
  1679     }
  1680 }
  1681 
  1682 /*
  1683  * check_for_jeebies:
  1684  *
  1685  * Check for "to he" and other easy h/b errors.
  1686  *
  1687  * This is a very inadequate effort on the h/b problem,
  1688  * but the phrase "to he" is always an error, whereas "to
  1689  * be" is quite common.
  1690  * Similarly, '"Quiet!", be said.' is a non-be error
  1691  * "to he" is _not_ always an error!:
  1692  *       "Where they went to he couldn't say."
  1693  * Another false positive:
  1694  *       What would "Cinderella" be without the . . .
  1695  * and another: "If he wants to he can see for himself."
  1696  */
  1697 void check_for_jeebies(const char *aline)
  1698 {
  1699     const char *s;
  1700     s=strstr(aline," be could ");
  1701     if (!s)
  1702 	s=strstr(aline," be would ");
  1703     if (!s)
  1704 	s=strstr(aline," was be ");
  1705     if (!s)
  1706 	s=strstr(aline," be is ");
  1707     if (!s)
  1708 	s=strstr(aline," is be ");
  1709     if (!s)
  1710 	s=strstr(aline,"\", be ");
  1711     if (!s)
  1712 	s=strstr(aline,"\" be ");
  1713     if (!s)
  1714 	s=strstr(aline,"\" be ");
  1715     if (!s)
  1716 	s=strstr(aline," to he ");
  1717     if (s)
  1718     {
  1719 	if (pswit[ECHO_SWITCH])
  1720 	    g_print("\n%s\n",aline);
  1721 	if (!pswit[OVERVIEW_SWITCH])
  1722 	    g_print("    Line %ld column %ld - Query he/be error?\n",
  1723 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1724 	else
  1725 	    cnt_word++;
  1726     }
  1727     s=strstr(aline," the had ");
  1728     if (!s)
  1729 	s=strstr(aline," a had ");
  1730     if (!s)
  1731 	s=strstr(aline," they bad ");
  1732     if (!s)
  1733 	s=strstr(aline," she bad ");
  1734     if (!s)
  1735 	s=strstr(aline," he bad ");
  1736     if (!s)
  1737 	s=strstr(aline," you bad ");
  1738     if (!s)
  1739 	s=strstr(aline," i bad ");
  1740     if (s)
  1741     {
  1742 	if (pswit[ECHO_SWITCH])
  1743 	    g_print("\n%s\n",aline);
  1744 	if (!pswit[OVERVIEW_SWITCH])
  1745 	    g_print("    Line %ld column %ld - Query had/bad error?\n",
  1746 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1747 	else
  1748 	    cnt_word++;
  1749     }
  1750     s=strstr(aline,"; hut ");
  1751     if (!s)
  1752 	s=strstr(aline,", hut ");
  1753     if (s)
  1754     {
  1755 	if (pswit[ECHO_SWITCH])
  1756 	    g_print("\n%s\n",aline);
  1757 	if (!pswit[OVERVIEW_SWITCH])
  1758 	    g_print("    Line %ld column %ld - Query hut/but error?\n",
  1759 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1760 	else
  1761 	    cnt_word++;
  1762     }
  1763 }
  1764 
  1765 /*
  1766  * check_for_mta_from:
  1767  *
  1768  * Special case - angled bracket in front of "From" placed there by an
  1769  * MTA when sending an e-mail.
  1770  */
  1771 void check_for_mta_from(const char *aline)
  1772 {
  1773     const char *s;
  1774     s=strstr(aline,">From");
  1775     if (s)
  1776     {
  1777 	if (pswit[ECHO_SWITCH])
  1778 	    g_print("\n%s\n",aline);
  1779 	if (!pswit[OVERVIEW_SWITCH])
  1780 	    g_print("    Line %ld column %ld - "
  1781 	      "Query angled bracket with From\n",
  1782 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1783 	else
  1784 	    cnt_punct++;
  1785     }
  1786 }
  1787 
  1788 /*
  1789  * check_for_orphan_character:
  1790  *
  1791  * Check for a single character line -
  1792  * often an overflow from bad wrapping.
  1793  */
  1794 void check_for_orphan_character(const char *aline)
  1795 {
  1796     gunichar c;
  1797     c=g_utf8_get_char(aline);
  1798     if (c && !*g_utf8_next_char(aline))
  1799     {
  1800 	if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
  1801 	    ; /* Nothing - ignore numerals alone on a line. */
  1802 	else
  1803 	{
  1804 	    if (pswit[ECHO_SWITCH])
  1805 		g_print("\n%s\n",aline);
  1806 	    if (!pswit[OVERVIEW_SWITCH])
  1807 		g_print("    Line %ld column 1 - Query single character line\n",
  1808 		  linecnt);
  1809 	    else
  1810 		cnt_punct++;
  1811 	}
  1812     }
  1813 }
  1814 
  1815 /*
  1816  * check_for_pling_scanno:
  1817  *
  1818  * Check for I" - often should be !
  1819  */
  1820 void check_for_pling_scanno(const char *aline)
  1821 {
  1822     const char *s;
  1823     s=strstr(aline," I\"");
  1824     if (s)
  1825     {
  1826 	if (pswit[ECHO_SWITCH])
  1827 	    g_print("\n%s\n",aline);
  1828 	if (!pswit[OVERVIEW_SWITCH])
  1829 	    g_print("    Line %ld column %ld - Query I=exclamation mark?\n",
  1830 	      linecnt,g_utf8_pointer_to_offset(aline,s));
  1831 	else
  1832 	    cnt_punct++;
  1833     }
  1834 }
  1835 
  1836 /*
  1837  * check_for_extra_period:
  1838  *
  1839  * Check for period without a capital letter. Cut-down from gutspell.
  1840  * Only works when it happens on a single line.
  1841  */
  1842 void check_for_extra_period(const char *aline,const struct warnings *warnings)
  1843 {
  1844     const char *s,*t,*s1,*sprev;
  1845     int i;
  1846     gsize len;
  1847     gboolean istypo;
  1848     gchar *testword;
  1849     gunichar c,nc,pc,*decomposition;
  1850     if (pswit[PARANOID_SWITCH])
  1851     {
  1852 	for (t=aline;t=strstr(t,". ");)
  1853 	{
  1854 	    if (t==aline)
  1855 	    {
  1856 		t=g_utf8_next_char(t);
  1857 		/* start of line punctuation is handled elsewhere */
  1858 		continue;
  1859 	    }
  1860 	    if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
  1861 	    {
  1862 		t=g_utf8_next_char(t);
  1863 		continue;
  1864 	    }
  1865 	    if (warnings->isDutch)
  1866 	    {
  1867 		/* For Frank & Jeroen -- 's Middags case */
  1868 		gunichar c2,c3,c4,c5;
  1869 		c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
  1870 		c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
  1871 		c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
  1872 		c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
  1873 		if (CHAR_IS_APOSTROPHE(c2) &&
  1874 		  g_unichar_islower(c3) && c4==CHAR_SPACE &&
  1875 		  g_unichar_isupper(c5))
  1876 		{
  1877 		    t=g_utf8_next_char(t);
  1878 		    continue;
  1879 		}
  1880 	    }
  1881 	    s1=g_utf8_next_char(g_utf8_next_char(t));
  1882 	    while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
  1883 	      !g_unichar_isdigit(g_utf8_get_char(s1)))
  1884 		s1=g_utf8_next_char(s1);
  1885 	    if (g_unichar_islower(g_utf8_get_char(s1)))
  1886 	    {
  1887 		/* we have something to investigate */
  1888 		istypo=TRUE;
  1889 		/* so let's go back and find out */
  1890 		nc=g_utf8_get_char(t);
  1891 		s1=g_utf8_prev_char(t);
  1892 		c=g_utf8_get_char(s1);
  1893 		sprev=g_utf8_prev_char(s1);
  1894 		pc=g_utf8_get_char(sprev);
  1895 		while (s1>=aline &&
  1896 		  (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||
  1897 		  g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&
  1898 		  g_unichar_isalpha(nc)))
  1899 		{
  1900 		    nc=c;
  1901 		    s1=sprev;
  1902 		    c=pc;
  1903 		    sprev=g_utf8_prev_char(s1);
  1904 		    pc=g_utf8_get_char(sprev);
  1905 		}
  1906 		s1=g_utf8_next_char(s1);
  1907 		s=strchr(s1,'.');
  1908 		if (s)
  1909 		    testword=g_strndup(s1,s-s1);
  1910 		else
  1911 		    testword=g_strdup(s1);
  1912 		for (i=0;*abbrev[i];i++)
  1913 		    if (!strcmp(testword,abbrev[i]))
  1914 			istypo=FALSE;
  1915 		if (g_unichar_isdigit(g_utf8_get_char(testword)))
  1916 		    istypo=FALSE;
  1917 		if (!*g_utf8_next_char(testword))
  1918 		    istypo=FALSE;
  1919 		if (isroman(testword))
  1920 		    istypo=FALSE;
  1921 		if (istypo)
  1922 		{
  1923 		    istypo=FALSE;
  1924 		    for (s=testword;*s;s=g_utf8_next_char(s))
  1925 		    {
  1926 			decomposition=g_unicode_canonical_decomposition(
  1927 			  g_utf8_get_char(s),&len);
  1928 			if (g_utf8_strchr("aeiou",-1,decomposition[0]))
  1929 			    istypo=TRUE;
  1930 			g_free(decomposition);
  1931 		    }
  1932 		}
  1933 		if (istypo &&
  1934 		  (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
  1935 		{
  1936 		    g_tree_insert(qperiod,g_strdup(testword),
  1937 		      GINT_TO_POINTER(1));
  1938 		    if (pswit[ECHO_SWITCH])
  1939 			g_print("\n%s\n",aline);
  1940 		    if (!pswit[OVERVIEW_SWITCH])
  1941 			g_print("    Line %ld column %ld - Extra period?\n",
  1942 			  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  1943 		    else
  1944 			cnt_punct++;
  1945 		}
  1946 		g_free(testword);
  1947 	    }
  1948 	    t=g_utf8_next_char(t);
  1949 	}
  1950     }
  1951 }
  1952 
  1953 /*
  1954  * check_for_following_punctuation:
  1955  *
  1956  * Check for words usually not followed by punctuation.
  1957  */
  1958 void check_for_following_punctuation(const char *aline)
  1959 {
  1960     int i;
  1961     const char *s,*wordstart;
  1962     gunichar c;
  1963     gchar *inword,*t;
  1964     if (pswit[TYPO_SWITCH])
  1965     {
  1966 	for (s=aline;*s;)
  1967 	{
  1968 	    wordstart=s;
  1969 	    t=getaword(&s);
  1970 	    if (!*t)
  1971 	    {
  1972 		g_free(t);
  1973 		continue;
  1974 	    }
  1975 	    inword=g_utf8_strdown(t,-1);
  1976 	    g_free(t);
  1977 	    for (i=0;*nocomma[i];i++)
  1978 		if (!strcmp(inword,nocomma[i]))
  1979 		{
  1980 		    c=g_utf8_get_char(s);
  1981 		    if (c==',' || c==';' || c==':')
  1982 		    {
  1983 			if (pswit[ECHO_SWITCH])
  1984 			    g_print("\n%s\n",aline);
  1985 			if (!pswit[OVERVIEW_SWITCH])
  1986 			    g_print("    Line %ld column %ld - "
  1987 			      "Query punctuation after %s?\n",
  1988 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,
  1989 			      inword);
  1990 			else
  1991 			    cnt_punct++;
  1992 		    }
  1993 		}
  1994 	    for (i=0;*noperiod[i];i++)
  1995 		if (!strcmp(inword,noperiod[i]))
  1996 		{
  1997 		    c=g_utf8_get_char(s);
  1998 		    if (c=='.' || c=='!')
  1999 		    {
  2000 			if (pswit[ECHO_SWITCH])
  2001 			    g_print("\n%s\n",aline);
  2002 			if (!pswit[OVERVIEW_SWITCH])
  2003 			    g_print("    Line %ld column %ld - "
  2004 			      "Query punctuation after %s?\n",
  2005 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,
  2006 			      inword);
  2007 			else
  2008 			    cnt_punct++;
  2009 		    }
  2010 		}
  2011 	    g_free(inword);
  2012 	}
  2013     }
  2014 }
  2015 
  2016 /*
  2017  * check_for_typos:
  2018  *
  2019  * Check for commonly mistyped words,
  2020  * and digits like 0 for O in a word.
  2021  */
  2022 void check_for_typos(const char *aline,struct warnings *warnings)
  2023 {
  2024     const char *s,*t,*nt,*wordstart;
  2025     gchar *inword;
  2026     gunichar *decomposition;
  2027     gchar *testword;
  2028     int i,vowel,consonant,*dupcnt;
  2029     gboolean isdup,istypo,alower;
  2030     gunichar c,pc;
  2031     long offset,len;
  2032     gsize decomposition_len;
  2033     for (s=aline;*s;)
  2034     {
  2035 	wordstart=s;
  2036 	inword=getaword(&s);
  2037 	if (!*inword)
  2038 	{
  2039 	    g_free(inword);
  2040 	    continue; /* don't bother with empty lines */
  2041 	}
  2042 	if (mixdigit(inword))
  2043 	{
  2044 	    if (pswit[ECHO_SWITCH])
  2045 		g_print("\n%s\n",aline);
  2046 	    if (!pswit[OVERVIEW_SWITCH])
  2047 		g_print("    Line %ld column %ld - Query digit in %s\n",
  2048 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
  2049 	    else
  2050 		cnt_word++;
  2051 	}
  2052 	/*
  2053 	 * Put the word through a series of tests for likely typos and OCR
  2054 	 * errors.
  2055 	 */
  2056 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
  2057 	{
  2058 	    istypo=FALSE;
  2059 	    alower=FALSE;
  2060 	    for (t=inword;*t;t=g_utf8_next_char(t))
  2061 	    {
  2062 		c=g_utf8_get_char(t);
  2063 		nt=g_utf8_next_char(t);
  2064 		/* lowercase for testing */
  2065 		if (g_unichar_islower(c))
  2066 		    alower=TRUE;
  2067 		if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
  2068 		{
  2069 		    /*
  2070 		     * We have an uppercase mid-word. However, there are
  2071 		     * common cases:
  2072 		     *   Mac and Mc like McGill
  2073 		     *   French contractions like l'Abbe
  2074 		     */
  2075 		    offset=g_utf8_pointer_to_offset(inword,t);
  2076 		    if (offset>0)
  2077 			pc=g_utf8_get_char(g_utf8_prev_char(t));
  2078 		    else
  2079 			pc='\0';
  2080 		    if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
  2081 		      offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
  2082 		      g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
  2083 		      CHAR_IS_APOSTROPHE(pc))
  2084 			; /* do nothing! */
  2085 		    else
  2086 			istypo=TRUE;
  2087 		}
  2088 	    }
  2089 	    testword=g_utf8_casefold(inword,-1);
  2090 	}
  2091 	if (pswit[TYPO_SWITCH])
  2092 	{
  2093 	    /*
  2094 	     * Check for certain unlikely two-letter combinations at word
  2095 	     * start and end.
  2096 	     */
  2097 	    len=g_utf8_strlen(testword,-1);
  2098 	    if (len>1)
  2099 	    {
  2100 		for (i=0;*nostart[i];i++)
  2101 		    if (g_str_has_prefix(testword,nostart[i]))
  2102 			istypo=TRUE;
  2103 		for (i=0;*noend[i];i++)
  2104 		    if (g_str_has_suffix(testword,noend[i]))
  2105 			istypo=TRUE;
  2106 	    }
  2107 	    /* ght is common, gbt never. Like that. */
  2108 	    if (strstr(testword,"cb"))
  2109 		istypo=TRUE;
  2110 	    if (strstr(testword,"gbt"))
  2111 		istypo=TRUE;
  2112 	    if (strstr(testword,"pbt"))
  2113 		istypo=TRUE;
  2114 	    if (strstr(testword,"tbs"))
  2115 		istypo=TRUE;
  2116 	    if (strstr(testword,"mrn"))
  2117 		istypo=TRUE;
  2118 	    if (strstr(testword,"ahle"))
  2119 		istypo=TRUE;
  2120 	    if (strstr(testword,"ihle"))
  2121 		istypo=TRUE;
  2122 	    /*
  2123 	     * "TBE" does happen - like HEARTBEAT - but uncommon.
  2124 	     * Also "TBI" - frostbite, outbid - but uncommon.
  2125 	     * Similarly "ii" like Hawaii, or Pompeii, and in Roman
  2126 	     * numerals, but "ii" is a common scanno.
  2127 	     */
  2128 	    if (strstr(testword,"tbi"))
  2129 		istypo=TRUE;
  2130 	    if (strstr(testword,"tbe"))
  2131 		istypo=TRUE;
  2132 	    if (strstr(testword,"ii"))
  2133 		istypo=TRUE;
  2134 	    /*
  2135 	     * Check for no vowels or no consonants.
  2136 	     * If none, flag a typo.
  2137 	     */
  2138 	    if (!istypo && len>1)
  2139 	    {
  2140 		vowel=consonant=0;
  2141 		for (t=testword;*t;t=g_utf8_next_char(t))
  2142 		{
  2143 		    c=g_utf8_get_char(t);
  2144 		    decomposition=
  2145 		      g_unicode_canonical_decomposition(c,&decomposition_len);
  2146 		    if (c=='y' || g_unichar_isdigit(c))
  2147 		    {
  2148 			/* Yah, this is loose. */
  2149 			vowel++;
  2150 			consonant++;
  2151 		    }
  2152 		    else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
  2153 			vowel++;
  2154 		    else
  2155 			consonant++;
  2156 		    g_free(decomposition);
  2157 		}
  2158 		if (!vowel || !consonant)
  2159 		    istypo=TRUE;
  2160 	    }
  2161 	    /*
  2162 	     * Now exclude the word from being reported if it's in
  2163 	     * the okword list.
  2164 	     */
  2165 	    for (i=0;*okword[i];i++)
  2166 		if (!strcmp(testword,okword[i]))
  2167 		    istypo=FALSE;
  2168 	    /*
  2169 	     * What looks like a typo may be a Roman numeral.
  2170 	     * Exclude these.
  2171 	     */
  2172 	    if (istypo && isroman(testword))
  2173 		istypo=FALSE;
  2174 	    /* Check the manual list of typos. */
  2175 	    if (!istypo)
  2176 		for (i=0;*typo[i];i++)
  2177 		    if (!strcmp(testword,typo[i]))
  2178 			istypo=TRUE;
  2179 	    /*
  2180 	     * Check lowercase s, l, i and m - special cases.
  2181 	     *   "j" - often a semi-colon gone wrong.
  2182 	     *   "d" for a missing apostrophe - he d
  2183 	     *   "n" for "in"
  2184 	     */
  2185 	    if (!istypo && len==1 &&
  2186 	      g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
  2187 		istypo=TRUE;
  2188 	    if (istypo)
  2189 	    {
  2190 		dupcnt=g_tree_lookup(qword,testword);
  2191 		if (dupcnt)
  2192 		{
  2193 		    (*dupcnt)++;
  2194 		    isdup=!pswit[VERBOSE_SWITCH];
  2195 		}
  2196 		else
  2197 		{
  2198 		    dupcnt=g_new0(int,1);
  2199 		    g_tree_insert(qword,g_strdup(testword),dupcnt);
  2200 		    isdup=FALSE;
  2201 		}
  2202 		if (!isdup)
  2203 		{
  2204 		    if (pswit[ECHO_SWITCH])
  2205 			g_print("\n%s\n",aline);
  2206 		    if (!pswit[OVERVIEW_SWITCH])
  2207 		    {
  2208 			g_print("    Line %ld column %ld - Query word %s",
  2209 			  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
  2210 			  inword);
  2211 			if (!pswit[VERBOSE_SWITCH])
  2212 			    g_print(" - not reporting duplicates");
  2213 			g_print("\n");
  2214 		    }
  2215 		    else
  2216 			cnt_word++;
  2217 		}
  2218 	    }
  2219 	}
  2220 	/* check the user's list of typos */
  2221 	if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
  2222 	{
  2223 	    if (pswit[ECHO_SWITCH])
  2224 		g_print("\n%s\n",aline);
  2225 	    if (!pswit[OVERVIEW_SWITCH])  
  2226 		g_print("    Line %ld column %ld - Query possible scanno %s\n",
  2227 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
  2228 	}
  2229 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
  2230 	    g_free(testword);
  2231 	if (pswit[PARANOID_SWITCH] && warnings->digit)
  2232 	{
  2233 	    /* In paranoid mode, query all 0 and 1 standing alone. */
  2234 	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))
  2235 	    {
  2236 		if (pswit[ECHO_SWITCH])
  2237 		    g_print("\n%s\n",aline);
  2238 		if (!pswit[OVERVIEW_SWITCH])
  2239 		    g_print("    Line %ld column %ld - Query standalone %s\n",
  2240 		      linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
  2241 		      inword);
  2242 		else
  2243 		    cnt_word++;
  2244 	    }
  2245 	}
  2246 	g_free(inword);
  2247     }
  2248 }
  2249 
  2250 /*
  2251  * check_for_misspaced_punctuation:
  2252  *
  2253  * Look for added or missing spaces around punctuation and quotes.
  2254  * If there is a punctuation character like ! with no space on
  2255  * either side, suspect a missing!space. If there are spaces on
  2256  * both sides , assume a typo. If we see a double quote with no
  2257  * space or punctuation on either side of it, assume unspaced
  2258  * quotes "like"this.
  2259  */
  2260 void check_for_misspaced_punctuation(const char *aline,
  2261   struct parities *parities,gboolean isemptyline)
  2262 {
  2263     gboolean isacro,isellipsis;
  2264     const char *s;
  2265     gunichar c,nc,pc,n2c;
  2266     int parity;
  2267     c=g_utf8_get_char(aline);
  2268     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2269     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2270     {
  2271 	pc=c;
  2272 	c=nc;
  2273 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2274 	/* For each character in the line after the first. */
  2275 	if (g_utf8_strchr(".?!,;:_",-1,c))  /* if it's punctuation */
  2276 	{
  2277 	    /* we need to suppress warnings for acronyms like M.D. */
  2278 	    isacro=FALSE;
  2279 	    /* we need to suppress warnings for ellipsis . . . */
  2280 	    isellipsis=FALSE;
  2281 	    /*
  2282 	     * If there are letters on both sides of it or
  2283 	     * if it's strict punctuation followed by an alpha.
  2284 	     */
  2285 	    if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
  2286 	      g_utf8_strchr("?!,;:",-1,c)))
  2287 	    {
  2288 		if (c=='.')
  2289 		{
  2290 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&
  2291 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
  2292 			isacro=TRUE;
  2293 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
  2294 		    if (nc && n2c=='.')
  2295 			isacro=TRUE;
  2296 		}
  2297 		if (!isacro)
  2298 		{
  2299 		    if (pswit[ECHO_SWITCH])
  2300 			g_print("\n%s\n",aline);
  2301 		    if (!pswit[OVERVIEW_SWITCH])
  2302 			g_print("    Line %ld column %ld - Missing space?\n",
  2303 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2304 		    else
  2305 			cnt_punct++;
  2306 		}
  2307 	    }
  2308 	    if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
  2309 	    {
  2310 		/*
  2311 		 * If there are spaces on both sides,
  2312 		 * or space before and end of line.
  2313 		 */
  2314 		if (c=='.')
  2315 		{
  2316 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&
  2317 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
  2318 			isellipsis=TRUE;
  2319 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
  2320 		    if (nc && n2c=='.')
  2321 			isellipsis=TRUE;
  2322 		}
  2323 		if (!isemptyline && !isellipsis)
  2324 		{
  2325 		    if (pswit[ECHO_SWITCH])
  2326 			g_print("\n%s\n",aline);
  2327 		    if (!pswit[OVERVIEW_SWITCH])
  2328 			g_print("    Line %ld column %ld - "
  2329 			  "Spaced punctuation?\n",linecnt,
  2330 			  g_utf8_pointer_to_offset(aline,s)+1);
  2331 		    else
  2332 			cnt_punct++;
  2333 		}
  2334 	    }
  2335 	}
  2336     }
  2337     /* Split out the characters that CANNOT be preceded by space. */
  2338     c=g_utf8_get_char(aline);
  2339     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2340     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2341     {
  2342 	pc=c;
  2343 	c=nc;
  2344 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2345 	/* for each character in the line after the first */
  2346 	if (g_utf8_strchr("?!,;:",-1,c))
  2347 	{
  2348 	    /* if it's punctuation that _cannot_ have a space before it */
  2349 	    if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
  2350 	    {
  2351 		/*
  2352 		 * If nc DOES == space,
  2353 		 * it was already reported just above.
  2354 		 */
  2355 		if (pswit[ECHO_SWITCH])
  2356 		    g_print("\n%s\n",aline);
  2357 		if (!pswit[OVERVIEW_SWITCH])
  2358 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",
  2359 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2360 		else
  2361 		    cnt_punct++;
  2362 	    }
  2363 	}
  2364     }
  2365     /*
  2366      * Special case " .X" where X is any alpha.
  2367      * This plugs a hole in the acronym code above.
  2368      * Inelegant, but maintainable.
  2369      */
  2370     c=g_utf8_get_char(aline);
  2371     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2372     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2373     {
  2374 	pc=c;
  2375 	c=nc;
  2376 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2377 	/* for each character in the line after the first */
  2378 	if (c=='.')
  2379 	{
  2380 	    /* if it's a period */
  2381 	    if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
  2382 	    {
  2383 		/*
  2384 		 * If the period follows a space and
  2385 		 * is followed by a letter.
  2386 		 */
  2387 		if (pswit[ECHO_SWITCH])
  2388 		    g_print("\n%s\n",aline);
  2389 		if (!pswit[OVERVIEW_SWITCH])
  2390 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",
  2391 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2392 		else
  2393 		    cnt_punct++;
  2394 	    }
  2395 	}
  2396     }
  2397     c=g_utf8_get_char(aline);
  2398     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2399     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2400     {
  2401 	pc=c;
  2402 	c=nc;
  2403 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2404 	/* for each character in the line after the first */
  2405 	if (CHAR_IS_DQUOTE(c))
  2406 	{
  2407 	    if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
  2408 	      !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
  2409 	      !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
  2410 	    {
  2411 		if (pswit[ECHO_SWITCH])
  2412 		    g_print("\n%s\n",aline);
  2413 		if (!pswit[OVERVIEW_SWITCH])
  2414 		    g_print("    Line %ld column %ld - Unspaced quotes?\n",
  2415 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2416 		else
  2417 		    cnt_punct++;
  2418 	    }
  2419 	}
  2420     }
  2421     /* Check parity of quotes. */
  2422     nc=g_utf8_get_char(aline);
  2423     for (s=aline;*s;s=g_utf8_next_char(s))
  2424     {
  2425 	c=nc;
  2426 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2427 	if (CHAR_IS_DQUOTE(c))
  2428 	{
  2429 	    if (c==CHAR_DQUOTE)
  2430 	    {
  2431 		parities->dquote=!parities->dquote;
  2432 		parity=parities->dquote;
  2433 	    }
  2434 	    else if (c==CHAR_LD_QUOTE)
  2435 		parity=1;
  2436 	    else
  2437 		parity=0;
  2438 	    if (!parity)
  2439 	    {
  2440 		/* parity even */
  2441 		if (!g_utf8_strchr("_-.'`‘’/,;:!?)]} ",-1,nc))
  2442 		{
  2443 		    if (pswit[ECHO_SWITCH])
  2444 			g_print("\n%s\n",aline);
  2445 		    if (!pswit[OVERVIEW_SWITCH])
  2446 			g_print("    Line %ld column %ld - "
  2447 			  "Wrongspaced quotes?\n",
  2448 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2449 		    else
  2450 			cnt_punct++;
  2451 		}
  2452 	    }
  2453 	    else
  2454 	    {
  2455 		/* parity odd */
  2456 		if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
  2457 		  !g_utf8_strchr("_-/.'`‘’([{$",-1,nc) || !nc)
  2458 		{
  2459 		    if (pswit[ECHO_SWITCH])
  2460 			g_print("\n%s\n",aline);
  2461 		    if (!pswit[OVERVIEW_SWITCH])
  2462 			g_print("    Line %ld column %ld - "
  2463 			  "Wrongspaced quotes?\n",
  2464 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2465 		    else
  2466 			cnt_punct++;
  2467 		}
  2468 	    }
  2469 	}
  2470     }
  2471     c=g_utf8_get_char(aline);
  2472     if (CHAR_IS_DQUOTE(c))
  2473     {
  2474 	if (g_utf8_strchr(",;:!?)]} ",-1,
  2475 	  g_utf8_get_char(g_utf8_next_char(aline))))
  2476 	{
  2477 	    if (pswit[ECHO_SWITCH])
  2478 		g_print("\n%s\n",aline);
  2479 	    if (!pswit[OVERVIEW_SWITCH])
  2480 		g_print("    Line %ld column 1 - Wrongspaced quotes?\n",
  2481 		  linecnt);
  2482 	    else
  2483 		cnt_punct++;
  2484 	}
  2485     }
  2486     if (pswit[SQUOTE_SWITCH])
  2487     {
  2488 	nc=g_utf8_get_char(aline);
  2489 	for (s=aline;*s;s=g_utf8_next_char(s))
  2490 	{
  2491 	    c=nc;
  2492 	    nc=g_utf8_get_char(g_utf8_next_char(s));
  2493 	    if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&
  2494 	      !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
  2495 	      !g_unichar_isalpha(nc)))
  2496 	    {
  2497 		parities->squote=!parities->squote;
  2498 		if (!parities->squote)
  2499 		{
  2500 		    /* parity even */
  2501 		    if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
  2502 		    {
  2503 			if (pswit[ECHO_SWITCH])
  2504 			    g_print("\n%s\n",aline);
  2505 			if (!pswit[OVERVIEW_SWITCH])
  2506 			    g_print("    Line %ld column %ld - "
  2507 			      "Wrongspaced singlequotes?\n",
  2508 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2509 			else
  2510 			    cnt_punct++;
  2511 		    }
  2512 		}
  2513 		else
  2514 		{
  2515 		    /* parity odd */
  2516 		    if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
  2517 		      !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
  2518 		    {
  2519 			if (pswit[ECHO_SWITCH])
  2520 			    g_print("\n%s\n",aline);
  2521 			if (!pswit[OVERVIEW_SWITCH])
  2522 			    g_print("    Line %ld column %ld - "
  2523 			      "Wrongspaced singlequotes?\n",
  2524 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2525 			else
  2526 			    cnt_punct++;
  2527 		    }
  2528 		}
  2529 	    }
  2530 	}
  2531     }
  2532 }
  2533 
  2534 /*
  2535  * str_follows_word:
  2536  *
  2537  * Given a position p within a string str, determine whether it follows the
  2538  * given word. This is roughly equivalent to the regular expression (?<=\bword)
  2539  * but has different boundary conditions.
  2540  */
  2541 static gboolean str_follows_word(const char *str,const char *p,const char *word)
  2542 {
  2543     int len=strlen(word);
  2544     if (p-len<str)
  2545 	return FALSE;
  2546     else if (!g_str_has_prefix(p-len,word))
  2547 	return FALSE;
  2548     else if (p-len==str)
  2549 	return TRUE;
  2550     else
  2551 	/* Using non-alpha as a word boundary. See UAX #29 for a better way. */
  2552 	return !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(p-len)));
  2553 }
  2554 
  2555 /*
  2556  * check_for_double_punctuation:
  2557  *
  2558  * Look for double punctuation like ,. or ,,
  2559  * Thanks to DW for the suggestion!
  2560  * In books with references, ".," and ".;" are common
  2561  * e.g. "etc., etc.," and vol. 1.; vol 3.;
  2562  * OTOH, from my initial tests, there are also fairly
  2563  * common errors. What to do? Make these cases paranoid?
  2564  * ".," is the most common, so warnings->dotcomma is used
  2565  * to suppress detailed reporting if it occurs often.
  2566  * Indeed, ".," is so common after "etc" or "&c" that
  2567  * we don't warn on these cases at all.
  2568  */
  2569 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
  2570 {
  2571     const char *s;
  2572     gunichar c,nc;
  2573     gboolean is_query;
  2574     nc=g_utf8_get_char(aline);
  2575     for (s=aline;*s;s=g_utf8_next_char(s))
  2576     {
  2577 	c=nc;
  2578 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2579 	/* for each punctuation character in the line */
  2580 	if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
  2581 	  g_utf8_strchr(".?!,;:",-1,nc))
  2582 	{
  2583 	    /* followed by punctuation, it's a query, unless . . . */
  2584 	    is_query=TRUE;
  2585 	    if (warnings->isFrench &&
  2586 	      (g_str_has_prefix(s,",...") || g_str_has_prefix(s,"...,") ||
  2587 	       g_str_has_prefix(s,";...") || g_str_has_prefix(s,"...;") ||
  2588 	       g_str_has_prefix(s,":...") || g_str_has_prefix(s,"...:") ||
  2589 	       g_str_has_prefix(s,"!...") || g_str_has_prefix(s,"...!") ||
  2590 	       g_str_has_prefix(s,"?...") || g_str_has_prefix(s,"...?")))
  2591 	    {
  2592 		s+=4;
  2593 		nc=g_utf8_get_char(g_utf8_next_char(s));
  2594 		is_query=FALSE;
  2595 	    }
  2596 	    else if (c==nc && (c=='.' || c=='?' || c=='!'))
  2597 	    {
  2598 		/* do nothing for .. !! and ?? which can be legit */
  2599 		is_query=FALSE;
  2600 	    }
  2601 	    else if (c=='.' && nc==',')
  2602 	    {
  2603 		if (!warnings->dotcomma || str_follows_word(aline,s,"etc") || 
  2604 		  str_follows_word(aline,s,"&c"))
  2605 		    is_query=FALSE;
  2606 	    }
  2607 	    if (is_query)
  2608 	    {
  2609 		if (pswit[ECHO_SWITCH])
  2610 		    g_print("\n%s\n",aline);
  2611 		if (!pswit[OVERVIEW_SWITCH])
  2612 		    g_print("    Line %ld column %ld - Double punctuation?\n",
  2613 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2614 		else
  2615 		    cnt_punct++;
  2616 	    }
  2617 	}
  2618     }
  2619 }
  2620 
  2621 /*
  2622  * check_for_spaced_quotes:
  2623  */
  2624 void check_for_spaced_quotes(const char *aline)
  2625 {
  2626     int i;
  2627     const char *s,*t;
  2628     const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,
  2629       CHAR_RS_QUOTE};
  2630     GString *pattern;
  2631     s=aline;
  2632     while ((t=strstr(s," \" ")))
  2633     {
  2634 	if (pswit[ECHO_SWITCH])
  2635 	    g_print("\n%s\n",aline);
  2636 	if (!pswit[OVERVIEW_SWITCH])
  2637 	    g_print("    Line %ld column %ld - Spaced doublequote?\n",
  2638 	      linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  2639 	else
  2640 	    cnt_punct++;
  2641 	s=g_utf8_next_char(g_utf8_next_char(t));
  2642     }
  2643     pattern=g_string_new(NULL);
  2644     for(i=0;i<G_N_ELEMENTS(single_quotes);i++)
  2645     {
  2646 	g_string_assign(pattern," ");
  2647 	g_string_append_unichar(pattern,single_quotes[i]);
  2648 	g_string_append_c(pattern,' ');
  2649 	s=aline;
  2650 	while ((t=strstr(s,pattern->str)))
  2651 	{
  2652 	    if (pswit[ECHO_SWITCH])
  2653 		g_print("\n%s\n",aline);
  2654 	    if (!pswit[OVERVIEW_SWITCH])
  2655 		g_print("    Line %ld column %ld - Spaced singlequote?\n",
  2656 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  2657 	    else
  2658 		cnt_punct++;
  2659 	    s=g_utf8_next_char(g_utf8_next_char(t));
  2660 	}
  2661     }
  2662     g_string_free(pattern,TRUE);
  2663 }
  2664 
  2665 /*
  2666  * check_for_miscased_genative:
  2667  *
  2668  * Check special case of 'S instead of 's at end of word.
  2669  */
  2670 void check_for_miscased_genative(const char *aline)
  2671 {
  2672     const char *s;
  2673     gunichar c,nc,pc;
  2674     if (!*aline)
  2675 	return;
  2676     c=g_utf8_get_char(aline);
  2677     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2678     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2679     {
  2680 	pc=c;
  2681 	c=nc;
  2682 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2683 	if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))
  2684 	{
  2685 	    if (pswit[ECHO_SWITCH])
  2686 		g_print("\n%s\n",aline);
  2687 	    if (!pswit[OVERVIEW_SWITCH])
  2688 		g_print("    Line %ld column %ld - Capital \"S\"?\n",
  2689 		  linecnt,g_utf8_pointer_to_offset(aline,s)+2);
  2690 	    else
  2691 		cnt_punct++;
  2692 	}
  2693     }
  2694 }
  2695 
  2696 /*
  2697  * check_end_of_line:
  2698  *
  2699  * Now check special cases - start and end of line -
  2700  * for single and double quotes. Start is sometimes [sic]
  2701  * but better to query it anyway.
  2702  * While we're here, check for dash at end of line.
  2703  */
  2704 void check_end_of_line(const char *aline,struct warnings *warnings)
  2705 {
  2706     int lbytes;
  2707     const char *s;
  2708     gunichar c1,c2;
  2709     lbytes=strlen(aline);
  2710     if (g_utf8_strlen(aline,lbytes)>1)
  2711     {
  2712 	s=g_utf8_prev_char(aline+lbytes);
  2713 	c1=g_utf8_get_char(s);
  2714 	c2=g_utf8_get_char(g_utf8_prev_char(s));
  2715 	if ((CHAR_IS_DQUOTE(c1) || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)
  2716 	{
  2717 	    if (pswit[ECHO_SWITCH])
  2718 		g_print("\n%s\n",aline);
  2719 	    if (!pswit[OVERVIEW_SWITCH])
  2720 		g_print("    Line %ld column %ld - Spaced quote?\n",linecnt,
  2721 		  g_utf8_strlen(aline,lbytes));
  2722 	    else
  2723 		cnt_punct++;
  2724 	}
  2725 	c1=g_utf8_get_char(aline);
  2726 	c2=g_utf8_get_char(g_utf8_next_char(aline));
  2727 	if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)
  2728 	{
  2729 	    if (pswit[ECHO_SWITCH])
  2730 		g_print("\n%s\n",aline);
  2731 	    if (!pswit[OVERVIEW_SWITCH])
  2732 		g_print("    Line %ld column 1 - Spaced quote?\n",linecnt);
  2733 	    else
  2734 		cnt_punct++;
  2735 	}
  2736 	/*
  2737 	 * Dash at end of line may well be legit - paranoid mode only
  2738 	 * and don't report em-dash at line-end.
  2739 	 */
  2740 	if (pswit[PARANOID_SWITCH] && warnings->hyphen)
  2741 	{
  2742 	    for (s=g_utf8_prev_char(aline+lbytes);
  2743 	      s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
  2744 		;
  2745 	    if (g_utf8_get_char(s)=='-' &&
  2746 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')
  2747 	    {
  2748 		if (pswit[ECHO_SWITCH])
  2749 		    g_print("\n%s\n",aline);
  2750 		if (!pswit[OVERVIEW_SWITCH])
  2751 		    g_print("    Line %ld column %ld - "
  2752 		      "Hyphen at end of line?\n",
  2753 		      linecnt,g_utf8_pointer_to_offset(aline,s));
  2754 	    }
  2755 	}
  2756     }
  2757 }
  2758 
  2759 /*
  2760  * check_for_unspaced_bracket:
  2761  *
  2762  * Brackets are often unspaced, but shouldn't be surrounded by alpha.
  2763  * If so, suspect a scanno like "a]most".
  2764  */
  2765 void check_for_unspaced_bracket(const char *aline)
  2766 {
  2767     const char *s;
  2768     gunichar c,nc,pc;
  2769     c=g_utf8_get_char(aline);
  2770     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2771     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2772     {
  2773 	pc=c;
  2774 	c=nc;
  2775 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2776 	if (!nc)
  2777 	    break;
  2778 	/* for each bracket character in the line except 1st & last */
  2779 	if (g_utf8_strchr("{[()]}",-1,c) &&
  2780 	  g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
  2781 	{
  2782 	    if (pswit[ECHO_SWITCH])
  2783 		g_print("\n%s\n",aline);
  2784 	    if (!pswit[OVERVIEW_SWITCH])
  2785 		g_print("    Line %ld column %ld - Unspaced bracket?\n",
  2786 		  linecnt,g_utf8_pointer_to_offset(aline,s));
  2787 	    else
  2788 		cnt_punct++;
  2789 	}
  2790     }
  2791 }
  2792 
  2793 /*
  2794  * check_for_unpunctuated_endquote:
  2795  */
  2796 void check_for_unpunctuated_endquote(const char *aline)
  2797 {
  2798     const char *s;
  2799     gunichar c,nc,pc;
  2800     QuoteClass qc;
  2801     c=g_utf8_get_char(aline);
  2802     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2803     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2804     {
  2805 	pc=c;
  2806 	c=nc;
  2807 	qc=CHAR_IS_DQUOTE(c)?QUOTE_CLASS(c):INVALID_QUOTE;
  2808 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2809 	/* for each character in the line except 1st */
  2810 	if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) && g_unichar_isalpha(pc))
  2811 	{
  2812 	    if (pswit[ECHO_SWITCH])
  2813 		g_print("\n%s\n",aline);
  2814 	    if (!pswit[OVERVIEW_SWITCH])
  2815 		g_print("    Line %ld column %ld - "
  2816 		  "endquote missing punctuation?\n",
  2817 		  linecnt,g_utf8_pointer_to_offset(aline,s));
  2818 	    else
  2819 		cnt_punct++;
  2820 	}
  2821     }
  2822 }
  2823 
  2824 /*
  2825  * check_for_html_tag:
  2826  *
  2827  * Check for <HTML TAG>.
  2828  *
  2829  * If there is a < in the line, followed at some point
  2830  * by a > then we suspect HTML.
  2831  */
  2832 void check_for_html_tag(const char *aline)
  2833 {
  2834     const char *open,*close;
  2835     gchar *tag;
  2836     open=strchr(aline,'<');
  2837     if (open)
  2838     {
  2839 	close=strchr(g_utf8_next_char(open),'>');
  2840 	if (close)
  2841 	{
  2842 	    if (pswit[ECHO_SWITCH])
  2843 		g_print("\n%s\n",aline);
  2844 	    if (!pswit[OVERVIEW_SWITCH])
  2845 	    {
  2846 		tag=g_strndup(open,close-open+1);
  2847 		g_print("    Line %ld column %ld - HTML Tag? %s \n",
  2848 		  linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
  2849 		g_free(tag);
  2850 	    }
  2851 	    else
  2852 		cnt_html++;
  2853 	}
  2854     }
  2855 }
  2856 
  2857 /*
  2858  * check_for_html_entity:
  2859  *
  2860  * Check for &symbol; HTML.
  2861  *
  2862  * If there is a & in the line, followed at
  2863  * some point by a ; then we suspect HTML.
  2864  */
  2865 void check_for_html_entity(const char *aline)
  2866 {
  2867     const char *s,*amp,*scolon;
  2868     gchar *entity;
  2869     amp=strchr(aline,'&');
  2870     if (amp)
  2871     {
  2872 	scolon=strchr(amp,';');
  2873 	if (scolon)
  2874 	{
  2875 	    for (s=amp;s<scolon;s=g_utf8_next_char(s))   
  2876 		if (g_utf8_get_char(s)==CHAR_SPACE)
  2877 		    break;		/* Don't report "Jones & Son;" */
  2878 	    if (s>=scolon)
  2879 	    {
  2880 		if (pswit[ECHO_SWITCH])
  2881 		    g_print("\n%s\n",aline);
  2882 		if (!pswit[OVERVIEW_SWITCH])
  2883 		{
  2884 		    entity=g_strndup(amp,scolon-amp+1);
  2885 		    g_print("    Line %ld column %d - HTML symbol? %s \n",
  2886 		      linecnt,(int)(amp-aline)+1,entity);
  2887 		    g_free(entity);
  2888 		}
  2889 		else
  2890 		    cnt_html++;
  2891 	    }
  2892 	}
  2893     }
  2894 }
  2895 
  2896 /*
  2897  * check_for_omitted_punctuation:
  2898  *
  2899  * Check for omitted punctuation at end of paragraph by working back
  2900  * through prevline. DW.
  2901  * Need to check this only for "normal" paras.
  2902  * So what is a "normal" para?
  2903  *    Not normal if one-liner (chapter headings, etc.)
  2904  *    Not normal if doesn't contain at least one locase letter
  2905  *    Not normal if starts with space
  2906  */
  2907 void check_for_omitted_punctuation(const char *prevline,
  2908   struct line_properties *last,int start_para_line)
  2909 {
  2910     gboolean letter_on_line=FALSE;
  2911     const char *s;
  2912     gunichar c;
  2913     gboolean closing_quote;
  2914     for (s=prevline;*s;s=g_utf8_next_char(s))
  2915 	if (g_unichar_isalpha(g_utf8_get_char(s)))
  2916 	{
  2917 	    letter_on_line=TRUE;
  2918 	    break;
  2919 	}
  2920     /*
  2921      * This next "if" is a problem.
  2922      * If we say "start_para_line <= linecnt - 1", that includes
  2923      * one-line "paragraphs" like chapter heads. Lotsa false positives.
  2924      * If we say "start_para_line < linecnt - 1" it doesn't, but then it
  2925      * misses genuine one-line paragraphs.
  2926      */
  2927     if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
  2928       g_utf8_get_char(prevline)>CHAR_SPACE)
  2929     {
  2930 	s=prevline+strlen(prevline);
  2931 	do
  2932 	{
  2933 	    s=g_utf8_prev_char(s);
  2934 	    c=g_utf8_get_char(s);
  2935 	    if (QUOTE_CLASS(c)==CLOSING_QUOTE || QUOTE_CLASS(c)==NEUTRAL_QUOTE)
  2936 		closing_quote=TRUE;
  2937 	    else
  2938 		closing_quote=FALSE;
  2939 	} while (closing_quote && s>prevline);
  2940 	for (;s>prevline;s=g_utf8_prev_char(s))
  2941 	{
  2942 	    if (g_unichar_isalpha(g_utf8_get_char(s)))
  2943 	    {
  2944 		if (pswit[ECHO_SWITCH])
  2945 		    g_print("\n%s\n",prevline);
  2946 		if (!pswit[OVERVIEW_SWITCH])
  2947 		    g_print("    Line %ld column %ld - "
  2948 		      "No punctuation at para end?\n",
  2949 		      linecnt-1,g_utf8_strlen(prevline,-1));
  2950 		else
  2951 		    cnt_punct++;
  2952 		break;
  2953 	    }
  2954 	    if (g_utf8_strchr("—-.:!([{?}])",-1,g_utf8_get_char(s)))
  2955 		break;
  2956 	}
  2957     }
  2958 }
  2959 
  2960 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
  2961 {
  2962     const char *word=key;
  2963     int *dupcnt=value;
  2964     if (*dupcnt)
  2965 	g_print("\nNote: Queried word %s was duplicated %d times\n",
  2966 	  word,*dupcnt);
  2967     return FALSE;
  2968 }
  2969 
  2970 void print_as_windows_1252(const char *string)
  2971 {
  2972     gsize inbytes,outbytes;
  2973     gchar *buf,*bp;
  2974     static GIConv converter=(GIConv)-1;
  2975     if (!string)
  2976     {
  2977 	if (converter!=(GIConv)-1)
  2978 	    g_iconv_close(converter);
  2979 	converter=(GIConv)-1;
  2980 	return;
  2981     }
  2982     if (converter==(GIConv)-1)
  2983 	converter=g_iconv_open("WINDOWS-1252","UTF-8");
  2984     if (converter!=(GIConv)-1)
  2985     {
  2986 	inbytes=outbytes=strlen(string);
  2987 	bp=buf=g_malloc(outbytes+1);
  2988 	g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
  2989 	*bp='\0';
  2990 	fputs(buf,stdout);
  2991 	g_free(buf);
  2992     }
  2993     else
  2994 	fputs(string,stdout);
  2995 }
  2996 
  2997 void print_as_utf_8(const char *string)
  2998 {
  2999     fputs(string,stdout);
  3000 }
  3001 
  3002 /*
  3003  * procfile:
  3004  *
  3005  * Process one file.
  3006  */
  3007 void procfile(const char *filename)
  3008 {
  3009     const char *s;
  3010     gchar *parastart=NULL;	/* first line of current para */
  3011     gchar *etext,*aline;
  3012     gchar *etext_ptr;
  3013     GError *err=NULL;
  3014     struct first_pass_results *first_pass_results;
  3015     struct warnings *warnings;
  3016     struct counters counters={0};
  3017     struct line_properties last={0};
  3018     struct parities parities={0};
  3019     struct pending pending={0};
  3020     gboolean isemptyline;
  3021     long start_para_line=0;
  3022     gboolean isnewpara=FALSE,enddash=FALSE;
  3023     last.start=CHAR_SPACE;
  3024     linecnt=checked_linecnt=0;
  3025     etext=read_etext(filename,&err);
  3026     if (!etext)
  3027     {
  3028 	if (pswit[STDOUT_SWITCH])
  3029 	    fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
  3030 	else
  3031 	    fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
  3032 	exit(1);
  3033     }
  3034     g_print("\n\nFile: %s\n\n",filename);
  3035     first_pass_results=first_pass(etext);
  3036     warnings=report_first_pass(first_pass_results);
  3037     qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
  3038     qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
  3039     /*
  3040      * Here we go with the main pass. Hold onto yer hat!
  3041      */
  3042     linecnt=0;
  3043     etext_ptr=etext;
  3044     while ((aline=flgets(&etext_ptr,linecnt+1)))
  3045     {
  3046 	linecnt++;
  3047 	if (linecnt==1)
  3048 	    isnewpara=TRUE;
  3049 	if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
  3050 	    continue;    // skip DP page separators completely
  3051 	if (linecnt<first_pass_results->firstline ||
  3052 	  (first_pass_results->footerline>0 &&
  3053 	  linecnt>first_pass_results->footerline))
  3054 	{
  3055 	    if (pswit[HEADER_SWITCH])
  3056 	    {
  3057 		if (g_str_has_prefix(aline,"Title:"))
  3058 		    g_print("    %s\n",aline);
  3059 		if (g_str_has_prefix(aline,"Author:"))
  3060 		    g_print("    %s\n",aline);
  3061 		if (g_str_has_prefix(aline,"Release Date:"))
  3062 		    g_print("    %s\n",aline);
  3063 		if (g_str_has_prefix(aline,"Edition:"))
  3064 		    g_print("    %s\n\n",aline);
  3065 	    }
  3066 	    continue;		/* skip through the header */
  3067 	}
  3068 	checked_linecnt++;
  3069 	print_pending(aline,parastart,&pending);
  3070 	isemptyline=analyse_quotes(aline,&counters);
  3071 	if (isnewpara && !isemptyline)
  3072 	{
  3073 	    /* This line is the start of a new paragraph. */
  3074 	    start_para_line=linecnt;
  3075 	    /* Capture its first line in case we want to report it later. */
  3076 	    g_free(parastart);
  3077 	    parastart=g_strdup(aline);
  3078 	    memset(&parities,0,sizeof(parities));  /* restart the quote count */
  3079 	    s=aline;
  3080 	    while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
  3081 	      !g_unichar_isdigit(g_utf8_get_char(s)))
  3082 		s=g_utf8_next_char(s);
  3083 	    if (g_unichar_islower(g_utf8_get_char(s)))
  3084 	    {
  3085 		/* and its first letter is lowercase */
  3086 		if (pswit[ECHO_SWITCH])
  3087 		    g_print("\n%s\n",aline);
  3088 		if (!pswit[OVERVIEW_SWITCH])
  3089 		    g_print("    Line %ld column %ld - "
  3090 		      "Paragraph starts with lower-case\n",
  3091 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  3092 		else
  3093 		    cnt_punct++;
  3094 	    }
  3095 	    isnewpara=FALSE; /* Signal the end of new para processing. */
  3096 	}
  3097 	/* Check for an em-dash broken at line end. */
  3098 	if (enddash && g_utf8_get_char(aline)=='-')
  3099 	{
  3100 	    if (pswit[ECHO_SWITCH])
  3101 		g_print("\n%s\n",aline);
  3102 	    if (!pswit[OVERVIEW_SWITCH])
  3103 		g_print("    Line %ld column 1 - Broken em-dash?\n",linecnt);
  3104 	    else
  3105 		cnt_punct++;
  3106 	}
  3107 	enddash=FALSE;
  3108 	for (s=g_utf8_prev_char(aline+strlen(aline));
  3109 	  g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
  3110 	    ;
  3111 	if (s>=aline && g_utf8_get_char(s)=='-')
  3112 	    enddash=TRUE;
  3113 	check_for_control_characters(aline);
  3114 	check_for_odd_characters(aline,warnings,isemptyline);
  3115 	if (warnings->longline)
  3116 	    check_for_long_line(aline);
  3117 	if (warnings->shortline)
  3118 	    check_for_short_line(aline,&last);
  3119 	last.blen=last.len;
  3120 	last.len=g_utf8_strlen(aline,-1);
  3121 	last.start=g_utf8_get_char(aline);
  3122 	check_for_starting_punctuation(aline);
  3123 	if (warnings->dash)
  3124 	{
  3125 	    check_for_spaced_emdash(aline);
  3126 	    check_for_spaced_dash(aline);
  3127 	}
  3128 	check_for_unmarked_paragraphs(aline);
  3129 	check_for_jeebies(aline);
  3130 	check_for_mta_from(aline);
  3131 	check_for_orphan_character(aline);
  3132 	check_for_pling_scanno(aline);
  3133 	check_for_extra_period(aline,warnings);
  3134 	check_for_following_punctuation(aline);
  3135 	check_for_typos(aline,warnings);
  3136 	check_for_misspaced_punctuation(aline,&parities,isemptyline);
  3137 	check_for_double_punctuation(aline,warnings);
  3138 	check_for_spaced_quotes(aline);
  3139 	check_for_miscased_genative(aline);
  3140 	check_end_of_line(aline,warnings);
  3141 	check_for_unspaced_bracket(aline);
  3142 	if (warnings->endquote)
  3143 	    check_for_unpunctuated_endquote(aline);
  3144 	check_for_html_tag(aline);
  3145 	check_for_html_entity(aline);
  3146 	if (isemptyline)
  3147 	{
  3148 	    check_for_mismatched_quotes(&counters,&pending);
  3149 	    counters_reset(&counters);
  3150 	    /* let the next iteration know that it's starting a new para */
  3151 	    isnewpara=TRUE;
  3152 	    if (prevline)
  3153 		check_for_omitted_punctuation(prevline,&last,start_para_line);
  3154 	}
  3155 	g_free(prevline);
  3156 	prevline=g_strdup(aline);
  3157     }
  3158     linecnt++;
  3159     check_for_mismatched_quotes(&counters,&pending);
  3160     print_pending(NULL,parastart,&pending);
  3161     reset_pending(&pending);
  3162     if (prevline)
  3163     {
  3164 	g_free(prevline);
  3165 	prevline=NULL;
  3166     }
  3167     g_free(parastart);
  3168     g_free(prevline);
  3169     g_free(etext);
  3170     if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])
  3171 	g_tree_foreach(qword,report_duplicate_queries,NULL);
  3172     g_tree_unref(qword);
  3173     g_tree_unref(qperiod);
  3174     counters_destroy(&counters);
  3175     g_set_print_handler(NULL);
  3176     print_as_windows_1252(NULL);
  3177     if (pswit[MARKUP_SWITCH])  
  3178 	loseentities(NULL);
  3179 }
  3180 
  3181 /*
  3182  * flgets:
  3183  *
  3184  * Get one line from the input text, checking for
  3185  * the existence of exactly one CR/LF line-end per line.
  3186  *
  3187  * Returns: a pointer to the line.
  3188  */
  3189 char *flgets(char **etext,long lcnt)
  3190 {
  3191     gunichar c;
  3192     gboolean isCR=FALSE;
  3193     char *theline=*etext;
  3194     char *eos=theline;
  3195     gchar *s;
  3196     for (;;)
  3197     {
  3198 	c=g_utf8_get_char(*etext);
  3199 	if (!c)
  3200 	{
  3201 	    if (*etext==theline)
  3202 		return NULL;
  3203 	    else if (pswit[LINE_END_SWITCH])
  3204 	    {
  3205 		if (pswit[ECHO_SWITCH])
  3206 		{
  3207 		    s=g_strndup(theline,eos-theline);
  3208 		    g_print("\n%s\n",s);
  3209 		    g_free(s);
  3210 		}
  3211 		if (!pswit[OVERVIEW_SWITCH])
  3212 		    /* There may, or may not, have been a CR */
  3213 		    g_print("    Line %ld - No LF?\n",lcnt);
  3214 		else
  3215 		    cnt_lineend++;
  3216 	    }
  3217 	    break;
  3218 	}
  3219 	*etext=g_utf8_next_char(*etext);
  3220 	/* either way, it's end of line */
  3221 	if (c=='\n')
  3222 	{
  3223 	    if (isCR)
  3224 		break;
  3225 	    else
  3226 	    {
  3227 		/* Error - a LF without a preceding CR */
  3228 		if (pswit[LINE_END_SWITCH])
  3229 		{
  3230 		    if (pswit[ECHO_SWITCH])
  3231 		    {
  3232 			s=g_strndup(theline,eos-theline);
  3233 			g_print("\n%s\n",s);
  3234 			g_free(s);
  3235 		    }
  3236 		    if (!pswit[OVERVIEW_SWITCH])
  3237 			g_print("    Line %ld - No CR?\n",lcnt);
  3238 		    else
  3239 			cnt_lineend++;
  3240 		}
  3241 		break;
  3242 	    }
  3243 	}
  3244 	if (c=='\r')
  3245 	{
  3246 	    if (isCR)
  3247 	    {
  3248 		/* Error - two successive CRs */
  3249 		if (pswit[LINE_END_SWITCH])
  3250 		{
  3251 		    if (pswit[ECHO_SWITCH])
  3252 		    {
  3253 			s=g_strndup(theline,eos-theline);
  3254 			g_print("\n%s\n",s);
  3255 			g_free(s);
  3256 		    }
  3257 		    if (!pswit[OVERVIEW_SWITCH])
  3258 			g_print("    Line %ld - Two successive CRs?\n",lcnt);
  3259 		    else
  3260 			cnt_lineend++;
  3261 		}
  3262 	    }
  3263 	    isCR=TRUE;
  3264 	}
  3265 	else
  3266 	{
  3267 	    if (pswit[LINE_END_SWITCH] && isCR)
  3268 	    {
  3269 		if (pswit[ECHO_SWITCH])
  3270 		{
  3271 		    s=g_strndup(theline,eos-theline);
  3272 		    g_print("\n%s\n",s);
  3273 		    g_free(s);
  3274 		}
  3275 		if (!pswit[OVERVIEW_SWITCH])
  3276 		    g_print("    Line %ld column %ld - CR without LF?\n",
  3277 		      lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
  3278 		else
  3279 		    cnt_lineend++;
  3280 		*eos=' ';
  3281 	    }
  3282 	    isCR=FALSE;
  3283 	    eos=g_utf8_next_char(eos);
  3284 	}
  3285     }
  3286     *eos='\0';
  3287     if (pswit[MARKUP_SWITCH])  
  3288 	postprocess_for_HTML(theline);
  3289     if (pswit[DP_SWITCH])  
  3290 	postprocess_for_DP(theline);
  3291     return theline;
  3292 }
  3293 
  3294 /*
  3295  * mixdigit:
  3296  *
  3297  * Takes a "word" as a parameter, and checks whether it
  3298  * contains a mixture of alpha and digits. Generally, this is an
  3299  * error, but may not be for cases like 4th or L5 12s. 3d.
  3300  *
  3301  * Returns: TRUE iff an is error found.
  3302  */
  3303 gboolean mixdigit(const char *checkword)
  3304 {
  3305     gboolean wehaveadigit,wehavealetter,query;
  3306     const char *s,*nondigit;
  3307     wehaveadigit=wehavealetter=query=FALSE;
  3308     for (s=checkword;*s;s=g_utf8_next_char(s))
  3309 	if (g_unichar_isalpha(g_utf8_get_char(s)))
  3310 	    wehavealetter=TRUE;
  3311 	else if (g_unichar_isdigit(g_utf8_get_char(s)))
  3312 	    wehaveadigit=TRUE;
  3313     if (wehaveadigit && wehavealetter)
  3314     {
  3315 	/* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
  3316 	query=TRUE;
  3317 	for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
  3318 	  nondigit=g_utf8_next_char(nondigit))
  3319 	    ;
  3320 	/* digits, ending in st, rd, nd, th of either case */
  3321 	if (!g_ascii_strcasecmp(nondigit,"st") ||
  3322 	  !g_ascii_strcasecmp(nondigit,"rd") ||
  3323 	  !g_ascii_strcasecmp(nondigit,"nd") ||
  3324 	  !g_ascii_strcasecmp(nondigit,"th"))
  3325 	    query=FALSE;
  3326 	if (!g_ascii_strcasecmp(nondigit,"sts") ||
  3327 	  !g_ascii_strcasecmp(nondigit,"rds") ||
  3328 	  !g_ascii_strcasecmp(nondigit,"nds") ||
  3329 	  !g_ascii_strcasecmp(nondigit,"ths"))
  3330 	    query=FALSE;
  3331 	if (!g_ascii_strcasecmp(nondigit,"stly") ||
  3332 	  !g_ascii_strcasecmp(nondigit,"rdly") ||
  3333 	  !g_ascii_strcasecmp(nondigit,"ndly") ||
  3334 	  !g_ascii_strcasecmp(nondigit,"thly"))
  3335 	    query=FALSE;
  3336 	/* digits, ending in l, L, s or d */
  3337 	if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
  3338 	  !strcmp(nondigit,"d"))
  3339 	    query=FALSE;
  3340 	/*
  3341 	 * L at the start of a number, representing Britsh pounds, like L500.
  3342 	 * This is cute. We know the current word is mixed digit. If the first
  3343 	 * letter is L, there must be at least one digit following. If both
  3344 	 * digits and letters follow, we have a genuine error, else we have a
  3345 	 * capital L followed by digits, and we accept that as a non-error.
  3346 	 */
  3347 	if (g_utf8_get_char(checkword)=='L' &&
  3348 	  !mixdigit(g_utf8_next_char(checkword)))
  3349 	    query=FALSE;
  3350     }
  3351     return query;
  3352 }
  3353 
  3354 /*
  3355  * getaword:
  3356  *
  3357  * Extracts the first/next "word" from the line, and returns it.
  3358  * A word is defined as one English word unit--or at least that's the aim.
  3359  * "ptr" is advanced to the position in the line where we will start
  3360  * looking for the next word.
  3361  *
  3362  * Returns: A newly-allocated string.
  3363  */
  3364 gchar *getaword(const char **ptr)
  3365 {
  3366     const char *s,*t;
  3367     GString *word;
  3368     gunichar c,pc;
  3369     word=g_string_new(NULL);
  3370     for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
  3371       !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
  3372       **ptr;*ptr=g_utf8_next_char(*ptr))
  3373     {
  3374 	/* Handle exceptions for footnote markers like [1] */
  3375 	if (g_utf8_get_char(*ptr)=='[')
  3376 	{
  3377 	    g_string_append_c(word,'[');
  3378 	    s=g_utf8_next_char(*ptr);
  3379 	    for (;g_unichar_isdigit(g_utf8_get_char(s));s=g_utf8_next_char(s))
  3380 		g_string_append_unichar(word,g_utf8_get_char(s));
  3381 	    if (g_utf8_get_char(s)==']')
  3382 	    {
  3383 		g_string_append_c(word,']');
  3384 		*ptr=g_utf8_next_char(s);
  3385 		return g_string_free(word,FALSE);
  3386 	    }
  3387 	    else
  3388 		g_string_truncate(word,0);
  3389 	}
  3390     }
  3391     /*
  3392      * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
  3393      * Especially yucky is the case of L1,000
  3394      * This section looks for a pattern of characters including a digit
  3395      * followed by a comma or period followed by one or more digits.
  3396      * If found, it returns this whole pattern as a word; otherwise we discard
  3397      * the results and resume our normal programming.
  3398      */
  3399     s=*ptr;
  3400     for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
  3401       g_unichar_isalpha(g_utf8_get_char(s)) ||
  3402       g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
  3403 	g_string_append_unichar(word,g_utf8_get_char(s));
  3404     if (word->len)
  3405     {
  3406 	for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))
  3407 	{
  3408 	    c=g_utf8_get_char(t);
  3409 	    pc=g_utf8_get_char(g_utf8_prev_char(t));
  3410 	    if ((c=='.' || c==',') && g_unichar_isdigit(pc))
  3411 	    {
  3412 		*ptr=s;
  3413 		return g_string_free(word,FALSE);
  3414 	    }
  3415 	}
  3416     }
  3417     /* we didn't find a punctuated number - do the regular getword thing */
  3418     g_string_truncate(word,0);
  3419     c=g_utf8_get_char(*ptr);
  3420     for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);
  3421       *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))
  3422 	g_string_append_unichar(word,c);
  3423     return g_string_free(word,FALSE);
  3424 }
  3425 
  3426 /*
  3427  * isroman:
  3428  *
  3429  * Is this word a Roman Numeral?
  3430  *
  3431  * It doesn't actually validate that the number is a valid Roman Numeral--for
  3432  * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
  3433  * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
  3434  * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
  3435  * expressions thereof, except when it came to taxes. Allow any number of M,
  3436  * an optional D, an optional CM or CD, any number of optional Cs, an optional
  3437  * XL or an optional XC, an optional IX or IV, an optional V and any number
  3438  * of optional Is.
  3439  */
  3440 gboolean isroman(const char *t)
  3441 {
  3442     const char *s;
  3443     if (!t || !*t)
  3444 	return FALSE;
  3445     s=t;
  3446     while (g_utf8_get_char(t)=='m' && *t)
  3447 	t++;
  3448     if (g_utf8_get_char(t)=='d')
  3449 	t++;
  3450     if (g_str_has_prefix(t,"cm"))
  3451 	t+=2;
  3452     if (g_str_has_prefix(t,"cd"))
  3453 	t+=2;
  3454     while (g_utf8_get_char(t)=='c' && *t)
  3455 	t++;
  3456     if (g_str_has_prefix(t,"xl"))
  3457 	t+=2;
  3458     if (g_str_has_prefix(t,"xc"))
  3459 	t+=2;
  3460     if (g_utf8_get_char(t)=='l')
  3461 	t++;
  3462     while (g_utf8_get_char(t)=='x' && *t)
  3463 	t++;
  3464     if (g_str_has_prefix(t,"ix"))
  3465 	t+=2;
  3466     if (g_str_has_prefix(t,"iv"))
  3467 	t+=2;
  3468     if (g_utf8_get_char(t)=='v')
  3469 	t++;
  3470     while (g_utf8_get_char(t)=='i' && *t)
  3471 	t++;
  3472     return !*t;
  3473 }
  3474 
  3475 /*
  3476  * postprocess_for_DP:
  3477  *
  3478  * Invoked with the -d switch from flgets().
  3479  * It simply "removes" from the line a hard-coded set of common
  3480  * DP-specific tags, so that the line passed to the main routine has
  3481  * been pre-cleaned of DP markup.
  3482  */
  3483 void postprocess_for_DP(char *theline)
  3484 {
  3485     char *s,*t;
  3486     int i;
  3487     if (!*theline) 
  3488 	return;
  3489     for (i=0;*DPmarkup[i];i++)
  3490 	while ((s=strstr(theline,DPmarkup[i])))
  3491 	{
  3492 	    t=s+strlen(DPmarkup[i]);
  3493 	    memmove(s,t,strlen(t)+1);
  3494 	}
  3495 }
  3496 
  3497 /*
  3498  * postprocess_for_HTML:
  3499  *
  3500  * Invoked with the -m switch from flgets().
  3501  * It simply "removes" from the line a hard-coded set of common
  3502  * HTML tags and "replaces" a hard-coded set of common HTML
  3503  * entities, so that the line passed to the main routine has
  3504  * been pre-cleaned of HTML.
  3505  */
  3506 void postprocess_for_HTML(char *theline)
  3507 {
  3508     while (losemarkup(theline))
  3509 	;
  3510     loseentities(theline);
  3511 }
  3512 
  3513 char *losemarkup(char *theline)
  3514 {
  3515     char *s,*t;
  3516     int i;
  3517     s=strchr(theline,'<');
  3518     t=s?strchr(s,'>'):NULL;
  3519     if (!s || !t)
  3520 	return NULL;
  3521     for (i=0;*markup[i];i++)
  3522 	if (tagcomp(g_utf8_next_char(s),markup[i]))
  3523 	{
  3524 	    t=g_utf8_next_char(t);
  3525 	    memmove(s,t,strlen(t)+1);
  3526 	    return s;
  3527 	}
  3528     /* It's an unrecognized <xxx>. */
  3529     return NULL;
  3530 }
  3531 
  3532 void loseentities(char *theline)
  3533 {
  3534     int i;
  3535     gsize nb;
  3536     char *amp,*scolon;
  3537     gchar *s,*t;
  3538     gunichar c;
  3539     GTree *entities=NULL;
  3540     static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
  3541     if (!theline)
  3542     {
  3543 	if (entities)
  3544 	    g_tree_destroy(entities);
  3545 	entities=NULL;
  3546 	if (translit!=(GIConv)-1)
  3547 	    g_iconv_close(translit);
  3548 	translit=(GIConv)-1;
  3549 	if (to_utf8!=(GIConv)-1)
  3550 	    g_iconv_close(to_utf8);
  3551 	to_utf8=(GIConv)-1;
  3552 	return;
  3553     }
  3554     if (!*theline)
  3555 	return;
  3556     if (!entities)
  3557     {
  3558 	entities=g_tree_new((GCompareFunc)strcmp);
  3559 	for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
  3560 	    g_tree_insert(entities,HTMLentities[i].name,
  3561 	      GUINT_TO_POINTER(HTMLentities[i].c));
  3562     }
  3563     if (translit==(GIConv)-1)
  3564 	translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
  3565     if (to_utf8==(GIConv)-1)
  3566 	to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
  3567     while((amp=strchr(theline,'&')))
  3568     {
  3569 	scolon=strchr(amp,';');
  3570 	if (scolon)
  3571 	{
  3572 	    if (amp[1]=='#')
  3573 	    {
  3574 		if (amp+2+strspn(amp+2,"0123456789")==scolon)
  3575 		    c=strtol(amp+2,NULL,10);
  3576 		else if (amp[2]=='x' &&
  3577 		  amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
  3578 		    c=strtol(amp+3,NULL,16);
  3579 	    }
  3580 	    else
  3581 	    {
  3582 		s=g_strndup(amp+1,scolon-(amp+1));
  3583 	        c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
  3584 		g_free(s);
  3585 	    }
  3586 	}
  3587 	else
  3588 	    c=0;
  3589 	if (c)
  3590 	{
  3591 	    theline=amp;
  3592 	    if (c<128 || c>=192 && c<=255)	/* An ISO-8859-1 character */
  3593 		theline+=g_unichar_to_utf8(c,theline);
  3594 	    else
  3595 	    {
  3596 		s=g_malloc(6);
  3597 		nb=g_unichar_to_utf8(c,s);
  3598 		t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
  3599 		g_free(s);
  3600 		s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
  3601 		g_free(t);
  3602 		memcpy(theline,s,nb);
  3603 		g_free(s);
  3604 		theline+=nb;
  3605 	    }
  3606 	    memmove(theline,g_utf8_next_char(scolon),
  3607 	      strlen(g_utf8_next_char(scolon))+1);
  3608 	}
  3609 	else
  3610 	    theline=g_utf8_next_char(amp);
  3611     }
  3612 }
  3613 
  3614 gboolean tagcomp(const char *strin,const char *basetag)
  3615 {
  3616     gboolean retval;
  3617     gchar *s,*t;
  3618     if (g_utf8_get_char(strin)=='/')
  3619 	t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
  3620     else
  3621 	t=g_utf8_casefold(strin,-1);
  3622     s=g_utf8_casefold(basetag,-1);
  3623     retval=g_str_has_prefix(t,s);
  3624     g_free(s);
  3625     g_free(t);
  3626     return retval;
  3627 }
  3628 
  3629 void proghelp(GOptionContext *context)
  3630 {
  3631     gchar *help;
  3632     fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
  3633     fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
  3634     fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
  3635     fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
  3636       "For details, read the file COPYING.\n",stderr);
  3637     fputs("This is Free Software; "
  3638       "you may redistribute it under certain conditions (GPL);\n",stderr);
  3639     fputs("read the file COPYING for details.\n\n",stderr);
  3640     help=g_option_context_get_help(context,TRUE,NULL);
  3641     fputs(help,stderr);
  3642     g_free(help);
  3643     fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
  3644     fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
  3645       "non-ASCII\n",stderr);
  3646     fputs("characters like accented letters, "
  3647       "lines longer than 75 or shorter than 55,\n",stderr);
  3648     fputs("unbalanced quotes or brackets, "
  3649       "a variety of badly formatted punctuation, \n",stderr);
  3650     fputs("HTML tags, some likely typos. "
  3651       "It is NOT a substitute for human judgement.\n",stderr);
  3652     fputs("\n",stderr);
  3653 }