bookloupe/bookloupe.c
author ali <ali@juiblex.co.uk>
Wed Oct 30 17:22:05 2013 +0000 (2013-10-30)
changeset 213 2995a39f4dba
parent 209 70cc629ec1e0
permissions -rw-r--r--
Added tag 2.0.67 for changeset aece0899b1d3
     1 /*************************************************************************/
     2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
     3 /*									 */
     4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>			 */
     5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>			 */
     6 /*									 */
     7 /* This program is free software; you can redistribute it and/or modify  */
     8 /* it under the terms of the GNU General Public License as published by  */
     9 /* the Free Software Foundation; either version 2 of the License, or     */
    10 /* (at your option) any later version.					 */
    11 /*									 */
    12 /* This program is distributed in the hope that it will be useful,       */
    13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of	 */
    14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the		 */
    15 /* GNU General Public License for more details.				 */
    16 /*									 */
    17 /* You should have received a copy of the GNU General Public License	 */
    18 /* along with this program. If not, see <http://www.gnu.org/licenses/>.	 */
    19 /*************************************************************************/
    20 
    21 #include <stdio.h>
    22 #include <stdlib.h>
    23 #include <string.h>
    24 #include <ctype.h>
    25 #ifdef __WIN32__
    26 #include <windows.h>
    27 #endif
    28 #include <glib.h>
    29 #include <bl/bl.h>
    30 #include "bookloupe.h"
    31 #include "counters.h"
    32 #include "pending.h"
    33 #include "HTMLentities.h"
    34 
/*
 * Name of the character set selected through set_charset(), which stores
 * either NULL (auto), the canonical string "UTF-8" for any UNICODE alias,
 * or a copy of the requested name.
 */
gchar *charset;		/* Or NULL for auto (ISO_8859-1/ASCII or UNICODE) */
/*
 * Conversion descriptor opened by set_charset() with g_iconv_open() from
 * "UTF-8" to charset. It stays (GIConv)-1 when charset is NULL or "UTF-8".
 */
GIConv charset_validator=(GIConv)-1;

/*
 * NOTE(review): not referenced in this part of the file; presumably the
 * previously processed line - confirm against its users.
 */
gchar *prevline;
    39 
/*
 * Common typos.
 *
 * All entries are lower case; the list is terminated by an empty string.
 */
char *typo[] = {
    "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
    "nad", "te", "ig", "acn",  "ahve", "alot", "anbd", "andt", "awya", "aywa",
    "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
    "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
    "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
    "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
    "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
    "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
    "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
    "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
    "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
    "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
    "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
    "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
    "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
    "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
    "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
    "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
    "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
    "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
    "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
    "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
    "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
    "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
    "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
    "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
    "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
    "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
    "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
    "se", ""
};
    73 
/*
 * User-defined typos, keyed by the typo string. Built by
 * read_user_scannos() and released in main(); NULL until then.
 */
GTree *usertypo;
    75 
/*
 * Common abbreviations and other OK words not to query as typos.
 * The list is terminated by an empty string.
 */
char *okword[] = {
    "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
    "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
    "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
    "outbid", "outbids", "frostbite", "frostbitten", ""
};
    83 
/*
 * Common abbreviations that cause otherwise unexplained periods.
 * The list is terminated by an empty string.
 */
char *abbrev[] = {
    "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
    "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
};
    89 
/*
 * Two-letter combinations that rarely if ever start words,
 * but are common scannos or otherwise common letter combinations.
 * The list is terminated by an empty string.
 */
char *nostart[] = {
    "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
};
    97 
/*
 * Two-letter combinations that rarely if ever end words,
 * but are common scannos or otherwise common letter combinations.
 * The list is terminated by an empty string.
 */
char *noend[] = {
    "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
    "sw", "gr", "sl", "cl", "iy", ""
};
   106 
/*
 * Lower-case HTML element names, terminated by an empty string.
 * NOTE(review): presumably the tags recognised when looking for markup
 * in the text - confirm against the code that scans this list.
 */
char *markup[] = {
    "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
    "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
    "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
    "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
};
   113 
/*
 * Markup strings terminated by an empty string.
 * NOTE(review): presumably the DP-specific markup that the --dp switch
 * ("Ignore DP-specific markup") refers to - confirm against its users.
 */
char *DPmarkup[] = {
    "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
};
   117 
/*
 * Lower-case word list terminated by an empty string.
 * NOTE(review): presumably words that are not expected to be followed
 * directly by a comma - confirm against the code that scans this list.
 * NOTE(review): "mrs" appears twice; the duplicate looks unintentional.
 */
char *nocomma[] = {
    "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
    "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
    "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
    "during", "let", "toward", "among", ""
};
   124 
/*
 * Lower-case word list terminated by an empty string.
 * NOTE(review): presumably words that are not expected to be followed
 * directly by a period - confirm against the code that scans this list.
 */
char *noperiod[] = {
    "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
    "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
    "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
    "among", "those", "into", "whom", "having", "thence", ""
}; 
   131 
/*
 * Program switches, indexed by the *_SWITCH constants. main() sets the
 * defaults, then parse_config_file() and parse_options() override them.
 */
gboolean pswit[SWITNO];  /* program switches */
/*
 * Value of the "charset" option as read from the config file or command
 * line; NULL means auto. parse_options() passes it to set_charset() and
 * then frees it.
 */
gchar *opt_charset;

/*
 * Set by the Gutcheck compatibility options -t and -x; parse_options()
 * turns them into toggles of the typo and paranoid switches.
 */
gboolean typo_compat,paranoid_compat;
   136 
/*
 * Main option table, shared by the command line parser and the
 * configuration file code.
 *
 * Every switch comes as a pair - a plain name and a "no-" prefixed name
 * with G_OPTION_FLAG_REVERSE - storing into the same pswit[] slot. One
 * member of each pair carries G_OPTION_FLAG_HIDDEN so that it is left
 * out of the help output. The configuration file code skips the "no-"
 * entries and only accepts G_OPTION_ARG_NONE and G_OPTION_ARG_STRING
 * entries in this table.
 */
static GOptionEntry options[]={
    { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
      "Ignore DP-specific markup", NULL },
    { "no-dp", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
      G_OPTION_ARG_NONE, pswit+DP_SWITCH,
      "Don't ignore DP-specific markup", NULL },
    { "echo", 0, G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
      "Echo queried line", NULL },
    { "no-echo", 'e', G_OPTION_FLAG_REVERSE,
      G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
      "Don't echo queried line", NULL },
    { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
      "Check single quotes", NULL },
    { "no-squote", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
      G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
      "Don't check single quotes", NULL },
    { "typo", 0, 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
      "Check common typos", NULL },
    { "no-typo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
      G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
      "Don't check common typos", NULL },
    { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
      "Require closure of quotes on every paragraph", NULL },
    { "no-qpara", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
      G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
      "Don't require closure of quotes on every paragraph", NULL },
    { "paranoid", 0, G_OPTION_FLAG_HIDDEN,
      G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
      "Enable paranoid querying of everything", NULL },
    { "no-paranoid", 0, G_OPTION_FLAG_REVERSE,
      G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
      "Disable paranoid querying of everything", NULL },
    { "line-end", 0, G_OPTION_FLAG_HIDDEN,
      G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
      "Enable line end checking", NULL },
    { "no-line-end", 'l', G_OPTION_FLAG_REVERSE,
      G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
      "Disable line end checking", NULL },
    { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
      "Overview: just show counts", NULL },
    { "no-overview", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
      G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
      "Show individual warnings", NULL },
    { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
      "Output errors to stdout instead of stderr", NULL },
    { "no-stdout", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
      G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
      "Output errors to stderr instead of stdout", NULL },
    { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
      "Echo header fields", NULL },
    { "no-header", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
      G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
      "Don't echo header fields", NULL },
    { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
      "Ignore markup in < >", NULL },
    { "no-markup", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
      G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
      "No special handling for markup in < >", NULL },
    { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
      "Use file of user-defined typos", NULL },
    { "no-usertypo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
      G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
      "Ignore file of user-defined typos", NULL },
    { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
      "Verbose - list everything", NULL },
    { "no-verbose", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
      G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
      "Switch off verbose mode", NULL },
    { "charset", 0, 0, G_OPTION_ARG_STRING, &opt_charset,
      "Set of characters valid for this ebook", "NAME" },
    { NULL }
};
   209 
/*
 * Options relating to configuration which make no sense from inside
 * a configuration file.
 *
 * They are registered for the command line by parse_options() but, not
 * being part of options[], are neither read from nor written to the
 * configuration file.
 */

static GOptionEntry config_options[]={
    { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
      "Defaults for use on www upload", NULL },
    { "dump-config", 0, 0, G_OPTION_ARG_NONE, pswit+DUMP_CONFIG_SWITCH,
      "Dump current config settings", NULL },
    { NULL }
};
   222 
/*
 * Gutcheck compatibility options, registered by parse_options() as a
 * separate option group. They only set typo_compat/paranoid_compat;
 * parse_options() applies the actual toggling after parsing.
 */
static GOptionEntry compatibility_options[]={
    { "toggle-typo", 't', 0, G_OPTION_ARG_NONE, &typo_compat,
      "Toggle checking for common typos", NULL },
    { "toggle-relaxed", 'x', 0, G_OPTION_ARG_NONE, &paranoid_compat,
      "Toggle both paranoid mode and common typos", NULL },
    { NULL }
};
   230 
/*
 * Running totals. When the overview switch is set, main() prints every
 * counter below except cnt_spacend, and sums the cnt_* query counters
 * (again without cnt_spacend) into the "TOTAL QUERIES" line.
 */
long cnt_quote;		/* for overview mode, count of quote queries */
long cnt_brack;		/* for overview mode, count of brackets queries */
long cnt_bin;		/* for overview mode, count of non-ASCII queries */
long cnt_odd;		/* for overview mode, count of odd character queries */
long cnt_long;		/* for overview mode, count of long line errors */
long cnt_short;		/* for overview mode, count of short line queries */
long cnt_punct;		/* for overview mode,
			   count of punctuation and spacing queries */
long cnt_dash;		/* for overview mode, count of dash-related queries */
long cnt_word;		/* for overview mode, count of word queries */
long cnt_html;		/* for overview mode, count of html queries */
long cnt_lineend;	/* for overview mode, count of line-end queries */
long cnt_spacend;	/* count of lines with space at end */
long linecnt;		/* count of total lines in the file */
long checked_linecnt;	/* count of lines actually checked */
   246 
   247 void proghelp(GOptionContext *context);
   248 void procfile(const char *);
   249 
/*
 * Directory part of argv[0], set in main(). Used as one of the places
 * searched for bookloupe.ini and for the user typo files.
 */
gchar *running_from;
   251 
   252 gboolean mixdigit(const char *);
   253 gchar *getaword(const char *,const char **);
   254 char *flgets(char **,long,int);
   255 void postprocess_for_HTML(char *);
   256 char *linehasmarkup(char *);
   257 char *losemarkup(char *);
   258 gboolean tagcomp(const char *,const char *);
   259 void loseentities(char *);
   260 gboolean isroman(const char *);
   261 void postprocess_for_DP(char *);
   262 void print_as_windows_1252(const char *string);
   263 void print_as_utf_8(const char *string);
   264 
/*
 * NOTE(review): not used in this part of the file; presumably trees of
 * words/periods already queried - confirm against their users.
 */
GTree *qword,*qperiod;

#ifdef __WIN32__
/* Console output code page saved by main() and restored at exit. */
UINT saved_cp;
#endif
   270 
   271 gboolean set_charset(const char *name,GError **err)
   272 {
   273     /* The various UNICODE encodings all share the same character set. */
   274     const char *unicode_aliases[]={ "UCS-2", "UCS-2BE", "UCS-2LE", "UCS-4",
   275       "UCS-4BE", "UCS-4LE", "UCS2", "UCS4", "UNICODE", "UNICODEBIG",
   276       "UNICODELITTLE", "UTF-7", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE",
   277       "UTF-32", "UTF-32BE", "UTF-32LE", "UTF7", "UTF8", "UTF16", "UTF16BE",
   278       "UTF16LE", "UTF32", "UTF32BE", "UTF32LE" };
   279     int i;
   280     if (charset)
   281 	g_free(charset);
   282     if (charset_validator!=(GIConv)-1)
   283 	g_iconv_close(charset_validator);
   284     if (!name || !g_strcasecmp(name,"auto"))
   285     {
   286 	charset=NULL;
   287 	charset_validator=(GIConv)-1;
   288 	return TRUE;
   289     }
   290     else
   291 	charset=g_strdup(name);
   292     for(i=0;i<G_N_ELEMENTS(unicode_aliases);i++)
   293 	if (!g_strcasecmp(charset,unicode_aliases[i]))
   294 	{
   295 	    g_free(charset);
   296 	    charset=g_strdup("UTF-8");
   297 	    break;
   298 	}
   299     if (!strcmp(charset,"UTF-8"))
   300 	charset_validator=(GIConv)-1;
   301     else
   302     {
   303 	charset_validator=g_iconv_open(charset,"UTF-8");
   304 	if (charset_validator==(GIConv)-1)
   305 	{
   306 	    g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_NO_CONVERSION,
   307 	      "Unknown character set \"%s\"",charset);
   308 	    return FALSE;
   309 	}
   310     }
   311     return TRUE;
   312 }
   313 
/*
 * The loaded configuration file, or NULL if none was found. Set by
 * parse_config_file() (or created by dump_config()) and freed in main().
 */
GKeyFile *config;
   315 
   316 void config_file_update(GKeyFile *kf)
   317 {
   318     int i;
   319     const char *s;
   320     gboolean sw;
   321     for(i=0;options[i].long_name;i++)
   322     {
   323 	if (g_str_has_prefix(options[i].long_name,"no-"))
   324 	    continue;
   325 	if (options[i].arg==G_OPTION_ARG_NONE)
   326 	{
   327 	    sw=*(gboolean *)options[i].arg_data;
   328 	    if (options[i].flags&G_OPTION_FLAG_REVERSE)
   329 		sw=!sw;
   330 	    g_key_file_set_boolean(kf,"options",options[i].long_name,sw);
   331 	}
   332 	else if (options[i].arg==G_OPTION_ARG_STRING)
   333 	{
   334 	    s=*(gchar **)options[i].arg_data;
   335 	    if (!s)
   336 		s="auto";
   337 	    g_key_file_set_string(kf,"options",options[i].long_name,s);
   338 	}
   339 	else
   340 	    g_assert_not_reached();
   341     }
   342 }
   343 
   344 void config_file_add_comments(GKeyFile *kf)
   345 {
   346     int i;
   347     gchar *comment;
   348     g_key_file_set_comment(kf,NULL,NULL," Default configuration for bookloupe",
   349       NULL);
   350     for(i=0;options[i].long_name;i++)
   351     {
   352 	if (g_str_has_prefix(options[i].long_name,"no-"))
   353 	    continue;
   354 	comment=g_strconcat(" ",options[i].description,NULL);
   355 	g_key_file_set_comment(kf,"options",options[i].long_name,comment,NULL);
   356 	g_free(comment);
   357     }
   358 }
   359 
   360 void dump_config(void)
   361 {
   362     gchar *s;
   363     if (config)
   364 	config_file_update(config);
   365     else
   366     {
   367 	config=g_key_file_new();
   368 	config_file_update(config);
   369 	config_file_add_comments(config);
   370     }
   371     s=g_key_file_to_data(config,NULL,NULL);
   372     if (s)
   373 	g_print("%s",s);
   374     g_free(s);
   375 }
   376 
   377 GKeyFile *read_config_file(gchar **full_path)
   378 {
   379     int i;
   380     GError *err=NULL;
   381     gchar **search_dirs;
   382     gchar *path;
   383     const char *search_path;
   384     GKeyFile *kf;
   385     kf=g_key_file_new();
   386     search_path=g_getenv("BOOKLOUPE_CONFIG_PATH");
   387     if (search_path)
   388     {
   389 #ifdef __WIN32__
   390 	search_dirs=g_strsplit(search_path,";",0);
   391 #else
   392 	search_dirs=g_strsplit(search_path,":",0);
   393 #endif
   394     }
   395     else
   396     {
   397 	search_dirs=g_new(gchar *,4);
   398 	search_dirs[0]=g_get_current_dir();
   399 	search_dirs[1]=g_strdup(running_from);
   400 	search_dirs[2]=g_strdup(g_get_user_config_dir());
   401 	search_dirs[3]=NULL;
   402     }
   403     for(i=0;search_dirs[i];i++)
   404     {
   405 	path=g_build_filename(search_dirs[i],"bookloupe.ini",NULL);
   406 	if (g_key_file_load_from_file(kf,path,
   407 	  G_KEY_FILE_KEEP_COMMENTS|G_KEY_FILE_KEEP_TRANSLATIONS,&err))
   408 	    break;
   409 	if (!g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   410 	{
   411 	    g_printerr("Bookloupe: Error reading %s\n",path);
   412 	    g_printerr("%s\n",err->message);
   413 	    exit(1);
   414 	}
   415 	g_clear_error(&err);
   416 	g_free(path);
   417 	path=NULL;
   418     }
   419     if (!search_dirs[i])
   420     {
   421 	g_key_file_free(kf);
   422 	kf=NULL;
   423     }
   424     g_strfreev(search_dirs);
   425     if (full_path && kf)
   426 	*full_path=path;
   427     else
   428 	g_free(path);
   429     return kf;
   430 }
   431 
   432 void parse_config_file(void)
   433 {
   434     int i,j;
   435     gchar *path,*s;
   436     gchar **keys;
   437     gboolean sw;
   438     GError *err=NULL;
   439     config=read_config_file(&path);
   440     if (config)
   441 	keys=g_key_file_get_keys(config,"options",NULL,NULL);
   442     else
   443 	keys=NULL;
   444     if (keys)
   445     {
   446 	for(i=0;keys[i];i++)
   447 	{
   448 	    for(j=0;options[j].long_name;j++)
   449 	    {
   450 		if (g_str_has_prefix(options[j].long_name,"no-"))
   451 		    continue;
   452 		else if (!strcmp(keys[i],options[j].long_name))
   453 		{
   454 		    if (options[j].arg==G_OPTION_ARG_NONE)
   455 		    {
   456 			sw=g_key_file_get_boolean(config,"options",keys[i],
   457 			  &err);
   458 			if (err)
   459 			{
   460 			    g_printerr("Bookloupe: %s: options.%s: %s\n",
   461 			      path,keys[i],err->message);
   462 			    g_clear_error(&err);
   463 			}
   464 			else
   465 			{
   466 			    if (options[j].flags&G_OPTION_FLAG_REVERSE)
   467 				sw=!sw;
   468 			    *(gboolean *)options[j].arg_data=sw;
   469 			}
   470 			break;
   471 		    }
   472 		    else if (options[j].arg==G_OPTION_ARG_STRING)
   473 		    {
   474 			s=g_key_file_get_string(config,"options",keys[i],
   475 			  &err);
   476 			if (err)
   477 			{
   478 			    g_printerr("Bookloupe: %s: options.%s: %s\n",
   479 			      path,keys[i],err->message);
   480 			    g_clear_error(&err);
   481 			}
   482 			else
   483 			{
   484 			    g_free(*(gchar **)options[j].arg_data);
   485 			    if (!g_strcmp0(s,"auto"))
   486 			    {
   487 				*(gchar **)options[j].arg_data=NULL;
   488 				g_free(s);
   489 			    }
   490 			    else
   491 				*(gchar **)options[j].arg_data=s;
   492 			}
   493 			break;
   494 		    }
   495 		    else
   496 			g_assert_not_reached();
   497 		}
   498 	    }
   499 	    if (!options[j].long_name)
   500 		g_printerr("Bookloupe: %s: Unknown option \"%s\" ignored\n",
   501 		  path,keys[i]);
   502 	}
   503 	g_strfreev(keys);
   504     }
   505     if (config)
   506 	g_free(path);
   507 }
   508 
   509 void parse_options(int *argc,char ***argv)
   510 {
   511     GError *err=NULL;
   512     GOptionContext *context;
   513     GOptionGroup *compatibility;
   514     context=g_option_context_new(
   515       "file - look for errors in Project Gutenberg(TM) etexts");
   516     g_option_context_add_main_entries(context,options,NULL);
   517     g_option_context_add_main_entries(context,config_options,NULL);
   518     compatibility=g_option_group_new("compatibility",
   519       "Options for Compatibility with Gutcheck:",
   520       "Show compatibility options",NULL,NULL);
   521     g_option_group_add_entries(compatibility,compatibility_options);
   522     g_option_context_add_group(context,compatibility);
   523     g_option_context_set_description(context,
   524       "For simplicity, only the switch options which reverse the\n"
   525       "default configuration are listed. In most cases, both vanilla\n"
   526       "and \"no-\" prefixed versions are available for use.");
   527     if (!g_option_context_parse(context,argc,argv,&err))
   528     {
   529 	g_printerr("Bookloupe: %s\n",err->message);
   530 	g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
   531 	exit(1);
   532     }
   533     if (typo_compat)
   534 	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
   535     if (paranoid_compat)
   536     {
   537 	pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
   538 	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
   539     }
   540     /*
   541      * Web uploads - for the moment, this is really just a placeholder
   542      * until we decide what processing we really want to do on web uploads
   543      */
   544     if (pswit[WEB_SWITCH])
   545     {
   546 	/* specific override for web uploads */
   547 	pswit[ECHO_SWITCH]=TRUE;
   548 	pswit[SQUOTE_SWITCH]=FALSE;
   549 	pswit[TYPO_SWITCH]=TRUE;
   550 	pswit[QPARA_SWITCH]=FALSE;
   551 	pswit[PARANOID_SWITCH]=TRUE;
   552 	pswit[LINE_END_SWITCH]=FALSE;
   553 	pswit[OVERVIEW_SWITCH]=FALSE;
   554 	pswit[STDOUT_SWITCH]=FALSE;
   555 	pswit[HEADER_SWITCH]=TRUE;
   556 	pswit[VERBOSE_SWITCH]=FALSE;
   557 	pswit[MARKUP_SWITCH]=FALSE;
   558 	pswit[USERTYPO_SWITCH]=FALSE;
   559 	pswit[DP_SWITCH]=FALSE;
   560     }
   561     if (opt_charset && !set_charset(opt_charset,&err))
   562     {
   563 	g_printerr("%s\n",err->message);
   564 	exit(1);
   565     }
   566     if (pswit[DUMP_CONFIG_SWITCH])
   567     {
   568 	dump_config();
   569 	exit(0);
   570     }
   571     g_free(opt_charset);
   572     opt_charset=NULL;
   573     if (pswit[OVERVIEW_SWITCH])
   574 	/* just print summary; don't echo */
   575 	pswit[ECHO_SWITCH]=FALSE;
   576     if (*argc<2)
   577     {
   578 	proghelp(context);
   579 	exit(1);
   580     }
   581     g_option_context_free(context);
   582 }
   583 
   584 /*
   585  * read_user_scannos:
   586  *
   587  * Read in the user-defined stealth scanno list.
   588  */
   589 void read_user_scannos(void)
   590 {
   591     GError *err=NULL;
   592     gchar *usertypo_file;
   593     gboolean okay;
   594     int i;
   595     gsize len,nb;
   596     gchar *contents,*utf8,**lines;
   597     usertypo_file=g_strdup("bookloupe.typ");
   598     okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   599     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   600     {
   601 	g_clear_error(&err);
   602 	g_free(usertypo_file);
   603 	usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
   604 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   605     }
   606     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   607     {
   608 	g_clear_error(&err);
   609 	g_free(usertypo_file);
   610 	usertypo_file=g_strdup("gutcheck.typ");
   611 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   612     }
   613     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   614     {
   615 	g_clear_error(&err);
   616 	g_free(usertypo_file);
   617 	usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
   618 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   619     }
   620     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   621     {
   622 	g_free(usertypo_file);
   623 	g_print("   --> I couldn't find bookloupe.typ "
   624 	  "-- proceeding without user typos.\n");
   625 	return;
   626     }
   627     else if (!okay)
   628     {
   629 	fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
   630 	g_free(usertypo_file);
   631 	g_clear_error(&err);
   632 	exit(1);
   633     }
   634     if (g_utf8_validate(contents,len,NULL))
   635     {
   636 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
   637 	if (!charset)
   638 	    (void)set_charset("UNICODE",NULL);
   639     }
   640     else
   641 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
   642     g_free(contents);
   643     lines=g_strsplit_set(utf8,"\r\n",0);
   644     g_free(utf8);
   645     usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
   646     for (i=0;lines[i];i++)
   647 	if (*(unsigned char *)lines[i]>'!')
   648 	    g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
   649 	else
   650 	    g_free(lines[i]);
   651     g_free(lines);
   652 }
   653 
   654 /*
   655  * read_etext:
   656  *
   657  * Read an etext returning a newly allocated string containing the file
   658  * contents or NULL on error.
   659  */
   660 gchar *read_etext(const char *filename,GError **err)
   661 {
   662     GError *tmp_err=NULL;
   663     gchar *contents,*utf8;
   664     gsize len,bytes_read,bytes_written;
   665     int i,line,col;
   666     if (!g_file_get_contents(filename,&contents,&len,err))
   667 	return NULL;
   668     if (g_utf8_validate(contents,len,NULL))
   669     {
   670 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
   671 	g_set_print_handler(print_as_utf_8);
   672 #ifdef __WIN32__
   673 	SetConsoleOutputCP(CP_UTF8);
   674 #endif
   675     }
   676     else
   677     {
   678 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,
   679 	  &bytes_written,&tmp_err);
   680 	if (g_error_matches(tmp_err,G_CONVERT_ERROR,
   681 	  G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
   682 	{
   683 	    line=col=1;
   684 	    for(i=0;i<bytes_read;i++)
   685 		if (contents[i]=='\n')
   686 		{
   687 		    line++;
   688 		    col=1;
   689 		}
   690 		else if (contents[i]!='\r')
   691 		    col++;
   692 	    g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
   693 	      "Input conversion failed. Byte %d at line %d, column %d is not a "
   694 	      "valid Windows-1252 character",
   695 	      ((unsigned char *)contents)[bytes_read],line,col);
   696 	}
   697 	else if (tmp_err)
   698 	    g_propagate_error(err,tmp_err);
   699 	g_set_print_handler(print_as_windows_1252);
   700 #ifdef __WIN32__
   701 	SetConsoleOutputCP(1252);
   702 #endif
   703     }
   704     g_free(contents);
   705     return utf8;
   706 }
   707 
/*
 * cleanup_on_exit:
 *
 * Registered with atexit() by main() on Windows: restore the console
 * output code page saved in saved_cp. Does nothing on other platforms.
 */
void cleanup_on_exit(void)
{
#ifdef __WIN32__
    SetConsoleOutputCP(saved_cp);
#endif
}
   714 
   715 int main(int argc,char **argv)
   716 {
   717 #ifdef __WIN32__
   718     atexit(cleanup_on_exit);
   719     saved_cp=GetConsoleOutputCP();
   720 #endif
   721     running_from=g_path_get_dirname(argv[0]);
   722     /* Paranoid checking is turned OFF, not on, by its switch */
   723     pswit[PARANOID_SWITCH]=TRUE;
   724     /* if running in paranoid mode, typo checks default to enabled */
   725     pswit[TYPO_SWITCH]=TRUE;
   726     /* Line-end checking is turned OFF, not on, by its switch */
   727     pswit[LINE_END_SWITCH]=TRUE;
   728     /* Echoing is turned OFF, not on, by its switch */
   729     pswit[ECHO_SWITCH]=TRUE;
   730     parse_config_file();
   731     parse_options(&argc,&argv);
   732     if (pswit[USERTYPO_SWITCH])
   733 	read_user_scannos();
   734     fprintf(stderr,"bookloupe: Check and report on an e-text\n");
   735     procfile(argv[1]);
   736     if (pswit[OVERVIEW_SWITCH])
   737     {
   738 	g_print("    Checked %ld lines of %ld (head+foot = %ld)\n\n",
   739 	  checked_linecnt,linecnt,linecnt-checked_linecnt);
   740 	g_print("    --------------- Queries found --------------\n");
   741 	if (cnt_long)
   742 	    g_print("    Long lines:		    %14ld\n",cnt_long);
   743 	if (cnt_short)
   744 	    g_print("    Short lines:		   %14ld\n",cnt_short);
   745 	if (cnt_lineend)
   746 	    g_print("    Line-end problems:	     %14ld\n",cnt_lineend);
   747 	if (cnt_word)
   748 	    g_print("    Common typos:		  %14ld\n",cnt_word);
   749 	if (cnt_quote)
   750 	    g_print("    Unmatched quotes:	      %14ld\n",cnt_quote);
   751 	if (cnt_brack)
   752 	    g_print("    Unmatched brackets:	    %14ld\n",cnt_brack);
   753 	if (cnt_bin)
   754 	    g_print("    Non-ASCII characters:	  %14ld\n",cnt_bin);
   755 	if (cnt_odd)
   756 	    g_print("    Proofing characters:	   %14ld\n",cnt_odd);
   757 	if (cnt_punct)
   758 	    g_print("    Punctuation & spacing queries: %14ld\n",cnt_punct);
   759 	if (cnt_dash)
   760 	    g_print("    Non-standard dashes:	   %14ld\n",cnt_dash);
   761 	if (cnt_html)
   762 	    g_print("    Possible HTML tags:	    %14ld\n",cnt_html);
   763 	g_print("\n");
   764 	g_print("    TOTAL QUERIES		  %14ld\n",
   765 	  cnt_quote+cnt_brack+cnt_bin+cnt_odd+cnt_long+cnt_short+cnt_punct+
   766 	  cnt_dash+cnt_word+cnt_html+cnt_lineend);
   767     }
   768     g_free(running_from);
   769     if (usertypo)
   770 	g_tree_unref(usertypo);
   771     set_charset(NULL,NULL);
   772     if (config)
   773 	g_key_file_free(config);
   774     return 0;
   775 }
   776 
   777 void count_dashes(const char *line,const char *dash,
   778   struct dash_results *results)
   779 {
   780     int i;
   781     gchar **tokens;
   782     gunichar pc,nc;
   783     gboolean spaced=FALSE,unspaced=FALSE,spaced2=FALSE;
   784     if (!*line)
   785 	return;
   786     tokens=g_strsplit(line,dash,0);
   787     if (tokens[1])
   788 	results->base++;
   789     for(i=1;tokens[i];i++)
   790     {
   791 	pc=g_utf8_get_char(g_utf8_prev_char(tokens[i-1]+strlen(tokens[i-1])));
   792 	nc=g_utf8_get_char(tokens[i]);
   793 	if (g_unichar_isspace(pc) || g_unichar_isspace(nc))
   794 	    spaced=TRUE;
   795 	if (g_unichar_isspace(pc) && g_unichar_isspace(nc))
   796 	    spaced2=TRUE;
   797 	else if (!g_unichar_isspace(pc) && !g_unichar_isspace(nc))
   798 	    unspaced=TRUE;
   799     }
   800     if (spaced)
   801 	results->space++;
   802     if (spaced2)
   803 	/* count of lines with em-dashes with spaces both sides */
   804 	results->non_PG_space++;
   805     if (unspaced)
   806 	/* count of lines with PG-type em-dashes with no spaces */
   807 	results->PG_space++;
   808     g_strfreev(tokens);
   809 }
   810 
   811 /*
   812  * first_pass:
   813  *
   814  * Run a first pass - verify that it's a valid PG
   815  * file, decide whether to report some things that
   816  * occur many times in the text like long or short
   817  * lines, non-standard dashes, etc.
   818  */
   819 struct first_pass_results *first_pass(const char *etext)
   820 {
   821     gunichar laststart=CHAR_SPACE;
   822     const char *s;
   823     gchar *lc_line;
   824     int i,j,lbytes,llen;
   825     gchar **lines;
   826     unsigned int lastlen=0,lastblen=0;
   827     long spline=0,nspline=0;
   828     static struct first_pass_results results={0};
   829     struct dash_results tmp_dash_results;
   830     gchar *inword;
   831     QuoteClass qc;
   832     lines=g_strsplit(etext,"\n",0);
   833     if (!lines[0])
   834     {
   835 	/* An empty etext has no terminators */
   836 	results.newlines=DOS_NEWLINES;
   837     }
   838     else if (!lines[1])
   839     {
   840 	/*
   841 	 * If there are no LFs, we don't have UNIX-style
   842 	 * terminators, but we might have OS9-style ones.
   843 	 */
   844 	results.newlines=OS9_NEWLINES;
   845 	g_strfreev(lines);
   846 	lines=g_strsplit(etext,"\r",0);
   847 	if (!lines[0] || !lines[1])
   848 	    /* Looks like we don't have any terminators at all */
   849 	    results.newlines=DOS_NEWLINES;
   850     }
   851     else
   852     {
   853 	/* We might have UNIX-style terminators */
   854 	results.newlines=UNIX_NEWLINES;
   855     }
   856     for (j=0;lines[j];j++)
   857     {
   858 	lbytes=strlen(lines[j]);
   859 	if (lbytes>0 && lines[j][lbytes-1]=='\r')
   860 	{
   861 	    results.newlines=DOS_NEWLINES;
   862 	    do
   863 	    {
   864 		lines[j][--lbytes]='\0';
   865 	    } while (lbytes>0 && lines[j][lbytes-1]=='\r');
   866 	}
   867 	llen=g_utf8_strlen(lines[j],lbytes);
   868 	linecnt++;
   869 	if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
   870 	  (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
   871 	{
   872 	    if (spline)
   873 		g_print("   --> Duplicate header?\n");
   874 	    spline=linecnt+1;   /* first line of non-header text, that is */
   875 	}
   876 	if (!strncmp(lines[j],"*** START",9) &&
   877 	  strstr(lines[j],"PROJECT GUTENBERG"))
   878 	{
   879 	    if (nspline)
   880 		g_print("   --> Duplicate header?\n");
   881 	    nspline=linecnt+1;   /* first line of non-header text, that is */
   882 	}
   883 	if (spline || nspline)
   884 	{
   885 	    lc_line=g_utf8_strdown(lines[j],lbytes);
   886 	    if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
   887 	    {
   888 		if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
   889 		{
   890 		    if (results.footerline)
   891 		    {
   892 			/* it's an old-form header - we can detect duplicates */
   893 			if (!nspline)
   894 			    g_print("   --> Duplicate footer?\n");
   895 		    }
   896 		    else
   897 			results.footerline=linecnt;
   898 		}
   899 	    }
   900 	    g_free(lc_line);
   901 	}
   902 	if (spline)
   903 	    results.firstline=spline;
   904 	if (nspline)
   905 	    results.firstline=nspline;  /* override with new */
   906 	if (results.footerline)
   907 	    continue;    /* don't count the boilerplate in the footer */
   908 	results.totlen+=llen;
   909 	for (s=lines[j];*s;s=g_utf8_next_char(s))
   910 	{
   911 	    if (g_utf8_get_char(s)>127)
   912 		results.binlen++;
   913 	    if (g_unichar_isalpha(g_utf8_get_char(s)))
   914 		results.alphalen++;
   915 	    if (s>lines[j])
   916 	    {
   917 		if (CHAR_IS_DQUOTE(g_utf8_get_char(s)))
   918 		    qc=QUOTE_CLASS(g_utf8_get_char(s));
   919 		else
   920 		    qc=INVALID_QUOTE;
   921 		if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) &&
   922 		  g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
   923 		    results.endquote_count++;
   924 	    }
   925 	}
   926 	if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
   927 	  lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
   928 	    results.shortline++;
   929 	if (lbytes>0 &&
   930 	  g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
   931 	    cnt_spacend++;
   932 	if (strstr(lines[j],".,"))
   933 	    results.dotcomma++;
   934 	/* only count ast lines for ignoring purposes where there is */
   935 	/* locase text on the line */
   936 	if (strchr(lines[j],'*'))
   937 	{
   938 	    for (s=lines[j];*s;s=g_utf8_next_char(s))
   939 		if (g_unichar_islower(g_utf8_get_char(s)))
   940 		    break;
   941 	    if (*s)
   942 		results.astline++;
   943 	}
   944 	if (strchr(lines[j],'/'))
   945 	    results.fslashline++;
   946 	if (lbytes>0)
   947 	{
   948 	    for (s=g_utf8_prev_char(lines[j]+lbytes);
   949 	      s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;
   950 	      s=g_utf8_prev_char(s))
   951 		;
   952 	    if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
   953 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')
   954 		results.hyphens++;
   955 	}
   956 	if (llen>LONGEST_PG_LINE)
   957 	    results.longline++;
   958 	if (llen>WAY_TOO_LONG)
   959 	    results.verylongline++;
   960 	if (strchr(lines[j],'<') && strchr(lines[j],'>'))
   961 	{
   962 	    i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
   963 	    if (i>0)
   964 		results.htmcount++;
   965 	    if (strstr(lines[j],"<i>"))
   966 		results.htmcount+=4; /* bonus marks! */
   967 	}
   968 	/* Check for spaced em-dashes */
   969 	memset(&tmp_dash_results,0,sizeof(tmp_dash_results));
   970 	count_dashes(lines[j],"--",&tmp_dash_results);
   971 	count_dashes(lines[j],"—",&tmp_dash_results);
   972 	if (tmp_dash_results.base)
   973 	    results.emdash.base++;
   974 	if (tmp_dash_results.non_PG_space)
   975 	    results.emdash.non_PG_space++;
   976 	if (tmp_dash_results.PG_space)
   977 	    results.emdash.PG_space++;
   978 	for (s=lines[j];*s;)
   979 	{
   980 	    inword=getaword(NULL,&s);
   981 	    if (!strcmp(inword,"hij") || !strcmp(inword,"niet")) 
   982 		results.Dutchcount++;
   983 	    if (!strcmp(inword,"dans") || !strcmp(inword,"avec")) 
   984 		results.Frenchcount++;
   985 	    if (!strcmp(inword,"0") || !strcmp(inword,"1")) 
   986 		results.standalone_digit++;
   987 	    g_free(inword);
   988 	}
   989 	/* Check for spaced dashes */
   990 	if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
   991 	    results.spacedash++;
   992 	lastblen=lastlen;
   993 	lastlen=llen;
   994 	laststart=lines[j][0];
   995     }
   996     g_strfreev(lines);
   997     return &results;
   998 }
   999 
  1000 /*
  1001  * report_first_pass:
  1002  *
  1003  * Make some snap decisions based on the first pass results.
  1004  */
  1005 struct warnings *report_first_pass(struct first_pass_results *results)
  1006 {
  1007     static struct warnings warnings={0};
  1008     warnings.newlines=results->newlines;
  1009     if (warnings.newlines==UNIX_NEWLINES)
  1010 	g_print("   --> No lines in this file have a CR. Not reporting them. "
  1011 	  "Project Gutenberg requires that all lineends be CR-LF.\n");
  1012     else if (warnings.newlines==OS9_NEWLINES)
  1013 	g_print("   --> No lines in this file have a LF. Not reporting them. "
  1014 	  "Project Gutenberg requires that all lineends be CR-LF.\n");
  1015     if (cnt_spacend>0)
  1016 	g_print("   --> %ld lines in this file have white space at end\n",
  1017 	  cnt_spacend);
  1018     warnings.dotcomma=1;
  1019     if (results->dotcomma>5)
  1020     {
  1021 	warnings.dotcomma=0;
  1022 	g_print("   --> %ld lines in this file contain '.,'. "
  1023 	  "Not reporting them.\n",results->dotcomma);
  1024     }
  1025     /*
  1026      * If more than 50 lines, or one-tenth, are short,
  1027      * don't bother reporting them.
  1028      */
  1029     warnings.shortline=1;
  1030     if (results->shortline>50 || results->shortline*10>linecnt)
  1031     {
  1032 	warnings.shortline=0;
  1033 	g_print("   --> %ld lines in this file are short. "
  1034 	  "Not reporting short lines.\n",results->shortline);
  1035     }
  1036     /*
  1037      * If more than 50 lines, or one-tenth, are long,
  1038      * don't bother reporting them.
  1039      */
  1040     warnings.longline=1;
  1041     if (results->longline>50 || results->longline*10>linecnt)
  1042     {
  1043 	warnings.longline=0;
  1044 	g_print("   --> %ld lines in this file are long. "
  1045 	  "Not reporting long lines.\n",results->longline);
  1046     }
  1047     /* If more than 10 lines contain asterisks, don't bother reporting them. */
  1048     warnings.ast=1;
  1049     if (results->astline>10)
  1050     {
  1051 	warnings.ast=0;
  1052 	g_print("   --> %ld lines in this file contain asterisks. "
  1053 	  "Not reporting them.\n",results->astline);
  1054     }
  1055     /*
  1056      * If more than 10 lines contain forward slashes,
  1057      * don't bother reporting them.
  1058      */
  1059     warnings.fslash=1;
  1060     if (results->fslashline>10)
  1061     {
  1062 	warnings.fslash=0;
  1063 	g_print("   --> %ld lines in this file contain forward slashes. "
  1064 	  "Not reporting them.\n",results->fslashline);
  1065     }
  1066     /*
  1067      * If more than 20 lines contain unpunctuated endquotes,
  1068      * don't bother reporting them.
  1069      */
  1070     warnings.endquote=1;
  1071     if (results->endquote_count>20)
  1072     {
  1073 	warnings.endquote=0;
  1074 	g_print("   --> %ld lines in this file contain unpunctuated endquotes. "
  1075 	  "Not reporting them.\n",results->endquote_count);
  1076     }
  1077     /*
  1078      * If more than 15 lines contain standalone digits,
  1079      * don't bother reporting them.
  1080      */
  1081     warnings.digit=1;
  1082     if (results->standalone_digit>10)
  1083     {
  1084 	warnings.digit=0;
  1085 	g_print("   --> %ld lines in this file contain standalone 0s and 1s. "
  1086 	  "Not reporting them.\n",results->standalone_digit);
  1087     }
  1088     /*
  1089      * If more than 20 lines contain hyphens at end,
  1090      * don't bother reporting them.
  1091      */
  1092     warnings.hyphen=1;
  1093     if (results->hyphens>20)
  1094     {
  1095 	warnings.hyphen=0;
  1096 	g_print("   --> %ld lines in this file have hyphens at end. "
  1097 	  "Not reporting them.\n",results->hyphens);
  1098     }
  1099     if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
  1100     {
  1101 	g_print("   --> Looks like this is HTML. Switching HTML mode ON.\n");
  1102 	pswit[MARKUP_SWITCH]=1;
  1103     }
  1104     if (results->verylongline>0)
  1105 	g_print("   --> %ld lines in this file are VERY long!\n",
  1106 	  results->verylongline);
  1107     /*
  1108      * If there are more non-PG spaced dashes than PG em-dashes,
  1109      * assume it's deliberate.
  1110      * Current PG guidelines say don't use them, but older texts do,
  1111      * and some people insist on them whatever the guidelines say.
  1112      */
  1113     warnings.dash=1;
  1114     if (results->spacedash+results->emdash.non_PG_space>
  1115       results->emdash.PG_space)
  1116     {
  1117 	warnings.dash=0;
  1118 	g_print("   --> There are %ld spaced dashes and em-dashes. "
  1119 	  "Not reporting them.\n",
  1120 	  results->spacedash+results->emdash.non_PG_space);
  1121     }
  1122     if (charset)
  1123 	warnings.bin=0;
  1124     else
  1125     {
  1126 	/* Charset ISO_8859-1/ASCII checks for compatibility with gutcheck */
  1127 	warnings.bin=1;
  1128 	/* If more than a quarter of characters are hi-bit, bug out. */
  1129 	if (results->binlen*4>results->totlen)
  1130 	{
  1131 	    g_print("   --> This file does not appear to be ASCII. "
  1132 	      "Terminating. Best of luck with it!\n");
  1133 	    exit(1);
  1134 	}
  1135 	if (results->alphalen*4<results->totlen)
  1136 	{
  1137 	    g_print("   --> This file does not appear to be text. "
  1138 	      "Terminating. Best of luck with it!\n");
  1139 	    exit(1);
  1140 	}
  1141 	if (results->binlen*100>results->totlen || results->binlen>100)
  1142 	{
  1143 	    g_print("   --> There are a lot of foreign letters here. "
  1144 	      "Not reporting them.\n");
  1145 	    if (!pswit[VERBOSE_SWITCH])
  1146 		warnings.bin=0;
  1147 	}
  1148     }
  1149     warnings.isDutch=FALSE;
  1150     if (results->Dutchcount>50)
  1151     {
  1152 	warnings.isDutch=TRUE;
  1153 	g_print("   --> This looks like Dutch - "
  1154 	  "switching off dashes and warnings for 's Middags case.\n");
  1155     }
  1156     warnings.isFrench=FALSE;
  1157     if (results->Frenchcount>50)
  1158     {
  1159 	warnings.isFrench=TRUE;
  1160 	g_print("   --> This looks like French - "
  1161 	  "switching off some doublepunct.\n");
  1162     }
  1163     if (results->firstline && results->footerline)
  1164 	g_print("    The PG header and footer appear to be already on.\n");
  1165     else
  1166     {
  1167 	if (results->firstline)
  1168 	    g_print("    The PG header is on - no footer.\n");
  1169 	if (results->footerline)
  1170 	    g_print("    The PG footer is on - no header.\n");
  1171     }
  1172     g_print("\n");
  1173     if (pswit[VERBOSE_SWITCH])
  1174     {
  1175 	warnings.shortline=1;
  1176 	warnings.dotcomma=1;
  1177 	warnings.longline=1;
  1178 	warnings.dash=1;
  1179 	warnings.digit=1;
  1180 	warnings.ast=1;
  1181 	warnings.fslash=1;
  1182 	warnings.hyphen=1;
  1183 	warnings.endquote=1;
  1184 	g_print("   *** Verbose output is ON -- you asked for it! ***\n");
  1185     }
  1186     if (warnings.isDutch)
  1187 	warnings.dash=0;
  1188     if (results->footerline>0 && results->firstline>0 &&
  1189       results->footerline>results->firstline &&
  1190       results->footerline-results->firstline<100)
  1191     {
  1192 	g_print("   --> I don't really know where this text starts. \n");
  1193 	g_print("       There are no reference points.\n");
  1194 	g_print("       I'm going to have to report the header and footer "
  1195 	  "as well.\n");
  1196 	results->firstline=0;
  1197     }
  1198     return &warnings;
  1199 }
  1200 
  1201 /*
  1202  * analyse_quotes:
  1203  *
  1204  * Look along the line, accumulate the count of quotes, and see
  1205  * if this is an empty line - i.e. a line with nothing on it
  1206  * but spaces.
  1207  * If line has just spaces, period, * and/or - on it, don't
  1208  * count it, since empty lines with asterisks or dashes to
  1209  * separate sections are common.
  1210  *
  1211  * Returns: TRUE if the line is empty.
  1212  */
  1213 gboolean analyse_quotes(const char *aline,struct counters *counters)
  1214 {
  1215     int guessquote=0;
  1216     /* assume the line is empty until proven otherwise */
  1217     gboolean isemptyline=TRUE;
  1218     const char *s=aline,*sprev,*snext;
  1219     gunichar c;
  1220     sprev=NULL;
  1221     GError *tmp_err=NULL;
  1222     while (*s)
  1223     {
  1224 	snext=g_utf8_next_char(s);
  1225 	c=g_utf8_get_char(s);
  1226 	if (CHAR_IS_DQUOTE(c))
  1227 	    (void)count_quote(counters,c,QUOTE_CLASS(c),&tmp_err);
  1228 	else if (CHAR_IS_SQUOTE(c) && pswit[SQUOTE_SWITCH])
  1229 	{
  1230 	    if (s==aline)
  1231 	    {
  1232 		/*
  1233 		 * At start of line, it can only be a quotation mark.
  1234 		 * Hardcode a very common exception!
  1235 		 */
  1236 		if (!g_str_has_prefix(snext,"tis") &&
  1237 		  !g_str_has_prefix(snext,"Tis"))
  1238 		    (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
  1239 	    }
  1240 	    else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
  1241 	      g_unichar_isalpha(g_utf8_get_char(snext)))
  1242 		/* Do nothing! it's definitely an apostrophe, not a quote */
  1243 		;
  1244 	    /* it's outside a word - let's check it out */
  1245 	    else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||
  1246 	      g_unichar_isalpha(g_utf8_get_char(snext)))
  1247 	    {
  1248 		/* certainly looks like a quotation mark */
  1249 		if (!g_str_has_prefix(snext,"tis") &&
  1250 		  !g_str_has_prefix(snext,"Tis"))
  1251 		    /* hardcode a very common exception! */
  1252 		{
  1253 		    if (strchr(".?!,;:",g_utf8_get_char(sprev)))
  1254 			(void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
  1255 		    else
  1256 			(void)count_quote(counters,c,OPENING_QUOTE,&tmp_err);
  1257 		}
  1258 	    }
  1259 	    else
  1260 	    {
  1261 		/* now - is it a quotation mark? */
  1262 		guessquote=0;   /* accumulate clues */
  1263 		if (g_unichar_isalpha(g_utf8_get_char(sprev)))
  1264 		{
  1265 		    /* it follows a letter - could be either */
  1266 		    guessquote++;
  1267 		    if (g_utf8_get_char(sprev)=='s')
  1268 		    {
  1269 			/* looks like a plural apostrophe */
  1270 			guessquote-=3;
  1271 			if (g_utf8_get_char(snext)==CHAR_SPACE)
  1272 			    /* bonus marks! */
  1273 			    guessquote-=2;
  1274 		    }
  1275 		    if (innermost_quote_matches(counters,c))
  1276 			/*
  1277 			 * Give it the benefit of some doubt,
  1278 			 * if a squote is already open.
  1279 			 */
  1280 			guessquote++;
  1281 		    else
  1282 			guessquote--;
  1283 		    if (guessquote>=0)
  1284 			(void)count_quote(counters,c,CLOSING_QUOTE,&tmp_err);
  1285 		}
  1286 		else
  1287 		    /* no adjacent letter - it must be a quote of some kind */
  1288 		    (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
  1289 	    }
  1290 	}
  1291 	if (tmp_err)
  1292 	{
  1293 	    if (pswit[ECHO_SWITCH])
  1294 		g_print("\n%s\n",aline);
  1295 	    if (!pswit[OVERVIEW_SWITCH])
  1296 		g_print("    Line %ld column %ld - %s\n",
  1297 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1,tmp_err->message);
  1298 	    g_clear_error(&tmp_err);
  1299 	}
  1300 	if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
  1301 	  c!='\r' && c!='\n')
  1302 	    isemptyline=FALSE;  /* ignore lines like  *  *  *  as spacers */
  1303 	if (c==CHAR_UNDERSCORE)
  1304 	    counters->c_unders++;
  1305 	if (c==CHAR_OPEN_SBRACK)
  1306 	{
  1307 	    if (!matching_difference(counters,COUNTER_ILLUSTRATION) &&
  1308 	      !matching_difference(counters,c) && s==aline &&
  1309 	      g_str_has_prefix(s,"[Illustration:"))
  1310 		increment_matching(counters,COUNTER_ILLUSTRATION,TRUE);
  1311 	    else
  1312 		increment_matching(counters,c,TRUE);
  1313 	}
  1314 	else if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK)
  1315 	    increment_matching(counters,c,TRUE);
  1316 	if (c==CHAR_CLOSE_SBRACK)
  1317 	{
  1318 	    if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) &&
  1319 	      !matching_difference(counters,c) && !*snext)
  1320 		increment_matching(counters,COUNTER_ILLUSTRATION,FALSE);
  1321 	    else
  1322 		increment_matching(counters,c,FALSE);
  1323 	}
  1324 	else if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK)
  1325 	    increment_matching(counters,c,FALSE);
  1326 	sprev=s;
  1327 	s=snext;
  1328     }
  1329     return isemptyline;
  1330 }
  1331 
  1332 /*
  1333  * check_for_control_characters:
  1334  *
  1335  * Check for invalid or questionable characters in the line
  1336  * Anything above 127 is invalid for plain ASCII, and
  1337  * non-printable control characters should also be flagged.
  1338  * Tabs should generally not be there.
  1339  */
  1340 void check_for_control_characters(const char *aline)
  1341 {
  1342     gunichar c;
  1343     const char *s;
  1344     for (s=aline;*s;s=g_utf8_next_char(s))
  1345     {
  1346 	c=g_utf8_get_char(s);
  1347 	if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
  1348 	{
  1349 	    if (pswit[ECHO_SWITCH])
  1350 		g_print("\n%s\n",aline);
  1351 	    if (!pswit[OVERVIEW_SWITCH])
  1352 		g_print("    Line %ld column %ld - Control character %u\n",
  1353 		  linecnt,g_utf8_pointer_to_offset(s,aline)+1,c);
  1354 	    else
  1355 		cnt_bin++;
  1356 	}
  1357     }
  1358 }
  1359 
  1360 /*
  1361  * check_for_odd_characters:
  1362  *
  1363  * Check for binary and other odd characters.
  1364  */
  1365 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
  1366   gboolean isemptyline)
  1367 {
  1368     /* Don't repeat multiple warnings on one line. */
  1369     gboolean eInvalidChar=FALSE,eTab=FALSE,eTilde=FALSE;
  1370     gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
  1371     const char *s;
  1372     gunichar c;
  1373     gsize nb;
  1374     gchar *t;
  1375     for (s=aline;*s;s=g_utf8_next_char(s))
  1376     {
  1377 	c=g_utf8_get_char(s);
  1378 	if (warnings->bin && !eInvalidChar &&
  1379 	  (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
  1380 	{
  1381 	    if (pswit[ECHO_SWITCH])
  1382 		g_print("\n%s\n",aline);
  1383 	    if (!pswit[OVERVIEW_SWITCH])
  1384 		if (c>127 && c<160 || c>255)
  1385 		    g_print("    Line %ld column %ld - "
  1386 		      "Non-ISO-8859 character %u\n",
  1387 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1388 		else
  1389 		    g_print("    Line %ld column %ld - "
  1390 		      "Non-ASCII character %u\n",
  1391 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1392 	    else
  1393 		cnt_bin++;
  1394 	    eInvalidChar=TRUE;
  1395 	}
  1396 	if (!eInvalidChar && charset)
  1397 	{
  1398 	    if (charset_validator==(GIConv)-1)
  1399 	    {
  1400 		if (!g_unichar_isdefined(c))
  1401 		{
  1402 		    if (pswit[ECHO_SWITCH])
  1403 			g_print("\n%s\n",aline);
  1404 		    if (!pswit[OVERVIEW_SWITCH])
  1405 			g_print("    Line %ld column %ld - Unassigned UNICODE "
  1406 			  "code point U+%04" G_GINT32_MODIFIER "X\n",
  1407 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1408 		    else
  1409 			cnt_bin++;
  1410 		    eInvalidChar=TRUE;
  1411 		}
  1412 		else if (c>=0xE000 && c<=0xF8FF || c>=0xF0000 && c<=0xFFFFD ||
  1413 		  c>=100000 && c<=0x10FFFD)
  1414 		{
  1415 		    if (pswit[ECHO_SWITCH])
  1416 			g_print("\n%s\n",aline);
  1417 		    if (!pswit[OVERVIEW_SWITCH])
  1418 			g_print("    Line %ld column %ld - Private Use "
  1419 			  "character U+%04" G_GINT32_MODIFIER "X\n",
  1420 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1421 		    else
  1422 			cnt_bin++;
  1423 		    eInvalidChar=TRUE;
  1424 		}
  1425 	    }
  1426 	    else
  1427 	    {
  1428 		t=g_convert_with_iconv(s,g_utf8_next_char(s)-s,
  1429 		  charset_validator,NULL,&nb,NULL);
  1430 		if (t)
  1431 		    g_free(t);
  1432 		else
  1433 		{
  1434 		    if (pswit[ECHO_SWITCH])
  1435 			g_print("\n%s\n",aline);
  1436 		    if (!pswit[OVERVIEW_SWITCH])
  1437 			g_print("    Line %ld column %ld - Non-%s "
  1438 			  "character %u\n",linecnt,
  1439 			  g_utf8_pointer_to_offset(aline,s)+1,charset,c);
  1440 		    else
  1441 			cnt_bin++;
  1442 		    eInvalidChar=TRUE;
  1443 		}
  1444 	    }
  1445 	}
  1446 	if (!eTab && c==CHAR_TAB)
  1447 	{
  1448 	    if (pswit[ECHO_SWITCH])
  1449 		g_print("\n%s\n",aline);
  1450 	    if (!pswit[OVERVIEW_SWITCH])
  1451 		g_print("    Line %ld column %ld - Tab character?\n",
  1452 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1453 	    else
  1454 		cnt_odd++;
  1455 	    eTab=TRUE;
  1456 	}
  1457 	if (!eTilde && c==CHAR_TILDE)
  1458 	{
  1459 	    /*
  1460 	     * Often used by OCR software to indicate an
  1461 	     * unrecognizable character.
  1462 	     */
  1463 	    if (pswit[ECHO_SWITCH])
  1464 		g_print("\n%s\n",aline);
  1465 	    if (!pswit[OVERVIEW_SWITCH])
  1466 		g_print("    Line %ld column %ld - Tilde character?\n",
  1467 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1468 	    else
  1469 		cnt_odd++;
  1470 	    eTilde=TRUE;
  1471 	}
  1472 	if (!eCarat && c==CHAR_CARAT)
  1473 	{  
  1474 	    if (pswit[ECHO_SWITCH])
  1475 		g_print("\n%s\n",aline);
  1476 	    if (!pswit[OVERVIEW_SWITCH])
  1477 		g_print("    Line %ld column %ld - Carat character?\n",
  1478 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1479 	    else
  1480 		cnt_odd++;
  1481 	    eCarat=TRUE;
  1482 	}
  1483 	if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
  1484 	{  
  1485 	    if (pswit[ECHO_SWITCH])
  1486 		g_print("\n%s\n",aline);
  1487 	    if (!pswit[OVERVIEW_SWITCH])
  1488 		g_print("    Line %ld column %ld - Forward slash?\n",
  1489 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1490 	    else
  1491 		cnt_odd++;
  1492 	    eFSlash=TRUE;
  1493 	}
  1494 	/*
  1495 	 * Report asterisks only in paranoid mode,
  1496 	 * since they're often deliberate.
  1497 	 */
  1498 	if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
  1499 	  c==CHAR_ASTERISK)
  1500 	{
  1501 	    if (pswit[ECHO_SWITCH])
  1502 		g_print("\n%s\n",aline);
  1503 	    if (!pswit[OVERVIEW_SWITCH])
  1504 		g_print("    Line %ld column %ld - Asterisk?\n",
  1505 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1506 	    else
  1507 		cnt_odd++;
  1508 	    eAst=TRUE;
  1509 	}
  1510     }
  1511 }
  1512 
  1513 /*
  1514  * check_for_long_line:
  1515  *
  1516  * Check for line too long.
  1517  */
  1518 void check_for_long_line(const char *aline)
  1519 {
  1520     if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
  1521     {
  1522 	if (pswit[ECHO_SWITCH])
  1523 	    g_print("\n%s\n",aline);
  1524 	if (!pswit[OVERVIEW_SWITCH])
  1525 	    g_print("    Line %ld column %ld - Long line %ld\n",
  1526 	      linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
  1527 	else
  1528 	    cnt_long++;
  1529     }
  1530 }
  1531 
  1532 /*
  1533  * check_for_short_line:
  1534  *
  1535  * Check for line too short.
  1536  *
  1537  * This one is a bit trickier to implement: we don't want to
  1538  * flag the last line of a paragraph for being short, so we
  1539  * have to wait until we know that our current line is a
  1540  * "normal" line, then report the _previous_ line if it was too
  1541  * short. We also don't want to report indented lines like
  1542  * chapter heads or formatted quotations. We therefore keep
  1543  * last->len as the length of the last line examined, and
  1544  * last->blen as the length of the last but one, and try to
  1545  * suppress unnecessary warnings by checking that both were of
  1546  * "normal" length. We keep the first character of the last
  1547  * line in last->start, and if it was a space, we assume that
  1548  * the formatting is deliberate. I can't figure out a way to
  1549  * distinguish something like a quoted verse left-aligned or
  1550  * the header or footer of a letter from a paragraph of short
  1551  * lines - maybe if I examined the whole paragraph, and if the
  1552  * para has less than, say, 8 lines and if all lines are short,
  1553  * then just assume it's OK? Need to look at some texts to see
  1554  * how often a formula like this would get the right result.
  1555  */
  1556 void check_for_short_line(const char *aline,const struct line_properties *last)
  1557 {
  1558     if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
  1559       last->len<SHORTEST_PG_LINE && last->blen>1 &&
  1560       last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
  1561     {
  1562 	if (pswit[ECHO_SWITCH])
  1563 	    g_print("\n%s\n",prevline);
  1564 	if (!pswit[OVERVIEW_SWITCH])
  1565 	    g_print("    Line %ld column %ld - Short line %ld?\n",
  1566 	      linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
  1567 	else
  1568 	    cnt_short++;
  1569     }
  1570 }
  1571 
  1572 /*
  1573  * check_for_starting_punctuation:
  1574  *
  1575  * Look for punctuation other than full ellipses at start of line.
  1576  */
  1577 void check_for_starting_punctuation(const char *aline)
  1578 {
  1579     if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
  1580       !g_str_has_prefix(aline,". . ."))
  1581     {
  1582 	if (pswit[ECHO_SWITCH])
  1583 	    g_print("\n%s\n",aline);
  1584 	if (!pswit[OVERVIEW_SWITCH])
  1585 	    g_print("    Line %ld column 1 - Begins with punctuation?\n",
  1586 	      linecnt);
  1587 	else
  1588 	    cnt_punct++;
  1589     }
  1590 }
  1591 
  1592 /*
  1593  * str_emdash:
  1594  *
  1595  * Find the first em-dash, return a pointer to it and set <next> to the
  1596  * character following the dash.
  1597  */
  1598 char *str_emdash(const char *s,const char **next)
  1599 {
  1600     const char *s1,*s2;
  1601     s1=strstr(s,"--");
  1602     s2=strstr(s,"—");
  1603     if (!s1)
  1604     {
  1605 	if (s2)
  1606 	    *next=g_utf8_next_char(s2);
  1607 	return (char *)s2;
  1608     }
  1609     else if (!s2)
  1610     {
  1611 	*next=g_utf8_next_char(g_utf8_next_char(s1));
  1612 	return (char *)s1;
  1613     }
  1614     else if (s1<s2)
  1615     {
  1616 	*next=g_utf8_next_char(g_utf8_next_char(s1));
  1617 	return (char *)s1;
  1618     }
  1619     else
  1620     {
  1621 	*next=g_utf8_next_char(s2);
  1622 	return (char *)s2;
  1623     }
  1624 }
  1625 
  1626 /*
  1627  * check_for_spaced_emdash:
  1628  *
  1629  * Check for spaced em-dashes.
  1630  *
  1631  * We must check _all_ occurrences of em-dashes on the line
  1632  * hence the loop - even if the first dash is OK
  1633  * there may be another that's wrong later on.
  1634  */
  1635 void check_for_spaced_emdash(const char *aline)
  1636 {
  1637     const char *s,*t,*next;
  1638     for (s=aline;t=str_emdash(s,&next);s=next)
  1639     {
  1640 	if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
  1641 	  g_utf8_get_char(next)==CHAR_SPACE)
  1642 	{
  1643 	    if (pswit[ECHO_SWITCH])
  1644 		g_print("\n%s\n",aline);
  1645 	    if (!pswit[OVERVIEW_SWITCH])
  1646 		g_print("    Line %ld column %ld - Spaced em-dash?\n",
  1647 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  1648 	    else
  1649 		cnt_dash++;
  1650 	}
  1651     }
  1652 }
  1653 
  1654 /*
  1655  * check_for_spaced_dash:
  1656  *
  1657  * Check for spaced dashes.
  1658  */
  1659 void check_for_spaced_dash(const char *aline)
  1660 {
  1661     const char *s;
  1662     if ((s=strstr(aline," -")))
  1663     {
  1664 	if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
  1665 	{
  1666 	    if (pswit[ECHO_SWITCH])
  1667 		g_print("\n%s\n",aline);
  1668 	    if (!pswit[OVERVIEW_SWITCH])
  1669 		g_print("    Line %ld column %ld - Spaced dash?\n",
  1670 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1671 	    else
  1672 		cnt_dash++;
  1673 	}
  1674     }
  1675     else if ((s=strstr(aline,"- ")))
  1676     {
  1677 	if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
  1678 	{
  1679 	    if (pswit[ECHO_SWITCH])
  1680 		g_print("\n%s\n",aline);
  1681 	    if (!pswit[OVERVIEW_SWITCH])
  1682 		g_print("    Line %ld column %ld - Spaced dash?\n",
  1683 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1684 	    else
  1685 		cnt_dash++;
  1686 	}
  1687     }
  1688 }
  1689 
  1690 /*
  1691  * check_for_unmarked_paragraphs:
  1692  *
  1693  * Check for unmarked paragraphs indicated by separate speakers.
  1694  *
  1695  * May well be false positive:
  1696  * "Bravo!" "Wonderful!" called the crowd.
  1697  * but useful all the same.
  1698  */
  1699 void check_for_unmarked_paragraphs(const char *aline)
  1700 {
  1701     const char *s;
  1702     s=strstr(aline,"\"  \"");
  1703     if (!s)
  1704 	s=strstr(aline,"\" \"");
  1705     if (s)
  1706     {
  1707 	if (pswit[ECHO_SWITCH])
  1708 	    g_print("\n%s\n",aline);
  1709 	if (!pswit[OVERVIEW_SWITCH])
  1710 	    g_print("    Line %ld column %ld - "
  1711 	      "Query missing paragraph break?\n",
  1712 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1713 	else
  1714 	    cnt_punct++;
  1715     }
  1716 }
  1717 
  1718 /*
  1719  * check_for_jeebies:
  1720  *
  1721  * Check for "to he" and other easy h/b errors.
  1722  *
 * This is a very inadequate effort on the h/b problem.
 * The phrase "to he" is nearly always an error, whereas "to
 * be" is quite common.
 * Similarly, the "be" in '"Quiet!", be said.' is an error for "he".
 * However, "to he" is _not_ always an error:
 *       "Where they went to he couldn't say."
 * Another false positive:
 *       What would "Cinderella" be without the . . .
 * and another: "If he wants to he can see for himself."
  1732  */
  1733 void check_for_jeebies(const char *aline)
  1734 {
  1735     const char *s;
  1736     s=strstr(aline," be could ");
  1737     if (!s)
  1738 	s=strstr(aline," be would ");
  1739     if (!s)
  1740 	s=strstr(aline," was be ");
  1741     if (!s)
  1742 	s=strstr(aline," be is ");
  1743     if (!s)
  1744 	s=strstr(aline," is be ");
  1745     if (!s)
  1746 	s=strstr(aline,"\", be ");
  1747     if (!s)
  1748 	s=strstr(aline,"\" be ");
  1749     if (!s)
  1750 	s=strstr(aline,"\" be ");
  1751     if (!s)
  1752 	s=strstr(aline," to he ");
  1753     if (s)
  1754     {
  1755 	if (pswit[ECHO_SWITCH])
  1756 	    g_print("\n%s\n",aline);
  1757 	if (!pswit[OVERVIEW_SWITCH])
  1758 	    g_print("    Line %ld column %ld - Query he/be error?\n",
  1759 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1760 	else
  1761 	    cnt_word++;
  1762     }
  1763     s=strstr(aline," the had ");
  1764     if (!s)
  1765 	s=strstr(aline," a had ");
  1766     if (!s)
  1767 	s=strstr(aline," they bad ");
  1768     if (!s)
  1769 	s=strstr(aline," she bad ");
  1770     if (!s)
  1771 	s=strstr(aline," he bad ");
  1772     if (!s)
  1773 	s=strstr(aline," you bad ");
  1774     if (!s)
  1775 	s=strstr(aline," i bad ");
  1776     if (s)
  1777     {
  1778 	if (pswit[ECHO_SWITCH])
  1779 	    g_print("\n%s\n",aline);
  1780 	if (!pswit[OVERVIEW_SWITCH])
  1781 	    g_print("    Line %ld column %ld - Query had/bad error?\n",
  1782 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1783 	else
  1784 	    cnt_word++;
  1785     }
  1786     s=strstr(aline,"; hut ");
  1787     if (!s)
  1788 	s=strstr(aline,", hut ");
  1789     if (s)
  1790     {
  1791 	if (pswit[ECHO_SWITCH])
  1792 	    g_print("\n%s\n",aline);
  1793 	if (!pswit[OVERVIEW_SWITCH])
  1794 	    g_print("    Line %ld column %ld - Query hut/but error?\n",
  1795 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1796 	else
  1797 	    cnt_word++;
  1798     }
  1799 }
  1800 
  1801 /*
  1802  * check_for_mta_from:
  1803  *
  1804  * Special case - angled bracket in front of "From" placed there by an
  1805  * MTA when sending an e-mail.
  1806  */
  1807 void check_for_mta_from(const char *aline)
  1808 {
  1809     const char *s;
  1810     s=strstr(aline,">From");
  1811     if (s)
  1812     {
  1813 	if (pswit[ECHO_SWITCH])
  1814 	    g_print("\n%s\n",aline);
  1815 	if (!pswit[OVERVIEW_SWITCH])
  1816 	    g_print("    Line %ld column %ld - "
  1817 	      "Query angled bracket with From\n",
  1818 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1819 	else
  1820 	    cnt_punct++;
  1821     }
  1822 }
  1823 
  1824 /*
  1825  * check_for_orphan_character:
  1826  *
  1827  * Check for a single character line -
  1828  * often an overflow from bad wrapping.
  1829  */
  1830 void check_for_orphan_character(const char *aline)
  1831 {
  1832     gunichar c;
  1833     c=g_utf8_get_char(aline);
  1834     if (c && !*g_utf8_next_char(aline))
  1835     {
  1836 	if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
  1837 	    ; /* Nothing - ignore numerals alone on a line. */
  1838 	else
  1839 	{
  1840 	    if (pswit[ECHO_SWITCH])
  1841 		g_print("\n%s\n",aline);
  1842 	    if (!pswit[OVERVIEW_SWITCH])
  1843 		g_print("    Line %ld column 1 - Query single character line\n",
  1844 		  linecnt);
  1845 	    else
  1846 		cnt_punct++;
  1847 	}
  1848     }
  1849 }
  1850 
  1851 /*
  1852  * check_for_pling_scanno:
  1853  *
  1854  * Check for I" - often should be !
  1855  */
  1856 void check_for_pling_scanno(const char *aline)
  1857 {
  1858     const char *s;
  1859     s=strstr(aline," I\"");
  1860     if (s)
  1861     {
  1862 	if (pswit[ECHO_SWITCH])
  1863 	    g_print("\n%s\n",aline);
  1864 	if (!pswit[OVERVIEW_SWITCH])
  1865 	    g_print("    Line %ld column %ld - Query I=exclamation mark?\n",
  1866 	      linecnt,g_utf8_pointer_to_offset(aline,s));
  1867 	else
  1868 	    cnt_punct++;
  1869     }
  1870 }
  1871 
  1872 /*
  1873  * check_for_extra_period:
  1874  *
  1875  * Check for period without a capital letter. Cut-down from gutspell.
  1876  * Only works when it happens on a single line.
  1877  */
  1878 void check_for_extra_period(const char *aline,const struct warnings *warnings)
  1879 {
  1880     const char *s,*t,*s1,*sprev;
  1881     int i;
  1882     gsize len;
  1883     gboolean istypo;
  1884     gchar *testword;
  1885     gunichar c,nc,pc,*decomposition;
  1886     if (pswit[PARANOID_SWITCH])
  1887     {
  1888 	for (t=aline;t=strstr(t,". ");)
  1889 	{
  1890 	    if (t==aline)
  1891 	    {
  1892 		t=g_utf8_next_char(t);
  1893 		/* start of line punctuation is handled elsewhere */
  1894 		continue;
  1895 	    }
  1896 	    if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
  1897 	    {
  1898 		t=g_utf8_next_char(t);
  1899 		continue;
  1900 	    }
  1901 	    if (warnings->isDutch)
  1902 	    {
  1903 		/* For Frank & Jeroen -- 's Middags case */
  1904 		gunichar c2,c3,c4,c5;
  1905 		c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
  1906 		c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
  1907 		c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
  1908 		c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
  1909 		if (CHAR_IS_APOSTROPHE(c2) &&
  1910 		  g_unichar_islower(c3) && c4==CHAR_SPACE &&
  1911 		  g_unichar_isupper(c5))
  1912 		{
  1913 		    t=g_utf8_next_char(t);
  1914 		    continue;
  1915 		}
  1916 	    }
  1917 	    s1=g_utf8_next_char(g_utf8_next_char(t));
  1918 	    while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
  1919 	      !g_unichar_isdigit(g_utf8_get_char(s1)))
  1920 		s1=g_utf8_next_char(s1);
  1921 	    if (g_unichar_islower(g_utf8_get_char(s1)))
  1922 	    {
  1923 		/* we have something to investigate */
  1924 		istypo=TRUE;
  1925 		/* so let's go back and find out */
  1926 		nc=g_utf8_get_char(t);
  1927 		s1=g_utf8_prev_char(t);
  1928 		c=g_utf8_get_char(s1);
  1929 		sprev=g_utf8_prev_char(s1);
  1930 		pc=g_utf8_get_char(sprev);
  1931 		while (s1>=aline &&
  1932 		  (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||
  1933 		  g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&
  1934 		  g_unichar_isalpha(nc)))
  1935 		{
  1936 		    nc=c;
  1937 		    s1=sprev;
  1938 		    c=pc;
  1939 		    sprev=g_utf8_prev_char(s1);
  1940 		    pc=g_utf8_get_char(sprev);
  1941 		}
  1942 		s1=g_utf8_next_char(s1);
  1943 		s=strchr(s1,'.');
  1944 		if (s)
  1945 		    testword=g_strndup(s1,s-s1);
  1946 		else
  1947 		    testword=g_strdup(s1);
  1948 		for (i=0;*abbrev[i];i++)
  1949 		    if (!strcmp(testword,abbrev[i]))
  1950 			istypo=FALSE;
  1951 		if (g_unichar_isdigit(g_utf8_get_char(testword)))
  1952 		    istypo=FALSE;
  1953 		if (!*g_utf8_next_char(testword))
  1954 		    istypo=FALSE;
  1955 		if (isroman(testword))
  1956 		    istypo=FALSE;
  1957 		if (istypo)
  1958 		{
  1959 		    istypo=FALSE;
  1960 		    for (s=testword;*s;s=g_utf8_next_char(s))
  1961 		    {
  1962 			decomposition=g_unicode_canonical_decomposition(
  1963 			  g_utf8_get_char(s),&len);
  1964 			if (g_utf8_strchr("aeiou",-1,decomposition[0]))
  1965 			    istypo=TRUE;
  1966 			g_free(decomposition);
  1967 		    }
  1968 		}
  1969 		if (istypo &&
  1970 		  (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
  1971 		{
  1972 		    g_tree_insert(qperiod,g_strdup(testword),
  1973 		      GINT_TO_POINTER(1));
  1974 		    if (pswit[ECHO_SWITCH])
  1975 			g_print("\n%s\n",aline);
  1976 		    if (!pswit[OVERVIEW_SWITCH])
  1977 			g_print("    Line %ld column %ld - Extra period?\n",
  1978 			  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  1979 		    else
  1980 			cnt_punct++;
  1981 		}
  1982 		g_free(testword);
  1983 	    }
  1984 	    t=g_utf8_next_char(t);
  1985 	}
  1986     }
  1987 }
  1988 
  1989 /*
  1990  * check_for_following_punctuation:
  1991  *
  1992  * Check for words usually not followed by punctuation.
  1993  */
  1994 void check_for_following_punctuation(const char *aline)
  1995 {
  1996     int i;
  1997     const char *s,*wordstart;
  1998     gunichar c;
  1999     gchar *inword,*t;
  2000     if (pswit[TYPO_SWITCH])
  2001     {
  2002 	for (s=aline;*s;)
  2003 	{
  2004 	    wordstart=s;
  2005 	    t=getaword(NULL,&s);
  2006 	    if (!*t)
  2007 	    {
  2008 		g_free(t);
  2009 		continue;
  2010 	    }
  2011 	    inword=g_utf8_strdown(t,-1);
  2012 	    g_free(t);
  2013 	    for (i=0;*nocomma[i];i++)
  2014 		if (!strcmp(inword,nocomma[i]))
  2015 		{
  2016 		    c=g_utf8_get_char(s);
  2017 		    if (c==',' || c==';' || c==':')
  2018 		    {
  2019 			if (pswit[ECHO_SWITCH])
  2020 			    g_print("\n%s\n",aline);
  2021 			if (!pswit[OVERVIEW_SWITCH])
  2022 			    g_print("    Line %ld column %ld - "
  2023 			      "Query punctuation after %s?\n",
  2024 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,
  2025 			      inword);
  2026 			else
  2027 			    cnt_punct++;
  2028 		    }
  2029 		}
  2030 	    for (i=0;*noperiod[i];i++)
  2031 		if (!strcmp(inword,noperiod[i]))
  2032 		{
  2033 		    c=g_utf8_get_char(s);
  2034 		    if (c=='.' || c=='!')
  2035 		    {
  2036 			if (pswit[ECHO_SWITCH])
  2037 			    g_print("\n%s\n",aline);
  2038 			if (!pswit[OVERVIEW_SWITCH])
  2039 			    g_print("    Line %ld column %ld - "
  2040 			      "Query punctuation after %s?\n",
  2041 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,
  2042 			      inword);
  2043 			else
  2044 			    cnt_punct++;
  2045 		    }
  2046 		}
  2047 	    g_free(inword);
  2048 	}
  2049     }
  2050 }
  2051 
  2052 /*
  2053  * check_for_typos:
  2054  *
  2055  * Check for commonly mistyped words, and digits like 0 for O in a word.
  2056  * Note that somewhat confusingly, this is also where we call getaword()
  2057  * with a non-NULL line so that it will issue warnings.
  2058  */
  2059 void check_for_typos(const char *aline,struct warnings *warnings)
  2060 {
  2061     const char *s,*t,*nt,*wordstart;
  2062     gchar *inword;
  2063     gunichar *decomposition;
  2064     gchar *testword;
  2065     int i,vowel,consonant,*dupcnt;
  2066     gboolean isdup,istypo,alower;
  2067     gunichar c,pc;
  2068     long offset,len;
  2069     gsize decomposition_len;
  2070     for (s=aline;*s;)
  2071     {
  2072 	wordstart=s;
  2073 	inword=getaword(aline,&s);
  2074 	if (!*inword)
  2075 	{
  2076 	    g_free(inword);
  2077 	    continue; /* don't bother with empty lines */
  2078 	}
  2079 	if (mixdigit(inword))
  2080 	{
  2081 	    if (pswit[ECHO_SWITCH])
  2082 		g_print("\n%s\n",aline);
  2083 	    if (!pswit[OVERVIEW_SWITCH])
  2084 		g_print("    Line %ld column %ld - Query digit in %s\n",
  2085 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
  2086 	    else
  2087 		cnt_word++;
  2088 	}
  2089 	/*
  2090 	 * Put the word through a series of tests for likely typos and OCR
  2091 	 * errors.
  2092 	 */
  2093 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
  2094 	{
  2095 	    istypo=FALSE;
  2096 	    alower=FALSE;
  2097 	    for (t=inword;*t;t=g_utf8_next_char(t))
  2098 	    {
  2099 		c=g_utf8_get_char(t);
  2100 		nt=g_utf8_next_char(t);
  2101 		/* lowercase for testing */
  2102 		if (g_unichar_islower(c))
  2103 		    alower=TRUE;
  2104 		if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
  2105 		{
  2106 		    /*
  2107 		     * We have an uppercase mid-word. However, there are
  2108 		     * common cases:
  2109 		     *   Mac and Mc like McGill
  2110 		     *   French contractions like l'Abbe
  2111 		     */
  2112 		    offset=g_utf8_pointer_to_offset(inword,t);
  2113 		    if (offset>0)
  2114 			pc=g_utf8_get_char(g_utf8_prev_char(t));
  2115 		    else
  2116 			pc='\0';
  2117 		    if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
  2118 		      offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
  2119 		      g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
  2120 		      CHAR_IS_APOSTROPHE(pc))
  2121 			; /* do nothing! */
  2122 		    else
  2123 			istypo=TRUE;
  2124 		}
  2125 	    }
  2126 	    testword=g_utf8_casefold(inword,-1);
  2127 	}
  2128 	if (pswit[TYPO_SWITCH])
  2129 	{
  2130 	    /*
  2131 	     * Check for certain unlikely two-letter combinations at word
  2132 	     * start and end.
  2133 	     */
  2134 	    len=g_utf8_strlen(testword,-1);
  2135 	    if (len>1)
  2136 	    {
  2137 		for (i=0;*nostart[i];i++)
  2138 		    if (g_str_has_prefix(testword,nostart[i]))
  2139 			istypo=TRUE;
  2140 		for (i=0;*noend[i];i++)
  2141 		    if (g_str_has_suffix(testword,noend[i]))
  2142 			istypo=TRUE;
  2143 	    }
  2144 	    /* ght is common, gbt never. Like that. */
  2145 	    if (strstr(testword,"cb"))
  2146 		istypo=TRUE;
  2147 	    if (strstr(testword,"gbt"))
  2148 		istypo=TRUE;
  2149 	    if (strstr(testword,"pbt"))
  2150 		istypo=TRUE;
  2151 	    if (strstr(testword,"tbs"))
  2152 		istypo=TRUE;
  2153 	    if (strstr(testword,"mrn"))
  2154 		istypo=TRUE;
  2155 	    if (strstr(testword,"ahle"))
  2156 		istypo=TRUE;
  2157 	    if (strstr(testword,"ihle"))
  2158 		istypo=TRUE;
  2159 	    /*
  2160 	     * "TBE" does happen - like HEARTBEAT - but uncommon.
  2161 	     * Also "TBI" - frostbite, outbid - but uncommon.
  2162 	     * Similarly "ii" like Hawaii, or Pompeii, and in Roman
  2163 	     * numerals, but "ii" is a common scanno.
  2164 	     */
  2165 	    if (strstr(testword,"tbi"))
  2166 		istypo=TRUE;
  2167 	    if (strstr(testword,"tbe"))
  2168 		istypo=TRUE;
  2169 	    if (strstr(testword,"ii"))
  2170 		istypo=TRUE;
  2171 	    /*
  2172 	     * Check for no vowels or no consonants.
  2173 	     * If none, flag a typo.
  2174 	     */
  2175 	    if (!istypo && len>1)
  2176 	    {
  2177 		vowel=consonant=0;
  2178 		for (t=testword;*t;t=g_utf8_next_char(t))
  2179 		{
  2180 		    c=g_utf8_get_char(t);
  2181 		    decomposition=
  2182 		      g_unicode_canonical_decomposition(c,&decomposition_len);
  2183 		    if (c=='y' || g_unichar_isdigit(c))
  2184 		    {
  2185 			/* Yah, this is loose. */
  2186 			vowel++;
  2187 			consonant++;
  2188 		    }
  2189 		    else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
  2190 			vowel++;
  2191 		    else
  2192 			consonant++;
  2193 		    g_free(decomposition);
  2194 		}
  2195 		if (!vowel || !consonant)
  2196 		    istypo=TRUE;
  2197 	    }
  2198 	    /*
  2199 	     * Now exclude the word from being reported if it's in
  2200 	     * the okword list.
  2201 	     */
  2202 	    for (i=0;*okword[i];i++)
  2203 		if (!strcmp(testword,okword[i]))
  2204 		    istypo=FALSE;
  2205 	    /*
  2206 	     * What looks like a typo may be a Roman numeral.
  2207 	     * Exclude these.
  2208 	     */
  2209 	    if (istypo && isroman(testword))
  2210 		istypo=FALSE;
  2211 	    /* Check the manual list of typos. */
  2212 	    if (!istypo)
  2213 		for (i=0;*typo[i];i++)
  2214 		    if (!strcmp(testword,typo[i]))
  2215 			istypo=TRUE;
  2216 	    /*
  2217 	     * Check lowercase s, l, i and m - special cases.
  2218 	     *   "j" - often a semi-colon gone wrong.
  2219 	     *   "d" for a missing apostrophe - he d
  2220 	     *   "n" for "in"
  2221 	     */
  2222 	    if (!istypo && len==1 &&
  2223 	      g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
  2224 		istypo=TRUE;
  2225 	    if (istypo)
  2226 	    {
  2227 		dupcnt=g_tree_lookup(qword,testword);
  2228 		if (dupcnt)
  2229 		{
  2230 		    (*dupcnt)++;
  2231 		    isdup=!pswit[VERBOSE_SWITCH];
  2232 		}
  2233 		else
  2234 		{
  2235 		    dupcnt=g_new0(int,1);
  2236 		    g_tree_insert(qword,g_strdup(testword),dupcnt);
  2237 		    isdup=FALSE;
  2238 		}
  2239 		if (!isdup)
  2240 		{
  2241 		    if (pswit[ECHO_SWITCH])
  2242 			g_print("\n%s\n",aline);
  2243 		    if (!pswit[OVERVIEW_SWITCH])
  2244 		    {
  2245 			g_print("    Line %ld column %ld - Query word %s",
  2246 			  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
  2247 			  inword);
  2248 			if (!pswit[VERBOSE_SWITCH])
  2249 			    g_print(" - not reporting duplicates");
  2250 			g_print("\n");
  2251 		    }
  2252 		    else
  2253 			cnt_word++;
  2254 		}
  2255 	    }
  2256 	}
  2257 	/* check the user's list of typos */
  2258 	if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
  2259 	{
  2260 	    if (pswit[ECHO_SWITCH])
  2261 		g_print("\n%s\n",aline);
  2262 	    if (!pswit[OVERVIEW_SWITCH])  
  2263 		g_print("    Line %ld column %ld - Query possible scanno %s\n",
  2264 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
  2265 	}
  2266 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
  2267 	    g_free(testword);
  2268 	if (pswit[PARANOID_SWITCH] && warnings->digit)
  2269 	{
  2270 	    /* In paranoid mode, query all 0 and 1 standing alone. */
  2271 	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))
  2272 	    {
  2273 		if (pswit[ECHO_SWITCH])
  2274 		    g_print("\n%s\n",aline);
  2275 		if (!pswit[OVERVIEW_SWITCH])
  2276 		    g_print("    Line %ld column %ld - Query standalone %s\n",
  2277 		      linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
  2278 		      inword);
  2279 		else
  2280 		    cnt_word++;
  2281 	    }
  2282 	}
  2283 	g_free(inword);
  2284     }
  2285 }
  2286 
  2287 /*
  2288  * check_for_misspaced_punctuation:
  2289  *
  2290  * Look for added or missing spaces around punctuation and quotes.
  2291  * If there is a punctuation character like ! with no space on
  2292  * either side, suspect a missing!space. If there are spaces on
  2293  * both sides , assume a typo. If we see a double quote with no
  2294  * space or punctuation on either side of it, assume unspaced
  2295  * quotes "like"this.
  2296  */
  2297 void check_for_misspaced_punctuation(const char *aline,
  2298   struct parities *parities,gboolean isemptyline)
  2299 {
  2300     gboolean isacro,isellipsis;
  2301     const char *s;
  2302     gunichar c,nc,pc,n2c;
  2303     int parity;
  2304     c=g_utf8_get_char(aline);
  2305     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2306     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2307     {
  2308 	pc=c;
  2309 	c=nc;
  2310 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2311 	/* For each character in the line after the first. */
  2312 	if (g_utf8_strchr(".?!,;:_",-1,c))  /* if it's punctuation */
  2313 	{
  2314 	    /* we need to suppress warnings for acronyms like M.D. */
  2315 	    isacro=FALSE;
  2316 	    /* we need to suppress warnings for ellipsis . . . */
  2317 	    isellipsis=FALSE;
  2318 	    /*
  2319 	     * If there are letters on both sides of it or
  2320 	     * if it's strict punctuation followed by an alpha.
  2321 	     */
  2322 	    if (c!='_' && g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
  2323 	      g_utf8_strchr("?!,;:",-1,c)))
  2324 	    {
  2325 		if (c=='.')
  2326 		{
  2327 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&
  2328 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
  2329 			isacro=TRUE;
  2330 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
  2331 		    if (nc && n2c=='.')
  2332 			isacro=TRUE;
  2333 		}
  2334 		if (!isacro)
  2335 		{
  2336 		    if (pswit[ECHO_SWITCH])
  2337 			g_print("\n%s\n",aline);
  2338 		    if (!pswit[OVERVIEW_SWITCH])
  2339 			g_print("    Line %ld column %ld - Missing space?\n",
  2340 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2341 		    else
  2342 			cnt_punct++;
  2343 		}
  2344 	    }
  2345 	    if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
  2346 	    {
  2347 		/*
  2348 		 * If there are spaces on both sides,
  2349 		 * or space before and end of line.
  2350 		 */
  2351 		if (c=='.')
  2352 		{
  2353 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&
  2354 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
  2355 			isellipsis=TRUE;
  2356 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
  2357 		    if (nc && n2c=='.')
  2358 			isellipsis=TRUE;
  2359 		}
  2360 		if (!isemptyline && !isellipsis)
  2361 		{
  2362 		    if (pswit[ECHO_SWITCH])
  2363 			g_print("\n%s\n",aline);
  2364 		    if (!pswit[OVERVIEW_SWITCH])
  2365 			g_print("    Line %ld column %ld - "
  2366 			  "Spaced punctuation?\n",linecnt,
  2367 			  g_utf8_pointer_to_offset(aline,s)+1);
  2368 		    else
  2369 			cnt_punct++;
  2370 		}
  2371 	    }
  2372 	}
  2373     }
  2374     /* Split out the characters that CANNOT be preceded by space. */
  2375     c=g_utf8_get_char(aline);
  2376     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2377     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2378     {
  2379 	pc=c;
  2380 	c=nc;
  2381 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2382 	/* for each character in the line after the first */
  2383 	if (g_utf8_strchr("?!,;:",-1,c))
  2384 	{
  2385 	    /* if it's punctuation that _cannot_ have a space before it */
  2386 	    if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
  2387 	    {
  2388 		/*
  2389 		 * If nc DOES == space,
  2390 		 * it was already reported just above.
  2391 		 */
  2392 		if (pswit[ECHO_SWITCH])
  2393 		    g_print("\n%s\n",aline);
  2394 		if (!pswit[OVERVIEW_SWITCH])
  2395 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",
  2396 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2397 		else
  2398 		    cnt_punct++;
  2399 	    }
  2400 	}
  2401     }
  2402     /*
  2403      * Special case " .X" where X is any alpha.
  2404      * This plugs a hole in the acronym code above.
  2405      * Inelegant, but maintainable.
  2406      */
  2407     c=g_utf8_get_char(aline);
  2408     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2409     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2410     {
  2411 	pc=c;
  2412 	c=nc;
  2413 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2414 	/* for each character in the line after the first */
  2415 	if (c=='.')
  2416 	{
  2417 	    /* if it's a period */
  2418 	    if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
  2419 	    {
  2420 		/*
  2421 		 * If the period follows a space and
  2422 		 * is followed by a letter.
  2423 		 */
  2424 		if (pswit[ECHO_SWITCH])
  2425 		    g_print("\n%s\n",aline);
  2426 		if (!pswit[OVERVIEW_SWITCH])
  2427 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",
  2428 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2429 		else
  2430 		    cnt_punct++;
  2431 	    }
  2432 	}
  2433     }
  2434     c=g_utf8_get_char(aline);
  2435     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2436     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2437     {
  2438 	pc=c;
  2439 	c=nc;
  2440 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2441 	/* for each character in the line after the first */
  2442 	if (CHAR_IS_DQUOTE(c))
  2443 	{
  2444 	    if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
  2445 	      !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
  2446 	      !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
  2447 	    {
  2448 		if (pswit[ECHO_SWITCH])
  2449 		    g_print("\n%s\n",aline);
  2450 		if (!pswit[OVERVIEW_SWITCH])
  2451 		    g_print("    Line %ld column %ld - Unspaced quotes?\n",
  2452 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2453 		else
  2454 		    cnt_punct++;
  2455 	    }
  2456 	}
  2457     }
  2458     /* Check parity of quotes. */
  2459     nc=g_utf8_get_char(aline);
  2460     for (s=aline;*s;s=g_utf8_next_char(s))
  2461     {
  2462 	c=nc;
  2463 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2464 	if (CHAR_IS_DQUOTE(c))
  2465 	{
  2466 	    if (c==CHAR_DQUOTE)
  2467 	    {
  2468 		parities->dquote=!parities->dquote;
  2469 		parity=parities->dquote;
  2470 	    }
  2471 	    else if (c==CHAR_LD_QUOTE)
  2472 		parity=1;
  2473 	    else
  2474 		parity=0;
  2475 	    if (!parity)
  2476 	    {
  2477 		/* parity even */
  2478 		if (!g_utf8_strchr("_-.'`‘’/,;:!?)]} ",-1,nc))
  2479 		{
  2480 		    if (pswit[ECHO_SWITCH])
  2481 			g_print("\n%s\n",aline);
  2482 		    if (!pswit[OVERVIEW_SWITCH])
  2483 			g_print("    Line %ld column %ld - "
  2484 			  "Wrongspaced quotes?\n",
  2485 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2486 		    else
  2487 			cnt_punct++;
  2488 		}
  2489 	    }
  2490 	    else
  2491 	    {
  2492 		/* parity odd */
  2493 		if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
  2494 		  !g_utf8_strchr("_-/.'`‘’([{$",-1,nc) || !nc)
  2495 		{
  2496 		    if (pswit[ECHO_SWITCH])
  2497 			g_print("\n%s\n",aline);
  2498 		    if (!pswit[OVERVIEW_SWITCH])
  2499 			g_print("    Line %ld column %ld - "
  2500 			  "Wrongspaced quotes?\n",
  2501 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2502 		    else
  2503 			cnt_punct++;
  2504 		}
  2505 	    }
  2506 	}
  2507     }
  2508     c=g_utf8_get_char(aline);
  2509     if (CHAR_IS_DQUOTE(c))
  2510     {
  2511 	if (g_utf8_strchr(",;:!?)]} ",-1,
  2512 	  g_utf8_get_char(g_utf8_next_char(aline))))
  2513 	{
  2514 	    if (pswit[ECHO_SWITCH])
  2515 		g_print("\n%s\n",aline);
  2516 	    if (!pswit[OVERVIEW_SWITCH])
  2517 		g_print("    Line %ld column 1 - Wrongspaced quotes?\n",
  2518 		  linecnt);
  2519 	    else
  2520 		cnt_punct++;
  2521 	}
  2522     }
  2523     if (pswit[SQUOTE_SWITCH])
  2524     {
  2525 	nc=g_utf8_get_char(aline);
  2526 	for (s=aline;*s;s=g_utf8_next_char(s))
  2527 	{
  2528 	    c=nc;
  2529 	    nc=g_utf8_get_char(g_utf8_next_char(s));
  2530 	    if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&
  2531 	      !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
  2532 	      !g_unichar_isalpha(nc)))
  2533 	    {
  2534 		parities->squote=!parities->squote;
  2535 		if (!parities->squote)
  2536 		{
  2537 		    /* parity even */
  2538 		    if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
  2539 		    {
  2540 			if (pswit[ECHO_SWITCH])
  2541 			    g_print("\n%s\n",aline);
  2542 			if (!pswit[OVERVIEW_SWITCH])
  2543 			    g_print("    Line %ld column %ld - "
  2544 			      "Wrongspaced singlequotes?\n",
  2545 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2546 			else
  2547 			    cnt_punct++;
  2548 		    }
  2549 		}
  2550 		else
  2551 		{
  2552 		    /* parity odd */
  2553 		    if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
  2554 		      !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
  2555 		    {
  2556 			if (pswit[ECHO_SWITCH])
  2557 			    g_print("\n%s\n",aline);
  2558 			if (!pswit[OVERVIEW_SWITCH])
  2559 			    g_print("    Line %ld column %ld - "
  2560 			      "Wrongspaced singlequotes?\n",
  2561 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2562 			else
  2563 			    cnt_punct++;
  2564 		    }
  2565 		}
  2566 	    }
  2567 	}
  2568     }
  2569 }
  2570 
  2571 /*
  2572  * str_follows_word:
  2573  *
  2574  * Given a position p within a string str, determine whether it follows the
  2575  * given word. This is roughly equivalent to the regular expression (?<=\bword)
  2576  * but has different boundary conditions.
  2577  */
  2578 static gboolean str_follows_word(const char *str,const char *p,const char *word)
  2579 {
  2580     int len=strlen(word);
  2581     if (p-len<str)
  2582 	return FALSE;
  2583     else if (!g_str_has_prefix(p-len,word))
  2584 	return FALSE;
  2585     else if (p-len==str)
  2586 	return TRUE;
  2587     else
  2588 	/* Using non-alpha as a word boundary. See UAX #29 for a better way. */
  2589 	return !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(p-len)));
  2590 }
  2591 
  2592 /*
  2593  * check_for_double_punctuation:
  2594  *
  2595  * Look for double punctuation like ,. or ,,
  2596  * Thanks to DW for the suggestion!
  2597  * In books with references, ".," and ".;" are common
  2598  * e.g. "etc., etc.," and vol. 1.; vol 3.;
  2599  * OTOH, from my initial tests, there are also fairly
  2600  * common errors. What to do? Make these cases paranoid?
  2601  * ".," is the most common, so warnings->dotcomma is used
  2602  * to suppress detailed reporting if it occurs often.
  2603  * Indeed, ".," is so common after "etc" or "&c" that
  2604  * we don't warn on these cases at all.
  2605  */
  2606 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
  2607 {
  2608     const char *s;
  2609     gunichar c,nc;
  2610     gboolean is_query;
  2611     nc=g_utf8_get_char(aline);
  2612     for (s=aline;*s;s=g_utf8_next_char(s))
  2613     {
  2614 	c=nc;
  2615 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2616 	/* for each punctuation character in the line */
  2617 	if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
  2618 	  g_utf8_strchr(".?!,;:",-1,nc))
  2619 	{
  2620 	    /* followed by punctuation, it's a query, unless . . . */
  2621 	    is_query=TRUE;
  2622 	    if (warnings->isFrench &&
  2623 	      (g_str_has_prefix(s,",...") || g_str_has_prefix(s,"...,") ||
  2624 	       g_str_has_prefix(s,";...") || g_str_has_prefix(s,"...;") ||
  2625 	       g_str_has_prefix(s,":...") || g_str_has_prefix(s,"...:") ||
  2626 	       g_str_has_prefix(s,"!...") || g_str_has_prefix(s,"...!") ||
  2627 	       g_str_has_prefix(s,"?...") || g_str_has_prefix(s,"...?")))
  2628 	    {
  2629 		s+=4;
  2630 		nc=g_utf8_get_char(g_utf8_next_char(s));
  2631 		is_query=FALSE;
  2632 	    }
  2633 	    else if (c==nc && (c=='.' || c=='?' || c=='!'))
  2634 	    {
  2635 		/* do nothing for .. !! and ?? which can be legit */
  2636 		is_query=FALSE;
  2637 	    }
  2638 	    else if (c=='.' && nc==',')
  2639 	    {
  2640 		if (!warnings->dotcomma || str_follows_word(aline,s,"etc") || 
  2641 		  str_follows_word(aline,s,"&c"))
  2642 		    is_query=FALSE;
  2643 	    }
  2644 	    if (is_query)
  2645 	    {
  2646 		if (pswit[ECHO_SWITCH])
  2647 		    g_print("\n%s\n",aline);
  2648 		if (!pswit[OVERVIEW_SWITCH])
  2649 		    g_print("    Line %ld column %ld - Double punctuation?\n",
  2650 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2651 		else
  2652 		    cnt_punct++;
  2653 	    }
  2654 	}
  2655     }
  2656 }
  2657 
  2658 /*
  2659  * check_for_spaced_quotes:
  2660  */
  2661 void check_for_spaced_quotes(const char *aline)
  2662 {
  2663     int i;
  2664     const char *s,*t;
  2665     const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,
  2666       CHAR_RS_QUOTE};
  2667     GString *pattern;
  2668     s=aline;
  2669     while ((t=strstr(s," \" ")))
  2670     {
  2671 	if (pswit[ECHO_SWITCH])
  2672 	    g_print("\n%s\n",aline);
  2673 	if (!pswit[OVERVIEW_SWITCH])
  2674 	    g_print("    Line %ld column %ld - Spaced doublequote?\n",
  2675 	      linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  2676 	else
  2677 	    cnt_punct++;
  2678 	s=g_utf8_next_char(g_utf8_next_char(t));
  2679     }
  2680     pattern=g_string_new(NULL);
  2681     for(i=0;i<G_N_ELEMENTS(single_quotes);i++)
  2682     {
  2683 	g_string_assign(pattern," ");
  2684 	g_string_append_unichar(pattern,single_quotes[i]);
  2685 	g_string_append_c(pattern,' ');
  2686 	s=aline;
  2687 	while ((t=strstr(s,pattern->str)))
  2688 	{
  2689 	    if (pswit[ECHO_SWITCH])
  2690 		g_print("\n%s\n",aline);
  2691 	    if (!pswit[OVERVIEW_SWITCH])
  2692 		g_print("    Line %ld column %ld - Spaced singlequote?\n",
  2693 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  2694 	    else
  2695 		cnt_punct++;
  2696 	    s=g_utf8_next_char(g_utf8_next_char(t));
  2697 	}
  2698     }
  2699     g_string_free(pattern,TRUE);
  2700 }
  2701 
  2702 /*
  2703  * check_for_miscased_genative:
  2704  *
  2705  * Check special case of 'S instead of 's at end of word.
  2706  */
  2707 void check_for_miscased_genative(const char *aline)
  2708 {
  2709     const char *s;
  2710     gunichar c,nc,pc;
  2711     if (!*aline)
  2712 	return;
  2713     c=g_utf8_get_char(aline);
  2714     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2715     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2716     {
  2717 	pc=c;
  2718 	c=nc;
  2719 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2720 	if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))
  2721 	{
  2722 	    if (pswit[ECHO_SWITCH])
  2723 		g_print("\n%s\n",aline);
  2724 	    if (!pswit[OVERVIEW_SWITCH])
  2725 		g_print("    Line %ld column %ld - Capital \"S\"?\n",
  2726 		  linecnt,g_utf8_pointer_to_offset(aline,s)+2);
  2727 	    else
  2728 		cnt_punct++;
  2729 	}
  2730     }
  2731 }
  2732 
  2733 /*
  2734  * check_end_of_line:
  2735  *
  2736  * Now check special cases - start and end of line -
  2737  * for single and double quotes. Start is sometimes [sic]
  2738  * but better to query it anyway.
  2739  * While we're here, check for dash at end of line.
  2740  */
  2741 void check_end_of_line(const char *aline,struct warnings *warnings)
  2742 {
  2743     int lbytes;
  2744     const char *s;
  2745     gunichar c1,c2;
  2746     lbytes=strlen(aline);
  2747     if (g_utf8_strlen(aline,lbytes)>1)
  2748     {
  2749 	s=g_utf8_prev_char(aline+lbytes);
  2750 	c1=g_utf8_get_char(s);
  2751 	c2=g_utf8_get_char(g_utf8_prev_char(s));
  2752 	if ((CHAR_IS_DQUOTE(c1) || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)
  2753 	{
  2754 	    if (pswit[ECHO_SWITCH])
  2755 		g_print("\n%s\n",aline);
  2756 	    if (!pswit[OVERVIEW_SWITCH])
  2757 		g_print("    Line %ld column %ld - Spaced quote?\n",linecnt,
  2758 		  g_utf8_strlen(aline,lbytes));
  2759 	    else
  2760 		cnt_punct++;
  2761 	}
  2762 	c1=g_utf8_get_char(aline);
  2763 	c2=g_utf8_get_char(g_utf8_next_char(aline));
  2764 	if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)
  2765 	{
  2766 	    if (pswit[ECHO_SWITCH])
  2767 		g_print("\n%s\n",aline);
  2768 	    if (!pswit[OVERVIEW_SWITCH])
  2769 		g_print("    Line %ld column 1 - Spaced quote?\n",linecnt);
  2770 	    else
  2771 		cnt_punct++;
  2772 	}
  2773 	/*
  2774 	 * Dash at end of line may well be legit - paranoid mode only
  2775 	 * and don't report em-dash at line-end.
  2776 	 */
  2777 	if (pswit[PARANOID_SWITCH] && warnings->hyphen)
  2778 	{
  2779 	    for (s=g_utf8_prev_char(aline+lbytes);
  2780 	      s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
  2781 		;
  2782 	    if (g_utf8_get_char(s)=='-' &&
  2783 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')
  2784 	    {
  2785 		if (pswit[ECHO_SWITCH])
  2786 		    g_print("\n%s\n",aline);
  2787 		if (!pswit[OVERVIEW_SWITCH])
  2788 		    g_print("    Line %ld column %ld - "
  2789 		      "Hyphen at end of line?\n",
  2790 		      linecnt,g_utf8_pointer_to_offset(aline,s));
  2791 	    }
  2792 	}
  2793     }
  2794 }
  2795 
  2796 /*
  2797  * check_for_unspaced_bracket:
  2798  *
  2799  * Brackets are often unspaced, but shouldn't be surrounded by alpha.
  2800  * If so, suspect a scanno like "a]most".
  2801  */
  2802 void check_for_unspaced_bracket(const char *aline)
  2803 {
  2804     const char *s;
  2805     gunichar c,nc,pc;
  2806     c=g_utf8_get_char(aline);
  2807     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2808     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2809     {
  2810 	pc=c;
  2811 	c=nc;
  2812 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2813 	if (!nc)
  2814 	    break;
  2815 	/* for each bracket character in the line except 1st & last */
  2816 	if (g_utf8_strchr("{[()]}",-1,c) &&
  2817 	  g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
  2818 	{
  2819 	    if (pswit[ECHO_SWITCH])
  2820 		g_print("\n%s\n",aline);
  2821 	    if (!pswit[OVERVIEW_SWITCH])
  2822 		g_print("    Line %ld column %ld - Unspaced bracket?\n",
  2823 		  linecnt,g_utf8_pointer_to_offset(aline,s));
  2824 	    else
  2825 		cnt_punct++;
  2826 	}
  2827     }
  2828 }
  2829 
  2830 /*
  2831  * check_for_unpunctuated_endquote:
  2832  */
  2833 void check_for_unpunctuated_endquote(const char *aline)
  2834 {
  2835     const char *s;
  2836     gunichar c,nc,pc;
  2837     QuoteClass qc;
  2838     c=g_utf8_get_char(aline);
  2839     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2840     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2841     {
  2842 	pc=c;
  2843 	c=nc;
  2844 	qc=CHAR_IS_DQUOTE(c)?QUOTE_CLASS(c):INVALID_QUOTE;
  2845 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2846 	/* for each character in the line except 1st */
  2847 	if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) && g_unichar_isalpha(pc))
  2848 	{
  2849 	    if (pswit[ECHO_SWITCH])
  2850 		g_print("\n%s\n",aline);
  2851 	    if (!pswit[OVERVIEW_SWITCH])
  2852 		g_print("    Line %ld column %ld - "
  2853 		  "endquote missing punctuation?\n",
  2854 		  linecnt,g_utf8_pointer_to_offset(aline,s));
  2855 	    else
  2856 		cnt_punct++;
  2857 	}
  2858     }
  2859 }
  2860 
  2861 /*
  2862  * check_for_html_tag:
  2863  *
  2864  * Check for <HTML TAG>.
  2865  *
  2866  * If there is a < in the line, followed at some point
  2867  * by a > then we suspect HTML.
  2868  */
  2869 void check_for_html_tag(const char *aline)
  2870 {
  2871     const char *open,*close;
  2872     gchar *tag;
  2873     open=strchr(aline,'<');
  2874     if (open)
  2875     {
  2876 	close=strchr(g_utf8_next_char(open),'>');
  2877 	if (close)
  2878 	{
  2879 	    if (pswit[ECHO_SWITCH])
  2880 		g_print("\n%s\n",aline);
  2881 	    if (!pswit[OVERVIEW_SWITCH])
  2882 	    {
  2883 		tag=g_strndup(open,close-open+1);
  2884 		g_print("    Line %ld column %ld - HTML Tag? %s \n",
  2885 		  linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
  2886 		g_free(tag);
  2887 	    }
  2888 	    else
  2889 		cnt_html++;
  2890 	}
  2891     }
  2892 }
  2893 
  2894 /*
  2895  * check_for_html_entity:
  2896  *
  2897  * Check for &symbol; HTML.
  2898  *
  2899  * If there is a & in the line, followed at
  2900  * some point by a ; then we suspect HTML.
  2901  */
  2902 void check_for_html_entity(const char *aline)
  2903 {
  2904     const char *s,*amp,*scolon;
  2905     gchar *entity;
  2906     amp=strchr(aline,'&');
  2907     if (amp)
  2908     {
  2909 	scolon=strchr(amp,';');
  2910 	if (scolon)
  2911 	{
  2912 	    for (s=amp;s<scolon;s=g_utf8_next_char(s))   
  2913 		if (g_utf8_get_char(s)==CHAR_SPACE)
  2914 		    break;		/* Don't report "Jones & Son;" */
  2915 	    if (s>=scolon)
  2916 	    {
  2917 		if (pswit[ECHO_SWITCH])
  2918 		    g_print("\n%s\n",aline);
  2919 		if (!pswit[OVERVIEW_SWITCH])
  2920 		{
  2921 		    entity=g_strndup(amp,scolon-amp+1);
  2922 		    g_print("    Line %ld column %d - HTML symbol? %s \n",
  2923 		      linecnt,(int)(amp-aline)+1,entity);
  2924 		    g_free(entity);
  2925 		}
  2926 		else
  2927 		    cnt_html++;
  2928 	    }
  2929 	}
  2930     }
  2931 }
  2932 
  2933 /*
  2934  * check_for_omitted_punctuation:
  2935  *
  2936  * Check for omitted punctuation at end of paragraph by working back
  2937  * through prevline. DW.
  2938  * Need to check this only for "normal" paras.
  2939  * So what is a "normal" para?
  2940  *    Not normal if one-liner (chapter headings, etc.)
  2941  *    Not normal if doesn't contain at least one locase letter
  2942  *    Not normal if starts with space
  2943  */
  2944 void check_for_omitted_punctuation(const char *prevline,
  2945   struct line_properties *last,int start_para_line)
  2946 {
  2947     gboolean letter_on_line=FALSE;
  2948     const char *s;
  2949     gunichar c;
  2950     gboolean closing_quote;
  2951     for (s=prevline;*s;s=g_utf8_next_char(s))
  2952 	if (g_unichar_isalpha(g_utf8_get_char(s)))
  2953 	{
  2954 	    letter_on_line=TRUE;
  2955 	    break;
  2956 	}
  2957     /*
  2958      * This next "if" is a problem.
  2959      * If we say "start_para_line <= linecnt - 1", that includes
  2960      * one-line "paragraphs" like chapter heads. Lotsa false positives.
  2961      * If we say "start_para_line < linecnt - 1" it doesn't, but then it
  2962      * misses genuine one-line paragraphs.
  2963      */
  2964     if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
  2965       g_utf8_get_char(prevline)>CHAR_SPACE)
  2966     {
  2967 	s=prevline+strlen(prevline);
  2968 	do
  2969 	{
  2970 	    s=g_utf8_prev_char(s);
  2971 	    c=g_utf8_get_char(s);
  2972 	    if (QUOTE_CLASS(c)==CLOSING_QUOTE || QUOTE_CLASS(c)==NEUTRAL_QUOTE)
  2973 		closing_quote=TRUE;
  2974 	    else
  2975 		closing_quote=FALSE;
  2976 	} while (closing_quote && s>prevline);
  2977 	for (;s>prevline;s=g_utf8_prev_char(s))
  2978 	{
  2979 	    if (g_unichar_isalpha(g_utf8_get_char(s)))
  2980 	    {
  2981 		if (pswit[ECHO_SWITCH])
  2982 		    g_print("\n%s\n",prevline);
  2983 		if (!pswit[OVERVIEW_SWITCH])
  2984 		    g_print("    Line %ld column %ld - "
  2985 		      "No punctuation at para end?\n",
  2986 		      linecnt-1,g_utf8_strlen(prevline,-1));
  2987 		else
  2988 		    cnt_punct++;
  2989 		break;
  2990 	    }
  2991 	    if (g_utf8_strchr("—-.:!([{?}])",-1,g_utf8_get_char(s)))
  2992 		break;
  2993 	}
  2994     }
  2995 }
  2996 
  2997 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
  2998 {
  2999     const char *word=key;
  3000     int *dupcnt=value;
  3001     if (*dupcnt)
  3002 	g_print("\nNote: Queried word %s was duplicated %d times\n",
  3003 	  word,*dupcnt);
  3004     return FALSE;
  3005 }
  3006 
  3007 void print_as_windows_1252(const char *string)
  3008 {
  3009     gsize inbytes,outbytes;
  3010     gchar *buf,*bp;
  3011     static GIConv converter=(GIConv)-1;
  3012     if (!string)
  3013     {
  3014 	if (converter!=(GIConv)-1)
  3015 	    g_iconv_close(converter);
  3016 	converter=(GIConv)-1;
  3017 	return;
  3018     }
  3019     if (converter==(GIConv)-1)
  3020 	converter=g_iconv_open("WINDOWS-1252","UTF-8");
  3021     if (converter!=(GIConv)-1)
  3022     {
  3023 	inbytes=outbytes=strlen(string);
  3024 	bp=buf=g_malloc(outbytes+1);
  3025 	g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
  3026 	*bp='\0';
  3027 	fputs(buf,stdout);
  3028 	g_free(buf);
  3029     }
  3030     else
  3031 	fputs(string,stdout);
  3032 }
  3033 
  3034 void print_as_utf_8(const char *string)
  3035 {
  3036     fputs(string,stdout);
  3037 }
  3038 
  3039 /*
  3040  * procfile:
  3041  *
  3042  * Process one file.
  3043  */
  3044 void procfile(const char *filename)
  3045 {
  3046     const char *s;
  3047     gchar *parastart=NULL;	/* first line of current para */
  3048     gchar *etext,*aline;
  3049     gchar *etext_ptr;
  3050     GError *err=NULL;
  3051     struct first_pass_results *first_pass_results;
  3052     struct warnings *warnings;
  3053     struct counters counters={0};
  3054     struct line_properties last={0};
  3055     struct parities parities={0};
  3056     struct pending pending={0};
  3057     gboolean isemptyline;
  3058     long start_para_line=0;
  3059     gboolean isnewpara=FALSE,enddash=FALSE;
  3060     last.start=CHAR_SPACE;
  3061     linecnt=checked_linecnt=0;
  3062     etext=read_etext(filename,&err);
  3063     if (!etext)
  3064     {
  3065 	if (pswit[STDOUT_SWITCH])
  3066 	    fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
  3067 	else
  3068 	    fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
  3069 	exit(1);
  3070     }
  3071     g_print("\n\nFile: %s\n\n",filename);
  3072     first_pass_results=first_pass(etext);
  3073     warnings=report_first_pass(first_pass_results);
  3074     qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
  3075     qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
  3076     /*
  3077      * Here we go with the main pass. Hold onto yer hat!
  3078      */
  3079     linecnt=0;
  3080     etext_ptr=etext;
  3081     while ((aline=flgets(&etext_ptr,linecnt+1,warnings->newlines)))
  3082     {
  3083 	linecnt++;
  3084 	if (linecnt==1)
  3085 	    isnewpara=TRUE;
  3086 	if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
  3087 	    continue;    // skip DP page separators completely
  3088 	if (linecnt<first_pass_results->firstline ||
  3089 	  (first_pass_results->footerline>0 &&
  3090 	  linecnt>first_pass_results->footerline))
  3091 	{
  3092 	    if (pswit[HEADER_SWITCH])
  3093 	    {
  3094 		if (g_str_has_prefix(aline,"Title:"))
  3095 		    g_print("    %s\n",aline);
  3096 		if (g_str_has_prefix(aline,"Author:"))
  3097 		    g_print("    %s\n",aline);
  3098 		if (g_str_has_prefix(aline,"Release Date:"))
  3099 		    g_print("    %s\n",aline);
  3100 		if (g_str_has_prefix(aline,"Edition:"))
  3101 		    g_print("    %s\n\n",aline);
  3102 	    }
  3103 	    continue;		/* skip through the header */
  3104 	}
  3105 	checked_linecnt++;
  3106 	print_pending(aline,parastart,&pending);
  3107 	isemptyline=analyse_quotes(aline,&counters);
  3108 	if (isnewpara && !isemptyline)
  3109 	{
  3110 	    /* This line is the start of a new paragraph. */
  3111 	    start_para_line=linecnt;
  3112 	    /* Capture its first line in case we want to report it later. */
  3113 	    g_free(parastart);
  3114 	    parastart=g_strdup(aline);
  3115 	    memset(&parities,0,sizeof(parities));  /* restart the quote count */
  3116 	    s=aline;
  3117 	    while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
  3118 	      !g_unichar_isdigit(g_utf8_get_char(s)))
  3119 		s=g_utf8_next_char(s);
  3120 	    if (g_unichar_islower(g_utf8_get_char(s)))
  3121 	    {
  3122 		/* and its first letter is lowercase */
  3123 		if (pswit[ECHO_SWITCH])
  3124 		    g_print("\n%s\n",aline);
  3125 		if (!pswit[OVERVIEW_SWITCH])
  3126 		    g_print("    Line %ld column %ld - "
  3127 		      "Paragraph starts with lower-case\n",
  3128 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  3129 		else
  3130 		    cnt_punct++;
  3131 	    }
  3132 	    isnewpara=FALSE; /* Signal the end of new para processing. */
  3133 	}
  3134 	/* Check for an em-dash broken at line end. */
  3135 	if (enddash && g_utf8_get_char(aline)=='-')
  3136 	{
  3137 	    if (pswit[ECHO_SWITCH])
  3138 		g_print("\n%s\n",aline);
  3139 	    if (!pswit[OVERVIEW_SWITCH])
  3140 		g_print("    Line %ld column 1 - Broken em-dash?\n",linecnt);
  3141 	    else
  3142 		cnt_punct++;
  3143 	}
  3144 	enddash=FALSE;
  3145 	for (s=g_utf8_prev_char(aline+strlen(aline));
  3146 	  g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
  3147 	    ;
  3148 	if (s>=aline && g_utf8_get_char(s)=='-')
  3149 	    enddash=TRUE;
  3150 	check_for_control_characters(aline);
  3151 	check_for_odd_characters(aline,warnings,isemptyline);
  3152 	if (warnings->longline)
  3153 	    check_for_long_line(aline);
  3154 	if (warnings->shortline)
  3155 	    check_for_short_line(aline,&last);
  3156 	last.blen=last.len;
  3157 	last.len=g_utf8_strlen(aline,-1);
  3158 	last.start=g_utf8_get_char(aline);
  3159 	check_for_starting_punctuation(aline);
  3160 	if (warnings->dash)
  3161 	{
  3162 	    check_for_spaced_emdash(aline);
  3163 	    check_for_spaced_dash(aline);
  3164 	}
  3165 	check_for_unmarked_paragraphs(aline);
  3166 	check_for_jeebies(aline);
  3167 	check_for_mta_from(aline);
  3168 	check_for_orphan_character(aline);
  3169 	check_for_pling_scanno(aline);
  3170 	check_for_extra_period(aline,warnings);
  3171 	check_for_following_punctuation(aline);
  3172 	check_for_typos(aline,warnings);
  3173 	check_for_misspaced_punctuation(aline,&parities,isemptyline);
  3174 	check_for_double_punctuation(aline,warnings);
  3175 	check_for_spaced_quotes(aline);
  3176 	check_for_miscased_genative(aline);
  3177 	check_end_of_line(aline,warnings);
  3178 	check_for_unspaced_bracket(aline);
  3179 	if (warnings->endquote)
  3180 	    check_for_unpunctuated_endquote(aline);
  3181 	check_for_html_tag(aline);
  3182 	check_for_html_entity(aline);
  3183 	if (isemptyline)
  3184 	{
  3185 	    check_for_mismatched_quotes(&counters,&pending);
  3186 	    counters_reset(&counters);
  3187 	    /* let the next iteration know that it's starting a new para */
  3188 	    isnewpara=TRUE;
  3189 	    if (prevline)
  3190 		check_for_omitted_punctuation(prevline,&last,start_para_line);
  3191 	}
  3192 	g_free(prevline);
  3193 	prevline=g_strdup(aline);
  3194     }
  3195     linecnt++;
  3196     check_for_mismatched_quotes(&counters,&pending);
  3197     print_pending(NULL,parastart,&pending);
  3198     reset_pending(&pending);
  3199     if (prevline)
  3200     {
  3201 	g_free(prevline);
  3202 	prevline=NULL;
  3203     }
  3204     g_free(parastart);
  3205     g_free(prevline);
  3206     g_free(etext);
  3207     if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])
  3208 	g_tree_foreach(qword,report_duplicate_queries,NULL);
  3209     g_tree_unref(qword);
  3210     g_tree_unref(qperiod);
  3211     counters_destroy(&counters);
  3212     g_set_print_handler(NULL);
  3213     print_as_windows_1252(NULL);
  3214     if (pswit[MARKUP_SWITCH])  
  3215 	loseentities(NULL);
  3216 }
  3217 
  3218 /*
  3219  * flgets:
  3220  *
  3221  * Get one line from the input text. The setting of newlines has the following
  3222  * effect:
  3223  *
  3224  * DOS_NEWLINES: Check for the existence of exactly one CR-LF line-end per line.
  3225  *
  3226  * OS9_NEWLINES: Asserts that etext contains no LFs. CR is used as
  3227  *		 the newline character.
  3228  *
  3229  * UNIX_NEWLINES: Check for the presence of CRs.
  3230  *
  3231  * In all cases, check that the last line is correctly terminated.
  3232  *
  3233  * Returns: a pointer to the line.
  3234  */
  3235 char *flgets(char **etext,long lcnt,int newlines)
  3236 {
  3237     gunichar c;
  3238     gboolean isCR=FALSE;
  3239     char *theline=*etext;
  3240     char *eos=theline;
  3241     gchar *s;
  3242     for (;;)
  3243     {
  3244 	c=g_utf8_get_char(*etext);
  3245 	if (!c)
  3246 	{
  3247 	    if (*etext==theline)
  3248 		return NULL;
  3249 	    else if (pswit[LINE_END_SWITCH])
  3250 	    {
  3251 		if (pswit[ECHO_SWITCH])
  3252 		{
  3253 		    s=g_strndup(theline,eos-theline);
  3254 		    g_print("\n%s\n",s);
  3255 		    g_free(s);
  3256 		}
  3257 		if (!pswit[OVERVIEW_SWITCH])
  3258 		{
  3259 		    if (newlines==OS9_NEWLINES)
  3260 			g_print("    Line %ld - No CR?\n",lcnt);
  3261 		    else
  3262 		    {
  3263 			/* There may, or may not, have been a CR */
  3264 			g_print("    Line %ld - No LF?\n",lcnt);
  3265 		    }
  3266 		}
  3267 		else
  3268 		    cnt_lineend++;
  3269 	    }
  3270 	    break;
  3271 	}
  3272 	*etext=g_utf8_next_char(*etext);
  3273 	/* either way, it's end of line */
  3274 	if (c=='\n')
  3275 	{
  3276 	    if (newlines==DOS_NEWLINES && !isCR)
  3277 	    {
  3278 		/* Error - a LF without a preceding CR */
  3279 		if (pswit[LINE_END_SWITCH])
  3280 		{
  3281 		    if (pswit[ECHO_SWITCH])
  3282 		    {
  3283 			s=g_strndup(theline,eos-theline);
  3284 			g_print("\n%s\n",s);
  3285 			g_free(s);
  3286 		    }
  3287 		    if (!pswit[OVERVIEW_SWITCH])
  3288 			g_print("    Line %ld - No CR?\n",lcnt);
  3289 		    else
  3290 			cnt_lineend++;
  3291 		}
  3292 	    }
  3293 	    break;
  3294 	}
  3295 	if (c=='\r')
  3296 	{
  3297 	    if (newlines==OS9_NEWLINES)
  3298 		break;
  3299 	    if (isCR || newlines==UNIX_NEWLINES)
  3300 	    {
  3301 		if (pswit[LINE_END_SWITCH])
  3302 		{
  3303 		    if (pswit[ECHO_SWITCH])
  3304 		    {
  3305 			s=g_strndup(theline,eos-theline);
  3306 			g_print("\n%s\n",s);
  3307 			g_free(s);
  3308 		    }
  3309 		    if (!pswit[OVERVIEW_SWITCH])
  3310 		    {
  3311 			if (newlines==UNIX_NEWLINES)
  3312 			    g_print("    Line %ld column %ld - Embedded CR?\n",
  3313 			      lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
  3314 			else
  3315 			    g_print("    Line %ld - Two successive CRs?\n",
  3316 			      lcnt);
  3317 		    }
  3318 		    else
  3319 			cnt_lineend++;
  3320 		}
  3321 		if (newlines==UNIX_NEWLINES)
  3322 		    *eos=' ';
  3323 	    }
  3324 	    if (newlines==DOS_NEWLINES)
  3325 		isCR=TRUE;
  3326 	}
  3327 	else
  3328 	{
  3329 	    if (pswit[LINE_END_SWITCH] && isCR)
  3330 	    {
  3331 		if (pswit[ECHO_SWITCH])
  3332 		{
  3333 		    s=g_strndup(theline,eos-theline);
  3334 		    g_print("\n%s\n",s);
  3335 		    g_free(s);
  3336 		}
  3337 		if (!pswit[OVERVIEW_SWITCH])
  3338 		    g_print("    Line %ld column %ld - CR without LF?\n",
  3339 		      lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
  3340 		else
  3341 		    cnt_lineend++;
  3342 		*eos=' ';
  3343 	    }
  3344 	    isCR=FALSE;
  3345 	    eos=g_utf8_next_char(eos);
  3346 	}
  3347     }
  3348     *eos='\0';
  3349     if (pswit[MARKUP_SWITCH])  
  3350 	postprocess_for_HTML(theline);
  3351     if (pswit[DP_SWITCH])  
  3352 	postprocess_for_DP(theline);
  3353     return theline;
  3354 }
  3355 
  3356 /*
  3357  * mixdigit:
  3358  *
  3359  * Takes a "word" as a parameter, and checks whether it
  3360  * contains a mixture of alpha and digits. Generally, this is an
  3361  * error, but may not be for cases like 4th or L5 12s. 3d.
  3362  *
  3363  * Returns: TRUE iff an is error found.
  3364  */
  3365 gboolean mixdigit(const char *checkword)
  3366 {
  3367     gboolean wehaveadigit,wehavealetter,query;
  3368     const char *s,*nondigit;
  3369     wehaveadigit=wehavealetter=query=FALSE;
  3370     for (s=checkword;*s;s=g_utf8_next_char(s))
  3371 	if (g_unichar_isalpha(g_utf8_get_char(s)))
  3372 	    wehavealetter=TRUE;
  3373 	else if (g_unichar_isdigit(g_utf8_get_char(s)))
  3374 	    wehaveadigit=TRUE;
  3375     if (wehaveadigit && wehavealetter)
  3376     {
  3377 	/* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
  3378 	query=TRUE;
  3379 	for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
  3380 	  nondigit=g_utf8_next_char(nondigit))
  3381 	    ;
  3382 	/* digits, ending in st, rd, nd, th of either case */
  3383 	if (!g_ascii_strcasecmp(nondigit,"st") ||
  3384 	  !g_ascii_strcasecmp(nondigit,"rd") ||
  3385 	  !g_ascii_strcasecmp(nondigit,"nd") ||
  3386 	  !g_ascii_strcasecmp(nondigit,"th"))
  3387 	    query=FALSE;
  3388 	if (!g_ascii_strcasecmp(nondigit,"sts") ||
  3389 	  !g_ascii_strcasecmp(nondigit,"rds") ||
  3390 	  !g_ascii_strcasecmp(nondigit,"nds") ||
  3391 	  !g_ascii_strcasecmp(nondigit,"ths"))
  3392 	    query=FALSE;
  3393 	if (!g_ascii_strcasecmp(nondigit,"stly") ||
  3394 	  !g_ascii_strcasecmp(nondigit,"rdly") ||
  3395 	  !g_ascii_strcasecmp(nondigit,"ndly") ||
  3396 	  !g_ascii_strcasecmp(nondigit,"thly"))
  3397 	    query=FALSE;
  3398 	/* digits, ending in l, L, s or d */
  3399 	if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
  3400 	  !strcmp(nondigit,"d"))
  3401 	    query=FALSE;
  3402 	/*
  3403 	 * L at the start of a number, representing Britsh pounds, like L500.
  3404 	 * This is cute. We know the current word is mixed digit. If the first
  3405 	 * letter is L, there must be at least one digit following. If both
  3406 	 * digits and letters follow, we have a genuine error, else we have a
  3407 	 * capital L followed by digits, and we accept that as a non-error.
  3408 	 */
  3409 	if (g_utf8_get_char(checkword)=='L' &&
  3410 	  !mixdigit(g_utf8_next_char(checkword)))
  3411 	    query=FALSE;
  3412     }
  3413     return query;
  3414 }
  3415 
  3416 /*
  3417  * getaword:
  3418  *
  3419  * Extracts the first/next "word" from the line, and returns it.
  3420  * A word is defined as one English word unit--or at least that's the aim.
  3421  * "ptr" is advanced to the position in the line where we will start
  3422  * looking for the next word.
  3423  * If line is non-NULL, then it will be used to derive the column numbers for
  3424  * any warnings issued. If line is NULL, then warnings will be suppressed.
  3425  *
  3426  * Returns: A newly-allocated string.
  3427  */
  3428 gchar *getaword(const char *line,const char **ptr)
  3429 {
  3430     const char *s,*t,*t2;
  3431     GString *word;
  3432     gunichar c,pc;
  3433     int adjust;
  3434     gboolean initial_underlining=FALSE;
  3435     word=g_string_new(NULL);
  3436     for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
  3437       !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
  3438       **ptr;*ptr=g_utf8_next_char(*ptr))
  3439     {
  3440 	/* Handle exceptions for footnote markers like [1] */
  3441 	if (g_utf8_get_char(*ptr)=='[')
  3442 	{
  3443 	    g_string_append_c(word,'[');
  3444 	    s=g_utf8_next_char(*ptr);
  3445 	    for (;g_unichar_isdigit(g_utf8_get_char(s));s=g_utf8_next_char(s))
  3446 		g_string_append_unichar(word,g_utf8_get_char(s));
  3447 	    if (g_utf8_get_char(s)==']')
  3448 	    {
  3449 		g_string_append_c(word,']');
  3450 		*ptr=g_utf8_next_char(s);
  3451 		return g_string_free(word,FALSE);
  3452 	    }
  3453 	    else
  3454 		g_string_truncate(word,0);
  3455 	}
  3456 	initial_underlining=g_utf8_get_char(*ptr)=='_';
  3457     }
  3458     /*
  3459      * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
  3460      * Especially yucky is the case of L1,000
  3461      * This section looks for a pattern of characters including a digit
  3462      * followed by a comma or period followed by one or more digits.
  3463      * If found, it returns this whole pattern as a word; otherwise we discard
  3464      * the results and resume our normal programming.
  3465      */
  3466     s=*ptr;
  3467     for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
  3468       g_unichar_isalpha(g_utf8_get_char(s)) ||
  3469       g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
  3470 	g_string_append_unichar(word,g_utf8_get_char(s));
  3471     if (word->len)
  3472     {
  3473 	for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))
  3474 	{
  3475 	    c=g_utf8_get_char(t);
  3476 	    pc=g_utf8_get_char(g_utf8_prev_char(t));
  3477 	    if ((c=='.' || c==',') && g_unichar_isdigit(pc))
  3478 	    {
  3479 		*ptr=s;
  3480 		return g_string_free(word,FALSE);
  3481 	    }
  3482 	}
  3483     }
  3484     /* we didn't find a punctuated number - do the regular getword thing */
  3485     g_string_truncate(word,0);
  3486     s=*ptr;
  3487     c=g_utf8_get_char(s);
  3488     for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || c=='_' ||
  3489       CHAR_IS_APOSTROPHE(c); s=g_utf8_next_char(s),c=g_utf8_get_char(s))
  3490 	g_string_append_unichar(word,c);
  3491     if (initial_underlining && word->str[word->len-1]=='_')
  3492     {
  3493 	/* _Simple_ or _Old-school_underlining_ */
  3494 	t=strchr(*ptr,'_');
  3495 	g_string_truncate(word,t-*ptr);
  3496 	*ptr=t;
  3497     }
  3498     else if (initial_underlining || (t=strchr(word->str,'_')))
  3499     {
  3500 	/* Part_ial_ underlining */
  3501 	adjust=0;
  3502 	if (initial_underlining)
  3503 	{
  3504 	    t2=strchr(word->str,'_');
  3505 	    if (t2)
  3506 	    {
  3507 		g_string_erase(word,t2-word->str,1);
  3508 		adjust++;
  3509 	    }
  3510 	    else
  3511 	    {
  3512 		if (line)
  3513 		{
  3514 		    if (pswit[ECHO_SWITCH])
  3515 			g_print("\n%s\n",line);
  3516 		    if (!pswit[OVERVIEW_SWITCH])
  3517 			g_print("    Line %ld column %ld - "
  3518 			  "Missing space or underscore?\n",linecnt,
  3519 			  g_utf8_pointer_to_offset(line,*ptr));
  3520 		    else
  3521 			cnt_punct++;
  3522 		}
  3523 		*ptr=s;
  3524 		return g_string_free(word,FALSE);
  3525 	    }
  3526 	}
  3527 	while ((t=strchr(word->str,'_')))
  3528 	{
  3529 	    t2=strchr(t+1,'_');
  3530 	    if (t2)
  3531 	    {
  3532 		g_string_erase(word,t-word->str,1);
  3533 		t2--;
  3534 		g_string_erase(word,t2-word->str,1);
  3535 		adjust+=2;
  3536 	    }
  3537 	    else
  3538 	    {
  3539 		g_string_truncate(word,t-word->str);
  3540 		adjust+=g_utf8_pointer_to_offset(word->str,t);
  3541 		*ptr=g_utf8_offset_to_pointer(*ptr,adjust);
  3542 		if (line)
  3543 		{
  3544 		    if (pswit[ECHO_SWITCH])
  3545 			g_print("\n%s\n",line);
  3546 		    if (!pswit[OVERVIEW_SWITCH])
  3547 			g_print("    Line %ld column %ld - "
  3548 			  "Missing space or underscore?\n",linecnt,
  3549 			  g_utf8_pointer_to_offset(line,*ptr)+1);
  3550 		    else
  3551 			cnt_punct++;
  3552 		}
  3553 		return g_string_free(word,FALSE);
  3554 	    }
  3555 	}
  3556 	*ptr=s;
  3557     }
  3558     else
  3559 	/* No underlining */
  3560 	*ptr=s;
  3561     return g_string_free(word,FALSE);
  3562 }
  3563 
  3564 /*
  3565  * isroman:
  3566  *
  3567  * Is this word a Roman Numeral?
  3568  *
  3569  * It doesn't actually validate that the number is a valid Roman Numeral--for
  3570  * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
  3571  * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
  3572  * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
  3573  * expressions thereof, except when it came to taxes. Allow any number of M,
  3574  * an optional D, an optional CM or CD, any number of optional Cs, an optional
  3575  * XL or an optional XC, an optional IX or IV, an optional V and any number
  3576  * of optional Is.
  3577  */
  3578 gboolean isroman(const char *t)
  3579 {
  3580     const char *s;
  3581     if (!t || !*t)
  3582 	return FALSE;
  3583     s=t;
  3584     while (g_utf8_get_char(t)=='m' && *t)
  3585 	t++;
  3586     if (g_utf8_get_char(t)=='d')
  3587 	t++;
  3588     if (g_str_has_prefix(t,"cm"))
  3589 	t+=2;
  3590     if (g_str_has_prefix(t,"cd"))
  3591 	t+=2;
  3592     while (g_utf8_get_char(t)=='c' && *t)
  3593 	t++;
  3594     if (g_str_has_prefix(t,"xl"))
  3595 	t+=2;
  3596     if (g_str_has_prefix(t,"xc"))
  3597 	t+=2;
  3598     if (g_utf8_get_char(t)=='l')
  3599 	t++;
  3600     while (g_utf8_get_char(t)=='x' && *t)
  3601 	t++;
  3602     if (g_str_has_prefix(t,"ix"))
  3603 	t+=2;
  3604     if (g_str_has_prefix(t,"iv"))
  3605 	t+=2;
  3606     if (g_utf8_get_char(t)=='v')
  3607 	t++;
  3608     while (g_utf8_get_char(t)=='i' && *t)
  3609 	t++;
  3610     return !*t;
  3611 }
  3612 
  3613 /*
  3614  * postprocess_for_DP:
  3615  *
  3616  * Invoked with the -d switch from flgets().
  3617  * It simply "removes" from the line a hard-coded set of common
  3618  * DP-specific tags, so that the line passed to the main routine has
  3619  * been pre-cleaned of DP markup.
  3620  */
  3621 void postprocess_for_DP(char *theline)
  3622 {
  3623     char *s,*t;
  3624     int i;
  3625     if (!*theline) 
  3626 	return;
  3627     for (i=0;*DPmarkup[i];i++)
  3628 	while ((s=strstr(theline,DPmarkup[i])))
  3629 	{
  3630 	    t=s+strlen(DPmarkup[i]);
  3631 	    memmove(s,t,strlen(t)+1);
  3632 	}
  3633 }
  3634 
  3635 /*
  3636  * postprocess_for_HTML:
  3637  *
  3638  * Invoked with the -m switch from flgets().
  3639  * It simply "removes" from the line a hard-coded set of common
  3640  * HTML tags and "replaces" a hard-coded set of common HTML
  3641  * entities, so that the line passed to the main routine has
  3642  * been pre-cleaned of HTML.
  3643  */
  3644 void postprocess_for_HTML(char *theline)
  3645 {
  3646     while (losemarkup(theline))
  3647 	;
  3648     loseentities(theline);
  3649 }
  3650 
  3651 char *losemarkup(char *theline)
  3652 {
  3653     char *s,*t;
  3654     int i;
  3655     s=strchr(theline,'<');
  3656     t=s?strchr(s,'>'):NULL;
  3657     if (!s || !t)
  3658 	return NULL;
  3659     for (i=0;*markup[i];i++)
  3660 	if (tagcomp(g_utf8_next_char(s),markup[i]))
  3661 	{
  3662 	    t=g_utf8_next_char(t);
  3663 	    memmove(s,t,strlen(t)+1);
  3664 	    return s;
  3665 	}
  3666     /* It's an unrecognized <xxx>. */
  3667     return NULL;
  3668 }
  3669 
  3670 void loseentities(char *theline)
  3671 {
  3672     int i;
  3673     gsize nb;
  3674     char *amp,*scolon;
  3675     gchar *s,*t;
  3676     gunichar c;
  3677     GTree *entities=NULL;
  3678     static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
  3679     if (!theline)
  3680     {
  3681 	if (entities)
  3682 	    g_tree_destroy(entities);
  3683 	entities=NULL;
  3684 	if (translit!=(GIConv)-1)
  3685 	    g_iconv_close(translit);
  3686 	translit=(GIConv)-1;
  3687 	if (to_utf8!=(GIConv)-1)
  3688 	    g_iconv_close(to_utf8);
  3689 	to_utf8=(GIConv)-1;
  3690 	return;
  3691     }
  3692     if (!*theline)
  3693 	return;
  3694     if (!entities)
  3695     {
  3696 	entities=g_tree_new((GCompareFunc)strcmp);
  3697 	for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
  3698 	    g_tree_insert(entities,HTMLentities[i].name,
  3699 	      GUINT_TO_POINTER(HTMLentities[i].c));
  3700     }
  3701     if (translit==(GIConv)-1)
  3702 	translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
  3703     if (to_utf8==(GIConv)-1)
  3704 	to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
  3705     while((amp=strchr(theline,'&')))
  3706     {
  3707 	scolon=strchr(amp,';');
  3708 	if (scolon)
  3709 	{
  3710 	    if (amp[1]=='#')
  3711 	    {
  3712 		if (amp+2+strspn(amp+2,"0123456789")==scolon)
  3713 		    c=strtol(amp+2,NULL,10);
  3714 		else if (amp[2]=='x' &&
  3715 		  amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
  3716 		    c=strtol(amp+3,NULL,16);
  3717 	    }
  3718 	    else
  3719 	    {
  3720 		s=g_strndup(amp+1,scolon-(amp+1));
  3721 	        c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
  3722 		g_free(s);
  3723 	    }
  3724 	}
  3725 	else
  3726 	    c=0;
  3727 	if (c)
  3728 	{
  3729 	    theline=amp;
  3730 	    if (c<128 || c>=192 && c<=255)	/* An ISO-8859-1 character */
  3731 		theline+=g_unichar_to_utf8(c,theline);
  3732 	    else
  3733 	    {
  3734 		s=g_malloc(6);
  3735 		nb=g_unichar_to_utf8(c,s);
  3736 		t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
  3737 		g_free(s);
  3738 		s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
  3739 		g_free(t);
  3740 		memcpy(theline,s,nb);
  3741 		g_free(s);
  3742 		theline+=nb;
  3743 	    }
  3744 	    memmove(theline,g_utf8_next_char(scolon),
  3745 	      strlen(g_utf8_next_char(scolon))+1);
  3746 	}
  3747 	else
  3748 	    theline=g_utf8_next_char(amp);
  3749     }
  3750 }
  3751 
  3752 gboolean tagcomp(const char *strin,const char *basetag)
  3753 {
  3754     gboolean retval;
  3755     gchar *s,*t;
  3756     if (g_utf8_get_char(strin)=='/')
  3757 	t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
  3758     else
  3759 	t=g_utf8_casefold(strin,-1);
  3760     s=g_utf8_casefold(basetag,-1);
  3761     retval=g_str_has_prefix(t,s);
  3762     g_free(s);
  3763     g_free(t);
  3764     return retval;
  3765 }
  3766 
  3767 void proghelp(GOptionContext *context)
  3768 {
  3769     gchar *help;
  3770     fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
  3771     fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
  3772     fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
  3773     fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
  3774       "For details, read the file COPYING.\n",stderr);
  3775     fputs("This is Free Software; "
  3776       "you may redistribute it under certain conditions (GPL);\n",stderr);
  3777     fputs("read the file COPYING for details.\n\n",stderr);
  3778     help=g_option_context_get_help(context,TRUE,NULL);
  3779     fputs(help,stderr);
  3780     g_free(help);
  3781     fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
  3782     fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
  3783       "non-ASCII\n",stderr);
  3784     fputs("characters like accented letters, "
  3785       "lines longer than 75 or shorter than 55,\n",stderr);
  3786     fputs("unbalanced quotes or brackets, "
  3787       "a variety of badly formatted punctuation, \n",stderr);
  3788     fputs("HTML tags, some likely typos. "
  3789       "It is NOT a substitute for human judgement.\n",stderr);
  3790     fputs("\n",stderr);
  3791 }