bookloupe/bookloupe.c
author ali <ali@juiblex.co.uk>
Mon Sep 30 08:18:42 2013 +0100 (2013-09-30)
changeset 152 da598b05f8e8
parent 150 fd584db1d305
parent 151 a485f5dcc2de
child 155 1f1d40127177
permissions -rw-r--r--
Bugs #13+14: charsets in configuration files
     1 /*************************************************************************/
     2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
     3 /*									 */
     4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>			 */
     5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>			 */
     6 /*									 */
     7 /* This program is free software; you can redistribute it and/or modify  */
     8 /* it under the terms of the GNU General Public License as published by  */
     9 /* the Free Software Foundation; either version 2 of the License, or     */
    10 /* (at your option) any later version.					 */
    11 /*									 */
    12 /* This program is distributed in the hope that it will be useful,       */
    13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of	 */
    14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the		 */
    15 /* GNU General Public License for more details.				 */
    16 /*									 */
    17 /* You should have received a copy of the GNU General Public License	 */
    18 /* along with this program. If not, see <http://www.gnu.org/licenses/>.	 */
    19 /*************************************************************************/
    20 
    21 #include <stdio.h>
    22 #include <stdlib.h>
    23 #include <string.h>
    24 #include <ctype.h>
    25 #ifdef __WIN32__
    26 #include <windows.h>
    27 #endif
    28 #include <glib.h>
    29 #include <bl/bl.h>
    30 #include "bookloupe.h"
    31 #include "counters.h"
    32 #include "pending.h"
    33 #include "HTMLentities.h"
    34 
    35 gchar *charset;		/* Or NULL for auto (ISO_8859-1/ASCII or UNICODE) */
    36 GIConv charset_validator=(GIConv)-1;
    37 
    38 gchar *prevline;
    39 
/*
 * Common typos.
 *
 * All entries are lower case and the list is terminated by an empty
 * string (not a NULL pointer).
 */
char *typo[] = {
    "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
    "nad", "te", "ig", "acn",  "ahve", "alot", "anbd", "andt", "awya", "aywa",
    "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
    "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
    "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
    "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
    "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
    "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
    "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
    "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
    "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
    "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
    "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
    "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
    "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
    "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
    "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
    "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
    "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
    "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
    "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
    "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
    "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
    "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
    "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
    "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
    "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
    "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
    "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
    "se", ""
};
    73 
    74 GTree *usertypo;
    75 
/*
 * Common abbreviations and other OK words not to query as typos.
 *
 * All entries are lower case and the list is terminated by an empty
 * string.
 */
char *okword[] = {
    "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
    "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
    "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
    "outbid", "outbids", "frostbite", "frostbitten", ""
};
    83 
/*
 * Common abbreviations that cause otherwise unexplained periods.
 *
 * Entries are stored without the trailing period; the list is terminated
 * by an empty string.
 */
char *abbrev[] = {
    "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
    "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
};
    89 
/*
 * Two-Letter combinations that rarely if ever start words,
 * but are common scannos or otherwise common letter combinations.
 *
 * The list is terminated by an empty string.
 */
char *nostart[] = {
    "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
};
    97 
/*
 * Two-Letter combinations that rarely if ever end words,
 * but are common scannos or otherwise common letter combinations.
 *
 * The list is terminated by an empty string.
 */
char *noend[] = {
    "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
    "sw", "gr", "sl", "cl", "iy", ""
};
   106 
/*
 * HTML element names, lower case and without the angle brackets.
 * The list is terminated by an empty string.
 *
 * NOTE(review): presumably the set of tags the HTML/markup checks
 * recognise; the code that consumes this table is not in view here.
 */
char *markup[] = {
    "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
    "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
    "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
    "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
};
   113 
/*
 * Literal markup sequences, stored exactly as they appear in the text.
 * The list is terminated by an empty string.
 *
 * NOTE(review): presumably the DP-specific markup that the --dp switch
 * asks to have ignored; the consumer of this table is not in view here.
 */
char *DPmarkup[] = {
    "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
};
   117 
/*
 * Lower-case word list terminated by an empty string.
 *
 * NOTE(review): going by the name, these look like words that are not
 * expected to be directly followed by a comma; confirm against the code
 * that consumes the table. "mrs" is listed twice.
 */
char *nocomma[] = {
    "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
    "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
    "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
    "during", "let", "toward", "among", ""
};
   124 
/*
 * Lower-case word list terminated by an empty string.
 *
 * NOTE(review): going by the name, these look like words that are not
 * expected to be directly followed by a period; confirm against the code
 * that consumes the table.
 */
char *noperiod[] = {
    "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
    "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
    "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
    "among", "those", "into", "whom", "having", "thence", ""
}; 
   131 
   132 gboolean pswit[SWITNO];  /* program switches */
   133 gchar *opt_charset;
   134 
   135 gboolean typo_compat,paranoid_compat;
   136 
   137 static GOptionEntry options[]={
   138     { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
   139       "Ignore DP-specific markup", NULL },
   140     { "no-dp", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   141       G_OPTION_ARG_NONE, pswit+DP_SWITCH,
   142       "Don't ignore DP-specific markup", NULL },
   143     { "echo", 0, G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
   144       "Echo queried line", NULL },
   145     { "no-echo", 'e', G_OPTION_FLAG_REVERSE,
   146       G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
   147       "Don't echo queried line", NULL },
   148     { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
   149       "Check single quotes", NULL },
   150     { "no-squote", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   151       G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
   152       "Don't check single quotes", NULL },
   153     { "typo", 0, 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
   154       "Check common typos", NULL },
   155     { "no-typo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   156       G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
   157       "Don't check common typos", NULL },
   158     { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
   159       "Require closure of quotes on every paragraph", NULL },
   160     { "no-qpara", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   161       G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
   162       "Don't require closure of quotes on every paragraph", NULL },
   163     { "paranoid", 0, G_OPTION_FLAG_HIDDEN,
   164       G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
   165       "Enable paranoid querying of everything", NULL },
   166     { "no-paranoid", 0, G_OPTION_FLAG_REVERSE,
   167       G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
   168       "Disable paranoid querying of everything", NULL },
   169     { "line-end", 0, G_OPTION_FLAG_HIDDEN,
   170       G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
   171       "Enable line end checking", NULL },
   172     { "no-line-end", 'l', G_OPTION_FLAG_REVERSE,
   173       G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
   174       "Diable line end checking", NULL },
   175     { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
   176       "Overview: just show counts", NULL },
   177     { "no-overview", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   178       G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
   179       "Show individual warnings", NULL },
   180     { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
   181       "Output errors to stdout instead of stderr", NULL },
   182     { "no-stdout", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   183       G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
   184       "Output errors to stderr instead of stdout", NULL },
   185     { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
   186       "Echo header fields", NULL },
   187     { "no-header", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   188       G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
   189       "Don't echo header fields", NULL },
   190     { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
   191       "Ignore markup in < >", NULL },
   192     { "no-markup", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   193       G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
   194       "No special handling for markup in < >", NULL },
   195     { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
   196       "Use file of user-defined typos", NULL },
   197     { "no-usertypo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   198       G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
   199       "Ignore file of user-defined typos", NULL },
   200     { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
   201       "Verbose - list everything", NULL },
   202     { "no-verbose", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
   203       G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
   204       "Switch off verbose mode", NULL },
   205     { "charset", 0, 0, G_OPTION_ARG_STRING, &opt_charset,
   206       "Set of characters valid for this ebook", "NAME" },
   207     { NULL }
   208 };
   209 
   210 /*
   211  * Options relating to configuration which make no sense from inside
   212  * a configuration file.
   213  */
   214 
   215 static GOptionEntry config_options[]={
   216     { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
   217       "Defaults for use on www upload", NULL },
   218     { "dump-config", 0, 0, G_OPTION_ARG_NONE, pswit+DUMP_CONFIG_SWITCH,
   219       "Dump current config settings", NULL },
   220     { NULL }
   221 };
   222 
   223 static GOptionEntry compatibility_options[]={
   224     { "toggle-typo", 't', 0, G_OPTION_ARG_NONE, &typo_compat,
   225       "Toggle checking for common typos", NULL },
   226     { "toggle-relaxed", 'x', 0, G_OPTION_ARG_NONE, &paranoid_compat,
   227       "Toggle both paranoid mode and common typos", NULL },
   228     { NULL }
   229 };
   230 
/*
 * Query and line counters.
 *
 * In overview mode main() prints the line counts and every non-zero
 * cnt_* query counter, followed by their total. cnt_spacend is neither
 * printed nor included in that total.
 */
long cnt_quote;		/* for overview mode, count of quote queries */
long cnt_brack;		/* for overview mode, count of brackets queries */
long cnt_bin;		/* for overview mode, count of non-ASCII queries */
long cnt_odd;		/* for overview mode, count of odd character queries */
long cnt_long;		/* for overview mode, count of long line errors */
long cnt_short;		/* for overview mode, count of short line queries */
long cnt_punct;		/* for overview mode,
			   count of punctuation and spacing queries */
long cnt_dash;		/* for overview mode, count of dash-related queries */
long cnt_word;		/* for overview mode, count of word queries */
long cnt_html;		/* for overview mode, count of html queries */
long cnt_lineend;	/* for overview mode, count of line-end queries */
long cnt_spacend;	/* count of lines with space at end */
long linecnt;		/* count of total lines in the file */
long checked_linecnt;	/* count of lines actually checked */
   246 
   247 void proghelp(GOptionContext *context);
   248 void procfile(const char *);
   249 
   250 gchar *running_from;
   251 
   252 gboolean mixdigit(const char *);
   253 gchar *getaword(const char **);
   254 char *flgets(char **,long);
   255 void postprocess_for_HTML(char *);
   256 char *linehasmarkup(char *);
   257 char *losemarkup(char *);
   258 gboolean tagcomp(const char *,const char *);
   259 void loseentities(char *);
   260 gboolean isroman(const char *);
   261 void postprocess_for_DP(char *);
   262 void print_as_windows_1252(const char *string);
   263 void print_as_utf_8(const char *string);
   264 
   265 GTree *qword,*qperiod;
   266 
   267 #ifdef __WIN32__
   268 UINT saved_cp;
   269 #endif
   270 
   271 gboolean set_charset(const char *name,GError **err)
   272 {
   273     /* The various UNICODE encodings all share the same character set. */
   274     const char *unicode_aliases[]={ "UCS-2", "UCS-2BE", "UCS-2LE", "UCS-4",
   275       "UCS-4BE", "UCS-4LE", "UCS2", "UCS4", "UNICODE", "UNICODEBIG",
   276       "UNICODELITTLE", "UTF-7", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE",
   277       "UTF-32", "UTF-32BE", "UTF-32LE", "UTF7", "UTF8", "UTF16", "UTF16BE",
   278       "UTF16LE", "UTF32", "UTF32BE", "UTF32LE" };
   279     int i;
   280     if (charset)
   281 	g_free(charset);
   282     if (charset_validator!=(GIConv)-1)
   283 	g_iconv_close(charset_validator);
   284     if (!name || !g_strcasecmp(name,"auto"))
   285     {
   286 	charset=NULL;
   287 	charset_validator=(GIConv)-1;
   288 	return TRUE;
   289     }
   290     else
   291 	charset=g_strdup(name);
   292     for(i=0;i<G_N_ELEMENTS(unicode_aliases);i++)
   293 	if (!g_strcasecmp(charset,unicode_aliases[i]))
   294 	{
   295 	    g_free(charset);
   296 	    charset=g_strdup("UTF-8");
   297 	    break;
   298 	}
   299     if (!strcmp(charset,"UTF-8"))
   300 	charset_validator=(GIConv)-1;
   301     else
   302     {
   303 	charset_validator=g_iconv_open(charset,"UTF-8");
   304 	if (charset_validator==(GIConv)-1)
   305 	{
   306 	    g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_NO_CONVERSION,
   307 	      "Unknown character set \"%s\"",charset);
   308 	    return FALSE;
   309 	}
   310     }
   311     return TRUE;
   312 }
   313 
   314 GKeyFile *config;
   315 
   316 void config_file_update(GKeyFile *kf)
   317 {
   318     int i;
   319     const char *s;
   320     gboolean sw;
   321     for(i=0;options[i].long_name;i++)
   322     {
   323 	if (g_str_has_prefix(options[i].long_name,"no-"))
   324 	    continue;
   325 	if (options[i].arg==G_OPTION_ARG_NONE)
   326 	{
   327 	    sw=*(gboolean *)options[i].arg_data;
   328 	    if (options[i].flags&G_OPTION_FLAG_REVERSE)
   329 		sw=!sw;
   330 	    g_key_file_set_boolean(kf,"options",options[i].long_name,sw);
   331 	}
   332 	else if (options[i].arg==G_OPTION_ARG_STRING)
   333 	{
   334 	    s=*(gchar **)options[i].arg_data;
   335 	    if (!s)
   336 		s="auto";
   337 	    g_key_file_set_string(kf,"options",options[i].long_name,s);
   338 	}
   339 	else
   340 	    g_assert_not_reached();
   341     }
   342 }
   343 
   344 void config_file_add_comments(GKeyFile *kf)
   345 {
   346     int i;
   347     gchar *comment;
   348     g_key_file_set_comment(kf,NULL,NULL," Default configuration for bookloupe",
   349       NULL);
   350     for(i=0;options[i].long_name;i++)
   351     {
   352 	if (g_str_has_prefix(options[i].long_name,"no-"))
   353 	    continue;
   354 	comment=g_strconcat(" ",options[i].description,NULL);
   355 	g_key_file_set_comment(kf,"options",options[i].long_name,comment,NULL);
   356 	g_free(comment);
   357     }
   358 }
   359 
   360 void dump_config(void)
   361 {
   362     gchar *s;
   363     if (config)
   364 	config_file_update(config);
   365     else
   366     {
   367 	config=g_key_file_new();
   368 	config_file_update(config);
   369 	config_file_add_comments(config);
   370     }
   371     s=g_key_file_to_data(config,NULL,NULL);
   372     if (s)
   373 	g_print("%s",s);
   374     g_free(s);
   375 }
   376 
   377 GKeyFile *read_config_file(gchar **full_path)
   378 {
   379     int i;
   380     GError *err=NULL;
   381     gchar **search_dirs;
   382     gchar *path;
   383     const char *search_path;
   384     GKeyFile *kf;
   385     kf=g_key_file_new();
   386     search_path=g_getenv("BOOKLOUPE_CONFIG_PATH");
   387     if (search_path)
   388     {
   389 #ifdef __WIN32__
   390 	search_dirs=g_strsplit(search_path,";",0);
   391 #else
   392 	search_dirs=g_strsplit(search_path,":",0);
   393 #endif
   394     }
   395     else
   396     {
   397 	search_dirs=g_new(gchar *,4);
   398 	search_dirs[0]=g_get_current_dir();
   399 	search_dirs[1]=g_strdup(running_from);
   400 	search_dirs[2]=g_strdup(g_get_user_config_dir());
   401 	search_dirs[3]=NULL;
   402     }
   403     for(i=0;search_dirs[i];i++)
   404     {
   405 	path=g_build_filename(search_dirs[i],"bookloupe.ini",NULL);
   406 	if (g_key_file_load_from_file(kf,path,
   407 	  G_KEY_FILE_KEEP_COMMENTS|G_KEY_FILE_KEEP_TRANSLATIONS,&err))
   408 	    break;
   409 	if (!g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   410 	{
   411 	    g_printerr("Bookloupe: Error reading %s\n",path);
   412 	    g_printerr("%s\n",err->message);
   413 	    exit(1);
   414 	}
   415 	g_clear_error(&err);
   416 	g_free(path);
   417 	path=NULL;
   418     }
   419     if (!search_dirs[i])
   420     {
   421 	g_key_file_free(kf);
   422 	kf=NULL;
   423     }
   424     g_strfreev(search_dirs);
   425     if (full_path && kf)
   426 	*full_path=path;
   427     else
   428 	g_free(path);
   429     return kf;
   430 }
   431 
   432 void parse_config_file(void)
   433 {
   434     int i,j;
   435     gchar *path,*s;
   436     gchar **keys;
   437     gboolean sw;
   438     GError *err=NULL;
   439     config=read_config_file(&path);
   440     if (config)
   441 	keys=g_key_file_get_keys(config,"options",NULL,NULL);
   442     else
   443 	keys=NULL;
   444     if (keys)
   445     {
   446 	for(i=0;keys[i];i++)
   447 	{
   448 	    for(j=0;options[j].long_name;j++)
   449 	    {
   450 		if (g_str_has_prefix(options[j].long_name,"no-"))
   451 		    continue;
   452 		else if (!strcmp(keys[i],options[j].long_name))
   453 		{
   454 		    if (options[j].arg==G_OPTION_ARG_NONE)
   455 		    {
   456 			sw=g_key_file_get_boolean(config,"options",keys[i],
   457 			  &err);
   458 			if (err)
   459 			{
   460 			    g_printerr("Bookloupe: %s: options.%s: %s\n",
   461 			      path,keys[i],err->message);
   462 			    g_clear_error(&err);
   463 			}
   464 			else
   465 			{
   466 			    if (options[j].flags&G_OPTION_FLAG_REVERSE)
   467 				sw=!sw;
   468 			    *(gboolean *)options[j].arg_data=sw;
   469 			}
   470 			break;
   471 		    }
   472 		    else if (options[j].arg==G_OPTION_ARG_STRING)
   473 		    {
   474 			s=g_key_file_get_string(config,"options",keys[i],
   475 			  &err);
   476 			if (err)
   477 			{
   478 			    g_printerr("Bookloupe: %s: options.%s: %s\n",
   479 			      path,keys[i],err->message);
   480 			    g_clear_error(&err);
   481 			}
   482 			else
   483 			{
   484 			    g_free(*(gchar **)options[j].arg_data);
   485 			    if (!g_strcmp0(s,"auto"))
   486 			    {
   487 				*(gchar **)options[j].arg_data=NULL;
   488 				g_free(s);
   489 			    }
   490 			    else
   491 				*(gchar **)options[j].arg_data=s;
   492 			}
   493 			break;
   494 		    }
   495 		    else
   496 			g_assert_not_reached();
   497 		}
   498 	    }
   499 	    if (!options[j].long_name)
   500 		g_printerr("Bookloupe: %s: Unknown option \"%s\" ignored\n",
   501 		  path,keys[i]);
   502 	}
   503 	g_strfreev(keys);
   504     }
   505     if (config)
   506 	g_free(path);
   507 }
   508 
   509 void parse_options(int *argc,char ***argv)
   510 {
   511     GError *err=NULL;
   512     GOptionContext *context;
   513     GOptionGroup *compatibility;
   514     context=g_option_context_new(
   515       "file - look for errors in Project Gutenberg(TM) etexts");
   516     g_option_context_add_main_entries(context,options,NULL);
   517     g_option_context_add_main_entries(context,config_options,NULL);
   518     compatibility=g_option_group_new("compatibility",
   519       "Options for Compatibility with Gutcheck:",
   520       "Show compatibility options",NULL,NULL);
   521     g_option_group_add_entries(compatibility,compatibility_options);
   522     g_option_context_add_group(context,compatibility);
   523     g_option_context_set_description(context,
   524       "For simplicity, only the switch options which reverse the\n"
   525       "default configuration are listed. In most cases, both vanilla\n"
   526       "and \"no-\" prefixed versions are available for use.");
   527     if (!g_option_context_parse(context,argc,argv,&err))
   528     {
   529 	g_printerr("Bookloupe: %s\n",err->message);
   530 	g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
   531 	exit(1);
   532     }
   533     if (typo_compat)
   534 	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
   535     if (paranoid_compat)
   536     {
   537 	pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
   538 	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
   539     }
   540     /*
   541      * Web uploads - for the moment, this is really just a placeholder
   542      * until we decide what processing we really want to do on web uploads
   543      */
   544     if (pswit[WEB_SWITCH])
   545     {
   546 	/* specific override for web uploads */
   547 	pswit[ECHO_SWITCH]=TRUE;
   548 	pswit[SQUOTE_SWITCH]=FALSE;
   549 	pswit[TYPO_SWITCH]=TRUE;
   550 	pswit[QPARA_SWITCH]=FALSE;
   551 	pswit[PARANOID_SWITCH]=TRUE;
   552 	pswit[LINE_END_SWITCH]=FALSE;
   553 	pswit[OVERVIEW_SWITCH]=FALSE;
   554 	pswit[STDOUT_SWITCH]=FALSE;
   555 	pswit[HEADER_SWITCH]=TRUE;
   556 	pswit[VERBOSE_SWITCH]=FALSE;
   557 	pswit[MARKUP_SWITCH]=FALSE;
   558 	pswit[USERTYPO_SWITCH]=FALSE;
   559 	pswit[DP_SWITCH]=FALSE;
   560     }
   561     if (opt_charset && !set_charset(opt_charset,&err))
   562     {
   563 	g_printerr("%s\n",err->message);
   564 	exit(1);
   565     }
   566     if (pswit[DUMP_CONFIG_SWITCH])
   567     {
   568 	dump_config();
   569 	exit(0);
   570     }
   571     g_free(opt_charset);
   572     opt_charset=NULL;
   573     if (pswit[OVERVIEW_SWITCH])
   574 	/* just print summary; don't echo */
   575 	pswit[ECHO_SWITCH]=FALSE;
   576     if (*argc<2)
   577     {
   578 	proghelp(context);
   579 	exit(1);
   580     }
   581     g_option_context_free(context);
   582 }
   583 
   584 /*
   585  * read_user_scannos:
   586  *
   587  * Read in the user-defined stealth scanno list.
   588  */
   589 void read_user_scannos(void)
   590 {
   591     GError *err=NULL;
   592     gchar *usertypo_file;
   593     gboolean okay;
   594     int i;
   595     gsize len,nb;
   596     gchar *contents,*utf8,**lines;
   597     usertypo_file=g_strdup("bookloupe.typ");
   598     okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   599     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   600     {
   601 	g_clear_error(&err);
   602 	g_free(usertypo_file);
   603 	usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
   604 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   605     }
   606     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   607     {
   608 	g_clear_error(&err);
   609 	g_free(usertypo_file);
   610 	usertypo_file=g_strdup("gutcheck.typ");
   611 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   612     }
   613     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   614     {
   615 	g_clear_error(&err);
   616 	g_free(usertypo_file);
   617 	usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
   618 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   619     }
   620     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   621     {
   622 	g_free(usertypo_file);
   623 	g_print("   --> I couldn't find bookloupe.typ "
   624 	  "-- proceeding without user typos.\n");
   625 	return;
   626     }
   627     else if (!okay)
   628     {
   629 	fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
   630 	g_free(usertypo_file);
   631 	g_clear_error(&err);
   632 	exit(1);
   633     }
   634     if (g_utf8_validate(contents,len,NULL))
   635     {
   636 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
   637 	if (!charset)
   638 	    (void)set_charset("UNICODE",NULL);
   639     }
   640     else
   641 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
   642     g_free(contents);
   643     lines=g_strsplit_set(utf8,"\r\n",0);
   644     g_free(utf8);
   645     usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
   646     for (i=0;lines[i];i++)
   647 	if (*(unsigned char *)lines[i]>'!')
   648 	    g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
   649 	else
   650 	    g_free(lines[i]);
   651     g_free(lines);
   652 }
   653 
   654 /*
   655  * read_etext:
   656  *
   657  * Read an etext returning a newly allocated string containing the file
   658  * contents or NULL on error.
   659  */
   660 gchar *read_etext(const char *filename,GError **err)
   661 {
   662     GError *tmp_err=NULL;
   663     gchar *contents,*utf8;
   664     gsize len,bytes_read,bytes_written;
   665     int i,line,col;
   666     if (!g_file_get_contents(filename,&contents,&len,err))
   667 	return NULL;
   668     if (g_utf8_validate(contents,len,NULL))
   669     {
   670 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
   671 	g_set_print_handler(print_as_utf_8);
   672 #ifdef __WIN32__
   673 	SetConsoleOutputCP(CP_UTF8);
   674 #endif
   675     }
   676     else
   677     {
   678 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,
   679 	  &bytes_written,&tmp_err);
   680 	if (g_error_matches(tmp_err,G_CONVERT_ERROR,
   681 	  G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
   682 	{
   683 	    line=col=1;
   684 	    for(i=0;i<bytes_read;i++)
   685 		if (contents[i]=='\n')
   686 		{
   687 		    line++;
   688 		    col=1;
   689 		}
   690 		else if (contents[i]!='\r')
   691 		    col++;
   692 	    g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
   693 	      "Input conversion failed. Byte %d at line %d, column %d is not a "
   694 	      "valid Windows-1252 character",
   695 	      ((unsigned char *)contents)[bytes_read],line,col);
   696 	}
   697 	else if (tmp_err)
   698 	    g_propagate_error(err,tmp_err);
   699 	g_set_print_handler(print_as_windows_1252);
   700 #ifdef __WIN32__
   701 	SetConsoleOutputCP(1252);
   702 #endif
   703     }
   704     g_free(contents);
   705     return utf8;
   706 }
   707 
   708 void cleanup_on_exit(void)
   709 {
   710 #ifdef __WIN32__
   711     SetConsoleOutputCP(saved_cp);
   712 #endif
   713 }
   714 
/*
 * main:
 *
 * Set the built-in defaults, apply the configuration file and then the
 * command line on top of them, check the etext named by the first
 * remaining argument and, in overview mode, print a summary of the
 * number of queries of each kind.
 */
int main(int argc,char **argv)
{
#ifdef __WIN32__
    atexit(cleanup_on_exit);
    saved_cp=GetConsoleOutputCP();
#endif
    /* Directory of the executable; searched for config and typo files */
    running_from=g_path_get_dirname(argv[0]);
    /* Paranoid checking is turned OFF, not on, by its switch */
    pswit[PARANOID_SWITCH]=TRUE;
    /* if running in paranoid mode, typo checks default to enabled */
    pswit[TYPO_SWITCH]=TRUE;
    /* Line-end checking is turned OFF, not on, by its switch */
    pswit[LINE_END_SWITCH]=TRUE;
    /* Echoing is turned OFF, not on, by its switch */
    pswit[ECHO_SWITCH]=TRUE;
    /* Configuration file first so that the command line can override it */
    parse_config_file();
    parse_options(&argc,&argv);
    if (pswit[USERTYPO_SWITCH])
	read_user_scannos();
    fprintf(stderr,"bookloupe: Check and report on an e-text\n");
    /*
     * parse_options() exits unless at least one argument remains, so
     * argv[1] is valid here. Only this first file is processed.
     */
    procfile(argv[1]);
    if (pswit[OVERVIEW_SWITCH])
    {
	g_print("    Checked %ld lines of %ld (head+foot = %ld)\n\n",
	  checked_linecnt,linecnt,linecnt-checked_linecnt);
	g_print("    --------------- Queries found --------------\n");
	/* Categories with no queries are left out of the list */
	if (cnt_long)
	    g_print("    Long lines:		    %14ld\n",cnt_long);
	if (cnt_short)
	    g_print("    Short lines:		   %14ld\n",cnt_short);
	if (cnt_lineend)
	    g_print("    Line-end problems:	     %14ld\n",cnt_lineend);
	if (cnt_word)
	    g_print("    Common typos:		  %14ld\n",cnt_word);
	if (cnt_quote)
	    g_print("    Unmatched quotes:	      %14ld\n",cnt_quote);
	if (cnt_brack)
	    g_print("    Unmatched brackets:	    %14ld\n",cnt_brack);
	if (cnt_bin)
	    g_print("    Non-ASCII characters:	  %14ld\n",cnt_bin);
	if (cnt_odd)
	    g_print("    Proofing characters:	   %14ld\n",cnt_odd);
	if (cnt_punct)
	    g_print("    Punctuation & spacing queries: %14ld\n",cnt_punct);
	if (cnt_dash)
	    g_print("    Non-standard dashes:	   %14ld\n",cnt_dash);
	if (cnt_html)
	    g_print("    Possible HTML tags:	    %14ld\n",cnt_html);
	g_print("\n");
	g_print("    TOTAL QUERIES		  %14ld\n",
	  cnt_quote+cnt_brack+cnt_bin+cnt_odd+cnt_long+cnt_short+cnt_punct+
	  cnt_dash+cnt_word+cnt_html+cnt_lineend);
    }
    g_free(running_from);
    if (usertypo)
	g_tree_unref(usertypo);
    /* Releases the character set name and closes any validator */
    set_charset(NULL,NULL);
    if (config)
	g_key_file_free(config);
    return 0;
}
   776 
   777 /*
   778  * first_pass:
   779  *
   780  * Run a first pass - verify that it's a valid PG
   781  * file, decide whether to report some things that
   782  * occur many times in the text like long or short
   783  * lines, non-standard dashes, etc.
   784  */
   785 struct first_pass_results *first_pass(const char *etext)
   786 {
   787     gunichar laststart=CHAR_SPACE;
   788     const char *s;
   789     gchar *lc_line;
   790     int i,j,lbytes,llen;
   791     gchar **lines;
   792     unsigned int lastlen=0,lastblen=0;
   793     long spline=0,nspline=0;
   794     static struct first_pass_results results={0};
   795     gchar *inword;
   796     QuoteClass qc;
   797     lines=g_strsplit(etext,"\n",0);
   798     for (j=0;lines[j];j++)
   799     {
   800 	lbytes=strlen(lines[j]);
   801 	while (lbytes>0 && lines[j][lbytes-1]=='\r')
   802 	    lines[j][--lbytes]='\0';
   803 	llen=g_utf8_strlen(lines[j],lbytes);
   804 	linecnt++;
   805 	if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
   806 	  (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
   807 	{
   808 	    if (spline)
   809 		g_print("   --> Duplicate header?\n");
   810 	    spline=linecnt+1;   /* first line of non-header text, that is */
   811 	}
   812 	if (!strncmp(lines[j],"*** START",9) &&
   813 	  strstr(lines[j],"PROJECT GUTENBERG"))
   814 	{
   815 	    if (nspline)
   816 		g_print("   --> Duplicate header?\n");
   817 	    nspline=linecnt+1;   /* first line of non-header text, that is */
   818 	}
   819 	if (spline || nspline)
   820 	{
   821 	    lc_line=g_utf8_strdown(lines[j],lbytes);
   822 	    if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
   823 	    {
   824 		if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
   825 		{
   826 		    if (results.footerline)
   827 		    {
   828 			/* it's an old-form header - we can detect duplicates */
   829 			if (!nspline)
   830 			    g_print("   --> Duplicate footer?\n");
   831 		    }
   832 		    else
   833 			results.footerline=linecnt;
   834 		}
   835 	    }
   836 	    g_free(lc_line);
   837 	}
   838 	if (spline)
   839 	    results.firstline=spline;
   840 	if (nspline)
   841 	    results.firstline=nspline;  /* override with new */
   842 	if (results.footerline)
   843 	    continue;    /* don't count the boilerplate in the footer */
   844 	results.totlen+=llen;
   845 	for (s=lines[j];*s;s=g_utf8_next_char(s))
   846 	{
   847 	    if (g_utf8_get_char(s)>127)
   848 		results.binlen++;
   849 	    if (g_unichar_isalpha(g_utf8_get_char(s)))
   850 		results.alphalen++;
   851 	    if (s>lines[j])
   852 	    {
   853 		if (CHAR_IS_DQUOTE(g_utf8_get_char(s)))
   854 		    qc=QUOTE_CLASS(g_utf8_get_char(s));
   855 		else
   856 		    qc=INVALID_QUOTE;
   857 		if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) &&
   858 		  isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
   859 		    results.endquote_count++;
   860 	    }
   861 	}
   862 	if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
   863 	  lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
   864 	    results.shortline++;
   865 	if (lbytes>0 &&
   866 	  g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
   867 	    cnt_spacend++;
   868 	if (strstr(lines[j],".,"))
   869 	    results.dotcomma++;
   870 	/* only count ast lines for ignoring purposes where there is */
   871 	/* locase text on the line */
   872 	if (strchr(lines[j],'*'))
   873 	{
   874 	    for (s=lines[j];*s;s=g_utf8_next_char(s))
   875 		if (g_unichar_islower(g_utf8_get_char(s)))
   876 		    break;
   877 	    if (*s)
   878 		results.astline++;
   879 	}
   880 	if (strchr(lines[j],'/'))
   881 	    results.fslashline++;
   882 	if (lbytes>0)
   883 	{
   884 	    for (s=g_utf8_prev_char(lines[j]+lbytes);
   885 	      s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;
   886 	      s=g_utf8_prev_char(s))
   887 		;
   888 	    if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
   889 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')
   890 		results.hyphens++;
   891 	}
   892 	if (llen>LONGEST_PG_LINE)
   893 	    results.longline++;
   894 	if (llen>WAY_TOO_LONG)
   895 	    results.verylongline++;
   896 	if (strchr(lines[j],'<') && strchr(lines[j],'>'))
   897 	{
   898 	    i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
   899 	    if (i>0)
   900 		results.htmcount++;
   901 	    if (strstr(lines[j],"<i>"))
   902 		results.htmcount+=4; /* bonus marks! */
   903 	}
   904 	/* Check for spaced em-dashes */
   905 	if (lines[j][0] && (s=strstr(g_utf8_next_char(lines[j]),"--")))
   906 	{
   907 	    results.emdash++;
   908 	    if (s[-1]==CHAR_SPACE || s[2]==CHAR_SPACE)
   909 		results.space_emdash++;
   910 	    if (s[-1]==CHAR_SPACE && s[2]==CHAR_SPACE)
   911 		/* count of em-dashes with spaces both sides */
   912 		results.non_PG_space_emdash++;
   913 	    if (s[-1]!=CHAR_SPACE && s[2]!=CHAR_SPACE)
   914 		/* count of PG-type em-dashes with no spaces */
   915 		results.PG_space_emdash++;
   916 	}
   917 	for (s=lines[j];*s;)
   918 	{
   919 	    inword=getaword(&s);
   920 	    if (!strcmp(inword,"hij") || !strcmp(inword,"niet")) 
   921 		results.Dutchcount++;
   922 	    if (!strcmp(inword,"dans") || !strcmp(inword,"avec")) 
   923 		results.Frenchcount++;
   924 	    if (!strcmp(inword,"0") || !strcmp(inword,"1")) 
   925 		results.standalone_digit++;
   926 	    g_free(inword);
   927 	}
   928 	/* Check for spaced dashes */
   929 	if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
   930 	    results.spacedash++;
   931 	lastblen=lastlen;
   932 	lastlen=llen;
   933 	laststart=lines[j][0];
   934     }
   935     g_strfreev(lines);
   936     return &results;
   937 }
   938 
   939 /*
   940  * report_first_pass:
   941  *
   942  * Make some snap decisions based on the first pass results.
   943  */
   944 struct warnings *report_first_pass(struct first_pass_results *results)
   945 {
   946     static struct warnings warnings={0};
   947     if (cnt_spacend>0)
   948 	g_print("   --> %ld lines in this file have white space at end\n",
   949 	  cnt_spacend);
   950     warnings.dotcomma=1;
   951     if (results->dotcomma>5)
   952     {
   953 	warnings.dotcomma=0;
   954 	g_print("   --> %ld lines in this file contain '.,'. "
   955 	  "Not reporting them.\n",results->dotcomma);
   956     }
   957     /*
   958      * If more than 50 lines, or one-tenth, are short,
   959      * don't bother reporting them.
   960      */
   961     warnings.shortline=1;
   962     if (results->shortline>50 || results->shortline*10>linecnt)
   963     {
   964 	warnings.shortline=0;
   965 	g_print("   --> %ld lines in this file are short. "
   966 	  "Not reporting short lines.\n",results->shortline);
   967     }
   968     /*
   969      * If more than 50 lines, or one-tenth, are long,
   970      * don't bother reporting them.
   971      */
   972     warnings.longline=1;
   973     if (results->longline>50 || results->longline*10>linecnt)
   974     {
   975 	warnings.longline=0;
   976 	g_print("   --> %ld lines in this file are long. "
   977 	  "Not reporting long lines.\n",results->longline);
   978     }
   979     /* If more than 10 lines contain asterisks, don't bother reporting them. */
   980     warnings.ast=1;
   981     if (results->astline>10)
   982     {
   983 	warnings.ast=0;
   984 	g_print("   --> %ld lines in this file contain asterisks. "
   985 	  "Not reporting them.\n",results->astline);
   986     }
   987     /*
   988      * If more than 10 lines contain forward slashes,
   989      * don't bother reporting them.
   990      */
   991     warnings.fslash=1;
   992     if (results->fslashline>10)
   993     {
   994 	warnings.fslash=0;
   995 	g_print("   --> %ld lines in this file contain forward slashes. "
   996 	  "Not reporting them.\n",results->fslashline);
   997     }
   998     /*
   999      * If more than 20 lines contain unpunctuated endquotes,
  1000      * don't bother reporting them.
  1001      */
  1002     warnings.endquote=1;
  1003     if (results->endquote_count>20)
  1004     {
  1005 	warnings.endquote=0;
  1006 	g_print("   --> %ld lines in this file contain unpunctuated endquotes. "
  1007 	  "Not reporting them.\n",results->endquote_count);
  1008     }
  1009     /*
  1010      * If more than 15 lines contain standalone digits,
  1011      * don't bother reporting them.
  1012      */
  1013     warnings.digit=1;
  1014     if (results->standalone_digit>10)
  1015     {
  1016 	warnings.digit=0;
  1017 	g_print("   --> %ld lines in this file contain standalone 0s and 1s. "
  1018 	  "Not reporting them.\n",results->standalone_digit);
  1019     }
  1020     /*
  1021      * If more than 20 lines contain hyphens at end,
  1022      * don't bother reporting them.
  1023      */
  1024     warnings.hyphen=1;
  1025     if (results->hyphens>20)
  1026     {
  1027 	warnings.hyphen=0;
  1028 	g_print("   --> %ld lines in this file have hyphens at end. "
  1029 	  "Not reporting them.\n",results->hyphens);
  1030     }
  1031     if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
  1032     {
  1033 	g_print("   --> Looks like this is HTML. Switching HTML mode ON.\n");
  1034 	pswit[MARKUP_SWITCH]=1;
  1035     }
  1036     if (results->verylongline>0)
  1037 	g_print("   --> %ld lines in this file are VERY long!\n",
  1038 	  results->verylongline);
  1039     /*
  1040      * If there are more non-PG spaced dashes than PG em-dashes,
  1041      * assume it's deliberate.
  1042      * Current PG guidelines say don't use them, but older texts do,
  1043      * and some people insist on them whatever the guidelines say.
  1044      */
  1045     warnings.dash=1;
  1046     if (results->spacedash+results->non_PG_space_emdash>
  1047       results->PG_space_emdash)
  1048     {
  1049 	warnings.dash=0;
  1050 	g_print("   --> There are %ld spaced dashes and em-dashes. "
  1051 	  "Not reporting them.\n",
  1052 	  results->spacedash+results->non_PG_space_emdash);
  1053     }
  1054     if (charset)
  1055 	warnings.bin=0;
  1056     else
  1057     {
  1058 	/* Charset ISO_8859-1/ASCII checks for compatibility with gutcheck */
  1059 	warnings.bin=1;
  1060 	/* If more than a quarter of characters are hi-bit, bug out. */
  1061 	if (results->binlen*4>results->totlen)
  1062 	{
  1063 	    g_print("   --> This file does not appear to be ASCII. "
  1064 	      "Terminating. Best of luck with it!\n");
  1065 	    exit(1);
  1066 	}
  1067 	if (results->alphalen*4<results->totlen)
  1068 	{
  1069 	    g_print("   --> This file does not appear to be text. "
  1070 	      "Terminating. Best of luck with it!\n");
  1071 	    exit(1);
  1072 	}
  1073 	if (results->binlen*100>results->totlen || results->binlen>100)
  1074 	{
  1075 	    g_print("   --> There are a lot of foreign letters here. "
  1076 	      "Not reporting them.\n");
  1077 	    if (!pswit[VERBOSE_SWITCH])
  1078 		warnings.bin=0;
  1079 	}
  1080     }
  1081     warnings.isDutch=FALSE;
  1082     if (results->Dutchcount>50)
  1083     {
  1084 	warnings.isDutch=TRUE;
  1085 	g_print("   --> This looks like Dutch - "
  1086 	  "switching off dashes and warnings for 's Middags case.\n");
  1087     }
  1088     warnings.isFrench=FALSE;
  1089     if (results->Frenchcount>50)
  1090     {
  1091 	warnings.isFrench=TRUE;
  1092 	g_print("   --> This looks like French - "
  1093 	  "switching off some doublepunct.\n");
  1094     }
  1095     if (results->firstline && results->footerline)
  1096 	g_print("    The PG header and footer appear to be already on.\n");
  1097     else
  1098     {
  1099 	if (results->firstline)
  1100 	    g_print("    The PG header is on - no footer.\n");
  1101 	if (results->footerline)
  1102 	    g_print("    The PG footer is on - no header.\n");
  1103     }
  1104     g_print("\n");
  1105     if (pswit[VERBOSE_SWITCH])
  1106     {
  1107 	warnings.shortline=1;
  1108 	warnings.dotcomma=1;
  1109 	warnings.longline=1;
  1110 	warnings.dash=1;
  1111 	warnings.digit=1;
  1112 	warnings.ast=1;
  1113 	warnings.fslash=1;
  1114 	warnings.hyphen=1;
  1115 	warnings.endquote=1;
  1116 	g_print("   *** Verbose output is ON -- you asked for it! ***\n");
  1117     }
  1118     if (warnings.isDutch)
  1119 	warnings.dash=0;
  1120     if (results->footerline>0 && results->firstline>0 &&
  1121       results->footerline>results->firstline &&
  1122       results->footerline-results->firstline<100)
  1123     {
  1124 	g_print("   --> I don't really know where this text starts. \n");
  1125 	g_print("       There are no reference points.\n");
  1126 	g_print("       I'm going to have to report the header and footer "
  1127 	  "as well.\n");
  1128 	results->firstline=0;
  1129     }
  1130     return &warnings;
  1131 }
  1132 
  1133 /*
  1134  * analyse_quotes:
  1135  *
  1136  * Look along the line, accumulate the count of quotes, and see
  1137  * if this is an empty line - i.e. a line with nothing on it
  1138  * but spaces.
  1139  * If line has just spaces, period, * and/or - on it, don't
  1140  * count it, since empty lines with asterisks or dashes to
  1141  * separate sections are common.
  1142  *
  1143  * Returns: TRUE if the line is empty.
  1144  */
  1145 gboolean analyse_quotes(const char *aline,int linecnt,struct counters *counters)
  1146 {
  1147     int guessquote=0;
  1148     /* assume the line is empty until proven otherwise */
  1149     gboolean isemptyline=TRUE;
  1150     const char *s=aline,*sprev,*snext;
  1151     gunichar c;
  1152     sprev=NULL;
  1153     GError *tmp_err=NULL;
  1154     while (*s)
  1155     {
  1156 	snext=g_utf8_next_char(s);
  1157 	c=g_utf8_get_char(s);
  1158 	if (CHAR_IS_DQUOTE(c))
  1159 	    (void)count_quote(counters,c,QUOTE_CLASS(c),&tmp_err);
  1160 	else if (CHAR_IS_SQUOTE(c) && pswit[SQUOTE_SWITCH])
  1161 	{
  1162 	    if (s==aline)
  1163 	    {
  1164 		/*
  1165 		 * At start of line, it can only be a quotation mark.
  1166 		 * Hardcode a very common exception!
  1167 		 */
  1168 		if (!g_str_has_prefix(snext,"tis") &&
  1169 		  !g_str_has_prefix(snext,"Tis"))
  1170 		    (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
  1171 	    }
  1172 	    else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
  1173 	      g_unichar_isalpha(g_utf8_get_char(snext)))
  1174 		/* Do nothing! it's definitely an apostrophe, not a quote */
  1175 		;
  1176 	    /* it's outside a word - let's check it out */
  1177 	    else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||
  1178 	      g_unichar_isalpha(g_utf8_get_char(snext)))
  1179 	    {
  1180 		/* certainly looks like a quotation mark */
  1181 		if (!g_str_has_prefix(snext,"tis") &&
  1182 		  !g_str_has_prefix(snext,"Tis"))
  1183 		    /* hardcode a very common exception! */
  1184 		{
  1185 		    if (strchr(".?!,;:",g_utf8_get_char(sprev)))
  1186 			(void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
  1187 		    else
  1188 			(void)count_quote(counters,c,OPENING_QUOTE,&tmp_err);
  1189 		}
  1190 	    }
  1191 	    else
  1192 	    {
  1193 		/* now - is it a quotation mark? */
  1194 		guessquote=0;   /* accumulate clues */
  1195 		if (g_unichar_isalpha(g_utf8_get_char(sprev)))
  1196 		{
  1197 		    /* it follows a letter - could be either */
  1198 		    guessquote++;
  1199 		    if (g_utf8_get_char(sprev)=='s')
  1200 		    {
  1201 			/* looks like a plural apostrophe */
  1202 			guessquote-=3;
  1203 			if (g_utf8_get_char(snext)==CHAR_SPACE)
  1204 			    /* bonus marks! */
  1205 			    guessquote-=2;
  1206 		    }
  1207 		    if (innermost_quote_matches(counters,c))
  1208 			/*
  1209 			 * Give it the benefit of some doubt,
  1210 			 * if a squote is already open.
  1211 			 */
  1212 			guessquote++;
  1213 		    else
  1214 			guessquote--;
  1215 		    if (guessquote>=0)
  1216 			(void)count_quote(counters,c,CLOSING_QUOTE,&tmp_err);
  1217 		}
  1218 		else
  1219 		    /* no adjacent letter - it must be a quote of some kind */
  1220 		    (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
  1221 	    }
  1222 	}
  1223 	if (tmp_err)
  1224 	{
  1225 	    if (pswit[ECHO_SWITCH])
  1226 		g_print("\n%s\n",aline);
  1227 	    if (!pswit[OVERVIEW_SWITCH])
  1228 		g_print("    Line %ld column %ld - %s\n",
  1229 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1,tmp_err->message);
  1230 	    g_clear_error(&tmp_err);
  1231 	}
  1232 	if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
  1233 	  c!='\r' && c!='\n')
  1234 	    isemptyline=FALSE;  /* ignore lines like  *  *  *  as spacers */
  1235 	if (c==CHAR_UNDERSCORE)
  1236 	    counters->c_unders++;
  1237 	if (c==CHAR_OPEN_SBRACK)
  1238 	{
  1239 	    if (!matching_difference(counters,COUNTER_ILLUSTRATION) &&
  1240 	      !matching_difference(counters,c) && s==aline &&
  1241 	      g_str_has_prefix(s,"[Illustration:"))
  1242 		increment_matching(counters,COUNTER_ILLUSTRATION,TRUE);
  1243 	    else
  1244 		increment_matching(counters,c,TRUE);
  1245 	}
  1246 	else if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK)
  1247 	    increment_matching(counters,c,TRUE);
  1248 	if (c==CHAR_CLOSE_SBRACK)
  1249 	{
  1250 	    if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) &&
  1251 	      !matching_difference(counters,c) && !*snext)
  1252 		increment_matching(counters,COUNTER_ILLUSTRATION,FALSE);
  1253 	    else
  1254 		increment_matching(counters,c,FALSE);
  1255 	}
  1256 	else if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK)
  1257 	    increment_matching(counters,c,FALSE);
  1258 	sprev=s;
  1259 	s=snext;
  1260     }
  1261     return isemptyline;
  1262 }
  1263 
  1264 /*
  1265  * check_for_control_characters:
  1266  *
  1267  * Check for invalid or questionable characters in the line
  1268  * Anything above 127 is invalid for plain ASCII, and
  1269  * non-printable control characters should also be flagged.
  1270  * Tabs should generally not be there.
  1271  */
  1272 void check_for_control_characters(const char *aline)
  1273 {
  1274     gunichar c;
  1275     const char *s;
  1276     for (s=aline;*s;s=g_utf8_next_char(s))
  1277     {
  1278 	c=g_utf8_get_char(s);
  1279 	if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
  1280 	{
  1281 	    if (pswit[ECHO_SWITCH])
  1282 		g_print("\n%s\n",aline);
  1283 	    if (!pswit[OVERVIEW_SWITCH])
  1284 		g_print("    Line %ld column %ld - Control character %u\n",
  1285 		  linecnt,g_utf8_pointer_to_offset(s,aline)+1,c);
  1286 	    else
  1287 		cnt_bin++;
  1288 	}
  1289     }
  1290 }
  1291 
  1292 /*
  1293  * check_for_odd_characters:
  1294  *
  1295  * Check for binary and other odd characters.
  1296  */
  1297 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
  1298   gboolean isemptyline)
  1299 {
  1300     /* Don't repeat multiple warnings on one line. */
  1301     gboolean eInvalidChar=FALSE,eTab=FALSE,eTilde=FALSE;
  1302     gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
  1303     const char *s;
  1304     gunichar c;
  1305     gsize nb;
  1306     gchar *t;
  1307     for (s=aline;*s;s=g_utf8_next_char(s))
  1308     {
  1309 	c=g_utf8_get_char(s);
  1310 	if (warnings->bin && !eInvalidChar &&
  1311 	  (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
  1312 	{
  1313 	    if (pswit[ECHO_SWITCH])
  1314 		g_print("\n%s\n",aline);
  1315 	    if (!pswit[OVERVIEW_SWITCH])
  1316 		if (c>127 && c<160 || c>255)
  1317 		    g_print("    Line %ld column %ld - "
  1318 		      "Non-ISO-8859 character %u\n",
  1319 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1320 		else
  1321 		    g_print("    Line %ld column %ld - "
  1322 		      "Non-ASCII character %u\n",
  1323 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1324 	    else
  1325 		cnt_bin++;
  1326 	    eInvalidChar=TRUE;
  1327 	}
  1328 	if (!eInvalidChar && charset)
  1329 	{
  1330 	    if (charset_validator==(GIConv)-1)
  1331 	    {
  1332 		if (!g_unichar_isdefined(c))
  1333 		{
  1334 		    if (pswit[ECHO_SWITCH])
  1335 			g_print("\n%s\n",aline);
  1336 		    if (!pswit[OVERVIEW_SWITCH])
  1337 			g_print("    Line %ld column %ld - Unassigned UNICODE "
  1338 			  "code point U+%04" G_GINT32_MODIFIER "X\n",
  1339 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1340 		    else
  1341 			cnt_bin++;
  1342 		    eInvalidChar=TRUE;
  1343 		}
  1344 		else if (c>=0xE000 && c<=0xF8FF || c>=0xF0000 && c<=0xFFFFD ||
  1345 		  c>=100000 && c<=0x10FFFD)
  1346 		{
  1347 		    if (pswit[ECHO_SWITCH])
  1348 			g_print("\n%s\n",aline);
  1349 		    if (!pswit[OVERVIEW_SWITCH])
  1350 			g_print("    Line %ld column %ld - Private Use "
  1351 			  "character U+%04" G_GINT32_MODIFIER "X\n",
  1352 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1353 		    else
  1354 			cnt_bin++;
  1355 		    eInvalidChar=TRUE;
  1356 		}
  1357 	    }
  1358 	    else
  1359 	    {
  1360 		t=g_convert_with_iconv(s,g_utf8_next_char(s)-s,
  1361 		  charset_validator,NULL,&nb,NULL);
  1362 		if (t)
  1363 		    g_free(t);
  1364 		else
  1365 		{
  1366 		    if (pswit[ECHO_SWITCH])
  1367 			g_print("\n%s\n",aline);
  1368 		    if (!pswit[OVERVIEW_SWITCH])
  1369 			g_print("    Line %ld column %ld - Non-%s "
  1370 			  "character %u\n",linecnt,
  1371 			  g_utf8_pointer_to_offset(aline,s)+1,charset,c);
  1372 		    else
  1373 			cnt_bin++;
  1374 		    eInvalidChar=TRUE;
  1375 		}
  1376 	    }
  1377 	}
  1378 	if (!eTab && c==CHAR_TAB)
  1379 	{
  1380 	    if (pswit[ECHO_SWITCH])
  1381 		g_print("\n%s\n",aline);
  1382 	    if (!pswit[OVERVIEW_SWITCH])
  1383 		g_print("    Line %ld column %ld - Tab character?\n",
  1384 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1385 	    else
  1386 		cnt_odd++;
  1387 	    eTab=TRUE;
  1388 	}
  1389 	if (!eTilde && c==CHAR_TILDE)
  1390 	{
  1391 	    /*
  1392 	     * Often used by OCR software to indicate an
  1393 	     * unrecognizable character.
  1394 	     */
  1395 	    if (pswit[ECHO_SWITCH])
  1396 		g_print("\n%s\n",aline);
  1397 	    if (!pswit[OVERVIEW_SWITCH])
  1398 		g_print("    Line %ld column %ld - Tilde character?\n",
  1399 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1400 	    else
  1401 		cnt_odd++;
  1402 	    eTilde=TRUE;
  1403 	}
  1404 	if (!eCarat && c==CHAR_CARAT)
  1405 	{  
  1406 	    if (pswit[ECHO_SWITCH])
  1407 		g_print("\n%s\n",aline);
  1408 	    if (!pswit[OVERVIEW_SWITCH])
  1409 		g_print("    Line %ld column %ld - Carat character?\n",
  1410 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1411 	    else
  1412 		cnt_odd++;
  1413 	    eCarat=TRUE;
  1414 	}
  1415 	if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
  1416 	{  
  1417 	    if (pswit[ECHO_SWITCH])
  1418 		g_print("\n%s\n",aline);
  1419 	    if (!pswit[OVERVIEW_SWITCH])
  1420 		g_print("    Line %ld column %ld - Forward slash?\n",
  1421 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1422 	    else
  1423 		cnt_odd++;
  1424 	    eFSlash=TRUE;
  1425 	}
  1426 	/*
  1427 	 * Report asterisks only in paranoid mode,
  1428 	 * since they're often deliberate.
  1429 	 */
  1430 	if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
  1431 	  c==CHAR_ASTERISK)
  1432 	{
  1433 	    if (pswit[ECHO_SWITCH])
  1434 		g_print("\n%s\n",aline);
  1435 	    if (!pswit[OVERVIEW_SWITCH])
  1436 		g_print("    Line %ld column %ld - Asterisk?\n",
  1437 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1438 	    else
  1439 		cnt_odd++;
  1440 	    eAst=TRUE;
  1441 	}
  1442     }
  1443 }
  1444 
  1445 /*
  1446  * check_for_long_line:
  1447  *
  1448  * Check for line too long.
  1449  */
  1450 void check_for_long_line(const char *aline)
  1451 {
  1452     if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
  1453     {
  1454 	if (pswit[ECHO_SWITCH])
  1455 	    g_print("\n%s\n",aline);
  1456 	if (!pswit[OVERVIEW_SWITCH])
  1457 	    g_print("    Line %ld column %ld - Long line %ld\n",
  1458 	      linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
  1459 	else
  1460 	    cnt_long++;
  1461     }
  1462 }
  1463 
  1464 /*
  1465  * check_for_short_line:
  1466  *
  1467  * Check for line too short.
  1468  *
  1469  * This one is a bit trickier to implement: we don't want to
  1470  * flag the last line of a paragraph for being short, so we
  1471  * have to wait until we know that our current line is a
  1472  * "normal" line, then report the _previous_ line if it was too
  1473  * short. We also don't want to report indented lines like
  1474  * chapter heads or formatted quotations. We therefore keep
  1475  * last->len as the length of the last line examined, and
  1476  * last->blen as the length of the last but one, and try to
  1477  * suppress unnecessary warnings by checking that both were of
  1478  * "normal" length. We keep the first character of the last
  1479  * line in last->start, and if it was a space, we assume that
  1480  * the formatting is deliberate. I can't figure out a way to
  1481  * distinguish something like a quoted verse left-aligned or
  1482  * the header or footer of a letter from a paragraph of short
  1483  * lines - maybe if I examined the whole paragraph, and if the
  1484  * para has less than, say, 8 lines and if all lines are short,
  1485  * then just assume it's OK? Need to look at some texts to see
  1486  * how often a formula like this would get the right result.
  1487  */
  1488 void check_for_short_line(const char *aline,const struct line_properties *last)
  1489 {
  1490     if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
  1491       last->len<SHORTEST_PG_LINE && last->blen>1 &&
  1492       last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
  1493     {
  1494 	if (pswit[ECHO_SWITCH])
  1495 	    g_print("\n%s\n",prevline);
  1496 	if (!pswit[OVERVIEW_SWITCH])
  1497 	    g_print("    Line %ld column %ld - Short line %ld?\n",
  1498 	      linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
  1499 	else
  1500 	    cnt_short++;
  1501     }
  1502 }
  1503 
  1504 /*
  1505  * check_for_starting_punctuation:
  1506  *
  1507  * Look for punctuation other than full ellipses at start of line.
  1508  */
  1509 void check_for_starting_punctuation(const char *aline)
  1510 {
  1511     if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
  1512       !g_str_has_prefix(aline,". . ."))
  1513     {
  1514 	if (pswit[ECHO_SWITCH])
  1515 	    g_print("\n%s\n",aline);
  1516 	if (!pswit[OVERVIEW_SWITCH])
  1517 	    g_print("    Line %ld column 1 - Begins with punctuation?\n",
  1518 	      linecnt);
  1519 	else
  1520 	    cnt_punct++;
  1521     }
  1522 }
  1523 
  1524 /*
  1525  * check_for_spaced_emdash:
  1526  *
  1527  * Check for spaced em-dashes.
  1528  *
  1529  * We must check _all_ occurrences of "--" on the line
  1530  * hence the loop - even if the first double-dash is OK
  1531  * there may be another that's wrong later on.
  1532  */
  1533 void check_for_spaced_emdash(const char *aline)
  1534 {
  1535     const char *s,*t,*next;
  1536     for (s=aline;t=strstr(s,"--");s=next)
  1537     {
  1538 	next=g_utf8_next_char(g_utf8_next_char(t));
  1539 	if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
  1540 	  g_utf8_get_char(next)==CHAR_SPACE)
  1541 	{
  1542 	    if (pswit[ECHO_SWITCH])
  1543 		g_print("\n%s\n",aline);
  1544 	    if (!pswit[OVERVIEW_SWITCH])
  1545 		g_print("    Line %ld column %ld - Spaced em-dash?\n",
  1546 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  1547 	    else
  1548 		cnt_dash++;
  1549 	}
  1550     }
  1551 }
  1552 
  1553 /*
  1554  * check_for_spaced_dash:
  1555  *
  1556  * Check for spaced dashes.
  1557  */
  1558 void check_for_spaced_dash(const char *aline)
  1559 {
  1560     const char *s;
  1561     if ((s=strstr(aline," -")))
  1562     {
  1563 	if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
  1564 	{
  1565 	    if (pswit[ECHO_SWITCH])
  1566 		g_print("\n%s\n",aline);
  1567 	    if (!pswit[OVERVIEW_SWITCH])
  1568 		g_print("    Line %ld column %ld - Spaced dash?\n",
  1569 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1570 	    else
  1571 		cnt_dash++;
  1572 	}
  1573     }
  1574     else if ((s=strstr(aline,"- ")))
  1575     {
  1576 	if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
  1577 	{
  1578 	    if (pswit[ECHO_SWITCH])
  1579 		g_print("\n%s\n",aline);
  1580 	    if (!pswit[OVERVIEW_SWITCH])
  1581 		g_print("    Line %ld column %ld - Spaced dash?\n",
  1582 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1583 	    else
  1584 		cnt_dash++;
  1585 	}
  1586     }
  1587 }
  1588 
  1589 /*
  1590  * check_for_unmarked_paragraphs:
  1591  *
  1592  * Check for unmarked paragraphs indicated by separate speakers.
  1593  *
  1594  * May well be false positive:
  1595  * "Bravo!" "Wonderful!" called the crowd.
  1596  * but useful all the same.
  1597  */
  1598 void check_for_unmarked_paragraphs(const char *aline)
  1599 {
  1600     const char *s;
  1601     s=strstr(aline,"\"  \"");
  1602     if (!s)
  1603 	s=strstr(aline,"\" \"");
  1604     if (s)
  1605     {
  1606 	if (pswit[ECHO_SWITCH])
  1607 	    g_print("\n%s\n",aline);
  1608 	if (!pswit[OVERVIEW_SWITCH])
  1609 	    g_print("    Line %ld column %ld - "
  1610 	      "Query missing paragraph break?\n",
  1611 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1612 	else
  1613 	    cnt_punct++;
  1614     }
  1615 }
  1616 
  1617 /*
  1618  * check_for_jeebies:
  1619  *
  1620  * Check for "to he" and other easy h/b errors.
  1621  *
  1622  * This is a very inadequate effort on the h/b problem,
  1623  * but the phrase "to he" is always an error, whereas "to
  1624  * be" is quite common.
  1625  * Similarly, '"Quiet!", be said.' is a non-be error
  1626  * "to he" is _not_ always an error!:
  1627  *       "Where they went to he couldn't say."
  1628  * Another false positive:
  1629  *       What would "Cinderella" be without the . . .
  1630  * and another: "If he wants to he can see for himself."
  1631  */
  1632 void check_for_jeebies(const char *aline)
  1633 {
  1634     const char *s;
  1635     s=strstr(aline," be could ");
  1636     if (!s)
  1637 	s=strstr(aline," be would ");
  1638     if (!s)
  1639 	s=strstr(aline," was be ");
  1640     if (!s)
  1641 	s=strstr(aline," be is ");
  1642     if (!s)
  1643 	s=strstr(aline," is be ");
  1644     if (!s)
  1645 	s=strstr(aline,"\", be ");
  1646     if (!s)
  1647 	s=strstr(aline,"\" be ");
  1648     if (!s)
  1649 	s=strstr(aline,"\" be ");
  1650     if (!s)
  1651 	s=strstr(aline," to he ");
  1652     if (s)
  1653     {
  1654 	if (pswit[ECHO_SWITCH])
  1655 	    g_print("\n%s\n",aline);
  1656 	if (!pswit[OVERVIEW_SWITCH])
  1657 	    g_print("    Line %ld column %ld - Query he/be error?\n",
  1658 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1659 	else
  1660 	    cnt_word++;
  1661     }
  1662     s=strstr(aline," the had ");
  1663     if (!s)
  1664 	s=strstr(aline," a had ");
  1665     if (!s)
  1666 	s=strstr(aline," they bad ");
  1667     if (!s)
  1668 	s=strstr(aline," she bad ");
  1669     if (!s)
  1670 	s=strstr(aline," he bad ");
  1671     if (!s)
  1672 	s=strstr(aline," you bad ");
  1673     if (!s)
  1674 	s=strstr(aline," i bad ");
  1675     if (s)
  1676     {
  1677 	if (pswit[ECHO_SWITCH])
  1678 	    g_print("\n%s\n",aline);
  1679 	if (!pswit[OVERVIEW_SWITCH])
  1680 	    g_print("    Line %ld column %ld - Query had/bad error?\n",
  1681 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1682 	else
  1683 	    cnt_word++;
  1684     }
  1685     s=strstr(aline,"; hut ");
  1686     if (!s)
  1687 	s=strstr(aline,", hut ");
  1688     if (s)
  1689     {
  1690 	if (pswit[ECHO_SWITCH])
  1691 	    g_print("\n%s\n",aline);
  1692 	if (!pswit[OVERVIEW_SWITCH])
  1693 	    g_print("    Line %ld column %ld - Query hut/but error?\n",
  1694 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1695 	else
  1696 	    cnt_word++;
  1697     }
  1698 }
  1699 
  1700 /*
  1701  * check_for_mta_from:
  1702  *
  1703  * Special case - angled bracket in front of "From" placed there by an
  1704  * MTA when sending an e-mail.
  1705  */
  1706 void check_for_mta_from(const char *aline)
  1707 {
  1708     const char *s;
  1709     s=strstr(aline,">From");
  1710     if (s)
  1711     {
  1712 	if (pswit[ECHO_SWITCH])
  1713 	    g_print("\n%s\n",aline);
  1714 	if (!pswit[OVERVIEW_SWITCH])
  1715 	    g_print("    Line %ld column %ld - "
  1716 	      "Query angled bracket with From\n",
  1717 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1718 	else
  1719 	    cnt_punct++;
  1720     }
  1721 }
  1722 
  1723 /*
  1724  * check_for_orphan_character:
  1725  *
  1726  * Check for a single character line -
  1727  * often an overflow from bad wrapping.
  1728  */
  1729 void check_for_orphan_character(const char *aline)
  1730 {
  1731     gunichar c;
  1732     c=g_utf8_get_char(aline);
  1733     if (c && !*g_utf8_next_char(aline))
  1734     {
  1735 	if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
  1736 	    ; /* Nothing - ignore numerals alone on a line. */
  1737 	else
  1738 	{
  1739 	    if (pswit[ECHO_SWITCH])
  1740 		g_print("\n%s\n",aline);
  1741 	    if (!pswit[OVERVIEW_SWITCH])
  1742 		g_print("    Line %ld column 1 - Query single character line\n",
  1743 		  linecnt);
  1744 	    else
  1745 		cnt_punct++;
  1746 	}
  1747     }
  1748 }
  1749 
  1750 /*
  1751  * check_for_pling_scanno:
  1752  *
  1753  * Check for I" - often should be !
  1754  */
  1755 void check_for_pling_scanno(const char *aline)
  1756 {
  1757     const char *s;
  1758     s=strstr(aline," I\"");
  1759     if (s)
  1760     {
  1761 	if (pswit[ECHO_SWITCH])
  1762 	    g_print("\n%s\n",aline);
  1763 	if (!pswit[OVERVIEW_SWITCH])
  1764 	    g_print("    Line %ld column %ld - Query I=exclamation mark?\n",
  1765 	      linecnt,g_utf8_pointer_to_offset(aline,s));
  1766 	else
  1767 	    cnt_punct++;
  1768     }
  1769 }
  1770 
  1771 /*
  1772  * check_for_extra_period:
  1773  *
  1774  * Check for period without a capital letter. Cut-down from gutspell.
  1775  * Only works when it happens on a single line.
  1776  */
  1777 void check_for_extra_period(const char *aline,const struct warnings *warnings)
  1778 {
  1779     const char *s,*t,*s1,*sprev;
  1780     int i;
  1781     gsize len;
  1782     gboolean istypo;
  1783     gchar *testword;
  1784     gunichar c,nc,pc,*decomposition;
  1785     if (pswit[PARANOID_SWITCH])
  1786     {
  1787 	for (t=aline;t=strstr(t,". ");)
  1788 	{
  1789 	    if (t==aline)
  1790 	    {
  1791 		t=g_utf8_next_char(t);
  1792 		/* start of line punctuation is handled elsewhere */
  1793 		continue;
  1794 	    }
  1795 	    if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
  1796 	    {
  1797 		t=g_utf8_next_char(t);
  1798 		continue;
  1799 	    }
  1800 	    if (warnings->isDutch)
  1801 	    {
  1802 		/* For Frank & Jeroen -- 's Middags case */
  1803 		gunichar c2,c3,c4,c5;
  1804 		c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
  1805 		c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
  1806 		c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
  1807 		c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
  1808 		if (CHAR_IS_APOSTROPHE(c2) &&
  1809 		  g_unichar_islower(c3) && c4==CHAR_SPACE &&
  1810 		  g_unichar_isupper(c5))
  1811 		{
  1812 		    t=g_utf8_next_char(t);
  1813 		    continue;
  1814 		}
  1815 	    }
  1816 	    s1=g_utf8_next_char(g_utf8_next_char(t));
  1817 	    while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
  1818 	      !isdigit(g_utf8_get_char(s1)))
  1819 		s1=g_utf8_next_char(s1);
  1820 	    if (g_unichar_islower(g_utf8_get_char(s1)))
  1821 	    {
  1822 		/* we have something to investigate */
  1823 		istypo=TRUE;
  1824 		/* so let's go back and find out */
  1825 		nc=g_utf8_get_char(t);
  1826 		s1=g_utf8_prev_char(t);
  1827 		c=g_utf8_get_char(s1);
  1828 		sprev=g_utf8_prev_char(s1);
  1829 		pc=g_utf8_get_char(sprev);
  1830 		while (s1>=aline &&
  1831 		  (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||
  1832 		  g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&
  1833 		  g_unichar_isalpha(nc)))
  1834 		{
  1835 		    nc=c;
  1836 		    s1=sprev;
  1837 		    c=pc;
  1838 		    sprev=g_utf8_prev_char(s1);
  1839 		    pc=g_utf8_get_char(sprev);
  1840 		}
  1841 		s1=g_utf8_next_char(s1);
  1842 		s=strchr(s1,'.');
  1843 		if (s)
  1844 		    testword=g_strndup(s1,s-s1);
  1845 		else
  1846 		    testword=g_strdup(s1);
  1847 		for (i=0;*abbrev[i];i++)
  1848 		    if (!strcmp(testword,abbrev[i]))
  1849 			istypo=FALSE;
  1850 		if (g_unichar_isdigit(g_utf8_get_char(testword)))
  1851 		    istypo=FALSE;
  1852 		if (!*g_utf8_next_char(testword))
  1853 		    istypo=FALSE;
  1854 		if (isroman(testword))
  1855 		    istypo=FALSE;
  1856 		if (istypo)
  1857 		{
  1858 		    istypo=FALSE;
  1859 		    for (s=testword;*s;s=g_utf8_next_char(s))
  1860 		    {
  1861 			decomposition=g_unicode_canonical_decomposition(
  1862 			  g_utf8_get_char(s),&len);
  1863 			if (g_utf8_strchr("aeiou",-1,decomposition[0]))
  1864 			    istypo=TRUE;
  1865 			g_free(decomposition);
  1866 		    }
  1867 		}
  1868 		if (istypo &&
  1869 		  (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
  1870 		{
  1871 		    g_tree_insert(qperiod,g_strdup(testword),
  1872 		      GINT_TO_POINTER(1));
  1873 		    if (pswit[ECHO_SWITCH])
  1874 			g_print("\n%s\n",aline);
  1875 		    if (!pswit[OVERVIEW_SWITCH])
  1876 			g_print("    Line %ld column %ld - Extra period?\n",
  1877 			  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  1878 		    else
  1879 			cnt_punct++;
  1880 		}
  1881 		g_free(testword);
  1882 	    }
  1883 	    t=g_utf8_next_char(t);
  1884 	}
  1885     }
  1886 }
  1887 
  1888 /*
  1889  * check_for_following_punctuation:
  1890  *
  1891  * Check for words usually not followed by punctuation.
  1892  */
  1893 void check_for_following_punctuation(const char *aline)
  1894 {
  1895     int i;
  1896     const char *s,*wordstart;
  1897     gunichar c;
  1898     gchar *inword,*t;
  1899     if (pswit[TYPO_SWITCH])
  1900     {
  1901 	for (s=aline;*s;)
  1902 	{
  1903 	    wordstart=s;
  1904 	    t=getaword(&s);
  1905 	    if (!*t)
  1906 	    {
  1907 		g_free(t);
  1908 		continue;
  1909 	    }
  1910 	    inword=g_utf8_strdown(t,-1);
  1911 	    g_free(t);
  1912 	    for (i=0;*nocomma[i];i++)
  1913 		if (!strcmp(inword,nocomma[i]))
  1914 		{
  1915 		    c=g_utf8_get_char(s);
  1916 		    if (c==',' || c==';' || c==':')
  1917 		    {
  1918 			if (pswit[ECHO_SWITCH])
  1919 			    g_print("\n%s\n",aline);
  1920 			if (!pswit[OVERVIEW_SWITCH])
  1921 			    g_print("    Line %ld column %ld - "
  1922 			      "Query punctuation after %s?\n",
  1923 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,
  1924 			      inword);
  1925 			else
  1926 			    cnt_punct++;
  1927 		    }
  1928 		}
  1929 	    for (i=0;*noperiod[i];i++)
  1930 		if (!strcmp(inword,noperiod[i]))
  1931 		{
  1932 		    c=g_utf8_get_char(s);
  1933 		    if (c=='.' || c=='!')
  1934 		    {
  1935 			if (pswit[ECHO_SWITCH])
  1936 			    g_print("\n%s\n",aline);
  1937 			if (!pswit[OVERVIEW_SWITCH])
  1938 			    g_print("    Line %ld column %ld - "
  1939 			      "Query punctuation after %s?\n",
  1940 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,
  1941 			      inword);
  1942 			else
  1943 			    cnt_punct++;
  1944 		    }
  1945 		}
  1946 	    g_free(inword);
  1947 	}
  1948     }
  1949 }
  1950 
  1951 /*
  1952  * check_for_typos:
  1953  *
  1954  * Check for commonly mistyped words,
  1955  * and digits like 0 for O in a word.
  1956  */
  1957 void check_for_typos(const char *aline,struct warnings *warnings)
  1958 {
  1959     const char *s,*t,*nt,*wordstart;
  1960     gchar *inword;
  1961     gunichar *decomposition;
  1962     gchar *testword;
  1963     int i,vowel,consonant,*dupcnt;
  1964     gboolean isdup,istypo,alower;
  1965     gunichar c,pc;
  1966     long offset,len;
  1967     gsize decomposition_len;
  1968     for (s=aline;*s;)
  1969     {
  1970 	wordstart=s;
  1971 	inword=getaword(&s);
  1972 	if (!*inword)
  1973 	{
  1974 	    g_free(inword);
  1975 	    continue; /* don't bother with empty lines */
  1976 	}
  1977 	if (mixdigit(inword))
  1978 	{
  1979 	    if (pswit[ECHO_SWITCH])
  1980 		g_print("\n%s\n",aline);
  1981 	    if (!pswit[OVERVIEW_SWITCH])
  1982 		g_print("    Line %ld column %ld - Query digit in %s\n",
  1983 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
  1984 	    else
  1985 		cnt_word++;
  1986 	}
  1987 	/*
  1988 	 * Put the word through a series of tests for likely typos and OCR
  1989 	 * errors.
  1990 	 */
  1991 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
  1992 	{
  1993 	    istypo=FALSE;
  1994 	    alower=FALSE;
  1995 	    for (t=inword;*t;t=g_utf8_next_char(t))
  1996 	    {
  1997 		c=g_utf8_get_char(t);
  1998 		nt=g_utf8_next_char(t);
  1999 		/* lowercase for testing */
  2000 		if (g_unichar_islower(c))
  2001 		    alower=TRUE;
  2002 		if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
  2003 		{
  2004 		    /*
  2005 		     * We have an uppercase mid-word. However, there are
  2006 		     * common cases:
  2007 		     *   Mac and Mc like McGill
  2008 		     *   French contractions like l'Abbe
  2009 		     */
  2010 		    offset=g_utf8_pointer_to_offset(inword,t);
  2011 		    if (offset>0)
  2012 			pc=g_utf8_get_char(g_utf8_prev_char(t));
  2013 		    else
  2014 			pc='\0';
  2015 		    if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
  2016 		      offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
  2017 		      g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
  2018 		      CHAR_IS_APOSTROPHE(pc))
  2019 			; /* do nothing! */
  2020 		    else
  2021 			istypo=TRUE;
  2022 		}
  2023 	    }
  2024 	    testword=g_utf8_casefold(inword,-1);
  2025 	}
  2026 	if (pswit[TYPO_SWITCH])
  2027 	{
  2028 	    /*
  2029 	     * Check for certain unlikely two-letter combinations at word
  2030 	     * start and end.
  2031 	     */
  2032 	    len=g_utf8_strlen(testword,-1);
  2033 	    if (len>1)
  2034 	    {
  2035 		for (i=0;*nostart[i];i++)
  2036 		    if (g_str_has_prefix(testword,nostart[i]))
  2037 			istypo=TRUE;
  2038 		for (i=0;*noend[i];i++)
  2039 		    if (g_str_has_suffix(testword,noend[i]))
  2040 			istypo=TRUE;
  2041 	    }
  2042 	    /* ght is common, gbt never. Like that. */
  2043 	    if (strstr(testword,"cb"))
  2044 		istypo=TRUE;
  2045 	    if (strstr(testword,"gbt"))
  2046 		istypo=TRUE;
  2047 	    if (strstr(testword,"pbt"))
  2048 		istypo=TRUE;
  2049 	    if (strstr(testword,"tbs"))
  2050 		istypo=TRUE;
  2051 	    if (strstr(testword,"mrn"))
  2052 		istypo=TRUE;
  2053 	    if (strstr(testword,"ahle"))
  2054 		istypo=TRUE;
  2055 	    if (strstr(testword,"ihle"))
  2056 		istypo=TRUE;
  2057 	    /*
  2058 	     * "TBE" does happen - like HEARTBEAT - but uncommon.
  2059 	     * Also "TBI" - frostbite, outbid - but uncommon.
  2060 	     * Similarly "ii" like Hawaii, or Pompeii, and in Roman
  2061 	     * numerals, but "ii" is a common scanno.
  2062 	     */
  2063 	    if (strstr(testword,"tbi"))
  2064 		istypo=TRUE;
  2065 	    if (strstr(testword,"tbe"))
  2066 		istypo=TRUE;
  2067 	    if (strstr(testword,"ii"))
  2068 		istypo=TRUE;
  2069 	    /*
  2070 	     * Check for no vowels or no consonants.
  2071 	     * If none, flag a typo.
  2072 	     */
  2073 	    if (!istypo && len>1)
  2074 	    {
  2075 		vowel=consonant=0;
  2076 		for (t=testword;*t;t=g_utf8_next_char(t))
  2077 		{
  2078 		    c=g_utf8_get_char(t);
  2079 		    decomposition=
  2080 		      g_unicode_canonical_decomposition(c,&decomposition_len);
  2081 		    if (c=='y' || g_unichar_isdigit(c))
  2082 		    {
  2083 			/* Yah, this is loose. */
  2084 			vowel++;
  2085 			consonant++;
  2086 		    }
  2087 		    else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
  2088 			vowel++;
  2089 		    else
  2090 			consonant++;
  2091 		    g_free(decomposition);
  2092 		}
  2093 		if (!vowel || !consonant)
  2094 		    istypo=TRUE;
  2095 	    }
  2096 	    /*
  2097 	     * Now exclude the word from being reported if it's in
  2098 	     * the okword list.
  2099 	     */
  2100 	    for (i=0;*okword[i];i++)
  2101 		if (!strcmp(testword,okword[i]))
  2102 		    istypo=FALSE;
  2103 	    /*
  2104 	     * What looks like a typo may be a Roman numeral.
  2105 	     * Exclude these.
  2106 	     */
  2107 	    if (istypo && isroman(testword))
  2108 		istypo=FALSE;
  2109 	    /* Check the manual list of typos. */
  2110 	    if (!istypo)
  2111 		for (i=0;*typo[i];i++)
  2112 		    if (!strcmp(testword,typo[i]))
  2113 			istypo=TRUE;
  2114 	    /*
  2115 	     * Check lowercase s, l, i and m - special cases.
  2116 	     *   "j" - often a semi-colon gone wrong.
  2117 	     *   "d" for a missing apostrophe - he d
  2118 	     *   "n" for "in"
  2119 	     */
  2120 	    if (!istypo && len==1 &&
  2121 	      g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
  2122 		istypo=TRUE;
  2123 	    if (istypo)
  2124 	    {
  2125 		dupcnt=g_tree_lookup(qword,testword);
  2126 		if (dupcnt)
  2127 		{
  2128 		    (*dupcnt)++;
  2129 		    isdup=!pswit[VERBOSE_SWITCH];
  2130 		}
  2131 		else
  2132 		{
  2133 		    dupcnt=g_new0(int,1);
  2134 		    g_tree_insert(qword,g_strdup(testword),dupcnt);
  2135 		    isdup=FALSE;
  2136 		}
  2137 		if (!isdup)
  2138 		{
  2139 		    if (pswit[ECHO_SWITCH])
  2140 			g_print("\n%s\n",aline);
  2141 		    if (!pswit[OVERVIEW_SWITCH])
  2142 		    {
  2143 			g_print("    Line %ld column %ld - Query word %s",
  2144 			  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
  2145 			  inword);
  2146 			if (!pswit[VERBOSE_SWITCH])
  2147 			    g_print(" - not reporting duplicates");
  2148 			g_print("\n");
  2149 		    }
  2150 		    else
  2151 			cnt_word++;
  2152 		}
  2153 	    }
  2154 	}
  2155 	/* check the user's list of typos */
  2156 	if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
  2157 	{
  2158 	    if (pswit[ECHO_SWITCH])
  2159 		g_print("\n%s\n",aline);
  2160 	    if (!pswit[OVERVIEW_SWITCH])  
  2161 		g_print("    Line %ld column %ld - Query possible scanno %s\n",
  2162 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
  2163 	}
  2164 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
  2165 	    g_free(testword);
  2166 	if (pswit[PARANOID_SWITCH] && warnings->digit)
  2167 	{
  2168 	    /* In paranoid mode, query all 0 and 1 standing alone. */
  2169 	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))
  2170 	    {
  2171 		if (pswit[ECHO_SWITCH])
  2172 		    g_print("\n%s\n",aline);
  2173 		if (!pswit[OVERVIEW_SWITCH])
  2174 		    g_print("    Line %ld column %ld - Query standalone %s\n",
  2175 		      linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
  2176 		      inword);
  2177 		else
  2178 		    cnt_word++;
  2179 	    }
  2180 	}
  2181 	g_free(inword);
  2182     }
  2183 }
  2184 
  2185 /*
  2186  * check_for_misspaced_punctuation:
  2187  *
  2188  * Look for added or missing spaces around punctuation and quotes.
  2189  * If there is a punctuation character like ! with no space on
  2190  * either side, suspect a missing!space. If there are spaces on
  2191  * both sides , assume a typo. If we see a double quote with no
  2192  * space or punctuation on either side of it, assume unspaced
  2193  * quotes "like"this.
  2194  */
  2195 void check_for_misspaced_punctuation(const char *aline,
  2196   struct parities *parities,gboolean isemptyline)
  2197 {
  2198     gboolean isacro,isellipsis;
  2199     const char *s;
  2200     gunichar c,nc,pc,n2c;
  2201     int parity;
  2202     c=g_utf8_get_char(aline);
  2203     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2204     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2205     {
  2206 	pc=c;
  2207 	c=nc;
  2208 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2209 	/* For each character in the line after the first. */
  2210 	if (g_utf8_strchr(".?!,;:_",-1,c))  /* if it's punctuation */
  2211 	{
  2212 	    /* we need to suppress warnings for acronyms like M.D. */
  2213 	    isacro=FALSE;
  2214 	    /* we need to suppress warnings for ellipsis . . . */
  2215 	    isellipsis=FALSE;
  2216 	    /*
  2217 	     * If there are letters on both sides of it or
  2218 	     * if it's strict punctuation followed by an alpha.
  2219 	     */
  2220 	    if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
  2221 	      g_utf8_strchr("?!,;:",-1,c)))
  2222 	    {
  2223 		if (c=='.')
  2224 		{
  2225 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&
  2226 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
  2227 			isacro=TRUE;
  2228 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
  2229 		    if (nc && n2c=='.')
  2230 			isacro=TRUE;
  2231 		}
  2232 		if (!isacro)
  2233 		{
  2234 		    if (pswit[ECHO_SWITCH])
  2235 			g_print("\n%s\n",aline);
  2236 		    if (!pswit[OVERVIEW_SWITCH])
  2237 			g_print("    Line %ld column %ld - Missing space?\n",
  2238 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2239 		    else
  2240 			cnt_punct++;
  2241 		}
  2242 	    }
  2243 	    if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
  2244 	    {
  2245 		/*
  2246 		 * If there are spaces on both sides,
  2247 		 * or space before and end of line.
  2248 		 */
  2249 		if (c=='.')
  2250 		{
  2251 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&
  2252 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
  2253 			isellipsis=TRUE;
  2254 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
  2255 		    if (nc && n2c=='.')
  2256 			isellipsis=TRUE;
  2257 		}
  2258 		if (!isemptyline && !isellipsis)
  2259 		{
  2260 		    if (pswit[ECHO_SWITCH])
  2261 			g_print("\n%s\n",aline);
  2262 		    if (!pswit[OVERVIEW_SWITCH])
  2263 			g_print("    Line %ld column %ld - "
  2264 			  "Spaced punctuation?\n",linecnt,
  2265 			  g_utf8_pointer_to_offset(aline,s)+1);
  2266 		    else
  2267 			cnt_punct++;
  2268 		}
  2269 	    }
  2270 	}
  2271     }
  2272     /* Split out the characters that CANNOT be preceded by space. */
  2273     c=g_utf8_get_char(aline);
  2274     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2275     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2276     {
  2277 	pc=c;
  2278 	c=nc;
  2279 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2280 	/* for each character in the line after the first */
  2281 	if (g_utf8_strchr("?!,;:",-1,c))
  2282 	{
  2283 	    /* if it's punctuation that _cannot_ have a space before it */
  2284 	    if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
  2285 	    {
  2286 		/*
  2287 		 * If nc DOES == space,
  2288 		 * it was already reported just above.
  2289 		 */
  2290 		if (pswit[ECHO_SWITCH])
  2291 		    g_print("\n%s\n",aline);
  2292 		if (!pswit[OVERVIEW_SWITCH])
  2293 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",
  2294 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2295 		else
  2296 		    cnt_punct++;
  2297 	    }
  2298 	}
  2299     }
  2300     /*
  2301      * Special case " .X" where X is any alpha.
  2302      * This plugs a hole in the acronym code above.
  2303      * Inelegant, but maintainable.
  2304      */
  2305     c=g_utf8_get_char(aline);
  2306     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2307     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2308     {
  2309 	pc=c;
  2310 	c=nc;
  2311 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2312 	/* for each character in the line after the first */
  2313 	if (c=='.')
  2314 	{
  2315 	    /* if it's a period */
  2316 	    if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
  2317 	    {
  2318 		/*
  2319 		 * If the period follows a space and
  2320 		 * is followed by a letter.
  2321 		 */
  2322 		if (pswit[ECHO_SWITCH])
  2323 		    g_print("\n%s\n",aline);
  2324 		if (!pswit[OVERVIEW_SWITCH])
  2325 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",
  2326 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2327 		else
  2328 		    cnt_punct++;
  2329 	    }
  2330 	}
  2331     }
  2332     c=g_utf8_get_char(aline);
  2333     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2334     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2335     {
  2336 	pc=c;
  2337 	c=nc;
  2338 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2339 	/* for each character in the line after the first */
  2340 	if (CHAR_IS_DQUOTE(c))
  2341 	{
  2342 	    if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
  2343 	      !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
  2344 	      !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
  2345 	    {
  2346 		if (pswit[ECHO_SWITCH])
  2347 		    g_print("\n%s\n",aline);
  2348 		if (!pswit[OVERVIEW_SWITCH])
  2349 		    g_print("    Line %ld column %ld - Unspaced quotes?\n",
  2350 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2351 		else
  2352 		    cnt_punct++;
  2353 	    }
  2354 	}
  2355     }
  2356     /* Check parity of quotes. */
  2357     nc=g_utf8_get_char(aline);
  2358     for (s=aline;*s;s=g_utf8_next_char(s))
  2359     {
  2360 	c=nc;
  2361 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2362 	if (CHAR_IS_DQUOTE(c))
  2363 	{
  2364 	    if (c==CHAR_DQUOTE)
  2365 	    {
  2366 		parities->dquote=!parities->dquote;
  2367 		parity=parities->dquote;
  2368 	    }
  2369 	    else if (c==CHAR_LD_QUOTE)
  2370 		parity=1;
  2371 	    else
  2372 		parity=0;
  2373 	    if (!parity)
  2374 	    {
  2375 		/* parity even */
  2376 		if (!g_utf8_strchr("_-.'`/,;:!?)]} ",-1,nc))
  2377 		{
  2378 		    if (pswit[ECHO_SWITCH])
  2379 			g_print("\n%s\n",aline);
  2380 		    if (!pswit[OVERVIEW_SWITCH])
  2381 			g_print("    Line %ld column %ld - "
  2382 			  "Wrongspaced quotes?\n",
  2383 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2384 		    else
  2385 			cnt_punct++;
  2386 		}
  2387 	    }
  2388 	    else
  2389 	    {
  2390 		/* parity odd */
  2391 		if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
  2392 		  !g_utf8_strchr("_-/.'`([{$",-1,nc) || !nc)
  2393 		{
  2394 		    if (pswit[ECHO_SWITCH])
  2395 			g_print("\n%s\n",aline);
  2396 		    if (!pswit[OVERVIEW_SWITCH])
  2397 			g_print("    Line %ld column %ld - "
  2398 			  "Wrongspaced quotes?\n",
  2399 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2400 		    else
  2401 			cnt_punct++;
  2402 		}
  2403 	    }
  2404 	}
  2405     }
  2406     c=g_utf8_get_char(aline);
  2407     if (CHAR_IS_DQUOTE(c))
  2408     {
  2409 	if (g_utf8_strchr(",;:!?)]} ",-1,
  2410 	  g_utf8_get_char(g_utf8_next_char(aline))))
  2411 	{
  2412 	    if (pswit[ECHO_SWITCH])
  2413 		g_print("\n%s\n",aline);
  2414 	    if (!pswit[OVERVIEW_SWITCH])
  2415 		g_print("    Line %ld column 1 - Wrongspaced quotes?\n",
  2416 		  linecnt);
  2417 	    else
  2418 		cnt_punct++;
  2419 	}
  2420     }
  2421     if (pswit[SQUOTE_SWITCH])
  2422     {
  2423 	nc=g_utf8_get_char(aline);
  2424 	for (s=aline;*s;s=g_utf8_next_char(s))
  2425 	{
  2426 	    c=nc;
  2427 	    nc=g_utf8_get_char(g_utf8_next_char(s));
  2428 	    if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&
  2429 	      !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
  2430 	      !g_unichar_isalpha(nc)))
  2431 	    {
  2432 		parities->squote=!parities->squote;
  2433 		if (!parities->squote)
  2434 		{
  2435 		    /* parity even */
  2436 		    if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
  2437 		    {
  2438 			if (pswit[ECHO_SWITCH])
  2439 			    g_print("\n%s\n",aline);
  2440 			if (!pswit[OVERVIEW_SWITCH])
  2441 			    g_print("    Line %ld column %ld - "
  2442 			      "Wrongspaced singlequotes?\n",
  2443 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2444 			else
  2445 			    cnt_punct++;
  2446 		    }
  2447 		}
  2448 		else
  2449 		{
  2450 		    /* parity odd */
  2451 		    if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
  2452 		      !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
  2453 		    {
  2454 			if (pswit[ECHO_SWITCH])
  2455 			    g_print("\n%s\n",aline);
  2456 			if (!pswit[OVERVIEW_SWITCH])
  2457 			    g_print("    Line %ld column %ld - "
  2458 			      "Wrongspaced singlequotes?\n",
  2459 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2460 			else
  2461 			    cnt_punct++;
  2462 		    }
  2463 		}
  2464 	    }
  2465 	}
  2466     }
  2467 }
  2468 
  2469 /*
  2470  * check_for_double_punctuation:
  2471  *
  2472  * Look for double punctuation like ,. or ,,
  2473  * Thanks to DW for the suggestion!
  2474  * In books with references, ".," and ".;" are common
  2475  * e.g. "etc., etc.," and vol. 1.; vol 3.;
  2476  * OTOH, from my initial tests, there are also fairly
  2477  * common errors. What to do? Make these cases paranoid?
  2478  * ".," is the most common, so warnings->dotcomma is used
  2479  * to suppress detailed reporting if it occurs often.
  2480  */
  2481 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
  2482 {
  2483     const char *s;
  2484     gunichar c,nc;
  2485     nc=g_utf8_get_char(aline);
  2486     for (s=aline;*s;s=g_utf8_next_char(s))
  2487     {
  2488 	c=nc;
  2489 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2490 	/* for each punctuation character in the line */
  2491 	if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
  2492 	  g_utf8_strchr(".?!,;:",-1,nc))
  2493 	{
  2494 	    /* followed by punctuation, it's a query, unless . . . */
  2495 	    if (c==nc && (c=='.' || c=='?' || c=='!') ||
  2496 	      !warnings->dotcomma && c=='.' && nc==',' ||
  2497 	      warnings->isFrench && g_str_has_prefix(s,",...") ||
  2498 	      warnings->isFrench && g_str_has_prefix(s,"...,") ||
  2499 	      warnings->isFrench && g_str_has_prefix(s,";...") ||
  2500 	      warnings->isFrench && g_str_has_prefix(s,"...;") ||
  2501 	      warnings->isFrench && g_str_has_prefix(s,":...") ||
  2502 	      warnings->isFrench && g_str_has_prefix(s,"...:") ||
  2503 	      warnings->isFrench && g_str_has_prefix(s,"!...") ||
  2504 	      warnings->isFrench && g_str_has_prefix(s,"...!") ||
  2505 	      warnings->isFrench && g_str_has_prefix(s,"?...") ||
  2506 	      warnings->isFrench && g_str_has_prefix(s,"...?"))
  2507 	    {
  2508 		if (warnings->isFrench && g_str_has_prefix(s,",...") ||
  2509 		  warnings->isFrench && g_str_has_prefix(s,"...,") ||
  2510 		  warnings->isFrench && g_str_has_prefix(s,";...") ||
  2511 		  warnings->isFrench && g_str_has_prefix(s,"...;") ||
  2512 		  warnings->isFrench && g_str_has_prefix(s,":...") ||
  2513 		  warnings->isFrench && g_str_has_prefix(s,"...:") ||
  2514 		  warnings->isFrench && g_str_has_prefix(s,"!...") ||
  2515 		  warnings->isFrench && g_str_has_prefix(s,"...!") ||
  2516 		  warnings->isFrench && g_str_has_prefix(s,"?...") ||
  2517 		  warnings->isFrench && g_str_has_prefix(s,"...?"))
  2518 		{
  2519 		    s+=4;
  2520 		    nc=g_utf8_get_char(g_utf8_next_char(s));
  2521 		}
  2522 		; /* do nothing for .. !! and ?? which can be legit */
  2523 	    }
  2524 	    else
  2525 	    {
  2526 		if (pswit[ECHO_SWITCH])
  2527 		    g_print("\n%s\n",aline);
  2528 		if (!pswit[OVERVIEW_SWITCH])
  2529 		    g_print("    Line %ld column %ld - Double punctuation?\n",
  2530 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2531 		else
  2532 		    cnt_punct++;
  2533 	    }
  2534 	}
  2535     }
  2536 }
  2537 
  2538 /*
  2539  * check_for_spaced_quotes:
  2540  */
  2541 void check_for_spaced_quotes(const char *aline)
  2542 {
  2543     int i;
  2544     const char *s,*t;
  2545     const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,
  2546       CHAR_RS_QUOTE};
  2547     GString *pattern;
  2548     s=aline;
  2549     while ((t=strstr(s," \" ")))
  2550     {
  2551 	if (pswit[ECHO_SWITCH])
  2552 	    g_print("\n%s\n",aline);
  2553 	if (!pswit[OVERVIEW_SWITCH])
  2554 	    g_print("    Line %ld column %ld - Spaced doublequote?\n",
  2555 	      linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  2556 	else
  2557 	    cnt_punct++;
  2558 	s=g_utf8_next_char(g_utf8_next_char(t));
  2559     }
  2560     pattern=g_string_new(NULL);
  2561     for(i=0;i<G_N_ELEMENTS(single_quotes);i++)
  2562     {
  2563 	g_string_assign(pattern," ");
  2564 	g_string_append_unichar(pattern,single_quotes[i]);
  2565 	g_string_append_c(pattern,' ');
  2566 	s=aline;
  2567 	while ((t=strstr(s,pattern->str)))
  2568 	{
  2569 	    if (pswit[ECHO_SWITCH])
  2570 		g_print("\n%s\n",aline);
  2571 	    if (!pswit[OVERVIEW_SWITCH])
  2572 		g_print("    Line %ld column %ld - Spaced singlequote?\n",
  2573 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  2574 	    else
  2575 		cnt_punct++;
  2576 	    s=g_utf8_next_char(g_utf8_next_char(t));
  2577 	}
  2578     }
  2579     g_string_free(pattern,TRUE);
  2580 }
  2581 
  2582 /*
  2583  * check_for_miscased_genative:
  2584  *
  2585  * Check special case of 'S instead of 's at end of word.
  2586  */
  2587 void check_for_miscased_genative(const char *aline)
  2588 {
  2589     const char *s;
  2590     gunichar c,nc,pc;
  2591     if (!*aline)
  2592 	return;
  2593     c=g_utf8_get_char(aline);
  2594     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2595     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2596     {
  2597 	pc=c;
  2598 	c=nc;
  2599 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2600 	if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))
  2601 	{
  2602 	    if (pswit[ECHO_SWITCH])
  2603 		g_print("\n%s\n",aline);
  2604 	    if (!pswit[OVERVIEW_SWITCH])
  2605 		g_print("    Line %ld column %ld - Capital \"S\"?\n",
  2606 		  linecnt,g_utf8_pointer_to_offset(aline,s)+2);
  2607 	    else
  2608 		cnt_punct++;
  2609 	}
  2610     }
  2611 }
  2612 
  2613 /*
  2614  * check_end_of_line:
  2615  *
  2616  * Now check special cases - start and end of line -
  2617  * for single and double quotes. Start is sometimes [sic]
  2618  * but better to query it anyway.
  2619  * While we're here, check for dash at end of line.
  2620  */
  2621 void check_end_of_line(const char *aline,struct warnings *warnings)
  2622 {
  2623     int lbytes;
  2624     const char *s;
  2625     gunichar c1,c2;
  2626     lbytes=strlen(aline);
  2627     if (g_utf8_strlen(aline,lbytes)>1)
  2628     {
  2629 	s=g_utf8_prev_char(aline+lbytes);
  2630 	c1=g_utf8_get_char(s);
  2631 	c2=g_utf8_get_char(g_utf8_prev_char(s));
  2632 	if ((CHAR_IS_DQUOTE(c1) || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)
  2633 	{
  2634 	    if (pswit[ECHO_SWITCH])
  2635 		g_print("\n%s\n",aline);
  2636 	    if (!pswit[OVERVIEW_SWITCH])
  2637 		g_print("    Line %ld column %ld - Spaced quote?\n",linecnt,
  2638 		  g_utf8_strlen(aline,lbytes));
  2639 	    else
  2640 		cnt_punct++;
  2641 	}
  2642 	c1=g_utf8_get_char(aline);
  2643 	c2=g_utf8_get_char(g_utf8_next_char(aline));
  2644 	if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)
  2645 	{
  2646 	    if (pswit[ECHO_SWITCH])
  2647 		g_print("\n%s\n",aline);
  2648 	    if (!pswit[OVERVIEW_SWITCH])
  2649 		g_print("    Line %ld column 1 - Spaced quote?\n",linecnt);
  2650 	    else
  2651 		cnt_punct++;
  2652 	}
  2653 	/*
  2654 	 * Dash at end of line may well be legit - paranoid mode only
  2655 	 * and don't report em-dash at line-end.
  2656 	 */
  2657 	if (pswit[PARANOID_SWITCH] && warnings->hyphen)
  2658 	{
  2659 	    for (s=g_utf8_prev_char(aline+lbytes);
  2660 	      s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
  2661 		;
  2662 	    if (g_utf8_get_char(s)=='-' &&
  2663 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')
  2664 	    {
  2665 		if (pswit[ECHO_SWITCH])
  2666 		    g_print("\n%s\n",aline);
  2667 		if (!pswit[OVERVIEW_SWITCH])
  2668 		    g_print("    Line %ld column %ld - "
  2669 		      "Hyphen at end of line?\n",
  2670 		      linecnt,g_utf8_pointer_to_offset(aline,s));
  2671 	    }
  2672 	}
  2673     }
  2674 }
  2675 
  2676 /*
  2677  * check_for_unspaced_bracket:
  2678  *
  2679  * Brackets are often unspaced, but shouldn't be surrounded by alpha.
  2680  * If so, suspect a scanno like "a]most".
  2681  */
  2682 void check_for_unspaced_bracket(const char *aline)
  2683 {
  2684     const char *s;
  2685     gunichar c,nc,pc;
  2686     c=g_utf8_get_char(aline);
  2687     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2688     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2689     {
  2690 	pc=c;
  2691 	c=nc;
  2692 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2693 	if (!nc)
  2694 	    break;
  2695 	/* for each bracket character in the line except 1st & last */
  2696 	if (g_utf8_strchr("{[()]}",-1,c) &&
  2697 	  g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
  2698 	{
  2699 	    if (pswit[ECHO_SWITCH])
  2700 		g_print("\n%s\n",aline);
  2701 	    if (!pswit[OVERVIEW_SWITCH])
  2702 		g_print("    Line %ld column %ld - Unspaced bracket?\n",
  2703 		  linecnt,g_utf8_pointer_to_offset(aline,s));
  2704 	    else
  2705 		cnt_punct++;
  2706 	}
  2707     }
  2708 }
  2709 
  2710 /*
  2711  * check_for_unpunctuated_endquote:
  2712  */
  2713 void check_for_unpunctuated_endquote(const char *aline)
  2714 {
  2715     const char *s;
  2716     gunichar c,nc,pc;
  2717     QuoteClass qc;
  2718     c=g_utf8_get_char(aline);
  2719     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2720     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2721     {
  2722 	pc=c;
  2723 	c=nc;
  2724 	qc=CHAR_IS_DQUOTE(c)?QUOTE_CLASS(c):INVALID_QUOTE;
  2725 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2726 	/* for each character in the line except 1st */
  2727 	if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) && isalpha(pc))
  2728 	{
  2729 	    if (pswit[ECHO_SWITCH])
  2730 		g_print("\n%s\n",aline);
  2731 	    if (!pswit[OVERVIEW_SWITCH])
  2732 		g_print("    Line %ld column %ld - "
  2733 		  "endquote missing punctuation?\n",
  2734 		  linecnt,g_utf8_pointer_to_offset(aline,s));
  2735 	    else
  2736 		cnt_punct++;
  2737 	}
  2738     }
  2739 }
  2740 
  2741 /*
  2742  * check_for_html_tag:
  2743  *
  2744  * Check for <HTML TAG>.
  2745  *
  2746  * If there is a < in the line, followed at some point
  2747  * by a > then we suspect HTML.
  2748  */
  2749 void check_for_html_tag(const char *aline)
  2750 {
  2751     const char *open,*close;
  2752     gchar *tag;
  2753     open=strchr(aline,'<');
  2754     if (open)
  2755     {
  2756 	close=strchr(g_utf8_next_char(open),'>');
  2757 	if (close)
  2758 	{
  2759 	    if (pswit[ECHO_SWITCH])
  2760 		g_print("\n%s\n",aline);
  2761 	    if (!pswit[OVERVIEW_SWITCH])
  2762 	    {
  2763 		tag=g_strndup(open,close-open+1);
  2764 		g_print("    Line %ld column %ld - HTML Tag? %s \n",
  2765 		  linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
  2766 		g_free(tag);
  2767 	    }
  2768 	    else
  2769 		cnt_html++;
  2770 	}
  2771     }
  2772 }
  2773 
  2774 /*
  2775  * check_for_html_entity:
  2776  *
  2777  * Check for &symbol; HTML.
  2778  *
  2779  * If there is a & in the line, followed at
  2780  * some point by a ; then we suspect HTML.
  2781  */
  2782 void check_for_html_entity(const char *aline)
  2783 {
  2784     const char *s,*amp,*scolon;
  2785     gchar *entity;
  2786     amp=strchr(aline,'&');
  2787     if (amp)
  2788     {
  2789 	scolon=strchr(amp,';');
  2790 	if (scolon)
  2791 	{
  2792 	    for (s=amp;s<scolon;s=g_utf8_next_char(s))   
  2793 		if (g_utf8_get_char(s)==CHAR_SPACE)
  2794 		    break;		/* Don't report "Jones & Son;" */
  2795 	    if (s>=scolon)
  2796 	    {
  2797 		if (pswit[ECHO_SWITCH])
  2798 		    g_print("\n%s\n",aline);
  2799 		if (!pswit[OVERVIEW_SWITCH])
  2800 		{
  2801 		    entity=g_strndup(amp,scolon-amp+1);
  2802 		    g_print("    Line %ld column %d - HTML symbol? %s \n",
  2803 		      linecnt,(int)(amp-aline)+1,entity);
  2804 		    g_free(entity);
  2805 		}
  2806 		else
  2807 		    cnt_html++;
  2808 	    }
  2809 	}
  2810     }
  2811 }
  2812 
  2813 /*
  2814  * check_for_omitted_punctuation:
  2815  *
  2816  * Check for omitted punctuation at end of paragraph by working back
  2817  * through prevline. DW.
  2818  * Need to check this only for "normal" paras.
  2819  * So what is a "normal" para?
  2820  *    Not normal if one-liner (chapter headings, etc.)
  2821  *    Not normal if doesn't contain at least one locase letter
  2822  *    Not normal if starts with space
  2823  */
  2824 void check_for_omitted_punctuation(const char *prevline,
  2825   struct line_properties *last,int start_para_line)
  2826 {
  2827     gboolean letter_on_line=FALSE;
  2828     const char *s;
  2829     gunichar c;
  2830     gboolean closing_quote;
  2831     for (s=prevline;*s;s=g_utf8_next_char(s))
  2832 	if (g_unichar_isalpha(g_utf8_get_char(s)))
  2833 	{
  2834 	    letter_on_line=TRUE;
  2835 	    break;
  2836 	}
  2837     /*
  2838      * This next "if" is a problem.
  2839      * If we say "start_para_line <= linecnt - 1", that includes
  2840      * one-line "paragraphs" like chapter heads. Lotsa false positives.
  2841      * If we say "start_para_line < linecnt - 1" it doesn't, but then it
  2842      * misses genuine one-line paragraphs.
  2843      */
  2844     if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
  2845       g_utf8_get_char(prevline)>CHAR_SPACE)
  2846     {
  2847 	s=prevline+strlen(prevline);
  2848 	do
  2849 	{
  2850 	    s=g_utf8_prev_char(s);
  2851 	    c=g_utf8_get_char(s);
  2852 	    if (QUOTE_CLASS(c)==CLOSING_QUOTE || QUOTE_CLASS(c)==NEUTRAL_QUOTE)
  2853 		closing_quote=TRUE;
  2854 	    else
  2855 		closing_quote=FALSE;
  2856 	} while (closing_quote && s>prevline);
  2857 	for (;s>prevline;s=g_utf8_prev_char(s))
  2858 	{
  2859 	    if (g_unichar_isalpha(g_utf8_get_char(s)))
  2860 	    {
  2861 		if (pswit[ECHO_SWITCH])
  2862 		    g_print("\n%s\n",prevline);
  2863 		if (!pswit[OVERVIEW_SWITCH])
  2864 		    g_print("    Line %ld column %ld - "
  2865 		      "No punctuation at para end?\n",
  2866 		      linecnt-1,g_utf8_strlen(prevline,-1));
  2867 		else
  2868 		    cnt_punct++;
  2869 		break;
  2870 	    }
  2871 	    if (g_utf8_strchr("-.:!([{?}])",-1,g_utf8_get_char(s)))
  2872 		break;
  2873 	}
  2874     }
  2875 }
  2876 
  2877 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
  2878 {
  2879     const char *word=key;
  2880     int *dupcnt=value;
  2881     if (*dupcnt)
  2882 	g_print("\nNote: Queried word %s was duplicated %d times\n",
  2883 	  word,*dupcnt);
  2884     return FALSE;
  2885 }
  2886 
  2887 void print_as_windows_1252(const char *string)
  2888 {
  2889     gsize inbytes,outbytes;
  2890     gchar *buf,*bp;
  2891     static GIConv converter=(GIConv)-1;
  2892     if (!string)
  2893     {
  2894 	if (converter!=(GIConv)-1)
  2895 	    g_iconv_close(converter);
  2896 	converter=(GIConv)-1;
  2897 	return;
  2898     }
  2899     if (converter==(GIConv)-1)
  2900 	converter=g_iconv_open("WINDOWS-1252","UTF-8");
  2901     if (converter!=(GIConv)-1)
  2902     {
  2903 	inbytes=outbytes=strlen(string);
  2904 	bp=buf=g_malloc(outbytes+1);
  2905 	g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
  2906 	*bp='\0';
  2907 	fputs(buf,stdout);
  2908 	g_free(buf);
  2909     }
  2910     else
  2911 	fputs(string,stdout);
  2912 }
  2913 
  2914 void print_as_utf_8(const char *string)
  2915 {
  2916     fputs(string,stdout);
  2917 }
  2918 
  2919 /*
  2920  * procfile:
  2921  *
  2922  * Process one file.
  2923  */
  2924 void procfile(const char *filename)
  2925 {
  2926     const char *s;
  2927     gchar *parastart=NULL;	/* first line of current para */
  2928     gchar *etext,*aline;
  2929     gchar *etext_ptr;
  2930     GError *err=NULL;
  2931     struct first_pass_results *first_pass_results;
  2932     struct warnings *warnings;
  2933     struct counters counters={0};
  2934     struct line_properties last={0};
  2935     struct parities parities={0};
  2936     struct pending pending={0};
  2937     gboolean isemptyline;
  2938     long start_para_line=0;
  2939     gboolean isnewpara=FALSE,enddash=FALSE;
  2940     last.start=CHAR_SPACE;
  2941     linecnt=checked_linecnt=0;
  2942     etext=read_etext(filename,&err);
  2943     if (!etext)
  2944     {
  2945 	if (pswit[STDOUT_SWITCH])
  2946 	    fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
  2947 	else
  2948 	    fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
  2949 	exit(1);
  2950     }
  2951     g_print("\n\nFile: %s\n\n",filename);
  2952     first_pass_results=first_pass(etext);
  2953     warnings=report_first_pass(first_pass_results);
  2954     qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
  2955     qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
  2956     /*
  2957      * Here we go with the main pass. Hold onto yer hat!
  2958      */
  2959     linecnt=0;
  2960     etext_ptr=etext;
  2961     while ((aline=flgets(&etext_ptr,linecnt+1)))
  2962     {
  2963 	linecnt++;
  2964 	if (linecnt==1)
  2965 	    isnewpara=TRUE;
  2966 	if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
  2967 	    continue;    // skip DP page separators completely
  2968 	if (linecnt<first_pass_results->firstline ||
  2969 	  (first_pass_results->footerline>0 &&
  2970 	  linecnt>first_pass_results->footerline))
  2971 	{
  2972 	    if (pswit[HEADER_SWITCH])
  2973 	    {
  2974 		if (g_str_has_prefix(aline,"Title:"))
  2975 		    g_print("    %s\n",aline);
  2976 		if (g_str_has_prefix(aline,"Author:"))
  2977 		    g_print("    %s\n",aline);
  2978 		if (g_str_has_prefix(aline,"Release Date:"))
  2979 		    g_print("    %s\n",aline);
  2980 		if (g_str_has_prefix(aline,"Edition:"))
  2981 		    g_print("    %s\n\n",aline);
  2982 	    }
  2983 	    continue;		/* skip through the header */
  2984 	}
  2985 	checked_linecnt++;
  2986 	print_pending(aline,parastart,&pending);
  2987 	isemptyline=analyse_quotes(aline,linecnt,&counters);
  2988 	if (isnewpara && !isemptyline)
  2989 	{
  2990 	    /* This line is the start of a new paragraph. */
  2991 	    start_para_line=linecnt;
  2992 	    /* Capture its first line in case we want to report it later. */
  2993 	    g_free(parastart);
  2994 	    parastart=g_strdup(aline);
  2995 	    memset(&parities,0,sizeof(parities));  /* restart the quote count */
  2996 	    s=aline;
  2997 	    while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
  2998 	      !g_unichar_isdigit(g_utf8_get_char(s)))
  2999 		s=g_utf8_next_char(s);
  3000 	    if (g_unichar_islower(g_utf8_get_char(s)))
  3001 	    {
  3002 		/* and its first letter is lowercase */
  3003 		if (pswit[ECHO_SWITCH])
  3004 		    g_print("\n%s\n",aline);
  3005 		if (!pswit[OVERVIEW_SWITCH])
  3006 		    g_print("    Line %ld column %ld - "
  3007 		      "Paragraph starts with lower-case\n",
  3008 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  3009 		else
  3010 		    cnt_punct++;
  3011 	    }
  3012 	    isnewpara=FALSE; /* Signal the end of new para processing. */
  3013 	}
  3014 	/* Check for an em-dash broken at line end. */
  3015 	if (enddash && g_utf8_get_char(aline)=='-')
  3016 	{
  3017 	    if (pswit[ECHO_SWITCH])
  3018 		g_print("\n%s\n",aline);
  3019 	    if (!pswit[OVERVIEW_SWITCH])
  3020 		g_print("    Line %ld column 1 - Broken em-dash?\n",linecnt);
  3021 	    else
  3022 		cnt_punct++;
  3023 	}
  3024 	enddash=FALSE;
  3025 	for (s=g_utf8_prev_char(aline+strlen(aline));
  3026 	  g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
  3027 	    ;
  3028 	if (s>=aline && g_utf8_get_char(s)=='-')
  3029 	    enddash=TRUE;
  3030 	check_for_control_characters(aline);
  3031 	check_for_odd_characters(aline,warnings,isemptyline);
  3032 	if (warnings->longline)
  3033 	    check_for_long_line(aline);
  3034 	if (warnings->shortline)
  3035 	    check_for_short_line(aline,&last);
  3036 	last.blen=last.len;
  3037 	last.len=g_utf8_strlen(aline,-1);
  3038 	last.start=g_utf8_get_char(aline);
  3039 	check_for_starting_punctuation(aline);
  3040 	if (warnings->dash)
  3041 	{
  3042 	    check_for_spaced_emdash(aline);
  3043 	    check_for_spaced_dash(aline);
  3044 	}
  3045 	check_for_unmarked_paragraphs(aline);
  3046 	check_for_jeebies(aline);
  3047 	check_for_mta_from(aline);
  3048 	check_for_orphan_character(aline);
  3049 	check_for_pling_scanno(aline);
  3050 	check_for_extra_period(aline,warnings);
  3051 	check_for_following_punctuation(aline);
  3052 	check_for_typos(aline,warnings);
  3053 	check_for_misspaced_punctuation(aline,&parities,isemptyline);
  3054 	check_for_double_punctuation(aline,warnings);
  3055 	check_for_spaced_quotes(aline);
  3056 	check_for_miscased_genative(aline);
  3057 	check_end_of_line(aline,warnings);
  3058 	check_for_unspaced_bracket(aline);
  3059 	if (warnings->endquote)
  3060 	    check_for_unpunctuated_endquote(aline);
  3061 	check_for_html_tag(aline);
  3062 	check_for_html_entity(aline);
  3063 	if (isemptyline)
  3064 	{
  3065 	    check_for_mismatched_quotes(&counters,&pending);
  3066 	    counters_reset(&counters);
  3067 	    /* let the next iteration know that it's starting a new para */
  3068 	    isnewpara=TRUE;
  3069 	    if (prevline)
  3070 		check_for_omitted_punctuation(prevline,&last,start_para_line);
  3071 	}
  3072 	g_free(prevline);
  3073 	prevline=g_strdup(aline);
  3074     }
  3075     linecnt++;
  3076     check_for_mismatched_quotes(&counters,&pending);
  3077     print_pending(NULL,parastart,&pending);
  3078     reset_pending(&pending);
  3079     if (prevline)
  3080     {
  3081 	g_free(prevline);
  3082 	prevline=NULL;
  3083     }
  3084     g_free(parastart);
  3085     g_free(prevline);
  3086     g_free(etext);
  3087     if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])
  3088 	g_tree_foreach(qword,report_duplicate_queries,NULL);
  3089     g_tree_unref(qword);
  3090     g_tree_unref(qperiod);
  3091     counters_destroy(&counters);
  3092     g_set_print_handler(NULL);
  3093     print_as_windows_1252(NULL);
  3094     if (pswit[MARKUP_SWITCH])  
  3095 	loseentities(NULL);
  3096 }
  3097 
  3098 /*
  3099  * flgets:
  3100  *
  3101  * Get one line from the input text, checking for
  3102  * the existence of exactly one CR/LF line-end per line.
  3103  *
  3104  * Returns: a pointer to the line.
  3105  */
  3106 char *flgets(char **etext,long lcnt)
  3107 {
  3108     gunichar c;
  3109     gboolean isCR=FALSE;
  3110     char *theline=*etext;
  3111     char *eos=theline;
  3112     gchar *s;
  3113     for (;;)
  3114     {
  3115 	c=g_utf8_get_char(*etext);
  3116 	*etext=g_utf8_next_char(*etext);
  3117 	if (!c)
  3118 	    return NULL;
  3119 	/* either way, it's end of line */
  3120 	if (c=='\n')
  3121 	{
  3122 	    if (isCR)
  3123 		break;
  3124 	    else
  3125 	    {
  3126 		/* Error - a LF without a preceding CR */
  3127 		if (pswit[LINE_END_SWITCH])
  3128 		{
  3129 		    if (pswit[ECHO_SWITCH])
  3130 		    {
  3131 			s=g_strndup(theline,eos-theline);
  3132 			g_print("\n%s\n",s);
  3133 			g_free(s);
  3134 		    }
  3135 		    if (!pswit[OVERVIEW_SWITCH])
  3136 			g_print("    Line %ld - No CR?\n",lcnt);
  3137 		    else
  3138 			cnt_lineend++;
  3139 		}
  3140 		break;
  3141 	    }
  3142 	}
  3143 	if (c=='\r')
  3144 	{
  3145 	    if (isCR)
  3146 	    {
  3147 		/* Error - two successive CRs */
  3148 		if (pswit[LINE_END_SWITCH])
  3149 		{
  3150 		    if (pswit[ECHO_SWITCH])
  3151 		    {
  3152 			s=g_strndup(theline,eos-theline);
  3153 			g_print("\n%s\n",s);
  3154 			g_free(s);
  3155 		    }
  3156 		    if (!pswit[OVERVIEW_SWITCH])
  3157 			g_print("    Line %ld - Two successive CRs?\n",lcnt);
  3158 		    else
  3159 			cnt_lineend++;
  3160 		}
  3161 	    }
  3162 	    isCR=TRUE;
  3163 	}
  3164 	else
  3165 	{
  3166 	    if (pswit[LINE_END_SWITCH] && isCR)
  3167 	    {
  3168 		if (pswit[ECHO_SWITCH])
  3169 		{
  3170 		    s=g_strndup(theline,eos-theline);
  3171 		    g_print("\n%s\n",s);
  3172 		    g_free(s);
  3173 		}
  3174 		if (!pswit[OVERVIEW_SWITCH])
  3175 		    g_print("    Line %ld column %ld - CR without LF?\n",
  3176 		      lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
  3177 		else
  3178 		    cnt_lineend++;
  3179 		*eos=' ';
  3180 	    }
  3181 	    isCR=FALSE;
  3182 	    eos=g_utf8_next_char(eos);
  3183 	}
  3184     }
  3185     *eos='\0';
  3186     if (pswit[MARKUP_SWITCH])  
  3187 	postprocess_for_HTML(theline);
  3188     if (pswit[DP_SWITCH])  
  3189 	postprocess_for_DP(theline);
  3190     return theline;
  3191 }
  3192 
  3193 /*
  3194  * mixdigit:
  3195  *
  3196  * Takes a "word" as a parameter, and checks whether it
  3197  * contains a mixture of alpha and digits. Generally, this is an
  3198  * error, but may not be for cases like 4th or L5 12s. 3d.
  3199  *
  3200  * Returns: TRUE iff an is error found.
  3201  */
  3202 gboolean mixdigit(const char *checkword)
  3203 {
  3204     gboolean wehaveadigit,wehavealetter,query;
  3205     const char *s,*nondigit;
  3206     wehaveadigit=wehavealetter=query=FALSE;
  3207     for (s=checkword;*s;s=g_utf8_next_char(s))
  3208 	if (g_unichar_isalpha(g_utf8_get_char(s)))
  3209 	    wehavealetter=TRUE;
  3210 	else if (g_unichar_isdigit(g_utf8_get_char(s)))
  3211 	    wehaveadigit=TRUE;
  3212     if (wehaveadigit && wehavealetter)
  3213     {
  3214 	/* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
  3215 	query=TRUE;
  3216 	for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
  3217 	  nondigit=g_utf8_next_char(nondigit))
  3218 	    ;
  3219 	/* digits, ending in st, rd, nd, th of either case */
  3220 	if (!g_ascii_strcasecmp(nondigit,"st") ||
  3221 	  !g_ascii_strcasecmp(nondigit,"rd") ||
  3222 	  !g_ascii_strcasecmp(nondigit,"nd") ||
  3223 	  !g_ascii_strcasecmp(nondigit,"th"))
  3224 	    query=FALSE;
  3225 	if (!g_ascii_strcasecmp(nondigit,"sts") ||
  3226 	  !g_ascii_strcasecmp(nondigit,"rds") ||
  3227 	  !g_ascii_strcasecmp(nondigit,"nds") ||
  3228 	  !g_ascii_strcasecmp(nondigit,"ths"))
  3229 	    query=FALSE;
  3230 	if (!g_ascii_strcasecmp(nondigit,"stly") ||
  3231 	  !g_ascii_strcasecmp(nondigit,"rdly") ||
  3232 	  !g_ascii_strcasecmp(nondigit,"ndly") ||
  3233 	  !g_ascii_strcasecmp(nondigit,"thly"))
  3234 	    query=FALSE;
  3235 	/* digits, ending in l, L, s or d */
  3236 	if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
  3237 	  !strcmp(nondigit,"d"))
  3238 	    query=FALSE;
  3239 	/*
  3240 	 * L at the start of a number, representing Britsh pounds, like L500.
  3241 	 * This is cute. We know the current word is mixed digit. If the first
  3242 	 * letter is L, there must be at least one digit following. If both
  3243 	 * digits and letters follow, we have a genuine error, else we have a
  3244 	 * capital L followed by digits, and we accept that as a non-error.
  3245 	 */
  3246 	if (g_utf8_get_char(checkword)=='L' &&
  3247 	  !mixdigit(g_utf8_next_char(checkword)))
  3248 	    query=FALSE;
  3249     }
  3250     return query;
  3251 }
  3252 
  3253 /*
  3254  * getaword:
  3255  *
  3256  * Extracts the first/next "word" from the line, and returns it.
  3257  * A word is defined as one English word unit--or at least that's the aim.
  3258  * "ptr" is advanced to the position in the line where we will start
  3259  * looking for the next word.
  3260  *
  3261  * Returns: A newly-allocated string.
  3262  */
  3263 gchar *getaword(const char **ptr)
  3264 {
  3265     const char *s,*t;
  3266     GString *word;
  3267     gunichar c,pc;
  3268     word=g_string_new(NULL);
  3269     for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
  3270       !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
  3271       **ptr;*ptr=g_utf8_next_char(*ptr))
  3272 	;
  3273     /*
  3274      * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
  3275      * Especially yucky is the case of L1,000
  3276      * This section looks for a pattern of characters including a digit
  3277      * followed by a comma or period followed by one or more digits.
  3278      * If found, it returns this whole pattern as a word; otherwise we discard
  3279      * the results and resume our normal programming.
  3280      */
  3281     s=*ptr;
  3282     for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
  3283       g_unichar_isalpha(g_utf8_get_char(s)) ||
  3284       g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
  3285 	g_string_append_unichar(word,g_utf8_get_char(s));
  3286     if (word->len)
  3287     {
  3288 	for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))
  3289 	{
  3290 	    c=g_utf8_get_char(t);
  3291 	    pc=g_utf8_get_char(g_utf8_prev_char(t));
  3292 	    if ((c=='.' || c==',') && g_unichar_isdigit(pc))
  3293 	    {
  3294 		*ptr=s;
  3295 		return g_string_free(word,FALSE);
  3296 	    }
  3297 	}
  3298     }
  3299     /* we didn't find a punctuated number - do the regular getword thing */
  3300     g_string_truncate(word,0);
  3301     c=g_utf8_get_char(*ptr);
  3302     for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);
  3303       *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))
  3304 	g_string_append_unichar(word,c);
  3305     return g_string_free(word,FALSE);
  3306 }
  3307 
  3308 /*
  3309  * isroman:
  3310  *
  3311  * Is this word a Roman Numeral?
  3312  *
  3313  * It doesn't actually validate that the number is a valid Roman Numeral--for
  3314  * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
  3315  * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
  3316  * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
  3317  * expressions thereof, except when it came to taxes. Allow any number of M,
  3318  * an optional D, an optional CM or CD, any number of optional Cs, an optional
  3319  * XL or an optional XC, an optional IX or IV, an optional V and any number
  3320  * of optional Is.
  3321  */
  3322 gboolean isroman(const char *t)
  3323 {
  3324     const char *s;
  3325     if (!t || !*t)
  3326 	return FALSE;
  3327     s=t;
  3328     while (g_utf8_get_char(t)=='m' && *t)
  3329 	t++;
  3330     if (g_utf8_get_char(t)=='d')
  3331 	t++;
  3332     if (g_str_has_prefix(t,"cm"))
  3333 	t+=2;
  3334     if (g_str_has_prefix(t,"cd"))
  3335 	t+=2;
  3336     while (g_utf8_get_char(t)=='c' && *t)
  3337 	t++;
  3338     if (g_str_has_prefix(t,"xl"))
  3339 	t+=2;
  3340     if (g_str_has_prefix(t,"xc"))
  3341 	t+=2;
  3342     if (g_utf8_get_char(t)=='l')
  3343 	t++;
  3344     while (g_utf8_get_char(t)=='x' && *t)
  3345 	t++;
  3346     if (g_str_has_prefix(t,"ix"))
  3347 	t+=2;
  3348     if (g_str_has_prefix(t,"iv"))
  3349 	t+=2;
  3350     if (g_utf8_get_char(t)=='v')
  3351 	t++;
  3352     while (g_utf8_get_char(t)=='i' && *t)
  3353 	t++;
  3354     return !*t;
  3355 }
  3356 
  3357 /*
  3358  * postprocess_for_DP:
  3359  *
  3360  * Invoked with the -d switch from flgets().
  3361  * It simply "removes" from the line a hard-coded set of common
  3362  * DP-specific tags, so that the line passed to the main routine has
  3363  * been pre-cleaned of DP markup.
  3364  */
  3365 void postprocess_for_DP(char *theline)
  3366 {
  3367     char *s,*t;
  3368     int i;
  3369     if (!*theline) 
  3370 	return;
  3371     for (i=0;*DPmarkup[i];i++)
  3372 	while ((s=strstr(theline,DPmarkup[i])))
  3373 	{
  3374 	    t=s+strlen(DPmarkup[i]);
  3375 	    memmove(s,t,strlen(t)+1);
  3376 	}
  3377 }
  3378 
  3379 /*
  3380  * postprocess_for_HTML:
  3381  *
  3382  * Invoked with the -m switch from flgets().
  3383  * It simply "removes" from the line a hard-coded set of common
  3384  * HTML tags and "replaces" a hard-coded set of common HTML
  3385  * entities, so that the line passed to the main routine has
  3386  * been pre-cleaned of HTML.
  3387  */
  3388 void postprocess_for_HTML(char *theline)
  3389 {
  3390     while (losemarkup(theline))
  3391 	;
  3392     loseentities(theline);
  3393 }
  3394 
  3395 char *losemarkup(char *theline)
  3396 {
  3397     char *s,*t;
  3398     int i;
  3399     s=strchr(theline,'<');
  3400     t=s?strchr(s,'>'):NULL;
  3401     if (!s || !t)
  3402 	return NULL;
  3403     for (i=0;*markup[i];i++)
  3404 	if (tagcomp(g_utf8_next_char(s),markup[i]))
  3405 	{
  3406 	    t=g_utf8_next_char(t);
  3407 	    memmove(s,t,strlen(t)+1);
  3408 	    return s;
  3409 	}
  3410     /* It's an unrecognized <xxx>. */
  3411     return NULL;
  3412 }
  3413 
  3414 void loseentities(char *theline)
  3415 {
  3416     int i;
  3417     gsize nb;
  3418     char *amp,*scolon;
  3419     gchar *s,*t;
  3420     gunichar c;
  3421     GTree *entities=NULL;
  3422     static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
  3423     if (!theline)
  3424     {
  3425 	if (entities)
  3426 	    g_tree_destroy(entities);
  3427 	entities=NULL;
  3428 	if (translit!=(GIConv)-1)
  3429 	    g_iconv_close(translit);
  3430 	translit=(GIConv)-1;
  3431 	if (to_utf8!=(GIConv)-1)
  3432 	    g_iconv_close(to_utf8);
  3433 	to_utf8=(GIConv)-1;
  3434 	return;
  3435     }
  3436     if (!*theline)
  3437 	return;
  3438     if (!entities)
  3439     {
  3440 	entities=g_tree_new((GCompareFunc)strcmp);
  3441 	for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
  3442 	    g_tree_insert(entities,HTMLentities[i].name,
  3443 	      GUINT_TO_POINTER(HTMLentities[i].c));
  3444     }
  3445     if (translit==(GIConv)-1)
  3446 	translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
  3447     if (to_utf8==(GIConv)-1)
  3448 	to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
  3449     while((amp=strchr(theline,'&')))
  3450     {
  3451 	scolon=strchr(amp,';');
  3452 	if (scolon)
  3453 	{
  3454 	    if (amp[1]=='#')
  3455 	    {
  3456 		if (amp+2+strspn(amp+2,"0123456789")==scolon)
  3457 		    c=strtol(amp+2,NULL,10);
  3458 		else if (amp[2]=='x' &&
  3459 		  amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
  3460 		    c=strtol(amp+3,NULL,16);
  3461 	    }
  3462 	    else
  3463 	    {
  3464 		s=g_strndup(amp+1,scolon-(amp+1));
  3465 	        c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
  3466 		g_free(s);
  3467 	    }
  3468 	}
  3469 	else
  3470 	    c=0;
  3471 	if (c)
  3472 	{
  3473 	    theline=amp;
  3474 	    if (c<128 || c>=192 && c<=255)	/* An ISO-8859-1 character */
  3475 		theline+=g_unichar_to_utf8(c,theline);
  3476 	    else
  3477 	    {
  3478 		s=g_malloc(6);
  3479 		nb=g_unichar_to_utf8(c,s);
  3480 		t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
  3481 		g_free(s);
  3482 		s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
  3483 		g_free(t);
  3484 		memcpy(theline,s,nb);
  3485 		g_free(s);
  3486 		theline+=nb;
  3487 	    }
  3488 	    memmove(theline,g_utf8_next_char(scolon),
  3489 	      strlen(g_utf8_next_char(scolon))+1);
  3490 	}
  3491 	else
  3492 	    theline=g_utf8_next_char(amp);
  3493     }
  3494 }
  3495 
  3496 gboolean tagcomp(const char *strin,const char *basetag)
  3497 {
  3498     gboolean retval;
  3499     gchar *s,*t;
  3500     if (g_utf8_get_char(strin)=='/')
  3501 	t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
  3502     else
  3503 	t=g_utf8_casefold(strin,-1);
  3504     s=g_utf8_casefold(basetag,-1);
  3505     retval=g_str_has_prefix(t,s);
  3506     g_free(s);
  3507     g_free(t);
  3508     return retval;
  3509 }
  3510 
  3511 void proghelp(GOptionContext *context)
  3512 {
  3513     gchar *help;
  3514     fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
  3515     fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
  3516     fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
  3517     fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
  3518       "For details, read the file COPYING.\n",stderr);
  3519     fputs("This is Free Software; "
  3520       "you may redistribute it under certain conditions (GPL);\n",stderr);
  3521     fputs("read the file COPYING for details.\n\n",stderr);
  3522     help=g_option_context_get_help(context,TRUE,NULL);
  3523     fputs(help,stderr);
  3524     g_free(help);
  3525     fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
  3526     fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
  3527       "non-ASCII\n",stderr);
  3528     fputs("characters like accented letters, "
  3529       "lines longer than 75 or shorter than 55,\n",stderr);
  3530     fputs("unbalanced quotes or brackets, "
  3531       "a variety of badly formatted punctuation, \n",stderr);
  3532     fputs("HTML tags, some likely typos. "
  3533       "It is NOT a substitute for human judgement.\n",stderr);
  3534     fputs("\n",stderr);
  3535 }