bookloupe/bookloupe.c
author ali <ali@juiblex.co.uk>
Fri Oct 25 11:15:18 2013 +0100 (2013-10-25)
changeset 102 ff0aa9b1397a
parent 101 f44c530f80da
child 103 d22d8cd4f628
permissions -rw-r--r--
Fix bug #14: Add a configuration file
     1 /*************************************************************************/
     2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
     3 /*									 */
     4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>			 */
     5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>			 */
     6 /*									 */
     7 /* This program is free software; you can redistribute it and/or modify  */
     8 /* it under the terms of the GNU General Public License as published by  */
     9 /* the Free Software Foundation; either version 2 of the License, or     */
    10 /* (at your option) any later version.					 */
    11 /*									 */
    12 /* This program is distributed in the hope that it will be useful,       */
    13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of	 */
    14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the		 */
    15 /* GNU General Public License for more details.				 */
    16 /*									 */
    17 /* You should have received a copy of the GNU General Public License	 */
    18 /* along with this program. If not, see <http://www.gnu.org/licenses/>.	 */
    19 /*************************************************************************/
    20 
    21 #include <stdio.h>
    22 #include <stdlib.h>
    23 #include <string.h>
    24 #include <ctype.h>
    25 #ifdef __WIN32__
    26 #include <windows.h>
    27 #endif
    28 #include <glib.h>
    29 #include <bl/bl.h>
    30 #include "bookloupe.h"
    31 #include "counters.h"
    32 #include "pending.h"
    33 #include "HTMLentities.h"
    34 
    35 gchar *prevline;
    36 
/*
 * Common typos.
 *
 * Every entry is lower case and the list is terminated by an empty
 * string rather than by NULL.
 */
char *typo[] = {
    "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
    "nad", "te", "ig", "acn",  "ahve", "alot", "anbd", "andt", "awya", "aywa",
    "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
    "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
    "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
    "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
    "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
    "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
    "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
    "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
    "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
    "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
    "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
    "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
    "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
    "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
    "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
    "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
    "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
    "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
    "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
    "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
    "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
    "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
    "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
    "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
    "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
    "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
    "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
    "se", ""
};
    70 
    71 GTree *usertypo;
    72 
/*
 * Common abbreviations and other OK words not to query as typos.
 *
 * Lower-case entries, terminated by an empty string.
 */
char *okword[] = {
    "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
    "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
    "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
    "outbid", "outbids", "frostbite", "frostbitten", ""
};
    80 
/*
 * Common abbreviations that cause otherwise unexplained periods.
 *
 * Lower-case entries, terminated by an empty string.
 */
char *abbrev[] = {
    "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
    "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
};
    86 
/*
 * Two-Letter combinations that rarely if ever start words,
 * but are common scannos or otherwise common letter combinations.
 *
 * The list is terminated by an empty string.
 */
char *nostart[] = {
    "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
};
    94 
/*
 * Two-Letter combinations that rarely if ever end words,
 * but are common scannos or otherwise common letter combinations.
 *
 * The list is terminated by an empty string.
 */
char *noend[] = {
    "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
    "sw", "gr", "sl", "cl", "iy", ""
};
   103 
/*
 * Lower-case element names, terminated by an empty string.
 * NOTE(review): these look like HTML tag names, presumably used when
 * deciding whether text inside < > is markup - confirm against the users
 * of this table.
 */
char *markup[] = {
    "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
    "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
    "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
    "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
};
   110 
/*
 * Literal markup sequences, terminated by an empty string.
 * NOTE(review): presumably the DP-specific markup that the --dp switch
 * ("Ignore DP-specific markup") refers to - confirm against the users of
 * this table.
 */
char *DPmarkup[] = {
    "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
};
   114 
/*
 * Lower-case words, terminated by an empty string.
 * NOTE(review): judging by the name, words that should not normally be
 * followed by a comma - confirm against the users of this table.
 * "mrs" appears twice in the list.
 */
char *nocomma[] = {
    "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
    "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
    "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
    "during", "let", "toward", "among", ""
};
   121 
/*
 * Lower-case words, terminated by an empty string.
 * NOTE(review): judging by the name, words that should not normally be
 * followed by a period - confirm against the users of this table.
 */
char *noperiod[] = {
    "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
    "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
    "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
    "among", "those", "into", "whom", "having", "thence", ""
}; 
   128 
/* One flag per switch; indexed by the *_SWITCH constants. */
gboolean pswit[SWITNO];  /* program switches */

/*
 * Set by the Gutcheck compatibility options -t and -x; parse_options()
 * uses them to toggle entries of pswit[] after the command line is parsed.
 */
gboolean typo_compat,paranoid_compat;
   132 
/*
 * Switch options. Every entry is a G_OPTION_ARG_NONE flag that stores
 * into an element of pswit[], and each switch comes as a pair: the plain
 * name and a "no-" prefixed name carrying G_OPTION_FLAG_REVERSE. One of
 * each pair is marked G_OPTION_FLAG_HIDDEN.
 *
 * config_file_update(), config_file_add_comments() and
 * parse_config_file() also walk this table, skipping the "no-" entries,
 * so the plain long names double as configuration file keys.
 */
static GOptionEntry options[]={
    { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
      "Ignore DP-specific markup", NULL },
    { "no-dp", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
      G_OPTION_ARG_NONE, pswit+DP_SWITCH,
      "Don't ignore DP-specific markup", NULL },
    { "echo", 0, G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
      "Echo queried line", NULL },
    { "no-echo", 'e', G_OPTION_FLAG_REVERSE,
      G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
      "Don't echo queried line", NULL },
    { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
      "Check single quotes", NULL },
    { "no-squote", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
      G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
      "Don't check single quotes", NULL },
    { "typo", 0, 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
      "Check common typos", NULL },
    { "no-typo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
      G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
      "Don't check common typos", NULL },
    { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
      "Require closure of quotes on every paragraph", NULL },
    { "no-qpara", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
      G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
      "Don't require closure of quotes on every paragraph", NULL },
    { "paranoid", 0, G_OPTION_FLAG_HIDDEN,
      G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
      "Enable paranoid querying of everything", NULL },
    { "no-paranoid", 0, G_OPTION_FLAG_REVERSE,
      G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
      "Disable paranoid querying of everything", NULL },
    { "line-end", 0, G_OPTION_FLAG_HIDDEN,
      G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
      "Enable line end checking", NULL },
    { "no-line-end", 'l', G_OPTION_FLAG_REVERSE,
      G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
      "Disable line end checking", NULL },
    { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
      "Overview: just show counts", NULL },
    { "no-overview", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
      G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
      "Show individual warnings", NULL },
    { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
      "Output errors to stdout instead of stderr", NULL },
    { "no-stdout", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
      G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
      "Output errors to stderr instead of stdout", NULL },
    { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
      "Echo header fields", NULL },
    { "no-header", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
      G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
      "Don't echo header fields", NULL },
    { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
      "Ignore markup in < >", NULL },
    { "no-markup", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
      G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
      "No special handling for markup in < >", NULL },
    { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
      "Use file of user-defined typos", NULL },
    { "no-usertypo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
      G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
      "Ignore file of user-defined typos", NULL },
    { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
      "Verbose - list everything", NULL },
    { "no-verbose", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
      G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
      "Switch off verbose mode", NULL },
    { NULL }	/* terminator: long_name == NULL ends the table */
};
   203 
/*
 * Options relating to configuration which make no sense from inside
 * a configuration file.
 *
 * They are registered with the option context in parse_options() but are
 * kept out of options[], so the configuration file readers and writers
 * never see them.
 */

static GOptionEntry config_options[]={
    { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
      "Defaults for use on www upload", NULL },
    { "dump-config", 0, 0, G_OPTION_ARG_NONE, pswit+DUMP_CONFIG_SWITCH,
      "Dump current config settings", NULL },
    { NULL }	/* terminator */
};
   216 
/*
 * Gutcheck-compatible toggles. These set typo_compat / paranoid_compat
 * rather than writing to pswit[] directly; parse_options() applies the
 * toggles once the command line has been parsed.
 */
static GOptionEntry compatibility_options[]={
    { "toggle-typo", 't', 0, G_OPTION_ARG_NONE, &typo_compat,
      "Toggle checking for common typos", NULL },
    { "toggle-relaxed", 'x', 0, G_OPTION_ARG_NONE, &paranoid_compat,
      "Toggle both paranoid mode and common typos", NULL },
    { NULL }	/* terminator */
};
   224 
/*
 * Query counters. The cnt_* totals up to cnt_lineend are what main()
 * prints in overview mode; linecnt and checked_linecnt feed the
 * "Checked %ld lines of %ld" summary line.
 */
long cnt_quote;		/* for overview mode, count of quote queries */
long cnt_brack;		/* for overview mode, count of brackets queries */
long cnt_bin;		/* for overview mode, count of non-ASCII queries */
long cnt_odd;		/* for overview mode, count of odd character queries */
long cnt_long;		/* for overview mode, count of long line errors */
long cnt_short;		/* for overview mode, count of short line queries */
long cnt_punct;		/* for overview mode,
			   count of punctuation and spacing queries */
long cnt_dash;		/* for overview mode, count of dash-related queries */
long cnt_word;		/* for overview mode, count of word queries */
long cnt_html;		/* for overview mode, count of html queries */
long cnt_lineend;	/* for overview mode, count of line-end queries */
long cnt_spacend;	/* count of lines with space at end */
long linecnt;		/* count of total lines in the file */
long checked_linecnt;	/* count of lines actually checked */
   240 
   241 void proghelp(GOptionContext *context);
   242 void procfile(const char *);
   243 
   244 gchar *running_from;
   245 
   246 gboolean mixdigit(const char *);
   247 gchar *getaword(const char **);
   248 char *flgets(char **,long,int);
   249 void postprocess_for_HTML(char *);
   250 char *linehasmarkup(char *);
   251 char *losemarkup(char *);
   252 gboolean tagcomp(const char *,const char *);
   253 void loseentities(char *);
   254 gboolean isroman(const char *);
   255 void postprocess_for_DP(char *);
   256 void print_as_windows_1252(const char *string);
   257 void print_as_utf_8(const char *string);
   258 
   259 GTree *qword,*qperiod;
   260 
   261 #ifdef __WIN32__
   262 UINT saved_cp;
   263 #endif
   264 
   265 GKeyFile *config;
   266 
   267 void config_file_update(GKeyFile *kf)
   268 {
   269     int i;
   270     gboolean sw;
   271     for(i=0;options[i].long_name;i++)
   272     {
   273 	if (g_str_has_prefix(options[i].long_name,"no-"))
   274 	    continue;
   275 	if (options[i].arg==G_OPTION_ARG_NONE)
   276 	{
   277 	    sw=*(gboolean *)options[i].arg_data;
   278 	    if (options[i].flags&G_OPTION_FLAG_REVERSE)
   279 		sw=!sw;
   280 	    g_key_file_set_boolean(kf,"options",options[i].long_name,sw);
   281 	}
   282 	else
   283 	    g_assert_not_reached();
   284     }
   285 }
   286 
   287 void config_file_add_comments(GKeyFile *kf)
   288 {
   289     int i;
   290     gchar *comment;
   291     g_key_file_set_comment(kf,NULL,NULL," Default configuration for bookloupe",
   292       NULL);
   293     for(i=0;options[i].long_name;i++)
   294     {
   295 	if (g_str_has_prefix(options[i].long_name,"no-"))
   296 	    continue;
   297 	comment=g_strconcat(" ",options[i].description,NULL);
   298 	g_key_file_set_comment(kf,"options",options[i].long_name,comment,NULL);
   299 	g_free(comment);
   300     }
   301 }
   302 
   303 void dump_config(void)
   304 {
   305     gchar *s;
   306     if (config)
   307 	config_file_update(config);
   308     else
   309     {
   310 	config=g_key_file_new();
   311 	config_file_update(config);
   312 	config_file_add_comments(config);
   313     }
   314     s=g_key_file_to_data(config,NULL,NULL);
   315     if (s)
   316 	g_print("%s",s);
   317     g_free(s);
   318 }
   319 
   320 GKeyFile *read_config_file(gchar **full_path)
   321 {
   322     int i;
   323     GError *err=NULL;
   324     gchar **search_dirs;
   325     gchar *path;
   326     const char *search_path;
   327     GKeyFile *kf;
   328     kf=g_key_file_new();
   329     search_path=g_getenv("BOOKLOUPE_CONFIG_PATH");
   330     if (search_path)
   331     {
   332 #ifdef __WIN32__
   333 	search_dirs=g_strsplit(search_path,";",0);
   334 #else
   335 	search_dirs=g_strsplit(search_path,":",0);
   336 #endif
   337     }
   338     else
   339     {
   340 	search_dirs=g_new(gchar *,4);
   341 	search_dirs[0]=g_get_current_dir();
   342 	search_dirs[1]=g_strdup(running_from);
   343 	search_dirs[2]=g_strdup(g_get_user_config_dir());
   344 	search_dirs[3]=NULL;
   345     }
   346     for(i=0;search_dirs[i];i++)
   347     {
   348 	path=g_build_filename(search_dirs[i],"bookloupe.ini",NULL);
   349 	if (g_key_file_load_from_file(kf,path,
   350 	  G_KEY_FILE_KEEP_COMMENTS|G_KEY_FILE_KEEP_TRANSLATIONS,&err))
   351 	    break;
   352 	if (!g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   353 	{
   354 	    g_printerr("Bookloupe: Error reading %s\n",path);
   355 	    g_printerr("%s\n",err->message);
   356 	    exit(1);
   357 	}
   358 	g_clear_error(&err);
   359 	g_free(path);
   360 	path=NULL;
   361     }
   362     if (!search_dirs[i])
   363     {
   364 	g_key_file_free(kf);
   365 	kf=NULL;
   366     }
   367     g_strfreev(search_dirs);
   368     if (full_path && kf)
   369 	*full_path=path;
   370     else
   371 	g_free(path);
   372     return kf;
   373 }
   374 
   375 void parse_config_file(void)
   376 {
   377     int i,j;
   378     gchar *path;
   379     gchar **keys;
   380     gboolean sw;
   381     GError *err=NULL;
   382     config=read_config_file(&path);
   383     if (config)
   384 	keys=g_key_file_get_keys(config,"options",NULL,NULL);
   385     else
   386 	keys=NULL;
   387     if (keys)
   388     {
   389 	for(i=0;keys[i];i++)
   390 	{
   391 	    for(j=0;options[j].long_name;j++)
   392 	    {
   393 		if (g_str_has_prefix(options[j].long_name,"no-"))
   394 		    continue;
   395 		else if (!strcmp(keys[i],options[j].long_name))
   396 		{
   397 		    if (options[j].arg==G_OPTION_ARG_NONE)
   398 		    {
   399 			sw=g_key_file_get_boolean(config,"options",keys[i],
   400 			  &err);
   401 			if (err)
   402 			{
   403 			    g_printerr("Bookloupe: %s: options.%s: %s\n",
   404 			      path,keys[i],err->message);
   405 			    g_clear_error(&err);
   406 			}
   407 			if (options[j].flags&G_OPTION_FLAG_REVERSE)
   408 			    sw=!sw;
   409 			*(gboolean *)options[j].arg_data=sw;
   410 			break;
   411 		    }
   412 		    else
   413 			g_assert_not_reached();
   414 		}
   415 	    }
   416 	    if (!options[j].long_name)
   417 		g_printerr("Bookloupe: %s: Unknown option \"%s\" ignored\n",
   418 		  path,keys[i]);
   419 	}
   420 	g_strfreev(keys);
   421     }
   422     if (config)
   423 	g_free(path);
   424 }
   425 
   426 void parse_options(int *argc,char ***argv)
   427 {
   428     GError *err=NULL;
   429     GOptionContext *context;
   430     GOptionGroup *compatibility;
   431     context=g_option_context_new(
   432       "file - look for errors in Project Gutenberg(TM) etexts");
   433     g_option_context_add_main_entries(context,options,NULL);
   434     g_option_context_add_main_entries(context,config_options,NULL);
   435     compatibility=g_option_group_new("compatibility",
   436       "Options for Compatibility with Gutcheck:",
   437       "Show compatibility options",NULL,NULL);
   438     g_option_group_add_entries(compatibility,compatibility_options);
   439     g_option_context_add_group(context,compatibility);
   440     g_option_context_set_description(context,
   441       "For simplicity, only the switch options which reverse the\n"
   442       "default configuration are listed. In most cases, both vanilla\n"
   443       "and \"no-\" prefixed versions are available for use.");
   444     if (!g_option_context_parse(context,argc,argv,&err))
   445     {
   446 	g_printerr("Bookloupe: %s\n",err->message);
   447 	g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
   448 	exit(1);
   449     }
   450     if (typo_compat)
   451 	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
   452     if (paranoid_compat)
   453     {
   454 	pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
   455 	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
   456     }
   457     /*
   458      * Web uploads - for the moment, this is really just a placeholder
   459      * until we decide what processing we really want to do on web uploads
   460      */
   461     if (pswit[WEB_SWITCH])
   462     {
   463 	/* specific override for web uploads */
   464 	pswit[ECHO_SWITCH]=TRUE;
   465 	pswit[SQUOTE_SWITCH]=FALSE;
   466 	pswit[TYPO_SWITCH]=TRUE;
   467 	pswit[QPARA_SWITCH]=FALSE;
   468 	pswit[PARANOID_SWITCH]=TRUE;
   469 	pswit[LINE_END_SWITCH]=FALSE;
   470 	pswit[OVERVIEW_SWITCH]=FALSE;
   471 	pswit[STDOUT_SWITCH]=FALSE;
   472 	pswit[HEADER_SWITCH]=TRUE;
   473 	pswit[VERBOSE_SWITCH]=FALSE;
   474 	pswit[MARKUP_SWITCH]=FALSE;
   475 	pswit[USERTYPO_SWITCH]=FALSE;
   476 	pswit[DP_SWITCH]=FALSE;
   477     }
   478     if (pswit[DUMP_CONFIG_SWITCH])
   479     {
   480 	dump_config();
   481 	exit(0);
   482     }
   483     if (pswit[OVERVIEW_SWITCH])
   484 	/* just print summary; don't echo */
   485 	pswit[ECHO_SWITCH]=FALSE;
   486     if (*argc<2)
   487     {
   488 	proghelp(context);
   489 	exit(1);
   490     }
   491     g_option_context_free(context);
   492 }
   493 
   494 /*
   495  * read_user_scannos:
   496  *
   497  * Read in the user-defined stealth scanno list.
   498  */
   499 void read_user_scannos(void)
   500 {
   501     GError *err=NULL;
   502     gchar *usertypo_file;
   503     gboolean okay;
   504     int i;
   505     gsize len,nb;
   506     gchar *contents,*utf8,**lines;
   507     usertypo_file=g_strdup("bookloupe.typ");
   508     okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   509     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   510     {
   511 	g_clear_error(&err);
   512 	g_free(usertypo_file);
   513 	usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
   514 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   515     }
   516     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   517     {
   518 	g_clear_error(&err);
   519 	g_free(usertypo_file);
   520 	usertypo_file=g_strdup("gutcheck.typ");
   521 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   522     }
   523     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   524     {
   525 	g_clear_error(&err);
   526 	g_free(usertypo_file);
   527 	usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
   528 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   529     }
   530     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   531     {
   532 	g_free(usertypo_file);
   533 	g_print("   --> I couldn't find bookloupe.typ "
   534 	  "-- proceeding without user typos.\n");
   535 	return;
   536     }
   537     else if (!okay)
   538     {
   539 	fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
   540 	g_free(usertypo_file);
   541 	g_clear_error(&err);
   542 	exit(1);
   543     }
   544     if (g_utf8_validate(contents,len,NULL))
   545 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
   546     else
   547 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
   548     g_free(contents);
   549     lines=g_strsplit_set(utf8,"\r\n",0);
   550     g_free(utf8);
   551     usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
   552     for (i=0;lines[i];i++)
   553 	if (*(unsigned char *)lines[i]>'!')
   554 	    g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
   555 	else
   556 	    g_free(lines[i]);
   557     g_free(lines);
   558 }
   559 
   560 /*
   561  * read_etext:
   562  *
   563  * Read an etext returning a newly allocated string containing the file
   564  * contents or NULL on error.
   565  */
   566 gchar *read_etext(const char *filename,GError **err)
   567 {
   568     GError *tmp_err=NULL;
   569     gchar *contents,*utf8;
   570     gsize len,bytes_read,bytes_written;
   571     int i,line,col;
   572     if (!g_file_get_contents(filename,&contents,&len,err))
   573 	return NULL;
   574     if (g_utf8_validate(contents,len,NULL))
   575     {
   576 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
   577 	g_set_print_handler(print_as_utf_8);
   578 #ifdef __WIN32__
   579 	SetConsoleOutputCP(CP_UTF8);
   580 #endif
   581     }
   582     else
   583     {
   584 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,
   585 	  &bytes_written,&tmp_err);
   586 	if (g_error_matches(tmp_err,G_CONVERT_ERROR,
   587 	  G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
   588 	{
   589 	    line=col=1;
   590 	    for(i=0;i<bytes_read;i++)
   591 		if (contents[i]=='\n')
   592 		{
   593 		    line++;
   594 		    col=1;
   595 		}
   596 		else if (contents[i]!='\r')
   597 		    col++;
   598 	    g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
   599 	      "Input conversion failed. Byte %d at line %d, column %d is not a "
   600 	      "valid Windows-1252 character",
   601 	      ((unsigned char *)contents)[bytes_read],line,col);
   602 	}
   603 	else if (tmp_err)
   604 	    g_propagate_error(err,tmp_err);
   605 	g_set_print_handler(print_as_windows_1252);
   606 #ifdef __WIN32__
   607 	SetConsoleOutputCP(1252);
   608 #endif
   609     }
   610     g_free(contents);
   611     return utf8;
   612 }
   613 
/*
 * cleanup_on_exit:
 *
 * Exit handler. Under __WIN32__ it puts the console output code page back
 * to the value held in saved_cp; on other platforms it does nothing.
 */
void cleanup_on_exit(void)
{
#ifdef __WIN32__
    SetConsoleOutputCP(saved_cp);
#endif
}
   620 
   621 int main(int argc,char **argv)
   622 {
   623 #ifdef __WIN32__
   624     atexit(cleanup_on_exit);
   625     saved_cp=GetConsoleOutputCP();
   626 #endif
   627     running_from=g_path_get_dirname(argv[0]);
   628     /* Paranoid checking is turned OFF, not on, by its switch */
   629     pswit[PARANOID_SWITCH]=TRUE;
   630     /* if running in paranoid mode, typo checks default to enabled */
   631     pswit[TYPO_SWITCH]=TRUE;
   632     /* Line-end checking is turned OFF, not on, by its switch */
   633     pswit[LINE_END_SWITCH]=TRUE;
   634     /* Echoing is turned OFF, not on, by its switch */
   635     pswit[ECHO_SWITCH]=TRUE;
   636     parse_config_file();
   637     parse_options(&argc,&argv);
   638     if (pswit[USERTYPO_SWITCH])
   639 	read_user_scannos();
   640     fprintf(stderr,"bookloupe: Check and report on an e-text\n");
   641     procfile(argv[1]);
   642     if (pswit[OVERVIEW_SWITCH])
   643     {
   644 	g_print("    Checked %ld lines of %ld (head+foot = %ld)\n\n",
   645 	  checked_linecnt,linecnt,linecnt-checked_linecnt);
   646 	g_print("    --------------- Queries found --------------\n");
   647 	if (cnt_long)
   648 	    g_print("    Long lines:		    %14ld\n",cnt_long);
   649 	if (cnt_short)
   650 	    g_print("    Short lines:		   %14ld\n",cnt_short);
   651 	if (cnt_lineend)
   652 	    g_print("    Line-end problems:	     %14ld\n",cnt_lineend);
   653 	if (cnt_word)
   654 	    g_print("    Common typos:		  %14ld\n",cnt_word);
   655 	if (cnt_quote)
   656 	    g_print("    Unmatched quotes:	      %14ld\n",cnt_quote);
   657 	if (cnt_brack)
   658 	    g_print("    Unmatched brackets:	    %14ld\n",cnt_brack);
   659 	if (cnt_bin)
   660 	    g_print("    Non-ASCII characters:	  %14ld\n",cnt_bin);
   661 	if (cnt_odd)
   662 	    g_print("    Proofing characters:	   %14ld\n",cnt_odd);
   663 	if (cnt_punct)
   664 	    g_print("    Punctuation & spacing queries: %14ld\n",cnt_punct);
   665 	if (cnt_dash)
   666 	    g_print("    Non-standard dashes:	   %14ld\n",cnt_dash);
   667 	if (cnt_html)
   668 	    g_print("    Possible HTML tags:	    %14ld\n",cnt_html);
   669 	g_print("\n");
   670 	g_print("    TOTAL QUERIES		  %14ld\n",
   671 	  cnt_quote+cnt_brack+cnt_bin+cnt_odd+cnt_long+cnt_short+cnt_punct+
   672 	  cnt_dash+cnt_word+cnt_html+cnt_lineend);
   673     }
   674     g_free(running_from);
   675     if (usertypo)
   676 	g_tree_unref(usertypo);
   677     if (config)
   678 	g_key_file_free(config);
   679     return 0;
   680 }
   681 
   682 void count_dashes(const char *line,const char *dash,
   683   struct dash_results *results)
   684 {
   685     int i;
   686     gchar **tokens;
   687     gunichar pc,nc;
   688     gboolean spaced=FALSE,unspaced=FALSE,spaced2=FALSE;
   689     if (!*line)
   690 	return;
   691     tokens=g_strsplit(line,dash,0);
   692     if (tokens[1])
   693 	results->base++;
   694     for(i=1;tokens[i];i++)
   695     {
   696 	pc=g_utf8_get_char(g_utf8_prev_char(tokens[i-1]+strlen(tokens[i-1])));
   697 	nc=g_utf8_get_char(tokens[i]);
   698 	if (g_unichar_isspace(pc) || g_unichar_isspace(nc))
   699 	    spaced=TRUE;
   700 	if (g_unichar_isspace(pc) && g_unichar_isspace(nc))
   701 	    spaced2=TRUE;
   702 	else if (!g_unichar_isspace(pc) && !g_unichar_isspace(nc))
   703 	    unspaced=TRUE;
   704     }
   705     if (spaced)
   706 	results->space++;
   707     if (spaced2)
   708 	/* count of lines with em-dashes with spaces both sides */
   709 	results->non_PG_space++;
   710     if (unspaced)
   711 	/* count of lines with PG-type em-dashes with no spaces */
   712 	results->PG_space++;
   713     g_strfreev(tokens);
   714 }
   715 
   716 /*
   717  * first_pass:
   718  *
   719  * Run a first pass - verify that it's a valid PG
   720  * file, decide whether to report some things that
   721  * occur many times in the text like long or short
   722  * lines, non-standard dashes, etc.
   723  */
   724 struct first_pass_results *first_pass(const char *etext)
   725 {
   726     gunichar laststart=CHAR_SPACE;
   727     const char *s;
   728     gchar *lc_line;
   729     int i,j,lbytes,llen;
   730     gchar **lines;
   731     unsigned int lastlen=0,lastblen=0;
   732     long spline=0,nspline=0;
   733     static struct first_pass_results results={0};
   734     struct dash_results tmp_dash_results;
   735     gchar *inword;
   736     QuoteClass qc;
   737     lines=g_strsplit(etext,"\n",0);
   738     if (!lines[0])
   739     {
   740 	/* An empty etext has no terminators */
   741 	results.newlines=DOS_NEWLINES;
   742     }
   743     else if (!lines[1])
   744     {
   745 	/*
   746 	 * If there are no LFs, we don't have UNIX-style
   747 	 * terminators, but we might have OS9-style ones.
   748 	 */
   749 	results.newlines=OS9_NEWLINES;
   750 	g_strfreev(lines);
   751 	lines=g_strsplit(etext,"\r",0);
   752 	if (!lines[0] || !lines[1])
   753 	    /* Looks like we don't have any terminators at all */
   754 	    results.newlines=DOS_NEWLINES;
   755     }
   756     else
   757     {
   758 	/* We might have UNIX-style terminators */
   759 	results.newlines=UNIX_NEWLINES;
   760     }
   761     for (j=0;lines[j];j++)
   762     {
   763 	lbytes=strlen(lines[j]);
   764 	if (lbytes>0 && lines[j][lbytes-1]=='\r')
   765 	{
   766 	    results.newlines=DOS_NEWLINES;
   767 	    do
   768 	    {
   769 		lines[j][--lbytes]='\0';
   770 	    } while (lbytes>0 && lines[j][lbytes-1]=='\r');
   771 	}
   772 	llen=g_utf8_strlen(lines[j],lbytes);
   773 	linecnt++;
   774 	if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
   775 	  (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
   776 	{
   777 	    if (spline)
   778 		g_print("   --> Duplicate header?\n");
   779 	    spline=linecnt+1;   /* first line of non-header text, that is */
   780 	}
   781 	if (!strncmp(lines[j],"*** START",9) &&
   782 	  strstr(lines[j],"PROJECT GUTENBERG"))
   783 	{
   784 	    if (nspline)
   785 		g_print("   --> Duplicate header?\n");
   786 	    nspline=linecnt+1;   /* first line of non-header text, that is */
   787 	}
   788 	if (spline || nspline)
   789 	{
   790 	    lc_line=g_utf8_strdown(lines[j],lbytes);
   791 	    if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
   792 	    {
   793 		if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
   794 		{
   795 		    if (results.footerline)
   796 		    {
   797 			/* it's an old-form header - we can detect duplicates */
   798 			if (!nspline)
   799 			    g_print("   --> Duplicate footer?\n");
   800 		    }
   801 		    else
   802 			results.footerline=linecnt;
   803 		}
   804 	    }
   805 	    g_free(lc_line);
   806 	}
   807 	if (spline)
   808 	    results.firstline=spline;
   809 	if (nspline)
   810 	    results.firstline=nspline;  /* override with new */
   811 	if (results.footerline)
   812 	    continue;    /* don't count the boilerplate in the footer */
   813 	results.totlen+=llen;
   814 	for (s=lines[j];*s;s=g_utf8_next_char(s))
   815 	{
   816 	    if (g_utf8_get_char(s)>127)
   817 		results.binlen++;
   818 	    if (g_unichar_isalpha(g_utf8_get_char(s)))
   819 		results.alphalen++;
   820 	    if (s>lines[j])
   821 	    {
   822 		if (CHAR_IS_DQUOTE(g_utf8_get_char(s)))
   823 		    qc=QUOTE_CLASS(g_utf8_get_char(s));
   824 		else
   825 		    qc=INVALID_QUOTE;
   826 		if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) &&
   827 		  g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
   828 		    results.endquote_count++;
   829 	    }
   830 	}
   831 	if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
   832 	  lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
   833 	    results.shortline++;
   834 	if (lbytes>0 &&
   835 	  g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
   836 	    cnt_spacend++;
   837 	if (strstr(lines[j],".,"))
   838 	    results.dotcomma++;
   839 	/* only count ast lines for ignoring purposes where there is */
   840 	/* locase text on the line */
   841 	if (strchr(lines[j],'*'))
   842 	{
   843 	    for (s=lines[j];*s;s=g_utf8_next_char(s))
   844 		if (g_unichar_islower(g_utf8_get_char(s)))
   845 		    break;
   846 	    if (*s)
   847 		results.astline++;
   848 	}
   849 	if (strchr(lines[j],'/'))
   850 	    results.fslashline++;
   851 	if (lbytes>0)
   852 	{
   853 	    for (s=g_utf8_prev_char(lines[j]+lbytes);
   854 	      s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;
   855 	      s=g_utf8_prev_char(s))
   856 		;
   857 	    if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
   858 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')
   859 		results.hyphens++;
   860 	}
   861 	if (llen>LONGEST_PG_LINE)
   862 	    results.longline++;
   863 	if (llen>WAY_TOO_LONG)
   864 	    results.verylongline++;
   865 	if (strchr(lines[j],'<') && strchr(lines[j],'>'))
   866 	{
   867 	    i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
   868 	    if (i>0)
   869 		results.htmcount++;
   870 	    if (strstr(lines[j],"<i>"))
   871 		results.htmcount+=4; /* bonus marks! */
   872 	}
   873 	/* Check for spaced em-dashes */
   874 	memset(&tmp_dash_results,0,sizeof(tmp_dash_results));
   875 	count_dashes(lines[j],"--",&tmp_dash_results);
   876 	count_dashes(lines[j],"—",&tmp_dash_results);
   877 	if (tmp_dash_results.base)
   878 	    results.emdash.base++;
   879 	if (tmp_dash_results.non_PG_space)
   880 	    results.emdash.non_PG_space++;
   881 	if (tmp_dash_results.PG_space)
   882 	    results.emdash.PG_space++;
   883 	for (s=lines[j];*s;)
   884 	{
   885 	    inword=getaword(&s);
   886 	    if (!strcmp(inword,"hij") || !strcmp(inword,"niet")) 
   887 		results.Dutchcount++;
   888 	    if (!strcmp(inword,"dans") || !strcmp(inword,"avec")) 
   889 		results.Frenchcount++;
   890 	    if (!strcmp(inword,"0") || !strcmp(inword,"1")) 
   891 		results.standalone_digit++;
   892 	    g_free(inword);
   893 	}
   894 	/* Check for spaced dashes */
   895 	if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
   896 	    results.spacedash++;
   897 	lastblen=lastlen;
   898 	lastlen=llen;
   899 	laststart=lines[j][0];
   900     }
   901     g_strfreev(lines);
   902     return &results;
   903 }
   904 
   905 /*
   906  * report_first_pass:
   907  *
   908  * Make some snap decisions based on the first pass results.
   909  */
   910 struct warnings *report_first_pass(struct first_pass_results *results)
   911 {
   912     static struct warnings warnings={0};
   913     warnings.newlines=results->newlines;
   914     if (warnings.newlines==UNIX_NEWLINES)
   915 	g_print("   --> No lines in this file have a CR. Not reporting them. "
   916 	  "Project Gutenberg requires that all lineends be CR-LF.\n");
   917     else if (warnings.newlines==OS9_NEWLINES)
   918 	g_print("   --> No lines in this file have a LF. Not reporting them. "
   919 	  "Project Gutenberg requires that all lineends be CR-LF.\n");
   920     if (cnt_spacend>0)
   921 	g_print("   --> %ld lines in this file have white space at end\n",
   922 	  cnt_spacend);
   923     warnings.dotcomma=1;
   924     if (results->dotcomma>5)
   925     {
   926 	warnings.dotcomma=0;
   927 	g_print("   --> %ld lines in this file contain '.,'. "
   928 	  "Not reporting them.\n",results->dotcomma);
   929     }
   930     /*
   931      * If more than 50 lines, or one-tenth, are short,
   932      * don't bother reporting them.
   933      */
   934     warnings.shortline=1;
   935     if (results->shortline>50 || results->shortline*10>linecnt)
   936     {
   937 	warnings.shortline=0;
   938 	g_print("   --> %ld lines in this file are short. "
   939 	  "Not reporting short lines.\n",results->shortline);
   940     }
   941     /*
   942      * If more than 50 lines, or one-tenth, are long,
   943      * don't bother reporting them.
   944      */
   945     warnings.longline=1;
   946     if (results->longline>50 || results->longline*10>linecnt)
   947     {
   948 	warnings.longline=0;
   949 	g_print("   --> %ld lines in this file are long. "
   950 	  "Not reporting long lines.\n",results->longline);
   951     }
   952     /* If more than 10 lines contain asterisks, don't bother reporting them. */
   953     warnings.ast=1;
   954     if (results->astline>10)
   955     {
   956 	warnings.ast=0;
   957 	g_print("   --> %ld lines in this file contain asterisks. "
   958 	  "Not reporting them.\n",results->astline);
   959     }
   960     /*
   961      * If more than 10 lines contain forward slashes,
   962      * don't bother reporting them.
   963      */
   964     warnings.fslash=1;
   965     if (results->fslashline>10)
   966     {
   967 	warnings.fslash=0;
   968 	g_print("   --> %ld lines in this file contain forward slashes. "
   969 	  "Not reporting them.\n",results->fslashline);
   970     }
   971     /*
   972      * If more than 20 lines contain unpunctuated endquotes,
   973      * don't bother reporting them.
   974      */
   975     warnings.endquote=1;
   976     if (results->endquote_count>20)
   977     {
   978 	warnings.endquote=0;
   979 	g_print("   --> %ld lines in this file contain unpunctuated endquotes. "
   980 	  "Not reporting them.\n",results->endquote_count);
   981     }
   982     /*
   983      * If more than 15 lines contain standalone digits,
   984      * don't bother reporting them.
   985      */
   986     warnings.digit=1;
   987     if (results->standalone_digit>10)
   988     {
   989 	warnings.digit=0;
   990 	g_print("   --> %ld lines in this file contain standalone 0s and 1s. "
   991 	  "Not reporting them.\n",results->standalone_digit);
   992     }
   993     /*
   994      * If more than 20 lines contain hyphens at end,
   995      * don't bother reporting them.
   996      */
   997     warnings.hyphen=1;
   998     if (results->hyphens>20)
   999     {
  1000 	warnings.hyphen=0;
  1001 	g_print("   --> %ld lines in this file have hyphens at end. "
  1002 	  "Not reporting them.\n",results->hyphens);
  1003     }
  1004     if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
  1005     {
  1006 	g_print("   --> Looks like this is HTML. Switching HTML mode ON.\n");
  1007 	pswit[MARKUP_SWITCH]=1;
  1008     }
  1009     if (results->verylongline>0)
  1010 	g_print("   --> %ld lines in this file are VERY long!\n",
  1011 	  results->verylongline);
  1012     /*
  1013      * If there are more non-PG spaced dashes than PG em-dashes,
  1014      * assume it's deliberate.
  1015      * Current PG guidelines say don't use them, but older texts do,
  1016      * and some people insist on them whatever the guidelines say.
  1017      */
  1018     warnings.dash=1;
  1019     if (results->spacedash+results->emdash.non_PG_space>
  1020       results->emdash.PG_space)
  1021     {
  1022 	warnings.dash=0;
  1023 	g_print("   --> There are %ld spaced dashes and em-dashes. "
  1024 	  "Not reporting them.\n",
  1025 	  results->spacedash+results->emdash.non_PG_space);
  1026     }
  1027     /* If more than a quarter of characters are hi-bit, bug out. */
  1028     warnings.bin=1;
  1029     if (results->binlen*4>results->totlen)
  1030     {
  1031 	g_print("   --> This file does not appear to be ASCII. "
  1032 	  "Terminating. Best of luck with it!\n");
  1033 	exit(1);
  1034     }
  1035     if (results->alphalen*4<results->totlen)
  1036     {
  1037 	g_print("   --> This file does not appear to be text. "
  1038 	  "Terminating. Best of luck with it!\n");
  1039 	exit(1);
  1040     }
  1041     if (results->binlen*100>results->totlen || results->binlen>100)
  1042     {
  1043 	g_print("   --> There are a lot of foreign letters here. "
  1044 	  "Not reporting them.\n");
  1045 	warnings.bin=0;
  1046     }
  1047     warnings.isDutch=FALSE;
  1048     if (results->Dutchcount>50)
  1049     {
  1050 	warnings.isDutch=TRUE;
  1051 	g_print("   --> This looks like Dutch - "
  1052 	  "switching off dashes and warnings for 's Middags case.\n");
  1053     }
  1054     warnings.isFrench=FALSE;
  1055     if (results->Frenchcount>50)
  1056     {
  1057 	warnings.isFrench=TRUE;
  1058 	g_print("   --> This looks like French - "
  1059 	  "switching off some doublepunct.\n");
  1060     }
  1061     if (results->firstline && results->footerline)
  1062 	g_print("    The PG header and footer appear to be already on.\n");
  1063     else
  1064     {
  1065 	if (results->firstline)
  1066 	    g_print("    The PG header is on - no footer.\n");
  1067 	if (results->footerline)
  1068 	    g_print("    The PG footer is on - no header.\n");
  1069     }
  1070     g_print("\n");
  1071     if (pswit[VERBOSE_SWITCH])
  1072     {
  1073 	warnings.bin=1;
  1074 	warnings.shortline=1;
  1075 	warnings.dotcomma=1;
  1076 	warnings.longline=1;
  1077 	warnings.dash=1;
  1078 	warnings.digit=1;
  1079 	warnings.ast=1;
  1080 	warnings.fslash=1;
  1081 	warnings.hyphen=1;
  1082 	warnings.endquote=1;
  1083 	g_print("   *** Verbose output is ON -- you asked for it! ***\n");
  1084     }
  1085     if (warnings.isDutch)
  1086 	warnings.dash=0;
  1087     if (results->footerline>0 && results->firstline>0 &&
  1088       results->footerline>results->firstline &&
  1089       results->footerline-results->firstline<100)
  1090     {
  1091 	g_print("   --> I don't really know where this text starts. \n");
  1092 	g_print("       There are no reference points.\n");
  1093 	g_print("       I'm going to have to report the header and footer "
  1094 	  "as well.\n");
  1095 	results->firstline=0;
  1096     }
  1097     return &warnings;
  1098 }
  1099 
  1100 /*
  1101  * analyse_quotes:
  1102  *
  1103  * Look along the line, accumulate the count of quotes, and see
  1104  * if this is an empty line - i.e. a line with nothing on it
  1105  * but spaces.
  1106  * If line has just spaces, period, * and/or - on it, don't
  1107  * count it, since empty lines with asterisks or dashes to
  1108  * separate sections are common.
  1109  *
  1110  * Returns: TRUE if the line is empty.
  1111  */
  1112 gboolean analyse_quotes(const char *aline,struct counters *counters)
  1113 {
  1114     int guessquote=0;
  1115     /* assume the line is empty until proven otherwise */
  1116     gboolean isemptyline=TRUE;
  1117     const char *s=aline,*sprev,*snext;
  1118     gunichar c;
  1119     sprev=NULL;
  1120     GError *tmp_err=NULL;
  1121     while (*s)
  1122     {
  1123 	snext=g_utf8_next_char(s);
  1124 	c=g_utf8_get_char(s);
  1125 	if (CHAR_IS_DQUOTE(c))
  1126 	    (void)count_quote(counters,c,QUOTE_CLASS(c),&tmp_err);
  1127 	else if (CHAR_IS_SQUOTE(c) && pswit[SQUOTE_SWITCH])
  1128 	{
  1129 	    if (s==aline)
  1130 	    {
  1131 		/*
  1132 		 * At start of line, it can only be a quotation mark.
  1133 		 * Hardcode a very common exception!
  1134 		 */
  1135 		if (!g_str_has_prefix(snext,"tis") &&
  1136 		  !g_str_has_prefix(snext,"Tis"))
  1137 		    (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
  1138 	    }
  1139 	    else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
  1140 	      g_unichar_isalpha(g_utf8_get_char(snext)))
  1141 		/* Do nothing! it's definitely an apostrophe, not a quote */
  1142 		;
  1143 	    /* it's outside a word - let's check it out */
  1144 	    else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||
  1145 	      g_unichar_isalpha(g_utf8_get_char(snext)))
  1146 	    {
  1147 		/* certainly looks like a quotation mark */
  1148 		if (!g_str_has_prefix(snext,"tis") &&
  1149 		  !g_str_has_prefix(snext,"Tis"))
  1150 		    /* hardcode a very common exception! */
  1151 		{
  1152 		    if (strchr(".?!,;:",g_utf8_get_char(sprev)))
  1153 			(void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
  1154 		    else
  1155 			(void)count_quote(counters,c,OPENING_QUOTE,&tmp_err);
  1156 		}
  1157 	    }
  1158 	    else
  1159 	    {
  1160 		/* now - is it a quotation mark? */
  1161 		guessquote=0;   /* accumulate clues */
  1162 		if (g_unichar_isalpha(g_utf8_get_char(sprev)))
  1163 		{
  1164 		    /* it follows a letter - could be either */
  1165 		    guessquote++;
  1166 		    if (g_utf8_get_char(sprev)=='s')
  1167 		    {
  1168 			/* looks like a plural apostrophe */
  1169 			guessquote-=3;
  1170 			if (g_utf8_get_char(snext)==CHAR_SPACE)
  1171 			    /* bonus marks! */
  1172 			    guessquote-=2;
  1173 		    }
  1174 		    if (innermost_quote_matches(counters,c))
  1175 			/*
  1176 			 * Give it the benefit of some doubt,
  1177 			 * if a squote is already open.
  1178 			 */
  1179 			guessquote++;
  1180 		    else
  1181 			guessquote--;
  1182 		    if (guessquote>=0)
  1183 			(void)count_quote(counters,c,CLOSING_QUOTE,&tmp_err);
  1184 		}
  1185 		else
  1186 		    /* no adjacent letter - it must be a quote of some kind */
  1187 		    (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
  1188 	    }
  1189 	}
  1190 	if (tmp_err)
  1191 	{
  1192 	    if (pswit[ECHO_SWITCH])
  1193 		g_print("\n%s\n",aline);
  1194 	    if (!pswit[OVERVIEW_SWITCH])
  1195 		g_print("    Line %ld column %ld - %s\n",
  1196 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1,tmp_err->message);
  1197 	    g_clear_error(&tmp_err);
  1198 	}
  1199 	if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
  1200 	  c!='\r' && c!='\n')
  1201 	    isemptyline=FALSE;  /* ignore lines like  *  *  *  as spacers */
  1202 	if (c==CHAR_UNDERSCORE)
  1203 	    counters->c_unders++;
  1204 	if (c==CHAR_OPEN_SBRACK)
  1205 	{
  1206 	    if (!matching_difference(counters,COUNTER_ILLUSTRATION) &&
  1207 	      !matching_difference(counters,c) && s==aline &&
  1208 	      g_str_has_prefix(s,"[Illustration:"))
  1209 		increment_matching(counters,COUNTER_ILLUSTRATION,TRUE);
  1210 	    else
  1211 		increment_matching(counters,c,TRUE);
  1212 	}
  1213 	else if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK)
  1214 	    increment_matching(counters,c,TRUE);
  1215 	if (c==CHAR_CLOSE_SBRACK)
  1216 	{
  1217 	    if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) &&
  1218 	      !matching_difference(counters,c) && !*snext)
  1219 		increment_matching(counters,COUNTER_ILLUSTRATION,FALSE);
  1220 	    else
  1221 		increment_matching(counters,c,FALSE);
  1222 	}
  1223 	else if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK)
  1224 	    increment_matching(counters,c,FALSE);
  1225 	sprev=s;
  1226 	s=snext;
  1227     }
  1228     return isemptyline;
  1229 }
  1230 
  1231 /*
  1232  * check_for_control_characters:
  1233  *
  1234  * Check for invalid or questionable characters in the line
  1235  * Anything above 127 is invalid for plain ASCII, and
  1236  * non-printable control characters should also be flagged.
  1237  * Tabs should generally not be there.
  1238  */
  1239 void check_for_control_characters(const char *aline)
  1240 {
  1241     gunichar c;
  1242     const char *s;
  1243     for (s=aline;*s;s=g_utf8_next_char(s))
  1244     {
  1245 	c=g_utf8_get_char(s);
  1246 	if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
  1247 	{
  1248 	    if (pswit[ECHO_SWITCH])
  1249 		g_print("\n%s\n",aline);
  1250 	    if (!pswit[OVERVIEW_SWITCH])
  1251 		g_print("    Line %ld column %ld - Control character %u\n",
  1252 		  linecnt,g_utf8_pointer_to_offset(s,aline)+1,c);
  1253 	    else
  1254 		cnt_bin++;
  1255 	}
  1256     }
  1257 }
  1258 
  1259 /*
  1260  * check_for_odd_characters:
  1261  *
  1262  * Check for binary and other odd characters.
  1263  */
  1264 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
  1265   gboolean isemptyline)
  1266 {
  1267     /* Don't repeat multiple warnings on one line. */
  1268     gboolean eNon_A=FALSE,eTab=FALSE,eTilde=FALSE;
  1269     gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
  1270     const char *s;
  1271     gunichar c;
  1272     for (s=aline;*s;s=g_utf8_next_char(s))
  1273     {
  1274 	c=g_utf8_get_char(s);
  1275 	if (!eNon_A && (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
  1276 	{
  1277 	    if (pswit[ECHO_SWITCH])
  1278 		g_print("\n%s\n",aline);
  1279 	    if (!pswit[OVERVIEW_SWITCH])
  1280 		if (c>127 && c<160 || c>255)
  1281 		    g_print("    Line %ld column %ld - "
  1282 		      "Non-ISO-8859 character %u\n",
  1283 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1284 		else
  1285 		    g_print("    Line %ld column %ld - "
  1286 		      "Non-ASCII character %u\n",
  1287 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1288 	    else
  1289 		cnt_bin++;
  1290 	    eNon_A=TRUE;
  1291 	}
  1292 	if (!eTab && c==CHAR_TAB)
  1293 	{
  1294 	    if (pswit[ECHO_SWITCH])
  1295 		g_print("\n%s\n",aline);
  1296 	    if (!pswit[OVERVIEW_SWITCH])
  1297 		g_print("    Line %ld column %ld - Tab character?\n",
  1298 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1299 	    else
  1300 		cnt_odd++;
  1301 	    eTab=TRUE;
  1302 	}
  1303 	if (!eTilde && c==CHAR_TILDE)
  1304 	{
  1305 	    /*
  1306 	     * Often used by OCR software to indicate an
  1307 	     * unrecognizable character.
  1308 	     */
  1309 	    if (pswit[ECHO_SWITCH])
  1310 		g_print("\n%s\n",aline);
  1311 	    if (!pswit[OVERVIEW_SWITCH])
  1312 		g_print("    Line %ld column %ld - Tilde character?\n",
  1313 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1314 	    else
  1315 		cnt_odd++;
  1316 	    eTilde=TRUE;
  1317 	}
  1318 	if (!eCarat && c==CHAR_CARAT)
  1319 	{  
  1320 	    if (pswit[ECHO_SWITCH])
  1321 		g_print("\n%s\n",aline);
  1322 	    if (!pswit[OVERVIEW_SWITCH])
  1323 		g_print("    Line %ld column %ld - Carat character?\n",
  1324 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1325 	    else
  1326 		cnt_odd++;
  1327 	    eCarat=TRUE;
  1328 	}
  1329 	if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
  1330 	{  
  1331 	    if (pswit[ECHO_SWITCH])
  1332 		g_print("\n%s\n",aline);
  1333 	    if (!pswit[OVERVIEW_SWITCH])
  1334 		g_print("    Line %ld column %ld - Forward slash?\n",
  1335 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1336 	    else
  1337 		cnt_odd++;
  1338 	    eFSlash=TRUE;
  1339 	}
  1340 	/*
  1341 	 * Report asterisks only in paranoid mode,
  1342 	 * since they're often deliberate.
  1343 	 */
  1344 	if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
  1345 	  c==CHAR_ASTERISK)
  1346 	{
  1347 	    if (pswit[ECHO_SWITCH])
  1348 		g_print("\n%s\n",aline);
  1349 	    if (!pswit[OVERVIEW_SWITCH])
  1350 		g_print("    Line %ld column %ld - Asterisk?\n",
  1351 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1352 	    else
  1353 		cnt_odd++;
  1354 	    eAst=TRUE;
  1355 	}
  1356     }
  1357 }
  1358 
  1359 /*
  1360  * check_for_long_line:
  1361  *
  1362  * Check for line too long.
  1363  */
  1364 void check_for_long_line(const char *aline)
  1365 {
  1366     if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
  1367     {
  1368 	if (pswit[ECHO_SWITCH])
  1369 	    g_print("\n%s\n",aline);
  1370 	if (!pswit[OVERVIEW_SWITCH])
  1371 	    g_print("    Line %ld column %ld - Long line %ld\n",
  1372 	      linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
  1373 	else
  1374 	    cnt_long++;
  1375     }
  1376 }
  1377 
  1378 /*
  1379  * check_for_short_line:
  1380  *
  1381  * Check for line too short.
  1382  *
  1383  * This one is a bit trickier to implement: we don't want to
  1384  * flag the last line of a paragraph for being short, so we
  1385  * have to wait until we know that our current line is a
  1386  * "normal" line, then report the _previous_ line if it was too
  1387  * short. We also don't want to report indented lines like
  1388  * chapter heads or formatted quotations. We therefore keep
  1389  * last->len as the length of the last line examined, and
  1390  * last->blen as the length of the last but one, and try to
  1391  * suppress unnecessary warnings by checking that both were of
  1392  * "normal" length. We keep the first character of the last
  1393  * line in last->start, and if it was a space, we assume that
  1394  * the formatting is deliberate. I can't figure out a way to
  1395  * distinguish something like a quoted verse left-aligned or
  1396  * the header or footer of a letter from a paragraph of short
  1397  * lines - maybe if I examined the whole paragraph, and if the
  1398  * para has less than, say, 8 lines and if all lines are short,
  1399  * then just assume it's OK? Need to look at some texts to see
  1400  * how often a formula like this would get the right result.
  1401  */
  1402 void check_for_short_line(const char *aline,const struct line_properties *last)
  1403 {
  1404     if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
  1405       last->len<SHORTEST_PG_LINE && last->blen>1 &&
  1406       last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
  1407     {
  1408 	if (pswit[ECHO_SWITCH])
  1409 	    g_print("\n%s\n",prevline);
  1410 	if (!pswit[OVERVIEW_SWITCH])
  1411 	    g_print("    Line %ld column %ld - Short line %ld?\n",
  1412 	      linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
  1413 	else
  1414 	    cnt_short++;
  1415     }
  1416 }
  1417 
  1418 /*
  1419  * check_for_starting_punctuation:
  1420  *
  1421  * Look for punctuation other than full ellipses at start of line.
  1422  */
  1423 void check_for_starting_punctuation(const char *aline)
  1424 {
  1425     if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
  1426       !g_str_has_prefix(aline,". . ."))
  1427     {
  1428 	if (pswit[ECHO_SWITCH])
  1429 	    g_print("\n%s\n",aline);
  1430 	if (!pswit[OVERVIEW_SWITCH])
  1431 	    g_print("    Line %ld column 1 - Begins with punctuation?\n",
  1432 	      linecnt);
  1433 	else
  1434 	    cnt_punct++;
  1435     }
  1436 }
  1437 
  1438 /*
  1439  * str_emdash:
  1440  *
  1441  * Find the first em-dash, return a pointer to it and set <next> to the
  1442  * character following the dash.
  1443  */
  1444 char *str_emdash(const char *s,const char **next)
  1445 {
  1446     const char *s1,*s2;
  1447     s1=strstr(s,"--");
  1448     s2=strstr(s,"—");
  1449     if (!s1)
  1450     {
  1451 	if (s2)
  1452 	    *next=g_utf8_next_char(s2);
  1453 	return (char *)s2;
  1454     }
  1455     else if (!s2)
  1456     {
  1457 	*next=g_utf8_next_char(g_utf8_next_char(s1));
  1458 	return (char *)s1;
  1459     }
  1460     else if (s1<s2)
  1461     {
  1462 	*next=g_utf8_next_char(g_utf8_next_char(s1));
  1463 	return (char *)s1;
  1464     }
  1465     else
  1466     {
  1467 	*next=g_utf8_next_char(s2);
  1468 	return (char *)s2;
  1469     }
  1470 }
  1471 
  1472 /*
  1473  * check_for_spaced_emdash:
  1474  *
  1475  * Check for spaced em-dashes.
  1476  *
  1477  * We must check _all_ occurrences of em-dashes on the line
  1478  * hence the loop - even if the first dash is OK
  1479  * there may be another that's wrong later on.
  1480  */
  1481 void check_for_spaced_emdash(const char *aline)
  1482 {
  1483     const char *s,*t,*next;
  1484     for (s=aline;t=str_emdash(s,&next);s=next)
  1485     {
  1486 	if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
  1487 	  g_utf8_get_char(next)==CHAR_SPACE)
  1488 	{
  1489 	    if (pswit[ECHO_SWITCH])
  1490 		g_print("\n%s\n",aline);
  1491 	    if (!pswit[OVERVIEW_SWITCH])
  1492 		g_print("    Line %ld column %ld - Spaced em-dash?\n",
  1493 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  1494 	    else
  1495 		cnt_dash++;
  1496 	}
  1497     }
  1498 }
  1499 
  1500 /*
  1501  * check_for_spaced_dash:
  1502  *
  1503  * Check for spaced dashes.
  1504  */
  1505 void check_for_spaced_dash(const char *aline)
  1506 {
  1507     const char *s;
  1508     if ((s=strstr(aline," -")))
  1509     {
  1510 	if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
  1511 	{
  1512 	    if (pswit[ECHO_SWITCH])
  1513 		g_print("\n%s\n",aline);
  1514 	    if (!pswit[OVERVIEW_SWITCH])
  1515 		g_print("    Line %ld column %ld - Spaced dash?\n",
  1516 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1517 	    else
  1518 		cnt_dash++;
  1519 	}
  1520     }
  1521     else if ((s=strstr(aline,"- ")))
  1522     {
  1523 	if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
  1524 	{
  1525 	    if (pswit[ECHO_SWITCH])
  1526 		g_print("\n%s\n",aline);
  1527 	    if (!pswit[OVERVIEW_SWITCH])
  1528 		g_print("    Line %ld column %ld - Spaced dash?\n",
  1529 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1530 	    else
  1531 		cnt_dash++;
  1532 	}
  1533     }
  1534 }
  1535 
  1536 /*
  1537  * check_for_unmarked_paragraphs:
  1538  *
  1539  * Check for unmarked paragraphs indicated by separate speakers.
  1540  *
  1541  * May well be false positive:
  1542  * "Bravo!" "Wonderful!" called the crowd.
  1543  * but useful all the same.
  1544  */
  1545 void check_for_unmarked_paragraphs(const char *aline)
  1546 {
  1547     const char *s;
  1548     s=strstr(aline,"\"  \"");
  1549     if (!s)
  1550 	s=strstr(aline,"\" \"");
  1551     if (s)
  1552     {
  1553 	if (pswit[ECHO_SWITCH])
  1554 	    g_print("\n%s\n",aline);
  1555 	if (!pswit[OVERVIEW_SWITCH])
  1556 	    g_print("    Line %ld column %ld - "
  1557 	      "Query missing paragraph break?\n",
  1558 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1559 	else
  1560 	    cnt_punct++;
  1561     }
  1562 }
  1563 
  1564 /*
  1565  * check_for_jeebies:
  1566  *
  1567  * Check for "to he" and other easy h/b errors.
  1568  *
  1569  * This is a very inadequate effort on the h/b problem,
  1570  * but the phrase "to he" is always an error, whereas "to
  1571  * be" is quite common.
  1572  * Similarly, '"Quiet!", be said.' is a non-be error
  1573  * "to he" is _not_ always an error!:
  1574  *       "Where they went to he couldn't say."
  1575  * Another false positive:
  1576  *       What would "Cinderella" be without the . . .
  1577  * and another: "If he wants to he can see for himself."
  1578  */
  1579 void check_for_jeebies(const char *aline)
  1580 {
  1581     const char *s;
  1582     s=strstr(aline," be could ");
  1583     if (!s)
  1584 	s=strstr(aline," be would ");
  1585     if (!s)
  1586 	s=strstr(aline," was be ");
  1587     if (!s)
  1588 	s=strstr(aline," be is ");
  1589     if (!s)
  1590 	s=strstr(aline," is be ");
  1591     if (!s)
  1592 	s=strstr(aline,"\", be ");
  1593     if (!s)
  1594 	s=strstr(aline,"\" be ");
  1595     if (!s)
  1596 	s=strstr(aline,"\" be ");
  1597     if (!s)
  1598 	s=strstr(aline," to he ");
  1599     if (s)
  1600     {
  1601 	if (pswit[ECHO_SWITCH])
  1602 	    g_print("\n%s\n",aline);
  1603 	if (!pswit[OVERVIEW_SWITCH])
  1604 	    g_print("    Line %ld column %ld - Query he/be error?\n",
  1605 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1606 	else
  1607 	    cnt_word++;
  1608     }
  1609     s=strstr(aline," the had ");
  1610     if (!s)
  1611 	s=strstr(aline," a had ");
  1612     if (!s)
  1613 	s=strstr(aline," they bad ");
  1614     if (!s)
  1615 	s=strstr(aline," she bad ");
  1616     if (!s)
  1617 	s=strstr(aline," he bad ");
  1618     if (!s)
  1619 	s=strstr(aline," you bad ");
  1620     if (!s)
  1621 	s=strstr(aline," i bad ");
  1622     if (s)
  1623     {
  1624 	if (pswit[ECHO_SWITCH])
  1625 	    g_print("\n%s\n",aline);
  1626 	if (!pswit[OVERVIEW_SWITCH])
  1627 	    g_print("    Line %ld column %ld - Query had/bad error?\n",
  1628 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1629 	else
  1630 	    cnt_word++;
  1631     }
  1632     s=strstr(aline,"; hut ");
  1633     if (!s)
  1634 	s=strstr(aline,", hut ");
  1635     if (s)
  1636     {
  1637 	if (pswit[ECHO_SWITCH])
  1638 	    g_print("\n%s\n",aline);
  1639 	if (!pswit[OVERVIEW_SWITCH])
  1640 	    g_print("    Line %ld column %ld - Query hut/but error?\n",
  1641 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1642 	else
  1643 	    cnt_word++;
  1644     }
  1645 }
  1646 
  1647 /*
  1648  * check_for_mta_from:
  1649  *
  1650  * Special case - angled bracket in front of "From" placed there by an
  1651  * MTA when sending an e-mail.
  1652  */
  1653 void check_for_mta_from(const char *aline)
  1654 {
  1655     const char *s;
  1656     s=strstr(aline,">From");
  1657     if (s)
  1658     {
  1659 	if (pswit[ECHO_SWITCH])
  1660 	    g_print("\n%s\n",aline);
  1661 	if (!pswit[OVERVIEW_SWITCH])
  1662 	    g_print("    Line %ld column %ld - "
  1663 	      "Query angled bracket with From\n",
  1664 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1665 	else
  1666 	    cnt_punct++;
  1667     }
  1668 }
  1669 
  1670 /*
  1671  * check_for_orphan_character:
  1672  *
  1673  * Check for a single character line -
  1674  * often an overflow from bad wrapping.
  1675  */
  1676 void check_for_orphan_character(const char *aline)
  1677 {
  1678     gunichar c;
  1679     c=g_utf8_get_char(aline);
  1680     if (c && !*g_utf8_next_char(aline))
  1681     {
  1682 	if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
  1683 	    ; /* Nothing - ignore numerals alone on a line. */
  1684 	else
  1685 	{
  1686 	    if (pswit[ECHO_SWITCH])
  1687 		g_print("\n%s\n",aline);
  1688 	    if (!pswit[OVERVIEW_SWITCH])
  1689 		g_print("    Line %ld column 1 - Query single character line\n",
  1690 		  linecnt);
  1691 	    else
  1692 		cnt_punct++;
  1693 	}
  1694     }
  1695 }
  1696 
  1697 /*
  1698  * check_for_pling_scanno:
  1699  *
  1700  * Check for I" - often should be !
  1701  */
  1702 void check_for_pling_scanno(const char *aline)
  1703 {
  1704     const char *s;
  1705     s=strstr(aline," I\"");
  1706     if (s)
  1707     {
  1708 	if (pswit[ECHO_SWITCH])
  1709 	    g_print("\n%s\n",aline);
  1710 	if (!pswit[OVERVIEW_SWITCH])
  1711 	    g_print("    Line %ld column %ld - Query I=exclamation mark?\n",
  1712 	      linecnt,g_utf8_pointer_to_offset(aline,s));
  1713 	else
  1714 	    cnt_punct++;
  1715     }
  1716 }
  1717 
  1718 /*
  1719  * check_for_extra_period:
  1720  *
  1721  * Check for period without a capital letter. Cut-down from gutspell.
  1722  * Only works when it happens on a single line.
  1723  */
  1724 void check_for_extra_period(const char *aline,const struct warnings *warnings)
  1725 {
  1726     const char *s,*t,*s1,*sprev;
  1727     int i;
  1728     gsize len;
  1729     gboolean istypo;
  1730     gchar *testword;
  1731     gunichar c,nc,pc,*decomposition;
  1732     if (pswit[PARANOID_SWITCH])
  1733     {
  1734 	for (t=aline;t=strstr(t,". ");)
  1735 	{
  1736 	    if (t==aline)
  1737 	    {
  1738 		t=g_utf8_next_char(t);
  1739 		/* start of line punctuation is handled elsewhere */
  1740 		continue;
  1741 	    }
  1742 	    if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
  1743 	    {
  1744 		t=g_utf8_next_char(t);
  1745 		continue;
  1746 	    }
  1747 	    if (warnings->isDutch)
  1748 	    {
  1749 		/* For Frank & Jeroen -- 's Middags case */
  1750 		gunichar c2,c3,c4,c5;
  1751 		c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
  1752 		c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
  1753 		c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
  1754 		c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
  1755 		if (CHAR_IS_APOSTROPHE(c2) &&
  1756 		  g_unichar_islower(c3) && c4==CHAR_SPACE &&
  1757 		  g_unichar_isupper(c5))
  1758 		{
  1759 		    t=g_utf8_next_char(t);
  1760 		    continue;
  1761 		}
  1762 	    }
  1763 	    s1=g_utf8_next_char(g_utf8_next_char(t));
  1764 	    while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
  1765 	      !g_unichar_isdigit(g_utf8_get_char(s1)))
  1766 		s1=g_utf8_next_char(s1);
  1767 	    if (g_unichar_islower(g_utf8_get_char(s1)))
  1768 	    {
  1769 		/* we have something to investigate */
  1770 		istypo=TRUE;
  1771 		/* so let's go back and find out */
  1772 		nc=g_utf8_get_char(t);
  1773 		s1=g_utf8_prev_char(t);
  1774 		c=g_utf8_get_char(s1);
  1775 		sprev=g_utf8_prev_char(s1);
  1776 		pc=g_utf8_get_char(sprev);
  1777 		while (s1>=aline &&
  1778 		  (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||
  1779 		  g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&
  1780 		  g_unichar_isalpha(nc)))
  1781 		{
  1782 		    nc=c;
  1783 		    s1=sprev;
  1784 		    c=pc;
  1785 		    sprev=g_utf8_prev_char(s1);
  1786 		    pc=g_utf8_get_char(sprev);
  1787 		}
  1788 		s1=g_utf8_next_char(s1);
  1789 		s=strchr(s1,'.');
  1790 		if (s)
  1791 		    testword=g_strndup(s1,s-s1);
  1792 		else
  1793 		    testword=g_strdup(s1);
  1794 		for (i=0;*abbrev[i];i++)
  1795 		    if (!strcmp(testword,abbrev[i]))
  1796 			istypo=FALSE;
  1797 		if (g_unichar_isdigit(g_utf8_get_char(testword)))
  1798 		    istypo=FALSE;
  1799 		if (!*g_utf8_next_char(testword))
  1800 		    istypo=FALSE;
  1801 		if (isroman(testword))
  1802 		    istypo=FALSE;
  1803 		if (istypo)
  1804 		{
  1805 		    istypo=FALSE;
  1806 		    for (s=testword;*s;s=g_utf8_next_char(s))
  1807 		    {
  1808 			decomposition=g_unicode_canonical_decomposition(
  1809 			  g_utf8_get_char(s),&len);
  1810 			if (g_utf8_strchr("aeiou",-1,decomposition[0]))
  1811 			    istypo=TRUE;
  1812 			g_free(decomposition);
  1813 		    }
  1814 		}
  1815 		if (istypo &&
  1816 		  (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
  1817 		{
  1818 		    g_tree_insert(qperiod,g_strdup(testword),
  1819 		      GINT_TO_POINTER(1));
  1820 		    if (pswit[ECHO_SWITCH])
  1821 			g_print("\n%s\n",aline);
  1822 		    if (!pswit[OVERVIEW_SWITCH])
  1823 			g_print("    Line %ld column %ld - Extra period?\n",
  1824 			  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  1825 		    else
  1826 			cnt_punct++;
  1827 		}
  1828 		g_free(testword);
  1829 	    }
  1830 	    t=g_utf8_next_char(t);
  1831 	}
  1832     }
  1833 }
  1834 
  1835 /*
  1836  * check_for_following_punctuation:
  1837  *
  1838  * Check for words usually not followed by punctuation.
  1839  */
  1840 void check_for_following_punctuation(const char *aline)
  1841 {
  1842     int i;
  1843     const char *s,*wordstart;
  1844     gunichar c;
  1845     gchar *inword,*t;
  1846     if (pswit[TYPO_SWITCH])
  1847     {
  1848 	for (s=aline;*s;)
  1849 	{
  1850 	    wordstart=s;
  1851 	    t=getaword(&s);
  1852 	    if (!*t)
  1853 	    {
  1854 		g_free(t);
  1855 		continue;
  1856 	    }
  1857 	    inword=g_utf8_strdown(t,-1);
  1858 	    g_free(t);
  1859 	    for (i=0;*nocomma[i];i++)
  1860 		if (!strcmp(inword,nocomma[i]))
  1861 		{
  1862 		    c=g_utf8_get_char(s);
  1863 		    if (c==',' || c==';' || c==':')
  1864 		    {
  1865 			if (pswit[ECHO_SWITCH])
  1866 			    g_print("\n%s\n",aline);
  1867 			if (!pswit[OVERVIEW_SWITCH])
  1868 			    g_print("    Line %ld column %ld - "
  1869 			      "Query punctuation after %s?\n",
  1870 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,
  1871 			      inword);
  1872 			else
  1873 			    cnt_punct++;
  1874 		    }
  1875 		}
  1876 	    for (i=0;*noperiod[i];i++)
  1877 		if (!strcmp(inword,noperiod[i]))
  1878 		{
  1879 		    c=g_utf8_get_char(s);
  1880 		    if (c=='.' || c=='!')
  1881 		    {
  1882 			if (pswit[ECHO_SWITCH])
  1883 			    g_print("\n%s\n",aline);
  1884 			if (!pswit[OVERVIEW_SWITCH])
  1885 			    g_print("    Line %ld column %ld - "
  1886 			      "Query punctuation after %s?\n",
  1887 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,
  1888 			      inword);
  1889 			else
  1890 			    cnt_punct++;
  1891 		    }
  1892 		}
  1893 	    g_free(inword);
  1894 	}
  1895     }
  1896 }
  1897 
  1898 /*
  1899  * check_for_typos:
  1900  *
  1901  * Check for commonly mistyped words,
  1902  * and digits like 0 for O in a word.
  1903  */
  1904 void check_for_typos(const char *aline,struct warnings *warnings)
  1905 {
  1906     const char *s,*t,*nt,*wordstart;
  1907     gchar *inword;
  1908     gunichar *decomposition;
  1909     gchar *testword;
  1910     int i,vowel,consonant,*dupcnt;
  1911     gboolean isdup,istypo,alower;
  1912     gunichar c,pc;
  1913     long offset,len;
  1914     gsize decomposition_len;
  1915     for (s=aline;*s;)
  1916     {
  1917 	wordstart=s;
  1918 	inword=getaword(&s);
  1919 	if (!*inword)
  1920 	{
  1921 	    g_free(inword);
  1922 	    continue; /* don't bother with empty lines */
  1923 	}
  1924 	if (mixdigit(inword))
  1925 	{
  1926 	    if (pswit[ECHO_SWITCH])
  1927 		g_print("\n%s\n",aline);
  1928 	    if (!pswit[OVERVIEW_SWITCH])
  1929 		g_print("    Line %ld column %ld - Query digit in %s\n",
  1930 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
  1931 	    else
  1932 		cnt_word++;
  1933 	}
  1934 	/*
  1935 	 * Put the word through a series of tests for likely typos and OCR
  1936 	 * errors.
  1937 	 */
  1938 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
  1939 	{
  1940 	    istypo=FALSE;
  1941 	    alower=FALSE;
  1942 	    for (t=inword;*t;t=g_utf8_next_char(t))
  1943 	    {
  1944 		c=g_utf8_get_char(t);
  1945 		nt=g_utf8_next_char(t);
  1946 		/* lowercase for testing */
  1947 		if (g_unichar_islower(c))
  1948 		    alower=TRUE;
  1949 		if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
  1950 		{
  1951 		    /*
  1952 		     * We have an uppercase mid-word. However, there are
  1953 		     * common cases:
  1954 		     *   Mac and Mc like McGill
  1955 		     *   French contractions like l'Abbe
  1956 		     */
  1957 		    offset=g_utf8_pointer_to_offset(inword,t);
  1958 		    if (offset>0)
  1959 			pc=g_utf8_get_char(g_utf8_prev_char(t));
  1960 		    else
  1961 			pc='\0';
  1962 		    if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
  1963 		      offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
  1964 		      g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
  1965 		      CHAR_IS_APOSTROPHE(pc))
  1966 			; /* do nothing! */
  1967 		    else
  1968 			istypo=TRUE;
  1969 		}
  1970 	    }
  1971 	    testword=g_utf8_casefold(inword,-1);
  1972 	}
  1973 	if (pswit[TYPO_SWITCH])
  1974 	{
  1975 	    /*
  1976 	     * Check for certain unlikely two-letter combinations at word
  1977 	     * start and end.
  1978 	     */
  1979 	    len=g_utf8_strlen(testword,-1);
  1980 	    if (len>1)
  1981 	    {
  1982 		for (i=0;*nostart[i];i++)
  1983 		    if (g_str_has_prefix(testword,nostart[i]))
  1984 			istypo=TRUE;
  1985 		for (i=0;*noend[i];i++)
  1986 		    if (g_str_has_suffix(testword,noend[i]))
  1987 			istypo=TRUE;
  1988 	    }
  1989 	    /* ght is common, gbt never. Like that. */
  1990 	    if (strstr(testword,"cb"))
  1991 		istypo=TRUE;
  1992 	    if (strstr(testword,"gbt"))
  1993 		istypo=TRUE;
  1994 	    if (strstr(testword,"pbt"))
  1995 		istypo=TRUE;
  1996 	    if (strstr(testword,"tbs"))
  1997 		istypo=TRUE;
  1998 	    if (strstr(testword,"mrn"))
  1999 		istypo=TRUE;
  2000 	    if (strstr(testword,"ahle"))
  2001 		istypo=TRUE;
  2002 	    if (strstr(testword,"ihle"))
  2003 		istypo=TRUE;
  2004 	    /*
  2005 	     * "TBE" does happen - like HEARTBEAT - but uncommon.
  2006 	     * Also "TBI" - frostbite, outbid - but uncommon.
  2007 	     * Similarly "ii" like Hawaii, or Pompeii, and in Roman
  2008 	     * numerals, but "ii" is a common scanno.
  2009 	     */
  2010 	    if (strstr(testword,"tbi"))
  2011 		istypo=TRUE;
  2012 	    if (strstr(testword,"tbe"))
  2013 		istypo=TRUE;
  2014 	    if (strstr(testword,"ii"))
  2015 		istypo=TRUE;
  2016 	    /*
  2017 	     * Check for no vowels or no consonants.
  2018 	     * If none, flag a typo.
  2019 	     */
  2020 	    if (!istypo && len>1)
  2021 	    {
  2022 		vowel=consonant=0;
  2023 		for (t=testword;*t;t=g_utf8_next_char(t))
  2024 		{
  2025 		    c=g_utf8_get_char(t);
  2026 		    decomposition=
  2027 		      g_unicode_canonical_decomposition(c,&decomposition_len);
  2028 		    if (c=='y' || g_unichar_isdigit(c))
  2029 		    {
  2030 			/* Yah, this is loose. */
  2031 			vowel++;
  2032 			consonant++;
  2033 		    }
  2034 		    else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
  2035 			vowel++;
  2036 		    else
  2037 			consonant++;
  2038 		    g_free(decomposition);
  2039 		}
  2040 		if (!vowel || !consonant)
  2041 		    istypo=TRUE;
  2042 	    }
  2043 	    /*
  2044 	     * Now exclude the word from being reported if it's in
  2045 	     * the okword list.
  2046 	     */
  2047 	    for (i=0;*okword[i];i++)
  2048 		if (!strcmp(testword,okword[i]))
  2049 		    istypo=FALSE;
  2050 	    /*
  2051 	     * What looks like a typo may be a Roman numeral.
  2052 	     * Exclude these.
  2053 	     */
  2054 	    if (istypo && isroman(testword))
  2055 		istypo=FALSE;
  2056 	    /* Check the manual list of typos. */
  2057 	    if (!istypo)
  2058 		for (i=0;*typo[i];i++)
  2059 		    if (!strcmp(testword,typo[i]))
  2060 			istypo=TRUE;
  2061 	    /*
  2062 	     * Check lowercase s, l, i and m - special cases.
  2063 	     *   "j" - often a semi-colon gone wrong.
  2064 	     *   "d" for a missing apostrophe - he d
  2065 	     *   "n" for "in"
  2066 	     */
  2067 	    if (!istypo && len==1 &&
  2068 	      g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
  2069 		istypo=TRUE;
  2070 	    if (istypo)
  2071 	    {
  2072 		dupcnt=g_tree_lookup(qword,testword);
  2073 		if (dupcnt)
  2074 		{
  2075 		    (*dupcnt)++;
  2076 		    isdup=!pswit[VERBOSE_SWITCH];
  2077 		}
  2078 		else
  2079 		{
  2080 		    dupcnt=g_new0(int,1);
  2081 		    g_tree_insert(qword,g_strdup(testword),dupcnt);
  2082 		    isdup=FALSE;
  2083 		}
  2084 		if (!isdup)
  2085 		{
  2086 		    if (pswit[ECHO_SWITCH])
  2087 			g_print("\n%s\n",aline);
  2088 		    if (!pswit[OVERVIEW_SWITCH])
  2089 		    {
  2090 			g_print("    Line %ld column %ld - Query word %s",
  2091 			  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
  2092 			  inword);
  2093 			if (!pswit[VERBOSE_SWITCH])
  2094 			    g_print(" - not reporting duplicates");
  2095 			g_print("\n");
  2096 		    }
  2097 		    else
  2098 			cnt_word++;
  2099 		}
  2100 	    }
  2101 	}
  2102 	/* check the user's list of typos */
  2103 	if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
  2104 	{
  2105 	    if (pswit[ECHO_SWITCH])
  2106 		g_print("\n%s\n",aline);
  2107 	    if (!pswit[OVERVIEW_SWITCH])  
  2108 		g_print("    Line %ld column %ld - Query possible scanno %s\n",
  2109 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
  2110 	}
  2111 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
  2112 	    g_free(testword);
  2113 	if (pswit[PARANOID_SWITCH] && warnings->digit)
  2114 	{
  2115 	    /* In paranoid mode, query all 0 and 1 standing alone. */
  2116 	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))
  2117 	    {
  2118 		if (pswit[ECHO_SWITCH])
  2119 		    g_print("\n%s\n",aline);
  2120 		if (!pswit[OVERVIEW_SWITCH])
  2121 		    g_print("    Line %ld column %ld - Query standalone %s\n",
  2122 		      linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
  2123 		      inword);
  2124 		else
  2125 		    cnt_word++;
  2126 	    }
  2127 	}
  2128 	g_free(inword);
  2129     }
  2130 }
  2131 
  2132 /*
  2133  * check_for_misspaced_punctuation:
  2134  *
  2135  * Look for added or missing spaces around punctuation and quotes.
  2136  * If there is a punctuation character like ! with no space on
  2137  * either side, suspect a missing!space. If there are spaces on
  2138  * both sides , assume a typo. If we see a double quote with no
  2139  * space or punctuation on either side of it, assume unspaced
  2140  * quotes "like"this.
  2141  */
  2142 void check_for_misspaced_punctuation(const char *aline,
  2143   struct parities *parities,gboolean isemptyline)
  2144 {
  2145     gboolean isacro,isellipsis;
  2146     const char *s;
  2147     gunichar c,nc,pc,n2c;
  2148     int parity;
  2149     c=g_utf8_get_char(aline);
  2150     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2151     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2152     {
  2153 	pc=c;
  2154 	c=nc;
  2155 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2156 	/* For each character in the line after the first. */
  2157 	if (g_utf8_strchr(".?!,;:_",-1,c))  /* if it's punctuation */
  2158 	{
  2159 	    /* we need to suppress warnings for acronyms like M.D. */
  2160 	    isacro=FALSE;
  2161 	    /* we need to suppress warnings for ellipsis . . . */
  2162 	    isellipsis=FALSE;
  2163 	    /*
  2164 	     * If there are letters on both sides of it or
  2165 	     * if it's strict punctuation followed by an alpha.
  2166 	     */
  2167 	    if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
  2168 	      g_utf8_strchr("?!,;:",-1,c)))
  2169 	    {
  2170 		if (c=='.')
  2171 		{
  2172 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&
  2173 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
  2174 			isacro=TRUE;
  2175 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
  2176 		    if (nc && n2c=='.')
  2177 			isacro=TRUE;
  2178 		}
  2179 		if (!isacro)
  2180 		{
  2181 		    if (pswit[ECHO_SWITCH])
  2182 			g_print("\n%s\n",aline);
  2183 		    if (!pswit[OVERVIEW_SWITCH])
  2184 			g_print("    Line %ld column %ld - Missing space?\n",
  2185 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2186 		    else
  2187 			cnt_punct++;
  2188 		}
  2189 	    }
  2190 	    if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
  2191 	    {
  2192 		/*
  2193 		 * If there are spaces on both sides,
  2194 		 * or space before and end of line.
  2195 		 */
  2196 		if (c=='.')
  2197 		{
  2198 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&
  2199 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
  2200 			isellipsis=TRUE;
  2201 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
  2202 		    if (nc && n2c=='.')
  2203 			isellipsis=TRUE;
  2204 		}
  2205 		if (!isemptyline && !isellipsis)
  2206 		{
  2207 		    if (pswit[ECHO_SWITCH])
  2208 			g_print("\n%s\n",aline);
  2209 		    if (!pswit[OVERVIEW_SWITCH])
  2210 			g_print("    Line %ld column %ld - "
  2211 			  "Spaced punctuation?\n",linecnt,
  2212 			  g_utf8_pointer_to_offset(aline,s)+1);
  2213 		    else
  2214 			cnt_punct++;
  2215 		}
  2216 	    }
  2217 	}
  2218     }
  2219     /* Split out the characters that CANNOT be preceded by space. */
  2220     c=g_utf8_get_char(aline);
  2221     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2222     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2223     {
  2224 	pc=c;
  2225 	c=nc;
  2226 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2227 	/* for each character in the line after the first */
  2228 	if (g_utf8_strchr("?!,;:",-1,c))
  2229 	{
  2230 	    /* if it's punctuation that _cannot_ have a space before it */
  2231 	    if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
  2232 	    {
  2233 		/*
  2234 		 * If nc DOES == space,
  2235 		 * it was already reported just above.
  2236 		 */
  2237 		if (pswit[ECHO_SWITCH])
  2238 		    g_print("\n%s\n",aline);
  2239 		if (!pswit[OVERVIEW_SWITCH])
  2240 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",
  2241 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2242 		else
  2243 		    cnt_punct++;
  2244 	    }
  2245 	}
  2246     }
  2247     /*
  2248      * Special case " .X" where X is any alpha.
  2249      * This plugs a hole in the acronym code above.
  2250      * Inelegant, but maintainable.
  2251      */
  2252     c=g_utf8_get_char(aline);
  2253     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2254     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2255     {
  2256 	pc=c;
  2257 	c=nc;
  2258 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2259 	/* for each character in the line after the first */
  2260 	if (c=='.')
  2261 	{
  2262 	    /* if it's a period */
  2263 	    if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
  2264 	    {
  2265 		/*
  2266 		 * If the period follows a space and
  2267 		 * is followed by a letter.
  2268 		 */
  2269 		if (pswit[ECHO_SWITCH])
  2270 		    g_print("\n%s\n",aline);
  2271 		if (!pswit[OVERVIEW_SWITCH])
  2272 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",
  2273 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2274 		else
  2275 		    cnt_punct++;
  2276 	    }
  2277 	}
  2278     }
  2279     c=g_utf8_get_char(aline);
  2280     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2281     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2282     {
  2283 	pc=c;
  2284 	c=nc;
  2285 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2286 	/* for each character in the line after the first */
  2287 	if (CHAR_IS_DQUOTE(c))
  2288 	{
  2289 	    if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
  2290 	      !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
  2291 	      !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
  2292 	    {
  2293 		if (pswit[ECHO_SWITCH])
  2294 		    g_print("\n%s\n",aline);
  2295 		if (!pswit[OVERVIEW_SWITCH])
  2296 		    g_print("    Line %ld column %ld - Unspaced quotes?\n",
  2297 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2298 		else
  2299 		    cnt_punct++;
  2300 	    }
  2301 	}
  2302     }
  2303     /* Check parity of quotes. */
  2304     nc=g_utf8_get_char(aline);
  2305     for (s=aline;*s;s=g_utf8_next_char(s))
  2306     {
  2307 	c=nc;
  2308 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2309 	if (CHAR_IS_DQUOTE(c))
  2310 	{
  2311 	    if (c==CHAR_DQUOTE)
  2312 	    {
  2313 		parities->dquote=!parities->dquote;
  2314 		parity=parities->dquote;
  2315 	    }
  2316 	    else if (c==CHAR_LD_QUOTE)
  2317 		parity=1;
  2318 	    else
  2319 		parity=0;
  2320 	    if (!parity)
  2321 	    {
  2322 		/* parity even */
  2323 		if (!g_utf8_strchr("_-.'`‘’/,;:!?)]} ",-1,nc))
  2324 		{
  2325 		    if (pswit[ECHO_SWITCH])
  2326 			g_print("\n%s\n",aline);
  2327 		    if (!pswit[OVERVIEW_SWITCH])
  2328 			g_print("    Line %ld column %ld - "
  2329 			  "Wrongspaced quotes?\n",
  2330 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2331 		    else
  2332 			cnt_punct++;
  2333 		}
  2334 	    }
  2335 	    else
  2336 	    {
  2337 		/* parity odd */
  2338 		if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
  2339 		  !g_utf8_strchr("_-/.'`‘’([{$",-1,nc) || !nc)
  2340 		{
  2341 		    if (pswit[ECHO_SWITCH])
  2342 			g_print("\n%s\n",aline);
  2343 		    if (!pswit[OVERVIEW_SWITCH])
  2344 			g_print("    Line %ld column %ld - "
  2345 			  "Wrongspaced quotes?\n",
  2346 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2347 		    else
  2348 			cnt_punct++;
  2349 		}
  2350 	    }
  2351 	}
  2352     }
  2353     c=g_utf8_get_char(aline);
  2354     if (CHAR_IS_DQUOTE(c))
  2355     {
  2356 	if (g_utf8_strchr(",;:!?)]} ",-1,
  2357 	  g_utf8_get_char(g_utf8_next_char(aline))))
  2358 	{
  2359 	    if (pswit[ECHO_SWITCH])
  2360 		g_print("\n%s\n",aline);
  2361 	    if (!pswit[OVERVIEW_SWITCH])
  2362 		g_print("    Line %ld column 1 - Wrongspaced quotes?\n",
  2363 		  linecnt);
  2364 	    else
  2365 		cnt_punct++;
  2366 	}
  2367     }
  2368     if (pswit[SQUOTE_SWITCH])
  2369     {
  2370 	nc=g_utf8_get_char(aline);
  2371 	for (s=aline;*s;s=g_utf8_next_char(s))
  2372 	{
  2373 	    c=nc;
  2374 	    nc=g_utf8_get_char(g_utf8_next_char(s));
  2375 	    if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&
  2376 	      !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
  2377 	      !g_unichar_isalpha(nc)))
  2378 	    {
  2379 		parities->squote=!parities->squote;
  2380 		if (!parities->squote)
  2381 		{
  2382 		    /* parity even */
  2383 		    if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
  2384 		    {
  2385 			if (pswit[ECHO_SWITCH])
  2386 			    g_print("\n%s\n",aline);
  2387 			if (!pswit[OVERVIEW_SWITCH])
  2388 			    g_print("    Line %ld column %ld - "
  2389 			      "Wrongspaced singlequotes?\n",
  2390 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2391 			else
  2392 			    cnt_punct++;
  2393 		    }
  2394 		}
  2395 		else
  2396 		{
  2397 		    /* parity odd */
  2398 		    if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
  2399 		      !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
  2400 		    {
  2401 			if (pswit[ECHO_SWITCH])
  2402 			    g_print("\n%s\n",aline);
  2403 			if (!pswit[OVERVIEW_SWITCH])
  2404 			    g_print("    Line %ld column %ld - "
  2405 			      "Wrongspaced singlequotes?\n",
  2406 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2407 			else
  2408 			    cnt_punct++;
  2409 		    }
  2410 		}
  2411 	    }
  2412 	}
  2413     }
  2414 }
  2415 
  2416 /*
  2417  * check_for_double_punctuation:
  2418  *
  2419  * Look for double punctuation like ,. or ,,
  2420  * Thanks to DW for the suggestion!
  2421  * In books with references, ".," and ".;" are common
  2422  * e.g. "etc., etc.," and vol. 1.; vol 3.;
  2423  * OTOH, from my initial tests, there are also fairly
  2424  * common errors. What to do? Make these cases paranoid?
  2425  * ".," is the most common, so warnings->dotcomma is used
  2426  * to suppress detailed reporting if it occurs often.
  2427  */
  2428 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
  2429 {
  2430     const char *s;
  2431     gunichar c,nc;
  2432     nc=g_utf8_get_char(aline);
  2433     for (s=aline;*s;s=g_utf8_next_char(s))
  2434     {
  2435 	c=nc;
  2436 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2437 	/* for each punctuation character in the line */
  2438 	if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
  2439 	  g_utf8_strchr(".?!,;:",-1,nc))
  2440 	{
  2441 	    /* followed by punctuation, it's a query, unless . . . */
  2442 	    if (c==nc && (c=='.' || c=='?' || c=='!') ||
  2443 	      !warnings->dotcomma && c=='.' && nc==',' ||
  2444 	      warnings->isFrench && g_str_has_prefix(s,",...") ||
  2445 	      warnings->isFrench && g_str_has_prefix(s,"...,") ||
  2446 	      warnings->isFrench && g_str_has_prefix(s,";...") ||
  2447 	      warnings->isFrench && g_str_has_prefix(s,"...;") ||
  2448 	      warnings->isFrench && g_str_has_prefix(s,":...") ||
  2449 	      warnings->isFrench && g_str_has_prefix(s,"...:") ||
  2450 	      warnings->isFrench && g_str_has_prefix(s,"!...") ||
  2451 	      warnings->isFrench && g_str_has_prefix(s,"...!") ||
  2452 	      warnings->isFrench && g_str_has_prefix(s,"?...") ||
  2453 	      warnings->isFrench && g_str_has_prefix(s,"...?"))
  2454 	    {
  2455 		if (warnings->isFrench && g_str_has_prefix(s,",...") ||
  2456 		  warnings->isFrench && g_str_has_prefix(s,"...,") ||
  2457 		  warnings->isFrench && g_str_has_prefix(s,";...") ||
  2458 		  warnings->isFrench && g_str_has_prefix(s,"...;") ||
  2459 		  warnings->isFrench && g_str_has_prefix(s,":...") ||
  2460 		  warnings->isFrench && g_str_has_prefix(s,"...:") ||
  2461 		  warnings->isFrench && g_str_has_prefix(s,"!...") ||
  2462 		  warnings->isFrench && g_str_has_prefix(s,"...!") ||
  2463 		  warnings->isFrench && g_str_has_prefix(s,"?...") ||
  2464 		  warnings->isFrench && g_str_has_prefix(s,"...?"))
  2465 		{
  2466 		    s+=4;
  2467 		    nc=g_utf8_get_char(g_utf8_next_char(s));
  2468 		}
  2469 		; /* do nothing for .. !! and ?? which can be legit */
  2470 	    }
  2471 	    else
  2472 	    {
  2473 		if (pswit[ECHO_SWITCH])
  2474 		    g_print("\n%s\n",aline);
  2475 		if (!pswit[OVERVIEW_SWITCH])
  2476 		    g_print("    Line %ld column %ld - Double punctuation?\n",
  2477 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2478 		else
  2479 		    cnt_punct++;
  2480 	    }
  2481 	}
  2482     }
  2483 }
  2484 
  2485 /*
  2486  * check_for_spaced_quotes:
  2487  */
  2488 void check_for_spaced_quotes(const char *aline)
  2489 {
  2490     int i;
  2491     const char *s,*t;
  2492     const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,
  2493       CHAR_RS_QUOTE};
  2494     GString *pattern;
  2495     s=aline;
  2496     while ((t=strstr(s," \" ")))
  2497     {
  2498 	if (pswit[ECHO_SWITCH])
  2499 	    g_print("\n%s\n",aline);
  2500 	if (!pswit[OVERVIEW_SWITCH])
  2501 	    g_print("    Line %ld column %ld - Spaced doublequote?\n",
  2502 	      linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  2503 	else
  2504 	    cnt_punct++;
  2505 	s=g_utf8_next_char(g_utf8_next_char(t));
  2506     }
  2507     pattern=g_string_new(NULL);
  2508     for(i=0;i<G_N_ELEMENTS(single_quotes);i++)
  2509     {
  2510 	g_string_assign(pattern," ");
  2511 	g_string_append_unichar(pattern,single_quotes[i]);
  2512 	g_string_append_c(pattern,' ');
  2513 	s=aline;
  2514 	while ((t=strstr(s,pattern->str)))
  2515 	{
  2516 	    if (pswit[ECHO_SWITCH])
  2517 		g_print("\n%s\n",aline);
  2518 	    if (!pswit[OVERVIEW_SWITCH])
  2519 		g_print("    Line %ld column %ld - Spaced singlequote?\n",
  2520 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  2521 	    else
  2522 		cnt_punct++;
  2523 	    s=g_utf8_next_char(g_utf8_next_char(t));
  2524 	}
  2525     }
  2526     g_string_free(pattern,TRUE);
  2527 }
  2528 
  2529 /*
  2530  * check_for_miscased_genative:
  2531  *
  2532  * Check special case of 'S instead of 's at end of word.
  2533  */
  2534 void check_for_miscased_genative(const char *aline)
  2535 {
  2536     const char *s;
  2537     gunichar c,nc,pc;
  2538     if (!*aline)
  2539 	return;
  2540     c=g_utf8_get_char(aline);
  2541     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2542     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2543     {
  2544 	pc=c;
  2545 	c=nc;
  2546 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2547 	if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))
  2548 	{
  2549 	    if (pswit[ECHO_SWITCH])
  2550 		g_print("\n%s\n",aline);
  2551 	    if (!pswit[OVERVIEW_SWITCH])
  2552 		g_print("    Line %ld column %ld - Capital \"S\"?\n",
  2553 		  linecnt,g_utf8_pointer_to_offset(aline,s)+2);
  2554 	    else
  2555 		cnt_punct++;
  2556 	}
  2557     }
  2558 }
  2559 
  2560 /*
  2561  * check_end_of_line:
  2562  *
  2563  * Now check special cases - start and end of line -
  2564  * for single and double quotes. Start is sometimes [sic]
  2565  * but better to query it anyway.
  2566  * While we're here, check for dash at end of line.
  2567  */
  2568 void check_end_of_line(const char *aline,struct warnings *warnings)
  2569 {
  2570     int lbytes;
  2571     const char *s;
  2572     gunichar c1,c2;
  2573     lbytes=strlen(aline);
  2574     if (g_utf8_strlen(aline,lbytes)>1)
  2575     {
  2576 	s=g_utf8_prev_char(aline+lbytes);
  2577 	c1=g_utf8_get_char(s);
  2578 	c2=g_utf8_get_char(g_utf8_prev_char(s));
  2579 	if ((CHAR_IS_DQUOTE(c1) || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)
  2580 	{
  2581 	    if (pswit[ECHO_SWITCH])
  2582 		g_print("\n%s\n",aline);
  2583 	    if (!pswit[OVERVIEW_SWITCH])
  2584 		g_print("    Line %ld column %ld - Spaced quote?\n",linecnt,
  2585 		  g_utf8_strlen(aline,lbytes));
  2586 	    else
  2587 		cnt_punct++;
  2588 	}
  2589 	c1=g_utf8_get_char(aline);
  2590 	c2=g_utf8_get_char(g_utf8_next_char(aline));
  2591 	if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)
  2592 	{
  2593 	    if (pswit[ECHO_SWITCH])
  2594 		g_print("\n%s\n",aline);
  2595 	    if (!pswit[OVERVIEW_SWITCH])
  2596 		g_print("    Line %ld column 1 - Spaced quote?\n",linecnt);
  2597 	    else
  2598 		cnt_punct++;
  2599 	}
  2600 	/*
  2601 	 * Dash at end of line may well be legit - paranoid mode only
  2602 	 * and don't report em-dash at line-end.
  2603 	 */
  2604 	if (pswit[PARANOID_SWITCH] && warnings->hyphen)
  2605 	{
  2606 	    for (s=g_utf8_prev_char(aline+lbytes);
  2607 	      s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
  2608 		;
  2609 	    if (g_utf8_get_char(s)=='-' &&
  2610 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')
  2611 	    {
  2612 		if (pswit[ECHO_SWITCH])
  2613 		    g_print("\n%s\n",aline);
  2614 		if (!pswit[OVERVIEW_SWITCH])
  2615 		    g_print("    Line %ld column %ld - "
  2616 		      "Hyphen at end of line?\n",
  2617 		      linecnt,g_utf8_pointer_to_offset(aline,s));
  2618 	    }
  2619 	}
  2620     }
  2621 }
  2622 
  2623 /*
  2624  * check_for_unspaced_bracket:
  2625  *
  2626  * Brackets are often unspaced, but shouldn't be surrounded by alpha.
  2627  * If so, suspect a scanno like "a]most".
  2628  */
  2629 void check_for_unspaced_bracket(const char *aline)
  2630 {
  2631     const char *s;
  2632     gunichar c,nc,pc;
  2633     c=g_utf8_get_char(aline);
  2634     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2635     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2636     {
  2637 	pc=c;
  2638 	c=nc;
  2639 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2640 	if (!nc)
  2641 	    break;
  2642 	/* for each bracket character in the line except 1st & last */
  2643 	if (g_utf8_strchr("{[()]}",-1,c) &&
  2644 	  g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
  2645 	{
  2646 	    if (pswit[ECHO_SWITCH])
  2647 		g_print("\n%s\n",aline);
  2648 	    if (!pswit[OVERVIEW_SWITCH])
  2649 		g_print("    Line %ld column %ld - Unspaced bracket?\n",
  2650 		  linecnt,g_utf8_pointer_to_offset(aline,s));
  2651 	    else
  2652 		cnt_punct++;
  2653 	}
  2654     }
  2655 }
  2656 
  2657 /*
  2658  * check_for_unpunctuated_endquote:
  2659  */
  2660 void check_for_unpunctuated_endquote(const char *aline)
  2661 {
  2662     const char *s;
  2663     gunichar c,nc,pc;
  2664     QuoteClass qc;
  2665     c=g_utf8_get_char(aline);
  2666     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2667     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2668     {
  2669 	pc=c;
  2670 	c=nc;
  2671 	qc=CHAR_IS_DQUOTE(c)?QUOTE_CLASS(c):INVALID_QUOTE;
  2672 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2673 	/* for each character in the line except 1st */
  2674 	if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) && g_unichar_isalpha(pc))
  2675 	{
  2676 	    if (pswit[ECHO_SWITCH])
  2677 		g_print("\n%s\n",aline);
  2678 	    if (!pswit[OVERVIEW_SWITCH])
  2679 		g_print("    Line %ld column %ld - "
  2680 		  "endquote missing punctuation?\n",
  2681 		  linecnt,g_utf8_pointer_to_offset(aline,s));
  2682 	    else
  2683 		cnt_punct++;
  2684 	}
  2685     }
  2686 }
  2687 
  2688 /*
  2689  * check_for_html_tag:
  2690  *
  2691  * Check for <HTML TAG>.
  2692  *
  2693  * If there is a < in the line, followed at some point
  2694  * by a > then we suspect HTML.
  2695  */
  2696 void check_for_html_tag(const char *aline)
  2697 {
  2698     const char *open,*close;
  2699     gchar *tag;
  2700     open=strchr(aline,'<');
  2701     if (open)
  2702     {
  2703 	close=strchr(g_utf8_next_char(open),'>');
  2704 	if (close)
  2705 	{
  2706 	    if (pswit[ECHO_SWITCH])
  2707 		g_print("\n%s\n",aline);
  2708 	    if (!pswit[OVERVIEW_SWITCH])
  2709 	    {
  2710 		tag=g_strndup(open,close-open+1);
  2711 		g_print("    Line %ld column %ld - HTML Tag? %s \n",
  2712 		  linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
  2713 		g_free(tag);
  2714 	    }
  2715 	    else
  2716 		cnt_html++;
  2717 	}
  2718     }
  2719 }
  2720 
  2721 /*
  2722  * check_for_html_entity:
  2723  *
  2724  * Check for &symbol; HTML.
  2725  *
  2726  * If there is a & in the line, followed at
  2727  * some point by a ; then we suspect HTML.
  2728  */
  2729 void check_for_html_entity(const char *aline)
  2730 {
  2731     const char *s,*amp,*scolon;
  2732     gchar *entity;
  2733     amp=strchr(aline,'&');
  2734     if (amp)
  2735     {
  2736 	scolon=strchr(amp,';');
  2737 	if (scolon)
  2738 	{
  2739 	    for (s=amp;s<scolon;s=g_utf8_next_char(s))   
  2740 		if (g_utf8_get_char(s)==CHAR_SPACE)
  2741 		    break;		/* Don't report "Jones & Son;" */
  2742 	    if (s>=scolon)
  2743 	    {
  2744 		if (pswit[ECHO_SWITCH])
  2745 		    g_print("\n%s\n",aline);
  2746 		if (!pswit[OVERVIEW_SWITCH])
  2747 		{
  2748 		    entity=g_strndup(amp,scolon-amp+1);
  2749 		    g_print("    Line %ld column %d - HTML symbol? %s \n",
  2750 		      linecnt,(int)(amp-aline)+1,entity);
  2751 		    g_free(entity);
  2752 		}
  2753 		else
  2754 		    cnt_html++;
  2755 	    }
  2756 	}
  2757     }
  2758 }
  2759 
  2760 /*
  2761  * check_for_omitted_punctuation:
  2762  *
  2763  * Check for omitted punctuation at end of paragraph by working back
  2764  * through prevline. DW.
  2765  * Need to check this only for "normal" paras.
  2766  * So what is a "normal" para?
  2767  *    Not normal if one-liner (chapter headings, etc.)
  2768  *    Not normal if doesn't contain at least one locase letter
  2769  *    Not normal if starts with space
  2770  */
  2771 void check_for_omitted_punctuation(const char *prevline,
  2772   struct line_properties *last,int start_para_line)
  2773 {
  2774     gboolean letter_on_line=FALSE;
  2775     const char *s;
  2776     gunichar c;
  2777     gboolean closing_quote;
  2778     for (s=prevline;*s;s=g_utf8_next_char(s))
  2779 	if (g_unichar_isalpha(g_utf8_get_char(s)))
  2780 	{
  2781 	    letter_on_line=TRUE;
  2782 	    break;
  2783 	}
  2784     /*
  2785      * This next "if" is a problem.
  2786      * If we say "start_para_line <= linecnt - 1", that includes
  2787      * one-line "paragraphs" like chapter heads. Lotsa false positives.
  2788      * If we say "start_para_line < linecnt - 1" it doesn't, but then it
  2789      * misses genuine one-line paragraphs.
  2790      */
  2791     if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
  2792       g_utf8_get_char(prevline)>CHAR_SPACE)
  2793     {
  2794 	s=prevline+strlen(prevline);
  2795 	do
  2796 	{
  2797 	    s=g_utf8_prev_char(s);
  2798 	    c=g_utf8_get_char(s);
  2799 	    if (QUOTE_CLASS(c)==CLOSING_QUOTE || QUOTE_CLASS(c)==NEUTRAL_QUOTE)
  2800 		closing_quote=TRUE;
  2801 	    else
  2802 		closing_quote=FALSE;
  2803 	} while (closing_quote && s>prevline);
  2804 	for (;s>prevline;s=g_utf8_prev_char(s))
  2805 	{
  2806 	    if (g_unichar_isalpha(g_utf8_get_char(s)))
  2807 	    {
  2808 		if (pswit[ECHO_SWITCH])
  2809 		    g_print("\n%s\n",prevline);
  2810 		if (!pswit[OVERVIEW_SWITCH])
  2811 		    g_print("    Line %ld column %ld - "
  2812 		      "No punctuation at para end?\n",
  2813 		      linecnt-1,g_utf8_strlen(prevline,-1));
  2814 		else
  2815 		    cnt_punct++;
  2816 		break;
  2817 	    }
  2818 	    if (g_utf8_strchr("—-.:!([{?}])",-1,g_utf8_get_char(s)))
  2819 		break;
  2820 	}
  2821     }
  2822 }
  2823 
  2824 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
  2825 {
  2826     const char *word=key;
  2827     int *dupcnt=value;
  2828     if (*dupcnt)
  2829 	g_print("\nNote: Queried word %s was duplicated %d times\n",
  2830 	  word,*dupcnt);
  2831     return FALSE;
  2832 }
  2833 
  2834 void print_as_windows_1252(const char *string)
  2835 {
  2836     gsize inbytes,outbytes;
  2837     gchar *buf,*bp;
  2838     static GIConv converter=(GIConv)-1;
  2839     if (!string)
  2840     {
  2841 	if (converter!=(GIConv)-1)
  2842 	    g_iconv_close(converter);
  2843 	converter=(GIConv)-1;
  2844 	return;
  2845     }
  2846     if (converter==(GIConv)-1)
  2847 	converter=g_iconv_open("WINDOWS-1252","UTF-8");
  2848     if (converter!=(GIConv)-1)
  2849     {
  2850 	inbytes=outbytes=strlen(string);
  2851 	bp=buf=g_malloc(outbytes+1);
  2852 	g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
  2853 	*bp='\0';
  2854 	fputs(buf,stdout);
  2855 	g_free(buf);
  2856     }
  2857     else
  2858 	fputs(string,stdout);
  2859 }
  2860 
  2861 void print_as_utf_8(const char *string)
  2862 {
  2863     fputs(string,stdout);
  2864 }
  2865 
  2866 /*
  2867  * procfile:
  2868  *
  2869  * Process one file.
  2870  */
  2871 void procfile(const char *filename)
  2872 {
  2873     const char *s;
  2874     gchar *parastart=NULL;	/* first line of current para */
  2875     gchar *etext,*aline;
  2876     gchar *etext_ptr;
  2877     GError *err=NULL;
  2878     struct first_pass_results *first_pass_results;
  2879     struct warnings *warnings;
  2880     struct counters counters={0};
  2881     struct line_properties last={0};
  2882     struct parities parities={0};
  2883     struct pending pending={0};
  2884     gboolean isemptyline;
  2885     long start_para_line=0;
  2886     gboolean isnewpara=FALSE,enddash=FALSE;
  2887     last.start=CHAR_SPACE;
  2888     linecnt=checked_linecnt=0;
  2889     etext=read_etext(filename,&err);
  2890     if (!etext)
  2891     {
  2892 	if (pswit[STDOUT_SWITCH])
  2893 	    fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
  2894 	else
  2895 	    fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
  2896 	exit(1);
  2897     }
  2898     g_print("\n\nFile: %s\n\n",filename);
  2899     first_pass_results=first_pass(etext);
  2900     warnings=report_first_pass(first_pass_results);
  2901     qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
  2902     qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
  2903     /*
  2904      * Here we go with the main pass. Hold onto yer hat!
  2905      */
  2906     linecnt=0;
  2907     etext_ptr=etext;
  2908     while ((aline=flgets(&etext_ptr,linecnt+1,warnings->newlines)))
  2909     {
  2910 	linecnt++;
  2911 	if (linecnt==1)
  2912 	    isnewpara=TRUE;
  2913 	if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
  2914 	    continue;    // skip DP page separators completely
  2915 	if (linecnt<first_pass_results->firstline ||
  2916 	  (first_pass_results->footerline>0 &&
  2917 	  linecnt>first_pass_results->footerline))
  2918 	{
  2919 	    if (pswit[HEADER_SWITCH])
  2920 	    {
  2921 		if (g_str_has_prefix(aline,"Title:"))
  2922 		    g_print("    %s\n",aline);
  2923 		if (g_str_has_prefix(aline,"Author:"))
  2924 		    g_print("    %s\n",aline);
  2925 		if (g_str_has_prefix(aline,"Release Date:"))
  2926 		    g_print("    %s\n",aline);
  2927 		if (g_str_has_prefix(aline,"Edition:"))
  2928 		    g_print("    %s\n\n",aline);
  2929 	    }
  2930 	    continue;		/* skip through the header */
  2931 	}
  2932 	checked_linecnt++;
  2933 	print_pending(aline,parastart,&pending);
  2934 	isemptyline=analyse_quotes(aline,&counters);
  2935 	if (isnewpara && !isemptyline)
  2936 	{
  2937 	    /* This line is the start of a new paragraph. */
  2938 	    start_para_line=linecnt;
  2939 	    /* Capture its first line in case we want to report it later. */
  2940 	    g_free(parastart);
  2941 	    parastart=g_strdup(aline);
  2942 	    memset(&parities,0,sizeof(parities));  /* restart the quote count */
  2943 	    s=aline;
  2944 	    while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
  2945 	      !g_unichar_isdigit(g_utf8_get_char(s)))
  2946 		s=g_utf8_next_char(s);
  2947 	    if (g_unichar_islower(g_utf8_get_char(s)))
  2948 	    {
  2949 		/* and its first letter is lowercase */
  2950 		if (pswit[ECHO_SWITCH])
  2951 		    g_print("\n%s\n",aline);
  2952 		if (!pswit[OVERVIEW_SWITCH])
  2953 		    g_print("    Line %ld column %ld - "
  2954 		      "Paragraph starts with lower-case\n",
  2955 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2956 		else
  2957 		    cnt_punct++;
  2958 	    }
  2959 	    isnewpara=FALSE; /* Signal the end of new para processing. */
  2960 	}
  2961 	/* Check for an em-dash broken at line end. */
  2962 	if (enddash && g_utf8_get_char(aline)=='-')
  2963 	{
  2964 	    if (pswit[ECHO_SWITCH])
  2965 		g_print("\n%s\n",aline);
  2966 	    if (!pswit[OVERVIEW_SWITCH])
  2967 		g_print("    Line %ld column 1 - Broken em-dash?\n",linecnt);
  2968 	    else
  2969 		cnt_punct++;
  2970 	}
  2971 	enddash=FALSE;
  2972 	for (s=g_utf8_prev_char(aline+strlen(aline));
  2973 	  g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
  2974 	    ;
  2975 	if (s>=aline && g_utf8_get_char(s)=='-')
  2976 	    enddash=TRUE;
  2977 	check_for_control_characters(aline);
  2978 	if (warnings->bin)
  2979 	    check_for_odd_characters(aline,warnings,isemptyline);
  2980 	if (warnings->longline)
  2981 	    check_for_long_line(aline);
  2982 	if (warnings->shortline)
  2983 	    check_for_short_line(aline,&last);
  2984 	last.blen=last.len;
  2985 	last.len=g_utf8_strlen(aline,-1);
  2986 	last.start=g_utf8_get_char(aline);
  2987 	check_for_starting_punctuation(aline);
  2988 	if (warnings->dash)
  2989 	{
  2990 	    check_for_spaced_emdash(aline);
  2991 	    check_for_spaced_dash(aline);
  2992 	}
  2993 	check_for_unmarked_paragraphs(aline);
  2994 	check_for_jeebies(aline);
  2995 	check_for_mta_from(aline);
  2996 	check_for_orphan_character(aline);
  2997 	check_for_pling_scanno(aline);
  2998 	check_for_extra_period(aline,warnings);
  2999 	check_for_following_punctuation(aline);
  3000 	check_for_typos(aline,warnings);
  3001 	check_for_misspaced_punctuation(aline,&parities,isemptyline);
  3002 	check_for_double_punctuation(aline,warnings);
  3003 	check_for_spaced_quotes(aline);
  3004 	check_for_miscased_genative(aline);
  3005 	check_end_of_line(aline,warnings);
  3006 	check_for_unspaced_bracket(aline);
  3007 	if (warnings->endquote)
  3008 	    check_for_unpunctuated_endquote(aline);
  3009 	check_for_html_tag(aline);
  3010 	check_for_html_entity(aline);
  3011 	if (isemptyline)
  3012 	{
  3013 	    check_for_mismatched_quotes(&counters,&pending);
  3014 	    counters_reset(&counters);
  3015 	    /* let the next iteration know that it's starting a new para */
  3016 	    isnewpara=TRUE;
  3017 	    if (prevline)
  3018 		check_for_omitted_punctuation(prevline,&last,start_para_line);
  3019 	}
  3020 	g_free(prevline);
  3021 	prevline=g_strdup(aline);
  3022     }
  3023     linecnt++;
  3024     check_for_mismatched_quotes(&counters,&pending);
  3025     print_pending(NULL,parastart,&pending);
  3026     reset_pending(&pending);
  3027     if (prevline)
  3028     {
  3029 	g_free(prevline);
  3030 	prevline=NULL;
  3031     }
  3032     g_free(parastart);
  3033     g_free(prevline);
  3034     g_free(etext);
  3035     if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])
  3036 	g_tree_foreach(qword,report_duplicate_queries,NULL);
  3037     g_tree_unref(qword);
  3038     g_tree_unref(qperiod);
  3039     counters_destroy(&counters);
  3040     g_set_print_handler(NULL);
  3041     print_as_windows_1252(NULL);
  3042     if (pswit[MARKUP_SWITCH])  
  3043 	loseentities(NULL);
  3044 }
  3045 
  3046 /*
  3047  * flgets:
  3048  *
  3049  * Get one line from the input text. The setting of newlines has the following
  3050  * effect:
  3051  *
  3052  * DOS_NEWLINES: Check for the existence of exactly one CR-LF line-end per line.
  3053  *
  3054  * OS9_NEWLINES: Asserts that etext contains no LFs. CR is used as
  3055  *		 the newline character.
  3056  *
  3057  * UNIX_NEWLINES: Check for the presence of CRs.
  3058  *
  3059  * In all cases, check that the last line is correctly terminated.
  3060  *
  3061  * Returns: a pointer to the line.
  3062  */
  3063 char *flgets(char **etext,long lcnt,int newlines)
  3064 {
  3065     gunichar c;
  3066     gboolean isCR=FALSE;
  3067     char *theline=*etext;
  3068     char *eos=theline;
  3069     gchar *s;
  3070     for (;;)
  3071     {
  3072 	c=g_utf8_get_char(*etext);
  3073 	if (!c)
  3074 	{
  3075 	    if (*etext==theline)
  3076 		return NULL;
  3077 	    else if (pswit[LINE_END_SWITCH])
  3078 	    {
  3079 		if (pswit[ECHO_SWITCH])
  3080 		{
  3081 		    s=g_strndup(theline,eos-theline);
  3082 		    g_print("\n%s\n",s);
  3083 		    g_free(s);
  3084 		}
  3085 		if (!pswit[OVERVIEW_SWITCH])
  3086 		{
  3087 		    if (newlines==OS9_NEWLINES)
  3088 			g_print("    Line %ld - No CR?\n",lcnt);
  3089 		    else
  3090 		    {
  3091 			/* There may, or may not, have been a CR */
  3092 			g_print("    Line %ld - No LF?\n",lcnt);
  3093 		    }
  3094 		}
  3095 		else
  3096 		    cnt_lineend++;
  3097 	    }
  3098 	    break;
  3099 	}
  3100 	*etext=g_utf8_next_char(*etext);
  3101 	/* either way, it's end of line */
  3102 	if (c=='\n')
  3103 	{
  3104 	    if (newlines==DOS_NEWLINES && !isCR)
  3105 	    {
  3106 		/* Error - a LF without a preceding CR */
  3107 		if (pswit[LINE_END_SWITCH])
  3108 		{
  3109 		    if (pswit[ECHO_SWITCH])
  3110 		    {
  3111 			s=g_strndup(theline,eos-theline);
  3112 			g_print("\n%s\n",s);
  3113 			g_free(s);
  3114 		    }
  3115 		    if (!pswit[OVERVIEW_SWITCH])
  3116 			g_print("    Line %ld - No CR?\n",lcnt);
  3117 		    else
  3118 			cnt_lineend++;
  3119 		}
  3120 	    }
  3121 	    break;
  3122 	}
  3123 	if (c=='\r')
  3124 	{
  3125 	    if (newlines==OS9_NEWLINES)
  3126 		break;
  3127 	    if (isCR || newlines==UNIX_NEWLINES)
  3128 	    {
  3129 		if (pswit[LINE_END_SWITCH])
  3130 		{
  3131 		    if (pswit[ECHO_SWITCH])
  3132 		    {
  3133 			s=g_strndup(theline,eos-theline);
  3134 			g_print("\n%s\n",s);
  3135 			g_free(s);
  3136 		    }
  3137 		    if (!pswit[OVERVIEW_SWITCH])
  3138 		    {
  3139 			if (newlines==UNIX_NEWLINES)
  3140 			    g_print("    Line %ld column %ld - Embedded CR?\n",
  3141 			      lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
  3142 			else
  3143 			    g_print("    Line %ld - Two successive CRs?\n",
  3144 			      lcnt);
  3145 		    }
  3146 		    else
  3147 			cnt_lineend++;
  3148 		}
  3149 		if (newlines==UNIX_NEWLINES)
  3150 		    *eos=' ';
  3151 	    }
  3152 	    if (newlines==DOS_NEWLINES)
  3153 		isCR=TRUE;
  3154 	}
  3155 	else
  3156 	{
  3157 	    if (pswit[LINE_END_SWITCH] && isCR)
  3158 	    {
  3159 		if (pswit[ECHO_SWITCH])
  3160 		{
  3161 		    s=g_strndup(theline,eos-theline);
  3162 		    g_print("\n%s\n",s);
  3163 		    g_free(s);
  3164 		}
  3165 		if (!pswit[OVERVIEW_SWITCH])
  3166 		    g_print("    Line %ld column %ld - CR without LF?\n",
  3167 		      lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
  3168 		else
  3169 		    cnt_lineend++;
  3170 		*eos=' ';
  3171 	    }
  3172 	    isCR=FALSE;
  3173 	    eos=g_utf8_next_char(eos);
  3174 	}
  3175     }
  3176     *eos='\0';
  3177     if (pswit[MARKUP_SWITCH])  
  3178 	postprocess_for_HTML(theline);
  3179     if (pswit[DP_SWITCH])  
  3180 	postprocess_for_DP(theline);
  3181     return theline;
  3182 }
  3183 
  3184 /*
  3185  * mixdigit:
  3186  *
  3187  * Takes a "word" as a parameter, and checks whether it
  3188  * contains a mixture of alpha and digits. Generally, this is an
  3189  * error, but may not be for cases like 4th or L5 12s. 3d.
  3190  *
  3191  * Returns: TRUE iff an is error found.
  3192  */
  3193 gboolean mixdigit(const char *checkword)
  3194 {
  3195     gboolean wehaveadigit,wehavealetter,query;
  3196     const char *s,*nondigit;
  3197     wehaveadigit=wehavealetter=query=FALSE;
  3198     for (s=checkword;*s;s=g_utf8_next_char(s))
  3199 	if (g_unichar_isalpha(g_utf8_get_char(s)))
  3200 	    wehavealetter=TRUE;
  3201 	else if (g_unichar_isdigit(g_utf8_get_char(s)))
  3202 	    wehaveadigit=TRUE;
  3203     if (wehaveadigit && wehavealetter)
  3204     {
  3205 	/* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
  3206 	query=TRUE;
  3207 	for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
  3208 	  nondigit=g_utf8_next_char(nondigit))
  3209 	    ;
  3210 	/* digits, ending in st, rd, nd, th of either case */
  3211 	if (!g_ascii_strcasecmp(nondigit,"st") ||
  3212 	  !g_ascii_strcasecmp(nondigit,"rd") ||
  3213 	  !g_ascii_strcasecmp(nondigit,"nd") ||
  3214 	  !g_ascii_strcasecmp(nondigit,"th"))
  3215 	    query=FALSE;
  3216 	if (!g_ascii_strcasecmp(nondigit,"sts") ||
  3217 	  !g_ascii_strcasecmp(nondigit,"rds") ||
  3218 	  !g_ascii_strcasecmp(nondigit,"nds") ||
  3219 	  !g_ascii_strcasecmp(nondigit,"ths"))
  3220 	    query=FALSE;
  3221 	if (!g_ascii_strcasecmp(nondigit,"stly") ||
  3222 	  !g_ascii_strcasecmp(nondigit,"rdly") ||
  3223 	  !g_ascii_strcasecmp(nondigit,"ndly") ||
  3224 	  !g_ascii_strcasecmp(nondigit,"thly"))
  3225 	    query=FALSE;
  3226 	/* digits, ending in l, L, s or d */
  3227 	if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
  3228 	  !strcmp(nondigit,"d"))
  3229 	    query=FALSE;
  3230 	/*
  3231 	 * L at the start of a number, representing Britsh pounds, like L500.
  3232 	 * This is cute. We know the current word is mixed digit. If the first
  3233 	 * letter is L, there must be at least one digit following. If both
  3234 	 * digits and letters follow, we have a genuine error, else we have a
  3235 	 * capital L followed by digits, and we accept that as a non-error.
  3236 	 */
  3237 	if (g_utf8_get_char(checkword)=='L' &&
  3238 	  !mixdigit(g_utf8_next_char(checkword)))
  3239 	    query=FALSE;
  3240     }
  3241     return query;
  3242 }
  3243 
  3244 /*
  3245  * getaword:
  3246  *
  3247  * Extracts the first/next "word" from the line, and returns it.
  3248  * A word is defined as one English word unit--or at least that's the aim.
  3249  * "ptr" is advanced to the position in the line where we will start
  3250  * looking for the next word.
  3251  *
  3252  * Returns: A newly-allocated string.
  3253  */
  3254 gchar *getaword(const char **ptr)
  3255 {
  3256     const char *s,*t;
  3257     GString *word;
  3258     gunichar c,pc;
  3259     word=g_string_new(NULL);
  3260     for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
  3261       !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
  3262       **ptr;*ptr=g_utf8_next_char(*ptr))
  3263     {
  3264 	/* Handle exceptions for footnote markers like [1] */
  3265 	if (g_utf8_get_char(*ptr)=='[')
  3266 	{
  3267 	    g_string_append_c(word,'[');
  3268 	    s=g_utf8_next_char(*ptr);
  3269 	    for (;g_unichar_isdigit(g_utf8_get_char(s));s=g_utf8_next_char(s))
  3270 		g_string_append_unichar(word,g_utf8_get_char(s));
  3271 	    if (g_utf8_get_char(s)==']')
  3272 	    {
  3273 		g_string_append_c(word,']');
  3274 		*ptr=g_utf8_next_char(s);
  3275 		return g_string_free(word,FALSE);
  3276 	    }
  3277 	    else
  3278 		g_string_truncate(word,0);
  3279 	}
  3280     }
  3281     /*
  3282      * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
  3283      * Especially yucky is the case of L1,000
  3284      * This section looks for a pattern of characters including a digit
  3285      * followed by a comma or period followed by one or more digits.
  3286      * If found, it returns this whole pattern as a word; otherwise we discard
  3287      * the results and resume our normal programming.
  3288      */
  3289     s=*ptr;
  3290     for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
  3291       g_unichar_isalpha(g_utf8_get_char(s)) ||
  3292       g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
  3293 	g_string_append_unichar(word,g_utf8_get_char(s));
  3294     if (word->len)
  3295     {
  3296 	for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))
  3297 	{
  3298 	    c=g_utf8_get_char(t);
  3299 	    pc=g_utf8_get_char(g_utf8_prev_char(t));
  3300 	    if ((c=='.' || c==',') && g_unichar_isdigit(pc))
  3301 	    {
  3302 		*ptr=s;
  3303 		return g_string_free(word,FALSE);
  3304 	    }
  3305 	}
  3306     }
  3307     /* we didn't find a punctuated number - do the regular getword thing */
  3308     g_string_truncate(word,0);
  3309     c=g_utf8_get_char(*ptr);
  3310     for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);
  3311       *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))
  3312 	g_string_append_unichar(word,c);
  3313     return g_string_free(word,FALSE);
  3314 }
  3315 
  3316 /*
  3317  * isroman:
  3318  *
  3319  * Is this word a Roman Numeral?
  3320  *
  3321  * It doesn't actually validate that the number is a valid Roman Numeral--for
  3322  * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
  3323  * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
  3324  * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
  3325  * expressions thereof, except when it came to taxes. Allow any number of M,
  3326  * an optional D, an optional CM or CD, any number of optional Cs, an optional
  3327  * XL or an optional XC, an optional IX or IV, an optional V and any number
  3328  * of optional Is.
  3329  */
  3330 gboolean isroman(const char *t)
  3331 {
  3332     const char *s;
  3333     if (!t || !*t)
  3334 	return FALSE;
  3335     s=t;
  3336     while (g_utf8_get_char(t)=='m' && *t)
  3337 	t++;
  3338     if (g_utf8_get_char(t)=='d')
  3339 	t++;
  3340     if (g_str_has_prefix(t,"cm"))
  3341 	t+=2;
  3342     if (g_str_has_prefix(t,"cd"))
  3343 	t+=2;
  3344     while (g_utf8_get_char(t)=='c' && *t)
  3345 	t++;
  3346     if (g_str_has_prefix(t,"xl"))
  3347 	t+=2;
  3348     if (g_str_has_prefix(t,"xc"))
  3349 	t+=2;
  3350     if (g_utf8_get_char(t)=='l')
  3351 	t++;
  3352     while (g_utf8_get_char(t)=='x' && *t)
  3353 	t++;
  3354     if (g_str_has_prefix(t,"ix"))
  3355 	t+=2;
  3356     if (g_str_has_prefix(t,"iv"))
  3357 	t+=2;
  3358     if (g_utf8_get_char(t)=='v')
  3359 	t++;
  3360     while (g_utf8_get_char(t)=='i' && *t)
  3361 	t++;
  3362     return !*t;
  3363 }
  3364 
  3365 /*
  3366  * postprocess_for_DP:
  3367  *
  3368  * Invoked with the -d switch from flgets().
  3369  * It simply "removes" from the line a hard-coded set of common
  3370  * DP-specific tags, so that the line passed to the main routine has
  3371  * been pre-cleaned of DP markup.
  3372  */
  3373 void postprocess_for_DP(char *theline)
  3374 {
  3375     char *s,*t;
  3376     int i;
  3377     if (!*theline) 
  3378 	return;
  3379     for (i=0;*DPmarkup[i];i++)
  3380 	while ((s=strstr(theline,DPmarkup[i])))
  3381 	{
  3382 	    t=s+strlen(DPmarkup[i]);
  3383 	    memmove(s,t,strlen(t)+1);
  3384 	}
  3385 }
  3386 
  3387 /*
  3388  * postprocess_for_HTML:
  3389  *
  3390  * Invoked with the -m switch from flgets().
  3391  * It simply "removes" from the line a hard-coded set of common
  3392  * HTML tags and "replaces" a hard-coded set of common HTML
  3393  * entities, so that the line passed to the main routine has
  3394  * been pre-cleaned of HTML.
  3395  */
  3396 void postprocess_for_HTML(char *theline)
  3397 {
  3398     while (losemarkup(theline))
  3399 	;
  3400     loseentities(theline);
  3401 }
  3402 
  3403 char *losemarkup(char *theline)
  3404 {
  3405     char *s,*t;
  3406     int i;
  3407     s=strchr(theline,'<');
  3408     t=s?strchr(s,'>'):NULL;
  3409     if (!s || !t)
  3410 	return NULL;
  3411     for (i=0;*markup[i];i++)
  3412 	if (tagcomp(g_utf8_next_char(s),markup[i]))
  3413 	{
  3414 	    t=g_utf8_next_char(t);
  3415 	    memmove(s,t,strlen(t)+1);
  3416 	    return s;
  3417 	}
  3418     /* It's an unrecognized <xxx>. */
  3419     return NULL;
  3420 }
  3421 
  3422 void loseentities(char *theline)
  3423 {
  3424     int i;
  3425     gsize nb;
  3426     char *amp,*scolon;
  3427     gchar *s,*t;
  3428     gunichar c;
  3429     GTree *entities=NULL;
  3430     static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
  3431     if (!theline)
  3432     {
  3433 	if (entities)
  3434 	    g_tree_destroy(entities);
  3435 	entities=NULL;
  3436 	if (translit!=(GIConv)-1)
  3437 	    g_iconv_close(translit);
  3438 	translit=(GIConv)-1;
  3439 	if (to_utf8!=(GIConv)-1)
  3440 	    g_iconv_close(to_utf8);
  3441 	to_utf8=(GIConv)-1;
  3442 	return;
  3443     }
  3444     if (!*theline)
  3445 	return;
  3446     if (!entities)
  3447     {
  3448 	entities=g_tree_new((GCompareFunc)strcmp);
  3449 	for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
  3450 	    g_tree_insert(entities,HTMLentities[i].name,
  3451 	      GUINT_TO_POINTER(HTMLentities[i].c));
  3452     }
  3453     if (translit==(GIConv)-1)
  3454 	translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
  3455     if (to_utf8==(GIConv)-1)
  3456 	to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
  3457     while((amp=strchr(theline,'&')))
  3458     {
  3459 	scolon=strchr(amp,';');
  3460 	if (scolon)
  3461 	{
  3462 	    if (amp[1]=='#')
  3463 	    {
  3464 		if (amp+2+strspn(amp+2,"0123456789")==scolon)
  3465 		    c=strtol(amp+2,NULL,10);
  3466 		else if (amp[2]=='x' &&
  3467 		  amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
  3468 		    c=strtol(amp+3,NULL,16);
  3469 	    }
  3470 	    else
  3471 	    {
  3472 		s=g_strndup(amp+1,scolon-(amp+1));
  3473 	        c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
  3474 		g_free(s);
  3475 	    }
  3476 	}
  3477 	else
  3478 	    c=0;
  3479 	if (c)
  3480 	{
  3481 	    theline=amp;
  3482 	    if (c<128 || c>=192 && c<=255)	/* An ISO-8859-1 character */
  3483 		theline+=g_unichar_to_utf8(c,theline);
  3484 	    else
  3485 	    {
  3486 		s=g_malloc(6);
  3487 		nb=g_unichar_to_utf8(c,s);
  3488 		t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
  3489 		g_free(s);
  3490 		s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
  3491 		g_free(t);
  3492 		memcpy(theline,s,nb);
  3493 		g_free(s);
  3494 		theline+=nb;
  3495 	    }
  3496 	    memmove(theline,g_utf8_next_char(scolon),
  3497 	      strlen(g_utf8_next_char(scolon))+1);
  3498 	}
  3499 	else
  3500 	    theline=g_utf8_next_char(amp);
  3501     }
  3502 }
  3503 
  3504 gboolean tagcomp(const char *strin,const char *basetag)
  3505 {
  3506     gboolean retval;
  3507     gchar *s,*t;
  3508     if (g_utf8_get_char(strin)=='/')
  3509 	t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
  3510     else
  3511 	t=g_utf8_casefold(strin,-1);
  3512     s=g_utf8_casefold(basetag,-1);
  3513     retval=g_str_has_prefix(t,s);
  3514     g_free(s);
  3515     g_free(t);
  3516     return retval;
  3517 }
  3518 
  3519 void proghelp(GOptionContext *context)
  3520 {
  3521     gchar *help;
  3522     fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
  3523     fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
  3524     fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
  3525     fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
  3526       "For details, read the file COPYING.\n",stderr);
  3527     fputs("This is Free Software; "
  3528       "you may redistribute it under certain conditions (GPL);\n",stderr);
  3529     fputs("read the file COPYING for details.\n\n",stderr);
  3530     help=g_option_context_get_help(context,TRUE,NULL);
  3531     fputs(help,stderr);
  3532     g_free(help);
  3533     fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
  3534     fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
  3535       "non-ASCII\n",stderr);
  3536     fputs("characters like accented letters, "
  3537       "lines longer than 75 or shorter than 55,\n",stderr);
  3538     fputs("unbalanced quotes or brackets, "
  3539       "a variety of badly formatted punctuation, \n",stderr);
  3540     fputs("HTML tags, some likely typos. "
  3541       "It is NOT a substitute for human judgement.\n",stderr);
  3542     fputs("\n",stderr);
  3543 }