bookloupe-testing: bookloupe/bookloupe.c@a485f5dcc2de

     1 /*************************************************************************/

     2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */

     3 /*									 */

     4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>			 */

     5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>			 */

     6 /*									 */

     7 /* This program is free software; you can redistribute it and/or modify  */

     8 /* it under the terms of the GNU General Public License as published by  */

     9 /* the Free Software Foundation; either version 2 of the License, or     */

    10 /* (at your option) any later version.					 */

    11 /*									 */

    12 /* This program is distributed in the hope that it will be useful,       */

    13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of	 */

    14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the		 */

    15 /* GNU General Public License for more details.				 */

    16 /*									 */

    17 /* You should have received a copy of the GNU General Public License	 */

    18 /* along with this program. If not, see <http://www.gnu.org/licenses/>.	 */

    19 /*************************************************************************/

    21 #include <stdio.h>

    22 #include <stdlib.h>

    23 #include <string.h>

    24 #include <ctype.h>

    25 #ifdef __WIN32__

    26 #include <windows.h>

    27 #endif

    28 #include <glib.h>

    29 #include <bl/bl.h>

    30 #include "bookloupe.h"

    31 #include "counters.h"

    32 #include "pending.h"

    33 #include "HTMLentities.h"

    35 gchar *prevline;

    37 /* Common typos. */

    38 char *typo[] = {

    39     "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",

    40     "nad", "te", "ig", "acn",  "ahve", "alot", "anbd", "andt", "awya", "aywa",

    41     "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",

    42     "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",

    43     "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",

    44     "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",

    45     "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",

    46     "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",

    47     "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",

    48     "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",

    49     "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",

    50     "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",

    51     "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",

    52     "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",

    53     "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",

    54     "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",

    55     "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",

    56     "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",

    57     "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",

    58     "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",

    59     "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",

    60     "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",

    61     "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",

    62     "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",

    63     "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",

    64     "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",

    65     "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",

    66     "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",

    67     "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",

    68     "se", ""

    69 };

    71 GTree *usertypo;

    73 /* Common abbreviations and other OK words not to query as typos. */

    74 char *okword[] = {

    75     "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",

    76     "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",

    77     "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",

    78     "outbid", "outbids", "frostbite", "frostbitten", ""

    79 };

    81 /* Common abbreviations that cause otherwise unexplained periods. */

    82 char *abbrev[] = {

    83     "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",

    84     "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""

    85 };

    87 /*

    88  * Two-Letter combinations that rarely if ever start words,

    89  * but are common scannos or otherwise common letter combinations.

    90  */

    91 char *nostart[] = {

    92     "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""

    93 };

    95 /*

    96  * Two-Letter combinations that rarely if ever end words,

    97  * but are common scannos or otherwise common letter combinations.

    98  */

    99 char *noend[] = {

   100     "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",

   101     "sw", "gr", "sl", "cl", "iy", ""

   102 };

   104 char *markup[] = {

   105     "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",

   106     "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",

   107     "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",

   108     "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""

   109 };

   111 char *DPmarkup[] = {

   112     "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""

   113 };

   115 char *nocomma[] = {

   116     "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",

   117     "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",

   118     "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",

   119     "during", "let", "toward", "among", ""

   120 };

   122 char *noperiod[] = {

   123     "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",

   124     "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",

   125     "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",

   126     "among", "those", "into", "whom", "having", "thence", ""

   127 };

   129 gboolean pswit[SWITNO];  /* program switches */

   131 gboolean typo_compat,paranoid_compat;

   133 static GOptionEntry options[]={

   134     { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,

   135       "Ignore DP-specific markup", NULL },

   136     { "no-dp", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,

   137       G_OPTION_ARG_NONE, pswit+DP_SWITCH,

   138       "Don't ignore DP-specific markup", NULL },

   139     { "echo", 0, G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,

   140       "Echo queried line", NULL },

   141     { "no-echo", 'e', G_OPTION_FLAG_REVERSE,

   142       G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,

   143       "Don't echo queried line", NULL },

   144     { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,

   145       "Check single quotes", NULL },

   146     { "no-squote", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,

   147       G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,

   148       "Don't check single quotes", NULL },

   149     { "typo", 0, 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,

   150       "Check common typos", NULL },

   151     { "no-typo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,

   152       G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,

   153       "Don't check common typos", NULL },

   154     { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,

   155       "Require closure of quotes on every paragraph", NULL },

   156     { "no-qpara", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,

   157       G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,

   158       "Don't require closure of quotes on every paragraph", NULL },

   159     { "paranoid", 0, G_OPTION_FLAG_HIDDEN,

   160       G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,

   161       "Enable paranoid querying of everything", NULL },

   162     { "no-paranoid", 0, G_OPTION_FLAG_REVERSE,

   163       G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,

   164       "Disable paranoid querying of everything", NULL },

   165     { "line-end", 0, G_OPTION_FLAG_HIDDEN,

   166       G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,

   167       "Enable line end checking", NULL },

   168     { "no-line-end", 'l', G_OPTION_FLAG_REVERSE,

   169       G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,

   170       "Diable line end checking", NULL },

   171     { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,

   172       "Overview: just show counts", NULL },

   173     { "no-overview", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,

   174       G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,

   175       "Show individual warnings", NULL },

   176     { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,

   177       "Output errors to stdout instead of stderr", NULL },

   178     { "no-stdout", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,

   179       G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,

   180       "Output errors to stderr instead of stdout", NULL },

   181     { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,

   182       "Echo header fields", NULL },

   183     { "no-header", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,

   184       G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,

   185       "Don't echo header fields", NULL },

   186     { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,

   187       "Ignore markup in < >", NULL },

   188     { "no-markup", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,

   189       G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,

   190       "No special handling for markup in < >", NULL },

   191     { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,

   192       "Use file of user-defined typos", NULL },

   193     { "no-usertypo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,

   194       G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,

   195       "Ignore file of user-defined typos", NULL },

   196     { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,

   197       "Verbose - list everything", NULL },

   198     { "no-verbose", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,

   199       G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,

   200       "Switch off verbose mode", NULL },

   201     { NULL }

   202 };

   204 /*

   205  * Options relating to configuration which make no sense from inside

   206  * a configuration file.

   207  */

   209 static GOptionEntry config_options[]={

   210     { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,

   211       "Defaults for use on www upload", NULL },

   212     { "dump-config", 0, 0, G_OPTION_ARG_NONE, pswit+DUMP_CONFIG_SWITCH,

   213       "Dump current config settings", NULL },

   214     { NULL }

   215 };

   217 static GOptionEntry compatibility_options[]={

   218     { "toggle-typo", 't', 0, G_OPTION_ARG_NONE, &typo_compat,

   219       "Toggle checking for common typos", NULL },

   220     { "toggle-relaxed", 'x', 0, G_OPTION_ARG_NONE, &paranoid_compat,

   221       "Toggle both paranoid mode and common typos", NULL },

   222     { NULL }

   223 };

   225 long cnt_quote;		/* for overview mode, count of quote queries */

   226 long cnt_brack;		/* for overview mode, count of brackets queries */

   227 long cnt_bin;		/* for overview mode, count of non-ASCII queries */

   228 long cnt_odd;		/* for overview mode, count of odd character queries */

   229 long cnt_long;		/* for overview mode, count of long line errors */

   230 long cnt_short;		/* for overview mode, count of short line queries */

   231 long cnt_punct;		/* for overview mode,

   232 			   count of punctuation and spacing queries */

   233 long cnt_dash;		/* for overview mode, count of dash-related queries */

   234 long cnt_word;		/* for overview mode, count of word queries */

   235 long cnt_html;		/* for overview mode, count of html queries */

   236 long cnt_lineend;	/* for overview mode, count of line-end queries */

   237 long cnt_spacend;	/* count of lines with space at end */

   238 long linecnt;		/* count of total lines in the file */

   239 long checked_linecnt;	/* count of lines actually checked */

   241 void proghelp(GOptionContext *context);

   242 void procfile(const char *);

   244 gchar *running_from;

   246 gboolean mixdigit(const char *);

   247 gchar *getaword(const char **);

   248 char *flgets(char **,long);

   249 void postprocess_for_HTML(char *);

   250 char *linehasmarkup(char *);

   251 char *losemarkup(char *);

   252 gboolean tagcomp(const char *,const char *);

   253 void loseentities(char *);

   254 gboolean isroman(const char *);

   255 void postprocess_for_DP(char *);

   256 void print_as_windows_1252(const char *string);

   257 void print_as_utf_8(const char *string);

   259 GTree *qword,*qperiod;

   261 #ifdef __WIN32__

   262 UINT saved_cp;

   263 #endif

   265 GKeyFile *config;

   267 void config_file_update(GKeyFile *kf)

   268 {

   269     int i;

   270     gboolean sw;

   271     for(i=0;options[i].long_name;i++)

   272     {

   273 	if (g_str_has_prefix(options[i].long_name,"no-"))

   274 	    continue;

   275 	if (options[i].arg==G_OPTION_ARG_NONE)

   276 	{

   277 	    sw=*(gboolean *)options[i].arg_data;

   278 	    if (options[i].flags&G_OPTION_FLAG_REVERSE)

   279 		sw=!sw;

   280 	    g_key_file_set_boolean(kf,"options",options[i].long_name,sw);

   281 	}

   282 	else

   283 	    g_assert_not_reached();

   284     }

   285 }

   287 void config_file_add_comments(GKeyFile *kf)

   288 {

   289     int i;

   290     gchar *comment;

   291     g_key_file_set_comment(kf,NULL,NULL," Default configuration for bookloupe",

   292       NULL);

   293     for(i=0;options[i].long_name;i++)

   294     {

   295 	if (g_str_has_prefix(options[i].long_name,"no-"))

   296 	    continue;

   297 	comment=g_strconcat(" ",options[i].description,NULL);

   298 	g_key_file_set_comment(kf,"options",options[i].long_name,comment,NULL);

   299 	g_free(comment);

   300     }

   301 }

   303 void dump_config(void)

   304 {

   305     gchar *s;

   306     if (config)

   307 	config_file_update(config);

   308     else

   309     {

   310 	config=g_key_file_new();

   311 	config_file_update(config);

   312 	config_file_add_comments(config);

   313     }

   314     s=g_key_file_to_data(config,NULL,NULL);

   315     if (s)

   316 	g_print("%s",s);

   317     g_free(s);

   318 }

   320 GKeyFile *read_config_file(gchar **full_path)

   321 {

   322     int i;

   323     GError *err=NULL;

   324     gchar **search_dirs;

   325     gchar *path;

   326     const char *search_path;

   327     GKeyFile *kf;

   328     kf=g_key_file_new();

   329     search_path=g_getenv("BOOKLOUPE_CONFIG_PATH");

   330     if (search_path)

   331     {

   332 #ifdef __WIN32__

   333 	search_dirs=g_strsplit(search_path,";",0);

   334 #else

   335 	search_dirs=g_strsplit(search_path,":",0);

   336 #endif

   337     }

   338     else

   339     {

   340 	search_dirs=g_new(gchar *,4);

   341 	search_dirs[0]=g_get_current_dir();

   342 	search_dirs[1]=g_strdup(running_from);

   343 	search_dirs[2]=g_strdup(g_get_user_config_dir());

   344 	search_dirs[3]=NULL;

   345     }

   346     for(i=0;search_dirs[i];i++)

   347     {

   348 	path=g_build_filename(search_dirs[i],"bookloupe.ini",NULL);

   349 	if (g_key_file_load_from_file(kf,path,

   350 	  G_KEY_FILE_KEEP_COMMENTS|G_KEY_FILE_KEEP_TRANSLATIONS,&err))

   351 	    break;

   352 	if (!g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))

   353 	{

   354 	    g_printerr("Bookloupe: Error reading %s\n",path);

   355 	    g_printerr("%s\n",err->message);

   356 	    exit(1);

   357 	}

   358 	g_clear_error(&err);

   359 	g_free(path);

   360 	path=NULL;

   361     }

   362     if (!search_dirs[i])

   363     {

   364 	g_key_file_free(kf);

   365 	kf=NULL;

   366     }

   367     g_strfreev(search_dirs);

   368     if (full_path && kf)

   369 	*full_path=path;

   370     else

   371 	g_free(path);

   372     return kf;

   373 }

   375 void parse_config_file(void)

   376 {

   377     int i,j;

   378     gchar *path;

   379     gchar **keys;

   380     gboolean sw;

   381     GError *err=NULL;

   382     config=read_config_file(&path);

   383     if (config)

   384 	keys=g_key_file_get_keys(config,"options",NULL,NULL);

   385     else

   386 	keys=NULL;

   387     if (keys)

   388     {

   389 	for(i=0;keys[i];i++)

   390 	{

   391 	    for(j=0;options[j].long_name;j++)

   392 	    {

   393 		if (g_str_has_prefix(options[j].long_name,"no-"))

   394 		    continue;

   395 		else if (!strcmp(keys[i],options[j].long_name))

   396 		{

   397 		    if (options[j].arg==G_OPTION_ARG_NONE)

   398 		    {

   399 			sw=g_key_file_get_boolean(config,"options",keys[i],

   400 			  &err);

   401 			if (err)

   402 			{

   403 			    g_printerr("Bookloupe: %s: options.%s: %s\n",

   404 			      path,keys[i],err->message);

   405 			    g_clear_error(&err);

   406 			}

   407 			if (options[j].flags&G_OPTION_FLAG_REVERSE)

   408 			    sw=!sw;

   409 			*(gboolean *)options[j].arg_data=sw;

   410 			break;

   411 		    }

   412 		    else

   413 			g_assert_not_reached();

   414 		}

   415 	    }

   416 	    if (!options[j].long_name)

   417 		g_printerr("Bookloupe: %s: Unknown option \"%s\" ignored\n",

   418 		  path,keys[i]);

   419 	}

   420 	g_strfreev(keys);

   421     }

   422     if (config)

   423 	g_free(path);

   424 }

   426 void parse_options(int *argc,char ***argv)

   427 {

   428     GError *err=NULL;

   429     GOptionContext *context;

   430     GOptionGroup *compatibility;

   431     context=g_option_context_new(

   432       "file - look for errors in Project Gutenberg(TM) etexts");

   433     g_option_context_add_main_entries(context,options,NULL);

   434     g_option_context_add_main_entries(context,config_options,NULL);

   435     compatibility=g_option_group_new("compatibility",

   436       "Options for Compatibility with Gutcheck:",

   437       "Show compatibility options",NULL,NULL);

   438     g_option_group_add_entries(compatibility,compatibility_options);

   439     g_option_context_add_group(context,compatibility);

   440     g_option_context_set_description(context,

   441       "For simplicity, only the switch options which reverse the\n"

   442       "default configuration are listed. In most cases, both vanilla\n"

   443       "and \"no-\" prefixed versions are available for use.");

   444     if (!g_option_context_parse(context,argc,argv,&err))

   445     {

   446 	g_printerr("Bookloupe: %s\n",err->message);

   447 	g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);

   448 	exit(1);

   449     }

   450     if (typo_compat)

   451 	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];

   452     if (paranoid_compat)

   453     {

   454 	pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];

   455 	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];

   456     }

   457     /*

   458      * Web uploads - for the moment, this is really just a placeholder

   459      * until we decide what processing we really want to do on web uploads

   460      */

   461     if (pswit[WEB_SWITCH])

   462     {

   463 	/* specific override for web uploads */

   464 	pswit[ECHO_SWITCH]=TRUE;

   465 	pswit[SQUOTE_SWITCH]=FALSE;

   466 	pswit[TYPO_SWITCH]=TRUE;

   467 	pswit[QPARA_SWITCH]=FALSE;

   468 	pswit[PARANOID_SWITCH]=TRUE;

   469 	pswit[LINE_END_SWITCH]=FALSE;

   470 	pswit[OVERVIEW_SWITCH]=FALSE;

   471 	pswit[STDOUT_SWITCH]=FALSE;

   472 	pswit[HEADER_SWITCH]=TRUE;

   473 	pswit[VERBOSE_SWITCH]=FALSE;

   474 	pswit[MARKUP_SWITCH]=FALSE;

   475 	pswit[USERTYPO_SWITCH]=FALSE;

   476 	pswit[DP_SWITCH]=FALSE;

   477     }

   478     if (pswit[DUMP_CONFIG_SWITCH])

   479     {

   480 	dump_config();

   481 	exit(0);

   482     }

   483     if (pswit[OVERVIEW_SWITCH])

   484 	/* just print summary; don't echo */

   485 	pswit[ECHO_SWITCH]=FALSE;

   486     if (*argc<2)

   487     {

   488 	proghelp(context);

   489 	exit(1);

   490     }

   491     g_option_context_free(context);

   492 }

   494 /*

   495  * read_user_scannos:

   496  *

   497  * Read in the user-defined stealth scanno list.

   498  */

   499 void read_user_scannos(void)

   500 {

   501     GError *err=NULL;

   502     gchar *usertypo_file;

   503     gboolean okay;

   504     int i;

   505     gsize len,nb;

   506     gchar *contents,*utf8,**lines;

   507     usertypo_file=g_strdup("bookloupe.typ");

   508     okay=file_get_contents_text(usertypo_file,&contents,&len,&err);

   509     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))

   510     {

   511 	g_clear_error(&err);

   512 	g_free(usertypo_file);

   513 	usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);

   514 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);

   515     }

   516     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))

   517     {

   518 	g_clear_error(&err);

   519 	g_free(usertypo_file);

   520 	usertypo_file=g_strdup("gutcheck.typ");

   521 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);

   522     }

   523     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))

   524     {

   525 	g_clear_error(&err);

   526 	g_free(usertypo_file);

   527 	usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);

   528 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);

   529     }

   530     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))

   531     {

   532 	g_free(usertypo_file);

   533 	g_print("   --> I couldn't find bookloupe.typ "

   534 	  "-- proceeding without user typos.\n");

   535 	return;

   536     }

   537     else if (!okay)

   538     {

   539 	fprintf(stderr,"%s: %s\n",usertypo_file,err->message);

   540 	g_free(usertypo_file);

   541 	g_clear_error(&err);

   542 	exit(1);

   543     }

   544     if (g_utf8_validate(contents,len,NULL))

   545 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);

   546     else

   547 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);

   548     g_free(contents);

   549     lines=g_strsplit_set(utf8,"\r\n",0);

   550     g_free(utf8);

   551     usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);

   552     for (i=0;lines[i];i++)

   553 	if (*(unsigned char *)lines[i]>'!')

   554 	    g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));

   555 	else

   556 	    g_free(lines[i]);

   557     g_free(lines);

   558 }

   560 /*

   561  * read_etext:

   562  *

   563  * Read an etext returning a newly allocated string containing the file

   564  * contents or NULL on error.

   565  */

   566 gchar *read_etext(const char *filename,GError **err)

   567 {

   568     GError *tmp_err=NULL;

   569     gchar *contents,*utf8;

   570     gsize len,bytes_read,bytes_written;

   571     int i,line,col;

   572     if (!g_file_get_contents(filename,&contents,&len,err))

   573 	return NULL;

   574     if (g_utf8_validate(contents,len,NULL))

   575     {

   576 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);

   577 	g_set_print_handler(print_as_utf_8);

   578 #ifdef __WIN32__

   579 	SetConsoleOutputCP(CP_UTF8);

   580 #endif

   581     }

   582     else

   583     {

   584 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,

   585 	  &bytes_written,&tmp_err);

   586 	if (g_error_matches(tmp_err,G_CONVERT_ERROR,

   587 	  G_CONVERT_ERROR_ILLEGAL_SEQUENCE))

   588 	{

   589 	    line=col=1;

   590 	    for(i=0;i<bytes_read;i++)

   591 		if (contents[i]=='\n')

   592 		{

   593 		    line++;

   594 		    col=1;

   595 		}

   596 		else if (contents[i]!='\r')

   597 		    col++;

   598 	    g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,

   599 	      "Input conversion failed. Byte %d at line %d, column %d is not a "

   600 	      "valid Windows-1252 character",

   601 	      ((unsigned char *)contents)[bytes_read],line,col);

   602 	}

   603 	else if (tmp_err)

   604 	    g_propagate_error(err,tmp_err);

   605 	g_set_print_handler(print_as_windows_1252);

   606 #ifdef __WIN32__

   607 	SetConsoleOutputCP(1252);

   608 #endif

   609     }

   610     g_free(contents);

   611     return utf8;

   612 }

   614 void cleanup_on_exit(void)

   615 {

   616 #ifdef __WIN32__

   617     SetConsoleOutputCP(saved_cp);

   618 #endif

   619 }

   621 int main(int argc,char **argv)

   622 {

   623 #ifdef __WIN32__

   624     atexit(cleanup_on_exit);

   625     saved_cp=GetConsoleOutputCP();

   626 #endif

   627     running_from=g_path_get_dirname(argv[0]);

   628     /* Paranoid checking is turned OFF, not on, by its switch */

   629     pswit[PARANOID_SWITCH]=TRUE;

   630     /* if running in paranoid mode, typo checks default to enabled */

   631     pswit[TYPO_SWITCH]=TRUE;

   632     /* Line-end checking is turned OFF, not on, by its switch */

   633     pswit[LINE_END_SWITCH]=TRUE;

   634     /* Echoing is turned OFF, not on, by its switch */

   635     pswit[ECHO_SWITCH]=TRUE;

   636     parse_config_file();

   637     parse_options(&argc,&argv);

   638     if (pswit[USERTYPO_SWITCH])

   639 	read_user_scannos();

   640     fprintf(stderr,"bookloupe: Check and report on an e-text\n");

   641     procfile(argv[1]);

   642     if (pswit[OVERVIEW_SWITCH])

   643     {

   644 	g_print("    Checked %ld lines of %ld (head+foot = %ld)\n\n",

   645 	  checked_linecnt,linecnt,linecnt-checked_linecnt);

   646 	g_print("    --------------- Queries found --------------\n");

   647 	if (cnt_long)

   648 	    g_print("    Long lines:		    %14ld\n",cnt_long);

   649 	if (cnt_short)

   650 	    g_print("    Short lines:		   %14ld\n",cnt_short);

   651 	if (cnt_lineend)

   652 	    g_print("    Line-end problems:	     %14ld\n",cnt_lineend);

   653 	if (cnt_word)

   654 	    g_print("    Common typos:		  %14ld\n",cnt_word);

   655 	if (cnt_quote)

   656 	    g_print("    Unmatched quotes:	      %14ld\n",cnt_quote);

   657 	if (cnt_brack)

   658 	    g_print("    Unmatched brackets:	    %14ld\n",cnt_brack);

   659 	if (cnt_bin)

   660 	    g_print("    Non-ASCII characters:	  %14ld\n",cnt_bin);

   661 	if (cnt_odd)

   662 	    g_print("    Proofing characters:	   %14ld\n",cnt_odd);

   663 	if (cnt_punct)

   664 	    g_print("    Punctuation & spacing queries: %14ld\n",cnt_punct);

   665 	if (cnt_dash)

   666 	    g_print("    Non-standard dashes:	   %14ld\n",cnt_dash);

   667 	if (cnt_html)

   668 	    g_print("    Possible HTML tags:	    %14ld\n",cnt_html);

   669 	g_print("\n");

   670 	g_print("    TOTAL QUERIES		  %14ld\n",

   671 	  cnt_quote+cnt_brack+cnt_bin+cnt_odd+cnt_long+cnt_short+cnt_punct+

   672 	  cnt_dash+cnt_word+cnt_html+cnt_lineend);

   673     }

   674     g_free(running_from);

   675     if (usertypo)

   676 	g_tree_unref(usertypo);

   677     if (config)

   678 	g_key_file_free(config);

   679     return 0;

   680 }

   682 /*

   683  * first_pass:

   684  *

   685  * Run a first pass - verify that it's a valid PG

   686  * file, decide whether to report some things that

   687  * occur many times in the text like long or short

   688  * lines, non-standard dashes, etc.

   689  */

   690 struct first_pass_results *first_pass(const char *etext)

   691 {

   692     gunichar laststart=CHAR_SPACE;

   693     const char *s;

   694     gchar *lc_line;

   695     int i,j,lbytes,llen;

   696     gchar **lines;

   697     unsigned int lastlen=0,lastblen=0;

   698     long spline=0,nspline=0;

   699     static struct first_pass_results results={0};

   700     gchar *inword;

   701     QuoteClass qc;

   702     lines=g_strsplit(etext,"\n",0);

   703     for (j=0;lines[j];j++)

   704     {

   705 	lbytes=strlen(lines[j]);

   706 	while (lbytes>0 && lines[j][lbytes-1]=='\r')

   707 	    lines[j][--lbytes]='\0';

   708 	llen=g_utf8_strlen(lines[j],lbytes);

   709 	linecnt++;

   710 	if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&

   711 	  (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))

   712 	{

   713 	    if (spline)

   714 		g_print("   --> Duplicate header?\n");

   715 	    spline=linecnt+1;   /* first line of non-header text, that is */

   716 	}

   717 	if (!strncmp(lines[j],"*** START",9) &&

   718 	  strstr(lines[j],"PROJECT GUTENBERG"))

   719 	{

   720 	    if (nspline)

   721 		g_print("   --> Duplicate header?\n");

   722 	    nspline=linecnt+1;   /* first line of non-header text, that is */

   723 	}

   724 	if (spline || nspline)

   725 	{

   726 	    lc_line=g_utf8_strdown(lines[j],lbytes);

   727 	    if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))

   728 	    {

   729 		if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))

   730 		{

   731 		    if (results.footerline)

   732 		    {

   733 			/* it's an old-form header - we can detect duplicates */

   734 			if (!nspline)

   735 			    g_print("   --> Duplicate footer?\n");

   736 		    }

   737 		    else

   738 			results.footerline=linecnt;

   739 		}

   740 	    }

   741 	    g_free(lc_line);

   742 	}

   743 	if (spline)

   744 	    results.firstline=spline;

   745 	if (nspline)

   746 	    results.firstline=nspline;  /* override with new */

   747 	if (results.footerline)

   748 	    continue;    /* don't count the boilerplate in the footer */

   749 	results.totlen+=llen;

   750 	for (s=lines[j];*s;s=g_utf8_next_char(s))

   751 	{

   752 	    if (g_utf8_get_char(s)>127)

   753 		results.binlen++;

   754 	    if (g_unichar_isalpha(g_utf8_get_char(s)))

   755 		results.alphalen++;

   756 	    if (s>lines[j])

   757 	    {

   758 		if (CHAR_IS_DQUOTE(g_utf8_get_char(s)))

   759 		    qc=QUOTE_CLASS(g_utf8_get_char(s));

   760 		else

   761 		    qc=INVALID_QUOTE;

   762 		if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) &&

   763 		  isalpha(g_utf8_get_char(g_utf8_prev_char(s))))

   764 		    results.endquote_count++;

   765 	    }

   766 	}

   767 	if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&

   768 	  lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)

   769 	    results.shortline++;

   770 	if (lbytes>0 &&

   771 	  g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)

   772 	    cnt_spacend++;

   773 	if (strstr(lines[j],".,"))

   774 	    results.dotcomma++;

   775 	/* only count ast lines for ignoring purposes where there is */

   776 	/* locase text on the line */

   777 	if (strchr(lines[j],'*'))

   778 	{

   779 	    for (s=lines[j];*s;s=g_utf8_next_char(s))

   780 		if (g_unichar_islower(g_utf8_get_char(s)))

   781 		    break;

   782 	    if (*s)

   783 		results.astline++;

   784 	}

   785 	if (strchr(lines[j],'/'))

   786 	    results.fslashline++;

   787 	if (lbytes>0)

   788 	{

   789 	    for (s=g_utf8_prev_char(lines[j]+lbytes);

   790 	      s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;

   791 	      s=g_utf8_prev_char(s))

   792 		;

   793 	    if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&

   794 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')

   795 		results.hyphens++;

   796 	}

   797 	if (llen>LONGEST_PG_LINE)

   798 	    results.longline++;

   799 	if (llen>WAY_TOO_LONG)

   800 	    results.verylongline++;

   801 	if (strchr(lines[j],'<') && strchr(lines[j],'>'))

   802 	{

   803 	    i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);

   804 	    if (i>0)

   805 		results.htmcount++;

   806 	    if (strstr(lines[j],"<i>"))

   807 		results.htmcount+=4; /* bonus marks! */

   808 	}

   809 	/* Check for spaced em-dashes */

   810 	if (lines[j][0] && (s=strstr(g_utf8_next_char(lines[j]),"--")))

   811 	{

   812 	    results.emdash++;

   813 	    if (s[-1]==CHAR_SPACE || s[2]==CHAR_SPACE)

   814 		results.space_emdash++;

   815 	    if (s[-1]==CHAR_SPACE && s[2]==CHAR_SPACE)

   816 		/* count of em-dashes with spaces both sides */

   817 		results.non_PG_space_emdash++;

   818 	    if (s[-1]!=CHAR_SPACE && s[2]!=CHAR_SPACE)

   819 		/* count of PG-type em-dashes with no spaces */

   820 		results.PG_space_emdash++;

   821 	}

   822 	for (s=lines[j];*s;)

   823 	{

   824 	    inword=getaword(&s);

   825 	    if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))

   826 		results.Dutchcount++;

   827 	    if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))

   828 		results.Frenchcount++;

   829 	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))

   830 		results.standalone_digit++;

   831 	    g_free(inword);

   832 	}

   833 	/* Check for spaced dashes */

   834 	if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')

   835 	    results.spacedash++;

   836 	lastblen=lastlen;

   837 	lastlen=llen;

   838 	laststart=lines[j][0];

   839     }

   840     g_strfreev(lines);

   841     return &results;

   842 }

   844 /*

   845  * report_first_pass:

   846  *

   847  * Make some snap decisions based on the first pass results.

   848  */

   849 struct warnings *report_first_pass(struct first_pass_results *results)

   850 {

   851     static struct warnings warnings={0};

   852     if (cnt_spacend>0)

   853 	g_print("   --> %ld lines in this file have white space at end\n",

   854 	  cnt_spacend);

   855     warnings.dotcomma=1;

   856     if (results->dotcomma>5)

   857     {

   858 	warnings.dotcomma=0;

   859 	g_print("   --> %ld lines in this file contain '.,'. "

   860 	  "Not reporting them.\n",results->dotcomma);

   861     }

   862     /*

   863      * If more than 50 lines, or one-tenth, are short,

   864      * don't bother reporting them.

   865      */

   866     warnings.shortline=1;

   867     if (results->shortline>50 || results->shortline*10>linecnt)

   868     {

   869 	warnings.shortline=0;

   870 	g_print("   --> %ld lines in this file are short. "

   871 	  "Not reporting short lines.\n",results->shortline);

   872     }

   873     /*

   874      * If more than 50 lines, or one-tenth, are long,

   875      * don't bother reporting them.

   876      */

   877     warnings.longline=1;

   878     if (results->longline>50 || results->longline*10>linecnt)

   879     {

   880 	warnings.longline=0;

   881 	g_print("   --> %ld lines in this file are long. "

   882 	  "Not reporting long lines.\n",results->longline);

   883     }

   884     /* If more than 10 lines contain asterisks, don't bother reporting them. */

   885     warnings.ast=1;

   886     if (results->astline>10)

   887     {

   888 	warnings.ast=0;

   889 	g_print("   --> %ld lines in this file contain asterisks. "

   890 	  "Not reporting them.\n",results->astline);

   891     }

   892     /*

   893      * If more than 10 lines contain forward slashes,

   894      * don't bother reporting them.

   895      */

   896     warnings.fslash=1;

   897     if (results->fslashline>10)

   898     {

   899 	warnings.fslash=0;

   900 	g_print("   --> %ld lines in this file contain forward slashes. "

   901 	  "Not reporting them.\n",results->fslashline);

   902     }

   903     /*

   904      * If more than 20 lines contain unpunctuated endquotes,

   905      * don't bother reporting them.

   906      */

   907     warnings.endquote=1;

   908     if (results->endquote_count>20)

   909     {

   910 	warnings.endquote=0;

   911 	g_print("   --> %ld lines in this file contain unpunctuated endquotes. "

   912 	  "Not reporting them.\n",results->endquote_count);

   913     }

   914     /*

   915      * If more than 15 lines contain standalone digits,

   916      * don't bother reporting them.

   917      */

   918     warnings.digit=1;

   919     if (results->standalone_digit>10)

   920     {

   921 	warnings.digit=0;

   922 	g_print("   --> %ld lines in this file contain standalone 0s and 1s. "

   923 	  "Not reporting them.\n",results->standalone_digit);

   924     }

   925     /*

   926      * If more than 20 lines contain hyphens at end,

   927      * don't bother reporting them.

   928      */

   929     warnings.hyphen=1;

   930     if (results->hyphens>20)

   931     {

   932 	warnings.hyphen=0;

   933 	g_print("   --> %ld lines in this file have hyphens at end. "

   934 	  "Not reporting them.\n",results->hyphens);

   935     }

   936     if (results->htmcount>20 && !pswit[MARKUP_SWITCH])

   937     {

   938 	g_print("   --> Looks like this is HTML. Switching HTML mode ON.\n");

   939 	pswit[MARKUP_SWITCH]=1;

   940     }

   941     if (results->verylongline>0)

   942 	g_print("   --> %ld lines in this file are VERY long!\n",

   943 	  results->verylongline);

   944     /*

   945      * If there are more non-PG spaced dashes than PG em-dashes,

   946      * assume it's deliberate.

   947      * Current PG guidelines say don't use them, but older texts do,

   948      * and some people insist on them whatever the guidelines say.

   949      */

   950     warnings.dash=1;

   951     if (results->spacedash+results->non_PG_space_emdash>

   952       results->PG_space_emdash)

   953     {

   954 	warnings.dash=0;

   955 	g_print("   --> There are %ld spaced dashes and em-dashes. "

   956 	  "Not reporting them.\n",

   957 	  results->spacedash+results->non_PG_space_emdash);

   958     }

   959     /* If more than a quarter of characters are hi-bit, bug out. */

   960     warnings.bin=1;

   961     if (results->binlen*4>results->totlen)

   962     {

   963 	g_print("   --> This file does not appear to be ASCII. "

   964 	  "Terminating. Best of luck with it!\n");

   965 	exit(1);

   966     }

   967     if (results->alphalen*4<results->totlen)

   968     {

   969 	g_print("   --> This file does not appear to be text. "

   970 	  "Terminating. Best of luck with it!\n");

   971 	exit(1);

   972     }

   973     if (results->binlen*100>results->totlen || results->binlen>100)

   974     {

   975 	g_print("   --> There are a lot of foreign letters here. "

   976 	  "Not reporting them.\n");

   977 	warnings.bin=0;

   978     }

   979     warnings.isDutch=FALSE;

   980     if (results->Dutchcount>50)

   981     {

   982 	warnings.isDutch=TRUE;

   983 	g_print("   --> This looks like Dutch - "

   984 	  "switching off dashes and warnings for 's Middags case.\n");

   985     }

   986     warnings.isFrench=FALSE;

   987     if (results->Frenchcount>50)

   988     {

   989 	warnings.isFrench=TRUE;

   990 	g_print("   --> This looks like French - "

   991 	  "switching off some doublepunct.\n");

   992     }

   993     if (results->firstline && results->footerline)

   994 	g_print("    The PG header and footer appear to be already on.\n");

   995     else

   996     {

   997 	if (results->firstline)

   998 	    g_print("    The PG header is on - no footer.\n");

   999 	if (results->footerline)

  1000 	    g_print("    The PG footer is on - no header.\n");

  1001     }

  1002     g_print("\n");

  1003     if (pswit[VERBOSE_SWITCH])

  1004     {

  1005 	warnings.bin=1;

  1006 	warnings.shortline=1;

  1007 	warnings.dotcomma=1;

  1008 	warnings.longline=1;

  1009 	warnings.dash=1;

  1010 	warnings.digit=1;

  1011 	warnings.ast=1;

  1012 	warnings.fslash=1;

  1013 	warnings.hyphen=1;

  1014 	warnings.endquote=1;

  1015 	g_print("   *** Verbose output is ON -- you asked for it! ***\n");

  1016     }

  1017     if (warnings.isDutch)

  1018 	warnings.dash=0;

  1019     if (results->footerline>0 && results->firstline>0 &&

  1020       results->footerline>results->firstline &&

  1021       results->footerline-results->firstline<100)

  1022     {

  1023 	g_print("   --> I don't really know where this text starts. \n");

  1024 	g_print("       There are no reference points.\n");

  1025 	g_print("       I'm going to have to report the header and footer "

  1026 	  "as well.\n");

  1027 	results->firstline=0;

  1028     }

  1029     return &warnings;

  1030 }

  1032 /*

  1033  * analyse_quotes:

  1034  *

  1035  * Look along the line, accumulate the count of quotes, and see

  1036  * if this is an empty line - i.e. a line with nothing on it

  1037  * but spaces.

  1038  * If line has just spaces, period, * and/or - on it, don't

  1039  * count it, since empty lines with asterisks or dashes to

  1040  * separate sections are common.

  1041  *

  1042  * Returns: TRUE if the line is empty.

  1043  */

  1044 gboolean analyse_quotes(const char *aline,int linecnt,struct counters *counters)

  1045 {

  1046     int guessquote=0;

  1047     /* assume the line is empty until proven otherwise */

  1048     gboolean isemptyline=TRUE;

  1049     const char *s=aline,*sprev,*snext;

  1050     gunichar c;

  1051     sprev=NULL;

  1052     GError *tmp_err=NULL;

  1053     while (*s)

  1054     {

  1055 	snext=g_utf8_next_char(s);

  1056 	c=g_utf8_get_char(s);

  1057 	if (CHAR_IS_DQUOTE(c))

  1058 	    (void)count_quote(counters,c,QUOTE_CLASS(c),&tmp_err);

  1059 	else if (CHAR_IS_SQUOTE(c) && pswit[SQUOTE_SWITCH])

  1060 	{

  1061 	    if (s==aline)

  1062 	    {

  1063 		/*

  1064 		 * At start of line, it can only be a quotation mark.

  1065 		 * Hardcode a very common exception!

  1066 		 */

  1067 		if (!g_str_has_prefix(snext,"tis") &&

  1068 		  !g_str_has_prefix(snext,"Tis"))

  1069 		    (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);

  1070 	    }

  1071 	    else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&

  1072 	      g_unichar_isalpha(g_utf8_get_char(snext)))

  1073 		/* Do nothing! it's definitely an apostrophe, not a quote */

  1074 		;

  1075 	    /* it's outside a word - let's check it out */

  1076 	    else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||

  1077 	      g_unichar_isalpha(g_utf8_get_char(snext)))

  1078 	    {

  1079 		/* certainly looks like a quotation mark */

  1080 		if (!g_str_has_prefix(snext,"tis") &&

  1081 		  !g_str_has_prefix(snext,"Tis"))

  1082 		    /* hardcode a very common exception! */

  1083 		{

  1084 		    if (strchr(".?!,;:",g_utf8_get_char(sprev)))

  1085 			(void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);

  1086 		    else

  1087 			(void)count_quote(counters,c,OPENING_QUOTE,&tmp_err);

  1088 		}

  1089 	    }

  1090 	    else

  1091 	    {

  1092 		/* now - is it a quotation mark? */

  1093 		guessquote=0;   /* accumulate clues */

  1094 		if (g_unichar_isalpha(g_utf8_get_char(sprev)))

  1095 		{

  1096 		    /* it follows a letter - could be either */

  1097 		    guessquote++;

  1098 		    if (g_utf8_get_char(sprev)=='s')

  1099 		    {

  1100 			/* looks like a plural apostrophe */

  1101 			guessquote-=3;

  1102 			if (g_utf8_get_char(snext)==CHAR_SPACE)

  1103 			    /* bonus marks! */

  1104 			    guessquote-=2;

  1105 		    }

  1106 		    if (innermost_quote_matches(counters,c))

  1107 			/*

  1108 			 * Give it the benefit of some doubt,

  1109 			 * if a squote is already open.

  1110 			 */

  1111 			guessquote++;

  1112 		    else

  1113 			guessquote--;

  1114 		    if (guessquote>=0)

  1115 			(void)count_quote(counters,c,CLOSING_QUOTE,&tmp_err);

  1116 		}

  1117 		else

  1118 		    /* no adjacent letter - it must be a quote of some kind */

  1119 		    (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);

  1120 	    }

  1121 	}

  1122 	if (tmp_err)

  1123 	{

  1124 	    if (pswit[ECHO_SWITCH])

  1125 		g_print("\n%s\n",aline);

  1126 	    if (!pswit[OVERVIEW_SWITCH])

  1127 		g_print("    Line %ld column %ld - %s\n",

  1128 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1,tmp_err->message);

  1129 	    g_clear_error(&tmp_err);

  1130 	}

  1131 	if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&

  1132 	  c!='\r' && c!='\n')

  1133 	    isemptyline=FALSE;  /* ignore lines like  *  *  *  as spacers */

  1134 	if (c==CHAR_UNDERSCORE)

  1135 	    counters->c_unders++;

  1136 	if (c==CHAR_OPEN_SBRACK)

  1137 	{

  1138 	    if (!matching_difference(counters,COUNTER_ILLUSTRATION) &&

  1139 	      !matching_difference(counters,c) && s==aline &&

  1140 	      g_str_has_prefix(s,"[Illustration:"))

  1141 		increment_matching(counters,COUNTER_ILLUSTRATION,TRUE);

  1142 	    else

  1143 		increment_matching(counters,c,TRUE);

  1144 	}

  1145 	else if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK)

  1146 	    increment_matching(counters,c,TRUE);

  1147 	if (c==CHAR_CLOSE_SBRACK)

  1148 	{

  1149 	    if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) &&

  1150 	      !matching_difference(counters,c) && !*snext)

  1151 		increment_matching(counters,COUNTER_ILLUSTRATION,FALSE);

  1152 	    else

  1153 		increment_matching(counters,c,FALSE);

  1154 	}

  1155 	else if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK)

  1156 	    increment_matching(counters,c,FALSE);

  1157 	sprev=s;

  1158 	s=snext;

  1159     }

  1160     return isemptyline;

  1161 }

  1163 /*

  1164  * check_for_control_characters:

  1165  *

  1166  * Check for invalid or questionable characters in the line

  1167  * Anything above 127 is invalid for plain ASCII, and

  1168  * non-printable control characters should also be flagged.

  1169  * Tabs should generally not be there.

  1170  */

  1171 void check_for_control_characters(const char *aline)

  1172 {

  1173     gunichar c;

  1174     const char *s;

  1175     for (s=aline;*s;s=g_utf8_next_char(s))

  1176     {

  1177 	c=g_utf8_get_char(s);

  1178 	if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)

  1179 	{

  1180 	    if (pswit[ECHO_SWITCH])

  1181 		g_print("\n%s\n",aline);

  1182 	    if (!pswit[OVERVIEW_SWITCH])

  1183 		g_print("    Line %ld column %ld - Control character %u\n",

  1184 		  linecnt,g_utf8_pointer_to_offset(s,aline)+1,c);

  1185 	    else

  1186 		cnt_bin++;

  1187 	}

  1188     }

  1189 }

  1191 /*

  1192  * check_for_odd_characters:

  1193  *

  1194  * Check for binary and other odd characters.

  1195  */

  1196 void check_for_odd_characters(const char *aline,const struct warnings *warnings,

  1197   gboolean isemptyline)

  1198 {

  1199     /* Don't repeat multiple warnings on one line. */

  1200     gboolean eNon_A=FALSE,eTab=FALSE,eTilde=FALSE;

  1201     gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;

  1202     const char *s;

  1203     gunichar c;

  1204     for (s=aline;*s;s=g_utf8_next_char(s))

  1205     {

  1206 	c=g_utf8_get_char(s);

  1207 	if (!eNon_A && (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))

  1208 	{

  1209 	    if (pswit[ECHO_SWITCH])

  1210 		g_print("\n%s\n",aline);

  1211 	    if (!pswit[OVERVIEW_SWITCH])

  1212 		if (c>127 && c<160 || c>255)

  1213 		    g_print("    Line %ld column %ld - "

  1214 		      "Non-ISO-8859 character %u\n",

  1215 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);

  1216 		else

  1217 		    g_print("    Line %ld column %ld - "

  1218 		      "Non-ASCII character %u\n",

  1219 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);

  1220 	    else

  1221 		cnt_bin++;

  1222 	    eNon_A=TRUE;

  1223 	}

  1224 	if (!eTab && c==CHAR_TAB)

  1225 	{

  1226 	    if (pswit[ECHO_SWITCH])

  1227 		g_print("\n%s\n",aline);

  1228 	    if (!pswit[OVERVIEW_SWITCH])

  1229 		g_print("    Line %ld column %ld - Tab character?\n",

  1230 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1231 	    else

  1232 		cnt_odd++;

  1233 	    eTab=TRUE;

  1234 	}

  1235 	if (!eTilde && c==CHAR_TILDE)

  1236 	{

  1237 	    /*

  1238 	     * Often used by OCR software to indicate an

  1239 	     * unrecognizable character.

  1240 	     */

  1241 	    if (pswit[ECHO_SWITCH])

  1242 		g_print("\n%s\n",aline);

  1243 	    if (!pswit[OVERVIEW_SWITCH])

  1244 		g_print("    Line %ld column %ld - Tilde character?\n",

  1245 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1246 	    else

  1247 		cnt_odd++;

  1248 	    eTilde=TRUE;

  1249 	}

  1250 	if (!eCarat && c==CHAR_CARAT)

  1251 	{

  1252 	    if (pswit[ECHO_SWITCH])

  1253 		g_print("\n%s\n",aline);

  1254 	    if (!pswit[OVERVIEW_SWITCH])

  1255 		g_print("    Line %ld column %ld - Carat character?\n",

  1256 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1257 	    else

  1258 		cnt_odd++;

  1259 	    eCarat=TRUE;

  1260 	}

  1261 	if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)

  1262 	{

  1263 	    if (pswit[ECHO_SWITCH])

  1264 		g_print("\n%s\n",aline);

  1265 	    if (!pswit[OVERVIEW_SWITCH])

  1266 		g_print("    Line %ld column %ld - Forward slash?\n",

  1267 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1268 	    else

  1269 		cnt_odd++;

  1270 	    eFSlash=TRUE;

  1271 	}

  1272 	/*

  1273 	 * Report asterisks only in paranoid mode,

  1274 	 * since they're often deliberate.

  1275 	 */

  1276 	if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&

  1277 	  c==CHAR_ASTERISK)

  1278 	{

  1279 	    if (pswit[ECHO_SWITCH])

  1280 		g_print("\n%s\n",aline);

  1281 	    if (!pswit[OVERVIEW_SWITCH])

  1282 		g_print("    Line %ld column %ld - Asterisk?\n",

  1283 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1284 	    else

  1285 		cnt_odd++;

  1286 	    eAst=TRUE;

  1287 	}

  1288     }

  1289 }

  1291 /*

  1292  * check_for_long_line:

  1293  *

  1294  * Check for line too long.

  1295  */

  1296 void check_for_long_line(const char *aline)

  1297 {

  1298     if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)

  1299     {

  1300 	if (pswit[ECHO_SWITCH])

  1301 	    g_print("\n%s\n",aline);

  1302 	if (!pswit[OVERVIEW_SWITCH])

  1303 	    g_print("    Line %ld column %ld - Long line %ld\n",

  1304 	      linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));

  1305 	else

  1306 	    cnt_long++;

  1307     }

  1308 }

  1310 /*

  1311  * check_for_short_line:

  1312  *

  1313  * Check for line too short.

  1314  *

  1315  * This one is a bit trickier to implement: we don't want to

  1316  * flag the last line of a paragraph for being short, so we

  1317  * have to wait until we know that our current line is a

  1318  * "normal" line, then report the _previous_ line if it was too

  1319  * short. We also don't want to report indented lines like

  1320  * chapter heads or formatted quotations. We therefore keep

  1321  * last->len as the length of the last line examined, and

  1322  * last->blen as the length of the last but one, and try to

  1323  * suppress unnecessary warnings by checking that both were of

  1324  * "normal" length. We keep the first character of the last

  1325  * line in last->start, and if it was a space, we assume that

  1326  * the formatting is deliberate. I can't figure out a way to

  1327  * distinguish something like a quoted verse left-aligned or

  1328  * the header or footer of a letter from a paragraph of short

  1329  * lines - maybe if I examined the whole paragraph, and if the

  1330  * para has less than, say, 8 lines and if all lines are short,

  1331  * then just assume it's OK? Need to look at some texts to see

  1332  * how often a formula like this would get the right result.

  1333  */

  1334 void check_for_short_line(const char *aline,const struct line_properties *last)

  1335 {

  1336     if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&

  1337       last->len<SHORTEST_PG_LINE && last->blen>1 &&

  1338       last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)

  1339     {

  1340 	if (pswit[ECHO_SWITCH])

  1341 	    g_print("\n%s\n",prevline);

  1342 	if (!pswit[OVERVIEW_SWITCH])

  1343 	    g_print("    Line %ld column %ld - Short line %ld?\n",

  1344 	      linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));

  1345 	else

  1346 	    cnt_short++;

  1347     }

  1348 }

  1350 /*

  1351  * check_for_starting_punctuation:

  1352  *

  1353  * Look for punctuation other than full ellipses at start of line.

  1354  */

  1355 void check_for_starting_punctuation(const char *aline)

  1356 {

  1357     if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&

  1358       !g_str_has_prefix(aline,". . ."))

  1359     {

  1360 	if (pswit[ECHO_SWITCH])

  1361 	    g_print("\n%s\n",aline);

  1362 	if (!pswit[OVERVIEW_SWITCH])

  1363 	    g_print("    Line %ld column 1 - Begins with punctuation?\n",

  1364 	      linecnt);

  1365 	else

  1366 	    cnt_punct++;

  1367     }

  1368 }

  1370 /*

  1371  * check_for_spaced_emdash:

  1372  *

  1373  * Check for spaced em-dashes.

  1374  *

  1375  * We must check _all_ occurrences of "--" on the line

  1376  * hence the loop - even if the first double-dash is OK

  1377  * there may be another that's wrong later on.

  1378  */

  1379 void check_for_spaced_emdash(const char *aline)

  1380 {

  1381     const char *s,*t,*next;

  1382     for (s=aline;t=strstr(s,"--");s=next)

  1383     {

  1384 	next=g_utf8_next_char(g_utf8_next_char(t));

  1385 	if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||

  1386 	  g_utf8_get_char(next)==CHAR_SPACE)

  1387 	{

  1388 	    if (pswit[ECHO_SWITCH])

  1389 		g_print("\n%s\n",aline);

  1390 	    if (!pswit[OVERVIEW_SWITCH])

  1391 		g_print("    Line %ld column %ld - Spaced em-dash?\n",

  1392 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);

  1393 	    else

  1394 		cnt_dash++;

  1395 	}

  1396     }

  1397 }

  1399 /*

  1400  * check_for_spaced_dash:

  1401  *

  1402  * Check for spaced dashes.

  1403  */

  1404 void check_for_spaced_dash(const char *aline)

  1405 {

  1406     const char *s;

  1407     if ((s=strstr(aline," -")))

  1408     {

  1409 	if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')

  1410 	{

  1411 	    if (pswit[ECHO_SWITCH])

  1412 		g_print("\n%s\n",aline);

  1413 	    if (!pswit[OVERVIEW_SWITCH])

  1414 		g_print("    Line %ld column %ld - Spaced dash?\n",

  1415 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1416 	    else

  1417 		cnt_dash++;

  1418 	}

  1419     }

  1420     else if ((s=strstr(aline,"- ")))

  1421     {

  1422 	if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')

  1423 	{

  1424 	    if (pswit[ECHO_SWITCH])

  1425 		g_print("\n%s\n",aline);

  1426 	    if (!pswit[OVERVIEW_SWITCH])

  1427 		g_print("    Line %ld column %ld - Spaced dash?\n",

  1428 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1429 	    else

  1430 		cnt_dash++;

  1431 	}

  1432     }

  1433 }

  1435 /*

  1436  * check_for_unmarked_paragraphs:

  1437  *

  1438  * Check for unmarked paragraphs indicated by separate speakers.

  1439  *

  1440  * May well be false positive:

  1441  * "Bravo!" "Wonderful!" called the crowd.

  1442  * but useful all the same.

  1443  */

  1444 void check_for_unmarked_paragraphs(const char *aline)

  1445 {

  1446     const char *s;

  1447     s=strstr(aline,"\"  \"");

  1448     if (!s)

  1449 	s=strstr(aline,"\" \"");

  1450     if (s)

  1451     {

  1452 	if (pswit[ECHO_SWITCH])

  1453 	    g_print("\n%s\n",aline);

  1454 	if (!pswit[OVERVIEW_SWITCH])

  1455 	    g_print("    Line %ld column %ld - "

  1456 	      "Query missing paragraph break?\n",

  1457 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1458 	else

  1459 	    cnt_punct++;

  1460     }

  1461 }

  1463 /*

  1464  * check_for_jeebies:

  1465  *

  1466  * Check for "to he" and other easy h/b errors.

  1467  *

  1468  * This is a very inadequate effort on the h/b problem,

  1469  * but the phrase "to he" is always an error, whereas "to

  1470  * be" is quite common.

  1471  * Similarly, '"Quiet!", be said.' is a non-be error

  1472  * "to he" is _not_ always an error!:

  1473  *       "Where they went to he couldn't say."

  1474  * Another false positive:

  1475  *       What would "Cinderella" be without the . . .

  1476  * and another: "If he wants to he can see for himself."

  1477  */

  1478 void check_for_jeebies(const char *aline)

  1479 {

  1480     const char *s;

  1481     s=strstr(aline," be could ");

  1482     if (!s)

  1483 	s=strstr(aline," be would ");

  1484     if (!s)

  1485 	s=strstr(aline," was be ");

  1486     if (!s)

  1487 	s=strstr(aline," be is ");

  1488     if (!s)

  1489 	s=strstr(aline," is be ");

  1490     if (!s)

  1491 	s=strstr(aline,"\", be ");

  1492     if (!s)

  1493 	s=strstr(aline,"\" be ");

  1494     if (!s)

  1495 	s=strstr(aline,"\" be ");

  1496     if (!s)

  1497 	s=strstr(aline," to he ");

  1498     if (s)

  1499     {

  1500 	if (pswit[ECHO_SWITCH])

  1501 	    g_print("\n%s\n",aline);

  1502 	if (!pswit[OVERVIEW_SWITCH])

  1503 	    g_print("    Line %ld column %ld - Query he/be error?\n",

  1504 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1505 	else

  1506 	    cnt_word++;

  1507     }

  1508     s=strstr(aline," the had ");

  1509     if (!s)

  1510 	s=strstr(aline," a had ");

  1511     if (!s)

  1512 	s=strstr(aline," they bad ");

  1513     if (!s)

  1514 	s=strstr(aline," she bad ");

  1515     if (!s)

  1516 	s=strstr(aline," he bad ");

  1517     if (!s)

  1518 	s=strstr(aline," you bad ");

  1519     if (!s)

  1520 	s=strstr(aline," i bad ");

  1521     if (s)

  1522     {

  1523 	if (pswit[ECHO_SWITCH])

  1524 	    g_print("\n%s\n",aline);

  1525 	if (!pswit[OVERVIEW_SWITCH])

  1526 	    g_print("    Line %ld column %ld - Query had/bad error?\n",

  1527 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1528 	else

  1529 	    cnt_word++;

  1530     }

  1531     s=strstr(aline,"; hut ");

  1532     if (!s)

  1533 	s=strstr(aline,", hut ");

  1534     if (s)

  1535     {

  1536 	if (pswit[ECHO_SWITCH])

  1537 	    g_print("\n%s\n",aline);

  1538 	if (!pswit[OVERVIEW_SWITCH])

  1539 	    g_print("    Line %ld column %ld - Query hut/but error?\n",

  1540 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1541 	else

  1542 	    cnt_word++;

  1543     }

  1544 }

  1546 /*

  1547  * check_for_mta_from:

  1548  *

  1549  * Special case - angled bracket in front of "From" placed there by an

  1550  * MTA when sending an e-mail.

  1551  */

  1552 void check_for_mta_from(const char *aline)

  1553 {

  1554     const char *s;

  1555     s=strstr(aline,">From");

  1556     if (s)

  1557     {

  1558 	if (pswit[ECHO_SWITCH])

  1559 	    g_print("\n%s\n",aline);

  1560 	if (!pswit[OVERVIEW_SWITCH])

  1561 	    g_print("    Line %ld column %ld - "

  1562 	      "Query angled bracket with From\n",

  1563 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1564 	else

  1565 	    cnt_punct++;

  1566     }

  1567 }

  1569 /*

  1570  * check_for_orphan_character:

  1571  *

  1572  * Check for a single character line -

  1573  * often an overflow from bad wrapping.

  1574  */

  1575 void check_for_orphan_character(const char *aline)

  1576 {

  1577     gunichar c;

  1578     c=g_utf8_get_char(aline);

  1579     if (c && !*g_utf8_next_char(aline))

  1580     {

  1581 	if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))

  1582 	    ; /* Nothing - ignore numerals alone on a line. */

  1583 	else

  1584 	{

  1585 	    if (pswit[ECHO_SWITCH])

  1586 		g_print("\n%s\n",aline);

  1587 	    if (!pswit[OVERVIEW_SWITCH])

  1588 		g_print("    Line %ld column 1 - Query single character line\n",

  1589 		  linecnt);

  1590 	    else

  1591 		cnt_punct++;

  1592 	}

  1593     }

  1594 }

  1596 /*

  1597  * check_for_pling_scanno:

  1598  *

  1599  * Check for I" - often should be !

  1600  */

  1601 void check_for_pling_scanno(const char *aline)

  1602 {

  1603     const char *s;

  1604     s=strstr(aline," I\"");

  1605     if (s)

  1606     {

  1607 	if (pswit[ECHO_SWITCH])

  1608 	    g_print("\n%s\n",aline);

  1609 	if (!pswit[OVERVIEW_SWITCH])

  1610 	    g_print("    Line %ld column %ld - Query I=exclamation mark?\n",

  1611 	      linecnt,g_utf8_pointer_to_offset(aline,s));

  1612 	else

  1613 	    cnt_punct++;

  1614     }

  1615 }

  1617 /*

  1618  * check_for_extra_period:

  1619  *

  1620  * Check for period without a capital letter. Cut-down from gutspell.

  1621  * Only works when it happens on a single line.

  1622  */

  1623 void check_for_extra_period(const char *aline,const struct warnings *warnings)

  1624 {

  1625     const char *s,*t,*s1,*sprev;

  1626     int i;

  1627     gsize len;

  1628     gboolean istypo;

  1629     gchar *testword;

  1630     gunichar c,nc,pc,*decomposition;

  1631     if (pswit[PARANOID_SWITCH])

  1632     {

  1633 	for (t=aline;t=strstr(t,". ");)

  1634 	{

  1635 	    if (t==aline)

  1636 	    {

  1637 		t=g_utf8_next_char(t);

  1638 		/* start of line punctuation is handled elsewhere */

  1639 		continue;

  1640 	    }

  1641 	    if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))

  1642 	    {

  1643 		t=g_utf8_next_char(t);

  1644 		continue;

  1645 	    }

  1646 	    if (warnings->isDutch)

  1647 	    {

  1648 		/* For Frank & Jeroen -- 's Middags case */

  1649 		gunichar c2,c3,c4,c5;

  1650 		c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));

  1651 		c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));

  1652 		c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));

  1653 		c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));

  1654 		if (CHAR_IS_APOSTROPHE(c2) &&

  1655 		  g_unichar_islower(c3) && c4==CHAR_SPACE &&

  1656 		  g_unichar_isupper(c5))

  1657 		{

  1658 		    t=g_utf8_next_char(t);

  1659 		    continue;

  1660 		}

  1661 	    }

  1662 	    s1=g_utf8_next_char(g_utf8_next_char(t));

  1663 	    while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&

  1664 	      !isdigit(g_utf8_get_char(s1)))

  1665 		s1=g_utf8_next_char(s1);

  1666 	    if (g_unichar_islower(g_utf8_get_char(s1)))

  1667 	    {

  1668 		/* we have something to investigate */

  1669 		istypo=TRUE;

  1670 		/* so let's go back and find out */

  1671 		nc=g_utf8_get_char(t);

  1672 		s1=g_utf8_prev_char(t);

  1673 		c=g_utf8_get_char(s1);

  1674 		sprev=g_utf8_prev_char(s1);

  1675 		pc=g_utf8_get_char(sprev);

  1676 		while (s1>=aline &&

  1677 		  (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||

  1678 		  g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&

  1679 		  g_unichar_isalpha(nc)))

  1680 		{

  1681 		    nc=c;

  1682 		    s1=sprev;

  1683 		    c=pc;

  1684 		    sprev=g_utf8_prev_char(s1);

  1685 		    pc=g_utf8_get_char(sprev);

  1686 		}

  1687 		s1=g_utf8_next_char(s1);

  1688 		s=strchr(s1,'.');

  1689 		if (s)

  1690 		    testword=g_strndup(s1,s-s1);

  1691 		else

  1692 		    testword=g_strdup(s1);

  1693 		for (i=0;*abbrev[i];i++)

  1694 		    if (!strcmp(testword,abbrev[i]))

  1695 			istypo=FALSE;

  1696 		if (g_unichar_isdigit(g_utf8_get_char(testword)))

  1697 		    istypo=FALSE;

  1698 		if (!*g_utf8_next_char(testword))

  1699 		    istypo=FALSE;

  1700 		if (isroman(testword))

  1701 		    istypo=FALSE;

  1702 		if (istypo)

  1703 		{

  1704 		    istypo=FALSE;

  1705 		    for (s=testword;*s;s=g_utf8_next_char(s))

  1706 		    {

  1707 			decomposition=g_unicode_canonical_decomposition(

  1708 			  g_utf8_get_char(s),&len);

  1709 			if (g_utf8_strchr("aeiou",-1,decomposition[0]))

  1710 			    istypo=TRUE;

  1711 			g_free(decomposition);

  1712 		    }

  1713 		}

  1714 		if (istypo &&

  1715 		  (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))

  1716 		{

  1717 		    g_tree_insert(qperiod,g_strdup(testword),

  1718 		      GINT_TO_POINTER(1));

  1719 		    if (pswit[ECHO_SWITCH])

  1720 			g_print("\n%s\n",aline);

  1721 		    if (!pswit[OVERVIEW_SWITCH])

  1722 			g_print("    Line %ld column %ld - Extra period?\n",

  1723 			  linecnt,g_utf8_pointer_to_offset(aline,t)+1);

  1724 		    else

  1725 			cnt_punct++;

  1726 		}

  1727 		g_free(testword);

  1728 	    }

  1729 	    t=g_utf8_next_char(t);

  1730 	}

  1731     }

  1732 }

  1734 /*

  1735  * check_for_following_punctuation:

  1736  *

  1737  * Check for words usually not followed by punctuation.

  1738  */

  1739 void check_for_following_punctuation(const char *aline)

  1740 {

  1741     int i;

  1742     const char *s,*wordstart;

  1743     gunichar c;

  1744     gchar *inword,*t;

  1745     if (pswit[TYPO_SWITCH])

  1746     {

  1747 	for (s=aline;*s;)

  1748 	{

  1749 	    wordstart=s;

  1750 	    t=getaword(&s);

  1751 	    if (!*t)

  1752 	    {

  1753 		g_free(t);

  1754 		continue;

  1755 	    }

  1756 	    inword=g_utf8_strdown(t,-1);

  1757 	    g_free(t);

  1758 	    for (i=0;*nocomma[i];i++)

  1759 		if (!strcmp(inword,nocomma[i]))

  1760 		{

  1761 		    c=g_utf8_get_char(s);

  1762 		    if (c==',' || c==';' || c==':')

  1763 		    {

  1764 			if (pswit[ECHO_SWITCH])

  1765 			    g_print("\n%s\n",aline);

  1766 			if (!pswit[OVERVIEW_SWITCH])

  1767 			    g_print("    Line %ld column %ld - "

  1768 			      "Query punctuation after %s?\n",

  1769 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,

  1770 			      inword);

  1771 			else

  1772 			    cnt_punct++;

  1773 		    }

  1774 		}

  1775 	    for (i=0;*noperiod[i];i++)

  1776 		if (!strcmp(inword,noperiod[i]))

  1777 		{

  1778 		    c=g_utf8_get_char(s);

  1779 		    if (c=='.' || c=='!')

  1780 		    {

  1781 			if (pswit[ECHO_SWITCH])

  1782 			    g_print("\n%s\n",aline);

  1783 			if (!pswit[OVERVIEW_SWITCH])

  1784 			    g_print("    Line %ld column %ld - "

  1785 			      "Query punctuation after %s?\n",

  1786 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,

  1787 			      inword);

  1788 			else

  1789 			    cnt_punct++;

  1790 		    }

  1791 		}

  1792 	    g_free(inword);

  1793 	}

  1794     }

  1795 }

  1797 /*

  1798  * check_for_typos:

  1799  *

  1800  * Check for commonly mistyped words,

  1801  * and digits like 0 for O in a word.

  1802  */

  1803 void check_for_typos(const char *aline,struct warnings *warnings)

  1804 {

  1805     const char *s,*t,*nt,*wordstart;

  1806     gchar *inword;

  1807     gunichar *decomposition;

  1808     gchar *testword;

  1809     int i,vowel,consonant,*dupcnt;

  1810     gboolean isdup,istypo,alower;

  1811     gunichar c,pc;

  1812     long offset,len;

  1813     gsize decomposition_len;

  1814     for (s=aline;*s;)

  1815     {

  1816 	wordstart=s;

  1817 	inword=getaword(&s);

  1818 	if (!*inword)

  1819 	{

  1820 	    g_free(inword);

  1821 	    continue; /* don't bother with empty lines */

  1822 	}

  1823 	if (mixdigit(inword))

  1824 	{

  1825 	    if (pswit[ECHO_SWITCH])

  1826 		g_print("\n%s\n",aline);

  1827 	    if (!pswit[OVERVIEW_SWITCH])

  1828 		g_print("    Line %ld column %ld - Query digit in %s\n",

  1829 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);

  1830 	    else

  1831 		cnt_word++;

  1832 	}

  1833 	/*

  1834 	 * Put the word through a series of tests for likely typos and OCR

  1835 	 * errors.

  1836 	 */

  1837 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])

  1838 	{

  1839 	    istypo=FALSE;

  1840 	    alower=FALSE;

  1841 	    for (t=inword;*t;t=g_utf8_next_char(t))

  1842 	    {

  1843 		c=g_utf8_get_char(t);

  1844 		nt=g_utf8_next_char(t);

  1845 		/* lowercase for testing */

  1846 		if (g_unichar_islower(c))

  1847 		    alower=TRUE;

  1848 		if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))

  1849 		{

  1850 		    /*

  1851 		     * We have an uppercase mid-word. However, there are

  1852 		     * common cases:

  1853 		     *   Mac and Mc like McGill

  1854 		     *   French contractions like l'Abbe

  1855 		     */

  1856 		    offset=g_utf8_pointer_to_offset(inword,t);

  1857 		    if (offset>0)

  1858 			pc=g_utf8_get_char(g_utf8_prev_char(t));

  1859 		    else

  1860 			pc='\0';

  1861 		    if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||

  1862 		      offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&

  1863 		      g_utf8_get_char(g_utf8_next_char(nt))=='c' ||

  1864 		      CHAR_IS_APOSTROPHE(pc))

  1865 			; /* do nothing! */

  1866 		    else

  1867 			istypo=TRUE;

  1868 		}

  1869 	    }

  1870 	    testword=g_utf8_casefold(inword,-1);

  1871 	}

  1872 	if (pswit[TYPO_SWITCH])

  1873 	{

  1874 	    /*

  1875 	     * Check for certain unlikely two-letter combinations at word

  1876 	     * start and end.

  1877 	     */

  1878 	    len=g_utf8_strlen(testword,-1);

  1879 	    if (len>1)

  1880 	    {

  1881 		for (i=0;*nostart[i];i++)

  1882 		    if (g_str_has_prefix(testword,nostart[i]))

  1883 			istypo=TRUE;

  1884 		for (i=0;*noend[i];i++)

  1885 		    if (g_str_has_suffix(testword,noend[i]))

  1886 			istypo=TRUE;

  1887 	    }

  1888 	    /* ght is common, gbt never. Like that. */

  1889 	    if (strstr(testword,"cb"))

  1890 		istypo=TRUE;

  1891 	    if (strstr(testword,"gbt"))

  1892 		istypo=TRUE;

  1893 	    if (strstr(testword,"pbt"))

  1894 		istypo=TRUE;

  1895 	    if (strstr(testword,"tbs"))

  1896 		istypo=TRUE;

  1897 	    if (strstr(testword,"mrn"))

  1898 		istypo=TRUE;

  1899 	    if (strstr(testword,"ahle"))

  1900 		istypo=TRUE;

  1901 	    if (strstr(testword,"ihle"))

  1902 		istypo=TRUE;

  1903 	    /*

  1904 	     * "TBE" does happen - like HEARTBEAT - but uncommon.

  1905 	     * Also "TBI" - frostbite, outbid - but uncommon.

  1906 	     * Similarly "ii" like Hawaii, or Pompeii, and in Roman

  1907 	     * numerals, but "ii" is a common scanno.

  1908 	     */

  1909 	    if (strstr(testword,"tbi"))

  1910 		istypo=TRUE;

  1911 	    if (strstr(testword,"tbe"))

  1912 		istypo=TRUE;

  1913 	    if (strstr(testword,"ii"))

  1914 		istypo=TRUE;

  1915 	    /*

  1916 	     * Check for no vowels or no consonants.

  1917 	     * If none, flag a typo.

  1918 	     */

  1919 	    if (!istypo && len>1)

  1920 	    {

  1921 		vowel=consonant=0;

  1922 		for (t=testword;*t;t=g_utf8_next_char(t))

  1923 		{

  1924 		    c=g_utf8_get_char(t);

  1925 		    decomposition=

  1926 		      g_unicode_canonical_decomposition(c,&decomposition_len);

  1927 		    if (c=='y' || g_unichar_isdigit(c))

  1928 		    {

  1929 			/* Yah, this is loose. */

  1930 			vowel++;

  1931 			consonant++;

  1932 		    }

  1933 		    else if (g_utf8_strchr("aeiou",-1,decomposition[0]))

  1934 			vowel++;

  1935 		    else

  1936 			consonant++;

  1937 		    g_free(decomposition);

  1938 		}

  1939 		if (!vowel || !consonant)

  1940 		    istypo=TRUE;

  1941 	    }

  1942 	    /*

  1943 	     * Now exclude the word from being reported if it's in

  1944 	     * the okword list.

  1945 	     */

  1946 	    for (i=0;*okword[i];i++)

  1947 		if (!strcmp(testword,okword[i]))

  1948 		    istypo=FALSE;

  1949 	    /*

  1950 	     * What looks like a typo may be a Roman numeral.

  1951 	     * Exclude these.

  1952 	     */

  1953 	    if (istypo && isroman(testword))

  1954 		istypo=FALSE;

  1955 	    /* Check the manual list of typos. */

  1956 	    if (!istypo)

  1957 		for (i=0;*typo[i];i++)

  1958 		    if (!strcmp(testword,typo[i]))

  1959 			istypo=TRUE;

  1960 	    /*

  1961 	     * Check lowercase s, l, i and m - special cases.

  1962 	     *   "j" - often a semi-colon gone wrong.

  1963 	     *   "d" for a missing apostrophe - he d

  1964 	     *   "n" for "in"

  1965 	     */

  1966 	    if (!istypo && len==1 &&

  1967 	      g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))

  1968 		istypo=TRUE;

  1969 	    if (istypo)

  1970 	    {

  1971 		dupcnt=g_tree_lookup(qword,testword);

  1972 		if (dupcnt)

  1973 		{

  1974 		    (*dupcnt)++;

  1975 		    isdup=!pswit[VERBOSE_SWITCH];

  1976 		}

  1977 		else

  1978 		{

  1979 		    dupcnt=g_new0(int,1);

  1980 		    g_tree_insert(qword,g_strdup(testword),dupcnt);

  1981 		    isdup=FALSE;

  1982 		}

  1983 		if (!isdup)

  1984 		{

  1985 		    if (pswit[ECHO_SWITCH])

  1986 			g_print("\n%s\n",aline);

  1987 		    if (!pswit[OVERVIEW_SWITCH])

  1988 		    {

  1989 			g_print("    Line %ld column %ld - Query word %s",

  1990 			  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,

  1991 			  inword);

  1992 			if (!pswit[VERBOSE_SWITCH])

  1993 			    g_print(" - not reporting duplicates");

  1994 			g_print("\n");

  1995 		    }

  1996 		    else

  1997 			cnt_word++;

  1998 		}

  1999 	    }

  2000 	}

  2001 	/* check the user's list of typos */

  2002 	if (!istypo && usertypo && g_tree_lookup(usertypo,testword))

  2003 	{

  2004 	    if (pswit[ECHO_SWITCH])

  2005 		g_print("\n%s\n",aline);

  2006 	    if (!pswit[OVERVIEW_SWITCH])

  2007 		g_print("    Line %ld column %ld - Query possible scanno %s\n",

  2008 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);

  2009 	}

  2010 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])

  2011 	    g_free(testword);

  2012 	if (pswit[PARANOID_SWITCH] && warnings->digit)

  2013 	{

  2014 	    /* In paranoid mode, query all 0 and 1 standing alone. */

  2015 	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))

  2016 	    {

  2017 		if (pswit[ECHO_SWITCH])

  2018 		    g_print("\n%s\n",aline);

  2019 		if (!pswit[OVERVIEW_SWITCH])

  2020 		    g_print("    Line %ld column %ld - Query standalone %s\n",

  2021 		      linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,

  2022 		      inword);

  2023 		else

  2024 		    cnt_word++;

  2025 	    }

  2026 	}

  2027 	g_free(inword);

  2028     }

  2029 }

  2031 /*

  2032  * check_for_misspaced_punctuation:

  2033  *

  2034  * Look for added or missing spaces around punctuation and quotes.

  2035  * If there is a punctuation character like ! with no space on

  2036  * either side, suspect a missing!space. If there are spaces on

  2037  * both sides , assume a typo. If we see a double quote with no

  2038  * space or punctuation on either side of it, assume unspaced

  2039  * quotes "like"this.

  2040  */

  2041 void check_for_misspaced_punctuation(const char *aline,

  2042   struct parities *parities,gboolean isemptyline)

  2043 {

  2044     gboolean isacro,isellipsis;

  2045     const char *s;

  2046     gunichar c,nc,pc,n2c;

  2047     int parity;

  2048     c=g_utf8_get_char(aline);

  2049     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  2050     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  2051     {

  2052 	pc=c;

  2053 	c=nc;

  2054 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2055 	/* For each character in the line after the first. */

  2056 	if (g_utf8_strchr(".?!,;:_",-1,c))  /* if it's punctuation */

  2057 	{

  2058 	    /* we need to suppress warnings for acronyms like M.D. */

  2059 	    isacro=FALSE;

  2060 	    /* we need to suppress warnings for ellipsis . . . */

  2061 	    isellipsis=FALSE;

  2062 	    /*

  2063 	     * If there are letters on both sides of it or

  2064 	     * if it's strict punctuation followed by an alpha.

  2065 	     */

  2066 	    if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||

  2067 	      g_utf8_strchr("?!,;:",-1,c)))

  2068 	    {

  2069 		if (c=='.')

  2070 		{

  2071 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&

  2072 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')

  2073 			isacro=TRUE;

  2074 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));

  2075 		    if (nc && n2c=='.')

  2076 			isacro=TRUE;

  2077 		}

  2078 		if (!isacro)

  2079 		{

  2080 		    if (pswit[ECHO_SWITCH])

  2081 			g_print("\n%s\n",aline);

  2082 		    if (!pswit[OVERVIEW_SWITCH])

  2083 			g_print("    Line %ld column %ld - Missing space?\n",

  2084 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2085 		    else

  2086 			cnt_punct++;

  2087 		}

  2088 	    }

  2089 	    if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))

  2090 	    {

  2091 		/*

  2092 		 * If there are spaces on both sides,

  2093 		 * or space before and end of line.

  2094 		 */

  2095 		if (c=='.')

  2096 		{

  2097 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&

  2098 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')

  2099 			isellipsis=TRUE;

  2100 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));

  2101 		    if (nc && n2c=='.')

  2102 			isellipsis=TRUE;

  2103 		}

  2104 		if (!isemptyline && !isellipsis)

  2105 		{

  2106 		    if (pswit[ECHO_SWITCH])

  2107 			g_print("\n%s\n",aline);

  2108 		    if (!pswit[OVERVIEW_SWITCH])

  2109 			g_print("    Line %ld column %ld - "

  2110 			  "Spaced punctuation?\n",linecnt,

  2111 			  g_utf8_pointer_to_offset(aline,s)+1);

  2112 		    else

  2113 			cnt_punct++;

  2114 		}

  2115 	    }

  2116 	}

  2117     }

  2118     /* Split out the characters that CANNOT be preceded by space. */

  2119     c=g_utf8_get_char(aline);

  2120     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  2121     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  2122     {

  2123 	pc=c;

  2124 	c=nc;

  2125 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2126 	/* for each character in the line after the first */

  2127 	if (g_utf8_strchr("?!,;:",-1,c))

  2128 	{

  2129 	    /* if it's punctuation that _cannot_ have a space before it */

  2130 	    if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)

  2131 	    {

  2132 		/*

  2133 		 * If nc DOES == space,

  2134 		 * it was already reported just above.

  2135 		 */

  2136 		if (pswit[ECHO_SWITCH])

  2137 		    g_print("\n%s\n",aline);

  2138 		if (!pswit[OVERVIEW_SWITCH])

  2139 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",

  2140 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2141 		else

  2142 		    cnt_punct++;

  2143 	    }

  2144 	}

  2145     }

  2146     /*

  2147      * Special case " .X" where X is any alpha.

  2148      * This plugs a hole in the acronym code above.

  2149      * Inelegant, but maintainable.

  2150      */

  2151     c=g_utf8_get_char(aline);

  2152     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  2153     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  2154     {

  2155 	pc=c;

  2156 	c=nc;

  2157 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2158 	/* for each character in the line after the first */

  2159 	if (c=='.')

  2160 	{

  2161 	    /* if it's a period */

  2162 	    if (pc==CHAR_SPACE && g_unichar_isalpha(nc))

  2163 	    {

  2164 		/*

  2165 		 * If the period follows a space and

  2166 		 * is followed by a letter.

  2167 		 */

  2168 		if (pswit[ECHO_SWITCH])

  2169 		    g_print("\n%s\n",aline);

  2170 		if (!pswit[OVERVIEW_SWITCH])

  2171 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",

  2172 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2173 		else

  2174 		    cnt_punct++;

  2175 	    }

  2176 	}

  2177     }

  2178     c=g_utf8_get_char(aline);

  2179     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  2180     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  2181     {

  2182 	pc=c;

  2183 	c=nc;

  2184 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2185 	/* for each character in the line after the first */

  2186 	if (CHAR_IS_DQUOTE(c))

  2187 	{

  2188 	    if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&

  2189 	      !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||

  2190 	      !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))

  2191 	    {

  2192 		if (pswit[ECHO_SWITCH])

  2193 		    g_print("\n%s\n",aline);

  2194 		if (!pswit[OVERVIEW_SWITCH])

  2195 		    g_print("    Line %ld column %ld - Unspaced quotes?\n",

  2196 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2197 		else

  2198 		    cnt_punct++;

  2199 	    }

  2200 	}

  2201     }

  2202     /* Check parity of quotes. */

  2203     nc=g_utf8_get_char(aline);

  2204     for (s=aline;*s;s=g_utf8_next_char(s))

  2205     {

  2206 	c=nc;

  2207 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2208 	if (CHAR_IS_DQUOTE(c))

  2209 	{

  2210 	    if (c==CHAR_DQUOTE)

  2211 	    {

  2212 		parities->dquote=!parities->dquote;

  2213 		parity=parities->dquote;

  2214 	    }

  2215 	    else if (c==CHAR_LD_QUOTE)

  2216 		parity=1;

  2217 	    else

  2218 		parity=0;

  2219 	    if (!parity)

  2220 	    {

  2221 		/* parity even */

  2222 		if (!g_utf8_strchr("_-.'`/,;:!?)]} ",-1,nc))

  2223 		{

  2224 		    if (pswit[ECHO_SWITCH])

  2225 			g_print("\n%s\n",aline);

  2226 		    if (!pswit[OVERVIEW_SWITCH])

  2227 			g_print("    Line %ld column %ld - "

  2228 			  "Wrongspaced quotes?\n",

  2229 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2230 		    else

  2231 			cnt_punct++;

  2232 		}

  2233 	    }

  2234 	    else

  2235 	    {

  2236 		/* parity odd */

  2237 		if (!g_unichar_isalpha(nc) && !isdigit(nc) &&

  2238 		  !g_utf8_strchr("_-/.'`([{$",-1,nc) || !nc)

  2239 		{

  2240 		    if (pswit[ECHO_SWITCH])

  2241 			g_print("\n%s\n",aline);

  2242 		    if (!pswit[OVERVIEW_SWITCH])

  2243 			g_print("    Line %ld column %ld - "

  2244 			  "Wrongspaced quotes?\n",

  2245 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2246 		    else

  2247 			cnt_punct++;

  2248 		}

  2249 	    }

  2250 	}

  2251     }

  2252     c=g_utf8_get_char(aline);

  2253     if (CHAR_IS_DQUOTE(c))

  2254     {

  2255 	if (g_utf8_strchr(",;:!?)]} ",-1,

  2256 	  g_utf8_get_char(g_utf8_next_char(aline))))

  2257 	{

  2258 	    if (pswit[ECHO_SWITCH])

  2259 		g_print("\n%s\n",aline);

  2260 	    if (!pswit[OVERVIEW_SWITCH])

  2261 		g_print("    Line %ld column 1 - Wrongspaced quotes?\n",

  2262 		  linecnt);

  2263 	    else

  2264 		cnt_punct++;

  2265 	}

  2266     }

  2267     if (pswit[SQUOTE_SWITCH])

  2268     {

  2269 	nc=g_utf8_get_char(aline);

  2270 	for (s=aline;*s;s=g_utf8_next_char(s))

  2271 	{

  2272 	    c=nc;

  2273 	    nc=g_utf8_get_char(g_utf8_next_char(s));

  2274 	    if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&

  2275 	      !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||

  2276 	      !g_unichar_isalpha(nc)))

  2277 	    {

  2278 		parities->squote=!parities->squote;

  2279 		if (!parities->squote)

  2280 		{

  2281 		    /* parity even */

  2282 		    if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))

  2283 		    {

  2284 			if (pswit[ECHO_SWITCH])

  2285 			    g_print("\n%s\n",aline);

  2286 			if (!pswit[OVERVIEW_SWITCH])

  2287 			    g_print("    Line %ld column %ld - "

  2288 			      "Wrongspaced singlequotes?\n",

  2289 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2290 			else

  2291 			    cnt_punct++;

  2292 		    }

  2293 		}

  2294 		else

  2295 		{

  2296 		    /* parity odd */

  2297 		    if (!g_unichar_isalpha(nc) && !isdigit(nc) &&

  2298 		      !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)

  2299 		    {

  2300 			if (pswit[ECHO_SWITCH])

  2301 			    g_print("\n%s\n",aline);

  2302 			if (!pswit[OVERVIEW_SWITCH])

  2303 			    g_print("    Line %ld column %ld - "

  2304 			      "Wrongspaced singlequotes?\n",

  2305 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2306 			else

  2307 			    cnt_punct++;

  2308 		    }

  2309 		}

  2310 	    }

  2311 	}

  2312     }

  2313 }

  2315 /*

  2316  * check_for_double_punctuation:

  2317  *

  2318  * Look for double punctuation like ,. or ,,

  2319  * Thanks to DW for the suggestion!

  2320  * In books with references, ".," and ".;" are common

  2321  * e.g. "etc., etc.," and vol. 1.; vol 3.;

  2322  * OTOH, from my initial tests, there are also fairly

  2323  * common errors. What to do? Make these cases paranoid?

  2324  * ".," is the most common, so warnings->dotcomma is used

  2325  * to suppress detailed reporting if it occurs often.

  2326  */

  2327 void check_for_double_punctuation(const char *aline,struct warnings *warnings)

  2328 {

  2329     const char *s;

  2330     gunichar c,nc;

  2331     nc=g_utf8_get_char(aline);

  2332     for (s=aline;*s;s=g_utf8_next_char(s))

  2333     {

  2334 	c=nc;

  2335 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2336 	/* for each punctuation character in the line */

  2337 	if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&

  2338 	  g_utf8_strchr(".?!,;:",-1,nc))

  2339 	{

  2340 	    /* followed by punctuation, it's a query, unless . . . */

  2341 	    if (c==nc && (c=='.' || c=='?' || c=='!') ||

  2342 	      !warnings->dotcomma && c=='.' && nc==',' ||

  2343 	      warnings->isFrench && g_str_has_prefix(s,",...") ||

  2344 	      warnings->isFrench && g_str_has_prefix(s,"...,") ||

  2345 	      warnings->isFrench && g_str_has_prefix(s,";...") ||

  2346 	      warnings->isFrench && g_str_has_prefix(s,"...;") ||

  2347 	      warnings->isFrench && g_str_has_prefix(s,":...") ||

  2348 	      warnings->isFrench && g_str_has_prefix(s,"...:") ||

  2349 	      warnings->isFrench && g_str_has_prefix(s,"!...") ||

  2350 	      warnings->isFrench && g_str_has_prefix(s,"...!") ||

  2351 	      warnings->isFrench && g_str_has_prefix(s,"?...") ||

  2352 	      warnings->isFrench && g_str_has_prefix(s,"...?"))

  2353 	    {

  2354 		if (warnings->isFrench && g_str_has_prefix(s,",...") ||

  2355 		  warnings->isFrench && g_str_has_prefix(s,"...,") ||

  2356 		  warnings->isFrench && g_str_has_prefix(s,";...") ||

  2357 		  warnings->isFrench && g_str_has_prefix(s,"...;") ||

  2358 		  warnings->isFrench && g_str_has_prefix(s,":...") ||

  2359 		  warnings->isFrench && g_str_has_prefix(s,"...:") ||

  2360 		  warnings->isFrench && g_str_has_prefix(s,"!...") ||

  2361 		  warnings->isFrench && g_str_has_prefix(s,"...!") ||

  2362 		  warnings->isFrench && g_str_has_prefix(s,"?...") ||

  2363 		  warnings->isFrench && g_str_has_prefix(s,"...?"))

  2364 		{

  2365 		    s+=4;

  2366 		    nc=g_utf8_get_char(g_utf8_next_char(s));

  2367 		}

  2368 		; /* do nothing for .. !! and ?? which can be legit */

  2369 	    }

  2370 	    else

  2371 	    {

  2372 		if (pswit[ECHO_SWITCH])

  2373 		    g_print("\n%s\n",aline);

  2374 		if (!pswit[OVERVIEW_SWITCH])

  2375 		    g_print("    Line %ld column %ld - Double punctuation?\n",

  2376 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2377 		else

  2378 		    cnt_punct++;

  2379 	    }

  2380 	}

  2381     }

  2382 }

  2384 /*

  2385  * check_for_spaced_quotes:

  2386  */

  2387 void check_for_spaced_quotes(const char *aline)

  2388 {

  2389     int i;

  2390     const char *s,*t;

  2391     const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,

  2392       CHAR_RS_QUOTE};

  2393     GString *pattern;

  2394     s=aline;

  2395     while ((t=strstr(s," \" ")))

  2396     {

  2397 	if (pswit[ECHO_SWITCH])

  2398 	    g_print("\n%s\n",aline);

  2399 	if (!pswit[OVERVIEW_SWITCH])

  2400 	    g_print("    Line %ld column %ld - Spaced doublequote?\n",

  2401 	      linecnt,g_utf8_pointer_to_offset(aline,t)+1);

  2402 	else

  2403 	    cnt_punct++;

  2404 	s=g_utf8_next_char(g_utf8_next_char(t));

  2405     }

  2406     pattern=g_string_new(NULL);

  2407     for(i=0;i<G_N_ELEMENTS(single_quotes);i++)

  2408     {

  2409 	g_string_assign(pattern," ");

  2410 	g_string_append_unichar(pattern,single_quotes[i]);

  2411 	g_string_append_c(pattern,' ');

  2412 	s=aline;

  2413 	while ((t=strstr(s,pattern->str)))

  2414 	{

  2415 	    if (pswit[ECHO_SWITCH])

  2416 		g_print("\n%s\n",aline);

  2417 	    if (!pswit[OVERVIEW_SWITCH])

  2418 		g_print("    Line %ld column %ld - Spaced singlequote?\n",

  2419 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);

  2420 	    else

  2421 		cnt_punct++;

  2422 	    s=g_utf8_next_char(g_utf8_next_char(t));

  2423 	}

  2424     }

  2425     g_string_free(pattern,TRUE);

  2426 }

  2428 /*

  2429  * check_for_miscased_genative:

  2430  *

  2431  * Check special case of 'S instead of 's at end of word.

  2432  */

  2433 void check_for_miscased_genative(const char *aline)

  2434 {

  2435     const char *s;

  2436     gunichar c,nc,pc;

  2437     if (!*aline)

  2438 	return;

  2439     c=g_utf8_get_char(aline);

  2440     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  2441     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  2442     {

  2443 	pc=c;

  2444 	c=nc;

  2445 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2446 	if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))

  2447 	{

  2448 	    if (pswit[ECHO_SWITCH])

  2449 		g_print("\n%s\n",aline);

  2450 	    if (!pswit[OVERVIEW_SWITCH])

  2451 		g_print("    Line %ld column %ld - Capital \"S\"?\n",

  2452 		  linecnt,g_utf8_pointer_to_offset(aline,s)+2);

  2453 	    else

  2454 		cnt_punct++;

  2455 	}

  2456     }

  2457 }

  2459 /*

  2460  * check_end_of_line:

  2461  *

  2462  * Now check special cases - start and end of line -

  2463  * for single and double quotes. Start is sometimes [sic]

  2464  * but better to query it anyway.

  2465  * While we're here, check for dash at end of line.

  2466  */

  2467 void check_end_of_line(const char *aline,struct warnings *warnings)

  2468 {

  2469     int lbytes;

  2470     const char *s;

  2471     gunichar c1,c2;

  2472     lbytes=strlen(aline);

  2473     if (g_utf8_strlen(aline,lbytes)>1)

  2474     {

  2475 	s=g_utf8_prev_char(aline+lbytes);

  2476 	c1=g_utf8_get_char(s);

  2477 	c2=g_utf8_get_char(g_utf8_prev_char(s));

  2478 	if ((CHAR_IS_DQUOTE(c1) || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)

  2479 	{

  2480 	    if (pswit[ECHO_SWITCH])

  2481 		g_print("\n%s\n",aline);

  2482 	    if (!pswit[OVERVIEW_SWITCH])

  2483 		g_print("    Line %ld column %ld - Spaced quote?\n",linecnt,

  2484 		  g_utf8_strlen(aline,lbytes));

  2485 	    else

  2486 		cnt_punct++;

  2487 	}

  2488 	c1=g_utf8_get_char(aline);

  2489 	c2=g_utf8_get_char(g_utf8_next_char(aline));

  2490 	if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)

  2491 	{

  2492 	    if (pswit[ECHO_SWITCH])

  2493 		g_print("\n%s\n",aline);

  2494 	    if (!pswit[OVERVIEW_SWITCH])

  2495 		g_print("    Line %ld column 1 - Spaced quote?\n",linecnt);

  2496 	    else

  2497 		cnt_punct++;

  2498 	}

  2499 	/*

  2500 	 * Dash at end of line may well be legit - paranoid mode only

  2501 	 * and don't report em-dash at line-end.

  2502 	 */

  2503 	if (pswit[PARANOID_SWITCH] && warnings->hyphen)

  2504 	{

  2505 	    for (s=g_utf8_prev_char(aline+lbytes);

  2506 	      s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))

  2507 		;

  2508 	    if (g_utf8_get_char(s)=='-' &&

  2509 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')

  2510 	    {

  2511 		if (pswit[ECHO_SWITCH])

  2512 		    g_print("\n%s\n",aline);

  2513 		if (!pswit[OVERVIEW_SWITCH])

  2514 		    g_print("    Line %ld column %ld - "

  2515 		      "Hyphen at end of line?\n",

  2516 		      linecnt,g_utf8_pointer_to_offset(aline,s));

  2517 	    }

  2518 	}

  2519     }

  2520 }

  2522 /*

  2523  * check_for_unspaced_bracket:

  2524  *

  2525  * Brackets are often unspaced, but shouldn't be surrounded by alpha.

  2526  * If so, suspect a scanno like "a]most".

  2527  */

  2528 void check_for_unspaced_bracket(const char *aline)

  2529 {

  2530     const char *s;

  2531     gunichar c,nc,pc;

  2532     c=g_utf8_get_char(aline);

  2533     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  2534     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  2535     {

  2536 	pc=c;

  2537 	c=nc;

  2538 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2539 	if (!nc)

  2540 	    break;

  2541 	/* for each bracket character in the line except 1st & last */

  2542 	if (g_utf8_strchr("{[()]}",-1,c) &&

  2543 	  g_unichar_isalpha(pc) && g_unichar_isalpha(nc))

  2544 	{

  2545 	    if (pswit[ECHO_SWITCH])

  2546 		g_print("\n%s\n",aline);

  2547 	    if (!pswit[OVERVIEW_SWITCH])

  2548 		g_print("    Line %ld column %ld - Unspaced bracket?\n",

  2549 		  linecnt,g_utf8_pointer_to_offset(aline,s));

  2550 	    else

  2551 		cnt_punct++;

  2552 	}

  2553     }

  2554 }

  2556 /*

  2557  * check_for_unpunctuated_endquote:

  2558  */

  2559 void check_for_unpunctuated_endquote(const char *aline)

  2560 {

  2561     const char *s;

  2562     gunichar c,nc,pc;

  2563     QuoteClass qc;

  2564     c=g_utf8_get_char(aline);

  2565     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  2566     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  2567     {

  2568 	pc=c;

  2569 	c=nc;

  2570 	qc=CHAR_IS_DQUOTE(c)?QUOTE_CLASS(c):INVALID_QUOTE;

  2571 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2572 	/* for each character in the line except 1st */

  2573 	if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) && isalpha(pc))

  2574 	{

  2575 	    if (pswit[ECHO_SWITCH])

  2576 		g_print("\n%s\n",aline);

  2577 	    if (!pswit[OVERVIEW_SWITCH])

  2578 		g_print("    Line %ld column %ld - "

  2579 		  "endquote missing punctuation?\n",

  2580 		  linecnt,g_utf8_pointer_to_offset(aline,s));

  2581 	    else

  2582 		cnt_punct++;

  2583 	}

  2584     }

  2585 }

  2587 /*

  2588  * check_for_html_tag:

  2589  *

  2590  * Check for <HTML TAG>.

  2591  *

  2592  * If there is a < in the line, followed at some point

  2593  * by a > then we suspect HTML.

  2594  */

  2595 void check_for_html_tag(const char *aline)

  2596 {

  2597     const char *open,*close;

  2598     gchar *tag;

  2599     open=strchr(aline,'<');

  2600     if (open)

  2601     {

  2602 	close=strchr(g_utf8_next_char(open),'>');

  2603 	if (close)

  2604 	{

  2605 	    if (pswit[ECHO_SWITCH])

  2606 		g_print("\n%s\n",aline);

  2607 	    if (!pswit[OVERVIEW_SWITCH])

  2608 	    {

  2609 		tag=g_strndup(open,close-open+1);

  2610 		g_print("    Line %ld column %ld - HTML Tag? %s \n",

  2611 		  linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);

  2612 		g_free(tag);

  2613 	    }

  2614 	    else

  2615 		cnt_html++;

  2616 	}

  2617     }

  2618 }

  2620 /*

  2621  * check_for_html_entity:

  2622  *

  2623  * Check for &symbol; HTML.

  2624  *

  2625  * If there is a & in the line, followed at

  2626  * some point by a ; then we suspect HTML.

  2627  */

  2628 void check_for_html_entity(const char *aline)

  2629 {

  2630     const char *s,*amp,*scolon;

  2631     gchar *entity;

  2632     amp=strchr(aline,'&');

  2633     if (amp)

  2634     {

  2635 	scolon=strchr(amp,';');

  2636 	if (scolon)

  2637 	{

  2638 	    for (s=amp;s<scolon;s=g_utf8_next_char(s))

  2639 		if (g_utf8_get_char(s)==CHAR_SPACE)

  2640 		    break;		/* Don't report "Jones & Son;" */

  2641 	    if (s>=scolon)

  2642 	    {

  2643 		if (pswit[ECHO_SWITCH])

  2644 		    g_print("\n%s\n",aline);

  2645 		if (!pswit[OVERVIEW_SWITCH])

  2646 		{

  2647 		    entity=g_strndup(amp,scolon-amp+1);

  2648 		    g_print("    Line %ld column %d - HTML symbol? %s \n",

  2649 		      linecnt,(int)(amp-aline)+1,entity);

  2650 		    g_free(entity);

  2651 		}

  2652 		else

  2653 		    cnt_html++;

  2654 	    }

  2655 	}

  2656     }

  2657 }

  2659 /*

  2660  * check_for_omitted_punctuation:

  2661  *

  2662  * Check for omitted punctuation at end of paragraph by working back

  2663  * through prevline. DW.

  2664  * Need to check this only for "normal" paras.

  2665  * So what is a "normal" para?

  2666  *    Not normal if one-liner (chapter headings, etc.)

  2667  *    Not normal if doesn't contain at least one locase letter

  2668  *    Not normal if starts with space

  2669  */

  2670 void check_for_omitted_punctuation(const char *prevline,

  2671   struct line_properties *last,int start_para_line)

  2672 {

  2673     gboolean letter_on_line=FALSE;

  2674     const char *s;

  2675     gunichar c;

  2676     gboolean closing_quote;

  2677     for (s=prevline;*s;s=g_utf8_next_char(s))

  2678 	if (g_unichar_isalpha(g_utf8_get_char(s)))

  2679 	{

  2680 	    letter_on_line=TRUE;

  2681 	    break;

  2682 	}

  2683     /*

  2684      * This next "if" is a problem.

  2685      * If we say "start_para_line <= linecnt - 1", that includes

  2686      * one-line "paragraphs" like chapter heads. Lotsa false positives.

  2687      * If we say "start_para_line < linecnt - 1" it doesn't, but then it

  2688      * misses genuine one-line paragraphs.

  2689      */

  2690     if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&

  2691       g_utf8_get_char(prevline)>CHAR_SPACE)

  2692     {

  2693 	s=prevline+strlen(prevline);

  2694 	do

  2695 	{

  2696 	    s=g_utf8_prev_char(s);

  2697 	    c=g_utf8_get_char(s);

  2698 	    if (QUOTE_CLASS(c)==CLOSING_QUOTE || QUOTE_CLASS(c)==NEUTRAL_QUOTE)

  2699 		closing_quote=TRUE;

  2700 	    else

  2701 		closing_quote=FALSE;

  2702 	} while (closing_quote && s>prevline);

  2703 	for (;s>prevline;s=g_utf8_prev_char(s))

  2704 	{

  2705 	    if (g_unichar_isalpha(g_utf8_get_char(s)))

  2706 	    {

  2707 		if (pswit[ECHO_SWITCH])

  2708 		    g_print("\n%s\n",prevline);

  2709 		if (!pswit[OVERVIEW_SWITCH])

  2710 		    g_print("    Line %ld column %ld - "

  2711 		      "No punctuation at para end?\n",

  2712 		      linecnt-1,g_utf8_strlen(prevline,-1));

  2713 		else

  2714 		    cnt_punct++;

  2715 		break;

  2716 	    }

  2717 	    if (g_utf8_strchr("-.:!([{?}])",-1,g_utf8_get_char(s)))

  2718 		break;

  2719 	}

  2720     }

  2721 }

  2723 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)

  2724 {

  2725     const char *word=key;

  2726     int *dupcnt=value;

  2727     if (*dupcnt)

  2728 	g_print("\nNote: Queried word %s was duplicated %d times\n",

  2729 	  word,*dupcnt);

  2730     return FALSE;

  2731 }

  2733 void print_as_windows_1252(const char *string)

  2734 {

  2735     gsize inbytes,outbytes;

  2736     gchar *buf,*bp;

  2737     static GIConv converter=(GIConv)-1;

  2738     if (!string)

  2739     {

  2740 	if (converter!=(GIConv)-1)

  2741 	    g_iconv_close(converter);

  2742 	converter=(GIConv)-1;

  2743 	return;

  2744     }

  2745     if (converter==(GIConv)-1)

  2746 	converter=g_iconv_open("WINDOWS-1252","UTF-8");

  2747     if (converter!=(GIConv)-1)

  2748     {

  2749 	inbytes=outbytes=strlen(string);

  2750 	bp=buf=g_malloc(outbytes+1);

  2751 	g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);

  2752 	*bp='\0';

  2753 	fputs(buf,stdout);

  2754 	g_free(buf);

  2755     }

  2756     else

  2757 	fputs(string,stdout);

  2758 }

  2760 void print_as_utf_8(const char *string)

  2761 {

  2762     fputs(string,stdout);

  2763 }

  2765 /*

  2766  * procfile:

  2767  *

  2768  * Process one file.

  2769  */

  2770 void procfile(const char *filename)

  2771 {

  2772     const char *s;

  2773     gchar *parastart=NULL;	/* first line of current para */

  2774     gchar *etext,*aline;

  2775     gchar *etext_ptr;

  2776     GError *err=NULL;

  2777     struct first_pass_results *first_pass_results;

  2778     struct warnings *warnings;

  2779     struct counters counters={0};

  2780     struct line_properties last={0};

  2781     struct parities parities={0};

  2782     struct pending pending={0};

  2783     gboolean isemptyline;

  2784     long start_para_line=0;

  2785     gboolean isnewpara=FALSE,enddash=FALSE;

  2786     last.start=CHAR_SPACE;

  2787     linecnt=checked_linecnt=0;

  2788     etext=read_etext(filename,&err);

  2789     if (!etext)

  2790     {

  2791 	if (pswit[STDOUT_SWITCH])

  2792 	    fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);

  2793 	else

  2794 	    fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);

  2795 	exit(1);

  2796     }

  2797     g_print("\n\nFile: %s\n\n",filename);

  2798     first_pass_results=first_pass(etext);

  2799     warnings=report_first_pass(first_pass_results);

  2800     qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);

  2801     qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);

  2802     /*

  2803      * Here we go with the main pass. Hold onto yer hat!

  2804      */

  2805     linecnt=0;

  2806     etext_ptr=etext;

  2807     while ((aline=flgets(&etext_ptr,linecnt+1)))

  2808     {

  2809 	linecnt++;

  2810 	if (linecnt==1)

  2811 	    isnewpara=TRUE;

  2812 	if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))

  2813 	    continue;    // skip DP page separators completely

  2814 	if (linecnt<first_pass_results->firstline ||

  2815 	  (first_pass_results->footerline>0 &&

  2816 	  linecnt>first_pass_results->footerline))

  2817 	{

  2818 	    if (pswit[HEADER_SWITCH])

  2819 	    {

  2820 		if (g_str_has_prefix(aline,"Title:"))

  2821 		    g_print("    %s\n",aline);

  2822 		if (g_str_has_prefix(aline,"Author:"))

  2823 		    g_print("    %s\n",aline);

  2824 		if (g_str_has_prefix(aline,"Release Date:"))

  2825 		    g_print("    %s\n",aline);

  2826 		if (g_str_has_prefix(aline,"Edition:"))

  2827 		    g_print("    %s\n\n",aline);

  2828 	    }

  2829 	    continue;		/* skip through the header */

  2830 	}

  2831 	checked_linecnt++;

  2832 	print_pending(aline,parastart,&pending);

  2833 	isemptyline=analyse_quotes(aline,linecnt,&counters);

  2834 	if (isnewpara && !isemptyline)

  2835 	{

  2836 	    /* This line is the start of a new paragraph. */

  2837 	    start_para_line=linecnt;

  2838 	    /* Capture its first line in case we want to report it later. */

  2839 	    g_free(parastart);

  2840 	    parastart=g_strdup(aline);

  2841 	    memset(&parities,0,sizeof(parities));  /* restart the quote count */

  2842 	    s=aline;

  2843 	    while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&

  2844 	      !g_unichar_isdigit(g_utf8_get_char(s)))

  2845 		s=g_utf8_next_char(s);

  2846 	    if (g_unichar_islower(g_utf8_get_char(s)))

  2847 	    {

  2848 		/* and its first letter is lowercase */

  2849 		if (pswit[ECHO_SWITCH])

  2850 		    g_print("\n%s\n",aline);

  2851 		if (!pswit[OVERVIEW_SWITCH])

  2852 		    g_print("    Line %ld column %ld - "

  2853 		      "Paragraph starts with lower-case\n",

  2854 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2855 		else

  2856 		    cnt_punct++;

  2857 	    }

  2858 	    isnewpara=FALSE; /* Signal the end of new para processing. */

  2859 	}

  2860 	/* Check for an em-dash broken at line end. */

  2861 	if (enddash && g_utf8_get_char(aline)=='-')

  2862 	{

  2863 	    if (pswit[ECHO_SWITCH])

  2864 		g_print("\n%s\n",aline);

  2865 	    if (!pswit[OVERVIEW_SWITCH])

  2866 		g_print("    Line %ld column 1 - Broken em-dash?\n",linecnt);

  2867 	    else

  2868 		cnt_punct++;

  2869 	}

  2870 	enddash=FALSE;

  2871 	for (s=g_utf8_prev_char(aline+strlen(aline));

  2872 	  g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))

  2873 	    ;

  2874 	if (s>=aline && g_utf8_get_char(s)=='-')

  2875 	    enddash=TRUE;

  2876 	check_for_control_characters(aline);

  2877 	if (warnings->bin)

  2878 	    check_for_odd_characters(aline,warnings,isemptyline);

  2879 	if (warnings->longline)

  2880 	    check_for_long_line(aline);

  2881 	if (warnings->shortline)

  2882 	    check_for_short_line(aline,&last);

  2883 	last.blen=last.len;

  2884 	last.len=g_utf8_strlen(aline,-1);

  2885 	last.start=g_utf8_get_char(aline);

  2886 	check_for_starting_punctuation(aline);

  2887 	if (warnings->dash)

  2888 	{

  2889 	    check_for_spaced_emdash(aline);

  2890 	    check_for_spaced_dash(aline);

  2891 	}

  2892 	check_for_unmarked_paragraphs(aline);

  2893 	check_for_jeebies(aline);

  2894 	check_for_mta_from(aline);

  2895 	check_for_orphan_character(aline);

  2896 	check_for_pling_scanno(aline);

  2897 	check_for_extra_period(aline,warnings);

  2898 	check_for_following_punctuation(aline);

  2899 	check_for_typos(aline,warnings);

  2900 	check_for_misspaced_punctuation(aline,&parities,isemptyline);

  2901 	check_for_double_punctuation(aline,warnings);

  2902 	check_for_spaced_quotes(aline);

  2903 	check_for_miscased_genative(aline);

  2904 	check_end_of_line(aline,warnings);

  2905 	check_for_unspaced_bracket(aline);

  2906 	if (warnings->endquote)

  2907 	    check_for_unpunctuated_endquote(aline);

  2908 	check_for_html_tag(aline);

  2909 	check_for_html_entity(aline);

  2910 	if (isemptyline)

  2911 	{

  2912 	    check_for_mismatched_quotes(&counters,&pending);

  2913 	    counters_reset(&counters);

  2914 	    /* let the next iteration know that it's starting a new para */

  2915 	    isnewpara=TRUE;

  2916 	    if (prevline)

  2917 		check_for_omitted_punctuation(prevline,&last,start_para_line);

  2918 	}

  2919 	g_free(prevline);

  2920 	prevline=g_strdup(aline);

  2921     }

  2922     linecnt++;

  2923     check_for_mismatched_quotes(&counters,&pending);

  2924     print_pending(NULL,parastart,&pending);

  2925     reset_pending(&pending);

  2926     if (prevline)

  2927     {

  2928 	g_free(prevline);

  2929 	prevline=NULL;

  2930     }

  2931     g_free(parastart);

  2932     g_free(prevline);

  2933     g_free(etext);

  2934     if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])

  2935 	g_tree_foreach(qword,report_duplicate_queries,NULL);

  2936     g_tree_unref(qword);

  2937     g_tree_unref(qperiod);

  2938     counters_destroy(&counters);

  2939     g_set_print_handler(NULL);

  2940     print_as_windows_1252(NULL);

  2941     if (pswit[MARKUP_SWITCH])

  2942 	loseentities(NULL);

  2943 }

  2945 /*

  2946  * flgets:

  2947  *

  2948  * Get one line from the input text, checking for

  2949  * the existence of exactly one CR/LF line-end per line.

  2950  *

  2951  * Returns: a pointer to the line.

  2952  */

  2953 char *flgets(char **etext,long lcnt)

  2954 {

  2955     gunichar c;

  2956     gboolean isCR=FALSE;

  2957     char *theline=*etext;

  2958     char *eos=theline;

  2959     gchar *s;

  2960     for (;;)

  2961     {

  2962 	c=g_utf8_get_char(*etext);

  2963 	*etext=g_utf8_next_char(*etext);

  2964 	if (!c)

  2965 	    return NULL;

  2966 	/* either way, it's end of line */

  2967 	if (c=='\n')

  2968 	{

  2969 	    if (isCR)

  2970 		break;

  2971 	    else

  2972 	    {

  2973 		/* Error - a LF without a preceding CR */

  2974 		if (pswit[LINE_END_SWITCH])

  2975 		{

  2976 		    if (pswit[ECHO_SWITCH])

  2977 		    {

  2978 			s=g_strndup(theline,eos-theline);

  2979 			g_print("\n%s\n",s);

  2980 			g_free(s);

  2981 		    }

  2982 		    if (!pswit[OVERVIEW_SWITCH])

  2983 			g_print("    Line %ld - No CR?\n",lcnt);

  2984 		    else

  2985 			cnt_lineend++;

  2986 		}

  2987 		break;

  2988 	    }

  2989 	}

  2990 	if (c=='\r')

  2991 	{

  2992 	    if (isCR)

  2993 	    {

  2994 		/* Error - two successive CRs */

  2995 		if (pswit[LINE_END_SWITCH])

  2996 		{

  2997 		    if (pswit[ECHO_SWITCH])

  2998 		    {

  2999 			s=g_strndup(theline,eos-theline);

  3000 			g_print("\n%s\n",s);

  3001 			g_free(s);

  3002 		    }

  3003 		    if (!pswit[OVERVIEW_SWITCH])

  3004 			g_print("    Line %ld - Two successive CRs?\n",lcnt);

  3005 		    else

  3006 			cnt_lineend++;

  3007 		}

  3008 	    }

  3009 	    isCR=TRUE;

  3010 	}

  3011 	else

  3012 	{

  3013 	    if (pswit[LINE_END_SWITCH] && isCR)

  3014 	    {

  3015 		if (pswit[ECHO_SWITCH])

  3016 		{

  3017 		    s=g_strndup(theline,eos-theline);

  3018 		    g_print("\n%s\n",s);

  3019 		    g_free(s);

  3020 		}

  3021 		if (!pswit[OVERVIEW_SWITCH])

  3022 		    g_print("    Line %ld column %ld - CR without LF?\n",

  3023 		      lcnt,g_utf8_pointer_to_offset(theline,eos)+1);

  3024 		else

  3025 		    cnt_lineend++;

  3026 		*eos=' ';

  3027 	    }

  3028 	    isCR=FALSE;

  3029 	    eos=g_utf8_next_char(eos);

  3030 	}

  3031     }

  3032     *eos='\0';

  3033     if (pswit[MARKUP_SWITCH])

  3034 	postprocess_for_HTML(theline);

  3035     if (pswit[DP_SWITCH])

  3036 	postprocess_for_DP(theline);

  3037     return theline;

  3038 }

  3040 /*

  3041  * mixdigit:

  3042  *

  3043  * Takes a "word" as a parameter, and checks whether it

  3044  * contains a mixture of alpha and digits. Generally, this is an

  3045  * error, but may not be for cases like 4th or L5 12s. 3d.

  3046  *

  3047  * Returns: TRUE iff an is error found.

  3048  */

  3049 gboolean mixdigit(const char *checkword)

  3050 {

  3051     gboolean wehaveadigit,wehavealetter,query;

  3052     const char *s,*nondigit;

  3053     wehaveadigit=wehavealetter=query=FALSE;

  3054     for (s=checkword;*s;s=g_utf8_next_char(s))

  3055 	if (g_unichar_isalpha(g_utf8_get_char(s)))

  3056 	    wehavealetter=TRUE;

  3057 	else if (g_unichar_isdigit(g_utf8_get_char(s)))

  3058 	    wehaveadigit=TRUE;

  3059     if (wehaveadigit && wehavealetter)

  3060     {

  3061 	/* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */

  3062 	query=TRUE;

  3063 	for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));

  3064 	  nondigit=g_utf8_next_char(nondigit))

  3065 	    ;

  3066 	/* digits, ending in st, rd, nd, th of either case */

  3067 	if (!g_ascii_strcasecmp(nondigit,"st") ||

  3068 	  !g_ascii_strcasecmp(nondigit,"rd") ||

  3069 	  !g_ascii_strcasecmp(nondigit,"nd") ||

  3070 	  !g_ascii_strcasecmp(nondigit,"th"))

  3071 	    query=FALSE;

  3072 	if (!g_ascii_strcasecmp(nondigit,"sts") ||

  3073 	  !g_ascii_strcasecmp(nondigit,"rds") ||

  3074 	  !g_ascii_strcasecmp(nondigit,"nds") ||

  3075 	  !g_ascii_strcasecmp(nondigit,"ths"))

  3076 	    query=FALSE;

  3077 	if (!g_ascii_strcasecmp(nondigit,"stly") ||

  3078 	  !g_ascii_strcasecmp(nondigit,"rdly") ||

  3079 	  !g_ascii_strcasecmp(nondigit,"ndly") ||

  3080 	  !g_ascii_strcasecmp(nondigit,"thly"))

  3081 	    query=FALSE;

  3082 	/* digits, ending in l, L, s or d */

  3083 	if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||

  3084 	  !strcmp(nondigit,"d"))

  3085 	    query=FALSE;

  3086 	/*

  3087 	 * L at the start of a number, representing Britsh pounds, like L500.

  3088 	 * This is cute. We know the current word is mixed digit. If the first

  3089 	 * letter is L, there must be at least one digit following. If both

  3090 	 * digits and letters follow, we have a genuine error, else we have a

  3091 	 * capital L followed by digits, and we accept that as a non-error.

  3092 	 */

  3093 	if (g_utf8_get_char(checkword)=='L' &&

  3094 	  !mixdigit(g_utf8_next_char(checkword)))

  3095 	    query=FALSE;

  3096     }

  3097     return query;

  3098 }

  3100 /*

  3101  * getaword:

  3102  *

  3103  * Extracts the first/next "word" from the line, and returns it.

  3104  * A word is defined as one English word unit--or at least that's the aim.

  3105  * "ptr" is advanced to the position in the line where we will start

  3106  * looking for the next word.

  3107  *

  3108  * Returns: A newly-allocated string.

  3109  */

  3110 gchar *getaword(const char **ptr)

  3111 {

  3112     const char *s,*t;

  3113     GString *word;

  3114     gunichar c,pc;

  3115     word=g_string_new(NULL);

  3116     for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&

  3117       !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&

  3118       **ptr;*ptr=g_utf8_next_char(*ptr))

  3119 	;

  3120     /*

  3121      * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.

  3122      * Especially yucky is the case of L1,000

  3123      * This section looks for a pattern of characters including a digit

  3124      * followed by a comma or period followed by one or more digits.

  3125      * If found, it returns this whole pattern as a word; otherwise we discard

  3126      * the results and resume our normal programming.

  3127      */

  3128     s=*ptr;

  3129     for (;g_unichar_isdigit(g_utf8_get_char(s)) ||

  3130       g_unichar_isalpha(g_utf8_get_char(s)) ||

  3131       g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))

  3132 	g_string_append_unichar(word,g_utf8_get_char(s));

  3133     if (word->len)

  3134     {

  3135 	for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))

  3136 	{

  3137 	    c=g_utf8_get_char(t);

  3138 	    pc=g_utf8_get_char(g_utf8_prev_char(t));

  3139 	    if ((c=='.' || c==',') && g_unichar_isdigit(pc))

  3140 	    {

  3141 		*ptr=s;

  3142 		return g_string_free(word,FALSE);

  3143 	    }

  3144 	}

  3145     }

  3146     /* we didn't find a punctuated number - do the regular getword thing */

  3147     g_string_truncate(word,0);

  3148     c=g_utf8_get_char(*ptr);

  3149     for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);

  3150       *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))

  3151 	g_string_append_unichar(word,c);

  3152     return g_string_free(word,FALSE);

  3153 }

  3155 /*

  3156  * isroman:

  3157  *

  3158  * Is this word a Roman Numeral?

  3159  *

  3160  * It doesn't actually validate that the number is a valid Roman Numeral--for

  3161  * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not

  3162  * what we're here to do. If it passes this, it LOOKS like a Roman numeral.

  3163  * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or

  3164  * expressions thereof, except when it came to taxes. Allow any number of M,

  3165  * an optional D, an optional CM or CD, any number of optional Cs, an optional

  3166  * XL or an optional XC, an optional IX or IV, an optional V and any number

  3167  * of optional Is.

  3168  */

  3169 gboolean isroman(const char *t)

  3170 {

  3171     const char *s;

  3172     if (!t || !*t)

  3173 	return FALSE;

  3174     s=t;

  3175     while (g_utf8_get_char(t)=='m' && *t)

  3176 	t++;

  3177     if (g_utf8_get_char(t)=='d')

  3178 	t++;

  3179     if (g_str_has_prefix(t,"cm"))

  3180 	t+=2;

  3181     if (g_str_has_prefix(t,"cd"))

  3182 	t+=2;

  3183     while (g_utf8_get_char(t)=='c' && *t)

  3184 	t++;

  3185     if (g_str_has_prefix(t,"xl"))

  3186 	t+=2;

  3187     if (g_str_has_prefix(t,"xc"))

  3188 	t+=2;

  3189     if (g_utf8_get_char(t)=='l')

  3190 	t++;

  3191     while (g_utf8_get_char(t)=='x' && *t)

  3192 	t++;

  3193     if (g_str_has_prefix(t,"ix"))

  3194 	t+=2;

  3195     if (g_str_has_prefix(t,"iv"))

  3196 	t+=2;

  3197     if (g_utf8_get_char(t)=='v')

  3198 	t++;

  3199     while (g_utf8_get_char(t)=='i' && *t)

  3200 	t++;

  3201     return !*t;

  3202 }

  3204 /*

  3205  * postprocess_for_DP:

  3206  *

  3207  * Invoked with the -d switch from flgets().

  3208  * It simply "removes" from the line a hard-coded set of common

  3209  * DP-specific tags, so that the line passed to the main routine has

  3210  * been pre-cleaned of DP markup.

  3211  */

  3212 void postprocess_for_DP(char *theline)

  3213 {

  3214     char *s,*t;

  3215     int i;

  3216     if (!*theline)

  3217 	return;

  3218     for (i=0;*DPmarkup[i];i++)

  3219 	while ((s=strstr(theline,DPmarkup[i])))

  3220 	{

  3221 	    t=s+strlen(DPmarkup[i]);

  3222 	    memmove(s,t,strlen(t)+1);

  3223 	}

  3224 }

  3226 /*

  3227  * postprocess_for_HTML:

  3228  *

  3229  * Invoked with the -m switch from flgets().

  3230  * It simply "removes" from the line a hard-coded set of common

  3231  * HTML tags and "replaces" a hard-coded set of common HTML

  3232  * entities, so that the line passed to the main routine has

  3233  * been pre-cleaned of HTML.

  3234  */

  3235 void postprocess_for_HTML(char *theline)

  3236 {

  3237     while (losemarkup(theline))

  3238 	;

  3239     loseentities(theline);

  3240 }

  3242 char *losemarkup(char *theline)

  3243 {

  3244     char *s,*t;

  3245     int i;

  3246     s=strchr(theline,'<');

  3247     t=s?strchr(s,'>'):NULL;

  3248     if (!s || !t)

  3249 	return NULL;

  3250     for (i=0;*markup[i];i++)

  3251 	if (tagcomp(g_utf8_next_char(s),markup[i]))

  3252 	{

  3253 	    t=g_utf8_next_char(t);

  3254 	    memmove(s,t,strlen(t)+1);

  3255 	    return s;

  3256 	}

  3257     /* It's an unrecognized <xxx>. */

  3258     return NULL;

  3259 }

  3261 void loseentities(char *theline)

  3262 {

  3263     int i;

  3264     gsize nb;

  3265     char *amp,*scolon;

  3266     gchar *s,*t;

  3267     gunichar c;

  3268     GTree *entities=NULL;

  3269     static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;

  3270     if (!theline)

  3271     {

  3272 	if (entities)

  3273 	    g_tree_destroy(entities);

  3274 	entities=NULL;

  3275 	if (translit!=(GIConv)-1)

  3276 	    g_iconv_close(translit);

  3277 	translit=(GIConv)-1;

  3278 	if (to_utf8!=(GIConv)-1)

  3279 	    g_iconv_close(to_utf8);

  3280 	to_utf8=(GIConv)-1;

  3281 	return;

  3282     }

  3283     if (!*theline)

  3284 	return;

  3285     if (!entities)

  3286     {

  3287 	entities=g_tree_new((GCompareFunc)strcmp);

  3288 	for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)

  3289 	    g_tree_insert(entities,HTMLentities[i].name,

  3290 	      GUINT_TO_POINTER(HTMLentities[i].c));

  3291     }

  3292     if (translit==(GIConv)-1)

  3293 	translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");

  3294     if (to_utf8==(GIConv)-1)

  3295 	to_utf8=g_iconv_open("UTF-8","ISO_8859-1");

  3296     while((amp=strchr(theline,'&')))

  3297     {

  3298 	scolon=strchr(amp,';');

  3299 	if (scolon)

  3300 	{

  3301 	    if (amp[1]=='#')

  3302 	    {

  3303 		if (amp+2+strspn(amp+2,"0123456789")==scolon)

  3304 		    c=strtol(amp+2,NULL,10);

  3305 		else if (amp[2]=='x' &&

  3306 		  amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)

  3307 		    c=strtol(amp+3,NULL,16);

  3308 	    }

  3309 	    else

  3310 	    {

  3311 		s=g_strndup(amp+1,scolon-(amp+1));

  3312 	        c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));

  3313 		g_free(s);

  3314 	    }

  3315 	}

  3316 	else

  3317 	    c=0;

  3318 	if (c)

  3319 	{

  3320 	    theline=amp;

  3321 	    if (c<128 || c>=192 && c<=255)	/* An ISO-8859-1 character */

  3322 		theline+=g_unichar_to_utf8(c,theline);

  3323 	    else

  3324 	    {

  3325 		s=g_malloc(6);

  3326 		nb=g_unichar_to_utf8(c,s);

  3327 		t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);

  3328 		g_free(s);

  3329 		s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);

  3330 		g_free(t);

  3331 		memcpy(theline,s,nb);

  3332 		g_free(s);

  3333 		theline+=nb;

  3334 	    }

  3335 	    memmove(theline,g_utf8_next_char(scolon),

  3336 	      strlen(g_utf8_next_char(scolon))+1);

  3337 	}

  3338 	else

  3339 	    theline=g_utf8_next_char(amp);

  3340     }

  3341 }

  3343 gboolean tagcomp(const char *strin,const char *basetag)

  3344 {

  3345     gboolean retval;

  3346     gchar *s,*t;

  3347     if (g_utf8_get_char(strin)=='/')

  3348 	t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */

  3349     else

  3350 	t=g_utf8_casefold(strin,-1);

  3351     s=g_utf8_casefold(basetag,-1);

  3352     retval=g_str_has_prefix(t,s);

  3353     g_free(s);

  3354     g_free(t);

  3355     return retval;

  3356 }

  3358 void proghelp(GOptionContext *context)

  3359 {

  3360     gchar *help;

  3361     fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);

  3362     fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);

  3363     fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);

  3364     fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "

  3365       "For details, read the file COPYING.\n",stderr);

  3366     fputs("This is Free Software; "

  3367       "you may redistribute it under certain conditions (GPL);\n",stderr);

  3368     fputs("read the file COPYING for details.\n\n",stderr);

  3369     help=g_option_context_get_help(context,TRUE,NULL);

  3370     fputs(help,stderr);

  3371     g_free(help);

  3372     fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);

  3373     fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "

  3374       "non-ASCII\n",stderr);

  3375     fputs("characters like accented letters, "

  3376       "lines longer than 75 or shorter than 55,\n",stderr);

  3377     fputs("unbalanced quotes or brackets, "

  3378       "a variety of badly formatted punctuation, \n",stderr);

  3379     fputs("HTML tags, some likely typos. "

  3380       "It is NOT a substitute for human judgement.\n",stderr);

  3381     fputs("\n",stderr);

  3382 }

author	ali <ali@juiblex.co.uk>
	Sun Sep 29 22:51:27 2013 +0100 (2013-09-29)
changeset 151	a485f5dcc2de
parent 142	466f43a12118
child 152	da598b05f8e8
permissions	-rw-r--r--