bookloupe: bookloupe/bookloupe.c@70cc629ec1e0

     1 /*************************************************************************/

     2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */

     3 /*									 */

     4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>			 */

     5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>			 */

     6 /*									 */

     7 /* This program is free software; you can redistribute it and/or modify  */

     8 /* it under the terms of the GNU General Public License as published by  */

     9 /* the Free Software Foundation; either version 2 of the License, or     */

    10 /* (at your option) any later version.					 */

    11 /*									 */

    12 /* This program is distributed in the hope that it will be useful,       */

    13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of	 */

    14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the		 */

    15 /* GNU General Public License for more details.				 */

    16 /*									 */

    17 /* You should have received a copy of the GNU General Public License	 */

    18 /* along with this program. If not, see <http://www.gnu.org/licenses/>.	 */

    19 /*************************************************************************/

    21 #include <stdio.h>

    22 #include <stdlib.h>

    23 #include <string.h>

    24 #include <ctype.h>

    25 #ifdef __WIN32__

    26 #include <windows.h>

    27 #endif

    28 #include <glib.h>

    29 #include <bl/bl.h>

    30 #include "bookloupe.h"

    31 #include "counters.h"

    32 #include "pending.h"

    33 #include "HTMLentities.h"

    35 gchar *charset;		/* Or NULL for auto (ISO_8859-1/ASCII or UNICODE) */

    36 GIConv charset_validator=(GIConv)-1;

    38 gchar *prevline;

    40 /* Common typos. */

    41 char *typo[] = {

    42     "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",

    43     "nad", "te", "ig", "acn",  "ahve", "alot", "anbd", "andt", "awya", "aywa",

    44     "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",

    45     "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",

    46     "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",

    47     "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",

    48     "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",

    49     "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",

    50     "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",

    51     "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",

    52     "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",

    53     "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",

    54     "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",

    55     "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",

    56     "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",

    57     "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",

    58     "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",

    59     "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",

    60     "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",

    61     "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",

    62     "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",

    63     "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",

    64     "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",

    65     "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",

    66     "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",

    67     "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",

    68     "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",

    69     "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",

    70     "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",

    71     "se", ""

    72 };

    74 GTree *usertypo;

    76 /* Common abbreviations and other OK words not to query as typos. */

    77 char *okword[] = {

    78     "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",

    79     "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",

    80     "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",

    81     "outbid", "outbids", "frostbite", "frostbitten", ""

    82 };

    84 /* Common abbreviations that cause otherwise unexplained periods. */

    85 char *abbrev[] = {

    86     "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",

    87     "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""

    88 };

    90 /*

    91  * Two-Letter combinations that rarely if ever start words,

    92  * but are common scannos or otherwise common letter combinations.

    93  */

    94 char *nostart[] = {

    95     "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""

    96 };

    98 /*

    99  * Two-Letter combinations that rarely if ever end words,

   100  * but are common scannos or otherwise common letter combinations.

   101  */

   102 char *noend[] = {

   103     "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",

   104     "sw", "gr", "sl", "cl", "iy", ""

   105 };

   107 char *markup[] = {

   108     "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",

   109     "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",

   110     "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",

   111     "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""

   112 };

   114 char *DPmarkup[] = {

   115     "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""

   116 };

   118 char *nocomma[] = {

   119     "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",

   120     "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",

   121     "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",

   122     "during", "let", "toward", "among", ""

   123 };

   125 char *noperiod[] = {

   126     "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",

   127     "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",

   128     "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",

   129     "among", "those", "into", "whom", "having", "thence", ""

   130 };

   132 gboolean pswit[SWITNO];  /* program switches */

   133 gchar *opt_charset;

   135 gboolean typo_compat,paranoid_compat;

   137 static GOptionEntry options[]={

   138     { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,

   139       "Ignore DP-specific markup", NULL },

   140     { "no-dp", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,

   141       G_OPTION_ARG_NONE, pswit+DP_SWITCH,

   142       "Don't ignore DP-specific markup", NULL },

   143     { "echo", 0, G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,

   144       "Echo queried line", NULL },

   145     { "no-echo", 'e', G_OPTION_FLAG_REVERSE,

   146       G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,

   147       "Don't echo queried line", NULL },

   148     { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,

   149       "Check single quotes", NULL },

   150     { "no-squote", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,

   151       G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,

   152       "Don't check single quotes", NULL },

   153     { "typo", 0, 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,

   154       "Check common typos", NULL },

   155     { "no-typo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,

   156       G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,

   157       "Don't check common typos", NULL },

   158     { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,

   159       "Require closure of quotes on every paragraph", NULL },

   160     { "no-qpara", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,

   161       G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,

   162       "Don't require closure of quotes on every paragraph", NULL },

   163     { "paranoid", 0, G_OPTION_FLAG_HIDDEN,

   164       G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,

   165       "Enable paranoid querying of everything", NULL },

   166     { "no-paranoid", 0, G_OPTION_FLAG_REVERSE,

   167       G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,

   168       "Disable paranoid querying of everything", NULL },

   169     { "line-end", 0, G_OPTION_FLAG_HIDDEN,

   170       G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,

   171       "Enable line end checking", NULL },

   172     { "no-line-end", 'l', G_OPTION_FLAG_REVERSE,

   173       G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,

   174       "Disable line end checking", NULL },

   175     { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,

   176       "Overview: just show counts", NULL },

   177     { "no-overview", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,

   178       G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,

   179       "Show individual warnings", NULL },

   180     { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,

   181       "Output errors to stdout instead of stderr", NULL },

   182     { "no-stdout", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,

   183       G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,

   184       "Output errors to stderr instead of stdout", NULL },

   185     { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,

   186       "Echo header fields", NULL },

   187     { "no-header", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,

   188       G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,

   189       "Don't echo header fields", NULL },

   190     { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,

   191       "Ignore markup in < >", NULL },

   192     { "no-markup", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,

   193       G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,

   194       "No special handling for markup in < >", NULL },

   195     { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,

   196       "Use file of user-defined typos", NULL },

   197     { "no-usertypo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,

   198       G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,

   199       "Ignore file of user-defined typos", NULL },

   200     { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,

   201       "Verbose - list everything", NULL },

   202     { "no-verbose", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,

   203       G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,

   204       "Switch off verbose mode", NULL },

   205     { "charset", 0, 0, G_OPTION_ARG_STRING, &opt_charset,

   206       "Set of characters valid for this ebook", "NAME" },

   207     { NULL }

   208 };

   210 /*

   211  * Options relating to configuration which make no sense from inside

   212  * a configuration file.

   213  */

   215 static GOptionEntry config_options[]={

   216     { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,

   217       "Defaults for use on www upload", NULL },

   218     { "dump-config", 0, 0, G_OPTION_ARG_NONE, pswit+DUMP_CONFIG_SWITCH,

   219       "Dump current config settings", NULL },

   220     { NULL }

   221 };

   223 static GOptionEntry compatibility_options[]={

   224     { "toggle-typo", 't', 0, G_OPTION_ARG_NONE, &typo_compat,

   225       "Toggle checking for common typos", NULL },

   226     { "toggle-relaxed", 'x', 0, G_OPTION_ARG_NONE, &paranoid_compat,

   227       "Toggle both paranoid mode and common typos", NULL },

   228     { NULL }

   229 };

   231 long cnt_quote;		/* for overview mode, count of quote queries */

   232 long cnt_brack;		/* for overview mode, count of brackets queries */

   233 long cnt_bin;		/* for overview mode, count of non-ASCII queries */

   234 long cnt_odd;		/* for overview mode, count of odd character queries */

   235 long cnt_long;		/* for overview mode, count of long line errors */

   236 long cnt_short;		/* for overview mode, count of short line queries */

   237 long cnt_punct;		/* for overview mode,

   238 			   count of punctuation and spacing queries */

   239 long cnt_dash;		/* for overview mode, count of dash-related queries */

   240 long cnt_word;		/* for overview mode, count of word queries */

   241 long cnt_html;		/* for overview mode, count of html queries */

   242 long cnt_lineend;	/* for overview mode, count of line-end queries */

   243 long cnt_spacend;	/* count of lines with space at end */

   244 long linecnt;		/* count of total lines in the file */

   245 long checked_linecnt;	/* count of lines actually checked */

   247 void proghelp(GOptionContext *context);

   248 void procfile(const char *);

   250 gchar *running_from;

   252 gboolean mixdigit(const char *);

   253 gchar *getaword(const char **);

   254 char *flgets(char **,long,int);

   255 void postprocess_for_HTML(char *);

   256 char *linehasmarkup(char *);

   257 char *losemarkup(char *);

   258 gboolean tagcomp(const char *,const char *);

   259 void loseentities(char *);

   260 gboolean isroman(const char *);

   261 void postprocess_for_DP(char *);

   262 void print_as_windows_1252(const char *string);

   263 void print_as_utf_8(const char *string);

   265 GTree *qword,*qperiod;

   267 #ifdef __WIN32__

   268 UINT saved_cp;

   269 #endif

   271 gboolean set_charset(const char *name,GError **err)

   272 {

   273     /* The various UNICODE encodings all share the same character set. */

   274     const char *unicode_aliases[]={ "UCS-2", "UCS-2BE", "UCS-2LE", "UCS-4",

   275       "UCS-4BE", "UCS-4LE", "UCS2", "UCS4", "UNICODE", "UNICODEBIG",

   276       "UNICODELITTLE", "UTF-7", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE",

   277       "UTF-32", "UTF-32BE", "UTF-32LE", "UTF7", "UTF8", "UTF16", "UTF16BE",

   278       "UTF16LE", "UTF32", "UTF32BE", "UTF32LE" };

   279     int i;

   280     if (charset)

   281 	g_free(charset);

   282     if (charset_validator!=(GIConv)-1)

   283 	g_iconv_close(charset_validator);

   284     if (!name || !g_strcasecmp(name,"auto"))

   285     {

   286 	charset=NULL;

   287 	charset_validator=(GIConv)-1;

   288 	return TRUE;

   289     }

   290     else

   291 	charset=g_strdup(name);

   292     for(i=0;i<G_N_ELEMENTS(unicode_aliases);i++)

   293 	if (!g_strcasecmp(charset,unicode_aliases[i]))

   294 	{

   295 	    g_free(charset);

   296 	    charset=g_strdup("UTF-8");

   297 	    break;

   298 	}

   299     if (!strcmp(charset,"UTF-8"))

   300 	charset_validator=(GIConv)-1;

   301     else

   302     {

   303 	charset_validator=g_iconv_open(charset,"UTF-8");

   304 	if (charset_validator==(GIConv)-1)

   305 	{

   306 	    g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_NO_CONVERSION,

   307 	      "Unknown character set \"%s\"",charset);

   308 	    return FALSE;

   309 	}

   310     }

   311     return TRUE;

   312 }

   314 GKeyFile *config;

   316 void config_file_update(GKeyFile *kf)

   317 {

   318     int i;

   319     const char *s;

   320     gboolean sw;

   321     for(i=0;options[i].long_name;i++)

   322     {

   323 	if (g_str_has_prefix(options[i].long_name,"no-"))

   324 	    continue;

   325 	if (options[i].arg==G_OPTION_ARG_NONE)

   326 	{

   327 	    sw=*(gboolean *)options[i].arg_data;

   328 	    if (options[i].flags&G_OPTION_FLAG_REVERSE)

   329 		sw=!sw;

   330 	    g_key_file_set_boolean(kf,"options",options[i].long_name,sw);

   331 	}

   332 	else if (options[i].arg==G_OPTION_ARG_STRING)

   333 	{

   334 	    s=*(gchar **)options[i].arg_data;

   335 	    if (!s)

   336 		s="auto";

   337 	    g_key_file_set_string(kf,"options",options[i].long_name,s);

   338 	}

   339 	else

   340 	    g_assert_not_reached();

   341     }

   342 }

   344 void config_file_add_comments(GKeyFile *kf)

   345 {

   346     int i;

   347     gchar *comment;

   348     g_key_file_set_comment(kf,NULL,NULL," Default configuration for bookloupe",

   349       NULL);

   350     for(i=0;options[i].long_name;i++)

   351     {

   352 	if (g_str_has_prefix(options[i].long_name,"no-"))

   353 	    continue;

   354 	comment=g_strconcat(" ",options[i].description,NULL);

   355 	g_key_file_set_comment(kf,"options",options[i].long_name,comment,NULL);

   356 	g_free(comment);

   357     }

   358 }

   360 void dump_config(void)

   361 {

   362     gchar *s;

   363     if (config)

   364 	config_file_update(config);

   365     else

   366     {

   367 	config=g_key_file_new();

   368 	config_file_update(config);

   369 	config_file_add_comments(config);

   370     }

   371     s=g_key_file_to_data(config,NULL,NULL);

   372     if (s)

   373 	g_print("%s",s);

   374     g_free(s);

   375 }

   377 GKeyFile *read_config_file(gchar **full_path)

   378 {

   379     int i;

   380     GError *err=NULL;

   381     gchar **search_dirs;

   382     gchar *path;

   383     const char *search_path;

   384     GKeyFile *kf;

   385     kf=g_key_file_new();

   386     search_path=g_getenv("BOOKLOUPE_CONFIG_PATH");

   387     if (search_path)

   388     {

   389 #ifdef __WIN32__

   390 	search_dirs=g_strsplit(search_path,";",0);

   391 #else

   392 	search_dirs=g_strsplit(search_path,":",0);

   393 #endif

   394     }

   395     else

   396     {

   397 	search_dirs=g_new(gchar *,4);

   398 	search_dirs[0]=g_get_current_dir();

   399 	search_dirs[1]=g_strdup(running_from);

   400 	search_dirs[2]=g_strdup(g_get_user_config_dir());

   401 	search_dirs[3]=NULL;

   402     }

   403     for(i=0;search_dirs[i];i++)

   404     {

   405 	path=g_build_filename(search_dirs[i],"bookloupe.ini",NULL);

   406 	if (g_key_file_load_from_file(kf,path,

   407 	  G_KEY_FILE_KEEP_COMMENTS|G_KEY_FILE_KEEP_TRANSLATIONS,&err))

   408 	    break;

   409 	if (!g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))

   410 	{

   411 	    g_printerr("Bookloupe: Error reading %s\n",path);

   412 	    g_printerr("%s\n",err->message);

   413 	    exit(1);

   414 	}

   415 	g_clear_error(&err);

   416 	g_free(path);

   417 	path=NULL;

   418     }

   419     if (!search_dirs[i])

   420     {

   421 	g_key_file_free(kf);

   422 	kf=NULL;

   423     }

   424     g_strfreev(search_dirs);

   425     if (full_path && kf)

   426 	*full_path=path;

   427     else

   428 	g_free(path);

   429     return kf;

   430 }

   432 void parse_config_file(void)

   433 {

   434     int i,j;

   435     gchar *path,*s;

   436     gchar **keys;

   437     gboolean sw;

   438     GError *err=NULL;

   439     config=read_config_file(&path);

   440     if (config)

   441 	keys=g_key_file_get_keys(config,"options",NULL,NULL);

   442     else

   443 	keys=NULL;

   444     if (keys)

   445     {

   446 	for(i=0;keys[i];i++)

   447 	{

   448 	    for(j=0;options[j].long_name;j++)

   449 	    {

   450 		if (g_str_has_prefix(options[j].long_name,"no-"))

   451 		    continue;

   452 		else if (!strcmp(keys[i],options[j].long_name))

   453 		{

   454 		    if (options[j].arg==G_OPTION_ARG_NONE)

   455 		    {

   456 			sw=g_key_file_get_boolean(config,"options",keys[i],

   457 			  &err);

   458 			if (err)

   459 			{

   460 			    g_printerr("Bookloupe: %s: options.%s: %s\n",

   461 			      path,keys[i],err->message);

   462 			    g_clear_error(&err);

   463 			}

   464 			else

   465 			{

   466 			    if (options[j].flags&G_OPTION_FLAG_REVERSE)

   467 				sw=!sw;

   468 			    *(gboolean *)options[j].arg_data=sw;

   469 			}

   470 			break;

   471 		    }

   472 		    else if (options[j].arg==G_OPTION_ARG_STRING)

   473 		    {

   474 			s=g_key_file_get_string(config,"options",keys[i],

   475 			  &err);

   476 			if (err)

   477 			{

   478 			    g_printerr("Bookloupe: %s: options.%s: %s\n",

   479 			      path,keys[i],err->message);

   480 			    g_clear_error(&err);

   481 			}

   482 			else

   483 			{

   484 			    g_free(*(gchar **)options[j].arg_data);

   485 			    if (!g_strcmp0(s,"auto"))

   486 			    {

   487 				*(gchar **)options[j].arg_data=NULL;

   488 				g_free(s);

   489 			    }

   490 			    else

   491 				*(gchar **)options[j].arg_data=s;

   492 			}

   493 			break;

   494 		    }

   495 		    else

   496 			g_assert_not_reached();

   497 		}

   498 	    }

   499 	    if (!options[j].long_name)

   500 		g_printerr("Bookloupe: %s: Unknown option \"%s\" ignored\n",

   501 		  path,keys[i]);

   502 	}

   503 	g_strfreev(keys);

   504     }

   505     if (config)

   506 	g_free(path);

   507 }

   509 void parse_options(int *argc,char ***argv)

   510 {

   511     GError *err=NULL;

   512     GOptionContext *context;

   513     GOptionGroup *compatibility;

   514     context=g_option_context_new(

   515       "file - look for errors in Project Gutenberg(TM) etexts");

   516     g_option_context_add_main_entries(context,options,NULL);

   517     g_option_context_add_main_entries(context,config_options,NULL);

   518     compatibility=g_option_group_new("compatibility",

   519       "Options for Compatibility with Gutcheck:",

   520       "Show compatibility options",NULL,NULL);

   521     g_option_group_add_entries(compatibility,compatibility_options);

   522     g_option_context_add_group(context,compatibility);

   523     g_option_context_set_description(context,

   524       "For simplicity, only the switch options which reverse the\n"

   525       "default configuration are listed. In most cases, both vanilla\n"

   526       "and \"no-\" prefixed versions are available for use.");

   527     if (!g_option_context_parse(context,argc,argv,&err))

   528     {

   529 	g_printerr("Bookloupe: %s\n",err->message);

   530 	g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);

   531 	exit(1);

   532     }

   533     if (typo_compat)

   534 	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];

   535     if (paranoid_compat)

   536     {

   537 	pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];

   538 	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];

   539     }

   540     /*

   541      * Web uploads - for the moment, this is really just a placeholder

   542      * until we decide what processing we really want to do on web uploads

   543      */

   544     if (pswit[WEB_SWITCH])

   545     {

   546 	/* specific override for web uploads */

   547 	pswit[ECHO_SWITCH]=TRUE;

   548 	pswit[SQUOTE_SWITCH]=FALSE;

   549 	pswit[TYPO_SWITCH]=TRUE;

   550 	pswit[QPARA_SWITCH]=FALSE;

   551 	pswit[PARANOID_SWITCH]=TRUE;

   552 	pswit[LINE_END_SWITCH]=FALSE;

   553 	pswit[OVERVIEW_SWITCH]=FALSE;

   554 	pswit[STDOUT_SWITCH]=FALSE;

   555 	pswit[HEADER_SWITCH]=TRUE;

   556 	pswit[VERBOSE_SWITCH]=FALSE;

   557 	pswit[MARKUP_SWITCH]=FALSE;

   558 	pswit[USERTYPO_SWITCH]=FALSE;

   559 	pswit[DP_SWITCH]=FALSE;

   560     }

   561     if (opt_charset && !set_charset(opt_charset,&err))

   562     {

   563 	g_printerr("%s\n",err->message);

   564 	exit(1);

   565     }

   566     if (pswit[DUMP_CONFIG_SWITCH])

   567     {

   568 	dump_config();

   569 	exit(0);

   570     }

   571     g_free(opt_charset);

   572     opt_charset=NULL;

   573     if (pswit[OVERVIEW_SWITCH])

   574 	/* just print summary; don't echo */

   575 	pswit[ECHO_SWITCH]=FALSE;

   576     if (*argc<2)

   577     {

   578 	proghelp(context);

   579 	exit(1);

   580     }

   581     g_option_context_free(context);

   582 }

   584 /*

   585  * read_user_scannos:

   586  *

   587  * Read in the user-defined stealth scanno list.

   588  */

   589 void read_user_scannos(void)

   590 {

   591     GError *err=NULL;

   592     gchar *usertypo_file;

   593     gboolean okay;

   594     int i;

   595     gsize len,nb;

   596     gchar *contents,*utf8,**lines;

   597     usertypo_file=g_strdup("bookloupe.typ");

   598     okay=file_get_contents_text(usertypo_file,&contents,&len,&err);

   599     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))

   600     {

   601 	g_clear_error(&err);

   602 	g_free(usertypo_file);

   603 	usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);

   604 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);

   605     }

   606     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))

   607     {

   608 	g_clear_error(&err);

   609 	g_free(usertypo_file);

   610 	usertypo_file=g_strdup("gutcheck.typ");

   611 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);

   612     }

   613     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))

   614     {

   615 	g_clear_error(&err);

   616 	g_free(usertypo_file);

   617 	usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);

   618 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);

   619     }

   620     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))

   621     {

   622 	g_free(usertypo_file);

   623 	g_print("   --> I couldn't find bookloupe.typ "

   624 	  "-- proceeding without user typos.\n");

   625 	return;

   626     }

   627     else if (!okay)

   628     {

   629 	fprintf(stderr,"%s: %s\n",usertypo_file,err->message);

   630 	g_free(usertypo_file);

   631 	g_clear_error(&err);

   632 	exit(1);

   633     }

   634     if (g_utf8_validate(contents,len,NULL))

   635     {

   636 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);

   637 	if (!charset)

   638 	    (void)set_charset("UNICODE",NULL);

   639     }

   640     else

   641 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);

   642     g_free(contents);

   643     lines=g_strsplit_set(utf8,"\r\n",0);

   644     g_free(utf8);

   645     usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);

   646     for (i=0;lines[i];i++)

   647 	if (*(unsigned char *)lines[i]>'!')

   648 	    g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));

   649 	else

   650 	    g_free(lines[i]);

   651     g_free(lines);

   652 }

   654 /*

   655  * read_etext:

   656  *

   657  * Read an etext returning a newly allocated string containing the file

   658  * contents or NULL on error.

   659  */

   660 gchar *read_etext(const char *filename,GError **err)

   661 {

   662     GError *tmp_err=NULL;

   663     gchar *contents,*utf8;

   664     gsize len,bytes_read,bytes_written;

   665     int i,line,col;

   666     if (!g_file_get_contents(filename,&contents,&len,err))

   667 	return NULL;

   668     if (g_utf8_validate(contents,len,NULL))

   669     {

   670 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);

   671 	g_set_print_handler(print_as_utf_8);

   672 #ifdef __WIN32__

   673 	SetConsoleOutputCP(CP_UTF8);

   674 #endif

   675     }

   676     else

   677     {

   678 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,

   679 	  &bytes_written,&tmp_err);

   680 	if (g_error_matches(tmp_err,G_CONVERT_ERROR,

   681 	  G_CONVERT_ERROR_ILLEGAL_SEQUENCE))

   682 	{

   683 	    line=col=1;

   684 	    for(i=0;i<bytes_read;i++)

   685 		if (contents[i]=='\n')

   686 		{

   687 		    line++;

   688 		    col=1;

   689 		}

   690 		else if (contents[i]!='\r')

   691 		    col++;

   692 	    g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,

   693 	      "Input conversion failed. Byte %d at line %d, column %d is not a "

   694 	      "valid Windows-1252 character",

   695 	      ((unsigned char *)contents)[bytes_read],line,col);

   696 	}

   697 	else if (tmp_err)

   698 	    g_propagate_error(err,tmp_err);

   699 	g_set_print_handler(print_as_windows_1252);

   700 #ifdef __WIN32__

   701 	SetConsoleOutputCP(1252);

   702 #endif

   703     }

   704     g_free(contents);

   705     return utf8;

   706 }

   708 void cleanup_on_exit(void)

   709 {

   710 #ifdef __WIN32__

   711     SetConsoleOutputCP(saved_cp);

   712 #endif

   713 }

   715 int main(int argc,char **argv)

   716 {

   717 #ifdef __WIN32__

   718     atexit(cleanup_on_exit);

   719     saved_cp=GetConsoleOutputCP();

   720 #endif

   721     running_from=g_path_get_dirname(argv[0]);

   722     /* Paranoid checking is turned OFF, not on, by its switch */

   723     pswit[PARANOID_SWITCH]=TRUE;

   724     /* if running in paranoid mode, typo checks default to enabled */

   725     pswit[TYPO_SWITCH]=TRUE;

   726     /* Line-end checking is turned OFF, not on, by its switch */

   727     pswit[LINE_END_SWITCH]=TRUE;

   728     /* Echoing is turned OFF, not on, by its switch */

   729     pswit[ECHO_SWITCH]=TRUE;

   730     parse_config_file();

   731     parse_options(&argc,&argv);

   732     if (pswit[USERTYPO_SWITCH])

   733 	read_user_scannos();

   734     fprintf(stderr,"bookloupe: Check and report on an e-text\n");

   735     procfile(argv[1]);

   736     if (pswit[OVERVIEW_SWITCH])

   737     {

   738 	g_print("    Checked %ld lines of %ld (head+foot = %ld)\n\n",

   739 	  checked_linecnt,linecnt,linecnt-checked_linecnt);

   740 	g_print("    --------------- Queries found --------------\n");

   741 	if (cnt_long)

   742 	    g_print("    Long lines:		    %14ld\n",cnt_long);

   743 	if (cnt_short)

   744 	    g_print("    Short lines:		   %14ld\n",cnt_short);

   745 	if (cnt_lineend)

   746 	    g_print("    Line-end problems:	     %14ld\n",cnt_lineend);

   747 	if (cnt_word)

   748 	    g_print("    Common typos:		  %14ld\n",cnt_word);

   749 	if (cnt_quote)

   750 	    g_print("    Unmatched quotes:	      %14ld\n",cnt_quote);

   751 	if (cnt_brack)

   752 	    g_print("    Unmatched brackets:	    %14ld\n",cnt_brack);

   753 	if (cnt_bin)

   754 	    g_print("    Non-ASCII characters:	  %14ld\n",cnt_bin);

   755 	if (cnt_odd)

   756 	    g_print("    Proofing characters:	   %14ld\n",cnt_odd);

   757 	if (cnt_punct)

   758 	    g_print("    Punctuation & spacing queries: %14ld\n",cnt_punct);

   759 	if (cnt_dash)

   760 	    g_print("    Non-standard dashes:	   %14ld\n",cnt_dash);

   761 	if (cnt_html)

   762 	    g_print("    Possible HTML tags:	    %14ld\n",cnt_html);

   763 	g_print("\n");

   764 	g_print("    TOTAL QUERIES		  %14ld\n",

   765 	  cnt_quote+cnt_brack+cnt_bin+cnt_odd+cnt_long+cnt_short+cnt_punct+

   766 	  cnt_dash+cnt_word+cnt_html+cnt_lineend);

   767     }

   768     g_free(running_from);

   769     if (usertypo)

   770 	g_tree_unref(usertypo);

   771     set_charset(NULL,NULL);

   772     if (config)

   773 	g_key_file_free(config);

   774     return 0;

   775 }

   777 void count_dashes(const char *line,const char *dash,

   778   struct dash_results *results)

   779 {

   780     int i;

   781     gchar **tokens;

   782     gunichar pc,nc;

   783     gboolean spaced=FALSE,unspaced=FALSE,spaced2=FALSE;

   784     if (!*line)

   785 	return;

   786     tokens=g_strsplit(line,dash,0);

   787     if (tokens[1])

   788 	results->base++;

   789     for(i=1;tokens[i];i++)

   790     {

   791 	pc=g_utf8_get_char(g_utf8_prev_char(tokens[i-1]+strlen(tokens[i-1])));

   792 	nc=g_utf8_get_char(tokens[i]);

   793 	if (g_unichar_isspace(pc) || g_unichar_isspace(nc))

   794 	    spaced=TRUE;

   795 	if (g_unichar_isspace(pc) && g_unichar_isspace(nc))

   796 	    spaced2=TRUE;

   797 	else if (!g_unichar_isspace(pc) && !g_unichar_isspace(nc))

   798 	    unspaced=TRUE;

   799     }

   800     if (spaced)

   801 	results->space++;

   802     if (spaced2)

   803 	/* count of lines with em-dashes with spaces both sides */

   804 	results->non_PG_space++;

   805     if (unspaced)

   806 	/* count of lines with PG-type em-dashes with no spaces */

   807 	results->PG_space++;

   808     g_strfreev(tokens);

   809 }

   811 /*

   812  * first_pass:

   813  *

   814  * Run a first pass - verify that it's a valid PG

   815  * file, decide whether to report some things that

   816  * occur many times in the text like long or short

   817  * lines, non-standard dashes, etc.

   818  */

   819 struct first_pass_results *first_pass(const char *etext)

   820 {

   821     gunichar laststart=CHAR_SPACE;

   822     const char *s;

   823     gchar *lc_line;

   824     int i,j,lbytes,llen;

   825     gchar **lines;

   826     unsigned int lastlen=0,lastblen=0;

   827     long spline=0,nspline=0;

   828     static struct first_pass_results results={0};

   829     struct dash_results tmp_dash_results;

   830     gchar *inword;

   831     QuoteClass qc;

   832     lines=g_strsplit(etext,"\n",0);

   833     if (!lines[0])

   834     {

   835 	/* An empty etext has no terminators */

   836 	results.newlines=DOS_NEWLINES;

   837     }

   838     else if (!lines[1])

   839     {

   840 	/*

   841 	 * If there are no LFs, we don't have UNIX-style

   842 	 * terminators, but we might have OS9-style ones.

   843 	 */

   844 	results.newlines=OS9_NEWLINES;

   845 	g_strfreev(lines);

   846 	lines=g_strsplit(etext,"\r",0);

   847 	if (!lines[0] || !lines[1])

   848 	    /* Looks like we don't have any terminators at all */

   849 	    results.newlines=DOS_NEWLINES;

   850     }

   851     else

   852     {

   853 	/* We might have UNIX-style terminators */

   854 	results.newlines=UNIX_NEWLINES;

   855     }

   856     for (j=0;lines[j];j++)

   857     {

   858 	lbytes=strlen(lines[j]);

   859 	if (lbytes>0 && lines[j][lbytes-1]=='\r')

   860 	{

   861 	    results.newlines=DOS_NEWLINES;

   862 	    do

   863 	    {

   864 		lines[j][--lbytes]='\0';

   865 	    } while (lbytes>0 && lines[j][lbytes-1]=='\r');

   866 	}

   867 	llen=g_utf8_strlen(lines[j],lbytes);

   868 	linecnt++;

   869 	if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&

   870 	  (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))

   871 	{

   872 	    if (spline)

   873 		g_print("   --> Duplicate header?\n");

   874 	    spline=linecnt+1;   /* first line of non-header text, that is */

   875 	}

   876 	if (!strncmp(lines[j],"*** START",9) &&

   877 	  strstr(lines[j],"PROJECT GUTENBERG"))

   878 	{

   879 	    if (nspline)

   880 		g_print("   --> Duplicate header?\n");

   881 	    nspline=linecnt+1;   /* first line of non-header text, that is */

   882 	}

   883 	if (spline || nspline)

   884 	{

   885 	    lc_line=g_utf8_strdown(lines[j],lbytes);

   886 	    if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))

   887 	    {

   888 		if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))

   889 		{

   890 		    if (results.footerline)

   891 		    {

   892 			/* it's an old-form header - we can detect duplicates */

   893 			if (!nspline)

   894 			    g_print("   --> Duplicate footer?\n");

   895 		    }

   896 		    else

   897 			results.footerline=linecnt;

   898 		}

   899 	    }

   900 	    g_free(lc_line);

   901 	}

   902 	if (spline)

   903 	    results.firstline=spline;

   904 	if (nspline)

   905 	    results.firstline=nspline;  /* override with new */

   906 	if (results.footerline)

   907 	    continue;    /* don't count the boilerplate in the footer */

   908 	results.totlen+=llen;

   909 	for (s=lines[j];*s;s=g_utf8_next_char(s))

   910 	{

   911 	    if (g_utf8_get_char(s)>127)

   912 		results.binlen++;

   913 	    if (g_unichar_isalpha(g_utf8_get_char(s)))

   914 		results.alphalen++;

   915 	    if (s>lines[j])

   916 	    {

   917 		if (CHAR_IS_DQUOTE(g_utf8_get_char(s)))

   918 		    qc=QUOTE_CLASS(g_utf8_get_char(s));

   919 		else

   920 		    qc=INVALID_QUOTE;

   921 		if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) &&

   922 		  g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))))

   923 		    results.endquote_count++;

   924 	    }

   925 	}

   926 	if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&

   927 	  lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)

   928 	    results.shortline++;

   929 	if (lbytes>0 &&

   930 	  g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)

   931 	    cnt_spacend++;

   932 	if (strstr(lines[j],".,"))

   933 	    results.dotcomma++;

   934 	/* only count ast lines for ignoring purposes where there is */

   935 	/* locase text on the line */

   936 	if (strchr(lines[j],'*'))

   937 	{

   938 	    for (s=lines[j];*s;s=g_utf8_next_char(s))

   939 		if (g_unichar_islower(g_utf8_get_char(s)))

   940 		    break;

   941 	    if (*s)

   942 		results.astline++;

   943 	}

   944 	if (strchr(lines[j],'/'))

   945 	    results.fslashline++;

   946 	if (lbytes>0)

   947 	{

   948 	    for (s=g_utf8_prev_char(lines[j]+lbytes);

   949 	      s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;

   950 	      s=g_utf8_prev_char(s))

   951 		;

   952 	    if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&

   953 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')

   954 		results.hyphens++;

   955 	}

   956 	if (llen>LONGEST_PG_LINE)

   957 	    results.longline++;

   958 	if (llen>WAY_TOO_LONG)

   959 	    results.verylongline++;

   960 	if (strchr(lines[j],'<') && strchr(lines[j],'>'))

   961 	{

   962 	    i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);

   963 	    if (i>0)

   964 		results.htmcount++;

   965 	    if (strstr(lines[j],"<i>"))

   966 		results.htmcount+=4; /* bonus marks! */

   967 	}

   968 	/* Check for spaced em-dashes */

   969 	memset(&tmp_dash_results,0,sizeof(tmp_dash_results));

   970 	count_dashes(lines[j],"--",&tmp_dash_results);

   971 	count_dashes(lines[j],"—",&tmp_dash_results);

   972 	if (tmp_dash_results.base)

   973 	    results.emdash.base++;

   974 	if (tmp_dash_results.non_PG_space)

   975 	    results.emdash.non_PG_space++;

   976 	if (tmp_dash_results.PG_space)

   977 	    results.emdash.PG_space++;

   978 	for (s=lines[j];*s;)

   979 	{

   980 	    inword=getaword(&s);

   981 	    if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))

   982 		results.Dutchcount++;

   983 	    if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))

   984 		results.Frenchcount++;

   985 	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))

   986 		results.standalone_digit++;

   987 	    g_free(inword);

   988 	}

   989 	/* Check for spaced dashes */

   990 	if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')

   991 	    results.spacedash++;

   992 	lastblen=lastlen;

   993 	lastlen=llen;

   994 	laststart=lines[j][0];

   995     }

   996     g_strfreev(lines);

   997     return &results;

   998 }

  1000 /*

  1001  * report_first_pass:

  1002  *

  1003  * Make some snap decisions based on the first pass results.

  1004  */

  1005 struct warnings *report_first_pass(struct first_pass_results *results)

  1006 {

  1007     static struct warnings warnings={0};

  1008     warnings.newlines=results->newlines;

  1009     if (warnings.newlines==UNIX_NEWLINES)

  1010 	g_print("   --> No lines in this file have a CR. Not reporting them. "

  1011 	  "Project Gutenberg requires that all lineends be CR-LF.\n");

  1012     else if (warnings.newlines==OS9_NEWLINES)

  1013 	g_print("   --> No lines in this file have a LF. Not reporting them. "

  1014 	  "Project Gutenberg requires that all lineends be CR-LF.\n");

  1015     if (cnt_spacend>0)

  1016 	g_print("   --> %ld lines in this file have white space at end\n",

  1017 	  cnt_spacend);

  1018     warnings.dotcomma=1;

  1019     if (results->dotcomma>5)

  1020     {

  1021 	warnings.dotcomma=0;

  1022 	g_print("   --> %ld lines in this file contain '.,'. "

  1023 	  "Not reporting them.\n",results->dotcomma);

  1024     }

  1025     /*

  1026      * If more than 50 lines, or one-tenth, are short,

  1027      * don't bother reporting them.

  1028      */

  1029     warnings.shortline=1;

  1030     if (results->shortline>50 || results->shortline*10>linecnt)

  1031     {

  1032 	warnings.shortline=0;

  1033 	g_print("   --> %ld lines in this file are short. "

  1034 	  "Not reporting short lines.\n",results->shortline);

  1035     }

  1036     /*

  1037      * If more than 50 lines, or one-tenth, are long,

  1038      * don't bother reporting them.

  1039      */

  1040     warnings.longline=1;

  1041     if (results->longline>50 || results->longline*10>linecnt)

  1042     {

  1043 	warnings.longline=0;

  1044 	g_print("   --> %ld lines in this file are long. "

  1045 	  "Not reporting long lines.\n",results->longline);

  1046     }

  1047     /* If more than 10 lines contain asterisks, don't bother reporting them. */

  1048     warnings.ast=1;

  1049     if (results->astline>10)

  1050     {

  1051 	warnings.ast=0;

  1052 	g_print("   --> %ld lines in this file contain asterisks. "

  1053 	  "Not reporting them.\n",results->astline);

  1054     }

  1055     /*

  1056      * If more than 10 lines contain forward slashes,

  1057      * don't bother reporting them.

  1058      */

  1059     warnings.fslash=1;

  1060     if (results->fslashline>10)

  1061     {

  1062 	warnings.fslash=0;

  1063 	g_print("   --> %ld lines in this file contain forward slashes. "

  1064 	  "Not reporting them.\n",results->fslashline);

  1065     }

  1066     /*

  1067      * If more than 20 lines contain unpunctuated endquotes,

  1068      * don't bother reporting them.

  1069      */

  1070     warnings.endquote=1;

  1071     if (results->endquote_count>20)

  1072     {

  1073 	warnings.endquote=0;

  1074 	g_print("   --> %ld lines in this file contain unpunctuated endquotes. "

  1075 	  "Not reporting them.\n",results->endquote_count);

  1076     }

  1077     /*

  1078      * If more than 15 lines contain standalone digits,

  1079      * don't bother reporting them.

  1080      */

  1081     warnings.digit=1;

  1082     if (results->standalone_digit>10)

  1083     {

  1084 	warnings.digit=0;

  1085 	g_print("   --> %ld lines in this file contain standalone 0s and 1s. "

  1086 	  "Not reporting them.\n",results->standalone_digit);

  1087     }

  1088     /*

  1089      * If more than 20 lines contain hyphens at end,

  1090      * don't bother reporting them.

  1091      */

  1092     warnings.hyphen=1;

  1093     if (results->hyphens>20)

  1094     {

  1095 	warnings.hyphen=0;

  1096 	g_print("   --> %ld lines in this file have hyphens at end. "

  1097 	  "Not reporting them.\n",results->hyphens);

  1098     }

  1099     if (results->htmcount>20 && !pswit[MARKUP_SWITCH])

  1100     {

  1101 	g_print("   --> Looks like this is HTML. Switching HTML mode ON.\n");

  1102 	pswit[MARKUP_SWITCH]=1;

  1103     }

  1104     if (results->verylongline>0)

  1105 	g_print("   --> %ld lines in this file are VERY long!\n",

  1106 	  results->verylongline);

  1107     /*

  1108      * If there are more non-PG spaced dashes than PG em-dashes,

  1109      * assume it's deliberate.

  1110      * Current PG guidelines say don't use them, but older texts do,

  1111      * and some people insist on them whatever the guidelines say.

  1112      */

  1113     warnings.dash=1;

  1114     if (results->spacedash+results->emdash.non_PG_space>

  1115       results->emdash.PG_space)

  1116     {

  1117 	warnings.dash=0;

  1118 	g_print("   --> There are %ld spaced dashes and em-dashes. "

  1119 	  "Not reporting them.\n",

  1120 	  results->spacedash+results->emdash.non_PG_space);

  1121     }

  1122     if (charset)

  1123 	warnings.bin=0;

  1124     else

  1125     {

  1126 	/* Charset ISO_8859-1/ASCII checks for compatibility with gutcheck */

  1127 	warnings.bin=1;

  1128 	/* If more than a quarter of characters are hi-bit, bug out. */

  1129 	if (results->binlen*4>results->totlen)

  1130 	{

  1131 	    g_print("   --> This file does not appear to be ASCII. "

  1132 	      "Terminating. Best of luck with it!\n");

  1133 	    exit(1);

  1134 	}

  1135 	if (results->alphalen*4<results->totlen)

  1136 	{

  1137 	    g_print("   --> This file does not appear to be text. "

  1138 	      "Terminating. Best of luck with it!\n");

  1139 	    exit(1);

  1140 	}

  1141 	if (results->binlen*100>results->totlen || results->binlen>100)

  1142 	{

  1143 	    g_print("   --> There are a lot of foreign letters here. "

  1144 	      "Not reporting them.\n");

  1145 	    if (!pswit[VERBOSE_SWITCH])

  1146 		warnings.bin=0;

  1147 	}

  1148     }

  1149     warnings.isDutch=FALSE;

  1150     if (results->Dutchcount>50)

  1151     {

  1152 	warnings.isDutch=TRUE;

  1153 	g_print("   --> This looks like Dutch - "

  1154 	  "switching off dashes and warnings for 's Middags case.\n");

  1155     }

  1156     warnings.isFrench=FALSE;

  1157     if (results->Frenchcount>50)

  1158     {

  1159 	warnings.isFrench=TRUE;

  1160 	g_print("   --> This looks like French - "

  1161 	  "switching off some doublepunct.\n");

  1162     }

  1163     if (results->firstline && results->footerline)

  1164 	g_print("    The PG header and footer appear to be already on.\n");

  1165     else

  1166     {

  1167 	if (results->firstline)

  1168 	    g_print("    The PG header is on - no footer.\n");

  1169 	if (results->footerline)

  1170 	    g_print("    The PG footer is on - no header.\n");

  1171     }

  1172     g_print("\n");

  1173     if (pswit[VERBOSE_SWITCH])

  1174     {

  1175 	warnings.shortline=1;

  1176 	warnings.dotcomma=1;

  1177 	warnings.longline=1;

  1178 	warnings.dash=1;

  1179 	warnings.digit=1;

  1180 	warnings.ast=1;

  1181 	warnings.fslash=1;

  1182 	warnings.hyphen=1;

  1183 	warnings.endquote=1;

  1184 	g_print("   *** Verbose output is ON -- you asked for it! ***\n");

  1185     }

  1186     if (warnings.isDutch)

  1187 	warnings.dash=0;

  1188     if (results->footerline>0 && results->firstline>0 &&

  1189       results->footerline>results->firstline &&

  1190       results->footerline-results->firstline<100)

  1191     {

  1192 	g_print("   --> I don't really know where this text starts. \n");

  1193 	g_print("       There are no reference points.\n");

  1194 	g_print("       I'm going to have to report the header and footer "

  1195 	  "as well.\n");

  1196 	results->firstline=0;

  1197     }

  1198     return &warnings;

  1199 }

  1201 /*

  1202  * analyse_quotes:

  1203  *

  1204  * Look along the line, accumulate the count of quotes, and see

  1205  * if this is an empty line - i.e. a line with nothing on it

  1206  * but spaces.

  1207  * If line has just spaces, period, * and/or - on it, don't

  1208  * count it, since empty lines with asterisks or dashes to

  1209  * separate sections are common.

  1210  *

  1211  * Returns: TRUE if the line is empty.

  1212  */

  1213 gboolean analyse_quotes(const char *aline,struct counters *counters)

  1214 {

  1215     int guessquote=0;

  1216     /* assume the line is empty until proven otherwise */

  1217     gboolean isemptyline=TRUE;

  1218     const char *s=aline,*sprev,*snext;

  1219     gunichar c;

  1220     sprev=NULL;

  1221     GError *tmp_err=NULL;

  1222     while (*s)

  1223     {

  1224 	snext=g_utf8_next_char(s);

  1225 	c=g_utf8_get_char(s);

  1226 	if (CHAR_IS_DQUOTE(c))

  1227 	    (void)count_quote(counters,c,QUOTE_CLASS(c),&tmp_err);

  1228 	else if (CHAR_IS_SQUOTE(c) && pswit[SQUOTE_SWITCH])

  1229 	{

  1230 	    if (s==aline)

  1231 	    {

  1232 		/*

  1233 		 * At start of line, it can only be a quotation mark.

  1234 		 * Hardcode a very common exception!

  1235 		 */

  1236 		if (!g_str_has_prefix(snext,"tis") &&

  1237 		  !g_str_has_prefix(snext,"Tis"))

  1238 		    (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);

  1239 	    }

  1240 	    else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&

  1241 	      g_unichar_isalpha(g_utf8_get_char(snext)))

  1242 		/* Do nothing! it's definitely an apostrophe, not a quote */

  1243 		;

  1244 	    /* it's outside a word - let's check it out */

  1245 	    else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||

  1246 	      g_unichar_isalpha(g_utf8_get_char(snext)))

  1247 	    {

  1248 		/* certainly looks like a quotation mark */

  1249 		if (!g_str_has_prefix(snext,"tis") &&

  1250 		  !g_str_has_prefix(snext,"Tis"))

  1251 		    /* hardcode a very common exception! */

  1252 		{

  1253 		    if (strchr(".?!,;:",g_utf8_get_char(sprev)))

  1254 			(void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);

  1255 		    else

  1256 			(void)count_quote(counters,c,OPENING_QUOTE,&tmp_err);

  1257 		}

  1258 	    }

  1259 	    else

  1260 	    {

  1261 		/* now - is it a quotation mark? */

  1262 		guessquote=0;   /* accumulate clues */

  1263 		if (g_unichar_isalpha(g_utf8_get_char(sprev)))

  1264 		{

  1265 		    /* it follows a letter - could be either */

  1266 		    guessquote++;

  1267 		    if (g_utf8_get_char(sprev)=='s')

  1268 		    {

  1269 			/* looks like a plural apostrophe */

  1270 			guessquote-=3;

  1271 			if (g_utf8_get_char(snext)==CHAR_SPACE)

  1272 			    /* bonus marks! */

  1273 			    guessquote-=2;

  1274 		    }

  1275 		    if (innermost_quote_matches(counters,c))

  1276 			/*

  1277 			 * Give it the benefit of some doubt,

  1278 			 * if a squote is already open.

  1279 			 */

  1280 			guessquote++;

  1281 		    else

  1282 			guessquote--;

  1283 		    if (guessquote>=0)

  1284 			(void)count_quote(counters,c,CLOSING_QUOTE,&tmp_err);

  1285 		}

  1286 		else

  1287 		    /* no adjacent letter - it must be a quote of some kind */

  1288 		    (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);

  1289 	    }

  1290 	}

  1291 	if (tmp_err)

  1292 	{

  1293 	    if (pswit[ECHO_SWITCH])

  1294 		g_print("\n%s\n",aline);

  1295 	    if (!pswit[OVERVIEW_SWITCH])

  1296 		g_print("    Line %ld column %ld - %s\n",

  1297 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1,tmp_err->message);

  1298 	    g_clear_error(&tmp_err);

  1299 	}

  1300 	if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&

  1301 	  c!='\r' && c!='\n')

  1302 	    isemptyline=FALSE;  /* ignore lines like  *  *  *  as spacers */

  1303 	if (c==CHAR_UNDERSCORE)

  1304 	    counters->c_unders++;

  1305 	if (c==CHAR_OPEN_SBRACK)

  1306 	{

  1307 	    if (!matching_difference(counters,COUNTER_ILLUSTRATION) &&

  1308 	      !matching_difference(counters,c) && s==aline &&

  1309 	      g_str_has_prefix(s,"[Illustration:"))

  1310 		increment_matching(counters,COUNTER_ILLUSTRATION,TRUE);

  1311 	    else

  1312 		increment_matching(counters,c,TRUE);

  1313 	}

  1314 	else if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK)

  1315 	    increment_matching(counters,c,TRUE);

  1316 	if (c==CHAR_CLOSE_SBRACK)

  1317 	{

  1318 	    if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) &&

  1319 	      !matching_difference(counters,c) && !*snext)

  1320 		increment_matching(counters,COUNTER_ILLUSTRATION,FALSE);

  1321 	    else

  1322 		increment_matching(counters,c,FALSE);

  1323 	}

  1324 	else if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK)

  1325 	    increment_matching(counters,c,FALSE);

  1326 	sprev=s;

  1327 	s=snext;

  1328     }

  1329     return isemptyline;

  1330 }

  1332 /*

  1333  * check_for_control_characters:

  1334  *

  1335  * Check for invalid or questionable characters in the line

  1336  * Anything above 127 is invalid for plain ASCII, and

  1337  * non-printable control characters should also be flagged.

  1338  * Tabs should generally not be there.

  1339  */

  1340 void check_for_control_characters(const char *aline)

  1341 {

  1342     gunichar c;

  1343     const char *s;

  1344     for (s=aline;*s;s=g_utf8_next_char(s))

  1345     {

  1346 	c=g_utf8_get_char(s);

  1347 	if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)

  1348 	{

  1349 	    if (pswit[ECHO_SWITCH])

  1350 		g_print("\n%s\n",aline);

  1351 	    if (!pswit[OVERVIEW_SWITCH])

  1352 		g_print("    Line %ld column %ld - Control character %u\n",

  1353 		  linecnt,g_utf8_pointer_to_offset(s,aline)+1,c);

  1354 	    else

  1355 		cnt_bin++;

  1356 	}

  1357     }

  1358 }

  1360 /*

  1361  * check_for_odd_characters:

  1362  *

  1363  * Check for binary and other odd characters.

  1364  */

  1365 void check_for_odd_characters(const char *aline,const struct warnings *warnings,

  1366   gboolean isemptyline)

  1367 {

  1368     /* Don't repeat multiple warnings on one line. */

  1369     gboolean eInvalidChar=FALSE,eTab=FALSE,eTilde=FALSE;

  1370     gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;

  1371     const char *s;

  1372     gunichar c;

  1373     gsize nb;

  1374     gchar *t;

  1375     for (s=aline;*s;s=g_utf8_next_char(s))

  1376     {

  1377 	c=g_utf8_get_char(s);

  1378 	if (warnings->bin && !eInvalidChar &&

  1379 	  (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))

  1380 	{

  1381 	    if (pswit[ECHO_SWITCH])

  1382 		g_print("\n%s\n",aline);

  1383 	    if (!pswit[OVERVIEW_SWITCH])

  1384 		if (c>127 && c<160 || c>255)

  1385 		    g_print("    Line %ld column %ld - "

  1386 		      "Non-ISO-8859 character %u\n",

  1387 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);

  1388 		else

  1389 		    g_print("    Line %ld column %ld - "

  1390 		      "Non-ASCII character %u\n",

  1391 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);

  1392 	    else

  1393 		cnt_bin++;

  1394 	    eInvalidChar=TRUE;

  1395 	}

  1396 	if (!eInvalidChar && charset)

  1397 	{

  1398 	    if (charset_validator==(GIConv)-1)

  1399 	    {

  1400 		if (!g_unichar_isdefined(c))

  1401 		{

  1402 		    if (pswit[ECHO_SWITCH])

  1403 			g_print("\n%s\n",aline);

  1404 		    if (!pswit[OVERVIEW_SWITCH])

  1405 			g_print("    Line %ld column %ld - Unassigned UNICODE "

  1406 			  "code point U+%04" G_GINT32_MODIFIER "X\n",

  1407 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);

  1408 		    else

  1409 			cnt_bin++;

  1410 		    eInvalidChar=TRUE;

  1411 		}

  1412 		else if (c>=0xE000 && c<=0xF8FF || c>=0xF0000 && c<=0xFFFFD ||

  1413 		  c>=100000 && c<=0x10FFFD)

  1414 		{

  1415 		    if (pswit[ECHO_SWITCH])

  1416 			g_print("\n%s\n",aline);

  1417 		    if (!pswit[OVERVIEW_SWITCH])

  1418 			g_print("    Line %ld column %ld - Private Use "

  1419 			  "character U+%04" G_GINT32_MODIFIER "X\n",

  1420 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);

  1421 		    else

  1422 			cnt_bin++;

  1423 		    eInvalidChar=TRUE;

  1424 		}

  1425 	    }

  1426 	    else

  1427 	    {

  1428 		t=g_convert_with_iconv(s,g_utf8_next_char(s)-s,

  1429 		  charset_validator,NULL,&nb,NULL);

  1430 		if (t)

  1431 		    g_free(t);

  1432 		else

  1433 		{

  1434 		    if (pswit[ECHO_SWITCH])

  1435 			g_print("\n%s\n",aline);

  1436 		    if (!pswit[OVERVIEW_SWITCH])

  1437 			g_print("    Line %ld column %ld - Non-%s "

  1438 			  "character %u\n",linecnt,

  1439 			  g_utf8_pointer_to_offset(aline,s)+1,charset,c);

  1440 		    else

  1441 			cnt_bin++;

  1442 		    eInvalidChar=TRUE;

  1443 		}

  1444 	    }

  1445 	}

  1446 	if (!eTab && c==CHAR_TAB)

  1447 	{

  1448 	    if (pswit[ECHO_SWITCH])

  1449 		g_print("\n%s\n",aline);

  1450 	    if (!pswit[OVERVIEW_SWITCH])

  1451 		g_print("    Line %ld column %ld - Tab character?\n",

  1452 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1453 	    else

  1454 		cnt_odd++;

  1455 	    eTab=TRUE;

  1456 	}

  1457 	if (!eTilde && c==CHAR_TILDE)

  1458 	{

  1459 	    /*

  1460 	     * Often used by OCR software to indicate an

  1461 	     * unrecognizable character.

  1462 	     */

  1463 	    if (pswit[ECHO_SWITCH])

  1464 		g_print("\n%s\n",aline);

  1465 	    if (!pswit[OVERVIEW_SWITCH])

  1466 		g_print("    Line %ld column %ld - Tilde character?\n",

  1467 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1468 	    else

  1469 		cnt_odd++;

  1470 	    eTilde=TRUE;

  1471 	}

  1472 	if (!eCarat && c==CHAR_CARAT)

  1473 	{

  1474 	    if (pswit[ECHO_SWITCH])

  1475 		g_print("\n%s\n",aline);

  1476 	    if (!pswit[OVERVIEW_SWITCH])

  1477 		g_print("    Line %ld column %ld - Carat character?\n",

  1478 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1479 	    else

  1480 		cnt_odd++;

  1481 	    eCarat=TRUE;

  1482 	}

  1483 	if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)

  1484 	{

  1485 	    if (pswit[ECHO_SWITCH])

  1486 		g_print("\n%s\n",aline);

  1487 	    if (!pswit[OVERVIEW_SWITCH])

  1488 		g_print("    Line %ld column %ld - Forward slash?\n",

  1489 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1490 	    else

  1491 		cnt_odd++;

  1492 	    eFSlash=TRUE;

  1493 	}

  1494 	/*

  1495 	 * Report asterisks only in paranoid mode,

  1496 	 * since they're often deliberate.

  1497 	 */

  1498 	if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&

  1499 	  c==CHAR_ASTERISK)

  1500 	{

  1501 	    if (pswit[ECHO_SWITCH])

  1502 		g_print("\n%s\n",aline);

  1503 	    if (!pswit[OVERVIEW_SWITCH])

  1504 		g_print("    Line %ld column %ld - Asterisk?\n",

  1505 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1506 	    else

  1507 		cnt_odd++;

  1508 	    eAst=TRUE;

  1509 	}

  1510     }

  1511 }

  1513 /*

  1514  * check_for_long_line:

  1515  *

  1516  * Check for line too long.

  1517  */

  1518 void check_for_long_line(const char *aline)

  1519 {

  1520     if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)

  1521     {

  1522 	if (pswit[ECHO_SWITCH])

  1523 	    g_print("\n%s\n",aline);

  1524 	if (!pswit[OVERVIEW_SWITCH])

  1525 	    g_print("    Line %ld column %ld - Long line %ld\n",

  1526 	      linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));

  1527 	else

  1528 	    cnt_long++;

  1529     }

  1530 }

  1532 /*

  1533  * check_for_short_line:

  1534  *

  1535  * Check for line too short.

  1536  *

  1537  * This one is a bit trickier to implement: we don't want to

  1538  * flag the last line of a paragraph for being short, so we

  1539  * have to wait until we know that our current line is a

  1540  * "normal" line, then report the _previous_ line if it was too

  1541  * short. We also don't want to report indented lines like

  1542  * chapter heads or formatted quotations. We therefore keep

  1543  * last->len as the length of the last line examined, and

  1544  * last->blen as the length of the last but one, and try to

  1545  * suppress unnecessary warnings by checking that both were of

  1546  * "normal" length. We keep the first character of the last

  1547  * line in last->start, and if it was a space, we assume that

  1548  * the formatting is deliberate. I can't figure out a way to

  1549  * distinguish something like a quoted verse left-aligned or

  1550  * the header or footer of a letter from a paragraph of short

  1551  * lines - maybe if I examined the whole paragraph, and if the

  1552  * para has less than, say, 8 lines and if all lines are short,

  1553  * then just assume it's OK? Need to look at some texts to see

  1554  * how often a formula like this would get the right result.

  1555  */

  1556 void check_for_short_line(const char *aline,const struct line_properties *last)

  1557 {

  1558     if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&

  1559       last->len<SHORTEST_PG_LINE && last->blen>1 &&

  1560       last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)

  1561     {

  1562 	if (pswit[ECHO_SWITCH])

  1563 	    g_print("\n%s\n",prevline);

  1564 	if (!pswit[OVERVIEW_SWITCH])

  1565 	    g_print("    Line %ld column %ld - Short line %ld?\n",

  1566 	      linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));

  1567 	else

  1568 	    cnt_short++;

  1569     }

  1570 }

  1572 /*

  1573  * check_for_starting_punctuation:

  1574  *

  1575  * Look for punctuation other than full ellipses at start of line.

  1576  */

  1577 void check_for_starting_punctuation(const char *aline)

  1578 {

  1579     if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&

  1580       !g_str_has_prefix(aline,". . ."))

  1581     {

  1582 	if (pswit[ECHO_SWITCH])

  1583 	    g_print("\n%s\n",aline);

  1584 	if (!pswit[OVERVIEW_SWITCH])

  1585 	    g_print("    Line %ld column 1 - Begins with punctuation?\n",

  1586 	      linecnt);

  1587 	else

  1588 	    cnt_punct++;

  1589     }

  1590 }

  1592 /*

  1593  * str_emdash:

  1594  *

  1595  * Find the first em-dash, return a pointer to it and set <next> to the

  1596  * character following the dash.

  1597  */

  1598 char *str_emdash(const char *s,const char **next)

  1599 {

  1600     const char *s1,*s2;

  1601     s1=strstr(s,"--");

  1602     s2=strstr(s,"—");

  1603     if (!s1)

  1604     {

  1605 	if (s2)

  1606 	    *next=g_utf8_next_char(s2);

  1607 	return (char *)s2;

  1608     }

  1609     else if (!s2)

  1610     {

  1611 	*next=g_utf8_next_char(g_utf8_next_char(s1));

  1612 	return (char *)s1;

  1613     }

  1614     else if (s1<s2)

  1615     {

  1616 	*next=g_utf8_next_char(g_utf8_next_char(s1));

  1617 	return (char *)s1;

  1618     }

  1619     else

  1620     {

  1621 	*next=g_utf8_next_char(s2);

  1622 	return (char *)s2;

  1623     }

  1624 }

  1626 /*

  1627  * check_for_spaced_emdash:

  1628  *

  1629  * Check for spaced em-dashes.

  1630  *

  1631  * We must check _all_ occurrences of em-dashes on the line

  1632  * hence the loop - even if the first dash is OK

  1633  * there may be another that's wrong later on.

  1634  */

  1635 void check_for_spaced_emdash(const char *aline)

  1636 {

  1637     const char *s,*t,*next;

  1638     for (s=aline;t=str_emdash(s,&next);s=next)

  1639     {

  1640 	if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||

  1641 	  g_utf8_get_char(next)==CHAR_SPACE)

  1642 	{

  1643 	    if (pswit[ECHO_SWITCH])

  1644 		g_print("\n%s\n",aline);

  1645 	    if (!pswit[OVERVIEW_SWITCH])

  1646 		g_print("    Line %ld column %ld - Spaced em-dash?\n",

  1647 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);

  1648 	    else

  1649 		cnt_dash++;

  1650 	}

  1651     }

  1652 }

  1654 /*

  1655  * check_for_spaced_dash:

  1656  *

  1657  * Check for spaced dashes.

  1658  */

  1659 void check_for_spaced_dash(const char *aline)

  1660 {

  1661     const char *s;

  1662     if ((s=strstr(aline," -")))

  1663     {

  1664 	if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')

  1665 	{

  1666 	    if (pswit[ECHO_SWITCH])

  1667 		g_print("\n%s\n",aline);

  1668 	    if (!pswit[OVERVIEW_SWITCH])

  1669 		g_print("    Line %ld column %ld - Spaced dash?\n",

  1670 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1671 	    else

  1672 		cnt_dash++;

  1673 	}

  1674     }

  1675     else if ((s=strstr(aline,"- ")))

  1676     {

  1677 	if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')

  1678 	{

  1679 	    if (pswit[ECHO_SWITCH])

  1680 		g_print("\n%s\n",aline);

  1681 	    if (!pswit[OVERVIEW_SWITCH])

  1682 		g_print("    Line %ld column %ld - Spaced dash?\n",

  1683 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1684 	    else

  1685 		cnt_dash++;

  1686 	}

  1687     }

  1688 }

  1690 /*

  1691  * check_for_unmarked_paragraphs:

  1692  *

  1693  * Check for unmarked paragraphs indicated by separate speakers.

  1694  *

  1695  * May well be false positive:

  1696  * "Bravo!" "Wonderful!" called the crowd.

  1697  * but useful all the same.

  1698  */

  1699 void check_for_unmarked_paragraphs(const char *aline)

  1700 {

  1701     const char *s;

  1702     s=strstr(aline,"\"  \"");

  1703     if (!s)

  1704 	s=strstr(aline,"\" \"");

  1705     if (s)

  1706     {

  1707 	if (pswit[ECHO_SWITCH])

  1708 	    g_print("\n%s\n",aline);

  1709 	if (!pswit[OVERVIEW_SWITCH])

  1710 	    g_print("    Line %ld column %ld - "

  1711 	      "Query missing paragraph break?\n",

  1712 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1713 	else

  1714 	    cnt_punct++;

  1715     }

  1716 }

  1718 /*

  1719  * check_for_jeebies:

  1720  *

  1721  * Check for "to he" and other easy h/b errors.

  1722  *

  1723  * This is a very inadequate effort on the h/b problem,

  1724  * but the phrase "to he" is always an error, whereas "to

  1725  * be" is quite common.

  1726  * Similarly, '"Quiet!", be said.' is a non-be error

  1727  * "to he" is _not_ always an error!:

  1728  *       "Where they went to he couldn't say."

  1729  * Another false positive:

  1730  *       What would "Cinderella" be without the . . .

  1731  * and another: "If he wants to he can see for himself."

  1732  */

  1733 void check_for_jeebies(const char *aline)

  1734 {

  1735     const char *s;

  1736     s=strstr(aline," be could ");

  1737     if (!s)

  1738 	s=strstr(aline," be would ");

  1739     if (!s)

  1740 	s=strstr(aline," was be ");

  1741     if (!s)

  1742 	s=strstr(aline," be is ");

  1743     if (!s)

  1744 	s=strstr(aline," is be ");

  1745     if (!s)

  1746 	s=strstr(aline,"\", be ");

  1747     if (!s)

  1748 	s=strstr(aline,"\" be ");

  1749     if (!s)

  1750 	s=strstr(aline,"\" be ");

  1751     if (!s)

  1752 	s=strstr(aline," to he ");

  1753     if (s)

  1754     {

  1755 	if (pswit[ECHO_SWITCH])

  1756 	    g_print("\n%s\n",aline);

  1757 	if (!pswit[OVERVIEW_SWITCH])

  1758 	    g_print("    Line %ld column %ld - Query he/be error?\n",

  1759 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1760 	else

  1761 	    cnt_word++;

  1762     }

  1763     s=strstr(aline," the had ");

  1764     if (!s)

  1765 	s=strstr(aline," a had ");

  1766     if (!s)

  1767 	s=strstr(aline," they bad ");

  1768     if (!s)

  1769 	s=strstr(aline," she bad ");

  1770     if (!s)

  1771 	s=strstr(aline," he bad ");

  1772     if (!s)

  1773 	s=strstr(aline," you bad ");

  1774     if (!s)

  1775 	s=strstr(aline," i bad ");

  1776     if (s)

  1777     {

  1778 	if (pswit[ECHO_SWITCH])

  1779 	    g_print("\n%s\n",aline);

  1780 	if (!pswit[OVERVIEW_SWITCH])

  1781 	    g_print("    Line %ld column %ld - Query had/bad error?\n",

  1782 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1783 	else

  1784 	    cnt_word++;

  1785     }

  1786     s=strstr(aline,"; hut ");

  1787     if (!s)

  1788 	s=strstr(aline,", hut ");

  1789     if (s)

  1790     {

  1791 	if (pswit[ECHO_SWITCH])

  1792 	    g_print("\n%s\n",aline);

  1793 	if (!pswit[OVERVIEW_SWITCH])

  1794 	    g_print("    Line %ld column %ld - Query hut/but error?\n",

  1795 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1796 	else

  1797 	    cnt_word++;

  1798     }

  1799 }

  1801 /*

  1802  * check_for_mta_from:

  1803  *

  1804  * Special case - angled bracket in front of "From" placed there by an

  1805  * MTA when sending an e-mail.

  1806  */

  1807 void check_for_mta_from(const char *aline)

  1808 {

  1809     const char *s;

  1810     s=strstr(aline,">From");

  1811     if (s)

  1812     {

  1813 	if (pswit[ECHO_SWITCH])

  1814 	    g_print("\n%s\n",aline);

  1815 	if (!pswit[OVERVIEW_SWITCH])

  1816 	    g_print("    Line %ld column %ld - "

  1817 	      "Query angled bracket with From\n",

  1818 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1819 	else

  1820 	    cnt_punct++;

  1821     }

  1822 }

  1824 /*

  1825  * check_for_orphan_character:

  1826  *

  1827  * Check for a single character line -

  1828  * often an overflow from bad wrapping.

  1829  */

  1830 void check_for_orphan_character(const char *aline)

  1831 {

  1832     gunichar c;

  1833     c=g_utf8_get_char(aline);

  1834     if (c && !*g_utf8_next_char(aline))

  1835     {

  1836 	if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))

  1837 	    ; /* Nothing - ignore numerals alone on a line. */

  1838 	else

  1839 	{

  1840 	    if (pswit[ECHO_SWITCH])

  1841 		g_print("\n%s\n",aline);

  1842 	    if (!pswit[OVERVIEW_SWITCH])

  1843 		g_print("    Line %ld column 1 - Query single character line\n",

  1844 		  linecnt);

  1845 	    else

  1846 		cnt_punct++;

  1847 	}

  1848     }

  1849 }

  1851 /*

  1852  * check_for_pling_scanno:

  1853  *

  1854  * Check for I" - often should be !

  1855  */

  1856 void check_for_pling_scanno(const char *aline)

  1857 {

  1858     const char *s;

  1859     s=strstr(aline," I\"");

  1860     if (s)

  1861     {

  1862 	if (pswit[ECHO_SWITCH])

  1863 	    g_print("\n%s\n",aline);

  1864 	if (!pswit[OVERVIEW_SWITCH])

  1865 	    g_print("    Line %ld column %ld - Query I=exclamation mark?\n",

  1866 	      linecnt,g_utf8_pointer_to_offset(aline,s));

  1867 	else

  1868 	    cnt_punct++;

  1869     }

  1870 }

  1872 /*

  1873  * check_for_extra_period:

  1874  *

  1875  * Check for period without a capital letter. Cut-down from gutspell.

  1876  * Only works when it happens on a single line.

  1877  */

  1878 void check_for_extra_period(const char *aline,const struct warnings *warnings)

  1879 {

  1880     const char *s,*t,*s1,*sprev;

  1881     int i;

  1882     gsize len;

  1883     gboolean istypo;

  1884     gchar *testword;

  1885     gunichar c,nc,pc,*decomposition;

  1886     if (pswit[PARANOID_SWITCH])

  1887     {

  1888 	for (t=aline;t=strstr(t,". ");)

  1889 	{

  1890 	    if (t==aline)

  1891 	    {

  1892 		t=g_utf8_next_char(t);

  1893 		/* start of line punctuation is handled elsewhere */

  1894 		continue;

  1895 	    }

  1896 	    if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))

  1897 	    {

  1898 		t=g_utf8_next_char(t);

  1899 		continue;

  1900 	    }

  1901 	    if (warnings->isDutch)

  1902 	    {

  1903 		/* For Frank & Jeroen -- 's Middags case */

  1904 		gunichar c2,c3,c4,c5;

  1905 		c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));

  1906 		c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));

  1907 		c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));

  1908 		c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));

  1909 		if (CHAR_IS_APOSTROPHE(c2) &&

  1910 		  g_unichar_islower(c3) && c4==CHAR_SPACE &&

  1911 		  g_unichar_isupper(c5))

  1912 		{

  1913 		    t=g_utf8_next_char(t);

  1914 		    continue;

  1915 		}

  1916 	    }

  1917 	    s1=g_utf8_next_char(g_utf8_next_char(t));

  1918 	    while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&

  1919 	      !g_unichar_isdigit(g_utf8_get_char(s1)))

  1920 		s1=g_utf8_next_char(s1);

  1921 	    if (g_unichar_islower(g_utf8_get_char(s1)))

  1922 	    {

  1923 		/* we have something to investigate */

  1924 		istypo=TRUE;

  1925 		/* so let's go back and find out */

  1926 		nc=g_utf8_get_char(t);

  1927 		s1=g_utf8_prev_char(t);

  1928 		c=g_utf8_get_char(s1);

  1929 		sprev=g_utf8_prev_char(s1);

  1930 		pc=g_utf8_get_char(sprev);

  1931 		while (s1>=aline &&

  1932 		  (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||

  1933 		  g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&

  1934 		  g_unichar_isalpha(nc)))

  1935 		{

  1936 		    nc=c;

  1937 		    s1=sprev;

  1938 		    c=pc;

  1939 		    sprev=g_utf8_prev_char(s1);

  1940 		    pc=g_utf8_get_char(sprev);

  1941 		}

  1942 		s1=g_utf8_next_char(s1);

  1943 		s=strchr(s1,'.');

  1944 		if (s)

  1945 		    testword=g_strndup(s1,s-s1);

  1946 		else

  1947 		    testword=g_strdup(s1);

  1948 		for (i=0;*abbrev[i];i++)

  1949 		    if (!strcmp(testword,abbrev[i]))

  1950 			istypo=FALSE;

  1951 		if (g_unichar_isdigit(g_utf8_get_char(testword)))

  1952 		    istypo=FALSE;

  1953 		if (!*g_utf8_next_char(testword))

  1954 		    istypo=FALSE;

  1955 		if (isroman(testword))

  1956 		    istypo=FALSE;

  1957 		if (istypo)

  1958 		{

  1959 		    istypo=FALSE;

  1960 		    for (s=testword;*s;s=g_utf8_next_char(s))

  1961 		    {

  1962 			decomposition=g_unicode_canonical_decomposition(

  1963 			  g_utf8_get_char(s),&len);

  1964 			if (g_utf8_strchr("aeiou",-1,decomposition[0]))

  1965 			    istypo=TRUE;

  1966 			g_free(decomposition);

  1967 		    }

  1968 		}

  1969 		if (istypo &&

  1970 		  (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))

  1971 		{

  1972 		    g_tree_insert(qperiod,g_strdup(testword),

  1973 		      GINT_TO_POINTER(1));

  1974 		    if (pswit[ECHO_SWITCH])

  1975 			g_print("\n%s\n",aline);

  1976 		    if (!pswit[OVERVIEW_SWITCH])

  1977 			g_print("    Line %ld column %ld - Extra period?\n",

  1978 			  linecnt,g_utf8_pointer_to_offset(aline,t)+1);

  1979 		    else

  1980 			cnt_punct++;

  1981 		}

  1982 		g_free(testword);

  1983 	    }

  1984 	    t=g_utf8_next_char(t);

  1985 	}

  1986     }

  1987 }

  1989 /*

  1990  * check_for_following_punctuation:

  1991  *

  1992  * Check for words usually not followed by punctuation.

  1993  */

  1994 void check_for_following_punctuation(const char *aline)

  1995 {

  1996     int i;

  1997     const char *s,*wordstart;

  1998     gunichar c;

  1999     gchar *inword,*t;

  2000     if (pswit[TYPO_SWITCH])

  2001     {

  2002 	for (s=aline;*s;)

  2003 	{

  2004 	    wordstart=s;

  2005 	    t=getaword(&s);

  2006 	    if (!*t)

  2007 	    {

  2008 		g_free(t);

  2009 		continue;

  2010 	    }

  2011 	    inword=g_utf8_strdown(t,-1);

  2012 	    g_free(t);

  2013 	    for (i=0;*nocomma[i];i++)

  2014 		if (!strcmp(inword,nocomma[i]))

  2015 		{

  2016 		    c=g_utf8_get_char(s);

  2017 		    if (c==',' || c==';' || c==':')

  2018 		    {

  2019 			if (pswit[ECHO_SWITCH])

  2020 			    g_print("\n%s\n",aline);

  2021 			if (!pswit[OVERVIEW_SWITCH])

  2022 			    g_print("    Line %ld column %ld - "

  2023 			      "Query punctuation after %s?\n",

  2024 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,

  2025 			      inword);

  2026 			else

  2027 			    cnt_punct++;

  2028 		    }

  2029 		}

  2030 	    for (i=0;*noperiod[i];i++)

  2031 		if (!strcmp(inword,noperiod[i]))

  2032 		{

  2033 		    c=g_utf8_get_char(s);

  2034 		    if (c=='.' || c=='!')

  2035 		    {

  2036 			if (pswit[ECHO_SWITCH])

  2037 			    g_print("\n%s\n",aline);

  2038 			if (!pswit[OVERVIEW_SWITCH])

  2039 			    g_print("    Line %ld column %ld - "

  2040 			      "Query punctuation after %s?\n",

  2041 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,

  2042 			      inword);

  2043 			else

  2044 			    cnt_punct++;

  2045 		    }

  2046 		}

  2047 	    g_free(inword);

  2048 	}

  2049     }

  2050 }

  2052 /*

  2053  * check_for_typos:

  2054  *

  2055  * Check for commonly mistyped words,

  2056  * and digits like 0 for O in a word.

  2057  */

  2058 void check_for_typos(const char *aline,struct warnings *warnings)

  2059 {

  2060     const char *s,*t,*nt,*wordstart;

  2061     gchar *inword;

  2062     gunichar *decomposition;

  2063     gchar *testword;

  2064     int i,vowel,consonant,*dupcnt;

  2065     gboolean isdup,istypo,alower;

  2066     gunichar c,pc;

  2067     long offset,len;

  2068     gsize decomposition_len;

  2069     for (s=aline;*s;)

  2070     {

  2071 	wordstart=s;

  2072 	inword=getaword(&s);

  2073 	if (!*inword)

  2074 	{

  2075 	    g_free(inword);

  2076 	    continue; /* don't bother with empty lines */

  2077 	}

  2078 	if (mixdigit(inword))

  2079 	{

  2080 	    if (pswit[ECHO_SWITCH])

  2081 		g_print("\n%s\n",aline);

  2082 	    if (!pswit[OVERVIEW_SWITCH])

  2083 		g_print("    Line %ld column %ld - Query digit in %s\n",

  2084 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);

  2085 	    else

  2086 		cnt_word++;

  2087 	}

  2088 	/*

  2089 	 * Put the word through a series of tests for likely typos and OCR

  2090 	 * errors.

  2091 	 */

  2092 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])

  2093 	{

  2094 	    istypo=FALSE;

  2095 	    alower=FALSE;

  2096 	    for (t=inword;*t;t=g_utf8_next_char(t))

  2097 	    {

  2098 		c=g_utf8_get_char(t);

  2099 		nt=g_utf8_next_char(t);

  2100 		/* lowercase for testing */

  2101 		if (g_unichar_islower(c))

  2102 		    alower=TRUE;

  2103 		if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))

  2104 		{

  2105 		    /*

  2106 		     * We have an uppercase mid-word. However, there are

  2107 		     * common cases:

  2108 		     *   Mac and Mc like McGill

  2109 		     *   French contractions like l'Abbe

  2110 		     */

  2111 		    offset=g_utf8_pointer_to_offset(inword,t);

  2112 		    if (offset>0)

  2113 			pc=g_utf8_get_char(g_utf8_prev_char(t));

  2114 		    else

  2115 			pc='\0';

  2116 		    if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||

  2117 		      offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&

  2118 		      g_utf8_get_char(g_utf8_next_char(nt))=='c' ||

  2119 		      CHAR_IS_APOSTROPHE(pc))

  2120 			; /* do nothing! */

  2121 		    else

  2122 			istypo=TRUE;

  2123 		}

  2124 	    }

  2125 	    testword=g_utf8_casefold(inword,-1);

  2126 	}

  2127 	if (pswit[TYPO_SWITCH])

  2128 	{

  2129 	    /*

  2130 	     * Check for certain unlikely two-letter combinations at word

  2131 	     * start and end.

  2132 	     */

  2133 	    len=g_utf8_strlen(testword,-1);

  2134 	    if (len>1)

  2135 	    {

  2136 		for (i=0;*nostart[i];i++)

  2137 		    if (g_str_has_prefix(testword,nostart[i]))

  2138 			istypo=TRUE;

  2139 		for (i=0;*noend[i];i++)

  2140 		    if (g_str_has_suffix(testword,noend[i]))

  2141 			istypo=TRUE;

  2142 	    }

  2143 	    /* ght is common, gbt never. Like that. */

  2144 	    if (strstr(testword,"cb"))

  2145 		istypo=TRUE;

  2146 	    if (strstr(testword,"gbt"))

  2147 		istypo=TRUE;

  2148 	    if (strstr(testword,"pbt"))

  2149 		istypo=TRUE;

  2150 	    if (strstr(testword,"tbs"))

  2151 		istypo=TRUE;

  2152 	    if (strstr(testword,"mrn"))

  2153 		istypo=TRUE;

  2154 	    if (strstr(testword,"ahle"))

  2155 		istypo=TRUE;

  2156 	    if (strstr(testword,"ihle"))

  2157 		istypo=TRUE;

  2158 	    /*

  2159 	     * "TBE" does happen - like HEARTBEAT - but uncommon.

  2160 	     * Also "TBI" - frostbite, outbid - but uncommon.

  2161 	     * Similarly "ii" like Hawaii, or Pompeii, and in Roman

  2162 	     * numerals, but "ii" is a common scanno.

  2163 	     */

  2164 	    if (strstr(testword,"tbi"))

  2165 		istypo=TRUE;

  2166 	    if (strstr(testword,"tbe"))

  2167 		istypo=TRUE;

  2168 	    if (strstr(testword,"ii"))

  2169 		istypo=TRUE;

  2170 	    /*

  2171 	     * Check for no vowels or no consonants.

  2172 	     * If none, flag a typo.

  2173 	     */

  2174 	    if (!istypo && len>1)

  2175 	    {

  2176 		vowel=consonant=0;

  2177 		for (t=testword;*t;t=g_utf8_next_char(t))

  2178 		{

  2179 		    c=g_utf8_get_char(t);

  2180 		    decomposition=

  2181 		      g_unicode_canonical_decomposition(c,&decomposition_len);

  2182 		    if (c=='y' || g_unichar_isdigit(c))

  2183 		    {

  2184 			/* Yah, this is loose. */

  2185 			vowel++;

  2186 			consonant++;

  2187 		    }

  2188 		    else if (g_utf8_strchr("aeiou",-1,decomposition[0]))

  2189 			vowel++;

  2190 		    else

  2191 			consonant++;

  2192 		    g_free(decomposition);

  2193 		}

  2194 		if (!vowel || !consonant)

  2195 		    istypo=TRUE;

  2196 	    }

  2197 	    /*

  2198 	     * Now exclude the word from being reported if it's in

  2199 	     * the okword list.

  2200 	     */

  2201 	    for (i=0;*okword[i];i++)

  2202 		if (!strcmp(testword,okword[i]))

  2203 		    istypo=FALSE;

  2204 	    /*

  2205 	     * What looks like a typo may be a Roman numeral.

  2206 	     * Exclude these.

  2207 	     */

  2208 	    if (istypo && isroman(testword))

  2209 		istypo=FALSE;

  2210 	    /* Check the manual list of typos. */

  2211 	    if (!istypo)

  2212 		for (i=0;*typo[i];i++)

  2213 		    if (!strcmp(testword,typo[i]))

  2214 			istypo=TRUE;

  2215 	    /*

  2216 	     * Check lowercase s, l, i and m - special cases.

  2217 	     *   "j" - often a semi-colon gone wrong.

  2218 	     *   "d" for a missing apostrophe - he d

  2219 	     *   "n" for "in"

  2220 	     */

  2221 	    if (!istypo && len==1 &&

  2222 	      g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))

  2223 		istypo=TRUE;

  2224 	    if (istypo)

  2225 	    {

  2226 		dupcnt=g_tree_lookup(qword,testword);

  2227 		if (dupcnt)

  2228 		{

  2229 		    (*dupcnt)++;

  2230 		    isdup=!pswit[VERBOSE_SWITCH];

  2231 		}

  2232 		else

  2233 		{

  2234 		    dupcnt=g_new0(int,1);

  2235 		    g_tree_insert(qword,g_strdup(testword),dupcnt);

  2236 		    isdup=FALSE;

  2237 		}

  2238 		if (!isdup)

  2239 		{

  2240 		    if (pswit[ECHO_SWITCH])

  2241 			g_print("\n%s\n",aline);

  2242 		    if (!pswit[OVERVIEW_SWITCH])

  2243 		    {

  2244 			g_print("    Line %ld column %ld - Query word %s",

  2245 			  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,

  2246 			  inword);

  2247 			if (!pswit[VERBOSE_SWITCH])

  2248 			    g_print(" - not reporting duplicates");

  2249 			g_print("\n");

  2250 		    }

  2251 		    else

  2252 			cnt_word++;

  2253 		}

  2254 	    }

  2255 	}

  2256 	/* check the user's list of typos */

  2257 	if (!istypo && usertypo && g_tree_lookup(usertypo,testword))

  2258 	{

  2259 	    if (pswit[ECHO_SWITCH])

  2260 		g_print("\n%s\n",aline);

  2261 	    if (!pswit[OVERVIEW_SWITCH])

  2262 		g_print("    Line %ld column %ld - Query possible scanno %s\n",

  2263 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);

  2264 	}

  2265 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])

  2266 	    g_free(testword);

  2267 	if (pswit[PARANOID_SWITCH] && warnings->digit)

  2268 	{

  2269 	    /* In paranoid mode, query all 0 and 1 standing alone. */

  2270 	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))

  2271 	    {

  2272 		if (pswit[ECHO_SWITCH])

  2273 		    g_print("\n%s\n",aline);

  2274 		if (!pswit[OVERVIEW_SWITCH])

  2275 		    g_print("    Line %ld column %ld - Query standalone %s\n",

  2276 		      linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,

  2277 		      inword);

  2278 		else

  2279 		    cnt_word++;

  2280 	    }

  2281 	}

  2282 	g_free(inword);

  2283     }

  2284 }

  2286 /*

  2287  * check_for_misspaced_punctuation:

  2288  *

  2289  * Look for added or missing spaces around punctuation and quotes.

  2290  * If there is a punctuation character like ! with no space on

  2291  * either side, suspect a missing!space. If there are spaces on

  2292  * both sides , assume a typo. If we see a double quote with no

  2293  * space or punctuation on either side of it, assume unspaced

  2294  * quotes "like"this.

  2295  */

  2296 void check_for_misspaced_punctuation(const char *aline,

  2297   struct parities *parities,gboolean isemptyline)

  2298 {

  2299     gboolean isacro,isellipsis;

  2300     const char *s;

  2301     gunichar c,nc,pc,n2c;

  2302     int parity;

  2303     c=g_utf8_get_char(aline);

  2304     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  2305     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  2306     {

  2307 	pc=c;

  2308 	c=nc;

  2309 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2310 	/* For each character in the line after the first. */

  2311 	if (g_utf8_strchr(".?!,;:_",-1,c))  /* if it's punctuation */

  2312 	{

  2313 	    /* we need to suppress warnings for acronyms like M.D. */

  2314 	    isacro=FALSE;

  2315 	    /* we need to suppress warnings for ellipsis . . . */

  2316 	    isellipsis=FALSE;

  2317 	    /*

  2318 	     * If there are letters on both sides of it or

  2319 	     * if it's strict punctuation followed by an alpha.

  2320 	     */

  2321 	    if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||

  2322 	      g_utf8_strchr("?!,;:",-1,c)))

  2323 	    {

  2324 		if (c=='.')

  2325 		{

  2326 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&

  2327 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')

  2328 			isacro=TRUE;

  2329 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));

  2330 		    if (nc && n2c=='.')

  2331 			isacro=TRUE;

  2332 		}

  2333 		if (!isacro)

  2334 		{

  2335 		    if (pswit[ECHO_SWITCH])

  2336 			g_print("\n%s\n",aline);

  2337 		    if (!pswit[OVERVIEW_SWITCH])

  2338 			g_print("    Line %ld column %ld - Missing space?\n",

  2339 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2340 		    else

  2341 			cnt_punct++;

  2342 		}

  2343 	    }

  2344 	    if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))

  2345 	    {

  2346 		/*

  2347 		 * If there are spaces on both sides,

  2348 		 * or space before and end of line.

  2349 		 */

  2350 		if (c=='.')

  2351 		{

  2352 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&

  2353 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')

  2354 			isellipsis=TRUE;

  2355 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));

  2356 		    if (nc && n2c=='.')

  2357 			isellipsis=TRUE;

  2358 		}

  2359 		if (!isemptyline && !isellipsis)

  2360 		{

  2361 		    if (pswit[ECHO_SWITCH])

  2362 			g_print("\n%s\n",aline);

  2363 		    if (!pswit[OVERVIEW_SWITCH])

  2364 			g_print("    Line %ld column %ld - "

  2365 			  "Spaced punctuation?\n",linecnt,

  2366 			  g_utf8_pointer_to_offset(aline,s)+1);

  2367 		    else

  2368 			cnt_punct++;

  2369 		}

  2370 	    }

  2371 	}

  2372     }

  2373     /* Split out the characters that CANNOT be preceded by space. */

  2374     c=g_utf8_get_char(aline);

  2375     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  2376     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  2377     {

  2378 	pc=c;

  2379 	c=nc;

  2380 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2381 	/* for each character in the line after the first */

  2382 	if (g_utf8_strchr("?!,;:",-1,c))

  2383 	{

  2384 	    /* if it's punctuation that _cannot_ have a space before it */

  2385 	    if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)

  2386 	    {

  2387 		/*

  2388 		 * If nc DOES == space,

  2389 		 * it was already reported just above.

  2390 		 */

  2391 		if (pswit[ECHO_SWITCH])

  2392 		    g_print("\n%s\n",aline);

  2393 		if (!pswit[OVERVIEW_SWITCH])

  2394 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",

  2395 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2396 		else

  2397 		    cnt_punct++;

  2398 	    }

  2399 	}

  2400     }

  2401     /*

  2402      * Special case " .X" where X is any alpha.

  2403      * This plugs a hole in the acronym code above.

  2404      * Inelegant, but maintainable.

  2405      */

  2406     c=g_utf8_get_char(aline);

  2407     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  2408     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  2409     {

  2410 	pc=c;

  2411 	c=nc;

  2412 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2413 	/* for each character in the line after the first */

  2414 	if (c=='.')

  2415 	{

  2416 	    /* if it's a period */

  2417 	    if (pc==CHAR_SPACE && g_unichar_isalpha(nc))

  2418 	    {

  2419 		/*

  2420 		 * If the period follows a space and

  2421 		 * is followed by a letter.

  2422 		 */

  2423 		if (pswit[ECHO_SWITCH])

  2424 		    g_print("\n%s\n",aline);

  2425 		if (!pswit[OVERVIEW_SWITCH])

  2426 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",

  2427 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2428 		else

  2429 		    cnt_punct++;

  2430 	    }

  2431 	}

  2432     }

  2433     c=g_utf8_get_char(aline);

  2434     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  2435     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  2436     {

  2437 	pc=c;

  2438 	c=nc;

  2439 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2440 	/* for each character in the line after the first */

  2441 	if (CHAR_IS_DQUOTE(c))

  2442 	{

  2443 	    if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&

  2444 	      !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||

  2445 	      !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))

  2446 	    {

  2447 		if (pswit[ECHO_SWITCH])

  2448 		    g_print("\n%s\n",aline);

  2449 		if (!pswit[OVERVIEW_SWITCH])

  2450 		    g_print("    Line %ld column %ld - Unspaced quotes?\n",

  2451 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2452 		else

  2453 		    cnt_punct++;

  2454 	    }

  2455 	}

  2456     }

  2457     /* Check parity of quotes. */

  2458     nc=g_utf8_get_char(aline);

  2459     for (s=aline;*s;s=g_utf8_next_char(s))

  2460     {

  2461 	c=nc;

  2462 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2463 	if (CHAR_IS_DQUOTE(c))

  2464 	{

  2465 	    if (c==CHAR_DQUOTE)

  2466 	    {

  2467 		parities->dquote=!parities->dquote;

  2468 		parity=parities->dquote;

  2469 	    }

  2470 	    else if (c==CHAR_LD_QUOTE)

  2471 		parity=1;

  2472 	    else

  2473 		parity=0;

  2474 	    if (!parity)

  2475 	    {

  2476 		/* parity even */

  2477 		if (!g_utf8_strchr("_-.'`‘’/,;:!?)]} ",-1,nc))

  2478 		{

  2479 		    if (pswit[ECHO_SWITCH])

  2480 			g_print("\n%s\n",aline);

  2481 		    if (!pswit[OVERVIEW_SWITCH])

  2482 			g_print("    Line %ld column %ld - "

  2483 			  "Wrongspaced quotes?\n",

  2484 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2485 		    else

  2486 			cnt_punct++;

  2487 		}

  2488 	    }

  2489 	    else

  2490 	    {

  2491 		/* parity odd */

  2492 		if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&

  2493 		  !g_utf8_strchr("_-/.'`‘’([{$",-1,nc) || !nc)

  2494 		{

  2495 		    if (pswit[ECHO_SWITCH])

  2496 			g_print("\n%s\n",aline);

  2497 		    if (!pswit[OVERVIEW_SWITCH])

  2498 			g_print("    Line %ld column %ld - "

  2499 			  "Wrongspaced quotes?\n",

  2500 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2501 		    else

  2502 			cnt_punct++;

  2503 		}

  2504 	    }

  2505 	}

  2506     }

  2507     c=g_utf8_get_char(aline);

  2508     if (CHAR_IS_DQUOTE(c))

  2509     {

  2510 	if (g_utf8_strchr(",;:!?)]} ",-1,

  2511 	  g_utf8_get_char(g_utf8_next_char(aline))))

  2512 	{

  2513 	    if (pswit[ECHO_SWITCH])

  2514 		g_print("\n%s\n",aline);

  2515 	    if (!pswit[OVERVIEW_SWITCH])

  2516 		g_print("    Line %ld column 1 - Wrongspaced quotes?\n",

  2517 		  linecnt);

  2518 	    else

  2519 		cnt_punct++;

  2520 	}

  2521     }

  2522     if (pswit[SQUOTE_SWITCH])

  2523     {

  2524 	nc=g_utf8_get_char(aline);

  2525 	for (s=aline;*s;s=g_utf8_next_char(s))

  2526 	{

  2527 	    c=nc;

  2528 	    nc=g_utf8_get_char(g_utf8_next_char(s));

  2529 	    if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&

  2530 	      !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||

  2531 	      !g_unichar_isalpha(nc)))

  2532 	    {

  2533 		parities->squote=!parities->squote;

  2534 		if (!parities->squote)

  2535 		{

  2536 		    /* parity even */

  2537 		    if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))

  2538 		    {

  2539 			if (pswit[ECHO_SWITCH])

  2540 			    g_print("\n%s\n",aline);

  2541 			if (!pswit[OVERVIEW_SWITCH])

  2542 			    g_print("    Line %ld column %ld - "

  2543 			      "Wrongspaced singlequotes?\n",

  2544 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2545 			else

  2546 			    cnt_punct++;

  2547 		    }

  2548 		}

  2549 		else

  2550 		{

  2551 		    /* parity odd */

  2552 		    if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&

  2553 		      !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)

  2554 		    {

  2555 			if (pswit[ECHO_SWITCH])

  2556 			    g_print("\n%s\n",aline);

  2557 			if (!pswit[OVERVIEW_SWITCH])

  2558 			    g_print("    Line %ld column %ld - "

  2559 			      "Wrongspaced singlequotes?\n",

  2560 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2561 			else

  2562 			    cnt_punct++;

  2563 		    }

  2564 		}

  2565 	    }

  2566 	}

  2567     }

  2568 }

  2570 /*

  2571  * str_follows_word:

  2572  *

  2573  * Given a position p within a string str, determine whether it follows the

  2574  * given word. This is roughly equivalent to the regular expression (?<=\bword)

  2575  * but has different boundary conditions.

  2576  */

  2577 static gboolean str_follows_word(const char *str,const char *p,const char *word)

  2578 {

  2579     int len=strlen(word);

  2580     if (p-len<str)

  2581 	return FALSE;

  2582     else if (!g_str_has_prefix(p-len,word))

  2583 	return FALSE;

  2584     else if (p-len==str)

  2585 	return TRUE;

  2586     else

  2587 	/* Using non-alpha as a word boundary. See UAX #29 for a better way. */

  2588 	return !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(p-len)));

  2589 }

  2591 /*

  2592  * check_for_double_punctuation:

  2593  *

  2594  * Look for double punctuation like ,. or ,,

  2595  * Thanks to DW for the suggestion!

  2596  * In books with references, ".," and ".;" are common

  2597  * e.g. "etc., etc.," and vol. 1.; vol 3.;

  2598  * OTOH, from my initial tests, there are also fairly

  2599  * common errors. What to do? Make these cases paranoid?

  2600  * ".," is the most common, so warnings->dotcomma is used

  2601  * to suppress detailed reporting if it occurs often.

  2602  * Indeed, ".," is so common after "etc" or "&c" that

  2603  * we don't warn on these cases at all.

  2604  */

  2605 void check_for_double_punctuation(const char *aline,struct warnings *warnings)

  2606 {

  2607     const char *s;

  2608     gunichar c,nc;

  2609     gboolean is_query;

  2610     nc=g_utf8_get_char(aline);

  2611     for (s=aline;*s;s=g_utf8_next_char(s))

  2612     {

  2613 	c=nc;

  2614 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2615 	/* for each punctuation character in the line */

  2616 	if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&

  2617 	  g_utf8_strchr(".?!,;:",-1,nc))

  2618 	{

  2619 	    /* followed by punctuation, it's a query, unless . . . */

  2620 	    is_query=TRUE;

  2621 	    if (warnings->isFrench &&

  2622 	      (g_str_has_prefix(s,",...") || g_str_has_prefix(s,"...,") ||

  2623 	       g_str_has_prefix(s,";...") || g_str_has_prefix(s,"...;") ||

  2624 	       g_str_has_prefix(s,":...") || g_str_has_prefix(s,"...:") ||

  2625 	       g_str_has_prefix(s,"!...") || g_str_has_prefix(s,"...!") ||

  2626 	       g_str_has_prefix(s,"?...") || g_str_has_prefix(s,"...?")))

  2627 	    {

  2628 		s+=4;

  2629 		nc=g_utf8_get_char(g_utf8_next_char(s));

  2630 		is_query=FALSE;

  2631 	    }

  2632 	    else if (c==nc && (c=='.' || c=='?' || c=='!'))

  2633 	    {

  2634 		/* do nothing for .. !! and ?? which can be legit */

  2635 		is_query=FALSE;

  2636 	    }

  2637 	    else if (c=='.' && nc==',')

  2638 	    {

  2639 		if (!warnings->dotcomma || str_follows_word(aline,s,"etc") ||

  2640 		  str_follows_word(aline,s,"&c"))

  2641 		    is_query=FALSE;

  2642 	    }

  2643 	    if (is_query)

  2644 	    {

  2645 		if (pswit[ECHO_SWITCH])

  2646 		    g_print("\n%s\n",aline);

  2647 		if (!pswit[OVERVIEW_SWITCH])

  2648 		    g_print("    Line %ld column %ld - Double punctuation?\n",

  2649 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2650 		else

  2651 		    cnt_punct++;

  2652 	    }

  2653 	}

  2654     }

  2655 }

  2657 /*

  2658  * check_for_spaced_quotes:

  2659  */

  2660 void check_for_spaced_quotes(const char *aline)

  2661 {

  2662     int i;

  2663     const char *s,*t;

  2664     const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,

  2665       CHAR_RS_QUOTE};

  2666     GString *pattern;

  2667     s=aline;

  2668     while ((t=strstr(s," \" ")))

  2669     {

  2670 	if (pswit[ECHO_SWITCH])

  2671 	    g_print("\n%s\n",aline);

  2672 	if (!pswit[OVERVIEW_SWITCH])

  2673 	    g_print("    Line %ld column %ld - Spaced doublequote?\n",

  2674 	      linecnt,g_utf8_pointer_to_offset(aline,t)+1);

  2675 	else

  2676 	    cnt_punct++;

  2677 	s=g_utf8_next_char(g_utf8_next_char(t));

  2678     }

  2679     pattern=g_string_new(NULL);

  2680     for(i=0;i<G_N_ELEMENTS(single_quotes);i++)

  2681     {

  2682 	g_string_assign(pattern," ");

  2683 	g_string_append_unichar(pattern,single_quotes[i]);

  2684 	g_string_append_c(pattern,' ');

  2685 	s=aline;

  2686 	while ((t=strstr(s,pattern->str)))

  2687 	{

  2688 	    if (pswit[ECHO_SWITCH])

  2689 		g_print("\n%s\n",aline);

  2690 	    if (!pswit[OVERVIEW_SWITCH])

  2691 		g_print("    Line %ld column %ld - Spaced singlequote?\n",

  2692 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);

  2693 	    else

  2694 		cnt_punct++;

  2695 	    s=g_utf8_next_char(g_utf8_next_char(t));

  2696 	}

  2697     }

  2698     g_string_free(pattern,TRUE);

  2699 }

  2701 /*

  2702  * check_for_miscased_genative:

  2703  *

  2704  * Check special case of 'S instead of 's at end of word.

  2705  */

  2706 void check_for_miscased_genative(const char *aline)

  2707 {

  2708     const char *s;

  2709     gunichar c,nc,pc;

  2710     if (!*aline)

  2711 	return;

  2712     c=g_utf8_get_char(aline);

  2713     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  2714     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  2715     {

  2716 	pc=c;

  2717 	c=nc;

  2718 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2719 	if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))

  2720 	{

  2721 	    if (pswit[ECHO_SWITCH])

  2722 		g_print("\n%s\n",aline);

  2723 	    if (!pswit[OVERVIEW_SWITCH])

  2724 		g_print("    Line %ld column %ld - Capital \"S\"?\n",

  2725 		  linecnt,g_utf8_pointer_to_offset(aline,s)+2);

  2726 	    else

  2727 		cnt_punct++;

  2728 	}

  2729     }

  2730 }

  2732 /*

  2733  * check_end_of_line:

  2734  *

  2735  * Now check special cases - start and end of line -

  2736  * for single and double quotes. Start is sometimes [sic]

  2737  * but better to query it anyway.

  2738  * While we're here, check for dash at end of line.

  2739  */

  2740 void check_end_of_line(const char *aline,struct warnings *warnings)

  2741 {

  2742     int lbytes;

  2743     const char *s;

  2744     gunichar c1,c2;

  2745     lbytes=strlen(aline);

  2746     if (g_utf8_strlen(aline,lbytes)>1)

  2747     {

  2748 	s=g_utf8_prev_char(aline+lbytes);

  2749 	c1=g_utf8_get_char(s);

  2750 	c2=g_utf8_get_char(g_utf8_prev_char(s));

  2751 	if ((CHAR_IS_DQUOTE(c1) || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)

  2752 	{

  2753 	    if (pswit[ECHO_SWITCH])

  2754 		g_print("\n%s\n",aline);

  2755 	    if (!pswit[OVERVIEW_SWITCH])

  2756 		g_print("    Line %ld column %ld - Spaced quote?\n",linecnt,

  2757 		  g_utf8_strlen(aline,lbytes));

  2758 	    else

  2759 		cnt_punct++;

  2760 	}

  2761 	c1=g_utf8_get_char(aline);

  2762 	c2=g_utf8_get_char(g_utf8_next_char(aline));

  2763 	if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)

  2764 	{

  2765 	    if (pswit[ECHO_SWITCH])

  2766 		g_print("\n%s\n",aline);

  2767 	    if (!pswit[OVERVIEW_SWITCH])

  2768 		g_print("    Line %ld column 1 - Spaced quote?\n",linecnt);

  2769 	    else

  2770 		cnt_punct++;

  2771 	}

  2772 	/*

  2773 	 * Dash at end of line may well be legit - paranoid mode only

  2774 	 * and don't report em-dash at line-end.

  2775 	 */

  2776 	if (pswit[PARANOID_SWITCH] && warnings->hyphen)

  2777 	{

  2778 	    for (s=g_utf8_prev_char(aline+lbytes);

  2779 	      s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))

  2780 		;

  2781 	    if (g_utf8_get_char(s)=='-' &&

  2782 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')

  2783 	    {

  2784 		if (pswit[ECHO_SWITCH])

  2785 		    g_print("\n%s\n",aline);

  2786 		if (!pswit[OVERVIEW_SWITCH])

  2787 		    g_print("    Line %ld column %ld - "

  2788 		      "Hyphen at end of line?\n",

  2789 		      linecnt,g_utf8_pointer_to_offset(aline,s));

  2790 	    }

  2791 	}

  2792     }

  2793 }

  2795 /*

  2796  * check_for_unspaced_bracket:

  2797  *

  2798  * Brackets are often unspaced, but shouldn't be surrounded by alpha.

  2799  * If so, suspect a scanno like "a]most".

  2800  */

  2801 void check_for_unspaced_bracket(const char *aline)

  2802 {

  2803     const char *s;

  2804     gunichar c,nc,pc;

  2805     c=g_utf8_get_char(aline);

  2806     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  2807     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  2808     {

  2809 	pc=c;

  2810 	c=nc;

  2811 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2812 	if (!nc)

  2813 	    break;

  2814 	/* for each bracket character in the line except 1st & last */

  2815 	if (g_utf8_strchr("{[()]}",-1,c) &&

  2816 	  g_unichar_isalpha(pc) && g_unichar_isalpha(nc))

  2817 	{

  2818 	    if (pswit[ECHO_SWITCH])

  2819 		g_print("\n%s\n",aline);

  2820 	    if (!pswit[OVERVIEW_SWITCH])

  2821 		g_print("    Line %ld column %ld - Unspaced bracket?\n",

  2822 		  linecnt,g_utf8_pointer_to_offset(aline,s));

  2823 	    else

  2824 		cnt_punct++;

  2825 	}

  2826     }

  2827 }

  2829 /*

  2830  * check_for_unpunctuated_endquote:

  2831  */

  2832 void check_for_unpunctuated_endquote(const char *aline)

  2833 {

  2834     const char *s;

  2835     gunichar c,nc,pc;

  2836     QuoteClass qc;

  2837     c=g_utf8_get_char(aline);

  2838     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  2839     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  2840     {

  2841 	pc=c;

  2842 	c=nc;

  2843 	qc=CHAR_IS_DQUOTE(c)?QUOTE_CLASS(c):INVALID_QUOTE;

  2844 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2845 	/* for each character in the line except 1st */

  2846 	if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) && g_unichar_isalpha(pc))

  2847 	{

  2848 	    if (pswit[ECHO_SWITCH])

  2849 		g_print("\n%s\n",aline);

  2850 	    if (!pswit[OVERVIEW_SWITCH])

  2851 		g_print("    Line %ld column %ld - "

  2852 		  "endquote missing punctuation?\n",

  2853 		  linecnt,g_utf8_pointer_to_offset(aline,s));

  2854 	    else

  2855 		cnt_punct++;

  2856 	}

  2857     }

  2858 }

  2860 /*

  2861  * check_for_html_tag:

  2862  *

  2863  * Check for <HTML TAG>.

  2864  *

  2865  * If there is a < in the line, followed at some point

  2866  * by a > then we suspect HTML.

  2867  */

  2868 void check_for_html_tag(const char *aline)

  2869 {

  2870     const char *open,*close;

  2871     gchar *tag;

  2872     open=strchr(aline,'<');

  2873     if (open)

  2874     {

  2875 	close=strchr(g_utf8_next_char(open),'>');

  2876 	if (close)

  2877 	{

  2878 	    if (pswit[ECHO_SWITCH])

  2879 		g_print("\n%s\n",aline);

  2880 	    if (!pswit[OVERVIEW_SWITCH])

  2881 	    {

  2882 		tag=g_strndup(open,close-open+1);

  2883 		g_print("    Line %ld column %ld - HTML Tag? %s \n",

  2884 		  linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);

  2885 		g_free(tag);

  2886 	    }

  2887 	    else

  2888 		cnt_html++;

  2889 	}

  2890     }

  2891 }

  2893 /*

  2894  * check_for_html_entity:

  2895  *

  2896  * Check for &symbol; HTML.

  2897  *

  2898  * If there is a & in the line, followed at

  2899  * some point by a ; then we suspect HTML.

  2900  */

  2901 void check_for_html_entity(const char *aline)

  2902 {

  2903     const char *s,*amp,*scolon;

  2904     gchar *entity;

  2905     amp=strchr(aline,'&');

  2906     if (amp)

  2907     {

  2908 	scolon=strchr(amp,';');

  2909 	if (scolon)

  2910 	{

  2911 	    for (s=amp;s<scolon;s=g_utf8_next_char(s))

  2912 		if (g_utf8_get_char(s)==CHAR_SPACE)

  2913 		    break;		/* Don't report "Jones & Son;" */

  2914 	    if (s>=scolon)

  2915 	    {

  2916 		if (pswit[ECHO_SWITCH])

  2917 		    g_print("\n%s\n",aline);

  2918 		if (!pswit[OVERVIEW_SWITCH])

  2919 		{

  2920 		    entity=g_strndup(amp,scolon-amp+1);

  2921 		    g_print("    Line %ld column %d - HTML symbol? %s \n",

  2922 		      linecnt,(int)(amp-aline)+1,entity);

  2923 		    g_free(entity);

  2924 		}

  2925 		else

  2926 		    cnt_html++;

  2927 	    }

  2928 	}

  2929     }

  2930 }

  2932 /*

  2933  * check_for_omitted_punctuation:

  2934  *

  2935  * Check for omitted punctuation at end of paragraph by working back

  2936  * through prevline. DW.

  2937  * Need to check this only for "normal" paras.

  2938  * So what is a "normal" para?

  2939  *    Not normal if one-liner (chapter headings, etc.)

  2940  *    Not normal if doesn't contain at least one locase letter

  2941  *    Not normal if starts with space

  2942  */

  2943 void check_for_omitted_punctuation(const char *prevline,

  2944   struct line_properties *last,int start_para_line)

  2945 {

  2946     gboolean letter_on_line=FALSE;

  2947     const char *s;

  2948     gunichar c;

  2949     gboolean closing_quote;

  2950     for (s=prevline;*s;s=g_utf8_next_char(s))

  2951 	if (g_unichar_isalpha(g_utf8_get_char(s)))

  2952 	{

  2953 	    letter_on_line=TRUE;

  2954 	    break;

  2955 	}

  2956     /*

  2957      * This next "if" is a problem.

  2958      * If we say "start_para_line <= linecnt - 1", that includes

  2959      * one-line "paragraphs" like chapter heads. Lotsa false positives.

  2960      * If we say "start_para_line < linecnt - 1" it doesn't, but then it

  2961      * misses genuine one-line paragraphs.

  2962      */

  2963     if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&

  2964       g_utf8_get_char(prevline)>CHAR_SPACE)

  2965     {

  2966 	s=prevline+strlen(prevline);

  2967 	do

  2968 	{

  2969 	    s=g_utf8_prev_char(s);

  2970 	    c=g_utf8_get_char(s);

  2971 	    if (QUOTE_CLASS(c)==CLOSING_QUOTE || QUOTE_CLASS(c)==NEUTRAL_QUOTE)

  2972 		closing_quote=TRUE;

  2973 	    else

  2974 		closing_quote=FALSE;

  2975 	} while (closing_quote && s>prevline);

  2976 	for (;s>prevline;s=g_utf8_prev_char(s))

  2977 	{

  2978 	    if (g_unichar_isalpha(g_utf8_get_char(s)))

  2979 	    {

  2980 		if (pswit[ECHO_SWITCH])

  2981 		    g_print("\n%s\n",prevline);

  2982 		if (!pswit[OVERVIEW_SWITCH])

  2983 		    g_print("    Line %ld column %ld - "

  2984 		      "No punctuation at para end?\n",

  2985 		      linecnt-1,g_utf8_strlen(prevline,-1));

  2986 		else

  2987 		    cnt_punct++;

  2988 		break;

  2989 	    }

  2990 	    if (g_utf8_strchr("—-.:!([{?}])",-1,g_utf8_get_char(s)))

  2991 		break;

  2992 	}

  2993     }

  2994 }

  2996 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)

  2997 {

  2998     const char *word=key;

  2999     int *dupcnt=value;

  3000     if (*dupcnt)

  3001 	g_print("\nNote: Queried word %s was duplicated %d times\n",

  3002 	  word,*dupcnt);

  3003     return FALSE;

  3004 }

  3006 void print_as_windows_1252(const char *string)

  3007 {

  3008     gsize inbytes,outbytes;

  3009     gchar *buf,*bp;

  3010     static GIConv converter=(GIConv)-1;

  3011     if (!string)

  3012     {

  3013 	if (converter!=(GIConv)-1)

  3014 	    g_iconv_close(converter);

  3015 	converter=(GIConv)-1;

  3016 	return;

  3017     }

  3018     if (converter==(GIConv)-1)

  3019 	converter=g_iconv_open("WINDOWS-1252","UTF-8");

  3020     if (converter!=(GIConv)-1)

  3021     {

  3022 	inbytes=outbytes=strlen(string);

  3023 	bp=buf=g_malloc(outbytes+1);

  3024 	g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);

  3025 	*bp='\0';

  3026 	fputs(buf,stdout);

  3027 	g_free(buf);

  3028     }

  3029     else

  3030 	fputs(string,stdout);

  3031 }

  3033 void print_as_utf_8(const char *string)

  3034 {

  3035     fputs(string,stdout);

  3036 }

  3038 /*

  3039  * procfile:

  3040  *

  3041  * Process one file.

  3042  */

  3043 void procfile(const char *filename)

  3044 {

  3045     const char *s;

  3046     gchar *parastart=NULL;	/* first line of current para */

  3047     gchar *etext,*aline;

  3048     gchar *etext_ptr;

  3049     GError *err=NULL;

  3050     struct first_pass_results *first_pass_results;

  3051     struct warnings *warnings;

  3052     struct counters counters={0};

  3053     struct line_properties last={0};

  3054     struct parities parities={0};

  3055     struct pending pending={0};

  3056     gboolean isemptyline;

  3057     long start_para_line=0;

  3058     gboolean isnewpara=FALSE,enddash=FALSE;

  3059     last.start=CHAR_SPACE;

  3060     linecnt=checked_linecnt=0;

  3061     etext=read_etext(filename,&err);

  3062     if (!etext)

  3063     {

  3064 	if (pswit[STDOUT_SWITCH])

  3065 	    fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);

  3066 	else

  3067 	    fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);

  3068 	exit(1);

  3069     }

  3070     g_print("\n\nFile: %s\n\n",filename);

  3071     first_pass_results=first_pass(etext);

  3072     warnings=report_first_pass(first_pass_results);

  3073     qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);

  3074     qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);

  3075     /*

  3076      * Here we go with the main pass. Hold onto yer hat!

  3077      */

  3078     linecnt=0;

  3079     etext_ptr=etext;

  3080     while ((aline=flgets(&etext_ptr,linecnt+1,warnings->newlines)))

  3081     {

  3082 	linecnt++;

  3083 	if (linecnt==1)

  3084 	    isnewpara=TRUE;

  3085 	if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))

  3086 	    continue;    // skip DP page separators completely

  3087 	if (linecnt<first_pass_results->firstline ||

  3088 	  (first_pass_results->footerline>0 &&

  3089 	  linecnt>first_pass_results->footerline))

  3090 	{

  3091 	    if (pswit[HEADER_SWITCH])

  3092 	    {

  3093 		if (g_str_has_prefix(aline,"Title:"))

  3094 		    g_print("    %s\n",aline);

  3095 		if (g_str_has_prefix(aline,"Author:"))

  3096 		    g_print("    %s\n",aline);

  3097 		if (g_str_has_prefix(aline,"Release Date:"))

  3098 		    g_print("    %s\n",aline);

  3099 		if (g_str_has_prefix(aline,"Edition:"))

  3100 		    g_print("    %s\n\n",aline);

  3101 	    }

  3102 	    continue;		/* skip through the header */

  3103 	}

  3104 	checked_linecnt++;

  3105 	print_pending(aline,parastart,&pending);

  3106 	isemptyline=analyse_quotes(aline,&counters);

  3107 	if (isnewpara && !isemptyline)

  3108 	{

  3109 	    /* This line is the start of a new paragraph. */

  3110 	    start_para_line=linecnt;

  3111 	    /* Capture its first line in case we want to report it later. */

  3112 	    g_free(parastart);

  3113 	    parastart=g_strdup(aline);

  3114 	    memset(&parities,0,sizeof(parities));  /* restart the quote count */

  3115 	    s=aline;

  3116 	    while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&

  3117 	      !g_unichar_isdigit(g_utf8_get_char(s)))

  3118 		s=g_utf8_next_char(s);

  3119 	    if (g_unichar_islower(g_utf8_get_char(s)))

  3120 	    {

  3121 		/* and its first letter is lowercase */

  3122 		if (pswit[ECHO_SWITCH])

  3123 		    g_print("\n%s\n",aline);

  3124 		if (!pswit[OVERVIEW_SWITCH])

  3125 		    g_print("    Line %ld column %ld - "

  3126 		      "Paragraph starts with lower-case\n",

  3127 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  3128 		else

  3129 		    cnt_punct++;

  3130 	    }

  3131 	    isnewpara=FALSE; /* Signal the end of new para processing. */

  3132 	}

  3133 	/* Check for an em-dash broken at line end. */

  3134 	if (enddash && g_utf8_get_char(aline)=='-')

  3135 	{

  3136 	    if (pswit[ECHO_SWITCH])

  3137 		g_print("\n%s\n",aline);

  3138 	    if (!pswit[OVERVIEW_SWITCH])

  3139 		g_print("    Line %ld column 1 - Broken em-dash?\n",linecnt);

  3140 	    else

  3141 		cnt_punct++;

  3142 	}

  3143 	enddash=FALSE;

  3144 	for (s=g_utf8_prev_char(aline+strlen(aline));

  3145 	  g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))

  3146 	    ;

  3147 	if (s>=aline && g_utf8_get_char(s)=='-')

  3148 	    enddash=TRUE;

  3149 	check_for_control_characters(aline);

  3150 	check_for_odd_characters(aline,warnings,isemptyline);

  3151 	if (warnings->longline)

  3152 	    check_for_long_line(aline);

  3153 	if (warnings->shortline)

  3154 	    check_for_short_line(aline,&last);

  3155 	last.blen=last.len;

  3156 	last.len=g_utf8_strlen(aline,-1);

  3157 	last.start=g_utf8_get_char(aline);

  3158 	check_for_starting_punctuation(aline);

  3159 	if (warnings->dash)

  3160 	{

  3161 	    check_for_spaced_emdash(aline);

  3162 	    check_for_spaced_dash(aline);

  3163 	}

  3164 	check_for_unmarked_paragraphs(aline);

  3165 	check_for_jeebies(aline);

  3166 	check_for_mta_from(aline);

  3167 	check_for_orphan_character(aline);

  3168 	check_for_pling_scanno(aline);

  3169 	check_for_extra_period(aline,warnings);

  3170 	check_for_following_punctuation(aline);

  3171 	check_for_typos(aline,warnings);

  3172 	check_for_misspaced_punctuation(aline,&parities,isemptyline);

  3173 	check_for_double_punctuation(aline,warnings);

  3174 	check_for_spaced_quotes(aline);

  3175 	check_for_miscased_genative(aline);

  3176 	check_end_of_line(aline,warnings);

  3177 	check_for_unspaced_bracket(aline);

  3178 	if (warnings->endquote)

  3179 	    check_for_unpunctuated_endquote(aline);

  3180 	check_for_html_tag(aline);

  3181 	check_for_html_entity(aline);

  3182 	if (isemptyline)

  3183 	{

  3184 	    check_for_mismatched_quotes(&counters,&pending);

  3185 	    counters_reset(&counters);

  3186 	    /* let the next iteration know that it's starting a new para */

  3187 	    isnewpara=TRUE;

  3188 	    if (prevline)

  3189 		check_for_omitted_punctuation(prevline,&last,start_para_line);

  3190 	}

  3191 	g_free(prevline);

  3192 	prevline=g_strdup(aline);

  3193     }

  3194     linecnt++;

  3195     check_for_mismatched_quotes(&counters,&pending);

  3196     print_pending(NULL,parastart,&pending);

  3197     reset_pending(&pending);

  3198     if (prevline)

  3199     {

  3200 	g_free(prevline);

  3201 	prevline=NULL;

  3202     }

  3203     g_free(parastart);

  3204     g_free(prevline);

  3205     g_free(etext);

  3206     if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])

  3207 	g_tree_foreach(qword,report_duplicate_queries,NULL);

  3208     g_tree_unref(qword);

  3209     g_tree_unref(qperiod);

  3210     counters_destroy(&counters);

  3211     g_set_print_handler(NULL);

  3212     print_as_windows_1252(NULL);

  3213     if (pswit[MARKUP_SWITCH])

  3214 	loseentities(NULL);

  3215 }

  3217 /*

  3218  * flgets:

  3219  *

  3220  * Get one line from the input text. The setting of newlines has the following

  3221  * effect:

  3222  *

  3223  * DOS_NEWLINES: Check for the existence of exactly one CR-LF line-end per line.

  3224  *

  3225  * OS9_NEWLINES: Asserts that etext contains no LFs. CR is used as

  3226  *		 the newline character.

  3227  *

  3228  * UNIX_NEWLINES: Check for the presence of CRs.

  3229  *

  3230  * In all cases, check that the last line is correctly terminated.

  3231  *

  3232  * Returns: a pointer to the line.

  3233  */

  3234 char *flgets(char **etext,long lcnt,int newlines)

  3235 {

  3236     gunichar c;

  3237     gboolean isCR=FALSE;

  3238     char *theline=*etext;

  3239     char *eos=theline;

  3240     gchar *s;

  3241     for (;;)

  3242     {

  3243 	c=g_utf8_get_char(*etext);

  3244 	if (!c)

  3245 	{

  3246 	    if (*etext==theline)

  3247 		return NULL;

  3248 	    else if (pswit[LINE_END_SWITCH])

  3249 	    {

  3250 		if (pswit[ECHO_SWITCH])

  3251 		{

  3252 		    s=g_strndup(theline,eos-theline);

  3253 		    g_print("\n%s\n",s);

  3254 		    g_free(s);

  3255 		}

  3256 		if (!pswit[OVERVIEW_SWITCH])

  3257 		{

  3258 		    if (newlines==OS9_NEWLINES)

  3259 			g_print("    Line %ld - No CR?\n",lcnt);

  3260 		    else

  3261 		    {

  3262 			/* There may, or may not, have been a CR */

  3263 			g_print("    Line %ld - No LF?\n",lcnt);

  3264 		    }

  3265 		}

  3266 		else

  3267 		    cnt_lineend++;

  3268 	    }

  3269 	    break;

  3270 	}

  3271 	*etext=g_utf8_next_char(*etext);

  3272 	/* either way, it's end of line */

  3273 	if (c=='\n')

  3274 	{

  3275 	    if (newlines==DOS_NEWLINES && !isCR)

  3276 	    {

  3277 		/* Error - a LF without a preceding CR */

  3278 		if (pswit[LINE_END_SWITCH])

  3279 		{

  3280 		    if (pswit[ECHO_SWITCH])

  3281 		    {

  3282 			s=g_strndup(theline,eos-theline);

  3283 			g_print("\n%s\n",s);

  3284 			g_free(s);

  3285 		    }

  3286 		    if (!pswit[OVERVIEW_SWITCH])

  3287 			g_print("    Line %ld - No CR?\n",lcnt);

  3288 		    else

  3289 			cnt_lineend++;

  3290 		}

  3291 	    }

  3292 	    break;

  3293 	}

  3294 	if (c=='\r')

  3295 	{

  3296 	    if (newlines==OS9_NEWLINES)

  3297 		break;

  3298 	    if (isCR || newlines==UNIX_NEWLINES)

  3299 	    {

  3300 		if (pswit[LINE_END_SWITCH])

  3301 		{

  3302 		    if (pswit[ECHO_SWITCH])

  3303 		    {

  3304 			s=g_strndup(theline,eos-theline);

  3305 			g_print("\n%s\n",s);

  3306 			g_free(s);

  3307 		    }

  3308 		    if (!pswit[OVERVIEW_SWITCH])

  3309 		    {

  3310 			if (newlines==UNIX_NEWLINES)

  3311 			    g_print("    Line %ld column %ld - Embedded CR?\n",

  3312 			      lcnt,g_utf8_pointer_to_offset(theline,eos)+1);

  3313 			else

  3314 			    g_print("    Line %ld - Two successive CRs?\n",

  3315 			      lcnt);

  3316 		    }

  3317 		    else

  3318 			cnt_lineend++;

  3319 		}

  3320 		if (newlines==UNIX_NEWLINES)

  3321 		    *eos=' ';

  3322 	    }

  3323 	    if (newlines==DOS_NEWLINES)

  3324 		isCR=TRUE;

  3325 	}

  3326 	else

  3327 	{

  3328 	    if (pswit[LINE_END_SWITCH] && isCR)

  3329 	    {

  3330 		if (pswit[ECHO_SWITCH])

  3331 		{

  3332 		    s=g_strndup(theline,eos-theline);

  3333 		    g_print("\n%s\n",s);

  3334 		    g_free(s);

  3335 		}

  3336 		if (!pswit[OVERVIEW_SWITCH])

  3337 		    g_print("    Line %ld column %ld - CR without LF?\n",

  3338 		      lcnt,g_utf8_pointer_to_offset(theline,eos)+1);

  3339 		else

  3340 		    cnt_lineend++;

  3341 		*eos=' ';

  3342 	    }

  3343 	    isCR=FALSE;

  3344 	    eos=g_utf8_next_char(eos);

  3345 	}

  3346     }

  3347     *eos='\0';

  3348     if (pswit[MARKUP_SWITCH])

  3349 	postprocess_for_HTML(theline);

  3350     if (pswit[DP_SWITCH])

  3351 	postprocess_for_DP(theline);

  3352     return theline;

  3353 }

  3355 /*

  3356  * mixdigit:

  3357  *

  3358  * Takes a "word" as a parameter, and checks whether it

  3359  * contains a mixture of alpha and digits. Generally, this is an

  3360  * error, but may not be for cases like 4th or L5 12s. 3d.

  3361  *

  3362  * Returns: TRUE iff an is error found.

  3363  */

  3364 gboolean mixdigit(const char *checkword)

  3365 {

  3366     gboolean wehaveadigit,wehavealetter,query;

  3367     const char *s,*nondigit;

  3368     wehaveadigit=wehavealetter=query=FALSE;

  3369     for (s=checkword;*s;s=g_utf8_next_char(s))

  3370 	if (g_unichar_isalpha(g_utf8_get_char(s)))

  3371 	    wehavealetter=TRUE;

  3372 	else if (g_unichar_isdigit(g_utf8_get_char(s)))

  3373 	    wehaveadigit=TRUE;

  3374     if (wehaveadigit && wehavealetter)

  3375     {

  3376 	/* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */

  3377 	query=TRUE;

  3378 	for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));

  3379 	  nondigit=g_utf8_next_char(nondigit))

  3380 	    ;

  3381 	/* digits, ending in st, rd, nd, th of either case */

  3382 	if (!g_ascii_strcasecmp(nondigit,"st") ||

  3383 	  !g_ascii_strcasecmp(nondigit,"rd") ||

  3384 	  !g_ascii_strcasecmp(nondigit,"nd") ||

  3385 	  !g_ascii_strcasecmp(nondigit,"th"))

  3386 	    query=FALSE;

  3387 	if (!g_ascii_strcasecmp(nondigit,"sts") ||

  3388 	  !g_ascii_strcasecmp(nondigit,"rds") ||

  3389 	  !g_ascii_strcasecmp(nondigit,"nds") ||

  3390 	  !g_ascii_strcasecmp(nondigit,"ths"))

  3391 	    query=FALSE;

  3392 	if (!g_ascii_strcasecmp(nondigit,"stly") ||

  3393 	  !g_ascii_strcasecmp(nondigit,"rdly") ||

  3394 	  !g_ascii_strcasecmp(nondigit,"ndly") ||

  3395 	  !g_ascii_strcasecmp(nondigit,"thly"))

  3396 	    query=FALSE;

  3397 	/* digits, ending in l, L, s or d */

  3398 	if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||

  3399 	  !strcmp(nondigit,"d"))

  3400 	    query=FALSE;

  3401 	/*

  3402 	 * L at the start of a number, representing Britsh pounds, like L500.

  3403 	 * This is cute. We know the current word is mixed digit. If the first

  3404 	 * letter is L, there must be at least one digit following. If both

  3405 	 * digits and letters follow, we have a genuine error, else we have a

  3406 	 * capital L followed by digits, and we accept that as a non-error.

  3407 	 */

  3408 	if (g_utf8_get_char(checkword)=='L' &&

  3409 	  !mixdigit(g_utf8_next_char(checkword)))

  3410 	    query=FALSE;

  3411     }

  3412     return query;

  3413 }

  3415 /*

  3416  * getaword:

  3417  *

  3418  * Extracts the first/next "word" from the line, and returns it.

  3419  * A word is defined as one English word unit--or at least that's the aim.

  3420  * "ptr" is advanced to the position in the line where we will start

  3421  * looking for the next word.

  3422  *

  3423  * Returns: A newly-allocated string.

  3424  */

  3425 gchar *getaword(const char **ptr)

  3426 {

  3427     const char *s,*t;

  3428     GString *word;

  3429     gunichar c,pc;

  3430     word=g_string_new(NULL);

  3431     for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&

  3432       !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&

  3433       **ptr;*ptr=g_utf8_next_char(*ptr))

  3434     {

  3435 	/* Handle exceptions for footnote markers like [1] */

  3436 	if (g_utf8_get_char(*ptr)=='[')

  3437 	{

  3438 	    g_string_append_c(word,'[');

  3439 	    s=g_utf8_next_char(*ptr);

  3440 	    for (;g_unichar_isdigit(g_utf8_get_char(s));s=g_utf8_next_char(s))

  3441 		g_string_append_unichar(word,g_utf8_get_char(s));

  3442 	    if (g_utf8_get_char(s)==']')

  3443 	    {

  3444 		g_string_append_c(word,']');

  3445 		*ptr=g_utf8_next_char(s);

  3446 		return g_string_free(word,FALSE);

  3447 	    }

  3448 	    else

  3449 		g_string_truncate(word,0);

  3450 	}

  3451     }

  3452     /*

  3453      * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.

  3454      * Especially yucky is the case of L1,000

  3455      * This section looks for a pattern of characters including a digit

  3456      * followed by a comma or period followed by one or more digits.

  3457      * If found, it returns this whole pattern as a word; otherwise we discard

  3458      * the results and resume our normal programming.

  3459      */

  3460     s=*ptr;

  3461     for (;g_unichar_isdigit(g_utf8_get_char(s)) ||

  3462       g_unichar_isalpha(g_utf8_get_char(s)) ||

  3463       g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))

  3464 	g_string_append_unichar(word,g_utf8_get_char(s));

  3465     if (word->len)

  3466     {

  3467 	for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))

  3468 	{

  3469 	    c=g_utf8_get_char(t);

  3470 	    pc=g_utf8_get_char(g_utf8_prev_char(t));

  3471 	    if ((c=='.' || c==',') && g_unichar_isdigit(pc))

  3472 	    {

  3473 		*ptr=s;

  3474 		return g_string_free(word,FALSE);

  3475 	    }

  3476 	}

  3477     }

  3478     /* we didn't find a punctuated number - do the regular getword thing */

  3479     g_string_truncate(word,0);

  3480     c=g_utf8_get_char(*ptr);

  3481     for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);

  3482       *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))

  3483 	g_string_append_unichar(word,c);

  3484     return g_string_free(word,FALSE);

  3485 }

  3487 /*

  3488  * isroman:

  3489  *

  3490  * Is this word a Roman Numeral?

  3491  *

  3492  * It doesn't actually validate that the number is a valid Roman Numeral--for

  3493  * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not

  3494  * what we're here to do. If it passes this, it LOOKS like a Roman numeral.

  3495  * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or

  3496  * expressions thereof, except when it came to taxes. Allow any number of M,

  3497  * an optional D, an optional CM or CD, any number of optional Cs, an optional

  3498  * XL or an optional XC, an optional IX or IV, an optional V and any number

  3499  * of optional Is.

  3500  */

  3501 gboolean isroman(const char *t)

  3502 {

  3503     const char *s;

  3504     if (!t || !*t)

  3505 	return FALSE;

  3506     s=t;

  3507     while (g_utf8_get_char(t)=='m' && *t)

  3508 	t++;

  3509     if (g_utf8_get_char(t)=='d')

  3510 	t++;

  3511     if (g_str_has_prefix(t,"cm"))

  3512 	t+=2;

  3513     if (g_str_has_prefix(t,"cd"))

  3514 	t+=2;

  3515     while (g_utf8_get_char(t)=='c' && *t)

  3516 	t++;

  3517     if (g_str_has_prefix(t,"xl"))

  3518 	t+=2;

  3519     if (g_str_has_prefix(t,"xc"))

  3520 	t+=2;

  3521     if (g_utf8_get_char(t)=='l')

  3522 	t++;

  3523     while (g_utf8_get_char(t)=='x' && *t)

  3524 	t++;

  3525     if (g_str_has_prefix(t,"ix"))

  3526 	t+=2;

  3527     if (g_str_has_prefix(t,"iv"))

  3528 	t+=2;

  3529     if (g_utf8_get_char(t)=='v')

  3530 	t++;

  3531     while (g_utf8_get_char(t)=='i' && *t)

  3532 	t++;

  3533     return !*t;

  3534 }

  3536 /*

  3537  * postprocess_for_DP:

  3538  *

  3539  * Invoked with the -d switch from flgets().

  3540  * It simply "removes" from the line a hard-coded set of common

  3541  * DP-specific tags, so that the line passed to the main routine has

  3542  * been pre-cleaned of DP markup.

  3543  */

  3544 void postprocess_for_DP(char *theline)

  3545 {

  3546     char *s,*t;

  3547     int i;

  3548     if (!*theline)

  3549 	return;

  3550     for (i=0;*DPmarkup[i];i++)

  3551 	while ((s=strstr(theline,DPmarkup[i])))

  3552 	{

  3553 	    t=s+strlen(DPmarkup[i]);

  3554 	    memmove(s,t,strlen(t)+1);

  3555 	}

  3556 }

  3558 /*

  3559  * postprocess_for_HTML:

  3560  *

  3561  * Invoked with the -m switch from flgets().

  3562  * It simply "removes" from the line a hard-coded set of common

  3563  * HTML tags and "replaces" a hard-coded set of common HTML

  3564  * entities, so that the line passed to the main routine has

  3565  * been pre-cleaned of HTML.

  3566  */

  3567 void postprocess_for_HTML(char *theline)

  3568 {

  3569     while (losemarkup(theline))

  3570 	;

  3571     loseentities(theline);

  3572 }

  3574 char *losemarkup(char *theline)

  3575 {

  3576     char *s,*t;

  3577     int i;

  3578     s=strchr(theline,'<');

  3579     t=s?strchr(s,'>'):NULL;

  3580     if (!s || !t)

  3581 	return NULL;

  3582     for (i=0;*markup[i];i++)

  3583 	if (tagcomp(g_utf8_next_char(s),markup[i]))

  3584 	{

  3585 	    t=g_utf8_next_char(t);

  3586 	    memmove(s,t,strlen(t)+1);

  3587 	    return s;

  3588 	}

  3589     /* It's an unrecognized <xxx>. */

  3590     return NULL;

  3591 }

  3593 void loseentities(char *theline)

  3594 {

  3595     int i;

  3596     gsize nb;

  3597     char *amp,*scolon;

  3598     gchar *s,*t;

  3599     gunichar c;

  3600     GTree *entities=NULL;

  3601     static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;

  3602     if (!theline)

  3603     {

  3604 	if (entities)

  3605 	    g_tree_destroy(entities);

  3606 	entities=NULL;

  3607 	if (translit!=(GIConv)-1)

  3608 	    g_iconv_close(translit);

  3609 	translit=(GIConv)-1;

  3610 	if (to_utf8!=(GIConv)-1)

  3611 	    g_iconv_close(to_utf8);

  3612 	to_utf8=(GIConv)-1;

  3613 	return;

  3614     }

  3615     if (!*theline)

  3616 	return;

  3617     if (!entities)

  3618     {

  3619 	entities=g_tree_new((GCompareFunc)strcmp);

  3620 	for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)

  3621 	    g_tree_insert(entities,HTMLentities[i].name,

  3622 	      GUINT_TO_POINTER(HTMLentities[i].c));

  3623     }

  3624     if (translit==(GIConv)-1)

  3625 	translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");

  3626     if (to_utf8==(GIConv)-1)

  3627 	to_utf8=g_iconv_open("UTF-8","ISO_8859-1");

  3628     while((amp=strchr(theline,'&')))

  3629     {

  3630 	scolon=strchr(amp,';');

  3631 	if (scolon)

  3632 	{

  3633 	    if (amp[1]=='#')

  3634 	    {

  3635 		if (amp+2+strspn(amp+2,"0123456789")==scolon)

  3636 		    c=strtol(amp+2,NULL,10);

  3637 		else if (amp[2]=='x' &&

  3638 		  amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)

  3639 		    c=strtol(amp+3,NULL,16);

  3640 	    }

  3641 	    else

  3642 	    {

  3643 		s=g_strndup(amp+1,scolon-(amp+1));

  3644 	        c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));

  3645 		g_free(s);

  3646 	    }

  3647 	}

  3648 	else

  3649 	    c=0;

  3650 	if (c)

  3651 	{

  3652 	    theline=amp;

  3653 	    if (c<128 || c>=192 && c<=255)	/* An ISO-8859-1 character */

  3654 		theline+=g_unichar_to_utf8(c,theline);

  3655 	    else

  3656 	    {

  3657 		s=g_malloc(6);

  3658 		nb=g_unichar_to_utf8(c,s);

  3659 		t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);

  3660 		g_free(s);

  3661 		s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);

  3662 		g_free(t);

  3663 		memcpy(theline,s,nb);

  3664 		g_free(s);

  3665 		theline+=nb;

  3666 	    }

  3667 	    memmove(theline,g_utf8_next_char(scolon),

  3668 	      strlen(g_utf8_next_char(scolon))+1);

  3669 	}

  3670 	else

  3671 	    theline=g_utf8_next_char(amp);

  3672     }

  3673 }

  3675 gboolean tagcomp(const char *strin,const char *basetag)

  3676 {

  3677     gboolean retval;

  3678     gchar *s,*t;

  3679     if (g_utf8_get_char(strin)=='/')

  3680 	t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */

  3681     else

  3682 	t=g_utf8_casefold(strin,-1);

  3683     s=g_utf8_casefold(basetag,-1);

  3684     retval=g_str_has_prefix(t,s);

  3685     g_free(s);

  3686     g_free(t);

  3687     return retval;

  3688 }

  3690 void proghelp(GOptionContext *context)

  3691 {

  3692     gchar *help;

  3693     fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);

  3694     fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);

  3695     fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);

  3696     fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "

  3697       "For details, read the file COPYING.\n",stderr);

  3698     fputs("This is Free Software; "

  3699       "you may redistribute it under certain conditions (GPL);\n",stderr);

  3700     fputs("read the file COPYING for details.\n\n",stderr);

  3701     help=g_option_context_get_help(context,TRUE,NULL);

  3702     fputs(help,stderr);

  3703     g_free(help);

  3704     fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);

  3705     fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "

  3706       "non-ASCII\n",stderr);

  3707     fputs("characters like accented letters, "

  3708       "lines longer than 75 or shorter than 55,\n",stderr);

  3709     fputs("unbalanced quotes or brackets, "

  3710       "a variety of badly formatted punctuation, \n",stderr);

  3711     fputs("HTML tags, some likely typos. "

  3712       "It is NOT a substitute for human judgement.\n",stderr);

  3713     fputs("\n",stderr);

  3714 }

author	ali <ali@juiblex.co.uk>
	Wed Oct 16 22:51:29 2013 +0100 (2013-10-16)
changeset 104	70cc629ec1e0
parent 103	d22d8cd4f628
permissions	-rw-r--r--