bookloupe: bookloupe/bookloupe.c@d22d8cd4f628

     1 /*************************************************************************/

     2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */

     3 /*									 */

     4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>			 */

     5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>			 */

     6 /*									 */

     7 /* This program is free software; you can redistribute it and/or modify  */

     8 /* it under the terms of the GNU General Public License as published by  */

     9 /* the Free Software Foundation; either version 2 of the License, or     */

    10 /* (at your option) any later version.					 */

    11 /*									 */

    12 /* This program is distributed in the hope that it will be useful,       */

    13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of	 */

    14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the		 */

    15 /* GNU General Public License for more details.				 */

    16 /*									 */

    17 /* You should have received a copy of the GNU General Public License	 */

    18 /* along with this program. If not, see <http://www.gnu.org/licenses/>.	 */

    19 /*************************************************************************/

    21 #include <stdio.h>

    22 #include <stdlib.h>

    23 #include <string.h>

    24 #include <ctype.h>

    25 #ifdef __WIN32__

    26 #include <windows.h>

    27 #endif

    28 #include <glib.h>

    29 #include <bl/bl.h>

    30 #include "bookloupe.h"

    31 #include "counters.h"

    32 #include "pending.h"

    33 #include "HTMLentities.h"

    35 gchar *charset;		/* Or NULL for auto (ISO_8859-1/ASCII or UNICODE) */

    36 GIConv charset_validator=(GIConv)-1;

    38 gchar *prevline;

    40 /* Common typos. */

    41 char *typo[] = {

    42     "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",

    43     "nad", "te", "ig", "acn",  "ahve", "alot", "anbd", "andt", "awya", "aywa",

    44     "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",

    45     "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",

    46     "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",

    47     "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",

    48     "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",

    49     "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",

    50     "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",

    51     "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",

    52     "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",

    53     "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",

    54     "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",

    55     "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",

    56     "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",

    57     "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",

    58     "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",

    59     "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",

    60     "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",

    61     "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",

    62     "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",

    63     "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",

    64     "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",

    65     "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",

    66     "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",

    67     "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",

    68     "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",

    69     "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",

    70     "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",

    71     "se", ""

    72 };

    74 GTree *usertypo;

    76 /* Common abbreviations and other OK words not to query as typos. */

    77 char *okword[] = {

    78     "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",

    79     "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",

    80     "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",

    81     "outbid", "outbids", "frostbite", "frostbitten", ""

    82 };

    84 /* Common abbreviations that cause otherwise unexplained periods. */

    85 char *abbrev[] = {

    86     "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",

    87     "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""

    88 };

    90 /*

    91  * Two-Letter combinations that rarely if ever start words,

    92  * but are common scannos or otherwise common letter combinations.

    93  */

    94 char *nostart[] = {

    95     "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""

    96 };

    98 /*

    99  * Two-Letter combinations that rarely if ever end words,

   100  * but are common scannos or otherwise common letter combinations.

   101  */

   102 char *noend[] = {

   103     "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",

   104     "sw", "gr", "sl", "cl", "iy", ""

   105 };

   107 char *markup[] = {

   108     "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",

   109     "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",

   110     "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",

   111     "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""

   112 };

   114 char *DPmarkup[] = {

   115     "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""

   116 };

   118 char *nocomma[] = {

   119     "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",

   120     "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",

   121     "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",

   122     "during", "let", "toward", "among", ""

   123 };

   125 char *noperiod[] = {

   126     "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",

   127     "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",

   128     "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",

   129     "among", "those", "into", "whom", "having", "thence", ""

   130 };

   132 gboolean pswit[SWITNO];  /* program switches */

   133 gchar *opt_charset;

   135 gboolean typo_compat,paranoid_compat;

   137 static GOptionEntry options[]={

   138     { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,

   139       "Ignore DP-specific markup", NULL },

   140     { "no-dp", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,

   141       G_OPTION_ARG_NONE, pswit+DP_SWITCH,

   142       "Don't ignore DP-specific markup", NULL },

   143     { "echo", 0, G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,

   144       "Echo queried line", NULL },

   145     { "no-echo", 'e', G_OPTION_FLAG_REVERSE,

   146       G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,

   147       "Don't echo queried line", NULL },

   148     { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,

   149       "Check single quotes", NULL },

   150     { "no-squote", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,

   151       G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,

   152       "Don't check single quotes", NULL },

   153     { "typo", 0, 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,

   154       "Check common typos", NULL },

   155     { "no-typo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,

   156       G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,

   157       "Don't check common typos", NULL },

   158     { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,

   159       "Require closure of quotes on every paragraph", NULL },

   160     { "no-qpara", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,

   161       G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,

   162       "Don't require closure of quotes on every paragraph", NULL },

   163     { "paranoid", 0, G_OPTION_FLAG_HIDDEN,

   164       G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,

   165       "Enable paranoid querying of everything", NULL },

   166     { "no-paranoid", 0, G_OPTION_FLAG_REVERSE,

   167       G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,

   168       "Disable paranoid querying of everything", NULL },

   169     { "line-end", 0, G_OPTION_FLAG_HIDDEN,

   170       G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,

   171       "Enable line end checking", NULL },

   172     { "no-line-end", 'l', G_OPTION_FLAG_REVERSE,

   173       G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,

   174       "Disable line end checking", NULL },

   175     { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,

   176       "Overview: just show counts", NULL },

   177     { "no-overview", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,

   178       G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,

   179       "Show individual warnings", NULL },

   180     { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,

   181       "Output errors to stdout instead of stderr", NULL },

   182     { "no-stdout", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,

   183       G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,

   184       "Output errors to stderr instead of stdout", NULL },

   185     { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,

   186       "Echo header fields", NULL },

   187     { "no-header", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,

   188       G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,

   189       "Don't echo header fields", NULL },

   190     { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,

   191       "Ignore markup in < >", NULL },

   192     { "no-markup", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,

   193       G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,

   194       "No special handling for markup in < >", NULL },

   195     { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,

   196       "Use file of user-defined typos", NULL },

   197     { "no-usertypo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,

   198       G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,

   199       "Ignore file of user-defined typos", NULL },

   200     { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,

   201       "Verbose - list everything", NULL },

   202     { "no-verbose", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,

   203       G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,

   204       "Switch off verbose mode", NULL },

   205     { "charset", 0, 0, G_OPTION_ARG_STRING, &opt_charset,

   206       "Set of characters valid for this ebook", "NAME" },

   207     { NULL }

   208 };

   210 /*

   211  * Options relating to configuration which make no sense from inside

   212  * a configuration file.

   213  */

   215 static GOptionEntry config_options[]={

   216     { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,

   217       "Defaults for use on www upload", NULL },

   218     { "dump-config", 0, 0, G_OPTION_ARG_NONE, pswit+DUMP_CONFIG_SWITCH,

   219       "Dump current config settings", NULL },

   220     { NULL }

   221 };

   223 static GOptionEntry compatibility_options[]={

   224     { "toggle-typo", 't', 0, G_OPTION_ARG_NONE, &typo_compat,

   225       "Toggle checking for common typos", NULL },

   226     { "toggle-relaxed", 'x', 0, G_OPTION_ARG_NONE, &paranoid_compat,

   227       "Toggle both paranoid mode and common typos", NULL },

   228     { NULL }

   229 };

   231 long cnt_quote;		/* for overview mode, count of quote queries */

   232 long cnt_brack;		/* for overview mode, count of brackets queries */

   233 long cnt_bin;		/* for overview mode, count of non-ASCII queries */

   234 long cnt_odd;		/* for overview mode, count of odd character queries */

   235 long cnt_long;		/* for overview mode, count of long line errors */

   236 long cnt_short;		/* for overview mode, count of short line queries */

   237 long cnt_punct;		/* for overview mode,

   238 			   count of punctuation and spacing queries */

   239 long cnt_dash;		/* for overview mode, count of dash-related queries */

   240 long cnt_word;		/* for overview mode, count of word queries */

   241 long cnt_html;		/* for overview mode, count of html queries */

   242 long cnt_lineend;	/* for overview mode, count of line-end queries */

   243 long cnt_spacend;	/* count of lines with space at end */

   244 long linecnt;		/* count of total lines in the file */

   245 long checked_linecnt;	/* count of lines actually checked */

   247 void proghelp(GOptionContext *context);

   248 void procfile(const char *);

   250 gchar *running_from;

   252 gboolean mixdigit(const char *);

   253 gchar *getaword(const char **);

   254 char *flgets(char **,long,int);

   255 void postprocess_for_HTML(char *);

   256 char *linehasmarkup(char *);

   257 char *losemarkup(char *);

   258 gboolean tagcomp(const char *,const char *);

   259 void loseentities(char *);

   260 gboolean isroman(const char *);

   261 void postprocess_for_DP(char *);

   262 void print_as_windows_1252(const char *string);

   263 void print_as_utf_8(const char *string);

   265 GTree *qword,*qperiod;

   267 #ifdef __WIN32__

   268 UINT saved_cp;

   269 #endif

   271 gboolean set_charset(const char *name,GError **err)

   272 {

   273     /* The various UNICODE encodings all share the same character set. */

   274     const char *unicode_aliases[]={ "UCS-2", "UCS-2BE", "UCS-2LE", "UCS-4",

   275       "UCS-4BE", "UCS-4LE", "UCS2", "UCS4", "UNICODE", "UNICODEBIG",

   276       "UNICODELITTLE", "UTF-7", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE",

   277       "UTF-32", "UTF-32BE", "UTF-32LE", "UTF7", "UTF8", "UTF16", "UTF16BE",

   278       "UTF16LE", "UTF32", "UTF32BE", "UTF32LE" };

   279     int i;

   280     if (charset)

   281 	g_free(charset);

   282     if (charset_validator!=(GIConv)-1)

   283 	g_iconv_close(charset_validator);

   284     if (!name || !g_strcasecmp(name,"auto"))

   285     {

   286 	charset=NULL;

   287 	charset_validator=(GIConv)-1;

   288 	return TRUE;

   289     }

   290     else

   291 	charset=g_strdup(name);

   292     for(i=0;i<G_N_ELEMENTS(unicode_aliases);i++)

   293 	if (!g_strcasecmp(charset,unicode_aliases[i]))

   294 	{

   295 	    g_free(charset);

   296 	    charset=g_strdup("UTF-8");

   297 	    break;

   298 	}

   299     if (!strcmp(charset,"UTF-8"))

   300 	charset_validator=(GIConv)-1;

   301     else

   302     {

   303 	charset_validator=g_iconv_open(charset,"UTF-8");

   304 	if (charset_validator==(GIConv)-1)

   305 	{

   306 	    g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_NO_CONVERSION,

   307 	      "Unknown character set \"%s\"",charset);

   308 	    return FALSE;

   309 	}

   310     }

   311     return TRUE;

   312 }

   314 GKeyFile *config;

   316 void config_file_update(GKeyFile *kf)

   317 {

   318     int i;

   319     const char *s;

   320     gboolean sw;

   321     for(i=0;options[i].long_name;i++)

   322     {

   323 	if (g_str_has_prefix(options[i].long_name,"no-"))

   324 	    continue;

   325 	if (options[i].arg==G_OPTION_ARG_NONE)

   326 	{

   327 	    sw=*(gboolean *)options[i].arg_data;

   328 	    if (options[i].flags&G_OPTION_FLAG_REVERSE)

   329 		sw=!sw;

   330 	    g_key_file_set_boolean(kf,"options",options[i].long_name,sw);

   331 	}

   332 	else if (options[i].arg==G_OPTION_ARG_STRING)

   333 	{

   334 	    s=*(gchar **)options[i].arg_data;

   335 	    if (!s)

   336 		s="auto";

   337 	    g_key_file_set_string(kf,"options",options[i].long_name,s);

   338 	}

   339 	else

   340 	    g_assert_not_reached();

   341     }

   342 }

   344 void config_file_add_comments(GKeyFile *kf)

   345 {

   346     int i;

   347     gchar *comment;

   348     g_key_file_set_comment(kf,NULL,NULL," Default configuration for bookloupe",

   349       NULL);

   350     for(i=0;options[i].long_name;i++)

   351     {

   352 	if (g_str_has_prefix(options[i].long_name,"no-"))

   353 	    continue;

   354 	comment=g_strconcat(" ",options[i].description,NULL);

   355 	g_key_file_set_comment(kf,"options",options[i].long_name,comment,NULL);

   356 	g_free(comment);

   357     }

   358 }

   360 void dump_config(void)

   361 {

   362     gchar *s;

   363     if (config)

   364 	config_file_update(config);

   365     else

   366     {

   367 	config=g_key_file_new();

   368 	config_file_update(config);

   369 	config_file_add_comments(config);

   370     }

   371     s=g_key_file_to_data(config,NULL,NULL);

   372     if (s)

   373 	g_print("%s",s);

   374     g_free(s);

   375 }

   377 GKeyFile *read_config_file(gchar **full_path)

   378 {

   379     int i;

   380     GError *err=NULL;

   381     gchar **search_dirs;

   382     gchar *path;

   383     const char *search_path;

   384     GKeyFile *kf;

   385     kf=g_key_file_new();

   386     search_path=g_getenv("BOOKLOUPE_CONFIG_PATH");

   387     if (search_path)

   388     {

   389 #ifdef __WIN32__

   390 	search_dirs=g_strsplit(search_path,";",0);

   391 #else

   392 	search_dirs=g_strsplit(search_path,":",0);

   393 #endif

   394     }

   395     else

   396     {

   397 	search_dirs=g_new(gchar *,4);

   398 	search_dirs[0]=g_get_current_dir();

   399 	search_dirs[1]=g_strdup(running_from);

   400 	search_dirs[2]=g_strdup(g_get_user_config_dir());

   401 	search_dirs[3]=NULL;

   402     }

   403     for(i=0;search_dirs[i];i++)

   404     {

   405 	path=g_build_filename(search_dirs[i],"bookloupe.ini",NULL);

   406 	if (g_key_file_load_from_file(kf,path,

   407 	  G_KEY_FILE_KEEP_COMMENTS|G_KEY_FILE_KEEP_TRANSLATIONS,&err))

   408 	    break;

   409 	if (!g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))

   410 	{

   411 	    g_printerr("Bookloupe: Error reading %s\n",path);

   412 	    g_printerr("%s\n",err->message);

   413 	    exit(1);

   414 	}

   415 	g_clear_error(&err);

   416 	g_free(path);

   417 	path=NULL;

   418     }

   419     if (!search_dirs[i])

   420     {

   421 	g_key_file_free(kf);

   422 	kf=NULL;

   423     }

   424     g_strfreev(search_dirs);

   425     if (full_path && kf)

   426 	*full_path=path;

   427     else

   428 	g_free(path);

   429     return kf;

   430 }

   432 void parse_config_file(void)

   433 {

   434     int i,j;

   435     gchar *path,*s;

   436     gchar **keys;

   437     gboolean sw;

   438     GError *err=NULL;

   439     config=read_config_file(&path);

   440     if (config)

   441 	keys=g_key_file_get_keys(config,"options",NULL,NULL);

   442     else

   443 	keys=NULL;

   444     if (keys)

   445     {

   446 	for(i=0;keys[i];i++)

   447 	{

   448 	    for(j=0;options[j].long_name;j++)

   449 	    {

   450 		if (g_str_has_prefix(options[j].long_name,"no-"))

   451 		    continue;

   452 		else if (!strcmp(keys[i],options[j].long_name))

   453 		{

   454 		    if (options[j].arg==G_OPTION_ARG_NONE)

   455 		    {

   456 			sw=g_key_file_get_boolean(config,"options",keys[i],

   457 			  &err);

   458 			if (err)

   459 			{

   460 			    g_printerr("Bookloupe: %s: options.%s: %s\n",

   461 			      path,keys[i],err->message);

   462 			    g_clear_error(&err);

   463 			}

   464 			else

   465 			{

   466 			    if (options[j].flags&G_OPTION_FLAG_REVERSE)

   467 				sw=!sw;

   468 			    *(gboolean *)options[j].arg_data=sw;

   469 			}

   470 			break;

   471 		    }

   472 		    else if (options[j].arg==G_OPTION_ARG_STRING)

   473 		    {

   474 			s=g_key_file_get_string(config,"options",keys[i],

   475 			  &err);

   476 			if (err)

   477 			{

   478 			    g_printerr("Bookloupe: %s: options.%s: %s\n",

   479 			      path,keys[i],err->message);

   480 			    g_clear_error(&err);

   481 			}

   482 			else

   483 			{

   484 			    g_free(*(gchar **)options[j].arg_data);

   485 			    if (!g_strcmp0(s,"auto"))

   486 			    {

   487 				*(gchar **)options[j].arg_data=NULL;

   488 				g_free(s);

   489 			    }

   490 			    else

   491 				*(gchar **)options[j].arg_data=s;

   492 			}

   493 			break;

   494 		    }

   495 		    else

   496 			g_assert_not_reached();

   497 		}

   498 	    }

   499 	    if (!options[j].long_name)

   500 		g_printerr("Bookloupe: %s: Unknown option \"%s\" ignored\n",

   501 		  path,keys[i]);

   502 	}

   503 	g_strfreev(keys);

   504     }

   505     if (config)

   506 	g_free(path);

   507 }

   509 void parse_options(int *argc,char ***argv)

   510 {

   511     GError *err=NULL;

   512     GOptionContext *context;

   513     GOptionGroup *compatibility;

   514     context=g_option_context_new(

   515       "file - look for errors in Project Gutenberg(TM) etexts");

   516     g_option_context_add_main_entries(context,options,NULL);

   517     g_option_context_add_main_entries(context,config_options,NULL);

   518     compatibility=g_option_group_new("compatibility",

   519       "Options for Compatibility with Gutcheck:",

   520       "Show compatibility options",NULL,NULL);

   521     g_option_group_add_entries(compatibility,compatibility_options);

   522     g_option_context_add_group(context,compatibility);

   523     g_option_context_set_description(context,

   524       "For simplicity, only the switch options which reverse the\n"

   525       "default configuration are listed. In most cases, both vanilla\n"

   526       "and \"no-\" prefixed versions are available for use.");

   527     if (!g_option_context_parse(context,argc,argv,&err))

   528     {

   529 	g_printerr("Bookloupe: %s\n",err->message);

   530 	g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);

   531 	exit(1);

   532     }

   533     if (typo_compat)

   534 	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];

   535     if (paranoid_compat)

   536     {

   537 	pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];

   538 	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];

   539     }

   540     /*

   541      * Web uploads - for the moment, this is really just a placeholder

   542      * until we decide what processing we really want to do on web uploads

   543      */

   544     if (pswit[WEB_SWITCH])

   545     {

   546 	/* specific override for web uploads */

   547 	pswit[ECHO_SWITCH]=TRUE;

   548 	pswit[SQUOTE_SWITCH]=FALSE;

   549 	pswit[TYPO_SWITCH]=TRUE;

   550 	pswit[QPARA_SWITCH]=FALSE;

   551 	pswit[PARANOID_SWITCH]=TRUE;

   552 	pswit[LINE_END_SWITCH]=FALSE;

   553 	pswit[OVERVIEW_SWITCH]=FALSE;

   554 	pswit[STDOUT_SWITCH]=FALSE;

   555 	pswit[HEADER_SWITCH]=TRUE;

   556 	pswit[VERBOSE_SWITCH]=FALSE;

   557 	pswit[MARKUP_SWITCH]=FALSE;

   558 	pswit[USERTYPO_SWITCH]=FALSE;

   559 	pswit[DP_SWITCH]=FALSE;

   560     }

   561     if (opt_charset && !set_charset(opt_charset,&err))

   562     {

   563 	g_printerr("%s\n",err->message);

   564 	exit(1);

   565     }

   566     if (pswit[DUMP_CONFIG_SWITCH])

   567     {

   568 	dump_config();

   569 	exit(0);

   570     }

   571     g_free(opt_charset);

   572     opt_charset=NULL;

   573     if (pswit[OVERVIEW_SWITCH])

   574 	/* just print summary; don't echo */

   575 	pswit[ECHO_SWITCH]=FALSE;

   576     if (*argc<2)

   577     {

   578 	proghelp(context);

   579 	exit(1);

   580     }

   581     g_option_context_free(context);

   582 }

   584 /*

   585  * read_user_scannos:

   586  *

   587  * Read in the user-defined stealth scanno list.

   588  */

   589 void read_user_scannos(void)

   590 {

   591     GError *err=NULL;

   592     gchar *usertypo_file;

   593     gboolean okay;

   594     int i;

   595     gsize len,nb;

   596     gchar *contents,*utf8,**lines;

   597     usertypo_file=g_strdup("bookloupe.typ");

   598     okay=file_get_contents_text(usertypo_file,&contents,&len,&err);

   599     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))

   600     {

   601 	g_clear_error(&err);

   602 	g_free(usertypo_file);

   603 	usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);

   604 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);

   605     }

   606     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))

   607     {

   608 	g_clear_error(&err);

   609 	g_free(usertypo_file);

   610 	usertypo_file=g_strdup("gutcheck.typ");

   611 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);

   612     }

   613     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))

   614     {

   615 	g_clear_error(&err);

   616 	g_free(usertypo_file);

   617 	usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);

   618 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);

   619     }

   620     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))

   621     {

   622 	g_free(usertypo_file);

   623 	g_print("   --> I couldn't find bookloupe.typ "

   624 	  "-- proceeding without user typos.\n");

   625 	return;

   626     }

   627     else if (!okay)

   628     {

   629 	fprintf(stderr,"%s: %s\n",usertypo_file,err->message);

   630 	g_free(usertypo_file);

   631 	g_clear_error(&err);

   632 	exit(1);

   633     }

   634     if (g_utf8_validate(contents,len,NULL))

   635     {

   636 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);

   637 	if (!charset)

   638 	    (void)set_charset("UNICODE",NULL);

   639     }

   640     else

   641 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);

   642     g_free(contents);

   643     lines=g_strsplit_set(utf8,"\r\n",0);

   644     g_free(utf8);

   645     usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);

   646     for (i=0;lines[i];i++)

   647 	if (*(unsigned char *)lines[i]>'!')

   648 	    g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));

   649 	else

   650 	    g_free(lines[i]);

   651     g_free(lines);

   652 }

   654 /*

   655  * read_etext:

   656  *

   657  * Read an etext returning a newly allocated string containing the file

   658  * contents or NULL on error.

   659  */

   660 gchar *read_etext(const char *filename,GError **err)

   661 {

   662     GError *tmp_err=NULL;

   663     gchar *contents,*utf8;

   664     gsize len,bytes_read,bytes_written;

   665     int i,line,col;

   666     if (!g_file_get_contents(filename,&contents,&len,err))

   667 	return NULL;

   668     if (g_utf8_validate(contents,len,NULL))

   669     {

   670 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);

   671 	g_set_print_handler(print_as_utf_8);

   672 #ifdef __WIN32__

   673 	SetConsoleOutputCP(CP_UTF8);

   674 #endif

   675     }

   676     else

   677     {

   678 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,

   679 	  &bytes_written,&tmp_err);

   680 	if (g_error_matches(tmp_err,G_CONVERT_ERROR,

   681 	  G_CONVERT_ERROR_ILLEGAL_SEQUENCE))

   682 	{

   683 	    line=col=1;

   684 	    for(i=0;i<bytes_read;i++)

   685 		if (contents[i]=='\n')

   686 		{

   687 		    line++;

   688 		    col=1;

   689 		}

   690 		else if (contents[i]!='\r')

   691 		    col++;

   692 	    g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,

   693 	      "Input conversion failed. Byte %d at line %d, column %d is not a "

   694 	      "valid Windows-1252 character",

   695 	      ((unsigned char *)contents)[bytes_read],line,col);

   696 	}

   697 	else if (tmp_err)

   698 	    g_propagate_error(err,tmp_err);

   699 	g_set_print_handler(print_as_windows_1252);

   700 #ifdef __WIN32__

   701 	SetConsoleOutputCP(1252);

   702 #endif

   703     }

   704     g_free(contents);

   705     return utf8;

   706 }

   708 void cleanup_on_exit(void)

   709 {

   710 #ifdef __WIN32__

   711     SetConsoleOutputCP(saved_cp);

   712 #endif

   713 }

   715 int main(int argc,char **argv)

   716 {

   717 #ifdef __WIN32__

   718     atexit(cleanup_on_exit);

   719     saved_cp=GetConsoleOutputCP();

   720 #endif

   721     running_from=g_path_get_dirname(argv[0]);

   722     /* Paranoid checking is turned OFF, not on, by its switch */

   723     pswit[PARANOID_SWITCH]=TRUE;

   724     /* if running in paranoid mode, typo checks default to enabled */

   725     pswit[TYPO_SWITCH]=TRUE;

   726     /* Line-end checking is turned OFF, not on, by its switch */

   727     pswit[LINE_END_SWITCH]=TRUE;

   728     /* Echoing is turned OFF, not on, by its switch */

   729     pswit[ECHO_SWITCH]=TRUE;

   730     parse_config_file();

   731     parse_options(&argc,&argv);

   732     if (pswit[USERTYPO_SWITCH])

   733 	read_user_scannos();

   734     fprintf(stderr,"bookloupe: Check and report on an e-text\n");

   735     procfile(argv[1]);

   736     if (pswit[OVERVIEW_SWITCH])

   737     {

   738 	g_print("    Checked %ld lines of %ld (head+foot = %ld)\n\n",

   739 	  checked_linecnt,linecnt,linecnt-checked_linecnt);

   740 	g_print("    --------------- Queries found --------------\n");

   741 	if (cnt_long)

   742 	    g_print("    Long lines:		    %14ld\n",cnt_long);

   743 	if (cnt_short)

   744 	    g_print("    Short lines:		   %14ld\n",cnt_short);

   745 	if (cnt_lineend)

   746 	    g_print("    Line-end problems:	     %14ld\n",cnt_lineend);

   747 	if (cnt_word)

   748 	    g_print("    Common typos:		  %14ld\n",cnt_word);

   749 	if (cnt_quote)

   750 	    g_print("    Unmatched quotes:	      %14ld\n",cnt_quote);

   751 	if (cnt_brack)

   752 	    g_print("    Unmatched brackets:	    %14ld\n",cnt_brack);

   753 	if (cnt_bin)

   754 	    g_print("    Non-ASCII characters:	  %14ld\n",cnt_bin);

   755 	if (cnt_odd)

   756 	    g_print("    Proofing characters:	   %14ld\n",cnt_odd);

   757 	if (cnt_punct)

   758 	    g_print("    Punctuation & spacing queries: %14ld\n",cnt_punct);

   759 	if (cnt_dash)

   760 	    g_print("    Non-standard dashes:	   %14ld\n",cnt_dash);

   761 	if (cnt_html)

   762 	    g_print("    Possible HTML tags:	    %14ld\n",cnt_html);

   763 	g_print("\n");

   764 	g_print("    TOTAL QUERIES		  %14ld\n",

   765 	  cnt_quote+cnt_brack+cnt_bin+cnt_odd+cnt_long+cnt_short+cnt_punct+

   766 	  cnt_dash+cnt_word+cnt_html+cnt_lineend);

   767     }

   768     g_free(running_from);

   769     if (usertypo)

   770 	g_tree_unref(usertypo);

   771     set_charset(NULL,NULL);

   772     if (config)

   773 	g_key_file_free(config);

   774     return 0;

   775 }

   777 void count_dashes(const char *line,const char *dash,

   778   struct dash_results *results)

   779 {

   780     int i;

   781     gchar **tokens;

   782     gunichar pc,nc;

   783     gboolean spaced=FALSE,unspaced=FALSE,spaced2=FALSE;

   784     if (!*line)

   785 	return;

   786     tokens=g_strsplit(line,dash,0);

   787     if (tokens[1])

   788 	results->base++;

   789     for(i=1;tokens[i];i++)

   790     {

   791 	pc=g_utf8_get_char(g_utf8_prev_char(tokens[i-1]+strlen(tokens[i-1])));

   792 	nc=g_utf8_get_char(tokens[i]);

   793 	if (g_unichar_isspace(pc) || g_unichar_isspace(nc))

   794 	    spaced=TRUE;

   795 	if (g_unichar_isspace(pc) && g_unichar_isspace(nc))

   796 	    spaced2=TRUE;

   797 	else if (!g_unichar_isspace(pc) && !g_unichar_isspace(nc))

   798 	    unspaced=TRUE;

   799     }

   800     if (spaced)

   801 	results->space++;

   802     if (spaced2)

   803 	/* count of lines with em-dashes with spaces both sides */

   804 	results->non_PG_space++;

   805     if (unspaced)

   806 	/* count of lines with PG-type em-dashes with no spaces */

   807 	results->PG_space++;

   808     g_strfreev(tokens);

   809 }

/*
 * first_pass:
 *
 * Run a first pass - verify that it's a valid PG
 * file, decide whether to report some things that
 * occur many times in the text like long or short
 * lines, non-standard dashes, etc.
 *
 * The text is split into lines on LF (or on CR when it contains no LF)
 * and every line is scanned to accumulate statistics. The globals
 * linecnt and cnt_spacend are updated as a side effect.
 *
 * Returns: a pointer to a function-static results structure. The
 * structure is shared by every call and is not reset here.
 */
struct first_pass_results *first_pass(const char *etext)
{
    gunichar laststart=CHAR_SPACE;
    const char *s;
    gchar *lc_line;
    int i,j,lbytes,llen;
    gchar **lines;
    unsigned int lastlen=0,lastblen=0;
    /* Line numbers following an old-style (spline) or new-style (nspline) */
    /* header; zero until the corresponding header line has been seen. */
    long spline=0,nspline=0;
    static struct first_pass_results results={0};
    struct dash_results tmp_dash_results;
    gchar *inword;
    QuoteClass qc;
    lines=g_strsplit(etext,"\n",0);
    if (!lines[0])
    {
	/* An empty etext has no terminators */
	results.newlines=DOS_NEWLINES;
    }
    else if (!lines[1])
    {
	/*
	 * If there are no LFs, we don't have UNIX-style
	 * terminators, but we might have OS9-style ones.
	 */
	results.newlines=OS9_NEWLINES;
	g_strfreev(lines);
	lines=g_strsplit(etext,"\r",0);
	if (!lines[0] || !lines[1])
	    /* Looks like we don't have any terminators at all */
	    results.newlines=DOS_NEWLINES;
    }
    else
    {
	/* We might have UNIX-style terminators */
	results.newlines=UNIX_NEWLINES;
    }
    for (j=0;lines[j];j++)
    {
	lbytes=strlen(lines[j]);
	/*
	 * Strip any trailing CRs. A single line ending in CR is enough
	 * to classify the whole file as having DOS-style terminators.
	 */
	if (lbytes>0 && lines[j][lbytes-1]=='\r')
	{
	    results.newlines=DOS_NEWLINES;
	    do
	    {
		lines[j][--lbytes]='\0';
	    } while (lbytes>0 && lines[j][lbytes-1]=='\r');
	}
	/* lbytes is the length in bytes, llen the length in characters */
	llen=g_utf8_strlen(lines[j],lbytes);
	linecnt++;
	/* Old-style header: the line that ends the small print */
	if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
	  (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
	{
	    if (spline)
		g_print("   --> Duplicate header?\n");
	    spline=linecnt+1;   /* first line of non-header text, that is */
	}
	/* New-style header: a "*** START ... PROJECT GUTENBERG" line */
	if (!strncmp(lines[j],"*** START",9) &&
	  strstr(lines[j],"PROJECT GUTENBERG"))
	{
	    if (nspline)
		g_print("   --> Duplicate header?\n");
	    nspline=linecnt+1;   /* first line of non-header text, that is */
	}
	/* Only look for a footer once some header has been seen */
	if (spline || nspline)
	{
	    lc_line=g_utf8_strdown(lines[j],lbytes);
	    if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
	    {
		/* "end" has to come before "project gutenberg" on the line */
		if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
		{
		    if (results.footerline)
		    {
			/* it's an old-form header - we can detect duplicates */
			if (!nspline)
			    g_print("   --> Duplicate footer?\n");
		    }
		    else
			results.footerline=linecnt;
		}
	    }
	    g_free(lc_line);
	}
	if (spline)
	    results.firstline=spline;
	if (nspline)
	    results.firstline=nspline;  /* override with new */
	if (results.footerline)
	    continue;    /* don't count the boilerplate in the footer */
	results.totlen+=llen;
	/* Per-character tallies: high-bit, alphabetic, and endquotes */
	for (s=lines[j];*s;s=g_utf8_next_char(s))
	{
	    if (g_utf8_get_char(s)>127)
		results.binlen++;
	    if (g_unichar_isalpha(g_utf8_get_char(s)))
		results.alphalen++;
	    if (s>lines[j])
	    {
		if (CHAR_IS_DQUOTE(g_utf8_get_char(s)))
		    qc=QUOTE_CLASS(g_utf8_get_char(s));
		else
		    qc=INVALID_QUOTE;
		/* a closing or neutral double quote directly after a letter */
		if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) &&
		  g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
		    results.endquote_count++;
	    }
	}
	/*
	 * The previous line counts as short when it was shorter than
	 * SHORTEST_PG_LINE, the line before it was longer than that,
	 * and the previous line did not start with a space.
	 */
	if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
	  lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
	    results.shortline++;
	/* Last character is a space or a control character */
	if (lbytes>0 &&
	  g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
	    cnt_spacend++;
	if (strstr(lines[j],".,"))
	    results.dotcomma++;
	/* only count ast lines for ignoring purposes where there is */
	/* locase text on the line */
	if (strchr(lines[j],'*'))
	{
	    for (s=lines[j];*s;s=g_utf8_next_char(s))
		if (g_unichar_islower(g_utf8_get_char(s)))
		    break;
	    if (*s)
		results.astline++;
	}
	if (strchr(lines[j],'/'))
	    results.fslashline++;
	if (lbytes>0)
	{
	    /* Step back over trailing characters at or below CHAR_SPACE */
	    for (s=g_utf8_prev_char(lines[j]+lbytes);
	      s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;
	      s=g_utf8_prev_char(s))
		;
	    /* A single trailing hyphen (not the tail of "--") */
	    if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
	      g_utf8_get_char(g_utf8_prev_char(s))!='-')
		results.hyphens++;
	}
	if (llen>LONGEST_PG_LINE)
	    results.longline++;
	if (llen>WAY_TOO_LONG)
	    results.verylongline++;
	if (strchr(lines[j],'<') && strchr(lines[j],'>'))
	{
	    /* i is positive when the first '>' follows the first '<' */
	    i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
	    if (i>0)
		results.htmcount++;
	    if (strstr(lines[j],"<i>"))
		results.htmcount+=4; /* bonus marks! */
	}
	/* Check for spaced em-dashes */
	/* Tally this line on its own, then fold it into per-line counts */
	memset(&tmp_dash_results,0,sizeof(tmp_dash_results));
	count_dashes(lines[j],"--",&tmp_dash_results);
	count_dashes(lines[j],"—",&tmp_dash_results);
	if (tmp_dash_results.base)
	    results.emdash.base++;
	if (tmp_dash_results.non_PG_space)
	    results.emdash.non_PG_space++;
	if (tmp_dash_results.PG_space)
	    results.emdash.PG_space++;
	/* Word-by-word checks; each word returned by getaword is freed here */
	for (s=lines[j];*s;)
	{
	    inword=getaword(&s);
	    if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))
		results.Dutchcount++;
	    if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))
		results.Frenchcount++;
	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))
		results.standalone_digit++;
	    g_free(inword);
	}
	/* Check for spaced dashes */
	/* Only the first " -" on the line is examined */
	if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
	    results.spacedash++;
	/* Remember this line's properties for the next short-line test */
	lastblen=lastlen;
	lastlen=llen;
	laststart=lines[j][0];
    }
    g_strfreev(lines);
    return &results;
}

  1000 /*

  1001  * report_first_pass:

  1002  *

  1003  * Make some snap decisions based on the first pass results.

  1004  */

  1005 struct warnings *report_first_pass(struct first_pass_results *results)

  1006 {

  1007     static struct warnings warnings={0};

  1008     warnings.newlines=results->newlines;

  1009     if (warnings.newlines==UNIX_NEWLINES)

  1010 	g_print("   --> No lines in this file have a CR. Not reporting them. "

  1011 	  "Project Gutenberg requires that all lineends be CR-LF.\n");

  1012     else if (warnings.newlines==OS9_NEWLINES)

  1013 	g_print("   --> No lines in this file have a LF. Not reporting them. "

  1014 	  "Project Gutenberg requires that all lineends be CR-LF.\n");

  1015     if (cnt_spacend>0)

  1016 	g_print("   --> %ld lines in this file have white space at end\n",

  1017 	  cnt_spacend);

  1018     warnings.dotcomma=1;

  1019     if (results->dotcomma>5)

  1020     {

  1021 	warnings.dotcomma=0;

  1022 	g_print("   --> %ld lines in this file contain '.,'. "

  1023 	  "Not reporting them.\n",results->dotcomma);

  1024     }

  1025     /*

  1026      * If more than 50 lines, or one-tenth, are short,

  1027      * don't bother reporting them.

  1028      */

  1029     warnings.shortline=1;

  1030     if (results->shortline>50 || results->shortline*10>linecnt)

  1031     {

  1032 	warnings.shortline=0;

  1033 	g_print("   --> %ld lines in this file are short. "

  1034 	  "Not reporting short lines.\n",results->shortline);

  1035     }

  1036     /*

  1037      * If more than 50 lines, or one-tenth, are long,

  1038      * don't bother reporting them.

  1039      */

  1040     warnings.longline=1;

  1041     if (results->longline>50 || results->longline*10>linecnt)

  1042     {

  1043 	warnings.longline=0;

  1044 	g_print("   --> %ld lines in this file are long. "

  1045 	  "Not reporting long lines.\n",results->longline);

  1046     }

  1047     /* If more than 10 lines contain asterisks, don't bother reporting them. */

  1048     warnings.ast=1;

  1049     if (results->astline>10)

  1050     {

  1051 	warnings.ast=0;

  1052 	g_print("   --> %ld lines in this file contain asterisks. "

  1053 	  "Not reporting them.\n",results->astline);

  1054     }

  1055     /*

  1056      * If more than 10 lines contain forward slashes,

  1057      * don't bother reporting them.

  1058      */

  1059     warnings.fslash=1;

  1060     if (results->fslashline>10)

  1061     {

  1062 	warnings.fslash=0;

  1063 	g_print("   --> %ld lines in this file contain forward slashes. "

  1064 	  "Not reporting them.\n",results->fslashline);

  1065     }

  1066     /*

  1067      * If more than 20 lines contain unpunctuated endquotes,

  1068      * don't bother reporting them.

  1069      */

  1070     warnings.endquote=1;

  1071     if (results->endquote_count>20)

  1072     {

  1073 	warnings.endquote=0;

  1074 	g_print("   --> %ld lines in this file contain unpunctuated endquotes. "

  1075 	  "Not reporting them.\n",results->endquote_count);

  1076     }

  1077     /*

  1078      * If more than 15 lines contain standalone digits,

  1079      * don't bother reporting them.

  1080      */

  1081     warnings.digit=1;

  1082     if (results->standalone_digit>10)

  1083     {

  1084 	warnings.digit=0;

  1085 	g_print("   --> %ld lines in this file contain standalone 0s and 1s. "

  1086 	  "Not reporting them.\n",results->standalone_digit);

  1087     }

  1088     /*

  1089      * If more than 20 lines contain hyphens at end,

  1090      * don't bother reporting them.

  1091      */

  1092     warnings.hyphen=1;

  1093     if (results->hyphens>20)

  1094     {

  1095 	warnings.hyphen=0;

  1096 	g_print("   --> %ld lines in this file have hyphens at end. "

  1097 	  "Not reporting them.\n",results->hyphens);

  1098     }

  1099     if (results->htmcount>20 && !pswit[MARKUP_SWITCH])

  1100     {

  1101 	g_print("   --> Looks like this is HTML. Switching HTML mode ON.\n");

  1102 	pswit[MARKUP_SWITCH]=1;

  1103     }

  1104     if (results->verylongline>0)

  1105 	g_print("   --> %ld lines in this file are VERY long!\n",

  1106 	  results->verylongline);

  1107     /*

  1108      * If there are more non-PG spaced dashes than PG em-dashes,

  1109      * assume it's deliberate.

  1110      * Current PG guidelines say don't use them, but older texts do,

  1111      * and some people insist on them whatever the guidelines say.

  1112      */

  1113     warnings.dash=1;

  1114     if (results->spacedash+results->emdash.non_PG_space>

  1115       results->emdash.PG_space)

  1116     {

  1117 	warnings.dash=0;

  1118 	g_print("   --> There are %ld spaced dashes and em-dashes. "

  1119 	  "Not reporting them.\n",

  1120 	  results->spacedash+results->emdash.non_PG_space);

  1121     }

  1122     if (charset)

  1123 	warnings.bin=0;

  1124     else

  1125     {

  1126 	/* Charset ISO_8859-1/ASCII checks for compatibility with gutcheck */

  1127 	warnings.bin=1;

  1128 	/* If more than a quarter of characters are hi-bit, bug out. */

  1129 	if (results->binlen*4>results->totlen)

  1130 	{

  1131 	    g_print("   --> This file does not appear to be ASCII. "

  1132 	      "Terminating. Best of luck with it!\n");

  1133 	    exit(1);

  1134 	}

  1135 	if (results->alphalen*4<results->totlen)

  1136 	{

  1137 	    g_print("   --> This file does not appear to be text. "

  1138 	      "Terminating. Best of luck with it!\n");

  1139 	    exit(1);

  1140 	}

  1141 	if (results->binlen*100>results->totlen || results->binlen>100)

  1142 	{

  1143 	    g_print("   --> There are a lot of foreign letters here. "

  1144 	      "Not reporting them.\n");

  1145 	    if (!pswit[VERBOSE_SWITCH])

  1146 		warnings.bin=0;

  1147 	}

  1148     }

  1149     warnings.isDutch=FALSE;

  1150     if (results->Dutchcount>50)

  1151     {

  1152 	warnings.isDutch=TRUE;

  1153 	g_print("   --> This looks like Dutch - "

  1154 	  "switching off dashes and warnings for 's Middags case.\n");

  1155     }

  1156     warnings.isFrench=FALSE;

  1157     if (results->Frenchcount>50)

  1158     {

  1159 	warnings.isFrench=TRUE;

  1160 	g_print("   --> This looks like French - "

  1161 	  "switching off some doublepunct.\n");

  1162     }

  1163     if (results->firstline && results->footerline)

  1164 	g_print("    The PG header and footer appear to be already on.\n");

  1165     else

  1166     {

  1167 	if (results->firstline)

  1168 	    g_print("    The PG header is on - no footer.\n");

  1169 	if (results->footerline)

  1170 	    g_print("    The PG footer is on - no header.\n");

  1171     }

  1172     g_print("\n");

  1173     if (pswit[VERBOSE_SWITCH])

  1174     {

  1175 	warnings.shortline=1;

  1176 	warnings.dotcomma=1;

  1177 	warnings.longline=1;

  1178 	warnings.dash=1;

  1179 	warnings.digit=1;

  1180 	warnings.ast=1;

  1181 	warnings.fslash=1;

  1182 	warnings.hyphen=1;

  1183 	warnings.endquote=1;

  1184 	g_print("   *** Verbose output is ON -- you asked for it! ***\n");

  1185     }

  1186     if (warnings.isDutch)

  1187 	warnings.dash=0;

  1188     if (results->footerline>0 && results->firstline>0 &&

  1189       results->footerline>results->firstline &&

  1190       results->footerline-results->firstline<100)

  1191     {

  1192 	g_print("   --> I don't really know where this text starts. \n");

  1193 	g_print("       There are no reference points.\n");

  1194 	g_print("       I'm going to have to report the header and footer "

  1195 	  "as well.\n");

  1196 	results->firstline=0;

  1197     }

  1198     return &warnings;

  1199 }

  1201 /*

  1202  * analyse_quotes:

  1203  *

  1204  * Look along the line, accumulate the count of quotes, and see

  1205  * if this is an empty line - i.e. a line with nothing on it

  1206  * but spaces.

  1207  * If line has just spaces, period, * and/or - on it, don't

  1208  * count it, since empty lines with asterisks or dashes to

  1209  * separate sections are common.

  1210  *

  1211  * Returns: TRUE if the line is empty.

  1212  */

  1213 gboolean analyse_quotes(const char *aline,struct counters *counters)

  1214 {

  1215     int guessquote=0;

  1216     /* assume the line is empty until proven otherwise */

  1217     gboolean isemptyline=TRUE;

  1218     const char *s=aline,*sprev,*snext;

  1219     gunichar c;

  1220     sprev=NULL;

  1221     GError *tmp_err=NULL;

  1222     while (*s)

  1223     {

  1224 	snext=g_utf8_next_char(s);

  1225 	c=g_utf8_get_char(s);

  1226 	if (CHAR_IS_DQUOTE(c))

  1227 	    (void)count_quote(counters,c,QUOTE_CLASS(c),&tmp_err);

  1228 	else if (CHAR_IS_SQUOTE(c) && pswit[SQUOTE_SWITCH])

  1229 	{

  1230 	    if (s==aline)

  1231 	    {

  1232 		/*

  1233 		 * At start of line, it can only be a quotation mark.

  1234 		 * Hardcode a very common exception!

  1235 		 */

  1236 		if (!g_str_has_prefix(snext,"tis") &&

  1237 		  !g_str_has_prefix(snext,"Tis"))

  1238 		    (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);

  1239 	    }

  1240 	    else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&

  1241 	      g_unichar_isalpha(g_utf8_get_char(snext)))

  1242 		/* Do nothing! it's definitely an apostrophe, not a quote */

  1243 		;

  1244 	    /* it's outside a word - let's check it out */

  1245 	    else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||

  1246 	      g_unichar_isalpha(g_utf8_get_char(snext)))

  1247 	    {

  1248 		/* certainly looks like a quotation mark */

  1249 		if (!g_str_has_prefix(snext,"tis") &&

  1250 		  !g_str_has_prefix(snext,"Tis"))

  1251 		    /* hardcode a very common exception! */

  1252 		{

  1253 		    if (strchr(".?!,;:",g_utf8_get_char(sprev)))

  1254 			(void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);

  1255 		    else

  1256 			(void)count_quote(counters,c,OPENING_QUOTE,&tmp_err);

  1257 		}

  1258 	    }

  1259 	    else

  1260 	    {

  1261 		/* now - is it a quotation mark? */

  1262 		guessquote=0;   /* accumulate clues */

  1263 		if (g_unichar_isalpha(g_utf8_get_char(sprev)))

  1264 		{

  1265 		    /* it follows a letter - could be either */

  1266 		    guessquote++;

  1267 		    if (g_utf8_get_char(sprev)=='s')

  1268 		    {

  1269 			/* looks like a plural apostrophe */

  1270 			guessquote-=3;

  1271 			if (g_utf8_get_char(snext)==CHAR_SPACE)

  1272 			    /* bonus marks! */

  1273 			    guessquote-=2;

  1274 		    }

  1275 		    if (innermost_quote_matches(counters,c))

  1276 			/*

  1277 			 * Give it the benefit of some doubt,

  1278 			 * if a squote is already open.

  1279 			 */

  1280 			guessquote++;

  1281 		    else

  1282 			guessquote--;

  1283 		    if (guessquote>=0)

  1284 			(void)count_quote(counters,c,CLOSING_QUOTE,&tmp_err);

  1285 		}

  1286 		else

  1287 		    /* no adjacent letter - it must be a quote of some kind */

  1288 		    (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);

  1289 	    }

  1290 	}

  1291 	if (tmp_err)

  1292 	{

  1293 	    if (pswit[ECHO_SWITCH])

  1294 		g_print("\n%s\n",aline);

  1295 	    if (!pswit[OVERVIEW_SWITCH])

  1296 		g_print("    Line %ld column %ld - %s\n",

  1297 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1,tmp_err->message);

  1298 	    g_clear_error(&tmp_err);

  1299 	}

  1300 	if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&

  1301 	  c!='\r' && c!='\n')

  1302 	    isemptyline=FALSE;  /* ignore lines like  *  *  *  as spacers */

  1303 	if (c==CHAR_UNDERSCORE)

  1304 	    counters->c_unders++;

  1305 	if (c==CHAR_OPEN_SBRACK)

  1306 	{

  1307 	    if (!matching_difference(counters,COUNTER_ILLUSTRATION) &&

  1308 	      !matching_difference(counters,c) && s==aline &&

  1309 	      g_str_has_prefix(s,"[Illustration:"))

  1310 		increment_matching(counters,COUNTER_ILLUSTRATION,TRUE);

  1311 	    else

  1312 		increment_matching(counters,c,TRUE);

  1313 	}

  1314 	else if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK)

  1315 	    increment_matching(counters,c,TRUE);

  1316 	if (c==CHAR_CLOSE_SBRACK)

  1317 	{

  1318 	    if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) &&

  1319 	      !matching_difference(counters,c) && !*snext)

  1320 		increment_matching(counters,COUNTER_ILLUSTRATION,FALSE);

  1321 	    else

  1322 		increment_matching(counters,c,FALSE);

  1323 	}

  1324 	else if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK)

  1325 	    increment_matching(counters,c,FALSE);

  1326 	sprev=s;

  1327 	s=snext;

  1328     }

  1329     return isemptyline;

  1330 }

  1332 /*

  1333  * check_for_control_characters:

  1334  *

  1335  * Check for invalid or questionable characters in the line

  1336  * Anything above 127 is invalid for plain ASCII, and

  1337  * non-printable control characters should also be flagged.

  1338  * Tabs should generally not be there.

  1339  */

  1340 void check_for_control_characters(const char *aline)

  1341 {

  1342     gunichar c;

  1343     const char *s;

  1344     for (s=aline;*s;s=g_utf8_next_char(s))

  1345     {

  1346 	c=g_utf8_get_char(s);

  1347 	if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)

  1348 	{

  1349 	    if (pswit[ECHO_SWITCH])

  1350 		g_print("\n%s\n",aline);

  1351 	    if (!pswit[OVERVIEW_SWITCH])

  1352 		g_print("    Line %ld column %ld - Control character %u\n",

  1353 		  linecnt,g_utf8_pointer_to_offset(s,aline)+1,c);

  1354 	    else

  1355 		cnt_bin++;

  1356 	}

  1357     }

  1358 }

  1360 /*

  1361  * check_for_odd_characters:

  1362  *

  1363  * Check for binary and other odd characters.

  1364  */

  1365 void check_for_odd_characters(const char *aline,const struct warnings *warnings,

  1366   gboolean isemptyline)

  1367 {

  1368     /* Don't repeat multiple warnings on one line. */

  1369     gboolean eInvalidChar=FALSE,eTab=FALSE,eTilde=FALSE;

  1370     gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;

  1371     const char *s;

  1372     gunichar c;

  1373     gsize nb;

  1374     gchar *t;

  1375     for (s=aline;*s;s=g_utf8_next_char(s))

  1376     {

  1377 	c=g_utf8_get_char(s);

  1378 	if (warnings->bin && !eInvalidChar &&

  1379 	  (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))

  1380 	{

  1381 	    if (pswit[ECHO_SWITCH])

  1382 		g_print("\n%s\n",aline);

  1383 	    if (!pswit[OVERVIEW_SWITCH])

  1384 		if (c>127 && c<160 || c>255)

  1385 		    g_print("    Line %ld column %ld - "

  1386 		      "Non-ISO-8859 character %u\n",

  1387 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);

  1388 		else

  1389 		    g_print("    Line %ld column %ld - "

  1390 		      "Non-ASCII character %u\n",

  1391 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);

  1392 	    else

  1393 		cnt_bin++;

  1394 	    eInvalidChar=TRUE;

  1395 	}

  1396 	if (!eInvalidChar && charset)

  1397 	{

  1398 	    if (charset_validator==(GIConv)-1)

  1399 	    {

  1400 		if (!g_unichar_isdefined(c))

  1401 		{

  1402 		    if (pswit[ECHO_SWITCH])

  1403 			g_print("\n%s\n",aline);

  1404 		    if (!pswit[OVERVIEW_SWITCH])

  1405 			g_print("    Line %ld column %ld - Unassigned UNICODE "

  1406 			  "code point U+%04" G_GINT32_MODIFIER "X\n",

  1407 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);

  1408 		    else

  1409 			cnt_bin++;

  1410 		    eInvalidChar=TRUE;

  1411 		}

  1412 		else if (c>=0xE000 && c<=0xF8FF || c>=0xF0000 && c<=0xFFFFD ||

  1413 		  c>=100000 && c<=0x10FFFD)

  1414 		{

  1415 		    if (pswit[ECHO_SWITCH])

  1416 			g_print("\n%s\n",aline);

  1417 		    if (!pswit[OVERVIEW_SWITCH])

  1418 			g_print("    Line %ld column %ld - Private Use "

  1419 			  "character U+%04" G_GINT32_MODIFIER "X\n",

  1420 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);

  1421 		    else

  1422 			cnt_bin++;

  1423 		    eInvalidChar=TRUE;

  1424 		}

  1425 	    }

  1426 	    else

  1427 	    {

  1428 		t=g_convert_with_iconv(s,g_utf8_next_char(s)-s,

  1429 		  charset_validator,NULL,&nb,NULL);

  1430 		if (t)

  1431 		    g_free(t);

  1432 		else

  1433 		{

  1434 		    if (pswit[ECHO_SWITCH])

  1435 			g_print("\n%s\n",aline);

  1436 		    if (!pswit[OVERVIEW_SWITCH])

  1437 			g_print("    Line %ld column %ld - Non-%s "

  1438 			  "character %u\n",linecnt,

  1439 			  g_utf8_pointer_to_offset(aline,s)+1,charset,c);

  1440 		    else

  1441 			cnt_bin++;

  1442 		    eInvalidChar=TRUE;

  1443 		}

  1444 	    }

  1445 	}

  1446 	if (!eTab && c==CHAR_TAB)

  1447 	{

  1448 	    if (pswit[ECHO_SWITCH])

  1449 		g_print("\n%s\n",aline);

  1450 	    if (!pswit[OVERVIEW_SWITCH])

  1451 		g_print("    Line %ld column %ld - Tab character?\n",

  1452 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1453 	    else

  1454 		cnt_odd++;

  1455 	    eTab=TRUE;

  1456 	}

  1457 	if (!eTilde && c==CHAR_TILDE)

  1458 	{

  1459 	    /*

  1460 	     * Often used by OCR software to indicate an

  1461 	     * unrecognizable character.

  1462 	     */

  1463 	    if (pswit[ECHO_SWITCH])

  1464 		g_print("\n%s\n",aline);

  1465 	    if (!pswit[OVERVIEW_SWITCH])

  1466 		g_print("    Line %ld column %ld - Tilde character?\n",

  1467 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1468 	    else

  1469 		cnt_odd++;

  1470 	    eTilde=TRUE;

  1471 	}

  1472 	if (!eCarat && c==CHAR_CARAT)

  1473 	{

  1474 	    if (pswit[ECHO_SWITCH])

  1475 		g_print("\n%s\n",aline);

  1476 	    if (!pswit[OVERVIEW_SWITCH])

  1477 		g_print("    Line %ld column %ld - Carat character?\n",

  1478 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1479 	    else

  1480 		cnt_odd++;

  1481 	    eCarat=TRUE;

  1482 	}

  1483 	if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)

  1484 	{

  1485 	    if (pswit[ECHO_SWITCH])

  1486 		g_print("\n%s\n",aline);

  1487 	    if (!pswit[OVERVIEW_SWITCH])

  1488 		g_print("    Line %ld column %ld - Forward slash?\n",

  1489 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1490 	    else

  1491 		cnt_odd++;

  1492 	    eFSlash=TRUE;

  1493 	}

  1494 	/*

  1495 	 * Report asterisks only in paranoid mode,

  1496 	 * since they're often deliberate.

  1497 	 */

  1498 	if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&

  1499 	  c==CHAR_ASTERISK)

  1500 	{

  1501 	    if (pswit[ECHO_SWITCH])

  1502 		g_print("\n%s\n",aline);

  1503 	    if (!pswit[OVERVIEW_SWITCH])

  1504 		g_print("    Line %ld column %ld - Asterisk?\n",

  1505 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1506 	    else

  1507 		cnt_odd++;

  1508 	    eAst=TRUE;

  1509 	}

  1510     }

  1511 }

  1513 /*

  1514  * check_for_long_line:

  1515  *

  1516  * Check for line too long.

  1517  */

  1518 void check_for_long_line(const char *aline)

  1519 {

  1520     if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)

  1521     {

  1522 	if (pswit[ECHO_SWITCH])

  1523 	    g_print("\n%s\n",aline);

  1524 	if (!pswit[OVERVIEW_SWITCH])

  1525 	    g_print("    Line %ld column %ld - Long line %ld\n",

  1526 	      linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));

  1527 	else

  1528 	    cnt_long++;

  1529     }

  1530 }

  1532 /*

  1533  * check_for_short_line:

  1534  *

  1535  * Check for line too short.

  1536  *

  1537  * This one is a bit trickier to implement: we don't want to

  1538  * flag the last line of a paragraph for being short, so we

  1539  * have to wait until we know that our current line is a

  1540  * "normal" line, then report the _previous_ line if it was too

  1541  * short. We also don't want to report indented lines like

  1542  * chapter heads or formatted quotations. We therefore keep

  1543  * last->len as the length of the last line examined, and

  1544  * last->blen as the length of the last but one, and try to

  1545  * suppress unnecessary warnings by checking that both were of

  1546  * "normal" length. We keep the first character of the last

  1547  * line in last->start, and if it was a space, we assume that

  1548  * the formatting is deliberate. I can't figure out a way to

  1549  * distinguish something like a quoted verse left-aligned or

  1550  * the header or footer of a letter from a paragraph of short

  1551  * lines - maybe if I examined the whole paragraph, and if the

  1552  * para has less than, say, 8 lines and if all lines are short,

  1553  * then just assume it's OK? Need to look at some texts to see

  1554  * how often a formula like this would get the right result.

  1555  */

  1556 void check_for_short_line(const char *aline,const struct line_properties *last)

  1557 {

  1558     if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&

  1559       last->len<SHORTEST_PG_LINE && last->blen>1 &&

  1560       last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)

  1561     {

  1562 	if (pswit[ECHO_SWITCH])

  1563 	    g_print("\n%s\n",prevline);

  1564 	if (!pswit[OVERVIEW_SWITCH])

  1565 	    g_print("    Line %ld column %ld - Short line %ld?\n",

  1566 	      linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));

  1567 	else

  1568 	    cnt_short++;

  1569     }

  1570 }

  1572 /*

  1573  * check_for_starting_punctuation:

  1574  *

  1575  * Look for punctuation other than full ellipses at start of line.

  1576  */

  1577 void check_for_starting_punctuation(const char *aline)

  1578 {

  1579     if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&

  1580       !g_str_has_prefix(aline,". . ."))

  1581     {

  1582 	if (pswit[ECHO_SWITCH])

  1583 	    g_print("\n%s\n",aline);

  1584 	if (!pswit[OVERVIEW_SWITCH])

  1585 	    g_print("    Line %ld column 1 - Begins with punctuation?\n",

  1586 	      linecnt);

  1587 	else

  1588 	    cnt_punct++;

  1589     }

  1590 }

  1592 /*

  1593  * str_emdash:

  1594  *

  1595  * Find the first em-dash, return a pointer to it and set <next> to the

  1596  * character following the dash.

  1597  */

  1598 char *str_emdash(const char *s,const char **next)

  1599 {

  1600     const char *s1,*s2;

  1601     s1=strstr(s,"--");

  1602     s2=strstr(s,"—");

  1603     if (!s1)

  1604     {

  1605 	if (s2)

  1606 	    *next=g_utf8_next_char(s2);

  1607 	return (char *)s2;

  1608     }

  1609     else if (!s2)

  1610     {

  1611 	*next=g_utf8_next_char(g_utf8_next_char(s1));

  1612 	return (char *)s1;

  1613     }

  1614     else if (s1<s2)

  1615     {

  1616 	*next=g_utf8_next_char(g_utf8_next_char(s1));

  1617 	return (char *)s1;

  1618     }

  1619     else

  1620     {

  1621 	*next=g_utf8_next_char(s2);

  1622 	return (char *)s2;

  1623     }

  1624 }

  1626 /*

  1627  * check_for_spaced_emdash:

  1628  *

  1629  * Check for spaced em-dashes.

  1630  *

  1631  * We must check _all_ occurrences of em-dashes on the line

  1632  * hence the loop - even if the first dash is OK

  1633  * there may be another that's wrong later on.

  1634  */

  1635 void check_for_spaced_emdash(const char *aline)

  1636 {

  1637     const char *s,*t,*next;

  1638     for (s=aline;t=str_emdash(s,&next);s=next)

  1639     {

  1640 	if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||

  1641 	  g_utf8_get_char(next)==CHAR_SPACE)

  1642 	{

  1643 	    if (pswit[ECHO_SWITCH])

  1644 		g_print("\n%s\n",aline);

  1645 	    if (!pswit[OVERVIEW_SWITCH])

  1646 		g_print("    Line %ld column %ld - Spaced em-dash?\n",

  1647 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);

  1648 	    else

  1649 		cnt_dash++;

  1650 	}

  1651     }

  1652 }

  1654 /*

  1655  * check_for_spaced_dash:

  1656  *

  1657  * Check for spaced dashes.

  1658  */

  1659 void check_for_spaced_dash(const char *aline)

  1660 {

  1661     const char *s;

  1662     if ((s=strstr(aline," -")))

  1663     {

  1664 	if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')

  1665 	{

  1666 	    if (pswit[ECHO_SWITCH])

  1667 		g_print("\n%s\n",aline);

  1668 	    if (!pswit[OVERVIEW_SWITCH])

  1669 		g_print("    Line %ld column %ld - Spaced dash?\n",

  1670 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1671 	    else

  1672 		cnt_dash++;

  1673 	}

  1674     }

  1675     else if ((s=strstr(aline,"- ")))

  1676     {

  1677 	if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')

  1678 	{

  1679 	    if (pswit[ECHO_SWITCH])

  1680 		g_print("\n%s\n",aline);

  1681 	    if (!pswit[OVERVIEW_SWITCH])

  1682 		g_print("    Line %ld column %ld - Spaced dash?\n",

  1683 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1684 	    else

  1685 		cnt_dash++;

  1686 	}

  1687     }

  1688 }

  1690 /*

  1691  * check_for_unmarked_paragraphs:

  1692  *

  1693  * Check for unmarked paragraphs indicated by separate speakers.

  1694  *

  1695  * May well be false positive:

  1696  * "Bravo!" "Wonderful!" called the crowd.

  1697  * but useful all the same.

  1698  */

  1699 void check_for_unmarked_paragraphs(const char *aline)

  1700 {

  1701     const char *s;

  1702     s=strstr(aline,"\"  \"");

  1703     if (!s)

  1704 	s=strstr(aline,"\" \"");

  1705     if (s)

  1706     {

  1707 	if (pswit[ECHO_SWITCH])

  1708 	    g_print("\n%s\n",aline);

  1709 	if (!pswit[OVERVIEW_SWITCH])

  1710 	    g_print("    Line %ld column %ld - "

  1711 	      "Query missing paragraph break?\n",

  1712 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1713 	else

  1714 	    cnt_punct++;

  1715     }

  1716 }

  1718 /*

  1719  * check_for_jeebies:

  1720  *

  1721  * Check for "to he" and other easy h/b errors.

  1722  *

  1723  * This is a very inadequate effort on the h/b problem,

  1724  * but the phrase "to he" is always an error, whereas "to

  1725  * be" is quite common.

  1726  * Similarly, '"Quiet!", be said.' is a non-be error

  1727  * "to he" is _not_ always an error!:

  1728  *       "Where they went to he couldn't say."

  1729  * Another false positive:

  1730  *       What would "Cinderella" be without the . . .

  1731  * and another: "If he wants to he can see for himself."

  1732  */

  1733 void check_for_jeebies(const char *aline)

  1734 {

  1735     const char *s;

  1736     s=strstr(aline," be could ");

  1737     if (!s)

  1738 	s=strstr(aline," be would ");

  1739     if (!s)

  1740 	s=strstr(aline," was be ");

  1741     if (!s)

  1742 	s=strstr(aline," be is ");

  1743     if (!s)

  1744 	s=strstr(aline," is be ");

  1745     if (!s)

  1746 	s=strstr(aline,"\", be ");

  1747     if (!s)

  1748 	s=strstr(aline,"\" be ");

  1749     if (!s)

  1750 	s=strstr(aline,"\" be ");

  1751     if (!s)

  1752 	s=strstr(aline," to he ");

  1753     if (s)

  1754     {

  1755 	if (pswit[ECHO_SWITCH])

  1756 	    g_print("\n%s\n",aline);

  1757 	if (!pswit[OVERVIEW_SWITCH])

  1758 	    g_print("    Line %ld column %ld - Query he/be error?\n",

  1759 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1760 	else

  1761 	    cnt_word++;

  1762     }

  1763     s=strstr(aline," the had ");

  1764     if (!s)

  1765 	s=strstr(aline," a had ");

  1766     if (!s)

  1767 	s=strstr(aline," they bad ");

  1768     if (!s)

  1769 	s=strstr(aline," she bad ");

  1770     if (!s)

  1771 	s=strstr(aline," he bad ");

  1772     if (!s)

  1773 	s=strstr(aline," you bad ");

  1774     if (!s)

  1775 	s=strstr(aline," i bad ");

  1776     if (s)

  1777     {

  1778 	if (pswit[ECHO_SWITCH])

  1779 	    g_print("\n%s\n",aline);

  1780 	if (!pswit[OVERVIEW_SWITCH])

  1781 	    g_print("    Line %ld column %ld - Query had/bad error?\n",

  1782 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1783 	else

  1784 	    cnt_word++;

  1785     }

  1786     s=strstr(aline,"; hut ");

  1787     if (!s)

  1788 	s=strstr(aline,", hut ");

  1789     if (s)

  1790     {

  1791 	if (pswit[ECHO_SWITCH])

  1792 	    g_print("\n%s\n",aline);

  1793 	if (!pswit[OVERVIEW_SWITCH])

  1794 	    g_print("    Line %ld column %ld - Query hut/but error?\n",

  1795 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1796 	else

  1797 	    cnt_word++;

  1798     }

  1799 }

  1801 /*

  1802  * check_for_mta_from:

  1803  *

  1804  * Special case - angled bracket in front of "From" placed there by an

  1805  * MTA when sending an e-mail.

  1806  */

  1807 void check_for_mta_from(const char *aline)

  1808 {

  1809     const char *s;

  1810     s=strstr(aline,">From");

  1811     if (s)

  1812     {

  1813 	if (pswit[ECHO_SWITCH])

  1814 	    g_print("\n%s\n",aline);

  1815 	if (!pswit[OVERVIEW_SWITCH])

  1816 	    g_print("    Line %ld column %ld - "

  1817 	      "Query angled bracket with From\n",

  1818 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1819 	else

  1820 	    cnt_punct++;

  1821     }

  1822 }

  1824 /*

  1825  * check_for_orphan_character:

  1826  *

  1827  * Check for a single character line -

  1828  * often an overflow from bad wrapping.

  1829  */

  1830 void check_for_orphan_character(const char *aline)

  1831 {

  1832     gunichar c;

  1833     c=g_utf8_get_char(aline);

  1834     if (c && !*g_utf8_next_char(aline))

  1835     {

  1836 	if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))

  1837 	    ; /* Nothing - ignore numerals alone on a line. */

  1838 	else

  1839 	{

  1840 	    if (pswit[ECHO_SWITCH])

  1841 		g_print("\n%s\n",aline);

  1842 	    if (!pswit[OVERVIEW_SWITCH])

  1843 		g_print("    Line %ld column 1 - Query single character line\n",

  1844 		  linecnt);

  1845 	    else

  1846 		cnt_punct++;

  1847 	}

  1848     }

  1849 }

  1851 /*

  1852  * check_for_pling_scanno:

  1853  *

  1854  * Check for I" - often should be !

  1855  */

  1856 void check_for_pling_scanno(const char *aline)

  1857 {

  1858     const char *s;

  1859     s=strstr(aline," I\"");

  1860     if (s)

  1861     {

  1862 	if (pswit[ECHO_SWITCH])

  1863 	    g_print("\n%s\n",aline);

  1864 	if (!pswit[OVERVIEW_SWITCH])

  1865 	    g_print("    Line %ld column %ld - Query I=exclamation mark?\n",

  1866 	      linecnt,g_utf8_pointer_to_offset(aline,s));

  1867 	else

  1868 	    cnt_punct++;

  1869     }

  1870 }

  1872 /*

  1873  * check_for_extra_period:

  1874  *

  1875  * Check for period without a capital letter. Cut-down from gutspell.

  1876  * Only works when it happens on a single line.

  1877  */

  1878 void check_for_extra_period(const char *aline,const struct warnings *warnings)

  1879 {

  1880     const char *s,*t,*s1,*sprev;

  1881     int i;

  1882     gsize len;

  1883     gboolean istypo;

  1884     gchar *testword;

  1885     gunichar c,nc,pc,*decomposition;

  1886     if (pswit[PARANOID_SWITCH])

  1887     {

  1888 	for (t=aline;t=strstr(t,". ");)

  1889 	{

  1890 	    if (t==aline)

  1891 	    {

  1892 		t=g_utf8_next_char(t);

  1893 		/* start of line punctuation is handled elsewhere */

  1894 		continue;

  1895 	    }

  1896 	    if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))

  1897 	    {

  1898 		t=g_utf8_next_char(t);

  1899 		continue;

  1900 	    }

  1901 	    if (warnings->isDutch)

  1902 	    {

  1903 		/* For Frank & Jeroen -- 's Middags case */

  1904 		gunichar c2,c3,c4,c5;

  1905 		c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));

  1906 		c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));

  1907 		c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));

  1908 		c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));

  1909 		if (CHAR_IS_APOSTROPHE(c2) &&

  1910 		  g_unichar_islower(c3) && c4==CHAR_SPACE &&

  1911 		  g_unichar_isupper(c5))

  1912 		{

  1913 		    t=g_utf8_next_char(t);

  1914 		    continue;

  1915 		}

  1916 	    }

  1917 	    s1=g_utf8_next_char(g_utf8_next_char(t));

  1918 	    while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&

  1919 	      !g_unichar_isdigit(g_utf8_get_char(s1)))

  1920 		s1=g_utf8_next_char(s1);

  1921 	    if (g_unichar_islower(g_utf8_get_char(s1)))

  1922 	    {

  1923 		/* we have something to investigate */

  1924 		istypo=TRUE;

  1925 		/* so let's go back and find out */

  1926 		nc=g_utf8_get_char(t);

  1927 		s1=g_utf8_prev_char(t);

  1928 		c=g_utf8_get_char(s1);

  1929 		sprev=g_utf8_prev_char(s1);

  1930 		pc=g_utf8_get_char(sprev);

  1931 		while (s1>=aline &&

  1932 		  (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||

  1933 		  g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&

  1934 		  g_unichar_isalpha(nc)))

  1935 		{

  1936 		    nc=c;

  1937 		    s1=sprev;

  1938 		    c=pc;

  1939 		    sprev=g_utf8_prev_char(s1);

  1940 		    pc=g_utf8_get_char(sprev);

  1941 		}

  1942 		s1=g_utf8_next_char(s1);

  1943 		s=strchr(s1,'.');

  1944 		if (s)

  1945 		    testword=g_strndup(s1,s-s1);

  1946 		else

  1947 		    testword=g_strdup(s1);

  1948 		for (i=0;*abbrev[i];i++)

  1949 		    if (!strcmp(testword,abbrev[i]))

  1950 			istypo=FALSE;

  1951 		if (g_unichar_isdigit(g_utf8_get_char(testword)))

  1952 		    istypo=FALSE;

  1953 		if (!*g_utf8_next_char(testword))

  1954 		    istypo=FALSE;

  1955 		if (isroman(testword))

  1956 		    istypo=FALSE;

  1957 		if (istypo)

  1958 		{

  1959 		    istypo=FALSE;

  1960 		    for (s=testword;*s;s=g_utf8_next_char(s))

  1961 		    {

  1962 			decomposition=g_unicode_canonical_decomposition(

  1963 			  g_utf8_get_char(s),&len);

  1964 			if (g_utf8_strchr("aeiou",-1,decomposition[0]))

  1965 			    istypo=TRUE;

  1966 			g_free(decomposition);

  1967 		    }

  1968 		}

  1969 		if (istypo &&

  1970 		  (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))

  1971 		{

  1972 		    g_tree_insert(qperiod,g_strdup(testword),

  1973 		      GINT_TO_POINTER(1));

  1974 		    if (pswit[ECHO_SWITCH])

  1975 			g_print("\n%s\n",aline);

  1976 		    if (!pswit[OVERVIEW_SWITCH])

  1977 			g_print("    Line %ld column %ld - Extra period?\n",

  1978 			  linecnt,g_utf8_pointer_to_offset(aline,t)+1);

  1979 		    else

  1980 			cnt_punct++;

  1981 		}

  1982 		g_free(testword);

  1983 	    }

  1984 	    t=g_utf8_next_char(t);

  1985 	}

  1986     }

  1987 }

  1989 /*

  1990  * check_for_following_punctuation:

  1991  *

  1992  * Check for words usually not followed by punctuation.

  1993  */

  1994 void check_for_following_punctuation(const char *aline)

  1995 {

  1996     int i;

  1997     const char *s,*wordstart;

  1998     gunichar c;

  1999     gchar *inword,*t;

  2000     if (pswit[TYPO_SWITCH])

  2001     {

  2002 	for (s=aline;*s;)

  2003 	{

  2004 	    wordstart=s;

  2005 	    t=getaword(&s);

  2006 	    if (!*t)

  2007 	    {

  2008 		g_free(t);

  2009 		continue;

  2010 	    }

  2011 	    inword=g_utf8_strdown(t,-1);

  2012 	    g_free(t);

  2013 	    for (i=0;*nocomma[i];i++)

  2014 		if (!strcmp(inword,nocomma[i]))

  2015 		{

  2016 		    c=g_utf8_get_char(s);

  2017 		    if (c==',' || c==';' || c==':')

  2018 		    {

  2019 			if (pswit[ECHO_SWITCH])

  2020 			    g_print("\n%s\n",aline);

  2021 			if (!pswit[OVERVIEW_SWITCH])

  2022 			    g_print("    Line %ld column %ld - "

  2023 			      "Query punctuation after %s?\n",

  2024 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,

  2025 			      inword);

  2026 			else

  2027 			    cnt_punct++;

  2028 		    }

  2029 		}

  2030 	    for (i=0;*noperiod[i];i++)

  2031 		if (!strcmp(inword,noperiod[i]))

  2032 		{

  2033 		    c=g_utf8_get_char(s);

  2034 		    if (c=='.' || c=='!')

  2035 		    {

  2036 			if (pswit[ECHO_SWITCH])

  2037 			    g_print("\n%s\n",aline);

  2038 			if (!pswit[OVERVIEW_SWITCH])

  2039 			    g_print("    Line %ld column %ld - "

  2040 			      "Query punctuation after %s?\n",

  2041 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,

  2042 			      inword);

  2043 			else

  2044 			    cnt_punct++;

  2045 		    }

  2046 		}

  2047 	    g_free(inword);

  2048 	}

  2049     }

  2050 }

  2052 /*

  2053  * check_for_typos:

  2054  *

  2055  * Check for commonly mistyped words,

  2056  * and digits like 0 for O in a word.

  2057  */

  2058 void check_for_typos(const char *aline,struct warnings *warnings)

  2059 {

  2060     const char *s,*t,*nt,*wordstart;

  2061     gchar *inword;

  2062     gunichar *decomposition;

  2063     gchar *testword;

  2064     int i,vowel,consonant,*dupcnt;

  2065     gboolean isdup,istypo,alower;

  2066     gunichar c,pc;

  2067     long offset,len;

  2068     gsize decomposition_len;

  2069     for (s=aline;*s;)

  2070     {

  2071 	wordstart=s;

  2072 	inword=getaword(&s);

  2073 	if (!*inword)

  2074 	{

  2075 	    g_free(inword);

  2076 	    continue; /* don't bother with empty lines */

  2077 	}

  2078 	if (mixdigit(inword))

  2079 	{

  2080 	    if (pswit[ECHO_SWITCH])

  2081 		g_print("\n%s\n",aline);

  2082 	    if (!pswit[OVERVIEW_SWITCH])

  2083 		g_print("    Line %ld column %ld - Query digit in %s\n",

  2084 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);

  2085 	    else

  2086 		cnt_word++;

  2087 	}

  2088 	/*

  2089 	 * Put the word through a series of tests for likely typos and OCR

  2090 	 * errors.

  2091 	 */

  2092 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])

  2093 	{

  2094 	    istypo=FALSE;

  2095 	    alower=FALSE;

  2096 	    for (t=inword;*t;t=g_utf8_next_char(t))

  2097 	    {

  2098 		c=g_utf8_get_char(t);

  2099 		nt=g_utf8_next_char(t);

  2100 		/* lowercase for testing */

  2101 		if (g_unichar_islower(c))

  2102 		    alower=TRUE;

  2103 		if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))

  2104 		{

  2105 		    /*

  2106 		     * We have an uppercase mid-word. However, there are

  2107 		     * common cases:

  2108 		     *   Mac and Mc like McGill

  2109 		     *   French contractions like l'Abbe

  2110 		     */

  2111 		    offset=g_utf8_pointer_to_offset(inword,t);

  2112 		    if (offset>0)

  2113 			pc=g_utf8_get_char(g_utf8_prev_char(t));

  2114 		    else

  2115 			pc='\0';

  2116 		    if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||

  2117 		      offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&

  2118 		      g_utf8_get_char(g_utf8_next_char(nt))=='c' ||

  2119 		      CHAR_IS_APOSTROPHE(pc))

  2120 			; /* do nothing! */

  2121 		    else

  2122 			istypo=TRUE;

  2123 		}

  2124 	    }

  2125 	    testword=g_utf8_casefold(inword,-1);

  2126 	}

  2127 	if (pswit[TYPO_SWITCH])

  2128 	{

  2129 	    /*

  2130 	     * Check for certain unlikely two-letter combinations at word

  2131 	     * start and end.

  2132 	     */

  2133 	    len=g_utf8_strlen(testword,-1);

  2134 	    if (len>1)

  2135 	    {

  2136 		for (i=0;*nostart[i];i++)

  2137 		    if (g_str_has_prefix(testword,nostart[i]))

  2138 			istypo=TRUE;

  2139 		for (i=0;*noend[i];i++)

  2140 		    if (g_str_has_suffix(testword,noend[i]))

  2141 			istypo=TRUE;

  2142 	    }

  2143 	    /* ght is common, gbt never. Like that. */

  2144 	    if (strstr(testword,"cb"))

  2145 		istypo=TRUE;

  2146 	    if (strstr(testword,"gbt"))

  2147 		istypo=TRUE;

  2148 	    if (strstr(testword,"pbt"))

  2149 		istypo=TRUE;

  2150 	    if (strstr(testword,"tbs"))

  2151 		istypo=TRUE;

  2152 	    if (strstr(testword,"mrn"))

  2153 		istypo=TRUE;

  2154 	    if (strstr(testword,"ahle"))

  2155 		istypo=TRUE;

  2156 	    if (strstr(testword,"ihle"))

  2157 		istypo=TRUE;

  2158 	    /*

  2159 	     * "TBE" does happen - like HEARTBEAT - but uncommon.

  2160 	     * Also "TBI" - frostbite, outbid - but uncommon.

  2161 	     * Similarly "ii" like Hawaii, or Pompeii, and in Roman

  2162 	     * numerals, but "ii" is a common scanno.

  2163 	     */

  2164 	    if (strstr(testword,"tbi"))

  2165 		istypo=TRUE;

  2166 	    if (strstr(testword,"tbe"))

  2167 		istypo=TRUE;

  2168 	    if (strstr(testword,"ii"))

  2169 		istypo=TRUE;

  2170 	    /*

  2171 	     * Check for no vowels or no consonants.

  2172 	     * If none, flag a typo.

  2173 	     */

  2174 	    if (!istypo && len>1)

  2175 	    {

  2176 		vowel=consonant=0;

  2177 		for (t=testword;*t;t=g_utf8_next_char(t))

  2178 		{

  2179 		    c=g_utf8_get_char(t);

  2180 		    decomposition=

  2181 		      g_unicode_canonical_decomposition(c,&decomposition_len);

  2182 		    if (c=='y' || g_unichar_isdigit(c))

  2183 		    {

  2184 			/* Yah, this is loose. */

  2185 			vowel++;

  2186 			consonant++;

  2187 		    }

  2188 		    else if (g_utf8_strchr("aeiou",-1,decomposition[0]))

  2189 			vowel++;

  2190 		    else

  2191 			consonant++;

  2192 		    g_free(decomposition);

  2193 		}

  2194 		if (!vowel || !consonant)

  2195 		    istypo=TRUE;

  2196 	    }

  2197 	    /*

  2198 	     * Now exclude the word from being reported if it's in

  2199 	     * the okword list.

  2200 	     */

  2201 	    for (i=0;*okword[i];i++)

  2202 		if (!strcmp(testword,okword[i]))

  2203 		    istypo=FALSE;

  2204 	    /*

  2205 	     * What looks like a typo may be a Roman numeral.

  2206 	     * Exclude these.

  2207 	     */

  2208 	    if (istypo && isroman(testword))

  2209 		istypo=FALSE;

  2210 	    /* Check the manual list of typos. */

  2211 	    if (!istypo)

  2212 		for (i=0;*typo[i];i++)

  2213 		    if (!strcmp(testword,typo[i]))

  2214 			istypo=TRUE;

  2215 	    /*

  2216 	     * Check lowercase s, l, i and m - special cases.

  2217 	     *   "j" - often a semi-colon gone wrong.

  2218 	     *   "d" for a missing apostrophe - he d

  2219 	     *   "n" for "in"

  2220 	     */

  2221 	    if (!istypo && len==1 &&

  2222 	      g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))

  2223 		istypo=TRUE;

  2224 	    if (istypo)

  2225 	    {

  2226 		dupcnt=g_tree_lookup(qword,testword);

  2227 		if (dupcnt)

  2228 		{

  2229 		    (*dupcnt)++;

  2230 		    isdup=!pswit[VERBOSE_SWITCH];

  2231 		}

  2232 		else

  2233 		{

  2234 		    dupcnt=g_new0(int,1);

  2235 		    g_tree_insert(qword,g_strdup(testword),dupcnt);

  2236 		    isdup=FALSE;

  2237 		}

  2238 		if (!isdup)

  2239 		{

  2240 		    if (pswit[ECHO_SWITCH])

  2241 			g_print("\n%s\n",aline);

  2242 		    if (!pswit[OVERVIEW_SWITCH])

  2243 		    {

  2244 			g_print("    Line %ld column %ld - Query word %s",

  2245 			  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,

  2246 			  inword);

  2247 			if (!pswit[VERBOSE_SWITCH])

  2248 			    g_print(" - not reporting duplicates");

  2249 			g_print("\n");

  2250 		    }

  2251 		    else

  2252 			cnt_word++;

  2253 		}

  2254 	    }

  2255 	}

  2256 	/* check the user's list of typos */

  2257 	if (!istypo && usertypo && g_tree_lookup(usertypo,testword))

  2258 	{

  2259 	    if (pswit[ECHO_SWITCH])

  2260 		g_print("\n%s\n",aline);

  2261 	    if (!pswit[OVERVIEW_SWITCH])

  2262 		g_print("    Line %ld column %ld - Query possible scanno %s\n",

  2263 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);

  2264 	}

  2265 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])

  2266 	    g_free(testword);

  2267 	if (pswit[PARANOID_SWITCH] && warnings->digit)

  2268 	{

  2269 	    /* In paranoid mode, query all 0 and 1 standing alone. */

  2270 	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))

  2271 	    {

  2272 		if (pswit[ECHO_SWITCH])

  2273 		    g_print("\n%s\n",aline);

  2274 		if (!pswit[OVERVIEW_SWITCH])

  2275 		    g_print("    Line %ld column %ld - Query standalone %s\n",

  2276 		      linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,

  2277 		      inword);

  2278 		else

  2279 		    cnt_word++;

  2280 	    }

  2281 	}

  2282 	g_free(inword);

  2283     }

  2284 }

  2286 /*

  2287  * check_for_misspaced_punctuation:

  2288  *

  2289  * Look for added or missing spaces around punctuation and quotes.

  2290  * If there is a punctuation character like ! with no space on

  2291  * either side, suspect a missing!space. If there are spaces on

  2292  * both sides , assume a typo. If we see a double quote with no

  2293  * space or punctuation on either side of it, assume unspaced

  2294  * quotes "like"this.

  2295  */

  2296 void check_for_misspaced_punctuation(const char *aline,

  2297   struct parities *parities,gboolean isemptyline)

  2298 {

  2299     gboolean isacro,isellipsis;

  2300     const char *s;

  2301     gunichar c,nc,pc,n2c;

  2302     int parity;

  2303     c=g_utf8_get_char(aline);

  2304     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  2305     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  2306     {

  2307 	pc=c;

  2308 	c=nc;

  2309 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2310 	/* For each character in the line after the first. */

  2311 	if (g_utf8_strchr(".?!,;:_",-1,c))  /* if it's punctuation */

  2312 	{

  2313 	    /* we need to suppress warnings for acronyms like M.D. */

  2314 	    isacro=FALSE;

  2315 	    /* we need to suppress warnings for ellipsis . . . */

  2316 	    isellipsis=FALSE;

  2317 	    /*

  2318 	     * If there are letters on both sides of it or

  2319 	     * if it's strict punctuation followed by an alpha.

  2320 	     */

  2321 	    if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||

  2322 	      g_utf8_strchr("?!,;:",-1,c)))

  2323 	    {

  2324 		if (c=='.')

  2325 		{

  2326 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&

  2327 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')

  2328 			isacro=TRUE;

  2329 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));

  2330 		    if (nc && n2c=='.')

  2331 			isacro=TRUE;

  2332 		}

  2333 		if (!isacro)

  2334 		{

  2335 		    if (pswit[ECHO_SWITCH])

  2336 			g_print("\n%s\n",aline);

  2337 		    if (!pswit[OVERVIEW_SWITCH])

  2338 			g_print("    Line %ld column %ld - Missing space?\n",

  2339 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2340 		    else

  2341 			cnt_punct++;

  2342 		}

  2343 	    }

  2344 	    if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))

  2345 	    {

  2346 		/*

  2347 		 * If there are spaces on both sides,

  2348 		 * or space before and end of line.

  2349 		 */

  2350 		if (c=='.')

  2351 		{

  2352 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&

  2353 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')

  2354 			isellipsis=TRUE;

  2355 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));

  2356 		    if (nc && n2c=='.')

  2357 			isellipsis=TRUE;

  2358 		}

  2359 		if (!isemptyline && !isellipsis)

  2360 		{

  2361 		    if (pswit[ECHO_SWITCH])

  2362 			g_print("\n%s\n",aline);

  2363 		    if (!pswit[OVERVIEW_SWITCH])

  2364 			g_print("    Line %ld column %ld - "

  2365 			  "Spaced punctuation?\n",linecnt,

  2366 			  g_utf8_pointer_to_offset(aline,s)+1);

  2367 		    else

  2368 			cnt_punct++;

  2369 		}

  2370 	    }

  2371 	}

  2372     }

  2373     /* Split out the characters that CANNOT be preceded by space. */

  2374     c=g_utf8_get_char(aline);

  2375     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  2376     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  2377     {

  2378 	pc=c;

  2379 	c=nc;

  2380 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2381 	/* for each character in the line after the first */

  2382 	if (g_utf8_strchr("?!,;:",-1,c))

  2383 	{

  2384 	    /* if it's punctuation that _cannot_ have a space before it */

  2385 	    if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)

  2386 	    {

  2387 		/*

  2388 		 * If nc DOES == space,

  2389 		 * it was already reported just above.

  2390 		 */

  2391 		if (pswit[ECHO_SWITCH])

  2392 		    g_print("\n%s\n",aline);

  2393 		if (!pswit[OVERVIEW_SWITCH])

  2394 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",

  2395 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2396 		else

  2397 		    cnt_punct++;

  2398 	    }

  2399 	}

  2400     }

  2401     /*

  2402      * Special case " .X" where X is any alpha.

  2403      * This plugs a hole in the acronym code above.

  2404      * Inelegant, but maintainable.

  2405      */

  2406     c=g_utf8_get_char(aline);

  2407     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  2408     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  2409     {

  2410 	pc=c;

  2411 	c=nc;

  2412 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2413 	/* for each character in the line after the first */

  2414 	if (c=='.')

  2415 	{

  2416 	    /* if it's a period */

  2417 	    if (pc==CHAR_SPACE && g_unichar_isalpha(nc))

  2418 	    {

  2419 		/*

  2420 		 * If the period follows a space and

  2421 		 * is followed by a letter.

  2422 		 */

  2423 		if (pswit[ECHO_SWITCH])

  2424 		    g_print("\n%s\n",aline);

  2425 		if (!pswit[OVERVIEW_SWITCH])

  2426 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",

  2427 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2428 		else

  2429 		    cnt_punct++;

  2430 	    }

  2431 	}

  2432     }

  2433     c=g_utf8_get_char(aline);

  2434     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  2435     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  2436     {

  2437 	pc=c;

  2438 	c=nc;

  2439 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2440 	/* for each character in the line after the first */

  2441 	if (CHAR_IS_DQUOTE(c))

  2442 	{

  2443 	    if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&

  2444 	      !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||

  2445 	      !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))

  2446 	    {

  2447 		if (pswit[ECHO_SWITCH])

  2448 		    g_print("\n%s\n",aline);

  2449 		if (!pswit[OVERVIEW_SWITCH])

  2450 		    g_print("    Line %ld column %ld - Unspaced quotes?\n",

  2451 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2452 		else

  2453 		    cnt_punct++;

  2454 	    }

  2455 	}

  2456     }

  2457     /* Check parity of quotes. */

  2458     nc=g_utf8_get_char(aline);

  2459     for (s=aline;*s;s=g_utf8_next_char(s))

  2460     {

  2461 	c=nc;

  2462 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2463 	if (CHAR_IS_DQUOTE(c))

  2464 	{

  2465 	    if (c==CHAR_DQUOTE)

  2466 	    {

  2467 		parities->dquote=!parities->dquote;

  2468 		parity=parities->dquote;

  2469 	    }

  2470 	    else if (c==CHAR_LD_QUOTE)

  2471 		parity=1;

  2472 	    else

  2473 		parity=0;

  2474 	    if (!parity)

  2475 	    {

  2476 		/* parity even */

  2477 		if (!g_utf8_strchr("_-.'`‘’/,;:!?)]} ",-1,nc))

  2478 		{

  2479 		    if (pswit[ECHO_SWITCH])

  2480 			g_print("\n%s\n",aline);

  2481 		    if (!pswit[OVERVIEW_SWITCH])

  2482 			g_print("    Line %ld column %ld - "

  2483 			  "Wrongspaced quotes?\n",

  2484 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2485 		    else

  2486 			cnt_punct++;

  2487 		}

  2488 	    }

  2489 	    else

  2490 	    {

  2491 		/* parity odd */

  2492 		if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&

  2493 		  !g_utf8_strchr("_-/.'`‘’([{$",-1,nc) || !nc)

  2494 		{

  2495 		    if (pswit[ECHO_SWITCH])

  2496 			g_print("\n%s\n",aline);

  2497 		    if (!pswit[OVERVIEW_SWITCH])

  2498 			g_print("    Line %ld column %ld - "

  2499 			  "Wrongspaced quotes?\n",

  2500 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2501 		    else

  2502 			cnt_punct++;

  2503 		}

  2504 	    }

  2505 	}

  2506     }

  2507     c=g_utf8_get_char(aline);

  2508     if (CHAR_IS_DQUOTE(c))

  2509     {

  2510 	if (g_utf8_strchr(",;:!?)]} ",-1,

  2511 	  g_utf8_get_char(g_utf8_next_char(aline))))

  2512 	{

  2513 	    if (pswit[ECHO_SWITCH])

  2514 		g_print("\n%s\n",aline);

  2515 	    if (!pswit[OVERVIEW_SWITCH])

  2516 		g_print("    Line %ld column 1 - Wrongspaced quotes?\n",

  2517 		  linecnt);

  2518 	    else

  2519 		cnt_punct++;

  2520 	}

  2521     }

  2522     if (pswit[SQUOTE_SWITCH])

  2523     {

  2524 	nc=g_utf8_get_char(aline);

  2525 	for (s=aline;*s;s=g_utf8_next_char(s))

  2526 	{

  2527 	    c=nc;

  2528 	    nc=g_utf8_get_char(g_utf8_next_char(s));

  2529 	    if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&

  2530 	      !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||

  2531 	      !g_unichar_isalpha(nc)))

  2532 	    {

  2533 		parities->squote=!parities->squote;

  2534 		if (!parities->squote)

  2535 		{

  2536 		    /* parity even */

  2537 		    if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))

  2538 		    {

  2539 			if (pswit[ECHO_SWITCH])

  2540 			    g_print("\n%s\n",aline);

  2541 			if (!pswit[OVERVIEW_SWITCH])

  2542 			    g_print("    Line %ld column %ld - "

  2543 			      "Wrongspaced singlequotes?\n",

  2544 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2545 			else

  2546 			    cnt_punct++;

  2547 		    }

  2548 		}

  2549 		else

  2550 		{

  2551 		    /* parity odd */

  2552 		    if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&

  2553 		      !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)

  2554 		    {

  2555 			if (pswit[ECHO_SWITCH])

  2556 			    g_print("\n%s\n",aline);

  2557 			if (!pswit[OVERVIEW_SWITCH])

  2558 			    g_print("    Line %ld column %ld - "

  2559 			      "Wrongspaced singlequotes?\n",

  2560 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2561 			else

  2562 			    cnt_punct++;

  2563 		    }

  2564 		}

  2565 	    }

  2566 	}

  2567     }

  2568 }

  2570 /*

  2571  * check_for_double_punctuation:

  2572  *

  2573  * Look for double punctuation like ,. or ,,

  2574  * Thanks to DW for the suggestion!

  2575  * In books with references, ".," and ".;" are common

  2576  * e.g. "etc., etc.," and vol. 1.; vol 3.;

  2577  * OTOH, from my initial tests, there are also fairly

  2578  * common errors. What to do? Make these cases paranoid?

  2579  * ".," is the most common, so warnings->dotcomma is used

  2580  * to suppress detailed reporting if it occurs often.

  2581  */

  2582 void check_for_double_punctuation(const char *aline,struct warnings *warnings)

  2583 {

  2584     const char *s;

  2585     gunichar c,nc;

  2586     nc=g_utf8_get_char(aline);

  2587     for (s=aline;*s;s=g_utf8_next_char(s))

  2588     {

  2589 	c=nc;

  2590 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2591 	/* for each punctuation character in the line */

  2592 	if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&

  2593 	  g_utf8_strchr(".?!,;:",-1,nc))

  2594 	{

  2595 	    /* followed by punctuation, it's a query, unless . . . */

  2596 	    if (c==nc && (c=='.' || c=='?' || c=='!') ||

  2597 	      !warnings->dotcomma && c=='.' && nc==',' ||

  2598 	      warnings->isFrench && g_str_has_prefix(s,",...") ||

  2599 	      warnings->isFrench && g_str_has_prefix(s,"...,") ||

  2600 	      warnings->isFrench && g_str_has_prefix(s,";...") ||

  2601 	      warnings->isFrench && g_str_has_prefix(s,"...;") ||

  2602 	      warnings->isFrench && g_str_has_prefix(s,":...") ||

  2603 	      warnings->isFrench && g_str_has_prefix(s,"...:") ||

  2604 	      warnings->isFrench && g_str_has_prefix(s,"!...") ||

  2605 	      warnings->isFrench && g_str_has_prefix(s,"...!") ||

  2606 	      warnings->isFrench && g_str_has_prefix(s,"?...") ||

  2607 	      warnings->isFrench && g_str_has_prefix(s,"...?"))

  2608 	    {

  2609 		if (warnings->isFrench && g_str_has_prefix(s,",...") ||

  2610 		  warnings->isFrench && g_str_has_prefix(s,"...,") ||

  2611 		  warnings->isFrench && g_str_has_prefix(s,";...") ||

  2612 		  warnings->isFrench && g_str_has_prefix(s,"...;") ||

  2613 		  warnings->isFrench && g_str_has_prefix(s,":...") ||

  2614 		  warnings->isFrench && g_str_has_prefix(s,"...:") ||

  2615 		  warnings->isFrench && g_str_has_prefix(s,"!...") ||

  2616 		  warnings->isFrench && g_str_has_prefix(s,"...!") ||

  2617 		  warnings->isFrench && g_str_has_prefix(s,"?...") ||

  2618 		  warnings->isFrench && g_str_has_prefix(s,"...?"))

  2619 		{

  2620 		    s+=4;

  2621 		    nc=g_utf8_get_char(g_utf8_next_char(s));

  2622 		}

  2623 		; /* do nothing for .. !! and ?? which can be legit */

  2624 	    }

  2625 	    else

  2626 	    {

  2627 		if (pswit[ECHO_SWITCH])

  2628 		    g_print("\n%s\n",aline);

  2629 		if (!pswit[OVERVIEW_SWITCH])

  2630 		    g_print("    Line %ld column %ld - Double punctuation?\n",

  2631 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2632 		else

  2633 		    cnt_punct++;

  2634 	    }

  2635 	}

  2636     }

  2637 }

  2639 /*

  2640  * check_for_spaced_quotes:

  2641  */

  2642 void check_for_spaced_quotes(const char *aline)

  2643 {

  2644     int i;

  2645     const char *s,*t;

  2646     const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,

  2647       CHAR_RS_QUOTE};

  2648     GString *pattern;

  2649     s=aline;

  2650     while ((t=strstr(s," \" ")))

  2651     {

  2652 	if (pswit[ECHO_SWITCH])

  2653 	    g_print("\n%s\n",aline);

  2654 	if (!pswit[OVERVIEW_SWITCH])

  2655 	    g_print("    Line %ld column %ld - Spaced doublequote?\n",

  2656 	      linecnt,g_utf8_pointer_to_offset(aline,t)+1);

  2657 	else

  2658 	    cnt_punct++;

  2659 	s=g_utf8_next_char(g_utf8_next_char(t));

  2660     }

  2661     pattern=g_string_new(NULL);

  2662     for(i=0;i<G_N_ELEMENTS(single_quotes);i++)

  2663     {

  2664 	g_string_assign(pattern," ");

  2665 	g_string_append_unichar(pattern,single_quotes[i]);

  2666 	g_string_append_c(pattern,' ');

  2667 	s=aline;

  2668 	while ((t=strstr(s,pattern->str)))

  2669 	{

  2670 	    if (pswit[ECHO_SWITCH])

  2671 		g_print("\n%s\n",aline);

  2672 	    if (!pswit[OVERVIEW_SWITCH])

  2673 		g_print("    Line %ld column %ld - Spaced singlequote?\n",

  2674 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);

  2675 	    else

  2676 		cnt_punct++;

  2677 	    s=g_utf8_next_char(g_utf8_next_char(t));

  2678 	}

  2679     }

  2680     g_string_free(pattern,TRUE);

  2681 }

  2683 /*

  2684  * check_for_miscased_genative:

  2685  *

  2686  * Check special case of 'S instead of 's at end of word.

  2687  */

  2688 void check_for_miscased_genative(const char *aline)

  2689 {

  2690     const char *s;

  2691     gunichar c,nc,pc;

  2692     if (!*aline)

  2693 	return;

  2694     c=g_utf8_get_char(aline);

  2695     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  2696     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  2697     {

  2698 	pc=c;

  2699 	c=nc;

  2700 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2701 	if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))

  2702 	{

  2703 	    if (pswit[ECHO_SWITCH])

  2704 		g_print("\n%s\n",aline);

  2705 	    if (!pswit[OVERVIEW_SWITCH])

  2706 		g_print("    Line %ld column %ld - Capital \"S\"?\n",

  2707 		  linecnt,g_utf8_pointer_to_offset(aline,s)+2);

  2708 	    else

  2709 		cnt_punct++;

  2710 	}

  2711     }

  2712 }

  2714 /*

  2715  * check_end_of_line:

  2716  *

  2717  * Now check special cases - start and end of line -

  2718  * for single and double quotes. Start is sometimes [sic]

  2719  * but better to query it anyway.

  2720  * While we're here, check for dash at end of line.

  2721  */

  2722 void check_end_of_line(const char *aline,struct warnings *warnings)

  2723 {

  2724     int lbytes;

  2725     const char *s;

  2726     gunichar c1,c2;

  2727     lbytes=strlen(aline);

  2728     if (g_utf8_strlen(aline,lbytes)>1)

  2729     {

  2730 	s=g_utf8_prev_char(aline+lbytes);

  2731 	c1=g_utf8_get_char(s);

  2732 	c2=g_utf8_get_char(g_utf8_prev_char(s));

  2733 	if ((CHAR_IS_DQUOTE(c1) || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)

  2734 	{

  2735 	    if (pswit[ECHO_SWITCH])

  2736 		g_print("\n%s\n",aline);

  2737 	    if (!pswit[OVERVIEW_SWITCH])

  2738 		g_print("    Line %ld column %ld - Spaced quote?\n",linecnt,

  2739 		  g_utf8_strlen(aline,lbytes));

  2740 	    else

  2741 		cnt_punct++;

  2742 	}

  2743 	c1=g_utf8_get_char(aline);

  2744 	c2=g_utf8_get_char(g_utf8_next_char(aline));

  2745 	if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)

  2746 	{

  2747 	    if (pswit[ECHO_SWITCH])

  2748 		g_print("\n%s\n",aline);

  2749 	    if (!pswit[OVERVIEW_SWITCH])

  2750 		g_print("    Line %ld column 1 - Spaced quote?\n",linecnt);

  2751 	    else

  2752 		cnt_punct++;

  2753 	}

  2754 	/*

  2755 	 * Dash at end of line may well be legit - paranoid mode only

  2756 	 * and don't report em-dash at line-end.

  2757 	 */

  2758 	if (pswit[PARANOID_SWITCH] && warnings->hyphen)

  2759 	{

  2760 	    for (s=g_utf8_prev_char(aline+lbytes);

  2761 	      s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))

  2762 		;

  2763 	    if (g_utf8_get_char(s)=='-' &&

  2764 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')

  2765 	    {

  2766 		if (pswit[ECHO_SWITCH])

  2767 		    g_print("\n%s\n",aline);

  2768 		if (!pswit[OVERVIEW_SWITCH])

  2769 		    g_print("    Line %ld column %ld - "

  2770 		      "Hyphen at end of line?\n",

  2771 		      linecnt,g_utf8_pointer_to_offset(aline,s));

  2772 	    }

  2773 	}

  2774     }

  2775 }

  2777 /*

  2778  * check_for_unspaced_bracket:

  2779  *

  2780  * Brackets are often unspaced, but shouldn't be surrounded by alpha.

  2781  * If so, suspect a scanno like "a]most".

  2782  */

  2783 void check_for_unspaced_bracket(const char *aline)

  2784 {

  2785     const char *s;

  2786     gunichar c,nc,pc;

  2787     c=g_utf8_get_char(aline);

  2788     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  2789     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  2790     {

  2791 	pc=c;

  2792 	c=nc;

  2793 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2794 	if (!nc)

  2795 	    break;

  2796 	/* for each bracket character in the line except 1st & last */

  2797 	if (g_utf8_strchr("{[()]}",-1,c) &&

  2798 	  g_unichar_isalpha(pc) && g_unichar_isalpha(nc))

  2799 	{

  2800 	    if (pswit[ECHO_SWITCH])

  2801 		g_print("\n%s\n",aline);

  2802 	    if (!pswit[OVERVIEW_SWITCH])

  2803 		g_print("    Line %ld column %ld - Unspaced bracket?\n",

  2804 		  linecnt,g_utf8_pointer_to_offset(aline,s));

  2805 	    else

  2806 		cnt_punct++;

  2807 	}

  2808     }

  2809 }

  2811 /*

  2812  * check_for_unpunctuated_endquote:

  2813  */

  2814 void check_for_unpunctuated_endquote(const char *aline)

  2815 {

  2816     const char *s;

  2817     gunichar c,nc,pc;

  2818     QuoteClass qc;

  2819     c=g_utf8_get_char(aline);

  2820     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  2821     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  2822     {

  2823 	pc=c;

  2824 	c=nc;

  2825 	qc=CHAR_IS_DQUOTE(c)?QUOTE_CLASS(c):INVALID_QUOTE;

  2826 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2827 	/* for each character in the line except 1st */

  2828 	if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) && g_unichar_isalpha(pc))

  2829 	{

  2830 	    if (pswit[ECHO_SWITCH])

  2831 		g_print("\n%s\n",aline);

  2832 	    if (!pswit[OVERVIEW_SWITCH])

  2833 		g_print("    Line %ld column %ld - "

  2834 		  "endquote missing punctuation?\n",

  2835 		  linecnt,g_utf8_pointer_to_offset(aline,s));

  2836 	    else

  2837 		cnt_punct++;

  2838 	}

  2839     }

  2840 }

  2842 /*

  2843  * check_for_html_tag:

  2844  *

  2845  * Check for <HTML TAG>.

  2846  *

  2847  * If there is a < in the line, followed at some point

  2848  * by a > then we suspect HTML.

  2849  */

  2850 void check_for_html_tag(const char *aline)

  2851 {

  2852     const char *open,*close;

  2853     gchar *tag;

  2854     open=strchr(aline,'<');

  2855     if (open)

  2856     {

  2857 	close=strchr(g_utf8_next_char(open),'>');

  2858 	if (close)

  2859 	{

  2860 	    if (pswit[ECHO_SWITCH])

  2861 		g_print("\n%s\n",aline);

  2862 	    if (!pswit[OVERVIEW_SWITCH])

  2863 	    {

  2864 		tag=g_strndup(open,close-open+1);

  2865 		g_print("    Line %ld column %ld - HTML Tag? %s \n",

  2866 		  linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);

  2867 		g_free(tag);

  2868 	    }

  2869 	    else

  2870 		cnt_html++;

  2871 	}

  2872     }

  2873 }

  2875 /*

  2876  * check_for_html_entity:

  2877  *

  2878  * Check for &symbol; HTML.

  2879  *

  2880  * If there is a & in the line, followed at

  2881  * some point by a ; then we suspect HTML.

  2882  */

  2883 void check_for_html_entity(const char *aline)

  2884 {

  2885     const char *s,*amp,*scolon;

  2886     gchar *entity;

  2887     amp=strchr(aline,'&');

  2888     if (amp)

  2889     {

  2890 	scolon=strchr(amp,';');

  2891 	if (scolon)

  2892 	{

  2893 	    for (s=amp;s<scolon;s=g_utf8_next_char(s))

  2894 		if (g_utf8_get_char(s)==CHAR_SPACE)

  2895 		    break;		/* Don't report "Jones & Son;" */

  2896 	    if (s>=scolon)

  2897 	    {

  2898 		if (pswit[ECHO_SWITCH])

  2899 		    g_print("\n%s\n",aline);

  2900 		if (!pswit[OVERVIEW_SWITCH])

  2901 		{

  2902 		    entity=g_strndup(amp,scolon-amp+1);

  2903 		    g_print("    Line %ld column %d - HTML symbol? %s \n",

  2904 		      linecnt,(int)(amp-aline)+1,entity);

  2905 		    g_free(entity);

  2906 		}

  2907 		else

  2908 		    cnt_html++;

  2909 	    }

  2910 	}

  2911     }

  2912 }

  2914 /*

  2915  * check_for_omitted_punctuation:

  2916  *

  2917  * Check for omitted punctuation at end of paragraph by working back

  2918  * through prevline. DW.

  2919  * Need to check this only for "normal" paras.

  2920  * So what is a "normal" para?

  2921  *    Not normal if one-liner (chapter headings, etc.)

  2922  *    Not normal if doesn't contain at least one locase letter

  2923  *    Not normal if starts with space

  2924  */

  2925 void check_for_omitted_punctuation(const char *prevline,

  2926   struct line_properties *last,int start_para_line)

  2927 {

  2928     gboolean letter_on_line=FALSE;

  2929     const char *s;

  2930     gunichar c;

  2931     gboolean closing_quote;

  2932     for (s=prevline;*s;s=g_utf8_next_char(s))

  2933 	if (g_unichar_isalpha(g_utf8_get_char(s)))

  2934 	{

  2935 	    letter_on_line=TRUE;

  2936 	    break;

  2937 	}

  2938     /*

  2939      * This next "if" is a problem.

  2940      * If we say "start_para_line <= linecnt - 1", that includes

  2941      * one-line "paragraphs" like chapter heads. Lotsa false positives.

  2942      * If we say "start_para_line < linecnt - 1" it doesn't, but then it

  2943      * misses genuine one-line paragraphs.

  2944      */

  2945     if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&

  2946       g_utf8_get_char(prevline)>CHAR_SPACE)

  2947     {

  2948 	s=prevline+strlen(prevline);

  2949 	do

  2950 	{

  2951 	    s=g_utf8_prev_char(s);

  2952 	    c=g_utf8_get_char(s);

  2953 	    if (QUOTE_CLASS(c)==CLOSING_QUOTE || QUOTE_CLASS(c)==NEUTRAL_QUOTE)

  2954 		closing_quote=TRUE;

  2955 	    else

  2956 		closing_quote=FALSE;

  2957 	} while (closing_quote && s>prevline);

  2958 	for (;s>prevline;s=g_utf8_prev_char(s))

  2959 	{

  2960 	    if (g_unichar_isalpha(g_utf8_get_char(s)))

  2961 	    {

  2962 		if (pswit[ECHO_SWITCH])

  2963 		    g_print("\n%s\n",prevline);

  2964 		if (!pswit[OVERVIEW_SWITCH])

  2965 		    g_print("    Line %ld column %ld - "

  2966 		      "No punctuation at para end?\n",

  2967 		      linecnt-1,g_utf8_strlen(prevline,-1));

  2968 		else

  2969 		    cnt_punct++;

  2970 		break;

  2971 	    }

  2972 	    if (g_utf8_strchr("—-.:!([{?}])",-1,g_utf8_get_char(s)))

  2973 		break;

  2974 	}

  2975     }

  2976 }

  2978 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)

  2979 {

  2980     const char *word=key;

  2981     int *dupcnt=value;

  2982     if (*dupcnt)

  2983 	g_print("\nNote: Queried word %s was duplicated %d times\n",

  2984 	  word,*dupcnt);

  2985     return FALSE;

  2986 }

  2988 void print_as_windows_1252(const char *string)

  2989 {

  2990     gsize inbytes,outbytes;

  2991     gchar *buf,*bp;

  2992     static GIConv converter=(GIConv)-1;

  2993     if (!string)

  2994     {

  2995 	if (converter!=(GIConv)-1)

  2996 	    g_iconv_close(converter);

  2997 	converter=(GIConv)-1;

  2998 	return;

  2999     }

  3000     if (converter==(GIConv)-1)

  3001 	converter=g_iconv_open("WINDOWS-1252","UTF-8");

  3002     if (converter!=(GIConv)-1)

  3003     {

  3004 	inbytes=outbytes=strlen(string);

  3005 	bp=buf=g_malloc(outbytes+1);

  3006 	g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);

  3007 	*bp='\0';

  3008 	fputs(buf,stdout);

  3009 	g_free(buf);

  3010     }

  3011     else

  3012 	fputs(string,stdout);

  3013 }

  /*
   * print_as_utf_8:
   *
   * Write string to stdout exactly as given; no conversion is done
   * (the text is taken to be UTF-8 already).
   */
void print_as_utf_8(const char *string)
{
    fprintf(stdout,"%s",string);
}

  3020 /*

  3021  * procfile:

  3022  *

  3023  * Process one file.

  3024  */

  3025 void procfile(const char *filename)

  3026 {

  3027     const char *s;

  3028     gchar *parastart=NULL;	/* first line of current para */

  3029     gchar *etext,*aline;

  3030     gchar *etext_ptr;

  3031     GError *err=NULL;

  3032     struct first_pass_results *first_pass_results;

  3033     struct warnings *warnings;

  3034     struct counters counters={0};

  3035     struct line_properties last={0};

  3036     struct parities parities={0};

  3037     struct pending pending={0};

  3038     gboolean isemptyline;

  3039     long start_para_line=0;

  3040     gboolean isnewpara=FALSE,enddash=FALSE;

  3041     last.start=CHAR_SPACE;

  3042     linecnt=checked_linecnt=0;

  3043     etext=read_etext(filename,&err);

  3044     if (!etext)

  3045     {

  3046 	if (pswit[STDOUT_SWITCH])

  3047 	    fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);

  3048 	else

  3049 	    fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);

  3050 	exit(1);

  3051     }

  3052     g_print("\n\nFile: %s\n\n",filename);

  3053     first_pass_results=first_pass(etext);

  3054     warnings=report_first_pass(first_pass_results);

  3055     qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);

  3056     qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);

  3057     /*

  3058      * Here we go with the main pass. Hold onto yer hat!

  3059      */

  3060     linecnt=0;

  3061     etext_ptr=etext;

  3062     while ((aline=flgets(&etext_ptr,linecnt+1,warnings->newlines)))

  3063     {

  3064 	linecnt++;

  3065 	if (linecnt==1)

  3066 	    isnewpara=TRUE;

  3067 	if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))

  3068 	    continue;    // skip DP page separators completely

  3069 	if (linecnt<first_pass_results->firstline ||

  3070 	  (first_pass_results->footerline>0 &&

  3071 	  linecnt>first_pass_results->footerline))

  3072 	{

  3073 	    if (pswit[HEADER_SWITCH])

  3074 	    {

  3075 		if (g_str_has_prefix(aline,"Title:"))

  3076 		    g_print("    %s\n",aline);

  3077 		if (g_str_has_prefix(aline,"Author:"))

  3078 		    g_print("    %s\n",aline);

  3079 		if (g_str_has_prefix(aline,"Release Date:"))

  3080 		    g_print("    %s\n",aline);

  3081 		if (g_str_has_prefix(aline,"Edition:"))

  3082 		    g_print("    %s\n\n",aline);

  3083 	    }

  3084 	    continue;		/* skip through the header */

  3085 	}

  3086 	checked_linecnt++;

  3087 	print_pending(aline,parastart,&pending);

  3088 	isemptyline=analyse_quotes(aline,&counters);

  3089 	if (isnewpara && !isemptyline)

  3090 	{

  3091 	    /* This line is the start of a new paragraph. */

  3092 	    start_para_line=linecnt;

  3093 	    /* Capture its first line in case we want to report it later. */

  3094 	    g_free(parastart);

  3095 	    parastart=g_strdup(aline);

  3096 	    memset(&parities,0,sizeof(parities));  /* restart the quote count */

  3097 	    s=aline;

  3098 	    while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&

  3099 	      !g_unichar_isdigit(g_utf8_get_char(s)))

  3100 		s=g_utf8_next_char(s);

  3101 	    if (g_unichar_islower(g_utf8_get_char(s)))

  3102 	    {

  3103 		/* and its first letter is lowercase */

  3104 		if (pswit[ECHO_SWITCH])

  3105 		    g_print("\n%s\n",aline);

  3106 		if (!pswit[OVERVIEW_SWITCH])

  3107 		    g_print("    Line %ld column %ld - "

  3108 		      "Paragraph starts with lower-case\n",

  3109 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  3110 		else

  3111 		    cnt_punct++;

  3112 	    }

  3113 	    isnewpara=FALSE; /* Signal the end of new para processing. */

  3114 	}

  3115 	/* Check for an em-dash broken at line end. */

  3116 	if (enddash && g_utf8_get_char(aline)=='-')

  3117 	{

  3118 	    if (pswit[ECHO_SWITCH])

  3119 		g_print("\n%s\n",aline);

  3120 	    if (!pswit[OVERVIEW_SWITCH])

  3121 		g_print("    Line %ld column 1 - Broken em-dash?\n",linecnt);

  3122 	    else

  3123 		cnt_punct++;

  3124 	}

  3125 	enddash=FALSE;

  3126 	for (s=g_utf8_prev_char(aline+strlen(aline));

  3127 	  g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))

  3128 	    ;

  3129 	if (s>=aline && g_utf8_get_char(s)=='-')

  3130 	    enddash=TRUE;

  3131 	check_for_control_characters(aline);

  3132 	check_for_odd_characters(aline,warnings,isemptyline);

  3133 	if (warnings->longline)

  3134 	    check_for_long_line(aline);

  3135 	if (warnings->shortline)

  3136 	    check_for_short_line(aline,&last);

  3137 	last.blen=last.len;

  3138 	last.len=g_utf8_strlen(aline,-1);

  3139 	last.start=g_utf8_get_char(aline);

  3140 	check_for_starting_punctuation(aline);

  3141 	if (warnings->dash)

  3142 	{

  3143 	    check_for_spaced_emdash(aline);

  3144 	    check_for_spaced_dash(aline);

  3145 	}

  3146 	check_for_unmarked_paragraphs(aline);

  3147 	check_for_jeebies(aline);

  3148 	check_for_mta_from(aline);

  3149 	check_for_orphan_character(aline);

  3150 	check_for_pling_scanno(aline);

  3151 	check_for_extra_period(aline,warnings);

  3152 	check_for_following_punctuation(aline);

  3153 	check_for_typos(aline,warnings);

  3154 	check_for_misspaced_punctuation(aline,&parities,isemptyline);

  3155 	check_for_double_punctuation(aline,warnings);

  3156 	check_for_spaced_quotes(aline);

  3157 	check_for_miscased_genative(aline);

  3158 	check_end_of_line(aline,warnings);

  3159 	check_for_unspaced_bracket(aline);

  3160 	if (warnings->endquote)

  3161 	    check_for_unpunctuated_endquote(aline);

  3162 	check_for_html_tag(aline);

  3163 	check_for_html_entity(aline);

  3164 	if (isemptyline)

  3165 	{

  3166 	    check_for_mismatched_quotes(&counters,&pending);

  3167 	    counters_reset(&counters);

  3168 	    /* let the next iteration know that it's starting a new para */

  3169 	    isnewpara=TRUE;

  3170 	    if (prevline)

  3171 		check_for_omitted_punctuation(prevline,&last,start_para_line);

  3172 	}

  3173 	g_free(prevline);

  3174 	prevline=g_strdup(aline);

  3175     }

  3176     linecnt++;

  3177     check_for_mismatched_quotes(&counters,&pending);

  3178     print_pending(NULL,parastart,&pending);

  3179     reset_pending(&pending);

  3180     if (prevline)

  3181     {

  3182 	g_free(prevline);

  3183 	prevline=NULL;

  3184     }

  3185     g_free(parastart);

  3186     g_free(prevline);

  3187     g_free(etext);

  3188     if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])

  3189 	g_tree_foreach(qword,report_duplicate_queries,NULL);

  3190     g_tree_unref(qword);

  3191     g_tree_unref(qperiod);

  3192     counters_destroy(&counters);

  3193     g_set_print_handler(NULL);

  3194     print_as_windows_1252(NULL);

  3195     if (pswit[MARKUP_SWITCH])

  3196 	loseentities(NULL);

  3197 }

  3199 /*

  3200  * flgets:

  3201  *

  3202  * Get one line from the input text. The setting of newlines has the following

  3203  * effect:

  3204  *

  3205  * DOS_NEWLINES: Check for the existence of exactly one CR-LF line-end per line.

  3206  *

  3207  * OS9_NEWLINES: Asserts that etext contains no LFs. CR is used as

  3208  *		 the newline character.

  3209  *

  3210  * UNIX_NEWLINES: Check for the presence of CRs.

  3211  *

  3212  * In all cases, check that the last line is correctly terminated.

  3213  *

  3214  * Returns: a pointer to the line.

  3215  */

  3216 char *flgets(char **etext,long lcnt,int newlines)

  3217 {

  3218     gunichar c;

  3219     gboolean isCR=FALSE;

  3220     char *theline=*etext;

  3221     char *eos=theline;

  3222     gchar *s;

  3223     for (;;)

  3224     {

  3225 	c=g_utf8_get_char(*etext);

  3226 	if (!c)

  3227 	{

  3228 	    if (*etext==theline)

  3229 		return NULL;

  3230 	    else if (pswit[LINE_END_SWITCH])

  3231 	    {

  3232 		if (pswit[ECHO_SWITCH])

  3233 		{

  3234 		    s=g_strndup(theline,eos-theline);

  3235 		    g_print("\n%s\n",s);

  3236 		    g_free(s);

  3237 		}

  3238 		if (!pswit[OVERVIEW_SWITCH])

  3239 		{

  3240 		    if (newlines==OS9_NEWLINES)

  3241 			g_print("    Line %ld - No CR?\n",lcnt);

  3242 		    else

  3243 		    {

  3244 			/* There may, or may not, have been a CR */

  3245 			g_print("    Line %ld - No LF?\n",lcnt);

  3246 		    }

  3247 		}

  3248 		else

  3249 		    cnt_lineend++;

  3250 	    }

  3251 	    break;

  3252 	}

  3253 	*etext=g_utf8_next_char(*etext);

  3254 	/* either way, it's end of line */

  3255 	if (c=='\n')

  3256 	{

  3257 	    if (newlines==DOS_NEWLINES && !isCR)

  3258 	    {

  3259 		/* Error - a LF without a preceding CR */

  3260 		if (pswit[LINE_END_SWITCH])

  3261 		{

  3262 		    if (pswit[ECHO_SWITCH])

  3263 		    {

  3264 			s=g_strndup(theline,eos-theline);

  3265 			g_print("\n%s\n",s);

  3266 			g_free(s);

  3267 		    }

  3268 		    if (!pswit[OVERVIEW_SWITCH])

  3269 			g_print("    Line %ld - No CR?\n",lcnt);

  3270 		    else

  3271 			cnt_lineend++;

  3272 		}

  3273 	    }

  3274 	    break;

  3275 	}

  3276 	if (c=='\r')

  3277 	{

  3278 	    if (newlines==OS9_NEWLINES)

  3279 		break;

  3280 	    if (isCR || newlines==UNIX_NEWLINES)

  3281 	    {

  3282 		if (pswit[LINE_END_SWITCH])

  3283 		{

  3284 		    if (pswit[ECHO_SWITCH])

  3285 		    {

  3286 			s=g_strndup(theline,eos-theline);

  3287 			g_print("\n%s\n",s);

  3288 			g_free(s);

  3289 		    }

  3290 		    if (!pswit[OVERVIEW_SWITCH])

  3291 		    {

  3292 			if (newlines==UNIX_NEWLINES)

  3293 			    g_print("    Line %ld column %ld - Embedded CR?\n",

  3294 			      lcnt,g_utf8_pointer_to_offset(theline,eos)+1);

  3295 			else

  3296 			    g_print("    Line %ld - Two successive CRs?\n",

  3297 			      lcnt);

  3298 		    }

  3299 		    else

  3300 			cnt_lineend++;

  3301 		}

  3302 		if (newlines==UNIX_NEWLINES)

  3303 		    *eos=' ';

  3304 	    }

  3305 	    if (newlines==DOS_NEWLINES)

  3306 		isCR=TRUE;

  3307 	}

  3308 	else

  3309 	{

  3310 	    if (pswit[LINE_END_SWITCH] && isCR)

  3311 	    {

  3312 		if (pswit[ECHO_SWITCH])

  3313 		{

  3314 		    s=g_strndup(theline,eos-theline);

  3315 		    g_print("\n%s\n",s);

  3316 		    g_free(s);

  3317 		}

  3318 		if (!pswit[OVERVIEW_SWITCH])

  3319 		    g_print("    Line %ld column %ld - CR without LF?\n",

  3320 		      lcnt,g_utf8_pointer_to_offset(theline,eos)+1);

  3321 		else

  3322 		    cnt_lineend++;

  3323 		*eos=' ';

  3324 	    }

  3325 	    isCR=FALSE;

  3326 	    eos=g_utf8_next_char(eos);

  3327 	}

  3328     }

  3329     *eos='\0';

  3330     if (pswit[MARKUP_SWITCH])

  3331 	postprocess_for_HTML(theline);

  3332     if (pswit[DP_SWITCH])

  3333 	postprocess_for_DP(theline);

  3334     return theline;

  3335 }

  3337 /*

  3338  * mixdigit:

  3339  *

  3340  * Takes a "word" as a parameter, and checks whether it

  3341  * contains a mixture of alpha and digits. Generally, this is an

  3342  * error, but may not be for cases like 4th or L5 12s. 3d.

  3343  *

  3344  * Returns: TRUE iff an is error found.

  3345  */

  3346 gboolean mixdigit(const char *checkword)

  3347 {

  3348     gboolean wehaveadigit,wehavealetter,query;

  3349     const char *s,*nondigit;

  3350     wehaveadigit=wehavealetter=query=FALSE;

  3351     for (s=checkword;*s;s=g_utf8_next_char(s))

  3352 	if (g_unichar_isalpha(g_utf8_get_char(s)))

  3353 	    wehavealetter=TRUE;

  3354 	else if (g_unichar_isdigit(g_utf8_get_char(s)))

  3355 	    wehaveadigit=TRUE;

  3356     if (wehaveadigit && wehavealetter)

  3357     {

  3358 	/* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */

  3359 	query=TRUE;

  3360 	for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));

  3361 	  nondigit=g_utf8_next_char(nondigit))

  3362 	    ;

  3363 	/* digits, ending in st, rd, nd, th of either case */

  3364 	if (!g_ascii_strcasecmp(nondigit,"st") ||

  3365 	  !g_ascii_strcasecmp(nondigit,"rd") ||

  3366 	  !g_ascii_strcasecmp(nondigit,"nd") ||

  3367 	  !g_ascii_strcasecmp(nondigit,"th"))

  3368 	    query=FALSE;

  3369 	if (!g_ascii_strcasecmp(nondigit,"sts") ||

  3370 	  !g_ascii_strcasecmp(nondigit,"rds") ||

  3371 	  !g_ascii_strcasecmp(nondigit,"nds") ||

  3372 	  !g_ascii_strcasecmp(nondigit,"ths"))

  3373 	    query=FALSE;

  3374 	if (!g_ascii_strcasecmp(nondigit,"stly") ||

  3375 	  !g_ascii_strcasecmp(nondigit,"rdly") ||

  3376 	  !g_ascii_strcasecmp(nondigit,"ndly") ||

  3377 	  !g_ascii_strcasecmp(nondigit,"thly"))

  3378 	    query=FALSE;

  3379 	/* digits, ending in l, L, s or d */

  3380 	if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||

  3381 	  !strcmp(nondigit,"d"))

  3382 	    query=FALSE;

  3383 	/*

  3384 	 * L at the start of a number, representing Britsh pounds, like L500.

  3385 	 * This is cute. We know the current word is mixed digit. If the first

  3386 	 * letter is L, there must be at least one digit following. If both

  3387 	 * digits and letters follow, we have a genuine error, else we have a

  3388 	 * capital L followed by digits, and we accept that as a non-error.

  3389 	 */

  3390 	if (g_utf8_get_char(checkword)=='L' &&

  3391 	  !mixdigit(g_utf8_next_char(checkword)))

  3392 	    query=FALSE;

  3393     }

  3394     return query;

  3395 }

  3397 /*

  3398  * getaword:

  3399  *

  3400  * Extracts the first/next "word" from the line, and returns it.

  3401  * A word is defined as one English word unit--or at least that's the aim.

  3402  * "ptr" is advanced to the position in the line where we will start

  3403  * looking for the next word.

  3404  *

  3405  * Returns: A newly-allocated string.

  3406  */

  3407 gchar *getaword(const char **ptr)

  3408 {

  3409     const char *s,*t;

  3410     GString *word;

  3411     gunichar c,pc;

  3412     word=g_string_new(NULL);

  3413     for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&

  3414       !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&

  3415       **ptr;*ptr=g_utf8_next_char(*ptr))

  3416     {

  3417 	/* Handle exceptions for footnote markers like [1] */

  3418 	if (g_utf8_get_char(*ptr)=='[')

  3419 	{

  3420 	    g_string_append_c(word,'[');

  3421 	    s=g_utf8_next_char(*ptr);

  3422 	    for (;g_unichar_isdigit(g_utf8_get_char(s));s=g_utf8_next_char(s))

  3423 		g_string_append_unichar(word,g_utf8_get_char(s));

  3424 	    if (g_utf8_get_char(s)==']')

  3425 	    {

  3426 		g_string_append_c(word,']');

  3427 		*ptr=g_utf8_next_char(s);

  3428 		return g_string_free(word,FALSE);

  3429 	    }

  3430 	    else

  3431 		g_string_truncate(word,0);

  3432 	}

  3433     }

  3434     /*

  3435      * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.

  3436      * Especially yucky is the case of L1,000

  3437      * This section looks for a pattern of characters including a digit

  3438      * followed by a comma or period followed by one or more digits.

  3439      * If found, it returns this whole pattern as a word; otherwise we discard

  3440      * the results and resume our normal programming.

  3441      */

  3442     s=*ptr;

  3443     for (;g_unichar_isdigit(g_utf8_get_char(s)) ||

  3444       g_unichar_isalpha(g_utf8_get_char(s)) ||

  3445       g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))

  3446 	g_string_append_unichar(word,g_utf8_get_char(s));

  3447     if (word->len)

  3448     {

  3449 	for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))

  3450 	{

  3451 	    c=g_utf8_get_char(t);

  3452 	    pc=g_utf8_get_char(g_utf8_prev_char(t));

  3453 	    if ((c=='.' || c==',') && g_unichar_isdigit(pc))

  3454 	    {

  3455 		*ptr=s;

  3456 		return g_string_free(word,FALSE);

  3457 	    }

  3458 	}

  3459     }

  3460     /* we didn't find a punctuated number - do the regular getword thing */

  3461     g_string_truncate(word,0);

  3462     c=g_utf8_get_char(*ptr);

  3463     for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);

  3464       *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))

  3465 	g_string_append_unichar(word,c);

  3466     return g_string_free(word,FALSE);

  3467 }

  3469 /*

  3470  * isroman:

  3471  *

  3472  * Is this word a Roman Numeral?

  3473  *

  3474  * It doesn't actually validate that the number is a valid Roman Numeral--for

  3475  * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not

  3476  * what we're here to do. If it passes this, it LOOKS like a Roman numeral.

  3477  * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or

  3478  * expressions thereof, except when it came to taxes. Allow any number of M,

  3479  * an optional D, an optional CM or CD, any number of optional Cs, an optional

  3480  * XL or an optional XC, an optional IX or IV, an optional V and any number

  3481  * of optional Is.

  3482  */

  3483 gboolean isroman(const char *t)

  3484 {

  3485     const char *s;

  3486     if (!t || !*t)

  3487 	return FALSE;

  3488     s=t;

  3489     while (g_utf8_get_char(t)=='m' && *t)

  3490 	t++;

  3491     if (g_utf8_get_char(t)=='d')

  3492 	t++;

  3493     if (g_str_has_prefix(t,"cm"))

  3494 	t+=2;

  3495     if (g_str_has_prefix(t,"cd"))

  3496 	t+=2;

  3497     while (g_utf8_get_char(t)=='c' && *t)

  3498 	t++;

  3499     if (g_str_has_prefix(t,"xl"))

  3500 	t+=2;

  3501     if (g_str_has_prefix(t,"xc"))

  3502 	t+=2;

  3503     if (g_utf8_get_char(t)=='l')

  3504 	t++;

  3505     while (g_utf8_get_char(t)=='x' && *t)

  3506 	t++;

  3507     if (g_str_has_prefix(t,"ix"))

  3508 	t+=2;

  3509     if (g_str_has_prefix(t,"iv"))

  3510 	t+=2;

  3511     if (g_utf8_get_char(t)=='v')

  3512 	t++;

  3513     while (g_utf8_get_char(t)=='i' && *t)

  3514 	t++;

  3515     return !*t;

  3516 }

  3518 /*

  3519  * postprocess_for_DP:

  3520  *

  3521  * Invoked with the -d switch from flgets().

  3522  * It simply "removes" from the line a hard-coded set of common

  3523  * DP-specific tags, so that the line passed to the main routine has

  3524  * been pre-cleaned of DP markup.

  3525  */

  3526 void postprocess_for_DP(char *theline)

  3527 {

  3528     char *s,*t;

  3529     int i;

  3530     if (!*theline)

  3531 	return;

  3532     for (i=0;*DPmarkup[i];i++)

  3533 	while ((s=strstr(theline,DPmarkup[i])))

  3534 	{

  3535 	    t=s+strlen(DPmarkup[i]);

  3536 	    memmove(s,t,strlen(t)+1);

  3537 	}

  3538 }

  /*
   * postprocess_for_HTML:
   *
   * Invoked with the -m switch from flgets().
   * It simply "removes" from the line a hard-coded set of common
   * HTML tags and "replaces" a hard-coded set of common HTML
   * entities, so that the line passed to the main routine has
   * been pre-cleaned of HTML.
   */
void postprocess_for_HTML(char *theline)
{
    /* Strip one tag at a time until losemarkup() has nothing more to remove. */
    for (;;)
	if (!losemarkup(theline))
	    break;
    loseentities(theline);
}

  3556 char *losemarkup(char *theline)

  3557 {

  3558     char *s,*t;

  3559     int i;

  3560     s=strchr(theline,'<');

  3561     t=s?strchr(s,'>'):NULL;

  3562     if (!s || !t)

  3563 	return NULL;

  3564     for (i=0;*markup[i];i++)

  3565 	if (tagcomp(g_utf8_next_char(s),markup[i]))

  3566 	{

  3567 	    t=g_utf8_next_char(t);

  3568 	    memmove(s,t,strlen(t)+1);

  3569 	    return s;

  3570 	}

  3571     /* It's an unrecognized <xxx>. */

  3572     return NULL;

  3573 }

  3575 void loseentities(char *theline)

  3576 {

  3577     int i;

  3578     gsize nb;

  3579     char *amp,*scolon;

  3580     gchar *s,*t;

  3581     gunichar c;

  3582     GTree *entities=NULL;

  3583     static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;

  3584     if (!theline)

  3585     {

  3586 	if (entities)

  3587 	    g_tree_destroy(entities);

  3588 	entities=NULL;

  3589 	if (translit!=(GIConv)-1)

  3590 	    g_iconv_close(translit);

  3591 	translit=(GIConv)-1;

  3592 	if (to_utf8!=(GIConv)-1)

  3593 	    g_iconv_close(to_utf8);

  3594 	to_utf8=(GIConv)-1;

  3595 	return;

  3596     }

  3597     if (!*theline)

  3598 	return;

  3599     if (!entities)

  3600     {

  3601 	entities=g_tree_new((GCompareFunc)strcmp);

  3602 	for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)

  3603 	    g_tree_insert(entities,HTMLentities[i].name,

  3604 	      GUINT_TO_POINTER(HTMLentities[i].c));

  3605     }

  3606     if (translit==(GIConv)-1)

  3607 	translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");

  3608     if (to_utf8==(GIConv)-1)

  3609 	to_utf8=g_iconv_open("UTF-8","ISO_8859-1");

  3610     while((amp=strchr(theline,'&')))

  3611     {

  3612 	scolon=strchr(amp,';');

  3613 	if (scolon)

  3614 	{

  3615 	    if (amp[1]=='#')

  3616 	    {

  3617 		if (amp+2+strspn(amp+2,"0123456789")==scolon)

  3618 		    c=strtol(amp+2,NULL,10);

  3619 		else if (amp[2]=='x' &&

  3620 		  amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)

  3621 		    c=strtol(amp+3,NULL,16);

  3622 	    }

  3623 	    else

  3624 	    {

  3625 		s=g_strndup(amp+1,scolon-(amp+1));

  3626 	        c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));

  3627 		g_free(s);

  3628 	    }

  3629 	}

  3630 	else

  3631 	    c=0;

  3632 	if (c)

  3633 	{

  3634 	    theline=amp;

  3635 	    if (c<128 || c>=192 && c<=255)	/* An ISO-8859-1 character */

  3636 		theline+=g_unichar_to_utf8(c,theline);

  3637 	    else

  3638 	    {

  3639 		s=g_malloc(6);

  3640 		nb=g_unichar_to_utf8(c,s);

  3641 		t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);

  3642 		g_free(s);

  3643 		s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);

  3644 		g_free(t);

  3645 		memcpy(theline,s,nb);

  3646 		g_free(s);

  3647 		theline+=nb;

  3648 	    }

  3649 	    memmove(theline,g_utf8_next_char(scolon),

  3650 	      strlen(g_utf8_next_char(scolon))+1);

  3651 	}

  3652 	else

  3653 	    theline=g_utf8_next_char(amp);

  3654     }

  3655 }

  3657 gboolean tagcomp(const char *strin,const char *basetag)

  3658 {

  3659     gboolean retval;

  3660     gchar *s,*t;

  3661     if (g_utf8_get_char(strin)=='/')

  3662 	t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */

  3663     else

  3664 	t=g_utf8_casefold(strin,-1);

  3665     s=g_utf8_casefold(basetag,-1);

  3666     retval=g_str_has_prefix(t,s);

  3667     g_free(s);

  3668     g_free(t);

  3669     return retval;

  3670 }

  3672 void proghelp(GOptionContext *context)

  3673 {

  3674     gchar *help;

  3675     fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);

  3676     fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);

  3677     fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);

  3678     fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "

  3679       "For details, read the file COPYING.\n",stderr);

  3680     fputs("This is Free Software; "

  3681       "you may redistribute it under certain conditions (GPL);\n",stderr);

  3682     fputs("read the file COPYING for details.\n\n",stderr);

  3683     help=g_option_context_get_help(context,TRUE,NULL);

  3684     fputs(help,stderr);

  3685     g_free(help);

  3686     fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);

  3687     fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "

  3688       "non-ASCII\n",stderr);

  3689     fputs("characters like accented letters, "

  3690       "lines longer than 75 or shorter than 55,\n",stderr);

  3691     fputs("unbalanced quotes or brackets, "

  3692       "a variety of badly formatted punctuation, \n",stderr);

  3693     fputs("HTML tags, some likely typos. "

  3694       "It is NOT a substitute for human judgement.\n",stderr);

  3695     fputs("\n",stderr);

  3696 }

author	ali <ali@juiblex.co.uk>
	Sun Oct 27 17:01:47 2013 +0000 (2013-10-27)
changeset 103	d22d8cd4f628
parent 102	ff0aa9b1397a
child 104	70cc629ec1e0
permissions	-rw-r--r--