bookloupe/bookloupe.c
author ali <ali@juiblex.co.uk>
Sun Sep 29 09:18:05 2013 +0100 (2013-09-29)
changeset 150 fd584db1d305
parent 142 466f43a12118
child 152 da598b05f8e8
permissions -rw-r--r--
Fix bug #13: Character sets
     1 /*************************************************************************/
     2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
     3 /*									 */
     4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>			 */
     5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>			 */
     6 /*									 */
     7 /* This program is free software; you can redistribute it and/or modify  */
     8 /* it under the terms of the GNU General Public License as published by  */
     9 /* the Free Software Foundation; either version 2 of the License, or     */
    10 /* (at your option) any later version.					 */
    11 /*									 */
    12 /* This program is distributed in the hope that it will be useful,       */
    13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of	 */
    14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the		 */
    15 /* GNU General Public License for more details.				 */
    16 /*									 */
    17 /* You should have received a copy of the GNU General Public License	 */
    18 /* along with this program. If not, see <http://www.gnu.org/licenses/>.	 */
    19 /*************************************************************************/
    20 
    21 #include <stdio.h>
    22 #include <stdlib.h>
    23 #include <string.h>
    24 #include <ctype.h>
    25 #ifdef __WIN32__
    26 #include <windows.h>
    27 #endif
    28 #include <glib.h>
    29 #include <bl/bl.h>
    30 #include "bookloupe.h"
    31 #include "counters.h"
    32 #include "pending.h"
    33 #include "HTMLentities.h"
    34 
    35 gchar *charset;		/* Or NULL for auto (ISO_8859-1/ASCII or UNICODE) */
    36 GIConv charset_validator=(GIConv)-1;
    37 
    38 gchar *prevline;
    39 
    40 /* Common typos. */
    41 char *typo[] = {
    42     "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
    43     "nad", "te", "ig", "acn",  "ahve", "alot", "anbd", "andt", "awya", "aywa",
    44     "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
    45     "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
    46     "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
    47     "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
    48     "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
    49     "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
    50     "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
    51     "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
    52     "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
    53     "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
    54     "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
    55     "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
    56     "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
    57     "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
    58     "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
    59     "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
    60     "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
    61     "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
    62     "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
    63     "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
    64     "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
    65     "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
    66     "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
    67     "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
    68     "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
    69     "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
    70     "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
    71     "se", ""
    72 };
    73 
    74 GTree *usertypo;
    75 
    76 /* Common abbreviations and other OK words not to query as typos. */
    77 char *okword[] = {
    78     "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
    79     "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
    80     "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
    81     "outbid", "outbids", "frostbite", "frostbitten", ""
    82 };
    83 
    84 /* Common abbreviations that cause otherwise unexplained periods. */
    85 char *abbrev[] = {
    86     "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
    87     "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
    88 };
    89 
    90 /*
    91  * Two-Letter combinations that rarely if ever start words,
    92  * but are common scannos or otherwise common letter combinations.
    93  */
    94 char *nostart[] = {
    95     "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
    96 };
    97 
    98 /*
    99  * Two-Letter combinations that rarely if ever end words,
   100  * but are common scannos or otherwise common letter combinations.
   101  */
   102 char *noend[] = {
   103     "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
   104     "sw", "gr", "sl", "cl", "iy", ""
   105 };
   106 
   107 char *markup[] = {
   108     "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
   109     "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
   110     "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
   111     "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
   112 };
   113 
   114 char *DPmarkup[] = {
   115     "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
   116 };
   117 
   118 char *nocomma[] = {
   119     "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
   120     "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
   121     "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
   122     "during", "let", "toward", "among", ""
   123 };
   124 
   125 char *noperiod[] = {
   126     "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
   127     "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
   128     "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
   129     "among", "those", "into", "whom", "having", "thence", ""
   130 }; 
   131 
   132 gboolean pswit[SWITNO];  /* program switches */
   133 gchar *opt_charset;
   134 
   135 static GOptionEntry options[]={
   136     { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
   137       "Ignore DP-specific markup", NULL },
   138     { "noecho", 'e', 0, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
   139       "Don't echo queried line", NULL },
   140     { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
   141       "Check single quotes", NULL },
   142     { "typo", 't', 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
   143       "Check common typos", NULL },
   144     { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
   145       "Require closure of quotes on every paragraph", NULL },
   146     { "relaxed", 'x', 0, G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
   147       "Disable paranoid querying of everything", NULL },
   148     { "line-end", 'l', 0, G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
   149       "Disable line end checking", NULL },
   150     { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
   151       "Overview: just show counts", NULL },
   152     { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
   153       "Output errors to stdout instead of stderr", NULL },
   154     { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
   155       "Echo header fields", NULL },
   156     { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
   157       "Ignore markup in < >", NULL },
   158     { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
   159       "Use file of user-defined typos", NULL },
   160     { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
   161       "Defaults for use on www upload", NULL },
   162     { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
   163       "Verbose - list everything", NULL },
   164     { "charset", 0, 0, G_OPTION_ARG_STRING, &opt_charset,
   165       "Set of characters valid for this ebook", "NAME" },
   166     { NULL }
   167 };
   168 
   169 long cnt_quote;		/* for overview mode, count of quote queries */
   170 long cnt_brack;		/* for overview mode, count of brackets queries */
   171 long cnt_bin;		/* for overview mode, count of non-ASCII queries */
   172 long cnt_odd;		/* for overview mode, count of odd character queries */
   173 long cnt_long;		/* for overview mode, count of long line errors */
   174 long cnt_short;		/* for overview mode, count of short line queries */
   175 long cnt_punct;		/* for overview mode,
   176 			   count of punctuation and spacing queries */
   177 long cnt_dash;		/* for overview mode, count of dash-related queries */
   178 long cnt_word;		/* for overview mode, count of word queries */
   179 long cnt_html;		/* for overview mode, count of html queries */
   180 long cnt_lineend;	/* for overview mode, count of line-end queries */
   181 long cnt_spacend;	/* count of lines with space at end */
   182 long linecnt;		/* count of total lines in the file */
   183 long checked_linecnt;	/* count of lines actually checked */
   184 
   185 void proghelp(GOptionContext *context);
   186 void procfile(const char *);
   187 
   188 gchar *running_from;
   189 
   190 gboolean mixdigit(const char *);
   191 gchar *getaword(const char **);
   192 char *flgets(char **,long);
   193 void postprocess_for_HTML(char *);
   194 char *linehasmarkup(char *);
   195 char *losemarkup(char *);
   196 gboolean tagcomp(const char *,const char *);
   197 void loseentities(char *);
   198 gboolean isroman(const char *);
   199 void postprocess_for_DP(char *);
   200 void print_as_windows_1252(const char *string);
   201 void print_as_utf_8(const char *string);
   202 
   203 GTree *qword,*qperiod;
   204 
   205 #ifdef __WIN32__
   206 UINT saved_cp;
   207 #endif
   208 
   209 gboolean set_charset(const char *name,GError **err)
   210 {
   211     /* The various UNICODE encodings all share the same character set. */
   212     const char *unicode_aliases[]={ "UCS-2", "UCS-2BE", "UCS-2LE", "UCS-4",
   213       "UCS-4BE", "UCS-4LE", "UCS2", "UCS4", "UNICODE", "UNICODEBIG",
   214       "UNICODELITTLE", "UTF-7", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE",
   215       "UTF-32", "UTF-32BE", "UTF-32LE", "UTF7", "UTF8", "UTF16", "UTF16BE",
   216       "UTF16LE", "UTF32", "UTF32BE", "UTF32LE" };
   217     int i;
   218     if (charset)
   219 	g_free(charset);
   220     if (charset_validator!=(GIConv)-1)
   221 	g_iconv_close(charset_validator);
   222     if (!name || !g_strcasecmp(name,"auto"))
   223     {
   224 	charset=NULL;
   225 	charset_validator=(GIConv)-1;
   226 	return TRUE;
   227     }
   228     else
   229 	charset=g_strdup(name);
   230     for(i=0;i<G_N_ELEMENTS(unicode_aliases);i++)
   231 	if (!g_strcasecmp(charset,unicode_aliases[i]))
   232 	{
   233 	    g_free(charset);
   234 	    charset=g_strdup("UTF-8");
   235 	    break;
   236 	}
   237     if (!strcmp(charset,"UTF-8"))
   238 	charset_validator=(GIConv)-1;
   239     else
   240     {
   241 	charset_validator=g_iconv_open(charset,"UTF-8");
   242 	if (charset_validator==(GIConv)-1)
   243 	{
   244 	    g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_NO_CONVERSION,
   245 	      "Unknown character set \"%s\"",charset);
   246 	    return FALSE;
   247 	}
   248     }
   249     return TRUE;
   250 }
   251 
   252 void parse_options(int *argc,char ***argv)
   253 {
   254     GError *err=NULL;
   255     GOptionContext *context;
   256     context=g_option_context_new(
   257       "file - looks for errors in Project Gutenberg(TM) etexts");
   258     g_option_context_add_main_entries(context,options,NULL);
   259     if (!g_option_context_parse(context,argc,argv,&err))
   260     {
   261 	g_printerr("Bookloupe: %s\n",err->message);
   262 	g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
   263 	exit(1);
   264     }
   265     /* Paranoid checking is turned OFF, not on, by its switch */
   266     pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
   267     if (pswit[PARANOID_SWITCH])
   268 	/* if running in paranoid mode, typo checks default to enabled */
   269 	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
   270     /* Line-end checking is turned OFF, not on, by its switch */
   271     pswit[LINE_END_SWITCH]=!pswit[LINE_END_SWITCH];
   272     /* Echoing is turned OFF, not on, by its switch */
   273     pswit[ECHO_SWITCH]=!pswit[ECHO_SWITCH];
   274     if (pswit[OVERVIEW_SWITCH])
   275 	/* just print summary; don't echo */
   276 	pswit[ECHO_SWITCH]=FALSE;
   277     /*
   278      * Web uploads - for the moment, this is really just a placeholder
   279      * until we decide what processing we really want to do on web uploads
   280      */
   281     if (pswit[WEB_SWITCH])
   282     {
   283 	/* specific override for web uploads */
   284 	pswit[ECHO_SWITCH]=TRUE;
   285 	pswit[SQUOTE_SWITCH]=FALSE;
   286 	pswit[TYPO_SWITCH]=TRUE;
   287 	pswit[QPARA_SWITCH]=FALSE;
   288 	pswit[PARANOID_SWITCH]=TRUE;
   289 	pswit[LINE_END_SWITCH]=FALSE;
   290 	pswit[OVERVIEW_SWITCH]=FALSE;
   291 	pswit[STDOUT_SWITCH]=FALSE;
   292 	pswit[HEADER_SWITCH]=TRUE;
   293 	pswit[VERBOSE_SWITCH]=FALSE;
   294 	pswit[MARKUP_SWITCH]=FALSE;
   295 	pswit[USERTYPO_SWITCH]=FALSE;
   296 	pswit[DP_SWITCH]=FALSE;
   297     }
   298     if (opt_charset && !set_charset(opt_charset,&err))
   299     {
   300 	g_printerr("%s\n",err->message);
   301 	exit(1);
   302     }
   303     g_free(opt_charset);
   304     opt_charset=NULL;
   305     if (*argc<2)
   306     {
   307 	proghelp(context);
   308 	exit(1);
   309     }
   310     g_option_context_free(context);
   311 }
   312 
   313 /*
   314  * read_user_scannos:
   315  *
   316  * Read in the user-defined stealth scanno list.
   317  */
   318 void read_user_scannos(void)
   319 {
   320     GError *err=NULL;
   321     gchar *usertypo_file;
   322     gboolean okay;
   323     int i;
   324     gsize len,nb;
   325     gchar *contents,*utf8,**lines;
   326     usertypo_file=g_strdup("bookloupe.typ");
   327     okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   328     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   329     {
   330 	g_clear_error(&err);
   331 	g_free(usertypo_file);
   332 	usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
   333 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   334     }
   335     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   336     {
   337 	g_clear_error(&err);
   338 	g_free(usertypo_file);
   339 	usertypo_file=g_strdup("gutcheck.typ");
   340 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   341     }
   342     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   343     {
   344 	g_clear_error(&err);
   345 	g_free(usertypo_file);
   346 	usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
   347 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   348     }
   349     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   350     {
   351 	g_free(usertypo_file);
   352 	g_print("   --> I couldn't find bookloupe.typ "
   353 	  "-- proceeding without user typos.\n");
   354 	return;
   355     }
   356     else if (!okay)
   357     {
   358 	fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
   359 	g_free(usertypo_file);
   360 	g_clear_error(&err);
   361 	exit(1);
   362     }
   363     if (g_utf8_validate(contents,len,NULL))
   364     {
   365 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
   366 	if (!charset)
   367 	    (void)set_charset("UNICODE",NULL);
   368     }
   369     else
   370 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
   371     g_free(contents);
   372     lines=g_strsplit_set(utf8,"\r\n",0);
   373     g_free(utf8);
   374     usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
   375     for (i=0;lines[i];i++)
   376 	if (*(unsigned char *)lines[i]>'!')
   377 	    g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
   378 	else
   379 	    g_free(lines[i]);
   380     g_free(lines);
   381 }
   382 
   383 /*
   384  * read_etext:
   385  *
   386  * Read an etext returning a newly allocated string containing the file
   387  * contents or NULL on error.
   388  */
   389 gchar *read_etext(const char *filename,GError **err)
   390 {
   391     GError *tmp_err=NULL;
   392     gchar *contents,*utf8;
   393     gsize len,bytes_read,bytes_written;
   394     int i,line,col;
   395     if (!g_file_get_contents(filename,&contents,&len,err))
   396 	return NULL;
   397     if (g_utf8_validate(contents,len,NULL))
   398     {
   399 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
   400 	g_set_print_handler(print_as_utf_8);
   401 #ifdef __WIN32__
   402 	SetConsoleOutputCP(CP_UTF8);
   403 #endif
   404     }
   405     else
   406     {
   407 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,
   408 	  &bytes_written,&tmp_err);
   409 	if (g_error_matches(tmp_err,G_CONVERT_ERROR,
   410 	  G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
   411 	{
   412 	    line=col=1;
   413 	    for(i=0;i<bytes_read;i++)
   414 		if (contents[i]=='\n')
   415 		{
   416 		    line++;
   417 		    col=1;
   418 		}
   419 		else if (contents[i]!='\r')
   420 		    col++;
   421 	    g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
   422 	      "Input conversion failed. Byte %d at line %d, column %d is not a "
   423 	      "valid Windows-1252 character",
   424 	      ((unsigned char *)contents)[bytes_read],line,col);
   425 	}
   426 	else if (tmp_err)
   427 	    g_propagate_error(err,tmp_err);
   428 	g_set_print_handler(print_as_windows_1252);
   429 #ifdef __WIN32__
   430 	SetConsoleOutputCP(1252);
   431 #endif
   432     }
   433     g_free(contents);
   434     return utf8;
   435 }
   436 
   437 void cleanup_on_exit(void)
   438 {
   439 #ifdef __WIN32__
   440     SetConsoleOutputCP(saved_cp);
   441 #endif
   442 }
   443 
   444 int main(int argc,char **argv)
   445 {
   446 #ifdef __WIN32__
   447     atexit(cleanup_on_exit);
   448     saved_cp=GetConsoleOutputCP();
   449 #endif
   450     running_from=g_path_get_dirname(argv[0]);
   451     parse_options(&argc,&argv);
   452     if (pswit[USERTYPO_SWITCH])
   453 	read_user_scannos();
   454     fprintf(stderr,"bookloupe: Check and report on an e-text\n");
   455     procfile(argv[1]);
   456     if (pswit[OVERVIEW_SWITCH])
   457     {
   458 	g_print("    Checked %ld lines of %ld (head+foot = %ld)\n\n",
   459 	  checked_linecnt,linecnt,linecnt-checked_linecnt);
   460 	g_print("    --------------- Queries found --------------\n");
   461 	if (cnt_long)
   462 	    g_print("    Long lines:		    %14ld\n",cnt_long);
   463 	if (cnt_short)
   464 	    g_print("    Short lines:		   %14ld\n",cnt_short);
   465 	if (cnt_lineend)
   466 	    g_print("    Line-end problems:	     %14ld\n",cnt_lineend);
   467 	if (cnt_word)
   468 	    g_print("    Common typos:		  %14ld\n",cnt_word);
   469 	if (cnt_quote)
   470 	    g_print("    Unmatched quotes:	      %14ld\n",cnt_quote);
   471 	if (cnt_brack)
   472 	    g_print("    Unmatched brackets:	    %14ld\n",cnt_brack);
   473 	if (cnt_bin)
   474 	    g_print("    Non-ASCII characters:	  %14ld\n",cnt_bin);
   475 	if (cnt_odd)
   476 	    g_print("    Proofing characters:	   %14ld\n",cnt_odd);
   477 	if (cnt_punct)
   478 	    g_print("    Punctuation & spacing queries: %14ld\n",cnt_punct);
   479 	if (cnt_dash)
   480 	    g_print("    Non-standard dashes:	   %14ld\n",cnt_dash);
   481 	if (cnt_html)
   482 	    g_print("    Possible HTML tags:	    %14ld\n",cnt_html);
   483 	g_print("\n");
   484 	g_print("    TOTAL QUERIES		  %14ld\n",
   485 	  cnt_quote+cnt_brack+cnt_bin+cnt_odd+cnt_long+cnt_short+cnt_punct+
   486 	  cnt_dash+cnt_word+cnt_html+cnt_lineend);
   487     }
   488     g_free(running_from);
   489     if (usertypo)
   490 	g_tree_unref(usertypo);
   491     set_charset(NULL,NULL);
   492     return 0;
   493 }
   494 
   495 /*
   496  * first_pass:
   497  *
   498  * Run a first pass - verify that it's a valid PG
   499  * file, decide whether to report some things that
   500  * occur many times in the text like long or short
   501  * lines, non-standard dashes, etc.
   502  */
   503 struct first_pass_results *first_pass(const char *etext)
   504 {
   505     gunichar laststart=CHAR_SPACE;
   506     const char *s;
   507     gchar *lc_line;
   508     int i,j,lbytes,llen;
   509     gchar **lines;
   510     unsigned int lastlen=0,lastblen=0;
   511     long spline=0,nspline=0;
   512     static struct first_pass_results results={0};
   513     gchar *inword;
   514     QuoteClass qc;
   515     lines=g_strsplit(etext,"\n",0);
   516     for (j=0;lines[j];j++)
   517     {
   518 	lbytes=strlen(lines[j]);
   519 	while (lbytes>0 && lines[j][lbytes-1]=='\r')
   520 	    lines[j][--lbytes]='\0';
   521 	llen=g_utf8_strlen(lines[j],lbytes);
   522 	linecnt++;
   523 	if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
   524 	  (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
   525 	{
   526 	    if (spline)
   527 		g_print("   --> Duplicate header?\n");
   528 	    spline=linecnt+1;   /* first line of non-header text, that is */
   529 	}
   530 	if (!strncmp(lines[j],"*** START",9) &&
   531 	  strstr(lines[j],"PROJECT GUTENBERG"))
   532 	{
   533 	    if (nspline)
   534 		g_print("   --> Duplicate header?\n");
   535 	    nspline=linecnt+1;   /* first line of non-header text, that is */
   536 	}
   537 	if (spline || nspline)
   538 	{
   539 	    lc_line=g_utf8_strdown(lines[j],lbytes);
   540 	    if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
   541 	    {
   542 		if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
   543 		{
   544 		    if (results.footerline)
   545 		    {
   546 			/* it's an old-form header - we can detect duplicates */
   547 			if (!nspline)
   548 			    g_print("   --> Duplicate footer?\n");
   549 		    }
   550 		    else
   551 			results.footerline=linecnt;
   552 		}
   553 	    }
   554 	    g_free(lc_line);
   555 	}
   556 	if (spline)
   557 	    results.firstline=spline;
   558 	if (nspline)
   559 	    results.firstline=nspline;  /* override with new */
   560 	if (results.footerline)
   561 	    continue;    /* don't count the boilerplate in the footer */
   562 	results.totlen+=llen;
   563 	for (s=lines[j];*s;s=g_utf8_next_char(s))
   564 	{
   565 	    if (g_utf8_get_char(s)>127)
   566 		results.binlen++;
   567 	    if (g_unichar_isalpha(g_utf8_get_char(s)))
   568 		results.alphalen++;
   569 	    if (s>lines[j])
   570 	    {
   571 		if (CHAR_IS_DQUOTE(g_utf8_get_char(s)))
   572 		    qc=QUOTE_CLASS(g_utf8_get_char(s));
   573 		else
   574 		    qc=INVALID_QUOTE;
   575 		if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) &&
   576 		  isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
   577 		    results.endquote_count++;
   578 	    }
   579 	}
   580 	if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
   581 	  lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
   582 	    results.shortline++;
   583 	if (lbytes>0 &&
   584 	  g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
   585 	    cnt_spacend++;
   586 	if (strstr(lines[j],".,"))
   587 	    results.dotcomma++;
   588 	/* only count ast lines for ignoring purposes where there is */
   589 	/* locase text on the line */
   590 	if (strchr(lines[j],'*'))
   591 	{
   592 	    for (s=lines[j];*s;s=g_utf8_next_char(s))
   593 		if (g_unichar_islower(g_utf8_get_char(s)))
   594 		    break;
   595 	    if (*s)
   596 		results.astline++;
   597 	}
   598 	if (strchr(lines[j],'/'))
   599 	    results.fslashline++;
   600 	if (lbytes>0)
   601 	{
   602 	    for (s=g_utf8_prev_char(lines[j]+lbytes);
   603 	      s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;
   604 	      s=g_utf8_prev_char(s))
   605 		;
   606 	    if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
   607 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')
   608 		results.hyphens++;
   609 	}
   610 	if (llen>LONGEST_PG_LINE)
   611 	    results.longline++;
   612 	if (llen>WAY_TOO_LONG)
   613 	    results.verylongline++;
   614 	if (strchr(lines[j],'<') && strchr(lines[j],'>'))
   615 	{
   616 	    i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
   617 	    if (i>0)
   618 		results.htmcount++;
   619 	    if (strstr(lines[j],"<i>"))
   620 		results.htmcount+=4; /* bonus marks! */
   621 	}
   622 	/* Check for spaced em-dashes */
   623 	if (lines[j][0] && (s=strstr(g_utf8_next_char(lines[j]),"--")))
   624 	{
   625 	    results.emdash++;
   626 	    if (s[-1]==CHAR_SPACE || s[2]==CHAR_SPACE)
   627 		results.space_emdash++;
   628 	    if (s[-1]==CHAR_SPACE && s[2]==CHAR_SPACE)
   629 		/* count of em-dashes with spaces both sides */
   630 		results.non_PG_space_emdash++;
   631 	    if (s[-1]!=CHAR_SPACE && s[2]!=CHAR_SPACE)
   632 		/* count of PG-type em-dashes with no spaces */
   633 		results.PG_space_emdash++;
   634 	}
   635 	for (s=lines[j];*s;)
   636 	{
   637 	    inword=getaword(&s);
   638 	    if (!strcmp(inword,"hij") || !strcmp(inword,"niet")) 
   639 		results.Dutchcount++;
   640 	    if (!strcmp(inword,"dans") || !strcmp(inword,"avec")) 
   641 		results.Frenchcount++;
   642 	    if (!strcmp(inword,"0") || !strcmp(inword,"1")) 
   643 		results.standalone_digit++;
   644 	    g_free(inword);
   645 	}
   646 	/* Check for spaced dashes */
   647 	if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
   648 	    results.spacedash++;
   649 	lastblen=lastlen;
   650 	lastlen=llen;
   651 	laststart=lines[j][0];
   652     }
   653     g_strfreev(lines);
   654     return &results;
   655 }
   656 
   657 /*
   658  * report_first_pass:
   659  *
   660  * Make some snap decisions based on the first pass results.
   661  */
   662 struct warnings *report_first_pass(struct first_pass_results *results)
   663 {
   664     static struct warnings warnings={0};
   665     if (cnt_spacend>0)
   666 	g_print("   --> %ld lines in this file have white space at end\n",
   667 	  cnt_spacend);
   668     warnings.dotcomma=1;
   669     if (results->dotcomma>5)
   670     {
   671 	warnings.dotcomma=0;
   672 	g_print("   --> %ld lines in this file contain '.,'. "
   673 	  "Not reporting them.\n",results->dotcomma);
   674     }
   675     /*
   676      * If more than 50 lines, or one-tenth, are short,
   677      * don't bother reporting them.
   678      */
   679     warnings.shortline=1;
   680     if (results->shortline>50 || results->shortline*10>linecnt)
   681     {
   682 	warnings.shortline=0;
   683 	g_print("   --> %ld lines in this file are short. "
   684 	  "Not reporting short lines.\n",results->shortline);
   685     }
   686     /*
   687      * If more than 50 lines, or one-tenth, are long,
   688      * don't bother reporting them.
   689      */
   690     warnings.longline=1;
   691     if (results->longline>50 || results->longline*10>linecnt)
   692     {
   693 	warnings.longline=0;
   694 	g_print("   --> %ld lines in this file are long. "
   695 	  "Not reporting long lines.\n",results->longline);
   696     }
   697     /* If more than 10 lines contain asterisks, don't bother reporting them. */
   698     warnings.ast=1;
   699     if (results->astline>10)
   700     {
   701 	warnings.ast=0;
   702 	g_print("   --> %ld lines in this file contain asterisks. "
   703 	  "Not reporting them.\n",results->astline);
   704     }
   705     /*
   706      * If more than 10 lines contain forward slashes,
   707      * don't bother reporting them.
   708      */
   709     warnings.fslash=1;
   710     if (results->fslashline>10)
   711     {
   712 	warnings.fslash=0;
   713 	g_print("   --> %ld lines in this file contain forward slashes. "
   714 	  "Not reporting them.\n",results->fslashline);
   715     }
   716     /*
   717      * If more than 20 lines contain unpunctuated endquotes,
   718      * don't bother reporting them.
   719      */
   720     warnings.endquote=1;
   721     if (results->endquote_count>20)
   722     {
   723 	warnings.endquote=0;
   724 	g_print("   --> %ld lines in this file contain unpunctuated endquotes. "
   725 	  "Not reporting them.\n",results->endquote_count);
   726     }
   727     /*
   728      * If more than 15 lines contain standalone digits,
   729      * don't bother reporting them.
   730      */
   731     warnings.digit=1;
   732     if (results->standalone_digit>10)
   733     {
   734 	warnings.digit=0;
   735 	g_print("   --> %ld lines in this file contain standalone 0s and 1s. "
   736 	  "Not reporting them.\n",results->standalone_digit);
   737     }
   738     /*
   739      * If more than 20 lines contain hyphens at end,
   740      * don't bother reporting them.
   741      */
   742     warnings.hyphen=1;
   743     if (results->hyphens>20)
   744     {
   745 	warnings.hyphen=0;
   746 	g_print("   --> %ld lines in this file have hyphens at end. "
   747 	  "Not reporting them.\n",results->hyphens);
   748     }
   749     if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
   750     {
   751 	g_print("   --> Looks like this is HTML. Switching HTML mode ON.\n");
   752 	pswit[MARKUP_SWITCH]=1;
   753     }
   754     if (results->verylongline>0)
   755 	g_print("   --> %ld lines in this file are VERY long!\n",
   756 	  results->verylongline);
   757     /*
   758      * If there are more non-PG spaced dashes than PG em-dashes,
   759      * assume it's deliberate.
   760      * Current PG guidelines say don't use them, but older texts do,
   761      * and some people insist on them whatever the guidelines say.
   762      */
   763     warnings.dash=1;
   764     if (results->spacedash+results->non_PG_space_emdash>
   765       results->PG_space_emdash)
   766     {
   767 	warnings.dash=0;
   768 	g_print("   --> There are %ld spaced dashes and em-dashes. "
   769 	  "Not reporting them.\n",
   770 	  results->spacedash+results->non_PG_space_emdash);
   771     }
   772     if (charset)
   773 	warnings.bin=0;
   774     else
   775     {
   776 	/* Charset ISO_8859-1/ASCII checks for compatibility with gutcheck */
   777 	warnings.bin=1;
   778 	/* If more than a quarter of characters are hi-bit, bug out. */
   779 	if (results->binlen*4>results->totlen)
   780 	{
   781 	    g_print("   --> This file does not appear to be ASCII. "
   782 	      "Terminating. Best of luck with it!\n");
   783 	    exit(1);
   784 	}
   785 	if (results->alphalen*4<results->totlen)
   786 	{
   787 	    g_print("   --> This file does not appear to be text. "
   788 	      "Terminating. Best of luck with it!\n");
   789 	    exit(1);
   790 	}
   791 	if (results->binlen*100>results->totlen || results->binlen>100)
   792 	{
   793 	    g_print("   --> There are a lot of foreign letters here. "
   794 	      "Not reporting them.\n");
   795 	    if (!pswit[VERBOSE_SWITCH])
   796 		warnings.bin=0;
   797 	}
   798     }
   799     warnings.isDutch=FALSE;
   800     if (results->Dutchcount>50)
   801     {
   802 	warnings.isDutch=TRUE;
   803 	g_print("   --> This looks like Dutch - "
   804 	  "switching off dashes and warnings for 's Middags case.\n");
   805     }
   806     warnings.isFrench=FALSE;
   807     if (results->Frenchcount>50)
   808     {
   809 	warnings.isFrench=TRUE;
   810 	g_print("   --> This looks like French - "
   811 	  "switching off some doublepunct.\n");
   812     }
   813     if (results->firstline && results->footerline)
   814 	g_print("    The PG header and footer appear to be already on.\n");
   815     else
   816     {
   817 	if (results->firstline)
   818 	    g_print("    The PG header is on - no footer.\n");
   819 	if (results->footerline)
   820 	    g_print("    The PG footer is on - no header.\n");
   821     }
   822     g_print("\n");
   823     if (pswit[VERBOSE_SWITCH])
   824     {
   825 	warnings.shortline=1;
   826 	warnings.dotcomma=1;
   827 	warnings.longline=1;
   828 	warnings.dash=1;
   829 	warnings.digit=1;
   830 	warnings.ast=1;
   831 	warnings.fslash=1;
   832 	warnings.hyphen=1;
   833 	warnings.endquote=1;
   834 	g_print("   *** Verbose output is ON -- you asked for it! ***\n");
   835     }
   836     if (warnings.isDutch)
   837 	warnings.dash=0;
   838     if (results->footerline>0 && results->firstline>0 &&
   839       results->footerline>results->firstline &&
   840       results->footerline-results->firstline<100)
   841     {
   842 	g_print("   --> I don't really know where this text starts. \n");
   843 	g_print("       There are no reference points.\n");
   844 	g_print("       I'm going to have to report the header and footer "
   845 	  "as well.\n");
   846 	results->firstline=0;
   847     }
   848     return &warnings;
   849 }
   850 
   851 /*
   852  * analyse_quotes:
   853  *
   854  * Look along the line, accumulate the count of quotes, and see
   855  * if this is an empty line - i.e. a line with nothing on it
   856  * but spaces.
   857  * If line has just spaces, period, * and/or - on it, don't
   858  * count it, since empty lines with asterisks or dashes to
   859  * separate sections are common.
   860  *
   861  * Returns: TRUE if the line is empty.
   862  */
   863 gboolean analyse_quotes(const char *aline,int linecnt,struct counters *counters)
   864 {
   865     int guessquote=0;
   866     /* assume the line is empty until proven otherwise */
   867     gboolean isemptyline=TRUE;
   868     const char *s=aline,*sprev,*snext;
   869     gunichar c;
   870     sprev=NULL;
   871     GError *tmp_err=NULL;
   872     while (*s)
   873     {
   874 	snext=g_utf8_next_char(s);
   875 	c=g_utf8_get_char(s);
   876 	if (CHAR_IS_DQUOTE(c))
   877 	    (void)count_quote(counters,c,QUOTE_CLASS(c),&tmp_err);
   878 	else if (CHAR_IS_SQUOTE(c) && pswit[SQUOTE_SWITCH])
   879 	{
   880 	    if (s==aline)
   881 	    {
   882 		/*
   883 		 * At start of line, it can only be a quotation mark.
   884 		 * Hardcode a very common exception!
   885 		 */
   886 		if (!g_str_has_prefix(snext,"tis") &&
   887 		  !g_str_has_prefix(snext,"Tis"))
   888 		    (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
   889 	    }
   890 	    else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
   891 	      g_unichar_isalpha(g_utf8_get_char(snext)))
   892 		/* Do nothing! it's definitely an apostrophe, not a quote */
   893 		;
   894 	    /* it's outside a word - let's check it out */
   895 	    else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||
   896 	      g_unichar_isalpha(g_utf8_get_char(snext)))
   897 	    {
   898 		/* certainly looks like a quotation mark */
   899 		if (!g_str_has_prefix(snext,"tis") &&
   900 		  !g_str_has_prefix(snext,"Tis"))
   901 		    /* hardcode a very common exception! */
   902 		{
   903 		    if (strchr(".?!,;:",g_utf8_get_char(sprev)))
   904 			(void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
   905 		    else
   906 			(void)count_quote(counters,c,OPENING_QUOTE,&tmp_err);
   907 		}
   908 	    }
   909 	    else
   910 	    {
   911 		/* now - is it a quotation mark? */
   912 		guessquote=0;   /* accumulate clues */
   913 		if (g_unichar_isalpha(g_utf8_get_char(sprev)))
   914 		{
   915 		    /* it follows a letter - could be either */
   916 		    guessquote++;
   917 		    if (g_utf8_get_char(sprev)=='s')
   918 		    {
   919 			/* looks like a plural apostrophe */
   920 			guessquote-=3;
   921 			if (g_utf8_get_char(snext)==CHAR_SPACE)
   922 			    /* bonus marks! */
   923 			    guessquote-=2;
   924 		    }
   925 		    if (innermost_quote_matches(counters,c))
   926 			/*
   927 			 * Give it the benefit of some doubt,
   928 			 * if a squote is already open.
   929 			 */
   930 			guessquote++;
   931 		    else
   932 			guessquote--;
   933 		    if (guessquote>=0)
   934 			(void)count_quote(counters,c,CLOSING_QUOTE,&tmp_err);
   935 		}
   936 		else
   937 		    /* no adjacent letter - it must be a quote of some kind */
   938 		    (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
   939 	    }
   940 	}
   941 	if (tmp_err)
   942 	{
   943 	    if (pswit[ECHO_SWITCH])
   944 		g_print("\n%s\n",aline);
   945 	    if (!pswit[OVERVIEW_SWITCH])
   946 		g_print("    Line %ld column %ld - %s\n",
   947 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1,tmp_err->message);
   948 	    g_clear_error(&tmp_err);
   949 	}
   950 	if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
   951 	  c!='\r' && c!='\n')
   952 	    isemptyline=FALSE;  /* ignore lines like  *  *  *  as spacers */
   953 	if (c==CHAR_UNDERSCORE)
   954 	    counters->c_unders++;
   955 	if (c==CHAR_OPEN_SBRACK)
   956 	{
   957 	    if (!matching_difference(counters,COUNTER_ILLUSTRATION) &&
   958 	      !matching_difference(counters,c) && s==aline &&
   959 	      g_str_has_prefix(s,"[Illustration:"))
   960 		increment_matching(counters,COUNTER_ILLUSTRATION,TRUE);
   961 	    else
   962 		increment_matching(counters,c,TRUE);
   963 	}
   964 	else if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK)
   965 	    increment_matching(counters,c,TRUE);
   966 	if (c==CHAR_CLOSE_SBRACK)
   967 	{
   968 	    if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) &&
   969 	      !matching_difference(counters,c) && !*snext)
   970 		increment_matching(counters,COUNTER_ILLUSTRATION,FALSE);
   971 	    else
   972 		increment_matching(counters,c,FALSE);
   973 	}
   974 	else if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK)
   975 	    increment_matching(counters,c,FALSE);
   976 	sprev=s;
   977 	s=snext;
   978     }
   979     return isemptyline;
   980 }
   981 
   982 /*
   983  * check_for_control_characters:
   984  *
   985  * Check for invalid or questionable characters in the line
   986  * Anything above 127 is invalid for plain ASCII, and
   987  * non-printable control characters should also be flagged.
   988  * Tabs should generally not be there.
   989  */
   990 void check_for_control_characters(const char *aline)
   991 {
   992     gunichar c;
   993     const char *s;
   994     for (s=aline;*s;s=g_utf8_next_char(s))
   995     {
   996 	c=g_utf8_get_char(s);
   997 	if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
   998 	{
   999 	    if (pswit[ECHO_SWITCH])
  1000 		g_print("\n%s\n",aline);
  1001 	    if (!pswit[OVERVIEW_SWITCH])
  1002 		g_print("    Line %ld column %ld - Control character %u\n",
  1003 		  linecnt,g_utf8_pointer_to_offset(s,aline)+1,c);
  1004 	    else
  1005 		cnt_bin++;
  1006 	}
  1007     }
  1008 }
  1009 
  1010 /*
  1011  * check_for_odd_characters:
  1012  *
  1013  * Check for binary and other odd characters.
  1014  */
  1015 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
  1016   gboolean isemptyline)
  1017 {
  1018     /* Don't repeat multiple warnings on one line. */
  1019     gboolean eInvalidChar=FALSE,eTab=FALSE,eTilde=FALSE;
  1020     gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
  1021     const char *s;
  1022     gunichar c;
  1023     gsize nb;
  1024     gchar *t;
  1025     for (s=aline;*s;s=g_utf8_next_char(s))
  1026     {
  1027 	c=g_utf8_get_char(s);
  1028 	if (warnings->bin && !eInvalidChar &&
  1029 	  (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
  1030 	{
  1031 	    if (pswit[ECHO_SWITCH])
  1032 		g_print("\n%s\n",aline);
  1033 	    if (!pswit[OVERVIEW_SWITCH])
  1034 		if (c>127 && c<160 || c>255)
  1035 		    g_print("    Line %ld column %ld - "
  1036 		      "Non-ISO-8859 character %u\n",
  1037 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1038 		else
  1039 		    g_print("    Line %ld column %ld - "
  1040 		      "Non-ASCII character %u\n",
  1041 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1042 	    else
  1043 		cnt_bin++;
  1044 	    eInvalidChar=TRUE;
  1045 	}
  1046 	if (!eInvalidChar && charset)
  1047 	{
  1048 	    if (charset_validator==(GIConv)-1)
  1049 	    {
  1050 		if (!g_unichar_isdefined(c))
  1051 		{
  1052 		    if (pswit[ECHO_SWITCH])
  1053 			g_print("\n%s\n",aline);
  1054 		    if (!pswit[OVERVIEW_SWITCH])
  1055 			g_print("    Line %ld column %ld - Unassigned UNICODE "
  1056 			  "code point U+%04" G_GINT32_MODIFIER "X\n",
  1057 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1058 		    else
  1059 			cnt_bin++;
  1060 		    eInvalidChar=TRUE;
  1061 		}
  1062 		else if (c>=0xE000 && c<=0xF8FF || c>=0xF0000 && c<=0xFFFFD ||
  1063 		  c>=100000 && c<=0x10FFFD)
  1064 		{
  1065 		    if (pswit[ECHO_SWITCH])
  1066 			g_print("\n%s\n",aline);
  1067 		    if (!pswit[OVERVIEW_SWITCH])
  1068 			g_print("    Line %ld column %ld - Private Use "
  1069 			  "character U+%04" G_GINT32_MODIFIER "X\n",
  1070 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1071 		    else
  1072 			cnt_bin++;
  1073 		    eInvalidChar=TRUE;
  1074 		}
  1075 	    }
  1076 	    else
  1077 	    {
  1078 		t=g_convert_with_iconv(s,g_utf8_next_char(s)-s,
  1079 		  charset_validator,NULL,&nb,NULL);
  1080 		if (t)
  1081 		    g_free(t);
  1082 		else
  1083 		{
  1084 		    if (pswit[ECHO_SWITCH])
  1085 			g_print("\n%s\n",aline);
  1086 		    if (!pswit[OVERVIEW_SWITCH])
  1087 			g_print("    Line %ld column %ld - Non-%s "
  1088 			  "character %u\n",linecnt,
  1089 			  g_utf8_pointer_to_offset(aline,s)+1,charset,c);
  1090 		    else
  1091 			cnt_bin++;
  1092 		    eInvalidChar=TRUE;
  1093 		}
  1094 	    }
  1095 	}
  1096 	if (!eTab && c==CHAR_TAB)
  1097 	{
  1098 	    if (pswit[ECHO_SWITCH])
  1099 		g_print("\n%s\n",aline);
  1100 	    if (!pswit[OVERVIEW_SWITCH])
  1101 		g_print("    Line %ld column %ld - Tab character?\n",
  1102 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1103 	    else
  1104 		cnt_odd++;
  1105 	    eTab=TRUE;
  1106 	}
  1107 	if (!eTilde && c==CHAR_TILDE)
  1108 	{
  1109 	    /*
  1110 	     * Often used by OCR software to indicate an
  1111 	     * unrecognizable character.
  1112 	     */
  1113 	    if (pswit[ECHO_SWITCH])
  1114 		g_print("\n%s\n",aline);
  1115 	    if (!pswit[OVERVIEW_SWITCH])
  1116 		g_print("    Line %ld column %ld - Tilde character?\n",
  1117 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1118 	    else
  1119 		cnt_odd++;
  1120 	    eTilde=TRUE;
  1121 	}
  1122 	if (!eCarat && c==CHAR_CARAT)
  1123 	{  
  1124 	    if (pswit[ECHO_SWITCH])
  1125 		g_print("\n%s\n",aline);
  1126 	    if (!pswit[OVERVIEW_SWITCH])
  1127 		g_print("    Line %ld column %ld - Carat character?\n",
  1128 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1129 	    else
  1130 		cnt_odd++;
  1131 	    eCarat=TRUE;
  1132 	}
  1133 	if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
  1134 	{  
  1135 	    if (pswit[ECHO_SWITCH])
  1136 		g_print("\n%s\n",aline);
  1137 	    if (!pswit[OVERVIEW_SWITCH])
  1138 		g_print("    Line %ld column %ld - Forward slash?\n",
  1139 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1140 	    else
  1141 		cnt_odd++;
  1142 	    eFSlash=TRUE;
  1143 	}
  1144 	/*
  1145 	 * Report asterisks only in paranoid mode,
  1146 	 * since they're often deliberate.
  1147 	 */
  1148 	if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
  1149 	  c==CHAR_ASTERISK)
  1150 	{
  1151 	    if (pswit[ECHO_SWITCH])
  1152 		g_print("\n%s\n",aline);
  1153 	    if (!pswit[OVERVIEW_SWITCH])
  1154 		g_print("    Line %ld column %ld - Asterisk?\n",
  1155 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1156 	    else
  1157 		cnt_odd++;
  1158 	    eAst=TRUE;
  1159 	}
  1160     }
  1161 }
  1162 
  1163 /*
  1164  * check_for_long_line:
  1165  *
  1166  * Check for line too long.
  1167  */
  1168 void check_for_long_line(const char *aline)
  1169 {
  1170     if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
  1171     {
  1172 	if (pswit[ECHO_SWITCH])
  1173 	    g_print("\n%s\n",aline);
  1174 	if (!pswit[OVERVIEW_SWITCH])
  1175 	    g_print("    Line %ld column %ld - Long line %ld\n",
  1176 	      linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
  1177 	else
  1178 	    cnt_long++;
  1179     }
  1180 }
  1181 
  1182 /*
  1183  * check_for_short_line:
  1184  *
  1185  * Check for line too short.
  1186  *
  1187  * This one is a bit trickier to implement: we don't want to
  1188  * flag the last line of a paragraph for being short, so we
  1189  * have to wait until we know that our current line is a
  1190  * "normal" line, then report the _previous_ line if it was too
  1191  * short. We also don't want to report indented lines like
  1192  * chapter heads or formatted quotations. We therefore keep
  1193  * last->len as the length of the last line examined, and
  1194  * last->blen as the length of the last but one, and try to
  1195  * suppress unnecessary warnings by checking that both were of
  1196  * "normal" length. We keep the first character of the last
  1197  * line in last->start, and if it was a space, we assume that
  1198  * the formatting is deliberate. I can't figure out a way to
  1199  * distinguish something like a quoted verse left-aligned or
  1200  * the header or footer of a letter from a paragraph of short
  1201  * lines - maybe if I examined the whole paragraph, and if the
  1202  * para has less than, say, 8 lines and if all lines are short,
  1203  * then just assume it's OK? Need to look at some texts to see
  1204  * how often a formula like this would get the right result.
  1205  */
  1206 void check_for_short_line(const char *aline,const struct line_properties *last)
  1207 {
  1208     if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
  1209       last->len<SHORTEST_PG_LINE && last->blen>1 &&
  1210       last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
  1211     {
  1212 	if (pswit[ECHO_SWITCH])
  1213 	    g_print("\n%s\n",prevline);
  1214 	if (!pswit[OVERVIEW_SWITCH])
  1215 	    g_print("    Line %ld column %ld - Short line %ld?\n",
  1216 	      linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
  1217 	else
  1218 	    cnt_short++;
  1219     }
  1220 }
  1221 
  1222 /*
  1223  * check_for_starting_punctuation:
  1224  *
  1225  * Look for punctuation other than full ellipses at start of line.
  1226  */
  1227 void check_for_starting_punctuation(const char *aline)
  1228 {
  1229     if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
  1230       !g_str_has_prefix(aline,". . ."))
  1231     {
  1232 	if (pswit[ECHO_SWITCH])
  1233 	    g_print("\n%s\n",aline);
  1234 	if (!pswit[OVERVIEW_SWITCH])
  1235 	    g_print("    Line %ld column 1 - Begins with punctuation?\n",
  1236 	      linecnt);
  1237 	else
  1238 	    cnt_punct++;
  1239     }
  1240 }
  1241 
  1242 /*
  1243  * check_for_spaced_emdash:
  1244  *
  1245  * Check for spaced em-dashes.
  1246  *
  1247  * We must check _all_ occurrences of "--" on the line
  1248  * hence the loop - even if the first double-dash is OK
  1249  * there may be another that's wrong later on.
  1250  */
  1251 void check_for_spaced_emdash(const char *aline)
  1252 {
  1253     const char *s,*t,*next;
  1254     for (s=aline;t=strstr(s,"--");s=next)
  1255     {
  1256 	next=g_utf8_next_char(g_utf8_next_char(t));
  1257 	if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
  1258 	  g_utf8_get_char(next)==CHAR_SPACE)
  1259 	{
  1260 	    if (pswit[ECHO_SWITCH])
  1261 		g_print("\n%s\n",aline);
  1262 	    if (!pswit[OVERVIEW_SWITCH])
  1263 		g_print("    Line %ld column %ld - Spaced em-dash?\n",
  1264 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  1265 	    else
  1266 		cnt_dash++;
  1267 	}
  1268     }
  1269 }
  1270 
  1271 /*
  1272  * check_for_spaced_dash:
  1273  *
  1274  * Check for spaced dashes.
  1275  */
  1276 void check_for_spaced_dash(const char *aline)
  1277 {
  1278     const char *s;
  1279     if ((s=strstr(aline," -")))
  1280     {
  1281 	if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
  1282 	{
  1283 	    if (pswit[ECHO_SWITCH])
  1284 		g_print("\n%s\n",aline);
  1285 	    if (!pswit[OVERVIEW_SWITCH])
  1286 		g_print("    Line %ld column %ld - Spaced dash?\n",
  1287 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1288 	    else
  1289 		cnt_dash++;
  1290 	}
  1291     }
  1292     else if ((s=strstr(aline,"- ")))
  1293     {
  1294 	if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
  1295 	{
  1296 	    if (pswit[ECHO_SWITCH])
  1297 		g_print("\n%s\n",aline);
  1298 	    if (!pswit[OVERVIEW_SWITCH])
  1299 		g_print("    Line %ld column %ld - Spaced dash?\n",
  1300 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1301 	    else
  1302 		cnt_dash++;
  1303 	}
  1304     }
  1305 }
  1306 
  1307 /*
  1308  * check_for_unmarked_paragraphs:
  1309  *
  1310  * Check for unmarked paragraphs indicated by separate speakers.
  1311  *
  1312  * May well be false positive:
  1313  * "Bravo!" "Wonderful!" called the crowd.
  1314  * but useful all the same.
  1315  */
  1316 void check_for_unmarked_paragraphs(const char *aline)
  1317 {
  1318     const char *s;
  1319     s=strstr(aline,"\"  \"");
  1320     if (!s)
  1321 	s=strstr(aline,"\" \"");
  1322     if (s)
  1323     {
  1324 	if (pswit[ECHO_SWITCH])
  1325 	    g_print("\n%s\n",aline);
  1326 	if (!pswit[OVERVIEW_SWITCH])
  1327 	    g_print("    Line %ld column %ld - "
  1328 	      "Query missing paragraph break?\n",
  1329 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1330 	else
  1331 	    cnt_punct++;
  1332     }
  1333 }
  1334 
  1335 /*
  1336  * check_for_jeebies:
  1337  *
  1338  * Check for "to he" and other easy h/b errors.
  1339  *
  1340  * This is a very inadequate effort on the h/b problem,
  1341  * but the phrase "to he" is always an error, whereas "to
  1342  * be" is quite common.
  1343  * Similarly, '"Quiet!", be said.' is a non-be error
  1344  * "to he" is _not_ always an error!:
  1345  *       "Where they went to he couldn't say."
  1346  * Another false positive:
  1347  *       What would "Cinderella" be without the . . .
  1348  * and another: "If he wants to he can see for himself."
  1349  */
  1350 void check_for_jeebies(const char *aline)
  1351 {
  1352     const char *s;
  1353     s=strstr(aline," be could ");
  1354     if (!s)
  1355 	s=strstr(aline," be would ");
  1356     if (!s)
  1357 	s=strstr(aline," was be ");
  1358     if (!s)
  1359 	s=strstr(aline," be is ");
  1360     if (!s)
  1361 	s=strstr(aline," is be ");
  1362     if (!s)
  1363 	s=strstr(aline,"\", be ");
  1364     if (!s)
  1365 	s=strstr(aline,"\" be ");
  1366     if (!s)
  1367 	s=strstr(aline,"\" be ");
  1368     if (!s)
  1369 	s=strstr(aline," to he ");
  1370     if (s)
  1371     {
  1372 	if (pswit[ECHO_SWITCH])
  1373 	    g_print("\n%s\n",aline);
  1374 	if (!pswit[OVERVIEW_SWITCH])
  1375 	    g_print("    Line %ld column %ld - Query he/be error?\n",
  1376 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1377 	else
  1378 	    cnt_word++;
  1379     }
  1380     s=strstr(aline," the had ");
  1381     if (!s)
  1382 	s=strstr(aline," a had ");
  1383     if (!s)
  1384 	s=strstr(aline," they bad ");
  1385     if (!s)
  1386 	s=strstr(aline," she bad ");
  1387     if (!s)
  1388 	s=strstr(aline," he bad ");
  1389     if (!s)
  1390 	s=strstr(aline," you bad ");
  1391     if (!s)
  1392 	s=strstr(aline," i bad ");
  1393     if (s)
  1394     {
  1395 	if (pswit[ECHO_SWITCH])
  1396 	    g_print("\n%s\n",aline);
  1397 	if (!pswit[OVERVIEW_SWITCH])
  1398 	    g_print("    Line %ld column %ld - Query had/bad error?\n",
  1399 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1400 	else
  1401 	    cnt_word++;
  1402     }
  1403     s=strstr(aline,"; hut ");
  1404     if (!s)
  1405 	s=strstr(aline,", hut ");
  1406     if (s)
  1407     {
  1408 	if (pswit[ECHO_SWITCH])
  1409 	    g_print("\n%s\n",aline);
  1410 	if (!pswit[OVERVIEW_SWITCH])
  1411 	    g_print("    Line %ld column %ld - Query hut/but error?\n",
  1412 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1413 	else
  1414 	    cnt_word++;
  1415     }
  1416 }
  1417 
  1418 /*
  1419  * check_for_mta_from:
  1420  *
  1421  * Special case - angled bracket in front of "From" placed there by an
  1422  * MTA when sending an e-mail.
  1423  */
  1424 void check_for_mta_from(const char *aline)
  1425 {
  1426     const char *s;
  1427     s=strstr(aline,">From");
  1428     if (s)
  1429     {
  1430 	if (pswit[ECHO_SWITCH])
  1431 	    g_print("\n%s\n",aline);
  1432 	if (!pswit[OVERVIEW_SWITCH])
  1433 	    g_print("    Line %ld column %ld - "
  1434 	      "Query angled bracket with From\n",
  1435 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1436 	else
  1437 	    cnt_punct++;
  1438     }
  1439 }
  1440 
  1441 /*
  1442  * check_for_orphan_character:
  1443  *
  1444  * Check for a single character line -
  1445  * often an overflow from bad wrapping.
  1446  */
  1447 void check_for_orphan_character(const char *aline)
  1448 {
  1449     gunichar c;
  1450     c=g_utf8_get_char(aline);
  1451     if (c && !*g_utf8_next_char(aline))
  1452     {
  1453 	if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
  1454 	    ; /* Nothing - ignore numerals alone on a line. */
  1455 	else
  1456 	{
  1457 	    if (pswit[ECHO_SWITCH])
  1458 		g_print("\n%s\n",aline);
  1459 	    if (!pswit[OVERVIEW_SWITCH])
  1460 		g_print("    Line %ld column 1 - Query single character line\n",
  1461 		  linecnt);
  1462 	    else
  1463 		cnt_punct++;
  1464 	}
  1465     }
  1466 }
  1467 
  1468 /*
  1469  * check_for_pling_scanno:
  1470  *
  1471  * Check for I" - often should be !
  1472  */
  1473 void check_for_pling_scanno(const char *aline)
  1474 {
  1475     const char *s;
  1476     s=strstr(aline," I\"");
  1477     if (s)
  1478     {
  1479 	if (pswit[ECHO_SWITCH])
  1480 	    g_print("\n%s\n",aline);
  1481 	if (!pswit[OVERVIEW_SWITCH])
  1482 	    g_print("    Line %ld column %ld - Query I=exclamation mark?\n",
  1483 	      linecnt,g_utf8_pointer_to_offset(aline,s));
  1484 	else
  1485 	    cnt_punct++;
  1486     }
  1487 }
  1488 
  1489 /*
  1490  * check_for_extra_period:
  1491  *
  1492  * Check for period without a capital letter. Cut-down from gutspell.
  1493  * Only works when it happens on a single line.
  1494  */
  1495 void check_for_extra_period(const char *aline,const struct warnings *warnings)
  1496 {
  1497     const char *s,*t,*s1,*sprev;
  1498     int i;
  1499     gsize len;
  1500     gboolean istypo;
  1501     gchar *testword;
  1502     gunichar c,nc,pc,*decomposition;
  1503     if (pswit[PARANOID_SWITCH])
  1504     {
  1505 	for (t=aline;t=strstr(t,". ");)
  1506 	{
  1507 	    if (t==aline)
  1508 	    {
  1509 		t=g_utf8_next_char(t);
  1510 		/* start of line punctuation is handled elsewhere */
  1511 		continue;
  1512 	    }
  1513 	    if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
  1514 	    {
  1515 		t=g_utf8_next_char(t);
  1516 		continue;
  1517 	    }
  1518 	    if (warnings->isDutch)
  1519 	    {
  1520 		/* For Frank & Jeroen -- 's Middags case */
  1521 		gunichar c2,c3,c4,c5;
  1522 		c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
  1523 		c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
  1524 		c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
  1525 		c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
  1526 		if (CHAR_IS_APOSTROPHE(c2) &&
  1527 		  g_unichar_islower(c3) && c4==CHAR_SPACE &&
  1528 		  g_unichar_isupper(c5))
  1529 		{
  1530 		    t=g_utf8_next_char(t);
  1531 		    continue;
  1532 		}
  1533 	    }
  1534 	    s1=g_utf8_next_char(g_utf8_next_char(t));
  1535 	    while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
  1536 	      !isdigit(g_utf8_get_char(s1)))
  1537 		s1=g_utf8_next_char(s1);
  1538 	    if (g_unichar_islower(g_utf8_get_char(s1)))
  1539 	    {
  1540 		/* we have something to investigate */
  1541 		istypo=TRUE;
  1542 		/* so let's go back and find out */
  1543 		nc=g_utf8_get_char(t);
  1544 		s1=g_utf8_prev_char(t);
  1545 		c=g_utf8_get_char(s1);
  1546 		sprev=g_utf8_prev_char(s1);
  1547 		pc=g_utf8_get_char(sprev);
  1548 		while (s1>=aline &&
  1549 		  (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||
  1550 		  g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&
  1551 		  g_unichar_isalpha(nc)))
  1552 		{
  1553 		    nc=c;
  1554 		    s1=sprev;
  1555 		    c=pc;
  1556 		    sprev=g_utf8_prev_char(s1);
  1557 		    pc=g_utf8_get_char(sprev);
  1558 		}
  1559 		s1=g_utf8_next_char(s1);
  1560 		s=strchr(s1,'.');
  1561 		if (s)
  1562 		    testword=g_strndup(s1,s-s1);
  1563 		else
  1564 		    testword=g_strdup(s1);
  1565 		for (i=0;*abbrev[i];i++)
  1566 		    if (!strcmp(testword,abbrev[i]))
  1567 			istypo=FALSE;
  1568 		if (g_unichar_isdigit(g_utf8_get_char(testword)))
  1569 		    istypo=FALSE;
  1570 		if (!*g_utf8_next_char(testword))
  1571 		    istypo=FALSE;
  1572 		if (isroman(testword))
  1573 		    istypo=FALSE;
  1574 		if (istypo)
  1575 		{
  1576 		    istypo=FALSE;
  1577 		    for (s=testword;*s;s=g_utf8_next_char(s))
  1578 		    {
  1579 			decomposition=g_unicode_canonical_decomposition(
  1580 			  g_utf8_get_char(s),&len);
  1581 			if (g_utf8_strchr("aeiou",-1,decomposition[0]))
  1582 			    istypo=TRUE;
  1583 			g_free(decomposition);
  1584 		    }
  1585 		}
  1586 		if (istypo &&
  1587 		  (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
  1588 		{
  1589 		    g_tree_insert(qperiod,g_strdup(testword),
  1590 		      GINT_TO_POINTER(1));
  1591 		    if (pswit[ECHO_SWITCH])
  1592 			g_print("\n%s\n",aline);
  1593 		    if (!pswit[OVERVIEW_SWITCH])
  1594 			g_print("    Line %ld column %ld - Extra period?\n",
  1595 			  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  1596 		    else
  1597 			cnt_punct++;
  1598 		}
  1599 		g_free(testword);
  1600 	    }
  1601 	    t=g_utf8_next_char(t);
  1602 	}
  1603     }
  1604 }
  1605 
  1606 /*
  1607  * check_for_following_punctuation:
  1608  *
  1609  * Check for words usually not followed by punctuation.
  1610  */
  1611 void check_for_following_punctuation(const char *aline)
  1612 {
  1613     int i;
  1614     const char *s,*wordstart;
  1615     gunichar c;
  1616     gchar *inword,*t;
  1617     if (pswit[TYPO_SWITCH])
  1618     {
  1619 	for (s=aline;*s;)
  1620 	{
  1621 	    wordstart=s;
  1622 	    t=getaword(&s);
  1623 	    if (!*t)
  1624 	    {
  1625 		g_free(t);
  1626 		continue;
  1627 	    }
  1628 	    inword=g_utf8_strdown(t,-1);
  1629 	    g_free(t);
  1630 	    for (i=0;*nocomma[i];i++)
  1631 		if (!strcmp(inword,nocomma[i]))
  1632 		{
  1633 		    c=g_utf8_get_char(s);
  1634 		    if (c==',' || c==';' || c==':')
  1635 		    {
  1636 			if (pswit[ECHO_SWITCH])
  1637 			    g_print("\n%s\n",aline);
  1638 			if (!pswit[OVERVIEW_SWITCH])
  1639 			    g_print("    Line %ld column %ld - "
  1640 			      "Query punctuation after %s?\n",
  1641 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,
  1642 			      inword);
  1643 			else
  1644 			    cnt_punct++;
  1645 		    }
  1646 		}
  1647 	    for (i=0;*noperiod[i];i++)
  1648 		if (!strcmp(inword,noperiod[i]))
  1649 		{
  1650 		    c=g_utf8_get_char(s);
  1651 		    if (c=='.' || c=='!')
  1652 		    {
  1653 			if (pswit[ECHO_SWITCH])
  1654 			    g_print("\n%s\n",aline);
  1655 			if (!pswit[OVERVIEW_SWITCH])
  1656 			    g_print("    Line %ld column %ld - "
  1657 			      "Query punctuation after %s?\n",
  1658 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,
  1659 			      inword);
  1660 			else
  1661 			    cnt_punct++;
  1662 		    }
  1663 		}
  1664 	    g_free(inword);
  1665 	}
  1666     }
  1667 }
  1668 
  1669 /*
  1670  * check_for_typos:
  1671  *
  1672  * Check for commonly mistyped words,
  1673  * and digits like 0 for O in a word.
  1674  */
  1675 void check_for_typos(const char *aline,struct warnings *warnings)
  1676 {
  1677     const char *s,*t,*nt,*wordstart;
  1678     gchar *inword;
  1679     gunichar *decomposition;
  1680     gchar *testword;
  1681     int i,vowel,consonant,*dupcnt;
  1682     gboolean isdup,istypo,alower;
  1683     gunichar c,pc;
  1684     long offset,len;
  1685     gsize decomposition_len;
  1686     for (s=aline;*s;)
  1687     {
  1688 	wordstart=s;
  1689 	inword=getaword(&s);
  1690 	if (!*inword)
  1691 	{
  1692 	    g_free(inword);
  1693 	    continue; /* don't bother with empty lines */
  1694 	}
  1695 	if (mixdigit(inword))
  1696 	{
  1697 	    if (pswit[ECHO_SWITCH])
  1698 		g_print("\n%s\n",aline);
  1699 	    if (!pswit[OVERVIEW_SWITCH])
  1700 		g_print("    Line %ld column %ld - Query digit in %s\n",
  1701 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
  1702 	    else
  1703 		cnt_word++;
  1704 	}
  1705 	/*
  1706 	 * Put the word through a series of tests for likely typos and OCR
  1707 	 * errors.
  1708 	 */
  1709 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
  1710 	{
  1711 	    istypo=FALSE;
  1712 	    alower=FALSE;
  1713 	    for (t=inword;*t;t=g_utf8_next_char(t))
  1714 	    {
  1715 		c=g_utf8_get_char(t);
  1716 		nt=g_utf8_next_char(t);
  1717 		/* lowercase for testing */
  1718 		if (g_unichar_islower(c))
  1719 		    alower=TRUE;
  1720 		if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
  1721 		{
  1722 		    /*
  1723 		     * We have an uppercase mid-word. However, there are
  1724 		     * common cases:
  1725 		     *   Mac and Mc like McGill
  1726 		     *   French contractions like l'Abbe
  1727 		     */
  1728 		    offset=g_utf8_pointer_to_offset(inword,t);
  1729 		    if (offset>0)
  1730 			pc=g_utf8_get_char(g_utf8_prev_char(t));
  1731 		    else
  1732 			pc='\0';
  1733 		    if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
  1734 		      offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
  1735 		      g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
  1736 		      CHAR_IS_APOSTROPHE(pc))
  1737 			; /* do nothing! */
  1738 		    else
  1739 			istypo=TRUE;
  1740 		}
  1741 	    }
  1742 	    testword=g_utf8_casefold(inword,-1);
  1743 	}
  1744 	if (pswit[TYPO_SWITCH])
  1745 	{
  1746 	    /*
  1747 	     * Check for certain unlikely two-letter combinations at word
  1748 	     * start and end.
  1749 	     */
  1750 	    len=g_utf8_strlen(testword,-1);
  1751 	    if (len>1)
  1752 	    {
  1753 		for (i=0;*nostart[i];i++)
  1754 		    if (g_str_has_prefix(testword,nostart[i]))
  1755 			istypo=TRUE;
  1756 		for (i=0;*noend[i];i++)
  1757 		    if (g_str_has_suffix(testword,noend[i]))
  1758 			istypo=TRUE;
  1759 	    }
  1760 	    /* ght is common, gbt never. Like that. */
  1761 	    if (strstr(testword,"cb"))
  1762 		istypo=TRUE;
  1763 	    if (strstr(testword,"gbt"))
  1764 		istypo=TRUE;
  1765 	    if (strstr(testword,"pbt"))
  1766 		istypo=TRUE;
  1767 	    if (strstr(testword,"tbs"))
  1768 		istypo=TRUE;
  1769 	    if (strstr(testword,"mrn"))
  1770 		istypo=TRUE;
  1771 	    if (strstr(testword,"ahle"))
  1772 		istypo=TRUE;
  1773 	    if (strstr(testword,"ihle"))
  1774 		istypo=TRUE;
  1775 	    /*
  1776 	     * "TBE" does happen - like HEARTBEAT - but uncommon.
  1777 	     * Also "TBI" - frostbite, outbid - but uncommon.
  1778 	     * Similarly "ii" like Hawaii, or Pompeii, and in Roman
  1779 	     * numerals, but "ii" is a common scanno.
  1780 	     */
  1781 	    if (strstr(testword,"tbi"))
  1782 		istypo=TRUE;
  1783 	    if (strstr(testword,"tbe"))
  1784 		istypo=TRUE;
  1785 	    if (strstr(testword,"ii"))
  1786 		istypo=TRUE;
  1787 	    /*
  1788 	     * Check for no vowels or no consonants.
  1789 	     * If none, flag a typo.
  1790 	     */
  1791 	    if (!istypo && len>1)
  1792 	    {
  1793 		vowel=consonant=0;
  1794 		for (t=testword;*t;t=g_utf8_next_char(t))
  1795 		{
  1796 		    c=g_utf8_get_char(t);
  1797 		    decomposition=
  1798 		      g_unicode_canonical_decomposition(c,&decomposition_len);
  1799 		    if (c=='y' || g_unichar_isdigit(c))
  1800 		    {
  1801 			/* Yah, this is loose. */
  1802 			vowel++;
  1803 			consonant++;
  1804 		    }
  1805 		    else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
  1806 			vowel++;
  1807 		    else
  1808 			consonant++;
  1809 		    g_free(decomposition);
  1810 		}
  1811 		if (!vowel || !consonant)
  1812 		    istypo=TRUE;
  1813 	    }
  1814 	    /*
  1815 	     * Now exclude the word from being reported if it's in
  1816 	     * the okword list.
  1817 	     */
  1818 	    for (i=0;*okword[i];i++)
  1819 		if (!strcmp(testword,okword[i]))
  1820 		    istypo=FALSE;
  1821 	    /*
  1822 	     * What looks like a typo may be a Roman numeral.
  1823 	     * Exclude these.
  1824 	     */
  1825 	    if (istypo && isroman(testword))
  1826 		istypo=FALSE;
  1827 	    /* Check the manual list of typos. */
  1828 	    if (!istypo)
  1829 		for (i=0;*typo[i];i++)
  1830 		    if (!strcmp(testword,typo[i]))
  1831 			istypo=TRUE;
  1832 	    /*
  1833 	     * Check lowercase s, l, i and m - special cases.
  1834 	     *   "j" - often a semi-colon gone wrong.
  1835 	     *   "d" for a missing apostrophe - he d
  1836 	     *   "n" for "in"
  1837 	     */
  1838 	    if (!istypo && len==1 &&
  1839 	      g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
  1840 		istypo=TRUE;
  1841 	    if (istypo)
  1842 	    {
  1843 		dupcnt=g_tree_lookup(qword,testword);
  1844 		if (dupcnt)
  1845 		{
  1846 		    (*dupcnt)++;
  1847 		    isdup=!pswit[VERBOSE_SWITCH];
  1848 		}
  1849 		else
  1850 		{
  1851 		    dupcnt=g_new0(int,1);
  1852 		    g_tree_insert(qword,g_strdup(testword),dupcnt);
  1853 		    isdup=FALSE;
  1854 		}
  1855 		if (!isdup)
  1856 		{
  1857 		    if (pswit[ECHO_SWITCH])
  1858 			g_print("\n%s\n",aline);
  1859 		    if (!pswit[OVERVIEW_SWITCH])
  1860 		    {
  1861 			g_print("    Line %ld column %ld - Query word %s",
  1862 			  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
  1863 			  inword);
  1864 			if (!pswit[VERBOSE_SWITCH])
  1865 			    g_print(" - not reporting duplicates");
  1866 			g_print("\n");
  1867 		    }
  1868 		    else
  1869 			cnt_word++;
  1870 		}
  1871 	    }
  1872 	}
  1873 	/* check the user's list of typos */
  1874 	if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
  1875 	{
  1876 	    if (pswit[ECHO_SWITCH])
  1877 		g_print("\n%s\n",aline);
  1878 	    if (!pswit[OVERVIEW_SWITCH])  
  1879 		g_print("    Line %ld column %ld - Query possible scanno %s\n",
  1880 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
  1881 	}
  1882 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
  1883 	    g_free(testword);
  1884 	if (pswit[PARANOID_SWITCH] && warnings->digit)
  1885 	{
  1886 	    /* In paranoid mode, query all 0 and 1 standing alone. */
  1887 	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))
  1888 	    {
  1889 		if (pswit[ECHO_SWITCH])
  1890 		    g_print("\n%s\n",aline);
  1891 		if (!pswit[OVERVIEW_SWITCH])
  1892 		    g_print("    Line %ld column %ld - Query standalone %s\n",
  1893 		      linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
  1894 		      inword);
  1895 		else
  1896 		    cnt_word++;
  1897 	    }
  1898 	}
  1899 	g_free(inword);
  1900     }
  1901 }
  1902 
  1903 /*
  1904  * check_for_misspaced_punctuation:
  1905  *
  1906  * Look for added or missing spaces around punctuation and quotes.
  1907  * If there is a punctuation character like ! with no space on
  1908  * either side, suspect a missing!space. If there are spaces on
  1909  * both sides , assume a typo. If we see a double quote with no
  1910  * space or punctuation on either side of it, assume unspaced
  1911  * quotes "like"this.
  1912  */
  1913 void check_for_misspaced_punctuation(const char *aline,
  1914   struct parities *parities,gboolean isemptyline)
  1915 {
  1916     gboolean isacro,isellipsis;
  1917     const char *s;
  1918     gunichar c,nc,pc,n2c;
  1919     int parity;
  1920     c=g_utf8_get_char(aline);
  1921     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  1922     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  1923     {
  1924 	pc=c;
  1925 	c=nc;
  1926 	nc=g_utf8_get_char(g_utf8_next_char(s));
  1927 	/* For each character in the line after the first. */
  1928 	if (g_utf8_strchr(".?!,;:_",-1,c))  /* if it's punctuation */
  1929 	{
  1930 	    /* we need to suppress warnings for acronyms like M.D. */
  1931 	    isacro=FALSE;
  1932 	    /* we need to suppress warnings for ellipsis . . . */
  1933 	    isellipsis=FALSE;
  1934 	    /*
  1935 	     * If there are letters on both sides of it or
  1936 	     * if it's strict punctuation followed by an alpha.
  1937 	     */
  1938 	    if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
  1939 	      g_utf8_strchr("?!,;:",-1,c)))
  1940 	    {
  1941 		if (c=='.')
  1942 		{
  1943 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&
  1944 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
  1945 			isacro=TRUE;
  1946 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
  1947 		    if (nc && n2c=='.')
  1948 			isacro=TRUE;
  1949 		}
  1950 		if (!isacro)
  1951 		{
  1952 		    if (pswit[ECHO_SWITCH])
  1953 			g_print("\n%s\n",aline);
  1954 		    if (!pswit[OVERVIEW_SWITCH])
  1955 			g_print("    Line %ld column %ld - Missing space?\n",
  1956 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1957 		    else
  1958 			cnt_punct++;
  1959 		}
  1960 	    }
  1961 	    if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
  1962 	    {
  1963 		/*
  1964 		 * If there are spaces on both sides,
  1965 		 * or space before and end of line.
  1966 		 */
  1967 		if (c=='.')
  1968 		{
  1969 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&
  1970 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
  1971 			isellipsis=TRUE;
  1972 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
  1973 		    if (nc && n2c=='.')
  1974 			isellipsis=TRUE;
  1975 		}
  1976 		if (!isemptyline && !isellipsis)
  1977 		{
  1978 		    if (pswit[ECHO_SWITCH])
  1979 			g_print("\n%s\n",aline);
  1980 		    if (!pswit[OVERVIEW_SWITCH])
  1981 			g_print("    Line %ld column %ld - "
  1982 			  "Spaced punctuation?\n",linecnt,
  1983 			  g_utf8_pointer_to_offset(aline,s)+1);
  1984 		    else
  1985 			cnt_punct++;
  1986 		}
  1987 	    }
  1988 	}
  1989     }
  1990     /* Split out the characters that CANNOT be preceded by space. */
  1991     c=g_utf8_get_char(aline);
  1992     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  1993     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  1994     {
  1995 	pc=c;
  1996 	c=nc;
  1997 	nc=g_utf8_get_char(g_utf8_next_char(s));
  1998 	/* for each character in the line after the first */
  1999 	if (g_utf8_strchr("?!,;:",-1,c))
  2000 	{
  2001 	    /* if it's punctuation that _cannot_ have a space before it */
  2002 	    if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
  2003 	    {
  2004 		/*
  2005 		 * If nc DOES == space,
  2006 		 * it was already reported just above.
  2007 		 */
  2008 		if (pswit[ECHO_SWITCH])
  2009 		    g_print("\n%s\n",aline);
  2010 		if (!pswit[OVERVIEW_SWITCH])
  2011 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",
  2012 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2013 		else
  2014 		    cnt_punct++;
  2015 	    }
  2016 	}
  2017     }
  2018     /*
  2019      * Special case " .X" where X is any alpha.
  2020      * This plugs a hole in the acronym code above.
  2021      * Inelegant, but maintainable.
  2022      */
  2023     c=g_utf8_get_char(aline);
  2024     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2025     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2026     {
  2027 	pc=c;
  2028 	c=nc;
  2029 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2030 	/* for each character in the line after the first */
  2031 	if (c=='.')
  2032 	{
  2033 	    /* if it's a period */
  2034 	    if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
  2035 	    {
  2036 		/*
  2037 		 * If the period follows a space and
  2038 		 * is followed by a letter.
  2039 		 */
  2040 		if (pswit[ECHO_SWITCH])
  2041 		    g_print("\n%s\n",aline);
  2042 		if (!pswit[OVERVIEW_SWITCH])
  2043 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",
  2044 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2045 		else
  2046 		    cnt_punct++;
  2047 	    }
  2048 	}
  2049     }
  2050     c=g_utf8_get_char(aline);
  2051     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2052     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2053     {
  2054 	pc=c;
  2055 	c=nc;
  2056 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2057 	/* for each character in the line after the first */
  2058 	if (CHAR_IS_DQUOTE(c))
  2059 	{
  2060 	    if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
  2061 	      !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
  2062 	      !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
  2063 	    {
  2064 		if (pswit[ECHO_SWITCH])
  2065 		    g_print("\n%s\n",aline);
  2066 		if (!pswit[OVERVIEW_SWITCH])
  2067 		    g_print("    Line %ld column %ld - Unspaced quotes?\n",
  2068 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2069 		else
  2070 		    cnt_punct++;
  2071 	    }
  2072 	}
  2073     }
  2074     /* Check parity of quotes. */
  2075     nc=g_utf8_get_char(aline);
  2076     for (s=aline;*s;s=g_utf8_next_char(s))
  2077     {
  2078 	c=nc;
  2079 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2080 	if (CHAR_IS_DQUOTE(c))
  2081 	{
  2082 	    if (c==CHAR_DQUOTE)
  2083 	    {
  2084 		parities->dquote=!parities->dquote;
  2085 		parity=parities->dquote;
  2086 	    }
  2087 	    else if (c==CHAR_LD_QUOTE)
  2088 		parity=1;
  2089 	    else
  2090 		parity=0;
  2091 	    if (!parity)
  2092 	    {
  2093 		/* parity even */
  2094 		if (!g_utf8_strchr("_-.'`/,;:!?)]} ",-1,nc))
  2095 		{
  2096 		    if (pswit[ECHO_SWITCH])
  2097 			g_print("\n%s\n",aline);
  2098 		    if (!pswit[OVERVIEW_SWITCH])
  2099 			g_print("    Line %ld column %ld - "
  2100 			  "Wrongspaced quotes?\n",
  2101 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2102 		    else
  2103 			cnt_punct++;
  2104 		}
  2105 	    }
  2106 	    else
  2107 	    {
  2108 		/* parity odd */
  2109 		if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
  2110 		  !g_utf8_strchr("_-/.'`([{$",-1,nc) || !nc)
  2111 		{
  2112 		    if (pswit[ECHO_SWITCH])
  2113 			g_print("\n%s\n",aline);
  2114 		    if (!pswit[OVERVIEW_SWITCH])
  2115 			g_print("    Line %ld column %ld - "
  2116 			  "Wrongspaced quotes?\n",
  2117 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2118 		    else
  2119 			cnt_punct++;
  2120 		}
  2121 	    }
  2122 	}
  2123     }
  2124     c=g_utf8_get_char(aline);
  2125     if (CHAR_IS_DQUOTE(c))
  2126     {
  2127 	if (g_utf8_strchr(",;:!?)]} ",-1,
  2128 	  g_utf8_get_char(g_utf8_next_char(aline))))
  2129 	{
  2130 	    if (pswit[ECHO_SWITCH])
  2131 		g_print("\n%s\n",aline);
  2132 	    if (!pswit[OVERVIEW_SWITCH])
  2133 		g_print("    Line %ld column 1 - Wrongspaced quotes?\n",
  2134 		  linecnt);
  2135 	    else
  2136 		cnt_punct++;
  2137 	}
  2138     }
  2139     if (pswit[SQUOTE_SWITCH])
  2140     {
  2141 	nc=g_utf8_get_char(aline);
  2142 	for (s=aline;*s;s=g_utf8_next_char(s))
  2143 	{
  2144 	    c=nc;
  2145 	    nc=g_utf8_get_char(g_utf8_next_char(s));
  2146 	    if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&
  2147 	      !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
  2148 	      !g_unichar_isalpha(nc)))
  2149 	    {
  2150 		parities->squote=!parities->squote;
  2151 		if (!parities->squote)
  2152 		{
  2153 		    /* parity even */
  2154 		    if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
  2155 		    {
  2156 			if (pswit[ECHO_SWITCH])
  2157 			    g_print("\n%s\n",aline);
  2158 			if (!pswit[OVERVIEW_SWITCH])
  2159 			    g_print("    Line %ld column %ld - "
  2160 			      "Wrongspaced singlequotes?\n",
  2161 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2162 			else
  2163 			    cnt_punct++;
  2164 		    }
  2165 		}
  2166 		else
  2167 		{
  2168 		    /* parity odd */
  2169 		    if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
  2170 		      !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
  2171 		    {
  2172 			if (pswit[ECHO_SWITCH])
  2173 			    g_print("\n%s\n",aline);
  2174 			if (!pswit[OVERVIEW_SWITCH])
  2175 			    g_print("    Line %ld column %ld - "
  2176 			      "Wrongspaced singlequotes?\n",
  2177 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2178 			else
  2179 			    cnt_punct++;
  2180 		    }
  2181 		}
  2182 	    }
  2183 	}
  2184     }
  2185 }
  2186 
  2187 /*
  2188  * check_for_double_punctuation:
  2189  *
  2190  * Look for double punctuation like ,. or ,,
  2191  * Thanks to DW for the suggestion!
  2192  * In books with references, ".," and ".;" are common
  2193  * e.g. "etc., etc.," and vol. 1.; vol 3.;
  2194  * OTOH, from my initial tests, there are also fairly
  2195  * common errors. What to do? Make these cases paranoid?
  2196  * ".," is the most common, so warnings->dotcomma is used
  2197  * to suppress detailed reporting if it occurs often.
  2198  */
  2199 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
  2200 {
  2201     const char *s;
  2202     gunichar c,nc;
  2203     nc=g_utf8_get_char(aline);
  2204     for (s=aline;*s;s=g_utf8_next_char(s))
  2205     {
  2206 	c=nc;
  2207 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2208 	/* for each punctuation character in the line */
  2209 	if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
  2210 	  g_utf8_strchr(".?!,;:",-1,nc))
  2211 	{
  2212 	    /* followed by punctuation, it's a query, unless . . . */
  2213 	    if (c==nc && (c=='.' || c=='?' || c=='!') ||
  2214 	      !warnings->dotcomma && c=='.' && nc==',' ||
  2215 	      warnings->isFrench && g_str_has_prefix(s,",...") ||
  2216 	      warnings->isFrench && g_str_has_prefix(s,"...,") ||
  2217 	      warnings->isFrench && g_str_has_prefix(s,";...") ||
  2218 	      warnings->isFrench && g_str_has_prefix(s,"...;") ||
  2219 	      warnings->isFrench && g_str_has_prefix(s,":...") ||
  2220 	      warnings->isFrench && g_str_has_prefix(s,"...:") ||
  2221 	      warnings->isFrench && g_str_has_prefix(s,"!...") ||
  2222 	      warnings->isFrench && g_str_has_prefix(s,"...!") ||
  2223 	      warnings->isFrench && g_str_has_prefix(s,"?...") ||
  2224 	      warnings->isFrench && g_str_has_prefix(s,"...?"))
  2225 	    {
  2226 		if (warnings->isFrench && g_str_has_prefix(s,",...") ||
  2227 		  warnings->isFrench && g_str_has_prefix(s,"...,") ||
  2228 		  warnings->isFrench && g_str_has_prefix(s,";...") ||
  2229 		  warnings->isFrench && g_str_has_prefix(s,"...;") ||
  2230 		  warnings->isFrench && g_str_has_prefix(s,":...") ||
  2231 		  warnings->isFrench && g_str_has_prefix(s,"...:") ||
  2232 		  warnings->isFrench && g_str_has_prefix(s,"!...") ||
  2233 		  warnings->isFrench && g_str_has_prefix(s,"...!") ||
  2234 		  warnings->isFrench && g_str_has_prefix(s,"?...") ||
  2235 		  warnings->isFrench && g_str_has_prefix(s,"...?"))
  2236 		{
  2237 		    s+=4;
  2238 		    nc=g_utf8_get_char(g_utf8_next_char(s));
  2239 		}
  2240 		; /* do nothing for .. !! and ?? which can be legit */
  2241 	    }
  2242 	    else
  2243 	    {
  2244 		if (pswit[ECHO_SWITCH])
  2245 		    g_print("\n%s\n",aline);
  2246 		if (!pswit[OVERVIEW_SWITCH])
  2247 		    g_print("    Line %ld column %ld - Double punctuation?\n",
  2248 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2249 		else
  2250 		    cnt_punct++;
  2251 	    }
  2252 	}
  2253     }
  2254 }
  2255 
  2256 /*
  2257  * check_for_spaced_quotes:
  2258  */
  2259 void check_for_spaced_quotes(const char *aline)
  2260 {
  2261     int i;
  2262     const char *s,*t;
  2263     const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,
  2264       CHAR_RS_QUOTE};
  2265     GString *pattern;
  2266     s=aline;
  2267     while ((t=strstr(s," \" ")))
  2268     {
  2269 	if (pswit[ECHO_SWITCH])
  2270 	    g_print("\n%s\n",aline);
  2271 	if (!pswit[OVERVIEW_SWITCH])
  2272 	    g_print("    Line %ld column %ld - Spaced doublequote?\n",
  2273 	      linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  2274 	else
  2275 	    cnt_punct++;
  2276 	s=g_utf8_next_char(g_utf8_next_char(t));
  2277     }
  2278     pattern=g_string_new(NULL);
  2279     for(i=0;i<G_N_ELEMENTS(single_quotes);i++)
  2280     {
  2281 	g_string_assign(pattern," ");
  2282 	g_string_append_unichar(pattern,single_quotes[i]);
  2283 	g_string_append_c(pattern,' ');
  2284 	s=aline;
  2285 	while ((t=strstr(s,pattern->str)))
  2286 	{
  2287 	    if (pswit[ECHO_SWITCH])
  2288 		g_print("\n%s\n",aline);
  2289 	    if (!pswit[OVERVIEW_SWITCH])
  2290 		g_print("    Line %ld column %ld - Spaced singlequote?\n",
  2291 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  2292 	    else
  2293 		cnt_punct++;
  2294 	    s=g_utf8_next_char(g_utf8_next_char(t));
  2295 	}
  2296     }
  2297     g_string_free(pattern,TRUE);
  2298 }
  2299 
  2300 /*
  2301  * check_for_miscased_genative:
  2302  *
  2303  * Check special case of 'S instead of 's at end of word.
  2304  */
  2305 void check_for_miscased_genative(const char *aline)
  2306 {
  2307     const char *s;
  2308     gunichar c,nc,pc;
  2309     if (!*aline)
  2310 	return;
  2311     c=g_utf8_get_char(aline);
  2312     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2313     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2314     {
  2315 	pc=c;
  2316 	c=nc;
  2317 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2318 	if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))
  2319 	{
  2320 	    if (pswit[ECHO_SWITCH])
  2321 		g_print("\n%s\n",aline);
  2322 	    if (!pswit[OVERVIEW_SWITCH])
  2323 		g_print("    Line %ld column %ld - Capital \"S\"?\n",
  2324 		  linecnt,g_utf8_pointer_to_offset(aline,s)+2);
  2325 	    else
  2326 		cnt_punct++;
  2327 	}
  2328     }
  2329 }
  2330 
  2331 /*
  2332  * check_end_of_line:
  2333  *
  2334  * Now check special cases - start and end of line -
  2335  * for single and double quotes. Start is sometimes [sic]
  2336  * but better to query it anyway.
  2337  * While we're here, check for dash at end of line.
  2338  */
  2339 void check_end_of_line(const char *aline,struct warnings *warnings)
  2340 {
  2341     int lbytes;
  2342     const char *s;
  2343     gunichar c1,c2;
  2344     lbytes=strlen(aline);
  2345     if (g_utf8_strlen(aline,lbytes)>1)
  2346     {
  2347 	s=g_utf8_prev_char(aline+lbytes);
  2348 	c1=g_utf8_get_char(s);
  2349 	c2=g_utf8_get_char(g_utf8_prev_char(s));
  2350 	if ((CHAR_IS_DQUOTE(c1) || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)
  2351 	{
  2352 	    if (pswit[ECHO_SWITCH])
  2353 		g_print("\n%s\n",aline);
  2354 	    if (!pswit[OVERVIEW_SWITCH])
  2355 		g_print("    Line %ld column %ld - Spaced quote?\n",linecnt,
  2356 		  g_utf8_strlen(aline,lbytes));
  2357 	    else
  2358 		cnt_punct++;
  2359 	}
  2360 	c1=g_utf8_get_char(aline);
  2361 	c2=g_utf8_get_char(g_utf8_next_char(aline));
  2362 	if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)
  2363 	{
  2364 	    if (pswit[ECHO_SWITCH])
  2365 		g_print("\n%s\n",aline);
  2366 	    if (!pswit[OVERVIEW_SWITCH])
  2367 		g_print("    Line %ld column 1 - Spaced quote?\n",linecnt);
  2368 	    else
  2369 		cnt_punct++;
  2370 	}
  2371 	/*
  2372 	 * Dash at end of line may well be legit - paranoid mode only
  2373 	 * and don't report em-dash at line-end.
  2374 	 */
  2375 	if (pswit[PARANOID_SWITCH] && warnings->hyphen)
  2376 	{
  2377 	    for (s=g_utf8_prev_char(aline+lbytes);
  2378 	      s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
  2379 		;
  2380 	    if (g_utf8_get_char(s)=='-' &&
  2381 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')
  2382 	    {
  2383 		if (pswit[ECHO_SWITCH])
  2384 		    g_print("\n%s\n",aline);
  2385 		if (!pswit[OVERVIEW_SWITCH])
  2386 		    g_print("    Line %ld column %ld - "
  2387 		      "Hyphen at end of line?\n",
  2388 		      linecnt,g_utf8_pointer_to_offset(aline,s));
  2389 	    }
  2390 	}
  2391     }
  2392 }
  2393 
  2394 /*
  2395  * check_for_unspaced_bracket:
  2396  *
  2397  * Brackets are often unspaced, but shouldn't be surrounded by alpha.
  2398  * If so, suspect a scanno like "a]most".
  2399  */
  2400 void check_for_unspaced_bracket(const char *aline)
  2401 {
  2402     const char *s;
  2403     gunichar c,nc,pc;
  2404     c=g_utf8_get_char(aline);
  2405     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2406     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2407     {
  2408 	pc=c;
  2409 	c=nc;
  2410 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2411 	if (!nc)
  2412 	    break;
  2413 	/* for each bracket character in the line except 1st & last */
  2414 	if (g_utf8_strchr("{[()]}",-1,c) &&
  2415 	  g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
  2416 	{
  2417 	    if (pswit[ECHO_SWITCH])
  2418 		g_print("\n%s\n",aline);
  2419 	    if (!pswit[OVERVIEW_SWITCH])
  2420 		g_print("    Line %ld column %ld - Unspaced bracket?\n",
  2421 		  linecnt,g_utf8_pointer_to_offset(aline,s));
  2422 	    else
  2423 		cnt_punct++;
  2424 	}
  2425     }
  2426 }
  2427 
  2428 /*
  2429  * check_for_unpunctuated_endquote:
  2430  */
  2431 void check_for_unpunctuated_endquote(const char *aline)
  2432 {
  2433     const char *s;
  2434     gunichar c,nc,pc;
  2435     QuoteClass qc;
  2436     c=g_utf8_get_char(aline);
  2437     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2438     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2439     {
  2440 	pc=c;
  2441 	c=nc;
  2442 	qc=CHAR_IS_DQUOTE(c)?QUOTE_CLASS(c):INVALID_QUOTE;
  2443 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2444 	/* for each character in the line except 1st */
  2445 	if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) && isalpha(pc))
  2446 	{
  2447 	    if (pswit[ECHO_SWITCH])
  2448 		g_print("\n%s\n",aline);
  2449 	    if (!pswit[OVERVIEW_SWITCH])
  2450 		g_print("    Line %ld column %ld - "
  2451 		  "endquote missing punctuation?\n",
  2452 		  linecnt,g_utf8_pointer_to_offset(aline,s));
  2453 	    else
  2454 		cnt_punct++;
  2455 	}
  2456     }
  2457 }
  2458 
  2459 /*
  2460  * check_for_html_tag:
  2461  *
  2462  * Check for <HTML TAG>.
  2463  *
  2464  * If there is a < in the line, followed at some point
  2465  * by a > then we suspect HTML.
  2466  */
  2467 void check_for_html_tag(const char *aline)
  2468 {
  2469     const char *open,*close;
  2470     gchar *tag;
  2471     open=strchr(aline,'<');
  2472     if (open)
  2473     {
  2474 	close=strchr(g_utf8_next_char(open),'>');
  2475 	if (close)
  2476 	{
  2477 	    if (pswit[ECHO_SWITCH])
  2478 		g_print("\n%s\n",aline);
  2479 	    if (!pswit[OVERVIEW_SWITCH])
  2480 	    {
  2481 		tag=g_strndup(open,close-open+1);
  2482 		g_print("    Line %ld column %ld - HTML Tag? %s \n",
  2483 		  linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
  2484 		g_free(tag);
  2485 	    }
  2486 	    else
  2487 		cnt_html++;
  2488 	}
  2489     }
  2490 }
  2491 
  2492 /*
  2493  * check_for_html_entity:
  2494  *
  2495  * Check for &symbol; HTML.
  2496  *
  2497  * If there is a & in the line, followed at
  2498  * some point by a ; then we suspect HTML.
  2499  */
  2500 void check_for_html_entity(const char *aline)
  2501 {
  2502     const char *s,*amp,*scolon;
  2503     gchar *entity;
  2504     amp=strchr(aline,'&');
  2505     if (amp)
  2506     {
  2507 	scolon=strchr(amp,';');
  2508 	if (scolon)
  2509 	{
  2510 	    for (s=amp;s<scolon;s=g_utf8_next_char(s))   
  2511 		if (g_utf8_get_char(s)==CHAR_SPACE)
  2512 		    break;		/* Don't report "Jones & Son;" */
  2513 	    if (s>=scolon)
  2514 	    {
  2515 		if (pswit[ECHO_SWITCH])
  2516 		    g_print("\n%s\n",aline);
  2517 		if (!pswit[OVERVIEW_SWITCH])
  2518 		{
  2519 		    entity=g_strndup(amp,scolon-amp+1);
  2520 		    g_print("    Line %ld column %d - HTML symbol? %s \n",
  2521 		      linecnt,(int)(amp-aline)+1,entity);
  2522 		    g_free(entity);
  2523 		}
  2524 		else
  2525 		    cnt_html++;
  2526 	    }
  2527 	}
  2528     }
  2529 }
  2530 
  2531 /*
  2532  * check_for_omitted_punctuation:
  2533  *
  2534  * Check for omitted punctuation at end of paragraph by working back
  2535  * through prevline. DW.
  2536  * Need to check this only for "normal" paras.
  2537  * So what is a "normal" para?
  2538  *    Not normal if one-liner (chapter headings, etc.)
  2539  *    Not normal if doesn't contain at least one locase letter
  2540  *    Not normal if starts with space
  2541  */
  2542 void check_for_omitted_punctuation(const char *prevline,
  2543   struct line_properties *last,int start_para_line)
  2544 {
  2545     gboolean letter_on_line=FALSE;
  2546     const char *s;
  2547     gunichar c;
  2548     gboolean closing_quote;
  2549     for (s=prevline;*s;s=g_utf8_next_char(s))
  2550 	if (g_unichar_isalpha(g_utf8_get_char(s)))
  2551 	{
  2552 	    letter_on_line=TRUE;
  2553 	    break;
  2554 	}
  2555     /*
  2556      * This next "if" is a problem.
  2557      * If we say "start_para_line <= linecnt - 1", that includes
  2558      * one-line "paragraphs" like chapter heads. Lotsa false positives.
  2559      * If we say "start_para_line < linecnt - 1" it doesn't, but then it
  2560      * misses genuine one-line paragraphs.
  2561      */
  2562     if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
  2563       g_utf8_get_char(prevline)>CHAR_SPACE)
  2564     {
  2565 	s=prevline+strlen(prevline);
  2566 	do
  2567 	{
  2568 	    s=g_utf8_prev_char(s);
  2569 	    c=g_utf8_get_char(s);
  2570 	    if (QUOTE_CLASS(c)==CLOSING_QUOTE || QUOTE_CLASS(c)==NEUTRAL_QUOTE)
  2571 		closing_quote=TRUE;
  2572 	    else
  2573 		closing_quote=FALSE;
  2574 	} while (closing_quote && s>prevline);
  2575 	for (;s>prevline;s=g_utf8_prev_char(s))
  2576 	{
  2577 	    if (g_unichar_isalpha(g_utf8_get_char(s)))
  2578 	    {
  2579 		if (pswit[ECHO_SWITCH])
  2580 		    g_print("\n%s\n",prevline);
  2581 		if (!pswit[OVERVIEW_SWITCH])
  2582 		    g_print("    Line %ld column %ld - "
  2583 		      "No punctuation at para end?\n",
  2584 		      linecnt-1,g_utf8_strlen(prevline,-1));
  2585 		else
  2586 		    cnt_punct++;
  2587 		break;
  2588 	    }
  2589 	    if (g_utf8_strchr("-.:!([{?}])",-1,g_utf8_get_char(s)))
  2590 		break;
  2591 	}
  2592     }
  2593 }
  2594 
  2595 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
  2596 {
  2597     const char *word=key;
  2598     int *dupcnt=value;
  2599     if (*dupcnt)
  2600 	g_print("\nNote: Queried word %s was duplicated %d times\n",
  2601 	  word,*dupcnt);
  2602     return FALSE;
  2603 }
  2604 
  2605 void print_as_windows_1252(const char *string)
  2606 {
  2607     gsize inbytes,outbytes;
  2608     gchar *buf,*bp;
  2609     static GIConv converter=(GIConv)-1;
  2610     if (!string)
  2611     {
  2612 	if (converter!=(GIConv)-1)
  2613 	    g_iconv_close(converter);
  2614 	converter=(GIConv)-1;
  2615 	return;
  2616     }
  2617     if (converter==(GIConv)-1)
  2618 	converter=g_iconv_open("WINDOWS-1252","UTF-8");
  2619     if (converter!=(GIConv)-1)
  2620     {
  2621 	inbytes=outbytes=strlen(string);
  2622 	bp=buf=g_malloc(outbytes+1);
  2623 	g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
  2624 	*bp='\0';
  2625 	fputs(buf,stdout);
  2626 	g_free(buf);
  2627     }
  2628     else
  2629 	fputs(string,stdout);
  2630 }
  2631 
  2632 void print_as_utf_8(const char *string)
  2633 {
  2634     fputs(string,stdout);
  2635 }
  2636 
  2637 /*
  2638  * procfile:
  2639  *
  2640  * Process one file.
  2641  */
  2642 void procfile(const char *filename)
  2643 {
  2644     const char *s;
  2645     gchar *parastart=NULL;	/* first line of current para */
  2646     gchar *etext,*aline;
  2647     gchar *etext_ptr;
  2648     GError *err=NULL;
  2649     struct first_pass_results *first_pass_results;
  2650     struct warnings *warnings;
  2651     struct counters counters={0};
  2652     struct line_properties last={0};
  2653     struct parities parities={0};
  2654     struct pending pending={0};
  2655     gboolean isemptyline;
  2656     long start_para_line=0;
  2657     gboolean isnewpara=FALSE,enddash=FALSE;
  2658     last.start=CHAR_SPACE;
  2659     linecnt=checked_linecnt=0;
  2660     etext=read_etext(filename,&err);
  2661     if (!etext)
  2662     {
  2663 	if (pswit[STDOUT_SWITCH])
  2664 	    fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
  2665 	else
  2666 	    fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
  2667 	exit(1);
  2668     }
  2669     g_print("\n\nFile: %s\n\n",filename);
  2670     first_pass_results=first_pass(etext);
  2671     warnings=report_first_pass(first_pass_results);
  2672     qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
  2673     qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
  2674     /*
  2675      * Here we go with the main pass. Hold onto yer hat!
  2676      */
  2677     linecnt=0;
  2678     etext_ptr=etext;
  2679     while ((aline=flgets(&etext_ptr,linecnt+1)))
  2680     {
  2681 	linecnt++;
  2682 	if (linecnt==1)
  2683 	    isnewpara=TRUE;
  2684 	if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
  2685 	    continue;    // skip DP page separators completely
  2686 	if (linecnt<first_pass_results->firstline ||
  2687 	  (first_pass_results->footerline>0 &&
  2688 	  linecnt>first_pass_results->footerline))
  2689 	{
  2690 	    if (pswit[HEADER_SWITCH])
  2691 	    {
  2692 		if (g_str_has_prefix(aline,"Title:"))
  2693 		    g_print("    %s\n",aline);
  2694 		if (g_str_has_prefix(aline,"Author:"))
  2695 		    g_print("    %s\n",aline);
  2696 		if (g_str_has_prefix(aline,"Release Date:"))
  2697 		    g_print("    %s\n",aline);
  2698 		if (g_str_has_prefix(aline,"Edition:"))
  2699 		    g_print("    %s\n\n",aline);
  2700 	    }
  2701 	    continue;		/* skip through the header */
  2702 	}
  2703 	checked_linecnt++;
  2704 	print_pending(aline,parastart,&pending);
  2705 	isemptyline=analyse_quotes(aline,linecnt,&counters);
  2706 	if (isnewpara && !isemptyline)
  2707 	{
  2708 	    /* This line is the start of a new paragraph. */
  2709 	    start_para_line=linecnt;
  2710 	    /* Capture its first line in case we want to report it later. */
  2711 	    g_free(parastart);
  2712 	    parastart=g_strdup(aline);
  2713 	    memset(&parities,0,sizeof(parities));  /* restart the quote count */
  2714 	    s=aline;
  2715 	    while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
  2716 	      !g_unichar_isdigit(g_utf8_get_char(s)))
  2717 		s=g_utf8_next_char(s);
  2718 	    if (g_unichar_islower(g_utf8_get_char(s)))
  2719 	    {
  2720 		/* and its first letter is lowercase */
  2721 		if (pswit[ECHO_SWITCH])
  2722 		    g_print("\n%s\n",aline);
  2723 		if (!pswit[OVERVIEW_SWITCH])
  2724 		    g_print("    Line %ld column %ld - "
  2725 		      "Paragraph starts with lower-case\n",
  2726 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2727 		else
  2728 		    cnt_punct++;
  2729 	    }
  2730 	    isnewpara=FALSE; /* Signal the end of new para processing. */
  2731 	}
  2732 	/* Check for an em-dash broken at line end. */
  2733 	if (enddash && g_utf8_get_char(aline)=='-')
  2734 	{
  2735 	    if (pswit[ECHO_SWITCH])
  2736 		g_print("\n%s\n",aline);
  2737 	    if (!pswit[OVERVIEW_SWITCH])
  2738 		g_print("    Line %ld column 1 - Broken em-dash?\n",linecnt);
  2739 	    else
  2740 		cnt_punct++;
  2741 	}
  2742 	enddash=FALSE;
  2743 	for (s=g_utf8_prev_char(aline+strlen(aline));
  2744 	  g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
  2745 	    ;
  2746 	if (s>=aline && g_utf8_get_char(s)=='-')
  2747 	    enddash=TRUE;
  2748 	check_for_control_characters(aline);
  2749 	check_for_odd_characters(aline,warnings,isemptyline);
  2750 	if (warnings->longline)
  2751 	    check_for_long_line(aline);
  2752 	if (warnings->shortline)
  2753 	    check_for_short_line(aline,&last);
  2754 	last.blen=last.len;
  2755 	last.len=g_utf8_strlen(aline,-1);
  2756 	last.start=g_utf8_get_char(aline);
  2757 	check_for_starting_punctuation(aline);
  2758 	if (warnings->dash)
  2759 	{
  2760 	    check_for_spaced_emdash(aline);
  2761 	    check_for_spaced_dash(aline);
  2762 	}
  2763 	check_for_unmarked_paragraphs(aline);
  2764 	check_for_jeebies(aline);
  2765 	check_for_mta_from(aline);
  2766 	check_for_orphan_character(aline);
  2767 	check_for_pling_scanno(aline);
  2768 	check_for_extra_period(aline,warnings);
  2769 	check_for_following_punctuation(aline);
  2770 	check_for_typos(aline,warnings);
  2771 	check_for_misspaced_punctuation(aline,&parities,isemptyline);
  2772 	check_for_double_punctuation(aline,warnings);
  2773 	check_for_spaced_quotes(aline);
  2774 	check_for_miscased_genative(aline);
  2775 	check_end_of_line(aline,warnings);
  2776 	check_for_unspaced_bracket(aline);
  2777 	if (warnings->endquote)
  2778 	    check_for_unpunctuated_endquote(aline);
  2779 	check_for_html_tag(aline);
  2780 	check_for_html_entity(aline);
  2781 	if (isemptyline)
  2782 	{
  2783 	    check_for_mismatched_quotes(&counters,&pending);
  2784 	    counters_reset(&counters);
  2785 	    /* let the next iteration know that it's starting a new para */
  2786 	    isnewpara=TRUE;
  2787 	    if (prevline)
  2788 		check_for_omitted_punctuation(prevline,&last,start_para_line);
  2789 	}
  2790 	g_free(prevline);
  2791 	prevline=g_strdup(aline);
  2792     }
  2793     linecnt++;
  2794     check_for_mismatched_quotes(&counters,&pending);
  2795     print_pending(NULL,parastart,&pending);
  2796     reset_pending(&pending);
  2797     if (prevline)
  2798     {
  2799 	g_free(prevline);
  2800 	prevline=NULL;
  2801     }
  2802     g_free(parastart);
  2803     g_free(prevline);
  2804     g_free(etext);
  2805     if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])
  2806 	g_tree_foreach(qword,report_duplicate_queries,NULL);
  2807     g_tree_unref(qword);
  2808     g_tree_unref(qperiod);
  2809     counters_destroy(&counters);
  2810     g_set_print_handler(NULL);
  2811     print_as_windows_1252(NULL);
  2812     if (pswit[MARKUP_SWITCH])  
  2813 	loseentities(NULL);
  2814 }
  2815 
  2816 /*
  2817  * flgets:
  2818  *
  2819  * Get one line from the input text, checking for
  2820  * the existence of exactly one CR/LF line-end per line.
  2821  *
  2822  * Returns: a pointer to the line.
  2823  */
  2824 char *flgets(char **etext,long lcnt)
  2825 {
  2826     gunichar c;
  2827     gboolean isCR=FALSE;
  2828     char *theline=*etext;
  2829     char *eos=theline;
  2830     gchar *s;
  2831     for (;;)
  2832     {
  2833 	c=g_utf8_get_char(*etext);
  2834 	*etext=g_utf8_next_char(*etext);
  2835 	if (!c)
  2836 	    return NULL;
  2837 	/* either way, it's end of line */
  2838 	if (c=='\n')
  2839 	{
  2840 	    if (isCR)
  2841 		break;
  2842 	    else
  2843 	    {
  2844 		/* Error - a LF without a preceding CR */
  2845 		if (pswit[LINE_END_SWITCH])
  2846 		{
  2847 		    if (pswit[ECHO_SWITCH])
  2848 		    {
  2849 			s=g_strndup(theline,eos-theline);
  2850 			g_print("\n%s\n",s);
  2851 			g_free(s);
  2852 		    }
  2853 		    if (!pswit[OVERVIEW_SWITCH])
  2854 			g_print("    Line %ld - No CR?\n",lcnt);
  2855 		    else
  2856 			cnt_lineend++;
  2857 		}
  2858 		break;
  2859 	    }
  2860 	}
  2861 	if (c=='\r')
  2862 	{
  2863 	    if (isCR)
  2864 	    {
  2865 		/* Error - two successive CRs */
  2866 		if (pswit[LINE_END_SWITCH])
  2867 		{
  2868 		    if (pswit[ECHO_SWITCH])
  2869 		    {
  2870 			s=g_strndup(theline,eos-theline);
  2871 			g_print("\n%s\n",s);
  2872 			g_free(s);
  2873 		    }
  2874 		    if (!pswit[OVERVIEW_SWITCH])
  2875 			g_print("    Line %ld - Two successive CRs?\n",lcnt);
  2876 		    else
  2877 			cnt_lineend++;
  2878 		}
  2879 	    }
  2880 	    isCR=TRUE;
  2881 	}
  2882 	else
  2883 	{
  2884 	    if (pswit[LINE_END_SWITCH] && isCR)
  2885 	    {
  2886 		if (pswit[ECHO_SWITCH])
  2887 		{
  2888 		    s=g_strndup(theline,eos-theline);
  2889 		    g_print("\n%s\n",s);
  2890 		    g_free(s);
  2891 		}
  2892 		if (!pswit[OVERVIEW_SWITCH])
  2893 		    g_print("    Line %ld column %ld - CR without LF?\n",
  2894 		      lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
  2895 		else
  2896 		    cnt_lineend++;
  2897 		*eos=' ';
  2898 	    }
  2899 	    isCR=FALSE;
  2900 	    eos=g_utf8_next_char(eos);
  2901 	}
  2902     }
  2903     *eos='\0';
  2904     if (pswit[MARKUP_SWITCH])  
  2905 	postprocess_for_HTML(theline);
  2906     if (pswit[DP_SWITCH])  
  2907 	postprocess_for_DP(theline);
  2908     return theline;
  2909 }
  2910 
  2911 /*
  2912  * mixdigit:
  2913  *
  2914  * Takes a "word" as a parameter, and checks whether it
  2915  * contains a mixture of alpha and digits. Generally, this is an
  2916  * error, but may not be for cases like 4th or L5 12s. 3d.
  2917  *
  2918  * Returns: TRUE iff an is error found.
  2919  */
  2920 gboolean mixdigit(const char *checkword)
  2921 {
  2922     gboolean wehaveadigit,wehavealetter,query;
  2923     const char *s,*nondigit;
  2924     wehaveadigit=wehavealetter=query=FALSE;
  2925     for (s=checkword;*s;s=g_utf8_next_char(s))
  2926 	if (g_unichar_isalpha(g_utf8_get_char(s)))
  2927 	    wehavealetter=TRUE;
  2928 	else if (g_unichar_isdigit(g_utf8_get_char(s)))
  2929 	    wehaveadigit=TRUE;
  2930     if (wehaveadigit && wehavealetter)
  2931     {
  2932 	/* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
  2933 	query=TRUE;
  2934 	for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
  2935 	  nondigit=g_utf8_next_char(nondigit))
  2936 	    ;
  2937 	/* digits, ending in st, rd, nd, th of either case */
  2938 	if (!g_ascii_strcasecmp(nondigit,"st") ||
  2939 	  !g_ascii_strcasecmp(nondigit,"rd") ||
  2940 	  !g_ascii_strcasecmp(nondigit,"nd") ||
  2941 	  !g_ascii_strcasecmp(nondigit,"th"))
  2942 	    query=FALSE;
  2943 	if (!g_ascii_strcasecmp(nondigit,"sts") ||
  2944 	  !g_ascii_strcasecmp(nondigit,"rds") ||
  2945 	  !g_ascii_strcasecmp(nondigit,"nds") ||
  2946 	  !g_ascii_strcasecmp(nondigit,"ths"))
  2947 	    query=FALSE;
  2948 	if (!g_ascii_strcasecmp(nondigit,"stly") ||
  2949 	  !g_ascii_strcasecmp(nondigit,"rdly") ||
  2950 	  !g_ascii_strcasecmp(nondigit,"ndly") ||
  2951 	  !g_ascii_strcasecmp(nondigit,"thly"))
  2952 	    query=FALSE;
  2953 	/* digits, ending in l, L, s or d */
  2954 	if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
  2955 	  !strcmp(nondigit,"d"))
  2956 	    query=FALSE;
  2957 	/*
  2958 	 * L at the start of a number, representing Britsh pounds, like L500.
  2959 	 * This is cute. We know the current word is mixed digit. If the first
  2960 	 * letter is L, there must be at least one digit following. If both
  2961 	 * digits and letters follow, we have a genuine error, else we have a
  2962 	 * capital L followed by digits, and we accept that as a non-error.
  2963 	 */
  2964 	if (g_utf8_get_char(checkword)=='L' &&
  2965 	  !mixdigit(g_utf8_next_char(checkword)))
  2966 	    query=FALSE;
  2967     }
  2968     return query;
  2969 }
  2970 
  2971 /*
  2972  * getaword:
  2973  *
  2974  * Extracts the first/next "word" from the line, and returns it.
  2975  * A word is defined as one English word unit--or at least that's the aim.
  2976  * "ptr" is advanced to the position in the line where we will start
  2977  * looking for the next word.
  2978  *
  2979  * Returns: A newly-allocated string.
  2980  */
  2981 gchar *getaword(const char **ptr)
  2982 {
  2983     const char *s,*t;
  2984     GString *word;
  2985     gunichar c,pc;
  2986     word=g_string_new(NULL);
  2987     for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
  2988       !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
  2989       **ptr;*ptr=g_utf8_next_char(*ptr))
  2990 	;
  2991     /*
  2992      * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
  2993      * Especially yucky is the case of L1,000
  2994      * This section looks for a pattern of characters including a digit
  2995      * followed by a comma or period followed by one or more digits.
  2996      * If found, it returns this whole pattern as a word; otherwise we discard
  2997      * the results and resume our normal programming.
  2998      */
  2999     s=*ptr;
  3000     for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
  3001       g_unichar_isalpha(g_utf8_get_char(s)) ||
  3002       g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
  3003 	g_string_append_unichar(word,g_utf8_get_char(s));
  3004     if (word->len)
  3005     {
  3006 	for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))
  3007 	{
  3008 	    c=g_utf8_get_char(t);
  3009 	    pc=g_utf8_get_char(g_utf8_prev_char(t));
  3010 	    if ((c=='.' || c==',') && g_unichar_isdigit(pc))
  3011 	    {
  3012 		*ptr=s;
  3013 		return g_string_free(word,FALSE);
  3014 	    }
  3015 	}
  3016     }
  3017     /* we didn't find a punctuated number - do the regular getword thing */
  3018     g_string_truncate(word,0);
  3019     c=g_utf8_get_char(*ptr);
  3020     for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);
  3021       *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))
  3022 	g_string_append_unichar(word,c);
  3023     return g_string_free(word,FALSE);
  3024 }
  3025 
  3026 /*
  3027  * isroman:
  3028  *
  3029  * Is this word a Roman Numeral?
  3030  *
  3031  * It doesn't actually validate that the number is a valid Roman Numeral--for
  3032  * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
  3033  * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
  3034  * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
  3035  * expressions thereof, except when it came to taxes. Allow any number of M,
  3036  * an optional D, an optional CM or CD, any number of optional Cs, an optional
  3037  * XL or an optional XC, an optional IX or IV, an optional V and any number
  3038  * of optional Is.
  3039  */
  3040 gboolean isroman(const char *t)
  3041 {
  3042     const char *s;
  3043     if (!t || !*t)
  3044 	return FALSE;
  3045     s=t;
  3046     while (g_utf8_get_char(t)=='m' && *t)
  3047 	t++;
  3048     if (g_utf8_get_char(t)=='d')
  3049 	t++;
  3050     if (g_str_has_prefix(t,"cm"))
  3051 	t+=2;
  3052     if (g_str_has_prefix(t,"cd"))
  3053 	t+=2;
  3054     while (g_utf8_get_char(t)=='c' && *t)
  3055 	t++;
  3056     if (g_str_has_prefix(t,"xl"))
  3057 	t+=2;
  3058     if (g_str_has_prefix(t,"xc"))
  3059 	t+=2;
  3060     if (g_utf8_get_char(t)=='l')
  3061 	t++;
  3062     while (g_utf8_get_char(t)=='x' && *t)
  3063 	t++;
  3064     if (g_str_has_prefix(t,"ix"))
  3065 	t+=2;
  3066     if (g_str_has_prefix(t,"iv"))
  3067 	t+=2;
  3068     if (g_utf8_get_char(t)=='v')
  3069 	t++;
  3070     while (g_utf8_get_char(t)=='i' && *t)
  3071 	t++;
  3072     return !*t;
  3073 }
  3074 
  3075 /*
  3076  * postprocess_for_DP:
  3077  *
  3078  * Invoked with the -d switch from flgets().
  3079  * It simply "removes" from the line a hard-coded set of common
  3080  * DP-specific tags, so that the line passed to the main routine has
  3081  * been pre-cleaned of DP markup.
  3082  */
  3083 void postprocess_for_DP(char *theline)
  3084 {
  3085     char *s,*t;
  3086     int i;
  3087     if (!*theline) 
  3088 	return;
  3089     for (i=0;*DPmarkup[i];i++)
  3090 	while ((s=strstr(theline,DPmarkup[i])))
  3091 	{
  3092 	    t=s+strlen(DPmarkup[i]);
  3093 	    memmove(s,t,strlen(t)+1);
  3094 	}
  3095 }
  3096 
  3097 /*
  3098  * postprocess_for_HTML:
  3099  *
  3100  * Invoked with the -m switch from flgets().
  3101  * It simply "removes" from the line a hard-coded set of common
  3102  * HTML tags and "replaces" a hard-coded set of common HTML
  3103  * entities, so that the line passed to the main routine has
  3104  * been pre-cleaned of HTML.
  3105  */
  3106 void postprocess_for_HTML(char *theline)
  3107 {
  3108     while (losemarkup(theline))
  3109 	;
  3110     loseentities(theline);
  3111 }
  3112 
  3113 char *losemarkup(char *theline)
  3114 {
  3115     char *s,*t;
  3116     int i;
  3117     s=strchr(theline,'<');
  3118     t=s?strchr(s,'>'):NULL;
  3119     if (!s || !t)
  3120 	return NULL;
  3121     for (i=0;*markup[i];i++)
  3122 	if (tagcomp(g_utf8_next_char(s),markup[i]))
  3123 	{
  3124 	    t=g_utf8_next_char(t);
  3125 	    memmove(s,t,strlen(t)+1);
  3126 	    return s;
  3127 	}
  3128     /* It's an unrecognized <xxx>. */
  3129     return NULL;
  3130 }
  3131 
  3132 void loseentities(char *theline)
  3133 {
  3134     int i;
  3135     gsize nb;
  3136     char *amp,*scolon;
  3137     gchar *s,*t;
  3138     gunichar c;
  3139     GTree *entities=NULL;
  3140     static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
  3141     if (!theline)
  3142     {
  3143 	if (entities)
  3144 	    g_tree_destroy(entities);
  3145 	entities=NULL;
  3146 	if (translit!=(GIConv)-1)
  3147 	    g_iconv_close(translit);
  3148 	translit=(GIConv)-1;
  3149 	if (to_utf8!=(GIConv)-1)
  3150 	    g_iconv_close(to_utf8);
  3151 	to_utf8=(GIConv)-1;
  3152 	return;
  3153     }
  3154     if (!*theline)
  3155 	return;
  3156     if (!entities)
  3157     {
  3158 	entities=g_tree_new((GCompareFunc)strcmp);
  3159 	for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
  3160 	    g_tree_insert(entities,HTMLentities[i].name,
  3161 	      GUINT_TO_POINTER(HTMLentities[i].c));
  3162     }
  3163     if (translit==(GIConv)-1)
  3164 	translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
  3165     if (to_utf8==(GIConv)-1)
  3166 	to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
  3167     while((amp=strchr(theline,'&')))
  3168     {
  3169 	scolon=strchr(amp,';');
  3170 	if (scolon)
  3171 	{
  3172 	    if (amp[1]=='#')
  3173 	    {
  3174 		if (amp+2+strspn(amp+2,"0123456789")==scolon)
  3175 		    c=strtol(amp+2,NULL,10);
  3176 		else if (amp[2]=='x' &&
  3177 		  amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
  3178 		    c=strtol(amp+3,NULL,16);
  3179 	    }
  3180 	    else
  3181 	    {
  3182 		s=g_strndup(amp+1,scolon-(amp+1));
  3183 	        c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
  3184 		g_free(s);
  3185 	    }
  3186 	}
  3187 	else
  3188 	    c=0;
  3189 	if (c)
  3190 	{
  3191 	    theline=amp;
  3192 	    if (c<128 || c>=192 && c<=255)	/* An ISO-8859-1 character */
  3193 		theline+=g_unichar_to_utf8(c,theline);
  3194 	    else
  3195 	    {
  3196 		s=g_malloc(6);
  3197 		nb=g_unichar_to_utf8(c,s);
  3198 		t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
  3199 		g_free(s);
  3200 		s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
  3201 		g_free(t);
  3202 		memcpy(theline,s,nb);
  3203 		g_free(s);
  3204 		theline+=nb;
  3205 	    }
  3206 	    memmove(theline,g_utf8_next_char(scolon),
  3207 	      strlen(g_utf8_next_char(scolon))+1);
  3208 	}
  3209 	else
  3210 	    theline=g_utf8_next_char(amp);
  3211     }
  3212 }
  3213 
  3214 gboolean tagcomp(const char *strin,const char *basetag)
  3215 {
  3216     gboolean retval;
  3217     gchar *s,*t;
  3218     if (g_utf8_get_char(strin)=='/')
  3219 	t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
  3220     else
  3221 	t=g_utf8_casefold(strin,-1);
  3222     s=g_utf8_casefold(basetag,-1);
  3223     retval=g_str_has_prefix(t,s);
  3224     g_free(s);
  3225     g_free(t);
  3226     return retval;
  3227 }
  3228 
  3229 void proghelp(GOptionContext *context)
  3230 {
  3231     gchar *help;
  3232     fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
  3233     fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
  3234     fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
  3235     fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
  3236       "For details, read the file COPYING.\n",stderr);
  3237     fputs("This is Free Software; "
  3238       "you may redistribute it under certain conditions (GPL);\n",stderr);
  3239     fputs("read the file COPYING for details.\n\n",stderr);
  3240     help=g_option_context_get_help(context,TRUE,NULL);
  3241     fputs(help,stderr);
  3242     g_free(help);
  3243     fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
  3244     fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
  3245       "non-ASCII\n",stderr);
  3246     fputs("characters like accented letters, "
  3247       "lines longer than 75 or shorter than 55,\n",stderr);
  3248     fputs("unbalanced quotes or brackets, "
  3249       "a variety of badly formatted punctuation, \n",stderr);
  3250     fputs("HTML tags, some likely typos. "
  3251       "It is NOT a substitute for human judgement.\n",stderr);
  3252     fputs("\n",stderr);
  3253 }