bookloupe/bookloupe.c
author ali <ali@juiblex.co.uk>
Sun Sep 29 09:18:05 2013 +0100 (2013-09-29)
changeset 176 302b4681a857
parent 173 783eff3047bc
child 178 db7b24d83bed
permissions -rw-r--r--
Fix bug #13: Character sets
     1 /*************************************************************************/
     2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
     3 /*									 */
     4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>			 */
     5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>			 */
     6 /*									 */
     7 /* This program is free software; you can redistribute it and/or modify  */
     8 /* it under the terms of the GNU General Public License as published by  */
     9 /* the Free Software Foundation; either version 2 of the License, or     */
    10 /* (at your option) any later version.					 */
    11 /*									 */
    12 /* This program is distributed in the hope that it will be useful,       */
    13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of	 */
    14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the		 */
    15 /* GNU General Public License for more details.				 */
    16 /*									 */
    17 /* You should have received a copy of the GNU General Public License	 */
    18 /* along with this program. If not, see <http://www.gnu.org/licenses/>.	 */
    19 /*************************************************************************/
    20 
    21 #include <stdio.h>
    22 #include <stdlib.h>
    23 #include <string.h>
    24 #include <ctype.h>
    25 #ifdef __WIN32__
    26 #include <windows.h>
    27 #endif
    28 #include <glib.h>
    29 #include <bl/bl.h>
    30 #include "bookloupe.h"
    31 #include "counters.h"
    32 #include "pending.h"
    33 #include "HTMLentities.h"
    34 
    35 gchar *charset;		/* Or NULL for auto (ISO_8859-1/ASCII or UNICODE) */
    36 GIConv charset_validator=(GIConv)-1;	/* UTF-8 to charset converter opened by set_charset(); (GIConv)-1 when none is open */
    37 
    38 gchar *prevline;	/* NOTE(review): presumably the previously processed line; not referenced in this part of the file */
    39 
    40 /* Common typos. The list is terminated by an empty string. */
    41 char *typo[] = {
    42     "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
    43     "nad", "te", "ig", "acn",  "ahve", "alot", "anbd", "andt", "awya", "aywa",
    44     "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
    45     "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
    46     "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
    47     "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
    48     "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
    49     "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
    50     "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
    51     "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
    52     "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
    53     "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
    54     "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
    55     "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
    56     "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
    57     "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
    58     "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
    59     "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
    60     "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
    61     "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
    62     "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
    63     "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
    64     "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
    65     "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
    66     "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
    67     "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
    68     "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
    69     "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
    70     "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
    71     "se", ""
    72 };
    73 
    74 GTree *usertypo;	/* user-defined typos keyed by word; built by read_user_scannos(), NULL until then */
    75 
    76 /* Common abbreviations and other OK words not to query as typos. */
       /* The list is terminated by an empty string. */
    77 char *okword[] = {
    78     "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
    79     "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
    80     "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
    81     "outbid", "outbids", "frostbite", "frostbitten", ""
    82 };
    83 
    84 /* Common abbreviations that cause otherwise unexplained periods. */
       /* The list is terminated by an empty string. */
    85 char *abbrev[] = {
    86     "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
    87     "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
    88 };
    89 
    90 /*
    91  * Two-Letter combinations that rarely if ever start words,
    92  * but are common scannos or otherwise common letter combinations.
        * The list is terminated by an empty string.
    93  */
    94 char *nostart[] = {
    95     "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
    96 };
    97 
    98 /*
    99  * Two-Letter combinations that rarely if ever end words,
   100  * but are common scannos or otherwise common letter combinations.
        * The list is terminated by an empty string.
   101  */
   102 char *noend[] = {
   103     "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
   104     "sw", "gr", "sl", "cl", "iy", ""
   105 };
   106 
   107 char *markup[] = {
           /*
            * HTML element names, without angle brackets. An empty string
            * terminates the list.
            * NOTE(review): presumably consulted when deciding whether text in
            * < > is markup - the code that uses it is outside this part of
            * the file.
            */
   108     "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
   109     "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
   110     "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
   111     "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
   112 };
   113 
   114 char *DPmarkup[] = {
           /*
            * Literal markup sequences; an empty string terminates the list.
            * NOTE(review): presumably the DP-specific markup that the --dp
            * switch ignores - confirm against postprocess_for_DP().
            */
   115     "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
   116 };
   117 
   118 char *nocomma[] = {
           /*
            * Lower-case words; an empty string terminates the list.
            * NOTE(review): presumably words that are not expected to be
            * directly followed by a comma - confirm against the code that
            * uses this list, which is outside this part of the file.
            * NOTE(review): "mrs" is listed twice.
            */
   119     "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
   120     "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
   121     "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
   122     "during", "let", "toward", "among", ""
   123 };
   124 
   125 char *noperiod[] = {
           /*
            * Lower-case words; an empty string terminates the list.
            * NOTE(review): presumably words that are not expected to be
            * directly followed by a period - confirm against the code that
            * uses this list, which is outside this part of the file.
            */
   126     "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
   127     "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
   128     "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
   129     "among", "those", "into", "whom", "having", "thence", ""
   130 }; 
   131 
   132 gboolean pswit[SWITNO];  /* program switches, indexed by the *_SWITCH constants; filled in by parse_options() */
   133 gchar *opt_charset;	/* raw argument of --charset; applied and then freed by parse_options() */
   134 
   135 static GOptionEntry options[]={
           /*
            * Command line options handed to GOption by parse_options().
            * Each flag option stores TRUE in its pswit[] slot when given.
            * For --noecho, --relaxed and --line-end that raw value means
            * "off"; parse_options() inverts those slots after parsing.
            */
   136     { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
   137       "Ignore DP-specific markup", NULL },
   138     { "noecho", 'e', 0, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
   139       "Don't echo queried line", NULL },
   140     { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
   141       "Check single quotes", NULL },
   142     { "typo", 't', 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
   143       "Check common typos", NULL },
   144     { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
   145       "Require closure of quotes on every paragraph", NULL },
   146     { "relaxed", 'x', 0, G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
   147       "Disable paranoid querying of everything", NULL },
   148     { "line-end", 'l', 0, G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
   149       "Disable line end checking", NULL },
   150     { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
   151       "Overview: just show counts", NULL },
   152     { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
   153       "Output errors to stdout instead of stderr", NULL },
   154     { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
   155       "Echo header fields", NULL },
   156     { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
   157       "Ignore markup in < >", NULL },
   158     { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
   159       "Use file of user-defined typos", NULL },
   160     { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
   161       "Defaults for use on www upload", NULL },
   162     { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
   163       "Verbose - list everything", NULL },
   164     { "charset", 0, 0, G_OPTION_ARG_STRING, &opt_charset,
   165       "Set of characters valid for this ebook", "NAME" },
   166     { NULL }	/* end-of-table marker */
   167 };
   168 
   169 long cnt_quote;		/* for overview mode, count of quote queries */
   170 long cnt_brack;		/* for overview mode, count of brackets queries */
   171 long cnt_bin;		/* for overview mode, count of non-ASCII queries */
   172 long cnt_odd;		/* for overview mode, count of odd character queries */
   173 long cnt_long;		/* for overview mode, count of long line errors */
   174 long cnt_short;		/* for overview mode, count of short line queries */
   175 long cnt_punct;		/* for overview mode,
   176 			   count of punctuation and spacing queries */
   177 long cnt_dash;		/* for overview mode, count of dash-related queries */
   178 long cnt_word;		/* for overview mode, count of word queries */
   179 long cnt_html;		/* for overview mode, count of html queries */
   180 long cnt_lineend;	/* for overview mode, count of line-end queries */
   181 long cnt_spacend;	/* count of lines with space at end; incremented by first_pass() */
   182 long linecnt;		/* count of total lines in the file; incremented by first_pass() */
   183 long checked_linecnt;	/* count of lines actually checked; reported by main() in overview mode */
   184 
   185 void proghelp(GOptionContext *context);
   186 void procfile(const char *);
   187 
   188 gchar *running_from;	/* directory part of argv[0]; set and freed by main() */
   189 
       /* Forward declarations; none of these is defined in this part of the file. */
   190 gboolean mixdigit(const char *);
   191 gchar *getaword(const char **);
   192 char *flgets(char **,long);
   193 void postprocess_for_HTML(char *);
   194 char *linehasmarkup(char *);
   195 char *losemarkup(char *);
   196 gboolean tagcomp(const char *,const char *);
   197 void loseentities(char *);
   198 gboolean isroman(const char *);
   199 void postprocess_for_DP(char *);
   200 void print_as_windows_1252(const char *string);
   201 void print_as_utf_8(const char *string);
   202 
   203 GTree *qword,*qperiod;	/* NOTE(review): not referenced in this part of the file */
   204 
   205 #ifdef __WIN32__
   206 UINT saved_cp;	/* console output code page saved by main() and restored by cleanup_on_exit() */
   207 #endif
   208 
   209 /*
   210  * set_charset:
   211  *
   212  * Select the character set that etexts are expected to conform to.
   213  * A NULL name or "auto" selects automatic mode, in which charset is NULL.
   214  * The name of any UNICODE encoding is canonicalised to "UTF-8", which
   215  * needs no validator. For every other name a converter from UTF-8 to
   216  * the named character set is opened and kept in charset_validator.
   217  *
   218  * Returns TRUE on success. If no converter can be opened, err is set,
   219  * automatic mode is left selected and FALSE is returned.
   220  */
   221 gboolean set_charset(const char *name,GError **err)
   222 {
   223     /* The various UNICODE encodings all share the same character set. */
   224     static const char *const unicode_aliases[]={ "UCS-2", "UCS-2BE",
   225       "UCS-2LE", "UCS-4", "UCS-4BE", "UCS-4LE", "UCS2", "UCS4", "UNICODE",
   226       "UNICODEBIG", "UNICODELITTLE", "UTF-7", "UTF-8", "UTF-16", "UTF-16BE",
   227       "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE", "UTF7", "UTF8", "UTF16",
   228       "UTF16BE", "UTF16LE", "UTF32", "UTF32BE", "UTF32LE" };
   229     gsize i;
   230     gchar *requested=NULL;
   231     /*
   232      * Take our copy before releasing the old setting, in case name is
   233      * the very string we are about to free. Character set names are
   234      * ASCII, so compare them independently of the current locale.
   235      */
   236     if (name && g_ascii_strcasecmp(name,"auto"))
   237 	requested=g_strdup(name);
   238     /* Drop the previous selection; from here on we are in auto mode. */
   239     g_free(charset);
   240     charset=NULL;
   241     if (charset_validator!=(GIConv)-1)
   242 	g_iconv_close(charset_validator);
   243     charset_validator=(GIConv)-1;
   244     if (!requested)
   245 	return TRUE;
   246     for(i=0;i<G_N_ELEMENTS(unicode_aliases);i++)
   247 	if (!g_ascii_strcasecmp(requested,unicode_aliases[i]))
   248 	{
   249 	    g_free(requested);
   250 	    requested=g_strdup("UTF-8");
   251 	    break;
   252 	}
   253     if (strcmp(requested,"UTF-8"))
   254     {
   255 	charset_validator=g_iconv_open(requested,"UTF-8");
   256 	if (charset_validator==(GIConv)-1)
   257 	{
   258 	    g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_NO_CONVERSION,
   259 	      "Unknown character set \"%s\"",requested);
   260 	    /* Don't leave an unusable character set selected. */
   261 	    g_free(requested);
   262 	    return FALSE;
   263 	}
   264     }
   265     charset=requested;
   266     return TRUE;
   267 }
   251 
   252 /*
   253  * parse_options:
   254  *
   255  * Parse the command line into pswit[] and the selected character set.
   256  * Exits with status 1 on a parse error, on an unknown character set or
   257  * when fewer than two arguments remain after parsing.
   258  */
   259 void parse_options(int *argc,char ***argv)
   260 {
   261     /*
   262      * Web uploads - for the moment, this is really just a placeholder
   263      * until we decide what processing we really want to do on web uploads
   264      */
   265     static const struct
   266     {
   267 	int sw;
   268 	gboolean value;
   269     } web_defaults[]={
   270 	{ ECHO_SWITCH, TRUE },
   271 	{ SQUOTE_SWITCH, FALSE },
   272 	{ TYPO_SWITCH, TRUE },
   273 	{ QPARA_SWITCH, FALSE },
   274 	{ PARANOID_SWITCH, TRUE },
   275 	{ LINE_END_SWITCH, FALSE },
   276 	{ OVERVIEW_SWITCH, FALSE },
   277 	{ STDOUT_SWITCH, FALSE },
   278 	{ HEADER_SWITCH, TRUE },
   279 	{ VERBOSE_SWITCH, FALSE },
   280 	{ MARKUP_SWITCH, FALSE },
   281 	{ USERTYPO_SWITCH, FALSE },
   282 	{ DP_SWITCH, FALSE },
   283     };
   284     GError *error=NULL;
   285     GOptionContext *ctx;
   286     gsize k;
   287     ctx=g_option_context_new(
   288       "file - looks for errors in Project Gutenberg(TM) etexts");
   289     g_option_context_add_main_entries(ctx,options,NULL);
   290     if (!g_option_context_parse(ctx,argc,argv,&error))
   291     {
   292 	g_printerr("Bookloupe: %s\n",error->message);
   293 	g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
   294 	exit(1);
   295     }
   296     /* Paranoid and line-end checking are turned OFF by their switches */
   297     pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
   298     pswit[LINE_END_SWITCH]=!pswit[LINE_END_SWITCH];
   299     /*
   300      * Echoing is turned OFF by its switch too, and an overview
   301      * just prints the summary so it never echoes.
   302      */
   303     pswit[ECHO_SWITCH]=!pswit[ECHO_SWITCH] && !pswit[OVERVIEW_SWITCH];
   304     /* if running in paranoid mode, typo checks default to enabled */
   305     if (pswit[PARANOID_SWITCH])
   306 	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
   307     /* specific override for web uploads */
   308     if (pswit[WEB_SWITCH])
   309 	for (k=0;k<G_N_ELEMENTS(web_defaults);k++)
   310 	    pswit[web_defaults[k].sw]=web_defaults[k].value;
   311     if (opt_charset)
   312     {
   313 	if (!set_charset(opt_charset,&error))
   314 	{
   315 	    g_printerr("%s\n",error->message);
   316 	    exit(1);
   317 	}
   318 	g_free(opt_charset);
   319 	opt_charset=NULL;
   320     }
   321     if (*argc<2)
   322     {
   323 	proghelp(ctx);
   324 	exit(1);
   325     }
   326     g_option_context_free(ctx);
   327 }
   312 
   313 /*
   314  * read_user_scannos:
   315  *
   316  * Read in the user-defined stealth scanno list.
   317  *
   318  * bookloupe.typ and then gutcheck.typ are looked for, each first by its
   319  * bare name and then in the directory bookloupe was run from. If none
   320  * of them exists a note is printed and usertypo is left unset; any
   321  * other failure to read or decode the file is fatal. A list that is
   322  * valid UTF-8 also switches an automatic charset to UNICODE; any other
   323  * list is converted from WINDOWS-1252.
   324  */
   325 void read_user_scannos(void)
   326 {
   327     static const char *const basenames[]={ "bookloupe.typ", "gutcheck.typ" };
   328     GError *err=NULL;
   329     gchar *usertypo_file=NULL;
   330     gboolean okay=FALSE;
   331     guint attempt;
   332     int i;
   333     gsize len;
   334     gchar *contents,*utf8,**lines;
   335     for (attempt=0;attempt<2*G_N_ELEMENTS(basenames);attempt++)
   336     {
   337 	g_clear_error(&err);
   338 	g_free(usertypo_file);
   339 	if (attempt%2)
   340 	    usertypo_file=g_build_filename(running_from,
   341 	      basenames[attempt/2],NULL);
   342 	else
   343 	    usertypo_file=g_strdup(basenames[attempt/2]);
   344 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   345 	/* only a missing file makes us try the next candidate */
   346 	if (!g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   347 	    break;
   348     }
   349     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   350     {
   351 	g_clear_error(&err);
   352 	g_free(usertypo_file);
   353 	g_print("   --> I couldn't find bookloupe.typ "
   354 	  "-- proceeding without user typos.\n");
   355 	return;
   356     }
   357     else if (!okay)
   358     {
   359 	fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
   360 	g_free(usertypo_file);
   361 	g_clear_error(&err);
   362 	exit(1);
   363     }
   364     if (g_utf8_validate(contents,len,NULL))
   365     {
   366 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
   367 	if (!charset)
   368 	    (void)set_charset("UNICODE",NULL);
   369     }
   370     else
   371 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,NULL,&err);
   372     g_free(contents);
   373     if (!utf8)
   374     {
   375 	/*
   376 	 * WINDOWS-1252 leaves some byte values undefined, so the
   377 	 * conversion can fail. Report it rather than splitting NULL.
   378 	 */
   379 	fprintf(stderr,"%s: %s\n",usertypo_file,
   380 	  err?err->message:"Invalid text encoding");
   381 	g_free(usertypo_file);
   382 	g_clear_error(&err);
   383 	exit(1);
   384     }
   385     g_free(usertypo_file);
   386     lines=g_strsplit_set(utf8,"\r\n",0);
   387     g_free(utf8);
   388     /* The tree owns its keys; the values are just a non-NULL marker. */
   389     usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
   390     for (i=0;lines[i];i++)
   391 	if (*(unsigned char *)lines[i]>'!')
   392 	    g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
   393 	else
   394 	    /* empty line, or one starting with a blank, control or '!' */
   395 	    g_free(lines[i]);
   396     /* the strings were handed to the tree or freed above */
   397     g_free(lines);
   398 }
   382 
   383 /*
   384  * read_etext:
   385  *
   386  * Read an etext returning a newly allocated string containing the file
   387  * contents or NULL on error.
   388  *
   389  * Contents that are valid UTF-8 are normalised; anything else is
   390  * converted from WINDOWS-1252. As a side effect the GLib print handler
   391  * (and, on Windows, the console output code page) is switched to match
   392  * the encoding that was detected. On error, err is set.
   393  */
   394 gchar *read_etext(const char *filename,GError **err)
   395 {
   396     GError *tmp_err=NULL;
   397     gchar *contents,*utf8;
   398     gsize len,bytes_read,bytes_written,i;
   399     int line,col;
   400     if (!g_file_get_contents(filename,&contents,&len,err))
   401 	return NULL;
   402     if (g_utf8_validate(contents,len,NULL))
   403     {
   404 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
   405 	g_set_print_handler(print_as_utf_8);
   406 #ifdef __WIN32__
   407 	SetConsoleOutputCP(CP_UTF8);
   408 #endif
   409     }
   410     else
   411     {
   412 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,
   413 	  &bytes_written,&tmp_err);
   414 	if (g_error_matches(tmp_err,G_CONVERT_ERROR,
   415 	  G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
   416 	{
   417 	    /*
   418 	     * bytes_read is the offset of the offending byte;
   419 	     * work out which line and column that is.
   420 	     */
   421 	    line=col=1;
   422 	    for(i=0;i<bytes_read;i++)
   423 		if (contents[i]=='\n')
   424 		{
   425 		    line++;
   426 		    col=1;
   427 		}
   428 		else if (contents[i]!='\r')
   429 		    col++;
   430 	    g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
   431 	      "Input conversion failed. Byte %d at line %d, column %d is not a "
   432 	      "valid Windows-1252 character",
   433 	      ((unsigned char *)contents)[bytes_read],line,col);
   434 	    /* superseded by the more specific error set above */
   435 	    g_error_free(tmp_err);
   436 	}
   437 	else if (tmp_err)
   438 	    g_propagate_error(err,tmp_err);
   439 	g_set_print_handler(print_as_windows_1252);
   440 #ifdef __WIN32__
   441 	SetConsoleOutputCP(1252);
   442 #endif
   443     }
   444     g_free(contents);
   445     return utf8;
   446 }
   436 
   437 /*
   438  * cleanup_on_exit:
   439  *
   440  * On Windows, put the console output code page back to the value held
   441  * in saved_cp. Does nothing on other platforms.
   442  */
   443 void cleanup_on_exit(void)
   444 {
   445 #if defined(__WIN32__)
   446     (void)SetConsoleOutputCP(saved_cp);
   447 #endif
   448 }
   443 
   444 /*
   445  * print_overview:
   446  *
   447  * Print how many lines were checked and the number of queries in each
   448  * category, followed by their total. Categories without any queries
   449  * are left out.
   450  */
   451 static void print_overview(void)
   452 {
   453     long total;
   454     total=cnt_quote+cnt_brack+cnt_bin+cnt_odd+cnt_long+cnt_short+cnt_punct+
   455       cnt_dash+cnt_word+cnt_html+cnt_lineend;
   456     g_print("    Checked %ld lines of %ld (head+foot = %ld)\n\n",
   457       checked_linecnt,linecnt,linecnt-checked_linecnt);
   458     g_print("    --------------- Queries found --------------\n");
   459     if (cnt_long)
   460 	g_print("    Long lines:		    %14ld\n",cnt_long);
   461     if (cnt_short)
   462 	g_print("    Short lines:		   %14ld\n",cnt_short);
   463     if (cnt_lineend)
   464 	g_print("    Line-end problems:	     %14ld\n",cnt_lineend);
   465     if (cnt_word)
   466 	g_print("    Common typos:		  %14ld\n",cnt_word);
   467     if (cnt_quote)
   468 	g_print("    Unmatched quotes:	      %14ld\n",cnt_quote);
   469     if (cnt_brack)
   470 	g_print("    Unmatched brackets:	    %14ld\n",cnt_brack);
   471     if (cnt_bin)
   472 	g_print("    Non-ASCII characters:	  %14ld\n",cnt_bin);
   473     if (cnt_odd)
   474 	g_print("    Proofing characters:	   %14ld\n",cnt_odd);
   475     if (cnt_punct)
   476 	g_print("    Punctuation & spacing queries: %14ld\n",cnt_punct);
   477     if (cnt_dash)
   478 	g_print("    Non-standard dashes:	   %14ld\n",cnt_dash);
   479     if (cnt_html)
   480 	g_print("    Possible HTML tags:	    %14ld\n",cnt_html);
   481     g_print("\n");
   482     g_print("    TOTAL QUERIES		  %14ld\n",total);
   483 }
   484 
   485 /*
   486  * main:
   487  *
   488  * Parse the options, optionally load the user typo list, check the
   489  * etext named by the first remaining argument and, in overview mode,
   490  * print the query counts.
   491  */
   492 int main(int argc,char **argv)
   493 {
   494 #ifdef __WIN32__
   495     /* restore the console code page however we leave the program */
   496     atexit(cleanup_on_exit);
   497     saved_cp=GetConsoleOutputCP();
   498 #endif
   499     running_from=g_path_get_dirname(argv[0]);
   500     parse_options(&argc,&argv);
   501     if (pswit[USERTYPO_SWITCH])
   502 	read_user_scannos();
   503     fputs("bookloupe: Check and report on an e-text\n",stderr);
   504     procfile(argv[1]);
   505     if (pswit[OVERVIEW_SWITCH])
   506 	print_overview();
   507     g_free(running_from);
   508     if (usertypo)
   509 	g_tree_unref(usertypo);
   510     /* releases the charset name and any open validator */
   511     set_charset(NULL,NULL);
   512     return 0;
   513 }
   494 
   495 /*
   496  * count_dashes:
   497  *
   498  * Look at every occurrence of dash in line and update results. Each
   499  * counter goes up by at most one per call:
   500  *   base         - the line contains dash at least once
   501  *   space        - some dash has white space on at least one side
   502  *   non_PG_space - some dash has white space on both sides
   503  *   PG_space     - some dash has white space on neither side
   504  * A dash at the very start or end of the line, or directly next to
   505  * another dash, counts as having no white space on that side.
   506  */
   507 void count_dashes(const char *line,const char *dash,
   508   struct dash_results *results)
   509 {
   510     int i;
   511     gchar **tokens;
   512     const gchar *before;
   513     gunichar pc,nc;
   514     gboolean pspace,nspace;
   515     gboolean spaced=FALSE,unspaced=FALSE,spaced2=FALSE;
   516     /* g_strsplit() returns an empty vector for an empty string */
   517     if (!*line)
   518 	return;
   519     tokens=g_strsplit(line,dash,0);
   520     if (tokens[1])
   521 	results->base++;
   522     for(i=1;tokens[i];i++)
   523     {
   524 	before=tokens[i-1];
   525 	/*
   526 	 * An empty token has no last character; stepping back from
   527 	 * its end would read in front of the string. Use NUL, which
   528 	 * is also what an empty following token yields for nc.
   529 	 */
   530 	if (*before)
   531 	    pc=g_utf8_get_char(g_utf8_prev_char(before+strlen(before)));
   532 	else
   533 	    pc=0;
   534 	nc=g_utf8_get_char(tokens[i]);
   535 	pspace=g_unichar_isspace(pc);
   536 	nspace=g_unichar_isspace(nc);
   537 	if (pspace || nspace)
   538 	    spaced=TRUE;
   539 	if (pspace && nspace)
   540 	    spaced2=TRUE;
   541 	else if (!pspace && !nspace)
   542 	    unspaced=TRUE;
   543     }
   544     if (spaced)
   545 	results->space++;
   546     if (spaced2)
   547 	/* count of lines with em-dashes with spaces both sides */
   548 	results->non_PG_space++;
   549     if (unspaced)
   550 	/* count of lines with PG-type em-dashes with no spaces */
   551 	results->PG_space++;
   552     g_strfreev(tokens);
   553 }
   528 
   529 /*
   530  * first_pass:
   531  *
   532  * Run a first pass - verify that it's a valid PG
   533  * file, decide whether to report some things that
   534  * occur many times in the text like long or short
   535  * lines, non-standard dashes, etc.
        *
        * etext is split into lines on '\n' and trailing '\r's are removed
        * from each line. The global counters linecnt and cnt_spacend are
        * updated as a side effect. The returned pointer refers to a static
        * structure, so the totals keep accumulating if the function is
        * called more than once.
   536  */
   537 struct first_pass_results *first_pass(const char *etext)
   538 {
   539     gunichar laststart=CHAR_SPACE;
   540     const char *s;
   541     gchar *lc_line;
   542     int i,j,lbytes,llen;
   543     gchar **lines;
   544     unsigned int lastlen=0,lastblen=0;
   545     long spline=0,nspline=0;
   546     static struct first_pass_results results={0};
   547     struct dash_results tmp_dash_results;
   548     gchar *inword;
   549     QuoteClass qc;
   550     lines=g_strsplit(etext,"\n",0);
   551     for (j=0;lines[j];j++)
   552     {
       	/* lbytes is the line length in bytes, llen in characters */
   553 	lbytes=strlen(lines[j]);
   554 	while (lbytes>0 && lines[j][lbytes-1]=='\r')
   555 	    lines[j][--lbytes]='\0';
   556 	llen=g_utf8_strlen(lines[j],lbytes);
   557 	linecnt++;
       	/* end of an old-style "SMALL PRINT" header */
   558 	if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
   559 	  (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
   560 	{
   561 	    if (spline)
   562 		g_print("   --> Duplicate header?\n");
   563 	    spline=linecnt+1;   /* first line of non-header text, that is */
   564 	}
       	/* new-style "*** START ... PROJECT GUTENBERG" header line */
   565 	if (!strncmp(lines[j],"*** START",9) &&
   566 	  strstr(lines[j],"PROJECT GUTENBERG"))
   567 	{
   568 	    if (nspline)
   569 		g_print("   --> Duplicate header?\n");
   570 	    nspline=linecnt+1;   /* first line of non-header text, that is */
   571 	}
       	/* once a header has been seen, look for the footer line */
   572 	if (spline || nspline)
   573 	{
   574 	    lc_line=g_utf8_strdown(lines[j],lbytes);
   575 	    if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
   576 	    {
       		/* "end" has to come before "project gutenberg" */
   577 		if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
   578 		{
   579 		    if (results.footerline)
   580 		    {
   581 			/* it's an old-form header - we can detect duplicates */
   582 			if (!nspline)
   583 			    g_print("   --> Duplicate footer?\n");
   584 		    }
   585 		    else
   586 			results.footerline=linecnt;
   587 		}
   588 	    }
   589 	    g_free(lc_line);
   590 	}
   591 	if (spline)
   592 	    results.firstline=spline;
   593 	if (nspline)
   594 	    results.firstline=nspline;  /* override with new */
   595 	if (results.footerline)
   596 	    continue;    /* don't count the boilerplate in the footer */
   597 	results.totlen+=llen;
       	/* per-character statistics: non-ASCII, alphabetic, endquotes */
   598 	for (s=lines[j];*s;s=g_utf8_next_char(s))
   599 	{
   600 	    if (g_utf8_get_char(s)>127)
   601 		results.binlen++;
   602 	    if (g_unichar_isalpha(g_utf8_get_char(s)))
   603 		results.alphalen++;
   604 	    if (s>lines[j])
   605 	    {
   606 		if (CHAR_IS_DQUOTE(g_utf8_get_char(s)))
   607 		    qc=QUOTE_CLASS(g_utf8_get_char(s));
   608 		else
   609 		    qc=INVALID_QUOTE;
       		/* closing or neutral double quote directly after a letter */
   610 		if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) &&
   611 		  g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
   612 		    results.endquote_count++;
   613 	    }
   614 	}
       	/*
       	 * lastlen is the length of the previous line and lastblen that
       	 * of the line before it: count a short previous line that
       	 * followed a long one and did not start with a space.
       	 */
   615 	if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
   616 	  lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
   617 	    results.shortline++;
       	/* last character is a space or control character */
   618 	if (lbytes>0 &&
   619 	  g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
   620 	    cnt_spacend++;
   621 	if (strstr(lines[j],".,"))
   622 	    results.dotcomma++;
   623 	/* only count ast lines for ignoring purposes where there is */
   624 	/* locase text on the line */
   625 	if (strchr(lines[j],'*'))
   626 	{
   627 	    for (s=lines[j];*s;s=g_utf8_next_char(s))
   628 		if (g_unichar_islower(g_utf8_get_char(s)))
   629 		    break;
   630 	    if (*s)
   631 		results.astline++;
   632 	}
   633 	if (strchr(lines[j],'/'))
   634 	    results.fslashline++;
   635 	if (lbytes>0)
   636 	{
       	    /* step back over trailing white space to the last visible char */
   637 	    for (s=g_utf8_prev_char(lines[j]+lbytes);
   638 	      s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;
   639 	      s=g_utf8_prev_char(s))
   640 		;
       	    /* a single hyphen (not "--") ending the line */
   641 	    if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
   642 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')
   643 		results.hyphens++;
   644 	}
   645 	if (llen>LONGEST_PG_LINE)
   646 	    results.longline++;
   647 	if (llen>WAY_TOO_LONG)
   648 	    results.verylongline++;
   649 	if (strchr(lines[j],'<') && strchr(lines[j],'>'))
   650 	{
       	    /* positive when the first '>' is not more than one byte before the first '<' */
   651 	    i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
   652 	    if (i>0)
   653 		results.htmcount++;
   654 	    if (strstr(lines[j],"<i>"))
   655 		results.htmcount+=4; /* bonus marks! */
   656 	}
   657 	/* Check for spaced em-dashes */
   658 	memset(&tmp_dash_results,0,sizeof(tmp_dash_results));
   659 	count_dashes(lines[j],"--",&tmp_dash_results);
   660 	count_dashes(lines[j],"—",&tmp_dash_results);
       	/* each total goes up by at most one per line */
   661 	if (tmp_dash_results.base)
   662 	    results.emdash.base++;
   663 	if (tmp_dash_results.non_PG_space)
   664 	    results.emdash.non_PG_space++;
   665 	if (tmp_dash_results.PG_space)
   666 	    results.emdash.PG_space++;
       	/* word-based counts: language hints and standalone 0/1 */
   667 	for (s=lines[j];*s;)
   668 	{
       	    /* getaword() advances s and returns a string that we free */
   669 	    inword=getaword(&s);
   670 	    if (!strcmp(inword,"hij") || !strcmp(inword,"niet")) 
   671 		results.Dutchcount++;
   672 	    if (!strcmp(inword,"dans") || !strcmp(inword,"avec")) 
   673 		results.Frenchcount++;
   674 	    if (!strcmp(inword,"0") || !strcmp(inword,"1")) 
   675 		results.standalone_digit++;
   676 	    g_free(inword);
   677 	}
   678 	/* Check for spaced dashes */
   679 	if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
   680 	    results.spacedash++;
       	/* remember this line for the short line test on the next one */
   681 	lastblen=lastlen;
   682 	lastlen=llen;
   683 	laststart=lines[j][0];	/* first byte of the line, not a decoded character */
   684     }
   685     g_strfreev(lines);
   686     return &results;
   687 }
   688 
   689 /*
   690  * report_first_pass:
   691  *
   692  * Make some snap decisions based on the first pass results.
   693  */
   694 struct warnings *report_first_pass(struct first_pass_results *results)
   695 {
   696     static struct warnings warnings={0};
   697     if (cnt_spacend>0)
   698 	g_print("   --> %ld lines in this file have white space at end\n",
   699 	  cnt_spacend);
   700     warnings.dotcomma=1;
   701     if (results->dotcomma>5)
   702     {
   703 	warnings.dotcomma=0;
   704 	g_print("   --> %ld lines in this file contain '.,'. "
   705 	  "Not reporting them.\n",results->dotcomma);
   706     }
   707     /*
   708      * If more than 50 lines, or one-tenth, are short,
   709      * don't bother reporting them.
   710      */
   711     warnings.shortline=1;
   712     if (results->shortline>50 || results->shortline*10>linecnt)
   713     {
   714 	warnings.shortline=0;
   715 	g_print("   --> %ld lines in this file are short. "
   716 	  "Not reporting short lines.\n",results->shortline);
   717     }
   718     /*
   719      * If more than 50 lines, or one-tenth, are long,
   720      * don't bother reporting them.
   721      */
   722     warnings.longline=1;
   723     if (results->longline>50 || results->longline*10>linecnt)
   724     {
   725 	warnings.longline=0;
   726 	g_print("   --> %ld lines in this file are long. "
   727 	  "Not reporting long lines.\n",results->longline);
   728     }
   729     /* If more than 10 lines contain asterisks, don't bother reporting them. */
   730     warnings.ast=1;
   731     if (results->astline>10)
   732     {
   733 	warnings.ast=0;
   734 	g_print("   --> %ld lines in this file contain asterisks. "
   735 	  "Not reporting them.\n",results->astline);
   736     }
   737     /*
   738      * If more than 10 lines contain forward slashes,
   739      * don't bother reporting them.
   740      */
   741     warnings.fslash=1;
   742     if (results->fslashline>10)
   743     {
   744 	warnings.fslash=0;
   745 	g_print("   --> %ld lines in this file contain forward slashes. "
   746 	  "Not reporting them.\n",results->fslashline);
   747     }
   748     /*
   749      * If more than 20 lines contain unpunctuated endquotes,
   750      * don't bother reporting them.
   751      */
   752     warnings.endquote=1;
   753     if (results->endquote_count>20)
   754     {
   755 	warnings.endquote=0;
   756 	g_print("   --> %ld lines in this file contain unpunctuated endquotes. "
   757 	  "Not reporting them.\n",results->endquote_count);
   758     }
   759     /*
   760      * If more than 10 lines contain standalone digits,
   761      * don't bother reporting them.
   762      */
   763     warnings.digit=1;
   764     if (results->standalone_digit>10)
   765     {
   766 	warnings.digit=0;
   767 	g_print("   --> %ld lines in this file contain standalone 0s and 1s. "
   768 	  "Not reporting them.\n",results->standalone_digit);
   769     }
   770     /*
   771      * If more than 20 lines contain hyphens at end,
   772      * don't bother reporting them.
   773      */
   774     warnings.hyphen=1;
   775     if (results->hyphens>20)
   776     {
   777 	warnings.hyphen=0;
   778 	g_print("   --> %ld lines in this file have hyphens at end. "
   779 	  "Not reporting them.\n",results->hyphens);
   780     }
   781     if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
   782     {
   783 	g_print("   --> Looks like this is HTML. Switching HTML mode ON.\n");
   784 	pswit[MARKUP_SWITCH]=1;
   785     }
   786     if (results->verylongline>0)
   787 	g_print("   --> %ld lines in this file are VERY long!\n",
   788 	  results->verylongline);
   789     /*
   790      * If there are more non-PG spaced dashes than PG em-dashes,
   791      * assume it's deliberate.
   792      * Current PG guidelines say don't use them, but older texts do,
   793      * and some people insist on them whatever the guidelines say.
   794      */
   795     warnings.dash=1;
   796     if (results->spacedash+results->emdash.non_PG_space>
   797       results->emdash.PG_space)
   798     {
   799 	warnings.dash=0;
   800 	g_print("   --> There are %ld spaced dashes and em-dashes. "
   801 	  "Not reporting them.\n",
   802 	  results->spacedash+results->emdash.non_PG_space);
   803     }
   804     if (charset)
   805 	warnings.bin=0;
   806     else
   807     {
   808 	/* Charset ISO_8859-1/ASCII checks for compatibility with gutcheck */
   809 	warnings.bin=1;
   810 	/* If more than a quarter of characters are hi-bit, bug out. */
   811 	if (results->binlen*4>results->totlen)
   812 	{
   813 	    g_print("   --> This file does not appear to be ASCII. "
   814 	      "Terminating. Best of luck with it!\n");
   815 	    exit(1);
   816 	}
   817 	if (results->alphalen*4<results->totlen)
   818 	{
   819 	    g_print("   --> This file does not appear to be text. "
   820 	      "Terminating. Best of luck with it!\n");
   821 	    exit(1);
   822 	}
   823 	if (results->binlen*100>results->totlen || results->binlen>100)
   824 	{
   825 	    g_print("   --> There are a lot of foreign letters here. "
   826 	      "Not reporting them.\n");
   827 	    if (!pswit[VERBOSE_SWITCH])
   828 		warnings.bin=0;
   829 	}
   830     }
   831     warnings.isDutch=FALSE;
   832     if (results->Dutchcount>50)
   833     {
   834 	warnings.isDutch=TRUE;
   835 	g_print("   --> This looks like Dutch - "
   836 	  "switching off dashes and warnings for 's Middags case.\n");
   837     }
   838     warnings.isFrench=FALSE;
   839     if (results->Frenchcount>50)
   840     {
   841 	warnings.isFrench=TRUE;
   842 	g_print("   --> This looks like French - "
   843 	  "switching off some doublepunct.\n");
   844     }
   845     if (results->firstline && results->footerline)
   846 	g_print("    The PG header and footer appear to be already on.\n");
   847     else
   848     {
   849 	if (results->firstline)
   850 	    g_print("    The PG header is on - no footer.\n");
   851 	if (results->footerline)
   852 	    g_print("    The PG footer is on - no header.\n");
   853     }
   854     g_print("\n");
   855     if (pswit[VERBOSE_SWITCH])
   856     {
   857 	warnings.shortline=1;
   858 	warnings.dotcomma=1;
   859 	warnings.longline=1;
   860 	warnings.dash=1;
   861 	warnings.digit=1;
   862 	warnings.ast=1;
   863 	warnings.fslash=1;
   864 	warnings.hyphen=1;
   865 	warnings.endquote=1;
   866 	g_print("   *** Verbose output is ON -- you asked for it! ***\n");
   867     }
   868     if (warnings.isDutch)
   869 	warnings.dash=0;
   870     if (results->footerline>0 && results->firstline>0 &&
   871       results->footerline>results->firstline &&
   872       results->footerline-results->firstline<100)
   873     {
   874 	g_print("   --> I don't really know where this text starts. \n");
   875 	g_print("       There are no reference points.\n");
   876 	g_print("       I'm going to have to report the header and footer "
   877 	  "as well.\n");
   878 	results->firstline=0;
   879     }
   880     return &warnings;
   881 }
   882 
   883 /*
   884  * analyse_quotes:
   885  *
   886  * Look along the line, accumulate the count of quotes, and see
   887  * if this is an empty line - i.e. a line with nothing on it
   888  * but spaces.
   889  * If line has just spaces, period, * and/or - on it, don't
   890  * count it, since empty lines with asterisks or dashes to
   891  * separate sections are common.
   892  *
   893  * Returns: TRUE if the line is empty.
   894  */
   895 gboolean analyse_quotes(const char *aline,struct counters *counters)
   896 {
   897     int guessquote=0;
   898     /* assume the line is empty until proven otherwise */
   899     gboolean isemptyline=TRUE;
   900     const char *s=aline,*sprev,*snext;
   901     gunichar c;
   902     sprev=NULL;
   903     GError *tmp_err=NULL;
   904     while (*s)
   905     {
   906 	snext=g_utf8_next_char(s);
   907 	c=g_utf8_get_char(s);
   908 	if (CHAR_IS_DQUOTE(c))
   909 	    (void)count_quote(counters,c,QUOTE_CLASS(c),&tmp_err);
   910 	else if (CHAR_IS_SQUOTE(c) && pswit[SQUOTE_SWITCH])
   911 	{
   912 	    if (s==aline)
   913 	    {
   914 		/*
   915 		 * At start of line, it can only be a quotation mark.
   916 		 * Hardcode a very common exception!
   917 		 */
   918 		if (!g_str_has_prefix(snext,"tis") &&
   919 		  !g_str_has_prefix(snext,"Tis"))
   920 		    (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
   921 	    }
   922 	    else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
   923 	      g_unichar_isalpha(g_utf8_get_char(snext)))
   924 		/* Do nothing! it's definitely an apostrophe, not a quote */
   925 		;
   926 	    /* it's outside a word - let's check it out */
   927 	    else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||
   928 	      g_unichar_isalpha(g_utf8_get_char(snext)))
   929 	    {
   930 		/* certainly looks like a quotation mark */
   931 		if (!g_str_has_prefix(snext,"tis") &&
   932 		  !g_str_has_prefix(snext,"Tis"))
   933 		    /* hardcode a very common exception! */
   934 		{
   935 		    if (strchr(".?!,;:",g_utf8_get_char(sprev)))
   936 			(void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
   937 		    else
   938 			(void)count_quote(counters,c,OPENING_QUOTE,&tmp_err);
   939 		}
   940 	    }
   941 	    else
   942 	    {
   943 		/* now - is it a quotation mark? */
   944 		guessquote=0;   /* accumulate clues */
   945 		if (g_unichar_isalpha(g_utf8_get_char(sprev)))
   946 		{
   947 		    /* it follows a letter - could be either */
   948 		    guessquote++;
   949 		    if (g_utf8_get_char(sprev)=='s')
   950 		    {
   951 			/* looks like a plural apostrophe */
   952 			guessquote-=3;
   953 			if (g_utf8_get_char(snext)==CHAR_SPACE)
   954 			    /* bonus marks! */
   955 			    guessquote-=2;
   956 		    }
   957 		    if (innermost_quote_matches(counters,c))
   958 			/*
   959 			 * Give it the benefit of some doubt,
   960 			 * if a squote is already open.
   961 			 */
   962 			guessquote++;
   963 		    else
   964 			guessquote--;
   965 		    if (guessquote>=0)
   966 			(void)count_quote(counters,c,CLOSING_QUOTE,&tmp_err);
   967 		}
   968 		else
   969 		    /* no adjacent letter - it must be a quote of some kind */
   970 		    (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
   971 	    }
   972 	}
   973 	if (tmp_err)
   974 	{
   975 	    if (pswit[ECHO_SWITCH])
   976 		g_print("\n%s\n",aline);
   977 	    if (!pswit[OVERVIEW_SWITCH])
   978 		g_print("    Line %ld column %ld - %s\n",
   979 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1,tmp_err->message);
   980 	    g_clear_error(&tmp_err);
   981 	}
   982 	if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
   983 	  c!='\r' && c!='\n')
   984 	    isemptyline=FALSE;  /* ignore lines like  *  *  *  as spacers */
   985 	if (c==CHAR_UNDERSCORE)
   986 	    counters->c_unders++;
   987 	if (c==CHAR_OPEN_SBRACK)
   988 	{
   989 	    if (!matching_difference(counters,COUNTER_ILLUSTRATION) &&
   990 	      !matching_difference(counters,c) && s==aline &&
   991 	      g_str_has_prefix(s,"[Illustration:"))
   992 		increment_matching(counters,COUNTER_ILLUSTRATION,TRUE);
   993 	    else
   994 		increment_matching(counters,c,TRUE);
   995 	}
   996 	else if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK)
   997 	    increment_matching(counters,c,TRUE);
   998 	if (c==CHAR_CLOSE_SBRACK)
   999 	{
  1000 	    if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) &&
  1001 	      !matching_difference(counters,c) && !*snext)
  1002 		increment_matching(counters,COUNTER_ILLUSTRATION,FALSE);
  1003 	    else
  1004 		increment_matching(counters,c,FALSE);
  1005 	}
  1006 	else if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK)
  1007 	    increment_matching(counters,c,FALSE);
  1008 	sprev=s;
  1009 	s=snext;
  1010     }
  1011     return isemptyline;
  1012 }
  1013 
  1014 /*
  1015  * check_for_control_characters:
  1016  *
  1017  * Check for invalid or questionable characters in the line
  1018  * Anything above 127 is invalid for plain ASCII, and
  1019  * non-printable control characters should also be flagged.
  1020  * Tabs should generally not be there.
  1021  */
  1022 void check_for_control_characters(const char *aline)
  1023 {
  1024     gunichar c;
  1025     const char *s;
  1026     for (s=aline;*s;s=g_utf8_next_char(s))
  1027     {
  1028 	c=g_utf8_get_char(s);
  1029 	if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
  1030 	{
  1031 	    if (pswit[ECHO_SWITCH])
  1032 		g_print("\n%s\n",aline);
  1033 	    if (!pswit[OVERVIEW_SWITCH])
  1034 		g_print("    Line %ld column %ld - Control character %u\n",
  1035 		  linecnt,g_utf8_pointer_to_offset(s,aline)+1,c);
  1036 	    else
  1037 		cnt_bin++;
  1038 	}
  1039     }
  1040 }
  1041 
  1042 /*
  1043  * check_for_odd_characters:
  1044  *
  1045  * Check for binary and other odd characters.
  1046  */
  1047 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
  1048   gboolean isemptyline)
  1049 {
  1050     /* Don't repeat multiple warnings on one line. */
  1051     gboolean eInvalidChar=FALSE,eTab=FALSE,eTilde=FALSE;
  1052     gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
  1053     const char *s;
  1054     gunichar c;
  1055     gsize nb;
  1056     gchar *t;
  1057     for (s=aline;*s;s=g_utf8_next_char(s))
  1058     {
  1059 	c=g_utf8_get_char(s);
  1060 	if (warnings->bin && !eInvalidChar &&
  1061 	  (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
  1062 	{
  1063 	    if (pswit[ECHO_SWITCH])
  1064 		g_print("\n%s\n",aline);
  1065 	    if (!pswit[OVERVIEW_SWITCH])
  1066 		if (c>127 && c<160 || c>255)
  1067 		    g_print("    Line %ld column %ld - "
  1068 		      "Non-ISO-8859 character %u\n",
  1069 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1070 		else
  1071 		    g_print("    Line %ld column %ld - "
  1072 		      "Non-ASCII character %u\n",
  1073 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1074 	    else
  1075 		cnt_bin++;
  1076 	    eInvalidChar=TRUE;
  1077 	}
  1078 	if (!eInvalidChar && charset)
  1079 	{
  1080 	    if (charset_validator==(GIConv)-1)
  1081 	    {
  1082 		if (!g_unichar_isdefined(c))
  1083 		{
  1084 		    if (pswit[ECHO_SWITCH])
  1085 			g_print("\n%s\n",aline);
  1086 		    if (!pswit[OVERVIEW_SWITCH])
  1087 			g_print("    Line %ld column %ld - Unassigned UNICODE "
  1088 			  "code point U+%04" G_GINT32_MODIFIER "X\n",
  1089 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1090 		    else
  1091 			cnt_bin++;
  1092 		    eInvalidChar=TRUE;
  1093 		}
  1094 		else if (c>=0xE000 && c<=0xF8FF || c>=0xF0000 && c<=0xFFFFD ||
  1095 		  c>=100000 && c<=0x10FFFD)
  1096 		{
  1097 		    if (pswit[ECHO_SWITCH])
  1098 			g_print("\n%s\n",aline);
  1099 		    if (!pswit[OVERVIEW_SWITCH])
  1100 			g_print("    Line %ld column %ld - Private Use "
  1101 			  "character U+%04" G_GINT32_MODIFIER "X\n",
  1102 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1103 		    else
  1104 			cnt_bin++;
  1105 		    eInvalidChar=TRUE;
  1106 		}
  1107 	    }
  1108 	    else
  1109 	    {
  1110 		t=g_convert_with_iconv(s,g_utf8_next_char(s)-s,
  1111 		  charset_validator,NULL,&nb,NULL);
  1112 		if (t)
  1113 		    g_free(t);
  1114 		else
  1115 		{
  1116 		    if (pswit[ECHO_SWITCH])
  1117 			g_print("\n%s\n",aline);
  1118 		    if (!pswit[OVERVIEW_SWITCH])
  1119 			g_print("    Line %ld column %ld - Non-%s "
  1120 			  "character %u\n",linecnt,
  1121 			  g_utf8_pointer_to_offset(aline,s)+1,charset,c);
  1122 		    else
  1123 			cnt_bin++;
  1124 		    eInvalidChar=TRUE;
  1125 		}
  1126 	    }
  1127 	}
  1128 	if (!eTab && c==CHAR_TAB)
  1129 	{
  1130 	    if (pswit[ECHO_SWITCH])
  1131 		g_print("\n%s\n",aline);
  1132 	    if (!pswit[OVERVIEW_SWITCH])
  1133 		g_print("    Line %ld column %ld - Tab character?\n",
  1134 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1135 	    else
  1136 		cnt_odd++;
  1137 	    eTab=TRUE;
  1138 	}
  1139 	if (!eTilde && c==CHAR_TILDE)
  1140 	{
  1141 	    /*
  1142 	     * Often used by OCR software to indicate an
  1143 	     * unrecognizable character.
  1144 	     */
  1145 	    if (pswit[ECHO_SWITCH])
  1146 		g_print("\n%s\n",aline);
  1147 	    if (!pswit[OVERVIEW_SWITCH])
  1148 		g_print("    Line %ld column %ld - Tilde character?\n",
  1149 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1150 	    else
  1151 		cnt_odd++;
  1152 	    eTilde=TRUE;
  1153 	}
  1154 	if (!eCarat && c==CHAR_CARAT)
  1155 	{  
  1156 	    if (pswit[ECHO_SWITCH])
  1157 		g_print("\n%s\n",aline);
  1158 	    if (!pswit[OVERVIEW_SWITCH])
  1159 		g_print("    Line %ld column %ld - Carat character?\n",
  1160 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1161 	    else
  1162 		cnt_odd++;
  1163 	    eCarat=TRUE;
  1164 	}
  1165 	if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
  1166 	{  
  1167 	    if (pswit[ECHO_SWITCH])
  1168 		g_print("\n%s\n",aline);
  1169 	    if (!pswit[OVERVIEW_SWITCH])
  1170 		g_print("    Line %ld column %ld - Forward slash?\n",
  1171 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1172 	    else
  1173 		cnt_odd++;
  1174 	    eFSlash=TRUE;
  1175 	}
  1176 	/*
  1177 	 * Report asterisks only in paranoid mode,
  1178 	 * since they're often deliberate.
  1179 	 */
  1180 	if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
  1181 	  c==CHAR_ASTERISK)
  1182 	{
  1183 	    if (pswit[ECHO_SWITCH])
  1184 		g_print("\n%s\n",aline);
  1185 	    if (!pswit[OVERVIEW_SWITCH])
  1186 		g_print("    Line %ld column %ld - Asterisk?\n",
  1187 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1188 	    else
  1189 		cnt_odd++;
  1190 	    eAst=TRUE;
  1191 	}
  1192     }
  1193 }
  1194 
  1195 /*
  1196  * check_for_long_line:
  1197  *
  1198  * Check for line too long.
  1199  */
  1200 void check_for_long_line(const char *aline)
  1201 {
  1202     if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
  1203     {
  1204 	if (pswit[ECHO_SWITCH])
  1205 	    g_print("\n%s\n",aline);
  1206 	if (!pswit[OVERVIEW_SWITCH])
  1207 	    g_print("    Line %ld column %ld - Long line %ld\n",
  1208 	      linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
  1209 	else
  1210 	    cnt_long++;
  1211     }
  1212 }
  1213 
  1214 /*
  1215  * check_for_short_line:
  1216  *
  1217  * Check for line too short.
  1218  *
  1219  * This one is a bit trickier to implement: we don't want to
  1220  * flag the last line of a paragraph for being short, so we
  1221  * have to wait until we know that our current line is a
  1222  * "normal" line, then report the _previous_ line if it was too
  1223  * short. We also don't want to report indented lines like
  1224  * chapter heads or formatted quotations. We therefore keep
  1225  * last->len as the length of the last line examined, and
  1226  * last->blen as the length of the last but one, and try to
  1227  * suppress unnecessary warnings by checking that both were of
  1228  * "normal" length. We keep the first character of the last
  1229  * line in last->start, and if it was a space, we assume that
  1230  * the formatting is deliberate. I can't figure out a way to
  1231  * distinguish something like a quoted verse left-aligned or
  1232  * the header or footer of a letter from a paragraph of short
  1233  * lines - maybe if I examined the whole paragraph, and if the
  1234  * para has less than, say, 8 lines and if all lines are short,
  1235  * then just assume it's OK? Need to look at some texts to see
  1236  * how often a formula like this would get the right result.
  1237  */
  1238 void check_for_short_line(const char *aline,const struct line_properties *last)
  1239 {
  1240     if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
  1241       last->len<SHORTEST_PG_LINE && last->blen>1 &&
  1242       last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
  1243     {
  1244 	if (pswit[ECHO_SWITCH])
  1245 	    g_print("\n%s\n",prevline);
  1246 	if (!pswit[OVERVIEW_SWITCH])
  1247 	    g_print("    Line %ld column %ld - Short line %ld?\n",
  1248 	      linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
  1249 	else
  1250 	    cnt_short++;
  1251     }
  1252 }
  1253 
  1254 /*
  1255  * check_for_starting_punctuation:
  1256  *
  1257  * Look for punctuation other than full ellipses at start of line.
  1258  */
  1259 void check_for_starting_punctuation(const char *aline)
  1260 {
  1261     if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
  1262       !g_str_has_prefix(aline,". . ."))
  1263     {
  1264 	if (pswit[ECHO_SWITCH])
  1265 	    g_print("\n%s\n",aline);
  1266 	if (!pswit[OVERVIEW_SWITCH])
  1267 	    g_print("    Line %ld column 1 - Begins with punctuation?\n",
  1268 	      linecnt);
  1269 	else
  1270 	    cnt_punct++;
  1271     }
  1272 }
  1273 
  1274 /*
  1275  * str_emdash:
  1276  *
  1277  * Find the first em-dash, return a pointer to it and set <next> to the
  1278  * character following the dash.
  1279  */
  1280 char *str_emdash(const char *s,const char **next)
  1281 {
  1282     const char *s1,*s2;
  1283     s1=strstr(s,"--");
  1284     s2=strstr(s,"—");
  1285     if (!s1)
  1286     {
  1287 	if (s2)
  1288 	    *next=g_utf8_next_char(s2);
  1289 	return (char *)s2;
  1290     }
  1291     else if (!s2)
  1292     {
  1293 	*next=g_utf8_next_char(g_utf8_next_char(s1));
  1294 	return (char *)s1;
  1295     }
  1296     else if (s1<s2)
  1297     {
  1298 	*next=g_utf8_next_char(g_utf8_next_char(s1));
  1299 	return (char *)s1;
  1300     }
  1301     else
  1302     {
  1303 	*next=g_utf8_next_char(s2);
  1304 	return (char *)s2;
  1305     }
  1306 }
  1307 
  1308 /*
  1309  * check_for_spaced_emdash:
  1310  *
  1311  * Check for spaced em-dashes.
  1312  *
  1313  * We must check _all_ occurrences of em-dashes on the line
  1314  * hence the loop - even if the first dash is OK
  1315  * there may be another that's wrong later on.
  1316  */
  1317 void check_for_spaced_emdash(const char *aline)
  1318 {
  1319     const char *s,*t,*next;
  1320     for (s=aline;t=str_emdash(s,&next);s=next)
  1321     {
  1322 	if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
  1323 	  g_utf8_get_char(next)==CHAR_SPACE)
  1324 	{
  1325 	    if (pswit[ECHO_SWITCH])
  1326 		g_print("\n%s\n",aline);
  1327 	    if (!pswit[OVERVIEW_SWITCH])
  1328 		g_print("    Line %ld column %ld - Spaced em-dash?\n",
  1329 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  1330 	    else
  1331 		cnt_dash++;
  1332 	}
  1333     }
  1334 }
  1335 
  1336 /*
  1337  * check_for_spaced_dash:
  1338  *
  1339  * Check for spaced dashes.
  1340  */
  1341 void check_for_spaced_dash(const char *aline)
  1342 {
  1343     const char *s;
  1344     if ((s=strstr(aline," -")))
  1345     {
  1346 	if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
  1347 	{
  1348 	    if (pswit[ECHO_SWITCH])
  1349 		g_print("\n%s\n",aline);
  1350 	    if (!pswit[OVERVIEW_SWITCH])
  1351 		g_print("    Line %ld column %ld - Spaced dash?\n",
  1352 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1353 	    else
  1354 		cnt_dash++;
  1355 	}
  1356     }
  1357     else if ((s=strstr(aline,"- ")))
  1358     {
  1359 	if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
  1360 	{
  1361 	    if (pswit[ECHO_SWITCH])
  1362 		g_print("\n%s\n",aline);
  1363 	    if (!pswit[OVERVIEW_SWITCH])
  1364 		g_print("    Line %ld column %ld - Spaced dash?\n",
  1365 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1366 	    else
  1367 		cnt_dash++;
  1368 	}
  1369     }
  1370 }
  1371 
  1372 /*
  1373  * check_for_unmarked_paragraphs:
  1374  *
  1375  * Check for unmarked paragraphs indicated by separate speakers.
  1376  *
  1377  * May well be false positive:
  1378  * "Bravo!" "Wonderful!" called the crowd.
  1379  * but useful all the same.
  1380  */
  1381 void check_for_unmarked_paragraphs(const char *aline)
  1382 {
  1383     const char *s;
  1384     s=strstr(aline,"\"  \"");
  1385     if (!s)
  1386 	s=strstr(aline,"\" \"");
  1387     if (s)
  1388     {
  1389 	if (pswit[ECHO_SWITCH])
  1390 	    g_print("\n%s\n",aline);
  1391 	if (!pswit[OVERVIEW_SWITCH])
  1392 	    g_print("    Line %ld column %ld - "
  1393 	      "Query missing paragraph break?\n",
  1394 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1395 	else
  1396 	    cnt_punct++;
  1397     }
  1398 }
  1399 
  1400 /*
  1401  * check_for_jeebies:
  1402  *
  1403  * Check for "to he" and other easy h/b errors.
  1404  *
  1405  * This is a very inadequate effort on the h/b problem,
  1406  * but the phrase "to he" is always an error, whereas "to
  1407  * be" is quite common.
  1408  * Similarly, '"Quiet!", be said.' is a non-be error
  1409  * "to he" is _not_ always an error!:
  1410  *       "Where they went to he couldn't say."
  1411  * Another false positive:
  1412  *       What would "Cinderella" be without the . . .
  1413  * and another: "If he wants to he can see for himself."
  1414  */
  1415 void check_for_jeebies(const char *aline)
  1416 {
  1417     const char *s;
  1418     s=strstr(aline," be could ");
  1419     if (!s)
  1420 	s=strstr(aline," be would ");
  1421     if (!s)
  1422 	s=strstr(aline," was be ");
  1423     if (!s)
  1424 	s=strstr(aline," be is ");
  1425     if (!s)
  1426 	s=strstr(aline," is be ");
  1427     if (!s)
  1428 	s=strstr(aline,"\", be ");
  1429     if (!s)
  1430 	s=strstr(aline,"\" be ");
  1431     if (!s)
  1432 	s=strstr(aline,"\" be ");
  1433     if (!s)
  1434 	s=strstr(aline," to he ");
  1435     if (s)
  1436     {
  1437 	if (pswit[ECHO_SWITCH])
  1438 	    g_print("\n%s\n",aline);
  1439 	if (!pswit[OVERVIEW_SWITCH])
  1440 	    g_print("    Line %ld column %ld - Query he/be error?\n",
  1441 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1442 	else
  1443 	    cnt_word++;
  1444     }
  1445     s=strstr(aline," the had ");
  1446     if (!s)
  1447 	s=strstr(aline," a had ");
  1448     if (!s)
  1449 	s=strstr(aline," they bad ");
  1450     if (!s)
  1451 	s=strstr(aline," she bad ");
  1452     if (!s)
  1453 	s=strstr(aline," he bad ");
  1454     if (!s)
  1455 	s=strstr(aline," you bad ");
  1456     if (!s)
  1457 	s=strstr(aline," i bad ");
  1458     if (s)
  1459     {
  1460 	if (pswit[ECHO_SWITCH])
  1461 	    g_print("\n%s\n",aline);
  1462 	if (!pswit[OVERVIEW_SWITCH])
  1463 	    g_print("    Line %ld column %ld - Query had/bad error?\n",
  1464 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1465 	else
  1466 	    cnt_word++;
  1467     }
  1468     s=strstr(aline,"; hut ");
  1469     if (!s)
  1470 	s=strstr(aline,", hut ");
  1471     if (s)
  1472     {
  1473 	if (pswit[ECHO_SWITCH])
  1474 	    g_print("\n%s\n",aline);
  1475 	if (!pswit[OVERVIEW_SWITCH])
  1476 	    g_print("    Line %ld column %ld - Query hut/but error?\n",
  1477 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1478 	else
  1479 	    cnt_word++;
  1480     }
  1481 }
  1482 
  1483 /*
  1484  * check_for_mta_from:
  1485  *
  1486  * Special case - angled bracket in front of "From" placed there by an
  1487  * MTA when sending an e-mail.
  1488  */
  1489 void check_for_mta_from(const char *aline)
  1490 {
  1491     const char *s;
  1492     s=strstr(aline,">From");
  1493     if (s)
  1494     {
  1495 	if (pswit[ECHO_SWITCH])
  1496 	    g_print("\n%s\n",aline);
  1497 	if (!pswit[OVERVIEW_SWITCH])
  1498 	    g_print("    Line %ld column %ld - "
  1499 	      "Query angled bracket with From\n",
  1500 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1501 	else
  1502 	    cnt_punct++;
  1503     }
  1504 }
  1505 
  1506 /*
  1507  * check_for_orphan_character:
  1508  *
  1509  * Check for a single character line -
  1510  * often an overflow from bad wrapping.
  1511  */
  1512 void check_for_orphan_character(const char *aline)
  1513 {
  1514     gunichar c;
  1515     c=g_utf8_get_char(aline);
  1516     if (c && !*g_utf8_next_char(aline))
  1517     {
  1518 	if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
  1519 	    ; /* Nothing - ignore numerals alone on a line. */
  1520 	else
  1521 	{
  1522 	    if (pswit[ECHO_SWITCH])
  1523 		g_print("\n%s\n",aline);
  1524 	    if (!pswit[OVERVIEW_SWITCH])
  1525 		g_print("    Line %ld column 1 - Query single character line\n",
  1526 		  linecnt);
  1527 	    else
  1528 		cnt_punct++;
  1529 	}
  1530     }
  1531 }
  1532 
  1533 /*
  1534  * check_for_pling_scanno:
  1535  *
  1536  * Check for I" - often should be !
  1537  */
  1538 void check_for_pling_scanno(const char *aline)
  1539 {
  1540     const char *s;
  1541     s=strstr(aline," I\"");
  1542     if (s)
  1543     {
  1544 	if (pswit[ECHO_SWITCH])
  1545 	    g_print("\n%s\n",aline);
  1546 	if (!pswit[OVERVIEW_SWITCH])
  1547 	    g_print("    Line %ld column %ld - Query I=exclamation mark?\n",
  1548 	      linecnt,g_utf8_pointer_to_offset(aline,s));
  1549 	else
  1550 	    cnt_punct++;
  1551     }
  1552 }
  1553 
  1554 /*
  1555  * check_for_extra_period:
  1556  *
  1557  * Check for period without a capital letter. Cut-down from gutspell.
  1558  * Only works when it happens on a single line.
  1559  */
  1560 void check_for_extra_period(const char *aline,const struct warnings *warnings)
  1561 {
  1562     const char *s,*t,*s1,*sprev;
  1563     int i;
  1564     gsize len;
  1565     gboolean istypo;
  1566     gchar *testword;
  1567     gunichar c,nc,pc,*decomposition;
  1568     if (pswit[PARANOID_SWITCH])
  1569     {
  1570 	for (t=aline;t=strstr(t,". ");)
  1571 	{
  1572 	    if (t==aline)
  1573 	    {
  1574 		t=g_utf8_next_char(t);
  1575 		/* start of line punctuation is handled elsewhere */
  1576 		continue;
  1577 	    }
  1578 	    if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
  1579 	    {
  1580 		t=g_utf8_next_char(t);
  1581 		continue;
  1582 	    }
  1583 	    if (warnings->isDutch)
  1584 	    {
  1585 		/* For Frank & Jeroen -- 's Middags case */
  1586 		gunichar c2,c3,c4,c5;
  1587 		c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
  1588 		c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
  1589 		c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
  1590 		c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
  1591 		if (CHAR_IS_APOSTROPHE(c2) &&
  1592 		  g_unichar_islower(c3) && c4==CHAR_SPACE &&
  1593 		  g_unichar_isupper(c5))
  1594 		{
  1595 		    t=g_utf8_next_char(t);
  1596 		    continue;
  1597 		}
  1598 	    }
  1599 	    s1=g_utf8_next_char(g_utf8_next_char(t));
  1600 	    while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
  1601 	      !g_unichar_isdigit(g_utf8_get_char(s1)))
  1602 		s1=g_utf8_next_char(s1);
  1603 	    if (g_unichar_islower(g_utf8_get_char(s1)))
  1604 	    {
  1605 		/* we have something to investigate */
  1606 		istypo=TRUE;
  1607 		/* so let's go back and find out */
  1608 		nc=g_utf8_get_char(t);
  1609 		s1=g_utf8_prev_char(t);
  1610 		c=g_utf8_get_char(s1);
  1611 		sprev=g_utf8_prev_char(s1);
  1612 		pc=g_utf8_get_char(sprev);
  1613 		while (s1>=aline &&
  1614 		  (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||
  1615 		  g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&
  1616 		  g_unichar_isalpha(nc)))
  1617 		{
  1618 		    nc=c;
  1619 		    s1=sprev;
  1620 		    c=pc;
  1621 		    sprev=g_utf8_prev_char(s1);
  1622 		    pc=g_utf8_get_char(sprev);
  1623 		}
  1624 		s1=g_utf8_next_char(s1);
  1625 		s=strchr(s1,'.');
  1626 		if (s)
  1627 		    testword=g_strndup(s1,s-s1);
  1628 		else
  1629 		    testword=g_strdup(s1);
  1630 		for (i=0;*abbrev[i];i++)
  1631 		    if (!strcmp(testword,abbrev[i]))
  1632 			istypo=FALSE;
  1633 		if (g_unichar_isdigit(g_utf8_get_char(testword)))
  1634 		    istypo=FALSE;
  1635 		if (!*g_utf8_next_char(testword))
  1636 		    istypo=FALSE;
  1637 		if (isroman(testword))
  1638 		    istypo=FALSE;
  1639 		if (istypo)
  1640 		{
  1641 		    istypo=FALSE;
  1642 		    for (s=testword;*s;s=g_utf8_next_char(s))
  1643 		    {
  1644 			decomposition=g_unicode_canonical_decomposition(
  1645 			  g_utf8_get_char(s),&len);
  1646 			if (g_utf8_strchr("aeiou",-1,decomposition[0]))
  1647 			    istypo=TRUE;
  1648 			g_free(decomposition);
  1649 		    }
  1650 		}
  1651 		if (istypo &&
  1652 		  (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
  1653 		{
  1654 		    g_tree_insert(qperiod,g_strdup(testword),
  1655 		      GINT_TO_POINTER(1));
  1656 		    if (pswit[ECHO_SWITCH])
  1657 			g_print("\n%s\n",aline);
  1658 		    if (!pswit[OVERVIEW_SWITCH])
  1659 			g_print("    Line %ld column %ld - Extra period?\n",
  1660 			  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  1661 		    else
  1662 			cnt_punct++;
  1663 		}
  1664 		g_free(testword);
  1665 	    }
  1666 	    t=g_utf8_next_char(t);
  1667 	}
  1668     }
  1669 }
  1670 
  1671 /*
  1672  * check_for_following_punctuation:
  1673  *
  1674  * Check for words usually not followed by punctuation.
  1675  */
  1676 void check_for_following_punctuation(const char *aline)
  1677 {
  1678     int i;
  1679     const char *s,*wordstart;
  1680     gunichar c;
  1681     gchar *inword,*t;
  1682     if (pswit[TYPO_SWITCH])
  1683     {
  1684 	for (s=aline;*s;)
  1685 	{
  1686 	    wordstart=s;
  1687 	    t=getaword(&s);
  1688 	    if (!*t)
  1689 	    {
  1690 		g_free(t);
  1691 		continue;
  1692 	    }
  1693 	    inword=g_utf8_strdown(t,-1);
  1694 	    g_free(t);
  1695 	    for (i=0;*nocomma[i];i++)
  1696 		if (!strcmp(inword,nocomma[i]))
  1697 		{
  1698 		    c=g_utf8_get_char(s);
  1699 		    if (c==',' || c==';' || c==':')
  1700 		    {
  1701 			if (pswit[ECHO_SWITCH])
  1702 			    g_print("\n%s\n",aline);
  1703 			if (!pswit[OVERVIEW_SWITCH])
  1704 			    g_print("    Line %ld column %ld - "
  1705 			      "Query punctuation after %s?\n",
  1706 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,
  1707 			      inword);
  1708 			else
  1709 			    cnt_punct++;
  1710 		    }
  1711 		}
  1712 	    for (i=0;*noperiod[i];i++)
  1713 		if (!strcmp(inword,noperiod[i]))
  1714 		{
  1715 		    c=g_utf8_get_char(s);
  1716 		    if (c=='.' || c=='!')
  1717 		    {
  1718 			if (pswit[ECHO_SWITCH])
  1719 			    g_print("\n%s\n",aline);
  1720 			if (!pswit[OVERVIEW_SWITCH])
  1721 			    g_print("    Line %ld column %ld - "
  1722 			      "Query punctuation after %s?\n",
  1723 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,
  1724 			      inword);
  1725 			else
  1726 			    cnt_punct++;
  1727 		    }
  1728 		}
  1729 	    g_free(inword);
  1730 	}
  1731     }
  1732 }
  1733 
  1734 /*
  1735  * check_for_typos:
  1736  *
  1737  * Check for commonly mistyped words,
  1738  * and digits like 0 for O in a word.
  1739  */
  1740 void check_for_typos(const char *aline,struct warnings *warnings)
  1741 {
  1742     const char *s,*t,*nt,*wordstart;
  1743     gchar *inword;
  1744     gunichar *decomposition;
  1745     gchar *testword;
  1746     int i,vowel,consonant,*dupcnt;
  1747     gboolean isdup,istypo,alower;
  1748     gunichar c,pc;
  1749     long offset,len;
  1750     gsize decomposition_len;
  1751     for (s=aline;*s;)
  1752     {
  1753 	wordstart=s;
  1754 	inword=getaword(&s);
  1755 	if (!*inword)
  1756 	{
  1757 	    g_free(inword);
  1758 	    continue; /* don't bother with empty lines */
  1759 	}
  1760 	if (mixdigit(inword))
  1761 	{
  1762 	    if (pswit[ECHO_SWITCH])
  1763 		g_print("\n%s\n",aline);
  1764 	    if (!pswit[OVERVIEW_SWITCH])
  1765 		g_print("    Line %ld column %ld - Query digit in %s\n",
  1766 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
  1767 	    else
  1768 		cnt_word++;
  1769 	}
  1770 	/*
  1771 	 * Put the word through a series of tests for likely typos and OCR
  1772 	 * errors.
  1773 	 */
  1774 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
  1775 	{
  1776 	    istypo=FALSE;
  1777 	    alower=FALSE;
  1778 	    for (t=inword;*t;t=g_utf8_next_char(t))
  1779 	    {
  1780 		c=g_utf8_get_char(t);
  1781 		nt=g_utf8_next_char(t);
  1782 		/* lowercase for testing */
  1783 		if (g_unichar_islower(c))
  1784 		    alower=TRUE;
  1785 		if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
  1786 		{
  1787 		    /*
  1788 		     * We have an uppercase mid-word. However, there are
  1789 		     * common cases:
  1790 		     *   Mac and Mc like McGill
  1791 		     *   French contractions like l'Abbe
  1792 		     */
  1793 		    offset=g_utf8_pointer_to_offset(inword,t);
  1794 		    if (offset>0)
  1795 			pc=g_utf8_get_char(g_utf8_prev_char(t));
  1796 		    else
  1797 			pc='\0';
  1798 		    if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
  1799 		      offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
  1800 		      g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
  1801 		      CHAR_IS_APOSTROPHE(pc))
  1802 			; /* do nothing! */
  1803 		    else
  1804 			istypo=TRUE;
  1805 		}
  1806 	    }
  1807 	    testword=g_utf8_casefold(inword,-1);
  1808 	}
  1809 	if (pswit[TYPO_SWITCH])
  1810 	{
  1811 	    /*
  1812 	     * Check for certain unlikely two-letter combinations at word
  1813 	     * start and end.
  1814 	     */
  1815 	    len=g_utf8_strlen(testword,-1);
  1816 	    if (len>1)
  1817 	    {
  1818 		for (i=0;*nostart[i];i++)
  1819 		    if (g_str_has_prefix(testword,nostart[i]))
  1820 			istypo=TRUE;
  1821 		for (i=0;*noend[i];i++)
  1822 		    if (g_str_has_suffix(testword,noend[i]))
  1823 			istypo=TRUE;
  1824 	    }
  1825 	    /* ght is common, gbt never. Like that. */
  1826 	    if (strstr(testword,"cb"))
  1827 		istypo=TRUE;
  1828 	    if (strstr(testword,"gbt"))
  1829 		istypo=TRUE;
  1830 	    if (strstr(testword,"pbt"))
  1831 		istypo=TRUE;
  1832 	    if (strstr(testword,"tbs"))
  1833 		istypo=TRUE;
  1834 	    if (strstr(testword,"mrn"))
  1835 		istypo=TRUE;
  1836 	    if (strstr(testword,"ahle"))
  1837 		istypo=TRUE;
  1838 	    if (strstr(testword,"ihle"))
  1839 		istypo=TRUE;
  1840 	    /*
  1841 	     * "TBE" does happen - like HEARTBEAT - but uncommon.
  1842 	     * Also "TBI" - frostbite, outbid - but uncommon.
  1843 	     * Similarly "ii" like Hawaii, or Pompeii, and in Roman
  1844 	     * numerals, but "ii" is a common scanno.
  1845 	     */
  1846 	    if (strstr(testword,"tbi"))
  1847 		istypo=TRUE;
  1848 	    if (strstr(testword,"tbe"))
  1849 		istypo=TRUE;
  1850 	    if (strstr(testword,"ii"))
  1851 		istypo=TRUE;
  1852 	    /*
  1853 	     * Check for no vowels or no consonants.
  1854 	     * If none, flag a typo.
  1855 	     */
  1856 	    if (!istypo && len>1)
  1857 	    {
  1858 		vowel=consonant=0;
  1859 		for (t=testword;*t;t=g_utf8_next_char(t))
  1860 		{
  1861 		    c=g_utf8_get_char(t);
  1862 		    decomposition=
  1863 		      g_unicode_canonical_decomposition(c,&decomposition_len);
  1864 		    if (c=='y' || g_unichar_isdigit(c))
  1865 		    {
  1866 			/* Yah, this is loose. */
  1867 			vowel++;
  1868 			consonant++;
  1869 		    }
  1870 		    else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
  1871 			vowel++;
  1872 		    else
  1873 			consonant++;
  1874 		    g_free(decomposition);
  1875 		}
  1876 		if (!vowel || !consonant)
  1877 		    istypo=TRUE;
  1878 	    }
  1879 	    /*
  1880 	     * Now exclude the word from being reported if it's in
  1881 	     * the okword list.
  1882 	     */
  1883 	    for (i=0;*okword[i];i++)
  1884 		if (!strcmp(testword,okword[i]))
  1885 		    istypo=FALSE;
  1886 	    /*
  1887 	     * What looks like a typo may be a Roman numeral.
  1888 	     * Exclude these.
  1889 	     */
  1890 	    if (istypo && isroman(testword))
  1891 		istypo=FALSE;
  1892 	    /* Check the manual list of typos. */
  1893 	    if (!istypo)
  1894 		for (i=0;*typo[i];i++)
  1895 		    if (!strcmp(testword,typo[i]))
  1896 			istypo=TRUE;
  1897 	    /*
  1898 	     * Check lowercase s, l, i and m - special cases.
  1899 	     *   "j" - often a semi-colon gone wrong.
  1900 	     *   "d" for a missing apostrophe - he d
  1901 	     *   "n" for "in"
  1902 	     */
  1903 	    if (!istypo && len==1 &&
  1904 	      g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
  1905 		istypo=TRUE;
  1906 	    if (istypo)
  1907 	    {
  1908 		dupcnt=g_tree_lookup(qword,testword);
  1909 		if (dupcnt)
  1910 		{
  1911 		    (*dupcnt)++;
  1912 		    isdup=!pswit[VERBOSE_SWITCH];
  1913 		}
  1914 		else
  1915 		{
  1916 		    dupcnt=g_new0(int,1);
  1917 		    g_tree_insert(qword,g_strdup(testword),dupcnt);
  1918 		    isdup=FALSE;
  1919 		}
  1920 		if (!isdup)
  1921 		{
  1922 		    if (pswit[ECHO_SWITCH])
  1923 			g_print("\n%s\n",aline);
  1924 		    if (!pswit[OVERVIEW_SWITCH])
  1925 		    {
  1926 			g_print("    Line %ld column %ld - Query word %s",
  1927 			  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
  1928 			  inword);
  1929 			if (!pswit[VERBOSE_SWITCH])
  1930 			    g_print(" - not reporting duplicates");
  1931 			g_print("\n");
  1932 		    }
  1933 		    else
  1934 			cnt_word++;
  1935 		}
  1936 	    }
  1937 	}
  1938 	/* check the user's list of typos */
  1939 	if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
  1940 	{
  1941 	    if (pswit[ECHO_SWITCH])
  1942 		g_print("\n%s\n",aline);
  1943 	    if (!pswit[OVERVIEW_SWITCH])  
  1944 		g_print("    Line %ld column %ld - Query possible scanno %s\n",
  1945 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
  1946 	}
  1947 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
  1948 	    g_free(testword);
  1949 	if (pswit[PARANOID_SWITCH] && warnings->digit)
  1950 	{
  1951 	    /* In paranoid mode, query all 0 and 1 standing alone. */
  1952 	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))
  1953 	    {
  1954 		if (pswit[ECHO_SWITCH])
  1955 		    g_print("\n%s\n",aline);
  1956 		if (!pswit[OVERVIEW_SWITCH])
  1957 		    g_print("    Line %ld column %ld - Query standalone %s\n",
  1958 		      linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
  1959 		      inword);
  1960 		else
  1961 		    cnt_word++;
  1962 	    }
  1963 	}
  1964 	g_free(inword);
  1965     }
  1966 }
  1967 
  1968 /*
  1969  * check_for_misspaced_punctuation:
  1970  *
  1971  * Look for added or missing spaces around punctuation and quotes.
  1972  * If there is a punctuation character like ! with no space on
  1973  * either side, suspect a missing!space. If there are spaces on
  1974  * both sides , assume a typo. If we see a double quote with no
  1975  * space or punctuation on either side of it, assume unspaced
  1976  * quotes "like"this.
  1977  */
  1978 void check_for_misspaced_punctuation(const char *aline,
  1979   struct parities *parities,gboolean isemptyline)
  1980 {
  1981     gboolean isacro,isellipsis;
  1982     const char *s;
  1983     gunichar c,nc,pc,n2c;
  1984     int parity;
  1985     c=g_utf8_get_char(aline);
  1986     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  1987     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  1988     {
  1989 	pc=c;
  1990 	c=nc;
  1991 	nc=g_utf8_get_char(g_utf8_next_char(s));
  1992 	/* For each character in the line after the first. */
  1993 	if (g_utf8_strchr(".?!,;:_",-1,c))  /* if it's punctuation */
  1994 	{
  1995 	    /* we need to suppress warnings for acronyms like M.D. */
  1996 	    isacro=FALSE;
  1997 	    /* we need to suppress warnings for ellipsis . . . */
  1998 	    isellipsis=FALSE;
  1999 	    /*
  2000 	     * If there are letters on both sides of it or
  2001 	     * if it's strict punctuation followed by an alpha.
  2002 	     */
  2003 	    if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
  2004 	      g_utf8_strchr("?!,;:",-1,c)))
  2005 	    {
  2006 		if (c=='.')
  2007 		{
  2008 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&
  2009 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
  2010 			isacro=TRUE;
  2011 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
  2012 		    if (nc && n2c=='.')
  2013 			isacro=TRUE;
  2014 		}
  2015 		if (!isacro)
  2016 		{
  2017 		    if (pswit[ECHO_SWITCH])
  2018 			g_print("\n%s\n",aline);
  2019 		    if (!pswit[OVERVIEW_SWITCH])
  2020 			g_print("    Line %ld column %ld - Missing space?\n",
  2021 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2022 		    else
  2023 			cnt_punct++;
  2024 		}
  2025 	    }
  2026 	    if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
  2027 	    {
  2028 		/*
  2029 		 * If there are spaces on both sides,
  2030 		 * or space before and end of line.
  2031 		 */
  2032 		if (c=='.')
  2033 		{
  2034 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&
  2035 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
  2036 			isellipsis=TRUE;
  2037 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
  2038 		    if (nc && n2c=='.')
  2039 			isellipsis=TRUE;
  2040 		}
  2041 		if (!isemptyline && !isellipsis)
  2042 		{
  2043 		    if (pswit[ECHO_SWITCH])
  2044 			g_print("\n%s\n",aline);
  2045 		    if (!pswit[OVERVIEW_SWITCH])
  2046 			g_print("    Line %ld column %ld - "
  2047 			  "Spaced punctuation?\n",linecnt,
  2048 			  g_utf8_pointer_to_offset(aline,s)+1);
  2049 		    else
  2050 			cnt_punct++;
  2051 		}
  2052 	    }
  2053 	}
  2054     }
  2055     /* Split out the characters that CANNOT be preceded by space. */
  2056     c=g_utf8_get_char(aline);
  2057     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2058     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2059     {
  2060 	pc=c;
  2061 	c=nc;
  2062 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2063 	/* for each character in the line after the first */
  2064 	if (g_utf8_strchr("?!,;:",-1,c))
  2065 	{
  2066 	    /* if it's punctuation that _cannot_ have a space before it */
  2067 	    if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
  2068 	    {
  2069 		/*
  2070 		 * If nc DOES == space,
  2071 		 * it was already reported just above.
  2072 		 */
  2073 		if (pswit[ECHO_SWITCH])
  2074 		    g_print("\n%s\n",aline);
  2075 		if (!pswit[OVERVIEW_SWITCH])
  2076 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",
  2077 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2078 		else
  2079 		    cnt_punct++;
  2080 	    }
  2081 	}
  2082     }
  2083     /*
  2084      * Special case " .X" where X is any alpha.
  2085      * This plugs a hole in the acronym code above.
  2086      * Inelegant, but maintainable.
  2087      */
  2088     c=g_utf8_get_char(aline);
  2089     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2090     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2091     {
  2092 	pc=c;
  2093 	c=nc;
  2094 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2095 	/* for each character in the line after the first */
  2096 	if (c=='.')
  2097 	{
  2098 	    /* if it's a period */
  2099 	    if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
  2100 	    {
  2101 		/*
  2102 		 * If the period follows a space and
  2103 		 * is followed by a letter.
  2104 		 */
  2105 		if (pswit[ECHO_SWITCH])
  2106 		    g_print("\n%s\n",aline);
  2107 		if (!pswit[OVERVIEW_SWITCH])
  2108 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",
  2109 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2110 		else
  2111 		    cnt_punct++;
  2112 	    }
  2113 	}
  2114     }
  2115     c=g_utf8_get_char(aline);
  2116     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2117     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2118     {
  2119 	pc=c;
  2120 	c=nc;
  2121 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2122 	/* for each character in the line after the first */
  2123 	if (CHAR_IS_DQUOTE(c))
  2124 	{
  2125 	    if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
  2126 	      !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
  2127 	      !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
  2128 	    {
  2129 		if (pswit[ECHO_SWITCH])
  2130 		    g_print("\n%s\n",aline);
  2131 		if (!pswit[OVERVIEW_SWITCH])
  2132 		    g_print("    Line %ld column %ld - Unspaced quotes?\n",
  2133 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2134 		else
  2135 		    cnt_punct++;
  2136 	    }
  2137 	}
  2138     }
  2139     /* Check parity of quotes. */
  2140     nc=g_utf8_get_char(aline);
  2141     for (s=aline;*s;s=g_utf8_next_char(s))
  2142     {
  2143 	c=nc;
  2144 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2145 	if (CHAR_IS_DQUOTE(c))
  2146 	{
  2147 	    if (c==CHAR_DQUOTE)
  2148 	    {
  2149 		parities->dquote=!parities->dquote;
  2150 		parity=parities->dquote;
  2151 	    }
  2152 	    else if (c==CHAR_LD_QUOTE)
  2153 		parity=1;
  2154 	    else
  2155 		parity=0;
  2156 	    if (!parity)
  2157 	    {
  2158 		/* parity even */
  2159 		if (!g_utf8_strchr("_-.'`‘’/,;:!?)]} ",-1,nc))
  2160 		{
  2161 		    if (pswit[ECHO_SWITCH])
  2162 			g_print("\n%s\n",aline);
  2163 		    if (!pswit[OVERVIEW_SWITCH])
  2164 			g_print("    Line %ld column %ld - "
  2165 			  "Wrongspaced quotes?\n",
  2166 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2167 		    else
  2168 			cnt_punct++;
  2169 		}
  2170 	    }
  2171 	    else
  2172 	    {
  2173 		/* parity odd */
  2174 		if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
  2175 		  !g_utf8_strchr("_-/.'`‘’([{$",-1,nc) || !nc)
  2176 		{
  2177 		    if (pswit[ECHO_SWITCH])
  2178 			g_print("\n%s\n",aline);
  2179 		    if (!pswit[OVERVIEW_SWITCH])
  2180 			g_print("    Line %ld column %ld - "
  2181 			  "Wrongspaced quotes?\n",
  2182 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2183 		    else
  2184 			cnt_punct++;
  2185 		}
  2186 	    }
  2187 	}
  2188     }
  2189     c=g_utf8_get_char(aline);
  2190     if (CHAR_IS_DQUOTE(c))
  2191     {
  2192 	if (g_utf8_strchr(",;:!?)]} ",-1,
  2193 	  g_utf8_get_char(g_utf8_next_char(aline))))
  2194 	{
  2195 	    if (pswit[ECHO_SWITCH])
  2196 		g_print("\n%s\n",aline);
  2197 	    if (!pswit[OVERVIEW_SWITCH])
  2198 		g_print("    Line %ld column 1 - Wrongspaced quotes?\n",
  2199 		  linecnt);
  2200 	    else
  2201 		cnt_punct++;
  2202 	}
  2203     }
  2204     if (pswit[SQUOTE_SWITCH])
  2205     {
  2206 	nc=g_utf8_get_char(aline);
  2207 	for (s=aline;*s;s=g_utf8_next_char(s))
  2208 	{
  2209 	    c=nc;
  2210 	    nc=g_utf8_get_char(g_utf8_next_char(s));
  2211 	    if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&
  2212 	      !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
  2213 	      !g_unichar_isalpha(nc)))
  2214 	    {
  2215 		parities->squote=!parities->squote;
  2216 		if (!parities->squote)
  2217 		{
  2218 		    /* parity even */
  2219 		    if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
  2220 		    {
  2221 			if (pswit[ECHO_SWITCH])
  2222 			    g_print("\n%s\n",aline);
  2223 			if (!pswit[OVERVIEW_SWITCH])
  2224 			    g_print("    Line %ld column %ld - "
  2225 			      "Wrongspaced singlequotes?\n",
  2226 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2227 			else
  2228 			    cnt_punct++;
  2229 		    }
  2230 		}
  2231 		else
  2232 		{
  2233 		    /* parity odd */
  2234 		    if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
  2235 		      !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
  2236 		    {
  2237 			if (pswit[ECHO_SWITCH])
  2238 			    g_print("\n%s\n",aline);
  2239 			if (!pswit[OVERVIEW_SWITCH])
  2240 			    g_print("    Line %ld column %ld - "
  2241 			      "Wrongspaced singlequotes?\n",
  2242 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2243 			else
  2244 			    cnt_punct++;
  2245 		    }
  2246 		}
  2247 	    }
  2248 	}
  2249     }
  2250 }
  2251 
  2252 /*
  2253  * check_for_double_punctuation:
  2254  *
  2255  * Look for double punctuation like ,. or ,,
  2256  * Thanks to DW for the suggestion!
  2257  * In books with references, ".," and ".;" are common
  2258  * e.g. "etc., etc.," and vol. 1.; vol 3.;
  2259  * OTOH, from my initial tests, there are also fairly
  2260  * common errors. What to do? Make these cases paranoid?
  2261  * ".," is the most common, so warnings->dotcomma is used
  2262  * to suppress detailed reporting if it occurs often.
  2263  */
  2264 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
  2265 {
  2266     const char *s;
  2267     gunichar c,nc;
  2268     nc=g_utf8_get_char(aline);
  2269     for (s=aline;*s;s=g_utf8_next_char(s))
  2270     {
  2271 	c=nc;
  2272 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2273 	/* for each punctuation character in the line */
  2274 	if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
  2275 	  g_utf8_strchr(".?!,;:",-1,nc))
  2276 	{
  2277 	    /* followed by punctuation, it's a query, unless . . . */
  2278 	    if (c==nc && (c=='.' || c=='?' || c=='!') ||
  2279 	      !warnings->dotcomma && c=='.' && nc==',' ||
  2280 	      warnings->isFrench && g_str_has_prefix(s,",...") ||
  2281 	      warnings->isFrench && g_str_has_prefix(s,"...,") ||
  2282 	      warnings->isFrench && g_str_has_prefix(s,";...") ||
  2283 	      warnings->isFrench && g_str_has_prefix(s,"...;") ||
  2284 	      warnings->isFrench && g_str_has_prefix(s,":...") ||
  2285 	      warnings->isFrench && g_str_has_prefix(s,"...:") ||
  2286 	      warnings->isFrench && g_str_has_prefix(s,"!...") ||
  2287 	      warnings->isFrench && g_str_has_prefix(s,"...!") ||
  2288 	      warnings->isFrench && g_str_has_prefix(s,"?...") ||
  2289 	      warnings->isFrench && g_str_has_prefix(s,"...?"))
  2290 	    {
  2291 		if (warnings->isFrench && g_str_has_prefix(s,",...") ||
  2292 		  warnings->isFrench && g_str_has_prefix(s,"...,") ||
  2293 		  warnings->isFrench && g_str_has_prefix(s,";...") ||
  2294 		  warnings->isFrench && g_str_has_prefix(s,"...;") ||
  2295 		  warnings->isFrench && g_str_has_prefix(s,":...") ||
  2296 		  warnings->isFrench && g_str_has_prefix(s,"...:") ||
  2297 		  warnings->isFrench && g_str_has_prefix(s,"!...") ||
  2298 		  warnings->isFrench && g_str_has_prefix(s,"...!") ||
  2299 		  warnings->isFrench && g_str_has_prefix(s,"?...") ||
  2300 		  warnings->isFrench && g_str_has_prefix(s,"...?"))
  2301 		{
  2302 		    s+=4;
  2303 		    nc=g_utf8_get_char(g_utf8_next_char(s));
  2304 		}
  2305 		; /* do nothing for .. !! and ?? which can be legit */
  2306 	    }
  2307 	    else
  2308 	    {
  2309 		if (pswit[ECHO_SWITCH])
  2310 		    g_print("\n%s\n",aline);
  2311 		if (!pswit[OVERVIEW_SWITCH])
  2312 		    g_print("    Line %ld column %ld - Double punctuation?\n",
  2313 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2314 		else
  2315 		    cnt_punct++;
  2316 	    }
  2317 	}
  2318     }
  2319 }
  2320 
  2321 /*
  2322  * check_for_spaced_quotes:
  2323  */
  2324 void check_for_spaced_quotes(const char *aline)
  2325 {
  2326     int i;
  2327     const char *s,*t;
  2328     const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,
  2329       CHAR_RS_QUOTE};
  2330     GString *pattern;
  2331     s=aline;
  2332     while ((t=strstr(s," \" ")))
  2333     {
  2334 	if (pswit[ECHO_SWITCH])
  2335 	    g_print("\n%s\n",aline);
  2336 	if (!pswit[OVERVIEW_SWITCH])
  2337 	    g_print("    Line %ld column %ld - Spaced doublequote?\n",
  2338 	      linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  2339 	else
  2340 	    cnt_punct++;
  2341 	s=g_utf8_next_char(g_utf8_next_char(t));
  2342     }
  2343     pattern=g_string_new(NULL);
  2344     for(i=0;i<G_N_ELEMENTS(single_quotes);i++)
  2345     {
  2346 	g_string_assign(pattern," ");
  2347 	g_string_append_unichar(pattern,single_quotes[i]);
  2348 	g_string_append_c(pattern,' ');
  2349 	s=aline;
  2350 	while ((t=strstr(s,pattern->str)))
  2351 	{
  2352 	    if (pswit[ECHO_SWITCH])
  2353 		g_print("\n%s\n",aline);
  2354 	    if (!pswit[OVERVIEW_SWITCH])
  2355 		g_print("    Line %ld column %ld - Spaced singlequote?\n",
  2356 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  2357 	    else
  2358 		cnt_punct++;
  2359 	    s=g_utf8_next_char(g_utf8_next_char(t));
  2360 	}
  2361     }
  2362     g_string_free(pattern,TRUE);
  2363 }
  2364 
  2365 /*
  2366  * check_for_miscased_genative:
  2367  *
  2368  * Check special case of 'S instead of 's at end of word.
  2369  */
  2370 void check_for_miscased_genative(const char *aline)
  2371 {
  2372     const char *s;
  2373     gunichar c,nc,pc;
  2374     if (!*aline)
  2375 	return;
  2376     c=g_utf8_get_char(aline);
  2377     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2378     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2379     {
  2380 	pc=c;
  2381 	c=nc;
  2382 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2383 	if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))
  2384 	{
  2385 	    if (pswit[ECHO_SWITCH])
  2386 		g_print("\n%s\n",aline);
  2387 	    if (!pswit[OVERVIEW_SWITCH])
  2388 		g_print("    Line %ld column %ld - Capital \"S\"?\n",
  2389 		  linecnt,g_utf8_pointer_to_offset(aline,s)+2);
  2390 	    else
  2391 		cnt_punct++;
  2392 	}
  2393     }
  2394 }
  2395 
  2396 /*
  2397  * check_end_of_line:
  2398  *
  2399  * Now check special cases - start and end of line -
  2400  * for single and double quotes. Start is sometimes [sic]
  2401  * but better to query it anyway.
  2402  * While we're here, check for dash at end of line.
  2403  */
  2404 void check_end_of_line(const char *aline,struct warnings *warnings)
  2405 {
  2406     int lbytes;
  2407     const char *s;
  2408     gunichar c1,c2;
  2409     lbytes=strlen(aline);
  2410     if (g_utf8_strlen(aline,lbytes)>1)
  2411     {
  2412 	s=g_utf8_prev_char(aline+lbytes);
  2413 	c1=g_utf8_get_char(s);
  2414 	c2=g_utf8_get_char(g_utf8_prev_char(s));
  2415 	if ((CHAR_IS_DQUOTE(c1) || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)
  2416 	{
  2417 	    if (pswit[ECHO_SWITCH])
  2418 		g_print("\n%s\n",aline);
  2419 	    if (!pswit[OVERVIEW_SWITCH])
  2420 		g_print("    Line %ld column %ld - Spaced quote?\n",linecnt,
  2421 		  g_utf8_strlen(aline,lbytes));
  2422 	    else
  2423 		cnt_punct++;
  2424 	}
  2425 	c1=g_utf8_get_char(aline);
  2426 	c2=g_utf8_get_char(g_utf8_next_char(aline));
  2427 	if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)
  2428 	{
  2429 	    if (pswit[ECHO_SWITCH])
  2430 		g_print("\n%s\n",aline);
  2431 	    if (!pswit[OVERVIEW_SWITCH])
  2432 		g_print("    Line %ld column 1 - Spaced quote?\n",linecnt);
  2433 	    else
  2434 		cnt_punct++;
  2435 	}
  2436 	/*
  2437 	 * Dash at end of line may well be legit - paranoid mode only
  2438 	 * and don't report em-dash at line-end.
  2439 	 */
  2440 	if (pswit[PARANOID_SWITCH] && warnings->hyphen)
  2441 	{
  2442 	    for (s=g_utf8_prev_char(aline+lbytes);
  2443 	      s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
  2444 		;
  2445 	    if (g_utf8_get_char(s)=='-' &&
  2446 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')
  2447 	    {
  2448 		if (pswit[ECHO_SWITCH])
  2449 		    g_print("\n%s\n",aline);
  2450 		if (!pswit[OVERVIEW_SWITCH])
  2451 		    g_print("    Line %ld column %ld - "
  2452 		      "Hyphen at end of line?\n",
  2453 		      linecnt,g_utf8_pointer_to_offset(aline,s));
  2454 	    }
  2455 	}
  2456     }
  2457 }
  2458 
  2459 /*
  2460  * check_for_unspaced_bracket:
  2461  *
  2462  * Brackets are often unspaced, but shouldn't be surrounded by alpha.
  2463  * If so, suspect a scanno like "a]most".
  2464  */
  2465 void check_for_unspaced_bracket(const char *aline)
  2466 {
  2467     const char *s;
  2468     gunichar c,nc,pc;
  2469     c=g_utf8_get_char(aline);
  2470     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2471     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2472     {
  2473 	pc=c;
  2474 	c=nc;
  2475 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2476 	if (!nc)
  2477 	    break;
  2478 	/* for each bracket character in the line except 1st & last */
  2479 	if (g_utf8_strchr("{[()]}",-1,c) &&
  2480 	  g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
  2481 	{
  2482 	    if (pswit[ECHO_SWITCH])
  2483 		g_print("\n%s\n",aline);
  2484 	    if (!pswit[OVERVIEW_SWITCH])
  2485 		g_print("    Line %ld column %ld - Unspaced bracket?\n",
  2486 		  linecnt,g_utf8_pointer_to_offset(aline,s));
  2487 	    else
  2488 		cnt_punct++;
  2489 	}
  2490     }
  2491 }
  2492 
  2493 /*
  2494  * check_for_unpunctuated_endquote:
  2495  */
  2496 void check_for_unpunctuated_endquote(const char *aline)
  2497 {
  2498     const char *s;
  2499     gunichar c,nc,pc;
  2500     QuoteClass qc;
  2501     c=g_utf8_get_char(aline);
  2502     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2503     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2504     {
  2505 	pc=c;
  2506 	c=nc;
  2507 	qc=CHAR_IS_DQUOTE(c)?QUOTE_CLASS(c):INVALID_QUOTE;
  2508 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2509 	/* for each character in the line except 1st */
  2510 	if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) && g_unichar_isalpha(pc))
  2511 	{
  2512 	    if (pswit[ECHO_SWITCH])
  2513 		g_print("\n%s\n",aline);
  2514 	    if (!pswit[OVERVIEW_SWITCH])
  2515 		g_print("    Line %ld column %ld - "
  2516 		  "endquote missing punctuation?\n",
  2517 		  linecnt,g_utf8_pointer_to_offset(aline,s));
  2518 	    else
  2519 		cnt_punct++;
  2520 	}
  2521     }
  2522 }
  2523 
  2524 /*
  2525  * check_for_html_tag:
  2526  *
  2527  * Check for <HTML TAG>.
  2528  *
  2529  * If there is a < in the line, followed at some point
  2530  * by a > then we suspect HTML.
  2531  */
  2532 void check_for_html_tag(const char *aline)
  2533 {
  2534     const char *open,*close;
  2535     gchar *tag;
  2536     open=strchr(aline,'<');
  2537     if (open)
  2538     {
  2539 	close=strchr(g_utf8_next_char(open),'>');
  2540 	if (close)
  2541 	{
  2542 	    if (pswit[ECHO_SWITCH])
  2543 		g_print("\n%s\n",aline);
  2544 	    if (!pswit[OVERVIEW_SWITCH])
  2545 	    {
  2546 		tag=g_strndup(open,close-open+1);
  2547 		g_print("    Line %ld column %ld - HTML Tag? %s \n",
  2548 		  linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
  2549 		g_free(tag);
  2550 	    }
  2551 	    else
  2552 		cnt_html++;
  2553 	}
  2554     }
  2555 }
  2556 
  2557 /*
  2558  * check_for_html_entity:
  2559  *
  2560  * Check for &symbol; HTML.
  2561  *
  2562  * If there is a & in the line, followed at
  2563  * some point by a ; then we suspect HTML.
  2564  */
  2565 void check_for_html_entity(const char *aline)
  2566 {
  2567     const char *s,*amp,*scolon;
  2568     gchar *entity;
  2569     amp=strchr(aline,'&');
  2570     if (amp)
  2571     {
  2572 	scolon=strchr(amp,';');
  2573 	if (scolon)
  2574 	{
  2575 	    for (s=amp;s<scolon;s=g_utf8_next_char(s))   
  2576 		if (g_utf8_get_char(s)==CHAR_SPACE)
  2577 		    break;		/* Don't report "Jones & Son;" */
  2578 	    if (s>=scolon)
  2579 	    {
  2580 		if (pswit[ECHO_SWITCH])
  2581 		    g_print("\n%s\n",aline);
  2582 		if (!pswit[OVERVIEW_SWITCH])
  2583 		{
  2584 		    entity=g_strndup(amp,scolon-amp+1);
  2585 		    g_print("    Line %ld column %d - HTML symbol? %s \n",
  2586 		      linecnt,(int)(amp-aline)+1,entity);
  2587 		    g_free(entity);
  2588 		}
  2589 		else
  2590 		    cnt_html++;
  2591 	    }
  2592 	}
  2593     }
  2594 }
  2595 
  2596 /*
  2597  * check_for_omitted_punctuation:
  2598  *
  2599  * Check for omitted punctuation at end of paragraph by working back
  2600  * through prevline. DW.
  2601  * Need to check this only for "normal" paras.
  2602  * So what is a "normal" para?
  2603  *    Not normal if one-liner (chapter headings, etc.)
  2604  *    Not normal if doesn't contain at least one locase letter
  2605  *    Not normal if starts with space
  2606  */
  2607 void check_for_omitted_punctuation(const char *prevline,
  2608   struct line_properties *last,int start_para_line)
  2609 {
  2610     gboolean letter_on_line=FALSE;
  2611     const char *s;
  2612     gunichar c;
  2613     gboolean closing_quote;
  2614     for (s=prevline;*s;s=g_utf8_next_char(s))
  2615 	if (g_unichar_isalpha(g_utf8_get_char(s)))
  2616 	{
  2617 	    letter_on_line=TRUE;
  2618 	    break;
  2619 	}
  2620     /*
  2621      * This next "if" is a problem.
  2622      * If we say "start_para_line <= linecnt - 1", that includes
  2623      * one-line "paragraphs" like chapter heads. Lotsa false positives.
  2624      * If we say "start_para_line < linecnt - 1" it doesn't, but then it
  2625      * misses genuine one-line paragraphs.
  2626      */
  2627     if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
  2628       g_utf8_get_char(prevline)>CHAR_SPACE)
  2629     {
  2630 	s=prevline+strlen(prevline);
  2631 	do
  2632 	{
  2633 	    s=g_utf8_prev_char(s);
  2634 	    c=g_utf8_get_char(s);
  2635 	    if (QUOTE_CLASS(c)==CLOSING_QUOTE || QUOTE_CLASS(c)==NEUTRAL_QUOTE)
  2636 		closing_quote=TRUE;
  2637 	    else
  2638 		closing_quote=FALSE;
  2639 	} while (closing_quote && s>prevline);
  2640 	for (;s>prevline;s=g_utf8_prev_char(s))
  2641 	{
  2642 	    if (g_unichar_isalpha(g_utf8_get_char(s)))
  2643 	    {
  2644 		if (pswit[ECHO_SWITCH])
  2645 		    g_print("\n%s\n",prevline);
  2646 		if (!pswit[OVERVIEW_SWITCH])
  2647 		    g_print("    Line %ld column %ld - "
  2648 		      "No punctuation at para end?\n",
  2649 		      linecnt-1,g_utf8_strlen(prevline,-1));
  2650 		else
  2651 		    cnt_punct++;
  2652 		break;
  2653 	    }
  2654 	    if (g_utf8_strchr("—-.:!([{?}])",-1,g_utf8_get_char(s)))
  2655 		break;
  2656 	}
  2657     }
  2658 }
  2659 
  2660 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
  2661 {
  2662     const char *word=key;
  2663     int *dupcnt=value;
  2664     if (*dupcnt)
  2665 	g_print("\nNote: Queried word %s was duplicated %d times\n",
  2666 	  word,*dupcnt);
  2667     return FALSE;
  2668 }
  2669 
  2670 void print_as_windows_1252(const char *string)
  2671 {
  2672     gsize inbytes,outbytes;
  2673     gchar *buf,*bp;
  2674     static GIConv converter=(GIConv)-1;
  2675     if (!string)
  2676     {
  2677 	if (converter!=(GIConv)-1)
  2678 	    g_iconv_close(converter);
  2679 	converter=(GIConv)-1;
  2680 	return;
  2681     }
  2682     if (converter==(GIConv)-1)
  2683 	converter=g_iconv_open("WINDOWS-1252","UTF-8");
  2684     if (converter!=(GIConv)-1)
  2685     {
  2686 	inbytes=outbytes=strlen(string);
  2687 	bp=buf=g_malloc(outbytes+1);
  2688 	g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
  2689 	*bp='\0';
  2690 	fputs(buf,stdout);
  2691 	g_free(buf);
  2692     }
  2693     else
  2694 	fputs(string,stdout);
  2695 }
  2696 
  2697 void print_as_utf_8(const char *string)
  2698 {
  2699     fputs(string,stdout);
  2700 }
  2701 
  2702 /*
  2703  * procfile:
  2704  *
  2705  * Process one file.
  2706  */
  2707 void procfile(const char *filename)
  2708 {
  2709     const char *s;
  2710     gchar *parastart=NULL;	/* first line of current para */
  2711     gchar *etext,*aline;
  2712     gchar *etext_ptr;
  2713     GError *err=NULL;
  2714     struct first_pass_results *first_pass_results;
  2715     struct warnings *warnings;
  2716     struct counters counters={0};
  2717     struct line_properties last={0};
  2718     struct parities parities={0};
  2719     struct pending pending={0};
  2720     gboolean isemptyline;
  2721     long start_para_line=0;
  2722     gboolean isnewpara=FALSE,enddash=FALSE;
  2723     last.start=CHAR_SPACE;
  2724     linecnt=checked_linecnt=0;
  2725     etext=read_etext(filename,&err);
  2726     if (!etext)
  2727     {
  2728 	if (pswit[STDOUT_SWITCH])
  2729 	    fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
  2730 	else
  2731 	    fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
  2732 	exit(1);
  2733     }
  2734     g_print("\n\nFile: %s\n\n",filename);
  2735     first_pass_results=first_pass(etext);
  2736     warnings=report_first_pass(first_pass_results);
  2737     qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
  2738     qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
  2739     /*
  2740      * Here we go with the main pass. Hold onto yer hat!
  2741      */
  2742     linecnt=0;
  2743     etext_ptr=etext;
  2744     while ((aline=flgets(&etext_ptr,linecnt+1)))
  2745     {
  2746 	linecnt++;
  2747 	if (linecnt==1)
  2748 	    isnewpara=TRUE;
  2749 	if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
  2750 	    continue;    // skip DP page separators completely
  2751 	if (linecnt<first_pass_results->firstline ||
  2752 	  (first_pass_results->footerline>0 &&
  2753 	  linecnt>first_pass_results->footerline))
  2754 	{
  2755 	    if (pswit[HEADER_SWITCH])
  2756 	    {
  2757 		if (g_str_has_prefix(aline,"Title:"))
  2758 		    g_print("    %s\n",aline);
  2759 		if (g_str_has_prefix(aline,"Author:"))
  2760 		    g_print("    %s\n",aline);
  2761 		if (g_str_has_prefix(aline,"Release Date:"))
  2762 		    g_print("    %s\n",aline);
  2763 		if (g_str_has_prefix(aline,"Edition:"))
  2764 		    g_print("    %s\n\n",aline);
  2765 	    }
  2766 	    continue;		/* skip through the header */
  2767 	}
  2768 	checked_linecnt++;
  2769 	print_pending(aline,parastart,&pending);
  2770 	isemptyline=analyse_quotes(aline,&counters);
  2771 	if (isnewpara && !isemptyline)
  2772 	{
  2773 	    /* This line is the start of a new paragraph. */
  2774 	    start_para_line=linecnt;
  2775 	    /* Capture its first line in case we want to report it later. */
  2776 	    g_free(parastart);
  2777 	    parastart=g_strdup(aline);
  2778 	    memset(&parities,0,sizeof(parities));  /* restart the quote count */
  2779 	    s=aline;
  2780 	    while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
  2781 	      !g_unichar_isdigit(g_utf8_get_char(s)))
  2782 		s=g_utf8_next_char(s);
  2783 	    if (g_unichar_islower(g_utf8_get_char(s)))
  2784 	    {
  2785 		/* and its first letter is lowercase */
  2786 		if (pswit[ECHO_SWITCH])
  2787 		    g_print("\n%s\n",aline);
  2788 		if (!pswit[OVERVIEW_SWITCH])
  2789 		    g_print("    Line %ld column %ld - "
  2790 		      "Paragraph starts with lower-case\n",
  2791 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2792 		else
  2793 		    cnt_punct++;
  2794 	    }
  2795 	    isnewpara=FALSE; /* Signal the end of new para processing. */
  2796 	}
  2797 	/* Check for an em-dash broken at line end. */
  2798 	if (enddash && g_utf8_get_char(aline)=='-')
  2799 	{
  2800 	    if (pswit[ECHO_SWITCH])
  2801 		g_print("\n%s\n",aline);
  2802 	    if (!pswit[OVERVIEW_SWITCH])
  2803 		g_print("    Line %ld column 1 - Broken em-dash?\n",linecnt);
  2804 	    else
  2805 		cnt_punct++;
  2806 	}
  2807 	enddash=FALSE;
  2808 	for (s=g_utf8_prev_char(aline+strlen(aline));
  2809 	  g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
  2810 	    ;
  2811 	if (s>=aline && g_utf8_get_char(s)=='-')
  2812 	    enddash=TRUE;
  2813 	check_for_control_characters(aline);
  2814 	check_for_odd_characters(aline,warnings,isemptyline);
  2815 	if (warnings->longline)
  2816 	    check_for_long_line(aline);
  2817 	if (warnings->shortline)
  2818 	    check_for_short_line(aline,&last);
  2819 	last.blen=last.len;
  2820 	last.len=g_utf8_strlen(aline,-1);
  2821 	last.start=g_utf8_get_char(aline);
  2822 	check_for_starting_punctuation(aline);
  2823 	if (warnings->dash)
  2824 	{
  2825 	    check_for_spaced_emdash(aline);
  2826 	    check_for_spaced_dash(aline);
  2827 	}
  2828 	check_for_unmarked_paragraphs(aline);
  2829 	check_for_jeebies(aline);
  2830 	check_for_mta_from(aline);
  2831 	check_for_orphan_character(aline);
  2832 	check_for_pling_scanno(aline);
  2833 	check_for_extra_period(aline,warnings);
  2834 	check_for_following_punctuation(aline);
  2835 	check_for_typos(aline,warnings);
  2836 	check_for_misspaced_punctuation(aline,&parities,isemptyline);
  2837 	check_for_double_punctuation(aline,warnings);
  2838 	check_for_spaced_quotes(aline);
  2839 	check_for_miscased_genative(aline);
  2840 	check_end_of_line(aline,warnings);
  2841 	check_for_unspaced_bracket(aline);
  2842 	if (warnings->endquote)
  2843 	    check_for_unpunctuated_endquote(aline);
  2844 	check_for_html_tag(aline);
  2845 	check_for_html_entity(aline);
  2846 	if (isemptyline)
  2847 	{
  2848 	    check_for_mismatched_quotes(&counters,&pending);
  2849 	    counters_reset(&counters);
  2850 	    /* let the next iteration know that it's starting a new para */
  2851 	    isnewpara=TRUE;
  2852 	    if (prevline)
  2853 		check_for_omitted_punctuation(prevline,&last,start_para_line);
  2854 	}
  2855 	g_free(prevline);
  2856 	prevline=g_strdup(aline);
  2857     }
  2858     linecnt++;
  2859     check_for_mismatched_quotes(&counters,&pending);
  2860     print_pending(NULL,parastart,&pending);
  2861     reset_pending(&pending);
  2862     if (prevline)
  2863     {
  2864 	g_free(prevline);
  2865 	prevline=NULL;
  2866     }
  2867     g_free(parastart);
  2868     g_free(prevline);
  2869     g_free(etext);
  2870     if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])
  2871 	g_tree_foreach(qword,report_duplicate_queries,NULL);
  2872     g_tree_unref(qword);
  2873     g_tree_unref(qperiod);
  2874     counters_destroy(&counters);
  2875     g_set_print_handler(NULL);
  2876     print_as_windows_1252(NULL);
  2877     if (pswit[MARKUP_SWITCH])  
  2878 	loseentities(NULL);
  2879 }
  2880 
  2881 /*
  2882  * flgets:
  2883  *
  2884  * Get one line from the input text, checking for
  2885  * the existence of exactly one CR/LF line-end per line.
  2886  *
  2887  * Returns: a pointer to the line.
  2888  */
  2889 char *flgets(char **etext,long lcnt)
  2890 {
  2891     gunichar c;
  2892     gboolean isCR=FALSE;
  2893     char *theline=*etext;
  2894     char *eos=theline;
  2895     gchar *s;
  2896     for (;;)
  2897     {
  2898 	c=g_utf8_get_char(*etext);
  2899 	if (!c)
  2900 	{
  2901 	    if (*etext==theline)
  2902 		return NULL;
  2903 	    else if (pswit[LINE_END_SWITCH])
  2904 	    {
  2905 		if (pswit[ECHO_SWITCH])
  2906 		{
  2907 		    s=g_strndup(theline,eos-theline);
  2908 		    g_print("\n%s\n",s);
  2909 		    g_free(s);
  2910 		}
  2911 		if (!pswit[OVERVIEW_SWITCH])
  2912 		    /* There may, or may not, have been a CR */
  2913 		    g_print("    Line %ld - No LF?\n",lcnt);
  2914 		else
  2915 		    cnt_lineend++;
  2916 	    }
  2917 	    break;
  2918 	}
  2919 	*etext=g_utf8_next_char(*etext);
  2920 	/* either way, it's end of line */
  2921 	if (c=='\n')
  2922 	{
  2923 	    if (isCR)
  2924 		break;
  2925 	    else
  2926 	    {
  2927 		/* Error - a LF without a preceding CR */
  2928 		if (pswit[LINE_END_SWITCH])
  2929 		{
  2930 		    if (pswit[ECHO_SWITCH])
  2931 		    {
  2932 			s=g_strndup(theline,eos-theline);
  2933 			g_print("\n%s\n",s);
  2934 			g_free(s);
  2935 		    }
  2936 		    if (!pswit[OVERVIEW_SWITCH])
  2937 			g_print("    Line %ld - No CR?\n",lcnt);
  2938 		    else
  2939 			cnt_lineend++;
  2940 		}
  2941 		break;
  2942 	    }
  2943 	}
  2944 	if (c=='\r')
  2945 	{
  2946 	    if (isCR)
  2947 	    {
  2948 		/* Error - two successive CRs */
  2949 		if (pswit[LINE_END_SWITCH])
  2950 		{
  2951 		    if (pswit[ECHO_SWITCH])
  2952 		    {
  2953 			s=g_strndup(theline,eos-theline);
  2954 			g_print("\n%s\n",s);
  2955 			g_free(s);
  2956 		    }
  2957 		    if (!pswit[OVERVIEW_SWITCH])
  2958 			g_print("    Line %ld - Two successive CRs?\n",lcnt);
  2959 		    else
  2960 			cnt_lineend++;
  2961 		}
  2962 	    }
  2963 	    isCR=TRUE;
  2964 	}
  2965 	else
  2966 	{
  2967 	    if (pswit[LINE_END_SWITCH] && isCR)
  2968 	    {
  2969 		if (pswit[ECHO_SWITCH])
  2970 		{
  2971 		    s=g_strndup(theline,eos-theline);
  2972 		    g_print("\n%s\n",s);
  2973 		    g_free(s);
  2974 		}
  2975 		if (!pswit[OVERVIEW_SWITCH])
  2976 		    g_print("    Line %ld column %ld - CR without LF?\n",
  2977 		      lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
  2978 		else
  2979 		    cnt_lineend++;
  2980 		*eos=' ';
  2981 	    }
  2982 	    isCR=FALSE;
  2983 	    eos=g_utf8_next_char(eos);
  2984 	}
  2985     }
  2986     *eos='\0';
  2987     if (pswit[MARKUP_SWITCH])  
  2988 	postprocess_for_HTML(theline);
  2989     if (pswit[DP_SWITCH])  
  2990 	postprocess_for_DP(theline);
  2991     return theline;
  2992 }
  2993 
  2994 /*
  2995  * mixdigit:
  2996  *
  2997  * Takes a "word" as a parameter, and checks whether it
  2998  * contains a mixture of alpha and digits. Generally, this is an
  2999  * error, but may not be for cases like 4th or L5 12s. 3d.
  3000  *
  3001  * Returns: TRUE iff an is error found.
  3002  */
  3003 gboolean mixdigit(const char *checkword)
  3004 {
  3005     gboolean wehaveadigit,wehavealetter,query;
  3006     const char *s,*nondigit;
  3007     wehaveadigit=wehavealetter=query=FALSE;
  3008     for (s=checkword;*s;s=g_utf8_next_char(s))
  3009 	if (g_unichar_isalpha(g_utf8_get_char(s)))
  3010 	    wehavealetter=TRUE;
  3011 	else if (g_unichar_isdigit(g_utf8_get_char(s)))
  3012 	    wehaveadigit=TRUE;
  3013     if (wehaveadigit && wehavealetter)
  3014     {
  3015 	/* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
  3016 	query=TRUE;
  3017 	for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
  3018 	  nondigit=g_utf8_next_char(nondigit))
  3019 	    ;
  3020 	/* digits, ending in st, rd, nd, th of either case */
  3021 	if (!g_ascii_strcasecmp(nondigit,"st") ||
  3022 	  !g_ascii_strcasecmp(nondigit,"rd") ||
  3023 	  !g_ascii_strcasecmp(nondigit,"nd") ||
  3024 	  !g_ascii_strcasecmp(nondigit,"th"))
  3025 	    query=FALSE;
  3026 	if (!g_ascii_strcasecmp(nondigit,"sts") ||
  3027 	  !g_ascii_strcasecmp(nondigit,"rds") ||
  3028 	  !g_ascii_strcasecmp(nondigit,"nds") ||
  3029 	  !g_ascii_strcasecmp(nondigit,"ths"))
  3030 	    query=FALSE;
  3031 	if (!g_ascii_strcasecmp(nondigit,"stly") ||
  3032 	  !g_ascii_strcasecmp(nondigit,"rdly") ||
  3033 	  !g_ascii_strcasecmp(nondigit,"ndly") ||
  3034 	  !g_ascii_strcasecmp(nondigit,"thly"))
  3035 	    query=FALSE;
  3036 	/* digits, ending in l, L, s or d */
  3037 	if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
  3038 	  !strcmp(nondigit,"d"))
  3039 	    query=FALSE;
  3040 	/*
  3041 	 * L at the start of a number, representing Britsh pounds, like L500.
  3042 	 * This is cute. We know the current word is mixed digit. If the first
  3043 	 * letter is L, there must be at least one digit following. If both
  3044 	 * digits and letters follow, we have a genuine error, else we have a
  3045 	 * capital L followed by digits, and we accept that as a non-error.
  3046 	 */
  3047 	if (g_utf8_get_char(checkword)=='L' &&
  3048 	  !mixdigit(g_utf8_next_char(checkword)))
  3049 	    query=FALSE;
  3050     }
  3051     return query;
  3052 }
  3053 
  3054 /*
  3055  * getaword:
  3056  *
  3057  * Extracts the first/next "word" from the line, and returns it.
  3058  * A word is defined as one English word unit--or at least that's the aim.
  3059  * "ptr" is advanced to the position in the line where we will start
  3060  * looking for the next word.
  3061  *
  3062  * Returns: A newly-allocated string.
  3063  */
  3064 gchar *getaword(const char **ptr)
  3065 {
  3066     const char *s,*t;
  3067     GString *word;
  3068     gunichar c,pc;
  3069     word=g_string_new(NULL);
  3070     for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
  3071       !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
  3072       **ptr;*ptr=g_utf8_next_char(*ptr))
  3073 	;
  3074     /*
  3075      * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
  3076      * Especially yucky is the case of L1,000
  3077      * This section looks for a pattern of characters including a digit
  3078      * followed by a comma or period followed by one or more digits.
  3079      * If found, it returns this whole pattern as a word; otherwise we discard
  3080      * the results and resume our normal programming.
  3081      */
  3082     s=*ptr;
  3083     for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
  3084       g_unichar_isalpha(g_utf8_get_char(s)) ||
  3085       g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
  3086 	g_string_append_unichar(word,g_utf8_get_char(s));
  3087     if (word->len)
  3088     {
  3089 	for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))
  3090 	{
  3091 	    c=g_utf8_get_char(t);
  3092 	    pc=g_utf8_get_char(g_utf8_prev_char(t));
  3093 	    if ((c=='.' || c==',') && g_unichar_isdigit(pc))
  3094 	    {
  3095 		*ptr=s;
  3096 		return g_string_free(word,FALSE);
  3097 	    }
  3098 	}
  3099     }
  3100     /* we didn't find a punctuated number - do the regular getword thing */
  3101     g_string_truncate(word,0);
  3102     c=g_utf8_get_char(*ptr);
  3103     for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);
  3104       *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))
  3105 	g_string_append_unichar(word,c);
  3106     return g_string_free(word,FALSE);
  3107 }
  3108 
  3109 /*
  3110  * isroman:
  3111  *
  3112  * Is this word a Roman Numeral?
  3113  *
  3114  * It doesn't actually validate that the number is a valid Roman Numeral--for
  3115  * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
  3116  * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
  3117  * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
  3118  * expressions thereof, except when it came to taxes. Allow any number of M,
  3119  * an optional D, an optional CM or CD, any number of optional Cs, an optional
  3120  * XL or an optional XC, an optional IX or IV, an optional V and any number
  3121  * of optional Is.
  3122  */
  3123 gboolean isroman(const char *t)
  3124 {
  3125     const char *s;
  3126     if (!t || !*t)
  3127 	return FALSE;
  3128     s=t;
  3129     while (g_utf8_get_char(t)=='m' && *t)
  3130 	t++;
  3131     if (g_utf8_get_char(t)=='d')
  3132 	t++;
  3133     if (g_str_has_prefix(t,"cm"))
  3134 	t+=2;
  3135     if (g_str_has_prefix(t,"cd"))
  3136 	t+=2;
  3137     while (g_utf8_get_char(t)=='c' && *t)
  3138 	t++;
  3139     if (g_str_has_prefix(t,"xl"))
  3140 	t+=2;
  3141     if (g_str_has_prefix(t,"xc"))
  3142 	t+=2;
  3143     if (g_utf8_get_char(t)=='l')
  3144 	t++;
  3145     while (g_utf8_get_char(t)=='x' && *t)
  3146 	t++;
  3147     if (g_str_has_prefix(t,"ix"))
  3148 	t+=2;
  3149     if (g_str_has_prefix(t,"iv"))
  3150 	t+=2;
  3151     if (g_utf8_get_char(t)=='v')
  3152 	t++;
  3153     while (g_utf8_get_char(t)=='i' && *t)
  3154 	t++;
  3155     return !*t;
  3156 }
  3157 
  3158 /*
  3159  * postprocess_for_DP:
  3160  *
  3161  * Invoked with the -d switch from flgets().
  3162  * It simply "removes" from the line a hard-coded set of common
  3163  * DP-specific tags, so that the line passed to the main routine has
  3164  * been pre-cleaned of DP markup.
  3165  */
  3166 void postprocess_for_DP(char *theline)
  3167 {
  3168     char *s,*t;
  3169     int i;
  3170     if (!*theline) 
  3171 	return;
  3172     for (i=0;*DPmarkup[i];i++)
  3173 	while ((s=strstr(theline,DPmarkup[i])))
  3174 	{
  3175 	    t=s+strlen(DPmarkup[i]);
  3176 	    memmove(s,t,strlen(t)+1);
  3177 	}
  3178 }
  3179 
  3180 /*
  3181  * postprocess_for_HTML:
  3182  *
  3183  * Invoked with the -m switch from flgets().
  3184  * It simply "removes" from the line a hard-coded set of common
  3185  * HTML tags and "replaces" a hard-coded set of common HTML
  3186  * entities, so that the line passed to the main routine has
  3187  * been pre-cleaned of HTML.
  3188  */
  3189 void postprocess_for_HTML(char *theline)
  3190 {
  3191     while (losemarkup(theline))
  3192 	;
  3193     loseentities(theline);
  3194 }
  3195 
  3196 char *losemarkup(char *theline)
  3197 {
  3198     char *s,*t;
  3199     int i;
  3200     s=strchr(theline,'<');
  3201     t=s?strchr(s,'>'):NULL;
  3202     if (!s || !t)
  3203 	return NULL;
  3204     for (i=0;*markup[i];i++)
  3205 	if (tagcomp(g_utf8_next_char(s),markup[i]))
  3206 	{
  3207 	    t=g_utf8_next_char(t);
  3208 	    memmove(s,t,strlen(t)+1);
  3209 	    return s;
  3210 	}
  3211     /* It's an unrecognized <xxx>. */
  3212     return NULL;
  3213 }
  3214 
  3215 void loseentities(char *theline)
  3216 {
  3217     int i;
  3218     gsize nb;
  3219     char *amp,*scolon;
  3220     gchar *s,*t;
  3221     gunichar c;
  3222     GTree *entities=NULL;
  3223     static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
  3224     if (!theline)
  3225     {
  3226 	if (entities)
  3227 	    g_tree_destroy(entities);
  3228 	entities=NULL;
  3229 	if (translit!=(GIConv)-1)
  3230 	    g_iconv_close(translit);
  3231 	translit=(GIConv)-1;
  3232 	if (to_utf8!=(GIConv)-1)
  3233 	    g_iconv_close(to_utf8);
  3234 	to_utf8=(GIConv)-1;
  3235 	return;
  3236     }
  3237     if (!*theline)
  3238 	return;
  3239     if (!entities)
  3240     {
  3241 	entities=g_tree_new((GCompareFunc)strcmp);
  3242 	for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
  3243 	    g_tree_insert(entities,HTMLentities[i].name,
  3244 	      GUINT_TO_POINTER(HTMLentities[i].c));
  3245     }
  3246     if (translit==(GIConv)-1)
  3247 	translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
  3248     if (to_utf8==(GIConv)-1)
  3249 	to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
  3250     while((amp=strchr(theline,'&')))
  3251     {
  3252 	scolon=strchr(amp,';');
  3253 	if (scolon)
  3254 	{
  3255 	    if (amp[1]=='#')
  3256 	    {
  3257 		if (amp+2+strspn(amp+2,"0123456789")==scolon)
  3258 		    c=strtol(amp+2,NULL,10);
  3259 		else if (amp[2]=='x' &&
  3260 		  amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
  3261 		    c=strtol(amp+3,NULL,16);
  3262 	    }
  3263 	    else
  3264 	    {
  3265 		s=g_strndup(amp+1,scolon-(amp+1));
  3266 	        c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
  3267 		g_free(s);
  3268 	    }
  3269 	}
  3270 	else
  3271 	    c=0;
  3272 	if (c)
  3273 	{
  3274 	    theline=amp;
  3275 	    if (c<128 || c>=192 && c<=255)	/* An ISO-8859-1 character */
  3276 		theline+=g_unichar_to_utf8(c,theline);
  3277 	    else
  3278 	    {
  3279 		s=g_malloc(6);
  3280 		nb=g_unichar_to_utf8(c,s);
  3281 		t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
  3282 		g_free(s);
  3283 		s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
  3284 		g_free(t);
  3285 		memcpy(theline,s,nb);
  3286 		g_free(s);
  3287 		theline+=nb;
  3288 	    }
  3289 	    memmove(theline,g_utf8_next_char(scolon),
  3290 	      strlen(g_utf8_next_char(scolon))+1);
  3291 	}
  3292 	else
  3293 	    theline=g_utf8_next_char(amp);
  3294     }
  3295 }
  3296 
  3297 gboolean tagcomp(const char *strin,const char *basetag)
  3298 {
  3299     gboolean retval;
  3300     gchar *s,*t;
  3301     if (g_utf8_get_char(strin)=='/')
  3302 	t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
  3303     else
  3304 	t=g_utf8_casefold(strin,-1);
  3305     s=g_utf8_casefold(basetag,-1);
  3306     retval=g_str_has_prefix(t,s);
  3307     g_free(s);
  3308     g_free(t);
  3309     return retval;
  3310 }
  3311 
  3312 void proghelp(GOptionContext *context)
  3313 {
  3314     gchar *help;
  3315     fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
  3316     fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
  3317     fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
  3318     fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
  3319       "For details, read the file COPYING.\n",stderr);
  3320     fputs("This is Free Software; "
  3321       "you may redistribute it under certain conditions (GPL);\n",stderr);
  3322     fputs("read the file COPYING for details.\n\n",stderr);
  3323     help=g_option_context_get_help(context,TRUE,NULL);
  3324     fputs(help,stderr);
  3325     g_free(help);
  3326     fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
  3327     fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
  3328       "non-ASCII\n",stderr);
  3329     fputs("characters like accented letters, "
  3330       "lines longer than 75 or shorter than 55,\n",stderr);
  3331     fputs("unbalanced quotes or brackets, "
  3332       "a variety of badly formatted punctuation, \n",stderr);
  3333     fputs("HTML tags, some likely typos. "
  3334       "It is NOT a substitute for human judgement.\n",stderr);
  3335     fputs("\n",stderr);
  3336 }