bookloupe/bookloupe.c
author ali <ali@juiblex.co.uk>
Sun Sep 29 09:19:46 2013 +0100 (2013-09-29)
changeset 131 2ff298db529e
parent 103 adc06e9e8470
child 132 237b058061f2
permissions -rw-r--r--
Fix bug #13: Character sets
     1 /*************************************************************************/
     2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
     3 /*									 */
     4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>			 */
     5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>			 */
     6 /*									 */
     7 /* This program is free software; you can redistribute it and/or modify  */
     8 /* it under the terms of the GNU General Public License as published by  */
     9 /* the Free Software Foundation; either version 2 of the License, or     */
    10 /* (at your option) any later version.					 */
    11 /*									 */
    12 /* This program is distributed in the hope that it will be useful,       */
    13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of	 */
    14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the		 */
    15 /* GNU General Public License for more details.				 */
    16 /*									 */
    17 /* You should have received a copy of the GNU General Public License	 */
    18 /* along with this program. If not, see <http://www.gnu.org/licenses/>.	 */
    19 /*************************************************************************/
    20 
    21 #include <stdio.h>
    22 #include <stdlib.h>
    23 #include <string.h>
    24 #include <ctype.h>
    25 #ifdef __WIN32__
    26 #include <windows.h>
    27 #endif
    28 #include <glib.h>
    29 #include <bl/bl.h>
    30 #include "bookloupe.h"
    31 #include "counters.h"
    32 #include "pending.h"
    33 #include "HTMLentities.h"
    34 
/*
 * Name of the character set selected through set_charset(), or NULL for
 * auto (ISO_8859-1/ASCII or UNICODE).
 */
gchar *charset;
/*
 * Converter from UTF-8 to charset opened by set_charset(), or (GIConv)-1
 * when no converter is open (auto mode or UTF-8).
 */
GIConv charset_validator=(GIConv)-1;

/*
 * NOTE(review): not referenced in this part of the file; presumably the
 * previously processed line -- confirm against its users.
 */
gchar *prevline;
    39 
/*
 * Common typos.
 * All entries are lower case; the list is terminated by an empty string.
 */
char *typo[] = {
    "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
    "nad", "te", "ig", "acn",  "ahve", "alot", "anbd", "andt", "awya", "aywa",
    "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
    "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
    "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
    "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
    "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
    "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
    "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
    "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
    "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
    "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
    "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
    "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
    "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
    "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
    "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
    "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
    "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
    "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
    "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
    "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
    "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
    "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
    "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
    "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
    "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
    "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
    "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
    "se", ""
};
    73 
/*
 * User-defined typos, keyed by the typo string. Created by
 * read_user_scannos(); stays NULL when no user typo file has been loaded.
 */
GTree *usertypo;
    75 
/*
 * Common abbreviations and other OK words not to query as typos.
 * The list is terminated by an empty string.
 */
char *okword[] = {
    "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
    "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
    "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
    "outbid", "outbids", "frostbite", "frostbitten", ""
};
    83 
/*
 * Common abbreviations that cause otherwise unexplained periods.
 * The list is terminated by an empty string.
 */
char *abbrev[] = {
    "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
    "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
};
    89 
/*
 * Two-letter combinations that rarely if ever start words,
 * but are common scannos or otherwise common letter combinations.
 * The list is terminated by an empty string.
 */
char *nostart[] = {
    "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
};
    97 
/*
 * Two-letter combinations that rarely if ever end words,
 * but are common scannos or otherwise common letter combinations.
 * The list is terminated by an empty string.
 */
char *noend[] = {
    "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
    "sw", "gr", "sl", "cl", "iy", ""
};
   106 
/*
 * HTML element names, lower case and without angle brackets.
 * The list is terminated by an empty string.
 * NOTE(review): presumably the tags recognised when checking for or
 * stripping markup -- confirm against the users of this table.
 */
char *markup[] = {
    "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
    "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
    "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
    "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
};
   113 
/*
 * Literal markup tokens, stored with their delimiters.
 * The list is terminated by an empty string.
 * NOTE(review): presumably the DP-specific markup that the --dp switch
 * ignores -- confirm against postprocess_for_DP().
 */
char *DPmarkup[] = {
    "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
};
   117 
/*
 * Lower-case word list terminated by an empty string.
 * NOTE(review): judging by the name, words that are not expected to be
 * followed by a comma -- confirm against the code that consults it.
 * "mrs" appears twice in the list.
 */
char *nocomma[] = {
    "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
    "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
    "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
    "during", "let", "toward", "among", ""
};
   124 
/*
 * Lower-case word list terminated by an empty string.
 * NOTE(review): judging by the name, words that are not expected to be
 * followed by a period -- confirm against the code that consults it.
 */
char *noperiod[] = {
    "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
    "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
    "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
    "among", "those", "into", "whom", "having", "thence", ""
}; 
   131 
/* Program switches, indexed by the *_SWITCH constants. */
gboolean pswit[SWITNO];  /* program switches */
/* Raw value of --charset; consumed and freed by parse_options(). */
gchar *opt_charset;

/*
 * Command line options. Each flag option stores straight into its pswit
 * slot; parse_options() afterwards inverts the switches whose option
 * turns a feature OFF (relaxed, line-end, noecho). The table is
 * terminated by an all-NULL entry.
 */
static GOptionEntry options[]={
    { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
      "Ignore DP-specific markup", NULL },
    { "noecho", 'e', 0, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
      "Don't echo queried line", NULL },
    { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
      "Check single quotes", NULL },
    { "typo", 't', 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
      "Check common typos", NULL },
    { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
      "Require closure of quotes on every paragraph", NULL },
    { "relaxed", 'x', 0, G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
      "Disable paranoid querying of everything", NULL },
    { "line-end", 'l', 0, G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
      "Disable line end checking", NULL },
    { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
      "Overview: just show counts", NULL },
    { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
      "Output errors to stdout instead of stderr", NULL },
    { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
      "Echo header fields", NULL },
    { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
      "Ignore markup in < >", NULL },
    { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
      "Use file of user-defined typos", NULL },
    { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
      "Defaults for use on www upload", NULL },
    { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
      "Verbose - list everything", NULL },
    { "charset", 0, 0, G_OPTION_ARG_STRING, &opt_charset,
      "Set of characters valid for this ebook", "NAME" },
    { NULL }
};
   168 
/*
 * Query counters. The twelve "overview mode" counters are printed and
 * summed into TOTAL QUERIES by main(); cnt_spacend is not part of that
 * total and is reported by report_first_pass() instead.
 */
long cnt_dquot;		/* for overview mode, count of doublequote queries */
long cnt_squot;		/* for overview mode, count of singlequote queries */
long cnt_brack;		/* for overview mode, count of brackets queries */
long cnt_bin;		/* for overview mode, count of non-ASCII queries */
long cnt_odd;		/* for overview mode, count of odd character queries */
long cnt_long;		/* for overview mode, count of long line errors */
long cnt_short;		/* for overview mode, count of short line queries */
long cnt_punct;		/* for overview mode,
			   count of punctuation and spacing queries */
long cnt_dash;		/* for overview mode, count of dash-related queries */
long cnt_word;		/* for overview mode, count of word queries */
long cnt_html;		/* for overview mode, count of html queries */
long cnt_lineend;	/* for overview mode, count of line-end queries */
long cnt_spacend;	/* count of lines with space at end */
long linecnt;		/* count of total lines in the file */
long checked_linecnt;	/* count of lines actually checked */
   185 
/*
 * Prototypes for routines called from this part of the file.
 * NOTE(review): their definitions are outside the section reviewed.
 */
void proghelp(GOptionContext *context);
void procfile(const char *);

/* Directory part of argv[0]; set and freed by main(). */
gchar *running_from;

/* Prototypes for helper routines. */
gboolean mixdigit(const char *);
gchar *getaword(const char **);
char *flgets(char **,long);
void postprocess_for_HTML(char *);
char *linehasmarkup(char *);
char *losemarkup(char *);
gboolean tagcomp(const char *,const char *);
void loseentities(char *);
gboolean isroman(const char *);
void postprocess_for_DP(char *);
void print_as_windows_1252(const char *string);
void print_as_utf_8(const char *string);

/*
 * NOTE(review): not referenced in this part of the file; presumably
 * record words already queried -- confirm against their users.
 */
GTree *qword,*qperiod;

#ifdef __WIN32__
/* Console output code page at startup; restored by cleanup_on_exit(). */
UINT saved_cp;
#endif
   209 
   210 gboolean set_charset(const char *name,GError **err)
   211 {
   212     /* The various UNICODE encodings all share the same character set. */
   213     const char *unicode_aliases[]={ "UCS-2", "UCS-2BE", "UCS-2LE", "UCS-4",
   214       "UCS-4BE", "UCS-4LE", "UCS2", "UCS4", "UNICODE", "UNICODEBIG",
   215       "UNICODELITTLE", "UTF-7", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE",
   216       "UTF-32", "UTF-32BE", "UTF-32LE", "UTF7", "UTF8", "UTF16", "UTF16BE",
   217       "UTF16LE", "UTF32", "UTF32BE", "UTF32LE" };
   218     int i;
   219     if (charset)
   220 	g_free(charset);
   221     if (charset_validator!=(GIConv)-1)
   222 	g_iconv_close(charset_validator);
   223     if (!name || !g_strcasecmp(name,"auto"))
   224     {
   225 	charset=NULL;
   226 	charset_validator=(GIConv)-1;
   227 	return TRUE;
   228     }
   229     else
   230 	charset=g_strdup(name);
   231     for(i=0;i<G_N_ELEMENTS(unicode_aliases);i++)
   232 	if (!g_strcasecmp(charset,unicode_aliases[i]))
   233 	{
   234 	    g_free(charset);
   235 	    charset=g_strdup("UTF-8");
   236 	    break;
   237 	}
   238     if (!strcmp(charset,"UTF-8"))
   239 	charset_validator=(GIConv)-1;
   240     else
   241     {
   242 	charset_validator=g_iconv_open(charset,"UTF-8");
   243 	if (charset_validator==(GIConv)-1)
   244 	{
   245 	    g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_NO_CONVERSION,
   246 	      "Unknown character set \"%s\"",charset);
   247 	    return FALSE;
   248 	}
   249     }
   250     return TRUE;
   251 }
   252 
   253 void parse_options(int *argc,char ***argv)
   254 {
   255     GError *err=NULL;
   256     GOptionContext *context;
   257     context=g_option_context_new(
   258       "file - looks for errors in Project Gutenberg(TM) etexts");
   259     g_option_context_add_main_entries(context,options,NULL);
   260     if (!g_option_context_parse(context,argc,argv,&err))
   261     {
   262 	g_printerr("Bookloupe: %s\n",err->message);
   263 	g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
   264 	exit(1);
   265     }
   266     /* Paranoid checking is turned OFF, not on, by its switch */
   267     pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
   268     if (pswit[PARANOID_SWITCH])
   269 	/* if running in paranoid mode, typo checks default to enabled */
   270 	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
   271     /* Line-end checking is turned OFF, not on, by its switch */
   272     pswit[LINE_END_SWITCH]=!pswit[LINE_END_SWITCH];
   273     /* Echoing is turned OFF, not on, by its switch */
   274     pswit[ECHO_SWITCH]=!pswit[ECHO_SWITCH];
   275     if (pswit[OVERVIEW_SWITCH])
   276 	/* just print summary; don't echo */
   277 	pswit[ECHO_SWITCH]=FALSE;
   278     /*
   279      * Web uploads - for the moment, this is really just a placeholder
   280      * until we decide what processing we really want to do on web uploads
   281      */
   282     if (pswit[WEB_SWITCH])
   283     {
   284 	/* specific override for web uploads */
   285 	pswit[ECHO_SWITCH]=TRUE;
   286 	pswit[SQUOTE_SWITCH]=FALSE;
   287 	pswit[TYPO_SWITCH]=TRUE;
   288 	pswit[QPARA_SWITCH]=FALSE;
   289 	pswit[PARANOID_SWITCH]=TRUE;
   290 	pswit[LINE_END_SWITCH]=FALSE;
   291 	pswit[OVERVIEW_SWITCH]=FALSE;
   292 	pswit[STDOUT_SWITCH]=FALSE;
   293 	pswit[HEADER_SWITCH]=TRUE;
   294 	pswit[VERBOSE_SWITCH]=FALSE;
   295 	pswit[MARKUP_SWITCH]=FALSE;
   296 	pswit[USERTYPO_SWITCH]=FALSE;
   297 	pswit[DP_SWITCH]=FALSE;
   298     }
   299     if (opt_charset && !set_charset(opt_charset,&err))
   300     {
   301 	g_printerr("%s\n",err->message);
   302 	exit(1);
   303     }
   304     g_free(opt_charset);
   305     opt_charset=NULL;
   306     if (*argc<2)
   307     {
   308 	proghelp(context);
   309 	exit(1);
   310     }
   311     g_option_context_free(context);
   312 }
   313 
   314 /*
   315  * read_user_scannos:
   316  *
   317  * Read in the user-defined stealth scanno list.
   318  */
   319 void read_user_scannos(void)
   320 {
   321     GError *err=NULL;
   322     gchar *usertypo_file;
   323     gboolean okay;
   324     int i;
   325     gsize len,nb;
   326     gchar *contents,*utf8,**lines;
   327     usertypo_file=g_strdup("bookloupe.typ");
   328     okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   329     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   330     {
   331 	g_clear_error(&err);
   332 	g_free(usertypo_file);
   333 	usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
   334 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   335     }
   336     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   337     {
   338 	g_clear_error(&err);
   339 	g_free(usertypo_file);
   340 	usertypo_file=g_strdup("gutcheck.typ");
   341 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   342     }
   343     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   344     {
   345 	g_clear_error(&err);
   346 	g_free(usertypo_file);
   347 	usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
   348 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   349     }
   350     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   351     {
   352 	g_free(usertypo_file);
   353 	g_print("   --> I couldn't find bookloupe.typ "
   354 	  "-- proceeding without user typos.\n");
   355 	return;
   356     }
   357     else if (!okay)
   358     {
   359 	fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
   360 	g_free(usertypo_file);
   361 	g_clear_error(&err);
   362 	exit(1);
   363     }
   364     if (g_utf8_validate(contents,len,NULL))
   365     {
   366 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
   367 	if (!charset)
   368 	    (void)set_charset("UNICODE",NULL);
   369     }
   370     else
   371 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
   372     g_free(contents);
   373     lines=g_strsplit_set(utf8,"\r\n",0);
   374     g_free(utf8);
   375     usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
   376     for (i=0;lines[i];i++)
   377 	if (*(unsigned char *)lines[i]>'!')
   378 	    g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
   379 	else
   380 	    g_free(lines[i]);
   381     g_free(lines);
   382 }
   383 
   384 /*
   385  * read_etext:
   386  *
   387  * Read an etext returning a newly allocated string containing the file
   388  * contents or NULL on error.
   389  */
   390 gchar *read_etext(const char *filename,GError **err)
   391 {
   392     GError *tmp_err=NULL;
   393     gchar *contents,*utf8;
   394     gsize len,bytes_read,bytes_written;
   395     int i,line,col;
   396     if (!g_file_get_contents(filename,&contents,&len,err))
   397 	return NULL;
   398     if (g_utf8_validate(contents,len,NULL))
   399     {
   400 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
   401 	g_set_print_handler(print_as_utf_8);
   402 #ifdef __WIN32__
   403 	SetConsoleOutputCP(CP_UTF8);
   404 #endif
   405     }
   406     else
   407     {
   408 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,
   409 	  &bytes_written,&tmp_err);
   410 	if (g_error_matches(tmp_err,G_CONVERT_ERROR,
   411 	  G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
   412 	{
   413 	    line=col=1;
   414 	    for(i=0;i<bytes_read;i++)
   415 		if (contents[i]=='\n')
   416 		{
   417 		    line++;
   418 		    col=1;
   419 		}
   420 		else if (contents[i]!='\r')
   421 		    col++;
   422 	    g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
   423 	      "Input conversion failed. Byte %d at line %d, column %d is not a "
   424 	      "valid Windows-1252 character",
   425 	      ((unsigned char *)contents)[bytes_read],line,col);
   426 	}
   427 	else if (tmp_err)
   428 	    g_propagate_error(err,tmp_err);
   429 	g_set_print_handler(print_as_windows_1252);
   430 #ifdef __WIN32__
   431 	SetConsoleOutputCP(1252);
   432 #endif
   433     }
   434     g_free(contents);
   435     return utf8;
   436 }
   437 
/*
 * cleanup_on_exit:
 *
 * Exit handler registered by main() on Windows. Restores the console
 * output code page saved at startup; does nothing on other platforms.
 */
void cleanup_on_exit(void)
{
#ifdef __WIN32__
    SetConsoleOutputCP(saved_cp);
#endif
}
   444 
/*
 * main:
 *
 * Parse the options, optionally load the user typo list, check the file
 * named by the first remaining argument and, in overview mode, print the
 * non-zero query counters followed by their total.
 */
int main(int argc,char **argv)
{
#ifdef __WIN32__
    /* Reading the etext may change the console code page; put it back. */
    atexit(cleanup_on_exit);
    saved_cp=GetConsoleOutputCP();
#endif
    running_from=g_path_get_dirname(argv[0]);
    /* parse_options() exits unless a file argument remains in argv[1] */
    parse_options(&argc,&argv);
    if (pswit[USERTYPO_SWITCH])
	read_user_scannos();
    fprintf(stderr,"bookloupe: Check and report on an e-text\n");
    procfile(argv[1]);
    if (pswit[OVERVIEW_SWITCH])
    {
	g_print("    Checked %ld lines of %ld (head+foot = %ld)\n\n",
	  checked_linecnt,linecnt,linecnt-checked_linecnt);
	g_print("    --------------- Queries found --------------\n");
	if (cnt_long)
	    g_print("    Long lines:		    %14ld\n",cnt_long);
	if (cnt_short)
	    g_print("    Short lines:		   %14ld\n",cnt_short);
	if (cnt_lineend)
	    g_print("    Line-end problems:	     %14ld\n",cnt_lineend);
	if (cnt_word)
	    g_print("    Common typos:		  %14ld\n",cnt_word);
	if (cnt_dquot)
	    g_print("    Unmatched quotes:	      %14ld\n",cnt_dquot);
	if (cnt_squot)
	    g_print("    Unmatched SingleQuotes:	%14ld\n",cnt_squot);
	if (cnt_brack)
	    g_print("    Unmatched brackets:	    %14ld\n",cnt_brack);
	if (cnt_bin)
	    g_print("    Non-ASCII characters:	  %14ld\n",cnt_bin);
	if (cnt_odd)
	    g_print("    Proofing characters:	   %14ld\n",cnt_odd);
	if (cnt_punct)
	    g_print("    Punctuation & spacing queries: %14ld\n",cnt_punct);
	if (cnt_dash)
	    g_print("    Non-standard dashes:	   %14ld\n",cnt_dash);
	if (cnt_html)
	    g_print("    Possible HTML tags:	    %14ld\n",cnt_html);
	g_print("\n");
	g_print("    TOTAL QUERIES		  %14ld\n",
	  cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+
	  cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend);
    }
    g_free(running_from);
    if (usertypo)
	g_tree_unref(usertypo);
    /* Selecting auto mode releases the charset name and any validator */
    set_charset(NULL,NULL);
    return 0;
}
   497 
   498 /*
   499  * first_pass:
   500  *
   501  * Run a first pass - verify that it's a valid PG
   502  * file, decide whether to report some things that
   503  * occur many times in the text like long or short
   504  * lines, non-standard dashes, etc.
   505  */
   506 struct first_pass_results *first_pass(const char *etext)
   507 {
   508     gunichar laststart=CHAR_SPACE;
   509     const char *s;
   510     gchar *lc_line;
   511     int i,j,lbytes,llen;
   512     gchar **lines;
   513     unsigned int lastlen=0,lastblen=0;
   514     long spline=0,nspline=0;
   515     static struct first_pass_results results={0};
   516     gchar *inword;
   517     lines=g_strsplit(etext,"\n",0);
   518     for (j=0;lines[j];j++)
   519     {
   520 	lbytes=strlen(lines[j]);
   521 	while (lbytes>0 && lines[j][lbytes-1]=='\r')
   522 	    lines[j][--lbytes]='\0';
   523 	llen=g_utf8_strlen(lines[j],lbytes);
   524 	linecnt++;
   525 	if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
   526 	  (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
   527 	{
   528 	    if (spline)
   529 		g_print("   --> Duplicate header?\n");
   530 	    spline=linecnt+1;   /* first line of non-header text, that is */
   531 	}
   532 	if (!strncmp(lines[j],"*** START",9) &&
   533 	  strstr(lines[j],"PROJECT GUTENBERG"))
   534 	{
   535 	    if (nspline)
   536 		g_print("   --> Duplicate header?\n");
   537 	    nspline=linecnt+1;   /* first line of non-header text, that is */
   538 	}
   539 	if (spline || nspline)
   540 	{
   541 	    lc_line=g_utf8_strdown(lines[j],lbytes);
   542 	    if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
   543 	    {
   544 		if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
   545 		{
   546 		    if (results.footerline)
   547 		    {
   548 			/* it's an old-form header - we can detect duplicates */
   549 			if (!nspline)
   550 			    g_print("   --> Duplicate footer?\n");
   551 		    }
   552 		    else
   553 			results.footerline=linecnt;
   554 		}
   555 	    }
   556 	    g_free(lc_line);
   557 	}
   558 	if (spline)
   559 	    results.firstline=spline;
   560 	if (nspline)
   561 	    results.firstline=nspline;  /* override with new */
   562 	if (results.footerline)
   563 	    continue;    /* don't count the boilerplate in the footer */
   564 	results.totlen+=llen;
   565 	for (s=lines[j];*s;s=g_utf8_next_char(s))
   566 	{
   567 	    if (g_utf8_get_char(s)>127)
   568 		results.binlen++;
   569 	    if (g_unichar_isalpha(g_utf8_get_char(s)))
   570 		results.alphalen++;
   571 	    if (s>lines[j] && g_utf8_get_char(s)==CHAR_DQUOTE &&
   572 	      isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
   573 		results.endquote_count++;
   574 	}
   575 	if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
   576 	  lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
   577 	    results.shortline++;
   578 	if (lbytes>0 &&
   579 	  g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
   580 	    cnt_spacend++;
   581 	if (strstr(lines[j],".,"))
   582 	    results.dotcomma++;
   583 	/* only count ast lines for ignoring purposes where there is */
   584 	/* locase text on the line */
   585 	if (strchr(lines[j],'*'))
   586 	{
   587 	    for (s=lines[j];*s;s=g_utf8_next_char(s))
   588 		if (g_unichar_islower(g_utf8_get_char(s)))
   589 		    break;
   590 	    if (*s)
   591 		results.astline++;
   592 	}
   593 	if (strchr(lines[j],'/'))
   594 	    results.fslashline++;
   595 	if (lbytes>0)
   596 	{
   597 	    for (s=g_utf8_prev_char(lines[j]+lbytes);
   598 	      s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;
   599 	      s=g_utf8_prev_char(s))
   600 		;
   601 	    if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
   602 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')
   603 		results.hyphens++;
   604 	}
   605 	if (llen>LONGEST_PG_LINE)
   606 	    results.longline++;
   607 	if (llen>WAY_TOO_LONG)
   608 	    results.verylongline++;
   609 	if (strchr(lines[j],'<') && strchr(lines[j],'>'))
   610 	{
   611 	    i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
   612 	    if (i>0)
   613 		results.htmcount++;
   614 	    if (strstr(lines[j],"<i>"))
   615 		results.htmcount+=4; /* bonus marks! */
   616 	}
   617 	/* Check for spaced em-dashes */
   618 	if (lines[j][0] && (s=strstr(g_utf8_next_char(lines[j]),"--")))
   619 	{
   620 	    results.emdash++;
   621 	    if (s[-1]==CHAR_SPACE || s[2]==CHAR_SPACE)
   622 		results.space_emdash++;
   623 	    if (s[-1]==CHAR_SPACE && s[2]==CHAR_SPACE)
   624 		/* count of em-dashes with spaces both sides */
   625 		results.non_PG_space_emdash++;
   626 	    if (s[-1]!=CHAR_SPACE && s[2]!=CHAR_SPACE)
   627 		/* count of PG-type em-dashes with no spaces */
   628 		results.PG_space_emdash++;
   629 	}
   630 	for (s=lines[j];*s;)
   631 	{
   632 	    inword=getaword(&s);
   633 	    if (!strcmp(inword,"hij") || !strcmp(inword,"niet")) 
   634 		results.Dutchcount++;
   635 	    if (!strcmp(inword,"dans") || !strcmp(inword,"avec")) 
   636 		results.Frenchcount++;
   637 	    if (!strcmp(inword,"0") || !strcmp(inword,"1")) 
   638 		results.standalone_digit++;
   639 	    g_free(inword);
   640 	}
   641 	/* Check for spaced dashes */
   642 	if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
   643 	    results.spacedash++;
   644 	lastblen=lastlen;
   645 	lastlen=llen;
   646 	laststart=lines[j][0];
   647     }
   648     g_strfreev(lines);
   649     return &results;
   650 }
   651 
/*
 * report_first_pass:
 *
 * Make some snap decisions based on the first pass results.
 *
 * Each warning starts out enabled and is switched off when the first pass
 * found so many instances that reporting every one would be noise; a
 * note is printed for each one switched off. Verbose mode switches most
 * of them back on afterwards.
 *
 * Side effects: may set pswit[MARKUP_SWITCH], may reset
 * results->firstline to 0, and exits with status 1 in auto charset mode
 * when the input does not look like ASCII text.
 *
 * Returns: a pointer to a static warnings structure; it must not be freed.
 */
struct warnings *report_first_pass(struct first_pass_results *results)
{
    static struct warnings warnings={0};
    if (cnt_spacend>0)
	g_print("   --> %ld lines in this file have white space at end\n",
	  cnt_spacend);
    /* If more than 5 lines contain ".,", don't bother reporting them. */
    warnings.dotcomma=1;
    if (results->dotcomma>5)
    {
	warnings.dotcomma=0;
	g_print("   --> %ld lines in this file contain '.,'. "
	  "Not reporting them.\n",results->dotcomma);
    }
    /*
     * If more than 50 lines, or one-tenth, are short,
     * don't bother reporting them.
     */
    warnings.shortline=1;
    if (results->shortline>50 || results->shortline*10>linecnt)
    {
	warnings.shortline=0;
	g_print("   --> %ld lines in this file are short. "
	  "Not reporting short lines.\n",results->shortline);
    }
    /*
     * If more than 50 lines, or one-tenth, are long,
     * don't bother reporting them.
     */
    warnings.longline=1;
    if (results->longline>50 || results->longline*10>linecnt)
    {
	warnings.longline=0;
	g_print("   --> %ld lines in this file are long. "
	  "Not reporting long lines.\n",results->longline);
    }
    /* If more than 10 lines contain asterisks, don't bother reporting them. */
    warnings.ast=1;
    if (results->astline>10)
    {
	warnings.ast=0;
	g_print("   --> %ld lines in this file contain asterisks. "
	  "Not reporting them.\n",results->astline);
    }
    /*
     * If more than 10 lines contain forward slashes,
     * don't bother reporting them.
     */
    warnings.fslash=1;
    if (results->fslashline>10)
    {
	warnings.fslash=0;
	g_print("   --> %ld lines in this file contain forward slashes. "
	  "Not reporting them.\n",results->fslashline);
    }
    /*
     * If more than 20 lines contain unpunctuated endquotes,
     * don't bother reporting them.
     */
    warnings.endquote=1;
    if (results->endquote_count>20)
    {
	warnings.endquote=0;
	g_print("   --> %ld lines in this file contain unpunctuated endquotes. "
	  "Not reporting them.\n",results->endquote_count);
    }
    /*
     * If more than 10 lines contain standalone digits,
     * don't bother reporting them.
     */
    warnings.digit=1;
    if (results->standalone_digit>10)
    {
	warnings.digit=0;
	g_print("   --> %ld lines in this file contain standalone 0s and 1s. "
	  "Not reporting them.\n",results->standalone_digit);
    }
    /*
     * If more than 20 lines contain hyphens at end,
     * don't bother reporting them.
     */
    warnings.hyphen=1;
    if (results->hyphens>20)
    {
	warnings.hyphen=0;
	g_print("   --> %ld lines in this file have hyphens at end. "
	  "Not reporting them.\n",results->hyphens);
    }
    /* A high markup score means the text is treated as HTML from here on */
    if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
    {
	g_print("   --> Looks like this is HTML. Switching HTML mode ON.\n");
	pswit[MARKUP_SWITCH]=1;
    }
    if (results->verylongline>0)
	g_print("   --> %ld lines in this file are VERY long!\n",
	  results->verylongline);
    /*
     * If there are more non-PG spaced dashes than PG em-dashes,
     * assume it's deliberate.
     * Current PG guidelines say don't use them, but older texts do,
     * and some people insist on them whatever the guidelines say.
     */
    warnings.dash=1;
    if (results->spacedash+results->non_PG_space_emdash>
      results->PG_space_emdash)
    {
	warnings.dash=0;
	g_print("   --> There are %ld spaced dashes and em-dashes. "
	  "Not reporting them.\n",
	  results->spacedash+results->non_PG_space_emdash);
    }
    /* With an explicit charset the hi-bit character checks do not apply */
    if (charset)
	warnings.bin=0;
    else
    {
	/* Charset ISO_8859-1/ASCII checks for compatibility with gutcheck */
	warnings.bin=1;
	/* If more than a quarter of characters are hi-bit, bug out. */
	if (results->binlen*4>results->totlen)
	{
	    g_print("   --> This file does not appear to be ASCII. "
	      "Terminating. Best of luck with it!\n");
	    exit(1);
	}
	/* If fewer than a quarter of characters are letters, bug out. */
	if (results->alphalen*4<results->totlen)
	{
	    g_print("   --> This file does not appear to be text. "
	      "Terminating. Best of luck with it!\n");
	    exit(1);
	}
	/*
	 * More than one in a hundred, or more than 100 in total: the note
	 * is always printed, but verbose mode keeps the warning enabled.
	 */
	if (results->binlen*100>results->totlen || results->binlen>100)
	{
	    g_print("   --> There are a lot of foreign letters here. "
	      "Not reporting them.\n");
	    if (!pswit[VERBOSE_SWITCH])
		warnings.bin=0;
	}
    }
    warnings.isDutch=FALSE;
    if (results->Dutchcount>50)
    {
	warnings.isDutch=TRUE;
	g_print("   --> This looks like Dutch - "
	  "switching off dashes and warnings for 's Middags case.\n");
    }
    warnings.isFrench=FALSE;
    if (results->Frenchcount>50)
    {
	warnings.isFrench=TRUE;
	g_print("   --> This looks like French - "
	  "switching off some doublepunct.\n");
    }
    if (results->firstline && results->footerline)
	g_print("    The PG header and footer appear to be already on.\n");
    else
    {
	if (results->firstline)
	    g_print("    The PG header is on - no footer.\n");
	if (results->footerline)
	    g_print("    The PG footer is on - no header.\n");
    }
    g_print("\n");
    /* Verbose mode re-enables the warnings that were suppressed above */
    if (pswit[VERBOSE_SWITCH])
    {
	warnings.shortline=1;
	warnings.dotcomma=1;
	warnings.longline=1;
	warnings.dash=1;
	warnings.digit=1;
	warnings.ast=1;
	warnings.fslash=1;
	warnings.hyphen=1;
	warnings.endquote=1;
	g_print("   *** Verbose output is ON -- you asked for it! ***\n");
    }
    /* Applied after the verbose override, so Dutch always disables dashes */
    if (warnings.isDutch)
	warnings.dash=0;
    /*
     * Fewer than 100 lines between header and footer: the detected
     * positions are not trusted, so check from the top of the file.
     */
    if (results->footerline>0 && results->firstline>0 &&
      results->footerline>results->firstline &&
      results->footerline-results->firstline<100)
    {
	g_print("   --> I don't really know where this text starts. \n");
	g_print("       There are no reference points.\n");
	g_print("       I'm going to have to report the header and footer "
	  "as well.\n");
	results->firstline=0;
    }
    return &warnings;
}
   845 
   846 /*
   847  * analyse_quotes:
   848  *
   849  * Look along the line, accumulate the count of quotes, and see
   850  * if this is an empty line - i.e. a line with nothing on it
   851  * but spaces.
   852  * If line has just spaces, period, * and/or - on it, don't
   853  * count it, since empty lines with asterisks or dashes to
   854  * separate sections are common.
   855  *
   856  * Returns: TRUE if the line is empty.
   857  */
   858 gboolean analyse_quotes(const char *aline,struct counters *counters)
   859 {
   860     int guessquote=0;
   861     /* assume the line is empty until proven otherwise */
   862     gboolean isemptyline=TRUE;
   863     const char *s=aline,*sprev,*snext;
   864     gunichar c;
   865     sprev=NULL;
   866     while (*s)
   867     {
   868 	snext=g_utf8_next_char(s);
   869 	c=g_utf8_get_char(s);
   870 	if (c==CHAR_DQUOTE)
   871 	    counters->quot++;
   872 	if (CHAR_IS_SQUOTE(c))
   873 	{
   874 	    if (s==aline)
   875 	    {
   876 		/*
   877 		 * At start of line, it can only be an openquote.
   878 		 * Hardcode a very common exception!
   879 		 */
   880 		if (!g_str_has_prefix(snext,"tis") &&
   881 		  !g_str_has_prefix(snext,"Tis"))
   882 		    increment_matching(counters,c,TRUE);
   883 	    }
   884 	    else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
   885 	      g_unichar_isalpha(g_utf8_get_char(snext)))
   886 		/* Do nothing! it's definitely an apostrophe, not a quote */
   887 		;
   888 	    /* it's outside a word - let's check it out */
   889 	    else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||
   890 	      g_unichar_isalpha(g_utf8_get_char(snext)))
   891 	    {
   892 		/* it damwell better BE an openquote */
   893 		if (!g_str_has_prefix(snext,"tis") &&
   894 		  !g_str_has_prefix(snext,"Tis"))
   895 		    /* hardcode a very common exception! */
   896 		    increment_matching(counters,c,TRUE);
   897 	    }
   898 	    else
   899 	    {
   900 		/* now - is it a closequote? */
   901 		guessquote=0;   /* accumulate clues */
   902 		if (g_unichar_isalpha(g_utf8_get_char(sprev)))
   903 		{
   904 		    /* it follows a letter - could be either */
   905 		    guessquote++;
   906 		    if (g_utf8_get_char(sprev)=='s')
   907 		    {
   908 			/* looks like a plural apostrophe */
   909 			guessquote-=3;
   910 			if (g_utf8_get_char(snext)==CHAR_SPACE)
   911 			    /* bonus marks! */
   912 			    guessquote-=2;
   913 		    }
   914 		}
   915 		/* it doesn't have a letter either side */
   916 		else if (strchr(".?!,;:",g_utf8_get_char(sprev)) &&
   917 		  strchr(".?!,;: ",g_utf8_get_char(snext)))
   918 		    guessquote+=8; /* looks like a closequote */
   919 		else
   920 		    guessquote++;
   921 		if (matching_difference(counters,CHAR_SQUOTE)>0)
   922 		    /*
   923 		     * Give it the benefit of some doubt,
   924 		     * if a squote is already open.
   925 		     */
   926 		    guessquote++;
   927 		else
   928 		    guessquote--;
   929 		if (guessquote>=0)
   930 		    increment_matching(counters,c,FALSE);
   931 	    }
   932 	}
   933 	if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
   934 	  c!='\r' && c!='\n')
   935 	    isemptyline=FALSE;  /* ignore lines like  *  *  *  as spacers */
   936 	if (c==CHAR_UNDERSCORE)
   937 	    counters->c_unders++;
   938 	if (c==CHAR_OPEN_SBRACK)
   939 	{
   940 	    if (!matching_difference(counters,COUNTER_ILLUSTRATION) &&
   941 	      !matching_difference(counters,c) && s==aline &&
   942 	      g_str_has_prefix(s,"[Illustration:"))
   943 		increment_matching(counters,COUNTER_ILLUSTRATION,TRUE);
   944 	    else
   945 		increment_matching(counters,c,TRUE);
   946 	}
   947 	else if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK)
   948 	    increment_matching(counters,c,TRUE);
   949 	if (c==CHAR_CLOSE_SBRACK)
   950 	{
   951 	    if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) &&
   952 	      !matching_difference(counters,c) && !*snext)
   953 		increment_matching(counters,COUNTER_ILLUSTRATION,FALSE);
   954 	    else
   955 		increment_matching(counters,c,FALSE);
   956 	}
   957 	else if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK)
   958 	    increment_matching(counters,c,FALSE);
   959 	sprev=s;
   960 	s=snext;
   961     }
   962     return isemptyline;
   963 }
   964 
   965 /*
   966  * check_for_control_characters:
   967  *
   968  * Check for invalid or questionable characters in the line
   969  * Anything above 127 is invalid for plain ASCII, and
   970  * non-printable control characters should also be flagged.
   971  * Tabs should generally not be there.
   972  */
   973 void check_for_control_characters(const char *aline)
   974 {
   975     gunichar c;
   976     const char *s;
   977     for (s=aline;*s;s=g_utf8_next_char(s))
   978     {
   979 	c=g_utf8_get_char(s);
   980 	if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
   981 	{
   982 	    if (pswit[ECHO_SWITCH])
   983 		g_print("\n%s\n",aline);
   984 	    if (!pswit[OVERVIEW_SWITCH])
   985 		g_print("    Line %ld column %ld - Control character %u\n",
   986 		  linecnt,g_utf8_pointer_to_offset(s,aline)+1,c);
   987 	    else
   988 		cnt_bin++;
   989 	}
   990     }
   991 }
   992 
   993 /*
   994  * check_for_odd_characters:
   995  *
   996  * Check for binary and other odd characters.
   997  */
   998 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
   999   gboolean isemptyline)
  1000 {
  1001     /* Don't repeat multiple warnings on one line. */
  1002     gboolean eInvalidChar=FALSE,eTab=FALSE,eTilde=FALSE;
  1003     gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
  1004     const char *s;
  1005     gunichar c;
  1006     gsize nb;
  1007     gchar *t;
  1008     for (s=aline;*s;s=g_utf8_next_char(s))
  1009     {
  1010 	c=g_utf8_get_char(s);
  1011 	if (warnings->bin && !eInvalidChar &&
  1012 	  (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
  1013 	{
  1014 	    if (pswit[ECHO_SWITCH])
  1015 		g_print("\n%s\n",aline);
  1016 	    if (!pswit[OVERVIEW_SWITCH])
  1017 		if (c>127 && c<160 || c>255)
  1018 		    g_print("    Line %ld column %ld - "
  1019 		      "Non-ISO-8859 character %u\n",
  1020 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1021 		else
  1022 		    g_print("    Line %ld column %ld - "
  1023 		      "Non-ASCII character %u\n",
  1024 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1025 	    else
  1026 		cnt_bin++;
  1027 	    eInvalidChar=TRUE;
  1028 	}
  1029 	if (!eInvalidChar && charset)
  1030 	{
  1031 	    if (charset_validator==(GIConv)-1)
  1032 	    {
  1033 		if (!g_unichar_isdefined(c))
  1034 		{
  1035 		    if (pswit[ECHO_SWITCH])
  1036 			g_print("\n%s\n",aline);
  1037 		    if (!pswit[OVERVIEW_SWITCH])
  1038 			g_print("    Line %ld column %ld - Unassigned UNICODE "
  1039 			  "code point U+%04" G_GINT32_MODIFIER "X\n",
  1040 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1041 		    else
  1042 			cnt_bin++;
  1043 		    eInvalidChar=TRUE;
  1044 		}
  1045 		else if (c>=0xE000 && c<=0xF8FF || c>=0xF0000 && c<=0xFFFFD ||
  1046 		  c>=100000 && c<=0x10FFFD)
  1047 		{
  1048 		    if (pswit[ECHO_SWITCH])
  1049 			g_print("\n%s\n",aline);
  1050 		    if (!pswit[OVERVIEW_SWITCH])
  1051 			g_print("    Line %ld column %ld - Private Use "
  1052 			  "character U+%04" G_GINT32_MODIFIER "X\n",
  1053 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1054 		    else
  1055 			cnt_bin++;
  1056 		    eInvalidChar=TRUE;
  1057 		}
  1058 	    }
  1059 	    else
  1060 	    {
  1061 		t=g_convert_with_iconv(s,g_utf8_next_char(s)-s,
  1062 		  charset_validator,NULL,&nb,NULL);
  1063 		if (t)
  1064 		    g_free(t);
  1065 		else
  1066 		{
  1067 		    if (pswit[ECHO_SWITCH])
  1068 			g_print("\n%s\n",aline);
  1069 		    if (!pswit[OVERVIEW_SWITCH])
  1070 			g_print("    Line %ld column %ld - Non-%s "
  1071 			  "character %u\n",linecnt,
  1072 			  g_utf8_pointer_to_offset(aline,s)+1,charset,c);
  1073 		    else
  1074 			cnt_bin++;
  1075 		    eInvalidChar=TRUE;
  1076 		}
  1077 	    }
  1078 	}
  1079 	if (!eTab && c==CHAR_TAB)
  1080 	{
  1081 	    if (pswit[ECHO_SWITCH])
  1082 		g_print("\n%s\n",aline);
  1083 	    if (!pswit[OVERVIEW_SWITCH])
  1084 		g_print("    Line %ld column %ld - Tab character?\n",
  1085 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1086 	    else
  1087 		cnt_odd++;
  1088 	    eTab=TRUE;
  1089 	}
  1090 	if (!eTilde && c==CHAR_TILDE)
  1091 	{
  1092 	    /*
  1093 	     * Often used by OCR software to indicate an
  1094 	     * unrecognizable character.
  1095 	     */
  1096 	    if (pswit[ECHO_SWITCH])
  1097 		g_print("\n%s\n",aline);
  1098 	    if (!pswit[OVERVIEW_SWITCH])
  1099 		g_print("    Line %ld column %ld - Tilde character?\n",
  1100 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1101 	    else
  1102 		cnt_odd++;
  1103 	    eTilde=TRUE;
  1104 	}
  1105 	if (!eCarat && c==CHAR_CARAT)
  1106 	{  
  1107 	    if (pswit[ECHO_SWITCH])
  1108 		g_print("\n%s\n",aline);
  1109 	    if (!pswit[OVERVIEW_SWITCH])
  1110 		g_print("    Line %ld column %ld - Carat character?\n",
  1111 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1112 	    else
  1113 		cnt_odd++;
  1114 	    eCarat=TRUE;
  1115 	}
  1116 	if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
  1117 	{  
  1118 	    if (pswit[ECHO_SWITCH])
  1119 		g_print("\n%s\n",aline);
  1120 	    if (!pswit[OVERVIEW_SWITCH])
  1121 		g_print("    Line %ld column %ld - Forward slash?\n",
  1122 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1123 	    else
  1124 		cnt_odd++;
  1125 	    eFSlash=TRUE;
  1126 	}
  1127 	/*
  1128 	 * Report asterisks only in paranoid mode,
  1129 	 * since they're often deliberate.
  1130 	 */
  1131 	if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
  1132 	  c==CHAR_ASTERISK)
  1133 	{
  1134 	    if (pswit[ECHO_SWITCH])
  1135 		g_print("\n%s\n",aline);
  1136 	    if (!pswit[OVERVIEW_SWITCH])
  1137 		g_print("    Line %ld column %ld - Asterisk?\n",
  1138 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1139 	    else
  1140 		cnt_odd++;
  1141 	    eAst=TRUE;
  1142 	}
  1143     }
  1144 }
  1145 
  1146 /*
  1147  * check_for_long_line:
  1148  *
  1149  * Check for line too long.
  1150  */
  1151 void check_for_long_line(const char *aline)
  1152 {
  1153     if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
  1154     {
  1155 	if (pswit[ECHO_SWITCH])
  1156 	    g_print("\n%s\n",aline);
  1157 	if (!pswit[OVERVIEW_SWITCH])
  1158 	    g_print("    Line %ld column %ld - Long line %ld\n",
  1159 	      linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
  1160 	else
  1161 	    cnt_long++;
  1162     }
  1163 }
  1164 
  1165 /*
  1166  * check_for_short_line:
  1167  *
  1168  * Check for line too short.
  1169  *
  1170  * This one is a bit trickier to implement: we don't want to
  1171  * flag the last line of a paragraph for being short, so we
  1172  * have to wait until we know that our current line is a
  1173  * "normal" line, then report the _previous_ line if it was too
  1174  * short. We also don't want to report indented lines like
  1175  * chapter heads or formatted quotations. We therefore keep
  1176  * last->len as the length of the last line examined, and
  1177  * last->blen as the length of the last but one, and try to
  1178  * suppress unnecessary warnings by checking that both were of
  1179  * "normal" length. We keep the first character of the last
  1180  * line in last->start, and if it was a space, we assume that
  1181  * the formatting is deliberate. I can't figure out a way to
  1182  * distinguish something like a quoted verse left-aligned or
  1183  * the header or footer of a letter from a paragraph of short
  1184  * lines - maybe if I examined the whole paragraph, and if the
  1185  * para has less than, say, 8 lines and if all lines are short,
  1186  * then just assume it's OK? Need to look at some texts to see
  1187  * how often a formula like this would get the right result.
  1188  */
  1189 void check_for_short_line(const char *aline,const struct line_properties *last)
  1190 {
  1191     if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
  1192       last->len<SHORTEST_PG_LINE && last->blen>1 &&
  1193       last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
  1194     {
  1195 	if (pswit[ECHO_SWITCH])
  1196 	    g_print("\n%s\n",prevline);
  1197 	if (!pswit[OVERVIEW_SWITCH])
  1198 	    g_print("    Line %ld column %ld - Short line %ld?\n",
  1199 	      linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
  1200 	else
  1201 	    cnt_short++;
  1202     }
  1203 }
  1204 
  1205 /*
  1206  * check_for_starting_punctuation:
  1207  *
  1208  * Look for punctuation other than full ellipses at start of line.
  1209  */
  1210 void check_for_starting_punctuation(const char *aline)
  1211 {
  1212     if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
  1213       !g_str_has_prefix(aline,". . ."))
  1214     {
  1215 	if (pswit[ECHO_SWITCH])
  1216 	    g_print("\n%s\n",aline);
  1217 	if (!pswit[OVERVIEW_SWITCH])
  1218 	    g_print("    Line %ld column 1 - Begins with punctuation?\n",
  1219 	      linecnt);
  1220 	else
  1221 	    cnt_punct++;
  1222     }
  1223 }
  1224 
  1225 /*
  1226  * check_for_spaced_emdash:
  1227  *
  1228  * Check for spaced em-dashes.
  1229  *
  1230  * We must check _all_ occurrences of "--" on the line
  1231  * hence the loop - even if the first double-dash is OK
  1232  * there may be another that's wrong later on.
  1233  */
  1234 void check_for_spaced_emdash(const char *aline)
  1235 {
  1236     const char *s,*t,*next;
  1237     for (s=aline;t=strstr(s,"--");s=next)
  1238     {
  1239 	next=g_utf8_next_char(g_utf8_next_char(t));
  1240 	if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
  1241 	  g_utf8_get_char(next)==CHAR_SPACE)
  1242 	{
  1243 	    if (pswit[ECHO_SWITCH])
  1244 		g_print("\n%s\n",aline);
  1245 	    if (!pswit[OVERVIEW_SWITCH])
  1246 		g_print("    Line %ld column %ld - Spaced em-dash?\n",
  1247 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  1248 	    else
  1249 		cnt_dash++;
  1250 	}
  1251     }
  1252 }
  1253 
  1254 /*
  1255  * check_for_spaced_dash:
  1256  *
  1257  * Check for spaced dashes.
  1258  */
  1259 void check_for_spaced_dash(const char *aline)
  1260 {
  1261     const char *s;
  1262     if ((s=strstr(aline," -")))
  1263     {
  1264 	if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
  1265 	{
  1266 	    if (pswit[ECHO_SWITCH])
  1267 		g_print("\n%s\n",aline);
  1268 	    if (!pswit[OVERVIEW_SWITCH])
  1269 		g_print("    Line %ld column %ld - Spaced dash?\n",
  1270 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1271 	    else
  1272 		cnt_dash++;
  1273 	}
  1274     }
  1275     else if ((s=strstr(aline,"- ")))
  1276     {
  1277 	if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
  1278 	{
  1279 	    if (pswit[ECHO_SWITCH])
  1280 		g_print("\n%s\n",aline);
  1281 	    if (!pswit[OVERVIEW_SWITCH])
  1282 		g_print("    Line %ld column %ld - Spaced dash?\n",
  1283 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1284 	    else
  1285 		cnt_dash++;
  1286 	}
  1287     }
  1288 }
  1289 
  1290 /*
  1291  * check_for_unmarked_paragraphs:
  1292  *
  1293  * Check for unmarked paragraphs indicated by separate speakers.
  1294  *
  1295  * May well be false positive:
  1296  * "Bravo!" "Wonderful!" called the crowd.
  1297  * but useful all the same.
  1298  */
  1299 void check_for_unmarked_paragraphs(const char *aline)
  1300 {
  1301     const char *s;
  1302     s=strstr(aline,"\"  \"");
  1303     if (!s)
  1304 	s=strstr(aline,"\" \"");
  1305     if (s)
  1306     {
  1307 	if (pswit[ECHO_SWITCH])
  1308 	    g_print("\n%s\n",aline);
  1309 	if (!pswit[OVERVIEW_SWITCH])
  1310 	    g_print("    Line %ld column %ld - "
  1311 	      "Query missing paragraph break?\n",
  1312 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1313 	else
  1314 	    cnt_punct++;
  1315     }
  1316 }
  1317 
  1318 /*
  1319  * check_for_jeebies:
  1320  *
  1321  * Check for "to he" and other easy h/b errors.
  1322  *
  1323  * This is a very inadequate effort on the h/b problem,
  1324  * but the phrase "to he" is always an error, whereas "to
  1325  * be" is quite common.
  1326  * Similarly, '"Quiet!", be said.' is a non-be error
  1327  * "to he" is _not_ always an error!:
  1328  *       "Where they went to he couldn't say."
  1329  * Another false positive:
  1330  *       What would "Cinderella" be without the . . .
  1331  * and another: "If he wants to he can see for himself."
  1332  */
  1333 void check_for_jeebies(const char *aline)
  1334 {
  1335     const char *s;
  1336     s=strstr(aline," be could ");
  1337     if (!s)
  1338 	s=strstr(aline," be would ");
  1339     if (!s)
  1340 	s=strstr(aline," was be ");
  1341     if (!s)
  1342 	s=strstr(aline," be is ");
  1343     if (!s)
  1344 	s=strstr(aline," is be ");
  1345     if (!s)
  1346 	s=strstr(aline,"\", be ");
  1347     if (!s)
  1348 	s=strstr(aline,"\" be ");
  1349     if (!s)
  1350 	s=strstr(aline,"\" be ");
  1351     if (!s)
  1352 	s=strstr(aline," to he ");
  1353     if (s)
  1354     {
  1355 	if (pswit[ECHO_SWITCH])
  1356 	    g_print("\n%s\n",aline);
  1357 	if (!pswit[OVERVIEW_SWITCH])
  1358 	    g_print("    Line %ld column %ld - Query he/be error?\n",
  1359 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1360 	else
  1361 	    cnt_word++;
  1362     }
  1363     s=strstr(aline," the had ");
  1364     if (!s)
  1365 	s=strstr(aline," a had ");
  1366     if (!s)
  1367 	s=strstr(aline," they bad ");
  1368     if (!s)
  1369 	s=strstr(aline," she bad ");
  1370     if (!s)
  1371 	s=strstr(aline," he bad ");
  1372     if (!s)
  1373 	s=strstr(aline," you bad ");
  1374     if (!s)
  1375 	s=strstr(aline," i bad ");
  1376     if (s)
  1377     {
  1378 	if (pswit[ECHO_SWITCH])
  1379 	    g_print("\n%s\n",aline);
  1380 	if (!pswit[OVERVIEW_SWITCH])
  1381 	    g_print("    Line %ld column %ld - Query had/bad error?\n",
  1382 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1383 	else
  1384 	    cnt_word++;
  1385     }
  1386     s=strstr(aline,"; hut ");
  1387     if (!s)
  1388 	s=strstr(aline,", hut ");
  1389     if (s)
  1390     {
  1391 	if (pswit[ECHO_SWITCH])
  1392 	    g_print("\n%s\n",aline);
  1393 	if (!pswit[OVERVIEW_SWITCH])
  1394 	    g_print("    Line %ld column %ld - Query hut/but error?\n",
  1395 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1396 	else
  1397 	    cnt_word++;
  1398     }
  1399 }
  1400 
  1401 /*
  1402  * check_for_mta_from:
  1403  *
  1404  * Special case - angled bracket in front of "From" placed there by an
  1405  * MTA when sending an e-mail.
  1406  */
  1407 void check_for_mta_from(const char *aline)
  1408 {
  1409     const char *s;
  1410     s=strstr(aline,">From");
  1411     if (s)
  1412     {
  1413 	if (pswit[ECHO_SWITCH])
  1414 	    g_print("\n%s\n",aline);
  1415 	if (!pswit[OVERVIEW_SWITCH])
  1416 	    g_print("    Line %ld column %ld - "
  1417 	      "Query angled bracket with From\n",
  1418 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1419 	else
  1420 	    cnt_punct++;
  1421     }
  1422 }
  1423 
  1424 /*
  1425  * check_for_orphan_character:
  1426  *
  1427  * Check for a single character line -
  1428  * often an overflow from bad wrapping.
  1429  */
  1430 void check_for_orphan_character(const char *aline)
  1431 {
  1432     gunichar c;
  1433     c=g_utf8_get_char(aline);
  1434     if (c && !*g_utf8_next_char(aline))
  1435     {
  1436 	if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
  1437 	    ; /* Nothing - ignore numerals alone on a line. */
  1438 	else
  1439 	{
  1440 	    if (pswit[ECHO_SWITCH])
  1441 		g_print("\n%s\n",aline);
  1442 	    if (!pswit[OVERVIEW_SWITCH])
  1443 		g_print("    Line %ld column 1 - Query single character line\n",
  1444 		  linecnt);
  1445 	    else
  1446 		cnt_punct++;
  1447 	}
  1448     }
  1449 }
  1450 
  1451 /*
  1452  * check_for_pling_scanno:
  1453  *
  1454  * Check for I" - often should be !
  1455  */
  1456 void check_for_pling_scanno(const char *aline)
  1457 {
  1458     const char *s;
  1459     s=strstr(aline," I\"");
  1460     if (s)
  1461     {
  1462 	if (pswit[ECHO_SWITCH])
  1463 	    g_print("\n%s\n",aline);
  1464 	if (!pswit[OVERVIEW_SWITCH])
  1465 	    g_print("    Line %ld column %ld - Query I=exclamation mark?\n",
  1466 	      linecnt,g_utf8_pointer_to_offset(aline,s));
  1467 	else
  1468 	    cnt_punct++;
  1469     }
  1470 }
  1471 
  1472 /*
  1473  * check_for_extra_period:
  1474  *
  1475  * Check for period without a capital letter. Cut-down from gutspell.
  1476  * Only works when it happens on a single line.
  1477  */
  1478 void check_for_extra_period(const char *aline,const struct warnings *warnings)
  1479 {
  1480     const char *s,*t,*s1,*sprev;
  1481     int i;
  1482     gsize len;
  1483     gboolean istypo;
  1484     gchar *testword;
  1485     gunichar c,nc,pc,*decomposition;
  1486     if (pswit[PARANOID_SWITCH])
  1487     {
  1488 	for (t=aline;t=strstr(t,". ");)
  1489 	{
  1490 	    if (t==aline)
  1491 	    {
  1492 		t=g_utf8_next_char(t);
  1493 		/* start of line punctuation is handled elsewhere */
  1494 		continue;
  1495 	    }
  1496 	    if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
  1497 	    {
  1498 		t=g_utf8_next_char(t);
  1499 		continue;
  1500 	    }
  1501 	    if (warnings->isDutch)
  1502 	    {
  1503 		/* For Frank & Jeroen -- 's Middags case */
  1504 		gunichar c2,c3,c4,c5;
  1505 		c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
  1506 		c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
  1507 		c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
  1508 		c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
  1509 		if (CHAR_IS_APOSTROPHE(c2) &&
  1510 		  g_unichar_islower(c3) && c4==CHAR_SPACE &&
  1511 		  g_unichar_isupper(c5))
  1512 		{
  1513 		    t=g_utf8_next_char(t);
  1514 		    continue;
  1515 		}
  1516 	    }
  1517 	    s1=g_utf8_next_char(g_utf8_next_char(t));
  1518 	    while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
  1519 	      !isdigit(g_utf8_get_char(s1)))
  1520 		s1=g_utf8_next_char(s1);
  1521 	    if (g_unichar_islower(g_utf8_get_char(s1)))
  1522 	    {
  1523 		/* we have something to investigate */
  1524 		istypo=TRUE;
  1525 		/* so let's go back and find out */
  1526 		nc=g_utf8_get_char(t);
  1527 		s1=g_utf8_prev_char(t);
  1528 		c=g_utf8_get_char(s1);
  1529 		sprev=g_utf8_prev_char(s1);
  1530 		pc=g_utf8_get_char(sprev);
  1531 		while (s1>=aline &&
  1532 		  (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||
  1533 		  g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&
  1534 		  g_unichar_isalpha(nc)))
  1535 		{
  1536 		    nc=c;
  1537 		    s1=sprev;
  1538 		    c=pc;
  1539 		    sprev=g_utf8_prev_char(s1);
  1540 		    pc=g_utf8_get_char(sprev);
  1541 		}
  1542 		s1=g_utf8_next_char(s1);
  1543 		s=strchr(s1,'.');
  1544 		if (s)
  1545 		    testword=g_strndup(s1,s-s1);
  1546 		else
  1547 		    testword=g_strdup(s1);
  1548 		for (i=0;*abbrev[i];i++)
  1549 		    if (!strcmp(testword,abbrev[i]))
  1550 			istypo=FALSE;
  1551 		if (g_unichar_isdigit(g_utf8_get_char(testword)))
  1552 		    istypo=FALSE;
  1553 		if (!*g_utf8_next_char(testword))
  1554 		    istypo=FALSE;
  1555 		if (isroman(testword))
  1556 		    istypo=FALSE;
  1557 		if (istypo)
  1558 		{
  1559 		    istypo=FALSE;
  1560 		    for (s=testword;*s;s=g_utf8_next_char(s))
  1561 		    {
  1562 			decomposition=g_unicode_canonical_decomposition(
  1563 			  g_utf8_get_char(s),&len);
  1564 			if (g_utf8_strchr("aeiou",-1,decomposition[0]))
  1565 			    istypo=TRUE;
  1566 			g_free(decomposition);
  1567 		    }
  1568 		}
  1569 		if (istypo &&
  1570 		  (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
  1571 		{
  1572 		    g_tree_insert(qperiod,g_strdup(testword),
  1573 		      GINT_TO_POINTER(1));
  1574 		    if (pswit[ECHO_SWITCH])
  1575 			g_print("\n%s\n",aline);
  1576 		    if (!pswit[OVERVIEW_SWITCH])
  1577 			g_print("    Line %ld column %ld - Extra period?\n",
  1578 			  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  1579 		    else
  1580 			cnt_punct++;
  1581 		}
  1582 		g_free(testword);
  1583 	    }
  1584 	    t=g_utf8_next_char(t);
  1585 	}
  1586     }
  1587 }
  1588 
  1589 /*
  1590  * check_for_following_punctuation:
  1591  *
  1592  * Check for words usually not followed by punctuation.
  1593  */
  1594 void check_for_following_punctuation(const char *aline)
  1595 {
  1596     int i;
  1597     const char *s,*wordstart;
  1598     gunichar c;
  1599     gchar *inword,*t;
  1600     if (pswit[TYPO_SWITCH])
  1601     {
  1602 	for (s=aline;*s;)
  1603 	{
  1604 	    wordstart=s;
  1605 	    t=getaword(&s);
  1606 	    if (!*t)
  1607 	    {
  1608 		g_free(t);
  1609 		continue;
  1610 	    }
  1611 	    inword=g_utf8_strdown(t,-1);
  1612 	    g_free(t);
  1613 	    for (i=0;*nocomma[i];i++)
  1614 		if (!strcmp(inword,nocomma[i]))
  1615 		{
  1616 		    c=g_utf8_get_char(s);
  1617 		    if (c==',' || c==';' || c==':')
  1618 		    {
  1619 			if (pswit[ECHO_SWITCH])
  1620 			    g_print("\n%s\n",aline);
  1621 			if (!pswit[OVERVIEW_SWITCH])
  1622 			    g_print("    Line %ld column %ld - "
  1623 			      "Query punctuation after %s?\n",
  1624 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,
  1625 			      inword);
  1626 			else
  1627 			    cnt_punct++;
  1628 		    }
  1629 		}
  1630 	    for (i=0;*noperiod[i];i++)
  1631 		if (!strcmp(inword,noperiod[i]))
  1632 		{
  1633 		    c=g_utf8_get_char(s);
  1634 		    if (c=='.' || c=='!')
  1635 		    {
  1636 			if (pswit[ECHO_SWITCH])
  1637 			    g_print("\n%s\n",aline);
  1638 			if (!pswit[OVERVIEW_SWITCH])
  1639 			    g_print("    Line %ld column %ld - "
  1640 			      "Query punctuation after %s?\n",
  1641 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,
  1642 			      inword);
  1643 			else
  1644 			    cnt_punct++;
  1645 		    }
  1646 		}
  1647 	    g_free(inword);
  1648 	}
  1649     }
  1650 }
  1651 
  1652 /*
  1653  * check_for_typos:
  1654  *
  1655  * Check for commonly mistyped words,
  1656  * and digits like 0 for O in a word.
  1657  */
  1658 void check_for_typos(const char *aline,struct warnings *warnings)
  1659 {
  1660     const char *s,*t,*nt,*wordstart;
  1661     gchar *inword;
  1662     gunichar *decomposition;
  1663     gchar *testword;
  1664     int i,vowel,consonant,*dupcnt;
  1665     gboolean isdup,istypo,alower;
  1666     gunichar c,pc;
  1667     long offset,len;
  1668     gsize decomposition_len;
  1669     for (s=aline;*s;)
  1670     {
  1671 	wordstart=s;
  1672 	inword=getaword(&s);
  1673 	if (!*inword)
  1674 	{
  1675 	    g_free(inword);
  1676 	    continue; /* don't bother with empty lines */
  1677 	}
  1678 	if (mixdigit(inword))
  1679 	{
  1680 	    if (pswit[ECHO_SWITCH])
  1681 		g_print("\n%s\n",aline);
  1682 	    if (!pswit[OVERVIEW_SWITCH])
  1683 		g_print("    Line %ld column %ld - Query digit in %s\n",
  1684 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
  1685 	    else
  1686 		cnt_word++;
  1687 	}
  1688 	/*
  1689 	 * Put the word through a series of tests for likely typos and OCR
  1690 	 * errors.
  1691 	 */
  1692 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
  1693 	{
  1694 	    istypo=FALSE;
  1695 	    alower=FALSE;
  1696 	    for (t=inword;*t;t=g_utf8_next_char(t))
  1697 	    {
  1698 		c=g_utf8_get_char(t);
  1699 		nt=g_utf8_next_char(t);
  1700 		/* lowercase for testing */
  1701 		if (g_unichar_islower(c))
  1702 		    alower=TRUE;
  1703 		if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
  1704 		{
  1705 		    /*
  1706 		     * We have an uppercase mid-word. However, there are
  1707 		     * common cases:
  1708 		     *   Mac and Mc like McGill
  1709 		     *   French contractions like l'Abbe
  1710 		     */
  1711 		    offset=g_utf8_pointer_to_offset(inword,t);
  1712 		    if (offset>0)
  1713 			pc=g_utf8_get_char(g_utf8_prev_char(t));
  1714 		    else
  1715 			pc='\0';
  1716 		    if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
  1717 		      offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
  1718 		      g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
  1719 		      CHAR_IS_APOSTROPHE(pc))
  1720 			; /* do nothing! */
  1721 		    else
  1722 			istypo=TRUE;
  1723 		}
  1724 	    }
  1725 	    testword=g_utf8_casefold(inword,-1);
  1726 	}
  1727 	if (pswit[TYPO_SWITCH])
  1728 	{
  1729 	    /*
  1730 	     * Check for certain unlikely two-letter combinations at word
  1731 	     * start and end.
  1732 	     */
  1733 	    len=g_utf8_strlen(testword,-1);
  1734 	    if (len>1)
  1735 	    {
  1736 		for (i=0;*nostart[i];i++)
  1737 		    if (g_str_has_prefix(testword,nostart[i]))
  1738 			istypo=TRUE;
  1739 		for (i=0;*noend[i];i++)
  1740 		    if (g_str_has_suffix(testword,noend[i]))
  1741 			istypo=TRUE;
  1742 	    }
  1743 	    /* ght is common, gbt never. Like that. */
  1744 	    if (strstr(testword,"cb"))
  1745 		istypo=TRUE;
  1746 	    if (strstr(testword,"gbt"))
  1747 		istypo=TRUE;
  1748 	    if (strstr(testword,"pbt"))
  1749 		istypo=TRUE;
  1750 	    if (strstr(testword,"tbs"))
  1751 		istypo=TRUE;
  1752 	    if (strstr(testword,"mrn"))
  1753 		istypo=TRUE;
  1754 	    if (strstr(testword,"ahle"))
  1755 		istypo=TRUE;
  1756 	    if (strstr(testword,"ihle"))
  1757 		istypo=TRUE;
  1758 	    /*
  1759 	     * "TBE" does happen - like HEARTBEAT - but uncommon.
  1760 	     * Also "TBI" - frostbite, outbid - but uncommon.
  1761 	     * Similarly "ii" like Hawaii, or Pompeii, and in Roman
  1762 	     * numerals, but "ii" is a common scanno.
  1763 	     */
  1764 	    if (strstr(testword,"tbi"))
  1765 		istypo=TRUE;
  1766 	    if (strstr(testword,"tbe"))
  1767 		istypo=TRUE;
  1768 	    if (strstr(testword,"ii"))
  1769 		istypo=TRUE;
  1770 	    /*
  1771 	     * Check for no vowels or no consonants.
  1772 	     * If none, flag a typo.
  1773 	     */
  1774 	    if (!istypo && len>1)
  1775 	    {
  1776 		vowel=consonant=0;
  1777 		for (t=testword;*t;t=g_utf8_next_char(t))
  1778 		{
  1779 		    c=g_utf8_get_char(t);
  1780 		    decomposition=
  1781 		      g_unicode_canonical_decomposition(c,&decomposition_len);
  1782 		    if (c=='y' || g_unichar_isdigit(c))
  1783 		    {
  1784 			/* Yah, this is loose. */
  1785 			vowel++;
  1786 			consonant++;
  1787 		    }
  1788 		    else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
  1789 			vowel++;
  1790 		    else
  1791 			consonant++;
  1792 		    g_free(decomposition);
  1793 		}
  1794 		if (!vowel || !consonant)
  1795 		    istypo=TRUE;
  1796 	    }
  1797 	    /*
  1798 	     * Now exclude the word from being reported if it's in
  1799 	     * the okword list.
  1800 	     */
  1801 	    for (i=0;*okword[i];i++)
  1802 		if (!strcmp(testword,okword[i]))
  1803 		    istypo=FALSE;
  1804 	    /*
  1805 	     * What looks like a typo may be a Roman numeral.
  1806 	     * Exclude these.
  1807 	     */
  1808 	    if (istypo && isroman(testword))
  1809 		istypo=FALSE;
  1810 	    /* Check the manual list of typos. */
  1811 	    if (!istypo)
  1812 		for (i=0;*typo[i];i++)
  1813 		    if (!strcmp(testword,typo[i]))
  1814 			istypo=TRUE;
  1815 	    /*
  1816 	     * Check lowercase s, l, i and m - special cases.
  1817 	     *   "j" - often a semi-colon gone wrong.
  1818 	     *   "d" for a missing apostrophe - he d
  1819 	     *   "n" for "in"
  1820 	     */
  1821 	    if (!istypo && len==1 &&
  1822 	      g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
  1823 		istypo=TRUE;
  1824 	    if (istypo)
  1825 	    {
  1826 		dupcnt=g_tree_lookup(qword,testword);
  1827 		if (dupcnt)
  1828 		{
  1829 		    (*dupcnt)++;
  1830 		    isdup=!pswit[VERBOSE_SWITCH];
  1831 		}
  1832 		else
  1833 		{
  1834 		    dupcnt=g_new0(int,1);
  1835 		    g_tree_insert(qword,g_strdup(testword),dupcnt);
  1836 		    isdup=FALSE;
  1837 		}
  1838 		if (!isdup)
  1839 		{
  1840 		    if (pswit[ECHO_SWITCH])
  1841 			g_print("\n%s\n",aline);
  1842 		    if (!pswit[OVERVIEW_SWITCH])
  1843 		    {
  1844 			g_print("    Line %ld column %ld - Query word %s",
  1845 			  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
  1846 			  inword);
  1847 			if (!pswit[VERBOSE_SWITCH])
  1848 			    g_print(" - not reporting duplicates");
  1849 			g_print("\n");
  1850 		    }
  1851 		    else
  1852 			cnt_word++;
  1853 		}
  1854 	    }
  1855 	}
  1856 	/* check the user's list of typos */
  1857 	if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
  1858 	{
  1859 	    if (pswit[ECHO_SWITCH])
  1860 		g_print("\n%s\n",aline);
  1861 	    if (!pswit[OVERVIEW_SWITCH])  
  1862 		g_print("    Line %ld column %ld - Query possible scanno %s\n",
  1863 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
  1864 	}
  1865 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
  1866 	    g_free(testword);
  1867 	if (pswit[PARANOID_SWITCH] && warnings->digit)
  1868 	{
  1869 	    /* In paranoid mode, query all 0 and 1 standing alone. */
  1870 	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))
  1871 	    {
  1872 		if (pswit[ECHO_SWITCH])
  1873 		    g_print("\n%s\n",aline);
  1874 		if (!pswit[OVERVIEW_SWITCH])
  1875 		    g_print("    Line %ld column %ld - Query standalone %s\n",
  1876 		      linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
  1877 		      inword);
  1878 		else
  1879 		    cnt_word++;
  1880 	    }
  1881 	}
  1882 	g_free(inword);
  1883     }
  1884 }
  1885 
  1886 /*
  1887  * check_for_misspaced_punctuation:
  1888  *
  1889  * Look for added or missing spaces around punctuation and quotes.
  1890  * If there is a punctuation character like ! with no space on
  1891  * either side, suspect a missing!space. If there are spaces on
  1892  * both sides , assume a typo. If we see a double quote with no
  1893  * space or punctuation on either side of it, assume unspaced
  1894  * quotes "like"this.
  1895  */
  1896 void check_for_misspaced_punctuation(const char *aline,
  1897   struct parities *parities,gboolean isemptyline)
  1898 {
  1899     gboolean isacro,isellipsis;
  1900     const char *s;
  1901     gunichar c,nc,pc,n2c;
  1902     c=g_utf8_get_char(aline);
  1903     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  1904     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  1905     {
  1906 	pc=c;
  1907 	c=nc;
  1908 	nc=g_utf8_get_char(g_utf8_next_char(s));
  1909 	/* For each character in the line after the first. */
  1910 	if (g_utf8_strchr(".?!,;:_",-1,c))  /* if it's punctuation */
  1911 	{
  1912 	    /* we need to suppress warnings for acronyms like M.D. */
  1913 	    isacro=FALSE;
  1914 	    /* we need to suppress warnings for ellipsis . . . */
  1915 	    isellipsis=FALSE;
  1916 	    /*
  1917 	     * If there are letters on both sides of it or
  1918 	     * if it's strict punctuation followed by an alpha.
  1919 	     */
  1920 	    if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
  1921 	      g_utf8_strchr("?!,;:",-1,c)))
  1922 	    {
  1923 		if (c=='.')
  1924 		{
  1925 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&
  1926 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
  1927 			isacro=TRUE;
  1928 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
  1929 		    if (nc && n2c=='.')
  1930 			isacro=TRUE;
  1931 		}
  1932 		if (!isacro)
  1933 		{
  1934 		    if (pswit[ECHO_SWITCH])
  1935 			g_print("\n%s\n",aline);
  1936 		    if (!pswit[OVERVIEW_SWITCH])
  1937 			g_print("    Line %ld column %ld - Missing space?\n",
  1938 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1939 		    else
  1940 			cnt_punct++;
  1941 		}
  1942 	    }
  1943 	    if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
  1944 	    {
  1945 		/*
  1946 		 * If there are spaces on both sides,
  1947 		 * or space before and end of line.
  1948 		 */
  1949 		if (c=='.')
  1950 		{
  1951 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&
  1952 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
  1953 			isellipsis=TRUE;
  1954 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
  1955 		    if (nc && n2c=='.')
  1956 			isellipsis=TRUE;
  1957 		}
  1958 		if (!isemptyline && !isellipsis)
  1959 		{
  1960 		    if (pswit[ECHO_SWITCH])
  1961 			g_print("\n%s\n",aline);
  1962 		    if (!pswit[OVERVIEW_SWITCH])
  1963 			g_print("    Line %ld column %ld - "
  1964 			  "Spaced punctuation?\n",linecnt,
  1965 			  g_utf8_pointer_to_offset(aline,s)+1);
  1966 		    else
  1967 			cnt_punct++;
  1968 		}
  1969 	    }
  1970 	}
  1971     }
  1972     /* Split out the characters that CANNOT be preceded by space. */
  1973     c=g_utf8_get_char(aline);
  1974     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  1975     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  1976     {
  1977 	pc=c;
  1978 	c=nc;
  1979 	nc=g_utf8_get_char(g_utf8_next_char(s));
  1980 	/* for each character in the line after the first */
  1981 	if (g_utf8_strchr("?!,;:",-1,c))
  1982 	{
  1983 	    /* if it's punctuation that _cannot_ have a space before it */
  1984 	    if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
  1985 	    {
  1986 		/*
  1987 		 * If nc DOES == space,
  1988 		 * it was already reported just above.
  1989 		 */
  1990 		if (pswit[ECHO_SWITCH])
  1991 		    g_print("\n%s\n",aline);
  1992 		if (!pswit[OVERVIEW_SWITCH])
  1993 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",
  1994 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1995 		else
  1996 		    cnt_punct++;
  1997 	    }
  1998 	}
  1999     }
  2000     /*
  2001      * Special case " .X" where X is any alpha.
  2002      * This plugs a hole in the acronym code above.
  2003      * Inelegant, but maintainable.
  2004      */
  2005     c=g_utf8_get_char(aline);
  2006     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2007     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2008     {
  2009 	pc=c;
  2010 	c=nc;
  2011 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2012 	/* for each character in the line after the first */
  2013 	if (c=='.')
  2014 	{
  2015 	    /* if it's a period */
  2016 	    if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
  2017 	    {
  2018 		/*
  2019 		 * If the period follows a space and
  2020 		 * is followed by a letter.
  2021 		 */
  2022 		if (pswit[ECHO_SWITCH])
  2023 		    g_print("\n%s\n",aline);
  2024 		if (!pswit[OVERVIEW_SWITCH])
  2025 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",
  2026 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2027 		else
  2028 		    cnt_punct++;
  2029 	    }
  2030 	}
  2031     }
  2032     c=g_utf8_get_char(aline);
  2033     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2034     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2035     {
  2036 	pc=c;
  2037 	c=nc;
  2038 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2039 	/* for each character in the line after the first */
  2040 	if (c==CHAR_DQUOTE)
  2041 	{
  2042 	    if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
  2043 	      !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
  2044 	      !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
  2045 	    {
  2046 		if (pswit[ECHO_SWITCH])
  2047 		    g_print("\n%s\n",aline);
  2048 		if (!pswit[OVERVIEW_SWITCH])
  2049 		    g_print("    Line %ld column %ld - Unspaced quotes?\n",
  2050 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2051 		else
  2052 		    cnt_punct++;
  2053 	    }
  2054 	}
  2055     }
  2056     /* Check parity of quotes. */
  2057     nc=g_utf8_get_char(aline);
  2058     for (s=aline;*s;s=g_utf8_next_char(s))
  2059     {
  2060 	c=nc;
  2061 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2062 	if (c==CHAR_DQUOTE)
  2063 	{
  2064 	    parities->dquote=!parities->dquote;
  2065 	    if (!parities->dquote)
  2066 	    {
  2067 		/* parity even */
  2068 		if (!g_utf8_strchr("_-.'`/,;:!?)]} ",-1,nc))
  2069 		{
  2070 		    if (pswit[ECHO_SWITCH])
  2071 			g_print("\n%s\n",aline);
  2072 		    if (!pswit[OVERVIEW_SWITCH])
  2073 			g_print("    Line %ld column %ld - "
  2074 			  "Wrongspaced quotes?\n",
  2075 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2076 		    else
  2077 			cnt_punct++;
  2078 		}
  2079 	    }
  2080 	    else
  2081 	    {
  2082 		/* parity odd */
  2083 		if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
  2084 		  !g_utf8_strchr("_-/.'`([{$",-1,nc) || !nc)
  2085 		{
  2086 		    if (pswit[ECHO_SWITCH])
  2087 			g_print("\n%s\n",aline);
  2088 		    if (!pswit[OVERVIEW_SWITCH])
  2089 			g_print("    Line %ld column %ld - "
  2090 			  "Wrongspaced quotes?\n",
  2091 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2092 		    else
  2093 			cnt_punct++;
  2094 		}
  2095 	    }
  2096 	}
  2097     }
  2098     if (g_utf8_get_char(aline)==CHAR_DQUOTE)
  2099     {
  2100 	if (g_utf8_strchr(",;:!?)]} ",-1,
  2101 	  g_utf8_get_char(g_utf8_next_char(aline))))
  2102 	{
  2103 	    if (pswit[ECHO_SWITCH])
  2104 		g_print("\n%s\n",aline);
  2105 	    if (!pswit[OVERVIEW_SWITCH])
  2106 		g_print("    Line %ld column 1 - Wrongspaced quotes?\n",
  2107 		  linecnt);
  2108 	    else
  2109 		cnt_punct++;
  2110 	}
  2111     }
  2112     if (pswit[SQUOTE_SWITCH])
  2113     {
  2114 	nc=g_utf8_get_char(aline);
  2115 	for (s=aline;*s;s=g_utf8_next_char(s))
  2116 	{
  2117 	    c=nc;
  2118 	    nc=g_utf8_get_char(g_utf8_next_char(s));
  2119 	    if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&
  2120 	      !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
  2121 	      !g_unichar_isalpha(nc)))
  2122 	    {
  2123 		parities->squote=!parities->squote;
  2124 		if (!parities->squote)
  2125 		{
  2126 		    /* parity even */
  2127 		    if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
  2128 		    {
  2129 			if (pswit[ECHO_SWITCH])
  2130 			    g_print("\n%s\n",aline);
  2131 			if (!pswit[OVERVIEW_SWITCH])
  2132 			    g_print("    Line %ld column %ld - "
  2133 			      "Wrongspaced singlequotes?\n",
  2134 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2135 			else
  2136 			    cnt_punct++;
  2137 		    }
  2138 		}
  2139 		else
  2140 		{
  2141 		    /* parity odd */
  2142 		    if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
  2143 		      !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
  2144 		    {
  2145 			if (pswit[ECHO_SWITCH])
  2146 			    g_print("\n%s\n",aline);
  2147 			if (!pswit[OVERVIEW_SWITCH])
  2148 			    g_print("    Line %ld column %ld - "
  2149 			      "Wrongspaced singlequotes?\n",
  2150 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2151 			else
  2152 			    cnt_punct++;
  2153 		    }
  2154 		}
  2155 	    }
  2156 	}
  2157     }
  2158 }
  2159 
  2160 /*
  2161  * check_for_double_punctuation:
  2162  *
  2163  * Look for double punctuation like ,. or ,,
  2164  * Thanks to DW for the suggestion!
  2165  * In books with references, ".," and ".;" are common
  2166  * e.g. "etc., etc.," and vol. 1.; vol 3.;
  2167  * OTOH, from my initial tests, there are also fairly
  2168  * common errors. What to do? Make these cases paranoid?
  2169  * ".," is the most common, so warnings->dotcomma is used
  2170  * to suppress detailed reporting if it occurs often.
  2171  */
  2172 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
  2173 {
  2174     const char *s;
  2175     gunichar c,nc;
  2176     nc=g_utf8_get_char(aline);
  2177     for (s=aline;*s;s=g_utf8_next_char(s))
  2178     {
  2179 	c=nc;
  2180 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2181 	/* for each punctuation character in the line */
  2182 	if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
  2183 	  g_utf8_strchr(".?!,;:",-1,nc))
  2184 	{
  2185 	    /* followed by punctuation, it's a query, unless . . . */
  2186 	    if (c==nc && (c=='.' || c=='?' || c=='!') ||
  2187 	      !warnings->dotcomma && c=='.' && nc==',' ||
  2188 	      warnings->isFrench && g_str_has_prefix(s,",...") ||
  2189 	      warnings->isFrench && g_str_has_prefix(s,"...,") ||
  2190 	      warnings->isFrench && g_str_has_prefix(s,";...") ||
  2191 	      warnings->isFrench && g_str_has_prefix(s,"...;") ||
  2192 	      warnings->isFrench && g_str_has_prefix(s,":...") ||
  2193 	      warnings->isFrench && g_str_has_prefix(s,"...:") ||
  2194 	      warnings->isFrench && g_str_has_prefix(s,"!...") ||
  2195 	      warnings->isFrench && g_str_has_prefix(s,"...!") ||
  2196 	      warnings->isFrench && g_str_has_prefix(s,"?...") ||
  2197 	      warnings->isFrench && g_str_has_prefix(s,"...?"))
  2198 	    {
  2199 		if (warnings->isFrench && g_str_has_prefix(s,",...") ||
  2200 		  warnings->isFrench && g_str_has_prefix(s,"...,") ||
  2201 		  warnings->isFrench && g_str_has_prefix(s,";...") ||
  2202 		  warnings->isFrench && g_str_has_prefix(s,"...;") ||
  2203 		  warnings->isFrench && g_str_has_prefix(s,":...") ||
  2204 		  warnings->isFrench && g_str_has_prefix(s,"...:") ||
  2205 		  warnings->isFrench && g_str_has_prefix(s,"!...") ||
  2206 		  warnings->isFrench && g_str_has_prefix(s,"...!") ||
  2207 		  warnings->isFrench && g_str_has_prefix(s,"?...") ||
  2208 		  warnings->isFrench && g_str_has_prefix(s,"...?"))
  2209 		{
  2210 		    s+=4;
  2211 		    nc=g_utf8_get_char(g_utf8_next_char(s));
  2212 		}
  2213 		; /* do nothing for .. !! and ?? which can be legit */
  2214 	    }
  2215 	    else
  2216 	    {
  2217 		if (pswit[ECHO_SWITCH])
  2218 		    g_print("\n%s\n",aline);
  2219 		if (!pswit[OVERVIEW_SWITCH])
  2220 		    g_print("    Line %ld column %ld - Double punctuation?\n",
  2221 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2222 		else
  2223 		    cnt_punct++;
  2224 	    }
  2225 	}
  2226     }
  2227 }
  2228 
  2229 /*
  2230  * check_for_spaced_quotes:
  2231  */
  2232 void check_for_spaced_quotes(const char *aline)
  2233 {
  2234     int i;
  2235     const char *s,*t;
  2236     const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,
  2237       CHAR_RS_QUOTE};
  2238     GString *pattern;
  2239     s=aline;
  2240     while ((t=strstr(s," \" ")))
  2241     {
  2242 	if (pswit[ECHO_SWITCH])
  2243 	    g_print("\n%s\n",aline);
  2244 	if (!pswit[OVERVIEW_SWITCH])
  2245 	    g_print("    Line %ld column %ld - Spaced doublequote?\n",
  2246 	      linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  2247 	else
  2248 	    cnt_punct++;
  2249 	s=g_utf8_next_char(g_utf8_next_char(t));
  2250     }
  2251     pattern=g_string_new(NULL);
  2252     for(i=0;i<G_N_ELEMENTS(single_quotes);i++)
  2253     {
  2254 	g_string_assign(pattern," ");
  2255 	g_string_append_unichar(pattern,single_quotes[i]);
  2256 	g_string_append_c(pattern,' ');
  2257 	s=aline;
  2258 	while ((t=strstr(s,pattern->str)))
  2259 	{
  2260 	    if (pswit[ECHO_SWITCH])
  2261 		g_print("\n%s\n",aline);
  2262 	    if (!pswit[OVERVIEW_SWITCH])
  2263 		g_print("    Line %ld column %ld - Spaced singlequote?\n",
  2264 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  2265 	    else
  2266 		cnt_punct++;
  2267 	    s=g_utf8_next_char(g_utf8_next_char(t));
  2268 	}
  2269     }
  2270     g_string_free(pattern,TRUE);
  2271 }
  2272 
  2273 /*
  2274  * check_for_miscased_genative:
  2275  *
  2276  * Check special case of 'S instead of 's at end of word.
  2277  */
  2278 void check_for_miscased_genative(const char *aline)
  2279 {
  2280     const char *s;
  2281     gunichar c,nc,pc;
  2282     if (!*aline)
  2283 	return;
  2284     c=g_utf8_get_char(aline);
  2285     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2286     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2287     {
  2288 	pc=c;
  2289 	c=nc;
  2290 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2291 	if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))
  2292 	{
  2293 	    if (pswit[ECHO_SWITCH])
  2294 		g_print("\n%s\n",aline);
  2295 	    if (!pswit[OVERVIEW_SWITCH])
  2296 		g_print("    Line %ld column %ld - Capital \"S\"?\n",
  2297 		  linecnt,g_utf8_pointer_to_offset(aline,s)+2);
  2298 	    else
  2299 		cnt_punct++;
  2300 	}
  2301     }
  2302 }
  2303 
  2304 /*
  2305  * check_end_of_line:
  2306  *
  2307  * Now check special cases - start and end of line -
  2308  * for single and double quotes. Start is sometimes [sic]
  2309  * but better to query it anyway.
  2310  * While we're here, check for dash at end of line.
  2311  */
  2312 void check_end_of_line(const char *aline,struct warnings *warnings)
  2313 {
  2314     int lbytes;
  2315     const char *s;
  2316     gunichar c1,c2;
  2317     lbytes=strlen(aline);
  2318     if (g_utf8_strlen(aline,lbytes)>1)
  2319     {
  2320 	s=g_utf8_prev_char(aline+lbytes);
  2321 	c1=g_utf8_get_char(s);
  2322 	c2=g_utf8_get_char(g_utf8_prev_char(s));
  2323 	if ((c1==CHAR_DQUOTE || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)
  2324 	{
  2325 	    if (pswit[ECHO_SWITCH])
  2326 		g_print("\n%s\n",aline);
  2327 	    if (!pswit[OVERVIEW_SWITCH])
  2328 		g_print("    Line %ld column %ld - Spaced quote?\n",linecnt,
  2329 		  g_utf8_strlen(aline,lbytes));
  2330 	    else
  2331 		cnt_punct++;
  2332 	}
  2333 	c1=g_utf8_get_char(aline);
  2334 	c2=g_utf8_get_char(g_utf8_next_char(aline));
  2335 	if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)
  2336 	{
  2337 	    if (pswit[ECHO_SWITCH])
  2338 		g_print("\n%s\n",aline);
  2339 	    if (!pswit[OVERVIEW_SWITCH])
  2340 		g_print("    Line %ld column 1 - Spaced quote?\n",linecnt);
  2341 	    else
  2342 		cnt_punct++;
  2343 	}
  2344 	/*
  2345 	 * Dash at end of line may well be legit - paranoid mode only
  2346 	 * and don't report em-dash at line-end.
  2347 	 */
  2348 	if (pswit[PARANOID_SWITCH] && warnings->hyphen)
  2349 	{
  2350 	    for (s=g_utf8_prev_char(aline+lbytes);
  2351 	      s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
  2352 		;
  2353 	    if (g_utf8_get_char(s)=='-' &&
  2354 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')
  2355 	    {
  2356 		if (pswit[ECHO_SWITCH])
  2357 		    g_print("\n%s\n",aline);
  2358 		if (!pswit[OVERVIEW_SWITCH])
  2359 		    g_print("    Line %ld column %ld - "
  2360 		      "Hyphen at end of line?\n",
  2361 		      linecnt,g_utf8_pointer_to_offset(aline,s));
  2362 	    }
  2363 	}
  2364     }
  2365 }
  2366 
  2367 /*
  2368  * check_for_unspaced_bracket:
  2369  *
  2370  * Brackets are often unspaced, but shouldn't be surrounded by alpha.
  2371  * If so, suspect a scanno like "a]most".
  2372  */
  2373 void check_for_unspaced_bracket(const char *aline)
  2374 {
  2375     const char *s;
  2376     gunichar c,nc,pc;
  2377     c=g_utf8_get_char(aline);
  2378     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2379     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2380     {
  2381 	pc=c;
  2382 	c=nc;
  2383 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2384 	if (!nc)
  2385 	    break;
  2386 	/* for each bracket character in the line except 1st & last */
  2387 	if (g_utf8_strchr("{[()]}",-1,c) &&
  2388 	  g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
  2389 	{
  2390 	    if (pswit[ECHO_SWITCH])
  2391 		g_print("\n%s\n",aline);
  2392 	    if (!pswit[OVERVIEW_SWITCH])
  2393 		g_print("    Line %ld column %ld - Unspaced bracket?\n",
  2394 		  linecnt,g_utf8_pointer_to_offset(aline,s));
  2395 	    else
  2396 		cnt_punct++;
  2397 	}
  2398     }
  2399 }
  2400 
  2401 /*
  2402  * check_for_unpunctuated_endquote:
  2403  */
  2404 void check_for_unpunctuated_endquote(const char *aline)
  2405 {
  2406     const char *s;
  2407     gunichar c,nc,pc;
  2408     c=g_utf8_get_char(aline);
  2409     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2410     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2411     {
  2412 	pc=c;
  2413 	c=nc;
  2414 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2415 	/* for each character in the line except 1st */
  2416 	if (c==CHAR_DQUOTE && isalpha(pc))
  2417 	{
  2418 	    if (pswit[ECHO_SWITCH])
  2419 		g_print("\n%s\n",aline);
  2420 	    if (!pswit[OVERVIEW_SWITCH])
  2421 		g_print("    Line %ld column %ld - "
  2422 		  "endquote missing punctuation?\n",
  2423 		  linecnt,g_utf8_pointer_to_offset(aline,s));
  2424 	    else
  2425 		cnt_punct++;
  2426 	}
  2427     }
  2428 }
  2429 
  2430 /*
  2431  * check_for_html_tag:
  2432  *
  2433  * Check for <HTML TAG>.
  2434  *
  2435  * If there is a < in the line, followed at some point
  2436  * by a > then we suspect HTML.
  2437  */
  2438 void check_for_html_tag(const char *aline)
  2439 {
  2440     const char *open,*close;
  2441     gchar *tag;
  2442     open=strchr(aline,'<');
  2443     if (open)
  2444     {
  2445 	close=strchr(g_utf8_next_char(open),'>');
  2446 	if (close)
  2447 	{
  2448 	    if (pswit[ECHO_SWITCH])
  2449 		g_print("\n%s\n",aline);
  2450 	    if (!pswit[OVERVIEW_SWITCH])
  2451 	    {
  2452 		tag=g_strndup(open,close-open+1);
  2453 		g_print("    Line %ld column %ld - HTML Tag? %s \n",
  2454 		  linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
  2455 		g_free(tag);
  2456 	    }
  2457 	    else
  2458 		cnt_html++;
  2459 	}
  2460     }
  2461 }
  2462 
  2463 /*
  2464  * check_for_html_entity:
  2465  *
  2466  * Check for &symbol; HTML.
  2467  *
  2468  * If there is a & in the line, followed at
  2469  * some point by a ; then we suspect HTML.
  2470  */
  2471 void check_for_html_entity(const char *aline)
  2472 {
  2473     const char *s,*amp,*scolon;
  2474     gchar *entity;
  2475     amp=strchr(aline,'&');
  2476     if (amp)
  2477     {
  2478 	scolon=strchr(amp,';');
  2479 	if (scolon)
  2480 	{
  2481 	    for (s=amp;s<scolon;s=g_utf8_next_char(s))   
  2482 		if (g_utf8_get_char(s)==CHAR_SPACE)
  2483 		    break;		/* Don't report "Jones & Son;" */
  2484 	    if (s>=scolon)
  2485 	    {
  2486 		if (pswit[ECHO_SWITCH])
  2487 		    g_print("\n%s\n",aline);
  2488 		if (!pswit[OVERVIEW_SWITCH])
  2489 		{
  2490 		    entity=g_strndup(amp,scolon-amp+1);
  2491 		    g_print("    Line %ld column %d - HTML symbol? %s \n",
  2492 		      linecnt,(int)(amp-aline)+1,entity);
  2493 		    g_free(entity);
  2494 		}
  2495 		else
  2496 		    cnt_html++;
  2497 	    }
  2498 	}
  2499     }
  2500 }
  2501 
  2502 /*
  2503  * check_for_omitted_punctuation:
  2504  *
  2505  * Check for omitted punctuation at end of paragraph by working back
  2506  * through prevline. DW.
  2507  * Need to check this only for "normal" paras.
  2508  * So what is a "normal" para?
  2509  *    Not normal if one-liner (chapter headings, etc.)
  2510  *    Not normal if doesn't contain at least one locase letter
  2511  *    Not normal if starts with space
  2512  */
  2513 void check_for_omitted_punctuation(const char *prevline,
  2514   struct line_properties *last,int start_para_line)
  2515 {
  2516     gboolean letter_on_line=FALSE;
  2517     const char *s;
  2518     gunichar c;
  2519     for (s=prevline;*s;s=g_utf8_next_char(s))
  2520 	if (g_unichar_isalpha(g_utf8_get_char(s)))
  2521 	{
  2522 	    letter_on_line=TRUE;
  2523 	    break;
  2524 	}
  2525     /*
  2526      * This next "if" is a problem.
  2527      * If we say "start_para_line <= linecnt - 1", that includes
  2528      * one-line "paragraphs" like chapter heads. Lotsa false positives.
  2529      * If we say "start_para_line < linecnt - 1" it doesn't, but then it
  2530      * misses genuine one-line paragraphs.
  2531      */
  2532     if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
  2533       g_utf8_get_char(prevline)>CHAR_SPACE)
  2534     {
  2535 	s=prevline+strlen(prevline);
  2536 	do
  2537 	{
  2538 	    s=g_utf8_prev_char(s);
  2539 	    c=g_utf8_get_char(s);
  2540 	} while (CHAR_IS_CLOSING_QUOTE(c) && c>CHAR_SPACE && s>prevline);
  2541 	for (;s>prevline;s=g_utf8_prev_char(s))
  2542 	{
  2543 	    if (g_unichar_isalpha(g_utf8_get_char(s)))
  2544 	    {
  2545 		if (pswit[ECHO_SWITCH])
  2546 		    g_print("\n%s\n",prevline);
  2547 		if (!pswit[OVERVIEW_SWITCH])
  2548 		    g_print("    Line %ld column %ld - "
  2549 		      "No punctuation at para end?\n",
  2550 		      linecnt-1,g_utf8_strlen(prevline,-1));
  2551 		else
  2552 		    cnt_punct++;
  2553 		break;
  2554 	    }
  2555 	    if (g_utf8_strchr("-.:!([{?}])",-1,g_utf8_get_char(s)))
  2556 		break;
  2557 	}
  2558     }
  2559 }
  2560 
  2561 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
  2562 {
  2563     const char *word=key;
  2564     int *dupcnt=value;
  2565     if (*dupcnt)
  2566 	g_print("\nNote: Queried word %s was duplicated %d times\n",
  2567 	  word,*dupcnt);
  2568     return FALSE;
  2569 }
  2570 
  2571 void print_as_windows_1252(const char *string)
  2572 {
  2573     gsize inbytes,outbytes;
  2574     gchar *buf,*bp;
  2575     static GIConv converter=(GIConv)-1;
  2576     if (!string)
  2577     {
  2578 	if (converter!=(GIConv)-1)
  2579 	    g_iconv_close(converter);
  2580 	converter=(GIConv)-1;
  2581 	return;
  2582     }
  2583     if (converter==(GIConv)-1)
  2584 	converter=g_iconv_open("WINDOWS-1252","UTF-8");
  2585     if (converter!=(GIConv)-1)
  2586     {
  2587 	inbytes=outbytes=strlen(string);
  2588 	bp=buf=g_malloc(outbytes+1);
  2589 	g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
  2590 	*bp='\0';
  2591 	fputs(buf,stdout);
  2592 	g_free(buf);
  2593     }
  2594     else
  2595 	fputs(string,stdout);
  2596 }
  2597 
  2598 void print_as_utf_8(const char *string)
  2599 {
  2600     fputs(string,stdout);
  2601 }
  2602 
  2603 /*
  2604  * procfile:
  2605  *
  2606  * Process one file.
  2607  */
  2608 void procfile(const char *filename)
  2609 {
  2610     const char *s;
  2611     gchar *parastart=NULL;	/* first line of current para */
  2612     gchar *etext,*aline;
  2613     gchar *etext_ptr;
  2614     GError *err=NULL;
  2615     struct first_pass_results *first_pass_results;
  2616     struct warnings *warnings;
  2617     struct counters counters={0};
  2618     struct line_properties last={0};
  2619     struct parities parities={0};
  2620     struct pending pending={0};
  2621     gboolean isemptyline;
  2622     long start_para_line=0;
  2623     gboolean isnewpara=FALSE,enddash=FALSE;
  2624     last.start=CHAR_SPACE;
  2625     linecnt=checked_linecnt=0;
  2626     etext=read_etext(filename,&err);
  2627     if (!etext)
  2628     {
  2629 	if (pswit[STDOUT_SWITCH])
  2630 	    fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
  2631 	else
  2632 	    fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
  2633 	exit(1);
  2634     }
  2635     g_print("\n\nFile: %s\n\n",filename);
  2636     first_pass_results=first_pass(etext);
  2637     warnings=report_first_pass(first_pass_results);
  2638     qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
  2639     qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
  2640     /*
  2641      * Here we go with the main pass. Hold onto yer hat!
  2642      */
  2643     linecnt=0;
  2644     etext_ptr=etext;
  2645     while ((aline=flgets(&etext_ptr,linecnt+1)))
  2646     {
  2647 	linecnt++;
  2648 	if (linecnt==1)
  2649 	    isnewpara=TRUE;
  2650 	if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
  2651 	    continue;    // skip DP page separators completely
  2652 	if (linecnt<first_pass_results->firstline ||
  2653 	  (first_pass_results->footerline>0 &&
  2654 	  linecnt>first_pass_results->footerline))
  2655 	{
  2656 	    if (pswit[HEADER_SWITCH])
  2657 	    {
  2658 		if (g_str_has_prefix(aline,"Title:"))
  2659 		    g_print("    %s\n",aline);
  2660 		if (g_str_has_prefix(aline,"Author:"))
  2661 		    g_print("    %s\n",aline);
  2662 		if (g_str_has_prefix(aline,"Release Date:"))
  2663 		    g_print("    %s\n",aline);
  2664 		if (g_str_has_prefix(aline,"Edition:"))
  2665 		    g_print("    %s\n\n",aline);
  2666 	    }
  2667 	    continue;		/* skip through the header */
  2668 	}
  2669 	checked_linecnt++;
  2670 	print_pending(aline,parastart,&pending);
  2671 	isemptyline=analyse_quotes(aline,&counters);
  2672 	if (isnewpara && !isemptyline)
  2673 	{
  2674 	    /* This line is the start of a new paragraph. */
  2675 	    start_para_line=linecnt;
  2676 	    /* Capture its first line in case we want to report it later. */
  2677 	    g_free(parastart);
  2678 	    parastart=g_strdup(aline);
  2679 	    memset(&parities,0,sizeof(parities));  /* restart the quote count */
  2680 	    s=aline;
  2681 	    while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
  2682 	      !g_unichar_isdigit(g_utf8_get_char(s)))
  2683 		s=g_utf8_next_char(s);
  2684 	    if (g_unichar_islower(g_utf8_get_char(s)))
  2685 	    {
  2686 		/* and its first letter is lowercase */
  2687 		if (pswit[ECHO_SWITCH])
  2688 		    g_print("\n%s\n",aline);
  2689 		if (!pswit[OVERVIEW_SWITCH])
  2690 		    g_print("    Line %ld column %ld - "
  2691 		      "Paragraph starts with lower-case\n",
  2692 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2693 		else
  2694 		    cnt_punct++;
  2695 	    }
  2696 	    isnewpara=FALSE; /* Signal the end of new para processing. */
  2697 	}
  2698 	/* Check for an em-dash broken at line end. */
  2699 	if (enddash && g_utf8_get_char(aline)=='-')
  2700 	{
  2701 	    if (pswit[ECHO_SWITCH])
  2702 		g_print("\n%s\n",aline);
  2703 	    if (!pswit[OVERVIEW_SWITCH])
  2704 		g_print("    Line %ld column 1 - Broken em-dash?\n",linecnt);
  2705 	    else
  2706 		cnt_punct++;
  2707 	}
  2708 	enddash=FALSE;
  2709 	for (s=g_utf8_prev_char(aline+strlen(aline));
  2710 	  g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
  2711 	    ;
  2712 	if (s>=aline && g_utf8_get_char(s)=='-')
  2713 	    enddash=TRUE;
  2714 	check_for_control_characters(aline);
  2715 	check_for_odd_characters(aline,warnings,isemptyline);
  2716 	if (warnings->longline)
  2717 	    check_for_long_line(aline);
  2718 	if (warnings->shortline)
  2719 	    check_for_short_line(aline,&last);
  2720 	last.blen=last.len;
  2721 	last.len=g_utf8_strlen(aline,-1);
  2722 	last.start=g_utf8_get_char(aline);
  2723 	check_for_starting_punctuation(aline);
  2724 	if (warnings->dash)
  2725 	{
  2726 	    check_for_spaced_emdash(aline);
  2727 	    check_for_spaced_dash(aline);
  2728 	}
  2729 	check_for_unmarked_paragraphs(aline);
  2730 	check_for_jeebies(aline);
  2731 	check_for_mta_from(aline);
  2732 	check_for_orphan_character(aline);
  2733 	check_for_pling_scanno(aline);
  2734 	check_for_extra_period(aline,warnings);
  2735 	check_for_following_punctuation(aline);
  2736 	check_for_typos(aline,warnings);
  2737 	check_for_misspaced_punctuation(aline,&parities,isemptyline);
  2738 	check_for_double_punctuation(aline,warnings);
  2739 	check_for_spaced_quotes(aline);
  2740 	check_for_miscased_genative(aline);
  2741 	check_end_of_line(aline,warnings);
  2742 	check_for_unspaced_bracket(aline);
  2743 	if (warnings->endquote)
  2744 	    check_for_unpunctuated_endquote(aline);
  2745 	check_for_html_tag(aline);
  2746 	check_for_html_entity(aline);
  2747 	if (isemptyline)
  2748 	{
  2749 	    check_for_mismatched_quotes(&counters,&pending);
  2750 	    counters_reset(&counters);
  2751 	    /* let the next iteration know that it's starting a new para */
  2752 	    isnewpara=TRUE;
  2753 	    if (prevline)
  2754 		check_for_omitted_punctuation(prevline,&last,start_para_line);
  2755 	}
  2756 	g_free(prevline);
  2757 	prevline=g_strdup(aline);
  2758     }
  2759     linecnt++;
  2760     check_for_mismatched_quotes(&counters,&pending);
  2761     print_pending(NULL,parastart,&pending);
  2762     reset_pending(&pending);
  2763     if (prevline)
  2764     {
  2765 	g_free(prevline);
  2766 	prevline=NULL;
  2767     }
  2768     g_free(parastart);
  2769     g_free(prevline);
  2770     g_free(etext);
  2771     if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])
  2772 	g_tree_foreach(qword,report_duplicate_queries,NULL);
  2773     g_tree_unref(qword);
  2774     g_tree_unref(qperiod);
  2775     counters_destroy(&counters);
  2776     g_set_print_handler(NULL);
  2777     print_as_windows_1252(NULL);
  2778     if (pswit[MARKUP_SWITCH])  
  2779 	loseentities(NULL);
  2780 }
  2781 
  2782 /*
  2783  * flgets:
  2784  *
  2785  * Get one line from the input text, checking for
  2786  * the existence of exactly one CR/LF line-end per line.
  2787  *
  2788  * Returns: a pointer to the line.
  2789  */
  2790 char *flgets(char **etext,long lcnt)
  2791 {
  2792     gunichar c;
  2793     gboolean isCR=FALSE;
  2794     char *theline=*etext;
  2795     char *eos=theline;
  2796     gchar *s;
  2797     for (;;)
  2798     {
  2799 	c=g_utf8_get_char(*etext);
  2800 	*etext=g_utf8_next_char(*etext);
  2801 	if (!c)
  2802 	    return NULL;
  2803 	/* either way, it's end of line */
  2804 	if (c=='\n')
  2805 	{
  2806 	    if (isCR)
  2807 		break;
  2808 	    else
  2809 	    {
  2810 		/* Error - a LF without a preceding CR */
  2811 		if (pswit[LINE_END_SWITCH])
  2812 		{
  2813 		    if (pswit[ECHO_SWITCH])
  2814 		    {
  2815 			s=g_strndup(theline,eos-theline);
  2816 			g_print("\n%s\n",s);
  2817 			g_free(s);
  2818 		    }
  2819 		    if (!pswit[OVERVIEW_SWITCH])
  2820 			g_print("    Line %ld - No CR?\n",lcnt);
  2821 		    else
  2822 			cnt_lineend++;
  2823 		}
  2824 		break;
  2825 	    }
  2826 	}
  2827 	if (c=='\r')
  2828 	{
  2829 	    if (isCR)
  2830 	    {
  2831 		/* Error - two successive CRs */
  2832 		if (pswit[LINE_END_SWITCH])
  2833 		{
  2834 		    if (pswit[ECHO_SWITCH])
  2835 		    {
  2836 			s=g_strndup(theline,eos-theline);
  2837 			g_print("\n%s\n",s);
  2838 			g_free(s);
  2839 		    }
  2840 		    if (!pswit[OVERVIEW_SWITCH])
  2841 			g_print("    Line %ld - Two successive CRs?\n",lcnt);
  2842 		    else
  2843 			cnt_lineend++;
  2844 		}
  2845 	    }
  2846 	    isCR=TRUE;
  2847 	}
  2848 	else
  2849 	{
  2850 	    if (pswit[LINE_END_SWITCH] && isCR)
  2851 	    {
  2852 		if (pswit[ECHO_SWITCH])
  2853 		{
  2854 		    s=g_strndup(theline,eos-theline);
  2855 		    g_print("\n%s\n",s);
  2856 		    g_free(s);
  2857 		}
  2858 		if (!pswit[OVERVIEW_SWITCH])
  2859 		    g_print("    Line %ld column %ld - CR without LF?\n",
  2860 		      lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
  2861 		else
  2862 		    cnt_lineend++;
  2863 		*eos=' ';
  2864 	    }
  2865 	    isCR=FALSE;
  2866 	    eos=g_utf8_next_char(eos);
  2867 	}
  2868     }
  2869     *eos='\0';
  2870     if (pswit[MARKUP_SWITCH])  
  2871 	postprocess_for_HTML(theline);
  2872     if (pswit[DP_SWITCH])  
  2873 	postprocess_for_DP(theline);
  2874     return theline;
  2875 }
  2876 
  2877 /*
  2878  * mixdigit:
  2879  *
  2880  * Takes a "word" as a parameter, and checks whether it
  2881  * contains a mixture of alpha and digits. Generally, this is an
  2882  * error, but may not be for cases like 4th or L5 12s. 3d.
  2883  *
  2884  * Returns: TRUE iff an is error found.
  2885  */
  2886 gboolean mixdigit(const char *checkword)
  2887 {
  2888     gboolean wehaveadigit,wehavealetter,query;
  2889     const char *s,*nondigit;
  2890     wehaveadigit=wehavealetter=query=FALSE;
  2891     for (s=checkword;*s;s=g_utf8_next_char(s))
  2892 	if (g_unichar_isalpha(g_utf8_get_char(s)))
  2893 	    wehavealetter=TRUE;
  2894 	else if (g_unichar_isdigit(g_utf8_get_char(s)))
  2895 	    wehaveadigit=TRUE;
  2896     if (wehaveadigit && wehavealetter)
  2897     {
  2898 	/* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
  2899 	query=TRUE;
  2900 	for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
  2901 	  nondigit=g_utf8_next_char(nondigit))
  2902 	    ;
  2903 	/* digits, ending in st, rd, nd, th of either case */
  2904 	if (!g_ascii_strcasecmp(nondigit,"st") ||
  2905 	  !g_ascii_strcasecmp(nondigit,"rd") ||
  2906 	  !g_ascii_strcasecmp(nondigit,"nd") ||
  2907 	  !g_ascii_strcasecmp(nondigit,"th"))
  2908 	    query=FALSE;
  2909 	if (!g_ascii_strcasecmp(nondigit,"sts") ||
  2910 	  !g_ascii_strcasecmp(nondigit,"rds") ||
  2911 	  !g_ascii_strcasecmp(nondigit,"nds") ||
  2912 	  !g_ascii_strcasecmp(nondigit,"ths"))
  2913 	    query=FALSE;
  2914 	if (!g_ascii_strcasecmp(nondigit,"stly") ||
  2915 	  !g_ascii_strcasecmp(nondigit,"rdly") ||
  2916 	  !g_ascii_strcasecmp(nondigit,"ndly") ||
  2917 	  !g_ascii_strcasecmp(nondigit,"thly"))
  2918 	    query=FALSE;
  2919 	/* digits, ending in l, L, s or d */
  2920 	if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
  2921 	  !strcmp(nondigit,"d"))
  2922 	    query=FALSE;
  2923 	/*
  2924 	 * L at the start of a number, representing Britsh pounds, like L500.
  2925 	 * This is cute. We know the current word is mixed digit. If the first
  2926 	 * letter is L, there must be at least one digit following. If both
  2927 	 * digits and letters follow, we have a genuine error, else we have a
  2928 	 * capital L followed by digits, and we accept that as a non-error.
  2929 	 */
  2930 	if (g_utf8_get_char(checkword)=='L' &&
  2931 	  !mixdigit(g_utf8_next_char(checkword)))
  2932 	    query=FALSE;
  2933     }
  2934     return query;
  2935 }
  2936 
  2937 /*
  2938  * getaword:
  2939  *
  2940  * Extracts the first/next "word" from the line, and returns it.
  2941  * A word is defined as one English word unit--or at least that's the aim.
  2942  * "ptr" is advanced to the position in the line where we will start
  2943  * looking for the next word.
  2944  *
  2945  * Returns: A newly-allocated string.
  2946  */
  2947 gchar *getaword(const char **ptr)
  2948 {
  2949     const char *s,*t;
  2950     GString *word;
  2951     gunichar c,pc;
  2952     word=g_string_new(NULL);
  2953     for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
  2954       !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
  2955       **ptr;*ptr=g_utf8_next_char(*ptr))
  2956 	;
  2957     /*
  2958      * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
  2959      * Especially yucky is the case of L1,000
  2960      * This section looks for a pattern of characters including a digit
  2961      * followed by a comma or period followed by one or more digits.
  2962      * If found, it returns this whole pattern as a word; otherwise we discard
  2963      * the results and resume our normal programming.
  2964      */
  2965     s=*ptr;
  2966     for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
  2967       g_unichar_isalpha(g_utf8_get_char(s)) ||
  2968       g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
  2969 	g_string_append_unichar(word,g_utf8_get_char(s));
  2970     if (word->len)
  2971     {
  2972 	for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))
  2973 	{
  2974 	    c=g_utf8_get_char(t);
  2975 	    pc=g_utf8_get_char(g_utf8_prev_char(t));
  2976 	    if ((c=='.' || c==',') && g_unichar_isdigit(pc))
  2977 	    {
  2978 		*ptr=s;
  2979 		return g_string_free(word,FALSE);
  2980 	    }
  2981 	}
  2982     }
  2983     /* we didn't find a punctuated number - do the regular getword thing */
  2984     g_string_truncate(word,0);
  2985     c=g_utf8_get_char(*ptr);
  2986     for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);
  2987       *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))
  2988 	g_string_append_unichar(word,c);
  2989     return g_string_free(word,FALSE);
  2990 }
  2991 
  2992 /*
  2993  * isroman:
  2994  *
  2995  * Is this word a Roman Numeral?
  2996  *
  2997  * It doesn't actually validate that the number is a valid Roman Numeral--for
  2998  * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
  2999  * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
  3000  * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
  3001  * expressions thereof, except when it came to taxes. Allow any number of M,
  3002  * an optional D, an optional CM or CD, any number of optional Cs, an optional
  3003  * XL or an optional XC, an optional IX or IV, an optional V and any number
  3004  * of optional Is.
  3005  */
  3006 gboolean isroman(const char *t)
  3007 {
  3008     const char *s;
  3009     if (!t || !*t)
  3010 	return FALSE;
  3011     s=t;
  3012     while (g_utf8_get_char(t)=='m' && *t)
  3013 	t++;
  3014     if (g_utf8_get_char(t)=='d')
  3015 	t++;
  3016     if (g_str_has_prefix(t,"cm"))
  3017 	t+=2;
  3018     if (g_str_has_prefix(t,"cd"))
  3019 	t+=2;
  3020     while (g_utf8_get_char(t)=='c' && *t)
  3021 	t++;
  3022     if (g_str_has_prefix(t,"xl"))
  3023 	t+=2;
  3024     if (g_str_has_prefix(t,"xc"))
  3025 	t+=2;
  3026     if (g_utf8_get_char(t)=='l')
  3027 	t++;
  3028     while (g_utf8_get_char(t)=='x' && *t)
  3029 	t++;
  3030     if (g_str_has_prefix(t,"ix"))
  3031 	t+=2;
  3032     if (g_str_has_prefix(t,"iv"))
  3033 	t+=2;
  3034     if (g_utf8_get_char(t)=='v')
  3035 	t++;
  3036     while (g_utf8_get_char(t)=='i' && *t)
  3037 	t++;
  3038     return !*t;
  3039 }
  3040 
  3041 /*
  3042  * postprocess_for_DP:
  3043  *
  3044  * Invoked with the -d switch from flgets().
  3045  * It simply "removes" from the line a hard-coded set of common
  3046  * DP-specific tags, so that the line passed to the main routine has
  3047  * been pre-cleaned of DP markup.
  3048  */
  3049 void postprocess_for_DP(char *theline)
  3050 {
  3051     char *s,*t;
  3052     int i;
  3053     if (!*theline) 
  3054 	return;
  3055     for (i=0;*DPmarkup[i];i++)
  3056 	while ((s=strstr(theline,DPmarkup[i])))
  3057 	{
  3058 	    t=s+strlen(DPmarkup[i]);
  3059 	    memmove(s,t,strlen(t)+1);
  3060 	}
  3061 }
  3062 
  3063 /*
  3064  * postprocess_for_HTML:
  3065  *
  3066  * Invoked with the -m switch from flgets().
  3067  * It simply "removes" from the line a hard-coded set of common
  3068  * HTML tags and "replaces" a hard-coded set of common HTML
  3069  * entities, so that the line passed to the main routine has
  3070  * been pre-cleaned of HTML.
  3071  */
  3072 void postprocess_for_HTML(char *theline)
  3073 {
  3074     while (losemarkup(theline))
  3075 	;
  3076     loseentities(theline);
  3077 }
  3078 
  3079 char *losemarkup(char *theline)
  3080 {
  3081     char *s,*t;
  3082     int i;
  3083     s=strchr(theline,'<');
  3084     t=s?strchr(s,'>'):NULL;
  3085     if (!s || !t)
  3086 	return NULL;
  3087     for (i=0;*markup[i];i++)
  3088 	if (tagcomp(g_utf8_next_char(s),markup[i]))
  3089 	{
  3090 	    t=g_utf8_next_char(t);
  3091 	    memmove(s,t,strlen(t)+1);
  3092 	    return s;
  3093 	}
  3094     /* It's an unrecognized <xxx>. */
  3095     return NULL;
  3096 }
  3097 
  3098 void loseentities(char *theline)
  3099 {
  3100     int i;
  3101     gsize nb;
  3102     char *amp,*scolon;
  3103     gchar *s,*t;
  3104     gunichar c;
  3105     GTree *entities=NULL;
  3106     static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
  3107     if (!theline)
  3108     {
  3109 	if (entities)
  3110 	    g_tree_destroy(entities);
  3111 	entities=NULL;
  3112 	if (translit!=(GIConv)-1)
  3113 	    g_iconv_close(translit);
  3114 	translit=(GIConv)-1;
  3115 	if (to_utf8!=(GIConv)-1)
  3116 	    g_iconv_close(to_utf8);
  3117 	to_utf8=(GIConv)-1;
  3118 	return;
  3119     }
  3120     if (!*theline)
  3121 	return;
  3122     if (!entities)
  3123     {
  3124 	entities=g_tree_new((GCompareFunc)strcmp);
  3125 	for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
  3126 	    g_tree_insert(entities,HTMLentities[i].name,
  3127 	      GUINT_TO_POINTER(HTMLentities[i].c));
  3128     }
  3129     if (translit==(GIConv)-1)
  3130 	translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
  3131     if (to_utf8==(GIConv)-1)
  3132 	to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
  3133     while((amp=strchr(theline,'&')))
  3134     {
  3135 	scolon=strchr(amp,';');
  3136 	if (scolon)
  3137 	{
  3138 	    if (amp[1]=='#')
  3139 	    {
  3140 		if (amp+2+strspn(amp+2,"0123456789")==scolon)
  3141 		    c=strtol(amp+2,NULL,10);
  3142 		else if (amp[2]=='x' &&
  3143 		  amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
  3144 		    c=strtol(amp+3,NULL,16);
  3145 	    }
  3146 	    else
  3147 	    {
  3148 		s=g_strndup(amp+1,scolon-(amp+1));
  3149 	        c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
  3150 		g_free(s);
  3151 	    }
  3152 	}
  3153 	else
  3154 	    c=0;
  3155 	if (c)
  3156 	{
  3157 	    theline=amp;
  3158 	    if (c<128 || c>=192 && c<=255)	/* An ISO-8859-1 character */
  3159 		theline+=g_unichar_to_utf8(c,theline);
  3160 	    else
  3161 	    {
  3162 		s=g_malloc(6);
  3163 		nb=g_unichar_to_utf8(c,s);
  3164 		t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
  3165 		g_free(s);
  3166 		s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
  3167 		g_free(t);
  3168 		memcpy(theline,s,nb);
  3169 		g_free(s);
  3170 		theline+=nb;
  3171 	    }
  3172 	    memmove(theline,g_utf8_next_char(scolon),
  3173 	      strlen(g_utf8_next_char(scolon))+1);
  3174 	}
  3175 	else
  3176 	    theline=g_utf8_next_char(amp);
  3177     }
  3178 }
  3179 
  3180 gboolean tagcomp(const char *strin,const char *basetag)
  3181 {
  3182     gboolean retval;
  3183     gchar *s,*t;
  3184     if (g_utf8_get_char(strin)=='/')
  3185 	t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
  3186     else
  3187 	t=g_utf8_casefold(strin,-1);
  3188     s=g_utf8_casefold(basetag,-1);
  3189     retval=g_str_has_prefix(t,s);
  3190     g_free(s);
  3191     g_free(t);
  3192     return retval;
  3193 }
  3194 
  3195 void proghelp(GOptionContext *context)
  3196 {
  3197     gchar *help;
  3198     fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
  3199     fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
  3200     fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
  3201     fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
  3202       "For details, read the file COPYING.\n",stderr);
  3203     fputs("This is Free Software; "
  3204       "you may redistribute it under certain conditions (GPL);\n",stderr);
  3205     fputs("read the file COPYING for details.\n\n",stderr);
  3206     help=g_option_context_get_help(context,TRUE,NULL);
  3207     fputs(help,stderr);
  3208     g_free(help);
  3209     fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
  3210     fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
  3211       "non-ASCII\n",stderr);
  3212     fputs("characters like accented letters, "
  3213       "lines longer than 75 or shorter than 55,\n",stderr);
  3214     fputs("unbalanced quotes or brackets, "
  3215       "a variety of badly formatted punctuation, \n",stderr);
  3216     fputs("HTML tags, some likely typos. "
  3217       "It is NOT a substitute for human judgement.\n",stderr);
  3218     fputs("\n",stderr);
  3219 }