bookloupe/bookloupe.c
author ali <ali@juiblex.co.uk>
Wed Oct 16 22:51:29 2013 +0100 (2013-10-16)
changeset 189 43b8447c9ea7
parent 174 ad92d11d59b8
child 194 b1d73702edb2
child 203 53532149d849
permissions -rw-r--r--
Fix bug #28: Don't report ., as double punctuation after "etc" or "&c"
     1 /*************************************************************************/
     2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
     3 /*									 */
     4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>			 */
     5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>			 */
     6 /*									 */
     7 /* This program is free software; you can redistribute it and/or modify  */
     8 /* it under the terms of the GNU General Public License as published by  */
     9 /* the Free Software Foundation; either version 2 of the License, or     */
    10 /* (at your option) any later version.					 */
    11 /*									 */
    12 /* This program is distributed in the hope that it will be useful,       */
    13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of	 */
    14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the		 */
    15 /* GNU General Public License for more details.				 */
    16 /*									 */
    17 /* You should have received a copy of the GNU General Public License	 */
    18 /* along with this program. If not, see <http://www.gnu.org/licenses/>.	 */
    19 /*************************************************************************/
    20 
    21 #include <stdio.h>
    22 #include <stdlib.h>
    23 #include <string.h>
    24 #include <ctype.h>
    25 #ifdef __WIN32__
    26 #include <windows.h>
    27 #endif
    28 #include <glib.h>
    29 #include <bl/bl.h>
    30 #include "bookloupe.h"
    31 #include "counters.h"
    32 #include "pending.h"
    33 #include "HTMLentities.h"
    34 
/*
 * NOTE(review): presumably the previously processed line of text; it is
 * not referenced in the part of the file visible here -- confirm usage.
 */
gchar *prevline;

/*
 * Common typos.
 *
 * All entries are lower case. The final empty string marks the end of
 * the list (presumably the lookup code stops at it -- confirm against
 * the code that walks this table).
 */
char *typo[] = {
    "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
    "nad", "te", "ig", "acn",  "ahve", "alot", "anbd", "andt", "awya", "aywa",
    "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
    "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
    "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
    "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
    "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
    "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
    "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
    "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
    "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
    "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
    "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
    "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
    "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
    "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
    "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
    "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
    "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
    "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
    "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
    "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
    "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
    "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
    "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
    "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
    "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
    "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
    "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
    "se", ""
};
    70 
/*
 * User-supplied typos, keyed by the word itself. Created and filled by
 * read_user_scannos(); stays NULL when no user typo file was loaded.
 */
GTree *usertypo;

/*
 * Common abbreviations and other OK words not to query as typos.
 * The list ends with an empty string.
 */
char *okword[] = {
    "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
    "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
    "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
    "outbid", "outbids", "frostbite", "frostbitten", ""
};

/*
 * Common abbreviations that cause otherwise unexplained periods.
 * The list ends with an empty string.
 */
char *abbrev[] = {
    "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
    "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
};

/*
 * Two-Letter combinations that rarely if ever start words,
 * but are common scannos or otherwise common letter combinations.
 * The list ends with an empty string.
 */
char *nostart[] = {
    "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
};

/*
 * Two-Letter combinations that rarely if ever end words,
 * but are common scannos or otherwise common letter combinations.
 * The list ends with an empty string.
 */
char *noend[] = {
    "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
    "sw", "gr", "sl", "cl", "iy", ""
};

/*
 * Tag names, without angle brackets, ending with an empty string.
 * NOTE(review): these look like the HTML tags recognised when markup is
 * being ignored -- confirm against the code that consults this table.
 */
char *markup[] = {
    "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
    "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
    "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
    "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
};

/*
 * Literal markup tokens, ending with an empty string.
 * NOTE(review): presumably the DP-specific markup that the --dp switch
 * asks to be ignored -- confirm against postprocess_for_DP().
 */
char *DPmarkup[] = {
    "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
};

/*
 * NOTE(review): by its name, words that are not expected to be followed
 * by a comma -- confirm against the code that uses it. "mrs" appears
 * twice. The list ends with an empty string.
 */
char *nocomma[] = {
    "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
    "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
    "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
    "during", "let", "toward", "among", ""
};

/*
 * NOTE(review): by its name, words that are not expected to be followed
 * by a period -- confirm against the code that uses it.
 * The list ends with an empty string.
 */
char *noperiod[] = {
    "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
    "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
    "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
    "among", "those", "into", "whom", "having", "thence", ""
}; 
   128 
/*
 * Program switches, indexed by the *_SWITCH constants. The option parser
 * writes the raw command-line flags here; parse_options() then inverts
 * PARANOID_SWITCH, LINE_END_SWITCH and ECHO_SWITCH (whose options turn
 * the feature OFF) and applies the remaining defaults.
 */
gboolean pswit[SWITNO];  /* program switches */

/*
 * Command-line option table handed to GLib's option parser by
 * parse_options(). Every entry is a plain flag stored in pswit[].
 */
static GOptionEntry options[]={
    { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
      "Ignore DP-specific markup", NULL },
    { "noecho", 'e', 0, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
      "Don't echo queried line", NULL },
    { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
      "Check single quotes", NULL },
    { "typo", 't', 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
      "Check common typos", NULL },
    { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
      "Require closure of quotes on every paragraph", NULL },
    { "relaxed", 'x', 0, G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
      "Disable paranoid querying of everything", NULL },
    { "line-end", 'l', 0, G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
      "Disable line end checking", NULL },
    { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
      "Overview: just show counts", NULL },
    { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
      "Output errors to stdout instead of stderr", NULL },
    { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
      "Echo header fields", NULL },
    { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
      "Ignore markup in < >", NULL },
    { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
      "Use file of user-defined typos", NULL },
    { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
      "Defaults for use on www upload", NULL },
    { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
      "Verbose - list everything", NULL },
    { NULL }
};
   162 
/*
 * Running totals. The cnt_* query counters are printed by main() when
 * the overview switch is set; cnt_spacend and linecnt are updated by
 * first_pass().
 */
long cnt_quote;		/* for overview mode, count of quote queries */
long cnt_brack;		/* for overview mode, count of brackets queries */
long cnt_bin;		/* for overview mode, count of non-ASCII queries */
long cnt_odd;		/* for overview mode, count of odd character queries */
long cnt_long;		/* for overview mode, count of long line errors */
long cnt_short;		/* for overview mode, count of short line queries */
long cnt_punct;		/* for overview mode,
			   count of punctuation and spacing queries */
long cnt_dash;		/* for overview mode, count of dash-related queries */
long cnt_word;		/* for overview mode, count of word queries */
long cnt_html;		/* for overview mode, count of html queries */
long cnt_lineend;	/* for overview mode, count of line-end queries */
long cnt_spacend;	/* count of lines with space at end */
long linecnt;		/* count of total lines in the file */
long checked_linecnt;	/* count of lines actually checked */
   178 
/* Forward declarations for functions defined later in this file. */
void proghelp(GOptionContext *context);
void procfile(const char *);

/*
 * Directory part of argv[0], set at the start of main(); used by
 * read_user_scannos() as a fallback place to look for the typo file.
 */
gchar *running_from;

gboolean mixdigit(const char *);
gchar *getaword(const char **);
char *flgets(char **,long);
void postprocess_for_HTML(char *);
char *linehasmarkup(char *);
char *losemarkup(char *);
gboolean tagcomp(const char *,const char *);
void loseentities(char *);
gboolean isroman(const char *);
void postprocess_for_DP(char *);
/* Installed as GLib print handlers by read_etext(). */
void print_as_windows_1252(const char *string);
void print_as_utf_8(const char *string);

/* NOTE(review): not used in the visible part of the file -- confirm role. */
GTree *qword,*qperiod;

#ifdef __WIN32__
/* Console output code page in effect at startup; restored at exit. */
UINT saved_cp;
#endif
   202 
   203 void parse_options(int *argc,char ***argv)
   204 {
   205     GError *err=NULL;
   206     GOptionContext *context;
   207     context=g_option_context_new(
   208       "file - looks for errors in Project Gutenberg(TM) etexts");
   209     g_option_context_add_main_entries(context,options,NULL);
   210     if (!g_option_context_parse(context,argc,argv,&err))
   211     {
   212 	g_printerr("Bookloupe: %s\n",err->message);
   213 	g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
   214 	exit(1);
   215     }
   216     /* Paranoid checking is turned OFF, not on, by its switch */
   217     pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
   218     if (pswit[PARANOID_SWITCH])
   219 	/* if running in paranoid mode, typo checks default to enabled */
   220 	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
   221     /* Line-end checking is turned OFF, not on, by its switch */
   222     pswit[LINE_END_SWITCH]=!pswit[LINE_END_SWITCH];
   223     /* Echoing is turned OFF, not on, by its switch */
   224     pswit[ECHO_SWITCH]=!pswit[ECHO_SWITCH];
   225     if (pswit[OVERVIEW_SWITCH])
   226 	/* just print summary; don't echo */
   227 	pswit[ECHO_SWITCH]=FALSE;
   228     /*
   229      * Web uploads - for the moment, this is really just a placeholder
   230      * until we decide what processing we really want to do on web uploads
   231      */
   232     if (pswit[WEB_SWITCH])
   233     {
   234 	/* specific override for web uploads */
   235 	pswit[ECHO_SWITCH]=TRUE;
   236 	pswit[SQUOTE_SWITCH]=FALSE;
   237 	pswit[TYPO_SWITCH]=TRUE;
   238 	pswit[QPARA_SWITCH]=FALSE;
   239 	pswit[PARANOID_SWITCH]=TRUE;
   240 	pswit[LINE_END_SWITCH]=FALSE;
   241 	pswit[OVERVIEW_SWITCH]=FALSE;
   242 	pswit[STDOUT_SWITCH]=FALSE;
   243 	pswit[HEADER_SWITCH]=TRUE;
   244 	pswit[VERBOSE_SWITCH]=FALSE;
   245 	pswit[MARKUP_SWITCH]=FALSE;
   246 	pswit[USERTYPO_SWITCH]=FALSE;
   247 	pswit[DP_SWITCH]=FALSE;
   248     }
   249     if (*argc<2)
   250     {
   251 	proghelp(context);
   252 	exit(1);
   253     }
   254     g_option_context_free(context);
   255 }
   256 
   257 /*
   258  * read_user_scannos:
   259  *
   260  * Read in the user-defined stealth scanno list.
   261  */
   262 void read_user_scannos(void)
   263 {
   264     GError *err=NULL;
   265     gchar *usertypo_file;
   266     gboolean okay;
   267     int i;
   268     gsize len,nb;
   269     gchar *contents,*utf8,**lines;
   270     usertypo_file=g_strdup("bookloupe.typ");
   271     okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   272     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   273     {
   274 	g_clear_error(&err);
   275 	g_free(usertypo_file);
   276 	usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
   277 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   278     }
   279     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   280     {
   281 	g_clear_error(&err);
   282 	g_free(usertypo_file);
   283 	usertypo_file=g_strdup("gutcheck.typ");
   284 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   285     }
   286     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   287     {
   288 	g_clear_error(&err);
   289 	g_free(usertypo_file);
   290 	usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
   291 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   292     }
   293     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   294     {
   295 	g_free(usertypo_file);
   296 	g_print("   --> I couldn't find bookloupe.typ "
   297 	  "-- proceeding without user typos.\n");
   298 	return;
   299     }
   300     else if (!okay)
   301     {
   302 	fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
   303 	g_free(usertypo_file);
   304 	g_clear_error(&err);
   305 	exit(1);
   306     }
   307     if (g_utf8_validate(contents,len,NULL))
   308 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
   309     else
   310 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
   311     g_free(contents);
   312     lines=g_strsplit_set(utf8,"\r\n",0);
   313     g_free(utf8);
   314     usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
   315     for (i=0;lines[i];i++)
   316 	if (*(unsigned char *)lines[i]>'!')
   317 	    g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
   318 	else
   319 	    g_free(lines[i]);
   320     g_free(lines);
   321 }
   322 
   323 /*
   324  * read_etext:
   325  *
   326  * Read an etext returning a newly allocated string containing the file
   327  * contents or NULL on error.
   328  */
   329 gchar *read_etext(const char *filename,GError **err)
   330 {
   331     GError *tmp_err=NULL;
   332     gchar *contents,*utf8;
   333     gsize len,bytes_read,bytes_written;
   334     int i,line,col;
   335     if (!g_file_get_contents(filename,&contents,&len,err))
   336 	return NULL;
   337     if (g_utf8_validate(contents,len,NULL))
   338     {
   339 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
   340 	g_set_print_handler(print_as_utf_8);
   341 #ifdef __WIN32__
   342 	SetConsoleOutputCP(CP_UTF8);
   343 #endif
   344     }
   345     else
   346     {
   347 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,
   348 	  &bytes_written,&tmp_err);
   349 	if (g_error_matches(tmp_err,G_CONVERT_ERROR,
   350 	  G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
   351 	{
   352 	    line=col=1;
   353 	    for(i=0;i<bytes_read;i++)
   354 		if (contents[i]=='\n')
   355 		{
   356 		    line++;
   357 		    col=1;
   358 		}
   359 		else if (contents[i]!='\r')
   360 		    col++;
   361 	    g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
   362 	      "Input conversion failed. Byte %d at line %d, column %d is not a "
   363 	      "valid Windows-1252 character",
   364 	      ((unsigned char *)contents)[bytes_read],line,col);
   365 	}
   366 	else if (tmp_err)
   367 	    g_propagate_error(err,tmp_err);
   368 	g_set_print_handler(print_as_windows_1252);
   369 #ifdef __WIN32__
   370 	SetConsoleOutputCP(1252);
   371 #endif
   372     }
   373     g_free(contents);
   374     return utf8;
   375 }
   376 
/*
 * cleanup_on_exit:
 *
 * On Windows builds, set the console output code page back to the value
 * held in saved_cp. Does nothing on other platforms. main() registers
 * this function with atexit().
 */
void cleanup_on_exit(void)
{
#ifdef __WIN32__
    SetConsoleOutputCP(saved_cp);
#endif
}
   383 
   384 int main(int argc,char **argv)
   385 {
   386 #ifdef __WIN32__
   387     atexit(cleanup_on_exit);
   388     saved_cp=GetConsoleOutputCP();
   389 #endif
   390     running_from=g_path_get_dirname(argv[0]);
   391     parse_options(&argc,&argv);
   392     if (pswit[USERTYPO_SWITCH])
   393 	read_user_scannos();
   394     fprintf(stderr,"bookloupe: Check and report on an e-text\n");
   395     procfile(argv[1]);
   396     if (pswit[OVERVIEW_SWITCH])
   397     {
   398 	g_print("    Checked %ld lines of %ld (head+foot = %ld)\n\n",
   399 	  checked_linecnt,linecnt,linecnt-checked_linecnt);
   400 	g_print("    --------------- Queries found --------------\n");
   401 	if (cnt_long)
   402 	    g_print("    Long lines:		    %14ld\n",cnt_long);
   403 	if (cnt_short)
   404 	    g_print("    Short lines:		   %14ld\n",cnt_short);
   405 	if (cnt_lineend)
   406 	    g_print("    Line-end problems:	     %14ld\n",cnt_lineend);
   407 	if (cnt_word)
   408 	    g_print("    Common typos:		  %14ld\n",cnt_word);
   409 	if (cnt_quote)
   410 	    g_print("    Unmatched quotes:	      %14ld\n",cnt_quote);
   411 	if (cnt_brack)
   412 	    g_print("    Unmatched brackets:	    %14ld\n",cnt_brack);
   413 	if (cnt_bin)
   414 	    g_print("    Non-ASCII characters:	  %14ld\n",cnt_bin);
   415 	if (cnt_odd)
   416 	    g_print("    Proofing characters:	   %14ld\n",cnt_odd);
   417 	if (cnt_punct)
   418 	    g_print("    Punctuation & spacing queries: %14ld\n",cnt_punct);
   419 	if (cnt_dash)
   420 	    g_print("    Non-standard dashes:	   %14ld\n",cnt_dash);
   421 	if (cnt_html)
   422 	    g_print("    Possible HTML tags:	    %14ld\n",cnt_html);
   423 	g_print("\n");
   424 	g_print("    TOTAL QUERIES		  %14ld\n",
   425 	  cnt_quote+cnt_brack+cnt_bin+cnt_odd+cnt_long+cnt_short+cnt_punct+
   426 	  cnt_dash+cnt_word+cnt_html+cnt_lineend);
   427     }
   428     g_free(running_from);
   429     if (usertypo)
   430 	g_tree_unref(usertypo);
   431     return 0;
   432 }
   433 
   434 void count_dashes(const char *line,const char *dash,
   435   struct dash_results *results)
   436 {
   437     int i;
   438     gchar **tokens;
   439     gunichar pc,nc;
   440     gboolean spaced=FALSE,unspaced=FALSE,spaced2=FALSE;
   441     if (!*line)
   442 	return;
   443     tokens=g_strsplit(line,dash,0);
   444     if (tokens[1])
   445 	results->base++;
   446     for(i=1;tokens[i];i++)
   447     {
   448 	pc=g_utf8_get_char(g_utf8_prev_char(tokens[i-1]+strlen(tokens[i-1])));
   449 	nc=g_utf8_get_char(tokens[i]);
   450 	if (g_unichar_isspace(pc) || g_unichar_isspace(nc))
   451 	    spaced=TRUE;
   452 	if (g_unichar_isspace(pc) && g_unichar_isspace(nc))
   453 	    spaced2=TRUE;
   454 	else if (!g_unichar_isspace(pc) && !g_unichar_isspace(nc))
   455 	    unspaced=TRUE;
   456     }
   457     if (spaced)
   458 	results->space++;
   459     if (spaced2)
   460 	/* count of lines with em-dashes with spaces both sides */
   461 	results->non_PG_space++;
   462     if (unspaced)
   463 	/* count of lines with PG-type em-dashes with no spaces */
   464 	results->PG_space++;
   465     g_strfreev(tokens);
   466 }
   467 
/*
 * first_pass:
 *
 * Run a first pass - verify that it's a valid PG
 * file, decide whether to report some things that
 * occur many times in the text like long or short
 * lines, non-standard dashes, etc.
 *
 * etext is the whole text; it is split on '\n' and trailing '\r'
 * characters are dropped from each line. Every line bumps the global
 * linecnt; lines ending in white space bump cnt_spacend.
 *
 * Returns: a pointer to a function-static results structure. The
 * structure is not reset between calls, so a second call would add to
 * the totals of the first.
 */
struct first_pass_results *first_pass(const char *etext)
{
    gunichar laststart=CHAR_SPACE;
    const char *s;
    gchar *lc_line;
    int i,j,lbytes,llen;
    gchar **lines;
    unsigned int lastlen=0,lastblen=0;
    long spline=0,nspline=0;
    static struct first_pass_results results={0};
    struct dash_results tmp_dash_results;
    gchar *inword;
    QuoteClass qc;
    lines=g_strsplit(etext,"\n",0);
    for (j=0;lines[j];j++)
    {
	/* lbytes is the length in bytes, llen the length in characters */
	lbytes=strlen(lines[j]);
	while (lbytes>0 && lines[j][lbytes-1]=='\r')
	    lines[j][--lbytes]='\0';
	llen=g_utf8_strlen(lines[j],lbytes);
	linecnt++;
	/* old-style header: "*END ... SMALL PRINT ..." */
	if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
	  (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
	{
	    if (spline)
		g_print("   --> Duplicate header?\n");
	    spline=linecnt+1;   /* first line of non-header text, that is */
	}
	/* new-style header: "*** START ... PROJECT GUTENBERG ..." */
	if (!strncmp(lines[j],"*** START",9) &&
	  strstr(lines[j],"PROJECT GUTENBERG"))
	{
	    if (nspline)
		g_print("   --> Duplicate header?\n");
	    nspline=linecnt+1;   /* first line of non-header text, that is */
	}
	/*
	 * Once a header has been seen, look for the footer: a line which,
	 * ignoring case, has "end" somewhere before "project gutenberg".
	 */
	if (spline || nspline)
	{
	    lc_line=g_utf8_strdown(lines[j],lbytes);
	    if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
	    {
		if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
		{
		    if (results.footerline)
		    {
			/* it's an old-form header - we can detect duplicates */
			if (!nspline)
			    g_print("   --> Duplicate footer?\n");
		    }
		    else
			results.footerline=linecnt;
		}
	    }
	    g_free(lc_line);
	}
	if (spline)
	    results.firstline=spline;
	if (nspline)
	    results.firstline=nspline;  /* override with new */
	if (results.footerline)
	    continue;    /* don't count the boilerplate in the footer */
	results.totlen+=llen;
	/*
	 * Per-character statistics: characters above 127, alphabetic
	 * characters, and closing or neutral double quotes that directly
	 * follow a letter.
	 */
	for (s=lines[j];*s;s=g_utf8_next_char(s))
	{
	    if (g_utf8_get_char(s)>127)
		results.binlen++;
	    if (g_unichar_isalpha(g_utf8_get_char(s)))
		results.alphalen++;
	    if (s>lines[j])
	    {
		if (CHAR_IS_DQUOTE(g_utf8_get_char(s)))
		    qc=QUOTE_CLASS(g_utf8_get_char(s));
		else
		    qc=INVALID_QUOTE;
		if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) &&
		  g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
		    results.endquote_count++;
	    }
	}
	/*
	 * The previous line counts as short when it is shorter than
	 * SHORTEST_PG_LINE, the line before it was longer than that, this
	 * line is not (nearly) empty and the previous line did not start
	 * with a space.
	 */
	if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
	  lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
	    results.shortline++;
	/* last character is a space or a control character */
	if (lbytes>0 &&
	  g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
	    cnt_spacend++;
	if (strstr(lines[j],".,"))
	    results.dotcomma++;
	/* only count ast lines for ignoring purposes where there is */
	/* locase text on the line */
	if (strchr(lines[j],'*'))
	{
	    for (s=lines[j];*s;s=g_utf8_next_char(s))
		if (g_unichar_islower(g_utf8_get_char(s)))
		    break;
	    if (*s)
		results.astline++;
	}
	if (strchr(lines[j],'/'))
	    results.fslashline++;
	/*
	 * Line-end hyphen: skip back over trailing white space and count
	 * the line if what is left ends in a single '-' (not "--").
	 */
	if (lbytes>0)
	{
	    for (s=g_utf8_prev_char(lines[j]+lbytes);
	      s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;
	      s=g_utf8_prev_char(s))
		;
	    if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
	      g_utf8_get_char(g_utf8_prev_char(s))!='-')
		results.hyphens++;
	}
	if (llen>LONGEST_PG_LINE)
	    results.longline++;
	if (llen>WAY_TOO_LONG)
	    results.verylongline++;
	/* a '<' followed later on the line by a '>' suggests an HTML tag */
	if (strchr(lines[j],'<') && strchr(lines[j],'>'))
	{
	    i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
	    if (i>0)
		results.htmcount++;
	    if (strstr(lines[j],"<i>"))
		results.htmcount+=4; /* bonus marks! */
	}
	/* Check for spaced em-dashes */
	/* both "--" and the em-dash character feed the same per-line tally */
	memset(&tmp_dash_results,0,sizeof(tmp_dash_results));
	count_dashes(lines[j],"--",&tmp_dash_results);
	count_dashes(lines[j],"—",&tmp_dash_results);
	if (tmp_dash_results.base)
	    results.emdash.base++;
	if (tmp_dash_results.non_PG_space)
	    results.emdash.non_PG_space++;
	if (tmp_dash_results.PG_space)
	    results.emdash.PG_space++;
	/*
	 * Word statistics; getaword() advances s and returns a string
	 * that is freed here. The Dutch and French marker words feed the
	 * language guess made by report_first_pass().
	 */
	for (s=lines[j];*s;)
	{
	    inword=getaword(&s);
	    if (!strcmp(inword,"hij") || !strcmp(inword,"niet")) 
		results.Dutchcount++;
	    if (!strcmp(inword,"dans") || !strcmp(inword,"avec")) 
		results.Frenchcount++;
	    if (!strcmp(inword,"0") || !strcmp(inword,"1")) 
		results.standalone_digit++;
	    g_free(inword);
	}
	/* Check for spaced dashes */
	/* only the first " -" on the line is examined */
	if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
	    results.spacedash++;
	/* remember this line's details for the next short-line test */
	lastblen=lastlen;
	lastlen=llen;
	laststart=lines[j][0];
    }
    g_strfreev(lines);
    return &results;
}
   627 
/*
 * report_first_pass:
 *
 * Make some snap decisions based on the first pass results.
 *
 * Each class of query starts out enabled and is switched off, with a
 * note printed, when the first pass found so many instances that
 * reporting them would be noise. Verbose mode switches them all back
 * on. This function may also turn on pswit[MARKUP_SWITCH], reset
 * results->firstline to 0, or terminate the program if the file does
 * not look like text.
 *
 * Returns: a pointer to a function-static warnings structure.
 */
struct warnings *report_first_pass(struct first_pass_results *results)
{
    static struct warnings warnings={0};
    if (cnt_spacend>0)
	g_print("   --> %ld lines in this file have white space at end\n",
	  cnt_spacend);
    /* If more than 5 lines contain ".,", don't bother reporting them. */
    warnings.dotcomma=1;
    if (results->dotcomma>5)
    {
	warnings.dotcomma=0;
	g_print("   --> %ld lines in this file contain '.,'. "
	  "Not reporting them.\n",results->dotcomma);
    }
    /*
     * If more than 50 lines, or one-tenth, are short,
     * don't bother reporting them.
     */
    warnings.shortline=1;
    if (results->shortline>50 || results->shortline*10>linecnt)
    {
	warnings.shortline=0;
	g_print("   --> %ld lines in this file are short. "
	  "Not reporting short lines.\n",results->shortline);
    }
    /*
     * If more than 50 lines, or one-tenth, are long,
     * don't bother reporting them.
     */
    warnings.longline=1;
    if (results->longline>50 || results->longline*10>linecnt)
    {
	warnings.longline=0;
	g_print("   --> %ld lines in this file are long. "
	  "Not reporting long lines.\n",results->longline);
    }
    /* If more than 10 lines contain asterisks, don't bother reporting them. */
    warnings.ast=1;
    if (results->astline>10)
    {
	warnings.ast=0;
	g_print("   --> %ld lines in this file contain asterisks. "
	  "Not reporting them.\n",results->astline);
    }
    /*
     * If more than 10 lines contain forward slashes,
     * don't bother reporting them.
     */
    warnings.fslash=1;
    if (results->fslashline>10)
    {
	warnings.fslash=0;
	g_print("   --> %ld lines in this file contain forward slashes. "
	  "Not reporting them.\n",results->fslashline);
    }
    /*
     * If more than 20 lines contain unpunctuated endquotes,
     * don't bother reporting them.
     */
    warnings.endquote=1;
    if (results->endquote_count>20)
    {
	warnings.endquote=0;
	g_print("   --> %ld lines in this file contain unpunctuated endquotes. "
	  "Not reporting them.\n",results->endquote_count);
    }
    /*
     * If more than 10 lines contain standalone digits,
     * don't bother reporting them.
     */
    warnings.digit=1;
    if (results->standalone_digit>10)
    {
	warnings.digit=0;
	g_print("   --> %ld lines in this file contain standalone 0s and 1s. "
	  "Not reporting them.\n",results->standalone_digit);
    }
    /*
     * If more than 20 lines contain hyphens at end,
     * don't bother reporting them.
     */
    warnings.hyphen=1;
    if (results->hyphens>20)
    {
	warnings.hyphen=0;
	g_print("   --> %ld lines in this file have hyphens at end. "
	  "Not reporting them.\n",results->hyphens);
    }
    /* A high tag score turns markup handling on even if not requested. */
    if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
    {
	g_print("   --> Looks like this is HTML. Switching HTML mode ON.\n");
	pswit[MARKUP_SWITCH]=1;
    }
    if (results->verylongline>0)
	g_print("   --> %ld lines in this file are VERY long!\n",
	  results->verylongline);
    /*
     * If there are more non-PG spaced dashes than PG em-dashes,
     * assume it's deliberate.
     * Current PG guidelines say don't use them, but older texts do,
     * and some people insist on them whatever the guidelines say.
     */
    warnings.dash=1;
    if (results->spacedash+results->emdash.non_PG_space>
      results->emdash.PG_space)
    {
	warnings.dash=0;
	g_print("   --> There are %ld spaced dashes and em-dashes. "
	  "Not reporting them.\n",
	  results->spacedash+results->emdash.non_PG_space);
    }
    /* If more than a quarter of characters are hi-bit, bug out. */
    warnings.bin=1;
    if (results->binlen*4>results->totlen)
    {
	g_print("   --> This file does not appear to be ASCII. "
	  "Terminating. Best of luck with it!\n");
	exit(1);
    }
    /* Likewise if fewer than a quarter of characters are letters. */
    if (results->alphalen*4<results->totlen)
    {
	g_print("   --> This file does not appear to be text. "
	  "Terminating. Best of luck with it!\n");
	exit(1);
    }
    /*
     * If more than one character in a hundred, or more than 100
     * characters in all, are hi-bit, don't report them individually.
     */
    if (results->binlen*100>results->totlen || results->binlen>100)
    {
	g_print("   --> There are a lot of foreign letters here. "
	  "Not reporting them.\n");
	warnings.bin=0;
    }
    /* More than 50 Dutch marker words: treat the text as Dutch. */
    warnings.isDutch=FALSE;
    if (results->Dutchcount>50)
    {
	warnings.isDutch=TRUE;
	g_print("   --> This looks like Dutch - "
	  "switching off dashes and warnings for 's Middags case.\n");
    }
    /* More than 50 French marker words: treat the text as French. */
    warnings.isFrench=FALSE;
    if (results->Frenchcount>50)
    {
	warnings.isFrench=TRUE;
	g_print("   --> This looks like French - "
	  "switching off some doublepunct.\n");
    }
    if (results->firstline && results->footerline)
	g_print("    The PG header and footer appear to be already on.\n");
    else
    {
	if (results->firstline)
	    g_print("    The PG header is on - no footer.\n");
	if (results->footerline)
	    g_print("    The PG footer is on - no header.\n");
    }
    g_print("\n");
    /* Verbose mode re-enables every class of query switched off above. */
    if (pswit[VERBOSE_SWITCH])
    {
	warnings.bin=1;
	warnings.shortline=1;
	warnings.dotcomma=1;
	warnings.longline=1;
	warnings.dash=1;
	warnings.digit=1;
	warnings.ast=1;
	warnings.fslash=1;
	warnings.hyphen=1;
	warnings.endquote=1;
	g_print("   *** Verbose output is ON -- you asked for it! ***\n");
    }
    /* Dutch text keeps dash queries off even in verbose mode. */
    if (warnings.isDutch)
	warnings.dash=0;
    /*
     * Fewer than 100 lines between header and footer: don't trust the
     * markers, and check from the top of the file instead.
     */
    if (results->footerline>0 && results->firstline>0 &&
      results->footerline>results->firstline &&
      results->footerline-results->firstline<100)
    {
	g_print("   --> I don't really know where this text starts. \n");
	g_print("       There are no reference points.\n");
	g_print("       I'm going to have to report the header and footer "
	  "as well.\n");
	results->firstline=0;
    }
    return &warnings;
}
   815 
   816 /*
   817  * analyse_quotes:
   818  *
   819  * Look along the line, accumulate the count of quotes, and see
   820  * if this is an empty line - i.e. a line with nothing on it
   821  * but spaces.
   822  * If line has just spaces, period, * and/or - on it, don't
   823  * count it, since empty lines with asterisks or dashes to
   824  * separate sections are common.
   825  *
   826  * Returns: TRUE if the line is empty.
   827  */
   828 gboolean analyse_quotes(const char *aline,struct counters *counters)
   829 {
   830     int guessquote=0;
   831     /* assume the line is empty until proven otherwise */
   832     gboolean isemptyline=TRUE;
   833     const char *s=aline,*sprev,*snext;
   834     gunichar c;
   835     sprev=NULL;
   836     GError *tmp_err=NULL;
   837     while (*s)
   838     {
   839 	snext=g_utf8_next_char(s);
   840 	c=g_utf8_get_char(s);
   841 	if (CHAR_IS_DQUOTE(c))
   842 	    (void)count_quote(counters,c,QUOTE_CLASS(c),&tmp_err);
   843 	else if (CHAR_IS_SQUOTE(c) && pswit[SQUOTE_SWITCH])
   844 	{
   845 	    if (s==aline)
   846 	    {
   847 		/*
   848 		 * At start of line, it can only be a quotation mark.
   849 		 * Hardcode a very common exception!
   850 		 */
   851 		if (!g_str_has_prefix(snext,"tis") &&
   852 		  !g_str_has_prefix(snext,"Tis"))
   853 		    (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
   854 	    }
   855 	    else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
   856 	      g_unichar_isalpha(g_utf8_get_char(snext)))
   857 		/* Do nothing! it's definitely an apostrophe, not a quote */
   858 		;
   859 	    /* it's outside a word - let's check it out */
   860 	    else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||
   861 	      g_unichar_isalpha(g_utf8_get_char(snext)))
   862 	    {
   863 		/* certainly looks like a quotation mark */
   864 		if (!g_str_has_prefix(snext,"tis") &&
   865 		  !g_str_has_prefix(snext,"Tis"))
   866 		    /* hardcode a very common exception! */
   867 		{
   868 		    if (strchr(".?!,;:",g_utf8_get_char(sprev)))
   869 			(void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
   870 		    else
   871 			(void)count_quote(counters,c,OPENING_QUOTE,&tmp_err);
   872 		}
   873 	    }
   874 	    else
   875 	    {
   876 		/* now - is it a quotation mark? */
   877 		guessquote=0;   /* accumulate clues */
   878 		if (g_unichar_isalpha(g_utf8_get_char(sprev)))
   879 		{
   880 		    /* it follows a letter - could be either */
   881 		    guessquote++;
   882 		    if (g_utf8_get_char(sprev)=='s')
   883 		    {
   884 			/* looks like a plural apostrophe */
   885 			guessquote-=3;
   886 			if (g_utf8_get_char(snext)==CHAR_SPACE)
   887 			    /* bonus marks! */
   888 			    guessquote-=2;
   889 		    }
   890 		    if (innermost_quote_matches(counters,c))
   891 			/*
   892 			 * Give it the benefit of some doubt,
   893 			 * if a squote is already open.
   894 			 */
   895 			guessquote++;
   896 		    else
   897 			guessquote--;
   898 		    if (guessquote>=0)
   899 			(void)count_quote(counters,c,CLOSING_QUOTE,&tmp_err);
   900 		}
   901 		else
   902 		    /* no adjacent letter - it must be a quote of some kind */
   903 		    (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
   904 	    }
   905 	}
   906 	if (tmp_err)
   907 	{
   908 	    if (pswit[ECHO_SWITCH])
   909 		g_print("\n%s\n",aline);
   910 	    if (!pswit[OVERVIEW_SWITCH])
   911 		g_print("    Line %ld column %ld - %s\n",
   912 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1,tmp_err->message);
   913 	    g_clear_error(&tmp_err);
   914 	}
   915 	if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
   916 	  c!='\r' && c!='\n')
   917 	    isemptyline=FALSE;  /* ignore lines like  *  *  *  as spacers */
   918 	if (c==CHAR_UNDERSCORE)
   919 	    counters->c_unders++;
   920 	if (c==CHAR_OPEN_SBRACK)
   921 	{
   922 	    if (!matching_difference(counters,COUNTER_ILLUSTRATION) &&
   923 	      !matching_difference(counters,c) && s==aline &&
   924 	      g_str_has_prefix(s,"[Illustration:"))
   925 		increment_matching(counters,COUNTER_ILLUSTRATION,TRUE);
   926 	    else
   927 		increment_matching(counters,c,TRUE);
   928 	}
   929 	else if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK)
   930 	    increment_matching(counters,c,TRUE);
   931 	if (c==CHAR_CLOSE_SBRACK)
   932 	{
   933 	    if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) &&
   934 	      !matching_difference(counters,c) && !*snext)
   935 		increment_matching(counters,COUNTER_ILLUSTRATION,FALSE);
   936 	    else
   937 		increment_matching(counters,c,FALSE);
   938 	}
   939 	else if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK)
   940 	    increment_matching(counters,c,FALSE);
   941 	sprev=s;
   942 	s=snext;
   943     }
   944     return isemptyline;
   945 }
   946 
   947 /*
   948  * check_for_control_characters:
   949  *
   950  * Check for invalid or questionable characters in the line
   951  * Anything above 127 is invalid for plain ASCII, and
   952  * non-printable control characters should also be flagged.
   953  * Tabs should generally not be there.
   954  */
   955 void check_for_control_characters(const char *aline)
   956 {
   957     gunichar c;
   958     const char *s;
   959     for (s=aline;*s;s=g_utf8_next_char(s))
   960     {
   961 	c=g_utf8_get_char(s);
   962 	if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
   963 	{
   964 	    if (pswit[ECHO_SWITCH])
   965 		g_print("\n%s\n",aline);
   966 	    if (!pswit[OVERVIEW_SWITCH])
   967 		g_print("    Line %ld column %ld - Control character %u\n",
   968 		  linecnt,g_utf8_pointer_to_offset(s,aline)+1,c);
   969 	    else
   970 		cnt_bin++;
   971 	}
   972     }
   973 }
   974 
   975 /*
   976  * check_for_odd_characters:
   977  *
   978  * Check for binary and other odd characters.
   979  */
   980 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
   981   gboolean isemptyline)
   982 {
   983     /* Don't repeat multiple warnings on one line. */
   984     gboolean eNon_A=FALSE,eTab=FALSE,eTilde=FALSE;
   985     gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
   986     const char *s;
   987     gunichar c;
   988     for (s=aline;*s;s=g_utf8_next_char(s))
   989     {
   990 	c=g_utf8_get_char(s);
   991 	if (!eNon_A && (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
   992 	{
   993 	    if (pswit[ECHO_SWITCH])
   994 		g_print("\n%s\n",aline);
   995 	    if (!pswit[OVERVIEW_SWITCH])
   996 		if (c>127 && c<160 || c>255)
   997 		    g_print("    Line %ld column %ld - "
   998 		      "Non-ISO-8859 character %u\n",
   999 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1000 		else
  1001 		    g_print("    Line %ld column %ld - "
  1002 		      "Non-ASCII character %u\n",
  1003 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1004 	    else
  1005 		cnt_bin++;
  1006 	    eNon_A=TRUE;
  1007 	}
  1008 	if (!eTab && c==CHAR_TAB)
  1009 	{
  1010 	    if (pswit[ECHO_SWITCH])
  1011 		g_print("\n%s\n",aline);
  1012 	    if (!pswit[OVERVIEW_SWITCH])
  1013 		g_print("    Line %ld column %ld - Tab character?\n",
  1014 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1015 	    else
  1016 		cnt_odd++;
  1017 	    eTab=TRUE;
  1018 	}
  1019 	if (!eTilde && c==CHAR_TILDE)
  1020 	{
  1021 	    /*
  1022 	     * Often used by OCR software to indicate an
  1023 	     * unrecognizable character.
  1024 	     */
  1025 	    if (pswit[ECHO_SWITCH])
  1026 		g_print("\n%s\n",aline);
  1027 	    if (!pswit[OVERVIEW_SWITCH])
  1028 		g_print("    Line %ld column %ld - Tilde character?\n",
  1029 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1030 	    else
  1031 		cnt_odd++;
  1032 	    eTilde=TRUE;
  1033 	}
  1034 	if (!eCarat && c==CHAR_CARAT)
  1035 	{  
  1036 	    if (pswit[ECHO_SWITCH])
  1037 		g_print("\n%s\n",aline);
  1038 	    if (!pswit[OVERVIEW_SWITCH])
  1039 		g_print("    Line %ld column %ld - Carat character?\n",
  1040 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1041 	    else
  1042 		cnt_odd++;
  1043 	    eCarat=TRUE;
  1044 	}
  1045 	if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
  1046 	{  
  1047 	    if (pswit[ECHO_SWITCH])
  1048 		g_print("\n%s\n",aline);
  1049 	    if (!pswit[OVERVIEW_SWITCH])
  1050 		g_print("    Line %ld column %ld - Forward slash?\n",
  1051 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1052 	    else
  1053 		cnt_odd++;
  1054 	    eFSlash=TRUE;
  1055 	}
  1056 	/*
  1057 	 * Report asterisks only in paranoid mode,
  1058 	 * since they're often deliberate.
  1059 	 */
  1060 	if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
  1061 	  c==CHAR_ASTERISK)
  1062 	{
  1063 	    if (pswit[ECHO_SWITCH])
  1064 		g_print("\n%s\n",aline);
  1065 	    if (!pswit[OVERVIEW_SWITCH])
  1066 		g_print("    Line %ld column %ld - Asterisk?\n",
  1067 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1068 	    else
  1069 		cnt_odd++;
  1070 	    eAst=TRUE;
  1071 	}
  1072     }
  1073 }
  1074 
  1075 /*
  1076  * check_for_long_line:
  1077  *
  1078  * Check for line too long.
  1079  */
  1080 void check_for_long_line(const char *aline)
  1081 {
  1082     if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
  1083     {
  1084 	if (pswit[ECHO_SWITCH])
  1085 	    g_print("\n%s\n",aline);
  1086 	if (!pswit[OVERVIEW_SWITCH])
  1087 	    g_print("    Line %ld column %ld - Long line %ld\n",
  1088 	      linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
  1089 	else
  1090 	    cnt_long++;
  1091     }
  1092 }
  1093 
  1094 /*
  1095  * check_for_short_line:
  1096  *
  1097  * Check for line too short.
  1098  *
  1099  * This one is a bit trickier to implement: we don't want to
  1100  * flag the last line of a paragraph for being short, so we
  1101  * have to wait until we know that our current line is a
  1102  * "normal" line, then report the _previous_ line if it was too
  1103  * short. We also don't want to report indented lines like
  1104  * chapter heads or formatted quotations. We therefore keep
  1105  * last->len as the length of the last line examined, and
  1106  * last->blen as the length of the last but one, and try to
  1107  * suppress unnecessary warnings by checking that both were of
  1108  * "normal" length. We keep the first character of the last
  1109  * line in last->start, and if it was a space, we assume that
  1110  * the formatting is deliberate. I can't figure out a way to
  1111  * distinguish something like a quoted verse left-aligned or
  1112  * the header or footer of a letter from a paragraph of short
  1113  * lines - maybe if I examined the whole paragraph, and if the
  1114  * para has less than, say, 8 lines and if all lines are short,
  1115  * then just assume it's OK? Need to look at some texts to see
  1116  * how often a formula like this would get the right result.
  1117  */
  1118 void check_for_short_line(const char *aline,const struct line_properties *last)
  1119 {
  1120     if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
  1121       last->len<SHORTEST_PG_LINE && last->blen>1 &&
  1122       last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
  1123     {
  1124 	if (pswit[ECHO_SWITCH])
  1125 	    g_print("\n%s\n",prevline);
  1126 	if (!pswit[OVERVIEW_SWITCH])
  1127 	    g_print("    Line %ld column %ld - Short line %ld?\n",
  1128 	      linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
  1129 	else
  1130 	    cnt_short++;
  1131     }
  1132 }
  1133 
  1134 /*
  1135  * check_for_starting_punctuation:
  1136  *
  1137  * Look for punctuation other than full ellipses at start of line.
  1138  */
  1139 void check_for_starting_punctuation(const char *aline)
  1140 {
  1141     if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
  1142       !g_str_has_prefix(aline,". . ."))
  1143     {
  1144 	if (pswit[ECHO_SWITCH])
  1145 	    g_print("\n%s\n",aline);
  1146 	if (!pswit[OVERVIEW_SWITCH])
  1147 	    g_print("    Line %ld column 1 - Begins with punctuation?\n",
  1148 	      linecnt);
  1149 	else
  1150 	    cnt_punct++;
  1151     }
  1152 }
  1153 
  1154 /*
  1155  * str_emdash:
  1156  *
  1157  * Find the first em-dash, return a pointer to it and set <next> to the
  1158  * character following the dash.
  1159  */
  1160 char *str_emdash(const char *s,const char **next)
  1161 {
  1162     const char *s1,*s2;
  1163     s1=strstr(s,"--");
  1164     s2=strstr(s,"—");
  1165     if (!s1)
  1166     {
  1167 	if (s2)
  1168 	    *next=g_utf8_next_char(s2);
  1169 	return (char *)s2;
  1170     }
  1171     else if (!s2)
  1172     {
  1173 	*next=g_utf8_next_char(g_utf8_next_char(s1));
  1174 	return (char *)s1;
  1175     }
  1176     else if (s1<s2)
  1177     {
  1178 	*next=g_utf8_next_char(g_utf8_next_char(s1));
  1179 	return (char *)s1;
  1180     }
  1181     else
  1182     {
  1183 	*next=g_utf8_next_char(s2);
  1184 	return (char *)s2;
  1185     }
  1186 }
  1187 
  1188 /*
  1189  * check_for_spaced_emdash:
  1190  *
  1191  * Check for spaced em-dashes.
  1192  *
  1193  * We must check _all_ occurrences of em-dashes on the line
  1194  * hence the loop - even if the first dash is OK
  1195  * there may be another that's wrong later on.
  1196  */
  1197 void check_for_spaced_emdash(const char *aline)
  1198 {
  1199     const char *s,*t,*next;
  1200     for (s=aline;t=str_emdash(s,&next);s=next)
  1201     {
  1202 	if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
  1203 	  g_utf8_get_char(next)==CHAR_SPACE)
  1204 	{
  1205 	    if (pswit[ECHO_SWITCH])
  1206 		g_print("\n%s\n",aline);
  1207 	    if (!pswit[OVERVIEW_SWITCH])
  1208 		g_print("    Line %ld column %ld - Spaced em-dash?\n",
  1209 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  1210 	    else
  1211 		cnt_dash++;
  1212 	}
  1213     }
  1214 }
  1215 
  1216 /*
  1217  * check_for_spaced_dash:
  1218  *
  1219  * Check for spaced dashes.
  1220  */
  1221 void check_for_spaced_dash(const char *aline)
  1222 {
  1223     const char *s;
  1224     if ((s=strstr(aline," -")))
  1225     {
  1226 	if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
  1227 	{
  1228 	    if (pswit[ECHO_SWITCH])
  1229 		g_print("\n%s\n",aline);
  1230 	    if (!pswit[OVERVIEW_SWITCH])
  1231 		g_print("    Line %ld column %ld - Spaced dash?\n",
  1232 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1233 	    else
  1234 		cnt_dash++;
  1235 	}
  1236     }
  1237     else if ((s=strstr(aline,"- ")))
  1238     {
  1239 	if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
  1240 	{
  1241 	    if (pswit[ECHO_SWITCH])
  1242 		g_print("\n%s\n",aline);
  1243 	    if (!pswit[OVERVIEW_SWITCH])
  1244 		g_print("    Line %ld column %ld - Spaced dash?\n",
  1245 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1246 	    else
  1247 		cnt_dash++;
  1248 	}
  1249     }
  1250 }
  1251 
  1252 /*
  1253  * check_for_unmarked_paragraphs:
  1254  *
  1255  * Check for unmarked paragraphs indicated by separate speakers.
  1256  *
  1257  * May well be false positive:
  1258  * "Bravo!" "Wonderful!" called the crowd.
  1259  * but useful all the same.
  1260  */
  1261 void check_for_unmarked_paragraphs(const char *aline)
  1262 {
  1263     const char *s;
  1264     s=strstr(aline,"\"  \"");
  1265     if (!s)
  1266 	s=strstr(aline,"\" \"");
  1267     if (s)
  1268     {
  1269 	if (pswit[ECHO_SWITCH])
  1270 	    g_print("\n%s\n",aline);
  1271 	if (!pswit[OVERVIEW_SWITCH])
  1272 	    g_print("    Line %ld column %ld - "
  1273 	      "Query missing paragraph break?\n",
  1274 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1275 	else
  1276 	    cnt_punct++;
  1277     }
  1278 }
  1279 
  1280 /*
  1281  * check_for_jeebies:
  1282  *
  1283  * Check for "to he" and other easy h/b errors.
  1284  *
  1285  * This is a very inadequate effort on the h/b problem,
  1286  * but the phrase "to he" is always an error, whereas "to
  1287  * be" is quite common.
  1288  * Similarly, '"Quiet!", be said.' is a non-be error
  1289  * "to he" is _not_ always an error!:
  1290  *       "Where they went to he couldn't say."
  1291  * Another false positive:
  1292  *       What would "Cinderella" be without the . . .
  1293  * and another: "If he wants to he can see for himself."
  1294  */
  1295 void check_for_jeebies(const char *aline)
  1296 {
  1297     const char *s;
  1298     s=strstr(aline," be could ");
  1299     if (!s)
  1300 	s=strstr(aline," be would ");
  1301     if (!s)
  1302 	s=strstr(aline," was be ");
  1303     if (!s)
  1304 	s=strstr(aline," be is ");
  1305     if (!s)
  1306 	s=strstr(aline," is be ");
  1307     if (!s)
  1308 	s=strstr(aline,"\", be ");
  1309     if (!s)
  1310 	s=strstr(aline,"\" be ");
  1311     if (!s)
  1312 	s=strstr(aline,"\" be ");
  1313     if (!s)
  1314 	s=strstr(aline," to he ");
  1315     if (s)
  1316     {
  1317 	if (pswit[ECHO_SWITCH])
  1318 	    g_print("\n%s\n",aline);
  1319 	if (!pswit[OVERVIEW_SWITCH])
  1320 	    g_print("    Line %ld column %ld - Query he/be error?\n",
  1321 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1322 	else
  1323 	    cnt_word++;
  1324     }
  1325     s=strstr(aline," the had ");
  1326     if (!s)
  1327 	s=strstr(aline," a had ");
  1328     if (!s)
  1329 	s=strstr(aline," they bad ");
  1330     if (!s)
  1331 	s=strstr(aline," she bad ");
  1332     if (!s)
  1333 	s=strstr(aline," he bad ");
  1334     if (!s)
  1335 	s=strstr(aline," you bad ");
  1336     if (!s)
  1337 	s=strstr(aline," i bad ");
  1338     if (s)
  1339     {
  1340 	if (pswit[ECHO_SWITCH])
  1341 	    g_print("\n%s\n",aline);
  1342 	if (!pswit[OVERVIEW_SWITCH])
  1343 	    g_print("    Line %ld column %ld - Query had/bad error?\n",
  1344 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1345 	else
  1346 	    cnt_word++;
  1347     }
  1348     s=strstr(aline,"; hut ");
  1349     if (!s)
  1350 	s=strstr(aline,", hut ");
  1351     if (s)
  1352     {
  1353 	if (pswit[ECHO_SWITCH])
  1354 	    g_print("\n%s\n",aline);
  1355 	if (!pswit[OVERVIEW_SWITCH])
  1356 	    g_print("    Line %ld column %ld - Query hut/but error?\n",
  1357 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1358 	else
  1359 	    cnt_word++;
  1360     }
  1361 }
  1362 
  1363 /*
  1364  * check_for_mta_from:
  1365  *
  1366  * Special case - angled bracket in front of "From" placed there by an
  1367  * MTA when sending an e-mail.
  1368  */
  1369 void check_for_mta_from(const char *aline)
  1370 {
  1371     const char *s;
  1372     s=strstr(aline,">From");
  1373     if (s)
  1374     {
  1375 	if (pswit[ECHO_SWITCH])
  1376 	    g_print("\n%s\n",aline);
  1377 	if (!pswit[OVERVIEW_SWITCH])
  1378 	    g_print("    Line %ld column %ld - "
  1379 	      "Query angled bracket with From\n",
  1380 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1381 	else
  1382 	    cnt_punct++;
  1383     }
  1384 }
  1385 
  1386 /*
  1387  * check_for_orphan_character:
  1388  *
  1389  * Check for a single character line -
  1390  * often an overflow from bad wrapping.
  1391  */
  1392 void check_for_orphan_character(const char *aline)
  1393 {
  1394     gunichar c;
  1395     c=g_utf8_get_char(aline);
  1396     if (c && !*g_utf8_next_char(aline))
  1397     {
  1398 	if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
  1399 	    ; /* Nothing - ignore numerals alone on a line. */
  1400 	else
  1401 	{
  1402 	    if (pswit[ECHO_SWITCH])
  1403 		g_print("\n%s\n",aline);
  1404 	    if (!pswit[OVERVIEW_SWITCH])
  1405 		g_print("    Line %ld column 1 - Query single character line\n",
  1406 		  linecnt);
  1407 	    else
  1408 		cnt_punct++;
  1409 	}
  1410     }
  1411 }
  1412 
  1413 /*
  1414  * check_for_pling_scanno:
  1415  *
  1416  * Check for I" - often should be !
  1417  */
  1418 void check_for_pling_scanno(const char *aline)
  1419 {
  1420     const char *s;
  1421     s=strstr(aline," I\"");
  1422     if (s)
  1423     {
  1424 	if (pswit[ECHO_SWITCH])
  1425 	    g_print("\n%s\n",aline);
  1426 	if (!pswit[OVERVIEW_SWITCH])
  1427 	    g_print("    Line %ld column %ld - Query I=exclamation mark?\n",
  1428 	      linecnt,g_utf8_pointer_to_offset(aline,s));
  1429 	else
  1430 	    cnt_punct++;
  1431     }
  1432 }
  1433 
  1434 /*
  1435  * check_for_extra_period:
  1436  *
  1437  * Check for period without a capital letter. Cut-down from gutspell.
  1438  * Only works when it happens on a single line.
  1439  */
  1440 void check_for_extra_period(const char *aline,const struct warnings *warnings)
  1441 {
  1442     const char *s,*t,*s1,*sprev;
  1443     int i;
  1444     gsize len;
  1445     gboolean istypo;
  1446     gchar *testword;
  1447     gunichar c,nc,pc,*decomposition;
  1448     if (pswit[PARANOID_SWITCH])
  1449     {
  1450 	for (t=aline;t=strstr(t,". ");)
  1451 	{
  1452 	    if (t==aline)
  1453 	    {
  1454 		t=g_utf8_next_char(t);
  1455 		/* start of line punctuation is handled elsewhere */
  1456 		continue;
  1457 	    }
  1458 	    if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
  1459 	    {
  1460 		t=g_utf8_next_char(t);
  1461 		continue;
  1462 	    }
  1463 	    if (warnings->isDutch)
  1464 	    {
  1465 		/* For Frank & Jeroen -- 's Middags case */
  1466 		gunichar c2,c3,c4,c5;
  1467 		c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
  1468 		c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
  1469 		c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
  1470 		c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
  1471 		if (CHAR_IS_APOSTROPHE(c2) &&
  1472 		  g_unichar_islower(c3) && c4==CHAR_SPACE &&
  1473 		  g_unichar_isupper(c5))
  1474 		{
  1475 		    t=g_utf8_next_char(t);
  1476 		    continue;
  1477 		}
  1478 	    }
  1479 	    s1=g_utf8_next_char(g_utf8_next_char(t));
  1480 	    while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
  1481 	      !g_unichar_isdigit(g_utf8_get_char(s1)))
  1482 		s1=g_utf8_next_char(s1);
  1483 	    if (g_unichar_islower(g_utf8_get_char(s1)))
  1484 	    {
  1485 		/* we have something to investigate */
  1486 		istypo=TRUE;
  1487 		/* so let's go back and find out */
  1488 		nc=g_utf8_get_char(t);
  1489 		s1=g_utf8_prev_char(t);
  1490 		c=g_utf8_get_char(s1);
  1491 		sprev=g_utf8_prev_char(s1);
  1492 		pc=g_utf8_get_char(sprev);
  1493 		while (s1>=aline &&
  1494 		  (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||
  1495 		  g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&
  1496 		  g_unichar_isalpha(nc)))
  1497 		{
  1498 		    nc=c;
  1499 		    s1=sprev;
  1500 		    c=pc;
  1501 		    sprev=g_utf8_prev_char(s1);
  1502 		    pc=g_utf8_get_char(sprev);
  1503 		}
  1504 		s1=g_utf8_next_char(s1);
  1505 		s=strchr(s1,'.');
  1506 		if (s)
  1507 		    testword=g_strndup(s1,s-s1);
  1508 		else
  1509 		    testword=g_strdup(s1);
  1510 		for (i=0;*abbrev[i];i++)
  1511 		    if (!strcmp(testword,abbrev[i]))
  1512 			istypo=FALSE;
  1513 		if (g_unichar_isdigit(g_utf8_get_char(testword)))
  1514 		    istypo=FALSE;
  1515 		if (!*g_utf8_next_char(testword))
  1516 		    istypo=FALSE;
  1517 		if (isroman(testword))
  1518 		    istypo=FALSE;
  1519 		if (istypo)
  1520 		{
  1521 		    istypo=FALSE;
  1522 		    for (s=testword;*s;s=g_utf8_next_char(s))
  1523 		    {
  1524 			decomposition=g_unicode_canonical_decomposition(
  1525 			  g_utf8_get_char(s),&len);
  1526 			if (g_utf8_strchr("aeiou",-1,decomposition[0]))
  1527 			    istypo=TRUE;
  1528 			g_free(decomposition);
  1529 		    }
  1530 		}
  1531 		if (istypo &&
  1532 		  (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
  1533 		{
  1534 		    g_tree_insert(qperiod,g_strdup(testword),
  1535 		      GINT_TO_POINTER(1));
  1536 		    if (pswit[ECHO_SWITCH])
  1537 			g_print("\n%s\n",aline);
  1538 		    if (!pswit[OVERVIEW_SWITCH])
  1539 			g_print("    Line %ld column %ld - Extra period?\n",
  1540 			  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  1541 		    else
  1542 			cnt_punct++;
  1543 		}
  1544 		g_free(testword);
  1545 	    }
  1546 	    t=g_utf8_next_char(t);
  1547 	}
  1548     }
  1549 }
  1550 
  1551 /*
  1552  * check_for_following_punctuation:
  1553  *
  1554  * Check for words usually not followed by punctuation.
  1555  */
  1556 void check_for_following_punctuation(const char *aline)
  1557 {
  1558     int i;
  1559     const char *s,*wordstart;
  1560     gunichar c;
  1561     gchar *inword,*t;
  1562     if (pswit[TYPO_SWITCH])
  1563     {
  1564 	for (s=aline;*s;)
  1565 	{
  1566 	    wordstart=s;
  1567 	    t=getaword(&s);
  1568 	    if (!*t)
  1569 	    {
  1570 		g_free(t);
  1571 		continue;
  1572 	    }
  1573 	    inword=g_utf8_strdown(t,-1);
  1574 	    g_free(t);
  1575 	    for (i=0;*nocomma[i];i++)
  1576 		if (!strcmp(inword,nocomma[i]))
  1577 		{
  1578 		    c=g_utf8_get_char(s);
  1579 		    if (c==',' || c==';' || c==':')
  1580 		    {
  1581 			if (pswit[ECHO_SWITCH])
  1582 			    g_print("\n%s\n",aline);
  1583 			if (!pswit[OVERVIEW_SWITCH])
  1584 			    g_print("    Line %ld column %ld - "
  1585 			      "Query punctuation after %s?\n",
  1586 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,
  1587 			      inword);
  1588 			else
  1589 			    cnt_punct++;
  1590 		    }
  1591 		}
  1592 	    for (i=0;*noperiod[i];i++)
  1593 		if (!strcmp(inword,noperiod[i]))
  1594 		{
  1595 		    c=g_utf8_get_char(s);
  1596 		    if (c=='.' || c=='!')
  1597 		    {
  1598 			if (pswit[ECHO_SWITCH])
  1599 			    g_print("\n%s\n",aline);
  1600 			if (!pswit[OVERVIEW_SWITCH])
  1601 			    g_print("    Line %ld column %ld - "
  1602 			      "Query punctuation after %s?\n",
  1603 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,
  1604 			      inword);
  1605 			else
  1606 			    cnt_punct++;
  1607 		    }
  1608 		}
  1609 	    g_free(inword);
  1610 	}
  1611     }
  1612 }
  1613 
  1614 /*
  1615  * check_for_typos:
  1616  *
  1617  * Check for commonly mistyped words,
  1618  * and digits like 0 for O in a word.
  1619  */
  1620 void check_for_typos(const char *aline,struct warnings *warnings)
  1621 {
  1622     const char *s,*t,*nt,*wordstart;
  1623     gchar *inword;
  1624     gunichar *decomposition;
  1625     gchar *testword;
  1626     int i,vowel,consonant,*dupcnt;
  1627     gboolean isdup,istypo,alower;
  1628     gunichar c,pc;
  1629     long offset,len;
  1630     gsize decomposition_len;
  1631     for (s=aline;*s;)
  1632     {
  1633 	wordstart=s;
  1634 	inword=getaword(&s);
  1635 	if (!*inword)
  1636 	{
  1637 	    g_free(inword);
  1638 	    continue; /* don't bother with empty lines */
  1639 	}
  1640 	if (mixdigit(inword))
  1641 	{
  1642 	    if (pswit[ECHO_SWITCH])
  1643 		g_print("\n%s\n",aline);
  1644 	    if (!pswit[OVERVIEW_SWITCH])
  1645 		g_print("    Line %ld column %ld - Query digit in %s\n",
  1646 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
  1647 	    else
  1648 		cnt_word++;
  1649 	}
  1650 	/*
  1651 	 * Put the word through a series of tests for likely typos and OCR
  1652 	 * errors.
  1653 	 */
  1654 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
  1655 	{
  1656 	    istypo=FALSE;
  1657 	    alower=FALSE;
  1658 	    for (t=inword;*t;t=g_utf8_next_char(t))
  1659 	    {
  1660 		c=g_utf8_get_char(t);
  1661 		nt=g_utf8_next_char(t);
  1662 		/* lowercase for testing */
  1663 		if (g_unichar_islower(c))
  1664 		    alower=TRUE;
  1665 		if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
  1666 		{
  1667 		    /*
  1668 		     * We have an uppercase mid-word. However, there are
  1669 		     * common cases:
  1670 		     *   Mac and Mc like McGill
  1671 		     *   French contractions like l'Abbe
  1672 		     */
  1673 		    offset=g_utf8_pointer_to_offset(inword,t);
  1674 		    if (offset>0)
  1675 			pc=g_utf8_get_char(g_utf8_prev_char(t));
  1676 		    else
  1677 			pc='\0';
  1678 		    if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
  1679 		      offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
  1680 		      g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
  1681 		      CHAR_IS_APOSTROPHE(pc))
  1682 			; /* do nothing! */
  1683 		    else
  1684 			istypo=TRUE;
  1685 		}
  1686 	    }
  1687 	    testword=g_utf8_casefold(inword,-1);
  1688 	}
  1689 	if (pswit[TYPO_SWITCH])
  1690 	{
  1691 	    /*
  1692 	     * Check for certain unlikely two-letter combinations at word
  1693 	     * start and end.
  1694 	     */
  1695 	    len=g_utf8_strlen(testword,-1);
  1696 	    if (len>1)
  1697 	    {
  1698 		for (i=0;*nostart[i];i++)
  1699 		    if (g_str_has_prefix(testword,nostart[i]))
  1700 			istypo=TRUE;
  1701 		for (i=0;*noend[i];i++)
  1702 		    if (g_str_has_suffix(testword,noend[i]))
  1703 			istypo=TRUE;
  1704 	    }
  1705 	    /* ght is common, gbt never. Like that. */
  1706 	    if (strstr(testword,"cb"))
  1707 		istypo=TRUE;
  1708 	    if (strstr(testword,"gbt"))
  1709 		istypo=TRUE;
  1710 	    if (strstr(testword,"pbt"))
  1711 		istypo=TRUE;
  1712 	    if (strstr(testword,"tbs"))
  1713 		istypo=TRUE;
  1714 	    if (strstr(testword,"mrn"))
  1715 		istypo=TRUE;
  1716 	    if (strstr(testword,"ahle"))
  1717 		istypo=TRUE;
  1718 	    if (strstr(testword,"ihle"))
  1719 		istypo=TRUE;
  1720 	    /*
  1721 	     * "TBE" does happen - like HEARTBEAT - but uncommon.
  1722 	     * Also "TBI" - frostbite, outbid - but uncommon.
  1723 	     * Similarly "ii" like Hawaii, or Pompeii, and in Roman
  1724 	     * numerals, but "ii" is a common scanno.
  1725 	     */
  1726 	    if (strstr(testword,"tbi"))
  1727 		istypo=TRUE;
  1728 	    if (strstr(testword,"tbe"))
  1729 		istypo=TRUE;
  1730 	    if (strstr(testword,"ii"))
  1731 		istypo=TRUE;
  1732 	    /*
  1733 	     * Check for no vowels or no consonants.
  1734 	     * If none, flag a typo.
  1735 	     */
  1736 	    if (!istypo && len>1)
  1737 	    {
  1738 		vowel=consonant=0;
  1739 		for (t=testword;*t;t=g_utf8_next_char(t))
  1740 		{
  1741 		    c=g_utf8_get_char(t);
  1742 		    decomposition=
  1743 		      g_unicode_canonical_decomposition(c,&decomposition_len);
  1744 		    if (c=='y' || g_unichar_isdigit(c))
  1745 		    {
  1746 			/* Yah, this is loose. */
  1747 			vowel++;
  1748 			consonant++;
  1749 		    }
  1750 		    else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
  1751 			vowel++;
  1752 		    else
  1753 			consonant++;
  1754 		    g_free(decomposition);
  1755 		}
  1756 		if (!vowel || !consonant)
  1757 		    istypo=TRUE;
  1758 	    }
  1759 	    /*
  1760 	     * Now exclude the word from being reported if it's in
  1761 	     * the okword list.
  1762 	     */
  1763 	    for (i=0;*okword[i];i++)
  1764 		if (!strcmp(testword,okword[i]))
  1765 		    istypo=FALSE;
  1766 	    /*
  1767 	     * What looks like a typo may be a Roman numeral.
  1768 	     * Exclude these.
  1769 	     */
  1770 	    if (istypo && isroman(testword))
  1771 		istypo=FALSE;
  1772 	    /* Check the manual list of typos. */
  1773 	    if (!istypo)
  1774 		for (i=0;*typo[i];i++)
  1775 		    if (!strcmp(testword,typo[i]))
  1776 			istypo=TRUE;
  1777 	    /*
  1778 	     * Check lowercase s, l, i and m - special cases.
  1779 	     *   "j" - often a semi-colon gone wrong.
  1780 	     *   "d" for a missing apostrophe - he d
  1781 	     *   "n" for "in"
  1782 	     */
  1783 	    if (!istypo && len==1 &&
  1784 	      g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
  1785 		istypo=TRUE;
  1786 	    if (istypo)
  1787 	    {
  1788 		dupcnt=g_tree_lookup(qword,testword);
  1789 		if (dupcnt)
  1790 		{
  1791 		    (*dupcnt)++;
  1792 		    isdup=!pswit[VERBOSE_SWITCH];
  1793 		}
  1794 		else
  1795 		{
  1796 		    dupcnt=g_new0(int,1);
  1797 		    g_tree_insert(qword,g_strdup(testword),dupcnt);
  1798 		    isdup=FALSE;
  1799 		}
  1800 		if (!isdup)
  1801 		{
  1802 		    if (pswit[ECHO_SWITCH])
  1803 			g_print("\n%s\n",aline);
  1804 		    if (!pswit[OVERVIEW_SWITCH])
  1805 		    {
  1806 			g_print("    Line %ld column %ld - Query word %s",
  1807 			  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
  1808 			  inword);
  1809 			if (!pswit[VERBOSE_SWITCH])
  1810 			    g_print(" - not reporting duplicates");
  1811 			g_print("\n");
  1812 		    }
  1813 		    else
  1814 			cnt_word++;
  1815 		}
  1816 	    }
  1817 	}
  1818 	/* check the user's list of typos */
  1819 	if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
  1820 	{
  1821 	    if (pswit[ECHO_SWITCH])
  1822 		g_print("\n%s\n",aline);
  1823 	    if (!pswit[OVERVIEW_SWITCH])  
  1824 		g_print("    Line %ld column %ld - Query possible scanno %s\n",
  1825 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
  1826 	}
  1827 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
  1828 	    g_free(testword);
  1829 	if (pswit[PARANOID_SWITCH] && warnings->digit)
  1830 	{
  1831 	    /* In paranoid mode, query all 0 and 1 standing alone. */
  1832 	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))
  1833 	    {
  1834 		if (pswit[ECHO_SWITCH])
  1835 		    g_print("\n%s\n",aline);
  1836 		if (!pswit[OVERVIEW_SWITCH])
  1837 		    g_print("    Line %ld column %ld - Query standalone %s\n",
  1838 		      linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
  1839 		      inword);
  1840 		else
  1841 		    cnt_word++;
  1842 	    }
  1843 	}
  1844 	g_free(inword);
  1845     }
  1846 }
  1847 
  1848 /*
  1849  * check_for_misspaced_punctuation:
  1850  *
  1851  * Look for added or missing spaces around punctuation and quotes.
  1852  * If there is a punctuation character like ! with no space on
  1853  * either side, suspect a missing!space. If there are spaces on
  1854  * both sides , assume a typo. If we see a double quote with no
  1855  * space or punctuation on either side of it, assume unspaced
  1856  * quotes "like"this.
  1857  */
  1858 void check_for_misspaced_punctuation(const char *aline,
  1859   struct parities *parities,gboolean isemptyline)
  1860 {
  1861     gboolean isacro,isellipsis;
  1862     const char *s;
  1863     gunichar c,nc,pc,n2c;
  1864     int parity;
  1865     c=g_utf8_get_char(aline);
  1866     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  1867     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  1868     {
  1869 	pc=c;
  1870 	c=nc;
  1871 	nc=g_utf8_get_char(g_utf8_next_char(s));
  1872 	/* For each character in the line after the first. */
  1873 	if (g_utf8_strchr(".?!,;:_",-1,c))  /* if it's punctuation */
  1874 	{
  1875 	    /* we need to suppress warnings for acronyms like M.D. */
  1876 	    isacro=FALSE;
  1877 	    /* we need to suppress warnings for ellipsis . . . */
  1878 	    isellipsis=FALSE;
  1879 	    /*
  1880 	     * If there are letters on both sides of it or
  1881 	     * if it's strict punctuation followed by an alpha.
  1882 	     */
  1883 	    if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
  1884 	      g_utf8_strchr("?!,;:",-1,c)))
  1885 	    {
  1886 		if (c=='.')
  1887 		{
  1888 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&
  1889 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
  1890 			isacro=TRUE;
  1891 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
  1892 		    if (nc && n2c=='.')
  1893 			isacro=TRUE;
  1894 		}
  1895 		if (!isacro)
  1896 		{
  1897 		    if (pswit[ECHO_SWITCH])
  1898 			g_print("\n%s\n",aline);
  1899 		    if (!pswit[OVERVIEW_SWITCH])
  1900 			g_print("    Line %ld column %ld - Missing space?\n",
  1901 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1902 		    else
  1903 			cnt_punct++;
  1904 		}
  1905 	    }
  1906 	    if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
  1907 	    {
  1908 		/*
  1909 		 * If there are spaces on both sides,
  1910 		 * or space before and end of line.
  1911 		 */
  1912 		if (c=='.')
  1913 		{
  1914 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&
  1915 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
  1916 			isellipsis=TRUE;
  1917 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
  1918 		    if (nc && n2c=='.')
  1919 			isellipsis=TRUE;
  1920 		}
  1921 		if (!isemptyline && !isellipsis)
  1922 		{
  1923 		    if (pswit[ECHO_SWITCH])
  1924 			g_print("\n%s\n",aline);
  1925 		    if (!pswit[OVERVIEW_SWITCH])
  1926 			g_print("    Line %ld column %ld - "
  1927 			  "Spaced punctuation?\n",linecnt,
  1928 			  g_utf8_pointer_to_offset(aline,s)+1);
  1929 		    else
  1930 			cnt_punct++;
  1931 		}
  1932 	    }
  1933 	}
  1934     }
  1935     /* Split out the characters that CANNOT be preceded by space. */
  1936     c=g_utf8_get_char(aline);
  1937     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  1938     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  1939     {
  1940 	pc=c;
  1941 	c=nc;
  1942 	nc=g_utf8_get_char(g_utf8_next_char(s));
  1943 	/* for each character in the line after the first */
  1944 	if (g_utf8_strchr("?!,;:",-1,c))
  1945 	{
  1946 	    /* if it's punctuation that _cannot_ have a space before it */
  1947 	    if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
  1948 	    {
  1949 		/*
  1950 		 * If nc DOES == space,
  1951 		 * it was already reported just above.
  1952 		 */
  1953 		if (pswit[ECHO_SWITCH])
  1954 		    g_print("\n%s\n",aline);
  1955 		if (!pswit[OVERVIEW_SWITCH])
  1956 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",
  1957 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1958 		else
  1959 		    cnt_punct++;
  1960 	    }
  1961 	}
  1962     }
  1963     /*
  1964      * Special case " .X" where X is any alpha.
  1965      * This plugs a hole in the acronym code above.
  1966      * Inelegant, but maintainable.
  1967      */
  1968     c=g_utf8_get_char(aline);
  1969     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  1970     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  1971     {
  1972 	pc=c;
  1973 	c=nc;
  1974 	nc=g_utf8_get_char(g_utf8_next_char(s));
  1975 	/* for each character in the line after the first */
  1976 	if (c=='.')
  1977 	{
  1978 	    /* if it's a period */
  1979 	    if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
  1980 	    {
  1981 		/*
  1982 		 * If the period follows a space and
  1983 		 * is followed by a letter.
  1984 		 */
  1985 		if (pswit[ECHO_SWITCH])
  1986 		    g_print("\n%s\n",aline);
  1987 		if (!pswit[OVERVIEW_SWITCH])
  1988 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",
  1989 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1990 		else
  1991 		    cnt_punct++;
  1992 	    }
  1993 	}
  1994     }
  1995     c=g_utf8_get_char(aline);
  1996     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  1997     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  1998     {
  1999 	pc=c;
  2000 	c=nc;
  2001 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2002 	/* for each character in the line after the first */
  2003 	if (CHAR_IS_DQUOTE(c))
  2004 	{
  2005 	    if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
  2006 	      !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
  2007 	      !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
  2008 	    {
  2009 		if (pswit[ECHO_SWITCH])
  2010 		    g_print("\n%s\n",aline);
  2011 		if (!pswit[OVERVIEW_SWITCH])
  2012 		    g_print("    Line %ld column %ld - Unspaced quotes?\n",
  2013 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2014 		else
  2015 		    cnt_punct++;
  2016 	    }
  2017 	}
  2018     }
  2019     /* Check parity of quotes. */
  2020     nc=g_utf8_get_char(aline);
  2021     for (s=aline;*s;s=g_utf8_next_char(s))
  2022     {
  2023 	c=nc;
  2024 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2025 	if (CHAR_IS_DQUOTE(c))
  2026 	{
  2027 	    if (c==CHAR_DQUOTE)
  2028 	    {
  2029 		parities->dquote=!parities->dquote;
  2030 		parity=parities->dquote;
  2031 	    }
  2032 	    else if (c==CHAR_LD_QUOTE)
  2033 		parity=1;
  2034 	    else
  2035 		parity=0;
  2036 	    if (!parity)
  2037 	    {
  2038 		/* parity even */
  2039 		if (!g_utf8_strchr("_-.'`‘’/,;:!?)]} ",-1,nc))
  2040 		{
  2041 		    if (pswit[ECHO_SWITCH])
  2042 			g_print("\n%s\n",aline);
  2043 		    if (!pswit[OVERVIEW_SWITCH])
  2044 			g_print("    Line %ld column %ld - "
  2045 			  "Wrongspaced quotes?\n",
  2046 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2047 		    else
  2048 			cnt_punct++;
  2049 		}
  2050 	    }
  2051 	    else
  2052 	    {
  2053 		/* parity odd */
  2054 		if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
  2055 		  !g_utf8_strchr("_-/.'`‘’([{$",-1,nc) || !nc)
  2056 		{
  2057 		    if (pswit[ECHO_SWITCH])
  2058 			g_print("\n%s\n",aline);
  2059 		    if (!pswit[OVERVIEW_SWITCH])
  2060 			g_print("    Line %ld column %ld - "
  2061 			  "Wrongspaced quotes?\n",
  2062 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2063 		    else
  2064 			cnt_punct++;
  2065 		}
  2066 	    }
  2067 	}
  2068     }
  2069     c=g_utf8_get_char(aline);
  2070     if (CHAR_IS_DQUOTE(c))
  2071     {
  2072 	if (g_utf8_strchr(",;:!?)]} ",-1,
  2073 	  g_utf8_get_char(g_utf8_next_char(aline))))
  2074 	{
  2075 	    if (pswit[ECHO_SWITCH])
  2076 		g_print("\n%s\n",aline);
  2077 	    if (!pswit[OVERVIEW_SWITCH])
  2078 		g_print("    Line %ld column 1 - Wrongspaced quotes?\n",
  2079 		  linecnt);
  2080 	    else
  2081 		cnt_punct++;
  2082 	}
  2083     }
  2084     if (pswit[SQUOTE_SWITCH])
  2085     {
  2086 	nc=g_utf8_get_char(aline);
  2087 	for (s=aline;*s;s=g_utf8_next_char(s))
  2088 	{
  2089 	    c=nc;
  2090 	    nc=g_utf8_get_char(g_utf8_next_char(s));
  2091 	    if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&
  2092 	      !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
  2093 	      !g_unichar_isalpha(nc)))
  2094 	    {
  2095 		parities->squote=!parities->squote;
  2096 		if (!parities->squote)
  2097 		{
  2098 		    /* parity even */
  2099 		    if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
  2100 		    {
  2101 			if (pswit[ECHO_SWITCH])
  2102 			    g_print("\n%s\n",aline);
  2103 			if (!pswit[OVERVIEW_SWITCH])
  2104 			    g_print("    Line %ld column %ld - "
  2105 			      "Wrongspaced singlequotes?\n",
  2106 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2107 			else
  2108 			    cnt_punct++;
  2109 		    }
  2110 		}
  2111 		else
  2112 		{
  2113 		    /* parity odd */
  2114 		    if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&
  2115 		      !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
  2116 		    {
  2117 			if (pswit[ECHO_SWITCH])
  2118 			    g_print("\n%s\n",aline);
  2119 			if (!pswit[OVERVIEW_SWITCH])
  2120 			    g_print("    Line %ld column %ld - "
  2121 			      "Wrongspaced singlequotes?\n",
  2122 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2123 			else
  2124 			    cnt_punct++;
  2125 		    }
  2126 		}
  2127 	    }
  2128 	}
  2129     }
  2130 }
  2131 
  2132 /*
  2133  * str_follows_word:
  2134  *
  2135  * Given a position p within a string str, determine whether it follows the
  2136  * given word. This is roughly equivalent to the regular expression (?<=\bword)
  2137  * but has different boundary conditions.
  2138  */
  2139 static gboolean str_follows_word(const char *str,const char *p,const char *word)
  2140 {
  2141     int len=strlen(word);
  2142     if (p-len<str)
  2143 	return FALSE;
  2144     else if (!g_str_has_prefix(p-len,word))
  2145 	return FALSE;
  2146     else if (p-len==str)
  2147 	return TRUE;
  2148     else
  2149 	/* Using non-alpha as a word boundary. See UAX #29 for a better way. */
  2150 	return !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(p-len)));
  2151 }
  2152 
  2153 /*
  2154  * check_for_double_punctuation:
  2155  *
  2156  * Look for double punctuation like ,. or ,,
  2157  * Thanks to DW for the suggestion!
  2158  * In books with references, ".," and ".;" are common
  2159  * e.g. "etc., etc.," and vol. 1.; vol 3.;
  2160  * OTOH, from my initial tests, there are also fairly
  2161  * common errors. What to do? Make these cases paranoid?
  2162  * ".," is the most common, so warnings->dotcomma is used
  2163  * to suppress detailed reporting if it occurs often.
  2164  * Indeed, ".," is so common after "etc" or "&c" that
  2165  * we don't warn on these cases at all.
  2166  */
  2167 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
  2168 {
  2169     const char *s;
  2170     gunichar c,nc;
  2171     gboolean is_query;
  2172     nc=g_utf8_get_char(aline);
  2173     for (s=aline;*s;s=g_utf8_next_char(s))
  2174     {
  2175 	c=nc;
  2176 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2177 	/* for each punctuation character in the line */
  2178 	if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
  2179 	  g_utf8_strchr(".?!,;:",-1,nc))
  2180 	{
  2181 	    /* followed by punctuation, it's a query, unless . . . */
  2182 	    is_query=TRUE;
  2183 	    if (warnings->isFrench &&
  2184 	      (g_str_has_prefix(s,",...") || g_str_has_prefix(s,"...,") ||
  2185 	       g_str_has_prefix(s,";...") || g_str_has_prefix(s,"...;") ||
  2186 	       g_str_has_prefix(s,":...") || g_str_has_prefix(s,"...:") ||
  2187 	       g_str_has_prefix(s,"!...") || g_str_has_prefix(s,"...!") ||
  2188 	       g_str_has_prefix(s,"?...") || g_str_has_prefix(s,"...?")))
  2189 	    {
  2190 		s+=4;
  2191 		nc=g_utf8_get_char(g_utf8_next_char(s));
  2192 		is_query=FALSE;
  2193 	    }
  2194 	    else if (c==nc && (c=='.' || c=='?' || c=='!'))
  2195 	    {
  2196 		/* do nothing for .. !! and ?? which can be legit */
  2197 		is_query=FALSE;
  2198 	    }
  2199 	    else if (c=='.' && nc==',')
  2200 	    {
  2201 		if (!warnings->dotcomma || str_follows_word(aline,s,"etc") || 
  2202 		  str_follows_word(aline,s,"&c"))
  2203 		    is_query=FALSE;
  2204 	    }
  2205 	    if (is_query)
  2206 	    {
  2207 		if (pswit[ECHO_SWITCH])
  2208 		    g_print("\n%s\n",aline);
  2209 		if (!pswit[OVERVIEW_SWITCH])
  2210 		    g_print("    Line %ld column %ld - Double punctuation?\n",
  2211 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2212 		else
  2213 		    cnt_punct++;
  2214 	    }
  2215 	}
  2216     }
  2217 }
  2218 
  2219 /*
  2220  * check_for_spaced_quotes:
  2221  */
  2222 void check_for_spaced_quotes(const char *aline)
  2223 {
  2224     int i;
  2225     const char *s,*t;
  2226     const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,
  2227       CHAR_RS_QUOTE};
  2228     GString *pattern;
  2229     s=aline;
  2230     while ((t=strstr(s," \" ")))
  2231     {
  2232 	if (pswit[ECHO_SWITCH])
  2233 	    g_print("\n%s\n",aline);
  2234 	if (!pswit[OVERVIEW_SWITCH])
  2235 	    g_print("    Line %ld column %ld - Spaced doublequote?\n",
  2236 	      linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  2237 	else
  2238 	    cnt_punct++;
  2239 	s=g_utf8_next_char(g_utf8_next_char(t));
  2240     }
  2241     pattern=g_string_new(NULL);
  2242     for(i=0;i<G_N_ELEMENTS(single_quotes);i++)
  2243     {
  2244 	g_string_assign(pattern," ");
  2245 	g_string_append_unichar(pattern,single_quotes[i]);
  2246 	g_string_append_c(pattern,' ');
  2247 	s=aline;
  2248 	while ((t=strstr(s,pattern->str)))
  2249 	{
  2250 	    if (pswit[ECHO_SWITCH])
  2251 		g_print("\n%s\n",aline);
  2252 	    if (!pswit[OVERVIEW_SWITCH])
  2253 		g_print("    Line %ld column %ld - Spaced singlequote?\n",
  2254 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  2255 	    else
  2256 		cnt_punct++;
  2257 	    s=g_utf8_next_char(g_utf8_next_char(t));
  2258 	}
  2259     }
  2260     g_string_free(pattern,TRUE);
  2261 }
  2262 
  2263 /*
  2264  * check_for_miscased_genative:
  2265  *
  2266  * Check special case of 'S instead of 's at end of word.
  2267  */
  2268 void check_for_miscased_genative(const char *aline)
  2269 {
  2270     const char *s;
  2271     gunichar c,nc,pc;
  2272     if (!*aline)
  2273 	return;
  2274     c=g_utf8_get_char(aline);
  2275     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2276     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2277     {
  2278 	pc=c;
  2279 	c=nc;
  2280 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2281 	if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))
  2282 	{
  2283 	    if (pswit[ECHO_SWITCH])
  2284 		g_print("\n%s\n",aline);
  2285 	    if (!pswit[OVERVIEW_SWITCH])
  2286 		g_print("    Line %ld column %ld - Capital \"S\"?\n",
  2287 		  linecnt,g_utf8_pointer_to_offset(aline,s)+2);
  2288 	    else
  2289 		cnt_punct++;
  2290 	}
  2291     }
  2292 }
  2293 
  2294 /*
  2295  * check_end_of_line:
  2296  *
  2297  * Now check special cases - start and end of line -
  2298  * for single and double quotes. Start is sometimes [sic]
  2299  * but better to query it anyway.
  2300  * While we're here, check for dash at end of line.
  2301  */
  2302 void check_end_of_line(const char *aline,struct warnings *warnings)
  2303 {
  2304     int lbytes;
  2305     const char *s;
  2306     gunichar c1,c2;
  2307     lbytes=strlen(aline);
  2308     if (g_utf8_strlen(aline,lbytes)>1)
  2309     {
  2310 	s=g_utf8_prev_char(aline+lbytes);
  2311 	c1=g_utf8_get_char(s);
  2312 	c2=g_utf8_get_char(g_utf8_prev_char(s));
  2313 	if ((CHAR_IS_DQUOTE(c1) || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)
  2314 	{
  2315 	    if (pswit[ECHO_SWITCH])
  2316 		g_print("\n%s\n",aline);
  2317 	    if (!pswit[OVERVIEW_SWITCH])
  2318 		g_print("    Line %ld column %ld - Spaced quote?\n",linecnt,
  2319 		  g_utf8_strlen(aline,lbytes));
  2320 	    else
  2321 		cnt_punct++;
  2322 	}
  2323 	c1=g_utf8_get_char(aline);
  2324 	c2=g_utf8_get_char(g_utf8_next_char(aline));
  2325 	if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)
  2326 	{
  2327 	    if (pswit[ECHO_SWITCH])
  2328 		g_print("\n%s\n",aline);
  2329 	    if (!pswit[OVERVIEW_SWITCH])
  2330 		g_print("    Line %ld column 1 - Spaced quote?\n",linecnt);
  2331 	    else
  2332 		cnt_punct++;
  2333 	}
  2334 	/*
  2335 	 * Dash at end of line may well be legit - paranoid mode only
  2336 	 * and don't report em-dash at line-end.
  2337 	 */
  2338 	if (pswit[PARANOID_SWITCH] && warnings->hyphen)
  2339 	{
  2340 	    for (s=g_utf8_prev_char(aline+lbytes);
  2341 	      s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
  2342 		;
  2343 	    if (g_utf8_get_char(s)=='-' &&
  2344 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')
  2345 	    {
  2346 		if (pswit[ECHO_SWITCH])
  2347 		    g_print("\n%s\n",aline);
  2348 		if (!pswit[OVERVIEW_SWITCH])
  2349 		    g_print("    Line %ld column %ld - "
  2350 		      "Hyphen at end of line?\n",
  2351 		      linecnt,g_utf8_pointer_to_offset(aline,s));
  2352 	    }
  2353 	}
  2354     }
  2355 }
  2356 
  2357 /*
  2358  * check_for_unspaced_bracket:
  2359  *
  2360  * Brackets are often unspaced, but shouldn't be surrounded by alpha.
  2361  * If so, suspect a scanno like "a]most".
  2362  */
  2363 void check_for_unspaced_bracket(const char *aline)
  2364 {
  2365     const char *s;
  2366     gunichar c,nc,pc;
  2367     c=g_utf8_get_char(aline);
  2368     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2369     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2370     {
  2371 	pc=c;
  2372 	c=nc;
  2373 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2374 	if (!nc)
  2375 	    break;
  2376 	/* for each bracket character in the line except 1st & last */
  2377 	if (g_utf8_strchr("{[()]}",-1,c) &&
  2378 	  g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
  2379 	{
  2380 	    if (pswit[ECHO_SWITCH])
  2381 		g_print("\n%s\n",aline);
  2382 	    if (!pswit[OVERVIEW_SWITCH])
  2383 		g_print("    Line %ld column %ld - Unspaced bracket?\n",
  2384 		  linecnt,g_utf8_pointer_to_offset(aline,s));
  2385 	    else
  2386 		cnt_punct++;
  2387 	}
  2388     }
  2389 }
  2390 
  2391 /*
  2392  * check_for_unpunctuated_endquote:
  2393  */
  2394 void check_for_unpunctuated_endquote(const char *aline)
  2395 {
  2396     const char *s;
  2397     gunichar c,nc,pc;
  2398     QuoteClass qc;
  2399     c=g_utf8_get_char(aline);
  2400     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2401     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2402     {
  2403 	pc=c;
  2404 	c=nc;
  2405 	qc=CHAR_IS_DQUOTE(c)?QUOTE_CLASS(c):INVALID_QUOTE;
  2406 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2407 	/* for each character in the line except 1st */
  2408 	if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) && g_unichar_isalpha(pc))
  2409 	{
  2410 	    if (pswit[ECHO_SWITCH])
  2411 		g_print("\n%s\n",aline);
  2412 	    if (!pswit[OVERVIEW_SWITCH])
  2413 		g_print("    Line %ld column %ld - "
  2414 		  "endquote missing punctuation?\n",
  2415 		  linecnt,g_utf8_pointer_to_offset(aline,s));
  2416 	    else
  2417 		cnt_punct++;
  2418 	}
  2419     }
  2420 }
  2421 
  2422 /*
  2423  * check_for_html_tag:
  2424  *
  2425  * Check for <HTML TAG>.
  2426  *
  2427  * If there is a < in the line, followed at some point
  2428  * by a > then we suspect HTML.
  2429  */
  2430 void check_for_html_tag(const char *aline)
  2431 {
  2432     const char *open,*close;
  2433     gchar *tag;
  2434     open=strchr(aline,'<');
  2435     if (open)
  2436     {
  2437 	close=strchr(g_utf8_next_char(open),'>');
  2438 	if (close)
  2439 	{
  2440 	    if (pswit[ECHO_SWITCH])
  2441 		g_print("\n%s\n",aline);
  2442 	    if (!pswit[OVERVIEW_SWITCH])
  2443 	    {
  2444 		tag=g_strndup(open,close-open+1);
  2445 		g_print("    Line %ld column %ld - HTML Tag? %s \n",
  2446 		  linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
  2447 		g_free(tag);
  2448 	    }
  2449 	    else
  2450 		cnt_html++;
  2451 	}
  2452     }
  2453 }
  2454 
  2455 /*
  2456  * check_for_html_entity:
  2457  *
  2458  * Check for &symbol; HTML.
  2459  *
  2460  * If there is a & in the line, followed at
  2461  * some point by a ; then we suspect HTML.
  2462  */
  2463 void check_for_html_entity(const char *aline)
  2464 {
  2465     const char *s,*amp,*scolon;
  2466     gchar *entity;
  2467     amp=strchr(aline,'&');
  2468     if (amp)
  2469     {
  2470 	scolon=strchr(amp,';');
  2471 	if (scolon)
  2472 	{
  2473 	    for (s=amp;s<scolon;s=g_utf8_next_char(s))   
  2474 		if (g_utf8_get_char(s)==CHAR_SPACE)
  2475 		    break;		/* Don't report "Jones & Son;" */
  2476 	    if (s>=scolon)
  2477 	    {
  2478 		if (pswit[ECHO_SWITCH])
  2479 		    g_print("\n%s\n",aline);
  2480 		if (!pswit[OVERVIEW_SWITCH])
  2481 		{
  2482 		    entity=g_strndup(amp,scolon-amp+1);
  2483 		    g_print("    Line %ld column %d - HTML symbol? %s \n",
  2484 		      linecnt,(int)(amp-aline)+1,entity);
  2485 		    g_free(entity);
  2486 		}
  2487 		else
  2488 		    cnt_html++;
  2489 	    }
  2490 	}
  2491     }
  2492 }
  2493 
  2494 /*
  2495  * check_for_omitted_punctuation:
  2496  *
  2497  * Check for omitted punctuation at end of paragraph by working back
  2498  * through prevline. DW.
  2499  * Need to check this only for "normal" paras.
  2500  * So what is a "normal" para?
  2501  *    Not normal if one-liner (chapter headings, etc.)
  2502  *    Not normal if doesn't contain at least one locase letter
  2503  *    Not normal if starts with space
  2504  */
  2505 void check_for_omitted_punctuation(const char *prevline,
  2506   struct line_properties *last,int start_para_line)
  2507 {
  2508     gboolean letter_on_line=FALSE;
  2509     const char *s;
  2510     gunichar c;
  2511     gboolean closing_quote;
  2512     for (s=prevline;*s;s=g_utf8_next_char(s))
  2513 	if (g_unichar_isalpha(g_utf8_get_char(s)))
  2514 	{
  2515 	    letter_on_line=TRUE;
  2516 	    break;
  2517 	}
  2518     /*
  2519      * This next "if" is a problem.
  2520      * If we say "start_para_line <= linecnt - 1", that includes
  2521      * one-line "paragraphs" like chapter heads. Lotsa false positives.
  2522      * If we say "start_para_line < linecnt - 1" it doesn't, but then it
  2523      * misses genuine one-line paragraphs.
  2524      */
  2525     if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
  2526       g_utf8_get_char(prevline)>CHAR_SPACE)
  2527     {
  2528 	s=prevline+strlen(prevline);
  2529 	do
  2530 	{
  2531 	    s=g_utf8_prev_char(s);
  2532 	    c=g_utf8_get_char(s);
  2533 	    if (QUOTE_CLASS(c)==CLOSING_QUOTE || QUOTE_CLASS(c)==NEUTRAL_QUOTE)
  2534 		closing_quote=TRUE;
  2535 	    else
  2536 		closing_quote=FALSE;
  2537 	} while (closing_quote && s>prevline);
  2538 	for (;s>prevline;s=g_utf8_prev_char(s))
  2539 	{
  2540 	    if (g_unichar_isalpha(g_utf8_get_char(s)))
  2541 	    {
  2542 		if (pswit[ECHO_SWITCH])
  2543 		    g_print("\n%s\n",prevline);
  2544 		if (!pswit[OVERVIEW_SWITCH])
  2545 		    g_print("    Line %ld column %ld - "
  2546 		      "No punctuation at para end?\n",
  2547 		      linecnt-1,g_utf8_strlen(prevline,-1));
  2548 		else
  2549 		    cnt_punct++;
  2550 		break;
  2551 	    }
  2552 	    if (g_utf8_strchr("—-.:!([{?}])",-1,g_utf8_get_char(s)))
  2553 		break;
  2554 	}
  2555     }
  2556 }
  2557 
  2558 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
  2559 {
  2560     const char *word=key;
  2561     int *dupcnt=value;
  2562     if (*dupcnt)
  2563 	g_print("\nNote: Queried word %s was duplicated %d times\n",
  2564 	  word,*dupcnt);
  2565     return FALSE;
  2566 }
  2567 
  2568 void print_as_windows_1252(const char *string)
  2569 {
  2570     gsize inbytes,outbytes;
  2571     gchar *buf,*bp;
  2572     static GIConv converter=(GIConv)-1;
  2573     if (!string)
  2574     {
  2575 	if (converter!=(GIConv)-1)
  2576 	    g_iconv_close(converter);
  2577 	converter=(GIConv)-1;
  2578 	return;
  2579     }
  2580     if (converter==(GIConv)-1)
  2581 	converter=g_iconv_open("WINDOWS-1252","UTF-8");
  2582     if (converter!=(GIConv)-1)
  2583     {
  2584 	inbytes=outbytes=strlen(string);
  2585 	bp=buf=g_malloc(outbytes+1);
  2586 	g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
  2587 	*bp='\0';
  2588 	fputs(buf,stdout);
  2589 	g_free(buf);
  2590     }
  2591     else
  2592 	fputs(string,stdout);
  2593 }
  2594 
  2595 void print_as_utf_8(const char *string)
  2596 {
  2597     fputs(string,stdout);
  2598 }
  2599 
  2600 /*
  2601  * procfile:
  2602  *
  2603  * Process one file.
  2604  */
  2605 void procfile(const char *filename)
  2606 {
  2607     const char *s;
  2608     gchar *parastart=NULL;	/* first line of current para */
  2609     gchar *etext,*aline;
  2610     gchar *etext_ptr;
  2611     GError *err=NULL;
  2612     struct first_pass_results *first_pass_results;
  2613     struct warnings *warnings;
  2614     struct counters counters={0};
  2615     struct line_properties last={0};
  2616     struct parities parities={0};
  2617     struct pending pending={0};
  2618     gboolean isemptyline;
  2619     long start_para_line=0;
  2620     gboolean isnewpara=FALSE,enddash=FALSE;
  2621     last.start=CHAR_SPACE;
  2622     linecnt=checked_linecnt=0;
  2623     etext=read_etext(filename,&err);
  2624     if (!etext)
  2625     {
  2626 	if (pswit[STDOUT_SWITCH])
  2627 	    fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
  2628 	else
  2629 	    fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
  2630 	exit(1);
  2631     }
  2632     g_print("\n\nFile: %s\n\n",filename);
  2633     first_pass_results=first_pass(etext);
  2634     warnings=report_first_pass(first_pass_results);
  2635     qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
  2636     qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
  2637     /*
  2638      * Here we go with the main pass. Hold onto yer hat!
  2639      */
  2640     linecnt=0;
  2641     etext_ptr=etext;
  2642     while ((aline=flgets(&etext_ptr,linecnt+1)))
  2643     {
  2644 	linecnt++;
  2645 	if (linecnt==1)
  2646 	    isnewpara=TRUE;
  2647 	if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
  2648 	    continue;    // skip DP page separators completely
  2649 	if (linecnt<first_pass_results->firstline ||
  2650 	  (first_pass_results->footerline>0 &&
  2651 	  linecnt>first_pass_results->footerline))
  2652 	{
  2653 	    if (pswit[HEADER_SWITCH])
  2654 	    {
  2655 		if (g_str_has_prefix(aline,"Title:"))
  2656 		    g_print("    %s\n",aline);
  2657 		if (g_str_has_prefix(aline,"Author:"))
  2658 		    g_print("    %s\n",aline);
  2659 		if (g_str_has_prefix(aline,"Release Date:"))
  2660 		    g_print("    %s\n",aline);
  2661 		if (g_str_has_prefix(aline,"Edition:"))
  2662 		    g_print("    %s\n\n",aline);
  2663 	    }
  2664 	    continue;		/* skip through the header */
  2665 	}
  2666 	checked_linecnt++;
  2667 	print_pending(aline,parastart,&pending);
  2668 	isemptyline=analyse_quotes(aline,&counters);
  2669 	if (isnewpara && !isemptyline)
  2670 	{
  2671 	    /* This line is the start of a new paragraph. */
  2672 	    start_para_line=linecnt;
  2673 	    /* Capture its first line in case we want to report it later. */
  2674 	    g_free(parastart);
  2675 	    parastart=g_strdup(aline);
  2676 	    memset(&parities,0,sizeof(parities));  /* restart the quote count */
  2677 	    s=aline;
  2678 	    while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
  2679 	      !g_unichar_isdigit(g_utf8_get_char(s)))
  2680 		s=g_utf8_next_char(s);
  2681 	    if (g_unichar_islower(g_utf8_get_char(s)))
  2682 	    {
  2683 		/* and its first letter is lowercase */
  2684 		if (pswit[ECHO_SWITCH])
  2685 		    g_print("\n%s\n",aline);
  2686 		if (!pswit[OVERVIEW_SWITCH])
  2687 		    g_print("    Line %ld column %ld - "
  2688 		      "Paragraph starts with lower-case\n",
  2689 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2690 		else
  2691 		    cnt_punct++;
  2692 	    }
  2693 	    isnewpara=FALSE; /* Signal the end of new para processing. */
  2694 	}
  2695 	/* Check for an em-dash broken at line end. */
  2696 	if (enddash && g_utf8_get_char(aline)=='-')
  2697 	{
  2698 	    if (pswit[ECHO_SWITCH])
  2699 		g_print("\n%s\n",aline);
  2700 	    if (!pswit[OVERVIEW_SWITCH])
  2701 		g_print("    Line %ld column 1 - Broken em-dash?\n",linecnt);
  2702 	    else
  2703 		cnt_punct++;
  2704 	}
  2705 	enddash=FALSE;
  2706 	for (s=g_utf8_prev_char(aline+strlen(aline));
  2707 	  g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
  2708 	    ;
  2709 	if (s>=aline && g_utf8_get_char(s)=='-')
  2710 	    enddash=TRUE;
  2711 	check_for_control_characters(aline);
  2712 	if (warnings->bin)
  2713 	    check_for_odd_characters(aline,warnings,isemptyline);
  2714 	if (warnings->longline)
  2715 	    check_for_long_line(aline);
  2716 	if (warnings->shortline)
  2717 	    check_for_short_line(aline,&last);
  2718 	last.blen=last.len;
  2719 	last.len=g_utf8_strlen(aline,-1);
  2720 	last.start=g_utf8_get_char(aline);
  2721 	check_for_starting_punctuation(aline);
  2722 	if (warnings->dash)
  2723 	{
  2724 	    check_for_spaced_emdash(aline);
  2725 	    check_for_spaced_dash(aline);
  2726 	}
  2727 	check_for_unmarked_paragraphs(aline);
  2728 	check_for_jeebies(aline);
  2729 	check_for_mta_from(aline);
  2730 	check_for_orphan_character(aline);
  2731 	check_for_pling_scanno(aline);
  2732 	check_for_extra_period(aline,warnings);
  2733 	check_for_following_punctuation(aline);
  2734 	check_for_typos(aline,warnings);
  2735 	check_for_misspaced_punctuation(aline,&parities,isemptyline);
  2736 	check_for_double_punctuation(aline,warnings);
  2737 	check_for_spaced_quotes(aline);
  2738 	check_for_miscased_genative(aline);
  2739 	check_end_of_line(aline,warnings);
  2740 	check_for_unspaced_bracket(aline);
  2741 	if (warnings->endquote)
  2742 	    check_for_unpunctuated_endquote(aline);
  2743 	check_for_html_tag(aline);
  2744 	check_for_html_entity(aline);
  2745 	if (isemptyline)
  2746 	{
  2747 	    check_for_mismatched_quotes(&counters,&pending);
  2748 	    counters_reset(&counters);
  2749 	    /* let the next iteration know that it's starting a new para */
  2750 	    isnewpara=TRUE;
  2751 	    if (prevline)
  2752 		check_for_omitted_punctuation(prevline,&last,start_para_line);
  2753 	}
  2754 	g_free(prevline);
  2755 	prevline=g_strdup(aline);
  2756     }
  2757     linecnt++;
  2758     check_for_mismatched_quotes(&counters,&pending);
  2759     print_pending(NULL,parastart,&pending);
  2760     reset_pending(&pending);
  2761     if (prevline)
  2762     {
  2763 	g_free(prevline);
  2764 	prevline=NULL;
  2765     }
  2766     g_free(parastart);
  2767     g_free(prevline);
  2768     g_free(etext);
  2769     if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])
  2770 	g_tree_foreach(qword,report_duplicate_queries,NULL);
  2771     g_tree_unref(qword);
  2772     g_tree_unref(qperiod);
  2773     counters_destroy(&counters);
  2774     g_set_print_handler(NULL);
  2775     print_as_windows_1252(NULL);
  2776     if (pswit[MARKUP_SWITCH])  
  2777 	loseentities(NULL);
  2778 }
  2779 
  2780 /*
  2781  * flgets:
  2782  *
  2783  * Get one line from the input text, checking for
  2784  * the existence of exactly one CR/LF line-end per line.
  2785  *
  2786  * Returns: a pointer to the line.
  2787  */
  2788 char *flgets(char **etext,long lcnt)
  2789 {
  2790     gunichar c;
  2791     gboolean isCR=FALSE;
  2792     char *theline=*etext;
  2793     char *eos=theline;
  2794     gchar *s;
  2795     for (;;)
  2796     {
  2797 	c=g_utf8_get_char(*etext);
  2798 	if (!c)
  2799 	{
  2800 	    if (*etext==theline)
  2801 		return NULL;
  2802 	    else if (pswit[LINE_END_SWITCH])
  2803 	    {
  2804 		if (pswit[ECHO_SWITCH])
  2805 		{
  2806 		    s=g_strndup(theline,eos-theline);
  2807 		    g_print("\n%s\n",s);
  2808 		    g_free(s);
  2809 		}
  2810 		if (!pswit[OVERVIEW_SWITCH])
  2811 		    /* There may, or may not, have been a CR */
  2812 		    g_print("    Line %ld - No LF?\n",lcnt);
  2813 		else
  2814 		    cnt_lineend++;
  2815 	    }
  2816 	    break;
  2817 	}
  2818 	*etext=g_utf8_next_char(*etext);
  2819 	/* either way, it's end of line */
  2820 	if (c=='\n')
  2821 	{
  2822 	    if (isCR)
  2823 		break;
  2824 	    else
  2825 	    {
  2826 		/* Error - a LF without a preceding CR */
  2827 		if (pswit[LINE_END_SWITCH])
  2828 		{
  2829 		    if (pswit[ECHO_SWITCH])
  2830 		    {
  2831 			s=g_strndup(theline,eos-theline);
  2832 			g_print("\n%s\n",s);
  2833 			g_free(s);
  2834 		    }
  2835 		    if (!pswit[OVERVIEW_SWITCH])
  2836 			g_print("    Line %ld - No CR?\n",lcnt);
  2837 		    else
  2838 			cnt_lineend++;
  2839 		}
  2840 		break;
  2841 	    }
  2842 	}
  2843 	if (c=='\r')
  2844 	{
  2845 	    if (isCR)
  2846 	    {
  2847 		/* Error - two successive CRs */
  2848 		if (pswit[LINE_END_SWITCH])
  2849 		{
  2850 		    if (pswit[ECHO_SWITCH])
  2851 		    {
  2852 			s=g_strndup(theline,eos-theline);
  2853 			g_print("\n%s\n",s);
  2854 			g_free(s);
  2855 		    }
  2856 		    if (!pswit[OVERVIEW_SWITCH])
  2857 			g_print("    Line %ld - Two successive CRs?\n",lcnt);
  2858 		    else
  2859 			cnt_lineend++;
  2860 		}
  2861 	    }
  2862 	    isCR=TRUE;
  2863 	}
  2864 	else
  2865 	{
  2866 	    if (pswit[LINE_END_SWITCH] && isCR)
  2867 	    {
  2868 		if (pswit[ECHO_SWITCH])
  2869 		{
  2870 		    s=g_strndup(theline,eos-theline);
  2871 		    g_print("\n%s\n",s);
  2872 		    g_free(s);
  2873 		}
  2874 		if (!pswit[OVERVIEW_SWITCH])
  2875 		    g_print("    Line %ld column %ld - CR without LF?\n",
  2876 		      lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
  2877 		else
  2878 		    cnt_lineend++;
  2879 		*eos=' ';
  2880 	    }
  2881 	    isCR=FALSE;
  2882 	    eos=g_utf8_next_char(eos);
  2883 	}
  2884     }
  2885     *eos='\0';
  2886     if (pswit[MARKUP_SWITCH])  
  2887 	postprocess_for_HTML(theline);
  2888     if (pswit[DP_SWITCH])  
  2889 	postprocess_for_DP(theline);
  2890     return theline;
  2891 }
  2892 
  2893 /*
  2894  * mixdigit:
  2895  *
  2896  * Takes a "word" as a parameter, and checks whether it
  2897  * contains a mixture of alpha and digits. Generally, this is an
  2898  * error, but may not be for cases like 4th or L5 12s. 3d.
  2899  *
  2900  * Returns: TRUE iff an is error found.
  2901  */
  2902 gboolean mixdigit(const char *checkword)
  2903 {
  2904     gboolean wehaveadigit,wehavealetter,query;
  2905     const char *s,*nondigit;
  2906     wehaveadigit=wehavealetter=query=FALSE;
  2907     for (s=checkword;*s;s=g_utf8_next_char(s))
  2908 	if (g_unichar_isalpha(g_utf8_get_char(s)))
  2909 	    wehavealetter=TRUE;
  2910 	else if (g_unichar_isdigit(g_utf8_get_char(s)))
  2911 	    wehaveadigit=TRUE;
  2912     if (wehaveadigit && wehavealetter)
  2913     {
  2914 	/* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
  2915 	query=TRUE;
  2916 	for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
  2917 	  nondigit=g_utf8_next_char(nondigit))
  2918 	    ;
  2919 	/* digits, ending in st, rd, nd, th of either case */
  2920 	if (!g_ascii_strcasecmp(nondigit,"st") ||
  2921 	  !g_ascii_strcasecmp(nondigit,"rd") ||
  2922 	  !g_ascii_strcasecmp(nondigit,"nd") ||
  2923 	  !g_ascii_strcasecmp(nondigit,"th"))
  2924 	    query=FALSE;
  2925 	if (!g_ascii_strcasecmp(nondigit,"sts") ||
  2926 	  !g_ascii_strcasecmp(nondigit,"rds") ||
  2927 	  !g_ascii_strcasecmp(nondigit,"nds") ||
  2928 	  !g_ascii_strcasecmp(nondigit,"ths"))
  2929 	    query=FALSE;
  2930 	if (!g_ascii_strcasecmp(nondigit,"stly") ||
  2931 	  !g_ascii_strcasecmp(nondigit,"rdly") ||
  2932 	  !g_ascii_strcasecmp(nondigit,"ndly") ||
  2933 	  !g_ascii_strcasecmp(nondigit,"thly"))
  2934 	    query=FALSE;
  2935 	/* digits, ending in l, L, s or d */
  2936 	if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
  2937 	  !strcmp(nondigit,"d"))
  2938 	    query=FALSE;
  2939 	/*
  2940 	 * L at the start of a number, representing Britsh pounds, like L500.
  2941 	 * This is cute. We know the current word is mixed digit. If the first
  2942 	 * letter is L, there must be at least one digit following. If both
  2943 	 * digits and letters follow, we have a genuine error, else we have a
  2944 	 * capital L followed by digits, and we accept that as a non-error.
  2945 	 */
  2946 	if (g_utf8_get_char(checkword)=='L' &&
  2947 	  !mixdigit(g_utf8_next_char(checkword)))
  2948 	    query=FALSE;
  2949     }
  2950     return query;
  2951 }
  2952 
  2953 /*
  2954  * getaword:
  2955  *
  2956  * Extracts the first/next "word" from the line, and returns it.
  2957  * A word is defined as one English word unit--or at least that's the aim.
  2958  * "ptr" is advanced to the position in the line where we will start
  2959  * looking for the next word.
  2960  *
  2961  * Returns: A newly-allocated string.
  2962  */
  2963 gchar *getaword(const char **ptr)
  2964 {
  2965     const char *s,*t;
  2966     GString *word;
  2967     gunichar c,pc;
  2968     word=g_string_new(NULL);
  2969     for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
  2970       !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
  2971       **ptr;*ptr=g_utf8_next_char(*ptr))
  2972     {
  2973 	/* Handle exceptions for footnote markers like [1] */
  2974 	if (g_utf8_get_char(*ptr)=='[')
  2975 	{
  2976 	    g_string_append_c(word,'[');
  2977 	    s=g_utf8_next_char(*ptr);
  2978 	    for (;g_unichar_isdigit(g_utf8_get_char(s));s=g_utf8_next_char(s))
  2979 		g_string_append_unichar(word,g_utf8_get_char(s));
  2980 	    if (g_utf8_get_char(s)==']')
  2981 	    {
  2982 		g_string_append_c(word,']');
  2983 		*ptr=g_utf8_next_char(s);
  2984 		return g_string_free(word,FALSE);
  2985 	    }
  2986 	    else
  2987 		g_string_truncate(word,0);
  2988 	}
  2989     }
  2990     /*
  2991      * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
  2992      * Especially yucky is the case of L1,000
  2993      * This section looks for a pattern of characters including a digit
  2994      * followed by a comma or period followed by one or more digits.
  2995      * If found, it returns this whole pattern as a word; otherwise we discard
  2996      * the results and resume our normal programming.
  2997      */
  2998     s=*ptr;
  2999     for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
  3000       g_unichar_isalpha(g_utf8_get_char(s)) ||
  3001       g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
  3002 	g_string_append_unichar(word,g_utf8_get_char(s));
  3003     if (word->len)
  3004     {
  3005 	for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))
  3006 	{
  3007 	    c=g_utf8_get_char(t);
  3008 	    pc=g_utf8_get_char(g_utf8_prev_char(t));
  3009 	    if ((c=='.' || c==',') && g_unichar_isdigit(pc))
  3010 	    {
  3011 		*ptr=s;
  3012 		return g_string_free(word,FALSE);
  3013 	    }
  3014 	}
  3015     }
  3016     /* we didn't find a punctuated number - do the regular getword thing */
  3017     g_string_truncate(word,0);
  3018     c=g_utf8_get_char(*ptr);
  3019     for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);
  3020       *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))
  3021 	g_string_append_unichar(word,c);
  3022     return g_string_free(word,FALSE);
  3023 }
  3024 
  3025 /*
  3026  * isroman:
  3027  *
  3028  * Is this word a Roman Numeral?
  3029  *
  3030  * It doesn't actually validate that the number is a valid Roman Numeral--for
  3031  * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
  3032  * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
  3033  * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
  3034  * expressions thereof, except when it came to taxes. Allow any number of M,
  3035  * an optional D, an optional CM or CD, any number of optional Cs, an optional
  3036  * XL or an optional XC, an optional IX or IV, an optional V and any number
  3037  * of optional Is.
  3038  */
  3039 gboolean isroman(const char *t)
  3040 {
  3041     const char *s;
  3042     if (!t || !*t)
  3043 	return FALSE;
  3044     s=t;
  3045     while (g_utf8_get_char(t)=='m' && *t)
  3046 	t++;
  3047     if (g_utf8_get_char(t)=='d')
  3048 	t++;
  3049     if (g_str_has_prefix(t,"cm"))
  3050 	t+=2;
  3051     if (g_str_has_prefix(t,"cd"))
  3052 	t+=2;
  3053     while (g_utf8_get_char(t)=='c' && *t)
  3054 	t++;
  3055     if (g_str_has_prefix(t,"xl"))
  3056 	t+=2;
  3057     if (g_str_has_prefix(t,"xc"))
  3058 	t+=2;
  3059     if (g_utf8_get_char(t)=='l')
  3060 	t++;
  3061     while (g_utf8_get_char(t)=='x' && *t)
  3062 	t++;
  3063     if (g_str_has_prefix(t,"ix"))
  3064 	t+=2;
  3065     if (g_str_has_prefix(t,"iv"))
  3066 	t+=2;
  3067     if (g_utf8_get_char(t)=='v')
  3068 	t++;
  3069     while (g_utf8_get_char(t)=='i' && *t)
  3070 	t++;
  3071     return !*t;
  3072 }
  3073 
  3074 /*
  3075  * postprocess_for_DP:
  3076  *
  3077  * Invoked with the -d switch from flgets().
  3078  * It simply "removes" from the line a hard-coded set of common
  3079  * DP-specific tags, so that the line passed to the main routine has
  3080  * been pre-cleaned of DP markup.
  3081  */
  3082 void postprocess_for_DP(char *theline)
  3083 {
  3084     char *s,*t;
  3085     int i;
  3086     if (!*theline) 
  3087 	return;
  3088     for (i=0;*DPmarkup[i];i++)
  3089 	while ((s=strstr(theline,DPmarkup[i])))
  3090 	{
  3091 	    t=s+strlen(DPmarkup[i]);
  3092 	    memmove(s,t,strlen(t)+1);
  3093 	}
  3094 }
  3095 
  3096 /*
  3097  * postprocess_for_HTML:
  3098  *
  3099  * Invoked with the -m switch from flgets().
  3100  * It simply "removes" from the line a hard-coded set of common
  3101  * HTML tags and "replaces" a hard-coded set of common HTML
  3102  * entities, so that the line passed to the main routine has
  3103  * been pre-cleaned of HTML.
  3104  */
  3105 void postprocess_for_HTML(char *theline)
  3106 {
  3107     while (losemarkup(theline))
  3108 	;
  3109     loseentities(theline);
  3110 }
  3111 
  3112 char *losemarkup(char *theline)
  3113 {
  3114     char *s,*t;
  3115     int i;
  3116     s=strchr(theline,'<');
  3117     t=s?strchr(s,'>'):NULL;
  3118     if (!s || !t)
  3119 	return NULL;
  3120     for (i=0;*markup[i];i++)
  3121 	if (tagcomp(g_utf8_next_char(s),markup[i]))
  3122 	{
  3123 	    t=g_utf8_next_char(t);
  3124 	    memmove(s,t,strlen(t)+1);
  3125 	    return s;
  3126 	}
  3127     /* It's an unrecognized <xxx>. */
  3128     return NULL;
  3129 }
  3130 
  3131 void loseentities(char *theline)
  3132 {
  3133     int i;
  3134     gsize nb;
  3135     char *amp,*scolon;
  3136     gchar *s,*t;
  3137     gunichar c;
  3138     GTree *entities=NULL;
  3139     static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
  3140     if (!theline)
  3141     {
  3142 	if (entities)
  3143 	    g_tree_destroy(entities);
  3144 	entities=NULL;
  3145 	if (translit!=(GIConv)-1)
  3146 	    g_iconv_close(translit);
  3147 	translit=(GIConv)-1;
  3148 	if (to_utf8!=(GIConv)-1)
  3149 	    g_iconv_close(to_utf8);
  3150 	to_utf8=(GIConv)-1;
  3151 	return;
  3152     }
  3153     if (!*theline)
  3154 	return;
  3155     if (!entities)
  3156     {
  3157 	entities=g_tree_new((GCompareFunc)strcmp);
  3158 	for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
  3159 	    g_tree_insert(entities,HTMLentities[i].name,
  3160 	      GUINT_TO_POINTER(HTMLentities[i].c));
  3161     }
  3162     if (translit==(GIConv)-1)
  3163 	translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
  3164     if (to_utf8==(GIConv)-1)
  3165 	to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
  3166     while((amp=strchr(theline,'&')))
  3167     {
  3168 	scolon=strchr(amp,';');
  3169 	if (scolon)
  3170 	{
  3171 	    if (amp[1]=='#')
  3172 	    {
  3173 		if (amp+2+strspn(amp+2,"0123456789")==scolon)
  3174 		    c=strtol(amp+2,NULL,10);
  3175 		else if (amp[2]=='x' &&
  3176 		  amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
  3177 		    c=strtol(amp+3,NULL,16);
  3178 	    }
  3179 	    else
  3180 	    {
  3181 		s=g_strndup(amp+1,scolon-(amp+1));
  3182 	        c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
  3183 		g_free(s);
  3184 	    }
  3185 	}
  3186 	else
  3187 	    c=0;
  3188 	if (c)
  3189 	{
  3190 	    theline=amp;
  3191 	    if (c<128 || c>=192 && c<=255)	/* An ISO-8859-1 character */
  3192 		theline+=g_unichar_to_utf8(c,theline);
  3193 	    else
  3194 	    {
  3195 		s=g_malloc(6);
  3196 		nb=g_unichar_to_utf8(c,s);
  3197 		t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
  3198 		g_free(s);
  3199 		s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
  3200 		g_free(t);
  3201 		memcpy(theline,s,nb);
  3202 		g_free(s);
  3203 		theline+=nb;
  3204 	    }
  3205 	    memmove(theline,g_utf8_next_char(scolon),
  3206 	      strlen(g_utf8_next_char(scolon))+1);
  3207 	}
  3208 	else
  3209 	    theline=g_utf8_next_char(amp);
  3210     }
  3211 }
  3212 
  3213 gboolean tagcomp(const char *strin,const char *basetag)
  3214 {
  3215     gboolean retval;
  3216     gchar *s,*t;
  3217     if (g_utf8_get_char(strin)=='/')
  3218 	t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
  3219     else
  3220 	t=g_utf8_casefold(strin,-1);
  3221     s=g_utf8_casefold(basetag,-1);
  3222     retval=g_str_has_prefix(t,s);
  3223     g_free(s);
  3224     g_free(t);
  3225     return retval;
  3226 }
  3227 
  3228 void proghelp(GOptionContext *context)
  3229 {
  3230     gchar *help;
  3231     fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
  3232     fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
  3233     fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
  3234     fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
  3235       "For details, read the file COPYING.\n",stderr);
  3236     fputs("This is Free Software; "
  3237       "you may redistribute it under certain conditions (GPL);\n",stderr);
  3238     fputs("read the file COPYING for details.\n\n",stderr);
  3239     help=g_option_context_get_help(context,TRUE,NULL);
  3240     fputs(help,stderr);
  3241     g_free(help);
  3242     fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
  3243     fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
  3244       "non-ASCII\n",stderr);
  3245     fputs("characters like accented letters, "
  3246       "lines longer than 75 or shorter than 55,\n",stderr);
  3247     fputs("unbalanced quotes or brackets, "
  3248       "a variety of badly formatted punctuation, \n",stderr);
  3249     fputs("HTML tags, some likely typos. "
  3250       "It is NOT a substitute for human judgement.\n",stderr);
  3251     fputs("\n",stderr);
  3252 }