bookloupe/bookloupe.c
author ali <ali@juiblex.co.uk>
Tue Oct 15 07:57:45 2013 +0100 (2013-10-15)
changeset 98 37da646396b9
parent 97 c45fa3843618
child 99 783eff3047bc
permissions -rw-r--r--
Fix bug #29: analyse_quotes() shadows the linecnt global variable for no good reason
     1 /*************************************************************************/
     2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
     3 /*									 */
     4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>			 */
     5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>			 */
     6 /*									 */
     7 /* This program is free software; you can redistribute it and/or modify  */
     8 /* it under the terms of the GNU General Public License as published by  */
     9 /* the Free Software Foundation; either version 2 of the License, or     */
    10 /* (at your option) any later version.					 */
    11 /*									 */
    12 /* This program is distributed in the hope that it will be useful,       */
    13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of	 */
    14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the		 */
    15 /* GNU General Public License for more details.				 */
    16 /*									 */
    17 /* You should have received a copy of the GNU General Public License	 */
    18 /* along with this program. If not, see <http://www.gnu.org/licenses/>.	 */
    19 /*************************************************************************/
    20 
    21 #include <stdio.h>
    22 #include <stdlib.h>
    23 #include <string.h>
    24 #include <ctype.h>
    25 #ifdef __WIN32__
    26 #include <windows.h>
    27 #endif
    28 #include <glib.h>
    29 #include <bl/bl.h>
    30 #include "bookloupe.h"
    31 #include "counters.h"
    32 #include "pending.h"
    33 #include "HTMLentities.h"
    34 
/*
 * NOTE(review): nothing in this part of the file reads or writes prevline;
 * presumably it holds the previously processed line -- confirm against
 * the line-checking code.
 */
gchar *prevline;
    36 
/*
 * Common typos and scannos, all in lower case.
 * The list is terminated by an empty string.
 */
char *typo[] = {
    "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
    "nad", "te", "ig", "acn",  "ahve", "alot", "anbd", "andt", "awya", "aywa",
    "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
    "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
    "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
    "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
    "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
    "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
    "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
    "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
    "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
    "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
    "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
    "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
    "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
    "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
    "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
    "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
    "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
    "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
    "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
    "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
    "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
    "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
    "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
    "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
    "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
    "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
    "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
    "se", ""
};
    70 
/*
 * User-defined typos, keyed by word. Built by read_user_scannos(), which
 * hands ownership of the key strings to the tree (they are released with
 * g_free()). Stays NULL when no user typo file has been loaded; main()
 * only unrefs it when it is non-NULL.
 */
GTree *usertypo;
    72 
/*
 * Common abbreviations and other OK words not to query as typos.
 * All entries are lower case; the list is terminated by an empty string.
 */
char *okword[] = {
    "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
    "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
    "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
    "outbid", "outbids", "frostbite", "frostbitten", ""
};
    80 
/*
 * Common abbreviations that cause otherwise unexplained periods.
 * All entries are lower case; the list is terminated by an empty string.
 */
char *abbrev[] = {
    "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
    "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
};
    86 
/*
 * Two-letter combinations that rarely if ever start words,
 * but are common scannos or otherwise common letter combinations.
 * The list is terminated by an empty string.
 */
char *nostart[] = {
    "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
};
    94 
/*
 * Two-letter combinations that rarely if ever end words,
 * but are common scannos or otherwise common letter combinations.
 * The list is terminated by an empty string.
 */
char *noend[] = {
    "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
    "sw", "gr", "sl", "cl", "iy", ""
};
   103 
/*
 * HTML element names, in lower case and without angle brackets.
 * The list is terminated by an empty string.
 * NOTE(review): presumably used to recognise HTML tags in the text --
 * confirm against the markup-handling code.
 */
char *markup[] = {
    "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
    "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
    "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
    "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
};
   110 
/*
 * Markup strings listed literally, delimiters included.
 * The list is terminated by an empty string.
 * NOTE(review): presumably the "DP-specific markup" that the --dp option
 * ignores -- confirm against postprocess_for_DP().
 */
char *DPmarkup[] = {
    "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
};
   114 
/*
 * Lower-case word list terminated by an empty string.
 * NOTE(review): judging by the name, these are words that should not be
 * directly followed by a comma -- confirm against the punctuation checks.
 * NOTE(review): "mrs" is listed twice.
 */
char *nocomma[] = {
    "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
    "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
    "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
    "during", "let", "toward", "among", ""
};
   121 
/*
 * Lower-case word list terminated by an empty string.
 * NOTE(review): judging by the name, these are words that should not be
 * directly followed by a period -- confirm against the punctuation checks.
 */
char *noperiod[] = {
    "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
    "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
    "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
    "among", "those", "into", "whom", "having", "thence", ""
};
   128 
/*
 * Program switches, indexed by the *_SWITCH constants. Filled in from the
 * command line and then adjusted by parse_options().
 */
gboolean pswit[SWITNO];  /* program switches */
   130 
/*
 * Command line options. Each one is a plain flag stored directly in the
 * matching pswit[] slot.
 *
 * The "noecho", "relaxed" and "line-end" flags switch a feature OFF, so
 * parse_options() inverts their pswit[] entries after parsing; it also
 * toggles the "typo" entry when paranoid mode is left on.
 */
static GOptionEntry options[]={
    { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
      "Ignore DP-specific markup", NULL },
    { "noecho", 'e', 0, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
      "Don't echo queried line", NULL },
    { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
      "Check single quotes", NULL },
    { "typo", 't', 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
      "Check common typos", NULL },
    { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
      "Require closure of quotes on every paragraph", NULL },
    { "relaxed", 'x', 0, G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
      "Disable paranoid querying of everything", NULL },
    { "line-end", 'l', 0, G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
      "Disable line end checking", NULL },
    { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
      "Overview: just show counts", NULL },
    { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
      "Output errors to stdout instead of stderr", NULL },
    { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
      "Echo header fields", NULL },
    { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
      "Ignore markup in < >", NULL },
    { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
      "Use file of user-defined typos", NULL },
    { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
      "Defaults for use on www upload", NULL },
    { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
      "Verbose - list everything", NULL },
    { NULL }	/* end-of-table marker */
};
   162 
/*
 * Running totals. The cnt_* query counters are printed by main() in
 * overview mode; cnt_spacend and linecnt are incremented by first_pass().
 */
long cnt_quote;		/* for overview mode, count of quote queries */
long cnt_brack;		/* for overview mode, count of brackets queries */
long cnt_bin;		/* for overview mode, count of non-ASCII queries */
long cnt_odd;		/* for overview mode, count of odd character queries */
long cnt_long;		/* for overview mode, count of long line errors */
long cnt_short;		/* for overview mode, count of short line queries */
long cnt_punct;		/* for overview mode,
			   count of punctuation and spacing queries */
long cnt_dash;		/* for overview mode, count of dash-related queries */
long cnt_word;		/* for overview mode, count of word queries */
long cnt_html;		/* for overview mode, count of html queries */
long cnt_lineend;	/* for overview mode, count of line-end queries */
long cnt_spacend;	/* count of lines with space at end */
long linecnt;		/* count of total lines in the file */
long checked_linecnt;	/* count of lines actually checked */
   178 
/* Forward declarations for functions defined later in this file. */
void proghelp(GOptionContext *context);
void procfile(const char *);

/*
 * Directory part of argv[0], set by main(). read_user_scannos() uses it
 * as a second place to look for the user typo file.
 */
gchar *running_from;

gboolean mixdigit(const char *);
gchar *getaword(const char **);
char *flgets(char **,long);
void postprocess_for_HTML(char *);
char *linehasmarkup(char *);
char *losemarkup(char *);
gboolean tagcomp(const char *,const char *);
void loseentities(char *);
gboolean isroman(const char *);
void postprocess_for_DP(char *);
void print_as_windows_1252(const char *string);
void print_as_utf_8(const char *string);

/* NOTE(review): not used in the visible part of the file. */
GTree *qword,*qperiod;

#ifdef __WIN32__
/*
 * Console output code page in effect at start-up; saved by main() and
 * restored by cleanup_on_exit().
 */
UINT saved_cp;
#endif
   202 
   203 void parse_options(int *argc,char ***argv)
   204 {
   205     GError *err=NULL;
   206     GOptionContext *context;
   207     context=g_option_context_new(
   208       "file - looks for errors in Project Gutenberg(TM) etexts");
   209     g_option_context_add_main_entries(context,options,NULL);
   210     if (!g_option_context_parse(context,argc,argv,&err))
   211     {
   212 	g_printerr("Bookloupe: %s\n",err->message);
   213 	g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
   214 	exit(1);
   215     }
   216     /* Paranoid checking is turned OFF, not on, by its switch */
   217     pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
   218     if (pswit[PARANOID_SWITCH])
   219 	/* if running in paranoid mode, typo checks default to enabled */
   220 	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
   221     /* Line-end checking is turned OFF, not on, by its switch */
   222     pswit[LINE_END_SWITCH]=!pswit[LINE_END_SWITCH];
   223     /* Echoing is turned OFF, not on, by its switch */
   224     pswit[ECHO_SWITCH]=!pswit[ECHO_SWITCH];
   225     if (pswit[OVERVIEW_SWITCH])
   226 	/* just print summary; don't echo */
   227 	pswit[ECHO_SWITCH]=FALSE;
   228     /*
   229      * Web uploads - for the moment, this is really just a placeholder
   230      * until we decide what processing we really want to do on web uploads
   231      */
   232     if (pswit[WEB_SWITCH])
   233     {
   234 	/* specific override for web uploads */
   235 	pswit[ECHO_SWITCH]=TRUE;
   236 	pswit[SQUOTE_SWITCH]=FALSE;
   237 	pswit[TYPO_SWITCH]=TRUE;
   238 	pswit[QPARA_SWITCH]=FALSE;
   239 	pswit[PARANOID_SWITCH]=TRUE;
   240 	pswit[LINE_END_SWITCH]=FALSE;
   241 	pswit[OVERVIEW_SWITCH]=FALSE;
   242 	pswit[STDOUT_SWITCH]=FALSE;
   243 	pswit[HEADER_SWITCH]=TRUE;
   244 	pswit[VERBOSE_SWITCH]=FALSE;
   245 	pswit[MARKUP_SWITCH]=FALSE;
   246 	pswit[USERTYPO_SWITCH]=FALSE;
   247 	pswit[DP_SWITCH]=FALSE;
   248     }
   249     if (*argc<2)
   250     {
   251 	proghelp(context);
   252 	exit(1);
   253     }
   254     g_option_context_free(context);
   255 }
   256 
   257 /*
   258  * read_user_scannos:
   259  *
   260  * Read in the user-defined stealth scanno list.
   261  */
   262 void read_user_scannos(void)
   263 {
   264     GError *err=NULL;
   265     gchar *usertypo_file;
   266     gboolean okay;
   267     int i;
   268     gsize len,nb;
   269     gchar *contents,*utf8,**lines;
   270     usertypo_file=g_strdup("bookloupe.typ");
   271     okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   272     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   273     {
   274 	g_clear_error(&err);
   275 	g_free(usertypo_file);
   276 	usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
   277 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   278     }
   279     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   280     {
   281 	g_clear_error(&err);
   282 	g_free(usertypo_file);
   283 	usertypo_file=g_strdup("gutcheck.typ");
   284 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   285     }
   286     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   287     {
   288 	g_clear_error(&err);
   289 	g_free(usertypo_file);
   290 	usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
   291 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   292     }
   293     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   294     {
   295 	g_free(usertypo_file);
   296 	g_print("   --> I couldn't find bookloupe.typ "
   297 	  "-- proceeding without user typos.\n");
   298 	return;
   299     }
   300     else if (!okay)
   301     {
   302 	fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
   303 	g_free(usertypo_file);
   304 	g_clear_error(&err);
   305 	exit(1);
   306     }
   307     if (g_utf8_validate(contents,len,NULL))
   308 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
   309     else
   310 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
   311     g_free(contents);
   312     lines=g_strsplit_set(utf8,"\r\n",0);
   313     g_free(utf8);
   314     usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
   315     for (i=0;lines[i];i++)
   316 	if (*(unsigned char *)lines[i]>'!')
   317 	    g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
   318 	else
   319 	    g_free(lines[i]);
   320     g_free(lines);
   321 }
   322 
   323 /*
   324  * read_etext:
   325  *
   326  * Read an etext returning a newly allocated string containing the file
   327  * contents or NULL on error.
   328  */
   329 gchar *read_etext(const char *filename,GError **err)
   330 {
   331     GError *tmp_err=NULL;
   332     gchar *contents,*utf8;
   333     gsize len,bytes_read,bytes_written;
   334     int i,line,col;
   335     if (!g_file_get_contents(filename,&contents,&len,err))
   336 	return NULL;
   337     if (g_utf8_validate(contents,len,NULL))
   338     {
   339 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
   340 	g_set_print_handler(print_as_utf_8);
   341 #ifdef __WIN32__
   342 	SetConsoleOutputCP(CP_UTF8);
   343 #endif
   344     }
   345     else
   346     {
   347 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,
   348 	  &bytes_written,&tmp_err);
   349 	if (g_error_matches(tmp_err,G_CONVERT_ERROR,
   350 	  G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
   351 	{
   352 	    line=col=1;
   353 	    for(i=0;i<bytes_read;i++)
   354 		if (contents[i]=='\n')
   355 		{
   356 		    line++;
   357 		    col=1;
   358 		}
   359 		else if (contents[i]!='\r')
   360 		    col++;
   361 	    g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
   362 	      "Input conversion failed. Byte %d at line %d, column %d is not a "
   363 	      "valid Windows-1252 character",
   364 	      ((unsigned char *)contents)[bytes_read],line,col);
   365 	}
   366 	else if (tmp_err)
   367 	    g_propagate_error(err,tmp_err);
   368 	g_set_print_handler(print_as_windows_1252);
   369 #ifdef __WIN32__
   370 	SetConsoleOutputCP(1252);
   371 #endif
   372     }
   373     g_free(contents);
   374     return utf8;
   375 }
   376 
/*
 * cleanup_on_exit:
 *
 * Exit handler registered by main() on Windows: puts back the console
 * output code page that was saved in saved_cp. Does nothing elsewhere.
 */
void cleanup_on_exit(void)
{
#ifdef __WIN32__
    SetConsoleOutputCP(saved_cp);
#endif
}
   383 
   384 int main(int argc,char **argv)
   385 {
   386 #ifdef __WIN32__
   387     atexit(cleanup_on_exit);
   388     saved_cp=GetConsoleOutputCP();
   389 #endif
   390     running_from=g_path_get_dirname(argv[0]);
   391     parse_options(&argc,&argv);
   392     if (pswit[USERTYPO_SWITCH])
   393 	read_user_scannos();
   394     fprintf(stderr,"bookloupe: Check and report on an e-text\n");
   395     procfile(argv[1]);
   396     if (pswit[OVERVIEW_SWITCH])
   397     {
   398 	g_print("    Checked %ld lines of %ld (head+foot = %ld)\n\n",
   399 	  checked_linecnt,linecnt,linecnt-checked_linecnt);
   400 	g_print("    --------------- Queries found --------------\n");
   401 	if (cnt_long)
   402 	    g_print("    Long lines:		    %14ld\n",cnt_long);
   403 	if (cnt_short)
   404 	    g_print("    Short lines:		   %14ld\n",cnt_short);
   405 	if (cnt_lineend)
   406 	    g_print("    Line-end problems:	     %14ld\n",cnt_lineend);
   407 	if (cnt_word)
   408 	    g_print("    Common typos:		  %14ld\n",cnt_word);
   409 	if (cnt_quote)
   410 	    g_print("    Unmatched quotes:	      %14ld\n",cnt_quote);
   411 	if (cnt_brack)
   412 	    g_print("    Unmatched brackets:	    %14ld\n",cnt_brack);
   413 	if (cnt_bin)
   414 	    g_print("    Non-ASCII characters:	  %14ld\n",cnt_bin);
   415 	if (cnt_odd)
   416 	    g_print("    Proofing characters:	   %14ld\n",cnt_odd);
   417 	if (cnt_punct)
   418 	    g_print("    Punctuation & spacing queries: %14ld\n",cnt_punct);
   419 	if (cnt_dash)
   420 	    g_print("    Non-standard dashes:	   %14ld\n",cnt_dash);
   421 	if (cnt_html)
   422 	    g_print("    Possible HTML tags:	    %14ld\n",cnt_html);
   423 	g_print("\n");
   424 	g_print("    TOTAL QUERIES		  %14ld\n",
   425 	  cnt_quote+cnt_brack+cnt_bin+cnt_odd+cnt_long+cnt_short+cnt_punct+
   426 	  cnt_dash+cnt_word+cnt_html+cnt_lineend);
   427     }
   428     g_free(running_from);
   429     if (usertypo)
   430 	g_tree_unref(usertypo);
   431     return 0;
   432 }
   433 
   434 void count_dashes(const char *line,const char *dash,
   435   struct dash_results *results)
   436 {
   437     int i;
   438     gchar **tokens;
   439     gunichar pc,nc;
   440     gboolean spaced=FALSE,unspaced=FALSE,spaced2=FALSE;
   441     if (!*line)
   442 	return;
   443     tokens=g_strsplit(line,dash,0);
   444     if (tokens[1])
   445 	results->base++;
   446     for(i=1;tokens[i];i++)
   447     {
   448 	pc=g_utf8_get_char(g_utf8_prev_char(tokens[i-1]+strlen(tokens[i-1])));
   449 	nc=g_utf8_get_char(tokens[i]);
   450 	if (g_unichar_isspace(pc) || g_unichar_isspace(nc))
   451 	    spaced=TRUE;
   452 	if (g_unichar_isspace(pc) && g_unichar_isspace(nc))
   453 	    spaced2=TRUE;
   454 	else if (!g_unichar_isspace(pc) && !g_unichar_isspace(nc))
   455 	    unspaced=TRUE;
   456     }
   457     if (spaced)
   458 	results->space++;
   459     if (spaced2)
   460 	/* count of lines with em-dashes with spaces both sides */
   461 	results->non_PG_space++;
   462     if (unspaced)
   463 	/* count of lines with PG-type em-dashes with no spaces */
   464 	results->PG_space++;
   465     g_strfreev(tokens);
   466 }
   467 
/*
 * first_pass:
 *
 * Run a first pass - verify that it's a valid PG
 * file, decide whether to report some things that
 * occur many times in the text like long or short
 * lines, non-standard dashes, etc.
 *
 * The text is split on '\n' and every line is counted in the global
 * linecnt. Lines from the detected footer onwards are excluded from all
 * the other statistics.
 *
 * Returns: a pointer to a function-static results structure, so the
 * totals accumulate if this is called more than once.
 */
struct first_pass_results *first_pass(const char *etext)
{
    gunichar laststart=CHAR_SPACE;
    const char *s;
    gchar *lc_line;
    int i,j,lbytes,llen;
    gchar **lines;
    unsigned int lastlen=0,lastblen=0;
    long spline=0,nspline=0;
    static struct first_pass_results results={0};
    struct dash_results tmp_dash_results;
    gchar *inword;
    QuoteClass qc;
    lines=g_strsplit(etext,"\n",0);
    for (j=0;lines[j];j++)
    {
	/* lbytes is the length in bytes, llen the length in characters */
	lbytes=strlen(lines[j]);
	/* strip any carriage returns left at the end of the line */
	while (lbytes>0 && lines[j][lbytes-1]=='\r')
	    lines[j][--lbytes]='\0';
	llen=g_utf8_strlen(lines[j],lbytes);
	linecnt++;
	/* end of the "SMALL PRINT" header */
	if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
	  (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
	{
	    if (spline)
		g_print("   --> Duplicate header?\n");
	    spline=linecnt+1;   /* first line of non-header text, that is */
	}
	/* "*** START ... PROJECT GUTENBERG" header line */
	if (!strncmp(lines[j],"*** START",9) &&
	  strstr(lines[j],"PROJECT GUTENBERG"))
	{
	    if (nspline)
		g_print("   --> Duplicate header?\n");
	    nspline=linecnt+1;   /* first line of non-header text, that is */
	}
	/*
	 * Once a header has been seen, look for the footer: a line with
	 * "end" somewhere before "project gutenberg", ignoring case.
	 */
	if (spline || nspline)
	{
	    lc_line=g_utf8_strdown(lines[j],lbytes);
	    if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
	    {
		if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
		{
		    if (results.footerline)
		    {
			/* it's an old-form header - we can detect duplicates */
			if (!nspline)
			    g_print("   --> Duplicate footer?\n");
		    }
		    else
			results.footerline=linecnt;
		}
	    }
	    g_free(lc_line);
	}
	if (spline)
	    results.firstline=spline;
	if (nspline)
	    results.firstline=nspline;  /* override with new */
	if (results.footerline)
	    continue;    /* don't count the boilerplate in the footer */
	results.totlen+=llen;
	/*
	 * Per-character counts: code points above 127, alphabetic
	 * characters, and closing or neutral double quotes that come
	 * straight after a letter.
	 */
	for (s=lines[j];*s;s=g_utf8_next_char(s))
	{
	    if (g_utf8_get_char(s)>127)
		results.binlen++;
	    if (g_unichar_isalpha(g_utf8_get_char(s)))
		results.alphalen++;
	    if (s>lines[j])
	    {
		if (CHAR_IS_DQUOTE(g_utf8_get_char(s)))
		    qc=QUOTE_CLASS(g_utf8_get_char(s));
		else
		    qc=INVALID_QUOTE;
		if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) &&
		  g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
		    results.endquote_count++;
	    }
	}
	/*
	 * Short line test: this is about the PREVIOUS line (lastlen),
	 * judged against this line and the one before it (lastblen).
	 */
	if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
	  lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
	    results.shortline++;
	/* last character is a space or a control character */
	if (lbytes>0 &&
	  g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
	    cnt_spacend++;
	if (strstr(lines[j],".,"))
	    results.dotcomma++;
	/* only count ast lines for ignoring purposes where there is */
	/* locase text on the line */
	if (strchr(lines[j],'*'))
	{
	    for (s=lines[j];*s;s=g_utf8_next_char(s))
		if (g_unichar_islower(g_utf8_get_char(s)))
		    break;
	    if (*s)
		results.astline++;
	}
	if (strchr(lines[j],'/'))
	    results.fslashline++;
	/*
	 * Hyphen at end of line: skip back over trailing characters up to
	 * CHAR_SPACE, then look for a single '-' (not the end of "--").
	 */
	if (lbytes>0)
	{
	    for (s=g_utf8_prev_char(lines[j]+lbytes);
	      s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;
	      s=g_utf8_prev_char(s))
		;
	    if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
	      g_utf8_get_char(g_utf8_prev_char(s))!='-')
		results.hyphens++;
	}
	if (llen>LONGEST_PG_LINE)
	    results.longline++;
	if (llen>WAY_TOO_LONG)
	    results.verylongline++;
	/*
	 * Crude HTML detection: a '<' with a '>' at or after it scores
	 * one point, and a literal "<i>" scores four more.
	 */
	if (strchr(lines[j],'<') && strchr(lines[j],'>'))
	{
	    i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
	    if (i>0)
		results.htmcount++;
	    if (strstr(lines[j],"<i>"))
		results.htmcount+=4; /* bonus marks! */
	}
	/* Check for spaced em-dashes */
	/* (both "--" and the em-dash character; counted once per line) */
	memset(&tmp_dash_results,0,sizeof(tmp_dash_results));
	count_dashes(lines[j],"--",&tmp_dash_results);
	count_dashes(lines[j],"—",&tmp_dash_results);
	if (tmp_dash_results.base)
	    results.emdash.base++;
	if (tmp_dash_results.non_PG_space)
	    results.emdash.non_PG_space++;
	if (tmp_dash_results.PG_space)
	    results.emdash.PG_space++;
	/*
	 * Word-by-word scan: telltale Dutch and French words, and
	 * standalone "0" and "1".
	 */
	for (s=lines[j];*s;)
	{
	    inword=getaword(&s);
	    if (!strcmp(inword,"hij") || !strcmp(inword,"niet")) 
		results.Dutchcount++;
	    if (!strcmp(inword,"dans") || !strcmp(inword,"avec")) 
		results.Frenchcount++;
	    if (!strcmp(inword,"0") || !strcmp(inword,"1")) 
		results.standalone_digit++;
	    g_free(inword);
	}
	/* Check for spaced dashes */
	/* (only the first " -" on the line is examined) */
	if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
	    results.spacedash++;
	/* remember this line's details for the next short line test */
	lastblen=lastlen;
	lastlen=llen;
	laststart=lines[j][0];
    }
    g_strfreev(lines);
    return &results;
}
   627 
/*
 * report_first_pass:
 *
 * Make some snap decisions based on the first pass results.
 *
 * Each warnings.* flag starts out enabled and is switched off when the
 * first pass found so many instances that reporting them individually
 * would be noise; a "-->" note is printed for every decision taken.
 * Verbose mode switches all of them back on again.
 *
 * May set pswit[MARKUP_SWITCH], may reset results->firstline to 0, and
 * exits with status 1 if the file does not look like text.
 *
 * Returns: a pointer to a function-static warnings structure.
 */
struct warnings *report_first_pass(struct first_pass_results *results)
{
    static struct warnings warnings={0};
    if (cnt_spacend>0)
	g_print("   --> %ld lines in this file have white space at end\n",
	  cnt_spacend);
    /* If more than 5 lines contain ".,", don't bother reporting them. */
    warnings.dotcomma=1;
    if (results->dotcomma>5)
    {
	warnings.dotcomma=0;
	g_print("   --> %ld lines in this file contain '.,'. "
	  "Not reporting them.\n",results->dotcomma);
    }
    /*
     * If more than 50 lines, or one-tenth, are short,
     * don't bother reporting them.
     */
    warnings.shortline=1;
    if (results->shortline>50 || results->shortline*10>linecnt)
    {
	warnings.shortline=0;
	g_print("   --> %ld lines in this file are short. "
	  "Not reporting short lines.\n",results->shortline);
    }
    /*
     * If more than 50 lines, or one-tenth, are long,
     * don't bother reporting them.
     */
    warnings.longline=1;
    if (results->longline>50 || results->longline*10>linecnt)
    {
	warnings.longline=0;
	g_print("   --> %ld lines in this file are long. "
	  "Not reporting long lines.\n",results->longline);
    }
    /* If more than 10 lines contain asterisks, don't bother reporting them. */
    warnings.ast=1;
    if (results->astline>10)
    {
	warnings.ast=0;
	g_print("   --> %ld lines in this file contain asterisks. "
	  "Not reporting them.\n",results->astline);
    }
    /*
     * If more than 10 lines contain forward slashes,
     * don't bother reporting them.
     */
    warnings.fslash=1;
    if (results->fslashline>10)
    {
	warnings.fslash=0;
	g_print("   --> %ld lines in this file contain forward slashes. "
	  "Not reporting them.\n",results->fslashline);
    }
    /*
     * If more than 20 lines contain unpunctuated endquotes,
     * don't bother reporting them.
     */
    warnings.endquote=1;
    if (results->endquote_count>20)
    {
	warnings.endquote=0;
	g_print("   --> %ld lines in this file contain unpunctuated endquotes. "
	  "Not reporting them.\n",results->endquote_count);
    }
    /*
     * If more than 10 lines contain standalone digits,
     * don't bother reporting them.
     */
    warnings.digit=1;
    if (results->standalone_digit>10)
    {
	warnings.digit=0;
	g_print("   --> %ld lines in this file contain standalone 0s and 1s. "
	  "Not reporting them.\n",results->standalone_digit);
    }
    /*
     * If more than 20 lines contain hyphens at end,
     * don't bother reporting them.
     */
    warnings.hyphen=1;
    if (results->hyphens>20)
    {
	warnings.hyphen=0;
	g_print("   --> %ld lines in this file have hyphens at end. "
	  "Not reporting them.\n",results->hyphens);
    }
    /* A high HTML score turns markup handling on automatically. */
    if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
    {
	g_print("   --> Looks like this is HTML. Switching HTML mode ON.\n");
	pswit[MARKUP_SWITCH]=1;
    }
    if (results->verylongline>0)
	g_print("   --> %ld lines in this file are VERY long!\n",
	  results->verylongline);
    /*
     * If there are more non-PG spaced dashes than PG em-dashes,
     * assume it's deliberate.
     * Current PG guidelines say don't use them, but older texts do,
     * and some people insist on them whatever the guidelines say.
     */
    warnings.dash=1;
    if (results->spacedash+results->emdash.non_PG_space>
      results->emdash.PG_space)
    {
	warnings.dash=0;
	g_print("   --> There are %ld spaced dashes and em-dashes. "
	  "Not reporting them.\n",
	  results->spacedash+results->emdash.non_PG_space);
    }
    /* If more than a quarter of characters are hi-bit, bug out. */
    warnings.bin=1;
    if (results->binlen*4>results->totlen)
    {
	g_print("   --> This file does not appear to be ASCII. "
	  "Terminating. Best of luck with it!\n");
	exit(1);
    }
    /* Likewise if fewer than a quarter of characters are alphabetic. */
    if (results->alphalen*4<results->totlen)
    {
	g_print("   --> This file does not appear to be text. "
	  "Terminating. Best of luck with it!\n");
	exit(1);
    }
    /*
     * If more than one in a hundred characters, or more than 100 in
     * all, are hi-bit, don't bother reporting them.
     */
    if (results->binlen*100>results->totlen || results->binlen>100)
    {
	g_print("   --> There are a lot of foreign letters here. "
	  "Not reporting them.\n");
	warnings.bin=0;
    }
    /* More than 50 telltale words mark the text as Dutch or French. */
    warnings.isDutch=FALSE;
    if (results->Dutchcount>50)
    {
	warnings.isDutch=TRUE;
	g_print("   --> This looks like Dutch - "
	  "switching off dashes and warnings for 's Middags case.\n");
    }
    warnings.isFrench=FALSE;
    if (results->Frenchcount>50)
    {
	warnings.isFrench=TRUE;
	g_print("   --> This looks like French - "
	  "switching off some doublepunct.\n");
    }
    if (results->firstline && results->footerline)
	g_print("    The PG header and footer appear to be already on.\n");
    else
    {
	if (results->firstline)
	    g_print("    The PG header is on - no footer.\n");
	if (results->footerline)
	    g_print("    The PG footer is on - no header.\n");
    }
    g_print("\n");
    /* Verbose mode overrides every "not reporting" decision made above. */
    if (pswit[VERBOSE_SWITCH])
    {
	warnings.bin=1;
	warnings.shortline=1;
	warnings.dotcomma=1;
	warnings.longline=1;
	warnings.dash=1;
	warnings.digit=1;
	warnings.ast=1;
	warnings.fslash=1;
	warnings.hyphen=1;
	warnings.endquote=1;
	g_print("   *** Verbose output is ON -- you asked for it! ***\n");
    }
    /* Dutch text never gets dash warnings, even in verbose mode. */
    if (warnings.isDutch)
	warnings.dash=0;
    /*
     * Fewer than 100 lines between header and footer is not a plausible
     * text body, so stop treating the header as the start of the text.
     */
    if (results->footerline>0 && results->firstline>0 &&
      results->footerline>results->firstline &&
      results->footerline-results->firstline<100)
    {
	g_print("   --> I don't really know where this text starts. \n");
	g_print("       There are no reference points.\n");
	g_print("       I'm going to have to report the header and footer "
	  "as well.\n");
	results->firstline=0;
    }
    return &warnings;
}
   815 
   816 /*
   817  * analyse_quotes:
   818  *
   819  * Look along the line, accumulate the count of quotes, and see
   820  * if this is an empty line - i.e. a line with nothing on it
   821  * but spaces.
   822  * If line has just spaces, period, * and/or - on it, don't
   823  * count it, since empty lines with asterisks or dashes to
   824  * separate sections are common.
   825  *
   826  * Returns: TRUE if the line is empty.
   827  */
   828 gboolean analyse_quotes(const char *aline,struct counters *counters)
   829 {
   830     int guessquote=0;
   831     /* assume the line is empty until proven otherwise */
   832     gboolean isemptyline=TRUE;
   833     const char *s=aline,*sprev,*snext;
   834     gunichar c;
   835     sprev=NULL;
   836     GError *tmp_err=NULL;
   837     while (*s)
   838     {
   839 	snext=g_utf8_next_char(s);
   840 	c=g_utf8_get_char(s);
   841 	if (CHAR_IS_DQUOTE(c))
   842 	    (void)count_quote(counters,c,QUOTE_CLASS(c),&tmp_err);
   843 	else if (CHAR_IS_SQUOTE(c) && pswit[SQUOTE_SWITCH])
   844 	{
   845 	    if (s==aline)
   846 	    {
   847 		/*
   848 		 * At start of line, it can only be a quotation mark.
   849 		 * Hardcode a very common exception!
   850 		 */
   851 		if (!g_str_has_prefix(snext,"tis") &&
   852 		  !g_str_has_prefix(snext,"Tis"))
   853 		    (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
   854 	    }
   855 	    else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
   856 	      g_unichar_isalpha(g_utf8_get_char(snext)))
   857 		/* Do nothing! it's definitely an apostrophe, not a quote */
   858 		;
   859 	    /* it's outside a word - let's check it out */
   860 	    else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||
   861 	      g_unichar_isalpha(g_utf8_get_char(snext)))
   862 	    {
   863 		/* certainly looks like a quotation mark */
   864 		if (!g_str_has_prefix(snext,"tis") &&
   865 		  !g_str_has_prefix(snext,"Tis"))
   866 		    /* hardcode a very common exception! */
   867 		{
   868 		    if (strchr(".?!,;:",g_utf8_get_char(sprev)))
   869 			(void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
   870 		    else
   871 			(void)count_quote(counters,c,OPENING_QUOTE,&tmp_err);
   872 		}
   873 	    }
   874 	    else
   875 	    {
   876 		/* now - is it a quotation mark? */
   877 		guessquote=0;   /* accumulate clues */
   878 		if (g_unichar_isalpha(g_utf8_get_char(sprev)))
   879 		{
   880 		    /* it follows a letter - could be either */
   881 		    guessquote++;
   882 		    if (g_utf8_get_char(sprev)=='s')
   883 		    {
   884 			/* looks like a plural apostrophe */
   885 			guessquote-=3;
   886 			if (g_utf8_get_char(snext)==CHAR_SPACE)
   887 			    /* bonus marks! */
   888 			    guessquote-=2;
   889 		    }
   890 		    if (innermost_quote_matches(counters,c))
   891 			/*
   892 			 * Give it the benefit of some doubt,
   893 			 * if a squote is already open.
   894 			 */
   895 			guessquote++;
   896 		    else
   897 			guessquote--;
   898 		    if (guessquote>=0)
   899 			(void)count_quote(counters,c,CLOSING_QUOTE,&tmp_err);
   900 		}
   901 		else
   902 		    /* no adjacent letter - it must be a quote of some kind */
   903 		    (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
   904 	    }
   905 	}
   906 	if (tmp_err)
   907 	{
   908 	    if (pswit[ECHO_SWITCH])
   909 		g_print("\n%s\n",aline);
   910 	    if (!pswit[OVERVIEW_SWITCH])
   911 		g_print("    Line %ld column %ld - %s\n",
   912 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1,tmp_err->message);
   913 	    g_clear_error(&tmp_err);
   914 	}
   915 	if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
   916 	  c!='\r' && c!='\n')
   917 	    isemptyline=FALSE;  /* ignore lines like  *  *  *  as spacers */
   918 	if (c==CHAR_UNDERSCORE)
   919 	    counters->c_unders++;
   920 	if (c==CHAR_OPEN_SBRACK)
   921 	{
   922 	    if (!matching_difference(counters,COUNTER_ILLUSTRATION) &&
   923 	      !matching_difference(counters,c) && s==aline &&
   924 	      g_str_has_prefix(s,"[Illustration:"))
   925 		increment_matching(counters,COUNTER_ILLUSTRATION,TRUE);
   926 	    else
   927 		increment_matching(counters,c,TRUE);
   928 	}
   929 	else if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK)
   930 	    increment_matching(counters,c,TRUE);
   931 	if (c==CHAR_CLOSE_SBRACK)
   932 	{
   933 	    if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) &&
   934 	      !matching_difference(counters,c) && !*snext)
   935 		increment_matching(counters,COUNTER_ILLUSTRATION,FALSE);
   936 	    else
   937 		increment_matching(counters,c,FALSE);
   938 	}
   939 	else if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK)
   940 	    increment_matching(counters,c,FALSE);
   941 	sprev=s;
   942 	s=snext;
   943     }
   944     return isemptyline;
   945 }
   946 
   947 /*
   948  * check_for_control_characters:
   949  *
   950  * Check for invalid or questionable characters in the line
   951  * Anything above 127 is invalid for plain ASCII, and
   952  * non-printable control characters should also be flagged.
   953  * Tabs should generally not be there.
   954  */
   955 void check_for_control_characters(const char *aline)
   956 {
   957     gunichar c;
   958     const char *s;
   959     for (s=aline;*s;s=g_utf8_next_char(s))
   960     {
   961 	c=g_utf8_get_char(s);
   962 	if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
   963 	{
   964 	    if (pswit[ECHO_SWITCH])
   965 		g_print("\n%s\n",aline);
   966 	    if (!pswit[OVERVIEW_SWITCH])
   967 		g_print("    Line %ld column %ld - Control character %u\n",
   968 		  linecnt,g_utf8_pointer_to_offset(s,aline)+1,c);
   969 	    else
   970 		cnt_bin++;
   971 	}
   972     }
   973 }
   974 
   975 /*
   976  * check_for_odd_characters:
   977  *
   978  * Check for binary and other odd characters.
   979  */
   980 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
   981   gboolean isemptyline)
   982 {
   983     /* Don't repeat multiple warnings on one line. */
   984     gboolean eNon_A=FALSE,eTab=FALSE,eTilde=FALSE;
   985     gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
   986     const char *s;
   987     gunichar c;
   988     for (s=aline;*s;s=g_utf8_next_char(s))
   989     {
   990 	c=g_utf8_get_char(s);
   991 	if (!eNon_A && (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
   992 	{
   993 	    if (pswit[ECHO_SWITCH])
   994 		g_print("\n%s\n",aline);
   995 	    if (!pswit[OVERVIEW_SWITCH])
   996 		if (c>127 && c<160 || c>255)
   997 		    g_print("    Line %ld column %ld - "
   998 		      "Non-ISO-8859 character %u\n",
   999 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1000 		else
  1001 		    g_print("    Line %ld column %ld - "
  1002 		      "Non-ASCII character %u\n",
  1003 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
  1004 	    else
  1005 		cnt_bin++;
  1006 	    eNon_A=TRUE;
  1007 	}
  1008 	if (!eTab && c==CHAR_TAB)
  1009 	{
  1010 	    if (pswit[ECHO_SWITCH])
  1011 		g_print("\n%s\n",aline);
  1012 	    if (!pswit[OVERVIEW_SWITCH])
  1013 		g_print("    Line %ld column %ld - Tab character?\n",
  1014 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1015 	    else
  1016 		cnt_odd++;
  1017 	    eTab=TRUE;
  1018 	}
  1019 	if (!eTilde && c==CHAR_TILDE)
  1020 	{
  1021 	    /*
  1022 	     * Often used by OCR software to indicate an
  1023 	     * unrecognizable character.
  1024 	     */
  1025 	    if (pswit[ECHO_SWITCH])
  1026 		g_print("\n%s\n",aline);
  1027 	    if (!pswit[OVERVIEW_SWITCH])
  1028 		g_print("    Line %ld column %ld - Tilde character?\n",
  1029 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1030 	    else
  1031 		cnt_odd++;
  1032 	    eTilde=TRUE;
  1033 	}
  1034 	if (!eCarat && c==CHAR_CARAT)
  1035 	{  
  1036 	    if (pswit[ECHO_SWITCH])
  1037 		g_print("\n%s\n",aline);
  1038 	    if (!pswit[OVERVIEW_SWITCH])
  1039 		g_print("    Line %ld column %ld - Carat character?\n",
  1040 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1041 	    else
  1042 		cnt_odd++;
  1043 	    eCarat=TRUE;
  1044 	}
  1045 	if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
  1046 	{  
  1047 	    if (pswit[ECHO_SWITCH])
  1048 		g_print("\n%s\n",aline);
  1049 	    if (!pswit[OVERVIEW_SWITCH])
  1050 		g_print("    Line %ld column %ld - Forward slash?\n",
  1051 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1052 	    else
  1053 		cnt_odd++;
  1054 	    eFSlash=TRUE;
  1055 	}
  1056 	/*
  1057 	 * Report asterisks only in paranoid mode,
  1058 	 * since they're often deliberate.
  1059 	 */
  1060 	if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
  1061 	  c==CHAR_ASTERISK)
  1062 	{
  1063 	    if (pswit[ECHO_SWITCH])
  1064 		g_print("\n%s\n",aline);
  1065 	    if (!pswit[OVERVIEW_SWITCH])
  1066 		g_print("    Line %ld column %ld - Asterisk?\n",
  1067 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1068 	    else
  1069 		cnt_odd++;
  1070 	    eAst=TRUE;
  1071 	}
  1072     }
  1073 }
  1074 
  1075 /*
  1076  * check_for_long_line:
  1077  *
  1078  * Check for line too long.
  1079  */
  1080 void check_for_long_line(const char *aline)
  1081 {
  1082     if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
  1083     {
  1084 	if (pswit[ECHO_SWITCH])
  1085 	    g_print("\n%s\n",aline);
  1086 	if (!pswit[OVERVIEW_SWITCH])
  1087 	    g_print("    Line %ld column %ld - Long line %ld\n",
  1088 	      linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
  1089 	else
  1090 	    cnt_long++;
  1091     }
  1092 }
  1093 
  1094 /*
  1095  * check_for_short_line:
  1096  *
  1097  * Check for line too short.
  1098  *
  1099  * This one is a bit trickier to implement: we don't want to
  1100  * flag the last line of a paragraph for being short, so we
  1101  * have to wait until we know that our current line is a
  1102  * "normal" line, then report the _previous_ line if it was too
  1103  * short. We also don't want to report indented lines like
  1104  * chapter heads or formatted quotations. We therefore keep
  1105  * last->len as the length of the last line examined, and
  1106  * last->blen as the length of the last but one, and try to
  1107  * suppress unnecessary warnings by checking that both were of
  1108  * "normal" length. We keep the first character of the last
  1109  * line in last->start, and if it was a space, we assume that
  1110  * the formatting is deliberate. I can't figure out a way to
  1111  * distinguish something like a quoted verse left-aligned or
  1112  * the header or footer of a letter from a paragraph of short
  1113  * lines - maybe if I examined the whole paragraph, and if the
  1114  * para has less than, say, 8 lines and if all lines are short,
  1115  * then just assume it's OK? Need to look at some texts to see
  1116  * how often a formula like this would get the right result.
  1117  */
  1118 void check_for_short_line(const char *aline,const struct line_properties *last)
  1119 {
  1120     if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
  1121       last->len<SHORTEST_PG_LINE && last->blen>1 &&
  1122       last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
  1123     {
  1124 	if (pswit[ECHO_SWITCH])
  1125 	    g_print("\n%s\n",prevline);
  1126 	if (!pswit[OVERVIEW_SWITCH])
  1127 	    g_print("    Line %ld column %ld - Short line %ld?\n",
  1128 	      linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
  1129 	else
  1130 	    cnt_short++;
  1131     }
  1132 }
  1133 
  1134 /*
  1135  * check_for_starting_punctuation:
  1136  *
  1137  * Look for punctuation other than full ellipses at start of line.
  1138  */
  1139 void check_for_starting_punctuation(const char *aline)
  1140 {
  1141     if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
  1142       !g_str_has_prefix(aline,". . ."))
  1143     {
  1144 	if (pswit[ECHO_SWITCH])
  1145 	    g_print("\n%s\n",aline);
  1146 	if (!pswit[OVERVIEW_SWITCH])
  1147 	    g_print("    Line %ld column 1 - Begins with punctuation?\n",
  1148 	      linecnt);
  1149 	else
  1150 	    cnt_punct++;
  1151     }
  1152 }
  1153 
  1154 /*
  1155  * str_emdash:
  1156  *
  1157  * Find the first em-dash, return a pointer to it and set <next> to the
  1158  * character following the dash.
  1159  */
  1160 char *str_emdash(const char *s,const char **next)
  1161 {
  1162     const char *s1,*s2;
  1163     s1=strstr(s,"--");
  1164     s2=strstr(s,"—");
  1165     if (!s1)
  1166     {
  1167 	if (s2)
  1168 	    *next=g_utf8_next_char(s2);
  1169 	return (char *)s2;
  1170     }
  1171     else if (!s2)
  1172     {
  1173 	*next=g_utf8_next_char(g_utf8_next_char(s1));
  1174 	return (char *)s1;
  1175     }
  1176     else if (s1<s2)
  1177     {
  1178 	*next=g_utf8_next_char(g_utf8_next_char(s1));
  1179 	return (char *)s1;
  1180     }
  1181     else
  1182     {
  1183 	*next=g_utf8_next_char(s2);
  1184 	return (char *)s2;
  1185     }
  1186 }
  1187 
  1188 /*
  1189  * check_for_spaced_emdash:
  1190  *
  1191  * Check for spaced em-dashes.
  1192  *
  1193  * We must check _all_ occurrences of em-dashes on the line
  1194  * hence the loop - even if the first dash is OK
  1195  * there may be another that's wrong later on.
  1196  */
  1197 void check_for_spaced_emdash(const char *aline)
  1198 {
  1199     const char *s,*t,*next;
  1200     for (s=aline;t=str_emdash(s,&next);s=next)
  1201     {
  1202 	if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
  1203 	  g_utf8_get_char(next)==CHAR_SPACE)
  1204 	{
  1205 	    if (pswit[ECHO_SWITCH])
  1206 		g_print("\n%s\n",aline);
  1207 	    if (!pswit[OVERVIEW_SWITCH])
  1208 		g_print("    Line %ld column %ld - Spaced em-dash?\n",
  1209 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  1210 	    else
  1211 		cnt_dash++;
  1212 	}
  1213     }
  1214 }
  1215 
  1216 /*
  1217  * check_for_spaced_dash:
  1218  *
  1219  * Check for spaced dashes.
  1220  */
  1221 void check_for_spaced_dash(const char *aline)
  1222 {
  1223     const char *s;
  1224     if ((s=strstr(aline," -")))
  1225     {
  1226 	if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
  1227 	{
  1228 	    if (pswit[ECHO_SWITCH])
  1229 		g_print("\n%s\n",aline);
  1230 	    if (!pswit[OVERVIEW_SWITCH])
  1231 		g_print("    Line %ld column %ld - Spaced dash?\n",
  1232 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1233 	    else
  1234 		cnt_dash++;
  1235 	}
  1236     }
  1237     else if ((s=strstr(aline,"- ")))
  1238     {
  1239 	if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
  1240 	{
  1241 	    if (pswit[ECHO_SWITCH])
  1242 		g_print("\n%s\n",aline);
  1243 	    if (!pswit[OVERVIEW_SWITCH])
  1244 		g_print("    Line %ld column %ld - Spaced dash?\n",
  1245 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1246 	    else
  1247 		cnt_dash++;
  1248 	}
  1249     }
  1250 }
  1251 
  1252 /*
  1253  * check_for_unmarked_paragraphs:
  1254  *
  1255  * Check for unmarked paragraphs indicated by separate speakers.
  1256  *
  1257  * May well be false positive:
  1258  * "Bravo!" "Wonderful!" called the crowd.
  1259  * but useful all the same.
  1260  */
  1261 void check_for_unmarked_paragraphs(const char *aline)
  1262 {
  1263     const char *s;
  1264     s=strstr(aline,"\"  \"");
  1265     if (!s)
  1266 	s=strstr(aline,"\" \"");
  1267     if (s)
  1268     {
  1269 	if (pswit[ECHO_SWITCH])
  1270 	    g_print("\n%s\n",aline);
  1271 	if (!pswit[OVERVIEW_SWITCH])
  1272 	    g_print("    Line %ld column %ld - "
  1273 	      "Query missing paragraph break?\n",
  1274 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1275 	else
  1276 	    cnt_punct++;
  1277     }
  1278 }
  1279 
  1280 /*
  1281  * check_for_jeebies:
  1282  *
  1283  * Check for "to he" and other easy h/b errors.
  1284  *
  1285  * This is a very inadequate effort on the h/b problem,
  1286  * but the phrase "to he" is always an error, whereas "to
  1287  * be" is quite common.
  1288  * Similarly, '"Quiet!", be said.' is a non-be error
  1289  * "to he" is _not_ always an error!:
  1290  *       "Where they went to he couldn't say."
  1291  * Another false positive:
  1292  *       What would "Cinderella" be without the . . .
  1293  * and another: "If he wants to he can see for himself."
  1294  */
  1295 void check_for_jeebies(const char *aline)
  1296 {
  1297     const char *s;
  1298     s=strstr(aline," be could ");
  1299     if (!s)
  1300 	s=strstr(aline," be would ");
  1301     if (!s)
  1302 	s=strstr(aline," was be ");
  1303     if (!s)
  1304 	s=strstr(aline," be is ");
  1305     if (!s)
  1306 	s=strstr(aline," is be ");
  1307     if (!s)
  1308 	s=strstr(aline,"\", be ");
  1309     if (!s)
  1310 	s=strstr(aline,"\" be ");
  1311     if (!s)
  1312 	s=strstr(aline,"\" be ");
  1313     if (!s)
  1314 	s=strstr(aline," to he ");
  1315     if (s)
  1316     {
  1317 	if (pswit[ECHO_SWITCH])
  1318 	    g_print("\n%s\n",aline);
  1319 	if (!pswit[OVERVIEW_SWITCH])
  1320 	    g_print("    Line %ld column %ld - Query he/be error?\n",
  1321 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1322 	else
  1323 	    cnt_word++;
  1324     }
  1325     s=strstr(aline," the had ");
  1326     if (!s)
  1327 	s=strstr(aline," a had ");
  1328     if (!s)
  1329 	s=strstr(aline," they bad ");
  1330     if (!s)
  1331 	s=strstr(aline," she bad ");
  1332     if (!s)
  1333 	s=strstr(aline," he bad ");
  1334     if (!s)
  1335 	s=strstr(aline," you bad ");
  1336     if (!s)
  1337 	s=strstr(aline," i bad ");
  1338     if (s)
  1339     {
  1340 	if (pswit[ECHO_SWITCH])
  1341 	    g_print("\n%s\n",aline);
  1342 	if (!pswit[OVERVIEW_SWITCH])
  1343 	    g_print("    Line %ld column %ld - Query had/bad error?\n",
  1344 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1345 	else
  1346 	    cnt_word++;
  1347     }
  1348     s=strstr(aline,"; hut ");
  1349     if (!s)
  1350 	s=strstr(aline,", hut ");
  1351     if (s)
  1352     {
  1353 	if (pswit[ECHO_SWITCH])
  1354 	    g_print("\n%s\n",aline);
  1355 	if (!pswit[OVERVIEW_SWITCH])
  1356 	    g_print("    Line %ld column %ld - Query hut/but error?\n",
  1357 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1358 	else
  1359 	    cnt_word++;
  1360     }
  1361 }
  1362 
  1363 /*
  1364  * check_for_mta_from:
  1365  *
  1366  * Special case - angled bracket in front of "From" placed there by an
  1367  * MTA when sending an e-mail.
  1368  */
  1369 void check_for_mta_from(const char *aline)
  1370 {
  1371     const char *s;
  1372     s=strstr(aline,">From");
  1373     if (s)
  1374     {
  1375 	if (pswit[ECHO_SWITCH])
  1376 	    g_print("\n%s\n",aline);
  1377 	if (!pswit[OVERVIEW_SWITCH])
  1378 	    g_print("    Line %ld column %ld - "
  1379 	      "Query angled bracket with From\n",
  1380 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1381 	else
  1382 	    cnt_punct++;
  1383     }
  1384 }
  1385 
  1386 /*
  1387  * check_for_orphan_character:
  1388  *
  1389  * Check for a single character line -
  1390  * often an overflow from bad wrapping.
  1391  */
  1392 void check_for_orphan_character(const char *aline)
  1393 {
  1394     gunichar c;
  1395     c=g_utf8_get_char(aline);
  1396     if (c && !*g_utf8_next_char(aline))
  1397     {
  1398 	if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
  1399 	    ; /* Nothing - ignore numerals alone on a line. */
  1400 	else
  1401 	{
  1402 	    if (pswit[ECHO_SWITCH])
  1403 		g_print("\n%s\n",aline);
  1404 	    if (!pswit[OVERVIEW_SWITCH])
  1405 		g_print("    Line %ld column 1 - Query single character line\n",
  1406 		  linecnt);
  1407 	    else
  1408 		cnt_punct++;
  1409 	}
  1410     }
  1411 }
  1412 
  1413 /*
  1414  * check_for_pling_scanno:
  1415  *
  1416  * Check for I" - often should be !
  1417  */
  1418 void check_for_pling_scanno(const char *aline)
  1419 {
  1420     const char *s;
  1421     s=strstr(aline," I\"");
  1422     if (s)
  1423     {
  1424 	if (pswit[ECHO_SWITCH])
  1425 	    g_print("\n%s\n",aline);
  1426 	if (!pswit[OVERVIEW_SWITCH])
  1427 	    g_print("    Line %ld column %ld - Query I=exclamation mark?\n",
  1428 	      linecnt,g_utf8_pointer_to_offset(aline,s));
  1429 	else
  1430 	    cnt_punct++;
  1431     }
  1432 }
  1433 
  1434 /*
  1435  * check_for_extra_period:
  1436  *
  1437  * Check for period without a capital letter. Cut-down from gutspell.
  1438  * Only works when it happens on a single line.
  1439  */
  1440 void check_for_extra_period(const char *aline,const struct warnings *warnings)
  1441 {
  1442     const char *s,*t,*s1,*sprev;
  1443     int i;
  1444     gsize len;
  1445     gboolean istypo;
  1446     gchar *testword;
  1447     gunichar c,nc,pc,*decomposition;
  1448     if (pswit[PARANOID_SWITCH])
  1449     {
  1450 	for (t=aline;t=strstr(t,". ");)
  1451 	{
  1452 	    if (t==aline)
  1453 	    {
  1454 		t=g_utf8_next_char(t);
  1455 		/* start of line punctuation is handled elsewhere */
  1456 		continue;
  1457 	    }
  1458 	    if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
  1459 	    {
  1460 		t=g_utf8_next_char(t);
  1461 		continue;
  1462 	    }
  1463 	    if (warnings->isDutch)
  1464 	    {
  1465 		/* For Frank & Jeroen -- 's Middags case */
  1466 		gunichar c2,c3,c4,c5;
  1467 		c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
  1468 		c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
  1469 		c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
  1470 		c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
  1471 		if (CHAR_IS_APOSTROPHE(c2) &&
  1472 		  g_unichar_islower(c3) && c4==CHAR_SPACE &&
  1473 		  g_unichar_isupper(c5))
  1474 		{
  1475 		    t=g_utf8_next_char(t);
  1476 		    continue;
  1477 		}
  1478 	    }
  1479 	    s1=g_utf8_next_char(g_utf8_next_char(t));
  1480 	    while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
  1481 	      !isdigit(g_utf8_get_char(s1)))
  1482 		s1=g_utf8_next_char(s1);
  1483 	    if (g_unichar_islower(g_utf8_get_char(s1)))
  1484 	    {
  1485 		/* we have something to investigate */
  1486 		istypo=TRUE;
  1487 		/* so let's go back and find out */
  1488 		nc=g_utf8_get_char(t);
  1489 		s1=g_utf8_prev_char(t);
  1490 		c=g_utf8_get_char(s1);
  1491 		sprev=g_utf8_prev_char(s1);
  1492 		pc=g_utf8_get_char(sprev);
  1493 		while (s1>=aline &&
  1494 		  (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||
  1495 		  g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&
  1496 		  g_unichar_isalpha(nc)))
  1497 		{
  1498 		    nc=c;
  1499 		    s1=sprev;
  1500 		    c=pc;
  1501 		    sprev=g_utf8_prev_char(s1);
  1502 		    pc=g_utf8_get_char(sprev);
  1503 		}
  1504 		s1=g_utf8_next_char(s1);
  1505 		s=strchr(s1,'.');
  1506 		if (s)
  1507 		    testword=g_strndup(s1,s-s1);
  1508 		else
  1509 		    testword=g_strdup(s1);
  1510 		for (i=0;*abbrev[i];i++)
  1511 		    if (!strcmp(testword,abbrev[i]))
  1512 			istypo=FALSE;
  1513 		if (g_unichar_isdigit(g_utf8_get_char(testword)))
  1514 		    istypo=FALSE;
  1515 		if (!*g_utf8_next_char(testword))
  1516 		    istypo=FALSE;
  1517 		if (isroman(testword))
  1518 		    istypo=FALSE;
  1519 		if (istypo)
  1520 		{
  1521 		    istypo=FALSE;
  1522 		    for (s=testword;*s;s=g_utf8_next_char(s))
  1523 		    {
  1524 			decomposition=g_unicode_canonical_decomposition(
  1525 			  g_utf8_get_char(s),&len);
  1526 			if (g_utf8_strchr("aeiou",-1,decomposition[0]))
  1527 			    istypo=TRUE;
  1528 			g_free(decomposition);
  1529 		    }
  1530 		}
  1531 		if (istypo &&
  1532 		  (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
  1533 		{
  1534 		    g_tree_insert(qperiod,g_strdup(testword),
  1535 		      GINT_TO_POINTER(1));
  1536 		    if (pswit[ECHO_SWITCH])
  1537 			g_print("\n%s\n",aline);
  1538 		    if (!pswit[OVERVIEW_SWITCH])
  1539 			g_print("    Line %ld column %ld - Extra period?\n",
  1540 			  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  1541 		    else
  1542 			cnt_punct++;
  1543 		}
  1544 		g_free(testword);
  1545 	    }
  1546 	    t=g_utf8_next_char(t);
  1547 	}
  1548     }
  1549 }
  1550 
  1551 /*
  1552  * check_for_following_punctuation:
  1553  *
  1554  * Check for words usually not followed by punctuation.
  1555  */
  1556 void check_for_following_punctuation(const char *aline)
  1557 {
  1558     int i;
  1559     const char *s,*wordstart;
  1560     gunichar c;
  1561     gchar *inword,*t;
  1562     if (pswit[TYPO_SWITCH])
  1563     {
  1564 	for (s=aline;*s;)
  1565 	{
  1566 	    wordstart=s;
  1567 	    t=getaword(&s);
  1568 	    if (!*t)
  1569 	    {
  1570 		g_free(t);
  1571 		continue;
  1572 	    }
  1573 	    inword=g_utf8_strdown(t,-1);
  1574 	    g_free(t);
  1575 	    for (i=0;*nocomma[i];i++)
  1576 		if (!strcmp(inword,nocomma[i]))
  1577 		{
  1578 		    c=g_utf8_get_char(s);
  1579 		    if (c==',' || c==';' || c==':')
  1580 		    {
  1581 			if (pswit[ECHO_SWITCH])
  1582 			    g_print("\n%s\n",aline);
  1583 			if (!pswit[OVERVIEW_SWITCH])
  1584 			    g_print("    Line %ld column %ld - "
  1585 			      "Query punctuation after %s?\n",
  1586 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,
  1587 			      inword);
  1588 			else
  1589 			    cnt_punct++;
  1590 		    }
  1591 		}
  1592 	    for (i=0;*noperiod[i];i++)
  1593 		if (!strcmp(inword,noperiod[i]))
  1594 		{
  1595 		    c=g_utf8_get_char(s);
  1596 		    if (c=='.' || c=='!')
  1597 		    {
  1598 			if (pswit[ECHO_SWITCH])
  1599 			    g_print("\n%s\n",aline);
  1600 			if (!pswit[OVERVIEW_SWITCH])
  1601 			    g_print("    Line %ld column %ld - "
  1602 			      "Query punctuation after %s?\n",
  1603 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,
  1604 			      inword);
  1605 			else
  1606 			    cnt_punct++;
  1607 		    }
  1608 		}
  1609 	    g_free(inword);
  1610 	}
  1611     }
  1612 }
  1613 
  1614 /*
  1615  * check_for_typos:
  1616  *
  1617  * Check for commonly mistyped words,
  1618  * and digits like 0 for O in a word.
  1619  */
  1620 void check_for_typos(const char *aline,struct warnings *warnings)
  1621 {
  1622     const char *s,*t,*nt,*wordstart;
  1623     gchar *inword;
  1624     gunichar *decomposition;
  1625     gchar *testword;
  1626     int i,vowel,consonant,*dupcnt;
  1627     gboolean isdup,istypo,alower;
  1628     gunichar c,pc;
  1629     long offset,len;
  1630     gsize decomposition_len;
  1631     for (s=aline;*s;)
  1632     {
  1633 	wordstart=s;
  1634 	inword=getaword(&s);
  1635 	if (!*inword)
  1636 	{
  1637 	    g_free(inword);
  1638 	    continue; /* don't bother with empty lines */
  1639 	}
  1640 	if (mixdigit(inword))
  1641 	{
  1642 	    if (pswit[ECHO_SWITCH])
  1643 		g_print("\n%s\n",aline);
  1644 	    if (!pswit[OVERVIEW_SWITCH])
  1645 		g_print("    Line %ld column %ld - Query digit in %s\n",
  1646 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
  1647 	    else
  1648 		cnt_word++;
  1649 	}
  1650 	/*
  1651 	 * Put the word through a series of tests for likely typos and OCR
  1652 	 * errors.
  1653 	 */
  1654 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
  1655 	{
  1656 	    istypo=FALSE;
  1657 	    alower=FALSE;
  1658 	    for (t=inword;*t;t=g_utf8_next_char(t))
  1659 	    {
  1660 		c=g_utf8_get_char(t);
  1661 		nt=g_utf8_next_char(t);
  1662 		/* lowercase for testing */
  1663 		if (g_unichar_islower(c))
  1664 		    alower=TRUE;
  1665 		if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
  1666 		{
  1667 		    /*
  1668 		     * We have an uppercase mid-word. However, there are
  1669 		     * common cases:
  1670 		     *   Mac and Mc like McGill
  1671 		     *   French contractions like l'Abbe
  1672 		     */
  1673 		    offset=g_utf8_pointer_to_offset(inword,t);
  1674 		    if (offset>0)
  1675 			pc=g_utf8_get_char(g_utf8_prev_char(t));
  1676 		    else
  1677 			pc='\0';
  1678 		    if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
  1679 		      offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
  1680 		      g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
  1681 		      CHAR_IS_APOSTROPHE(pc))
  1682 			; /* do nothing! */
  1683 		    else
  1684 			istypo=TRUE;
  1685 		}
  1686 	    }
  1687 	    testword=g_utf8_casefold(inword,-1);
  1688 	}
  1689 	if (pswit[TYPO_SWITCH])
  1690 	{
  1691 	    /*
  1692 	     * Check for certain unlikely two-letter combinations at word
  1693 	     * start and end.
  1694 	     */
  1695 	    len=g_utf8_strlen(testword,-1);
  1696 	    if (len>1)
  1697 	    {
  1698 		for (i=0;*nostart[i];i++)
  1699 		    if (g_str_has_prefix(testword,nostart[i]))
  1700 			istypo=TRUE;
  1701 		for (i=0;*noend[i];i++)
  1702 		    if (g_str_has_suffix(testword,noend[i]))
  1703 			istypo=TRUE;
  1704 	    }
  1705 	    /* ght is common, gbt never. Like that. */
  1706 	    if (strstr(testword,"cb"))
  1707 		istypo=TRUE;
  1708 	    if (strstr(testword,"gbt"))
  1709 		istypo=TRUE;
  1710 	    if (strstr(testword,"pbt"))
  1711 		istypo=TRUE;
  1712 	    if (strstr(testword,"tbs"))
  1713 		istypo=TRUE;
  1714 	    if (strstr(testword,"mrn"))
  1715 		istypo=TRUE;
  1716 	    if (strstr(testword,"ahle"))
  1717 		istypo=TRUE;
  1718 	    if (strstr(testword,"ihle"))
  1719 		istypo=TRUE;
  1720 	    /*
  1721 	     * "TBE" does happen - like HEARTBEAT - but uncommon.
  1722 	     * Also "TBI" - frostbite, outbid - but uncommon.
  1723 	     * Similarly "ii" like Hawaii, or Pompeii, and in Roman
  1724 	     * numerals, but "ii" is a common scanno.
  1725 	     */
  1726 	    if (strstr(testword,"tbi"))
  1727 		istypo=TRUE;
  1728 	    if (strstr(testword,"tbe"))
  1729 		istypo=TRUE;
  1730 	    if (strstr(testword,"ii"))
  1731 		istypo=TRUE;
  1732 	    /*
  1733 	     * Check for no vowels or no consonants.
  1734 	     * If none, flag a typo.
  1735 	     */
  1736 	    if (!istypo && len>1)
  1737 	    {
  1738 		vowel=consonant=0;
  1739 		for (t=testword;*t;t=g_utf8_next_char(t))
  1740 		{
  1741 		    c=g_utf8_get_char(t);
  1742 		    decomposition=
  1743 		      g_unicode_canonical_decomposition(c,&decomposition_len);
  1744 		    if (c=='y' || g_unichar_isdigit(c))
  1745 		    {
  1746 			/* Yah, this is loose. */
  1747 			vowel++;
  1748 			consonant++;
  1749 		    }
  1750 		    else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
  1751 			vowel++;
  1752 		    else
  1753 			consonant++;
  1754 		    g_free(decomposition);
  1755 		}
  1756 		if (!vowel || !consonant)
  1757 		    istypo=TRUE;
  1758 	    }
  1759 	    /*
  1760 	     * Now exclude the word from being reported if it's in
  1761 	     * the okword list.
  1762 	     */
  1763 	    for (i=0;*okword[i];i++)
  1764 		if (!strcmp(testword,okword[i]))
  1765 		    istypo=FALSE;
  1766 	    /*
  1767 	     * What looks like a typo may be a Roman numeral.
  1768 	     * Exclude these.
  1769 	     */
  1770 	    if (istypo && isroman(testword))
  1771 		istypo=FALSE;
  1772 	    /* Check the manual list of typos. */
  1773 	    if (!istypo)
  1774 		for (i=0;*typo[i];i++)
  1775 		    if (!strcmp(testword,typo[i]))
  1776 			istypo=TRUE;
  1777 	    /*
  1778 	     * Check lowercase s, l, i and m - special cases.
  1779 	     *   "j" - often a semi-colon gone wrong.
  1780 	     *   "d" for a missing apostrophe - he d
  1781 	     *   "n" for "in"
  1782 	     */
  1783 	    if (!istypo && len==1 &&
  1784 	      g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
  1785 		istypo=TRUE;
  1786 	    if (istypo)
  1787 	    {
  1788 		dupcnt=g_tree_lookup(qword,testword);
  1789 		if (dupcnt)
  1790 		{
  1791 		    (*dupcnt)++;
  1792 		    isdup=!pswit[VERBOSE_SWITCH];
  1793 		}
  1794 		else
  1795 		{
  1796 		    dupcnt=g_new0(int,1);
  1797 		    g_tree_insert(qword,g_strdup(testword),dupcnt);
  1798 		    isdup=FALSE;
  1799 		}
  1800 		if (!isdup)
  1801 		{
  1802 		    if (pswit[ECHO_SWITCH])
  1803 			g_print("\n%s\n",aline);
  1804 		    if (!pswit[OVERVIEW_SWITCH])
  1805 		    {
  1806 			g_print("    Line %ld column %ld - Query word %s",
  1807 			  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
  1808 			  inword);
  1809 			if (!pswit[VERBOSE_SWITCH])
  1810 			    g_print(" - not reporting duplicates");
  1811 			g_print("\n");
  1812 		    }
  1813 		    else
  1814 			cnt_word++;
  1815 		}
  1816 	    }
  1817 	}
  1818 	/* check the user's list of typos */
  1819 	if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
  1820 	{
  1821 	    if (pswit[ECHO_SWITCH])
  1822 		g_print("\n%s\n",aline);
  1823 	    if (!pswit[OVERVIEW_SWITCH])  
  1824 		g_print("    Line %ld column %ld - Query possible scanno %s\n",
  1825 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
  1826 	}
  1827 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
  1828 	    g_free(testword);
  1829 	if (pswit[PARANOID_SWITCH] && warnings->digit)
  1830 	{
  1831 	    /* In paranoid mode, query all 0 and 1 standing alone. */
  1832 	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))
  1833 	    {
  1834 		if (pswit[ECHO_SWITCH])
  1835 		    g_print("\n%s\n",aline);
  1836 		if (!pswit[OVERVIEW_SWITCH])
  1837 		    g_print("    Line %ld column %ld - Query standalone %s\n",
  1838 		      linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
  1839 		      inword);
  1840 		else
  1841 		    cnt_word++;
  1842 	    }
  1843 	}
  1844 	g_free(inword);
  1845     }
  1846 }
  1847 
  1848 /*
  1849  * check_for_misspaced_punctuation:
  1850  *
  1851  * Look for added or missing spaces around punctuation and quotes.
  1852  * If there is a punctuation character like ! with no space on
  1853  * either side, suspect a missing!space. If there are spaces on
  1854  * both sides , assume a typo. If we see a double quote with no
  1855  * space or punctuation on either side of it, assume unspaced
  1856  * quotes "like"this.
  1857  */
  1858 void check_for_misspaced_punctuation(const char *aline,
  1859   struct parities *parities,gboolean isemptyline)
  1860 {
  1861     gboolean isacro,isellipsis;
  1862     const char *s;
  1863     gunichar c,nc,pc,n2c;
  1864     int parity;
  1865     c=g_utf8_get_char(aline);
  1866     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  1867     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  1868     {
  1869 	pc=c;
  1870 	c=nc;
  1871 	nc=g_utf8_get_char(g_utf8_next_char(s));
  1872 	/* For each character in the line after the first. */
  1873 	if (g_utf8_strchr(".?!,;:_",-1,c))  /* if it's punctuation */
  1874 	{
  1875 	    /* we need to suppress warnings for acronyms like M.D. */
  1876 	    isacro=FALSE;
  1877 	    /* we need to suppress warnings for ellipsis . . . */
  1878 	    isellipsis=FALSE;
  1879 	    /*
  1880 	     * If there are letters on both sides of it or
  1881 	     * if it's strict punctuation followed by an alpha.
  1882 	     */
  1883 	    if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
  1884 	      g_utf8_strchr("?!,;:",-1,c)))
  1885 	    {
  1886 		if (c=='.')
  1887 		{
  1888 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&
  1889 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
  1890 			isacro=TRUE;
  1891 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
  1892 		    if (nc && n2c=='.')
  1893 			isacro=TRUE;
  1894 		}
  1895 		if (!isacro)
  1896 		{
  1897 		    if (pswit[ECHO_SWITCH])
  1898 			g_print("\n%s\n",aline);
  1899 		    if (!pswit[OVERVIEW_SWITCH])
  1900 			g_print("    Line %ld column %ld - Missing space?\n",
  1901 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1902 		    else
  1903 			cnt_punct++;
  1904 		}
  1905 	    }
  1906 	    if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
  1907 	    {
  1908 		/*
  1909 		 * If there are spaces on both sides,
  1910 		 * or space before and end of line.
  1911 		 */
  1912 		if (c=='.')
  1913 		{
  1914 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&
  1915 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
  1916 			isellipsis=TRUE;
  1917 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
  1918 		    if (nc && n2c=='.')
  1919 			isellipsis=TRUE;
  1920 		}
  1921 		if (!isemptyline && !isellipsis)
  1922 		{
  1923 		    if (pswit[ECHO_SWITCH])
  1924 			g_print("\n%s\n",aline);
  1925 		    if (!pswit[OVERVIEW_SWITCH])
  1926 			g_print("    Line %ld column %ld - "
  1927 			  "Spaced punctuation?\n",linecnt,
  1928 			  g_utf8_pointer_to_offset(aline,s)+1);
  1929 		    else
  1930 			cnt_punct++;
  1931 		}
  1932 	    }
  1933 	}
  1934     }
  1935     /* Split out the characters that CANNOT be preceded by space. */
  1936     c=g_utf8_get_char(aline);
  1937     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  1938     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  1939     {
  1940 	pc=c;
  1941 	c=nc;
  1942 	nc=g_utf8_get_char(g_utf8_next_char(s));
  1943 	/* for each character in the line after the first */
  1944 	if (g_utf8_strchr("?!,;:",-1,c))
  1945 	{
  1946 	    /* if it's punctuation that _cannot_ have a space before it */
  1947 	    if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
  1948 	    {
  1949 		/*
  1950 		 * If nc DOES == space,
  1951 		 * it was already reported just above.
  1952 		 */
  1953 		if (pswit[ECHO_SWITCH])
  1954 		    g_print("\n%s\n",aline);
  1955 		if (!pswit[OVERVIEW_SWITCH])
  1956 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",
  1957 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1958 		else
  1959 		    cnt_punct++;
  1960 	    }
  1961 	}
  1962     }
  1963     /*
  1964      * Special case " .X" where X is any alpha.
  1965      * This plugs a hole in the acronym code above.
  1966      * Inelegant, but maintainable.
  1967      */
  1968     c=g_utf8_get_char(aline);
  1969     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  1970     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  1971     {
  1972 	pc=c;
  1973 	c=nc;
  1974 	nc=g_utf8_get_char(g_utf8_next_char(s));
  1975 	/* for each character in the line after the first */
  1976 	if (c=='.')
  1977 	{
  1978 	    /* if it's a period */
  1979 	    if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
  1980 	    {
  1981 		/*
  1982 		 * If the period follows a space and
  1983 		 * is followed by a letter.
  1984 		 */
  1985 		if (pswit[ECHO_SWITCH])
  1986 		    g_print("\n%s\n",aline);
  1987 		if (!pswit[OVERVIEW_SWITCH])
  1988 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",
  1989 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1990 		else
  1991 		    cnt_punct++;
  1992 	    }
  1993 	}
  1994     }
  1995     c=g_utf8_get_char(aline);
  1996     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  1997     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  1998     {
  1999 	pc=c;
  2000 	c=nc;
  2001 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2002 	/* for each character in the line after the first */
  2003 	if (CHAR_IS_DQUOTE(c))
  2004 	{
  2005 	    if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
  2006 	      !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
  2007 	      !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
  2008 	    {
  2009 		if (pswit[ECHO_SWITCH])
  2010 		    g_print("\n%s\n",aline);
  2011 		if (!pswit[OVERVIEW_SWITCH])
  2012 		    g_print("    Line %ld column %ld - Unspaced quotes?\n",
  2013 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2014 		else
  2015 		    cnt_punct++;
  2016 	    }
  2017 	}
  2018     }
  2019     /* Check parity of quotes. */
  2020     nc=g_utf8_get_char(aline);
  2021     for (s=aline;*s;s=g_utf8_next_char(s))
  2022     {
  2023 	c=nc;
  2024 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2025 	if (CHAR_IS_DQUOTE(c))
  2026 	{
  2027 	    if (c==CHAR_DQUOTE)
  2028 	    {
  2029 		parities->dquote=!parities->dquote;
  2030 		parity=parities->dquote;
  2031 	    }
  2032 	    else if (c==CHAR_LD_QUOTE)
  2033 		parity=1;
  2034 	    else
  2035 		parity=0;
  2036 	    if (!parity)
  2037 	    {
  2038 		/* parity even */
  2039 		if (!g_utf8_strchr("_-.'`/,;:!?)]} ",-1,nc))
  2040 		{
  2041 		    if (pswit[ECHO_SWITCH])
  2042 			g_print("\n%s\n",aline);
  2043 		    if (!pswit[OVERVIEW_SWITCH])
  2044 			g_print("    Line %ld column %ld - "
  2045 			  "Wrongspaced quotes?\n",
  2046 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2047 		    else
  2048 			cnt_punct++;
  2049 		}
  2050 	    }
  2051 	    else
  2052 	    {
  2053 		/* parity odd */
  2054 		if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
  2055 		  !g_utf8_strchr("_-/.'`([{$",-1,nc) || !nc)
  2056 		{
  2057 		    if (pswit[ECHO_SWITCH])
  2058 			g_print("\n%s\n",aline);
  2059 		    if (!pswit[OVERVIEW_SWITCH])
  2060 			g_print("    Line %ld column %ld - "
  2061 			  "Wrongspaced quotes?\n",
  2062 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2063 		    else
  2064 			cnt_punct++;
  2065 		}
  2066 	    }
  2067 	}
  2068     }
  2069     c=g_utf8_get_char(aline);
  2070     if (CHAR_IS_DQUOTE(c))
  2071     {
  2072 	if (g_utf8_strchr(",;:!?)]} ",-1,
  2073 	  g_utf8_get_char(g_utf8_next_char(aline))))
  2074 	{
  2075 	    if (pswit[ECHO_SWITCH])
  2076 		g_print("\n%s\n",aline);
  2077 	    if (!pswit[OVERVIEW_SWITCH])
  2078 		g_print("    Line %ld column 1 - Wrongspaced quotes?\n",
  2079 		  linecnt);
  2080 	    else
  2081 		cnt_punct++;
  2082 	}
  2083     }
  2084     if (pswit[SQUOTE_SWITCH])
  2085     {
  2086 	nc=g_utf8_get_char(aline);
  2087 	for (s=aline;*s;s=g_utf8_next_char(s))
  2088 	{
  2089 	    c=nc;
  2090 	    nc=g_utf8_get_char(g_utf8_next_char(s));
  2091 	    if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&
  2092 	      !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
  2093 	      !g_unichar_isalpha(nc)))
  2094 	    {
  2095 		parities->squote=!parities->squote;
  2096 		if (!parities->squote)
  2097 		{
  2098 		    /* parity even */
  2099 		    if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
  2100 		    {
  2101 			if (pswit[ECHO_SWITCH])
  2102 			    g_print("\n%s\n",aline);
  2103 			if (!pswit[OVERVIEW_SWITCH])
  2104 			    g_print("    Line %ld column %ld - "
  2105 			      "Wrongspaced singlequotes?\n",
  2106 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2107 			else
  2108 			    cnt_punct++;
  2109 		    }
  2110 		}
  2111 		else
  2112 		{
  2113 		    /* parity odd */
  2114 		    if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
  2115 		      !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
  2116 		    {
  2117 			if (pswit[ECHO_SWITCH])
  2118 			    g_print("\n%s\n",aline);
  2119 			if (!pswit[OVERVIEW_SWITCH])
  2120 			    g_print("    Line %ld column %ld - "
  2121 			      "Wrongspaced singlequotes?\n",
  2122 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2123 			else
  2124 			    cnt_punct++;
  2125 		    }
  2126 		}
  2127 	    }
  2128 	}
  2129     }
  2130 }
  2131 
  2132 /*
  2133  * check_for_double_punctuation:
  2134  *
  2135  * Look for double punctuation like ,. or ,,
  2136  * Thanks to DW for the suggestion!
  2137  * In books with references, ".," and ".;" are common
  2138  * e.g. "etc., etc.," and vol. 1.; vol 3.;
  2139  * OTOH, from my initial tests, there are also fairly
  2140  * common errors. What to do? Make these cases paranoid?
  2141  * ".," is the most common, so warnings->dotcomma is used
  2142  * to suppress detailed reporting if it occurs often.
  2143  */
  2144 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
  2145 {
  2146     const char *s;
  2147     gunichar c,nc;
  2148     nc=g_utf8_get_char(aline);
  2149     for (s=aline;*s;s=g_utf8_next_char(s))
  2150     {
  2151 	c=nc;
  2152 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2153 	/* for each punctuation character in the line */
  2154 	if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
  2155 	  g_utf8_strchr(".?!,;:",-1,nc))
  2156 	{
  2157 	    /* followed by punctuation, it's a query, unless . . . */
  2158 	    if (c==nc && (c=='.' || c=='?' || c=='!') ||
  2159 	      !warnings->dotcomma && c=='.' && nc==',' ||
  2160 	      warnings->isFrench && g_str_has_prefix(s,",...") ||
  2161 	      warnings->isFrench && g_str_has_prefix(s,"...,") ||
  2162 	      warnings->isFrench && g_str_has_prefix(s,";...") ||
  2163 	      warnings->isFrench && g_str_has_prefix(s,"...;") ||
  2164 	      warnings->isFrench && g_str_has_prefix(s,":...") ||
  2165 	      warnings->isFrench && g_str_has_prefix(s,"...:") ||
  2166 	      warnings->isFrench && g_str_has_prefix(s,"!...") ||
  2167 	      warnings->isFrench && g_str_has_prefix(s,"...!") ||
  2168 	      warnings->isFrench && g_str_has_prefix(s,"?...") ||
  2169 	      warnings->isFrench && g_str_has_prefix(s,"...?"))
  2170 	    {
  2171 		if (warnings->isFrench && g_str_has_prefix(s,",...") ||
  2172 		  warnings->isFrench && g_str_has_prefix(s,"...,") ||
  2173 		  warnings->isFrench && g_str_has_prefix(s,";...") ||
  2174 		  warnings->isFrench && g_str_has_prefix(s,"...;") ||
  2175 		  warnings->isFrench && g_str_has_prefix(s,":...") ||
  2176 		  warnings->isFrench && g_str_has_prefix(s,"...:") ||
  2177 		  warnings->isFrench && g_str_has_prefix(s,"!...") ||
  2178 		  warnings->isFrench && g_str_has_prefix(s,"...!") ||
  2179 		  warnings->isFrench && g_str_has_prefix(s,"?...") ||
  2180 		  warnings->isFrench && g_str_has_prefix(s,"...?"))
  2181 		{
  2182 		    s+=4;
  2183 		    nc=g_utf8_get_char(g_utf8_next_char(s));
  2184 		}
  2185 		; /* do nothing for .. !! and ?? which can be legit */
  2186 	    }
  2187 	    else
  2188 	    {
  2189 		if (pswit[ECHO_SWITCH])
  2190 		    g_print("\n%s\n",aline);
  2191 		if (!pswit[OVERVIEW_SWITCH])
  2192 		    g_print("    Line %ld column %ld - Double punctuation?\n",
  2193 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2194 		else
  2195 		    cnt_punct++;
  2196 	    }
  2197 	}
  2198     }
  2199 }
  2200 
  2201 /*
  2202  * check_for_spaced_quotes:
  2203  */
  2204 void check_for_spaced_quotes(const char *aline)
  2205 {
  2206     int i;
  2207     const char *s,*t;
  2208     const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,
  2209       CHAR_RS_QUOTE};
  2210     GString *pattern;
  2211     s=aline;
  2212     while ((t=strstr(s," \" ")))
  2213     {
  2214 	if (pswit[ECHO_SWITCH])
  2215 	    g_print("\n%s\n",aline);
  2216 	if (!pswit[OVERVIEW_SWITCH])
  2217 	    g_print("    Line %ld column %ld - Spaced doublequote?\n",
  2218 	      linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  2219 	else
  2220 	    cnt_punct++;
  2221 	s=g_utf8_next_char(g_utf8_next_char(t));
  2222     }
  2223     pattern=g_string_new(NULL);
  2224     for(i=0;i<G_N_ELEMENTS(single_quotes);i++)
  2225     {
  2226 	g_string_assign(pattern," ");
  2227 	g_string_append_unichar(pattern,single_quotes[i]);
  2228 	g_string_append_c(pattern,' ');
  2229 	s=aline;
  2230 	while ((t=strstr(s,pattern->str)))
  2231 	{
  2232 	    if (pswit[ECHO_SWITCH])
  2233 		g_print("\n%s\n",aline);
  2234 	    if (!pswit[OVERVIEW_SWITCH])
  2235 		g_print("    Line %ld column %ld - Spaced singlequote?\n",
  2236 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  2237 	    else
  2238 		cnt_punct++;
  2239 	    s=g_utf8_next_char(g_utf8_next_char(t));
  2240 	}
  2241     }
  2242     g_string_free(pattern,TRUE);
  2243 }
  2244 
  2245 /*
  2246  * check_for_miscased_genative:
  2247  *
  2248  * Check special case of 'S instead of 's at end of word.
  2249  */
  2250 void check_for_miscased_genative(const char *aline)
  2251 {
  2252     const char *s;
  2253     gunichar c,nc,pc;
  2254     if (!*aline)
  2255 	return;
  2256     c=g_utf8_get_char(aline);
  2257     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2258     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2259     {
  2260 	pc=c;
  2261 	c=nc;
  2262 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2263 	if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))
  2264 	{
  2265 	    if (pswit[ECHO_SWITCH])
  2266 		g_print("\n%s\n",aline);
  2267 	    if (!pswit[OVERVIEW_SWITCH])
  2268 		g_print("    Line %ld column %ld - Capital \"S\"?\n",
  2269 		  linecnt,g_utf8_pointer_to_offset(aline,s)+2);
  2270 	    else
  2271 		cnt_punct++;
  2272 	}
  2273     }
  2274 }
  2275 
  2276 /*
  2277  * check_end_of_line:
  2278  *
  2279  * Now check special cases - start and end of line -
  2280  * for single and double quotes. Start is sometimes [sic]
  2281  * but better to query it anyway.
  2282  * While we're here, check for dash at end of line.
  2283  */
  2284 void check_end_of_line(const char *aline,struct warnings *warnings)
  2285 {
  2286     int lbytes;
  2287     const char *s;
  2288     gunichar c1,c2;
  2289     lbytes=strlen(aline);
  2290     if (g_utf8_strlen(aline,lbytes)>1)
  2291     {
  2292 	s=g_utf8_prev_char(aline+lbytes);
  2293 	c1=g_utf8_get_char(s);
  2294 	c2=g_utf8_get_char(g_utf8_prev_char(s));
  2295 	if ((CHAR_IS_DQUOTE(c1) || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)
  2296 	{
  2297 	    if (pswit[ECHO_SWITCH])
  2298 		g_print("\n%s\n",aline);
  2299 	    if (!pswit[OVERVIEW_SWITCH])
  2300 		g_print("    Line %ld column %ld - Spaced quote?\n",linecnt,
  2301 		  g_utf8_strlen(aline,lbytes));
  2302 	    else
  2303 		cnt_punct++;
  2304 	}
  2305 	c1=g_utf8_get_char(aline);
  2306 	c2=g_utf8_get_char(g_utf8_next_char(aline));
  2307 	if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)
  2308 	{
  2309 	    if (pswit[ECHO_SWITCH])
  2310 		g_print("\n%s\n",aline);
  2311 	    if (!pswit[OVERVIEW_SWITCH])
  2312 		g_print("    Line %ld column 1 - Spaced quote?\n",linecnt);
  2313 	    else
  2314 		cnt_punct++;
  2315 	}
  2316 	/*
  2317 	 * Dash at end of line may well be legit - paranoid mode only
  2318 	 * and don't report em-dash at line-end.
  2319 	 */
  2320 	if (pswit[PARANOID_SWITCH] && warnings->hyphen)
  2321 	{
  2322 	    for (s=g_utf8_prev_char(aline+lbytes);
  2323 	      s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
  2324 		;
  2325 	    if (g_utf8_get_char(s)=='-' &&
  2326 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')
  2327 	    {
  2328 		if (pswit[ECHO_SWITCH])
  2329 		    g_print("\n%s\n",aline);
  2330 		if (!pswit[OVERVIEW_SWITCH])
  2331 		    g_print("    Line %ld column %ld - "
  2332 		      "Hyphen at end of line?\n",
  2333 		      linecnt,g_utf8_pointer_to_offset(aline,s));
  2334 	    }
  2335 	}
  2336     }
  2337 }
  2338 
  2339 /*
  2340  * check_for_unspaced_bracket:
  2341  *
  2342  * Brackets are often unspaced, but shouldn't be surrounded by alpha.
  2343  * If so, suspect a scanno like "a]most".
  2344  */
  2345 void check_for_unspaced_bracket(const char *aline)
  2346 {
  2347     const char *s;
  2348     gunichar c,nc,pc;
  2349     c=g_utf8_get_char(aline);
  2350     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2351     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2352     {
  2353 	pc=c;
  2354 	c=nc;
  2355 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2356 	if (!nc)
  2357 	    break;
  2358 	/* for each bracket character in the line except 1st & last */
  2359 	if (g_utf8_strchr("{[()]}",-1,c) &&
  2360 	  g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
  2361 	{
  2362 	    if (pswit[ECHO_SWITCH])
  2363 		g_print("\n%s\n",aline);
  2364 	    if (!pswit[OVERVIEW_SWITCH])
  2365 		g_print("    Line %ld column %ld - Unspaced bracket?\n",
  2366 		  linecnt,g_utf8_pointer_to_offset(aline,s));
  2367 	    else
  2368 		cnt_punct++;
  2369 	}
  2370     }
  2371 }
  2372 
  2373 /*
  2374  * check_for_unpunctuated_endquote:
  2375  */
  2376 void check_for_unpunctuated_endquote(const char *aline)
  2377 {
  2378     const char *s;
  2379     gunichar c,nc,pc;
  2380     QuoteClass qc;
  2381     c=g_utf8_get_char(aline);
  2382     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2383     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2384     {
  2385 	pc=c;
  2386 	c=nc;
  2387 	qc=CHAR_IS_DQUOTE(c)?QUOTE_CLASS(c):INVALID_QUOTE;
  2388 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2389 	/* for each character in the line except 1st */
  2390 	if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) && g_unichar_isalpha(pc))
  2391 	{
  2392 	    if (pswit[ECHO_SWITCH])
  2393 		g_print("\n%s\n",aline);
  2394 	    if (!pswit[OVERVIEW_SWITCH])
  2395 		g_print("    Line %ld column %ld - "
  2396 		  "endquote missing punctuation?\n",
  2397 		  linecnt,g_utf8_pointer_to_offset(aline,s));
  2398 	    else
  2399 		cnt_punct++;
  2400 	}
  2401     }
  2402 }
  2403 
  2404 /*
  2405  * check_for_html_tag:
  2406  *
  2407  * Check for <HTML TAG>.
  2408  *
  2409  * If there is a < in the line, followed at some point
  2410  * by a > then we suspect HTML.
  2411  */
  2412 void check_for_html_tag(const char *aline)
  2413 {
  2414     const char *open,*close;
  2415     gchar *tag;
  2416     open=strchr(aline,'<');
  2417     if (open)
  2418     {
  2419 	close=strchr(g_utf8_next_char(open),'>');
  2420 	if (close)
  2421 	{
  2422 	    if (pswit[ECHO_SWITCH])
  2423 		g_print("\n%s\n",aline);
  2424 	    if (!pswit[OVERVIEW_SWITCH])
  2425 	    {
  2426 		tag=g_strndup(open,close-open+1);
  2427 		g_print("    Line %ld column %ld - HTML Tag? %s \n",
  2428 		  linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
  2429 		g_free(tag);
  2430 	    }
  2431 	    else
  2432 		cnt_html++;
  2433 	}
  2434     }
  2435 }
  2436 
  2437 /*
  2438  * check_for_html_entity:
  2439  *
  2440  * Check for &symbol; HTML.
  2441  *
  2442  * If there is a & in the line, followed at
  2443  * some point by a ; then we suspect HTML.
  2444  */
  2445 void check_for_html_entity(const char *aline)
  2446 {
  2447     const char *s,*amp,*scolon;
  2448     gchar *entity;
  2449     amp=strchr(aline,'&');
  2450     if (amp)
  2451     {
  2452 	scolon=strchr(amp,';');
  2453 	if (scolon)
  2454 	{
  2455 	    for (s=amp;s<scolon;s=g_utf8_next_char(s))   
  2456 		if (g_utf8_get_char(s)==CHAR_SPACE)
  2457 		    break;		/* Don't report "Jones & Son;" */
  2458 	    if (s>=scolon)
  2459 	    {
  2460 		if (pswit[ECHO_SWITCH])
  2461 		    g_print("\n%s\n",aline);
  2462 		if (!pswit[OVERVIEW_SWITCH])
  2463 		{
  2464 		    entity=g_strndup(amp,scolon-amp+1);
  2465 		    g_print("    Line %ld column %d - HTML symbol? %s \n",
  2466 		      linecnt,(int)(amp-aline)+1,entity);
  2467 		    g_free(entity);
  2468 		}
  2469 		else
  2470 		    cnt_html++;
  2471 	    }
  2472 	}
  2473     }
  2474 }
  2475 
  2476 /*
  2477  * check_for_omitted_punctuation:
  2478  *
  2479  * Check for omitted punctuation at end of paragraph by working back
  2480  * through prevline. DW.
  2481  * Need to check this only for "normal" paras.
  2482  * So what is a "normal" para?
  2483  *    Not normal if one-liner (chapter headings, etc.)
  2484  *    Not normal if doesn't contain at least one locase letter
  2485  *    Not normal if starts with space
  2486  */
  2487 void check_for_omitted_punctuation(const char *prevline,
  2488   struct line_properties *last,int start_para_line)
  2489 {
  2490     gboolean letter_on_line=FALSE;
  2491     const char *s;
  2492     gunichar c;
  2493     gboolean closing_quote;
  2494     for (s=prevline;*s;s=g_utf8_next_char(s))
  2495 	if (g_unichar_isalpha(g_utf8_get_char(s)))
  2496 	{
  2497 	    letter_on_line=TRUE;
  2498 	    break;
  2499 	}
  2500     /*
  2501      * This next "if" is a problem.
  2502      * If we say "start_para_line <= linecnt - 1", that includes
  2503      * one-line "paragraphs" like chapter heads. Lotsa false positives.
  2504      * If we say "start_para_line < linecnt - 1" it doesn't, but then it
  2505      * misses genuine one-line paragraphs.
  2506      */
  2507     if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
  2508       g_utf8_get_char(prevline)>CHAR_SPACE)
  2509     {
  2510 	s=prevline+strlen(prevline);
  2511 	do
  2512 	{
  2513 	    s=g_utf8_prev_char(s);
  2514 	    c=g_utf8_get_char(s);
  2515 	    if (QUOTE_CLASS(c)==CLOSING_QUOTE || QUOTE_CLASS(c)==NEUTRAL_QUOTE)
  2516 		closing_quote=TRUE;
  2517 	    else
  2518 		closing_quote=FALSE;
  2519 	} while (closing_quote && s>prevline);
  2520 	for (;s>prevline;s=g_utf8_prev_char(s))
  2521 	{
  2522 	    if (g_unichar_isalpha(g_utf8_get_char(s)))
  2523 	    {
  2524 		if (pswit[ECHO_SWITCH])
  2525 		    g_print("\n%s\n",prevline);
  2526 		if (!pswit[OVERVIEW_SWITCH])
  2527 		    g_print("    Line %ld column %ld - "
  2528 		      "No punctuation at para end?\n",
  2529 		      linecnt-1,g_utf8_strlen(prevline,-1));
  2530 		else
  2531 		    cnt_punct++;
  2532 		break;
  2533 	    }
  2534 	    if (g_utf8_strchr("—-.:!([{?}])",-1,g_utf8_get_char(s)))
  2535 		break;
  2536 	}
  2537     }
  2538 }
  2539 
  2540 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
  2541 {
  2542     const char *word=key;
  2543     int *dupcnt=value;
  2544     if (*dupcnt)
  2545 	g_print("\nNote: Queried word %s was duplicated %d times\n",
  2546 	  word,*dupcnt);
  2547     return FALSE;
  2548 }
  2549 
  2550 void print_as_windows_1252(const char *string)
  2551 {
  2552     gsize inbytes,outbytes;
  2553     gchar *buf,*bp;
  2554     static GIConv converter=(GIConv)-1;
  2555     if (!string)
  2556     {
  2557 	if (converter!=(GIConv)-1)
  2558 	    g_iconv_close(converter);
  2559 	converter=(GIConv)-1;
  2560 	return;
  2561     }
  2562     if (converter==(GIConv)-1)
  2563 	converter=g_iconv_open("WINDOWS-1252","UTF-8");
  2564     if (converter!=(GIConv)-1)
  2565     {
  2566 	inbytes=outbytes=strlen(string);
  2567 	bp=buf=g_malloc(outbytes+1);
  2568 	g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
  2569 	*bp='\0';
  2570 	fputs(buf,stdout);
  2571 	g_free(buf);
  2572     }
  2573     else
  2574 	fputs(string,stdout);
  2575 }
  2576 
  2577 void print_as_utf_8(const char *string)
  2578 {
  2579     fputs(string,stdout);
  2580 }
  2581 
  2582 /*
  2583  * procfile:
  2584  *
  2585  * Process one file.
  2586  */
  2587 void procfile(const char *filename)
  2588 {
  2589     const char *s;
  2590     gchar *parastart=NULL;	/* first line of current para */
  2591     gchar *etext,*aline;
  2592     gchar *etext_ptr;
  2593     GError *err=NULL;
  2594     struct first_pass_results *first_pass_results;
  2595     struct warnings *warnings;
  2596     struct counters counters={0};
  2597     struct line_properties last={0};
  2598     struct parities parities={0};
  2599     struct pending pending={0};
  2600     gboolean isemptyline;
  2601     long start_para_line=0;
  2602     gboolean isnewpara=FALSE,enddash=FALSE;
  2603     last.start=CHAR_SPACE;
  2604     linecnt=checked_linecnt=0;
  2605     etext=read_etext(filename,&err);
  2606     if (!etext)
  2607     {
  2608 	if (pswit[STDOUT_SWITCH])
  2609 	    fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
  2610 	else
  2611 	    fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
  2612 	exit(1);
  2613     }
  2614     g_print("\n\nFile: %s\n\n",filename);
  2615     first_pass_results=first_pass(etext);
  2616     warnings=report_first_pass(first_pass_results);
  2617     qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
  2618     qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
  2619     /*
  2620      * Here we go with the main pass. Hold onto yer hat!
  2621      */
  2622     linecnt=0;
  2623     etext_ptr=etext;
  2624     while ((aline=flgets(&etext_ptr,linecnt+1)))
  2625     {
  2626 	linecnt++;
  2627 	if (linecnt==1)
  2628 	    isnewpara=TRUE;
  2629 	if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
  2630 	    continue;    // skip DP page separators completely
  2631 	if (linecnt<first_pass_results->firstline ||
  2632 	  (first_pass_results->footerline>0 &&
  2633 	  linecnt>first_pass_results->footerline))
  2634 	{
  2635 	    if (pswit[HEADER_SWITCH])
  2636 	    {
  2637 		if (g_str_has_prefix(aline,"Title:"))
  2638 		    g_print("    %s\n",aline);
  2639 		if (g_str_has_prefix(aline,"Author:"))
  2640 		    g_print("    %s\n",aline);
  2641 		if (g_str_has_prefix(aline,"Release Date:"))
  2642 		    g_print("    %s\n",aline);
  2643 		if (g_str_has_prefix(aline,"Edition:"))
  2644 		    g_print("    %s\n\n",aline);
  2645 	    }
  2646 	    continue;		/* skip through the header */
  2647 	}
  2648 	checked_linecnt++;
  2649 	print_pending(aline,parastart,&pending);
  2650 	isemptyline=analyse_quotes(aline,&counters);
  2651 	if (isnewpara && !isemptyline)
  2652 	{
  2653 	    /* This line is the start of a new paragraph. */
  2654 	    start_para_line=linecnt;
  2655 	    /* Capture its first line in case we want to report it later. */
  2656 	    g_free(parastart);
  2657 	    parastart=g_strdup(aline);
  2658 	    memset(&parities,0,sizeof(parities));  /* restart the quote count */
  2659 	    s=aline;
  2660 	    while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
  2661 	      !g_unichar_isdigit(g_utf8_get_char(s)))
  2662 		s=g_utf8_next_char(s);
  2663 	    if (g_unichar_islower(g_utf8_get_char(s)))
  2664 	    {
  2665 		/* and its first letter is lowercase */
  2666 		if (pswit[ECHO_SWITCH])
  2667 		    g_print("\n%s\n",aline);
  2668 		if (!pswit[OVERVIEW_SWITCH])
  2669 		    g_print("    Line %ld column %ld - "
  2670 		      "Paragraph starts with lower-case\n",
  2671 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2672 		else
  2673 		    cnt_punct++;
  2674 	    }
  2675 	    isnewpara=FALSE; /* Signal the end of new para processing. */
  2676 	}
  2677 	/* Check for an em-dash broken at line end. */
  2678 	if (enddash && g_utf8_get_char(aline)=='-')
  2679 	{
  2680 	    if (pswit[ECHO_SWITCH])
  2681 		g_print("\n%s\n",aline);
  2682 	    if (!pswit[OVERVIEW_SWITCH])
  2683 		g_print("    Line %ld column 1 - Broken em-dash?\n",linecnt);
  2684 	    else
  2685 		cnt_punct++;
  2686 	}
  2687 	enddash=FALSE;
  2688 	for (s=g_utf8_prev_char(aline+strlen(aline));
  2689 	  g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
  2690 	    ;
  2691 	if (s>=aline && g_utf8_get_char(s)=='-')
  2692 	    enddash=TRUE;
  2693 	check_for_control_characters(aline);
  2694 	if (warnings->bin)
  2695 	    check_for_odd_characters(aline,warnings,isemptyline);
  2696 	if (warnings->longline)
  2697 	    check_for_long_line(aline);
  2698 	if (warnings->shortline)
  2699 	    check_for_short_line(aline,&last);
  2700 	last.blen=last.len;
  2701 	last.len=g_utf8_strlen(aline,-1);
  2702 	last.start=g_utf8_get_char(aline);
  2703 	check_for_starting_punctuation(aline);
  2704 	if (warnings->dash)
  2705 	{
  2706 	    check_for_spaced_emdash(aline);
  2707 	    check_for_spaced_dash(aline);
  2708 	}
  2709 	check_for_unmarked_paragraphs(aline);
  2710 	check_for_jeebies(aline);
  2711 	check_for_mta_from(aline);
  2712 	check_for_orphan_character(aline);
  2713 	check_for_pling_scanno(aline);
  2714 	check_for_extra_period(aline,warnings);
  2715 	check_for_following_punctuation(aline);
  2716 	check_for_typos(aline,warnings);
  2717 	check_for_misspaced_punctuation(aline,&parities,isemptyline);
  2718 	check_for_double_punctuation(aline,warnings);
  2719 	check_for_spaced_quotes(aline);
  2720 	check_for_miscased_genative(aline);
  2721 	check_end_of_line(aline,warnings);
  2722 	check_for_unspaced_bracket(aline);
  2723 	if (warnings->endquote)
  2724 	    check_for_unpunctuated_endquote(aline);
  2725 	check_for_html_tag(aline);
  2726 	check_for_html_entity(aline);
  2727 	if (isemptyline)
  2728 	{
  2729 	    check_for_mismatched_quotes(&counters,&pending);
  2730 	    counters_reset(&counters);
  2731 	    /* let the next iteration know that it's starting a new para */
  2732 	    isnewpara=TRUE;
  2733 	    if (prevline)
  2734 		check_for_omitted_punctuation(prevline,&last,start_para_line);
  2735 	}
  2736 	g_free(prevline);
  2737 	prevline=g_strdup(aline);
  2738     }
  2739     linecnt++;
  2740     check_for_mismatched_quotes(&counters,&pending);
  2741     print_pending(NULL,parastart,&pending);
  2742     reset_pending(&pending);
  2743     if (prevline)
  2744     {
  2745 	g_free(prevline);
  2746 	prevline=NULL;
  2747     }
  2748     g_free(parastart);
  2749     g_free(prevline);
  2750     g_free(etext);
  2751     if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])
  2752 	g_tree_foreach(qword,report_duplicate_queries,NULL);
  2753     g_tree_unref(qword);
  2754     g_tree_unref(qperiod);
  2755     counters_destroy(&counters);
  2756     g_set_print_handler(NULL);
  2757     print_as_windows_1252(NULL);
  2758     if (pswit[MARKUP_SWITCH])  
  2759 	loseentities(NULL);
  2760 }
  2761 
  2762 /*
  2763  * flgets:
  2764  *
  2765  * Get one line from the input text, checking for
  2766  * the existence of exactly one CR/LF line-end per line.
  2767  *
  2768  * Returns: a pointer to the line.
  2769  */
  2770 char *flgets(char **etext,long lcnt)
  2771 {
  2772     gunichar c;
  2773     gboolean isCR=FALSE;
  2774     char *theline=*etext;
  2775     char *eos=theline;
  2776     gchar *s;
  2777     for (;;)
  2778     {
  2779 	c=g_utf8_get_char(*etext);
  2780 	*etext=g_utf8_next_char(*etext);
  2781 	if (!c)
  2782 	    return NULL;
  2783 	/* either way, it's end of line */
  2784 	if (c=='\n')
  2785 	{
  2786 	    if (isCR)
  2787 		break;
  2788 	    else
  2789 	    {
  2790 		/* Error - a LF without a preceding CR */
  2791 		if (pswit[LINE_END_SWITCH])
  2792 		{
  2793 		    if (pswit[ECHO_SWITCH])
  2794 		    {
  2795 			s=g_strndup(theline,eos-theline);
  2796 			g_print("\n%s\n",s);
  2797 			g_free(s);
  2798 		    }
  2799 		    if (!pswit[OVERVIEW_SWITCH])
  2800 			g_print("    Line %ld - No CR?\n",lcnt);
  2801 		    else
  2802 			cnt_lineend++;
  2803 		}
  2804 		break;
  2805 	    }
  2806 	}
  2807 	if (c=='\r')
  2808 	{
  2809 	    if (isCR)
  2810 	    {
  2811 		/* Error - two successive CRs */
  2812 		if (pswit[LINE_END_SWITCH])
  2813 		{
  2814 		    if (pswit[ECHO_SWITCH])
  2815 		    {
  2816 			s=g_strndup(theline,eos-theline);
  2817 			g_print("\n%s\n",s);
  2818 			g_free(s);
  2819 		    }
  2820 		    if (!pswit[OVERVIEW_SWITCH])
  2821 			g_print("    Line %ld - Two successive CRs?\n",lcnt);
  2822 		    else
  2823 			cnt_lineend++;
  2824 		}
  2825 	    }
  2826 	    isCR=TRUE;
  2827 	}
  2828 	else
  2829 	{
  2830 	    if (pswit[LINE_END_SWITCH] && isCR)
  2831 	    {
  2832 		if (pswit[ECHO_SWITCH])
  2833 		{
  2834 		    s=g_strndup(theline,eos-theline);
  2835 		    g_print("\n%s\n",s);
  2836 		    g_free(s);
  2837 		}
  2838 		if (!pswit[OVERVIEW_SWITCH])
  2839 		    g_print("    Line %ld column %ld - CR without LF?\n",
  2840 		      lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
  2841 		else
  2842 		    cnt_lineend++;
  2843 		*eos=' ';
  2844 	    }
  2845 	    isCR=FALSE;
  2846 	    eos=g_utf8_next_char(eos);
  2847 	}
  2848     }
  2849     *eos='\0';
  2850     if (pswit[MARKUP_SWITCH])  
  2851 	postprocess_for_HTML(theline);
  2852     if (pswit[DP_SWITCH])  
  2853 	postprocess_for_DP(theline);
  2854     return theline;
  2855 }
  2856 
  2857 /*
  2858  * mixdigit:
  2859  *
  2860  * Takes a "word" as a parameter, and checks whether it
  2861  * contains a mixture of alpha and digits. Generally, this is an
  2862  * error, but may not be for cases like 4th or L5 12s. 3d.
  2863  *
  2864  * Returns: TRUE iff an is error found.
  2865  */
  2866 gboolean mixdigit(const char *checkword)
  2867 {
  2868     gboolean wehaveadigit,wehavealetter,query;
  2869     const char *s,*nondigit;
  2870     wehaveadigit=wehavealetter=query=FALSE;
  2871     for (s=checkword;*s;s=g_utf8_next_char(s))
  2872 	if (g_unichar_isalpha(g_utf8_get_char(s)))
  2873 	    wehavealetter=TRUE;
  2874 	else if (g_unichar_isdigit(g_utf8_get_char(s)))
  2875 	    wehaveadigit=TRUE;
  2876     if (wehaveadigit && wehavealetter)
  2877     {
  2878 	/* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
  2879 	query=TRUE;
  2880 	for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
  2881 	  nondigit=g_utf8_next_char(nondigit))
  2882 	    ;
  2883 	/* digits, ending in st, rd, nd, th of either case */
  2884 	if (!g_ascii_strcasecmp(nondigit,"st") ||
  2885 	  !g_ascii_strcasecmp(nondigit,"rd") ||
  2886 	  !g_ascii_strcasecmp(nondigit,"nd") ||
  2887 	  !g_ascii_strcasecmp(nondigit,"th"))
  2888 	    query=FALSE;
  2889 	if (!g_ascii_strcasecmp(nondigit,"sts") ||
  2890 	  !g_ascii_strcasecmp(nondigit,"rds") ||
  2891 	  !g_ascii_strcasecmp(nondigit,"nds") ||
  2892 	  !g_ascii_strcasecmp(nondigit,"ths"))
  2893 	    query=FALSE;
  2894 	if (!g_ascii_strcasecmp(nondigit,"stly") ||
  2895 	  !g_ascii_strcasecmp(nondigit,"rdly") ||
  2896 	  !g_ascii_strcasecmp(nondigit,"ndly") ||
  2897 	  !g_ascii_strcasecmp(nondigit,"thly"))
  2898 	    query=FALSE;
  2899 	/* digits, ending in l, L, s or d */
  2900 	if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
  2901 	  !strcmp(nondigit,"d"))
  2902 	    query=FALSE;
  2903 	/*
  2904 	 * L at the start of a number, representing Britsh pounds, like L500.
  2905 	 * This is cute. We know the current word is mixed digit. If the first
  2906 	 * letter is L, there must be at least one digit following. If both
  2907 	 * digits and letters follow, we have a genuine error, else we have a
  2908 	 * capital L followed by digits, and we accept that as a non-error.
  2909 	 */
  2910 	if (g_utf8_get_char(checkword)=='L' &&
  2911 	  !mixdigit(g_utf8_next_char(checkword)))
  2912 	    query=FALSE;
  2913     }
  2914     return query;
  2915 }
  2916 
  2917 /*
  2918  * getaword:
  2919  *
  2920  * Extracts the first/next "word" from the line, and returns it.
  2921  * A word is defined as one English word unit--or at least that's the aim.
  2922  * "ptr" is advanced to the position in the line where we will start
  2923  * looking for the next word.
  2924  *
  2925  * Returns: A newly-allocated string.
  2926  */
  2927 gchar *getaword(const char **ptr)
  2928 {
  2929     const char *s,*t;
  2930     GString *word;
  2931     gunichar c,pc;
  2932     word=g_string_new(NULL);
  2933     for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
  2934       !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
  2935       **ptr;*ptr=g_utf8_next_char(*ptr))
  2936 	;
  2937     /*
  2938      * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
  2939      * Especially yucky is the case of L1,000
  2940      * This section looks for a pattern of characters including a digit
  2941      * followed by a comma or period followed by one or more digits.
  2942      * If found, it returns this whole pattern as a word; otherwise we discard
  2943      * the results and resume our normal programming.
  2944      */
  2945     s=*ptr;
  2946     for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
  2947       g_unichar_isalpha(g_utf8_get_char(s)) ||
  2948       g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
  2949 	g_string_append_unichar(word,g_utf8_get_char(s));
  2950     if (word->len)
  2951     {
  2952 	for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))
  2953 	{
  2954 	    c=g_utf8_get_char(t);
  2955 	    pc=g_utf8_get_char(g_utf8_prev_char(t));
  2956 	    if ((c=='.' || c==',') && g_unichar_isdigit(pc))
  2957 	    {
  2958 		*ptr=s;
  2959 		return g_string_free(word,FALSE);
  2960 	    }
  2961 	}
  2962     }
  2963     /* we didn't find a punctuated number - do the regular getword thing */
  2964     g_string_truncate(word,0);
  2965     c=g_utf8_get_char(*ptr);
  2966     for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);
  2967       *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))
  2968 	g_string_append_unichar(word,c);
  2969     return g_string_free(word,FALSE);
  2970 }
  2971 
  2972 /*
  2973  * isroman:
  2974  *
  2975  * Is this word a Roman Numeral?
  2976  *
  2977  * It doesn't actually validate that the number is a valid Roman Numeral--for
  2978  * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
  2979  * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
  2980  * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
  2981  * expressions thereof, except when it came to taxes. Allow any number of M,
  2982  * an optional D, an optional CM or CD, any number of optional Cs, an optional
  2983  * XL or an optional XC, an optional IX or IV, an optional V and any number
  2984  * of optional Is.
  2985  */
  2986 gboolean isroman(const char *t)
  2987 {
  2988     const char *s;
  2989     if (!t || !*t)
  2990 	return FALSE;
  2991     s=t;
  2992     while (g_utf8_get_char(t)=='m' && *t)
  2993 	t++;
  2994     if (g_utf8_get_char(t)=='d')
  2995 	t++;
  2996     if (g_str_has_prefix(t,"cm"))
  2997 	t+=2;
  2998     if (g_str_has_prefix(t,"cd"))
  2999 	t+=2;
  3000     while (g_utf8_get_char(t)=='c' && *t)
  3001 	t++;
  3002     if (g_str_has_prefix(t,"xl"))
  3003 	t+=2;
  3004     if (g_str_has_prefix(t,"xc"))
  3005 	t+=2;
  3006     if (g_utf8_get_char(t)=='l')
  3007 	t++;
  3008     while (g_utf8_get_char(t)=='x' && *t)
  3009 	t++;
  3010     if (g_str_has_prefix(t,"ix"))
  3011 	t+=2;
  3012     if (g_str_has_prefix(t,"iv"))
  3013 	t+=2;
  3014     if (g_utf8_get_char(t)=='v')
  3015 	t++;
  3016     while (g_utf8_get_char(t)=='i' && *t)
  3017 	t++;
  3018     return !*t;
  3019 }
  3020 
  3021 /*
  3022  * postprocess_for_DP:
  3023  *
  3024  * Invoked with the -d switch from flgets().
  3025  * It simply "removes" from the line a hard-coded set of common
  3026  * DP-specific tags, so that the line passed to the main routine has
  3027  * been pre-cleaned of DP markup.
  3028  */
  3029 void postprocess_for_DP(char *theline)
  3030 {
  3031     char *s,*t;
  3032     int i;
  3033     if (!*theline) 
  3034 	return;
  3035     for (i=0;*DPmarkup[i];i++)
  3036 	while ((s=strstr(theline,DPmarkup[i])))
  3037 	{
  3038 	    t=s+strlen(DPmarkup[i]);
  3039 	    memmove(s,t,strlen(t)+1);
  3040 	}
  3041 }
  3042 
  3043 /*
  3044  * postprocess_for_HTML:
  3045  *
  3046  * Invoked with the -m switch from flgets().
  3047  * It simply "removes" from the line a hard-coded set of common
  3048  * HTML tags and "replaces" a hard-coded set of common HTML
  3049  * entities, so that the line passed to the main routine has
  3050  * been pre-cleaned of HTML.
  3051  */
  3052 void postprocess_for_HTML(char *theline)
  3053 {
  3054     while (losemarkup(theline))
  3055 	;
  3056     loseentities(theline);
  3057 }
  3058 
  3059 char *losemarkup(char *theline)
  3060 {
  3061     char *s,*t;
  3062     int i;
  3063     s=strchr(theline,'<');
  3064     t=s?strchr(s,'>'):NULL;
  3065     if (!s || !t)
  3066 	return NULL;
  3067     for (i=0;*markup[i];i++)
  3068 	if (tagcomp(g_utf8_next_char(s),markup[i]))
  3069 	{
  3070 	    t=g_utf8_next_char(t);
  3071 	    memmove(s,t,strlen(t)+1);
  3072 	    return s;
  3073 	}
  3074     /* It's an unrecognized <xxx>. */
  3075     return NULL;
  3076 }
  3077 
  3078 void loseentities(char *theline)
  3079 {
  3080     int i;
  3081     gsize nb;
  3082     char *amp,*scolon;
  3083     gchar *s,*t;
  3084     gunichar c;
  3085     GTree *entities=NULL;
  3086     static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
  3087     if (!theline)
  3088     {
  3089 	if (entities)
  3090 	    g_tree_destroy(entities);
  3091 	entities=NULL;
  3092 	if (translit!=(GIConv)-1)
  3093 	    g_iconv_close(translit);
  3094 	translit=(GIConv)-1;
  3095 	if (to_utf8!=(GIConv)-1)
  3096 	    g_iconv_close(to_utf8);
  3097 	to_utf8=(GIConv)-1;
  3098 	return;
  3099     }
  3100     if (!*theline)
  3101 	return;
  3102     if (!entities)
  3103     {
  3104 	entities=g_tree_new((GCompareFunc)strcmp);
  3105 	for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
  3106 	    g_tree_insert(entities,HTMLentities[i].name,
  3107 	      GUINT_TO_POINTER(HTMLentities[i].c));
  3108     }
  3109     if (translit==(GIConv)-1)
  3110 	translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
  3111     if (to_utf8==(GIConv)-1)
  3112 	to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
  3113     while((amp=strchr(theline,'&')))
  3114     {
  3115 	scolon=strchr(amp,';');
  3116 	if (scolon)
  3117 	{
  3118 	    if (amp[1]=='#')
  3119 	    {
  3120 		if (amp+2+strspn(amp+2,"0123456789")==scolon)
  3121 		    c=strtol(amp+2,NULL,10);
  3122 		else if (amp[2]=='x' &&
  3123 		  amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
  3124 		    c=strtol(amp+3,NULL,16);
  3125 	    }
  3126 	    else
  3127 	    {
  3128 		s=g_strndup(amp+1,scolon-(amp+1));
  3129 	        c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
  3130 		g_free(s);
  3131 	    }
  3132 	}
  3133 	else
  3134 	    c=0;
  3135 	if (c)
  3136 	{
  3137 	    theline=amp;
  3138 	    if (c<128 || c>=192 && c<=255)	/* An ISO-8859-1 character */
  3139 		theline+=g_unichar_to_utf8(c,theline);
  3140 	    else
  3141 	    {
  3142 		s=g_malloc(6);
  3143 		nb=g_unichar_to_utf8(c,s);
  3144 		t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
  3145 		g_free(s);
  3146 		s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
  3147 		g_free(t);
  3148 		memcpy(theline,s,nb);
  3149 		g_free(s);
  3150 		theline+=nb;
  3151 	    }
  3152 	    memmove(theline,g_utf8_next_char(scolon),
  3153 	      strlen(g_utf8_next_char(scolon))+1);
  3154 	}
  3155 	else
  3156 	    theline=g_utf8_next_char(amp);
  3157     }
  3158 }
  3159 
  3160 gboolean tagcomp(const char *strin,const char *basetag)
  3161 {
  3162     gboolean retval;
  3163     gchar *s,*t;
  3164     if (g_utf8_get_char(strin)=='/')
  3165 	t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
  3166     else
  3167 	t=g_utf8_casefold(strin,-1);
  3168     s=g_utf8_casefold(basetag,-1);
  3169     retval=g_str_has_prefix(t,s);
  3170     g_free(s);
  3171     g_free(t);
  3172     return retval;
  3173 }
  3174 
  3175 void proghelp(GOptionContext *context)
  3176 {
  3177     gchar *help;
  3178     fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
  3179     fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
  3180     fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
  3181     fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
  3182       "For details, read the file COPYING.\n",stderr);
  3183     fputs("This is Free Software; "
  3184       "you may redistribute it under certain conditions (GPL);\n",stderr);
  3185     fputs("read the file COPYING for details.\n\n",stderr);
  3186     help=g_option_context_get_help(context,TRUE,NULL);
  3187     fputs(help,stderr);
  3188     g_free(help);
  3189     fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
  3190     fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
  3191       "non-ASCII\n",stderr);
  3192     fputs("characters like accented letters, "
  3193       "lines longer than 75 or shorter than 55,\n",stderr);
  3194     fputs("unbalanced quotes or brackets, "
  3195       "a variety of badly formatted punctuation, \n",stderr);
  3196     fputs("HTML tags, some likely typos. "
  3197       "It is NOT a substitute for human judgement.\n",stderr);
  3198     fputs("\n",stderr);
  3199 }