bookloupe/bookloupe.c
author ali <ali@juiblex.co.uk>
Fri Sep 27 07:19:36 2013 +0100 (2013-09-27)
changeset 125 927fb871d2e3
parent 103 adc06e9e8470
child 128 f3c293593d44
child 132 237b058061f2
permissions -rw-r--r--
Update documentation for 2.0.57
     1 /*************************************************************************/
     2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
     3 /*									 */
     4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>			 */
     5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>			 */
     6 /*									 */
     7 /* This program is free software; you can redistribute it and/or modify  */
     8 /* it under the terms of the GNU General Public License as published by  */
     9 /* the Free Software Foundation; either version 2 of the License, or     */
    10 /* (at your option) any later version.					 */
    11 /*									 */
    12 /* This program is distributed in the hope that it will be useful,       */
    13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of	 */
    14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the		 */
    15 /* GNU General Public License for more details.				 */
    16 /*									 */
    17 /* You should have received a copy of the GNU General Public License	 */
    18 /* along with this program. If not, see <http://www.gnu.org/licenses/>.	 */
    19 /*************************************************************************/
    20 
    21 #include <stdio.h>
    22 #include <stdlib.h>
    23 #include <string.h>
    24 #include <ctype.h>
    25 #ifdef __WIN32__
    26 #include <windows.h>
    27 #endif
    28 #include <glib.h>
    29 #include <bl/bl.h>
    30 #include "bookloupe.h"
    31 #include "counters.h"
    32 #include "pending.h"
    33 #include "HTMLentities.h"
    34 
/*
 * NOTE(review): presumably the previously processed line; nothing in this
 * part of the file reads or writes it -- confirm against its users.
 */
gchar *prevline;

/*
 * Common typos.
 *
 * All entries are lower case. The list mixes what look like keyboard slips
 * (e.g. "teh") with what look like OCR misreadings (e.g. "tirne", "bave").
 * An empty string marks the end of the list.
 */
char *typo[] = {
    "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
    "nad", "te", "ig", "acn",  "ahve", "alot", "anbd", "andt", "awya", "aywa",
    "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
    "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
    "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
    "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
    "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
    "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
    "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
    "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
    "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
    "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
    "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
    "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
    "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
    "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
    "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
    "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
    "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
    "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
    "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
    "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
    "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
    "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
    "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
    "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
    "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
    "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
    "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
    "se", ""
};

/*
 * User-defined typos, keyed by word. Built by read_user_scannos() from the
 * user's typo file and released in main(); NULL when no list was loaded.
 */
GTree *usertypo;
    72 
/*
 * Common abbreviations and other OK words not to query as typos.
 * An empty string marks the end of the list.
 */
char *okword[] = {
    "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
    "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
    "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
    "outbid", "outbids", "frostbite", "frostbitten", ""
};

/*
 * Common abbreviations that cause otherwise unexplained periods.
 * An empty string marks the end of the list.
 */
char *abbrev[] = {
    "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
    "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
};
    86 
/*
 * Two-Letter combinations that rarely if ever start words,
 * but are common scannos or otherwise common letter combinations.
 * An empty string marks the end of the list.
 */
char *nostart[] = {
    "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
};

/*
 * Two-Letter combinations that rarely if ever end words,
 * but are common scannos or otherwise common letter combinations.
 * An empty string marks the end of the list.
 */
char *noend[] = {
    "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
    "sw", "gr", "sl", "cl", "iy", ""
};
   103 
/*
 * HTML element names, lower case and without angle brackets.
 * An empty string marks the end of the list.
 * NOTE(review): presumably the tags recognised when looking for markup in
 * a line -- confirm against linehasmarkup()/losemarkup().
 */
char *markup[] = {
    "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
    "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
    "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
    "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
};

/*
 * DP-specific markup tokens, stored complete with their delimiters.
 * An empty string marks the end of the list.
 * NOTE(review): presumably what the --dp switch ("Ignore DP-specific
 * markup") strips -- confirm against postprocess_for_DP().
 */
char *DPmarkup[] = {
    "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
};
   114 
   115 char *nocomma[] = {
   116     "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
   117     "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
   118     "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
   119     "during", "let", "toward", "among", ""
   120 };
   121 
/*
 * NOTE(review): presumably words that are not normally followed directly
 * by a period -- confirm against the check that consults this table.
 * An empty string marks the end of the list.
 */
char *noperiod[] = {
    "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
    "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
    "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
    "among", "those", "into", "whom", "having", "thence", ""
};
   128 
/*
 * Program switches, indexed by the *_SWITCH constants. The option parser
 * stores directly into this array; parse_options() post-processes it.
 */
gboolean pswit[SWITNO];  /* program switches */

/*
 * Command-line options. Every entry is a plain flag (G_OPTION_ARG_NONE)
 * whose target is the matching element of pswit[].
 *
 * --relaxed, --line-end and --noecho switch a feature OFF: the parser sets
 * their element to TRUE and parse_options() then inverts it, so that after
 * parsing TRUE means "enabled" for PARANOID_SWITCH, LINE_END_SWITCH and
 * ECHO_SWITCH. In paranoid mode parse_options() inverts TYPO_SWITCH too.
 */
static GOptionEntry options[]={
    { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
      "Ignore DP-specific markup", NULL },
    { "noecho", 'e', 0, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
      "Don't echo queried line", NULL },
    { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
      "Check single quotes", NULL },
    { "typo", 't', 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
      "Check common typos", NULL },
    { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
      "Require closure of quotes on every paragraph", NULL },
    { "relaxed", 'x', 0, G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
      "Disable paranoid querying of everything", NULL },
    { "line-end", 'l', 0, G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
      "Disable line end checking", NULL },
    { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
      "Overview: just show counts", NULL },
    { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
      "Output errors to stdout instead of stderr", NULL },
    { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
      "Echo header fields", NULL },
    { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
      "Ignore markup in < >", NULL },
    { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
      "Use file of user-defined typos", NULL },
    { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
      "Defaults for use on www upload", NULL },
    { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
      "Verbose - list everything", NULL },
    { NULL }	/* end-of-table marker */
};
   162 
/*
 * Query tallies. In overview mode main() prints each non-zero cnt_* value
 * below (other than cnt_spacend) together with their total. cnt_spacend is
 * reported separately by report_first_pass() and is not part of that total.
 */
long cnt_quote;		/* for overview mode, count of quote queries */
long cnt_brack;		/* for overview mode, count of brackets queries */
long cnt_bin;		/* for overview mode, count of non-ASCII queries */
long cnt_odd;		/* for overview mode, count of odd character queries */
long cnt_long;		/* for overview mode, count of long line errors */
long cnt_short;		/* for overview mode, count of short line queries */
long cnt_punct;		/* for overview mode,
			   count of punctuation and spacing queries */
long cnt_dash;		/* for overview mode, count of dash-related queries */
long cnt_word;		/* for overview mode, count of word queries */
long cnt_html;		/* for overview mode, count of html queries */
long cnt_lineend;	/* for overview mode, count of line-end queries */
long cnt_spacend;	/* count of lines with space at end */
long linecnt;		/* count of total lines in the file */
long checked_linecnt;	/* count of lines actually checked */
   178 
/* Forward declarations. */
void proghelp(GOptionContext *context);
void procfile(const char *);

/*
 * Directory part of argv[0], set by main(). read_user_scannos() uses it as
 * a second place to look for the user typo file.
 */
gchar *running_from;

/* Forward declarations. */
gboolean mixdigit(const char *);
gchar *getaword(const char **);
char *flgets(char **,long);
void postprocess_for_HTML(char *);
char *linehasmarkup(char *);
char *losemarkup(char *);
gboolean tagcomp(const char *,const char *);
void loseentities(char *);
gboolean isroman(const char *);
void postprocess_for_DP(char *);
void print_as_windows_1252(const char *string);
void print_as_utf_8(const char *string);

/* NOTE(review): not used in this part of the file -- purpose to confirm. */
GTree *qword,*qperiod;

#ifdef __WIN32__
/*
 * Console output code page in force at start-up; saved by main() and
 * restored by cleanup_on_exit().
 */
UINT saved_cp;
#endif
   202 
   203 void parse_options(int *argc,char ***argv)
   204 {
   205     GError *err=NULL;
   206     GOptionContext *context;
   207     context=g_option_context_new(
   208       "file - looks for errors in Project Gutenberg(TM) etexts");
   209     g_option_context_add_main_entries(context,options,NULL);
   210     if (!g_option_context_parse(context,argc,argv,&err))
   211     {
   212 	g_printerr("Bookloupe: %s\n",err->message);
   213 	g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
   214 	exit(1);
   215     }
   216     /* Paranoid checking is turned OFF, not on, by its switch */
   217     pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
   218     if (pswit[PARANOID_SWITCH])
   219 	/* if running in paranoid mode, typo checks default to enabled */
   220 	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
   221     /* Line-end checking is turned OFF, not on, by its switch */
   222     pswit[LINE_END_SWITCH]=!pswit[LINE_END_SWITCH];
   223     /* Echoing is turned OFF, not on, by its switch */
   224     pswit[ECHO_SWITCH]=!pswit[ECHO_SWITCH];
   225     if (pswit[OVERVIEW_SWITCH])
   226 	/* just print summary; don't echo */
   227 	pswit[ECHO_SWITCH]=FALSE;
   228     /*
   229      * Web uploads - for the moment, this is really just a placeholder
   230      * until we decide what processing we really want to do on web uploads
   231      */
   232     if (pswit[WEB_SWITCH])
   233     {
   234 	/* specific override for web uploads */
   235 	pswit[ECHO_SWITCH]=TRUE;
   236 	pswit[SQUOTE_SWITCH]=FALSE;
   237 	pswit[TYPO_SWITCH]=TRUE;
   238 	pswit[QPARA_SWITCH]=FALSE;
   239 	pswit[PARANOID_SWITCH]=TRUE;
   240 	pswit[LINE_END_SWITCH]=FALSE;
   241 	pswit[OVERVIEW_SWITCH]=FALSE;
   242 	pswit[STDOUT_SWITCH]=FALSE;
   243 	pswit[HEADER_SWITCH]=TRUE;
   244 	pswit[VERBOSE_SWITCH]=FALSE;
   245 	pswit[MARKUP_SWITCH]=FALSE;
   246 	pswit[USERTYPO_SWITCH]=FALSE;
   247 	pswit[DP_SWITCH]=FALSE;
   248     }
   249     if (*argc<2)
   250     {
   251 	proghelp(context);
   252 	exit(1);
   253     }
   254     g_option_context_free(context);
   255 }
   256 
   257 /*
   258  * read_user_scannos:
   259  *
   260  * Read in the user-defined stealth scanno list.
   261  */
   262 void read_user_scannos(void)
   263 {
   264     GError *err=NULL;
   265     gchar *usertypo_file;
   266     gboolean okay;
   267     int i;
   268     gsize len,nb;
   269     gchar *contents,*utf8,**lines;
   270     usertypo_file=g_strdup("bookloupe.typ");
   271     okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   272     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   273     {
   274 	g_clear_error(&err);
   275 	g_free(usertypo_file);
   276 	usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
   277 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   278     }
   279     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   280     {
   281 	g_clear_error(&err);
   282 	g_free(usertypo_file);
   283 	usertypo_file=g_strdup("gutcheck.typ");
   284 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   285     }
   286     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   287     {
   288 	g_clear_error(&err);
   289 	g_free(usertypo_file);
   290 	usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
   291 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   292     }
   293     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   294     {
   295 	g_free(usertypo_file);
   296 	g_print("   --> I couldn't find bookloupe.typ "
   297 	  "-- proceeding without user typos.\n");
   298 	return;
   299     }
   300     else if (!okay)
   301     {
   302 	fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
   303 	g_free(usertypo_file);
   304 	g_clear_error(&err);
   305 	exit(1);
   306     }
   307     if (g_utf8_validate(contents,len,NULL))
   308 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
   309     else
   310 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
   311     g_free(contents);
   312     lines=g_strsplit_set(utf8,"\r\n",0);
   313     g_free(utf8);
   314     usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
   315     for (i=0;lines[i];i++)
   316 	if (*(unsigned char *)lines[i]>'!')
   317 	    g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
   318 	else
   319 	    g_free(lines[i]);
   320     g_free(lines);
   321 }
   322 
   323 /*
   324  * read_etext:
   325  *
   326  * Read an etext returning a newly allocated string containing the file
   327  * contents or NULL on error.
   328  */
   329 gchar *read_etext(const char *filename,GError **err)
   330 {
   331     GError *tmp_err=NULL;
   332     gchar *contents,*utf8;
   333     gsize len,bytes_read,bytes_written;
   334     int i,line,col;
   335     if (!g_file_get_contents(filename,&contents,&len,err))
   336 	return NULL;
   337     if (g_utf8_validate(contents,len,NULL))
   338     {
   339 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
   340 	g_set_print_handler(print_as_utf_8);
   341 #ifdef __WIN32__
   342 	SetConsoleOutputCP(CP_UTF8);
   343 #endif
   344     }
   345     else
   346     {
   347 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,
   348 	  &bytes_written,&tmp_err);
   349 	if (g_error_matches(tmp_err,G_CONVERT_ERROR,
   350 	  G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
   351 	{
   352 	    line=col=1;
   353 	    for(i=0;i<bytes_read;i++)
   354 		if (contents[i]=='\n')
   355 		{
   356 		    line++;
   357 		    col=1;
   358 		}
   359 		else if (contents[i]!='\r')
   360 		    col++;
   361 	    g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
   362 	      "Input conversion failed. Byte %d at line %d, column %d is not a "
   363 	      "valid Windows-1252 character",
   364 	      ((unsigned char *)contents)[bytes_read],line,col);
   365 	}
   366 	else if (tmp_err)
   367 	    g_propagate_error(err,tmp_err);
   368 	g_set_print_handler(print_as_windows_1252);
   369 #ifdef __WIN32__
   370 	SetConsoleOutputCP(1252);
   371 #endif
   372     }
   373     g_free(contents);
   374     return utf8;
   375 }
   376 
/*
 * cleanup_on_exit:
 *
 * On Windows, put the console output code page back to the value held in
 * saved_cp. Does nothing on other platforms. main() registers this with
 * atexit() on Windows.
 */
void cleanup_on_exit(void)
{
#ifdef __WIN32__
    SetConsoleOutputCP(saved_cp);
#endif
}
   383 
   384 int main(int argc,char **argv)
   385 {
   386 #ifdef __WIN32__
   387     atexit(cleanup_on_exit);
   388     saved_cp=GetConsoleOutputCP();
   389 #endif
   390     running_from=g_path_get_dirname(argv[0]);
   391     parse_options(&argc,&argv);
   392     if (pswit[USERTYPO_SWITCH])
   393 	read_user_scannos();
   394     fprintf(stderr,"bookloupe: Check and report on an e-text\n");
   395     procfile(argv[1]);
   396     if (pswit[OVERVIEW_SWITCH])
   397     {
   398 	g_print("    Checked %ld lines of %ld (head+foot = %ld)\n\n",
   399 	  checked_linecnt,linecnt,linecnt-checked_linecnt);
   400 	g_print("    --------------- Queries found --------------\n");
   401 	if (cnt_long)
   402 	    g_print("    Long lines:		    %14ld\n",cnt_long);
   403 	if (cnt_short)
   404 	    g_print("    Short lines:		   %14ld\n",cnt_short);
   405 	if (cnt_lineend)
   406 	    g_print("    Line-end problems:	     %14ld\n",cnt_lineend);
   407 	if (cnt_word)
   408 	    g_print("    Common typos:		  %14ld\n",cnt_word);
   409 	if (cnt_quote)
   410 	    g_print("    Unmatched quotes:	      %14ld\n",cnt_quote);
   411 	if (cnt_brack)
   412 	    g_print("    Unmatched brackets:	    %14ld\n",cnt_brack);
   413 	if (cnt_bin)
   414 	    g_print("    Non-ASCII characters:	  %14ld\n",cnt_bin);
   415 	if (cnt_odd)
   416 	    g_print("    Proofing characters:	   %14ld\n",cnt_odd);
   417 	if (cnt_punct)
   418 	    g_print("    Punctuation & spacing queries: %14ld\n",cnt_punct);
   419 	if (cnt_dash)
   420 	    g_print("    Non-standard dashes:	   %14ld\n",cnt_dash);
   421 	if (cnt_html)
   422 	    g_print("    Possible HTML tags:	    %14ld\n",cnt_html);
   423 	g_print("\n");
   424 	g_print("    TOTAL QUERIES		  %14ld\n",
   425 	  cnt_quote+cnt_brack+cnt_bin+cnt_odd+cnt_long+cnt_short+cnt_punct+
   426 	  cnt_dash+cnt_word+cnt_html+cnt_lineend);
   427     }
   428     g_free(running_from);
   429     if (usertypo)
   430 	g_tree_unref(usertypo);
   431     return 0;
   432 }
   433 
   434 /*
   435  * first_pass:
   436  *
   437  * Run a first pass - verify that it's a valid PG
   438  * file, decide whether to report some things that
   439  * occur many times in the text like long or short
   440  * lines, non-standard dashes, etc.
   441  */
   442 struct first_pass_results *first_pass(const char *etext)
   443 {
   444     gunichar laststart=CHAR_SPACE;
   445     const char *s;
   446     gchar *lc_line;
   447     int i,j,lbytes,llen;
   448     gchar **lines;
   449     unsigned int lastlen=0,lastblen=0;
   450     long spline=0,nspline=0;
   451     static struct first_pass_results results={0};
   452     gchar *inword;
   453     lines=g_strsplit(etext,"\n",0);
   454     for (j=0;lines[j];j++)
   455     {
   456 	lbytes=strlen(lines[j]);
   457 	while (lbytes>0 && lines[j][lbytes-1]=='\r')
   458 	    lines[j][--lbytes]='\0';
   459 	llen=g_utf8_strlen(lines[j],lbytes);
   460 	linecnt++;
   461 	if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
   462 	  (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
   463 	{
   464 	    if (spline)
   465 		g_print("   --> Duplicate header?\n");
   466 	    spline=linecnt+1;   /* first line of non-header text, that is */
   467 	}
   468 	if (!strncmp(lines[j],"*** START",9) &&
   469 	  strstr(lines[j],"PROJECT GUTENBERG"))
   470 	{
   471 	    if (nspline)
   472 		g_print("   --> Duplicate header?\n");
   473 	    nspline=linecnt+1;   /* first line of non-header text, that is */
   474 	}
   475 	if (spline || nspline)
   476 	{
   477 	    lc_line=g_utf8_strdown(lines[j],lbytes);
   478 	    if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
   479 	    {
   480 		if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
   481 		{
   482 		    if (results.footerline)
   483 		    {
   484 			/* it's an old-form header - we can detect duplicates */
   485 			if (!nspline)
   486 			    g_print("   --> Duplicate footer?\n");
   487 		    }
   488 		    else
   489 			results.footerline=linecnt;
   490 		}
   491 	    }
   492 	    g_free(lc_line);
   493 	}
   494 	if (spline)
   495 	    results.firstline=spline;
   496 	if (nspline)
   497 	    results.firstline=nspline;  /* override with new */
   498 	if (results.footerline)
   499 	    continue;    /* don't count the boilerplate in the footer */
   500 	results.totlen+=llen;
   501 	for (s=lines[j];*s;s=g_utf8_next_char(s))
   502 	{
   503 	    if (g_utf8_get_char(s)>127)
   504 		results.binlen++;
   505 	    if (g_unichar_isalpha(g_utf8_get_char(s)))
   506 		results.alphalen++;
   507 	    if (s>lines[j] && g_utf8_get_char(s)==CHAR_DQUOTE &&
   508 	      isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
   509 		results.endquote_count++;
   510 	}
   511 	if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
   512 	  lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
   513 	    results.shortline++;
   514 	if (lbytes>0 &&
   515 	  g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
   516 	    cnt_spacend++;
   517 	if (strstr(lines[j],".,"))
   518 	    results.dotcomma++;
   519 	/* only count ast lines for ignoring purposes where there is */
   520 	/* locase text on the line */
   521 	if (strchr(lines[j],'*'))
   522 	{
   523 	    for (s=lines[j];*s;s=g_utf8_next_char(s))
   524 		if (g_unichar_islower(g_utf8_get_char(s)))
   525 		    break;
   526 	    if (*s)
   527 		results.astline++;
   528 	}
   529 	if (strchr(lines[j],'/'))
   530 	    results.fslashline++;
   531 	if (lbytes>0)
   532 	{
   533 	    for (s=g_utf8_prev_char(lines[j]+lbytes);
   534 	      s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;
   535 	      s=g_utf8_prev_char(s))
   536 		;
   537 	    if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
   538 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')
   539 		results.hyphens++;
   540 	}
   541 	if (llen>LONGEST_PG_LINE)
   542 	    results.longline++;
   543 	if (llen>WAY_TOO_LONG)
   544 	    results.verylongline++;
   545 	if (strchr(lines[j],'<') && strchr(lines[j],'>'))
   546 	{
   547 	    i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
   548 	    if (i>0)
   549 		results.htmcount++;
   550 	    if (strstr(lines[j],"<i>"))
   551 		results.htmcount+=4; /* bonus marks! */
   552 	}
   553 	/* Check for spaced em-dashes */
   554 	if (lines[j][0] && (s=strstr(g_utf8_next_char(lines[j]),"--")))
   555 	{
   556 	    results.emdash++;
   557 	    if (s[-1]==CHAR_SPACE || s[2]==CHAR_SPACE)
   558 		results.space_emdash++;
   559 	    if (s[-1]==CHAR_SPACE && s[2]==CHAR_SPACE)
   560 		/* count of em-dashes with spaces both sides */
   561 		results.non_PG_space_emdash++;
   562 	    if (s[-1]!=CHAR_SPACE && s[2]!=CHAR_SPACE)
   563 		/* count of PG-type em-dashes with no spaces */
   564 		results.PG_space_emdash++;
   565 	}
   566 	for (s=lines[j];*s;)
   567 	{
   568 	    inword=getaword(&s);
   569 	    if (!strcmp(inword,"hij") || !strcmp(inword,"niet")) 
   570 		results.Dutchcount++;
   571 	    if (!strcmp(inword,"dans") || !strcmp(inword,"avec")) 
   572 		results.Frenchcount++;
   573 	    if (!strcmp(inword,"0") || !strcmp(inword,"1")) 
   574 		results.standalone_digit++;
   575 	    g_free(inword);
   576 	}
   577 	/* Check for spaced dashes */
   578 	if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
   579 	    results.spacedash++;
   580 	lastblen=lastlen;
   581 	lastlen=llen;
   582 	laststart=lines[j][0];
   583     }
   584     g_strfreev(lines);
   585     return &results;
   586 }
   587 
   588 /*
   589  * report_first_pass:
   590  *
   591  * Make some snap decisions based on the first pass results.
   592  */
   593 struct warnings *report_first_pass(struct first_pass_results *results)
   594 {
   595     static struct warnings warnings={0};
   596     if (cnt_spacend>0)
   597 	g_print("   --> %ld lines in this file have white space at end\n",
   598 	  cnt_spacend);
   599     warnings.dotcomma=1;
   600     if (results->dotcomma>5)
   601     {
   602 	warnings.dotcomma=0;
   603 	g_print("   --> %ld lines in this file contain '.,'. "
   604 	  "Not reporting them.\n",results->dotcomma);
   605     }
   606     /*
   607      * If more than 50 lines, or one-tenth, are short,
   608      * don't bother reporting them.
   609      */
   610     warnings.shortline=1;
   611     if (results->shortline>50 || results->shortline*10>linecnt)
   612     {
   613 	warnings.shortline=0;
   614 	g_print("   --> %ld lines in this file are short. "
   615 	  "Not reporting short lines.\n",results->shortline);
   616     }
   617     /*
   618      * If more than 50 lines, or one-tenth, are long,
   619      * don't bother reporting them.
   620      */
   621     warnings.longline=1;
   622     if (results->longline>50 || results->longline*10>linecnt)
   623     {
   624 	warnings.longline=0;
   625 	g_print("   --> %ld lines in this file are long. "
   626 	  "Not reporting long lines.\n",results->longline);
   627     }
   628     /* If more than 10 lines contain asterisks, don't bother reporting them. */
   629     warnings.ast=1;
   630     if (results->astline>10)
   631     {
   632 	warnings.ast=0;
   633 	g_print("   --> %ld lines in this file contain asterisks. "
   634 	  "Not reporting them.\n",results->astline);
   635     }
   636     /*
   637      * If more than 10 lines contain forward slashes,
   638      * don't bother reporting them.
   639      */
   640     warnings.fslash=1;
   641     if (results->fslashline>10)
   642     {
   643 	warnings.fslash=0;
   644 	g_print("   --> %ld lines in this file contain forward slashes. "
   645 	  "Not reporting them.\n",results->fslashline);
   646     }
   647     /*
   648      * If more than 20 lines contain unpunctuated endquotes,
   649      * don't bother reporting them.
   650      */
   651     warnings.endquote=1;
   652     if (results->endquote_count>20)
   653     {
   654 	warnings.endquote=0;
   655 	g_print("   --> %ld lines in this file contain unpunctuated endquotes. "
   656 	  "Not reporting them.\n",results->endquote_count);
   657     }
   658     /*
   659      * If more than 15 lines contain standalone digits,
   660      * don't bother reporting them.
   661      */
   662     warnings.digit=1;
   663     if (results->standalone_digit>10)
   664     {
   665 	warnings.digit=0;
   666 	g_print("   --> %ld lines in this file contain standalone 0s and 1s. "
   667 	  "Not reporting them.\n",results->standalone_digit);
   668     }
   669     /*
   670      * If more than 20 lines contain hyphens at end,
   671      * don't bother reporting them.
   672      */
   673     warnings.hyphen=1;
   674     if (results->hyphens>20)
   675     {
   676 	warnings.hyphen=0;
   677 	g_print("   --> %ld lines in this file have hyphens at end. "
   678 	  "Not reporting them.\n",results->hyphens);
   679     }
   680     if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
   681     {
   682 	g_print("   --> Looks like this is HTML. Switching HTML mode ON.\n");
   683 	pswit[MARKUP_SWITCH]=1;
   684     }
   685     if (results->verylongline>0)
   686 	g_print("   --> %ld lines in this file are VERY long!\n",
   687 	  results->verylongline);
   688     /*
   689      * If there are more non-PG spaced dashes than PG em-dashes,
   690      * assume it's deliberate.
   691      * Current PG guidelines say don't use them, but older texts do,
   692      * and some people insist on them whatever the guidelines say.
   693      */
   694     warnings.dash=1;
   695     if (results->spacedash+results->non_PG_space_emdash>
   696       results->PG_space_emdash)
   697     {
   698 	warnings.dash=0;
   699 	g_print("   --> There are %ld spaced dashes and em-dashes. "
   700 	  "Not reporting them.\n",
   701 	  results->spacedash+results->non_PG_space_emdash);
   702     }
   703     /* If more than a quarter of characters are hi-bit, bug out. */
   704     warnings.bin=1;
   705     if (results->binlen*4>results->totlen)
   706     {
   707 	g_print("   --> This file does not appear to be ASCII. "
   708 	  "Terminating. Best of luck with it!\n");
   709 	exit(1);
   710     }
   711     if (results->alphalen*4<results->totlen)
   712     {
   713 	g_print("   --> This file does not appear to be text. "
   714 	  "Terminating. Best of luck with it!\n");
   715 	exit(1);
   716     }
   717     if (results->binlen*100>results->totlen || results->binlen>100)
   718     {
   719 	g_print("   --> There are a lot of foreign letters here. "
   720 	  "Not reporting them.\n");
   721 	warnings.bin=0;
   722     }
   723     warnings.isDutch=FALSE;
   724     if (results->Dutchcount>50)
   725     {
   726 	warnings.isDutch=TRUE;
   727 	g_print("   --> This looks like Dutch - "
   728 	  "switching off dashes and warnings for 's Middags case.\n");
   729     }
   730     warnings.isFrench=FALSE;
   731     if (results->Frenchcount>50)
   732     {
   733 	warnings.isFrench=TRUE;
   734 	g_print("   --> This looks like French - "
   735 	  "switching off some doublepunct.\n");
   736     }
   737     if (results->firstline && results->footerline)
   738 	g_print("    The PG header and footer appear to be already on.\n");
   739     else
   740     {
   741 	if (results->firstline)
   742 	    g_print("    The PG header is on - no footer.\n");
   743 	if (results->footerline)
   744 	    g_print("    The PG footer is on - no header.\n");
   745     }
   746     g_print("\n");
   747     if (pswit[VERBOSE_SWITCH])
   748     {
   749 	warnings.bin=1;
   750 	warnings.shortline=1;
   751 	warnings.dotcomma=1;
   752 	warnings.longline=1;
   753 	warnings.dash=1;
   754 	warnings.digit=1;
   755 	warnings.ast=1;
   756 	warnings.fslash=1;
   757 	warnings.hyphen=1;
   758 	warnings.endquote=1;
   759 	g_print("   *** Verbose output is ON -- you asked for it! ***\n");
   760     }
   761     if (warnings.isDutch)
   762 	warnings.dash=0;
   763     if (results->footerline>0 && results->firstline>0 &&
   764       results->footerline>results->firstline &&
   765       results->footerline-results->firstline<100)
   766     {
   767 	g_print("   --> I don't really know where this text starts. \n");
   768 	g_print("       There are no reference points.\n");
   769 	g_print("       I'm going to have to report the header and footer "
   770 	  "as well.\n");
   771 	results->firstline=0;
   772     }
   773     return &warnings;
   774 }
   775 
   776 /*
   777  * analyse_quotes:
   778  *
   779  * Look along the line, accumulate the count of quotes, and see
   780  * if this is an empty line - i.e. a line with nothing on it
   781  * but spaces.
   782  * If line has just spaces, period, * and/or - on it, don't
   783  * count it, since empty lines with asterisks or dashes to
   784  * separate sections are common.
   785  *
   786  * Returns: TRUE if the line is empty.
   787  */
   788 gboolean analyse_quotes(const char *aline,int linecnt,struct counters *counters)
   789 {
   790     int guessquote=0;
   791     /* assume the line is empty until proven otherwise */
   792     gboolean isemptyline=TRUE;
   793     const char *s=aline,*sprev,*snext;
   794     gunichar c;
   795     sprev=NULL;
   796     GError *tmp_err=NULL;
   797     while (*s)
   798     {
   799 	snext=g_utf8_next_char(s);
   800 	c=g_utf8_get_char(s);
   801 	if (CHAR_IS_DQUOTE(c))
   802 	    (void)count_quote(counters,c,QUOTE_CLASS(c),&tmp_err);
   803 	else if (CHAR_IS_SQUOTE(c) && pswit[SQUOTE_SWITCH])
   804 	{
   805 	    if (s==aline)
   806 	    {
   807 		/*
   808 		 * At start of line, it can only be a quotation mark.
   809 		 * Hardcode a very common exception!
   810 		 */
   811 		if (!g_str_has_prefix(snext,"tis") &&
   812 		  !g_str_has_prefix(snext,"Tis"))
   813 		    (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
   814 	    }
   815 	    else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
   816 	      g_unichar_isalpha(g_utf8_get_char(snext)))
   817 		/* Do nothing! it's definitely an apostrophe, not a quote */
   818 		;
   819 	    /* it's outside a word - let's check it out */
   820 	    else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||
   821 	      g_unichar_isalpha(g_utf8_get_char(snext)))
   822 	    {
   823 		/* certainly looks like a quotation mark */
   824 		if (!g_str_has_prefix(snext,"tis") &&
   825 		  !g_str_has_prefix(snext,"Tis"))
   826 		    /* hardcode a very common exception! */
   827 		{
   828 		    if (strchr(".?!,;:",g_utf8_get_char(sprev)))
   829 			(void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
   830 		    else
   831 			(void)count_quote(counters,c,OPENING_QUOTE,&tmp_err);
   832 		}
   833 	    }
   834 	    else
   835 	    {
   836 		/* now - is it a quotation mark? */
   837 		guessquote=0;   /* accumulate clues */
   838 		if (g_unichar_isalpha(g_utf8_get_char(sprev)))
   839 		{
   840 		    /* it follows a letter - could be either */
   841 		    guessquote++;
   842 		    if (g_utf8_get_char(sprev)=='s')
   843 		    {
   844 			/* looks like a plural apostrophe */
   845 			guessquote-=3;
   846 			if (g_utf8_get_char(snext)==CHAR_SPACE)
   847 			    /* bonus marks! */
   848 			    guessquote-=2;
   849 		    }
   850 		    if (innermost_quote_matches(counters,c))
   851 			/*
   852 			 * Give it the benefit of some doubt,
   853 			 * if a squote is already open.
   854 			 */
   855 			guessquote++;
   856 		    else
   857 			guessquote--;
   858 		    if (guessquote>=0)
   859 			(void)count_quote(counters,c,CLOSING_QUOTE,&tmp_err);
   860 		}
   861 		else
   862 		    /* no adjacent letter - it must be a quote of some kind */
   863 		    (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
   864 	    }
   865 	}
   866 	if (tmp_err)
   867 	{
   868 	    if (pswit[ECHO_SWITCH])
   869 		g_print("\n%s\n",aline);
   870 	    if (!pswit[OVERVIEW_SWITCH])
   871 		g_print("    Line %ld column %ld - %s\n",
   872 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1,tmp_err->message);
   873 	    g_clear_error(&tmp_err);
   874 	}
   875 	if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
   876 	  c!='\r' && c!='\n')
   877 	    isemptyline=FALSE;  /* ignore lines like  *  *  *  as spacers */
   878 	if (c==CHAR_UNDERSCORE)
   879 	    counters->c_unders++;
   880 	if (c==CHAR_OPEN_SBRACK)
   881 	{
   882 	    if (!matching_difference(counters,COUNTER_ILLUSTRATION) &&
   883 	      !matching_difference(counters,c) && s==aline &&
   884 	      g_str_has_prefix(s,"[Illustration:"))
   885 		increment_matching(counters,COUNTER_ILLUSTRATION,TRUE);
   886 	    else
   887 		increment_matching(counters,c,TRUE);
   888 	}
   889 	else if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK)
   890 	    increment_matching(counters,c,TRUE);
   891 	if (c==CHAR_CLOSE_SBRACK)
   892 	{
   893 	    if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) &&
   894 	      !matching_difference(counters,c) && !*snext)
   895 		increment_matching(counters,COUNTER_ILLUSTRATION,FALSE);
   896 	    else
   897 		increment_matching(counters,c,FALSE);
   898 	}
   899 	else if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK)
   900 	    increment_matching(counters,c,FALSE);
   901 	sprev=s;
   902 	s=snext;
   903     }
   904     return isemptyline;
   905 }
   906 
   907 /*
   908  * check_for_control_characters:
   909  *
   910  * Check for invalid or questionable characters in the line
   911  * Anything above 127 is invalid for plain ASCII, and
   912  * non-printable control characters should also be flagged.
   913  * Tabs should generally not be there.
   914  */
   915 void check_for_control_characters(const char *aline)
   916 {
   917     gunichar c;
   918     const char *s;
   919     for (s=aline;*s;s=g_utf8_next_char(s))
   920     {
   921 	c=g_utf8_get_char(s);
   922 	if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
   923 	{
   924 	    if (pswit[ECHO_SWITCH])
   925 		g_print("\n%s\n",aline);
   926 	    if (!pswit[OVERVIEW_SWITCH])
   927 		g_print("    Line %ld column %ld - Control character %u\n",
   928 		  linecnt,g_utf8_pointer_to_offset(s,aline)+1,c);
   929 	    else
   930 		cnt_bin++;
   931 	}
   932     }
   933 }
   934 
   935 /*
   936  * check_for_odd_characters:
   937  *
   938  * Check for binary and other odd characters.
   939  */
   940 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
   941   gboolean isemptyline)
   942 {
   943     /* Don't repeat multiple warnings on one line. */
   944     gboolean eNon_A=FALSE,eTab=FALSE,eTilde=FALSE;
   945     gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
   946     const char *s;
   947     gunichar c;
   948     for (s=aline;*s;s=g_utf8_next_char(s))
   949     {
   950 	c=g_utf8_get_char(s);
   951 	if (!eNon_A && (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
   952 	{
   953 	    if (pswit[ECHO_SWITCH])
   954 		g_print("\n%s\n",aline);
   955 	    if (!pswit[OVERVIEW_SWITCH])
   956 		if (c>127 && c<160 || c>255)
   957 		    g_print("    Line %ld column %ld - "
   958 		      "Non-ISO-8859 character %u\n",
   959 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
   960 		else
   961 		    g_print("    Line %ld column %ld - "
   962 		      "Non-ASCII character %u\n",
   963 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
   964 	    else
   965 		cnt_bin++;
   966 	    eNon_A=TRUE;
   967 	}
   968 	if (!eTab && c==CHAR_TAB)
   969 	{
   970 	    if (pswit[ECHO_SWITCH])
   971 		g_print("\n%s\n",aline);
   972 	    if (!pswit[OVERVIEW_SWITCH])
   973 		g_print("    Line %ld column %ld - Tab character?\n",
   974 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
   975 	    else
   976 		cnt_odd++;
   977 	    eTab=TRUE;
   978 	}
   979 	if (!eTilde && c==CHAR_TILDE)
   980 	{
   981 	    /*
   982 	     * Often used by OCR software to indicate an
   983 	     * unrecognizable character.
   984 	     */
   985 	    if (pswit[ECHO_SWITCH])
   986 		g_print("\n%s\n",aline);
   987 	    if (!pswit[OVERVIEW_SWITCH])
   988 		g_print("    Line %ld column %ld - Tilde character?\n",
   989 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
   990 	    else
   991 		cnt_odd++;
   992 	    eTilde=TRUE;
   993 	}
   994 	if (!eCarat && c==CHAR_CARAT)
   995 	{  
   996 	    if (pswit[ECHO_SWITCH])
   997 		g_print("\n%s\n",aline);
   998 	    if (!pswit[OVERVIEW_SWITCH])
   999 		g_print("    Line %ld column %ld - Carat character?\n",
  1000 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1001 	    else
  1002 		cnt_odd++;
  1003 	    eCarat=TRUE;
  1004 	}
  1005 	if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
  1006 	{  
  1007 	    if (pswit[ECHO_SWITCH])
  1008 		g_print("\n%s\n",aline);
  1009 	    if (!pswit[OVERVIEW_SWITCH])
  1010 		g_print("    Line %ld column %ld - Forward slash?\n",
  1011 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1012 	    else
  1013 		cnt_odd++;
  1014 	    eFSlash=TRUE;
  1015 	}
  1016 	/*
  1017 	 * Report asterisks only in paranoid mode,
  1018 	 * since they're often deliberate.
  1019 	 */
  1020 	if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
  1021 	  c==CHAR_ASTERISK)
  1022 	{
  1023 	    if (pswit[ECHO_SWITCH])
  1024 		g_print("\n%s\n",aline);
  1025 	    if (!pswit[OVERVIEW_SWITCH])
  1026 		g_print("    Line %ld column %ld - Asterisk?\n",
  1027 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1028 	    else
  1029 		cnt_odd++;
  1030 	    eAst=TRUE;
  1031 	}
  1032     }
  1033 }
  1034 
  1035 /*
  1036  * check_for_long_line:
  1037  *
  1038  * Check for line too long.
  1039  */
  1040 void check_for_long_line(const char *aline)
  1041 {
  1042     if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
  1043     {
  1044 	if (pswit[ECHO_SWITCH])
  1045 	    g_print("\n%s\n",aline);
  1046 	if (!pswit[OVERVIEW_SWITCH])
  1047 	    g_print("    Line %ld column %ld - Long line %ld\n",
  1048 	      linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
  1049 	else
  1050 	    cnt_long++;
  1051     }
  1052 }
  1053 
  1054 /*
  1055  * check_for_short_line:
  1056  *
  1057  * Check for line too short.
  1058  *
  1059  * This one is a bit trickier to implement: we don't want to
  1060  * flag the last line of a paragraph for being short, so we
  1061  * have to wait until we know that our current line is a
  1062  * "normal" line, then report the _previous_ line if it was too
  1063  * short. We also don't want to report indented lines like
  1064  * chapter heads or formatted quotations. We therefore keep
  1065  * last->len as the length of the last line examined, and
  1066  * last->blen as the length of the last but one, and try to
  1067  * suppress unnecessary warnings by checking that both were of
  1068  * "normal" length. We keep the first character of the last
  1069  * line in last->start, and if it was a space, we assume that
  1070  * the formatting is deliberate. I can't figure out a way to
  1071  * distinguish something like a quoted verse left-aligned or
  1072  * the header or footer of a letter from a paragraph of short
  1073  * lines - maybe if I examined the whole paragraph, and if the
  1074  * para has less than, say, 8 lines and if all lines are short,
  1075  * then just assume it's OK? Need to look at some texts to see
  1076  * how often a formula like this would get the right result.
  1077  */
  1078 void check_for_short_line(const char *aline,const struct line_properties *last)
  1079 {
  1080     if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
  1081       last->len<SHORTEST_PG_LINE && last->blen>1 &&
  1082       last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
  1083     {
  1084 	if (pswit[ECHO_SWITCH])
  1085 	    g_print("\n%s\n",prevline);
  1086 	if (!pswit[OVERVIEW_SWITCH])
  1087 	    g_print("    Line %ld column %ld - Short line %ld?\n",
  1088 	      linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
  1089 	else
  1090 	    cnt_short++;
  1091     }
  1092 }
  1093 
  1094 /*
  1095  * check_for_starting_punctuation:
  1096  *
  1097  * Look for punctuation other than full ellipses at start of line.
  1098  */
  1099 void check_for_starting_punctuation(const char *aline)
  1100 {
  1101     if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
  1102       !g_str_has_prefix(aline,". . ."))
  1103     {
  1104 	if (pswit[ECHO_SWITCH])
  1105 	    g_print("\n%s\n",aline);
  1106 	if (!pswit[OVERVIEW_SWITCH])
  1107 	    g_print("    Line %ld column 1 - Begins with punctuation?\n",
  1108 	      linecnt);
  1109 	else
  1110 	    cnt_punct++;
  1111     }
  1112 }
  1113 
  1114 /*
  1115  * check_for_spaced_emdash:
  1116  *
  1117  * Check for spaced em-dashes.
  1118  *
  1119  * We must check _all_ occurrences of "--" on the line
  1120  * hence the loop - even if the first double-dash is OK
  1121  * there may be another that's wrong later on.
  1122  */
  1123 void check_for_spaced_emdash(const char *aline)
  1124 {
  1125     const char *s,*t,*next;
  1126     for (s=aline;t=strstr(s,"--");s=next)
  1127     {
  1128 	next=g_utf8_next_char(g_utf8_next_char(t));
  1129 	if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
  1130 	  g_utf8_get_char(next)==CHAR_SPACE)
  1131 	{
  1132 	    if (pswit[ECHO_SWITCH])
  1133 		g_print("\n%s\n",aline);
  1134 	    if (!pswit[OVERVIEW_SWITCH])
  1135 		g_print("    Line %ld column %ld - Spaced em-dash?\n",
  1136 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  1137 	    else
  1138 		cnt_dash++;
  1139 	}
  1140     }
  1141 }
  1142 
  1143 /*
  1144  * check_for_spaced_dash:
  1145  *
  1146  * Check for spaced dashes.
  1147  */
  1148 void check_for_spaced_dash(const char *aline)
  1149 {
  1150     const char *s;
  1151     if ((s=strstr(aline," -")))
  1152     {
  1153 	if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
  1154 	{
  1155 	    if (pswit[ECHO_SWITCH])
  1156 		g_print("\n%s\n",aline);
  1157 	    if (!pswit[OVERVIEW_SWITCH])
  1158 		g_print("    Line %ld column %ld - Spaced dash?\n",
  1159 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1160 	    else
  1161 		cnt_dash++;
  1162 	}
  1163     }
  1164     else if ((s=strstr(aline,"- ")))
  1165     {
  1166 	if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
  1167 	{
  1168 	    if (pswit[ECHO_SWITCH])
  1169 		g_print("\n%s\n",aline);
  1170 	    if (!pswit[OVERVIEW_SWITCH])
  1171 		g_print("    Line %ld column %ld - Spaced dash?\n",
  1172 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1173 	    else
  1174 		cnt_dash++;
  1175 	}
  1176     }
  1177 }
  1178 
  1179 /*
  1180  * check_for_unmarked_paragraphs:
  1181  *
  1182  * Check for unmarked paragraphs indicated by separate speakers.
  1183  *
  1184  * May well be false positive:
  1185  * "Bravo!" "Wonderful!" called the crowd.
  1186  * but useful all the same.
  1187  */
  1188 void check_for_unmarked_paragraphs(const char *aline)
  1189 {
  1190     const char *s;
  1191     s=strstr(aline,"\"  \"");
  1192     if (!s)
  1193 	s=strstr(aline,"\" \"");
  1194     if (s)
  1195     {
  1196 	if (pswit[ECHO_SWITCH])
  1197 	    g_print("\n%s\n",aline);
  1198 	if (!pswit[OVERVIEW_SWITCH])
  1199 	    g_print("    Line %ld column %ld - "
  1200 	      "Query missing paragraph break?\n",
  1201 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1202 	else
  1203 	    cnt_punct++;
  1204     }
  1205 }
  1206 
  1207 /*
  1208  * check_for_jeebies:
  1209  *
  1210  * Check for "to he" and other easy h/b errors.
  1211  *
  1212  * This is a very inadequate effort on the h/b problem,
  1213  * but the phrase "to he" is always an error, whereas "to
  1214  * be" is quite common.
  1215  * Similarly, '"Quiet!", be said.' is a non-be error
  1216  * "to he" is _not_ always an error!:
  1217  *       "Where they went to he couldn't say."
  1218  * Another false positive:
  1219  *       What would "Cinderella" be without the . . .
  1220  * and another: "If he wants to he can see for himself."
  1221  */
  1222 void check_for_jeebies(const char *aline)
  1223 {
  1224     const char *s;
  1225     s=strstr(aline," be could ");
  1226     if (!s)
  1227 	s=strstr(aline," be would ");
  1228     if (!s)
  1229 	s=strstr(aline," was be ");
  1230     if (!s)
  1231 	s=strstr(aline," be is ");
  1232     if (!s)
  1233 	s=strstr(aline," is be ");
  1234     if (!s)
  1235 	s=strstr(aline,"\", be ");
  1236     if (!s)
  1237 	s=strstr(aline,"\" be ");
  1238     if (!s)
  1239 	s=strstr(aline,"\" be ");
  1240     if (!s)
  1241 	s=strstr(aline," to he ");
  1242     if (s)
  1243     {
  1244 	if (pswit[ECHO_SWITCH])
  1245 	    g_print("\n%s\n",aline);
  1246 	if (!pswit[OVERVIEW_SWITCH])
  1247 	    g_print("    Line %ld column %ld - Query he/be error?\n",
  1248 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1249 	else
  1250 	    cnt_word++;
  1251     }
  1252     s=strstr(aline," the had ");
  1253     if (!s)
  1254 	s=strstr(aline," a had ");
  1255     if (!s)
  1256 	s=strstr(aline," they bad ");
  1257     if (!s)
  1258 	s=strstr(aline," she bad ");
  1259     if (!s)
  1260 	s=strstr(aline," he bad ");
  1261     if (!s)
  1262 	s=strstr(aline," you bad ");
  1263     if (!s)
  1264 	s=strstr(aline," i bad ");
  1265     if (s)
  1266     {
  1267 	if (pswit[ECHO_SWITCH])
  1268 	    g_print("\n%s\n",aline);
  1269 	if (!pswit[OVERVIEW_SWITCH])
  1270 	    g_print("    Line %ld column %ld - Query had/bad error?\n",
  1271 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1272 	else
  1273 	    cnt_word++;
  1274     }
  1275     s=strstr(aline,"; hut ");
  1276     if (!s)
  1277 	s=strstr(aline,", hut ");
  1278     if (s)
  1279     {
  1280 	if (pswit[ECHO_SWITCH])
  1281 	    g_print("\n%s\n",aline);
  1282 	if (!pswit[OVERVIEW_SWITCH])
  1283 	    g_print("    Line %ld column %ld - Query hut/but error?\n",
  1284 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1285 	else
  1286 	    cnt_word++;
  1287     }
  1288 }
  1289 
  1290 /*
  1291  * check_for_mta_from:
  1292  *
  1293  * Special case - angled bracket in front of "From" placed there by an
  1294  * MTA when sending an e-mail.
  1295  */
  1296 void check_for_mta_from(const char *aline)
  1297 {
  1298     const char *s;
  1299     s=strstr(aline,">From");
  1300     if (s)
  1301     {
  1302 	if (pswit[ECHO_SWITCH])
  1303 	    g_print("\n%s\n",aline);
  1304 	if (!pswit[OVERVIEW_SWITCH])
  1305 	    g_print("    Line %ld column %ld - "
  1306 	      "Query angled bracket with From\n",
  1307 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1308 	else
  1309 	    cnt_punct++;
  1310     }
  1311 }
  1312 
  1313 /*
  1314  * check_for_orphan_character:
  1315  *
  1316  * Check for a single character line -
  1317  * often an overflow from bad wrapping.
  1318  */
  1319 void check_for_orphan_character(const char *aline)
  1320 {
  1321     gunichar c;
  1322     c=g_utf8_get_char(aline);
  1323     if (c && !*g_utf8_next_char(aline))
  1324     {
  1325 	if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
  1326 	    ; /* Nothing - ignore numerals alone on a line. */
  1327 	else
  1328 	{
  1329 	    if (pswit[ECHO_SWITCH])
  1330 		g_print("\n%s\n",aline);
  1331 	    if (!pswit[OVERVIEW_SWITCH])
  1332 		g_print("    Line %ld column 1 - Query single character line\n",
  1333 		  linecnt);
  1334 	    else
  1335 		cnt_punct++;
  1336 	}
  1337     }
  1338 }
  1339 
  1340 /*
  1341  * check_for_pling_scanno:
  1342  *
  1343  * Check for I" - often should be !
  1344  */
  1345 void check_for_pling_scanno(const char *aline)
  1346 {
  1347     const char *s;
  1348     s=strstr(aline," I\"");
  1349     if (s)
  1350     {
  1351 	if (pswit[ECHO_SWITCH])
  1352 	    g_print("\n%s\n",aline);
  1353 	if (!pswit[OVERVIEW_SWITCH])
  1354 	    g_print("    Line %ld column %ld - Query I=exclamation mark?\n",
  1355 	      linecnt,g_utf8_pointer_to_offset(aline,s));
  1356 	else
  1357 	    cnt_punct++;
  1358     }
  1359 }
  1360 
  1361 /*
  1362  * check_for_extra_period:
  1363  *
  1364  * Check for period without a capital letter. Cut-down from gutspell.
  1365  * Only works when it happens on a single line.
  1366  */
  1367 void check_for_extra_period(const char *aline,const struct warnings *warnings)
  1368 {
  1369     const char *s,*t,*s1,*sprev;
  1370     int i;
  1371     gsize len;
  1372     gboolean istypo;
  1373     gchar *testword;
  1374     gunichar c,nc,pc,*decomposition;
  1375     if (pswit[PARANOID_SWITCH])
  1376     {
  1377 	for (t=aline;t=strstr(t,". ");)
  1378 	{
  1379 	    if (t==aline)
  1380 	    {
  1381 		t=g_utf8_next_char(t);
  1382 		/* start of line punctuation is handled elsewhere */
  1383 		continue;
  1384 	    }
  1385 	    if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
  1386 	    {
  1387 		t=g_utf8_next_char(t);
  1388 		continue;
  1389 	    }
  1390 	    if (warnings->isDutch)
  1391 	    {
  1392 		/* For Frank & Jeroen -- 's Middags case */
  1393 		gunichar c2,c3,c4,c5;
  1394 		c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
  1395 		c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
  1396 		c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
  1397 		c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
  1398 		if (CHAR_IS_APOSTROPHE(c2) &&
  1399 		  g_unichar_islower(c3) && c4==CHAR_SPACE &&
  1400 		  g_unichar_isupper(c5))
  1401 		{
  1402 		    t=g_utf8_next_char(t);
  1403 		    continue;
  1404 		}
  1405 	    }
  1406 	    s1=g_utf8_next_char(g_utf8_next_char(t));
  1407 	    while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
  1408 	      !isdigit(g_utf8_get_char(s1)))
  1409 		s1=g_utf8_next_char(s1);
  1410 	    if (g_unichar_islower(g_utf8_get_char(s1)))
  1411 	    {
  1412 		/* we have something to investigate */
  1413 		istypo=TRUE;
  1414 		/* so let's go back and find out */
  1415 		nc=g_utf8_get_char(t);
  1416 		s1=g_utf8_prev_char(t);
  1417 		c=g_utf8_get_char(s1);
  1418 		sprev=g_utf8_prev_char(s1);
  1419 		pc=g_utf8_get_char(sprev);
  1420 		while (s1>=aline &&
  1421 		  (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||
  1422 		  g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&
  1423 		  g_unichar_isalpha(nc)))
  1424 		{
  1425 		    nc=c;
  1426 		    s1=sprev;
  1427 		    c=pc;
  1428 		    sprev=g_utf8_prev_char(s1);
  1429 		    pc=g_utf8_get_char(sprev);
  1430 		}
  1431 		s1=g_utf8_next_char(s1);
  1432 		s=strchr(s1,'.');
  1433 		if (s)
  1434 		    testword=g_strndup(s1,s-s1);
  1435 		else
  1436 		    testword=g_strdup(s1);
  1437 		for (i=0;*abbrev[i];i++)
  1438 		    if (!strcmp(testword,abbrev[i]))
  1439 			istypo=FALSE;
  1440 		if (g_unichar_isdigit(g_utf8_get_char(testword)))
  1441 		    istypo=FALSE;
  1442 		if (!*g_utf8_next_char(testword))
  1443 		    istypo=FALSE;
  1444 		if (isroman(testword))
  1445 		    istypo=FALSE;
  1446 		if (istypo)
  1447 		{
  1448 		    istypo=FALSE;
  1449 		    for (s=testword;*s;s=g_utf8_next_char(s))
  1450 		    {
  1451 			decomposition=g_unicode_canonical_decomposition(
  1452 			  g_utf8_get_char(s),&len);
  1453 			if (g_utf8_strchr("aeiou",-1,decomposition[0]))
  1454 			    istypo=TRUE;
  1455 			g_free(decomposition);
  1456 		    }
  1457 		}
  1458 		if (istypo &&
  1459 		  (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
  1460 		{
  1461 		    g_tree_insert(qperiod,g_strdup(testword),
  1462 		      GINT_TO_POINTER(1));
  1463 		    if (pswit[ECHO_SWITCH])
  1464 			g_print("\n%s\n",aline);
  1465 		    if (!pswit[OVERVIEW_SWITCH])
  1466 			g_print("    Line %ld column %ld - Extra period?\n",
  1467 			  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  1468 		    else
  1469 			cnt_punct++;
  1470 		}
  1471 		g_free(testword);
  1472 	    }
  1473 	    t=g_utf8_next_char(t);
  1474 	}
  1475     }
  1476 }
  1477 
  1478 /*
  1479  * check_for_following_punctuation:
  1480  *
  1481  * Check for words usually not followed by punctuation.
  1482  */
  1483 void check_for_following_punctuation(const char *aline)
  1484 {
  1485     int i;
  1486     const char *s,*wordstart;
  1487     gunichar c;
  1488     gchar *inword,*t;
  1489     if (pswit[TYPO_SWITCH])
  1490     {
  1491 	for (s=aline;*s;)
  1492 	{
  1493 	    wordstart=s;
  1494 	    t=getaword(&s);
  1495 	    if (!*t)
  1496 	    {
  1497 		g_free(t);
  1498 		continue;
  1499 	    }
  1500 	    inword=g_utf8_strdown(t,-1);
  1501 	    g_free(t);
  1502 	    for (i=0;*nocomma[i];i++)
  1503 		if (!strcmp(inword,nocomma[i]))
  1504 		{
  1505 		    c=g_utf8_get_char(s);
  1506 		    if (c==',' || c==';' || c==':')
  1507 		    {
  1508 			if (pswit[ECHO_SWITCH])
  1509 			    g_print("\n%s\n",aline);
  1510 			if (!pswit[OVERVIEW_SWITCH])
  1511 			    g_print("    Line %ld column %ld - "
  1512 			      "Query punctuation after %s?\n",
  1513 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,
  1514 			      inword);
  1515 			else
  1516 			    cnt_punct++;
  1517 		    }
  1518 		}
  1519 	    for (i=0;*noperiod[i];i++)
  1520 		if (!strcmp(inword,noperiod[i]))
  1521 		{
  1522 		    c=g_utf8_get_char(s);
  1523 		    if (c=='.' || c=='!')
  1524 		    {
  1525 			if (pswit[ECHO_SWITCH])
  1526 			    g_print("\n%s\n",aline);
  1527 			if (!pswit[OVERVIEW_SWITCH])
  1528 			    g_print("    Line %ld column %ld - "
  1529 			      "Query punctuation after %s?\n",
  1530 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,
  1531 			      inword);
  1532 			else
  1533 			    cnt_punct++;
  1534 		    }
  1535 		}
  1536 	    g_free(inword);
  1537 	}
  1538     }
  1539 }
  1540 
  1541 /*
  1542  * check_for_typos:
  1543  *
  1544  * Check for commonly mistyped words,
  1545  * and digits like 0 for O in a word.
  1546  */
  1547 void check_for_typos(const char *aline,struct warnings *warnings)
  1548 {
  1549     const char *s,*t,*nt,*wordstart;
  1550     gchar *inword;
  1551     gunichar *decomposition;
  1552     gchar *testword;
  1553     int i,vowel,consonant,*dupcnt;
  1554     gboolean isdup,istypo,alower;
  1555     gunichar c,pc;
  1556     long offset,len;
  1557     gsize decomposition_len;
  1558     for (s=aline;*s;)
  1559     {
  1560 	wordstart=s;
  1561 	inword=getaword(&s);
  1562 	if (!*inword)
  1563 	{
  1564 	    g_free(inword);
  1565 	    continue; /* don't bother with empty lines */
  1566 	}
  1567 	if (mixdigit(inword))
  1568 	{
  1569 	    if (pswit[ECHO_SWITCH])
  1570 		g_print("\n%s\n",aline);
  1571 	    if (!pswit[OVERVIEW_SWITCH])
  1572 		g_print("    Line %ld column %ld - Query digit in %s\n",
  1573 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
  1574 	    else
  1575 		cnt_word++;
  1576 	}
  1577 	/*
  1578 	 * Put the word through a series of tests for likely typos and OCR
  1579 	 * errors.
  1580 	 */
  1581 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
  1582 	{
  1583 	    istypo=FALSE;
  1584 	    alower=FALSE;
  1585 	    for (t=inword;*t;t=g_utf8_next_char(t))
  1586 	    {
  1587 		c=g_utf8_get_char(t);
  1588 		nt=g_utf8_next_char(t);
  1589 		/* lowercase for testing */
  1590 		if (g_unichar_islower(c))
  1591 		    alower=TRUE;
  1592 		if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
  1593 		{
  1594 		    /*
  1595 		     * We have an uppercase mid-word. However, there are
  1596 		     * common cases:
  1597 		     *   Mac and Mc like McGill
  1598 		     *   French contractions like l'Abbe
  1599 		     */
  1600 		    offset=g_utf8_pointer_to_offset(inword,t);
  1601 		    if (offset>0)
  1602 			pc=g_utf8_get_char(g_utf8_prev_char(t));
  1603 		    else
  1604 			pc='\0';
  1605 		    if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
  1606 		      offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
  1607 		      g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
  1608 		      CHAR_IS_APOSTROPHE(pc))
  1609 			; /* do nothing! */
  1610 		    else
  1611 			istypo=TRUE;
  1612 		}
  1613 	    }
  1614 	    testword=g_utf8_casefold(inword,-1);
  1615 	}
  1616 	if (pswit[TYPO_SWITCH])
  1617 	{
  1618 	    /*
  1619 	     * Check for certain unlikely two-letter combinations at word
  1620 	     * start and end.
  1621 	     */
  1622 	    len=g_utf8_strlen(testword,-1);
  1623 	    if (len>1)
  1624 	    {
  1625 		for (i=0;*nostart[i];i++)
  1626 		    if (g_str_has_prefix(testword,nostart[i]))
  1627 			istypo=TRUE;
  1628 		for (i=0;*noend[i];i++)
  1629 		    if (g_str_has_suffix(testword,noend[i]))
  1630 			istypo=TRUE;
  1631 	    }
  1632 	    /* ght is common, gbt never. Like that. */
  1633 	    if (strstr(testword,"cb"))
  1634 		istypo=TRUE;
  1635 	    if (strstr(testword,"gbt"))
  1636 		istypo=TRUE;
  1637 	    if (strstr(testword,"pbt"))
  1638 		istypo=TRUE;
  1639 	    if (strstr(testword,"tbs"))
  1640 		istypo=TRUE;
  1641 	    if (strstr(testword,"mrn"))
  1642 		istypo=TRUE;
  1643 	    if (strstr(testword,"ahle"))
  1644 		istypo=TRUE;
  1645 	    if (strstr(testword,"ihle"))
  1646 		istypo=TRUE;
  1647 	    /*
  1648 	     * "TBE" does happen - like HEARTBEAT - but uncommon.
  1649 	     * Also "TBI" - frostbite, outbid - but uncommon.
  1650 	     * Similarly "ii" like Hawaii, or Pompeii, and in Roman
  1651 	     * numerals, but "ii" is a common scanno.
  1652 	     */
  1653 	    if (strstr(testword,"tbi"))
  1654 		istypo=TRUE;
  1655 	    if (strstr(testword,"tbe"))
  1656 		istypo=TRUE;
  1657 	    if (strstr(testword,"ii"))
  1658 		istypo=TRUE;
  1659 	    /*
  1660 	     * Check for no vowels or no consonants.
  1661 	     * If none, flag a typo.
  1662 	     */
  1663 	    if (!istypo && len>1)
  1664 	    {
  1665 		vowel=consonant=0;
  1666 		for (t=testword;*t;t=g_utf8_next_char(t))
  1667 		{
  1668 		    c=g_utf8_get_char(t);
  1669 		    decomposition=
  1670 		      g_unicode_canonical_decomposition(c,&decomposition_len);
  1671 		    if (c=='y' || g_unichar_isdigit(c))
  1672 		    {
  1673 			/* Yah, this is loose. */
  1674 			vowel++;
  1675 			consonant++;
  1676 		    }
  1677 		    else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
  1678 			vowel++;
  1679 		    else
  1680 			consonant++;
  1681 		    g_free(decomposition);
  1682 		}
  1683 		if (!vowel || !consonant)
  1684 		    istypo=TRUE;
  1685 	    }
  1686 	    /*
  1687 	     * Now exclude the word from being reported if it's in
  1688 	     * the okword list.
  1689 	     */
  1690 	    for (i=0;*okword[i];i++)
  1691 		if (!strcmp(testword,okword[i]))
  1692 		    istypo=FALSE;
  1693 	    /*
  1694 	     * What looks like a typo may be a Roman numeral.
  1695 	     * Exclude these.
  1696 	     */
  1697 	    if (istypo && isroman(testword))
  1698 		istypo=FALSE;
  1699 	    /* Check the manual list of typos. */
  1700 	    if (!istypo)
  1701 		for (i=0;*typo[i];i++)
  1702 		    if (!strcmp(testword,typo[i]))
  1703 			istypo=TRUE;
  1704 	    /*
  1705 	     * Check lowercase s, l, i and m - special cases.
  1706 	     *   "j" - often a semi-colon gone wrong.
  1707 	     *   "d" for a missing apostrophe - he d
  1708 	     *   "n" for "in"
  1709 	     */
  1710 	    if (!istypo && len==1 &&
  1711 	      g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
  1712 		istypo=TRUE;
  1713 	    if (istypo)
  1714 	    {
  1715 		dupcnt=g_tree_lookup(qword,testword);
  1716 		if (dupcnt)
  1717 		{
  1718 		    (*dupcnt)++;
  1719 		    isdup=!pswit[VERBOSE_SWITCH];
  1720 		}
  1721 		else
  1722 		{
  1723 		    dupcnt=g_new0(int,1);
  1724 		    g_tree_insert(qword,g_strdup(testword),dupcnt);
  1725 		    isdup=FALSE;
  1726 		}
  1727 		if (!isdup)
  1728 		{
  1729 		    if (pswit[ECHO_SWITCH])
  1730 			g_print("\n%s\n",aline);
  1731 		    if (!pswit[OVERVIEW_SWITCH])
  1732 		    {
  1733 			g_print("    Line %ld column %ld - Query word %s",
  1734 			  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
  1735 			  inword);
  1736 			if (!pswit[VERBOSE_SWITCH])
  1737 			    g_print(" - not reporting duplicates");
  1738 			g_print("\n");
  1739 		    }
  1740 		    else
  1741 			cnt_word++;
  1742 		}
  1743 	    }
  1744 	}
  1745 	/* check the user's list of typos */
  1746 	if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
  1747 	{
  1748 	    if (pswit[ECHO_SWITCH])
  1749 		g_print("\n%s\n",aline);
  1750 	    if (!pswit[OVERVIEW_SWITCH])  
  1751 		g_print("    Line %ld column %ld - Query possible scanno %s\n",
  1752 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
  1753 	}
  1754 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
  1755 	    g_free(testword);
  1756 	if (pswit[PARANOID_SWITCH] && warnings->digit)
  1757 	{
  1758 	    /* In paranoid mode, query all 0 and 1 standing alone. */
  1759 	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))
  1760 	    {
  1761 		if (pswit[ECHO_SWITCH])
  1762 		    g_print("\n%s\n",aline);
  1763 		if (!pswit[OVERVIEW_SWITCH])
  1764 		    g_print("    Line %ld column %ld - Query standalone %s\n",
  1765 		      linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
  1766 		      inword);
  1767 		else
  1768 		    cnt_word++;
  1769 	    }
  1770 	}
  1771 	g_free(inword);
  1772     }
  1773 }
  1774 
  1775 /*
  1776  * check_for_misspaced_punctuation:
  1777  *
  1778  * Look for added or missing spaces around punctuation and quotes.
  1779  * If there is a punctuation character like ! with no space on
  1780  * either side, suspect a missing!space. If there are spaces on
  1781  * both sides , assume a typo. If we see a double quote with no
  1782  * space or punctuation on either side of it, assume unspaced
  1783  * quotes "like"this.
  1784  */
  1785 void check_for_misspaced_punctuation(const char *aline,
  1786   struct parities *parities,gboolean isemptyline)
  1787 {
  1788     gboolean isacro,isellipsis;
  1789     const char *s;
  1790     gunichar c,nc,pc,n2c;
  1791     c=g_utf8_get_char(aline);
  1792     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  1793     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  1794     {
  1795 	pc=c;
  1796 	c=nc;
  1797 	nc=g_utf8_get_char(g_utf8_next_char(s));
  1798 	/* For each character in the line after the first. */
  1799 	if (g_utf8_strchr(".?!,;:_",-1,c))  /* if it's punctuation */
  1800 	{
  1801 	    /* we need to suppress warnings for acronyms like M.D. */
  1802 	    isacro=FALSE;
  1803 	    /* we need to suppress warnings for ellipsis . . . */
  1804 	    isellipsis=FALSE;
  1805 	    /*
  1806 	     * If there are letters on both sides of it or
  1807 	     * if it's strict punctuation followed by an alpha.
  1808 	     */
  1809 	    if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
  1810 	      g_utf8_strchr("?!,;:",-1,c)))
  1811 	    {
  1812 		if (c=='.')
  1813 		{
  1814 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&
  1815 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
  1816 			isacro=TRUE;
  1817 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
  1818 		    if (nc && n2c=='.')
  1819 			isacro=TRUE;
  1820 		}
  1821 		if (!isacro)
  1822 		{
  1823 		    if (pswit[ECHO_SWITCH])
  1824 			g_print("\n%s\n",aline);
  1825 		    if (!pswit[OVERVIEW_SWITCH])
  1826 			g_print("    Line %ld column %ld - Missing space?\n",
  1827 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1828 		    else
  1829 			cnt_punct++;
  1830 		}
  1831 	    }
  1832 	    if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
  1833 	    {
  1834 		/*
  1835 		 * If there are spaces on both sides,
  1836 		 * or space before and end of line.
  1837 		 */
  1838 		if (c=='.')
  1839 		{
  1840 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&
  1841 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
  1842 			isellipsis=TRUE;
  1843 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
  1844 		    if (nc && n2c=='.')
  1845 			isellipsis=TRUE;
  1846 		}
  1847 		if (!isemptyline && !isellipsis)
  1848 		{
  1849 		    if (pswit[ECHO_SWITCH])
  1850 			g_print("\n%s\n",aline);
  1851 		    if (!pswit[OVERVIEW_SWITCH])
  1852 			g_print("    Line %ld column %ld - "
  1853 			  "Spaced punctuation?\n",linecnt,
  1854 			  g_utf8_pointer_to_offset(aline,s)+1);
  1855 		    else
  1856 			cnt_punct++;
  1857 		}
  1858 	    }
  1859 	}
  1860     }
  1861     /* Split out the characters that CANNOT be preceded by space. */
  1862     c=g_utf8_get_char(aline);
  1863     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  1864     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  1865     {
  1866 	pc=c;
  1867 	c=nc;
  1868 	nc=g_utf8_get_char(g_utf8_next_char(s));
  1869 	/* for each character in the line after the first */
  1870 	if (g_utf8_strchr("?!,;:",-1,c))
  1871 	{
  1872 	    /* if it's punctuation that _cannot_ have a space before it */
  1873 	    if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
  1874 	    {
  1875 		/*
  1876 		 * If nc DOES == space,
  1877 		 * it was already reported just above.
  1878 		 */
  1879 		if (pswit[ECHO_SWITCH])
  1880 		    g_print("\n%s\n",aline);
  1881 		if (!pswit[OVERVIEW_SWITCH])
  1882 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",
  1883 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1884 		else
  1885 		    cnt_punct++;
  1886 	    }
  1887 	}
  1888     }
  1889     /*
  1890      * Special case " .X" where X is any alpha.
  1891      * This plugs a hole in the acronym code above.
  1892      * Inelegant, but maintainable.
  1893      */
  1894     c=g_utf8_get_char(aline);
  1895     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  1896     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  1897     {
  1898 	pc=c;
  1899 	c=nc;
  1900 	nc=g_utf8_get_char(g_utf8_next_char(s));
  1901 	/* for each character in the line after the first */
  1902 	if (c=='.')
  1903 	{
  1904 	    /* if it's a period */
  1905 	    if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
  1906 	    {
  1907 		/*
  1908 		 * If the period follows a space and
  1909 		 * is followed by a letter.
  1910 		 */
  1911 		if (pswit[ECHO_SWITCH])
  1912 		    g_print("\n%s\n",aline);
  1913 		if (!pswit[OVERVIEW_SWITCH])
  1914 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",
  1915 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1916 		else
  1917 		    cnt_punct++;
  1918 	    }
  1919 	}
  1920     }
  1921     c=g_utf8_get_char(aline);
  1922     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  1923     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  1924     {
  1925 	pc=c;
  1926 	c=nc;
  1927 	nc=g_utf8_get_char(g_utf8_next_char(s));
  1928 	/* for each character in the line after the first */
  1929 	if (c==CHAR_DQUOTE)
  1930 	{
  1931 	    if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
  1932 	      !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
  1933 	      !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
  1934 	    {
  1935 		if (pswit[ECHO_SWITCH])
  1936 		    g_print("\n%s\n",aline);
  1937 		if (!pswit[OVERVIEW_SWITCH])
  1938 		    g_print("    Line %ld column %ld - Unspaced quotes?\n",
  1939 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1940 		else
  1941 		    cnt_punct++;
  1942 	    }
  1943 	}
  1944     }
  1945     /* Check parity of quotes. */
  1946     nc=g_utf8_get_char(aline);
  1947     for (s=aline;*s;s=g_utf8_next_char(s))
  1948     {
  1949 	c=nc;
  1950 	nc=g_utf8_get_char(g_utf8_next_char(s));
  1951 	if (c==CHAR_DQUOTE)
  1952 	{
  1953 	    parities->dquote=!parities->dquote;
  1954 	    if (!parities->dquote)
  1955 	    {
  1956 		/* parity even */
  1957 		if (!g_utf8_strchr("_-.'`/,;:!?)]} ",-1,nc))
  1958 		{
  1959 		    if (pswit[ECHO_SWITCH])
  1960 			g_print("\n%s\n",aline);
  1961 		    if (!pswit[OVERVIEW_SWITCH])
  1962 			g_print("    Line %ld column %ld - "
  1963 			  "Wrongspaced quotes?\n",
  1964 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1965 		    else
  1966 			cnt_punct++;
  1967 		}
  1968 	    }
  1969 	    else
  1970 	    {
  1971 		/* parity odd */
  1972 		if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
  1973 		  !g_utf8_strchr("_-/.'`([{$",-1,nc) || !nc)
  1974 		{
  1975 		    if (pswit[ECHO_SWITCH])
  1976 			g_print("\n%s\n",aline);
  1977 		    if (!pswit[OVERVIEW_SWITCH])
  1978 			g_print("    Line %ld column %ld - "
  1979 			  "Wrongspaced quotes?\n",
  1980 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1981 		    else
  1982 			cnt_punct++;
  1983 		}
  1984 	    }
  1985 	}
  1986     }
  1987     if (g_utf8_get_char(aline)==CHAR_DQUOTE)
  1988     {
  1989 	if (g_utf8_strchr(",;:!?)]} ",-1,
  1990 	  g_utf8_get_char(g_utf8_next_char(aline))))
  1991 	{
  1992 	    if (pswit[ECHO_SWITCH])
  1993 		g_print("\n%s\n",aline);
  1994 	    if (!pswit[OVERVIEW_SWITCH])
  1995 		g_print("    Line %ld column 1 - Wrongspaced quotes?\n",
  1996 		  linecnt);
  1997 	    else
  1998 		cnt_punct++;
  1999 	}
  2000     }
  2001     if (pswit[SQUOTE_SWITCH])
  2002     {
  2003 	nc=g_utf8_get_char(aline);
  2004 	for (s=aline;*s;s=g_utf8_next_char(s))
  2005 	{
  2006 	    c=nc;
  2007 	    nc=g_utf8_get_char(g_utf8_next_char(s));
  2008 	    if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&
  2009 	      !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
  2010 	      !g_unichar_isalpha(nc)))
  2011 	    {
  2012 		parities->squote=!parities->squote;
  2013 		if (!parities->squote)
  2014 		{
  2015 		    /* parity even */
  2016 		    if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
  2017 		    {
  2018 			if (pswit[ECHO_SWITCH])
  2019 			    g_print("\n%s\n",aline);
  2020 			if (!pswit[OVERVIEW_SWITCH])
  2021 			    g_print("    Line %ld column %ld - "
  2022 			      "Wrongspaced singlequotes?\n",
  2023 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2024 			else
  2025 			    cnt_punct++;
  2026 		    }
  2027 		}
  2028 		else
  2029 		{
  2030 		    /* parity odd */
  2031 		    if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
  2032 		      !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
  2033 		    {
  2034 			if (pswit[ECHO_SWITCH])
  2035 			    g_print("\n%s\n",aline);
  2036 			if (!pswit[OVERVIEW_SWITCH])
  2037 			    g_print("    Line %ld column %ld - "
  2038 			      "Wrongspaced singlequotes?\n",
  2039 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2040 			else
  2041 			    cnt_punct++;
  2042 		    }
  2043 		}
  2044 	    }
  2045 	}
  2046     }
  2047 }
  2048 
  2049 /*
  2050  * check_for_double_punctuation:
  2051  *
  2052  * Look for double punctuation like ,. or ,,
  2053  * Thanks to DW for the suggestion!
  2054  * In books with references, ".," and ".;" are common
  2055  * e.g. "etc., etc.," and vol. 1.; vol 3.;
  2056  * OTOH, from my initial tests, there are also fairly
  2057  * common errors. What to do? Make these cases paranoid?
  2058  * ".," is the most common, so warnings->dotcomma is used
  2059  * to suppress detailed reporting if it occurs often.
  2060  */
  2061 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
  2062 {
  2063     const char *s;
  2064     gunichar c,nc;
  2065     nc=g_utf8_get_char(aline);
  2066     for (s=aline;*s;s=g_utf8_next_char(s))
  2067     {
  2068 	c=nc;
  2069 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2070 	/* for each punctuation character in the line */
  2071 	if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
  2072 	  g_utf8_strchr(".?!,;:",-1,nc))
  2073 	{
  2074 	    /* followed by punctuation, it's a query, unless . . . */
  2075 	    if (c==nc && (c=='.' || c=='?' || c=='!') ||
  2076 	      !warnings->dotcomma && c=='.' && nc==',' ||
  2077 	      warnings->isFrench && g_str_has_prefix(s,",...") ||
  2078 	      warnings->isFrench && g_str_has_prefix(s,"...,") ||
  2079 	      warnings->isFrench && g_str_has_prefix(s,";...") ||
  2080 	      warnings->isFrench && g_str_has_prefix(s,"...;") ||
  2081 	      warnings->isFrench && g_str_has_prefix(s,":...") ||
  2082 	      warnings->isFrench && g_str_has_prefix(s,"...:") ||
  2083 	      warnings->isFrench && g_str_has_prefix(s,"!...") ||
  2084 	      warnings->isFrench && g_str_has_prefix(s,"...!") ||
  2085 	      warnings->isFrench && g_str_has_prefix(s,"?...") ||
  2086 	      warnings->isFrench && g_str_has_prefix(s,"...?"))
  2087 	    {
  2088 		if (warnings->isFrench && g_str_has_prefix(s,",...") ||
  2089 		  warnings->isFrench && g_str_has_prefix(s,"...,") ||
  2090 		  warnings->isFrench && g_str_has_prefix(s,";...") ||
  2091 		  warnings->isFrench && g_str_has_prefix(s,"...;") ||
  2092 		  warnings->isFrench && g_str_has_prefix(s,":...") ||
  2093 		  warnings->isFrench && g_str_has_prefix(s,"...:") ||
  2094 		  warnings->isFrench && g_str_has_prefix(s,"!...") ||
  2095 		  warnings->isFrench && g_str_has_prefix(s,"...!") ||
  2096 		  warnings->isFrench && g_str_has_prefix(s,"?...") ||
  2097 		  warnings->isFrench && g_str_has_prefix(s,"...?"))
  2098 		{
  2099 		    s+=4;
  2100 		    nc=g_utf8_get_char(g_utf8_next_char(s));
  2101 		}
  2102 		; /* do nothing for .. !! and ?? which can be legit */
  2103 	    }
  2104 	    else
  2105 	    {
  2106 		if (pswit[ECHO_SWITCH])
  2107 		    g_print("\n%s\n",aline);
  2108 		if (!pswit[OVERVIEW_SWITCH])
  2109 		    g_print("    Line %ld column %ld - Double punctuation?\n",
  2110 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2111 		else
  2112 		    cnt_punct++;
  2113 	    }
  2114 	}
  2115     }
  2116 }
  2117 
  2118 /*
  2119  * check_for_spaced_quotes:
  2120  */
  2121 void check_for_spaced_quotes(const char *aline)
  2122 {
  2123     int i;
  2124     const char *s,*t;
  2125     const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,
  2126       CHAR_RS_QUOTE};
  2127     GString *pattern;
  2128     s=aline;
  2129     while ((t=strstr(s," \" ")))
  2130     {
  2131 	if (pswit[ECHO_SWITCH])
  2132 	    g_print("\n%s\n",aline);
  2133 	if (!pswit[OVERVIEW_SWITCH])
  2134 	    g_print("    Line %ld column %ld - Spaced doublequote?\n",
  2135 	      linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  2136 	else
  2137 	    cnt_punct++;
  2138 	s=g_utf8_next_char(g_utf8_next_char(t));
  2139     }
  2140     pattern=g_string_new(NULL);
  2141     for(i=0;i<G_N_ELEMENTS(single_quotes);i++)
  2142     {
  2143 	g_string_assign(pattern," ");
  2144 	g_string_append_unichar(pattern,single_quotes[i]);
  2145 	g_string_append_c(pattern,' ');
  2146 	s=aline;
  2147 	while ((t=strstr(s,pattern->str)))
  2148 	{
  2149 	    if (pswit[ECHO_SWITCH])
  2150 		g_print("\n%s\n",aline);
  2151 	    if (!pswit[OVERVIEW_SWITCH])
  2152 		g_print("    Line %ld column %ld - Spaced singlequote?\n",
  2153 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  2154 	    else
  2155 		cnt_punct++;
  2156 	    s=g_utf8_next_char(g_utf8_next_char(t));
  2157 	}
  2158     }
  2159     g_string_free(pattern,TRUE);
  2160 }
  2161 
  2162 /*
  2163  * check_for_miscased_genative:
  2164  *
  2165  * Check special case of 'S instead of 's at end of word.
  2166  */
  2167 void check_for_miscased_genative(const char *aline)
  2168 {
  2169     const char *s;
  2170     gunichar c,nc,pc;
  2171     if (!*aline)
  2172 	return;
  2173     c=g_utf8_get_char(aline);
  2174     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2175     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2176     {
  2177 	pc=c;
  2178 	c=nc;
  2179 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2180 	if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))
  2181 	{
  2182 	    if (pswit[ECHO_SWITCH])
  2183 		g_print("\n%s\n",aline);
  2184 	    if (!pswit[OVERVIEW_SWITCH])
  2185 		g_print("    Line %ld column %ld - Capital \"S\"?\n",
  2186 		  linecnt,g_utf8_pointer_to_offset(aline,s)+2);
  2187 	    else
  2188 		cnt_punct++;
  2189 	}
  2190     }
  2191 }
  2192 
  2193 /*
  2194  * check_end_of_line:
  2195  *
  2196  * Now check special cases - start and end of line -
  2197  * for single and double quotes. Start is sometimes [sic]
  2198  * but better to query it anyway.
  2199  * While we're here, check for dash at end of line.
  2200  */
  2201 void check_end_of_line(const char *aline,struct warnings *warnings)
  2202 {
  2203     int lbytes;
  2204     const char *s;
  2205     gunichar c1,c2;
  2206     lbytes=strlen(aline);
  2207     if (g_utf8_strlen(aline,lbytes)>1)
  2208     {
  2209 	s=g_utf8_prev_char(aline+lbytes);
  2210 	c1=g_utf8_get_char(s);
  2211 	c2=g_utf8_get_char(g_utf8_prev_char(s));
  2212 	if ((c1==CHAR_DQUOTE || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)
  2213 	{
  2214 	    if (pswit[ECHO_SWITCH])
  2215 		g_print("\n%s\n",aline);
  2216 	    if (!pswit[OVERVIEW_SWITCH])
  2217 		g_print("    Line %ld column %ld - Spaced quote?\n",linecnt,
  2218 		  g_utf8_strlen(aline,lbytes));
  2219 	    else
  2220 		cnt_punct++;
  2221 	}
  2222 	c1=g_utf8_get_char(aline);
  2223 	c2=g_utf8_get_char(g_utf8_next_char(aline));
  2224 	if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)
  2225 	{
  2226 	    if (pswit[ECHO_SWITCH])
  2227 		g_print("\n%s\n",aline);
  2228 	    if (!pswit[OVERVIEW_SWITCH])
  2229 		g_print("    Line %ld column 1 - Spaced quote?\n",linecnt);
  2230 	    else
  2231 		cnt_punct++;
  2232 	}
  2233 	/*
  2234 	 * Dash at end of line may well be legit - paranoid mode only
  2235 	 * and don't report em-dash at line-end.
  2236 	 */
  2237 	if (pswit[PARANOID_SWITCH] && warnings->hyphen)
  2238 	{
  2239 	    for (s=g_utf8_prev_char(aline+lbytes);
  2240 	      s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
  2241 		;
  2242 	    if (g_utf8_get_char(s)=='-' &&
  2243 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')
  2244 	    {
  2245 		if (pswit[ECHO_SWITCH])
  2246 		    g_print("\n%s\n",aline);
  2247 		if (!pswit[OVERVIEW_SWITCH])
  2248 		    g_print("    Line %ld column %ld - "
  2249 		      "Hyphen at end of line?\n",
  2250 		      linecnt,g_utf8_pointer_to_offset(aline,s));
  2251 	    }
  2252 	}
  2253     }
  2254 }
  2255 
  2256 /*
  2257  * check_for_unspaced_bracket:
  2258  *
  2259  * Brackets are often unspaced, but shouldn't be surrounded by alpha.
  2260  * If so, suspect a scanno like "a]most".
  2261  */
  2262 void check_for_unspaced_bracket(const char *aline)
  2263 {
  2264     const char *s;
  2265     gunichar c,nc,pc;
  2266     c=g_utf8_get_char(aline);
  2267     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2268     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2269     {
  2270 	pc=c;
  2271 	c=nc;
  2272 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2273 	if (!nc)
  2274 	    break;
  2275 	/* for each bracket character in the line except 1st & last */
  2276 	if (g_utf8_strchr("{[()]}",-1,c) &&
  2277 	  g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
  2278 	{
  2279 	    if (pswit[ECHO_SWITCH])
  2280 		g_print("\n%s\n",aline);
  2281 	    if (!pswit[OVERVIEW_SWITCH])
  2282 		g_print("    Line %ld column %ld - Unspaced bracket?\n",
  2283 		  linecnt,g_utf8_pointer_to_offset(aline,s));
  2284 	    else
  2285 		cnt_punct++;
  2286 	}
  2287     }
  2288 }
  2289 
  2290 /*
  2291  * check_for_unpunctuated_endquote:
  2292  */
  2293 void check_for_unpunctuated_endquote(const char *aline)
  2294 {
  2295     const char *s;
  2296     gunichar c,nc,pc;
  2297     c=g_utf8_get_char(aline);
  2298     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2299     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2300     {
  2301 	pc=c;
  2302 	c=nc;
  2303 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2304 	/* for each character in the line except 1st */
  2305 	if (c==CHAR_DQUOTE && isalpha(pc))
  2306 	{
  2307 	    if (pswit[ECHO_SWITCH])
  2308 		g_print("\n%s\n",aline);
  2309 	    if (!pswit[OVERVIEW_SWITCH])
  2310 		g_print("    Line %ld column %ld - "
  2311 		  "endquote missing punctuation?\n",
  2312 		  linecnt,g_utf8_pointer_to_offset(aline,s));
  2313 	    else
  2314 		cnt_punct++;
  2315 	}
  2316     }
  2317 }
  2318 
  2319 /*
  2320  * check_for_html_tag:
  2321  *
  2322  * Check for <HTML TAG>.
  2323  *
  2324  * If there is a < in the line, followed at some point
  2325  * by a > then we suspect HTML.
  2326  */
  2327 void check_for_html_tag(const char *aline)
  2328 {
  2329     const char *open,*close;
  2330     gchar *tag;
  2331     open=strchr(aline,'<');
  2332     if (open)
  2333     {
  2334 	close=strchr(g_utf8_next_char(open),'>');
  2335 	if (close)
  2336 	{
  2337 	    if (pswit[ECHO_SWITCH])
  2338 		g_print("\n%s\n",aline);
  2339 	    if (!pswit[OVERVIEW_SWITCH])
  2340 	    {
  2341 		tag=g_strndup(open,close-open+1);
  2342 		g_print("    Line %ld column %ld - HTML Tag? %s \n",
  2343 		  linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
  2344 		g_free(tag);
  2345 	    }
  2346 	    else
  2347 		cnt_html++;
  2348 	}
  2349     }
  2350 }
  2351 
  2352 /*
  2353  * check_for_html_entity:
  2354  *
  2355  * Check for &symbol; HTML.
  2356  *
  2357  * If there is a & in the line, followed at
  2358  * some point by a ; then we suspect HTML.
  2359  */
  2360 void check_for_html_entity(const char *aline)
  2361 {
  2362     const char *s,*amp,*scolon;
  2363     gchar *entity;
  2364     amp=strchr(aline,'&');
  2365     if (amp)
  2366     {
  2367 	scolon=strchr(amp,';');
  2368 	if (scolon)
  2369 	{
  2370 	    for (s=amp;s<scolon;s=g_utf8_next_char(s))   
  2371 		if (g_utf8_get_char(s)==CHAR_SPACE)
  2372 		    break;		/* Don't report "Jones & Son;" */
  2373 	    if (s>=scolon)
  2374 	    {
  2375 		if (pswit[ECHO_SWITCH])
  2376 		    g_print("\n%s\n",aline);
  2377 		if (!pswit[OVERVIEW_SWITCH])
  2378 		{
  2379 		    entity=g_strndup(amp,scolon-amp+1);
  2380 		    g_print("    Line %ld column %d - HTML symbol? %s \n",
  2381 		      linecnt,(int)(amp-aline)+1,entity);
  2382 		    g_free(entity);
  2383 		}
  2384 		else
  2385 		    cnt_html++;
  2386 	    }
  2387 	}
  2388     }
  2389 }
  2390 
  2391 /*
  2392  * check_for_omitted_punctuation:
  2393  *
  2394  * Check for omitted punctuation at end of paragraph by working back
  2395  * through prevline. DW.
  2396  * Need to check this only for "normal" paras.
  2397  * So what is a "normal" para?
  2398  *    Not normal if one-liner (chapter headings, etc.)
  2399  *    Not normal if doesn't contain at least one locase letter
  2400  *    Not normal if starts with space
  2401  */
  2402 void check_for_omitted_punctuation(const char *prevline,
  2403   struct line_properties *last,int start_para_line)
  2404 {
  2405     gboolean letter_on_line=FALSE;
  2406     const char *s;
  2407     gunichar c;
  2408     gboolean closing_quote;
  2409     for (s=prevline;*s;s=g_utf8_next_char(s))
  2410 	if (g_unichar_isalpha(g_utf8_get_char(s)))
  2411 	{
  2412 	    letter_on_line=TRUE;
  2413 	    break;
  2414 	}
  2415     /*
  2416      * This next "if" is a problem.
  2417      * If we say "start_para_line <= linecnt - 1", that includes
  2418      * one-line "paragraphs" like chapter heads. Lotsa false positives.
  2419      * If we say "start_para_line < linecnt - 1" it doesn't, but then it
  2420      * misses genuine one-line paragraphs.
  2421      */
  2422     if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
  2423       g_utf8_get_char(prevline)>CHAR_SPACE)
  2424     {
  2425 	s=prevline+strlen(prevline);
  2426 	do
  2427 	{
  2428 	    s=g_utf8_prev_char(s);
  2429 	    c=g_utf8_get_char(s);
  2430 	    if (QUOTE_CLASS(c)==CLOSING_QUOTE || QUOTE_CLASS(c)==NEUTRAL_QUOTE)
  2431 		closing_quote=TRUE;
  2432 	    else
  2433 		closing_quote=FALSE;
  2434 	} while (closing_quote && s>prevline);
  2435 	for (;s>prevline;s=g_utf8_prev_char(s))
  2436 	{
  2437 	    if (g_unichar_isalpha(g_utf8_get_char(s)))
  2438 	    {
  2439 		if (pswit[ECHO_SWITCH])
  2440 		    g_print("\n%s\n",prevline);
  2441 		if (!pswit[OVERVIEW_SWITCH])
  2442 		    g_print("    Line %ld column %ld - "
  2443 		      "No punctuation at para end?\n",
  2444 		      linecnt-1,g_utf8_strlen(prevline,-1));
  2445 		else
  2446 		    cnt_punct++;
  2447 		break;
  2448 	    }
  2449 	    if (g_utf8_strchr("-.:!([{?}])",-1,g_utf8_get_char(s)))
  2450 		break;
  2451 	}
  2452     }
  2453 }
  2454 
  2455 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
  2456 {
  2457     const char *word=key;
  2458     int *dupcnt=value;
  2459     if (*dupcnt)
  2460 	g_print("\nNote: Queried word %s was duplicated %d times\n",
  2461 	  word,*dupcnt);
  2462     return FALSE;
  2463 }
  2464 
  2465 void print_as_windows_1252(const char *string)
  2466 {
  2467     gsize inbytes,outbytes;
  2468     gchar *buf,*bp;
  2469     static GIConv converter=(GIConv)-1;
  2470     if (!string)
  2471     {
  2472 	if (converter!=(GIConv)-1)
  2473 	    g_iconv_close(converter);
  2474 	converter=(GIConv)-1;
  2475 	return;
  2476     }
  2477     if (converter==(GIConv)-1)
  2478 	converter=g_iconv_open("WINDOWS-1252","UTF-8");
  2479     if (converter!=(GIConv)-1)
  2480     {
  2481 	inbytes=outbytes=strlen(string);
  2482 	bp=buf=g_malloc(outbytes+1);
  2483 	g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
  2484 	*bp='\0';
  2485 	fputs(buf,stdout);
  2486 	g_free(buf);
  2487     }
  2488     else
  2489 	fputs(string,stdout);
  2490 }
  2491 
  2492 void print_as_utf_8(const char *string)
  2493 {
  2494     fputs(string,stdout);
  2495 }
  2496 
  2497 /*
  2498  * procfile:
  2499  *
  2500  * Process one file.
  2501  */
  2502 void procfile(const char *filename)
  2503 {
  2504     const char *s;
  2505     gchar *parastart=NULL;	/* first line of current para */
  2506     gchar *etext,*aline;
  2507     gchar *etext_ptr;
  2508     GError *err=NULL;
  2509     struct first_pass_results *first_pass_results;
  2510     struct warnings *warnings;
  2511     struct counters counters={0};
  2512     struct line_properties last={0};
  2513     struct parities parities={0};
  2514     struct pending pending={0};
  2515     gboolean isemptyline;
  2516     long start_para_line=0;
  2517     gboolean isnewpara=FALSE,enddash=FALSE;
  2518     last.start=CHAR_SPACE;
  2519     linecnt=checked_linecnt=0;
  2520     etext=read_etext(filename,&err);
  2521     if (!etext)
  2522     {
  2523 	if (pswit[STDOUT_SWITCH])
  2524 	    fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
  2525 	else
  2526 	    fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
  2527 	exit(1);
  2528     }
  2529     g_print("\n\nFile: %s\n\n",filename);
  2530     first_pass_results=first_pass(etext);
  2531     warnings=report_first_pass(first_pass_results);
  2532     qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
  2533     qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
  2534     /*
  2535      * Here we go with the main pass. Hold onto yer hat!
  2536      */
  2537     linecnt=0;
  2538     etext_ptr=etext;
  2539     while ((aline=flgets(&etext_ptr,linecnt+1)))
  2540     {
  2541 	linecnt++;
  2542 	if (linecnt==1)
  2543 	    isnewpara=TRUE;
  2544 	if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
  2545 	    continue;    // skip DP page separators completely
  2546 	if (linecnt<first_pass_results->firstline ||
  2547 	  (first_pass_results->footerline>0 &&
  2548 	  linecnt>first_pass_results->footerline))
  2549 	{
  2550 	    if (pswit[HEADER_SWITCH])
  2551 	    {
  2552 		if (g_str_has_prefix(aline,"Title:"))
  2553 		    g_print("    %s\n",aline);
  2554 		if (g_str_has_prefix(aline,"Author:"))
  2555 		    g_print("    %s\n",aline);
  2556 		if (g_str_has_prefix(aline,"Release Date:"))
  2557 		    g_print("    %s\n",aline);
  2558 		if (g_str_has_prefix(aline,"Edition:"))
  2559 		    g_print("    %s\n\n",aline);
  2560 	    }
  2561 	    continue;		/* skip through the header */
  2562 	}
  2563 	checked_linecnt++;
  2564 	print_pending(aline,parastart,&pending);
  2565 	isemptyline=analyse_quotes(aline,linecnt,&counters);
  2566 	if (isnewpara && !isemptyline)
  2567 	{
  2568 	    /* This line is the start of a new paragraph. */
  2569 	    start_para_line=linecnt;
  2570 	    /* Capture its first line in case we want to report it later. */
  2571 	    g_free(parastart);
  2572 	    parastart=g_strdup(aline);
  2573 	    memset(&parities,0,sizeof(parities));  /* restart the quote count */
  2574 	    s=aline;
  2575 	    while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
  2576 	      !g_unichar_isdigit(g_utf8_get_char(s)))
  2577 		s=g_utf8_next_char(s);
  2578 	    if (g_unichar_islower(g_utf8_get_char(s)))
  2579 	    {
  2580 		/* and its first letter is lowercase */
  2581 		if (pswit[ECHO_SWITCH])
  2582 		    g_print("\n%s\n",aline);
  2583 		if (!pswit[OVERVIEW_SWITCH])
  2584 		    g_print("    Line %ld column %ld - "
  2585 		      "Paragraph starts with lower-case\n",
  2586 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2587 		else
  2588 		    cnt_punct++;
  2589 	    }
  2590 	    isnewpara=FALSE; /* Signal the end of new para processing. */
  2591 	}
  2592 	/* Check for an em-dash broken at line end. */
  2593 	if (enddash && g_utf8_get_char(aline)=='-')
  2594 	{
  2595 	    if (pswit[ECHO_SWITCH])
  2596 		g_print("\n%s\n",aline);
  2597 	    if (!pswit[OVERVIEW_SWITCH])
  2598 		g_print("    Line %ld column 1 - Broken em-dash?\n",linecnt);
  2599 	    else
  2600 		cnt_punct++;
  2601 	}
  2602 	enddash=FALSE;
  2603 	for (s=g_utf8_prev_char(aline+strlen(aline));
  2604 	  g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
  2605 	    ;
  2606 	if (s>=aline && g_utf8_get_char(s)=='-')
  2607 	    enddash=TRUE;
  2608 	check_for_control_characters(aline);
  2609 	if (warnings->bin)
  2610 	    check_for_odd_characters(aline,warnings,isemptyline);
  2611 	if (warnings->longline)
  2612 	    check_for_long_line(aline);
  2613 	if (warnings->shortline)
  2614 	    check_for_short_line(aline,&last);
  2615 	last.blen=last.len;
  2616 	last.len=g_utf8_strlen(aline,-1);
  2617 	last.start=g_utf8_get_char(aline);
  2618 	check_for_starting_punctuation(aline);
  2619 	if (warnings->dash)
  2620 	{
  2621 	    check_for_spaced_emdash(aline);
  2622 	    check_for_spaced_dash(aline);
  2623 	}
  2624 	check_for_unmarked_paragraphs(aline);
  2625 	check_for_jeebies(aline);
  2626 	check_for_mta_from(aline);
  2627 	check_for_orphan_character(aline);
  2628 	check_for_pling_scanno(aline);
  2629 	check_for_extra_period(aline,warnings);
  2630 	check_for_following_punctuation(aline);
  2631 	check_for_typos(aline,warnings);
  2632 	check_for_misspaced_punctuation(aline,&parities,isemptyline);
  2633 	check_for_double_punctuation(aline,warnings);
  2634 	check_for_spaced_quotes(aline);
  2635 	check_for_miscased_genative(aline);
  2636 	check_end_of_line(aline,warnings);
  2637 	check_for_unspaced_bracket(aline);
  2638 	if (warnings->endquote)
  2639 	    check_for_unpunctuated_endquote(aline);
  2640 	check_for_html_tag(aline);
  2641 	check_for_html_entity(aline);
  2642 	if (isemptyline)
  2643 	{
  2644 	    check_for_mismatched_quotes(&counters,&pending);
  2645 	    counters_reset(&counters);
  2646 	    /* let the next iteration know that it's starting a new para */
  2647 	    isnewpara=TRUE;
  2648 	    if (prevline)
  2649 		check_for_omitted_punctuation(prevline,&last,start_para_line);
  2650 	}
  2651 	g_free(prevline);
  2652 	prevline=g_strdup(aline);
  2653     }
  2654     linecnt++;
  2655     check_for_mismatched_quotes(&counters,&pending);
  2656     print_pending(NULL,parastart,&pending);
  2657     reset_pending(&pending);
  2658     if (prevline)
  2659     {
  2660 	g_free(prevline);
  2661 	prevline=NULL;
  2662     }
  2663     g_free(parastart);
  2664     g_free(prevline);
  2665     g_free(etext);
  2666     if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])
  2667 	g_tree_foreach(qword,report_duplicate_queries,NULL);
  2668     g_tree_unref(qword);
  2669     g_tree_unref(qperiod);
  2670     counters_destroy(&counters);
  2671     g_set_print_handler(NULL);
  2672     print_as_windows_1252(NULL);
  2673     if (pswit[MARKUP_SWITCH])  
  2674 	loseentities(NULL);
  2675 }
  2676 
  2677 /*
  2678  * flgets:
  2679  *
  2680  * Get one line from the input text, checking for
  2681  * the existence of exactly one CR/LF line-end per line.
  2682  *
  2683  * Returns: a pointer to the line.
  2684  */
  2685 char *flgets(char **etext,long lcnt)
  2686 {
  2687     gunichar c;
  2688     gboolean isCR=FALSE;
  2689     char *theline=*etext;
  2690     char *eos=theline;
  2691     gchar *s;
  2692     for (;;)
  2693     {
  2694 	c=g_utf8_get_char(*etext);
  2695 	*etext=g_utf8_next_char(*etext);
  2696 	if (!c)
  2697 	    return NULL;
  2698 	/* either way, it's end of line */
  2699 	if (c=='\n')
  2700 	{
  2701 	    if (isCR)
  2702 		break;
  2703 	    else
  2704 	    {
  2705 		/* Error - a LF without a preceding CR */
  2706 		if (pswit[LINE_END_SWITCH])
  2707 		{
  2708 		    if (pswit[ECHO_SWITCH])
  2709 		    {
  2710 			s=g_strndup(theline,eos-theline);
  2711 			g_print("\n%s\n",s);
  2712 			g_free(s);
  2713 		    }
  2714 		    if (!pswit[OVERVIEW_SWITCH])
  2715 			g_print("    Line %ld - No CR?\n",lcnt);
  2716 		    else
  2717 			cnt_lineend++;
  2718 		}
  2719 		break;
  2720 	    }
  2721 	}
  2722 	if (c=='\r')
  2723 	{
  2724 	    if (isCR)
  2725 	    {
  2726 		/* Error - two successive CRs */
  2727 		if (pswit[LINE_END_SWITCH])
  2728 		{
  2729 		    if (pswit[ECHO_SWITCH])
  2730 		    {
  2731 			s=g_strndup(theline,eos-theline);
  2732 			g_print("\n%s\n",s);
  2733 			g_free(s);
  2734 		    }
  2735 		    if (!pswit[OVERVIEW_SWITCH])
  2736 			g_print("    Line %ld - Two successive CRs?\n",lcnt);
  2737 		    else
  2738 			cnt_lineend++;
  2739 		}
  2740 	    }
  2741 	    isCR=TRUE;
  2742 	}
  2743 	else
  2744 	{
  2745 	    if (pswit[LINE_END_SWITCH] && isCR)
  2746 	    {
  2747 		if (pswit[ECHO_SWITCH])
  2748 		{
  2749 		    s=g_strndup(theline,eos-theline);
  2750 		    g_print("\n%s\n",s);
  2751 		    g_free(s);
  2752 		}
  2753 		if (!pswit[OVERVIEW_SWITCH])
  2754 		    g_print("    Line %ld column %ld - CR without LF?\n",
  2755 		      lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
  2756 		else
  2757 		    cnt_lineend++;
  2758 		*eos=' ';
  2759 	    }
  2760 	    isCR=FALSE;
  2761 	    eos=g_utf8_next_char(eos);
  2762 	}
  2763     }
  2764     *eos='\0';
  2765     if (pswit[MARKUP_SWITCH])  
  2766 	postprocess_for_HTML(theline);
  2767     if (pswit[DP_SWITCH])  
  2768 	postprocess_for_DP(theline);
  2769     return theline;
  2770 }
  2771 
  2772 /*
  2773  * mixdigit:
  2774  *
  2775  * Takes a "word" as a parameter, and checks whether it
  2776  * contains a mixture of alpha and digits. Generally, this is an
  2777  * error, but may not be for cases like 4th or L5 12s. 3d.
  2778  *
  2779  * Returns: TRUE iff an is error found.
  2780  */
  2781 gboolean mixdigit(const char *checkword)
  2782 {
  2783     gboolean wehaveadigit,wehavealetter,query;
  2784     const char *s,*nondigit;
  2785     wehaveadigit=wehavealetter=query=FALSE;
  2786     for (s=checkword;*s;s=g_utf8_next_char(s))
  2787 	if (g_unichar_isalpha(g_utf8_get_char(s)))
  2788 	    wehavealetter=TRUE;
  2789 	else if (g_unichar_isdigit(g_utf8_get_char(s)))
  2790 	    wehaveadigit=TRUE;
  2791     if (wehaveadigit && wehavealetter)
  2792     {
  2793 	/* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
  2794 	query=TRUE;
  2795 	for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
  2796 	  nondigit=g_utf8_next_char(nondigit))
  2797 	    ;
  2798 	/* digits, ending in st, rd, nd, th of either case */
  2799 	if (!g_ascii_strcasecmp(nondigit,"st") ||
  2800 	  !g_ascii_strcasecmp(nondigit,"rd") ||
  2801 	  !g_ascii_strcasecmp(nondigit,"nd") ||
  2802 	  !g_ascii_strcasecmp(nondigit,"th"))
  2803 	    query=FALSE;
  2804 	if (!g_ascii_strcasecmp(nondigit,"sts") ||
  2805 	  !g_ascii_strcasecmp(nondigit,"rds") ||
  2806 	  !g_ascii_strcasecmp(nondigit,"nds") ||
  2807 	  !g_ascii_strcasecmp(nondigit,"ths"))
  2808 	    query=FALSE;
  2809 	if (!g_ascii_strcasecmp(nondigit,"stly") ||
  2810 	  !g_ascii_strcasecmp(nondigit,"rdly") ||
  2811 	  !g_ascii_strcasecmp(nondigit,"ndly") ||
  2812 	  !g_ascii_strcasecmp(nondigit,"thly"))
  2813 	    query=FALSE;
  2814 	/* digits, ending in l, L, s or d */
  2815 	if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
  2816 	  !strcmp(nondigit,"d"))
  2817 	    query=FALSE;
  2818 	/*
  2819 	 * L at the start of a number, representing Britsh pounds, like L500.
  2820 	 * This is cute. We know the current word is mixed digit. If the first
  2821 	 * letter is L, there must be at least one digit following. If both
  2822 	 * digits and letters follow, we have a genuine error, else we have a
  2823 	 * capital L followed by digits, and we accept that as a non-error.
  2824 	 */
  2825 	if (g_utf8_get_char(checkword)=='L' &&
  2826 	  !mixdigit(g_utf8_next_char(checkword)))
  2827 	    query=FALSE;
  2828     }
  2829     return query;
  2830 }
  2831 
  2832 /*
  2833  * getaword:
  2834  *
  2835  * Extracts the first/next "word" from the line, and returns it.
  2836  * A word is defined as one English word unit--or at least that's the aim.
  2837  * "ptr" is advanced to the position in the line where we will start
  2838  * looking for the next word.
  2839  *
  2840  * Returns: A newly-allocated string.
  2841  */
  2842 gchar *getaword(const char **ptr)
  2843 {
  2844     const char *s,*t;
  2845     GString *word;
  2846     gunichar c,pc;
  2847     word=g_string_new(NULL);
  2848     for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
  2849       !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
  2850       **ptr;*ptr=g_utf8_next_char(*ptr))
  2851 	;
  2852     /*
  2853      * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
  2854      * Especially yucky is the case of L1,000
  2855      * This section looks for a pattern of characters including a digit
  2856      * followed by a comma or period followed by one or more digits.
  2857      * If found, it returns this whole pattern as a word; otherwise we discard
  2858      * the results and resume our normal programming.
  2859      */
  2860     s=*ptr;
  2861     for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
  2862       g_unichar_isalpha(g_utf8_get_char(s)) ||
  2863       g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
  2864 	g_string_append_unichar(word,g_utf8_get_char(s));
  2865     if (word->len)
  2866     {
  2867 	for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))
  2868 	{
  2869 	    c=g_utf8_get_char(t);
  2870 	    pc=g_utf8_get_char(g_utf8_prev_char(t));
  2871 	    if ((c=='.' || c==',') && g_unichar_isdigit(pc))
  2872 	    {
  2873 		*ptr=s;
  2874 		return g_string_free(word,FALSE);
  2875 	    }
  2876 	}
  2877     }
  2878     /* we didn't find a punctuated number - do the regular getword thing */
  2879     g_string_truncate(word,0);
  2880     c=g_utf8_get_char(*ptr);
  2881     for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);
  2882       *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))
  2883 	g_string_append_unichar(word,c);
  2884     return g_string_free(word,FALSE);
  2885 }
  2886 
  2887 /*
  2888  * isroman:
  2889  *
  2890  * Is this word a Roman Numeral?
  2891  *
  2892  * It doesn't actually validate that the number is a valid Roman Numeral--for
  2893  * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
  2894  * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
  2895  * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
  2896  * expressions thereof, except when it came to taxes. Allow any number of M,
  2897  * an optional D, an optional CM or CD, any number of optional Cs, an optional
  2898  * XL or an optional XC, an optional IX or IV, an optional V and any number
  2899  * of optional Is.
  2900  */
  2901 gboolean isroman(const char *t)
  2902 {
  2903     const char *s;
  2904     if (!t || !*t)
  2905 	return FALSE;
  2906     s=t;
  2907     while (g_utf8_get_char(t)=='m' && *t)
  2908 	t++;
  2909     if (g_utf8_get_char(t)=='d')
  2910 	t++;
  2911     if (g_str_has_prefix(t,"cm"))
  2912 	t+=2;
  2913     if (g_str_has_prefix(t,"cd"))
  2914 	t+=2;
  2915     while (g_utf8_get_char(t)=='c' && *t)
  2916 	t++;
  2917     if (g_str_has_prefix(t,"xl"))
  2918 	t+=2;
  2919     if (g_str_has_prefix(t,"xc"))
  2920 	t+=2;
  2921     if (g_utf8_get_char(t)=='l')
  2922 	t++;
  2923     while (g_utf8_get_char(t)=='x' && *t)
  2924 	t++;
  2925     if (g_str_has_prefix(t,"ix"))
  2926 	t+=2;
  2927     if (g_str_has_prefix(t,"iv"))
  2928 	t+=2;
  2929     if (g_utf8_get_char(t)=='v')
  2930 	t++;
  2931     while (g_utf8_get_char(t)=='i' && *t)
  2932 	t++;
  2933     return !*t;
  2934 }
  2935 
  2936 /*
  2937  * postprocess_for_DP:
  2938  *
  2939  * Invoked with the -d switch from flgets().
  2940  * It simply "removes" from the line a hard-coded set of common
  2941  * DP-specific tags, so that the line passed to the main routine has
  2942  * been pre-cleaned of DP markup.
  2943  */
  2944 void postprocess_for_DP(char *theline)
  2945 {
  2946     char *s,*t;
  2947     int i;
  2948     if (!*theline) 
  2949 	return;
  2950     for (i=0;*DPmarkup[i];i++)
  2951 	while ((s=strstr(theline,DPmarkup[i])))
  2952 	{
  2953 	    t=s+strlen(DPmarkup[i]);
  2954 	    memmove(s,t,strlen(t)+1);
  2955 	}
  2956 }
  2957 
  /*
   * postprocess_for_HTML:
   *
   * Invoked with the -m switch from flgets().
   * Calls losemarkup() on the line until it reports that it has nothing
   * more to remove and then calls loseentities() on what is left, so that
   * the line passed to the main routine has been pre-cleaned of HTML.
   */
  void postprocess_for_HTML(char *theline)
  {
      char *removed;
      do
          removed=losemarkup(theline);
      while (removed);
      loseentities(theline);
  }
  2973 
  2974 char *losemarkup(char *theline)
  2975 {
  2976     char *s,*t;
  2977     int i;
  2978     s=strchr(theline,'<');
  2979     t=s?strchr(s,'>'):NULL;
  2980     if (!s || !t)
  2981 	return NULL;
  2982     for (i=0;*markup[i];i++)
  2983 	if (tagcomp(g_utf8_next_char(s),markup[i]))
  2984 	{
  2985 	    t=g_utf8_next_char(t);
  2986 	    memmove(s,t,strlen(t)+1);
  2987 	    return s;
  2988 	}
  2989     /* It's an unrecognized <xxx>. */
  2990     return NULL;
  2991 }
  2992 
  2993 void loseentities(char *theline)
  2994 {
  2995     int i;
  2996     gsize nb;
  2997     char *amp,*scolon;
  2998     gchar *s,*t;
  2999     gunichar c;
  3000     GTree *entities=NULL;
  3001     static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
  3002     if (!theline)
  3003     {
  3004 	if (entities)
  3005 	    g_tree_destroy(entities);
  3006 	entities=NULL;
  3007 	if (translit!=(GIConv)-1)
  3008 	    g_iconv_close(translit);
  3009 	translit=(GIConv)-1;
  3010 	if (to_utf8!=(GIConv)-1)
  3011 	    g_iconv_close(to_utf8);
  3012 	to_utf8=(GIConv)-1;
  3013 	return;
  3014     }
  3015     if (!*theline)
  3016 	return;
  3017     if (!entities)
  3018     {
  3019 	entities=g_tree_new((GCompareFunc)strcmp);
  3020 	for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
  3021 	    g_tree_insert(entities,HTMLentities[i].name,
  3022 	      GUINT_TO_POINTER(HTMLentities[i].c));
  3023     }
  3024     if (translit==(GIConv)-1)
  3025 	translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
  3026     if (to_utf8==(GIConv)-1)
  3027 	to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
  3028     while((amp=strchr(theline,'&')))
  3029     {
  3030 	scolon=strchr(amp,';');
  3031 	if (scolon)
  3032 	{
  3033 	    if (amp[1]=='#')
  3034 	    {
  3035 		if (amp+2+strspn(amp+2,"0123456789")==scolon)
  3036 		    c=strtol(amp+2,NULL,10);
  3037 		else if (amp[2]=='x' &&
  3038 		  amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
  3039 		    c=strtol(amp+3,NULL,16);
  3040 	    }
  3041 	    else
  3042 	    {
  3043 		s=g_strndup(amp+1,scolon-(amp+1));
  3044 	        c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
  3045 		g_free(s);
  3046 	    }
  3047 	}
  3048 	else
  3049 	    c=0;
  3050 	if (c)
  3051 	{
  3052 	    theline=amp;
  3053 	    if (c<128 || c>=192 && c<=255)	/* An ISO-8859-1 character */
  3054 		theline+=g_unichar_to_utf8(c,theline);
  3055 	    else
  3056 	    {
  3057 		s=g_malloc(6);
  3058 		nb=g_unichar_to_utf8(c,s);
  3059 		t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
  3060 		g_free(s);
  3061 		s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
  3062 		g_free(t);
  3063 		memcpy(theline,s,nb);
  3064 		g_free(s);
  3065 		theline+=nb;
  3066 	    }
  3067 	    memmove(theline,g_utf8_next_char(scolon),
  3068 	      strlen(g_utf8_next_char(scolon))+1);
  3069 	}
  3070 	else
  3071 	    theline=g_utf8_next_char(amp);
  3072     }
  3073 }
  3074 
  3075 gboolean tagcomp(const char *strin,const char *basetag)
  3076 {
  3077     gboolean retval;
  3078     gchar *s,*t;
  3079     if (g_utf8_get_char(strin)=='/')
  3080 	t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
  3081     else
  3082 	t=g_utf8_casefold(strin,-1);
  3083     s=g_utf8_casefold(basetag,-1);
  3084     retval=g_str_has_prefix(t,s);
  3085     g_free(s);
  3086     g_free(t);
  3087     return retval;
  3088 }
  3089 
  3090 void proghelp(GOptionContext *context)
  3091 {
  3092     gchar *help;
  3093     fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
  3094     fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
  3095     fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
  3096     fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
  3097       "For details, read the file COPYING.\n",stderr);
  3098     fputs("This is Free Software; "
  3099       "you may redistribute it under certain conditions (GPL);\n",stderr);
  3100     fputs("read the file COPYING for details.\n\n",stderr);
  3101     help=g_option_context_get_help(context,TRUE,NULL);
  3102     fputs(help,stderr);
  3103     g_free(help);
  3104     fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
  3105     fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
  3106       "non-ASCII\n",stderr);
  3107     fputs("characters like accented letters, "
  3108       "lines longer than 75 or shorter than 55,\n",stderr);
  3109     fputs("unbalanced quotes or brackets, "
  3110       "a variety of badly formatted punctuation, \n",stderr);
  3111     fputs("HTML tags, some likely typos. "
  3112       "It is NOT a substitute for human judgement.\n",stderr);
  3113     fputs("\n",stderr);
  3114 }