bookloupe/bookloupe.c
author ali <ali@juiblex.co.uk>
Wed Oct 02 23:51:18 2013 +0100 (2013-10-02)
changeset 142 466f43a12118
parent 103 adc06e9e8470
child 144 d7a97f077f9e
child 147 bb31577536d1
child 150 fd584db1d305
child 151 a485f5dcc2de
child 158 408b56cff0bd
permissions -rw-r--r--
Fix bug #11: Test for balanced "slanted" UTF-8 quotation marks 8220/8221
     1 /*************************************************************************/
     2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
     3 /*									 */
     4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>			 */
     5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>			 */
     6 /*									 */
     7 /* This program is free software; you can redistribute it and/or modify  */
     8 /* it under the terms of the GNU General Public License as published by  */
     9 /* the Free Software Foundation; either version 2 of the License, or     */
    10 /* (at your option) any later version.					 */
    11 /*									 */
    12 /* This program is distributed in the hope that it will be useful,       */
    13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of	 */
    14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the		 */
    15 /* GNU General Public License for more details.				 */
    16 /*									 */
    17 /* You should have received a copy of the GNU General Public License	 */
    18 /* along with this program. If not, see <http://www.gnu.org/licenses/>.	 */
    19 /*************************************************************************/
    20 
    21 #include <stdio.h>
    22 #include <stdlib.h>
    23 #include <string.h>
    24 #include <ctype.h>
    25 #ifdef __WIN32__
    26 #include <windows.h>
    27 #endif
    28 #include <glib.h>
    29 #include <bl/bl.h>
    30 #include "bookloupe.h"
    31 #include "counters.h"
    32 #include "pending.h"
    33 #include "HTMLentities.h"
    34 
/*
 * NOTE(review): nothing in this part of the file reads or writes prevline;
 * presumably it holds the previously processed line -- confirm against its
 * users elsewhere in the file.
 */
gchar *prevline;

/*
 * Common typos.
 *
 * Every entry is lower case.  The list ends with an empty string, presumably
 * the end-of-list sentinel for whatever code walks it (not visible here).
 */
char *typo[] = {
    "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
    "nad", "te", "ig", "acn",  "ahve", "alot", "anbd", "andt", "awya", "aywa",
    "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
    "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
    "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
    "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
    "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
    "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
    "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
    "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
    "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
    "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
    "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
    "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
    "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
    "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
    "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
    "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
    "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
    "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
    "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
    "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
    "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
    "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
    "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
    "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
    "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
    "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
    "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
    "se", ""
};

/*
 * User-defined typos.  Created and filled by read_user_scannos() with one
 * key per accepted line of the user's typo file; released in main().
 * Stays NULL when no user typo file was loaded.
 */
GTree *usertypo;
    72 
/*
 * Common abbreviations and other OK words not to query as typos.
 * Like the other word tables here, the list ends with an empty string.
 */
char *okword[] = {
    "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
    "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
    "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
    "outbid", "outbids", "frostbite", "frostbitten", ""
};

/* Common abbreviations that cause otherwise unexplained periods. */
char *abbrev[] = {
    "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
    "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
};

/*
 * Two-Letter combinations that rarely if ever start words,
 * but are common scannos or otherwise common letter combinations.
 */
char *nostart[] = {
    "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
};

/*
 * Two-Letter combinations that rarely if ever end words,
 * but are common scannos or otherwise common letter combinations.
 */
char *noend[] = {
    "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
    "sw", "gr", "sl", "cl", "iy", ""
};

/*
 * HTML element names, lower case and without angle brackets.
 * NOTE(review): presumably the tags recognised when markup is being
 * detected or ignored -- the code that consults this table is not visible
 * here; confirm.
 */
char *markup[] = {
    "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
    "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
    "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
    "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
};

/*
 * Literal markup strings, stored complete with their delimiters.
 * NOTE(review): going by the name and the --dp option ("Ignore DP-specific
 * markup") these are the DP-specific sequences -- confirm against the user
 * of this table.
 */
char *DPmarkup[] = {
    "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
};

/*
 * NOTE(review): presumably words that should not be directly followed by a
 * comma -- the check that uses this table is not visible here; confirm.
 * "mrs" is listed twice.
 */
char *nocomma[] = {
    "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
    "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
    "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
    "during", "let", "toward", "among", ""
};

/*
 * NOTE(review): presumably words that should not be directly followed by a
 * period -- the check that uses this table is not visible here; confirm.
 */
char *noperiod[] = {
    "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
    "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
    "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
    "among", "those", "into", "whom", "having", "thence", ""
};
   128 
/*
 * Program switches, indexed by the *_SWITCH constants.  The option parser
 * writes the raw command-line flags straight into this array;
 * parse_options() then inverts the "disable" style flags (relaxed,
 * line-end, noecho) so that the stored value means "feature enabled".
 */
gboolean pswit[SWITNO];  /* program switches */

/*
 * Command-line options.  Every option is a plain flag (G_OPTION_ARG_NONE)
 * whose target is the matching element of pswit[].
 */
static GOptionEntry options[]={
    { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
      "Ignore DP-specific markup", NULL },
    { "noecho", 'e', 0, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
      "Don't echo queried line", NULL },
    { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
      "Check single quotes", NULL },
    { "typo", 't', 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
      "Check common typos", NULL },
    { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
      "Require closure of quotes on every paragraph", NULL },
    { "relaxed", 'x', 0, G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
      "Disable paranoid querying of everything", NULL },
    { "line-end", 'l', 0, G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
      "Disable line end checking", NULL },
    { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
      "Overview: just show counts", NULL },
    { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
      "Output errors to stdout instead of stderr", NULL },
    { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
      "Echo header fields", NULL },
    { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
      "Ignore markup in < >", NULL },
    { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
      "Use file of user-defined typos", NULL },
    { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
      "Defaults for use on www upload", NULL },
    { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
      "Verbose - list everything", NULL },
    { NULL }	/* end-of-table marker */
};
   162 
/*
 * Query counters.  The cnt_quote .. cnt_lineend group is summed and printed
 * by main() when the overview switch is set; cnt_spacend is reported by
 * report_first_pass(), and linecnt is incremented per line by first_pass().
 */
long cnt_quote;		/* for overview mode, count of quote queries */
long cnt_brack;		/* for overview mode, count of brackets queries */
long cnt_bin;		/* for overview mode, count of non-ASCII queries */
long cnt_odd;		/* for overview mode, count of odd character queries */
long cnt_long;		/* for overview mode, count of long line errors */
long cnt_short;		/* for overview mode, count of short line queries */
long cnt_punct;		/* for overview mode,
			   count of punctuation and spacing queries */
long cnt_dash;		/* for overview mode, count of dash-related queries */
long cnt_word;		/* for overview mode, count of word queries */
long cnt_html;		/* for overview mode, count of html queries */
long cnt_lineend;	/* for overview mode, count of line-end queries */
long cnt_spacend;	/* count of lines with space at end */
long linecnt;		/* count of total lines in the file */
long checked_linecnt;	/* count of lines actually checked */
   178 
/* Forward declarations for functions defined later in this file. */
void proghelp(GOptionContext *context);
void procfile(const char *);

/*
 * Directory part of argv[0], set by main().  read_user_scannos() uses it as
 * a second place to look for the user typo file.
 */
gchar *running_from;

gboolean mixdigit(const char *);
gchar *getaword(const char **);
char *flgets(char **,long);
void postprocess_for_HTML(char *);
char *linehasmarkup(char *);
char *losemarkup(char *);
gboolean tagcomp(const char *,const char *);
void loseentities(char *);
gboolean isroman(const char *);
void postprocess_for_DP(char *);
void print_as_windows_1252(const char *string);
void print_as_utf_8(const char *string);

/*
 * NOTE(review): these trees are not touched in this part of the file;
 * presumably they track queried words / periods -- confirm against users.
 */
GTree *qword,*qperiod;

#ifdef __WIN32__
/*
 * Console output code page in force at start-up; saved by main() and
 * restored by cleanup_on_exit().
 */
UINT saved_cp;
#endif
   202 
   203 void parse_options(int *argc,char ***argv)
   204 {
   205     GError *err=NULL;
   206     GOptionContext *context;
   207     context=g_option_context_new(
   208       "file - looks for errors in Project Gutenberg(TM) etexts");
   209     g_option_context_add_main_entries(context,options,NULL);
   210     if (!g_option_context_parse(context,argc,argv,&err))
   211     {
   212 	g_printerr("Bookloupe: %s\n",err->message);
   213 	g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
   214 	exit(1);
   215     }
   216     /* Paranoid checking is turned OFF, not on, by its switch */
   217     pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
   218     if (pswit[PARANOID_SWITCH])
   219 	/* if running in paranoid mode, typo checks default to enabled */
   220 	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
   221     /* Line-end checking is turned OFF, not on, by its switch */
   222     pswit[LINE_END_SWITCH]=!pswit[LINE_END_SWITCH];
   223     /* Echoing is turned OFF, not on, by its switch */
   224     pswit[ECHO_SWITCH]=!pswit[ECHO_SWITCH];
   225     if (pswit[OVERVIEW_SWITCH])
   226 	/* just print summary; don't echo */
   227 	pswit[ECHO_SWITCH]=FALSE;
   228     /*
   229      * Web uploads - for the moment, this is really just a placeholder
   230      * until we decide what processing we really want to do on web uploads
   231      */
   232     if (pswit[WEB_SWITCH])
   233     {
   234 	/* specific override for web uploads */
   235 	pswit[ECHO_SWITCH]=TRUE;
   236 	pswit[SQUOTE_SWITCH]=FALSE;
   237 	pswit[TYPO_SWITCH]=TRUE;
   238 	pswit[QPARA_SWITCH]=FALSE;
   239 	pswit[PARANOID_SWITCH]=TRUE;
   240 	pswit[LINE_END_SWITCH]=FALSE;
   241 	pswit[OVERVIEW_SWITCH]=FALSE;
   242 	pswit[STDOUT_SWITCH]=FALSE;
   243 	pswit[HEADER_SWITCH]=TRUE;
   244 	pswit[VERBOSE_SWITCH]=FALSE;
   245 	pswit[MARKUP_SWITCH]=FALSE;
   246 	pswit[USERTYPO_SWITCH]=FALSE;
   247 	pswit[DP_SWITCH]=FALSE;
   248     }
   249     if (*argc<2)
   250     {
   251 	proghelp(context);
   252 	exit(1);
   253     }
   254     g_option_context_free(context);
   255 }
   256 
   257 /*
   258  * read_user_scannos:
   259  *
   260  * Read in the user-defined stealth scanno list.
   261  */
   262 void read_user_scannos(void)
   263 {
   264     GError *err=NULL;
   265     gchar *usertypo_file;
   266     gboolean okay;
   267     int i;
   268     gsize len,nb;
   269     gchar *contents,*utf8,**lines;
   270     usertypo_file=g_strdup("bookloupe.typ");
   271     okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   272     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   273     {
   274 	g_clear_error(&err);
   275 	g_free(usertypo_file);
   276 	usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
   277 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   278     }
   279     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   280     {
   281 	g_clear_error(&err);
   282 	g_free(usertypo_file);
   283 	usertypo_file=g_strdup("gutcheck.typ");
   284 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   285     }
   286     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   287     {
   288 	g_clear_error(&err);
   289 	g_free(usertypo_file);
   290 	usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
   291 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   292     }
   293     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   294     {
   295 	g_free(usertypo_file);
   296 	g_print("   --> I couldn't find bookloupe.typ "
   297 	  "-- proceeding without user typos.\n");
   298 	return;
   299     }
   300     else if (!okay)
   301     {
   302 	fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
   303 	g_free(usertypo_file);
   304 	g_clear_error(&err);
   305 	exit(1);
   306     }
   307     if (g_utf8_validate(contents,len,NULL))
   308 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
   309     else
   310 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
   311     g_free(contents);
   312     lines=g_strsplit_set(utf8,"\r\n",0);
   313     g_free(utf8);
   314     usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
   315     for (i=0;lines[i];i++)
   316 	if (*(unsigned char *)lines[i]>'!')
   317 	    g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
   318 	else
   319 	    g_free(lines[i]);
   320     g_free(lines);
   321 }
   322 
   323 /*
   324  * read_etext:
   325  *
   326  * Read an etext returning a newly allocated string containing the file
   327  * contents or NULL on error.
   328  */
   329 gchar *read_etext(const char *filename,GError **err)
   330 {
   331     GError *tmp_err=NULL;
   332     gchar *contents,*utf8;
   333     gsize len,bytes_read,bytes_written;
   334     int i,line,col;
   335     if (!g_file_get_contents(filename,&contents,&len,err))
   336 	return NULL;
   337     if (g_utf8_validate(contents,len,NULL))
   338     {
   339 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
   340 	g_set_print_handler(print_as_utf_8);
   341 #ifdef __WIN32__
   342 	SetConsoleOutputCP(CP_UTF8);
   343 #endif
   344     }
   345     else
   346     {
   347 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,
   348 	  &bytes_written,&tmp_err);
   349 	if (g_error_matches(tmp_err,G_CONVERT_ERROR,
   350 	  G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
   351 	{
   352 	    line=col=1;
   353 	    for(i=0;i<bytes_read;i++)
   354 		if (contents[i]=='\n')
   355 		{
   356 		    line++;
   357 		    col=1;
   358 		}
   359 		else if (contents[i]!='\r')
   360 		    col++;
   361 	    g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
   362 	      "Input conversion failed. Byte %d at line %d, column %d is not a "
   363 	      "valid Windows-1252 character",
   364 	      ((unsigned char *)contents)[bytes_read],line,col);
   365 	}
   366 	else if (tmp_err)
   367 	    g_propagate_error(err,tmp_err);
   368 	g_set_print_handler(print_as_windows_1252);
   369 #ifdef __WIN32__
   370 	SetConsoleOutputCP(1252);
   371 #endif
   372     }
   373     g_free(contents);
   374     return utf8;
   375 }
   376 
   377 void cleanup_on_exit(void)
   378 {
   379 #ifdef __WIN32__
   380     SetConsoleOutputCP(saved_cp);
   381 #endif
   382 }
   383 
   384 int main(int argc,char **argv)
   385 {
   386 #ifdef __WIN32__
   387     atexit(cleanup_on_exit);
   388     saved_cp=GetConsoleOutputCP();
   389 #endif
   390     running_from=g_path_get_dirname(argv[0]);
   391     parse_options(&argc,&argv);
   392     if (pswit[USERTYPO_SWITCH])
   393 	read_user_scannos();
   394     fprintf(stderr,"bookloupe: Check and report on an e-text\n");
   395     procfile(argv[1]);
   396     if (pswit[OVERVIEW_SWITCH])
   397     {
   398 	g_print("    Checked %ld lines of %ld (head+foot = %ld)\n\n",
   399 	  checked_linecnt,linecnt,linecnt-checked_linecnt);
   400 	g_print("    --------------- Queries found --------------\n");
   401 	if (cnt_long)
   402 	    g_print("    Long lines:		    %14ld\n",cnt_long);
   403 	if (cnt_short)
   404 	    g_print("    Short lines:		   %14ld\n",cnt_short);
   405 	if (cnt_lineend)
   406 	    g_print("    Line-end problems:	     %14ld\n",cnt_lineend);
   407 	if (cnt_word)
   408 	    g_print("    Common typos:		  %14ld\n",cnt_word);
   409 	if (cnt_quote)
   410 	    g_print("    Unmatched quotes:	      %14ld\n",cnt_quote);
   411 	if (cnt_brack)
   412 	    g_print("    Unmatched brackets:	    %14ld\n",cnt_brack);
   413 	if (cnt_bin)
   414 	    g_print("    Non-ASCII characters:	  %14ld\n",cnt_bin);
   415 	if (cnt_odd)
   416 	    g_print("    Proofing characters:	   %14ld\n",cnt_odd);
   417 	if (cnt_punct)
   418 	    g_print("    Punctuation & spacing queries: %14ld\n",cnt_punct);
   419 	if (cnt_dash)
   420 	    g_print("    Non-standard dashes:	   %14ld\n",cnt_dash);
   421 	if (cnt_html)
   422 	    g_print("    Possible HTML tags:	    %14ld\n",cnt_html);
   423 	g_print("\n");
   424 	g_print("    TOTAL QUERIES		  %14ld\n",
   425 	  cnt_quote+cnt_brack+cnt_bin+cnt_odd+cnt_long+cnt_short+cnt_punct+
   426 	  cnt_dash+cnt_word+cnt_html+cnt_lineend);
   427     }
   428     g_free(running_from);
   429     if (usertypo)
   430 	g_tree_unref(usertypo);
   431     return 0;
   432 }
   433 
   434 /*
   435  * first_pass:
   436  *
   437  * Run a first pass - verify that it's a valid PG
   438  * file, decide whether to report some things that
   439  * occur many times in the text like long or short
   440  * lines, non-standard dashes, etc.
   441  */
   442 struct first_pass_results *first_pass(const char *etext)
   443 {
   444     gunichar laststart=CHAR_SPACE;
   445     const char *s;
   446     gchar *lc_line;
   447     int i,j,lbytes,llen;
   448     gchar **lines;
   449     unsigned int lastlen=0,lastblen=0;
   450     long spline=0,nspline=0;
   451     static struct first_pass_results results={0};
   452     gchar *inword;
   453     QuoteClass qc;
   454     lines=g_strsplit(etext,"\n",0);
   455     for (j=0;lines[j];j++)
   456     {
   457 	lbytes=strlen(lines[j]);
   458 	while (lbytes>0 && lines[j][lbytes-1]=='\r')
   459 	    lines[j][--lbytes]='\0';
   460 	llen=g_utf8_strlen(lines[j],lbytes);
   461 	linecnt++;
   462 	if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
   463 	  (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
   464 	{
   465 	    if (spline)
   466 		g_print("   --> Duplicate header?\n");
   467 	    spline=linecnt+1;   /* first line of non-header text, that is */
   468 	}
   469 	if (!strncmp(lines[j],"*** START",9) &&
   470 	  strstr(lines[j],"PROJECT GUTENBERG"))
   471 	{
   472 	    if (nspline)
   473 		g_print("   --> Duplicate header?\n");
   474 	    nspline=linecnt+1;   /* first line of non-header text, that is */
   475 	}
   476 	if (spline || nspline)
   477 	{
   478 	    lc_line=g_utf8_strdown(lines[j],lbytes);
   479 	    if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
   480 	    {
   481 		if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
   482 		{
   483 		    if (results.footerline)
   484 		    {
   485 			/* it's an old-form header - we can detect duplicates */
   486 			if (!nspline)
   487 			    g_print("   --> Duplicate footer?\n");
   488 		    }
   489 		    else
   490 			results.footerline=linecnt;
   491 		}
   492 	    }
   493 	    g_free(lc_line);
   494 	}
   495 	if (spline)
   496 	    results.firstline=spline;
   497 	if (nspline)
   498 	    results.firstline=nspline;  /* override with new */
   499 	if (results.footerline)
   500 	    continue;    /* don't count the boilerplate in the footer */
   501 	results.totlen+=llen;
   502 	for (s=lines[j];*s;s=g_utf8_next_char(s))
   503 	{
   504 	    if (g_utf8_get_char(s)>127)
   505 		results.binlen++;
   506 	    if (g_unichar_isalpha(g_utf8_get_char(s)))
   507 		results.alphalen++;
   508 	    if (s>lines[j])
   509 	    {
   510 		if (CHAR_IS_DQUOTE(g_utf8_get_char(s)))
   511 		    qc=QUOTE_CLASS(g_utf8_get_char(s));
   512 		else
   513 		    qc=INVALID_QUOTE;
   514 		if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) &&
   515 		  isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
   516 		    results.endquote_count++;
   517 	    }
   518 	}
   519 	if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
   520 	  lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
   521 	    results.shortline++;
   522 	if (lbytes>0 &&
   523 	  g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
   524 	    cnt_spacend++;
   525 	if (strstr(lines[j],".,"))
   526 	    results.dotcomma++;
   527 	/* only count ast lines for ignoring purposes where there is */
   528 	/* locase text on the line */
   529 	if (strchr(lines[j],'*'))
   530 	{
   531 	    for (s=lines[j];*s;s=g_utf8_next_char(s))
   532 		if (g_unichar_islower(g_utf8_get_char(s)))
   533 		    break;
   534 	    if (*s)
   535 		results.astline++;
   536 	}
   537 	if (strchr(lines[j],'/'))
   538 	    results.fslashline++;
   539 	if (lbytes>0)
   540 	{
   541 	    for (s=g_utf8_prev_char(lines[j]+lbytes);
   542 	      s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;
   543 	      s=g_utf8_prev_char(s))
   544 		;
   545 	    if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
   546 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')
   547 		results.hyphens++;
   548 	}
   549 	if (llen>LONGEST_PG_LINE)
   550 	    results.longline++;
   551 	if (llen>WAY_TOO_LONG)
   552 	    results.verylongline++;
   553 	if (strchr(lines[j],'<') && strchr(lines[j],'>'))
   554 	{
   555 	    i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
   556 	    if (i>0)
   557 		results.htmcount++;
   558 	    if (strstr(lines[j],"<i>"))
   559 		results.htmcount+=4; /* bonus marks! */
   560 	}
   561 	/* Check for spaced em-dashes */
   562 	if (lines[j][0] && (s=strstr(g_utf8_next_char(lines[j]),"--")))
   563 	{
   564 	    results.emdash++;
   565 	    if (s[-1]==CHAR_SPACE || s[2]==CHAR_SPACE)
   566 		results.space_emdash++;
   567 	    if (s[-1]==CHAR_SPACE && s[2]==CHAR_SPACE)
   568 		/* count of em-dashes with spaces both sides */
   569 		results.non_PG_space_emdash++;
   570 	    if (s[-1]!=CHAR_SPACE && s[2]!=CHAR_SPACE)
   571 		/* count of PG-type em-dashes with no spaces */
   572 		results.PG_space_emdash++;
   573 	}
   574 	for (s=lines[j];*s;)
   575 	{
   576 	    inword=getaword(&s);
   577 	    if (!strcmp(inword,"hij") || !strcmp(inword,"niet")) 
   578 		results.Dutchcount++;
   579 	    if (!strcmp(inword,"dans") || !strcmp(inword,"avec")) 
   580 		results.Frenchcount++;
   581 	    if (!strcmp(inword,"0") || !strcmp(inword,"1")) 
   582 		results.standalone_digit++;
   583 	    g_free(inword);
   584 	}
   585 	/* Check for spaced dashes */
   586 	if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
   587 	    results.spacedash++;
   588 	lastblen=lastlen;
   589 	lastlen=llen;
   590 	laststart=lines[j][0];
   591     }
   592     g_strfreev(lines);
   593     return &results;
   594 }
   595 
   596 /*
   597  * report_first_pass:
   598  *
   599  * Make some snap decisions based on the first pass results.
   600  */
   601 struct warnings *report_first_pass(struct first_pass_results *results)
   602 {
   603     static struct warnings warnings={0};
   604     if (cnt_spacend>0)
   605 	g_print("   --> %ld lines in this file have white space at end\n",
   606 	  cnt_spacend);
   607     warnings.dotcomma=1;
   608     if (results->dotcomma>5)
   609     {
   610 	warnings.dotcomma=0;
   611 	g_print("   --> %ld lines in this file contain '.,'. "
   612 	  "Not reporting them.\n",results->dotcomma);
   613     }
   614     /*
   615      * If more than 50 lines, or one-tenth, are short,
   616      * don't bother reporting them.
   617      */
   618     warnings.shortline=1;
   619     if (results->shortline>50 || results->shortline*10>linecnt)
   620     {
   621 	warnings.shortline=0;
   622 	g_print("   --> %ld lines in this file are short. "
   623 	  "Not reporting short lines.\n",results->shortline);
   624     }
   625     /*
   626      * If more than 50 lines, or one-tenth, are long,
   627      * don't bother reporting them.
   628      */
   629     warnings.longline=1;
   630     if (results->longline>50 || results->longline*10>linecnt)
   631     {
   632 	warnings.longline=0;
   633 	g_print("   --> %ld lines in this file are long. "
   634 	  "Not reporting long lines.\n",results->longline);
   635     }
   636     /* If more than 10 lines contain asterisks, don't bother reporting them. */
   637     warnings.ast=1;
   638     if (results->astline>10)
   639     {
   640 	warnings.ast=0;
   641 	g_print("   --> %ld lines in this file contain asterisks. "
   642 	  "Not reporting them.\n",results->astline);
   643     }
   644     /*
   645      * If more than 10 lines contain forward slashes,
   646      * don't bother reporting them.
   647      */
   648     warnings.fslash=1;
   649     if (results->fslashline>10)
   650     {
   651 	warnings.fslash=0;
   652 	g_print("   --> %ld lines in this file contain forward slashes. "
   653 	  "Not reporting them.\n",results->fslashline);
   654     }
   655     /*
   656      * If more than 20 lines contain unpunctuated endquotes,
   657      * don't bother reporting them.
   658      */
   659     warnings.endquote=1;
   660     if (results->endquote_count>20)
   661     {
   662 	warnings.endquote=0;
   663 	g_print("   --> %ld lines in this file contain unpunctuated endquotes. "
   664 	  "Not reporting them.\n",results->endquote_count);
   665     }
   666     /*
   667      * If more than 15 lines contain standalone digits,
   668      * don't bother reporting them.
   669      */
   670     warnings.digit=1;
   671     if (results->standalone_digit>10)
   672     {
   673 	warnings.digit=0;
   674 	g_print("   --> %ld lines in this file contain standalone 0s and 1s. "
   675 	  "Not reporting them.\n",results->standalone_digit);
   676     }
   677     /*
   678      * If more than 20 lines contain hyphens at end,
   679      * don't bother reporting them.
   680      */
   681     warnings.hyphen=1;
   682     if (results->hyphens>20)
   683     {
   684 	warnings.hyphen=0;
   685 	g_print("   --> %ld lines in this file have hyphens at end. "
   686 	  "Not reporting them.\n",results->hyphens);
   687     }
   688     if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
   689     {
   690 	g_print("   --> Looks like this is HTML. Switching HTML mode ON.\n");
   691 	pswit[MARKUP_SWITCH]=1;
   692     }
   693     if (results->verylongline>0)
   694 	g_print("   --> %ld lines in this file are VERY long!\n",
   695 	  results->verylongline);
   696     /*
   697      * If there are more non-PG spaced dashes than PG em-dashes,
   698      * assume it's deliberate.
   699      * Current PG guidelines say don't use them, but older texts do,
   700      * and some people insist on them whatever the guidelines say.
   701      */
   702     warnings.dash=1;
   703     if (results->spacedash+results->non_PG_space_emdash>
   704       results->PG_space_emdash)
   705     {
   706 	warnings.dash=0;
   707 	g_print("   --> There are %ld spaced dashes and em-dashes. "
   708 	  "Not reporting them.\n",
   709 	  results->spacedash+results->non_PG_space_emdash);
   710     }
   711     /* If more than a quarter of characters are hi-bit, bug out. */
   712     warnings.bin=1;
   713     if (results->binlen*4>results->totlen)
   714     {
   715 	g_print("   --> This file does not appear to be ASCII. "
   716 	  "Terminating. Best of luck with it!\n");
   717 	exit(1);
   718     }
   719     if (results->alphalen*4<results->totlen)
   720     {
   721 	g_print("   --> This file does not appear to be text. "
   722 	  "Terminating. Best of luck with it!\n");
   723 	exit(1);
   724     }
   725     if (results->binlen*100>results->totlen || results->binlen>100)
   726     {
   727 	g_print("   --> There are a lot of foreign letters here. "
   728 	  "Not reporting them.\n");
   729 	warnings.bin=0;
   730     }
   731     warnings.isDutch=FALSE;
   732     if (results->Dutchcount>50)
   733     {
   734 	warnings.isDutch=TRUE;
   735 	g_print("   --> This looks like Dutch - "
   736 	  "switching off dashes and warnings for 's Middags case.\n");
   737     }
   738     warnings.isFrench=FALSE;
   739     if (results->Frenchcount>50)
   740     {
   741 	warnings.isFrench=TRUE;
   742 	g_print("   --> This looks like French - "
   743 	  "switching off some doublepunct.\n");
   744     }
   745     if (results->firstline && results->footerline)
   746 	g_print("    The PG header and footer appear to be already on.\n");
   747     else
   748     {
   749 	if (results->firstline)
   750 	    g_print("    The PG header is on - no footer.\n");
   751 	if (results->footerline)
   752 	    g_print("    The PG footer is on - no header.\n");
   753     }
   754     g_print("\n");
   755     if (pswit[VERBOSE_SWITCH])
   756     {
   757 	warnings.bin=1;
   758 	warnings.shortline=1;
   759 	warnings.dotcomma=1;
   760 	warnings.longline=1;
   761 	warnings.dash=1;
   762 	warnings.digit=1;
   763 	warnings.ast=1;
   764 	warnings.fslash=1;
   765 	warnings.hyphen=1;
   766 	warnings.endquote=1;
   767 	g_print("   *** Verbose output is ON -- you asked for it! ***\n");
   768     }
   769     if (warnings.isDutch)
   770 	warnings.dash=0;
   771     if (results->footerline>0 && results->firstline>0 &&
   772       results->footerline>results->firstline &&
   773       results->footerline-results->firstline<100)
   774     {
   775 	g_print("   --> I don't really know where this text starts. \n");
   776 	g_print("       There are no reference points.\n");
   777 	g_print("       I'm going to have to report the header and footer "
   778 	  "as well.\n");
   779 	results->firstline=0;
   780     }
   781     return &warnings;
   782 }
   783 
   784 /*
   785  * analyse_quotes:
   786  *
   787  * Look along the line, accumulate the count of quotes, and see
   788  * if this is an empty line - i.e. a line with nothing on it
   789  * but spaces.
   790  * If line has just spaces, period, * and/or - on it, don't
   791  * count it, since empty lines with asterisks or dashes to
   792  * separate sections are common.
   793  *
   794  * Returns: TRUE if the line is empty.
   795  */
   796 gboolean analyse_quotes(const char *aline,int linecnt,struct counters *counters)
   797 {
   798     int guessquote=0;
   799     /* assume the line is empty until proven otherwise */
   800     gboolean isemptyline=TRUE;
   801     const char *s=aline,*sprev,*snext;
   802     gunichar c;
   803     sprev=NULL;
   804     GError *tmp_err=NULL;
   805     while (*s)
   806     {
   807 	snext=g_utf8_next_char(s);
   808 	c=g_utf8_get_char(s);
   809 	if (CHAR_IS_DQUOTE(c))
   810 	    (void)count_quote(counters,c,QUOTE_CLASS(c),&tmp_err);
   811 	else if (CHAR_IS_SQUOTE(c) && pswit[SQUOTE_SWITCH])
   812 	{
   813 	    if (s==aline)
   814 	    {
   815 		/*
   816 		 * At start of line, it can only be a quotation mark.
   817 		 * Hardcode a very common exception!
   818 		 */
   819 		if (!g_str_has_prefix(snext,"tis") &&
   820 		  !g_str_has_prefix(snext,"Tis"))
   821 		    (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
   822 	    }
   823 	    else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
   824 	      g_unichar_isalpha(g_utf8_get_char(snext)))
   825 		/* Do nothing! it's definitely an apostrophe, not a quote */
   826 		;
   827 	    /* it's outside a word - let's check it out */
   828 	    else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||
   829 	      g_unichar_isalpha(g_utf8_get_char(snext)))
   830 	    {
   831 		/* certainly looks like a quotation mark */
   832 		if (!g_str_has_prefix(snext,"tis") &&
   833 		  !g_str_has_prefix(snext,"Tis"))
   834 		    /* hardcode a very common exception! */
   835 		{
   836 		    if (strchr(".?!,;:",g_utf8_get_char(sprev)))
   837 			(void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
   838 		    else
   839 			(void)count_quote(counters,c,OPENING_QUOTE,&tmp_err);
   840 		}
   841 	    }
   842 	    else
   843 	    {
   844 		/* now - is it a quotation mark? */
   845 		guessquote=0;   /* accumulate clues */
   846 		if (g_unichar_isalpha(g_utf8_get_char(sprev)))
   847 		{
   848 		    /* it follows a letter - could be either */
   849 		    guessquote++;
   850 		    if (g_utf8_get_char(sprev)=='s')
   851 		    {
   852 			/* looks like a plural apostrophe */
   853 			guessquote-=3;
   854 			if (g_utf8_get_char(snext)==CHAR_SPACE)
   855 			    /* bonus marks! */
   856 			    guessquote-=2;
   857 		    }
   858 		    if (innermost_quote_matches(counters,c))
   859 			/*
   860 			 * Give it the benefit of some doubt,
   861 			 * if a squote is already open.
   862 			 */
   863 			guessquote++;
   864 		    else
   865 			guessquote--;
   866 		    if (guessquote>=0)
   867 			(void)count_quote(counters,c,CLOSING_QUOTE,&tmp_err);
   868 		}
   869 		else
   870 		    /* no adjacent letter - it must be a quote of some kind */
   871 		    (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);
   872 	    }
   873 	}
   874 	if (tmp_err)
   875 	{
   876 	    if (pswit[ECHO_SWITCH])
   877 		g_print("\n%s\n",aline);
   878 	    if (!pswit[OVERVIEW_SWITCH])
   879 		g_print("    Line %ld column %ld - %s\n",
   880 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1,tmp_err->message);
   881 	    g_clear_error(&tmp_err);
   882 	}
   883 	if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
   884 	  c!='\r' && c!='\n')
   885 	    isemptyline=FALSE;  /* ignore lines like  *  *  *  as spacers */
   886 	if (c==CHAR_UNDERSCORE)
   887 	    counters->c_unders++;
   888 	if (c==CHAR_OPEN_SBRACK)
   889 	{
   890 	    if (!matching_difference(counters,COUNTER_ILLUSTRATION) &&
   891 	      !matching_difference(counters,c) && s==aline &&
   892 	      g_str_has_prefix(s,"[Illustration:"))
   893 		increment_matching(counters,COUNTER_ILLUSTRATION,TRUE);
   894 	    else
   895 		increment_matching(counters,c,TRUE);
   896 	}
   897 	else if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK)
   898 	    increment_matching(counters,c,TRUE);
   899 	if (c==CHAR_CLOSE_SBRACK)
   900 	{
   901 	    if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) &&
   902 	      !matching_difference(counters,c) && !*snext)
   903 		increment_matching(counters,COUNTER_ILLUSTRATION,FALSE);
   904 	    else
   905 		increment_matching(counters,c,FALSE);
   906 	}
   907 	else if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK)
   908 	    increment_matching(counters,c,FALSE);
   909 	sprev=s;
   910 	s=snext;
   911     }
   912     return isemptyline;
   913 }
   914 
   915 /*
   916  * check_for_control_characters:
   917  *
   918  * Check for invalid or questionable characters in the line
   919  * Anything above 127 is invalid for plain ASCII, and
   920  * non-printable control characters should also be flagged.
   921  * Tabs should generally not be there.
   922  */
   923 void check_for_control_characters(const char *aline)
   924 {
   925     gunichar c;
   926     const char *s;
   927     for (s=aline;*s;s=g_utf8_next_char(s))
   928     {
   929 	c=g_utf8_get_char(s);
   930 	if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
   931 	{
   932 	    if (pswit[ECHO_SWITCH])
   933 		g_print("\n%s\n",aline);
   934 	    if (!pswit[OVERVIEW_SWITCH])
   935 		g_print("    Line %ld column %ld - Control character %u\n",
   936 		  linecnt,g_utf8_pointer_to_offset(s,aline)+1,c);
   937 	    else
   938 		cnt_bin++;
   939 	}
   940     }
   941 }
   942 
   943 /*
   944  * check_for_odd_characters:
   945  *
   946  * Check for binary and other odd characters.
   947  */
   948 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
   949   gboolean isemptyline)
   950 {
   951     /* Don't repeat multiple warnings on one line. */
   952     gboolean eNon_A=FALSE,eTab=FALSE,eTilde=FALSE;
   953     gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
   954     const char *s;
   955     gunichar c;
   956     for (s=aline;*s;s=g_utf8_next_char(s))
   957     {
   958 	c=g_utf8_get_char(s);
   959 	if (!eNon_A && (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
   960 	{
   961 	    if (pswit[ECHO_SWITCH])
   962 		g_print("\n%s\n",aline);
   963 	    if (!pswit[OVERVIEW_SWITCH])
   964 		if (c>127 && c<160 || c>255)
   965 		    g_print("    Line %ld column %ld - "
   966 		      "Non-ISO-8859 character %u\n",
   967 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
   968 		else
   969 		    g_print("    Line %ld column %ld - "
   970 		      "Non-ASCII character %u\n",
   971 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
   972 	    else
   973 		cnt_bin++;
   974 	    eNon_A=TRUE;
   975 	}
   976 	if (!eTab && c==CHAR_TAB)
   977 	{
   978 	    if (pswit[ECHO_SWITCH])
   979 		g_print("\n%s\n",aline);
   980 	    if (!pswit[OVERVIEW_SWITCH])
   981 		g_print("    Line %ld column %ld - Tab character?\n",
   982 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
   983 	    else
   984 		cnt_odd++;
   985 	    eTab=TRUE;
   986 	}
   987 	if (!eTilde && c==CHAR_TILDE)
   988 	{
   989 	    /*
   990 	     * Often used by OCR software to indicate an
   991 	     * unrecognizable character.
   992 	     */
   993 	    if (pswit[ECHO_SWITCH])
   994 		g_print("\n%s\n",aline);
   995 	    if (!pswit[OVERVIEW_SWITCH])
   996 		g_print("    Line %ld column %ld - Tilde character?\n",
   997 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
   998 	    else
   999 		cnt_odd++;
  1000 	    eTilde=TRUE;
  1001 	}
  1002 	if (!eCarat && c==CHAR_CARAT)
  1003 	{  
  1004 	    if (pswit[ECHO_SWITCH])
  1005 		g_print("\n%s\n",aline);
  1006 	    if (!pswit[OVERVIEW_SWITCH])
  1007 		g_print("    Line %ld column %ld - Carat character?\n",
  1008 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1009 	    else
  1010 		cnt_odd++;
  1011 	    eCarat=TRUE;
  1012 	}
  1013 	if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
  1014 	{  
  1015 	    if (pswit[ECHO_SWITCH])
  1016 		g_print("\n%s\n",aline);
  1017 	    if (!pswit[OVERVIEW_SWITCH])
  1018 		g_print("    Line %ld column %ld - Forward slash?\n",
  1019 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1020 	    else
  1021 		cnt_odd++;
  1022 	    eFSlash=TRUE;
  1023 	}
  1024 	/*
  1025 	 * Report asterisks only in paranoid mode,
  1026 	 * since they're often deliberate.
  1027 	 */
  1028 	if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
  1029 	  c==CHAR_ASTERISK)
  1030 	{
  1031 	    if (pswit[ECHO_SWITCH])
  1032 		g_print("\n%s\n",aline);
  1033 	    if (!pswit[OVERVIEW_SWITCH])
  1034 		g_print("    Line %ld column %ld - Asterisk?\n",
  1035 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1036 	    else
  1037 		cnt_odd++;
  1038 	    eAst=TRUE;
  1039 	}
  1040     }
  1041 }
  1042 
  1043 /*
  1044  * check_for_long_line:
  1045  *
  1046  * Check for line too long.
  1047  */
  1048 void check_for_long_line(const char *aline)
  1049 {
  1050     if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
  1051     {
  1052 	if (pswit[ECHO_SWITCH])
  1053 	    g_print("\n%s\n",aline);
  1054 	if (!pswit[OVERVIEW_SWITCH])
  1055 	    g_print("    Line %ld column %ld - Long line %ld\n",
  1056 	      linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
  1057 	else
  1058 	    cnt_long++;
  1059     }
  1060 }
  1061 
  1062 /*
  1063  * check_for_short_line:
  1064  *
  1065  * Check for line too short.
  1066  *
  1067  * This one is a bit trickier to implement: we don't want to
  1068  * flag the last line of a paragraph for being short, so we
  1069  * have to wait until we know that our current line is a
  1070  * "normal" line, then report the _previous_ line if it was too
  1071  * short. We also don't want to report indented lines like
  1072  * chapter heads or formatted quotations. We therefore keep
  1073  * last->len as the length of the last line examined, and
  1074  * last->blen as the length of the last but one, and try to
  1075  * suppress unnecessary warnings by checking that both were of
  1076  * "normal" length. We keep the first character of the last
  1077  * line in last->start, and if it was a space, we assume that
  1078  * the formatting is deliberate. I can't figure out a way to
  1079  * distinguish something like a quoted verse left-aligned or
  1080  * the header or footer of a letter from a paragraph of short
  1081  * lines - maybe if I examined the whole paragraph, and if the
  1082  * para has less than, say, 8 lines and if all lines are short,
  1083  * then just assume it's OK? Need to look at some texts to see
  1084  * how often a formula like this would get the right result.
  1085  */
  1086 void check_for_short_line(const char *aline,const struct line_properties *last)
  1087 {
  1088     if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
  1089       last->len<SHORTEST_PG_LINE && last->blen>1 &&
  1090       last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
  1091     {
  1092 	if (pswit[ECHO_SWITCH])
  1093 	    g_print("\n%s\n",prevline);
  1094 	if (!pswit[OVERVIEW_SWITCH])
  1095 	    g_print("    Line %ld column %ld - Short line %ld?\n",
  1096 	      linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
  1097 	else
  1098 	    cnt_short++;
  1099     }
  1100 }
  1101 
  1102 /*
  1103  * check_for_starting_punctuation:
  1104  *
  1105  * Look for punctuation other than full ellipses at start of line.
  1106  */
  1107 void check_for_starting_punctuation(const char *aline)
  1108 {
  1109     if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
  1110       !g_str_has_prefix(aline,". . ."))
  1111     {
  1112 	if (pswit[ECHO_SWITCH])
  1113 	    g_print("\n%s\n",aline);
  1114 	if (!pswit[OVERVIEW_SWITCH])
  1115 	    g_print("    Line %ld column 1 - Begins with punctuation?\n",
  1116 	      linecnt);
  1117 	else
  1118 	    cnt_punct++;
  1119     }
  1120 }
  1121 
  1122 /*
  1123  * check_for_spaced_emdash:
  1124  *
  1125  * Check for spaced em-dashes.
  1126  *
  1127  * We must check _all_ occurrences of "--" on the line
  1128  * hence the loop - even if the first double-dash is OK
  1129  * there may be another that's wrong later on.
  1130  */
  1131 void check_for_spaced_emdash(const char *aline)
  1132 {
  1133     const char *s,*t,*next;
  1134     for (s=aline;t=strstr(s,"--");s=next)
  1135     {
  1136 	next=g_utf8_next_char(g_utf8_next_char(t));
  1137 	if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
  1138 	  g_utf8_get_char(next)==CHAR_SPACE)
  1139 	{
  1140 	    if (pswit[ECHO_SWITCH])
  1141 		g_print("\n%s\n",aline);
  1142 	    if (!pswit[OVERVIEW_SWITCH])
  1143 		g_print("    Line %ld column %ld - Spaced em-dash?\n",
  1144 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  1145 	    else
  1146 		cnt_dash++;
  1147 	}
  1148     }
  1149 }
  1150 
  1151 /*
  1152  * check_for_spaced_dash:
  1153  *
  1154  * Check for spaced dashes.
  1155  */
  1156 void check_for_spaced_dash(const char *aline)
  1157 {
  1158     const char *s;
  1159     if ((s=strstr(aline," -")))
  1160     {
  1161 	if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
  1162 	{
  1163 	    if (pswit[ECHO_SWITCH])
  1164 		g_print("\n%s\n",aline);
  1165 	    if (!pswit[OVERVIEW_SWITCH])
  1166 		g_print("    Line %ld column %ld - Spaced dash?\n",
  1167 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1168 	    else
  1169 		cnt_dash++;
  1170 	}
  1171     }
  1172     else if ((s=strstr(aline,"- ")))
  1173     {
  1174 	if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
  1175 	{
  1176 	    if (pswit[ECHO_SWITCH])
  1177 		g_print("\n%s\n",aline);
  1178 	    if (!pswit[OVERVIEW_SWITCH])
  1179 		g_print("    Line %ld column %ld - Spaced dash?\n",
  1180 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1181 	    else
  1182 		cnt_dash++;
  1183 	}
  1184     }
  1185 }
  1186 
  1187 /*
  1188  * check_for_unmarked_paragraphs:
  1189  *
  1190  * Check for unmarked paragraphs indicated by separate speakers.
  1191  *
  1192  * May well be false positive:
  1193  * "Bravo!" "Wonderful!" called the crowd.
  1194  * but useful all the same.
  1195  */
  1196 void check_for_unmarked_paragraphs(const char *aline)
  1197 {
  1198     const char *s;
  1199     s=strstr(aline,"\"  \"");
  1200     if (!s)
  1201 	s=strstr(aline,"\" \"");
  1202     if (s)
  1203     {
  1204 	if (pswit[ECHO_SWITCH])
  1205 	    g_print("\n%s\n",aline);
  1206 	if (!pswit[OVERVIEW_SWITCH])
  1207 	    g_print("    Line %ld column %ld - "
  1208 	      "Query missing paragraph break?\n",
  1209 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1210 	else
  1211 	    cnt_punct++;
  1212     }
  1213 }
  1214 
  1215 /*
  1216  * check_for_jeebies:
  1217  *
  1218  * Check for "to he" and other easy h/b errors.
  1219  *
  1220  * This is a very inadequate effort on the h/b problem,
  1221  * but the phrase "to he" is always an error, whereas "to
  1222  * be" is quite common.
  1223  * Similarly, '"Quiet!", be said.' is a non-be error
  1224  * "to he" is _not_ always an error!:
  1225  *       "Where they went to he couldn't say."
  1226  * Another false positive:
  1227  *       What would "Cinderella" be without the . . .
  1228  * and another: "If he wants to he can see for himself."
  1229  */
  1230 void check_for_jeebies(const char *aline)
  1231 {
  1232     const char *s;
  1233     s=strstr(aline," be could ");
  1234     if (!s)
  1235 	s=strstr(aline," be would ");
  1236     if (!s)
  1237 	s=strstr(aline," was be ");
  1238     if (!s)
  1239 	s=strstr(aline," be is ");
  1240     if (!s)
  1241 	s=strstr(aline," is be ");
  1242     if (!s)
  1243 	s=strstr(aline,"\", be ");
  1244     if (!s)
  1245 	s=strstr(aline,"\" be ");
  1246     if (!s)
  1247 	s=strstr(aline,"\" be ");
  1248     if (!s)
  1249 	s=strstr(aline," to he ");
  1250     if (s)
  1251     {
  1252 	if (pswit[ECHO_SWITCH])
  1253 	    g_print("\n%s\n",aline);
  1254 	if (!pswit[OVERVIEW_SWITCH])
  1255 	    g_print("    Line %ld column %ld - Query he/be error?\n",
  1256 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1257 	else
  1258 	    cnt_word++;
  1259     }
  1260     s=strstr(aline," the had ");
  1261     if (!s)
  1262 	s=strstr(aline," a had ");
  1263     if (!s)
  1264 	s=strstr(aline," they bad ");
  1265     if (!s)
  1266 	s=strstr(aline," she bad ");
  1267     if (!s)
  1268 	s=strstr(aline," he bad ");
  1269     if (!s)
  1270 	s=strstr(aline," you bad ");
  1271     if (!s)
  1272 	s=strstr(aline," i bad ");
  1273     if (s)
  1274     {
  1275 	if (pswit[ECHO_SWITCH])
  1276 	    g_print("\n%s\n",aline);
  1277 	if (!pswit[OVERVIEW_SWITCH])
  1278 	    g_print("    Line %ld column %ld - Query had/bad error?\n",
  1279 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1280 	else
  1281 	    cnt_word++;
  1282     }
  1283     s=strstr(aline,"; hut ");
  1284     if (!s)
  1285 	s=strstr(aline,", hut ");
  1286     if (s)
  1287     {
  1288 	if (pswit[ECHO_SWITCH])
  1289 	    g_print("\n%s\n",aline);
  1290 	if (!pswit[OVERVIEW_SWITCH])
  1291 	    g_print("    Line %ld column %ld - Query hut/but error?\n",
  1292 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1293 	else
  1294 	    cnt_word++;
  1295     }
  1296 }
  1297 
  1298 /*
  1299  * check_for_mta_from:
  1300  *
  1301  * Special case - angled bracket in front of "From" placed there by an
  1302  * MTA when sending an e-mail.
  1303  */
  1304 void check_for_mta_from(const char *aline)
  1305 {
  1306     const char *s;
  1307     s=strstr(aline,">From");
  1308     if (s)
  1309     {
  1310 	if (pswit[ECHO_SWITCH])
  1311 	    g_print("\n%s\n",aline);
  1312 	if (!pswit[OVERVIEW_SWITCH])
  1313 	    g_print("    Line %ld column %ld - "
  1314 	      "Query angled bracket with From\n",
  1315 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1316 	else
  1317 	    cnt_punct++;
  1318     }
  1319 }
  1320 
  1321 /*
  1322  * check_for_orphan_character:
  1323  *
  1324  * Check for a single character line -
  1325  * often an overflow from bad wrapping.
  1326  */
  1327 void check_for_orphan_character(const char *aline)
  1328 {
  1329     gunichar c;
  1330     c=g_utf8_get_char(aline);
  1331     if (c && !*g_utf8_next_char(aline))
  1332     {
  1333 	if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
  1334 	    ; /* Nothing - ignore numerals alone on a line. */
  1335 	else
  1336 	{
  1337 	    if (pswit[ECHO_SWITCH])
  1338 		g_print("\n%s\n",aline);
  1339 	    if (!pswit[OVERVIEW_SWITCH])
  1340 		g_print("    Line %ld column 1 - Query single character line\n",
  1341 		  linecnt);
  1342 	    else
  1343 		cnt_punct++;
  1344 	}
  1345     }
  1346 }
  1347 
  1348 /*
  1349  * check_for_pling_scanno:
  1350  *
  1351  * Check for I" - often should be !
  1352  */
  1353 void check_for_pling_scanno(const char *aline)
  1354 {
  1355     const char *s;
  1356     s=strstr(aline," I\"");
  1357     if (s)
  1358     {
  1359 	if (pswit[ECHO_SWITCH])
  1360 	    g_print("\n%s\n",aline);
  1361 	if (!pswit[OVERVIEW_SWITCH])
  1362 	    g_print("    Line %ld column %ld - Query I=exclamation mark?\n",
  1363 	      linecnt,g_utf8_pointer_to_offset(aline,s));
  1364 	else
  1365 	    cnt_punct++;
  1366     }
  1367 }
  1368 
  1369 /*
  1370  * check_for_extra_period:
  1371  *
  1372  * Check for period without a capital letter. Cut-down from gutspell.
  1373  * Only works when it happens on a single line.
  1374  */
  1375 void check_for_extra_period(const char *aline,const struct warnings *warnings)
  1376 {
  1377     const char *s,*t,*s1,*sprev;
  1378     int i;
  1379     gsize len;
  1380     gboolean istypo;
  1381     gchar *testword;
  1382     gunichar c,nc,pc,*decomposition;
  1383     if (pswit[PARANOID_SWITCH])
  1384     {
  1385 	for (t=aline;t=strstr(t,". ");)
  1386 	{
  1387 	    if (t==aline)
  1388 	    {
  1389 		t=g_utf8_next_char(t);
  1390 		/* start of line punctuation is handled elsewhere */
  1391 		continue;
  1392 	    }
  1393 	    if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
  1394 	    {
  1395 		t=g_utf8_next_char(t);
  1396 		continue;
  1397 	    }
  1398 	    if (warnings->isDutch)
  1399 	    {
  1400 		/* For Frank & Jeroen -- 's Middags case */
  1401 		gunichar c2,c3,c4,c5;
  1402 		c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
  1403 		c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
  1404 		c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
  1405 		c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
  1406 		if (CHAR_IS_APOSTROPHE(c2) &&
  1407 		  g_unichar_islower(c3) && c4==CHAR_SPACE &&
  1408 		  g_unichar_isupper(c5))
  1409 		{
  1410 		    t=g_utf8_next_char(t);
  1411 		    continue;
  1412 		}
  1413 	    }
  1414 	    s1=g_utf8_next_char(g_utf8_next_char(t));
  1415 	    while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
  1416 	      !isdigit(g_utf8_get_char(s1)))
  1417 		s1=g_utf8_next_char(s1);
  1418 	    if (g_unichar_islower(g_utf8_get_char(s1)))
  1419 	    {
  1420 		/* we have something to investigate */
  1421 		istypo=TRUE;
  1422 		/* so let's go back and find out */
  1423 		nc=g_utf8_get_char(t);
  1424 		s1=g_utf8_prev_char(t);
  1425 		c=g_utf8_get_char(s1);
  1426 		sprev=g_utf8_prev_char(s1);
  1427 		pc=g_utf8_get_char(sprev);
  1428 		while (s1>=aline &&
  1429 		  (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||
  1430 		  g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&
  1431 		  g_unichar_isalpha(nc)))
  1432 		{
  1433 		    nc=c;
  1434 		    s1=sprev;
  1435 		    c=pc;
  1436 		    sprev=g_utf8_prev_char(s1);
  1437 		    pc=g_utf8_get_char(sprev);
  1438 		}
  1439 		s1=g_utf8_next_char(s1);
  1440 		s=strchr(s1,'.');
  1441 		if (s)
  1442 		    testword=g_strndup(s1,s-s1);
  1443 		else
  1444 		    testword=g_strdup(s1);
  1445 		for (i=0;*abbrev[i];i++)
  1446 		    if (!strcmp(testword,abbrev[i]))
  1447 			istypo=FALSE;
  1448 		if (g_unichar_isdigit(g_utf8_get_char(testword)))
  1449 		    istypo=FALSE;
  1450 		if (!*g_utf8_next_char(testword))
  1451 		    istypo=FALSE;
  1452 		if (isroman(testword))
  1453 		    istypo=FALSE;
  1454 		if (istypo)
  1455 		{
  1456 		    istypo=FALSE;
  1457 		    for (s=testword;*s;s=g_utf8_next_char(s))
  1458 		    {
  1459 			decomposition=g_unicode_canonical_decomposition(
  1460 			  g_utf8_get_char(s),&len);
  1461 			if (g_utf8_strchr("aeiou",-1,decomposition[0]))
  1462 			    istypo=TRUE;
  1463 			g_free(decomposition);
  1464 		    }
  1465 		}
  1466 		if (istypo &&
  1467 		  (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
  1468 		{
  1469 		    g_tree_insert(qperiod,g_strdup(testword),
  1470 		      GINT_TO_POINTER(1));
  1471 		    if (pswit[ECHO_SWITCH])
  1472 			g_print("\n%s\n",aline);
  1473 		    if (!pswit[OVERVIEW_SWITCH])
  1474 			g_print("    Line %ld column %ld - Extra period?\n",
  1475 			  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  1476 		    else
  1477 			cnt_punct++;
  1478 		}
  1479 		g_free(testword);
  1480 	    }
  1481 	    t=g_utf8_next_char(t);
  1482 	}
  1483     }
  1484 }
  1485 
  1486 /*
  1487  * check_for_following_punctuation:
  1488  *
  1489  * Check for words usually not followed by punctuation.
  1490  */
  1491 void check_for_following_punctuation(const char *aline)
  1492 {
  1493     int i;
  1494     const char *s,*wordstart;
  1495     gunichar c;
  1496     gchar *inword,*t;
  1497     if (pswit[TYPO_SWITCH])
  1498     {
  1499 	for (s=aline;*s;)
  1500 	{
  1501 	    wordstart=s;
  1502 	    t=getaword(&s);
  1503 	    if (!*t)
  1504 	    {
  1505 		g_free(t);
  1506 		continue;
  1507 	    }
  1508 	    inword=g_utf8_strdown(t,-1);
  1509 	    g_free(t);
  1510 	    for (i=0;*nocomma[i];i++)
  1511 		if (!strcmp(inword,nocomma[i]))
  1512 		{
  1513 		    c=g_utf8_get_char(s);
  1514 		    if (c==',' || c==';' || c==':')
  1515 		    {
  1516 			if (pswit[ECHO_SWITCH])
  1517 			    g_print("\n%s\n",aline);
  1518 			if (!pswit[OVERVIEW_SWITCH])
  1519 			    g_print("    Line %ld column %ld - "
  1520 			      "Query punctuation after %s?\n",
  1521 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,
  1522 			      inword);
  1523 			else
  1524 			    cnt_punct++;
  1525 		    }
  1526 		}
  1527 	    for (i=0;*noperiod[i];i++)
  1528 		if (!strcmp(inword,noperiod[i]))
  1529 		{
  1530 		    c=g_utf8_get_char(s);
  1531 		    if (c=='.' || c=='!')
  1532 		    {
  1533 			if (pswit[ECHO_SWITCH])
  1534 			    g_print("\n%s\n",aline);
  1535 			if (!pswit[OVERVIEW_SWITCH])
  1536 			    g_print("    Line %ld column %ld - "
  1537 			      "Query punctuation after %s?\n",
  1538 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,
  1539 			      inword);
  1540 			else
  1541 			    cnt_punct++;
  1542 		    }
  1543 		}
  1544 	    g_free(inword);
  1545 	}
  1546     }
  1547 }
  1548 
  1549 /*
  1550  * check_for_typos:
  1551  *
  1552  * Check for commonly mistyped words,
  1553  * and digits like 0 for O in a word.
  1554  */
  1555 void check_for_typos(const char *aline,struct warnings *warnings)
  1556 {
  1557     const char *s,*t,*nt,*wordstart;
  1558     gchar *inword;
  1559     gunichar *decomposition;
  1560     gchar *testword;
  1561     int i,vowel,consonant,*dupcnt;
  1562     gboolean isdup,istypo,alower;
  1563     gunichar c,pc;
  1564     long offset,len;
  1565     gsize decomposition_len;
  1566     for (s=aline;*s;)
  1567     {
  1568 	wordstart=s;
  1569 	inword=getaword(&s);
  1570 	if (!*inword)
  1571 	{
  1572 	    g_free(inword);
  1573 	    continue; /* don't bother with empty lines */
  1574 	}
  1575 	if (mixdigit(inword))
  1576 	{
  1577 	    if (pswit[ECHO_SWITCH])
  1578 		g_print("\n%s\n",aline);
  1579 	    if (!pswit[OVERVIEW_SWITCH])
  1580 		g_print("    Line %ld column %ld - Query digit in %s\n",
  1581 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
  1582 	    else
  1583 		cnt_word++;
  1584 	}
  1585 	/*
  1586 	 * Put the word through a series of tests for likely typos and OCR
  1587 	 * errors.
  1588 	 */
  1589 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
  1590 	{
  1591 	    istypo=FALSE;
  1592 	    alower=FALSE;
  1593 	    for (t=inword;*t;t=g_utf8_next_char(t))
  1594 	    {
  1595 		c=g_utf8_get_char(t);
  1596 		nt=g_utf8_next_char(t);
  1597 		/* lowercase for testing */
  1598 		if (g_unichar_islower(c))
  1599 		    alower=TRUE;
  1600 		if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
  1601 		{
  1602 		    /*
  1603 		     * We have an uppercase mid-word. However, there are
  1604 		     * common cases:
  1605 		     *   Mac and Mc like McGill
  1606 		     *   French contractions like l'Abbe
  1607 		     */
  1608 		    offset=g_utf8_pointer_to_offset(inword,t);
  1609 		    if (offset>0)
  1610 			pc=g_utf8_get_char(g_utf8_prev_char(t));
  1611 		    else
  1612 			pc='\0';
  1613 		    if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
  1614 		      offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
  1615 		      g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
  1616 		      CHAR_IS_APOSTROPHE(pc))
  1617 			; /* do nothing! */
  1618 		    else
  1619 			istypo=TRUE;
  1620 		}
  1621 	    }
  1622 	    testword=g_utf8_casefold(inword,-1);
  1623 	}
  1624 	if (pswit[TYPO_SWITCH])
  1625 	{
  1626 	    /*
  1627 	     * Check for certain unlikely two-letter combinations at word
  1628 	     * start and end.
  1629 	     */
  1630 	    len=g_utf8_strlen(testword,-1);
  1631 	    if (len>1)
  1632 	    {
  1633 		for (i=0;*nostart[i];i++)
  1634 		    if (g_str_has_prefix(testword,nostart[i]))
  1635 			istypo=TRUE;
  1636 		for (i=0;*noend[i];i++)
  1637 		    if (g_str_has_suffix(testword,noend[i]))
  1638 			istypo=TRUE;
  1639 	    }
  1640 	    /* ght is common, gbt never. Like that. */
  1641 	    if (strstr(testword,"cb"))
  1642 		istypo=TRUE;
  1643 	    if (strstr(testword,"gbt"))
  1644 		istypo=TRUE;
  1645 	    if (strstr(testword,"pbt"))
  1646 		istypo=TRUE;
  1647 	    if (strstr(testword,"tbs"))
  1648 		istypo=TRUE;
  1649 	    if (strstr(testword,"mrn"))
  1650 		istypo=TRUE;
  1651 	    if (strstr(testword,"ahle"))
  1652 		istypo=TRUE;
  1653 	    if (strstr(testword,"ihle"))
  1654 		istypo=TRUE;
  1655 	    /*
  1656 	     * "TBE" does happen - like HEARTBEAT - but uncommon.
  1657 	     * Also "TBI" - frostbite, outbid - but uncommon.
  1658 	     * Similarly "ii" like Hawaii, or Pompeii, and in Roman
  1659 	     * numerals, but "ii" is a common scanno.
  1660 	     */
  1661 	    if (strstr(testword,"tbi"))
  1662 		istypo=TRUE;
  1663 	    if (strstr(testword,"tbe"))
  1664 		istypo=TRUE;
  1665 	    if (strstr(testword,"ii"))
  1666 		istypo=TRUE;
  1667 	    /*
  1668 	     * Check for no vowels or no consonants.
  1669 	     * If none, flag a typo.
  1670 	     */
  1671 	    if (!istypo && len>1)
  1672 	    {
  1673 		vowel=consonant=0;
  1674 		for (t=testword;*t;t=g_utf8_next_char(t))
  1675 		{
  1676 		    c=g_utf8_get_char(t);
  1677 		    decomposition=
  1678 		      g_unicode_canonical_decomposition(c,&decomposition_len);
  1679 		    if (c=='y' || g_unichar_isdigit(c))
  1680 		    {
  1681 			/* Yah, this is loose. */
  1682 			vowel++;
  1683 			consonant++;
  1684 		    }
  1685 		    else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
  1686 			vowel++;
  1687 		    else
  1688 			consonant++;
  1689 		    g_free(decomposition);
  1690 		}
  1691 		if (!vowel || !consonant)
  1692 		    istypo=TRUE;
  1693 	    }
  1694 	    /*
  1695 	     * Now exclude the word from being reported if it's in
  1696 	     * the okword list.
  1697 	     */
  1698 	    for (i=0;*okword[i];i++)
  1699 		if (!strcmp(testword,okword[i]))
  1700 		    istypo=FALSE;
  1701 	    /*
  1702 	     * What looks like a typo may be a Roman numeral.
  1703 	     * Exclude these.
  1704 	     */
  1705 	    if (istypo && isroman(testword))
  1706 		istypo=FALSE;
  1707 	    /* Check the manual list of typos. */
  1708 	    if (!istypo)
  1709 		for (i=0;*typo[i];i++)
  1710 		    if (!strcmp(testword,typo[i]))
  1711 			istypo=TRUE;
  1712 	    /*
  1713 	     * Check lowercase s, l, i and m - special cases.
  1714 	     *   "j" - often a semi-colon gone wrong.
  1715 	     *   "d" for a missing apostrophe - he d
  1716 	     *   "n" for "in"
  1717 	     */
  1718 	    if (!istypo && len==1 &&
  1719 	      g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
  1720 		istypo=TRUE;
  1721 	    if (istypo)
  1722 	    {
  1723 		dupcnt=g_tree_lookup(qword,testword);
  1724 		if (dupcnt)
  1725 		{
  1726 		    (*dupcnt)++;
  1727 		    isdup=!pswit[VERBOSE_SWITCH];
  1728 		}
  1729 		else
  1730 		{
  1731 		    dupcnt=g_new0(int,1);
  1732 		    g_tree_insert(qword,g_strdup(testword),dupcnt);
  1733 		    isdup=FALSE;
  1734 		}
  1735 		if (!isdup)
  1736 		{
  1737 		    if (pswit[ECHO_SWITCH])
  1738 			g_print("\n%s\n",aline);
  1739 		    if (!pswit[OVERVIEW_SWITCH])
  1740 		    {
  1741 			g_print("    Line %ld column %ld - Query word %s",
  1742 			  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
  1743 			  inword);
  1744 			if (!pswit[VERBOSE_SWITCH])
  1745 			    g_print(" - not reporting duplicates");
  1746 			g_print("\n");
  1747 		    }
  1748 		    else
  1749 			cnt_word++;
  1750 		}
  1751 	    }
  1752 	}
  1753 	/* check the user's list of typos */
  1754 	if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
  1755 	{
  1756 	    if (pswit[ECHO_SWITCH])
  1757 		g_print("\n%s\n",aline);
  1758 	    if (!pswit[OVERVIEW_SWITCH])  
  1759 		g_print("    Line %ld column %ld - Query possible scanno %s\n",
  1760 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
  1761 	}
  1762 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
  1763 	    g_free(testword);
  1764 	if (pswit[PARANOID_SWITCH] && warnings->digit)
  1765 	{
  1766 	    /* In paranoid mode, query all 0 and 1 standing alone. */
  1767 	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))
  1768 	    {
  1769 		if (pswit[ECHO_SWITCH])
  1770 		    g_print("\n%s\n",aline);
  1771 		if (!pswit[OVERVIEW_SWITCH])
  1772 		    g_print("    Line %ld column %ld - Query standalone %s\n",
  1773 		      linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
  1774 		      inword);
  1775 		else
  1776 		    cnt_word++;
  1777 	    }
  1778 	}
  1779 	g_free(inword);
  1780     }
  1781 }
  1782 
  1783 /*
  1784  * check_for_misspaced_punctuation:
  1785  *
  1786  * Look for added or missing spaces around punctuation and quotes.
  1787  * If there is a punctuation character like ! with no space on
  1788  * either side, suspect a missing!space. If there are spaces on
  1789  * both sides , assume a typo. If we see a double quote with no
  1790  * space or punctuation on either side of it, assume unspaced
  1791  * quotes "like"this.
  1792  */
  1793 void check_for_misspaced_punctuation(const char *aline,
  1794   struct parities *parities,gboolean isemptyline)
  1795 {
  1796     gboolean isacro,isellipsis;
  1797     const char *s;
  1798     gunichar c,nc,pc,n2c;
  1799     int parity;
  1800     c=g_utf8_get_char(aline);
  1801     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  1802     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  1803     {
  1804 	pc=c;
  1805 	c=nc;
  1806 	nc=g_utf8_get_char(g_utf8_next_char(s));
  1807 	/* For each character in the line after the first. */
  1808 	if (g_utf8_strchr(".?!,;:_",-1,c))  /* if it's punctuation */
  1809 	{
  1810 	    /* we need to suppress warnings for acronyms like M.D. */
  1811 	    isacro=FALSE;
  1812 	    /* we need to suppress warnings for ellipsis . . . */
  1813 	    isellipsis=FALSE;
  1814 	    /*
  1815 	     * If there are letters on both sides of it or
  1816 	     * if it's strict punctuation followed by an alpha.
  1817 	     */
  1818 	    if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
  1819 	      g_utf8_strchr("?!,;:",-1,c)))
  1820 	    {
  1821 		if (c=='.')
  1822 		{
  1823 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&
  1824 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
  1825 			isacro=TRUE;
  1826 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
  1827 		    if (nc && n2c=='.')
  1828 			isacro=TRUE;
  1829 		}
  1830 		if (!isacro)
  1831 		{
  1832 		    if (pswit[ECHO_SWITCH])
  1833 			g_print("\n%s\n",aline);
  1834 		    if (!pswit[OVERVIEW_SWITCH])
  1835 			g_print("    Line %ld column %ld - Missing space?\n",
  1836 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1837 		    else
  1838 			cnt_punct++;
  1839 		}
  1840 	    }
  1841 	    if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
  1842 	    {
  1843 		/*
  1844 		 * If there are spaces on both sides,
  1845 		 * or space before and end of line.
  1846 		 */
  1847 		if (c=='.')
  1848 		{
  1849 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&
  1850 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
  1851 			isellipsis=TRUE;
  1852 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
  1853 		    if (nc && n2c=='.')
  1854 			isellipsis=TRUE;
  1855 		}
  1856 		if (!isemptyline && !isellipsis)
  1857 		{
  1858 		    if (pswit[ECHO_SWITCH])
  1859 			g_print("\n%s\n",aline);
  1860 		    if (!pswit[OVERVIEW_SWITCH])
  1861 			g_print("    Line %ld column %ld - "
  1862 			  "Spaced punctuation?\n",linecnt,
  1863 			  g_utf8_pointer_to_offset(aline,s)+1);
  1864 		    else
  1865 			cnt_punct++;
  1866 		}
  1867 	    }
  1868 	}
  1869     }
  1870     /* Split out the characters that CANNOT be preceded by space. */
  1871     c=g_utf8_get_char(aline);
  1872     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  1873     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  1874     {
  1875 	pc=c;
  1876 	c=nc;
  1877 	nc=g_utf8_get_char(g_utf8_next_char(s));
  1878 	/* for each character in the line after the first */
  1879 	if (g_utf8_strchr("?!,;:",-1,c))
  1880 	{
  1881 	    /* if it's punctuation that _cannot_ have a space before it */
  1882 	    if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
  1883 	    {
  1884 		/*
  1885 		 * If nc DOES == space,
  1886 		 * it was already reported just above.
  1887 		 */
  1888 		if (pswit[ECHO_SWITCH])
  1889 		    g_print("\n%s\n",aline);
  1890 		if (!pswit[OVERVIEW_SWITCH])
  1891 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",
  1892 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1893 		else
  1894 		    cnt_punct++;
  1895 	    }
  1896 	}
  1897     }
  1898     /*
  1899      * Special case " .X" where X is any alpha.
  1900      * This plugs a hole in the acronym code above.
  1901      * Inelegant, but maintainable.
  1902      */
  1903     c=g_utf8_get_char(aline);
  1904     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  1905     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  1906     {
  1907 	pc=c;
  1908 	c=nc;
  1909 	nc=g_utf8_get_char(g_utf8_next_char(s));
  1910 	/* for each character in the line after the first */
  1911 	if (c=='.')
  1912 	{
  1913 	    /* if it's a period */
  1914 	    if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
  1915 	    {
  1916 		/*
  1917 		 * If the period follows a space and
  1918 		 * is followed by a letter.
  1919 		 */
  1920 		if (pswit[ECHO_SWITCH])
  1921 		    g_print("\n%s\n",aline);
  1922 		if (!pswit[OVERVIEW_SWITCH])
  1923 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",
  1924 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1925 		else
  1926 		    cnt_punct++;
  1927 	    }
  1928 	}
  1929     }
  1930     c=g_utf8_get_char(aline);
  1931     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  1932     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  1933     {
  1934 	pc=c;
  1935 	c=nc;
  1936 	nc=g_utf8_get_char(g_utf8_next_char(s));
  1937 	/* for each character in the line after the first */
  1938 	if (CHAR_IS_DQUOTE(c))
  1939 	{
  1940 	    if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
  1941 	      !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
  1942 	      !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
  1943 	    {
  1944 		if (pswit[ECHO_SWITCH])
  1945 		    g_print("\n%s\n",aline);
  1946 		if (!pswit[OVERVIEW_SWITCH])
  1947 		    g_print("    Line %ld column %ld - Unspaced quotes?\n",
  1948 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1949 		else
  1950 		    cnt_punct++;
  1951 	    }
  1952 	}
  1953     }
  1954     /* Check parity of quotes. */
  1955     nc=g_utf8_get_char(aline);
  1956     for (s=aline;*s;s=g_utf8_next_char(s))
  1957     {
  1958 	c=nc;
  1959 	nc=g_utf8_get_char(g_utf8_next_char(s));
  1960 	if (CHAR_IS_DQUOTE(c))
  1961 	{
  1962 	    if (c==CHAR_DQUOTE)
  1963 	    {
  1964 		parities->dquote=!parities->dquote;
  1965 		parity=parities->dquote;
  1966 	    }
  1967 	    else if (c==CHAR_LD_QUOTE)
  1968 		parity=1;
  1969 	    else
  1970 		parity=0;
  1971 	    if (!parity)
  1972 	    {
  1973 		/* parity even */
  1974 		if (!g_utf8_strchr("_-.'`/,;:!?)]} ",-1,nc))
  1975 		{
  1976 		    if (pswit[ECHO_SWITCH])
  1977 			g_print("\n%s\n",aline);
  1978 		    if (!pswit[OVERVIEW_SWITCH])
  1979 			g_print("    Line %ld column %ld - "
  1980 			  "Wrongspaced quotes?\n",
  1981 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1982 		    else
  1983 			cnt_punct++;
  1984 		}
  1985 	    }
  1986 	    else
  1987 	    {
  1988 		/* parity odd */
  1989 		if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
  1990 		  !g_utf8_strchr("_-/.'`([{$",-1,nc) || !nc)
  1991 		{
  1992 		    if (pswit[ECHO_SWITCH])
  1993 			g_print("\n%s\n",aline);
  1994 		    if (!pswit[OVERVIEW_SWITCH])
  1995 			g_print("    Line %ld column %ld - "
  1996 			  "Wrongspaced quotes?\n",
  1997 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1998 		    else
  1999 			cnt_punct++;
  2000 		}
  2001 	    }
  2002 	}
  2003     }
  2004     c=g_utf8_get_char(aline);
  2005     if (CHAR_IS_DQUOTE(c))
  2006     {
  2007 	if (g_utf8_strchr(",;:!?)]} ",-1,
  2008 	  g_utf8_get_char(g_utf8_next_char(aline))))
  2009 	{
  2010 	    if (pswit[ECHO_SWITCH])
  2011 		g_print("\n%s\n",aline);
  2012 	    if (!pswit[OVERVIEW_SWITCH])
  2013 		g_print("    Line %ld column 1 - Wrongspaced quotes?\n",
  2014 		  linecnt);
  2015 	    else
  2016 		cnt_punct++;
  2017 	}
  2018     }
  2019     if (pswit[SQUOTE_SWITCH])
  2020     {
  2021 	nc=g_utf8_get_char(aline);
  2022 	for (s=aline;*s;s=g_utf8_next_char(s))
  2023 	{
  2024 	    c=nc;
  2025 	    nc=g_utf8_get_char(g_utf8_next_char(s));
  2026 	    if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&
  2027 	      !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
  2028 	      !g_unichar_isalpha(nc)))
  2029 	    {
  2030 		parities->squote=!parities->squote;
  2031 		if (!parities->squote)
  2032 		{
  2033 		    /* parity even */
  2034 		    if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
  2035 		    {
  2036 			if (pswit[ECHO_SWITCH])
  2037 			    g_print("\n%s\n",aline);
  2038 			if (!pswit[OVERVIEW_SWITCH])
  2039 			    g_print("    Line %ld column %ld - "
  2040 			      "Wrongspaced singlequotes?\n",
  2041 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2042 			else
  2043 			    cnt_punct++;
  2044 		    }
  2045 		}
  2046 		else
  2047 		{
  2048 		    /* parity odd */
  2049 		    if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
  2050 		      !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
  2051 		    {
  2052 			if (pswit[ECHO_SWITCH])
  2053 			    g_print("\n%s\n",aline);
  2054 			if (!pswit[OVERVIEW_SWITCH])
  2055 			    g_print("    Line %ld column %ld - "
  2056 			      "Wrongspaced singlequotes?\n",
  2057 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2058 			else
  2059 			    cnt_punct++;
  2060 		    }
  2061 		}
  2062 	    }
  2063 	}
  2064     }
  2065 }
  2066 
  2067 /*
  2068  * check_for_double_punctuation:
  2069  *
  2070  * Look for double punctuation like ,. or ,,
  2071  * Thanks to DW for the suggestion!
  2072  * In books with references, ".," and ".;" are common
  2073  * e.g. "etc., etc.," and vol. 1.; vol 3.;
  2074  * OTOH, from my initial tests, there are also fairly
  2075  * common errors. What to do? Make these cases paranoid?
  2076  * ".," is the most common, so warnings->dotcomma is used
  2077  * to suppress detailed reporting if it occurs often.
  2078  */
  2079 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
  2080 {
  2081     const char *s;
  2082     gunichar c,nc;
  2083     nc=g_utf8_get_char(aline);
  2084     for (s=aline;*s;s=g_utf8_next_char(s))
  2085     {
  2086 	c=nc;
  2087 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2088 	/* for each punctuation character in the line */
  2089 	if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
  2090 	  g_utf8_strchr(".?!,;:",-1,nc))
  2091 	{
  2092 	    /* followed by punctuation, it's a query, unless . . . */
  2093 	    if (c==nc && (c=='.' || c=='?' || c=='!') ||
  2094 	      !warnings->dotcomma && c=='.' && nc==',' ||
  2095 	      warnings->isFrench && g_str_has_prefix(s,",...") ||
  2096 	      warnings->isFrench && g_str_has_prefix(s,"...,") ||
  2097 	      warnings->isFrench && g_str_has_prefix(s,";...") ||
  2098 	      warnings->isFrench && g_str_has_prefix(s,"...;") ||
  2099 	      warnings->isFrench && g_str_has_prefix(s,":...") ||
  2100 	      warnings->isFrench && g_str_has_prefix(s,"...:") ||
  2101 	      warnings->isFrench && g_str_has_prefix(s,"!...") ||
  2102 	      warnings->isFrench && g_str_has_prefix(s,"...!") ||
  2103 	      warnings->isFrench && g_str_has_prefix(s,"?...") ||
  2104 	      warnings->isFrench && g_str_has_prefix(s,"...?"))
  2105 	    {
  2106 		if (warnings->isFrench && g_str_has_prefix(s,",...") ||
  2107 		  warnings->isFrench && g_str_has_prefix(s,"...,") ||
  2108 		  warnings->isFrench && g_str_has_prefix(s,";...") ||
  2109 		  warnings->isFrench && g_str_has_prefix(s,"...;") ||
  2110 		  warnings->isFrench && g_str_has_prefix(s,":...") ||
  2111 		  warnings->isFrench && g_str_has_prefix(s,"...:") ||
  2112 		  warnings->isFrench && g_str_has_prefix(s,"!...") ||
  2113 		  warnings->isFrench && g_str_has_prefix(s,"...!") ||
  2114 		  warnings->isFrench && g_str_has_prefix(s,"?...") ||
  2115 		  warnings->isFrench && g_str_has_prefix(s,"...?"))
  2116 		{
  2117 		    s+=4;
  2118 		    nc=g_utf8_get_char(g_utf8_next_char(s));
  2119 		}
  2120 		; /* do nothing for .. !! and ?? which can be legit */
  2121 	    }
  2122 	    else
  2123 	    {
  2124 		if (pswit[ECHO_SWITCH])
  2125 		    g_print("\n%s\n",aline);
  2126 		if (!pswit[OVERVIEW_SWITCH])
  2127 		    g_print("    Line %ld column %ld - Double punctuation?\n",
  2128 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2129 		else
  2130 		    cnt_punct++;
  2131 	    }
  2132 	}
  2133     }
  2134 }
  2135 
  2136 /*
  2137  * check_for_spaced_quotes:
  2138  */
  2139 void check_for_spaced_quotes(const char *aline)
  2140 {
  2141     int i;
  2142     const char *s,*t;
  2143     const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,
  2144       CHAR_RS_QUOTE};
  2145     GString *pattern;
  2146     s=aline;
  2147     while ((t=strstr(s," \" ")))
  2148     {
  2149 	if (pswit[ECHO_SWITCH])
  2150 	    g_print("\n%s\n",aline);
  2151 	if (!pswit[OVERVIEW_SWITCH])
  2152 	    g_print("    Line %ld column %ld - Spaced doublequote?\n",
  2153 	      linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  2154 	else
  2155 	    cnt_punct++;
  2156 	s=g_utf8_next_char(g_utf8_next_char(t));
  2157     }
  2158     pattern=g_string_new(NULL);
  2159     for(i=0;i<G_N_ELEMENTS(single_quotes);i++)
  2160     {
  2161 	g_string_assign(pattern," ");
  2162 	g_string_append_unichar(pattern,single_quotes[i]);
  2163 	g_string_append_c(pattern,' ');
  2164 	s=aline;
  2165 	while ((t=strstr(s,pattern->str)))
  2166 	{
  2167 	    if (pswit[ECHO_SWITCH])
  2168 		g_print("\n%s\n",aline);
  2169 	    if (!pswit[OVERVIEW_SWITCH])
  2170 		g_print("    Line %ld column %ld - Spaced singlequote?\n",
  2171 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  2172 	    else
  2173 		cnt_punct++;
  2174 	    s=g_utf8_next_char(g_utf8_next_char(t));
  2175 	}
  2176     }
  2177     g_string_free(pattern,TRUE);
  2178 }
  2179 
  2180 /*
  2181  * check_for_miscased_genative:
  2182  *
  2183  * Check special case of 'S instead of 's at end of word.
  2184  */
  2185 void check_for_miscased_genative(const char *aline)
  2186 {
  2187     const char *s;
  2188     gunichar c,nc,pc;
  2189     if (!*aline)
  2190 	return;
  2191     c=g_utf8_get_char(aline);
  2192     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2193     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2194     {
  2195 	pc=c;
  2196 	c=nc;
  2197 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2198 	if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))
  2199 	{
  2200 	    if (pswit[ECHO_SWITCH])
  2201 		g_print("\n%s\n",aline);
  2202 	    if (!pswit[OVERVIEW_SWITCH])
  2203 		g_print("    Line %ld column %ld - Capital \"S\"?\n",
  2204 		  linecnt,g_utf8_pointer_to_offset(aline,s)+2);
  2205 	    else
  2206 		cnt_punct++;
  2207 	}
  2208     }
  2209 }
  2210 
  2211 /*
  2212  * check_end_of_line:
  2213  *
  2214  * Now check special cases - start and end of line -
  2215  * for single and double quotes. Start is sometimes [sic]
  2216  * but better to query it anyway.
  2217  * While we're here, check for dash at end of line.
  2218  */
  2219 void check_end_of_line(const char *aline,struct warnings *warnings)
  2220 {
  2221     int lbytes;
  2222     const char *s;
  2223     gunichar c1,c2;
  2224     lbytes=strlen(aline);
  2225     if (g_utf8_strlen(aline,lbytes)>1)
  2226     {
  2227 	s=g_utf8_prev_char(aline+lbytes);
  2228 	c1=g_utf8_get_char(s);
  2229 	c2=g_utf8_get_char(g_utf8_prev_char(s));
  2230 	if ((CHAR_IS_DQUOTE(c1) || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)
  2231 	{
  2232 	    if (pswit[ECHO_SWITCH])
  2233 		g_print("\n%s\n",aline);
  2234 	    if (!pswit[OVERVIEW_SWITCH])
  2235 		g_print("    Line %ld column %ld - Spaced quote?\n",linecnt,
  2236 		  g_utf8_strlen(aline,lbytes));
  2237 	    else
  2238 		cnt_punct++;
  2239 	}
  2240 	c1=g_utf8_get_char(aline);
  2241 	c2=g_utf8_get_char(g_utf8_next_char(aline));
  2242 	if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)
  2243 	{
  2244 	    if (pswit[ECHO_SWITCH])
  2245 		g_print("\n%s\n",aline);
  2246 	    if (!pswit[OVERVIEW_SWITCH])
  2247 		g_print("    Line %ld column 1 - Spaced quote?\n",linecnt);
  2248 	    else
  2249 		cnt_punct++;
  2250 	}
  2251 	/*
  2252 	 * Dash at end of line may well be legit - paranoid mode only
  2253 	 * and don't report em-dash at line-end.
  2254 	 */
  2255 	if (pswit[PARANOID_SWITCH] && warnings->hyphen)
  2256 	{
  2257 	    for (s=g_utf8_prev_char(aline+lbytes);
  2258 	      s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
  2259 		;
  2260 	    if (g_utf8_get_char(s)=='-' &&
  2261 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')
  2262 	    {
  2263 		if (pswit[ECHO_SWITCH])
  2264 		    g_print("\n%s\n",aline);
  2265 		if (!pswit[OVERVIEW_SWITCH])
  2266 		    g_print("    Line %ld column %ld - "
  2267 		      "Hyphen at end of line?\n",
  2268 		      linecnt,g_utf8_pointer_to_offset(aline,s));
  2269 	    }
  2270 	}
  2271     }
  2272 }
  2273 
  2274 /*
  2275  * check_for_unspaced_bracket:
  2276  *
  2277  * Brackets are often unspaced, but shouldn't be surrounded by alpha.
  2278  * If so, suspect a scanno like "a]most".
  2279  */
  2280 void check_for_unspaced_bracket(const char *aline)
  2281 {
  2282     const char *s;
  2283     gunichar c,nc,pc;
  2284     c=g_utf8_get_char(aline);
  2285     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2286     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2287     {
  2288 	pc=c;
  2289 	c=nc;
  2290 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2291 	if (!nc)
  2292 	    break;
  2293 	/* for each bracket character in the line except 1st & last */
  2294 	if (g_utf8_strchr("{[()]}",-1,c) &&
  2295 	  g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
  2296 	{
  2297 	    if (pswit[ECHO_SWITCH])
  2298 		g_print("\n%s\n",aline);
  2299 	    if (!pswit[OVERVIEW_SWITCH])
  2300 		g_print("    Line %ld column %ld - Unspaced bracket?\n",
  2301 		  linecnt,g_utf8_pointer_to_offset(aline,s));
  2302 	    else
  2303 		cnt_punct++;
  2304 	}
  2305     }
  2306 }
  2307 
  2308 /*
  2309  * check_for_unpunctuated_endquote:
  2310  */
  2311 void check_for_unpunctuated_endquote(const char *aline)
  2312 {
  2313     const char *s;
  2314     gunichar c,nc,pc;
  2315     QuoteClass qc;
  2316     c=g_utf8_get_char(aline);
  2317     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2318     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2319     {
  2320 	pc=c;
  2321 	c=nc;
  2322 	qc=CHAR_IS_DQUOTE(c)?QUOTE_CLASS(c):INVALID_QUOTE;
  2323 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2324 	/* for each character in the line except 1st */
  2325 	if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) && isalpha(pc))
  2326 	{
  2327 	    if (pswit[ECHO_SWITCH])
  2328 		g_print("\n%s\n",aline);
  2329 	    if (!pswit[OVERVIEW_SWITCH])
  2330 		g_print("    Line %ld column %ld - "
  2331 		  "endquote missing punctuation?\n",
  2332 		  linecnt,g_utf8_pointer_to_offset(aline,s));
  2333 	    else
  2334 		cnt_punct++;
  2335 	}
  2336     }
  2337 }
  2338 
  2339 /*
  2340  * check_for_html_tag:
  2341  *
  2342  * Check for <HTML TAG>.
  2343  *
  2344  * If there is a < in the line, followed at some point
  2345  * by a > then we suspect HTML.
  2346  */
  2347 void check_for_html_tag(const char *aline)
  2348 {
  2349     const char *open,*close;
  2350     gchar *tag;
  2351     open=strchr(aline,'<');
  2352     if (open)
  2353     {
  2354 	close=strchr(g_utf8_next_char(open),'>');
  2355 	if (close)
  2356 	{
  2357 	    if (pswit[ECHO_SWITCH])
  2358 		g_print("\n%s\n",aline);
  2359 	    if (!pswit[OVERVIEW_SWITCH])
  2360 	    {
  2361 		tag=g_strndup(open,close-open+1);
  2362 		g_print("    Line %ld column %ld - HTML Tag? %s \n",
  2363 		  linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
  2364 		g_free(tag);
  2365 	    }
  2366 	    else
  2367 		cnt_html++;
  2368 	}
  2369     }
  2370 }
  2371 
  2372 /*
  2373  * check_for_html_entity:
  2374  *
  2375  * Check for &symbol; HTML.
  2376  *
  2377  * If there is a & in the line, followed at
  2378  * some point by a ; then we suspect HTML.
  2379  */
  2380 void check_for_html_entity(const char *aline)
  2381 {
  2382     const char *s,*amp,*scolon;
  2383     gchar *entity;
  2384     amp=strchr(aline,'&');
  2385     if (amp)
  2386     {
  2387 	scolon=strchr(amp,';');
  2388 	if (scolon)
  2389 	{
  2390 	    for (s=amp;s<scolon;s=g_utf8_next_char(s))   
  2391 		if (g_utf8_get_char(s)==CHAR_SPACE)
  2392 		    break;		/* Don't report "Jones & Son;" */
  2393 	    if (s>=scolon)
  2394 	    {
  2395 		if (pswit[ECHO_SWITCH])
  2396 		    g_print("\n%s\n",aline);
  2397 		if (!pswit[OVERVIEW_SWITCH])
  2398 		{
  2399 		    entity=g_strndup(amp,scolon-amp+1);
  2400 		    g_print("    Line %ld column %d - HTML symbol? %s \n",
  2401 		      linecnt,(int)(amp-aline)+1,entity);
  2402 		    g_free(entity);
  2403 		}
  2404 		else
  2405 		    cnt_html++;
  2406 	    }
  2407 	}
  2408     }
  2409 }
  2410 
  2411 /*
  2412  * check_for_omitted_punctuation:
  2413  *
  2414  * Check for omitted punctuation at end of paragraph by working back
  2415  * through prevline. DW.
  2416  * Need to check this only for "normal" paras.
  2417  * So what is a "normal" para?
  2418  *    Not normal if one-liner (chapter headings, etc.)
  2419  *    Not normal if doesn't contain at least one locase letter
  2420  *    Not normal if starts with space
  2421  */
  2422 void check_for_omitted_punctuation(const char *prevline,
  2423   struct line_properties *last,int start_para_line)
  2424 {
  2425     gboolean letter_on_line=FALSE;
  2426     const char *s;
  2427     gunichar c;
  2428     gboolean closing_quote;
  2429     for (s=prevline;*s;s=g_utf8_next_char(s))
  2430 	if (g_unichar_isalpha(g_utf8_get_char(s)))
  2431 	{
  2432 	    letter_on_line=TRUE;
  2433 	    break;
  2434 	}
  2435     /*
  2436      * This next "if" is a problem.
  2437      * If we say "start_para_line <= linecnt - 1", that includes
  2438      * one-line "paragraphs" like chapter heads. Lotsa false positives.
  2439      * If we say "start_para_line < linecnt - 1" it doesn't, but then it
  2440      * misses genuine one-line paragraphs.
  2441      */
  2442     if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
  2443       g_utf8_get_char(prevline)>CHAR_SPACE)
  2444     {
  2445 	s=prevline+strlen(prevline);
  2446 	do
  2447 	{
  2448 	    s=g_utf8_prev_char(s);
  2449 	    c=g_utf8_get_char(s);
  2450 	    if (QUOTE_CLASS(c)==CLOSING_QUOTE || QUOTE_CLASS(c)==NEUTRAL_QUOTE)
  2451 		closing_quote=TRUE;
  2452 	    else
  2453 		closing_quote=FALSE;
  2454 	} while (closing_quote && s>prevline);
  2455 	for (;s>prevline;s=g_utf8_prev_char(s))
  2456 	{
  2457 	    if (g_unichar_isalpha(g_utf8_get_char(s)))
  2458 	    {
  2459 		if (pswit[ECHO_SWITCH])
  2460 		    g_print("\n%s\n",prevline);
  2461 		if (!pswit[OVERVIEW_SWITCH])
  2462 		    g_print("    Line %ld column %ld - "
  2463 		      "No punctuation at para end?\n",
  2464 		      linecnt-1,g_utf8_strlen(prevline,-1));
  2465 		else
  2466 		    cnt_punct++;
  2467 		break;
  2468 	    }
  2469 	    if (g_utf8_strchr("-.:!([{?}])",-1,g_utf8_get_char(s)))
  2470 		break;
  2471 	}
  2472     }
  2473 }
  2474 
  2475 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
  2476 {
  2477     const char *word=key;
  2478     int *dupcnt=value;
  2479     if (*dupcnt)
  2480 	g_print("\nNote: Queried word %s was duplicated %d times\n",
  2481 	  word,*dupcnt);
  2482     return FALSE;
  2483 }
  2484 
  2485 void print_as_windows_1252(const char *string)
  2486 {
  2487     gsize inbytes,outbytes;
  2488     gchar *buf,*bp;
  2489     static GIConv converter=(GIConv)-1;
  2490     if (!string)
  2491     {
  2492 	if (converter!=(GIConv)-1)
  2493 	    g_iconv_close(converter);
  2494 	converter=(GIConv)-1;
  2495 	return;
  2496     }
  2497     if (converter==(GIConv)-1)
  2498 	converter=g_iconv_open("WINDOWS-1252","UTF-8");
  2499     if (converter!=(GIConv)-1)
  2500     {
  2501 	inbytes=outbytes=strlen(string);
  2502 	bp=buf=g_malloc(outbytes+1);
  2503 	g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
  2504 	*bp='\0';
  2505 	fputs(buf,stdout);
  2506 	g_free(buf);
  2507     }
  2508     else
  2509 	fputs(string,stdout);
  2510 }
  2511 
  2512 void print_as_utf_8(const char *string)
  2513 {
  2514     fputs(string,stdout);
  2515 }
  2516 
  2517 /*
  2518  * procfile:
  2519  *
  2520  * Process one file.
  2521  */
  2522 void procfile(const char *filename)
  2523 {
  2524     const char *s;
  2525     gchar *parastart=NULL;	/* first line of current para */
  2526     gchar *etext,*aline;
  2527     gchar *etext_ptr;
  2528     GError *err=NULL;
  2529     struct first_pass_results *first_pass_results;
  2530     struct warnings *warnings;
  2531     struct counters counters={0};
  2532     struct line_properties last={0};
  2533     struct parities parities={0};
  2534     struct pending pending={0};
  2535     gboolean isemptyline;
  2536     long start_para_line=0;
  2537     gboolean isnewpara=FALSE,enddash=FALSE;
  2538     last.start=CHAR_SPACE;
  2539     linecnt=checked_linecnt=0;
  2540     etext=read_etext(filename,&err);
  2541     if (!etext)
  2542     {
  2543 	if (pswit[STDOUT_SWITCH])
  2544 	    fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
  2545 	else
  2546 	    fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
  2547 	exit(1);
  2548     }
  2549     g_print("\n\nFile: %s\n\n",filename);
  2550     first_pass_results=first_pass(etext);
  2551     warnings=report_first_pass(first_pass_results);
  2552     qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
  2553     qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
  2554     /*
  2555      * Here we go with the main pass. Hold onto yer hat!
  2556      */
  2557     linecnt=0;
  2558     etext_ptr=etext;
  2559     while ((aline=flgets(&etext_ptr,linecnt+1)))
  2560     {
  2561 	linecnt++;
  2562 	if (linecnt==1)
  2563 	    isnewpara=TRUE;
  2564 	if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
  2565 	    continue;    // skip DP page separators completely
  2566 	if (linecnt<first_pass_results->firstline ||
  2567 	  (first_pass_results->footerline>0 &&
  2568 	  linecnt>first_pass_results->footerline))
  2569 	{
  2570 	    if (pswit[HEADER_SWITCH])
  2571 	    {
  2572 		if (g_str_has_prefix(aline,"Title:"))
  2573 		    g_print("    %s\n",aline);
  2574 		if (g_str_has_prefix(aline,"Author:"))
  2575 		    g_print("    %s\n",aline);
  2576 		if (g_str_has_prefix(aline,"Release Date:"))
  2577 		    g_print("    %s\n",aline);
  2578 		if (g_str_has_prefix(aline,"Edition:"))
  2579 		    g_print("    %s\n\n",aline);
  2580 	    }
  2581 	    continue;		/* skip through the header */
  2582 	}
  2583 	checked_linecnt++;
  2584 	print_pending(aline,parastart,&pending);
  2585 	isemptyline=analyse_quotes(aline,linecnt,&counters);
  2586 	if (isnewpara && !isemptyline)
  2587 	{
  2588 	    /* This line is the start of a new paragraph. */
  2589 	    start_para_line=linecnt;
  2590 	    /* Capture its first line in case we want to report it later. */
  2591 	    g_free(parastart);
  2592 	    parastart=g_strdup(aline);
  2593 	    memset(&parities,0,sizeof(parities));  /* restart the quote count */
  2594 	    s=aline;
  2595 	    while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
  2596 	      !g_unichar_isdigit(g_utf8_get_char(s)))
  2597 		s=g_utf8_next_char(s);
  2598 	    if (g_unichar_islower(g_utf8_get_char(s)))
  2599 	    {
  2600 		/* and its first letter is lowercase */
  2601 		if (pswit[ECHO_SWITCH])
  2602 		    g_print("\n%s\n",aline);
  2603 		if (!pswit[OVERVIEW_SWITCH])
  2604 		    g_print("    Line %ld column %ld - "
  2605 		      "Paragraph starts with lower-case\n",
  2606 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2607 		else
  2608 		    cnt_punct++;
  2609 	    }
  2610 	    isnewpara=FALSE; /* Signal the end of new para processing. */
  2611 	}
  2612 	/* Check for an em-dash broken at line end. */
  2613 	if (enddash && g_utf8_get_char(aline)=='-')
  2614 	{
  2615 	    if (pswit[ECHO_SWITCH])
  2616 		g_print("\n%s\n",aline);
  2617 	    if (!pswit[OVERVIEW_SWITCH])
  2618 		g_print("    Line %ld column 1 - Broken em-dash?\n",linecnt);
  2619 	    else
  2620 		cnt_punct++;
  2621 	}
  2622 	enddash=FALSE;
  2623 	for (s=g_utf8_prev_char(aline+strlen(aline));
  2624 	  g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
  2625 	    ;
  2626 	if (s>=aline && g_utf8_get_char(s)=='-')
  2627 	    enddash=TRUE;
  2628 	check_for_control_characters(aline);
  2629 	if (warnings->bin)
  2630 	    check_for_odd_characters(aline,warnings,isemptyline);
  2631 	if (warnings->longline)
  2632 	    check_for_long_line(aline);
  2633 	if (warnings->shortline)
  2634 	    check_for_short_line(aline,&last);
  2635 	last.blen=last.len;
  2636 	last.len=g_utf8_strlen(aline,-1);
  2637 	last.start=g_utf8_get_char(aline);
  2638 	check_for_starting_punctuation(aline);
  2639 	if (warnings->dash)
  2640 	{
  2641 	    check_for_spaced_emdash(aline);
  2642 	    check_for_spaced_dash(aline);
  2643 	}
  2644 	check_for_unmarked_paragraphs(aline);
  2645 	check_for_jeebies(aline);
  2646 	check_for_mta_from(aline);
  2647 	check_for_orphan_character(aline);
  2648 	check_for_pling_scanno(aline);
  2649 	check_for_extra_period(aline,warnings);
  2650 	check_for_following_punctuation(aline);
  2651 	check_for_typos(aline,warnings);
  2652 	check_for_misspaced_punctuation(aline,&parities,isemptyline);
  2653 	check_for_double_punctuation(aline,warnings);
  2654 	check_for_spaced_quotes(aline);
  2655 	check_for_miscased_genative(aline);
  2656 	check_end_of_line(aline,warnings);
  2657 	check_for_unspaced_bracket(aline);
  2658 	if (warnings->endquote)
  2659 	    check_for_unpunctuated_endquote(aline);
  2660 	check_for_html_tag(aline);
  2661 	check_for_html_entity(aline);
  2662 	if (isemptyline)
  2663 	{
  2664 	    check_for_mismatched_quotes(&counters,&pending);
  2665 	    counters_reset(&counters);
  2666 	    /* let the next iteration know that it's starting a new para */
  2667 	    isnewpara=TRUE;
  2668 	    if (prevline)
  2669 		check_for_omitted_punctuation(prevline,&last,start_para_line);
  2670 	}
  2671 	g_free(prevline);
  2672 	prevline=g_strdup(aline);
  2673     }
  2674     linecnt++;
  2675     check_for_mismatched_quotes(&counters,&pending);
  2676     print_pending(NULL,parastart,&pending);
  2677     reset_pending(&pending);
  2678     if (prevline)
  2679     {
  2680 	g_free(prevline);
  2681 	prevline=NULL;
  2682     }
  2683     g_free(parastart);
  2684     g_free(prevline);
  2685     g_free(etext);
  2686     if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])
  2687 	g_tree_foreach(qword,report_duplicate_queries,NULL);
  2688     g_tree_unref(qword);
  2689     g_tree_unref(qperiod);
  2690     counters_destroy(&counters);
  2691     g_set_print_handler(NULL);
  2692     print_as_windows_1252(NULL);
  2693     if (pswit[MARKUP_SWITCH])  
  2694 	loseentities(NULL);
  2695 }
  2696 
  2697 /*
  2698  * flgets:
  2699  *
  2700  * Get one line from the input text, checking for
  2701  * the existence of exactly one CR/LF line-end per line.
  2702  *
  2703  * Returns: a pointer to the line.
  2704  */
  2705 char *flgets(char **etext,long lcnt)
  2706 {
  2707     gunichar c;
  2708     gboolean isCR=FALSE;
  2709     char *theline=*etext;
  2710     char *eos=theline;
  2711     gchar *s;
  2712     for (;;)
  2713     {
  2714 	c=g_utf8_get_char(*etext);
  2715 	*etext=g_utf8_next_char(*etext);
  2716 	if (!c)
  2717 	    return NULL;
  2718 	/* either way, it's end of line */
  2719 	if (c=='\n')
  2720 	{
  2721 	    if (isCR)
  2722 		break;
  2723 	    else
  2724 	    {
  2725 		/* Error - a LF without a preceding CR */
  2726 		if (pswit[LINE_END_SWITCH])
  2727 		{
  2728 		    if (pswit[ECHO_SWITCH])
  2729 		    {
  2730 			s=g_strndup(theline,eos-theline);
  2731 			g_print("\n%s\n",s);
  2732 			g_free(s);
  2733 		    }
  2734 		    if (!pswit[OVERVIEW_SWITCH])
  2735 			g_print("    Line %ld - No CR?\n",lcnt);
  2736 		    else
  2737 			cnt_lineend++;
  2738 		}
  2739 		break;
  2740 	    }
  2741 	}
  2742 	if (c=='\r')
  2743 	{
  2744 	    if (isCR)
  2745 	    {
  2746 		/* Error - two successive CRs */
  2747 		if (pswit[LINE_END_SWITCH])
  2748 		{
  2749 		    if (pswit[ECHO_SWITCH])
  2750 		    {
  2751 			s=g_strndup(theline,eos-theline);
  2752 			g_print("\n%s\n",s);
  2753 			g_free(s);
  2754 		    }
  2755 		    if (!pswit[OVERVIEW_SWITCH])
  2756 			g_print("    Line %ld - Two successive CRs?\n",lcnt);
  2757 		    else
  2758 			cnt_lineend++;
  2759 		}
  2760 	    }
  2761 	    isCR=TRUE;
  2762 	}
  2763 	else
  2764 	{
  2765 	    if (pswit[LINE_END_SWITCH] && isCR)
  2766 	    {
  2767 		if (pswit[ECHO_SWITCH])
  2768 		{
  2769 		    s=g_strndup(theline,eos-theline);
  2770 		    g_print("\n%s\n",s);
  2771 		    g_free(s);
  2772 		}
  2773 		if (!pswit[OVERVIEW_SWITCH])
  2774 		    g_print("    Line %ld column %ld - CR without LF?\n",
  2775 		      lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
  2776 		else
  2777 		    cnt_lineend++;
  2778 		*eos=' ';
  2779 	    }
  2780 	    isCR=FALSE;
  2781 	    eos=g_utf8_next_char(eos);
  2782 	}
  2783     }
  2784     *eos='\0';
  2785     if (pswit[MARKUP_SWITCH])  
  2786 	postprocess_for_HTML(theline);
  2787     if (pswit[DP_SWITCH])  
  2788 	postprocess_for_DP(theline);
  2789     return theline;
  2790 }
  2791 
  2792 /*
  2793  * mixdigit:
  2794  *
  2795  * Takes a "word" as a parameter, and checks whether it
  2796  * contains a mixture of alpha and digits. Generally, this is an
  2797  * error, but may not be for cases like 4th or L5 12s. 3d.
  2798  *
  2799  * Returns: TRUE iff an is error found.
  2800  */
  2801 gboolean mixdigit(const char *checkword)
  2802 {
  2803     gboolean wehaveadigit,wehavealetter,query;
  2804     const char *s,*nondigit;
  2805     wehaveadigit=wehavealetter=query=FALSE;
  2806     for (s=checkword;*s;s=g_utf8_next_char(s))
  2807 	if (g_unichar_isalpha(g_utf8_get_char(s)))
  2808 	    wehavealetter=TRUE;
  2809 	else if (g_unichar_isdigit(g_utf8_get_char(s)))
  2810 	    wehaveadigit=TRUE;
  2811     if (wehaveadigit && wehavealetter)
  2812     {
  2813 	/* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
  2814 	query=TRUE;
  2815 	for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
  2816 	  nondigit=g_utf8_next_char(nondigit))
  2817 	    ;
  2818 	/* digits, ending in st, rd, nd, th of either case */
  2819 	if (!g_ascii_strcasecmp(nondigit,"st") ||
  2820 	  !g_ascii_strcasecmp(nondigit,"rd") ||
  2821 	  !g_ascii_strcasecmp(nondigit,"nd") ||
  2822 	  !g_ascii_strcasecmp(nondigit,"th"))
  2823 	    query=FALSE;
  2824 	if (!g_ascii_strcasecmp(nondigit,"sts") ||
  2825 	  !g_ascii_strcasecmp(nondigit,"rds") ||
  2826 	  !g_ascii_strcasecmp(nondigit,"nds") ||
  2827 	  !g_ascii_strcasecmp(nondigit,"ths"))
  2828 	    query=FALSE;
  2829 	if (!g_ascii_strcasecmp(nondigit,"stly") ||
  2830 	  !g_ascii_strcasecmp(nondigit,"rdly") ||
  2831 	  !g_ascii_strcasecmp(nondigit,"ndly") ||
  2832 	  !g_ascii_strcasecmp(nondigit,"thly"))
  2833 	    query=FALSE;
  2834 	/* digits, ending in l, L, s or d */
  2835 	if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
  2836 	  !strcmp(nondigit,"d"))
  2837 	    query=FALSE;
  2838 	/*
  2839 	 * L at the start of a number, representing Britsh pounds, like L500.
  2840 	 * This is cute. We know the current word is mixed digit. If the first
  2841 	 * letter is L, there must be at least one digit following. If both
  2842 	 * digits and letters follow, we have a genuine error, else we have a
  2843 	 * capital L followed by digits, and we accept that as a non-error.
  2844 	 */
  2845 	if (g_utf8_get_char(checkword)=='L' &&
  2846 	  !mixdigit(g_utf8_next_char(checkword)))
  2847 	    query=FALSE;
  2848     }
  2849     return query;
  2850 }
  2851 
  2852 /*
  2853  * getaword:
  2854  *
  2855  * Extracts the first/next "word" from the line, and returns it.
  2856  * A word is defined as one English word unit--or at least that's the aim.
  2857  * "ptr" is advanced to the position in the line where we will start
  2858  * looking for the next word.
  2859  *
  2860  * Returns: A newly-allocated string.
  2861  */
  2862 gchar *getaword(const char **ptr)
  2863 {
  2864     const char *s,*t;
  2865     GString *word;
  2866     gunichar c,pc;
  2867     word=g_string_new(NULL);
  2868     for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
  2869       !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
  2870       **ptr;*ptr=g_utf8_next_char(*ptr))
  2871 	;
  2872     /*
  2873      * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
  2874      * Especially yucky is the case of L1,000
  2875      * This section looks for a pattern of characters including a digit
  2876      * followed by a comma or period followed by one or more digits.
  2877      * If found, it returns this whole pattern as a word; otherwise we discard
  2878      * the results and resume our normal programming.
  2879      */
  2880     s=*ptr;
  2881     for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
  2882       g_unichar_isalpha(g_utf8_get_char(s)) ||
  2883       g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
  2884 	g_string_append_unichar(word,g_utf8_get_char(s));
  2885     if (word->len)
  2886     {
  2887 	for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))
  2888 	{
  2889 	    c=g_utf8_get_char(t);
  2890 	    pc=g_utf8_get_char(g_utf8_prev_char(t));
  2891 	    if ((c=='.' || c==',') && g_unichar_isdigit(pc))
  2892 	    {
  2893 		*ptr=s;
  2894 		return g_string_free(word,FALSE);
  2895 	    }
  2896 	}
  2897     }
  2898     /* we didn't find a punctuated number - do the regular getword thing */
  2899     g_string_truncate(word,0);
  2900     c=g_utf8_get_char(*ptr);
  2901     for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);
  2902       *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))
  2903 	g_string_append_unichar(word,c);
  2904     return g_string_free(word,FALSE);
  2905 }
  2906 
  2907 /*
  2908  * isroman:
  2909  *
  2910  * Is this word a Roman Numeral?
  2911  *
  2912  * It doesn't actually validate that the number is a valid Roman Numeral--for
  2913  * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
  2914  * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
  2915  * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
  2916  * expressions thereof, except when it came to taxes. Allow any number of M,
  2917  * an optional D, an optional CM or CD, any number of optional Cs, an optional
  2918  * XL or an optional XC, an optional IX or IV, an optional V and any number
  2919  * of optional Is.
  2920  */
  2921 gboolean isroman(const char *t)
  2922 {
  2923     const char *s;
  2924     if (!t || !*t)
  2925 	return FALSE;
  2926     s=t;
  2927     while (g_utf8_get_char(t)=='m' && *t)
  2928 	t++;
  2929     if (g_utf8_get_char(t)=='d')
  2930 	t++;
  2931     if (g_str_has_prefix(t,"cm"))
  2932 	t+=2;
  2933     if (g_str_has_prefix(t,"cd"))
  2934 	t+=2;
  2935     while (g_utf8_get_char(t)=='c' && *t)
  2936 	t++;
  2937     if (g_str_has_prefix(t,"xl"))
  2938 	t+=2;
  2939     if (g_str_has_prefix(t,"xc"))
  2940 	t+=2;
  2941     if (g_utf8_get_char(t)=='l')
  2942 	t++;
  2943     while (g_utf8_get_char(t)=='x' && *t)
  2944 	t++;
  2945     if (g_str_has_prefix(t,"ix"))
  2946 	t+=2;
  2947     if (g_str_has_prefix(t,"iv"))
  2948 	t+=2;
  2949     if (g_utf8_get_char(t)=='v')
  2950 	t++;
  2951     while (g_utf8_get_char(t)=='i' && *t)
  2952 	t++;
  2953     return !*t;
  2954 }
  2955 
  2956 /*
  2957  * postprocess_for_DP:
  2958  *
  2959  * Invoked with the -d switch from flgets().
  2960  * It simply "removes" from the line a hard-coded set of common
  2961  * DP-specific tags, so that the line passed to the main routine has
  2962  * been pre-cleaned of DP markup.
  2963  */
  2964 void postprocess_for_DP(char *theline)
  2965 {
  2966     char *s,*t;
  2967     int i;
  2968     if (!*theline) 
  2969 	return;
  2970     for (i=0;*DPmarkup[i];i++)
  2971 	while ((s=strstr(theline,DPmarkup[i])))
  2972 	{
  2973 	    t=s+strlen(DPmarkup[i]);
  2974 	    memmove(s,t,strlen(t)+1);
  2975 	}
  2976 }
  2977 
  2978 /*
  2979  * postprocess_for_HTML:
  2980  *
  2981  * Invoked with the -m switch from flgets().
  2982  * It simply "removes" from the line a hard-coded set of common
  2983  * HTML tags and "replaces" a hard-coded set of common HTML
  2984  * entities, so that the line passed to the main routine has
  2985  * been pre-cleaned of HTML.
  2986  */
  2987 void postprocess_for_HTML(char *theline)
  2988 {
  2989     while (losemarkup(theline))
  2990 	;
  2991     loseentities(theline);
  2992 }
  2993 
  2994 char *losemarkup(char *theline)
  2995 {
  2996     char *s,*t;
  2997     int i;
  2998     s=strchr(theline,'<');
  2999     t=s?strchr(s,'>'):NULL;
  3000     if (!s || !t)
  3001 	return NULL;
  3002     for (i=0;*markup[i];i++)
  3003 	if (tagcomp(g_utf8_next_char(s),markup[i]))
  3004 	{
  3005 	    t=g_utf8_next_char(t);
  3006 	    memmove(s,t,strlen(t)+1);
  3007 	    return s;
  3008 	}
  3009     /* It's an unrecognized <xxx>. */
  3010     return NULL;
  3011 }
  3012 
  3013 void loseentities(char *theline)
  3014 {
  3015     int i;
  3016     gsize nb;
  3017     char *amp,*scolon;
  3018     gchar *s,*t;
  3019     gunichar c;
  3020     GTree *entities=NULL;
  3021     static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
  3022     if (!theline)
  3023     {
  3024 	if (entities)
  3025 	    g_tree_destroy(entities);
  3026 	entities=NULL;
  3027 	if (translit!=(GIConv)-1)
  3028 	    g_iconv_close(translit);
  3029 	translit=(GIConv)-1;
  3030 	if (to_utf8!=(GIConv)-1)
  3031 	    g_iconv_close(to_utf8);
  3032 	to_utf8=(GIConv)-1;
  3033 	return;
  3034     }
  3035     if (!*theline)
  3036 	return;
  3037     if (!entities)
  3038     {
  3039 	entities=g_tree_new((GCompareFunc)strcmp);
  3040 	for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
  3041 	    g_tree_insert(entities,HTMLentities[i].name,
  3042 	      GUINT_TO_POINTER(HTMLentities[i].c));
  3043     }
  3044     if (translit==(GIConv)-1)
  3045 	translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
  3046     if (to_utf8==(GIConv)-1)
  3047 	to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
  3048     while((amp=strchr(theline,'&')))
  3049     {
  3050 	scolon=strchr(amp,';');
  3051 	if (scolon)
  3052 	{
  3053 	    if (amp[1]=='#')
  3054 	    {
  3055 		if (amp+2+strspn(amp+2,"0123456789")==scolon)
  3056 		    c=strtol(amp+2,NULL,10);
  3057 		else if (amp[2]=='x' &&
  3058 		  amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
  3059 		    c=strtol(amp+3,NULL,16);
  3060 	    }
  3061 	    else
  3062 	    {
  3063 		s=g_strndup(amp+1,scolon-(amp+1));
  3064 	        c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
  3065 		g_free(s);
  3066 	    }
  3067 	}
  3068 	else
  3069 	    c=0;
  3070 	if (c)
  3071 	{
  3072 	    theline=amp;
  3073 	    if (c<128 || c>=192 && c<=255)	/* An ISO-8859-1 character */
  3074 		theline+=g_unichar_to_utf8(c,theline);
  3075 	    else
  3076 	    {
  3077 		s=g_malloc(6);
  3078 		nb=g_unichar_to_utf8(c,s);
  3079 		t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
  3080 		g_free(s);
  3081 		s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
  3082 		g_free(t);
  3083 		memcpy(theline,s,nb);
  3084 		g_free(s);
  3085 		theline+=nb;
  3086 	    }
  3087 	    memmove(theline,g_utf8_next_char(scolon),
  3088 	      strlen(g_utf8_next_char(scolon))+1);
  3089 	}
  3090 	else
  3091 	    theline=g_utf8_next_char(amp);
  3092     }
  3093 }
  3094 
  3095 gboolean tagcomp(const char *strin,const char *basetag)
  3096 {
  3097     gboolean retval;
  3098     gchar *s,*t;
  3099     if (g_utf8_get_char(strin)=='/')
  3100 	t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
  3101     else
  3102 	t=g_utf8_casefold(strin,-1);
  3103     s=g_utf8_casefold(basetag,-1);
  3104     retval=g_str_has_prefix(t,s);
  3105     g_free(s);
  3106     g_free(t);
  3107     return retval;
  3108 }
  3109 
  3110 void proghelp(GOptionContext *context)
  3111 {
  3112     gchar *help;
  3113     fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
  3114     fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
  3115     fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
  3116     fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
  3117       "For details, read the file COPYING.\n",stderr);
  3118     fputs("This is Free Software; "
  3119       "you may redistribute it under certain conditions (GPL);\n",stderr);
  3120     fputs("read the file COPYING for details.\n\n",stderr);
  3121     help=g_option_context_get_help(context,TRUE,NULL);
  3122     fputs(help,stderr);
  3123     g_free(help);
  3124     fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
  3125     fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
  3126       "non-ASCII\n",stderr);
  3127     fputs("characters like accented letters, "
  3128       "lines longer than 75 or shorter than 55,\n",stderr);
  3129     fputs("unbalanced quotes or brackets, "
  3130       "a variety of badly formatted punctuation, \n",stderr);
  3131     fputs("HTML tags, some likely typos. "
  3132       "It is NOT a substitute for human judgement.\n",stderr);
  3133     fputs("\n",stderr);
  3134 }