bookloupe/bookloupe.c
author ali <ali@juiblex.co.uk>
Sat Sep 21 18:28:40 2013 +0100 (2013-09-21)
changeset 94 3f655b1b0d93
parent 86 c42c068d2996
permissions -rw-r--r--
Fix bug #6: BL treats a slanted apostrophe ’ as a word separator, not as a contraction or possessive
     1 /*************************************************************************/
     2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
     3 /*									 */
     4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>			 */
     5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>			 */
     6 /*									 */
     7 /* This program is free software; you can redistribute it and/or modify  */
     8 /* it under the terms of the GNU General Public License as published by  */
     9 /* the Free Software Foundation; either version 2 of the License, or     */
    10 /* (at your option) any later version.					 */
    11 /*									 */
    12 /* This program is distributed in the hope that it will be useful,       */
    13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of	 */
    14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the		 */
    15 /* GNU General Public License for more details.				 */
    16 /*									 */
    17 /* You should have received a copy of the GNU General Public License	 */
    18 /* along with this program. If not, see <http://www.gnu.org/licenses/>.	 */
    19 /*************************************************************************/
    20 
    21 #include <stdio.h>
    22 #include <stdlib.h>
    23 #include <string.h>
    24 #include <ctype.h>
    25 #ifdef __WIN32__
    26 #include <windows.h>
    27 #endif
    28 #include <glib.h>
    29 #include <bl/bl.h>
    30 #include "bookloupe.h"
    31 #include "counters.h"
    32 #include "HTMLentities.h"
    33 
    34 gchar *prevline;
    35 
/*
 * Common typos.
 *
 * All entries are lower case.  The list is terminated by an empty
 * string, so code walking it must stop at the first "" entry.
 * NOTE(review): the lookup code is not in this part of the file;
 * presumably words are lower-cased before being compared - confirm.
 */
char *typo[] = {
    "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
    "nad", "te", "ig", "acn",  "ahve", "alot", "anbd", "andt", "awya", "aywa",
    "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
    "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
    "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
    "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
    "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
    "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
    "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
    "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
    "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
    "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
    "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
    "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
    "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
    "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
    "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
    "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
    "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
    "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
    "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
    "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
    "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
    "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
    "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
    "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
    "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
    "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
    "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
    "se", ""
};
    69 
    70 GTree *usertypo;
    71 
/*
 * Common abbreviations and other OK words not to query as typos.
 *
 * Lower-case entries; the list is terminated by an empty string.
 */
char *okword[] = {
    "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
    "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
    "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
    "outbid", "outbids", "frostbite", "frostbitten", ""
};
    79 
/*
 * Common abbreviations that cause otherwise unexplained periods.
 *
 * Entries are stored without their trailing period, in lower case;
 * the list is terminated by an empty string.
 */
char *abbrev[] = {
    "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
    "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
};
    85 
/*
 * Two-Letter combinations that rarely if ever start words,
 * but are common scannos or otherwise common letter combinations.
 *
 * Lower-case entries; the list is terminated by an empty string.
 */
char *nostart[] = {
    "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
};
    93 
/*
 * Two-Letter combinations that rarely if ever end words,
 * but are common scannos or otherwise common letter combinations.
 *
 * Lower-case entries; the list is terminated by an empty string.
 */
char *noend[] = {
    "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
    "sw", "gr", "sl", "cl", "iy", ""
};
   102 
/*
 * Tag names (without the surrounding angle brackets), terminated by an
 * empty string.
 * NOTE(review): these look like the HTML element names recognised when
 * handling markup in < > - confirm against the code that reads the table.
 */
char *markup[] = {
    "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
    "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
    "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
    "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
};
   109 
/*
 * Literal markup tokens, stored complete with their delimiters and
 * terminated by an empty string.
 * NOTE(review): presumably the DP-specific markup that the --dp switch
 * asks to have ignored - confirm against postprocess_for_DP().
 */
char *DPmarkup[] = {
    "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
};
   113 
/*
 * Lower-case word list, terminated by an empty string.
 * NOTE(review): judging by the name, these are words that should not be
 * directly followed by a comma - confirm against the code that uses it.
 * "mrs" is listed twice; whether that matters depends on how the table
 * is consumed.
 */
char *nocomma[] = {
    "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
    "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
    "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
    "during", "let", "toward", "among", ""
};
   120 
/*
 * Lower-case word list, terminated by an empty string.
 * NOTE(review): judging by the name, these are words that should not be
 * directly followed by a period - confirm against the code that uses it.
 */
char *noperiod[] = {
    "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
    "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
    "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
    "among", "those", "into", "whom", "having", "thence", ""
};
   127 
gboolean pswit[SWITNO];  /* program switches */

/*
 * Command line options.
 *
 * Every option is a plain flag (G_OPTION_ARG_NONE) whose value is stored
 * directly into the matching slot of pswit[].  Note that parse_options()
 * post-processes several of these after parsing: --relaxed, --line-end
 * and --noecho switch their feature OFF, so their pswit[] slots are
 * inverted there, and --web overrides most of the other switches.
 * The table is terminated by an all-NULL entry.
 */
static GOptionEntry options[]={
    { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
      "Ignore DP-specific markup", NULL },
    { "noecho", 'e', 0, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
      "Don't echo queried line", NULL },
    { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
      "Check single quotes", NULL },
    { "typo", 't', 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
      "Check common typos", NULL },
    { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
      "Require closure of quotes on every paragraph", NULL },
    { "relaxed", 'x', 0, G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
      "Disable paranoid querying of everything", NULL },
    { "line-end", 'l', 0, G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
      "Disable line end checking", NULL },
    { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
      "Overview: just show counts", NULL },
    { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
      "Output errors to stdout instead of stderr", NULL },
    { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
      "Echo header fields", NULL },
    { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
      "Ignore markup in < >", NULL },
    { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
      "Use file of user-defined typos", NULL },
    { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
      "Defaults for use on www upload", NULL },
    { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
      "Verbose - list everything", NULL },
    { NULL }
};
   161 
   162 long cnt_dquot;		/* for overview mode, count of doublequote queries */
   163 long cnt_squot;		/* for overview mode, count of singlequote queries */
   164 long cnt_brack;		/* for overview mode, count of brackets queries */
   165 long cnt_bin;		/* for overview mode, count of non-ASCII queries */
   166 long cnt_odd;		/* for overview mode, count of odd character queries */
   167 long cnt_long;		/* for overview mode, count of long line errors */
   168 long cnt_short;		/* for overview mode, count of short line queries */
   169 long cnt_punct;		/* for overview mode,
   170 			   count of punctuation and spacing queries */
   171 long cnt_dash;		/* for overview mode, count of dash-related queries */
   172 long cnt_word;		/* for overview mode, count of word queries */
   173 long cnt_html;		/* for overview mode, count of html queries */
   174 long cnt_lineend;	/* for overview mode, count of line-end queries */
   175 long cnt_spacend;	/* count of lines with space at end */
   176 long linecnt;		/* count of total lines in the file */
   177 long checked_linecnt;	/* count of lines actually checked */
   178 
   179 void proghelp(GOptionContext *context);
   180 void procfile(const char *);
   181 
   182 gchar *running_from;
   183 
   184 gboolean mixdigit(const char *);
   185 gchar *getaword(const char **);
   186 char *flgets(char **,long);
   187 void postprocess_for_HTML(char *);
   188 char *linehasmarkup(char *);
   189 char *losemarkup(char *);
   190 gboolean tagcomp(const char *,const char *);
   191 void loseentities(char *);
   192 gboolean isroman(const char *);
   193 void postprocess_for_DP(char *);
   194 void print_as_windows_1252(const char *string);
   195 void print_as_utf_8(const char *string);
   196 
   197 GTree *qword,*qperiod;
   198 
   199 #ifdef __WIN32__
   200 UINT saved_cp;
   201 #endif
   202 
   203 void parse_options(int *argc,char ***argv)
   204 {
   205     GError *err=NULL;
   206     GOptionContext *context;
   207     context=g_option_context_new(
   208       "file - looks for errors in Project Gutenberg(TM) etexts");
   209     g_option_context_add_main_entries(context,options,NULL);
   210     if (!g_option_context_parse(context,argc,argv,&err))
   211     {
   212 	g_printerr("Bookloupe: %s\n",err->message);
   213 	g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
   214 	exit(1);
   215     }
   216     /* Paranoid checking is turned OFF, not on, by its switch */
   217     pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
   218     if (pswit[PARANOID_SWITCH])
   219 	/* if running in paranoid mode, typo checks default to enabled */
   220 	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
   221     /* Line-end checking is turned OFF, not on, by its switch */
   222     pswit[LINE_END_SWITCH]=!pswit[LINE_END_SWITCH];
   223     /* Echoing is turned OFF, not on, by its switch */
   224     pswit[ECHO_SWITCH]=!pswit[ECHO_SWITCH];
   225     if (pswit[OVERVIEW_SWITCH])
   226 	/* just print summary; don't echo */
   227 	pswit[ECHO_SWITCH]=FALSE;
   228     /*
   229      * Web uploads - for the moment, this is really just a placeholder
   230      * until we decide what processing we really want to do on web uploads
   231      */
   232     if (pswit[WEB_SWITCH])
   233     {
   234 	/* specific override for web uploads */
   235 	pswit[ECHO_SWITCH]=TRUE;
   236 	pswit[SQUOTE_SWITCH]=FALSE;
   237 	pswit[TYPO_SWITCH]=TRUE;
   238 	pswit[QPARA_SWITCH]=FALSE;
   239 	pswit[PARANOID_SWITCH]=TRUE;
   240 	pswit[LINE_END_SWITCH]=FALSE;
   241 	pswit[OVERVIEW_SWITCH]=FALSE;
   242 	pswit[STDOUT_SWITCH]=FALSE;
   243 	pswit[HEADER_SWITCH]=TRUE;
   244 	pswit[VERBOSE_SWITCH]=FALSE;
   245 	pswit[MARKUP_SWITCH]=FALSE;
   246 	pswit[USERTYPO_SWITCH]=FALSE;
   247 	pswit[DP_SWITCH]=FALSE;
   248     }
   249     if (*argc<2)
   250     {
   251 	proghelp(context);
   252 	exit(1);
   253     }
   254     g_option_context_free(context);
   255 }
   256 
   257 /*
   258  * read_user_scannos:
   259  *
   260  * Read in the user-defined stealth scanno list.
   261  */
   262 void read_user_scannos(void)
   263 {
   264     GError *err=NULL;
   265     gchar *usertypo_file;
   266     gboolean okay;
   267     int i;
   268     gsize len,nb;
   269     gchar *contents,*utf8,**lines;
   270     usertypo_file=g_strdup("bookloupe.typ");
   271     okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   272     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   273     {
   274 	g_clear_error(&err);
   275 	g_free(usertypo_file);
   276 	usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
   277 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   278     }
   279     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   280     {
   281 	g_clear_error(&err);
   282 	g_free(usertypo_file);
   283 	usertypo_file=g_strdup("gutcheck.typ");
   284 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   285     }
   286     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   287     {
   288 	g_clear_error(&err);
   289 	g_free(usertypo_file);
   290 	usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
   291 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   292     }
   293     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   294     {
   295 	g_free(usertypo_file);
   296 	g_print("   --> I couldn't find bookloupe.typ "
   297 	  "-- proceeding without user typos.\n");
   298 	return;
   299     }
   300     else if (!okay)
   301     {
   302 	fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
   303 	g_free(usertypo_file);
   304 	g_clear_error(&err);
   305 	exit(1);
   306     }
   307     if (g_utf8_validate(contents,len,NULL))
   308 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
   309     else
   310 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
   311     g_free(contents);
   312     lines=g_strsplit_set(utf8,"\r\n",0);
   313     g_free(utf8);
   314     usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
   315     for (i=0;lines[i];i++)
   316 	if (*(unsigned char *)lines[i]>'!')
   317 	    g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
   318 	else
   319 	    g_free(lines[i]);
   320     g_free(lines);
   321 }
   322 
   323 /*
   324  * read_etext:
   325  *
   326  * Read an etext returning a newly allocated string containing the file
   327  * contents or NULL on error.
   328  */
   329 gchar *read_etext(const char *filename,GError **err)
   330 {
   331     GError *tmp_err=NULL;
   332     gchar *contents,*utf8;
   333     gsize len,bytes_read,bytes_written;
   334     int i,line,col;
   335     if (!g_file_get_contents(filename,&contents,&len,err))
   336 	return NULL;
   337     if (g_utf8_validate(contents,len,NULL))
   338     {
   339 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
   340 	g_set_print_handler(print_as_utf_8);
   341 #ifdef __WIN32__
   342 	SetConsoleOutputCP(CP_UTF8);
   343 #endif
   344     }
   345     else
   346     {
   347 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,
   348 	  &bytes_written,&tmp_err);
   349 	if (g_error_matches(tmp_err,G_CONVERT_ERROR,
   350 	  G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
   351 	{
   352 	    line=col=1;
   353 	    for(i=0;i<bytes_read;i++)
   354 		if (contents[i]=='\n')
   355 		{
   356 		    line++;
   357 		    col=1;
   358 		}
   359 		else if (contents[i]!='\r')
   360 		    col++;
   361 	    g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
   362 	      "Input conversion failed. Byte %d at line %d, column %d is not a "
   363 	      "valid Windows-1252 character",
   364 	      ((unsigned char *)contents)[bytes_read],line,col);
   365 	}
   366 	else if (tmp_err)
   367 	    g_propagate_error(err,tmp_err);
   368 	g_set_print_handler(print_as_windows_1252);
   369 #ifdef __WIN32__
   370 	SetConsoleOutputCP(1252);
   371 #endif
   372     }
   373     g_free(contents);
   374     return utf8;
   375 }
   376 
/*
 * cleanup_on_exit:
 *
 * Exit handler, registered with atexit() by main().  On Windows it puts
 * the console output code page back to saved_cp, the value main() read
 * at start-up; on other platforms it does nothing.
 */
void cleanup_on_exit(void)
{
#ifdef __WIN32__
    SetConsoleOutputCP(saved_cp);
#endif
}
   383 
   384 int main(int argc,char **argv)
   385 {
   386 #ifdef __WIN32__
   387     atexit(cleanup_on_exit);
   388     saved_cp=GetConsoleOutputCP();
   389 #endif
   390     running_from=g_path_get_dirname(argv[0]);
   391     parse_options(&argc,&argv);
   392     if (pswit[USERTYPO_SWITCH])
   393 	read_user_scannos();
   394     fprintf(stderr,"bookloupe: Check and report on an e-text\n");
   395     procfile(argv[1]);
   396     if (pswit[OVERVIEW_SWITCH])
   397     {
   398 	g_print("    Checked %ld lines of %ld (head+foot = %ld)\n\n",
   399 	  checked_linecnt,linecnt,linecnt-checked_linecnt);
   400 	g_print("    --------------- Queries found --------------\n");
   401 	if (cnt_long)
   402 	    g_print("    Long lines:		    %14ld\n",cnt_long);
   403 	if (cnt_short)
   404 	    g_print("    Short lines:		   %14ld\n",cnt_short);
   405 	if (cnt_lineend)
   406 	    g_print("    Line-end problems:	     %14ld\n",cnt_lineend);
   407 	if (cnt_word)
   408 	    g_print("    Common typos:		  %14ld\n",cnt_word);
   409 	if (cnt_dquot)
   410 	    g_print("    Unmatched quotes:	      %14ld\n",cnt_dquot);
   411 	if (cnt_squot)
   412 	    g_print("    Unmatched SingleQuotes:	%14ld\n",cnt_squot);
   413 	if (cnt_brack)
   414 	    g_print("    Unmatched brackets:	    %14ld\n",cnt_brack);
   415 	if (cnt_bin)
   416 	    g_print("    Non-ASCII characters:	  %14ld\n",cnt_bin);
   417 	if (cnt_odd)
   418 	    g_print("    Proofing characters:	   %14ld\n",cnt_odd);
   419 	if (cnt_punct)
   420 	    g_print("    Punctuation & spacing queries: %14ld\n",cnt_punct);
   421 	if (cnt_dash)
   422 	    g_print("    Non-standard dashes:	   %14ld\n",cnt_dash);
   423 	if (cnt_html)
   424 	    g_print("    Possible HTML tags:	    %14ld\n",cnt_html);
   425 	g_print("\n");
   426 	g_print("    TOTAL QUERIES		  %14ld\n",
   427 	  cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+
   428 	  cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend);
   429     }
   430     g_free(running_from);
   431     if (usertypo)
   432 	g_tree_unref(usertypo);
   433     return 0;
   434 }
   435 
   436 /*
   437  * first_pass:
   438  *
   439  * Run a first pass - verify that it's a valid PG
   440  * file, decide whether to report some things that
   441  * occur many times in the text like long or short
   442  * lines, non-standard dashes, etc.
   443  */
   444 struct first_pass_results *first_pass(const char *etext)
   445 {
   446     gunichar laststart=CHAR_SPACE;
   447     const char *s;
   448     gchar *lc_line;
   449     int i,j,lbytes,llen;
   450     gchar **lines;
   451     unsigned int lastlen=0,lastblen=0;
   452     long spline=0,nspline=0;
   453     static struct first_pass_results results={0};
   454     gchar *inword;
   455     lines=g_strsplit(etext,"\n",0);
   456     for (j=0;lines[j];j++)
   457     {
   458 	lbytes=strlen(lines[j]);
   459 	while (lbytes>0 && lines[j][lbytes-1]=='\r')
   460 	    lines[j][--lbytes]='\0';
   461 	llen=g_utf8_strlen(lines[j],lbytes);
   462 	linecnt++;
   463 	if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
   464 	  (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
   465 	{
   466 	    if (spline)
   467 		g_print("   --> Duplicate header?\n");
   468 	    spline=linecnt+1;   /* first line of non-header text, that is */
   469 	}
   470 	if (!strncmp(lines[j],"*** START",9) &&
   471 	  strstr(lines[j],"PROJECT GUTENBERG"))
   472 	{
   473 	    if (nspline)
   474 		g_print("   --> Duplicate header?\n");
   475 	    nspline=linecnt+1;   /* first line of non-header text, that is */
   476 	}
   477 	if (spline || nspline)
   478 	{
   479 	    lc_line=g_utf8_strdown(lines[j],lbytes);
   480 	    if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
   481 	    {
   482 		if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
   483 		{
   484 		    if (results.footerline)
   485 		    {
   486 			/* it's an old-form header - we can detect duplicates */
   487 			if (!nspline)
   488 			    g_print("   --> Duplicate footer?\n");
   489 		    }
   490 		    else
   491 			results.footerline=linecnt;
   492 		}
   493 	    }
   494 	    g_free(lc_line);
   495 	}
   496 	if (spline)
   497 	    results.firstline=spline;
   498 	if (nspline)
   499 	    results.firstline=nspline;  /* override with new */
   500 	if (results.footerline)
   501 	    continue;    /* don't count the boilerplate in the footer */
   502 	results.totlen+=llen;
   503 	for (s=lines[j];*s;s=g_utf8_next_char(s))
   504 	{
   505 	    if (g_utf8_get_char(s)>127)
   506 		results.binlen++;
   507 	    if (g_unichar_isalpha(g_utf8_get_char(s)))
   508 		results.alphalen++;
   509 	    if (s>lines[j] && g_utf8_get_char(s)==CHAR_DQUOTE &&
   510 	      isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
   511 		results.endquote_count++;
   512 	}
   513 	if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
   514 	  lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
   515 	    results.shortline++;
   516 	if (lbytes>0 &&
   517 	  g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
   518 	    cnt_spacend++;
   519 	if (strstr(lines[j],".,"))
   520 	    results.dotcomma++;
   521 	/* only count ast lines for ignoring purposes where there is */
   522 	/* locase text on the line */
   523 	if (strchr(lines[j],'*'))
   524 	{
   525 	    for (s=lines[j];*s;s=g_utf8_next_char(s))
   526 		if (g_unichar_islower(g_utf8_get_char(s)))
   527 		    break;
   528 	    if (*s)
   529 		results.astline++;
   530 	}
   531 	if (strchr(lines[j],'/'))
   532 	    results.fslashline++;
   533 	if (lbytes>0)
   534 	{
   535 	    for (s=g_utf8_prev_char(lines[j]+lbytes);
   536 	      s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;
   537 	      s=g_utf8_prev_char(s))
   538 		;
   539 	    if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
   540 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')
   541 		results.hyphens++;
   542 	}
   543 	if (llen>LONGEST_PG_LINE)
   544 	    results.longline++;
   545 	if (llen>WAY_TOO_LONG)
   546 	    results.verylongline++;
   547 	if (strchr(lines[j],'<') && strchr(lines[j],'>'))
   548 	{
   549 	    i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
   550 	    if (i>0)
   551 		results.htmcount++;
   552 	    if (strstr(lines[j],"<i>"))
   553 		results.htmcount+=4; /* bonus marks! */
   554 	}
   555 	/* Check for spaced em-dashes */
   556 	if (lines[j][0] && (s=strstr(g_utf8_next_char(lines[j]),"--")))
   557 	{
   558 	    results.emdash++;
   559 	    if (s[-1]==CHAR_SPACE || s[2]==CHAR_SPACE)
   560 		results.space_emdash++;
   561 	    if (s[-1]==CHAR_SPACE && s[2]==CHAR_SPACE)
   562 		/* count of em-dashes with spaces both sides */
   563 		results.non_PG_space_emdash++;
   564 	    if (s[-1]!=CHAR_SPACE && s[2]!=CHAR_SPACE)
   565 		/* count of PG-type em-dashes with no spaces */
   566 		results.PG_space_emdash++;
   567 	}
   568 	for (s=lines[j];*s;)
   569 	{
   570 	    inword=getaword(&s);
   571 	    if (!strcmp(inword,"hij") || !strcmp(inword,"niet")) 
   572 		results.Dutchcount++;
   573 	    if (!strcmp(inword,"dans") || !strcmp(inword,"avec")) 
   574 		results.Frenchcount++;
   575 	    if (!strcmp(inword,"0") || !strcmp(inword,"1")) 
   576 		results.standalone_digit++;
   577 	    g_free(inword);
   578 	}
   579 	/* Check for spaced dashes */
   580 	if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
   581 	    results.spacedash++;
   582 	lastblen=lastlen;
   583 	lastlen=llen;
   584 	laststart=lines[j][0];
   585     }
   586     g_strfreev(lines);
   587     return &results;
   588 }
   589 
   590 /*
   591  * report_first_pass:
   592  *
   593  * Make some snap decisions based on the first pass results.
   594  */
   595 struct warnings *report_first_pass(struct first_pass_results *results)
   596 {
   597     static struct warnings warnings={0};
   598     if (cnt_spacend>0)
   599 	g_print("   --> %ld lines in this file have white space at end\n",
   600 	  cnt_spacend);
   601     warnings.dotcomma=1;
   602     if (results->dotcomma>5)
   603     {
   604 	warnings.dotcomma=0;
   605 	g_print("   --> %ld lines in this file contain '.,'. "
   606 	  "Not reporting them.\n",results->dotcomma);
   607     }
   608     /*
   609      * If more than 50 lines, or one-tenth, are short,
   610      * don't bother reporting them.
   611      */
   612     warnings.shortline=1;
   613     if (results->shortline>50 || results->shortline*10>linecnt)
   614     {
   615 	warnings.shortline=0;
   616 	g_print("   --> %ld lines in this file are short. "
   617 	  "Not reporting short lines.\n",results->shortline);
   618     }
   619     /*
   620      * If more than 50 lines, or one-tenth, are long,
   621      * don't bother reporting them.
   622      */
   623     warnings.longline=1;
   624     if (results->longline>50 || results->longline*10>linecnt)
   625     {
   626 	warnings.longline=0;
   627 	g_print("   --> %ld lines in this file are long. "
   628 	  "Not reporting long lines.\n",results->longline);
   629     }
   630     /* If more than 10 lines contain asterisks, don't bother reporting them. */
   631     warnings.ast=1;
   632     if (results->astline>10)
   633     {
   634 	warnings.ast=0;
   635 	g_print("   --> %ld lines in this file contain asterisks. "
   636 	  "Not reporting them.\n",results->astline);
   637     }
   638     /*
   639      * If more than 10 lines contain forward slashes,
   640      * don't bother reporting them.
   641      */
   642     warnings.fslash=1;
   643     if (results->fslashline>10)
   644     {
   645 	warnings.fslash=0;
   646 	g_print("   --> %ld lines in this file contain forward slashes. "
   647 	  "Not reporting them.\n",results->fslashline);
   648     }
   649     /*
   650      * If more than 20 lines contain unpunctuated endquotes,
   651      * don't bother reporting them.
   652      */
   653     warnings.endquote=1;
   654     if (results->endquote_count>20)
   655     {
   656 	warnings.endquote=0;
   657 	g_print("   --> %ld lines in this file contain unpunctuated endquotes. "
   658 	  "Not reporting them.\n",results->endquote_count);
   659     }
   660     /*
   661      * If more than 15 lines contain standalone digits,
   662      * don't bother reporting them.
   663      */
   664     warnings.digit=1;
   665     if (results->standalone_digit>10)
   666     {
   667 	warnings.digit=0;
   668 	g_print("   --> %ld lines in this file contain standalone 0s and 1s. "
   669 	  "Not reporting them.\n",results->standalone_digit);
   670     }
   671     /*
   672      * If more than 20 lines contain hyphens at end,
   673      * don't bother reporting them.
   674      */
   675     warnings.hyphen=1;
   676     if (results->hyphens>20)
   677     {
   678 	warnings.hyphen=0;
   679 	g_print("   --> %ld lines in this file have hyphens at end. "
   680 	  "Not reporting them.\n",results->hyphens);
   681     }
   682     if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
   683     {
   684 	g_print("   --> Looks like this is HTML. Switching HTML mode ON.\n");
   685 	pswit[MARKUP_SWITCH]=1;
   686     }
   687     if (results->verylongline>0)
   688 	g_print("   --> %ld lines in this file are VERY long!\n",
   689 	  results->verylongline);
   690     /*
   691      * If there are more non-PG spaced dashes than PG em-dashes,
   692      * assume it's deliberate.
   693      * Current PG guidelines say don't use them, but older texts do,
   694      * and some people insist on them whatever the guidelines say.
   695      */
   696     warnings.dash=1;
   697     if (results->spacedash+results->non_PG_space_emdash>
   698       results->PG_space_emdash)
   699     {
   700 	warnings.dash=0;
   701 	g_print("   --> There are %ld spaced dashes and em-dashes. "
   702 	  "Not reporting them.\n",
   703 	  results->spacedash+results->non_PG_space_emdash);
   704     }
   705     /* If more than a quarter of characters are hi-bit, bug out. */
   706     warnings.bin=1;
   707     if (results->binlen*4>results->totlen)
   708     {
   709 	g_print("   --> This file does not appear to be ASCII. "
   710 	  "Terminating. Best of luck with it!\n");
   711 	exit(1);
   712     }
   713     if (results->alphalen*4<results->totlen)
   714     {
   715 	g_print("   --> This file does not appear to be text. "
   716 	  "Terminating. Best of luck with it!\n");
   717 	exit(1);
   718     }
   719     if (results->binlen*100>results->totlen || results->binlen>100)
   720     {
   721 	g_print("   --> There are a lot of foreign letters here. "
   722 	  "Not reporting them.\n");
   723 	warnings.bin=0;
   724     }
   725     warnings.isDutch=FALSE;
   726     if (results->Dutchcount>50)
   727     {
   728 	warnings.isDutch=TRUE;
   729 	g_print("   --> This looks like Dutch - "
   730 	  "switching off dashes and warnings for 's Middags case.\n");
   731     }
   732     warnings.isFrench=FALSE;
   733     if (results->Frenchcount>50)
   734     {
   735 	warnings.isFrench=TRUE;
   736 	g_print("   --> This looks like French - "
   737 	  "switching off some doublepunct.\n");
   738     }
   739     if (results->firstline && results->footerline)
   740 	g_print("    The PG header and footer appear to be already on.\n");
   741     else
   742     {
   743 	if (results->firstline)
   744 	    g_print("    The PG header is on - no footer.\n");
   745 	if (results->footerline)
   746 	    g_print("    The PG footer is on - no header.\n");
   747     }
   748     g_print("\n");
   749     if (pswit[VERBOSE_SWITCH])
   750     {
   751 	warnings.bin=1;
   752 	warnings.shortline=1;
   753 	warnings.dotcomma=1;
   754 	warnings.longline=1;
   755 	warnings.dash=1;
   756 	warnings.digit=1;
   757 	warnings.ast=1;
   758 	warnings.fslash=1;
   759 	warnings.hyphen=1;
   760 	warnings.endquote=1;
   761 	g_print("   *** Verbose output is ON -- you asked for it! ***\n");
   762     }
   763     if (warnings.isDutch)
   764 	warnings.dash=0;
   765     if (results->footerline>0 && results->firstline>0 &&
   766       results->footerline>results->firstline &&
   767       results->footerline-results->firstline<100)
   768     {
   769 	g_print("   --> I don't really know where this text starts. \n");
   770 	g_print("       There are no reference points.\n");
   771 	g_print("       I'm going to have to report the header and footer "
   772 	  "as well.\n");
   773 	results->firstline=0;
   774     }
   775     return &warnings;
   776 }
   777 
   778 /*
   779  * analyse_quotes:
   780  *
   781  * Look along the line, accumulate the count of quotes, and see
   782  * if this is an empty line - i.e. a line with nothing on it
   783  * but spaces.
   784  * If line has just spaces, period, * and/or - on it, don't
   785  * count it, since empty lines with asterisks or dashes to
   786  * separate sections are common.
   787  *
   788  * Returns: TRUE if the line is empty.
   789  */
   790 gboolean analyse_quotes(const char *aline,struct counters *counters)
   791 {
   792     int guessquote=0;
   793     /* assume the line is empty until proven otherwise */
   794     gboolean isemptyline=TRUE;
   795     const char *s=aline,*sprev,*snext;
   796     gunichar c;
   797     sprev=NULL;
   798     while (*s)
   799     {
   800 	snext=g_utf8_next_char(s);
   801 	c=g_utf8_get_char(s);
   802 	if (c==CHAR_DQUOTE)
   803 	    counters->quot++;
   804 	if (CHAR_IS_SQUOTE(c))
   805 	{
   806 	    if (s==aline)
   807 	    {
   808 		/*
   809 		 * At start of line, it can only be an openquote.
   810 		 * Hardcode a very common exception!
   811 		 */
   812 		if (!g_str_has_prefix(snext,"tis") &&
   813 		  !g_str_has_prefix(snext,"Tis"))
   814 		    increment_matching(counters,c,TRUE);
   815 	    }
   816 	    else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
   817 	      g_unichar_isalpha(g_utf8_get_char(snext)))
   818 		/* Do nothing! it's definitely an apostrophe, not a quote */
   819 		;
   820 	    /* it's outside a word - let's check it out */
   821 	    else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||
   822 	      g_unichar_isalpha(g_utf8_get_char(snext)))
   823 	    {
   824 		/* it damwell better BE an openquote */
   825 		if (!g_str_has_prefix(snext,"tis") &&
   826 		  !g_str_has_prefix(snext,"Tis"))
   827 		    /* hardcode a very common exception! */
   828 		    increment_matching(counters,c,TRUE);
   829 	    }
   830 	    else
   831 	    {
   832 		/* now - is it a closequote? */
   833 		guessquote=0;   /* accumulate clues */
   834 		if (g_unichar_isalpha(g_utf8_get_char(sprev)))
   835 		{
   836 		    /* it follows a letter - could be either */
   837 		    guessquote++;
   838 		    if (g_utf8_get_char(sprev)=='s')
   839 		    {
   840 			/* looks like a plural apostrophe */
   841 			guessquote-=3;
   842 			if (g_utf8_get_char(snext)==CHAR_SPACE)
   843 			    /* bonus marks! */
   844 			    guessquote-=2;
   845 		    }
   846 		}
   847 		/* it doesn't have a letter either side */
   848 		else if (strchr(".?!,;:",g_utf8_get_char(sprev)) &&
   849 		  strchr(".?!,;: ",g_utf8_get_char(snext)))
   850 		    guessquote+=8; /* looks like a closequote */
   851 		else
   852 		    guessquote++;
   853 		if (matching_difference(counters,CHAR_SQUOTE)>0)
   854 		    /*
   855 		     * Give it the benefit of some doubt,
   856 		     * if a squote is already open.
   857 		     */
   858 		    guessquote++;
   859 		else
   860 		    guessquote--;
   861 		if (guessquote>=0)
   862 		    increment_matching(counters,c,FALSE);
   863 	    }
   864 	}
   865 	if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
   866 	  c!='\r' && c!='\n')
   867 	    isemptyline=FALSE;  /* ignore lines like  *  *  *  as spacers */
   868 	if (c==CHAR_UNDERSCORE)
   869 	    counters->c_unders++;
   870 	if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK || c==CHAR_OPEN_SBRACK)
   871 	    increment_matching(counters,c,TRUE);
   872 	if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK ||
   873 	  c==CHAR_CLOSE_SBRACK)
   874 	    increment_matching(counters,c,FALSE);
   875 	sprev=s;
   876 	s=snext;
   877     }
   878     return isemptyline;
   879 }
   880 
   881 /*
   882  * check_for_control_characters:
   883  *
   884  * Check for invalid or questionable characters in the line
   885  * Anything above 127 is invalid for plain ASCII, and
   886  * non-printable control characters should also be flagged.
   887  * Tabs should generally not be there.
   888  */
   889 void check_for_control_characters(const char *aline)
   890 {
   891     gunichar c;
   892     const char *s;
   893     for (s=aline;*s;s=g_utf8_next_char(s))
   894     {
   895 	c=g_utf8_get_char(s);
   896 	if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
   897 	{
   898 	    if (pswit[ECHO_SWITCH])
   899 		g_print("\n%s\n",aline);
   900 	    if (!pswit[OVERVIEW_SWITCH])
   901 		g_print("    Line %ld column %ld - Control character %u\n",
   902 		  linecnt,g_utf8_pointer_to_offset(s,aline)+1,c);
   903 	    else
   904 		cnt_bin++;
   905 	}
   906     }
   907 }
   908 
   909 /*
   910  * check_for_odd_characters:
   911  *
   912  * Check for binary and other odd characters.
   913  */
   914 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
   915   gboolean isemptyline)
   916 {
   917     /* Don't repeat multiple warnings on one line. */
   918     gboolean eNon_A=FALSE,eTab=FALSE,eTilde=FALSE;
   919     gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
   920     const char *s;
   921     gunichar c;
   922     for (s=aline;*s;s=g_utf8_next_char(s))
   923     {
   924 	c=g_utf8_get_char(s);
   925 	if (!eNon_A && (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
   926 	{
   927 	    if (pswit[ECHO_SWITCH])
   928 		g_print("\n%s\n",aline);
   929 	    if (!pswit[OVERVIEW_SWITCH])
   930 		if (c>127 && c<160 || c>255)
   931 		    g_print("    Line %ld column %ld - "
   932 		      "Non-ISO-8859 character %u\n",
   933 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
   934 		else
   935 		    g_print("    Line %ld column %ld - "
   936 		      "Non-ASCII character %u\n",
   937 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
   938 	    else
   939 		cnt_bin++;
   940 	    eNon_A=TRUE;
   941 	}
   942 	if (!eTab && c==CHAR_TAB)
   943 	{
   944 	    if (pswit[ECHO_SWITCH])
   945 		g_print("\n%s\n",aline);
   946 	    if (!pswit[OVERVIEW_SWITCH])
   947 		g_print("    Line %ld column %ld - Tab character?\n",
   948 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
   949 	    else
   950 		cnt_odd++;
   951 	    eTab=TRUE;
   952 	}
   953 	if (!eTilde && c==CHAR_TILDE)
   954 	{
   955 	    /*
   956 	     * Often used by OCR software to indicate an
   957 	     * unrecognizable character.
   958 	     */
   959 	    if (pswit[ECHO_SWITCH])
   960 		g_print("\n%s\n",aline);
   961 	    if (!pswit[OVERVIEW_SWITCH])
   962 		g_print("    Line %ld column %ld - Tilde character?\n",
   963 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
   964 	    else
   965 		cnt_odd++;
   966 	    eTilde=TRUE;
   967 	}
   968 	if (!eCarat && c==CHAR_CARAT)
   969 	{  
   970 	    if (pswit[ECHO_SWITCH])
   971 		g_print("\n%s\n",aline);
   972 	    if (!pswit[OVERVIEW_SWITCH])
   973 		g_print("    Line %ld column %ld - Carat character?\n",
   974 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
   975 	    else
   976 		cnt_odd++;
   977 	    eCarat=TRUE;
   978 	}
   979 	if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
   980 	{  
   981 	    if (pswit[ECHO_SWITCH])
   982 		g_print("\n%s\n",aline);
   983 	    if (!pswit[OVERVIEW_SWITCH])
   984 		g_print("    Line %ld column %ld - Forward slash?\n",
   985 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
   986 	    else
   987 		cnt_odd++;
   988 	    eFSlash=TRUE;
   989 	}
   990 	/*
   991 	 * Report asterisks only in paranoid mode,
   992 	 * since they're often deliberate.
   993 	 */
   994 	if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
   995 	  c==CHAR_ASTERISK)
   996 	{
   997 	    if (pswit[ECHO_SWITCH])
   998 		g_print("\n%s\n",aline);
   999 	    if (!pswit[OVERVIEW_SWITCH])
  1000 		g_print("    Line %ld column %ld - Asterisk?\n",
  1001 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1002 	    else
  1003 		cnt_odd++;
  1004 	    eAst=TRUE;
  1005 	}
  1006     }
  1007 }
  1008 
  1009 /*
  1010  * check_for_long_line:
  1011  *
  1012  * Check for line too long.
  1013  */
  1014 void check_for_long_line(const char *aline)
  1015 {
  1016     if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
  1017     {
  1018 	if (pswit[ECHO_SWITCH])
  1019 	    g_print("\n%s\n",aline);
  1020 	if (!pswit[OVERVIEW_SWITCH])
  1021 	    g_print("    Line %ld column %ld - Long line %ld\n",
  1022 	      linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
  1023 	else
  1024 	    cnt_long++;
  1025     }
  1026 }
  1027 
  1028 /*
  1029  * check_for_short_line:
  1030  *
  1031  * Check for line too short.
  1032  *
  1033  * This one is a bit trickier to implement: we don't want to
  1034  * flag the last line of a paragraph for being short, so we
  1035  * have to wait until we know that our current line is a
  1036  * "normal" line, then report the _previous_ line if it was too
  1037  * short. We also don't want to report indented lines like
  1038  * chapter heads or formatted quotations. We therefore keep
  1039  * last->len as the length of the last line examined, and
  1040  * last->blen as the length of the last but one, and try to
  1041  * suppress unnecessary warnings by checking that both were of
  1042  * "normal" length. We keep the first character of the last
  1043  * line in last->start, and if it was a space, we assume that
  1044  * the formatting is deliberate. I can't figure out a way to
  1045  * distinguish something like a quoted verse left-aligned or
  1046  * the header or footer of a letter from a paragraph of short
  1047  * lines - maybe if I examined the whole paragraph, and if the
  1048  * para has less than, say, 8 lines and if all lines are short,
  1049  * then just assume it's OK? Need to look at some texts to see
  1050  * how often a formula like this would get the right result.
  1051  */
  1052 void check_for_short_line(const char *aline,const struct line_properties *last)
  1053 {
  1054     if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
  1055       last->len<SHORTEST_PG_LINE && last->blen>1 &&
  1056       last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
  1057     {
  1058 	if (pswit[ECHO_SWITCH])
  1059 	    g_print("\n%s\n",prevline);
  1060 	if (!pswit[OVERVIEW_SWITCH])
  1061 	    g_print("    Line %ld column %ld - Short line %ld?\n",
  1062 	      linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
  1063 	else
  1064 	    cnt_short++;
  1065     }
  1066 }
  1067 
  1068 /*
  1069  * check_for_starting_punctuation:
  1070  *
  1071  * Look for punctuation other than full ellipses at start of line.
  1072  */
  1073 void check_for_starting_punctuation(const char *aline)
  1074 {
  1075     if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
  1076       !g_str_has_prefix(aline,". . ."))
  1077     {
  1078 	if (pswit[ECHO_SWITCH])
  1079 	    g_print("\n%s\n",aline);
  1080 	if (!pswit[OVERVIEW_SWITCH])
  1081 	    g_print("    Line %ld column 1 - Begins with punctuation?\n",
  1082 	      linecnt);
  1083 	else
  1084 	    cnt_punct++;
  1085     }
  1086 }
  1087 
  1088 /*
  1089  * check_for_spaced_emdash:
  1090  *
  1091  * Check for spaced em-dashes.
  1092  *
  1093  * We must check _all_ occurrences of "--" on the line
  1094  * hence the loop - even if the first double-dash is OK
  1095  * there may be another that's wrong later on.
  1096  */
  1097 void check_for_spaced_emdash(const char *aline)
  1098 {
  1099     const char *s,*t,*next;
  1100     for (s=aline;t=strstr(s,"--");s=next)
  1101     {
  1102 	next=g_utf8_next_char(g_utf8_next_char(t));
  1103 	if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
  1104 	  g_utf8_get_char(next)==CHAR_SPACE)
  1105 	{
  1106 	    if (pswit[ECHO_SWITCH])
  1107 		g_print("\n%s\n",aline);
  1108 	    if (!pswit[OVERVIEW_SWITCH])
  1109 		g_print("    Line %ld column %ld - Spaced em-dash?\n",
  1110 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  1111 	    else
  1112 		cnt_dash++;
  1113 	}
  1114     }
  1115 }
  1116 
  1117 /*
  1118  * check_for_spaced_dash:
  1119  *
  1120  * Check for spaced dashes.
  1121  */
  1122 void check_for_spaced_dash(const char *aline)
  1123 {
  1124     const char *s;
  1125     if ((s=strstr(aline," -")))
  1126     {
  1127 	if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
  1128 	{
  1129 	    if (pswit[ECHO_SWITCH])
  1130 		g_print("\n%s\n",aline);
  1131 	    if (!pswit[OVERVIEW_SWITCH])
  1132 		g_print("    Line %ld column %ld - Spaced dash?\n",
  1133 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1134 	    else
  1135 		cnt_dash++;
  1136 	}
  1137     }
  1138     else if ((s=strstr(aline,"- ")))
  1139     {
  1140 	if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
  1141 	{
  1142 	    if (pswit[ECHO_SWITCH])
  1143 		g_print("\n%s\n",aline);
  1144 	    if (!pswit[OVERVIEW_SWITCH])
  1145 		g_print("    Line %ld column %ld - Spaced dash?\n",
  1146 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1147 	    else
  1148 		cnt_dash++;
  1149 	}
  1150     }
  1151 }
  1152 
  1153 /*
  1154  * check_for_unmarked_paragraphs:
  1155  *
  1156  * Check for unmarked paragraphs indicated by separate speakers.
  1157  *
  1158  * May well be false positive:
  1159  * "Bravo!" "Wonderful!" called the crowd.
  1160  * but useful all the same.
  1161  */
  1162 void check_for_unmarked_paragraphs(const char *aline)
  1163 {
  1164     const char *s;
  1165     s=strstr(aline,"\"  \"");
  1166     if (!s)
  1167 	s=strstr(aline,"\" \"");
  1168     if (s)
  1169     {
  1170 	if (pswit[ECHO_SWITCH])
  1171 	    g_print("\n%s\n",aline);
  1172 	if (!pswit[OVERVIEW_SWITCH])
  1173 	    g_print("    Line %ld column %ld - "
  1174 	      "Query missing paragraph break?\n",
  1175 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1176 	else
  1177 	    cnt_punct++;
  1178     }
  1179 }
  1180 
  1181 /*
  1182  * check_for_jeebies:
  1183  *
  1184  * Check for "to he" and other easy h/b errors.
  1185  *
  1186  * This is a very inadequate effort on the h/b problem,
  1187  * but the phrase "to he" is always an error, whereas "to
  1188  * be" is quite common.
  1189  * Similarly, '"Quiet!", be said.' is a non-be error
  1190  * "to he" is _not_ always an error!:
  1191  *       "Where they went to he couldn't say."
  1192  * Another false positive:
  1193  *       What would "Cinderella" be without the . . .
  1194  * and another: "If he wants to he can see for himself."
  1195  */
  1196 void check_for_jeebies(const char *aline)
  1197 {
  1198     const char *s;
  1199     s=strstr(aline," be could ");
  1200     if (!s)
  1201 	s=strstr(aline," be would ");
  1202     if (!s)
  1203 	s=strstr(aline," was be ");
  1204     if (!s)
  1205 	s=strstr(aline," be is ");
  1206     if (!s)
  1207 	s=strstr(aline," is be ");
  1208     if (!s)
  1209 	s=strstr(aline,"\", be ");
  1210     if (!s)
  1211 	s=strstr(aline,"\" be ");
  1212     if (!s)
  1213 	s=strstr(aline,"\" be ");
  1214     if (!s)
  1215 	s=strstr(aline," to he ");
  1216     if (s)
  1217     {
  1218 	if (pswit[ECHO_SWITCH])
  1219 	    g_print("\n%s\n",aline);
  1220 	if (!pswit[OVERVIEW_SWITCH])
  1221 	    g_print("    Line %ld column %ld - Query he/be error?\n",
  1222 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1223 	else
  1224 	    cnt_word++;
  1225     }
  1226     s=strstr(aline," the had ");
  1227     if (!s)
  1228 	s=strstr(aline," a had ");
  1229     if (!s)
  1230 	s=strstr(aline," they bad ");
  1231     if (!s)
  1232 	s=strstr(aline," she bad ");
  1233     if (!s)
  1234 	s=strstr(aline," he bad ");
  1235     if (!s)
  1236 	s=strstr(aline," you bad ");
  1237     if (!s)
  1238 	s=strstr(aline," i bad ");
  1239     if (s)
  1240     {
  1241 	if (pswit[ECHO_SWITCH])
  1242 	    g_print("\n%s\n",aline);
  1243 	if (!pswit[OVERVIEW_SWITCH])
  1244 	    g_print("    Line %ld column %ld - Query had/bad error?\n",
  1245 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1246 	else
  1247 	    cnt_word++;
  1248     }
  1249     s=strstr(aline,"; hut ");
  1250     if (!s)
  1251 	s=strstr(aline,", hut ");
  1252     if (s)
  1253     {
  1254 	if (pswit[ECHO_SWITCH])
  1255 	    g_print("\n%s\n",aline);
  1256 	if (!pswit[OVERVIEW_SWITCH])
  1257 	    g_print("    Line %ld column %ld - Query hut/but error?\n",
  1258 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1259 	else
  1260 	    cnt_word++;
  1261     }
  1262 }
  1263 
  1264 /*
  1265  * check_for_mta_from:
  1266  *
  1267  * Special case - angled bracket in front of "From" placed there by an
  1268  * MTA when sending an e-mail.
  1269  */
  1270 void check_for_mta_from(const char *aline)
  1271 {
  1272     const char *s;
  1273     s=strstr(aline,">From");
  1274     if (s)
  1275     {
  1276 	if (pswit[ECHO_SWITCH])
  1277 	    g_print("\n%s\n",aline);
  1278 	if (!pswit[OVERVIEW_SWITCH])
  1279 	    g_print("    Line %ld column %ld - "
  1280 	      "Query angled bracket with From\n",
  1281 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1282 	else
  1283 	    cnt_punct++;
  1284     }
  1285 }
  1286 
  1287 /*
  1288  * check_for_orphan_character:
  1289  *
  1290  * Check for a single character line -
  1291  * often an overflow from bad wrapping.
  1292  */
  1293 void check_for_orphan_character(const char *aline)
  1294 {
  1295     gunichar c;
  1296     c=g_utf8_get_char(aline);
  1297     if (c && !*g_utf8_next_char(aline))
  1298     {
  1299 	if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
  1300 	    ; /* Nothing - ignore numerals alone on a line. */
  1301 	else
  1302 	{
  1303 	    if (pswit[ECHO_SWITCH])
  1304 		g_print("\n%s\n",aline);
  1305 	    if (!pswit[OVERVIEW_SWITCH])
  1306 		g_print("    Line %ld column 1 - Query single character line\n",
  1307 		  linecnt);
  1308 	    else
  1309 		cnt_punct++;
  1310 	}
  1311     }
  1312 }
  1313 
  1314 /*
  1315  * check_for_pling_scanno:
  1316  *
  1317  * Check for I" - often should be !
  1318  */
  1319 void check_for_pling_scanno(const char *aline)
  1320 {
  1321     const char *s;
  1322     s=strstr(aline," I\"");
  1323     if (s)
  1324     {
  1325 	if (pswit[ECHO_SWITCH])
  1326 	    g_print("\n%s\n",aline);
  1327 	if (!pswit[OVERVIEW_SWITCH])
  1328 	    g_print("    Line %ld column %ld - Query I=exclamation mark?\n",
  1329 	      linecnt,g_utf8_pointer_to_offset(aline,s));
  1330 	else
  1331 	    cnt_punct++;
  1332     }
  1333 }
  1334 
  1335 /*
  1336  * check_for_extra_period:
  1337  *
  1338  * Check for period without a capital letter. Cut-down from gutspell.
  1339  * Only works when it happens on a single line.
  1340  */
  1341 void check_for_extra_period(const char *aline,const struct warnings *warnings)
  1342 {
  1343     const char *s,*t,*s1,*sprev;
  1344     int i;
  1345     gsize len;
  1346     gboolean istypo;
  1347     gchar *testword;
  1348     gunichar c,nc,pc,*decomposition;
  1349     if (pswit[PARANOID_SWITCH])
  1350     {
  1351 	for (t=aline;t=strstr(t,". ");)
  1352 	{
  1353 	    if (t==aline)
  1354 	    {
  1355 		t=g_utf8_next_char(t);
  1356 		/* start of line punctuation is handled elsewhere */
  1357 		continue;
  1358 	    }
  1359 	    if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
  1360 	    {
  1361 		t=g_utf8_next_char(t);
  1362 		continue;
  1363 	    }
  1364 	    if (warnings->isDutch)
  1365 	    {
  1366 		/* For Frank & Jeroen -- 's Middags case */
  1367 		gunichar c2,c3,c4,c5;
  1368 		c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
  1369 		c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
  1370 		c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
  1371 		c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
  1372 		if (CHAR_IS_APOSTROPHE(c2) &&
  1373 		  g_unichar_islower(c3) && c4==CHAR_SPACE &&
  1374 		  g_unichar_isupper(c5))
  1375 		{
  1376 		    t=g_utf8_next_char(t);
  1377 		    continue;
  1378 		}
  1379 	    }
  1380 	    s1=g_utf8_next_char(g_utf8_next_char(t));
  1381 	    while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
  1382 	      !isdigit(g_utf8_get_char(s1)))
  1383 		s1=g_utf8_next_char(s1);
  1384 	    if (g_unichar_islower(g_utf8_get_char(s1)))
  1385 	    {
  1386 		/* we have something to investigate */
  1387 		istypo=TRUE;
  1388 		/* so let's go back and find out */
  1389 		nc=g_utf8_get_char(t);
  1390 		s1=g_utf8_prev_char(t);
  1391 		c=g_utf8_get_char(s1);
  1392 		sprev=g_utf8_prev_char(s1);
  1393 		pc=g_utf8_get_char(sprev);
  1394 		while (s1>=aline &&
  1395 		  (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||
  1396 		  g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&
  1397 		  g_unichar_isalpha(nc)))
  1398 		{
  1399 		    nc=c;
  1400 		    s1=sprev;
  1401 		    c=pc;
  1402 		    sprev=g_utf8_prev_char(s1);
  1403 		    pc=g_utf8_get_char(sprev);
  1404 		}
  1405 		s1=g_utf8_next_char(s1);
  1406 		s=strchr(s1,'.');
  1407 		if (s)
  1408 		    testword=g_strndup(s1,s-s1);
  1409 		else
  1410 		    testword=g_strdup(s1);
  1411 		for (i=0;*abbrev[i];i++)
  1412 		    if (!strcmp(testword,abbrev[i]))
  1413 			istypo=FALSE;
  1414 		if (g_unichar_isdigit(g_utf8_get_char(testword)))
  1415 		    istypo=FALSE;
  1416 		if (!*g_utf8_next_char(testword))
  1417 		    istypo=FALSE;
  1418 		if (isroman(testword))
  1419 		    istypo=FALSE;
  1420 		if (istypo)
  1421 		{
  1422 		    istypo=FALSE;
  1423 		    for (s=testword;*s;s=g_utf8_next_char(s))
  1424 		    {
  1425 			decomposition=g_unicode_canonical_decomposition(
  1426 			  g_utf8_get_char(s),&len);
  1427 			if (g_utf8_strchr("aeiou",-1,decomposition[0]))
  1428 			    istypo=TRUE;
  1429 			g_free(decomposition);
  1430 		    }
  1431 		}
  1432 		if (istypo &&
  1433 		  (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
  1434 		{
  1435 		    g_tree_insert(qperiod,g_strdup(testword),
  1436 		      GINT_TO_POINTER(1));
  1437 		    if (pswit[ECHO_SWITCH])
  1438 			g_print("\n%s\n",aline);
  1439 		    if (!pswit[OVERVIEW_SWITCH])
  1440 			g_print("    Line %ld column %ld - Extra period?\n",
  1441 			  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  1442 		    else
  1443 			cnt_punct++;
  1444 		}
  1445 		g_free(testword);
  1446 	    }
  1447 	    t=g_utf8_next_char(t);
  1448 	}
  1449     }
  1450 }
  1451 
  1452 /*
  1453  * check_for_following_punctuation:
  1454  *
  1455  * Check for words usually not followed by punctuation.
  1456  */
  1457 void check_for_following_punctuation(const char *aline)
  1458 {
  1459     int i;
  1460     const char *s,*wordstart;
  1461     gunichar c;
  1462     gchar *inword,*t;
  1463     if (pswit[TYPO_SWITCH])
  1464     {
  1465 	for (s=aline;*s;)
  1466 	{
  1467 	    wordstart=s;
  1468 	    t=getaword(&s);
  1469 	    if (!*t)
  1470 	    {
  1471 		g_free(t);
  1472 		continue;
  1473 	    }
  1474 	    inword=g_utf8_strdown(t,-1);
  1475 	    g_free(t);
  1476 	    for (i=0;*nocomma[i];i++)
  1477 		if (!strcmp(inword,nocomma[i]))
  1478 		{
  1479 		    c=g_utf8_get_char(s);
  1480 		    if (c==',' || c==';' || c==':')
  1481 		    {
  1482 			if (pswit[ECHO_SWITCH])
  1483 			    g_print("\n%s\n",aline);
  1484 			if (!pswit[OVERVIEW_SWITCH])
  1485 			    g_print("    Line %ld column %ld - "
  1486 			      "Query punctuation after %s?\n",
  1487 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,
  1488 			      inword);
  1489 			else
  1490 			    cnt_punct++;
  1491 		    }
  1492 		}
  1493 	    for (i=0;*noperiod[i];i++)
  1494 		if (!strcmp(inword,noperiod[i]))
  1495 		{
  1496 		    c=g_utf8_get_char(s);
  1497 		    if (c=='.' || c=='!')
  1498 		    {
  1499 			if (pswit[ECHO_SWITCH])
  1500 			    g_print("\n%s\n",aline);
  1501 			if (!pswit[OVERVIEW_SWITCH])
  1502 			    g_print("    Line %ld column %ld - "
  1503 			      "Query punctuation after %s?\n",
  1504 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,
  1505 			      inword);
  1506 			else
  1507 			    cnt_punct++;
  1508 		    }
  1509 		}
  1510 	    g_free(inword);
  1511 	}
  1512     }
  1513 }
  1514 
  1515 /*
  1516  * check_for_typos:
  1517  *
  1518  * Check for commonly mistyped words,
  1519  * and digits like 0 for O in a word.
  1520  */
  1521 void check_for_typos(const char *aline,struct warnings *warnings)
  1522 {
  1523     const char *s,*t,*nt,*wordstart;
  1524     gchar *inword;
  1525     gunichar *decomposition;
  1526     gchar *testword;
  1527     int i,vowel,consonant,*dupcnt;
  1528     gboolean isdup,istypo,alower;
  1529     gunichar c,pc;
  1530     long offset,len;
  1531     gsize decomposition_len;
  1532     for (s=aline;*s;)
  1533     {
  1534 	wordstart=s;
  1535 	inword=getaword(&s);
  1536 	if (!*inword)
  1537 	{
  1538 	    g_free(inword);
  1539 	    continue; /* don't bother with empty lines */
  1540 	}
  1541 	if (mixdigit(inword))
  1542 	{
  1543 	    if (pswit[ECHO_SWITCH])
  1544 		g_print("\n%s\n",aline);
  1545 	    if (!pswit[OVERVIEW_SWITCH])
  1546 		g_print("    Line %ld column %ld - Query digit in %s\n",
  1547 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
  1548 	    else
  1549 		cnt_word++;
  1550 	}
  1551 	/*
  1552 	 * Put the word through a series of tests for likely typos and OCR
  1553 	 * errors.
  1554 	 */
  1555 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
  1556 	{
  1557 	    istypo=FALSE;
  1558 	    alower=FALSE;
  1559 	    for (t=inword;*t;t=g_utf8_next_char(t))
  1560 	    {
  1561 		c=g_utf8_get_char(t);
  1562 		nt=g_utf8_next_char(t);
  1563 		/* lowercase for testing */
  1564 		if (g_unichar_islower(c))
  1565 		    alower=TRUE;
  1566 		if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
  1567 		{
  1568 		    /*
  1569 		     * We have an uppercase mid-word. However, there are
  1570 		     * common cases:
  1571 		     *   Mac and Mc like McGill
  1572 		     *   French contractions like l'Abbe
  1573 		     */
  1574 		    offset=g_utf8_pointer_to_offset(inword,t);
  1575 		    if (offset>0)
  1576 			pc=g_utf8_get_char(g_utf8_prev_char(t));
  1577 		    else
  1578 			pc='\0';
  1579 		    if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
  1580 		      offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
  1581 		      g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
  1582 		      CHAR_IS_APOSTROPHE(pc))
  1583 			; /* do nothing! */
  1584 		    else
  1585 			istypo=TRUE;
  1586 		}
  1587 	    }
  1588 	    testword=g_utf8_casefold(inword,-1);
  1589 	}
  1590 	if (pswit[TYPO_SWITCH])
  1591 	{
  1592 	    /*
  1593 	     * Check for certain unlikely two-letter combinations at word
  1594 	     * start and end.
  1595 	     */
  1596 	    len=g_utf8_strlen(testword,-1);
  1597 	    if (len>1)
  1598 	    {
  1599 		for (i=0;*nostart[i];i++)
  1600 		    if (g_str_has_prefix(testword,nostart[i]))
  1601 			istypo=TRUE;
  1602 		for (i=0;*noend[i];i++)
  1603 		    if (g_str_has_suffix(testword,noend[i]))
  1604 			istypo=TRUE;
  1605 	    }
  1606 	    /* ght is common, gbt never. Like that. */
  1607 	    if (strstr(testword,"cb"))
  1608 		istypo=TRUE;
  1609 	    if (strstr(testword,"gbt"))
  1610 		istypo=TRUE;
  1611 	    if (strstr(testword,"pbt"))
  1612 		istypo=TRUE;
  1613 	    if (strstr(testword,"tbs"))
  1614 		istypo=TRUE;
  1615 	    if (strstr(testword,"mrn"))
  1616 		istypo=TRUE;
  1617 	    if (strstr(testword,"ahle"))
  1618 		istypo=TRUE;
  1619 	    if (strstr(testword,"ihle"))
  1620 		istypo=TRUE;
  1621 	    /*
  1622 	     * "TBE" does happen - like HEARTBEAT - but uncommon.
  1623 	     * Also "TBI" - frostbite, outbid - but uncommon.
  1624 	     * Similarly "ii" like Hawaii, or Pompeii, and in Roman
  1625 	     * numerals, but "ii" is a common scanno.
  1626 	     */
  1627 	    if (strstr(testword,"tbi"))
  1628 		istypo=TRUE;
  1629 	    if (strstr(testword,"tbe"))
  1630 		istypo=TRUE;
  1631 	    if (strstr(testword,"ii"))
  1632 		istypo=TRUE;
  1633 	    /*
  1634 	     * Check for no vowels or no consonants.
  1635 	     * If none, flag a typo.
  1636 	     */
  1637 	    if (!istypo && len>1)
  1638 	    {
  1639 		vowel=consonant=0;
  1640 		for (t=testword;*t;t=g_utf8_next_char(t))
  1641 		{
  1642 		    c=g_utf8_get_char(t);
  1643 		    decomposition=
  1644 		      g_unicode_canonical_decomposition(c,&decomposition_len);
  1645 		    if (c=='y' || g_unichar_isdigit(c))
  1646 		    {
  1647 			/* Yah, this is loose. */
  1648 			vowel++;
  1649 			consonant++;
  1650 		    }
  1651 		    else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
  1652 			vowel++;
  1653 		    else
  1654 			consonant++;
  1655 		    g_free(decomposition);
  1656 		}
  1657 		if (!vowel || !consonant)
  1658 		    istypo=TRUE;
  1659 	    }
  1660 	    /*
  1661 	     * Now exclude the word from being reported if it's in
  1662 	     * the okword list.
  1663 	     */
  1664 	    for (i=0;*okword[i];i++)
  1665 		if (!strcmp(testword,okword[i]))
  1666 		    istypo=FALSE;
  1667 	    /*
  1668 	     * What looks like a typo may be a Roman numeral.
  1669 	     * Exclude these.
  1670 	     */
  1671 	    if (istypo && isroman(testword))
  1672 		istypo=FALSE;
  1673 	    /* Check the manual list of typos. */
  1674 	    if (!istypo)
  1675 		for (i=0;*typo[i];i++)
  1676 		    if (!strcmp(testword,typo[i]))
  1677 			istypo=TRUE;
  1678 	    /*
  1679 	     * Check lowercase s, l, i and m - special cases.
  1680 	     *   "j" - often a semi-colon gone wrong.
  1681 	     *   "d" for a missing apostrophe - he d
  1682 	     *   "n" for "in"
  1683 	     */
  1684 	    if (!istypo && len==1 &&
  1685 	      g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
  1686 		istypo=TRUE;
  1687 	    if (istypo)
  1688 	    {
  1689 		dupcnt=g_tree_lookup(qword,testword);
  1690 		if (dupcnt)
  1691 		{
  1692 		    (*dupcnt)++;
  1693 		    isdup=!pswit[VERBOSE_SWITCH];
  1694 		}
  1695 		else
  1696 		{
  1697 		    dupcnt=g_new0(int,1);
  1698 		    g_tree_insert(qword,g_strdup(testword),dupcnt);
  1699 		    isdup=FALSE;
  1700 		}
  1701 		if (!isdup)
  1702 		{
  1703 		    if (pswit[ECHO_SWITCH])
  1704 			g_print("\n%s\n",aline);
  1705 		    if (!pswit[OVERVIEW_SWITCH])
  1706 		    {
  1707 			g_print("    Line %ld column %ld - Query word %s",
  1708 			  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
  1709 			  inword);
  1710 			if (!pswit[VERBOSE_SWITCH])
  1711 			    g_print(" - not reporting duplicates");
  1712 			g_print("\n");
  1713 		    }
  1714 		    else
  1715 			cnt_word++;
  1716 		}
  1717 	    }
  1718 	}
  1719 	/* check the user's list of typos */
  1720 	if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
  1721 	{
  1722 	    if (pswit[ECHO_SWITCH])
  1723 		g_print("\n%s\n",aline);
  1724 	    if (!pswit[OVERVIEW_SWITCH])  
  1725 		g_print("    Line %ld column %ld - Query possible scanno %s\n",
  1726 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
  1727 	}
  1728 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
  1729 	    g_free(testword);
  1730 	if (pswit[PARANOID_SWITCH] && warnings->digit)
  1731 	{
  1732 	    /* In paranoid mode, query all 0 and 1 standing alone. */
  1733 	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))
  1734 	    {
  1735 		if (pswit[ECHO_SWITCH])
  1736 		    g_print("\n%s\n",aline);
  1737 		if (!pswit[OVERVIEW_SWITCH])
  1738 		    g_print("    Line %ld column %ld - Query standalone %s\n",
  1739 		      linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
  1740 		      inword);
  1741 		else
  1742 		    cnt_word++;
  1743 	    }
  1744 	}
  1745 	g_free(inword);
  1746     }
  1747 }
  1748 
  1749 /*
  1750  * check_for_misspaced_punctuation:
  1751  *
  1752  * Look for added or missing spaces around punctuation and quotes.
  1753  * If there is a punctuation character like ! with no space on
  1754  * either side, suspect a missing!space. If there are spaces on
  1755  * both sides , assume a typo. If we see a double quote with no
  1756  * space or punctuation on either side of it, assume unspaced
  1757  * quotes "like"this.
  1758  */
  1759 void check_for_misspaced_punctuation(const char *aline,
  1760   struct parities *parities,gboolean isemptyline)
  1761 {
  1762     gboolean isacro,isellipsis;
  1763     const char *s;
  1764     gunichar c,nc,pc,n2c;
  1765     c=g_utf8_get_char(aline);
  1766     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  1767     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  1768     {
  1769 	pc=c;
  1770 	c=nc;
  1771 	nc=g_utf8_get_char(g_utf8_next_char(s));
  1772 	/* For each character in the line after the first. */
  1773 	if (g_utf8_strchr(".?!,;:_",-1,c))  /* if it's punctuation */
  1774 	{
  1775 	    /* we need to suppress warnings for acronyms like M.D. */
  1776 	    isacro=FALSE;
  1777 	    /* we need to suppress warnings for ellipsis . . . */
  1778 	    isellipsis=FALSE;
  1779 	    /*
  1780 	     * If there are letters on both sides of it or
  1781 	     * if it's strict punctuation followed by an alpha.
  1782 	     */
  1783 	    if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
  1784 	      g_utf8_strchr("?!,;:",-1,c)))
  1785 	    {
  1786 		if (c=='.')
  1787 		{
  1788 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&
  1789 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
  1790 			isacro=TRUE;
  1791 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
  1792 		    if (nc && n2c=='.')
  1793 			isacro=TRUE;
  1794 		}
  1795 		if (!isacro)
  1796 		{
  1797 		    if (pswit[ECHO_SWITCH])
  1798 			g_print("\n%s\n",aline);
  1799 		    if (!pswit[OVERVIEW_SWITCH])
  1800 			g_print("    Line %ld column %ld - Missing space?\n",
  1801 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1802 		    else
  1803 			cnt_punct++;
  1804 		}
  1805 	    }
  1806 	    if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
  1807 	    {
  1808 		/*
  1809 		 * If there are spaces on both sides,
  1810 		 * or space before and end of line.
  1811 		 */
  1812 		if (c=='.')
  1813 		{
  1814 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&
  1815 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
  1816 			isellipsis=TRUE;
  1817 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
  1818 		    if (nc && n2c=='.')
  1819 			isellipsis=TRUE;
  1820 		}
  1821 		if (!isemptyline && !isellipsis)
  1822 		{
  1823 		    if (pswit[ECHO_SWITCH])
  1824 			g_print("\n%s\n",aline);
  1825 		    if (!pswit[OVERVIEW_SWITCH])
  1826 			g_print("    Line %ld column %ld - "
  1827 			  "Spaced punctuation?\n",linecnt,
  1828 			  g_utf8_pointer_to_offset(aline,s)+1);
  1829 		    else
  1830 			cnt_punct++;
  1831 		}
  1832 	    }
  1833 	}
  1834     }
  1835     /* Split out the characters that CANNOT be preceded by space. */
  1836     c=g_utf8_get_char(aline);
  1837     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  1838     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  1839     {
  1840 	pc=c;
  1841 	c=nc;
  1842 	nc=g_utf8_get_char(g_utf8_next_char(s));
  1843 	/* for each character in the line after the first */
  1844 	if (g_utf8_strchr("?!,;:",-1,c))
  1845 	{
  1846 	    /* if it's punctuation that _cannot_ have a space before it */
  1847 	    if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
  1848 	    {
  1849 		/*
  1850 		 * If nc DOES == space,
  1851 		 * it was already reported just above.
  1852 		 */
  1853 		if (pswit[ECHO_SWITCH])
  1854 		    g_print("\n%s\n",aline);
  1855 		if (!pswit[OVERVIEW_SWITCH])
  1856 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",
  1857 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1858 		else
  1859 		    cnt_punct++;
  1860 	    }
  1861 	}
  1862     }
  1863     /*
  1864      * Special case " .X" where X is any alpha.
  1865      * This plugs a hole in the acronym code above.
  1866      * Inelegant, but maintainable.
  1867      */
  1868     c=g_utf8_get_char(aline);
  1869     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  1870     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  1871     {
  1872 	pc=c;
  1873 	c=nc;
  1874 	nc=g_utf8_get_char(g_utf8_next_char(s));
  1875 	/* for each character in the line after the first */
  1876 	if (c=='.')
  1877 	{
  1878 	    /* if it's a period */
  1879 	    if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
  1880 	    {
  1881 		/*
  1882 		 * If the period follows a space and
  1883 		 * is followed by a letter.
  1884 		 */
  1885 		if (pswit[ECHO_SWITCH])
  1886 		    g_print("\n%s\n",aline);
  1887 		if (!pswit[OVERVIEW_SWITCH])
  1888 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",
  1889 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1890 		else
  1891 		    cnt_punct++;
  1892 	    }
  1893 	}
  1894     }
  1895     c=g_utf8_get_char(aline);
  1896     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  1897     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  1898     {
  1899 	pc=c;
  1900 	c=nc;
  1901 	nc=g_utf8_get_char(g_utf8_next_char(s));
  1902 	/* for each character in the line after the first */
  1903 	if (c==CHAR_DQUOTE)
  1904 	{
  1905 	    if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
  1906 	      !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
  1907 	      !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
  1908 	    {
  1909 		if (pswit[ECHO_SWITCH])
  1910 		    g_print("\n%s\n",aline);
  1911 		if (!pswit[OVERVIEW_SWITCH])
  1912 		    g_print("    Line %ld column %ld - Unspaced quotes?\n",
  1913 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1914 		else
  1915 		    cnt_punct++;
  1916 	    }
  1917 	}
  1918     }
  1919     /* Check parity of quotes. */
  1920     nc=g_utf8_get_char(aline);
  1921     for (s=aline;*s;s=g_utf8_next_char(s))
  1922     {
  1923 	c=nc;
  1924 	nc=g_utf8_get_char(g_utf8_next_char(s));
  1925 	if (c==CHAR_DQUOTE)
  1926 	{
  1927 	    parities->dquote=!parities->dquote;
  1928 	    if (!parities->dquote)
  1929 	    {
  1930 		/* parity even */
  1931 		if (!g_utf8_strchr("_-.'`/,;:!?)]} ",-1,nc))
  1932 		{
  1933 		    if (pswit[ECHO_SWITCH])
  1934 			g_print("\n%s\n",aline);
  1935 		    if (!pswit[OVERVIEW_SWITCH])
  1936 			g_print("    Line %ld column %ld - "
  1937 			  "Wrongspaced quotes?\n",
  1938 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1939 		    else
  1940 			cnt_punct++;
  1941 		}
  1942 	    }
  1943 	    else
  1944 	    {
  1945 		/* parity odd */
  1946 		if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
  1947 		  !g_utf8_strchr("_-/.'`([{$",-1,nc) || !nc)
  1948 		{
  1949 		    if (pswit[ECHO_SWITCH])
  1950 			g_print("\n%s\n",aline);
  1951 		    if (!pswit[OVERVIEW_SWITCH])
  1952 			g_print("    Line %ld column %ld - "
  1953 			  "Wrongspaced quotes?\n",
  1954 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1955 		    else
  1956 			cnt_punct++;
  1957 		}
  1958 	    }
  1959 	}
  1960     }
  1961     if (g_utf8_get_char(aline)==CHAR_DQUOTE)
  1962     {
  1963 	if (g_utf8_strchr(",;:!?)]} ",-1,
  1964 	  g_utf8_get_char(g_utf8_next_char(aline))))
  1965 	{
  1966 	    if (pswit[ECHO_SWITCH])
  1967 		g_print("\n%s\n",aline);
  1968 	    if (!pswit[OVERVIEW_SWITCH])
  1969 		g_print("    Line %ld column 1 - Wrongspaced quotes?\n",
  1970 		  linecnt);
  1971 	    else
  1972 		cnt_punct++;
  1973 	}
  1974     }
  1975     if (pswit[SQUOTE_SWITCH])
  1976     {
  1977 	nc=g_utf8_get_char(aline);
  1978 	for (s=aline;*s;s=g_utf8_next_char(s))
  1979 	{
  1980 	    c=nc;
  1981 	    nc=g_utf8_get_char(g_utf8_next_char(s));
  1982 	    if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&
  1983 	      !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
  1984 	      !g_unichar_isalpha(nc)))
  1985 	    {
  1986 		parities->squote=!parities->squote;
  1987 		if (!parities->squote)
  1988 		{
  1989 		    /* parity even */
  1990 		    if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
  1991 		    {
  1992 			if (pswit[ECHO_SWITCH])
  1993 			    g_print("\n%s\n",aline);
  1994 			if (!pswit[OVERVIEW_SWITCH])
  1995 			    g_print("    Line %ld column %ld - "
  1996 			      "Wrongspaced singlequotes?\n",
  1997 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1998 			else
  1999 			    cnt_punct++;
  2000 		    }
  2001 		}
  2002 		else
  2003 		{
  2004 		    /* parity odd */
  2005 		    if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
  2006 		      !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
  2007 		    {
  2008 			if (pswit[ECHO_SWITCH])
  2009 			    g_print("\n%s\n",aline);
  2010 			if (!pswit[OVERVIEW_SWITCH])
  2011 			    g_print("    Line %ld column %ld - "
  2012 			      "Wrongspaced singlequotes?\n",
  2013 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2014 			else
  2015 			    cnt_punct++;
  2016 		    }
  2017 		}
  2018 	    }
  2019 	}
  2020     }
  2021 }
  2022 
  2023 /*
  2024  * check_for_double_punctuation:
  2025  *
  2026  * Look for double punctuation like ,. or ,,
  2027  * Thanks to DW for the suggestion!
  2028  * In books with references, ".," and ".;" are common
  2029  * e.g. "etc., etc.," and vol. 1.; vol 3.;
  2030  * OTOH, from my initial tests, there are also fairly
  2031  * common errors. What to do? Make these cases paranoid?
  2032  * ".," is the most common, so warnings->dotcomma is used
  2033  * to suppress detailed reporting if it occurs often.
  2034  */
  2035 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
  2036 {
  2037     const char *s;
  2038     gunichar c,nc;
  2039     nc=g_utf8_get_char(aline);
  2040     for (s=aline;*s;s=g_utf8_next_char(s))
  2041     {
  2042 	c=nc;
  2043 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2044 	/* for each punctuation character in the line */
  2045 	if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
  2046 	  g_utf8_strchr(".?!,;:",-1,nc))
  2047 	{
  2048 	    /* followed by punctuation, it's a query, unless . . . */
  2049 	    if (c==nc && (c=='.' || c=='?' || c=='!') ||
  2050 	      !warnings->dotcomma && c=='.' && nc==',' ||
  2051 	      warnings->isFrench && g_str_has_prefix(s,",...") ||
  2052 	      warnings->isFrench && g_str_has_prefix(s,"...,") ||
  2053 	      warnings->isFrench && g_str_has_prefix(s,";...") ||
  2054 	      warnings->isFrench && g_str_has_prefix(s,"...;") ||
  2055 	      warnings->isFrench && g_str_has_prefix(s,":...") ||
  2056 	      warnings->isFrench && g_str_has_prefix(s,"...:") ||
  2057 	      warnings->isFrench && g_str_has_prefix(s,"!...") ||
  2058 	      warnings->isFrench && g_str_has_prefix(s,"...!") ||
  2059 	      warnings->isFrench && g_str_has_prefix(s,"?...") ||
  2060 	      warnings->isFrench && g_str_has_prefix(s,"...?"))
  2061 	    {
  2062 		if (warnings->isFrench && g_str_has_prefix(s,",...") ||
  2063 		  warnings->isFrench && g_str_has_prefix(s,"...,") ||
  2064 		  warnings->isFrench && g_str_has_prefix(s,";...") ||
  2065 		  warnings->isFrench && g_str_has_prefix(s,"...;") ||
  2066 		  warnings->isFrench && g_str_has_prefix(s,":...") ||
  2067 		  warnings->isFrench && g_str_has_prefix(s,"...:") ||
  2068 		  warnings->isFrench && g_str_has_prefix(s,"!...") ||
  2069 		  warnings->isFrench && g_str_has_prefix(s,"...!") ||
  2070 		  warnings->isFrench && g_str_has_prefix(s,"?...") ||
  2071 		  warnings->isFrench && g_str_has_prefix(s,"...?"))
  2072 		{
  2073 		    s+=4;
  2074 		    nc=g_utf8_get_char(g_utf8_next_char(s));
  2075 		}
  2076 		; /* do nothing for .. !! and ?? which can be legit */
  2077 	    }
  2078 	    else
  2079 	    {
  2080 		if (pswit[ECHO_SWITCH])
  2081 		    g_print("\n%s\n",aline);
  2082 		if (!pswit[OVERVIEW_SWITCH])
  2083 		    g_print("    Line %ld column %ld - Double punctuation?\n",
  2084 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2085 		else
  2086 		    cnt_punct++;
  2087 	    }
  2088 	}
  2089     }
  2090 }
  2091 
  2092 /*
  2093  * check_for_spaced_quotes:
  2094  */
  2095 void check_for_spaced_quotes(const char *aline)
  2096 {
  2097     int i;
  2098     const char *s,*t;
  2099     const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,
  2100       CHAR_RS_QUOTE};
  2101     GString *pattern;
  2102     s=aline;
  2103     while ((t=strstr(s," \" ")))
  2104     {
  2105 	if (pswit[ECHO_SWITCH])
  2106 	    g_print("\n%s\n",aline);
  2107 	if (!pswit[OVERVIEW_SWITCH])
  2108 	    g_print("    Line %ld column %ld - Spaced doublequote?\n",
  2109 	      linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  2110 	else
  2111 	    cnt_punct++;
  2112 	s=g_utf8_next_char(g_utf8_next_char(t));
  2113     }
  2114     pattern=g_string_new(NULL);
  2115     for(i=0;i<G_N_ELEMENTS(single_quotes);i++)
  2116     {
  2117 	g_string_assign(pattern," ");
  2118 	g_string_append_unichar(pattern,single_quotes[i]);
  2119 	g_string_append_c(pattern,' ');
  2120 	s=aline;
  2121 	while ((t=strstr(s,pattern->str)))
  2122 	{
  2123 	    if (pswit[ECHO_SWITCH])
  2124 		g_print("\n%s\n",aline);
  2125 	    if (!pswit[OVERVIEW_SWITCH])
  2126 		g_print("    Line %ld column %ld - Spaced singlequote?\n",
  2127 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  2128 	    else
  2129 		cnt_punct++;
  2130 	    s=g_utf8_next_char(g_utf8_next_char(t));
  2131 	}
  2132     }
  2133     g_string_free(pattern,TRUE);
  2134 }
  2135 
  2136 /*
  2137  * check_for_miscased_genative:
  2138  *
  2139  * Check special case of 'S instead of 's at end of word.
  2140  */
  2141 void check_for_miscased_genative(const char *aline)
  2142 {
  2143     const char *s;
  2144     gunichar c,nc,pc;
  2145     if (!*aline)
  2146 	return;
  2147     c=g_utf8_get_char(aline);
  2148     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2149     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2150     {
  2151 	pc=c;
  2152 	c=nc;
  2153 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2154 	if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))
  2155 	{
  2156 	    if (pswit[ECHO_SWITCH])
  2157 		g_print("\n%s\n",aline);
  2158 	    if (!pswit[OVERVIEW_SWITCH])
  2159 		g_print("    Line %ld column %ld - Capital \"S\"?\n",
  2160 		  linecnt,g_utf8_pointer_to_offset(aline,s)+2);
  2161 	    else
  2162 		cnt_punct++;
  2163 	}
  2164     }
  2165 }
  2166 
  2167 /*
  2168  * check_end_of_line:
  2169  *
  2170  * Now check special cases - start and end of line -
  2171  * for single and double quotes. Start is sometimes [sic]
  2172  * but better to query it anyway.
  2173  * While we're here, check for dash at end of line.
  2174  */
  2175 void check_end_of_line(const char *aline,struct warnings *warnings)
  2176 {
  2177     int lbytes;
  2178     const char *s;
  2179     gunichar c1,c2;
  2180     lbytes=strlen(aline);
  2181     if (g_utf8_strlen(aline,lbytes)>1)
  2182     {
  2183 	s=g_utf8_prev_char(aline+lbytes);
  2184 	c1=g_utf8_get_char(s);
  2185 	c2=g_utf8_get_char(g_utf8_prev_char(s));
  2186 	if ((c1==CHAR_DQUOTE || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)
  2187 	{
  2188 	    if (pswit[ECHO_SWITCH])
  2189 		g_print("\n%s\n",aline);
  2190 	    if (!pswit[OVERVIEW_SWITCH])
  2191 		g_print("    Line %ld column %ld - Spaced quote?\n",linecnt,
  2192 		  g_utf8_strlen(aline,lbytes));
  2193 	    else
  2194 		cnt_punct++;
  2195 	}
  2196 	c1=g_utf8_get_char(aline);
  2197 	c2=g_utf8_get_char(g_utf8_next_char(aline));
  2198 	if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)
  2199 	{
  2200 	    if (pswit[ECHO_SWITCH])
  2201 		g_print("\n%s\n",aline);
  2202 	    if (!pswit[OVERVIEW_SWITCH])
  2203 		g_print("    Line %ld column 1 - Spaced quote?\n",linecnt);
  2204 	    else
  2205 		cnt_punct++;
  2206 	}
  2207 	/*
  2208 	 * Dash at end of line may well be legit - paranoid mode only
  2209 	 * and don't report em-dash at line-end.
  2210 	 */
  2211 	if (pswit[PARANOID_SWITCH] && warnings->hyphen)
  2212 	{
  2213 	    for (s=g_utf8_prev_char(aline+lbytes);
  2214 	      s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
  2215 		;
  2216 	    if (g_utf8_get_char(s)=='-' &&
  2217 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')
  2218 	    {
  2219 		if (pswit[ECHO_SWITCH])
  2220 		    g_print("\n%s\n",aline);
  2221 		if (!pswit[OVERVIEW_SWITCH])
  2222 		    g_print("    Line %ld column %ld - "
  2223 		      "Hyphen at end of line?\n",
  2224 		      linecnt,g_utf8_pointer_to_offset(aline,s));
  2225 	    }
  2226 	}
  2227     }
  2228 }
  2229 
  2230 /*
  2231  * check_for_unspaced_bracket:
  2232  *
  2233  * Brackets are often unspaced, but shouldn't be surrounded by alpha.
  2234  * If so, suspect a scanno like "a]most".
  2235  */
  2236 void check_for_unspaced_bracket(const char *aline)
  2237 {
  2238     const char *s;
  2239     gunichar c,nc,pc;
  2240     c=g_utf8_get_char(aline);
  2241     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2242     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2243     {
  2244 	pc=c;
  2245 	c=nc;
  2246 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2247 	if (!nc)
  2248 	    break;
  2249 	/* for each bracket character in the line except 1st & last */
  2250 	if (g_utf8_strchr("{[()]}",-1,c) &&
  2251 	  g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
  2252 	{
  2253 	    if (pswit[ECHO_SWITCH])
  2254 		g_print("\n%s\n",aline);
  2255 	    if (!pswit[OVERVIEW_SWITCH])
  2256 		g_print("    Line %ld column %ld - Unspaced bracket?\n",
  2257 		  linecnt,g_utf8_pointer_to_offset(aline,s));
  2258 	    else
  2259 		cnt_punct++;
  2260 	}
  2261     }
  2262 }
  2263 
  2264 /*
  2265  * check_for_unpunctuated_endquote:
  2266  */
  2267 void check_for_unpunctuated_endquote(const char *aline)
  2268 {
  2269     const char *s;
  2270     gunichar c,nc,pc;
  2271     c=g_utf8_get_char(aline);
  2272     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2273     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2274     {
  2275 	pc=c;
  2276 	c=nc;
  2277 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2278 	/* for each character in the line except 1st */
  2279 	if (c==CHAR_DQUOTE && isalpha(pc))
  2280 	{
  2281 	    if (pswit[ECHO_SWITCH])
  2282 		g_print("\n%s\n",aline);
  2283 	    if (!pswit[OVERVIEW_SWITCH])
  2284 		g_print("    Line %ld column %ld - "
  2285 		  "endquote missing punctuation?\n",
  2286 		  linecnt,g_utf8_pointer_to_offset(aline,s));
  2287 	    else
  2288 		cnt_punct++;
  2289 	}
  2290     }
  2291 }
  2292 
  2293 /*
  2294  * check_for_html_tag:
  2295  *
  2296  * Check for <HTML TAG>.
  2297  *
  2298  * If there is a < in the line, followed at some point
  2299  * by a > then we suspect HTML.
  2300  */
  2301 void check_for_html_tag(const char *aline)
  2302 {
  2303     const char *open,*close;
  2304     gchar *tag;
  2305     open=strchr(aline,'<');
  2306     if (open)
  2307     {
  2308 	close=strchr(g_utf8_next_char(open),'>');
  2309 	if (close)
  2310 	{
  2311 	    if (pswit[ECHO_SWITCH])
  2312 		g_print("\n%s\n",aline);
  2313 	    if (!pswit[OVERVIEW_SWITCH])
  2314 	    {
  2315 		tag=g_strndup(open,close-open+1);
  2316 		g_print("    Line %ld column %ld - HTML Tag? %s \n",
  2317 		  linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
  2318 		g_free(tag);
  2319 	    }
  2320 	    else
  2321 		cnt_html++;
  2322 	}
  2323     }
  2324 }
  2325 
  2326 /*
  2327  * check_for_html_entity:
  2328  *
  2329  * Check for &symbol; HTML.
  2330  *
  2331  * If there is a & in the line, followed at
  2332  * some point by a ; then we suspect HTML.
  2333  */
  2334 void check_for_html_entity(const char *aline)
  2335 {
  2336     const char *s,*amp,*scolon;
  2337     gchar *entity;
  2338     amp=strchr(aline,'&');
  2339     if (amp)
  2340     {
  2341 	scolon=strchr(amp,';');
  2342 	if (scolon)
  2343 	{
  2344 	    for (s=amp;s<scolon;s=g_utf8_next_char(s))   
  2345 		if (g_utf8_get_char(s)==CHAR_SPACE)
  2346 		    break;		/* Don't report "Jones & Son;" */
  2347 	    if (s>=scolon)
  2348 	    {
  2349 		if (pswit[ECHO_SWITCH])
  2350 		    g_print("\n%s\n",aline);
  2351 		if (!pswit[OVERVIEW_SWITCH])
  2352 		{
  2353 		    entity=g_strndup(amp,scolon-amp+1);
  2354 		    g_print("    Line %ld column %d - HTML symbol? %s \n",
  2355 		      linecnt,(int)(amp-aline)+1,entity);
  2356 		    g_free(entity);
  2357 		}
  2358 		else
  2359 		    cnt_html++;
  2360 	    }
  2361 	}
  2362     }
  2363 }
  2364 
  2365 /*
  2366  * print_pending:
  2367  *
  2368  * If we are in a state of unbalanced quotes, and this line
  2369  * doesn't begin with a quote, output the stored error message.
  2370  * If the -P switch was used, print the warning even if the
  2371  * new para starts with quotes.
  2372  */
  2373 void print_pending(const char *aline,const char *parastart,
  2374   struct pending *pending)
  2375 {
  2376     const char *s;
  2377     gunichar c;
  2378     s=aline;
  2379     while (*s==' ')
  2380 	s++;
  2381     c=g_utf8_get_char(s);
  2382     if (pending->dquote)
  2383     {
  2384 	if (c!=CHAR_DQUOTE || pswit[QPARA_SWITCH])
  2385 	{
  2386 	    if (!pswit[OVERVIEW_SWITCH])
  2387 	    {
  2388 		if (pswit[ECHO_SWITCH])
  2389 		    g_print("\n%s\n",parastart);
  2390 		g_print("%s\n",pending->dquote);
  2391 	    }
  2392 	    else
  2393 		cnt_dquot++;
  2394 	}
  2395 	g_free(pending->dquote);
  2396 	pending->dquote=NULL;
  2397     }
  2398     if (pending->squote)
  2399     {
  2400 	if (!CHAR_IS_SQUOTE(c) || pswit[QPARA_SWITCH] || pending->squot)
  2401 	{
  2402 	    if (!pswit[OVERVIEW_SWITCH])
  2403 	    {
  2404 		if (pswit[ECHO_SWITCH])
  2405 		    g_print("\n%s\n",parastart);
  2406 		g_print("%s\n",pending->squote);
  2407 	    }
  2408 	    else
  2409 		cnt_squot++;
  2410 	}
  2411 	g_free(pending->squote);
  2412 	pending->squote=NULL;
  2413     }
  2414     if (pending->rbrack)
  2415     {
  2416 	if (!pswit[OVERVIEW_SWITCH])
  2417 	{
  2418 	    if (pswit[ECHO_SWITCH])
  2419 		g_print("\n%s\n",parastart);
  2420 	    g_print("%s\n",pending->rbrack);
  2421 	}
  2422 	else
  2423 	    cnt_brack++;
  2424 	g_free(pending->rbrack);
  2425 	pending->rbrack=NULL;
  2426     }
  2427     if (pending->sbrack)
  2428     {
  2429 	if (!pswit[OVERVIEW_SWITCH])
  2430 	{
  2431 	    if (pswit[ECHO_SWITCH])
  2432 		g_print("\n%s\n",parastart);
  2433 	    g_print("%s\n",pending->sbrack);
  2434 	}
  2435 	else
  2436 	    cnt_brack++;
  2437 	g_free(pending->sbrack);
  2438 	pending->sbrack=NULL;
  2439     }
  2440     if (pending->cbrack)
  2441     {
  2442 	if (!pswit[OVERVIEW_SWITCH])
  2443 	{
  2444 	    if (pswit[ECHO_SWITCH])
  2445 		g_print("\n%s\n",parastart);
  2446 	    g_print("%s\n",pending->cbrack);
  2447 	}
  2448 	else
  2449 	    cnt_brack++;
  2450 	g_free(pending->cbrack);
  2451 	pending->cbrack=NULL;
  2452     }
  2453     if (pending->unders)
  2454     {
  2455 	if (!pswit[OVERVIEW_SWITCH])
  2456 	{
  2457 	    if (pswit[ECHO_SWITCH])
  2458 		g_print("\n%s\n",parastart);
  2459 	    g_print("%s\n",pending->unders);
  2460 	}
  2461 	else
  2462 	    cnt_brack++;
  2463 	g_free(pending->unders);
  2464 	pending->unders=NULL;
  2465     }
  2466 }
  2467 
  2468 /*
  2469  * check_for_mismatched_quotes:
  2470  *
  2471  * At end of paragraph, check for mismatched quotes.
  2472  *
  2473  * We don't want to report an error immediately, since it is a
  2474  * common convention to omit the quotes at end of paragraph if
  2475  * the next paragraph is a continuation of the same speaker.
  2476  * Where this is the case, the next para should begin with a
  2477  * quote, so we store the warning message and only display it
  2478  * at the top of the next iteration if the new para doesn't
  2479  * start with a quote.
  2480  * The -p switch overrides this default, and warns of unclosed
  2481  * quotes on _every_ paragraph, whether the next begins with a
  2482  * quote or not.
  2483  */
  2484 void check_for_mismatched_quotes(const struct counters *counters,
  2485   struct pending *pending)
  2486 {
  2487     int squote_straight,squote_curved;
  2488     if (counters->quot%2)
  2489 	pending->dquote=
  2490 	  g_strdup_printf("    Line %ld - Mismatched quotes",linecnt);
  2491     if (pswit[SQUOTE_SWITCH])
  2492     {
  2493 	if (matching_count(counters,CHAR_SQUOTE,TRUE))
  2494 	    squote_straight=matching_difference(counters,CHAR_SQUOTE);
  2495 	else
  2496 	    squote_straight=0;
  2497 	if (matching_count(counters,CHAR_LS_QUOTE,TRUE))
  2498 	    squote_curved=matching_difference(counters,CHAR_LS_QUOTE);
  2499 	else
  2500 	    squote_curved=0;
  2501 	if (squote_straight || squote_curved)
  2502 	    pending->squote=
  2503 	      g_strdup_printf("    Line %ld - Mismatched singlequotes?",
  2504 	      linecnt);
  2505 	if (squote_straight && squote_straight!=1 ||
  2506 	  squote_curved && squote_curved!=1)
  2507 	    /*
  2508 	     * Flag it to be noted regardless of the
  2509 	     * first char of the next para.
  2510 	     */
  2511 	    pending->squot=1;
  2512     }
  2513     if (matching_difference(counters,CHAR_OPEN_RBRACK))
  2514 	pending->rbrack=
  2515 	  g_strdup_printf("    Line %ld - Mismatched round brackets?",linecnt);
  2516     if (matching_difference(counters,CHAR_OPEN_SBRACK))
  2517 	pending->sbrack=
  2518 	  g_strdup_printf("    Line %ld - Mismatched square brackets?",linecnt);
  2519     if (matching_difference(counters,CHAR_OPEN_CBRACK))
  2520 	pending->cbrack=
  2521 	  g_strdup_printf("    Line %ld - Mismatched curly brackets?",linecnt);
  2522     if (counters->c_unders%2)
  2523 	pending->unders=
  2524 	  g_strdup_printf("    Line %ld - Mismatched underscores?",linecnt);
  2525 }
  2526 
  2527 /*
  2528  * check_for_omitted_punctuation:
  2529  *
  2530  * Check for omitted punctuation at end of paragraph by working back
  2531  * through prevline. DW.
  2532  * Need to check this only for "normal" paras.
  2533  * So what is a "normal" para?
  2534  *    Not normal if one-liner (chapter headings, etc.)
  2535  *    Not normal if doesn't contain at least one locase letter
  2536  *    Not normal if starts with space
  2537  */
  2538 void check_for_omitted_punctuation(const char *prevline,
  2539   struct line_properties *last,int start_para_line)
  2540 {
  2541     gboolean letter_on_line=FALSE;
  2542     const char *s;
  2543     gunichar c;
  2544     for (s=prevline;*s;s=g_utf8_next_char(s))
  2545 	if (g_unichar_isalpha(g_utf8_get_char(s)))
  2546 	{
  2547 	    letter_on_line=TRUE;
  2548 	    break;
  2549 	}
  2550     /*
  2551      * This next "if" is a problem.
  2552      * If we say "start_para_line <= linecnt - 1", that includes
  2553      * one-line "paragraphs" like chapter heads. Lotsa false positives.
  2554      * If we say "start_para_line < linecnt - 1" it doesn't, but then it
  2555      * misses genuine one-line paragraphs.
  2556      */
  2557     if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
  2558       g_utf8_get_char(prevline)>CHAR_SPACE)
  2559     {
  2560 	s=prevline+strlen(prevline);
  2561 	do
  2562 	{
  2563 	    s=g_utf8_prev_char(s);
  2564 	    c=g_utf8_get_char(s);
  2565 	} while (CHAR_IS_CLOSING_QUOTE(c) && c>CHAR_SPACE && s>prevline);
  2566 	for (;s>prevline;s=g_utf8_prev_char(s))
  2567 	{
  2568 	    if (g_unichar_isalpha(g_utf8_get_char(s)))
  2569 	    {
  2570 		if (pswit[ECHO_SWITCH])
  2571 		    g_print("\n%s\n",prevline);
  2572 		if (!pswit[OVERVIEW_SWITCH])
  2573 		    g_print("    Line %ld column %ld - "
  2574 		      "No punctuation at para end?\n",
  2575 		      linecnt-1,g_utf8_strlen(prevline,-1));
  2576 		else
  2577 		    cnt_punct++;
  2578 		break;
  2579 	    }
  2580 	    if (g_utf8_strchr("-.:!([{?}])",-1,g_utf8_get_char(s)))
  2581 		break;
  2582 	}
  2583     }
  2584 }
  2585 
  2586 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
  2587 {
  2588     const char *word=key;
  2589     int *dupcnt=value;
  2590     if (*dupcnt)
  2591 	g_print("\nNote: Queried word %s was duplicated %d times\n",
  2592 	  word,*dupcnt);
  2593     return FALSE;
  2594 }
  2595 
  2596 void print_as_windows_1252(const char *string)
  2597 {
  2598     gsize inbytes,outbytes;
  2599     gchar *buf,*bp;
  2600     static GIConv converter=(GIConv)-1;
  2601     if (!string)
  2602     {
  2603 	if (converter!=(GIConv)-1)
  2604 	    g_iconv_close(converter);
  2605 	converter=(GIConv)-1;
  2606 	return;
  2607     }
  2608     if (converter==(GIConv)-1)
  2609 	converter=g_iconv_open("WINDOWS-1252","UTF-8");
  2610     if (converter!=(GIConv)-1)
  2611     {
  2612 	inbytes=outbytes=strlen(string);
  2613 	bp=buf=g_malloc(outbytes+1);
  2614 	g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
  2615 	*bp='\0';
  2616 	fputs(buf,stdout);
  2617 	g_free(buf);
  2618     }
  2619     else
  2620 	fputs(string,stdout);
  2621 }
  2622 
  2623 void print_as_utf_8(const char *string)
  2624 {
  2625     fputs(string,stdout);
  2626 }
  2627 
  2628 /*
  2629  * procfile:
  2630  *
  2631  * Process one file.
  2632  */
  2633 void procfile(const char *filename)
  2634 {
  2635     const char *s;
  2636     gchar *parastart=NULL;	/* first line of current para */
  2637     gchar *etext,*aline;
  2638     gchar *etext_ptr;
  2639     GError *err=NULL;
  2640     struct first_pass_results *first_pass_results;
  2641     struct warnings *warnings;
  2642     struct counters counters={0};
  2643     struct line_properties last={0};
  2644     struct parities parities={0};
  2645     struct pending pending={0};
  2646     gboolean isemptyline;
  2647     long start_para_line=0;
  2648     gboolean isnewpara=FALSE,enddash=FALSE;
  2649     last.start=CHAR_SPACE;
  2650     linecnt=checked_linecnt=0;
  2651     etext=read_etext(filename,&err);
  2652     if (!etext)
  2653     {
  2654 	if (pswit[STDOUT_SWITCH])
  2655 	    fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
  2656 	else
  2657 	    fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
  2658 	exit(1);
  2659     }
  2660     g_print("\n\nFile: %s\n\n",filename);
  2661     first_pass_results=first_pass(etext);
  2662     warnings=report_first_pass(first_pass_results);
  2663     qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
  2664     qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
  2665     /*
  2666      * Here we go with the main pass. Hold onto yer hat!
  2667      */
  2668     linecnt=0;
  2669     etext_ptr=etext;
  2670     while ((aline=flgets(&etext_ptr,linecnt+1)))
  2671     {
  2672 	linecnt++;
  2673 	if (linecnt==1)
  2674 	    isnewpara=TRUE;
  2675 	if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
  2676 	    continue;    // skip DP page separators completely
  2677 	if (linecnt<first_pass_results->firstline ||
  2678 	  (first_pass_results->footerline>0 &&
  2679 	  linecnt>first_pass_results->footerline))
  2680 	{
  2681 	    if (pswit[HEADER_SWITCH])
  2682 	    {
  2683 		if (g_str_has_prefix(aline,"Title:"))
  2684 		    g_print("    %s\n",aline);
  2685 		if (g_str_has_prefix(aline,"Author:"))
  2686 		    g_print("    %s\n",aline);
  2687 		if (g_str_has_prefix(aline,"Release Date:"))
  2688 		    g_print("    %s\n",aline);
  2689 		if (g_str_has_prefix(aline,"Edition:"))
  2690 		    g_print("    %s\n\n",aline);
  2691 	    }
  2692 	    continue;		/* skip through the header */
  2693 	}
  2694 	checked_linecnt++;
  2695 	print_pending(aline,parastart,&pending);
  2696 	memset(&pending,0,sizeof(pending));
  2697 	isemptyline=analyse_quotes(aline,&counters);
  2698 	if (isnewpara && !isemptyline)
  2699 	{
  2700 	    /* This line is the start of a new paragraph. */
  2701 	    start_para_line=linecnt;
  2702 	    /* Capture its first line in case we want to report it later. */
  2703 	    g_free(parastart);
  2704 	    parastart=g_strdup(aline);
  2705 	    memset(&parities,0,sizeof(parities));  /* restart the quote count */
  2706 	    s=aline;
  2707 	    while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
  2708 	      !g_unichar_isdigit(g_utf8_get_char(s)))
  2709 		s=g_utf8_next_char(s);
  2710 	    if (g_unichar_islower(g_utf8_get_char(s)))
  2711 	    {
  2712 		/* and its first letter is lowercase */
  2713 		if (pswit[ECHO_SWITCH])
  2714 		    g_print("\n%s\n",aline);
  2715 		if (!pswit[OVERVIEW_SWITCH])
  2716 		    g_print("    Line %ld column %ld - "
  2717 		      "Paragraph starts with lower-case\n",
  2718 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2719 		else
  2720 		    cnt_punct++;
  2721 	    }
  2722 	    isnewpara=FALSE; /* Signal the end of new para processing. */
  2723 	}
  2724 	/* Check for an em-dash broken at line end. */
  2725 	if (enddash && g_utf8_get_char(aline)=='-')
  2726 	{
  2727 	    if (pswit[ECHO_SWITCH])
  2728 		g_print("\n%s\n",aline);
  2729 	    if (!pswit[OVERVIEW_SWITCH])
  2730 		g_print("    Line %ld column 1 - Broken em-dash?\n",linecnt);
  2731 	    else
  2732 		cnt_punct++;
  2733 	}
  2734 	enddash=FALSE;
  2735 	for (s=g_utf8_prev_char(aline+strlen(aline));
  2736 	  g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
  2737 	    ;
  2738 	if (s>=aline && g_utf8_get_char(s)=='-')
  2739 	    enddash=TRUE;
  2740 	check_for_control_characters(aline);
  2741 	if (warnings->bin)
  2742 	    check_for_odd_characters(aline,warnings,isemptyline);
  2743 	if (warnings->longline)
  2744 	    check_for_long_line(aline);
  2745 	if (warnings->shortline)
  2746 	    check_for_short_line(aline,&last);
  2747 	last.blen=last.len;
  2748 	last.len=g_utf8_strlen(aline,-1);
  2749 	last.start=g_utf8_get_char(aline);
  2750 	check_for_starting_punctuation(aline);
  2751 	if (warnings->dash)
  2752 	{
  2753 	    check_for_spaced_emdash(aline);
  2754 	    check_for_spaced_dash(aline);
  2755 	}
  2756 	check_for_unmarked_paragraphs(aline);
  2757 	check_for_jeebies(aline);
  2758 	check_for_mta_from(aline);
  2759 	check_for_orphan_character(aline);
  2760 	check_for_pling_scanno(aline);
  2761 	check_for_extra_period(aline,warnings);
  2762 	check_for_following_punctuation(aline);
  2763 	check_for_typos(aline,warnings);
  2764 	check_for_misspaced_punctuation(aline,&parities,isemptyline);
  2765 	check_for_double_punctuation(aline,warnings);
  2766 	check_for_spaced_quotes(aline);
  2767 	check_for_miscased_genative(aline);
  2768 	check_end_of_line(aline,warnings);
  2769 	check_for_unspaced_bracket(aline);
  2770 	if (warnings->endquote)
  2771 	    check_for_unpunctuated_endquote(aline);
  2772 	check_for_html_tag(aline);
  2773 	check_for_html_entity(aline);
  2774 	if (isemptyline)
  2775 	{
  2776 	    check_for_mismatched_quotes(&counters,&pending);
  2777 	    memset(&counters,0,sizeof(counters));
  2778 	    /* let the next iteration know that it's starting a new para */
  2779 	    isnewpara=TRUE;
  2780 	    if (prevline)
  2781 		check_for_omitted_punctuation(prevline,&last,start_para_line);
  2782 	}
  2783 	g_free(prevline);
  2784 	prevline=g_strdup(aline);
  2785     }
  2786     if (prevline)
  2787     {
  2788 	g_free(prevline);
  2789 	prevline=NULL;
  2790     }
  2791     g_free(parastart);
  2792     g_free(prevline);
  2793     g_free(etext);
  2794     if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])
  2795 	g_tree_foreach(qword,report_duplicate_queries,NULL);
  2796     g_tree_unref(qword);
  2797     g_tree_unref(qperiod);
  2798     counters_destroy(&counters);
  2799     g_set_print_handler(NULL);
  2800     print_as_windows_1252(NULL);
  2801     if (pswit[MARKUP_SWITCH])  
  2802 	loseentities(NULL);
  2803 }
  2804 
  2805 /*
  2806  * flgets:
  2807  *
  2808  * Get one line from the input text, checking for
  2809  * the existence of exactly one CR/LF line-end per line.
  2810  *
  2811  * Returns: a pointer to the line.
  2812  */
  2813 char *flgets(char **etext,long lcnt)
  2814 {
  2815     gunichar c;
  2816     gboolean isCR=FALSE;
  2817     char *theline=*etext;
  2818     char *eos=theline;
  2819     gchar *s;
  2820     for (;;)
  2821     {
  2822 	c=g_utf8_get_char(*etext);
  2823 	*etext=g_utf8_next_char(*etext);
  2824 	if (!c)
  2825 	    return NULL;
  2826 	/* either way, it's end of line */
  2827 	if (c=='\n')
  2828 	{
  2829 	    if (isCR)
  2830 		break;
  2831 	    else
  2832 	    {
  2833 		/* Error - a LF without a preceding CR */
  2834 		if (pswit[LINE_END_SWITCH])
  2835 		{
  2836 		    if (pswit[ECHO_SWITCH])
  2837 		    {
  2838 			s=g_strndup(theline,eos-theline);
  2839 			g_print("\n%s\n",s);
  2840 			g_free(s);
  2841 		    }
  2842 		    if (!pswit[OVERVIEW_SWITCH])
  2843 			g_print("    Line %ld - No CR?\n",lcnt);
  2844 		    else
  2845 			cnt_lineend++;
  2846 		}
  2847 		break;
  2848 	    }
  2849 	}
  2850 	if (c=='\r')
  2851 	{
  2852 	    if (isCR)
  2853 	    {
  2854 		/* Error - two successive CRs */
  2855 		if (pswit[LINE_END_SWITCH])
  2856 		{
  2857 		    if (pswit[ECHO_SWITCH])
  2858 		    {
  2859 			s=g_strndup(theline,eos-theline);
  2860 			g_print("\n%s\n",s);
  2861 			g_free(s);
  2862 		    }
  2863 		    if (!pswit[OVERVIEW_SWITCH])
  2864 			g_print("    Line %ld - Two successive CRs?\n",lcnt);
  2865 		    else
  2866 			cnt_lineend++;
  2867 		}
  2868 	    }
  2869 	    isCR=TRUE;
  2870 	}
  2871 	else
  2872 	{
  2873 	    if (pswit[LINE_END_SWITCH] && isCR)
  2874 	    {
  2875 		if (pswit[ECHO_SWITCH])
  2876 		{
  2877 		    s=g_strndup(theline,eos-theline);
  2878 		    g_print("\n%s\n",s);
  2879 		    g_free(s);
  2880 		}
  2881 		if (!pswit[OVERVIEW_SWITCH])
  2882 		    g_print("    Line %ld column %ld - CR without LF?\n",
  2883 		      lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
  2884 		else
  2885 		    cnt_lineend++;
  2886 		*eos=' ';
  2887 	    }
  2888 	    isCR=FALSE;
  2889 	    eos=g_utf8_next_char(eos);
  2890 	}
  2891     }
  2892     *eos='\0';
  2893     if (pswit[MARKUP_SWITCH])  
  2894 	postprocess_for_HTML(theline);
  2895     if (pswit[DP_SWITCH])  
  2896 	postprocess_for_DP(theline);
  2897     return theline;
  2898 }
  2899 
  2900 /*
  2901  * mixdigit:
  2902  *
  2903  * Takes a "word" as a parameter, and checks whether it
  2904  * contains a mixture of alpha and digits. Generally, this is an
  2905  * error, but may not be for cases like 4th or L5 12s. 3d.
  2906  *
  2907  * Returns: TRUE iff an is error found.
  2908  */
  2909 gboolean mixdigit(const char *checkword)
  2910 {
  2911     gboolean wehaveadigit,wehavealetter,query;
  2912     const char *s,*nondigit;
  2913     wehaveadigit=wehavealetter=query=FALSE;
  2914     for (s=checkword;*s;s=g_utf8_next_char(s))
  2915 	if (g_unichar_isalpha(g_utf8_get_char(s)))
  2916 	    wehavealetter=TRUE;
  2917 	else if (g_unichar_isdigit(g_utf8_get_char(s)))
  2918 	    wehaveadigit=TRUE;
  2919     if (wehaveadigit && wehavealetter)
  2920     {
  2921 	/* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
  2922 	query=TRUE;
  2923 	for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
  2924 	  nondigit=g_utf8_next_char(nondigit))
  2925 	    ;
  2926 	/* digits, ending in st, rd, nd, th of either case */
  2927 	if (!g_ascii_strcasecmp(nondigit,"st") ||
  2928 	  !g_ascii_strcasecmp(nondigit,"rd") ||
  2929 	  !g_ascii_strcasecmp(nondigit,"nd") ||
  2930 	  !g_ascii_strcasecmp(nondigit,"th"))
  2931 	    query=FALSE;
  2932 	if (!g_ascii_strcasecmp(nondigit,"sts") ||
  2933 	  !g_ascii_strcasecmp(nondigit,"rds") ||
  2934 	  !g_ascii_strcasecmp(nondigit,"nds") ||
  2935 	  !g_ascii_strcasecmp(nondigit,"ths"))
  2936 	    query=FALSE;
  2937 	if (!g_ascii_strcasecmp(nondigit,"stly") ||
  2938 	  !g_ascii_strcasecmp(nondigit,"rdly") ||
  2939 	  !g_ascii_strcasecmp(nondigit,"ndly") ||
  2940 	  !g_ascii_strcasecmp(nondigit,"thly"))
  2941 	    query=FALSE;
  2942 	/* digits, ending in l, L, s or d */
  2943 	if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
  2944 	  !strcmp(nondigit,"d"))
  2945 	    query=FALSE;
  2946 	/*
  2947 	 * L at the start of a number, representing Britsh pounds, like L500.
  2948 	 * This is cute. We know the current word is mixed digit. If the first
  2949 	 * letter is L, there must be at least one digit following. If both
  2950 	 * digits and letters follow, we have a genuine error, else we have a
  2951 	 * capital L followed by digits, and we accept that as a non-error.
  2952 	 */
  2953 	if (g_utf8_get_char(checkword)=='L' &&
  2954 	  !mixdigit(g_utf8_next_char(checkword)))
  2955 	    query=FALSE;
  2956     }
  2957     return query;
  2958 }
  2959 
  2960 /*
  2961  * getaword:
  2962  *
  2963  * Extracts the first/next "word" from the line, and returns it.
  2964  * A word is defined as one English word unit--or at least that's the aim.
  2965  * "ptr" is advanced to the position in the line where we will start
  2966  * looking for the next word.
  2967  *
  2968  * Returns: A newly-allocated string.
  2969  */
  2970 gchar *getaword(const char **ptr)
  2971 {
  2972     const char *s,*t;
  2973     GString *word;
  2974     gunichar c,pc;
  2975     word=g_string_new(NULL);
  2976     for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
  2977       !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
  2978       **ptr;*ptr=g_utf8_next_char(*ptr))
  2979 	;
  2980     /*
  2981      * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
  2982      * Especially yucky is the case of L1,000
  2983      * This section looks for a pattern of characters including a digit
  2984      * followed by a comma or period followed by one or more digits.
  2985      * If found, it returns this whole pattern as a word; otherwise we discard
  2986      * the results and resume our normal programming.
  2987      */
  2988     s=*ptr;
  2989     for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
  2990       g_unichar_isalpha(g_utf8_get_char(s)) ||
  2991       g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
  2992 	g_string_append_unichar(word,g_utf8_get_char(s));
  2993     if (word->len)
  2994     {
  2995 	for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))
  2996 	{
  2997 	    c=g_utf8_get_char(t);
  2998 	    pc=g_utf8_get_char(g_utf8_prev_char(t));
  2999 	    if ((c=='.' || c==',') && g_unichar_isdigit(pc))
  3000 	    {
  3001 		*ptr=s;
  3002 		return g_string_free(word,FALSE);
  3003 	    }
  3004 	}
  3005     }
  3006     /* we didn't find a punctuated number - do the regular getword thing */
  3007     g_string_truncate(word,0);
  3008     for (;g_unichar_isdigit(g_utf8_get_char(*ptr)) ||
  3009       g_unichar_isalpha(g_utf8_get_char(*ptr)) ||
  3010       g_utf8_get_char(*ptr)=='\'';*ptr=g_utf8_next_char(*ptr))
  3011 	g_string_append_unichar(word,g_utf8_get_char(*ptr));
  3012     return g_string_free(word,FALSE);
  3013 }
  3014 
  3015 /*
  3016  * isroman:
  3017  *
  3018  * Is this word a Roman Numeral?
  3019  *
  3020  * It doesn't actually validate that the number is a valid Roman Numeral--for
  3021  * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
  3022  * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
  3023  * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
  3024  * expressions thereof, except when it came to taxes. Allow any number of M,
  3025  * an optional D, an optional CM or CD, any number of optional Cs, an optional
  3026  * XL or an optional XC, an optional IX or IV, an optional V and any number
  3027  * of optional Is.
  3028  */
  3029 gboolean isroman(const char *t)
  3030 {
  3031     const char *s;
  3032     if (!t || !*t)
  3033 	return FALSE;
  3034     s=t;
  3035     while (g_utf8_get_char(t)=='m' && *t)
  3036 	t++;
  3037     if (g_utf8_get_char(t)=='d')
  3038 	t++;
  3039     if (g_str_has_prefix(t,"cm"))
  3040 	t+=2;
  3041     if (g_str_has_prefix(t,"cd"))
  3042 	t+=2;
  3043     while (g_utf8_get_char(t)=='c' && *t)
  3044 	t++;
  3045     if (g_str_has_prefix(t,"xl"))
  3046 	t+=2;
  3047     if (g_str_has_prefix(t,"xc"))
  3048 	t+=2;
  3049     if (g_utf8_get_char(t)=='l')
  3050 	t++;
  3051     while (g_utf8_get_char(t)=='x' && *t)
  3052 	t++;
  3053     if (g_str_has_prefix(t,"ix"))
  3054 	t+=2;
  3055     if (g_str_has_prefix(t,"iv"))
  3056 	t+=2;
  3057     if (g_utf8_get_char(t)=='v')
  3058 	t++;
  3059     while (g_utf8_get_char(t)=='i' && *t)
  3060 	t++;
  3061     return !*t;
  3062 }
  3063 
  3064 /*
  3065  * postprocess_for_DP:
  3066  *
  3067  * Invoked with the -d switch from flgets().
  3068  * It simply "removes" from the line a hard-coded set of common
  3069  * DP-specific tags, so that the line passed to the main routine has
  3070  * been pre-cleaned of DP markup.
  3071  */
  3072 void postprocess_for_DP(char *theline)
  3073 {
  3074     char *s,*t;
  3075     int i;
  3076     if (!*theline) 
  3077 	return;
  3078     for (i=0;*DPmarkup[i];i++)
  3079 	while ((s=strstr(theline,DPmarkup[i])))
  3080 	{
  3081 	    t=s+strlen(DPmarkup[i]);
  3082 	    memmove(s,t,strlen(t)+1);
  3083 	}
  3084 }
  3085 
  3086 /*
  3087  * postprocess_for_HTML:
  3088  *
  3089  * Invoked with the -m switch from flgets().
  3090  * It simply "removes" from the line a hard-coded set of common
  3091  * HTML tags and "replaces" a hard-coded set of common HTML
  3092  * entities, so that the line passed to the main routine has
  3093  * been pre-cleaned of HTML.
  3094  */
  3095 void postprocess_for_HTML(char *theline)
  3096 {
  3097     while (losemarkup(theline))
  3098 	;
  3099     loseentities(theline);
  3100 }
  3101 
  3102 char *losemarkup(char *theline)
  3103 {
  3104     char *s,*t;
  3105     int i;
  3106     s=strchr(theline,'<');
  3107     t=s?strchr(s,'>'):NULL;
  3108     if (!s || !t)
  3109 	return NULL;
  3110     for (i=0;*markup[i];i++)
  3111 	if (tagcomp(g_utf8_next_char(s),markup[i]))
  3112 	{
  3113 	    t=g_utf8_next_char(t);
  3114 	    memmove(s,t,strlen(t)+1);
  3115 	    return s;
  3116 	}
  3117     /* It's an unrecognized <xxx>. */
  3118     return NULL;
  3119 }
  3120 
  3121 void loseentities(char *theline)
  3122 {
  3123     int i;
  3124     gsize nb;
  3125     char *amp,*scolon;
  3126     gchar *s,*t;
  3127     gunichar c;
  3128     GTree *entities=NULL;
  3129     static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
  3130     if (!theline)
  3131     {
  3132 	if (entities)
  3133 	    g_tree_destroy(entities);
  3134 	entities=NULL;
  3135 	if (translit!=(GIConv)-1)
  3136 	    g_iconv_close(translit);
  3137 	translit=(GIConv)-1;
  3138 	if (to_utf8!=(GIConv)-1)
  3139 	    g_iconv_close(to_utf8);
  3140 	to_utf8=(GIConv)-1;
  3141 	return;
  3142     }
  3143     if (!*theline)
  3144 	return;
  3145     if (!entities)
  3146     {
  3147 	entities=g_tree_new((GCompareFunc)strcmp);
  3148 	for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
  3149 	    g_tree_insert(entities,HTMLentities[i].name,
  3150 	      GUINT_TO_POINTER(HTMLentities[i].c));
  3151     }
  3152     if (translit==(GIConv)-1)
  3153 	translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
  3154     if (to_utf8==(GIConv)-1)
  3155 	to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
  3156     while((amp=strchr(theline,'&')))
  3157     {
  3158 	scolon=strchr(amp,';');
  3159 	if (scolon)
  3160 	{
  3161 	    if (amp[1]=='#')
  3162 	    {
  3163 		if (amp+2+strspn(amp+2,"0123456789")==scolon)
  3164 		    c=strtol(amp+2,NULL,10);
  3165 		else if (amp[2]=='x' &&
  3166 		  amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
  3167 		    c=strtol(amp+3,NULL,16);
  3168 	    }
  3169 	    else
  3170 	    {
  3171 		s=g_strndup(amp+1,scolon-(amp+1));
  3172 	        c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
  3173 		g_free(s);
  3174 	    }
  3175 	}
  3176 	else
  3177 	    c=0;
  3178 	if (c)
  3179 	{
  3180 	    theline=amp;
  3181 	    if (c<128 || c>=192 && c<=255)	/* An ISO-8859-1 character */
  3182 		theline+=g_unichar_to_utf8(c,theline);
  3183 	    else
  3184 	    {
  3185 		s=g_malloc(6);
  3186 		nb=g_unichar_to_utf8(c,s);
  3187 		t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
  3188 		g_free(s);
  3189 		s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
  3190 		g_free(t);
  3191 		memcpy(theline,s,nb);
  3192 		g_free(s);
  3193 		theline+=nb;
  3194 	    }
  3195 	    memmove(theline,g_utf8_next_char(scolon),
  3196 	      strlen(g_utf8_next_char(scolon))+1);
  3197 	}
  3198 	else
  3199 	    theline=g_utf8_next_char(amp);
  3200     }
  3201 }
  3202 
  3203 gboolean tagcomp(const char *strin,const char *basetag)
  3204 {
  3205     gboolean retval;
  3206     gchar *s,*t;
  3207     if (g_utf8_get_char(strin)=='/')
  3208 	t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
  3209     else
  3210 	t=g_utf8_casefold(strin,-1);
  3211     s=g_utf8_casefold(basetag,-1);
  3212     retval=g_str_has_prefix(t,s);
  3213     g_free(s);
  3214     g_free(t);
  3215     return retval;
  3216 }
  3217 
  3218 void proghelp(GOptionContext *context)
  3219 {
  3220     gchar *help;
  3221     fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
  3222     fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
  3223     fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
  3224     fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
  3225       "For details, read the file COPYING.\n",stderr);
  3226     fputs("This is Free Software; "
  3227       "you may redistribute it under certain conditions (GPL);\n",stderr);
  3228     fputs("read the file COPYING for details.\n\n",stderr);
  3229     help=g_option_context_get_help(context,TRUE,NULL);
  3230     fputs(help,stderr);
  3231     g_free(help);
  3232     fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
  3233     fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
  3234       "non-ASCII\n",stderr);
  3235     fputs("characters like accented letters, "
  3236       "lines longer than 75 or shorter than 55,\n",stderr);
  3237     fputs("unbalanced quotes or brackets, "
  3238       "a variety of badly formatted punctuation, \n",stderr);
  3239     fputs("HTML tags, some likely typos. "
  3240       "It is NOT a substitute for human judgement.\n",stderr);
  3241     fputs("\n",stderr);
  3242 }