bookloupe/bookloupe.c
author ali <ali@juiblex.co.uk>
Thu May 30 17:16:37 2013 +0100 (2013-05-30)
changeset 71 82d3cc398b54
parent 70 aa916da2e452
child 72 52d4a7f926b4
permissions -rw-r--r--
Use official HTML 4 character entity definitions
     1 /*************************************************************************/
     2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
     3 /*									 */
     4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>			 */
     5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>			 */
     6 /*									 */
     7 /* This program is free software; you can redistribute it and/or modify  */
     8 /* it under the terms of the GNU General Public License as published by  */
     9 /* the Free Software Foundation; either version 2 of the License, or     */
    10 /* (at your option) any later version.					 */
    11 /*									 */
    12 /* This program is distributed in the hope that it will be useful,       */
    13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of	 */
    14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the		 */
    15 /* GNU General Public License for more details.				 */
    16 /*									 */
    17 /* You should have received a copy of the GNU General Public License	 */
    18 /* along with this program. If not, see <http://www.gnu.org/licenses/>.	 */
    19 /*************************************************************************/
    20 
    21 #include <stdio.h>
    22 #include <stdlib.h>
    23 #include <string.h>
    24 #include <ctype.h>
    25 #include <glib.h>
    26 #include <bl/bl.h>
    27 #include "HTMLentities.h"
    28 
    29 gchar *prevline;
    30 
/*
 * Common typos, all in lower case.
 * The table carries no separate length: it is terminated by an empty
 * string, so any new entries must go before the final "".
 */
char *typo[] = {
    "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
    "nad", "te", "ig", "acn",  "ahve", "alot", "anbd", "andt", "awya", "aywa",
    "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
    "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
    "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
    "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
    "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
    "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
    "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
    "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
    "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
    "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
    "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
    "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
    "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
    "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
    "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
    "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
    "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
    "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
    "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
    "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
    "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
    "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
    "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
    "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
    "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
    "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
    "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
    "se", ""
};
    64 
/*
 * User-defined typos, built by read_user_scannos(): one key for each line
 * of the user's typo file whose first byte is greater than '!'.
 * Stays NULL unless read_user_scannos() has loaded a file; main() only
 * releases the tree when it is non-NULL.
 */
GTree *usertypo;
    66 
/*
 * Common abbreviations and other OK words not to query as typos.
 * Lower case; terminated by an empty string.
 */
char *okword[] = {
    "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
    "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
    "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
    "outbid", "outbids", "frostbite", "frostbitten", ""
};

/*
 * Common abbreviations that cause otherwise unexplained periods.
 * Lower case; terminated by an empty string.
 */
char *abbrev[] = {
    "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
    "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
};

/*
 * Two-Letter combinations that rarely if ever start words,
 * but are common scannos or otherwise common letter combinations.
 * Terminated by an empty string.
 */
char *nostart[] = {
    "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
};

/*
 * Two-Letter combinations that rarely if ever end words,
 * but are common scannos or otherwise common letter combinations.
 * Terminated by an empty string.
 */
char *noend[] = {
    "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
    "sw", "gr", "sl", "cl", "iy", ""
};
    97 
/*
 * HTML element names, lower case and without the angle brackets.
 * Terminated by an empty string.
 * NOTE(review): presumably the tags recognised when looking for markup
 * in < > - confirm against the code that searches this table.
 */
char *markup[] = {
    "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
    "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
    "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
    "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
};

/*
 * Markup strings, given in full including their delimiters.
 * Terminated by an empty string.
 * NOTE(review): presumably the "DP-specific markup" that the --dp option
 * ignores - confirm against the code that searches this table.
 */
char *DPmarkup[] = {
    "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
};
   108 
   109 char *nocomma[] = {
   110     "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
   111     "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
   112     "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
   113     "during", "let", "toward", "among", ""
   114 };
   115 
/*
 * Lower-case words, terminated by an empty string.
 * NOTE(review): judging by the name these are words that should not be
 * followed directly by a period - confirm against the code that searches
 * this table.
 */
char *noperiod[] = {
    "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
    "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
    "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
    "among", "those", "into", "whom", "having", "thence", ""
};
   122 
   123 /* special characters */
   124 #define CHAR_SPACE	  32
   125 #define CHAR_TAB	   9
   126 #define CHAR_LF		  10
   127 #define CHAR_CR		  13
   128 #define CHAR_DQUOTE	  34
   129 #define CHAR_SQUOTE	  39
   130 #define CHAR_OPEN_SQUOTE  96
   131 #define CHAR_TILDE	 126
   132 #define CHAR_ASTERISK	  42
   133 #define CHAR_FORESLASH	  47
   134 #define CHAR_CARAT	  94
   135 
   136 #define CHAR_UNDERSCORE    '_'
   137 #define CHAR_OPEN_CBRACK   '{'
   138 #define CHAR_CLOSE_CBRACK  '}'
   139 #define CHAR_OPEN_RBRACK   '('
   140 #define CHAR_CLOSE_RBRACK  ')'
   141 #define CHAR_OPEN_SBRACK   '['
   142 #define CHAR_CLOSE_SBRACK  ']'
   143 
   144 /* longest and shortest normal PG line lengths */
   145 #define LONGEST_PG_LINE   75
   146 #define WAY_TOO_LONG      80
   147 #define SHORTEST_PG_LINE  55
   148 
/*
 * Indexes into pswit[], one per command-line switch (see options[]).
 * Several switches are inverted by parse_options(), so after option
 * parsing the array holds the effective setting, not the raw flag.
 */
enum {
    ECHO_SWITCH,	/* --noecho, inverted: TRUE = echo the queried line */
    SQUOTE_SWITCH,	/* --squote: check single quotes */
    TYPO_SWITCH,	/* --typo: check common typos; toggled by
			   parse_options() when paranoid mode is on */
    QPARA_SWITCH,	/* --qpara: require closure of quotes on every
			   paragraph */
    PARANOID_SWITCH,	/* --relaxed, inverted: TRUE = paranoid querying */
    LINE_END_SWITCH,	/* --line-end, inverted: TRUE = check line ends */
    OVERVIEW_SWITCH,	/* --overview: just show counts */
    STDOUT_SWITCH,	/* --stdout: errors to stdout instead of stderr */
    HEADER_SWITCH,	/* --header: echo header fields */
    WEB_SWITCH,		/* --web: defaults for use on www upload */
    VERBOSE_SWITCH,	/* --verbose: list everything */
    MARKUP_SWITCH,	/* --markup: ignore markup in < >; also switched
			   on by report_first_pass() for HTML-like text */
    USERTYPO_SWITCH,	/* --usertypo: use file of user-defined typos */
    DP_SWITCH,		/* --dp: ignore DP-specific markup */
    SWITNO		/* number of switches; size of pswit[] */
};

gboolean pswit[SWITNO];  /* program switches */
   168 
   169 static GOptionEntry options[]={
   170     { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
   171       "Ignore DP-specific markup", NULL },
   172     { "noecho", 'e', 0, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
   173       "Don't echo queried line", NULL },
   174     { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
   175       "Check single quotes", NULL },
   176     { "typo", 't', 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
   177       "Check common typos", NULL },
   178     { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
   179       "Require closure of quotes on every paragraph", NULL },
   180     { "relaxed", 'x', 0, G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
   181       "Disable paranoid querying of everything", NULL },
   182     { "line-end", 'l', 0, G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
   183       "Disable line end checking", NULL },
   184     { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
   185       "Overview: just show counts", NULL },
   186     { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
   187       "Output errors to stdout instead of stderr", NULL },
   188     { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
   189       "Echo header fields", NULL },
   190     { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
   191       "Ignore markup in < >", NULL },
   192     { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
   193       "Use file of user-defined typos", NULL },
   194     { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
   195       "Defaults for use on www upload", NULL },
   196     { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
   197       "Verbose - list everything", NULL },
   198     { NULL }
   199 };
   200 
/*
 * Query counters.  main() prints the non-zero cnt_* query counts, and
 * their total, when running in overview mode.
 */
long cnt_dquot;		/* for overview mode, count of doublequote queries */
long cnt_squot;		/* for overview mode, count of singlequote queries */
long cnt_brack;		/* for overview mode, count of brackets queries */
long cnt_bin;		/* for overview mode, count of non-ASCII queries */
long cnt_odd;		/* for overview mode, count of odd character queries */
long cnt_long;		/* for overview mode, count of long line errors */
long cnt_short;		/* for overview mode, count of short line queries */
long cnt_punct;		/* for overview mode, count of punctuation and
			   spacing queries */
long cnt_dash;		/* for overview mode, count of dash-related queries */
long cnt_word;		/* for overview mode, count of word queries */
long cnt_html;		/* for overview mode, count of html queries */
long cnt_lineend;	/* for overview mode, count of line-end queries */
long cnt_spacend;	/* count of lines with space at end; set by
			   first_pass(), not part of the overview total */
long linecnt;		/* count of total lines in the file */
long checked_linecnt;	/* count of lines actually checked */
   217 
/* Forward declarations. */
void proghelp(GOptionContext *context);
void procfile(const char *);

/*
 * Directory part of argv[0].  Set (and later freed) by main();
 * read_user_scannos() also looks there for the user typo file.
 */
gchar *running_from;

/* More forward declarations. */
gboolean mixdigit(const char *);
/*
 * first_pass() frees each word returned by getaword() with g_free().
 * It calls getaword() until the pointer it passes in reaches the end of the line.
 */
gchar *getaword(const char **);
char *flgets(char **,long);
void postprocess_for_HTML(char *);
char *linehasmarkup(char *);
char *losemarkup(char *);
gboolean tagcomp(const char *,const char *);
void loseentities(char *);
gboolean isroman(const char *);
void postprocess_for_DP(char *);

/* NOTE(review): undocumented - describe what these two trees hold. */
GTree *qword,*qperiod;
   235 
/*
 * Totals gathered by first_pass() and consumed by report_first_pass().
 * Apart from firstline and footerline, nothing is counted for lines from
 * the footer onwards.
 */
struct first_pass_results {
    /*
     * firstline: number of the line following the PG header marker, or 0
     *            (report_first_pass() may reset it to 0).
     * astline:   lines containing '*' and at least one lower-case letter.
     */
    long firstline,astline;
    /*
     * footerline:     number of the line taken to be the PG footer, or 0.
     * totlen:         total length, in characters, of the counted lines.
     * binlen:         characters with a code point above 127.
     * alphalen:       alphabetic characters.
     * endquote_count: double quotes directly preceded by a letter.
     * shortline:      lines judged short relative to their neighbours.
     * dotcomma:       lines containing ".,".
     */
    long footerline,totlen,binlen,alphalen,endquote_count,shortline,dotcomma;
    /*
     * fslashline:       lines containing '/'.
     * hyphens:          lines ending (before trailing white space) in a
     *                   single '-'.
     * longline:         lines longer than LONGEST_PG_LINE.
     * verylongline:     lines longer than WAY_TOO_LONG.
     * htmcount:         HTML score: +1 for a line with '<' before '>',
     *                   +4 more if it contains "<i>".
     * standalone_digit: words that are exactly "0" or "1".
     */
    long fslashline,hyphens,longline,verylongline,htmcount,standalone_digit;
    /*
     * spacedash:           lines whose first " -" is not followed by '-'.
     * emdash:              lines with "--" after the first character; the
     *                      next three classify that first "--":
     * space_emdash:        a space on at least one side.
     * non_PG_space_emdash: a space on both sides.
     * PG_space_emdash:     no space on either side.
     */
    long spacedash,emdash,space_emdash,non_PG_space_emdash,PG_space_emdash;
    /*
     * Dutchcount:  words equal to "hij" or "niet".
     * Frenchcount: words equal to "dans" or "avec".
     */
    int Dutchcount,Frenchcount;
};
   243 
/*
 * Reporting decisions made by report_first_pass().
 * For the int flags, 1 means "report this kind of query" and 0 means it
 * has been switched off (normally because the text has too many of them).
 */
struct warnings {
    int shortline,longline,bin,dash,dotcomma,ast,fslash,digit,hyphen;
    int endquote;
    /* TRUE when the first pass counted more than 50 Dutch/French marker
       words; isDutch also forces dash to 0 */
    gboolean isDutch,isFrench;
};
   249 
/*
 * Per-line tallies.  analyse_quotes() adds one to quot for every double
 * quote and to open_single_quote for each single quote it takes to be an
 * opening quote.
 * NOTE(review): the remaining fields presumably count underscores, the
 * three kinds of bracket and closing single quotes - confirm.
 */
struct counters {
    long quot;
    int c_unders,c_brack,s_brack,r_brack;
    int open_single_quote,close_single_quote;
};

/*
 * NOTE(review): presumably the length in characters (len), length in
 * bytes (blen) and first character (start) of a line - confirm.
 */
struct line_properties {
    unsigned int len,blen;
    gunichar start;
};

/* NOTE(review): presumably quote parity state - confirm. */
struct parities {
    int dquote,squote;
};

/*
 * NOTE(review): presumably warnings held back until it is known whether
 * the matching quote/bracket/underscore turns up - confirm.
 */
struct pending {
    char *dquote,*squote,*rbrack,*sbrack,*cbrack,*unders;
    long squot;
};
   269 
   270 void parse_options(int *argc,char ***argv)
   271 {
   272     GError *err=NULL;
   273     GOptionContext *context;
   274     context=g_option_context_new(
   275       "file - looks for errors in Project Gutenberg(TM) etexts");
   276     g_option_context_add_main_entries(context,options,NULL);
   277     if (!g_option_context_parse(context,argc,argv,&err))
   278     {
   279 	g_printerr("Bookloupe: %s\n",err->message);
   280 	g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
   281 	exit(1);
   282     }
   283     /* Paranoid checking is turned OFF, not on, by its switch */
   284     pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
   285     if (pswit[PARANOID_SWITCH])
   286 	/* if running in paranoid mode, typo checks default to enabled */
   287 	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
   288     /* Line-end checking is turned OFF, not on, by its switch */
   289     pswit[LINE_END_SWITCH]=!pswit[LINE_END_SWITCH];
   290     /* Echoing is turned OFF, not on, by its switch */
   291     pswit[ECHO_SWITCH]=!pswit[ECHO_SWITCH];
   292     if (pswit[OVERVIEW_SWITCH])
   293 	/* just print summary; don't echo */
   294 	pswit[ECHO_SWITCH]=FALSE;
   295     /*
   296      * Web uploads - for the moment, this is really just a placeholder
   297      * until we decide what processing we really want to do on web uploads
   298      */
   299     if (pswit[WEB_SWITCH])
   300     {
   301 	/* specific override for web uploads */
   302 	pswit[ECHO_SWITCH]=TRUE;
   303 	pswit[SQUOTE_SWITCH]=FALSE;
   304 	pswit[TYPO_SWITCH]=TRUE;
   305 	pswit[QPARA_SWITCH]=FALSE;
   306 	pswit[PARANOID_SWITCH]=TRUE;
   307 	pswit[LINE_END_SWITCH]=FALSE;
   308 	pswit[OVERVIEW_SWITCH]=FALSE;
   309 	pswit[STDOUT_SWITCH]=FALSE;
   310 	pswit[HEADER_SWITCH]=TRUE;
   311 	pswit[VERBOSE_SWITCH]=FALSE;
   312 	pswit[MARKUP_SWITCH]=FALSE;
   313 	pswit[USERTYPO_SWITCH]=FALSE;
   314 	pswit[DP_SWITCH]=FALSE;
   315     }
   316     if (*argc<2)
   317     {
   318 	proghelp(context);
   319 	exit(1);
   320     }
   321     g_option_context_free(context);
   322 }
   323 
   324 /*
   325  * read_user_scannos:
   326  *
   327  * Read in the user-defined stealth scanno list.
   328  */
   329 void read_user_scannos(void)
   330 {
   331     GError *err=NULL;
   332     gchar *usertypo_file;
   333     gboolean okay;
   334     int i;
   335     gsize len,nb;
   336     gchar *contents,*utf8,**lines;
   337     usertypo_file=g_strdup("bookloupe.typ");
   338     okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   339     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   340     {
   341 	g_clear_error(&err);
   342 	g_free(usertypo_file);
   343 	usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
   344 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   345     }
   346     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   347     {
   348 	g_clear_error(&err);
   349 	g_free(usertypo_file);
   350 	usertypo_file=g_strdup("gutcheck.typ");
   351 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   352     }
   353     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   354     {
   355 	g_clear_error(&err);
   356 	g_free(usertypo_file);
   357 	usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
   358 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   359     }
   360     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   361     {
   362 	g_free(usertypo_file);
   363 	g_print("   --> I couldn't find bookloupe.typ "
   364 	  "-- proceeding without user typos.\n");
   365 	return;
   366     }
   367     else if (!okay)
   368     {
   369 	fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
   370 	g_free(usertypo_file);
   371 	g_clear_error(&err);
   372 	exit(1);
   373     }
   374     utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
   375     g_free(contents);
   376     lines=g_strsplit_set(utf8,"\r\n",0);
   377     g_free(utf8);
   378     usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
   379     for (i=0;lines[i];i++)
   380 	if (*(unsigned char *)lines[i]>'!')
   381 	    g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
   382 	else
   383 	    g_free(lines[i]);
   384     g_free(lines);
   385 }
   386 
   387 /*
   388  * read_etext:
   389  *
   390  * Read an etext returning a newly allocated string containing the file
   391  * contents or NULL on error.
   392  */
   393 gchar *read_etext(const char *filename,GError **err)
   394 {
   395     gchar *contents,*utf8;
   396     gsize len,nb;
   397     if (!g_file_get_contents(filename,&contents,&len,err))
   398 	return NULL;
   399     utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
   400     g_free(contents);
   401     return utf8;
   402 }
   403 
   404 int main(int argc,char **argv)
   405 {
   406     running_from=g_path_get_dirname(argv[0]);
   407     parse_options(&argc,&argv);
   408     if (pswit[USERTYPO_SWITCH])
   409 	read_user_scannos();
   410     fprintf(stderr,"bookloupe: Check and report on an e-text\n");
   411     procfile(argv[1]);
   412     if (pswit[OVERVIEW_SWITCH])
   413     {
   414 	g_print("    Checked %ld lines of %ld (head+foot = %ld)\n\n",
   415 	  checked_linecnt,linecnt,linecnt-checked_linecnt);
   416 	g_print("    --------------- Queries found --------------\n");
   417 	if (cnt_long)
   418 	    g_print("    Long lines:		    %14ld\n",cnt_long);
   419 	if (cnt_short)
   420 	    g_print("    Short lines:		   %14ld\n",cnt_short);
   421 	if (cnt_lineend)
   422 	    g_print("    Line-end problems:	     %14ld\n",cnt_lineend);
   423 	if (cnt_word)
   424 	    g_print("    Common typos:		  %14ld\n",cnt_word);
   425 	if (cnt_dquot)
   426 	    g_print("    Unmatched quotes:	      %14ld\n",cnt_dquot);
   427 	if (cnt_squot)
   428 	    g_print("    Unmatched SingleQuotes:	%14ld\n",cnt_squot);
   429 	if (cnt_brack)
   430 	    g_print("    Unmatched brackets:	    %14ld\n",cnt_brack);
   431 	if (cnt_bin)
   432 	    g_print("    Non-ASCII characters:	  %14ld\n",cnt_bin);
   433 	if (cnt_odd)
   434 	    g_print("    Proofing characters:	   %14ld\n",cnt_odd);
   435 	if (cnt_punct)
   436 	    g_print("    Punctuation & spacing queries: %14ld\n",cnt_punct);
   437 	if (cnt_dash)
   438 	    g_print("    Non-standard dashes:	   %14ld\n",cnt_dash);
   439 	if (cnt_html)
   440 	    g_print("    Possible HTML tags:	    %14ld\n",cnt_html);
   441 	g_print("\n");
   442 	g_print("    TOTAL QUERIES		  %14ld\n",
   443 	  cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+
   444 	  cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend);
   445     }
   446     g_free(running_from);
   447     if (usertypo)
   448 	g_tree_unref(usertypo);
   449     return 0;
   450 }
   451 
   452 /*
   453  * first_pass:
   454  *
   455  * Run a first pass - verify that it's a valid PG
   456  * file, decide whether to report some things that
   457  * occur many times in the text like long or short
   458  * lines, non-standard dashes, etc.
   459  */
   460 struct first_pass_results *first_pass(const char *etext)
   461 {
   462     gunichar laststart=CHAR_SPACE;
   463     const char *s;
   464     gchar *lc_line;
   465     int i,j,lbytes,llen;
   466     gchar **lines;
   467     unsigned int lastlen=0,lastblen=0;
   468     long spline=0,nspline=0;
   469     static struct first_pass_results results={0};
   470     gchar *inword;
   471     lines=g_strsplit(etext,"\n",0);
   472     for (j=0;lines[j];j++)
   473     {
   474 	lbytes=strlen(lines[j]);
   475 	while (lines[j][lbytes-1]=='\r')
   476 	    lines[j][--lbytes]='\0';
   477 	llen=g_utf8_strlen(lines[j],lbytes);
   478 	linecnt++;
   479 	if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
   480 	  (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
   481 	{
   482 	    if (spline)
   483 		g_print("   --> Duplicate header?\n");
   484 	    spline=linecnt+1;   /* first line of non-header text, that is */
   485 	}
   486 	if (!strncmp(lines[j],"*** START",9) &&
   487 	  strstr(lines[j],"PROJECT GUTENBERG"))
   488 	{
   489 	    if (nspline)
   490 		g_print("   --> Duplicate header?\n");
   491 	    nspline=linecnt+1;   /* first line of non-header text, that is */
   492 	}
   493 	if (spline || nspline)
   494 	{
   495 	    lc_line=g_utf8_strdown(lines[j],lbytes);
   496 	    if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
   497 	    {
   498 		if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
   499 		{
   500 		    if (results.footerline)
   501 		    {
   502 			/* it's an old-form header - we can detect duplicates */
   503 			if (!nspline)
   504 			    g_print("   --> Duplicate footer?\n");
   505 		    }
   506 		    else
   507 			results.footerline=linecnt;
   508 		}
   509 	    }
   510 	    g_free(lc_line);
   511 	}
   512 	if (spline)
   513 	    results.firstline=spline;
   514 	if (nspline)
   515 	    results.firstline=nspline;  /* override with new */
   516 	if (results.footerline)
   517 	    continue;    /* don't count the boilerplate in the footer */
   518 	results.totlen+=llen;
   519 	for (s=lines[j];*s;s=g_utf8_next_char(s))
   520 	{
   521 	    if (g_utf8_get_char(s)>127)
   522 		results.binlen++;
   523 	    if (g_unichar_isalpha(g_utf8_get_char(s)))
   524 		results.alphalen++;
   525 	    if (s>lines[j] && g_utf8_get_char(s)==CHAR_DQUOTE &&
   526 	      isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
   527 		results.endquote_count++;
   528 	}
   529 	if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
   530 	  lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
   531 	    results.shortline++;
   532 	if (lbytes>0 &&
   533 	  g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
   534 	    cnt_spacend++;
   535 	if (strstr(lines[j],".,"))
   536 	    results.dotcomma++;
   537 	/* only count ast lines for ignoring purposes where there is */
   538 	/* locase text on the line */
   539 	if (strchr(lines[j],'*'))
   540 	{
   541 	    for (s=lines[j];*s;s=g_utf8_next_char(s))
   542 		if (g_unichar_islower(g_utf8_get_char(s)))
   543 		    break;
   544 	    if (*s)
   545 		results.astline++;
   546 	}
   547 	if (strchr(lines[j],'/'))
   548 	    results.fslashline++;
   549 	for (s=g_utf8_prev_char(lines[j]+lbytes);
   550 	  s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
   551 	    ;
   552 	if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
   553 	  g_utf8_get_char(g_utf8_prev_char(s))!='-')
   554 	    results.hyphens++;
   555 	if (llen>LONGEST_PG_LINE)
   556 	    results.longline++;
   557 	if (llen>WAY_TOO_LONG)
   558 	    results.verylongline++;
   559 	if (strchr(lines[j],'<') && strchr(lines[j],'>'))
   560 	{
   561 	    i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
   562 	    if (i>0)
   563 		results.htmcount++;
   564 	    if (strstr(lines[j],"<i>"))
   565 		results.htmcount+=4; /* bonus marks! */
   566 	}
   567 	/* Check for spaced em-dashes */
   568 	if (lines[j][0] && (s=strstr(g_utf8_next_char(lines[j]),"--")))
   569 	{
   570 	    results.emdash++;
   571 	    if (s[-1]==CHAR_SPACE || s[2]==CHAR_SPACE)
   572 		results.space_emdash++;
   573 	    if (s[-1]==CHAR_SPACE && s[2]==CHAR_SPACE)
   574 		/* count of em-dashes with spaces both sides */
   575 		results.non_PG_space_emdash++;
   576 	    if (s[-1]!=CHAR_SPACE && s[2]!=CHAR_SPACE)
   577 		/* count of PG-type em-dashes with no spaces */
   578 		results.PG_space_emdash++;
   579 	}
   580 	for (s=lines[j];*s;)
   581 	{
   582 	    inword=getaword(&s);
   583 	    if (!strcmp(inword,"hij") || !strcmp(inword,"niet")) 
   584 		results.Dutchcount++;
   585 	    if (!strcmp(inword,"dans") || !strcmp(inword,"avec")) 
   586 		results.Frenchcount++;
   587 	    if (!strcmp(inword,"0") || !strcmp(inword,"1")) 
   588 		results.standalone_digit++;
   589 	    g_free(inword);
   590 	}
   591 	/* Check for spaced dashes */
   592 	if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
   593 	    results.spacedash++;
   594 	lastblen=lastlen;
   595 	lastlen=llen;
   596 	laststart=lines[j][0];
   597     }
   598     g_strfreev(lines);
   599     return &results;
   600 }
   601 
   602 /*
   603  * report_first_pass:
   604  *
   605  * Make some snap decisions based on the first pass results.
   606  */
   607 struct warnings *report_first_pass(struct first_pass_results *results)
   608 {
   609     static struct warnings warnings={0};
   610     if (cnt_spacend>0)
   611 	g_print("   --> %ld lines in this file have white space at end\n",
   612 	  cnt_spacend);
   613     warnings.dotcomma=1;
   614     if (results->dotcomma>5)
   615     {
   616 	warnings.dotcomma=0;
   617 	g_print("   --> %ld lines in this file contain '.,'. "
   618 	  "Not reporting them.\n",results->dotcomma);
   619     }
   620     /*
   621      * If more than 50 lines, or one-tenth, are short,
   622      * don't bother reporting them.
   623      */
   624     warnings.shortline=1;
   625     if (results->shortline>50 || results->shortline*10>linecnt)
   626     {
   627 	warnings.shortline=0;
   628 	g_print("   --> %ld lines in this file are short. "
   629 	  "Not reporting short lines.\n",results->shortline);
   630     }
   631     /*
   632      * If more than 50 lines, or one-tenth, are long,
   633      * don't bother reporting them.
   634      */
   635     warnings.longline=1;
   636     if (results->longline>50 || results->longline*10>linecnt)
   637     {
   638 	warnings.longline=0;
   639 	g_print("   --> %ld lines in this file are long. "
   640 	  "Not reporting long lines.\n",results->longline);
   641     }
   642     /* If more than 10 lines contain asterisks, don't bother reporting them. */
   643     warnings.ast=1;
   644     if (results->astline>10)
   645     {
   646 	warnings.ast=0;
   647 	g_print("   --> %ld lines in this file contain asterisks. "
   648 	  "Not reporting them.\n",results->astline);
   649     }
   650     /*
   651      * If more than 10 lines contain forward slashes,
   652      * don't bother reporting them.
   653      */
   654     warnings.fslash=1;
   655     if (results->fslashline>10)
   656     {
   657 	warnings.fslash=0;
   658 	g_print("   --> %ld lines in this file contain forward slashes. "
   659 	  "Not reporting them.\n",results->fslashline);
   660     }
   661     /*
   662      * If more than 20 lines contain unpunctuated endquotes,
   663      * don't bother reporting them.
   664      */
   665     warnings.endquote=1;
   666     if (results->endquote_count>20)
   667     {
   668 	warnings.endquote=0;
   669 	g_print("   --> %ld lines in this file contain unpunctuated endquotes. "
   670 	  "Not reporting them.\n",results->endquote_count);
   671     }
   672     /*
   673      * If more than 15 lines contain standalone digits,
   674      * don't bother reporting them.
   675      */
   676     warnings.digit=1;
   677     if (results->standalone_digit>10)
   678     {
   679 	warnings.digit=0;
   680 	g_print("   --> %ld lines in this file contain standalone 0s and 1s. "
   681 	  "Not reporting them.\n",results->standalone_digit);
   682     }
   683     /*
   684      * If more than 20 lines contain hyphens at end,
   685      * don't bother reporting them.
   686      */
   687     warnings.hyphen=1;
   688     if (results->hyphens>20)
   689     {
   690 	warnings.hyphen=0;
   691 	g_print("   --> %ld lines in this file have hyphens at end. "
   692 	  "Not reporting them.\n",results->hyphens);
   693     }
   694     if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
   695     {
   696 	g_print("   --> Looks like this is HTML. Switching HTML mode ON.\n");
   697 	pswit[MARKUP_SWITCH]=1;
   698     }
   699     if (results->verylongline>0)
   700 	g_print("   --> %ld lines in this file are VERY long!\n",
   701 	  results->verylongline);
   702     /*
   703      * If there are more non-PG spaced dashes than PG em-dashes,
   704      * assume it's deliberate.
   705      * Current PG guidelines say don't use them, but older texts do,
   706      * and some people insist on them whatever the guidelines say.
   707      */
   708     warnings.dash=1;
   709     if (results->spacedash+results->non_PG_space_emdash>
   710       results->PG_space_emdash)
   711     {
   712 	warnings.dash=0;
   713 	g_print("   --> There are %ld spaced dashes and em-dashes. "
   714 	  "Not reporting them.\n",
   715 	  results->spacedash+results->non_PG_space_emdash);
   716     }
   717     /* If more than a quarter of characters are hi-bit, bug out. */
   718     warnings.bin=1;
   719     if (results->binlen*4>results->totlen)
   720     {
   721 	g_print("   --> This file does not appear to be ASCII. "
   722 	  "Terminating. Best of luck with it!\n");
   723 	exit(1);
   724     }
   725     if (results->alphalen*4<results->totlen)
   726     {
   727 	g_print("   --> This file does not appear to be text. "
   728 	  "Terminating. Best of luck with it!\n");
   729 	exit(1);
   730     }
   731     if (results->binlen*100>results->totlen || results->binlen>100)
   732     {
   733 	g_print("   --> There are a lot of foreign letters here. "
   734 	  "Not reporting them.\n");
   735 	warnings.bin=0;
   736     }
   737     warnings.isDutch=FALSE;
   738     if (results->Dutchcount>50)
   739     {
   740 	warnings.isDutch=TRUE;
   741 	g_print("   --> This looks like Dutch - "
   742 	  "switching off dashes and warnings for 's Middags case.\n");
   743     }
   744     warnings.isFrench=FALSE;
   745     if (results->Frenchcount>50)
   746     {
   747 	warnings.isFrench=TRUE;
   748 	g_print("   --> This looks like French - "
   749 	  "switching off some doublepunct.\n");
   750     }
   751     if (results->firstline && results->footerline)
   752 	g_print("    The PG header and footer appear to be already on.\n");
   753     else
   754     {
   755 	if (results->firstline)
   756 	    g_print("    The PG header is on - no footer.\n");
   757 	if (results->footerline)
   758 	    g_print("    The PG footer is on - no header.\n");
   759     }
   760     g_print("\n");
   761     if (pswit[VERBOSE_SWITCH])
   762     {
   763 	warnings.bin=1;
   764 	warnings.shortline=1;
   765 	warnings.dotcomma=1;
   766 	warnings.longline=1;
   767 	warnings.dash=1;
   768 	warnings.digit=1;
   769 	warnings.ast=1;
   770 	warnings.fslash=1;
   771 	warnings.hyphen=1;
   772 	warnings.endquote=1;
   773 	g_print("   *** Verbose output is ON -- you asked for it! ***\n");
   774     }
   775     if (warnings.isDutch)
   776 	warnings.dash=0;
   777     if (results->footerline>0 && results->firstline>0 &&
   778       results->footerline>results->firstline &&
   779       results->footerline-results->firstline<100)
   780     {
   781 	g_print("   --> I don't really know where this text starts. \n");
   782 	g_print("       There are no reference points.\n");
   783 	g_print("       I'm going to have to report the header and footer "
   784 	  "as well.\n");
   785 	results->firstline=0;
   786     }
   787     return &warnings;
   788 }
   789 
   790 /*
   791  * analyse_quotes:
   792  *
   793  * Look along the line, accumulate the count of quotes, and see
   794  * if this is an empty line - i.e. a line with nothing on it
   795  * but spaces.
   796  * If line has just spaces, period, * and/or - on it, don't
   797  * count it, since empty lines with asterisks or dashes to
   798  * separate sections are common.
   799  *
   800  * Returns: TRUE if the line is empty.
   801  */
   802 gboolean analyse_quotes(const char *aline,struct counters *counters)
   803 {
   804     int guessquote=0;
   805     /* assume the line is empty until proven otherwise */
   806     gboolean isemptyline=TRUE;
   807     const char *s=aline,*sprev,*snext;
   808     gunichar c;
   809     sprev=NULL;
   810     while (*s)
   811     {
   812 	snext=g_utf8_next_char(s);
   813 	c=g_utf8_get_char(s);
   814 	if (c==CHAR_DQUOTE)
   815 	    counters->quot++;
   816 	if (c==CHAR_SQUOTE || c==CHAR_OPEN_SQUOTE)
   817 	{
   818 	    if (s==aline)
   819 	    {
   820 		/*
   821 		 * At start of line, it can only be an openquote.
   822 		 * Hardcode a very common exception!
   823 		 */
   824 		if (!g_str_has_prefix(snext,"tis") &&
   825 		  !g_str_has_prefix(snext,"Tis"))
   826 		    counters->open_single_quote++;
   827 	    }
   828 	    else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
   829 	      g_unichar_isalpha(g_utf8_get_char(snext)))
   830 		/* Do nothing! it's definitely an apostrophe, not a quote */
   831 		;
   832 	    /* it's outside a word - let's check it out */
   833 	    else if (c==CHAR_OPEN_SQUOTE ||
   834 	      g_unichar_isalpha(g_utf8_get_char(snext)))
   835 	    {
   836 		/* it damwell better BE an openquote */
   837 		if (!g_str_has_prefix(snext,"tis") &&
   838 		  !g_str_has_prefix(snext,"Tis"))
   839 		    /* hardcode a very common exception! */
   840 		    counters->open_single_quote++;
   841 	    }
   842 	    else
   843 	    {
   844 		/* now - is it a closequote? */
   845 		guessquote=0;   /* accumulate clues */
   846 		if (g_unichar_isalpha(g_utf8_get_char(sprev)))
   847 		{
   848 		    /* it follows a letter - could be either */
   849 		    guessquote++;
   850 		    if (g_utf8_get_char(sprev)=='s')
   851 		    {
   852 			/* looks like a plural apostrophe */
   853 			guessquote-=3;
   854 			if (g_utf8_get_char(snext)==CHAR_SPACE)
   855 			    /* bonus marks! */
   856 			    guessquote-=2;
   857 		    }
   858 		}
   859 		/* it doesn't have a letter either side */
   860 		else if (strchr(".?!,;:",g_utf8_get_char(sprev)) &&
   861 		  strchr(".?!,;: ",g_utf8_get_char(snext)))
   862 		    guessquote+=8; /* looks like a closequote */
   863 		else
   864 		    guessquote++;
   865 		if (counters->open_single_quote>counters->close_single_quote)
   866 		    /*
   867 		     * Give it the benefit of some doubt,
   868 		     * if a squote is already open.
   869 		     */
   870 		    guessquote++;
   871 		else
   872 		    guessquote--;
   873 		if (guessquote>=0)
   874 		    counters->close_single_quote++;
   875 	    }
   876 	}
   877 	if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
   878 	  c!='\r' && c!='\n')
   879 	    isemptyline=FALSE;  /* ignore lines like  *  *  *  as spacers */
   880 	if (c==CHAR_UNDERSCORE)
   881 	    counters->c_unders++;
   882 	if (c==CHAR_OPEN_CBRACK)
   883 	    counters->c_brack++;
   884 	if (c==CHAR_CLOSE_CBRACK)
   885 	    counters->c_brack--;
   886 	if (c==CHAR_OPEN_RBRACK)
   887 	    counters->r_brack++;
   888 	if (c==CHAR_CLOSE_RBRACK)
   889 	    counters->r_brack--;
   890 	if (c==CHAR_OPEN_SBRACK)
   891 	    counters->s_brack++;
   892 	if (c==CHAR_CLOSE_SBRACK)
   893 	    counters->s_brack--;
   894 	sprev=s;
   895 	s=snext;
   896     }
   897     return isemptyline;
   898 }
   899 
   900 /*
   901  * check_for_control_characters:
   902  *
   903  * Check for invalid or questionable characters in the line
   904  * Anything above 127 is invalid for plain ASCII, and
   905  * non-printable control characters should also be flagged.
   906  * Tabs should generally not be there.
   907  */
   908 void check_for_control_characters(const char *aline)
   909 {
   910     gunichar c;
   911     const char *s;
   912     for (s=aline;*s;s=g_utf8_next_char(s))
   913     {
   914 	c=g_utf8_get_char(s);
   915 	if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
   916 	{
   917 	    if (pswit[ECHO_SWITCH])
   918 		g_print("\n%s\n",aline);
   919 	    if (!pswit[OVERVIEW_SWITCH])
   920 		g_print("    Line %ld column %ld - Control character %u\n",
   921 		  linecnt,g_utf8_pointer_to_offset(s,aline)+1,c);
   922 	    else
   923 		cnt_bin++;
   924 	}
   925     }
   926 }
   927 
   928 /*
   929  * check_for_odd_characters:
   930  *
   931  * Check for binary and other odd characters.
   932  */
   933 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
   934   gboolean isemptyline)
   935 {
   936     /* Don't repeat multiple warnings on one line. */
   937     gboolean eNon_A=FALSE,eTab=FALSE,eTilde=FALSE;
   938     gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
   939     const char *s;
   940     gunichar c;
   941     for (s=aline;*s;s=g_utf8_next_char(s))
   942     {
   943 	c=g_utf8_get_char(s);
   944 	if (!eNon_A && (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
   945 	{
   946 	    if (pswit[ECHO_SWITCH])
   947 		g_print("\n%s\n",aline);
   948 	    if (!pswit[OVERVIEW_SWITCH])
   949 		if (c>127 && c<160 || c>255)
   950 		    g_print("    Line %ld column %ld - "
   951 		      "Non-ISO-8859 character %u\n",
   952 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
   953 		else
   954 		    g_print("    Line %ld column %ld - "
   955 		      "Non-ASCII character %u\n",
   956 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
   957 	    else
   958 		cnt_bin++;
   959 	    eNon_A=TRUE;
   960 	}
   961 	if (!eTab && c==CHAR_TAB)
   962 	{
   963 	    if (pswit[ECHO_SWITCH])
   964 		g_print("\n%s\n",aline);
   965 	    if (!pswit[OVERVIEW_SWITCH])
   966 		g_print("    Line %ld column %ld - Tab character?\n",
   967 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
   968 	    else
   969 		cnt_odd++;
   970 	    eTab=TRUE;
   971 	}
   972 	if (!eTilde && c==CHAR_TILDE)
   973 	{
   974 	    /*
   975 	     * Often used by OCR software to indicate an
   976 	     * unrecognizable character.
   977 	     */
   978 	    if (pswit[ECHO_SWITCH])
   979 		g_print("\n%s\n",aline);
   980 	    if (!pswit[OVERVIEW_SWITCH])
   981 		g_print("    Line %ld column %ld - Tilde character?\n",
   982 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
   983 	    else
   984 		cnt_odd++;
   985 	    eTilde=TRUE;
   986 	}
   987 	if (!eCarat && c==CHAR_CARAT)
   988 	{  
   989 	    if (pswit[ECHO_SWITCH])
   990 		g_print("\n%s\n",aline);
   991 	    if (!pswit[OVERVIEW_SWITCH])
   992 		g_print("    Line %ld column %ld - Carat character?\n",
   993 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
   994 	    else
   995 		cnt_odd++;
   996 	    eCarat=TRUE;
   997 	}
   998 	if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
   999 	{  
  1000 	    if (pswit[ECHO_SWITCH])
  1001 		g_print("\n%s\n",aline);
  1002 	    if (!pswit[OVERVIEW_SWITCH])
  1003 		g_print("    Line %ld column %ld - Forward slash?\n",
  1004 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1005 	    else
  1006 		cnt_odd++;
  1007 	    eFSlash=TRUE;
  1008 	}
  1009 	/*
  1010 	 * Report asterisks only in paranoid mode,
  1011 	 * since they're often deliberate.
  1012 	 */
  1013 	if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
  1014 	  c==CHAR_ASTERISK)
  1015 	{
  1016 	    if (pswit[ECHO_SWITCH])
  1017 		g_print("\n%s\n",aline);
  1018 	    if (!pswit[OVERVIEW_SWITCH])
  1019 		g_print("    Line %ld column %ld - Asterisk?\n",
  1020 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1021 	    else
  1022 		cnt_odd++;
  1023 	    eAst=TRUE;
  1024 	}
  1025     }
  1026 }
  1027 
  1028 /*
  1029  * check_for_long_line:
  1030  *
  1031  * Check for line too long.
  1032  */
  1033 void check_for_long_line(const char *aline)
  1034 {
  1035     if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
  1036     {
  1037 	if (pswit[ECHO_SWITCH])
  1038 	    g_print("\n%s\n",aline);
  1039 	if (!pswit[OVERVIEW_SWITCH])
  1040 	    g_print("    Line %ld column %ld - Long line %ld\n",
  1041 	      linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
  1042 	else
  1043 	    cnt_long++;
  1044     }
  1045 }
  1046 
  1047 /*
  1048  * check_for_short_line:
  1049  *
  1050  * Check for line too short.
  1051  *
  1052  * This one is a bit trickier to implement: we don't want to
  1053  * flag the last line of a paragraph for being short, so we
  1054  * have to wait until we know that our current line is a
  1055  * "normal" line, then report the _previous_ line if it was too
  1056  * short. We also don't want to report indented lines like
  1057  * chapter heads or formatted quotations. We therefore keep
  1058  * last->len as the length of the last line examined, and
  1059  * last->blen as the length of the last but one, and try to
  1060  * suppress unnecessary warnings by checking that both were of
  1061  * "normal" length. We keep the first character of the last
  1062  * line in last->start, and if it was a space, we assume that
  1063  * the formatting is deliberate. I can't figure out a way to
  1064  * distinguish something like a quoted verse left-aligned or
  1065  * the header or footer of a letter from a paragraph of short
  1066  * lines - maybe if I examined the whole paragraph, and if the
  1067  * para has less than, say, 8 lines and if all lines are short,
  1068  * then just assume it's OK? Need to look at some texts to see
  1069  * how often a formula like this would get the right result.
  1070  */
  1071 void check_for_short_line(const char *aline,const struct line_properties *last)
  1072 {
  1073     if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
  1074       last->len<SHORTEST_PG_LINE && last->blen>1 &&
  1075       last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
  1076     {
  1077 	if (pswit[ECHO_SWITCH])
  1078 	    g_print("\n%s\n",prevline);
  1079 	if (!pswit[OVERVIEW_SWITCH])
  1080 	    g_print("    Line %ld column %ld - Short line %ld?\n",
  1081 	      linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
  1082 	else
  1083 	    cnt_short++;
  1084     }
  1085 }
  1086 
  1087 /*
  1088  * check_for_starting_punctuation:
  1089  *
  1090  * Look for punctuation other than full ellipses at start of line.
  1091  */
  1092 void check_for_starting_punctuation(const char *aline)
  1093 {
  1094     if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
  1095       !g_str_has_prefix(aline,". . ."))
  1096     {
  1097 	if (pswit[ECHO_SWITCH])
  1098 	    g_print("\n%s\n",aline);
  1099 	if (!pswit[OVERVIEW_SWITCH])
  1100 	    g_print("    Line %ld column 1 - Begins with punctuation?\n",
  1101 	      linecnt);
  1102 	else
  1103 	    cnt_punct++;
  1104     }
  1105 }
  1106 
  1107 /*
  1108  * check_for_spaced_emdash:
  1109  *
  1110  * Check for spaced em-dashes.
  1111  *
  1112  * We must check _all_ occurrences of "--" on the line
  1113  * hence the loop - even if the first double-dash is OK
  1114  * there may be another that's wrong later on.
  1115  */
  1116 void check_for_spaced_emdash(const char *aline)
  1117 {
  1118     const char *s,*t,*next;
  1119     for (s=aline;t=strstr(s,"--");s=next)
  1120     {
  1121 	next=g_utf8_next_char(g_utf8_next_char(t));
  1122 	if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
  1123 	  g_utf8_get_char(next)==CHAR_SPACE)
  1124 	{
  1125 	    if (pswit[ECHO_SWITCH])
  1126 		g_print("\n%s\n",aline);
  1127 	    if (!pswit[OVERVIEW_SWITCH])
  1128 		g_print("    Line %ld column %ld - Spaced em-dash?\n",
  1129 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  1130 	    else
  1131 		cnt_dash++;
  1132 	}
  1133     }
  1134 }
  1135 
  1136 /*
  1137  * check_for_spaced_dash:
  1138  *
  1139  * Check for spaced dashes.
  1140  */
  1141 void check_for_spaced_dash(const char *aline)
  1142 {
  1143     const char *s;
  1144     if ((s=strstr(aline," -")))
  1145     {
  1146 	if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
  1147 	{
  1148 	    if (pswit[ECHO_SWITCH])
  1149 		g_print("\n%s\n",aline);
  1150 	    if (!pswit[OVERVIEW_SWITCH])
  1151 		g_print("    Line %ld column %ld - Spaced dash?\n",
  1152 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1153 	    else
  1154 		cnt_dash++;
  1155 	}
  1156     }
  1157     else if ((s=strstr(aline,"- ")))
  1158     {
  1159 	if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
  1160 	{
  1161 	    if (pswit[ECHO_SWITCH])
  1162 		g_print("\n%s\n",aline);
  1163 	    if (!pswit[OVERVIEW_SWITCH])
  1164 		g_print("    Line %ld column %ld - Spaced dash?\n",
  1165 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1166 	    else
  1167 		cnt_dash++;
  1168 	}
  1169     }
  1170 }
  1171 
  1172 /*
  1173  * check_for_unmarked_paragraphs:
  1174  *
  1175  * Check for unmarked paragraphs indicated by separate speakers.
  1176  *
  1177  * May well be false positive:
  1178  * "Bravo!" "Wonderful!" called the crowd.
  1179  * but useful all the same.
  1180  */
  1181 void check_for_unmarked_paragraphs(const char *aline)
  1182 {
  1183     const char *s;
  1184     s=strstr(aline,"\"  \"");
  1185     if (!s)
  1186 	s=strstr(aline,"\" \"");
  1187     if (s)
  1188     {
  1189 	if (pswit[ECHO_SWITCH])
  1190 	    g_print("\n%s\n",aline);
  1191 	if (!pswit[OVERVIEW_SWITCH])
  1192 	    g_print("    Line %ld column %ld - "
  1193 	      "Query missing paragraph break?\n",
  1194 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1195 	else
  1196 	    cnt_punct++;
  1197     }
  1198 }
  1199 
  1200 /*
  1201  * check_for_jeebies:
  1202  *
  1203  * Check for "to he" and other easy h/b errors.
  1204  *
  1205  * This is a very inadequate effort on the h/b problem,
  1206  * but the phrase "to he" is always an error, whereas "to
  1207  * be" is quite common.
  1208  * Similarly, '"Quiet!", be said.' is a non-be error
  1209  * "to he" is _not_ always an error!:
  1210  *       "Where they went to he couldn't say."
  1211  * Another false positive:
  1212  *       What would "Cinderella" be without the . . .
  1213  * and another: "If he wants to he can see for himself."
  1214  */
  1215 void check_for_jeebies(const char *aline)
  1216 {
  1217     const char *s;
  1218     s=strstr(aline," be could ");
  1219     if (!s)
  1220 	s=strstr(aline," be would ");
  1221     if (!s)
  1222 	s=strstr(aline," was be ");
  1223     if (!s)
  1224 	s=strstr(aline," be is ");
  1225     if (!s)
  1226 	s=strstr(aline," is be ");
  1227     if (!s)
  1228 	s=strstr(aline,"\", be ");
  1229     if (!s)
  1230 	s=strstr(aline,"\" be ");
  1231     if (!s)
  1232 	s=strstr(aline,"\" be ");
  1233     if (!s)
  1234 	s=strstr(aline," to he ");
  1235     if (s)
  1236     {
  1237 	if (pswit[ECHO_SWITCH])
  1238 	    g_print("\n%s\n",aline);
  1239 	if (!pswit[OVERVIEW_SWITCH])
  1240 	    g_print("    Line %ld column %ld - Query he/be error?\n",
  1241 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1242 	else
  1243 	    cnt_word++;
  1244     }
  1245     s=strstr(aline," the had ");
  1246     if (!s)
  1247 	s=strstr(aline," a had ");
  1248     if (!s)
  1249 	s=strstr(aline," they bad ");
  1250     if (!s)
  1251 	s=strstr(aline," she bad ");
  1252     if (!s)
  1253 	s=strstr(aline," he bad ");
  1254     if (!s)
  1255 	s=strstr(aline," you bad ");
  1256     if (!s)
  1257 	s=strstr(aline," i bad ");
  1258     if (s)
  1259     {
  1260 	if (pswit[ECHO_SWITCH])
  1261 	    g_print("\n%s\n",aline);
  1262 	if (!pswit[OVERVIEW_SWITCH])
  1263 	    g_print("    Line %ld column %ld - Query had/bad error?\n",
  1264 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1265 	else
  1266 	    cnt_word++;
  1267     }
  1268     s=strstr(aline,"; hut ");
  1269     if (!s)
  1270 	s=strstr(aline,", hut ");
  1271     if (s)
  1272     {
  1273 	if (pswit[ECHO_SWITCH])
  1274 	    g_print("\n%s\n",aline);
  1275 	if (!pswit[OVERVIEW_SWITCH])
  1276 	    g_print("    Line %ld column %ld - Query hut/but error?\n",
  1277 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1278 	else
  1279 	    cnt_word++;
  1280     }
  1281 }
  1282 
  1283 /*
  1284  * check_for_mta_from:
  1285  *
  1286  * Special case - angled bracket in front of "From" placed there by an
  1287  * MTA when sending an e-mail.
  1288  */
  1289 void check_for_mta_from(const char *aline)
  1290 {
  1291     const char *s;
  1292     s=strstr(aline,">From");
  1293     if (s)
  1294     {
  1295 	if (pswit[ECHO_SWITCH])
  1296 	    g_print("\n%s\n",aline);
  1297 	if (!pswit[OVERVIEW_SWITCH])
  1298 	    g_print("    Line %ld column %ld - "
  1299 	      "Query angled bracket with From\n",
  1300 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1301 	else
  1302 	    cnt_punct++;
  1303     }
  1304 }
  1305 
  1306 /*
  1307  * check_for_orphan_character:
  1308  *
  1309  * Check for a single character line -
  1310  * often an overflow from bad wrapping.
  1311  */
  1312 void check_for_orphan_character(const char *aline)
  1313 {
  1314     gunichar c;
  1315     c=g_utf8_get_char(aline);
  1316     if (c && !*g_utf8_next_char(aline))
  1317     {
  1318 	if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
  1319 	    ; /* Nothing - ignore numerals alone on a line. */
  1320 	else
  1321 	{
  1322 	    if (pswit[ECHO_SWITCH])
  1323 		g_print("\n%s\n",aline);
  1324 	    if (!pswit[OVERVIEW_SWITCH])
  1325 		g_print("    Line %ld column 1 - Query single character line\n",
  1326 		  linecnt);
  1327 	    else
  1328 		cnt_punct++;
  1329 	}
  1330     }
  1331 }
  1332 
  1333 /*
  1334  * check_for_pling_scanno:
  1335  *
  1336  * Check for I" - often should be !
  1337  */
  1338 void check_for_pling_scanno(const char *aline)
  1339 {
  1340     const char *s;
  1341     s=strstr(aline," I\"");
  1342     if (s)
  1343     {
  1344 	if (pswit[ECHO_SWITCH])
  1345 	    g_print("\n%s\n",aline);
  1346 	if (!pswit[OVERVIEW_SWITCH])
  1347 	    g_print("    Line %ld column %ld - Query I=exclamation mark?\n",
  1348 	      linecnt,g_utf8_pointer_to_offset(aline,s));
  1349 	else
  1350 	    cnt_punct++;
  1351     }
  1352 }
  1353 
  1354 /*
  1355  * check_for_extra_period:
  1356  *
  1357  * Check for period without a capital letter. Cut-down from gutspell.
  1358  * Only works when it happens on a single line.
  1359  */
  1360 void check_for_extra_period(const char *aline,const struct warnings *warnings)
  1361 {
  1362     const char *s,*t,*s1;
  1363     int i;
  1364     gsize len;
  1365     gboolean istypo;
  1366     gchar *testword;
  1367     gunichar *decomposition;
  1368     if (pswit[PARANOID_SWITCH])
  1369     {
  1370 	for (t=aline;t=strstr(t,". ");)
  1371 	{
  1372 	    if (t==aline)
  1373 	    {
  1374 		t=g_utf8_next_char(t);
  1375 		/* start of line punctuation is handled elsewhere */
  1376 		continue;
  1377 	    }
  1378 	    if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
  1379 	    {
  1380 		t=g_utf8_next_char(t);
  1381 		continue;
  1382 	    }
  1383 	    if (warnings->isDutch)
  1384 	    {
  1385 		/* For Frank & Jeroen -- 's Middags case */
  1386 		gunichar c2,c3,c4,c5;
  1387 		c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
  1388 		c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
  1389 		c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
  1390 		c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
  1391 		if (c2==CHAR_SQUOTE && g_unichar_islower(c3) &&
  1392 		  c4==CHAR_SPACE && g_unichar_isupper(c5))
  1393 		{
  1394 		    t=g_utf8_next_char(t);
  1395 		    continue;
  1396 		}
  1397 	    }
  1398 	    s1=g_utf8_next_char(g_utf8_next_char(t));
  1399 	    while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
  1400 	      !isdigit(g_utf8_get_char(s1)))
  1401 		s1=g_utf8_next_char(s1);
  1402 	    if (g_unichar_islower(g_utf8_get_char(s1)))
  1403 	    {
  1404 		/* we have something to investigate */
  1405 		istypo=TRUE;
  1406 		/* so let's go back and find out */
  1407 		for (s1=g_utf8_prev_char(t);s1>=aline &&
  1408 		  (g_unichar_isalpha(g_utf8_get_char(s1)) ||
  1409 		  g_unichar_isdigit(g_utf8_get_char(s1)) ||
  1410 		  g_utf8_get_char(s1)==CHAR_SQUOTE &&
  1411 		  g_unichar_isalpha(g_utf8_get_char(g_utf8_next_char(s1))) &&
  1412 		  g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s1))));
  1413 		  s1=g_utf8_prev_char(s1))
  1414 		    ;
  1415 		s1=g_utf8_next_char(s1);
  1416 		s=strchr(s1,'.');
  1417 		if (s)
  1418 		    testword=g_strndup(s1,s-s1);
  1419 		else
  1420 		    testword=g_strdup(s1);
  1421 		for (i=0;*abbrev[i];i++)
  1422 		    if (!strcmp(testword,abbrev[i]))
  1423 			istypo=FALSE;
  1424 		if (g_unichar_isdigit(g_utf8_get_char(testword)))
  1425 		    istypo=FALSE;
  1426 		if (!*g_utf8_next_char(testword))
  1427 		    istypo=FALSE;
  1428 		if (isroman(testword))
  1429 		    istypo=FALSE;
  1430 		if (istypo)
  1431 		{
  1432 		    istypo=FALSE;
  1433 		    for (s=testword;*s;s=g_utf8_next_char(s))
  1434 		    {
  1435 			decomposition=g_unicode_canonical_decomposition(
  1436 			  g_utf8_get_char(s),&len);
  1437 			if (g_utf8_strchr("aeiou",-1,decomposition[0]))
  1438 			    istypo=TRUE;
  1439 			g_free(decomposition);
  1440 		    }
  1441 		}
  1442 		if (istypo &&
  1443 		  (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
  1444 		{
  1445 		    g_tree_insert(qperiod,g_strdup(testword),
  1446 		      GINT_TO_POINTER(1));
  1447 		    if (pswit[ECHO_SWITCH])
  1448 			g_print("\n%s\n",aline);
  1449 		    if (!pswit[OVERVIEW_SWITCH])
  1450 			g_print("    Line %ld column %ld - Extra period?\n",
  1451 			  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  1452 		    else
  1453 			cnt_punct++;
  1454 		}
  1455 		g_free(testword);
  1456 	    }
  1457 	    t=g_utf8_next_char(t);
  1458 	}
  1459     }
  1460 }
  1461 
  1462 /*
  1463  * check_for_following_punctuation:
  1464  *
  1465  * Check for words usually not followed by punctuation.
  1466  */
  1467 void check_for_following_punctuation(const char *aline)
  1468 {
  1469     int i;
  1470     const char *s,*wordstart;
  1471     gunichar c;
  1472     gchar *inword,*t;
  1473     if (pswit[TYPO_SWITCH])
  1474     {
  1475 	for (s=aline;*s;)
  1476 	{
  1477 	    wordstart=s;
  1478 	    t=getaword(&s);
  1479 	    if (!*t)
  1480 	    {
  1481 		g_free(t);
  1482 		continue;
  1483 	    }
  1484 	    inword=g_utf8_strdown(t,-1);
  1485 	    g_free(t);
  1486 	    for (i=0;*nocomma[i];i++)
  1487 		if (!strcmp(inword,nocomma[i]))
  1488 		{
  1489 		    c=g_utf8_get_char(s);
  1490 		    if (c==',' || c==';' || c==':')
  1491 		    {
  1492 			if (pswit[ECHO_SWITCH])
  1493 			    g_print("\n%s\n",aline);
  1494 			if (!pswit[OVERVIEW_SWITCH])
  1495 			    g_print("    Line %ld column %ld - "
  1496 			      "Query punctuation after %s?\n",
  1497 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,
  1498 			      inword);
  1499 			else
  1500 			    cnt_punct++;
  1501 		    }
  1502 		}
  1503 	    for (i=0;*noperiod[i];i++)
  1504 		if (!strcmp(inword,noperiod[i]))
  1505 		{
  1506 		    c=g_utf8_get_char(s);
  1507 		    if (c=='.' || c=='!')
  1508 		    {
  1509 			if (pswit[ECHO_SWITCH])
  1510 			    g_print("\n%s\n",aline);
  1511 			if (!pswit[OVERVIEW_SWITCH])
  1512 			    g_print("    Line %ld column %ld - "
  1513 			      "Query punctuation after %s?\n",
  1514 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,
  1515 			      inword);
  1516 			else
  1517 			    cnt_punct++;
  1518 		    }
  1519 		}
  1520 	    g_free(inword);
  1521 	}
  1522     }
  1523 }
  1524 
  1525 /*
  1526  * check_for_typos:
  1527  *
  1528  * Check for commonly mistyped words,
  1529  * and digits like 0 for O in a word.
  1530  */
  1531 void check_for_typos(const char *aline,struct warnings *warnings)
  1532 {
  1533     const char *s,*t,*nt,*wordstart;
  1534     gchar *inword;
  1535     gunichar *decomposition;
  1536     gchar *testword;
  1537     int i,vowel,consonant,*dupcnt;
  1538     gboolean isdup,istypo,alower;
  1539     gunichar c;
  1540     long offset,len;
  1541     gsize decomposition_len;
  1542     for (s=aline;*s;)
  1543     {
  1544 	wordstart=s;
  1545 	inword=getaword(&s);
  1546 	if (!*inword)
  1547 	{
  1548 	    g_free(inword);
  1549 	    continue; /* don't bother with empty lines */
  1550 	}
  1551 	if (mixdigit(inword))
  1552 	{
  1553 	    if (pswit[ECHO_SWITCH])
  1554 		g_print("\n%s\n",aline);
  1555 	    if (!pswit[OVERVIEW_SWITCH])
  1556 		g_print("    Line %ld column %ld - Query digit in %s\n",
  1557 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
  1558 	    else
  1559 		cnt_word++;
  1560 	}
  1561 	/*
  1562 	 * Put the word through a series of tests for likely typos and OCR
  1563 	 * errors.
  1564 	 */
  1565 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
  1566 	{
  1567 	    istypo=FALSE;
  1568 	    alower=FALSE;
  1569 	    for (t=inword;*t;t=g_utf8_next_char(t))
  1570 	    {
  1571 		c=g_utf8_get_char(t);
  1572 		nt=g_utf8_next_char(t);
  1573 		/* lowercase for testing */
  1574 		if (g_unichar_islower(c))
  1575 		    alower=TRUE;
  1576 		if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
  1577 		{
  1578 		    /*
  1579 		     * We have an uppercase mid-word. However, there are
  1580 		     * common cases:
  1581 		     *   Mac and Mc like McGill
  1582 		     *   French contractions like l'Abbe
  1583 		     */
  1584 		    offset=g_utf8_pointer_to_offset(inword,t);
  1585 		    if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
  1586 		      offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
  1587 		      g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
  1588 		      offset>0 &&
  1589 		      g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SQUOTE)
  1590 			; /* do nothing! */
  1591 		    else
  1592 			istypo=TRUE;
  1593 		}
  1594 	    }
  1595 	    testword=g_utf8_casefold(inword,-1);
  1596 	}
  1597 	if (pswit[TYPO_SWITCH])
  1598 	{
  1599 	    /*
  1600 	     * Check for certain unlikely two-letter combinations at word
  1601 	     * start and end.
  1602 	     */
  1603 	    len=g_utf8_strlen(testword,-1);
  1604 	    if (len>1)
  1605 	    {
  1606 		for (i=0;*nostart[i];i++)
  1607 		    if (g_str_has_prefix(testword,nostart[i]))
  1608 			istypo=TRUE;
  1609 		for (i=0;*noend[i];i++)
  1610 		    if (g_str_has_suffix(testword,noend[i]))
  1611 			istypo=TRUE;
  1612 	    }
  1613 	    /* ght is common, gbt never. Like that. */
  1614 	    if (strstr(testword,"cb"))
  1615 		istypo=TRUE;
  1616 	    if (strstr(testword,"gbt"))
  1617 		istypo=TRUE;
  1618 	    if (strstr(testword,"pbt"))
  1619 		istypo=TRUE;
  1620 	    if (strstr(testword,"tbs"))
  1621 		istypo=TRUE;
  1622 	    if (strstr(testword,"mrn"))
  1623 		istypo=TRUE;
  1624 	    if (strstr(testword,"ahle"))
  1625 		istypo=TRUE;
  1626 	    if (strstr(testword,"ihle"))
  1627 		istypo=TRUE;
  1628 	    /*
  1629 	     * "TBE" does happen - like HEARTBEAT - but uncommon.
  1630 	     * Also "TBI" - frostbite, outbid - but uncommon.
  1631 	     * Similarly "ii" like Hawaii, or Pompeii, and in Roman
  1632 	     * numerals, but "ii" is a common scanno.
  1633 	     */
  1634 	    if (strstr(testword,"tbi"))
  1635 		istypo=TRUE;
  1636 	    if (strstr(testword,"tbe"))
  1637 		istypo=TRUE;
  1638 	    if (strstr(testword,"ii"))
  1639 		istypo=TRUE;
  1640 	    /*
  1641 	     * Check for no vowels or no consonants.
  1642 	     * If none, flag a typo.
  1643 	     */
  1644 	    if (!istypo && len>1)
  1645 	    {
  1646 		vowel=consonant=0;
  1647 		for (t=testword;*t;t=g_utf8_next_char(t))
  1648 		{
  1649 		    c=g_utf8_get_char(t);
  1650 		    decomposition=
  1651 		      g_unicode_canonical_decomposition(c,&decomposition_len);
  1652 		    if (c=='y' || g_unichar_isdigit(c))
  1653 		    {
  1654 			/* Yah, this is loose. */
  1655 			vowel++;
  1656 			consonant++;
  1657 		    }
  1658 		    else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
  1659 			vowel++;
  1660 		    else
  1661 			consonant++;
  1662 		    g_free(decomposition);
  1663 		}
  1664 		if (!vowel || !consonant)
  1665 		    istypo=TRUE;
  1666 	    }
  1667 	    /*
  1668 	     * Now exclude the word from being reported if it's in
  1669 	     * the okword list.
  1670 	     */
  1671 	    for (i=0;*okword[i];i++)
  1672 		if (!strcmp(testword,okword[i]))
  1673 		    istypo=FALSE;
  1674 	    /*
  1675 	     * What looks like a typo may be a Roman numeral.
  1676 	     * Exclude these.
  1677 	     */
  1678 	    if (istypo && isroman(testword))
  1679 		istypo=FALSE;
  1680 	    /* Check the manual list of typos. */
  1681 	    if (!istypo)
  1682 		for (i=0;*typo[i];i++)
  1683 		    if (!strcmp(testword,typo[i]))
  1684 			istypo=TRUE;
  1685 	    /*
  1686 	     * Check lowercase s, l, i and m - special cases.
  1687 	     *   "j" - often a semi-colon gone wrong.
  1688 	     *   "d" for a missing apostrophe - he d
  1689 	     *   "n" for "in"
  1690 	     */
  1691 	    if (!istypo && len==1 &&
  1692 	      g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
  1693 		istypo=TRUE;
  1694 	    if (istypo)
  1695 	    {
  1696 		dupcnt=g_tree_lookup(qword,testword);
  1697 		if (dupcnt)
  1698 		{
  1699 		    (*dupcnt)++;
  1700 		    isdup=!pswit[VERBOSE_SWITCH];
  1701 		}
  1702 		else
  1703 		{
  1704 		    dupcnt=g_new0(int,1);
  1705 		    g_tree_insert(qword,g_strdup(testword),dupcnt);
  1706 		    isdup=FALSE;
  1707 		}
  1708 		if (!isdup)
  1709 		{
  1710 		    if (pswit[ECHO_SWITCH])
  1711 			g_print("\n%s\n",aline);
  1712 		    if (!pswit[OVERVIEW_SWITCH])
  1713 		    {
  1714 			g_print("    Line %ld column %ld - Query word %s",
  1715 			  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
  1716 			  inword);
  1717 			if (!pswit[VERBOSE_SWITCH])
  1718 			    g_print(" - not reporting duplicates");
  1719 			g_print("\n");
  1720 		    }
  1721 		    else
  1722 			cnt_word++;
  1723 		}
  1724 	    }
  1725 	}
  1726 	/* check the user's list of typos */
  1727 	if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
  1728 	{
  1729 	    if (pswit[ECHO_SWITCH])
  1730 		g_print("\n%s\n",aline);
  1731 	    if (!pswit[OVERVIEW_SWITCH])  
  1732 		g_print("    Line %ld column %ld - Query possible scanno %s\n",
  1733 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
  1734 	}
  1735 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
  1736 	    g_free(testword);
  1737 	if (pswit[PARANOID_SWITCH] && warnings->digit)
  1738 	{
  1739 	    /* In paranoid mode, query all 0 and 1 standing alone. */
  1740 	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))
  1741 	    {
  1742 		if (pswit[ECHO_SWITCH])
  1743 		    g_print("\n%s\n",aline);
  1744 		if (!pswit[OVERVIEW_SWITCH])
  1745 		    g_print("    Line %ld column %ld - Query standalone %s\n",
  1746 		      linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
  1747 		      inword);
  1748 		else
  1749 		    cnt_word++;
  1750 	    }
  1751 	}
  1752 	g_free(inword);
  1753     }
  1754 }
  1755 
  1756 /*
  1757  * check_for_misspaced_punctuation:
  1758  *
  1759  * Look for added or missing spaces around punctuation and quotes.
  1760  * If there is a punctuation character like ! with no space on
  1761  * either side, suspect a missing!space. If there are spaces on
  1762  * both sides , assume a typo. If we see a double quote with no
  1763  * space or punctuation on either side of it, assume unspaced
  1764  * quotes "like"this.
  1765  */
  1766 void check_for_misspaced_punctuation(const char *aline,
  1767   struct parities *parities,gboolean isemptyline)
  1768 {
  1769     gboolean isacro,isellipsis;
  1770     const char *s;
  1771     gunichar c,nc,pc,n2c;
  1772     c=g_utf8_get_char(aline);
  1773     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  1774     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  1775     {
  1776 	pc=c;
  1777 	c=nc;
  1778 	nc=g_utf8_get_char(g_utf8_next_char(s));
  1779 	/* For each character in the line after the first. */
  1780 	if (g_utf8_strchr(".?!,;:_",-1,c))  /* if it's punctuation */
  1781 	{
  1782 	    /* we need to suppress warnings for acronyms like M.D. */
  1783 	    isacro=FALSE;
  1784 	    /* we need to suppress warnings for ellipsis . . . */
  1785 	    isellipsis=FALSE;
  1786 	    /*
  1787 	     * If there are letters on both sides of it or
  1788 	     * if it's strict punctuation followed by an alpha.
  1789 	     */
  1790 	    if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
  1791 	      g_utf8_strchr("?!,;:",-1,c)))
  1792 	    {
  1793 		if (c=='.')
  1794 		{
  1795 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&
  1796 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
  1797 			isacro=TRUE;
  1798 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
  1799 		    if (nc && n2c=='.')
  1800 			isacro=TRUE;
  1801 		}
  1802 		if (!isacro)
  1803 		{
  1804 		    if (pswit[ECHO_SWITCH])
  1805 			g_print("\n%s\n",aline);
  1806 		    if (!pswit[OVERVIEW_SWITCH])
  1807 			g_print("    Line %ld column %ld - Missing space?\n",
  1808 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1809 		    else
  1810 			cnt_punct++;
  1811 		}
  1812 	    }
  1813 	    if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
  1814 	    {
  1815 		/*
  1816 		 * If there are spaces on both sides,
  1817 		 * or space before and end of line.
  1818 		 */
  1819 		if (c=='.')
  1820 		{
  1821 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&
  1822 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
  1823 			isellipsis=TRUE;
  1824 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
  1825 		    if (nc && n2c=='.')
  1826 			isellipsis=TRUE;
  1827 		}
  1828 		if (!isemptyline && !isellipsis)
  1829 		{
  1830 		    if (pswit[ECHO_SWITCH])
  1831 			g_print("\n%s\n",aline);
  1832 		    if (!pswit[OVERVIEW_SWITCH])
  1833 			g_print("    Line %ld column %ld - "
  1834 			  "Spaced punctuation?\n",linecnt,
  1835 			  g_utf8_pointer_to_offset(aline,s)+1);
  1836 		    else
  1837 			cnt_punct++;
  1838 		}
  1839 	    }
  1840 	}
  1841     }
  1842     /* Split out the characters that CANNOT be preceded by space. */
  1843     c=g_utf8_get_char(aline);
  1844     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  1845     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  1846     {
  1847 	pc=c;
  1848 	c=nc;
  1849 	nc=g_utf8_get_char(g_utf8_next_char(s));
  1850 	/* for each character in the line after the first */
  1851 	if (g_utf8_strchr("?!,;:",-1,c))
  1852 	{
  1853 	    /* if it's punctuation that _cannot_ have a space before it */
  1854 	    if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
  1855 	    {
  1856 		/*
  1857 		 * If nc DOES == space,
  1858 		 * it was already reported just above.
  1859 		 */
  1860 		if (pswit[ECHO_SWITCH])
  1861 		    g_print("\n%s\n",aline);
  1862 		if (!pswit[OVERVIEW_SWITCH])
  1863 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",
  1864 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1865 		else
  1866 		    cnt_punct++;
  1867 	    }
  1868 	}
  1869     }
  1870     /*
  1871      * Special case " .X" where X is any alpha.
  1872      * This plugs a hole in the acronym code above.
  1873      * Inelegant, but maintainable.
  1874      */
  1875     c=g_utf8_get_char(aline);
  1876     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  1877     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  1878     {
  1879 	pc=c;
  1880 	c=nc;
  1881 	nc=g_utf8_get_char(g_utf8_next_char(s));
  1882 	/* for each character in the line after the first */
  1883 	if (c=='.')
  1884 	{
  1885 	    /* if it's a period */
  1886 	    if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
  1887 	    {
  1888 		/*
  1889 		 * If the period follows a space and
  1890 		 * is followed by a letter.
  1891 		 */
  1892 		if (pswit[ECHO_SWITCH])
  1893 		    g_print("\n%s\n",aline);
  1894 		if (!pswit[OVERVIEW_SWITCH])
  1895 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",
  1896 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1897 		else
  1898 		    cnt_punct++;
  1899 	    }
  1900 	}
  1901     }
  1902     c=g_utf8_get_char(aline);
  1903     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  1904     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  1905     {
  1906 	pc=c;
  1907 	c=nc;
  1908 	nc=g_utf8_get_char(g_utf8_next_char(s));
  1909 	/* for each character in the line after the first */
  1910 	if (c==CHAR_DQUOTE)
  1911 	{
  1912 	    if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
  1913 	      !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
  1914 	      !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
  1915 	    {
  1916 		if (pswit[ECHO_SWITCH])
  1917 		    g_print("\n%s\n",aline);
  1918 		if (!pswit[OVERVIEW_SWITCH])
  1919 		    g_print("    Line %ld column %ld - Unspaced quotes?\n",
  1920 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1921 		else
  1922 		    cnt_punct++;
  1923 	    }
  1924 	}
  1925     }
  1926     /* Check parity of quotes. */
  1927     nc=g_utf8_get_char(aline);
  1928     for (s=aline;*s;s=g_utf8_next_char(s))
  1929     {
  1930 	c=nc;
  1931 	nc=g_utf8_get_char(g_utf8_next_char(s));
  1932 	if (c==CHAR_DQUOTE)
  1933 	{
  1934 	    parities->dquote=!parities->dquote;
  1935 	    if (!parities->dquote)
  1936 	    {
  1937 		/* parity even */
  1938 		if (!g_utf8_strchr("_-.'`/,;:!?)]} ",-1,nc))
  1939 		{
  1940 		    if (pswit[ECHO_SWITCH])
  1941 			g_print("\n%s\n",aline);
  1942 		    if (!pswit[OVERVIEW_SWITCH])
  1943 			g_print("    Line %ld column %ld - "
  1944 			  "Wrongspaced quotes?\n",
  1945 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1946 		    else
  1947 			cnt_punct++;
  1948 		}
  1949 	    }
  1950 	    else
  1951 	    {
  1952 		/* parity odd */
  1953 		if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
  1954 		  !g_utf8_strchr("_-/.'`([{$",-1,nc) || !nc)
  1955 		{
  1956 		    if (pswit[ECHO_SWITCH])
  1957 			g_print("\n%s\n",aline);
  1958 		    if (!pswit[OVERVIEW_SWITCH])
  1959 			g_print("    Line %ld column %ld - "
  1960 			  "Wrongspaced quotes?\n",
  1961 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1962 		    else
  1963 			cnt_punct++;
  1964 		}
  1965 	    }
  1966 	}
  1967     }
  1968     if (g_utf8_get_char(aline)==CHAR_DQUOTE)
  1969     {
  1970 	if (g_utf8_strchr(",;:!?)]} ",-1,
  1971 	  g_utf8_get_char(g_utf8_next_char(aline))))
  1972 	{
  1973 	    if (pswit[ECHO_SWITCH])
  1974 		g_print("\n%s\n",aline);
  1975 	    if (!pswit[OVERVIEW_SWITCH])
  1976 		g_print("    Line %ld column 1 - Wrongspaced quotes?\n",
  1977 		  linecnt);
  1978 	    else
  1979 		cnt_punct++;
  1980 	}
  1981     }
  1982     if (pswit[SQUOTE_SWITCH])
  1983     {
  1984 	nc=g_utf8_get_char(aline);
  1985 	for (s=aline;*s;s=g_utf8_next_char(s))
  1986 	{
  1987 	    c=nc;
  1988 	    nc=g_utf8_get_char(g_utf8_next_char(s));
  1989 	    if ((c==CHAR_SQUOTE || c==CHAR_OPEN_SQUOTE) && (s==aline ||
  1990 	      s>aline &&
  1991 	      !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
  1992 	      !g_unichar_isalpha(nc)))
  1993 	    {
  1994 		parities->squote=!parities->squote;
  1995 		if (!parities->squote)
  1996 		{
  1997 		    /* parity even */
  1998 		    if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
  1999 		    {
  2000 			if (pswit[ECHO_SWITCH])
  2001 			    g_print("\n%s\n",aline);
  2002 			if (!pswit[OVERVIEW_SWITCH])
  2003 			    g_print("    Line %ld column %ld - "
  2004 			      "Wrongspaced singlequotes?\n",
  2005 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2006 			else
  2007 			    cnt_punct++;
  2008 		    }
  2009 		}
  2010 		else
  2011 		{
  2012 		    /* parity odd */
  2013 		    if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
  2014 		      !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
  2015 		    {
  2016 			if (pswit[ECHO_SWITCH])
  2017 			    g_print("\n%s\n",aline);
  2018 			if (!pswit[OVERVIEW_SWITCH])
  2019 			    g_print("    Line %ld column %ld - "
  2020 			      "Wrongspaced singlequotes?\n",
  2021 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2022 			else
  2023 			    cnt_punct++;
  2024 		    }
  2025 		}
  2026 	    }
  2027 	}
  2028     }
  2029 }
  2030 
  2031 /*
  2032  * check_for_double_punctuation:
  2033  *
  2034  * Look for double punctuation like ,. or ,,
  2035  * Thanks to DW for the suggestion!
  2036  * In books with references, ".," and ".;" are common
  2037  * e.g. "etc., etc.," and vol. 1.; vol 3.;
  2038  * OTOH, from my initial tests, there are also fairly
  2039  * common errors. What to do? Make these cases paranoid?
  2040  * ".," is the most common, so warnings->dotcomma is used
  2041  * to suppress detailed reporting if it occurs often.
  2042  */
  2043 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
  2044 {
  2045     const char *s;
  2046     gunichar c,nc;
  2047     nc=g_utf8_get_char(aline);
  2048     for (s=aline;*s;s=g_utf8_next_char(s))
  2049     {
  2050 	c=nc;
  2051 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2052 	/* for each punctuation character in the line */
  2053 	if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
  2054 	  g_utf8_strchr(".?!,;:",-1,nc))
  2055 	{
  2056 	    /* followed by punctuation, it's a query, unless . . . */
  2057 	    if (c==nc && (c=='.' || c=='?' || c=='!') ||
  2058 	      !warnings->dotcomma && c=='.' && nc==',' ||
  2059 	      warnings->isFrench && g_str_has_prefix(s,",...") ||
  2060 	      warnings->isFrench && g_str_has_prefix(s,"...,") ||
  2061 	      warnings->isFrench && g_str_has_prefix(s,";...") ||
  2062 	      warnings->isFrench && g_str_has_prefix(s,"...;") ||
  2063 	      warnings->isFrench && g_str_has_prefix(s,":...") ||
  2064 	      warnings->isFrench && g_str_has_prefix(s,"...:") ||
  2065 	      warnings->isFrench && g_str_has_prefix(s,"!...") ||
  2066 	      warnings->isFrench && g_str_has_prefix(s,"...!") ||
  2067 	      warnings->isFrench && g_str_has_prefix(s,"?...") ||
  2068 	      warnings->isFrench && g_str_has_prefix(s,"...?"))
  2069 	    {
  2070 		if (warnings->isFrench && g_str_has_prefix(s,",...") ||
  2071 		  warnings->isFrench && g_str_has_prefix(s,"...,") ||
  2072 		  warnings->isFrench && g_str_has_prefix(s,";...") ||
  2073 		  warnings->isFrench && g_str_has_prefix(s,"...;") ||
  2074 		  warnings->isFrench && g_str_has_prefix(s,":...") ||
  2075 		  warnings->isFrench && g_str_has_prefix(s,"...:") ||
  2076 		  warnings->isFrench && g_str_has_prefix(s,"!...") ||
  2077 		  warnings->isFrench && g_str_has_prefix(s,"...!") ||
  2078 		  warnings->isFrench && g_str_has_prefix(s,"?...") ||
  2079 		  warnings->isFrench && g_str_has_prefix(s,"...?"))
  2080 		{
  2081 		    s+=4;
  2082 		    nc=g_utf8_get_char(g_utf8_next_char(s));
  2083 		}
  2084 		; /* do nothing for .. !! and ?? which can be legit */
  2085 	    }
  2086 	    else
  2087 	    {
  2088 		if (pswit[ECHO_SWITCH])
  2089 		    g_print("\n%s\n",aline);
  2090 		if (!pswit[OVERVIEW_SWITCH])
  2091 		    g_print("    Line %ld column %ld - Double punctuation?\n",
  2092 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2093 		else
  2094 		    cnt_punct++;
  2095 	    }
  2096 	}
  2097     }
  2098 }
  2099 
  2100 /*
  2101  * check_for_spaced_quotes:
  2102  */
  2103 void check_for_spaced_quotes(const char *aline)
  2104 {
  2105     const char *s,*t;
  2106     s=aline;
  2107     while ((t=strstr(s," \" ")))
  2108     {
  2109 	if (pswit[ECHO_SWITCH])
  2110 	    g_print("\n%s\n",aline);
  2111 	if (!pswit[OVERVIEW_SWITCH])
  2112 	    g_print("    Line %ld column %ld - Spaced doublequote?\n",
  2113 	      linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  2114 	else
  2115 	    cnt_punct++;
  2116 	s=g_utf8_next_char(g_utf8_next_char(t));
  2117     }
  2118     s=aline;
  2119     while ((t=strstr(s," ' ")))
  2120     {
  2121 	if (pswit[ECHO_SWITCH])
  2122 	    g_print("\n%s\n",aline);
  2123 	if (!pswit[OVERVIEW_SWITCH])
  2124 	    g_print("    Line %ld column %ld - Spaced singlequote?\n",
  2125 	      linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  2126 	else
  2127 	    cnt_punct++;
  2128 	s=g_utf8_next_char(g_utf8_next_char(t));
  2129     }
  2130     s=aline;
  2131     while ((t=strstr(s," ` ")))
  2132     {
  2133 	if (pswit[ECHO_SWITCH])
  2134 	    g_print("\n%s\n",aline);
  2135 	if (!pswit[OVERVIEW_SWITCH])
  2136 	    g_print("    Line %ld column %ld - Spaced singlequote?\n",
  2137 	      linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  2138 	else
  2139 	    cnt_punct++;
  2140 	s=g_utf8_next_char(g_utf8_next_char(t));
  2141     }
  2142 }
  2143 
  2144 /*
  2145  * check_for_miscased_genative:
  2146  *
  2147  * Check special case of 'S instead of 's at end of word.
  2148  */
  2149 void check_for_miscased_genative(const char *aline)
  2150 {
  2151     const char *s;
  2152     gunichar c,nc,pc;
  2153     if (!*aline)
  2154 	return;
  2155     c=g_utf8_get_char(aline);
  2156     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2157     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2158     {
  2159 	pc=c;
  2160 	c=nc;
  2161 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2162 	if (c==CHAR_SQUOTE && nc=='S' && g_unichar_islower(pc))
  2163 	{
  2164 	    if (pswit[ECHO_SWITCH])
  2165 		g_print("\n%s\n",aline);
  2166 	    if (!pswit[OVERVIEW_SWITCH])
  2167 		g_print("    Line %ld column %ld - Capital \"S\"?\n",
  2168 		  linecnt,g_utf8_pointer_to_offset(aline,s)+2);
  2169 	    else
  2170 		cnt_punct++;
  2171 	}
  2172     }
  2173 }
  2174 
  2175 /*
  2176  * check_end_of_line:
  2177  *
  2178  * Now check special cases - start and end of line -
  2179  * for single and double quotes. Start is sometimes [sic]
  2180  * but better to query it anyway.
  2181  * While we're here, check for dash at end of line.
  2182  */
  2183 void check_end_of_line(const char *aline,struct warnings *warnings)
  2184 {
  2185     int lbytes;
  2186     const char *s;
  2187     gunichar c1,c2;
  2188     lbytes=strlen(aline);
  2189     if (g_utf8_strlen(aline,lbytes)>1)
  2190     {
  2191 	s=g_utf8_prev_char(aline+lbytes);
  2192 	c1=g_utf8_get_char(s);
  2193 	c2=g_utf8_get_char(g_utf8_prev_char(s));
  2194 	if ((c1==CHAR_DQUOTE || c1==CHAR_SQUOTE || c1==CHAR_OPEN_SQUOTE) &&
  2195 	  c2==CHAR_SPACE)
  2196 	{
  2197 	    if (pswit[ECHO_SWITCH])
  2198 		g_print("\n%s\n",aline);
  2199 	    if (!pswit[OVERVIEW_SWITCH])
  2200 		g_print("    Line %ld column %ld - Spaced quote?\n",linecnt,
  2201 		  g_utf8_strlen(aline,lbytes));
  2202 	    else
  2203 		cnt_punct++;
  2204 	}
  2205 	c1=g_utf8_get_char(aline);
  2206 	c2=g_utf8_get_char(g_utf8_next_char(aline));
  2207 	if ((c1==CHAR_SQUOTE || c1==CHAR_OPEN_SQUOTE) && c2==CHAR_SPACE)
  2208 	{
  2209 	    if (pswit[ECHO_SWITCH])
  2210 		g_print("\n%s\n",aline);
  2211 	    if (!pswit[OVERVIEW_SWITCH])
  2212 		g_print("    Line %ld column 1 - Spaced quote?\n",linecnt);
  2213 	    else
  2214 		cnt_punct++;
  2215 	}
  2216 	/*
  2217 	 * Dash at end of line may well be legit - paranoid mode only
  2218 	 * and don't report em-dash at line-end.
  2219 	 */
  2220 	if (pswit[PARANOID_SWITCH] && warnings->hyphen)
  2221 	{
  2222 	    for (s=g_utf8_prev_char(aline+lbytes);
  2223 	      s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
  2224 		;
  2225 	    if (g_utf8_get_char(s)=='-' &&
  2226 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')
  2227 	    {
  2228 		if (pswit[ECHO_SWITCH])
  2229 		    g_print("\n%s\n",aline);
  2230 		if (!pswit[OVERVIEW_SWITCH])
  2231 		    g_print("    Line %ld column %ld - "
  2232 		      "Hyphen at end of line?\n",
  2233 		      linecnt,g_utf8_pointer_to_offset(aline,s));
  2234 	    }
  2235 	}
  2236     }
  2237 }
  2238 
  2239 /*
  2240  * check_for_unspaced_bracket:
  2241  *
  2242  * Brackets are often unspaced, but shouldn't be surrounded by alpha.
  2243  * If so, suspect a scanno like "a]most".
  2244  */
  2245 void check_for_unspaced_bracket(const char *aline)
  2246 {
  2247     const char *s;
  2248     gunichar c,nc,pc;
  2249     c=g_utf8_get_char(aline);
  2250     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2251     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2252     {
  2253 	pc=c;
  2254 	c=nc;
  2255 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2256 	if (!nc)
  2257 	    break;
  2258 	/* for each bracket character in the line except 1st & last */
  2259 	if (g_utf8_strchr("{[()]}",-1,c) &&
  2260 	  g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
  2261 	{
  2262 	    if (pswit[ECHO_SWITCH])
  2263 		g_print("\n%s\n",aline);
  2264 	    if (!pswit[OVERVIEW_SWITCH])
  2265 		g_print("    Line %ld column %ld - Unspaced bracket?\n",
  2266 		  linecnt,g_utf8_pointer_to_offset(aline,s));
  2267 	    else
  2268 		cnt_punct++;
  2269 	}
  2270     }
  2271 }
  2272 
  2273 /*
  2274  * check_for_unpunctuated_endquote:
  2275  */
  2276 void check_for_unpunctuated_endquote(const char *aline)
  2277 {
  2278     const char *s;
  2279     gunichar c,nc,pc;
  2280     c=g_utf8_get_char(aline);
  2281     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2282     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2283     {
  2284 	pc=c;
  2285 	c=nc;
  2286 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2287 	/* for each character in the line except 1st */
  2288 	if (c==CHAR_DQUOTE && isalpha(pc))
  2289 	{
  2290 	    if (pswit[ECHO_SWITCH])
  2291 		g_print("\n%s\n",aline);
  2292 	    if (!pswit[OVERVIEW_SWITCH])
  2293 		g_print("    Line %ld column %ld - "
  2294 		  "endquote missing punctuation?\n",
  2295 		  linecnt,g_utf8_pointer_to_offset(aline,s));
  2296 	    else
  2297 		cnt_punct++;
  2298 	}
  2299     }
  2300 }
  2301 
  2302 /*
  2303  * check_for_html_tag:
  2304  *
  2305  * Check for <HTML TAG>.
  2306  *
  2307  * If there is a < in the line, followed at some point
  2308  * by a > then we suspect HTML.
  2309  */
  2310 void check_for_html_tag(const char *aline)
  2311 {
  2312     const char *open,*close;
  2313     gchar *tag;
  2314     open=strchr(aline,'<');
  2315     if (open)
  2316     {
  2317 	close=strchr(g_utf8_next_char(open),'>');
  2318 	if (close)
  2319 	{
  2320 	    if (pswit[ECHO_SWITCH])
  2321 		g_print("\n%s\n",aline);
  2322 	    if (!pswit[OVERVIEW_SWITCH])
  2323 	    {
  2324 		tag=g_strndup(open,close-open+1);
  2325 		g_print("    Line %ld column %ld - HTML Tag? %s \n",
  2326 		  linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
  2327 		g_free(tag);
  2328 	    }
  2329 	    else
  2330 		cnt_html++;
  2331 	}
  2332     }
  2333 }
  2334 
  2335 /*
  2336  * check_for_html_entity:
  2337  *
  2338  * Check for &symbol; HTML.
  2339  *
  2340  * If there is a & in the line, followed at
  2341  * some point by a ; then we suspect HTML.
  2342  */
  2343 void check_for_html_entity(const char *aline)
  2344 {
  2345     const char *s,*amp,*scolon;
  2346     gchar *entity;
  2347     amp=strchr(aline,'&');
  2348     if (amp)
  2349     {
  2350 	scolon=strchr(amp,';');
  2351 	if (scolon)
  2352 	{
  2353 	    for (s=amp;s<scolon;s=g_utf8_next_char(s))   
  2354 		if (g_utf8_get_char(s)==CHAR_SPACE)
  2355 		    break;		/* Don't report "Jones & Son;" */
  2356 	    if (s>=scolon)
  2357 	    {
  2358 		if (pswit[ECHO_SWITCH])
  2359 		    g_print("\n%s\n",aline);
  2360 		if (!pswit[OVERVIEW_SWITCH])
  2361 		{
  2362 		    entity=g_strndup(amp,scolon-amp+1);
  2363 		    g_print("    Line %ld column %d - HTML symbol? %s \n",
  2364 		      linecnt,(int)(amp-aline)+1,entity);
  2365 		    g_free(entity);
  2366 		}
  2367 		else
  2368 		    cnt_html++;
  2369 	    }
  2370 	}
  2371     }
  2372 }
  2373 
  2374 /*
  2375  * print_pending:
  2376  *
  2377  * If we are in a state of unbalanced quotes, and this line
  2378  * doesn't begin with a quote, output the stored error message.
  2379  * If the -P switch was used, print the warning even if the
  2380  * new para starts with quotes.
  2381  */
  2382 void print_pending(const char *aline,const char *parastart,
  2383   struct pending *pending)
  2384 {
  2385     const char *s;
  2386     gunichar c;
  2387     s=aline;
  2388     while (*s==' ')
  2389 	s++;
  2390     c=g_utf8_get_char(s);
  2391     if (pending->dquote)
  2392     {
  2393 	if (c!=CHAR_DQUOTE || pswit[QPARA_SWITCH])
  2394 	{
  2395 	    if (!pswit[OVERVIEW_SWITCH])
  2396 	    {
  2397 		if (pswit[ECHO_SWITCH])
  2398 		    g_print("\n%s\n",parastart);
  2399 		g_print("%s\n",pending->dquote);
  2400 	    }
  2401 	    else
  2402 		cnt_dquot++;
  2403 	}
  2404 	g_free(pending->dquote);
  2405 	pending->dquote=NULL;
  2406     }
  2407     if (pending->squote)
  2408     {
  2409 	if (c!=CHAR_SQUOTE && c!=CHAR_OPEN_SQUOTE || pswit[QPARA_SWITCH] ||
  2410 	  pending->squot)
  2411 	{
  2412 	    if (!pswit[OVERVIEW_SWITCH])
  2413 	    {
  2414 		if (pswit[ECHO_SWITCH])
  2415 		    g_print("\n%s\n",parastart);
  2416 		g_print("%s\n",pending->squote);
  2417 	    }
  2418 	    else
  2419 		cnt_squot++;
  2420 	}
  2421 	g_free(pending->squote);
  2422 	pending->squote=NULL;
  2423     }
  2424     if (pending->rbrack)
  2425     {
  2426 	if (!pswit[OVERVIEW_SWITCH])
  2427 	{
  2428 	    if (pswit[ECHO_SWITCH])
  2429 		g_print("\n%s\n",parastart);
  2430 	    g_print("%s\n",pending->rbrack);
  2431 	}
  2432 	else
  2433 	    cnt_brack++;
  2434 	g_free(pending->rbrack);
  2435 	pending->rbrack=NULL;
  2436     }
  2437     if (pending->sbrack)
  2438     {
  2439 	if (!pswit[OVERVIEW_SWITCH])
  2440 	{
  2441 	    if (pswit[ECHO_SWITCH])
  2442 		g_print("\n%s\n",parastart);
  2443 	    g_print("%s\n",pending->sbrack);
  2444 	}
  2445 	else
  2446 	    cnt_brack++;
  2447 	g_free(pending->sbrack);
  2448 	pending->sbrack=NULL;
  2449     }
  2450     if (pending->cbrack)
  2451     {
  2452 	if (!pswit[OVERVIEW_SWITCH])
  2453 	{
  2454 	    if (pswit[ECHO_SWITCH])
  2455 		g_print("\n%s\n",parastart);
  2456 	    g_print("%s\n",pending->cbrack);
  2457 	}
  2458 	else
  2459 	    cnt_brack++;
  2460 	g_free(pending->cbrack);
  2461 	pending->cbrack=NULL;
  2462     }
  2463     if (pending->unders)
  2464     {
  2465 	if (!pswit[OVERVIEW_SWITCH])
  2466 	{
  2467 	    if (pswit[ECHO_SWITCH])
  2468 		g_print("\n%s\n",parastart);
  2469 	    g_print("%s\n",pending->unders);
  2470 	}
  2471 	else
  2472 	    cnt_brack++;
  2473 	g_free(pending->unders);
  2474 	pending->unders=NULL;
  2475     }
  2476 }
  2477 
  2478 /*
  2479  * check_for_mismatched_quotes:
  2480  *
  2481  * At end of paragraph, check for mismatched quotes.
  2482  *
  2483  * We don't want to report an error immediately, since it is a
  2484  * common convention to omit the quotes at end of paragraph if
  2485  * the next paragraph is a continuation of the same speaker.
  2486  * Where this is the case, the next para should begin with a
  2487  * quote, so we store the warning message and only display it
  2488  * at the top of the next iteration if the new para doesn't
  2489  * start with a quote.
  2490  * The -p switch overrides this default, and warns of unclosed
  2491  * quotes on _every_ paragraph, whether the next begins with a
  2492  * quote or not.
  2493  */
  2494 void check_for_mismatched_quotes(const struct counters *counters,
  2495   struct pending *pending)
  2496 {
  2497     if (counters->quot%2)
  2498 	pending->dquote=
  2499 	  g_strdup_printf("    Line %ld - Mismatched quotes",linecnt);
  2500     if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&
  2501       counters->open_single_quote!=counters->close_single_quote)
  2502 	pending->squote=
  2503 	  g_strdup_printf("    Line %ld - Mismatched singlequotes?",linecnt);
  2504     if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&
  2505       counters->open_single_quote!=counters->close_single_quote &&
  2506       counters->open_single_quote!=counters->close_single_quote+1)
  2507 	/*
  2508 	 * Flag it to be noted regardless of the
  2509 	 * first char of the next para.
  2510 	 */
  2511 	pending->squot=1;
  2512     if (counters->r_brack)
  2513 	pending->rbrack=
  2514 	  g_strdup_printf("    Line %ld - Mismatched round brackets?",linecnt);
  2515     if (counters->s_brack)
  2516 	pending->sbrack=
  2517 	  g_strdup_printf("    Line %ld - Mismatched square brackets?",linecnt);
  2518     if (counters->c_brack)
  2519 	pending->cbrack=
  2520 	  g_strdup_printf("    Line %ld - Mismatched curly brackets?",linecnt);
  2521     if (counters->c_unders%2)
  2522 	pending->unders=
  2523 	  g_strdup_printf("    Line %ld - Mismatched underscores?",linecnt);
  2524 }
  2525 
  2526 /*
  2527  * check_for_omitted_punctuation:
  2528  *
  2529  * Check for omitted punctuation at end of paragraph by working back
  2530  * through prevline. DW.
  2531  * Need to check this only for "normal" paras.
  2532  * So what is a "normal" para?
  2533  *    Not normal if one-liner (chapter headings, etc.)
  2534  *    Not normal if doesn't contain at least one locase letter
  2535  *    Not normal if starts with space
  2536  */
  2537 void check_for_omitted_punctuation(const char *prevline,
  2538   struct line_properties *last,int start_para_line)
  2539 {
  2540     gboolean letter_on_line=FALSE;
  2541     const char *s;
  2542     for (s=prevline;*s;s=g_utf8_next_char(s))
  2543 	if (g_unichar_isalpha(g_utf8_get_char(s)))
  2544 	{
  2545 	    letter_on_line=TRUE;
  2546 	    break;
  2547 	}
  2548     /*
  2549      * This next "if" is a problem.
  2550      * If we say "start_para_line <= linecnt - 1", that includes
  2551      * one-line "paragraphs" like chapter heads. Lotsa false positives.
  2552      * If we say "start_para_line < linecnt - 1" it doesn't, but then it
  2553      * misses genuine one-line paragraphs.
  2554      */
  2555     if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
  2556       g_utf8_get_char(prevline)>CHAR_SPACE)
  2557     {
  2558 	for (s=g_utf8_prev_char(prevline+strlen(prevline));
  2559 	  (g_utf8_get_char(s)==CHAR_DQUOTE ||
  2560 	  g_utf8_get_char(s)==CHAR_SQUOTE) &&
  2561 	  g_utf8_get_char(s)>CHAR_SPACE && s>prevline;
  2562 	  s=g_utf8_prev_char(s))
  2563 	    ;
  2564 	for (;s>prevline;s=g_utf8_prev_char(s))
  2565 	{
  2566 	    if (g_unichar_isalpha(g_utf8_get_char(s)))
  2567 	    {
  2568 		if (pswit[ECHO_SWITCH])
  2569 		    g_print("\n%s\n",prevline);
  2570 		if (!pswit[OVERVIEW_SWITCH])
  2571 		    g_print("    Line %ld column %ld - "
  2572 		      "No punctuation at para end?\n",
  2573 		      linecnt-1,g_utf8_strlen(prevline,-1));
  2574 		else
  2575 		    cnt_punct++;
  2576 		break;
  2577 	    }
  2578 	    if (g_utf8_strchr("-.:!([{?}])",-1,g_utf8_get_char(s)))
  2579 		break;
  2580 	}
  2581     }
  2582 }
  2583 
  2584 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
  2585 {
  2586     const char *word=key;
  2587     int *dupcnt=value;
  2588     if (*dupcnt)
  2589 	g_print("\nNote: Queried word %s was duplicated %d times\n",
  2590 	  word,*dupcnt);
  2591     return FALSE;
  2592 }
  2593 
  2594 void print_as_windows_1252(const char *string)
  2595 {
  2596     gsize inbytes,outbytes;
  2597     gchar *buf,*bp;
  2598     GIConv converter=(GIConv)-1;
  2599     if (!string)
  2600     {
  2601 	if (converter!=(GIConv)-1)
  2602 	    g_iconv_close(converter);
  2603 	converter=(GIConv)-1;
  2604 	return;
  2605     }
  2606     if (converter=(GIConv)-1)
  2607 	converter=g_iconv_open("WINDOWS-1252","UTF-8");
  2608     if (converter!=(GIConv)-1)
  2609     {
  2610 	inbytes=outbytes=strlen(string);
  2611 	bp=buf=g_malloc(outbytes+1);
  2612 	g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
  2613 	*bp='\0';
  2614 	fputs(buf,stdout);
  2615 	g_free(buf);
  2616     }
  2617     else
  2618 	fputs(string,stdout);
  2619 }
  2620 
  2621 /*
  2622  * procfile:
  2623  *
  2624  * Process one file.
  2625  */
  2626 void procfile(const char *filename)
  2627 {
  2628     const char *s;
  2629     gchar *parastart=NULL;	/* first line of current para */
  2630     gchar *etext,*aline;
  2631     gchar *etext_ptr;
  2632     GError *err=NULL;
  2633     struct first_pass_results *first_pass_results;
  2634     struct warnings *warnings;
  2635     struct counters counters={0};
  2636     struct line_properties last={0};
  2637     struct parities parities={0};
  2638     struct pending pending={0};
  2639     gboolean isemptyline;
  2640     long start_para_line=0;
  2641     gboolean isnewpara=FALSE,enddash=FALSE;
  2642     last.start=CHAR_SPACE;
  2643     linecnt=checked_linecnt=0;
  2644     etext=read_etext(filename,&err);
  2645     if (!etext)
  2646     {
  2647 	if (pswit[STDOUT_SWITCH])
  2648 	    fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
  2649 	else
  2650 	    fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
  2651 	exit(1);
  2652     }
  2653     g_set_print_handler(print_as_windows_1252);
  2654     g_print("\n\nFile: %s\n\n",filename);
  2655     first_pass_results=first_pass(etext);
  2656     warnings=report_first_pass(first_pass_results);
  2657     qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
  2658     qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
  2659     /*
  2660      * Here we go with the main pass. Hold onto yer hat!
  2661      */
  2662     linecnt=0;
  2663     etext_ptr=etext;
  2664     while ((aline=flgets(&etext_ptr,linecnt+1)))
  2665     {
  2666 	linecnt++;
  2667 	if (linecnt==1)
  2668 	    isnewpara=TRUE;
  2669 	if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
  2670 	    continue;    // skip DP page separators completely
  2671 	if (linecnt<first_pass_results->firstline ||
  2672 	  (first_pass_results->footerline>0 &&
  2673 	  linecnt>first_pass_results->footerline))
  2674 	{
  2675 	    if (pswit[HEADER_SWITCH])
  2676 	    {
  2677 		if (g_str_has_prefix(aline,"Title:"))
  2678 		    g_print("    %s\n",aline);
  2679 		if (g_str_has_prefix(aline,"Author:"))
  2680 		    g_print("    %s\n",aline);
  2681 		if (g_str_has_prefix(aline,"Release Date:"))
  2682 		    g_print("    %s\n",aline);
  2683 		if (g_str_has_prefix(aline,"Edition:"))
  2684 		    g_print("    %s\n\n",aline);
  2685 	    }
  2686 	    continue;		/* skip through the header */
  2687 	}
  2688 	checked_linecnt++;
  2689 	print_pending(aline,parastart,&pending);
  2690 	memset(&pending,0,sizeof(pending));
  2691 	isemptyline=analyse_quotes(aline,&counters);
  2692 	if (isnewpara && !isemptyline)
  2693 	{
  2694 	    /* This line is the start of a new paragraph. */
  2695 	    start_para_line=linecnt;
  2696 	    /* Capture its first line in case we want to report it later. */
  2697 	    g_free(parastart);
  2698 	    parastart=g_strdup(aline);
  2699 	    memset(&parities,0,sizeof(parities));  /* restart the quote count */
  2700 	    s=aline;
  2701 	    while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
  2702 	      !g_unichar_isdigit(g_utf8_get_char(s)))
  2703 		s=g_utf8_next_char(s);
  2704 	    if (g_unichar_islower(g_utf8_get_char(s)))
  2705 	    {
  2706 		/* and its first letter is lowercase */
  2707 		if (pswit[ECHO_SWITCH])
  2708 		    g_print("\n%s\n",aline);
  2709 		if (!pswit[OVERVIEW_SWITCH])
  2710 		    g_print("    Line %ld column %ld - "
  2711 		      "Paragraph starts with lower-case\n",
  2712 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2713 		else
  2714 		    cnt_punct++;
  2715 	    }
  2716 	    isnewpara=FALSE; /* Signal the end of new para processing. */
  2717 	}
  2718 	/* Check for an em-dash broken at line end. */
  2719 	if (enddash && g_utf8_get_char(aline)=='-')
  2720 	{
  2721 	    if (pswit[ECHO_SWITCH])
  2722 		g_print("\n%s\n",aline);
  2723 	    if (!pswit[OVERVIEW_SWITCH])
  2724 		g_print("    Line %ld column 1 - Broken em-dash?\n",linecnt);
  2725 	    else
  2726 		cnt_punct++;
  2727 	}
  2728 	enddash=FALSE;
  2729 	for (s=g_utf8_prev_char(aline+strlen(aline));
  2730 	  g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
  2731 	    ;
  2732 	if (s>=aline && g_utf8_get_char(s)=='-')
  2733 	    enddash=TRUE;
  2734 	check_for_control_characters(aline);
  2735 	if (warnings->bin)
  2736 	    check_for_odd_characters(aline,warnings,isemptyline);
  2737 	if (warnings->longline)
  2738 	    check_for_long_line(aline);
  2739 	if (warnings->shortline)
  2740 	    check_for_short_line(aline,&last);
  2741 	last.blen=last.len;
  2742 	last.len=g_utf8_strlen(aline,-1);
  2743 	last.start=g_utf8_get_char(aline);
  2744 	check_for_starting_punctuation(aline);
  2745 	if (warnings->dash)
  2746 	{
  2747 	    check_for_spaced_emdash(aline);
  2748 	    check_for_spaced_dash(aline);
  2749 	}
  2750 	check_for_unmarked_paragraphs(aline);
  2751 	check_for_jeebies(aline);
  2752 	check_for_mta_from(aline);
  2753 	check_for_orphan_character(aline);
  2754 	check_for_pling_scanno(aline);
  2755 	check_for_extra_period(aline,warnings);
  2756 	check_for_following_punctuation(aline);
  2757 	check_for_typos(aline,warnings);
  2758 	check_for_misspaced_punctuation(aline,&parities,isemptyline);
  2759 	check_for_double_punctuation(aline,warnings);
  2760 	check_for_spaced_quotes(aline);
  2761 	check_for_miscased_genative(aline);
  2762 	check_end_of_line(aline,warnings);
  2763 	check_for_unspaced_bracket(aline);
  2764 	if (warnings->endquote)
  2765 	    check_for_unpunctuated_endquote(aline);
  2766 	check_for_html_tag(aline);
  2767 	check_for_html_entity(aline);
  2768 	if (isemptyline)
  2769 	{
  2770 	    check_for_mismatched_quotes(&counters,&pending);
  2771 	    memset(&counters,0,sizeof(counters));
  2772 	    /* let the next iteration know that it's starting a new para */
  2773 	    isnewpara=TRUE;
  2774 	    if (prevline)
  2775 		check_for_omitted_punctuation(prevline,&last,start_para_line);
  2776 	}
  2777 	g_free(prevline);
  2778 	prevline=g_strdup(aline);
  2779     }
  2780     if (prevline)
  2781     {
  2782 	g_free(prevline);
  2783 	prevline=NULL;
  2784     }
  2785     g_free(parastart);
  2786     g_free(prevline);
  2787     g_free(etext);
  2788     if (!pswit[OVERVIEW_SWITCH])
  2789 	g_tree_foreach(qword,report_duplicate_queries,NULL);
  2790     g_tree_unref(qword);
  2791     g_tree_unref(qperiod);
  2792     g_set_print_handler(NULL);
  2793     print_as_windows_1252(NULL);
  2794     if (pswit[MARKUP_SWITCH])  
  2795 	loseentities(NULL);
  2796 }
  2797 
  2798 /*
  2799  * flgets:
  2800  *
  2801  * Get one line from the input text, checking for
  2802  * the existence of exactly one CR/LF line-end per line.
  2803  *
  2804  * Returns: a pointer to the line.
  2805  */
  2806 char *flgets(char **etext,long lcnt)
  2807 {
  2808     gunichar c;
  2809     gboolean isCR=FALSE;
  2810     char *theline=*etext;
  2811     char *eos=theline;
  2812     gchar *s;
  2813     for (;;)
  2814     {
  2815 	c=g_utf8_get_char(*etext);
  2816 	*etext=g_utf8_next_char(*etext);
  2817 	if (!c)
  2818 	    return NULL;
  2819 	/* either way, it's end of line */
  2820 	if (c=='\n')
  2821 	{
  2822 	    if (isCR)
  2823 		break;
  2824 	    else
  2825 	    {
  2826 		/* Error - a LF without a preceding CR */
  2827 		if (pswit[LINE_END_SWITCH])
  2828 		{
  2829 		    if (pswit[ECHO_SWITCH])
  2830 		    {
  2831 			s=g_strndup(theline,eos-theline);
  2832 			g_print("\n%s\n",s);
  2833 			g_free(s);
  2834 		    }
  2835 		    if (!pswit[OVERVIEW_SWITCH])
  2836 			g_print("    Line %ld - No CR?\n",lcnt);
  2837 		    else
  2838 			cnt_lineend++;
  2839 		}
  2840 		break;
  2841 	    }
  2842 	}
  2843 	if (c=='\r')
  2844 	{
  2845 	    if (isCR)
  2846 	    {
  2847 		/* Error - two successive CRs */
  2848 		if (pswit[LINE_END_SWITCH])
  2849 		{
  2850 		    if (pswit[ECHO_SWITCH])
  2851 		    {
  2852 			s=g_strndup(theline,eos-theline);
  2853 			g_print("\n%s\n",s);
  2854 			g_free(s);
  2855 		    }
  2856 		    if (!pswit[OVERVIEW_SWITCH])
  2857 			g_print("    Line %ld - Two successive CRs?\n",lcnt);
  2858 		    else
  2859 			cnt_lineend++;
  2860 		}
  2861 	    }
  2862 	    isCR=TRUE;
  2863 	}
  2864 	else
  2865 	{
  2866 	    if (pswit[LINE_END_SWITCH] && isCR)
  2867 	    {
  2868 		if (pswit[ECHO_SWITCH])
  2869 		{
  2870 		    s=g_strndup(theline,eos-theline);
  2871 		    g_print("\n%s\n",s);
  2872 		    g_free(s);
  2873 		}
  2874 		if (!pswit[OVERVIEW_SWITCH])
  2875 		    g_print("    Line %ld column %ld - CR without LF?\n",
  2876 		      lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
  2877 		else
  2878 		    cnt_lineend++;
  2879 		*eos=' ';
  2880 	    }
  2881 	    isCR=FALSE;
  2882 	    eos=g_utf8_next_char(eos);
  2883 	}
  2884     }
  2885     *eos='\0';
  2886     if (pswit[MARKUP_SWITCH])  
  2887 	postprocess_for_HTML(theline);
  2888     if (pswit[DP_SWITCH])  
  2889 	postprocess_for_DP(theline);
  2890     return theline;
  2891 }
  2892 
  2893 /*
  2894  * mixdigit:
  2895  *
  2896  * Takes a "word" as a parameter, and checks whether it
  2897  * contains a mixture of alpha and digits. Generally, this is an
  2898  * error, but may not be for cases like 4th or L5 12s. 3d.
  2899  *
  2900  * Returns: TRUE iff an is error found.
  2901  */
  2902 gboolean mixdigit(const char *checkword)
  2903 {
  2904     gboolean wehaveadigit,wehavealetter,query;
  2905     const char *s,*nondigit;
  2906     wehaveadigit=wehavealetter=query=FALSE;
  2907     for (s=checkword;*s;s=g_utf8_next_char(s))
  2908 	if (g_unichar_isalpha(g_utf8_get_char(s)))
  2909 	    wehavealetter=TRUE;
  2910 	else if (g_unichar_isdigit(g_utf8_get_char(s)))
  2911 	    wehaveadigit=TRUE;
  2912     if (wehaveadigit && wehavealetter)
  2913     {
  2914 	/* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
  2915 	query=TRUE;
  2916 	for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
  2917 	  nondigit=g_utf8_next_char(nondigit))
  2918 	    ;
  2919 	/* digits, ending in st, rd, nd, th of either case */
  2920 	if (!g_ascii_strcasecmp(nondigit,"st") ||
  2921 	  !g_ascii_strcasecmp(nondigit,"rd") ||
  2922 	  !g_ascii_strcasecmp(nondigit,"nd") ||
  2923 	  !g_ascii_strcasecmp(nondigit,"th"))
  2924 	    query=FALSE;
  2925 	if (!g_ascii_strcasecmp(nondigit,"sts") ||
  2926 	  !g_ascii_strcasecmp(nondigit,"rds") ||
  2927 	  !g_ascii_strcasecmp(nondigit,"nds") ||
  2928 	  !g_ascii_strcasecmp(nondigit,"ths"))
  2929 	    query=FALSE;
  2930 	if (!g_ascii_strcasecmp(nondigit,"stly") ||
  2931 	  !g_ascii_strcasecmp(nondigit,"rdly") ||
  2932 	  !g_ascii_strcasecmp(nondigit,"ndly") ||
  2933 	  !g_ascii_strcasecmp(nondigit,"thly"))
  2934 	    query=FALSE;
  2935 	/* digits, ending in l, L, s or d */
  2936 	if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
  2937 	  !strcmp(nondigit,"d"))
  2938 	    query=FALSE;
  2939 	/*
  2940 	 * L at the start of a number, representing Britsh pounds, like L500.
  2941 	 * This is cute. We know the current word is mixed digit. If the first
  2942 	 * letter is L, there must be at least one digit following. If both
  2943 	 * digits and letters follow, we have a genuine error, else we have a
  2944 	 * capital L followed by digits, and we accept that as a non-error.
  2945 	 */
  2946 	if (g_utf8_get_char(checkword)=='L' &&
  2947 	  !mixdigit(g_utf8_next_char(checkword)))
  2948 	    query=FALSE;
  2949     }
  2950     return query;
  2951 }
  2952 
  2953 /*
  2954  * getaword:
  2955  *
  2956  * Extracts the first/next "word" from the line, and returns it.
  2957  * A word is defined as one English word unit--or at least that's the aim.
  2958  * "ptr" is advanced to the position in the line where we will start
  2959  * looking for the next word.
  2960  *
  2961  * Returns: A newly-allocated string.
  2962  */
  2963 gchar *getaword(const char **ptr)
  2964 {
  2965     const char *s,*t;
  2966     GString *word;
  2967     gunichar c,pc;
  2968     word=g_string_new(NULL);
  2969     for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
  2970       !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
  2971       **ptr;*ptr=g_utf8_next_char(*ptr))
  2972 	;
  2973     /*
  2974      * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
  2975      * Especially yucky is the case of L1,000
  2976      * This section looks for a pattern of characters including a digit
  2977      * followed by a comma or period followed by one or more digits.
  2978      * If found, it returns this whole pattern as a word; otherwise we discard
  2979      * the results and resume our normal programming.
  2980      */
  2981     s=*ptr;
  2982     for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
  2983       g_unichar_isalpha(g_utf8_get_char(s)) ||
  2984       g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
  2985 	g_string_append_unichar(word,g_utf8_get_char(s));
  2986     for (t=g_utf8_next_char(word->str);*g_utf8_next_char(t);
  2987       t=g_utf8_next_char(t))
  2988     {
  2989 	c=g_utf8_get_char(t);
  2990 	pc=g_utf8_get_char(g_utf8_prev_char(t));
  2991 	if ((c=='.' || c==',') && g_unichar_isdigit(pc))
  2992 	{
  2993 	    *ptr=s;
  2994 	    return g_string_free(word,FALSE);
  2995 	}
  2996     }
  2997     /* we didn't find a punctuated number - do the regular getword thing */
  2998     g_string_truncate(word,0);
  2999     for (;g_unichar_isdigit(g_utf8_get_char(*ptr)) ||
  3000       g_unichar_isalpha(g_utf8_get_char(*ptr)) ||
  3001       g_utf8_get_char(*ptr)=='\'';*ptr=g_utf8_next_char(*ptr))
  3002 	g_string_append_unichar(word,g_utf8_get_char(*ptr));
  3003     return g_string_free(word,FALSE);
  3004 }
  3005 
  3006 /*
  3007  * isroman:
  3008  *
  3009  * Is this word a Roman Numeral?
  3010  *
  3011  * It doesn't actually validate that the number is a valid Roman Numeral--for
  3012  * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
  3013  * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
  3014  * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
  3015  * expressions thereof, except when it came to taxes. Allow any number of M,
  3016  * an optional D, an optional CM or CD, any number of optional Cs, an optional
  3017  * XL or an optional XC, an optional IX or IV, an optional V and any number
  3018  * of optional Is.
  3019  */
  3020 gboolean isroman(const char *t)
  3021 {
  3022     const char *s;
  3023     if (!t || !*t)
  3024 	return FALSE;
  3025     s=t;
  3026     while (g_utf8_get_char(t)=='m' && *t)
  3027 	t++;
  3028     if (g_utf8_get_char(t)=='d')
  3029 	t++;
  3030     if (g_str_has_prefix(t,"cm"))
  3031 	t+=2;
  3032     if (g_str_has_prefix(t,"cd"))
  3033 	t+=2;
  3034     while (g_utf8_get_char(t)=='c' && *t)
  3035 	t++;
  3036     if (g_str_has_prefix(t,"xl"))
  3037 	t+=2;
  3038     if (g_str_has_prefix(t,"xc"))
  3039 	t+=2;
  3040     if (g_utf8_get_char(t)=='l')
  3041 	t++;
  3042     while (g_utf8_get_char(t)=='x' && *t)
  3043 	t++;
  3044     if (g_str_has_prefix(t,"ix"))
  3045 	t+=2;
  3046     if (g_str_has_prefix(t,"iv"))
  3047 	t+=2;
  3048     if (g_utf8_get_char(t)=='v')
  3049 	t++;
  3050     while (g_utf8_get_char(t)=='i' && *t)
  3051 	t++;
  3052     return !*t;
  3053 }
  3054 
  3055 /*
  3056  * postprocess_for_DP:
  3057  *
  3058  * Invoked with the -d switch from flgets().
  3059  * It simply "removes" from the line a hard-coded set of common
  3060  * DP-specific tags, so that the line passed to the main routine has
  3061  * been pre-cleaned of DP markup.
  3062  */
  3063 void postprocess_for_DP(char *theline)
  3064 {
  3065     char *s,*t;
  3066     int i;
  3067     if (!*theline) 
  3068 	return;
  3069     for (i=0;*DPmarkup[i];i++)
  3070 	while ((s=strstr(theline,DPmarkup[i])))
  3071 	{
  3072 	    t=s+strlen(DPmarkup[i]);
  3073 	    memmove(s,t,strlen(t)+1);
  3074 	}
  3075 }
  3076 
  3077 /*
  3078  * postprocess_for_HTML:
  3079  *
  3080  * Invoked with the -m switch from flgets().
  3081  * It simply "removes" from the line a hard-coded set of common
  3082  * HTML tags and "replaces" a hard-coded set of common HTML
  3083  * entities, so that the line passed to the main routine has
  3084  * been pre-cleaned of HTML.
  3085  */
  3086 void postprocess_for_HTML(char *theline)
  3087 {
  3088     while (losemarkup(theline))
  3089 	;
  3090     loseentities(theline);
  3091 }
  3092 
  3093 char *losemarkup(char *theline)
  3094 {
  3095     char *s,*t;
  3096     int i;
  3097     s=strchr(theline,'<');
  3098     t=s?strchr(s,'>'):NULL;
  3099     if (!s || !t)
  3100 	return NULL;
  3101     for (i=0;*markup[i];i++)
  3102 	if (tagcomp(g_utf8_next_char(s),markup[i]))
  3103 	{
  3104 	    t=g_utf8_next_char(t);
  3105 	    memmove(s,t,strlen(t)+1);
  3106 	    return s;
  3107 	}
  3108     /* It's an unrecognized <xxx>. */
  3109     return NULL;
  3110 }
  3111 
  3112 void loseentities(char *theline)
  3113 {
  3114     int i;
  3115     gsize nb;
  3116     char *amp,*scolon;
  3117     gchar *s,*t;
  3118     gunichar c;
  3119     GTree *entities=NULL;
  3120     GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
  3121     if (!theline)
  3122     {
  3123 	if (entities)
  3124 	    g_tree_destroy(entities);
  3125 	entities=NULL;
  3126 	if (translit==(GIConv)-1)
  3127 	    g_iconv_close(translit);
  3128 	translit=(GIConv)-1;
  3129 	if (to_utf8==(GIConv)-1)
  3130 	    g_iconv_close(to_utf8);
  3131 	to_utf8=(GIConv)-1;
  3132 	return;
  3133     }
  3134     if (!*theline)
  3135 	return;
  3136     if (!entities)
  3137     {
  3138 	entities=g_tree_new((GCompareFunc)strcmp);
  3139 	for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
  3140 	    g_tree_insert(entities,HTMLentities[i].name,
  3141 	      GUINT_TO_POINTER(HTMLentities[i].c));
  3142     }
  3143     if (translit==(GIConv)-1)
  3144 	translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
  3145     if (to_utf8==(GIConv)-1)
  3146 	to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
  3147     while((amp=strchr(theline,'&')))
  3148     {
  3149 	scolon=strchr(amp,';');
  3150 	if (scolon)
  3151 	{
  3152 	    if (amp[1]=='#')
  3153 	    {
  3154 		if (amp+2+strspn(amp+2,"0123456789")==scolon)
  3155 		    c=strtol(amp+2,NULL,10);
  3156 		else if (amp[2]=='x' &&
  3157 		  amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
  3158 		    c=strtol(amp+3,NULL,16);
  3159 	    }
  3160 	    else
  3161 	    {
  3162 		s=g_strndup(amp+1,scolon-(amp+1));
  3163 	        c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
  3164 		g_free(s);
  3165 	    }
  3166 	}
  3167 	else
  3168 	    c=0;
  3169 	if (c)
  3170 	{
  3171 	    theline=amp;
  3172 	    if (c<128 || c>=192 && c<=255)	/* An ISO-8859-1 character */
  3173 		theline+=g_unichar_to_utf8(c,theline);
  3174 	    else
  3175 	    {
  3176 		s=g_malloc(6);
  3177 		nb=g_unichar_to_utf8(c,s);
  3178 		t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
  3179 		g_free(s);
  3180 		s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
  3181 		g_free(t);
  3182 		memcpy(theline,s,nb);
  3183 		g_free(s);
  3184 		theline+=nb;
  3185 	    }
  3186 	    memmove(theline,g_utf8_next_char(scolon),
  3187 	      strlen(g_utf8_next_char(scolon))+1);
  3188 	}
  3189 	else
  3190 	    theline=g_utf8_next_char(amp);
  3191     }
  3192 }
  3193 
  3194 gboolean tagcomp(const char *strin,const char *basetag)
  3195 {
  3196     gboolean retval;
  3197     gchar *s,*t;
  3198     if (g_utf8_get_char(strin)=='/')
  3199 	t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
  3200     else
  3201 	t=g_utf8_casefold(strin,-1);
  3202     s=g_utf8_casefold(basetag,-1);
  3203     retval=g_str_has_prefix(t,s);
  3204     g_free(s);
  3205     g_free(t);
  3206     return retval;
  3207 }
  3208 
  3209 void proghelp(GOptionContext *context)
  3210 {
  3211     gchar *help;
  3212     fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
  3213     fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
  3214     fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
  3215     fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
  3216       "For details, read the file COPYING.\n",stderr);
  3217     fputs("This is Free Software; "
  3218       "you may redistribute it under certain conditions (GPL);\n",stderr);
  3219     fputs("read the file COPYING for details.\n\n",stderr);
  3220     help=g_option_context_get_help(context,TRUE,NULL);
  3221     fputs(help,stderr);
  3222     g_free(help);
  3223     fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
  3224     fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
  3225       "non-ASCII\n",stderr);
  3226     fputs("characters like accented letters, "
  3227       "lines longer than 75 or shorter than 55,\n",stderr);
  3228     fputs("unbalanced quotes or brackets, "
  3229       "a variety of badly formatted punctuation, \n",stderr);
  3230     fputs("HTML tags, some likely typos. "
  3231       "It is NOT a substitute for human judgement.\n",stderr);
  3232     fputs("\n",stderr);
  3233 }