bookloupe/bookloupe.c
author ali <ali@juiblex.co.uk>
Thu May 30 18:33:44 2013 +0100 (2013-05-30)
changeset 72 52d4a7f926b4
parent 71 82d3cc398b54
child 73 cffa80824f8c
permissions -rw-r--r--
Support WINDOWS-1252 characters encoded as UTF-8
     1 /*************************************************************************/
     2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
     3 /*									 */
     4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>			 */
     5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>			 */
     6 /*									 */
     7 /* This program is free software; you can redistribute it and/or modify  */
     8 /* it under the terms of the GNU General Public License as published by  */
     9 /* the Free Software Foundation; either version 2 of the License, or     */
    10 /* (at your option) any later version.					 */
    11 /*									 */
    12 /* This program is distributed in the hope that it will be useful,       */
    13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of	 */
    14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the		 */
    15 /* GNU General Public License for more details.				 */
    16 /*									 */
    17 /* You should have received a copy of the GNU General Public License	 */
    18 /* along with this program. If not, see <http://www.gnu.org/licenses/>.	 */
    19 /*************************************************************************/
    20 
    21 #include <stdio.h>
    22 #include <stdlib.h>
    23 #include <string.h>
    24 #include <ctype.h>
    25 #include <glib.h>
    26 #include <bl/bl.h>
    27 #include "HTMLentities.h"
    28 
    29 gchar *prevline;
    30 
/*
 * Common typos, all in lower case.
 * The empty string at the end terminates the list.
 */
char *typo[] = {
    "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
    "nad", "te", "ig", "acn",  "ahve", "alot", "anbd", "andt", "awya", "aywa",
    "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
    "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
    "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
    "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
    "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
    "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
    "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
    "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
    "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
    "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
    "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
    "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
    "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
    "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
    "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
    "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
    "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
    "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
    "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
    "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
    "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
    "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
    "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
    "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
    "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
    "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
    "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
    "se", ""
};
    64 
/*
 * User-defined typos, one key per word. Built by read_user_scannos();
 * stays NULL when that function is not called or finds no typo file.
 */
GTree *usertypo;
    66 
/*
 * Common abbreviations and other OK words not to query as typos.
 * All lower case; the empty string at the end terminates the list.
 */
char *okword[] = {
    "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
    "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
    "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
    "outbid", "outbids", "frostbite", "frostbitten", ""
};
    74 
/*
 * Common abbreviations that cause otherwise unexplained periods.
 * All lower case, written without the period; the empty string at the
 * end terminates the list.
 */
char *abbrev[] = {
    "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
    "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
};
    80 
/*
 * Two-letter combinations that rarely if ever start words,
 * but are common scannos or otherwise common letter combinations.
 * The empty string at the end terminates the list.
 */
char *nostart[] = {
    "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
};
    88 
/*
 * Two-letter combinations that rarely if ever end words,
 * but are common scannos or otherwise common letter combinations.
 * The empty string at the end terminates the list.
 */
char *noend[] = {
    "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
    "sw", "gr", "sl", "cl", "iy", ""
};
    97 
/*
 * HTML element names, in lower case and without angle brackets.
 * The empty string at the end terminates the list.
 * NOTE(review): presumably the tags recognised by the markup handling
 * further down the file -- confirm against the users of this table.
 */
char *markup[] = {
    "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
    "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
    "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
    "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
};
   104 
/*
 * DP-specific markup strings (compare the --dp option, "Ignore
 * DP-specific markup"). The empty string at the end terminates the list.
 */
char *DPmarkup[] = {
    "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
};
   108 
/*
 * Lower-case word list; the empty string at the end terminates it.
 * NOTE(review): judging by the name, these are words that should not be
 * directly followed by a comma -- confirm against the code that uses it.
 * "mrs" appears twice; the duplicate is harmless for a lookup table.
 */
char *nocomma[] = {
    "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
    "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
    "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
    "during", "let", "toward", "among", ""
};
   115 
/*
 * Lower-case word list; the empty string at the end terminates it.
 * NOTE(review): judging by the name, these are words that should not be
 * directly followed by a period -- confirm against the code that uses it.
 */
char *noperiod[] = {
    "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
    "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
    "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
    "among", "those", "into", "whom", "having", "thence", ""
}; 
   122 
/* special characters, given as decimal ASCII codes */
#define CHAR_SPACE	  32
#define CHAR_TAB	   9
#define CHAR_LF		  10
#define CHAR_CR		  13
#define CHAR_DQUOTE	  34	/* '"' */
#define CHAR_SQUOTE	  39	/* '\'' */
#define CHAR_OPEN_SQUOTE  96	/* '`' */
#define CHAR_TILDE	 126	/* '~' */
#define CHAR_ASTERISK	  42	/* '*' */
#define CHAR_FORESLASH	  47	/* '/' */
#define CHAR_CARAT	  94	/* '^' (caret) */

/* special characters, given as character constants */
#define CHAR_UNDERSCORE    '_'
#define CHAR_OPEN_CBRACK   '{'
#define CHAR_CLOSE_CBRACK  '}'
#define CHAR_OPEN_RBRACK   '('
#define CHAR_CLOSE_RBRACK  ')'
#define CHAR_OPEN_SBRACK   '['
#define CHAR_CLOSE_SBRACK  ']'

/*
 * longest and shortest normal PG line lengths, in characters;
 * lines over WAY_TOO_LONG are counted separately as "VERY long"
 */
#define LONGEST_PG_LINE   75
#define WAY_TOO_LONG      80
#define SHORTEST_PG_LINE  55
   148 
/*
 * Indices into pswit[], one per program switch.
 * SWITNO is not a switch: it is the number of switches and sizes pswit[].
 */
enum {
    ECHO_SWITCH,
    SQUOTE_SWITCH,
    TYPO_SWITCH,
    QPARA_SWITCH,
    PARANOID_SWITCH,
    LINE_END_SWITCH,
    OVERVIEW_SWITCH,
    STDOUT_SWITCH,
    HEADER_SWITCH,
    WEB_SWITCH,
    VERBOSE_SWITCH,
    MARKUP_SWITCH,
    USERTYPO_SWITCH,
    DP_SWITCH,
    SWITNO
};

/*
 * Filled in directly by the option parser and then adjusted by
 * parse_options(), which inverts the "turn it OFF" switches.
 */
gboolean pswit[SWITNO];  /* program switches */
   168 
/*
 * Command-line options. Every option is a plain flag (G_OPTION_ARG_NONE)
 * that the GLib option parser stores straight into its pswit[] slot.
 * For --noecho, --relaxed and --line-end the stored flag means "switch
 * the feature off"; parse_options() inverts those slots after parsing.
 * The { NULL } entry terminates the table.
 */
static GOptionEntry options[]={
    { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
      "Ignore DP-specific markup", NULL },
    { "noecho", 'e', 0, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
      "Don't echo queried line", NULL },
    { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
      "Check single quotes", NULL },
    { "typo", 't', 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
      "Check common typos", NULL },
    { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
      "Require closure of quotes on every paragraph", NULL },
    { "relaxed", 'x', 0, G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
      "Disable paranoid querying of everything", NULL },
    { "line-end", 'l', 0, G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
      "Disable line end checking", NULL },
    { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
      "Overview: just show counts", NULL },
    { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
      "Output errors to stdout instead of stderr", NULL },
    { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
      "Echo header fields", NULL },
    { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
      "Ignore markup in < >", NULL },
    { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
      "Use file of user-defined typos", NULL },
    { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
      "Defaults for use on www upload", NULL },
    { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
      "Verbose - list everything", NULL },
    { NULL }
};
   200 
/*
 * Query counters. main() prints the cnt_* query totals and the two line
 * counts when --overview is given; cnt_spacend and linecnt are updated
 * by first_pass().
 */
long cnt_dquot;		/* for overview mode, count of doublequote queries */
long cnt_squot;		/* for overview mode, count of singlequote queries */
long cnt_brack;		/* for overview mode, count of brackets queries */
long cnt_bin;		/* for overview mode, count of non-ASCII queries */
long cnt_odd;		/* for overview mode, count of odd character queries */
long cnt_long;		/* for overview mode, count of long line errors */
long cnt_short;		/* for overview mode, count of short line queries */
long cnt_punct;		/* for overview mode,
			   count of punctuation and spacing queries */
long cnt_dash;		/* for overview mode, count of dash-related queries */
long cnt_word;		/* for overview mode, count of word queries */
long cnt_html;		/* for overview mode, count of html queries */
long cnt_lineend;	/* for overview mode, count of line-end queries */
long cnt_spacend;	/* count of lines with space at end
			   (not part of the overview total) */
long linecnt;		/* count of total lines in the file */
long checked_linecnt;	/* count of lines actually checked */
   217 
/* Forward declarations for functions defined later in this file. */
void proghelp(GOptionContext *context);
void procfile(const char *);

/* Directory part of argv[0]; set at the start of main() and freed at exit. */
gchar *running_from;

gboolean mixdigit(const char *);
gchar *getaword(const char **);
char *flgets(char **,long);
void postprocess_for_HTML(char *);
char *linehasmarkup(char *);
char *losemarkup(char *);
gboolean tagcomp(const char *,const char *);
void loseentities(char *);
gboolean isroman(const char *);
void postprocess_for_DP(char *);
/* Print handlers installed by read_etext() to match the input encoding. */
void print_as_windows_1252(const char *string);
void print_as_utf_8(const char *string);

/* NOTE(review): not used in the part of the file reviewed here. */
GTree *qword,*qperiod;
   237 
/*
 * Totals gathered by first_pass(). Except for firstline and footerline,
 * lines from the PG footer onwards are not counted.
 *
 *   firstline	  line number of the first line after the PG header, or 0
 *   footerline	  line number of the PG footer line, or 0
 *   astline	  lines containing '*' together with a lower-case letter
 *   totlen	  total characters; binlen those above 127; alphalen letters
 *   endquote_count  double quotes directly preceded by a letter
 *   shortline	  lines judged short relative to their neighbours
 *   dotcomma	  lines containing ".,"
 *   fslashline	  lines containing '/'
 *   hyphens	  lines ending (ignoring trailing white space) in a lone '-'
 *   longline	  lines longer than LONGEST_PG_LINE characters
 *   verylongline lines longer than WAY_TOO_LONG characters
 *   htmcount	  HTML score: 1 per line with '<' before '>', plus 4 for "<i>"
 *   standalone_digit  words that are exactly "0" or "1"
 *   spacedash	  lines whose first " -" is not followed by another '-'
 *   emdash	  lines containing "--" after the first character; of those,
 *		  space_emdash has a space on either side, non_PG_space_emdash
 *		  on both sides and PG_space_emdash on neither
 *   Dutchcount	  occurrences of the words "hij" and "niet"
 *   Frenchcount  occurrences of the words "dans" and "avec"
 */
struct first_pass_results {
    long firstline,astline;
    long footerline,totlen,binlen,alphalen,endquote_count,shortline,dotcomma;
    long fslashline,hyphens,longline,verylongline,htmcount,standalone_digit;
    long spacedash,emdash,space_emdash,non_PG_space_emdash,PG_space_emdash;
    int Dutchcount,Frenchcount;
};
   245 
/*
 * Reporting decisions made by report_first_pass(). Each int is 1 when
 * that kind of query is to be reported and 0 when it has been switched
 * off because it is too frequent in this text. isDutch and isFrench
 * record the language guess made from the first-pass word counts.
 */
struct warnings {
    int shortline,longline,bin,dash,dotcomma,ast,fslash,digit,hyphen;
    int endquote;
    gboolean isDutch,isFrench;
};
   251 
/*
 * Running counts updated by analyse_quotes(): quot counts double quotes
 * and open_single_quote counts single quotes taken to be opening quotes.
 * NOTE(review): by their names, close_single_quote counts closing single
 * quotes, c_unders underscores and c_brack/s_brack/r_brack curly, square
 * and round brackets -- confirm against the code that updates them.
 */
struct counters {
    long quot;
    int c_unders,c_brack,s_brack,r_brack;
    int open_single_quote,close_single_quote;
};
   257 
/*
 * Properties of a single line.
 * NOTE(review): not used in the part of the file reviewed here. By
 * analogy with first_pass(), len/blen are presumably the lengths of the
 * line and of the line before it, and start the line's first character
 * -- confirm against the code that fills this in.
 */
struct line_properties {
    unsigned int len,blen;
    gunichar start;
};
   262 
/*
 * NOTE(review): not used in the part of the file reviewed here;
 * presumably tracks open/closed state of double and single quotes --
 * confirm against the code that uses it.
 */
struct parities {
    int dquote,squote;
};
   266 
/*
 * NOTE(review): not used in the part of the file reviewed here;
 * presumably holds queries about quotes, brackets and underscores that
 * are still unmatched -- confirm ownership of the strings with the code
 * that uses it.
 */
struct pending {
    char *dquote,*squote,*rbrack,*sbrack,*cbrack,*unders;
    long squot;
};
   271 
   272 void parse_options(int *argc,char ***argv)
   273 {
   274     GError *err=NULL;
   275     GOptionContext *context;
   276     context=g_option_context_new(
   277       "file - looks for errors in Project Gutenberg(TM) etexts");
   278     g_option_context_add_main_entries(context,options,NULL);
   279     if (!g_option_context_parse(context,argc,argv,&err))
   280     {
   281 	g_printerr("Bookloupe: %s\n",err->message);
   282 	g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
   283 	exit(1);
   284     }
   285     /* Paranoid checking is turned OFF, not on, by its switch */
   286     pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
   287     if (pswit[PARANOID_SWITCH])
   288 	/* if running in paranoid mode, typo checks default to enabled */
   289 	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
   290     /* Line-end checking is turned OFF, not on, by its switch */
   291     pswit[LINE_END_SWITCH]=!pswit[LINE_END_SWITCH];
   292     /* Echoing is turned OFF, not on, by its switch */
   293     pswit[ECHO_SWITCH]=!pswit[ECHO_SWITCH];
   294     if (pswit[OVERVIEW_SWITCH])
   295 	/* just print summary; don't echo */
   296 	pswit[ECHO_SWITCH]=FALSE;
   297     /*
   298      * Web uploads - for the moment, this is really just a placeholder
   299      * until we decide what processing we really want to do on web uploads
   300      */
   301     if (pswit[WEB_SWITCH])
   302     {
   303 	/* specific override for web uploads */
   304 	pswit[ECHO_SWITCH]=TRUE;
   305 	pswit[SQUOTE_SWITCH]=FALSE;
   306 	pswit[TYPO_SWITCH]=TRUE;
   307 	pswit[QPARA_SWITCH]=FALSE;
   308 	pswit[PARANOID_SWITCH]=TRUE;
   309 	pswit[LINE_END_SWITCH]=FALSE;
   310 	pswit[OVERVIEW_SWITCH]=FALSE;
   311 	pswit[STDOUT_SWITCH]=FALSE;
   312 	pswit[HEADER_SWITCH]=TRUE;
   313 	pswit[VERBOSE_SWITCH]=FALSE;
   314 	pswit[MARKUP_SWITCH]=FALSE;
   315 	pswit[USERTYPO_SWITCH]=FALSE;
   316 	pswit[DP_SWITCH]=FALSE;
   317     }
   318     if (*argc<2)
   319     {
   320 	proghelp(context);
   321 	exit(1);
   322     }
   323     g_option_context_free(context);
   324 }
   325 
   326 /*
   327  * read_user_scannos:
   328  *
   329  * Read in the user-defined stealth scanno list.
   330  */
   331 void read_user_scannos(void)
   332 {
   333     GError *err=NULL;
   334     gchar *usertypo_file;
   335     gboolean okay;
   336     int i;
   337     gsize len,nb;
   338     gchar *contents,*utf8,**lines;
   339     usertypo_file=g_strdup("bookloupe.typ");
   340     okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   341     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   342     {
   343 	g_clear_error(&err);
   344 	g_free(usertypo_file);
   345 	usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
   346 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   347     }
   348     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   349     {
   350 	g_clear_error(&err);
   351 	g_free(usertypo_file);
   352 	usertypo_file=g_strdup("gutcheck.typ");
   353 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   354     }
   355     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   356     {
   357 	g_clear_error(&err);
   358 	g_free(usertypo_file);
   359 	usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
   360 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   361     }
   362     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   363     {
   364 	g_free(usertypo_file);
   365 	g_print("   --> I couldn't find bookloupe.typ "
   366 	  "-- proceeding without user typos.\n");
   367 	return;
   368     }
   369     else if (!okay)
   370     {
   371 	fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
   372 	g_free(usertypo_file);
   373 	g_clear_error(&err);
   374 	exit(1);
   375     }
   376     if (g_utf8_validate(contents,len,NULL))
   377 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
   378     else
   379 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
   380     g_free(contents);
   381     lines=g_strsplit_set(utf8,"\r\n",0);
   382     g_free(utf8);
   383     usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
   384     for (i=0;lines[i];i++)
   385 	if (*(unsigned char *)lines[i]>'!')
   386 	    g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
   387 	else
   388 	    g_free(lines[i]);
   389     g_free(lines);
   390 }
   391 
   392 /*
   393  * read_etext:
   394  *
   395  * Read an etext returning a newly allocated string containing the file
   396  * contents or NULL on error.
   397  */
   398 gchar *read_etext(const char *filename,GError **err)
   399 {
   400     gchar *contents,*utf8;
   401     gsize len,nb;
   402     if (!g_file_get_contents(filename,&contents,&len,err))
   403 	return NULL;
   404     if (g_utf8_validate(contents,len,NULL))
   405     {
   406 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
   407 	g_set_print_handler(print_as_utf_8);
   408     }
   409     else
   410     {
   411 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
   412 	g_set_print_handler(print_as_windows_1252);
   413     }
   414     g_free(contents);
   415     return utf8;
   416 }
   417 
   418 int main(int argc,char **argv)
   419 {
   420     running_from=g_path_get_dirname(argv[0]);
   421     parse_options(&argc,&argv);
   422     if (pswit[USERTYPO_SWITCH])
   423 	read_user_scannos();
   424     fprintf(stderr,"bookloupe: Check and report on an e-text\n");
   425     procfile(argv[1]);
   426     if (pswit[OVERVIEW_SWITCH])
   427     {
   428 	g_print("    Checked %ld lines of %ld (head+foot = %ld)\n\n",
   429 	  checked_linecnt,linecnt,linecnt-checked_linecnt);
   430 	g_print("    --------------- Queries found --------------\n");
   431 	if (cnt_long)
   432 	    g_print("    Long lines:		    %14ld\n",cnt_long);
   433 	if (cnt_short)
   434 	    g_print("    Short lines:		   %14ld\n",cnt_short);
   435 	if (cnt_lineend)
   436 	    g_print("    Line-end problems:	     %14ld\n",cnt_lineend);
   437 	if (cnt_word)
   438 	    g_print("    Common typos:		  %14ld\n",cnt_word);
   439 	if (cnt_dquot)
   440 	    g_print("    Unmatched quotes:	      %14ld\n",cnt_dquot);
   441 	if (cnt_squot)
   442 	    g_print("    Unmatched SingleQuotes:	%14ld\n",cnt_squot);
   443 	if (cnt_brack)
   444 	    g_print("    Unmatched brackets:	    %14ld\n",cnt_brack);
   445 	if (cnt_bin)
   446 	    g_print("    Non-ASCII characters:	  %14ld\n",cnt_bin);
   447 	if (cnt_odd)
   448 	    g_print("    Proofing characters:	   %14ld\n",cnt_odd);
   449 	if (cnt_punct)
   450 	    g_print("    Punctuation & spacing queries: %14ld\n",cnt_punct);
   451 	if (cnt_dash)
   452 	    g_print("    Non-standard dashes:	   %14ld\n",cnt_dash);
   453 	if (cnt_html)
   454 	    g_print("    Possible HTML tags:	    %14ld\n",cnt_html);
   455 	g_print("\n");
   456 	g_print("    TOTAL QUERIES		  %14ld\n",
   457 	  cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+
   458 	  cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend);
   459     }
   460     g_free(running_from);
   461     if (usertypo)
   462 	g_tree_unref(usertypo);
   463     return 0;
   464 }
   465 
   466 /*
   467  * first_pass:
   468  *
   469  * Run a first pass - verify that it's a valid PG
   470  * file, decide whether to report some things that
   471  * occur many times in the text like long or short
   472  * lines, non-standard dashes, etc.
   473  */
   474 struct first_pass_results *first_pass(const char *etext)
   475 {
   476     gunichar laststart=CHAR_SPACE;
   477     const char *s;
   478     gchar *lc_line;
   479     int i,j,lbytes,llen;
   480     gchar **lines;
   481     unsigned int lastlen=0,lastblen=0;
   482     long spline=0,nspline=0;
   483     static struct first_pass_results results={0};
   484     gchar *inword;
   485     lines=g_strsplit(etext,"\n",0);
   486     for (j=0;lines[j];j++)
   487     {
   488 	lbytes=strlen(lines[j]);
   489 	while (lines[j][lbytes-1]=='\r')
   490 	    lines[j][--lbytes]='\0';
   491 	llen=g_utf8_strlen(lines[j],lbytes);
   492 	linecnt++;
   493 	if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
   494 	  (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
   495 	{
   496 	    if (spline)
   497 		g_print("   --> Duplicate header?\n");
   498 	    spline=linecnt+1;   /* first line of non-header text, that is */
   499 	}
   500 	if (!strncmp(lines[j],"*** START",9) &&
   501 	  strstr(lines[j],"PROJECT GUTENBERG"))
   502 	{
   503 	    if (nspline)
   504 		g_print("   --> Duplicate header?\n");
   505 	    nspline=linecnt+1;   /* first line of non-header text, that is */
   506 	}
   507 	if (spline || nspline)
   508 	{
   509 	    lc_line=g_utf8_strdown(lines[j],lbytes);
   510 	    if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
   511 	    {
   512 		if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
   513 		{
   514 		    if (results.footerline)
   515 		    {
   516 			/* it's an old-form header - we can detect duplicates */
   517 			if (!nspline)
   518 			    g_print("   --> Duplicate footer?\n");
   519 		    }
   520 		    else
   521 			results.footerline=linecnt;
   522 		}
   523 	    }
   524 	    g_free(lc_line);
   525 	}
   526 	if (spline)
   527 	    results.firstline=spline;
   528 	if (nspline)
   529 	    results.firstline=nspline;  /* override with new */
   530 	if (results.footerline)
   531 	    continue;    /* don't count the boilerplate in the footer */
   532 	results.totlen+=llen;
   533 	for (s=lines[j];*s;s=g_utf8_next_char(s))
   534 	{
   535 	    if (g_utf8_get_char(s)>127)
   536 		results.binlen++;
   537 	    if (g_unichar_isalpha(g_utf8_get_char(s)))
   538 		results.alphalen++;
   539 	    if (s>lines[j] && g_utf8_get_char(s)==CHAR_DQUOTE &&
   540 	      isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
   541 		results.endquote_count++;
   542 	}
   543 	if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
   544 	  lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
   545 	    results.shortline++;
   546 	if (lbytes>0 &&
   547 	  g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
   548 	    cnt_spacend++;
   549 	if (strstr(lines[j],".,"))
   550 	    results.dotcomma++;
   551 	/* only count ast lines for ignoring purposes where there is */
   552 	/* locase text on the line */
   553 	if (strchr(lines[j],'*'))
   554 	{
   555 	    for (s=lines[j];*s;s=g_utf8_next_char(s))
   556 		if (g_unichar_islower(g_utf8_get_char(s)))
   557 		    break;
   558 	    if (*s)
   559 		results.astline++;
   560 	}
   561 	if (strchr(lines[j],'/'))
   562 	    results.fslashline++;
   563 	for (s=g_utf8_prev_char(lines[j]+lbytes);
   564 	  s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
   565 	    ;
   566 	if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
   567 	  g_utf8_get_char(g_utf8_prev_char(s))!='-')
   568 	    results.hyphens++;
   569 	if (llen>LONGEST_PG_LINE)
   570 	    results.longline++;
   571 	if (llen>WAY_TOO_LONG)
   572 	    results.verylongline++;
   573 	if (strchr(lines[j],'<') && strchr(lines[j],'>'))
   574 	{
   575 	    i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
   576 	    if (i>0)
   577 		results.htmcount++;
   578 	    if (strstr(lines[j],"<i>"))
   579 		results.htmcount+=4; /* bonus marks! */
   580 	}
   581 	/* Check for spaced em-dashes */
   582 	if (lines[j][0] && (s=strstr(g_utf8_next_char(lines[j]),"--")))
   583 	{
   584 	    results.emdash++;
   585 	    if (s[-1]==CHAR_SPACE || s[2]==CHAR_SPACE)
   586 		results.space_emdash++;
   587 	    if (s[-1]==CHAR_SPACE && s[2]==CHAR_SPACE)
   588 		/* count of em-dashes with spaces both sides */
   589 		results.non_PG_space_emdash++;
   590 	    if (s[-1]!=CHAR_SPACE && s[2]!=CHAR_SPACE)
   591 		/* count of PG-type em-dashes with no spaces */
   592 		results.PG_space_emdash++;
   593 	}
   594 	for (s=lines[j];*s;)
   595 	{
   596 	    inword=getaword(&s);
   597 	    if (!strcmp(inword,"hij") || !strcmp(inword,"niet")) 
   598 		results.Dutchcount++;
   599 	    if (!strcmp(inword,"dans") || !strcmp(inword,"avec")) 
   600 		results.Frenchcount++;
   601 	    if (!strcmp(inword,"0") || !strcmp(inword,"1")) 
   602 		results.standalone_digit++;
   603 	    g_free(inword);
   604 	}
   605 	/* Check for spaced dashes */
   606 	if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
   607 	    results.spacedash++;
   608 	lastblen=lastlen;
   609 	lastlen=llen;
   610 	laststart=lines[j][0];
   611     }
   612     g_strfreev(lines);
   613     return &results;
   614 }
   615 
   616 /*
   617  * report_first_pass:
   618  *
   619  * Make some snap decisions based on the first pass results.
   620  */
   621 struct warnings *report_first_pass(struct first_pass_results *results)
   622 {
   623     static struct warnings warnings={0};
   624     if (cnt_spacend>0)
   625 	g_print("   --> %ld lines in this file have white space at end\n",
   626 	  cnt_spacend);
   627     warnings.dotcomma=1;
   628     if (results->dotcomma>5)
   629     {
   630 	warnings.dotcomma=0;
   631 	g_print("   --> %ld lines in this file contain '.,'. "
   632 	  "Not reporting them.\n",results->dotcomma);
   633     }
   634     /*
   635      * If more than 50 lines, or one-tenth, are short,
   636      * don't bother reporting them.
   637      */
   638     warnings.shortline=1;
   639     if (results->shortline>50 || results->shortline*10>linecnt)
   640     {
   641 	warnings.shortline=0;
   642 	g_print("   --> %ld lines in this file are short. "
   643 	  "Not reporting short lines.\n",results->shortline);
   644     }
   645     /*
   646      * If more than 50 lines, or one-tenth, are long,
   647      * don't bother reporting them.
   648      */
   649     warnings.longline=1;
   650     if (results->longline>50 || results->longline*10>linecnt)
   651     {
   652 	warnings.longline=0;
   653 	g_print("   --> %ld lines in this file are long. "
   654 	  "Not reporting long lines.\n",results->longline);
   655     }
   656     /* If more than 10 lines contain asterisks, don't bother reporting them. */
   657     warnings.ast=1;
   658     if (results->astline>10)
   659     {
   660 	warnings.ast=0;
   661 	g_print("   --> %ld lines in this file contain asterisks. "
   662 	  "Not reporting them.\n",results->astline);
   663     }
   664     /*
   665      * If more than 10 lines contain forward slashes,
   666      * don't bother reporting them.
   667      */
   668     warnings.fslash=1;
   669     if (results->fslashline>10)
   670     {
   671 	warnings.fslash=0;
   672 	g_print("   --> %ld lines in this file contain forward slashes. "
   673 	  "Not reporting them.\n",results->fslashline);
   674     }
   675     /*
   676      * If more than 20 lines contain unpunctuated endquotes,
   677      * don't bother reporting them.
   678      */
   679     warnings.endquote=1;
   680     if (results->endquote_count>20)
   681     {
   682 	warnings.endquote=0;
   683 	g_print("   --> %ld lines in this file contain unpunctuated endquotes. "
   684 	  "Not reporting them.\n",results->endquote_count);
   685     }
   686     /*
   687      * If more than 15 lines contain standalone digits,
   688      * don't bother reporting them.
   689      */
   690     warnings.digit=1;
   691     if (results->standalone_digit>10)
   692     {
   693 	warnings.digit=0;
   694 	g_print("   --> %ld lines in this file contain standalone 0s and 1s. "
   695 	  "Not reporting them.\n",results->standalone_digit);
   696     }
   697     /*
   698      * If more than 20 lines contain hyphens at end,
   699      * don't bother reporting them.
   700      */
   701     warnings.hyphen=1;
   702     if (results->hyphens>20)
   703     {
   704 	warnings.hyphen=0;
   705 	g_print("   --> %ld lines in this file have hyphens at end. "
   706 	  "Not reporting them.\n",results->hyphens);
   707     }
   708     if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
   709     {
   710 	g_print("   --> Looks like this is HTML. Switching HTML mode ON.\n");
   711 	pswit[MARKUP_SWITCH]=1;
   712     }
   713     if (results->verylongline>0)
   714 	g_print("   --> %ld lines in this file are VERY long!\n",
   715 	  results->verylongline);
   716     /*
   717      * If there are more non-PG spaced dashes than PG em-dashes,
   718      * assume it's deliberate.
   719      * Current PG guidelines say don't use them, but older texts do,
   720      * and some people insist on them whatever the guidelines say.
   721      */
   722     warnings.dash=1;
   723     if (results->spacedash+results->non_PG_space_emdash>
   724       results->PG_space_emdash)
   725     {
   726 	warnings.dash=0;
   727 	g_print("   --> There are %ld spaced dashes and em-dashes. "
   728 	  "Not reporting them.\n",
   729 	  results->spacedash+results->non_PG_space_emdash);
   730     }
   731     /* If more than a quarter of characters are hi-bit, bug out. */
   732     warnings.bin=1;
   733     if (results->binlen*4>results->totlen)
   734     {
   735 	g_print("   --> This file does not appear to be ASCII. "
   736 	  "Terminating. Best of luck with it!\n");
   737 	exit(1);
   738     }
   739     if (results->alphalen*4<results->totlen)
   740     {
   741 	g_print("   --> This file does not appear to be text. "
   742 	  "Terminating. Best of luck with it!\n");
   743 	exit(1);
   744     }
   745     if (results->binlen*100>results->totlen || results->binlen>100)
   746     {
   747 	g_print("   --> There are a lot of foreign letters here. "
   748 	  "Not reporting them.\n");
   749 	warnings.bin=0;
   750     }
   751     warnings.isDutch=FALSE;
   752     if (results->Dutchcount>50)
   753     {
   754 	warnings.isDutch=TRUE;
   755 	g_print("   --> This looks like Dutch - "
   756 	  "switching off dashes and warnings for 's Middags case.\n");
   757     }
   758     warnings.isFrench=FALSE;
   759     if (results->Frenchcount>50)
   760     {
   761 	warnings.isFrench=TRUE;
   762 	g_print("   --> This looks like French - "
   763 	  "switching off some doublepunct.\n");
   764     }
   765     if (results->firstline && results->footerline)
   766 	g_print("    The PG header and footer appear to be already on.\n");
   767     else
   768     {
   769 	if (results->firstline)
   770 	    g_print("    The PG header is on - no footer.\n");
   771 	if (results->footerline)
   772 	    g_print("    The PG footer is on - no header.\n");
   773     }
   774     g_print("\n");
   775     if (pswit[VERBOSE_SWITCH])
   776     {
   777 	warnings.bin=1;
   778 	warnings.shortline=1;
   779 	warnings.dotcomma=1;
   780 	warnings.longline=1;
   781 	warnings.dash=1;
   782 	warnings.digit=1;
   783 	warnings.ast=1;
   784 	warnings.fslash=1;
   785 	warnings.hyphen=1;
   786 	warnings.endquote=1;
   787 	g_print("   *** Verbose output is ON -- you asked for it! ***\n");
   788     }
   789     if (warnings.isDutch)
   790 	warnings.dash=0;
   791     if (results->footerline>0 && results->firstline>0 &&
   792       results->footerline>results->firstline &&
   793       results->footerline-results->firstline<100)
   794     {
   795 	g_print("   --> I don't really know where this text starts. \n");
   796 	g_print("       There are no reference points.\n");
   797 	g_print("       I'm going to have to report the header and footer "
   798 	  "as well.\n");
   799 	results->firstline=0;
   800     }
   801     return &warnings;
   802 }
   803 
   804 /*
   805  * analyse_quotes:
   806  *
   807  * Look along the line, accumulate the count of quotes, and see
   808  * if this is an empty line - i.e. a line with nothing on it
   809  * but spaces.
   810  * If line has just spaces, period, * and/or - on it, don't
   811  * count it, since empty lines with asterisks or dashes to
   812  * separate sections are common.
   813  *
   814  * Returns: TRUE if the line is empty.
   815  */
   816 gboolean analyse_quotes(const char *aline,struct counters *counters)
   817 {
   818     int guessquote=0;
   819     /* assume the line is empty until proven otherwise */
   820     gboolean isemptyline=TRUE;
   821     const char *s=aline,*sprev,*snext;
   822     gunichar c;
   823     sprev=NULL;
   824     while (*s)
   825     {
   826 	snext=g_utf8_next_char(s);
   827 	c=g_utf8_get_char(s);
   828 	if (c==CHAR_DQUOTE)
   829 	    counters->quot++;
   830 	if (c==CHAR_SQUOTE || c==CHAR_OPEN_SQUOTE)
   831 	{
   832 	    if (s==aline)
   833 	    {
   834 		/*
   835 		 * At start of line, it can only be an openquote.
   836 		 * Hardcode a very common exception!
   837 		 */
   838 		if (!g_str_has_prefix(snext,"tis") &&
   839 		  !g_str_has_prefix(snext,"Tis"))
   840 		    counters->open_single_quote++;
   841 	    }
   842 	    else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
   843 	      g_unichar_isalpha(g_utf8_get_char(snext)))
   844 		/* Do nothing! it's definitely an apostrophe, not a quote */
   845 		;
   846 	    /* it's outside a word - let's check it out */
   847 	    else if (c==CHAR_OPEN_SQUOTE ||
   848 	      g_unichar_isalpha(g_utf8_get_char(snext)))
   849 	    {
   850 		/* it damwell better BE an openquote */
   851 		if (!g_str_has_prefix(snext,"tis") &&
   852 		  !g_str_has_prefix(snext,"Tis"))
   853 		    /* hardcode a very common exception! */
   854 		    counters->open_single_quote++;
   855 	    }
   856 	    else
   857 	    {
   858 		/* now - is it a closequote? */
   859 		guessquote=0;   /* accumulate clues */
   860 		if (g_unichar_isalpha(g_utf8_get_char(sprev)))
   861 		{
   862 		    /* it follows a letter - could be either */
   863 		    guessquote++;
   864 		    if (g_utf8_get_char(sprev)=='s')
   865 		    {
   866 			/* looks like a plural apostrophe */
   867 			guessquote-=3;
   868 			if (g_utf8_get_char(snext)==CHAR_SPACE)
   869 			    /* bonus marks! */
   870 			    guessquote-=2;
   871 		    }
   872 		}
   873 		/* it doesn't have a letter either side */
   874 		else if (strchr(".?!,;:",g_utf8_get_char(sprev)) &&
   875 		  strchr(".?!,;: ",g_utf8_get_char(snext)))
   876 		    guessquote+=8; /* looks like a closequote */
   877 		else
   878 		    guessquote++;
   879 		if (counters->open_single_quote>counters->close_single_quote)
   880 		    /*
   881 		     * Give it the benefit of some doubt,
   882 		     * if a squote is already open.
   883 		     */
   884 		    guessquote++;
   885 		else
   886 		    guessquote--;
   887 		if (guessquote>=0)
   888 		    counters->close_single_quote++;
   889 	    }
   890 	}
   891 	if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
   892 	  c!='\r' && c!='\n')
   893 	    isemptyline=FALSE;  /* ignore lines like  *  *  *  as spacers */
   894 	if (c==CHAR_UNDERSCORE)
   895 	    counters->c_unders++;
   896 	if (c==CHAR_OPEN_CBRACK)
   897 	    counters->c_brack++;
   898 	if (c==CHAR_CLOSE_CBRACK)
   899 	    counters->c_brack--;
   900 	if (c==CHAR_OPEN_RBRACK)
   901 	    counters->r_brack++;
   902 	if (c==CHAR_CLOSE_RBRACK)
   903 	    counters->r_brack--;
   904 	if (c==CHAR_OPEN_SBRACK)
   905 	    counters->s_brack++;
   906 	if (c==CHAR_CLOSE_SBRACK)
   907 	    counters->s_brack--;
   908 	sprev=s;
   909 	s=snext;
   910     }
   911     return isemptyline;
   912 }
   913 
   914 /*
   915  * check_for_control_characters:
   916  *
   917  * Check for invalid or questionable characters in the line
   918  * Anything above 127 is invalid for plain ASCII, and
   919  * non-printable control characters should also be flagged.
   920  * Tabs should generally not be there.
   921  */
   922 void check_for_control_characters(const char *aline)
   923 {
   924     gunichar c;
   925     const char *s;
   926     for (s=aline;*s;s=g_utf8_next_char(s))
   927     {
   928 	c=g_utf8_get_char(s);
   929 	if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
   930 	{
   931 	    if (pswit[ECHO_SWITCH])
   932 		g_print("\n%s\n",aline);
   933 	    if (!pswit[OVERVIEW_SWITCH])
   934 		g_print("    Line %ld column %ld - Control character %u\n",
   935 		  linecnt,g_utf8_pointer_to_offset(s,aline)+1,c);
   936 	    else
   937 		cnt_bin++;
   938 	}
   939     }
   940 }
   941 
   942 /*
   943  * check_for_odd_characters:
   944  *
   945  * Check for binary and other odd characters.
   946  */
   947 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
   948   gboolean isemptyline)
   949 {
   950     /* Don't repeat multiple warnings on one line. */
   951     gboolean eNon_A=FALSE,eTab=FALSE,eTilde=FALSE;
   952     gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
   953     const char *s;
   954     gunichar c;
   955     for (s=aline;*s;s=g_utf8_next_char(s))
   956     {
   957 	c=g_utf8_get_char(s);
   958 	if (!eNon_A && (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
   959 	{
   960 	    if (pswit[ECHO_SWITCH])
   961 		g_print("\n%s\n",aline);
   962 	    if (!pswit[OVERVIEW_SWITCH])
   963 		if (c>127 && c<160 || c>255)
   964 		    g_print("    Line %ld column %ld - "
   965 		      "Non-ISO-8859 character %u\n",
   966 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
   967 		else
   968 		    g_print("    Line %ld column %ld - "
   969 		      "Non-ASCII character %u\n",
   970 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
   971 	    else
   972 		cnt_bin++;
   973 	    eNon_A=TRUE;
   974 	}
   975 	if (!eTab && c==CHAR_TAB)
   976 	{
   977 	    if (pswit[ECHO_SWITCH])
   978 		g_print("\n%s\n",aline);
   979 	    if (!pswit[OVERVIEW_SWITCH])
   980 		g_print("    Line %ld column %ld - Tab character?\n",
   981 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
   982 	    else
   983 		cnt_odd++;
   984 	    eTab=TRUE;
   985 	}
   986 	if (!eTilde && c==CHAR_TILDE)
   987 	{
   988 	    /*
   989 	     * Often used by OCR software to indicate an
   990 	     * unrecognizable character.
   991 	     */
   992 	    if (pswit[ECHO_SWITCH])
   993 		g_print("\n%s\n",aline);
   994 	    if (!pswit[OVERVIEW_SWITCH])
   995 		g_print("    Line %ld column %ld - Tilde character?\n",
   996 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
   997 	    else
   998 		cnt_odd++;
   999 	    eTilde=TRUE;
  1000 	}
  1001 	if (!eCarat && c==CHAR_CARAT)
  1002 	{  
  1003 	    if (pswit[ECHO_SWITCH])
  1004 		g_print("\n%s\n",aline);
  1005 	    if (!pswit[OVERVIEW_SWITCH])
  1006 		g_print("    Line %ld column %ld - Carat character?\n",
  1007 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1008 	    else
  1009 		cnt_odd++;
  1010 	    eCarat=TRUE;
  1011 	}
  1012 	if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
  1013 	{  
  1014 	    if (pswit[ECHO_SWITCH])
  1015 		g_print("\n%s\n",aline);
  1016 	    if (!pswit[OVERVIEW_SWITCH])
  1017 		g_print("    Line %ld column %ld - Forward slash?\n",
  1018 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1019 	    else
  1020 		cnt_odd++;
  1021 	    eFSlash=TRUE;
  1022 	}
  1023 	/*
  1024 	 * Report asterisks only in paranoid mode,
  1025 	 * since they're often deliberate.
  1026 	 */
  1027 	if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
  1028 	  c==CHAR_ASTERISK)
  1029 	{
  1030 	    if (pswit[ECHO_SWITCH])
  1031 		g_print("\n%s\n",aline);
  1032 	    if (!pswit[OVERVIEW_SWITCH])
  1033 		g_print("    Line %ld column %ld - Asterisk?\n",
  1034 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1035 	    else
  1036 		cnt_odd++;
  1037 	    eAst=TRUE;
  1038 	}
  1039     }
  1040 }
  1041 
  1042 /*
  1043  * check_for_long_line:
  1044  *
  1045  * Check for line too long.
  1046  */
  1047 void check_for_long_line(const char *aline)
  1048 {
  1049     if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
  1050     {
  1051 	if (pswit[ECHO_SWITCH])
  1052 	    g_print("\n%s\n",aline);
  1053 	if (!pswit[OVERVIEW_SWITCH])
  1054 	    g_print("    Line %ld column %ld - Long line %ld\n",
  1055 	      linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
  1056 	else
  1057 	    cnt_long++;
  1058     }
  1059 }
  1060 
  1061 /*
  1062  * check_for_short_line:
  1063  *
  1064  * Check for line too short.
  1065  *
  1066  * This one is a bit trickier to implement: we don't want to
  1067  * flag the last line of a paragraph for being short, so we
  1068  * have to wait until we know that our current line is a
  1069  * "normal" line, then report the _previous_ line if it was too
  1070  * short. We also don't want to report indented lines like
  1071  * chapter heads or formatted quotations. We therefore keep
  1072  * last->len as the length of the last line examined, and
  1073  * last->blen as the length of the last but one, and try to
  1074  * suppress unnecessary warnings by checking that both were of
  1075  * "normal" length. We keep the first character of the last
  1076  * line in last->start, and if it was a space, we assume that
  1077  * the formatting is deliberate. I can't figure out a way to
  1078  * distinguish something like a quoted verse left-aligned or
  1079  * the header or footer of a letter from a paragraph of short
  1080  * lines - maybe if I examined the whole paragraph, and if the
  1081  * para has less than, say, 8 lines and if all lines are short,
  1082  * then just assume it's OK? Need to look at some texts to see
  1083  * how often a formula like this would get the right result.
  1084  */
  1085 void check_for_short_line(const char *aline,const struct line_properties *last)
  1086 {
  1087     if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
  1088       last->len<SHORTEST_PG_LINE && last->blen>1 &&
  1089       last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
  1090     {
  1091 	if (pswit[ECHO_SWITCH])
  1092 	    g_print("\n%s\n",prevline);
  1093 	if (!pswit[OVERVIEW_SWITCH])
  1094 	    g_print("    Line %ld column %ld - Short line %ld?\n",
  1095 	      linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
  1096 	else
  1097 	    cnt_short++;
  1098     }
  1099 }
  1100 
  1101 /*
  1102  * check_for_starting_punctuation:
  1103  *
  1104  * Look for punctuation other than full ellipses at start of line.
  1105  */
  1106 void check_for_starting_punctuation(const char *aline)
  1107 {
  1108     if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
  1109       !g_str_has_prefix(aline,". . ."))
  1110     {
  1111 	if (pswit[ECHO_SWITCH])
  1112 	    g_print("\n%s\n",aline);
  1113 	if (!pswit[OVERVIEW_SWITCH])
  1114 	    g_print("    Line %ld column 1 - Begins with punctuation?\n",
  1115 	      linecnt);
  1116 	else
  1117 	    cnt_punct++;
  1118     }
  1119 }
  1120 
  1121 /*
  1122  * check_for_spaced_emdash:
  1123  *
  1124  * Check for spaced em-dashes.
  1125  *
  1126  * We must check _all_ occurrences of "--" on the line
  1127  * hence the loop - even if the first double-dash is OK
  1128  * there may be another that's wrong later on.
  1129  */
  1130 void check_for_spaced_emdash(const char *aline)
  1131 {
  1132     const char *s,*t,*next;
  1133     for (s=aline;t=strstr(s,"--");s=next)
  1134     {
  1135 	next=g_utf8_next_char(g_utf8_next_char(t));
  1136 	if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
  1137 	  g_utf8_get_char(next)==CHAR_SPACE)
  1138 	{
  1139 	    if (pswit[ECHO_SWITCH])
  1140 		g_print("\n%s\n",aline);
  1141 	    if (!pswit[OVERVIEW_SWITCH])
  1142 		g_print("    Line %ld column %ld - Spaced em-dash?\n",
  1143 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  1144 	    else
  1145 		cnt_dash++;
  1146 	}
  1147     }
  1148 }
  1149 
  1150 /*
  1151  * check_for_spaced_dash:
  1152  *
  1153  * Check for spaced dashes.
  1154  */
  1155 void check_for_spaced_dash(const char *aline)
  1156 {
  1157     const char *s;
  1158     if ((s=strstr(aline," -")))
  1159     {
  1160 	if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
  1161 	{
  1162 	    if (pswit[ECHO_SWITCH])
  1163 		g_print("\n%s\n",aline);
  1164 	    if (!pswit[OVERVIEW_SWITCH])
  1165 		g_print("    Line %ld column %ld - Spaced dash?\n",
  1166 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1167 	    else
  1168 		cnt_dash++;
  1169 	}
  1170     }
  1171     else if ((s=strstr(aline,"- ")))
  1172     {
  1173 	if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
  1174 	{
  1175 	    if (pswit[ECHO_SWITCH])
  1176 		g_print("\n%s\n",aline);
  1177 	    if (!pswit[OVERVIEW_SWITCH])
  1178 		g_print("    Line %ld column %ld - Spaced dash?\n",
  1179 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1180 	    else
  1181 		cnt_dash++;
  1182 	}
  1183     }
  1184 }
  1185 
  1186 /*
  1187  * check_for_unmarked_paragraphs:
  1188  *
  1189  * Check for unmarked paragraphs indicated by separate speakers.
  1190  *
  1191  * May well be false positive:
  1192  * "Bravo!" "Wonderful!" called the crowd.
  1193  * but useful all the same.
  1194  */
  1195 void check_for_unmarked_paragraphs(const char *aline)
  1196 {
  1197     const char *s;
  1198     s=strstr(aline,"\"  \"");
  1199     if (!s)
  1200 	s=strstr(aline,"\" \"");
  1201     if (s)
  1202     {
  1203 	if (pswit[ECHO_SWITCH])
  1204 	    g_print("\n%s\n",aline);
  1205 	if (!pswit[OVERVIEW_SWITCH])
  1206 	    g_print("    Line %ld column %ld - "
  1207 	      "Query missing paragraph break?\n",
  1208 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1209 	else
  1210 	    cnt_punct++;
  1211     }
  1212 }
  1213 
  1214 /*
  1215  * check_for_jeebies:
  1216  *
  1217  * Check for "to he" and other easy h/b errors.
  1218  *
  1219  * This is a very inadequate effort on the h/b problem,
  1220  * but the phrase "to he" is always an error, whereas "to
  1221  * be" is quite common.
  1222  * Similarly, '"Quiet!", be said.' is a non-be error
  1223  * "to he" is _not_ always an error!:
  1224  *       "Where they went to he couldn't say."
  1225  * Another false positive:
  1226  *       What would "Cinderella" be without the . . .
  1227  * and another: "If he wants to he can see for himself."
  1228  */
  1229 void check_for_jeebies(const char *aline)
  1230 {
  1231     const char *s;
  1232     s=strstr(aline," be could ");
  1233     if (!s)
  1234 	s=strstr(aline," be would ");
  1235     if (!s)
  1236 	s=strstr(aline," was be ");
  1237     if (!s)
  1238 	s=strstr(aline," be is ");
  1239     if (!s)
  1240 	s=strstr(aline," is be ");
  1241     if (!s)
  1242 	s=strstr(aline,"\", be ");
  1243     if (!s)
  1244 	s=strstr(aline,"\" be ");
  1245     if (!s)
  1246 	s=strstr(aline,"\" be ");
  1247     if (!s)
  1248 	s=strstr(aline," to he ");
  1249     if (s)
  1250     {
  1251 	if (pswit[ECHO_SWITCH])
  1252 	    g_print("\n%s\n",aline);
  1253 	if (!pswit[OVERVIEW_SWITCH])
  1254 	    g_print("    Line %ld column %ld - Query he/be error?\n",
  1255 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1256 	else
  1257 	    cnt_word++;
  1258     }
  1259     s=strstr(aline," the had ");
  1260     if (!s)
  1261 	s=strstr(aline," a had ");
  1262     if (!s)
  1263 	s=strstr(aline," they bad ");
  1264     if (!s)
  1265 	s=strstr(aline," she bad ");
  1266     if (!s)
  1267 	s=strstr(aline," he bad ");
  1268     if (!s)
  1269 	s=strstr(aline," you bad ");
  1270     if (!s)
  1271 	s=strstr(aline," i bad ");
  1272     if (s)
  1273     {
  1274 	if (pswit[ECHO_SWITCH])
  1275 	    g_print("\n%s\n",aline);
  1276 	if (!pswit[OVERVIEW_SWITCH])
  1277 	    g_print("    Line %ld column %ld - Query had/bad error?\n",
  1278 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1279 	else
  1280 	    cnt_word++;
  1281     }
  1282     s=strstr(aline,"; hut ");
  1283     if (!s)
  1284 	s=strstr(aline,", hut ");
  1285     if (s)
  1286     {
  1287 	if (pswit[ECHO_SWITCH])
  1288 	    g_print("\n%s\n",aline);
  1289 	if (!pswit[OVERVIEW_SWITCH])
  1290 	    g_print("    Line %ld column %ld - Query hut/but error?\n",
  1291 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1292 	else
  1293 	    cnt_word++;
  1294     }
  1295 }
  1296 
  1297 /*
  1298  * check_for_mta_from:
  1299  *
  1300  * Special case - angled bracket in front of "From" placed there by an
  1301  * MTA when sending an e-mail.
  1302  */
  1303 void check_for_mta_from(const char *aline)
  1304 {
  1305     const char *s;
  1306     s=strstr(aline,">From");
  1307     if (s)
  1308     {
  1309 	if (pswit[ECHO_SWITCH])
  1310 	    g_print("\n%s\n",aline);
  1311 	if (!pswit[OVERVIEW_SWITCH])
  1312 	    g_print("    Line %ld column %ld - "
  1313 	      "Query angled bracket with From\n",
  1314 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1315 	else
  1316 	    cnt_punct++;
  1317     }
  1318 }
  1319 
  1320 /*
  1321  * check_for_orphan_character:
  1322  *
  1323  * Check for a single character line -
  1324  * often an overflow from bad wrapping.
  1325  */
  1326 void check_for_orphan_character(const char *aline)
  1327 {
  1328     gunichar c;
  1329     c=g_utf8_get_char(aline);
  1330     if (c && !*g_utf8_next_char(aline))
  1331     {
  1332 	if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
  1333 	    ; /* Nothing - ignore numerals alone on a line. */
  1334 	else
  1335 	{
  1336 	    if (pswit[ECHO_SWITCH])
  1337 		g_print("\n%s\n",aline);
  1338 	    if (!pswit[OVERVIEW_SWITCH])
  1339 		g_print("    Line %ld column 1 - Query single character line\n",
  1340 		  linecnt);
  1341 	    else
  1342 		cnt_punct++;
  1343 	}
  1344     }
  1345 }
  1346 
  1347 /*
  1348  * check_for_pling_scanno:
  1349  *
  1350  * Check for I" - often should be !
  1351  */
  1352 void check_for_pling_scanno(const char *aline)
  1353 {
  1354     const char *s;
  1355     s=strstr(aline," I\"");
  1356     if (s)
  1357     {
  1358 	if (pswit[ECHO_SWITCH])
  1359 	    g_print("\n%s\n",aline);
  1360 	if (!pswit[OVERVIEW_SWITCH])
  1361 	    g_print("    Line %ld column %ld - Query I=exclamation mark?\n",
  1362 	      linecnt,g_utf8_pointer_to_offset(aline,s));
  1363 	else
  1364 	    cnt_punct++;
  1365     }
  1366 }
  1367 
  1368 /*
  1369  * check_for_extra_period:
  1370  *
  1371  * Check for period without a capital letter. Cut-down from gutspell.
  1372  * Only works when it happens on a single line.
  1373  */
  1374 void check_for_extra_period(const char *aline,const struct warnings *warnings)
  1375 {
  1376     const char *s,*t,*s1;
  1377     int i;
  1378     gsize len;
  1379     gboolean istypo;
  1380     gchar *testword;
  1381     gunichar *decomposition;
  1382     if (pswit[PARANOID_SWITCH])
  1383     {
  1384 	for (t=aline;t=strstr(t,". ");)
  1385 	{
  1386 	    if (t==aline)
  1387 	    {
  1388 		t=g_utf8_next_char(t);
  1389 		/* start of line punctuation is handled elsewhere */
  1390 		continue;
  1391 	    }
  1392 	    if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
  1393 	    {
  1394 		t=g_utf8_next_char(t);
  1395 		continue;
  1396 	    }
  1397 	    if (warnings->isDutch)
  1398 	    {
  1399 		/* For Frank & Jeroen -- 's Middags case */
  1400 		gunichar c2,c3,c4,c5;
  1401 		c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
  1402 		c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
  1403 		c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
  1404 		c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
  1405 		if (c2==CHAR_SQUOTE && g_unichar_islower(c3) &&
  1406 		  c4==CHAR_SPACE && g_unichar_isupper(c5))
  1407 		{
  1408 		    t=g_utf8_next_char(t);
  1409 		    continue;
  1410 		}
  1411 	    }
  1412 	    s1=g_utf8_next_char(g_utf8_next_char(t));
  1413 	    while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
  1414 	      !isdigit(g_utf8_get_char(s1)))
  1415 		s1=g_utf8_next_char(s1);
  1416 	    if (g_unichar_islower(g_utf8_get_char(s1)))
  1417 	    {
  1418 		/* we have something to investigate */
  1419 		istypo=TRUE;
  1420 		/* so let's go back and find out */
  1421 		for (s1=g_utf8_prev_char(t);s1>=aline &&
  1422 		  (g_unichar_isalpha(g_utf8_get_char(s1)) ||
  1423 		  g_unichar_isdigit(g_utf8_get_char(s1)) ||
  1424 		  g_utf8_get_char(s1)==CHAR_SQUOTE &&
  1425 		  g_unichar_isalpha(g_utf8_get_char(g_utf8_next_char(s1))) &&
  1426 		  g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s1))));
  1427 		  s1=g_utf8_prev_char(s1))
  1428 		    ;
  1429 		s1=g_utf8_next_char(s1);
  1430 		s=strchr(s1,'.');
  1431 		if (s)
  1432 		    testword=g_strndup(s1,s-s1);
  1433 		else
  1434 		    testword=g_strdup(s1);
  1435 		for (i=0;*abbrev[i];i++)
  1436 		    if (!strcmp(testword,abbrev[i]))
  1437 			istypo=FALSE;
  1438 		if (g_unichar_isdigit(g_utf8_get_char(testword)))
  1439 		    istypo=FALSE;
  1440 		if (!*g_utf8_next_char(testword))
  1441 		    istypo=FALSE;
  1442 		if (isroman(testword))
  1443 		    istypo=FALSE;
  1444 		if (istypo)
  1445 		{
  1446 		    istypo=FALSE;
  1447 		    for (s=testword;*s;s=g_utf8_next_char(s))
  1448 		    {
  1449 			decomposition=g_unicode_canonical_decomposition(
  1450 			  g_utf8_get_char(s),&len);
  1451 			if (g_utf8_strchr("aeiou",-1,decomposition[0]))
  1452 			    istypo=TRUE;
  1453 			g_free(decomposition);
  1454 		    }
  1455 		}
  1456 		if (istypo &&
  1457 		  (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
  1458 		{
  1459 		    g_tree_insert(qperiod,g_strdup(testword),
  1460 		      GINT_TO_POINTER(1));
  1461 		    if (pswit[ECHO_SWITCH])
  1462 			g_print("\n%s\n",aline);
  1463 		    if (!pswit[OVERVIEW_SWITCH])
  1464 			g_print("    Line %ld column %ld - Extra period?\n",
  1465 			  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  1466 		    else
  1467 			cnt_punct++;
  1468 		}
  1469 		g_free(testword);
  1470 	    }
  1471 	    t=g_utf8_next_char(t);
  1472 	}
  1473     }
  1474 }
  1475 
  1476 /*
  1477  * check_for_following_punctuation:
  1478  *
  1479  * Check for words usually not followed by punctuation.
  1480  */
  1481 void check_for_following_punctuation(const char *aline)
  1482 {
  1483     int i;
  1484     const char *s,*wordstart;
  1485     gunichar c;
  1486     gchar *inword,*t;
  1487     if (pswit[TYPO_SWITCH])
  1488     {
  1489 	for (s=aline;*s;)
  1490 	{
  1491 	    wordstart=s;
  1492 	    t=getaword(&s);
  1493 	    if (!*t)
  1494 	    {
  1495 		g_free(t);
  1496 		continue;
  1497 	    }
  1498 	    inword=g_utf8_strdown(t,-1);
  1499 	    g_free(t);
  1500 	    for (i=0;*nocomma[i];i++)
  1501 		if (!strcmp(inword,nocomma[i]))
  1502 		{
  1503 		    c=g_utf8_get_char(s);
  1504 		    if (c==',' || c==';' || c==':')
  1505 		    {
  1506 			if (pswit[ECHO_SWITCH])
  1507 			    g_print("\n%s\n",aline);
  1508 			if (!pswit[OVERVIEW_SWITCH])
  1509 			    g_print("    Line %ld column %ld - "
  1510 			      "Query punctuation after %s?\n",
  1511 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,
  1512 			      inword);
  1513 			else
  1514 			    cnt_punct++;
  1515 		    }
  1516 		}
  1517 	    for (i=0;*noperiod[i];i++)
  1518 		if (!strcmp(inword,noperiod[i]))
  1519 		{
  1520 		    c=g_utf8_get_char(s);
  1521 		    if (c=='.' || c=='!')
  1522 		    {
  1523 			if (pswit[ECHO_SWITCH])
  1524 			    g_print("\n%s\n",aline);
  1525 			if (!pswit[OVERVIEW_SWITCH])
  1526 			    g_print("    Line %ld column %ld - "
  1527 			      "Query punctuation after %s?\n",
  1528 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,
  1529 			      inword);
  1530 			else
  1531 			    cnt_punct++;
  1532 		    }
  1533 		}
  1534 	    g_free(inword);
  1535 	}
  1536     }
  1537 }
  1538 
  1539 /*
  1540  * check_for_typos:
  1541  *
  1542  * Check for commonly mistyped words,
  1543  * and digits like 0 for O in a word.
  1544  */
  1545 void check_for_typos(const char *aline,struct warnings *warnings)
  1546 {
  1547     const char *s,*t,*nt,*wordstart;
  1548     gchar *inword;
  1549     gunichar *decomposition;
  1550     gchar *testword;
  1551     int i,vowel,consonant,*dupcnt;
  1552     gboolean isdup,istypo,alower;
  1553     gunichar c;
  1554     long offset,len;
  1555     gsize decomposition_len;
  1556     for (s=aline;*s;)
  1557     {
  1558 	wordstart=s;
  1559 	inword=getaword(&s);
  1560 	if (!*inword)
  1561 	{
  1562 	    g_free(inword);
  1563 	    continue; /* don't bother with empty lines */
  1564 	}
  1565 	if (mixdigit(inword))
  1566 	{
  1567 	    if (pswit[ECHO_SWITCH])
  1568 		g_print("\n%s\n",aline);
  1569 	    if (!pswit[OVERVIEW_SWITCH])
  1570 		g_print("    Line %ld column %ld - Query digit in %s\n",
  1571 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
  1572 	    else
  1573 		cnt_word++;
  1574 	}
  1575 	/*
  1576 	 * Put the word through a series of tests for likely typos and OCR
  1577 	 * errors.
  1578 	 */
  1579 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
  1580 	{
  1581 	    istypo=FALSE;
  1582 	    alower=FALSE;
  1583 	    for (t=inword;*t;t=g_utf8_next_char(t))
  1584 	    {
  1585 		c=g_utf8_get_char(t);
  1586 		nt=g_utf8_next_char(t);
  1587 		/* lowercase for testing */
  1588 		if (g_unichar_islower(c))
  1589 		    alower=TRUE;
  1590 		if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
  1591 		{
  1592 		    /*
  1593 		     * We have an uppercase mid-word. However, there are
  1594 		     * common cases:
  1595 		     *   Mac and Mc like McGill
  1596 		     *   French contractions like l'Abbe
  1597 		     */
  1598 		    offset=g_utf8_pointer_to_offset(inword,t);
  1599 		    if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
  1600 		      offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
  1601 		      g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
  1602 		      offset>0 &&
  1603 		      g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SQUOTE)
  1604 			; /* do nothing! */
  1605 		    else
  1606 			istypo=TRUE;
  1607 		}
  1608 	    }
  1609 	    testword=g_utf8_casefold(inword,-1);
  1610 	}
  1611 	if (pswit[TYPO_SWITCH])
  1612 	{
  1613 	    /*
  1614 	     * Check for certain unlikely two-letter combinations at word
  1615 	     * start and end.
  1616 	     */
  1617 	    len=g_utf8_strlen(testword,-1);
  1618 	    if (len>1)
  1619 	    {
  1620 		for (i=0;*nostart[i];i++)
  1621 		    if (g_str_has_prefix(testword,nostart[i]))
  1622 			istypo=TRUE;
  1623 		for (i=0;*noend[i];i++)
  1624 		    if (g_str_has_suffix(testword,noend[i]))
  1625 			istypo=TRUE;
  1626 	    }
  1627 	    /* ght is common, gbt never. Like that. */
  1628 	    if (strstr(testword,"cb"))
  1629 		istypo=TRUE;
  1630 	    if (strstr(testword,"gbt"))
  1631 		istypo=TRUE;
  1632 	    if (strstr(testword,"pbt"))
  1633 		istypo=TRUE;
  1634 	    if (strstr(testword,"tbs"))
  1635 		istypo=TRUE;
  1636 	    if (strstr(testword,"mrn"))
  1637 		istypo=TRUE;
  1638 	    if (strstr(testword,"ahle"))
  1639 		istypo=TRUE;
  1640 	    if (strstr(testword,"ihle"))
  1641 		istypo=TRUE;
  1642 	    /*
  1643 	     * "TBE" does happen - like HEARTBEAT - but uncommon.
  1644 	     * Also "TBI" - frostbite, outbid - but uncommon.
  1645 	     * Similarly "ii" like Hawaii, or Pompeii, and in Roman
  1646 	     * numerals, but "ii" is a common scanno.
  1647 	     */
  1648 	    if (strstr(testword,"tbi"))
  1649 		istypo=TRUE;
  1650 	    if (strstr(testword,"tbe"))
  1651 		istypo=TRUE;
  1652 	    if (strstr(testword,"ii"))
  1653 		istypo=TRUE;
  1654 	    /*
  1655 	     * Check for no vowels or no consonants.
  1656 	     * If none, flag a typo.
  1657 	     */
  1658 	    if (!istypo && len>1)
  1659 	    {
  1660 		vowel=consonant=0;
  1661 		for (t=testword;*t;t=g_utf8_next_char(t))
  1662 		{
  1663 		    c=g_utf8_get_char(t);
  1664 		    decomposition=
  1665 		      g_unicode_canonical_decomposition(c,&decomposition_len);
  1666 		    if (c=='y' || g_unichar_isdigit(c))
  1667 		    {
  1668 			/* Yah, this is loose. */
  1669 			vowel++;
  1670 			consonant++;
  1671 		    }
  1672 		    else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
  1673 			vowel++;
  1674 		    else
  1675 			consonant++;
  1676 		    g_free(decomposition);
  1677 		}
  1678 		if (!vowel || !consonant)
  1679 		    istypo=TRUE;
  1680 	    }
  1681 	    /*
  1682 	     * Now exclude the word from being reported if it's in
  1683 	     * the okword list.
  1684 	     */
  1685 	    for (i=0;*okword[i];i++)
  1686 		if (!strcmp(testword,okword[i]))
  1687 		    istypo=FALSE;
  1688 	    /*
  1689 	     * What looks like a typo may be a Roman numeral.
  1690 	     * Exclude these.
  1691 	     */
  1692 	    if (istypo && isroman(testword))
  1693 		istypo=FALSE;
  1694 	    /* Check the manual list of typos. */
  1695 	    if (!istypo)
  1696 		for (i=0;*typo[i];i++)
  1697 		    if (!strcmp(testword,typo[i]))
  1698 			istypo=TRUE;
  1699 	    /*
  1700 	     * Check lowercase s, l, i and m - special cases.
  1701 	     *   "j" - often a semi-colon gone wrong.
  1702 	     *   "d" for a missing apostrophe - he d
  1703 	     *   "n" for "in"
  1704 	     */
  1705 	    if (!istypo && len==1 &&
  1706 	      g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
  1707 		istypo=TRUE;
  1708 	    if (istypo)
  1709 	    {
  1710 		dupcnt=g_tree_lookup(qword,testword);
  1711 		if (dupcnt)
  1712 		{
  1713 		    (*dupcnt)++;
  1714 		    isdup=!pswit[VERBOSE_SWITCH];
  1715 		}
  1716 		else
  1717 		{
  1718 		    dupcnt=g_new0(int,1);
  1719 		    g_tree_insert(qword,g_strdup(testword),dupcnt);
  1720 		    isdup=FALSE;
  1721 		}
  1722 		if (!isdup)
  1723 		{
  1724 		    if (pswit[ECHO_SWITCH])
  1725 			g_print("\n%s\n",aline);
  1726 		    if (!pswit[OVERVIEW_SWITCH])
  1727 		    {
  1728 			g_print("    Line %ld column %ld - Query word %s",
  1729 			  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
  1730 			  inword);
  1731 			if (!pswit[VERBOSE_SWITCH])
  1732 			    g_print(" - not reporting duplicates");
  1733 			g_print("\n");
  1734 		    }
  1735 		    else
  1736 			cnt_word++;
  1737 		}
  1738 	    }
  1739 	}
  1740 	/* check the user's list of typos */
  1741 	if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
  1742 	{
  1743 	    if (pswit[ECHO_SWITCH])
  1744 		g_print("\n%s\n",aline);
  1745 	    if (!pswit[OVERVIEW_SWITCH])  
  1746 		g_print("    Line %ld column %ld - Query possible scanno %s\n",
  1747 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
  1748 	}
  1749 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
  1750 	    g_free(testword);
  1751 	if (pswit[PARANOID_SWITCH] && warnings->digit)
  1752 	{
  1753 	    /* In paranoid mode, query all 0 and 1 standing alone. */
  1754 	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))
  1755 	    {
  1756 		if (pswit[ECHO_SWITCH])
  1757 		    g_print("\n%s\n",aline);
  1758 		if (!pswit[OVERVIEW_SWITCH])
  1759 		    g_print("    Line %ld column %ld - Query standalone %s\n",
  1760 		      linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
  1761 		      inword);
  1762 		else
  1763 		    cnt_word++;
  1764 	    }
  1765 	}
  1766 	g_free(inword);
  1767     }
  1768 }
  1769 
  1770 /*
  1771  * check_for_misspaced_punctuation:
  1772  *
  1773  * Look for added or missing spaces around punctuation and quotes.
  1774  * If there is a punctuation character like ! with no space on
  1775  * either side, suspect a missing!space. If there are spaces on
  1776  * both sides , assume a typo. If we see a double quote with no
  1777  * space or punctuation on either side of it, assume unspaced
  1778  * quotes "like"this.
  1779  */
  1780 void check_for_misspaced_punctuation(const char *aline,
  1781   struct parities *parities,gboolean isemptyline)
  1782 {
  1783     gboolean isacro,isellipsis;
  1784     const char *s;
  1785     gunichar c,nc,pc,n2c;
  1786     c=g_utf8_get_char(aline);
  1787     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  1788     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  1789     {
  1790 	pc=c;
  1791 	c=nc;
  1792 	nc=g_utf8_get_char(g_utf8_next_char(s));
  1793 	/* For each character in the line after the first. */
  1794 	if (g_utf8_strchr(".?!,;:_",-1,c))  /* if it's punctuation */
  1795 	{
  1796 	    /* we need to suppress warnings for acronyms like M.D. */
  1797 	    isacro=FALSE;
  1798 	    /* we need to suppress warnings for ellipsis . . . */
  1799 	    isellipsis=FALSE;
  1800 	    /*
  1801 	     * If there are letters on both sides of it or
  1802 	     * if it's strict punctuation followed by an alpha.
  1803 	     */
  1804 	    if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
  1805 	      g_utf8_strchr("?!,;:",-1,c)))
  1806 	    {
  1807 		if (c=='.')
  1808 		{
  1809 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&
  1810 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
  1811 			isacro=TRUE;
  1812 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
  1813 		    if (nc && n2c=='.')
  1814 			isacro=TRUE;
  1815 		}
  1816 		if (!isacro)
  1817 		{
  1818 		    if (pswit[ECHO_SWITCH])
  1819 			g_print("\n%s\n",aline);
  1820 		    if (!pswit[OVERVIEW_SWITCH])
  1821 			g_print("    Line %ld column %ld - Missing space?\n",
  1822 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1823 		    else
  1824 			cnt_punct++;
  1825 		}
  1826 	    }
  1827 	    if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
  1828 	    {
  1829 		/*
  1830 		 * If there are spaces on both sides,
  1831 		 * or space before and end of line.
  1832 		 */
  1833 		if (c=='.')
  1834 		{
  1835 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&
  1836 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
  1837 			isellipsis=TRUE;
  1838 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
  1839 		    if (nc && n2c=='.')
  1840 			isellipsis=TRUE;
  1841 		}
  1842 		if (!isemptyline && !isellipsis)
  1843 		{
  1844 		    if (pswit[ECHO_SWITCH])
  1845 			g_print("\n%s\n",aline);
  1846 		    if (!pswit[OVERVIEW_SWITCH])
  1847 			g_print("    Line %ld column %ld - "
  1848 			  "Spaced punctuation?\n",linecnt,
  1849 			  g_utf8_pointer_to_offset(aline,s)+1);
  1850 		    else
  1851 			cnt_punct++;
  1852 		}
  1853 	    }
  1854 	}
  1855     }
  1856     /* Split out the characters that CANNOT be preceded by space. */
  1857     c=g_utf8_get_char(aline);
  1858     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  1859     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  1860     {
  1861 	pc=c;
  1862 	c=nc;
  1863 	nc=g_utf8_get_char(g_utf8_next_char(s));
  1864 	/* for each character in the line after the first */
  1865 	if (g_utf8_strchr("?!,;:",-1,c))
  1866 	{
  1867 	    /* if it's punctuation that _cannot_ have a space before it */
  1868 	    if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
  1869 	    {
  1870 		/*
  1871 		 * If nc DOES == space,
  1872 		 * it was already reported just above.
  1873 		 */
  1874 		if (pswit[ECHO_SWITCH])
  1875 		    g_print("\n%s\n",aline);
  1876 		if (!pswit[OVERVIEW_SWITCH])
  1877 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",
  1878 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1879 		else
  1880 		    cnt_punct++;
  1881 	    }
  1882 	}
  1883     }
  1884     /*
  1885      * Special case " .X" where X is any alpha.
  1886      * This plugs a hole in the acronym code above.
  1887      * Inelegant, but maintainable.
  1888      */
  1889     c=g_utf8_get_char(aline);
  1890     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  1891     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  1892     {
  1893 	pc=c;
  1894 	c=nc;
  1895 	nc=g_utf8_get_char(g_utf8_next_char(s));
  1896 	/* for each character in the line after the first */
  1897 	if (c=='.')
  1898 	{
  1899 	    /* if it's a period */
  1900 	    if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
  1901 	    {
  1902 		/*
  1903 		 * If the period follows a space and
  1904 		 * is followed by a letter.
  1905 		 */
  1906 		if (pswit[ECHO_SWITCH])
  1907 		    g_print("\n%s\n",aline);
  1908 		if (!pswit[OVERVIEW_SWITCH])
  1909 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",
  1910 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1911 		else
  1912 		    cnt_punct++;
  1913 	    }
  1914 	}
  1915     }
  1916     c=g_utf8_get_char(aline);
  1917     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  1918     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  1919     {
  1920 	pc=c;
  1921 	c=nc;
  1922 	nc=g_utf8_get_char(g_utf8_next_char(s));
  1923 	/* for each character in the line after the first */
  1924 	if (c==CHAR_DQUOTE)
  1925 	{
  1926 	    if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
  1927 	      !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
  1928 	      !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
  1929 	    {
  1930 		if (pswit[ECHO_SWITCH])
  1931 		    g_print("\n%s\n",aline);
  1932 		if (!pswit[OVERVIEW_SWITCH])
  1933 		    g_print("    Line %ld column %ld - Unspaced quotes?\n",
  1934 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1935 		else
  1936 		    cnt_punct++;
  1937 	    }
  1938 	}
  1939     }
  1940     /* Check parity of quotes. */
  1941     nc=g_utf8_get_char(aline);
  1942     for (s=aline;*s;s=g_utf8_next_char(s))
  1943     {
  1944 	c=nc;
  1945 	nc=g_utf8_get_char(g_utf8_next_char(s));
  1946 	if (c==CHAR_DQUOTE)
  1947 	{
  1948 	    parities->dquote=!parities->dquote;
  1949 	    if (!parities->dquote)
  1950 	    {
  1951 		/* parity even */
  1952 		if (!g_utf8_strchr("_-.'`/,;:!?)]} ",-1,nc))
  1953 		{
  1954 		    if (pswit[ECHO_SWITCH])
  1955 			g_print("\n%s\n",aline);
  1956 		    if (!pswit[OVERVIEW_SWITCH])
  1957 			g_print("    Line %ld column %ld - "
  1958 			  "Wrongspaced quotes?\n",
  1959 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1960 		    else
  1961 			cnt_punct++;
  1962 		}
  1963 	    }
  1964 	    else
  1965 	    {
  1966 		/* parity odd */
  1967 		if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
  1968 		  !g_utf8_strchr("_-/.'`([{$",-1,nc) || !nc)
  1969 		{
  1970 		    if (pswit[ECHO_SWITCH])
  1971 			g_print("\n%s\n",aline);
  1972 		    if (!pswit[OVERVIEW_SWITCH])
  1973 			g_print("    Line %ld column %ld - "
  1974 			  "Wrongspaced quotes?\n",
  1975 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1976 		    else
  1977 			cnt_punct++;
  1978 		}
  1979 	    }
  1980 	}
  1981     }
  1982     if (g_utf8_get_char(aline)==CHAR_DQUOTE)
  1983     {
  1984 	if (g_utf8_strchr(",;:!?)]} ",-1,
  1985 	  g_utf8_get_char(g_utf8_next_char(aline))))
  1986 	{
  1987 	    if (pswit[ECHO_SWITCH])
  1988 		g_print("\n%s\n",aline);
  1989 	    if (!pswit[OVERVIEW_SWITCH])
  1990 		g_print("    Line %ld column 1 - Wrongspaced quotes?\n",
  1991 		  linecnt);
  1992 	    else
  1993 		cnt_punct++;
  1994 	}
  1995     }
  1996     if (pswit[SQUOTE_SWITCH])
  1997     {
  1998 	nc=g_utf8_get_char(aline);
  1999 	for (s=aline;*s;s=g_utf8_next_char(s))
  2000 	{
  2001 	    c=nc;
  2002 	    nc=g_utf8_get_char(g_utf8_next_char(s));
  2003 	    if ((c==CHAR_SQUOTE || c==CHAR_OPEN_SQUOTE) && (s==aline ||
  2004 	      s>aline &&
  2005 	      !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
  2006 	      !g_unichar_isalpha(nc)))
  2007 	    {
  2008 		parities->squote=!parities->squote;
  2009 		if (!parities->squote)
  2010 		{
  2011 		    /* parity even */
  2012 		    if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
  2013 		    {
  2014 			if (pswit[ECHO_SWITCH])
  2015 			    g_print("\n%s\n",aline);
  2016 			if (!pswit[OVERVIEW_SWITCH])
  2017 			    g_print("    Line %ld column %ld - "
  2018 			      "Wrongspaced singlequotes?\n",
  2019 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2020 			else
  2021 			    cnt_punct++;
  2022 		    }
  2023 		}
  2024 		else
  2025 		{
  2026 		    /* parity odd */
  2027 		    if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
  2028 		      !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
  2029 		    {
  2030 			if (pswit[ECHO_SWITCH])
  2031 			    g_print("\n%s\n",aline);
  2032 			if (!pswit[OVERVIEW_SWITCH])
  2033 			    g_print("    Line %ld column %ld - "
  2034 			      "Wrongspaced singlequotes?\n",
  2035 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2036 			else
  2037 			    cnt_punct++;
  2038 		    }
  2039 		}
  2040 	    }
  2041 	}
  2042     }
  2043 }
  2044 
  2045 /*
  2046  * check_for_double_punctuation:
  2047  *
  2048  * Look for double punctuation like ,. or ,,
  2049  * Thanks to DW for the suggestion!
  2050  * In books with references, ".," and ".;" are common
  2051  * e.g. "etc., etc.," and vol. 1.; vol 3.;
  2052  * OTOH, from my initial tests, there are also fairly
  2053  * common errors. What to do? Make these cases paranoid?
  2054  * ".," is the most common, so warnings->dotcomma is used
  2055  * to suppress detailed reporting if it occurs often.
  2056  */
  2057 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
  2058 {
  2059     const char *s;
  2060     gunichar c,nc;
  2061     nc=g_utf8_get_char(aline);
  2062     for (s=aline;*s;s=g_utf8_next_char(s))
  2063     {
  2064 	c=nc;
  2065 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2066 	/* for each punctuation character in the line */
  2067 	if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
  2068 	  g_utf8_strchr(".?!,;:",-1,nc))
  2069 	{
  2070 	    /* followed by punctuation, it's a query, unless . . . */
  2071 	    if (c==nc && (c=='.' || c=='?' || c=='!') ||
  2072 	      !warnings->dotcomma && c=='.' && nc==',' ||
  2073 	      warnings->isFrench && g_str_has_prefix(s,",...") ||
  2074 	      warnings->isFrench && g_str_has_prefix(s,"...,") ||
  2075 	      warnings->isFrench && g_str_has_prefix(s,";...") ||
  2076 	      warnings->isFrench && g_str_has_prefix(s,"...;") ||
  2077 	      warnings->isFrench && g_str_has_prefix(s,":...") ||
  2078 	      warnings->isFrench && g_str_has_prefix(s,"...:") ||
  2079 	      warnings->isFrench && g_str_has_prefix(s,"!...") ||
  2080 	      warnings->isFrench && g_str_has_prefix(s,"...!") ||
  2081 	      warnings->isFrench && g_str_has_prefix(s,"?...") ||
  2082 	      warnings->isFrench && g_str_has_prefix(s,"...?"))
  2083 	    {
  2084 		if (warnings->isFrench && g_str_has_prefix(s,",...") ||
  2085 		  warnings->isFrench && g_str_has_prefix(s,"...,") ||
  2086 		  warnings->isFrench && g_str_has_prefix(s,";...") ||
  2087 		  warnings->isFrench && g_str_has_prefix(s,"...;") ||
  2088 		  warnings->isFrench && g_str_has_prefix(s,":...") ||
  2089 		  warnings->isFrench && g_str_has_prefix(s,"...:") ||
  2090 		  warnings->isFrench && g_str_has_prefix(s,"!...") ||
  2091 		  warnings->isFrench && g_str_has_prefix(s,"...!") ||
  2092 		  warnings->isFrench && g_str_has_prefix(s,"?...") ||
  2093 		  warnings->isFrench && g_str_has_prefix(s,"...?"))
  2094 		{
  2095 		    s+=4;
  2096 		    nc=g_utf8_get_char(g_utf8_next_char(s));
  2097 		}
  2098 		; /* do nothing for .. !! and ?? which can be legit */
  2099 	    }
  2100 	    else
  2101 	    {
  2102 		if (pswit[ECHO_SWITCH])
  2103 		    g_print("\n%s\n",aline);
  2104 		if (!pswit[OVERVIEW_SWITCH])
  2105 		    g_print("    Line %ld column %ld - Double punctuation?\n",
  2106 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2107 		else
  2108 		    cnt_punct++;
  2109 	    }
  2110 	}
  2111     }
  2112 }
  2113 
  2114 /*
  2115  * check_for_spaced_quotes:
  2116  */
  2117 void check_for_spaced_quotes(const char *aline)
  2118 {
  2119     const char *s,*t;
  2120     s=aline;
  2121     while ((t=strstr(s," \" ")))
  2122     {
  2123 	if (pswit[ECHO_SWITCH])
  2124 	    g_print("\n%s\n",aline);
  2125 	if (!pswit[OVERVIEW_SWITCH])
  2126 	    g_print("    Line %ld column %ld - Spaced doublequote?\n",
  2127 	      linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  2128 	else
  2129 	    cnt_punct++;
  2130 	s=g_utf8_next_char(g_utf8_next_char(t));
  2131     }
  2132     s=aline;
  2133     while ((t=strstr(s," ' ")))
  2134     {
  2135 	if (pswit[ECHO_SWITCH])
  2136 	    g_print("\n%s\n",aline);
  2137 	if (!pswit[OVERVIEW_SWITCH])
  2138 	    g_print("    Line %ld column %ld - Spaced singlequote?\n",
  2139 	      linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  2140 	else
  2141 	    cnt_punct++;
  2142 	s=g_utf8_next_char(g_utf8_next_char(t));
  2143     }
  2144     s=aline;
  2145     while ((t=strstr(s," ` ")))
  2146     {
  2147 	if (pswit[ECHO_SWITCH])
  2148 	    g_print("\n%s\n",aline);
  2149 	if (!pswit[OVERVIEW_SWITCH])
  2150 	    g_print("    Line %ld column %ld - Spaced singlequote?\n",
  2151 	      linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  2152 	else
  2153 	    cnt_punct++;
  2154 	s=g_utf8_next_char(g_utf8_next_char(t));
  2155     }
  2156 }
  2157 
  2158 /*
  2159  * check_for_miscased_genative:
  2160  *
  2161  * Check special case of 'S instead of 's at end of word.
  2162  */
  2163 void check_for_miscased_genative(const char *aline)
  2164 {
  2165     const char *s;
  2166     gunichar c,nc,pc;
  2167     if (!*aline)
  2168 	return;
  2169     c=g_utf8_get_char(aline);
  2170     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2171     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2172     {
  2173 	pc=c;
  2174 	c=nc;
  2175 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2176 	if (c==CHAR_SQUOTE && nc=='S' && g_unichar_islower(pc))
  2177 	{
  2178 	    if (pswit[ECHO_SWITCH])
  2179 		g_print("\n%s\n",aline);
  2180 	    if (!pswit[OVERVIEW_SWITCH])
  2181 		g_print("    Line %ld column %ld - Capital \"S\"?\n",
  2182 		  linecnt,g_utf8_pointer_to_offset(aline,s)+2);
  2183 	    else
  2184 		cnt_punct++;
  2185 	}
  2186     }
  2187 }
  2188 
  2189 /*
  2190  * check_end_of_line:
  2191  *
  2192  * Now check special cases - start and end of line -
  2193  * for single and double quotes. Start is sometimes [sic]
  2194  * but better to query it anyway.
  2195  * While we're here, check for dash at end of line.
  2196  */
  2197 void check_end_of_line(const char *aline,struct warnings *warnings)
  2198 {
  2199     int lbytes;
  2200     const char *s;
  2201     gunichar c1,c2;
  2202     lbytes=strlen(aline);
  2203     if (g_utf8_strlen(aline,lbytes)>1)
  2204     {
  2205 	s=g_utf8_prev_char(aline+lbytes);
  2206 	c1=g_utf8_get_char(s);
  2207 	c2=g_utf8_get_char(g_utf8_prev_char(s));
  2208 	if ((c1==CHAR_DQUOTE || c1==CHAR_SQUOTE || c1==CHAR_OPEN_SQUOTE) &&
  2209 	  c2==CHAR_SPACE)
  2210 	{
  2211 	    if (pswit[ECHO_SWITCH])
  2212 		g_print("\n%s\n",aline);
  2213 	    if (!pswit[OVERVIEW_SWITCH])
  2214 		g_print("    Line %ld column %ld - Spaced quote?\n",linecnt,
  2215 		  g_utf8_strlen(aline,lbytes));
  2216 	    else
  2217 		cnt_punct++;
  2218 	}
  2219 	c1=g_utf8_get_char(aline);
  2220 	c2=g_utf8_get_char(g_utf8_next_char(aline));
  2221 	if ((c1==CHAR_SQUOTE || c1==CHAR_OPEN_SQUOTE) && c2==CHAR_SPACE)
  2222 	{
  2223 	    if (pswit[ECHO_SWITCH])
  2224 		g_print("\n%s\n",aline);
  2225 	    if (!pswit[OVERVIEW_SWITCH])
  2226 		g_print("    Line %ld column 1 - Spaced quote?\n",linecnt);
  2227 	    else
  2228 		cnt_punct++;
  2229 	}
  2230 	/*
  2231 	 * Dash at end of line may well be legit - paranoid mode only
  2232 	 * and don't report em-dash at line-end.
  2233 	 */
  2234 	if (pswit[PARANOID_SWITCH] && warnings->hyphen)
  2235 	{
  2236 	    for (s=g_utf8_prev_char(aline+lbytes);
  2237 	      s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
  2238 		;
  2239 	    if (g_utf8_get_char(s)=='-' &&
  2240 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')
  2241 	    {
  2242 		if (pswit[ECHO_SWITCH])
  2243 		    g_print("\n%s\n",aline);
  2244 		if (!pswit[OVERVIEW_SWITCH])
  2245 		    g_print("    Line %ld column %ld - "
  2246 		      "Hyphen at end of line?\n",
  2247 		      linecnt,g_utf8_pointer_to_offset(aline,s));
  2248 	    }
  2249 	}
  2250     }
  2251 }
  2252 
  2253 /*
  2254  * check_for_unspaced_bracket:
  2255  *
  2256  * Brackets are often unspaced, but shouldn't be surrounded by alpha.
  2257  * If so, suspect a scanno like "a]most".
  2258  */
  2259 void check_for_unspaced_bracket(const char *aline)
  2260 {
  2261     const char *s;
  2262     gunichar c,nc,pc;
  2263     c=g_utf8_get_char(aline);
  2264     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2265     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2266     {
  2267 	pc=c;
  2268 	c=nc;
  2269 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2270 	if (!nc)
  2271 	    break;
  2272 	/* for each bracket character in the line except 1st & last */
  2273 	if (g_utf8_strchr("{[()]}",-1,c) &&
  2274 	  g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
  2275 	{
  2276 	    if (pswit[ECHO_SWITCH])
  2277 		g_print("\n%s\n",aline);
  2278 	    if (!pswit[OVERVIEW_SWITCH])
  2279 		g_print("    Line %ld column %ld - Unspaced bracket?\n",
  2280 		  linecnt,g_utf8_pointer_to_offset(aline,s));
  2281 	    else
  2282 		cnt_punct++;
  2283 	}
  2284     }
  2285 }
  2286 
  2287 /*
  2288  * check_for_unpunctuated_endquote:
  2289  */
  2290 void check_for_unpunctuated_endquote(const char *aline)
  2291 {
  2292     const char *s;
  2293     gunichar c,nc,pc;
  2294     c=g_utf8_get_char(aline);
  2295     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2296     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2297     {
  2298 	pc=c;
  2299 	c=nc;
  2300 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2301 	/* for each character in the line except 1st */
  2302 	if (c==CHAR_DQUOTE && isalpha(pc))
  2303 	{
  2304 	    if (pswit[ECHO_SWITCH])
  2305 		g_print("\n%s\n",aline);
  2306 	    if (!pswit[OVERVIEW_SWITCH])
  2307 		g_print("    Line %ld column %ld - "
  2308 		  "endquote missing punctuation?\n",
  2309 		  linecnt,g_utf8_pointer_to_offset(aline,s));
  2310 	    else
  2311 		cnt_punct++;
  2312 	}
  2313     }
  2314 }
  2315 
  2316 /*
  2317  * check_for_html_tag:
  2318  *
  2319  * Check for <HTML TAG>.
  2320  *
  2321  * If there is a < in the line, followed at some point
  2322  * by a > then we suspect HTML.
  2323  */
  2324 void check_for_html_tag(const char *aline)
  2325 {
  2326     const char *open,*close;
  2327     gchar *tag;
  2328     open=strchr(aline,'<');
  2329     if (open)
  2330     {
  2331 	close=strchr(g_utf8_next_char(open),'>');
  2332 	if (close)
  2333 	{
  2334 	    if (pswit[ECHO_SWITCH])
  2335 		g_print("\n%s\n",aline);
  2336 	    if (!pswit[OVERVIEW_SWITCH])
  2337 	    {
  2338 		tag=g_strndup(open,close-open+1);
  2339 		g_print("    Line %ld column %ld - HTML Tag? %s \n",
  2340 		  linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
  2341 		g_free(tag);
  2342 	    }
  2343 	    else
  2344 		cnt_html++;
  2345 	}
  2346     }
  2347 }
  2348 
  2349 /*
  2350  * check_for_html_entity:
  2351  *
  2352  * Check for &symbol; HTML.
  2353  *
  2354  * If there is a & in the line, followed at
  2355  * some point by a ; then we suspect HTML.
  2356  */
  2357 void check_for_html_entity(const char *aline)
  2358 {
  2359     const char *s,*amp,*scolon;
  2360     gchar *entity;
  2361     amp=strchr(aline,'&');
  2362     if (amp)
  2363     {
  2364 	scolon=strchr(amp,';');
  2365 	if (scolon)
  2366 	{
  2367 	    for (s=amp;s<scolon;s=g_utf8_next_char(s))   
  2368 		if (g_utf8_get_char(s)==CHAR_SPACE)
  2369 		    break;		/* Don't report "Jones & Son;" */
  2370 	    if (s>=scolon)
  2371 	    {
  2372 		if (pswit[ECHO_SWITCH])
  2373 		    g_print("\n%s\n",aline);
  2374 		if (!pswit[OVERVIEW_SWITCH])
  2375 		{
  2376 		    entity=g_strndup(amp,scolon-amp+1);
  2377 		    g_print("    Line %ld column %d - HTML symbol? %s \n",
  2378 		      linecnt,(int)(amp-aline)+1,entity);
  2379 		    g_free(entity);
  2380 		}
  2381 		else
  2382 		    cnt_html++;
  2383 	    }
  2384 	}
  2385     }
  2386 }
  2387 
  2388 /*
  2389  * print_pending:
  2390  *
  2391  * If we are in a state of unbalanced quotes, and this line
  2392  * doesn't begin with a quote, output the stored error message.
  2393  * If the -P switch was used, print the warning even if the
  2394  * new para starts with quotes.
  2395  */
  2396 void print_pending(const char *aline,const char *parastart,
  2397   struct pending *pending)
  2398 {
  2399     const char *s;
  2400     gunichar c;
  2401     s=aline;
  2402     while (*s==' ')
  2403 	s++;
  2404     c=g_utf8_get_char(s);
  2405     if (pending->dquote)
  2406     {
  2407 	if (c!=CHAR_DQUOTE || pswit[QPARA_SWITCH])
  2408 	{
  2409 	    if (!pswit[OVERVIEW_SWITCH])
  2410 	    {
  2411 		if (pswit[ECHO_SWITCH])
  2412 		    g_print("\n%s\n",parastart);
  2413 		g_print("%s\n",pending->dquote);
  2414 	    }
  2415 	    else
  2416 		cnt_dquot++;
  2417 	}
  2418 	g_free(pending->dquote);
  2419 	pending->dquote=NULL;
  2420     }
  2421     if (pending->squote)
  2422     {
  2423 	if (c!=CHAR_SQUOTE && c!=CHAR_OPEN_SQUOTE || pswit[QPARA_SWITCH] ||
  2424 	  pending->squot)
  2425 	{
  2426 	    if (!pswit[OVERVIEW_SWITCH])
  2427 	    {
  2428 		if (pswit[ECHO_SWITCH])
  2429 		    g_print("\n%s\n",parastart);
  2430 		g_print("%s\n",pending->squote);
  2431 	    }
  2432 	    else
  2433 		cnt_squot++;
  2434 	}
  2435 	g_free(pending->squote);
  2436 	pending->squote=NULL;
  2437     }
  2438     if (pending->rbrack)
  2439     {
  2440 	if (!pswit[OVERVIEW_SWITCH])
  2441 	{
  2442 	    if (pswit[ECHO_SWITCH])
  2443 		g_print("\n%s\n",parastart);
  2444 	    g_print("%s\n",pending->rbrack);
  2445 	}
  2446 	else
  2447 	    cnt_brack++;
  2448 	g_free(pending->rbrack);
  2449 	pending->rbrack=NULL;
  2450     }
  2451     if (pending->sbrack)
  2452     {
  2453 	if (!pswit[OVERVIEW_SWITCH])
  2454 	{
  2455 	    if (pswit[ECHO_SWITCH])
  2456 		g_print("\n%s\n",parastart);
  2457 	    g_print("%s\n",pending->sbrack);
  2458 	}
  2459 	else
  2460 	    cnt_brack++;
  2461 	g_free(pending->sbrack);
  2462 	pending->sbrack=NULL;
  2463     }
  2464     if (pending->cbrack)
  2465     {
  2466 	if (!pswit[OVERVIEW_SWITCH])
  2467 	{
  2468 	    if (pswit[ECHO_SWITCH])
  2469 		g_print("\n%s\n",parastart);
  2470 	    g_print("%s\n",pending->cbrack);
  2471 	}
  2472 	else
  2473 	    cnt_brack++;
  2474 	g_free(pending->cbrack);
  2475 	pending->cbrack=NULL;
  2476     }
  2477     if (pending->unders)
  2478     {
  2479 	if (!pswit[OVERVIEW_SWITCH])
  2480 	{
  2481 	    if (pswit[ECHO_SWITCH])
  2482 		g_print("\n%s\n",parastart);
  2483 	    g_print("%s\n",pending->unders);
  2484 	}
  2485 	else
  2486 	    cnt_brack++;
  2487 	g_free(pending->unders);
  2488 	pending->unders=NULL;
  2489     }
  2490 }
  2491 
  2492 /*
  2493  * check_for_mismatched_quotes:
  2494  *
  2495  * At end of paragraph, check for mismatched quotes.
  2496  *
  2497  * We don't want to report an error immediately, since it is a
  2498  * common convention to omit the quotes at end of paragraph if
  2499  * the next paragraph is a continuation of the same speaker.
  2500  * Where this is the case, the next para should begin with a
  2501  * quote, so we store the warning message and only display it
  2502  * at the top of the next iteration if the new para doesn't
  2503  * start with a quote.
  2504  * The -p switch overrides this default, and warns of unclosed
  2505  * quotes on _every_ paragraph, whether the next begins with a
  2506  * quote or not.
  2507  */
  2508 void check_for_mismatched_quotes(const struct counters *counters,
  2509   struct pending *pending)
  2510 {
  2511     if (counters->quot%2)
  2512 	pending->dquote=
  2513 	  g_strdup_printf("    Line %ld - Mismatched quotes",linecnt);
  2514     if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&
  2515       counters->open_single_quote!=counters->close_single_quote)
  2516 	pending->squote=
  2517 	  g_strdup_printf("    Line %ld - Mismatched singlequotes?",linecnt);
  2518     if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&
  2519       counters->open_single_quote!=counters->close_single_quote &&
  2520       counters->open_single_quote!=counters->close_single_quote+1)
  2521 	/*
  2522 	 * Flag it to be noted regardless of the
  2523 	 * first char of the next para.
  2524 	 */
  2525 	pending->squot=1;
  2526     if (counters->r_brack)
  2527 	pending->rbrack=
  2528 	  g_strdup_printf("    Line %ld - Mismatched round brackets?",linecnt);
  2529     if (counters->s_brack)
  2530 	pending->sbrack=
  2531 	  g_strdup_printf("    Line %ld - Mismatched square brackets?",linecnt);
  2532     if (counters->c_brack)
  2533 	pending->cbrack=
  2534 	  g_strdup_printf("    Line %ld - Mismatched curly brackets?",linecnt);
  2535     if (counters->c_unders%2)
  2536 	pending->unders=
  2537 	  g_strdup_printf("    Line %ld - Mismatched underscores?",linecnt);
  2538 }
  2539 
  2540 /*
  2541  * check_for_omitted_punctuation:
  2542  *
  2543  * Check for omitted punctuation at end of paragraph by working back
  2544  * through prevline. DW.
  2545  * Need to check this only for "normal" paras.
  2546  * So what is a "normal" para?
  2547  *    Not normal if one-liner (chapter headings, etc.)
  2548  *    Not normal if doesn't contain at least one locase letter
  2549  *    Not normal if starts with space
  2550  */
  2551 void check_for_omitted_punctuation(const char *prevline,
  2552   struct line_properties *last,int start_para_line)
  2553 {
  2554     gboolean letter_on_line=FALSE;
  2555     const char *s;
  2556     for (s=prevline;*s;s=g_utf8_next_char(s))
  2557 	if (g_unichar_isalpha(g_utf8_get_char(s)))
  2558 	{
  2559 	    letter_on_line=TRUE;
  2560 	    break;
  2561 	}
  2562     /*
  2563      * This next "if" is a problem.
  2564      * If we say "start_para_line <= linecnt - 1", that includes
  2565      * one-line "paragraphs" like chapter heads. Lotsa false positives.
  2566      * If we say "start_para_line < linecnt - 1" it doesn't, but then it
  2567      * misses genuine one-line paragraphs.
  2568      */
  2569     if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
  2570       g_utf8_get_char(prevline)>CHAR_SPACE)
  2571     {
  2572 	for (s=g_utf8_prev_char(prevline+strlen(prevline));
  2573 	  (g_utf8_get_char(s)==CHAR_DQUOTE ||
  2574 	  g_utf8_get_char(s)==CHAR_SQUOTE) &&
  2575 	  g_utf8_get_char(s)>CHAR_SPACE && s>prevline;
  2576 	  s=g_utf8_prev_char(s))
  2577 	    ;
  2578 	for (;s>prevline;s=g_utf8_prev_char(s))
  2579 	{
  2580 	    if (g_unichar_isalpha(g_utf8_get_char(s)))
  2581 	    {
  2582 		if (pswit[ECHO_SWITCH])
  2583 		    g_print("\n%s\n",prevline);
  2584 		if (!pswit[OVERVIEW_SWITCH])
  2585 		    g_print("    Line %ld column %ld - "
  2586 		      "No punctuation at para end?\n",
  2587 		      linecnt-1,g_utf8_strlen(prevline,-1));
  2588 		else
  2589 		    cnt_punct++;
  2590 		break;
  2591 	    }
  2592 	    if (g_utf8_strchr("-.:!([{?}])",-1,g_utf8_get_char(s)))
  2593 		break;
  2594 	}
  2595     }
  2596 }
  2597 
  2598 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
  2599 {
  2600     const char *word=key;
  2601     int *dupcnt=value;
  2602     if (*dupcnt)
  2603 	g_print("\nNote: Queried word %s was duplicated %d times\n",
  2604 	  word,*dupcnt);
  2605     return FALSE;
  2606 }
  2607 
  2608 void print_as_windows_1252(const char *string)
  2609 {
  2610     gsize inbytes,outbytes;
  2611     gchar *buf,*bp;
  2612     GIConv converter=(GIConv)-1;
  2613     if (!string)
  2614     {
  2615 	if (converter!=(GIConv)-1)
  2616 	    g_iconv_close(converter);
  2617 	converter=(GIConv)-1;
  2618 	return;
  2619     }
  2620     if (converter=(GIConv)-1)
  2621 	converter=g_iconv_open("WINDOWS-1252","UTF-8");
  2622     if (converter!=(GIConv)-1)
  2623     {
  2624 	inbytes=outbytes=strlen(string);
  2625 	bp=buf=g_malloc(outbytes+1);
  2626 	g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
  2627 	*bp='\0';
  2628 	fputs(buf,stdout);
  2629 	g_free(buf);
  2630     }
  2631     else
  2632 	fputs(string,stdout);
  2633 }
  2634 
  2635 void print_as_utf_8(const char *string)
  2636 {
  2637     fputs(string,stdout);
  2638 }
  2639 
  2640 /*
  2641  * procfile:
  2642  *
  2643  * Process one file.
  2644  */
  2645 void procfile(const char *filename)
  2646 {
  2647     const char *s;
  2648     gchar *parastart=NULL;	/* first line of current para */
  2649     gchar *etext,*aline;
  2650     gchar *etext_ptr;
  2651     GError *err=NULL;
  2652     struct first_pass_results *first_pass_results;
  2653     struct warnings *warnings;
  2654     struct counters counters={0};
  2655     struct line_properties last={0};
  2656     struct parities parities={0};
  2657     struct pending pending={0};
  2658     gboolean isemptyline;
  2659     long start_para_line=0;
  2660     gboolean isnewpara=FALSE,enddash=FALSE;
  2661     last.start=CHAR_SPACE;
  2662     linecnt=checked_linecnt=0;
  2663     etext=read_etext(filename,&err);
  2664     if (!etext)
  2665     {
  2666 	if (pswit[STDOUT_SWITCH])
  2667 	    fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
  2668 	else
  2669 	    fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
  2670 	exit(1);
  2671     }
  2672     g_print("\n\nFile: %s\n\n",filename);
  2673     first_pass_results=first_pass(etext);
  2674     warnings=report_first_pass(first_pass_results);
  2675     qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
  2676     qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
  2677     /*
  2678      * Here we go with the main pass. Hold onto yer hat!
  2679      */
  2680     linecnt=0;
  2681     etext_ptr=etext;
  2682     while ((aline=flgets(&etext_ptr,linecnt+1)))
  2683     {
  2684 	linecnt++;
  2685 	if (linecnt==1)
  2686 	    isnewpara=TRUE;
  2687 	if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
  2688 	    continue;    // skip DP page separators completely
  2689 	if (linecnt<first_pass_results->firstline ||
  2690 	  (first_pass_results->footerline>0 &&
  2691 	  linecnt>first_pass_results->footerline))
  2692 	{
  2693 	    if (pswit[HEADER_SWITCH])
  2694 	    {
  2695 		if (g_str_has_prefix(aline,"Title:"))
  2696 		    g_print("    %s\n",aline);
  2697 		if (g_str_has_prefix(aline,"Author:"))
  2698 		    g_print("    %s\n",aline);
  2699 		if (g_str_has_prefix(aline,"Release Date:"))
  2700 		    g_print("    %s\n",aline);
  2701 		if (g_str_has_prefix(aline,"Edition:"))
  2702 		    g_print("    %s\n\n",aline);
  2703 	    }
  2704 	    continue;		/* skip through the header */
  2705 	}
  2706 	checked_linecnt++;
  2707 	print_pending(aline,parastart,&pending);
  2708 	memset(&pending,0,sizeof(pending));
  2709 	isemptyline=analyse_quotes(aline,&counters);
  2710 	if (isnewpara && !isemptyline)
  2711 	{
  2712 	    /* This line is the start of a new paragraph. */
  2713 	    start_para_line=linecnt;
  2714 	    /* Capture its first line in case we want to report it later. */
  2715 	    g_free(parastart);
  2716 	    parastart=g_strdup(aline);
  2717 	    memset(&parities,0,sizeof(parities));  /* restart the quote count */
  2718 	    s=aline;
  2719 	    while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
  2720 	      !g_unichar_isdigit(g_utf8_get_char(s)))
  2721 		s=g_utf8_next_char(s);
  2722 	    if (g_unichar_islower(g_utf8_get_char(s)))
  2723 	    {
  2724 		/* and its first letter is lowercase */
  2725 		if (pswit[ECHO_SWITCH])
  2726 		    g_print("\n%s\n",aline);
  2727 		if (!pswit[OVERVIEW_SWITCH])
  2728 		    g_print("    Line %ld column %ld - "
  2729 		      "Paragraph starts with lower-case\n",
  2730 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2731 		else
  2732 		    cnt_punct++;
  2733 	    }
  2734 	    isnewpara=FALSE; /* Signal the end of new para processing. */
  2735 	}
  2736 	/* Check for an em-dash broken at line end. */
  2737 	if (enddash && g_utf8_get_char(aline)=='-')
  2738 	{
  2739 	    if (pswit[ECHO_SWITCH])
  2740 		g_print("\n%s\n",aline);
  2741 	    if (!pswit[OVERVIEW_SWITCH])
  2742 		g_print("    Line %ld column 1 - Broken em-dash?\n",linecnt);
  2743 	    else
  2744 		cnt_punct++;
  2745 	}
  2746 	enddash=FALSE;
  2747 	for (s=g_utf8_prev_char(aline+strlen(aline));
  2748 	  g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
  2749 	    ;
  2750 	if (s>=aline && g_utf8_get_char(s)=='-')
  2751 	    enddash=TRUE;
  2752 	check_for_control_characters(aline);
  2753 	if (warnings->bin)
  2754 	    check_for_odd_characters(aline,warnings,isemptyline);
  2755 	if (warnings->longline)
  2756 	    check_for_long_line(aline);
  2757 	if (warnings->shortline)
  2758 	    check_for_short_line(aline,&last);
  2759 	last.blen=last.len;
  2760 	last.len=g_utf8_strlen(aline,-1);
  2761 	last.start=g_utf8_get_char(aline);
  2762 	check_for_starting_punctuation(aline);
  2763 	if (warnings->dash)
  2764 	{
  2765 	    check_for_spaced_emdash(aline);
  2766 	    check_for_spaced_dash(aline);
  2767 	}
  2768 	check_for_unmarked_paragraphs(aline);
  2769 	check_for_jeebies(aline);
  2770 	check_for_mta_from(aline);
  2771 	check_for_orphan_character(aline);
  2772 	check_for_pling_scanno(aline);
  2773 	check_for_extra_period(aline,warnings);
  2774 	check_for_following_punctuation(aline);
  2775 	check_for_typos(aline,warnings);
  2776 	check_for_misspaced_punctuation(aline,&parities,isemptyline);
  2777 	check_for_double_punctuation(aline,warnings);
  2778 	check_for_spaced_quotes(aline);
  2779 	check_for_miscased_genative(aline);
  2780 	check_end_of_line(aline,warnings);
  2781 	check_for_unspaced_bracket(aline);
  2782 	if (warnings->endquote)
  2783 	    check_for_unpunctuated_endquote(aline);
  2784 	check_for_html_tag(aline);
  2785 	check_for_html_entity(aline);
  2786 	if (isemptyline)
  2787 	{
  2788 	    check_for_mismatched_quotes(&counters,&pending);
  2789 	    memset(&counters,0,sizeof(counters));
  2790 	    /* let the next iteration know that it's starting a new para */
  2791 	    isnewpara=TRUE;
  2792 	    if (prevline)
  2793 		check_for_omitted_punctuation(prevline,&last,start_para_line);
  2794 	}
  2795 	g_free(prevline);
  2796 	prevline=g_strdup(aline);
  2797     }
  2798     if (prevline)
  2799     {
  2800 	g_free(prevline);
  2801 	prevline=NULL;
  2802     }
  2803     g_free(parastart);
  2804     g_free(prevline);
  2805     g_free(etext);
  2806     if (!pswit[OVERVIEW_SWITCH])
  2807 	g_tree_foreach(qword,report_duplicate_queries,NULL);
  2808     g_tree_unref(qword);
  2809     g_tree_unref(qperiod);
  2810     g_set_print_handler(NULL);
  2811     print_as_windows_1252(NULL);
  2812     if (pswit[MARKUP_SWITCH])  
  2813 	loseentities(NULL);
  2814 }
  2815 
  2816 /*
  2817  * flgets:
  2818  *
  2819  * Get one line from the input text, checking for
  2820  * the existence of exactly one CR/LF line-end per line.
  2821  *
  2822  * Returns: a pointer to the line.
  2823  */
  2824 char *flgets(char **etext,long lcnt)
  2825 {
  2826     gunichar c;
  2827     gboolean isCR=FALSE;
  2828     char *theline=*etext;
  2829     char *eos=theline;
  2830     gchar *s;
  2831     for (;;)
  2832     {
  2833 	c=g_utf8_get_char(*etext);
  2834 	*etext=g_utf8_next_char(*etext);
  2835 	if (!c)
  2836 	    return NULL;
  2837 	/* either way, it's end of line */
  2838 	if (c=='\n')
  2839 	{
  2840 	    if (isCR)
  2841 		break;
  2842 	    else
  2843 	    {
  2844 		/* Error - a LF without a preceding CR */
  2845 		if (pswit[LINE_END_SWITCH])
  2846 		{
  2847 		    if (pswit[ECHO_SWITCH])
  2848 		    {
  2849 			s=g_strndup(theline,eos-theline);
  2850 			g_print("\n%s\n",s);
  2851 			g_free(s);
  2852 		    }
  2853 		    if (!pswit[OVERVIEW_SWITCH])
  2854 			g_print("    Line %ld - No CR?\n",lcnt);
  2855 		    else
  2856 			cnt_lineend++;
  2857 		}
  2858 		break;
  2859 	    }
  2860 	}
  2861 	if (c=='\r')
  2862 	{
  2863 	    if (isCR)
  2864 	    {
  2865 		/* Error - two successive CRs */
  2866 		if (pswit[LINE_END_SWITCH])
  2867 		{
  2868 		    if (pswit[ECHO_SWITCH])
  2869 		    {
  2870 			s=g_strndup(theline,eos-theline);
  2871 			g_print("\n%s\n",s);
  2872 			g_free(s);
  2873 		    }
  2874 		    if (!pswit[OVERVIEW_SWITCH])
  2875 			g_print("    Line %ld - Two successive CRs?\n",lcnt);
  2876 		    else
  2877 			cnt_lineend++;
  2878 		}
  2879 	    }
  2880 	    isCR=TRUE;
  2881 	}
  2882 	else
  2883 	{
  2884 	    if (pswit[LINE_END_SWITCH] && isCR)
  2885 	    {
  2886 		if (pswit[ECHO_SWITCH])
  2887 		{
  2888 		    s=g_strndup(theline,eos-theline);
  2889 		    g_print("\n%s\n",s);
  2890 		    g_free(s);
  2891 		}
  2892 		if (!pswit[OVERVIEW_SWITCH])
  2893 		    g_print("    Line %ld column %ld - CR without LF?\n",
  2894 		      lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
  2895 		else
  2896 		    cnt_lineend++;
  2897 		*eos=' ';
  2898 	    }
  2899 	    isCR=FALSE;
  2900 	    eos=g_utf8_next_char(eos);
  2901 	}
  2902     }
  2903     *eos='\0';
  2904     if (pswit[MARKUP_SWITCH])  
  2905 	postprocess_for_HTML(theline);
  2906     if (pswit[DP_SWITCH])  
  2907 	postprocess_for_DP(theline);
  2908     return theline;
  2909 }
  2910 
  2911 /*
  2912  * mixdigit:
  2913  *
  2914  * Takes a "word" as a parameter, and checks whether it
  2915  * contains a mixture of alpha and digits. Generally, this is an
  2916  * error, but may not be for cases like 4th or L5 12s. 3d.
  2917  *
  2918  * Returns: TRUE iff an is error found.
  2919  */
  2920 gboolean mixdigit(const char *checkword)
  2921 {
  2922     gboolean wehaveadigit,wehavealetter,query;
  2923     const char *s,*nondigit;
  2924     wehaveadigit=wehavealetter=query=FALSE;
  2925     for (s=checkword;*s;s=g_utf8_next_char(s))
  2926 	if (g_unichar_isalpha(g_utf8_get_char(s)))
  2927 	    wehavealetter=TRUE;
  2928 	else if (g_unichar_isdigit(g_utf8_get_char(s)))
  2929 	    wehaveadigit=TRUE;
  2930     if (wehaveadigit && wehavealetter)
  2931     {
  2932 	/* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
  2933 	query=TRUE;
  2934 	for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
  2935 	  nondigit=g_utf8_next_char(nondigit))
  2936 	    ;
  2937 	/* digits, ending in st, rd, nd, th of either case */
  2938 	if (!g_ascii_strcasecmp(nondigit,"st") ||
  2939 	  !g_ascii_strcasecmp(nondigit,"rd") ||
  2940 	  !g_ascii_strcasecmp(nondigit,"nd") ||
  2941 	  !g_ascii_strcasecmp(nondigit,"th"))
  2942 	    query=FALSE;
  2943 	if (!g_ascii_strcasecmp(nondigit,"sts") ||
  2944 	  !g_ascii_strcasecmp(nondigit,"rds") ||
  2945 	  !g_ascii_strcasecmp(nondigit,"nds") ||
  2946 	  !g_ascii_strcasecmp(nondigit,"ths"))
  2947 	    query=FALSE;
  2948 	if (!g_ascii_strcasecmp(nondigit,"stly") ||
  2949 	  !g_ascii_strcasecmp(nondigit,"rdly") ||
  2950 	  !g_ascii_strcasecmp(nondigit,"ndly") ||
  2951 	  !g_ascii_strcasecmp(nondigit,"thly"))
  2952 	    query=FALSE;
  2953 	/* digits, ending in l, L, s or d */
  2954 	if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
  2955 	  !strcmp(nondigit,"d"))
  2956 	    query=FALSE;
  2957 	/*
  2958 	 * L at the start of a number, representing Britsh pounds, like L500.
  2959 	 * This is cute. We know the current word is mixed digit. If the first
  2960 	 * letter is L, there must be at least one digit following. If both
  2961 	 * digits and letters follow, we have a genuine error, else we have a
  2962 	 * capital L followed by digits, and we accept that as a non-error.
  2963 	 */
  2964 	if (g_utf8_get_char(checkword)=='L' &&
  2965 	  !mixdigit(g_utf8_next_char(checkword)))
  2966 	    query=FALSE;
  2967     }
  2968     return query;
  2969 }
  2970 
  2971 /*
  2972  * getaword:
  2973  *
  2974  * Extracts the first/next "word" from the line, and returns it.
  2975  * A word is defined as one English word unit--or at least that's the aim.
  2976  * "ptr" is advanced to the position in the line where we will start
  2977  * looking for the next word.
  2978  *
  2979  * Returns: A newly-allocated string.
  2980  */
  2981 gchar *getaword(const char **ptr)
  2982 {
  2983     const char *s,*t;
  2984     GString *word;
  2985     gunichar c,pc;
  2986     word=g_string_new(NULL);
  2987     for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
  2988       !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
  2989       **ptr;*ptr=g_utf8_next_char(*ptr))
  2990 	;
  2991     /*
  2992      * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
  2993      * Especially yucky is the case of L1,000
  2994      * This section looks for a pattern of characters including a digit
  2995      * followed by a comma or period followed by one or more digits.
  2996      * If found, it returns this whole pattern as a word; otherwise we discard
  2997      * the results and resume our normal programming.
  2998      */
  2999     s=*ptr;
  3000     for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
  3001       g_unichar_isalpha(g_utf8_get_char(s)) ||
  3002       g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
  3003 	g_string_append_unichar(word,g_utf8_get_char(s));
  3004     for (t=g_utf8_next_char(word->str);*g_utf8_next_char(t);
  3005       t=g_utf8_next_char(t))
  3006     {
  3007 	c=g_utf8_get_char(t);
  3008 	pc=g_utf8_get_char(g_utf8_prev_char(t));
  3009 	if ((c=='.' || c==',') && g_unichar_isdigit(pc))
  3010 	{
  3011 	    *ptr=s;
  3012 	    return g_string_free(word,FALSE);
  3013 	}
  3014     }
  3015     /* we didn't find a punctuated number - do the regular getword thing */
  3016     g_string_truncate(word,0);
  3017     for (;g_unichar_isdigit(g_utf8_get_char(*ptr)) ||
  3018       g_unichar_isalpha(g_utf8_get_char(*ptr)) ||
  3019       g_utf8_get_char(*ptr)=='\'';*ptr=g_utf8_next_char(*ptr))
  3020 	g_string_append_unichar(word,g_utf8_get_char(*ptr));
  3021     return g_string_free(word,FALSE);
  3022 }
  3023 
  3024 /*
  3025  * isroman:
  3026  *
  3027  * Is this word a Roman Numeral?
  3028  *
  3029  * It doesn't actually validate that the number is a valid Roman Numeral--for
  3030  * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
  3031  * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
  3032  * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
  3033  * expressions thereof, except when it came to taxes. Allow any number of M,
  3034  * an optional D, an optional CM or CD, any number of optional Cs, an optional
  3035  * XL or an optional XC, an optional IX or IV, an optional V and any number
  3036  * of optional Is.
  3037  */
  3038 gboolean isroman(const char *t)
  3039 {
  3040     const char *s;
  3041     if (!t || !*t)
  3042 	return FALSE;
  3043     s=t;
  3044     while (g_utf8_get_char(t)=='m' && *t)
  3045 	t++;
  3046     if (g_utf8_get_char(t)=='d')
  3047 	t++;
  3048     if (g_str_has_prefix(t,"cm"))
  3049 	t+=2;
  3050     if (g_str_has_prefix(t,"cd"))
  3051 	t+=2;
  3052     while (g_utf8_get_char(t)=='c' && *t)
  3053 	t++;
  3054     if (g_str_has_prefix(t,"xl"))
  3055 	t+=2;
  3056     if (g_str_has_prefix(t,"xc"))
  3057 	t+=2;
  3058     if (g_utf8_get_char(t)=='l')
  3059 	t++;
  3060     while (g_utf8_get_char(t)=='x' && *t)
  3061 	t++;
  3062     if (g_str_has_prefix(t,"ix"))
  3063 	t+=2;
  3064     if (g_str_has_prefix(t,"iv"))
  3065 	t+=2;
  3066     if (g_utf8_get_char(t)=='v')
  3067 	t++;
  3068     while (g_utf8_get_char(t)=='i' && *t)
  3069 	t++;
  3070     return !*t;
  3071 }
  3072 
  3073 /*
  3074  * postprocess_for_DP:
  3075  *
  3076  * Invoked with the -d switch from flgets().
  3077  * It simply "removes" from the line a hard-coded set of common
  3078  * DP-specific tags, so that the line passed to the main routine has
  3079  * been pre-cleaned of DP markup.
  3080  */
  3081 void postprocess_for_DP(char *theline)
  3082 {
  3083     char *s,*t;
  3084     int i;
  3085     if (!*theline) 
  3086 	return;
  3087     for (i=0;*DPmarkup[i];i++)
  3088 	while ((s=strstr(theline,DPmarkup[i])))
  3089 	{
  3090 	    t=s+strlen(DPmarkup[i]);
  3091 	    memmove(s,t,strlen(t)+1);
  3092 	}
  3093 }
  3094 
  3095 /*
  3096  * postprocess_for_HTML:
  3097  *
  3098  * Invoked with the -m switch from flgets().
  3099  * It simply "removes" from the line a hard-coded set of common
  3100  * HTML tags and "replaces" a hard-coded set of common HTML
  3101  * entities, so that the line passed to the main routine has
  3102  * been pre-cleaned of HTML.
  3103  */
  3104 void postprocess_for_HTML(char *theline)
  3105 {
  3106     while (losemarkup(theline))
  3107 	;
  3108     loseentities(theline);
  3109 }
  3110 
  3111 char *losemarkup(char *theline)
  3112 {
  3113     char *s,*t;
  3114     int i;
  3115     s=strchr(theline,'<');
  3116     t=s?strchr(s,'>'):NULL;
  3117     if (!s || !t)
  3118 	return NULL;
  3119     for (i=0;*markup[i];i++)
  3120 	if (tagcomp(g_utf8_next_char(s),markup[i]))
  3121 	{
  3122 	    t=g_utf8_next_char(t);
  3123 	    memmove(s,t,strlen(t)+1);
  3124 	    return s;
  3125 	}
  3126     /* It's an unrecognized <xxx>. */
  3127     return NULL;
  3128 }
  3129 
  3130 void loseentities(char *theline)
  3131 {
  3132     int i;
  3133     gsize nb;
  3134     char *amp,*scolon;
  3135     gchar *s,*t;
  3136     gunichar c;
  3137     GTree *entities=NULL;
  3138     GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
  3139     if (!theline)
  3140     {
  3141 	if (entities)
  3142 	    g_tree_destroy(entities);
  3143 	entities=NULL;
  3144 	if (translit==(GIConv)-1)
  3145 	    g_iconv_close(translit);
  3146 	translit=(GIConv)-1;
  3147 	if (to_utf8==(GIConv)-1)
  3148 	    g_iconv_close(to_utf8);
  3149 	to_utf8=(GIConv)-1;
  3150 	return;
  3151     }
  3152     if (!*theline)
  3153 	return;
  3154     if (!entities)
  3155     {
  3156 	entities=g_tree_new((GCompareFunc)strcmp);
  3157 	for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
  3158 	    g_tree_insert(entities,HTMLentities[i].name,
  3159 	      GUINT_TO_POINTER(HTMLentities[i].c));
  3160     }
  3161     if (translit==(GIConv)-1)
  3162 	translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
  3163     if (to_utf8==(GIConv)-1)
  3164 	to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
  3165     while((amp=strchr(theline,'&')))
  3166     {
  3167 	scolon=strchr(amp,';');
  3168 	if (scolon)
  3169 	{
  3170 	    if (amp[1]=='#')
  3171 	    {
  3172 		if (amp+2+strspn(amp+2,"0123456789")==scolon)
  3173 		    c=strtol(amp+2,NULL,10);
  3174 		else if (amp[2]=='x' &&
  3175 		  amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
  3176 		    c=strtol(amp+3,NULL,16);
  3177 	    }
  3178 	    else
  3179 	    {
  3180 		s=g_strndup(amp+1,scolon-(amp+1));
  3181 	        c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
  3182 		g_free(s);
  3183 	    }
  3184 	}
  3185 	else
  3186 	    c=0;
  3187 	if (c)
  3188 	{
  3189 	    theline=amp;
  3190 	    if (c<128 || c>=192 && c<=255)	/* An ISO-8859-1 character */
  3191 		theline+=g_unichar_to_utf8(c,theline);
  3192 	    else
  3193 	    {
  3194 		s=g_malloc(6);
  3195 		nb=g_unichar_to_utf8(c,s);
  3196 		t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
  3197 		g_free(s);
  3198 		s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
  3199 		g_free(t);
  3200 		memcpy(theline,s,nb);
  3201 		g_free(s);
  3202 		theline+=nb;
  3203 	    }
  3204 	    memmove(theline,g_utf8_next_char(scolon),
  3205 	      strlen(g_utf8_next_char(scolon))+1);
  3206 	}
  3207 	else
  3208 	    theline=g_utf8_next_char(amp);
  3209     }
  3210 }
  3211 
  3212 gboolean tagcomp(const char *strin,const char *basetag)
  3213 {
  3214     gboolean retval;
  3215     gchar *s,*t;
  3216     if (g_utf8_get_char(strin)=='/')
  3217 	t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
  3218     else
  3219 	t=g_utf8_casefold(strin,-1);
  3220     s=g_utf8_casefold(basetag,-1);
  3221     retval=g_str_has_prefix(t,s);
  3222     g_free(s);
  3223     g_free(t);
  3224     return retval;
  3225 }
  3226 
  3227 void proghelp(GOptionContext *context)
  3228 {
  3229     gchar *help;
  3230     fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
  3231     fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
  3232     fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
  3233     fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
  3234       "For details, read the file COPYING.\n",stderr);
  3235     fputs("This is Free Software; "
  3236       "you may redistribute it under certain conditions (GPL);\n",stderr);
  3237     fputs("read the file COPYING for details.\n\n",stderr);
  3238     help=g_option_context_get_help(context,TRUE,NULL);
  3239     fputs(help,stderr);
  3240     g_free(help);
  3241     fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
  3242     fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
  3243       "non-ASCII\n",stderr);
  3244     fputs("characters like accented letters, "
  3245       "lines longer than 75 or shorter than 55,\n",stderr);
  3246     fputs("unbalanced quotes or brackets, "
  3247       "a variety of badly formatted punctuation, \n",stderr);
  3248     fputs("HTML tags, some likely typos. "
  3249       "It is NOT a substitute for human judgement.\n",stderr);
  3250     fputs("\n",stderr);
  3251 }