bookloupe/bookloupe.c
author ali <ali@juiblex.co.uk>
Thu May 30 19:26:24 2013 +0100 (2013-05-30)
changeset 73 cffa80824f8c
parent 72 52d4a7f926b4
child 76 4e6e7cc6b50d
permissions -rw-r--r--
Use SetConsoleOutputCP() under MS-Windows
     1 /*************************************************************************/
     2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */
     3 /*									 */
     4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>			 */
     5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>			 */
     6 /*									 */
     7 /* This program is free software; you can redistribute it and/or modify  */
     8 /* it under the terms of the GNU General Public License as published by  */
     9 /* the Free Software Foundation; either version 2 of the License, or     */
    10 /* (at your option) any later version.					 */
    11 /*									 */
    12 /* This program is distributed in the hope that it will be useful,       */
    13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of	 */
    14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the		 */
    15 /* GNU General Public License for more details.				 */
    16 /*									 */
    17 /* You should have received a copy of the GNU General Public License	 */
    18 /* along with this program. If not, see <http://www.gnu.org/licenses/>.	 */
    19 /*************************************************************************/
    20 
    21 #include <stdio.h>
    22 #include <stdlib.h>
    23 #include <string.h>
    24 #include <ctype.h>
    25 #ifdef __WIN32__
    26 #include <windows.h>
    27 #endif
    28 #include <glib.h>
    29 #include <bl/bl.h>
    30 #include "HTMLentities.h"
    31 
    32 gchar *prevline;
    33 
    34 /* Common typos. */
    35 char *typo[] = {
    36     "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
    37     "nad", "te", "ig", "acn",  "ahve", "alot", "anbd", "andt", "awya", "aywa",
    38     "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
    39     "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
    40     "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
    41     "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
    42     "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
    43     "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
    44     "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
    45     "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
    46     "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
    47     "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
    48     "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
    49     "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
    50     "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
    51     "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
    52     "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
    53     "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
    54     "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
    55     "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
    56     "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
    57     "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
    58     "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
    59     "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
    60     "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
    61     "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
    62     "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
    63     "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
    64     "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
    65     "se", ""
    66 };
    67 
    68 GTree *usertypo;
    69 
    70 /* Common abbreviations and other OK words not to query as typos. */
    71 char *okword[] = {
    72     "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
    73     "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
    74     "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
    75     "outbid", "outbids", "frostbite", "frostbitten", ""
    76 };
    77 
    78 /* Common abbreviations that cause otherwise unexplained periods. */
    79 char *abbrev[] = {
    80     "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
    81     "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
    82 };
    83 
    84 /*
    85  * Two-Letter combinations that rarely if ever start words,
    86  * but are common scannos or otherwise common letter combinations.
    87  */
    88 char *nostart[] = {
    89     "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
    90 };
    91 
    92 /*
    93  * Two-Letter combinations that rarely if ever end words,
    94  * but are common scannos or otherwise common letter combinations.
    95  */
    96 char *noend[] = {
    97     "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
    98     "sw", "gr", "sl", "cl", "iy", ""
    99 };
   100 
   101 char *markup[] = {
   102     "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
   103     "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
   104     "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
   105     "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
   106 };
   107 
   108 char *DPmarkup[] = {
   109     "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
   110 };
   111 
   112 char *nocomma[] = {
   113     "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
   114     "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
   115     "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
   116     "during", "let", "toward", "among", ""
   117 };
   118 
   119 char *noperiod[] = {
   120     "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
   121     "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
   122     "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
   123     "among", "those", "into", "whom", "having", "thence", ""
   124 }; 
   125 
   126 /* special characters */
   127 #define CHAR_SPACE	  32
   128 #define CHAR_TAB	   9
   129 #define CHAR_LF		  10
   130 #define CHAR_CR		  13
   131 #define CHAR_DQUOTE	  34
   132 #define CHAR_SQUOTE	  39
   133 #define CHAR_OPEN_SQUOTE  96
   134 #define CHAR_TILDE	 126
   135 #define CHAR_ASTERISK	  42
   136 #define CHAR_FORESLASH	  47
   137 #define CHAR_CARAT	  94
   138 
   139 #define CHAR_UNDERSCORE    '_'
   140 #define CHAR_OPEN_CBRACK   '{'
   141 #define CHAR_CLOSE_CBRACK  '}'
   142 #define CHAR_OPEN_RBRACK   '('
   143 #define CHAR_CLOSE_RBRACK  ')'
   144 #define CHAR_OPEN_SBRACK   '['
   145 #define CHAR_CLOSE_SBRACK  ']'
   146 
   147 /* longest and shortest normal PG line lengths */
   148 #define LONGEST_PG_LINE   75
   149 #define WAY_TOO_LONG      80
   150 #define SHORTEST_PG_LINE  55
   151 
   152 enum {
   153     ECHO_SWITCH,
   154     SQUOTE_SWITCH,
   155     TYPO_SWITCH,
   156     QPARA_SWITCH,
   157     PARANOID_SWITCH,
   158     LINE_END_SWITCH,
   159     OVERVIEW_SWITCH,
   160     STDOUT_SWITCH,
   161     HEADER_SWITCH,
   162     WEB_SWITCH,
   163     VERBOSE_SWITCH,
   164     MARKUP_SWITCH,
   165     USERTYPO_SWITCH,
   166     DP_SWITCH,
   167     SWITNO
   168 };
   169 
   170 gboolean pswit[SWITNO];  /* program switches */
   171 
   172 static GOptionEntry options[]={
   173     { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
   174       "Ignore DP-specific markup", NULL },
   175     { "noecho", 'e', 0, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
   176       "Don't echo queried line", NULL },
   177     { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
   178       "Check single quotes", NULL },
   179     { "typo", 't', 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
   180       "Check common typos", NULL },
   181     { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
   182       "Require closure of quotes on every paragraph", NULL },
   183     { "relaxed", 'x', 0, G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
   184       "Disable paranoid querying of everything", NULL },
   185     { "line-end", 'l', 0, G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
   186       "Disable line end checking", NULL },
   187     { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
   188       "Overview: just show counts", NULL },
   189     { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
   190       "Output errors to stdout instead of stderr", NULL },
   191     { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
   192       "Echo header fields", NULL },
   193     { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
   194       "Ignore markup in < >", NULL },
   195     { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
   196       "Use file of user-defined typos", NULL },
   197     { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
   198       "Defaults for use on www upload", NULL },
   199     { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
   200       "Verbose - list everything", NULL },
   201     { NULL }
   202 };
   203 
   204 long cnt_dquot;		/* for overview mode, count of doublequote queries */
   205 long cnt_squot;		/* for overview mode, count of singlequote queries */
   206 long cnt_brack;		/* for overview mode, count of brackets queries */
   207 long cnt_bin;		/* for overview mode, count of non-ASCII queries */
   208 long cnt_odd;		/* for overview mode, count of odd character queries */
   209 long cnt_long;		/* for overview mode, count of long line errors */
   210 long cnt_short;		/* for overview mode, count of short line queries */
   211 long cnt_punct;		/* for overview mode,
   212 			   count of punctuation and spacing queries */
   213 long cnt_dash;		/* for overview mode, count of dash-related queries */
   214 long cnt_word;		/* for overview mode, count of word queries */
   215 long cnt_html;		/* for overview mode, count of html queries */
   216 long cnt_lineend;	/* for overview mode, count of line-end queries */
   217 long cnt_spacend;	/* count of lines with space at end */
   218 long linecnt;		/* count of total lines in the file */
   219 long checked_linecnt;	/* count of lines actually checked */
   220 
   221 void proghelp(GOptionContext *context);
   222 void procfile(const char *);
   223 
   224 gchar *running_from;
   225 
   226 gboolean mixdigit(const char *);
   227 gchar *getaword(const char **);
   228 char *flgets(char **,long);
   229 void postprocess_for_HTML(char *);
   230 char *linehasmarkup(char *);
   231 char *losemarkup(char *);
   232 gboolean tagcomp(const char *,const char *);
   233 void loseentities(char *);
   234 gboolean isroman(const char *);
   235 void postprocess_for_DP(char *);
   236 void print_as_windows_1252(const char *string);
   237 void print_as_utf_8(const char *string);
   238 
   239 GTree *qword,*qperiod;
   240 
   241 #ifdef __WIN32__
   242 UINT saved_cp;
   243 #endif
   244 
   245 struct first_pass_results {
   246     long firstline,astline;
   247     long footerline,totlen,binlen,alphalen,endquote_count,shortline,dotcomma;
   248     long fslashline,hyphens,longline,verylongline,htmcount,standalone_digit;
   249     long spacedash,emdash,space_emdash,non_PG_space_emdash,PG_space_emdash;
   250     int Dutchcount,Frenchcount;
   251 };
   252 
   253 struct warnings {
   254     int shortline,longline,bin,dash,dotcomma,ast,fslash,digit,hyphen;
   255     int endquote;
   256     gboolean isDutch,isFrench;
   257 };
   258 
   259 struct counters {
   260     long quot;
   261     int c_unders,c_brack,s_brack,r_brack;
   262     int open_single_quote,close_single_quote;
   263 };
   264 
   265 struct line_properties {
   266     unsigned int len,blen;
   267     gunichar start;
   268 };
   269 
   270 struct parities {
   271     int dquote,squote;
   272 };
   273 
   274 struct pending {
   275     char *dquote,*squote,*rbrack,*sbrack,*cbrack,*unders;
   276     long squot;
   277 };
   278 
   279 void parse_options(int *argc,char ***argv)
   280 {
   281     GError *err=NULL;
   282     GOptionContext *context;
   283     context=g_option_context_new(
   284       "file - looks for errors in Project Gutenberg(TM) etexts");
   285     g_option_context_add_main_entries(context,options,NULL);
   286     if (!g_option_context_parse(context,argc,argv,&err))
   287     {
   288 	g_printerr("Bookloupe: %s\n",err->message);
   289 	g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
   290 	exit(1);
   291     }
   292     /* Paranoid checking is turned OFF, not on, by its switch */
   293     pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
   294     if (pswit[PARANOID_SWITCH])
   295 	/* if running in paranoid mode, typo checks default to enabled */
   296 	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
   297     /* Line-end checking is turned OFF, not on, by its switch */
   298     pswit[LINE_END_SWITCH]=!pswit[LINE_END_SWITCH];
   299     /* Echoing is turned OFF, not on, by its switch */
   300     pswit[ECHO_SWITCH]=!pswit[ECHO_SWITCH];
   301     if (pswit[OVERVIEW_SWITCH])
   302 	/* just print summary; don't echo */
   303 	pswit[ECHO_SWITCH]=FALSE;
   304     /*
   305      * Web uploads - for the moment, this is really just a placeholder
   306      * until we decide what processing we really want to do on web uploads
   307      */
   308     if (pswit[WEB_SWITCH])
   309     {
   310 	/* specific override for web uploads */
   311 	pswit[ECHO_SWITCH]=TRUE;
   312 	pswit[SQUOTE_SWITCH]=FALSE;
   313 	pswit[TYPO_SWITCH]=TRUE;
   314 	pswit[QPARA_SWITCH]=FALSE;
   315 	pswit[PARANOID_SWITCH]=TRUE;
   316 	pswit[LINE_END_SWITCH]=FALSE;
   317 	pswit[OVERVIEW_SWITCH]=FALSE;
   318 	pswit[STDOUT_SWITCH]=FALSE;
   319 	pswit[HEADER_SWITCH]=TRUE;
   320 	pswit[VERBOSE_SWITCH]=FALSE;
   321 	pswit[MARKUP_SWITCH]=FALSE;
   322 	pswit[USERTYPO_SWITCH]=FALSE;
   323 	pswit[DP_SWITCH]=FALSE;
   324     }
   325     if (*argc<2)
   326     {
   327 	proghelp(context);
   328 	exit(1);
   329     }
   330     g_option_context_free(context);
   331 }
   332 
   333 /*
   334  * read_user_scannos:
   335  *
   336  * Read in the user-defined stealth scanno list.
   337  */
   338 void read_user_scannos(void)
   339 {
   340     GError *err=NULL;
   341     gchar *usertypo_file;
   342     gboolean okay;
   343     int i;
   344     gsize len,nb;
   345     gchar *contents,*utf8,**lines;
   346     usertypo_file=g_strdup("bookloupe.typ");
   347     okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   348     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   349     {
   350 	g_clear_error(&err);
   351 	g_free(usertypo_file);
   352 	usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
   353 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   354     }
   355     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   356     {
   357 	g_clear_error(&err);
   358 	g_free(usertypo_file);
   359 	usertypo_file=g_strdup("gutcheck.typ");
   360 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   361     }
   362     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   363     {
   364 	g_clear_error(&err);
   365 	g_free(usertypo_file);
   366 	usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
   367 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   368     }
   369     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   370     {
   371 	g_free(usertypo_file);
   372 	g_print("   --> I couldn't find bookloupe.typ "
   373 	  "-- proceeding without user typos.\n");
   374 	return;
   375     }
   376     else if (!okay)
   377     {
   378 	fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
   379 	g_free(usertypo_file);
   380 	g_clear_error(&err);
   381 	exit(1);
   382     }
   383     if (g_utf8_validate(contents,len,NULL))
   384 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
   385     else
   386 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
   387     g_free(contents);
   388     lines=g_strsplit_set(utf8,"\r\n",0);
   389     g_free(utf8);
   390     usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
   391     for (i=0;lines[i];i++)
   392 	if (*(unsigned char *)lines[i]>'!')
   393 	    g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
   394 	else
   395 	    g_free(lines[i]);
   396     g_free(lines);
   397 }
   398 
   399 /*
   400  * read_etext:
   401  *
   402  * Read an etext returning a newly allocated string containing the file
   403  * contents or NULL on error.
   404  */
   405 gchar *read_etext(const char *filename,GError **err)
   406 {
   407     gchar *contents,*utf8;
   408     gsize len,nb;
   409     if (!g_file_get_contents(filename,&contents,&len,err))
   410 	return NULL;
   411     if (g_utf8_validate(contents,len,NULL))
   412     {
   413 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
   414 	g_set_print_handler(print_as_utf_8);
   415 #ifdef __WIN32__
   416 	SetConsoleOutputCP(CP_UTF8);
   417 #endif
   418     }
   419     else
   420     {
   421 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
   422 	g_set_print_handler(print_as_windows_1252);
   423 #ifdef __WIN32__
   424 	SetConsoleOutputCP(1252);
   425 #endif
   426     }
   427     g_free(contents);
   428     return utf8;
   429 }
   430 
   431 void cleanup_on_exit(void)
   432 {
   433 #ifdef __WIN32__
   434     SetConsoleOutputCP(saved_cp);
   435 #endif
   436 }
   437 
   438 int main(int argc,char **argv)
   439 {
   440 #ifdef __WIN32__
   441     atexit(cleanup_on_exit);
   442     saved_cp=GetConsoleOutputCP();
   443 #endif
   444     running_from=g_path_get_dirname(argv[0]);
   445     parse_options(&argc,&argv);
   446     if (pswit[USERTYPO_SWITCH])
   447 	read_user_scannos();
   448     fprintf(stderr,"bookloupe: Check and report on an e-text\n");
   449     procfile(argv[1]);
   450     if (pswit[OVERVIEW_SWITCH])
   451     {
   452 	g_print("    Checked %ld lines of %ld (head+foot = %ld)\n\n",
   453 	  checked_linecnt,linecnt,linecnt-checked_linecnt);
   454 	g_print("    --------------- Queries found --------------\n");
   455 	if (cnt_long)
   456 	    g_print("    Long lines:		    %14ld\n",cnt_long);
   457 	if (cnt_short)
   458 	    g_print("    Short lines:		   %14ld\n",cnt_short);
   459 	if (cnt_lineend)
   460 	    g_print("    Line-end problems:	     %14ld\n",cnt_lineend);
   461 	if (cnt_word)
   462 	    g_print("    Common typos:		  %14ld\n",cnt_word);
   463 	if (cnt_dquot)
   464 	    g_print("    Unmatched quotes:	      %14ld\n",cnt_dquot);
   465 	if (cnt_squot)
   466 	    g_print("    Unmatched SingleQuotes:	%14ld\n",cnt_squot);
   467 	if (cnt_brack)
   468 	    g_print("    Unmatched brackets:	    %14ld\n",cnt_brack);
   469 	if (cnt_bin)
   470 	    g_print("    Non-ASCII characters:	  %14ld\n",cnt_bin);
   471 	if (cnt_odd)
   472 	    g_print("    Proofing characters:	   %14ld\n",cnt_odd);
   473 	if (cnt_punct)
   474 	    g_print("    Punctuation & spacing queries: %14ld\n",cnt_punct);
   475 	if (cnt_dash)
   476 	    g_print("    Non-standard dashes:	   %14ld\n",cnt_dash);
   477 	if (cnt_html)
   478 	    g_print("    Possible HTML tags:	    %14ld\n",cnt_html);
   479 	g_print("\n");
   480 	g_print("    TOTAL QUERIES		  %14ld\n",
   481 	  cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+
   482 	  cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend);
   483     }
   484     g_free(running_from);
   485     if (usertypo)
   486 	g_tree_unref(usertypo);
   487     return 0;
   488 }
   489 
   490 /*
   491  * first_pass:
   492  *
   493  * Run a first pass - verify that it's a valid PG
   494  * file, decide whether to report some things that
   495  * occur many times in the text like long or short
   496  * lines, non-standard dashes, etc.
   497  */
   498 struct first_pass_results *first_pass(const char *etext)
   499 {
   500     gunichar laststart=CHAR_SPACE;
   501     const char *s;
   502     gchar *lc_line;
   503     int i,j,lbytes,llen;
   504     gchar **lines;
   505     unsigned int lastlen=0,lastblen=0;
   506     long spline=0,nspline=0;
   507     static struct first_pass_results results={0};
   508     gchar *inword;
   509     lines=g_strsplit(etext,"\n",0);
   510     for (j=0;lines[j];j++)
   511     {
   512 	lbytes=strlen(lines[j]);
   513 	while (lines[j][lbytes-1]=='\r')
   514 	    lines[j][--lbytes]='\0';
   515 	llen=g_utf8_strlen(lines[j],lbytes);
   516 	linecnt++;
   517 	if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
   518 	  (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
   519 	{
   520 	    if (spline)
   521 		g_print("   --> Duplicate header?\n");
   522 	    spline=linecnt+1;   /* first line of non-header text, that is */
   523 	}
   524 	if (!strncmp(lines[j],"*** START",9) &&
   525 	  strstr(lines[j],"PROJECT GUTENBERG"))
   526 	{
   527 	    if (nspline)
   528 		g_print("   --> Duplicate header?\n");
   529 	    nspline=linecnt+1;   /* first line of non-header text, that is */
   530 	}
   531 	if (spline || nspline)
   532 	{
   533 	    lc_line=g_utf8_strdown(lines[j],lbytes);
   534 	    if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
   535 	    {
   536 		if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
   537 		{
   538 		    if (results.footerline)
   539 		    {
   540 			/* it's an old-form header - we can detect duplicates */
   541 			if (!nspline)
   542 			    g_print("   --> Duplicate footer?\n");
   543 		    }
   544 		    else
   545 			results.footerline=linecnt;
   546 		}
   547 	    }
   548 	    g_free(lc_line);
   549 	}
   550 	if (spline)
   551 	    results.firstline=spline;
   552 	if (nspline)
   553 	    results.firstline=nspline;  /* override with new */
   554 	if (results.footerline)
   555 	    continue;    /* don't count the boilerplate in the footer */
   556 	results.totlen+=llen;
   557 	for (s=lines[j];*s;s=g_utf8_next_char(s))
   558 	{
   559 	    if (g_utf8_get_char(s)>127)
   560 		results.binlen++;
   561 	    if (g_unichar_isalpha(g_utf8_get_char(s)))
   562 		results.alphalen++;
   563 	    if (s>lines[j] && g_utf8_get_char(s)==CHAR_DQUOTE &&
   564 	      isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
   565 		results.endquote_count++;
   566 	}
   567 	if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
   568 	  lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
   569 	    results.shortline++;
   570 	if (lbytes>0 &&
   571 	  g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
   572 	    cnt_spacend++;
   573 	if (strstr(lines[j],".,"))
   574 	    results.dotcomma++;
   575 	/* only count ast lines for ignoring purposes where there is */
   576 	/* locase text on the line */
   577 	if (strchr(lines[j],'*'))
   578 	{
   579 	    for (s=lines[j];*s;s=g_utf8_next_char(s))
   580 		if (g_unichar_islower(g_utf8_get_char(s)))
   581 		    break;
   582 	    if (*s)
   583 		results.astline++;
   584 	}
   585 	if (strchr(lines[j],'/'))
   586 	    results.fslashline++;
   587 	for (s=g_utf8_prev_char(lines[j]+lbytes);
   588 	  s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
   589 	    ;
   590 	if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
   591 	  g_utf8_get_char(g_utf8_prev_char(s))!='-')
   592 	    results.hyphens++;
   593 	if (llen>LONGEST_PG_LINE)
   594 	    results.longline++;
   595 	if (llen>WAY_TOO_LONG)
   596 	    results.verylongline++;
   597 	if (strchr(lines[j],'<') && strchr(lines[j],'>'))
   598 	{
   599 	    i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
   600 	    if (i>0)
   601 		results.htmcount++;
   602 	    if (strstr(lines[j],"<i>"))
   603 		results.htmcount+=4; /* bonus marks! */
   604 	}
   605 	/* Check for spaced em-dashes */
   606 	if (lines[j][0] && (s=strstr(g_utf8_next_char(lines[j]),"--")))
   607 	{
   608 	    results.emdash++;
   609 	    if (s[-1]==CHAR_SPACE || s[2]==CHAR_SPACE)
   610 		results.space_emdash++;
   611 	    if (s[-1]==CHAR_SPACE && s[2]==CHAR_SPACE)
   612 		/* count of em-dashes with spaces both sides */
   613 		results.non_PG_space_emdash++;
   614 	    if (s[-1]!=CHAR_SPACE && s[2]!=CHAR_SPACE)
   615 		/* count of PG-type em-dashes with no spaces */
   616 		results.PG_space_emdash++;
   617 	}
   618 	for (s=lines[j];*s;)
   619 	{
   620 	    inword=getaword(&s);
   621 	    if (!strcmp(inword,"hij") || !strcmp(inword,"niet")) 
   622 		results.Dutchcount++;
   623 	    if (!strcmp(inword,"dans") || !strcmp(inword,"avec")) 
   624 		results.Frenchcount++;
   625 	    if (!strcmp(inword,"0") || !strcmp(inword,"1")) 
   626 		results.standalone_digit++;
   627 	    g_free(inword);
   628 	}
   629 	/* Check for spaced dashes */
   630 	if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
   631 	    results.spacedash++;
   632 	lastblen=lastlen;
   633 	lastlen=llen;
   634 	laststart=lines[j][0];
   635     }
   636     g_strfreev(lines);
   637     return &results;
   638 }
   639 
   640 /*
   641  * report_first_pass:
   642  *
   643  * Make some snap decisions based on the first pass results.
   644  */
   645 struct warnings *report_first_pass(struct first_pass_results *results)
   646 {
   647     static struct warnings warnings={0};
   648     if (cnt_spacend>0)
   649 	g_print("   --> %ld lines in this file have white space at end\n",
   650 	  cnt_spacend);
   651     warnings.dotcomma=1;
   652     if (results->dotcomma>5)
   653     {
   654 	warnings.dotcomma=0;
   655 	g_print("   --> %ld lines in this file contain '.,'. "
   656 	  "Not reporting them.\n",results->dotcomma);
   657     }
   658     /*
   659      * If more than 50 lines, or one-tenth, are short,
   660      * don't bother reporting them.
   661      */
   662     warnings.shortline=1;
   663     if (results->shortline>50 || results->shortline*10>linecnt)
   664     {
   665 	warnings.shortline=0;
   666 	g_print("   --> %ld lines in this file are short. "
   667 	  "Not reporting short lines.\n",results->shortline);
   668     }
   669     /*
   670      * If more than 50 lines, or one-tenth, are long,
   671      * don't bother reporting them.
   672      */
   673     warnings.longline=1;
   674     if (results->longline>50 || results->longline*10>linecnt)
   675     {
   676 	warnings.longline=0;
   677 	g_print("   --> %ld lines in this file are long. "
   678 	  "Not reporting long lines.\n",results->longline);
   679     }
   680     /* If more than 10 lines contain asterisks, don't bother reporting them. */
   681     warnings.ast=1;
   682     if (results->astline>10)
   683     {
   684 	warnings.ast=0;
   685 	g_print("   --> %ld lines in this file contain asterisks. "
   686 	  "Not reporting them.\n",results->astline);
   687     }
   688     /*
   689      * If more than 10 lines contain forward slashes,
   690      * don't bother reporting them.
   691      */
   692     warnings.fslash=1;
   693     if (results->fslashline>10)
   694     {
   695 	warnings.fslash=0;
   696 	g_print("   --> %ld lines in this file contain forward slashes. "
   697 	  "Not reporting them.\n",results->fslashline);
   698     }
   699     /*
   700      * If more than 20 lines contain unpunctuated endquotes,
   701      * don't bother reporting them.
   702      */
   703     warnings.endquote=1;
   704     if (results->endquote_count>20)
   705     {
   706 	warnings.endquote=0;
   707 	g_print("   --> %ld lines in this file contain unpunctuated endquotes. "
   708 	  "Not reporting them.\n",results->endquote_count);
   709     }
   710     /*
   711      * If more than 15 lines contain standalone digits,
   712      * don't bother reporting them.
   713      */
   714     warnings.digit=1;
   715     if (results->standalone_digit>10)
   716     {
   717 	warnings.digit=0;
   718 	g_print("   --> %ld lines in this file contain standalone 0s and 1s. "
   719 	  "Not reporting them.\n",results->standalone_digit);
   720     }
   721     /*
   722      * If more than 20 lines contain hyphens at end,
   723      * don't bother reporting them.
   724      */
   725     warnings.hyphen=1;
   726     if (results->hyphens>20)
   727     {
   728 	warnings.hyphen=0;
   729 	g_print("   --> %ld lines in this file have hyphens at end. "
   730 	  "Not reporting them.\n",results->hyphens);
   731     }
   732     if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
   733     {
   734 	g_print("   --> Looks like this is HTML. Switching HTML mode ON.\n");
   735 	pswit[MARKUP_SWITCH]=1;
   736     }
   737     if (results->verylongline>0)
   738 	g_print("   --> %ld lines in this file are VERY long!\n",
   739 	  results->verylongline);
   740     /*
   741      * If there are more non-PG spaced dashes than PG em-dashes,
   742      * assume it's deliberate.
   743      * Current PG guidelines say don't use them, but older texts do,
   744      * and some people insist on them whatever the guidelines say.
   745      */
   746     warnings.dash=1;
   747     if (results->spacedash+results->non_PG_space_emdash>
   748       results->PG_space_emdash)
   749     {
   750 	warnings.dash=0;
   751 	g_print("   --> There are %ld spaced dashes and em-dashes. "
   752 	  "Not reporting them.\n",
   753 	  results->spacedash+results->non_PG_space_emdash);
   754     }
   755     /* If more than a quarter of characters are hi-bit, bug out. */
   756     warnings.bin=1;
   757     if (results->binlen*4>results->totlen)
   758     {
   759 	g_print("   --> This file does not appear to be ASCII. "
   760 	  "Terminating. Best of luck with it!\n");
   761 	exit(1);
   762     }
   763     if (results->alphalen*4<results->totlen)
   764     {
   765 	g_print("   --> This file does not appear to be text. "
   766 	  "Terminating. Best of luck with it!\n");
   767 	exit(1);
   768     }
   769     if (results->binlen*100>results->totlen || results->binlen>100)
   770     {
   771 	g_print("   --> There are a lot of foreign letters here. "
   772 	  "Not reporting them.\n");
   773 	warnings.bin=0;
   774     }
   775     warnings.isDutch=FALSE;
   776     if (results->Dutchcount>50)
   777     {
   778 	warnings.isDutch=TRUE;
   779 	g_print("   --> This looks like Dutch - "
   780 	  "switching off dashes and warnings for 's Middags case.\n");
   781     }
   782     warnings.isFrench=FALSE;
   783     if (results->Frenchcount>50)
   784     {
   785 	warnings.isFrench=TRUE;
   786 	g_print("   --> This looks like French - "
   787 	  "switching off some doublepunct.\n");
   788     }
   789     if (results->firstline && results->footerline)
   790 	g_print("    The PG header and footer appear to be already on.\n");
   791     else
   792     {
   793 	if (results->firstline)
   794 	    g_print("    The PG header is on - no footer.\n");
   795 	if (results->footerline)
   796 	    g_print("    The PG footer is on - no header.\n");
   797     }
   798     g_print("\n");
   799     if (pswit[VERBOSE_SWITCH])
   800     {
   801 	warnings.bin=1;
   802 	warnings.shortline=1;
   803 	warnings.dotcomma=1;
   804 	warnings.longline=1;
   805 	warnings.dash=1;
   806 	warnings.digit=1;
   807 	warnings.ast=1;
   808 	warnings.fslash=1;
   809 	warnings.hyphen=1;
   810 	warnings.endquote=1;
   811 	g_print("   *** Verbose output is ON -- you asked for it! ***\n");
   812     }
   813     if (warnings.isDutch)
   814 	warnings.dash=0;
   815     if (results->footerline>0 && results->firstline>0 &&
   816       results->footerline>results->firstline &&
   817       results->footerline-results->firstline<100)
   818     {
   819 	g_print("   --> I don't really know where this text starts. \n");
   820 	g_print("       There are no reference points.\n");
   821 	g_print("       I'm going to have to report the header and footer "
   822 	  "as well.\n");
   823 	results->firstline=0;
   824     }
   825     return &warnings;
   826 }
   827 
   828 /*
   829  * analyse_quotes:
   830  *
   831  * Look along the line, accumulate the count of quotes, and see
   832  * if this is an empty line - i.e. a line with nothing on it
   833  * but spaces.
   834  * If line has just spaces, period, * and/or - on it, don't
   835  * count it, since empty lines with asterisks or dashes to
   836  * separate sections are common.
   837  *
   838  * Returns: TRUE if the line is empty.
   839  */
   840 gboolean analyse_quotes(const char *aline,struct counters *counters)
   841 {
   842     int guessquote=0;
   843     /* assume the line is empty until proven otherwise */
   844     gboolean isemptyline=TRUE;
   845     const char *s=aline,*sprev,*snext;
   846     gunichar c;
   847     sprev=NULL;
   848     while (*s)
   849     {
   850 	snext=g_utf8_next_char(s);
   851 	c=g_utf8_get_char(s);
   852 	if (c==CHAR_DQUOTE)
   853 	    counters->quot++;
   854 	if (c==CHAR_SQUOTE || c==CHAR_OPEN_SQUOTE)
   855 	{
   856 	    if (s==aline)
   857 	    {
   858 		/*
   859 		 * At start of line, it can only be an openquote.
   860 		 * Hardcode a very common exception!
   861 		 */
   862 		if (!g_str_has_prefix(snext,"tis") &&
   863 		  !g_str_has_prefix(snext,"Tis"))
   864 		    counters->open_single_quote++;
   865 	    }
   866 	    else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
   867 	      g_unichar_isalpha(g_utf8_get_char(snext)))
   868 		/* Do nothing! it's definitely an apostrophe, not a quote */
   869 		;
   870 	    /* it's outside a word - let's check it out */
   871 	    else if (c==CHAR_OPEN_SQUOTE ||
   872 	      g_unichar_isalpha(g_utf8_get_char(snext)))
   873 	    {
   874 		/* it damwell better BE an openquote */
   875 		if (!g_str_has_prefix(snext,"tis") &&
   876 		  !g_str_has_prefix(snext,"Tis"))
   877 		    /* hardcode a very common exception! */
   878 		    counters->open_single_quote++;
   879 	    }
   880 	    else
   881 	    {
   882 		/* now - is it a closequote? */
   883 		guessquote=0;   /* accumulate clues */
   884 		if (g_unichar_isalpha(g_utf8_get_char(sprev)))
   885 		{
   886 		    /* it follows a letter - could be either */
   887 		    guessquote++;
   888 		    if (g_utf8_get_char(sprev)=='s')
   889 		    {
   890 			/* looks like a plural apostrophe */
   891 			guessquote-=3;
   892 			if (g_utf8_get_char(snext)==CHAR_SPACE)
   893 			    /* bonus marks! */
   894 			    guessquote-=2;
   895 		    }
   896 		}
   897 		/* it doesn't have a letter either side */
   898 		else if (strchr(".?!,;:",g_utf8_get_char(sprev)) &&
   899 		  strchr(".?!,;: ",g_utf8_get_char(snext)))
   900 		    guessquote+=8; /* looks like a closequote */
   901 		else
   902 		    guessquote++;
   903 		if (counters->open_single_quote>counters->close_single_quote)
   904 		    /*
   905 		     * Give it the benefit of some doubt,
   906 		     * if a squote is already open.
   907 		     */
   908 		    guessquote++;
   909 		else
   910 		    guessquote--;
   911 		if (guessquote>=0)
   912 		    counters->close_single_quote++;
   913 	    }
   914 	}
   915 	if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
   916 	  c!='\r' && c!='\n')
   917 	    isemptyline=FALSE;  /* ignore lines like  *  *  *  as spacers */
   918 	if (c==CHAR_UNDERSCORE)
   919 	    counters->c_unders++;
   920 	if (c==CHAR_OPEN_CBRACK)
   921 	    counters->c_brack++;
   922 	if (c==CHAR_CLOSE_CBRACK)
   923 	    counters->c_brack--;
   924 	if (c==CHAR_OPEN_RBRACK)
   925 	    counters->r_brack++;
   926 	if (c==CHAR_CLOSE_RBRACK)
   927 	    counters->r_brack--;
   928 	if (c==CHAR_OPEN_SBRACK)
   929 	    counters->s_brack++;
   930 	if (c==CHAR_CLOSE_SBRACK)
   931 	    counters->s_brack--;
   932 	sprev=s;
   933 	s=snext;
   934     }
   935     return isemptyline;
   936 }
   937 
   938 /*
   939  * check_for_control_characters:
   940  *
   941  * Check for invalid or questionable characters in the line
   942  * Anything above 127 is invalid for plain ASCII, and
   943  * non-printable control characters should also be flagged.
   944  * Tabs should generally not be there.
   945  */
   946 void check_for_control_characters(const char *aline)
   947 {
   948     gunichar c;
   949     const char *s;
   950     for (s=aline;*s;s=g_utf8_next_char(s))
   951     {
   952 	c=g_utf8_get_char(s);
   953 	if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
   954 	{
   955 	    if (pswit[ECHO_SWITCH])
   956 		g_print("\n%s\n",aline);
   957 	    if (!pswit[OVERVIEW_SWITCH])
   958 		g_print("    Line %ld column %ld - Control character %u\n",
   959 		  linecnt,g_utf8_pointer_to_offset(s,aline)+1,c);
   960 	    else
   961 		cnt_bin++;
   962 	}
   963     }
   964 }
   965 
   966 /*
   967  * check_for_odd_characters:
   968  *
   969  * Check for binary and other odd characters.
   970  */
   971 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
   972   gboolean isemptyline)
   973 {
   974     /* Don't repeat multiple warnings on one line. */
   975     gboolean eNon_A=FALSE,eTab=FALSE,eTilde=FALSE;
   976     gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
   977     const char *s;
   978     gunichar c;
   979     for (s=aline;*s;s=g_utf8_next_char(s))
   980     {
   981 	c=g_utf8_get_char(s);
   982 	if (!eNon_A && (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
   983 	{
   984 	    if (pswit[ECHO_SWITCH])
   985 		g_print("\n%s\n",aline);
   986 	    if (!pswit[OVERVIEW_SWITCH])
   987 		if (c>127 && c<160 || c>255)
   988 		    g_print("    Line %ld column %ld - "
   989 		      "Non-ISO-8859 character %u\n",
   990 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
   991 		else
   992 		    g_print("    Line %ld column %ld - "
   993 		      "Non-ASCII character %u\n",
   994 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
   995 	    else
   996 		cnt_bin++;
   997 	    eNon_A=TRUE;
   998 	}
   999 	if (!eTab && c==CHAR_TAB)
  1000 	{
  1001 	    if (pswit[ECHO_SWITCH])
  1002 		g_print("\n%s\n",aline);
  1003 	    if (!pswit[OVERVIEW_SWITCH])
  1004 		g_print("    Line %ld column %ld - Tab character?\n",
  1005 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1006 	    else
  1007 		cnt_odd++;
  1008 	    eTab=TRUE;
  1009 	}
  1010 	if (!eTilde && c==CHAR_TILDE)
  1011 	{
  1012 	    /*
  1013 	     * Often used by OCR software to indicate an
  1014 	     * unrecognizable character.
  1015 	     */
  1016 	    if (pswit[ECHO_SWITCH])
  1017 		g_print("\n%s\n",aline);
  1018 	    if (!pswit[OVERVIEW_SWITCH])
  1019 		g_print("    Line %ld column %ld - Tilde character?\n",
  1020 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1021 	    else
  1022 		cnt_odd++;
  1023 	    eTilde=TRUE;
  1024 	}
  1025 	if (!eCarat && c==CHAR_CARAT)
  1026 	{  
  1027 	    if (pswit[ECHO_SWITCH])
  1028 		g_print("\n%s\n",aline);
  1029 	    if (!pswit[OVERVIEW_SWITCH])
  1030 		g_print("    Line %ld column %ld - Carat character?\n",
  1031 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1032 	    else
  1033 		cnt_odd++;
  1034 	    eCarat=TRUE;
  1035 	}
  1036 	if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
  1037 	{  
  1038 	    if (pswit[ECHO_SWITCH])
  1039 		g_print("\n%s\n",aline);
  1040 	    if (!pswit[OVERVIEW_SWITCH])
  1041 		g_print("    Line %ld column %ld - Forward slash?\n",
  1042 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1043 	    else
  1044 		cnt_odd++;
  1045 	    eFSlash=TRUE;
  1046 	}
  1047 	/*
  1048 	 * Report asterisks only in paranoid mode,
  1049 	 * since they're often deliberate.
  1050 	 */
  1051 	if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
  1052 	  c==CHAR_ASTERISK)
  1053 	{
  1054 	    if (pswit[ECHO_SWITCH])
  1055 		g_print("\n%s\n",aline);
  1056 	    if (!pswit[OVERVIEW_SWITCH])
  1057 		g_print("    Line %ld column %ld - Asterisk?\n",
  1058 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1059 	    else
  1060 		cnt_odd++;
  1061 	    eAst=TRUE;
  1062 	}
  1063     }
  1064 }
  1065 
  1066 /*
  1067  * check_for_long_line:
  1068  *
  1069  * Check for line too long.
  1070  */
  1071 void check_for_long_line(const char *aline)
  1072 {
  1073     if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
  1074     {
  1075 	if (pswit[ECHO_SWITCH])
  1076 	    g_print("\n%s\n",aline);
  1077 	if (!pswit[OVERVIEW_SWITCH])
  1078 	    g_print("    Line %ld column %ld - Long line %ld\n",
  1079 	      linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
  1080 	else
  1081 	    cnt_long++;
  1082     }
  1083 }
  1084 
  1085 /*
  1086  * check_for_short_line:
  1087  *
  1088  * Check for line too short.
  1089  *
  1090  * This one is a bit trickier to implement: we don't want to
  1091  * flag the last line of a paragraph for being short, so we
  1092  * have to wait until we know that our current line is a
  1093  * "normal" line, then report the _previous_ line if it was too
  1094  * short. We also don't want to report indented lines like
  1095  * chapter heads or formatted quotations. We therefore keep
  1096  * last->len as the length of the last line examined, and
  1097  * last->blen as the length of the last but one, and try to
  1098  * suppress unnecessary warnings by checking that both were of
  1099  * "normal" length. We keep the first character of the last
  1100  * line in last->start, and if it was a space, we assume that
  1101  * the formatting is deliberate. I can't figure out a way to
  1102  * distinguish something like a quoted verse left-aligned or
  1103  * the header or footer of a letter from a paragraph of short
  1104  * lines - maybe if I examined the whole paragraph, and if the
  1105  * para has less than, say, 8 lines and if all lines are short,
  1106  * then just assume it's OK? Need to look at some texts to see
  1107  * how often a formula like this would get the right result.
  1108  */
  1109 void check_for_short_line(const char *aline,const struct line_properties *last)
  1110 {
  1111     if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
  1112       last->len<SHORTEST_PG_LINE && last->blen>1 &&
  1113       last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
  1114     {
  1115 	if (pswit[ECHO_SWITCH])
  1116 	    g_print("\n%s\n",prevline);
  1117 	if (!pswit[OVERVIEW_SWITCH])
  1118 	    g_print("    Line %ld column %ld - Short line %ld?\n",
  1119 	      linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
  1120 	else
  1121 	    cnt_short++;
  1122     }
  1123 }
  1124 
  1125 /*
  1126  * check_for_starting_punctuation:
  1127  *
  1128  * Look for punctuation other than full ellipses at start of line.
  1129  */
  1130 void check_for_starting_punctuation(const char *aline)
  1131 {
  1132     if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
  1133       !g_str_has_prefix(aline,". . ."))
  1134     {
  1135 	if (pswit[ECHO_SWITCH])
  1136 	    g_print("\n%s\n",aline);
  1137 	if (!pswit[OVERVIEW_SWITCH])
  1138 	    g_print("    Line %ld column 1 - Begins with punctuation?\n",
  1139 	      linecnt);
  1140 	else
  1141 	    cnt_punct++;
  1142     }
  1143 }
  1144 
  1145 /*
  1146  * check_for_spaced_emdash:
  1147  *
  1148  * Check for spaced em-dashes.
  1149  *
  1150  * We must check _all_ occurrences of "--" on the line
  1151  * hence the loop - even if the first double-dash is OK
  1152  * there may be another that's wrong later on.
  1153  */
  1154 void check_for_spaced_emdash(const char *aline)
  1155 {
  1156     const char *s,*t,*next;
  1157     for (s=aline;t=strstr(s,"--");s=next)
  1158     {
  1159 	next=g_utf8_next_char(g_utf8_next_char(t));
  1160 	if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
  1161 	  g_utf8_get_char(next)==CHAR_SPACE)
  1162 	{
  1163 	    if (pswit[ECHO_SWITCH])
  1164 		g_print("\n%s\n",aline);
  1165 	    if (!pswit[OVERVIEW_SWITCH])
  1166 		g_print("    Line %ld column %ld - Spaced em-dash?\n",
  1167 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  1168 	    else
  1169 		cnt_dash++;
  1170 	}
  1171     }
  1172 }
  1173 
  1174 /*
  1175  * check_for_spaced_dash:
  1176  *
  1177  * Check for spaced dashes.
  1178  */
  1179 void check_for_spaced_dash(const char *aline)
  1180 {
  1181     const char *s;
  1182     if ((s=strstr(aline," -")))
  1183     {
  1184 	if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
  1185 	{
  1186 	    if (pswit[ECHO_SWITCH])
  1187 		g_print("\n%s\n",aline);
  1188 	    if (!pswit[OVERVIEW_SWITCH])
  1189 		g_print("    Line %ld column %ld - Spaced dash?\n",
  1190 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1191 	    else
  1192 		cnt_dash++;
  1193 	}
  1194     }
  1195     else if ((s=strstr(aline,"- ")))
  1196     {
  1197 	if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
  1198 	{
  1199 	    if (pswit[ECHO_SWITCH])
  1200 		g_print("\n%s\n",aline);
  1201 	    if (!pswit[OVERVIEW_SWITCH])
  1202 		g_print("    Line %ld column %ld - Spaced dash?\n",
  1203 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1204 	    else
  1205 		cnt_dash++;
  1206 	}
  1207     }
  1208 }
  1209 
  1210 /*
  1211  * check_for_unmarked_paragraphs:
  1212  *
  1213  * Check for unmarked paragraphs indicated by separate speakers.
  1214  *
  1215  * May well be false positive:
  1216  * "Bravo!" "Wonderful!" called the crowd.
  1217  * but useful all the same.
  1218  */
  1219 void check_for_unmarked_paragraphs(const char *aline)
  1220 {
  1221     const char *s;
  1222     s=strstr(aline,"\"  \"");
  1223     if (!s)
  1224 	s=strstr(aline,"\" \"");
  1225     if (s)
  1226     {
  1227 	if (pswit[ECHO_SWITCH])
  1228 	    g_print("\n%s\n",aline);
  1229 	if (!pswit[OVERVIEW_SWITCH])
  1230 	    g_print("    Line %ld column %ld - "
  1231 	      "Query missing paragraph break?\n",
  1232 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1233 	else
  1234 	    cnt_punct++;
  1235     }
  1236 }
  1237 
  1238 /*
  1239  * check_for_jeebies:
  1240  *
  1241  * Check for "to he" and other easy h/b errors.
  1242  *
  1243  * This is a very inadequate effort on the h/b problem,
  1244  * but the phrase "to he" is always an error, whereas "to
  1245  * be" is quite common.
  1246  * Similarly, '"Quiet!", be said.' is a non-be error
  1247  * "to he" is _not_ always an error!:
  1248  *       "Where they went to he couldn't say."
  1249  * Another false positive:
  1250  *       What would "Cinderella" be without the . . .
  1251  * and another: "If he wants to he can see for himself."
  1252  */
  1253 void check_for_jeebies(const char *aline)
  1254 {
  1255     const char *s;
  1256     s=strstr(aline," be could ");
  1257     if (!s)
  1258 	s=strstr(aline," be would ");
  1259     if (!s)
  1260 	s=strstr(aline," was be ");
  1261     if (!s)
  1262 	s=strstr(aline," be is ");
  1263     if (!s)
  1264 	s=strstr(aline," is be ");
  1265     if (!s)
  1266 	s=strstr(aline,"\", be ");
  1267     if (!s)
  1268 	s=strstr(aline,"\" be ");
  1269     if (!s)
  1270 	s=strstr(aline,"\" be ");
  1271     if (!s)
  1272 	s=strstr(aline," to he ");
  1273     if (s)
  1274     {
  1275 	if (pswit[ECHO_SWITCH])
  1276 	    g_print("\n%s\n",aline);
  1277 	if (!pswit[OVERVIEW_SWITCH])
  1278 	    g_print("    Line %ld column %ld - Query he/be error?\n",
  1279 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1280 	else
  1281 	    cnt_word++;
  1282     }
  1283     s=strstr(aline," the had ");
  1284     if (!s)
  1285 	s=strstr(aline," a had ");
  1286     if (!s)
  1287 	s=strstr(aline," they bad ");
  1288     if (!s)
  1289 	s=strstr(aline," she bad ");
  1290     if (!s)
  1291 	s=strstr(aline," he bad ");
  1292     if (!s)
  1293 	s=strstr(aline," you bad ");
  1294     if (!s)
  1295 	s=strstr(aline," i bad ");
  1296     if (s)
  1297     {
  1298 	if (pswit[ECHO_SWITCH])
  1299 	    g_print("\n%s\n",aline);
  1300 	if (!pswit[OVERVIEW_SWITCH])
  1301 	    g_print("    Line %ld column %ld - Query had/bad error?\n",
  1302 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1303 	else
  1304 	    cnt_word++;
  1305     }
  1306     s=strstr(aline,"; hut ");
  1307     if (!s)
  1308 	s=strstr(aline,", hut ");
  1309     if (s)
  1310     {
  1311 	if (pswit[ECHO_SWITCH])
  1312 	    g_print("\n%s\n",aline);
  1313 	if (!pswit[OVERVIEW_SWITCH])
  1314 	    g_print("    Line %ld column %ld - Query hut/but error?\n",
  1315 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1316 	else
  1317 	    cnt_word++;
  1318     }
  1319 }
  1320 
  1321 /*
  1322  * check_for_mta_from:
  1323  *
  1324  * Special case - angled bracket in front of "From" placed there by an
  1325  * MTA when sending an e-mail.
  1326  */
  1327 void check_for_mta_from(const char *aline)
  1328 {
  1329     const char *s;
  1330     s=strstr(aline,">From");
  1331     if (s)
  1332     {
  1333 	if (pswit[ECHO_SWITCH])
  1334 	    g_print("\n%s\n",aline);
  1335 	if (!pswit[OVERVIEW_SWITCH])
  1336 	    g_print("    Line %ld column %ld - "
  1337 	      "Query angled bracket with From\n",
  1338 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1339 	else
  1340 	    cnt_punct++;
  1341     }
  1342 }
  1343 
  1344 /*
  1345  * check_for_orphan_character:
  1346  *
  1347  * Check for a single character line -
  1348  * often an overflow from bad wrapping.
  1349  */
  1350 void check_for_orphan_character(const char *aline)
  1351 {
  1352     gunichar c;
  1353     c=g_utf8_get_char(aline);
  1354     if (c && !*g_utf8_next_char(aline))
  1355     {
  1356 	if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
  1357 	    ; /* Nothing - ignore numerals alone on a line. */
  1358 	else
  1359 	{
  1360 	    if (pswit[ECHO_SWITCH])
  1361 		g_print("\n%s\n",aline);
  1362 	    if (!pswit[OVERVIEW_SWITCH])
  1363 		g_print("    Line %ld column 1 - Query single character line\n",
  1364 		  linecnt);
  1365 	    else
  1366 		cnt_punct++;
  1367 	}
  1368     }
  1369 }
  1370 
  1371 /*
  1372  * check_for_pling_scanno:
  1373  *
  1374  * Check for I" - often should be !
  1375  */
  1376 void check_for_pling_scanno(const char *aline)
  1377 {
  1378     const char *s;
  1379     s=strstr(aline," I\"");
  1380     if (s)
  1381     {
  1382 	if (pswit[ECHO_SWITCH])
  1383 	    g_print("\n%s\n",aline);
  1384 	if (!pswit[OVERVIEW_SWITCH])
  1385 	    g_print("    Line %ld column %ld - Query I=exclamation mark?\n",
  1386 	      linecnt,g_utf8_pointer_to_offset(aline,s));
  1387 	else
  1388 	    cnt_punct++;
  1389     }
  1390 }
  1391 
  1392 /*
  1393  * check_for_extra_period:
  1394  *
  1395  * Check for period without a capital letter. Cut-down from gutspell.
  1396  * Only works when it happens on a single line.
  1397  */
  1398 void check_for_extra_period(const char *aline,const struct warnings *warnings)
  1399 {
  1400     const char *s,*t,*s1;
  1401     int i;
  1402     gsize len;
  1403     gboolean istypo;
  1404     gchar *testword;
  1405     gunichar *decomposition;
  1406     if (pswit[PARANOID_SWITCH])
  1407     {
  1408 	for (t=aline;t=strstr(t,". ");)
  1409 	{
  1410 	    if (t==aline)
  1411 	    {
  1412 		t=g_utf8_next_char(t);
  1413 		/* start of line punctuation is handled elsewhere */
  1414 		continue;
  1415 	    }
  1416 	    if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
  1417 	    {
  1418 		t=g_utf8_next_char(t);
  1419 		continue;
  1420 	    }
  1421 	    if (warnings->isDutch)
  1422 	    {
  1423 		/* For Frank & Jeroen -- 's Middags case */
  1424 		gunichar c2,c3,c4,c5;
  1425 		c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
  1426 		c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
  1427 		c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
  1428 		c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
  1429 		if (c2==CHAR_SQUOTE && g_unichar_islower(c3) &&
  1430 		  c4==CHAR_SPACE && g_unichar_isupper(c5))
  1431 		{
  1432 		    t=g_utf8_next_char(t);
  1433 		    continue;
  1434 		}
  1435 	    }
  1436 	    s1=g_utf8_next_char(g_utf8_next_char(t));
  1437 	    while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
  1438 	      !isdigit(g_utf8_get_char(s1)))
  1439 		s1=g_utf8_next_char(s1);
  1440 	    if (g_unichar_islower(g_utf8_get_char(s1)))
  1441 	    {
  1442 		/* we have something to investigate */
  1443 		istypo=TRUE;
  1444 		/* so let's go back and find out */
  1445 		for (s1=g_utf8_prev_char(t);s1>=aline &&
  1446 		  (g_unichar_isalpha(g_utf8_get_char(s1)) ||
  1447 		  g_unichar_isdigit(g_utf8_get_char(s1)) ||
  1448 		  g_utf8_get_char(s1)==CHAR_SQUOTE &&
  1449 		  g_unichar_isalpha(g_utf8_get_char(g_utf8_next_char(s1))) &&
  1450 		  g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s1))));
  1451 		  s1=g_utf8_prev_char(s1))
  1452 		    ;
  1453 		s1=g_utf8_next_char(s1);
  1454 		s=strchr(s1,'.');
  1455 		if (s)
  1456 		    testword=g_strndup(s1,s-s1);
  1457 		else
  1458 		    testword=g_strdup(s1);
  1459 		for (i=0;*abbrev[i];i++)
  1460 		    if (!strcmp(testword,abbrev[i]))
  1461 			istypo=FALSE;
  1462 		if (g_unichar_isdigit(g_utf8_get_char(testword)))
  1463 		    istypo=FALSE;
  1464 		if (!*g_utf8_next_char(testword))
  1465 		    istypo=FALSE;
  1466 		if (isroman(testword))
  1467 		    istypo=FALSE;
  1468 		if (istypo)
  1469 		{
  1470 		    istypo=FALSE;
  1471 		    for (s=testword;*s;s=g_utf8_next_char(s))
  1472 		    {
  1473 			decomposition=g_unicode_canonical_decomposition(
  1474 			  g_utf8_get_char(s),&len);
  1475 			if (g_utf8_strchr("aeiou",-1,decomposition[0]))
  1476 			    istypo=TRUE;
  1477 			g_free(decomposition);
  1478 		    }
  1479 		}
  1480 		if (istypo &&
  1481 		  (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
  1482 		{
  1483 		    g_tree_insert(qperiod,g_strdup(testword),
  1484 		      GINT_TO_POINTER(1));
  1485 		    if (pswit[ECHO_SWITCH])
  1486 			g_print("\n%s\n",aline);
  1487 		    if (!pswit[OVERVIEW_SWITCH])
  1488 			g_print("    Line %ld column %ld - Extra period?\n",
  1489 			  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  1490 		    else
  1491 			cnt_punct++;
  1492 		}
  1493 		g_free(testword);
  1494 	    }
  1495 	    t=g_utf8_next_char(t);
  1496 	}
  1497     }
  1498 }
  1499 
  1500 /*
  1501  * check_for_following_punctuation:
  1502  *
  1503  * Check for words usually not followed by punctuation.
  1504  */
  1505 void check_for_following_punctuation(const char *aline)
  1506 {
  1507     int i;
  1508     const char *s,*wordstart;
  1509     gunichar c;
  1510     gchar *inword,*t;
  1511     if (pswit[TYPO_SWITCH])
  1512     {
  1513 	for (s=aline;*s;)
  1514 	{
  1515 	    wordstart=s;
  1516 	    t=getaword(&s);
  1517 	    if (!*t)
  1518 	    {
  1519 		g_free(t);
  1520 		continue;
  1521 	    }
  1522 	    inword=g_utf8_strdown(t,-1);
  1523 	    g_free(t);
  1524 	    for (i=0;*nocomma[i];i++)
  1525 		if (!strcmp(inword,nocomma[i]))
  1526 		{
  1527 		    c=g_utf8_get_char(s);
  1528 		    if (c==',' || c==';' || c==':')
  1529 		    {
  1530 			if (pswit[ECHO_SWITCH])
  1531 			    g_print("\n%s\n",aline);
  1532 			if (!pswit[OVERVIEW_SWITCH])
  1533 			    g_print("    Line %ld column %ld - "
  1534 			      "Query punctuation after %s?\n",
  1535 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,
  1536 			      inword);
  1537 			else
  1538 			    cnt_punct++;
  1539 		    }
  1540 		}
  1541 	    for (i=0;*noperiod[i];i++)
  1542 		if (!strcmp(inword,noperiod[i]))
  1543 		{
  1544 		    c=g_utf8_get_char(s);
  1545 		    if (c=='.' || c=='!')
  1546 		    {
  1547 			if (pswit[ECHO_SWITCH])
  1548 			    g_print("\n%s\n",aline);
  1549 			if (!pswit[OVERVIEW_SWITCH])
  1550 			    g_print("    Line %ld column %ld - "
  1551 			      "Query punctuation after %s?\n",
  1552 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,
  1553 			      inword);
  1554 			else
  1555 			    cnt_punct++;
  1556 		    }
  1557 		}
  1558 	    g_free(inword);
  1559 	}
  1560     }
  1561 }
  1562 
  1563 /*
  1564  * check_for_typos:
  1565  *
  1566  * Check for commonly mistyped words,
  1567  * and digits like 0 for O in a word.
  1568  */
  1569 void check_for_typos(const char *aline,struct warnings *warnings)
  1570 {
  1571     const char *s,*t,*nt,*wordstart;
  1572     gchar *inword;
  1573     gunichar *decomposition;
  1574     gchar *testword;
  1575     int i,vowel,consonant,*dupcnt;
  1576     gboolean isdup,istypo,alower;
  1577     gunichar c;
  1578     long offset,len;
  1579     gsize decomposition_len;
  1580     for (s=aline;*s;)
  1581     {
  1582 	wordstart=s;
  1583 	inword=getaword(&s);
  1584 	if (!*inword)
  1585 	{
  1586 	    g_free(inword);
  1587 	    continue; /* don't bother with empty lines */
  1588 	}
  1589 	if (mixdigit(inword))
  1590 	{
  1591 	    if (pswit[ECHO_SWITCH])
  1592 		g_print("\n%s\n",aline);
  1593 	    if (!pswit[OVERVIEW_SWITCH])
  1594 		g_print("    Line %ld column %ld - Query digit in %s\n",
  1595 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
  1596 	    else
  1597 		cnt_word++;
  1598 	}
  1599 	/*
  1600 	 * Put the word through a series of tests for likely typos and OCR
  1601 	 * errors.
  1602 	 */
  1603 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
  1604 	{
  1605 	    istypo=FALSE;
  1606 	    alower=FALSE;
  1607 	    for (t=inword;*t;t=g_utf8_next_char(t))
  1608 	    {
  1609 		c=g_utf8_get_char(t);
  1610 		nt=g_utf8_next_char(t);
  1611 		/* lowercase for testing */
  1612 		if (g_unichar_islower(c))
  1613 		    alower=TRUE;
  1614 		if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
  1615 		{
  1616 		    /*
  1617 		     * We have an uppercase mid-word. However, there are
  1618 		     * common cases:
  1619 		     *   Mac and Mc like McGill
  1620 		     *   French contractions like l'Abbe
  1621 		     */
  1622 		    offset=g_utf8_pointer_to_offset(inword,t);
  1623 		    if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
  1624 		      offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
  1625 		      g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
  1626 		      offset>0 &&
  1627 		      g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SQUOTE)
  1628 			; /* do nothing! */
  1629 		    else
  1630 			istypo=TRUE;
  1631 		}
  1632 	    }
  1633 	    testword=g_utf8_casefold(inword,-1);
  1634 	}
  1635 	if (pswit[TYPO_SWITCH])
  1636 	{
  1637 	    /*
  1638 	     * Check for certain unlikely two-letter combinations at word
  1639 	     * start and end.
  1640 	     */
  1641 	    len=g_utf8_strlen(testword,-1);
  1642 	    if (len>1)
  1643 	    {
  1644 		for (i=0;*nostart[i];i++)
  1645 		    if (g_str_has_prefix(testword,nostart[i]))
  1646 			istypo=TRUE;
  1647 		for (i=0;*noend[i];i++)
  1648 		    if (g_str_has_suffix(testword,noend[i]))
  1649 			istypo=TRUE;
  1650 	    }
  1651 	    /* ght is common, gbt never. Like that. */
  1652 	    if (strstr(testword,"cb"))
  1653 		istypo=TRUE;
  1654 	    if (strstr(testword,"gbt"))
  1655 		istypo=TRUE;
  1656 	    if (strstr(testword,"pbt"))
  1657 		istypo=TRUE;
  1658 	    if (strstr(testword,"tbs"))
  1659 		istypo=TRUE;
  1660 	    if (strstr(testword,"mrn"))
  1661 		istypo=TRUE;
  1662 	    if (strstr(testword,"ahle"))
  1663 		istypo=TRUE;
  1664 	    if (strstr(testword,"ihle"))
  1665 		istypo=TRUE;
  1666 	    /*
  1667 	     * "TBE" does happen - like HEARTBEAT - but uncommon.
  1668 	     * Also "TBI" - frostbite, outbid - but uncommon.
  1669 	     * Similarly "ii" like Hawaii, or Pompeii, and in Roman
  1670 	     * numerals, but "ii" is a common scanno.
  1671 	     */
  1672 	    if (strstr(testword,"tbi"))
  1673 		istypo=TRUE;
  1674 	    if (strstr(testword,"tbe"))
  1675 		istypo=TRUE;
  1676 	    if (strstr(testword,"ii"))
  1677 		istypo=TRUE;
  1678 	    /*
  1679 	     * Check for no vowels or no consonants.
  1680 	     * If none, flag a typo.
  1681 	     */
  1682 	    if (!istypo && len>1)
  1683 	    {
  1684 		vowel=consonant=0;
  1685 		for (t=testword;*t;t=g_utf8_next_char(t))
  1686 		{
  1687 		    c=g_utf8_get_char(t);
  1688 		    decomposition=
  1689 		      g_unicode_canonical_decomposition(c,&decomposition_len);
  1690 		    if (c=='y' || g_unichar_isdigit(c))
  1691 		    {
  1692 			/* Yah, this is loose. */
  1693 			vowel++;
  1694 			consonant++;
  1695 		    }
  1696 		    else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
  1697 			vowel++;
  1698 		    else
  1699 			consonant++;
  1700 		    g_free(decomposition);
  1701 		}
  1702 		if (!vowel || !consonant)
  1703 		    istypo=TRUE;
  1704 	    }
  1705 	    /*
  1706 	     * Now exclude the word from being reported if it's in
  1707 	     * the okword list.
  1708 	     */
  1709 	    for (i=0;*okword[i];i++)
  1710 		if (!strcmp(testword,okword[i]))
  1711 		    istypo=FALSE;
  1712 	    /*
  1713 	     * What looks like a typo may be a Roman numeral.
  1714 	     * Exclude these.
  1715 	     */
  1716 	    if (istypo && isroman(testword))
  1717 		istypo=FALSE;
  1718 	    /* Check the manual list of typos. */
  1719 	    if (!istypo)
  1720 		for (i=0;*typo[i];i++)
  1721 		    if (!strcmp(testword,typo[i]))
  1722 			istypo=TRUE;
  1723 	    /*
  1724 	     * Check lowercase s, l, i and m - special cases.
  1725 	     *   "j" - often a semi-colon gone wrong.
  1726 	     *   "d" for a missing apostrophe - he d
  1727 	     *   "n" for "in"
  1728 	     */
  1729 	    if (!istypo && len==1 &&
  1730 	      g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
  1731 		istypo=TRUE;
  1732 	    if (istypo)
  1733 	    {
  1734 		dupcnt=g_tree_lookup(qword,testword);
  1735 		if (dupcnt)
  1736 		{
  1737 		    (*dupcnt)++;
  1738 		    isdup=!pswit[VERBOSE_SWITCH];
  1739 		}
  1740 		else
  1741 		{
  1742 		    dupcnt=g_new0(int,1);
  1743 		    g_tree_insert(qword,g_strdup(testword),dupcnt);
  1744 		    isdup=FALSE;
  1745 		}
  1746 		if (!isdup)
  1747 		{
  1748 		    if (pswit[ECHO_SWITCH])
  1749 			g_print("\n%s\n",aline);
  1750 		    if (!pswit[OVERVIEW_SWITCH])
  1751 		    {
  1752 			g_print("    Line %ld column %ld - Query word %s",
  1753 			  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
  1754 			  inword);
  1755 			if (!pswit[VERBOSE_SWITCH])
  1756 			    g_print(" - not reporting duplicates");
  1757 			g_print("\n");
  1758 		    }
  1759 		    else
  1760 			cnt_word++;
  1761 		}
  1762 	    }
  1763 	}
  1764 	/* check the user's list of typos */
  1765 	if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
  1766 	{
  1767 	    if (pswit[ECHO_SWITCH])
  1768 		g_print("\n%s\n",aline);
  1769 	    if (!pswit[OVERVIEW_SWITCH])  
  1770 		g_print("    Line %ld column %ld - Query possible scanno %s\n",
  1771 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
  1772 	}
  1773 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
  1774 	    g_free(testword);
  1775 	if (pswit[PARANOID_SWITCH] && warnings->digit)
  1776 	{
  1777 	    /* In paranoid mode, query all 0 and 1 standing alone. */
  1778 	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))
  1779 	    {
  1780 		if (pswit[ECHO_SWITCH])
  1781 		    g_print("\n%s\n",aline);
  1782 		if (!pswit[OVERVIEW_SWITCH])
  1783 		    g_print("    Line %ld column %ld - Query standalone %s\n",
  1784 		      linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
  1785 		      inword);
  1786 		else
  1787 		    cnt_word++;
  1788 	    }
  1789 	}
  1790 	g_free(inword);
  1791     }
  1792 }
  1793 
  1794 /*
  1795  * check_for_misspaced_punctuation:
  1796  *
  1797  * Look for added or missing spaces around punctuation and quotes.
  1798  * If there is a punctuation character like ! with no space on
  1799  * either side, suspect a missing!space. If there are spaces on
  1800  * both sides , assume a typo. If we see a double quote with no
  1801  * space or punctuation on either side of it, assume unspaced
  1802  * quotes "like"this.
  1803  */
  1804 void check_for_misspaced_punctuation(const char *aline,
  1805   struct parities *parities,gboolean isemptyline)
  1806 {
  1807     gboolean isacro,isellipsis;
  1808     const char *s;
  1809     gunichar c,nc,pc,n2c;
  1810     c=g_utf8_get_char(aline);
  1811     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  1812     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  1813     {
  1814 	pc=c;
  1815 	c=nc;
  1816 	nc=g_utf8_get_char(g_utf8_next_char(s));
  1817 	/* For each character in the line after the first. */
  1818 	if (g_utf8_strchr(".?!,;:_",-1,c))  /* if it's punctuation */
  1819 	{
  1820 	    /* we need to suppress warnings for acronyms like M.D. */
  1821 	    isacro=FALSE;
  1822 	    /* we need to suppress warnings for ellipsis . . . */
  1823 	    isellipsis=FALSE;
  1824 	    /*
  1825 	     * If there are letters on both sides of it or
  1826 	     * if it's strict punctuation followed by an alpha.
  1827 	     */
  1828 	    if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
  1829 	      g_utf8_strchr("?!,;:",-1,c)))
  1830 	    {
  1831 		if (c=='.')
  1832 		{
  1833 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&
  1834 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
  1835 			isacro=TRUE;
  1836 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
  1837 		    if (nc && n2c=='.')
  1838 			isacro=TRUE;
  1839 		}
  1840 		if (!isacro)
  1841 		{
  1842 		    if (pswit[ECHO_SWITCH])
  1843 			g_print("\n%s\n",aline);
  1844 		    if (!pswit[OVERVIEW_SWITCH])
  1845 			g_print("    Line %ld column %ld - Missing space?\n",
  1846 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1847 		    else
  1848 			cnt_punct++;
  1849 		}
  1850 	    }
  1851 	    if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
  1852 	    {
  1853 		/*
  1854 		 * If there are spaces on both sides,
  1855 		 * or space before and end of line.
  1856 		 */
  1857 		if (c=='.')
  1858 		{
  1859 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&
  1860 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
  1861 			isellipsis=TRUE;
  1862 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
  1863 		    if (nc && n2c=='.')
  1864 			isellipsis=TRUE;
  1865 		}
  1866 		if (!isemptyline && !isellipsis)
  1867 		{
  1868 		    if (pswit[ECHO_SWITCH])
  1869 			g_print("\n%s\n",aline);
  1870 		    if (!pswit[OVERVIEW_SWITCH])
  1871 			g_print("    Line %ld column %ld - "
  1872 			  "Spaced punctuation?\n",linecnt,
  1873 			  g_utf8_pointer_to_offset(aline,s)+1);
  1874 		    else
  1875 			cnt_punct++;
  1876 		}
  1877 	    }
  1878 	}
  1879     }
  1880     /* Split out the characters that CANNOT be preceded by space. */
  1881     c=g_utf8_get_char(aline);
  1882     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  1883     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  1884     {
  1885 	pc=c;
  1886 	c=nc;
  1887 	nc=g_utf8_get_char(g_utf8_next_char(s));
  1888 	/* for each character in the line after the first */
  1889 	if (g_utf8_strchr("?!,;:",-1,c))
  1890 	{
  1891 	    /* if it's punctuation that _cannot_ have a space before it */
  1892 	    if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
  1893 	    {
  1894 		/*
  1895 		 * If nc DOES == space,
  1896 		 * it was already reported just above.
  1897 		 */
  1898 		if (pswit[ECHO_SWITCH])
  1899 		    g_print("\n%s\n",aline);
  1900 		if (!pswit[OVERVIEW_SWITCH])
  1901 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",
  1902 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1903 		else
  1904 		    cnt_punct++;
  1905 	    }
  1906 	}
  1907     }
  1908     /*
  1909      * Special case " .X" where X is any alpha.
  1910      * This plugs a hole in the acronym code above.
  1911      * Inelegant, but maintainable.
  1912      */
  1913     c=g_utf8_get_char(aline);
  1914     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  1915     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  1916     {
  1917 	pc=c;
  1918 	c=nc;
  1919 	nc=g_utf8_get_char(g_utf8_next_char(s));
  1920 	/* for each character in the line after the first */
  1921 	if (c=='.')
  1922 	{
  1923 	    /* if it's a period */
  1924 	    if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
  1925 	    {
  1926 		/*
  1927 		 * If the period follows a space and
  1928 		 * is followed by a letter.
  1929 		 */
  1930 		if (pswit[ECHO_SWITCH])
  1931 		    g_print("\n%s\n",aline);
  1932 		if (!pswit[OVERVIEW_SWITCH])
  1933 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",
  1934 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1935 		else
  1936 		    cnt_punct++;
  1937 	    }
  1938 	}
  1939     }
  1940     c=g_utf8_get_char(aline);
  1941     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  1942     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  1943     {
  1944 	pc=c;
  1945 	c=nc;
  1946 	nc=g_utf8_get_char(g_utf8_next_char(s));
  1947 	/* for each character in the line after the first */
  1948 	if (c==CHAR_DQUOTE)
  1949 	{
  1950 	    if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
  1951 	      !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
  1952 	      !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
  1953 	    {
  1954 		if (pswit[ECHO_SWITCH])
  1955 		    g_print("\n%s\n",aline);
  1956 		if (!pswit[OVERVIEW_SWITCH])
  1957 		    g_print("    Line %ld column %ld - Unspaced quotes?\n",
  1958 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1959 		else
  1960 		    cnt_punct++;
  1961 	    }
  1962 	}
  1963     }
  1964     /* Check parity of quotes. */
  1965     nc=g_utf8_get_char(aline);
  1966     for (s=aline;*s;s=g_utf8_next_char(s))
  1967     {
  1968 	c=nc;
  1969 	nc=g_utf8_get_char(g_utf8_next_char(s));
  1970 	if (c==CHAR_DQUOTE)
  1971 	{
  1972 	    parities->dquote=!parities->dquote;
  1973 	    if (!parities->dquote)
  1974 	    {
  1975 		/* parity even */
  1976 		if (!g_utf8_strchr("_-.'`/,;:!?)]} ",-1,nc))
  1977 		{
  1978 		    if (pswit[ECHO_SWITCH])
  1979 			g_print("\n%s\n",aline);
  1980 		    if (!pswit[OVERVIEW_SWITCH])
  1981 			g_print("    Line %ld column %ld - "
  1982 			  "Wrongspaced quotes?\n",
  1983 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1984 		    else
  1985 			cnt_punct++;
  1986 		}
  1987 	    }
  1988 	    else
  1989 	    {
  1990 		/* parity odd */
  1991 		if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
  1992 		  !g_utf8_strchr("_-/.'`([{$",-1,nc) || !nc)
  1993 		{
  1994 		    if (pswit[ECHO_SWITCH])
  1995 			g_print("\n%s\n",aline);
  1996 		    if (!pswit[OVERVIEW_SWITCH])
  1997 			g_print("    Line %ld column %ld - "
  1998 			  "Wrongspaced quotes?\n",
  1999 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2000 		    else
  2001 			cnt_punct++;
  2002 		}
  2003 	    }
  2004 	}
  2005     }
  2006     if (g_utf8_get_char(aline)==CHAR_DQUOTE)
  2007     {
  2008 	if (g_utf8_strchr(",;:!?)]} ",-1,
  2009 	  g_utf8_get_char(g_utf8_next_char(aline))))
  2010 	{
  2011 	    if (pswit[ECHO_SWITCH])
  2012 		g_print("\n%s\n",aline);
  2013 	    if (!pswit[OVERVIEW_SWITCH])
  2014 		g_print("    Line %ld column 1 - Wrongspaced quotes?\n",
  2015 		  linecnt);
  2016 	    else
  2017 		cnt_punct++;
  2018 	}
  2019     }
  2020     if (pswit[SQUOTE_SWITCH])
  2021     {
  2022 	nc=g_utf8_get_char(aline);
  2023 	for (s=aline;*s;s=g_utf8_next_char(s))
  2024 	{
  2025 	    c=nc;
  2026 	    nc=g_utf8_get_char(g_utf8_next_char(s));
  2027 	    if ((c==CHAR_SQUOTE || c==CHAR_OPEN_SQUOTE) && (s==aline ||
  2028 	      s>aline &&
  2029 	      !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
  2030 	      !g_unichar_isalpha(nc)))
  2031 	    {
  2032 		parities->squote=!parities->squote;
  2033 		if (!parities->squote)
  2034 		{
  2035 		    /* parity even */
  2036 		    if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
  2037 		    {
  2038 			if (pswit[ECHO_SWITCH])
  2039 			    g_print("\n%s\n",aline);
  2040 			if (!pswit[OVERVIEW_SWITCH])
  2041 			    g_print("    Line %ld column %ld - "
  2042 			      "Wrongspaced singlequotes?\n",
  2043 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2044 			else
  2045 			    cnt_punct++;
  2046 		    }
  2047 		}
  2048 		else
  2049 		{
  2050 		    /* parity odd */
  2051 		    if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
  2052 		      !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
  2053 		    {
  2054 			if (pswit[ECHO_SWITCH])
  2055 			    g_print("\n%s\n",aline);
  2056 			if (!pswit[OVERVIEW_SWITCH])
  2057 			    g_print("    Line %ld column %ld - "
  2058 			      "Wrongspaced singlequotes?\n",
  2059 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2060 			else
  2061 			    cnt_punct++;
  2062 		    }
  2063 		}
  2064 	    }
  2065 	}
  2066     }
  2067 }
  2068 
  2069 /*
  2070  * check_for_double_punctuation:
  2071  *
  2072  * Look for double punctuation like ,. or ,,
  2073  * Thanks to DW for the suggestion!
  2074  * In books with references, ".," and ".;" are common
  2075  * e.g. "etc., etc.," and vol. 1.; vol 3.;
  2076  * OTOH, from my initial tests, there are also fairly
  2077  * common errors. What to do? Make these cases paranoid?
  2078  * ".," is the most common, so warnings->dotcomma is used
  2079  * to suppress detailed reporting if it occurs often.
  2080  */
  2081 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
  2082 {
  2083     const char *s;
  2084     gunichar c,nc;
  2085     nc=g_utf8_get_char(aline);
  2086     for (s=aline;*s;s=g_utf8_next_char(s))
  2087     {
  2088 	c=nc;
  2089 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2090 	/* for each punctuation character in the line */
  2091 	if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
  2092 	  g_utf8_strchr(".?!,;:",-1,nc))
  2093 	{
  2094 	    /* followed by punctuation, it's a query, unless . . . */
  2095 	    if (c==nc && (c=='.' || c=='?' || c=='!') ||
  2096 	      !warnings->dotcomma && c=='.' && nc==',' ||
  2097 	      warnings->isFrench && g_str_has_prefix(s,",...") ||
  2098 	      warnings->isFrench && g_str_has_prefix(s,"...,") ||
  2099 	      warnings->isFrench && g_str_has_prefix(s,";...") ||
  2100 	      warnings->isFrench && g_str_has_prefix(s,"...;") ||
  2101 	      warnings->isFrench && g_str_has_prefix(s,":...") ||
  2102 	      warnings->isFrench && g_str_has_prefix(s,"...:") ||
  2103 	      warnings->isFrench && g_str_has_prefix(s,"!...") ||
  2104 	      warnings->isFrench && g_str_has_prefix(s,"...!") ||
  2105 	      warnings->isFrench && g_str_has_prefix(s,"?...") ||
  2106 	      warnings->isFrench && g_str_has_prefix(s,"...?"))
  2107 	    {
  2108 		if (warnings->isFrench && g_str_has_prefix(s,",...") ||
  2109 		  warnings->isFrench && g_str_has_prefix(s,"...,") ||
  2110 		  warnings->isFrench && g_str_has_prefix(s,";...") ||
  2111 		  warnings->isFrench && g_str_has_prefix(s,"...;") ||
  2112 		  warnings->isFrench && g_str_has_prefix(s,":...") ||
  2113 		  warnings->isFrench && g_str_has_prefix(s,"...:") ||
  2114 		  warnings->isFrench && g_str_has_prefix(s,"!...") ||
  2115 		  warnings->isFrench && g_str_has_prefix(s,"...!") ||
  2116 		  warnings->isFrench && g_str_has_prefix(s,"?...") ||
  2117 		  warnings->isFrench && g_str_has_prefix(s,"...?"))
  2118 		{
  2119 		    s+=4;
  2120 		    nc=g_utf8_get_char(g_utf8_next_char(s));
  2121 		}
  2122 		; /* do nothing for .. !! and ?? which can be legit */
  2123 	    }
  2124 	    else
  2125 	    {
  2126 		if (pswit[ECHO_SWITCH])
  2127 		    g_print("\n%s\n",aline);
  2128 		if (!pswit[OVERVIEW_SWITCH])
  2129 		    g_print("    Line %ld column %ld - Double punctuation?\n",
  2130 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2131 		else
  2132 		    cnt_punct++;
  2133 	    }
  2134 	}
  2135     }
  2136 }
  2137 
  2138 /*
  2139  * check_for_spaced_quotes:
  2140  */
  2141 void check_for_spaced_quotes(const char *aline)
  2142 {
  2143     const char *s,*t;
  2144     s=aline;
  2145     while ((t=strstr(s," \" ")))
  2146     {
  2147 	if (pswit[ECHO_SWITCH])
  2148 	    g_print("\n%s\n",aline);
  2149 	if (!pswit[OVERVIEW_SWITCH])
  2150 	    g_print("    Line %ld column %ld - Spaced doublequote?\n",
  2151 	      linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  2152 	else
  2153 	    cnt_punct++;
  2154 	s=g_utf8_next_char(g_utf8_next_char(t));
  2155     }
  2156     s=aline;
  2157     while ((t=strstr(s," ' ")))
  2158     {
  2159 	if (pswit[ECHO_SWITCH])
  2160 	    g_print("\n%s\n",aline);
  2161 	if (!pswit[OVERVIEW_SWITCH])
  2162 	    g_print("    Line %ld column %ld - Spaced singlequote?\n",
  2163 	      linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  2164 	else
  2165 	    cnt_punct++;
  2166 	s=g_utf8_next_char(g_utf8_next_char(t));
  2167     }
  2168     s=aline;
  2169     while ((t=strstr(s," ` ")))
  2170     {
  2171 	if (pswit[ECHO_SWITCH])
  2172 	    g_print("\n%s\n",aline);
  2173 	if (!pswit[OVERVIEW_SWITCH])
  2174 	    g_print("    Line %ld column %ld - Spaced singlequote?\n",
  2175 	      linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  2176 	else
  2177 	    cnt_punct++;
  2178 	s=g_utf8_next_char(g_utf8_next_char(t));
  2179     }
  2180 }
  2181 
  2182 /*
  2183  * check_for_miscased_genative:
  2184  *
  2185  * Check special case of 'S instead of 's at end of word.
  2186  */
  2187 void check_for_miscased_genative(const char *aline)
  2188 {
  2189     const char *s;
  2190     gunichar c,nc,pc;
  2191     if (!*aline)
  2192 	return;
  2193     c=g_utf8_get_char(aline);
  2194     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2195     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2196     {
  2197 	pc=c;
  2198 	c=nc;
  2199 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2200 	if (c==CHAR_SQUOTE && nc=='S' && g_unichar_islower(pc))
  2201 	{
  2202 	    if (pswit[ECHO_SWITCH])
  2203 		g_print("\n%s\n",aline);
  2204 	    if (!pswit[OVERVIEW_SWITCH])
  2205 		g_print("    Line %ld column %ld - Capital \"S\"?\n",
  2206 		  linecnt,g_utf8_pointer_to_offset(aline,s)+2);
  2207 	    else
  2208 		cnt_punct++;
  2209 	}
  2210     }
  2211 }
  2212 
  2213 /*
  2214  * check_end_of_line:
  2215  *
  2216  * Now check special cases - start and end of line -
  2217  * for single and double quotes. Start is sometimes [sic]
  2218  * but better to query it anyway.
  2219  * While we're here, check for dash at end of line.
  2220  */
  2221 void check_end_of_line(const char *aline,struct warnings *warnings)
  2222 {
  2223     int lbytes;
  2224     const char *s;
  2225     gunichar c1,c2;
  2226     lbytes=strlen(aline);
  2227     if (g_utf8_strlen(aline,lbytes)>1)
  2228     {
  2229 	s=g_utf8_prev_char(aline+lbytes);
  2230 	c1=g_utf8_get_char(s);
  2231 	c2=g_utf8_get_char(g_utf8_prev_char(s));
  2232 	if ((c1==CHAR_DQUOTE || c1==CHAR_SQUOTE || c1==CHAR_OPEN_SQUOTE) &&
  2233 	  c2==CHAR_SPACE)
  2234 	{
  2235 	    if (pswit[ECHO_SWITCH])
  2236 		g_print("\n%s\n",aline);
  2237 	    if (!pswit[OVERVIEW_SWITCH])
  2238 		g_print("    Line %ld column %ld - Spaced quote?\n",linecnt,
  2239 		  g_utf8_strlen(aline,lbytes));
  2240 	    else
  2241 		cnt_punct++;
  2242 	}
  2243 	c1=g_utf8_get_char(aline);
  2244 	c2=g_utf8_get_char(g_utf8_next_char(aline));
  2245 	if ((c1==CHAR_SQUOTE || c1==CHAR_OPEN_SQUOTE) && c2==CHAR_SPACE)
  2246 	{
  2247 	    if (pswit[ECHO_SWITCH])
  2248 		g_print("\n%s\n",aline);
  2249 	    if (!pswit[OVERVIEW_SWITCH])
  2250 		g_print("    Line %ld column 1 - Spaced quote?\n",linecnt);
  2251 	    else
  2252 		cnt_punct++;
  2253 	}
  2254 	/*
  2255 	 * Dash at end of line may well be legit - paranoid mode only
  2256 	 * and don't report em-dash at line-end.
  2257 	 */
  2258 	if (pswit[PARANOID_SWITCH] && warnings->hyphen)
  2259 	{
  2260 	    for (s=g_utf8_prev_char(aline+lbytes);
  2261 	      s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
  2262 		;
  2263 	    if (g_utf8_get_char(s)=='-' &&
  2264 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')
  2265 	    {
  2266 		if (pswit[ECHO_SWITCH])
  2267 		    g_print("\n%s\n",aline);
  2268 		if (!pswit[OVERVIEW_SWITCH])
  2269 		    g_print("    Line %ld column %ld - "
  2270 		      "Hyphen at end of line?\n",
  2271 		      linecnt,g_utf8_pointer_to_offset(aline,s));
  2272 	    }
  2273 	}
  2274     }
  2275 }
  2276 
  2277 /*
  2278  * check_for_unspaced_bracket:
  2279  *
  2280  * Brackets are often unspaced, but shouldn't be surrounded by alpha.
  2281  * If so, suspect a scanno like "a]most".
  2282  */
  2283 void check_for_unspaced_bracket(const char *aline)
  2284 {
  2285     const char *s;
  2286     gunichar c,nc,pc;
  2287     c=g_utf8_get_char(aline);
  2288     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2289     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2290     {
  2291 	pc=c;
  2292 	c=nc;
  2293 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2294 	if (!nc)
  2295 	    break;
  2296 	/* for each bracket character in the line except 1st & last */
  2297 	if (g_utf8_strchr("{[()]}",-1,c) &&
  2298 	  g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
  2299 	{
  2300 	    if (pswit[ECHO_SWITCH])
  2301 		g_print("\n%s\n",aline);
  2302 	    if (!pswit[OVERVIEW_SWITCH])
  2303 		g_print("    Line %ld column %ld - Unspaced bracket?\n",
  2304 		  linecnt,g_utf8_pointer_to_offset(aline,s));
  2305 	    else
  2306 		cnt_punct++;
  2307 	}
  2308     }
  2309 }
  2310 
  2311 /*
  2312  * check_for_unpunctuated_endquote:
  2313  */
  2314 void check_for_unpunctuated_endquote(const char *aline)
  2315 {
  2316     const char *s;
  2317     gunichar c,nc,pc;
  2318     c=g_utf8_get_char(aline);
  2319     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  2320     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  2321     {
  2322 	pc=c;
  2323 	c=nc;
  2324 	nc=g_utf8_get_char(g_utf8_next_char(s));
  2325 	/* for each character in the line except 1st */
  2326 	if (c==CHAR_DQUOTE && isalpha(pc))
  2327 	{
  2328 	    if (pswit[ECHO_SWITCH])
  2329 		g_print("\n%s\n",aline);
  2330 	    if (!pswit[OVERVIEW_SWITCH])
  2331 		g_print("    Line %ld column %ld - "
  2332 		  "endquote missing punctuation?\n",
  2333 		  linecnt,g_utf8_pointer_to_offset(aline,s));
  2334 	    else
  2335 		cnt_punct++;
  2336 	}
  2337     }
  2338 }
  2339 
  2340 /*
  2341  * check_for_html_tag:
  2342  *
  2343  * Check for <HTML TAG>.
  2344  *
  2345  * If there is a < in the line, followed at some point
  2346  * by a > then we suspect HTML.
  2347  */
  2348 void check_for_html_tag(const char *aline)
  2349 {
  2350     const char *open,*close;
  2351     gchar *tag;
  2352     open=strchr(aline,'<');
  2353     if (open)
  2354     {
  2355 	close=strchr(g_utf8_next_char(open),'>');
  2356 	if (close)
  2357 	{
  2358 	    if (pswit[ECHO_SWITCH])
  2359 		g_print("\n%s\n",aline);
  2360 	    if (!pswit[OVERVIEW_SWITCH])
  2361 	    {
  2362 		tag=g_strndup(open,close-open+1);
  2363 		g_print("    Line %ld column %ld - HTML Tag? %s \n",
  2364 		  linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
  2365 		g_free(tag);
  2366 	    }
  2367 	    else
  2368 		cnt_html++;
  2369 	}
  2370     }
  2371 }
  2372 
  2373 /*
  2374  * check_for_html_entity:
  2375  *
  2376  * Check for &symbol; HTML.
  2377  *
  2378  * If there is a & in the line, followed at
  2379  * some point by a ; then we suspect HTML.
  2380  */
  2381 void check_for_html_entity(const char *aline)
  2382 {
  2383     const char *s,*amp,*scolon;
  2384     gchar *entity;
  2385     amp=strchr(aline,'&');
  2386     if (amp)
  2387     {
  2388 	scolon=strchr(amp,';');
  2389 	if (scolon)
  2390 	{
  2391 	    for (s=amp;s<scolon;s=g_utf8_next_char(s))   
  2392 		if (g_utf8_get_char(s)==CHAR_SPACE)
  2393 		    break;		/* Don't report "Jones & Son;" */
  2394 	    if (s>=scolon)
  2395 	    {
  2396 		if (pswit[ECHO_SWITCH])
  2397 		    g_print("\n%s\n",aline);
  2398 		if (!pswit[OVERVIEW_SWITCH])
  2399 		{
  2400 		    entity=g_strndup(amp,scolon-amp+1);
  2401 		    g_print("    Line %ld column %d - HTML symbol? %s \n",
  2402 		      linecnt,(int)(amp-aline)+1,entity);
  2403 		    g_free(entity);
  2404 		}
  2405 		else
  2406 		    cnt_html++;
  2407 	    }
  2408 	}
  2409     }
  2410 }
  2411 
  2412 /*
  2413  * print_pending:
  2414  *
  2415  * If we are in a state of unbalanced quotes, and this line
  2416  * doesn't begin with a quote, output the stored error message.
  2417  * If the -P switch was used, print the warning even if the
  2418  * new para starts with quotes.
  2419  */
  2420 void print_pending(const char *aline,const char *parastart,
  2421   struct pending *pending)
  2422 {
  2423     const char *s;
  2424     gunichar c;
  2425     s=aline;
  2426     while (*s==' ')
  2427 	s++;
  2428     c=g_utf8_get_char(s);
  2429     if (pending->dquote)
  2430     {
  2431 	if (c!=CHAR_DQUOTE || pswit[QPARA_SWITCH])
  2432 	{
  2433 	    if (!pswit[OVERVIEW_SWITCH])
  2434 	    {
  2435 		if (pswit[ECHO_SWITCH])
  2436 		    g_print("\n%s\n",parastart);
  2437 		g_print("%s\n",pending->dquote);
  2438 	    }
  2439 	    else
  2440 		cnt_dquot++;
  2441 	}
  2442 	g_free(pending->dquote);
  2443 	pending->dquote=NULL;
  2444     }
  2445     if (pending->squote)
  2446     {
  2447 	if (c!=CHAR_SQUOTE && c!=CHAR_OPEN_SQUOTE || pswit[QPARA_SWITCH] ||
  2448 	  pending->squot)
  2449 	{
  2450 	    if (!pswit[OVERVIEW_SWITCH])
  2451 	    {
  2452 		if (pswit[ECHO_SWITCH])
  2453 		    g_print("\n%s\n",parastart);
  2454 		g_print("%s\n",pending->squote);
  2455 	    }
  2456 	    else
  2457 		cnt_squot++;
  2458 	}
  2459 	g_free(pending->squote);
  2460 	pending->squote=NULL;
  2461     }
  2462     if (pending->rbrack)
  2463     {
  2464 	if (!pswit[OVERVIEW_SWITCH])
  2465 	{
  2466 	    if (pswit[ECHO_SWITCH])
  2467 		g_print("\n%s\n",parastart);
  2468 	    g_print("%s\n",pending->rbrack);
  2469 	}
  2470 	else
  2471 	    cnt_brack++;
  2472 	g_free(pending->rbrack);
  2473 	pending->rbrack=NULL;
  2474     }
  2475     if (pending->sbrack)
  2476     {
  2477 	if (!pswit[OVERVIEW_SWITCH])
  2478 	{
  2479 	    if (pswit[ECHO_SWITCH])
  2480 		g_print("\n%s\n",parastart);
  2481 	    g_print("%s\n",pending->sbrack);
  2482 	}
  2483 	else
  2484 	    cnt_brack++;
  2485 	g_free(pending->sbrack);
  2486 	pending->sbrack=NULL;
  2487     }
  2488     if (pending->cbrack)
  2489     {
  2490 	if (!pswit[OVERVIEW_SWITCH])
  2491 	{
  2492 	    if (pswit[ECHO_SWITCH])
  2493 		g_print("\n%s\n",parastart);
  2494 	    g_print("%s\n",pending->cbrack);
  2495 	}
  2496 	else
  2497 	    cnt_brack++;
  2498 	g_free(pending->cbrack);
  2499 	pending->cbrack=NULL;
  2500     }
  2501     if (pending->unders)
  2502     {
  2503 	if (!pswit[OVERVIEW_SWITCH])
  2504 	{
  2505 	    if (pswit[ECHO_SWITCH])
  2506 		g_print("\n%s\n",parastart);
  2507 	    g_print("%s\n",pending->unders);
  2508 	}
  2509 	else
  2510 	    cnt_brack++;
  2511 	g_free(pending->unders);
  2512 	pending->unders=NULL;
  2513     }
  2514 }
  2515 
  2516 /*
  2517  * check_for_mismatched_quotes:
  2518  *
  2519  * At end of paragraph, check for mismatched quotes.
  2520  *
  2521  * We don't want to report an error immediately, since it is a
  2522  * common convention to omit the quotes at end of paragraph if
  2523  * the next paragraph is a continuation of the same speaker.
  2524  * Where this is the case, the next para should begin with a
  2525  * quote, so we store the warning message and only display it
  2526  * at the top of the next iteration if the new para doesn't
  2527  * start with a quote.
  2528  * The -p switch overrides this default, and warns of unclosed
  2529  * quotes on _every_ paragraph, whether the next begins with a
  2530  * quote or not.
  2531  */
  2532 void check_for_mismatched_quotes(const struct counters *counters,
  2533   struct pending *pending)
  2534 {
  2535     if (counters->quot%2)
  2536 	pending->dquote=
  2537 	  g_strdup_printf("    Line %ld - Mismatched quotes",linecnt);
  2538     if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&
  2539       counters->open_single_quote!=counters->close_single_quote)
  2540 	pending->squote=
  2541 	  g_strdup_printf("    Line %ld - Mismatched singlequotes?",linecnt);
  2542     if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&
  2543       counters->open_single_quote!=counters->close_single_quote &&
  2544       counters->open_single_quote!=counters->close_single_quote+1)
  2545 	/*
  2546 	 * Flag it to be noted regardless of the
  2547 	 * first char of the next para.
  2548 	 */
  2549 	pending->squot=1;
  2550     if (counters->r_brack)
  2551 	pending->rbrack=
  2552 	  g_strdup_printf("    Line %ld - Mismatched round brackets?",linecnt);
  2553     if (counters->s_brack)
  2554 	pending->sbrack=
  2555 	  g_strdup_printf("    Line %ld - Mismatched square brackets?",linecnt);
  2556     if (counters->c_brack)
  2557 	pending->cbrack=
  2558 	  g_strdup_printf("    Line %ld - Mismatched curly brackets?",linecnt);
  2559     if (counters->c_unders%2)
  2560 	pending->unders=
  2561 	  g_strdup_printf("    Line %ld - Mismatched underscores?",linecnt);
  2562 }
  2563 
  2564 /*
  2565  * check_for_omitted_punctuation:
  2566  *
  2567  * Check for omitted punctuation at end of paragraph by working back
  2568  * through prevline. DW.
  2569  * Need to check this only for "normal" paras.
  2570  * So what is a "normal" para?
  2571  *    Not normal if one-liner (chapter headings, etc.)
  2572  *    Not normal if doesn't contain at least one locase letter
  2573  *    Not normal if starts with space
  2574  */
  2575 void check_for_omitted_punctuation(const char *prevline,
  2576   struct line_properties *last,int start_para_line)
  2577 {
  2578     gboolean letter_on_line=FALSE;
  2579     const char *s;
  2580     for (s=prevline;*s;s=g_utf8_next_char(s))
  2581 	if (g_unichar_isalpha(g_utf8_get_char(s)))
  2582 	{
  2583 	    letter_on_line=TRUE;
  2584 	    break;
  2585 	}
  2586     /*
  2587      * This next "if" is a problem.
  2588      * If we say "start_para_line <= linecnt - 1", that includes
  2589      * one-line "paragraphs" like chapter heads. Lotsa false positives.
  2590      * If we say "start_para_line < linecnt - 1" it doesn't, but then it
  2591      * misses genuine one-line paragraphs.
  2592      */
  2593     if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
  2594       g_utf8_get_char(prevline)>CHAR_SPACE)
  2595     {
  2596 	for (s=g_utf8_prev_char(prevline+strlen(prevline));
  2597 	  (g_utf8_get_char(s)==CHAR_DQUOTE ||
  2598 	  g_utf8_get_char(s)==CHAR_SQUOTE) &&
  2599 	  g_utf8_get_char(s)>CHAR_SPACE && s>prevline;
  2600 	  s=g_utf8_prev_char(s))
  2601 	    ;
  2602 	for (;s>prevline;s=g_utf8_prev_char(s))
  2603 	{
  2604 	    if (g_unichar_isalpha(g_utf8_get_char(s)))
  2605 	    {
  2606 		if (pswit[ECHO_SWITCH])
  2607 		    g_print("\n%s\n",prevline);
  2608 		if (!pswit[OVERVIEW_SWITCH])
  2609 		    g_print("    Line %ld column %ld - "
  2610 		      "No punctuation at para end?\n",
  2611 		      linecnt-1,g_utf8_strlen(prevline,-1));
  2612 		else
  2613 		    cnt_punct++;
  2614 		break;
  2615 	    }
  2616 	    if (g_utf8_strchr("-.:!([{?}])",-1,g_utf8_get_char(s)))
  2617 		break;
  2618 	}
  2619     }
  2620 }
  2621 
  2622 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
  2623 {
  2624     const char *word=key;
  2625     int *dupcnt=value;
  2626     if (*dupcnt)
  2627 	g_print("\nNote: Queried word %s was duplicated %d times\n",
  2628 	  word,*dupcnt);
  2629     return FALSE;
  2630 }
  2631 
  2632 void print_as_windows_1252(const char *string)
  2633 {
  2634     gsize inbytes,outbytes;
  2635     gchar *buf,*bp;
  2636     GIConv converter=(GIConv)-1;
  2637     if (!string)
  2638     {
  2639 	if (converter!=(GIConv)-1)
  2640 	    g_iconv_close(converter);
  2641 	converter=(GIConv)-1;
  2642 	return;
  2643     }
  2644     if (converter=(GIConv)-1)
  2645 	converter=g_iconv_open("WINDOWS-1252","UTF-8");
  2646     if (converter!=(GIConv)-1)
  2647     {
  2648 	inbytes=outbytes=strlen(string);
  2649 	bp=buf=g_malloc(outbytes+1);
  2650 	g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
  2651 	*bp='\0';
  2652 	fputs(buf,stdout);
  2653 	g_free(buf);
  2654     }
  2655     else
  2656 	fputs(string,stdout);
  2657 }
  2658 
  2659 void print_as_utf_8(const char *string)
  2660 {
  2661     fputs(string,stdout);
  2662 }
  2663 
  2664 /*
  2665  * procfile:
  2666  *
  2667  * Process one file.
  2668  */
  2669 void procfile(const char *filename)
  2670 {
  2671     const char *s;
  2672     gchar *parastart=NULL;	/* first line of current para */
  2673     gchar *etext,*aline;
  2674     gchar *etext_ptr;
  2675     GError *err=NULL;
  2676     struct first_pass_results *first_pass_results;
  2677     struct warnings *warnings;
  2678     struct counters counters={0};
  2679     struct line_properties last={0};
  2680     struct parities parities={0};
  2681     struct pending pending={0};
  2682     gboolean isemptyline;
  2683     long start_para_line=0;
  2684     gboolean isnewpara=FALSE,enddash=FALSE;
  2685     last.start=CHAR_SPACE;
  2686     linecnt=checked_linecnt=0;
  2687     etext=read_etext(filename,&err);
  2688     if (!etext)
  2689     {
  2690 	if (pswit[STDOUT_SWITCH])
  2691 	    fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
  2692 	else
  2693 	    fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
  2694 	exit(1);
  2695     }
  2696     g_print("\n\nFile: %s\n\n",filename);
  2697     first_pass_results=first_pass(etext);
  2698     warnings=report_first_pass(first_pass_results);
  2699     qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
  2700     qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
  2701     /*
  2702      * Here we go with the main pass. Hold onto yer hat!
  2703      */
  2704     linecnt=0;
  2705     etext_ptr=etext;
  2706     while ((aline=flgets(&etext_ptr,linecnt+1)))
  2707     {
  2708 	linecnt++;
  2709 	if (linecnt==1)
  2710 	    isnewpara=TRUE;
  2711 	if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
  2712 	    continue;    // skip DP page separators completely
  2713 	if (linecnt<first_pass_results->firstline ||
  2714 	  (first_pass_results->footerline>0 &&
  2715 	  linecnt>first_pass_results->footerline))
  2716 	{
  2717 	    if (pswit[HEADER_SWITCH])
  2718 	    {
  2719 		if (g_str_has_prefix(aline,"Title:"))
  2720 		    g_print("    %s\n",aline);
  2721 		if (g_str_has_prefix(aline,"Author:"))
  2722 		    g_print("    %s\n",aline);
  2723 		if (g_str_has_prefix(aline,"Release Date:"))
  2724 		    g_print("    %s\n",aline);
  2725 		if (g_str_has_prefix(aline,"Edition:"))
  2726 		    g_print("    %s\n\n",aline);
  2727 	    }
  2728 	    continue;		/* skip through the header */
  2729 	}
  2730 	checked_linecnt++;
  2731 	print_pending(aline,parastart,&pending);
  2732 	memset(&pending,0,sizeof(pending));
  2733 	isemptyline=analyse_quotes(aline,&counters);
  2734 	if (isnewpara && !isemptyline)
  2735 	{
  2736 	    /* This line is the start of a new paragraph. */
  2737 	    start_para_line=linecnt;
  2738 	    /* Capture its first line in case we want to report it later. */
  2739 	    g_free(parastart);
  2740 	    parastart=g_strdup(aline);
  2741 	    memset(&parities,0,sizeof(parities));  /* restart the quote count */
  2742 	    s=aline;
  2743 	    while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
  2744 	      !g_unichar_isdigit(g_utf8_get_char(s)))
  2745 		s=g_utf8_next_char(s);
  2746 	    if (g_unichar_islower(g_utf8_get_char(s)))
  2747 	    {
  2748 		/* and its first letter is lowercase */
  2749 		if (pswit[ECHO_SWITCH])
  2750 		    g_print("\n%s\n",aline);
  2751 		if (!pswit[OVERVIEW_SWITCH])
  2752 		    g_print("    Line %ld column %ld - "
  2753 		      "Paragraph starts with lower-case\n",
  2754 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  2755 		else
  2756 		    cnt_punct++;
  2757 	    }
  2758 	    isnewpara=FALSE; /* Signal the end of new para processing. */
  2759 	}
  2760 	/* Check for an em-dash broken at line end. */
  2761 	if (enddash && g_utf8_get_char(aline)=='-')
  2762 	{
  2763 	    if (pswit[ECHO_SWITCH])
  2764 		g_print("\n%s\n",aline);
  2765 	    if (!pswit[OVERVIEW_SWITCH])
  2766 		g_print("    Line %ld column 1 - Broken em-dash?\n",linecnt);
  2767 	    else
  2768 		cnt_punct++;
  2769 	}
  2770 	enddash=FALSE;
  2771 	for (s=g_utf8_prev_char(aline+strlen(aline));
  2772 	  g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
  2773 	    ;
  2774 	if (s>=aline && g_utf8_get_char(s)=='-')
  2775 	    enddash=TRUE;
  2776 	check_for_control_characters(aline);
  2777 	if (warnings->bin)
  2778 	    check_for_odd_characters(aline,warnings,isemptyline);
  2779 	if (warnings->longline)
  2780 	    check_for_long_line(aline);
  2781 	if (warnings->shortline)
  2782 	    check_for_short_line(aline,&last);
  2783 	last.blen=last.len;
  2784 	last.len=g_utf8_strlen(aline,-1);
  2785 	last.start=g_utf8_get_char(aline);
  2786 	check_for_starting_punctuation(aline);
  2787 	if (warnings->dash)
  2788 	{
  2789 	    check_for_spaced_emdash(aline);
  2790 	    check_for_spaced_dash(aline);
  2791 	}
  2792 	check_for_unmarked_paragraphs(aline);
  2793 	check_for_jeebies(aline);
  2794 	check_for_mta_from(aline);
  2795 	check_for_orphan_character(aline);
  2796 	check_for_pling_scanno(aline);
  2797 	check_for_extra_period(aline,warnings);
  2798 	check_for_following_punctuation(aline);
  2799 	check_for_typos(aline,warnings);
  2800 	check_for_misspaced_punctuation(aline,&parities,isemptyline);
  2801 	check_for_double_punctuation(aline,warnings);
  2802 	check_for_spaced_quotes(aline);
  2803 	check_for_miscased_genative(aline);
  2804 	check_end_of_line(aline,warnings);
  2805 	check_for_unspaced_bracket(aline);
  2806 	if (warnings->endquote)
  2807 	    check_for_unpunctuated_endquote(aline);
  2808 	check_for_html_tag(aline);
  2809 	check_for_html_entity(aline);
  2810 	if (isemptyline)
  2811 	{
  2812 	    check_for_mismatched_quotes(&counters,&pending);
  2813 	    memset(&counters,0,sizeof(counters));
  2814 	    /* let the next iteration know that it's starting a new para */
  2815 	    isnewpara=TRUE;
  2816 	    if (prevline)
  2817 		check_for_omitted_punctuation(prevline,&last,start_para_line);
  2818 	}
  2819 	g_free(prevline);
  2820 	prevline=g_strdup(aline);
  2821     }
  2822     if (prevline)
  2823     {
  2824 	g_free(prevline);
  2825 	prevline=NULL;
  2826     }
  2827     g_free(parastart);
  2828     g_free(prevline);
  2829     g_free(etext);
  2830     if (!pswit[OVERVIEW_SWITCH])
  2831 	g_tree_foreach(qword,report_duplicate_queries,NULL);
  2832     g_tree_unref(qword);
  2833     g_tree_unref(qperiod);
  2834     g_set_print_handler(NULL);
  2835     print_as_windows_1252(NULL);
  2836     if (pswit[MARKUP_SWITCH])  
  2837 	loseentities(NULL);
  2838 }
  2839 
  2840 /*
  2841  * flgets:
  2842  *
  2843  * Get one line from the input text, checking for
  2844  * the existence of exactly one CR/LF line-end per line.
  2845  *
  2846  * Returns: a pointer to the line.
  2847  */
  2848 char *flgets(char **etext,long lcnt)
  2849 {
  2850     gunichar c;
  2851     gboolean isCR=FALSE;
  2852     char *theline=*etext;
  2853     char *eos=theline;
  2854     gchar *s;
  2855     for (;;)
  2856     {
  2857 	c=g_utf8_get_char(*etext);
  2858 	*etext=g_utf8_next_char(*etext);
  2859 	if (!c)
  2860 	    return NULL;
  2861 	/* either way, it's end of line */
  2862 	if (c=='\n')
  2863 	{
  2864 	    if (isCR)
  2865 		break;
  2866 	    else
  2867 	    {
  2868 		/* Error - a LF without a preceding CR */
  2869 		if (pswit[LINE_END_SWITCH])
  2870 		{
  2871 		    if (pswit[ECHO_SWITCH])
  2872 		    {
  2873 			s=g_strndup(theline,eos-theline);
  2874 			g_print("\n%s\n",s);
  2875 			g_free(s);
  2876 		    }
  2877 		    if (!pswit[OVERVIEW_SWITCH])
  2878 			g_print("    Line %ld - No CR?\n",lcnt);
  2879 		    else
  2880 			cnt_lineend++;
  2881 		}
  2882 		break;
  2883 	    }
  2884 	}
  2885 	if (c=='\r')
  2886 	{
  2887 	    if (isCR)
  2888 	    {
  2889 		/* Error - two successive CRs */
  2890 		if (pswit[LINE_END_SWITCH])
  2891 		{
  2892 		    if (pswit[ECHO_SWITCH])
  2893 		    {
  2894 			s=g_strndup(theline,eos-theline);
  2895 			g_print("\n%s\n",s);
  2896 			g_free(s);
  2897 		    }
  2898 		    if (!pswit[OVERVIEW_SWITCH])
  2899 			g_print("    Line %ld - Two successive CRs?\n",lcnt);
  2900 		    else
  2901 			cnt_lineend++;
  2902 		}
  2903 	    }
  2904 	    isCR=TRUE;
  2905 	}
  2906 	else
  2907 	{
  2908 	    if (pswit[LINE_END_SWITCH] && isCR)
  2909 	    {
  2910 		if (pswit[ECHO_SWITCH])
  2911 		{
  2912 		    s=g_strndup(theline,eos-theline);
  2913 		    g_print("\n%s\n",s);
  2914 		    g_free(s);
  2915 		}
  2916 		if (!pswit[OVERVIEW_SWITCH])
  2917 		    g_print("    Line %ld column %ld - CR without LF?\n",
  2918 		      lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
  2919 		else
  2920 		    cnt_lineend++;
  2921 		*eos=' ';
  2922 	    }
  2923 	    isCR=FALSE;
  2924 	    eos=g_utf8_next_char(eos);
  2925 	}
  2926     }
  2927     *eos='\0';
  2928     if (pswit[MARKUP_SWITCH])  
  2929 	postprocess_for_HTML(theline);
  2930     if (pswit[DP_SWITCH])  
  2931 	postprocess_for_DP(theline);
  2932     return theline;
  2933 }
  2934 
  2935 /*
  2936  * mixdigit:
  2937  *
  2938  * Takes a "word" as a parameter, and checks whether it
  2939  * contains a mixture of alpha and digits. Generally, this is an
  2940  * error, but may not be for cases like 4th or L5 12s. 3d.
  2941  *
  2942  * Returns: TRUE iff an is error found.
  2943  */
  2944 gboolean mixdigit(const char *checkword)
  2945 {
  2946     gboolean wehaveadigit,wehavealetter,query;
  2947     const char *s,*nondigit;
  2948     wehaveadigit=wehavealetter=query=FALSE;
  2949     for (s=checkword;*s;s=g_utf8_next_char(s))
  2950 	if (g_unichar_isalpha(g_utf8_get_char(s)))
  2951 	    wehavealetter=TRUE;
  2952 	else if (g_unichar_isdigit(g_utf8_get_char(s)))
  2953 	    wehaveadigit=TRUE;
  2954     if (wehaveadigit && wehavealetter)
  2955     {
  2956 	/* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
  2957 	query=TRUE;
  2958 	for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
  2959 	  nondigit=g_utf8_next_char(nondigit))
  2960 	    ;
  2961 	/* digits, ending in st, rd, nd, th of either case */
  2962 	if (!g_ascii_strcasecmp(nondigit,"st") ||
  2963 	  !g_ascii_strcasecmp(nondigit,"rd") ||
  2964 	  !g_ascii_strcasecmp(nondigit,"nd") ||
  2965 	  !g_ascii_strcasecmp(nondigit,"th"))
  2966 	    query=FALSE;
  2967 	if (!g_ascii_strcasecmp(nondigit,"sts") ||
  2968 	  !g_ascii_strcasecmp(nondigit,"rds") ||
  2969 	  !g_ascii_strcasecmp(nondigit,"nds") ||
  2970 	  !g_ascii_strcasecmp(nondigit,"ths"))
  2971 	    query=FALSE;
  2972 	if (!g_ascii_strcasecmp(nondigit,"stly") ||
  2973 	  !g_ascii_strcasecmp(nondigit,"rdly") ||
  2974 	  !g_ascii_strcasecmp(nondigit,"ndly") ||
  2975 	  !g_ascii_strcasecmp(nondigit,"thly"))
  2976 	    query=FALSE;
  2977 	/* digits, ending in l, L, s or d */
  2978 	if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
  2979 	  !strcmp(nondigit,"d"))
  2980 	    query=FALSE;
  2981 	/*
  2982 	 * L at the start of a number, representing Britsh pounds, like L500.
  2983 	 * This is cute. We know the current word is mixed digit. If the first
  2984 	 * letter is L, there must be at least one digit following. If both
  2985 	 * digits and letters follow, we have a genuine error, else we have a
  2986 	 * capital L followed by digits, and we accept that as a non-error.
  2987 	 */
  2988 	if (g_utf8_get_char(checkword)=='L' &&
  2989 	  !mixdigit(g_utf8_next_char(checkword)))
  2990 	    query=FALSE;
  2991     }
  2992     return query;
  2993 }
  2994 
  2995 /*
  2996  * getaword:
  2997  *
  2998  * Extracts the first/next "word" from the line, and returns it.
  2999  * A word is defined as one English word unit--or at least that's the aim.
  3000  * "ptr" is advanced to the position in the line where we will start
  3001  * looking for the next word.
  3002  *
  3003  * Returns: A newly-allocated string.
  3004  */
  3005 gchar *getaword(const char **ptr)
  3006 {
  3007     const char *s,*t;
  3008     GString *word;
  3009     gunichar c,pc;
  3010     word=g_string_new(NULL);
  3011     for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
  3012       !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
  3013       **ptr;*ptr=g_utf8_next_char(*ptr))
  3014 	;
  3015     /*
  3016      * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
  3017      * Especially yucky is the case of L1,000
  3018      * This section looks for a pattern of characters including a digit
  3019      * followed by a comma or period followed by one or more digits.
  3020      * If found, it returns this whole pattern as a word; otherwise we discard
  3021      * the results and resume our normal programming.
  3022      */
  3023     s=*ptr;
  3024     for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
  3025       g_unichar_isalpha(g_utf8_get_char(s)) ||
  3026       g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
  3027 	g_string_append_unichar(word,g_utf8_get_char(s));
  3028     for (t=g_utf8_next_char(word->str);*g_utf8_next_char(t);
  3029       t=g_utf8_next_char(t))
  3030     {
  3031 	c=g_utf8_get_char(t);
  3032 	pc=g_utf8_get_char(g_utf8_prev_char(t));
  3033 	if ((c=='.' || c==',') && g_unichar_isdigit(pc))
  3034 	{
  3035 	    *ptr=s;
  3036 	    return g_string_free(word,FALSE);
  3037 	}
  3038     }
  3039     /* we didn't find a punctuated number - do the regular getword thing */
  3040     g_string_truncate(word,0);
  3041     for (;g_unichar_isdigit(g_utf8_get_char(*ptr)) ||
  3042       g_unichar_isalpha(g_utf8_get_char(*ptr)) ||
  3043       g_utf8_get_char(*ptr)=='\'';*ptr=g_utf8_next_char(*ptr))
  3044 	g_string_append_unichar(word,g_utf8_get_char(*ptr));
  3045     return g_string_free(word,FALSE);
  3046 }
  3047 
  3048 /*
  3049  * isroman:
  3050  *
  3051  * Is this word a Roman Numeral?
  3052  *
  3053  * It doesn't actually validate that the number is a valid Roman Numeral--for
  3054  * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not
  3055  * what we're here to do. If it passes this, it LOOKS like a Roman numeral.
  3056  * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or
  3057  * expressions thereof, except when it came to taxes. Allow any number of M,
  3058  * an optional D, an optional CM or CD, any number of optional Cs, an optional
  3059  * XL or an optional XC, an optional IX or IV, an optional V and any number
  3060  * of optional Is.
  3061  */
  3062 gboolean isroman(const char *t)
  3063 {
  3064     const char *s;
  3065     if (!t || !*t)
  3066 	return FALSE;
  3067     s=t;
  3068     while (g_utf8_get_char(t)=='m' && *t)
  3069 	t++;
  3070     if (g_utf8_get_char(t)=='d')
  3071 	t++;
  3072     if (g_str_has_prefix(t,"cm"))
  3073 	t+=2;
  3074     if (g_str_has_prefix(t,"cd"))
  3075 	t+=2;
  3076     while (g_utf8_get_char(t)=='c' && *t)
  3077 	t++;
  3078     if (g_str_has_prefix(t,"xl"))
  3079 	t+=2;
  3080     if (g_str_has_prefix(t,"xc"))
  3081 	t+=2;
  3082     if (g_utf8_get_char(t)=='l')
  3083 	t++;
  3084     while (g_utf8_get_char(t)=='x' && *t)
  3085 	t++;
  3086     if (g_str_has_prefix(t,"ix"))
  3087 	t+=2;
  3088     if (g_str_has_prefix(t,"iv"))
  3089 	t+=2;
  3090     if (g_utf8_get_char(t)=='v')
  3091 	t++;
  3092     while (g_utf8_get_char(t)=='i' && *t)
  3093 	t++;
  3094     return !*t;
  3095 }
  3096 
  3097 /*
  3098  * postprocess_for_DP:
  3099  *
  3100  * Invoked with the -d switch from flgets().
  3101  * It simply "removes" from the line a hard-coded set of common
  3102  * DP-specific tags, so that the line passed to the main routine has
  3103  * been pre-cleaned of DP markup.
  3104  */
  3105 void postprocess_for_DP(char *theline)
  3106 {
  3107     char *s,*t;
  3108     int i;
  3109     if (!*theline) 
  3110 	return;
  3111     for (i=0;*DPmarkup[i];i++)
  3112 	while ((s=strstr(theline,DPmarkup[i])))
  3113 	{
  3114 	    t=s+strlen(DPmarkup[i]);
  3115 	    memmove(s,t,strlen(t)+1);
  3116 	}
  3117 }
  3118 
  3119 /*
  3120  * postprocess_for_HTML:
  3121  *
  3122  * Invoked with the -m switch from flgets().
  3123  * It simply "removes" from the line a hard-coded set of common
  3124  * HTML tags and "replaces" a hard-coded set of common HTML
  3125  * entities, so that the line passed to the main routine has
  3126  * been pre-cleaned of HTML.
  3127  */
  3128 void postprocess_for_HTML(char *theline)
  3129 {
  3130     while (losemarkup(theline))
  3131 	;
  3132     loseentities(theline);
  3133 }
  3134 
  3135 char *losemarkup(char *theline)
  3136 {
  3137     char *s,*t;
  3138     int i;
  3139     s=strchr(theline,'<');
  3140     t=s?strchr(s,'>'):NULL;
  3141     if (!s || !t)
  3142 	return NULL;
  3143     for (i=0;*markup[i];i++)
  3144 	if (tagcomp(g_utf8_next_char(s),markup[i]))
  3145 	{
  3146 	    t=g_utf8_next_char(t);
  3147 	    memmove(s,t,strlen(t)+1);
  3148 	    return s;
  3149 	}
  3150     /* It's an unrecognized <xxx>. */
  3151     return NULL;
  3152 }
  3153 
  3154 void loseentities(char *theline)
  3155 {
  3156     int i;
  3157     gsize nb;
  3158     char *amp,*scolon;
  3159     gchar *s,*t;
  3160     gunichar c;
  3161     GTree *entities=NULL;
  3162     GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
  3163     if (!theline)
  3164     {
  3165 	if (entities)
  3166 	    g_tree_destroy(entities);
  3167 	entities=NULL;
  3168 	if (translit==(GIConv)-1)
  3169 	    g_iconv_close(translit);
  3170 	translit=(GIConv)-1;
  3171 	if (to_utf8==(GIConv)-1)
  3172 	    g_iconv_close(to_utf8);
  3173 	to_utf8=(GIConv)-1;
  3174 	return;
  3175     }
  3176     if (!*theline)
  3177 	return;
  3178     if (!entities)
  3179     {
  3180 	entities=g_tree_new((GCompareFunc)strcmp);
  3181 	for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
  3182 	    g_tree_insert(entities,HTMLentities[i].name,
  3183 	      GUINT_TO_POINTER(HTMLentities[i].c));
  3184     }
  3185     if (translit==(GIConv)-1)
  3186 	translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
  3187     if (to_utf8==(GIConv)-1)
  3188 	to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
  3189     while((amp=strchr(theline,'&')))
  3190     {
  3191 	scolon=strchr(amp,';');
  3192 	if (scolon)
  3193 	{
  3194 	    if (amp[1]=='#')
  3195 	    {
  3196 		if (amp+2+strspn(amp+2,"0123456789")==scolon)
  3197 		    c=strtol(amp+2,NULL,10);
  3198 		else if (amp[2]=='x' &&
  3199 		  amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
  3200 		    c=strtol(amp+3,NULL,16);
  3201 	    }
  3202 	    else
  3203 	    {
  3204 		s=g_strndup(amp+1,scolon-(amp+1));
  3205 	        c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
  3206 		g_free(s);
  3207 	    }
  3208 	}
  3209 	else
  3210 	    c=0;
  3211 	if (c)
  3212 	{
  3213 	    theline=amp;
  3214 	    if (c<128 || c>=192 && c<=255)	/* An ISO-8859-1 character */
  3215 		theline+=g_unichar_to_utf8(c,theline);
  3216 	    else
  3217 	    {
  3218 		s=g_malloc(6);
  3219 		nb=g_unichar_to_utf8(c,s);
  3220 		t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
  3221 		g_free(s);
  3222 		s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
  3223 		g_free(t);
  3224 		memcpy(theline,s,nb);
  3225 		g_free(s);
  3226 		theline+=nb;
  3227 	    }
  3228 	    memmove(theline,g_utf8_next_char(scolon),
  3229 	      strlen(g_utf8_next_char(scolon))+1);
  3230 	}
  3231 	else
  3232 	    theline=g_utf8_next_char(amp);
  3233     }
  3234 }
  3235 
  3236 gboolean tagcomp(const char *strin,const char *basetag)
  3237 {
  3238     gboolean retval;
  3239     gchar *s,*t;
  3240     if (g_utf8_get_char(strin)=='/')
  3241 	t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
  3242     else
  3243 	t=g_utf8_casefold(strin,-1);
  3244     s=g_utf8_casefold(basetag,-1);
  3245     retval=g_str_has_prefix(t,s);
  3246     g_free(s);
  3247     g_free(t);
  3248     return retval;
  3249 }
  3250 
  3251 void proghelp(GOptionContext *context)
  3252 {
  3253     gchar *help;
  3254     fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
  3255     fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
  3256     fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
  3257     fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
  3258       "For details, read the file COPYING.\n",stderr);
  3259     fputs("This is Free Software; "
  3260       "you may redistribute it under certain conditions (GPL);\n",stderr);
  3261     fputs("read the file COPYING for details.\n\n",stderr);
  3262     help=g_option_context_get_help(context,TRUE,NULL);
  3263     fputs(help,stderr);
  3264     g_free(help);
  3265     fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
  3266     fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
  3267       "non-ASCII\n",stderr);
  3268     fputs("characters like accented letters, "
  3269       "lines longer than 75 or shorter than 55,\n",stderr);
  3270     fputs("unbalanced quotes or brackets, "
  3271       "a variety of badly formatted punctuation, \n",stderr);
  3272     fputs("HTML tags, some likely typos. "
  3273       "It is NOT a substitute for human judgement.\n",stderr);
  3274     fputs("\n",stderr);
  3275 }