bookloupe-testing: bookloupe/bookloupe.c@cd3068704d3a

     1 /*************************************************************************/

     2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */

     3 /*									 */

     4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>			 */

     5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>			 */

     6 /*									 */

     7 /* This program is free software; you can redistribute it and/or modify  */

     8 /* it under the terms of the GNU General Public License as published by  */

     9 /* the Free Software Foundation; either version 2 of the License, or     */

    10 /* (at your option) any later version.					 */

    11 /*									 */

    12 /* This program is distributed in the hope that it will be useful,       */

    13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of	 */

    14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the		 */

    15 /* GNU General Public License for more details.				 */

    16 /*									 */

    17 /* You should have received a copy of the GNU General Public License	 */

    18 /* along with this program. If not, see <http://www.gnu.org/licenses/>.	 */

    19 /*************************************************************************/

    21 #include <stdio.h>

    22 #include <stdlib.h>

    23 #include <string.h>

    24 #include <ctype.h>

    25 #ifdef __WIN32__

    26 #include <windows.h>

    27 #endif

    28 #include <glib.h>

    29 #include <bl/bl.h>

    30 #include "bookloupe.h"

    31 #include "counters.h"

    32 #include "pending.h"

    33 #include "HTMLentities.h"

/*
 * NOTE(review): prevline is not referenced in this part of the file;
 * presumably it holds the previously processed line -- confirm at its
 * use sites.
 */
gchar *prevline;

/*
 * Common typos.
 *
 * All entries are lower case and the list is terminated by an empty string.
 */
char *typo[] = {
    "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",
    "nad", "te", "ig", "acn",  "ahve", "alot", "anbd", "andt", "awya", "aywa",
    "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",
    "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",
    "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",
    "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",
    "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",
    "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",
    "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",
    "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",
    "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
    "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",
    "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",
    "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",
    "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",
    "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",
    "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",
    "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
    "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",
    "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",
    "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",
    "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",
    "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",
    "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",
    "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",
    "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",
    "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",
    "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",
    "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",
    "se", ""
};

/*
 * User-supplied typos. read_user_scannos() creates this tree with one key
 * per accepted line of the user typo file (every value is the dummy
 * pointer GINT_TO_POINTER(1)). It remains NULL when no user typo file
 * has been loaded; main() releases it before returning.
 */
GTree *usertypo;

/*
 * Common abbreviations and other OK words not to query as typos.
 *
 * Entries are lower case; the list is terminated by an empty string.
 */
char *okword[] = {
    "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",
    "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
    "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",
    "outbid", "outbids", "frostbite", "frostbitten", ""
};

/*
 * Common abbreviations that cause otherwise unexplained periods.
 *
 * Entries are lower case, written without the trailing period; the list
 * is terminated by an empty string.
 */
char *abbrev[] = {
    "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",
    "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""
};

/*
 * Two-Letter combinations that rarely if ever start words,
 * but are common scannos or otherwise common letter combinations.
 *
 * Entries are lower case; the list is terminated by an empty string.
 */
char *nostart[] = {
    "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""
};

/*
 * Two-Letter combinations that rarely if ever end words,
 * but are common scannos or otherwise common letter combinations.
 *
 * Entries are lower case; the list is terminated by an empty string.
 */
char *noend[] = {
    "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",
    "sw", "gr", "sl", "cl", "iy", ""
};

/*
 * HTML element names, lower case and without angle brackets; the list is
 * terminated by an empty string.
 * NOTE(review): presumably the tags that are recognised as markup rather
 * than text -- confirm at the use site, which is not in this part of the
 * file.
 */
char *markup[] = {
    "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",
    "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",
    "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",
    "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""
};

/*
 * DP-specific markup tokens (compare the --dp option, "Ignore DP-specific
 * markup"), stored complete with their delimiters; the list is terminated
 * by an empty string.
 */
char *DPmarkup[] = {
    "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""
};

/*
 * Lower-case word list terminated by an empty string.
 * NOTE(review): judging by the name, these are words that are not expected
 * to be followed by a comma -- confirm at the use site, which is not in
 * this part of the file. "mrs" is listed twice; presumably harmless for a
 * simple membership test.
 */
char *nocomma[] = {
    "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",
    "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",
    "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
    "during", "let", "toward", "among", ""
};

/*
 * Lower-case word list terminated by an empty string.
 * NOTE(review): judging by the name, these are words that are not expected
 * to be followed by a period -- confirm at the use site, which is not in
 * this part of the file.
 */
char *noperiod[] = {
    "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
    "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
    "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",
    "among", "those", "into", "whom", "having", "thence", ""
};

gboolean pswit[SWITNO];  /* program switches */

/*
 * Command-line options. Every option is a plain flag whose presence sets
 * the corresponding element of pswit[] to TRUE. parse_options() afterwards
 * inverts the stored sense of --relaxed, --line-end and --noecho, so that
 * those pswit[] elements mean "check enabled" in the rest of the program.
 * The table is terminated by an all-NULL entry.
 */
static GOptionEntry options[]={
    { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
      "Ignore DP-specific markup", NULL },
    { "noecho", 'e', 0, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
      "Don't echo queried line", NULL },
    { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
      "Check single quotes", NULL },
    { "typo", 't', 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
      "Check common typos", NULL },
    { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
      "Require closure of quotes on every paragraph", NULL },
    { "relaxed", 'x', 0, G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
      "Disable paranoid querying of everything", NULL },
    { "line-end", 'l', 0, G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
      "Disable line end checking", NULL },
    { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
      "Overview: just show counts", NULL },
    { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
      "Output errors to stdout instead of stderr", NULL },
    { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
      "Echo header fields", NULL },
    { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
      "Ignore markup in < >", NULL },
    { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
      "Use file of user-defined typos", NULL },
    { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
      "Defaults for use on www upload", NULL },
    { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
      "Verbose - list everything", NULL },
    { NULL }
};

/*
 * Global counters. The cnt_* query counters are printed (and summed into
 * the total) by main() in overview mode; cnt_spacend and linecnt are
 * incremented by first_pass().
 */
long cnt_quote;		/* for overview mode, count of quote queries */
long cnt_brack;		/* for overview mode, count of brackets queries */
long cnt_bin;		/* for overview mode, count of non-ASCII queries */
long cnt_odd;		/* for overview mode, count of odd character queries */
long cnt_long;		/* for overview mode, count of long line errors */
long cnt_short;		/* for overview mode, count of short line queries */
long cnt_punct;		/* for overview mode,
			   count of punctuation and spacing queries */
long cnt_dash;		/* for overview mode, count of dash-related queries */
long cnt_word;		/* for overview mode, count of word queries */
long cnt_html;		/* for overview mode, count of html queries */
long cnt_lineend;	/* for overview mode, count of line-end queries */
long cnt_spacend;	/* count of lines with space at end */
long linecnt;		/* count of total lines in the file */
long checked_linecnt;	/* count of lines actually checked */

/*
 * Forward declarations of helpers used in this file.
 * NOTE(review): their definitions are not in this part of the file.
 */
void proghelp(GOptionContext *context);
void procfile(const char *);

/* Directory part of argv[0]; set and freed by main(). */
gchar *running_from;

gboolean mixdigit(const char *);
gchar *getaword(const char **);
char *flgets(char **,long,gboolean);
void postprocess_for_HTML(char *);
char *linehasmarkup(char *);
char *losemarkup(char *);
gboolean tagcomp(const char *,const char *);
void loseentities(char *);
gboolean isroman(const char *);
void postprocess_for_DP(char *);
void print_as_windows_1252(const char *string);
void print_as_utf_8(const char *string);

/*
 * NOTE(review): qword and qperiod are not referenced in this part of the
 * file; presumably they track words/periods that have already been
 * queried -- confirm at their use sites.
 */
GTree *qword,*qperiod;

#ifdef __WIN32__
/*
 * Console output code page in effect at start-up; saved by main() and
 * restored by cleanup_on_exit().
 */
UINT saved_cp;
#endif

   203 void parse_options(int *argc,char ***argv)

   204 {

   205     GError *err=NULL;

   206     GOptionContext *context;

   207     context=g_option_context_new(

   208       "file - looks for errors in Project Gutenberg(TM) etexts");

   209     g_option_context_add_main_entries(context,options,NULL);

   210     if (!g_option_context_parse(context,argc,argv,&err))

   211     {

   212 	g_printerr("Bookloupe: %s\n",err->message);

   213 	g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);

   214 	exit(1);

   215     }

   216     /* Paranoid checking is turned OFF, not on, by its switch */

   217     pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];

   218     if (pswit[PARANOID_SWITCH])

   219 	/* if running in paranoid mode, typo checks default to enabled */

   220 	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];

   221     /* Line-end checking is turned OFF, not on, by its switch */

   222     pswit[LINE_END_SWITCH]=!pswit[LINE_END_SWITCH];

   223     /* Echoing is turned OFF, not on, by its switch */

   224     pswit[ECHO_SWITCH]=!pswit[ECHO_SWITCH];

   225     if (pswit[OVERVIEW_SWITCH])

   226 	/* just print summary; don't echo */

   227 	pswit[ECHO_SWITCH]=FALSE;

   228     /*

   229      * Web uploads - for the moment, this is really just a placeholder

   230      * until we decide what processing we really want to do on web uploads

   231      */

   232     if (pswit[WEB_SWITCH])

   233     {

   234 	/* specific override for web uploads */

   235 	pswit[ECHO_SWITCH]=TRUE;

   236 	pswit[SQUOTE_SWITCH]=FALSE;

   237 	pswit[TYPO_SWITCH]=TRUE;

   238 	pswit[QPARA_SWITCH]=FALSE;

   239 	pswit[PARANOID_SWITCH]=TRUE;

   240 	pswit[LINE_END_SWITCH]=FALSE;

   241 	pswit[OVERVIEW_SWITCH]=FALSE;

   242 	pswit[STDOUT_SWITCH]=FALSE;

   243 	pswit[HEADER_SWITCH]=TRUE;

   244 	pswit[VERBOSE_SWITCH]=FALSE;

   245 	pswit[MARKUP_SWITCH]=FALSE;

   246 	pswit[USERTYPO_SWITCH]=FALSE;

   247 	pswit[DP_SWITCH]=FALSE;

   248     }

   249     if (*argc<2)

   250     {

   251 	proghelp(context);

   252 	exit(1);

   253     }

   254     g_option_context_free(context);

   255 }

   257 /*

   258  * read_user_scannos:

   259  *

   260  * Read in the user-defined stealth scanno list.

   261  */

   262 void read_user_scannos(void)

   263 {

   264     GError *err=NULL;

   265     gchar *usertypo_file;

   266     gboolean okay;

   267     int i;

   268     gsize len,nb;

   269     gchar *contents,*utf8,**lines;

   270     usertypo_file=g_strdup("bookloupe.typ");

   271     okay=file_get_contents_text(usertypo_file,&contents,&len,&err);

   272     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))

   273     {

   274 	g_clear_error(&err);

   275 	g_free(usertypo_file);

   276 	usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);

   277 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);

   278     }

   279     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))

   280     {

   281 	g_clear_error(&err);

   282 	g_free(usertypo_file);

   283 	usertypo_file=g_strdup("gutcheck.typ");

   284 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);

   285     }

   286     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))

   287     {

   288 	g_clear_error(&err);

   289 	g_free(usertypo_file);

   290 	usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);

   291 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);

   292     }

   293     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))

   294     {

   295 	g_free(usertypo_file);

   296 	g_print("   --> I couldn't find bookloupe.typ "

   297 	  "-- proceeding without user typos.\n");

   298 	return;

   299     }

   300     else if (!okay)

   301     {

   302 	fprintf(stderr,"%s: %s\n",usertypo_file,err->message);

   303 	g_free(usertypo_file);

   304 	g_clear_error(&err);

   305 	exit(1);

   306     }

   307     if (g_utf8_validate(contents,len,NULL))

   308 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);

   309     else

   310 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);

   311     g_free(contents);

   312     lines=g_strsplit_set(utf8,"\r\n",0);

   313     g_free(utf8);

   314     usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);

   315     for (i=0;lines[i];i++)

   316 	if (*(unsigned char *)lines[i]>'!')

   317 	    g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));

   318 	else

   319 	    g_free(lines[i]);

   320     g_free(lines);

   321 }

   323 /*

   324  * read_etext:

   325  *

   326  * Read an etext returning a newly allocated string containing the file

   327  * contents or NULL on error.

   328  */

   329 gchar *read_etext(const char *filename,GError **err)

   330 {

   331     GError *tmp_err=NULL;

   332     gchar *contents,*utf8;

   333     gsize len,bytes_read,bytes_written;

   334     int i,line,col;

   335     if (!g_file_get_contents(filename,&contents,&len,err))

   336 	return NULL;

   337     if (g_utf8_validate(contents,len,NULL))

   338     {

   339 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);

   340 	g_set_print_handler(print_as_utf_8);

   341 #ifdef __WIN32__

   342 	SetConsoleOutputCP(CP_UTF8);

   343 #endif

   344     }

   345     else

   346     {

   347 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,

   348 	  &bytes_written,&tmp_err);

   349 	if (g_error_matches(tmp_err,G_CONVERT_ERROR,

   350 	  G_CONVERT_ERROR_ILLEGAL_SEQUENCE))

   351 	{

   352 	    line=col=1;

   353 	    for(i=0;i<bytes_read;i++)

   354 		if (contents[i]=='\n')

   355 		{

   356 		    line++;

   357 		    col=1;

   358 		}

   359 		else if (contents[i]!='\r')

   360 		    col++;

   361 	    g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,

   362 	      "Input conversion failed. Byte %d at line %d, column %d is not a "

   363 	      "valid Windows-1252 character",

   364 	      ((unsigned char *)contents)[bytes_read],line,col);

   365 	}

   366 	else if (tmp_err)

   367 	    g_propagate_error(err,tmp_err);

   368 	g_set_print_handler(print_as_windows_1252);

   369 #ifdef __WIN32__

   370 	SetConsoleOutputCP(1252);

   371 #endif

   372     }

   373     g_free(contents);

   374     return utf8;

   375 }

   377 void cleanup_on_exit(void)

   378 {

   379 #ifdef __WIN32__

   380     SetConsoleOutputCP(saved_cp);

   381 #endif

   382 }

   384 int main(int argc,char **argv)

   385 {

   386 #ifdef __WIN32__

   387     atexit(cleanup_on_exit);

   388     saved_cp=GetConsoleOutputCP();

   389 #endif

   390     running_from=g_path_get_dirname(argv[0]);

   391     parse_options(&argc,&argv);

   392     if (pswit[USERTYPO_SWITCH])

   393 	read_user_scannos();

   394     fprintf(stderr,"bookloupe: Check and report on an e-text\n");

   395     procfile(argv[1]);

   396     if (pswit[OVERVIEW_SWITCH])

   397     {

   398 	g_print("    Checked %ld lines of %ld (head+foot = %ld)\n\n",

   399 	  checked_linecnt,linecnt,linecnt-checked_linecnt);

   400 	g_print("    --------------- Queries found --------------\n");

   401 	if (cnt_long)

   402 	    g_print("    Long lines:		    %14ld\n",cnt_long);

   403 	if (cnt_short)

   404 	    g_print("    Short lines:		   %14ld\n",cnt_short);

   405 	if (cnt_lineend)

   406 	    g_print("    Line-end problems:	     %14ld\n",cnt_lineend);

   407 	if (cnt_word)

   408 	    g_print("    Common typos:		  %14ld\n",cnt_word);

   409 	if (cnt_quote)

   410 	    g_print("    Unmatched quotes:	      %14ld\n",cnt_quote);

   411 	if (cnt_brack)

   412 	    g_print("    Unmatched brackets:	    %14ld\n",cnt_brack);

   413 	if (cnt_bin)

   414 	    g_print("    Non-ASCII characters:	  %14ld\n",cnt_bin);

   415 	if (cnt_odd)

   416 	    g_print("    Proofing characters:	   %14ld\n",cnt_odd);

   417 	if (cnt_punct)

   418 	    g_print("    Punctuation & spacing queries: %14ld\n",cnt_punct);

   419 	if (cnt_dash)

   420 	    g_print("    Non-standard dashes:	   %14ld\n",cnt_dash);

   421 	if (cnt_html)

   422 	    g_print("    Possible HTML tags:	    %14ld\n",cnt_html);

   423 	g_print("\n");

   424 	g_print("    TOTAL QUERIES		  %14ld\n",

   425 	  cnt_quote+cnt_brack+cnt_bin+cnt_odd+cnt_long+cnt_short+cnt_punct+

   426 	  cnt_dash+cnt_word+cnt_html+cnt_lineend);

   427     }

   428     g_free(running_from);

   429     if (usertypo)

   430 	g_tree_unref(usertypo);

   431     return 0;

   432 }

   434 void count_dashes(const char *line,const char *dash,

   435   struct dash_results *results)

   436 {

   437     int i;

   438     gchar **tokens;

   439     gunichar pc,nc;

   440     gboolean spaced=FALSE,unspaced=FALSE,spaced2=FALSE;

   441     if (!*line)

   442 	return;

   443     tokens=g_strsplit(line,dash,0);

   444     if (tokens[1])

   445 	results->base++;

   446     for(i=1;tokens[i];i++)

   447     {

   448 	pc=g_utf8_get_char(g_utf8_prev_char(tokens[i-1]+strlen(tokens[i-1])));

   449 	nc=g_utf8_get_char(tokens[i]);

   450 	if (g_unichar_isspace(pc) || g_unichar_isspace(nc))

   451 	    spaced=TRUE;

   452 	if (g_unichar_isspace(pc) && g_unichar_isspace(nc))

   453 	    spaced2=TRUE;

   454 	else if (!g_unichar_isspace(pc) && !g_unichar_isspace(nc))

   455 	    unspaced=TRUE;

   456     }

   457     if (spaced)

   458 	results->space++;

   459     if (spaced2)

   460 	/* count of lines with em-dashes with spaces both sides */

   461 	results->non_PG_space++;

   462     if (unspaced)

   463 	/* count of lines with PG-type em-dashes with no spaces */

   464 	results->PG_space++;

   465     g_strfreev(tokens);

   466 }

/*
 * first_pass:
 *
 * Run a first pass - verify that it's a valid PG
 * file, decide whether to report some things that
 * occur many times in the text like long or short
 * lines, non-standard dashes, etc.
 *
 * etext is split on "\n" and every resulting line is examined in turn.
 * The counts are gathered in a function-static structure whose address
 * is returned, so they would accumulate over repeated calls. The globals
 * linecnt and cnt_spacend are updated as well.
 */
struct first_pass_results *first_pass(const char *etext)
{
    gunichar laststart=CHAR_SPACE;
    const char *s;
    gchar *lc_line;
    int i,j,lbytes,llen;
    gchar **lines;
    unsigned int lastlen=0,lastblen=0;
    long spline=0,nspline=0;
    static struct first_pass_results results={0};
    struct dash_results tmp_dash_results;
    gchar *inword;
    QuoteClass qc;
    lines=g_strsplit(etext,"\n",0);
    if (lines[0])
	/* If there's at least one line, we might have UNIX-style terminators */
	results.unix_lineends=TRUE;
    for (j=0;lines[j];j++)
    {
	/* lbytes is the line length in bytes, llen (below) in characters */
	lbytes=strlen(lines[j]);
	if (lbytes>0 && lines[j][lbytes-1]=='\r')
	{
	    /* A single CR anywhere means these aren't UNIX line ends */
	    results.unix_lineends=FALSE;
	    /* Strip all trailing CRs from the line */
	    do
	    {
		lines[j][--lbytes]='\0';
	    } while (lbytes>0 && lines[j][lbytes-1]=='\r');
	}
	llen=g_utf8_strlen(lines[j],lbytes);
	linecnt++;
	/* Header end marker mentioning "*END" and "SMALL PRINT" */
	if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
	  (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
	{
	    if (spline)
		g_print("   --> Duplicate header?\n");
	    spline=linecnt+1;   /* first line of non-header text, that is */
	}
	/* Header marker of the "*** START ... PROJECT GUTENBERG" form */
	if (!strncmp(lines[j],"*** START",9) &&
	  strstr(lines[j],"PROJECT GUTENBERG"))
	{
	    if (nspline)
		g_print("   --> Duplicate header?\n");
	    nspline=linecnt+1;   /* first line of non-header text, that is */
	}
	/*
	 * Once a header has been seen, look for the footer: a line that
	 * contains "end" somewhere before "project gutenberg", ignoring case.
	 */
	if (spline || nspline)
	{
	    lc_line=g_utf8_strdown(lines[j],lbytes);
	    if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
	    {
		if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
		{
		    if (results.footerline)
		    {
			/* it's an old-form header - we can detect duplicates */
			if (!nspline)
			    g_print("   --> Duplicate footer?\n");
		    }
		    else
			results.footerline=linecnt;
		}
	    }
	    g_free(lc_line);
	}
	if (spline)
	    results.firstline=spline;
	if (nspline)
	    results.firstline=nspline;  /* override with new */
	if (results.footerline)
	    continue;    /* don't count the boilerplate in the footer */
	results.totlen+=llen;
	/* Per-character statistics */
	for (s=lines[j];*s;s=g_utf8_next_char(s))
	{
	    if (g_utf8_get_char(s)>127)
		results.binlen++;
	    if (g_unichar_isalpha(g_utf8_get_char(s)))
		results.alphalen++;
	    if (s>lines[j])
	    {
		/* A closing or neutral double quote directly after a letter */
		if (CHAR_IS_DQUOTE(g_utf8_get_char(s)))
		    qc=QUOTE_CLASS(g_utf8_get_char(s));
		else
		    qc=INVALID_QUOTE;
		if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) &&
		  g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
		    results.endquote_count++;
	    }
	}
	/*
	 * Short line: the previous line was short although the one before
	 * it was not, the previous line did not start with a space, and
	 * this line is not (almost) empty.
	 */
	if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
	  lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
	    results.shortline++;
	/* Last character is a space or a control character */
	if (lbytes>0 &&
	  g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
	    cnt_spacend++;
	if (strstr(lines[j],".,"))
	    results.dotcomma++;
	/* only count ast lines for ignoring purposes where there is */
	/* locase text on the line */
	if (strchr(lines[j],'*'))
	{
	    for (s=lines[j];*s;s=g_utf8_next_char(s))
		if (g_unichar_islower(g_utf8_get_char(s)))
		    break;
	    /* *s is non-zero only if a lower-case character was found */
	    if (*s)
		results.astline++;
	}
	if (strchr(lines[j],'/'))
	    results.fslashline++;
	if (lbytes>0)
	{
	    /* Step back over trailing spaces and control characters */
	    for (s=g_utf8_prev_char(lines[j]+lbytes);
	      s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;
	      s=g_utf8_prev_char(s))
		;
	    /*
	     * Line ends in a single hyphen (not a double dash) that lies
	     * beyond the second character of the line.
	     */
	    if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
	      g_utf8_get_char(g_utf8_prev_char(s))!='-')
		results.hyphens++;
	}
	if (llen>LONGEST_PG_LINE)
	    results.longline++;
	if (llen>WAY_TOO_LONG)
	    results.verylongline++;
	if (strchr(lines[j],'<') && strchr(lines[j],'>'))
	{
	    /* i is positive when the first '>' follows the first '<' */
	    i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
	    if (i>0)
		results.htmcount++;
	    if (strstr(lines[j],"<i>"))
		results.htmcount+=4; /* bonus marks! */
	}
	/* Check for spaced em-dashes */
	memset(&tmp_dash_results,0,sizeof(tmp_dash_results));
	count_dashes(lines[j],"--",&tmp_dash_results);
	count_dashes(lines[j],"—",&tmp_dash_results);
	/* Each category is counted at most once per line */
	if (tmp_dash_results.base)
	    results.emdash.base++;
	if (tmp_dash_results.non_PG_space)
	    results.emdash.non_PG_space++;
	if (tmp_dash_results.PG_space)
	    results.emdash.PG_space++;
	/* Word-by-word checks: language hints and standalone 0s and 1s */
	for (s=lines[j];*s;)
	{
	    inword=getaword(&s);
	    if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))
		results.Dutchcount++;
	    if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))
		results.Frenchcount++;
	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))
		results.standalone_digit++;
	    g_free(inword);
	}
	/* Check for spaced dashes */
	if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
	    results.spacedash++;
	/* Remember this line's details for the short line check */
	lastblen=lastlen;
	lastlen=llen;
	laststart=lines[j][0];
    }
    g_strfreev(lines);
    return &results;
}

   637 /*

   638  * report_first_pass:

   639  *

   640  * Make some snap decisions based on the first pass results.

   641  */

   642 struct warnings *report_first_pass(struct first_pass_results *results)

   643 {

   644     static struct warnings warnings={0};

   645     warnings.nocr=1;

   646     if (results->unix_lineends)

   647     {

   648 	warnings.nocr=0;

   649 	g_print("   --> No lines in this file have a CR. Not reporting them. "

   650 	  "Project Gutenberg requires that all lineends be CR-LF.\n");

   651     }

   652     if (cnt_spacend>0)

   653 	g_print("   --> %ld lines in this file have white space at end\n",

   654 	  cnt_spacend);

   655     warnings.dotcomma=1;

   656     if (results->dotcomma>5)

   657     {

   658 	warnings.dotcomma=0;

   659 	g_print("   --> %ld lines in this file contain '.,'. "

   660 	  "Not reporting them.\n",results->dotcomma);

   661     }

   662     /*

   663      * If more than 50 lines, or one-tenth, are short,

   664      * don't bother reporting them.

   665      */

   666     warnings.shortline=1;

   667     if (results->shortline>50 || results->shortline*10>linecnt)

   668     {

   669 	warnings.shortline=0;

   670 	g_print("   --> %ld lines in this file are short. "

   671 	  "Not reporting short lines.\n",results->shortline);

   672     }

   673     /*

   674      * If more than 50 lines, or one-tenth, are long,

   675      * don't bother reporting them.

   676      */

   677     warnings.longline=1;

   678     if (results->longline>50 || results->longline*10>linecnt)

   679     {

   680 	warnings.longline=0;

   681 	g_print("   --> %ld lines in this file are long. "

   682 	  "Not reporting long lines.\n",results->longline);

   683     }

   684     /* If more than 10 lines contain asterisks, don't bother reporting them. */

   685     warnings.ast=1;

   686     if (results->astline>10)

   687     {

   688 	warnings.ast=0;

   689 	g_print("   --> %ld lines in this file contain asterisks. "

   690 	  "Not reporting them.\n",results->astline);

   691     }

   692     /*

   693      * If more than 10 lines contain forward slashes,

   694      * don't bother reporting them.

   695      */

   696     warnings.fslash=1;

   697     if (results->fslashline>10)

   698     {

   699 	warnings.fslash=0;

   700 	g_print("   --> %ld lines in this file contain forward slashes. "

   701 	  "Not reporting them.\n",results->fslashline);

   702     }

   703     /*

   704      * If more than 20 lines contain unpunctuated endquotes,

   705      * don't bother reporting them.

   706      */

   707     warnings.endquote=1;

   708     if (results->endquote_count>20)

   709     {

   710 	warnings.endquote=0;

   711 	g_print("   --> %ld lines in this file contain unpunctuated endquotes. "

   712 	  "Not reporting them.\n",results->endquote_count);

   713     }

   714     /*

   715      * If more than 15 lines contain standalone digits,

   716      * don't bother reporting them.

   717      */

   718     warnings.digit=1;

   719     if (results->standalone_digit>10)

   720     {

   721 	warnings.digit=0;

   722 	g_print("   --> %ld lines in this file contain standalone 0s and 1s. "

   723 	  "Not reporting them.\n",results->standalone_digit);

   724     }

   725     /*

   726      * If more than 20 lines contain hyphens at end,

   727      * don't bother reporting them.

   728      */

   729     warnings.hyphen=1;

   730     if (results->hyphens>20)

   731     {

   732 	warnings.hyphen=0;

   733 	g_print("   --> %ld lines in this file have hyphens at end. "

   734 	  "Not reporting them.\n",results->hyphens);

   735     }

   736     if (results->htmcount>20 && !pswit[MARKUP_SWITCH])

   737     {

   738 	g_print("   --> Looks like this is HTML. Switching HTML mode ON.\n");

   739 	pswit[MARKUP_SWITCH]=1;

   740     }

   741     if (results->verylongline>0)

   742 	g_print("   --> %ld lines in this file are VERY long!\n",

   743 	  results->verylongline);

   744     /*

   745      * If there are more non-PG spaced dashes than PG em-dashes,

   746      * assume it's deliberate.

   747      * Current PG guidelines say don't use them, but older texts do,

   748      * and some people insist on them whatever the guidelines say.

   749      */

   750     warnings.dash=1;

   751     if (results->spacedash+results->emdash.non_PG_space>

   752       results->emdash.PG_space)

   753     {

   754 	warnings.dash=0;

   755 	g_print("   --> There are %ld spaced dashes and em-dashes. "

   756 	  "Not reporting them.\n",

   757 	  results->spacedash+results->emdash.non_PG_space);

   758     }

   759     /* If more than a quarter of characters are hi-bit, bug out. */

   760     warnings.bin=1;

   761     if (results->binlen*4>results->totlen)

   762     {

   763 	g_print("   --> This file does not appear to be ASCII. "

   764 	  "Terminating. Best of luck with it!\n");

   765 	exit(1);

   766     }

   767     if (results->alphalen*4<results->totlen)

   768     {

   769 	g_print("   --> This file does not appear to be text. "

   770 	  "Terminating. Best of luck with it!\n");

   771 	exit(1);

   772     }

   773     if (results->binlen*100>results->totlen || results->binlen>100)

   774     {

   775 	g_print("   --> There are a lot of foreign letters here. "

   776 	  "Not reporting them.\n");

   777 	warnings.bin=0;

   778     }

   779     warnings.isDutch=FALSE;

   780     if (results->Dutchcount>50)

   781     {

   782 	warnings.isDutch=TRUE;

   783 	g_print("   --> This looks like Dutch - "

   784 	  "switching off dashes and warnings for 's Middags case.\n");

   785     }

   786     warnings.isFrench=FALSE;

   787     if (results->Frenchcount>50)

   788     {

   789 	warnings.isFrench=TRUE;

   790 	g_print("   --> This looks like French - "

   791 	  "switching off some doublepunct.\n");

   792     }

   793     if (results->firstline && results->footerline)

   794 	g_print("    The PG header and footer appear to be already on.\n");

   795     else

   796     {

   797 	if (results->firstline)

   798 	    g_print("    The PG header is on - no footer.\n");

   799 	if (results->footerline)

   800 	    g_print("    The PG footer is on - no header.\n");

   801     }

   802     g_print("\n");

   803     if (pswit[VERBOSE_SWITCH])

   804     {

   805 	warnings.bin=1;

   806 	warnings.shortline=1;

   807 	warnings.dotcomma=1;

   808 	warnings.longline=1;

   809 	warnings.dash=1;

   810 	warnings.digit=1;

   811 	warnings.ast=1;

   812 	warnings.fslash=1;

   813 	warnings.hyphen=1;

   814 	warnings.endquote=1;

   815 	g_print("   *** Verbose output is ON -- you asked for it! ***\n");

   816     }

   817     if (warnings.isDutch)

   818 	warnings.dash=0;

   819     if (results->footerline>0 && results->firstline>0 &&

   820       results->footerline>results->firstline &&

   821       results->footerline-results->firstline<100)

   822     {

   823 	g_print("   --> I don't really know where this text starts. \n");

   824 	g_print("       There are no reference points.\n");

   825 	g_print("       I'm going to have to report the header and footer "

   826 	  "as well.\n");

   827 	results->firstline=0;

   828     }

   829     return &warnings;

   830 }

   832 /*

   833  * analyse_quotes:

   834  *

   835  * Look along the line, accumulate the count of quotes, and see

   836  * if this is an empty line - i.e. a line with nothing on it

   837  * but spaces.

   838  * If line has just spaces, period, * and/or - on it, don't

   839  * count it, since empty lines with asterisks or dashes to

   840  * separate sections are common.

   841  *

   842  * Returns: TRUE if the line is empty.

   843  */

   844 gboolean analyse_quotes(const char *aline,struct counters *counters)

   845 {

   846     int guessquote=0;

   847     /* assume the line is empty until proven otherwise */

   848     gboolean isemptyline=TRUE;

   849     const char *s=aline,*sprev,*snext;

   850     gunichar c;

   851     sprev=NULL;

   852     GError *tmp_err=NULL;

   853     while (*s)

   854     {

   855 	snext=g_utf8_next_char(s);

   856 	c=g_utf8_get_char(s);

   857 	if (CHAR_IS_DQUOTE(c))

   858 	    (void)count_quote(counters,c,QUOTE_CLASS(c),&tmp_err);

   859 	else if (CHAR_IS_SQUOTE(c) && pswit[SQUOTE_SWITCH])

   860 	{

   861 	    if (s==aline)

   862 	    {

   863 		/*

   864 		 * At start of line, it can only be a quotation mark.

   865 		 * Hardcode a very common exception!

   866 		 */

   867 		if (!g_str_has_prefix(snext,"tis") &&

   868 		  !g_str_has_prefix(snext,"Tis"))

   869 		    (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);

   870 	    }

   871 	    else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&

   872 	      g_unichar_isalpha(g_utf8_get_char(snext)))

   873 		/* Do nothing! it's definitely an apostrophe, not a quote */

   874 		;

   875 	    /* it's outside a word - let's check it out */

   876 	    else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||

   877 	      g_unichar_isalpha(g_utf8_get_char(snext)))

   878 	    {

   879 		/* certainly looks like a quotation mark */

   880 		if (!g_str_has_prefix(snext,"tis") &&

   881 		  !g_str_has_prefix(snext,"Tis"))

   882 		    /* hardcode a very common exception! */

   883 		{

   884 		    if (strchr(".?!,;:",g_utf8_get_char(sprev)))

   885 			(void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);

   886 		    else

   887 			(void)count_quote(counters,c,OPENING_QUOTE,&tmp_err);

   888 		}

   889 	    }

   890 	    else

   891 	    {

   892 		/* now - is it a quotation mark? */

   893 		guessquote=0;   /* accumulate clues */

   894 		if (g_unichar_isalpha(g_utf8_get_char(sprev)))

   895 		{

   896 		    /* it follows a letter - could be either */

   897 		    guessquote++;

   898 		    if (g_utf8_get_char(sprev)=='s')

   899 		    {

   900 			/* looks like a plural apostrophe */

   901 			guessquote-=3;

   902 			if (g_utf8_get_char(snext)==CHAR_SPACE)

   903 			    /* bonus marks! */

   904 			    guessquote-=2;

   905 		    }

   906 		    if (innermost_quote_matches(counters,c))

   907 			/*

   908 			 * Give it the benefit of some doubt,

   909 			 * if a squote is already open.

   910 			 */

   911 			guessquote++;

   912 		    else

   913 			guessquote--;

   914 		    if (guessquote>=0)

   915 			(void)count_quote(counters,c,CLOSING_QUOTE,&tmp_err);

   916 		}

   917 		else

   918 		    /* no adjacent letter - it must be a quote of some kind */

   919 		    (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err);

   920 	    }

   921 	}

   922 	if (tmp_err)

   923 	{

   924 	    if (pswit[ECHO_SWITCH])

   925 		g_print("\n%s\n",aline);

   926 	    if (!pswit[OVERVIEW_SWITCH])

   927 		g_print("    Line %ld column %ld - %s\n",

   928 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1,tmp_err->message);

   929 	    g_clear_error(&tmp_err);

   930 	}

   931 	if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&

   932 	  c!='\r' && c!='\n')

   933 	    isemptyline=FALSE;  /* ignore lines like  *  *  *  as spacers */

   934 	if (c==CHAR_UNDERSCORE)

   935 	    counters->c_unders++;

   936 	if (c==CHAR_OPEN_SBRACK)

   937 	{

   938 	    if (!matching_difference(counters,COUNTER_ILLUSTRATION) &&

   939 	      !matching_difference(counters,c) && s==aline &&

   940 	      g_str_has_prefix(s,"[Illustration:"))

   941 		increment_matching(counters,COUNTER_ILLUSTRATION,TRUE);

   942 	    else

   943 		increment_matching(counters,c,TRUE);

   944 	}

   945 	else if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK)

   946 	    increment_matching(counters,c,TRUE);

   947 	if (c==CHAR_CLOSE_SBRACK)

   948 	{

   949 	    if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) &&

   950 	      !matching_difference(counters,c) && !*snext)

   951 		increment_matching(counters,COUNTER_ILLUSTRATION,FALSE);

   952 	    else

   953 		increment_matching(counters,c,FALSE);

   954 	}

   955 	else if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK)

   956 	    increment_matching(counters,c,FALSE);

   957 	sprev=s;

   958 	s=snext;

   959     }

   960     return isemptyline;

   961 }

   963 /*

   964  * check_for_control_characters:

   965  *

   966  * Check for invalid or questionable characters in the line

   967  * Anything above 127 is invalid for plain ASCII, and

   968  * non-printable control characters should also be flagged.

   969  * Tabs should generally not be there.

   970  */

   971 void check_for_control_characters(const char *aline)

   972 {

   973     gunichar c;

   974     const char *s;

   975     for (s=aline;*s;s=g_utf8_next_char(s))

   976     {

   977 	c=g_utf8_get_char(s);

   978 	if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)

   979 	{

   980 	    if (pswit[ECHO_SWITCH])

   981 		g_print("\n%s\n",aline);

   982 	    if (!pswit[OVERVIEW_SWITCH])

   983 		g_print("    Line %ld column %ld - Control character %u\n",

   984 		  linecnt,g_utf8_pointer_to_offset(s,aline)+1,c);

   985 	    else

   986 		cnt_bin++;

   987 	}

   988     }

   989 }

   991 /*

   992  * check_for_odd_characters:

   993  *

   994  * Check for binary and other odd characters.

   995  */

   996 void check_for_odd_characters(const char *aline,const struct warnings *warnings,

   997   gboolean isemptyline)

   998 {

   999     /* Don't repeat multiple warnings on one line. */

  1000     gboolean eNon_A=FALSE,eTab=FALSE,eTilde=FALSE;

  1001     gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;

  1002     const char *s;

  1003     gunichar c;

  1004     for (s=aline;*s;s=g_utf8_next_char(s))

  1005     {

  1006 	c=g_utf8_get_char(s);

  1007 	if (!eNon_A && (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))

  1008 	{

  1009 	    if (pswit[ECHO_SWITCH])

  1010 		g_print("\n%s\n",aline);

  1011 	    if (!pswit[OVERVIEW_SWITCH])

  1012 		if (c>127 && c<160 || c>255)

  1013 		    g_print("    Line %ld column %ld - "

  1014 		      "Non-ISO-8859 character %u\n",

  1015 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);

  1016 		else

  1017 		    g_print("    Line %ld column %ld - "

  1018 		      "Non-ASCII character %u\n",

  1019 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);

  1020 	    else

  1021 		cnt_bin++;

  1022 	    eNon_A=TRUE;

  1023 	}

  1024 	if (!eTab && c==CHAR_TAB)

  1025 	{

  1026 	    if (pswit[ECHO_SWITCH])

  1027 		g_print("\n%s\n",aline);

  1028 	    if (!pswit[OVERVIEW_SWITCH])

  1029 		g_print("    Line %ld column %ld - Tab character?\n",

  1030 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1031 	    else

  1032 		cnt_odd++;

  1033 	    eTab=TRUE;

  1034 	}

  1035 	if (!eTilde && c==CHAR_TILDE)

  1036 	{

  1037 	    /*

  1038 	     * Often used by OCR software to indicate an

  1039 	     * unrecognizable character.

  1040 	     */

  1041 	    if (pswit[ECHO_SWITCH])

  1042 		g_print("\n%s\n",aline);

  1043 	    if (!pswit[OVERVIEW_SWITCH])

  1044 		g_print("    Line %ld column %ld - Tilde character?\n",

  1045 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1046 	    else

  1047 		cnt_odd++;

  1048 	    eTilde=TRUE;

  1049 	}

  1050 	if (!eCarat && c==CHAR_CARAT)

  1051 	{

  1052 	    if (pswit[ECHO_SWITCH])

  1053 		g_print("\n%s\n",aline);

  1054 	    if (!pswit[OVERVIEW_SWITCH])

  1055 		g_print("    Line %ld column %ld - Carat character?\n",

  1056 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1057 	    else

  1058 		cnt_odd++;

  1059 	    eCarat=TRUE;

  1060 	}

  1061 	if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)

  1062 	{

  1063 	    if (pswit[ECHO_SWITCH])

  1064 		g_print("\n%s\n",aline);

  1065 	    if (!pswit[OVERVIEW_SWITCH])

  1066 		g_print("    Line %ld column %ld - Forward slash?\n",

  1067 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1068 	    else

  1069 		cnt_odd++;

  1070 	    eFSlash=TRUE;

  1071 	}

  1072 	/*

  1073 	 * Report asterisks only in paranoid mode,

  1074 	 * since they're often deliberate.

  1075 	 */

  1076 	if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&

  1077 	  c==CHAR_ASTERISK)

  1078 	{

  1079 	    if (pswit[ECHO_SWITCH])

  1080 		g_print("\n%s\n",aline);

  1081 	    if (!pswit[OVERVIEW_SWITCH])

  1082 		g_print("    Line %ld column %ld - Asterisk?\n",

  1083 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1084 	    else

  1085 		cnt_odd++;

  1086 	    eAst=TRUE;

  1087 	}

  1088     }

  1089 }

  1091 /*

  1092  * check_for_long_line:

  1093  *

  1094  * Check for line too long.

  1095  */

  1096 void check_for_long_line(const char *aline)

  1097 {

  1098     if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)

  1099     {

  1100 	if (pswit[ECHO_SWITCH])

  1101 	    g_print("\n%s\n",aline);

  1102 	if (!pswit[OVERVIEW_SWITCH])

  1103 	    g_print("    Line %ld column %ld - Long line %ld\n",

  1104 	      linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));

  1105 	else

  1106 	    cnt_long++;

  1107     }

  1108 }

  1110 /*

  1111  * check_for_short_line:

  1112  *

  1113  * Check for line too short.

  1114  *

  1115  * This one is a bit trickier to implement: we don't want to

  1116  * flag the last line of a paragraph for being short, so we

  1117  * have to wait until we know that our current line is a

  1118  * "normal" line, then report the _previous_ line if it was too

  1119  * short. We also don't want to report indented lines like

  1120  * chapter heads or formatted quotations. We therefore keep

  1121  * last->len as the length of the last line examined, and

  1122  * last->blen as the length of the last but one, and try to

  1123  * suppress unnecessary warnings by checking that both were of

  1124  * "normal" length. We keep the first character of the last

  1125  * line in last->start, and if it was a space, we assume that

  1126  * the formatting is deliberate. I can't figure out a way to

  1127  * distinguish something like a quoted verse left-aligned or

  1128  * the header or footer of a letter from a paragraph of short

  1129  * lines - maybe if I examined the whole paragraph, and if the

  1130  * para has less than, say, 8 lines and if all lines are short,

  1131  * then just assume it's OK? Need to look at some texts to see

  1132  * how often a formula like this would get the right result.

  1133  */

  1134 void check_for_short_line(const char *aline,const struct line_properties *last)

  1135 {

  1136     if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&

  1137       last->len<SHORTEST_PG_LINE && last->blen>1 &&

  1138       last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)

  1139     {

  1140 	if (pswit[ECHO_SWITCH])

  1141 	    g_print("\n%s\n",prevline);

  1142 	if (!pswit[OVERVIEW_SWITCH])

  1143 	    g_print("    Line %ld column %ld - Short line %ld?\n",

  1144 	      linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));

  1145 	else

  1146 	    cnt_short++;

  1147     }

  1148 }

  1150 /*

  1151  * check_for_starting_punctuation:

  1152  *

  1153  * Look for punctuation other than full ellipses at start of line.

  1154  */

  1155 void check_for_starting_punctuation(const char *aline)

  1156 {

  1157     if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&

  1158       !g_str_has_prefix(aline,". . ."))

  1159     {

  1160 	if (pswit[ECHO_SWITCH])

  1161 	    g_print("\n%s\n",aline);

  1162 	if (!pswit[OVERVIEW_SWITCH])

  1163 	    g_print("    Line %ld column 1 - Begins with punctuation?\n",

  1164 	      linecnt);

  1165 	else

  1166 	    cnt_punct++;

  1167     }

  1168 }

  1170 /*

  1171  * str_emdash:

  1172  *

  1173  * Find the first em-dash, return a pointer to it and set <next> to the

  1174  * character following the dash.

  1175  */

  1176 char *str_emdash(const char *s,const char **next)

  1177 {

  1178     const char *s1,*s2;

  1179     s1=strstr(s,"--");

  1180     s2=strstr(s,"—");

  1181     if (!s1)

  1182     {

  1183 	if (s2)

  1184 	    *next=g_utf8_next_char(s2);

  1185 	return (char *)s2;

  1186     }

  1187     else if (!s2)

  1188     {

  1189 	*next=g_utf8_next_char(g_utf8_next_char(s1));

  1190 	return (char *)s1;

  1191     }

  1192     else if (s1<s2)

  1193     {

  1194 	*next=g_utf8_next_char(g_utf8_next_char(s1));

  1195 	return (char *)s1;

  1196     }

  1197     else

  1198     {

  1199 	*next=g_utf8_next_char(s2);

  1200 	return (char *)s2;

  1201     }

  1202 }

  1204 /*

  1205  * check_for_spaced_emdash:

  1206  *

  1207  * Check for spaced em-dashes.

  1208  *

  1209  * We must check _all_ occurrences of em-dashes on the line

  1210  * hence the loop - even if the first dash is OK

  1211  * there may be another that's wrong later on.

  1212  */

  1213 void check_for_spaced_emdash(const char *aline)

  1214 {

  1215     const char *s,*t,*next;

  1216     for (s=aline;t=str_emdash(s,&next);s=next)

  1217     {

  1218 	if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||

  1219 	  g_utf8_get_char(next)==CHAR_SPACE)

  1220 	{

  1221 	    if (pswit[ECHO_SWITCH])

  1222 		g_print("\n%s\n",aline);

  1223 	    if (!pswit[OVERVIEW_SWITCH])

  1224 		g_print("    Line %ld column %ld - Spaced em-dash?\n",

  1225 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);

  1226 	    else

  1227 		cnt_dash++;

  1228 	}

  1229     }

  1230 }

  1232 /*

  1233  * check_for_spaced_dash:

  1234  *

  1235  * Check for spaced dashes.

  1236  */

  1237 void check_for_spaced_dash(const char *aline)

  1238 {

  1239     const char *s;

  1240     if ((s=strstr(aline," -")))

  1241     {

  1242 	if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')

  1243 	{

  1244 	    if (pswit[ECHO_SWITCH])

  1245 		g_print("\n%s\n",aline);

  1246 	    if (!pswit[OVERVIEW_SWITCH])

  1247 		g_print("    Line %ld column %ld - Spaced dash?\n",

  1248 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1249 	    else

  1250 		cnt_dash++;

  1251 	}

  1252     }

  1253     else if ((s=strstr(aline,"- ")))

  1254     {

  1255 	if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')

  1256 	{

  1257 	    if (pswit[ECHO_SWITCH])

  1258 		g_print("\n%s\n",aline);

  1259 	    if (!pswit[OVERVIEW_SWITCH])

  1260 		g_print("    Line %ld column %ld - Spaced dash?\n",

  1261 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1262 	    else

  1263 		cnt_dash++;

  1264 	}

  1265     }

  1266 }

  1268 /*

  1269  * check_for_unmarked_paragraphs:

  1270  *

  1271  * Check for unmarked paragraphs indicated by separate speakers.

  1272  *

  1273  * May well be false positive:

  1274  * "Bravo!" "Wonderful!" called the crowd.

  1275  * but useful all the same.

  1276  */

  1277 void check_for_unmarked_paragraphs(const char *aline)

  1278 {

  1279     const char *s;

  1280     s=strstr(aline,"\"  \"");

  1281     if (!s)

  1282 	s=strstr(aline,"\" \"");

  1283     if (s)

  1284     {

  1285 	if (pswit[ECHO_SWITCH])

  1286 	    g_print("\n%s\n",aline);

  1287 	if (!pswit[OVERVIEW_SWITCH])

  1288 	    g_print("    Line %ld column %ld - "

  1289 	      "Query missing paragraph break?\n",

  1290 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1291 	else

  1292 	    cnt_punct++;

  1293     }

  1294 }

  1296 /*

  1297  * check_for_jeebies:

  1298  *

  1299  * Check for "to he" and other easy h/b errors.

  1300  *

  1301  * This is a very inadequate effort on the h/b problem,

  1302  * but the phrase "to he" is always an error, whereas "to

  1303  * be" is quite common.

  1304  * Similarly, '"Quiet!", be said.' is a non-be error

  1305  * "to he" is _not_ always an error!:

  1306  *       "Where they went to he couldn't say."

  1307  * Another false positive:

  1308  *       What would "Cinderella" be without the . . .

  1309  * and another: "If he wants to he can see for himself."

  1310  */

  1311 void check_for_jeebies(const char *aline)

  1312 {

  1313     const char *s;

  1314     s=strstr(aline," be could ");

  1315     if (!s)

  1316 	s=strstr(aline," be would ");

  1317     if (!s)

  1318 	s=strstr(aline," was be ");

  1319     if (!s)

  1320 	s=strstr(aline," be is ");

  1321     if (!s)

  1322 	s=strstr(aline," is be ");

  1323     if (!s)

  1324 	s=strstr(aline,"\", be ");

  1325     if (!s)

  1326 	s=strstr(aline,"\" be ");

  1327     if (!s)

  1328 	s=strstr(aline,"\" be ");

  1329     if (!s)

  1330 	s=strstr(aline," to he ");

  1331     if (s)

  1332     {

  1333 	if (pswit[ECHO_SWITCH])

  1334 	    g_print("\n%s\n",aline);

  1335 	if (!pswit[OVERVIEW_SWITCH])

  1336 	    g_print("    Line %ld column %ld - Query he/be error?\n",

  1337 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1338 	else

  1339 	    cnt_word++;

  1340     }

  1341     s=strstr(aline," the had ");

  1342     if (!s)

  1343 	s=strstr(aline," a had ");

  1344     if (!s)

  1345 	s=strstr(aline," they bad ");

  1346     if (!s)

  1347 	s=strstr(aline," she bad ");

  1348     if (!s)

  1349 	s=strstr(aline," he bad ");

  1350     if (!s)

  1351 	s=strstr(aline," you bad ");

  1352     if (!s)

  1353 	s=strstr(aline," i bad ");

  1354     if (s)

  1355     {

  1356 	if (pswit[ECHO_SWITCH])

  1357 	    g_print("\n%s\n",aline);

  1358 	if (!pswit[OVERVIEW_SWITCH])

  1359 	    g_print("    Line %ld column %ld - Query had/bad error?\n",

  1360 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1361 	else

  1362 	    cnt_word++;

  1363     }

  1364     s=strstr(aline,"; hut ");

  1365     if (!s)

  1366 	s=strstr(aline,", hut ");

  1367     if (s)

  1368     {

  1369 	if (pswit[ECHO_SWITCH])

  1370 	    g_print("\n%s\n",aline);

  1371 	if (!pswit[OVERVIEW_SWITCH])

  1372 	    g_print("    Line %ld column %ld - Query hut/but error?\n",

  1373 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1374 	else

  1375 	    cnt_word++;

  1376     }

  1377 }

  1379 /*

  1380  * check_for_mta_from:

  1381  *

  1382  * Special case - angled bracket in front of "From" placed there by an

  1383  * MTA when sending an e-mail.

  1384  */

  1385 void check_for_mta_from(const char *aline)

  1386 {

  1387     const char *s;

  1388     s=strstr(aline,">From");

  1389     if (s)

  1390     {

  1391 	if (pswit[ECHO_SWITCH])

  1392 	    g_print("\n%s\n",aline);

  1393 	if (!pswit[OVERVIEW_SWITCH])

  1394 	    g_print("    Line %ld column %ld - "

  1395 	      "Query angled bracket with From\n",

  1396 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1397 	else

  1398 	    cnt_punct++;

  1399     }

  1400 }

  1402 /*

  1403  * check_for_orphan_character:

  1404  *

  1405  * Check for a single character line -

  1406  * often an overflow from bad wrapping.

  1407  */

  1408 void check_for_orphan_character(const char *aline)

  1409 {

  1410     gunichar c;

  1411     c=g_utf8_get_char(aline);

  1412     if (c && !*g_utf8_next_char(aline))

  1413     {

  1414 	if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))

  1415 	    ; /* Nothing - ignore numerals alone on a line. */

  1416 	else

  1417 	{

  1418 	    if (pswit[ECHO_SWITCH])

  1419 		g_print("\n%s\n",aline);

  1420 	    if (!pswit[OVERVIEW_SWITCH])

  1421 		g_print("    Line %ld column 1 - Query single character line\n",

  1422 		  linecnt);

  1423 	    else

  1424 		cnt_punct++;

  1425 	}

  1426     }

  1427 }

  1429 /*

  1430  * check_for_pling_scanno:

  1431  *

  1432  * Check for I" - often should be !

  1433  */

  1434 void check_for_pling_scanno(const char *aline)

  1435 {

  1436     const char *s;

  1437     s=strstr(aline," I\"");

  1438     if (s)

  1439     {

  1440 	if (pswit[ECHO_SWITCH])

  1441 	    g_print("\n%s\n",aline);

  1442 	if (!pswit[OVERVIEW_SWITCH])

  1443 	    g_print("    Line %ld column %ld - Query I=exclamation mark?\n",

  1444 	      linecnt,g_utf8_pointer_to_offset(aline,s));

  1445 	else

  1446 	    cnt_punct++;

  1447     }

  1448 }

  1450 /*

  1451  * check_for_extra_period:

  1452  *

  1453  * Check for period without a capital letter. Cut-down from gutspell.

  1454  * Only works when it happens on a single line.

  1455  */

  1456 void check_for_extra_period(const char *aline,const struct warnings *warnings)

  1457 {

  1458     const char *s,*t,*s1,*sprev;

  1459     int i;

  1460     gsize len;

  1461     gboolean istypo;

  1462     gchar *testword;

  1463     gunichar c,nc,pc,*decomposition;

  1464     if (pswit[PARANOID_SWITCH])

  1465     {

  1466 	for (t=aline;t=strstr(t,". ");)

  1467 	{

  1468 	    if (t==aline)

  1469 	    {

  1470 		t=g_utf8_next_char(t);

  1471 		/* start of line punctuation is handled elsewhere */

  1472 		continue;

  1473 	    }

  1474 	    if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))

  1475 	    {

  1476 		t=g_utf8_next_char(t);

  1477 		continue;

  1478 	    }

  1479 	    if (warnings->isDutch)

  1480 	    {

  1481 		/* For Frank & Jeroen -- 's Middags case */

  1482 		gunichar c2,c3,c4,c5;

  1483 		c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));

  1484 		c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));

  1485 		c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));

  1486 		c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));

  1487 		if (CHAR_IS_APOSTROPHE(c2) &&

  1488 		  g_unichar_islower(c3) && c4==CHAR_SPACE &&

  1489 		  g_unichar_isupper(c5))

  1490 		{

  1491 		    t=g_utf8_next_char(t);

  1492 		    continue;

  1493 		}

  1494 	    }

  1495 	    s1=g_utf8_next_char(g_utf8_next_char(t));

  1496 	    while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&

  1497 	      !g_unichar_isdigit(g_utf8_get_char(s1)))

  1498 		s1=g_utf8_next_char(s1);

  1499 	    if (g_unichar_islower(g_utf8_get_char(s1)))

  1500 	    {

  1501 		/* we have something to investigate */

  1502 		istypo=TRUE;

  1503 		/* so let's go back and find out */

  1504 		nc=g_utf8_get_char(t);

  1505 		s1=g_utf8_prev_char(t);

  1506 		c=g_utf8_get_char(s1);

  1507 		sprev=g_utf8_prev_char(s1);

  1508 		pc=g_utf8_get_char(sprev);

  1509 		while (s1>=aline &&

  1510 		  (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||

  1511 		  g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&

  1512 		  g_unichar_isalpha(nc)))

  1513 		{

  1514 		    nc=c;

  1515 		    s1=sprev;

  1516 		    c=pc;

  1517 		    sprev=g_utf8_prev_char(s1);

  1518 		    pc=g_utf8_get_char(sprev);

  1519 		}

  1520 		s1=g_utf8_next_char(s1);

  1521 		s=strchr(s1,'.');

  1522 		if (s)

  1523 		    testword=g_strndup(s1,s-s1);

  1524 		else

  1525 		    testword=g_strdup(s1);

  1526 		for (i=0;*abbrev[i];i++)

  1527 		    if (!strcmp(testword,abbrev[i]))

  1528 			istypo=FALSE;

  1529 		if (g_unichar_isdigit(g_utf8_get_char(testword)))

  1530 		    istypo=FALSE;

  1531 		if (!*g_utf8_next_char(testword))

  1532 		    istypo=FALSE;

  1533 		if (isroman(testword))

  1534 		    istypo=FALSE;

  1535 		if (istypo)

  1536 		{

  1537 		    istypo=FALSE;

  1538 		    for (s=testword;*s;s=g_utf8_next_char(s))

  1539 		    {

  1540 			decomposition=g_unicode_canonical_decomposition(

  1541 			  g_utf8_get_char(s),&len);

  1542 			if (g_utf8_strchr("aeiou",-1,decomposition[0]))

  1543 			    istypo=TRUE;

  1544 			g_free(decomposition);

  1545 		    }

  1546 		}

  1547 		if (istypo &&

  1548 		  (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))

  1549 		{

  1550 		    g_tree_insert(qperiod,g_strdup(testword),

  1551 		      GINT_TO_POINTER(1));

  1552 		    if (pswit[ECHO_SWITCH])

  1553 			g_print("\n%s\n",aline);

  1554 		    if (!pswit[OVERVIEW_SWITCH])

  1555 			g_print("    Line %ld column %ld - Extra period?\n",

  1556 			  linecnt,g_utf8_pointer_to_offset(aline,t)+1);

  1557 		    else

  1558 			cnt_punct++;

  1559 		}

  1560 		g_free(testword);

  1561 	    }

  1562 	    t=g_utf8_next_char(t);

  1563 	}

  1564     }

  1565 }

  1567 /*

  1568  * check_for_following_punctuation:

  1569  *

  1570  * Check for words usually not followed by punctuation.

  1571  */

  1572 void check_for_following_punctuation(const char *aline)

  1573 {

  1574     int i;

  1575     const char *s,*wordstart;

  1576     gunichar c;

  1577     gchar *inword,*t;

  1578     if (pswit[TYPO_SWITCH])

  1579     {

  1580 	for (s=aline;*s;)

  1581 	{

  1582 	    wordstart=s;

  1583 	    t=getaword(&s);

  1584 	    if (!*t)

  1585 	    {

  1586 		g_free(t);

  1587 		continue;

  1588 	    }

  1589 	    inword=g_utf8_strdown(t,-1);

  1590 	    g_free(t);

  1591 	    for (i=0;*nocomma[i];i++)

  1592 		if (!strcmp(inword,nocomma[i]))

  1593 		{

  1594 		    c=g_utf8_get_char(s);

  1595 		    if (c==',' || c==';' || c==':')

  1596 		    {

  1597 			if (pswit[ECHO_SWITCH])

  1598 			    g_print("\n%s\n",aline);

  1599 			if (!pswit[OVERVIEW_SWITCH])

  1600 			    g_print("    Line %ld column %ld - "

  1601 			      "Query punctuation after %s?\n",

  1602 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,

  1603 			      inword);

  1604 			else

  1605 			    cnt_punct++;

  1606 		    }

  1607 		}

  1608 	    for (i=0;*noperiod[i];i++)

  1609 		if (!strcmp(inword,noperiod[i]))

  1610 		{

  1611 		    c=g_utf8_get_char(s);

  1612 		    if (c=='.' || c=='!')

  1613 		    {

  1614 			if (pswit[ECHO_SWITCH])

  1615 			    g_print("\n%s\n",aline);

  1616 			if (!pswit[OVERVIEW_SWITCH])

  1617 			    g_print("    Line %ld column %ld - "

  1618 			      "Query punctuation after %s?\n",

  1619 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,

  1620 			      inword);

  1621 			else

  1622 			    cnt_punct++;

  1623 		    }

  1624 		}

  1625 	    g_free(inword);

  1626 	}

  1627     }

  1628 }

  1630 /*

  1631  * check_for_typos:

  1632  *

  1633  * Check for commonly mistyped words,

  1634  * and digits like 0 for O in a word.

  1635  */

  1636 void check_for_typos(const char *aline,struct warnings *warnings)

  1637 {

  1638     const char *s,*t,*nt,*wordstart;

  1639     gchar *inword;

  1640     gunichar *decomposition;

  1641     gchar *testword;

  1642     int i,vowel,consonant,*dupcnt;

  1643     gboolean isdup,istypo,alower;

  1644     gunichar c,pc;

  1645     long offset,len;

  1646     gsize decomposition_len;

  1647     for (s=aline;*s;)

  1648     {

  1649 	wordstart=s;

  1650 	inword=getaword(&s);

  1651 	if (!*inword)

  1652 	{

  1653 	    g_free(inword);

  1654 	    continue; /* don't bother with empty lines */

  1655 	}

  1656 	if (mixdigit(inword))

  1657 	{

  1658 	    if (pswit[ECHO_SWITCH])

  1659 		g_print("\n%s\n",aline);

  1660 	    if (!pswit[OVERVIEW_SWITCH])

  1661 		g_print("    Line %ld column %ld - Query digit in %s\n",

  1662 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);

  1663 	    else

  1664 		cnt_word++;

  1665 	}

  1666 	/*

  1667 	 * Put the word through a series of tests for likely typos and OCR

  1668 	 * errors.

  1669 	 */

  1670 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])

  1671 	{

  1672 	    istypo=FALSE;

  1673 	    alower=FALSE;

  1674 	    for (t=inword;*t;t=g_utf8_next_char(t))

  1675 	    {

  1676 		c=g_utf8_get_char(t);

  1677 		nt=g_utf8_next_char(t);

  1678 		/* lowercase for testing */

  1679 		if (g_unichar_islower(c))

  1680 		    alower=TRUE;

  1681 		if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))

  1682 		{

  1683 		    /*

  1684 		     * We have an uppercase mid-word. However, there are

  1685 		     * common cases:

  1686 		     *   Mac and Mc like McGill

  1687 		     *   French contractions like l'Abbe

  1688 		     */

  1689 		    offset=g_utf8_pointer_to_offset(inword,t);

  1690 		    if (offset>0)

  1691 			pc=g_utf8_get_char(g_utf8_prev_char(t));

  1692 		    else

  1693 			pc='\0';

  1694 		    if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||

  1695 		      offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&

  1696 		      g_utf8_get_char(g_utf8_next_char(nt))=='c' ||

  1697 		      CHAR_IS_APOSTROPHE(pc))

  1698 			; /* do nothing! */

  1699 		    else

  1700 			istypo=TRUE;

  1701 		}

  1702 	    }

  1703 	    testword=g_utf8_casefold(inword,-1);

  1704 	}

  1705 	if (pswit[TYPO_SWITCH])

  1706 	{

  1707 	    /*

  1708 	     * Check for certain unlikely two-letter combinations at word

  1709 	     * start and end.

  1710 	     */

  1711 	    len=g_utf8_strlen(testword,-1);

  1712 	    if (len>1)

  1713 	    {

  1714 		for (i=0;*nostart[i];i++)

  1715 		    if (g_str_has_prefix(testword,nostart[i]))

  1716 			istypo=TRUE;

  1717 		for (i=0;*noend[i];i++)

  1718 		    if (g_str_has_suffix(testword,noend[i]))

  1719 			istypo=TRUE;

  1720 	    }

  1721 	    /* ght is common, gbt never. Like that. */

  1722 	    if (strstr(testword,"cb"))

  1723 		istypo=TRUE;

  1724 	    if (strstr(testword,"gbt"))

  1725 		istypo=TRUE;

  1726 	    if (strstr(testword,"pbt"))

  1727 		istypo=TRUE;

  1728 	    if (strstr(testword,"tbs"))

  1729 		istypo=TRUE;

  1730 	    if (strstr(testword,"mrn"))

  1731 		istypo=TRUE;

  1732 	    if (strstr(testword,"ahle"))

  1733 		istypo=TRUE;

  1734 	    if (strstr(testword,"ihle"))

  1735 		istypo=TRUE;

  1736 	    /*

  1737 	     * "TBE" does happen - like HEARTBEAT - but uncommon.

  1738 	     * Also "TBI" - frostbite, outbid - but uncommon.

  1739 	     * Similarly "ii" like Hawaii, or Pompeii, and in Roman

  1740 	     * numerals, but "ii" is a common scanno.

  1741 	     */

  1742 	    if (strstr(testword,"tbi"))

  1743 		istypo=TRUE;

  1744 	    if (strstr(testword,"tbe"))

  1745 		istypo=TRUE;

  1746 	    if (strstr(testword,"ii"))

  1747 		istypo=TRUE;

  1748 	    /*

  1749 	     * Check for no vowels or no consonants.

  1750 	     * If none, flag a typo.

  1751 	     */

  1752 	    if (!istypo && len>1)

  1753 	    {

  1754 		vowel=consonant=0;

  1755 		for (t=testword;*t;t=g_utf8_next_char(t))

  1756 		{

  1757 		    c=g_utf8_get_char(t);

  1758 		    decomposition=

  1759 		      g_unicode_canonical_decomposition(c,&decomposition_len);

  1760 		    if (c=='y' || g_unichar_isdigit(c))

  1761 		    {

  1762 			/* Yah, this is loose. */

  1763 			vowel++;

  1764 			consonant++;

  1765 		    }

  1766 		    else if (g_utf8_strchr("aeiou",-1,decomposition[0]))

  1767 			vowel++;

  1768 		    else

  1769 			consonant++;

  1770 		    g_free(decomposition);

  1771 		}

  1772 		if (!vowel || !consonant)

  1773 		    istypo=TRUE;

  1774 	    }

  1775 	    /*

  1776 	     * Now exclude the word from being reported if it's in

  1777 	     * the okword list.

  1778 	     */

  1779 	    for (i=0;*okword[i];i++)

  1780 		if (!strcmp(testword,okword[i]))

  1781 		    istypo=FALSE;

  1782 	    /*

  1783 	     * What looks like a typo may be a Roman numeral.

  1784 	     * Exclude these.

  1785 	     */

  1786 	    if (istypo && isroman(testword))

  1787 		istypo=FALSE;

  1788 	    /* Check the manual list of typos. */

  1789 	    if (!istypo)

  1790 		for (i=0;*typo[i];i++)

  1791 		    if (!strcmp(testword,typo[i]))

  1792 			istypo=TRUE;

  1793 	    /*

  1794 	     * Check lowercase s, l, i and m - special cases.

  1795 	     *   "j" - often a semi-colon gone wrong.

  1796 	     *   "d" for a missing apostrophe - he d

  1797 	     *   "n" for "in"

  1798 	     */

  1799 	    if (!istypo && len==1 &&

  1800 	      g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))

  1801 		istypo=TRUE;

  1802 	    if (istypo)

  1803 	    {

  1804 		dupcnt=g_tree_lookup(qword,testword);

  1805 		if (dupcnt)

  1806 		{

  1807 		    (*dupcnt)++;

  1808 		    isdup=!pswit[VERBOSE_SWITCH];

  1809 		}

  1810 		else

  1811 		{

  1812 		    dupcnt=g_new0(int,1);

  1813 		    g_tree_insert(qword,g_strdup(testword),dupcnt);

  1814 		    isdup=FALSE;

  1815 		}

  1816 		if (!isdup)

  1817 		{

  1818 		    if (pswit[ECHO_SWITCH])

  1819 			g_print("\n%s\n",aline);

  1820 		    if (!pswit[OVERVIEW_SWITCH])

  1821 		    {

  1822 			g_print("    Line %ld column %ld - Query word %s",

  1823 			  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,

  1824 			  inword);

  1825 			if (!pswit[VERBOSE_SWITCH])

  1826 			    g_print(" - not reporting duplicates");

  1827 			g_print("\n");

  1828 		    }

  1829 		    else

  1830 			cnt_word++;

  1831 		}

  1832 	    }

  1833 	}

  1834 	/* check the user's list of typos */

  1835 	if (!istypo && usertypo && g_tree_lookup(usertypo,testword))

  1836 	{

  1837 	    if (pswit[ECHO_SWITCH])

  1838 		g_print("\n%s\n",aline);

  1839 	    if (!pswit[OVERVIEW_SWITCH])

  1840 		g_print("    Line %ld column %ld - Query possible scanno %s\n",

  1841 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);

  1842 	}

  1843 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])

  1844 	    g_free(testword);

  1845 	if (pswit[PARANOID_SWITCH] && warnings->digit)

  1846 	{

  1847 	    /* In paranoid mode, query all 0 and 1 standing alone. */

  1848 	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))

  1849 	    {

  1850 		if (pswit[ECHO_SWITCH])

  1851 		    g_print("\n%s\n",aline);

  1852 		if (!pswit[OVERVIEW_SWITCH])

  1853 		    g_print("    Line %ld column %ld - Query standalone %s\n",

  1854 		      linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,

  1855 		      inword);

  1856 		else

  1857 		    cnt_word++;

  1858 	    }

  1859 	}

  1860 	g_free(inword);

  1861     }

  1862 }

  1864 /*

  1865  * check_for_misspaced_punctuation:

  1866  *

  1867  * Look for added or missing spaces around punctuation and quotes.

  1868  * If there is a punctuation character like ! with no space on

  1869  * either side, suspect a missing!space. If there are spaces on

  1870  * both sides , assume a typo. If we see a double quote with no

  1871  * space or punctuation on either side of it, assume unspaced

  1872  * quotes "like"this.

  1873  */

  1874 void check_for_misspaced_punctuation(const char *aline,

  1875   struct parities *parities,gboolean isemptyline)

  1876 {

  1877     gboolean isacro,isellipsis;

  1878     const char *s;

  1879     gunichar c,nc,pc,n2c;

  1880     int parity;

  1881     c=g_utf8_get_char(aline);

  1882     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  1883     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  1884     {

  1885 	pc=c;

  1886 	c=nc;

  1887 	nc=g_utf8_get_char(g_utf8_next_char(s));

  1888 	/* For each character in the line after the first. */

  1889 	if (g_utf8_strchr(".?!,;:_",-1,c))  /* if it's punctuation */

  1890 	{

  1891 	    /* we need to suppress warnings for acronyms like M.D. */

  1892 	    isacro=FALSE;

  1893 	    /* we need to suppress warnings for ellipsis . . . */

  1894 	    isellipsis=FALSE;

  1895 	    /*

  1896 	     * If there are letters on both sides of it or

  1897 	     * if it's strict punctuation followed by an alpha.

  1898 	     */

  1899 	    if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||

  1900 	      g_utf8_strchr("?!,;:",-1,c)))

  1901 	    {

  1902 		if (c=='.')

  1903 		{

  1904 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&

  1905 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')

  1906 			isacro=TRUE;

  1907 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));

  1908 		    if (nc && n2c=='.')

  1909 			isacro=TRUE;

  1910 		}

  1911 		if (!isacro)

  1912 		{

  1913 		    if (pswit[ECHO_SWITCH])

  1914 			g_print("\n%s\n",aline);

  1915 		    if (!pswit[OVERVIEW_SWITCH])

  1916 			g_print("    Line %ld column %ld - Missing space?\n",

  1917 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1918 		    else

  1919 			cnt_punct++;

  1920 		}

  1921 	    }

  1922 	    if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))

  1923 	    {

  1924 		/*

  1925 		 * If there are spaces on both sides,

  1926 		 * or space before and end of line.

  1927 		 */

  1928 		if (c=='.')

  1929 		{

  1930 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&

  1931 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')

  1932 			isellipsis=TRUE;

  1933 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));

  1934 		    if (nc && n2c=='.')

  1935 			isellipsis=TRUE;

  1936 		}

  1937 		if (!isemptyline && !isellipsis)

  1938 		{

  1939 		    if (pswit[ECHO_SWITCH])

  1940 			g_print("\n%s\n",aline);

  1941 		    if (!pswit[OVERVIEW_SWITCH])

  1942 			g_print("    Line %ld column %ld - "

  1943 			  "Spaced punctuation?\n",linecnt,

  1944 			  g_utf8_pointer_to_offset(aline,s)+1);

  1945 		    else

  1946 			cnt_punct++;

  1947 		}

  1948 	    }

  1949 	}

  1950     }

  1951     /* Split out the characters that CANNOT be preceded by space. */

  1952     c=g_utf8_get_char(aline);

  1953     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  1954     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  1955     {

  1956 	pc=c;

  1957 	c=nc;

  1958 	nc=g_utf8_get_char(g_utf8_next_char(s));

  1959 	/* for each character in the line after the first */

  1960 	if (g_utf8_strchr("?!,;:",-1,c))

  1961 	{

  1962 	    /* if it's punctuation that _cannot_ have a space before it */

  1963 	    if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)

  1964 	    {

  1965 		/*

  1966 		 * If nc DOES == space,

  1967 		 * it was already reported just above.

  1968 		 */

  1969 		if (pswit[ECHO_SWITCH])

  1970 		    g_print("\n%s\n",aline);

  1971 		if (!pswit[OVERVIEW_SWITCH])

  1972 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",

  1973 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1974 		else

  1975 		    cnt_punct++;

  1976 	    }

  1977 	}

  1978     }

  1979     /*

  1980      * Special case " .X" where X is any alpha.

  1981      * This plugs a hole in the acronym code above.

  1982      * Inelegant, but maintainable.

  1983      */

  1984     c=g_utf8_get_char(aline);

  1985     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  1986     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  1987     {

  1988 	pc=c;

  1989 	c=nc;

  1990 	nc=g_utf8_get_char(g_utf8_next_char(s));

  1991 	/* for each character in the line after the first */

  1992 	if (c=='.')

  1993 	{

  1994 	    /* if it's a period */

  1995 	    if (pc==CHAR_SPACE && g_unichar_isalpha(nc))

  1996 	    {

  1997 		/*

  1998 		 * If the period follows a space and

  1999 		 * is followed by a letter.

  2000 		 */

  2001 		if (pswit[ECHO_SWITCH])

  2002 		    g_print("\n%s\n",aline);

  2003 		if (!pswit[OVERVIEW_SWITCH])

  2004 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",

  2005 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2006 		else

  2007 		    cnt_punct++;

  2008 	    }

  2009 	}

  2010     }

  2011     c=g_utf8_get_char(aline);

  2012     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  2013     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  2014     {

  2015 	pc=c;

  2016 	c=nc;

  2017 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2018 	/* for each character in the line after the first */

  2019 	if (CHAR_IS_DQUOTE(c))

  2020 	{

  2021 	    if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&

  2022 	      !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||

  2023 	      !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))

  2024 	    {

  2025 		if (pswit[ECHO_SWITCH])

  2026 		    g_print("\n%s\n",aline);

  2027 		if (!pswit[OVERVIEW_SWITCH])

  2028 		    g_print("    Line %ld column %ld - Unspaced quotes?\n",

  2029 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2030 		else

  2031 		    cnt_punct++;

  2032 	    }

  2033 	}

  2034     }

  2035     /* Check parity of quotes. */

  2036     nc=g_utf8_get_char(aline);

  2037     for (s=aline;*s;s=g_utf8_next_char(s))

  2038     {

  2039 	c=nc;

  2040 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2041 	if (CHAR_IS_DQUOTE(c))

  2042 	{

  2043 	    if (c==CHAR_DQUOTE)

  2044 	    {

  2045 		parities->dquote=!parities->dquote;

  2046 		parity=parities->dquote;

  2047 	    }

  2048 	    else if (c==CHAR_LD_QUOTE)

  2049 		parity=1;

  2050 	    else

  2051 		parity=0;

  2052 	    if (!parity)

  2053 	    {

  2054 		/* parity even */

  2055 		if (!g_utf8_strchr("_-.'`‘’/,;:!?)]} ",-1,nc))

  2056 		{

  2057 		    if (pswit[ECHO_SWITCH])

  2058 			g_print("\n%s\n",aline);

  2059 		    if (!pswit[OVERVIEW_SWITCH])

  2060 			g_print("    Line %ld column %ld - "

  2061 			  "Wrongspaced quotes?\n",

  2062 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2063 		    else

  2064 			cnt_punct++;

  2065 		}

  2066 	    }

  2067 	    else

  2068 	    {

  2069 		/* parity odd */

  2070 		if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&

  2071 		  !g_utf8_strchr("_-/.'`‘’([{$",-1,nc) || !nc)

  2072 		{

  2073 		    if (pswit[ECHO_SWITCH])

  2074 			g_print("\n%s\n",aline);

  2075 		    if (!pswit[OVERVIEW_SWITCH])

  2076 			g_print("    Line %ld column %ld - "

  2077 			  "Wrongspaced quotes?\n",

  2078 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2079 		    else

  2080 			cnt_punct++;

  2081 		}

  2082 	    }

  2083 	}

  2084     }

  2085     c=g_utf8_get_char(aline);

  2086     if (CHAR_IS_DQUOTE(c))

  2087     {

  2088 	if (g_utf8_strchr(",;:!?)]} ",-1,

  2089 	  g_utf8_get_char(g_utf8_next_char(aline))))

  2090 	{

  2091 	    if (pswit[ECHO_SWITCH])

  2092 		g_print("\n%s\n",aline);

  2093 	    if (!pswit[OVERVIEW_SWITCH])

  2094 		g_print("    Line %ld column 1 - Wrongspaced quotes?\n",

  2095 		  linecnt);

  2096 	    else

  2097 		cnt_punct++;

  2098 	}

  2099     }

  2100     if (pswit[SQUOTE_SWITCH])

  2101     {

  2102 	nc=g_utf8_get_char(aline);

  2103 	for (s=aline;*s;s=g_utf8_next_char(s))

  2104 	{

  2105 	    c=nc;

  2106 	    nc=g_utf8_get_char(g_utf8_next_char(s));

  2107 	    if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&

  2108 	      !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||

  2109 	      !g_unichar_isalpha(nc)))

  2110 	    {

  2111 		parities->squote=!parities->squote;

  2112 		if (!parities->squote)

  2113 		{

  2114 		    /* parity even */

  2115 		    if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))

  2116 		    {

  2117 			if (pswit[ECHO_SWITCH])

  2118 			    g_print("\n%s\n",aline);

  2119 			if (!pswit[OVERVIEW_SWITCH])

  2120 			    g_print("    Line %ld column %ld - "

  2121 			      "Wrongspaced singlequotes?\n",

  2122 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2123 			else

  2124 			    cnt_punct++;

  2125 		    }

  2126 		}

  2127 		else

  2128 		{

  2129 		    /* parity odd */

  2130 		    if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) &&

  2131 		      !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)

  2132 		    {

  2133 			if (pswit[ECHO_SWITCH])

  2134 			    g_print("\n%s\n",aline);

  2135 			if (!pswit[OVERVIEW_SWITCH])

  2136 			    g_print("    Line %ld column %ld - "

  2137 			      "Wrongspaced singlequotes?\n",

  2138 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2139 			else

  2140 			    cnt_punct++;

  2141 		    }

  2142 		}

  2143 	    }

  2144 	}

  2145     }

  2146 }

  2148 /*

  2149  * check_for_double_punctuation:

  2150  *

  2151  * Look for double punctuation like ,. or ,,

  2152  * Thanks to DW for the suggestion!

  2153  * In books with references, ".," and ".;" are common

  2154  * e.g. "etc., etc.," and vol. 1.; vol 3.;

  2155  * OTOH, from my initial tests, there are also fairly

  2156  * common errors. What to do? Make these cases paranoid?

  2157  * ".," is the most common, so warnings->dotcomma is used

  2158  * to suppress detailed reporting if it occurs often.

  2159  */

  2160 void check_for_double_punctuation(const char *aline,struct warnings *warnings)

  2161 {

  2162     const char *s;

  2163     gunichar c,nc;

  2164     nc=g_utf8_get_char(aline);

  2165     for (s=aline;*s;s=g_utf8_next_char(s))

  2166     {

  2167 	c=nc;

  2168 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2169 	/* for each punctuation character in the line */

  2170 	if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&

  2171 	  g_utf8_strchr(".?!,;:",-1,nc))

  2172 	{

  2173 	    /* followed by punctuation, it's a query, unless . . . */

  2174 	    if (c==nc && (c=='.' || c=='?' || c=='!') ||

  2175 	      !warnings->dotcomma && c=='.' && nc==',' ||

  2176 	      warnings->isFrench && g_str_has_prefix(s,",...") ||

  2177 	      warnings->isFrench && g_str_has_prefix(s,"...,") ||

  2178 	      warnings->isFrench && g_str_has_prefix(s,";...") ||

  2179 	      warnings->isFrench && g_str_has_prefix(s,"...;") ||

  2180 	      warnings->isFrench && g_str_has_prefix(s,":...") ||

  2181 	      warnings->isFrench && g_str_has_prefix(s,"...:") ||

  2182 	      warnings->isFrench && g_str_has_prefix(s,"!...") ||

  2183 	      warnings->isFrench && g_str_has_prefix(s,"...!") ||

  2184 	      warnings->isFrench && g_str_has_prefix(s,"?...") ||

  2185 	      warnings->isFrench && g_str_has_prefix(s,"...?"))

  2186 	    {

  2187 		if (warnings->isFrench && g_str_has_prefix(s,",...") ||

  2188 		  warnings->isFrench && g_str_has_prefix(s,"...,") ||

  2189 		  warnings->isFrench && g_str_has_prefix(s,";...") ||

  2190 		  warnings->isFrench && g_str_has_prefix(s,"...;") ||

  2191 		  warnings->isFrench && g_str_has_prefix(s,":...") ||

  2192 		  warnings->isFrench && g_str_has_prefix(s,"...:") ||

  2193 		  warnings->isFrench && g_str_has_prefix(s,"!...") ||

  2194 		  warnings->isFrench && g_str_has_prefix(s,"...!") ||

  2195 		  warnings->isFrench && g_str_has_prefix(s,"?...") ||

  2196 		  warnings->isFrench && g_str_has_prefix(s,"...?"))

  2197 		{

  2198 		    s+=4;

  2199 		    nc=g_utf8_get_char(g_utf8_next_char(s));

  2200 		}

  2201 		; /* do nothing for .. !! and ?? which can be legit */

  2202 	    }

  2203 	    else

  2204 	    {

  2205 		if (pswit[ECHO_SWITCH])

  2206 		    g_print("\n%s\n",aline);

  2207 		if (!pswit[OVERVIEW_SWITCH])

  2208 		    g_print("    Line %ld column %ld - Double punctuation?\n",

  2209 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2210 		else

  2211 		    cnt_punct++;

  2212 	    }

  2213 	}

  2214     }

  2215 }

  2217 /*

  2218  * check_for_spaced_quotes:

  2219  */

  2220 void check_for_spaced_quotes(const char *aline)

  2221 {

  2222     int i;

  2223     const char *s,*t;

  2224     const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,

  2225       CHAR_RS_QUOTE};

  2226     GString *pattern;

  2227     s=aline;

  2228     while ((t=strstr(s," \" ")))

  2229     {

  2230 	if (pswit[ECHO_SWITCH])

  2231 	    g_print("\n%s\n",aline);

  2232 	if (!pswit[OVERVIEW_SWITCH])

  2233 	    g_print("    Line %ld column %ld - Spaced doublequote?\n",

  2234 	      linecnt,g_utf8_pointer_to_offset(aline,t)+1);

  2235 	else

  2236 	    cnt_punct++;

  2237 	s=g_utf8_next_char(g_utf8_next_char(t));

  2238     }

  2239     pattern=g_string_new(NULL);

  2240     for(i=0;i<G_N_ELEMENTS(single_quotes);i++)

  2241     {

  2242 	g_string_assign(pattern," ");

  2243 	g_string_append_unichar(pattern,single_quotes[i]);

  2244 	g_string_append_c(pattern,' ');

  2245 	s=aline;

  2246 	while ((t=strstr(s,pattern->str)))

  2247 	{

  2248 	    if (pswit[ECHO_SWITCH])

  2249 		g_print("\n%s\n",aline);

  2250 	    if (!pswit[OVERVIEW_SWITCH])

  2251 		g_print("    Line %ld column %ld - Spaced singlequote?\n",

  2252 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);

  2253 	    else

  2254 		cnt_punct++;

  2255 	    s=g_utf8_next_char(g_utf8_next_char(t));

  2256 	}

  2257     }

  2258     g_string_free(pattern,TRUE);

  2259 }

  2261 /*

  2262  * check_for_miscased_genative:

  2263  *

  2264  * Check special case of 'S instead of 's at end of word.

  2265  */

  2266 void check_for_miscased_genative(const char *aline)

  2267 {

  2268     const char *s;

  2269     gunichar c,nc,pc;

  2270     if (!*aline)

  2271 	return;

  2272     c=g_utf8_get_char(aline);

  2273     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  2274     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  2275     {

  2276 	pc=c;

  2277 	c=nc;

  2278 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2279 	if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))

  2280 	{

  2281 	    if (pswit[ECHO_SWITCH])

  2282 		g_print("\n%s\n",aline);

  2283 	    if (!pswit[OVERVIEW_SWITCH])

  2284 		g_print("    Line %ld column %ld - Capital \"S\"?\n",

  2285 		  linecnt,g_utf8_pointer_to_offset(aline,s)+2);

  2286 	    else

  2287 		cnt_punct++;

  2288 	}

  2289     }

  2290 }

  2292 /*

  2293  * check_end_of_line:

  2294  *

  2295  * Now check special cases - start and end of line -

  2296  * for single and double quotes. Start is sometimes [sic]

  2297  * but better to query it anyway.

  2298  * While we're here, check for dash at end of line.

  2299  */

  2300 void check_end_of_line(const char *aline,struct warnings *warnings)

  2301 {

  2302     int lbytes;

  2303     const char *s;

  2304     gunichar c1,c2;

  2305     lbytes=strlen(aline);

  2306     if (g_utf8_strlen(aline,lbytes)>1)

  2307     {

  2308 	s=g_utf8_prev_char(aline+lbytes);

  2309 	c1=g_utf8_get_char(s);

  2310 	c2=g_utf8_get_char(g_utf8_prev_char(s));

  2311 	if ((CHAR_IS_DQUOTE(c1) || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)

  2312 	{

  2313 	    if (pswit[ECHO_SWITCH])

  2314 		g_print("\n%s\n",aline);

  2315 	    if (!pswit[OVERVIEW_SWITCH])

  2316 		g_print("    Line %ld column %ld - Spaced quote?\n",linecnt,

  2317 		  g_utf8_strlen(aline,lbytes));

  2318 	    else

  2319 		cnt_punct++;

  2320 	}

  2321 	c1=g_utf8_get_char(aline);

  2322 	c2=g_utf8_get_char(g_utf8_next_char(aline));

  2323 	if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)

  2324 	{

  2325 	    if (pswit[ECHO_SWITCH])

  2326 		g_print("\n%s\n",aline);

  2327 	    if (!pswit[OVERVIEW_SWITCH])

  2328 		g_print("    Line %ld column 1 - Spaced quote?\n",linecnt);

  2329 	    else

  2330 		cnt_punct++;

  2331 	}

  2332 	/*

  2333 	 * Dash at end of line may well be legit - paranoid mode only

  2334 	 * and don't report em-dash at line-end.

  2335 	 */

  2336 	if (pswit[PARANOID_SWITCH] && warnings->hyphen)

  2337 	{

  2338 	    for (s=g_utf8_prev_char(aline+lbytes);

  2339 	      s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))

  2340 		;

  2341 	    if (g_utf8_get_char(s)=='-' &&

  2342 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')

  2343 	    {

  2344 		if (pswit[ECHO_SWITCH])

  2345 		    g_print("\n%s\n",aline);

  2346 		if (!pswit[OVERVIEW_SWITCH])

  2347 		    g_print("    Line %ld column %ld - "

  2348 		      "Hyphen at end of line?\n",

  2349 		      linecnt,g_utf8_pointer_to_offset(aline,s));

  2350 	    }

  2351 	}

  2352     }

  2353 }

  2355 /*

  2356  * check_for_unspaced_bracket:

  2357  *

  2358  * Brackets are often unspaced, but shouldn't be surrounded by alpha.

  2359  * If so, suspect a scanno like "a]most".

  2360  */

  2361 void check_for_unspaced_bracket(const char *aline)

  2362 {

  2363     const char *s;

  2364     gunichar c,nc,pc;

  2365     c=g_utf8_get_char(aline);

  2366     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  2367     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  2368     {

  2369 	pc=c;

  2370 	c=nc;

  2371 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2372 	if (!nc)

  2373 	    break;

  2374 	/* for each bracket character in the line except 1st & last */

  2375 	if (g_utf8_strchr("{[()]}",-1,c) &&

  2376 	  g_unichar_isalpha(pc) && g_unichar_isalpha(nc))

  2377 	{

  2378 	    if (pswit[ECHO_SWITCH])

  2379 		g_print("\n%s\n",aline);

  2380 	    if (!pswit[OVERVIEW_SWITCH])

  2381 		g_print("    Line %ld column %ld - Unspaced bracket?\n",

  2382 		  linecnt,g_utf8_pointer_to_offset(aline,s));

  2383 	    else

  2384 		cnt_punct++;

  2385 	}

  2386     }

  2387 }

  2389 /*

  2390  * check_for_unpunctuated_endquote:

  2391  */

  2392 void check_for_unpunctuated_endquote(const char *aline)

  2393 {

  2394     const char *s;

  2395     gunichar c,nc,pc;

  2396     QuoteClass qc;

  2397     c=g_utf8_get_char(aline);

  2398     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  2399     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  2400     {

  2401 	pc=c;

  2402 	c=nc;

  2403 	qc=CHAR_IS_DQUOTE(c)?QUOTE_CLASS(c):INVALID_QUOTE;

  2404 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2405 	/* for each character in the line except 1st */

  2406 	if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) && g_unichar_isalpha(pc))

  2407 	{

  2408 	    if (pswit[ECHO_SWITCH])

  2409 		g_print("\n%s\n",aline);

  2410 	    if (!pswit[OVERVIEW_SWITCH])

  2411 		g_print("    Line %ld column %ld - "

  2412 		  "endquote missing punctuation?\n",

  2413 		  linecnt,g_utf8_pointer_to_offset(aline,s));

  2414 	    else

  2415 		cnt_punct++;

  2416 	}

  2417     }

  2418 }

  2420 /*

  2421  * check_for_html_tag:

  2422  *

  2423  * Check for <HTML TAG>.

  2424  *

  2425  * If there is a < in the line, followed at some point

  2426  * by a > then we suspect HTML.

  2427  */

  2428 void check_for_html_tag(const char *aline)

  2429 {

  2430     const char *open,*close;

  2431     gchar *tag;

  2432     open=strchr(aline,'<');

  2433     if (open)

  2434     {

  2435 	close=strchr(g_utf8_next_char(open),'>');

  2436 	if (close)

  2437 	{

  2438 	    if (pswit[ECHO_SWITCH])

  2439 		g_print("\n%s\n",aline);

  2440 	    if (!pswit[OVERVIEW_SWITCH])

  2441 	    {

  2442 		tag=g_strndup(open,close-open+1);

  2443 		g_print("    Line %ld column %ld - HTML Tag? %s \n",

  2444 		  linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);

  2445 		g_free(tag);

  2446 	    }

  2447 	    else

  2448 		cnt_html++;

  2449 	}

  2450     }

  2451 }

  2453 /*

  2454  * check_for_html_entity:

  2455  *

  2456  * Check for &symbol; HTML.

  2457  *

  2458  * If there is a & in the line, followed at

  2459  * some point by a ; then we suspect HTML.

  2460  */

  2461 void check_for_html_entity(const char *aline)

  2462 {

  2463     const char *s,*amp,*scolon;

  2464     gchar *entity;

  2465     amp=strchr(aline,'&');

  2466     if (amp)

  2467     {

  2468 	scolon=strchr(amp,';');

  2469 	if (scolon)

  2470 	{

  2471 	    for (s=amp;s<scolon;s=g_utf8_next_char(s))

  2472 		if (g_utf8_get_char(s)==CHAR_SPACE)

  2473 		    break;		/* Don't report "Jones & Son;" */

  2474 	    if (s>=scolon)

  2475 	    {

  2476 		if (pswit[ECHO_SWITCH])

  2477 		    g_print("\n%s\n",aline);

  2478 		if (!pswit[OVERVIEW_SWITCH])

  2479 		{

  2480 		    entity=g_strndup(amp,scolon-amp+1);

  2481 		    g_print("    Line %ld column %d - HTML symbol? %s \n",

  2482 		      linecnt,(int)(amp-aline)+1,entity);

  2483 		    g_free(entity);

  2484 		}

  2485 		else

  2486 		    cnt_html++;

  2487 	    }

  2488 	}

  2489     }

  2490 }

  2492 /*

  2493  * check_for_omitted_punctuation:

  2494  *

  2495  * Check for omitted punctuation at end of paragraph by working back

  2496  * through prevline. DW.

  2497  * Need to check this only for "normal" paras.

  2498  * So what is a "normal" para?

  2499  *    Not normal if one-liner (chapter headings, etc.)

  2500  *    Not normal if doesn't contain at least one locase letter

  2501  *    Not normal if starts with space

  2502  */

  2503 void check_for_omitted_punctuation(const char *prevline,

  2504   struct line_properties *last,int start_para_line)

  2505 {

  2506     gboolean letter_on_line=FALSE;

  2507     const char *s;

  2508     gunichar c;

  2509     gboolean closing_quote;

  2510     for (s=prevline;*s;s=g_utf8_next_char(s))

  2511 	if (g_unichar_isalpha(g_utf8_get_char(s)))

  2512 	{

  2513 	    letter_on_line=TRUE;

  2514 	    break;

  2515 	}

  2516     /*

  2517      * This next "if" is a problem.

  2518      * If we say "start_para_line <= linecnt - 1", that includes

  2519      * one-line "paragraphs" like chapter heads. Lotsa false positives.

  2520      * If we say "start_para_line < linecnt - 1" it doesn't, but then it

  2521      * misses genuine one-line paragraphs.

  2522      */

  2523     if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&

  2524       g_utf8_get_char(prevline)>CHAR_SPACE)

  2525     {

  2526 	s=prevline+strlen(prevline);

  2527 	do

  2528 	{

  2529 	    s=g_utf8_prev_char(s);

  2530 	    c=g_utf8_get_char(s);

  2531 	    if (QUOTE_CLASS(c)==CLOSING_QUOTE || QUOTE_CLASS(c)==NEUTRAL_QUOTE)

  2532 		closing_quote=TRUE;

  2533 	    else

  2534 		closing_quote=FALSE;

  2535 	} while (closing_quote && s>prevline);

  2536 	for (;s>prevline;s=g_utf8_prev_char(s))

  2537 	{

  2538 	    if (g_unichar_isalpha(g_utf8_get_char(s)))

  2539 	    {

  2540 		if (pswit[ECHO_SWITCH])

  2541 		    g_print("\n%s\n",prevline);

  2542 		if (!pswit[OVERVIEW_SWITCH])

  2543 		    g_print("    Line %ld column %ld - "

  2544 		      "No punctuation at para end?\n",

  2545 		      linecnt-1,g_utf8_strlen(prevline,-1));

  2546 		else

  2547 		    cnt_punct++;

  2548 		break;

  2549 	    }

  2550 	    if (g_utf8_strchr("—-.:!([{?}])",-1,g_utf8_get_char(s)))

  2551 		break;

  2552 	}

  2553     }

  2554 }

  2556 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)

  2557 {

  2558     const char *word=key;

  2559     int *dupcnt=value;

  2560     if (*dupcnt)

  2561 	g_print("\nNote: Queried word %s was duplicated %d times\n",

  2562 	  word,*dupcnt);

  2563     return FALSE;

  2564 }

  2566 void print_as_windows_1252(const char *string)

  2567 {

  2568     gsize inbytes,outbytes;

  2569     gchar *buf,*bp;

  2570     static GIConv converter=(GIConv)-1;

  2571     if (!string)

  2572     {

  2573 	if (converter!=(GIConv)-1)

  2574 	    g_iconv_close(converter);

  2575 	converter=(GIConv)-1;

  2576 	return;

  2577     }

  2578     if (converter==(GIConv)-1)

  2579 	converter=g_iconv_open("WINDOWS-1252","UTF-8");

  2580     if (converter!=(GIConv)-1)

  2581     {

  2582 	inbytes=outbytes=strlen(string);

  2583 	bp=buf=g_malloc(outbytes+1);

  2584 	g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);

  2585 	*bp='\0';

  2586 	fputs(buf,stdout);

  2587 	g_free(buf);

  2588     }

  2589     else

  2590 	fputs(string,stdout);

  2591 }

  2593 void print_as_utf_8(const char *string)

  2594 {

  2595     fputs(string,stdout);

  2596 }

  2598 /*

  2599  * procfile:

  2600  *

  2601  * Process one file.

  2602  */

  2603 void procfile(const char *filename)

  2604 {

  2605     const char *s;

  2606     gchar *parastart=NULL;	/* first line of current para */

  2607     gchar *etext,*aline;

  2608     gchar *etext_ptr;

  2609     GError *err=NULL;

  2610     struct first_pass_results *first_pass_results;

  2611     struct warnings *warnings;

  2612     struct counters counters={0};

  2613     struct line_properties last={0};

  2614     struct parities parities={0};

  2615     struct pending pending={0};

  2616     gboolean isemptyline;

  2617     long start_para_line=0;

  2618     gboolean isnewpara=FALSE,enddash=FALSE;

  2619     last.start=CHAR_SPACE;

  2620     linecnt=checked_linecnt=0;

  2621     etext=read_etext(filename,&err);

  2622     if (!etext)

  2623     {

  2624 	if (pswit[STDOUT_SWITCH])

  2625 	    fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);

  2626 	else

  2627 	    fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);

  2628 	exit(1);

  2629     }

  2630     g_print("\n\nFile: %s\n\n",filename);

  2631     first_pass_results=first_pass(etext);

  2632     warnings=report_first_pass(first_pass_results);

  2633     qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);

  2634     qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);

  2635     /*

  2636      * Here we go with the main pass. Hold onto yer hat!

  2637      */

  2638     linecnt=0;

  2639     etext_ptr=etext;

  2640     while ((aline=flgets(&etext_ptr,linecnt+1,warnings->nocr)))

  2641     {

  2642 	linecnt++;

  2643 	if (linecnt==1)

  2644 	    isnewpara=TRUE;

  2645 	if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))

  2646 	    continue;    // skip DP page separators completely

  2647 	if (linecnt<first_pass_results->firstline ||

  2648 	  (first_pass_results->footerline>0 &&

  2649 	  linecnt>first_pass_results->footerline))

  2650 	{

  2651 	    if (pswit[HEADER_SWITCH])

  2652 	    {

  2653 		if (g_str_has_prefix(aline,"Title:"))

  2654 		    g_print("    %s\n",aline);

  2655 		if (g_str_has_prefix(aline,"Author:"))

  2656 		    g_print("    %s\n",aline);

  2657 		if (g_str_has_prefix(aline,"Release Date:"))

  2658 		    g_print("    %s\n",aline);

  2659 		if (g_str_has_prefix(aline,"Edition:"))

  2660 		    g_print("    %s\n\n",aline);

  2661 	    }

  2662 	    continue;		/* skip through the header */

  2663 	}

  2664 	checked_linecnt++;

  2665 	print_pending(aline,parastart,&pending);

  2666 	isemptyline=analyse_quotes(aline,&counters);

  2667 	if (isnewpara && !isemptyline)

  2668 	{

  2669 	    /* This line is the start of a new paragraph. */

  2670 	    start_para_line=linecnt;

  2671 	    /* Capture its first line in case we want to report it later. */

  2672 	    g_free(parastart);

  2673 	    parastart=g_strdup(aline);

  2674 	    memset(&parities,0,sizeof(parities));  /* restart the quote count */

  2675 	    s=aline;

  2676 	    while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&

  2677 	      !g_unichar_isdigit(g_utf8_get_char(s)))

  2678 		s=g_utf8_next_char(s);

  2679 	    if (g_unichar_islower(g_utf8_get_char(s)))

  2680 	    {

  2681 		/* and its first letter is lowercase */

  2682 		if (pswit[ECHO_SWITCH])

  2683 		    g_print("\n%s\n",aline);

  2684 		if (!pswit[OVERVIEW_SWITCH])

  2685 		    g_print("    Line %ld column %ld - "

  2686 		      "Paragraph starts with lower-case\n",

  2687 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2688 		else

  2689 		    cnt_punct++;

  2690 	    }

  2691 	    isnewpara=FALSE; /* Signal the end of new para processing. */

  2692 	}

  2693 	/* Check for an em-dash broken at line end. */

  2694 	if (enddash && g_utf8_get_char(aline)=='-')

  2695 	{

  2696 	    if (pswit[ECHO_SWITCH])

  2697 		g_print("\n%s\n",aline);

  2698 	    if (!pswit[OVERVIEW_SWITCH])

  2699 		g_print("    Line %ld column 1 - Broken em-dash?\n",linecnt);

  2700 	    else

  2701 		cnt_punct++;

  2702 	}

  2703 	enddash=FALSE;

  2704 	for (s=g_utf8_prev_char(aline+strlen(aline));

  2705 	  g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))

  2706 	    ;

  2707 	if (s>=aline && g_utf8_get_char(s)=='-')

  2708 	    enddash=TRUE;

  2709 	check_for_control_characters(aline);

  2710 	if (warnings->bin)

  2711 	    check_for_odd_characters(aline,warnings,isemptyline);

  2712 	if (warnings->longline)

  2713 	    check_for_long_line(aline);

  2714 	if (warnings->shortline)

  2715 	    check_for_short_line(aline,&last);

  2716 	last.blen=last.len;

  2717 	last.len=g_utf8_strlen(aline,-1);

  2718 	last.start=g_utf8_get_char(aline);

  2719 	check_for_starting_punctuation(aline);

  2720 	if (warnings->dash)

  2721 	{

  2722 	    check_for_spaced_emdash(aline);

  2723 	    check_for_spaced_dash(aline);

  2724 	}

  2725 	check_for_unmarked_paragraphs(aline);

  2726 	check_for_jeebies(aline);

  2727 	check_for_mta_from(aline);

  2728 	check_for_orphan_character(aline);

  2729 	check_for_pling_scanno(aline);

  2730 	check_for_extra_period(aline,warnings);

  2731 	check_for_following_punctuation(aline);

  2732 	check_for_typos(aline,warnings);

  2733 	check_for_misspaced_punctuation(aline,&parities,isemptyline);

  2734 	check_for_double_punctuation(aline,warnings);

  2735 	check_for_spaced_quotes(aline);

  2736 	check_for_miscased_genative(aline);

  2737 	check_end_of_line(aline,warnings);

  2738 	check_for_unspaced_bracket(aline);

  2739 	if (warnings->endquote)

  2740 	    check_for_unpunctuated_endquote(aline);

  2741 	check_for_html_tag(aline);

  2742 	check_for_html_entity(aline);

  2743 	if (isemptyline)

  2744 	{

  2745 	    check_for_mismatched_quotes(&counters,&pending);

  2746 	    counters_reset(&counters);

  2747 	    /* let the next iteration know that it's starting a new para */

  2748 	    isnewpara=TRUE;

  2749 	    if (prevline)

  2750 		check_for_omitted_punctuation(prevline,&last,start_para_line);

  2751 	}

  2752 	g_free(prevline);

  2753 	prevline=g_strdup(aline);

  2754     }

  2755     linecnt++;

  2756     check_for_mismatched_quotes(&counters,&pending);

  2757     print_pending(NULL,parastart,&pending);

  2758     reset_pending(&pending);

  2759     if (prevline)

  2760     {

  2761 	g_free(prevline);

  2762 	prevline=NULL;

  2763     }

  2764     g_free(parastart);

  2765     g_free(prevline);

  2766     g_free(etext);

  2767     if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])

  2768 	g_tree_foreach(qword,report_duplicate_queries,NULL);

  2769     g_tree_unref(qword);

  2770     g_tree_unref(qperiod);

  2771     counters_destroy(&counters);

  2772     g_set_print_handler(NULL);

  2773     print_as_windows_1252(NULL);

  2774     if (pswit[MARKUP_SWITCH])

  2775 	loseentities(NULL);

  2776 }

  2778 /*

  2779  * flgets:

  2780  *

  2781  * Get one line from the input text, checking for

  2782  * the existence of exactly one CR/LF line-end per line.

  2783  *

  2784  * Returns: a pointer to the line.

  2785  */

  2786 char *flgets(char **etext,long lcnt,gboolean warn_nocr)

  2787 {

  2788     gunichar c;

  2789     gboolean isCR=FALSE;

  2790     char *theline=*etext;

  2791     char *eos=theline;

  2792     gchar *s;

  2793     for (;;)

  2794     {

  2795 	c=g_utf8_get_char(*etext);

  2796 	if (!c)

  2797 	{

  2798 	    if (*etext==theline)

  2799 		return NULL;

  2800 	    else if (pswit[LINE_END_SWITCH])

  2801 	    {

  2802 		if (pswit[ECHO_SWITCH])

  2803 		{

  2804 		    s=g_strndup(theline,eos-theline);

  2805 		    g_print("\n%s\n",s);

  2806 		    g_free(s);

  2807 		}

  2808 		if (!pswit[OVERVIEW_SWITCH])

  2809 		    /* There may, or may not, have been a CR */

  2810 		    g_print("    Line %ld - No LF?\n",lcnt);

  2811 		else

  2812 		    cnt_lineend++;

  2813 	    }

  2814 	    break;

  2815 	}

  2816 	*etext=g_utf8_next_char(*etext);

  2817 	/* either way, it's end of line */

  2818 	if (c=='\n')

  2819 	{

  2820 	    if (isCR)

  2821 		break;

  2822 	    else

  2823 	    {

  2824 		/* Error - a LF without a preceding CR */

  2825 		if (pswit[LINE_END_SWITCH] && warn_nocr)

  2826 		{

  2827 		    if (pswit[ECHO_SWITCH])

  2828 		    {

  2829 			s=g_strndup(theline,eos-theline);

  2830 			g_print("\n%s\n",s);

  2831 			g_free(s);

  2832 		    }

  2833 		    if (!pswit[OVERVIEW_SWITCH])

  2834 			g_print("    Line %ld - No CR?\n",lcnt);

  2835 		    else

  2836 			cnt_lineend++;

  2837 		}

  2838 		break;

  2839 	    }

  2840 	}

  2841 	if (c=='\r')

  2842 	{

  2843 	    if (isCR)

  2844 	    {

  2845 		/* Error - two successive CRs */

  2846 		if (pswit[LINE_END_SWITCH])

  2847 		{

  2848 		    if (pswit[ECHO_SWITCH])

  2849 		    {

  2850 			s=g_strndup(theline,eos-theline);

  2851 			g_print("\n%s\n",s);

  2852 			g_free(s);

  2853 		    }

  2854 		    if (!pswit[OVERVIEW_SWITCH])

  2855 			g_print("    Line %ld - Two successive CRs?\n",lcnt);

  2856 		    else

  2857 			cnt_lineend++;

  2858 		}

  2859 	    }

  2860 	    isCR=TRUE;

  2861 	}

  2862 	else

  2863 	{

  2864 	    if (pswit[LINE_END_SWITCH] && isCR)

  2865 	    {

  2866 		if (pswit[ECHO_SWITCH])

  2867 		{

  2868 		    s=g_strndup(theline,eos-theline);

  2869 		    g_print("\n%s\n",s);

  2870 		    g_free(s);

  2871 		}

  2872 		if (!pswit[OVERVIEW_SWITCH])

  2873 		    g_print("    Line %ld column %ld - CR without LF?\n",

  2874 		      lcnt,g_utf8_pointer_to_offset(theline,eos)+1);

  2875 		else

  2876 		    cnt_lineend++;

  2877 		*eos=' ';

  2878 	    }

  2879 	    isCR=FALSE;

  2880 	    eos=g_utf8_next_char(eos);

  2881 	}

  2882     }

  2883     *eos='\0';

  2884     if (pswit[MARKUP_SWITCH])

  2885 	postprocess_for_HTML(theline);

  2886     if (pswit[DP_SWITCH])

  2887 	postprocess_for_DP(theline);

  2888     return theline;

  2889 }

  2891 /*

  2892  * mixdigit:

  2893  *

  2894  * Takes a "word" as a parameter, and checks whether it

  2895  * contains a mixture of alpha and digits. Generally, this is an

  2896  * error, but may not be for cases like 4th or L5 12s. 3d.

  2897  *

  2898  * Returns: TRUE iff an is error found.

  2899  */

  2900 gboolean mixdigit(const char *checkword)

  2901 {

  2902     gboolean wehaveadigit,wehavealetter,query;

  2903     const char *s,*nondigit;

  2904     wehaveadigit=wehavealetter=query=FALSE;

  2905     for (s=checkword;*s;s=g_utf8_next_char(s))

  2906 	if (g_unichar_isalpha(g_utf8_get_char(s)))

  2907 	    wehavealetter=TRUE;

  2908 	else if (g_unichar_isdigit(g_utf8_get_char(s)))

  2909 	    wehaveadigit=TRUE;

  2910     if (wehaveadigit && wehavealetter)

  2911     {

  2912 	/* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */

  2913 	query=TRUE;

  2914 	for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));

  2915 	  nondigit=g_utf8_next_char(nondigit))

  2916 	    ;

  2917 	/* digits, ending in st, rd, nd, th of either case */

  2918 	if (!g_ascii_strcasecmp(nondigit,"st") ||

  2919 	  !g_ascii_strcasecmp(nondigit,"rd") ||

  2920 	  !g_ascii_strcasecmp(nondigit,"nd") ||

  2921 	  !g_ascii_strcasecmp(nondigit,"th"))

  2922 	    query=FALSE;

  2923 	if (!g_ascii_strcasecmp(nondigit,"sts") ||

  2924 	  !g_ascii_strcasecmp(nondigit,"rds") ||

  2925 	  !g_ascii_strcasecmp(nondigit,"nds") ||

  2926 	  !g_ascii_strcasecmp(nondigit,"ths"))

  2927 	    query=FALSE;

  2928 	if (!g_ascii_strcasecmp(nondigit,"stly") ||

  2929 	  !g_ascii_strcasecmp(nondigit,"rdly") ||

  2930 	  !g_ascii_strcasecmp(nondigit,"ndly") ||

  2931 	  !g_ascii_strcasecmp(nondigit,"thly"))

  2932 	    query=FALSE;

  2933 	/* digits, ending in l, L, s or d */

  2934 	if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||

  2935 	  !strcmp(nondigit,"d"))

  2936 	    query=FALSE;

  2937 	/*

  2938 	 * L at the start of a number, representing Britsh pounds, like L500.

  2939 	 * This is cute. We know the current word is mixed digit. If the first

  2940 	 * letter is L, there must be at least one digit following. If both

  2941 	 * digits and letters follow, we have a genuine error, else we have a

  2942 	 * capital L followed by digits, and we accept that as a non-error.

  2943 	 */

  2944 	if (g_utf8_get_char(checkword)=='L' &&

  2945 	  !mixdigit(g_utf8_next_char(checkword)))

  2946 	    query=FALSE;

  2947     }

  2948     return query;

  2949 }

  2951 /*

  2952  * getaword:

  2953  *

  2954  * Extracts the first/next "word" from the line, and returns it.

  2955  * A word is defined as one English word unit--or at least that's the aim.

  2956  * "ptr" is advanced to the position in the line where we will start

  2957  * looking for the next word.

  2958  *

  2959  * Returns: A newly-allocated string.

  2960  */

  2961 gchar *getaword(const char **ptr)

  2962 {

  2963     const char *s,*t;

  2964     GString *word;

  2965     gunichar c,pc;

  2966     word=g_string_new(NULL);

  2967     for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&

  2968       !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&

  2969       **ptr;*ptr=g_utf8_next_char(*ptr))

  2970     {

  2971 	/* Handle exceptions for footnote markers like [1] */

  2972 	if (g_utf8_get_char(*ptr)=='[')

  2973 	{

  2974 	    g_string_append_c(word,'[');

  2975 	    s=g_utf8_next_char(*ptr);

  2976 	    for (;g_unichar_isdigit(g_utf8_get_char(s));s=g_utf8_next_char(s))

  2977 		g_string_append_unichar(word,g_utf8_get_char(s));

  2978 	    if (g_utf8_get_char(s)==']')

  2979 	    {

  2980 		g_string_append_c(word,']');

  2981 		*ptr=g_utf8_next_char(s);

  2982 		return g_string_free(word,FALSE);

  2983 	    }

  2984 	    else

  2985 		g_string_truncate(word,0);

  2986 	}

  2987     }

  2988     /*

  2989      * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.

  2990      * Especially yucky is the case of L1,000

  2991      * This section looks for a pattern of characters including a digit

  2992      * followed by a comma or period followed by one or more digits.

  2993      * If found, it returns this whole pattern as a word; otherwise we discard

  2994      * the results and resume our normal programming.

  2995      */

  2996     s=*ptr;

  2997     for (;g_unichar_isdigit(g_utf8_get_char(s)) ||

  2998       g_unichar_isalpha(g_utf8_get_char(s)) ||

  2999       g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))

  3000 	g_string_append_unichar(word,g_utf8_get_char(s));

  3001     if (word->len)

  3002     {

  3003 	for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))

  3004 	{

  3005 	    c=g_utf8_get_char(t);

  3006 	    pc=g_utf8_get_char(g_utf8_prev_char(t));

  3007 	    if ((c=='.' || c==',') && g_unichar_isdigit(pc))

  3008 	    {

  3009 		*ptr=s;

  3010 		return g_string_free(word,FALSE);

  3011 	    }

  3012 	}

  3013     }

  3014     /* we didn't find a punctuated number - do the regular getword thing */

  3015     g_string_truncate(word,0);

  3016     c=g_utf8_get_char(*ptr);

  3017     for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);

  3018       *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))

  3019 	g_string_append_unichar(word,c);

  3020     return g_string_free(word,FALSE);

  3021 }

  3023 /*

  3024  * isroman:

  3025  *

  3026  * Is this word a Roman Numeral?

  3027  *

  3028  * It doesn't actually validate that the number is a valid Roman Numeral--for

  3029  * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not

  3030  * what we're here to do. If it passes this, it LOOKS like a Roman numeral.

  3031  * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or

  3032  * expressions thereof, except when it came to taxes. Allow any number of M,

  3033  * an optional D, an optional CM or CD, any number of optional Cs, an optional

  3034  * XL or an optional XC, an optional IX or IV, an optional V and any number

  3035  * of optional Is.

  3036  */

  3037 gboolean isroman(const char *t)

  3038 {

  3039     const char *s;

  3040     if (!t || !*t)

  3041 	return FALSE;

  3042     s=t;

  3043     while (g_utf8_get_char(t)=='m' && *t)

  3044 	t++;

  3045     if (g_utf8_get_char(t)=='d')

  3046 	t++;

  3047     if (g_str_has_prefix(t,"cm"))

  3048 	t+=2;

  3049     if (g_str_has_prefix(t,"cd"))

  3050 	t+=2;

  3051     while (g_utf8_get_char(t)=='c' && *t)

  3052 	t++;

  3053     if (g_str_has_prefix(t,"xl"))

  3054 	t+=2;

  3055     if (g_str_has_prefix(t,"xc"))

  3056 	t+=2;

  3057     if (g_utf8_get_char(t)=='l')

  3058 	t++;

  3059     while (g_utf8_get_char(t)=='x' && *t)

  3060 	t++;

  3061     if (g_str_has_prefix(t,"ix"))

  3062 	t+=2;

  3063     if (g_str_has_prefix(t,"iv"))

  3064 	t+=2;

  3065     if (g_utf8_get_char(t)=='v')

  3066 	t++;

  3067     while (g_utf8_get_char(t)=='i' && *t)

  3068 	t++;

  3069     return !*t;

  3070 }

  3072 /*

  3073  * postprocess_for_DP:

  3074  *

  3075  * Invoked with the -d switch from flgets().

  3076  * It simply "removes" from the line a hard-coded set of common

  3077  * DP-specific tags, so that the line passed to the main routine has

  3078  * been pre-cleaned of DP markup.

  3079  */

  3080 void postprocess_for_DP(char *theline)

  3081 {

  3082     char *s,*t;

  3083     int i;

  3084     if (!*theline)

  3085 	return;

  3086     for (i=0;*DPmarkup[i];i++)

  3087 	while ((s=strstr(theline,DPmarkup[i])))

  3088 	{

  3089 	    t=s+strlen(DPmarkup[i]);

  3090 	    memmove(s,t,strlen(t)+1);

  3091 	}

  3092 }

  3094 /*

  3095  * postprocess_for_HTML:

  3096  *

  3097  * Invoked with the -m switch from flgets().

  3098  * It simply "removes" from the line a hard-coded set of common

  3099  * HTML tags and "replaces" a hard-coded set of common HTML

  3100  * entities, so that the line passed to the main routine has

  3101  * been pre-cleaned of HTML.

  3102  */

  3103 void postprocess_for_HTML(char *theline)

  3104 {

  3105     while (losemarkup(theline))

  3106 	;

  3107     loseentities(theline);

  3108 }

  3110 char *losemarkup(char *theline)

  3111 {

  3112     char *s,*t;

  3113     int i;

  3114     s=strchr(theline,'<');

  3115     t=s?strchr(s,'>'):NULL;

  3116     if (!s || !t)

  3117 	return NULL;

  3118     for (i=0;*markup[i];i++)

  3119 	if (tagcomp(g_utf8_next_char(s),markup[i]))

  3120 	{

  3121 	    t=g_utf8_next_char(t);

  3122 	    memmove(s,t,strlen(t)+1);

  3123 	    return s;

  3124 	}

  3125     /* It's an unrecognized <xxx>. */

  3126     return NULL;

  3127 }

  3129 void loseentities(char *theline)

  3130 {

  3131     int i;

  3132     gsize nb;

  3133     char *amp,*scolon;

  3134     gchar *s,*t;

  3135     gunichar c;

  3136     GTree *entities=NULL;

  3137     static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;

  3138     if (!theline)

  3139     {

  3140 	if (entities)

  3141 	    g_tree_destroy(entities);

  3142 	entities=NULL;

  3143 	if (translit!=(GIConv)-1)

  3144 	    g_iconv_close(translit);

  3145 	translit=(GIConv)-1;

  3146 	if (to_utf8!=(GIConv)-1)

  3147 	    g_iconv_close(to_utf8);

  3148 	to_utf8=(GIConv)-1;

  3149 	return;

  3150     }

  3151     if (!*theline)

  3152 	return;

  3153     if (!entities)

  3154     {

  3155 	entities=g_tree_new((GCompareFunc)strcmp);

  3156 	for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)

  3157 	    g_tree_insert(entities,HTMLentities[i].name,

  3158 	      GUINT_TO_POINTER(HTMLentities[i].c));

  3159     }

  3160     if (translit==(GIConv)-1)

  3161 	translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");

  3162     if (to_utf8==(GIConv)-1)

  3163 	to_utf8=g_iconv_open("UTF-8","ISO_8859-1");

  3164     while((amp=strchr(theline,'&')))

  3165     {

  3166 	scolon=strchr(amp,';');

  3167 	if (scolon)

  3168 	{

  3169 	    if (amp[1]=='#')

  3170 	    {

  3171 		if (amp+2+strspn(amp+2,"0123456789")==scolon)

  3172 		    c=strtol(amp+2,NULL,10);

  3173 		else if (amp[2]=='x' &&

  3174 		  amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)

  3175 		    c=strtol(amp+3,NULL,16);

  3176 	    }

  3177 	    else

  3178 	    {

  3179 		s=g_strndup(amp+1,scolon-(amp+1));

  3180 	        c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));

  3181 		g_free(s);

  3182 	    }

  3183 	}

  3184 	else

  3185 	    c=0;

  3186 	if (c)

  3187 	{

  3188 	    theline=amp;

  3189 	    if (c<128 || c>=192 && c<=255)	/* An ISO-8859-1 character */

  3190 		theline+=g_unichar_to_utf8(c,theline);

  3191 	    else

  3192 	    {

  3193 		s=g_malloc(6);

  3194 		nb=g_unichar_to_utf8(c,s);

  3195 		t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);

  3196 		g_free(s);

  3197 		s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);

  3198 		g_free(t);

  3199 		memcpy(theline,s,nb);

  3200 		g_free(s);

  3201 		theline+=nb;

  3202 	    }

  3203 	    memmove(theline,g_utf8_next_char(scolon),

  3204 	      strlen(g_utf8_next_char(scolon))+1);

  3205 	}

  3206 	else

  3207 	    theline=g_utf8_next_char(amp);

  3208     }

  3209 }

  3211 gboolean tagcomp(const char *strin,const char *basetag)

  3212 {

  3213     gboolean retval;

  3214     gchar *s,*t;

  3215     if (g_utf8_get_char(strin)=='/')

  3216 	t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */

  3217     else

  3218 	t=g_utf8_casefold(strin,-1);

  3219     s=g_utf8_casefold(basetag,-1);

  3220     retval=g_str_has_prefix(t,s);

  3221     g_free(s);

  3222     g_free(t);

  3223     return retval;

  3224 }

  3226 void proghelp(GOptionContext *context)

  3227 {

  3228     gchar *help;

  3229     fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);

  3230     fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);

  3231     fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);

  3232     fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "

  3233       "For details, read the file COPYING.\n",stderr);

  3234     fputs("This is Free Software; "

  3235       "you may redistribute it under certain conditions (GPL);\n",stderr);

  3236     fputs("read the file COPYING for details.\n\n",stderr);

  3237     help=g_option_context_get_help(context,TRUE,NULL);

  3238     fputs(help,stderr);

  3239     g_free(help);

  3240     fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);

  3241     fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "

  3242       "non-ASCII\n",stderr);

  3243     fputs("characters like accented letters, "

  3244       "lines longer than 75 or shorter than 55,\n",stderr);

  3245     fputs("unbalanced quotes or brackets, "

  3246       "a variety of badly formatted punctuation, \n",stderr);

  3247     fputs("HTML tags, some likely typos. "

  3248       "It is NOT a substitute for human judgement.\n",stderr);

  3249     fputs("\n",stderr);

  3250 }

author	ali <ali@juiblex.co.uk>
	Sun Oct 20 21:06:25 2013 +0100 (2013-10-20)
changeset 184	cd3068704d3a
parent 174	ad92d11d59b8
child 191	189183b37598
permissions	-rw-r--r--