bookloupe: bookloupe/bookloupe.c@0df25c7f4ed7

     1 /*************************************************************************/

     2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */

     3 /*									 */

     4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>			 */

     5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>			 */

     6 /*									 */

     7 /* This program is free software; you can redistribute it and/or modify  */

     8 /* it under the terms of the GNU General Public License as published by  */

     9 /* the Free Software Foundation; either version 2 of the License, or     */

    10 /* (at your option) any later version.					 */

    11 /*									 */

    12 /* This program is distributed in the hope that it will be useful,       */

    13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of	 */

    14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the		 */

    15 /* GNU General Public License for more details.				 */

    16 /*									 */

    17 /* You should have received a copy of the GNU General Public License	 */

    18 /* along with this program. If not, see <http://www.gnu.org/licenses/>.	 */

    19 /*************************************************************************/

    21 #include <stdio.h>

    22 #include <stdlib.h>

    23 #include <string.h>

    24 #include <ctype.h>

    25 #ifdef __WIN32__

    26 #include <windows.h>

    27 #endif

    28 #include <glib.h>

    29 #include <bl/bl.h>

    30 #include "HTMLentities.h"

    /*
     * NOTE(review): not referenced in the visible part of this file;
     * presumably holds the previously processed line -- confirm against the
     * code that assigns it.
     */
    32 gchar *prevline;

    34 /* Common typos. */

    35 char *typo[] = {

    36     "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",

    37     "nad", "te", "ig", "acn",  "ahve", "alot", "anbd", "andt", "awya", "aywa",

    38     "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",

    39     "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",

    40     "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",

    41     "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",

    42     "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",

    43     "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",

    44     "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",

    45     "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",

    46     "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",

    47     "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",

    48     "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",

    49     "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",

    50     "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",

    51     "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",

    52     "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",

    53     "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",

    54     "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",

    55     "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",

    56     "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",

    57     "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",

    58     "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",

    59     "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",

    60     "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",

    61     "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",

    62     "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",

    63     "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",

    64     "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",

    65     "se", ""

    66 };

    /*
     * User-supplied typo list. read_user_scannos() creates it as a tree keyed
     * by the lines read from the .typ file; it stays NULL when that function
     * is not called or finds no file. main() releases it before returning.
     */
    68 GTree *usertypo;

    /*
     * Words that must not be queried as typos: common abbreviations plus a
     * few ordinary words. The list ends with an empty string.
     */
    char *okword[] = {
        "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr",
        "hmm", "h'm", "hmmm", "rd", "sh", "br", "pp", "hm",
        "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
        "pompeii", "hawaii", "hawaiian",
        "hotbed", "heartbeat", "heartbeats",
        "outbid", "outbids", "frostbite", "frostbitten",
        ""      /* end of list */
    };

    /*
     * Abbreviations that legitimately take a trailing period and would
     * otherwise show up as unexplained periods. The list ends with an empty
     * string.
     */
    char *abbrev[] = {
        "cent", "cents", "viz", "vol", "vols", "vid",
        "ed", "al", "etc", "op", "cit", "deg",
        "min", "chap", "oz", "mme", "mlle", "mssrs",
        ""      /* end of list */
    };

    /*
     * Two-letter combinations that rarely if ever begin a word, but which are
     * common scannos or otherwise common letter combinations. The list ends
     * with an empty string.
     */
    char *nostart[] = {
        "hr", "hl", "cb", "sb", "tb", "wb",
        "tl", "tn", "rn", "lt", "tj",
        ""      /* end of list */
    };

    /*
     * Two-letter combinations that rarely if ever finish a word, but which
     * are common scannos or otherwise common letter combinations. The list
     * ends with an empty string.
     */
    char *noend[] = {
        "cb", "gb", "pb", "sb", "tb", "wh",
        "fr", "br", "qu", "tw", "gl", "fl",
        "sw", "gr", "sl", "cl", "iy",
        ""      /* end of list */
    };

   /* HTML tag names, lower case. The list ends with an empty string. */
   char *markup[] = {
       "a", "b", "big", "blockquote", "body", "br", "center", "col",
       "div", "em", "font", "h1", "h2", "h3", "h4", "h5", "h6",
       "head", "hr", "html", "i", "img", "li", "meta", "ol", "p",
       "pre", "small", "span", "strong", "sub", "sup", "table", "td",
       "tfoot", "thead", "title", "tr", "tt", "u", "ul",
       ""      /* end of list */
   };

   /*
    * Markup specific to DP (see the --dp option). The list ends with an
    * empty string.
    */
   char *DPmarkup[] = {
       "<sc>", "</sc>",
       "/*", "*/",
       "/#", "#/",
       "/$", "$/",
       "<tb>",
       ""      /* end of list */
   };

   /*
    * NOTE(review): presumably words that should not be directly followed by
    * a comma -- the code using this list is outside the visible part of the
    * file. "mrs" appears twice; the duplicate is kept so that the table is
    * unchanged. The list ends with an empty string.
    */
   char *nocomma[] = {
       "the", "it's", "their", "an", "mrs", "a", "our", "that's",
       "its", "whose", "every", "i'll", "your", "my", "mr", "mrs",
       "mss", "mssrs", "ft", "pm", "st", "dr", "rd", "pp",
       "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",
       "during", "let", "toward", "among",
       ""      /* end of list */
   };

   /*
    * NOTE(review): presumably words that should not be directly followed by
    * a period -- the code using this list is outside the visible part of the
    * file. The list ends with an empty string.
    */
   char *noperiod[] = {
       "every", "i'm", "during", "that's", "their", "your", "our", "my",
       "or", "and", "but", "as", "if", "the", "its", "it's",
       "until", "than", "whether", "i'll", "whose", "who", "because",
       "when", "let", "till", "very", "an", "among", "those", "into",
       "whom", "having", "thence",
       ""      /* end of list */
   };

   /*
    * Special characters.
    *
    * All are written as character literals; the values are the ASCII codes
    * (given in the trailing comments where the original used numbers).
    */
   #define CHAR_SPACE         ' '     /* 32 */
   #define CHAR_TAB           '\t'    /* 9 */
   #define CHAR_LF            '\n'    /* 10 */
   #define CHAR_CR            '\r'    /* 13 */
   #define CHAR_DQUOTE        '"'     /* 34 */
   #define CHAR_SQUOTE        '\''    /* 39 */
   #define CHAR_OPEN_SQUOTE   '`'     /* 96 */
   #define CHAR_TILDE         '~'     /* 126 */
   #define CHAR_ASTERISK      '*'     /* 42 */
   #define CHAR_FORESLASH     '/'     /* 47 */
   #define CHAR_CARAT         '^'     /* 94 */

   #define CHAR_UNDERSCORE    '_'
   #define CHAR_OPEN_CBRACK   '{'
   #define CHAR_CLOSE_CBRACK  '}'
   #define CHAR_OPEN_RBRACK   '('
   #define CHAR_CLOSE_RBRACK  ')'
   #define CHAR_OPEN_SBRACK   '['
   #define CHAR_CLOSE_SBRACK  ']'

   147 /* longest and shortest normal PG line lengths */

   /* first_pass() counts a line as long when it has more characters than this */
   148 #define LONGEST_PG_LINE   75

   /* first_pass() counts a line as very long when it has more characters than this */
   149 #define WAY_TOO_LONG      80

   /* threshold used by the short-line heuristic in first_pass() */
   150 #define SHORTEST_PG_LINE  55

   /*
    * Indices into pswit[]. Each one is bound to a command-line option in
    * options[]; SWITNO is the number of switches.
    */
   enum {
       ECHO_SWITCH = 0,    /* -e, --noecho (inverted by parse_options) */
       SQUOTE_SWITCH,      /* -s, --squote */
       TYPO_SWITCH,        /* -t, --typo */
       QPARA_SWITCH,       /* -p, --qpara */
       PARANOID_SWITCH,    /* -x, --relaxed (inverted by parse_options) */
       LINE_END_SWITCH,    /* -l, --line-end (inverted by parse_options) */
       OVERVIEW_SWITCH,    /* -o, --overview */
       STDOUT_SWITCH,      /* -y, --stdout */
       HEADER_SWITCH,      /* -h, --header */
       WEB_SWITCH,         /* -w, --web */
       VERBOSE_SWITCH,     /* -v, --verbose */
       MARKUP_SWITCH,      /* -m, --markup */
       USERTYPO_SWITCH,    /* -u, --usertypo */
       DP_SWITCH,          /* -d, --dp */
       SWITNO              /* count of switches; must stay last */
   };

   /*
    * One flag per *_SWITCH enumerator. The option parser stores into these
    * through the pointers in options[]; parse_options() then inverts or
    * overrides some of them.
    */
   170 gboolean pswit[SWITNO];  /* program switches */

   172 static GOptionEntry options[]={

   173     { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,

   174       "Ignore DP-specific markup", NULL },

   175     { "noecho", 'e', 0, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,

   176       "Don't echo queried line", NULL },

   177     { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,

   178       "Check single quotes", NULL },

   179     { "typo", 't', 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,

   180       "Check common typos", NULL },

   181     { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,

   182       "Require closure of quotes on every paragraph", NULL },

   183     { "relaxed", 'x', 0, G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,

   184       "Disable paranoid querying of everything", NULL },

   185     { "line-end", 'l', 0, G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,

   186       "Disable line end checking", NULL },

   187     { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,

   188       "Overview: just show counts", NULL },

   189     { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,

   190       "Output errors to stdout instead of stderr", NULL },

   191     { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,

   192       "Echo header fields", NULL },

   193     { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,

   194       "Ignore markup in < >", NULL },

   195     { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,

   196       "Use file of user-defined typos", NULL },

   197     { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,

   198       "Defaults for use on www upload", NULL },

   199     { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,

   200       "Verbose - list everything", NULL },

   201     { NULL }

   202 };

   /*
    * Query tallies, one per category. main() prints the non-zero ones and
    * their sum in overview mode.
    */
   long cnt_dquot;         /* doublequote queries */
   long cnt_squot;         /* singlequote queries */
   long cnt_brack;         /* brackets queries */
   long cnt_bin;           /* non-ASCII queries */
   long cnt_odd;           /* odd character queries */
   long cnt_long;          /* long line errors */
   long cnt_short;         /* short line queries */
   long cnt_punct;         /* punctuation and spacing queries */
   long cnt_dash;          /* dash-related queries */
   long cnt_word;          /* word queries */
   long cnt_html;          /* html queries */
   long cnt_lineend;       /* line-end queries */

   /* Line statistics. */
   long cnt_spacend;       /* count of lines with space at end */
   long linecnt;           /* count of total lines in the file */
   long checked_linecnt;   /* count of lines actually checked */

   /*
    * Declarations of functions whose definitions are not in this part of the
    * file. parse_options() calls proghelp(), main() calls procfile(),
    * first_pass() calls getaword() and frees the string it returns, and
    * read_etext() installs print_as_utf_8() or print_as_windows_1252() as the
    * GLib print handler.
    */
   221 void proghelp(GOptionContext *context);

   222 void procfile(const char *);

   /*
    * Directory part of argv[0], set in main(). read_user_scannos() uses it as
    * a second place to look for the .typ files.
    */
   224 gchar *running_from;

   226 gboolean mixdigit(const char *);

   227 gchar *getaword(const char **);

   228 char *flgets(char **,long);

   229 void postprocess_for_HTML(char *);

   230 char *linehasmarkup(char *);

   231 char *losemarkup(char *);

   232 gboolean tagcomp(const char *,const char *);

   233 void loseentities(char *);

   234 gboolean isroman(const char *);

   235 void postprocess_for_DP(char *);

   236 void print_as_windows_1252(const char *string);

   237 void print_as_utf_8(const char *string);

   /*
    * NOTE(review): not used in the visible part of this file; presumably
    * these remember words and periods that have already been queried --
    * confirm against the code that fills them.
    */
   239 GTree *qword,*qperiod;

   /*
    * Windows only: console output code page read at the start of main() and
    * restored by cleanup_on_exit().
    */
   241 #ifdef __WIN32__

   242 UINT saved_cp;

   243 #endif

   /*
    * Statistics gathered by first_pass(). Except for firstline and
    * footerline, every field is a count; lines from the footer onwards are
    * not counted.
    */
   struct first_pass_results {
       long firstline;             /* line number after the PG header line */
       long astline;               /* lines with '*' and a lower-case letter */
       long footerline;            /* line number of the PG footer line */
       long totlen;                /* characters */
       long binlen;                /* characters above 127 */
       long alphalen;              /* alphabetic characters */
       long endquote_count;        /* double quotes directly after a letter */
       long shortline;             /* short lines */
       long dotcomma;              /* lines containing ".," */
       long fslashline;            /* lines containing '/' */
       long hyphens;               /* lines ending in a single hyphen */
       long longline;              /* lines over LONGEST_PG_LINE */
       long verylongline;          /* lines over WAY_TOO_LONG */
       long htmcount;              /* score for lines that look like HTML */
       long standalone_digit;      /* words that are just "0" or "1" */
       long spacedash;             /* lines with " -" not followed by '-' */
       long emdash;                /* lines with "--" after the first char */
       long space_emdash;          /* ... with a space on at least one side */
       long non_PG_space_emdash;   /* ... with a space on both sides */
       long PG_space_emdash;       /* ... with no space on either side */
       int Dutchcount;             /* occurrences of "hij" or "niet" */
       int Frenchcount;            /* occurrences of "dans" or "avec" */
   };

   253 struct warnings {

   254     int shortline,longline,bin,dash,dotcomma,ast,fslash,digit,hyphen;

   255     int endquote;

   256     gboolean isDutch,isFrench;

   257 };

   /*
    * NOTE(review): used by code outside the visible part of this file;
    * presumably running counts of quotes, underscores and curly, square and
    * round brackets -- confirm against the users.
    */
   struct counters {
       long quot;
       int c_unders;
       int c_brack;
       int s_brack;
       int r_brack;
       int open_single_quote;
       int close_single_quote;
   };

   265 struct line_properties {

   266     unsigned int len,blen;

   267     gunichar start;

   268 };

   /*
    * NOTE(review): used by code outside the visible part of this file;
    * presumably tracks the parity of double and single quotes -- confirm
    * against the users.
    */
   struct parities {
       int dquote;
       int squote;
   };

   /*
    * NOTE(review): used by code outside the visible part of this file;
    * presumably queries held back until later text confirms them -- confirm
    * against the users.
    */
   struct pending {
       char *dquote;
       char *squote;
       char *rbrack;
       char *sbrack;
       char *cbrack;
       char *unders;
       long squot;
   };

   279 void parse_options(int *argc,char ***argv)

   280 {

   281     GError *err=NULL;

   282     GOptionContext *context;

   283     context=g_option_context_new(

   284       "file - looks for errors in Project Gutenberg(TM) etexts");

   285     g_option_context_add_main_entries(context,options,NULL);

   286     if (!g_option_context_parse(context,argc,argv,&err))

   287     {

   288 	g_printerr("Bookloupe: %s\n",err->message);

   289 	g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);

   290 	exit(1);

   291     }

   292     /* Paranoid checking is turned OFF, not on, by its switch */

   293     pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];

   294     if (pswit[PARANOID_SWITCH])

   295 	/* if running in paranoid mode, typo checks default to enabled */

   296 	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];

   297     /* Line-end checking is turned OFF, not on, by its switch */

   298     pswit[LINE_END_SWITCH]=!pswit[LINE_END_SWITCH];

   299     /* Echoing is turned OFF, not on, by its switch */

   300     pswit[ECHO_SWITCH]=!pswit[ECHO_SWITCH];

   301     if (pswit[OVERVIEW_SWITCH])

   302 	/* just print summary; don't echo */

   303 	pswit[ECHO_SWITCH]=FALSE;

   304     /*

   305      * Web uploads - for the moment, this is really just a placeholder

   306      * until we decide what processing we really want to do on web uploads

   307      */

   308     if (pswit[WEB_SWITCH])

   309     {

   310 	/* specific override for web uploads */

   311 	pswit[ECHO_SWITCH]=TRUE;

   312 	pswit[SQUOTE_SWITCH]=FALSE;

   313 	pswit[TYPO_SWITCH]=TRUE;

   314 	pswit[QPARA_SWITCH]=FALSE;

   315 	pswit[PARANOID_SWITCH]=TRUE;

   316 	pswit[LINE_END_SWITCH]=FALSE;

   317 	pswit[OVERVIEW_SWITCH]=FALSE;

   318 	pswit[STDOUT_SWITCH]=FALSE;

   319 	pswit[HEADER_SWITCH]=TRUE;

   320 	pswit[VERBOSE_SWITCH]=FALSE;

   321 	pswit[MARKUP_SWITCH]=FALSE;

   322 	pswit[USERTYPO_SWITCH]=FALSE;

   323 	pswit[DP_SWITCH]=FALSE;

   324     }

   325     if (*argc<2)

   326     {

   327 	proghelp(context);

   328 	exit(1);

   329     }

   330     g_option_context_free(context);

   331 }

   333 /*

   334  * read_user_scannos:

   335  *

   336  * Read in the user-defined stealth scanno list.

   337  */

   338 void read_user_scannos(void)

   339 {

   340     GError *err=NULL;

   341     gchar *usertypo_file;

   342     gboolean okay;

   343     int i;

   344     gsize len,nb;

   345     gchar *contents,*utf8,**lines;

   346     usertypo_file=g_strdup("bookloupe.typ");

   347     okay=file_get_contents_text(usertypo_file,&contents,&len,&err);

   348     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))

   349     {

   350 	g_clear_error(&err);

   351 	g_free(usertypo_file);

   352 	usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);

   353 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);

   354     }

   355     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))

   356     {

   357 	g_clear_error(&err);

   358 	g_free(usertypo_file);

   359 	usertypo_file=g_strdup("gutcheck.typ");

   360 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);

   361     }

   362     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))

   363     {

   364 	g_clear_error(&err);

   365 	g_free(usertypo_file);

   366 	usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);

   367 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);

   368     }

   369     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))

   370     {

   371 	g_free(usertypo_file);

   372 	g_print("   --> I couldn't find bookloupe.typ "

   373 	  "-- proceeding without user typos.\n");

   374 	return;

   375     }

   376     else if (!okay)

   377     {

   378 	fprintf(stderr,"%s: %s\n",usertypo_file,err->message);

   379 	g_free(usertypo_file);

   380 	g_clear_error(&err);

   381 	exit(1);

   382     }

   383     if (g_utf8_validate(contents,len,NULL))

   384 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);

   385     else

   386 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);

   387     g_free(contents);

   388     lines=g_strsplit_set(utf8,"\r\n",0);

   389     g_free(utf8);

   390     usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);

   391     for (i=0;lines[i];i++)

   392 	if (*(unsigned char *)lines[i]>'!')

   393 	    g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));

   394 	else

   395 	    g_free(lines[i]);

   396     g_free(lines);

   397 }

   399 /*

   400  * read_etext:

   401  *

   402  * Read an etext returning a newly allocated string containing the file

   403  * contents or NULL on error.

   404  */

   405 gchar *read_etext(const char *filename,GError **err)

   406 {

   407     GError *tmp_err=NULL;

   408     gchar *contents,*utf8;

   409     gsize len,bytes_read,bytes_written;

   410     int i,line,col;

   411     if (!g_file_get_contents(filename,&contents,&len,err))

   412 	return NULL;

   413     if (g_utf8_validate(contents,len,NULL))

   414     {

   415 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);

   416 	g_set_print_handler(print_as_utf_8);

   417 #ifdef __WIN32__

   418 	SetConsoleOutputCP(CP_UTF8);

   419 #endif

   420     }

   421     else

   422     {

   423 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,

   424 	  &bytes_written,&tmp_err);

   425 	if (g_error_matches(tmp_err,G_CONVERT_ERROR,

   426 	  G_CONVERT_ERROR_ILLEGAL_SEQUENCE))

   427 	{

   428 	    line=col=1;

   429 	    for(i=0;i<bytes_read;i++)

   430 		if (contents[i]=='\n')

   431 		{

   432 		    line++;

   433 		    col=1;

   434 		}

   435 		else if (contents[i]!='\r')

   436 		    col++;

   437 	    g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,

   438 	      "Input conversion failed. Byte %d at line %d, column %d is not a "

   439 	      "valid Windows-1252 character",

   440 	      ((unsigned char *)contents)[bytes_read],line,col);

   441 	}

   442 	else if (tmp_err)

   443 	    g_propagate_error(err,tmp_err);

   444 	g_set_print_handler(print_as_windows_1252);

   445 #ifdef __WIN32__

   446 	SetConsoleOutputCP(1252);

   447 #endif

   448     }

   449     g_free(contents);

   450     return utf8;

   451 }

   /*
    * cleanup_on_exit:
    *
    * On Windows, restore the console output code page saved in saved_cp;
    * main() registers this function with atexit(). Does nothing on other
    * platforms.
    */
   453 void cleanup_on_exit(void)

   454 {

   455 #ifdef __WIN32__

   456     SetConsoleOutputCP(saved_cp);

   457 #endif

   458 }

   460 int main(int argc,char **argv)

   461 {

   462 #ifdef __WIN32__

   463     atexit(cleanup_on_exit);

   464     saved_cp=GetConsoleOutputCP();

   465 #endif

   466     running_from=g_path_get_dirname(argv[0]);

   467     parse_options(&argc,&argv);

   468     if (pswit[USERTYPO_SWITCH])

   469 	read_user_scannos();

   470     fprintf(stderr,"bookloupe: Check and report on an e-text\n");

   471     procfile(argv[1]);

   472     if (pswit[OVERVIEW_SWITCH])

   473     {

   474 	g_print("    Checked %ld lines of %ld (head+foot = %ld)\n\n",

   475 	  checked_linecnt,linecnt,linecnt-checked_linecnt);

   476 	g_print("    --------------- Queries found --------------\n");

   477 	if (cnt_long)

   478 	    g_print("    Long lines:		    %14ld\n",cnt_long);

   479 	if (cnt_short)

   480 	    g_print("    Short lines:		   %14ld\n",cnt_short);

   481 	if (cnt_lineend)

   482 	    g_print("    Line-end problems:	     %14ld\n",cnt_lineend);

   483 	if (cnt_word)

   484 	    g_print("    Common typos:		  %14ld\n",cnt_word);

   485 	if (cnt_dquot)

   486 	    g_print("    Unmatched quotes:	      %14ld\n",cnt_dquot);

   487 	if (cnt_squot)

   488 	    g_print("    Unmatched SingleQuotes:	%14ld\n",cnt_squot);

   489 	if (cnt_brack)

   490 	    g_print("    Unmatched brackets:	    %14ld\n",cnt_brack);

   491 	if (cnt_bin)

   492 	    g_print("    Non-ASCII characters:	  %14ld\n",cnt_bin);

   493 	if (cnt_odd)

   494 	    g_print("    Proofing characters:	   %14ld\n",cnt_odd);

   495 	if (cnt_punct)

   496 	    g_print("    Punctuation & spacing queries: %14ld\n",cnt_punct);

   497 	if (cnt_dash)

   498 	    g_print("    Non-standard dashes:	   %14ld\n",cnt_dash);

   499 	if (cnt_html)

   500 	    g_print("    Possible HTML tags:	    %14ld\n",cnt_html);

   501 	g_print("\n");

   502 	g_print("    TOTAL QUERIES		  %14ld\n",

   503 	  cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+

   504 	  cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend);

   505     }

   506     g_free(running_from);

   507     if (usertypo)

   508 	g_tree_unref(usertypo);

   509     return 0;

   510 }

   512 /*

   513  * first_pass:

   514  *

   515  * Run a first pass - verify that it's a valid PG

   516  * file, decide whether to report some things that

   517  * occur many times in the text like long or short

   518  * lines, non-standard dashes, etc.

   519  */

   520 struct first_pass_results *first_pass(const char *etext)

   521 {

   522     gunichar laststart=CHAR_SPACE;

   523     const char *s;

   524     gchar *lc_line;

   525     int i,j,lbytes,llen;

   526     gchar **lines;

   527     unsigned int lastlen=0,lastblen=0;

   528     long spline=0,nspline=0;

   529     static struct first_pass_results results={0};

   530     gchar *inword;

   531     lines=g_strsplit(etext,"\n",0);

   532     for (j=0;lines[j];j++)

   533     {

   534 	lbytes=strlen(lines[j]);

   535 	while (lbytes>0 && lines[j][lbytes-1]=='\r')

   536 	    lines[j][--lbytes]='\0';

   537 	llen=g_utf8_strlen(lines[j],lbytes);

   538 	linecnt++;

   539 	if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&

   540 	  (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))

   541 	{

   542 	    if (spline)

   543 		g_print("   --> Duplicate header?\n");

   544 	    spline=linecnt+1;   /* first line of non-header text, that is */

   545 	}

   546 	if (!strncmp(lines[j],"*** START",9) &&

   547 	  strstr(lines[j],"PROJECT GUTENBERG"))

   548 	{

   549 	    if (nspline)

   550 		g_print("   --> Duplicate header?\n");

   551 	    nspline=linecnt+1;   /* first line of non-header text, that is */

   552 	}

   553 	if (spline || nspline)

   554 	{

   555 	    lc_line=g_utf8_strdown(lines[j],lbytes);

   556 	    if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))

   557 	    {

   558 		if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))

   559 		{

   560 		    if (results.footerline)

   561 		    {

   562 			/* it's an old-form header - we can detect duplicates */

   563 			if (!nspline)

   564 			    g_print("   --> Duplicate footer?\n");

   565 		    }

   566 		    else

   567 			results.footerline=linecnt;

   568 		}

   569 	    }

   570 	    g_free(lc_line);

   571 	}

   572 	if (spline)

   573 	    results.firstline=spline;

   574 	if (nspline)

   575 	    results.firstline=nspline;  /* override with new */

   576 	if (results.footerline)

   577 	    continue;    /* don't count the boilerplate in the footer */

   578 	results.totlen+=llen;

   579 	for (s=lines[j];*s;s=g_utf8_next_char(s))

   580 	{

   581 	    if (g_utf8_get_char(s)>127)

   582 		results.binlen++;

   583 	    if (g_unichar_isalpha(g_utf8_get_char(s)))

   584 		results.alphalen++;

   585 	    if (s>lines[j] && g_utf8_get_char(s)==CHAR_DQUOTE &&

   586 	      isalpha(g_utf8_get_char(g_utf8_prev_char(s))))

   587 		results.endquote_count++;

   588 	}

   589 	if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&

   590 	  lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)

   591 	    results.shortline++;

   592 	if (lbytes>0 &&

   593 	  g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)

   594 	    cnt_spacend++;

   595 	if (strstr(lines[j],".,"))

   596 	    results.dotcomma++;

   597 	/* only count ast lines for ignoring purposes where there is */

   598 	/* locase text on the line */

   599 	if (strchr(lines[j],'*'))

   600 	{

   601 	    for (s=lines[j];*s;s=g_utf8_next_char(s))

   602 		if (g_unichar_islower(g_utf8_get_char(s)))

   603 		    break;

   604 	    if (*s)

   605 		results.astline++;

   606 	}

   607 	if (strchr(lines[j],'/'))

   608 	    results.fslashline++;

   609 	if (lbytes>0)

   610 	{

   611 	    for (s=g_utf8_prev_char(lines[j]+lbytes);

   612 	      s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;

   613 	      s=g_utf8_prev_char(s))

   614 		;

   615 	    if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&

   616 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')

   617 		results.hyphens++;

   618 	}

   619 	if (llen>LONGEST_PG_LINE)

   620 	    results.longline++;

   621 	if (llen>WAY_TOO_LONG)

   622 	    results.verylongline++;

   623 	if (strchr(lines[j],'<') && strchr(lines[j],'>'))

   624 	{

   625 	    i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);

   626 	    if (i>0)

   627 		results.htmcount++;

   628 	    if (strstr(lines[j],"<i>"))

   629 		results.htmcount+=4; /* bonus marks! */

   630 	}

   631 	/* Check for spaced em-dashes */

   632 	if (lines[j][0] && (s=strstr(g_utf8_next_char(lines[j]),"--")))

   633 	{

   634 	    results.emdash++;

   635 	    if (s[-1]==CHAR_SPACE || s[2]==CHAR_SPACE)

   636 		results.space_emdash++;

   637 	    if (s[-1]==CHAR_SPACE && s[2]==CHAR_SPACE)

   638 		/* count of em-dashes with spaces both sides */

   639 		results.non_PG_space_emdash++;

   640 	    if (s[-1]!=CHAR_SPACE && s[2]!=CHAR_SPACE)

   641 		/* count of PG-type em-dashes with no spaces */

   642 		results.PG_space_emdash++;

   643 	}

   644 	for (s=lines[j];*s;)

   645 	{

   646 	    inword=getaword(&s);

   647 	    if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))

   648 		results.Dutchcount++;

   649 	    if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))

   650 		results.Frenchcount++;

   651 	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))

   652 		results.standalone_digit++;

   653 	    g_free(inword);

   654 	}

   655 	/* Check for spaced dashes */

   656 	if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')

   657 	    results.spacedash++;

   658 	lastblen=lastlen;

   659 	lastlen=llen;

   660 	laststart=lines[j][0];

   661     }

   662     g_strfreev(lines);

   663     return &results;

   664 }

   666 /*

   667  * report_first_pass:

   668  *

   669  * Make some snap decisions based on the first pass results.

   670  */

   671 struct warnings *report_first_pass(struct first_pass_results *results)

   672 {

   673     static struct warnings warnings={0};

   674     if (cnt_spacend>0)

   675 	g_print("   --> %ld lines in this file have white space at end\n",

   676 	  cnt_spacend);

   677     warnings.dotcomma=1;

   678     if (results->dotcomma>5)

   679     {

   680 	warnings.dotcomma=0;

   681 	g_print("   --> %ld lines in this file contain '.,'. "

   682 	  "Not reporting them.\n",results->dotcomma);

   683     }

   684     /*

   685      * If more than 50 lines, or one-tenth, are short,

   686      * don't bother reporting them.

   687      */

   688     warnings.shortline=1;

   689     if (results->shortline>50 || results->shortline*10>linecnt)

   690     {

   691 	warnings.shortline=0;

   692 	g_print("   --> %ld lines in this file are short. "

   693 	  "Not reporting short lines.\n",results->shortline);

   694     }

   695     /*

   696      * If more than 50 lines, or one-tenth, are long,

   697      * don't bother reporting them.

   698      */

   699     warnings.longline=1;

   700     if (results->longline>50 || results->longline*10>linecnt)

   701     {

   702 	warnings.longline=0;

   703 	g_print("   --> %ld lines in this file are long. "

   704 	  "Not reporting long lines.\n",results->longline);

   705     }

   706     /* If more than 10 lines contain asterisks, don't bother reporting them. */

   707     warnings.ast=1;

   708     if (results->astline>10)

   709     {

   710 	warnings.ast=0;

   711 	g_print("   --> %ld lines in this file contain asterisks. "

   712 	  "Not reporting them.\n",results->astline);

   713     }

   714     /*

   715      * If more than 10 lines contain forward slashes,

   716      * don't bother reporting them.

   717      */

   718     warnings.fslash=1;

   719     if (results->fslashline>10)

   720     {

   721 	warnings.fslash=0;

   722 	g_print("   --> %ld lines in this file contain forward slashes. "

   723 	  "Not reporting them.\n",results->fslashline);

   724     }

   725     /*

   726      * If more than 20 lines contain unpunctuated endquotes,

   727      * don't bother reporting them.

   728      */

   729     warnings.endquote=1;

   730     if (results->endquote_count>20)

   731     {

   732 	warnings.endquote=0;

   733 	g_print("   --> %ld lines in this file contain unpunctuated endquotes. "

   734 	  "Not reporting them.\n",results->endquote_count);

   735     }

   736     /*

   737      * If more than 10 lines contain standalone digits,

   738      * don't bother reporting them.

   739      */

   740     warnings.digit=1;

   741     if (results->standalone_digit>10)

   742     {

   743 	warnings.digit=0;

   744 	g_print("   --> %ld lines in this file contain standalone 0s and 1s. "

   745 	  "Not reporting them.\n",results->standalone_digit);

   746     }

   747     /*

   748      * If more than 20 lines contain hyphens at end,

   749      * don't bother reporting them.

   750      */

   751     warnings.hyphen=1;

   752     if (results->hyphens>20)

   753     {

   754 	warnings.hyphen=0;

   755 	g_print("   --> %ld lines in this file have hyphens at end. "

   756 	  "Not reporting them.\n",results->hyphens);

   757     }

   758     if (results->htmcount>20 && !pswit[MARKUP_SWITCH])

   759     {

   760 	g_print("   --> Looks like this is HTML. Switching HTML mode ON.\n");

   761 	pswit[MARKUP_SWITCH]=1;

   762     }

   763     if (results->verylongline>0)

   764 	g_print("   --> %ld lines in this file are VERY long!\n",

   765 	  results->verylongline);

   766     /*

   767      * If there are more non-PG spaced dashes than PG em-dashes,

   768      * assume it's deliberate.

   769      * Current PG guidelines say don't use them, but older texts do,

   770      * and some people insist on them whatever the guidelines say.

   771      */

   772     warnings.dash=1;

   773     if (results->spacedash+results->non_PG_space_emdash>

   774       results->PG_space_emdash)

   775     {

   776 	warnings.dash=0;

   777 	g_print("   --> There are %ld spaced dashes and em-dashes. "

   778 	  "Not reporting them.\n",

   779 	  results->spacedash+results->non_PG_space_emdash);

   780     }

   781     /* If more than a quarter of characters are hi-bit, bug out. */

   782     warnings.bin=1;

   783     if (results->binlen*4>results->totlen)

   784     {

   785 	g_print("   --> This file does not appear to be ASCII. "

   786 	  "Terminating. Best of luck with it!\n");

   787 	exit(1);

   788     }

   789     if (results->alphalen*4<results->totlen)

   790     {

   791 	g_print("   --> This file does not appear to be text. "

   792 	  "Terminating. Best of luck with it!\n");

   793 	exit(1);

   794     }

   795     if (results->binlen*100>results->totlen || results->binlen>100)

   796     {

   797 	g_print("   --> There are a lot of foreign letters here. "

   798 	  "Not reporting them.\n");

   799 	warnings.bin=0;

   800     }

   801     warnings.isDutch=FALSE;

   802     if (results->Dutchcount>50)

   803     {

   804 	warnings.isDutch=TRUE;

   805 	g_print("   --> This looks like Dutch - "

   806 	  "switching off dashes and warnings for 's Middags case.\n");

   807     }

   808     warnings.isFrench=FALSE;

   809     if (results->Frenchcount>50)

   810     {

   811 	warnings.isFrench=TRUE;

   812 	g_print("   --> This looks like French - "

   813 	  "switching off some doublepunct.\n");

   814     }

   815     if (results->firstline && results->footerline)

   816 	g_print("    The PG header and footer appear to be already on.\n");

   817     else

   818     {

   819 	if (results->firstline)

   820 	    g_print("    The PG header is on - no footer.\n");

   821 	if (results->footerline)

   822 	    g_print("    The PG footer is on - no header.\n");

   823     }

   824     g_print("\n");

   825     if (pswit[VERBOSE_SWITCH])

   826     {

   827 	warnings.bin=1;

   828 	warnings.shortline=1;

   829 	warnings.dotcomma=1;

   830 	warnings.longline=1;

   831 	warnings.dash=1;

   832 	warnings.digit=1;

   833 	warnings.ast=1;

   834 	warnings.fslash=1;

   835 	warnings.hyphen=1;

   836 	warnings.endquote=1;

   837 	g_print("   *** Verbose output is ON -- you asked for it! ***\n");

   838     }

   839     if (warnings.isDutch)

   840 	warnings.dash=0;

   841     if (results->footerline>0 && results->firstline>0 &&

   842       results->footerline>results->firstline &&

   843       results->footerline-results->firstline<100)

   844     {

   845 	g_print("   --> I don't really know where this text starts. \n");

   846 	g_print("       There are no reference points.\n");

   847 	g_print("       I'm going to have to report the header and footer "

   848 	  "as well.\n");

   849 	results->firstline=0;

   850     }

   851     return &warnings;

   852 }

   854 /*

   855  * analyse_quotes:

   856  *

   857  * Look along the line, accumulate the count of quotes, and see

   858  * if this is an empty line - i.e. a line with nothing on it

   859  * but spaces.

   860  * If line has just spaces, period, * and/or - on it, don't

   861  * count it, since empty lines with asterisks or dashes to

   862  * separate sections are common.

   863  *

   864  * Returns: TRUE if the line is empty.

   865  */

   866 gboolean analyse_quotes(const char *aline,struct counters *counters)

   867 {

   868     int guessquote=0;

   869     /* assume the line is empty until proven otherwise */

   870     gboolean isemptyline=TRUE;

   871     const char *s=aline,*sprev,*snext;

   872     gunichar c;

   873     sprev=NULL;

   874     while (*s)

   875     {

   876 	snext=g_utf8_next_char(s);

   877 	c=g_utf8_get_char(s);

   878 	if (c==CHAR_DQUOTE)

   879 	    counters->quot++;

   880 	if (c==CHAR_SQUOTE || c==CHAR_OPEN_SQUOTE)

   881 	{

   882 	    if (s==aline)

   883 	    {

   884 		/*

   885 		 * At start of line, it can only be an openquote.

   886 		 * Hardcode a very common exception!

   887 		 */

   888 		if (!g_str_has_prefix(snext,"tis") &&

   889 		  !g_str_has_prefix(snext,"Tis"))

   890 		    counters->open_single_quote++;

   891 	    }

   892 	    else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&

   893 	      g_unichar_isalpha(g_utf8_get_char(snext)))

   894 		/* Do nothing! it's definitely an apostrophe, not a quote */

   895 		;

   896 	    /* it's outside a word - let's check it out */

   897 	    else if (c==CHAR_OPEN_SQUOTE ||

   898 	      g_unichar_isalpha(g_utf8_get_char(snext)))

   899 	    {

   900 		/* it damwell better BE an openquote */

   901 		if (!g_str_has_prefix(snext,"tis") &&

   902 		  !g_str_has_prefix(snext,"Tis"))

   903 		    /* hardcode a very common exception! */

   904 		    counters->open_single_quote++;

   905 	    }

   906 	    else

   907 	    {

   908 		/* now - is it a closequote? */

   909 		guessquote=0;   /* accumulate clues */

   910 		if (g_unichar_isalpha(g_utf8_get_char(sprev)))

   911 		{

   912 		    /* it follows a letter - could be either */

   913 		    guessquote++;

   914 		    if (g_utf8_get_char(sprev)=='s')

   915 		    {

   916 			/* looks like a plural apostrophe */

   917 			guessquote-=3;

   918 			if (g_utf8_get_char(snext)==CHAR_SPACE)

   919 			    /* bonus marks! */

   920 			    guessquote-=2;

   921 		    }

   922 		}

   923 		/* it doesn't have a letter either side */

   924 		else if (strchr(".?!,;:",g_utf8_get_char(sprev)) &&

   925 		  strchr(".?!,;: ",g_utf8_get_char(snext)))

   926 		    guessquote+=8; /* looks like a closequote */

   927 		else

   928 		    guessquote++;

   929 		if (counters->open_single_quote>counters->close_single_quote)

   930 		    /*

   931 		     * Give it the benefit of some doubt,

   932 		     * if a squote is already open.

   933 		     */

   934 		    guessquote++;

   935 		else

   936 		    guessquote--;

   937 		if (guessquote>=0)

   938 		    counters->close_single_quote++;

   939 	    }

   940 	}

   941 	if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&

   942 	  c!='\r' && c!='\n')

   943 	    isemptyline=FALSE;  /* ignore lines like  *  *  *  as spacers */

   944 	if (c==CHAR_UNDERSCORE)

   945 	    counters->c_unders++;

   946 	if (c==CHAR_OPEN_CBRACK)

   947 	    counters->c_brack++;

   948 	if (c==CHAR_CLOSE_CBRACK)

   949 	    counters->c_brack--;

   950 	if (c==CHAR_OPEN_RBRACK)

   951 	    counters->r_brack++;

   952 	if (c==CHAR_CLOSE_RBRACK)

   953 	    counters->r_brack--;

   954 	if (c==CHAR_OPEN_SBRACK)

   955 	    counters->s_brack++;

   956 	if (c==CHAR_CLOSE_SBRACK)

   957 	    counters->s_brack--;

   958 	sprev=s;

   959 	s=snext;

   960     }

   961     return isemptyline;

   962 }

   964 /*

   965  * check_for_control_characters:

   966  *

   967  * Check for invalid or questionable characters in the line

   968  * Anything above 127 is invalid for plain ASCII, and

   969  * non-printable control characters should also be flagged.

   970  * Tabs should generally not be there.

   971  */

   972 void check_for_control_characters(const char *aline)

   973 {

   974     gunichar c;

   975     const char *s;

   976     for (s=aline;*s;s=g_utf8_next_char(s))

   977     {

   978 	c=g_utf8_get_char(s);

   979 	if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)

   980 	{

   981 	    if (pswit[ECHO_SWITCH])

   982 		g_print("\n%s\n",aline);

   983 	    if (!pswit[OVERVIEW_SWITCH])

   984 		g_print("    Line %ld column %ld - Control character %u\n",

   985 		  linecnt,g_utf8_pointer_to_offset(s,aline)+1,c);

   986 	    else

   987 		cnt_bin++;

   988 	}

   989     }

   990 }

   992 /*

   993  * check_for_odd_characters:

   994  *

   995  * Check for binary and other odd characters.

   996  */

   997 void check_for_odd_characters(const char *aline,const struct warnings *warnings,

   998   gboolean isemptyline)

   999 {

  1000     /* Don't repeat multiple warnings on one line. */

  1001     gboolean eNon_A=FALSE,eTab=FALSE,eTilde=FALSE;

  1002     gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;

  1003     const char *s;

  1004     gunichar c;

  1005     for (s=aline;*s;s=g_utf8_next_char(s))

  1006     {

  1007 	c=g_utf8_get_char(s);

  1008 	if (!eNon_A && (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))

  1009 	{

  1010 	    if (pswit[ECHO_SWITCH])

  1011 		g_print("\n%s\n",aline);

  1012 	    if (!pswit[OVERVIEW_SWITCH])

  1013 		if (c>127 && c<160 || c>255)

  1014 		    g_print("    Line %ld column %ld - "

  1015 		      "Non-ISO-8859 character %u\n",

  1016 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);

  1017 		else

  1018 		    g_print("    Line %ld column %ld - "

  1019 		      "Non-ASCII character %u\n",

  1020 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);

  1021 	    else

  1022 		cnt_bin++;

  1023 	    eNon_A=TRUE;

  1024 	}

  1025 	if (!eTab && c==CHAR_TAB)

  1026 	{

  1027 	    if (pswit[ECHO_SWITCH])

  1028 		g_print("\n%s\n",aline);

  1029 	    if (!pswit[OVERVIEW_SWITCH])

  1030 		g_print("    Line %ld column %ld - Tab character?\n",

  1031 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1032 	    else

  1033 		cnt_odd++;

  1034 	    eTab=TRUE;

  1035 	}

  1036 	if (!eTilde && c==CHAR_TILDE)

  1037 	{

  1038 	    /*

  1039 	     * Often used by OCR software to indicate an

  1040 	     * unrecognizable character.

  1041 	     */

  1042 	    if (pswit[ECHO_SWITCH])

  1043 		g_print("\n%s\n",aline);

  1044 	    if (!pswit[OVERVIEW_SWITCH])

  1045 		g_print("    Line %ld column %ld - Tilde character?\n",

  1046 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1047 	    else

  1048 		cnt_odd++;

  1049 	    eTilde=TRUE;

  1050 	}

  1051 	if (!eCarat && c==CHAR_CARAT)

  1052 	{

  1053 	    if (pswit[ECHO_SWITCH])

  1054 		g_print("\n%s\n",aline);

  1055 	    if (!pswit[OVERVIEW_SWITCH])

  1056 		g_print("    Line %ld column %ld - Carat character?\n",

  1057 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1058 	    else

  1059 		cnt_odd++;

  1060 	    eCarat=TRUE;

  1061 	}

  1062 	if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)

  1063 	{

  1064 	    if (pswit[ECHO_SWITCH])

  1065 		g_print("\n%s\n",aline);

  1066 	    if (!pswit[OVERVIEW_SWITCH])

  1067 		g_print("    Line %ld column %ld - Forward slash?\n",

  1068 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1069 	    else

  1070 		cnt_odd++;

  1071 	    eFSlash=TRUE;

  1072 	}

  1073 	/*

  1074 	 * Report asterisks only in paranoid mode,

  1075 	 * since they're often deliberate.

  1076 	 */

  1077 	if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&

  1078 	  c==CHAR_ASTERISK)

  1079 	{

  1080 	    if (pswit[ECHO_SWITCH])

  1081 		g_print("\n%s\n",aline);

  1082 	    if (!pswit[OVERVIEW_SWITCH])

  1083 		g_print("    Line %ld column %ld - Asterisk?\n",

  1084 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1085 	    else

  1086 		cnt_odd++;

  1087 	    eAst=TRUE;

  1088 	}

  1089     }

  1090 }

  1092 /*

  1093  * check_for_long_line:

  1094  *

  1095  * Check for line too long.

  1096  */

  1097 void check_for_long_line(const char *aline)

  1098 {

  1099     if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)

  1100     {

  1101 	if (pswit[ECHO_SWITCH])

  1102 	    g_print("\n%s\n",aline);

  1103 	if (!pswit[OVERVIEW_SWITCH])

  1104 	    g_print("    Line %ld column %ld - Long line %ld\n",

  1105 	      linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));

  1106 	else

  1107 	    cnt_long++;

  1108     }

  1109 }

  1111 /*

  1112  * check_for_short_line:

  1113  *

  1114  * Check for line too short.

  1115  *

  1116  * This one is a bit trickier to implement: we don't want to

  1117  * flag the last line of a paragraph for being short, so we

  1118  * have to wait until we know that our current line is a

  1119  * "normal" line, then report the _previous_ line if it was too

  1120  * short. We also don't want to report indented lines like

  1121  * chapter heads or formatted quotations. We therefore keep

  1122  * last->len as the length of the last line examined, and

  1123  * last->blen as the length of the last but one, and try to

  1124  * suppress unnecessary warnings by checking that both were of

  1125  * "normal" length. We keep the first character of the last

  1126  * line in last->start, and if it was a space, we assume that

  1127  * the formatting is deliberate. I can't figure out a way to

  1128  * distinguish something like a quoted verse left-aligned or

  1129  * the header or footer of a letter from a paragraph of short

  1130  * lines - maybe if I examined the whole paragraph, and if the

  1131  * para has less than, say, 8 lines and if all lines are short,

  1132  * then just assume it's OK? Need to look at some texts to see

  1133  * how often a formula like this would get the right result.

  1134  */

  1135 void check_for_short_line(const char *aline,const struct line_properties *last)

  1136 {

  1137     if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&

  1138       last->len<SHORTEST_PG_LINE && last->blen>1 &&

  1139       last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)

  1140     {

  1141 	if (pswit[ECHO_SWITCH])

  1142 	    g_print("\n%s\n",prevline);

  1143 	if (!pswit[OVERVIEW_SWITCH])

  1144 	    g_print("    Line %ld column %ld - Short line %ld?\n",

  1145 	      linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));

  1146 	else

  1147 	    cnt_short++;

  1148     }

  1149 }

  1151 /*

  1152  * check_for_starting_punctuation:

  1153  *

  1154  * Look for punctuation other than full ellipses at start of line.

  1155  */

  1156 void check_for_starting_punctuation(const char *aline)

  1157 {

  1158     if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&

  1159       !g_str_has_prefix(aline,". . ."))

  1160     {

  1161 	if (pswit[ECHO_SWITCH])

  1162 	    g_print("\n%s\n",aline);

  1163 	if (!pswit[OVERVIEW_SWITCH])

  1164 	    g_print("    Line %ld column 1 - Begins with punctuation?\n",

  1165 	      linecnt);

  1166 	else

  1167 	    cnt_punct++;

  1168     }

  1169 }

  1171 /*

  1172  * check_for_spaced_emdash:

  1173  *

  1174  * Check for spaced em-dashes.

  1175  *

  1176  * We must check _all_ occurrences of "--" on the line

  1177  * hence the loop - even if the first double-dash is OK

  1178  * there may be another that's wrong later on.

  1179  */

  1180 void check_for_spaced_emdash(const char *aline)

  1181 {

  1182     const char *s,*t,*next;

  1183     for (s=aline;t=strstr(s,"--");s=next)

  1184     {

  1185 	next=g_utf8_next_char(g_utf8_next_char(t));

  1186 	if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||

  1187 	  g_utf8_get_char(next)==CHAR_SPACE)

  1188 	{

  1189 	    if (pswit[ECHO_SWITCH])

  1190 		g_print("\n%s\n",aline);

  1191 	    if (!pswit[OVERVIEW_SWITCH])

  1192 		g_print("    Line %ld column %ld - Spaced em-dash?\n",

  1193 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);

  1194 	    else

  1195 		cnt_dash++;

  1196 	}

  1197     }

  1198 }

  1200 /*

  1201  * check_for_spaced_dash:

  1202  *

  1203  * Check for spaced dashes.

  1204  */

  1205 void check_for_spaced_dash(const char *aline)

  1206 {

  1207     const char *s;

  1208     if ((s=strstr(aline," -")))

  1209     {

  1210 	if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')

  1211 	{

  1212 	    if (pswit[ECHO_SWITCH])

  1213 		g_print("\n%s\n",aline);

  1214 	    if (!pswit[OVERVIEW_SWITCH])

  1215 		g_print("    Line %ld column %ld - Spaced dash?\n",

  1216 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1217 	    else

  1218 		cnt_dash++;

  1219 	}

  1220     }

  1221     else if ((s=strstr(aline,"- ")))

  1222     {

  1223 	if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')

  1224 	{

  1225 	    if (pswit[ECHO_SWITCH])

  1226 		g_print("\n%s\n",aline);

  1227 	    if (!pswit[OVERVIEW_SWITCH])

  1228 		g_print("    Line %ld column %ld - Spaced dash?\n",

  1229 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1230 	    else

  1231 		cnt_dash++;

  1232 	}

  1233     }

  1234 }

  1236 /*

  1237  * check_for_unmarked_paragraphs:

  1238  *

  1239  * Check for unmarked paragraphs indicated by separate speakers.

  1240  *

  1241  * May well be false positive:

  1242  * "Bravo!" "Wonderful!" called the crowd.

  1243  * but useful all the same.

  1244  */

  1245 void check_for_unmarked_paragraphs(const char *aline)

  1246 {

  1247     const char *s;

  1248     s=strstr(aline,"\"  \"");

  1249     if (!s)

  1250 	s=strstr(aline,"\" \"");

  1251     if (s)

  1252     {

  1253 	if (pswit[ECHO_SWITCH])

  1254 	    g_print("\n%s\n",aline);

  1255 	if (!pswit[OVERVIEW_SWITCH])

  1256 	    g_print("    Line %ld column %ld - "

  1257 	      "Query missing paragraph break?\n",

  1258 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1259 	else

  1260 	    cnt_punct++;

  1261     }

  1262 }

  1264 /*

  1265  * check_for_jeebies:

  1266  *

  1267  * Check for "to he" and other easy h/b errors.

  1268  *

  1269  * This is a very inadequate effort on the h/b problem,

  1270  * but the phrase "to he" is always an error, whereas "to

  1271  * be" is quite common.

  1272  * Similarly, '"Quiet!", be said.' is a non-be error

  1273  * "to he" is _not_ always an error!:

  1274  *       "Where they went to he couldn't say."

  1275  * Another false positive:

  1276  *       What would "Cinderella" be without the . . .

  1277  * and another: "If he wants to he can see for himself."

  1278  */

  1279 void check_for_jeebies(const char *aline)

  1280 {

  1281     const char *s;

  1282     s=strstr(aline," be could ");

  1283     if (!s)

  1284 	s=strstr(aline," be would ");

  1285     if (!s)

  1286 	s=strstr(aline," was be ");

  1287     if (!s)

  1288 	s=strstr(aline," be is ");

  1289     if (!s)

  1290 	s=strstr(aline," is be ");

  1291     if (!s)

  1292 	s=strstr(aline,"\", be ");

  1293     if (!s)

  1294 	s=strstr(aline,"\" be ");

  1295     if (!s)

  1296 	s=strstr(aline,"\" be ");

  1297     if (!s)

  1298 	s=strstr(aline," to he ");

  1299     if (s)

  1300     {

  1301 	if (pswit[ECHO_SWITCH])

  1302 	    g_print("\n%s\n",aline);

  1303 	if (!pswit[OVERVIEW_SWITCH])

  1304 	    g_print("    Line %ld column %ld - Query he/be error?\n",

  1305 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1306 	else

  1307 	    cnt_word++;

  1308     }

  1309     s=strstr(aline," the had ");

  1310     if (!s)

  1311 	s=strstr(aline," a had ");

  1312     if (!s)

  1313 	s=strstr(aline," they bad ");

  1314     if (!s)

  1315 	s=strstr(aline," she bad ");

  1316     if (!s)

  1317 	s=strstr(aline," he bad ");

  1318     if (!s)

  1319 	s=strstr(aline," you bad ");

  1320     if (!s)

  1321 	s=strstr(aline," i bad ");

  1322     if (s)

  1323     {

  1324 	if (pswit[ECHO_SWITCH])

  1325 	    g_print("\n%s\n",aline);

  1326 	if (!pswit[OVERVIEW_SWITCH])

  1327 	    g_print("    Line %ld column %ld - Query had/bad error?\n",

  1328 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1329 	else

  1330 	    cnt_word++;

  1331     }

  1332     s=strstr(aline,"; hut ");

  1333     if (!s)

  1334 	s=strstr(aline,", hut ");

  1335     if (s)

  1336     {

  1337 	if (pswit[ECHO_SWITCH])

  1338 	    g_print("\n%s\n",aline);

  1339 	if (!pswit[OVERVIEW_SWITCH])

  1340 	    g_print("    Line %ld column %ld - Query hut/but error?\n",

  1341 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1342 	else

  1343 	    cnt_word++;

  1344     }

  1345 }

  1347 /*

  1348  * check_for_mta_from:

  1349  *

  1350  * Special case - angled bracket in front of "From" placed there by an

  1351  * MTA when sending an e-mail.

  1352  */

  1353 void check_for_mta_from(const char *aline)

  1354 {

  1355     const char *s;

  1356     s=strstr(aline,">From");

  1357     if (s)

  1358     {

  1359 	if (pswit[ECHO_SWITCH])

  1360 	    g_print("\n%s\n",aline);

  1361 	if (!pswit[OVERVIEW_SWITCH])

  1362 	    g_print("    Line %ld column %ld - "

  1363 	      "Query angled bracket with From\n",

  1364 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1365 	else

  1366 	    cnt_punct++;

  1367     }

  1368 }

  1370 /*

  1371  * check_for_orphan_character:

  1372  *

  1373  * Check for a single character line -

  1374  * often an overflow from bad wrapping.

  1375  */

  1376 void check_for_orphan_character(const char *aline)

  1377 {

  1378     gunichar c;

  1379     c=g_utf8_get_char(aline);

  1380     if (c && !*g_utf8_next_char(aline))

  1381     {

  1382 	if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))

  1383 	    ; /* Nothing - ignore numerals alone on a line. */

  1384 	else

  1385 	{

  1386 	    if (pswit[ECHO_SWITCH])

  1387 		g_print("\n%s\n",aline);

  1388 	    if (!pswit[OVERVIEW_SWITCH])

  1389 		g_print("    Line %ld column 1 - Query single character line\n",

  1390 		  linecnt);

  1391 	    else

  1392 		cnt_punct++;

  1393 	}

  1394     }

  1395 }

  1397 /*

  1398  * check_for_pling_scanno:

  1399  *

  1400  * Check for I" - often should be !

  1401  */

  1402 void check_for_pling_scanno(const char *aline)

  1403 {

  1404     const char *s;

  1405     s=strstr(aline," I\"");

  1406     if (s)

  1407     {

  1408 	if (pswit[ECHO_SWITCH])

  1409 	    g_print("\n%s\n",aline);

  1410 	if (!pswit[OVERVIEW_SWITCH])

  1411 	    g_print("    Line %ld column %ld - Query I=exclamation mark?\n",

  1412 	      linecnt,g_utf8_pointer_to_offset(aline,s));

  1413 	else

  1414 	    cnt_punct++;

  1415     }

  1416 }

  1418 /*

  1419  * check_for_extra_period:

  1420  *

  1421  * Check for period without a capital letter. Cut-down from gutspell.

  1422  * Only works when it happens on a single line.

  1423  */

  1424 void check_for_extra_period(const char *aline,const struct warnings *warnings)

  1425 {

  1426     const char *s,*t,*s1;

  1427     int i;

  1428     gsize len;

  1429     gboolean istypo;

  1430     gchar *testword;

  1431     gunichar *decomposition;

  1432     if (pswit[PARANOID_SWITCH])

  1433     {

  1434 	for (t=aline;t=strstr(t,". ");)

  1435 	{

  1436 	    if (t==aline)

  1437 	    {

  1438 		t=g_utf8_next_char(t);

  1439 		/* start of line punctuation is handled elsewhere */

  1440 		continue;

  1441 	    }

  1442 	    if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))

  1443 	    {

  1444 		t=g_utf8_next_char(t);

  1445 		continue;

  1446 	    }

  1447 	    if (warnings->isDutch)

  1448 	    {

  1449 		/* For Frank & Jeroen -- 's Middags case */

  1450 		gunichar c2,c3,c4,c5;

  1451 		c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));

  1452 		c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));

  1453 		c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));

  1454 		c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));

  1455 		if (c2==CHAR_SQUOTE && g_unichar_islower(c3) &&

  1456 		  c4==CHAR_SPACE && g_unichar_isupper(c5))

  1457 		{

  1458 		    t=g_utf8_next_char(t);

  1459 		    continue;

  1460 		}

  1461 	    }

  1462 	    s1=g_utf8_next_char(g_utf8_next_char(t));

  1463 	    while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&

  1464 	      !isdigit(g_utf8_get_char(s1)))

  1465 		s1=g_utf8_next_char(s1);

  1466 	    if (g_unichar_islower(g_utf8_get_char(s1)))

  1467 	    {

  1468 		/* we have something to investigate */

  1469 		istypo=TRUE;

  1470 		/* so let's go back and find out */

  1471 		for (s1=g_utf8_prev_char(t);s1>=aline &&

  1472 		  (g_unichar_isalpha(g_utf8_get_char(s1)) ||

  1473 		  g_unichar_isdigit(g_utf8_get_char(s1)) ||

  1474 		  g_utf8_get_char(s1)==CHAR_SQUOTE &&

  1475 		  g_unichar_isalpha(g_utf8_get_char(g_utf8_next_char(s1))) &&

  1476 		  g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s1))));

  1477 		  s1=g_utf8_prev_char(s1))

  1478 		    ;

  1479 		s1=g_utf8_next_char(s1);

  1480 		s=strchr(s1,'.');

  1481 		if (s)

  1482 		    testword=g_strndup(s1,s-s1);

  1483 		else

  1484 		    testword=g_strdup(s1);

  1485 		for (i=0;*abbrev[i];i++)

  1486 		    if (!strcmp(testword,abbrev[i]))

  1487 			istypo=FALSE;

  1488 		if (g_unichar_isdigit(g_utf8_get_char(testword)))

  1489 		    istypo=FALSE;

  1490 		if (!*g_utf8_next_char(testword))

  1491 		    istypo=FALSE;

  1492 		if (isroman(testword))

  1493 		    istypo=FALSE;

  1494 		if (istypo)

  1495 		{

  1496 		    istypo=FALSE;

  1497 		    for (s=testword;*s;s=g_utf8_next_char(s))

  1498 		    {

  1499 			decomposition=g_unicode_canonical_decomposition(

  1500 			  g_utf8_get_char(s),&len);

  1501 			if (g_utf8_strchr("aeiou",-1,decomposition[0]))

  1502 			    istypo=TRUE;

  1503 			g_free(decomposition);

  1504 		    }

  1505 		}

  1506 		if (istypo &&

  1507 		  (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))

  1508 		{

  1509 		    g_tree_insert(qperiod,g_strdup(testword),

  1510 		      GINT_TO_POINTER(1));

  1511 		    if (pswit[ECHO_SWITCH])

  1512 			g_print("\n%s\n",aline);

  1513 		    if (!pswit[OVERVIEW_SWITCH])

  1514 			g_print("    Line %ld column %ld - Extra period?\n",

  1515 			  linecnt,g_utf8_pointer_to_offset(aline,t)+1);

  1516 		    else

  1517 			cnt_punct++;

  1518 		}

  1519 		g_free(testword);

  1520 	    }

  1521 	    t=g_utf8_next_char(t);

  1522 	}

  1523     }

  1524 }

  1526 /*

  1527  * check_for_following_punctuation:

  1528  *

  1529  * Check for words usually not followed by punctuation.

  1530  */

  1531 void check_for_following_punctuation(const char *aline)

  1532 {

  1533     int i;

  1534     const char *s,*wordstart;

  1535     gunichar c;

  1536     gchar *inword,*t;

  1537     if (pswit[TYPO_SWITCH])

  1538     {

  1539 	for (s=aline;*s;)

  1540 	{

  1541 	    wordstart=s;

  1542 	    t=getaword(&s);

  1543 	    if (!*t)

  1544 	    {

  1545 		g_free(t);

  1546 		continue;

  1547 	    }

  1548 	    inword=g_utf8_strdown(t,-1);

  1549 	    g_free(t);

  1550 	    for (i=0;*nocomma[i];i++)

  1551 		if (!strcmp(inword,nocomma[i]))

  1552 		{

  1553 		    c=g_utf8_get_char(s);

  1554 		    if (c==',' || c==';' || c==':')

  1555 		    {

  1556 			if (pswit[ECHO_SWITCH])

  1557 			    g_print("\n%s\n",aline);

  1558 			if (!pswit[OVERVIEW_SWITCH])

  1559 			    g_print("    Line %ld column %ld - "

  1560 			      "Query punctuation after %s?\n",

  1561 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,

  1562 			      inword);

  1563 			else

  1564 			    cnt_punct++;

  1565 		    }

  1566 		}

  1567 	    for (i=0;*noperiod[i];i++)

  1568 		if (!strcmp(inword,noperiod[i]))

  1569 		{

  1570 		    c=g_utf8_get_char(s);

  1571 		    if (c=='.' || c=='!')

  1572 		    {

  1573 			if (pswit[ECHO_SWITCH])

  1574 			    g_print("\n%s\n",aline);

  1575 			if (!pswit[OVERVIEW_SWITCH])

  1576 			    g_print("    Line %ld column %ld - "

  1577 			      "Query punctuation after %s?\n",

  1578 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,

  1579 			      inword);

  1580 			else

  1581 			    cnt_punct++;

  1582 		    }

  1583 		}

  1584 	    g_free(inword);

  1585 	}

  1586     }

  1587 }

  1589 /*

  1590  * check_for_typos:

  1591  *

  1592  * Check for commonly mistyped words,

  1593  * and digits like 0 for O in a word.

  1594  */

  1595 void check_for_typos(const char *aline,struct warnings *warnings)

  1596 {

  1597     const char *s,*t,*nt,*wordstart;

  1598     gchar *inword;

  1599     gunichar *decomposition;

  1600     gchar *testword;

  1601     int i,vowel,consonant,*dupcnt;

  1602     gboolean isdup,istypo,alower;

  1603     gunichar c;

  1604     long offset,len;

  1605     gsize decomposition_len;

  1606     for (s=aline;*s;)

  1607     {

  1608 	wordstart=s;

  1609 	inword=getaword(&s);

  1610 	if (!*inword)

  1611 	{

  1612 	    g_free(inword);

  1613 	    continue; /* don't bother with empty lines */

  1614 	}

  1615 	if (mixdigit(inword))

  1616 	{

  1617 	    if (pswit[ECHO_SWITCH])

  1618 		g_print("\n%s\n",aline);

  1619 	    if (!pswit[OVERVIEW_SWITCH])

  1620 		g_print("    Line %ld column %ld - Query digit in %s\n",

  1621 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);

  1622 	    else

  1623 		cnt_word++;

  1624 	}

  1625 	/*

  1626 	 * Put the word through a series of tests for likely typos and OCR

  1627 	 * errors.

  1628 	 */

  1629 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])

  1630 	{

  1631 	    istypo=FALSE;

  1632 	    alower=FALSE;

  1633 	    for (t=inword;*t;t=g_utf8_next_char(t))

  1634 	    {

  1635 		c=g_utf8_get_char(t);

  1636 		nt=g_utf8_next_char(t);

  1637 		/* lowercase for testing */

  1638 		if (g_unichar_islower(c))

  1639 		    alower=TRUE;

  1640 		if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))

  1641 		{

  1642 		    /*

  1643 		     * We have an uppercase mid-word. However, there are

  1644 		     * common cases:

  1645 		     *   Mac and Mc like McGill

  1646 		     *   French contractions like l'Abbe

  1647 		     */

  1648 		    offset=g_utf8_pointer_to_offset(inword,t);

  1649 		    if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||

  1650 		      offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&

  1651 		      g_utf8_get_char(g_utf8_next_char(nt))=='c' ||

  1652 		      offset>0 &&

  1653 		      g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SQUOTE)

  1654 			; /* do nothing! */

  1655 		    else

  1656 			istypo=TRUE;

  1657 		}

  1658 	    }

  1659 	    testword=g_utf8_casefold(inword,-1);

  1660 	}

  1661 	if (pswit[TYPO_SWITCH])

  1662 	{

  1663 	    /*

  1664 	     * Check for certain unlikely two-letter combinations at word

  1665 	     * start and end.

  1666 	     */

  1667 	    len=g_utf8_strlen(testword,-1);

  1668 	    if (len>1)

  1669 	    {

  1670 		for (i=0;*nostart[i];i++)

  1671 		    if (g_str_has_prefix(testword,nostart[i]))

  1672 			istypo=TRUE;

  1673 		for (i=0;*noend[i];i++)

  1674 		    if (g_str_has_suffix(testword,noend[i]))

  1675 			istypo=TRUE;

  1676 	    }

  1677 	    /* ght is common, gbt never. Like that. */

  1678 	    if (strstr(testword,"cb"))

  1679 		istypo=TRUE;

  1680 	    if (strstr(testword,"gbt"))

  1681 		istypo=TRUE;

  1682 	    if (strstr(testword,"pbt"))

  1683 		istypo=TRUE;

  1684 	    if (strstr(testword,"tbs"))

  1685 		istypo=TRUE;

  1686 	    if (strstr(testword,"mrn"))

  1687 		istypo=TRUE;

  1688 	    if (strstr(testword,"ahle"))

  1689 		istypo=TRUE;

  1690 	    if (strstr(testword,"ihle"))

  1691 		istypo=TRUE;

  1692 	    /*

  1693 	     * "TBE" does happen - like HEARTBEAT - but uncommon.

  1694 	     * Also "TBI" - frostbite, outbid - but uncommon.

  1695 	     * Similarly "ii" like Hawaii, or Pompeii, and in Roman

  1696 	     * numerals, but "ii" is a common scanno.

  1697 	     */

  1698 	    if (strstr(testword,"tbi"))

  1699 		istypo=TRUE;

  1700 	    if (strstr(testword,"tbe"))

  1701 		istypo=TRUE;

  1702 	    if (strstr(testword,"ii"))

  1703 		istypo=TRUE;

  1704 	    /*

  1705 	     * Check for no vowels or no consonants.

  1706 	     * If none, flag a typo.

  1707 	     */

  1708 	    if (!istypo && len>1)

  1709 	    {

  1710 		vowel=consonant=0;

  1711 		for (t=testword;*t;t=g_utf8_next_char(t))

  1712 		{

  1713 		    c=g_utf8_get_char(t);

  1714 		    decomposition=

  1715 		      g_unicode_canonical_decomposition(c,&decomposition_len);

  1716 		    if (c=='y' || g_unichar_isdigit(c))

  1717 		    {

  1718 			/* Yah, this is loose. */

  1719 			vowel++;

  1720 			consonant++;

  1721 		    }

  1722 		    else if (g_utf8_strchr("aeiou",-1,decomposition[0]))

  1723 			vowel++;

  1724 		    else

  1725 			consonant++;

  1726 		    g_free(decomposition);

  1727 		}

  1728 		if (!vowel || !consonant)

  1729 		    istypo=TRUE;

  1730 	    }

  1731 	    /*

  1732 	     * Now exclude the word from being reported if it's in

  1733 	     * the okword list.

  1734 	     */

  1735 	    for (i=0;*okword[i];i++)

  1736 		if (!strcmp(testword,okword[i]))

  1737 		    istypo=FALSE;

  1738 	    /*

  1739 	     * What looks like a typo may be a Roman numeral.

  1740 	     * Exclude these.

  1741 	     */

  1742 	    if (istypo && isroman(testword))

  1743 		istypo=FALSE;

  1744 	    /* Check the manual list of typos. */

  1745 	    if (!istypo)

  1746 		for (i=0;*typo[i];i++)

  1747 		    if (!strcmp(testword,typo[i]))

  1748 			istypo=TRUE;

  1749 	    /*

  1750 	     * Check lowercase s, l, i and m - special cases.

  1751 	     *   "j" - often a semi-colon gone wrong.

  1752 	     *   "d" for a missing apostrophe - he d

  1753 	     *   "n" for "in"

  1754 	     */

  1755 	    if (!istypo && len==1 &&

  1756 	      g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))

  1757 		istypo=TRUE;

  1758 	    if (istypo)

  1759 	    {

  1760 		dupcnt=g_tree_lookup(qword,testword);

  1761 		if (dupcnt)

  1762 		{

  1763 		    (*dupcnt)++;

  1764 		    isdup=!pswit[VERBOSE_SWITCH];

  1765 		}

  1766 		else

  1767 		{

  1768 		    dupcnt=g_new0(int,1);

  1769 		    g_tree_insert(qword,g_strdup(testword),dupcnt);

  1770 		    isdup=FALSE;

  1771 		}

  1772 		if (!isdup)

  1773 		{

  1774 		    if (pswit[ECHO_SWITCH])

  1775 			g_print("\n%s\n",aline);

  1776 		    if (!pswit[OVERVIEW_SWITCH])

  1777 		    {

  1778 			g_print("    Line %ld column %ld - Query word %s",

  1779 			  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,

  1780 			  inword);

  1781 			if (!pswit[VERBOSE_SWITCH])

  1782 			    g_print(" - not reporting duplicates");

  1783 			g_print("\n");

  1784 		    }

  1785 		    else

  1786 			cnt_word++;

  1787 		}

  1788 	    }

  1789 	}

  1790 	/* check the user's list of typos */

  1791 	if (!istypo && usertypo && g_tree_lookup(usertypo,testword))

  1792 	{

  1793 	    if (pswit[ECHO_SWITCH])

  1794 		g_print("\n%s\n",aline);

  1795 	    if (!pswit[OVERVIEW_SWITCH])

  1796 		g_print("    Line %ld column %ld - Query possible scanno %s\n",

  1797 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);

  1798 	}

  1799 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])

  1800 	    g_free(testword);

  1801 	if (pswit[PARANOID_SWITCH] && warnings->digit)

  1802 	{

  1803 	    /* In paranoid mode, query all 0 and 1 standing alone. */

  1804 	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))

  1805 	    {

  1806 		if (pswit[ECHO_SWITCH])

  1807 		    g_print("\n%s\n",aline);

  1808 		if (!pswit[OVERVIEW_SWITCH])

  1809 		    g_print("    Line %ld column %ld - Query standalone %s\n",

  1810 		      linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,

  1811 		      inword);

  1812 		else

  1813 		    cnt_word++;

  1814 	    }

  1815 	}

  1816 	g_free(inword);

  1817     }

  1818 }

  1820 /*

  1821  * check_for_misspaced_punctuation:

  1822  *

  1823  * Look for added or missing spaces around punctuation and quotes.

  1824  * If there is a punctuation character like ! with no space on

  1825  * either side, suspect a missing!space. If there are spaces on

  1826  * both sides , assume a typo. If we see a double quote with no

  1827  * space or punctuation on either side of it, assume unspaced

  1828  * quotes "like"this.

  1829  */

  1830 void check_for_misspaced_punctuation(const char *aline,

  1831   struct parities *parities,gboolean isemptyline)

  1832 {

  1833     gboolean isacro,isellipsis;

  1834     const char *s;

  1835     gunichar c,nc,pc,n2c;

  1836     c=g_utf8_get_char(aline);

  1837     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  1838     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  1839     {

  1840 	pc=c;

  1841 	c=nc;

  1842 	nc=g_utf8_get_char(g_utf8_next_char(s));

  1843 	/* For each character in the line after the first. */

  1844 	if (g_utf8_strchr(".?!,;:_",-1,c))  /* if it's punctuation */

  1845 	{

  1846 	    /* we need to suppress warnings for acronyms like M.D. */

  1847 	    isacro=FALSE;

  1848 	    /* we need to suppress warnings for ellipsis . . . */

  1849 	    isellipsis=FALSE;

  1850 	    /*

  1851 	     * If there are letters on both sides of it or

  1852 	     * if it's strict punctuation followed by an alpha.

  1853 	     */

  1854 	    if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||

  1855 	      g_utf8_strchr("?!,;:",-1,c)))

  1856 	    {

  1857 		if (c=='.')

  1858 		{

  1859 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&

  1860 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')

  1861 			isacro=TRUE;

  1862 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));

  1863 		    if (nc && n2c=='.')

  1864 			isacro=TRUE;

  1865 		}

  1866 		if (!isacro)

  1867 		{

  1868 		    if (pswit[ECHO_SWITCH])

  1869 			g_print("\n%s\n",aline);

  1870 		    if (!pswit[OVERVIEW_SWITCH])

  1871 			g_print("    Line %ld column %ld - Missing space?\n",

  1872 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1873 		    else

  1874 			cnt_punct++;

  1875 		}

  1876 	    }

  1877 	    if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))

  1878 	    {

  1879 		/*

  1880 		 * If there are spaces on both sides,

  1881 		 * or space before and end of line.

  1882 		 */

  1883 		if (c=='.')

  1884 		{

  1885 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&

  1886 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')

  1887 			isellipsis=TRUE;

  1888 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));

  1889 		    if (nc && n2c=='.')

  1890 			isellipsis=TRUE;

  1891 		}

  1892 		if (!isemptyline && !isellipsis)

  1893 		{

  1894 		    if (pswit[ECHO_SWITCH])

  1895 			g_print("\n%s\n",aline);

  1896 		    if (!pswit[OVERVIEW_SWITCH])

  1897 			g_print("    Line %ld column %ld - "

  1898 			  "Spaced punctuation?\n",linecnt,

  1899 			  g_utf8_pointer_to_offset(aline,s)+1);

  1900 		    else

  1901 			cnt_punct++;

  1902 		}

  1903 	    }

  1904 	}

  1905     }

  1906     /* Split out the characters that CANNOT be preceded by space. */

  1907     c=g_utf8_get_char(aline);

  1908     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  1909     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  1910     {

  1911 	pc=c;

  1912 	c=nc;

  1913 	nc=g_utf8_get_char(g_utf8_next_char(s));

  1914 	/* for each character in the line after the first */

  1915 	if (g_utf8_strchr("?!,;:",-1,c))

  1916 	{

  1917 	    /* if it's punctuation that _cannot_ have a space before it */

  1918 	    if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)

  1919 	    {

  1920 		/*

  1921 		 * If nc DOES == space,

  1922 		 * it was already reported just above.

  1923 		 */

  1924 		if (pswit[ECHO_SWITCH])

  1925 		    g_print("\n%s\n",aline);

  1926 		if (!pswit[OVERVIEW_SWITCH])

  1927 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",

  1928 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1929 		else

  1930 		    cnt_punct++;

  1931 	    }

  1932 	}

  1933     }

  1934     /*

  1935      * Special case " .X" where X is any alpha.

  1936      * This plugs a hole in the acronym code above.

  1937      * Inelegant, but maintainable.

  1938      */

  1939     c=g_utf8_get_char(aline);

  1940     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  1941     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  1942     {

  1943 	pc=c;

  1944 	c=nc;

  1945 	nc=g_utf8_get_char(g_utf8_next_char(s));

  1946 	/* for each character in the line after the first */

  1947 	if (c=='.')

  1948 	{

  1949 	    /* if it's a period */

  1950 	    if (pc==CHAR_SPACE && g_unichar_isalpha(nc))

  1951 	    {

  1952 		/*

  1953 		 * If the period follows a space and

  1954 		 * is followed by a letter.

  1955 		 */

  1956 		if (pswit[ECHO_SWITCH])

  1957 		    g_print("\n%s\n",aline);

  1958 		if (!pswit[OVERVIEW_SWITCH])

  1959 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",

  1960 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1961 		else

  1962 		    cnt_punct++;

  1963 	    }

  1964 	}

  1965     }

  1966     c=g_utf8_get_char(aline);

  1967     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  1968     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  1969     {

  1970 	pc=c;

  1971 	c=nc;

  1972 	nc=g_utf8_get_char(g_utf8_next_char(s));

  1973 	/* for each character in the line after the first */

  1974 	if (c==CHAR_DQUOTE)

  1975 	{

  1976 	    if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&

  1977 	      !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||

  1978 	      !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))

  1979 	    {

  1980 		if (pswit[ECHO_SWITCH])

  1981 		    g_print("\n%s\n",aline);

  1982 		if (!pswit[OVERVIEW_SWITCH])

  1983 		    g_print("    Line %ld column %ld - Unspaced quotes?\n",

  1984 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1985 		else

  1986 		    cnt_punct++;

  1987 	    }

  1988 	}

  1989     }

  1990     /* Check parity of quotes. */

  1991     nc=g_utf8_get_char(aline);

  1992     for (s=aline;*s;s=g_utf8_next_char(s))

  1993     {

  1994 	c=nc;

  1995 	nc=g_utf8_get_char(g_utf8_next_char(s));

  1996 	if (c==CHAR_DQUOTE)

  1997 	{

  1998 	    parities->dquote=!parities->dquote;

  1999 	    if (!parities->dquote)

  2000 	    {

  2001 		/* parity even */

  2002 		if (!g_utf8_strchr("_-.'`/,;:!?)]} ",-1,nc))

  2003 		{

  2004 		    if (pswit[ECHO_SWITCH])

  2005 			g_print("\n%s\n",aline);

  2006 		    if (!pswit[OVERVIEW_SWITCH])

  2007 			g_print("    Line %ld column %ld - "

  2008 			  "Wrongspaced quotes?\n",

  2009 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2010 		    else

  2011 			cnt_punct++;

  2012 		}

  2013 	    }

  2014 	    else

  2015 	    {

  2016 		/* parity odd */

  2017 		if (!g_unichar_isalpha(nc) && !isdigit(nc) &&

  2018 		  !g_utf8_strchr("_-/.'`([{$",-1,nc) || !nc)

  2019 		{

  2020 		    if (pswit[ECHO_SWITCH])

  2021 			g_print("\n%s\n",aline);

  2022 		    if (!pswit[OVERVIEW_SWITCH])

  2023 			g_print("    Line %ld column %ld - "

  2024 			  "Wrongspaced quotes?\n",

  2025 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2026 		    else

  2027 			cnt_punct++;

  2028 		}

  2029 	    }

  2030 	}

  2031     }

  2032     if (g_utf8_get_char(aline)==CHAR_DQUOTE)

  2033     {

  2034 	if (g_utf8_strchr(",;:!?)]} ",-1,

  2035 	  g_utf8_get_char(g_utf8_next_char(aline))))

  2036 	{

  2037 	    if (pswit[ECHO_SWITCH])

  2038 		g_print("\n%s\n",aline);

  2039 	    if (!pswit[OVERVIEW_SWITCH])

  2040 		g_print("    Line %ld column 1 - Wrongspaced quotes?\n",

  2041 		  linecnt);

  2042 	    else

  2043 		cnt_punct++;

  2044 	}

  2045     }

  2046     if (pswit[SQUOTE_SWITCH])

  2047     {

  2048 	nc=g_utf8_get_char(aline);

  2049 	for (s=aline;*s;s=g_utf8_next_char(s))

  2050 	{

  2051 	    c=nc;

  2052 	    nc=g_utf8_get_char(g_utf8_next_char(s));

  2053 	    if ((c==CHAR_SQUOTE || c==CHAR_OPEN_SQUOTE) && (s==aline ||

  2054 	      s>aline &&

  2055 	      !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||

  2056 	      !g_unichar_isalpha(nc)))

  2057 	    {

  2058 		parities->squote=!parities->squote;

  2059 		if (!parities->squote)

  2060 		{

  2061 		    /* parity even */

  2062 		    if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))

  2063 		    {

  2064 			if (pswit[ECHO_SWITCH])

  2065 			    g_print("\n%s\n",aline);

  2066 			if (!pswit[OVERVIEW_SWITCH])

  2067 			    g_print("    Line %ld column %ld - "

  2068 			      "Wrongspaced singlequotes?\n",

  2069 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2070 			else

  2071 			    cnt_punct++;

  2072 		    }

  2073 		}

  2074 		else

  2075 		{

  2076 		    /* parity odd */

  2077 		    if (!g_unichar_isalpha(nc) && !isdigit(nc) &&

  2078 		      !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)

  2079 		    {

  2080 			if (pswit[ECHO_SWITCH])

  2081 			    g_print("\n%s\n",aline);

  2082 			if (!pswit[OVERVIEW_SWITCH])

  2083 			    g_print("    Line %ld column %ld - "

  2084 			      "Wrongspaced singlequotes?\n",

  2085 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2086 			else

  2087 			    cnt_punct++;

  2088 		    }

  2089 		}

  2090 	    }

  2091 	}

  2092     }

  2093 }

  2095 /*

  2096  * check_for_double_punctuation:

  2097  *

  2098  * Look for double punctuation like ,. or ,,

  2099  * Thanks to DW for the suggestion!

  2100  * In books with references, ".," and ".;" are common

  2101  * e.g. "etc., etc.," and vol. 1.; vol 3.;

  2102  * OTOH, from my initial tests, there are also fairly

  2103  * common errors. What to do? Make these cases paranoid?

  2104  * ".," is the most common, so warnings->dotcomma is used

  2105  * to suppress detailed reporting if it occurs often.

  2106  */

  2107 void check_for_double_punctuation(const char *aline,struct warnings *warnings)

  2108 {

  2109     const char *s;

  2110     gunichar c,nc;

  2111     nc=g_utf8_get_char(aline);

  2112     for (s=aline;*s;s=g_utf8_next_char(s))

  2113     {

  2114 	c=nc;

  2115 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2116 	/* for each punctuation character in the line */

  2117 	if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&

  2118 	  g_utf8_strchr(".?!,;:",-1,nc))

  2119 	{

  2120 	    /* followed by punctuation, it's a query, unless . . . */

  2121 	    if (c==nc && (c=='.' || c=='?' || c=='!') ||

  2122 	      !warnings->dotcomma && c=='.' && nc==',' ||

  2123 	      warnings->isFrench && g_str_has_prefix(s,",...") ||

  2124 	      warnings->isFrench && g_str_has_prefix(s,"...,") ||

  2125 	      warnings->isFrench && g_str_has_prefix(s,";...") ||

  2126 	      warnings->isFrench && g_str_has_prefix(s,"...;") ||

  2127 	      warnings->isFrench && g_str_has_prefix(s,":...") ||

  2128 	      warnings->isFrench && g_str_has_prefix(s,"...:") ||

  2129 	      warnings->isFrench && g_str_has_prefix(s,"!...") ||

  2130 	      warnings->isFrench && g_str_has_prefix(s,"...!") ||

  2131 	      warnings->isFrench && g_str_has_prefix(s,"?...") ||

  2132 	      warnings->isFrench && g_str_has_prefix(s,"...?"))

  2133 	    {

  2134 		if (warnings->isFrench && g_str_has_prefix(s,",...") ||

  2135 		  warnings->isFrench && g_str_has_prefix(s,"...,") ||

  2136 		  warnings->isFrench && g_str_has_prefix(s,";...") ||

  2137 		  warnings->isFrench && g_str_has_prefix(s,"...;") ||

  2138 		  warnings->isFrench && g_str_has_prefix(s,":...") ||

  2139 		  warnings->isFrench && g_str_has_prefix(s,"...:") ||

  2140 		  warnings->isFrench && g_str_has_prefix(s,"!...") ||

  2141 		  warnings->isFrench && g_str_has_prefix(s,"...!") ||

  2142 		  warnings->isFrench && g_str_has_prefix(s,"?...") ||

  2143 		  warnings->isFrench && g_str_has_prefix(s,"...?"))

  2144 		{

  2145 		    s+=4;

  2146 		    nc=g_utf8_get_char(g_utf8_next_char(s));

  2147 		}

  2148 		; /* do nothing for .. !! and ?? which can be legit */

  2149 	    }

  2150 	    else

  2151 	    {

  2152 		if (pswit[ECHO_SWITCH])

  2153 		    g_print("\n%s\n",aline);

  2154 		if (!pswit[OVERVIEW_SWITCH])

  2155 		    g_print("    Line %ld column %ld - Double punctuation?\n",

  2156 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2157 		else

  2158 		    cnt_punct++;

  2159 	    }

  2160 	}

  2161     }

  2162 }

  2164 /*

  2165  * check_for_spaced_quotes:

  2166  */

  2167 void check_for_spaced_quotes(const char *aline)

  2168 {

  2169     const char *s,*t;

  2170     s=aline;

  2171     while ((t=strstr(s," \" ")))

  2172     {

  2173 	if (pswit[ECHO_SWITCH])

  2174 	    g_print("\n%s\n",aline);

  2175 	if (!pswit[OVERVIEW_SWITCH])

  2176 	    g_print("    Line %ld column %ld - Spaced doublequote?\n",

  2177 	      linecnt,g_utf8_pointer_to_offset(aline,t)+1);

  2178 	else

  2179 	    cnt_punct++;

  2180 	s=g_utf8_next_char(g_utf8_next_char(t));

  2181     }

  2182     s=aline;

  2183     while ((t=strstr(s," ' ")))

  2184     {

  2185 	if (pswit[ECHO_SWITCH])

  2186 	    g_print("\n%s\n",aline);

  2187 	if (!pswit[OVERVIEW_SWITCH])

  2188 	    g_print("    Line %ld column %ld - Spaced singlequote?\n",

  2189 	      linecnt,g_utf8_pointer_to_offset(aline,t)+1);

  2190 	else

  2191 	    cnt_punct++;

  2192 	s=g_utf8_next_char(g_utf8_next_char(t));

  2193     }

  2194     s=aline;

  2195     while ((t=strstr(s," ` ")))

  2196     {

  2197 	if (pswit[ECHO_SWITCH])

  2198 	    g_print("\n%s\n",aline);

  2199 	if (!pswit[OVERVIEW_SWITCH])

  2200 	    g_print("    Line %ld column %ld - Spaced singlequote?\n",

  2201 	      linecnt,g_utf8_pointer_to_offset(aline,t)+1);

  2202 	else

  2203 	    cnt_punct++;

  2204 	s=g_utf8_next_char(g_utf8_next_char(t));

  2205     }

  2206 }

  2208 /*

  2209  * check_for_miscased_genative:

  2210  *

  2211  * Check special case of 'S instead of 's at end of word.

  2212  */

  2213 void check_for_miscased_genative(const char *aline)

  2214 {

  2215     const char *s;

  2216     gunichar c,nc,pc;

  2217     if (!*aline)

  2218 	return;

  2219     c=g_utf8_get_char(aline);

  2220     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  2221     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  2222     {

  2223 	pc=c;

  2224 	c=nc;

  2225 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2226 	if (c==CHAR_SQUOTE && nc=='S' && g_unichar_islower(pc))

  2227 	{

  2228 	    if (pswit[ECHO_SWITCH])

  2229 		g_print("\n%s\n",aline);

  2230 	    if (!pswit[OVERVIEW_SWITCH])

  2231 		g_print("    Line %ld column %ld - Capital \"S\"?\n",

  2232 		  linecnt,g_utf8_pointer_to_offset(aline,s)+2);

  2233 	    else

  2234 		cnt_punct++;

  2235 	}

  2236     }

  2237 }

  2239 /*

  2240  * check_end_of_line:

  2241  *

  2242  * Now check special cases - start and end of line -

  2243  * for single and double quotes. Start is sometimes [sic]

  2244  * but better to query it anyway.

  2245  * While we're here, check for dash at end of line.

  2246  */

  2247 void check_end_of_line(const char *aline,struct warnings *warnings)

  2248 {

  2249     int lbytes;

  2250     const char *s;

  2251     gunichar c1,c2;

  2252     lbytes=strlen(aline);

  2253     if (g_utf8_strlen(aline,lbytes)>1)

  2254     {

  2255 	s=g_utf8_prev_char(aline+lbytes);

  2256 	c1=g_utf8_get_char(s);

  2257 	c2=g_utf8_get_char(g_utf8_prev_char(s));

  2258 	if ((c1==CHAR_DQUOTE || c1==CHAR_SQUOTE || c1==CHAR_OPEN_SQUOTE) &&

  2259 	  c2==CHAR_SPACE)

  2260 	{

  2261 	    if (pswit[ECHO_SWITCH])

  2262 		g_print("\n%s\n",aline);

  2263 	    if (!pswit[OVERVIEW_SWITCH])

  2264 		g_print("    Line %ld column %ld - Spaced quote?\n",linecnt,

  2265 		  g_utf8_strlen(aline,lbytes));

  2266 	    else

  2267 		cnt_punct++;

  2268 	}

  2269 	c1=g_utf8_get_char(aline);

  2270 	c2=g_utf8_get_char(g_utf8_next_char(aline));

  2271 	if ((c1==CHAR_SQUOTE || c1==CHAR_OPEN_SQUOTE) && c2==CHAR_SPACE)

  2272 	{

  2273 	    if (pswit[ECHO_SWITCH])

  2274 		g_print("\n%s\n",aline);

  2275 	    if (!pswit[OVERVIEW_SWITCH])

  2276 		g_print("    Line %ld column 1 - Spaced quote?\n",linecnt);

  2277 	    else

  2278 		cnt_punct++;

  2279 	}

  2280 	/*

  2281 	 * Dash at end of line may well be legit - paranoid mode only

  2282 	 * and don't report em-dash at line-end.

  2283 	 */

  2284 	if (pswit[PARANOID_SWITCH] && warnings->hyphen)

  2285 	{

  2286 	    for (s=g_utf8_prev_char(aline+lbytes);

  2287 	      s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))

  2288 		;

  2289 	    if (g_utf8_get_char(s)=='-' &&

  2290 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')

  2291 	    {

  2292 		if (pswit[ECHO_SWITCH])

  2293 		    g_print("\n%s\n",aline);

  2294 		if (!pswit[OVERVIEW_SWITCH])

  2295 		    g_print("    Line %ld column %ld - "

  2296 		      "Hyphen at end of line?\n",

  2297 		      linecnt,g_utf8_pointer_to_offset(aline,s));

  2298 	    }

  2299 	}

  2300     }

  2301 }

  2303 /*

  2304  * check_for_unspaced_bracket:

  2305  *

  2306  * Brackets are often unspaced, but shouldn't be surrounded by alpha.

  2307  * If so, suspect a scanno like "a]most".

  2308  */

  2309 void check_for_unspaced_bracket(const char *aline)

  2310 {

  2311     const char *s;

  2312     gunichar c,nc,pc;

  2313     c=g_utf8_get_char(aline);

  2314     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  2315     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  2316     {

  2317 	pc=c;

  2318 	c=nc;

  2319 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2320 	if (!nc)

  2321 	    break;

  2322 	/* for each bracket character in the line except 1st & last */

  2323 	if (g_utf8_strchr("{[()]}",-1,c) &&

  2324 	  g_unichar_isalpha(pc) && g_unichar_isalpha(nc))

  2325 	{

  2326 	    if (pswit[ECHO_SWITCH])

  2327 		g_print("\n%s\n",aline);

  2328 	    if (!pswit[OVERVIEW_SWITCH])

  2329 		g_print("    Line %ld column %ld - Unspaced bracket?\n",

  2330 		  linecnt,g_utf8_pointer_to_offset(aline,s));

  2331 	    else

  2332 		cnt_punct++;

  2333 	}

  2334     }

  2335 }

  2337 /*

  2338  * check_for_unpunctuated_endquote:

  2339  */

  2340 void check_for_unpunctuated_endquote(const char *aline)

  2341 {

  2342     const char *s;

  2343     gunichar c,nc,pc;

  2344     c=g_utf8_get_char(aline);

  2345     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  2346     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  2347     {

  2348 	pc=c;

  2349 	c=nc;

  2350 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2351 	/* for each character in the line except 1st */

  2352 	if (c==CHAR_DQUOTE && isalpha(pc))

  2353 	{

  2354 	    if (pswit[ECHO_SWITCH])

  2355 		g_print("\n%s\n",aline);

  2356 	    if (!pswit[OVERVIEW_SWITCH])

  2357 		g_print("    Line %ld column %ld - "

  2358 		  "endquote missing punctuation?\n",

  2359 		  linecnt,g_utf8_pointer_to_offset(aline,s));

  2360 	    else

  2361 		cnt_punct++;

  2362 	}

  2363     }

  2364 }

  2366 /*

  2367  * check_for_html_tag:

  2368  *

  2369  * Check for <HTML TAG>.

  2370  *

  2371  * If there is a < in the line, followed at some point

  2372  * by a > then we suspect HTML.

  2373  */

  2374 void check_for_html_tag(const char *aline)

  2375 {

  2376     const char *open,*close;

  2377     gchar *tag;

  2378     open=strchr(aline,'<');

  2379     if (open)

  2380     {

  2381 	close=strchr(g_utf8_next_char(open),'>');

  2382 	if (close)

  2383 	{

  2384 	    if (pswit[ECHO_SWITCH])

  2385 		g_print("\n%s\n",aline);

  2386 	    if (!pswit[OVERVIEW_SWITCH])

  2387 	    {

  2388 		tag=g_strndup(open,close-open+1);

  2389 		g_print("    Line %ld column %ld - HTML Tag? %s \n",

  2390 		  linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);

  2391 		g_free(tag);

  2392 	    }

  2393 	    else

  2394 		cnt_html++;

  2395 	}

  2396     }

  2397 }

  2399 /*

  2400  * check_for_html_entity:

  2401  *

  2402  * Check for &symbol; HTML.

  2403  *

  2404  * If there is a & in the line, followed at

  2405  * some point by a ; then we suspect HTML.

  2406  */

  2407 void check_for_html_entity(const char *aline)

  2408 {

  2409     const char *s,*amp,*scolon;

  2410     gchar *entity;

  2411     amp=strchr(aline,'&');

  2412     if (amp)

  2413     {

  2414 	scolon=strchr(amp,';');

  2415 	if (scolon)

  2416 	{

  2417 	    for (s=amp;s<scolon;s=g_utf8_next_char(s))

  2418 		if (g_utf8_get_char(s)==CHAR_SPACE)

  2419 		    break;		/* Don't report "Jones & Son;" */

  2420 	    if (s>=scolon)

  2421 	    {

  2422 		if (pswit[ECHO_SWITCH])

  2423 		    g_print("\n%s\n",aline);

  2424 		if (!pswit[OVERVIEW_SWITCH])

  2425 		{

  2426 		    entity=g_strndup(amp,scolon-amp+1);

  2427 		    g_print("    Line %ld column %d - HTML symbol? %s \n",

  2428 		      linecnt,(int)(amp-aline)+1,entity);

  2429 		    g_free(entity);

  2430 		}

  2431 		else

  2432 		    cnt_html++;

  2433 	    }

  2434 	}

  2435     }

  2436 }

  2438 /*

  2439  * print_pending:

  2440  *

  2441  * If we are in a state of unbalanced quotes, and this line

  2442  * doesn't begin with a quote, output the stored error message.

  2443  * If the -P switch was used, print the warning even if the

  2444  * new para starts with quotes.

  2445  */

  2446 void print_pending(const char *aline,const char *parastart,

  2447   struct pending *pending)

  2448 {

  2449     const char *s;

  2450     gunichar c;

  2451     s=aline;

  2452     while (*s==' ')

  2453 	s++;

  2454     c=g_utf8_get_char(s);

  2455     if (pending->dquote)

  2456     {

  2457 	if (c!=CHAR_DQUOTE || pswit[QPARA_SWITCH])

  2458 	{

  2459 	    if (!pswit[OVERVIEW_SWITCH])

  2460 	    {

  2461 		if (pswit[ECHO_SWITCH])

  2462 		    g_print("\n%s\n",parastart);

  2463 		g_print("%s\n",pending->dquote);

  2464 	    }

  2465 	    else

  2466 		cnt_dquot++;

  2467 	}

  2468 	g_free(pending->dquote);

  2469 	pending->dquote=NULL;

  2470     }

  2471     if (pending->squote)

  2472     {

  2473 	if (c!=CHAR_SQUOTE && c!=CHAR_OPEN_SQUOTE || pswit[QPARA_SWITCH] ||

  2474 	  pending->squot)

  2475 	{

  2476 	    if (!pswit[OVERVIEW_SWITCH])

  2477 	    {

  2478 		if (pswit[ECHO_SWITCH])

  2479 		    g_print("\n%s\n",parastart);

  2480 		g_print("%s\n",pending->squote);

  2481 	    }

  2482 	    else

  2483 		cnt_squot++;

  2484 	}

  2485 	g_free(pending->squote);

  2486 	pending->squote=NULL;

  2487     }

  2488     if (pending->rbrack)

  2489     {

  2490 	if (!pswit[OVERVIEW_SWITCH])

  2491 	{

  2492 	    if (pswit[ECHO_SWITCH])

  2493 		g_print("\n%s\n",parastart);

  2494 	    g_print("%s\n",pending->rbrack);

  2495 	}

  2496 	else

  2497 	    cnt_brack++;

  2498 	g_free(pending->rbrack);

  2499 	pending->rbrack=NULL;

  2500     }

  2501     if (pending->sbrack)

  2502     {

  2503 	if (!pswit[OVERVIEW_SWITCH])

  2504 	{

  2505 	    if (pswit[ECHO_SWITCH])

  2506 		g_print("\n%s\n",parastart);

  2507 	    g_print("%s\n",pending->sbrack);

  2508 	}

  2509 	else

  2510 	    cnt_brack++;

  2511 	g_free(pending->sbrack);

  2512 	pending->sbrack=NULL;

  2513     }

  2514     if (pending->cbrack)

  2515     {

  2516 	if (!pswit[OVERVIEW_SWITCH])

  2517 	{

  2518 	    if (pswit[ECHO_SWITCH])

  2519 		g_print("\n%s\n",parastart);

  2520 	    g_print("%s\n",pending->cbrack);

  2521 	}

  2522 	else

  2523 	    cnt_brack++;

  2524 	g_free(pending->cbrack);

  2525 	pending->cbrack=NULL;

  2526     }

  2527     if (pending->unders)

  2528     {

  2529 	if (!pswit[OVERVIEW_SWITCH])

  2530 	{

  2531 	    if (pswit[ECHO_SWITCH])

  2532 		g_print("\n%s\n",parastart);

  2533 	    g_print("%s\n",pending->unders);

  2534 	}

  2535 	else

  2536 	    cnt_brack++;

  2537 	g_free(pending->unders);

  2538 	pending->unders=NULL;

  2539     }

  2540 }

  2542 /*

  2543  * check_for_mismatched_quotes:

  2544  *

  2545  * At end of paragraph, check for mismatched quotes.

  2546  *

  2547  * We don't want to report an error immediately, since it is a

  2548  * common convention to omit the quotes at end of paragraph if

  2549  * the next paragraph is a continuation of the same speaker.

  2550  * Where this is the case, the next para should begin with a

  2551  * quote, so we store the warning message and only display it

  2552  * at the top of the next iteration if the new para doesn't

  2553  * start with a quote.

  2554  * The -p switch overrides this default, and warns of unclosed

  2555  * quotes on _every_ paragraph, whether the next begins with a

  2556  * quote or not.

  2557  */

  2558 void check_for_mismatched_quotes(const struct counters *counters,

  2559   struct pending *pending)

  2560 {

  2561     if (counters->quot%2)

  2562 	pending->dquote=

  2563 	  g_strdup_printf("    Line %ld - Mismatched quotes",linecnt);

  2564     if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&

  2565       counters->open_single_quote!=counters->close_single_quote)

  2566 	pending->squote=

  2567 	  g_strdup_printf("    Line %ld - Mismatched singlequotes?",linecnt);

  2568     if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&

  2569       counters->open_single_quote!=counters->close_single_quote &&

  2570       counters->open_single_quote!=counters->close_single_quote+1)

  2571 	/*

  2572 	 * Flag it to be noted regardless of the

  2573 	 * first char of the next para.

  2574 	 */

  2575 	pending->squot=1;

  2576     if (counters->r_brack)

  2577 	pending->rbrack=

  2578 	  g_strdup_printf("    Line %ld - Mismatched round brackets?",linecnt);

  2579     if (counters->s_brack)

  2580 	pending->sbrack=

  2581 	  g_strdup_printf("    Line %ld - Mismatched square brackets?",linecnt);

  2582     if (counters->c_brack)

  2583 	pending->cbrack=

  2584 	  g_strdup_printf("    Line %ld - Mismatched curly brackets?",linecnt);

  2585     if (counters->c_unders%2)

  2586 	pending->unders=

  2587 	  g_strdup_printf("    Line %ld - Mismatched underscores?",linecnt);

  2588 }

  2590 /*

  2591  * check_for_omitted_punctuation:

  2592  *

  2593  * Check for omitted punctuation at end of paragraph by working back

  2594  * through prevline. DW.

  2595  * Need to check this only for "normal" paras.

  2596  * So what is a "normal" para?

  2597  *    Not normal if one-liner (chapter headings, etc.)

  2598  *    Not normal if doesn't contain at least one locase letter

  2599  *    Not normal if starts with space

  2600  */

  2601 void check_for_omitted_punctuation(const char *prevline,

  2602   struct line_properties *last,int start_para_line)

  2603 {

  2604     gboolean letter_on_line=FALSE;

  2605     const char *s;

  2606     for (s=prevline;*s;s=g_utf8_next_char(s))

  2607 	if (g_unichar_isalpha(g_utf8_get_char(s)))

  2608 	{

  2609 	    letter_on_line=TRUE;

  2610 	    break;

  2611 	}

  2612     /*

  2613      * This next "if" is a problem.

  2614      * If we say "start_para_line <= linecnt - 1", that includes

  2615      * one-line "paragraphs" like chapter heads. Lotsa false positives.

  2616      * If we say "start_para_line < linecnt - 1" it doesn't, but then it

  2617      * misses genuine one-line paragraphs.

  2618      */

  2619     if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&

  2620       g_utf8_get_char(prevline)>CHAR_SPACE)

  2621     {

  2622 	for (s=g_utf8_prev_char(prevline+strlen(prevline));

  2623 	  (g_utf8_get_char(s)==CHAR_DQUOTE ||

  2624 	  g_utf8_get_char(s)==CHAR_SQUOTE) &&

  2625 	  g_utf8_get_char(s)>CHAR_SPACE && s>prevline;

  2626 	  s=g_utf8_prev_char(s))

  2627 	    ;

  2628 	for (;s>prevline;s=g_utf8_prev_char(s))

  2629 	{

  2630 	    if (g_unichar_isalpha(g_utf8_get_char(s)))

  2631 	    {

  2632 		if (pswit[ECHO_SWITCH])

  2633 		    g_print("\n%s\n",prevline);

  2634 		if (!pswit[OVERVIEW_SWITCH])

  2635 		    g_print("    Line %ld column %ld - "

  2636 		      "No punctuation at para end?\n",

  2637 		      linecnt-1,g_utf8_strlen(prevline,-1));

  2638 		else

  2639 		    cnt_punct++;

  2640 		break;

  2641 	    }

  2642 	    if (g_utf8_strchr("-.:!([{?}])",-1,g_utf8_get_char(s)))

  2643 		break;

  2644 	}

  2645     }

  2646 }

  2648 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)

  2649 {

  2650     const char *word=key;

  2651     int *dupcnt=value;

  2652     if (*dupcnt)

  2653 	g_print("\nNote: Queried word %s was duplicated %d times\n",

  2654 	  word,*dupcnt);

  2655     return FALSE;

  2656 }

  2658 void print_as_windows_1252(const char *string)

  2659 {

  2660     gsize inbytes,outbytes;

  2661     gchar *buf,*bp;

  2662     GIConv converter=(GIConv)-1;

  2663     if (!string)

  2664     {

  2665 	if (converter!=(GIConv)-1)

  2666 	    g_iconv_close(converter);

  2667 	converter=(GIConv)-1;

  2668 	return;

  2669     }

  2670     if (converter=(GIConv)-1)

  2671 	converter=g_iconv_open("WINDOWS-1252","UTF-8");

  2672     if (converter!=(GIConv)-1)

  2673     {

  2674 	inbytes=outbytes=strlen(string);

  2675 	bp=buf=g_malloc(outbytes+1);

  2676 	g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);

  2677 	*bp='\0';

  2678 	fputs(buf,stdout);

  2679 	g_free(buf);

  2680     }

  2681     else

  2682 	fputs(string,stdout);

  2683 }

  2685 void print_as_utf_8(const char *string)

  2686 {

  2687     fputs(string,stdout);

  2688 }

  2690 /*

  2691  * procfile:

  2692  *

  2693  * Process one file.

  2694  */

  2695 void procfile(const char *filename)

  2696 {

  2697     const char *s;

  2698     gchar *parastart=NULL;	/* first line of current para */

  2699     gchar *etext,*aline;

  2700     gchar *etext_ptr;

  2701     GError *err=NULL;

  2702     struct first_pass_results *first_pass_results;

  2703     struct warnings *warnings;

  2704     struct counters counters={0};

  2705     struct line_properties last={0};

  2706     struct parities parities={0};

  2707     struct pending pending={0};

  2708     gboolean isemptyline;

  2709     long start_para_line=0;

  2710     gboolean isnewpara=FALSE,enddash=FALSE;

  2711     last.start=CHAR_SPACE;

  2712     linecnt=checked_linecnt=0;

  2713     etext=read_etext(filename,&err);

  2714     if (!etext)

  2715     {

  2716 	if (pswit[STDOUT_SWITCH])

  2717 	    fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);

  2718 	else

  2719 	    fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);

  2720 	exit(1);

  2721     }

  2722     g_print("\n\nFile: %s\n\n",filename);

  2723     first_pass_results=first_pass(etext);

  2724     warnings=report_first_pass(first_pass_results);

  2725     qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);

  2726     qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);

  2727     /*

  2728      * Here we go with the main pass. Hold onto yer hat!

  2729      */

  2730     linecnt=0;

  2731     etext_ptr=etext;

  2732     while ((aline=flgets(&etext_ptr,linecnt+1)))

  2733     {

  2734 	linecnt++;

  2735 	if (linecnt==1)

  2736 	    isnewpara=TRUE;

  2737 	if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))

  2738 	    continue;    // skip DP page separators completely

  2739 	if (linecnt<first_pass_results->firstline ||

  2740 	  (first_pass_results->footerline>0 &&

  2741 	  linecnt>first_pass_results->footerline))

  2742 	{

  2743 	    if (pswit[HEADER_SWITCH])

  2744 	    {

  2745 		if (g_str_has_prefix(aline,"Title:"))

  2746 		    g_print("    %s\n",aline);

  2747 		if (g_str_has_prefix(aline,"Author:"))

  2748 		    g_print("    %s\n",aline);

  2749 		if (g_str_has_prefix(aline,"Release Date:"))

  2750 		    g_print("    %s\n",aline);

  2751 		if (g_str_has_prefix(aline,"Edition:"))

  2752 		    g_print("    %s\n\n",aline);

  2753 	    }

  2754 	    continue;		/* skip through the header */

  2755 	}

  2756 	checked_linecnt++;

  2757 	print_pending(aline,parastart,&pending);

  2758 	memset(&pending,0,sizeof(pending));

  2759 	isemptyline=analyse_quotes(aline,&counters);

  2760 	if (isnewpara && !isemptyline)

  2761 	{

  2762 	    /* This line is the start of a new paragraph. */

  2763 	    start_para_line=linecnt;

  2764 	    /* Capture its first line in case we want to report it later. */

  2765 	    g_free(parastart);

  2766 	    parastart=g_strdup(aline);

  2767 	    memset(&parities,0,sizeof(parities));  /* restart the quote count */

  2768 	    s=aline;

  2769 	    while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&

  2770 	      !g_unichar_isdigit(g_utf8_get_char(s)))

  2771 		s=g_utf8_next_char(s);

  2772 	    if (g_unichar_islower(g_utf8_get_char(s)))

  2773 	    {

  2774 		/* and its first letter is lowercase */

  2775 		if (pswit[ECHO_SWITCH])

  2776 		    g_print("\n%s\n",aline);

  2777 		if (!pswit[OVERVIEW_SWITCH])

  2778 		    g_print("    Line %ld column %ld - "

  2779 		      "Paragraph starts with lower-case\n",

  2780 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2781 		else

  2782 		    cnt_punct++;

  2783 	    }

  2784 	    isnewpara=FALSE; /* Signal the end of new para processing. */

  2785 	}

  2786 	/* Check for an em-dash broken at line end. */

  2787 	if (enddash && g_utf8_get_char(aline)=='-')

  2788 	{

  2789 	    if (pswit[ECHO_SWITCH])

  2790 		g_print("\n%s\n",aline);

  2791 	    if (!pswit[OVERVIEW_SWITCH])

  2792 		g_print("    Line %ld column 1 - Broken em-dash?\n",linecnt);

  2793 	    else

  2794 		cnt_punct++;

  2795 	}

  2796 	enddash=FALSE;

  2797 	for (s=g_utf8_prev_char(aline+strlen(aline));

  2798 	  g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))

  2799 	    ;

  2800 	if (s>=aline && g_utf8_get_char(s)=='-')

  2801 	    enddash=TRUE;

  2802 	check_for_control_characters(aline);

  2803 	if (warnings->bin)

  2804 	    check_for_odd_characters(aline,warnings,isemptyline);

  2805 	if (warnings->longline)

  2806 	    check_for_long_line(aline);

  2807 	if (warnings->shortline)

  2808 	    check_for_short_line(aline,&last);

  2809 	last.blen=last.len;

  2810 	last.len=g_utf8_strlen(aline,-1);

  2811 	last.start=g_utf8_get_char(aline);

  2812 	check_for_starting_punctuation(aline);

  2813 	if (warnings->dash)

  2814 	{

  2815 	    check_for_spaced_emdash(aline);

  2816 	    check_for_spaced_dash(aline);

  2817 	}

  2818 	check_for_unmarked_paragraphs(aline);

  2819 	check_for_jeebies(aline);

  2820 	check_for_mta_from(aline);

  2821 	check_for_orphan_character(aline);

  2822 	check_for_pling_scanno(aline);

  2823 	check_for_extra_period(aline,warnings);

  2824 	check_for_following_punctuation(aline);

  2825 	check_for_typos(aline,warnings);

  2826 	check_for_misspaced_punctuation(aline,&parities,isemptyline);

  2827 	check_for_double_punctuation(aline,warnings);

  2828 	check_for_spaced_quotes(aline);

  2829 	check_for_miscased_genative(aline);

  2830 	check_end_of_line(aline,warnings);

  2831 	check_for_unspaced_bracket(aline);

  2832 	if (warnings->endquote)

  2833 	    check_for_unpunctuated_endquote(aline);

  2834 	check_for_html_tag(aline);

  2835 	check_for_html_entity(aline);

  2836 	if (isemptyline)

  2837 	{

  2838 	    check_for_mismatched_quotes(&counters,&pending);

  2839 	    memset(&counters,0,sizeof(counters));

  2840 	    /* let the next iteration know that it's starting a new para */

  2841 	    isnewpara=TRUE;

  2842 	    if (prevline)

  2843 		check_for_omitted_punctuation(prevline,&last,start_para_line);

  2844 	}

  2845 	g_free(prevline);

  2846 	prevline=g_strdup(aline);

  2847     }

  2848     if (prevline)

  2849     {

  2850 	g_free(prevline);

  2851 	prevline=NULL;

  2852     }

  2853     g_free(parastart);

  2854     g_free(prevline);

  2855     g_free(etext);

  2856     if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])

  2857 	g_tree_foreach(qword,report_duplicate_queries,NULL);

  2858     g_tree_unref(qword);

  2859     g_tree_unref(qperiod);

  2860     g_set_print_handler(NULL);

  2861     print_as_windows_1252(NULL);

  2862     if (pswit[MARKUP_SWITCH])

  2863 	loseentities(NULL);

  2864 }

  2866 /*

  2867  * flgets:

  2868  *

  2869  * Get one line from the input text, checking for

  2870  * the existence of exactly one CR/LF line-end per line.

  2871  *

  2872  * Returns: a pointer to the line.

  2873  */

  2874 char *flgets(char **etext,long lcnt)

  2875 {

  2876     gunichar c;

  2877     gboolean isCR=FALSE;

  2878     char *theline=*etext;

  2879     char *eos=theline;

  2880     gchar *s;

  2881     for (;;)

  2882     {

  2883 	c=g_utf8_get_char(*etext);

  2884 	*etext=g_utf8_next_char(*etext);

  2885 	if (!c)

  2886 	    return NULL;

  2887 	/* either way, it's end of line */

  2888 	if (c=='\n')

  2889 	{

  2890 	    if (isCR)

  2891 		break;

  2892 	    else

  2893 	    {

  2894 		/* Error - a LF without a preceding CR */

  2895 		if (pswit[LINE_END_SWITCH])

  2896 		{

  2897 		    if (pswit[ECHO_SWITCH])

  2898 		    {

  2899 			s=g_strndup(theline,eos-theline);

  2900 			g_print("\n%s\n",s);

  2901 			g_free(s);

  2902 		    }

  2903 		    if (!pswit[OVERVIEW_SWITCH])

  2904 			g_print("    Line %ld - No CR?\n",lcnt);

  2905 		    else

  2906 			cnt_lineend++;

  2907 		}

  2908 		break;

  2909 	    }

  2910 	}

  2911 	if (c=='\r')

  2912 	{

  2913 	    if (isCR)

  2914 	    {

  2915 		/* Error - two successive CRs */

  2916 		if (pswit[LINE_END_SWITCH])

  2917 		{

  2918 		    if (pswit[ECHO_SWITCH])

  2919 		    {

  2920 			s=g_strndup(theline,eos-theline);

  2921 			g_print("\n%s\n",s);

  2922 			g_free(s);

  2923 		    }

  2924 		    if (!pswit[OVERVIEW_SWITCH])

  2925 			g_print("    Line %ld - Two successive CRs?\n",lcnt);

  2926 		    else

  2927 			cnt_lineend++;

  2928 		}

  2929 	    }

  2930 	    isCR=TRUE;

  2931 	}

  2932 	else

  2933 	{

  2934 	    if (pswit[LINE_END_SWITCH] && isCR)

  2935 	    {

  2936 		if (pswit[ECHO_SWITCH])

  2937 		{

  2938 		    s=g_strndup(theline,eos-theline);

  2939 		    g_print("\n%s\n",s);

  2940 		    g_free(s);

  2941 		}

  2942 		if (!pswit[OVERVIEW_SWITCH])

  2943 		    g_print("    Line %ld column %ld - CR without LF?\n",

  2944 		      lcnt,g_utf8_pointer_to_offset(theline,eos)+1);

  2945 		else

  2946 		    cnt_lineend++;

  2947 		*eos=' ';

  2948 	    }

  2949 	    isCR=FALSE;

  2950 	    eos=g_utf8_next_char(eos);

  2951 	}

  2952     }

  2953     *eos='\0';

  2954     if (pswit[MARKUP_SWITCH])

  2955 	postprocess_for_HTML(theline);

  2956     if (pswit[DP_SWITCH])

  2957 	postprocess_for_DP(theline);

  2958     return theline;

  2959 }

  2961 /*

  2962  * mixdigit:

  2963  *

  2964  * Takes a "word" as a parameter, and checks whether it

  2965  * contains a mixture of alpha and digits. Generally, this is an

  2966  * error, but may not be for cases like 4th or L5 12s. 3d.

  2967  *

  2968  * Returns: TRUE iff an is error found.

  2969  */

  2970 gboolean mixdigit(const char *checkword)

  2971 {

  2972     gboolean wehaveadigit,wehavealetter,query;

  2973     const char *s,*nondigit;

  2974     wehaveadigit=wehavealetter=query=FALSE;

  2975     for (s=checkword;*s;s=g_utf8_next_char(s))

  2976 	if (g_unichar_isalpha(g_utf8_get_char(s)))

  2977 	    wehavealetter=TRUE;

  2978 	else if (g_unichar_isdigit(g_utf8_get_char(s)))

  2979 	    wehaveadigit=TRUE;

  2980     if (wehaveadigit && wehavealetter)

  2981     {

  2982 	/* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */

  2983 	query=TRUE;

  2984 	for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));

  2985 	  nondigit=g_utf8_next_char(nondigit))

  2986 	    ;

  2987 	/* digits, ending in st, rd, nd, th of either case */

  2988 	if (!g_ascii_strcasecmp(nondigit,"st") ||

  2989 	  !g_ascii_strcasecmp(nondigit,"rd") ||

  2990 	  !g_ascii_strcasecmp(nondigit,"nd") ||

  2991 	  !g_ascii_strcasecmp(nondigit,"th"))

  2992 	    query=FALSE;

  2993 	if (!g_ascii_strcasecmp(nondigit,"sts") ||

  2994 	  !g_ascii_strcasecmp(nondigit,"rds") ||

  2995 	  !g_ascii_strcasecmp(nondigit,"nds") ||

  2996 	  !g_ascii_strcasecmp(nondigit,"ths"))

  2997 	    query=FALSE;

  2998 	if (!g_ascii_strcasecmp(nondigit,"stly") ||

  2999 	  !g_ascii_strcasecmp(nondigit,"rdly") ||

  3000 	  !g_ascii_strcasecmp(nondigit,"ndly") ||

  3001 	  !g_ascii_strcasecmp(nondigit,"thly"))

  3002 	    query=FALSE;

  3003 	/* digits, ending in l, L, s or d */

  3004 	if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||

  3005 	  !strcmp(nondigit,"d"))

  3006 	    query=FALSE;

  3007 	/*

  3008 	 * L at the start of a number, representing Britsh pounds, like L500.

  3009 	 * This is cute. We know the current word is mixed digit. If the first

  3010 	 * letter is L, there must be at least one digit following. If both

  3011 	 * digits and letters follow, we have a genuine error, else we have a

  3012 	 * capital L followed by digits, and we accept that as a non-error.

  3013 	 */

  3014 	if (g_utf8_get_char(checkword)=='L' &&

  3015 	  !mixdigit(g_utf8_next_char(checkword)))

  3016 	    query=FALSE;

  3017     }

  3018     return query;

  3019 }

  3021 /*

  3022  * getaword:

  3023  *

  3024  * Extracts the first/next "word" from the line, and returns it.

  3025  * A word is defined as one English word unit--or at least that's the aim.

  3026  * "ptr" is advanced to the position in the line where we will start

  3027  * looking for the next word.

  3028  *

  3029  * Returns: A newly-allocated string.

  3030  */

  3031 gchar *getaword(const char **ptr)

  3032 {

  3033     const char *s,*t;

  3034     GString *word;

  3035     gunichar c,pc;

  3036     word=g_string_new(NULL);

  3037     for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&

  3038       !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&

  3039       **ptr;*ptr=g_utf8_next_char(*ptr))

  3040 	;

  3041     /*

  3042      * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.

  3043      * Especially yucky is the case of L1,000

  3044      * This section looks for a pattern of characters including a digit

  3045      * followed by a comma or period followed by one or more digits.

  3046      * If found, it returns this whole pattern as a word; otherwise we discard

  3047      * the results and resume our normal programming.

  3048      */

  3049     s=*ptr;

  3050     for (;g_unichar_isdigit(g_utf8_get_char(s)) ||

  3051       g_unichar_isalpha(g_utf8_get_char(s)) ||

  3052       g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))

  3053 	g_string_append_unichar(word,g_utf8_get_char(s));

  3054     if (word->len)

  3055     {

  3056 	for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t))

  3057 	{

  3058 	    c=g_utf8_get_char(t);

  3059 	    pc=g_utf8_get_char(g_utf8_prev_char(t));

  3060 	    if ((c=='.' || c==',') && g_unichar_isdigit(pc))

  3061 	    {

  3062 		*ptr=s;

  3063 		return g_string_free(word,FALSE);

  3064 	    }

  3065 	}

  3066     }

  3067     /* we didn't find a punctuated number - do the regular getword thing */

  3068     g_string_truncate(word,0);

  3069     for (;g_unichar_isdigit(g_utf8_get_char(*ptr)) ||

  3070       g_unichar_isalpha(g_utf8_get_char(*ptr)) ||

  3071       g_utf8_get_char(*ptr)=='\'';*ptr=g_utf8_next_char(*ptr))

  3072 	g_string_append_unichar(word,g_utf8_get_char(*ptr));

  3073     return g_string_free(word,FALSE);

  3074 }

  3076 /*

  3077  * isroman:

  3078  *

  3079  * Is this word a Roman Numeral?

  3080  *

  3081  * It doesn't actually validate that the number is a valid Roman Numeral--for

  3082  * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not

  3083  * what we're here to do. If it passes this, it LOOKS like a Roman numeral.

  3084  * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or

  3085  * expressions thereof, except when it came to taxes. Allow any number of M,

  3086  * an optional D, an optional CM or CD, any number of optional Cs, an optional

  3087  * XL or an optional XC, an optional IX or IV, an optional V and any number

  3088  * of optional Is.

  3089  */

  3090 gboolean isroman(const char *t)

  3091 {

  3092     const char *s;

  3093     if (!t || !*t)

  3094 	return FALSE;

  3095     s=t;

  3096     while (g_utf8_get_char(t)=='m' && *t)

  3097 	t++;

  3098     if (g_utf8_get_char(t)=='d')

  3099 	t++;

  3100     if (g_str_has_prefix(t,"cm"))

  3101 	t+=2;

  3102     if (g_str_has_prefix(t,"cd"))

  3103 	t+=2;

  3104     while (g_utf8_get_char(t)=='c' && *t)

  3105 	t++;

  3106     if (g_str_has_prefix(t,"xl"))

  3107 	t+=2;

  3108     if (g_str_has_prefix(t,"xc"))

  3109 	t+=2;

  3110     if (g_utf8_get_char(t)=='l')

  3111 	t++;

  3112     while (g_utf8_get_char(t)=='x' && *t)

  3113 	t++;

  3114     if (g_str_has_prefix(t,"ix"))

  3115 	t+=2;

  3116     if (g_str_has_prefix(t,"iv"))

  3117 	t+=2;

  3118     if (g_utf8_get_char(t)=='v')

  3119 	t++;

  3120     while (g_utf8_get_char(t)=='i' && *t)

  3121 	t++;

  3122     return !*t;

  3123 }

  3125 /*

  3126  * postprocess_for_DP:

  3127  *

  3128  * Invoked with the -d switch from flgets().

  3129  * It simply "removes" from the line a hard-coded set of common

  3130  * DP-specific tags, so that the line passed to the main routine has

  3131  * been pre-cleaned of DP markup.

  3132  */

  3133 void postprocess_for_DP(char *theline)

  3134 {

  3135     char *s,*t;

  3136     int i;

  3137     if (!*theline)

  3138 	return;

  3139     for (i=0;*DPmarkup[i];i++)

  3140 	while ((s=strstr(theline,DPmarkup[i])))

  3141 	{

  3142 	    t=s+strlen(DPmarkup[i]);

  3143 	    memmove(s,t,strlen(t)+1);

  3144 	}

  3145 }

  3147 /*

  3148  * postprocess_for_HTML:

  3149  *

  3150  * Invoked with the -m switch from flgets().

  3151  * It simply "removes" from the line a hard-coded set of common

  3152  * HTML tags and "replaces" a hard-coded set of common HTML

  3153  * entities, so that the line passed to the main routine has

  3154  * been pre-cleaned of HTML.

  3155  */

  3156 void postprocess_for_HTML(char *theline)

  3157 {

  3158     while (losemarkup(theline))

  3159 	;

  3160     loseentities(theline);

  3161 }

  3163 char *losemarkup(char *theline)

  3164 {

  3165     char *s,*t;

  3166     int i;

  3167     s=strchr(theline,'<');

  3168     t=s?strchr(s,'>'):NULL;

  3169     if (!s || !t)

  3170 	return NULL;

  3171     for (i=0;*markup[i];i++)

  3172 	if (tagcomp(g_utf8_next_char(s),markup[i]))

  3173 	{

  3174 	    t=g_utf8_next_char(t);

  3175 	    memmove(s,t,strlen(t)+1);

  3176 	    return s;

  3177 	}

  3178     /* It's an unrecognized <xxx>. */

  3179     return NULL;

  3180 }

  3182 void loseentities(char *theline)

  3183 {

  3184     int i;

  3185     gsize nb;

  3186     char *amp,*scolon;

  3187     gchar *s,*t;

  3188     gunichar c;

  3189     GTree *entities=NULL;

  3190     GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;

  3191     if (!theline)

  3192     {

  3193 	if (entities)

  3194 	    g_tree_destroy(entities);

  3195 	entities=NULL;

  3196 	if (translit==(GIConv)-1)

  3197 	    g_iconv_close(translit);

  3198 	translit=(GIConv)-1;

  3199 	if (to_utf8==(GIConv)-1)

  3200 	    g_iconv_close(to_utf8);

  3201 	to_utf8=(GIConv)-1;

  3202 	return;

  3203     }

  3204     if (!*theline)

  3205 	return;

  3206     if (!entities)

  3207     {

  3208 	entities=g_tree_new((GCompareFunc)strcmp);

  3209 	for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)

  3210 	    g_tree_insert(entities,HTMLentities[i].name,

  3211 	      GUINT_TO_POINTER(HTMLentities[i].c));

  3212     }

  3213     if (translit==(GIConv)-1)

  3214 	translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");

  3215     if (to_utf8==(GIConv)-1)

  3216 	to_utf8=g_iconv_open("UTF-8","ISO_8859-1");

  3217     while((amp=strchr(theline,'&')))

  3218     {

  3219 	scolon=strchr(amp,';');

  3220 	if (scolon)

  3221 	{

  3222 	    if (amp[1]=='#')

  3223 	    {

  3224 		if (amp+2+strspn(amp+2,"0123456789")==scolon)

  3225 		    c=strtol(amp+2,NULL,10);

  3226 		else if (amp[2]=='x' &&

  3227 		  amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)

  3228 		    c=strtol(amp+3,NULL,16);

  3229 	    }

  3230 	    else

  3231 	    {

  3232 		s=g_strndup(amp+1,scolon-(amp+1));

  3233 	        c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));

  3234 		g_free(s);

  3235 	    }

  3236 	}

  3237 	else

  3238 	    c=0;

  3239 	if (c)

  3240 	{

  3241 	    theline=amp;

  3242 	    if (c<128 || c>=192 && c<=255)	/* An ISO-8859-1 character */

  3243 		theline+=g_unichar_to_utf8(c,theline);

  3244 	    else

  3245 	    {

  3246 		s=g_malloc(6);

  3247 		nb=g_unichar_to_utf8(c,s);

  3248 		t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);

  3249 		g_free(s);

  3250 		s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);

  3251 		g_free(t);

  3252 		memcpy(theline,s,nb);

  3253 		g_free(s);

  3254 		theline+=nb;

  3255 	    }

  3256 	    memmove(theline,g_utf8_next_char(scolon),

  3257 	      strlen(g_utf8_next_char(scolon))+1);

  3258 	}

  3259 	else

  3260 	    theline=g_utf8_next_char(amp);

  3261     }

  3262 }

  3264 gboolean tagcomp(const char *strin,const char *basetag)

  3265 {

  3266     gboolean retval;

  3267     gchar *s,*t;

  3268     if (g_utf8_get_char(strin)=='/')

  3269 	t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */

  3270     else

  3271 	t=g_utf8_casefold(strin,-1);

  3272     s=g_utf8_casefold(basetag,-1);

  3273     retval=g_str_has_prefix(t,s);

  3274     g_free(s);

  3275     g_free(t);

  3276     return retval;

  3277 }

  3279 void proghelp(GOptionContext *context)

  3280 {

  3281     gchar *help;

  3282     fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);

  3283     fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);

  3284     fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);

  3285     fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "

  3286       "For details, read the file COPYING.\n",stderr);

  3287     fputs("This is Free Software; "

  3288       "you may redistribute it under certain conditions (GPL);\n",stderr);

  3289     fputs("read the file COPYING for details.\n\n",stderr);

  3290     help=g_option_context_get_help(context,TRUE,NULL);

  3291     fputs(help,stderr);

  3292     g_free(help);

  3293     fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);

  3294     fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "

  3295       "non-ASCII\n",stderr);

  3296     fputs("characters like accented letters, "

  3297       "lines longer than 75 or shorter than 55,\n",stderr);

  3298     fputs("unbalanced quotes or brackets, "

  3299       "a variety of badly formatted punctuation, \n",stderr);

  3300     fputs("HTML tags, some likely typos. "

  3301       "It is NOT a substitute for human judgement.\n",stderr);

  3302     fputs("\n",stderr);

  3303 }

author	ali <ali@juiblex.co.uk>
	Fri Sep 06 22:35:23 2013 +0100 (2013-09-06)
changeset 82	0df25c7f4ed7
parent 79	0c7258bf8e4f
child 86	c42c068d2996
permissions	-rw-r--r--