bookloupe-testing: bookloupe/bookloupe.c@0c7258bf8e4f

     1 /*************************************************************************/

     2 /* bookloupe--check for assorted weirdnesses in a PG candidate text file */

     3 /*									 */

     4 /* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>			 */

     5 /* Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>			 */

     6 /*									 */

     7 /* This program is free software; you can redistribute it and/or modify  */

     8 /* it under the terms of the GNU General Public License as published by  */

     9 /* the Free Software Foundation; either version 2 of the License, or     */

    10 /* (at your option) any later version.					 */

    11 /*									 */

    12 /* This program is distributed in the hope that it will be useful,       */

    13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of	 */

    14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the		 */

    15 /* GNU General Public License for more details.				 */

    16 /*									 */

    17 /* You should have received a copy of the GNU General Public License	 */

    18 /* along with this program. If not, see <http://www.gnu.org/licenses/>.	 */

    19 /*************************************************************************/

    21 #include <stdio.h>

    22 #include <stdlib.h>

    23 #include <string.h>

    24 #include <ctype.h>

    25 #ifdef __WIN32__

    26 #include <windows.h>

    27 #endif

    28 #include <glib.h>

    29 #include <bl/bl.h>

    30 #include "HTMLentities.h"

    32 gchar *prevline;

    34 /* Common typos. */

    35 char *typo[] = {

    36     "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane",

    37     "nad", "te", "ig", "acn",  "ahve", "alot", "anbd", "andt", "awya", "aywa",

    38     "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt",

    39     "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse",

    40     "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd",

    41     "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign",

    42     "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis",

    43     "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut",

    44     "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter",

    45     "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices",

    46     "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",

    47     "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe",

    48     "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath",

    49     "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier",

    50     "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne",

    51     "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey",

    52     "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta",

    53     "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",

    54     "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking",

    55     "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve",

    56     "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf",

    57     "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped",

    58     "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge",

    59     "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan",

    60     "hegin", "heing", "helieve", "henefit", "hetter", "hetween", "heyond",

    61     "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile",

    62     "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic",

    63     "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud",

    64     "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee",

    65     "se", ""

    66 };

    68 GTree *usertypo;

    70 /* Common abbreviations and other OK words not to query as typos. */

    71 char *okword[] = {

    72     "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm",

    73     "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",

    74     "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats",

    75     "outbid", "outbids", "frostbite", "frostbitten", ""

    76 };

    78 /* Common abbreviations that cause otherwise unexplained periods. */

    79 char *abbrev[] = {

    80     "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op",

    81     "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", ""

    82 };

    84 /*

    85  * Two-Letter combinations that rarely if ever start words,

    86  * but are common scannos or otherwise common letter combinations.

    87  */

    88 char *nostart[] = {

    89     "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", ""

    90 };

    92 /*

    93  * Two-Letter combinations that rarely if ever end words,

    94  * but are common scannos or otherwise common letter combinations.

    95  */

    96 char *noend[] = {

    97     "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl",

    98     "sw", "gr", "sl", "cl", "iy", ""

    99 };

   101 char *markup[] = {

   102     "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em",

   103     "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i",

   104     "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub",

   105     "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", ""

   106 };

   108 char *DPmarkup[] = {

   109     "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>", ""

   110 };

   112 char *nocomma[] = {

   113     "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose",

   114     "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm",

   115     "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm",

   116     "during", "let", "toward", "among", ""

   117 };

   119 char *noperiod[] = {

   120     "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",

   121     "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",

   122     "i'll", "whose", "who", "because", "when", "let", "till", "very", "an",

   123     "among", "those", "into", "whom", "having", "thence", ""

   124 };

   126 /* special characters */

   127 #define CHAR_SPACE	  32

   128 #define CHAR_TAB	   9

   129 #define CHAR_LF		  10

   130 #define CHAR_CR		  13

   131 #define CHAR_DQUOTE	  34

   132 #define CHAR_SQUOTE	  39

   133 #define CHAR_OPEN_SQUOTE  96

   134 #define CHAR_TILDE	 126

   135 #define CHAR_ASTERISK	  42

   136 #define CHAR_FORESLASH	  47

   137 #define CHAR_CARAT	  94

   139 #define CHAR_UNDERSCORE    '_'

   140 #define CHAR_OPEN_CBRACK   '{'

   141 #define CHAR_CLOSE_CBRACK  '}'

   142 #define CHAR_OPEN_RBRACK   '('

   143 #define CHAR_CLOSE_RBRACK  ')'

   144 #define CHAR_OPEN_SBRACK   '['

   145 #define CHAR_CLOSE_SBRACK  ']'

   147 /* longest and shortest normal PG line lengths */

   148 #define LONGEST_PG_LINE   75

   149 #define WAY_TOO_LONG      80

   150 #define SHORTEST_PG_LINE  55

   152 enum {

   153     ECHO_SWITCH,

   154     SQUOTE_SWITCH,

   155     TYPO_SWITCH,

   156     QPARA_SWITCH,

   157     PARANOID_SWITCH,

   158     LINE_END_SWITCH,

   159     OVERVIEW_SWITCH,

   160     STDOUT_SWITCH,

   161     HEADER_SWITCH,

   162     WEB_SWITCH,

   163     VERBOSE_SWITCH,

   164     MARKUP_SWITCH,

   165     USERTYPO_SWITCH,

   166     DP_SWITCH,

   167     SWITNO

   168 };

   170 gboolean pswit[SWITNO];  /* program switches */

   172 static GOptionEntry options[]={

   173     { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,

   174       "Ignore DP-specific markup", NULL },

   175     { "noecho", 'e', 0, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,

   176       "Don't echo queried line", NULL },

   177     { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,

   178       "Check single quotes", NULL },

   179     { "typo", 't', 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,

   180       "Check common typos", NULL },

   181     { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,

   182       "Require closure of quotes on every paragraph", NULL },

   183     { "relaxed", 'x', 0, G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,

   184       "Disable paranoid querying of everything", NULL },

   185     { "line-end", 'l', 0, G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,

   186       "Disable line end checking", NULL },

   187     { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,

   188       "Overview: just show counts", NULL },

   189     { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,

   190       "Output errors to stdout instead of stderr", NULL },

   191     { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,

   192       "Echo header fields", NULL },

   193     { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,

   194       "Ignore markup in < >", NULL },

   195     { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,

   196       "Use file of user-defined typos", NULL },

   197     { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,

   198       "Defaults for use on www upload", NULL },

   199     { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,

   200       "Verbose - list everything", NULL },

   201     { NULL }

   202 };

   204 long cnt_dquot;		/* for overview mode, count of doublequote queries */

   205 long cnt_squot;		/* for overview mode, count of singlequote queries */

   206 long cnt_brack;		/* for overview mode, count of brackets queries */

   207 long cnt_bin;		/* for overview mode, count of non-ASCII queries */

   208 long cnt_odd;		/* for overview mode, count of odd character queries */

   209 long cnt_long;		/* for overview mode, count of long line errors */

   210 long cnt_short;		/* for overview mode, count of short line queries */

   211 long cnt_punct;		/* for overview mode,

   212 			   count of punctuation and spacing queries */

   213 long cnt_dash;		/* for overview mode, count of dash-related queries */

   214 long cnt_word;		/* for overview mode, count of word queries */

   215 long cnt_html;		/* for overview mode, count of html queries */

   216 long cnt_lineend;	/* for overview mode, count of line-end queries */

   217 long cnt_spacend;	/* count of lines with space at end */

   218 long linecnt;		/* count of total lines in the file */

   219 long checked_linecnt;	/* count of lines actually checked */

   221 void proghelp(GOptionContext *context);

   222 void procfile(const char *);

   224 gchar *running_from;

   226 gboolean mixdigit(const char *);

   227 gchar *getaword(const char **);

   228 char *flgets(char **,long);

   229 void postprocess_for_HTML(char *);

   230 char *linehasmarkup(char *);

   231 char *losemarkup(char *);

   232 gboolean tagcomp(const char *,const char *);

   233 void loseentities(char *);

   234 gboolean isroman(const char *);

   235 void postprocess_for_DP(char *);

   236 void print_as_windows_1252(const char *string);

   237 void print_as_utf_8(const char *string);

   239 GTree *qword,*qperiod;

   241 #ifdef __WIN32__

   242 UINT saved_cp;

   243 #endif

   245 struct first_pass_results {

   246     long firstline,astline;

   247     long footerline,totlen,binlen,alphalen,endquote_count,shortline,dotcomma;

   248     long fslashline,hyphens,longline,verylongline,htmcount,standalone_digit;

   249     long spacedash,emdash,space_emdash,non_PG_space_emdash,PG_space_emdash;

   250     int Dutchcount,Frenchcount;

   251 };

   253 struct warnings {

   254     int shortline,longline,bin,dash,dotcomma,ast,fslash,digit,hyphen;

   255     int endquote;

   256     gboolean isDutch,isFrench;

   257 };

   259 struct counters {

   260     long quot;

   261     int c_unders,c_brack,s_brack,r_brack;

   262     int open_single_quote,close_single_quote;

   263 };

   265 struct line_properties {

   266     unsigned int len,blen;

   267     gunichar start;

   268 };

   270 struct parities {

   271     int dquote,squote;

   272 };

   274 struct pending {

   275     char *dquote,*squote,*rbrack,*sbrack,*cbrack,*unders;

   276     long squot;

   277 };

   279 void parse_options(int *argc,char ***argv)

   280 {

   281     GError *err=NULL;

   282     GOptionContext *context;

   283     context=g_option_context_new(

   284       "file - looks for errors in Project Gutenberg(TM) etexts");

   285     g_option_context_add_main_entries(context,options,NULL);

   286     if (!g_option_context_parse(context,argc,argv,&err))

   287     {

   288 	g_printerr("Bookloupe: %s\n",err->message);

   289 	g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);

   290 	exit(1);

   291     }

   292     /* Paranoid checking is turned OFF, not on, by its switch */

   293     pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];

   294     if (pswit[PARANOID_SWITCH])

   295 	/* if running in paranoid mode, typo checks default to enabled */

   296 	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];

   297     /* Line-end checking is turned OFF, not on, by its switch */

   298     pswit[LINE_END_SWITCH]=!pswit[LINE_END_SWITCH];

   299     /* Echoing is turned OFF, not on, by its switch */

   300     pswit[ECHO_SWITCH]=!pswit[ECHO_SWITCH];

   301     if (pswit[OVERVIEW_SWITCH])

   302 	/* just print summary; don't echo */

   303 	pswit[ECHO_SWITCH]=FALSE;

   304     /*

   305      * Web uploads - for the moment, this is really just a placeholder

   306      * until we decide what processing we really want to do on web uploads

   307      */

   308     if (pswit[WEB_SWITCH])

   309     {

   310 	/* specific override for web uploads */

   311 	pswit[ECHO_SWITCH]=TRUE;

   312 	pswit[SQUOTE_SWITCH]=FALSE;

   313 	pswit[TYPO_SWITCH]=TRUE;

   314 	pswit[QPARA_SWITCH]=FALSE;

   315 	pswit[PARANOID_SWITCH]=TRUE;

   316 	pswit[LINE_END_SWITCH]=FALSE;

   317 	pswit[OVERVIEW_SWITCH]=FALSE;

   318 	pswit[STDOUT_SWITCH]=FALSE;

   319 	pswit[HEADER_SWITCH]=TRUE;

   320 	pswit[VERBOSE_SWITCH]=FALSE;

   321 	pswit[MARKUP_SWITCH]=FALSE;

   322 	pswit[USERTYPO_SWITCH]=FALSE;

   323 	pswit[DP_SWITCH]=FALSE;

   324     }

   325     if (*argc<2)

   326     {

   327 	proghelp(context);

   328 	exit(1);

   329     }

   330     g_option_context_free(context);

   331 }

   333 /*

   334  * read_user_scannos:

   335  *

   336  * Read in the user-defined stealth scanno list.

   337  */

   338 void read_user_scannos(void)

   339 {

   340     GError *err=NULL;

   341     gchar *usertypo_file;

   342     gboolean okay;

   343     int i;

   344     gsize len,nb;

   345     gchar *contents,*utf8,**lines;

   346     usertypo_file=g_strdup("bookloupe.typ");

   347     okay=file_get_contents_text(usertypo_file,&contents,&len,&err);

   348     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))

   349     {

   350 	g_clear_error(&err);

   351 	g_free(usertypo_file);

   352 	usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);

   353 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);

   354     }

   355     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))

   356     {

   357 	g_clear_error(&err);

   358 	g_free(usertypo_file);

   359 	usertypo_file=g_strdup("gutcheck.typ");

   360 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);

   361     }

   362     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))

   363     {

   364 	g_clear_error(&err);

   365 	g_free(usertypo_file);

   366 	usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);

   367 	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);

   368     }

   369     if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))

   370     {

   371 	g_free(usertypo_file);

   372 	g_print("   --> I couldn't find bookloupe.typ "

   373 	  "-- proceeding without user typos.\n");

   374 	return;

   375     }

   376     else if (!okay)

   377     {

   378 	fprintf(stderr,"%s: %s\n",usertypo_file,err->message);

   379 	g_free(usertypo_file);

   380 	g_clear_error(&err);

   381 	exit(1);

   382     }

   383     if (g_utf8_validate(contents,len,NULL))

   384 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);

   385     else

   386 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);

   387     g_free(contents);

   388     lines=g_strsplit_set(utf8,"\r\n",0);

   389     g_free(utf8);

   390     usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);

   391     for (i=0;lines[i];i++)

   392 	if (*(unsigned char *)lines[i]>'!')

   393 	    g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));

   394 	else

   395 	    g_free(lines[i]);

   396     g_free(lines);

   397 }

   399 /*

   400  * read_etext:

   401  *

   402  * Read an etext returning a newly allocated string containing the file

   403  * contents or NULL on error.

   404  */

   405 gchar *read_etext(const char *filename,GError **err)

   406 {

   407     GError *tmp_err=NULL;

   408     gchar *contents,*utf8;

   409     gsize len,bytes_read,bytes_written;

   410     int i,line,col;

   411     if (!g_file_get_contents(filename,&contents,&len,err))

   412 	return NULL;

   413     if (g_utf8_validate(contents,len,NULL))

   414     {

   415 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);

   416 	g_set_print_handler(print_as_utf_8);

   417 #ifdef __WIN32__

   418 	SetConsoleOutputCP(CP_UTF8);

   419 #endif

   420     }

   421     else

   422     {

   423 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read,

   424 	  &bytes_written,&tmp_err);

   425 	if (g_error_matches(tmp_err,G_CONVERT_ERROR,

   426 	  G_CONVERT_ERROR_ILLEGAL_SEQUENCE))

   427 	{

   428 	    line=col=1;

   429 	    for(i=0;i<bytes_read;i++)

   430 		if (contents[i]=='\n')

   431 		{

   432 		    line++;

   433 		    col=1;

   434 		}

   435 		else if (contents[i]!='\r')

   436 		    col++;

   437 	    g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_ILLEGAL_SEQUENCE,

   438 	      "Input conversion failed. Byte %d at line %d, column %d is not a "

   439 	      "valid Windows-1252 character",

   440 	      ((unsigned char *)contents)[bytes_read],line,col);

   441 	}

   442 	else if (tmp_err)

   443 	    g_propagate_error(err,tmp_err);

   444 	g_set_print_handler(print_as_windows_1252);

   445 #ifdef __WIN32__

   446 	SetConsoleOutputCP(1252);

   447 #endif

   448     }

   449     g_free(contents);

   450     return utf8;

   451 }

   453 void cleanup_on_exit(void)

   454 {

   455 #ifdef __WIN32__

   456     SetConsoleOutputCP(saved_cp);

   457 #endif

   458 }

   460 int main(int argc,char **argv)

   461 {

   462 #ifdef __WIN32__

   463     atexit(cleanup_on_exit);

   464     saved_cp=GetConsoleOutputCP();

   465 #endif

   466     running_from=g_path_get_dirname(argv[0]);

   467     parse_options(&argc,&argv);

   468     if (pswit[USERTYPO_SWITCH])

   469 	read_user_scannos();

   470     fprintf(stderr,"bookloupe: Check and report on an e-text\n");

   471     procfile(argv[1]);

   472     if (pswit[OVERVIEW_SWITCH])

   473     {

   474 	g_print("    Checked %ld lines of %ld (head+foot = %ld)\n\n",

   475 	  checked_linecnt,linecnt,linecnt-checked_linecnt);

   476 	g_print("    --------------- Queries found --------------\n");

   477 	if (cnt_long)

   478 	    g_print("    Long lines:		    %14ld\n",cnt_long);

   479 	if (cnt_short)

   480 	    g_print("    Short lines:		   %14ld\n",cnt_short);

   481 	if (cnt_lineend)

   482 	    g_print("    Line-end problems:	     %14ld\n",cnt_lineend);

   483 	if (cnt_word)

   484 	    g_print("    Common typos:		  %14ld\n",cnt_word);

   485 	if (cnt_dquot)

   486 	    g_print("    Unmatched quotes:	      %14ld\n",cnt_dquot);

   487 	if (cnt_squot)

   488 	    g_print("    Unmatched SingleQuotes:	%14ld\n",cnt_squot);

   489 	if (cnt_brack)

   490 	    g_print("    Unmatched brackets:	    %14ld\n",cnt_brack);

   491 	if (cnt_bin)

   492 	    g_print("    Non-ASCII characters:	  %14ld\n",cnt_bin);

   493 	if (cnt_odd)

   494 	    g_print("    Proofing characters:	   %14ld\n",cnt_odd);

   495 	if (cnt_punct)

   496 	    g_print("    Punctuation & spacing queries: %14ld\n",cnt_punct);

   497 	if (cnt_dash)

   498 	    g_print("    Non-standard dashes:	   %14ld\n",cnt_dash);

   499 	if (cnt_html)

   500 	    g_print("    Possible HTML tags:	    %14ld\n",cnt_html);

   501 	g_print("\n");

   502 	g_print("    TOTAL QUERIES		  %14ld\n",

   503 	  cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+

   504 	  cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend);

   505     }

   506     g_free(running_from);

   507     if (usertypo)

   508 	g_tree_unref(usertypo);

   509     return 0;

   510 }

   512 /*

   513  * first_pass:

   514  *

   515  * Run a first pass - verify that it's a valid PG

   516  * file, decide whether to report some things that

   517  * occur many times in the text like long or short

   518  * lines, non-standard dashes, etc.

   519  */

   520 struct first_pass_results *first_pass(const char *etext)

   521 {

   522     gunichar laststart=CHAR_SPACE;

   523     const char *s;

   524     gchar *lc_line;

   525     int i,j,lbytes,llen;

   526     gchar **lines;

   527     unsigned int lastlen=0,lastblen=0;

   528     long spline=0,nspline=0;

   529     static struct first_pass_results results={0};

   530     gchar *inword;

   531     lines=g_strsplit(etext,"\n",0);

   532     for (j=0;lines[j];j++)

   533     {

   534 	lbytes=strlen(lines[j]);

   535 	while (lines[j][lbytes-1]=='\r')

   536 	    lines[j][--lbytes]='\0';

   537 	llen=g_utf8_strlen(lines[j],lbytes);

   538 	linecnt++;

   539 	if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&

   540 	  (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))

   541 	{

   542 	    if (spline)

   543 		g_print("   --> Duplicate header?\n");

   544 	    spline=linecnt+1;   /* first line of non-header text, that is */

   545 	}

   546 	if (!strncmp(lines[j],"*** START",9) &&

   547 	  strstr(lines[j],"PROJECT GUTENBERG"))

   548 	{

   549 	    if (nspline)

   550 		g_print("   --> Duplicate header?\n");

   551 	    nspline=linecnt+1;   /* first line of non-header text, that is */

   552 	}

   553 	if (spline || nspline)

   554 	{

   555 	    lc_line=g_utf8_strdown(lines[j],lbytes);

   556 	    if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))

   557 	    {

   558 		if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))

   559 		{

   560 		    if (results.footerline)

   561 		    {

   562 			/* it's an old-form header - we can detect duplicates */

   563 			if (!nspline)

   564 			    g_print("   --> Duplicate footer?\n");

   565 		    }

   566 		    else

   567 			results.footerline=linecnt;

   568 		}

   569 	    }

   570 	    g_free(lc_line);

   571 	}

   572 	if (spline)

   573 	    results.firstline=spline;

   574 	if (nspline)

   575 	    results.firstline=nspline;  /* override with new */

   576 	if (results.footerline)

   577 	    continue;    /* don't count the boilerplate in the footer */

   578 	results.totlen+=llen;

   579 	for (s=lines[j];*s;s=g_utf8_next_char(s))

   580 	{

   581 	    if (g_utf8_get_char(s)>127)

   582 		results.binlen++;

   583 	    if (g_unichar_isalpha(g_utf8_get_char(s)))

   584 		results.alphalen++;

   585 	    if (s>lines[j] && g_utf8_get_char(s)==CHAR_DQUOTE &&

   586 	      isalpha(g_utf8_get_char(g_utf8_prev_char(s))))

   587 		results.endquote_count++;

   588 	}

   589 	if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&

   590 	  lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)

   591 	    results.shortline++;

   592 	if (lbytes>0 &&

   593 	  g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)

   594 	    cnt_spacend++;

   595 	if (strstr(lines[j],".,"))

   596 	    results.dotcomma++;

   597 	/* only count ast lines for ignoring purposes where there is */

   598 	/* locase text on the line */

   599 	if (strchr(lines[j],'*'))

   600 	{

   601 	    for (s=lines[j];*s;s=g_utf8_next_char(s))

   602 		if (g_unichar_islower(g_utf8_get_char(s)))

   603 		    break;

   604 	    if (*s)

   605 		results.astline++;

   606 	}

   607 	if (strchr(lines[j],'/'))

   608 	    results.fslashline++;

   609 	for (s=g_utf8_prev_char(lines[j]+lbytes);

   610 	  s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))

   611 	    ;

   612 	if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&

   613 	  g_utf8_get_char(g_utf8_prev_char(s))!='-')

   614 	    results.hyphens++;

   615 	if (llen>LONGEST_PG_LINE)

   616 	    results.longline++;

   617 	if (llen>WAY_TOO_LONG)

   618 	    results.verylongline++;

   619 	if (strchr(lines[j],'<') && strchr(lines[j],'>'))

   620 	{

   621 	    i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);

   622 	    if (i>0)

   623 		results.htmcount++;

   624 	    if (strstr(lines[j],"<i>"))

   625 		results.htmcount+=4; /* bonus marks! */

   626 	}

   627 	/* Check for spaced em-dashes */

   628 	if (lines[j][0] && (s=strstr(g_utf8_next_char(lines[j]),"--")))

   629 	{

   630 	    results.emdash++;

   631 	    if (s[-1]==CHAR_SPACE || s[2]==CHAR_SPACE)

   632 		results.space_emdash++;

   633 	    if (s[-1]==CHAR_SPACE && s[2]==CHAR_SPACE)

   634 		/* count of em-dashes with spaces both sides */

   635 		results.non_PG_space_emdash++;

   636 	    if (s[-1]!=CHAR_SPACE && s[2]!=CHAR_SPACE)

   637 		/* count of PG-type em-dashes with no spaces */

   638 		results.PG_space_emdash++;

   639 	}

   640 	for (s=lines[j];*s;)

   641 	{

   642 	    inword=getaword(&s);

   643 	    if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))

   644 		results.Dutchcount++;

   645 	    if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))

   646 		results.Frenchcount++;

   647 	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))

   648 		results.standalone_digit++;

   649 	    g_free(inword);

   650 	}

   651 	/* Check for spaced dashes */

   652 	if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')

   653 	    results.spacedash++;

   654 	lastblen=lastlen;

   655 	lastlen=llen;

   656 	laststart=lines[j][0];

   657     }

   658     g_strfreev(lines);

   659     return &results;

   660 }

   662 /*

   663  * report_first_pass:

   664  *

   665  * Make some snap decisions based on the first pass results.

   666  */

   667 struct warnings *report_first_pass(struct first_pass_results *results)

   668 {

   669     static struct warnings warnings={0};

   670     if (cnt_spacend>0)

   671 	g_print("   --> %ld lines in this file have white space at end\n",

   672 	  cnt_spacend);

   673     warnings.dotcomma=1;

   674     if (results->dotcomma>5)

   675     {

   676 	warnings.dotcomma=0;

   677 	g_print("   --> %ld lines in this file contain '.,'. "

   678 	  "Not reporting them.\n",results->dotcomma);

   679     }

   680     /*

   681      * If more than 50 lines, or one-tenth, are short,

   682      * don't bother reporting them.

   683      */

   684     warnings.shortline=1;

   685     if (results->shortline>50 || results->shortline*10>linecnt)

   686     {

   687 	warnings.shortline=0;

   688 	g_print("   --> %ld lines in this file are short. "

   689 	  "Not reporting short lines.\n",results->shortline);

   690     }

   691     /*

   692      * If more than 50 lines, or one-tenth, are long,

   693      * don't bother reporting them.

   694      */

   695     warnings.longline=1;

   696     if (results->longline>50 || results->longline*10>linecnt)

   697     {

   698 	warnings.longline=0;

   699 	g_print("   --> %ld lines in this file are long. "

   700 	  "Not reporting long lines.\n",results->longline);

   701     }

   702     /* If more than 10 lines contain asterisks, don't bother reporting them. */

   703     warnings.ast=1;

   704     if (results->astline>10)

   705     {

   706 	warnings.ast=0;

   707 	g_print("   --> %ld lines in this file contain asterisks. "

   708 	  "Not reporting them.\n",results->astline);

   709     }

   710     /*

   711      * If more than 10 lines contain forward slashes,

   712      * don't bother reporting them.

   713      */

   714     warnings.fslash=1;

   715     if (results->fslashline>10)

   716     {

   717 	warnings.fslash=0;

   718 	g_print("   --> %ld lines in this file contain forward slashes. "

   719 	  "Not reporting them.\n",results->fslashline);

   720     }

   721     /*

   722      * If more than 20 lines contain unpunctuated endquotes,

   723      * don't bother reporting them.

   724      */

   725     warnings.endquote=1;

   726     if (results->endquote_count>20)

   727     {

   728 	warnings.endquote=0;

   729 	g_print("   --> %ld lines in this file contain unpunctuated endquotes. "

   730 	  "Not reporting them.\n",results->endquote_count);

   731     }

   732     /*

   733      * If more than 10 lines contain standalone digits,

   734      * don't bother reporting them.

   735      */

   736     warnings.digit=1;

   737     if (results->standalone_digit>10)

   738     {

   739 	warnings.digit=0;

   740 	g_print("   --> %ld lines in this file contain standalone 0s and 1s. "

   741 	  "Not reporting them.\n",results->standalone_digit);

   742     }

   743     /*

   744      * If more than 20 lines contain hyphens at end,

   745      * don't bother reporting them.

   746      */

   747     warnings.hyphen=1;

   748     if (results->hyphens>20)

   749     {

   750 	warnings.hyphen=0;

   751 	g_print("   --> %ld lines in this file have hyphens at end. "

   752 	  "Not reporting them.\n",results->hyphens);

   753     }

   754     if (results->htmcount>20 && !pswit[MARKUP_SWITCH])

   755     {

   756 	g_print("   --> Looks like this is HTML. Switching HTML mode ON.\n");

   757 	pswit[MARKUP_SWITCH]=1;

   758     }

   759     if (results->verylongline>0)

   760 	g_print("   --> %ld lines in this file are VERY long!\n",

   761 	  results->verylongline);

   762     /*

   763      * If there are more non-PG spaced dashes than PG em-dashes,

   764      * assume it's deliberate.

   765      * Current PG guidelines say don't use them, but older texts do,

   766      * and some people insist on them whatever the guidelines say.

   767      */

   768     warnings.dash=1;

   769     if (results->spacedash+results->non_PG_space_emdash>

   770       results->PG_space_emdash)

   771     {

   772 	warnings.dash=0;

   773 	g_print("   --> There are %ld spaced dashes and em-dashes. "

   774 	  "Not reporting them.\n",

   775 	  results->spacedash+results->non_PG_space_emdash);

   776     }

   777     /* If more than a quarter of characters are hi-bit, bug out. */

   778     warnings.bin=1;

   779     if (results->binlen*4>results->totlen)

   780     {

   781 	g_print("   --> This file does not appear to be ASCII. "

   782 	  "Terminating. Best of luck with it!\n");

   783 	exit(1);

   784     }

   785     if (results->alphalen*4<results->totlen)

   786     {

   787 	g_print("   --> This file does not appear to be text. "

   788 	  "Terminating. Best of luck with it!\n");

   789 	exit(1);

   790     }

   791     if (results->binlen*100>results->totlen || results->binlen>100)

   792     {

   793 	g_print("   --> There are a lot of foreign letters here. "

   794 	  "Not reporting them.\n");

   795 	warnings.bin=0;

   796     }

   797     warnings.isDutch=FALSE;

   798     if (results->Dutchcount>50)

   799     {

   800 	warnings.isDutch=TRUE;

   801 	g_print("   --> This looks like Dutch - "

   802 	  "switching off dashes and warnings for 's Middags case.\n");

   803     }

   804     warnings.isFrench=FALSE;

   805     if (results->Frenchcount>50)

   806     {

   807 	warnings.isFrench=TRUE;

   808 	g_print("   --> This looks like French - "

   809 	  "switching off some doublepunct.\n");

   810     }

   811     if (results->firstline && results->footerline)

   812 	g_print("    The PG header and footer appear to be already on.\n");

   813     else

   814     {

   815 	if (results->firstline)

   816 	    g_print("    The PG header is on - no footer.\n");

   817 	if (results->footerline)

   818 	    g_print("    The PG footer is on - no header.\n");

   819     }

   820     g_print("\n");

   821     if (pswit[VERBOSE_SWITCH])

   822     {

   823 	warnings.bin=1;

   824 	warnings.shortline=1;

   825 	warnings.dotcomma=1;

   826 	warnings.longline=1;

   827 	warnings.dash=1;

   828 	warnings.digit=1;

   829 	warnings.ast=1;

   830 	warnings.fslash=1;

   831 	warnings.hyphen=1;

   832 	warnings.endquote=1;

   833 	g_print("   *** Verbose output is ON -- you asked for it! ***\n");

   834     }

   835     if (warnings.isDutch)

   836 	warnings.dash=0;

   837     if (results->footerline>0 && results->firstline>0 &&

   838       results->footerline>results->firstline &&

   839       results->footerline-results->firstline<100)

   840     {

   841 	g_print("   --> I don't really know where this text starts. \n");

   842 	g_print("       There are no reference points.\n");

   843 	g_print("       I'm going to have to report the header and footer "

   844 	  "as well.\n");

   845 	results->firstline=0;

   846     }

   847     return &warnings;

   848 }

   850 /*

   851  * analyse_quotes:

   852  *

   853  * Look along the line, accumulate the count of quotes, and see

   854  * if this is an empty line - i.e. a line with nothing on it

   855  * but spaces.

   856  * If line has just spaces, period, * and/or - on it, don't

   857  * count it, since empty lines with asterisks or dashes to

   858  * separate sections are common.

   859  *

   860  * Returns: TRUE if the line is empty.

   861  */

   862 gboolean analyse_quotes(const char *aline,struct counters *counters)

   863 {

   864     int guessquote=0;

   865     /* assume the line is empty until proven otherwise */

   866     gboolean isemptyline=TRUE;

   867     const char *s=aline,*sprev,*snext;

   868     gunichar c;

   869     sprev=NULL;

   870     while (*s)

   871     {

   872 	snext=g_utf8_next_char(s);

   873 	c=g_utf8_get_char(s);

   874 	if (c==CHAR_DQUOTE)

   875 	    counters->quot++;

   876 	if (c==CHAR_SQUOTE || c==CHAR_OPEN_SQUOTE)

   877 	{

   878 	    if (s==aline)

   879 	    {

   880 		/*

   881 		 * At start of line, it can only be an openquote.

   882 		 * Hardcode a very common exception!

   883 		 */

   884 		if (!g_str_has_prefix(snext,"tis") &&

   885 		  !g_str_has_prefix(snext,"Tis"))

   886 		    counters->open_single_quote++;

   887 	    }

   888 	    else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&

   889 	      g_unichar_isalpha(g_utf8_get_char(snext)))

   890 		/* Do nothing! it's definitely an apostrophe, not a quote */

   891 		;

   892 	    /* it's outside a word - let's check it out */

   893 	    else if (c==CHAR_OPEN_SQUOTE ||

   894 	      g_unichar_isalpha(g_utf8_get_char(snext)))

   895 	    {

   896 		/* it damwell better BE an openquote */

   897 		if (!g_str_has_prefix(snext,"tis") &&

   898 		  !g_str_has_prefix(snext,"Tis"))

   899 		    /* hardcode a very common exception! */

   900 		    counters->open_single_quote++;

   901 	    }

   902 	    else

   903 	    {

   904 		/* now - is it a closequote? */

   905 		guessquote=0;   /* accumulate clues */

   906 		if (g_unichar_isalpha(g_utf8_get_char(sprev)))

   907 		{

   908 		    /* it follows a letter - could be either */

   909 		    guessquote++;

   910 		    if (g_utf8_get_char(sprev)=='s')

   911 		    {

   912 			/* looks like a plural apostrophe */

   913 			guessquote-=3;

   914 			if (g_utf8_get_char(snext)==CHAR_SPACE)

   915 			    /* bonus marks! */

   916 			    guessquote-=2;

   917 		    }

   918 		}

   919 		/* it doesn't have a letter either side */

   920 		else if (strchr(".?!,;:",g_utf8_get_char(sprev)) &&

   921 		  strchr(".?!,;: ",g_utf8_get_char(snext)))

   922 		    guessquote+=8; /* looks like a closequote */

   923 		else

   924 		    guessquote++;

   925 		if (counters->open_single_quote>counters->close_single_quote)

   926 		    /*

   927 		     * Give it the benefit of some doubt,

   928 		     * if a squote is already open.

   929 		     */

   930 		    guessquote++;

   931 		else

   932 		    guessquote--;

   933 		if (guessquote>=0)

   934 		    counters->close_single_quote++;

   935 	    }

   936 	}

   937 	if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&

   938 	  c!='\r' && c!='\n')

   939 	    isemptyline=FALSE;  /* ignore lines like  *  *  *  as spacers */

   940 	if (c==CHAR_UNDERSCORE)

   941 	    counters->c_unders++;

   942 	if (c==CHAR_OPEN_CBRACK)

   943 	    counters->c_brack++;

   944 	if (c==CHAR_CLOSE_CBRACK)

   945 	    counters->c_brack--;

   946 	if (c==CHAR_OPEN_RBRACK)

   947 	    counters->r_brack++;

   948 	if (c==CHAR_CLOSE_RBRACK)

   949 	    counters->r_brack--;

   950 	if (c==CHAR_OPEN_SBRACK)

   951 	    counters->s_brack++;

   952 	if (c==CHAR_CLOSE_SBRACK)

   953 	    counters->s_brack--;

   954 	sprev=s;

   955 	s=snext;

   956     }

   957     return isemptyline;

   958 }

   960 /*

   961  * check_for_control_characters:

   962  *

   963  * Check for invalid or questionable characters in the line

   964  * Anything above 127 is invalid for plain ASCII, and

   965  * non-printable control characters should also be flagged.

   966  * Tabs should generally not be there.

   967  */

   968 void check_for_control_characters(const char *aline)

   969 {

   970     gunichar c;

   971     const char *s;

   972     for (s=aline;*s;s=g_utf8_next_char(s))

   973     {

   974 	c=g_utf8_get_char(s);

   975 	if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)

   976 	{

   977 	    if (pswit[ECHO_SWITCH])

   978 		g_print("\n%s\n",aline);

   979 	    if (!pswit[OVERVIEW_SWITCH])

   980 		g_print("    Line %ld column %ld - Control character %u\n",

   981 		  linecnt,g_utf8_pointer_to_offset(s,aline)+1,c);

   982 	    else

   983 		cnt_bin++;

   984 	}

   985     }

   986 }

   988 /*

   989  * check_for_odd_characters:

   990  *

   991  * Check for binary and other odd characters.

   992  */

   993 void check_for_odd_characters(const char *aline,const struct warnings *warnings,

   994   gboolean isemptyline)

   995 {

   996     /* Don't repeat multiple warnings on one line. */

   997     gboolean eNon_A=FALSE,eTab=FALSE,eTilde=FALSE;

   998     gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;

   999     const char *s;

  1000     gunichar c;

  1001     for (s=aline;*s;s=g_utf8_next_char(s))

  1002     {

  1003 	c=g_utf8_get_char(s);

  1004 	if (!eNon_A && (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))

  1005 	{

  1006 	    if (pswit[ECHO_SWITCH])

  1007 		g_print("\n%s\n",aline);

  1008 	    if (!pswit[OVERVIEW_SWITCH])

  1009 		if (c>127 && c<160 || c>255)

  1010 		    g_print("    Line %ld column %ld - "

  1011 		      "Non-ISO-8859 character %u\n",

  1012 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);

  1013 		else

  1014 		    g_print("    Line %ld column %ld - "

  1015 		      "Non-ASCII character %u\n",

  1016 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);

  1017 	    else

  1018 		cnt_bin++;

  1019 	    eNon_A=TRUE;

  1020 	}

  1021 	if (!eTab && c==CHAR_TAB)

  1022 	{

  1023 	    if (pswit[ECHO_SWITCH])

  1024 		g_print("\n%s\n",aline);

  1025 	    if (!pswit[OVERVIEW_SWITCH])

  1026 		g_print("    Line %ld column %ld - Tab character?\n",

  1027 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1028 	    else

  1029 		cnt_odd++;

  1030 	    eTab=TRUE;

  1031 	}

  1032 	if (!eTilde && c==CHAR_TILDE)

  1033 	{

  1034 	    /*

  1035 	     * Often used by OCR software to indicate an

  1036 	     * unrecognizable character.

  1037 	     */

  1038 	    if (pswit[ECHO_SWITCH])

  1039 		g_print("\n%s\n",aline);

  1040 	    if (!pswit[OVERVIEW_SWITCH])

  1041 		g_print("    Line %ld column %ld - Tilde character?\n",

  1042 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1043 	    else

  1044 		cnt_odd++;

  1045 	    eTilde=TRUE;

  1046 	}

  1047 	if (!eCarat && c==CHAR_CARAT)

  1048 	{

  1049 	    if (pswit[ECHO_SWITCH])

  1050 		g_print("\n%s\n",aline);

  1051 	    if (!pswit[OVERVIEW_SWITCH])

  1052 		g_print("    Line %ld column %ld - Carat character?\n",

  1053 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1054 	    else

  1055 		cnt_odd++;

  1056 	    eCarat=TRUE;

  1057 	}

  1058 	if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)

  1059 	{

  1060 	    if (pswit[ECHO_SWITCH])

  1061 		g_print("\n%s\n",aline);

  1062 	    if (!pswit[OVERVIEW_SWITCH])

  1063 		g_print("    Line %ld column %ld - Forward slash?\n",

  1064 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1065 	    else

  1066 		cnt_odd++;

  1067 	    eFSlash=TRUE;

  1068 	}

  1069 	/*

  1070 	 * Report asterisks only in paranoid mode,

  1071 	 * since they're often deliberate.

  1072 	 */

  1073 	if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&

  1074 	  c==CHAR_ASTERISK)

  1075 	{

  1076 	    if (pswit[ECHO_SWITCH])

  1077 		g_print("\n%s\n",aline);

  1078 	    if (!pswit[OVERVIEW_SWITCH])

  1079 		g_print("    Line %ld column %ld - Asterisk?\n",

  1080 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1081 	    else

  1082 		cnt_odd++;

  1083 	    eAst=TRUE;

  1084 	}

  1085     }

  1086 }

  1088 /*

  1089  * check_for_long_line:

  1090  *

  1091  * Check for line too long.

  1092  */

  1093 void check_for_long_line(const char *aline)

  1094 {

  1095     if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)

  1096     {

  1097 	if (pswit[ECHO_SWITCH])

  1098 	    g_print("\n%s\n",aline);

  1099 	if (!pswit[OVERVIEW_SWITCH])

  1100 	    g_print("    Line %ld column %ld - Long line %ld\n",

  1101 	      linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));

  1102 	else

  1103 	    cnt_long++;

  1104     }

  1105 }

  1107 /*

  1108  * check_for_short_line:

  1109  *

  1110  * Check for line too short.

  1111  *

  1112  * This one is a bit trickier to implement: we don't want to

  1113  * flag the last line of a paragraph for being short, so we

  1114  * have to wait until we know that our current line is a

  1115  * "normal" line, then report the _previous_ line if it was too

  1116  * short. We also don't want to report indented lines like

  1117  * chapter heads or formatted quotations. We therefore keep

  1118  * last->len as the length of the last line examined, and

  1119  * last->blen as the length of the last but one, and try to

  1120  * suppress unnecessary warnings by checking that both were of

  1121  * "normal" length. We keep the first character of the last

  1122  * line in last->start, and if it was a space, we assume that

  1123  * the formatting is deliberate. I can't figure out a way to

  1124  * distinguish something like a quoted verse left-aligned or

  1125  * the header or footer of a letter from a paragraph of short

  1126  * lines - maybe if I examined the whole paragraph, and if the

  1127  * para has less than, say, 8 lines and if all lines are short,

  1128  * then just assume it's OK? Need to look at some texts to see

  1129  * how often a formula like this would get the right result.

  1130  */

  1131 void check_for_short_line(const char *aline,const struct line_properties *last)

  1132 {

  1133     if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&

  1134       last->len<SHORTEST_PG_LINE && last->blen>1 &&

  1135       last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)

  1136     {

  1137 	if (pswit[ECHO_SWITCH])

  1138 	    g_print("\n%s\n",prevline);

  1139 	if (!pswit[OVERVIEW_SWITCH])

  1140 	    g_print("    Line %ld column %ld - Short line %ld?\n",

  1141 	      linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));

  1142 	else

  1143 	    cnt_short++;

  1144     }

  1145 }

  1147 /*

  1148  * check_for_starting_punctuation:

  1149  *

  1150  * Look for punctuation other than full ellipses at start of line.

  1151  */

  1152 void check_for_starting_punctuation(const char *aline)

  1153 {

  1154     if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&

  1155       !g_str_has_prefix(aline,". . ."))

  1156     {

  1157 	if (pswit[ECHO_SWITCH])

  1158 	    g_print("\n%s\n",aline);

  1159 	if (!pswit[OVERVIEW_SWITCH])

  1160 	    g_print("    Line %ld column 1 - Begins with punctuation?\n",

  1161 	      linecnt);

  1162 	else

  1163 	    cnt_punct++;

  1164     }

  1165 }

  1167 /*

  1168  * check_for_spaced_emdash:

  1169  *

  1170  * Check for spaced em-dashes.

  1171  *

  1172  * We must check _all_ occurrences of "--" on the line

  1173  * hence the loop - even if the first double-dash is OK

  1174  * there may be another that's wrong later on.

  1175  */

  1176 void check_for_spaced_emdash(const char *aline)

  1177 {

  1178     const char *s,*t,*next;

  1179     for (s=aline;t=strstr(s,"--");s=next)

  1180     {

  1181 	next=g_utf8_next_char(g_utf8_next_char(t));

  1182 	if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||

  1183 	  g_utf8_get_char(next)==CHAR_SPACE)

  1184 	{

  1185 	    if (pswit[ECHO_SWITCH])

  1186 		g_print("\n%s\n",aline);

  1187 	    if (!pswit[OVERVIEW_SWITCH])

  1188 		g_print("    Line %ld column %ld - Spaced em-dash?\n",

  1189 		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);

  1190 	    else

  1191 		cnt_dash++;

  1192 	}

  1193     }

  1194 }

  1196 /*

  1197  * check_for_spaced_dash:

  1198  *

  1199  * Check for spaced dashes.

  1200  */

  1201 void check_for_spaced_dash(const char *aline)

  1202 {

  1203     const char *s;

  1204     if ((s=strstr(aline," -")))

  1205     {

  1206 	if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')

  1207 	{

  1208 	    if (pswit[ECHO_SWITCH])

  1209 		g_print("\n%s\n",aline);

  1210 	    if (!pswit[OVERVIEW_SWITCH])

  1211 		g_print("    Line %ld column %ld - Spaced dash?\n",

  1212 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1213 	    else

  1214 		cnt_dash++;

  1215 	}

  1216     }

  1217     else if ((s=strstr(aline,"- ")))

  1218     {

  1219 	if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')

  1220 	{

  1221 	    if (pswit[ECHO_SWITCH])

  1222 		g_print("\n%s\n",aline);

  1223 	    if (!pswit[OVERVIEW_SWITCH])

  1224 		g_print("    Line %ld column %ld - Spaced dash?\n",

  1225 		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1226 	    else

  1227 		cnt_dash++;

  1228 	}

  1229     }

  1230 }

  1232 /*

  1233  * check_for_unmarked_paragraphs:

  1234  *

  1235  * Check for unmarked paragraphs indicated by separate speakers.

  1236  *

  1237  * May well be false positive:

  1238  * "Bravo!" "Wonderful!" called the crowd.

  1239  * but useful all the same.

  1240  */

  1241 void check_for_unmarked_paragraphs(const char *aline)

  1242 {

  1243     const char *s;

  1244     s=strstr(aline,"\"  \"");

  1245     if (!s)

  1246 	s=strstr(aline,"\" \"");

  1247     if (s)

  1248     {

  1249 	if (pswit[ECHO_SWITCH])

  1250 	    g_print("\n%s\n",aline);

  1251 	if (!pswit[OVERVIEW_SWITCH])

  1252 	    g_print("    Line %ld column %ld - "

  1253 	      "Query missing paragraph break?\n",

  1254 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1255 	else

  1256 	    cnt_punct++;

  1257     }

  1258 }

  1260 /*

  1261  * check_for_jeebies:

  1262  *

  1263  * Check for "to he" and other easy h/b errors.

  1264  *

  1265  * This is a very inadequate effort on the h/b problem,

  1266  * but the phrase "to he" is always an error, whereas "to

  1267  * be" is quite common.

  1268  * Similarly, '"Quiet!", be said.' is a non-be error

  1269  * "to he" is _not_ always an error!:

  1270  *       "Where they went to he couldn't say."

  1271  * Another false positive:

  1272  *       What would "Cinderella" be without the . . .

  1273  * and another: "If he wants to he can see for himself."

  1274  */

  1275 void check_for_jeebies(const char *aline)

  1276 {

  1277     const char *s;

  1278     s=strstr(aline," be could ");

  1279     if (!s)

  1280 	s=strstr(aline," be would ");

  1281     if (!s)

  1282 	s=strstr(aline," was be ");

  1283     if (!s)

  1284 	s=strstr(aline," be is ");

  1285     if (!s)

  1286 	s=strstr(aline," is be ");

  1287     if (!s)

  1288 	s=strstr(aline,"\", be ");

  1289     if (!s)

  1290 	s=strstr(aline,"\" be ");

  1291     if (!s)

  1292 	s=strstr(aline,"\" be ");

  1293     if (!s)

  1294 	s=strstr(aline," to he ");

  1295     if (s)

  1296     {

  1297 	if (pswit[ECHO_SWITCH])

  1298 	    g_print("\n%s\n",aline);

  1299 	if (!pswit[OVERVIEW_SWITCH])

  1300 	    g_print("    Line %ld column %ld - Query he/be error?\n",

  1301 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1302 	else

  1303 	    cnt_word++;

  1304     }

  1305     s=strstr(aline," the had ");

  1306     if (!s)

  1307 	s=strstr(aline," a had ");

  1308     if (!s)

  1309 	s=strstr(aline," they bad ");

  1310     if (!s)

  1311 	s=strstr(aline," she bad ");

  1312     if (!s)

  1313 	s=strstr(aline," he bad ");

  1314     if (!s)

  1315 	s=strstr(aline," you bad ");

  1316     if (!s)

  1317 	s=strstr(aline," i bad ");

  1318     if (s)

  1319     {

  1320 	if (pswit[ECHO_SWITCH])

  1321 	    g_print("\n%s\n",aline);

  1322 	if (!pswit[OVERVIEW_SWITCH])

  1323 	    g_print("    Line %ld column %ld - Query had/bad error?\n",

  1324 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1325 	else

  1326 	    cnt_word++;

  1327     }

  1328     s=strstr(aline,"; hut ");

  1329     if (!s)

  1330 	s=strstr(aline,", hut ");

  1331     if (s)

  1332     {

  1333 	if (pswit[ECHO_SWITCH])

  1334 	    g_print("\n%s\n",aline);

  1335 	if (!pswit[OVERVIEW_SWITCH])

  1336 	    g_print("    Line %ld column %ld - Query hut/but error?\n",

  1337 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1338 	else

  1339 	    cnt_word++;

  1340     }

  1341 }

  1343 /*

  1344  * check_for_mta_from:

  1345  *

  1346  * Special case - angled bracket in front of "From" placed there by an

  1347  * MTA when sending an e-mail.

  1348  */

  1349 void check_for_mta_from(const char *aline)

  1350 {

  1351     const char *s;

  1352     s=strstr(aline,">From");

  1353     if (s)

  1354     {

  1355 	if (pswit[ECHO_SWITCH])

  1356 	    g_print("\n%s\n",aline);

  1357 	if (!pswit[OVERVIEW_SWITCH])

  1358 	    g_print("    Line %ld column %ld - "

  1359 	      "Query angled bracket with From\n",

  1360 	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1361 	else

  1362 	    cnt_punct++;

  1363     }

  1364 }

  1366 /*

  1367  * check_for_orphan_character:

  1368  *

  1369  * Check for a single character line -

  1370  * often an overflow from bad wrapping.

  1371  */

  1372 void check_for_orphan_character(const char *aline)

  1373 {

  1374     gunichar c;

  1375     c=g_utf8_get_char(aline);

  1376     if (c && !*g_utf8_next_char(aline))

  1377     {

  1378 	if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))

  1379 	    ; /* Nothing - ignore numerals alone on a line. */

  1380 	else

  1381 	{

  1382 	    if (pswit[ECHO_SWITCH])

  1383 		g_print("\n%s\n",aline);

  1384 	    if (!pswit[OVERVIEW_SWITCH])

  1385 		g_print("    Line %ld column 1 - Query single character line\n",

  1386 		  linecnt);

  1387 	    else

  1388 		cnt_punct++;

  1389 	}

  1390     }

  1391 }

  1393 /*

  1394  * check_for_pling_scanno:

  1395  *

  1396  * Check for I" - often should be !

  1397  */

  1398 void check_for_pling_scanno(const char *aline)

  1399 {

  1400     const char *s;

  1401     s=strstr(aline," I\"");

  1402     if (s)

  1403     {

  1404 	if (pswit[ECHO_SWITCH])

  1405 	    g_print("\n%s\n",aline);

  1406 	if (!pswit[OVERVIEW_SWITCH])

  1407 	    g_print("    Line %ld column %ld - Query I=exclamation mark?\n",

  1408 	      linecnt,g_utf8_pointer_to_offset(aline,s));

  1409 	else

  1410 	    cnt_punct++;

  1411     }

  1412 }

  1414 /*

  1415  * check_for_extra_period:

  1416  *

  1417  * Check for period without a capital letter. Cut-down from gutspell.

  1418  * Only works when it happens on a single line.

  1419  */

  1420 void check_for_extra_period(const char *aline,const struct warnings *warnings)

  1421 {

  1422     const char *s,*t,*s1;

  1423     int i;

  1424     gsize len;

  1425     gboolean istypo;

  1426     gchar *testword;

  1427     gunichar *decomposition;

  1428     if (pswit[PARANOID_SWITCH])

  1429     {

  1430 	for (t=aline;t=strstr(t,". ");)

  1431 	{

  1432 	    if (t==aline)

  1433 	    {

  1434 		t=g_utf8_next_char(t);

  1435 		/* start of line punctuation is handled elsewhere */

  1436 		continue;

  1437 	    }

  1438 	    if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))

  1439 	    {

  1440 		t=g_utf8_next_char(t);

  1441 		continue;

  1442 	    }

  1443 	    if (warnings->isDutch)

  1444 	    {

  1445 		/* For Frank & Jeroen -- 's Middags case */

  1446 		gunichar c2,c3,c4,c5;

  1447 		c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));

  1448 		c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));

  1449 		c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));

  1450 		c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));

  1451 		if (c2==CHAR_SQUOTE && g_unichar_islower(c3) &&

  1452 		  c4==CHAR_SPACE && g_unichar_isupper(c5))

  1453 		{

  1454 		    t=g_utf8_next_char(t);

  1455 		    continue;

  1456 		}

  1457 	    }

  1458 	    s1=g_utf8_next_char(g_utf8_next_char(t));

  1459 	    while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&

  1460 	      !isdigit(g_utf8_get_char(s1)))

  1461 		s1=g_utf8_next_char(s1);

  1462 	    if (g_unichar_islower(g_utf8_get_char(s1)))

  1463 	    {

  1464 		/* we have something to investigate */

  1465 		istypo=TRUE;

  1466 		/* so let's go back and find out */

  1467 		for (s1=g_utf8_prev_char(t);s1>=aline &&

  1468 		  (g_unichar_isalpha(g_utf8_get_char(s1)) ||

  1469 		  g_unichar_isdigit(g_utf8_get_char(s1)) ||

  1470 		  g_utf8_get_char(s1)==CHAR_SQUOTE &&

  1471 		  g_unichar_isalpha(g_utf8_get_char(g_utf8_next_char(s1))) &&

  1472 		  g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s1))));

  1473 		  s1=g_utf8_prev_char(s1))

  1474 		    ;

  1475 		s1=g_utf8_next_char(s1);

  1476 		s=strchr(s1,'.');

  1477 		if (s)

  1478 		    testword=g_strndup(s1,s-s1);

  1479 		else

  1480 		    testword=g_strdup(s1);

  1481 		for (i=0;*abbrev[i];i++)

  1482 		    if (!strcmp(testword,abbrev[i]))

  1483 			istypo=FALSE;

  1484 		if (g_unichar_isdigit(g_utf8_get_char(testword)))

  1485 		    istypo=FALSE;

  1486 		if (!*g_utf8_next_char(testword))

  1487 		    istypo=FALSE;

  1488 		if (isroman(testword))

  1489 		    istypo=FALSE;

  1490 		if (istypo)

  1491 		{

  1492 		    istypo=FALSE;

  1493 		    for (s=testword;*s;s=g_utf8_next_char(s))

  1494 		    {

  1495 			decomposition=g_unicode_canonical_decomposition(

  1496 			  g_utf8_get_char(s),&len);

  1497 			if (g_utf8_strchr("aeiou",-1,decomposition[0]))

  1498 			    istypo=TRUE;

  1499 			g_free(decomposition);

  1500 		    }

  1501 		}

  1502 		if (istypo &&

  1503 		  (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))

  1504 		{

  1505 		    g_tree_insert(qperiod,g_strdup(testword),

  1506 		      GINT_TO_POINTER(1));

  1507 		    if (pswit[ECHO_SWITCH])

  1508 			g_print("\n%s\n",aline);

  1509 		    if (!pswit[OVERVIEW_SWITCH])

  1510 			g_print("    Line %ld column %ld - Extra period?\n",

  1511 			  linecnt,g_utf8_pointer_to_offset(aline,t)+1);

  1512 		    else

  1513 			cnt_punct++;

  1514 		}

  1515 		g_free(testword);

  1516 	    }

  1517 	    t=g_utf8_next_char(t);

  1518 	}

  1519     }

  1520 }

  1522 /*

  1523  * check_for_following_punctuation:

  1524  *

  1525  * Check for words usually not followed by punctuation.

  1526  */

  1527 void check_for_following_punctuation(const char *aline)

  1528 {

  1529     int i;

  1530     const char *s,*wordstart;

  1531     gunichar c;

  1532     gchar *inword,*t;

  1533     if (pswit[TYPO_SWITCH])

  1534     {

  1535 	for (s=aline;*s;)

  1536 	{

  1537 	    wordstart=s;

  1538 	    t=getaword(&s);

  1539 	    if (!*t)

  1540 	    {

  1541 		g_free(t);

  1542 		continue;

  1543 	    }

  1544 	    inword=g_utf8_strdown(t,-1);

  1545 	    g_free(t);

  1546 	    for (i=0;*nocomma[i];i++)

  1547 		if (!strcmp(inword,nocomma[i]))

  1548 		{

  1549 		    c=g_utf8_get_char(s);

  1550 		    if (c==',' || c==';' || c==':')

  1551 		    {

  1552 			if (pswit[ECHO_SWITCH])

  1553 			    g_print("\n%s\n",aline);

  1554 			if (!pswit[OVERVIEW_SWITCH])

  1555 			    g_print("    Line %ld column %ld - "

  1556 			      "Query punctuation after %s?\n",

  1557 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,

  1558 			      inword);

  1559 			else

  1560 			    cnt_punct++;

  1561 		    }

  1562 		}

  1563 	    for (i=0;*noperiod[i];i++)

  1564 		if (!strcmp(inword,noperiod[i]))

  1565 		{

  1566 		    c=g_utf8_get_char(s);

  1567 		    if (c=='.' || c=='!')

  1568 		    {

  1569 			if (pswit[ECHO_SWITCH])

  1570 			    g_print("\n%s\n",aline);

  1571 			if (!pswit[OVERVIEW_SWITCH])

  1572 			    g_print("    Line %ld column %ld - "

  1573 			      "Query punctuation after %s?\n",

  1574 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,

  1575 			      inword);

  1576 			else

  1577 			    cnt_punct++;

  1578 		    }

  1579 		}

  1580 	    g_free(inword);

  1581 	}

  1582     }

  1583 }

  1585 /*

  1586  * check_for_typos:

  1587  *

  1588  * Check for commonly mistyped words,

  1589  * and digits like 0 for O in a word.

  1590  */

  1591 void check_for_typos(const char *aline,struct warnings *warnings)

  1592 {

  1593     const char *s,*t,*nt,*wordstart;

  1594     gchar *inword;

  1595     gunichar *decomposition;

  1596     gchar *testword;

  1597     int i,vowel,consonant,*dupcnt;

  1598     gboolean isdup,istypo,alower;

  1599     gunichar c;

  1600     long offset,len;

  1601     gsize decomposition_len;

  1602     for (s=aline;*s;)

  1603     {

  1604 	wordstart=s;

  1605 	inword=getaword(&s);

  1606 	if (!*inword)

  1607 	{

  1608 	    g_free(inword);

  1609 	    continue; /* don't bother with empty lines */

  1610 	}

  1611 	if (mixdigit(inword))

  1612 	{

  1613 	    if (pswit[ECHO_SWITCH])

  1614 		g_print("\n%s\n",aline);

  1615 	    if (!pswit[OVERVIEW_SWITCH])

  1616 		g_print("    Line %ld column %ld - Query digit in %s\n",

  1617 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);

  1618 	    else

  1619 		cnt_word++;

  1620 	}

  1621 	/*

  1622 	 * Put the word through a series of tests for likely typos and OCR

  1623 	 * errors.

  1624 	 */

  1625 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])

  1626 	{

  1627 	    istypo=FALSE;

  1628 	    alower=FALSE;

  1629 	    for (t=inword;*t;t=g_utf8_next_char(t))

  1630 	    {

  1631 		c=g_utf8_get_char(t);

  1632 		nt=g_utf8_next_char(t);

  1633 		/* lowercase for testing */

  1634 		if (g_unichar_islower(c))

  1635 		    alower=TRUE;

  1636 		if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))

  1637 		{

  1638 		    /*

  1639 		     * We have an uppercase mid-word. However, there are

  1640 		     * common cases:

  1641 		     *   Mac and Mc like McGill

  1642 		     *   French contractions like l'Abbe

  1643 		     */

  1644 		    offset=g_utf8_pointer_to_offset(inword,t);

  1645 		    if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||

  1646 		      offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&

  1647 		      g_utf8_get_char(g_utf8_next_char(nt))=='c' ||

  1648 		      offset>0 &&

  1649 		      g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SQUOTE)

  1650 			; /* do nothing! */

  1651 		    else

  1652 			istypo=TRUE;

  1653 		}

  1654 	    }

  1655 	    testword=g_utf8_casefold(inword,-1);

  1656 	}

  1657 	if (pswit[TYPO_SWITCH])

  1658 	{

  1659 	    /*

  1660 	     * Check for certain unlikely two-letter combinations at word

  1661 	     * start and end.

  1662 	     */

  1663 	    len=g_utf8_strlen(testword,-1);

  1664 	    if (len>1)

  1665 	    {

  1666 		for (i=0;*nostart[i];i++)

  1667 		    if (g_str_has_prefix(testword,nostart[i]))

  1668 			istypo=TRUE;

  1669 		for (i=0;*noend[i];i++)

  1670 		    if (g_str_has_suffix(testword,noend[i]))

  1671 			istypo=TRUE;

  1672 	    }

  1673 	    /* ght is common, gbt never. Like that. */

  1674 	    if (strstr(testword,"cb"))

  1675 		istypo=TRUE;

  1676 	    if (strstr(testword,"gbt"))

  1677 		istypo=TRUE;

  1678 	    if (strstr(testword,"pbt"))

  1679 		istypo=TRUE;

  1680 	    if (strstr(testword,"tbs"))

  1681 		istypo=TRUE;

  1682 	    if (strstr(testword,"mrn"))

  1683 		istypo=TRUE;

  1684 	    if (strstr(testword,"ahle"))

  1685 		istypo=TRUE;

  1686 	    if (strstr(testword,"ihle"))

  1687 		istypo=TRUE;

  1688 	    /*

  1689 	     * "TBE" does happen - like HEARTBEAT - but uncommon.

  1690 	     * Also "TBI" - frostbite, outbid - but uncommon.

  1691 	     * Similarly "ii" like Hawaii, or Pompeii, and in Roman

  1692 	     * numerals, but "ii" is a common scanno.

  1693 	     */

  1694 	    if (strstr(testword,"tbi"))

  1695 		istypo=TRUE;

  1696 	    if (strstr(testword,"tbe"))

  1697 		istypo=TRUE;

  1698 	    if (strstr(testword,"ii"))

  1699 		istypo=TRUE;

  1700 	    /*

  1701 	     * Check for no vowels or no consonants.

  1702 	     * If none, flag a typo.

  1703 	     */

  1704 	    if (!istypo && len>1)

  1705 	    {

  1706 		vowel=consonant=0;

  1707 		for (t=testword;*t;t=g_utf8_next_char(t))

  1708 		{

  1709 		    c=g_utf8_get_char(t);

  1710 		    decomposition=

  1711 		      g_unicode_canonical_decomposition(c,&decomposition_len);

  1712 		    if (c=='y' || g_unichar_isdigit(c))

  1713 		    {

  1714 			/* Yah, this is loose. */

  1715 			vowel++;

  1716 			consonant++;

  1717 		    }

  1718 		    else if (g_utf8_strchr("aeiou",-1,decomposition[0]))

  1719 			vowel++;

  1720 		    else

  1721 			consonant++;

  1722 		    g_free(decomposition);

  1723 		}

  1724 		if (!vowel || !consonant)

  1725 		    istypo=TRUE;

  1726 	    }

  1727 	    /*

  1728 	     * Now exclude the word from being reported if it's in

  1729 	     * the okword list.

  1730 	     */

  1731 	    for (i=0;*okword[i];i++)

  1732 		if (!strcmp(testword,okword[i]))

  1733 		    istypo=FALSE;

  1734 	    /*

  1735 	     * What looks like a typo may be a Roman numeral.

  1736 	     * Exclude these.

  1737 	     */

  1738 	    if (istypo && isroman(testword))

  1739 		istypo=FALSE;

  1740 	    /* Check the manual list of typos. */

  1741 	    if (!istypo)

  1742 		for (i=0;*typo[i];i++)

  1743 		    if (!strcmp(testword,typo[i]))

  1744 			istypo=TRUE;

  1745 	    /*

  1746 	     * Check lowercase s, l, i and m - special cases.

  1747 	     *   "j" - often a semi-colon gone wrong.

  1748 	     *   "d" for a missing apostrophe - he d

  1749 	     *   "n" for "in"

  1750 	     */

  1751 	    if (!istypo && len==1 &&

  1752 	      g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))

  1753 		istypo=TRUE;

  1754 	    if (istypo)

  1755 	    {

  1756 		dupcnt=g_tree_lookup(qword,testword);

  1757 		if (dupcnt)

  1758 		{

  1759 		    (*dupcnt)++;

  1760 		    isdup=!pswit[VERBOSE_SWITCH];

  1761 		}

  1762 		else

  1763 		{

  1764 		    dupcnt=g_new0(int,1);

  1765 		    g_tree_insert(qword,g_strdup(testword),dupcnt);

  1766 		    isdup=FALSE;

  1767 		}

  1768 		if (!isdup)

  1769 		{

  1770 		    if (pswit[ECHO_SWITCH])

  1771 			g_print("\n%s\n",aline);

  1772 		    if (!pswit[OVERVIEW_SWITCH])

  1773 		    {

  1774 			g_print("    Line %ld column %ld - Query word %s",

  1775 			  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,

  1776 			  inword);

  1777 			if (!pswit[VERBOSE_SWITCH])

  1778 			    g_print(" - not reporting duplicates");

  1779 			g_print("\n");

  1780 		    }

  1781 		    else

  1782 			cnt_word++;

  1783 		}

  1784 	    }

  1785 	}

  1786 	/* check the user's list of typos */

  1787 	if (!istypo && usertypo && g_tree_lookup(usertypo,testword))

  1788 	{

  1789 	    if (pswit[ECHO_SWITCH])

  1790 		g_print("\n%s\n",aline);

  1791 	    if (!pswit[OVERVIEW_SWITCH])

  1792 		g_print("    Line %ld column %ld - Query possible scanno %s\n",

  1793 		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);

  1794 	}

  1795 	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])

  1796 	    g_free(testword);

  1797 	if (pswit[PARANOID_SWITCH] && warnings->digit)

  1798 	{

  1799 	    /* In paranoid mode, query all 0 and 1 standing alone. */

  1800 	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))

  1801 	    {

  1802 		if (pswit[ECHO_SWITCH])

  1803 		    g_print("\n%s\n",aline);

  1804 		if (!pswit[OVERVIEW_SWITCH])

  1805 		    g_print("    Line %ld column %ld - Query standalone %s\n",

  1806 		      linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,

  1807 		      inword);

  1808 		else

  1809 		    cnt_word++;

  1810 	    }

  1811 	}

  1812 	g_free(inword);

  1813     }

  1814 }

  1816 /*

  1817  * check_for_misspaced_punctuation:

  1818  *

  1819  * Look for added or missing spaces around punctuation and quotes.

  1820  * If there is a punctuation character like ! with no space on

  1821  * either side, suspect a missing!space. If there are spaces on

  1822  * both sides , assume a typo. If we see a double quote with no

  1823  * space or punctuation on either side of it, assume unspaced

  1824  * quotes "like"this.

  1825  */

  1826 void check_for_misspaced_punctuation(const char *aline,

  1827   struct parities *parities,gboolean isemptyline)

  1828 {

  1829     gboolean isacro,isellipsis;

  1830     const char *s;

  1831     gunichar c,nc,pc,n2c;

  1832     c=g_utf8_get_char(aline);

  1833     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  1834     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  1835     {

  1836 	pc=c;

  1837 	c=nc;

  1838 	nc=g_utf8_get_char(g_utf8_next_char(s));

  1839 	/* For each character in the line after the first. */

  1840 	if (g_utf8_strchr(".?!,;:_",-1,c))  /* if it's punctuation */

  1841 	{

  1842 	    /* we need to suppress warnings for acronyms like M.D. */

  1843 	    isacro=FALSE;

  1844 	    /* we need to suppress warnings for ellipsis . . . */

  1845 	    isellipsis=FALSE;

  1846 	    /*

  1847 	     * If there are letters on both sides of it or

  1848 	     * if it's strict punctuation followed by an alpha.

  1849 	     */

  1850 	    if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||

  1851 	      g_utf8_strchr("?!,;:",-1,c)))

  1852 	    {

  1853 		if (c=='.')

  1854 		{

  1855 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&

  1856 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')

  1857 			isacro=TRUE;

  1858 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));

  1859 		    if (nc && n2c=='.')

  1860 			isacro=TRUE;

  1861 		}

  1862 		if (!isacro)

  1863 		{

  1864 		    if (pswit[ECHO_SWITCH])

  1865 			g_print("\n%s\n",aline);

  1866 		    if (!pswit[OVERVIEW_SWITCH])

  1867 			g_print("    Line %ld column %ld - Missing space?\n",

  1868 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1869 		    else

  1870 			cnt_punct++;

  1871 		}

  1872 	    }

  1873 	    if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))

  1874 	    {

  1875 		/*

  1876 		 * If there are spaces on both sides,

  1877 		 * or space before and end of line.

  1878 		 */

  1879 		if (c=='.')

  1880 		{

  1881 		    if (g_utf8_pointer_to_offset(aline,s)>2 &&

  1882 		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')

  1883 			isellipsis=TRUE;

  1884 		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));

  1885 		    if (nc && n2c=='.')

  1886 			isellipsis=TRUE;

  1887 		}

  1888 		if (!isemptyline && !isellipsis)

  1889 		{

  1890 		    if (pswit[ECHO_SWITCH])

  1891 			g_print("\n%s\n",aline);

  1892 		    if (!pswit[OVERVIEW_SWITCH])

  1893 			g_print("    Line %ld column %ld - "

  1894 			  "Spaced punctuation?\n",linecnt,

  1895 			  g_utf8_pointer_to_offset(aline,s)+1);

  1896 		    else

  1897 			cnt_punct++;

  1898 		}

  1899 	    }

  1900 	}

  1901     }

  1902     /* Split out the characters that CANNOT be preceded by space. */

  1903     c=g_utf8_get_char(aline);

  1904     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  1905     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  1906     {

  1907 	pc=c;

  1908 	c=nc;

  1909 	nc=g_utf8_get_char(g_utf8_next_char(s));

  1910 	/* for each character in the line after the first */

  1911 	if (g_utf8_strchr("?!,;:",-1,c))

  1912 	{

  1913 	    /* if it's punctuation that _cannot_ have a space before it */

  1914 	    if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)

  1915 	    {

  1916 		/*

  1917 		 * If nc DOES == space,

  1918 		 * it was already reported just above.

  1919 		 */

  1920 		if (pswit[ECHO_SWITCH])

  1921 		    g_print("\n%s\n",aline);

  1922 		if (!pswit[OVERVIEW_SWITCH])

  1923 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",

  1924 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1925 		else

  1926 		    cnt_punct++;

  1927 	    }

  1928 	}

  1929     }

  1930     /*

  1931      * Special case " .X" where X is any alpha.

  1932      * This plugs a hole in the acronym code above.

  1933      * Inelegant, but maintainable.

  1934      */

  1935     c=g_utf8_get_char(aline);

  1936     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  1937     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  1938     {

  1939 	pc=c;

  1940 	c=nc;

  1941 	nc=g_utf8_get_char(g_utf8_next_char(s));

  1942 	/* for each character in the line after the first */

  1943 	if (c=='.')

  1944 	{

  1945 	    /* if it's a period */

  1946 	    if (pc==CHAR_SPACE && g_unichar_isalpha(nc))

  1947 	    {

  1948 		/*

  1949 		 * If the period follows a space and

  1950 		 * is followed by a letter.

  1951 		 */

  1952 		if (pswit[ECHO_SWITCH])

  1953 		    g_print("\n%s\n",aline);

  1954 		if (!pswit[OVERVIEW_SWITCH])

  1955 		    g_print("    Line %ld column %ld - Spaced punctuation?\n",

  1956 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1957 		else

  1958 		    cnt_punct++;

  1959 	    }

  1960 	}

  1961     }

  1962     c=g_utf8_get_char(aline);

  1963     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  1964     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  1965     {

  1966 	pc=c;

  1967 	c=nc;

  1968 	nc=g_utf8_get_char(g_utf8_next_char(s));

  1969 	/* for each character in the line after the first */

  1970 	if (c==CHAR_DQUOTE)

  1971 	{

  1972 	    if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&

  1973 	      !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||

  1974 	      !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))

  1975 	    {

  1976 		if (pswit[ECHO_SWITCH])

  1977 		    g_print("\n%s\n",aline);

  1978 		if (!pswit[OVERVIEW_SWITCH])

  1979 		    g_print("    Line %ld column %ld - Unspaced quotes?\n",

  1980 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  1981 		else

  1982 		    cnt_punct++;

  1983 	    }

  1984 	}

  1985     }

  1986     /* Check parity of quotes. */

  1987     nc=g_utf8_get_char(aline);

  1988     for (s=aline;*s;s=g_utf8_next_char(s))

  1989     {

  1990 	c=nc;

  1991 	nc=g_utf8_get_char(g_utf8_next_char(s));

  1992 	if (c==CHAR_DQUOTE)

  1993 	{

  1994 	    parities->dquote=!parities->dquote;

  1995 	    if (!parities->dquote)

  1996 	    {

  1997 		/* parity even */

  1998 		if (!g_utf8_strchr("_-.'`/,;:!?)]} ",-1,nc))

  1999 		{

  2000 		    if (pswit[ECHO_SWITCH])

  2001 			g_print("\n%s\n",aline);

  2002 		    if (!pswit[OVERVIEW_SWITCH])

  2003 			g_print("    Line %ld column %ld - "

  2004 			  "Wrongspaced quotes?\n",

  2005 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2006 		    else

  2007 			cnt_punct++;

  2008 		}

  2009 	    }

  2010 	    else

  2011 	    {

  2012 		/* parity odd */

  2013 		if (!g_unichar_isalpha(nc) && !isdigit(nc) &&

  2014 		  !g_utf8_strchr("_-/.'`([{$",-1,nc) || !nc)

  2015 		{

  2016 		    if (pswit[ECHO_SWITCH])

  2017 			g_print("\n%s\n",aline);

  2018 		    if (!pswit[OVERVIEW_SWITCH])

  2019 			g_print("    Line %ld column %ld - "

  2020 			  "Wrongspaced quotes?\n",

  2021 			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2022 		    else

  2023 			cnt_punct++;

  2024 		}

  2025 	    }

  2026 	}

  2027     }

  2028     if (g_utf8_get_char(aline)==CHAR_DQUOTE)

  2029     {

  2030 	if (g_utf8_strchr(",;:!?)]} ",-1,

  2031 	  g_utf8_get_char(g_utf8_next_char(aline))))

  2032 	{

  2033 	    if (pswit[ECHO_SWITCH])

  2034 		g_print("\n%s\n",aline);

  2035 	    if (!pswit[OVERVIEW_SWITCH])

  2036 		g_print("    Line %ld column 1 - Wrongspaced quotes?\n",

  2037 		  linecnt);

  2038 	    else

  2039 		cnt_punct++;

  2040 	}

  2041     }

  2042     if (pswit[SQUOTE_SWITCH])

  2043     {

  2044 	nc=g_utf8_get_char(aline);

  2045 	for (s=aline;*s;s=g_utf8_next_char(s))

  2046 	{

  2047 	    c=nc;

  2048 	    nc=g_utf8_get_char(g_utf8_next_char(s));

  2049 	    if ((c==CHAR_SQUOTE || c==CHAR_OPEN_SQUOTE) && (s==aline ||

  2050 	      s>aline &&

  2051 	      !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||

  2052 	      !g_unichar_isalpha(nc)))

  2053 	    {

  2054 		parities->squote=!parities->squote;

  2055 		if (!parities->squote)

  2056 		{

  2057 		    /* parity even */

  2058 		    if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))

  2059 		    {

  2060 			if (pswit[ECHO_SWITCH])

  2061 			    g_print("\n%s\n",aline);

  2062 			if (!pswit[OVERVIEW_SWITCH])

  2063 			    g_print("    Line %ld column %ld - "

  2064 			      "Wrongspaced singlequotes?\n",

  2065 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2066 			else

  2067 			    cnt_punct++;

  2068 		    }

  2069 		}

  2070 		else

  2071 		{

  2072 		    /* parity odd */

  2073 		    if (!g_unichar_isalpha(nc) && !isdigit(nc) &&

  2074 		      !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)

  2075 		    {

  2076 			if (pswit[ECHO_SWITCH])

  2077 			    g_print("\n%s\n",aline);

  2078 			if (!pswit[OVERVIEW_SWITCH])

  2079 			    g_print("    Line %ld column %ld - "

  2080 			      "Wrongspaced singlequotes?\n",

  2081 			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2082 			else

  2083 			    cnt_punct++;

  2084 		    }

  2085 		}

  2086 	    }

  2087 	}

  2088     }

  2089 }

  2091 /*

  2092  * check_for_double_punctuation:

  2093  *

  2094  * Look for double punctuation like ,. or ,,

  2095  * Thanks to DW for the suggestion!

  2096  * In books with references, ".," and ".;" are common

  2097  * e.g. "etc., etc.," and vol. 1.; vol 3.;

  2098  * OTOH, from my initial tests, there are also fairly

  2099  * common errors. What to do? Make these cases paranoid?

  2100  * ".," is the most common, so warnings->dotcomma is used

  2101  * to suppress detailed reporting if it occurs often.

  2102  */

  2103 void check_for_double_punctuation(const char *aline,struct warnings *warnings)

  2104 {

  2105     const char *s;

  2106     gunichar c,nc;

  2107     nc=g_utf8_get_char(aline);

  2108     for (s=aline;*s;s=g_utf8_next_char(s))

  2109     {

  2110 	c=nc;

  2111 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2112 	/* for each punctuation character in the line */

  2113 	if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&

  2114 	  g_utf8_strchr(".?!,;:",-1,nc))

  2115 	{

  2116 	    /* followed by punctuation, it's a query, unless . . . */

  2117 	    if (c==nc && (c=='.' || c=='?' || c=='!') ||

  2118 	      !warnings->dotcomma && c=='.' && nc==',' ||

  2119 	      warnings->isFrench && g_str_has_prefix(s,",...") ||

  2120 	      warnings->isFrench && g_str_has_prefix(s,"...,") ||

  2121 	      warnings->isFrench && g_str_has_prefix(s,";...") ||

  2122 	      warnings->isFrench && g_str_has_prefix(s,"...;") ||

  2123 	      warnings->isFrench && g_str_has_prefix(s,":...") ||

  2124 	      warnings->isFrench && g_str_has_prefix(s,"...:") ||

  2125 	      warnings->isFrench && g_str_has_prefix(s,"!...") ||

  2126 	      warnings->isFrench && g_str_has_prefix(s,"...!") ||

  2127 	      warnings->isFrench && g_str_has_prefix(s,"?...") ||

  2128 	      warnings->isFrench && g_str_has_prefix(s,"...?"))

  2129 	    {

  2130 		if (warnings->isFrench && g_str_has_prefix(s,",...") ||

  2131 		  warnings->isFrench && g_str_has_prefix(s,"...,") ||

  2132 		  warnings->isFrench && g_str_has_prefix(s,";...") ||

  2133 		  warnings->isFrench && g_str_has_prefix(s,"...;") ||

  2134 		  warnings->isFrench && g_str_has_prefix(s,":...") ||

  2135 		  warnings->isFrench && g_str_has_prefix(s,"...:") ||

  2136 		  warnings->isFrench && g_str_has_prefix(s,"!...") ||

  2137 		  warnings->isFrench && g_str_has_prefix(s,"...!") ||

  2138 		  warnings->isFrench && g_str_has_prefix(s,"?...") ||

  2139 		  warnings->isFrench && g_str_has_prefix(s,"...?"))

  2140 		{

  2141 		    s+=4;

  2142 		    nc=g_utf8_get_char(g_utf8_next_char(s));

  2143 		}

  2144 		; /* do nothing for .. !! and ?? which can be legit */

  2145 	    }

  2146 	    else

  2147 	    {

  2148 		if (pswit[ECHO_SWITCH])

  2149 		    g_print("\n%s\n",aline);

  2150 		if (!pswit[OVERVIEW_SWITCH])

  2151 		    g_print("    Line %ld column %ld - Double punctuation?\n",

  2152 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2153 		else

  2154 		    cnt_punct++;

  2155 	    }

  2156 	}

  2157     }

  2158 }

  2160 /*

  2161  * check_for_spaced_quotes:

  2162  */

  2163 void check_for_spaced_quotes(const char *aline)

  2164 {

  2165     const char *s,*t;

  2166     s=aline;

  2167     while ((t=strstr(s," \" ")))

  2168     {

  2169 	if (pswit[ECHO_SWITCH])

  2170 	    g_print("\n%s\n",aline);

  2171 	if (!pswit[OVERVIEW_SWITCH])

  2172 	    g_print("    Line %ld column %ld - Spaced doublequote?\n",

  2173 	      linecnt,g_utf8_pointer_to_offset(aline,t)+1);

  2174 	else

  2175 	    cnt_punct++;

  2176 	s=g_utf8_next_char(g_utf8_next_char(t));

  2177     }

  2178     s=aline;

  2179     while ((t=strstr(s," ' ")))

  2180     {

  2181 	if (pswit[ECHO_SWITCH])

  2182 	    g_print("\n%s\n",aline);

  2183 	if (!pswit[OVERVIEW_SWITCH])

  2184 	    g_print("    Line %ld column %ld - Spaced singlequote?\n",

  2185 	      linecnt,g_utf8_pointer_to_offset(aline,t)+1);

  2186 	else

  2187 	    cnt_punct++;

  2188 	s=g_utf8_next_char(g_utf8_next_char(t));

  2189     }

  2190     s=aline;

  2191     while ((t=strstr(s," ` ")))

  2192     {

  2193 	if (pswit[ECHO_SWITCH])

  2194 	    g_print("\n%s\n",aline);

  2195 	if (!pswit[OVERVIEW_SWITCH])

  2196 	    g_print("    Line %ld column %ld - Spaced singlequote?\n",

  2197 	      linecnt,g_utf8_pointer_to_offset(aline,t)+1);

  2198 	else

  2199 	    cnt_punct++;

  2200 	s=g_utf8_next_char(g_utf8_next_char(t));

  2201     }

  2202 }

  2204 /*

  2205  * check_for_miscased_genative:

  2206  *

  2207  * Check special case of 'S instead of 's at end of word.

  2208  */

  2209 void check_for_miscased_genative(const char *aline)

  2210 {

  2211     const char *s;

  2212     gunichar c,nc,pc;

  2213     if (!*aline)

  2214 	return;

  2215     c=g_utf8_get_char(aline);

  2216     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  2217     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  2218     {

  2219 	pc=c;

  2220 	c=nc;

  2221 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2222 	if (c==CHAR_SQUOTE && nc=='S' && g_unichar_islower(pc))

  2223 	{

  2224 	    if (pswit[ECHO_SWITCH])

  2225 		g_print("\n%s\n",aline);

  2226 	    if (!pswit[OVERVIEW_SWITCH])

  2227 		g_print("    Line %ld column %ld - Capital \"S\"?\n",

  2228 		  linecnt,g_utf8_pointer_to_offset(aline,s)+2);

  2229 	    else

  2230 		cnt_punct++;

  2231 	}

  2232     }

  2233 }

  2235 /*

  2236  * check_end_of_line:

  2237  *

  2238  * Now check special cases - start and end of line -

  2239  * for single and double quotes. Start is sometimes [sic]

  2240  * but better to query it anyway.

  2241  * While we're here, check for dash at end of line.

  2242  */

  2243 void check_end_of_line(const char *aline,struct warnings *warnings)

  2244 {

  2245     int lbytes;

  2246     const char *s;

  2247     gunichar c1,c2;

  2248     lbytes=strlen(aline);

  2249     if (g_utf8_strlen(aline,lbytes)>1)

  2250     {

  2251 	s=g_utf8_prev_char(aline+lbytes);

  2252 	c1=g_utf8_get_char(s);

  2253 	c2=g_utf8_get_char(g_utf8_prev_char(s));

  2254 	if ((c1==CHAR_DQUOTE || c1==CHAR_SQUOTE || c1==CHAR_OPEN_SQUOTE) &&

  2255 	  c2==CHAR_SPACE)

  2256 	{

  2257 	    if (pswit[ECHO_SWITCH])

  2258 		g_print("\n%s\n",aline);

  2259 	    if (!pswit[OVERVIEW_SWITCH])

  2260 		g_print("    Line %ld column %ld - Spaced quote?\n",linecnt,

  2261 		  g_utf8_strlen(aline,lbytes));

  2262 	    else

  2263 		cnt_punct++;

  2264 	}

  2265 	c1=g_utf8_get_char(aline);

  2266 	c2=g_utf8_get_char(g_utf8_next_char(aline));

  2267 	if ((c1==CHAR_SQUOTE || c1==CHAR_OPEN_SQUOTE) && c2==CHAR_SPACE)

  2268 	{

  2269 	    if (pswit[ECHO_SWITCH])

  2270 		g_print("\n%s\n",aline);

  2271 	    if (!pswit[OVERVIEW_SWITCH])

  2272 		g_print("    Line %ld column 1 - Spaced quote?\n",linecnt);

  2273 	    else

  2274 		cnt_punct++;

  2275 	}

  2276 	/*

  2277 	 * Dash at end of line may well be legit - paranoid mode only

  2278 	 * and don't report em-dash at line-end.

  2279 	 */

  2280 	if (pswit[PARANOID_SWITCH] && warnings->hyphen)

  2281 	{

  2282 	    for (s=g_utf8_prev_char(aline+lbytes);

  2283 	      s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))

  2284 		;

  2285 	    if (g_utf8_get_char(s)=='-' &&

  2286 	      g_utf8_get_char(g_utf8_prev_char(s))!='-')

  2287 	    {

  2288 		if (pswit[ECHO_SWITCH])

  2289 		    g_print("\n%s\n",aline);

  2290 		if (!pswit[OVERVIEW_SWITCH])

  2291 		    g_print("    Line %ld column %ld - "

  2292 		      "Hyphen at end of line?\n",

  2293 		      linecnt,g_utf8_pointer_to_offset(aline,s));

  2294 	    }

  2295 	}

  2296     }

  2297 }

  2299 /*

  2300  * check_for_unspaced_bracket:

  2301  *

  2302  * Brackets are often unspaced, but shouldn't be surrounded by alpha.

  2303  * If so, suspect a scanno like "a]most".

  2304  */

  2305 void check_for_unspaced_bracket(const char *aline)

  2306 {

  2307     const char *s;

  2308     gunichar c,nc,pc;

  2309     c=g_utf8_get_char(aline);

  2310     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  2311     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  2312     {

  2313 	pc=c;

  2314 	c=nc;

  2315 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2316 	if (!nc)

  2317 	    break;

  2318 	/* for each bracket character in the line except 1st & last */

  2319 	if (g_utf8_strchr("{[()]}",-1,c) &&

  2320 	  g_unichar_isalpha(pc) && g_unichar_isalpha(nc))

  2321 	{

  2322 	    if (pswit[ECHO_SWITCH])

  2323 		g_print("\n%s\n",aline);

  2324 	    if (!pswit[OVERVIEW_SWITCH])

  2325 		g_print("    Line %ld column %ld - Unspaced bracket?\n",

  2326 		  linecnt,g_utf8_pointer_to_offset(aline,s));

  2327 	    else

  2328 		cnt_punct++;

  2329 	}

  2330     }

  2331 }

  2333 /*

  2334  * check_for_unpunctuated_endquote:

  2335  */

  2336 void check_for_unpunctuated_endquote(const char *aline)

  2337 {

  2338     const char *s;

  2339     gunichar c,nc,pc;

  2340     c=g_utf8_get_char(aline);

  2341     nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;

  2342     for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))

  2343     {

  2344 	pc=c;

  2345 	c=nc;

  2346 	nc=g_utf8_get_char(g_utf8_next_char(s));

  2347 	/* for each character in the line except 1st */

  2348 	if (c==CHAR_DQUOTE && isalpha(pc))

  2349 	{

  2350 	    if (pswit[ECHO_SWITCH])

  2351 		g_print("\n%s\n",aline);

  2352 	    if (!pswit[OVERVIEW_SWITCH])

  2353 		g_print("    Line %ld column %ld - "

  2354 		  "endquote missing punctuation?\n",

  2355 		  linecnt,g_utf8_pointer_to_offset(aline,s));

  2356 	    else

  2357 		cnt_punct++;

  2358 	}

  2359     }

  2360 }

  2362 /*

  2363  * check_for_html_tag:

  2364  *

  2365  * Check for <HTML TAG>.

  2366  *

  2367  * If there is a < in the line, followed at some point

  2368  * by a > then we suspect HTML.

  2369  */

  2370 void check_for_html_tag(const char *aline)

  2371 {

  2372     const char *open,*close;

  2373     gchar *tag;

  2374     open=strchr(aline,'<');

  2375     if (open)

  2376     {

  2377 	close=strchr(g_utf8_next_char(open),'>');

  2378 	if (close)

  2379 	{

  2380 	    if (pswit[ECHO_SWITCH])

  2381 		g_print("\n%s\n",aline);

  2382 	    if (!pswit[OVERVIEW_SWITCH])

  2383 	    {

  2384 		tag=g_strndup(open,close-open+1);

  2385 		g_print("    Line %ld column %ld - HTML Tag? %s \n",

  2386 		  linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);

  2387 		g_free(tag);

  2388 	    }

  2389 	    else

  2390 		cnt_html++;

  2391 	}

  2392     }

  2393 }

  2395 /*

  2396  * check_for_html_entity:

  2397  *

  2398  * Check for &symbol; HTML.

  2399  *

  2400  * If there is a & in the line, followed at

  2401  * some point by a ; then we suspect HTML.

  2402  */

  2403 void check_for_html_entity(const char *aline)

  2404 {

  2405     const char *s,*amp,*scolon;

  2406     gchar *entity;

  2407     amp=strchr(aline,'&');

  2408     if (amp)

  2409     {

  2410 	scolon=strchr(amp,';');

  2411 	if (scolon)

  2412 	{

  2413 	    for (s=amp;s<scolon;s=g_utf8_next_char(s))

  2414 		if (g_utf8_get_char(s)==CHAR_SPACE)

  2415 		    break;		/* Don't report "Jones & Son;" */

  2416 	    if (s>=scolon)

  2417 	    {

  2418 		if (pswit[ECHO_SWITCH])

  2419 		    g_print("\n%s\n",aline);

  2420 		if (!pswit[OVERVIEW_SWITCH])

  2421 		{

  2422 		    entity=g_strndup(amp,scolon-amp+1);

  2423 		    g_print("    Line %ld column %d - HTML symbol? %s \n",

  2424 		      linecnt,(int)(amp-aline)+1,entity);

  2425 		    g_free(entity);

  2426 		}

  2427 		else

  2428 		    cnt_html++;

  2429 	    }

  2430 	}

  2431     }

  2432 }

  2434 /*

  2435  * print_pending:

  2436  *

  2437  * If we are in a state of unbalanced quotes, and this line

  2438  * doesn't begin with a quote, output the stored error message.

  2439  * If the -P switch was used, print the warning even if the

  2440  * new para starts with quotes.

  2441  */

  2442 void print_pending(const char *aline,const char *parastart,

  2443   struct pending *pending)

  2444 {

  2445     const char *s;

  2446     gunichar c;

  2447     s=aline;

  2448     while (*s==' ')

  2449 	s++;

  2450     c=g_utf8_get_char(s);

  2451     if (pending->dquote)

  2452     {

  2453 	if (c!=CHAR_DQUOTE || pswit[QPARA_SWITCH])

  2454 	{

  2455 	    if (!pswit[OVERVIEW_SWITCH])

  2456 	    {

  2457 		if (pswit[ECHO_SWITCH])

  2458 		    g_print("\n%s\n",parastart);

  2459 		g_print("%s\n",pending->dquote);

  2460 	    }

  2461 	    else

  2462 		cnt_dquot++;

  2463 	}

  2464 	g_free(pending->dquote);

  2465 	pending->dquote=NULL;

  2466     }

  2467     if (pending->squote)

  2468     {

  2469 	if (c!=CHAR_SQUOTE && c!=CHAR_OPEN_SQUOTE || pswit[QPARA_SWITCH] ||

  2470 	  pending->squot)

  2471 	{

  2472 	    if (!pswit[OVERVIEW_SWITCH])

  2473 	    {

  2474 		if (pswit[ECHO_SWITCH])

  2475 		    g_print("\n%s\n",parastart);

  2476 		g_print("%s\n",pending->squote);

  2477 	    }

  2478 	    else

  2479 		cnt_squot++;

  2480 	}

  2481 	g_free(pending->squote);

  2482 	pending->squote=NULL;

  2483     }

  2484     if (pending->rbrack)

  2485     {

  2486 	if (!pswit[OVERVIEW_SWITCH])

  2487 	{

  2488 	    if (pswit[ECHO_SWITCH])

  2489 		g_print("\n%s\n",parastart);

  2490 	    g_print("%s\n",pending->rbrack);

  2491 	}

  2492 	else

  2493 	    cnt_brack++;

  2494 	g_free(pending->rbrack);

  2495 	pending->rbrack=NULL;

  2496     }

  2497     if (pending->sbrack)

  2498     {

  2499 	if (!pswit[OVERVIEW_SWITCH])

  2500 	{

  2501 	    if (pswit[ECHO_SWITCH])

  2502 		g_print("\n%s\n",parastart);

  2503 	    g_print("%s\n",pending->sbrack);

  2504 	}

  2505 	else

  2506 	    cnt_brack++;

  2507 	g_free(pending->sbrack);

  2508 	pending->sbrack=NULL;

  2509     }

  2510     if (pending->cbrack)

  2511     {

  2512 	if (!pswit[OVERVIEW_SWITCH])

  2513 	{

  2514 	    if (pswit[ECHO_SWITCH])

  2515 		g_print("\n%s\n",parastart);

  2516 	    g_print("%s\n",pending->cbrack);

  2517 	}

  2518 	else

  2519 	    cnt_brack++;

  2520 	g_free(pending->cbrack);

  2521 	pending->cbrack=NULL;

  2522     }

  2523     if (pending->unders)

  2524     {

  2525 	if (!pswit[OVERVIEW_SWITCH])

  2526 	{

  2527 	    if (pswit[ECHO_SWITCH])

  2528 		g_print("\n%s\n",parastart);

  2529 	    g_print("%s\n",pending->unders);

  2530 	}

  2531 	else

  2532 	    cnt_brack++;

  2533 	g_free(pending->unders);

  2534 	pending->unders=NULL;

  2535     }

  2536 }

  2538 /*

  2539  * check_for_mismatched_quotes:

  2540  *

  2541  * At end of paragraph, check for mismatched quotes.

  2542  *

  2543  * We don't want to report an error immediately, since it is a

  2544  * common convention to omit the quotes at end of paragraph if

  2545  * the next paragraph is a continuation of the same speaker.

  2546  * Where this is the case, the next para should begin with a

  2547  * quote, so we store the warning message and only display it

  2548  * at the top of the next iteration if the new para doesn't

  2549  * start with a quote.

  2550  * The -p switch overrides this default, and warns of unclosed

  2551  * quotes on _every_ paragraph, whether the next begins with a

  2552  * quote or not.

  2553  */

  2554 void check_for_mismatched_quotes(const struct counters *counters,

  2555   struct pending *pending)

  2556 {

  2557     if (counters->quot%2)

  2558 	pending->dquote=

  2559 	  g_strdup_printf("    Line %ld - Mismatched quotes",linecnt);

  2560     if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&

  2561       counters->open_single_quote!=counters->close_single_quote)

  2562 	pending->squote=

  2563 	  g_strdup_printf("    Line %ld - Mismatched singlequotes?",linecnt);

  2564     if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&

  2565       counters->open_single_quote!=counters->close_single_quote &&

  2566       counters->open_single_quote!=counters->close_single_quote+1)

  2567 	/*

  2568 	 * Flag it to be noted regardless of the

  2569 	 * first char of the next para.

  2570 	 */

  2571 	pending->squot=1;

  2572     if (counters->r_brack)

  2573 	pending->rbrack=

  2574 	  g_strdup_printf("    Line %ld - Mismatched round brackets?",linecnt);

  2575     if (counters->s_brack)

  2576 	pending->sbrack=

  2577 	  g_strdup_printf("    Line %ld - Mismatched square brackets?",linecnt);

  2578     if (counters->c_brack)

  2579 	pending->cbrack=

  2580 	  g_strdup_printf("    Line %ld - Mismatched curly brackets?",linecnt);

  2581     if (counters->c_unders%2)

  2582 	pending->unders=

  2583 	  g_strdup_printf("    Line %ld - Mismatched underscores?",linecnt);

  2584 }

  2586 /*

  2587  * check_for_omitted_punctuation:

  2588  *

  2589  * Check for omitted punctuation at end of paragraph by working back

  2590  * through prevline. DW.

  2591  * Need to check this only for "normal" paras.

  2592  * So what is a "normal" para?

  2593  *    Not normal if one-liner (chapter headings, etc.)

  2594  *    Not normal if doesn't contain at least one locase letter

  2595  *    Not normal if starts with space

  2596  */

  2597 void check_for_omitted_punctuation(const char *prevline,

  2598   struct line_properties *last,int start_para_line)

  2599 {

  2600     gboolean letter_on_line=FALSE;

  2601     const char *s;

  2602     for (s=prevline;*s;s=g_utf8_next_char(s))

  2603 	if (g_unichar_isalpha(g_utf8_get_char(s)))

  2604 	{

  2605 	    letter_on_line=TRUE;

  2606 	    break;

  2607 	}

  2608     /*

  2609      * This next "if" is a problem.

  2610      * If we say "start_para_line <= linecnt - 1", that includes

  2611      * one-line "paragraphs" like chapter heads. Lotsa false positives.

  2612      * If we say "start_para_line < linecnt - 1" it doesn't, but then it

  2613      * misses genuine one-line paragraphs.

  2614      */

  2615     if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&

  2616       g_utf8_get_char(prevline)>CHAR_SPACE)

  2617     {

  2618 	for (s=g_utf8_prev_char(prevline+strlen(prevline));

  2619 	  (g_utf8_get_char(s)==CHAR_DQUOTE ||

  2620 	  g_utf8_get_char(s)==CHAR_SQUOTE) &&

  2621 	  g_utf8_get_char(s)>CHAR_SPACE && s>prevline;

  2622 	  s=g_utf8_prev_char(s))

  2623 	    ;

  2624 	for (;s>prevline;s=g_utf8_prev_char(s))

  2625 	{

  2626 	    if (g_unichar_isalpha(g_utf8_get_char(s)))

  2627 	    {

  2628 		if (pswit[ECHO_SWITCH])

  2629 		    g_print("\n%s\n",prevline);

  2630 		if (!pswit[OVERVIEW_SWITCH])

  2631 		    g_print("    Line %ld column %ld - "

  2632 		      "No punctuation at para end?\n",

  2633 		      linecnt-1,g_utf8_strlen(prevline,-1));

  2634 		else

  2635 		    cnt_punct++;

  2636 		break;

  2637 	    }

  2638 	    if (g_utf8_strchr("-.:!([{?}])",-1,g_utf8_get_char(s)))

  2639 		break;

  2640 	}

  2641     }

  2642 }

  2644 gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)

  2645 {

  2646     const char *word=key;

  2647     int *dupcnt=value;

  2648     if (*dupcnt)

  2649 	g_print("\nNote: Queried word %s was duplicated %d times\n",

  2650 	  word,*dupcnt);

  2651     return FALSE;

  2652 }

  2654 void print_as_windows_1252(const char *string)

  2655 {

  2656     gsize inbytes,outbytes;

  2657     gchar *buf,*bp;

  2658     GIConv converter=(GIConv)-1;

  2659     if (!string)

  2660     {

  2661 	if (converter!=(GIConv)-1)

  2662 	    g_iconv_close(converter);

  2663 	converter=(GIConv)-1;

  2664 	return;

  2665     }

  2666     if (converter=(GIConv)-1)

  2667 	converter=g_iconv_open("WINDOWS-1252","UTF-8");

  2668     if (converter!=(GIConv)-1)

  2669     {

  2670 	inbytes=outbytes=strlen(string);

  2671 	bp=buf=g_malloc(outbytes+1);

  2672 	g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);

  2673 	*bp='\0';

  2674 	fputs(buf,stdout);

  2675 	g_free(buf);

  2676     }

  2677     else

  2678 	fputs(string,stdout);

  2679 }

  /*
   * print_as_utf_8:
   *
   * Write "string" to stdout exactly as given, with no conversion.
   * NOTE(review): presumably the UTF-8 counterpart of
   * print_as_windows_1252() for use as a print handler - confirm at the
   * call site, which is outside this part of the file.
   */
  void print_as_utf_8(const char *string)
  {
      (void)fputs(string,stdout);
  }

  2686 /*

  2687  * procfile:

  2688  *

  2689  * Process one file.

  2690  */

  2691 void procfile(const char *filename)

  2692 {

  2693     const char *s;

  2694     gchar *parastart=NULL;	/* first line of current para */

  2695     gchar *etext,*aline;

  2696     gchar *etext_ptr;

  2697     GError *err=NULL;

  2698     struct first_pass_results *first_pass_results;

  2699     struct warnings *warnings;

  2700     struct counters counters={0};

  2701     struct line_properties last={0};

  2702     struct parities parities={0};

  2703     struct pending pending={0};

  2704     gboolean isemptyline;

  2705     long start_para_line=0;

  2706     gboolean isnewpara=FALSE,enddash=FALSE;

  2707     last.start=CHAR_SPACE;

  2708     linecnt=checked_linecnt=0;

  2709     etext=read_etext(filename,&err);

  2710     if (!etext)

  2711     {

  2712 	if (pswit[STDOUT_SWITCH])

  2713 	    fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);

  2714 	else

  2715 	    fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);

  2716 	exit(1);

  2717     }

  2718     g_print("\n\nFile: %s\n\n",filename);

  2719     first_pass_results=first_pass(etext);

  2720     warnings=report_first_pass(first_pass_results);

  2721     qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);

  2722     qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);

  2723     /*

  2724      * Here we go with the main pass. Hold onto yer hat!

  2725      */

  2726     linecnt=0;

  2727     etext_ptr=etext;

  2728     while ((aline=flgets(&etext_ptr,linecnt+1)))

  2729     {

  2730 	linecnt++;

  2731 	if (linecnt==1)

  2732 	    isnewpara=TRUE;

  2733 	if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))

  2734 	    continue;    // skip DP page separators completely

  2735 	if (linecnt<first_pass_results->firstline ||

  2736 	  (first_pass_results->footerline>0 &&

  2737 	  linecnt>first_pass_results->footerline))

  2738 	{

  2739 	    if (pswit[HEADER_SWITCH])

  2740 	    {

  2741 		if (g_str_has_prefix(aline,"Title:"))

  2742 		    g_print("    %s\n",aline);

  2743 		if (g_str_has_prefix(aline,"Author:"))

  2744 		    g_print("    %s\n",aline);

  2745 		if (g_str_has_prefix(aline,"Release Date:"))

  2746 		    g_print("    %s\n",aline);

  2747 		if (g_str_has_prefix(aline,"Edition:"))

  2748 		    g_print("    %s\n\n",aline);

  2749 	    }

  2750 	    continue;		/* skip through the header */

  2751 	}

  2752 	checked_linecnt++;

  2753 	print_pending(aline,parastart,&pending);

  2754 	memset(&pending,0,sizeof(pending));

  2755 	isemptyline=analyse_quotes(aline,&counters);

  2756 	if (isnewpara && !isemptyline)

  2757 	{

  2758 	    /* This line is the start of a new paragraph. */

  2759 	    start_para_line=linecnt;

  2760 	    /* Capture its first line in case we want to report it later. */

  2761 	    g_free(parastart);

  2762 	    parastart=g_strdup(aline);

  2763 	    memset(&parities,0,sizeof(parities));  /* restart the quote count */

  2764 	    s=aline;

  2765 	    while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&

  2766 	      !g_unichar_isdigit(g_utf8_get_char(s)))

  2767 		s=g_utf8_next_char(s);

  2768 	    if (g_unichar_islower(g_utf8_get_char(s)))

  2769 	    {

  2770 		/* and its first letter is lowercase */

  2771 		if (pswit[ECHO_SWITCH])

  2772 		    g_print("\n%s\n",aline);

  2773 		if (!pswit[OVERVIEW_SWITCH])

  2774 		    g_print("    Line %ld column %ld - "

  2775 		      "Paragraph starts with lower-case\n",

  2776 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);

  2777 		else

  2778 		    cnt_punct++;

  2779 	    }

  2780 	    isnewpara=FALSE; /* Signal the end of new para processing. */

  2781 	}

  2782 	/* Check for an em-dash broken at line end. */

  2783 	if (enddash && g_utf8_get_char(aline)=='-')

  2784 	{

  2785 	    if (pswit[ECHO_SWITCH])

  2786 		g_print("\n%s\n",aline);

  2787 	    if (!pswit[OVERVIEW_SWITCH])

  2788 		g_print("    Line %ld column 1 - Broken em-dash?\n",linecnt);

  2789 	    else

  2790 		cnt_punct++;

  2791 	}

  2792 	enddash=FALSE;

  2793 	for (s=g_utf8_prev_char(aline+strlen(aline));

  2794 	  g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))

  2795 	    ;

  2796 	if (s>=aline && g_utf8_get_char(s)=='-')

  2797 	    enddash=TRUE;

  2798 	check_for_control_characters(aline);

  2799 	if (warnings->bin)

  2800 	    check_for_odd_characters(aline,warnings,isemptyline);

  2801 	if (warnings->longline)

  2802 	    check_for_long_line(aline);

  2803 	if (warnings->shortline)

  2804 	    check_for_short_line(aline,&last);

  2805 	last.blen=last.len;

  2806 	last.len=g_utf8_strlen(aline,-1);

  2807 	last.start=g_utf8_get_char(aline);

  2808 	check_for_starting_punctuation(aline);

  2809 	if (warnings->dash)

  2810 	{

  2811 	    check_for_spaced_emdash(aline);

  2812 	    check_for_spaced_dash(aline);

  2813 	}

  2814 	check_for_unmarked_paragraphs(aline);

  2815 	check_for_jeebies(aline);

  2816 	check_for_mta_from(aline);

  2817 	check_for_orphan_character(aline);

  2818 	check_for_pling_scanno(aline);

  2819 	check_for_extra_period(aline,warnings);

  2820 	check_for_following_punctuation(aline);

  2821 	check_for_typos(aline,warnings);

  2822 	check_for_misspaced_punctuation(aline,&parities,isemptyline);

  2823 	check_for_double_punctuation(aline,warnings);

  2824 	check_for_spaced_quotes(aline);

  2825 	check_for_miscased_genative(aline);

  2826 	check_end_of_line(aline,warnings);

  2827 	check_for_unspaced_bracket(aline);

  2828 	if (warnings->endquote)

  2829 	    check_for_unpunctuated_endquote(aline);

  2830 	check_for_html_tag(aline);

  2831 	check_for_html_entity(aline);

  2832 	if (isemptyline)

  2833 	{

  2834 	    check_for_mismatched_quotes(&counters,&pending);

  2835 	    memset(&counters,0,sizeof(counters));

  2836 	    /* let the next iteration know that it's starting a new para */

  2837 	    isnewpara=TRUE;

  2838 	    if (prevline)

  2839 		check_for_omitted_punctuation(prevline,&last,start_para_line);

  2840 	}

  2841 	g_free(prevline);

  2842 	prevline=g_strdup(aline);

  2843     }

  2844     if (prevline)

  2845     {

  2846 	g_free(prevline);

  2847 	prevline=NULL;

  2848     }

  2849     g_free(parastart);

  2850     g_free(prevline);

  2851     g_free(etext);

  2852     if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH])

  2853 	g_tree_foreach(qword,report_duplicate_queries,NULL);

  2854     g_tree_unref(qword);

  2855     g_tree_unref(qperiod);

  2856     g_set_print_handler(NULL);

  2857     print_as_windows_1252(NULL);

  2858     if (pswit[MARKUP_SWITCH])

  2859 	loseentities(NULL);

  2860 }

  2862 /*

  2863  * flgets:

  2864  *

  2865  * Get one line from the input text, checking for

  2866  * the existence of exactly one CR/LF line-end per line.

  2867  *

  2868  * Returns: a pointer to the line.

  2869  */

  2870 char *flgets(char **etext,long lcnt)

  2871 {

  2872     gunichar c;

  2873     gboolean isCR=FALSE;

  2874     char *theline=*etext;

  2875     char *eos=theline;

  2876     gchar *s;

  2877     for (;;)

  2878     {

  2879 	c=g_utf8_get_char(*etext);

  2880 	*etext=g_utf8_next_char(*etext);

  2881 	if (!c)

  2882 	    return NULL;

  2883 	/* either way, it's end of line */

  2884 	if (c=='\n')

  2885 	{

  2886 	    if (isCR)

  2887 		break;

  2888 	    else

  2889 	    {

  2890 		/* Error - a LF without a preceding CR */

  2891 		if (pswit[LINE_END_SWITCH])

  2892 		{

  2893 		    if (pswit[ECHO_SWITCH])

  2894 		    {

  2895 			s=g_strndup(theline,eos-theline);

  2896 			g_print("\n%s\n",s);

  2897 			g_free(s);

  2898 		    }

  2899 		    if (!pswit[OVERVIEW_SWITCH])

  2900 			g_print("    Line %ld - No CR?\n",lcnt);

  2901 		    else

  2902 			cnt_lineend++;

  2903 		}

  2904 		break;

  2905 	    }

  2906 	}

  2907 	if (c=='\r')

  2908 	{

  2909 	    if (isCR)

  2910 	    {

  2911 		/* Error - two successive CRs */

  2912 		if (pswit[LINE_END_SWITCH])

  2913 		{

  2914 		    if (pswit[ECHO_SWITCH])

  2915 		    {

  2916 			s=g_strndup(theline,eos-theline);

  2917 			g_print("\n%s\n",s);

  2918 			g_free(s);

  2919 		    }

  2920 		    if (!pswit[OVERVIEW_SWITCH])

  2921 			g_print("    Line %ld - Two successive CRs?\n",lcnt);

  2922 		    else

  2923 			cnt_lineend++;

  2924 		}

  2925 	    }

  2926 	    isCR=TRUE;

  2927 	}

  2928 	else

  2929 	{

  2930 	    if (pswit[LINE_END_SWITCH] && isCR)

  2931 	    {

  2932 		if (pswit[ECHO_SWITCH])

  2933 		{

  2934 		    s=g_strndup(theline,eos-theline);

  2935 		    g_print("\n%s\n",s);

  2936 		    g_free(s);

  2937 		}

  2938 		if (!pswit[OVERVIEW_SWITCH])

  2939 		    g_print("    Line %ld column %ld - CR without LF?\n",

  2940 		      lcnt,g_utf8_pointer_to_offset(theline,eos)+1);

  2941 		else

  2942 		    cnt_lineend++;

  2943 		*eos=' ';

  2944 	    }

  2945 	    isCR=FALSE;

  2946 	    eos=g_utf8_next_char(eos);

  2947 	}

  2948     }

  2949     *eos='\0';

  2950     if (pswit[MARKUP_SWITCH])

  2951 	postprocess_for_HTML(theline);

  2952     if (pswit[DP_SWITCH])

  2953 	postprocess_for_DP(theline);

  2954     return theline;

  2955 }

  2957 /*

  2958  * mixdigit:

  2959  *

  2960  * Takes a "word" as a parameter, and checks whether it

  2961  * contains a mixture of alpha and digits. Generally, this is an

  2962  * error, but may not be for cases like 4th or L5 12s. 3d.

  2963  *

  2964  * Returns: TRUE iff an is error found.

  2965  */

  2966 gboolean mixdigit(const char *checkword)

  2967 {

  2968     gboolean wehaveadigit,wehavealetter,query;

  2969     const char *s,*nondigit;

  2970     wehaveadigit=wehavealetter=query=FALSE;

  2971     for (s=checkword;*s;s=g_utf8_next_char(s))

  2972 	if (g_unichar_isalpha(g_utf8_get_char(s)))

  2973 	    wehavealetter=TRUE;

  2974 	else if (g_unichar_isdigit(g_utf8_get_char(s)))

  2975 	    wehaveadigit=TRUE;

  2976     if (wehaveadigit && wehavealetter)

  2977     {

  2978 	/* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */

  2979 	query=TRUE;

  2980 	for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));

  2981 	  nondigit=g_utf8_next_char(nondigit))

  2982 	    ;

  2983 	/* digits, ending in st, rd, nd, th of either case */

  2984 	if (!g_ascii_strcasecmp(nondigit,"st") ||

  2985 	  !g_ascii_strcasecmp(nondigit,"rd") ||

  2986 	  !g_ascii_strcasecmp(nondigit,"nd") ||

  2987 	  !g_ascii_strcasecmp(nondigit,"th"))

  2988 	    query=FALSE;

  2989 	if (!g_ascii_strcasecmp(nondigit,"sts") ||

  2990 	  !g_ascii_strcasecmp(nondigit,"rds") ||

  2991 	  !g_ascii_strcasecmp(nondigit,"nds") ||

  2992 	  !g_ascii_strcasecmp(nondigit,"ths"))

  2993 	    query=FALSE;

  2994 	if (!g_ascii_strcasecmp(nondigit,"stly") ||

  2995 	  !g_ascii_strcasecmp(nondigit,"rdly") ||

  2996 	  !g_ascii_strcasecmp(nondigit,"ndly") ||

  2997 	  !g_ascii_strcasecmp(nondigit,"thly"))

  2998 	    query=FALSE;

  2999 	/* digits, ending in l, L, s or d */

  3000 	if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||

  3001 	  !strcmp(nondigit,"d"))

  3002 	    query=FALSE;

  3003 	/*

  3004 	 * L at the start of a number, representing Britsh pounds, like L500.

  3005 	 * This is cute. We know the current word is mixed digit. If the first

  3006 	 * letter is L, there must be at least one digit following. If both

  3007 	 * digits and letters follow, we have a genuine error, else we have a

  3008 	 * capital L followed by digits, and we accept that as a non-error.

  3009 	 */

  3010 	if (g_utf8_get_char(checkword)=='L' &&

  3011 	  !mixdigit(g_utf8_next_char(checkword)))

  3012 	    query=FALSE;

  3013     }

  3014     return query;

  3015 }

  3017 /*

  3018  * getaword:

  3019  *

  3020  * Extracts the first/next "word" from the line, and returns it.

  3021  * A word is defined as one English word unit--or at least that's the aim.

  3022  * "ptr" is advanced to the position in the line where we will start

  3023  * looking for the next word.

  3024  *

  3025  * Returns: A newly-allocated string.

  3026  */

  3027 gchar *getaword(const char **ptr)

  3028 {

  3029     const char *s,*t;

  3030     GString *word;

  3031     gunichar c,pc;

  3032     word=g_string_new(NULL);

  3033     for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&

  3034       !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&

  3035       **ptr;*ptr=g_utf8_next_char(*ptr))

  3036 	;

  3037     /*

  3038      * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.

  3039      * Especially yucky is the case of L1,000

  3040      * This section looks for a pattern of characters including a digit

  3041      * followed by a comma or period followed by one or more digits.

  3042      * If found, it returns this whole pattern as a word; otherwise we discard

  3043      * the results and resume our normal programming.

  3044      */

  3045     s=*ptr;

  3046     for (;g_unichar_isdigit(g_utf8_get_char(s)) ||

  3047       g_unichar_isalpha(g_utf8_get_char(s)) ||

  3048       g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))

  3049 	g_string_append_unichar(word,g_utf8_get_char(s));

  3050     for (t=g_utf8_next_char(word->str);*g_utf8_next_char(t);

  3051       t=g_utf8_next_char(t))

  3052     {

  3053 	c=g_utf8_get_char(t);

  3054 	pc=g_utf8_get_char(g_utf8_prev_char(t));

  3055 	if ((c=='.' || c==',') && g_unichar_isdigit(pc))

  3056 	{

  3057 	    *ptr=s;

  3058 	    return g_string_free(word,FALSE);

  3059 	}

  3060     }

  3061     /* we didn't find a punctuated number - do the regular getword thing */

  3062     g_string_truncate(word,0);

  3063     for (;g_unichar_isdigit(g_utf8_get_char(*ptr)) ||

  3064       g_unichar_isalpha(g_utf8_get_char(*ptr)) ||

  3065       g_utf8_get_char(*ptr)=='\'';*ptr=g_utf8_next_char(*ptr))

  3066 	g_string_append_unichar(word,g_utf8_get_char(*ptr));

  3067     return g_string_free(word,FALSE);

  3068 }

  3070 /*

  3071  * isroman:

  3072  *

  3073  * Is this word a Roman Numeral?

  3074  *

  3075  * It doesn't actually validate that the number is a valid Roman Numeral--for

  3076  * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not

  3077  * what we're here to do. If it passes this, it LOOKS like a Roman numeral.

  3078  * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or

  3079  * expressions thereof, except when it came to taxes. Allow any number of M,

  3080  * an optional D, an optional CM or CD, any number of optional Cs, an optional

  3081  * XL or an optional XC, an optional IX or IV, an optional V and any number

  3082  * of optional Is.

  3083  */

  3084 gboolean isroman(const char *t)

  3085 {

  3086     const char *s;

  3087     if (!t || !*t)

  3088 	return FALSE;

  3089     s=t;

  3090     while (g_utf8_get_char(t)=='m' && *t)

  3091 	t++;

  3092     if (g_utf8_get_char(t)=='d')

  3093 	t++;

  3094     if (g_str_has_prefix(t,"cm"))

  3095 	t+=2;

  3096     if (g_str_has_prefix(t,"cd"))

  3097 	t+=2;

  3098     while (g_utf8_get_char(t)=='c' && *t)

  3099 	t++;

  3100     if (g_str_has_prefix(t,"xl"))

  3101 	t+=2;

  3102     if (g_str_has_prefix(t,"xc"))

  3103 	t+=2;

  3104     if (g_utf8_get_char(t)=='l')

  3105 	t++;

  3106     while (g_utf8_get_char(t)=='x' && *t)

  3107 	t++;

  3108     if (g_str_has_prefix(t,"ix"))

  3109 	t+=2;

  3110     if (g_str_has_prefix(t,"iv"))

  3111 	t+=2;

  3112     if (g_utf8_get_char(t)=='v')

  3113 	t++;

  3114     while (g_utf8_get_char(t)=='i' && *t)

  3115 	t++;

  3116     return !*t;

  3117 }

  3119 /*

  3120  * postprocess_for_DP:

  3121  *

  3122  * Invoked with the -d switch from flgets().

  3123  * It simply "removes" from the line a hard-coded set of common

  3124  * DP-specific tags, so that the line passed to the main routine has

  3125  * been pre-cleaned of DP markup.

  3126  */

  3127 void postprocess_for_DP(char *theline)

  3128 {

  3129     char *s,*t;

  3130     int i;

  3131     if (!*theline)

  3132 	return;

  3133     for (i=0;*DPmarkup[i];i++)

  3134 	while ((s=strstr(theline,DPmarkup[i])))

  3135 	{

  3136 	    t=s+strlen(DPmarkup[i]);

  3137 	    memmove(s,t,strlen(t)+1);

  3138 	}

  3139 }

  /*
   * postprocess_for_HTML:
   *
   * Called from flgets() when MARKUP_SWITCH is set.
   * Calls losemarkup() on the line until it reports that it removed
   * nothing (returns NULL), then lets loseentities() replace HTML entities,
   * so that the line passed to the main routine has been pre-cleaned of
   * HTML. The line is edited in place.
   */
  void postprocess_for_HTML(char *theline)
  {
      char *removed;
      do
          removed=losemarkup(theline);
      while (removed!=NULL);
      loseentities(theline);
  }

  3157 char *losemarkup(char *theline)

  3158 {

  3159     char *s,*t;

  3160     int i;

  3161     s=strchr(theline,'<');

  3162     t=s?strchr(s,'>'):NULL;

  3163     if (!s || !t)

  3164 	return NULL;

  3165     for (i=0;*markup[i];i++)

  3166 	if (tagcomp(g_utf8_next_char(s),markup[i]))

  3167 	{

  3168 	    t=g_utf8_next_char(t);

  3169 	    memmove(s,t,strlen(t)+1);

  3170 	    return s;

  3171 	}

  3172     /* It's an unrecognized <xxx>. */

  3173     return NULL;

  3174 }

  3176 void loseentities(char *theline)

  3177 {

  3178     int i;

  3179     gsize nb;

  3180     char *amp,*scolon;

  3181     gchar *s,*t;

  3182     gunichar c;

  3183     GTree *entities=NULL;

  3184     GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;

  3185     if (!theline)

  3186     {

  3187 	if (entities)

  3188 	    g_tree_destroy(entities);

  3189 	entities=NULL;

  3190 	if (translit==(GIConv)-1)

  3191 	    g_iconv_close(translit);

  3192 	translit=(GIConv)-1;

  3193 	if (to_utf8==(GIConv)-1)

  3194 	    g_iconv_close(to_utf8);

  3195 	to_utf8=(GIConv)-1;

  3196 	return;

  3197     }

  3198     if (!*theline)

  3199 	return;

  3200     if (!entities)

  3201     {

  3202 	entities=g_tree_new((GCompareFunc)strcmp);

  3203 	for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)

  3204 	    g_tree_insert(entities,HTMLentities[i].name,

  3205 	      GUINT_TO_POINTER(HTMLentities[i].c));

  3206     }

  3207     if (translit==(GIConv)-1)

  3208 	translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");

  3209     if (to_utf8==(GIConv)-1)

  3210 	to_utf8=g_iconv_open("UTF-8","ISO_8859-1");

  3211     while((amp=strchr(theline,'&')))

  3212     {

  3213 	scolon=strchr(amp,';');

  3214 	if (scolon)

  3215 	{

  3216 	    if (amp[1]=='#')

  3217 	    {

  3218 		if (amp+2+strspn(amp+2,"0123456789")==scolon)

  3219 		    c=strtol(amp+2,NULL,10);

  3220 		else if (amp[2]=='x' &&

  3221 		  amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)

  3222 		    c=strtol(amp+3,NULL,16);

  3223 	    }

  3224 	    else

  3225 	    {

  3226 		s=g_strndup(amp+1,scolon-(amp+1));

  3227 	        c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));

  3228 		g_free(s);

  3229 	    }

  3230 	}

  3231 	else

  3232 	    c=0;

  3233 	if (c)

  3234 	{

  3235 	    theline=amp;

  3236 	    if (c<128 || c>=192 && c<=255)	/* An ISO-8859-1 character */

  3237 		theline+=g_unichar_to_utf8(c,theline);

  3238 	    else

  3239 	    {

  3240 		s=g_malloc(6);

  3241 		nb=g_unichar_to_utf8(c,s);

  3242 		t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);

  3243 		g_free(s);

  3244 		s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);

  3245 		g_free(t);

  3246 		memcpy(theline,s,nb);

  3247 		g_free(s);

  3248 		theline+=nb;

  3249 	    }

  3250 	    memmove(theline,g_utf8_next_char(scolon),

  3251 	      strlen(g_utf8_next_char(scolon))+1);

  3252 	}

  3253 	else

  3254 	    theline=g_utf8_next_char(amp);

  3255     }

  3256 }

  3258 gboolean tagcomp(const char *strin,const char *basetag)

  3259 {

  3260     gboolean retval;

  3261     gchar *s,*t;

  3262     if (g_utf8_get_char(strin)=='/')

  3263 	t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */

  3264     else

  3265 	t=g_utf8_casefold(strin,-1);

  3266     s=g_utf8_casefold(basetag,-1);

  3267     retval=g_str_has_prefix(t,s);

  3268     g_free(s);

  3269     g_free(t);

  3270     return retval;

  3271 }

  3273 void proghelp(GOptionContext *context)

  3274 {

  3275     gchar *help;

  3276     fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);

  3277     fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);

  3278     fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);

  3279     fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "

  3280       "For details, read the file COPYING.\n",stderr);

  3281     fputs("This is Free Software; "

  3282       "you may redistribute it under certain conditions (GPL);\n",stderr);

  3283     fputs("read the file COPYING for details.\n\n",stderr);

  3284     help=g_option_context_get_help(context,TRUE,NULL);

  3285     fputs(help,stderr);

  3286     g_free(help);

  3287     fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);

  3288     fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "

  3289       "non-ASCII\n",stderr);

  3290     fputs("characters like accented letters, "

  3291       "lines longer than 75 or shorter than 55,\n",stderr);

  3292     fputs("unbalanced quotes or brackets, "

  3293       "a variety of badly formatted punctuation, \n",stderr);

  3294     fputs("HTML tags, some likely typos. "

  3295       "It is NOT a substitute for human judgement.\n",stderr);

  3296     fputs("\n",stderr);

  3297 }

author	ali <ali@juiblex.co.uk>
	Thu Sep 05 22:34:27 2013 +0100 (2013-09-05)
changeset 79	0c7258bf8e4f
parent 76	4e6e7cc6b50d
child 82	0df25c7f4ed7
permissions	-rw-r--r--