author ali <ali@juiblex.co.uk>

Tue May 28 15:17:19 2013 +0100 (2013-05-28)

changeset 69 1016349e619f

parent 68 adb087007d08

child 70 aa916da2e452

bl/textfileutils.c file | annotate | diff | revisions

bl/textfileutils.h file | annotate | diff | revisions

bookloupe/Makefile.am file | annotate | diff | revisions

bookloupe/bookloupe.c file | annotate | diff | revisions

configure.ac file | annotate | diff | revisions

test/compatibility/user-defined-typo.tst file | annotate | diff | revisions

test/harness/testcaseparser.c file | annotate | diff | revisions
     1.1 --- a/bl/textfileutils.c	Mon May 27 09:03:04 2013 +0100
     1.2 +++ b/bl/textfileutils.c	Tue May 28 15:17:19 2013 +0100
     1.3 @@ -3,26 +3,21 @@
     1.4  #include <bl/bl.h>
     1.5  
     1.6  /*
     1.7 - * Read a file into memory (which should be freed with mem_free when no
     1.8 + * Read a file into memory (which should be freed with g_free when no
     1.9   * longer required). Returns NULL on error and outputs a suitable error
    1.10   * message to stderr.
    1.11   * DOS-style line endings and UTF-8 BOM are handled transparently even
    1.12   * on platforms which don't normally use these formats.
    1.13   */
    1.14  gboolean file_get_contents_text(const char *filename,char **contents,
    1.15 -  size_t *length)
    1.16 +  size_t *length,GError **err)
    1.17  {
    1.18      int i;
    1.19      unsigned char *raw;
    1.20 -    size_t raw_length;
    1.21 +    gsize raw_length;
    1.22      GString *string;
    1.23 -    GError *error=NULL;
    1.24 -    if (!g_file_get_contents(filename,(char *)&raw,&raw_length,&error))
    1.25 -    {
    1.26 -	fprintf(stderr,"%s: %s\n",filename,error->message);
    1.27 -	g_error_free(error);
    1.28 +    if (!g_file_get_contents(filename,(char **)&raw,&raw_length,err))
    1.29  	return FALSE;
    1.30 -    }
    1.31      string=g_string_new(NULL);
    1.32      i=0;
    1.33      if (raw_length>=3 && raw[0]==0xEF && raw[1]==0xBB && raw[2]==0xBF)

     2.1 --- a/bl/textfileutils.h	Mon May 27 09:03:04 2013 +0100
     2.2 +++ b/bl/textfileutils.h	Tue May 28 15:17:19 2013 +0100
     2.3 @@ -4,6 +4,6 @@
     2.4  #include <glib.h>
     2.5  
     2.6  gboolean file_get_contents_text(const char *filename,char **contents,
     2.7 -  size_t *length);
     2.8 +  size_t *length,GError **err);
     2.9  
    2.10  #endif /* BL_TEXTFILEUTILS_H */

     3.1 --- a/bookloupe/Makefile.am	Mon May 27 09:03:04 2013 +0100
     3.2 +++ b/bookloupe/Makefile.am	Tue May 28 15:17:19 2013 +0100
     3.3 @@ -1,5 +1,9 @@
     3.4 +INCLUDES=-I$(top_srcdir)
     3.5  bin_PROGRAMS=bookloupe
     3.6  pkgdata_DATA=bookloupe.typ
     3.7 +AM_CFLAGS=$(GLIB_CFLAGS)
     3.8 +LIBS=$(GLIB_LIBS)
     3.9 +LDADD=../bl/libbl.la
    3.10  
    3.11  bookloupe.typ:	bookloupe.typ.in
    3.12  	sed 's/$$/\r/' $< > $@

     4.1 --- a/bookloupe/bookloupe.c	Mon May 27 09:03:04 2013 +0100
     4.2 +++ b/bookloupe/bookloupe.c	Tue May 28 15:17:19 2013 +0100
     4.3 @@ -22,19 +22,10 @@
     4.4  #include <stdlib.h>
     4.5  #include <string.h>
     4.6  #include <ctype.h>
     4.7 +#include <glib.h>
     4.8 +#include <bl/bl.h>
     4.9  
    4.10 -#define MAXWORDLEN    80    /* max length of one word */
    4.11 -#define LINEBUFSIZE 2048    /* buffer size for an input line */
    4.12 -
    4.13 -#define MAX_USER_TYPOS 1000
    4.14 -#define USERTYPO_FILE "gutcheck.typ"
    4.15 -
    4.16 -#ifndef MAX_PATH
    4.17 -#define MAX_PATH 16384
    4.18 -#endif
    4.19 -
    4.20 -char aline[LINEBUFSIZE];
    4.21 -char prevline[LINEBUFSIZE];
    4.22 +gchar *prevline;
    4.23  
    4.24  /* Common typos. */
    4.25  char *typo[] = {
    4.26 @@ -70,7 +61,7 @@
    4.27      "se", ""
    4.28  };
    4.29  
    4.30 -char *usertypo[MAX_USER_TYPOS];
    4.31 +GTree *usertypo;
    4.32  
    4.33  /* Common abbreviations and other OK words not to query as typos. */
    4.34  char *okword[] = {
    4.35 @@ -282,46 +273,57 @@
    4.36  #define WAY_TOO_LONG      80
    4.37  #define SHORTEST_PG_LINE  55
    4.38  
    4.39 -#define SWITCHES "ESTPXLOYHWVMUD" /* switches:- */
    4.40 -				  /*     D - ignore DP-specific markup */
    4.41 -				  /*     E - echo queried line */
    4.42 -				  /*     S - check single quotes */
    4.43 -				  /*     T - check common typos	*/
    4.44 -				  /*     P - require closure of quotes on */
    4.45 -				  /*	 every paragraph */
    4.46 -				  /*     X - "Trust no one" :-) Paranoid! */
    4.47 -				  /*	 Queries everything */
    4.48 -				  /*     L - line end checking defaults on */
    4.49 -				  /*	 -L turns it off */
    4.50 -				  /*     O - overview. Just shows counts. */
    4.51 -				  /*     Y - puts errors to stdout */
    4.52 -				  /*	 instead of stderr */
    4.53 -				  /*     H - Echoes header fields */
    4.54 -				  /*     M - Ignore markup in < > */
    4.55 -				  /*     U - Use file of User-defined Typos */
    4.56 -				  /*     W - Defaults for use on Web upload */
    4.57 -				  /*     V - Verbose - list EVERYTHING! */
    4.58 -#define SWITNO 14		  /* max number of switch parms	*/
    4.59 -				  /*	- used for defining array-size */
    4.60 -#define MINARGS   1  /* minimum no of args excl switches */
    4.61 -#define MAXARGS   1  /* maximum no of args excl switches */
    4.62 +enum {
    4.63 +    ECHO_SWITCH,
    4.64 +    SQUOTE_SWITCH,
    4.65 +    TYPO_SWITCH,
    4.66 +    QPARA_SWITCH,
    4.67 +    PARANOID_SWITCH,
    4.68 +    LINE_END_SWITCH,
    4.69 +    OVERVIEW_SWITCH,
    4.70 +    STDOUT_SWITCH,
    4.71 +    HEADER_SWITCH,
    4.72 +    WEB_SWITCH,
    4.73 +    VERBOSE_SWITCH,
    4.74 +    MARKUP_SWITCH,
    4.75 +    USERTYPO_SWITCH,
    4.76 +    DP_SWITCH,
    4.77 +    SWITNO
    4.78 +};
    4.79  
    4.80 -int pswit[SWITNO];   /* program switches set by SWITCHES */
    4.81 +gboolean pswit[SWITNO];  /* program switches */
    4.82  
    4.83 -#define ECHO_SWITCH      0
    4.84 -#define SQUOTE_SWITCH    1
    4.85 -#define TYPO_SWITCH      2
    4.86 -#define QPARA_SWITCH     3
    4.87 -#define PARANOID_SWITCH  4
    4.88 -#define LINE_END_SWITCH  5
    4.89 -#define OVERVIEW_SWITCH  6
    4.90 -#define STDOUT_SWITCH    7
    4.91 -#define HEADER_SWITCH    8
    4.92 -#define WEB_SWITCH       9
    4.93 -#define VERBOSE_SWITCH   10
    4.94 -#define MARKUP_SWITCH    11
    4.95 -#define USERTYPO_SWITCH  12
    4.96 -#define DP_SWITCH	 13
    4.97 +static GOptionEntry options[]={
    4.98 +    { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
    4.99 +      "Ignore DP-specific markup", NULL },
   4.100 +    { "noecho", 'e', 0, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
   4.101 +      "Don't echo queried line", NULL },
   4.102 +    { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
   4.103 +      "Check single quotes", NULL },
   4.104 +    { "typo", 't', 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
   4.105 +      "Check common typos", NULL },
   4.106 +    { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
   4.107 +      "Require closure of quotes on every paragraph", NULL },
   4.108 +    { "relaxed", 'x', 0, G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
   4.109 +      "Disable paranoid querying of everything", NULL },
   4.110 +    { "line-end", 'l', 0, G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
   4.111 +      "Disable line end checking", NULL },
   4.112 +    { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
   4.113 +      "Overview: just show counts", NULL },
   4.114 +    { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
   4.115 +      "Output errors to stdout instead of stderr", NULL },
   4.116 +    { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
   4.117 +      "Echo header fields", NULL },
   4.118 +    { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
   4.119 +      "Ignore markup in < >", NULL },
   4.120 +    { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
   4.121 +      "Use file of user-defined typos", NULL },
   4.122 +    { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
   4.123 +      "Defaults for use on www upload", NULL },
   4.124 +    { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
   4.125 +      "Verbose - list everything", NULL },
   4.126 +    { NULL }
   4.127 +};
   4.128  
   4.129  long cnt_dquot;		/* for overview mode, count of doublequote queries */
   4.130  long cnt_squot;		/* for overview mode, count of singlequote queries */
   4.131 @@ -340,47 +342,26 @@
   4.132  long linecnt;		/* count of total lines in the file */
   4.133  long checked_linecnt;	/* count of lines actually checked */
   4.134  
   4.135 -void proghelp(void);
   4.136 -void procfile(char *);
   4.137 +void proghelp(GOptionContext *context);
   4.138 +void procfile(const char *);
   4.139  
   4.140 -#define LOW_THRESHOLD    0
   4.141 -#define HIGH_THRESHOLD   1
   4.142 +gchar *running_from;
   4.143  
   4.144 -#define START 0
   4.145 -#define END 1
   4.146 -#define PREV 0
   4.147 -#define NEXT 1
   4.148 -#define FIRST_OF_PAIR 0
   4.149 -#define SECOND_OF_PAIR 1
   4.150 -
   4.151 -#define MAX_WORDPAIR 1000
   4.152 -
   4.153 -char running_from[MAX_PATH];
   4.154 -
   4.155 -int mixdigit(char *);
   4.156 -const char *getaword(const char *,char *);
   4.157 -int matchword(char *,char *);
   4.158 -char *flgets(char *,int,FILE *,long);
   4.159 -void lowerit(char *);
   4.160 -int gcisalpha(unsigned char);
   4.161 -int gcisdigit(unsigned char);
   4.162 -int gcisletter(unsigned char);
   4.163 -char *gcstrchr(char *s,char c);
   4.164 +int mixdigit(const char *);
   4.165 +gchar *getaword(const char **);
   4.166 +char *flgets(char **,long);
   4.167 +gboolean gcisalpha(unsigned char);
   4.168 +gboolean gcisdigit(unsigned char);
   4.169 +gboolean gcisletter(unsigned char);
   4.170  void postprocess_for_HTML(char *);
   4.171  char *linehasmarkup(char *);
   4.172  char *losemarkup(char *);
   4.173 -int tagcomp(char *,char *);
   4.174 +int tagcomp(const char *,const char *);
   4.175  char *loseentities(char *);
   4.176 -int isroman(char *);
   4.177 -int usertypo_count;
   4.178 +gboolean isroman(const char *);
   4.179  void postprocess_for_DP(char *);
   4.180  
   4.181 -char wrk[LINEBUFSIZE];
   4.182 -
   4.183 -#define MAX_QWORD 50
   4.184 -#define MAX_QWORD_LENGTH 40
   4.185 -char qword[MAX_QWORD][MAX_QWORD_LENGTH];
   4.186 -int dupcnt[MAX_QWORD];
   4.187 +GTree *qword,*qperiod;
   4.188  
   4.189  struct first_pass_results {
   4.190      long firstline,astline;
   4.191 @@ -392,7 +373,8 @@
   4.192  
   4.193  struct warnings {
   4.194      int shortline,longline,bin,dash,dotcomma,ast,fslash,digit,hyphen;
   4.195 -    int endquote,isDutch,isFrench;
   4.196 +    int endquote;
   4.197 +    gboolean isDutch,isFrench;
   4.198  };
   4.199  
   4.200  struct counters {
   4.201 @@ -411,52 +393,35 @@
   4.202  };
   4.203  
   4.204  struct pending {
   4.205 -    char dquote[80],squote[80],rbrack[80],sbrack[80],cbrack[80],unders[80];
   4.206 +    char *dquote,*squote,*rbrack,*sbrack,*cbrack,*unders;
   4.207      long squot;
   4.208  };
   4.209  
   4.210 -int main(int argc,char **argv)
   4.211 +void parse_options(int *argc,char ***argv)
   4.212  {
   4.213 -    char *argsw,*s;
   4.214 -    int i,switno,invarg;
   4.215 -    char usertypo_file[MAX_PATH];
   4.216 -    FILE *usertypofile;
   4.217 -    if (strlen(argv[0])<sizeof(running_from))
   4.218 -	/* save the path to the executable */
   4.219 -	strcpy(running_from,argv[0]);
   4.220 -    /* find out what directory we're running from */
   4.221 -    s=running_from+strlen(running_from);
   4.222 -    for (;*s!='/' && *s!='\\' && s>=running_from;s--)
   4.223 -	*s=0;
   4.224 -    switno=strlen(SWITCHES);
   4.225 -    for (i=switno;--i>0;)
   4.226 -	pswit[i]=0;	   /* initialise switches */
   4.227 -    /*
   4.228 -     * Standard loop to extract switches.
   4.229 -     * When we come out of this loop, the arguments will be
   4.230 -     * in argv[0] upwards and the switches used will be
   4.231 -     * represented by their equivalent elements in pswit[]
   4.232 -     */
   4.233 -    while (--argc>0 && **++argv=='-')
   4.234 -	for (argsw=argv[0]+1;*argsw!='\0';argsw++)
   4.235 -	    for (i=switno,invarg=1;(--i>=0) && invarg==1;)
   4.236 -		if ((toupper(*argsw))==SWITCHES[i])
   4.237 -		{
   4.238 -		    invarg=0;
   4.239 -		    pswit[i]=1;
   4.240 -		}
   4.241 +    GError *err=NULL;
   4.242 +    GOptionContext *context;
   4.243 +    context=g_option_context_new(
   4.244 +      "file - looks for errors in Project Gutenberg(TM) etexts");
   4.245 +    g_option_context_add_main_entries(context,options,NULL);
   4.246 +    if (!g_option_context_parse(context,argc,argv,&err))
   4.247 +    {
   4.248 +	g_printerr("Bookloupe: %s\n",err->message);
   4.249 +	g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
   4.250 +	exit(1);
   4.251 +    }
   4.252      /* Paranoid checking is turned OFF, not on, by its switch */
   4.253 -    pswit[PARANOID_SWITCH]^=1;
   4.254 +    pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
   4.255      if (pswit[PARANOID_SWITCH])
   4.256 -	/* if running in paranoid mode force typo checks as well   */
   4.257 -	pswit[TYPO_SWITCH]=pswit[TYPO_SWITCH]^1;
   4.258 +	/* if running in paranoid mode, typo checks default to enabled */
   4.259 +	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
   4.260      /* Line-end checking is turned OFF, not on, by its switch */
   4.261 -    pswit[LINE_END_SWITCH]^=1;
   4.262 +    pswit[LINE_END_SWITCH]=!pswit[LINE_END_SWITCH];
   4.263      /* Echoing is turned OFF, not on, by its switch */
   4.264 -    pswit[ECHO_SWITCH]^=1;
   4.265 +    pswit[ECHO_SWITCH]=!pswit[ECHO_SWITCH];
   4.266      if (pswit[OVERVIEW_SWITCH])
   4.267  	/* just print summary; don't echo */
   4.268 -	pswit[ECHO_SWITCH]=0;
   4.269 +	pswit[ECHO_SWITCH]=FALSE;
   4.270      /*
   4.271       * Web uploads - for the moment, this is really just a placeholder
   4.272       * until we decide what processing we really want to do on web uploads
   4.273 @@ -464,85 +429,155 @@
   4.274      if (pswit[WEB_SWITCH])
   4.275      {
   4.276  	/* specific override for web uploads */
   4.277 -	pswit[ECHO_SWITCH]=1;
   4.278 -	pswit[SQUOTE_SWITCH]=0;
   4.279 -	pswit[TYPO_SWITCH]=1;
   4.280 -	pswit[QPARA_SWITCH]=0;
   4.281 -	pswit[PARANOID_SWITCH]=1;
   4.282 -	pswit[LINE_END_SWITCH]=0;
   4.283 -	pswit[OVERVIEW_SWITCH]=0;
   4.284 -	pswit[STDOUT_SWITCH]=0;
   4.285 -	pswit[HEADER_SWITCH]=1;
   4.286 -	pswit[VERBOSE_SWITCH]=0;
   4.287 -	pswit[MARKUP_SWITCH]=0;
   4.288 -	pswit[USERTYPO_SWITCH]=0;
   4.289 -	pswit[DP_SWITCH]=0;
   4.290 +	pswit[ECHO_SWITCH]=TRUE;
   4.291 +	pswit[SQUOTE_SWITCH]=FALSE;
   4.292 +	pswit[TYPO_SWITCH]=TRUE;
   4.293 +	pswit[QPARA_SWITCH]=FALSE;
   4.294 +	pswit[PARANOID_SWITCH]=TRUE;
   4.295 +	pswit[LINE_END_SWITCH]=FALSE;
   4.296 +	pswit[OVERVIEW_SWITCH]=FALSE;
   4.297 +	pswit[STDOUT_SWITCH]=FALSE;
   4.298 +	pswit[HEADER_SWITCH]=TRUE;
   4.299 +	pswit[VERBOSE_SWITCH]=FALSE;
   4.300 +	pswit[MARKUP_SWITCH]=FALSE;
   4.301 +	pswit[USERTYPO_SWITCH]=FALSE;
   4.302 +	pswit[DP_SWITCH]=FALSE;
   4.303      }
   4.304 -    if (argc<MINARGS || argc>MAXARGS)
   4.305 +    if (*argc<2)
   4.306      {
   4.307 -	/* check number of args */
   4.308 -	proghelp();
   4.309 -	return 1;
   4.310 +	proghelp(context);
   4.311 +	exit(1);
   4.312      }
   4.313 -    /* read in the user-defined stealth scanno list */
   4.314 +    g_option_context_free(context);
   4.315 +}
   4.316 +
   4.317 +/*
   4.318 + * read_user_scannos:
   4.319 + *
   4.320 + * Read in the user-defined stealth scanno list.
   4.321 + */
   4.322 +void read_user_scannos(void)
   4.323 +{
   4.324 +    GError *err=NULL;
   4.325 +    gchar *usertypo_file;
   4.326 +    gboolean okay;
   4.327 +    int i;
   4.328 +    gsize len;
   4.329 +    gchar *contents,**lines;
   4.330 +    usertypo_file=g_strdup("bookloupe.typ");
   4.331 +    okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   4.332 +    if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   4.333 +    {
   4.334 +	g_clear_error(&err);
   4.335 +	g_free(usertypo_file);
   4.336 +	usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
   4.337 +	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   4.338 +    }
   4.339 +    if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   4.340 +    {
   4.341 +	g_clear_error(&err);
   4.342 +	g_free(usertypo_file);
   4.343 +	usertypo_file=g_strdup("gutcheck.typ");
   4.344 +	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   4.345 +    }
   4.346 +    if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   4.347 +    {
   4.348 +	g_clear_error(&err);
   4.349 +	g_free(usertypo_file);
   4.350 +	usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
   4.351 +	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   4.352 +    }
   4.353 +    if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   4.354 +    {
   4.355 +	g_free(usertypo_file);
   4.356 +	printf("   --> I couldn't find bookloupe.typ "
   4.357 +	  "-- proceeding without user typos.\n");
   4.358 +	return;
   4.359 +    }
   4.360 +    else if (!okay)
   4.361 +    {
   4.362 +	fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
   4.363 +	g_free(usertypo_file);
   4.364 +	g_clear_error(&err);
   4.365 +	exit(1);
   4.366 +    }
   4.367 +    lines=g_strsplit(contents,"\n",0);
   4.368 +    usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
   4.369 +    for (i=0;lines[i];i++)
   4.370 +	if (*(unsigned char *)lines[i]>'!')
   4.371 +	    g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
   4.372 +	else
   4.373 +	    g_free(lines[i]);
   4.374 +    g_free(lines);
   4.375 +}
   4.376 +
   4.377 +#if 0
   4.378 +/*
   4.379 + * read_etext:
   4.380 + *
   4.381 + * Read an etext returning an array of lines. Lines are normally expected
   4.382 + * to be terminated by CR LF. Solitary LFs delimit lines but are left
   4.383 + * embedded at the end of the line for further processing. Solitary CRs
   4.384 + * do not delimit lines.
   4.385 + */
   4.386 +gchar **read_etext(const char *filename,GError **err)
   4.387 +{
   4.388 +    int i;
   4.389 +    const char *s,*t;
   4.390 +    gchar *contents;
   4.391 +    gchar **raw_lines;
   4.392 +    GPtrArray *lines;
   4.393 +    gsize len;
   4.394 +    if (!g_file_get_contents(filename,&contents,&len,err))
   4.395 +	return NULL;
   4.396 +    raw_lines=g_strsplit(contents,"\r\n",0);
   4.397 +    lines=g_ptr_array_sized_new(g_strv_length(raw_lines)+1);
   4.398 +    for (i=0;raw_lines[i];i++)
   4.399 +    {
   4.400 +	t=strchr(raw_lines[i],'\n');
   4.401 +	if (t)
   4.402 +	{
   4.403 +	    s=raw_lines[i];
   4.404 +	    while ((t=strchr(s,'\n')))
   4.405 +	    {
   4.406 +		g_ptr_array_add(lines,g_strndup(s,t-s+1));
   4.407 +		s=t+1;
   4.408 +	    }
   4.409 +	    g_ptr_array_add(lines,g_strdup(s));
   4.410 +	    g_free(raw_lines[i]);
   4.411 +	}
   4.412 +	else
   4.413 +	    g_ptr_array_add(lines,raw_lines[i]);
   4.414 +    }
   4.415 +    g_free(raw_lines);
   4.416 +    g_ptr_array_add(lines,NULL);
   4.417 +    return (gchar **)g_ptr_array_free(lines,FALSE);
   4.418 +}
   4.419 +#else
   4.420 +/*
   4.421 + * read_etext:
   4.422 + *
   4.423 + * Read an etext returning a newly allocated string containing the file
   4.424 + * contents or NULL on error.
   4.425 + */
   4.426 +gchar *read_etext(const char *filename,GError **err)
   4.427 +{
   4.428 +    gchar *contents;
   4.429 +    gsize len;
   4.430 +    if (!g_file_get_contents(filename,&contents,&len,err))
   4.431 +	return NULL;
   4.432 +    return contents;
   4.433 +}
   4.434 +#endif
   4.435 +
   4.436 +int main(int argc,char **argv)
   4.437 +{
   4.438 +    running_from=g_path_get_dirname(argv[0]);
   4.439 +    parse_options(&argc,&argv);
   4.440      if (pswit[USERTYPO_SWITCH])
   4.441 -    {
   4.442 -	/* ... we were told we had one! */
   4.443 -	usertypofile=fopen(USERTYPO_FILE,"rb");
   4.444 -	if (!usertypofile)
   4.445 -	{
   4.446 -	    /* not in cwd. try excuteable directory. */
   4.447 -	    strcpy(usertypo_file,running_from);
   4.448 -	    strcat(usertypo_file,USERTYPO_FILE);
   4.449 -	    usertypofile=fopen(usertypo_file,"rb");
   4.450 -	    if (!usertypofile) {
   4.451 -		/* we ain't got no user typo file! */
   4.452 -		printf("   --> I couldn't find gutcheck.typ "
   4.453 -		  "-- proceeding without user typos.\n");
   4.454 -	    }
   4.455 -	}
   4.456 -	usertypo_count=0;
   4.457 -	if (usertypofile)
   4.458 -	{
   4.459 -	    /* we managed to open a User Typo File! */
   4.460 -	    if (pswit[USERTYPO_SWITCH])
   4.461 -	    {
   4.462 -		while (flgets(aline,LINEBUFSIZE-1,usertypofile,
   4.463 -		  (long)usertypo_count))
   4.464 -		{
   4.465 -		    if (strlen(aline)>1)
   4.466 -		    {
   4.467 -			if ((int)*aline>33)
   4.468 -			{
   4.469 -			    s=malloc(strlen(aline)+1);
   4.470 -			    if (!s)
   4.471 -			    {
   4.472 -				fprintf(stderr,"bookloupe: cannot get enough "
   4.473 -				  "memory for user typo file!\n");
   4.474 -				exit(1);
   4.475 -			    }
   4.476 -			    strcpy(s,aline);
   4.477 -			    usertypo[usertypo_count]=s;
   4.478 -			    usertypo_count++;
   4.479 -			    if (usertypo_count>=MAX_USER_TYPOS)
   4.480 -			    {
   4.481 -				printf("   --> Only %d user-defined typos "
   4.482 -				  "allowed: ignoring the rest\n",
   4.483 -				  MAX_USER_TYPOS);
   4.484 -				break;
   4.485 -			    }
   4.486 -			}
   4.487 -		    }
   4.488 -		}
   4.489 -	    }
   4.490 -	    fclose(usertypofile);
   4.491 -	}
   4.492 -    }
   4.493 +	read_user_scannos();
   4.494      fprintf(stderr,"bookloupe: Check and report on an e-text\n");
   4.495 -    cnt_dquot=cnt_squot=cnt_brack=cnt_bin=cnt_odd=cnt_long=
   4.496 -    cnt_short=cnt_punct=cnt_dash=cnt_word=cnt_html=cnt_lineend=
   4.497 -    cnt_spacend=0;
   4.498 -    procfile(argv[0]);
   4.499 +    procfile(argv[1]);
   4.500      if (pswit[OVERVIEW_SWITCH])
   4.501      {
   4.502  	printf("    Checked %ld lines of %ld (head+foot = %ld)\n\n",
   4.503 @@ -577,6 +612,9 @@
   4.504  	  cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+
   4.505  	  cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend);
   4.506      }
   4.507 +    g_free(running_from);
   4.508 +    if (usertypo)
   4.509 +	g_tree_unref(usertypo);
   4.510      return 0;
   4.511  }
   4.512  
   4.513 @@ -588,28 +626,33 @@
   4.514   * occur many times in the text like long or short
   4.515   * lines, non-standard dashes, etc.
   4.516   */
   4.517 -struct first_pass_results *first_pass(FILE *infile)
   4.518 +struct first_pass_results *first_pass(const char *etext)
   4.519  {
   4.520      char laststart=CHAR_SPACE;
   4.521      const char *s;
   4.522 -    int i,llen;
   4.523 +    gchar *lc_line;
   4.524 +    int i,j,llen;
   4.525 +    gchar **lines;
   4.526      unsigned int lastlen=0,lastblen=0;
   4.527      long spline=0,nspline=0;
   4.528      static struct first_pass_results results={0};
   4.529 -    char inword[MAXWORDLEN]="";
   4.530 -    while (fgets(aline,LINEBUFSIZE-1,infile))
   4.531 +    gchar *inword;
   4.532 +    lines=g_strsplit(etext,"\n",0);
   4.533 +    for (j=0;lines[j];j++)
   4.534      {
   4.535 -	while (aline[strlen(aline)-1]==10 || aline[strlen(aline)-1]==13)
   4.536 -	    aline[strlen(aline)-1]=0;
   4.537 +	llen=strlen(lines[j]);
   4.538 +	while(lines[j][llen-1]=='\r')
   4.539 +	    lines[j][llen--]='\0';
   4.540  	linecnt++;
   4.541 -	if (strstr(aline,"*END") && strstr(aline,"SMALL PRINT") &&
   4.542 -	  (strstr(aline,"PUBLIC DOMAIN") || strstr(aline,"COPYRIGHT")))
   4.543 +	if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
   4.544 +	  (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
   4.545  	{
   4.546  	    if (spline)
   4.547  		printf("   --> Duplicate header?\n");
   4.548  	    spline=linecnt+1;   /* first line of non-header text, that is */
   4.549  	}
   4.550 -	if (!strncmp(aline,"*** START",9) && strstr(aline,"PROJECT GUTENBERG"))
   4.551 +	if (!strncmp(lines[j],"*** START",9) &&
   4.552 +	  strstr(lines[j],"PROJECT GUTENBERG"))
   4.553  	{
   4.554  	    if (nspline)
   4.555  		printf("   --> Duplicate header?\n");
   4.556 @@ -617,10 +660,10 @@
   4.557  	}
   4.558  	if (spline || nspline)
   4.559  	{
   4.560 -	    lowerit(aline);
   4.561 -	    if (strstr(aline,"end") && strstr(aline,"project gutenberg"))
   4.562 +	    lc_line=g_ascii_strdown(lines[j],llen);
   4.563 +	    if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
   4.564  	    {
   4.565 -		if (strstr(aline,"end")<strstr(aline,"project gutenberg"))
   4.566 +		if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
   4.567  		{
   4.568  		    if (results.footerline)
   4.569  		    {
   4.570 @@ -632,6 +675,7 @@
   4.571  			results.footerline=linecnt;
   4.572  		}
   4.573  	    }
   4.574 +	    g_free(lc_line);
   4.575  	}
   4.576  	if (spline)
   4.577  	    results.firstline=spline;
   4.578 @@ -639,85 +683,83 @@
   4.579  	    results.firstline=nspline;  /* override with new */
   4.580  	if (results.footerline)
   4.581  	    continue;    /* don't count the boilerplate in the footer */
   4.582 -	llen=strlen(aline);
   4.583  	results.totlen+=llen;
   4.584  	for (i=0;i<llen;i++)
   4.585  	{
   4.586 -	    if ((unsigned char)aline[i]>127)
   4.587 +	    if ((unsigned char)lines[j][i]>127)
   4.588  		results.binlen++;
   4.589 -	    if (gcisalpha(aline[i]))
   4.590 +	    if (gcisalpha(lines[j][i]))
   4.591  		results.alphalen++;
   4.592 -	    if (i>0 && aline[i]==CHAR_DQUOTE && isalpha(aline[i-1]))
   4.593 +	    if (i>0 && lines[j][i]==CHAR_DQUOTE && isalpha(lines[j][i-1]))
   4.594  		results.endquote_count++;
   4.595  	}
   4.596 -	if (strlen(aline)>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE &&
   4.597 -	  lastblen>2 && lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
   4.598 +	if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
   4.599 +	  lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
   4.600  	    results.shortline++;
   4.601 -	if (*aline && (unsigned char)aline[strlen(aline)-1]<=CHAR_SPACE)
   4.602 +	if (llen>0 && (unsigned char)lines[j][llen-1]<=CHAR_SPACE)
   4.603  	    cnt_spacend++;
   4.604 -	if (strstr(aline,".,"))
   4.605 +	if (strstr(lines[j],".,"))
   4.606  	    results.dotcomma++;
   4.607  	/* only count ast lines for ignoring purposes where there is */
   4.608  	/* locase text on the line */
   4.609 -	if (strstr(aline,"*"))
   4.610 +	if (strchr(lines[j],'*'))
   4.611  	{
   4.612 -	    for (s=aline;*s;s++)
   4.613 +	    for (s=lines[j];*s;s++)
   4.614  		if (*s>='a' && *s<='z')
   4.615  		    break;
   4.616  	     if (*s)
   4.617  		results.astline++;
   4.618  	}
   4.619 -	if (strstr(aline,"/"))
   4.620 +	if (strchr(lines[j],'/'))
   4.621  	    results.fslashline++;
   4.622 -	for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--)
   4.623 +	for (i=llen-1;i>0 && (unsigned char)lines[j][i]<=CHAR_SPACE;i--)
   4.624  	    ;
   4.625 -	if (aline[i]=='-' && aline[i-1]!='-')
   4.626 +	if (i>1 && lines[j][i]=='-' && lines[j][i-1]!='-')
   4.627  	    results.hyphens++;
   4.628  	if (llen>LONGEST_PG_LINE)
   4.629  	    results.longline++;
   4.630  	if (llen>WAY_TOO_LONG)
   4.631  	    results.verylongline++;
   4.632 -	if (strstr(aline,"<") && strstr(aline,">"))
   4.633 +	if (strchr(lines[j],'<') && strchr(lines[j],'>'))
   4.634  	{
   4.635 -	    i=(int)(strstr(aline,">")-strstr(aline,"<")+1);
   4.636 +	    i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
   4.637  	    if (i>0)
   4.638  		results.htmcount++;
   4.639 -	    if (strstr(aline,"<i>"))
   4.640 +	    if (strstr(lines[j],"<i>"))
   4.641  		results.htmcount+=4; /* bonus marks! */
   4.642  	}
   4.643  	/* Check for spaced em-dashes */
   4.644 -	if (strstr(aline,"--"))
   4.645 +	if (lines[j][0] && (s=strstr(lines[j]+1,"--")))
   4.646  	{
   4.647  	    results.emdash++;
   4.648 -	    if (*(strstr(aline,"--")-1)==CHAR_SPACE ||
   4.649 -	       (*(strstr(aline,"--")+2)==CHAR_SPACE))
   4.650 +	    if (s[-1]==CHAR_SPACE || (s[2]==CHAR_SPACE))
   4.651  		results.space_emdash++;
   4.652 -	    if (*(strstr(aline,"--")-1)==CHAR_SPACE &&
   4.653 -	       (*(strstr(aline,"--")+2)==CHAR_SPACE))
   4.654 +	    if (s[-1]==CHAR_SPACE && (s[2]==CHAR_SPACE))
   4.655  		/* count of em-dashes with spaces both sides */
   4.656  		results.non_PG_space_emdash++;
   4.657 -	    if (*(strstr(aline,"--")-1)!=CHAR_SPACE &&
   4.658 -	       (*(strstr(aline,"--")+2)!=CHAR_SPACE))
   4.659 +	    if (s[-1]!=CHAR_SPACE && (s[2]!=CHAR_SPACE))
   4.660  		/* count of PG-type em-dashes with no spaces */
   4.661  		results.PG_space_emdash++;
   4.662  	}
   4.663 -	for (s=aline;*s;)
   4.664 +	for (s=lines[j];*s;)
   4.665  	{
   4.666 -	    s=getaword(s,inword);
   4.667 +	    inword=getaword(&s);
   4.668  	    if (!strcmp(inword,"hij") || !strcmp(inword,"niet")) 
   4.669  		results.Dutchcount++;
   4.670  	    if (!strcmp(inword,"dans") || !strcmp(inword,"avec")) 
   4.671  		results.Frenchcount++;
   4.672  	    if (!strcmp(inword,"0") || !strcmp(inword,"1")) 
   4.673  		results.standalone_digit++;
   4.674 +	    g_free(inword);
   4.675  	}
   4.676  	/* Check for spaced dashes */
   4.677 -	if (strstr(aline," -") && *(strstr(aline," -")+2)!='-')
   4.678 +	if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
   4.679  	    results.spacedash++;
   4.680  	lastblen=lastlen;
   4.681 -	lastlen=strlen(aline);
   4.682 -	laststart=aline[0];
   4.683 +	lastlen=llen;
   4.684 +	laststart=lines[j][0];
   4.685      }
   4.686 +    g_strfreev(lines);
   4.687      return &results;
   4.688  }
   4.689  
   4.690 @@ -856,17 +898,17 @@
   4.691  	  "Not reporting them.\n");
   4.692  	warnings.bin=0;
   4.693      }
   4.694 -    warnings.isDutch=0;
   4.695 +    warnings.isDutch=FALSE;
   4.696      if (results->Dutchcount>50)
   4.697      {
   4.698 -	warnings.isDutch=1;
   4.699 +	warnings.isDutch=TRUE;
   4.700  	printf("   --> This looks like Dutch - "
   4.701  	  "switching off dashes and warnings for 's Middags case.\n");
   4.702      }
   4.703 -    warnings.isFrench=0;
   4.704 +    warnings.isFrench=FALSE;
   4.705      if (results->Frenchcount>50)
   4.706      {
   4.707 -	warnings.isFrench=1;
   4.708 +	warnings.isFrench=TRUE;
   4.709  	printf("   --> This looks like French - "
   4.710  	  "switching off some doublepunct.\n");
   4.711      }
   4.712 @@ -919,12 +961,14 @@
   4.713   * count it, since empty lines with asterisks or dashes to
   4.714   * separate sections are common.
   4.715   *
   4.716 - * Returns: Non-zero if the line is empty.
   4.717 + * Returns: TRUE if the line is empty.
   4.718   */
   4.719 -int analyse_quotes(const char *s,struct counters *counters)
   4.720 +gboolean analyse_quotes(const char *aline,struct counters *counters)
   4.721  {
   4.722      int guessquote=0;
   4.723 -    int isemptyline=1;    /* assume the line is empty until proven otherwise */
   4.724 +    /* assume the line is empty until proven otherwise */
   4.725 +    gboolean isemptyline=TRUE;
   4.726 +    const char *s=aline;
   4.727      while (*s)
   4.728      {
   4.729  	if (*s==CHAR_DQUOTE)
   4.730 @@ -986,7 +1030,7 @@
   4.731  	}
   4.732  	if (*s!=CHAR_SPACE && *s!='-' && *s!='.' && *s!=CHAR_ASTERISK &&
   4.733  	  *s!=13 && *s!=10)
   4.734 -	    isemptyline=0;  /* ignore lines like  *  *  *  as spacers */
   4.735 +	    isemptyline=FALSE;  /* ignore lines like  *  *  *  as spacers */
   4.736  	if (*s==CHAR_UNDERSCORE)
   4.737  	    counters->c_unders++;
   4.738  	if (*s==CHAR_OPEN_CBRACK)
   4.739 @@ -1040,7 +1084,7 @@
   4.740   * Check for binary and other odd characters.
   4.741   */
   4.742  void check_for_odd_characters(const char *aline,const struct warnings *warnings,
   4.743 -  int isemptyline)
   4.744 +  gboolean isemptyline)
   4.745  {
   4.746      /* Don't repeat multiple warnings on one line. */
   4.747      int eNon_A=0,eTab=0,eTilde=0,eCarat=0,eFSlash=0,eAst=0;
   4.748 @@ -1461,16 +1505,15 @@
   4.749  void check_for_extra_period(const char *aline,const struct warnings *warnings)
   4.750  {
   4.751      const char *s,*t,*s1;
   4.752 -    int i,istypo,isdup;
   4.753 -    static char qperiod[MAX_QWORD][MAX_QWORD_LENGTH];
   4.754 -    static int qperiod_index=0;
   4.755 -    char testword[MAXWORDLEN]="";
   4.756 +    int i;
   4.757 +    gboolean istypo;
   4.758 +    gchar *testword;
   4.759      if (pswit[PARANOID_SWITCH])
   4.760      {
   4.761 -	for (t=s=aline;strstr(t,". ");)
   4.762 +	for (t=aline;strstr(t,". ");)
   4.763  	{
   4.764  	    t=strstr(t,". ");
   4.765 -	    if (t==s)
   4.766 +	    if (t==aline)
   4.767  	    {
   4.768  		t++;
   4.769  		/* start of line punctuation is handled elsewhere */
   4.770 @@ -1497,57 +1540,48 @@
   4.771  	    if (*s1>='a' && *s1<='z')
   4.772  	    {
   4.773  		/* we have something to investigate */
   4.774 -		istypo=1;
   4.775 +		istypo=TRUE;
   4.776  		/* so let's go back and find out */
   4.777 -		for (s1=t-1;s1>=s &&
   4.778 +		for (s1=t-1;s1>=aline &&
   4.779  		  (gcisalpha(*s1) || gcisdigit(*s1) || *s1==CHAR_SQUOTE &&
   4.780  		  gcisalpha(s1[1]) && gcisalpha(s1[-1]));s1--)
   4.781  		    ;
   4.782  		s1++;
   4.783 -		for (i=0;*s1 && *s1!='.';s1++,i++)
   4.784 -		    testword[i]=*s1;
   4.785 -		testword[i]=0;
   4.786 +		s=strchr(s1,'.');
   4.787 +		if (s)
   4.788 +		    testword=g_strndup(s1,s-s1);
   4.789 +		else
   4.790 +		    testword=g_strdup(s1);
   4.791  		for (i=0;*abbrev[i];i++)
   4.792  		    if (!strcmp(testword,abbrev[i]))
   4.793 -			istypo=0;
   4.794 +			istypo=FALSE;
   4.795  		if (gcisdigit(*testword))
   4.796 -		    istypo=0;
   4.797 +		    istypo=FALSE;
   4.798  		if (!testword[1])
   4.799 -		    istypo=0;
   4.800 +		    istypo=FALSE;
   4.801  		if (isroman(testword))
   4.802 -		    istypo=0;
   4.803 +		    istypo=FALSE;
   4.804  		if (istypo)
   4.805  		{
   4.806 -		    istypo=0;
   4.807 +		    istypo=FALSE;
   4.808  		    for (i=0;testword[i];i++)
   4.809  			if (strchr(vowels,testword[i]))
   4.810 -			    istypo=1;
   4.811 +			    istypo=TRUE;
   4.812  		}
   4.813 -		if (istypo)
   4.814 +		if (istypo &&
   4.815 +		  (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
   4.816  		{
   4.817 -		    isdup=0;
   4.818 -		    if (strlen(testword)<MAX_QWORD_LENGTH &&
   4.819 -		      !pswit[VERBOSE_SWITCH])
   4.820 -			for (i=0;i<qperiod_index;i++)
   4.821 -			    if (!strcmp(testword,qperiod[i]))
   4.822 -				isdup=1;
   4.823 -		    if (!isdup)
   4.824 -		    {
   4.825 -			if (qperiod_index<MAX_QWORD &&
   4.826 -			  strlen(testword)<MAX_QWORD_LENGTH)
   4.827 -			{
   4.828 -			    strcpy(qperiod[qperiod_index],testword);
   4.829 -			    qperiod_index++;
   4.830 -			}
   4.831 -			if (pswit[ECHO_SWITCH])
   4.832 -			    printf("\n%s\n",aline);
   4.833 -			if (!pswit[OVERVIEW_SWITCH])
   4.834 -			    printf("    Line %ld column %d - Extra period?\n",
   4.835 -			      linecnt,(int)(t-aline)+1);
   4.836 -			else
   4.837 -			    cnt_punct++;
   4.838 -		    }
   4.839 +		    g_tree_insert(qperiod,g_strdup(testword),
   4.840 +		      GINT_TO_POINTER(1));
   4.841 +		    if (pswit[ECHO_SWITCH])
   4.842 +			printf("\n%s\n",aline);
   4.843 +		    if (!pswit[OVERVIEW_SWITCH])
   4.844 +			printf("    Line %ld column %d - Extra period?\n",
   4.845 +			  linecnt,(int)(t-aline)+1);
   4.846 +		    else
   4.847 +			cnt_punct++;
   4.848  		}
   4.849 +		g_free(testword);
   4.850  	    }
   4.851  	    t++;
   4.852  	}
   4.853 @@ -1563,16 +1597,20 @@
   4.854  {
   4.855      int i;
   4.856      const char *s,*wordstart;
   4.857 -    char inword[MAXWORDLEN];
   4.858 +    gchar *inword,*t;
   4.859      if (pswit[TYPO_SWITCH])
   4.860      {
   4.861  	for (s=aline;*s;)
   4.862  	{
   4.863  	    wordstart=s;
   4.864 -	    s=getaword(s,inword);
   4.865 -	    if (!*inword)
   4.866 +	    t=getaword(&s);
   4.867 +	    if (!*t)
   4.868 +	    {
   4.869 +		g_free(t);
   4.870  		continue;
   4.871 -	    lowerit(inword);
   4.872 +	    }
   4.873 +	    inword=g_ascii_strdown(t,-1);
   4.874 +	    g_free(t);
   4.875  	    for (i=0;*nocomma[i];i++)
   4.876  		if (!strcmp(inword,nocomma[i]))
   4.877  		{
   4.878 @@ -1603,6 +1641,7 @@
   4.879  			    cnt_punct++;
   4.880  		    }
   4.881  		}
   4.882 +	    g_free(inword);
   4.883  	}
   4.884      }
   4.885  }
   4.886 @@ -1616,15 +1655,18 @@
   4.887  void check_for_typos(const char *aline,struct warnings *warnings)
   4.888  {
   4.889      const char *s,*wordstart;
   4.890 -    char inword[MAXWORDLEN],testword[MAXWORDLEN];
   4.891 -    int i,istypo,isdup,alower,vowel,consonant;
   4.892 -    static int qword_index=0;
   4.893 +    gchar *inword,*testword;
   4.894 +    int i,alower,vowel,consonant,*dupcnt;
   4.895 +    gboolean isdup,istypo;
   4.896      for (s=aline;*s;)
   4.897      {
   4.898  	wordstart=s;
   4.899 -	s=getaword(s,inword);
   4.900 +	inword=getaword(&s);
   4.901  	if (!*inword)
   4.902 +	{
   4.903 +	    g_free(inword);
   4.904  	    continue; /* don't bother with empty lines */
   4.905 +	}
   4.906  	if (mixdigit(inword))
   4.907  	{
   4.908  	    if (pswit[ECHO_SWITCH])
   4.909 @@ -1639,10 +1681,10 @@
   4.910  	 * Put the word through a series of tests for likely typos and OCR
   4.911  	 * errors.
   4.912  	 */
   4.913 -	if (pswit[TYPO_SWITCH])
   4.914 +	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
   4.915  	{
   4.916 -	    istypo=0;
   4.917 -	    strcpy(testword,inword);
   4.918 +	    istypo=FALSE;
   4.919 +	    testword=g_strdup(inword);
   4.920  	    alower=0;
   4.921  	    for (i=0;i<(int)strlen(testword);i++)
   4.922  	    {
   4.923 @@ -1662,10 +1704,13 @@
   4.924  		      testword[2]=='c' || i>0 && testword[i-1]==CHAR_SQUOTE)
   4.925  			; /* do nothing! */
   4.926  		    else
   4.927 -			istypo=1;
   4.928 +			istypo=TRUE;
   4.929  		}
   4.930  		testword[i]=(char)tolower(testword[i]);
   4.931  	    }
   4.932 +	}
   4.933 +	if (pswit[TYPO_SWITCH])
   4.934 +	{
   4.935  	    /*
   4.936  	     * Check for certain unlikely two-letter combinations at word
   4.937  	     * start and end.
   4.938 @@ -1674,26 +1719,26 @@
   4.939  	    {
   4.940  		for (i=0;*nostart[i];i++)
   4.941  		    if (!strncmp(testword,nostart[i],2))
   4.942 -			istypo=1;
   4.943 +			istypo=TRUE;
   4.944  		for (i=0;*noend[i];i++)
   4.945  		    if (!strncmp(testword+strlen(testword)-2,noend[i],2))
   4.946 -			istypo=1;
   4.947 +			istypo=TRUE;
   4.948  	    }
   4.949  	    /* ght is common, gbt never. Like that. */
   4.950  	    if (strstr(testword,"cb"))
   4.951 -		istypo=1;
   4.952 +		istypo=TRUE;
   4.953  	    if (strstr(testword,"gbt"))
   4.954 -		istypo=1;
   4.955 +		istypo=TRUE;
   4.956  	    if (strstr(testword,"pbt"))
   4.957 -		istypo=1;
   4.958 +		istypo=TRUE;
   4.959  	    if (strstr(testword,"tbs"))
   4.960 -		istypo=1;
   4.961 +		istypo=TRUE;
   4.962  	    if (strstr(testword,"mrn"))
   4.963 -		istypo=1;
   4.964 +		istypo=TRUE;
   4.965  	    if (strstr(testword,"ahle"))
   4.966 -		istypo=1;
   4.967 +		istypo=TRUE;
   4.968  	    if (strstr(testword,"ihle"))
   4.969 -		istypo=1;
   4.970 +		istypo=TRUE;
   4.971  	    /*
   4.972  	     * "TBE" does happen - like HEARTBEAT - but uncommon.
   4.973  	     * Also "TBI" - frostbite, outbid - but uncommon.
   4.974 @@ -1701,11 +1746,11 @@
   4.975  	     * numerals, but "ii" is a common scanno.
   4.976  	     */
   4.977  	    if (strstr(testword,"tbi"))
   4.978 -		istypo=1;
   4.979 +		istypo=TRUE;
   4.980  	    if (strstr(testword,"tbe"))
   4.981 -		istypo=1;
   4.982 +		istypo=TRUE;
   4.983  	    if (strstr(testword,"ii"))
   4.984 -		istypo=1;
   4.985 +		istypo=TRUE;
   4.986  	    /*
   4.987  	     * Check for no vowels or no consonants.
   4.988  	     * If none, flag a typo.
   4.989 @@ -1727,7 +1772,7 @@
   4.990  			consonant++;
   4.991  		}
   4.992  		if (!vowel || !consonant)
   4.993 -		    istypo=1;
   4.994 +		    istypo=TRUE;
   4.995  	    }
   4.996  	    /*
   4.997  	     * Now exclude the word from being reported if it's in
   4.998 @@ -1735,18 +1780,18 @@
   4.999  	     */
  4.1000  	    for (i=0;*okword[i];i++)
  4.1001  		if (!strcmp(testword,okword[i]))
  4.1002 -		    istypo=0;
  4.1003 +		    istypo=FALSE;
  4.1004  	    /*
  4.1005  	     * What looks like a typo may be a Roman numeral.
  4.1006  	     * Exclude these.
  4.1007  	     */
  4.1008  	    if (istypo && isroman(testword))
  4.1009 -		istypo=0;
  4.1010 +		istypo=FALSE;
  4.1011  	    /* Check the manual list of typos. */
  4.1012  	    if (!istypo)
  4.1013  		for (i=0;*typo[i];i++)
  4.1014  		    if (!strcmp(testword,typo[i]))
  4.1015 -			istypo=1;
  4.1016 +			istypo=TRUE;
  4.1017  	    /*
  4.1018  	     * Check lowercase s, l, i and m - special cases.
  4.1019  	     *   "j" - often a semi-colon gone wrong.
  4.1020 @@ -1754,34 +1799,30 @@
  4.1021  	     *   "n" for "in"
  4.1022  	     */
  4.1023  	    if (!istypo && strlen(testword)==1 && strchr("slmijdn",*inword))
  4.1024 -		istypo=1;
  4.1025 +		istypo=TRUE;
  4.1026  	    if (istypo)
  4.1027  	    {
  4.1028 -		isdup=0;
  4.1029 -		if (strlen(testword)<MAX_QWORD_LENGTH &&
  4.1030 -		  !pswit[VERBOSE_SWITCH])
  4.1031 -		    for (i=0;i<qword_index;i++)
  4.1032 -			if (!strcmp(testword,qword[i]))
  4.1033 -			{
  4.1034 -			    isdup=1;
  4.1035 -			    ++dupcnt[i];
  4.1036 -			}
  4.1037 +		dupcnt=g_tree_lookup(qword,testword);
  4.1038 +		if (dupcnt)
  4.1039 +		{
  4.1040 +		    (*dupcnt)++;
  4.1041 +		    isdup=!pswit[VERBOSE_SWITCH];
  4.1042 +		}
  4.1043 +		else
  4.1044 +		{
  4.1045 +		    dupcnt=g_new0(int,1);
  4.1046 +		    g_tree_insert(qword,g_strdup(testword),dupcnt);
  4.1047 +		    isdup=FALSE;
  4.1048 +		}
  4.1049  		if (!isdup)
  4.1050  		{
  4.1051 -		    if (qword_index<MAX_QWORD &&
  4.1052 -		      strlen(testword)<MAX_QWORD_LENGTH)
  4.1053 -		    {
  4.1054 -			strcpy(qword[qword_index],testword);
  4.1055 -			qword_index++;
  4.1056 -		    }
  4.1057  		    if (pswit[ECHO_SWITCH])
  4.1058  			printf("\n%s\n",aline);
  4.1059  		    if (!pswit[OVERVIEW_SWITCH])
  4.1060  		    {
  4.1061  			printf("    Line %ld column %d - Query word %s",
  4.1062  			  linecnt,(int)(wordstart-aline)+1,inword);
  4.1063 -			if (strlen(testword)<MAX_QWORD_LENGTH &&
  4.1064 -			  !pswit[VERBOSE_SWITCH])
  4.1065 +			if (!pswit[VERBOSE_SWITCH])
  4.1066  			    printf(" - not reporting duplicates");
  4.1067  			printf("\n");
  4.1068  		    }
  4.1069 @@ -1791,17 +1832,16 @@
  4.1070  	    }
  4.1071  	}
  4.1072  	/* check the user's list of typos */
  4.1073 -	if (!istypo && usertypo_count)
  4.1074 -	    for (i=0;i<usertypo_count;i++)
  4.1075 -		if (!strcmp(testword,usertypo[i]))
  4.1076 -		{
  4.1077 -		    if (pswit[ECHO_SWITCH])
  4.1078 -			printf("\n%s\n",aline);
  4.1079 -		    if (!pswit[OVERVIEW_SWITCH])  
  4.1080 -			printf("    Line %ld column %d - "
  4.1081 -			  "Query possible scanno %s\n",
  4.1082 -			  linecnt,(int)(wordstart-aline)+2,inword);
  4.1083 -		}
  4.1084 +	if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
  4.1085 +	{
  4.1086 +	    if (pswit[ECHO_SWITCH])
  4.1087 +		printf("\n%s\n",aline);
  4.1088 +	    if (!pswit[OVERVIEW_SWITCH])  
  4.1089 +		printf("    Line %ld column %d - Query possible scanno %s\n",
  4.1090 +		  linecnt,(int)(wordstart-aline)+2,inword);
  4.1091 +	}
  4.1092 +	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
  4.1093 +	    g_free(testword);
  4.1094  	if (pswit[PARANOID_SWITCH] && warnings->digit)
  4.1095  	{
  4.1096  	    /* In paranoid mode, query all 0 and 1 standing alone. */
  4.1097 @@ -1816,6 +1856,7 @@
  4.1098  		    cnt_word++;
  4.1099  	    }
  4.1100  	}
  4.1101 +	g_free(inword);
  4.1102      }
  4.1103  }
  4.1104  
  4.1105 @@ -1830,9 +1871,10 @@
  4.1106   * quotes "like"this.
  4.1107   */
  4.1108  void check_for_misspaced_punctuation(const char *aline,
  4.1109 -  struct parities *parities,int isemptyline)
  4.1110 +  struct parities *parities,gboolean isemptyline)
  4.1111  {
  4.1112 -    int i,llen,isacro,isellipsis;
  4.1113 +    int i,llen;
  4.1114 +    gboolean isacro,isellipsis;
  4.1115      const char *s;
  4.1116      llen=strlen(aline);
  4.1117      for (i=1;i<llen;i++)
  4.1118 @@ -1841,9 +1883,9 @@
  4.1119  	if (strchr(".?!,;:_",aline[i]))  /* if it's punctuation */
  4.1120  	{
  4.1121  	    /* we need to suppress warnings for acronyms like M.D. */
  4.1122 -	    isacro=0;
  4.1123 +	    isacro=FALSE;
  4.1124  	    /* we need to suppress warnings for ellipsis . . . */
  4.1125 -	    isellipsis=0;
  4.1126 +	    isellipsis=FALSE;
  4.1127  	    /* if there are letters on both sides of it or ... */
  4.1128  	    if (gcisalpha(aline[i-1]) && gcisalpha(aline[i+1]) ||
  4.1129  	       gcisalpha(aline[i+1]) && strchr("?!,;:",aline[i]))
  4.1130 @@ -1852,9 +1894,9 @@
  4.1131  		if (aline[i]=='.')
  4.1132  		{
  4.1133  		    if (i>2 && aline[i-2]=='.')
  4.1134 -			isacro=1;
  4.1135 +			isacro=TRUE;
  4.1136  		    if (i+2<llen && aline[i+2]=='.')
  4.1137 -			isacro=1;
  4.1138 +			isacro=TRUE;
  4.1139  		}
  4.1140  		if (!isacro)
  4.1141  		{
  4.1142 @@ -1877,9 +1919,9 @@
  4.1143  		if (aline[i]=='.')
  4.1144  		{
  4.1145  		    if (i>2 && aline[i-2]=='.')
  4.1146 -			isellipsis=1;
  4.1147 +			isellipsis=TRUE;
  4.1148  		    if (i+2<llen && aline[i+2]=='.')
  4.1149 -			isellipsis=1;
  4.1150 +			isellipsis=TRUE;
  4.1151  		}
  4.1152  		if (!isemptyline && !isellipsis)
  4.1153  		{
  4.1154 @@ -2177,6 +2219,8 @@
  4.1155  void check_for_miscased_genative(const char *aline)
  4.1156  {
  4.1157      const char *s;
  4.1158 +    if (!*aline)
  4.1159 +	return;
  4.1160      s=aline+1;
  4.1161      while (*s)
  4.1162      {
  4.1163 @@ -2321,13 +2365,11 @@
  4.1164  	    i=(int)(close-open+1);
  4.1165  	    if (i>0)
  4.1166  	    {
  4.1167 -		strncpy(wrk,open,i);
  4.1168 -		wrk[i]=0;
  4.1169  		if (pswit[ECHO_SWITCH])
  4.1170  		    printf("\n%s\n",aline);
  4.1171  		if (!pswit[OVERVIEW_SWITCH])
  4.1172 -		    printf("    Line %ld column %d - HTML Tag? %s \n",
  4.1173 -		      linecnt,(int)(open-aline)+1,wrk);
  4.1174 +		    printf("    Line %ld column %d - HTML Tag? %*.*s \n",
  4.1175 +		      linecnt,(int)(open-aline)+1,i,i,open);
  4.1176  		else
  4.1177  		    cnt_html++;
  4.1178  	    }
  4.1179 @@ -2359,13 +2401,11 @@
  4.1180  		    i=0;		/* Don't report "Jones & Son;" */
  4.1181  	    if (i>0)
  4.1182  	    {
  4.1183 -		strncpy(wrk,amp,i);
  4.1184 -		wrk[i]=0;
  4.1185  		if (pswit[ECHO_SWITCH])
  4.1186  		    printf("\n%s\n",aline);
  4.1187  		if (!pswit[OVERVIEW_SWITCH])
  4.1188 -		    printf("    Line %ld column %d - HTML symbol? %s \n",
  4.1189 -		      linecnt,(int)(amp-aline)+1,wrk);
  4.1190 +		    printf("    Line %ld column %d - HTML symbol? %*.*s \n",
  4.1191 +		      linecnt,(int)(amp-aline)+1,i,i,amp);
  4.1192  		else
  4.1193  		    cnt_html++;
  4.1194  	    }
  4.1195 @@ -2388,7 +2428,8 @@
  4.1196      s=aline;
  4.1197      while (*s==' ')
  4.1198  	s++;
  4.1199 -    if (*pending->dquote)
  4.1200 +    if (pending->dquote)
  4.1201 +    {
  4.1202  	if (*s!=CHAR_DQUOTE || pswit[QPARA_SWITCH])
  4.1203  	{
  4.1204  	    if (!pswit[OVERVIEW_SWITCH])
  4.1205 @@ -2400,7 +2441,10 @@
  4.1206  	    else
  4.1207  		cnt_dquot++;
  4.1208  	}
  4.1209 -    if (*pending->squote)
  4.1210 +	g_free(pending->dquote);
  4.1211 +	pending->dquote=NULL;
  4.1212 +    }
  4.1213 +    if (pending->squote)
  4.1214      {
  4.1215  	if (*s!=CHAR_SQUOTE && *s!=CHAR_OPEN_SQUOTE || pswit[QPARA_SWITCH] ||
  4.1216  	  pending->squot)
  4.1217 @@ -2414,8 +2458,10 @@
  4.1218  	    else
  4.1219  		cnt_squot++;
  4.1220  	}
  4.1221 +	g_free(pending->squote);
  4.1222 +	pending->squote=NULL;
  4.1223      }
  4.1224 -    if (*pending->rbrack)
  4.1225 +    if (pending->rbrack)
  4.1226      {
  4.1227  	if (!pswit[OVERVIEW_SWITCH])
  4.1228  	{
  4.1229 @@ -2425,8 +2471,10 @@
  4.1230  	}
  4.1231  	else
  4.1232  	    cnt_brack++;
  4.1233 +	g_free(pending->rbrack);
  4.1234 +	pending->rbrack=NULL;
  4.1235      }
  4.1236 -    if (*pending->sbrack)
  4.1237 +    if (pending->sbrack)
  4.1238      {
  4.1239  	if (!pswit[OVERVIEW_SWITCH])
  4.1240  	{
  4.1241 @@ -2436,8 +2484,10 @@
  4.1242  	}
  4.1243  	else
  4.1244  	    cnt_brack++;
  4.1245 +	g_free(pending->sbrack);
  4.1246 +	pending->sbrack=NULL;
  4.1247      }
  4.1248 -    if (*pending->cbrack)
  4.1249 +    if (pending->cbrack)
  4.1250      {
  4.1251  	if (!pswit[OVERVIEW_SWITCH])
  4.1252  	{
  4.1253 @@ -2447,8 +2497,10 @@
  4.1254  	}
  4.1255  	else
  4.1256  	    cnt_brack++;
  4.1257 +	g_free(pending->cbrack);
  4.1258 +	pending->cbrack=NULL;
  4.1259      }
  4.1260 -    if (*pending->unders)
  4.1261 +    if (pending->unders)
  4.1262      {
  4.1263  	if (!pswit[OVERVIEW_SWITCH])
  4.1264  	{
  4.1265 @@ -2458,6 +2510,8 @@
  4.1266  	}
  4.1267  	else
  4.1268  	    cnt_brack++;
  4.1269 +	g_free(pending->unders);
  4.1270 +	pending->unders=NULL;
  4.1271      }
  4.1272  }
  4.1273  
  4.1274 @@ -2481,12 +2535,12 @@
  4.1275    struct pending *pending)
  4.1276  {
  4.1277      if (counters->quot%2)
  4.1278 -	sprintf(pending->dquote,"    Line %ld - Mismatched quotes",
  4.1279 -	  linecnt);
  4.1280 +	pending->dquote=
  4.1281 +	  g_strdup_printf("    Line %ld - Mismatched quotes",linecnt);
  4.1282      if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&
  4.1283        counters->open_single_quote!=counters->close_single_quote)
  4.1284 -	sprintf(pending->squote,"    Line %ld - Mismatched singlequotes?",
  4.1285 -	  linecnt);
  4.1286 +	pending->squote=
  4.1287 +	  g_strdup_printf("    Line %ld - Mismatched singlequotes?",linecnt);
  4.1288      if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&
  4.1289        counters->open_single_quote!=counters->close_single_quote &&
  4.1290        counters->open_single_quote!=counters->close_single_quote+1)
  4.1291 @@ -2496,17 +2550,17 @@
  4.1292  	 */
  4.1293  	pending->squot=1;
  4.1294      if (counters->r_brack)
  4.1295 -	sprintf(pending->rbrack,"    Line %ld - Mismatched round brackets?",
  4.1296 -	  linecnt);
  4.1297 +	pending->rbrack=
  4.1298 +	  g_strdup_printf("    Line %ld - Mismatched round brackets?",linecnt);
  4.1299      if (counters->s_brack)
  4.1300 -	sprintf(pending->sbrack,"    Line %ld - Mismatched square brackets?",
  4.1301 -	  linecnt);
  4.1302 +	pending->sbrack=
  4.1303 +	  g_strdup_printf("    Line %ld - Mismatched square brackets?",linecnt);
  4.1304      if (counters->c_brack)
  4.1305 -	sprintf(pending->cbrack,"    Line %ld - Mismatched curly brackets?",
  4.1306 -	  linecnt);
  4.1307 +	pending->cbrack=
  4.1308 +	  g_strdup_printf("    Line %ld - Mismatched curly brackets?",linecnt);
  4.1309      if (counters->c_unders%2)
  4.1310 -	sprintf(pending->unders,"    Line %ld - Mismatched underscores?",
  4.1311 -	  linecnt);
  4.1312 +	pending->unders=
  4.1313 +	  g_strdup_printf("    Line %ld - Mismatched underscores?",linecnt);
  4.1314  }
  4.1315  
  4.1316  /*
  4.1317 @@ -2563,50 +2617,63 @@
  4.1318      }
  4.1319  }
  4.1320  
  4.1321 +gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
  4.1322 +{
  4.1323 +    const char *word=key;
  4.1324 +    int *dupcnt=value;
  4.1325 +    if (*dupcnt)
  4.1326 +	printf("\nNote: Queried word %s was duplicated %d times\n",
  4.1327 +	  word,*dupcnt);
  4.1328 +    return FALSE;
  4.1329 +}
  4.1330 +
  4.1331  /*
  4.1332   * procfile:
  4.1333   *
  4.1334   * Process one file.
  4.1335   */
  4.1336 -void procfile(char *filename)
  4.1337 +void procfile(const char *filename)
  4.1338  {
  4.1339      const char *s;
  4.1340 -    char parastart[81];     /* first line of current para */
  4.1341 -    FILE *infile;
  4.1342 +    gchar *parastart=NULL;	/* first line of current para */
  4.1343 +    gchar *etext,*aline;
  4.1344 +    gchar *etext_ptr;
  4.1345 +    GError *err=NULL;
  4.1346      struct first_pass_results *first_pass_results;
  4.1347      struct warnings *warnings;
  4.1348      struct counters counters={0};
  4.1349      struct line_properties last={0};
  4.1350      struct parities parities={0};
  4.1351 -    struct pending pending={{0},};
  4.1352 -    int isemptyline;
  4.1353 +    struct pending pending={0};
  4.1354 +    gboolean isemptyline;
  4.1355      long start_para_line=0;
  4.1356 -    int i,isnewpara=0,enddash=0;
  4.1357 +    gboolean isnewpara=FALSE,enddash=FALSE;
  4.1358      last.start=CHAR_SPACE;
  4.1359 -    *prevline=0;
  4.1360      linecnt=checked_linecnt=0;
  4.1361 -    infile=fopen(filename,"rb");
  4.1362 -    if (!infile)
  4.1363 +    etext=read_etext(filename,&err);
  4.1364 +    if (!etext)
  4.1365      {
  4.1366  	if (pswit[STDOUT_SWITCH])
  4.1367 -	    fprintf(stdout,"bookloupe: cannot open %s\n",filename);
  4.1368 +	    fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
  4.1369  	else
  4.1370 -	    fprintf(stderr,"bookloupe: cannot open %s\n",filename);
  4.1371 +	    fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
  4.1372  	exit(1);
  4.1373      }
  4.1374      fprintf(stdout,"\n\nFile: %s\n\n",filename);
  4.1375 -    first_pass_results=first_pass(infile);
  4.1376 +    first_pass_results=first_pass(etext);
  4.1377      warnings=report_first_pass(first_pass_results);
  4.1378 +    qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
  4.1379 +    qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
  4.1380      /*
  4.1381       * Here we go with the main pass. Hold onto yer hat!
  4.1382       */
  4.1383 -    rewind(infile);
  4.1384      linecnt=0;
  4.1385 -    while (flgets(aline,LINEBUFSIZE-1,infile,linecnt+1))
  4.1386 +    etext_ptr=etext;
  4.1387 +    while ((aline=flgets(&etext_ptr,linecnt+1)))
  4.1388      {
  4.1389  	linecnt++;
  4.1390  	if (linecnt==1)
  4.1391 -	    isnewpara=1;
  4.1392 +	    isnewpara=TRUE;
  4.1393  	if (pswit[DP_SWITCH] && !strncmp(aline,"-----File: ",11))
  4.1394  	    continue;    // skip DP page separators completely
  4.1395  	if (linecnt<first_pass_results->firstline ||
  4.1396 @@ -2635,8 +2702,8 @@
  4.1397  	    /* This line is the start of a new paragraph. */
  4.1398  	    start_para_line=linecnt;
  4.1399  	    /* Capture its first line in case we want to report it later. */
  4.1400 -	    strncpy(parastart,aline,80);
  4.1401 -	    parastart[79]=0;
  4.1402 +	    g_free(parastart);
  4.1403 +	    parastart=g_strdup(aline);
  4.1404  	    memset(&parities,0,sizeof(parities));  /* restart the quote count */
  4.1405  	    s=aline;
  4.1406  	    while (!gcisalpha(*s) && !gcisdigit(*s) && *s)
  4.1407 @@ -2653,7 +2720,7 @@
  4.1408  		else
  4.1409  		    cnt_punct++;
  4.1410  	    }
  4.1411 -	    isnewpara=0; /* Signal the end of new para processing. */
  4.1412 +	    isnewpara=FALSE; /* Signal the end of new para processing. */
  4.1413  	}
  4.1414  	/* Check for an em-dash broken at line end. */
  4.1415  	if (enddash && *aline=='-')
  4.1416 @@ -2665,11 +2732,11 @@
  4.1417  	    else
  4.1418  		cnt_punct++;
  4.1419  	}
  4.1420 -	enddash=0;
  4.1421 +	enddash=FALSE;
  4.1422  	for (s=aline+strlen(aline)-1;*s==' ' && s>aline;s--)
  4.1423  	    ;
  4.1424  	if (s>=aline && *s=='-')
  4.1425 -	    enddash=1;
  4.1426 +	    enddash=TRUE;
  4.1427  	check_for_control_characters(aline);
  4.1428  	if (warnings->bin)
  4.1429  	    check_for_odd_characters(aline,warnings,isemptyline);
  4.1430 @@ -2709,40 +2776,49 @@
  4.1431  	    check_for_mismatched_quotes(&counters,&pending);
  4.1432  	    memset(&counters,0,sizeof(counters));
  4.1433  	    /* let the next iteration know that it's starting a new para */
  4.1434 -	    isnewpara=1;
  4.1435 -	    check_for_omitted_punctuation(prevline,&last,start_para_line);
  4.1436 +	    isnewpara=TRUE;
  4.1437 +	    if (prevline)
  4.1438 +		check_for_omitted_punctuation(prevline,&last,start_para_line);
  4.1439  	}
  4.1440 -	strcpy(prevline,aline);
  4.1441 +	g_free(prevline);
  4.1442 +	prevline=g_strdup(aline);
  4.1443      }
  4.1444 -    fclose(infile);
  4.1445 +    if (prevline)
  4.1446 +    {
  4.1447 +	g_free(prevline);
  4.1448 +	prevline=NULL;
  4.1449 +    }
  4.1450 +    g_free(parastart);
  4.1451 +    g_free(prevline);
  4.1452 +    g_free(etext);
  4.1453      if (!pswit[OVERVIEW_SWITCH])
  4.1454 -	for (i=0;i<MAX_QWORD;i++)
  4.1455 -	    if (dupcnt[i])
  4.1456 -		printf("\nNote: Queried word %s was duplicated %d time%s\n",
  4.1457 -		  qword[i],dupcnt[i],"s");
  4.1458 +	g_tree_foreach(qword,report_duplicate_queries,NULL);
  4.1459 +    g_tree_unref(qword);
  4.1460 +    g_tree_unref(qperiod);
  4.1461  }
  4.1462  
  4.1463  /*
  4.1464   * flgets:
  4.1465   *
  4.1466 - * Get one line from the input stream, checking for
  4.1467 + * Get one line from the input text, checking for
  4.1468   * the existence of exactly one CR/LF line-end per line.
  4.1469   *
  4.1470   * Returns: a pointer to the line.
  4.1471   */
  4.1472 -char *flgets(char *theline,int maxlen,FILE *thefile,long lcnt)
  4.1473 +char *flgets(char **etext,long lcnt)
  4.1474  {
  4.1475      char c;
  4.1476 -    int len,isCR,cint;
  4.1477 -    *theline=0;
  4.1478 -    len=isCR=0;
  4.1479 -    c=cint=fgetc(thefile);
  4.1480 -    do
  4.1481 +    int len;
  4.1482 +    gboolean isCR=FALSE;
  4.1483 +    char *theline=*etext;
  4.1484 +    len=0;
  4.1485 +    for(;;)
  4.1486      {
  4.1487 -	if (cint==EOF)
  4.1488 +	c=*(*etext)++;
  4.1489 +	if (!c)
  4.1490  	    return NULL;
  4.1491  	/* either way, it's end of line */
  4.1492 -	if (c==10)
  4.1493 +	if (c=='\n')
  4.1494  	{
  4.1495  	    if (isCR)
  4.1496  		break;
  4.1497 @@ -2752,7 +2828,7 @@
  4.1498  		if (pswit[LINE_END_SWITCH])
  4.1499  		{
  4.1500  		    if (pswit[ECHO_SWITCH])
  4.1501 -			printf("\n%s\n",theline);
  4.1502 +			printf("\n%*.*s\n",len,len,theline);
  4.1503  		    if (!pswit[OVERVIEW_SWITCH])
  4.1504  			printf("    Line %ld - No CR?\n",lcnt);
  4.1505  		    else
  4.1506 @@ -2761,7 +2837,7 @@
  4.1507  		break;
  4.1508  	    }
  4.1509  	}
  4.1510 -	if (c==13)
  4.1511 +	if (c=='\r')
  4.1512  	{
  4.1513  	    if (isCR)
  4.1514  	    {
  4.1515 @@ -2769,34 +2845,33 @@
  4.1516  		if (pswit[LINE_END_SWITCH])
  4.1517  		{
  4.1518  		    if (pswit[ECHO_SWITCH])
  4.1519 -			printf("\n%s\n",theline);
  4.1520 +			printf("\n%*.*s\n",len,len,theline);
  4.1521  		    if (!pswit[OVERVIEW_SWITCH])
  4.1522  			printf("    Line %ld - Two successive CRs?\n",lcnt);
  4.1523  		    else
  4.1524  			cnt_lineend++;
  4.1525  		}
  4.1526  	    }
  4.1527 -	    isCR=1;
  4.1528 +	    isCR=TRUE;
  4.1529  	}
  4.1530  	else
  4.1531  	{
  4.1532  	    if (pswit[LINE_END_SWITCH] && isCR)
  4.1533  	    {
  4.1534  		if (pswit[ECHO_SWITCH])
  4.1535 -		    printf("\n%s\n",theline);
  4.1536 +		    printf("\n%*.*s\n",len,len,theline);
  4.1537  		if (!pswit[OVERVIEW_SWITCH])
  4.1538  		    printf("    Line %ld column %d - CR without LF?\n",
  4.1539  		      lcnt,len+1);
  4.1540  		else
  4.1541  		    cnt_lineend++;
  4.1542 +		theline[len]=' ';
  4.1543  	    }
  4.1544 -	    theline[len]=c;
  4.1545 +	    isCR=FALSE;
  4.1546  	    len++;
  4.1547 -	    theline[len]=0;
  4.1548 -	    isCR=0;
  4.1549  	}
  4.1550 -	c=cint=fgetc(thefile);
  4.1551 -    } while(len<maxlen);
  4.1552 +    }
  4.1553 +    theline[len]='\0';
  4.1554      if (pswit[MARKUP_SWITCH])  
  4.1555  	postprocess_for_HTML(theline);
  4.1556      if (pswit[DP_SWITCH])  
  4.1557 @@ -2813,10 +2888,10 @@
  4.1558   *
  4.1559   * Returns: 0 if no error found, 1 if error.
  4.1560   */
  4.1561 -int mixdigit(char *checkword)
  4.1562 +int mixdigit(const char *checkword)
  4.1563  {
  4.1564      int wehaveadigit,wehavealetter,firstdigits,query,wl;
  4.1565 -    char *s;
  4.1566 +    const char *s;
  4.1567      wehaveadigit=wehavealetter=query=0;
  4.1568      for (s=checkword;*s;s++)
  4.1569  	if (gcisalpha(*s))
  4.1570 @@ -2832,17 +2907,20 @@
  4.1571  	for (firstdigits=0;gcisdigit(checkword[firstdigits]);firstdigits++)
  4.1572  	    ;
  4.1573  	/* digits, ending in st, rd, nd, th of either case */
  4.1574 -	if (firstdigits+2==wl && (matchword(checkword+wl-2,"st") ||
  4.1575 -	  matchword(checkword+wl-2,"rd") || matchword(checkword+wl-2,"nd") ||
  4.1576 -	  matchword(checkword+wl-2,"th")))
  4.1577 +	if (firstdigits+2==wl && (!g_ascii_strcasecmp(checkword+wl-2,"st") ||
  4.1578 +	  !g_ascii_strcasecmp(checkword+wl-2,"rd") ||
  4.1579 +	  !g_ascii_strcasecmp(checkword+wl-2,"nd") ||
  4.1580 +	  !g_ascii_strcasecmp(checkword+wl-2,"th")))
  4.1581  	    query=0;
  4.1582 -	if (firstdigits+3==wl && (matchword(checkword+wl-3,"sts") ||
  4.1583 -	  matchword(checkword+wl-3,"rds") || matchword(checkword+wl-3,"nds") ||
  4.1584 -	  matchword(checkword+wl-3,"ths")))
  4.1585 +	if (firstdigits+3==wl && (!g_ascii_strcasecmp(checkword+wl-3,"sts") ||
  4.1586 +	  !g_ascii_strcasecmp(checkword+wl-3,"rds") ||
  4.1587 +	  !g_ascii_strcasecmp(checkword+wl-3,"nds") ||
  4.1588 +	  !g_ascii_strcasecmp(checkword+wl-3,"ths")))
  4.1589  	    query=0;
  4.1590 -	if (firstdigits+3==wl && (matchword(checkword+wl-4,"stly") ||
  4.1591 -	  matchword(checkword+wl-4,"rdly") ||
  4.1592 -	  matchword(checkword+wl-4,"ndly") || matchword(checkword+wl-4,"thly")))
  4.1593 +	if (firstdigits+3==wl && (!g_ascii_strcasecmp(checkword+wl-4,"stly") ||
  4.1594 +	  !g_ascii_strcasecmp(checkword+wl-4,"rdly") ||
  4.1595 +	  !g_ascii_strcasecmp(checkword+wl-4,"ndly") ||
  4.1596 +	  !g_ascii_strcasecmp(checkword+wl-4,"thly")))
  4.1597  	    query=0;
  4.1598  	/* digits, ending in l, L, s or d */
  4.1599  	if (firstdigits+1==wl && (checkword[wl-1]=='l' ||
  4.1600 @@ -2864,20 +2942,20 @@
  4.1601  /*
  4.1602   * getaword:
  4.1603   *
  4.1604 - * Extracts the first/next "word" from the line, and puts
  4.1605 - * it into "thisword". A word is defined as one English word unit--or
  4.1606 - * at least that's the aim.
  4.1607 + * Extracts the first/next "word" from the line, and returns it.
  4.1608 + * A word is defined as one English word unit--or at least that's the aim.
  4.1609 + * "ptr" is advanced to the position in the line where we will start
  4.1610 + * looking for the next word.
  4.1611   *
  4.1612 - * Returns: a pointer to the position in the line where we will start
  4.1613 - *	  looking for the next word.
  4.1614 + * Returns: A newly-allocated string.
  4.1615   */
  4.1616 -const char *getaword(const char *fromline,char *thisword)
  4.1617 +gchar *getaword(const char **ptr)
  4.1618  {
  4.1619 -    int i,wordlen;
  4.1620 +    int i;
  4.1621      const char *s;
  4.1622 -    wordlen=0;
  4.1623 -    for (;!gcisdigit(*fromline) && !gcisalpha(*fromline) && *fromline;
  4.1624 -      fromline++)
  4.1625 +    GString *word;
  4.1626 +    word=g_string_new(NULL);
  4.1627 +    for (;!gcisdigit(**ptr) && !gcisalpha(**ptr) && **ptr;(*ptr)++)
  4.1628  	;
  4.1629      /*
  4.1630       * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
  4.1631 @@ -2887,64 +2965,25 @@
  4.1632       * If found, it returns this whole pattern as a word; otherwise we discard
  4.1633       * the results and resume our normal programming.
  4.1634       */
  4.1635 -    s=fromline;
  4.1636 -    for (;(gcisdigit(*s) || gcisalpha(*s) || *s==',' || *s=='.') &&
  4.1637 -      wordlen<MAXWORDLEN;s++)
  4.1638 +    s=*ptr;
  4.1639 +    for (;gcisdigit(*s) || gcisalpha(*s) || *s==',' || *s=='.';s++)
  4.1640 +	g_string_append_c(word,*s);
  4.1641 +    for (i=1;i+1<word->len;i++)
  4.1642      {
  4.1643 -	thisword[wordlen]=*s;
  4.1644 -	wordlen++;
  4.1645 -    }
  4.1646 -    thisword[wordlen]=0;
  4.1647 -    for (i=1;i<wordlen-1;i++)
  4.1648 -    {
  4.1649 -	if (thisword[i]=='.' || thisword[i]==',')
  4.1650 +	if (word->str[i]=='.' || word->str[i]==',')
  4.1651  	{
  4.1652 -	    if (gcisdigit(thisword[i-1]) && gcisdigit(thisword[i-1]))
  4.1653 +	    if (gcisdigit(word->str[i-1]) && gcisdigit(word->str[i-1]))
  4.1654  	    {
  4.1655 -		fromline=s;
  4.1656 -		return fromline;
  4.1657 +		*ptr=s;
  4.1658 +		return g_string_free(word,FALSE);
  4.1659  	    }
  4.1660  	}
  4.1661      }
  4.1662      /* we didn't find a punctuated number - do the regular getword thing */
  4.1663 -    wordlen=0;
  4.1664 -    for (;(gcisdigit(*fromline) || gcisalpha(*fromline) || *fromline=='\'') &&
  4.1665 -      wordlen<MAXWORDLEN;fromline++)
  4.1666 -    {
  4.1667 -	thisword[wordlen]=*fromline;
  4.1668 -	wordlen++;
  4.1669 -    }
  4.1670 -    thisword[wordlen]=0;
  4.1671 -    return fromline;
  4.1672 -}
  4.1673 -
  4.1674 -/*
  4.1675 - * matchword:
  4.1676 - *
  4.1677 - * A case-insensitive string matcher.
  4.1678 - */
  4.1679 -int matchword(char *checkfor,char *thisword)
  4.1680 -{
  4.1681 -    unsigned int ismatch,i;
  4.1682 -    if (strlen(checkfor)!=strlen(thisword))
  4.1683 -	return 0;
  4.1684 -    ismatch=1;     /* assume a match until we find a difference */
  4.1685 -    for (i=0;i<strlen(checkfor);i++)
  4.1686 -	if (toupper(checkfor[i])!=toupper(thisword[i]))
  4.1687 -	    ismatch=0;
  4.1688 -    return ismatch;
  4.1689 -}
  4.1690 -
  4.1691 -/*
  4.1692 - * lowerit:
  4.1693 - *
  4.1694 - * Lowercase the line.
  4.1695 - */
  4.1696 -void lowerit(char *theline)
  4.1697 -{
  4.1698 -    for (;*theline;theline++)
  4.1699 -	if (*theline>='A' && *theline<='Z')
  4.1700 -	    *theline+=32;
  4.1701 +    g_string_truncate(word,0);
  4.1702 +    for (;gcisdigit(**ptr) || gcisalpha(**ptr) || **ptr=='\'';(*ptr)++)
  4.1703 +	g_string_append_c(word,**ptr);
  4.1704 +    return g_string_free(word,FALSE);
  4.1705  }
  4.1706  
  4.1707  /*
  4.1708 @@ -2961,11 +3000,11 @@
  4.1709   * XL or an optional XC, an optional IX or IV, an optional V and any number
  4.1710   * of optional Is.
  4.1711   */
  4.1712 -int isroman(char *t)
  4.1713 +gboolean isroman(const char *t)
  4.1714  {
  4.1715 -    char *s;
  4.1716 +    const char *s;
  4.1717      if (!t || !*t)
  4.1718 -	return 0;
  4.1719 +	return FALSE;
  4.1720      s=t;
  4.1721      while (*t=='m' && *t)
  4.1722  	t++;
  4.1723 @@ -3006,19 +3045,19 @@
  4.1724   * errors. gcisalpha() recognizes accented letters from the CP1252 (Windows)
  4.1725   * and ISO-8859-1 character sets, which are the most common PG 8-bit types.
  4.1726   */
  4.1727 -int gcisalpha(unsigned char c)
  4.1728 +gboolean gcisalpha(unsigned char c)
  4.1729  {
  4.1730      if (c>='a' && c<='z')
  4.1731 -	return 1;
  4.1732 +	return TRUE;
  4.1733      if (c>='A' && c<='Z')
  4.1734 -	return 1;
  4.1735 +	return TRUE;
  4.1736      if (c<140)
  4.1737 -	return 0;
  4.1738 +	return FALSE;
  4.1739      if (c>=192 && c!=208 && c!=215 && c!=222 && c!=240 && c!=247 && c!=254)
  4.1740 -	return 1;
  4.1741 +	return TRUE;
  4.1742      if (c==140 || c==142 || c==156 || c==158 || c==159)
  4.1743 -	return 1;
  4.1744 -    return 0;
  4.1745 +	return TRUE;
  4.1746 +    return FALSE;
  4.1747  }
  4.1748  
  4.1749  /*
  4.1750 @@ -3026,7 +3065,7 @@
  4.1751   *
  4.1752   * A version of isdigit() that doesn't get confused in 8-bit texts.
  4.1753   */
  4.1754 -int gcisdigit(unsigned char c)
  4.1755 +gboolean gcisdigit(unsigned char c)
  4.1756  {   
  4.1757      return c>='0' && c<='9';
  4.1758  }
  4.1759 @@ -3037,24 +3076,12 @@
  4.1760   * A version of isletter() that doesn't get confused in 8-bit texts.
  4.1761   * NB: this is ISO-8891-1-specific.
  4.1762   */
  4.1763 -int gcisletter(unsigned char c)
  4.1764 +gboolean gcisletter(unsigned char c)
  4.1765  {   
  4.1766      return c>='A' && c<='Z' || c>='a' && c<='z' || c>=192;
  4.1767  }
  4.1768  
  4.1769  /*
  4.1770 - * gcstrchr:
  4.1771 - *
  4.1772 - * Wraps strchr to return NULL if the character being searched for is zero.
  4.1773 - */
  4.1774 -char *gcstrchr(char *s,char c)
  4.1775 -{
  4.1776 -    if (!c)
  4.1777 -	return NULL;
  4.1778 -    return strchr(s,c);
  4.1779 -}
  4.1780 -
  4.1781 -/*
  4.1782   * postprocess_for_DP:
  4.1783   *
  4.1784   * Invoked with the -d switch from flgets().
  4.1785 @@ -3097,7 +3124,7 @@
  4.1786   */
  4.1787  void postprocess_for_HTML(char *theline)
  4.1788  {
  4.1789 -    if (strstr(theline,"<") && strstr(theline,">"))
  4.1790 +    if (strchr(theline,'<') && strchr(theline,'>'))
  4.1791  	while (losemarkup(theline))
  4.1792  	    ;
  4.1793      while (loseentities(theline))
  4.1794 @@ -3171,9 +3198,9 @@
  4.1795      return NULL;
  4.1796  }
  4.1797  
  4.1798 -int tagcomp(char *strin,char *basetag)
  4.1799 +int tagcomp(const char *strin,const char *basetag)
  4.1800  {
  4.1801 -    char *s,*t;
  4.1802 +    const char *s,*t;
  4.1803      s=basetag;
  4.1804      t=strin;
  4.1805      if (*t=='/')
  4.1806 @@ -3188,8 +3215,9 @@
  4.1807      return 0;
  4.1808  }
  4.1809  
  4.1810 -void proghelp()
  4.1811 +void proghelp(GOptionContext *context)
  4.1812  {
  4.1813 +    gchar *help;
  4.1814      fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
  4.1815      fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
  4.1816      fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
  4.1817 @@ -3198,22 +3226,10 @@
  4.1818      fputs("This is Free Software; "
  4.1819        "you may redistribute it under certain conditions (GPL);\n",stderr);
  4.1820      fputs("read the file COPYING for details.\n\n",stderr);
  4.1821 -    fputs("Usage is: bookloupe [-setpxloyhud] filename\n",stderr);
  4.1822 -    fputs("  where -s checks single quotes, -e suppresses echoing lines, "
  4.1823 -      "-t checks typos\n",stderr);
  4.1824 -    fputs("  -x (paranoid) switches OFF -t and extra checks, "
  4.1825 -      "-l turns OFF line-end checks\n",stderr);
  4.1826 -    fputs("  -o just displays overview without detail, "
  4.1827 -      "-h echoes header fields\n",stderr);
  4.1828 -    fputs("  -v (verbose) unsuppresses duplicate reporting, "
  4.1829 -      "-m suppresses markup\n",stderr);
  4.1830 -    fputs("  -d ignores DP-specific markup,\n",stderr);
  4.1831 -    fputs("  -u uses a file gutcheck.typ to query user-defined "
  4.1832 -      "possible typos\n",stderr);
  4.1833 -    fputs("Sample usage: bookloupe warpeace.txt \n",stderr);
  4.1834 -    fputs("\n",stderr);
  4.1835 -    fputs("Bookloupe looks for errors in Project Gutenberg(TM) etexts.\n",
  4.1836 -      stderr);
  4.1837 +    help=g_option_context_get_help(context,TRUE,NULL);
  4.1838 +    fputs(help,stderr);
  4.1839 +    g_free(help);
  4.1840 +    fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
  4.1841      fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
  4.1842        "non-ASCII\n",stderr);
  4.1843      fputs("characters like accented letters, "

     5.1 --- a/configure.ac	Mon May 27 09:03:04 2013 +0100
     5.2 +++ b/configure.ac	Tue May 28 15:17:19 2013 +0100
     5.3 @@ -13,7 +13,8 @@
     5.4  test/compatibility/Makefile
     5.5  doc/Makefile
     5.6  ])
     5.7 -AM_INIT_AUTOMAKE(no-define)
     5.8 +AM_INIT_AUTOMAKE(no-define,1.11)
     5.9 +AM_SILENT_RULES([yes])
    5.10  AC_CANONICAL_HOST
    5.11  
    5.12  ##################################################

     6.1 --- a/test/compatibility/user-defined-typo.tst	Mon May 27 09:03:04 2013 +0100
     6.2 +++ b/test/compatibility/user-defined-typo.tst	Tue May 28 15:17:19 2013 +0100
     6.3 @@ -1,6 +1,6 @@
     6.4  **************** OPTIONS ****************
     6.5  -u
     6.6 -**************** INPUT(gutcheck.typ) ****************
     6.7 +**************** INPUT(bookloupe.typ) ****************
     6.8  arid
     6.9  **************** INPUT ****************
    6.10  I am the very model of a modern Major-General,

     7.1 --- a/test/harness/testcaseparser.c	Mon May 27 09:03:04 2013 +0100
     7.2 +++ b/test/harness/testcaseparser.c	Tue May 28 15:17:19 2013 +0100
     7.3 @@ -91,9 +91,12 @@
     7.4  {
     7.5      TestcaseParser *parser;
     7.6      gsize len;
     7.7 +    GError *err=NULL;
     7.8      parser=g_new0(TestcaseParser,1);
     7.9 -    if (!file_get_contents_text(filename,&parser->contents,&len))
    7.10 +    if (!file_get_contents_text(filename,&parser->contents,&len,&err))
    7.11      {
    7.12 +	g_printerr("%s: %s\n",filename,err->message);
    7.13 +	g_error_free(err);
    7.14  	g_free(parser);
    7.15  	return NULL;
    7.16      }
author	ali <ali@juiblex.co.uk>
	Tue May 28 15:17:19 2013 +0100 (2013-05-28)
changeset 69	1016349e619f
parent 68	adb087007d08
child 70	aa916da2e452
bl/textfileutils.c		file \| annotate \| diff \| revisions
bl/textfileutils.h		file \| annotate \| diff \| revisions
bookloupe/Makefile.am		file \| annotate \| diff \| revisions
bookloupe/bookloupe.c		file \| annotate \| diff \| revisions
configure.ac		file \| annotate \| diff \| revisions
test/compatibility/user-defined-typo.tst		file \| annotate \| diff \| revisions
test/harness/testcaseparser.c		file \| annotate \| diff \| revisions