1.1 --- a/bookloupe/bookloupe.c	Mon May 27 09:03:04 2013 +0100
     1.2 +++ b/bookloupe/bookloupe.c	Tue May 28 15:17:19 2013 +0100
     1.3 @@ -22,19 +22,10 @@
     1.4  #include <stdlib.h>
     1.5  #include <string.h>
     1.6  #include <ctype.h>
     1.7 +#include <glib.h>
     1.8 +#include <bl/bl.h>
     1.9  
    1.10 -#define MAXWORDLEN    80    /* max length of one word */
    1.11 -#define LINEBUFSIZE 2048    /* buffer size for an input line */
    1.12 -
    1.13 -#define MAX_USER_TYPOS 1000
    1.14 -#define USERTYPO_FILE "gutcheck.typ"
    1.15 -
    1.16 -#ifndef MAX_PATH
    1.17 -#define MAX_PATH 16384
    1.18 -#endif
    1.19 -
    1.20 -char aline[LINEBUFSIZE];
    1.21 -char prevline[LINEBUFSIZE];
    1.22 +gchar *prevline;
    1.23  
    1.24  /* Common typos. */
    1.25  char *typo[] = {
    1.26 @@ -70,7 +61,7 @@
    1.27      "se", ""
    1.28  };
    1.29  
    1.30 -char *usertypo[MAX_USER_TYPOS];
    1.31 +GTree *usertypo;
    1.32  
    1.33  /* Common abbreviations and other OK words not to query as typos. */
    1.34  char *okword[] = {
    1.35 @@ -282,46 +273,57 @@
    1.36  #define WAY_TOO_LONG      80
    1.37  #define SHORTEST_PG_LINE  55
    1.38  
    1.39 -#define SWITCHES "ESTPXLOYHWVMUD" /* switches:- */
    1.40 -				  /*     D - ignore DP-specific markup */
    1.41 -				  /*     E - echo queried line */
    1.42 -				  /*     S - check single quotes */
    1.43 -				  /*     T - check common typos	*/
    1.44 -				  /*     P - require closure of quotes on */
    1.45 -				  /*	 every paragraph */
    1.46 -				  /*     X - "Trust no one" :-) Paranoid! */
    1.47 -				  /*	 Queries everything */
    1.48 -				  /*     L - line end checking defaults on */
    1.49 -				  /*	 -L turns it off */
    1.50 -				  /*     O - overview. Just shows counts. */
    1.51 -				  /*     Y - puts errors to stdout */
    1.52 -				  /*	 instead of stderr */
    1.53 -				  /*     H - Echoes header fields */
    1.54 -				  /*     M - Ignore markup in < > */
    1.55 -				  /*     U - Use file of User-defined Typos */
    1.56 -				  /*     W - Defaults for use on Web upload */
    1.57 -				  /*     V - Verbose - list EVERYTHING! */
    1.58 -#define SWITNO 14		  /* max number of switch parms	*/
    1.59 -				  /*	- used for defining array-size */
    1.60 -#define MINARGS   1  /* minimum no of args excl switches */
    1.61 -#define MAXARGS   1  /* maximum no of args excl switches */
    1.62 +enum {
    1.63 +    ECHO_SWITCH,
    1.64 +    SQUOTE_SWITCH,
    1.65 +    TYPO_SWITCH,
    1.66 +    QPARA_SWITCH,
    1.67 +    PARANOID_SWITCH,
    1.68 +    LINE_END_SWITCH,
    1.69 +    OVERVIEW_SWITCH,
    1.70 +    STDOUT_SWITCH,
    1.71 +    HEADER_SWITCH,
    1.72 +    WEB_SWITCH,
    1.73 +    VERBOSE_SWITCH,
    1.74 +    MARKUP_SWITCH,
    1.75 +    USERTYPO_SWITCH,
    1.76 +    DP_SWITCH,
    1.77 +    SWITNO
    1.78 +};
    1.79  
    1.80 -int pswit[SWITNO];   /* program switches set by SWITCHES */
    1.81 +gboolean pswit[SWITNO];  /* program switches */
    1.82  
    1.83 -#define ECHO_SWITCH      0
    1.84 -#define SQUOTE_SWITCH    1
    1.85 -#define TYPO_SWITCH      2
    1.86 -#define QPARA_SWITCH     3
    1.87 -#define PARANOID_SWITCH  4
    1.88 -#define LINE_END_SWITCH  5
    1.89 -#define OVERVIEW_SWITCH  6
    1.90 -#define STDOUT_SWITCH    7
    1.91 -#define HEADER_SWITCH    8
    1.92 -#define WEB_SWITCH       9
    1.93 -#define VERBOSE_SWITCH   10
    1.94 -#define MARKUP_SWITCH    11
    1.95 -#define USERTYPO_SWITCH  12
    1.96 -#define DP_SWITCH	 13
    1.97 +static GOptionEntry options[]={
    1.98 +    { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
    1.99 +      "Ignore DP-specific markup", NULL },
   1.100 +    { "noecho", 'e', 0, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
   1.101 +      "Don't echo queried line", NULL },
   1.102 +    { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
   1.103 +      "Check single quotes", NULL },
   1.104 +    { "typo", 't', 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
   1.105 +      "Check common typos", NULL },
   1.106 +    { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
   1.107 +      "Require closure of quotes on every paragraph", NULL },
   1.108 +    { "relaxed", 'x', 0, G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
   1.109 +      "Disable paranoid querying of everything", NULL },
   1.110 +    { "line-end", 'l', 0, G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
   1.111 +      "Disable line end checking", NULL },
   1.112 +    { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
   1.113 +      "Overview: just show counts", NULL },
   1.114 +    { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
   1.115 +      "Output errors to stdout instead of stderr", NULL },
   1.116 +    { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
   1.117 +      "Echo header fields", NULL },
   1.118 +    { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
   1.119 +      "Ignore markup in < >", NULL },
   1.120 +    { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
   1.121 +      "Use file of user-defined typos", NULL },
   1.122 +    { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
   1.123 +      "Defaults for use on www upload", NULL },
   1.124 +    { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
   1.125 +      "Verbose - list everything", NULL },
   1.126 +    { NULL }
   1.127 +};
   1.128  
   1.129  long cnt_dquot;		/* for overview mode, count of doublequote queries */
   1.130  long cnt_squot;		/* for overview mode, count of singlequote queries */
   1.131 @@ -340,47 +342,26 @@
   1.132  long linecnt;		/* count of total lines in the file */
   1.133  long checked_linecnt;	/* count of lines actually checked */
   1.134  
   1.135 -void proghelp(void);
   1.136 -void procfile(char *);
   1.137 +void proghelp(GOptionContext *context);
   1.138 +void procfile(const char *);
   1.139  
   1.140 -#define LOW_THRESHOLD    0
   1.141 -#define HIGH_THRESHOLD   1
   1.142 +gchar *running_from;
   1.143  
   1.144 -#define START 0
   1.145 -#define END 1
   1.146 -#define PREV 0
   1.147 -#define NEXT 1
   1.148 -#define FIRST_OF_PAIR 0
   1.149 -#define SECOND_OF_PAIR 1
   1.150 -
   1.151 -#define MAX_WORDPAIR 1000
   1.152 -
   1.153 -char running_from[MAX_PATH];
   1.154 -
   1.155 -int mixdigit(char *);
   1.156 -const char *getaword(const char *,char *);
   1.157 -int matchword(char *,char *);
   1.158 -char *flgets(char *,int,FILE *,long);
   1.159 -void lowerit(char *);
   1.160 -int gcisalpha(unsigned char);
   1.161 -int gcisdigit(unsigned char);
   1.162 -int gcisletter(unsigned char);
   1.163 -char *gcstrchr(char *s,char c);
   1.164 +int mixdigit(const char *);
   1.165 +gchar *getaword(const char **);
   1.166 +char *flgets(char **,long);
   1.167 +gboolean gcisalpha(unsigned char);
   1.168 +gboolean gcisdigit(unsigned char);
   1.169 +gboolean gcisletter(unsigned char);
   1.170  void postprocess_for_HTML(char *);
   1.171  char *linehasmarkup(char *);
   1.172  char *losemarkup(char *);
   1.173 -int tagcomp(char *,char *);
   1.174 +int tagcomp(const char *,const char *);
   1.175  char *loseentities(char *);
   1.176 -int isroman(char *);
   1.177 -int usertypo_count;
   1.178 +gboolean isroman(const char *);
   1.179  void postprocess_for_DP(char *);
   1.180  
   1.181 -char wrk[LINEBUFSIZE];
   1.182 -
   1.183 -#define MAX_QWORD 50
   1.184 -#define MAX_QWORD_LENGTH 40
   1.185 -char qword[MAX_QWORD][MAX_QWORD_LENGTH];
   1.186 -int dupcnt[MAX_QWORD];
   1.187 +GTree *qword,*qperiod;
   1.188  
   1.189  struct first_pass_results {
   1.190      long firstline,astline;
   1.191 @@ -392,7 +373,8 @@
   1.192  
   1.193  struct warnings {
   1.194      int shortline,longline,bin,dash,dotcomma,ast,fslash,digit,hyphen;
   1.195 -    int endquote,isDutch,isFrench;
   1.196 +    int endquote;
   1.197 +    gboolean isDutch,isFrench;
   1.198  };
   1.199  
   1.200  struct counters {
   1.201 @@ -411,52 +393,35 @@
   1.202  };
   1.203  
   1.204  struct pending {
   1.205 -    char dquote[80],squote[80],rbrack[80],sbrack[80],cbrack[80],unders[80];
   1.206 +    char *dquote,*squote,*rbrack,*sbrack,*cbrack,*unders;
   1.207      long squot;
   1.208  };
   1.209  
   1.210 -int main(int argc,char **argv)
   1.211 +void parse_options(int *argc,char ***argv)
   1.212  {
   1.213 -    char *argsw,*s;
   1.214 -    int i,switno,invarg;
   1.215 -    char usertypo_file[MAX_PATH];
   1.216 -    FILE *usertypofile;
   1.217 -    if (strlen(argv[0])<sizeof(running_from))
   1.218 -	/* save the path to the executable */
   1.219 -	strcpy(running_from,argv[0]);
   1.220 -    /* find out what directory we're running from */
   1.221 -    s=running_from+strlen(running_from);
   1.222 -    for (;*s!='/' && *s!='\\' && s>=running_from;s--)
   1.223 -	*s=0;
   1.224 -    switno=strlen(SWITCHES);
   1.225 -    for (i=switno;--i>0;)
   1.226 -	pswit[i]=0;	   /* initialise switches */
   1.227 -    /*
   1.228 -     * Standard loop to extract switches.
   1.229 -     * When we come out of this loop, the arguments will be
   1.230 -     * in argv[0] upwards and the switches used will be
   1.231 -     * represented by their equivalent elements in pswit[]
   1.232 -     */
   1.233 -    while (--argc>0 && **++argv=='-')
   1.234 -	for (argsw=argv[0]+1;*argsw!='\0';argsw++)
   1.235 -	    for (i=switno,invarg=1;(--i>=0) && invarg==1;)
   1.236 -		if ((toupper(*argsw))==SWITCHES[i])
   1.237 -		{
   1.238 -		    invarg=0;
   1.239 -		    pswit[i]=1;
   1.240 -		}
   1.241 +    GError *err=NULL;
   1.242 +    GOptionContext *context;
   1.243 +    context=g_option_context_new(
   1.244 +      "file - looks for errors in Project Gutenberg(TM) etexts");
   1.245 +    g_option_context_add_main_entries(context,options,NULL);
   1.246 +    if (!g_option_context_parse(context,argc,argv,&err))
   1.247 +    {
   1.248 +	g_printerr("Bookloupe: %s\n",err->message);
   1.249 +	g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
   1.250 +	exit(1);
   1.251 +    }
   1.252      /* Paranoid checking is turned OFF, not on, by its switch */
   1.253 -    pswit[PARANOID_SWITCH]^=1;
   1.254 +    pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
   1.255      if (pswit[PARANOID_SWITCH])
   1.256 -	/* if running in paranoid mode force typo checks as well   */
   1.257 -	pswit[TYPO_SWITCH]=pswit[TYPO_SWITCH]^1;
   1.258 +	/* if running in paranoid mode, typo checks default to enabled */
   1.259 +	pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
   1.260      /* Line-end checking is turned OFF, not on, by its switch */
   1.261 -    pswit[LINE_END_SWITCH]^=1;
   1.262 +    pswit[LINE_END_SWITCH]=!pswit[LINE_END_SWITCH];
   1.263      /* Echoing is turned OFF, not on, by its switch */
   1.264 -    pswit[ECHO_SWITCH]^=1;
   1.265 +    pswit[ECHO_SWITCH]=!pswit[ECHO_SWITCH];
   1.266      if (pswit[OVERVIEW_SWITCH])
   1.267  	/* just print summary; don't echo */
   1.268 -	pswit[ECHO_SWITCH]=0;
   1.269 +	pswit[ECHO_SWITCH]=FALSE;
   1.270      /*
   1.271       * Web uploads - for the moment, this is really just a placeholder
   1.272       * until we decide what processing we really want to do on web uploads
   1.273 @@ -464,85 +429,155 @@
   1.274      if (pswit[WEB_SWITCH])
   1.275      {
   1.276  	/* specific override for web uploads */
   1.277 -	pswit[ECHO_SWITCH]=1;
   1.278 -	pswit[SQUOTE_SWITCH]=0;
   1.279 -	pswit[TYPO_SWITCH]=1;
   1.280 -	pswit[QPARA_SWITCH]=0;
   1.281 -	pswit[PARANOID_SWITCH]=1;
   1.282 -	pswit[LINE_END_SWITCH]=0;
   1.283 -	pswit[OVERVIEW_SWITCH]=0;
   1.284 -	pswit[STDOUT_SWITCH]=0;
   1.285 -	pswit[HEADER_SWITCH]=1;
   1.286 -	pswit[VERBOSE_SWITCH]=0;
   1.287 -	pswit[MARKUP_SWITCH]=0;
   1.288 -	pswit[USERTYPO_SWITCH]=0;
   1.289 -	pswit[DP_SWITCH]=0;
   1.290 +	pswit[ECHO_SWITCH]=TRUE;
   1.291 +	pswit[SQUOTE_SWITCH]=FALSE;
   1.292 +	pswit[TYPO_SWITCH]=TRUE;
   1.293 +	pswit[QPARA_SWITCH]=FALSE;
   1.294 +	pswit[PARANOID_SWITCH]=TRUE;
   1.295 +	pswit[LINE_END_SWITCH]=FALSE;
   1.296 +	pswit[OVERVIEW_SWITCH]=FALSE;
   1.297 +	pswit[STDOUT_SWITCH]=FALSE;
   1.298 +	pswit[HEADER_SWITCH]=TRUE;
   1.299 +	pswit[VERBOSE_SWITCH]=FALSE;
   1.300 +	pswit[MARKUP_SWITCH]=FALSE;
   1.301 +	pswit[USERTYPO_SWITCH]=FALSE;
   1.302 +	pswit[DP_SWITCH]=FALSE;
   1.303      }
   1.304 -    if (argc<MINARGS || argc>MAXARGS)
   1.305 +    if (*argc<2)
   1.306      {
   1.307 -	/* check number of args */
   1.308 -	proghelp();
   1.309 -	return 1;
   1.310 +	proghelp(context);
   1.311 +	exit(1);
   1.312      }
   1.313 -    /* read in the user-defined stealth scanno list */
   1.314 +    g_option_context_free(context);
   1.315 +}
   1.316 +
   1.317 +/*
   1.318 + * read_user_scannos:
   1.319 + *
   1.320 + * Read in the user-defined stealth scanno list.
   1.321 + */
   1.322 +void read_user_scannos(void)
   1.323 +{
   1.324 +    GError *err=NULL;
   1.325 +    gchar *usertypo_file;
   1.326 +    gboolean okay;
   1.327 +    int i;
   1.328 +    gsize len;
   1.329 +    gchar *contents,**lines;
   1.330 +    usertypo_file=g_strdup("bookloupe.typ");
   1.331 +    okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   1.332 +    if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   1.333 +    {
   1.334 +	g_clear_error(&err);
   1.335 +	g_free(usertypo_file);
   1.336 +	usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
   1.337 +	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   1.338 +    }
   1.339 +    if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   1.340 +    {
   1.341 +	g_clear_error(&err);
   1.342 +	g_free(usertypo_file);
   1.343 +	usertypo_file=g_strdup("gutcheck.typ");
   1.344 +	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   1.345 +    }
   1.346 +    if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   1.347 +    {
   1.348 +	g_clear_error(&err);
   1.349 +	g_free(usertypo_file);
   1.350 +	usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
   1.351 +	okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
   1.352 +    }
   1.353 +    if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
   1.354 +    {
   1.355 +	g_free(usertypo_file);
   1.356 +	printf("   --> I couldn't find bookloupe.typ "
   1.357 +	  "-- proceeding without user typos.\n");
   1.358 +	return;
   1.359 +    }
   1.360 +    else if (!okay)
   1.361 +    {
   1.362 +	fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
   1.363 +	g_free(usertypo_file);
   1.364 +	g_clear_error(&err);
   1.365 +	exit(1);
   1.366 +    }
   1.367 +    lines=g_strsplit(contents,"\n",0);
   1.368 +    usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
   1.369 +    for (i=0;lines[i];i++)
   1.370 +	if (*(unsigned char *)lines[i]>'!')
   1.371 +	    g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
   1.372 +	else
   1.373 +	    g_free(lines[i]);
   1.374 +    g_free(lines);
   1.375 +}
   1.376 +
   1.377 +#if 0
   1.378 +/*
   1.379 + * read_etext:
   1.380 + *
   1.381 + * Read an etext returning an array of lines. Lines are normally expected
   1.382 + * to be terminated by CR LF. Solitary LFs delimit lines but are left
   1.383 + * embedded at the end of the line for further processing. Solitary CRs
   1.384 + * do not delimit lines.
   1.385 + */
   1.386 +gchar **read_etext(const char *filename,GError **err)
   1.387 +{
   1.388 +    int i;
   1.389 +    const char *s,*t;
   1.390 +    gchar *contents;
   1.391 +    gchar **raw_lines;
   1.392 +    GPtrArray *lines;
   1.393 +    gsize len;
   1.394 +    if (!g_file_get_contents(filename,&contents,&len,err))
   1.395 +	return NULL;
   1.396 +    raw_lines=g_strsplit(contents,"\r\n",0);
   1.397 +    lines=g_ptr_array_sized_new(g_strv_length(raw_lines)+1);
   1.398 +    for (i=0;raw_lines[i];i++)
   1.399 +    {
   1.400 +	t=strchr(raw_lines[i],'\n');
   1.401 +	if (t)
   1.402 +	{
   1.403 +	    s=raw_lines[i];
   1.404 +	    while ((t=strchr(s,'\n')))
   1.405 +	    {
   1.406 +		g_ptr_array_add(lines,g_strndup(s,t-s+1));
   1.407 +		s=t+1;
   1.408 +	    }
   1.409 +	    g_ptr_array_add(lines,g_strdup(s));
   1.410 +	    g_free(raw_lines[i]);
   1.411 +	}
   1.412 +	else
   1.413 +	    g_ptr_array_add(lines,raw_lines[i]);
   1.414 +    }
   1.415 +    g_free(raw_lines);
   1.416 +    g_ptr_array_add(lines,NULL);
   1.417 +    return (gchar **)g_ptr_array_free(lines,FALSE);
   1.418 +}
   1.419 +#else
   1.420 +/*
   1.421 + * read_etext:
   1.422 + *
   1.423 + * Read an etext returning a newly allocated string containing the file
   1.424 + * contents or NULL on error.
   1.425 + */
   1.426 +gchar *read_etext(const char *filename,GError **err)
   1.427 +{
   1.428 +    gchar *contents;
   1.429 +    gsize len;
   1.430 +    if (!g_file_get_contents(filename,&contents,&len,err))
   1.431 +	return NULL;
   1.432 +    return contents;
   1.433 +}
   1.434 +#endif
   1.435 +
   1.436 +int main(int argc,char **argv)
   1.437 +{
   1.438 +    running_from=g_path_get_dirname(argv[0]);
   1.439 +    parse_options(&argc,&argv);
   1.440      if (pswit[USERTYPO_SWITCH])
   1.441 -    {
   1.442 -	/* ... we were told we had one! */
   1.443 -	usertypofile=fopen(USERTYPO_FILE,"rb");
   1.444 -	if (!usertypofile)
   1.445 -	{
   1.446 -	    /* not in cwd. try excuteable directory. */
   1.447 -	    strcpy(usertypo_file,running_from);
   1.448 -	    strcat(usertypo_file,USERTYPO_FILE);
   1.449 -	    usertypofile=fopen(usertypo_file,"rb");
   1.450 -	    if (!usertypofile) {
   1.451 -		/* we ain't got no user typo file! */
   1.452 -		printf("   --> I couldn't find gutcheck.typ "
   1.453 -		  "-- proceeding without user typos.\n");
   1.454 -	    }
   1.455 -	}
   1.456 -	usertypo_count=0;
   1.457 -	if (usertypofile)
   1.458 -	{
   1.459 -	    /* we managed to open a User Typo File! */
   1.460 -	    if (pswit[USERTYPO_SWITCH])
   1.461 -	    {
   1.462 -		while (flgets(aline,LINEBUFSIZE-1,usertypofile,
   1.463 -		  (long)usertypo_count))
   1.464 -		{
   1.465 -		    if (strlen(aline)>1)
   1.466 -		    {
   1.467 -			if ((int)*aline>33)
   1.468 -			{
   1.469 -			    s=malloc(strlen(aline)+1);
   1.470 -			    if (!s)
   1.471 -			    {
   1.472 -				fprintf(stderr,"bookloupe: cannot get enough "
   1.473 -				  "memory for user typo file!\n");
   1.474 -				exit(1);
   1.475 -			    }
   1.476 -			    strcpy(s,aline);
   1.477 -			    usertypo[usertypo_count]=s;
   1.478 -			    usertypo_count++;
   1.479 -			    if (usertypo_count>=MAX_USER_TYPOS)
   1.480 -			    {
   1.481 -				printf("   --> Only %d user-defined typos "
   1.482 -				  "allowed: ignoring the rest\n",
   1.483 -				  MAX_USER_TYPOS);
   1.484 -				break;
   1.485 -			    }
   1.486 -			}
   1.487 -		    }
   1.488 -		}
   1.489 -	    }
   1.490 -	    fclose(usertypofile);
   1.491 -	}
   1.492 -    }
   1.493 +	read_user_scannos();
   1.494      fprintf(stderr,"bookloupe: Check and report on an e-text\n");
   1.495 -    cnt_dquot=cnt_squot=cnt_brack=cnt_bin=cnt_odd=cnt_long=
   1.496 -    cnt_short=cnt_punct=cnt_dash=cnt_word=cnt_html=cnt_lineend=
   1.497 -    cnt_spacend=0;
   1.498 -    procfile(argv[0]);
   1.499 +    procfile(argv[1]);
   1.500      if (pswit[OVERVIEW_SWITCH])
   1.501      {
   1.502  	printf("    Checked %ld lines of %ld (head+foot = %ld)\n\n",
   1.503 @@ -577,6 +612,9 @@
   1.504  	  cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+
   1.505  	  cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend);
   1.506      }
   1.507 +    g_free(running_from);
   1.508 +    if (usertypo)
   1.509 +	g_tree_unref(usertypo);
   1.510      return 0;
   1.511  }
   1.512  
   1.513 @@ -588,28 +626,33 @@
   1.514   * occur many times in the text like long or short
   1.515   * lines, non-standard dashes, etc.
   1.516   */
   1.517 -struct first_pass_results *first_pass(FILE *infile)
   1.518 +struct first_pass_results *first_pass(const char *etext)
   1.519  {
   1.520      char laststart=CHAR_SPACE;
   1.521      const char *s;
   1.522 -    int i,llen;
   1.523 +    gchar *lc_line;
   1.524 +    int i,j,llen;
   1.525 +    gchar **lines;
   1.526      unsigned int lastlen=0,lastblen=0;
   1.527      long spline=0,nspline=0;
   1.528      static struct first_pass_results results={0};
   1.529 -    char inword[MAXWORDLEN]="";
   1.530 -    while (fgets(aline,LINEBUFSIZE-1,infile))
   1.531 +    gchar *inword;
   1.532 +    lines=g_strsplit(etext,"\n",0);
   1.533 +    for (j=0;lines[j];j++)
   1.534      {
   1.535 -	while (aline[strlen(aline)-1]==10 || aline[strlen(aline)-1]==13)
   1.536 -	    aline[strlen(aline)-1]=0;
   1.537 +	llen=strlen(lines[j]);
   1.538 +	while(lines[j][llen-1]=='\r')
   1.539 +	    lines[j][llen--]='\0';
   1.540  	linecnt++;
   1.541 -	if (strstr(aline,"*END") && strstr(aline,"SMALL PRINT") &&
   1.542 -	  (strstr(aline,"PUBLIC DOMAIN") || strstr(aline,"COPYRIGHT")))
   1.543 +	if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
   1.544 +	  (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
   1.545  	{
   1.546  	    if (spline)
   1.547  		printf("   --> Duplicate header?\n");
   1.548  	    spline=linecnt+1;   /* first line of non-header text, that is */
   1.549  	}
   1.550 -	if (!strncmp(aline,"*** START",9) && strstr(aline,"PROJECT GUTENBERG"))
   1.551 +	if (!strncmp(lines[j],"*** START",9) &&
   1.552 +	  strstr(lines[j],"PROJECT GUTENBERG"))
   1.553  	{
   1.554  	    if (nspline)
   1.555  		printf("   --> Duplicate header?\n");
   1.556 @@ -617,10 +660,10 @@
   1.557  	}
   1.558  	if (spline || nspline)
   1.559  	{
   1.560 -	    lowerit(aline);
   1.561 -	    if (strstr(aline,"end") && strstr(aline,"project gutenberg"))
   1.562 +	    lc_line=g_ascii_strdown(lines[j],llen);
   1.563 +	    if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
   1.564  	    {
   1.565 -		if (strstr(aline,"end")<strstr(aline,"project gutenberg"))
   1.566 +		if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
   1.567  		{
   1.568  		    if (results.footerline)
   1.569  		    {
   1.570 @@ -632,6 +675,7 @@
   1.571  			results.footerline=linecnt;
   1.572  		}
   1.573  	    }
   1.574 +	    g_free(lc_line);
   1.575  	}
   1.576  	if (spline)
   1.577  	    results.firstline=spline;
   1.578 @@ -639,85 +683,83 @@
   1.579  	    results.firstline=nspline;  /* override with new */
   1.580  	if (results.footerline)
   1.581  	    continue;    /* don't count the boilerplate in the footer */
   1.582 -	llen=strlen(aline);
   1.583  	results.totlen+=llen;
   1.584  	for (i=0;i<llen;i++)
   1.585  	{
   1.586 -	    if ((unsigned char)aline[i]>127)
   1.587 +	    if ((unsigned char)lines[j][i]>127)
   1.588  		results.binlen++;
   1.589 -	    if (gcisalpha(aline[i]))
   1.590 +	    if (gcisalpha(lines[j][i]))
   1.591  		results.alphalen++;
   1.592 -	    if (i>0 && aline[i]==CHAR_DQUOTE && isalpha(aline[i-1]))
   1.593 +	    if (i>0 && lines[j][i]==CHAR_DQUOTE && isalpha(lines[j][i-1]))
   1.594  		results.endquote_count++;
   1.595  	}
   1.596 -	if (strlen(aline)>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE &&
   1.597 -	  lastblen>2 && lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
   1.598 +	if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
   1.599 +	  lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
   1.600  	    results.shortline++;
   1.601 -	if (*aline && (unsigned char)aline[strlen(aline)-1]<=CHAR_SPACE)
   1.602 +	if (llen>0 && (unsigned char)lines[j][llen-1]<=CHAR_SPACE)
   1.603  	    cnt_spacend++;
   1.604 -	if (strstr(aline,".,"))
   1.605 +	if (strstr(lines[j],".,"))
   1.606  	    results.dotcomma++;
   1.607  	/* only count ast lines for ignoring purposes where there is */
   1.608  	/* locase text on the line */
   1.609 -	if (strstr(aline,"*"))
   1.610 +	if (strchr(lines[j],'*'))
   1.611  	{
   1.612 -	    for (s=aline;*s;s++)
   1.613 +	    for (s=lines[j];*s;s++)
   1.614  		if (*s>='a' && *s<='z')
   1.615  		    break;
   1.616  	     if (*s)
   1.617  		results.astline++;
   1.618  	}
   1.619 -	if (strstr(aline,"/"))
   1.620 +	if (strchr(lines[j],'/'))
   1.621  	    results.fslashline++;
   1.622 -	for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--)
   1.623 +	for (i=llen-1;i>0 && (unsigned char)lines[j][i]<=CHAR_SPACE;i--)
   1.624  	    ;
   1.625 -	if (aline[i]=='-' && aline[i-1]!='-')
   1.626 +	if (i>1 && lines[j][i]=='-' && lines[j][i-1]!='-')
   1.627  	    results.hyphens++;
   1.628  	if (llen>LONGEST_PG_LINE)
   1.629  	    results.longline++;
   1.630  	if (llen>WAY_TOO_LONG)
   1.631  	    results.verylongline++;
   1.632 -	if (strstr(aline,"<") && strstr(aline,">"))
   1.633 +	if (strchr(lines[j],'<') && strchr(lines[j],'>'))
   1.634  	{
   1.635 -	    i=(int)(strstr(aline,">")-strstr(aline,"<")+1);
   1.636 +	    i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
   1.637  	    if (i>0)
   1.638  		results.htmcount++;
   1.639 -	    if (strstr(aline,"<i>"))
   1.640 +	    if (strstr(lines[j],"<i>"))
   1.641  		results.htmcount+=4; /* bonus marks! */
   1.642  	}
   1.643  	/* Check for spaced em-dashes */
   1.644 -	if (strstr(aline,"--"))
   1.645 +	if (lines[j][0] && (s=strstr(lines[j]+1,"--")))
   1.646  	{
   1.647  	    results.emdash++;
   1.648 -	    if (*(strstr(aline,"--")-1)==CHAR_SPACE ||
   1.649 -	       (*(strstr(aline,"--")+2)==CHAR_SPACE))
   1.650 +	    if (s[-1]==CHAR_SPACE || (s[2]==CHAR_SPACE))
   1.651  		results.space_emdash++;
   1.652 -	    if (*(strstr(aline,"--")-1)==CHAR_SPACE &&
   1.653 -	       (*(strstr(aline,"--")+2)==CHAR_SPACE))
   1.654 +	    if (s[-1]==CHAR_SPACE && (s[2]==CHAR_SPACE))
   1.655  		/* count of em-dashes with spaces both sides */
   1.656  		results.non_PG_space_emdash++;
   1.657 -	    if (*(strstr(aline,"--")-1)!=CHAR_SPACE &&
   1.658 -	       (*(strstr(aline,"--")+2)!=CHAR_SPACE))
   1.659 +	    if (s[-1]!=CHAR_SPACE && (s[2]!=CHAR_SPACE))
   1.660  		/* count of PG-type em-dashes with no spaces */
   1.661  		results.PG_space_emdash++;
   1.662  	}
   1.663 -	for (s=aline;*s;)
   1.664 +	for (s=lines[j];*s;)
   1.665  	{
   1.666 -	    s=getaword(s,inword);
   1.667 +	    inword=getaword(&s);
   1.668  	    if (!strcmp(inword,"hij") || !strcmp(inword,"niet")) 
   1.669  		results.Dutchcount++;
   1.670  	    if (!strcmp(inword,"dans") || !strcmp(inword,"avec")) 
   1.671  		results.Frenchcount++;
   1.672  	    if (!strcmp(inword,"0") || !strcmp(inword,"1")) 
   1.673  		results.standalone_digit++;
   1.674 +	    g_free(inword);
   1.675  	}
   1.676  	/* Check for spaced dashes */
   1.677 -	if (strstr(aline," -") && *(strstr(aline," -")+2)!='-')
   1.678 +	if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
   1.679  	    results.spacedash++;
   1.680  	lastblen=lastlen;
   1.681 -	lastlen=strlen(aline);
   1.682 -	laststart=aline[0];
   1.683 +	lastlen=llen;
   1.684 +	laststart=lines[j][0];
   1.685      }
   1.686 +    g_strfreev(lines);
   1.687      return &results;
   1.688  }
   1.689  
   1.690 @@ -856,17 +898,17 @@
   1.691  	  "Not reporting them.\n");
   1.692  	warnings.bin=0;
   1.693      }
   1.694 -    warnings.isDutch=0;
   1.695 +    warnings.isDutch=FALSE;
   1.696      if (results->Dutchcount>50)
   1.697      {
   1.698 -	warnings.isDutch=1;
   1.699 +	warnings.isDutch=TRUE;
   1.700  	printf("   --> This looks like Dutch - "
   1.701  	  "switching off dashes and warnings for 's Middags case.\n");
   1.702      }
   1.703 -    warnings.isFrench=0;
   1.704 +    warnings.isFrench=FALSE;
   1.705      if (results->Frenchcount>50)
   1.706      {
   1.707 -	warnings.isFrench=1;
   1.708 +	warnings.isFrench=TRUE;
   1.709  	printf("   --> This looks like French - "
   1.710  	  "switching off some doublepunct.\n");
   1.711      }
   1.712 @@ -919,12 +961,14 @@
   1.713   * count it, since empty lines with asterisks or dashes to
   1.714   * separate sections are common.
   1.715   *
   1.716 - * Returns: Non-zero if the line is empty.
   1.717 + * Returns: TRUE if the line is empty.
   1.718   */
   1.719 -int analyse_quotes(const char *s,struct counters *counters)
   1.720 +gboolean analyse_quotes(const char *aline,struct counters *counters)
   1.721  {
   1.722      int guessquote=0;
   1.723 -    int isemptyline=1;    /* assume the line is empty until proven otherwise */
   1.724 +    /* assume the line is empty until proven otherwise */
   1.725 +    gboolean isemptyline=TRUE;
   1.726 +    const char *s=aline;
   1.727      while (*s)
   1.728      {
   1.729  	if (*s==CHAR_DQUOTE)
   1.730 @@ -986,7 +1030,7 @@
   1.731  	}
   1.732  	if (*s!=CHAR_SPACE && *s!='-' && *s!='.' && *s!=CHAR_ASTERISK &&
   1.733  	  *s!=13 && *s!=10)
   1.734 -	    isemptyline=0;  /* ignore lines like  *  *  *  as spacers */
   1.735 +	    isemptyline=FALSE;  /* ignore lines like  *  *  *  as spacers */
   1.736  	if (*s==CHAR_UNDERSCORE)
   1.737  	    counters->c_unders++;
   1.738  	if (*s==CHAR_OPEN_CBRACK)
   1.739 @@ -1040,7 +1084,7 @@
   1.740   * Check for binary and other odd characters.
   1.741   */
   1.742  void check_for_odd_characters(const char *aline,const struct warnings *warnings,
   1.743 -  int isemptyline)
   1.744 +  gboolean isemptyline)
   1.745  {
   1.746      /* Don't repeat multiple warnings on one line. */
   1.747      int eNon_A=0,eTab=0,eTilde=0,eCarat=0,eFSlash=0,eAst=0;
   1.748 @@ -1461,16 +1505,15 @@
   1.749  void check_for_extra_period(const char *aline,const struct warnings *warnings)
   1.750  {
   1.751      const char *s,*t,*s1;
   1.752 -    int i,istypo,isdup;
   1.753 -    static char qperiod[MAX_QWORD][MAX_QWORD_LENGTH];
   1.754 -    static int qperiod_index=0;
   1.755 -    char testword[MAXWORDLEN]="";
   1.756 +    int i;
   1.757 +    gboolean istypo;
   1.758 +    gchar *testword;
   1.759      if (pswit[PARANOID_SWITCH])
   1.760      {
   1.761 -	for (t=s=aline;strstr(t,". ");)
   1.762 +	for (t=aline;strstr(t,". ");)
   1.763  	{
   1.764  	    t=strstr(t,". ");
   1.765 -	    if (t==s)
   1.766 +	    if (t==aline)
   1.767  	    {
   1.768  		t++;
   1.769  		/* start of line punctuation is handled elsewhere */
   1.770 @@ -1497,57 +1540,48 @@
   1.771  	    if (*s1>='a' && *s1<='z')
   1.772  	    {
   1.773  		/* we have something to investigate */
   1.774 -		istypo=1;
   1.775 +		istypo=TRUE;
   1.776  		/* so let's go back and find out */
   1.777 -		for (s1=t-1;s1>=s &&
   1.778 +		for (s1=t-1;s1>=aline &&
   1.779  		  (gcisalpha(*s1) || gcisdigit(*s1) || *s1==CHAR_SQUOTE &&
   1.780  		  gcisalpha(s1[1]) && gcisalpha(s1[-1]));s1--)
   1.781  		    ;
   1.782  		s1++;
   1.783 -		for (i=0;*s1 && *s1!='.';s1++,i++)
   1.784 -		    testword[i]=*s1;
   1.785 -		testword[i]=0;
   1.786 +		s=strchr(s1,'.');
   1.787 +		if (s)
   1.788 +		    testword=g_strndup(s1,s-s1);
   1.789 +		else
   1.790 +		    testword=g_strdup(s1);
   1.791  		for (i=0;*abbrev[i];i++)
   1.792  		    if (!strcmp(testword,abbrev[i]))
   1.793 -			istypo=0;
   1.794 +			istypo=FALSE;
   1.795  		if (gcisdigit(*testword))
   1.796 -		    istypo=0;
   1.797 +		    istypo=FALSE;
   1.798  		if (!testword[1])
   1.799 -		    istypo=0;
   1.800 +		    istypo=FALSE;
   1.801  		if (isroman(testword))
   1.802 -		    istypo=0;
   1.803 +		    istypo=FALSE;
   1.804  		if (istypo)
   1.805  		{
   1.806 -		    istypo=0;
   1.807 +		    istypo=FALSE;
   1.808  		    for (i=0;testword[i];i++)
   1.809  			if (strchr(vowels,testword[i]))
   1.810 -			    istypo=1;
   1.811 +			    istypo=TRUE;
   1.812  		}
   1.813 -		if (istypo)
   1.814 +		if (istypo &&
   1.815 +		  (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
   1.816  		{
   1.817 -		    isdup=0;
   1.818 -		    if (strlen(testword)<MAX_QWORD_LENGTH &&
   1.819 -		      !pswit[VERBOSE_SWITCH])
   1.820 -			for (i=0;i<qperiod_index;i++)
   1.821 -			    if (!strcmp(testword,qperiod[i]))
   1.822 -				isdup=1;
   1.823 -		    if (!isdup)
   1.824 -		    {
   1.825 -			if (qperiod_index<MAX_QWORD &&
   1.826 -			  strlen(testword)<MAX_QWORD_LENGTH)
   1.827 -			{
   1.828 -			    strcpy(qperiod[qperiod_index],testword);
   1.829 -			    qperiod_index++;
   1.830 -			}
   1.831 -			if (pswit[ECHO_SWITCH])
   1.832 -			    printf("\n%s\n",aline);
   1.833 -			if (!pswit[OVERVIEW_SWITCH])
   1.834 -			    printf("    Line %ld column %d - Extra period?\n",
   1.835 -			      linecnt,(int)(t-aline)+1);
   1.836 -			else
   1.837 -			    cnt_punct++;
   1.838 -		    }
   1.839 +		    g_tree_insert(qperiod,g_strdup(testword),
   1.840 +		      GINT_TO_POINTER(1));
   1.841 +		    if (pswit[ECHO_SWITCH])
   1.842 +			printf("\n%s\n",aline);
   1.843 +		    if (!pswit[OVERVIEW_SWITCH])
   1.844 +			printf("    Line %ld column %d - Extra period?\n",
   1.845 +			  linecnt,(int)(t-aline)+1);
   1.846 +		    else
   1.847 +			cnt_punct++;
   1.848  		}
   1.849 +		g_free(testword);
   1.850  	    }
   1.851  	    t++;
   1.852  	}
   1.853 @@ -1563,16 +1597,20 @@
   1.854  {
   1.855      int i;
   1.856      const char *s,*wordstart;
   1.857 -    char inword[MAXWORDLEN];
   1.858 +    gchar *inword,*t;
   1.859      if (pswit[TYPO_SWITCH])
   1.860      {
   1.861  	for (s=aline;*s;)
   1.862  	{
   1.863  	    wordstart=s;
   1.864 -	    s=getaword(s,inword);
   1.865 -	    if (!*inword)
   1.866 +	    t=getaword(&s);
   1.867 +	    if (!*t)
   1.868 +	    {
   1.869 +		g_free(t);
   1.870  		continue;
   1.871 -	    lowerit(inword);
   1.872 +	    }
   1.873 +	    inword=g_ascii_strdown(t,-1);
   1.874 +	    g_free(t);
   1.875  	    for (i=0;*nocomma[i];i++)
   1.876  		if (!strcmp(inword,nocomma[i]))
   1.877  		{
   1.878 @@ -1603,6 +1641,7 @@
   1.879  			    cnt_punct++;
   1.880  		    }
   1.881  		}
   1.882 +	    g_free(inword);
   1.883  	}
   1.884      }
   1.885  }
   1.886 @@ -1616,15 +1655,18 @@
   1.887  void check_for_typos(const char *aline,struct warnings *warnings)
   1.888  {
   1.889      const char *s,*wordstart;
   1.890 -    char inword[MAXWORDLEN],testword[MAXWORDLEN];
   1.891 -    int i,istypo,isdup,alower,vowel,consonant;
   1.892 -    static int qword_index=0;
   1.893 +    gchar *inword,*testword;
   1.894 +    int i,alower,vowel,consonant,*dupcnt;
   1.895 +    gboolean isdup,istypo;
   1.896      for (s=aline;*s;)
   1.897      {
   1.898  	wordstart=s;
   1.899 -	s=getaword(s,inword);
   1.900 +	inword=getaword(&s);
   1.901  	if (!*inword)
   1.902 +	{
   1.903 +	    g_free(inword);
   1.904  	    continue; /* don't bother with empty lines */
   1.905 +	}
   1.906  	if (mixdigit(inword))
   1.907  	{
   1.908  	    if (pswit[ECHO_SWITCH])
   1.909 @@ -1639,10 +1681,10 @@
   1.910  	 * Put the word through a series of tests for likely typos and OCR
   1.911  	 * errors.
   1.912  	 */
   1.913 -	if (pswit[TYPO_SWITCH])
   1.914 +	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
   1.915  	{
   1.916 -	    istypo=0;
   1.917 -	    strcpy(testword,inword);
   1.918 +	    istypo=FALSE;
   1.919 +	    testword=g_strdup(inword);
   1.920  	    alower=0;
   1.921  	    for (i=0;i<(int)strlen(testword);i++)
   1.922  	    {
   1.923 @@ -1662,10 +1704,13 @@
   1.924  		      testword[2]=='c' || i>0 && testword[i-1]==CHAR_SQUOTE)
   1.925  			; /* do nothing! */
   1.926  		    else
   1.927 -			istypo=1;
   1.928 +			istypo=TRUE;
   1.929  		}
   1.930  		testword[i]=(char)tolower(testword[i]);
   1.931  	    }
   1.932 +	}
   1.933 +	if (pswit[TYPO_SWITCH])
   1.934 +	{
   1.935  	    /*
   1.936  	     * Check for certain unlikely two-letter combinations at word
   1.937  	     * start and end.
   1.938 @@ -1674,26 +1719,26 @@
   1.939  	    {
   1.940  		for (i=0;*nostart[i];i++)
   1.941  		    if (!strncmp(testword,nostart[i],2))
   1.942 -			istypo=1;
   1.943 +			istypo=TRUE;
   1.944  		for (i=0;*noend[i];i++)
   1.945  		    if (!strncmp(testword+strlen(testword)-2,noend[i],2))
   1.946 -			istypo=1;
   1.947 +			istypo=TRUE;
   1.948  	    }
   1.949  	    /* ght is common, gbt never. Like that. */
   1.950  	    if (strstr(testword,"cb"))
   1.951 -		istypo=1;
   1.952 +		istypo=TRUE;
   1.953  	    if (strstr(testword,"gbt"))
   1.954 -		istypo=1;
   1.955 +		istypo=TRUE;
   1.956  	    if (strstr(testword,"pbt"))
   1.957 -		istypo=1;
   1.958 +		istypo=TRUE;
   1.959  	    if (strstr(testword,"tbs"))
   1.960 -		istypo=1;
   1.961 +		istypo=TRUE;
   1.962  	    if (strstr(testword,"mrn"))
   1.963 -		istypo=1;
   1.964 +		istypo=TRUE;
   1.965  	    if (strstr(testword,"ahle"))
   1.966 -		istypo=1;
   1.967 +		istypo=TRUE;
   1.968  	    if (strstr(testword,"ihle"))
   1.969 -		istypo=1;
   1.970 +		istypo=TRUE;
   1.971  	    /*
   1.972  	     * "TBE" does happen - like HEARTBEAT - but uncommon.
   1.973  	     * Also "TBI" - frostbite, outbid - but uncommon.
   1.974 @@ -1701,11 +1746,11 @@
   1.975  	     * numerals, but "ii" is a common scanno.
   1.976  	     */
   1.977  	    if (strstr(testword,"tbi"))
   1.978 -		istypo=1;
   1.979 +		istypo=TRUE;
   1.980  	    if (strstr(testword,"tbe"))
   1.981 -		istypo=1;
   1.982 +		istypo=TRUE;
   1.983  	    if (strstr(testword,"ii"))
   1.984 -		istypo=1;
   1.985 +		istypo=TRUE;
   1.986  	    /*
   1.987  	     * Check for no vowels or no consonants.
   1.988  	     * If none, flag a typo.
   1.989 @@ -1727,7 +1772,7 @@
   1.990  			consonant++;
   1.991  		}
   1.992  		if (!vowel || !consonant)
   1.993 -		    istypo=1;
   1.994 +		    istypo=TRUE;
   1.995  	    }
   1.996  	    /*
   1.997  	     * Now exclude the word from being reported if it's in
   1.998 @@ -1735,18 +1780,18 @@
   1.999  	     */
  1.1000  	    for (i=0;*okword[i];i++)
  1.1001  		if (!strcmp(testword,okword[i]))
  1.1002 -		    istypo=0;
  1.1003 +		    istypo=FALSE;
  1.1004  	    /*
  1.1005  	     * What looks like a typo may be a Roman numeral.
  1.1006  	     * Exclude these.
  1.1007  	     */
  1.1008  	    if (istypo && isroman(testword))
  1.1009 -		istypo=0;
  1.1010 +		istypo=FALSE;
  1.1011  	    /* Check the manual list of typos. */
  1.1012  	    if (!istypo)
  1.1013  		for (i=0;*typo[i];i++)
  1.1014  		    if (!strcmp(testword,typo[i]))
  1.1015 -			istypo=1;
  1.1016 +			istypo=TRUE;
  1.1017  	    /*
  1.1018  	     * Check lowercase s, l, i and m - special cases.
  1.1019  	     *   "j" - often a semi-colon gone wrong.
  1.1020 @@ -1754,34 +1799,30 @@
  1.1021  	     *   "n" for "in"
  1.1022  	     */
  1.1023  	    if (!istypo && strlen(testword)==1 && strchr("slmijdn",*inword))
  1.1024 -		istypo=1;
  1.1025 +		istypo=TRUE;
  1.1026  	    if (istypo)
  1.1027  	    {
  1.1028 -		isdup=0;
  1.1029 -		if (strlen(testword)<MAX_QWORD_LENGTH &&
  1.1030 -		  !pswit[VERBOSE_SWITCH])
  1.1031 -		    for (i=0;i<qword_index;i++)
  1.1032 -			if (!strcmp(testword,qword[i]))
  1.1033 -			{
  1.1034 -			    isdup=1;
  1.1035 -			    ++dupcnt[i];
  1.1036 -			}
  1.1037 +		dupcnt=g_tree_lookup(qword,testword);
  1.1038 +		if (dupcnt)
  1.1039 +		{
  1.1040 +		    (*dupcnt)++;
  1.1041 +		    isdup=!pswit[VERBOSE_SWITCH];
  1.1042 +		}
  1.1043 +		else
  1.1044 +		{
  1.1045 +		    dupcnt=g_new0(int,1);
  1.1046 +		    g_tree_insert(qword,g_strdup(testword),dupcnt);
  1.1047 +		    isdup=FALSE;
  1.1048 +		}
  1.1049  		if (!isdup)
  1.1050  		{
  1.1051 -		    if (qword_index<MAX_QWORD &&
  1.1052 -		      strlen(testword)<MAX_QWORD_LENGTH)
  1.1053 -		    {
  1.1054 -			strcpy(qword[qword_index],testword);
  1.1055 -			qword_index++;
  1.1056 -		    }
  1.1057  		    if (pswit[ECHO_SWITCH])
  1.1058  			printf("\n%s\n",aline);
  1.1059  		    if (!pswit[OVERVIEW_SWITCH])
  1.1060  		    {
  1.1061  			printf("    Line %ld column %d - Query word %s",
  1.1062  			  linecnt,(int)(wordstart-aline)+1,inword);
  1.1063 -			if (strlen(testword)<MAX_QWORD_LENGTH &&
  1.1064 -			  !pswit[VERBOSE_SWITCH])
  1.1065 +			if (!pswit[VERBOSE_SWITCH])
  1.1066  			    printf(" - not reporting duplicates");
  1.1067  			printf("\n");
  1.1068  		    }
  1.1069 @@ -1791,17 +1832,16 @@
  1.1070  	    }
  1.1071  	}
  1.1072  	/* check the user's list of typos */
  1.1073 -	if (!istypo && usertypo_count)
  1.1074 -	    for (i=0;i<usertypo_count;i++)
  1.1075 -		if (!strcmp(testword,usertypo[i]))
  1.1076 -		{
  1.1077 -		    if (pswit[ECHO_SWITCH])
  1.1078 -			printf("\n%s\n",aline);
  1.1079 -		    if (!pswit[OVERVIEW_SWITCH])  
  1.1080 -			printf("    Line %ld column %d - "
  1.1081 -			  "Query possible scanno %s\n",
  1.1082 -			  linecnt,(int)(wordstart-aline)+2,inword);
  1.1083 -		}
  1.1084 +	if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
  1.1085 +	{
  1.1086 +	    if (pswit[ECHO_SWITCH])
  1.1087 +		printf("\n%s\n",aline);
  1.1088 +	    if (!pswit[OVERVIEW_SWITCH])  
  1.1089 +		printf("    Line %ld column %d - Query possible scanno %s\n",
  1.1090 +		  linecnt,(int)(wordstart-aline)+2,inword);
  1.1091 +	}
  1.1092 +	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
  1.1093 +	    g_free(testword);
  1.1094  	if (pswit[PARANOID_SWITCH] && warnings->digit)
  1.1095  	{
  1.1096  	    /* In paranoid mode, query all 0 and 1 standing alone. */
  1.1097 @@ -1816,6 +1856,7 @@
  1.1098  		    cnt_word++;
  1.1099  	    }
  1.1100  	}
  1.1101 +	g_free(inword);
  1.1102      }
  1.1103  }
  1.1104  
  1.1105 @@ -1830,9 +1871,10 @@
  1.1106   * quotes "like"this.
  1.1107   */
  1.1108  void check_for_misspaced_punctuation(const char *aline,
  1.1109 -  struct parities *parities,int isemptyline)
  1.1110 +  struct parities *parities,gboolean isemptyline)
  1.1111  {
  1.1112 -    int i,llen,isacro,isellipsis;
  1.1113 +    int i,llen;
  1.1114 +    gboolean isacro,isellipsis;
  1.1115      const char *s;
  1.1116      llen=strlen(aline);
  1.1117      for (i=1;i<llen;i++)
  1.1118 @@ -1841,9 +1883,9 @@
  1.1119  	if (strchr(".?!,;:_",aline[i]))  /* if it's punctuation */
  1.1120  	{
  1.1121  	    /* we need to suppress warnings for acronyms like M.D. */
  1.1122 -	    isacro=0;
  1.1123 +	    isacro=FALSE;
  1.1124  	    /* we need to suppress warnings for ellipsis . . . */
  1.1125 -	    isellipsis=0;
  1.1126 +	    isellipsis=FALSE;
  1.1127  	    /* if there are letters on both sides of it or ... */
  1.1128  	    if (gcisalpha(aline[i-1]) && gcisalpha(aline[i+1]) ||
  1.1129  	       gcisalpha(aline[i+1]) && strchr("?!,;:",aline[i]))
  1.1130 @@ -1852,9 +1894,9 @@
  1.1131  		if (aline[i]=='.')
  1.1132  		{
  1.1133  		    if (i>2 && aline[i-2]=='.')
  1.1134 -			isacro=1;
  1.1135 +			isacro=TRUE;
  1.1136  		    if (i+2<llen && aline[i+2]=='.')
  1.1137 -			isacro=1;
  1.1138 +			isacro=TRUE;
  1.1139  		}
  1.1140  		if (!isacro)
  1.1141  		{
  1.1142 @@ -1877,9 +1919,9 @@
  1.1143  		if (aline[i]=='.')
  1.1144  		{
  1.1145  		    if (i>2 && aline[i-2]=='.')
  1.1146 -			isellipsis=1;
  1.1147 +			isellipsis=TRUE;
  1.1148  		    if (i+2<llen && aline[i+2]=='.')
  1.1149 -			isellipsis=1;
  1.1150 +			isellipsis=TRUE;
  1.1151  		}
  1.1152  		if (!isemptyline && !isellipsis)
  1.1153  		{
  1.1154 @@ -2177,6 +2219,8 @@
  1.1155  void check_for_miscased_genative(const char *aline)
  1.1156  {
  1.1157      const char *s;
  1.1158 +    if (!*aline)
  1.1159 +	return;
  1.1160      s=aline+1;
  1.1161      while (*s)
  1.1162      {
  1.1163 @@ -2321,13 +2365,11 @@
  1.1164  	    i=(int)(close-open+1);
  1.1165  	    if (i>0)
  1.1166  	    {
  1.1167 -		strncpy(wrk,open,i);
  1.1168 -		wrk[i]=0;
  1.1169  		if (pswit[ECHO_SWITCH])
  1.1170  		    printf("\n%s\n",aline);
  1.1171  		if (!pswit[OVERVIEW_SWITCH])
  1.1172 -		    printf("    Line %ld column %d - HTML Tag? %s \n",
  1.1173 -		      linecnt,(int)(open-aline)+1,wrk);
  1.1174 +		    printf("    Line %ld column %d - HTML Tag? %*.*s \n",
  1.1175 +		      linecnt,(int)(open-aline)+1,i,i,open);
  1.1176  		else
  1.1177  		    cnt_html++;
  1.1178  	    }
  1.1179 @@ -2359,13 +2401,11 @@
  1.1180  		    i=0;		/* Don't report "Jones & Son;" */
  1.1181  	    if (i>0)
  1.1182  	    {
  1.1183 -		strncpy(wrk,amp,i);
  1.1184 -		wrk[i]=0;
  1.1185  		if (pswit[ECHO_SWITCH])
  1.1186  		    printf("\n%s\n",aline);
  1.1187  		if (!pswit[OVERVIEW_SWITCH])
  1.1188 -		    printf("    Line %ld column %d - HTML symbol? %s \n",
  1.1189 -		      linecnt,(int)(amp-aline)+1,wrk);
  1.1190 +		    printf("    Line %ld column %d - HTML symbol? %*.*s \n",
  1.1191 +		      linecnt,(int)(amp-aline)+1,i,i,amp);
  1.1192  		else
  1.1193  		    cnt_html++;
  1.1194  	    }
  1.1195 @@ -2388,7 +2428,8 @@
  1.1196      s=aline;
  1.1197      while (*s==' ')
  1.1198  	s++;
  1.1199 -    if (*pending->dquote)
  1.1200 +    if (pending->dquote)
  1.1201 +    {
  1.1202  	if (*s!=CHAR_DQUOTE || pswit[QPARA_SWITCH])
  1.1203  	{
  1.1204  	    if (!pswit[OVERVIEW_SWITCH])
  1.1205 @@ -2400,7 +2441,10 @@
  1.1206  	    else
  1.1207  		cnt_dquot++;
  1.1208  	}
  1.1209 -    if (*pending->squote)
  1.1210 +	g_free(pending->dquote);
  1.1211 +	pending->dquote=NULL;
  1.1212 +    }
  1.1213 +    if (pending->squote)
  1.1214      {
  1.1215  	if (*s!=CHAR_SQUOTE && *s!=CHAR_OPEN_SQUOTE || pswit[QPARA_SWITCH] ||
  1.1216  	  pending->squot)
  1.1217 @@ -2414,8 +2458,10 @@
  1.1218  	    else
  1.1219  		cnt_squot++;
  1.1220  	}
  1.1221 +	g_free(pending->squote);
  1.1222 +	pending->squote=NULL;
  1.1223      }
  1.1224 -    if (*pending->rbrack)
  1.1225 +    if (pending->rbrack)
  1.1226      {
  1.1227  	if (!pswit[OVERVIEW_SWITCH])
  1.1228  	{
  1.1229 @@ -2425,8 +2471,10 @@
  1.1230  	}
  1.1231  	else
  1.1232  	    cnt_brack++;
  1.1233 +	g_free(pending->rbrack);
  1.1234 +	pending->rbrack=NULL;
  1.1235      }
  1.1236 -    if (*pending->sbrack)
  1.1237 +    if (pending->sbrack)
  1.1238      {
  1.1239  	if (!pswit[OVERVIEW_SWITCH])
  1.1240  	{
  1.1241 @@ -2436,8 +2484,10 @@
  1.1242  	}
  1.1243  	else
  1.1244  	    cnt_brack++;
  1.1245 +	g_free(pending->sbrack);
  1.1246 +	pending->sbrack=NULL;
  1.1247      }
  1.1248 -    if (*pending->cbrack)
  1.1249 +    if (pending->cbrack)
  1.1250      {
  1.1251  	if (!pswit[OVERVIEW_SWITCH])
  1.1252  	{
  1.1253 @@ -2447,8 +2497,10 @@
  1.1254  	}
  1.1255  	else
  1.1256  	    cnt_brack++;
  1.1257 +	g_free(pending->cbrack);
  1.1258 +	pending->cbrack=NULL;
  1.1259      }
  1.1260 -    if (*pending->unders)
  1.1261 +    if (pending->unders)
  1.1262      {
  1.1263  	if (!pswit[OVERVIEW_SWITCH])
  1.1264  	{
  1.1265 @@ -2458,6 +2510,8 @@
  1.1266  	}
  1.1267  	else
  1.1268  	    cnt_brack++;
  1.1269 +	g_free(pending->unders);
  1.1270 +	pending->unders=NULL;
  1.1271      }
  1.1272  }
  1.1273  
  1.1274 @@ -2481,12 +2535,12 @@
  1.1275    struct pending *pending)
  1.1276  {
  1.1277      if (counters->quot%2)
  1.1278 -	sprintf(pending->dquote,"    Line %ld - Mismatched quotes",
  1.1279 -	  linecnt);
  1.1280 +	pending->dquote=
  1.1281 +	  g_strdup_printf("    Line %ld - Mismatched quotes",linecnt);
  1.1282      if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&
  1.1283        counters->open_single_quote!=counters->close_single_quote)
  1.1284 -	sprintf(pending->squote,"    Line %ld - Mismatched singlequotes?",
  1.1285 -	  linecnt);
  1.1286 +	pending->squote=
  1.1287 +	  g_strdup_printf("    Line %ld - Mismatched singlequotes?",linecnt);
  1.1288      if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&
  1.1289        counters->open_single_quote!=counters->close_single_quote &&
  1.1290        counters->open_single_quote!=counters->close_single_quote+1)
  1.1291 @@ -2496,17 +2550,17 @@
  1.1292  	 */
  1.1293  	pending->squot=1;
  1.1294      if (counters->r_brack)
  1.1295 -	sprintf(pending->rbrack,"    Line %ld - Mismatched round brackets?",
  1.1296 -	  linecnt);
  1.1297 +	pending->rbrack=
  1.1298 +	  g_strdup_printf("    Line %ld - Mismatched round brackets?",linecnt);
  1.1299      if (counters->s_brack)
  1.1300 -	sprintf(pending->sbrack,"    Line %ld - Mismatched square brackets?",
  1.1301 -	  linecnt);
  1.1302 +	pending->sbrack=
  1.1303 +	  g_strdup_printf("    Line %ld - Mismatched square brackets?",linecnt);
  1.1304      if (counters->c_brack)
  1.1305 -	sprintf(pending->cbrack,"    Line %ld - Mismatched curly brackets?",
  1.1306 -	  linecnt);
  1.1307 +	pending->cbrack=
  1.1308 +	  g_strdup_printf("    Line %ld - Mismatched curly brackets?",linecnt);
  1.1309      if (counters->c_unders%2)
  1.1310 -	sprintf(pending->unders,"    Line %ld - Mismatched underscores?",
  1.1311 -	  linecnt);
  1.1312 +	pending->unders=
  1.1313 +	  g_strdup_printf("    Line %ld - Mismatched underscores?",linecnt);
  1.1314  }
  1.1315  
  1.1316  /*
  1.1317 @@ -2563,50 +2617,63 @@
  1.1318      }
  1.1319  }
  1.1320  
  1.1321 +gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
  1.1322 +{
  1.1323 +    const char *word=key;
  1.1324 +    int *dupcnt=value;
  1.1325 +    if (*dupcnt)
  1.1326 +	printf("\nNote: Queried word %s was duplicated %d times\n",
  1.1327 +	  word,*dupcnt);
  1.1328 +    return FALSE;
  1.1329 +}
  1.1330 +
  1.1331  /*
  1.1332   * procfile:
  1.1333   *
  1.1334   * Process one file.
  1.1335   */
  1.1336 -void procfile(char *filename)
  1.1337 +void procfile(const char *filename)
  1.1338  {
  1.1339      const char *s;
  1.1340 -    char parastart[81];     /* first line of current para */
  1.1341 -    FILE *infile;
  1.1342 +    gchar *parastart=NULL;	/* first line of current para */
  1.1343 +    gchar *etext,*aline;
  1.1344 +    gchar *etext_ptr;
  1.1345 +    GError *err=NULL;
  1.1346      struct first_pass_results *first_pass_results;
  1.1347      struct warnings *warnings;
  1.1348      struct counters counters={0};
  1.1349      struct line_properties last={0};
  1.1350      struct parities parities={0};
  1.1351 -    struct pending pending={{0},};
  1.1352 -    int isemptyline;
  1.1353 +    struct pending pending={0};
  1.1354 +    gboolean isemptyline;
  1.1355      long start_para_line=0;
  1.1356 -    int i,isnewpara=0,enddash=0;
  1.1357 +    gboolean isnewpara=FALSE,enddash=FALSE;
  1.1358      last.start=CHAR_SPACE;
  1.1359 -    *prevline=0;
  1.1360      linecnt=checked_linecnt=0;
  1.1361 -    infile=fopen(filename,"rb");
  1.1362 -    if (!infile)
  1.1363 +    etext=read_etext(filename,&err);
  1.1364 +    if (!etext)
  1.1365      {
  1.1366  	if (pswit[STDOUT_SWITCH])
  1.1367 -	    fprintf(stdout,"bookloupe: cannot open %s\n",filename);
  1.1368 +	    fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
  1.1369  	else
  1.1370 -	    fprintf(stderr,"bookloupe: cannot open %s\n",filename);
  1.1371 +	    fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
  1.1372  	exit(1);
  1.1373      }
  1.1374      fprintf(stdout,"\n\nFile: %s\n\n",filename);
  1.1375 -    first_pass_results=first_pass(infile);
  1.1376 +    first_pass_results=first_pass(etext);
  1.1377      warnings=report_first_pass(first_pass_results);
  1.1378 +    qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
  1.1379 +    qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
  1.1380      /*
  1.1381       * Here we go with the main pass. Hold onto yer hat!
  1.1382       */
  1.1383 -    rewind(infile);
  1.1384      linecnt=0;
  1.1385 -    while (flgets(aline,LINEBUFSIZE-1,infile,linecnt+1))
  1.1386 +    etext_ptr=etext;
  1.1387 +    while ((aline=flgets(&etext_ptr,linecnt+1)))
  1.1388      {
  1.1389  	linecnt++;
  1.1390  	if (linecnt==1)
  1.1391 -	    isnewpara=1;
  1.1392 +	    isnewpara=TRUE;
  1.1393  	if (pswit[DP_SWITCH] && !strncmp(aline,"-----File: ",11))
  1.1394  	    continue;    // skip DP page separators completely
  1.1395  	if (linecnt<first_pass_results->firstline ||
  1.1396 @@ -2635,8 +2702,8 @@
  1.1397  	    /* This line is the start of a new paragraph. */
  1.1398  	    start_para_line=linecnt;
  1.1399  	    /* Capture its first line in case we want to report it later. */
  1.1400 -	    strncpy(parastart,aline,80);
  1.1401 -	    parastart[79]=0;
  1.1402 +	    g_free(parastart);
  1.1403 +	    parastart=g_strdup(aline);
  1.1404  	    memset(&parities,0,sizeof(parities));  /* restart the quote count */
  1.1405  	    s=aline;
  1.1406  	    while (!gcisalpha(*s) && !gcisdigit(*s) && *s)
  1.1407 @@ -2653,7 +2720,7 @@
  1.1408  		else
  1.1409  		    cnt_punct++;
  1.1410  	    }
  1.1411 -	    isnewpara=0; /* Signal the end of new para processing. */
  1.1412 +	    isnewpara=FALSE; /* Signal the end of new para processing. */
  1.1413  	}
  1.1414  	/* Check for an em-dash broken at line end. */
  1.1415  	if (enddash && *aline=='-')
  1.1416 @@ -2665,11 +2732,11 @@
  1.1417  	    else
  1.1418  		cnt_punct++;
  1.1419  	}
  1.1420 -	enddash=0;
  1.1421 +	enddash=FALSE;
  1.1422  	for (s=aline+strlen(aline)-1;*s==' ' && s>aline;s--)
  1.1423  	    ;
  1.1424  	if (s>=aline && *s=='-')
  1.1425 -	    enddash=1;
  1.1426 +	    enddash=TRUE;
  1.1427  	check_for_control_characters(aline);
  1.1428  	if (warnings->bin)
  1.1429  	    check_for_odd_characters(aline,warnings,isemptyline);
  1.1430 @@ -2709,40 +2776,49 @@
  1.1431  	    check_for_mismatched_quotes(&counters,&pending);
  1.1432  	    memset(&counters,0,sizeof(counters));
  1.1433  	    /* let the next iteration know that it's starting a new para */
  1.1434 -	    isnewpara=1;
  1.1435 -	    check_for_omitted_punctuation(prevline,&last,start_para_line);
  1.1436 +	    isnewpara=TRUE;
  1.1437 +	    if (prevline)
  1.1438 +		check_for_omitted_punctuation(prevline,&last,start_para_line);
  1.1439  	}
  1.1440 -	strcpy(prevline,aline);
  1.1441 +	g_free(prevline);
  1.1442 +	prevline=g_strdup(aline);
  1.1443      }
  1.1444 -    fclose(infile);
  1.1445 +    if (prevline)
  1.1446 +    {
  1.1447 +	g_free(prevline);
  1.1448 +	prevline=NULL;
  1.1449 +    }
  1.1450 +    g_free(parastart);
  1.1451 +    g_free(prevline);
  1.1452 +    g_free(etext);
  1.1453      if (!pswit[OVERVIEW_SWITCH])
  1.1454 -	for (i=0;i<MAX_QWORD;i++)
  1.1455 -	    if (dupcnt[i])
  1.1456 -		printf("\nNote: Queried word %s was duplicated %d time%s\n",
  1.1457 -		  qword[i],dupcnt[i],"s");
  1.1458 +	g_tree_foreach(qword,report_duplicate_queries,NULL);
  1.1459 +    g_tree_unref(qword);
  1.1460 +    g_tree_unref(qperiod);
  1.1461  }
  1.1462  
  1.1463  /*
  1.1464   * flgets:
  1.1465   *
  1.1466 - * Get one line from the input stream, checking for
  1.1467 + * Get one line from the input text, checking for
  1.1468   * the existence of exactly one CR/LF line-end per line.
  1.1469   *
  1.1470   * Returns: a pointer to the line.
  1.1471   */
  1.1472 -char *flgets(char *theline,int maxlen,FILE *thefile,long lcnt)
  1.1473 +char *flgets(char **etext,long lcnt)
  1.1474  {
  1.1475      char c;
  1.1476 -    int len,isCR,cint;
  1.1477 -    *theline=0;
  1.1478 -    len=isCR=0;
  1.1479 -    c=cint=fgetc(thefile);
  1.1480 -    do
  1.1481 +    int len;
  1.1482 +    gboolean isCR=FALSE;
  1.1483 +    char *theline=*etext;
  1.1484 +    len=0;
  1.1485 +    for(;;)
  1.1486      {
  1.1487 -	if (cint==EOF)
  1.1488 +	c=*(*etext)++;
  1.1489 +	if (!c)
  1.1490  	    return NULL;
  1.1491  	/* either way, it's end of line */
  1.1492 -	if (c==10)
  1.1493 +	if (c=='\n')
  1.1494  	{
  1.1495  	    if (isCR)
  1.1496  		break;
  1.1497 @@ -2752,7 +2828,7 @@
  1.1498  		if (pswit[LINE_END_SWITCH])
  1.1499  		{
  1.1500  		    if (pswit[ECHO_SWITCH])
  1.1501 -			printf("\n%s\n",theline);
  1.1502 +			printf("\n%*.*s\n",len,len,theline);
  1.1503  		    if (!pswit[OVERVIEW_SWITCH])
  1.1504  			printf("    Line %ld - No CR?\n",lcnt);
  1.1505  		    else
  1.1506 @@ -2761,7 +2837,7 @@
  1.1507  		break;
  1.1508  	    }
  1.1509  	}
  1.1510 -	if (c==13)
  1.1511 +	if (c=='\r')
  1.1512  	{
  1.1513  	    if (isCR)
  1.1514  	    {
  1.1515 @@ -2769,34 +2845,33 @@
  1.1516  		if (pswit[LINE_END_SWITCH])
  1.1517  		{
  1.1518  		    if (pswit[ECHO_SWITCH])
  1.1519 -			printf("\n%s\n",theline);
  1.1520 +			printf("\n%*.*s\n",len,len,theline);
  1.1521  		    if (!pswit[OVERVIEW_SWITCH])
  1.1522  			printf("    Line %ld - Two successive CRs?\n",lcnt);
  1.1523  		    else
  1.1524  			cnt_lineend++;
  1.1525  		}
  1.1526  	    }
  1.1527 -	    isCR=1;
  1.1528 +	    isCR=TRUE;
  1.1529  	}
  1.1530  	else
  1.1531  	{
  1.1532  	    if (pswit[LINE_END_SWITCH] && isCR)
  1.1533  	    {
  1.1534  		if (pswit[ECHO_SWITCH])
  1.1535 -		    printf("\n%s\n",theline);
  1.1536 +		    printf("\n%*.*s\n",len,len,theline);
  1.1537  		if (!pswit[OVERVIEW_SWITCH])
  1.1538  		    printf("    Line %ld column %d - CR without LF?\n",
  1.1539  		      lcnt,len+1);
  1.1540  		else
  1.1541  		    cnt_lineend++;
  1.1542 +		theline[len]=' ';
  1.1543  	    }
  1.1544 -	    theline[len]=c;
  1.1545 +	    isCR=FALSE; len=(int)(*etext-theline)-1; /* resync past stray CR(s) */
  1.1546  	    len++;
  1.1547 -	    theline[len]=0;
  1.1548 -	    isCR=0;
  1.1549  	}
  1.1550 -	c=cint=fgetc(thefile);
  1.1551 -    } while(len<maxlen);
  1.1552 +    }
  1.1553 +    theline[len]='\0';
  1.1554      if (pswit[MARKUP_SWITCH])  
  1.1555  	postprocess_for_HTML(theline);
  1.1556      if (pswit[DP_SWITCH])  
  1.1557 @@ -2813,10 +2888,10 @@
  1.1558   *
  1.1559   * Returns: 0 if no error found, 1 if error.
  1.1560   */
  1.1561 -int mixdigit(char *checkword)
  1.1562 +int mixdigit(const char *checkword)
  1.1563  {
  1.1564      int wehaveadigit,wehavealetter,firstdigits,query,wl;
  1.1565 -    char *s;
  1.1566 +    const char *s;
  1.1567      wehaveadigit=wehavealetter=query=0;
  1.1568      for (s=checkword;*s;s++)
  1.1569  	if (gcisalpha(*s))
  1.1570 @@ -2832,17 +2907,20 @@
  1.1571  	for (firstdigits=0;gcisdigit(checkword[firstdigits]);firstdigits++)
  1.1572  	    ;
  1.1573  	/* digits, ending in st, rd, nd, th of either case */
  1.1574 -	if (firstdigits+2==wl && (matchword(checkword+wl-2,"st") ||
  1.1575 -	  matchword(checkword+wl-2,"rd") || matchword(checkword+wl-2,"nd") ||
  1.1576 -	  matchword(checkword+wl-2,"th")))
  1.1577 +	if (firstdigits+2==wl && (!g_ascii_strcasecmp(checkword+wl-2,"st") ||
  1.1578 +	  !g_ascii_strcasecmp(checkword+wl-2,"rd") ||
  1.1579 +	  !g_ascii_strcasecmp(checkword+wl-2,"nd") ||
  1.1580 +	  !g_ascii_strcasecmp(checkword+wl-2,"th")))
  1.1581  	    query=0;
  1.1582 -	if (firstdigits+3==wl && (matchword(checkword+wl-3,"sts") ||
  1.1583 -	  matchword(checkword+wl-3,"rds") || matchword(checkword+wl-3,"nds") ||
  1.1584 -	  matchword(checkword+wl-3,"ths")))
  1.1585 +	if (firstdigits+3==wl && (!g_ascii_strcasecmp(checkword+wl-3,"sts") ||
  1.1586 +	  !g_ascii_strcasecmp(checkword+wl-3,"rds") ||
  1.1587 +	  !g_ascii_strcasecmp(checkword+wl-3,"nds") ||
  1.1588 +	  !g_ascii_strcasecmp(checkword+wl-3,"ths")))
  1.1589  	    query=0;
  1.1590 -	if (firstdigits+3==wl && (matchword(checkword+wl-4,"stly") ||
  1.1591 -	  matchword(checkword+wl-4,"rdly") ||
  1.1592 -	  matchword(checkword+wl-4,"ndly") || matchword(checkword+wl-4,"thly")))
  1.1593 +	if (firstdigits+4==wl && (!g_ascii_strcasecmp(checkword+wl-4,"stly") ||
  1.1594 +	  !g_ascii_strcasecmp(checkword+wl-4,"rdly") ||
  1.1595 +	  !g_ascii_strcasecmp(checkword+wl-4,"ndly") ||
  1.1596 +	  !g_ascii_strcasecmp(checkword+wl-4,"thly")))
  1.1597  	    query=0;
  1.1598  	/* digits, ending in l, L, s or d */
  1.1599  	if (firstdigits+1==wl && (checkword[wl-1]=='l' ||
  1.1600 @@ -2864,20 +2942,20 @@
  1.1601  /*
  1.1602   * getaword:
  1.1603   *
  1.1604 - * Extracts the first/next "word" from the line, and puts
  1.1605 - * it into "thisword". A word is defined as one English word unit--or
  1.1606 - * at least that's the aim.
  1.1607 + * Extracts the first/next "word" from the line, and returns it.
  1.1608 + * A word is defined as one English word unit--or at least that's the aim.
  1.1609 + * "ptr" is advanced to the position in the line where we will start
  1.1610 + * looking for the next word.
  1.1611   *
  1.1612 - * Returns: a pointer to the position in the line where we will start
  1.1613 - *	  looking for the next word.
  1.1614 + * Returns: A newly-allocated string.
  1.1615   */
  1.1616 -const char *getaword(const char *fromline,char *thisword)
  1.1617 +gchar *getaword(const char **ptr)
  1.1618  {
  1.1619 -    int i,wordlen;
  1.1620 +    int i;
  1.1621      const char *s;
  1.1622 -    wordlen=0;
  1.1623 -    for (;!gcisdigit(*fromline) && !gcisalpha(*fromline) && *fromline;
  1.1624 -      fromline++)
  1.1625 +    GString *word;
  1.1626 +    word=g_string_new(NULL);
  1.1627 +    for (;!gcisdigit(**ptr) && !gcisalpha(**ptr) && **ptr;(*ptr)++)
  1.1628  	;
  1.1629      /*
  1.1630       * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
  1.1631 @@ -2887,64 +2965,25 @@
  1.1632       * If found, it returns this whole pattern as a word; otherwise we discard
  1.1633       * the results and resume our normal programming.
  1.1634       */
  1.1635 -    s=fromline;
  1.1636 -    for (;(gcisdigit(*s) || gcisalpha(*s) || *s==',' || *s=='.') &&
  1.1637 -      wordlen<MAXWORDLEN;s++)
  1.1638 +    s=*ptr;
  1.1639 +    for (;gcisdigit(*s) || gcisalpha(*s) || *s==',' || *s=='.';s++)
  1.1640 +	g_string_append_c(word,*s);
  1.1641 +    for (i=1;i+1<word->len;i++)
  1.1642      {
  1.1643 -	thisword[wordlen]=*s;
  1.1644 -	wordlen++;
  1.1645 -    }
  1.1646 -    thisword[wordlen]=0;
  1.1647 -    for (i=1;i<wordlen-1;i++)
  1.1648 -    {
  1.1649 -	if (thisword[i]=='.' || thisword[i]==',')
  1.1650 +	if (word->str[i]=='.' || word->str[i]==',')
  1.1651  	{
  1.1652 -	    if (gcisdigit(thisword[i-1]) && gcisdigit(thisword[i-1]))
  1.1653 +	    if (gcisdigit(word->str[i-1]) && gcisdigit(word->str[i+1]))
  1.1654  	    {
  1.1655 -		fromline=s;
  1.1656 -		return fromline;
  1.1657 +		*ptr=s;
  1.1658 +		return g_string_free(word,FALSE);
  1.1659  	    }
  1.1660  	}
  1.1661      }
  1.1662      /* we didn't find a punctuated number - do the regular getword thing */
  1.1663 -    wordlen=0;
  1.1664 -    for (;(gcisdigit(*fromline) || gcisalpha(*fromline) || *fromline=='\'') &&
  1.1665 -      wordlen<MAXWORDLEN;fromline++)
  1.1666 -    {
  1.1667 -	thisword[wordlen]=*fromline;
  1.1668 -	wordlen++;
  1.1669 -    }
  1.1670 -    thisword[wordlen]=0;
  1.1671 -    return fromline;
  1.1672 -}
  1.1673 -
  1.1674 -/*
  1.1675 - * matchword:
  1.1676 - *
  1.1677 - * A case-insensitive string matcher.
  1.1678 - */
  1.1679 -int matchword(char *checkfor,char *thisword)
  1.1680 -{
  1.1681 -    unsigned int ismatch,i;
  1.1682 -    if (strlen(checkfor)!=strlen(thisword))
  1.1683 -	return 0;
  1.1684 -    ismatch=1;     /* assume a match until we find a difference */
  1.1685 -    for (i=0;i<strlen(checkfor);i++)
  1.1686 -	if (toupper(checkfor[i])!=toupper(thisword[i]))
  1.1687 -	    ismatch=0;
  1.1688 -    return ismatch;
  1.1689 -}
  1.1690 -
  1.1691 -/*
  1.1692 - * lowerit:
  1.1693 - *
  1.1694 - * Lowercase the line.
  1.1695 - */
  1.1696 -void lowerit(char *theline)
  1.1697 -{
  1.1698 -    for (;*theline;theline++)
  1.1699 -	if (*theline>='A' && *theline<='Z')
  1.1700 -	    *theline+=32;
  1.1701 +    g_string_truncate(word,0);
  1.1702 +    for (;gcisdigit(**ptr) || gcisalpha(**ptr) || **ptr=='\'';(*ptr)++)
  1.1703 +	g_string_append_c(word,**ptr);
  1.1704 +    return g_string_free(word,FALSE);
  1.1705  }
  1.1706  
  1.1707  /*
  1.1708 @@ -2961,11 +3000,11 @@
  1.1709   * XL or an optional XC, an optional IX or IV, an optional V and any number
  1.1710   * of optional Is.
  1.1711   */
  1.1712 -int isroman(char *t)
  1.1713 +gboolean isroman(const char *t)
  1.1714  {
  1.1715 -    char *s;
  1.1716 +    const char *s;
  1.1717      if (!t || !*t)
  1.1718 -	return 0;
  1.1719 +	return FALSE;
  1.1720      s=t;
  1.1721      while (*t=='m' && *t)
  1.1722  	t++;
  1.1723 @@ -3006,19 +3045,19 @@
  1.1724   * errors. gcisalpha() recognizes accented letters from the CP1252 (Windows)
  1.1725   * and ISO-8859-1 character sets, which are the most common PG 8-bit types.
  1.1726   */
  1.1727 -int gcisalpha(unsigned char c)
  1.1728 +gboolean gcisalpha(unsigned char c)
  1.1729  {
  1.1730      if (c>='a' && c<='z')
  1.1731 -	return 1;
  1.1732 +	return TRUE;
  1.1733      if (c>='A' && c<='Z')
  1.1734 -	return 1;
  1.1735 +	return TRUE;
  1.1736      if (c<140)
  1.1737 -	return 0;
  1.1738 +	return FALSE;
  1.1739      if (c>=192 && c!=208 && c!=215 && c!=222 && c!=240 && c!=247 && c!=254)
  1.1740 -	return 1;
  1.1741 +	return TRUE;
  1.1742      if (c==140 || c==142 || c==156 || c==158 || c==159)
  1.1743 -	return 1;
  1.1744 -    return 0;
  1.1745 +	return TRUE;
  1.1746 +    return FALSE;
  1.1747  }
  1.1748  
  1.1749  /*
  1.1750 @@ -3026,7 +3065,7 @@
  1.1751   *
  1.1752   * A version of isdigit() that doesn't get confused in 8-bit texts.
  1.1753   */
  1.1754 -int gcisdigit(unsigned char c)
  1.1755 +gboolean gcisdigit(unsigned char c)
  1.1756  {   
  1.1757      return c>='0' && c<='9';
  1.1758  }
  1.1759 @@ -3037,24 +3076,12 @@
  1.1760   * A version of isletter() that doesn't get confused in 8-bit texts.
  1.1761   * NB: this is ISO-8859-1-specific.
  1.1762   */
  1.1763 -int gcisletter(unsigned char c)
  1.1764 +gboolean gcisletter(unsigned char c)
  1.1765  {   
  1.1766      return c>='A' && c<='Z' || c>='a' && c<='z' || c>=192;
  1.1767  }
  1.1768  
  1.1769  /*
  1.1770 - * gcstrchr:
  1.1771 - *
  1.1772 - * Wraps strchr to return NULL if the character being searched for is zero.
  1.1773 - */
  1.1774 -char *gcstrchr(char *s,char c)
  1.1775 -{
  1.1776 -    if (!c)
  1.1777 -	return NULL;
  1.1778 -    return strchr(s,c);
  1.1779 -}
  1.1780 -
  1.1781 -/*
  1.1782   * postprocess_for_DP:
  1.1783   *
  1.1784   * Invoked with the -d switch from flgets().
  1.1785 @@ -3097,7 +3124,7 @@
  1.1786   */
  1.1787  void postprocess_for_HTML(char *theline)
  1.1788  {
  1.1789 -    if (strstr(theline,"<") && strstr(theline,">"))
  1.1790 +    if (strchr(theline,'<') && strchr(theline,'>'))
  1.1791  	while (losemarkup(theline))
  1.1792  	    ;
  1.1793      while (loseentities(theline))
  1.1794 @@ -3171,9 +3198,9 @@
  1.1795      return NULL;
  1.1796  }
  1.1797  
  1.1798 -int tagcomp(char *strin,char *basetag)
  1.1799 +int tagcomp(const char *strin,const char *basetag)
  1.1800  {
  1.1801 -    char *s,*t;
  1.1802 +    const char *s,*t;
  1.1803      s=basetag;
  1.1804      t=strin;
  1.1805      if (*t=='/')
  1.1806 @@ -3188,8 +3215,9 @@
  1.1807      return 0;
  1.1808  }
  1.1809  
  1.1810 -void proghelp()
  1.1811 +void proghelp(GOptionContext *context)
  1.1812  {
  1.1813 +    gchar *help;
  1.1814      fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
  1.1815      fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
  1.1816      fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
  1.1817 @@ -3198,22 +3226,10 @@
  1.1818      fputs("This is Free Software; "
  1.1819        "you may redistribute it under certain conditions (GPL);\n",stderr);
  1.1820      fputs("read the file COPYING for details.\n\n",stderr);
  1.1821 -    fputs("Usage is: bookloupe [-setpxloyhud] filename\n",stderr);
  1.1822 -    fputs("  where -s checks single quotes, -e suppresses echoing lines, "
  1.1823 -      "-t checks typos\n",stderr);
  1.1824 -    fputs("  -x (paranoid) switches OFF -t and extra checks, "
  1.1825 -      "-l turns OFF line-end checks\n",stderr);
  1.1826 -    fputs("  -o just displays overview without detail, "
  1.1827 -      "-h echoes header fields\n",stderr);
  1.1828 -    fputs("  -v (verbose) unsuppresses duplicate reporting, "
  1.1829 -      "-m suppresses markup\n",stderr);
  1.1830 -    fputs("  -d ignores DP-specific markup,\n",stderr);
  1.1831 -    fputs("  -u uses a file gutcheck.typ to query user-defined "
  1.1832 -      "possible typos\n",stderr);
  1.1833 -    fputs("Sample usage: bookloupe warpeace.txt \n",stderr);
  1.1834 -    fputs("\n",stderr);
  1.1835 -    fputs("Bookloupe looks for errors in Project Gutenberg(TM) etexts.\n",
  1.1836 -      stderr);
  1.1837 +    help=g_option_context_get_help(context,TRUE,NULL);
  1.1838 +    fputs(help,stderr);
  1.1839 +    g_free(help);
  1.1840 +    fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
  1.1841      fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
  1.1842        "non-ASCII\n",stderr);
  1.1843      fputs("characters like accented letters, "
changeset 69	1016349e619f
parent 68	adb087007d08
child 70	aa916da2e452