1.1 --- a/bl/textfileutils.c Mon May 27 09:03:04 2013 +0100
1.2 +++ b/bl/textfileutils.c Tue May 28 15:17:19 2013 +0100
1.3 @@ -3,26 +3,21 @@
1.4 #include <bl/bl.h>
1.5
1.6 /*
1.7 - * Read a file into memory (which should be freed with mem_free when no
1.8 + * Read a file into memory (which should be freed with g_free when no
1.9 * longer required). Returns NULL on error and outputs a suitable error
1.10 * message to stderr.
1.11 * DOS-style line endings and UTF-8 BOM are handled transparently even
1.12 * on platforms which don't normally use these formats.
1.13 */
1.14 gboolean file_get_contents_text(const char *filename,char **contents,
1.15 - size_t *length)
1.16 + size_t *length,GError **err)
1.17 {
1.18 int i;
1.19 unsigned char *raw;
1.20 - size_t raw_length;
1.21 + gsize raw_length;
1.22 GString *string;
1.23 - GError *error=NULL;
1.24 - if (!g_file_get_contents(filename,(char *)&raw,&raw_length,&error))
1.25 - {
1.26 - fprintf(stderr,"%s: %s\n",filename,error->message);
1.27 - g_error_free(error);
1.28 + if (!g_file_get_contents(filename,(char **)&raw,&raw_length,err))
1.29 return FALSE;
1.30 - }
1.31 string=g_string_new(NULL);
1.32 i=0;
1.33 if (raw_length>=3 && raw[0]==0xEF && raw[1]==0xBB && raw[2]==0xBF)
2.1 --- a/bl/textfileutils.h Mon May 27 09:03:04 2013 +0100
2.2 +++ b/bl/textfileutils.h Tue May 28 15:17:19 2013 +0100
2.3 @@ -4,6 +4,6 @@
2.4 #include <glib.h>
2.5
2.6 gboolean file_get_contents_text(const char *filename,char **contents,
2.7 - size_t *length);
2.8 + size_t *length,GError **err);
2.9
2.10 #endif /* BL_TEXTFILEUTILS_H */
3.1 --- a/bookloupe/Makefile.am Mon May 27 09:03:04 2013 +0100
3.2 +++ b/bookloupe/Makefile.am Tue May 28 15:17:19 2013 +0100
3.3 @@ -1,5 +1,9 @@
3.4 +INCLUDES=-I$(top_srcdir)
3.5 bin_PROGRAMS=bookloupe
3.6 pkgdata_DATA=bookloupe.typ
3.7 +AM_CFLAGS=$(GLIB_CFLAGS)
3.8 +LIBS=$(GLIB_LIBS)
3.9 +LDADD=../bl/libbl.la
3.10
3.11 bookloupe.typ: bookloupe.typ.in
3.12 sed 's/$$/\r/' $< > $@
4.1 --- a/bookloupe/bookloupe.c Mon May 27 09:03:04 2013 +0100
4.2 +++ b/bookloupe/bookloupe.c Tue May 28 15:17:19 2013 +0100
4.3 @@ -22,19 +22,10 @@
4.4 #include <stdlib.h>
4.5 #include <string.h>
4.6 #include <ctype.h>
4.7 +#include <glib.h>
4.8 +#include <bl/bl.h>
4.9
4.10 -#define MAXWORDLEN 80 /* max length of one word */
4.11 -#define LINEBUFSIZE 2048 /* buffer size for an input line */
4.12 -
4.13 -#define MAX_USER_TYPOS 1000
4.14 -#define USERTYPO_FILE "gutcheck.typ"
4.15 -
4.16 -#ifndef MAX_PATH
4.17 -#define MAX_PATH 16384
4.18 -#endif
4.19 -
4.20 -char aline[LINEBUFSIZE];
4.21 -char prevline[LINEBUFSIZE];
4.22 +gchar *prevline;
4.23
4.24 /* Common typos. */
4.25 char *typo[] = {
4.26 @@ -70,7 +61,7 @@
4.27 "se", ""
4.28 };
4.29
4.30 -char *usertypo[MAX_USER_TYPOS];
4.31 +GTree *usertypo;
4.32
4.33 /* Common abbreviations and other OK words not to query as typos. */
4.34 char *okword[] = {
4.35 @@ -282,46 +273,57 @@
4.36 #define WAY_TOO_LONG 80
4.37 #define SHORTEST_PG_LINE 55
4.38
4.39 -#define SWITCHES "ESTPXLOYHWVMUD" /* switches:- */
4.40 - /* D - ignore DP-specific markup */
4.41 - /* E - echo queried line */
4.42 - /* S - check single quotes */
4.43 - /* T - check common typos */
4.44 - /* P - require closure of quotes on */
4.45 - /* every paragraph */
4.46 - /* X - "Trust no one" :-) Paranoid! */
4.47 - /* Queries everything */
4.48 - /* L - line end checking defaults on */
4.49 - /* -L turns it off */
4.50 - /* O - overview. Just shows counts. */
4.51 - /* Y - puts errors to stdout */
4.52 - /* instead of stderr */
4.53 - /* H - Echoes header fields */
4.54 - /* M - Ignore markup in < > */
4.55 - /* U - Use file of User-defined Typos */
4.56 - /* W - Defaults for use on Web upload */
4.57 - /* V - Verbose - list EVERYTHING! */
4.58 -#define SWITNO 14 /* max number of switch parms */
4.59 - /* - used for defining array-size */
4.60 -#define MINARGS 1 /* minimum no of args excl switches */
4.61 -#define MAXARGS 1 /* maximum no of args excl switches */
4.62 +enum {
4.63 + ECHO_SWITCH,
4.64 + SQUOTE_SWITCH,
4.65 + TYPO_SWITCH,
4.66 + QPARA_SWITCH,
4.67 + PARANOID_SWITCH,
4.68 + LINE_END_SWITCH,
4.69 + OVERVIEW_SWITCH,
4.70 + STDOUT_SWITCH,
4.71 + HEADER_SWITCH,
4.72 + WEB_SWITCH,
4.73 + VERBOSE_SWITCH,
4.74 + MARKUP_SWITCH,
4.75 + USERTYPO_SWITCH,
4.76 + DP_SWITCH,
4.77 + SWITNO
4.78 +};
4.79
4.80 -int pswit[SWITNO]; /* program switches set by SWITCHES */
4.81 +gboolean pswit[SWITNO]; /* program switches */
4.82
4.83 -#define ECHO_SWITCH 0
4.84 -#define SQUOTE_SWITCH 1
4.85 -#define TYPO_SWITCH 2
4.86 -#define QPARA_SWITCH 3
4.87 -#define PARANOID_SWITCH 4
4.88 -#define LINE_END_SWITCH 5
4.89 -#define OVERVIEW_SWITCH 6
4.90 -#define STDOUT_SWITCH 7
4.91 -#define HEADER_SWITCH 8
4.92 -#define WEB_SWITCH 9
4.93 -#define VERBOSE_SWITCH 10
4.94 -#define MARKUP_SWITCH 11
4.95 -#define USERTYPO_SWITCH 12
4.96 -#define DP_SWITCH 13
4.97 +static GOptionEntry options[]={
4.98 + { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
4.99 + "Ignore DP-specific markup", NULL },
4.100 + { "noecho", 'e', 0, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
4.101 + "Don't echo queried line", NULL },
4.102 + { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
4.103 + "Check single quotes", NULL },
4.104 + { "typo", 't', 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
4.105 + "Check common typos", NULL },
4.106 + { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
4.107 + "Require closure of quotes on every paragraph", NULL },
4.108 + { "relaxed", 'x', 0, G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
4.109 + "Disable paranoid querying of everything", NULL },
4.110 + { "line-end", 'l', 0, G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
4.111 + "Disable line end checking", NULL },
4.112 + { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
4.113 + "Overview: just show counts", NULL },
4.114 + { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
4.115 + "Output errors to stdout instead of stderr", NULL },
4.116 + { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
4.117 + "Echo header fields", NULL },
4.118 + { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
4.119 + "Ignore markup in < >", NULL },
4.120 + { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
4.121 + "Use file of user-defined typos", NULL },
4.122 + { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
4.123 + "Defaults for use on www upload", NULL },
4.124 + { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
4.125 + "Verbose - list everything", NULL },
4.126 + { NULL }
4.127 +};
4.128
4.129 long cnt_dquot; /* for overview mode, count of doublequote queries */
4.130 long cnt_squot; /* for overview mode, count of singlequote queries */
4.131 @@ -340,47 +342,26 @@
4.132 long linecnt; /* count of total lines in the file */
4.133 long checked_linecnt; /* count of lines actually checked */
4.134
4.135 -void proghelp(void);
4.136 -void procfile(char *);
4.137 +void proghelp(GOptionContext *context);
4.138 +void procfile(const char *);
4.139
4.140 -#define LOW_THRESHOLD 0
4.141 -#define HIGH_THRESHOLD 1
4.142 +gchar *running_from;
4.143
4.144 -#define START 0
4.145 -#define END 1
4.146 -#define PREV 0
4.147 -#define NEXT 1
4.148 -#define FIRST_OF_PAIR 0
4.149 -#define SECOND_OF_PAIR 1
4.150 -
4.151 -#define MAX_WORDPAIR 1000
4.152 -
4.153 -char running_from[MAX_PATH];
4.154 -
4.155 -int mixdigit(char *);
4.156 -const char *getaword(const char *,char *);
4.157 -int matchword(char *,char *);
4.158 -char *flgets(char *,int,FILE *,long);
4.159 -void lowerit(char *);
4.160 -int gcisalpha(unsigned char);
4.161 -int gcisdigit(unsigned char);
4.162 -int gcisletter(unsigned char);
4.163 -char *gcstrchr(char *s,char c);
4.164 +int mixdigit(const char *);
4.165 +gchar *getaword(const char **);
4.166 +char *flgets(char **,long);
4.167 +gboolean gcisalpha(unsigned char);
4.168 +gboolean gcisdigit(unsigned char);
4.169 +gboolean gcisletter(unsigned char);
4.170 void postprocess_for_HTML(char *);
4.171 char *linehasmarkup(char *);
4.172 char *losemarkup(char *);
4.173 -int tagcomp(char *,char *);
4.174 +int tagcomp(const char *,const char *);
4.175 char *loseentities(char *);
4.176 -int isroman(char *);
4.177 -int usertypo_count;
4.178 +gboolean isroman(const char *);
4.179 void postprocess_for_DP(char *);
4.180
4.181 -char wrk[LINEBUFSIZE];
4.182 -
4.183 -#define MAX_QWORD 50
4.184 -#define MAX_QWORD_LENGTH 40
4.185 -char qword[MAX_QWORD][MAX_QWORD_LENGTH];
4.186 -int dupcnt[MAX_QWORD];
4.187 +GTree *qword,*qperiod;
4.188
4.189 struct first_pass_results {
4.190 long firstline,astline;
4.191 @@ -392,7 +373,8 @@
4.192
4.193 struct warnings {
4.194 int shortline,longline,bin,dash,dotcomma,ast,fslash,digit,hyphen;
4.195 - int endquote,isDutch,isFrench;
4.196 + int endquote;
4.197 + gboolean isDutch,isFrench;
4.198 };
4.199
4.200 struct counters {
4.201 @@ -411,52 +393,35 @@
4.202 };
4.203
4.204 struct pending {
4.205 - char dquote[80],squote[80],rbrack[80],sbrack[80],cbrack[80],unders[80];
4.206 + char *dquote,*squote,*rbrack,*sbrack,*cbrack,*unders;
4.207 long squot;
4.208 };
4.209
4.210 -int main(int argc,char **argv)
4.211 +void parse_options(int *argc,char ***argv)
4.212 {
4.213 - char *argsw,*s;
4.214 - int i,switno,invarg;
4.215 - char usertypo_file[MAX_PATH];
4.216 - FILE *usertypofile;
4.217 - if (strlen(argv[0])<sizeof(running_from))
4.218 - /* save the path to the executable */
4.219 - strcpy(running_from,argv[0]);
4.220 - /* find out what directory we're running from */
4.221 - s=running_from+strlen(running_from);
4.222 - for (;*s!='/' && *s!='\\' && s>=running_from;s--)
4.223 - *s=0;
4.224 - switno=strlen(SWITCHES);
4.225 - for (i=switno;--i>0;)
4.226 - pswit[i]=0; /* initialise switches */
4.227 - /*
4.228 - * Standard loop to extract switches.
4.229 - * When we come out of this loop, the arguments will be
4.230 - * in argv[0] upwards and the switches used will be
4.231 - * represented by their equivalent elements in pswit[]
4.232 - */
4.233 - while (--argc>0 && **++argv=='-')
4.234 - for (argsw=argv[0]+1;*argsw!='\0';argsw++)
4.235 - for (i=switno,invarg=1;(--i>=0) && invarg==1;)
4.236 - if ((toupper(*argsw))==SWITCHES[i])
4.237 - {
4.238 - invarg=0;
4.239 - pswit[i]=1;
4.240 - }
4.241 + GError *err=NULL;
4.242 + GOptionContext *context;
4.243 + context=g_option_context_new(
4.244 + "file - looks for errors in Project Gutenberg(TM) etexts");
4.245 + g_option_context_add_main_entries(context,options,NULL);
4.246 + if (!g_option_context_parse(context,argc,argv,&err))
4.247 + {
4.248 + g_printerr("Bookloupe: %s\n",err->message);
4.249 + g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
4.250 + exit(1);
4.251 + }
4.252 /* Paranoid checking is turned OFF, not on, by its switch */
4.253 - pswit[PARANOID_SWITCH]^=1;
4.254 + pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
4.255 if (pswit[PARANOID_SWITCH])
4.256 - /* if running in paranoid mode force typo checks as well */
4.257 - pswit[TYPO_SWITCH]=pswit[TYPO_SWITCH]^1;
4.258 + /* if running in paranoid mode, typo checks default to enabled */
4.259 + pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
4.260 /* Line-end checking is turned OFF, not on, by its switch */
4.261 - pswit[LINE_END_SWITCH]^=1;
4.262 + pswit[LINE_END_SWITCH]=!pswit[LINE_END_SWITCH];
4.263 /* Echoing is turned OFF, not on, by its switch */
4.264 - pswit[ECHO_SWITCH]^=1;
4.265 + pswit[ECHO_SWITCH]=!pswit[ECHO_SWITCH];
4.266 if (pswit[OVERVIEW_SWITCH])
4.267 /* just print summary; don't echo */
4.268 - pswit[ECHO_SWITCH]=0;
4.269 + pswit[ECHO_SWITCH]=FALSE;
4.270 /*
4.271 * Web uploads - for the moment, this is really just a placeholder
4.272 * until we decide what processing we really want to do on web uploads
4.273 @@ -464,85 +429,155 @@
4.274 if (pswit[WEB_SWITCH])
4.275 {
4.276 /* specific override for web uploads */
4.277 - pswit[ECHO_SWITCH]=1;
4.278 - pswit[SQUOTE_SWITCH]=0;
4.279 - pswit[TYPO_SWITCH]=1;
4.280 - pswit[QPARA_SWITCH]=0;
4.281 - pswit[PARANOID_SWITCH]=1;
4.282 - pswit[LINE_END_SWITCH]=0;
4.283 - pswit[OVERVIEW_SWITCH]=0;
4.284 - pswit[STDOUT_SWITCH]=0;
4.285 - pswit[HEADER_SWITCH]=1;
4.286 - pswit[VERBOSE_SWITCH]=0;
4.287 - pswit[MARKUP_SWITCH]=0;
4.288 - pswit[USERTYPO_SWITCH]=0;
4.289 - pswit[DP_SWITCH]=0;
4.290 + pswit[ECHO_SWITCH]=TRUE;
4.291 + pswit[SQUOTE_SWITCH]=FALSE;
4.292 + pswit[TYPO_SWITCH]=TRUE;
4.293 + pswit[QPARA_SWITCH]=FALSE;
4.294 + pswit[PARANOID_SWITCH]=TRUE;
4.295 + pswit[LINE_END_SWITCH]=FALSE;
4.296 + pswit[OVERVIEW_SWITCH]=FALSE;
4.297 + pswit[STDOUT_SWITCH]=FALSE;
4.298 + pswit[HEADER_SWITCH]=TRUE;
4.299 + pswit[VERBOSE_SWITCH]=FALSE;
4.300 + pswit[MARKUP_SWITCH]=FALSE;
4.301 + pswit[USERTYPO_SWITCH]=FALSE;
4.302 + pswit[DP_SWITCH]=FALSE;
4.303 }
4.304 - if (argc<MINARGS || argc>MAXARGS)
4.305 + if (*argc<2)
4.306 {
4.307 - /* check number of args */
4.308 - proghelp();
4.309 - return 1;
4.310 + proghelp(context);
4.311 + exit(1);
4.312 }
4.313 - /* read in the user-defined stealth scanno list */
4.314 + g_option_context_free(context);
4.315 +}
4.316 +
4.317 +/*
4.318 + * read_user_scannos:
4.319 + *
4.320 + * Read in the user-defined stealth scanno list.
4.321 + */
4.322 +void read_user_scannos(void)
4.323 +{
4.324 + GError *err=NULL;
4.325 + gchar *usertypo_file;
4.326 + gboolean okay;
4.327 + int i;
4.328 + gsize len;
4.329 + gchar *contents,**lines;
4.330 + usertypo_file=g_strdup("bookloupe.typ");
4.331 + okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
4.332 + if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
4.333 + {
4.334 + g_clear_error(&err);
4.335 + g_free(usertypo_file);
4.336 + usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
4.337 + okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
4.338 + }
4.339 + if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
4.340 + {
4.341 + g_clear_error(&err);
4.342 + g_free(usertypo_file);
4.343 + usertypo_file=g_strdup("gutcheck.typ");
4.344 + okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
4.345 + }
4.346 + if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
4.347 + {
4.348 + g_clear_error(&err);
4.349 + g_free(usertypo_file);
4.350 + usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
4.351 + okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
4.352 + }
4.353 + if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
4.354 + {
4.355 + g_free(usertypo_file);
4.356 + printf(" --> I couldn't find bookloupe.typ "
4.357 + "-- proceeding without user typos.\n");
4.358 + return;
4.359 + }
4.360 + else if (!okay)
4.361 + {
4.362 + fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
4.363 + g_free(usertypo_file);
4.364 + g_clear_error(&err);
4.365 + exit(1);
4.366 + }
4.367 + lines=g_strsplit(contents,"\n",0);
4.368 + usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
4.369 + for (i=0;lines[i];i++)
4.370 + if (*(unsigned char *)lines[i]>'!')
4.371 + g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
4.372 + else
4.373 + g_free(lines[i]);
4.374 + g_free(lines);
4.375 +}
4.376 +
4.377 +#if 0
4.378 +/*
4.379 + * read_etext:
4.380 + *
4.381 + * Read an etext returning an array of lines. Lines are normally expected
4.382 + * to be terminated by CR LF. Solitary LFs delimit lines but are left
4.383 + * embedded at the end of the line for further processing. Solitary CRs
4.384 + * do not delimit lines.
4.385 + */
4.386 +gchar **read_etext(const char *filename,GError **err)
4.387 +{
4.388 + int i;
4.389 + const char *s,*t;
4.390 + gchar *contents;
4.391 + gchar **raw_lines;
4.392 + GPtrArray *lines;
4.393 + gsize len;
4.394 + if (!g_file_get_contents(filename,&contents,&len,err))
4.395 + return NULL;
4.396 + raw_lines=g_strsplit(contents,"\r\n",0);
4.397 + lines=g_ptr_array_sized_new(g_strv_length(raw_lines)+1);
4.398 + for (i=0;raw_lines[i];i++)
4.399 + {
4.400 + t=strchr(raw_lines[i],'\n');
4.401 + if (t)
4.402 + {
4.403 + s=raw_lines[i];
4.404 + while ((t=strchr(s,'\n')))
4.405 + {
4.406 + g_ptr_array_add(lines,g_strndup(s,t-s+1));
4.407 + s=t+1;
4.408 + }
4.409 + g_ptr_array_add(lines,g_strdup(s));
4.410 + g_free(raw_lines[i]);
4.411 + }
4.412 + else
4.413 + g_ptr_array_add(lines,raw_lines[i]);
4.414 + }
4.415 + g_free(raw_lines);
4.416 + g_ptr_array_add(lines,NULL);
4.417 + return (gchar **)g_ptr_array_free(lines,FALSE);
4.418 +}
4.419 +#else
4.420 +/*
4.421 + * read_etext:
4.422 + *
4.423 + * Read an etext returning a newly allocated string containing the file
4.424 + * contents or NULL on error.
4.425 + */
4.426 +gchar *read_etext(const char *filename,GError **err)
4.427 +{
4.428 + gchar *contents;
4.429 + gsize len;
4.430 + if (!g_file_get_contents(filename,&contents,&len,err))
4.431 + return NULL;
4.432 + return contents;
4.433 +}
4.434 +#endif
4.435 +
4.436 +int main(int argc,char **argv)
4.437 +{
4.438 + running_from=g_path_get_dirname(argv[0]);
4.439 + parse_options(&argc,&argv);
4.440 if (pswit[USERTYPO_SWITCH])
4.441 - {
4.442 - /* ... we were told we had one! */
4.443 - usertypofile=fopen(USERTYPO_FILE,"rb");
4.444 - if (!usertypofile)
4.445 - {
4.446 - /* not in cwd. try excuteable directory. */
4.447 - strcpy(usertypo_file,running_from);
4.448 - strcat(usertypo_file,USERTYPO_FILE);
4.449 - usertypofile=fopen(usertypo_file,"rb");
4.450 - if (!usertypofile) {
4.451 - /* we ain't got no user typo file! */
4.452 - printf(" --> I couldn't find gutcheck.typ "
4.453 - "-- proceeding without user typos.\n");
4.454 - }
4.455 - }
4.456 - usertypo_count=0;
4.457 - if (usertypofile)
4.458 - {
4.459 - /* we managed to open a User Typo File! */
4.460 - if (pswit[USERTYPO_SWITCH])
4.461 - {
4.462 - while (flgets(aline,LINEBUFSIZE-1,usertypofile,
4.463 - (long)usertypo_count))
4.464 - {
4.465 - if (strlen(aline)>1)
4.466 - {
4.467 - if ((int)*aline>33)
4.468 - {
4.469 - s=malloc(strlen(aline)+1);
4.470 - if (!s)
4.471 - {
4.472 - fprintf(stderr,"bookloupe: cannot get enough "
4.473 - "memory for user typo file!\n");
4.474 - exit(1);
4.475 - }
4.476 - strcpy(s,aline);
4.477 - usertypo[usertypo_count]=s;
4.478 - usertypo_count++;
4.479 - if (usertypo_count>=MAX_USER_TYPOS)
4.480 - {
4.481 - printf(" --> Only %d user-defined typos "
4.482 - "allowed: ignoring the rest\n",
4.483 - MAX_USER_TYPOS);
4.484 - break;
4.485 - }
4.486 - }
4.487 - }
4.488 - }
4.489 - }
4.490 - fclose(usertypofile);
4.491 - }
4.492 - }
4.493 + read_user_scannos();
4.494 fprintf(stderr,"bookloupe: Check and report on an e-text\n");
4.495 - cnt_dquot=cnt_squot=cnt_brack=cnt_bin=cnt_odd=cnt_long=
4.496 - cnt_short=cnt_punct=cnt_dash=cnt_word=cnt_html=cnt_lineend=
4.497 - cnt_spacend=0;
4.498 - procfile(argv[0]);
4.499 + procfile(argv[1]);
4.500 if (pswit[OVERVIEW_SWITCH])
4.501 {
4.502 printf(" Checked %ld lines of %ld (head+foot = %ld)\n\n",
4.503 @@ -577,6 +612,9 @@
4.504 cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+
4.505 cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend);
4.506 }
4.507 + g_free(running_from);
4.508 + if (usertypo)
4.509 + g_tree_unref(usertypo);
4.510 return 0;
4.511 }
4.512
4.513 @@ -588,28 +626,33 @@
4.514 * occur many times in the text like long or short
4.515 * lines, non-standard dashes, etc.
4.516 */
4.517 -struct first_pass_results *first_pass(FILE *infile)
4.518 +struct first_pass_results *first_pass(const char *etext)
4.519 {
4.520 char laststart=CHAR_SPACE;
4.521 const char *s;
4.522 - int i,llen;
4.523 + gchar *lc_line;
4.524 + int i,j,llen;
4.525 + gchar **lines;
4.526 unsigned int lastlen=0,lastblen=0;
4.527 long spline=0,nspline=0;
4.528 static struct first_pass_results results={0};
4.529 - char inword[MAXWORDLEN]="";
4.530 - while (fgets(aline,LINEBUFSIZE-1,infile))
4.531 + gchar *inword;
4.532 + lines=g_strsplit(etext,"\n",0);
4.533 + for (j=0;lines[j];j++)
4.534 {
4.535 - while (aline[strlen(aline)-1]==10 || aline[strlen(aline)-1]==13)
4.536 - aline[strlen(aline)-1]=0;
4.537 + llen=strlen(lines[j]);
4.538 + while(llen>0 && lines[j][llen-1]=='\r') /* llen>0: empty lines have no last char */
4.539 + lines[j][--llen]='\0'; /* overwrite the CR itself, not the terminator */
4.540 linecnt++;
4.541 - if (strstr(aline,"*END") && strstr(aline,"SMALL PRINT") &&
4.542 - (strstr(aline,"PUBLIC DOMAIN") || strstr(aline,"COPYRIGHT")))
4.543 + if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
4.544 + (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
4.545 {
4.546 if (spline)
4.547 printf(" --> Duplicate header?\n");
4.548 spline=linecnt+1; /* first line of non-header text, that is */
4.549 }
4.550 - if (!strncmp(aline,"*** START",9) && strstr(aline,"PROJECT GUTENBERG"))
4.551 + if (!strncmp(lines[j],"*** START",9) &&
4.552 + strstr(lines[j],"PROJECT GUTENBERG"))
4.553 {
4.554 if (nspline)
4.555 printf(" --> Duplicate header?\n");
4.556 @@ -617,10 +660,10 @@
4.557 }
4.558 if (spline || nspline)
4.559 {
4.560 - lowerit(aline);
4.561 - if (strstr(aline,"end") && strstr(aline,"project gutenberg"))
4.562 + lc_line=g_ascii_strdown(lines[j],llen);
4.563 + if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
4.564 {
4.565 - if (strstr(aline,"end")<strstr(aline,"project gutenberg"))
4.566 + if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
4.567 {
4.568 if (results.footerline)
4.569 {
4.570 @@ -632,6 +675,7 @@
4.571 results.footerline=linecnt;
4.572 }
4.573 }
4.574 + g_free(lc_line);
4.575 }
4.576 if (spline)
4.577 results.firstline=spline;
4.578 @@ -639,85 +683,83 @@
4.579 results.firstline=nspline; /* override with new */
4.580 if (results.footerline)
4.581 continue; /* don't count the boilerplate in the footer */
4.582 - llen=strlen(aline);
4.583 results.totlen+=llen;
4.584 for (i=0;i<llen;i++)
4.585 {
4.586 - if ((unsigned char)aline[i]>127)
4.587 + if ((unsigned char)lines[j][i]>127)
4.588 results.binlen++;
4.589 - if (gcisalpha(aline[i]))
4.590 + if (gcisalpha(lines[j][i]))
4.591 results.alphalen++;
4.592 - if (i>0 && aline[i]==CHAR_DQUOTE && isalpha(aline[i-1]))
4.593 + if (i>0 && lines[j][i]==CHAR_DQUOTE && isalpha(lines[j][i-1]))
4.594 results.endquote_count++;
4.595 }
4.596 - if (strlen(aline)>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE &&
4.597 - lastblen>2 && lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
4.598 + if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
4.599 + lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
4.600 results.shortline++;
4.601 - if (*aline && (unsigned char)aline[strlen(aline)-1]<=CHAR_SPACE)
4.602 + if (llen>0 && (unsigned char)lines[j][llen-1]<=CHAR_SPACE)
4.603 cnt_spacend++;
4.604 - if (strstr(aline,".,"))
4.605 + if (strstr(lines[j],".,"))
4.606 results.dotcomma++;
4.607 /* only count ast lines for ignoring purposes where there is */
4.608 /* locase text on the line */
4.609 - if (strstr(aline,"*"))
4.610 + if (strchr(lines[j],'*'))
4.611 {
4.612 - for (s=aline;*s;s++)
4.613 + for (s=lines[j];*s;s++)
4.614 if (*s>='a' && *s<='z')
4.615 break;
4.616 if (*s)
4.617 results.astline++;
4.618 }
4.619 - if (strstr(aline,"/"))
4.620 + if (strchr(lines[j],'/'))
4.621 results.fslashline++;
4.622 - for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--)
4.623 + for (i=llen-1;i>0 && (unsigned char)lines[j][i]<=CHAR_SPACE;i--)
4.624 ;
4.625 - if (aline[i]=='-' && aline[i-1]!='-')
4.626 + if (i>0 && lines[j][i]=='-' && lines[j][i-1]!='-') /* i>0 keeps [i-1] in bounds */
4.627 results.hyphens++;
4.628 if (llen>LONGEST_PG_LINE)
4.629 results.longline++;
4.630 if (llen>WAY_TOO_LONG)
4.631 results.verylongline++;
4.632 - if (strstr(aline,"<") && strstr(aline,">"))
4.633 + if (strchr(lines[j],'<') && strchr(lines[j],'>'))
4.634 {
4.635 - i=(int)(strstr(aline,">")-strstr(aline,"<")+1);
4.636 + i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
4.637 if (i>0)
4.638 results.htmcount++;
4.639 - if (strstr(aline,"<i>"))
4.640 + if (strstr(lines[j],"<i>"))
4.641 results.htmcount+=4; /* bonus marks! */
4.642 }
4.643 /* Check for spaced em-dashes */
4.644 - if (strstr(aline,"--"))
4.645 + if (lines[j][0] && (s=strstr(lines[j]+1,"--")))
4.646 {
4.647 results.emdash++;
4.648 - if (*(strstr(aline,"--")-1)==CHAR_SPACE ||
4.649 - (*(strstr(aline,"--")+2)==CHAR_SPACE))
4.650 + if (s[-1]==CHAR_SPACE || (s[2]==CHAR_SPACE))
4.651 results.space_emdash++;
4.652 - if (*(strstr(aline,"--")-1)==CHAR_SPACE &&
4.653 - (*(strstr(aline,"--")+2)==CHAR_SPACE))
4.654 + if (s[-1]==CHAR_SPACE && (s[2]==CHAR_SPACE))
4.655 /* count of em-dashes with spaces both sides */
4.656 results.non_PG_space_emdash++;
4.657 - if (*(strstr(aline,"--")-1)!=CHAR_SPACE &&
4.658 - (*(strstr(aline,"--")+2)!=CHAR_SPACE))
4.659 + if (s[-1]!=CHAR_SPACE && (s[2]!=CHAR_SPACE))
4.660 /* count of PG-type em-dashes with no spaces */
4.661 results.PG_space_emdash++;
4.662 }
4.663 - for (s=aline;*s;)
4.664 + for (s=lines[j];*s;)
4.665 {
4.666 - s=getaword(s,inword);
4.667 + inword=getaword(&s);
4.668 if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))
4.669 results.Dutchcount++;
4.670 if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))
4.671 results.Frenchcount++;
4.672 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
4.673 results.standalone_digit++;
4.674 + g_free(inword);
4.675 }
4.676 /* Check for spaced dashes */
4.677 - if (strstr(aline," -") && *(strstr(aline," -")+2)!='-')
4.678 + if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
4.679 results.spacedash++;
4.680 lastblen=lastlen;
4.681 - lastlen=strlen(aline);
4.682 - laststart=aline[0];
4.683 + lastlen=llen;
4.684 + laststart=lines[j][0];
4.685 }
4.686 + g_strfreev(lines);
4.687 return &results;
4.688 }
4.689
4.690 @@ -856,17 +898,17 @@
4.691 "Not reporting them.\n");
4.692 warnings.bin=0;
4.693 }
4.694 - warnings.isDutch=0;
4.695 + warnings.isDutch=FALSE;
4.696 if (results->Dutchcount>50)
4.697 {
4.698 - warnings.isDutch=1;
4.699 + warnings.isDutch=TRUE;
4.700 printf(" --> This looks like Dutch - "
4.701 "switching off dashes and warnings for 's Middags case.\n");
4.702 }
4.703 - warnings.isFrench=0;
4.704 + warnings.isFrench=FALSE;
4.705 if (results->Frenchcount>50)
4.706 {
4.707 - warnings.isFrench=1;
4.708 + warnings.isFrench=TRUE;
4.709 printf(" --> This looks like French - "
4.710 "switching off some doublepunct.\n");
4.711 }
4.712 @@ -919,12 +961,14 @@
4.713 * count it, since empty lines with asterisks or dashes to
4.714 * separate sections are common.
4.715 *
4.716 - * Returns: Non-zero if the line is empty.
4.717 + * Returns: TRUE if the line is empty.
4.718 */
4.719 -int analyse_quotes(const char *s,struct counters *counters)
4.720 +gboolean analyse_quotes(const char *aline,struct counters *counters)
4.721 {
4.722 int guessquote=0;
4.723 - int isemptyline=1; /* assume the line is empty until proven otherwise */
4.724 + /* assume the line is empty until proven otherwise */
4.725 + gboolean isemptyline=TRUE;
4.726 + const char *s=aline;
4.727 while (*s)
4.728 {
4.729 if (*s==CHAR_DQUOTE)
4.730 @@ -986,7 +1030,7 @@
4.731 }
4.732 if (*s!=CHAR_SPACE && *s!='-' && *s!='.' && *s!=CHAR_ASTERISK &&
4.733 *s!=13 && *s!=10)
4.734 - isemptyline=0; /* ignore lines like * * * as spacers */
4.735 + isemptyline=FALSE; /* ignore lines like * * * as spacers */
4.736 if (*s==CHAR_UNDERSCORE)
4.737 counters->c_unders++;
4.738 if (*s==CHAR_OPEN_CBRACK)
4.739 @@ -1040,7 +1084,7 @@
4.740 * Check for binary and other odd characters.
4.741 */
4.742 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
4.743 - int isemptyline)
4.744 + gboolean isemptyline)
4.745 {
4.746 /* Don't repeat multiple warnings on one line. */
4.747 int eNon_A=0,eTab=0,eTilde=0,eCarat=0,eFSlash=0,eAst=0;
4.748 @@ -1461,16 +1505,15 @@
4.749 void check_for_extra_period(const char *aline,const struct warnings *warnings)
4.750 {
4.751 const char *s,*t,*s1;
4.752 - int i,istypo,isdup;
4.753 - static char qperiod[MAX_QWORD][MAX_QWORD_LENGTH];
4.754 - static int qperiod_index=0;
4.755 - char testword[MAXWORDLEN]="";
4.756 + int i;
4.757 + gboolean istypo;
4.758 + gchar *testword;
4.759 if (pswit[PARANOID_SWITCH])
4.760 {
4.761 - for (t=s=aline;strstr(t,". ");)
4.762 + for (t=aline;strstr(t,". ");)
4.763 {
4.764 t=strstr(t,". ");
4.765 - if (t==s)
4.766 + if (t==aline)
4.767 {
4.768 t++;
4.769 /* start of line punctuation is handled elsewhere */
4.770 @@ -1497,57 +1540,48 @@
4.771 if (*s1>='a' && *s1<='z')
4.772 {
4.773 /* we have something to investigate */
4.774 - istypo=1;
4.775 + istypo=TRUE;
4.776 /* so let's go back and find out */
4.777 - for (s1=t-1;s1>=s &&
4.778 + for (s1=t-1;s1>=aline &&
4.779 (gcisalpha(*s1) || gcisdigit(*s1) || *s1==CHAR_SQUOTE &&
4.780 gcisalpha(s1[1]) && gcisalpha(s1[-1]));s1--)
4.781 ;
4.782 s1++;
4.783 - for (i=0;*s1 && *s1!='.';s1++,i++)
4.784 - testword[i]=*s1;
4.785 - testword[i]=0;
4.786 + s=strchr(s1,'.');
4.787 + if (s)
4.788 + testword=g_strndup(s1,s-s1);
4.789 + else
4.790 + testword=g_strdup(s1);
4.791 for (i=0;*abbrev[i];i++)
4.792 if (!strcmp(testword,abbrev[i]))
4.793 - istypo=0;
4.794 + istypo=FALSE;
4.795 if (gcisdigit(*testword))
4.796 - istypo=0;
4.797 + istypo=FALSE;
4.798 if (!testword[1])
4.799 - istypo=0;
4.800 + istypo=FALSE;
4.801 if (isroman(testword))
4.802 - istypo=0;
4.803 + istypo=FALSE;
4.804 if (istypo)
4.805 {
4.806 - istypo=0;
4.807 + istypo=FALSE;
4.808 for (i=0;testword[i];i++)
4.809 if (strchr(vowels,testword[i]))
4.810 - istypo=1;
4.811 + istypo=TRUE;
4.812 }
4.813 - if (istypo)
4.814 + if (istypo &&
4.815 + (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
4.816 {
4.817 - isdup=0;
4.818 - if (strlen(testword)<MAX_QWORD_LENGTH &&
4.819 - !pswit[VERBOSE_SWITCH])
4.820 - for (i=0;i<qperiod_index;i++)
4.821 - if (!strcmp(testword,qperiod[i]))
4.822 - isdup=1;
4.823 - if (!isdup)
4.824 - {
4.825 - if (qperiod_index<MAX_QWORD &&
4.826 - strlen(testword)<MAX_QWORD_LENGTH)
4.827 - {
4.828 - strcpy(qperiod[qperiod_index],testword);
4.829 - qperiod_index++;
4.830 - }
4.831 - if (pswit[ECHO_SWITCH])
4.832 - printf("\n%s\n",aline);
4.833 - if (!pswit[OVERVIEW_SWITCH])
4.834 - printf(" Line %ld column %d - Extra period?\n",
4.835 - linecnt,(int)(t-aline)+1);
4.836 - else
4.837 - cnt_punct++;
4.838 - }
4.839 + g_tree_insert(qperiod,g_strdup(testword),
4.840 + GINT_TO_POINTER(1));
4.841 + if (pswit[ECHO_SWITCH])
4.842 + printf("\n%s\n",aline);
4.843 + if (!pswit[OVERVIEW_SWITCH])
4.844 + printf(" Line %ld column %d - Extra period?\n",
4.845 + linecnt,(int)(t-aline)+1);
4.846 + else
4.847 + cnt_punct++;
4.848 }
4.849 + g_free(testword);
4.850 }
4.851 t++;
4.852 }
4.853 @@ -1563,16 +1597,20 @@
4.854 {
4.855 int i;
4.856 const char *s,*wordstart;
4.857 - char inword[MAXWORDLEN];
4.858 + gchar *inword,*t;
4.859 if (pswit[TYPO_SWITCH])
4.860 {
4.861 for (s=aline;*s;)
4.862 {
4.863 wordstart=s;
4.864 - s=getaword(s,inword);
4.865 - if (!*inword)
4.866 + t=getaword(&s);
4.867 + if (!*t)
4.868 + {
4.869 + g_free(t);
4.870 continue;
4.871 - lowerit(inword);
4.872 + }
4.873 + inword=g_ascii_strdown(t,-1);
4.874 + g_free(t);
4.875 for (i=0;*nocomma[i];i++)
4.876 if (!strcmp(inword,nocomma[i]))
4.877 {
4.878 @@ -1603,6 +1641,7 @@
4.879 cnt_punct++;
4.880 }
4.881 }
4.882 + g_free(inword);
4.883 }
4.884 }
4.885 }
4.886 @@ -1616,15 +1655,18 @@
4.887 void check_for_typos(const char *aline,struct warnings *warnings)
4.888 {
4.889 const char *s,*wordstart;
4.890 - char inword[MAXWORDLEN],testword[MAXWORDLEN];
4.891 - int i,istypo,isdup,alower,vowel,consonant;
4.892 - static int qword_index=0;
4.893 + gchar *inword,*testword;
4.894 + int i,alower,vowel,consonant,*dupcnt;
4.895 + gboolean isdup,istypo;
4.896 for (s=aline;*s;)
4.897 {
4.898 wordstart=s;
4.899 - s=getaword(s,inword);
4.900 + inword=getaword(&s);
4.901 if (!*inword)
4.902 + {
4.903 + g_free(inword);
4.904 continue; /* don't bother with empty lines */
4.905 + }
4.906 if (mixdigit(inword))
4.907 {
4.908 if (pswit[ECHO_SWITCH])
4.909 @@ -1639,10 +1681,10 @@
4.910 * Put the word through a series of tests for likely typos and OCR
4.911 * errors.
4.912 */
4.913 - if (pswit[TYPO_SWITCH])
4.914 + if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
4.915 {
4.916 - istypo=0;
4.917 - strcpy(testword,inword);
4.918 + istypo=FALSE;
4.919 + testword=g_strdup(inword);
4.920 alower=0;
4.921 for (i=0;i<(int)strlen(testword);i++)
4.922 {
4.923 @@ -1662,10 +1704,13 @@
4.924 testword[2]=='c' || i>0 && testword[i-1]==CHAR_SQUOTE)
4.925 ; /* do nothing! */
4.926 else
4.927 - istypo=1;
4.928 + istypo=TRUE;
4.929 }
4.930 testword[i]=(char)tolower(testword[i]);
4.931 }
4.932 + }
4.933 + if (pswit[TYPO_SWITCH])
4.934 + {
4.935 /*
4.936 * Check for certain unlikely two-letter combinations at word
4.937 * start and end.
4.938 @@ -1674,26 +1719,26 @@
4.939 {
4.940 for (i=0;*nostart[i];i++)
4.941 if (!strncmp(testword,nostart[i],2))
4.942 - istypo=1;
4.943 + istypo=TRUE;
4.944 for (i=0;*noend[i];i++)
4.945 if (!strncmp(testword+strlen(testword)-2,noend[i],2))
4.946 - istypo=1;
4.947 + istypo=TRUE;
4.948 }
4.949 /* ght is common, gbt never. Like that. */
4.950 if (strstr(testword,"cb"))
4.951 - istypo=1;
4.952 + istypo=TRUE;
4.953 if (strstr(testword,"gbt"))
4.954 - istypo=1;
4.955 + istypo=TRUE;
4.956 if (strstr(testword,"pbt"))
4.957 - istypo=1;
4.958 + istypo=TRUE;
4.959 if (strstr(testword,"tbs"))
4.960 - istypo=1;
4.961 + istypo=TRUE;
4.962 if (strstr(testword,"mrn"))
4.963 - istypo=1;
4.964 + istypo=TRUE;
4.965 if (strstr(testword,"ahle"))
4.966 - istypo=1;
4.967 + istypo=TRUE;
4.968 if (strstr(testword,"ihle"))
4.969 - istypo=1;
4.970 + istypo=TRUE;
4.971 /*
4.972 * "TBE" does happen - like HEARTBEAT - but uncommon.
4.973 * Also "TBI" - frostbite, outbid - but uncommon.
4.974 @@ -1701,11 +1746,11 @@
4.975 * numerals, but "ii" is a common scanno.
4.976 */
4.977 if (strstr(testword,"tbi"))
4.978 - istypo=1;
4.979 + istypo=TRUE;
4.980 if (strstr(testword,"tbe"))
4.981 - istypo=1;
4.982 + istypo=TRUE;
4.983 if (strstr(testword,"ii"))
4.984 - istypo=1;
4.985 + istypo=TRUE;
4.986 /*
4.987 * Check for no vowels or no consonants.
4.988 * If none, flag a typo.
4.989 @@ -1727,7 +1772,7 @@
4.990 consonant++;
4.991 }
4.992 if (!vowel || !consonant)
4.993 - istypo=1;
4.994 + istypo=TRUE;
4.995 }
4.996 /*
4.997 * Now exclude the word from being reported if it's in
4.998 @@ -1735,18 +1780,18 @@
4.999 */
4.1000 for (i=0;*okword[i];i++)
4.1001 if (!strcmp(testword,okword[i]))
4.1002 - istypo=0;
4.1003 + istypo=FALSE;
4.1004 /*
4.1005 * What looks like a typo may be a Roman numeral.
4.1006 * Exclude these.
4.1007 */
4.1008 if (istypo && isroman(testword))
4.1009 - istypo=0;
4.1010 + istypo=FALSE;
4.1011 /* Check the manual list of typos. */
4.1012 if (!istypo)
4.1013 for (i=0;*typo[i];i++)
4.1014 if (!strcmp(testword,typo[i]))
4.1015 - istypo=1;
4.1016 + istypo=TRUE;
4.1017 /*
4.1018 * Check lowercase s, l, i and m - special cases.
4.1019 * "j" - often a semi-colon gone wrong.
4.1020 @@ -1754,34 +1799,30 @@
4.1021 * "n" for "in"
4.1022 */
4.1023 if (!istypo && strlen(testword)==1 && strchr("slmijdn",*inword))
4.1024 - istypo=1;
4.1025 + istypo=TRUE;
4.1026 if (istypo)
4.1027 {
4.1028 - isdup=0;
4.1029 - if (strlen(testword)<MAX_QWORD_LENGTH &&
4.1030 - !pswit[VERBOSE_SWITCH])
4.1031 - for (i=0;i<qword_index;i++)
4.1032 - if (!strcmp(testword,qword[i]))
4.1033 - {
4.1034 - isdup=1;
4.1035 - ++dupcnt[i];
4.1036 - }
4.1037 + dupcnt=g_tree_lookup(qword,testword);
4.1038 + if (dupcnt)
4.1039 + {
4.1040 + (*dupcnt)++;
4.1041 + isdup=!pswit[VERBOSE_SWITCH];
4.1042 + }
4.1043 + else
4.1044 + {
4.1045 + dupcnt=g_new0(int,1);
4.1046 + g_tree_insert(qword,g_strdup(testword),dupcnt);
4.1047 + isdup=FALSE;
4.1048 + }
4.1049 if (!isdup)
4.1050 {
4.1051 - if (qword_index<MAX_QWORD &&
4.1052 - strlen(testword)<MAX_QWORD_LENGTH)
4.1053 - {
4.1054 - strcpy(qword[qword_index],testword);
4.1055 - qword_index++;
4.1056 - }
4.1057 if (pswit[ECHO_SWITCH])
4.1058 printf("\n%s\n",aline);
4.1059 if (!pswit[OVERVIEW_SWITCH])
4.1060 {
4.1061 printf(" Line %ld column %d - Query word %s",
4.1062 linecnt,(int)(wordstart-aline)+1,inword);
4.1063 - if (strlen(testword)<MAX_QWORD_LENGTH &&
4.1064 - !pswit[VERBOSE_SWITCH])
4.1065 + if (!pswit[VERBOSE_SWITCH])
4.1066 printf(" - not reporting duplicates");
4.1067 printf("\n");
4.1068 }
4.1069 @@ -1791,17 +1832,16 @@
4.1070 }
4.1071 }
4.1072 /* check the user's list of typos */
4.1073 - if (!istypo && usertypo_count)
4.1074 - for (i=0;i<usertypo_count;i++)
4.1075 - if (!strcmp(testword,usertypo[i]))
4.1076 - {
4.1077 - if (pswit[ECHO_SWITCH])
4.1078 - printf("\n%s\n",aline);
4.1079 - if (!pswit[OVERVIEW_SWITCH])
4.1080 - printf(" Line %ld column %d - "
4.1081 - "Query possible scanno %s\n",
4.1082 - linecnt,(int)(wordstart-aline)+2,inword);
4.1083 - }
4.1084 + if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
4.1085 + {
4.1086 + if (pswit[ECHO_SWITCH])
4.1087 + printf("\n%s\n",aline);
4.1088 + if (!pswit[OVERVIEW_SWITCH])
4.1089 + printf(" Line %ld column %d - Query possible scanno %s\n",
4.1090 + linecnt,(int)(wordstart-aline)+2,inword);
4.1091 + }
4.1092 + if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
4.1093 + g_free(testword);
4.1094 if (pswit[PARANOID_SWITCH] && warnings->digit)
4.1095 {
4.1096 /* In paranoid mode, query all 0 and 1 standing alone. */
4.1097 @@ -1816,6 +1856,7 @@
4.1098 cnt_word++;
4.1099 }
4.1100 }
4.1101 + g_free(inword);
4.1102 }
4.1103 }
4.1104
4.1105 @@ -1830,9 +1871,10 @@
4.1106 * quotes "like"this.
4.1107 */
4.1108 void check_for_misspaced_punctuation(const char *aline,
4.1109 - struct parities *parities,int isemptyline)
4.1110 + struct parities *parities,gboolean isemptyline)
4.1111 {
4.1112 - int i,llen,isacro,isellipsis;
4.1113 + int i,llen;
4.1114 + gboolean isacro,isellipsis;
4.1115 const char *s;
4.1116 llen=strlen(aline);
4.1117 for (i=1;i<llen;i++)
4.1118 @@ -1841,9 +1883,9 @@
4.1119 if (strchr(".?!,;:_",aline[i])) /* if it's punctuation */
4.1120 {
4.1121 /* we need to suppress warnings for acronyms like M.D. */
4.1122 - isacro=0;
4.1123 + isacro=FALSE;
4.1124 /* we need to suppress warnings for ellipsis . . . */
4.1125 - isellipsis=0;
4.1126 + isellipsis=FALSE;
4.1127 /* if there are letters on both sides of it or ... */
4.1128 if (gcisalpha(aline[i-1]) && gcisalpha(aline[i+1]) ||
4.1129 gcisalpha(aline[i+1]) && strchr("?!,;:",aline[i]))
4.1130 @@ -1852,9 +1894,9 @@
4.1131 if (aline[i]=='.')
4.1132 {
4.1133 if (i>2 && aline[i-2]=='.')
4.1134 - isacro=1;
4.1135 + isacro=TRUE;
4.1136 if (i+2<llen && aline[i+2]=='.')
4.1137 - isacro=1;
4.1138 + isacro=TRUE;
4.1139 }
4.1140 if (!isacro)
4.1141 {
4.1142 @@ -1877,9 +1919,9 @@
4.1143 if (aline[i]=='.')
4.1144 {
4.1145 if (i>2 && aline[i-2]=='.')
4.1146 - isellipsis=1;
4.1147 + isellipsis=TRUE;
4.1148 if (i+2<llen && aline[i+2]=='.')
4.1149 - isellipsis=1;
4.1150 + isellipsis=TRUE;
4.1151 }
4.1152 if (!isemptyline && !isellipsis)
4.1153 {
4.1154 @@ -2177,6 +2219,8 @@
4.1155 void check_for_miscased_genative(const char *aline)
4.1156 {
4.1157 const char *s;
4.1158 + if (!*aline)
4.1159 + return;
4.1160 s=aline+1;
4.1161 while (*s)
4.1162 {
4.1163 @@ -2321,13 +2365,11 @@
4.1164 i=(int)(close-open+1);
4.1165 if (i>0)
4.1166 {
4.1167 - strncpy(wrk,open,i);
4.1168 - wrk[i]=0;
4.1169 if (pswit[ECHO_SWITCH])
4.1170 printf("\n%s\n",aline);
4.1171 if (!pswit[OVERVIEW_SWITCH])
4.1172 - printf(" Line %ld column %d - HTML Tag? %s \n",
4.1173 - linecnt,(int)(open-aline)+1,wrk);
4.1174 + printf(" Line %ld column %d - HTML Tag? %*.*s \n",
4.1175 + linecnt,(int)(open-aline)+1,i,i,open);
4.1176 else
4.1177 cnt_html++;
4.1178 }
4.1179 @@ -2359,13 +2401,11 @@
4.1180 i=0; /* Don't report "Jones & Son;" */
4.1181 if (i>0)
4.1182 {
4.1183 - strncpy(wrk,amp,i);
4.1184 - wrk[i]=0;
4.1185 if (pswit[ECHO_SWITCH])
4.1186 printf("\n%s\n",aline);
4.1187 if (!pswit[OVERVIEW_SWITCH])
4.1188 - printf(" Line %ld column %d - HTML symbol? %s \n",
4.1189 - linecnt,(int)(amp-aline)+1,wrk);
4.1190 + printf(" Line %ld column %d - HTML symbol? %*.*s \n",
4.1191 + linecnt,(int)(amp-aline)+1,i,i,amp);
4.1192 else
4.1193 cnt_html++;
4.1194 }
4.1195 @@ -2388,7 +2428,8 @@
4.1196 s=aline;
4.1197 while (*s==' ')
4.1198 s++;
4.1199 - if (*pending->dquote)
4.1200 + if (pending->dquote)
4.1201 + {
4.1202 if (*s!=CHAR_DQUOTE || pswit[QPARA_SWITCH])
4.1203 {
4.1204 if (!pswit[OVERVIEW_SWITCH])
4.1205 @@ -2400,7 +2441,10 @@
4.1206 else
4.1207 cnt_dquot++;
4.1208 }
4.1209 - if (*pending->squote)
4.1210 + g_free(pending->dquote);
4.1211 + pending->dquote=NULL;
4.1212 + }
4.1213 + if (pending->squote)
4.1214 {
4.1215 if (*s!=CHAR_SQUOTE && *s!=CHAR_OPEN_SQUOTE || pswit[QPARA_SWITCH] ||
4.1216 pending->squot)
4.1217 @@ -2414,8 +2458,10 @@
4.1218 else
4.1219 cnt_squot++;
4.1220 }
4.1221 + g_free(pending->squote);
4.1222 + pending->squote=NULL;
4.1223 }
4.1224 - if (*pending->rbrack)
4.1225 + if (pending->rbrack)
4.1226 {
4.1227 if (!pswit[OVERVIEW_SWITCH])
4.1228 {
4.1229 @@ -2425,8 +2471,10 @@
4.1230 }
4.1231 else
4.1232 cnt_brack++;
4.1233 + g_free(pending->rbrack);
4.1234 + pending->rbrack=NULL;
4.1235 }
4.1236 - if (*pending->sbrack)
4.1237 + if (pending->sbrack)
4.1238 {
4.1239 if (!pswit[OVERVIEW_SWITCH])
4.1240 {
4.1241 @@ -2436,8 +2484,10 @@
4.1242 }
4.1243 else
4.1244 cnt_brack++;
4.1245 + g_free(pending->sbrack);
4.1246 + pending->sbrack=NULL;
4.1247 }
4.1248 - if (*pending->cbrack)
4.1249 + if (pending->cbrack)
4.1250 {
4.1251 if (!pswit[OVERVIEW_SWITCH])
4.1252 {
4.1253 @@ -2447,8 +2497,10 @@
4.1254 }
4.1255 else
4.1256 cnt_brack++;
4.1257 + g_free(pending->cbrack);
4.1258 + pending->cbrack=NULL;
4.1259 }
4.1260 - if (*pending->unders)
4.1261 + if (pending->unders)
4.1262 {
4.1263 if (!pswit[OVERVIEW_SWITCH])
4.1264 {
4.1265 @@ -2458,6 +2510,8 @@
4.1266 }
4.1267 else
4.1268 cnt_brack++;
4.1269 + g_free(pending->unders);
4.1270 + pending->unders=NULL;
4.1271 }
4.1272 }
4.1273
4.1274 @@ -2481,12 +2535,12 @@
4.1275 struct pending *pending)
4.1276 {
4.1277 if (counters->quot%2)
4.1278 - sprintf(pending->dquote," Line %ld - Mismatched quotes",
4.1279 - linecnt);
4.1280 + pending->dquote=
4.1281 + g_strdup_printf(" Line %ld - Mismatched quotes",linecnt);
4.1282 if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&
4.1283 counters->open_single_quote!=counters->close_single_quote)
4.1284 - sprintf(pending->squote," Line %ld - Mismatched singlequotes?",
4.1285 - linecnt);
4.1286 + pending->squote=
4.1287 + g_strdup_printf(" Line %ld - Mismatched singlequotes?",linecnt);
4.1288 if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&
4.1289 counters->open_single_quote!=counters->close_single_quote &&
4.1290 counters->open_single_quote!=counters->close_single_quote+1)
4.1291 @@ -2496,17 +2550,17 @@
4.1292 */
4.1293 pending->squot=1;
4.1294 if (counters->r_brack)
4.1295 - sprintf(pending->rbrack," Line %ld - Mismatched round brackets?",
4.1296 - linecnt);
4.1297 + pending->rbrack=
4.1298 + g_strdup_printf(" Line %ld - Mismatched round brackets?",linecnt);
4.1299 if (counters->s_brack)
4.1300 - sprintf(pending->sbrack," Line %ld - Mismatched square brackets?",
4.1301 - linecnt);
4.1302 + pending->sbrack=
4.1303 + g_strdup_printf(" Line %ld - Mismatched square brackets?",linecnt);
4.1304 if (counters->c_brack)
4.1305 - sprintf(pending->cbrack," Line %ld - Mismatched curly brackets?",
4.1306 - linecnt);
4.1307 + pending->cbrack=
4.1308 + g_strdup_printf(" Line %ld - Mismatched curly brackets?",linecnt);
4.1309 if (counters->c_unders%2)
4.1310 - sprintf(pending->unders," Line %ld - Mismatched underscores?",
4.1311 - linecnt);
4.1312 + pending->unders=
4.1313 + g_strdup_printf(" Line %ld - Mismatched underscores?",linecnt);
4.1314 }
4.1315
4.1316 /*
4.1317 @@ -2563,50 +2617,63 @@
4.1318 }
4.1319 }
4.1320
4.1321 +gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
4.1322 +{
4.1323 + const char *word=key;
4.1324 + int *dupcnt=value;
4.1325 + if (*dupcnt)
4.1326 + printf("\nNote: Queried word %s was duplicated %d times\n",
4.1327 + word,*dupcnt);
4.1328 + return FALSE;
4.1329 +}
4.1330 +
4.1331 /*
4.1332 * procfile:
4.1333 *
4.1334 * Process one file.
4.1335 */
4.1336 -void procfile(char *filename)
4.1337 +void procfile(const char *filename)
4.1338 {
4.1339 const char *s;
4.1340 - char parastart[81]; /* first line of current para */
4.1341 - FILE *infile;
4.1342 + gchar *parastart=NULL; /* first line of current para */
4.1343 + gchar *etext,*aline;
4.1344 + gchar *etext_ptr;
4.1345 + GError *err=NULL;
4.1346 struct first_pass_results *first_pass_results;
4.1347 struct warnings *warnings;
4.1348 struct counters counters={0};
4.1349 struct line_properties last={0};
4.1350 struct parities parities={0};
4.1351 - struct pending pending={{0},};
4.1352 - int isemptyline;
4.1353 + struct pending pending={0};
4.1354 + gboolean isemptyline;
4.1355 long start_para_line=0;
4.1356 - int i,isnewpara=0,enddash=0;
4.1357 + gboolean isnewpara=FALSE,enddash=FALSE;
4.1358 last.start=CHAR_SPACE;
4.1359 - *prevline=0;
4.1360 linecnt=checked_linecnt=0;
4.1361 - infile=fopen(filename,"rb");
4.1362 - if (!infile)
4.1363 + etext=read_etext(filename,&err);
4.1364 + if (!etext)
4.1365 {
4.1366 if (pswit[STDOUT_SWITCH])
4.1367 - fprintf(stdout,"bookloupe: cannot open %s\n",filename);
4.1368 + fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
4.1369 else
4.1370 - fprintf(stderr,"bookloupe: cannot open %s\n",filename);
4.1371 + fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
4.1372 exit(1);
4.1373 }
4.1374 fprintf(stdout,"\n\nFile: %s\n\n",filename);
4.1375 - first_pass_results=first_pass(infile);
4.1376 + first_pass_results=first_pass(etext);
4.1377 warnings=report_first_pass(first_pass_results);
4.1378 + qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
4.1379 + qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
4.1380 /*
4.1381 * Here we go with the main pass. Hold onto yer hat!
4.1382 */
4.1383 - rewind(infile);
4.1384 linecnt=0;
4.1385 - while (flgets(aline,LINEBUFSIZE-1,infile,linecnt+1))
4.1386 + etext_ptr=etext;
4.1387 + while ((aline=flgets(&etext_ptr,linecnt+1)))
4.1388 {
4.1389 linecnt++;
4.1390 if (linecnt==1)
4.1391 - isnewpara=1;
4.1392 + isnewpara=TRUE;
4.1393 if (pswit[DP_SWITCH] && !strncmp(aline,"-----File: ",11))
4.1394 continue; // skip DP page separators completely
4.1395 if (linecnt<first_pass_results->firstline ||
4.1396 @@ -2635,8 +2702,8 @@
4.1397 /* This line is the start of a new paragraph. */
4.1398 start_para_line=linecnt;
4.1399 /* Capture its first line in case we want to report it later. */
4.1400 - strncpy(parastart,aline,80);
4.1401 - parastart[79]=0;
4.1402 + g_free(parastart);
4.1403 + parastart=g_strdup(aline);
4.1404 memset(&parities,0,sizeof(parities)); /* restart the quote count */
4.1405 s=aline;
4.1406 while (!gcisalpha(*s) && !gcisdigit(*s) && *s)
4.1407 @@ -2653,7 +2720,7 @@
4.1408 else
4.1409 cnt_punct++;
4.1410 }
4.1411 - isnewpara=0; /* Signal the end of new para processing. */
4.1412 + isnewpara=FALSE; /* Signal the end of new para processing. */
4.1413 }
4.1414 /* Check for an em-dash broken at line end. */
4.1415 if (enddash && *aline=='-')
4.1416 @@ -2665,11 +2732,11 @@
4.1417 else
4.1418 cnt_punct++;
4.1419 }
4.1420 - enddash=0;
4.1421 + enddash=FALSE;
4.1422 for (s=aline+strlen(aline)-1;*s==' ' && s>aline;s--)
4.1423 ;
4.1424 if (s>=aline && *s=='-')
4.1425 - enddash=1;
4.1426 + enddash=TRUE;
4.1427 check_for_control_characters(aline);
4.1428 if (warnings->bin)
4.1429 check_for_odd_characters(aline,warnings,isemptyline);
4.1430 @@ -2709,40 +2776,49 @@
4.1431 check_for_mismatched_quotes(&counters,&pending);
4.1432 memset(&counters,0,sizeof(counters));
4.1433 /* let the next iteration know that it's starting a new para */
4.1434 - isnewpara=1;
4.1435 - check_for_omitted_punctuation(prevline,&last,start_para_line);
4.1436 + isnewpara=TRUE;
4.1437 + if (prevline)
4.1438 + check_for_omitted_punctuation(prevline,&last,start_para_line);
4.1439 }
4.1440 - strcpy(prevline,aline);
4.1441 + g_free(prevline);
4.1442 + prevline=g_strdup(aline);
4.1443 }
4.1444 - fclose(infile);
4.1445 + if (prevline)
4.1446 + {
4.1447 + g_free(prevline);
4.1448 + prevline=NULL;
4.1449 + }
4.1450 + g_free(parastart);
4.1451 + g_free(prevline);
4.1452 + g_free(etext);
4.1453 if (!pswit[OVERVIEW_SWITCH])
4.1454 - for (i=0;i<MAX_QWORD;i++)
4.1455 - if (dupcnt[i])
4.1456 - printf("\nNote: Queried word %s was duplicated %d time%s\n",
4.1457 - qword[i],dupcnt[i],"s");
4.1458 + g_tree_foreach(qword,report_duplicate_queries,NULL);
4.1459 + g_tree_unref(qword);
4.1460 + g_tree_unref(qperiod);
4.1461 }
4.1462
4.1463 /*
4.1464 * flgets:
4.1465 *
4.1466 - * Get one line from the input stream, checking for
4.1467 + * Get one line from the input text, checking for
4.1468 * the existence of exactly one CR/LF line-end per line.
4.1469 *
4.1470 * Returns: a pointer to the line.
4.1471 */
4.1472 -char *flgets(char *theline,int maxlen,FILE *thefile,long lcnt)
4.1473 +char *flgets(char **etext,long lcnt)
4.1474 {
4.1475 char c;
4.1476 - int len,isCR,cint;
4.1477 - *theline=0;
4.1478 - len=isCR=0;
4.1479 - c=cint=fgetc(thefile);
4.1480 - do
4.1481 + int len;
4.1482 + gboolean isCR=FALSE;
4.1483 + char *theline=*etext;
4.1484 + len=0;
4.1485 + for(;;)
4.1486 {
4.1487 - if (cint==EOF)
4.1488 + c=*(*etext)++;
4.1489 + if (!c)
4.1490 return NULL;
4.1491 /* either way, it's end of line */
4.1492 - if (c==10)
4.1493 + if (c=='\n')
4.1494 {
4.1495 if (isCR)
4.1496 break;
4.1497 @@ -2752,7 +2828,7 @@
4.1498 if (pswit[LINE_END_SWITCH])
4.1499 {
4.1500 if (pswit[ECHO_SWITCH])
4.1501 - printf("\n%s\n",theline);
4.1502 + printf("\n%*.*s\n",len,len,theline);
4.1503 if (!pswit[OVERVIEW_SWITCH])
4.1504 printf(" Line %ld - No CR?\n",lcnt);
4.1505 else
4.1506 @@ -2761,7 +2837,7 @@
4.1507 break;
4.1508 }
4.1509 }
4.1510 - if (c==13)
4.1511 + if (c=='\r')
4.1512 {
4.1513 if (isCR)
4.1514 {
4.1515 @@ -2769,34 +2845,33 @@
4.1516 if (pswit[LINE_END_SWITCH])
4.1517 {
4.1518 if (pswit[ECHO_SWITCH])
4.1519 - printf("\n%s\n",theline);
4.1520 + printf("\n%*.*s\n",len,len,theline);
4.1521 if (!pswit[OVERVIEW_SWITCH])
4.1522 printf(" Line %ld - Two successive CRs?\n",lcnt);
4.1523 else
4.1524 cnt_lineend++;
4.1525 }
4.1526 }
4.1527 - isCR=1;
4.1528 + isCR=TRUE;
4.1529 }
4.1530 else
4.1531 {
4.1532 if (pswit[LINE_END_SWITCH] && isCR)
4.1533 {
4.1534 if (pswit[ECHO_SWITCH])
4.1535 - printf("\n%s\n",theline);
4.1536 + printf("\n%*.*s\n",len,len,theline);
4.1537 if (!pswit[OVERVIEW_SWITCH])
4.1538 printf(" Line %ld column %d - CR without LF?\n",
4.1539 lcnt,len+1);
4.1540 else
4.1541 cnt_lineend++;
4.1542 + theline[len]=' ';
4.1543 }
4.1544 - theline[len]=c;
4.1545 + isCR=FALSE;
4.1546 len++;
4.1547 - theline[len]=0;
4.1548 - isCR=0;
4.1549 }
4.1550 - c=cint=fgetc(thefile);
4.1551 - } while(len<maxlen);
4.1552 + }
4.1553 + theline[len]='\0';
4.1554 if (pswit[MARKUP_SWITCH])
4.1555 postprocess_for_HTML(theline);
4.1556 if (pswit[DP_SWITCH])
4.1557 @@ -2813,10 +2888,10 @@
4.1558 *
4.1559 * Returns: 0 if no error found, 1 if error.
4.1560 */
4.1561 -int mixdigit(char *checkword)
4.1562 +int mixdigit(const char *checkword)
4.1563 {
4.1564 int wehaveadigit,wehavealetter,firstdigits,query,wl;
4.1565 - char *s;
4.1566 + const char *s;
4.1567 wehaveadigit=wehavealetter=query=0;
4.1568 for (s=checkword;*s;s++)
4.1569 if (gcisalpha(*s))
4.1570 @@ -2832,17 +2907,20 @@
4.1571 for (firstdigits=0;gcisdigit(checkword[firstdigits]);firstdigits++)
4.1572 ;
4.1573 /* digits, ending in st, rd, nd, th of either case */
4.1574 - if (firstdigits+2==wl && (matchword(checkword+wl-2,"st") ||
4.1575 - matchword(checkword+wl-2,"rd") || matchword(checkword+wl-2,"nd") ||
4.1576 - matchword(checkword+wl-2,"th")))
4.1577 + if (firstdigits+2==wl && (!g_ascii_strcasecmp(checkword+wl-2,"st") ||
4.1578 + !g_ascii_strcasecmp(checkword+wl-2,"rd") ||
4.1579 + !g_ascii_strcasecmp(checkword+wl-2,"nd") ||
4.1580 + !g_ascii_strcasecmp(checkword+wl-2,"th")))
4.1581 query=0;
4.1582 - if (firstdigits+3==wl && (matchword(checkword+wl-3,"sts") ||
4.1583 - matchword(checkword+wl-3,"rds") || matchword(checkword+wl-3,"nds") ||
4.1584 - matchword(checkword+wl-3,"ths")))
4.1585 + if (firstdigits+3==wl && (!g_ascii_strcasecmp(checkword+wl-3,"sts") ||
4.1586 + !g_ascii_strcasecmp(checkword+wl-3,"rds") ||
4.1587 + !g_ascii_strcasecmp(checkword+wl-3,"nds") ||
4.1588 + !g_ascii_strcasecmp(checkword+wl-3,"ths")))
4.1589 query=0;
4.1590 - if (firstdigits+3==wl && (matchword(checkword+wl-4,"stly") ||
4.1591 - matchword(checkword+wl-4,"rdly") ||
4.1592 - matchword(checkword+wl-4,"ndly") || matchword(checkword+wl-4,"thly")))
4.1593 + if (firstdigits+3==wl && (!g_ascii_strcasecmp(checkword+wl-4,"stly") ||
4.1594 + !g_ascii_strcasecmp(checkword+wl-4,"rdly") ||
4.1595 + !g_ascii_strcasecmp(checkword+wl-4,"ndly") ||
4.1596 + !g_ascii_strcasecmp(checkword+wl-4,"thly")))
4.1597 query=0;
4.1598 /* digits, ending in l, L, s or d */
4.1599 if (firstdigits+1==wl && (checkword[wl-1]=='l' ||
4.1600 @@ -2864,20 +2942,20 @@
4.1601 /*
4.1602 * getaword:
4.1603 *
4.1604 - * Extracts the first/next "word" from the line, and puts
4.1605 - * it into "thisword". A word is defined as one English word unit--or
4.1606 - * at least that's the aim.
4.1607 + * Extracts the first/next "word" from the line, and returns it.
4.1608 + * A word is defined as one English word unit--or at least that's the aim.
4.1609 + * "ptr" is advanced to the position in the line where we will start
4.1610 + * looking for the next word.
4.1611 *
4.1612 - * Returns: a pointer to the position in the line where we will start
4.1613 - * looking for the next word.
4.1614 + * Returns: A newly-allocated string.
4.1615 */
4.1616 -const char *getaword(const char *fromline,char *thisword)
4.1617 +gchar *getaword(const char **ptr)
4.1618 {
4.1619 - int i,wordlen;
4.1620 + int i;
4.1621 const char *s;
4.1622 - wordlen=0;
4.1623 - for (;!gcisdigit(*fromline) && !gcisalpha(*fromline) && *fromline;
4.1624 - fromline++)
4.1625 + GString *word;
4.1626 + word=g_string_new(NULL);
4.1627 + for (;!gcisdigit(**ptr) && !gcisalpha(**ptr) && **ptr;(*ptr)++)
4.1628 ;
4.1629 /*
4.1630 * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
4.1631 @@ -2887,64 +2965,25 @@
4.1632 * If found, it returns this whole pattern as a word; otherwise we discard
4.1633 * the results and resume our normal programming.
4.1634 */
4.1635 - s=fromline;
4.1636 - for (;(gcisdigit(*s) || gcisalpha(*s) || *s==',' || *s=='.') &&
4.1637 - wordlen<MAXWORDLEN;s++)
4.1638 + s=*ptr;
4.1639 + for (;gcisdigit(*s) || gcisalpha(*s) || *s==',' || *s=='.';s++)
4.1640 + g_string_append_c(word,*s);
4.1641 + for (i=1;i+1<word->len;i++)
4.1642 {
4.1643 - thisword[wordlen]=*s;
4.1644 - wordlen++;
4.1645 - }
4.1646 - thisword[wordlen]=0;
4.1647 - for (i=1;i<wordlen-1;i++)
4.1648 - {
4.1649 - if (thisword[i]=='.' || thisword[i]==',')
4.1650 + if (word->str[i]=='.' || word->str[i]==',')
4.1651 {
4.1652 - if (gcisdigit(thisword[i-1]) && gcisdigit(thisword[i-1]))
4.1653 + if (gcisdigit(word->str[i-1]) && gcisdigit(word->str[i-1]))
4.1654 {
4.1655 - fromline=s;
4.1656 - return fromline;
4.1657 + *ptr=s;
4.1658 + return g_string_free(word,FALSE);
4.1659 }
4.1660 }
4.1661 }
4.1662 /* we didn't find a punctuated number - do the regular getword thing */
4.1663 - wordlen=0;
4.1664 - for (;(gcisdigit(*fromline) || gcisalpha(*fromline) || *fromline=='\'') &&
4.1665 - wordlen<MAXWORDLEN;fromline++)
4.1666 - {
4.1667 - thisword[wordlen]=*fromline;
4.1668 - wordlen++;
4.1669 - }
4.1670 - thisword[wordlen]=0;
4.1671 - return fromline;
4.1672 -}
4.1673 -
4.1674 -/*
4.1675 - * matchword:
4.1676 - *
4.1677 - * A case-insensitive string matcher.
4.1678 - */
4.1679 -int matchword(char *checkfor,char *thisword)
4.1680 -{
4.1681 - unsigned int ismatch,i;
4.1682 - if (strlen(checkfor)!=strlen(thisword))
4.1683 - return 0;
4.1684 - ismatch=1; /* assume a match until we find a difference */
4.1685 - for (i=0;i<strlen(checkfor);i++)
4.1686 - if (toupper(checkfor[i])!=toupper(thisword[i]))
4.1687 - ismatch=0;
4.1688 - return ismatch;
4.1689 -}
4.1690 -
4.1691 -/*
4.1692 - * lowerit:
4.1693 - *
4.1694 - * Lowercase the line.
4.1695 - */
4.1696 -void lowerit(char *theline)
4.1697 -{
4.1698 - for (;*theline;theline++)
4.1699 - if (*theline>='A' && *theline<='Z')
4.1700 - *theline+=32;
4.1701 + g_string_truncate(word,0);
4.1702 + for (;gcisdigit(**ptr) || gcisalpha(**ptr) || **ptr=='\'';(*ptr)++)
4.1703 + g_string_append_c(word,**ptr);
4.1704 + return g_string_free(word,FALSE);
4.1705 }
4.1706
4.1707 /*
4.1708 @@ -2961,11 +3000,11 @@
4.1709 * XL or an optional XC, an optional IX or IV, an optional V and any number
4.1710 * of optional Is.
4.1711 */
4.1712 -int isroman(char *t)
4.1713 +gboolean isroman(const char *t)
4.1714 {
4.1715 - char *s;
4.1716 + const char *s;
4.1717 if (!t || !*t)
4.1718 - return 0;
4.1719 + return FALSE;
4.1720 s=t;
4.1721 while (*t=='m' && *t)
4.1722 t++;
4.1723 @@ -3006,19 +3045,19 @@
4.1724 * errors. gcisalpha() recognizes accented letters from the CP1252 (Windows)
4.1725 * and ISO-8859-1 character sets, which are the most common PG 8-bit types.
4.1726 */
4.1727 -int gcisalpha(unsigned char c)
4.1728 +gboolean gcisalpha(unsigned char c)
4.1729 {
4.1730 if (c>='a' && c<='z')
4.1731 - return 1;
4.1732 + return TRUE;
4.1733 if (c>='A' && c<='Z')
4.1734 - return 1;
4.1735 + return TRUE;
4.1736 if (c<140)
4.1737 - return 0;
4.1738 + return FALSE;
4.1739 if (c>=192 && c!=208 && c!=215 && c!=222 && c!=240 && c!=247 && c!=254)
4.1740 - return 1;
4.1741 + return TRUE;
4.1742 if (c==140 || c==142 || c==156 || c==158 || c==159)
4.1743 - return 1;
4.1744 - return 0;
4.1745 + return TRUE;
4.1746 + return FALSE;
4.1747 }
4.1748
4.1749 /*
4.1750 @@ -3026,7 +3065,7 @@
4.1751 *
4.1752 * A version of isdigit() that doesn't get confused in 8-bit texts.
4.1753 */
4.1754 -int gcisdigit(unsigned char c)
4.1755 +gboolean gcisdigit(unsigned char c)
4.1756 {
4.1757 return c>='0' && c<='9';
4.1758 }
4.1759 @@ -3037,24 +3076,12 @@
4.1760 * A version of isletter() that doesn't get confused in 8-bit texts.
4.1761 * NB: this is ISO-8891-1-specific.
4.1762 */
4.1763 -int gcisletter(unsigned char c)
4.1764 +gboolean gcisletter(unsigned char c)
4.1765 {
4.1766 return c>='A' && c<='Z' || c>='a' && c<='z' || c>=192;
4.1767 }
4.1768
4.1769 /*
4.1770 - * gcstrchr:
4.1771 - *
4.1772 - * Wraps strchr to return NULL if the character being searched for is zero.
4.1773 - */
4.1774 -char *gcstrchr(char *s,char c)
4.1775 -{
4.1776 - if (!c)
4.1777 - return NULL;
4.1778 - return strchr(s,c);
4.1779 -}
4.1780 -
4.1781 -/*
4.1782 * postprocess_for_DP:
4.1783 *
4.1784 * Invoked with the -d switch from flgets().
4.1785 @@ -3097,7 +3124,7 @@
4.1786 */
4.1787 void postprocess_for_HTML(char *theline)
4.1788 {
4.1789 - if (strstr(theline,"<") && strstr(theline,">"))
4.1790 + if (strchr(theline,'<') && strchr(theline,'>'))
4.1791 while (losemarkup(theline))
4.1792 ;
4.1793 while (loseentities(theline))
4.1794 @@ -3171,9 +3198,9 @@
4.1795 return NULL;
4.1796 }
4.1797
4.1798 -int tagcomp(char *strin,char *basetag)
4.1799 +int tagcomp(const char *strin,const char *basetag)
4.1800 {
4.1801 - char *s,*t;
4.1802 + const char *s,*t;
4.1803 s=basetag;
4.1804 t=strin;
4.1805 if (*t=='/')
4.1806 @@ -3188,8 +3215,9 @@
4.1807 return 0;
4.1808 }
4.1809
4.1810 -void proghelp()
4.1811 +void proghelp(GOptionContext *context)
4.1812 {
4.1813 + gchar *help;
4.1814 fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
4.1815 fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
4.1816 fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
4.1817 @@ -3198,22 +3226,10 @@
4.1818 fputs("This is Free Software; "
4.1819 "you may redistribute it under certain conditions (GPL);\n",stderr);
4.1820 fputs("read the file COPYING for details.\n\n",stderr);
4.1821 - fputs("Usage is: bookloupe [-setpxloyhud] filename\n",stderr);
4.1822 - fputs(" where -s checks single quotes, -e suppresses echoing lines, "
4.1823 - "-t checks typos\n",stderr);
4.1824 - fputs(" -x (paranoid) switches OFF -t and extra checks, "
4.1825 - "-l turns OFF line-end checks\n",stderr);
4.1826 - fputs(" -o just displays overview without detail, "
4.1827 - "-h echoes header fields\n",stderr);
4.1828 - fputs(" -v (verbose) unsuppresses duplicate reporting, "
4.1829 - "-m suppresses markup\n",stderr);
4.1830 - fputs(" -d ignores DP-specific markup,\n",stderr);
4.1831 - fputs(" -u uses a file gutcheck.typ to query user-defined "
4.1832 - "possible typos\n",stderr);
4.1833 - fputs("Sample usage: bookloupe warpeace.txt \n",stderr);
4.1834 - fputs("\n",stderr);
4.1835 - fputs("Bookloupe looks for errors in Project Gutenberg(TM) etexts.\n",
4.1836 - stderr);
4.1837 + help=g_option_context_get_help(context,TRUE,NULL);
4.1838 + fputs(help,stderr);
4.1839 + g_free(help);
4.1840 + fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
4.1841 fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
4.1842 "non-ASCII\n",stderr);
4.1843 fputs("characters like accented letters, "
5.1 --- a/configure.ac Mon May 27 09:03:04 2013 +0100
5.2 +++ b/configure.ac Tue May 28 15:17:19 2013 +0100
5.3 @@ -13,7 +13,8 @@
5.4 test/compatibility/Makefile
5.5 doc/Makefile
5.6 ])
5.7 -AM_INIT_AUTOMAKE(no-define)
5.8 +AM_INIT_AUTOMAKE(no-define,1.11)
5.9 +AM_SILENT_RULES([yes])
5.10 AC_CANONICAL_HOST
5.11
5.12 ##################################################
6.1 --- a/test/compatibility/user-defined-typo.tst Mon May 27 09:03:04 2013 +0100
6.2 +++ b/test/compatibility/user-defined-typo.tst Tue May 28 15:17:19 2013 +0100
6.3 @@ -1,6 +1,6 @@
6.4 **************** OPTIONS ****************
6.5 -u
6.6 -**************** INPUT(gutcheck.typ) ****************
6.7 +**************** INPUT(bookloupe.typ) ****************
6.8 arid
6.9 **************** INPUT ****************
6.10 I am the very model of a modern Major-General,
7.1 --- a/test/harness/testcaseparser.c Mon May 27 09:03:04 2013 +0100
7.2 +++ b/test/harness/testcaseparser.c Tue May 28 15:17:19 2013 +0100
7.3 @@ -91,9 +91,12 @@
7.4 {
7.5 TestcaseParser *parser;
7.6 gsize len;
7.7 + GError *err=NULL;
7.8 parser=g_new0(TestcaseParser,1);
7.9 - if (!file_get_contents_text(filename,&parser->contents,&len))
7.10 + if (!file_get_contents_text(filename,&parser->contents,&len,&err))
7.11 {
7.12 + g_printerr("%s: %s\n",filename,err->message);
7.13 + g_error_free(err);
7.14 g_free(parser);
7.15 return NULL;
7.16 }