1.1 --- a/bookloupe/bookloupe.c Mon May 27 09:03:04 2013 +0100
1.2 +++ b/bookloupe/bookloupe.c Tue May 28 15:17:19 2013 +0100
1.3 @@ -22,19 +22,10 @@
1.4 #include <stdlib.h>
1.5 #include <string.h>
1.6 #include <ctype.h>
1.7 +#include <glib.h>
1.8 +#include <bl/bl.h>
1.9
1.10 -#define MAXWORDLEN 80 /* max length of one word */
1.11 -#define LINEBUFSIZE 2048 /* buffer size for an input line */
1.12 -
1.13 -#define MAX_USER_TYPOS 1000
1.14 -#define USERTYPO_FILE "gutcheck.typ"
1.15 -
1.16 -#ifndef MAX_PATH
1.17 -#define MAX_PATH 16384
1.18 -#endif
1.19 -
1.20 -char aline[LINEBUFSIZE];
1.21 -char prevline[LINEBUFSIZE];
1.22 +gchar *prevline;
1.23
1.24 /* Common typos. */
1.25 char *typo[] = {
1.26 @@ -70,7 +61,7 @@
1.27 "se", ""
1.28 };
1.29
1.30 -char *usertypo[MAX_USER_TYPOS];
1.31 +GTree *usertypo;
1.32
1.33 /* Common abbreviations and other OK words not to query as typos. */
1.34 char *okword[] = {
1.35 @@ -282,46 +273,57 @@
1.36 #define WAY_TOO_LONG 80
1.37 #define SHORTEST_PG_LINE 55
1.38
1.39 -#define SWITCHES "ESTPXLOYHWVMUD" /* switches:- */
1.40 - /* D - ignore DP-specific markup */
1.41 - /* E - echo queried line */
1.42 - /* S - check single quotes */
1.43 - /* T - check common typos */
1.44 - /* P - require closure of quotes on */
1.45 - /* every paragraph */
1.46 - /* X - "Trust no one" :-) Paranoid! */
1.47 - /* Queries everything */
1.48 - /* L - line end checking defaults on */
1.49 - /* -L turns it off */
1.50 - /* O - overview. Just shows counts. */
1.51 - /* Y - puts errors to stdout */
1.52 - /* instead of stderr */
1.53 - /* H - Echoes header fields */
1.54 - /* M - Ignore markup in < > */
1.55 - /* U - Use file of User-defined Typos */
1.56 - /* W - Defaults for use on Web upload */
1.57 - /* V - Verbose - list EVERYTHING! */
1.58 -#define SWITNO 14 /* max number of switch parms */
1.59 - /* - used for defining array-size */
1.60 -#define MINARGS 1 /* minimum no of args excl switches */
1.61 -#define MAXARGS 1 /* maximum no of args excl switches */
1.62 +enum {
1.63 + ECHO_SWITCH,
1.64 + SQUOTE_SWITCH,
1.65 + TYPO_SWITCH,
1.66 + QPARA_SWITCH,
1.67 + PARANOID_SWITCH,
1.68 + LINE_END_SWITCH,
1.69 + OVERVIEW_SWITCH,
1.70 + STDOUT_SWITCH,
1.71 + HEADER_SWITCH,
1.72 + WEB_SWITCH,
1.73 + VERBOSE_SWITCH,
1.74 + MARKUP_SWITCH,
1.75 + USERTYPO_SWITCH,
1.76 + DP_SWITCH,
1.77 + SWITNO
1.78 +};
1.79
1.80 -int pswit[SWITNO]; /* program switches set by SWITCHES */
1.81 +gboolean pswit[SWITNO]; /* program switches */
1.82
1.83 -#define ECHO_SWITCH 0
1.84 -#define SQUOTE_SWITCH 1
1.85 -#define TYPO_SWITCH 2
1.86 -#define QPARA_SWITCH 3
1.87 -#define PARANOID_SWITCH 4
1.88 -#define LINE_END_SWITCH 5
1.89 -#define OVERVIEW_SWITCH 6
1.90 -#define STDOUT_SWITCH 7
1.91 -#define HEADER_SWITCH 8
1.92 -#define WEB_SWITCH 9
1.93 -#define VERBOSE_SWITCH 10
1.94 -#define MARKUP_SWITCH 11
1.95 -#define USERTYPO_SWITCH 12
1.96 -#define DP_SWITCH 13
1.97 +static GOptionEntry options[]={
1.98 + { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
1.99 + "Ignore DP-specific markup", NULL },
1.100 + { "noecho", 'e', 0, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH,
1.101 + "Don't echo queried line", NULL },
1.102 + { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH,
1.103 + "Check single quotes", NULL },
1.104 + { "typo", 't', 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH,
1.105 + "Check common typos", NULL },
1.106 + { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH,
1.107 + "Require closure of quotes on every paragraph", NULL },
1.108 + { "relaxed", 'x', 0, G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH,
1.109 + "Disable paranoid querying of everything", NULL },
1.110 + { "line-end", 'l', 0, G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH,
1.111 + "Disable line end checking", NULL },
1.112 + { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH,
1.113 + "Overview: just show counts", NULL },
1.114 + { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH,
1.115 + "Output errors to stdout instead of stderr", NULL },
1.116 + { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH,
1.117 + "Echo header fields", NULL },
1.118 + { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH,
1.119 + "Ignore markup in < >", NULL },
1.120 + { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH,
1.121 + "Use file of user-defined typos", NULL },
1.122 + { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH,
1.123 + "Defaults for use on www upload", NULL },
1.124 + { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
1.125 + "Verbose - list everything", NULL },
1.126 + { NULL }
1.127 +};
1.128
1.129 long cnt_dquot; /* for overview mode, count of doublequote queries */
1.130 long cnt_squot; /* for overview mode, count of singlequote queries */
1.131 @@ -340,47 +342,26 @@
1.132 long linecnt; /* count of total lines in the file */
1.133 long checked_linecnt; /* count of lines actually checked */
1.134
1.135 -void proghelp(void);
1.136 -void procfile(char *);
1.137 +void proghelp(GOptionContext *context);
1.138 +void procfile(const char *);
1.139
1.140 -#define LOW_THRESHOLD 0
1.141 -#define HIGH_THRESHOLD 1
1.142 +gchar *running_from;
1.143
1.144 -#define START 0
1.145 -#define END 1
1.146 -#define PREV 0
1.147 -#define NEXT 1
1.148 -#define FIRST_OF_PAIR 0
1.149 -#define SECOND_OF_PAIR 1
1.150 -
1.151 -#define MAX_WORDPAIR 1000
1.152 -
1.153 -char running_from[MAX_PATH];
1.154 -
1.155 -int mixdigit(char *);
1.156 -const char *getaword(const char *,char *);
1.157 -int matchword(char *,char *);
1.158 -char *flgets(char *,int,FILE *,long);
1.159 -void lowerit(char *);
1.160 -int gcisalpha(unsigned char);
1.161 -int gcisdigit(unsigned char);
1.162 -int gcisletter(unsigned char);
1.163 -char *gcstrchr(char *s,char c);
1.164 +int mixdigit(const char *);
1.165 +gchar *getaword(const char **);
1.166 +char *flgets(char **,long);
1.167 +gboolean gcisalpha(unsigned char);
1.168 +gboolean gcisdigit(unsigned char);
1.169 +gboolean gcisletter(unsigned char);
1.170 void postprocess_for_HTML(char *);
1.171 char *linehasmarkup(char *);
1.172 char *losemarkup(char *);
1.173 -int tagcomp(char *,char *);
1.174 +int tagcomp(const char *,const char *);
1.175 char *loseentities(char *);
1.176 -int isroman(char *);
1.177 -int usertypo_count;
1.178 +gboolean isroman(const char *);
1.179 void postprocess_for_DP(char *);
1.180
1.181 -char wrk[LINEBUFSIZE];
1.182 -
1.183 -#define MAX_QWORD 50
1.184 -#define MAX_QWORD_LENGTH 40
1.185 -char qword[MAX_QWORD][MAX_QWORD_LENGTH];
1.186 -int dupcnt[MAX_QWORD];
1.187 +GTree *qword,*qperiod;
1.188
1.189 struct first_pass_results {
1.190 long firstline,astline;
1.191 @@ -392,7 +373,8 @@
1.192
1.193 struct warnings {
1.194 int shortline,longline,bin,dash,dotcomma,ast,fslash,digit,hyphen;
1.195 - int endquote,isDutch,isFrench;
1.196 + int endquote;
1.197 + gboolean isDutch,isFrench;
1.198 };
1.199
1.200 struct counters {
1.201 @@ -411,52 +393,35 @@
1.202 };
1.203
1.204 struct pending {
1.205 - char dquote[80],squote[80],rbrack[80],sbrack[80],cbrack[80],unders[80];
1.206 + char *dquote,*squote,*rbrack,*sbrack,*cbrack,*unders;
1.207 long squot;
1.208 };
1.209
1.210 -int main(int argc,char **argv)
1.211 +void parse_options(int *argc,char ***argv)
1.212 {
1.213 - char *argsw,*s;
1.214 - int i,switno,invarg;
1.215 - char usertypo_file[MAX_PATH];
1.216 - FILE *usertypofile;
1.217 - if (strlen(argv[0])<sizeof(running_from))
1.218 - /* save the path to the executable */
1.219 - strcpy(running_from,argv[0]);
1.220 - /* find out what directory we're running from */
1.221 - s=running_from+strlen(running_from);
1.222 - for (;*s!='/' && *s!='\\' && s>=running_from;s--)
1.223 - *s=0;
1.224 - switno=strlen(SWITCHES);
1.225 - for (i=switno;--i>0;)
1.226 - pswit[i]=0; /* initialise switches */
1.227 - /*
1.228 - * Standard loop to extract switches.
1.229 - * When we come out of this loop, the arguments will be
1.230 - * in argv[0] upwards and the switches used will be
1.231 - * represented by their equivalent elements in pswit[]
1.232 - */
1.233 - while (--argc>0 && **++argv=='-')
1.234 - for (argsw=argv[0]+1;*argsw!='\0';argsw++)
1.235 - for (i=switno,invarg=1;(--i>=0) && invarg==1;)
1.236 - if ((toupper(*argsw))==SWITCHES[i])
1.237 - {
1.238 - invarg=0;
1.239 - pswit[i]=1;
1.240 - }
1.241 + GError *err=NULL;
1.242 + GOptionContext *context;
1.243 + context=g_option_context_new(
1.244 + "file - looks for errors in Project Gutenberg(TM) etexts");
1.245 + g_option_context_add_main_entries(context,options,NULL);
1.246 + if (!g_option_context_parse(context,argc,argv,&err))
1.247 + {
1.248 + g_printerr("Bookloupe: %s\n",err->message);
1.249 + g_printerr("Use \"%s --help\" for help\n",(*argv)[0]);
1.250 + exit(1);
1.251 + }
1.252 /* Paranoid checking is turned OFF, not on, by its switch */
1.253 - pswit[PARANOID_SWITCH]^=1;
1.254 + pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH];
1.255 if (pswit[PARANOID_SWITCH])
1.256 - /* if running in paranoid mode force typo checks as well */
1.257 - pswit[TYPO_SWITCH]=pswit[TYPO_SWITCH]^1;
1.258 + /* if running in paranoid mode, typo checks default to enabled */
1.259 + pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH];
1.260 /* Line-end checking is turned OFF, not on, by its switch */
1.261 - pswit[LINE_END_SWITCH]^=1;
1.262 + pswit[LINE_END_SWITCH]=!pswit[LINE_END_SWITCH];
1.263 /* Echoing is turned OFF, not on, by its switch */
1.264 - pswit[ECHO_SWITCH]^=1;
1.265 + pswit[ECHO_SWITCH]=!pswit[ECHO_SWITCH];
1.266 if (pswit[OVERVIEW_SWITCH])
1.267 /* just print summary; don't echo */
1.268 - pswit[ECHO_SWITCH]=0;
1.269 + pswit[ECHO_SWITCH]=FALSE;
1.270 /*
1.271 * Web uploads - for the moment, this is really just a placeholder
1.272 * until we decide what processing we really want to do on web uploads
1.273 @@ -464,85 +429,155 @@
1.274 if (pswit[WEB_SWITCH])
1.275 {
1.276 /* specific override for web uploads */
1.277 - pswit[ECHO_SWITCH]=1;
1.278 - pswit[SQUOTE_SWITCH]=0;
1.279 - pswit[TYPO_SWITCH]=1;
1.280 - pswit[QPARA_SWITCH]=0;
1.281 - pswit[PARANOID_SWITCH]=1;
1.282 - pswit[LINE_END_SWITCH]=0;
1.283 - pswit[OVERVIEW_SWITCH]=0;
1.284 - pswit[STDOUT_SWITCH]=0;
1.285 - pswit[HEADER_SWITCH]=1;
1.286 - pswit[VERBOSE_SWITCH]=0;
1.287 - pswit[MARKUP_SWITCH]=0;
1.288 - pswit[USERTYPO_SWITCH]=0;
1.289 - pswit[DP_SWITCH]=0;
1.290 + pswit[ECHO_SWITCH]=TRUE;
1.291 + pswit[SQUOTE_SWITCH]=FALSE;
1.292 + pswit[TYPO_SWITCH]=TRUE;
1.293 + pswit[QPARA_SWITCH]=FALSE;
1.294 + pswit[PARANOID_SWITCH]=TRUE;
1.295 + pswit[LINE_END_SWITCH]=FALSE;
1.296 + pswit[OVERVIEW_SWITCH]=FALSE;
1.297 + pswit[STDOUT_SWITCH]=FALSE;
1.298 + pswit[HEADER_SWITCH]=TRUE;
1.299 + pswit[VERBOSE_SWITCH]=FALSE;
1.300 + pswit[MARKUP_SWITCH]=FALSE;
1.301 + pswit[USERTYPO_SWITCH]=FALSE;
1.302 + pswit[DP_SWITCH]=FALSE;
1.303 }
1.304 - if (argc<MINARGS || argc>MAXARGS)
1.305 + if (*argc<2)
1.306 {
1.307 - /* check number of args */
1.308 - proghelp();
1.309 - return 1;
1.310 + proghelp(context);
1.311 + exit(1);
1.312 }
1.313 - /* read in the user-defined stealth scanno list */
1.314 + g_option_context_free(context);
1.315 +}
1.316 +
1.317 +/*
1.318 + * read_user_scannos:
1.319 + *
1.320 + * Read in the user-defined stealth scanno list.
1.321 + */
1.322 +void read_user_scannos(void)
1.323 +{
1.324 + GError *err=NULL;
1.325 + gchar *usertypo_file;
1.326 + gboolean okay;
1.327 + int i;
1.328 + gsize len;
1.329 + gchar *contents,**lines;
1.330 + usertypo_file=g_strdup("bookloupe.typ");
1.331 + okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
1.332 + if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
1.333 + {
1.334 + g_clear_error(&err);
1.335 + g_free(usertypo_file);
1.336 + usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL);
1.337 + okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
1.338 + }
1.339 + if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
1.340 + {
1.341 + g_clear_error(&err);
1.342 + g_free(usertypo_file);
1.343 + usertypo_file=g_strdup("gutcheck.typ");
1.344 + okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
1.345 + }
1.346 + if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
1.347 + {
1.348 + g_clear_error(&err);
1.349 + g_free(usertypo_file);
1.350 + usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL);
1.351 + okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
1.352 + }
1.353 + if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
1.354 + {
1.355 + g_free(usertypo_file);
1.356 + printf(" --> I couldn't find bookloupe.typ "
1.357 + "-- proceeding without user typos.\n");
1.358 + return;
1.359 + }
1.360 + else if (!okay)
1.361 + {
1.362 + fprintf(stderr,"%s: %s\n",usertypo_file,err->message);
1.363 + g_free(usertypo_file);
1.364 + g_clear_error(&err);
1.365 + exit(1);
1.366 + }
1.367 + lines=g_strsplit(contents,"\n",0);
1.368 + usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
1.369 + for (i=0;lines[i];i++)
1.370 + if (*(unsigned char *)lines[i]>'!')
1.371 + g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1));
1.372 + else
1.373 + g_free(lines[i]);
1.374 + g_free(lines);
1.375 +}
1.376 +
1.377 +#if 0
1.378 +/*
1.379 + * read_etext:
1.380 + *
1.381 + * Read an etext returning an array of lines. Lines are normally expected
1.382 + * to be terminated by CR LF. Solitary LFs delimit lines but are left
1.383 + * embedded at the end of the line for further processing. Solitary CRs
1.384 + * do not delimit lines.
1.385 + */
1.386 +gchar **read_etext(const char *filename,GError **err)
1.387 +{
1.388 + int i;
1.389 + const char *s,*t;
1.390 + gchar *contents;
1.391 + gchar **raw_lines;
1.392 + GPtrArray *lines;
1.393 + gsize len;
1.394 + if (!g_file_get_contents(filename,&contents,&len,err))
1.395 + return NULL;
1.396 + raw_lines=g_strsplit(contents,"\r\n",0);
1.397 + lines=g_ptr_array_sized_new(g_strv_length(raw_lines)+1);
1.398 + for (i=0;raw_lines[i];i++)
1.399 + {
1.400 + t=strchr(raw_lines[i],'\n');
1.401 + if (t)
1.402 + {
1.403 + s=raw_lines[i];
1.404 + while ((t=strchr(s,'\n')))
1.405 + {
1.406 + g_ptr_array_add(lines,g_strndup(s,t-s+1));
1.407 + s=t+1;
1.408 + }
1.409 + g_ptr_array_add(lines,g_strdup(s));
1.410 + g_free(raw_lines[i]);
1.411 + }
1.412 + else
1.413 + g_ptr_array_add(lines,raw_lines[i]);
1.414 + }
1.415 + g_free(raw_lines);
1.416 + g_ptr_array_add(lines,NULL);
1.417 + return (gchar **)g_ptr_array_free(lines,FALSE);
1.418 +}
1.419 +#else
1.420 +/*
1.421 + * read_etext:
1.422 + *
1.423 + * Read an etext returning a newly allocated string containing the file
1.424 + * contents or NULL on error.
1.425 + */
1.426 +gchar *read_etext(const char *filename,GError **err)
1.427 +{
1.428 + gchar *contents;
1.429 + gsize len;
1.430 + if (!g_file_get_contents(filename,&contents,&len,err))
1.431 + return NULL;
1.432 + return contents;
1.433 +}
1.434 +#endif
1.435 +
1.436 +int main(int argc,char **argv)
1.437 +{
1.438 + running_from=g_path_get_dirname(argv[0]);
1.439 + parse_options(&argc,&argv);
1.440 if (pswit[USERTYPO_SWITCH])
1.441 - {
1.442 - /* ... we were told we had one! */
1.443 - usertypofile=fopen(USERTYPO_FILE,"rb");
1.444 - if (!usertypofile)
1.445 - {
1.446 - /* not in cwd. try excuteable directory. */
1.447 - strcpy(usertypo_file,running_from);
1.448 - strcat(usertypo_file,USERTYPO_FILE);
1.449 - usertypofile=fopen(usertypo_file,"rb");
1.450 - if (!usertypofile) {
1.451 - /* we ain't got no user typo file! */
1.452 - printf(" --> I couldn't find gutcheck.typ "
1.453 - "-- proceeding without user typos.\n");
1.454 - }
1.455 - }
1.456 - usertypo_count=0;
1.457 - if (usertypofile)
1.458 - {
1.459 - /* we managed to open a User Typo File! */
1.460 - if (pswit[USERTYPO_SWITCH])
1.461 - {
1.462 - while (flgets(aline,LINEBUFSIZE-1,usertypofile,
1.463 - (long)usertypo_count))
1.464 - {
1.465 - if (strlen(aline)>1)
1.466 - {
1.467 - if ((int)*aline>33)
1.468 - {
1.469 - s=malloc(strlen(aline)+1);
1.470 - if (!s)
1.471 - {
1.472 - fprintf(stderr,"bookloupe: cannot get enough "
1.473 - "memory for user typo file!\n");
1.474 - exit(1);
1.475 - }
1.476 - strcpy(s,aline);
1.477 - usertypo[usertypo_count]=s;
1.478 - usertypo_count++;
1.479 - if (usertypo_count>=MAX_USER_TYPOS)
1.480 - {
1.481 - printf(" --> Only %d user-defined typos "
1.482 - "allowed: ignoring the rest\n",
1.483 - MAX_USER_TYPOS);
1.484 - break;
1.485 - }
1.486 - }
1.487 - }
1.488 - }
1.489 - }
1.490 - fclose(usertypofile);
1.491 - }
1.492 - }
1.493 + read_user_scannos();
1.494 fprintf(stderr,"bookloupe: Check and report on an e-text\n");
1.495 - cnt_dquot=cnt_squot=cnt_brack=cnt_bin=cnt_odd=cnt_long=
1.496 - cnt_short=cnt_punct=cnt_dash=cnt_word=cnt_html=cnt_lineend=
1.497 - cnt_spacend=0;
1.498 - procfile(argv[0]);
1.499 + procfile(argv[1]);
1.500 if (pswit[OVERVIEW_SWITCH])
1.501 {
1.502 printf(" Checked %ld lines of %ld (head+foot = %ld)\n\n",
1.503 @@ -577,6 +612,9 @@
1.504 cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+
1.505 cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend);
1.506 }
1.507 + g_free(running_from);
1.508 + if (usertypo)
1.509 + g_tree_unref(usertypo);
1.510 return 0;
1.511 }
1.512
1.513 @@ -588,28 +626,33 @@
1.514 * occur many times in the text like long or short
1.515 * lines, non-standard dashes, etc.
1.516 */
1.517 -struct first_pass_results *first_pass(FILE *infile)
1.518 +struct first_pass_results *first_pass(const char *etext)
1.519 {
1.520 char laststart=CHAR_SPACE;
1.521 const char *s;
1.522 - int i,llen;
1.523 + gchar *lc_line;
1.524 + int i,j,llen;
1.525 + gchar **lines;
1.526 unsigned int lastlen=0,lastblen=0;
1.527 long spline=0,nspline=0;
1.528 static struct first_pass_results results={0};
1.529 - char inword[MAXWORDLEN]="";
1.530 - while (fgets(aline,LINEBUFSIZE-1,infile))
1.531 + gchar *inword;
1.532 + lines=g_strsplit(etext,"\n",0);
1.533 + for (j=0;lines[j];j++)
1.534 {
1.535 - while (aline[strlen(aline)-1]==10 || aline[strlen(aline)-1]==13)
1.536 - aline[strlen(aline)-1]=0;
1.537 + llen=strlen(lines[j]);
1.538 + while(lines[j][llen-1]=='\r')
1.539 + lines[j][llen--]='\0';
1.540 linecnt++;
1.541 - if (strstr(aline,"*END") && strstr(aline,"SMALL PRINT") &&
1.542 - (strstr(aline,"PUBLIC DOMAIN") || strstr(aline,"COPYRIGHT")))
1.543 + if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
1.544 + (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
1.545 {
1.546 if (spline)
1.547 printf(" --> Duplicate header?\n");
1.548 spline=linecnt+1; /* first line of non-header text, that is */
1.549 }
1.550 - if (!strncmp(aline,"*** START",9) && strstr(aline,"PROJECT GUTENBERG"))
1.551 + if (!strncmp(lines[j],"*** START",9) &&
1.552 + strstr(lines[j],"PROJECT GUTENBERG"))
1.553 {
1.554 if (nspline)
1.555 printf(" --> Duplicate header?\n");
1.556 @@ -617,10 +660,10 @@
1.557 }
1.558 if (spline || nspline)
1.559 {
1.560 - lowerit(aline);
1.561 - if (strstr(aline,"end") && strstr(aline,"project gutenberg"))
1.562 + lc_line=g_ascii_strdown(lines[j],llen);
1.563 + if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
1.564 {
1.565 - if (strstr(aline,"end")<strstr(aline,"project gutenberg"))
1.566 + if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
1.567 {
1.568 if (results.footerline)
1.569 {
1.570 @@ -632,6 +675,7 @@
1.571 results.footerline=linecnt;
1.572 }
1.573 }
1.574 + g_free(lc_line);
1.575 }
1.576 if (spline)
1.577 results.firstline=spline;
1.578 @@ -639,85 +683,83 @@
1.579 results.firstline=nspline; /* override with new */
1.580 if (results.footerline)
1.581 continue; /* don't count the boilerplate in the footer */
1.582 - llen=strlen(aline);
1.583 results.totlen+=llen;
1.584 for (i=0;i<llen;i++)
1.585 {
1.586 - if ((unsigned char)aline[i]>127)
1.587 + if ((unsigned char)lines[j][i]>127)
1.588 results.binlen++;
1.589 - if (gcisalpha(aline[i]))
1.590 + if (gcisalpha(lines[j][i]))
1.591 results.alphalen++;
1.592 - if (i>0 && aline[i]==CHAR_DQUOTE && isalpha(aline[i-1]))
1.593 + if (i>0 && lines[j][i]==CHAR_DQUOTE && isalpha(lines[j][i-1]))
1.594 results.endquote_count++;
1.595 }
1.596 - if (strlen(aline)>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE &&
1.597 - lastblen>2 && lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
1.598 + if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
1.599 + lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
1.600 results.shortline++;
1.601 - if (*aline && (unsigned char)aline[strlen(aline)-1]<=CHAR_SPACE)
1.602 + if (llen>0 && (unsigned char)lines[j][llen-1]<=CHAR_SPACE)
1.603 cnt_spacend++;
1.604 - if (strstr(aline,".,"))
1.605 + if (strstr(lines[j],".,"))
1.606 results.dotcomma++;
1.607 /* only count ast lines for ignoring purposes where there is */
1.608 /* locase text on the line */
1.609 - if (strstr(aline,"*"))
1.610 + if (strchr(lines[j],'*'))
1.611 {
1.612 - for (s=aline;*s;s++)
1.613 + for (s=lines[j];*s;s++)
1.614 if (*s>='a' && *s<='z')
1.615 break;
1.616 if (*s)
1.617 results.astline++;
1.618 }
1.619 - if (strstr(aline,"/"))
1.620 + if (strchr(lines[j],'/'))
1.621 results.fslashline++;
1.622 - for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--)
1.623 + for (i=llen-1;i>0 && (unsigned char)lines[j][i]<=CHAR_SPACE;i--)
1.624 ;
1.625 - if (aline[i]=='-' && aline[i-1]!='-')
1.626 + if (i>1 && lines[j][i]=='-' && lines[j][i-1]!='-')
1.627 results.hyphens++;
1.628 if (llen>LONGEST_PG_LINE)
1.629 results.longline++;
1.630 if (llen>WAY_TOO_LONG)
1.631 results.verylongline++;
1.632 - if (strstr(aline,"<") && strstr(aline,">"))
1.633 + if (strchr(lines[j],'<') && strchr(lines[j],'>'))
1.634 {
1.635 - i=(int)(strstr(aline,">")-strstr(aline,"<")+1);
1.636 + i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1);
1.637 if (i>0)
1.638 results.htmcount++;
1.639 - if (strstr(aline,"<i>"))
1.640 + if (strstr(lines[j],"<i>"))
1.641 results.htmcount+=4; /* bonus marks! */
1.642 }
1.643 /* Check for spaced em-dashes */
1.644 - if (strstr(aline,"--"))
1.645 + if (lines[j][0] && (s=strstr(lines[j]+1,"--")))
1.646 {
1.647 results.emdash++;
1.648 - if (*(strstr(aline,"--")-1)==CHAR_SPACE ||
1.649 - (*(strstr(aline,"--")+2)==CHAR_SPACE))
1.650 + if (s[-1]==CHAR_SPACE || (s[2]==CHAR_SPACE))
1.651 results.space_emdash++;
1.652 - if (*(strstr(aline,"--")-1)==CHAR_SPACE &&
1.653 - (*(strstr(aline,"--")+2)==CHAR_SPACE))
1.654 + if (s[-1]==CHAR_SPACE && (s[2]==CHAR_SPACE))
1.655 /* count of em-dashes with spaces both sides */
1.656 results.non_PG_space_emdash++;
1.657 - if (*(strstr(aline,"--")-1)!=CHAR_SPACE &&
1.658 - (*(strstr(aline,"--")+2)!=CHAR_SPACE))
1.659 + if (s[-1]!=CHAR_SPACE && (s[2]!=CHAR_SPACE))
1.660 /* count of PG-type em-dashes with no spaces */
1.661 results.PG_space_emdash++;
1.662 }
1.663 - for (s=aline;*s;)
1.664 + for (s=lines[j];*s;)
1.665 {
1.666 - s=getaword(s,inword);
1.667 + inword=getaword(&s);
1.668 if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))
1.669 results.Dutchcount++;
1.670 if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))
1.671 results.Frenchcount++;
1.672 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
1.673 results.standalone_digit++;
1.674 + g_free(inword);
1.675 }
1.676 /* Check for spaced dashes */
1.677 - if (strstr(aline," -") && *(strstr(aline," -")+2)!='-')
1.678 + if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-')
1.679 results.spacedash++;
1.680 lastblen=lastlen;
1.681 - lastlen=strlen(aline);
1.682 - laststart=aline[0];
1.683 + lastlen=llen;
1.684 + laststart=lines[j][0];
1.685 }
1.686 + g_strfreev(lines);
1.687 return &results;
1.688 }
1.689
1.690 @@ -856,17 +898,17 @@
1.691 "Not reporting them.\n");
1.692 warnings.bin=0;
1.693 }
1.694 - warnings.isDutch=0;
1.695 + warnings.isDutch=FALSE;
1.696 if (results->Dutchcount>50)
1.697 {
1.698 - warnings.isDutch=1;
1.699 + warnings.isDutch=TRUE;
1.700 printf(" --> This looks like Dutch - "
1.701 "switching off dashes and warnings for 's Middags case.\n");
1.702 }
1.703 - warnings.isFrench=0;
1.704 + warnings.isFrench=FALSE;
1.705 if (results->Frenchcount>50)
1.706 {
1.707 - warnings.isFrench=1;
1.708 + warnings.isFrench=TRUE;
1.709 printf(" --> This looks like French - "
1.710 "switching off some doublepunct.\n");
1.711 }
1.712 @@ -919,12 +961,14 @@
1.713 * count it, since empty lines with asterisks or dashes to
1.714 * separate sections are common.
1.715 *
1.716 - * Returns: Non-zero if the line is empty.
1.717 + * Returns: TRUE if the line is empty.
1.718 */
1.719 -int analyse_quotes(const char *s,struct counters *counters)
1.720 +gboolean analyse_quotes(const char *aline,struct counters *counters)
1.721 {
1.722 int guessquote=0;
1.723 - int isemptyline=1; /* assume the line is empty until proven otherwise */
1.724 + /* assume the line is empty until proven otherwise */
1.725 + gboolean isemptyline=TRUE;
1.726 + const char *s=aline;
1.727 while (*s)
1.728 {
1.729 if (*s==CHAR_DQUOTE)
1.730 @@ -986,7 +1030,7 @@
1.731 }
1.732 if (*s!=CHAR_SPACE && *s!='-' && *s!='.' && *s!=CHAR_ASTERISK &&
1.733 *s!=13 && *s!=10)
1.734 - isemptyline=0; /* ignore lines like * * * as spacers */
1.735 + isemptyline=FALSE; /* ignore lines like * * * as spacers */
1.736 if (*s==CHAR_UNDERSCORE)
1.737 counters->c_unders++;
1.738 if (*s==CHAR_OPEN_CBRACK)
1.739 @@ -1040,7 +1084,7 @@
1.740 * Check for binary and other odd characters.
1.741 */
1.742 void check_for_odd_characters(const char *aline,const struct warnings *warnings,
1.743 - int isemptyline)
1.744 + gboolean isemptyline)
1.745 {
1.746 /* Don't repeat multiple warnings on one line. */
1.747 int eNon_A=0,eTab=0,eTilde=0,eCarat=0,eFSlash=0,eAst=0;
1.748 @@ -1461,16 +1505,15 @@
1.749 void check_for_extra_period(const char *aline,const struct warnings *warnings)
1.750 {
1.751 const char *s,*t,*s1;
1.752 - int i,istypo,isdup;
1.753 - static char qperiod[MAX_QWORD][MAX_QWORD_LENGTH];
1.754 - static int qperiod_index=0;
1.755 - char testword[MAXWORDLEN]="";
1.756 + int i;
1.757 + gboolean istypo;
1.758 + gchar *testword;
1.759 if (pswit[PARANOID_SWITCH])
1.760 {
1.761 - for (t=s=aline;strstr(t,". ");)
1.762 + for (t=aline;strstr(t,". ");)
1.763 {
1.764 t=strstr(t,". ");
1.765 - if (t==s)
1.766 + if (t==aline)
1.767 {
1.768 t++;
1.769 /* start of line punctuation is handled elsewhere */
1.770 @@ -1497,57 +1540,48 @@
1.771 if (*s1>='a' && *s1<='z')
1.772 {
1.773 /* we have something to investigate */
1.774 - istypo=1;
1.775 + istypo=TRUE;
1.776 /* so let's go back and find out */
1.777 - for (s1=t-1;s1>=s &&
1.778 + for (s1=t-1;s1>=aline &&
1.779 (gcisalpha(*s1) || gcisdigit(*s1) || *s1==CHAR_SQUOTE &&
1.780 gcisalpha(s1[1]) && gcisalpha(s1[-1]));s1--)
1.781 ;
1.782 s1++;
1.783 - for (i=0;*s1 && *s1!='.';s1++,i++)
1.784 - testword[i]=*s1;
1.785 - testword[i]=0;
1.786 + s=strchr(s1,'.');
1.787 + if (s)
1.788 + testword=g_strndup(s1,s-s1);
1.789 + else
1.790 + testword=g_strdup(s1);
1.791 for (i=0;*abbrev[i];i++)
1.792 if (!strcmp(testword,abbrev[i]))
1.793 - istypo=0;
1.794 + istypo=FALSE;
1.795 if (gcisdigit(*testword))
1.796 - istypo=0;
1.797 + istypo=FALSE;
1.798 if (!testword[1])
1.799 - istypo=0;
1.800 + istypo=FALSE;
1.801 if (isroman(testword))
1.802 - istypo=0;
1.803 + istypo=FALSE;
1.804 if (istypo)
1.805 {
1.806 - istypo=0;
1.807 + istypo=FALSE;
1.808 for (i=0;testword[i];i++)
1.809 if (strchr(vowels,testword[i]))
1.810 - istypo=1;
1.811 + istypo=TRUE;
1.812 }
1.813 - if (istypo)
1.814 + if (istypo &&
1.815 + (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
1.816 {
1.817 - isdup=0;
1.818 - if (strlen(testword)<MAX_QWORD_LENGTH &&
1.819 - !pswit[VERBOSE_SWITCH])
1.820 - for (i=0;i<qperiod_index;i++)
1.821 - if (!strcmp(testword,qperiod[i]))
1.822 - isdup=1;
1.823 - if (!isdup)
1.824 - {
1.825 - if (qperiod_index<MAX_QWORD &&
1.826 - strlen(testword)<MAX_QWORD_LENGTH)
1.827 - {
1.828 - strcpy(qperiod[qperiod_index],testword);
1.829 - qperiod_index++;
1.830 - }
1.831 - if (pswit[ECHO_SWITCH])
1.832 - printf("\n%s\n",aline);
1.833 - if (!pswit[OVERVIEW_SWITCH])
1.834 - printf(" Line %ld column %d - Extra period?\n",
1.835 - linecnt,(int)(t-aline)+1);
1.836 - else
1.837 - cnt_punct++;
1.838 - }
1.839 + g_tree_insert(qperiod,g_strdup(testword),
1.840 + GINT_TO_POINTER(1));
1.841 + if (pswit[ECHO_SWITCH])
1.842 + printf("\n%s\n",aline);
1.843 + if (!pswit[OVERVIEW_SWITCH])
1.844 + printf(" Line %ld column %d - Extra period?\n",
1.845 + linecnt,(int)(t-aline)+1);
1.846 + else
1.847 + cnt_punct++;
1.848 }
1.849 + g_free(testword);
1.850 }
1.851 t++;
1.852 }
1.853 @@ -1563,16 +1597,20 @@
1.854 {
1.855 int i;
1.856 const char *s,*wordstart;
1.857 - char inword[MAXWORDLEN];
1.858 + gchar *inword,*t;
1.859 if (pswit[TYPO_SWITCH])
1.860 {
1.861 for (s=aline;*s;)
1.862 {
1.863 wordstart=s;
1.864 - s=getaword(s,inword);
1.865 - if (!*inword)
1.866 + t=getaword(&s);
1.867 + if (!*t)
1.868 + {
1.869 + g_free(t);
1.870 continue;
1.871 - lowerit(inword);
1.872 + }
1.873 + inword=g_ascii_strdown(t,-1);
1.874 + g_free(t);
1.875 for (i=0;*nocomma[i];i++)
1.876 if (!strcmp(inword,nocomma[i]))
1.877 {
1.878 @@ -1603,6 +1641,7 @@
1.879 cnt_punct++;
1.880 }
1.881 }
1.882 + g_free(inword);
1.883 }
1.884 }
1.885 }
1.886 @@ -1616,15 +1655,18 @@
1.887 void check_for_typos(const char *aline,struct warnings *warnings)
1.888 {
1.889 const char *s,*wordstart;
1.890 - char inword[MAXWORDLEN],testword[MAXWORDLEN];
1.891 - int i,istypo,isdup,alower,vowel,consonant;
1.892 - static int qword_index=0;
1.893 + gchar *inword,*testword;
1.894 + int i,alower,vowel,consonant,*dupcnt;
1.895 + gboolean isdup,istypo;
1.896 for (s=aline;*s;)
1.897 {
1.898 wordstart=s;
1.899 - s=getaword(s,inword);
1.900 + inword=getaword(&s);
1.901 if (!*inword)
1.902 + {
1.903 + g_free(inword);
1.904 continue; /* don't bother with empty lines */
1.905 + }
1.906 if (mixdigit(inword))
1.907 {
1.908 if (pswit[ECHO_SWITCH])
1.909 @@ -1639,10 +1681,10 @@
1.910 * Put the word through a series of tests for likely typos and OCR
1.911 * errors.
1.912 */
1.913 - if (pswit[TYPO_SWITCH])
1.914 + if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
1.915 {
1.916 - istypo=0;
1.917 - strcpy(testword,inword);
1.918 + istypo=FALSE;
1.919 + testword=g_strdup(inword);
1.920 alower=0;
1.921 for (i=0;i<(int)strlen(testword);i++)
1.922 {
1.923 @@ -1662,10 +1704,13 @@
1.924 testword[2]=='c' || i>0 && testword[i-1]==CHAR_SQUOTE)
1.925 ; /* do nothing! */
1.926 else
1.927 - istypo=1;
1.928 + istypo=TRUE;
1.929 }
1.930 testword[i]=(char)tolower(testword[i]);
1.931 }
1.932 + }
1.933 + if (pswit[TYPO_SWITCH])
1.934 + {
1.935 /*
1.936 * Check for certain unlikely two-letter combinations at word
1.937 * start and end.
1.938 @@ -1674,26 +1719,26 @@
1.939 {
1.940 for (i=0;*nostart[i];i++)
1.941 if (!strncmp(testword,nostart[i],2))
1.942 - istypo=1;
1.943 + istypo=TRUE;
1.944 for (i=0;*noend[i];i++)
1.945 if (!strncmp(testword+strlen(testword)-2,noend[i],2))
1.946 - istypo=1;
1.947 + istypo=TRUE;
1.948 }
1.949 /* ght is common, gbt never. Like that. */
1.950 if (strstr(testword,"cb"))
1.951 - istypo=1;
1.952 + istypo=TRUE;
1.953 if (strstr(testword,"gbt"))
1.954 - istypo=1;
1.955 + istypo=TRUE;
1.956 if (strstr(testword,"pbt"))
1.957 - istypo=1;
1.958 + istypo=TRUE;
1.959 if (strstr(testword,"tbs"))
1.960 - istypo=1;
1.961 + istypo=TRUE;
1.962 if (strstr(testword,"mrn"))
1.963 - istypo=1;
1.964 + istypo=TRUE;
1.965 if (strstr(testword,"ahle"))
1.966 - istypo=1;
1.967 + istypo=TRUE;
1.968 if (strstr(testword,"ihle"))
1.969 - istypo=1;
1.970 + istypo=TRUE;
1.971 /*
1.972 * "TBE" does happen - like HEARTBEAT - but uncommon.
1.973 * Also "TBI" - frostbite, outbid - but uncommon.
1.974 @@ -1701,11 +1746,11 @@
1.975 * numerals, but "ii" is a common scanno.
1.976 */
1.977 if (strstr(testword,"tbi"))
1.978 - istypo=1;
1.979 + istypo=TRUE;
1.980 if (strstr(testword,"tbe"))
1.981 - istypo=1;
1.982 + istypo=TRUE;
1.983 if (strstr(testword,"ii"))
1.984 - istypo=1;
1.985 + istypo=TRUE;
1.986 /*
1.987 * Check for no vowels or no consonants.
1.988 * If none, flag a typo.
1.989 @@ -1727,7 +1772,7 @@
1.990 consonant++;
1.991 }
1.992 if (!vowel || !consonant)
1.993 - istypo=1;
1.994 + istypo=TRUE;
1.995 }
1.996 /*
1.997 * Now exclude the word from being reported if it's in
1.998 @@ -1735,18 +1780,18 @@
1.999 */
1.1000 for (i=0;*okword[i];i++)
1.1001 if (!strcmp(testword,okword[i]))
1.1002 - istypo=0;
1.1003 + istypo=FALSE;
1.1004 /*
1.1005 * What looks like a typo may be a Roman numeral.
1.1006 * Exclude these.
1.1007 */
1.1008 if (istypo && isroman(testword))
1.1009 - istypo=0;
1.1010 + istypo=FALSE;
1.1011 /* Check the manual list of typos. */
1.1012 if (!istypo)
1.1013 for (i=0;*typo[i];i++)
1.1014 if (!strcmp(testword,typo[i]))
1.1015 - istypo=1;
1.1016 + istypo=TRUE;
1.1017 /*
1.1018 * Check lowercase s, l, i and m - special cases.
1.1019 * "j" - often a semi-colon gone wrong.
1.1020 @@ -1754,34 +1799,30 @@
1.1021 * "n" for "in"
1.1022 */
1.1023 if (!istypo && strlen(testword)==1 && strchr("slmijdn",*inword))
1.1024 - istypo=1;
1.1025 + istypo=TRUE;
1.1026 if (istypo)
1.1027 {
1.1028 - isdup=0;
1.1029 - if (strlen(testword)<MAX_QWORD_LENGTH &&
1.1030 - !pswit[VERBOSE_SWITCH])
1.1031 - for (i=0;i<qword_index;i++)
1.1032 - if (!strcmp(testword,qword[i]))
1.1033 - {
1.1034 - isdup=1;
1.1035 - ++dupcnt[i];
1.1036 - }
1.1037 + dupcnt=g_tree_lookup(qword,testword);
1.1038 + if (dupcnt)
1.1039 + {
1.1040 + (*dupcnt)++;
1.1041 + isdup=!pswit[VERBOSE_SWITCH];
1.1042 + }
1.1043 + else
1.1044 + {
1.1045 + dupcnt=g_new0(int,1);
1.1046 + g_tree_insert(qword,g_strdup(testword),dupcnt);
1.1047 + isdup=FALSE;
1.1048 + }
1.1049 if (!isdup)
1.1050 {
1.1051 - if (qword_index<MAX_QWORD &&
1.1052 - strlen(testword)<MAX_QWORD_LENGTH)
1.1053 - {
1.1054 - strcpy(qword[qword_index],testword);
1.1055 - qword_index++;
1.1056 - }
1.1057 if (pswit[ECHO_SWITCH])
1.1058 printf("\n%s\n",aline);
1.1059 if (!pswit[OVERVIEW_SWITCH])
1.1060 {
1.1061 printf(" Line %ld column %d - Query word %s",
1.1062 linecnt,(int)(wordstart-aline)+1,inword);
1.1063 - if (strlen(testword)<MAX_QWORD_LENGTH &&
1.1064 - !pswit[VERBOSE_SWITCH])
1.1065 + if (!pswit[VERBOSE_SWITCH])
1.1066 printf(" - not reporting duplicates");
1.1067 printf("\n");
1.1068 }
1.1069 @@ -1791,17 +1832,16 @@
1.1070 }
1.1071 }
1.1072 /* check the user's list of typos */
1.1073 - if (!istypo && usertypo_count)
1.1074 - for (i=0;i<usertypo_count;i++)
1.1075 - if (!strcmp(testword,usertypo[i]))
1.1076 - {
1.1077 - if (pswit[ECHO_SWITCH])
1.1078 - printf("\n%s\n",aline);
1.1079 - if (!pswit[OVERVIEW_SWITCH])
1.1080 - printf(" Line %ld column %d - "
1.1081 - "Query possible scanno %s\n",
1.1082 - linecnt,(int)(wordstart-aline)+2,inword);
1.1083 - }
1.1084 + if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
1.1085 + {
1.1086 + if (pswit[ECHO_SWITCH])
1.1087 + printf("\n%s\n",aline);
1.1088 + if (!pswit[OVERVIEW_SWITCH])
1.1089 + printf(" Line %ld column %d - Query possible scanno %s\n",
1.1090 + linecnt,(int)(wordstart-aline)+2,inword);
1.1091 + }
1.1092 + if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
1.1093 + g_free(testword);
1.1094 if (pswit[PARANOID_SWITCH] && warnings->digit)
1.1095 {
1.1096 /* In paranoid mode, query all 0 and 1 standing alone. */
1.1097 @@ -1816,6 +1856,7 @@
1.1098 cnt_word++;
1.1099 }
1.1100 }
1.1101 + g_free(inword);
1.1102 }
1.1103 }
1.1104
1.1105 @@ -1830,9 +1871,10 @@
1.1106 * quotes "like"this.
1.1107 */
1.1108 void check_for_misspaced_punctuation(const char *aline,
1.1109 - struct parities *parities,int isemptyline)
1.1110 + struct parities *parities,gboolean isemptyline)
1.1111 {
1.1112 - int i,llen,isacro,isellipsis;
1.1113 + int i,llen;
1.1114 + gboolean isacro,isellipsis;
1.1115 const char *s;
1.1116 llen=strlen(aline);
1.1117 for (i=1;i<llen;i++)
1.1118 @@ -1841,9 +1883,9 @@
1.1119 if (strchr(".?!,;:_",aline[i])) /* if it's punctuation */
1.1120 {
1.1121 /* we need to suppress warnings for acronyms like M.D. */
1.1122 - isacro=0;
1.1123 + isacro=FALSE;
1.1124 /* we need to suppress warnings for ellipsis . . . */
1.1125 - isellipsis=0;
1.1126 + isellipsis=FALSE;
1.1127 /* if there are letters on both sides of it or ... */
1.1128 if (gcisalpha(aline[i-1]) && gcisalpha(aline[i+1]) ||
1.1129 gcisalpha(aline[i+1]) && strchr("?!,;:",aline[i]))
1.1130 @@ -1852,9 +1894,9 @@
1.1131 if (aline[i]=='.')
1.1132 {
1.1133 if (i>2 && aline[i-2]=='.')
1.1134 - isacro=1;
1.1135 + isacro=TRUE;
1.1136 if (i+2<llen && aline[i+2]=='.')
1.1137 - isacro=1;
1.1138 + isacro=TRUE;
1.1139 }
1.1140 if (!isacro)
1.1141 {
1.1142 @@ -1877,9 +1919,9 @@
1.1143 if (aline[i]=='.')
1.1144 {
1.1145 if (i>2 && aline[i-2]=='.')
1.1146 - isellipsis=1;
1.1147 + isellipsis=TRUE;
1.1148 if (i+2<llen && aline[i+2]=='.')
1.1149 - isellipsis=1;
1.1150 + isellipsis=TRUE;
1.1151 }
1.1152 if (!isemptyline && !isellipsis)
1.1153 {
1.1154 @@ -2177,6 +2219,8 @@
1.1155 void check_for_miscased_genative(const char *aline)
1.1156 {
1.1157 const char *s;
1.1158 + if (!*aline)
1.1159 + return;
1.1160 s=aline+1;
1.1161 while (*s)
1.1162 {
1.1163 @@ -2321,13 +2365,11 @@
1.1164 i=(int)(close-open+1);
1.1165 if (i>0)
1.1166 {
1.1167 - strncpy(wrk,open,i);
1.1168 - wrk[i]=0;
1.1169 if (pswit[ECHO_SWITCH])
1.1170 printf("\n%s\n",aline);
1.1171 if (!pswit[OVERVIEW_SWITCH])
1.1172 - printf(" Line %ld column %d - HTML Tag? %s \n",
1.1173 - linecnt,(int)(open-aline)+1,wrk);
1.1174 + printf(" Line %ld column %d - HTML Tag? %*.*s \n",
1.1175 + linecnt,(int)(open-aline)+1,i,i,open);
1.1176 else
1.1177 cnt_html++;
1.1178 }
1.1179 @@ -2359,13 +2401,11 @@
1.1180 i=0; /* Don't report "Jones & Son;" */
1.1181 if (i>0)
1.1182 {
1.1183 - strncpy(wrk,amp,i);
1.1184 - wrk[i]=0;
1.1185 if (pswit[ECHO_SWITCH])
1.1186 printf("\n%s\n",aline);
1.1187 if (!pswit[OVERVIEW_SWITCH])
1.1188 - printf(" Line %ld column %d - HTML symbol? %s \n",
1.1189 - linecnt,(int)(amp-aline)+1,wrk);
1.1190 + printf(" Line %ld column %d - HTML symbol? %*.*s \n",
1.1191 + linecnt,(int)(amp-aline)+1,i,i,amp);
1.1192 else
1.1193 cnt_html++;
1.1194 }
1.1195 @@ -2388,7 +2428,8 @@
1.1196 s=aline;
1.1197 while (*s==' ')
1.1198 s++;
1.1199 - if (*pending->dquote)
1.1200 + if (pending->dquote)
1.1201 + {
1.1202 if (*s!=CHAR_DQUOTE || pswit[QPARA_SWITCH])
1.1203 {
1.1204 if (!pswit[OVERVIEW_SWITCH])
1.1205 @@ -2400,7 +2441,10 @@
1.1206 else
1.1207 cnt_dquot++;
1.1208 }
1.1209 - if (*pending->squote)
1.1210 + g_free(pending->dquote);
1.1211 + pending->dquote=NULL;
1.1212 + }
1.1213 + if (pending->squote)
1.1214 {
1.1215 if (*s!=CHAR_SQUOTE && *s!=CHAR_OPEN_SQUOTE || pswit[QPARA_SWITCH] ||
1.1216 pending->squot)
1.1217 @@ -2414,8 +2458,10 @@
1.1218 else
1.1219 cnt_squot++;
1.1220 }
1.1221 + g_free(pending->squote);
1.1222 + pending->squote=NULL;
1.1223 }
1.1224 - if (*pending->rbrack)
1.1225 + if (pending->rbrack)
1.1226 {
1.1227 if (!pswit[OVERVIEW_SWITCH])
1.1228 {
1.1229 @@ -2425,8 +2471,10 @@
1.1230 }
1.1231 else
1.1232 cnt_brack++;
1.1233 + g_free(pending->rbrack);
1.1234 + pending->rbrack=NULL;
1.1235 }
1.1236 - if (*pending->sbrack)
1.1237 + if (pending->sbrack)
1.1238 {
1.1239 if (!pswit[OVERVIEW_SWITCH])
1.1240 {
1.1241 @@ -2436,8 +2484,10 @@
1.1242 }
1.1243 else
1.1244 cnt_brack++;
1.1245 + g_free(pending->sbrack);
1.1246 + pending->sbrack=NULL;
1.1247 }
1.1248 - if (*pending->cbrack)
1.1249 + if (pending->cbrack)
1.1250 {
1.1251 if (!pswit[OVERVIEW_SWITCH])
1.1252 {
1.1253 @@ -2447,8 +2497,10 @@
1.1254 }
1.1255 else
1.1256 cnt_brack++;
1.1257 + g_free(pending->cbrack);
1.1258 + pending->cbrack=NULL;
1.1259 }
1.1260 - if (*pending->unders)
1.1261 + if (pending->unders)
1.1262 {
1.1263 if (!pswit[OVERVIEW_SWITCH])
1.1264 {
1.1265 @@ -2458,6 +2510,8 @@
1.1266 }
1.1267 else
1.1268 cnt_brack++;
1.1269 + g_free(pending->unders);
1.1270 + pending->unders=NULL;
1.1271 }
1.1272 }
1.1273
1.1274 @@ -2481,12 +2535,12 @@
1.1275 struct pending *pending)
1.1276 {
1.1277 if (counters->quot%2)
1.1278 - sprintf(pending->dquote," Line %ld - Mismatched quotes",
1.1279 - linecnt);
1.1280 + pending->dquote=
1.1281 + g_strdup_printf(" Line %ld - Mismatched quotes",linecnt);
1.1282 if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&
1.1283 counters->open_single_quote!=counters->close_single_quote)
1.1284 - sprintf(pending->squote," Line %ld - Mismatched singlequotes?",
1.1285 - linecnt);
1.1286 + pending->squote=
1.1287 + g_strdup_printf(" Line %ld - Mismatched singlequotes?",linecnt);
1.1288 if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&
1.1289 counters->open_single_quote!=counters->close_single_quote &&
1.1290 counters->open_single_quote!=counters->close_single_quote+1)
1.1291 @@ -2496,17 +2550,17 @@
1.1292 */
1.1293 pending->squot=1;
1.1294 if (counters->r_brack)
1.1295 - sprintf(pending->rbrack," Line %ld - Mismatched round brackets?",
1.1296 - linecnt);
1.1297 + pending->rbrack=
1.1298 + g_strdup_printf(" Line %ld - Mismatched round brackets?",linecnt);
1.1299 if (counters->s_brack)
1.1300 - sprintf(pending->sbrack," Line %ld - Mismatched square brackets?",
1.1301 - linecnt);
1.1302 + pending->sbrack=
1.1303 + g_strdup_printf(" Line %ld - Mismatched square brackets?",linecnt);
1.1304 if (counters->c_brack)
1.1305 - sprintf(pending->cbrack," Line %ld - Mismatched curly brackets?",
1.1306 - linecnt);
1.1307 + pending->cbrack=
1.1308 + g_strdup_printf(" Line %ld - Mismatched curly brackets?",linecnt);
1.1309 if (counters->c_unders%2)
1.1310 - sprintf(pending->unders," Line %ld - Mismatched underscores?",
1.1311 - linecnt);
1.1312 + pending->unders=
1.1313 + g_strdup_printf(" Line %ld - Mismatched underscores?",linecnt);
1.1314 }
1.1315
1.1316 /*
1.1317 @@ -2563,50 +2617,63 @@
1.1318 }
1.1319 }
1.1320
1.1321 +gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data)
1.1322 +{
1.1323 + const char *word=key;
1.1324 + int *dupcnt=value;
1.1325 + if (*dupcnt)
1.1326 + printf("\nNote: Queried word %s was duplicated %d times\n",
1.1327 + word,*dupcnt);
1.1328 + return FALSE;
1.1329 +}
1.1330 +
1.1331 /*
1.1332 * procfile:
1.1333 *
1.1334 * Process one file.
1.1335 */
1.1336 -void procfile(char *filename)
1.1337 +void procfile(const char *filename)
1.1338 {
1.1339 const char *s;
1.1340 - char parastart[81]; /* first line of current para */
1.1341 - FILE *infile;
1.1342 + gchar *parastart=NULL; /* first line of current para */
1.1343 + gchar *etext,*aline;
1.1344 + gchar *etext_ptr;
1.1345 + GError *err=NULL;
1.1346 struct first_pass_results *first_pass_results;
1.1347 struct warnings *warnings;
1.1348 struct counters counters={0};
1.1349 struct line_properties last={0};
1.1350 struct parities parities={0};
1.1351 - struct pending pending={{0},};
1.1352 - int isemptyline;
1.1353 + struct pending pending={0};
1.1354 + gboolean isemptyline;
1.1355 long start_para_line=0;
1.1356 - int i,isnewpara=0,enddash=0;
1.1357 + gboolean isnewpara=FALSE,enddash=FALSE;
1.1358 last.start=CHAR_SPACE;
1.1359 - *prevline=0;
1.1360 linecnt=checked_linecnt=0;
1.1361 - infile=fopen(filename,"rb");
1.1362 - if (!infile)
1.1363 + etext=read_etext(filename,&err);
1.1364 + if (!etext)
1.1365 {
1.1366 if (pswit[STDOUT_SWITCH])
1.1367 - fprintf(stdout,"bookloupe: cannot open %s\n",filename);
1.1368 + fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message);
1.1369 else
1.1370 - fprintf(stderr,"bookloupe: cannot open %s\n",filename);
1.1371 + fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
1.1372 exit(1);
1.1373 }
1.1374 fprintf(stdout,"\n\nFile: %s\n\n",filename);
1.1375 - first_pass_results=first_pass(infile);
1.1376 + first_pass_results=first_pass(etext);
1.1377 warnings=report_first_pass(first_pass_results);
1.1378 + qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
1.1379 + qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
1.1380 /*
1.1381 * Here we go with the main pass. Hold onto yer hat!
1.1382 */
1.1383 - rewind(infile);
1.1384 linecnt=0;
1.1385 - while (flgets(aline,LINEBUFSIZE-1,infile,linecnt+1))
1.1386 + etext_ptr=etext;
1.1387 + while ((aline=flgets(&etext_ptr,linecnt+1)))
1.1388 {
1.1389 linecnt++;
1.1390 if (linecnt==1)
1.1391 - isnewpara=1;
1.1392 + isnewpara=TRUE;
1.1393 if (pswit[DP_SWITCH] && !strncmp(aline,"-----File: ",11))
1.1394 continue; // skip DP page separators completely
1.1395 if (linecnt<first_pass_results->firstline ||
1.1396 @@ -2635,8 +2702,8 @@
1.1397 /* This line is the start of a new paragraph. */
1.1398 start_para_line=linecnt;
1.1399 /* Capture its first line in case we want to report it later. */
1.1400 - strncpy(parastart,aline,80);
1.1401 - parastart[79]=0;
1.1402 + g_free(parastart);
1.1403 + parastart=g_strdup(aline);
1.1404 memset(&parities,0,sizeof(parities)); /* restart the quote count */
1.1405 s=aline;
1.1406 while (!gcisalpha(*s) && !gcisdigit(*s) && *s)
1.1407 @@ -2653,7 +2720,7 @@
1.1408 else
1.1409 cnt_punct++;
1.1410 }
1.1411 - isnewpara=0; /* Signal the end of new para processing. */
1.1412 + isnewpara=FALSE; /* Signal the end of new para processing. */
1.1413 }
1.1414 /* Check for an em-dash broken at line end. */
1.1415 if (enddash && *aline=='-')
1.1416 @@ -2665,11 +2732,11 @@
1.1417 else
1.1418 cnt_punct++;
1.1419 }
1.1420 - enddash=0;
1.1421 + enddash=FALSE;
1.1422 for (s=aline+strlen(aline)-1;*s==' ' && s>aline;s--)
1.1423 ;
1.1424 if (s>=aline && *s=='-')
1.1425 - enddash=1;
1.1426 + enddash=TRUE;
1.1427 check_for_control_characters(aline);
1.1428 if (warnings->bin)
1.1429 check_for_odd_characters(aline,warnings,isemptyline);
1.1430 @@ -2709,40 +2776,49 @@
1.1431 check_for_mismatched_quotes(&counters,&pending);
1.1432 memset(&counters,0,sizeof(counters));
1.1433 /* let the next iteration know that it's starting a new para */
1.1434 - isnewpara=1;
1.1435 - check_for_omitted_punctuation(prevline,&last,start_para_line);
1.1436 + isnewpara=TRUE;
1.1437 + if (prevline)
1.1438 + check_for_omitted_punctuation(prevline,&last,start_para_line);
1.1439 }
1.1440 - strcpy(prevline,aline);
1.1441 + g_free(prevline);
1.1442 + prevline=g_strdup(aline);
1.1443 }
1.1444 - fclose(infile);
1.1445 + if (prevline)
1.1446 + {
1.1447 + g_free(prevline);
1.1448 + prevline=NULL;
1.1449 + }
1.1450 + g_free(parastart);
1.1451 + g_free(prevline);
1.1452 + g_free(etext);
1.1453 if (!pswit[OVERVIEW_SWITCH])
1.1454 - for (i=0;i<MAX_QWORD;i++)
1.1455 - if (dupcnt[i])
1.1456 - printf("\nNote: Queried word %s was duplicated %d time%s\n",
1.1457 - qword[i],dupcnt[i],"s");
1.1458 + g_tree_foreach(qword,report_duplicate_queries,NULL);
1.1459 + g_tree_unref(qword);
1.1460 + g_tree_unref(qperiod);
1.1461 }
1.1462
1.1463 /*
1.1464 * flgets:
1.1465 *
1.1466 - * Get one line from the input stream, checking for
1.1467 + * Get one line from the input text, checking for
1.1468 * the existence of exactly one CR/LF line-end per line.
1.1469 *
1.1470 * Returns: a pointer to the line.
1.1471 */
1.1472 -char *flgets(char *theline,int maxlen,FILE *thefile,long lcnt)
1.1473 +char *flgets(char **etext,long lcnt)
1.1474 {
1.1475 char c;
1.1476 - int len,isCR,cint;
1.1477 - *theline=0;
1.1478 - len=isCR=0;
1.1479 - c=cint=fgetc(thefile);
1.1480 - do
1.1481 + int len;
1.1482 + gboolean isCR=FALSE;
1.1483 + char *theline=*etext;
1.1484 + len=0;
1.1485 + for(;;)
1.1486 {
1.1487 - if (cint==EOF)
1.1488 + c=*(*etext)++;
1.1489 + if (!c)
1.1490 return NULL;
1.1491 /* either way, it's end of line */
1.1492 - if (c==10)
1.1493 + if (c=='\n')
1.1494 {
1.1495 if (isCR)
1.1496 break;
1.1497 @@ -2752,7 +2828,7 @@
1.1498 if (pswit[LINE_END_SWITCH])
1.1499 {
1.1500 if (pswit[ECHO_SWITCH])
1.1501 - printf("\n%s\n",theline);
1.1502 + printf("\n%*.*s\n",len,len,theline);
1.1503 if (!pswit[OVERVIEW_SWITCH])
1.1504 printf(" Line %ld - No CR?\n",lcnt);
1.1505 else
1.1506 @@ -2761,7 +2837,7 @@
1.1507 break;
1.1508 }
1.1509 }
1.1510 - if (c==13)
1.1511 + if (c=='\r')
1.1512 {
1.1513 if (isCR)
1.1514 {
1.1515 @@ -2769,34 +2845,33 @@
1.1516 if (pswit[LINE_END_SWITCH])
1.1517 {
1.1518 if (pswit[ECHO_SWITCH])
1.1519 - printf("\n%s\n",theline);
1.1520 + printf("\n%*.*s\n",len,len,theline);
1.1521 if (!pswit[OVERVIEW_SWITCH])
1.1522 printf(" Line %ld - Two successive CRs?\n",lcnt);
1.1523 else
1.1524 cnt_lineend++;
1.1525 }
1.1526 }
1.1527 - isCR=1;
1.1528 + isCR=TRUE;
1.1529 }
1.1530 else
1.1531 {
1.1532 if (pswit[LINE_END_SWITCH] && isCR)
1.1533 {
1.1534 if (pswit[ECHO_SWITCH])
1.1535 - printf("\n%s\n",theline);
1.1536 + printf("\n%*.*s\n",len,len,theline);
1.1537 if (!pswit[OVERVIEW_SWITCH])
1.1538 printf(" Line %ld column %d - CR without LF?\n",
1.1539 lcnt,len+1);
1.1540 else
1.1541 cnt_lineend++;
1.1542 + theline[len]=' ';
1.1543 }
1.1544 - theline[len]=c;
1.1545 + isCR=FALSE;
1.1546 len++;
1.1547 - theline[len]=0;
1.1548 - isCR=0;
1.1549 }
1.1550 - c=cint=fgetc(thefile);
1.1551 - } while(len<maxlen);
1.1552 + }
1.1553 + theline[len]='\0';
1.1554 if (pswit[MARKUP_SWITCH])
1.1555 postprocess_for_HTML(theline);
1.1556 if (pswit[DP_SWITCH])
1.1557 @@ -2813,10 +2888,10 @@
1.1558 *
1.1559 * Returns: 0 if no error found, 1 if error.
1.1560 */
1.1561 -int mixdigit(char *checkword)
1.1562 +int mixdigit(const char *checkword)
1.1563 {
1.1564 int wehaveadigit,wehavealetter,firstdigits,query,wl;
1.1565 - char *s;
1.1566 + const char *s;
1.1567 wehaveadigit=wehavealetter=query=0;
1.1568 for (s=checkword;*s;s++)
1.1569 if (gcisalpha(*s))
1.1570 @@ -2832,17 +2907,20 @@
1.1571 for (firstdigits=0;gcisdigit(checkword[firstdigits]);firstdigits++)
1.1572 ;
1.1573 /* digits, ending in st, rd, nd, th of either case */
1.1574 - if (firstdigits+2==wl && (matchword(checkword+wl-2,"st") ||
1.1575 - matchword(checkword+wl-2,"rd") || matchword(checkword+wl-2,"nd") ||
1.1576 - matchword(checkword+wl-2,"th")))
1.1577 + if (firstdigits+2==wl && (!g_ascii_strcasecmp(checkword+wl-2,"st") ||
1.1578 + !g_ascii_strcasecmp(checkword+wl-2,"rd") ||
1.1579 + !g_ascii_strcasecmp(checkword+wl-2,"nd") ||
1.1580 + !g_ascii_strcasecmp(checkword+wl-2,"th")))
1.1581 query=0;
1.1582 - if (firstdigits+3==wl && (matchword(checkword+wl-3,"sts") ||
1.1583 - matchword(checkword+wl-3,"rds") || matchword(checkword+wl-3,"nds") ||
1.1584 - matchword(checkword+wl-3,"ths")))
1.1585 + if (firstdigits+3==wl && (!g_ascii_strcasecmp(checkword+wl-3,"sts") ||
1.1586 + !g_ascii_strcasecmp(checkword+wl-3,"rds") ||
1.1587 + !g_ascii_strcasecmp(checkword+wl-3,"nds") ||
1.1588 + !g_ascii_strcasecmp(checkword+wl-3,"ths")))
1.1589 query=0;
1.1590 - if (firstdigits+3==wl && (matchword(checkword+wl-4,"stly") ||
1.1591 - matchword(checkword+wl-4,"rdly") ||
1.1592 - matchword(checkword+wl-4,"ndly") || matchword(checkword+wl-4,"thly")))
1.1593 + if (firstdigits+3==wl && (!g_ascii_strcasecmp(checkword+wl-4,"stly") ||
1.1594 + !g_ascii_strcasecmp(checkword+wl-4,"rdly") ||
1.1595 + !g_ascii_strcasecmp(checkword+wl-4,"ndly") ||
1.1596 + !g_ascii_strcasecmp(checkword+wl-4,"thly")))
1.1597 query=0;
1.1598 /* digits, ending in l, L, s or d */
1.1599 if (firstdigits+1==wl && (checkword[wl-1]=='l' ||
1.1600 @@ -2864,20 +2942,20 @@
1.1601 /*
1.1602 * getaword:
1.1603 *
1.1604 - * Extracts the first/next "word" from the line, and puts
1.1605 - * it into "thisword". A word is defined as one English word unit--or
1.1606 - * at least that's the aim.
1.1607 + * Extracts the first/next "word" from the line, and returns it.
1.1608 + * A word is defined as one English word unit--or at least that's the aim.
1.1609 + * "ptr" is advanced to the position in the line where we will start
1.1610 + * looking for the next word.
1.1611 *
1.1612 - * Returns: a pointer to the position in the line where we will start
1.1613 - * looking for the next word.
1.1614 + * Returns: A newly-allocated string.
1.1615 */
1.1616 -const char *getaword(const char *fromline,char *thisword)
1.1617 +gchar *getaword(const char **ptr)
1.1618 {
1.1619 - int i,wordlen;
1.1620 + int i;
1.1621 const char *s;
1.1622 - wordlen=0;
1.1623 - for (;!gcisdigit(*fromline) && !gcisalpha(*fromline) && *fromline;
1.1624 - fromline++)
1.1625 + GString *word;
1.1626 + word=g_string_new(NULL);
1.1627 + for (;!gcisdigit(**ptr) && !gcisalpha(**ptr) && **ptr;(*ptr)++)
1.1628 ;
1.1629 /*
1.1630 * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
1.1631 @@ -2887,64 +2965,25 @@
1.1632 * If found, it returns this whole pattern as a word; otherwise we discard
1.1633 * the results and resume our normal programming.
1.1634 */
1.1635 - s=fromline;
1.1636 - for (;(gcisdigit(*s) || gcisalpha(*s) || *s==',' || *s=='.') &&
1.1637 - wordlen<MAXWORDLEN;s++)
1.1638 + s=*ptr;
1.1639 + for (;gcisdigit(*s) || gcisalpha(*s) || *s==',' || *s=='.';s++)
1.1640 + g_string_append_c(word,*s);
1.1641 + for (i=1;i+1<word->len;i++)
1.1642 {
1.1643 - thisword[wordlen]=*s;
1.1644 - wordlen++;
1.1645 - }
1.1646 - thisword[wordlen]=0;
1.1647 - for (i=1;i<wordlen-1;i++)
1.1648 - {
1.1649 - if (thisword[i]=='.' || thisword[i]==',')
1.1650 + if (word->str[i]=='.' || word->str[i]==',')
1.1651 {
1.1652 - if (gcisdigit(thisword[i-1]) && gcisdigit(thisword[i-1]))
1.1653 + if (gcisdigit(word->str[i-1]) && gcisdigit(word->str[i-1]))
1.1654 {
1.1655 - fromline=s;
1.1656 - return fromline;
1.1657 + *ptr=s;
1.1658 + return g_string_free(word,FALSE);
1.1659 }
1.1660 }
1.1661 }
1.1662 /* we didn't find a punctuated number - do the regular getword thing */
1.1663 - wordlen=0;
1.1664 - for (;(gcisdigit(*fromline) || gcisalpha(*fromline) || *fromline=='\'') &&
1.1665 - wordlen<MAXWORDLEN;fromline++)
1.1666 - {
1.1667 - thisword[wordlen]=*fromline;
1.1668 - wordlen++;
1.1669 - }
1.1670 - thisword[wordlen]=0;
1.1671 - return fromline;
1.1672 -}
1.1673 -
1.1674 -/*
1.1675 - * matchword:
1.1676 - *
1.1677 - * A case-insensitive string matcher.
1.1678 - */
1.1679 -int matchword(char *checkfor,char *thisword)
1.1680 -{
1.1681 - unsigned int ismatch,i;
1.1682 - if (strlen(checkfor)!=strlen(thisword))
1.1683 - return 0;
1.1684 - ismatch=1; /* assume a match until we find a difference */
1.1685 - for (i=0;i<strlen(checkfor);i++)
1.1686 - if (toupper(checkfor[i])!=toupper(thisword[i]))
1.1687 - ismatch=0;
1.1688 - return ismatch;
1.1689 -}
1.1690 -
1.1691 -/*
1.1692 - * lowerit:
1.1693 - *
1.1694 - * Lowercase the line.
1.1695 - */
1.1696 -void lowerit(char *theline)
1.1697 -{
1.1698 - for (;*theline;theline++)
1.1699 - if (*theline>='A' && *theline<='Z')
1.1700 - *theline+=32;
1.1701 + g_string_truncate(word,0);
1.1702 + for (;gcisdigit(**ptr) || gcisalpha(**ptr) || **ptr=='\'';(*ptr)++)
1.1703 + g_string_append_c(word,**ptr);
1.1704 + return g_string_free(word,FALSE);
1.1705 }
1.1706
1.1707 /*
1.1708 @@ -2961,11 +3000,11 @@
1.1709 * XL or an optional XC, an optional IX or IV, an optional V and any number
1.1710 * of optional Is.
1.1711 */
1.1712 -int isroman(char *t)
1.1713 +gboolean isroman(const char *t)
1.1714 {
1.1715 - char *s;
1.1716 + const char *s;
1.1717 if (!t || !*t)
1.1718 - return 0;
1.1719 + return FALSE;
1.1720 s=t;
1.1721 while (*t=='m' && *t)
1.1722 t++;
1.1723 @@ -3006,19 +3045,19 @@
1.1724 * errors. gcisalpha() recognizes accented letters from the CP1252 (Windows)
1.1725 * and ISO-8859-1 character sets, which are the most common PG 8-bit types.
1.1726 */
1.1727 -int gcisalpha(unsigned char c)
1.1728 +gboolean gcisalpha(unsigned char c)
1.1729 {
1.1730 if (c>='a' && c<='z')
1.1731 - return 1;
1.1732 + return TRUE;
1.1733 if (c>='A' && c<='Z')
1.1734 - return 1;
1.1735 + return TRUE;
1.1736 if (c<140)
1.1737 - return 0;
1.1738 + return FALSE;
1.1739 if (c>=192 && c!=208 && c!=215 && c!=222 && c!=240 && c!=247 && c!=254)
1.1740 - return 1;
1.1741 + return TRUE;
1.1742 if (c==140 || c==142 || c==156 || c==158 || c==159)
1.1743 - return 1;
1.1744 - return 0;
1.1745 + return TRUE;
1.1746 + return FALSE;
1.1747 }
1.1748
1.1749 /*
1.1750 @@ -3026,7 +3065,7 @@
1.1751 *
1.1752 * A version of isdigit() that doesn't get confused in 8-bit texts.
1.1753 */
1.1754 -int gcisdigit(unsigned char c)
1.1755 +gboolean gcisdigit(unsigned char c)
1.1756 {
1.1757 return c>='0' && c<='9';
1.1758 }
1.1759 @@ -3037,24 +3076,12 @@
1.1760 * A version of isletter() that doesn't get confused in 8-bit texts.
1.1761 * NB: this is ISO-8859-1-specific.
1.1762 */
1.1763 -int gcisletter(unsigned char c)
1.1764 +gboolean gcisletter(unsigned char c)
1.1765 {
1.1766 return c>='A' && c<='Z' || c>='a' && c<='z' || c>=192;
1.1767 }
1.1768
1.1769 /*
1.1770 - * gcstrchr:
1.1771 - *
1.1772 - * Wraps strchr to return NULL if the character being searched for is zero.
1.1773 - */
1.1774 -char *gcstrchr(char *s,char c)
1.1775 -{
1.1776 - if (!c)
1.1777 - return NULL;
1.1778 - return strchr(s,c);
1.1779 -}
1.1780 -
1.1781 -/*
1.1782 * postprocess_for_DP:
1.1783 *
1.1784 * Invoked with the -d switch from flgets().
1.1785 @@ -3097,7 +3124,7 @@
1.1786 */
1.1787 void postprocess_for_HTML(char *theline)
1.1788 {
1.1789 - if (strstr(theline,"<") && strstr(theline,">"))
1.1790 + if (strchr(theline,'<') && strchr(theline,'>'))
1.1791 while (losemarkup(theline))
1.1792 ;
1.1793 while (loseentities(theline))
1.1794 @@ -3171,9 +3198,9 @@
1.1795 return NULL;
1.1796 }
1.1797
1.1798 -int tagcomp(char *strin,char *basetag)
1.1799 +int tagcomp(const char *strin,const char *basetag)
1.1800 {
1.1801 - char *s,*t;
1.1802 + const char *s,*t;
1.1803 s=basetag;
1.1804 t=strin;
1.1805 if (*t=='/')
1.1806 @@ -3188,8 +3215,9 @@
1.1807 return 0;
1.1808 }
1.1809
1.1810 -void proghelp()
1.1811 +void proghelp(GOptionContext *context)
1.1812 {
1.1813 + gchar *help;
1.1814 fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
1.1815 fputs("Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",stderr);
1.1816 fputs("Copyright 2012- J. Ali Harlow <ali@juiblex.co.uk>.\n",stderr);
1.1817 @@ -3198,22 +3226,10 @@
1.1818 fputs("This is Free Software; "
1.1819 "you may redistribute it under certain conditions (GPL);\n",stderr);
1.1820 fputs("read the file COPYING for details.\n\n",stderr);
1.1821 - fputs("Usage is: bookloupe [-setpxloyhud] filename\n",stderr);
1.1822 - fputs(" where -s checks single quotes, -e suppresses echoing lines, "
1.1823 - "-t checks typos\n",stderr);
1.1824 - fputs(" -x (paranoid) switches OFF -t and extra checks, "
1.1825 - "-l turns OFF line-end checks\n",stderr);
1.1826 - fputs(" -o just displays overview without detail, "
1.1827 - "-h echoes header fields\n",stderr);
1.1828 - fputs(" -v (verbose) unsuppresses duplicate reporting, "
1.1829 - "-m suppresses markup\n",stderr);
1.1830 - fputs(" -d ignores DP-specific markup,\n",stderr);
1.1831 - fputs(" -u uses a file gutcheck.typ to query user-defined "
1.1832 - "possible typos\n",stderr);
1.1833 - fputs("Sample usage: bookloupe warpeace.txt \n",stderr);
1.1834 - fputs("\n",stderr);
1.1835 - fputs("Bookloupe looks for errors in Project Gutenberg(TM) etexts.\n",
1.1836 - stderr);
1.1837 + help=g_option_context_get_help(context,TRUE,NULL);
1.1838 + fputs(help,stderr);
1.1839 + g_free(help);
1.1840 + fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
1.1841 fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
1.1842 "non-ASCII\n",stderr);
1.1843 fputs("characters like accented letters, "