# HG changeset patch # User ali # Date 1369750639 -3600 # Node ID 1016349e619fe15277a41597cf9b8811c5411b25 # Parent adb087007d082cc23390b68107f47584f937668e Use GLib functions and data types diff -r adb087007d08 -r 1016349e619f bl/textfileutils.c --- a/bl/textfileutils.c Mon May 27 09:03:04 2013 +0100 +++ b/bl/textfileutils.c Tue May 28 15:17:19 2013 +0100 @@ -3,26 +3,21 @@ #include /* - * Read a file into memory (which should be freed with mem_free when no + * Read a file into memory (which should be freed with g_free when no * longer required). Returns NULL on error and outputs a suitable error * message to stderr. * DOS-style line endings and UTF-8 BOM are handled transparently even * on platforms which don't normally use these formats. */ gboolean file_get_contents_text(const char *filename,char **contents, - size_t *length) + size_t *length,GError **err) { int i; unsigned char *raw; - size_t raw_length; + gsize raw_length; GString *string; - GError *error=NULL; - if (!g_file_get_contents(filename,(char *)&raw,&raw_length,&error)) - { - fprintf(stderr,"%s: %s\n",filename,error->message); - g_error_free(error); + if (!g_file_get_contents(filename,(char **)&raw,&raw_length,err)) return FALSE; - } string=g_string_new(NULL); i=0; if (raw_length>=3 && raw[0]==0xEF && raw[1]==0xBB && raw[2]==0xBF) diff -r adb087007d08 -r 1016349e619f bl/textfileutils.h --- a/bl/textfileutils.h Mon May 27 09:03:04 2013 +0100 +++ b/bl/textfileutils.h Tue May 28 15:17:19 2013 +0100 @@ -4,6 +4,6 @@ #include gboolean file_get_contents_text(const char *filename,char **contents, - size_t *length); + size_t *length,GError **err); #endif /* BL_TEXTFILEUTILS_H */ diff -r adb087007d08 -r 1016349e619f bookloupe/Makefile.am --- a/bookloupe/Makefile.am Mon May 27 09:03:04 2013 +0100 +++ b/bookloupe/Makefile.am Tue May 28 15:17:19 2013 +0100 @@ -1,5 +1,9 @@ +INCLUDES=-I$(top_srcdir) bin_PROGRAMS=bookloupe pkgdata_DATA=bookloupe.typ +AM_CFLAGS=$(GLIB_CFLAGS) +LIBS=$(GLIB_LIBS) +LDADD=../bl/libbl.la bookloupe.typ: bookloupe.typ.in sed 's/$$/\r/' $< > $@ diff -r adb087007d08 -r 1016349e619f bookloupe/bookloupe.c --- a/bookloupe/bookloupe.c Mon May 27 09:03:04 2013 +0100 +++ b/bookloupe/bookloupe.c Tue May 28 15:17:19 2013 +0100 @@ -22,19 +22,10 @@ #include #include #include +#include +#include -#define MAXWORDLEN 80 /* max length of one word */ -#define LINEBUFSIZE 2048 /* buffer size for an input line */ - -#define MAX_USER_TYPOS 1000 -#define USERTYPO_FILE "gutcheck.typ" - -#ifndef MAX_PATH -#define MAX_PATH 16384 -#endif - -char aline[LINEBUFSIZE]; -char prevline[LINEBUFSIZE]; +gchar *prevline; /* Common typos. */ char *typo[] = { @@ -70,7 +61,7 @@ "se", "" }; -char *usertypo[MAX_USER_TYPOS]; +GTree *usertypo; /* Common abbreviations and other OK words not to query as typos. */ char *okword[] = { @@ -282,46 +273,57 @@ #define WAY_TOO_LONG 80 #define SHORTEST_PG_LINE 55 -#define SWITCHES "ESTPXLOYHWVMUD" /* switches:- */ - /* D - ignore DP-specific markup */ - /* E - echo queried line */ - /* S - check single quotes */ - /* T - check common typos */ - /* P - require closure of quotes on */ - /* every paragraph */ - /* X - "Trust no one" :-) Paranoid! */ - /* Queries everything */ - /* L - line end checking defaults on */ - /* -L turns it off */ - /* O - overview. Just shows counts. */ - /* Y - puts errors to stdout */ - /* instead of stderr */ - /* H - Echoes header fields */ - /* M - Ignore markup in < > */ - /* U - Use file of User-defined Typos */ - /* W - Defaults for use on Web upload */ - /* V - Verbose - list EVERYTHING! */ -#define SWITNO 14 /* max number of switch parms */ - /* - used for defining array-size */ -#define MINARGS 1 /* minimum no of args excl switches */ -#define MAXARGS 1 /* maximum no of args excl switches */ +enum { + ECHO_SWITCH, + SQUOTE_SWITCH, + TYPO_SWITCH, + QPARA_SWITCH, + PARANOID_SWITCH, + LINE_END_SWITCH, + OVERVIEW_SWITCH, + STDOUT_SWITCH, + HEADER_SWITCH, + WEB_SWITCH, + VERBOSE_SWITCH, + MARKUP_SWITCH, + USERTYPO_SWITCH, + DP_SWITCH, + SWITNO +}; -int pswit[SWITNO]; /* program switches set by SWITCHES */ +gboolean pswit[SWITNO]; /* program switches */ -#define ECHO_SWITCH 0 -#define SQUOTE_SWITCH 1 -#define TYPO_SWITCH 2 -#define QPARA_SWITCH 3 -#define PARANOID_SWITCH 4 -#define LINE_END_SWITCH 5 -#define OVERVIEW_SWITCH 6 -#define STDOUT_SWITCH 7 -#define HEADER_SWITCH 8 -#define WEB_SWITCH 9 -#define VERBOSE_SWITCH 10 -#define MARKUP_SWITCH 11 -#define USERTYPO_SWITCH 12 -#define DP_SWITCH 13 +static GOptionEntry options[]={ + { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH, + "Ignore DP-specific markup", NULL }, + { "noecho", 'e', 0, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH, + "Don't echo queried line", NULL }, + { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH, + "Check single quotes", NULL }, + { "typo", 't', 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH, + "Check common typos", NULL }, + { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH, + "Require closure of quotes on every paragraph", NULL }, + { "relaxed", 'x', 0, G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH, + "Disable paranoid querying of everything", NULL }, + { "line-end", 'l', 0, G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH, + "Disable line end checking", NULL }, + { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH, + "Overview: just show counts", NULL }, + { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH, + "Output errors to stdout instead of stderr", NULL }, + { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH, + "Echo header fields", NULL }, + { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH, + "Ignore markup in < >", NULL }, + { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH, + "Use file of user-defined typos", NULL }, + { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH, + "Defaults for use on www upload", NULL }, + { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH, + "Verbose - list everything", NULL }, + { NULL } +}; long cnt_dquot; /* for overview mode, count of doublequote queries */ long cnt_squot; /* for overview mode, count of singlequote queries */ @@ -340,47 +342,26 @@ long linecnt; /* count of total lines in the file */ long checked_linecnt; /* count of lines actually checked */ -void proghelp(void); -void procfile(char *); +void proghelp(GOptionContext *context); +void procfile(const char *); -#define LOW_THRESHOLD 0 -#define HIGH_THRESHOLD 1 +gchar *running_from; -#define START 0 -#define END 1 -#define PREV 0 -#define NEXT 1 -#define FIRST_OF_PAIR 0 -#define SECOND_OF_PAIR 1 - -#define MAX_WORDPAIR 1000 - -char running_from[MAX_PATH]; - -int mixdigit(char *); -const char *getaword(const char *,char *); -int matchword(char *,char *); -char *flgets(char *,int,FILE *,long); -void lowerit(char *); -int gcisalpha(unsigned char); -int gcisdigit(unsigned char); -int gcisletter(unsigned char); -char *gcstrchr(char *s,char c); +int mixdigit(const char *); +gchar *getaword(const char **); +char *flgets(char **,long); +gboolean gcisalpha(unsigned char); +gboolean gcisdigit(unsigned char); +gboolean gcisletter(unsigned char); void postprocess_for_HTML(char *); char *linehasmarkup(char *); char *losemarkup(char *); -int tagcomp(char *,char *); +int tagcomp(const char *,const char *); char *loseentities(char *); -int isroman(char *); -int usertypo_count; +gboolean isroman(const char *); void postprocess_for_DP(char *); -char wrk[LINEBUFSIZE]; - -#define MAX_QWORD 50 -#define MAX_QWORD_LENGTH 40 -char qword[MAX_QWORD][MAX_QWORD_LENGTH]; -int dupcnt[MAX_QWORD]; +GTree *qword,*qperiod; struct first_pass_results { long firstline,astline; @@ -392,7 +373,8 @@ struct warnings { int shortline,longline,bin,dash,dotcomma,ast,fslash,digit,hyphen; - int endquote,isDutch,isFrench; + int endquote; + gboolean isDutch,isFrench; }; struct counters { @@ -411,52 +393,35 @@ }; struct pending { - char dquote[80],squote[80],rbrack[80],sbrack[80],cbrack[80],unders[80]; + char *dquote,*squote,*rbrack,*sbrack,*cbrack,*unders; long squot; }; -int main(int argc,char **argv) +void parse_options(int *argc,char ***argv) { - char *argsw,*s; - int i,switno,invarg; - char usertypo_file[MAX_PATH]; - FILE *usertypofile; - if (strlen(argv[0])=running_from;s--) - *s=0; - switno=strlen(SWITCHES); - for (i=switno;--i>0;) - pswit[i]=0; /* initialise switches */ - /* - * Standard loop to extract switches. - * When we come out of this loop, the arguments will be - * in argv[0] upwards and the switches used will be - * represented by their equivalent elements in pswit[] - */ - while (--argc>0 && **++argv=='-') - for (argsw=argv[0]+1;*argsw!='\0';argsw++) - for (i=switno,invarg=1;(--i>=0) && invarg==1;) - if ((toupper(*argsw))==SWITCHES[i]) - { - invarg=0; - pswit[i]=1; - } + GError *err=NULL; + GOptionContext *context; + context=g_option_context_new( + "file - looks for errors in Project Gutenberg(TM) etexts"); + g_option_context_add_main_entries(context,options,NULL); + if (!g_option_context_parse(context,argc,argv,&err)) + { + g_printerr("Bookloupe: %s\n",err->message); + g_printerr("Use \"%s --help\" for help\n",(*argv)[0]); + exit(1); + } /* Paranoid checking is turned OFF, not on, by its switch */ - pswit[PARANOID_SWITCH]^=1; + pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH]; if (pswit[PARANOID_SWITCH]) - /* if running in paranoid mode force typo checks as well */ - pswit[TYPO_SWITCH]=pswit[TYPO_SWITCH]^1; + /* if running in paranoid mode, typo checks default to enabled */ + pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH]; /* Line-end checking is turned OFF, not on, by its switch */ - pswit[LINE_END_SWITCH]^=1; + pswit[LINE_END_SWITCH]=!pswit[LINE_END_SWITCH]; /* Echoing is turned OFF, not on, by its switch */ - pswit[ECHO_SWITCH]^=1; + pswit[ECHO_SWITCH]=!pswit[ECHO_SWITCH]; if (pswit[OVERVIEW_SWITCH]) /* just print summary; don't echo */ - pswit[ECHO_SWITCH]=0; + pswit[ECHO_SWITCH]=FALSE; /* * Web uploads - for the moment, this is really just a placeholder * until we decide what processing we really want to do on web uploads @@ -464,85 +429,155 @@ if (pswit[WEB_SWITCH]) { /* specific override for web uploads */ - pswit[ECHO_SWITCH]=1; - pswit[SQUOTE_SWITCH]=0; - pswit[TYPO_SWITCH]=1; - pswit[QPARA_SWITCH]=0; - pswit[PARANOID_SWITCH]=1; - pswit[LINE_END_SWITCH]=0; - pswit[OVERVIEW_SWITCH]=0; - pswit[STDOUT_SWITCH]=0; - pswit[HEADER_SWITCH]=1; - pswit[VERBOSE_SWITCH]=0; - pswit[MARKUP_SWITCH]=0; - pswit[USERTYPO_SWITCH]=0; - pswit[DP_SWITCH]=0; + pswit[ECHO_SWITCH]=TRUE; + pswit[SQUOTE_SWITCH]=FALSE; + pswit[TYPO_SWITCH]=TRUE; + pswit[QPARA_SWITCH]=FALSE; + pswit[PARANOID_SWITCH]=TRUE; + pswit[LINE_END_SWITCH]=FALSE; + pswit[OVERVIEW_SWITCH]=FALSE; + pswit[STDOUT_SWITCH]=FALSE; + pswit[HEADER_SWITCH]=TRUE; + pswit[VERBOSE_SWITCH]=FALSE; + pswit[MARKUP_SWITCH]=FALSE; + pswit[USERTYPO_SWITCH]=FALSE; + pswit[DP_SWITCH]=FALSE; } - if (argcMAXARGS) + if (*argc<2) { - /* check number of args */ - proghelp(); - return 1; + proghelp(context); + exit(1); } - /* read in the user-defined stealth scanno list */ + g_option_context_free(context); +} + +/* + * read_user_scannos: + * + * Read in the user-defined stealth scanno list. + */ +void read_user_scannos(void) +{ + GError *err=NULL; + gchar *usertypo_file; + gboolean okay; + int i; + gsize len; + gchar *contents,**lines; + usertypo_file=g_strdup("bookloupe.typ"); + okay=file_get_contents_text(usertypo_file,&contents,&len,&err); + if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT)) + { + g_clear_error(&err); + g_free(usertypo_file); + usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL); + okay=file_get_contents_text(usertypo_file,&contents,&len,&err); + } + if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT)) + { + g_clear_error(&err); + g_free(usertypo_file); + usertypo_file=g_strdup("gutcheck.typ"); + okay=file_get_contents_text(usertypo_file,&contents,&len,&err); + } + if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT)) + { + g_clear_error(&err); + g_free(usertypo_file); + usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL); + okay=file_get_contents_text(usertypo_file,&contents,&len,&err); + } + if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT)) + { + g_free(usertypo_file); + printf(" --> I couldn't find bookloupe.typ " + "-- proceeding without user typos.\n"); + return; + } + else if (!okay) + { + fprintf(stderr,"%s: %s\n",usertypo_file,err->message); + g_free(usertypo_file); + g_clear_error(&err); + exit(1); + } + lines=g_strsplit(contents,"\n",0); + usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL); + for (i=0;lines[i];i++) + if (*(unsigned char *)lines[i]>'!') + g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1)); + else + g_free(lines[i]); + g_free(lines); +} + +#if 0 +/* + * read_etext: + * + * Read an etext returning an array of lines. Lines are normally expected + * to be terminated by CR LF. Solitary LFs delimit lines but are left + * embedded at the end of the line for further processing. Solitary CRs + * do not delimit lines. + */ +gchar **read_etext(const char *filename,GError **err) +{ + int i; + const char *s,*t; + gchar *contents; + gchar **raw_lines; + GPtrArray *lines; + gsize len; + if (!g_file_get_contents(filename,&contents,&len,err)) + return NULL; + raw_lines=g_strsplit(contents,"\r\n",0); + lines=g_ptr_array_sized_new(g_strv_length(raw_lines)+1); + for (i=0;raw_lines[i];i++) + { + t=strchr(raw_lines[i],'\n'); + if (t) + { + s=raw_lines[i]; + while ((t=strchr(s,'\n'))) + { + g_ptr_array_add(lines,g_strndup(s,t-s+1)); + s=t+1; + } + g_ptr_array_add(lines,g_strdup(s)); + g_free(raw_lines[i]); + } + else + g_ptr_array_add(lines,raw_lines[i]); + } + g_free(raw_lines); + g_ptr_array_add(lines,NULL); + return (gchar **)g_ptr_array_free(lines,FALSE); +} +#else +/* + * read_etext: + * + * Read an etext returning a newly allocated string containing the file + * contents or NULL on error. + */ +gchar *read_etext(const char *filename,GError **err) +{ + gchar *contents; + gsize len; + if (!g_file_get_contents(filename,&contents,&len,err)) + return NULL; + return contents; +} +#endif + +int main(int argc,char **argv) +{ + running_from=g_path_get_dirname(argv[0]); + parse_options(&argc,&argv); if (pswit[USERTYPO_SWITCH]) - { - /* ... we were told we had one! */ - usertypofile=fopen(USERTYPO_FILE,"rb"); - if (!usertypofile) - { - /* not in cwd. try excuteable directory. */ - strcpy(usertypo_file,running_from); - strcat(usertypo_file,USERTYPO_FILE); - usertypofile=fopen(usertypo_file,"rb"); - if (!usertypofile) { - /* we ain't got no user typo file! */ - printf(" --> I couldn't find gutcheck.typ " - "-- proceeding without user typos.\n"); - } - } - usertypo_count=0; - if (usertypofile) - { - /* we managed to open a User Typo File! */ - if (pswit[USERTYPO_SWITCH]) - { - while (flgets(aline,LINEBUFSIZE-1,usertypofile, - (long)usertypo_count)) - { - if (strlen(aline)>1) - { - if ((int)*aline>33) - { - s=malloc(strlen(aline)+1); - if (!s) - { - fprintf(stderr,"bookloupe: cannot get enough " - "memory for user typo file!\n"); - exit(1); - } - strcpy(s,aline); - usertypo[usertypo_count]=s; - usertypo_count++; - if (usertypo_count>=MAX_USER_TYPOS) - { - printf(" --> Only %d user-defined typos " - "allowed: ignoring the rest\n", - MAX_USER_TYPOS); - break; - } - } - } - } - } - fclose(usertypofile); - } - } + read_user_scannos(); fprintf(stderr,"bookloupe: Check and report on an e-text\n"); - cnt_dquot=cnt_squot=cnt_brack=cnt_bin=cnt_odd=cnt_long= - cnt_short=cnt_punct=cnt_dash=cnt_word=cnt_html=cnt_lineend= - cnt_spacend=0; - procfile(argv[0]); + procfile(argv[1]); if (pswit[OVERVIEW_SWITCH]) { printf(" Checked %ld lines of %ld (head+foot = %ld)\n\n", @@ -577,6 +612,9 @@ cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+ cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend); } + g_free(running_from); + if (usertypo) + g_tree_unref(usertypo); return 0; } @@ -588,28 +626,33 @@ * occur many times in the text like long or short * lines, non-standard dashes, etc. */ -struct first_pass_results *first_pass(FILE *infile) +struct first_pass_results *first_pass(const char *etext) { char laststart=CHAR_SPACE; const char *s; - int i,llen; + gchar *lc_line; + int i,j,llen; + gchar **lines; unsigned int lastlen=0,lastblen=0; long spline=0,nspline=0; static struct first_pass_results results={0}; - char inword[MAXWORDLEN]=""; - while (fgets(aline,LINEBUFSIZE-1,infile)) + gchar *inword; + lines=g_strsplit(etext,"\n",0); + for (j=0;lines[j];j++) { - while (aline[strlen(aline)-1]==10 || aline[strlen(aline)-1]==13) - aline[strlen(aline)-1]=0; + llen=strlen(lines[j]); + while(lines[j][llen-1]=='\r') + lines[j][llen--]='\0'; linecnt++; - if (strstr(aline,"*END") && strstr(aline,"SMALL PRINT") && - (strstr(aline,"PUBLIC DOMAIN") || strstr(aline,"COPYRIGHT"))) + if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") && + (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT"))) { if (spline) printf(" --> Duplicate header?\n"); spline=linecnt+1; /* first line of non-header text, that is */ } - if (!strncmp(aline,"*** START",9) && strstr(aline,"PROJECT GUTENBERG")) + if (!strncmp(lines[j],"*** START",9) && + strstr(lines[j],"PROJECT GUTENBERG")) { if (nspline) printf(" --> Duplicate header?\n"); @@ -617,10 +660,10 @@ } if (spline || nspline) { - lowerit(aline); - if (strstr(aline,"end") && strstr(aline,"project gutenberg")) + lc_line=g_ascii_strdown(lines[j],llen); + if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg")) { - if (strstr(aline,"end")127) + if ((unsigned char)lines[j][i]>127) results.binlen++; - if (gcisalpha(aline[i])) + if (gcisalpha(lines[j][i])) results.alphalen++; - if (i>0 && aline[i]==CHAR_DQUOTE && isalpha(aline[i-1])) + if (i>0 && lines[j][i]==CHAR_DQUOTE && isalpha(lines[j][i-1])) results.endquote_count++; } - if (strlen(aline)>2 && lastlen>2 && lastlen2 && lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE) + if (llen>2 && lastlen>2 && lastlen2 && + lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE) results.shortline++; - if (*aline && (unsigned char)aline[strlen(aline)-1]<=CHAR_SPACE) + if (llen>0 && (unsigned char)lines[j][llen-1]<=CHAR_SPACE) cnt_spacend++; - if (strstr(aline,".,")) + if (strstr(lines[j],".,")) results.dotcomma++; /* only count ast lines for ignoring purposes where there is */ /* locase text on the line */ - if (strstr(aline,"*")) + if (strchr(lines[j],'*')) { - for (s=aline;*s;s++) + for (s=lines[j];*s;s++) if (*s>='a' && *s<='z') break; if (*s) results.astline++; } - if (strstr(aline,"/")) + if (strchr(lines[j],'/')) results.fslashline++; - for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--) + for (i=llen-1;i>0 && (unsigned char)lines[j][i]<=CHAR_SPACE;i--) ; - if (aline[i]=='-' && aline[i-1]!='-') + if (i>1 && lines[j][i]=='-' && lines[j][i-1]!='-') results.hyphens++; if (llen>LONGEST_PG_LINE) results.longline++; if (llen>WAY_TOO_LONG) results.verylongline++; - if (strstr(aline,"<") && strstr(aline,">")) + if (strchr(lines[j],'<') && strchr(lines[j],'>')) { - i=(int)(strstr(aline,">")-strstr(aline,"<")+1); + i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1); if (i>0) results.htmcount++; - if (strstr(aline,"")) + if (strstr(lines[j],"")) results.htmcount+=4; /* bonus marks! */ } /* Check for spaced em-dashes */ - if (strstr(aline,"--")) + if (lines[j][0] && (s=strstr(lines[j]+1,"--"))) { results.emdash++; - if (*(strstr(aline,"--")-1)==CHAR_SPACE || - (*(strstr(aline,"--")+2)==CHAR_SPACE)) + if (s[-1]==CHAR_SPACE || (s[2]==CHAR_SPACE)) results.space_emdash++; - if (*(strstr(aline,"--")-1)==CHAR_SPACE && - (*(strstr(aline,"--")+2)==CHAR_SPACE)) + if (s[-1]==CHAR_SPACE && (s[2]==CHAR_SPACE)) /* count of em-dashes with spaces both sides */ results.non_PG_space_emdash++; - if (*(strstr(aline,"--")-1)!=CHAR_SPACE && - (*(strstr(aline,"--")+2)!=CHAR_SPACE)) + if (s[-1]!=CHAR_SPACE && (s[2]!=CHAR_SPACE)) /* count of PG-type em-dashes with no spaces */ results.PG_space_emdash++; } - for (s=aline;*s;) + for (s=lines[j];*s;) { - s=getaword(s,inword); + inword=getaword(&s); if (!strcmp(inword,"hij") || !strcmp(inword,"niet")) results.Dutchcount++; if (!strcmp(inword,"dans") || !strcmp(inword,"avec")) results.Frenchcount++; if (!strcmp(inword,"0") || !strcmp(inword,"1")) results.standalone_digit++; + g_free(inword); } /* Check for spaced dashes */ - if (strstr(aline," -") && *(strstr(aline," -")+2)!='-') + if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-') results.spacedash++; lastblen=lastlen; - lastlen=strlen(aline); - laststart=aline[0]; + lastlen=llen; + laststart=lines[j][0]; } + g_strfreev(lines); return &results; } @@ -856,17 +898,17 @@ "Not reporting them.\n"); warnings.bin=0; } - warnings.isDutch=0; + warnings.isDutch=FALSE; if (results->Dutchcount>50) { - warnings.isDutch=1; + warnings.isDutch=TRUE; printf(" --> This looks like Dutch - " "switching off dashes and warnings for 's Middags case.\n"); } - warnings.isFrench=0; + warnings.isFrench=FALSE; if (results->Frenchcount>50) { - warnings.isFrench=1; + warnings.isFrench=TRUE; printf(" --> This looks like French - " "switching off some doublepunct.\n"); } @@ -919,12 +961,14 @@ * count it, since empty lines with asterisks or dashes to * separate sections are common. * - * Returns: Non-zero if the line is empty. + * Returns: TRUE if the line is empty. */ -int analyse_quotes(const char *s,struct counters *counters) +gboolean analyse_quotes(const char *aline,struct counters *counters) { int guessquote=0; - int isemptyline=1; /* assume the line is empty until proven otherwise */ + /* assume the line is empty until proven otherwise */ + gboolean isemptyline=TRUE; + const char *s=aline; while (*s) { if (*s==CHAR_DQUOTE) @@ -986,7 +1030,7 @@ } if (*s!=CHAR_SPACE && *s!='-' && *s!='.' && *s!=CHAR_ASTERISK && *s!=13 && *s!=10) - isemptyline=0; /* ignore lines like * * * as spacers */ + isemptyline=FALSE; /* ignore lines like * * * as spacers */ if (*s==CHAR_UNDERSCORE) counters->c_unders++; if (*s==CHAR_OPEN_CBRACK) @@ -1040,7 +1084,7 @@ * Check for binary and other odd characters. */ void check_for_odd_characters(const char *aline,const struct warnings *warnings, - int isemptyline) + gboolean isemptyline) { /* Don't repeat multiple warnings on one line. */ int eNon_A=0,eTab=0,eTilde=0,eCarat=0,eFSlash=0,eAst=0; @@ -1461,16 +1505,15 @@ void check_for_extra_period(const char *aline,const struct warnings *warnings) { const char *s,*t,*s1; - int i,istypo,isdup; - static char qperiod[MAX_QWORD][MAX_QWORD_LENGTH]; - static int qperiod_index=0; - char testword[MAXWORDLEN]=""; + int i; + gboolean istypo; + gchar *testword; if (pswit[PARANOID_SWITCH]) { - for (t=s=aline;strstr(t,". ");) + for (t=aline;strstr(t,". ");) { t=strstr(t,". "); - if (t==s) + if (t==aline) { t++; /* start of line punctuation is handled elsewhere */ @@ -1497,57 +1540,48 @@ if (*s1>='a' && *s1<='z') { /* we have something to investigate */ - istypo=1; + istypo=TRUE; /* so let's go back and find out */ - for (s1=t-1;s1>=s && + for (s1=t-1;s1>=aline && (gcisalpha(*s1) || gcisdigit(*s1) || *s1==CHAR_SQUOTE && gcisalpha(s1[1]) && gcisalpha(s1[-1]));s1--) ; s1++; - for (i=0;*s1 && *s1!='.';s1++,i++) - testword[i]=*s1; - testword[i]=0; + s=strchr(s1,'.'); + if (s) + testword=g_strndup(s1,s-s1); + else + testword=g_strdup(s1); for (i=0;*abbrev[i];i++) if (!strcmp(testword,abbrev[i])) - istypo=0; + istypo=FALSE; if (gcisdigit(*testword)) - istypo=0; + istypo=FALSE; if (!testword[1]) - istypo=0; + istypo=FALSE; if (isroman(testword)) - istypo=0; + istypo=FALSE; if (istypo) { - istypo=0; + istypo=FALSE; for (i=0;testword[i];i++) if (strchr(vowels,testword[i])) - istypo=1; + istypo=TRUE; } - if (istypo) + if (istypo && + (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword))) { - isdup=0; - if (strlen(testword)0 && testword[i-1]==CHAR_SQUOTE) ; /* do nothing! */ else - istypo=1; + istypo=TRUE; } testword[i]=(char)tolower(testword[i]); } + } + if (pswit[TYPO_SWITCH]) + { /* * Check for certain unlikely two-letter combinations at word * start and end. @@ -1674,26 +1719,26 @@ { for (i=0;*nostart[i];i++) if (!strncmp(testword,nostart[i],2)) - istypo=1; + istypo=TRUE; for (i=0;*noend[i];i++) if (!strncmp(testword+strlen(testword)-2,noend[i],2)) - istypo=1; + istypo=TRUE; } /* ght is common, gbt never. Like that. */ if (strstr(testword,"cb")) - istypo=1; + istypo=TRUE; if (strstr(testword,"gbt")) - istypo=1; + istypo=TRUE; if (strstr(testword,"pbt")) - istypo=1; + istypo=TRUE; if (strstr(testword,"tbs")) - istypo=1; + istypo=TRUE; if (strstr(testword,"mrn")) - istypo=1; + istypo=TRUE; if (strstr(testword,"ahle")) - istypo=1; + istypo=TRUE; if (strstr(testword,"ihle")) - istypo=1; + istypo=TRUE; /* * "TBE" does happen - like HEARTBEAT - but uncommon. * Also "TBI" - frostbite, outbid - but uncommon. @@ -1701,11 +1746,11 @@ * numerals, but "ii" is a common scanno. */ if (strstr(testword,"tbi")) - istypo=1; + istypo=TRUE; if (strstr(testword,"tbe")) - istypo=1; + istypo=TRUE; if (strstr(testword,"ii")) - istypo=1; + istypo=TRUE; /* * Check for no vowels or no consonants. * If none, flag a typo. @@ -1727,7 +1772,7 @@ consonant++; } if (!vowel || !consonant) - istypo=1; + istypo=TRUE; } /* * Now exclude the word from being reported if it's in @@ -1735,18 +1780,18 @@ */ for (i=0;*okword[i];i++) if (!strcmp(testword,okword[i])) - istypo=0; + istypo=FALSE; /* * What looks like a typo may be a Roman numeral. * Exclude these. */ if (istypo && isroman(testword)) - istypo=0; + istypo=FALSE; /* Check the manual list of typos. */ if (!istypo) for (i=0;*typo[i];i++) if (!strcmp(testword,typo[i])) - istypo=1; + istypo=TRUE; /* * Check lowercase s, l, i and m - special cases. * "j" - often a semi-colon gone wrong. @@ -1754,34 +1799,30 @@ * "n" for "in" */ if (!istypo && strlen(testword)==1 && strchr("slmijdn",*inword)) - istypo=1; + istypo=TRUE; if (istypo) { - isdup=0; - if (strlen(testword)digit) { /* In paranoid mode, query all 0 and 1 standing alone. */ @@ -1816,6 +1856,7 @@ cnt_word++; } } + g_free(inword); } } @@ -1830,9 +1871,10 @@ * quotes "like"this. */ void check_for_misspaced_punctuation(const char *aline, - struct parities *parities,int isemptyline) + struct parities *parities,gboolean isemptyline) { - int i,llen,isacro,isellipsis; + int i,llen; + gboolean isacro,isellipsis; const char *s; llen=strlen(aline); for (i=1;i2 && aline[i-2]=='.') - isacro=1; + isacro=TRUE; if (i+22 && aline[i-2]=='.') - isellipsis=1; + isellipsis=TRUE; if (i+20) { - strncpy(wrk,open,i); - wrk[i]=0; if (pswit[ECHO_SWITCH]) printf("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) - printf(" Line %ld column %d - HTML Tag? %s \n", - linecnt,(int)(open-aline)+1,wrk); + printf(" Line %ld column %d - HTML Tag? %*.*s \n", + linecnt,(int)(open-aline)+1,i,i,open); else cnt_html++; } @@ -2359,13 +2401,11 @@ i=0; /* Don't report "Jones & Son;" */ if (i>0) { - strncpy(wrk,amp,i); - wrk[i]=0; if (pswit[ECHO_SWITCH]) printf("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) - printf(" Line %ld column %d - HTML symbol? %s \n", - linecnt,(int)(amp-aline)+1,wrk); + printf(" Line %ld column %d - HTML symbol? %*.*s \n", + linecnt,(int)(amp-aline)+1,i,i,amp); else cnt_html++; } @@ -2388,7 +2428,8 @@ s=aline; while (*s==' ') s++; - if (*pending->dquote) + if (pending->dquote) + { if (*s!=CHAR_DQUOTE || pswit[QPARA_SWITCH]) { if (!pswit[OVERVIEW_SWITCH]) @@ -2400,7 +2441,10 @@ else cnt_dquot++; } - if (*pending->squote) + g_free(pending->dquote); + pending->dquote=NULL; + } + if (pending->squote) { if (*s!=CHAR_SQUOTE && *s!=CHAR_OPEN_SQUOTE || pswit[QPARA_SWITCH] || pending->squot) @@ -2414,8 +2458,10 @@ else cnt_squot++; } + g_free(pending->squote); + pending->squote=NULL; } - if (*pending->rbrack) + if (pending->rbrack) { if (!pswit[OVERVIEW_SWITCH]) { @@ -2425,8 +2471,10 @@ } else cnt_brack++; + g_free(pending->rbrack); + pending->rbrack=NULL; } - if (*pending->sbrack) + if (pending->sbrack) { if (!pswit[OVERVIEW_SWITCH]) { @@ -2436,8 +2484,10 @@ } else cnt_brack++; + g_free(pending->sbrack); + pending->sbrack=NULL; } - if (*pending->cbrack) + if (pending->cbrack) { if (!pswit[OVERVIEW_SWITCH]) { @@ -2447,8 +2497,10 @@ } else cnt_brack++; + g_free(pending->cbrack); + pending->cbrack=NULL; } - if (*pending->unders) + if (pending->unders) { if (!pswit[OVERVIEW_SWITCH]) { @@ -2458,6 +2510,8 @@ } else cnt_brack++; + g_free(pending->unders); + pending->unders=NULL; } } @@ -2481,12 +2535,12 @@ struct pending *pending) { if (counters->quot%2) - sprintf(pending->dquote," Line %ld - Mismatched quotes", - linecnt); + pending->dquote= + g_strdup_printf(" Line %ld - Mismatched quotes",linecnt); if (pswit[SQUOTE_SWITCH] && counters->open_single_quote && counters->open_single_quote!=counters->close_single_quote) - sprintf(pending->squote," Line %ld - Mismatched singlequotes?", - linecnt); + pending->squote= + g_strdup_printf(" Line %ld - Mismatched singlequotes?",linecnt); if (pswit[SQUOTE_SWITCH] && counters->open_single_quote && counters->open_single_quote!=counters->close_single_quote && counters->open_single_quote!=counters->close_single_quote+1) @@ -2496,17 +2550,17 @@ */ pending->squot=1; if (counters->r_brack) - sprintf(pending->rbrack," Line %ld - Mismatched round brackets?", - linecnt); + pending->rbrack= + g_strdup_printf(" Line %ld - Mismatched round brackets?",linecnt); if (counters->s_brack) - sprintf(pending->sbrack," Line %ld - Mismatched square brackets?", - linecnt); + pending->sbrack= + g_strdup_printf(" Line %ld - Mismatched square brackets?",linecnt); if (counters->c_brack) - sprintf(pending->cbrack," Line %ld - Mismatched curly brackets?", - linecnt); + pending->cbrack= + g_strdup_printf(" Line %ld - Mismatched curly brackets?",linecnt); if (counters->c_unders%2) - sprintf(pending->unders," Line %ld - Mismatched underscores?", - linecnt); + pending->unders= + g_strdup_printf(" Line %ld - Mismatched underscores?",linecnt); } /* @@ -2563,50 +2617,63 @@ } } +gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data) +{ + const char *word=key; + int *dupcnt=value; + if (*dupcnt) + printf("\nNote: Queried word %s was duplicated %d times\n", + word,*dupcnt); + return FALSE; +} + /* * procfile: * * Process one file. */ -void procfile(char *filename) +void procfile(const char *filename) { const char *s; - char parastart[81]; /* first line of current para */ - FILE *infile; + gchar *parastart=NULL; /* first line of current para */ + gchar *etext,*aline; + gchar *etext_ptr; + GError *err=NULL; struct first_pass_results *first_pass_results; struct warnings *warnings; struct counters counters={0}; struct line_properties last={0}; struct parities parities={0}; - struct pending pending={{0},}; - int isemptyline; + struct pending pending={0}; + gboolean isemptyline; long start_para_line=0; - int i,isnewpara=0,enddash=0; + gboolean isnewpara=FALSE,enddash=FALSE; last.start=CHAR_SPACE; - *prevline=0; linecnt=checked_linecnt=0; - infile=fopen(filename,"rb"); - if (!infile) + etext=read_etext(filename,&err); + if (!etext) { if (pswit[STDOUT_SWITCH]) - fprintf(stdout,"bookloupe: cannot open %s\n",filename); + fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message); else - fprintf(stderr,"bookloupe: cannot open %s\n",filename); + fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message); exit(1); } fprintf(stdout,"\n\nFile: %s\n\n",filename); - first_pass_results=first_pass(infile); + first_pass_results=first_pass(etext); warnings=report_first_pass(first_pass_results); + qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free); + qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL); /* * Here we go with the main pass. Hold onto yer hat! */ - rewind(infile); linecnt=0; - while (flgets(aline,LINEBUFSIZE-1,infile,linecnt+1)) + etext_ptr=etext; + while ((aline=flgets(&etext_ptr,linecnt+1))) { linecnt++; if (linecnt==1) - isnewpara=1; + isnewpara=TRUE; if (pswit[DP_SWITCH] && !strncmp(aline,"-----File: ",11)) continue; // skip DP page separators completely if (linecntfirstline || @@ -2635,8 +2702,8 @@ /* This line is the start of a new paragraph. */ start_para_line=linecnt; /* Capture its first line in case we want to report it later. */ - strncpy(parastart,aline,80); - parastart[79]=0; + g_free(parastart); + parastart=g_strdup(aline); memset(&parities,0,sizeof(parities)); /* restart the quote count */ s=aline; while (!gcisalpha(*s) && !gcisdigit(*s) && *s) @@ -2653,7 +2720,7 @@ else cnt_punct++; } - isnewpara=0; /* Signal the end of new para processing. */ + isnewpara=FALSE; /* Signal the end of new para processing. */ } /* Check for an em-dash broken at line end. */ if (enddash && *aline=='-') @@ -2665,11 +2732,11 @@ else cnt_punct++; } - enddash=0; + enddash=FALSE; for (s=aline+strlen(aline)-1;*s==' ' && s>aline;s--) ; if (s>=aline && *s=='-') - enddash=1; + enddash=TRUE; check_for_control_characters(aline); if (warnings->bin) check_for_odd_characters(aline,warnings,isemptyline); @@ -2709,40 +2776,49 @@ check_for_mismatched_quotes(&counters,&pending); memset(&counters,0,sizeof(counters)); /* let the next iteration know that it's starting a new para */ - isnewpara=1; - check_for_omitted_punctuation(prevline,&last,start_para_line); + isnewpara=TRUE; + if (prevline) + check_for_omitted_punctuation(prevline,&last,start_para_line); } - strcpy(prevline,aline); + g_free(prevline); + prevline=g_strdup(aline); } - fclose(infile); + if (prevline) + { + g_free(prevline); + prevline=NULL; + } + g_free(parastart); + g_free(prevline); + g_free(etext); if (!pswit[OVERVIEW_SWITCH]) - for (i=0;ilen;i++) { - thisword[wordlen]=*s; - wordlen++; - } - thisword[wordlen]=0; - for (i=1;istr[i]=='.' || word->str[i]==',') { - if (gcisdigit(thisword[i-1]) && gcisdigit(thisword[i-1])) + if (gcisdigit(word->str[i-1]) && gcisdigit(word->str[i-1])) { - fromline=s; - return fromline; + *ptr=s; + return g_string_free(word,FALSE); } } } /* we didn't find a punctuated number - do the regular getword thing */ - wordlen=0; - for (;(gcisdigit(*fromline) || gcisalpha(*fromline) || *fromline=='\'') && - wordlen='A' && *theline<='Z') - *theline+=32; + g_string_truncate(word,0); + for (;gcisdigit(**ptr) || gcisalpha(**ptr) || **ptr=='\'';(*ptr)++) + g_string_append_c(word,**ptr); + return g_string_free(word,FALSE); } /* @@ -2961,11 +3000,11 @@ * XL or an optional XC, an optional IX or IV, an optional V and any number * of optional Is. */ -int isroman(char *t) +gboolean isroman(const char *t) { - char *s; + const char *s; if (!t || !*t) - return 0; + return FALSE; s=t; while (*t=='m' && *t) t++; @@ -3006,19 +3045,19 @@ * errors. gcisalpha() recognizes accented letters from the CP1252 (Windows) * and ISO-8859-1 character sets, which are the most common PG 8-bit types. */ -int gcisalpha(unsigned char c) +gboolean gcisalpha(unsigned char c) { if (c>='a' && c<='z') - return 1; + return TRUE; if (c>='A' && c<='Z') - return 1; + return TRUE; if (c<140) - return 0; + return FALSE; if (c>=192 && c!=208 && c!=215 && c!=222 && c!=240 && c!=247 && c!=254) - return 1; + return TRUE; if (c==140 || c==142 || c==156 || c==158 || c==159) - return 1; - return 0; + return TRUE; + return FALSE; } /* @@ -3026,7 +3065,7 @@ * * A version of isdigit() that doesn't get confused in 8-bit texts. */ -int gcisdigit(unsigned char c) +gboolean gcisdigit(unsigned char c) { return c>='0' && c<='9'; } @@ -3037,24 +3076,12 @@ * A version of isletter() that doesn't get confused in 8-bit texts. * NB: this is ISO-8891-1-specific. */ -int gcisletter(unsigned char c) +gboolean gcisletter(unsigned char c) { return c>='A' && c<='Z' || c>='a' && c<='z' || c>=192; } /* - * gcstrchr: - * - * Wraps strchr to return NULL if the character being searched for is zero. - */ -char *gcstrchr(char *s,char c) -{ - if (!c) - return NULL; - return strchr(s,c); -} - -/* * postprocess_for_DP: * * Invoked with the -d switch from flgets(). @@ -3097,7 +3124,7 @@ */ void postprocess_for_HTML(char *theline) { - if (strstr(theline,"<") && strstr(theline,">")) + if (strchr(theline,'<') && strchr(theline,'>')) while (losemarkup(theline)) ; while (loseentities(theline)) @@ -3171,9 +3198,9 @@ return NULL; } -int tagcomp(char *strin,char *basetag) +int tagcomp(const char *strin,const char *basetag) { - char *s,*t; + const char *s,*t; s=basetag; t=strin; if (*t=='/') @@ -3188,8 +3215,9 @@ return 0; } -void proghelp() +void proghelp(GOptionContext *context) { + gchar *help; fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr); fputs("Copyright 2000-2005 Jim Tinsley .\n",stderr); fputs("Copyright 2012- J. Ali Harlow .\n",stderr); @@ -3198,22 +3226,10 @@ fputs("This is Free Software; " "you may redistribute it under certain conditions (GPL);\n",stderr); fputs("read the file COPYING for details.\n\n",stderr); - fputs("Usage is: bookloupe [-setpxloyhud] filename\n",stderr); - fputs(" where -s checks single quotes, -e suppresses echoing lines, " - "-t checks typos\n",stderr); - fputs(" -x (paranoid) switches OFF -t and extra checks, " - "-l turns OFF line-end checks\n",stderr); - fputs(" -o just displays overview without detail, " - "-h echoes header fields\n",stderr); - fputs(" -v (verbose) unsuppresses duplicate reporting, " - "-m suppresses markup\n",stderr); - fputs(" -d ignores DP-specific markup,\n",stderr); - fputs(" -u uses a file gutcheck.typ to query user-defined " - "possible typos\n",stderr); - fputs("Sample usage: bookloupe warpeace.txt \n",stderr); - fputs("\n",stderr); - fputs("Bookloupe looks for errors in Project Gutenberg(TM) etexts.\n", - stderr); + help=g_option_context_get_help(context,TRUE,NULL); + fputs(help,stderr); + g_free(help); + fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr); fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; " "non-ASCII\n",stderr); fputs("characters like accented letters, " diff -r adb087007d08 -r 1016349e619f configure.ac --- a/configure.ac Mon May 27 09:03:04 2013 +0100 +++ b/configure.ac Tue May 28 15:17:19 2013 +0100 @@ -13,7 +13,8 @@ test/compatibility/Makefile doc/Makefile ]) -AM_INIT_AUTOMAKE(no-define) +AM_INIT_AUTOMAKE(no-define,1.11) +AM_SILENT_RULES([yes]) AC_CANONICAL_HOST ################################################## diff -r adb087007d08 -r 1016349e619f test/compatibility/user-defined-typo.tst --- a/test/compatibility/user-defined-typo.tst Mon May 27 09:03:04 2013 +0100 +++ b/test/compatibility/user-defined-typo.tst Tue May 28 15:17:19 2013 +0100 @@ -1,6 +1,6 @@ **************** OPTIONS **************** -u -**************** INPUT(gutcheck.typ) **************** +**************** INPUT(bookloupe.typ) **************** arid **************** INPUT **************** I am the very model of a modern Major-General, diff -r adb087007d08 -r 1016349e619f test/harness/testcaseparser.c --- a/test/harness/testcaseparser.c Mon May 27 09:03:04 2013 +0100 +++ b/test/harness/testcaseparser.c Tue May 28 15:17:19 2013 +0100 @@ -91,9 +91,12 @@ { TestcaseParser *parser; gsize len; + GError *err=NULL; parser=g_new0(TestcaseParser,1); - if (!file_get_contents_text(filename,&parser->contents,&len)) + if (!file_get_contents_text(filename,&parser->contents,&len,&err)) { + g_printerr("%s: %s\n",filename,err->message); + g_error_free(err); g_free(parser); return NULL; }