1.1 --- a/bookloupe/bookloupe.c Fri May 24 22:47:16 2013 +0100
1.2 +++ b/bookloupe/bookloupe.c Sat May 25 08:52:47 2013 +0100
1.3 @@ -546,82 +546,30 @@
1.4 return 0;
1.5 }
1.6
1.7 +struct first_pass_results { /* per-file tallies gathered by first_pass() and read back by procfile() */
1.8 + long firstline,astline; /* firstline: copied from spline/nspline, main pass skips lines below it (header marker, presumably); astline: lines containing an asterisk plus a lowercase a-z letter */
1.9 + long footerline,totlen,binlen,alphalen,endquote_count,shortline,dotcomma; /* footerline: linecnt recorded when the footer is detected; totlen, binlen, alphalen: total chars, chars above 127 and gcisalpha() chars seen before the footer; endquote_count: CHAR_DQUOTE directly after an isalpha() char; shortline: lines judged short against SHORTEST_PG_LINE; dotcomma: lines containing ".," */
1.10 + long fslashline,hyphens,longline,verylongline,htmcount,standalone_digit; /* fslashline: lines containing '/'; hyphens: lines whose last non-space char is a single '-'; longline, verylongline: lines longer than LONGEST_PG_LINE, WAY_TOO_LONG; htmcount: score for lines with '<' before '>' (+4 more for "<i>"); standalone_digit: words equal to "0" or "1" */
1.11 + long spacedash,emdash,space_emdash,non_PG_space_emdash,PG_space_emdash; /* spacedash: lines with " -" not followed by '-'; emdash: lines containing "--"; for the first "--" on a line: space_emdash = CHAR_SPACE on either side, non_PG_space_emdash = on both sides, PG_space_emdash = on neither side */
1.12 + signed int Dutchcount,Frenchcount; /* occurrences of the words "hij"/"niet" and "dans"/"avec"; procfile() compares each against 50 to guess the language */
1.13 +}; /* NOTE(review): first_pass() returns a pointer to a single function-static instance and no reset is visible in this diff, so totals would carry over if it were called more than once - confirm it is only called once per run */
1.14 +
1.15 /*
1.16 - * procfile:
1.17 + * first_pass:
1.18 *
1.19 - * Process one file.
1.20 + * Run a first pass - verify that it's a valid PG
1.21 + * file, decide whether to report some things that
1.22 + * occur many times in the text like long or short
1.23 + * lines, non-standard dashes, etc.
1.24 */
1.25 -void procfile(char *filename)
1.26 +struct first_pass_results *first_pass(FILE *infile)
1.27 {
1.28 - char *s,*t,*s1,laststart,*wordstart;
1.29 - char inword[MAXWORDLEN],testword[MAXWORDLEN];
1.30 - char parastart[81]; /* first line of current para */
1.31 - FILE *infile;
1.32 - long quot,squot,firstline,alphalen,totlen,binlen,
1.33 - shortline,longline,verylongline,spacedash,emdash,
1.34 - space_emdash,non_PG_space_emdash,PG_space_emdash,
1.35 - footerline,dotcomma,start_para_line,astline,fslashline,
1.36 - standalone_digit,hyphens,htmcount,endquote_count;
1.37 - long spline,nspline;
1.38 - signed int i,j,llen,isemptyline,isacro,isellipsis,istypo,alower,
1.39 - eNon_A,eTab,eTilde,eAst,eFSlash,eCarat;
1.40 - signed int warn_short,warn_long,warn_bin,warn_dash,warn_dotcomma,
1.41 - warn_ast,warn_fslash,warn_digit,warn_hyphen,warn_endquote;
1.42 - unsigned int lastlen,lastblen;
1.43 - signed int s_brack,c_brack,r_brack,c_unders;
1.44 - signed int open_single_quote,close_single_quote,guessquote,dquotepar,
1.45 - squotepar;
1.46 - signed int isnewpara,vowel,consonant;
1.47 - char dquote_err[80],squote_err[80],rbrack_err[80],sbrack_err[80],
1.48 - cbrack_err[80],unders_err[80];
1.49 - signed int qword_index,qperiod_index,isdup;
1.50 - signed int enddash;
1.51 - signed int Dutchcount,isDutch,Frenchcount,isFrench;
1.52 - laststart=CHAR_SPACE;
1.53 - lastlen=lastblen=0;
1.54 - *dquote_err=*squote_err=*rbrack_err=*cbrack_err=*sbrack_err=
1.55 - *unders_err=*prevline=0;
1.56 - linecnt=firstline=alphalen=totlen=binlen=
1.57 - shortline=longline=spacedash=emdash=checked_linecnt=
1.58 - space_emdash=non_PG_space_emdash=PG_space_emdash=
1.59 - footerline=dotcomma=start_para_line=astline=fslashline=
1.60 - standalone_digit=hyphens=htmcount=endquote_count=0;
1.61 - quot=squot=s_brack=c_brack=r_brack=c_unders=0;
1.62 - i=llen=isemptyline=isacro=isellipsis=istypo=0;
1.63 - warn_short=warn_long=warn_bin=warn_dash=warn_dotcomma=
1.64 - warn_ast=warn_fslash=warn_digit=warn_endquote=0;
1.65 - isnewpara=vowel=consonant=enddash=0;
1.66 - spline=nspline=0;
1.67 - qword_index=qperiod_index=isdup=0;
1.68 - *inword=*testword=0;
1.69 - open_single_quote=close_single_quote=guessquote=dquotepar=squotepar=0;
1.70 - Dutchcount=isDutch=Frenchcount=isFrench=0;
1.71 - for (j=0;j<MAX_QWORD;j++)
1.72 - {
1.73 - dupcnt[j]=0;
1.74 - for (i=0;i<MAX_QWORD_LENGTH;i++)
1.75 - {
1.76 - qword[i][j]=0;
1.77 - qperiod[i][j]=0;
1.78 - }
1.79 - }
1.80 - infile=fopen(filename,"rb");
1.81 - if (!infile)
1.82 - {
1.83 - if (pswit[STDOUT_SWITCH])
1.84 - fprintf(stdout,"bookloupe: cannot open %s\n",filename);
1.85 - else
1.86 - fprintf(stderr,"bookloupe: cannot open %s\n",filename);
1.87 - exit(1);
1.88 - }
1.89 - fprintf(stdout,"\n\nFile: %s\n\n",filename);
1.90 - firstline=shortline=longline=verylongline=0;
1.91 - /*
1.92 - * Run a first pass - verify that it's a valid PG
1.93 - * file, decide whether to report some things that
1.94 - * occur many times in the text like long or short
1.95 - * lines, non-standard dashes, etc.
1.96 - */
1.97 + char laststart=CHAR_SPACE,*s;
1.98 + signed int i,llen;
1.99 + unsigned int lastlen=0,lastblen=0;
1.100 + long spline=0,nspline=0;
1.101 + static struct first_pass_results results={0};
1.102 + char inword[MAXWORDLEN]="";
1.103 while (fgets(aline,LINEBUFSIZE-1,infile))
1.104 {
1.105 while (aline[strlen(aline)-1]==10 || aline[strlen(aline)-1]==13)
1.106 @@ -647,41 +595,41 @@
1.107 {
1.108 if (strstr(aline,"end")<strstr(aline,"project gutenberg"))
1.109 {
1.110 - if (footerline)
1.111 + if (results.footerline)
1.112 {
1.113 /* it's an old-form header - we can detect duplicates */
1.114 if (!nspline)
1.115 printf(" --> Duplicate footer?\n");
1.116 }
1.117 else
1.118 - footerline=linecnt;
1.119 + results.footerline=linecnt;
1.120 }
1.121 }
1.122 }
1.123 if (spline)
1.124 - firstline=spline;
1.125 + results.firstline=spline;
1.126 if (nspline)
1.127 - firstline=nspline; /* override with new */
1.128 - if (footerline)
1.129 + results.firstline=nspline; /* override with new */
1.130 + if (results.footerline)
1.131 continue; /* don't count the boilerplate in the footer */
1.132 llen=strlen(aline);
1.133 - totlen+=llen;
1.134 + results.totlen+=llen;
1.135 for (i=0;i<llen;i++)
1.136 {
1.137 if ((unsigned char)aline[i]>127)
1.138 - binlen++;
1.139 + results.binlen++;
1.140 if (gcisalpha(aline[i]))
1.141 - alphalen++;
1.142 + results.alphalen++;
1.143 if (i>0 && aline[i]==CHAR_DQUOTE && isalpha(aline[i-1]))
1.144 - endquote_count++;
1.145 + results.endquote_count++;
1.146 }
1.147 if (strlen(aline)>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE &&
1.148 lastblen>2 && lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
1.149 - shortline++;
1.150 + results.shortline++;
1.151 if (*aline && (unsigned char)aline[strlen(aline)-1]<=CHAR_SPACE)
1.152 cnt_spacend++;
1.153 if (strstr(aline,".,"))
1.154 - dotcomma++;
1.155 + results.dotcomma++;
1.156 /* only count ast lines for ignoring purposes where there is */
1.157 /* locase text on the line */
1.158 if (strstr(aline,"*"))
1.159 @@ -690,150 +638,219 @@
1.160 if (*s>='a' && *s<='z')
1.161 break;
1.162 if (*s)
1.163 - astline++;
1.164 + results.astline++;
1.165 }
1.166 if (strstr(aline,"/"))
1.167 - fslashline++;
1.168 + results.fslashline++;
1.169 for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--)
1.170 ;
1.171 if (aline[i]=='-' && aline[i-1]!='-')
1.172 - hyphens++;
1.173 + results.hyphens++;
1.174 if (llen>LONGEST_PG_LINE)
1.175 - longline++;
1.176 + results.longline++;
1.177 if (llen>WAY_TOO_LONG)
1.178 - verylongline++;
1.179 + results.verylongline++;
1.180 if (strstr(aline,"<") && strstr(aline,">"))
1.181 {
1.182 i=(signed int)(strstr(aline,">")-strstr(aline,"<")+1);
1.183 if (i>0)
1.184 - htmcount++;
1.185 + results.htmcount++;
1.186 if (strstr(aline,"<i>"))
1.187 - htmcount+=4; /* bonus marks! */
1.188 + results.htmcount+=4; /* bonus marks! */
1.189 }
1.190 /* Check for spaced em-dashes */
1.191 if (strstr(aline,"--"))
1.192 {
1.193 - emdash++;
1.194 + results.emdash++;
1.195 if (*(strstr(aline,"--")-1)==CHAR_SPACE ||
1.196 (*(strstr(aline,"--")+2)==CHAR_SPACE))
1.197 - space_emdash++;
1.198 + results.space_emdash++;
1.199 if (*(strstr(aline,"--")-1)==CHAR_SPACE &&
1.200 (*(strstr(aline,"--")+2)==CHAR_SPACE))
1.201 /* count of em-dashes with spaces both sides */
1.202 - non_PG_space_emdash++;
1.203 + results.non_PG_space_emdash++;
1.204 if (*(strstr(aline,"--")-1)!=CHAR_SPACE &&
1.205 (*(strstr(aline,"--")+2)!=CHAR_SPACE))
1.206 /* count of PG-type em-dashes with no spaces */
1.207 - PG_space_emdash++;
1.208 + results.PG_space_emdash++;
1.209 }
1.210 for (s=aline;*s;)
1.211 {
1.212 s=getaword(s,inword);
1.213 if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))
1.214 - Dutchcount++;
1.215 + results.Dutchcount++;
1.216 if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))
1.217 - Frenchcount++;
1.218 + results.Frenchcount++;
1.219 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
1.220 - standalone_digit++;
1.221 + results.standalone_digit++;
1.222 }
1.223 /* Check for spaced dashes */
1.224 if (strstr(aline," -") && *(strstr(aline," -")+2)!='-')
1.225 - spacedash++;
1.226 + results.spacedash++;
1.227 lastblen=lastlen;
1.228 lastlen=strlen(aline);
1.229 laststart=aline[0];
1.230 }
1.231 + return &results;
1.232 +}
1.233 +
1.234 +/*
1.235 + * procfile:
1.236 + *
1.237 + * Process one file.
1.238 + */
1.239 +void procfile(char *filename)
1.240 +{
1.241 + char *s,*t,*s1,laststart,*wordstart;
1.242 + char inword[MAXWORDLEN],testword[MAXWORDLEN];
1.243 + char parastart[81]; /* first line of current para */
1.244 + FILE *infile;
1.245 + struct first_pass_results *first_pass_results;
1.246 + long quot,squot,start_para_line;
1.247 + signed int i,j,llen,isemptyline,isacro,isellipsis,istypo,alower,
1.248 + eNon_A,eTab,eTilde,eAst,eFSlash,eCarat;
1.249 + signed int warn_short,warn_long,warn_bin,warn_dash,warn_dotcomma,
1.250 + warn_ast,warn_fslash,warn_digit,warn_hyphen,warn_endquote;
1.251 + unsigned int lastlen,lastblen;
1.252 + signed int s_brack,c_brack,r_brack,c_unders;
1.253 + signed int open_single_quote,close_single_quote,guessquote,dquotepar,
1.254 + squotepar;
1.255 + signed int isnewpara,vowel,consonant;
1.256 + char dquote_err[80],squote_err[80],rbrack_err[80],sbrack_err[80],
1.257 + cbrack_err[80],unders_err[80];
1.258 + signed int qword_index,qperiod_index,isdup;
1.259 + signed int enddash;
1.260 + signed int isDutch,isFrench;
1.261 + laststart=CHAR_SPACE;
1.262 + lastlen=lastblen=0;
1.263 + *dquote_err=*squote_err=*rbrack_err=*cbrack_err=*sbrack_err=
1.264 + *unders_err=*prevline=0;
1.265 + linecnt=checked_linecnt=start_para_line=0;
1.266 + quot=squot=s_brack=c_brack=r_brack=c_unders=0;
1.267 + i=llen=isemptyline=isacro=isellipsis=istypo=0;
1.268 + warn_short=warn_long=warn_bin=warn_dash=warn_dotcomma=
1.269 + warn_ast=warn_fslash=warn_digit=warn_endquote=0;
1.270 + isnewpara=vowel=consonant=enddash=0;
1.271 + qword_index=qperiod_index=isdup=0;
1.272 + *inword=*testword=0;
1.273 + open_single_quote=close_single_quote=guessquote=dquotepar=squotepar=0;
1.274 + isDutch=isFrench=0;
1.275 + for (j=0;j<MAX_QWORD;j++)
1.276 + {
1.277 + dupcnt[j]=0;
1.278 + for (i=0;i<MAX_QWORD_LENGTH;i++)
1.279 + {
1.280 + qword[i][j]=0;
1.281 + qperiod[i][j]=0;
1.282 + }
1.283 + }
1.284 + infile=fopen(filename,"rb");
1.285 + if (!infile)
1.286 + {
1.287 + if (pswit[STDOUT_SWITCH])
1.288 + fprintf(stdout,"bookloupe: cannot open %s\n",filename);
1.289 + else
1.290 + fprintf(stderr,"bookloupe: cannot open %s\n",filename);
1.291 + exit(1);
1.292 + }
1.293 + fprintf(stdout,"\n\nFile: %s\n\n",filename);
1.294 +
1.295 + first_pass_results=first_pass(infile);
1.296 +
1.297 fclose(infile);
1.298 /* now, based on this quick view, make some snap decisions */
1.299 if (cnt_spacend>0)
1.300 printf(" --> %ld lines in this file have white space at end\n",
1.301 cnt_spacend);
1.302 warn_dotcomma=1;
1.303 - if (dotcomma>5)
1.304 + if (first_pass_results->dotcomma>5)
1.305 {
1.306 warn_dotcomma=0;
1.307 printf(" --> %ld lines in this file contain '.,'. "
1.308 - "Not reporting them.\n",dotcomma);
1.309 + "Not reporting them.\n",first_pass_results->dotcomma);
1.310 }
1.311 /* if more than 50 lines, or one-tenth, are short,
1.312 * don't bother reporting them */
1.313 warn_short=1;
1.314 - if (shortline>50 || shortline*10>linecnt)
1.315 + if (first_pass_results->shortline>50 ||
1.316 + first_pass_results->shortline*10>linecnt)
1.317 {
1.318 warn_short=0;
1.319 printf(" --> %ld lines in this file are short. "
1.320 - "Not reporting short lines.\n",shortline);
1.321 + "Not reporting short lines.\n",first_pass_results->shortline);
1.322 }
1.323 /*
1.324 * If more than 50 lines, or one-tenth, are long,
1.325 * don't bother reporting them.
1.326 */
1.327 warn_long=1;
1.328 - if (longline>50 || longline*10>linecnt)
1.329 + if (first_pass_results->longline>50 ||
1.330 + first_pass_results->longline*10>linecnt)
1.331 {
1.332 warn_long=0;
1.333 printf(" --> %ld lines in this file are long. "
1.334 - "Not reporting long lines.\n",longline);
1.335 + "Not reporting long lines.\n",first_pass_results->longline);
1.336 }
1.337 /* If more than 10 lines contain asterisks, don't bother reporting them. */
1.338 warn_ast=1;
1.339 - if (astline>10)
1.340 + if (first_pass_results->astline>10)
1.341 {
1.342 warn_ast=0;
1.343 printf(" --> %ld lines in this file contain asterisks. "
1.344 - "Not reporting them.\n",astline);
1.345 + "Not reporting them.\n",first_pass_results->astline);
1.346 }
1.347 /*
1.348 * If more than 10 lines contain forward slashes,
1.349 * don't bother reporting them.
1.350 */
1.351 warn_fslash=1;
1.352 - if (fslashline>10)
1.353 + if (first_pass_results->fslashline>10)
1.354 {
1.355 warn_fslash=0;
1.356 printf(" --> %ld lines in this file contain forward slashes. "
1.357 - "Not reporting them.\n",fslashline);
1.358 + "Not reporting them.\n",first_pass_results->fslashline);
1.359 }
1.360 /*
1.361 * If more than 20 lines contain unpunctuated endquotes,
1.362 * don't bother reporting them.
1.363 */
1.364 warn_endquote=1;
1.365 - if (endquote_count>20)
1.366 + if (first_pass_results->endquote_count>20)
1.367 {
1.368 warn_endquote=0;
1.369 printf(" --> %ld lines in this file contain unpunctuated endquotes. "
1.370 - "Not reporting them.\n",endquote_count);
1.371 + "Not reporting them.\n",first_pass_results->endquote_count);
1.372 }
1.373 /*
1.374 * If more than 15 lines contain standalone digits,
1.375 * don't bother reporting them.
1.376 */
1.377 warn_digit=1;
1.378 - if (standalone_digit>10)
1.379 + if (first_pass_results->standalone_digit>10)
1.380 {
1.381 warn_digit=0;
1.382 printf(" --> %ld lines in this file contain standalone 0s and 1s. "
1.383 - "Not reporting them.\n",standalone_digit);
1.384 + "Not reporting them.\n",first_pass_results->standalone_digit);
1.385 }
1.386 /*
1.387 * If more than 20 lines contain hyphens at end,
1.388 * don't bother reporting them.
1.389 */
1.390 warn_hyphen=1;
1.391 - if (hyphens>20)
1.392 + if (first_pass_results->hyphens>20)
1.393 {
1.394 warn_hyphen=0;
1.395 printf(" --> %ld lines in this file have hyphens at end. "
1.396 - "Not reporting them.\n",hyphens);
1.397 + "Not reporting them.\n",first_pass_results->hyphens);
1.398 }
1.399 - if (htmcount>20 && !pswit[MARKUP_SWITCH])
1.400 + if (first_pass_results->htmcount>20 && !pswit[MARKUP_SWITCH])
1.401 {
1.402 printf(" --> Looks like this is HTML. Switching HTML mode ON.\n");
1.403 pswit[MARKUP_SWITCH]=1;
1.404 }
1.405 - if (verylongline>0)
1.406 - printf(" --> %ld lines in this file are VERY long!\n",verylongline);
1.407 + if (first_pass_results->verylongline>0)
1.408 + printf(" --> %ld lines in this file are VERY long!\n",
1.409 + first_pass_results->verylongline);
1.410 /*
1.411 * If there are more non-PG spaced dashes than PG em-dashes,
1.412 * assume it's deliberate.
1.413 @@ -841,53 +858,56 @@
1.414 * and some people insist on them whatever the guidelines say.
1.415 */
1.416 warn_dash=1;
1.417 - if (spacedash+non_PG_space_emdash>PG_space_emdash)
1.418 + if (first_pass_results->spacedash+first_pass_results->non_PG_space_emdash>
1.419 + first_pass_results->PG_space_emdash)
1.420 {
1.421 warn_dash=0;
1.422 printf(" --> There are %ld spaced dashes and em-dashes. "
1.423 - "Not reporting them.\n",spacedash+non_PG_space_emdash);
1.424 + "Not reporting them.\n",first_pass_results->spacedash+
1.425 + first_pass_results->non_PG_space_emdash);
1.426 }
1.427 /* If more than a quarter of characters are hi-bit, bug out. */
1.428 warn_bin=1;
1.429 - if (binlen*4>totlen)
1.430 + if (first_pass_results->binlen*4>first_pass_results->totlen)
1.431 {
1.432 printf(" --> This file does not appear to be ASCII. "
1.433 "Terminating. Best of luck with it!\n");
1.434 exit(1);
1.435 }
1.436 - if (alphalen*4<totlen)
1.437 + if (first_pass_results->alphalen*4<first_pass_results->totlen)
1.438 {
1.439 printf(" --> This file does not appear to be text. "
1.440 "Terminating. Best of luck with it!\n");
1.441 exit(1);
1.442 }
1.443 - if (binlen*100>totlen || binlen>100)
1.444 + if (first_pass_results->binlen*100>first_pass_results->totlen ||
1.445 + first_pass_results->binlen>100)
1.446 {
1.447 printf(" --> There are a lot of foreign letters here. "
1.448 "Not reporting them.\n");
1.449 warn_bin=0;
1.450 }
1.451 isDutch=0;
1.452 - if (Dutchcount>50)
1.453 + if (first_pass_results->Dutchcount>50)
1.454 {
1.455 isDutch=1;
1.456 printf(" --> This looks like Dutch - "
1.457 "switching off dashes and warnings for 's Middags case.\n");
1.458 }
1.459 isFrench=0;
1.460 - if (Frenchcount>50)
1.461 + if (first_pass_results->Frenchcount>50)
1.462 {
1.463 isFrench=1;
1.464 printf(" --> This looks like French - "
1.465 "switching off some doublepunct.\n");
1.466 }
1.467 - if (firstline && footerline)
1.468 + if (first_pass_results->firstline && first_pass_results->footerline)
1.469 printf(" The PG header and footer appear to be already on.\n");
1.470 else
1.471 {
1.472 - if (firstline)
1.473 + if (first_pass_results->firstline)
1.474 printf(" The PG header is on - no footer.\n");
1.475 - if (footerline)
1.476 + if (first_pass_results->footerline)
1.477 printf(" The PG footer is on - no header.\n");
1.478 }
1.479 printf("\n");
1.480 @@ -916,14 +936,15 @@
1.481 fprintf(stderr,"bookloupe: cannot open %s\n",filename);
1.482 exit(1);
1.483 }
1.484 - if (footerline>0 && firstline>0 && footerline>firstline &&
1.485 - footerline-firstline<100)
1.486 + if (first_pass_results->footerline>0 && first_pass_results->firstline>0 &&
1.487 + first_pass_results->footerline>first_pass_results->firstline &&
1.488 + first_pass_results->footerline-first_pass_results->firstline<100)
1.489 {
1.490 printf(" --> I don't really know where this text starts. \n");
1.491 printf(" There are no reference points.\n");
1.492 printf(" I'm going to have to report the header and footer "
1.493 "as well.\n");
1.494 - firstline=0;
1.495 + first_pass_results->firstline=0;
1.496 }
1.497 /*
1.498 * Here we go with the main pass. Hold onto yer hat!
1.499 @@ -939,7 +960,9 @@
1.500 isnewpara=1;
1.501 if (pswit[DP_SWITCH] && !strncmp(aline,"-----File: ",11))
1.502 continue; // skip DP page separators completely
1.503 - if (linecnt<firstline || (footerline>0 && linecnt>footerline))
1.504 + if (linecnt<first_pass_results->firstline ||
1.505 + (first_pass_results->footerline>0 &&
1.506 + linecnt>first_pass_results->footerline))
1.507 {
1.508 if (pswit[HEADER_SWITCH])
1.509 {