# HG changeset patch # User ali # Date 1369468367 -3600 # Node ID 68b1403e2971fecd8f1530b668c3bbff9dac7838 # Parent b130f135022d7c0d4e12cc0b08fe07cbb49ce314 Break first_pass() out diff -r b130f135022d -r 68b1403e2971 bookloupe/bookloupe.c --- a/bookloupe/bookloupe.c Fri May 24 22:47:16 2013 +0100 +++ b/bookloupe/bookloupe.c Sat May 25 08:52:47 2013 +0100 @@ -546,82 +546,30 @@ return 0; } +struct first_pass_results { + long firstline,astline; + long footerline,totlen,binlen,alphalen,endquote_count,shortline,dotcomma; + long fslashline,hyphens,longline,verylongline,htmcount,standalone_digit; + long spacedash,emdash,space_emdash,non_PG_space_emdash,PG_space_emdash; + signed int Dutchcount,Frenchcount; +}; + /* - * procfile: + * first_pass: * - * Process one file. + * Run a first pass - verify that it's a valid PG + * file, decide whether to report some things that + * occur many times in the text like long or short + * lines, non-standard dashes, etc. */ -void procfile(char *filename) +struct first_pass_results *first_pass(FILE *infile) { - char *s,*t,*s1,laststart,*wordstart; - char inword[MAXWORDLEN],testword[MAXWORDLEN]; - char parastart[81]; /* first line of current para */ - FILE *infile; - long quot,squot,firstline,alphalen,totlen,binlen, - shortline,longline,verylongline,spacedash,emdash, - space_emdash,non_PG_space_emdash,PG_space_emdash, - footerline,dotcomma,start_para_line,astline,fslashline, - standalone_digit,hyphens,htmcount,endquote_count; - long spline,nspline; - signed int i,j,llen,isemptyline,isacro,isellipsis,istypo,alower, - eNon_A,eTab,eTilde,eAst,eFSlash,eCarat; - signed int warn_short,warn_long,warn_bin,warn_dash,warn_dotcomma, - warn_ast,warn_fslash,warn_digit,warn_hyphen,warn_endquote; - unsigned int lastlen,lastblen; - signed int s_brack,c_brack,r_brack,c_unders; - signed int open_single_quote,close_single_quote,guessquote,dquotepar, - squotepar; - signed int isnewpara,vowel,consonant; - char dquote_err[80],squote_err[80],rbrack_err[80],sbrack_err[80], - cbrack_err[80],unders_err[80]; - signed int qword_index,qperiod_index,isdup; - signed int enddash; - signed int Dutchcount,isDutch,Frenchcount,isFrench; - laststart=CHAR_SPACE; - lastlen=lastblen=0; - *dquote_err=*squote_err=*rbrack_err=*cbrack_err=*sbrack_err= - *unders_err=*prevline=0; - linecnt=firstline=alphalen=totlen=binlen= - shortline=longline=spacedash=emdash=checked_linecnt= - space_emdash=non_PG_space_emdash=PG_space_emdash= - footerline=dotcomma=start_para_line=astline=fslashline= - standalone_digit=hyphens=htmcount=endquote_count=0; - quot=squot=s_brack=c_brack=r_brack=c_unders=0; - i=llen=isemptyline=isacro=isellipsis=istypo=0; - warn_short=warn_long=warn_bin=warn_dash=warn_dotcomma= - warn_ast=warn_fslash=warn_digit=warn_endquote=0; - isnewpara=vowel=consonant=enddash=0; - spline=nspline=0; - qword_index=qperiod_index=isdup=0; - *inword=*testword=0; - open_single_quote=close_single_quote=guessquote=dquotepar=squotepar=0; - Dutchcount=isDutch=Frenchcount=isFrench=0; - for (j=0;j Duplicate footer?\n"); } else - footerline=linecnt; + results.footerline=linecnt; } } } if (spline) - firstline=spline; + results.firstline=spline; if (nspline) - firstline=nspline; /* override with new */ - if (footerline) + results.firstline=nspline; /* override with new */ + if (results.footerline) continue; /* don't count the boilerplate in the footer */ llen=strlen(aline); - totlen+=llen; + results.totlen+=llen; for (i=0;i127) - binlen++; + results.binlen++; if (gcisalpha(aline[i])) - alphalen++; + results.alphalen++; if (i>0 && aline[i]==CHAR_DQUOTE && isalpha(aline[i-1])) - endquote_count++; + results.endquote_count++; } if (strlen(aline)>2 && lastlen>2 && lastlen2 && lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE) - shortline++; + results.shortline++; if (*aline && (unsigned char)aline[strlen(aline)-1]<=CHAR_SPACE) cnt_spacend++; if (strstr(aline,".,")) - dotcomma++; + results.dotcomma++; /* only count ast lines for ignoring purposes where there is */ /* locase text on the line */ if (strstr(aline,"*")) @@ -690,150 +638,219 @@ if (*s>='a' && *s<='z') break; if (*s) - astline++; + results.astline++; } if (strstr(aline,"/")) - fslashline++; + results.fslashline++; for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--) ; if (aline[i]=='-' && aline[i-1]!='-') - hyphens++; + results.hyphens++; if (llen>LONGEST_PG_LINE) - longline++; + results.longline++; if (llen>WAY_TOO_LONG) - verylongline++; + results.verylongline++; if (strstr(aline,"<") && strstr(aline,">")) { i=(signed int)(strstr(aline,">")-strstr(aline,"<")+1); if (i>0) - htmcount++; + results.htmcount++; if (strstr(aline,"")) - htmcount+=4; /* bonus marks! */ + results.htmcount+=4; /* bonus marks! */ } /* Check for spaced em-dashes */ if (strstr(aline,"--")) { - emdash++; + results.emdash++; if (*(strstr(aline,"--")-1)==CHAR_SPACE || (*(strstr(aline,"--")+2)==CHAR_SPACE)) - space_emdash++; + results.space_emdash++; if (*(strstr(aline,"--")-1)==CHAR_SPACE && (*(strstr(aline,"--")+2)==CHAR_SPACE)) /* count of em-dashes with spaces both sides */ - non_PG_space_emdash++; + results.non_PG_space_emdash++; if (*(strstr(aline,"--")-1)!=CHAR_SPACE && (*(strstr(aline,"--")+2)!=CHAR_SPACE)) /* count of PG-type em-dashes with no spaces */ - PG_space_emdash++; + results.PG_space_emdash++; } for (s=aline;*s;) { s=getaword(s,inword); if (!strcmp(inword,"hij") || !strcmp(inword,"niet")) - Dutchcount++; + results.Dutchcount++; if (!strcmp(inword,"dans") || !strcmp(inword,"avec")) - Frenchcount++; + results.Frenchcount++; if (!strcmp(inword,"0") || !strcmp(inword,"1")) - standalone_digit++; + results.standalone_digit++; } /* Check for spaced dashes */ if (strstr(aline," -") && *(strstr(aline," -")+2)!='-') - spacedash++; + results.spacedash++; lastblen=lastlen; lastlen=strlen(aline); laststart=aline[0]; } + return &results; +} + +/* + * procfile: + * + * Process one file. + */ +void procfile(char *filename) +{ + char *s,*t,*s1,laststart,*wordstart; + char inword[MAXWORDLEN],testword[MAXWORDLEN]; + char parastart[81]; /* first line of current para */ + FILE *infile; + struct first_pass_results *first_pass_results; + long quot,squot,start_para_line; + signed int i,j,llen,isemptyline,isacro,isellipsis,istypo,alower, + eNon_A,eTab,eTilde,eAst,eFSlash,eCarat; + signed int warn_short,warn_long,warn_bin,warn_dash,warn_dotcomma, + warn_ast,warn_fslash,warn_digit,warn_hyphen,warn_endquote; + unsigned int lastlen,lastblen; + signed int s_brack,c_brack,r_brack,c_unders; + signed int open_single_quote,close_single_quote,guessquote,dquotepar, + squotepar; + signed int isnewpara,vowel,consonant; + char dquote_err[80],squote_err[80],rbrack_err[80],sbrack_err[80], + cbrack_err[80],unders_err[80]; + signed int qword_index,qperiod_index,isdup; + signed int enddash; + signed int isDutch,isFrench; + laststart=CHAR_SPACE; + lastlen=lastblen=0; + *dquote_err=*squote_err=*rbrack_err=*cbrack_err=*sbrack_err= + *unders_err=*prevline=0; + linecnt=checked_linecnt=start_para_line=0; + quot=squot=s_brack=c_brack=r_brack=c_unders=0; + i=llen=isemptyline=isacro=isellipsis=istypo=0; + warn_short=warn_long=warn_bin=warn_dash=warn_dotcomma= + warn_ast=warn_fslash=warn_digit=warn_endquote=0; + isnewpara=vowel=consonant=enddash=0; + qword_index=qperiod_index=isdup=0; + *inword=*testword=0; + open_single_quote=close_single_quote=guessquote=dquotepar=squotepar=0; + isDutch=isFrench=0; + for (j=0;j0) printf(" --> %ld lines in this file have white space at end\n", cnt_spacend); warn_dotcomma=1; - if (dotcomma>5) + if (first_pass_results->dotcomma>5) { warn_dotcomma=0; printf(" --> %ld lines in this file contain '.,'. " - "Not reporting them.\n",dotcomma); + "Not reporting them.\n",first_pass_results->dotcomma); } /* if more than 50 lines, or one-tenth, are short, * don't bother reporting them */ warn_short=1; - if (shortline>50 || shortline*10>linecnt) + if (first_pass_results->shortline>50 || + first_pass_results->shortline*10>linecnt) { warn_short=0; printf(" --> %ld lines in this file are short. " - "Not reporting short lines.\n",shortline); + "Not reporting short lines.\n",first_pass_results->shortline); } /* * If more than 50 lines, or one-tenth, are long, * don't bother reporting them. */ warn_long=1; - if (longline>50 || longline*10>linecnt) + if (first_pass_results->longline>50 || + first_pass_results->longline*10>linecnt) { warn_long=0; printf(" --> %ld lines in this file are long. " - "Not reporting long lines.\n",longline); + "Not reporting long lines.\n",first_pass_results->longline); } /* If more than 10 lines contain asterisks, don't bother reporting them. */ warn_ast=1; - if (astline>10) + if (first_pass_results->astline>10) { warn_ast=0; printf(" --> %ld lines in this file contain asterisks. " - "Not reporting them.\n",astline); + "Not reporting them.\n",first_pass_results->astline); } /* * If more than 10 lines contain forward slashes, * don't bother reporting them. */ warn_fslash=1; - if (fslashline>10) + if (first_pass_results->fslashline>10) { warn_fslash=0; printf(" --> %ld lines in this file contain forward slashes. " - "Not reporting them.\n",fslashline); + "Not reporting them.\n",first_pass_results->fslashline); } /* * If more than 20 lines contain unpunctuated endquotes, * don't bother reporting them. */ warn_endquote=1; - if (endquote_count>20) + if (first_pass_results->endquote_count>20) { warn_endquote=0; printf(" --> %ld lines in this file contain unpunctuated endquotes. " - "Not reporting them.\n",endquote_count); + "Not reporting them.\n",first_pass_results->endquote_count); } /* * If more than 15 lines contain standalone digits, * don't bother reporting them. */ warn_digit=1; - if (standalone_digit>10) + if (first_pass_results->standalone_digit>10) { warn_digit=0; printf(" --> %ld lines in this file contain standalone 0s and 1s. " - "Not reporting them.\n",standalone_digit); + "Not reporting them.\n",first_pass_results->standalone_digit); } /* * If more than 20 lines contain hyphens at end, * don't bother reporting them. */ warn_hyphen=1; - if (hyphens>20) + if (first_pass_results->hyphens>20) { warn_hyphen=0; printf(" --> %ld lines in this file have hyphens at end. " - "Not reporting them.\n",hyphens); + "Not reporting them.\n",first_pass_results->hyphens); } - if (htmcount>20 && !pswit[MARKUP_SWITCH]) + if (first_pass_results->htmcount>20 && !pswit[MARKUP_SWITCH]) { printf(" --> Looks like this is HTML. Switching HTML mode ON.\n"); pswit[MARKUP_SWITCH]=1; } - if (verylongline>0) - printf(" --> %ld lines in this file are VERY long!\n",verylongline); + if (first_pass_results->verylongline>0) + printf(" --> %ld lines in this file are VERY long!\n", + first_pass_results->verylongline); /* * If there are more non-PG spaced dashes than PG em-dashes, * assume it's deliberate. @@ -841,53 +858,56 @@ * and some people insist on them whatever the guidelines say. */ warn_dash=1; - if (spacedash+non_PG_space_emdash>PG_space_emdash) + if (first_pass_results->spacedash+first_pass_results->non_PG_space_emdash> + first_pass_results->PG_space_emdash) { warn_dash=0; printf(" --> There are %ld spaced dashes and em-dashes. " - "Not reporting them.\n",spacedash+non_PG_space_emdash); + "Not reporting them.\n",first_pass_results->spacedash+ + first_pass_results->non_PG_space_emdash); } /* If more than a quarter of characters are hi-bit, bug out. */ warn_bin=1; - if (binlen*4>totlen) + if (first_pass_results->binlen*4>first_pass_results->totlen) { printf(" --> This file does not appear to be ASCII. " "Terminating. Best of luck with it!\n"); exit(1); } - if (alphalen*4alphalen*4totlen) { printf(" --> This file does not appear to be text. " "Terminating. Best of luck with it!\n"); exit(1); } - if (binlen*100>totlen || binlen>100) + if (first_pass_results->binlen*100>first_pass_results->totlen || + first_pass_results->binlen>100) { printf(" --> There are a lot of foreign letters here. " "Not reporting them.\n"); warn_bin=0; } isDutch=0; - if (Dutchcount>50) + if (first_pass_results->Dutchcount>50) { isDutch=1; printf(" --> This looks like Dutch - " "switching off dashes and warnings for 's Middags case.\n"); } isFrench=0; - if (Frenchcount>50) + if (first_pass_results->Frenchcount>50) { isFrench=1; printf(" --> This looks like French - " "switching off some doublepunct.\n"); } - if (firstline && footerline) + if (first_pass_results->firstline && first_pass_results->footerline) printf(" The PG header and footer appear to be already on.\n"); else { - if (firstline) + if (first_pass_results->firstline) printf(" The PG header is on - no footer.\n"); - if (footerline) + if (first_pass_results->footerline) printf(" The PG footer is on - no header.\n"); } printf("\n"); @@ -916,14 +936,15 @@ fprintf(stderr,"bookloupe: cannot open %s\n",filename); exit(1); } - if (footerline>0 && firstline>0 && footerline>firstline && - footerline-firstline<100) + if (first_pass_results->footerline>0 && first_pass_results->firstline>0 && + first_pass_results->footerline>first_pass_results->firstline && + first_pass_results->footerline-first_pass_results->firstline<100) { printf(" --> I don't really know where this text starts. \n"); printf(" There are no reference points.\n"); printf(" I'm going to have to report the header and footer " "as well.\n"); - firstline=0; + first_pass_results->firstline=0; } /* * Here we go with the main pass. Hold onto yer hat! @@ -939,7 +960,9 @@ isnewpara=1; if (pswit[DP_SWITCH] && !strncmp(aline,"-----File: ",11)) continue; // skip DP page separators completely - if (linecnt0 && linecnt>footerline)) + if (linecntfirstline || + (first_pass_results->footerline>0 && + linecnt>first_pass_results->footerline)) { if (pswit[HEADER_SWITCH]) {