# HG changeset patch # User ali # Date 1369583646 -3600 # Node ID 6b786cc05b3cb4a3a678e44112e0b966da590c70 # Parent 23b2ea51b029a4c26908ec0c2b048b274b44952f Break check_for_typos() out diff -r 23b2ea51b029 -r 6b786cc05b3c bookloupe/bookloupe.c --- a/bookloupe/bookloupe.c Sun May 26 16:39:48 2013 +0100 +++ b/bookloupe/bookloupe.c Sun May 26 16:54:06 2013 +0100 @@ -1570,14 +1570,225 @@ } /* + * check_for_typos: + * + * Check for commonly mistyped words, + * and digits like 0 for O in a word. + */ +void check_for_typos(const char *aline,struct warnings *warnings) +{ + const char *s,*wordstart; + char inword[MAXWORDLEN],testword[MAXWORDLEN]; + int i,istypo,isdup,alower,vowel,consonant; + static int qword_index=0; + for (s=aline;*s;) + { + wordstart=s; + s=getaword(s,inword); + if (!*inword) + continue; /* don't bother with empty lines */ + if (mixdigit(inword)) + { + if (pswit[ECHO_SWITCH]) + printf("\n%s\n",aline); + if (!pswit[OVERVIEW_SWITCH]) + printf(" Line %ld column %d - Query digit in %s\n", + linecnt,(int)(wordstart-aline)+1,inword); + else + cnt_word++; + } + /* + * Put the word through a series of tests for likely typos and OCR + * errors. + */ + if (pswit[TYPO_SWITCH]) + { + istypo=0; + strcpy(testword,inword); + alower=0; + for (i=0;i<(signed int)strlen(testword);i++) + { + /* lowercase for testing */ + if (testword[i]>='a' && testword[i]<='z') + alower=1; + if (alower && testword[i]>='A' && testword[i]<='Z') + { + /* + * We have an uppercase mid-word. However, there are + * common cases: + * Mac and Mc like McGill + * French contractions like l'Abbe + */ + if (i==2 && testword[0]=='m' && testword[1]=='c' || + i==3 && testword[0]=='m' && testword[1]=='a' && + testword[2]=='c' || i>0 && testword[i-1]==CHAR_SQUOTE) + ; /* do nothing! */ + else + istypo=1; + } + testword[i]=(char)tolower(testword[i]); + } + /* + * Check for certain unlikely two-letter combinations at word + * start and end. + */ + if (strlen(testword)>1) + { + for (i=0;*nostart[i];i++) + if (!strncmp(testword,nostart[i],2)) + istypo=1; + for (i=0;*noend[i];i++) + if (!strncmp(testword+strlen(testword)-2,noend[i],2)) + istypo=1; + } + /* ght is common, gbt never. Like that. */ + if (strstr(testword,"cb")) + istypo=1; + if (strstr(testword,"gbt")) + istypo=1; + if (strstr(testword,"pbt")) + istypo=1; + if (strstr(testword,"tbs")) + istypo=1; + if (strstr(testword,"mrn")) + istypo=1; + if (strstr(testword,"ahle")) + istypo=1; + if (strstr(testword,"ihle")) + istypo=1; + /* + * "TBE" does happen - like HEARTBEAT - but uncommon. + * Also "TBI" - frostbite, outbid - but uncommon. + * Similarly "ii" like Hawaii, or Pompeii, and in Roman + * numerals, but "ii" is a common scanno. + */ + if (strstr(testword,"tbi")) + istypo=1; + if (strstr(testword,"tbe")) + istypo=1; + if (strstr(testword,"ii")) + istypo=1; + /* + * Check for no vowels or no consonants. + * If none, flag a typo. + */ + if (!istypo && strlen(testword)>1) + { + vowel=consonant=0; + for (i=0;testword[i];i++) + { + if (testword[i]=='y' || gcisdigit(testword[i])) + { + /* Yah, this is loose. */ + vowel++; + consonant++; + } + else if (strchr(vowels,testword[i])) + vowel++; + else + consonant++; + } + if (!vowel || !consonant) + istypo=1; + } + /* + * Now exclude the word from being reported if it's in + * the okword list. + */ + for (i=0;*okword[i];i++) + if (!strcmp(testword,okword[i])) + istypo=0; + /* + * What looks like a typo may be a Roman numeral. + * Exclude these. + */ + if (istypo && isroman(testword)) + istypo=0; + /* Check the manual list of typos. */ + if (!istypo) + for (i=0;*typo[i];i++) + if (!strcmp(testword,typo[i])) + istypo=1; + /* + * Check lowercase s, l, i and m - special cases. + * "j" - often a semi-colon gone wrong. + * "d" for a missing apostrophe - he d + * "n" for "in" + */ + if (!istypo && strlen(testword)==1 && strchr("slmijdn",*inword)) + istypo=1; + if (istypo) + { + isdup=0; + if (strlen(testword)digit) + { + /* In paranoid mode, query all 0 and 1 standing alone. */ + if (!strcmp(inword,"0") || !strcmp(inword,"1")) + { + if (pswit[ECHO_SWITCH]) + printf("\n%s\n",aline); + if (!pswit[OVERVIEW_SWITCH]) + printf(" Line %ld column %d - Query standalone %s\n", + linecnt,(int)(wordstart-aline)+2,inword); + else + cnt_word++; + } + } + } +} + +/* * procfile: * * Process one file. */ void procfile(char *filename) { - const char *s,*t,*wordstart; - char inword[MAXWORDLEN],testword[MAXWORDLEN]; + const char *s,*t; char parastart[81]; /* first line of current para */ FILE *infile; struct first_pass_results *first_pass_results; @@ -1586,12 +1797,11 @@ struct line_properties last={0}; int isemptyline; long squot,start_para_line; - signed int i,llen,isacro,isellipsis,istypo,alower; + signed int i,llen,isacro,isellipsis; signed int dquotepar,squotepar; - signed int isnewpara,vowel,consonant; + signed int isnewpara; char dquote_err[80],squote_err[80],rbrack_err[80],sbrack_err[80], cbrack_err[80],unders_err[80]; - signed int qword_index,isdup; signed int enddash; last.start=CHAR_SPACE; *dquote_err=*squote_err=*rbrack_err=*cbrack_err=*sbrack_err= @@ -1599,9 +1809,7 @@ linecnt=checked_linecnt=start_para_line=0; squot=0; i=llen=isacro=isellipsis=0; - isnewpara=vowel=consonant=enddash=0; - qword_index=0; - *inword=*testword=0; + isnewpara=enddash=0; dquotepar=squotepar=0; infile=fopen(filename,"rb"); if (!infile) @@ -1813,208 +2021,7 @@ check_for_pling_scanno(aline); check_for_extra_period(aline,warnings); check_for_following_punctuation(aline); - /* - * Check for commonly mistyped words, - * and digits like 0 for O in a word. - */ - for (s=aline;*s;) - { - wordstart=s; - s=getaword(s,inword); - if (!*inword) - continue; /* don't bother with empty lines */ - if (mixdigit(inword)) - { - if (pswit[ECHO_SWITCH]) - printf("\n%s\n",aline); - if (!pswit[OVERVIEW_SWITCH]) - printf(" Line %ld column %d - Query digit in %s\n", - linecnt,(int)(wordstart-aline)+1,inword); - else - cnt_word++; - } - /* - * Put the word through a series of tests for likely typos and OCR - * errors. - */ - if (pswit[TYPO_SWITCH]) - { - istypo=0; - strcpy(testword,inword); - alower=0; - for (i=0;i<(signed int)strlen(testword);i++) - { - /* lowercase for testing */ - if (testword[i]>='a' && testword[i]<='z') - alower=1; - if (alower && testword[i]>='A' && testword[i]<='Z') - { - /* - * We have an uppercase mid-word. However, there are - * common cases: - * Mac and Mc like McGill - * French contractions like l'Abbe - */ - if (i==2 && testword[0]=='m' && testword[1]=='c' || - i==3 && testword[0]=='m' && testword[1]=='a' && - testword[2]=='c' || i>0 && testword[i-1]==CHAR_SQUOTE) - ; /* do nothing! */ - else - istypo=1; - } - testword[i]=(char)tolower(testword[i]); - } - /* - * Check for certain unlikely two-letter combinations at word - * start and end. - */ - if (strlen(testword)>1) - { - for (i=0;*nostart[i];i++) - if (!strncmp(testword,nostart[i],2)) - istypo=1; - for (i=0;*noend[i];i++) - if (!strncmp(testword+strlen(testword)-2,noend[i],2)) - istypo=1; - } - /* ght is common, gbt never. Like that. */ - if (strstr(testword,"cb")) - istypo=1; - if (strstr(testword,"gbt")) - istypo=1; - if (strstr(testword,"pbt")) - istypo=1; - if (strstr(testword,"tbs")) - istypo=1; - if (strstr(testword,"mrn")) - istypo=1; - if (strstr(testword,"ahle")) - istypo=1; - if (strstr(testword,"ihle")) - istypo=1; - /* - * "TBE" does happen - like HEARTBEAT - but uncommon. - * Also "TBI" - frostbite, outbid - but uncommon. - * Similarly "ii" like Hawaii, or Pompeii, and in Roman - * numerals, but "ii" is a common scanno. - */ - if (strstr(testword,"tbi")) - istypo=1; - if (strstr(testword,"tbe")) - istypo=1; - if (strstr(testword,"ii")) - istypo=1; - /* - * Check for no vowels or no consonants. - * If none, flag a typo. - */ - if (!istypo && strlen(testword)>1) - { - vowel=consonant=0; - for (i=0;testword[i];i++) - { - if (testword[i]=='y' || gcisdigit(testword[i])) - { - /* Yah, this is loose. */ - vowel++; - consonant++; - } - else if (strchr(vowels,testword[i])) - vowel++; - else - consonant++; - } - if (!vowel || !consonant) - istypo=1; - } - /* - * Now exclude the word from being reported if it's in - * the okword list. - */ - for (i=0;*okword[i];i++) - if (!strcmp(testword,okword[i])) - istypo=0; - /* - * What looks like a typo may be a Roman numeral. - * Exclude these. - */ - if (istypo && isroman(testword)) - istypo=0; - /* Check the manual list of typos. */ - if (!istypo) - for (i=0;*typo[i];i++) - if (!strcmp(testword,typo[i])) - istypo=1; - /* - * Check lowercase s, l, i and m - special cases. - * "j" - often a semi-colon gone wrong. - * "d" for a missing apostrophe - he d - * "n" for "in" - */ - if (!istypo && strlen(testword)==1 && strchr("slmijdn",*inword)) - istypo=1; - if (istypo) - { - isdup=0; - if (strlen(testword)digit) - { - /* In paranoid mode, query all 0 and 1 standing alone. */ - if (!strcmp(inword,"0") || !strcmp(inword,"1")) - { - if (pswit[ECHO_SWITCH]) - printf("\n%s\n",aline); - if (!pswit[OVERVIEW_SWITCH]) - printf(" Line %ld column %d - Query standalone %s\n", - linecnt,(int)(wordstart-aline)+2,inword); - else - cnt_word++; - } - } - } + check_for_typos(aline,warnings); /* * Look for added or missing spaces around punctuation and quotes. * If there is a punctuation character like ! with no space on