1.1 --- a/bookloupe/bookloupe.c Sun May 26 16:39:48 2013 +0100
1.2 +++ b/bookloupe/bookloupe.c Sun May 26 16:54:06 2013 +0100
1.3 @@ -1570,14 +1570,225 @@
1.4 }
1.5
1.6 /*
1.7 + * check_for_typos:
1.8 + *
1.9 + * Check for commonly mistyped words,
1.10 + * and digits like 0 for O in a word.
1.11 + */
1.12 +void check_for_typos(const char *aline,struct warnings *warnings)
1.13 +{
1.14 + const char *s,*wordstart;
1.15 + char inword[MAXWORDLEN],testword[MAXWORDLEN];
1.16 + int i,istypo,isdup,alower,vowel,consonant;
1.17 + static int qword_index=0;
1.18 + for (s=aline;*s;)
1.19 + {
1.20 + wordstart=s;
1.21 + s=getaword(s,inword);
1.22 + if (!*inword)
1.23 + continue; /* don't bother with empty lines */
1.24 + if (mixdigit(inword))
1.25 + {
1.26 + if (pswit[ECHO_SWITCH])
1.27 + printf("\n%s\n",aline);
1.28 + if (!pswit[OVERVIEW_SWITCH])
1.29 + printf(" Line %ld column %d - Query digit in %s\n",
1.30 + linecnt,(int)(wordstart-aline)+1,inword);
1.31 + else
1.32 + cnt_word++;
1.33 + }
1.34 + /*
1.35 + * Put the word through a series of tests for likely typos and OCR
1.36 + * errors.
1.37 + */
1.38 + if (pswit[TYPO_SWITCH])
1.39 + {
1.40 + istypo=0;
1.41 + strcpy(testword,inword);
1.42 + alower=0;
1.43 + for (i=0;i<(signed int)strlen(testword);i++)
1.44 + {
1.45 + /* lowercase for testing */
1.46 + if (testword[i]>='a' && testword[i]<='z')
1.47 + alower=1;
1.48 + if (alower && testword[i]>='A' && testword[i]<='Z')
1.49 + {
1.50 + /*
1.51 + * We have an uppercase mid-word. However, there are
1.52 + * common cases:
1.53 + * Mac and Mc like McGill
1.54 + * French contractions like l'Abbe
1.55 + */
1.56 + if (i==2 && testword[0]=='m' && testword[1]=='c' ||
1.57 + i==3 && testword[0]=='m' && testword[1]=='a' &&
1.58 + testword[2]=='c' || i>0 && testword[i-1]==CHAR_SQUOTE)
1.59 + ; /* do nothing! */
1.60 + else
1.61 + istypo=1;
1.62 + }
1.63 + testword[i]=(char)tolower(testword[i]);
1.64 + }
1.65 + /*
1.66 + * Check for certain unlikely two-letter combinations at word
1.67 + * start and end.
1.68 + */
1.69 + if (strlen(testword)>1)
1.70 + {
1.71 + for (i=0;*nostart[i];i++)
1.72 + if (!strncmp(testword,nostart[i],2))
1.73 + istypo=1;
1.74 + for (i=0;*noend[i];i++)
1.75 + if (!strncmp(testword+strlen(testword)-2,noend[i],2))
1.76 + istypo=1;
1.77 + }
1.78 + /* ght is common, gbt never. Like that. */
1.79 + if (strstr(testword,"cb"))
1.80 + istypo=1;
1.81 + if (strstr(testword,"gbt"))
1.82 + istypo=1;
1.83 + if (strstr(testword,"pbt"))
1.84 + istypo=1;
1.85 + if (strstr(testword,"tbs"))
1.86 + istypo=1;
1.87 + if (strstr(testword,"mrn"))
1.88 + istypo=1;
1.89 + if (strstr(testword,"ahle"))
1.90 + istypo=1;
1.91 + if (strstr(testword,"ihle"))
1.92 + istypo=1;
1.93 + /*
1.94 + * "TBE" does happen - like HEARTBEAT - but uncommon.
1.95 + * Also "TBI" - frostbite, outbid - but uncommon.
1.96 + * Similarly "ii" like Hawaii, or Pompeii, and in Roman
1.97 + * numerals, but "ii" is a common scanno.
1.98 + */
1.99 + if (strstr(testword,"tbi"))
1.100 + istypo=1;
1.101 + if (strstr(testword,"tbe"))
1.102 + istypo=1;
1.103 + if (strstr(testword,"ii"))
1.104 + istypo=1;
1.105 + /*
1.106 + * Check for no vowels or no consonants.
1.107 + * If none, flag a typo.
1.108 + */
1.109 + if (!istypo && strlen(testword)>1)
1.110 + {
1.111 + vowel=consonant=0;
1.112 + for (i=0;testword[i];i++)
1.113 + {
1.114 + if (testword[i]=='y' || gcisdigit(testword[i]))
1.115 + {
1.116 + /* Yah, this is loose. */
1.117 + vowel++;
1.118 + consonant++;
1.119 + }
1.120 + else if (strchr(vowels,testword[i]))
1.121 + vowel++;
1.122 + else
1.123 + consonant++;
1.124 + }
1.125 + if (!vowel || !consonant)
1.126 + istypo=1;
1.127 + }
1.128 + /*
1.129 + * Now exclude the word from being reported if it's in
1.130 + * the okword list.
1.131 + */
1.132 + for (i=0;*okword[i];i++)
1.133 + if (!strcmp(testword,okword[i]))
1.134 + istypo=0;
1.135 + /*
1.136 + * What looks like a typo may be a Roman numeral.
1.137 + * Exclude these.
1.138 + */
1.139 + if (istypo && isroman(testword))
1.140 + istypo=0;
1.141 + /* Check the manual list of typos. */
1.142 + if (!istypo)
1.143 + for (i=0;*typo[i];i++)
1.144 + if (!strcmp(testword,typo[i]))
1.145 + istypo=1;
1.146 + /*
1.147 + * Check lowercase s, l, i and m - special cases.
1.148 + * "j" - often a semi-colon gone wrong.
1.149 + * "d" for a missing apostrophe - he d
1.150 + * "n" for "in"
1.151 + */
1.152 + if (!istypo && strlen(testword)==1 && strchr("slmijdn",*inword))
1.153 + istypo=1;
1.154 + if (istypo)
1.155 + {
1.156 + isdup=0;
1.157 + if (strlen(testword)<MAX_QWORD_LENGTH &&
1.158 + !pswit[VERBOSE_SWITCH])
1.159 + for (i=0;i<qword_index;i++)
1.160 + if (!strcmp(testword,qword[i]))
1.161 + {
1.162 + isdup=1;
1.163 + ++dupcnt[i];
1.164 + }
1.165 + if (!isdup)
1.166 + {
1.167 + if (qword_index<MAX_QWORD &&
1.168 + strlen(testword)<MAX_QWORD_LENGTH)
1.169 + {
1.170 + strcpy(qword[qword_index],testword);
1.171 + qword_index++;
1.172 + }
1.173 + if (pswit[ECHO_SWITCH])
1.174 + printf("\n%s\n",aline);
1.175 + if (!pswit[OVERVIEW_SWITCH])
1.176 + {
1.177 + printf(" Line %ld column %d - Query word %s",
1.178 + linecnt,(int)(wordstart-aline)+1,inword);
1.179 + if (strlen(testword)<MAX_QWORD_LENGTH &&
1.180 + !pswit[VERBOSE_SWITCH])
1.181 + printf(" - not reporting duplicates");
1.182 + printf("\n");
1.183 + }
1.184 + else
1.185 + cnt_word++;
1.186 + }
1.187 + }
1.188 + }
1.189 + /* check the user's list of typos */
1.190 + if (!istypo && usertypo_count)
1.191 + for (i=0;i<usertypo_count;i++)
1.192 + if (!strcmp(testword,usertypo[i]))
1.193 + {
1.194 + if (pswit[ECHO_SWITCH])
1.195 + printf("\n%s\n",aline);
1.196 + if (!pswit[OVERVIEW_SWITCH])
1.197 + printf(" Line %ld column %d - "
1.198 + "Query possible scanno %s\n",
1.199 + linecnt,(int)(wordstart-aline)+2,inword);
1.200 + }
1.201 + if (pswit[PARANOID_SWITCH] && warnings->digit)
1.202 + {
1.203 + /* In paranoid mode, query all 0 and 1 standing alone. */
1.204 + if (!strcmp(inword,"0") || !strcmp(inword,"1"))
1.205 + {
1.206 + if (pswit[ECHO_SWITCH])
1.207 + printf("\n%s\n",aline);
1.208 + if (!pswit[OVERVIEW_SWITCH])
1.209 + printf(" Line %ld column %d - Query standalone %s\n",
1.210 + linecnt,(int)(wordstart-aline)+2,inword);
1.211 + else
1.212 + cnt_word++;
1.213 + }
1.214 + }
1.215 + }
1.216 +}
1.217 +
1.218 +/*
1.219 * procfile:
1.220 *
1.221 * Process one file.
1.222 */
1.223 void procfile(char *filename)
1.224 {
1.225 - const char *s,*t,*wordstart;
1.226 - char inword[MAXWORDLEN],testword[MAXWORDLEN];
1.227 + const char *s,*t;
1.228 char parastart[81]; /* first line of current para */
1.229 FILE *infile;
1.230 struct first_pass_results *first_pass_results;
1.231 @@ -1586,12 +1797,11 @@
1.232 struct line_properties last={0};
1.233 int isemptyline;
1.234 long squot,start_para_line;
1.235 - signed int i,llen,isacro,isellipsis,istypo,alower;
1.236 + signed int i,llen,isacro,isellipsis;
1.237 signed int dquotepar,squotepar;
1.238 - signed int isnewpara,vowel,consonant;
1.239 + signed int isnewpara;
1.240 char dquote_err[80],squote_err[80],rbrack_err[80],sbrack_err[80],
1.241 cbrack_err[80],unders_err[80];
1.242 - signed int qword_index,isdup;
1.243 signed int enddash;
1.244 last.start=CHAR_SPACE;
1.245 *dquote_err=*squote_err=*rbrack_err=*cbrack_err=*sbrack_err=
1.246 @@ -1599,9 +1809,7 @@
1.247 linecnt=checked_linecnt=start_para_line=0;
1.248 squot=0;
1.249 i=llen=isacro=isellipsis=0;
1.250 - isnewpara=vowel=consonant=enddash=0;
1.251 - qword_index=0;
1.252 - *inword=*testword=0;
1.253 + isnewpara=enddash=0;
1.254 dquotepar=squotepar=0;
1.255 infile=fopen(filename,"rb");
1.256 if (!infile)
1.257 @@ -1813,208 +2021,7 @@
1.258 check_for_pling_scanno(aline);
1.259 check_for_extra_period(aline,warnings);
1.260 check_for_following_punctuation(aline);
1.261 - /*
1.262 - * Check for commonly mistyped words,
1.263 - * and digits like 0 for O in a word.
1.264 - */
1.265 - for (s=aline;*s;)
1.266 - {
1.267 - wordstart=s;
1.268 - s=getaword(s,inword);
1.269 - if (!*inword)
1.270 - continue; /* don't bother with empty lines */
1.271 - if (mixdigit(inword))
1.272 - {
1.273 - if (pswit[ECHO_SWITCH])
1.274 - printf("\n%s\n",aline);
1.275 - if (!pswit[OVERVIEW_SWITCH])
1.276 - printf(" Line %ld column %d - Query digit in %s\n",
1.277 - linecnt,(int)(wordstart-aline)+1,inword);
1.278 - else
1.279 - cnt_word++;
1.280 - }
1.281 - /*
1.282 - * Put the word through a series of tests for likely typos and OCR
1.283 - * errors.
1.284 - */
1.285 - if (pswit[TYPO_SWITCH])
1.286 - {
1.287 - istypo=0;
1.288 - strcpy(testword,inword);
1.289 - alower=0;
1.290 - for (i=0;i<(signed int)strlen(testword);i++)
1.291 - {
1.292 - /* lowercase for testing */
1.293 - if (testword[i]>='a' && testword[i]<='z')
1.294 - alower=1;
1.295 - if (alower && testword[i]>='A' && testword[i]<='Z')
1.296 - {
1.297 - /*
1.298 - * We have an uppercase mid-word. However, there are
1.299 - * common cases:
1.300 - * Mac and Mc like McGill
1.301 - * French contractions like l'Abbe
1.302 - */
1.303 - if (i==2 && testword[0]=='m' && testword[1]=='c' ||
1.304 - i==3 && testword[0]=='m' && testword[1]=='a' &&
1.305 - testword[2]=='c' || i>0 && testword[i-1]==CHAR_SQUOTE)
1.306 - ; /* do nothing! */
1.307 - else
1.308 - istypo=1;
1.309 - }
1.310 - testword[i]=(char)tolower(testword[i]);
1.311 - }
1.312 - /*
1.313 - * Check for certain unlikely two-letter combinations at word
1.314 - * start and end.
1.315 - */
1.316 - if (strlen(testword)>1)
1.317 - {
1.318 - for (i=0;*nostart[i];i++)
1.319 - if (!strncmp(testword,nostart[i],2))
1.320 - istypo=1;
1.321 - for (i=0;*noend[i];i++)
1.322 - if (!strncmp(testword+strlen(testword)-2,noend[i],2))
1.323 - istypo=1;
1.324 - }
1.325 - /* ght is common, gbt never. Like that. */
1.326 - if (strstr(testword,"cb"))
1.327 - istypo=1;
1.328 - if (strstr(testword,"gbt"))
1.329 - istypo=1;
1.330 - if (strstr(testword,"pbt"))
1.331 - istypo=1;
1.332 - if (strstr(testword,"tbs"))
1.333 - istypo=1;
1.334 - if (strstr(testword,"mrn"))
1.335 - istypo=1;
1.336 - if (strstr(testword,"ahle"))
1.337 - istypo=1;
1.338 - if (strstr(testword,"ihle"))
1.339 - istypo=1;
1.340 - /*
1.341 - * "TBE" does happen - like HEARTBEAT - but uncommon.
1.342 - * Also "TBI" - frostbite, outbid - but uncommon.
1.343 - * Similarly "ii" like Hawaii, or Pompeii, and in Roman
1.344 - * numerals, but "ii" is a common scanno.
1.345 - */
1.346 - if (strstr(testword,"tbi"))
1.347 - istypo=1;
1.348 - if (strstr(testword,"tbe"))
1.349 - istypo=1;
1.350 - if (strstr(testword,"ii"))
1.351 - istypo=1;
1.352 - /*
1.353 - * Check for no vowels or no consonants.
1.354 - * If none, flag a typo.
1.355 - */
1.356 - if (!istypo && strlen(testword)>1)
1.357 - {
1.358 - vowel=consonant=0;
1.359 - for (i=0;testword[i];i++)
1.360 - {
1.361 - if (testword[i]=='y' || gcisdigit(testword[i]))
1.362 - {
1.363 - /* Yah, this is loose. */
1.364 - vowel++;
1.365 - consonant++;
1.366 - }
1.367 - else if (strchr(vowels,testword[i]))
1.368 - vowel++;
1.369 - else
1.370 - consonant++;
1.371 - }
1.372 - if (!vowel || !consonant)
1.373 - istypo=1;
1.374 - }
1.375 - /*
1.376 - * Now exclude the word from being reported if it's in
1.377 - * the okword list.
1.378 - */
1.379 - for (i=0;*okword[i];i++)
1.380 - if (!strcmp(testword,okword[i]))
1.381 - istypo=0;
1.382 - /*
1.383 - * What looks like a typo may be a Roman numeral.
1.384 - * Exclude these.
1.385 - */
1.386 - if (istypo && isroman(testword))
1.387 - istypo=0;
1.388 - /* Check the manual list of typos. */
1.389 - if (!istypo)
1.390 - for (i=0;*typo[i];i++)
1.391 - if (!strcmp(testword,typo[i]))
1.392 - istypo=1;
1.393 - /*
1.394 - * Check lowercase s, l, i and m - special cases.
1.395 - * "j" - often a semi-colon gone wrong.
1.396 - * "d" for a missing apostrophe - he d
1.397 - * "n" for "in"
1.398 - */
1.399 - if (!istypo && strlen(testword)==1 && strchr("slmijdn",*inword))
1.400 - istypo=1;
1.401 - if (istypo)
1.402 - {
1.403 - isdup=0;
1.404 - if (strlen(testword)<MAX_QWORD_LENGTH &&
1.405 - !pswit[VERBOSE_SWITCH])
1.406 - for (i=0;i<qword_index;i++)
1.407 - if (!strcmp(testword,qword[i]))
1.408 - {
1.409 - isdup=1;
1.410 - ++dupcnt[i];
1.411 - }
1.412 - if (!isdup)
1.413 - {
1.414 - if (qword_index<MAX_QWORD &&
1.415 - strlen(testword)<MAX_QWORD_LENGTH)
1.416 - {
1.417 - strcpy(qword[qword_index],testword);
1.418 - qword_index++;
1.419 - }
1.420 - if (pswit[ECHO_SWITCH])
1.421 - printf("\n%s\n",aline);
1.422 - if (!pswit[OVERVIEW_SWITCH])
1.423 - {
1.424 - printf(" Line %ld column %d - Query word %s",
1.425 - linecnt,(int)(wordstart-aline)+1,inword);
1.426 - if (strlen(testword)<MAX_QWORD_LENGTH &&
1.427 - !pswit[VERBOSE_SWITCH])
1.428 - printf(" - not reporting duplicates");
1.429 - printf("\n");
1.430 - }
1.431 - else
1.432 - cnt_word++;
1.433 - }
1.434 - }
1.435 - }
1.436 - /* check the user's list of typos */
1.437 - if (!istypo && usertypo_count)
1.438 - for (i=0;i<usertypo_count;i++)
1.439 - if (!strcmp(testword,usertypo[i]))
1.440 - {
1.441 - if (pswit[ECHO_SWITCH])
1.442 - printf("\n%s\n",aline);
1.443 - if (!pswit[OVERVIEW_SWITCH])
1.444 - printf(" Line %ld column %d - "
1.445 - "Query possible scanno %s\n",
1.446 - linecnt,(int)(wordstart-aline)+2,inword);
1.447 - }
1.448 - if (pswit[PARANOID_SWITCH] && warnings->digit)
1.449 - {
1.450 - /* In paranoid mode, query all 0 and 1 standing alone. */
1.451 - if (!strcmp(inword,"0") || !strcmp(inword,"1"))
1.452 - {
1.453 - if (pswit[ECHO_SWITCH])
1.454 - printf("\n%s\n",aline);
1.455 - if (!pswit[OVERVIEW_SWITCH])
1.456 - printf(" Line %ld column %d - Query standalone %s\n",
1.457 - linecnt,(int)(wordstart-aline)+2,inword);
1.458 - else
1.459 - cnt_word++;
1.460 - }
1.461 - }
1.462 - }
1.463 + check_for_typos(aline,warnings);
1.464 /*
1.465 * Look for added or missing spaces around punctuation and quotes.
1.466 * If there is a punctuation character like ! with no space on