Break check_for_typos() out
author ali <ali@juiblex.co.uk>
Sun May 26 16:54:06 2013 +0100 (2013-05-26)
changeset 55 6b786cc05b3c
parent 54 23b2ea51b029
child 56 8ade5460e220
Break check_for_typos() out
bookloupe/bookloupe.c
     1.1 --- a/bookloupe/bookloupe.c	Sun May 26 16:39:48 2013 +0100
     1.2 +++ b/bookloupe/bookloupe.c	Sun May 26 16:54:06 2013 +0100
     1.3 @@ -1570,14 +1570,225 @@
     1.4  }
     1.5  
     1.6  /*
     1.7 + * check_for_typos:
     1.8 + *
     1.9 + * Check for commonly mistyped words,
    1.10 + * and digits like 0 for O in a word.
    1.11 + */
    1.12 +void check_for_typos(const char *aline,struct warnings *warnings)
    1.13 +{
    1.14 +    const char *s,*wordstart;
    1.15 +    char inword[MAXWORDLEN],testword[MAXWORDLEN];
    1.16 +    int i,istypo,isdup,alower,vowel,consonant;
    1.17 +    static int qword_index=0;
    1.18 +    for (s=aline;*s;)
    1.19 +    {
    1.20 +	wordstart=s;
    1.21 +	s=getaword(s,inword);
    1.22 +	if (!*inword)
    1.23 +	    continue; /* don't bother with empty lines */
    1.24 +	if (mixdigit(inword))
    1.25 +	{
    1.26 +	    if (pswit[ECHO_SWITCH])
    1.27 +		printf("\n%s\n",aline);
    1.28 +	    if (!pswit[OVERVIEW_SWITCH])
    1.29 +		printf("    Line %ld column %d - Query digit in %s\n",
    1.30 +		  linecnt,(int)(wordstart-aline)+1,inword);
    1.31 +	    else
    1.32 +		cnt_word++;
    1.33 +	}
    1.34 +	/*
    1.35 +	 * Put the word through a series of tests for likely typos and OCR
    1.36 +	 * errors.
    1.37 +	 */
    1.38 +	if (pswit[TYPO_SWITCH])
    1.39 +	{
    1.40 +	    istypo=0;
    1.41 +	    strcpy(testword,inword);
    1.42 +	    alower=0;
    1.43 +	    for (i=0;i<(signed int)strlen(testword);i++)
    1.44 +	    {
    1.45 +		/* lowercase for testing */
    1.46 +		if (testword[i]>='a' && testword[i]<='z')
    1.47 +		    alower=1;
    1.48 +		if (alower && testword[i]>='A' && testword[i]<='Z')
    1.49 +		{
    1.50 +		    /*
    1.51 +		     * We have an uppercase mid-word. However, there are
    1.52 +		     * common cases:
    1.53 +		     *   Mac and Mc like McGill
    1.54 +		     *   French contractions like l'Abbe
    1.55 +		     */
    1.56 +		    if (i==2 && testword[0]=='m' && testword[1]=='c' ||
    1.57 +		      i==3 && testword[0]=='m' && testword[1]=='a' &&
    1.58 +		      testword[2]=='c' || i>0 && testword[i-1]==CHAR_SQUOTE)
    1.59 +			; /* do nothing! */
    1.60 +		    else
    1.61 +			istypo=1;
    1.62 +		}
    1.63 +		testword[i]=(char)tolower(testword[i]);
    1.64 +	    }
    1.65 +	    /*
    1.66 +	     * Check for certain unlikely two-letter combinations at word
    1.67 +	     * start and end.
    1.68 +	     */
    1.69 +	    if (strlen(testword)>1)
    1.70 +	    {
    1.71 +		for (i=0;*nostart[i];i++)
    1.72 +		    if (!strncmp(testword,nostart[i],2))
    1.73 +			istypo=1;
    1.74 +		for (i=0;*noend[i];i++)
    1.75 +		    if (!strncmp(testword+strlen(testword)-2,noend[i],2))
    1.76 +			istypo=1;
    1.77 +	    }
    1.78 +	    /* ght is common, gbt never. Like that. */
    1.79 +	    if (strstr(testword,"cb"))
    1.80 +		istypo=1;
    1.81 +	    if (strstr(testword,"gbt"))
    1.82 +		istypo=1;
    1.83 +	    if (strstr(testword,"pbt"))
    1.84 +		istypo=1;
    1.85 +	    if (strstr(testword,"tbs"))
    1.86 +		istypo=1;
    1.87 +	    if (strstr(testword,"mrn"))
    1.88 +		istypo=1;
    1.89 +	    if (strstr(testword,"ahle"))
    1.90 +		istypo=1;
    1.91 +	    if (strstr(testword,"ihle"))
    1.92 +		istypo=1;
    1.93 +	    /*
    1.94 +	     * "TBE" does happen - like HEARTBEAT - but uncommon.
    1.95 +	     * Also "TBI" - frostbite, outbid - but uncommon.
    1.96 +	     * Similarly "ii" like Hawaii, or Pompeii, and in Roman
    1.97 +	     * numerals, but "ii" is a common scanno.
    1.98 +	     */
    1.99 +	    if (strstr(testword,"tbi"))
   1.100 +		istypo=1;
   1.101 +	    if (strstr(testword,"tbe"))
   1.102 +		istypo=1;
   1.103 +	    if (strstr(testword,"ii"))
   1.104 +		istypo=1;
   1.105 +	    /*
   1.106 +	     * Check for no vowels or no consonants.
   1.107 +	     * If none, flag a typo.
   1.108 +	     */
   1.109 +	    if (!istypo && strlen(testword)>1)
   1.110 +	    {
   1.111 +		vowel=consonant=0;
   1.112 +		for (i=0;testword[i];i++)
   1.113 +		{
   1.114 +		    if (testword[i]=='y' || gcisdigit(testword[i]))
   1.115 +		    {
   1.116 +			/* Yah, this is loose. */
   1.117 +			vowel++;
   1.118 +			consonant++;
   1.119 +		    }
   1.120 +		    else if (strchr(vowels,testword[i]))
   1.121 +			vowel++;
   1.122 +		    else
   1.123 +			consonant++;
   1.124 +		}
   1.125 +		if (!vowel || !consonant)
   1.126 +		    istypo=1;
   1.127 +	    }
   1.128 +	    /*
   1.129 +	     * Now exclude the word from being reported if it's in
   1.130 +	     * the okword list.
   1.131 +	     */
   1.132 +	    for (i=0;*okword[i];i++)
   1.133 +		if (!strcmp(testword,okword[i]))
   1.134 +		    istypo=0;
   1.135 +	    /*
   1.136 +	     * What looks like a typo may be a Roman numeral.
   1.137 +	     * Exclude these.
   1.138 +	     */
   1.139 +	    if (istypo && isroman(testword))
   1.140 +		istypo=0;
   1.141 +	    /* Check the manual list of typos. */
   1.142 +	    if (!istypo)
   1.143 +		for (i=0;*typo[i];i++)
   1.144 +		    if (!strcmp(testword,typo[i]))
   1.145 +			istypo=1;
   1.146 +	    /*
   1.147 +	     * Check lowercase s, l, i and m - special cases.
   1.148 +	     *   "j" - often a semi-colon gone wrong.
   1.149 +	     *   "d" for a missing apostrophe - he d
   1.150 +	     *   "n" for "in"
   1.151 +	     */
   1.152 +	    if (!istypo && strlen(testword)==1 && strchr("slmijdn",*inword))
   1.153 +		istypo=1;
   1.154 +	    if (istypo)
   1.155 +	    {
   1.156 +		isdup=0;
   1.157 +		if (strlen(testword)<MAX_QWORD_LENGTH &&
   1.158 +		  !pswit[VERBOSE_SWITCH])
   1.159 +		    for (i=0;i<qword_index;i++)
   1.160 +			if (!strcmp(testword,qword[i]))
   1.161 +			{
   1.162 +			    isdup=1;
   1.163 +			    ++dupcnt[i];
   1.164 +			}
   1.165 +		if (!isdup)
   1.166 +		{
   1.167 +		    if (qword_index<MAX_QWORD &&
   1.168 +		      strlen(testword)<MAX_QWORD_LENGTH)
   1.169 +		    {
   1.170 +			strcpy(qword[qword_index],testword);
   1.171 +			qword_index++;
   1.172 +		    }
   1.173 +		    if (pswit[ECHO_SWITCH])
   1.174 +			printf("\n%s\n",aline);
   1.175 +		    if (!pswit[OVERVIEW_SWITCH])
   1.176 +		    {
   1.177 +			printf("    Line %ld column %d - Query word %s",
   1.178 +			  linecnt,(int)(wordstart-aline)+1,inword);
   1.179 +			if (strlen(testword)<MAX_QWORD_LENGTH &&
   1.180 +			  !pswit[VERBOSE_SWITCH])
   1.181 +			    printf(" - not reporting duplicates");
   1.182 +			printf("\n");
   1.183 +		    }
   1.184 +		    else
   1.185 +			cnt_word++;
   1.186 +		}
   1.187 +	    }
   1.188 +	}
   1.189 +	/* check the user's list of typos */
   1.190 +	if (!istypo && usertypo_count)
   1.191 +	    for (i=0;i<usertypo_count;i++)
   1.192 +		if (!strcmp(testword,usertypo[i]))
   1.193 +		{
   1.194 +		    if (pswit[ECHO_SWITCH])
   1.195 +			printf("\n%s\n",aline);
   1.196 +		    if (!pswit[OVERVIEW_SWITCH])  
   1.197 +			printf("    Line %ld column %d - "
   1.198 +			  "Query possible scanno %s\n",
   1.199 +			  linecnt,(int)(wordstart-aline)+2,inword);
   1.200 +		}
   1.201 +	if (pswit[PARANOID_SWITCH] && warnings->digit)
   1.202 +	{
   1.203 +	    /* In paranoid mode, query all 0 and 1 standing alone. */
   1.204 +	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))
   1.205 +	    {
   1.206 +		if (pswit[ECHO_SWITCH])
   1.207 +		    printf("\n%s\n",aline);
   1.208 +		if (!pswit[OVERVIEW_SWITCH])
   1.209 +		    printf("    Line %ld column %d - Query standalone %s\n",
   1.210 +		      linecnt,(int)(wordstart-aline)+2,inword);
   1.211 +		else
   1.212 +		    cnt_word++;
   1.213 +	    }
   1.214 +	}
   1.215 +    }
   1.216 +}
   1.217 +
   1.218 +/*
   1.219   * procfile:
   1.220   *
   1.221   * Process one file.
   1.222   */
   1.223  void procfile(char *filename)
   1.224  {
   1.225 -    const char *s,*t,*wordstart;
   1.226 -    char inword[MAXWORDLEN],testword[MAXWORDLEN];
   1.227 +    const char *s,*t;
   1.228      char parastart[81];     /* first line of current para */
   1.229      FILE *infile;
   1.230      struct first_pass_results *first_pass_results;
   1.231 @@ -1586,12 +1797,11 @@
   1.232      struct line_properties last={0};
   1.233      int isemptyline;
   1.234      long squot,start_para_line;
   1.235 -    signed int i,llen,isacro,isellipsis,istypo,alower;
   1.236 +    signed int i,llen,isacro,isellipsis;
   1.237      signed int dquotepar,squotepar;
   1.238 -    signed int isnewpara,vowel,consonant;
   1.239 +    signed int isnewpara;
   1.240      char dquote_err[80],squote_err[80],rbrack_err[80],sbrack_err[80],
   1.241        cbrack_err[80],unders_err[80];
   1.242 -    signed int qword_index,isdup;
   1.243      signed int enddash;
   1.244      last.start=CHAR_SPACE;
   1.245      *dquote_err=*squote_err=*rbrack_err=*cbrack_err=*sbrack_err=
   1.246 @@ -1599,9 +1809,7 @@
   1.247      linecnt=checked_linecnt=start_para_line=0;
   1.248      squot=0;
   1.249      i=llen=isacro=isellipsis=0;
   1.250 -    isnewpara=vowel=consonant=enddash=0;
   1.251 -    qword_index=0;
   1.252 -    *inword=*testword=0;
   1.253 +    isnewpara=enddash=0;
   1.254      dquotepar=squotepar=0;
   1.255      infile=fopen(filename,"rb");
   1.256      if (!infile)
   1.257 @@ -1813,208 +2021,7 @@
   1.258  	check_for_pling_scanno(aline);
   1.259  	check_for_extra_period(aline,warnings);
   1.260  	check_for_following_punctuation(aline);
   1.261 -        /*
   1.262 -	 * Check for commonly mistyped words,
   1.263 -	 * and digits like 0 for O in a word.
   1.264 -	 */
   1.265 -        for (s=aline;*s;)
   1.266 -	{
   1.267 -            wordstart=s;
   1.268 -            s=getaword(s,inword);
   1.269 -            if (!*inword)
   1.270 -		continue; /* don't bother with empty lines */
   1.271 -            if (mixdigit(inword))
   1.272 -	    {
   1.273 -                if (pswit[ECHO_SWITCH])
   1.274 -		    printf("\n%s\n",aline);
   1.275 -                if (!pswit[OVERVIEW_SWITCH])
   1.276 -                    printf("    Line %ld column %d - Query digit in %s\n",
   1.277 -		      linecnt,(int)(wordstart-aline)+1,inword);
   1.278 -                else
   1.279 -                    cnt_word++;
   1.280 -	    }
   1.281 -            /*
   1.282 -	     * Put the word through a series of tests for likely typos and OCR
   1.283 -	     * errors.
   1.284 -	     */
   1.285 -            if (pswit[TYPO_SWITCH])
   1.286 -	    {
   1.287 -                istypo=0;
   1.288 -                strcpy(testword,inword);
   1.289 -                alower=0;
   1.290 -                for (i=0;i<(signed int)strlen(testword);i++)
   1.291 -		{
   1.292 -		    /* lowercase for testing */
   1.293 -                    if (testword[i]>='a' && testword[i]<='z')
   1.294 -			alower=1;
   1.295 -                    if (alower && testword[i]>='A' && testword[i]<='Z')
   1.296 -		    {
   1.297 -                        /*
   1.298 -			 * We have an uppercase mid-word. However, there are
   1.299 -			 * common cases:
   1.300 -                         *   Mac and Mc like McGill
   1.301 -                         *   French contractions like l'Abbe
   1.302 -			 */
   1.303 -                        if (i==2 && testword[0]=='m' && testword[1]=='c' ||
   1.304 -                          i==3 && testword[0]=='m' && testword[1]=='a' &&
   1.305 -			  testword[2]=='c' || i>0 && testword[i-1]==CHAR_SQUOTE)
   1.306 -			    ; /* do nothing! */
   1.307 -                        else
   1.308 -                            istypo=1;
   1.309 -		    }
   1.310 -                    testword[i]=(char)tolower(testword[i]);
   1.311 -		}
   1.312 -                /*
   1.313 -		 * Check for certain unlikely two-letter combinations at word
   1.314 -		 * start and end.
   1.315 -		 */
   1.316 -                if (strlen(testword)>1)
   1.317 -		{
   1.318 -                    for (i=0;*nostart[i];i++)
   1.319 -                        if (!strncmp(testword,nostart[i],2))
   1.320 -                            istypo=1;
   1.321 -                    for (i=0;*noend[i];i++)
   1.322 -                        if (!strncmp(testword+strlen(testword)-2,noend[i],2))
   1.323 -                            istypo=1;
   1.324 -		}
   1.325 -                /* ght is common, gbt never. Like that. */
   1.326 -                if (strstr(testword,"cb"))
   1.327 -		    istypo=1;
   1.328 -                if (strstr(testword,"gbt"))
   1.329 -		    istypo=1;
   1.330 -                if (strstr(testword,"pbt"))
   1.331 -		    istypo=1;
   1.332 -                if (strstr(testword,"tbs"))
   1.333 -		    istypo=1;
   1.334 -                if (strstr(testword,"mrn"))
   1.335 -		    istypo=1;
   1.336 -                if (strstr(testword,"ahle"))
   1.337 -		    istypo=1;
   1.338 -                if (strstr(testword,"ihle"))
   1.339 -		    istypo=1;
   1.340 -                /*
   1.341 -		 * "TBE" does happen - like HEARTBEAT - but uncommon.
   1.342 -                 * Also "TBI" - frostbite, outbid - but uncommon.
   1.343 -                 * Similarly "ii" like Hawaii, or Pompeii, and in Roman
   1.344 -		 * numerals, but "ii" is a common scanno.
   1.345 -		 */
   1.346 -                if (strstr(testword,"tbi"))
   1.347 -		    istypo=1;
   1.348 -                if (strstr(testword,"tbe"))
   1.349 -		    istypo=1;
   1.350 -                if (strstr(testword,"ii"))
   1.351 -		    istypo=1;
   1.352 -                /*
   1.353 -		 * Check for no vowels or no consonants.
   1.354 -                 * If none, flag a typo.
   1.355 -		 */
   1.356 -                if (!istypo && strlen(testword)>1)
   1.357 -		{
   1.358 -                    vowel=consonant=0;
   1.359 -                    for (i=0;testword[i];i++)
   1.360 -		    {
   1.361 -                        if (testword[i]=='y' || gcisdigit(testword[i]))
   1.362 -			{
   1.363 -			    /* Yah, this is loose. */
   1.364 -                            vowel++;
   1.365 -                            consonant++;
   1.366 -			}
   1.367 -                        else if (strchr(vowels,testword[i]))
   1.368 -			    vowel++;
   1.369 -			else
   1.370 -			    consonant++;
   1.371 -		    }
   1.372 -                    if (!vowel || !consonant)
   1.373 -                        istypo=1;
   1.374 -		}
   1.375 -                /*
   1.376 -		 * Now exclude the word from being reported if it's in
   1.377 -                 * the okword list.
   1.378 -		 */
   1.379 -                for (i=0;*okword[i];i++)
   1.380 -                    if (!strcmp(testword,okword[i]))
   1.381 -                        istypo=0;
   1.382 -                /*
   1.383 -		 * What looks like a typo may be a Roman numeral.
   1.384 -		 * Exclude these.
   1.385 -		 */
   1.386 -                if (istypo && isroman(testword))
   1.387 -		    istypo=0;
   1.388 -                /* Check the manual list of typos. */
   1.389 -                if (!istypo)
   1.390 -                    for (i=0;*typo[i];i++)
   1.391 -                        if (!strcmp(testword,typo[i]))
   1.392 -                            istypo=1;
   1.393 -                /*
   1.394 -		 * Check lowercase s, l, i and m - special cases.
   1.395 -                 *   "j" - often a semi-colon gone wrong.
   1.396 -                 *   "d" for a missing apostrophe - he d
   1.397 -                 *   "n" for "in"
   1.398 -		 */
   1.399 -                if (!istypo && strlen(testword)==1 && strchr("slmijdn",*inword))
   1.400 -		    istypo=1;
   1.401 -                if (istypo)
   1.402 -		{
   1.403 -                    isdup=0;
   1.404 -                    if (strlen(testword)<MAX_QWORD_LENGTH &&
   1.405 -		      !pswit[VERBOSE_SWITCH])
   1.406 -                        for (i=0;i<qword_index;i++)
   1.407 -                            if (!strcmp(testword,qword[i]))
   1.408 -			    {
   1.409 -                                isdup=1;
   1.410 -                                ++dupcnt[i];
   1.411 -			    }
   1.412 -                    if (!isdup)
   1.413 -		    {
   1.414 -                        if (qword_index<MAX_QWORD &&
   1.415 -			  strlen(testword)<MAX_QWORD_LENGTH)
   1.416 -			{
   1.417 -                            strcpy(qword[qword_index],testword);
   1.418 -                            qword_index++;
   1.419 -			}
   1.420 -                        if (pswit[ECHO_SWITCH])
   1.421 -			    printf("\n%s\n",aline);
   1.422 -                        if (!pswit[OVERVIEW_SWITCH])
   1.423 -			{
   1.424 -                            printf("    Line %ld column %d - Query word %s",
   1.425 -			      linecnt,(int)(wordstart-aline)+1,inword);
   1.426 -                            if (strlen(testword)<MAX_QWORD_LENGTH &&
   1.427 -			      !pswit[VERBOSE_SWITCH])
   1.428 -                                printf(" - not reporting duplicates");
   1.429 -                            printf("\n");
   1.430 -			}
   1.431 -                        else
   1.432 -                            cnt_word++;
   1.433 -		    }
   1.434 -		}
   1.435 -	    }
   1.436 -	    /* check the user's list of typos */
   1.437 -	    if (!istypo && usertypo_count)
   1.438 -		for (i=0;i<usertypo_count;i++)
   1.439 -		    if (!strcmp(testword,usertypo[i]))
   1.440 -		    {
   1.441 -			if (pswit[ECHO_SWITCH])
   1.442 -			    printf("\n%s\n",aline);
   1.443 -			if (!pswit[OVERVIEW_SWITCH])  
   1.444 -			    printf("    Line %ld column %d - "
   1.445 -			      "Query possible scanno %s\n",
   1.446 -			      linecnt,(int)(wordstart-aline)+2,inword);
   1.447 -		    }
   1.448 -            if (pswit[PARANOID_SWITCH] && warnings->digit)
   1.449 -	    {
   1.450 -		/* In paranoid mode, query all 0 and 1 standing alone. */
   1.451 -                if (!strcmp(inword,"0") || !strcmp(inword,"1"))
   1.452 -		{
   1.453 -                    if (pswit[ECHO_SWITCH])
   1.454 -			printf("\n%s\n",aline);
   1.455 -                    if (!pswit[OVERVIEW_SWITCH])
   1.456 -                        printf("    Line %ld column %d - Query standalone %s\n",
   1.457 -			  linecnt,(int)(wordstart-aline)+2,inword);
   1.458 -                    else
   1.459 -                        cnt_word++;
   1.460 -		}
   1.461 -	    }
   1.462 -	}
   1.463 +	check_for_typos(aline,warnings);
   1.464  	/*
   1.465           * Look for added or missing spaces around punctuation and quotes.
   1.466           * If there is a punctuation character like ! with no space on