# HG changeset patch # User ali # Date 1369895484 -3600 # Node ID aa916da2e452f74c1042749418904d659ed89693 # Parent 1016349e619fe15277a41597cf9b8811c5411b25 Switch to using UTF-8 internally diff -r 1016349e619f -r aa916da2e452 bookloupe/bookloupe.c --- a/bookloupe/bookloupe.c Tue May 28 15:17:19 2013 +0100 +++ b/bookloupe/bookloupe.c Thu May 30 07:31:24 2013 +0100 @@ -119,8 +119,6 @@ "among", "those", "into", "whom", "having", "thence", "" }; -char vowels[] = "aeiouàáâãäæèéêëìíîïòóôõöùúûü"; - struct { char *htmlent; char *htmlnum; @@ -347,16 +345,13 @@ gchar *running_from; -int mixdigit(const char *); +gboolean mixdigit(const char *); gchar *getaword(const char **); char *flgets(char **,long); -gboolean gcisalpha(unsigned char); -gboolean gcisdigit(unsigned char); -gboolean gcisletter(unsigned char); void postprocess_for_HTML(char *); char *linehasmarkup(char *); char *losemarkup(char *); -int tagcomp(const char *,const char *); +gboolean tagcomp(const char *,const char *); char *loseentities(char *); gboolean isroman(const char *); void postprocess_for_DP(char *); @@ -385,7 +380,7 @@ struct line_properties { unsigned int len,blen; - char start; + gunichar start; }; struct parities { @@ -462,8 +457,8 @@ gchar *usertypo_file; gboolean okay; int i; - gsize len; - gchar *contents,**lines; + gsize len,nb; + gchar *contents,*utf8,**lines; usertypo_file=g_strdup("bookloupe.typ"); okay=file_get_contents_text(usertypo_file,&contents,&len,&err); if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT)) @@ -490,7 +485,7 @@ if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT)) { g_free(usertypo_file); - printf(" --> I couldn't find bookloupe.typ " + g_print(" --> I couldn't find bookloupe.typ " "-- proceeding without user typos.\n"); return; } @@ -501,7 +496,10 @@ g_clear_error(&err); exit(1); } - lines=g_strsplit(contents,"\n",0); + utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL); + g_free(contents); + lines=g_strsplit_set(utf8,"\r\n",0); + g_free(utf8); usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL); for (i=0;lines[i];i++) if (*(unsigned char *)lines[i]>'!') @@ -511,49 +509,6 @@ g_free(lines); } -#if 0 -/* - * read_etext: - * - * Read an etext returning an array of lines. Lines are normally expected - * to be terminated by CR LF. Solitary LFs delimit lines but are left - * embedded at the end of the line for further processing. Solitary CRs - * do not delimit lines. - */ -gchar **read_etext(const char *filename,GError **err) -{ - int i; - const char *s,*t; - gchar *contents; - gchar **raw_lines; - GPtrArray *lines; - gsize len; - if (!g_file_get_contents(filename,&contents,&len,err)) - return NULL; - raw_lines=g_strsplit(contents,"\r\n",0); - lines=g_ptr_array_sized_new(g_strv_length(raw_lines)+1); - for (i=0;raw_lines[i];i++) - { - t=strchr(raw_lines[i],'\n'); - if (t) - { - s=raw_lines[i]; - while ((t=strchr(s,'\n'))) - { - g_ptr_array_add(lines,g_strndup(s,t-s+1)); - s=t+1; - } - g_ptr_array_add(lines,g_strdup(s)); - g_free(raw_lines[i]); - } - else - g_ptr_array_add(lines,raw_lines[i]); - } - g_free(raw_lines); - g_ptr_array_add(lines,NULL); - return (gchar **)g_ptr_array_free(lines,FALSE); -} -#else /* * read_etext: * @@ -562,13 +517,14 @@ */ gchar *read_etext(const char *filename,GError **err) { - gchar *contents; - gsize len; + gchar *contents,*utf8; + gsize len,nb; if (!g_file_get_contents(filename,&contents,&len,err)) return NULL; - return contents; + utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL); + g_free(contents); + return utf8; } -#endif int main(int argc,char **argv) { @@ -580,35 +536,35 @@ procfile(argv[1]); if (pswit[OVERVIEW_SWITCH]) { - printf(" Checked %ld lines of %ld (head+foot = %ld)\n\n", + g_print(" Checked %ld lines of %ld (head+foot = %ld)\n\n", checked_linecnt,linecnt,linecnt-checked_linecnt); - printf(" --------------- Queries found --------------\n"); + g_print(" --------------- Queries found --------------\n"); if (cnt_long) - printf(" Long lines: %14ld\n",cnt_long); + g_print(" Long lines: %14ld\n",cnt_long); if (cnt_short) - printf(" Short lines: %14ld\n",cnt_short); + g_print(" Short lines: %14ld\n",cnt_short); if (cnt_lineend) - printf(" Line-end problems: %14ld\n",cnt_lineend); + g_print(" Line-end problems: %14ld\n",cnt_lineend); if (cnt_word) - printf(" Common typos: %14ld\n",cnt_word); + g_print(" Common typos: %14ld\n",cnt_word); if (cnt_dquot) - printf(" Unmatched quotes: %14ld\n",cnt_dquot); + g_print(" Unmatched quotes: %14ld\n",cnt_dquot); if (cnt_squot) - printf(" Unmatched SingleQuotes: %14ld\n",cnt_squot); + g_print(" Unmatched SingleQuotes: %14ld\n",cnt_squot); if (cnt_brack) - printf(" Unmatched brackets: %14ld\n",cnt_brack); + g_print(" Unmatched brackets: %14ld\n",cnt_brack); if (cnt_bin) - printf(" Non-ASCII characters: %14ld\n",cnt_bin); + g_print(" Non-ASCII characters: %14ld\n",cnt_bin); if (cnt_odd) - printf(" Proofing characters: %14ld\n",cnt_odd); + g_print(" Proofing characters: %14ld\n",cnt_odd); if (cnt_punct) - printf(" Punctuation & spacing queries: %14ld\n",cnt_punct); + g_print(" Punctuation & spacing queries: %14ld\n",cnt_punct); if (cnt_dash) - printf(" Non-standard dashes: %14ld\n",cnt_dash); + g_print(" Non-standard dashes: %14ld\n",cnt_dash); if (cnt_html) - printf(" Possible HTML tags: %14ld\n",cnt_html); - printf("\n"); - printf(" TOTAL QUERIES %14ld\n", + g_print(" Possible HTML tags: %14ld\n",cnt_html); + g_print("\n"); + g_print(" TOTAL QUERIES %14ld\n", cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+ cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend); } @@ -628,10 +584,10 @@ */ struct first_pass_results *first_pass(const char *etext) { - char laststart=CHAR_SPACE; + gunichar laststart=CHAR_SPACE; const char *s; gchar *lc_line; - int i,j,llen; + int i,j,lbytes,llen; gchar **lines; unsigned int lastlen=0,lastblen=0; long spline=0,nspline=0; @@ -640,27 +596,28 @@ lines=g_strsplit(etext,"\n",0); for (j=0;lines[j];j++) { - llen=strlen(lines[j]); - while(lines[j][llen-1]=='\r') - lines[j][llen--]='\0'; + lbytes=strlen(lines[j]); + while (lines[j][lbytes-1]=='\r') + lines[j][--lbytes]='\0'; + llen=g_utf8_strlen(lines[j],lbytes); linecnt++; if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") && (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT"))) { if (spline) - printf(" --> Duplicate header?\n"); + g_print(" --> Duplicate header?\n"); spline=linecnt+1; /* first line of non-header text, that is */ } if (!strncmp(lines[j],"*** START",9) && strstr(lines[j],"PROJECT GUTENBERG")) { if (nspline) - printf(" --> Duplicate header?\n"); + g_print(" --> Duplicate header?\n"); nspline=linecnt+1; /* first line of non-header text, that is */ } if (spline || nspline) { - lc_line=g_ascii_strdown(lines[j],llen); + lc_line=g_utf8_strdown(lines[j],lbytes); if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg")) { if (strstr(lc_line,"end") Duplicate footer?\n"); + g_print(" --> Duplicate footer?\n"); } else results.footerline=linecnt; @@ -684,19 +641,21 @@ if (results.footerline) continue; /* don't count the boilerplate in the footer */ results.totlen+=llen; - for (i=0;i127) + if (g_utf8_get_char(s)>127) results.binlen++; - if (gcisalpha(lines[j][i])) + if (g_unichar_isalpha(g_utf8_get_char(s))) results.alphalen++; - if (i>0 && lines[j][i]==CHAR_DQUOTE && isalpha(lines[j][i-1])) + if (s>lines[j] && g_utf8_get_char(s)==CHAR_DQUOTE && + isalpha(g_utf8_get_char(g_utf8_prev_char(s)))) results.endquote_count++; } if (llen>2 && lastlen>2 && lastlen2 && lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE) results.shortline++; - if (llen>0 && (unsigned char)lines[j][llen-1]<=CHAR_SPACE) + if (lbytes>0 && + g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE) cnt_spacend++; if (strstr(lines[j],".,")) results.dotcomma++; @@ -704,17 +663,19 @@ /* locase text on the line */ if (strchr(lines[j],'*')) { - for (s=lines[j];*s;s++) - if (*s>='a' && *s<='z') + for (s=lines[j];*s;s=g_utf8_next_char(s)) + if (g_unichar_islower(g_utf8_get_char(s))) break; - if (*s) + if (*s) results.astline++; } if (strchr(lines[j],'/')) results.fslashline++; - for (i=llen-1;i>0 && (unsigned char)lines[j][i]<=CHAR_SPACE;i--) + for (s=g_utf8_prev_char(lines[j]+lbytes); + s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s)) ; - if (i>1 && lines[j][i]=='-' && lines[j][i-1]!='-') + if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' && + g_utf8_get_char(g_utf8_prev_char(s))!='-') results.hyphens++; if (llen>LONGEST_PG_LINE) results.longline++; @@ -729,15 +690,15 @@ results.htmcount+=4; /* bonus marks! */ } /* Check for spaced em-dashes */ - if (lines[j][0] && (s=strstr(lines[j]+1,"--"))) + if (lines[j][0] && (s=strstr(g_utf8_next_char(lines[j]),"--"))) { results.emdash++; - if (s[-1]==CHAR_SPACE || (s[2]==CHAR_SPACE)) + if (s[-1]==CHAR_SPACE || s[2]==CHAR_SPACE) results.space_emdash++; - if (s[-1]==CHAR_SPACE && (s[2]==CHAR_SPACE)) + if (s[-1]==CHAR_SPACE && s[2]==CHAR_SPACE) /* count of em-dashes with spaces both sides */ results.non_PG_space_emdash++; - if (s[-1]!=CHAR_SPACE && (s[2]!=CHAR_SPACE)) + if (s[-1]!=CHAR_SPACE && s[2]!=CHAR_SPACE) /* count of PG-type em-dashes with no spaces */ results.PG_space_emdash++; } @@ -772,13 +733,13 @@ { static struct warnings warnings={0}; if (cnt_spacend>0) - printf(" --> %ld lines in this file have white space at end\n", + g_print(" --> %ld lines in this file have white space at end\n", cnt_spacend); warnings.dotcomma=1; if (results->dotcomma>5) { warnings.dotcomma=0; - printf(" --> %ld lines in this file contain '.,'. " + g_print(" --> %ld lines in this file contain '.,'. " "Not reporting them.\n",results->dotcomma); } /* @@ -789,7 +750,7 @@ if (results->shortline>50 || results->shortline*10>linecnt) { warnings.shortline=0; - printf(" --> %ld lines in this file are short. " + g_print(" --> %ld lines in this file are short. " "Not reporting short lines.\n",results->shortline); } /* @@ -800,7 +761,7 @@ if (results->longline>50 || results->longline*10>linecnt) { warnings.longline=0; - printf(" --> %ld lines in this file are long. " + g_print(" --> %ld lines in this file are long. " "Not reporting long lines.\n",results->longline); } /* If more than 10 lines contain asterisks, don't bother reporting them. */ @@ -808,7 +769,7 @@ if (results->astline>10) { warnings.ast=0; - printf(" --> %ld lines in this file contain asterisks. " + g_print(" --> %ld lines in this file contain asterisks. " "Not reporting them.\n",results->astline); } /* @@ -819,7 +780,7 @@ if (results->fslashline>10) { warnings.fslash=0; - printf(" --> %ld lines in this file contain forward slashes. " + g_print(" --> %ld lines in this file contain forward slashes. " "Not reporting them.\n",results->fslashline); } /* @@ -830,7 +791,7 @@ if (results->endquote_count>20) { warnings.endquote=0; - printf(" --> %ld lines in this file contain unpunctuated endquotes. " + g_print(" --> %ld lines in this file contain unpunctuated endquotes. " "Not reporting them.\n",results->endquote_count); } /* @@ -841,7 +802,7 @@ if (results->standalone_digit>10) { warnings.digit=0; - printf(" --> %ld lines in this file contain standalone 0s and 1s. " + g_print(" --> %ld lines in this file contain standalone 0s and 1s. " "Not reporting them.\n",results->standalone_digit); } /* @@ -852,16 +813,16 @@ if (results->hyphens>20) { warnings.hyphen=0; - printf(" --> %ld lines in this file have hyphens at end. " + g_print(" --> %ld lines in this file have hyphens at end. " "Not reporting them.\n",results->hyphens); } if (results->htmcount>20 && !pswit[MARKUP_SWITCH]) { - printf(" --> Looks like this is HTML. Switching HTML mode ON.\n"); + g_print(" --> Looks like this is HTML. Switching HTML mode ON.\n"); pswit[MARKUP_SWITCH]=1; } if (results->verylongline>0) - printf(" --> %ld lines in this file are VERY long!\n", + g_print(" --> %ld lines in this file are VERY long!\n", results->verylongline); /* * If there are more non-PG spaced dashes than PG em-dashes, @@ -874,7 +835,7 @@ results->PG_space_emdash) { warnings.dash=0; - printf(" --> There are %ld spaced dashes and em-dashes. " + g_print(" --> There are %ld spaced dashes and em-dashes. " "Not reporting them.\n", results->spacedash+results->non_PG_space_emdash); } @@ -882,19 +843,19 @@ warnings.bin=1; if (results->binlen*4>results->totlen) { - printf(" --> This file does not appear to be ASCII. " + g_print(" --> This file does not appear to be ASCII. " "Terminating. Best of luck with it!\n"); exit(1); } if (results->alphalen*4totlen) { - printf(" --> This file does not appear to be text. " + g_print(" --> This file does not appear to be text. " "Terminating. Best of luck with it!\n"); exit(1); } if (results->binlen*100>results->totlen || results->binlen>100) { - printf(" --> There are a lot of foreign letters here. " + g_print(" --> There are a lot of foreign letters here. " "Not reporting them.\n"); warnings.bin=0; } @@ -902,26 +863,26 @@ if (results->Dutchcount>50) { warnings.isDutch=TRUE; - printf(" --> This looks like Dutch - " + g_print(" --> This looks like Dutch - " "switching off dashes and warnings for 's Middags case.\n"); } warnings.isFrench=FALSE; if (results->Frenchcount>50) { warnings.isFrench=TRUE; - printf(" --> This looks like French - " + g_print(" --> This looks like French - " "switching off some doublepunct.\n"); } if (results->firstline && results->footerline) - printf(" The PG header and footer appear to be already on.\n"); + g_print(" The PG header and footer appear to be already on.\n"); else { if (results->firstline) - printf(" The PG header is on - no footer.\n"); + g_print(" The PG header is on - no footer.\n"); if (results->footerline) - printf(" The PG footer is on - no header.\n"); + g_print(" The PG footer is on - no header.\n"); } - printf("\n"); + g_print("\n"); if (pswit[VERBOSE_SWITCH]) { warnings.bin=1; @@ -934,7 +895,7 @@ warnings.fslash=1; warnings.hyphen=1; warnings.endquote=1; - printf(" *** Verbose output is ON -- you asked for it! ***\n"); + g_print(" *** Verbose output is ON -- you asked for it! ***\n"); } if (warnings.isDutch) warnings.dash=0; @@ -942,9 +903,9 @@ results->footerline>results->firstline && results->footerline-results->firstline<100) { - printf(" --> I don't really know where this text starts. \n"); - printf(" There are no reference points.\n"); - printf(" I'm going to have to report the header and footer " + g_print(" --> I don't really know where this text starts. \n"); + g_print(" There are no reference points.\n"); + g_print(" I'm going to have to report the header and footer " "as well.\n"); results->firstline=0; } @@ -968,12 +929,16 @@ int guessquote=0; /* assume the line is empty until proven otherwise */ gboolean isemptyline=TRUE; - const char *s=aline; + const char *s=aline,*sprev,*snext; + gunichar c; + sprev=NULL; while (*s) { - if (*s==CHAR_DQUOTE) + snext=g_utf8_next_char(s); + c=g_utf8_get_char(s); + if (c==CHAR_DQUOTE) counters->quot++; - if (*s==CHAR_SQUOTE || *s==CHAR_OPEN_SQUOTE) + if (c==CHAR_SQUOTE || c==CHAR_OPEN_SQUOTE) { if (s==aline) { @@ -981,17 +946,21 @@ * At start of line, it can only be an openquote. * Hardcode a very common exception! */ - if (strncmp(s+2,"tis",3) && strncmp(s+2,"Tis",3)) + if (!g_str_has_prefix(snext,"tis") && + !g_str_has_prefix(snext,"Tis")) counters->open_single_quote++; } - else if (gcisalpha(s[-1]) && gcisalpha(s[1])) + else if (g_unichar_isalpha(g_utf8_get_char(sprev)) && + g_unichar_isalpha(g_utf8_get_char(snext))) /* Do nothing! it's definitely an apostrophe, not a quote */ ; /* it's outside a word - let's check it out */ - else if (*s==CHAR_OPEN_SQUOTE || gcisalpha(s[1])) + else if (c==CHAR_OPEN_SQUOTE || + g_unichar_isalpha(g_utf8_get_char(snext))) { /* it damwell better BE an openquote */ - if (strncmp(s+1,"tis",3) && strncmp(s+1,"Tis",3)) + if (!g_str_has_prefix(snext,"tis") && + !g_str_has_prefix(snext,"Tis")) /* hardcode a very common exception! */ counters->open_single_quote++; } @@ -999,20 +968,22 @@ { /* now - is it a closequote? */ guessquote=0; /* accumulate clues */ - if (gcisalpha(s[-1])) + if (g_unichar_isalpha(g_utf8_get_char(sprev))) { /* it follows a letter - could be either */ guessquote++; - if (s[-1]=='s') + if (g_utf8_get_char(sprev)=='s') { /* looks like a plural apostrophe */ guessquote-=3; - if (s[1]==CHAR_SPACE) /* bonus marks! */ + if (g_utf8_get_char(snext)==CHAR_SPACE) + /* bonus marks! */ guessquote-=2; } } /* it doesn't have a letter either side */ - else if (strchr(".?!,;:",s[-1]) && strchr(".?!,;: ",s[1])) + else if (strchr(".?!,;:",g_utf8_get_char(sprev)) && + strchr(".?!,;: ",g_utf8_get_char(snext))) guessquote+=8; /* looks like a closequote */ else guessquote++; @@ -1028,24 +999,25 @@ counters->close_single_quote++; } } - if (*s!=CHAR_SPACE && *s!='-' && *s!='.' && *s!=CHAR_ASTERISK && - *s!=13 && *s!=10) + if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK && + c!='\r' && c!='\n') isemptyline=FALSE; /* ignore lines like * * * as spacers */ - if (*s==CHAR_UNDERSCORE) + if (c==CHAR_UNDERSCORE) counters->c_unders++; - if (*s==CHAR_OPEN_CBRACK) + if (c==CHAR_OPEN_CBRACK) counters->c_brack++; - if (*s==CHAR_CLOSE_CBRACK) + if (c==CHAR_CLOSE_CBRACK) counters->c_brack--; - if (*s==CHAR_OPEN_RBRACK) + if (c==CHAR_OPEN_RBRACK) counters->r_brack++; - if (*s==CHAR_CLOSE_RBRACK) + if (c==CHAR_CLOSE_RBRACK) counters->r_brack--; - if (*s==CHAR_OPEN_SBRACK) + if (c==CHAR_OPEN_SBRACK) counters->s_brack++; - if (*s==CHAR_CLOSE_SBRACK) + if (c==CHAR_CLOSE_SBRACK) counters->s_brack--; - s++; + sprev=s; + s=snext; } return isemptyline; } @@ -1060,18 +1032,18 @@ */ void check_for_control_characters(const char *aline) { - unsigned char c; + gunichar c; const char *s; - for (s=aline;*s;s++) + for (s=aline;*s;s=g_utf8_next_char(s)) { - c=*(unsigned char *)s; + c=g_utf8_get_char(s); if (c127)) + c=g_utf8_get_char(s); + if (!eNon_A && (c127)) { if (pswit[ECHO_SWITCH]) - printf("\n%s\n",aline); + g_print("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) - if (c>127 && c<160) - printf(" Line %ld column %d - " - "Non-ISO-8859 character %d\n",linecnt,(int)(s-aline)+1,c); + if (c>127 && c<160 || c>255) + g_print(" Line %ld column %ld - " + "Non-ISO-8859 character %u\n", + linecnt,g_utf8_pointer_to_offset(aline,s)+1,c); else - printf(" Line %ld column %d - Non-ASCII character %d\n", - linecnt,(int)(s-aline)+1,c); + g_print(" Line %ld column %ld - " + "Non-ASCII character %u\n", + linecnt,g_utf8_pointer_to_offset(aline,s)+1,c); else cnt_bin++; - eNon_A=1; + eNon_A=TRUE; } - if (!eTab && *s==CHAR_TAB) + if (!eTab && c==CHAR_TAB) { if (pswit[ECHO_SWITCH]) - printf("\n%s\n",aline); + g_print("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) - printf(" Line %ld column %d - Tab character?\n", - linecnt,(int)(s-aline)+1); + g_print(" Line %ld column %ld - Tab character?\n", + linecnt,g_utf8_pointer_to_offset(aline,s)+1); else cnt_odd++; - eTab=1; + eTab=TRUE; } - if (!eTilde && *s==CHAR_TILDE) + if (!eTilde && c==CHAR_TILDE) { /* * Often used by OCR software to indicate an * unrecognizable character. */ if (pswit[ECHO_SWITCH]) - printf("\n%s\n",aline); + g_print("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) - printf(" Line %ld column %d - Tilde character?\n", - linecnt,(int)(s-aline)+1); + g_print(" Line %ld column %ld - Tilde character?\n", + linecnt,g_utf8_pointer_to_offset(aline,s)+1); else cnt_odd++; - eTilde=1; + eTilde=TRUE; } - if (!eCarat && *s==CHAR_CARAT) + if (!eCarat && c==CHAR_CARAT) { if (pswit[ECHO_SWITCH]) - printf("\n%s\n",aline); + g_print("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) - printf(" Line %ld column %d - Carat character?\n", - linecnt,(int)(s-aline)+1); + g_print(" Line %ld column %ld - Carat character?\n", + linecnt,g_utf8_pointer_to_offset(aline,s)+1); else cnt_odd++; - eCarat=1; + eCarat=TRUE; } - if (!eFSlash && *s==CHAR_FORESLASH && warnings->fslash) + if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash) { if (pswit[ECHO_SWITCH]) - printf("\n%s\n",aline); + g_print("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) - printf(" Line %ld column %d - Forward slash?\n", - linecnt,(int)(s-aline)+1); + g_print(" Line %ld column %ld - Forward slash?\n", + linecnt,g_utf8_pointer_to_offset(aline,s)+1); else cnt_odd++; - eFSlash=1; + eFSlash=TRUE; } /* * Report asterisks only in paranoid mode, * since they're often deliberate. */ if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline && - *s==CHAR_ASTERISK) + c==CHAR_ASTERISK) { if (pswit[ECHO_SWITCH]) - printf("\n%s\n",aline); + g_print("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) - printf(" Line %ld column %d - Asterisk?\n", - linecnt,(int)(s-aline)+1); + g_print(" Line %ld column %ld - Asterisk?\n", + linecnt,g_utf8_pointer_to_offset(aline,s)+1); else cnt_odd++; - eAst=1; + eAst=TRUE; } } } @@ -1182,13 +1157,13 @@ */ void check_for_long_line(const char *aline) { - if (strlen(aline)>LONGEST_PG_LINE) + if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE) { if (pswit[ECHO_SWITCH]) - printf("\n%s\n",aline); + g_print("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) - printf(" Line %ld column %d - Long line %d\n", - linecnt,(int)strlen(aline),(int)strlen(aline)); + g_print(" Line %ld column %ld - Long line %ld\n", + linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1)); else cnt_long++; } @@ -1220,14 +1195,15 @@ */ void check_for_short_line(const char *aline,const struct line_properties *last) { - if (strlen(aline)>1 && last->len>1 && last->lenblen>1 && last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE) + if (g_utf8_strlen(aline,-1)>1 && last->len>1 && + last->lenblen>1 && + last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE) { if (pswit[ECHO_SWITCH]) - printf("\n%s\n",prevline); + g_print("\n%s\n",prevline); if (!pswit[OVERVIEW_SWITCH]) - printf(" Line %ld column %d - Short line %d?\n", - linecnt-1,(int)strlen(prevline),(int)strlen(prevline)); + g_print(" Line %ld column %ld - Short line %ld?\n", + linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1)); else cnt_short++; } @@ -1240,12 +1216,13 @@ */ void check_for_starting_punctuation(const char *aline) { - if (*aline && strchr(".?!,;:",aline[0]) && strncmp(". . .",aline,5)) + if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) && + !g_str_has_prefix(aline,". . .")) { if (pswit[ECHO_SWITCH]) - printf("\n%s\n",aline); + g_print("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) - printf(" Line %ld column 1 - Begins with punctuation?\n", + g_print(" Line %ld column 1 - Begins with punctuation?\n", linecnt); else cnt_punct++; @@ -1263,21 +1240,21 @@ */ void check_for_spaced_emdash(const char *aline) { - const char *s,*t; - s=aline; - while ((t=strstr(s,"--"))) + const char *s,*t,*next; + for (s=aline;t=strstr(s,"--");s=next) { - if (t>aline && t[-1]==CHAR_SPACE || t[2]==CHAR_SPACE) + next=g_utf8_next_char(g_utf8_next_char(t)); + if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE || + g_utf8_get_char(next)==CHAR_SPACE) { if (pswit[ECHO_SWITCH]) - printf("\n%s\n",aline); + g_print("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) - printf(" Line %ld column %d - Spaced em-dash?\n", - linecnt,(int)(t-aline)+1); + g_print(" Line %ld column %ld - Spaced em-dash?\n", + linecnt,g_utf8_pointer_to_offset(aline,t)+1); else cnt_dash++; } - s=t+2; } } @@ -1291,26 +1268,26 @@ const char *s; if ((s=strstr(aline," -"))) { - if (s[2]!='-') + if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-') { if (pswit[ECHO_SWITCH]) - printf("\n%s\n",aline); + g_print("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) - printf(" Line %ld column %d - Spaced dash?\n", - linecnt,(int)(s-aline)+1); + g_print(" Line %ld column %ld - Spaced dash?\n", + linecnt,g_utf8_pointer_to_offset(aline,s)+1); else cnt_dash++; } } else if ((s=strstr(aline,"- "))) { - if (s==aline || s[-1]!='-') + if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-') { if (pswit[ECHO_SWITCH]) - printf("\n%s\n",aline); + g_print("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) - printf(" Line %ld column %d - Spaced dash?\n", - linecnt,(int)(s-aline)+1); + g_print(" Line %ld column %ld - Spaced dash?\n", + linecnt,g_utf8_pointer_to_offset(aline,s)+1); else cnt_dash++; } @@ -1335,10 +1312,11 @@ if (s) { if (pswit[ECHO_SWITCH]) - printf("\n%s\n",aline); + g_print("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) - printf(" Line %ld column %d - Query missing paragraph break?\n", - linecnt,(int)(s-aline)+1); + g_print(" Line %ld column %ld - " + "Query missing paragraph break?\n", + linecnt,g_utf8_pointer_to_offset(aline,s)+1); else cnt_punct++; } @@ -1382,10 +1360,10 @@ if (s) { if (pswit[ECHO_SWITCH]) - printf("\n%s\n",aline); + g_print("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) - printf(" Line %ld column %d - Query he/be error?\n", - linecnt,(int)(s-aline)+1); + g_print(" Line %ld column %ld - Query he/be error?\n", + linecnt,g_utf8_pointer_to_offset(aline,s)+1); else cnt_word++; } @@ -1405,10 +1383,10 @@ if (s) { if (pswit[ECHO_SWITCH]) - printf("\n%s\n",aline); + g_print("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) - printf(" Line %ld column %d - Query had/bad error?\n", - linecnt,(int)(s-aline)+1); + g_print(" Line %ld column %ld - Query had/bad error?\n", + linecnt,g_utf8_pointer_to_offset(aline,s)+1); else cnt_word++; } @@ -1418,10 +1396,10 @@ if (s) { if (pswit[ECHO_SWITCH]) - printf("\n%s\n",aline); + g_print("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) - printf(" Line %ld column %d - Query hut/but error?\n", - linecnt,(int)(s-aline)+1); + g_print(" Line %ld column %ld - Query hut/but error?\n", + linecnt,g_utf8_pointer_to_offset(aline,s)+1); else cnt_word++; } @@ -1440,10 +1418,11 @@ if (s) { if (pswit[ECHO_SWITCH]) - printf("\n%s\n",aline); + g_print("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) - printf(" Line %ld column %d - Query angled bracket with From\n", - linecnt,(int)(s-aline)+1); + g_print(" Line %ld column %ld - " + "Query angled bracket with From\n", + linecnt,g_utf8_pointer_to_offset(aline,s)+1); else cnt_punct++; } @@ -1457,17 +1436,18 @@ */ void check_for_orphan_character(const char *aline) { - if (*aline && !aline[1]) + gunichar c; + c=g_utf8_get_char(aline); + if (c && !*g_utf8_next_char(aline)) { - if (*aline=='I' || *aline=='V' || *aline=='X' || *aline=='L' || - gcisdigit(*aline)) + if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c)) ; /* Nothing - ignore numerals alone on a line. */ else { if (pswit[ECHO_SWITCH]) - printf("\n%s\n",aline); + g_print("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) - printf(" Line %ld column 1 - Query single character line\n", + g_print(" Line %ld column 1 - Query single character line\n", linecnt); else cnt_punct++; @@ -1487,10 +1467,10 @@ if (s) { if (pswit[ECHO_SWITCH]) - printf("\n%s\n",aline); + g_print("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) - printf(" Line %ld column %ld - Query I=exclamation mark?\n", - linecnt,s-aline); + g_print(" Line %ld column %ld - Query I=exclamation mark?\n", + linecnt,g_utf8_pointer_to_offset(aline,s)); else cnt_punct++; } @@ -1506,47 +1486,58 @@ { const char *s,*t,*s1; int i; + gsize len; gboolean istypo; gchar *testword; + gunichar *decomposition; if (pswit[PARANOID_SWITCH]) { - for (t=aline;strstr(t,". ");) + for (t=aline;t=strstr(t,". ");) { - t=strstr(t,". "); if (t==aline) { - t++; + t=g_utf8_next_char(t); /* start of line punctuation is handled elsewhere */ continue; } - if (!gcisalpha(t[-1])) + if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t)))) { - t++; + t=g_utf8_next_char(t); continue; } if (warnings->isDutch) { /* For Frank & Jeroen -- 's Middags case */ - if (t[2]==CHAR_SQUOTE && t[3]>='a' && t[3]<='z' && - t[4]==CHAR_SPACE && t[5]>='A' && t[5]<='Z') + gunichar c2,c3,c4,c5; + c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2)); + c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3)); + c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4)); + c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5)); + if (c2==CHAR_SQUOTE && g_unichar_islower(c3) && + c4==CHAR_SPACE && g_unichar_isupper(c5)) { - t++; + t=g_utf8_next_char(t); continue; } } - s1=t+2; - while (*s1 && !gcisalpha(*s1) && !isdigit(*s1)) - s1++; - if (*s1>='a' && *s1<='z') + s1=g_utf8_next_char(g_utf8_next_char(t)); + while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) && + !isdigit(g_utf8_get_char(s1))) + s1=g_utf8_next_char(s1); + if (g_unichar_islower(g_utf8_get_char(s1))) { /* we have something to investigate */ istypo=TRUE; /* so let's go back and find out */ - for (s1=t-1;s1>=aline && - (gcisalpha(*s1) || gcisdigit(*s1) || *s1==CHAR_SQUOTE && - gcisalpha(s1[1]) && gcisalpha(s1[-1]));s1--) + for (s1=g_utf8_prev_char(t);s1>=aline && + (g_unichar_isalpha(g_utf8_get_char(s1)) || + g_unichar_isdigit(g_utf8_get_char(s1)) || + g_utf8_get_char(s1)==CHAR_SQUOTE && + g_unichar_isalpha(g_utf8_get_char(g_utf8_next_char(s1))) && + g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s1)))); + s1=g_utf8_prev_char(s1)) ; - s1++; + s1=g_utf8_next_char(s1); s=strchr(s1,'.'); if (s) testword=g_strndup(s1,s-s1); @@ -1555,18 +1546,23 @@ for (i=0;*abbrev[i];i++) if (!strcmp(testword,abbrev[i])) istypo=FALSE; - if (gcisdigit(*testword)) + if (g_unichar_isdigit(g_utf8_get_char(testword))) istypo=FALSE; - if (!testword[1]) + if (!*g_utf8_next_char(testword)) istypo=FALSE; if (isroman(testword)) istypo=FALSE; if (istypo) { istypo=FALSE; - for (i=0;testword[i];i++) - if (strchr(vowels,testword[i])) + for (s=testword;*s;s=g_utf8_next_char(s)) + { + decomposition=g_unicode_canonical_decomposition( + g_utf8_get_char(s),&len); + if (g_utf8_strchr("aeiou",-1,decomposition[0])) istypo=TRUE; + g_free(decomposition); + } } if (istypo && (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword))) @@ -1574,16 +1570,16 @@ g_tree_insert(qperiod,g_strdup(testword), GINT_TO_POINTER(1)); if (pswit[ECHO_SWITCH]) - printf("\n%s\n",aline); + g_print("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) - printf(" Line %ld column %d - Extra period?\n", - linecnt,(int)(t-aline)+1); + g_print(" Line %ld column %ld - Extra period?\n", + linecnt,g_utf8_pointer_to_offset(aline,t)+1); else cnt_punct++; } g_free(testword); } - t++; + t=g_utf8_next_char(t); } } } @@ -1597,6 +1593,7 @@ { int i; const char *s,*wordstart; + gunichar c; gchar *inword,*t; if (pswit[TYPO_SWITCH]) { @@ -1609,19 +1606,21 @@ g_free(t); continue; } - inword=g_ascii_strdown(t,-1); + inword=g_utf8_strdown(t,-1); g_free(t); for (i=0;*nocomma[i];i++) if (!strcmp(inword,nocomma[i])) { - if (*s==',' || *s==';' || *s==':') + c=g_utf8_get_char(s); + if (c==',' || c==';' || c==':') { if (pswit[ECHO_SWITCH]) - printf("\n%s\n",aline); + g_print("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) - printf(" Line %ld column %d - " + g_print(" Line %ld column %ld - " "Query punctuation after %s?\n", - linecnt,(int)(s-aline)+1,inword); + linecnt,g_utf8_pointer_to_offset(aline,s)+1, + inword); else cnt_punct++; } @@ -1629,14 +1628,16 @@ for (i=0;*noperiod[i];i++) if (!strcmp(inword,noperiod[i])) { - if (*s=='.' || *s=='!') + c=g_utf8_get_char(s); + if (c=='.' || c=='!') { if (pswit[ECHO_SWITCH]) - printf("\n%s\n",aline); + g_print("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) - printf(" Line %ld column %d - " + g_print(" Line %ld column %ld - " "Query punctuation after %s?\n", - linecnt,(int)(s-aline)+1,inword); + linecnt,g_utf8_pointer_to_offset(aline,s)+1, + inword); else cnt_punct++; } @@ -1654,10 +1655,15 @@ */ void check_for_typos(const char *aline,struct warnings *warnings) { - const char *s,*wordstart; - gchar *inword,*testword; - int i,alower,vowel,consonant,*dupcnt; - gboolean isdup,istypo; + const char *s,*t,*nt,*wordstart; + gchar *inword; + gunichar *decomposition; + gchar *testword; + int i,vowel,consonant,*dupcnt; + gboolean isdup,istypo,alower; + gunichar c; + long offset,len; + gsize decomposition_len; for (s=aline;*s;) { wordstart=s; @@ -1670,10 +1676,10 @@ if (mixdigit(inword)) { if (pswit[ECHO_SWITCH]) - printf("\n%s\n",aline); + g_print("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) - printf(" Line %ld column %d - Query digit in %s\n", - linecnt,(int)(wordstart-aline)+1,inword); + g_print(" Line %ld column %ld - Query digit in %s\n", + linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword); else cnt_word++; } @@ -1684,14 +1690,15 @@ if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH]) { istypo=FALSE; - testword=g_strdup(inword); - alower=0; - for (i=0;i<(int)strlen(testword);i++) + alower=FALSE; + for (t=inword;*t;t=g_utf8_next_char(t)) { + c=g_utf8_get_char(t); + nt=g_utf8_next_char(t); /* lowercase for testing */ - if (testword[i]>='a' && testword[i]<='z') - alower=1; - if (alower && testword[i]>='A' && testword[i]<='Z') + if (g_unichar_islower(c)) + alower=TRUE; + if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c))) { /* * We have an uppercase mid-word. However, there are @@ -1699,15 +1706,18 @@ * Mac and Mc like McGill * French contractions like l'Abbe */ - if (i==2 && testword[0]=='m' && testword[1]=='c' || - i==3 && testword[0]=='m' && testword[1]=='a' && - testword[2]=='c' || i>0 && testword[i-1]==CHAR_SQUOTE) + offset=g_utf8_pointer_to_offset(inword,t); + if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' || + offset==3 && c=='m' && g_utf8_get_char(nt)=='a' && + g_utf8_get_char(g_utf8_next_char(nt))=='c' || + offset>0 && + g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SQUOTE) ; /* do nothing! */ else istypo=TRUE; } - testword[i]=(char)tolower(testword[i]); } + testword=g_utf8_casefold(inword,-1); } if (pswit[TYPO_SWITCH]) { @@ -1715,13 +1725,14 @@ * Check for certain unlikely two-letter combinations at word * start and end. */ - if (strlen(testword)>1) + len=g_utf8_strlen(testword,-1); + if (len>1) { for (i=0;*nostart[i];i++) - if (!strncmp(testword,nostart[i],2)) + if (g_str_has_prefix(testword,nostart[i])) istypo=TRUE; for (i=0;*noend[i];i++) - if (!strncmp(testword+strlen(testword)-2,noend[i],2)) + if (g_str_has_suffix(testword,noend[i])) istypo=TRUE; } /* ght is common, gbt never. Like that. */ @@ -1755,21 +1766,25 @@ * Check for no vowels or no consonants. * If none, flag a typo. */ - if (!istypo && strlen(testword)>1) + if (!istypo && len>1) { vowel=consonant=0; - for (i=0;testword[i];i++) + for (t=testword;*t;t=g_utf8_next_char(t)) { - if (testword[i]=='y' || gcisdigit(testword[i])) + c=g_utf8_get_char(t); + decomposition= + g_unicode_canonical_decomposition(c,&decomposition_len); + if (c=='y' || g_unichar_isdigit(c)) { /* Yah, this is loose. */ vowel++; consonant++; } - else if (strchr(vowels,testword[i])) + else if (g_utf8_strchr("aeiou",-1,decomposition[0])) vowel++; else consonant++; + g_free(decomposition); } if (!vowel || !consonant) istypo=TRUE; @@ -1798,7 +1813,8 @@ * "d" for a missing apostrophe - he d * "n" for "in" */ - if (!istypo && strlen(testword)==1 && strchr("slmijdn",*inword)) + if (!istypo && len==1 && + g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword))) istypo=TRUE; if (istypo) { @@ -1817,14 +1833,15 @@ if (!isdup) { if (pswit[ECHO_SWITCH]) - printf("\n%s\n",aline); + g_print("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) { - printf(" Line %ld column %d - Query word %s", - linecnt,(int)(wordstart-aline)+1,inword); + g_print(" Line %ld column %ld - Query word %s", + linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1, + inword); if (!pswit[VERBOSE_SWITCH]) - printf(" - not reporting duplicates"); - printf("\n"); + g_print(" - not reporting duplicates"); + g_print("\n"); } else cnt_word++; @@ -1835,10 +1852,10 @@ if (!istypo && usertypo && g_tree_lookup(usertypo,testword)) { if (pswit[ECHO_SWITCH]) - printf("\n%s\n",aline); + g_print("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) - printf(" Line %ld column %d - Query possible scanno %s\n", - linecnt,(int)(wordstart-aline)+2,inword); + g_print(" Line %ld column %ld - Query possible scanno %s\n", + linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword); } if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH]) g_free(testword); @@ -1848,10 +1865,11 @@ if (!strcmp(inword,"0") || !strcmp(inword,"1")) { if (pswit[ECHO_SWITCH]) - printf("\n%s\n",aline); + g_print("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) - printf(" Line %ld column %d - Query standalone %s\n", - linecnt,(int)(wordstart-aline)+2,inword); + g_print(" Line %ld column %ld - Query standalone %s\n", + linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2, + inword); else cnt_word++; } @@ -1873,63 +1891,73 @@ void check_for_misspaced_punctuation(const char *aline, struct parities *parities,gboolean isemptyline) { - int i,llen; gboolean isacro,isellipsis; const char *s; - llen=strlen(aline); - for (i=1;i2 && aline[i-2]=='.') + if (g_utf8_pointer_to_offset(aline,s)>2 && + g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.') isacro=TRUE; - if (i+22 && aline[i-2]=='.') + if (g_utf8_pointer_to_offset(aline,s)>2 && + g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.') isellipsis=TRUE; - if (i+2dquote=!parities->dquote; if (!parities->dquote) { /* parity even */ - if (!strchr("_-.'`/,;:!?)]} ",s[1])) + if (!g_utf8_strchr("_-.'`/,;:!?)]} ",-1,nc)) { if (pswit[ECHO_SWITCH]) - printf("\n%s\n",aline); + g_print("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) - printf(" Line %ld column %d - " - "Wrongspaced quotes?\n",linecnt,(int)(s-aline)+1); + g_print(" Line %ld column %ld - " + "Wrongspaced quotes?\n", + linecnt,g_utf8_pointer_to_offset(aline,s)+1); else cnt_punct++; } @@ -2031,28 +2075,30 @@ else { /* parity odd */ - if (!gcisalpha(s[1]) && !isdigit(s[1]) && - !strchr("_-/.'`([{$",s[1]) || !s[1]) + if (!g_unichar_isalpha(nc) && !isdigit(nc) && + !g_utf8_strchr("_-/.'`([{$",-1,nc) || !nc) { if (pswit[ECHO_SWITCH]) - printf("\n%s\n",aline); + g_print("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) - printf(" Line %ld column %d - " - "Wrongspaced quotes?\n",linecnt,(int)(s-aline)+1); + g_print(" Line %ld column %ld - " + "Wrongspaced quotes?\n", + linecnt,g_utf8_pointer_to_offset(aline,s)+1); else cnt_punct++; } } } } - if (*aline==CHAR_DQUOTE) + if (g_utf8_get_char(aline)==CHAR_DQUOTE) { - if (strchr(",;:!?)]} ",aline[1])) + if (g_utf8_strchr(",;:!?)]} ",-1, + g_utf8_get_char(g_utf8_next_char(aline)))) { if (pswit[ECHO_SWITCH]) - printf("\n%s\n",aline); + g_print("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) - printf(" Line %ld column 1 - Wrongspaced quotes?\n", + g_print(" Line %ld column 1 - Wrongspaced quotes?\n", linecnt); else cnt_punct++; @@ -2060,24 +2106,28 @@ } if (pswit[SQUOTE_SWITCH]) { - for (s=aline;*s;s++) + nc=g_utf8_get_char(aline); + for (s=aline;*s;s=g_utf8_next_char(s)) { - if ((*s==CHAR_SQUOTE || *s==CHAR_OPEN_SQUOTE) && - (s==aline || s>aline && !gcisalpha(s[-1]) || - !gcisalpha(s[1]))) + c=nc; + nc=g_utf8_get_char(g_utf8_next_char(s)); + if ((c==CHAR_SQUOTE || c==CHAR_OPEN_SQUOTE) && (s==aline || + s>aline && + !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) || + !g_unichar_isalpha(nc))) { parities->squote=!parities->squote; if (!parities->squote) { /* parity even */ - if (!strchr("_-.'`/\",;:!?)]} ",s[1])) + if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc)) { if (pswit[ECHO_SWITCH]) - printf("\n%s\n",aline); + g_print("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) - printf(" Line %ld column %d - " + g_print(" Line %ld column %ld - " "Wrongspaced singlequotes?\n", - linecnt,(int)(s-aline)+1); + linecnt,g_utf8_pointer_to_offset(aline,s)+1); else cnt_punct++; } @@ -2085,15 +2135,15 @@ else { /* parity odd */ - if (!gcisalpha(s[1]) && !isdigit(s[1]) && - !strchr("_-/\".'`",s[1]) || !s[1]) + if (!g_unichar_isalpha(nc) && !isdigit(nc) && + !g_utf8_strchr("_-/\".'`",-1,nc) || !nc) { if (pswit[ECHO_SWITCH]) - printf("\n%s\n",aline); + g_print("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) - printf(" Line %ld column %d - " + g_print(" Line %ld column %ld - " "Wrongspaced singlequotes?\n", - linecnt,(int)(s-aline)+1); + linecnt,g_utf8_pointer_to_offset(aline,s)+1); else cnt_punct++; } @@ -2117,49 +2167,54 @@ */ void check_for_double_punctuation(const char *aline,struct warnings *warnings) { - int i,llen; - llen=strlen(aline); - for (i=0;idotcomma && aline[i]=='.' && aline[i+1]==',' || - warnings->isFrench && !strncmp(aline+i,",...",4) || - warnings->isFrench && !strncmp(aline+i,"...,",4) || - warnings->isFrench && !strncmp(aline+i,";...",4) || - warnings->isFrench && !strncmp(aline+i,"...;",4) || - warnings->isFrench && !strncmp(aline+i,":...",4) || - warnings->isFrench && !strncmp(aline+i,"...:",4) || - warnings->isFrench && !strncmp(aline+i,"!...",4) || - warnings->isFrench && !strncmp(aline+i,"...!",4) || - warnings->isFrench && !strncmp(aline+i,"?...",4) || - warnings->isFrench && !strncmp(aline+i,"...?",4)) + if (c==nc && (c=='.' || c=='?' || c=='!') || + !warnings->dotcomma && c=='.' && nc==',' || + warnings->isFrench && g_str_has_prefix(s,",...") || + warnings->isFrench && g_str_has_prefix(s,"...,") || + warnings->isFrench && g_str_has_prefix(s,";...") || + warnings->isFrench && g_str_has_prefix(s,"...;") || + warnings->isFrench && g_str_has_prefix(s,":...") || + warnings->isFrench && g_str_has_prefix(s,"...:") || + warnings->isFrench && g_str_has_prefix(s,"!...") || + warnings->isFrench && g_str_has_prefix(s,"...!") || + warnings->isFrench && g_str_has_prefix(s,"?...") || + warnings->isFrench && g_str_has_prefix(s,"...?")) { - if (warnings->isFrench && !strncmp(aline+i,",...",4) || - warnings->isFrench && !strncmp(aline+i,"...,",4) || - warnings->isFrench && !strncmp(aline+i,";...",4) || - warnings->isFrench && !strncmp(aline+i,"...;",4) || - warnings->isFrench && !strncmp(aline+i,":...",4) || - warnings->isFrench && !strncmp(aline+i,"...:",4) || - warnings->isFrench && !strncmp(aline+i,"!...",4) || - warnings->isFrench && !strncmp(aline+i,"...!",4) || - warnings->isFrench && !strncmp(aline+i,"?...",4) || - warnings->isFrench && !strncmp(aline+i,"...?",4)) - i+=4; + if (warnings->isFrench && g_str_has_prefix(s,",...") || + warnings->isFrench && g_str_has_prefix(s,"...,") || + warnings->isFrench && g_str_has_prefix(s,";...") || + warnings->isFrench && g_str_has_prefix(s,"...;") || + warnings->isFrench && g_str_has_prefix(s,":...") || + warnings->isFrench && g_str_has_prefix(s,"...:") || + warnings->isFrench && g_str_has_prefix(s,"!...") || + warnings->isFrench && g_str_has_prefix(s,"...!") || + warnings->isFrench && g_str_has_prefix(s,"?...") || + warnings->isFrench && g_str_has_prefix(s,"...?")) + { + s+=4; + nc=g_utf8_get_char(g_utf8_next_char(s)); + } ; /* do nothing for .. !! and ?? which can be legit */ } else { if (pswit[ECHO_SWITCH]) - printf("\n%s\n",aline); + g_print("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) - printf(" Line %ld column %d - Double punctuation?\n", - linecnt,i+1); + g_print(" Line %ld column %ld - Double punctuation?\n", + linecnt,g_utf8_pointer_to_offset(aline,s)+1); else cnt_punct++; } @@ -2177,37 +2232,37 @@ while ((t=strstr(s," \" "))) { if (pswit[ECHO_SWITCH]) - printf("\n%s\n",aline); + g_print("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) - printf(" Line %ld column %d - Spaced doublequote?\n", - linecnt,(int)(t-aline+1)); + g_print(" Line %ld column %ld - Spaced doublequote?\n", + linecnt,g_utf8_pointer_to_offset(aline,t)+1); else cnt_punct++; - s=t+2; + s=g_utf8_next_char(g_utf8_next_char(t)); } s=aline; while ((t=strstr(s," ' "))) { if (pswit[ECHO_SWITCH]) - printf("\n%s\n",aline); + g_print("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) - printf(" Line %ld column %d - Spaced singlequote?\n", - linecnt,(int)(t-aline+1)); + g_print(" Line %ld column %ld - Spaced singlequote?\n", + linecnt,g_utf8_pointer_to_offset(aline,t)+1); else cnt_punct++; - s=t+2; + s=g_utf8_next_char(g_utf8_next_char(t)); } s=aline; while ((t=strstr(s," ` "))) { if (pswit[ECHO_SWITCH]) - printf("\n%s\n",aline); + g_print("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) - printf(" Line %ld column %d - Spaced singlequote?\n", - linecnt,(int)(t-aline+1)); + g_print(" Line %ld column %ld - Spaced singlequote?\n", + linecnt,g_utf8_pointer_to_offset(aline,t)+1); else cnt_punct++; - s=t+2; + s=g_utf8_next_char(g_utf8_next_char(t)); } } @@ -2219,22 +2274,26 @@ void check_for_miscased_genative(const char *aline) { const char *s; + gunichar c,nc,pc; if (!*aline) return; - s=aline+1; - while (*s) + c=g_utf8_get_char(aline); + nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0; + for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s)) { - if (*s==CHAR_SQUOTE && s[1]=='S' && s[-1]>='a' && s[-1]<='z') + pc=c; + c=nc; + nc=g_utf8_get_char(g_utf8_next_char(s)); + if (c==CHAR_SQUOTE && nc=='S' && g_unichar_islower(pc)) { if (pswit[ECHO_SWITCH]) - printf("\n%s\n",aline); + g_print("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) - printf(" Line %ld column %d - Capital \"S\"?\n", - linecnt,(int)(s-aline+2)); + g_print(" Line %ld column %ld - Capital \"S\"?\n", + linecnt,g_utf8_pointer_to_offset(aline,s)+2); else cnt_punct++; } - s++; } } @@ -2248,29 +2307,34 @@ */ void check_end_of_line(const char *aline,struct warnings *warnings) { - int i,llen; - llen=strlen(aline); - if (llen>1) + int lbytes; + const char *s; + gunichar c1,c2; + lbytes=strlen(aline); + if (g_utf8_strlen(aline,lbytes)>1) { - if (aline[llen-1]==CHAR_DQUOTE || aline[llen-1]==CHAR_SQUOTE || - aline[llen-1]==CHAR_OPEN_SQUOTE) - if (aline[llen-2]==CHAR_SPACE) - { - if (pswit[ECHO_SWITCH]) - printf("\n%s\n",aline); - if (!pswit[OVERVIEW_SWITCH]) - printf(" Line %ld column %d - Spaced quote?\n", - linecnt,llen); - else - cnt_punct++; - } - if ((aline[0]==CHAR_SQUOTE || aline[0]==CHAR_OPEN_SQUOTE) && - aline[1]==CHAR_SPACE) + s=g_utf8_prev_char(aline+lbytes); + c1=g_utf8_get_char(s); + c2=g_utf8_get_char(g_utf8_prev_char(s)); + if ((c1==CHAR_DQUOTE || c1==CHAR_SQUOTE || c1==CHAR_OPEN_SQUOTE) && + c2==CHAR_SPACE) { if (pswit[ECHO_SWITCH]) - printf("\n%s\n",aline); + g_print("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) - printf(" Line %ld column 1 - Spaced quote?\n",linecnt); + g_print(" Line %ld column %ld - Spaced quote?\n",linecnt, + g_utf8_strlen(aline,lbytes)); + else + cnt_punct++; + } + c1=g_utf8_get_char(aline); + c2=g_utf8_get_char(g_utf8_next_char(aline)); + if ((c1==CHAR_SQUOTE || c1==CHAR_OPEN_SQUOTE) && c2==CHAR_SPACE) + { + if (pswit[ECHO_SWITCH]) + g_print("\n%s\n",aline); + if (!pswit[OVERVIEW_SWITCH]) + g_print(" Line %ld column 1 - Spaced quote?\n",linecnt); else cnt_punct++; } @@ -2280,15 +2344,18 @@ */ if (pswit[PARANOID_SWITCH] && warnings->hyphen) { - for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--) + for (s=g_utf8_prev_char(aline+lbytes); + s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s)) ; - if (aline[i]=='-' && aline[i-1]!='-') + if (g_utf8_get_char(s)=='-' && + g_utf8_get_char(g_utf8_prev_char(s))!='-') { if (pswit[ECHO_SWITCH]) - printf("\n%s\n",aline); + g_print("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) - printf(" Line %ld column %d - Hyphen at end of line?\n", - linecnt,i); + g_print(" Line %ld column %ld - " + "Hyphen at end of line?\n", + linecnt,g_utf8_pointer_to_offset(aline,s)); } } } @@ -2302,19 +2369,26 @@ */ void check_for_unspaced_bracket(const char *aline) { - int i,llen; - llen=strlen(aline); - for (i=1;i"); + close=strchr(g_utf8_next_char(open),'>'); if (close) { - i=(int)(close-open+1); - if (i>0) + if (pswit[ECHO_SWITCH]) + g_print("\n%s\n",aline); + if (!pswit[OVERVIEW_SWITCH]) { - if (pswit[ECHO_SWITCH]) - printf("\n%s\n",aline); - if (!pswit[OVERVIEW_SWITCH]) - printf(" Line %ld column %d - HTML Tag? %*.*s \n", - linecnt,(int)(open-aline)+1,i,i,open); - else - cnt_html++; + tag=g_strndup(open,close-open+1); + g_print(" Line %ld column %ld - HTML Tag? %s \n", + linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag); + g_free(tag); } + else + cnt_html++; } } } @@ -2387,25 +2467,28 @@ */ void check_for_html_entity(const char *aline) { - int i; const char *s,*amp,*scolon; - amp=strstr(aline,"&"); + gchar *entity; + amp=strchr(aline,'&'); if (amp) { - scolon=strstr(aline,";"); + scolon=strchr(amp,';'); if (scolon) { - i=(int)(scolon-amp+1); - for (s=amp;s0) + for (s=amp;s=scolon) { if (pswit[ECHO_SWITCH]) - printf("\n%s\n",aline); + g_print("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) - printf(" Line %ld column %d - HTML symbol? %*.*s \n", - linecnt,(int)(amp-aline)+1,i,i,amp); + { + entity=g_strndup(amp,scolon-amp+1); + g_print(" Line %ld column %d - HTML symbol? %s \n", + linecnt,(int)(amp-aline)+1,entity); + g_free(entity); + } else cnt_html++; } @@ -2425,18 +2508,20 @@ struct pending *pending) { const char *s; + gunichar c; s=aline; while (*s==' ') s++; + c=g_utf8_get_char(s); if (pending->dquote) { - if (*s!=CHAR_DQUOTE || pswit[QPARA_SWITCH]) + if (c!=CHAR_DQUOTE || pswit[QPARA_SWITCH]) { if (!pswit[OVERVIEW_SWITCH]) { if (pswit[ECHO_SWITCH]) - printf("\n%s\n",parastart); - puts(pending->dquote); + g_print("\n%s\n",parastart); + g_print("%s\n",pending->dquote); } else cnt_dquot++; @@ -2446,14 +2531,14 @@ } if (pending->squote) { - if (*s!=CHAR_SQUOTE && *s!=CHAR_OPEN_SQUOTE || pswit[QPARA_SWITCH] || + if (c!=CHAR_SQUOTE && c!=CHAR_OPEN_SQUOTE || pswit[QPARA_SWITCH] || pending->squot) { if (!pswit[OVERVIEW_SWITCH]) { if (pswit[ECHO_SWITCH]) - printf("\n%s\n",parastart); - puts(pending->squote); + g_print("\n%s\n",parastart); + g_print("%s\n",pending->squote); } else cnt_squot++; @@ -2466,8 +2551,8 @@ if (!pswit[OVERVIEW_SWITCH]) { if (pswit[ECHO_SWITCH]) - printf("\n%s\n",parastart); - puts(pending->rbrack); + g_print("\n%s\n",parastart); + g_print("%s\n",pending->rbrack); } else cnt_brack++; @@ -2479,8 +2564,8 @@ if (!pswit[OVERVIEW_SWITCH]) { if (pswit[ECHO_SWITCH]) - printf("\n%s\n",parastart); - puts(pending->sbrack); + g_print("\n%s\n",parastart); + g_print("%s\n",pending->sbrack); } else cnt_brack++; @@ -2492,8 +2577,8 @@ if (!pswit[OVERVIEW_SWITCH]) { if (pswit[ECHO_SWITCH]) - printf("\n%s\n",parastart); - puts(pending->cbrack); + g_print("\n%s\n",parastart); + g_print("%s\n",pending->cbrack); } else cnt_brack++; @@ -2505,8 +2590,8 @@ if (!pswit[OVERVIEW_SWITCH]) { if (pswit[ECHO_SWITCH]) - printf("\n%s\n",parastart); - puts(pending->unders); + g_print("\n%s\n",parastart); + g_print("%s\n",pending->unders); } else cnt_brack++; @@ -2577,12 +2662,14 @@ void check_for_omitted_punctuation(const char *prevline, struct line_properties *last,int start_para_line) { - int i; + gboolean letter_on_line=FALSE; const char *s; - for (s=prevline,i=0;*s && !i;s++) - if (gcisletter(*s)) - /* use i to indicate the presence of a letter on the line */ - i=1; + for (s=prevline;*s;s=g_utf8_next_char(s)) + if (g_unichar_isalpha(g_utf8_get_char(s))) + { + letter_on_line=TRUE; + break; + } /* * This next "if" is a problem. * If we say "start_para_line <= linecnt - 1", that includes @@ -2590,28 +2677,30 @@ * If we say "start_para_line < linecnt - 1" it doesn't, but then it * misses genuine one-line paragraphs. */ - if (i && last->blen>2 && start_para_lineCHAR_SPACE) + if (letter_on_line && last->blen>2 && start_para_lineCHAR_SPACE) { - for (i=strlen(prevline)-1; - (prevline[i]==CHAR_DQUOTE || prevline[i]==CHAR_SQUOTE) && - prevline[i]>CHAR_SPACE && i>0; - i--) + for (s=g_utf8_prev_char(prevline+strlen(prevline)); + (g_utf8_get_char(s)==CHAR_DQUOTE || + g_utf8_get_char(s)==CHAR_SQUOTE) && + g_utf8_get_char(s)>CHAR_SPACE && s>prevline; + s=g_utf8_prev_char(s)) ; - for (;i>0;i--) + for (;s>prevline;s=g_utf8_prev_char(s)) { - if (gcisalpha(prevline[i])) + if (g_unichar_isalpha(g_utf8_get_char(s))) { if (pswit[ECHO_SWITCH]) - printf("\n%s\n",prevline); + g_print("\n%s\n",prevline); if (!pswit[OVERVIEW_SWITCH]) - printf(" Line %ld column %d - " + g_print(" Line %ld column %ld - " "No punctuation at para end?\n", - linecnt-1,(int)strlen(prevline)); + linecnt-1,g_utf8_strlen(prevline,-1)); else cnt_punct++; break; } - if (strchr("-.:!([{?}])",prevline[i])) + if (g_utf8_strchr("-.:!([{?}])",-1,g_utf8_get_char(s))) break; } } @@ -2622,11 +2711,38 @@ const char *word=key; int *dupcnt=value; if (*dupcnt) - printf("\nNote: Queried word %s was duplicated %d times\n", + g_print("\nNote: Queried word %s was duplicated %d times\n", word,*dupcnt); return FALSE; } +void print_as_windows_1252(const char *string) +{ + gsize inbytes,outbytes; + gchar *buf,*bp; + GIConv converter=(GIConv)-1; + if (!string) + { + if (converter!=(GIConv)-1) + g_iconv_close(converter); + converter=(GIConv)-1; + return; + } + if (converter=(GIConv)-1) + converter=g_iconv_open("WINDOWS-1252","UTF-8"); + if (converter!=(GIConv)-1) + { + inbytes=outbytes=strlen(string); + bp=buf=g_malloc(outbytes+1); + g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes); + *bp='\0'; + fputs(buf,stdout); + g_free(buf); + } + else + fputs(string,stdout); +} + /* * procfile: * @@ -2659,7 +2775,8 @@ fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message); exit(1); } - fprintf(stdout,"\n\nFile: %s\n\n",filename); + g_set_print_handler(print_as_windows_1252); + g_print("\n\nFile: %s\n\n",filename); first_pass_results=first_pass(etext); warnings=report_first_pass(first_pass_results); qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free); @@ -2674,7 +2791,7 @@ linecnt++; if (linecnt==1) isnewpara=TRUE; - if (pswit[DP_SWITCH] && !strncmp(aline,"-----File: ",11)) + if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: ")) continue; // skip DP page separators completely if (linecntfirstline || (first_pass_results->footerline>0 && @@ -2682,14 +2799,14 @@ { if (pswit[HEADER_SWITCH]) { - if (!strncmp(aline,"Title:",6)) - printf(" %s\n",aline); - if (!strncmp(aline,"Author:",7)) - printf(" %s\n",aline); - if (!strncmp(aline,"Release Date:",13)) - printf(" %s\n",aline); - if (!strncmp(aline,"Edition:",8)) - printf(" %s\n\n",aline); + if (g_str_has_prefix(aline,"Title:")) + g_print(" %s\n",aline); + if (g_str_has_prefix(aline,"Author:")) + g_print(" %s\n",aline); + if (g_str_has_prefix(aline,"Release Date:")) + g_print(" %s\n",aline); + if (g_str_has_prefix(aline,"Edition:")) + g_print(" %s\n\n",aline); } continue; /* skip through the header */ } @@ -2706,36 +2823,38 @@ parastart=g_strdup(aline); memset(&parities,0,sizeof(parities)); /* restart the quote count */ s=aline; - while (!gcisalpha(*s) && !gcisdigit(*s) && *s) - s++; - if (*s>='a' && *s<='z') + while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) && + !g_unichar_isdigit(g_utf8_get_char(s))) + s=g_utf8_next_char(s); + if (g_unichar_islower(g_utf8_get_char(s))) { /* and its first letter is lowercase */ if (pswit[ECHO_SWITCH]) - printf("\n%s\n",aline); + g_print("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) - printf(" Line %ld column %d - " + g_print(" Line %ld column %ld - " "Paragraph starts with lower-case\n", - linecnt,(int)(s-aline)+1); + linecnt,g_utf8_pointer_to_offset(aline,s)+1); else cnt_punct++; } isnewpara=FALSE; /* Signal the end of new para processing. */ } /* Check for an em-dash broken at line end. */ - if (enddash && *aline=='-') + if (enddash && g_utf8_get_char(aline)=='-') { if (pswit[ECHO_SWITCH]) - printf("\n%s\n",aline); + g_print("\n%s\n",aline); if (!pswit[OVERVIEW_SWITCH]) - printf(" Line %ld column 1 - Broken em-dash?\n",linecnt); + g_print(" Line %ld column 1 - Broken em-dash?\n",linecnt); else cnt_punct++; } enddash=FALSE; - for (s=aline+strlen(aline)-1;*s==' ' && s>aline;s--) + for (s=g_utf8_prev_char(aline+strlen(aline)); + g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s)) ; - if (s>=aline && *s=='-') + if (s>=aline && g_utf8_get_char(s)=='-') enddash=TRUE; check_for_control_characters(aline); if (warnings->bin) @@ -2745,8 +2864,8 @@ if (warnings->shortline) check_for_short_line(aline,&last); last.blen=last.len; - last.len=strlen(aline); - last.start=aline[0]; + last.len=g_utf8_strlen(aline,-1); + last.start=g_utf8_get_char(aline); check_for_starting_punctuation(aline); if (warnings->dash) { @@ -2795,6 +2914,8 @@ g_tree_foreach(qword,report_duplicate_queries,NULL); g_tree_unref(qword); g_tree_unref(qperiod); + g_set_print_handler(NULL); + print_as_windows_1252(NULL); } /* @@ -2807,14 +2928,15 @@ */ char *flgets(char **etext,long lcnt) { - char c; - int len; + gunichar c; gboolean isCR=FALSE; char *theline=*etext; - len=0; - for(;;) + char *eos=theline; + gchar *s; + for (;;) { - c=*(*etext)++; + c=g_utf8_get_char(*etext); + *etext=g_utf8_next_char(*etext); if (!c) return NULL; /* either way, it's end of line */ @@ -2828,9 +2950,13 @@ if (pswit[LINE_END_SWITCH]) { if (pswit[ECHO_SWITCH]) - printf("\n%*.*s\n",len,len,theline); + { + s=g_strndup(theline,eos-theline); + g_print("\n%s\n",s); + g_free(s); + } if (!pswit[OVERVIEW_SWITCH]) - printf(" Line %ld - No CR?\n",lcnt); + g_print(" Line %ld - No CR?\n",lcnt); else cnt_lineend++; } @@ -2845,9 +2971,13 @@ if (pswit[LINE_END_SWITCH]) { if (pswit[ECHO_SWITCH]) - printf("\n%*.*s\n",len,len,theline); + { + s=g_strndup(theline,eos-theline); + g_print("\n%s\n",s); + g_free(s); + } if (!pswit[OVERVIEW_SWITCH]) - printf(" Line %ld - Two successive CRs?\n",lcnt); + g_print(" Line %ld - Two successive CRs?\n",lcnt); else cnt_lineend++; } @@ -2859,19 +2989,23 @@ if (pswit[LINE_END_SWITCH] && isCR) { if (pswit[ECHO_SWITCH]) - printf("\n%*.*s\n",len,len,theline); + { + s=g_strndup(theline,eos-theline); + g_print("\n%s\n",s); + g_free(s); + } if (!pswit[OVERVIEW_SWITCH]) - printf(" Line %ld column %d - CR without LF?\n", - lcnt,len+1); + g_print(" Line %ld column %ld - CR without LF?\n", + lcnt,g_utf8_pointer_to_offset(theline,eos)+1); else cnt_lineend++; - theline[len]=' '; + *eos=' '; } isCR=FALSE; - len++; + eos=g_utf8_next_char(eos); } } - theline[len]='\0'; + *eos='\0'; if (pswit[MARKUP_SWITCH]) postprocess_for_HTML(theline); if (pswit[DP_SWITCH]) @@ -2886,55 +3020,55 @@ * contains a mixture of alpha and digits. Generally, this is an * error, but may not be for cases like 4th or L5 12s. 3d. * - * Returns: 0 if no error found, 1 if error. + * Returns: TRUE iff an is error found. */ -int mixdigit(const char *checkword) +gboolean mixdigit(const char *checkword) { - int wehaveadigit,wehavealetter,firstdigits,query,wl; - const char *s; - wehaveadigit=wehavealetter=query=0; - for (s=checkword;*s;s++) - if (gcisalpha(*s)) - wehavealetter=1; - else - if (gcisdigit(*s)) - wehaveadigit=1; + gboolean wehaveadigit,wehavealetter,query; + const char *s,*nondigit; + wehaveadigit=wehavealetter=query=FALSE; + for (s=checkword;*s;s=g_utf8_next_char(s)) + if (g_unichar_isalpha(g_utf8_get_char(s))) + wehavealetter=TRUE; + else if (g_unichar_isdigit(g_utf8_get_char(s))) + wehaveadigit=TRUE; if (wehaveadigit && wehavealetter) { /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */ - query=1; - wl=strlen(checkword); - for (firstdigits=0;gcisdigit(checkword[firstdigits]);firstdigits++) + query=TRUE; + for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit)); + nondigit=g_utf8_next_char(nondigit)) ; /* digits, ending in st, rd, nd, th of either case */ - if (firstdigits+2==wl && (!g_ascii_strcasecmp(checkword+wl-2,"st") || - !g_ascii_strcasecmp(checkword+wl-2,"rd") || - !g_ascii_strcasecmp(checkword+wl-2,"nd") || - !g_ascii_strcasecmp(checkword+wl-2,"th"))) - query=0; - if (firstdigits+3==wl && (!g_ascii_strcasecmp(checkword+wl-3,"sts") || - !g_ascii_strcasecmp(checkword+wl-3,"rds") || - !g_ascii_strcasecmp(checkword+wl-3,"nds") || - !g_ascii_strcasecmp(checkword+wl-3,"ths"))) - query=0; - if (firstdigits+3==wl && (!g_ascii_strcasecmp(checkword+wl-4,"stly") || - !g_ascii_strcasecmp(checkword+wl-4,"rdly") || - !g_ascii_strcasecmp(checkword+wl-4,"ndly") || - !g_ascii_strcasecmp(checkword+wl-4,"thly"))) - query=0; + if (!g_ascii_strcasecmp(nondigit,"st") || + !g_ascii_strcasecmp(nondigit,"rd") || + !g_ascii_strcasecmp(nondigit,"nd") || + !g_ascii_strcasecmp(nondigit,"th")) + query=FALSE; + if (!g_ascii_strcasecmp(nondigit,"sts") || + !g_ascii_strcasecmp(nondigit,"rds") || + !g_ascii_strcasecmp(nondigit,"nds") || + !g_ascii_strcasecmp(nondigit,"ths")) + query=FALSE; + if (!g_ascii_strcasecmp(nondigit,"stly") || + !g_ascii_strcasecmp(nondigit,"rdly") || + !g_ascii_strcasecmp(nondigit,"ndly") || + !g_ascii_strcasecmp(nondigit,"thly")) + query=FALSE; /* digits, ending in l, L, s or d */ - if (firstdigits+1==wl && (checkword[wl-1]=='l' || - checkword[wl-1]=='L' || checkword[wl-1]=='s' || checkword[wl-1]=='d')) - query=0; + if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") || + !strcmp(nondigit,"d")) + query=FALSE; /* * L at the start of a number, representing Britsh pounds, like L500. - * This is cute. We know the current word is mixeddigit. If the first + * This is cute. We know the current word is mixed digit. If the first * letter is L, there must be at least one digit following. If both * digits and letters follow, we have a genuine error, else we have a * capital L followed by digits, and we accept that as a non-error. */ - if (checkword[0]=='L' && !mixdigit(checkword+1)) - query=0; + if (g_utf8_get_char(checkword)=='L' && + !mixdigit(g_utf8_next_char(checkword))) + query=FALSE; } return query; } @@ -2951,11 +3085,13 @@ */ gchar *getaword(const char **ptr) { - int i; - const char *s; + const char *s,*t; GString *word; + gunichar c,pc; word=g_string_new(NULL); - for (;!gcisdigit(**ptr) && !gcisalpha(**ptr) && **ptr;(*ptr)++) + for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) && + !g_unichar_isalpha(g_utf8_get_char(*ptr)) && + **ptr;*ptr=g_utf8_next_char(*ptr)) ; /* * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35. @@ -2966,23 +3102,27 @@ * the results and resume our normal programming. */ s=*ptr; - for (;gcisdigit(*s) || gcisalpha(*s) || *s==',' || *s=='.';s++) - g_string_append_c(word,*s); - for (i=1;i+1len;i++) + for (;g_unichar_isdigit(g_utf8_get_char(s)) || + g_unichar_isalpha(g_utf8_get_char(s)) || + g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s)) + g_string_append_unichar(word,g_utf8_get_char(s)); + for (t=g_utf8_next_char(word->str);*g_utf8_next_char(t); + t=g_utf8_next_char(t)) { - if (word->str[i]=='.' || word->str[i]==',') + c=g_utf8_get_char(t); + pc=g_utf8_get_char(g_utf8_prev_char(t)); + if ((c=='.' || c==',') && g_unichar_isdigit(pc)) { - if (gcisdigit(word->str[i-1]) && gcisdigit(word->str[i-1])) - { - *ptr=s; - return g_string_free(word,FALSE); - } + *ptr=s; + return g_string_free(word,FALSE); } } /* we didn't find a punctuated number - do the regular getword thing */ g_string_truncate(word,0); - for (;gcisdigit(**ptr) || gcisalpha(**ptr) || **ptr=='\'';(*ptr)++) - g_string_append_c(word,**ptr); + for (;g_unichar_isdigit(g_utf8_get_char(*ptr)) || + g_unichar_isalpha(g_utf8_get_char(*ptr)) || + g_utf8_get_char(*ptr)=='\'';*ptr=g_utf8_next_char(*ptr)) + g_string_append_unichar(word,g_utf8_get_char(*ptr)); return g_string_free(word,FALSE); } @@ -3006,82 +3146,36 @@ if (!t || !*t) return FALSE; s=t; - while (*t=='m' && *t) + while (g_utf8_get_char(t)=='m' && *t) t++; - if (*t=='d') + if (g_utf8_get_char(t)=='d') t++; - if (*t=='c' && t[1]=='m') + if (g_str_has_prefix(t,"cm")) t+=2; - if (*t=='c' && t[1]=='d') + if (g_str_has_prefix(t,"cd")) t+=2; - while (*t=='c' && *t) + while (g_utf8_get_char(t)=='c' && *t) t++; - if (*t=='x' && t[1]=='l') + if (g_str_has_prefix(t,"xl")) t+=2; - if (*t=='x' && t[1]=='c') + if (g_str_has_prefix(t,"xc")) t+=2; - if (*t=='l') + if (g_utf8_get_char(t)=='l') t++; - while (*t=='x' && *t) + while (g_utf8_get_char(t)=='x' && *t) t++; - if (*t=='i' && t[1]=='x') + if (g_str_has_prefix(t,"ix")) t+=2; - if (*t=='i' && t[1]=='v') + if (g_str_has_prefix(t,"iv")) t+=2; - if (*t=='v') + if (g_utf8_get_char(t)=='v') t++; - while (*t=='i' && *t) + while (g_utf8_get_char(t)=='i' && *t) t++; return !*t; } /* - * gcisalpha: - * - * A version of isalpha() that is somewhat lenient on 8-bit texts. - * If we use the standard function, 8-bit accented characters break - * words, so that tete with accented characters appears to be two words, "t" - * and "t", with 8-bit characters between them. This causes over-reporting of - * errors. gcisalpha() recognizes accented letters from the CP1252 (Windows) - * and ISO-8859-1 character sets, which are the most common PG 8-bit types. - */ -gboolean gcisalpha(unsigned char c) -{ - if (c>='a' && c<='z') - return TRUE; - if (c>='A' && c<='Z') - return TRUE; - if (c<140) - return FALSE; - if (c>=192 && c!=208 && c!=215 && c!=222 && c!=240 && c!=247 && c!=254) - return TRUE; - if (c==140 || c==142 || c==156 || c==158 || c==159) - return TRUE; - return FALSE; -} - -/* - * gcisdigit: - * - * A version of isdigit() that doesn't get confused in 8-bit texts. - */ -gboolean gcisdigit(unsigned char c) -{ - return c>='0' && c<='9'; -} - -/* - * gcisletter: - * - * A version of isletter() that doesn't get confused in 8-bit texts. - * NB: this is ISO-8891-1-specific. - */ -gboolean gcisletter(unsigned char c) -{ - return c>='A' && c<='Z' || c>='a' && c<='z' || c>=192; -} - -/* * postprocess_for_DP: * * Invoked with the -d switch from flgets(). @@ -3096,21 +3190,11 @@ if (!*theline) return; for (i=0;*DPmarkup[i];i++) - { - s=strstr(theline,DPmarkup[i]); - while (s) + while ((s=strstr(theline,DPmarkup[i]))) { t=s+strlen(DPmarkup[i]); - while (*t) - { - *s=*t; - t++; - s++; - } - *s=0; - s=strstr(theline,DPmarkup[i]); + memmove(s,t,strlen(t)+1); } - } } /* @@ -3124,9 +3208,8 @@ */ void postprocess_for_HTML(char *theline) { - if (strchr(theline,'<') && strchr(theline,'>')) - while (losemarkup(theline)) - ; + while (losemarkup(theline)) + ; while (loseentities(theline)) ; } @@ -3135,25 +3218,16 @@ { char *s,*t; int i; - if (!*theline) - return NULL; - s=strstr(theline,"<"); - t=strstr(theline,">"); + s=strchr(theline,'<'); + t=s?strchr(s,'>'):NULL; if (!s || !t) return NULL; for (i=0;*markup[i];i++) - if (!tagcomp(s+1,markup[i])) + if (tagcomp(g_utf8_next_char(s),markup[i])) { - if (!t[1]) - { - *s=0; - return s; - } - else if (t>s) - { - strcpy(s,t+1); - return s; - } + t=g_utf8_next_char(t); + memmove(s,t,strlen(t)+1); + return s; } /* It's an unrecognized . */ return NULL; @@ -3170,13 +3244,10 @@ s=strstr(theline,entities[i].htmlent); if (s) { - t=malloc((size_t)strlen(s)); - if (!t) - return NULL; - strcpy(t,s+strlen(entities[i].htmlent)); + t=g_strdup(s+strlen(entities[i].htmlent)); strcpy(s,entities[i].textent); strcat(s,t); - free(t); + g_free(t); return theline; } } @@ -3185,34 +3256,29 @@ s=strstr(theline,entities[i].htmlnum); if (s) { - t=malloc((size_t)strlen(s)); - if (!t) - return NULL; - strcpy(t,s+strlen(entities[i].htmlnum)); + t=g_strdup(s+strlen(entities[i].htmlnum)); strcpy(s,entities[i].textent); strcat(s,t); - free(t); + g_free(t); return theline; } } return NULL; } -int tagcomp(const char *strin,const char *basetag) +gboolean tagcomp(const char *strin,const char *basetag) { - const char *s,*t; - s=basetag; - t=strin; - if (*t=='/') - t++; /* ignore a slash */ - while (*s && *t) - { - if (tolower(*s)!=tolower(*t)) - return 1; - s++; - t++; - } - return 0; + gboolean retval; + gchar *s,*t; + if (g_utf8_get_char(strin)=='/') + t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */ + else + t=g_utf8_casefold(strin,-1); + s=g_utf8_casefold(basetag,-1); + retval=g_str_has_prefix(t,s); + g_free(s); + g_free(t); + return retval; } void proghelp(GOptionContext *context)