1.1 --- a/bookloupe/bookloupe.c Tue May 28 15:17:19 2013 +0100
1.2 +++ b/bookloupe/bookloupe.c Thu May 30 07:31:24 2013 +0100
1.3 @@ -119,8 +119,6 @@
1.4 "among", "those", "into", "whom", "having", "thence", ""
1.5 };
1.6
1.7 -char vowels[] = "aeiouàáâãäæèéêëìíîïòóôõöùúûü";
1.8 -
1.9 struct {
1.10 char *htmlent;
1.11 char *htmlnum;
1.12 @@ -347,16 +345,13 @@
1.13
1.14 gchar *running_from;
1.15
1.16 -int mixdigit(const char *);
1.17 +gboolean mixdigit(const char *);
1.18 gchar *getaword(const char **);
1.19 char *flgets(char **,long);
1.20 -gboolean gcisalpha(unsigned char);
1.21 -gboolean gcisdigit(unsigned char);
1.22 -gboolean gcisletter(unsigned char);
1.23 void postprocess_for_HTML(char *);
1.24 char *linehasmarkup(char *);
1.25 char *losemarkup(char *);
1.26 -int tagcomp(const char *,const char *);
1.27 +gboolean tagcomp(const char *,const char *);
1.28 char *loseentities(char *);
1.29 gboolean isroman(const char *);
1.30 void postprocess_for_DP(char *);
1.31 @@ -385,7 +380,7 @@
1.32
1.33 struct line_properties {
1.34 unsigned int len,blen;
1.35 - char start;
1.36 + gunichar start;
1.37 };
1.38
1.39 struct parities {
1.40 @@ -462,8 +457,8 @@
1.41 gchar *usertypo_file;
1.42 gboolean okay;
1.43 int i;
1.44 - gsize len;
1.45 - gchar *contents,**lines;
1.46 + gsize len,nb;
1.47 + gchar *contents,*utf8,**lines;
1.48 usertypo_file=g_strdup("bookloupe.typ");
1.49 okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
1.50 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
1.51 @@ -490,7 +485,7 @@
1.52 if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
1.53 {
1.54 g_free(usertypo_file);
1.55 - printf(" --> I couldn't find bookloupe.typ "
1.56 + g_print(" --> I couldn't find bookloupe.typ "
1.57 "-- proceeding without user typos.\n");
1.58 return;
1.59 }
1.60 @@ -501,7 +496,10 @@
1.61 g_clear_error(&err);
1.62 exit(1);
1.63 }
1.64 - lines=g_strsplit(contents,"\n",0);
1.65 + utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
1.66 + g_free(contents);
1.67 + lines=g_strsplit_set(utf8,"\r\n",0);
1.68 + g_free(utf8);
1.69 usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
1.70 for (i=0;lines[i];i++)
1.71 if (*(unsigned char *)lines[i]>'!')
1.72 @@ -511,49 +509,6 @@
1.73 g_free(lines);
1.74 }
1.75
1.76 -#if 0
1.77 -/*
1.78 - * read_etext:
1.79 - *
1.80 - * Read an etext returning an array of lines. Lines are normally expected
1.81 - * to be terminated by CR LF. Solitary LFs delimit lines but are left
1.82 - * embedded at the end of the line for further processing. Solitary CRs
1.83 - * do not delimit lines.
1.84 - */
1.85 -gchar **read_etext(const char *filename,GError **err)
1.86 -{
1.87 - int i;
1.88 - const char *s,*t;
1.89 - gchar *contents;
1.90 - gchar **raw_lines;
1.91 - GPtrArray *lines;
1.92 - gsize len;
1.93 - if (!g_file_get_contents(filename,&contents,&len,err))
1.94 - return NULL;
1.95 - raw_lines=g_strsplit(contents,"\r\n",0);
1.96 - lines=g_ptr_array_sized_new(g_strv_length(raw_lines)+1);
1.97 - for (i=0;raw_lines[i];i++)
1.98 - {
1.99 - t=strchr(raw_lines[i],'\n');
1.100 - if (t)
1.101 - {
1.102 - s=raw_lines[i];
1.103 - while ((t=strchr(s,'\n')))
1.104 - {
1.105 - g_ptr_array_add(lines,g_strndup(s,t-s+1));
1.106 - s=t+1;
1.107 - }
1.108 - g_ptr_array_add(lines,g_strdup(s));
1.109 - g_free(raw_lines[i]);
1.110 - }
1.111 - else
1.112 - g_ptr_array_add(lines,raw_lines[i]);
1.113 - }
1.114 - g_free(raw_lines);
1.115 - g_ptr_array_add(lines,NULL);
1.116 - return (gchar **)g_ptr_array_free(lines,FALSE);
1.117 -}
1.118 -#else
1.119 /*
1.120 * read_etext:
1.121 *
1.122 @@ -562,13 +517,14 @@
1.123 */
1.124 gchar *read_etext(const char *filename,GError **err)
1.125 {
1.126 - gchar *contents;
1.127 - gsize len;
1.128 + gchar *contents,*utf8;
1.129 + gsize len,nb;
1.130 if (!g_file_get_contents(filename,&contents,&len,err))
1.131 return NULL;
1.132 - return contents;
1.133 + utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,err); /* NULL + *err set on bad input */
1.134 + g_free(contents);
1.135 + return utf8;
1.136 }
1.137 -#endif
1.138
1.139 int main(int argc,char **argv)
1.140 {
1.141 @@ -580,35 +536,35 @@
1.142 procfile(argv[1]);
1.143 if (pswit[OVERVIEW_SWITCH])
1.144 {
1.145 - printf(" Checked %ld lines of %ld (head+foot = %ld)\n\n",
1.146 + g_print(" Checked %ld lines of %ld (head+foot = %ld)\n\n",
1.147 checked_linecnt,linecnt,linecnt-checked_linecnt);
1.148 - printf(" --------------- Queries found --------------\n");
1.149 + g_print(" --------------- Queries found --------------\n");
1.150 if (cnt_long)
1.151 - printf(" Long lines: %14ld\n",cnt_long);
1.152 + g_print(" Long lines: %14ld\n",cnt_long);
1.153 if (cnt_short)
1.154 - printf(" Short lines: %14ld\n",cnt_short);
1.155 + g_print(" Short lines: %14ld\n",cnt_short);
1.156 if (cnt_lineend)
1.157 - printf(" Line-end problems: %14ld\n",cnt_lineend);
1.158 + g_print(" Line-end problems: %14ld\n",cnt_lineend);
1.159 if (cnt_word)
1.160 - printf(" Common typos: %14ld\n",cnt_word);
1.161 + g_print(" Common typos: %14ld\n",cnt_word);
1.162 if (cnt_dquot)
1.163 - printf(" Unmatched quotes: %14ld\n",cnt_dquot);
1.164 + g_print(" Unmatched quotes: %14ld\n",cnt_dquot);
1.165 if (cnt_squot)
1.166 - printf(" Unmatched SingleQuotes: %14ld\n",cnt_squot);
1.167 + g_print(" Unmatched SingleQuotes: %14ld\n",cnt_squot);
1.168 if (cnt_brack)
1.169 - printf(" Unmatched brackets: %14ld\n",cnt_brack);
1.170 + g_print(" Unmatched brackets: %14ld\n",cnt_brack);
1.171 if (cnt_bin)
1.172 - printf(" Non-ASCII characters: %14ld\n",cnt_bin);
1.173 + g_print(" Non-ASCII characters: %14ld\n",cnt_bin);
1.174 if (cnt_odd)
1.175 - printf(" Proofing characters: %14ld\n",cnt_odd);
1.176 + g_print(" Proofing characters: %14ld\n",cnt_odd);
1.177 if (cnt_punct)
1.178 - printf(" Punctuation & spacing queries: %14ld\n",cnt_punct);
1.179 + g_print(" Punctuation & spacing queries: %14ld\n",cnt_punct);
1.180 if (cnt_dash)
1.181 - printf(" Non-standard dashes: %14ld\n",cnt_dash);
1.182 + g_print(" Non-standard dashes: %14ld\n",cnt_dash);
1.183 if (cnt_html)
1.184 - printf(" Possible HTML tags: %14ld\n",cnt_html);
1.185 - printf("\n");
1.186 - printf(" TOTAL QUERIES %14ld\n",
1.187 + g_print(" Possible HTML tags: %14ld\n",cnt_html);
1.188 + g_print("\n");
1.189 + g_print(" TOTAL QUERIES %14ld\n",
1.190 cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+
1.191 cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend);
1.192 }
1.193 @@ -628,10 +584,10 @@
1.194 */
1.195 struct first_pass_results *first_pass(const char *etext)
1.196 {
1.197 - char laststart=CHAR_SPACE;
1.198 + gunichar laststart=CHAR_SPACE;
1.199 const char *s;
1.200 gchar *lc_line;
1.201 - int i,j,llen;
1.202 + int i,j,lbytes,llen;
1.203 gchar **lines;
1.204 unsigned int lastlen=0,lastblen=0;
1.205 long spline=0,nspline=0;
1.206 @@ -640,27 +596,28 @@
1.207 lines=g_strsplit(etext,"\n",0);
1.208 for (j=0;lines[j];j++)
1.209 {
1.210 - llen=strlen(lines[j]);
1.211 - while(lines[j][llen-1]=='\r')
1.212 - lines[j][llen--]='\0';
1.213 + lbytes=strlen(lines[j]);
1.214 + while (lbytes>0 && lines[j][lbytes-1]=='\r') /* never index before an empty line */
1.215 + lines[j][--lbytes]='\0';
1.216 + llen=g_utf8_strlen(lines[j],lbytes);
1.217 linecnt++;
1.218 if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
1.219 (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
1.220 {
1.221 if (spline)
1.222 - printf(" --> Duplicate header?\n");
1.223 + g_print(" --> Duplicate header?\n");
1.224 spline=linecnt+1; /* first line of non-header text, that is */
1.225 }
1.226 if (!strncmp(lines[j],"*** START",9) &&
1.227 strstr(lines[j],"PROJECT GUTENBERG"))
1.228 {
1.229 if (nspline)
1.230 - printf(" --> Duplicate header?\n");
1.231 + g_print(" --> Duplicate header?\n");
1.232 nspline=linecnt+1; /* first line of non-header text, that is */
1.233 }
1.234 if (spline || nspline)
1.235 {
1.236 - lc_line=g_ascii_strdown(lines[j],llen);
1.237 + lc_line=g_utf8_strdown(lines[j],lbytes);
1.238 if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
1.239 {
1.240 if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
1.241 @@ -669,7 +626,7 @@
1.242 {
1.243 /* it's an old-form header - we can detect duplicates */
1.244 if (!nspline)
1.245 - printf(" --> Duplicate footer?\n");
1.246 + g_print(" --> Duplicate footer?\n");
1.247 }
1.248 else
1.249 results.footerline=linecnt;
1.250 @@ -684,19 +641,21 @@
1.251 if (results.footerline)
1.252 continue; /* don't count the boilerplate in the footer */
1.253 results.totlen+=llen;
1.254 - for (i=0;i<llen;i++)
1.255 + for (s=lines[j];*s;s=g_utf8_next_char(s))
1.256 {
1.257 - if ((unsigned char)lines[j][i]>127)
1.258 + if (g_utf8_get_char(s)>127)
1.259 results.binlen++;
1.260 - if (gcisalpha(lines[j][i]))
1.261 + if (g_unichar_isalpha(g_utf8_get_char(s)))
1.262 results.alphalen++;
1.263 - if (i>0 && lines[j][i]==CHAR_DQUOTE && isalpha(lines[j][i-1]))
1.264 + if (s>lines[j] && g_utf8_get_char(s)==CHAR_DQUOTE &&
1.265 + g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s)))) /* isalpha() is undefined for code points >255 */
1.266 results.endquote_count++;
1.267 }
1.268 if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
1.269 lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
1.270 results.shortline++;
1.271 - if (llen>0 && (unsigned char)lines[j][llen-1]<=CHAR_SPACE)
1.272 + if (lbytes>0 &&
1.273 + g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
1.274 cnt_spacend++;
1.275 if (strstr(lines[j],".,"))
1.276 results.dotcomma++;
1.277 @@ -704,17 +663,19 @@
1.278 /* locase text on the line */
1.279 if (strchr(lines[j],'*'))
1.280 {
1.281 - for (s=lines[j];*s;s++)
1.282 - if (*s>='a' && *s<='z')
1.283 + for (s=lines[j];*s;s=g_utf8_next_char(s))
1.284 + if (g_unichar_islower(g_utf8_get_char(s)))
1.285 break;
1.286 - if (*s)
1.287 + if (*s)
1.288 results.astline++;
1.289 }
1.290 if (strchr(lines[j],'/'))
1.291 results.fslashline++;
1.292 - for (i=llen-1;i>0 && (unsigned char)lines[j][i]<=CHAR_SPACE;i--)
1.293 + for (s=lbytes>0?g_utf8_prev_char(lines[j]+lbytes):lines[j]; /* empty line: don't step before the buffer */
1.294 + s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
1.295 ;
1.296 - if (i>1 && lines[j][i]=='-' && lines[j][i-1]!='-')
1.297 + if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
1.298 + g_utf8_get_char(g_utf8_prev_char(s))!='-')
1.299 results.hyphens++;
1.300 if (llen>LONGEST_PG_LINE)
1.301 results.longline++;
1.302 @@ -729,15 +690,15 @@
1.303 results.htmcount+=4; /* bonus marks! */
1.304 }
1.305 /* Check for spaced em-dashes */
1.306 - if (lines[j][0] && (s=strstr(lines[j]+1,"--")))
1.307 + if (lines[j][0] && (s=strstr(g_utf8_next_char(lines[j]),"--")))
1.308 {
1.309 results.emdash++;
1.310 - if (s[-1]==CHAR_SPACE || (s[2]==CHAR_SPACE))
1.311 + if (s[-1]==CHAR_SPACE || s[2]==CHAR_SPACE)
1.312 results.space_emdash++;
1.313 - if (s[-1]==CHAR_SPACE && (s[2]==CHAR_SPACE))
1.314 + if (s[-1]==CHAR_SPACE && s[2]==CHAR_SPACE)
1.315 /* count of em-dashes with spaces both sides */
1.316 results.non_PG_space_emdash++;
1.317 - if (s[-1]!=CHAR_SPACE && (s[2]!=CHAR_SPACE))
1.318 + if (s[-1]!=CHAR_SPACE && s[2]!=CHAR_SPACE)
1.319 /* count of PG-type em-dashes with no spaces */
1.320 results.PG_space_emdash++;
1.321 }
1.322 @@ -772,13 +733,13 @@
1.323 {
1.324 static struct warnings warnings={0};
1.325 if (cnt_spacend>0)
1.326 - printf(" --> %ld lines in this file have white space at end\n",
1.327 + g_print(" --> %ld lines in this file have white space at end\n",
1.328 cnt_spacend);
1.329 warnings.dotcomma=1;
1.330 if (results->dotcomma>5)
1.331 {
1.332 warnings.dotcomma=0;
1.333 - printf(" --> %ld lines in this file contain '.,'. "
1.334 + g_print(" --> %ld lines in this file contain '.,'. "
1.335 "Not reporting them.\n",results->dotcomma);
1.336 }
1.337 /*
1.338 @@ -789,7 +750,7 @@
1.339 if (results->shortline>50 || results->shortline*10>linecnt)
1.340 {
1.341 warnings.shortline=0;
1.342 - printf(" --> %ld lines in this file are short. "
1.343 + g_print(" --> %ld lines in this file are short. "
1.344 "Not reporting short lines.\n",results->shortline);
1.345 }
1.346 /*
1.347 @@ -800,7 +761,7 @@
1.348 if (results->longline>50 || results->longline*10>linecnt)
1.349 {
1.350 warnings.longline=0;
1.351 - printf(" --> %ld lines in this file are long. "
1.352 + g_print(" --> %ld lines in this file are long. "
1.353 "Not reporting long lines.\n",results->longline);
1.354 }
1.355 /* If more than 10 lines contain asterisks, don't bother reporting them. */
1.356 @@ -808,7 +769,7 @@
1.357 if (results->astline>10)
1.358 {
1.359 warnings.ast=0;
1.360 - printf(" --> %ld lines in this file contain asterisks. "
1.361 + g_print(" --> %ld lines in this file contain asterisks. "
1.362 "Not reporting them.\n",results->astline);
1.363 }
1.364 /*
1.365 @@ -819,7 +780,7 @@
1.366 if (results->fslashline>10)
1.367 {
1.368 warnings.fslash=0;
1.369 - printf(" --> %ld lines in this file contain forward slashes. "
1.370 + g_print(" --> %ld lines in this file contain forward slashes. "
1.371 "Not reporting them.\n",results->fslashline);
1.372 }
1.373 /*
1.374 @@ -830,7 +791,7 @@
1.375 if (results->endquote_count>20)
1.376 {
1.377 warnings.endquote=0;
1.378 - printf(" --> %ld lines in this file contain unpunctuated endquotes. "
1.379 + g_print(" --> %ld lines in this file contain unpunctuated endquotes. "
1.380 "Not reporting them.\n",results->endquote_count);
1.381 }
1.382 /*
1.383 @@ -841,7 +802,7 @@
1.384 if (results->standalone_digit>10)
1.385 {
1.386 warnings.digit=0;
1.387 - printf(" --> %ld lines in this file contain standalone 0s and 1s. "
1.388 + g_print(" --> %ld lines in this file contain standalone 0s and 1s. "
1.389 "Not reporting them.\n",results->standalone_digit);
1.390 }
1.391 /*
1.392 @@ -852,16 +813,16 @@
1.393 if (results->hyphens>20)
1.394 {
1.395 warnings.hyphen=0;
1.396 - printf(" --> %ld lines in this file have hyphens at end. "
1.397 + g_print(" --> %ld lines in this file have hyphens at end. "
1.398 "Not reporting them.\n",results->hyphens);
1.399 }
1.400 if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
1.401 {
1.402 - printf(" --> Looks like this is HTML. Switching HTML mode ON.\n");
1.403 + g_print(" --> Looks like this is HTML. Switching HTML mode ON.\n");
1.404 pswit[MARKUP_SWITCH]=1;
1.405 }
1.406 if (results->verylongline>0)
1.407 - printf(" --> %ld lines in this file are VERY long!\n",
1.408 + g_print(" --> %ld lines in this file are VERY long!\n",
1.409 results->verylongline);
1.410 /*
1.411 * If there are more non-PG spaced dashes than PG em-dashes,
1.412 @@ -874,7 +835,7 @@
1.413 results->PG_space_emdash)
1.414 {
1.415 warnings.dash=0;
1.416 - printf(" --> There are %ld spaced dashes and em-dashes. "
1.417 + g_print(" --> There are %ld spaced dashes and em-dashes. "
1.418 "Not reporting them.\n",
1.419 results->spacedash+results->non_PG_space_emdash);
1.420 }
1.421 @@ -882,19 +843,19 @@
1.422 warnings.bin=1;
1.423 if (results->binlen*4>results->totlen)
1.424 {
1.425 - printf(" --> This file does not appear to be ASCII. "
1.426 + g_print(" --> This file does not appear to be ASCII. "
1.427 "Terminating. Best of luck with it!\n");
1.428 exit(1);
1.429 }
1.430 if (results->alphalen*4<results->totlen)
1.431 {
1.432 - printf(" --> This file does not appear to be text. "
1.433 + g_print(" --> This file does not appear to be text. "
1.434 "Terminating. Best of luck with it!\n");
1.435 exit(1);
1.436 }
1.437 if (results->binlen*100>results->totlen || results->binlen>100)
1.438 {
1.439 - printf(" --> There are a lot of foreign letters here. "
1.440 + g_print(" --> There are a lot of foreign letters here. "
1.441 "Not reporting them.\n");
1.442 warnings.bin=0;
1.443 }
1.444 @@ -902,26 +863,26 @@
1.445 if (results->Dutchcount>50)
1.446 {
1.447 warnings.isDutch=TRUE;
1.448 - printf(" --> This looks like Dutch - "
1.449 + g_print(" --> This looks like Dutch - "
1.450 "switching off dashes and warnings for 's Middags case.\n");
1.451 }
1.452 warnings.isFrench=FALSE;
1.453 if (results->Frenchcount>50)
1.454 {
1.455 warnings.isFrench=TRUE;
1.456 - printf(" --> This looks like French - "
1.457 + g_print(" --> This looks like French - "
1.458 "switching off some doublepunct.\n");
1.459 }
1.460 if (results->firstline && results->footerline)
1.461 - printf(" The PG header and footer appear to be already on.\n");
1.462 + g_print(" The PG header and footer appear to be already on.\n");
1.463 else
1.464 {
1.465 if (results->firstline)
1.466 - printf(" The PG header is on - no footer.\n");
1.467 + g_print(" The PG header is on - no footer.\n");
1.468 if (results->footerline)
1.469 - printf(" The PG footer is on - no header.\n");
1.470 + g_print(" The PG footer is on - no header.\n");
1.471 }
1.472 - printf("\n");
1.473 + g_print("\n");
1.474 if (pswit[VERBOSE_SWITCH])
1.475 {
1.476 warnings.bin=1;
1.477 @@ -934,7 +895,7 @@
1.478 warnings.fslash=1;
1.479 warnings.hyphen=1;
1.480 warnings.endquote=1;
1.481 - printf(" *** Verbose output is ON -- you asked for it! ***\n");
1.482 + g_print(" *** Verbose output is ON -- you asked for it! ***\n");
1.483 }
1.484 if (warnings.isDutch)
1.485 warnings.dash=0;
1.486 @@ -942,9 +903,9 @@
1.487 results->footerline>results->firstline &&
1.488 results->footerline-results->firstline<100)
1.489 {
1.490 - printf(" --> I don't really know where this text starts. \n");
1.491 - printf(" There are no reference points.\n");
1.492 - printf(" I'm going to have to report the header and footer "
1.493 + g_print(" --> I don't really know where this text starts. \n");
1.494 + g_print(" There are no reference points.\n");
1.495 + g_print(" I'm going to have to report the header and footer "
1.496 "as well.\n");
1.497 results->firstline=0;
1.498 }
1.499 @@ -968,12 +929,16 @@
1.500 int guessquote=0;
1.501 /* assume the line is empty until proven otherwise */
1.502 gboolean isemptyline=TRUE;
1.503 - const char *s=aline;
1.504 + const char *s=aline,*sprev,*snext;
1.505 + gunichar c;
1.506 + sprev=NULL;
1.507 while (*s)
1.508 {
1.509 - if (*s==CHAR_DQUOTE)
1.510 + snext=g_utf8_next_char(s);
1.511 + c=g_utf8_get_char(s);
1.512 + if (c==CHAR_DQUOTE)
1.513 counters->quot++;
1.514 - if (*s==CHAR_SQUOTE || *s==CHAR_OPEN_SQUOTE)
1.515 + if (c==CHAR_SQUOTE || c==CHAR_OPEN_SQUOTE)
1.516 {
1.517 if (s==aline)
1.518 {
1.519 @@ -981,17 +946,21 @@
1.520 * At start of line, it can only be an openquote.
1.521 * Hardcode a very common exception!
1.522 */
1.523 - if (strncmp(s+2,"tis",3) && strncmp(s+2,"Tis",3))
1.524 + if (!g_str_has_prefix(snext,"tis") &&
1.525 + !g_str_has_prefix(snext,"Tis"))
1.526 counters->open_single_quote++;
1.527 }
1.528 - else if (gcisalpha(s[-1]) && gcisalpha(s[1]))
1.529 + else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
1.530 + g_unichar_isalpha(g_utf8_get_char(snext)))
1.531 /* Do nothing! it's definitely an apostrophe, not a quote */
1.532 ;
1.533 /* it's outside a word - let's check it out */
1.534 - else if (*s==CHAR_OPEN_SQUOTE || gcisalpha(s[1]))
1.535 + else if (c==CHAR_OPEN_SQUOTE ||
1.536 + g_unichar_isalpha(g_utf8_get_char(snext)))
1.537 {
1.538 /* it damwell better BE an openquote */
1.539 - if (strncmp(s+1,"tis",3) && strncmp(s+1,"Tis",3))
1.540 + if (!g_str_has_prefix(snext,"tis") &&
1.541 + !g_str_has_prefix(snext,"Tis"))
1.542 /* hardcode a very common exception! */
1.543 counters->open_single_quote++;
1.544 }
1.545 @@ -999,20 +968,22 @@
1.546 {
1.547 /* now - is it a closequote? */
1.548 guessquote=0; /* accumulate clues */
1.549 - if (gcisalpha(s[-1]))
1.550 + if (g_unichar_isalpha(g_utf8_get_char(sprev)))
1.551 {
1.552 /* it follows a letter - could be either */
1.553 guessquote++;
1.554 - if (s[-1]=='s')
1.555 + if (g_utf8_get_char(sprev)=='s')
1.556 {
1.557 /* looks like a plural apostrophe */
1.558 guessquote-=3;
1.559 - if (s[1]==CHAR_SPACE) /* bonus marks! */
1.560 + if (g_utf8_get_char(snext)==CHAR_SPACE)
1.561 + /* bonus marks! */
1.562 guessquote-=2;
1.563 }
1.564 }
1.565 /* it doesn't have a letter either side */
1.566 - else if (strchr(".?!,;:",s[-1]) && strchr(".?!,;: ",s[1]))
1.567 + else if (g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(sprev)) &&
1.568 + g_utf8_strchr(".?!,;: ",-1,g_utf8_get_char(snext))) /* strchr() would truncate the code point to a char */
1.569 guessquote+=8; /* looks like a closequote */
1.570 else
1.571 guessquote++;
1.572 @@ -1028,24 +999,25 @@
1.573 counters->close_single_quote++;
1.574 }
1.575 }
1.576 - if (*s!=CHAR_SPACE && *s!='-' && *s!='.' && *s!=CHAR_ASTERISK &&
1.577 - *s!=13 && *s!=10)
1.578 + if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
1.579 + c!='\r' && c!='\n')
1.580 isemptyline=FALSE; /* ignore lines like * * * as spacers */
1.581 - if (*s==CHAR_UNDERSCORE)
1.582 + if (c==CHAR_UNDERSCORE)
1.583 counters->c_unders++;
1.584 - if (*s==CHAR_OPEN_CBRACK)
1.585 + if (c==CHAR_OPEN_CBRACK)
1.586 counters->c_brack++;
1.587 - if (*s==CHAR_CLOSE_CBRACK)
1.588 + if (c==CHAR_CLOSE_CBRACK)
1.589 counters->c_brack--;
1.590 - if (*s==CHAR_OPEN_RBRACK)
1.591 + if (c==CHAR_OPEN_RBRACK)
1.592 counters->r_brack++;
1.593 - if (*s==CHAR_CLOSE_RBRACK)
1.594 + if (c==CHAR_CLOSE_RBRACK)
1.595 counters->r_brack--;
1.596 - if (*s==CHAR_OPEN_SBRACK)
1.597 + if (c==CHAR_OPEN_SBRACK)
1.598 counters->s_brack++;
1.599 - if (*s==CHAR_CLOSE_SBRACK)
1.600 + if (c==CHAR_CLOSE_SBRACK)
1.601 counters->s_brack--;
1.602 - s++;
1.603 + sprev=s;
1.604 + s=snext;
1.605 }
1.606 return isemptyline;
1.607 }
1.608 @@ -1060,18 +1032,18 @@
1.609 */
1.610 void check_for_control_characters(const char *aline)
1.611 {
1.612 - unsigned char c;
1.613 + gunichar c;
1.614 const char *s;
1.615 - for (s=aline;*s;s++)
1.616 + for (s=aline;*s;s=g_utf8_next_char(s))
1.617 {
1.618 - c=*(unsigned char *)s;
1.619 + c=g_utf8_get_char(s);
1.620 if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
1.621 {
1.622 if (pswit[ECHO_SWITCH])
1.623 - printf("\n%s\n",aline);
1.624 + g_print("\n%s\n",aline);
1.625 if (!pswit[OVERVIEW_SWITCH])
1.626 - printf(" Line %ld column %d - Control character %d\n",
1.627 - linecnt,(int)(s-aline)+1,c);
1.628 + g_print(" Line %ld column %ld - Control character %u\n",
1.629 + linecnt,g_utf8_pointer_to_offset(aline,s)+1,c); /* (start,pos) order, as in the other checks */
1.630 else
1.631 cnt_bin++;
1.632 }
1.633 @@ -1087,90 +1059,93 @@
1.634 gboolean isemptyline)
1.635 {
1.636 /* Don't repeat multiple warnings on one line. */
1.637 - int eNon_A=0,eTab=0,eTilde=0,eCarat=0,eFSlash=0,eAst=0;
1.638 + gboolean eNon_A=FALSE,eTab=FALSE,eTilde=FALSE;
1.639 + gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
1.640 const char *s;
1.641 - unsigned char c;
1.642 - for (s=aline;*s;s++)
1.643 + gunichar c;
1.644 + for (s=aline;*s;s=g_utf8_next_char(s))
1.645 {
1.646 - c=*(unsigned char *)s;
1.647 - if (!eNon_A && (*s<CHAR_SPACE && *s!=9 && *s!='\n' || c>127))
1.648 + c=g_utf8_get_char(s);
1.649 + if (!eNon_A && (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
1.650 {
1.651 if (pswit[ECHO_SWITCH])
1.652 - printf("\n%s\n",aline);
1.653 + g_print("\n%s\n",aline);
1.654 if (!pswit[OVERVIEW_SWITCH])
1.655 - if (c>127 && c<160)
1.656 - printf(" Line %ld column %d - "
1.657 - "Non-ISO-8859 character %d\n",linecnt,(int)(s-aline)+1,c);
1.658 + if (c>127 && c<160 || c>255)
1.659 + g_print(" Line %ld column %ld - "
1.660 + "Non-ISO-8859 character %u\n",
1.661 + linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1.662 else
1.663 - printf(" Line %ld column %d - Non-ASCII character %d\n",
1.664 - linecnt,(int)(s-aline)+1,c);
1.665 + g_print(" Line %ld column %ld - "
1.666 + "Non-ASCII character %u\n",
1.667 + linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1.668 else
1.669 cnt_bin++;
1.670 - eNon_A=1;
1.671 + eNon_A=TRUE;
1.672 }
1.673 - if (!eTab && *s==CHAR_TAB)
1.674 + if (!eTab && c==CHAR_TAB)
1.675 {
1.676 if (pswit[ECHO_SWITCH])
1.677 - printf("\n%s\n",aline);
1.678 + g_print("\n%s\n",aline);
1.679 if (!pswit[OVERVIEW_SWITCH])
1.680 - printf(" Line %ld column %d - Tab character?\n",
1.681 - linecnt,(int)(s-aline)+1);
1.682 + g_print(" Line %ld column %ld - Tab character?\n",
1.683 + linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1.684 else
1.685 cnt_odd++;
1.686 - eTab=1;
1.687 + eTab=TRUE;
1.688 }
1.689 - if (!eTilde && *s==CHAR_TILDE)
1.690 + if (!eTilde && c==CHAR_TILDE)
1.691 {
1.692 /*
1.693 * Often used by OCR software to indicate an
1.694 * unrecognizable character.
1.695 */
1.696 if (pswit[ECHO_SWITCH])
1.697 - printf("\n%s\n",aline);
1.698 + g_print("\n%s\n",aline);
1.699 if (!pswit[OVERVIEW_SWITCH])
1.700 - printf(" Line %ld column %d - Tilde character?\n",
1.701 - linecnt,(int)(s-aline)+1);
1.702 + g_print(" Line %ld column %ld - Tilde character?\n",
1.703 + linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1.704 else
1.705 cnt_odd++;
1.706 - eTilde=1;
1.707 + eTilde=TRUE;
1.708 }
1.709 - if (!eCarat && *s==CHAR_CARAT)
1.710 + if (!eCarat && c==CHAR_CARAT)
1.711 {
1.712 if (pswit[ECHO_SWITCH])
1.713 - printf("\n%s\n",aline);
1.714 + g_print("\n%s\n",aline);
1.715 if (!pswit[OVERVIEW_SWITCH])
1.716 - printf(" Line %ld column %d - Carat character?\n",
1.717 - linecnt,(int)(s-aline)+1);
1.718 + g_print(" Line %ld column %ld - Carat character?\n",
1.719 + linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1.720 else
1.721 cnt_odd++;
1.722 - eCarat=1;
1.723 + eCarat=TRUE;
1.724 }
1.725 - if (!eFSlash && *s==CHAR_FORESLASH && warnings->fslash)
1.726 + if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
1.727 {
1.728 if (pswit[ECHO_SWITCH])
1.729 - printf("\n%s\n",aline);
1.730 + g_print("\n%s\n",aline);
1.731 if (!pswit[OVERVIEW_SWITCH])
1.732 - printf(" Line %ld column %d - Forward slash?\n",
1.733 - linecnt,(int)(s-aline)+1);
1.734 + g_print(" Line %ld column %ld - Forward slash?\n",
1.735 + linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1.736 else
1.737 cnt_odd++;
1.738 - eFSlash=1;
1.739 + eFSlash=TRUE;
1.740 }
1.741 /*
1.742 * Report asterisks only in paranoid mode,
1.743 * since they're often deliberate.
1.744 */
1.745 if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
1.746 - *s==CHAR_ASTERISK)
1.747 + c==CHAR_ASTERISK)
1.748 {
1.749 if (pswit[ECHO_SWITCH])
1.750 - printf("\n%s\n",aline);
1.751 + g_print("\n%s\n",aline);
1.752 if (!pswit[OVERVIEW_SWITCH])
1.753 - printf(" Line %ld column %d - Asterisk?\n",
1.754 - linecnt,(int)(s-aline)+1);
1.755 + g_print(" Line %ld column %ld - Asterisk?\n",
1.756 + linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1.757 else
1.758 cnt_odd++;
1.759 - eAst=1;
1.760 + eAst=TRUE;
1.761 }
1.762 }
1.763 }
1.764 @@ -1182,13 +1157,13 @@
1.765 */
1.766 void check_for_long_line(const char *aline)
1.767 {
1.768 - if (strlen(aline)>LONGEST_PG_LINE)
1.769 + if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
1.770 {
1.771 if (pswit[ECHO_SWITCH])
1.772 - printf("\n%s\n",aline);
1.773 + g_print("\n%s\n",aline);
1.774 if (!pswit[OVERVIEW_SWITCH])
1.775 - printf(" Line %ld column %d - Long line %d\n",
1.776 - linecnt,(int)strlen(aline),(int)strlen(aline));
1.777 + g_print(" Line %ld column %ld - Long line %ld\n",
1.778 + linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
1.779 else
1.780 cnt_long++;
1.781 }
1.782 @@ -1220,14 +1195,15 @@
1.783 */
1.784 void check_for_short_line(const char *aline,const struct line_properties *last)
1.785 {
1.786 - if (strlen(aline)>1 && last->len>1 && last->len<SHORTEST_PG_LINE &&
1.787 - last->blen>1 && last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
1.788 + if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
1.789 + last->len<SHORTEST_PG_LINE && last->blen>1 &&
1.790 + last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
1.791 {
1.792 if (pswit[ECHO_SWITCH])
1.793 - printf("\n%s\n",prevline);
1.794 + g_print("\n%s\n",prevline);
1.795 if (!pswit[OVERVIEW_SWITCH])
1.796 - printf(" Line %ld column %d - Short line %d?\n",
1.797 - linecnt-1,(int)strlen(prevline),(int)strlen(prevline));
1.798 + g_print(" Line %ld column %ld - Short line %ld?\n",
1.799 + linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
1.800 else
1.801 cnt_short++;
1.802 }
1.803 @@ -1240,12 +1216,13 @@
1.804 */
1.805 void check_for_starting_punctuation(const char *aline)
1.806 {
1.807 - if (*aline && strchr(".?!,;:",aline[0]) && strncmp(". . .",aline,5))
1.808 + if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
1.809 + !g_str_has_prefix(aline,". . ."))
1.810 {
1.811 if (pswit[ECHO_SWITCH])
1.812 - printf("\n%s\n",aline);
1.813 + g_print("\n%s\n",aline);
1.814 if (!pswit[OVERVIEW_SWITCH])
1.815 - printf(" Line %ld column 1 - Begins with punctuation?\n",
1.816 + g_print(" Line %ld column 1 - Begins with punctuation?\n",
1.817 linecnt);
1.818 else
1.819 cnt_punct++;
1.820 @@ -1263,21 +1240,21 @@
1.821 */
1.822 void check_for_spaced_emdash(const char *aline)
1.823 {
1.824 - const char *s,*t;
1.825 - s=aline;
1.826 - while ((t=strstr(s,"--")))
1.827 + const char *s,*t,*next;
1.828 + for (s=aline;t=strstr(s,"--");s=next)
1.829 {
1.830 - if (t>aline && t[-1]==CHAR_SPACE || t[2]==CHAR_SPACE)
1.831 + next=g_utf8_next_char(g_utf8_next_char(t));
1.832 + if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
1.833 + g_utf8_get_char(next)==CHAR_SPACE)
1.834 {
1.835 if (pswit[ECHO_SWITCH])
1.836 - printf("\n%s\n",aline);
1.837 + g_print("\n%s\n",aline);
1.838 if (!pswit[OVERVIEW_SWITCH])
1.839 - printf(" Line %ld column %d - Spaced em-dash?\n",
1.840 - linecnt,(int)(t-aline)+1);
1.841 + g_print(" Line %ld column %ld - Spaced em-dash?\n",
1.842 + linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1.843 else
1.844 cnt_dash++;
1.845 }
1.846 - s=t+2;
1.847 }
1.848 }
1.849
1.850 @@ -1291,26 +1268,26 @@
1.851 const char *s;
1.852 if ((s=strstr(aline," -")))
1.853 {
1.854 - if (s[2]!='-')
1.855 + if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
1.856 {
1.857 if (pswit[ECHO_SWITCH])
1.858 - printf("\n%s\n",aline);
1.859 + g_print("\n%s\n",aline);
1.860 if (!pswit[OVERVIEW_SWITCH])
1.861 - printf(" Line %ld column %d - Spaced dash?\n",
1.862 - linecnt,(int)(s-aline)+1);
1.863 + g_print(" Line %ld column %ld - Spaced dash?\n",
1.864 + linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1.865 else
1.866 cnt_dash++;
1.867 }
1.868 }
1.869 else if ((s=strstr(aline,"- ")))
1.870 {
1.871 - if (s==aline || s[-1]!='-')
1.872 + if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
1.873 {
1.874 if (pswit[ECHO_SWITCH])
1.875 - printf("\n%s\n",aline);
1.876 + g_print("\n%s\n",aline);
1.877 if (!pswit[OVERVIEW_SWITCH])
1.878 - printf(" Line %ld column %d - Spaced dash?\n",
1.879 - linecnt,(int)(s-aline)+1);
1.880 + g_print(" Line %ld column %ld - Spaced dash?\n",
1.881 + linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1.882 else
1.883 cnt_dash++;
1.884 }
1.885 @@ -1335,10 +1312,11 @@
1.886 if (s)
1.887 {
1.888 if (pswit[ECHO_SWITCH])
1.889 - printf("\n%s\n",aline);
1.890 + g_print("\n%s\n",aline);
1.891 if (!pswit[OVERVIEW_SWITCH])
1.892 - printf(" Line %ld column %d - Query missing paragraph break?\n",
1.893 - linecnt,(int)(s-aline)+1);
1.894 + g_print(" Line %ld column %ld - "
1.895 + "Query missing paragraph break?\n",
1.896 + linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1.897 else
1.898 cnt_punct++;
1.899 }
1.900 @@ -1382,10 +1360,10 @@
1.901 if (s)
1.902 {
1.903 if (pswit[ECHO_SWITCH])
1.904 - printf("\n%s\n",aline);
1.905 + g_print("\n%s\n",aline);
1.906 if (!pswit[OVERVIEW_SWITCH])
1.907 - printf(" Line %ld column %d - Query he/be error?\n",
1.908 - linecnt,(int)(s-aline)+1);
1.909 + g_print(" Line %ld column %ld - Query he/be error?\n",
1.910 + linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1.911 else
1.912 cnt_word++;
1.913 }
1.914 @@ -1405,10 +1383,10 @@
1.915 if (s)
1.916 {
1.917 if (pswit[ECHO_SWITCH])
1.918 - printf("\n%s\n",aline);
1.919 + g_print("\n%s\n",aline);
1.920 if (!pswit[OVERVIEW_SWITCH])
1.921 - printf(" Line %ld column %d - Query had/bad error?\n",
1.922 - linecnt,(int)(s-aline)+1);
1.923 + g_print(" Line %ld column %ld - Query had/bad error?\n",
1.924 + linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1.925 else
1.926 cnt_word++;
1.927 }
1.928 @@ -1418,10 +1396,10 @@
1.929 if (s)
1.930 {
1.931 if (pswit[ECHO_SWITCH])
1.932 - printf("\n%s\n",aline);
1.933 + g_print("\n%s\n",aline);
1.934 if (!pswit[OVERVIEW_SWITCH])
1.935 - printf(" Line %ld column %d - Query hut/but error?\n",
1.936 - linecnt,(int)(s-aline)+1);
1.937 + g_print(" Line %ld column %ld - Query hut/but error?\n",
1.938 + linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1.939 else
1.940 cnt_word++;
1.941 }
1.942 @@ -1440,10 +1418,11 @@
1.943 if (s)
1.944 {
1.945 if (pswit[ECHO_SWITCH])
1.946 - printf("\n%s\n",aline);
1.947 + g_print("\n%s\n",aline);
1.948 if (!pswit[OVERVIEW_SWITCH])
1.949 - printf(" Line %ld column %d - Query angled bracket with From\n",
1.950 - linecnt,(int)(s-aline)+1);
1.951 + g_print(" Line %ld column %ld - "
1.952 + "Query angled bracket with From\n",
1.953 + linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1.954 else
1.955 cnt_punct++;
1.956 }
1.957 @@ -1457,17 +1436,18 @@
1.958 */
1.959 void check_for_orphan_character(const char *aline)
1.960 {
1.961 - if (*aline && !aline[1])
1.962 + gunichar c;
1.963 + c=g_utf8_get_char(aline);
1.964 + if (c && !*g_utf8_next_char(aline))
1.965 {
1.966 - if (*aline=='I' || *aline=='V' || *aline=='X' || *aline=='L' ||
1.967 - gcisdigit(*aline))
1.968 + if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
1.969 ; /* Nothing - ignore numerals alone on a line. */
1.970 else
1.971 {
1.972 if (pswit[ECHO_SWITCH])
1.973 - printf("\n%s\n",aline);
1.974 + g_print("\n%s\n",aline);
1.975 if (!pswit[OVERVIEW_SWITCH])
1.976 - printf(" Line %ld column 1 - Query single character line\n",
1.977 + g_print(" Line %ld column 1 - Query single character line\n",
1.978 linecnt);
1.979 else
1.980 cnt_punct++;
1.981 @@ -1487,10 +1467,10 @@
1.982 if (s)
1.983 {
1.984 if (pswit[ECHO_SWITCH])
1.985 - printf("\n%s\n",aline);
1.986 + g_print("\n%s\n",aline);
1.987 if (!pswit[OVERVIEW_SWITCH])
1.988 - printf(" Line %ld column %ld - Query I=exclamation mark?\n",
1.989 - linecnt,s-aline);
1.990 + g_print(" Line %ld column %ld - Query I=exclamation mark?\n",
1.991 + linecnt,g_utf8_pointer_to_offset(aline,s));
1.992 else
1.993 cnt_punct++;
1.994 }
1.995 @@ -1506,47 +1486,58 @@
1.996 {
1.997 const char *s,*t,*s1;
1.998 int i;
1.999 + gsize len;
1.1000 gboolean istypo;
1.1001 gchar *testword;
1.1002 + gunichar *decomposition;
1.1003 if (pswit[PARANOID_SWITCH])
1.1004 {
1.1005 - for (t=aline;strstr(t,". ");)
1.1006 + for (t=aline;t=strstr(t,". ");)
1.1007 {
1.1008 - t=strstr(t,". ");
1.1009 if (t==aline)
1.1010 {
1.1011 - t++;
1.1012 + t=g_utf8_next_char(t);
1.1013 /* start of line punctuation is handled elsewhere */
1.1014 continue;
1.1015 }
1.1016 - if (!gcisalpha(t[-1]))
1.1017 + if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
1.1018 {
1.1019 - t++;
1.1020 + t=g_utf8_next_char(t);
1.1021 continue;
1.1022 }
1.1023 if (warnings->isDutch)
1.1024 {
1.1025 /* For Frank & Jeroen -- 's Middags case */
1.1026 - if (t[2]==CHAR_SQUOTE && t[3]>='a' && t[3]<='z' &&
1.1027 - t[4]==CHAR_SPACE && t[5]>='A' && t[5]<='Z')
1.1028 + gunichar c2,c3,c4,c5;
1.1029 + c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
1.1030 + c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
1.1031 + c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
1.1032 + c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
1.1033 + if (c2==CHAR_SQUOTE && g_unichar_islower(c3) &&
1.1034 + c4==CHAR_SPACE && g_unichar_isupper(c5))
1.1035 {
1.1036 - t++;
1.1037 + t=g_utf8_next_char(t);
1.1038 continue;
1.1039 }
1.1040 }
1.1041 - s1=t+2;
1.1042 - while (*s1 && !gcisalpha(*s1) && !isdigit(*s1))
1.1043 - s1++;
1.1044 - if (*s1>='a' && *s1<='z')
1.1045 + s1=g_utf8_next_char(g_utf8_next_char(t));
1.1046 + while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
1.1047 + !g_unichar_isdigit(g_utf8_get_char(s1))) /* isdigit() is undefined for values above UCHAR_MAX */
1.1048 + s1=g_utf8_next_char(s1);
1.1049 + if (g_unichar_islower(g_utf8_get_char(s1)))
1.1050 {
1.1051 /* we have something to investigate */
1.1052 istypo=TRUE;
1.1053 /* so let's go back and find out */
1.1054 - for (s1=t-1;s1>=aline &&
1.1055 - (gcisalpha(*s1) || gcisdigit(*s1) || *s1==CHAR_SQUOTE &&
1.1056 - gcisalpha(s1[1]) && gcisalpha(s1[-1]));s1--)
1.1057 + for (s1=g_utf8_prev_char(t);s1>=aline &&
1.1058 + (g_unichar_isalpha(g_utf8_get_char(s1)) ||
1.1059 + g_unichar_isdigit(g_utf8_get_char(s1)) ||
1.1060 + g_utf8_get_char(s1)==CHAR_SQUOTE &&
1.1061 + g_unichar_isalpha(g_utf8_get_char(g_utf8_next_char(s1))) &&
1.1062 + g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s1))));
1.1063 + s1=g_utf8_prev_char(s1))
1.1064 ;
1.1065 - s1++;
1.1066 + s1=g_utf8_next_char(s1);
1.1067 s=strchr(s1,'.');
1.1068 if (s)
1.1069 testword=g_strndup(s1,s-s1);
1.1070 @@ -1555,18 +1546,23 @@
1.1071 for (i=0;*abbrev[i];i++)
1.1072 if (!strcmp(testword,abbrev[i]))
1.1073 istypo=FALSE;
1.1074 - if (gcisdigit(*testword))
1.1075 + if (g_unichar_isdigit(g_utf8_get_char(testword)))
1.1076 istypo=FALSE;
1.1077 - if (!testword[1])
1.1078 + if (!*g_utf8_next_char(testword))
1.1079 istypo=FALSE;
1.1080 if (isroman(testword))
1.1081 istypo=FALSE;
1.1082 if (istypo)
1.1083 {
1.1084 istypo=FALSE;
1.1085 - for (i=0;testword[i];i++)
1.1086 - if (strchr(vowels,testword[i]))
1.1087 + for (s=testword;*s;s=g_utf8_next_char(s))
1.1088 + {
1.1089 + decomposition=g_unicode_canonical_decomposition(
1.1090 + g_utf8_get_char(s),&len);
1.1091 + if (g_utf8_strchr("aeiou",-1,decomposition[0]))
1.1092 istypo=TRUE;
1.1093 + g_free(decomposition);
1.1094 + }
1.1095 }
1.1096 if (istypo &&
1.1097 (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
1.1098 @@ -1574,16 +1570,16 @@
1.1099 g_tree_insert(qperiod,g_strdup(testword),
1.1100 GINT_TO_POINTER(1));
1.1101 if (pswit[ECHO_SWITCH])
1.1102 - printf("\n%s\n",aline);
1.1103 + g_print("\n%s\n",aline);
1.1104 if (!pswit[OVERVIEW_SWITCH])
1.1105 - printf(" Line %ld column %d - Extra period?\n",
1.1106 - linecnt,(int)(t-aline)+1);
1.1107 + g_print(" Line %ld column %ld - Extra period?\n",
1.1108 + linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1.1109 else
1.1110 cnt_punct++;
1.1111 }
1.1112 g_free(testword);
1.1113 }
1.1114 - t++;
1.1115 + t=g_utf8_next_char(t);
1.1116 }
1.1117 }
1.1118 }
1.1119 @@ -1597,6 +1593,7 @@
1.1120 {
1.1121 int i;
1.1122 const char *s,*wordstart;
1.1123 + gunichar c;
1.1124 gchar *inword,*t;
1.1125 if (pswit[TYPO_SWITCH])
1.1126 {
1.1127 @@ -1609,19 +1606,21 @@
1.1128 g_free(t);
1.1129 continue;
1.1130 }
1.1131 - inword=g_ascii_strdown(t,-1);
1.1132 + inword=g_utf8_strdown(t,-1);
1.1133 g_free(t);
1.1134 for (i=0;*nocomma[i];i++)
1.1135 if (!strcmp(inword,nocomma[i]))
1.1136 {
1.1137 - if (*s==',' || *s==';' || *s==':')
1.1138 + c=g_utf8_get_char(s);
1.1139 + if (c==',' || c==';' || c==':')
1.1140 {
1.1141 if (pswit[ECHO_SWITCH])
1.1142 - printf("\n%s\n",aline);
1.1143 + g_print("\n%s\n",aline);
1.1144 if (!pswit[OVERVIEW_SWITCH])
1.1145 - printf(" Line %ld column %d - "
1.1146 + g_print(" Line %ld column %ld - "
1.1147 "Query punctuation after %s?\n",
1.1148 - linecnt,(int)(s-aline)+1,inword);
1.1149 + linecnt,g_utf8_pointer_to_offset(aline,s)+1,
1.1150 + inword);
1.1151 else
1.1152 cnt_punct++;
1.1153 }
1.1154 @@ -1629,14 +1628,16 @@
1.1155 for (i=0;*noperiod[i];i++)
1.1156 if (!strcmp(inword,noperiod[i]))
1.1157 {
1.1158 - if (*s=='.' || *s=='!')
1.1159 + c=g_utf8_get_char(s);
1.1160 + if (c=='.' || c=='!')
1.1161 {
1.1162 if (pswit[ECHO_SWITCH])
1.1163 - printf("\n%s\n",aline);
1.1164 + g_print("\n%s\n",aline);
1.1165 if (!pswit[OVERVIEW_SWITCH])
1.1166 - printf(" Line %ld column %d - "
1.1167 + g_print(" Line %ld column %ld - "
1.1168 "Query punctuation after %s?\n",
1.1169 - linecnt,(int)(s-aline)+1,inword);
1.1170 + linecnt,g_utf8_pointer_to_offset(aline,s)+1,
1.1171 + inword);
1.1172 else
1.1173 cnt_punct++;
1.1174 }
1.1175 @@ -1654,10 +1655,15 @@
1.1176 */
1.1177 void check_for_typos(const char *aline,struct warnings *warnings)
1.1178 {
1.1179 - const char *s,*wordstart;
1.1180 - gchar *inword,*testword;
1.1181 - int i,alower,vowel,consonant,*dupcnt;
1.1182 - gboolean isdup,istypo;
1.1183 + const char *s,*t,*nt,*wordstart;
1.1184 + gchar *inword;
1.1185 + gunichar *decomposition;
1.1186 + gchar *testword;
1.1187 + int i,vowel,consonant,*dupcnt;
1.1188 + gboolean isdup,istypo,alower;
1.1189 + gunichar c;
1.1190 + long offset,len;
1.1191 + gsize decomposition_len;
1.1192 for (s=aline;*s;)
1.1193 {
1.1194 wordstart=s;
1.1195 @@ -1670,10 +1676,10 @@
1.1196 if (mixdigit(inword))
1.1197 {
1.1198 if (pswit[ECHO_SWITCH])
1.1199 - printf("\n%s\n",aline);
1.1200 + g_print("\n%s\n",aline);
1.1201 if (!pswit[OVERVIEW_SWITCH])
1.1202 - printf(" Line %ld column %d - Query digit in %s\n",
1.1203 - linecnt,(int)(wordstart-aline)+1,inword);
1.1204 + g_print(" Line %ld column %ld - Query digit in %s\n",
1.1205 + linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
1.1206 else
1.1207 cnt_word++;
1.1208 }
1.1209 @@ -1684,14 +1690,15 @@
1.1210 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
1.1211 {
1.1212 istypo=FALSE;
1.1213 - testword=g_strdup(inword);
1.1214 - alower=0;
1.1215 - for (i=0;i<(int)strlen(testword);i++)
1.1216 + alower=FALSE;
1.1217 + for (t=inword;*t;t=g_utf8_next_char(t))
1.1218 {
1.1219 + c=g_utf8_get_char(t);
1.1220 + nt=g_utf8_next_char(t);
1.1221 /* lowercase for testing */
1.1222 - if (testword[i]>='a' && testword[i]<='z')
1.1223 - alower=1;
1.1224 - if (alower && testword[i]>='A' && testword[i]<='Z')
1.1225 + if (g_unichar_islower(c))
1.1226 + alower=TRUE;
1.1227 + if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
1.1228 {
1.1229 /*
1.1230 * We have an uppercase mid-word. However, there are
1.1231 @@ -1699,15 +1706,18 @@
1.1232 * Mac and Mc like McGill
1.1233 * French contractions like l'Abbe
1.1234 */
1.1235 - if (i==2 && testword[0]=='m' && testword[1]=='c' ||
1.1236 - i==3 && testword[0]=='m' && testword[1]=='a' &&
1.1237 - testword[2]=='c' || i>0 && testword[i-1]==CHAR_SQUOTE)
1.1238 + offset=g_utf8_pointer_to_offset(inword,t);
1.1239 + if (offset==2 && !g_ascii_strncasecmp(inword,"mc",2) ||
1.1240 + offset==3 && !g_ascii_strncasecmp(inword,"mac",3) ||
1.1241 + /* c is the mid-word capital itself, so test the word's prefix */
1.1242 + offset>0 &&
1.1243 + g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SQUOTE)
1.1244 ; /* do nothing! */
1.1245 else
1.1246 istypo=TRUE;
1.1247 }
1.1248 - testword[i]=(char)tolower(testword[i]);
1.1249 }
1.1250 + testword=g_utf8_casefold(inword,-1);
1.1251 }
1.1252 if (pswit[TYPO_SWITCH])
1.1253 {
1.1254 @@ -1715,13 +1725,14 @@
1.1255 * Check for certain unlikely two-letter combinations at word
1.1256 * start and end.
1.1257 */
1.1258 - if (strlen(testword)>1)
1.1259 + len=g_utf8_strlen(testword,-1);
1.1260 + if (len>1)
1.1261 {
1.1262 for (i=0;*nostart[i];i++)
1.1263 - if (!strncmp(testword,nostart[i],2))
1.1264 + if (g_str_has_prefix(testword,nostart[i]))
1.1265 istypo=TRUE;
1.1266 for (i=0;*noend[i];i++)
1.1267 - if (!strncmp(testword+strlen(testword)-2,noend[i],2))
1.1268 + if (g_str_has_suffix(testword,noend[i]))
1.1269 istypo=TRUE;
1.1270 }
1.1271 /* ght is common, gbt never. Like that. */
1.1272 @@ -1755,21 +1766,25 @@
1.1273 * Check for no vowels or no consonants.
1.1274 * If none, flag a typo.
1.1275 */
1.1276 - if (!istypo && strlen(testword)>1)
1.1277 + if (!istypo && len>1)
1.1278 {
1.1279 vowel=consonant=0;
1.1280 - for (i=0;testword[i];i++)
1.1281 + for (t=testword;*t;t=g_utf8_next_char(t))
1.1282 {
1.1283 - if (testword[i]=='y' || gcisdigit(testword[i]))
1.1284 + c=g_utf8_get_char(t);
1.1285 + decomposition=
1.1286 + g_unicode_canonical_decomposition(c,&decomposition_len);
1.1287 + if (c=='y' || g_unichar_isdigit(c))
1.1288 {
1.1289 /* Yah, this is loose. */
1.1290 vowel++;
1.1291 consonant++;
1.1292 }
1.1293 - else if (strchr(vowels,testword[i]))
1.1294 + else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
1.1295 vowel++;
1.1296 else
1.1297 consonant++;
1.1298 + g_free(decomposition);
1.1299 }
1.1300 if (!vowel || !consonant)
1.1301 istypo=TRUE;
1.1302 @@ -1798,7 +1813,8 @@
1.1303 * "d" for a missing apostrophe - he d
1.1304 * "n" for "in"
1.1305 */
1.1306 - if (!istypo && strlen(testword)==1 && strchr("slmijdn",*inword))
1.1307 + if (!istypo && len==1 &&
1.1308 + g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
1.1309 istypo=TRUE;
1.1310 if (istypo)
1.1311 {
1.1312 @@ -1817,14 +1833,15 @@
1.1313 if (!isdup)
1.1314 {
1.1315 if (pswit[ECHO_SWITCH])
1.1316 - printf("\n%s\n",aline);
1.1317 + g_print("\n%s\n",aline);
1.1318 if (!pswit[OVERVIEW_SWITCH])
1.1319 {
1.1320 - printf(" Line %ld column %d - Query word %s",
1.1321 - linecnt,(int)(wordstart-aline)+1,inword);
1.1322 + g_print(" Line %ld column %ld - Query word %s",
1.1323 + linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
1.1324 + inword);
1.1325 if (!pswit[VERBOSE_SWITCH])
1.1326 - printf(" - not reporting duplicates");
1.1327 - printf("\n");
1.1328 + g_print(" - not reporting duplicates");
1.1329 + g_print("\n");
1.1330 }
1.1331 else
1.1332 cnt_word++;
1.1333 @@ -1835,10 +1852,10 @@
1.1334 if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
1.1335 {
1.1336 if (pswit[ECHO_SWITCH])
1.1337 - printf("\n%s\n",aline);
1.1338 + g_print("\n%s\n",aline);
1.1339 if (!pswit[OVERVIEW_SWITCH])
1.1340 - printf(" Line %ld column %d - Query possible scanno %s\n",
1.1341 - linecnt,(int)(wordstart-aline)+2,inword);
1.1342 + g_print(" Line %ld column %ld - Query possible scanno %s\n",
1.1343 + linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
1.1344 }
1.1345 if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
1.1346 g_free(testword);
1.1347 @@ -1848,10 +1865,11 @@
1.1348 if (!strcmp(inword,"0") || !strcmp(inword,"1"))
1.1349 {
1.1350 if (pswit[ECHO_SWITCH])
1.1351 - printf("\n%s\n",aline);
1.1352 + g_print("\n%s\n",aline);
1.1353 if (!pswit[OVERVIEW_SWITCH])
1.1354 - printf(" Line %ld column %d - Query standalone %s\n",
1.1355 - linecnt,(int)(wordstart-aline)+2,inword);
1.1356 + g_print(" Line %ld column %ld - Query standalone %s\n",
1.1357 + linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
1.1358 + inword);
1.1359 else
1.1360 cnt_word++;
1.1361 }
1.1362 @@ -1873,63 +1891,73 @@
1.1363 void check_for_misspaced_punctuation(const char *aline,
1.1364 struct parities *parities,gboolean isemptyline)
1.1365 {
1.1366 - int i,llen;
1.1367 gboolean isacro,isellipsis;
1.1368 const char *s;
1.1369 - llen=strlen(aline);
1.1370 - for (i=1;i<llen;i++)
1.1371 + gunichar c,nc,pc,n2c;
1.1372 + c=g_utf8_get_char(aline);
1.1373 + nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1.1374 + for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1.1375 {
1.1376 + pc=c;
1.1377 + c=nc;
1.1378 + nc=g_utf8_get_char(g_utf8_next_char(s));
1.1379 /* For each character in the line after the first. */
1.1380 - if (strchr(".?!,;:_",aline[i])) /* if it's punctuation */
1.1381 + if (g_utf8_strchr(".?!,;:_",-1,c)) /* if it's punctuation */
1.1382 {
1.1383 /* we need to suppress warnings for acronyms like M.D. */
1.1384 isacro=FALSE;
1.1385 /* we need to suppress warnings for ellipsis . . . */
1.1386 isellipsis=FALSE;
1.1387 - /* if there are letters on both sides of it or ... */
1.1388 - if (gcisalpha(aline[i-1]) && gcisalpha(aline[i+1]) ||
1.1389 - gcisalpha(aline[i+1]) && strchr("?!,;:",aline[i]))
1.1390 + /*
1.1391 + * If there are letters on both sides of it or
1.1392 + * if it's strict punctuation followed by an alpha.
1.1393 + */
1.1394 + if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
1.1395 + g_utf8_strchr("?!,;:",-1,c)))
1.1396 {
1.1397 - /* ...if it's strict punctuation followed by an alpha */
1.1398 - if (aline[i]=='.')
1.1399 + if (c=='.')
1.1400 {
1.1401 - if (i>2 && aline[i-2]=='.')
1.1402 + if (g_utf8_pointer_to_offset(aline,s)>2 &&
1.1403 + g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
1.1404 isacro=TRUE;
1.1405 - if (i+2<llen && aline[i+2]=='.')
1.1406 + n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
1.1407 + if (nc && n2c=='.')
1.1408 isacro=TRUE;
1.1409 }
1.1410 if (!isacro)
1.1411 {
1.1412 if (pswit[ECHO_SWITCH])
1.1413 - printf("\n%s\n",aline);
1.1414 + g_print("\n%s\n",aline);
1.1415 if (!pswit[OVERVIEW_SWITCH])
1.1416 - printf(" Line %ld column %d - Missing space?\n",
1.1417 - linecnt,i+1);
1.1418 + g_print(" Line %ld column %ld - Missing space?\n",
1.1419 + linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1.1420 else
1.1421 cnt_punct++;
1.1422 }
1.1423 }
1.1424 - if (aline[i-1]==CHAR_SPACE &&
1.1425 - (aline[i+1]==CHAR_SPACE || aline[i+1]==0))
1.1426 + if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
1.1427 {
1.1428 /*
1.1429 * If there are spaces on both sides,
1.1430 * or space before and end of line.
1.1431 */
1.1432 - if (aline[i]=='.')
1.1433 + if (c=='.')
1.1434 {
1.1435 - if (i>2 && aline[i-2]=='.')
1.1436 + if (g_utf8_pointer_to_offset(aline,s)>2 &&
1.1437 + g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
1.1438 isellipsis=TRUE;
1.1439 - if (i+2<llen && aline[i+2]=='.')
1.1440 + n2c=nc?g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s))):0;
1.1441 + if (n2c=='.') /* n2c is 0 when nc is 0, so nothing past the NUL is read */
1.1442 isellipsis=TRUE;
1.1443 }
1.1444 if (!isemptyline && !isellipsis)
1.1445 {
1.1446 if (pswit[ECHO_SWITCH])
1.1447 - printf("\n%s\n",aline);
1.1448 + g_print("\n%s\n",aline);
1.1449 if (!pswit[OVERVIEW_SWITCH])
1.1450 - printf(" Line %ld column %d - "
1.1451 - "Spaced punctuation?\n",linecnt,i+1);
1.1452 + g_print(" Line %ld column %ld - "
1.1453 + "Spaced punctuation?\n",linecnt,
1.1454 + g_utf8_pointer_to_offset(aline,s)+1);
1.1455 else
1.1456 cnt_punct++;
1.1457 }
1.1458 @@ -1937,25 +1965,28 @@
1.1459 }
1.1460 }
1.1461 /* Split out the characters that CANNOT be preceded by space. */
1.1462 - llen=strlen(aline);
1.1463 - for (i=1;i<llen;i++)
1.1464 + c=g_utf8_get_char(aline);
1.1465 + nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1.1466 + for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1.1467 {
1.1468 + pc=c;
1.1469 + c=nc;
1.1470 + nc=g_utf8_get_char(g_utf8_next_char(s));
1.1471 /* for each character in the line after the first */
1.1472 - if (strchr("?!,;:",aline[i]))
1.1473 + if (g_utf8_strchr("?!,;:",-1,c))
1.1474 {
1.1475 /* if it's punctuation that _cannot_ have a space before it */
1.1476 - if (aline[i-1]==CHAR_SPACE && !isemptyline &&
1.1477 - aline[i+1]!=CHAR_SPACE)
1.1478 + if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
1.1479 {
1.1480 /*
1.1481 - * If aline[i+1) DOES == space,
1.1482 + * If nc DOES == space,
1.1483 * it was already reported just above.
1.1484 */
1.1485 if (pswit[ECHO_SWITCH])
1.1486 - printf("\n%s\n",aline);
1.1487 + g_print("\n%s\n",aline);
1.1488 if (!pswit[OVERVIEW_SWITCH])
1.1489 - printf(" Line %ld column %d - Spaced punctuation?\n",
1.1490 - linecnt,i+1);
1.1491 + g_print(" Line %ld column %ld - Spaced punctuation?\n",
1.1492 + linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1.1493 else
1.1494 cnt_punct++;
1.1495 }
1.1496 @@ -1966,64 +1997,77 @@
1.1497 * This plugs a hole in the acronym code above.
1.1498 * Inelegant, but maintainable.
1.1499 */
1.1500 - llen=strlen(aline);
1.1501 - for (i=1;i<llen;i++)
1.1502 + c=g_utf8_get_char(aline);
1.1503 + nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1.1504 + for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1.1505 {
1.1506 + pc=c;
1.1507 + c=nc;
1.1508 + nc=g_utf8_get_char(g_utf8_next_char(s));
1.1509 /* for each character in the line after the first */
1.1510 - if (aline[i]=='.')
1.1511 + if (c=='.')
1.1512 {
1.1513 /* if it's a period */
1.1514 - if (aline[i-1]==CHAR_SPACE && gcisalpha(aline[i+1]))
1.1515 + if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
1.1516 {
1.1517 /*
1.1518 * If the period follows a space and
1.1519 * is followed by a letter.
1.1520 */
1.1521 if (pswit[ECHO_SWITCH])
1.1522 - printf("\n%s\n",aline);
1.1523 + g_print("\n%s\n",aline);
1.1524 if (!pswit[OVERVIEW_SWITCH])
1.1525 - printf(" Line %ld column %d - Spaced punctuation?\n",
1.1526 - linecnt,i+1);
1.1527 + g_print(" Line %ld column %ld - Spaced punctuation?\n",
1.1528 + linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1.1529 else
1.1530 cnt_punct++;
1.1531 }
1.1532 }
1.1533 }
1.1534 - for (i=1;i<llen;i++)
1.1535 + c=g_utf8_get_char(aline);
1.1536 + nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1.1537 + for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1.1538 {
1.1539 + pc=c;
1.1540 + c=nc;
1.1541 + nc=g_utf8_get_char(g_utf8_next_char(s));
1.1542 /* for each character in the line after the first */
1.1543 - if (aline[i]==CHAR_DQUOTE)
1.1544 + if (c==CHAR_DQUOTE)
1.1545 {
1.1546 - if (!strchr(" _-.'`,;:!/([{?}])",aline[i-1]) &&
1.1547 - !strchr(" _-.'`,;:!/([{?}])",aline[i+1]) && aline[i+1] ||
1.1548 - !strchr(" _-([{'`",aline[i-1]) && gcisalpha(aline[i+1]))
1.1549 + if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
1.1550 + !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
1.1551 + !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
1.1552 {
1.1553 if (pswit[ECHO_SWITCH])
1.1554 - printf("\n%s\n",aline);
1.1555 + g_print("\n%s\n",aline);
1.1556 if (!pswit[OVERVIEW_SWITCH])
1.1557 - printf(" Line %ld column %d - Unspaced quotes?\n",
1.1558 - linecnt,i+1);
1.1559 + g_print(" Line %ld column %ld - Unspaced quotes?\n",
1.1560 + linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1.1561 else
1.1562 cnt_punct++;
1.1563 }
1.1564 }
1.1565 }
1.1566 /* Check parity of quotes. */
1.1567 - for (s=aline;*s;s++)
1.1568 + nc=g_utf8_get_char(aline);
1.1569 + for (s=aline;*s;s=g_utf8_next_char(s))
1.1570 {
1.1571 - if (*s==CHAR_DQUOTE)
1.1572 + c=nc;
1.1573 + nc=g_utf8_get_char(g_utf8_next_char(s));
1.1574 + if (c==CHAR_DQUOTE)
1.1575 {
1.1576 parities->dquote=!parities->dquote;
1.1577 if (!parities->dquote)
1.1578 {
1.1579 /* parity even */
1.1580 - if (!strchr("_-.'`/,;:!?)]} ",s[1]))
1.1581 + if (!g_utf8_strchr("_-.'`/,;:!?)]} ",-1,nc))
1.1582 {
1.1583 if (pswit[ECHO_SWITCH])
1.1584 - printf("\n%s\n",aline);
1.1585 + g_print("\n%s\n",aline);
1.1586 if (!pswit[OVERVIEW_SWITCH])
1.1587 - printf(" Line %ld column %d - "
1.1588 - "Wrongspaced quotes?\n",linecnt,(int)(s-aline)+1);
1.1589 + g_print(" Line %ld column %ld - "
1.1590 + "Wrongspaced quotes?\n",
1.1591 + linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1.1592 else
1.1593 cnt_punct++;
1.1594 }
1.1595 @@ -2031,28 +2075,30 @@
1.1596 else
1.1597 {
1.1598 /* parity odd */
1.1599 - if (!gcisalpha(s[1]) && !isdigit(s[1]) &&
1.1600 - !strchr("_-/.'`([{$",s[1]) || !s[1])
1.1601 + if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) && /* nc is a gunichar */
1.1602 + !g_utf8_strchr("_-/.'`([{$",-1,nc) || !nc)
1.1603 {
1.1604 if (pswit[ECHO_SWITCH])
1.1605 - printf("\n%s\n",aline);
1.1606 + g_print("\n%s\n",aline);
1.1607 if (!pswit[OVERVIEW_SWITCH])
1.1608 - printf(" Line %ld column %d - "
1.1609 - "Wrongspaced quotes?\n",linecnt,(int)(s-aline)+1);
1.1610 + g_print(" Line %ld column %ld - "
1.1611 + "Wrongspaced quotes?\n",
1.1612 + linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1.1613 else
1.1614 cnt_punct++;
1.1615 }
1.1616 }
1.1617 }
1.1618 }
1.1619 - if (*aline==CHAR_DQUOTE)
1.1620 + if (g_utf8_get_char(aline)==CHAR_DQUOTE)
1.1621 {
1.1622 - if (strchr(",;:!?)]} ",aline[1]))
1.1623 + if (g_utf8_strchr(",;:!?)]} ",-1,
1.1624 + g_utf8_get_char(g_utf8_next_char(aline))))
1.1625 {
1.1626 if (pswit[ECHO_SWITCH])
1.1627 - printf("\n%s\n",aline);
1.1628 + g_print("\n%s\n",aline);
1.1629 if (!pswit[OVERVIEW_SWITCH])
1.1630 - printf(" Line %ld column 1 - Wrongspaced quotes?\n",
1.1631 + g_print(" Line %ld column 1 - Wrongspaced quotes?\n",
1.1632 linecnt);
1.1633 else
1.1634 cnt_punct++;
1.1635 @@ -2060,24 +2106,28 @@
1.1636 }
1.1637 if (pswit[SQUOTE_SWITCH])
1.1638 {
1.1639 - for (s=aline;*s;s++)
1.1640 + nc=g_utf8_get_char(aline);
1.1641 + for (s=aline;*s;s=g_utf8_next_char(s))
1.1642 {
1.1643 - if ((*s==CHAR_SQUOTE || *s==CHAR_OPEN_SQUOTE) &&
1.1644 - (s==aline || s>aline && !gcisalpha(s[-1]) ||
1.1645 - !gcisalpha(s[1])))
1.1646 + c=nc;
1.1647 + nc=g_utf8_get_char(g_utf8_next_char(s));
1.1648 + if ((c==CHAR_SQUOTE || c==CHAR_OPEN_SQUOTE) && (s==aline ||
1.1649 + s>aline &&
1.1650 + !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
1.1651 + !g_unichar_isalpha(nc)))
1.1652 {
1.1653 parities->squote=!parities->squote;
1.1654 if (!parities->squote)
1.1655 {
1.1656 /* parity even */
1.1657 - if (!strchr("_-.'`/\",;:!?)]} ",s[1]))
1.1658 + if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
1.1659 {
1.1660 if (pswit[ECHO_SWITCH])
1.1661 - printf("\n%s\n",aline);
1.1662 + g_print("\n%s\n",aline);
1.1663 if (!pswit[OVERVIEW_SWITCH])
1.1664 - printf(" Line %ld column %d - "
1.1665 + g_print(" Line %ld column %ld - "
1.1666 "Wrongspaced singlequotes?\n",
1.1667 - linecnt,(int)(s-aline)+1);
1.1668 + linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1.1669 else
1.1670 cnt_punct++;
1.1671 }
1.1672 @@ -2085,15 +2135,15 @@
1.1673 else
1.1674 {
1.1675 /* parity odd */
1.1676 - if (!gcisalpha(s[1]) && !isdigit(s[1]) &&
1.1677 - !strchr("_-/\".'`",s[1]) || !s[1])
1.1678 + if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) && /* nc is a gunichar */
1.1679 + !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
1.1680 {
1.1681 if (pswit[ECHO_SWITCH])
1.1682 - printf("\n%s\n",aline);
1.1683 + g_print("\n%s\n",aline);
1.1684 if (!pswit[OVERVIEW_SWITCH])
1.1685 - printf(" Line %ld column %d - "
1.1686 + g_print(" Line %ld column %ld - "
1.1687 "Wrongspaced singlequotes?\n",
1.1688 - linecnt,(int)(s-aline)+1);
1.1689 + linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1.1690 else
1.1691 cnt_punct++;
1.1692 }
1.1693 @@ -2117,49 +2167,54 @@
1.1694 */
1.1695 void check_for_double_punctuation(const char *aline,struct warnings *warnings)
1.1696 {
1.1697 - int i,llen;
1.1698 - llen=strlen(aline);
1.1699 - for (i=0;i<llen;i++)
1.1700 + const char *s;
1.1701 + gunichar c,nc;
1.1702 + nc=g_utf8_get_char(aline);
1.1703 + for (s=aline;*s;s=g_utf8_next_char(s))
1.1704 {
1.1705 + c=nc;
1.1706 + nc=g_utf8_get_char(g_utf8_next_char(s));
1.1707 /* for each punctuation character in the line */
1.1708 - if (strchr(".?!,;:",aline[i]) && strchr(".?!,;:",aline[i+1]) &&
1.1709 - aline[i] && aline[i+1])
1.1710 + if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
1.1711 + g_utf8_strchr(".?!,;:",-1,nc))
1.1712 {
1.1713 /* followed by punctuation, it's a query, unless . . . */
1.1714 - if (aline[i]==aline[i+1] && (aline[i]=='.' || aline[i]=='?' ||
1.1715 - aline[i]=='!') ||
1.1716 - !warnings->dotcomma && aline[i]=='.' && aline[i+1]==',' ||
1.1717 - warnings->isFrench && !strncmp(aline+i,",...",4) ||
1.1718 - warnings->isFrench && !strncmp(aline+i,"...,",4) ||
1.1719 - warnings->isFrench && !strncmp(aline+i,";...",4) ||
1.1720 - warnings->isFrench && !strncmp(aline+i,"...;",4) ||
1.1721 - warnings->isFrench && !strncmp(aline+i,":...",4) ||
1.1722 - warnings->isFrench && !strncmp(aline+i,"...:",4) ||
1.1723 - warnings->isFrench && !strncmp(aline+i,"!...",4) ||
1.1724 - warnings->isFrench && !strncmp(aline+i,"...!",4) ||
1.1725 - warnings->isFrench && !strncmp(aline+i,"?...",4) ||
1.1726 - warnings->isFrench && !strncmp(aline+i,"...?",4))
1.1727 + if (c==nc && (c=='.' || c=='?' || c=='!') ||
1.1728 + !warnings->dotcomma && c=='.' && nc==',' ||
1.1729 + warnings->isFrench && g_str_has_prefix(s,",...") ||
1.1730 + warnings->isFrench && g_str_has_prefix(s,"...,") ||
1.1731 + warnings->isFrench && g_str_has_prefix(s,";...") ||
1.1732 + warnings->isFrench && g_str_has_prefix(s,"...;") ||
1.1733 + warnings->isFrench && g_str_has_prefix(s,":...") ||
1.1734 + warnings->isFrench && g_str_has_prefix(s,"...:") ||
1.1735 + warnings->isFrench && g_str_has_prefix(s,"!...") ||
1.1736 + warnings->isFrench && g_str_has_prefix(s,"...!") ||
1.1737 + warnings->isFrench && g_str_has_prefix(s,"?...") ||
1.1738 + warnings->isFrench && g_str_has_prefix(s,"...?"))
1.1739 {
1.1740 - if (warnings->isFrench && !strncmp(aline+i,",...",4) ||
1.1741 - warnings->isFrench && !strncmp(aline+i,"...,",4) ||
1.1742 - warnings->isFrench && !strncmp(aline+i,";...",4) ||
1.1743 - warnings->isFrench && !strncmp(aline+i,"...;",4) ||
1.1744 - warnings->isFrench && !strncmp(aline+i,":...",4) ||
1.1745 - warnings->isFrench && !strncmp(aline+i,"...:",4) ||
1.1746 - warnings->isFrench && !strncmp(aline+i,"!...",4) ||
1.1747 - warnings->isFrench && !strncmp(aline+i,"...!",4) ||
1.1748 - warnings->isFrench && !strncmp(aline+i,"?...",4) ||
1.1749 - warnings->isFrench && !strncmp(aline+i,"...?",4))
1.1750 - i+=4;
1.1751 + if (warnings->isFrench && g_str_has_prefix(s,",...") ||
1.1752 + warnings->isFrench && g_str_has_prefix(s,"...,") ||
1.1753 + warnings->isFrench && g_str_has_prefix(s,";...") ||
1.1754 + warnings->isFrench && g_str_has_prefix(s,"...;") ||
1.1755 + warnings->isFrench && g_str_has_prefix(s,":...") ||
1.1756 + warnings->isFrench && g_str_has_prefix(s,"...:") ||
1.1757 + warnings->isFrench && g_str_has_prefix(s,"!...") ||
1.1758 + warnings->isFrench && g_str_has_prefix(s,"...!") ||
1.1759 + warnings->isFrench && g_str_has_prefix(s,"?...") ||
1.1760 + warnings->isFrench && g_str_has_prefix(s,"...?"))
1.1761 + {
1.1762 + s+=s[4]?4:3; /* if the match ends the line, stop short so the loop step lands on the NUL */
1.1763 + nc=g_utf8_get_char(g_utf8_next_char(s));
1.1764 + }
1.1765 ; /* do nothing for .. !! and ?? which can be legit */
1.1766 }
1.1767 else
1.1768 {
1.1769 if (pswit[ECHO_SWITCH])
1.1770 - printf("\n%s\n",aline);
1.1771 + g_print("\n%s\n",aline);
1.1772 if (!pswit[OVERVIEW_SWITCH])
1.1773 - printf(" Line %ld column %d - Double punctuation?\n",
1.1774 - linecnt,i+1);
1.1775 + g_print(" Line %ld column %ld - Double punctuation?\n",
1.1776 + linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1.1777 else
1.1778 cnt_punct++;
1.1779 }
1.1780 @@ -2177,37 +2232,37 @@
1.1781 while ((t=strstr(s," \" ")))
1.1782 {
1.1783 if (pswit[ECHO_SWITCH])
1.1784 - printf("\n%s\n",aline);
1.1785 + g_print("\n%s\n",aline);
1.1786 if (!pswit[OVERVIEW_SWITCH])
1.1787 - printf(" Line %ld column %d - Spaced doublequote?\n",
1.1788 - linecnt,(int)(t-aline+1));
1.1789 + g_print(" Line %ld column %ld - Spaced doublequote?\n",
1.1790 + linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1.1791 else
1.1792 cnt_punct++;
1.1793 - s=t+2;
1.1794 + s=g_utf8_next_char(g_utf8_next_char(t));
1.1795 }
1.1796 s=aline;
1.1797 while ((t=strstr(s," ' ")))
1.1798 {
1.1799 if (pswit[ECHO_SWITCH])
1.1800 - printf("\n%s\n",aline);
1.1801 + g_print("\n%s\n",aline);
1.1802 if (!pswit[OVERVIEW_SWITCH])
1.1803 - printf(" Line %ld column %d - Spaced singlequote?\n",
1.1804 - linecnt,(int)(t-aline+1));
1.1805 + g_print(" Line %ld column %ld - Spaced singlequote?\n",
1.1806 + linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1.1807 else
1.1808 cnt_punct++;
1.1809 - s=t+2;
1.1810 + s=g_utf8_next_char(g_utf8_next_char(t));
1.1811 }
1.1812 s=aline;
1.1813 while ((t=strstr(s," ` ")))
1.1814 {
1.1815 if (pswit[ECHO_SWITCH])
1.1816 - printf("\n%s\n",aline);
1.1817 + g_print("\n%s\n",aline);
1.1818 if (!pswit[OVERVIEW_SWITCH])
1.1819 - printf(" Line %ld column %d - Spaced singlequote?\n",
1.1820 - linecnt,(int)(t-aline+1));
1.1821 + g_print(" Line %ld column %ld - Spaced singlequote?\n",
1.1822 + linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1.1823 else
1.1824 cnt_punct++;
1.1825 - s=t+2;
1.1826 + s=g_utf8_next_char(g_utf8_next_char(t));
1.1827 }
1.1828 }
1.1829
1.1830 @@ -2219,22 +2274,26 @@
1.1831 void check_for_miscased_genative(const char *aline)
1.1832 {
1.1833 const char *s;
1.1834 + gunichar c,nc,pc;
1.1835 if (!*aline)
1.1836 return;
1.1837 - s=aline+1;
1.1838 - while (*s)
1.1839 + c=g_utf8_get_char(aline);
1.1840 + nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1.1841 + for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1.1842 {
1.1843 - if (*s==CHAR_SQUOTE && s[1]=='S' && s[-1]>='a' && s[-1]<='z')
1.1844 + pc=c;
1.1845 + c=nc;
1.1846 + nc=g_utf8_get_char(g_utf8_next_char(s));
1.1847 + if (c==CHAR_SQUOTE && nc=='S' && g_unichar_islower(pc))
1.1848 {
1.1849 if (pswit[ECHO_SWITCH])
1.1850 - printf("\n%s\n",aline);
1.1851 + g_print("\n%s\n",aline);
1.1852 if (!pswit[OVERVIEW_SWITCH])
1.1853 - printf(" Line %ld column %d - Capital \"S\"?\n",
1.1854 - linecnt,(int)(s-aline+2));
1.1855 + g_print(" Line %ld column %ld - Capital \"S\"?\n",
1.1856 + linecnt,g_utf8_pointer_to_offset(aline,s)+2);
1.1857 else
1.1858 cnt_punct++;
1.1859 }
1.1860 - s++;
1.1861 }
1.1862 }
1.1863
1.1864 @@ -2248,29 +2307,34 @@
1.1865 */
1.1866 void check_end_of_line(const char *aline,struct warnings *warnings)
1.1867 {
1.1868 - int i,llen;
1.1869 - llen=strlen(aline);
1.1870 - if (llen>1)
1.1871 + int lbytes;
1.1872 + const char *s;
1.1873 + gunichar c1,c2;
1.1874 + lbytes=strlen(aline);
1.1875 + if (g_utf8_strlen(aline,lbytes)>1)
1.1876 {
1.1877 - if (aline[llen-1]==CHAR_DQUOTE || aline[llen-1]==CHAR_SQUOTE ||
1.1878 - aline[llen-1]==CHAR_OPEN_SQUOTE)
1.1879 - if (aline[llen-2]==CHAR_SPACE)
1.1880 - {
1.1881 - if (pswit[ECHO_SWITCH])
1.1882 - printf("\n%s\n",aline);
1.1883 - if (!pswit[OVERVIEW_SWITCH])
1.1884 - printf(" Line %ld column %d - Spaced quote?\n",
1.1885 - linecnt,llen);
1.1886 - else
1.1887 - cnt_punct++;
1.1888 - }
1.1889 - if ((aline[0]==CHAR_SQUOTE || aline[0]==CHAR_OPEN_SQUOTE) &&
1.1890 - aline[1]==CHAR_SPACE)
1.1891 + s=g_utf8_prev_char(aline+lbytes);
1.1892 + c1=g_utf8_get_char(s);
1.1893 + c2=g_utf8_get_char(g_utf8_prev_char(s));
1.1894 + if ((c1==CHAR_DQUOTE || c1==CHAR_SQUOTE || c1==CHAR_OPEN_SQUOTE) &&
1.1895 + c2==CHAR_SPACE)
1.1896 {
1.1897 if (pswit[ECHO_SWITCH])
1.1898 - printf("\n%s\n",aline);
1.1899 + g_print("\n%s\n",aline);
1.1900 if (!pswit[OVERVIEW_SWITCH])
1.1901 - printf(" Line %ld column 1 - Spaced quote?\n",linecnt);
1.1902 + g_print(" Line %ld column %ld - Spaced quote?\n",linecnt,
1.1903 + g_utf8_strlen(aline,lbytes));
1.1904 + else
1.1905 + cnt_punct++;
1.1906 + }
1.1907 + c1=g_utf8_get_char(aline);
1.1908 + c2=g_utf8_get_char(g_utf8_next_char(aline));
1.1909 + if ((c1==CHAR_SQUOTE || c1==CHAR_OPEN_SQUOTE) && c2==CHAR_SPACE)
1.1910 + {
1.1911 + if (pswit[ECHO_SWITCH])
1.1912 + g_print("\n%s\n",aline);
1.1913 + if (!pswit[OVERVIEW_SWITCH])
1.1914 + g_print(" Line %ld column 1 - Spaced quote?\n",linecnt);
1.1915 else
1.1916 cnt_punct++;
1.1917 }
1.1918 @@ -2280,15 +2344,18 @@
1.1919 */
1.1920 if (pswit[PARANOID_SWITCH] && warnings->hyphen)
1.1921 {
1.1922 - for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--)
1.1923 + for (s=g_utf8_prev_char(aline+lbytes);
1.1924 + s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
1.1925 ;
1.1926 - if (aline[i]=='-' && aline[i-1]!='-')
1.1927 + if (g_utf8_get_char(s)=='-' && /* never step back past line start */
1.1928 + (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-'))
1.1929 {
1.1930 if (pswit[ECHO_SWITCH])
1.1931 - printf("\n%s\n",aline);
1.1932 + g_print("\n%s\n",aline);
1.1933 if (!pswit[OVERVIEW_SWITCH])
1.1934 - printf(" Line %ld column %d - Hyphen at end of line?\n",
1.1935 - linecnt,i);
1.1936 + g_print(" Line %ld column %ld - "
1.1937 + "Hyphen at end of line?\n",
1.1938 + linecnt,g_utf8_pointer_to_offset(aline,s));
1.1939 }
1.1940 }
1.1941 }
1.1942 @@ -2302,19 +2369,26 @@
1.1943 */
1.1944 void check_for_unspaced_bracket(const char *aline)
1.1945 {
1.1946 - int i,llen;
1.1947 - llen=strlen(aline);
1.1948 - for (i=1;i<llen-1;i++)
1.1949 + const char *s;
1.1950 + gunichar c,nc,pc;
1.1951 + c=g_utf8_get_char(aline);
1.1952 + nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1.1953 + for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1.1954 {
1.1955 + pc=c;
1.1956 + c=nc;
1.1957 + nc=g_utf8_get_char(g_utf8_next_char(s));
1.1958 + if (!nc)
1.1959 + break;
1.1960 /* for each bracket character in the line except 1st & last */
1.1961 - if (strchr("{[()]}",aline[i]) && gcisalpha(aline[i-1]) &&
1.1962 - gcisalpha(aline[i+1]))
1.1963 + if (g_utf8_strchr("{[()]}",-1,c) &&
1.1964 + g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
1.1965 {
1.1966 if (pswit[ECHO_SWITCH])
1.1967 - printf("\n%s\n",aline);
1.1968 + g_print("\n%s\n",aline);
1.1969 if (!pswit[OVERVIEW_SWITCH])
1.1970 - printf(" Line %ld column %d - Unspaced bracket?\n",
1.1971 - linecnt,i);
1.1972 + g_print(" Line %ld column %ld - Unspaced bracket?\n",
1.1973 + linecnt,g_utf8_pointer_to_offset(aline,s));
1.1974 else
1.1975 cnt_punct++;
1.1976 }
1.1977 @@ -2326,18 +2400,24 @@
1.1978 */
1.1979 void check_for_unpunctuated_endquote(const char *aline)
1.1980 {
1.1981 - int i,llen;
1.1982 - llen=strlen(aline);
1.1983 - for (i=1;i<llen;i++)
1.1984 + const char *s;
1.1985 + gunichar c,nc,pc;
1.1986 + c=g_utf8_get_char(aline);
1.1987 + nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
1.1988 + for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
1.1989 {
1.1990 + pc=c;
1.1991 + c=nc;
1.1992 + nc=g_utf8_get_char(g_utf8_next_char(s));
1.1993 /* for each character in the line except 1st */
1.1994 - if (aline[i]==CHAR_DQUOTE && isalpha(aline[i-1]))
1.1995 + if (c==CHAR_DQUOTE && g_unichar_isalpha(pc))
1.1996 {
1.1997 if (pswit[ECHO_SWITCH])
1.1998 - printf("\n%s\n",aline);
1.1999 + g_print("\n%s\n",aline);
1.2000 if (!pswit[OVERVIEW_SWITCH])
1.2001 - printf(" Line %ld column %d - "
1.2002 - "endquote missing punctuation?\n",linecnt,i);
1.2003 + g_print(" Line %ld column %ld - "
1.2004 + "endquote missing punctuation?\n",
1.2005 + linecnt,g_utf8_pointer_to_offset(aline,s));
1.2006 else
1.2007 cnt_punct++;
1.2008 }
1.2009 @@ -2354,25 +2434,25 @@
1.2010 */
1.2011 void check_for_html_tag(const char *aline)
1.2012 {
1.2013 - int i;
1.2014 const char *open,*close;
1.2015 - open=strstr(aline,"<");
1.2016 + gchar *tag;
1.2017 + open=strchr(aline,'<');
1.2018 if (open)
1.2019 {
1.2020 - close=strstr(aline,">");
1.2021 + close=strchr(g_utf8_next_char(open),'>');
1.2022 if (close)
1.2023 {
1.2024 - i=(int)(close-open+1);
1.2025 - if (i>0)
1.2026 + if (pswit[ECHO_SWITCH])
1.2027 + g_print("\n%s\n",aline);
1.2028 + if (!pswit[OVERVIEW_SWITCH])
1.2029 {
1.2030 - if (pswit[ECHO_SWITCH])
1.2031 - printf("\n%s\n",aline);
1.2032 - if (!pswit[OVERVIEW_SWITCH])
1.2033 - printf(" Line %ld column %d - HTML Tag? %*.*s \n",
1.2034 - linecnt,(int)(open-aline)+1,i,i,open);
1.2035 - else
1.2036 - cnt_html++;
1.2037 + tag=g_strndup(open,close-open+1);
1.2038 + g_print(" Line %ld column %ld - HTML Tag? %s \n",
1.2039 + linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
1.2040 + g_free(tag);
1.2041 }
1.2042 + else
1.2043 + cnt_html++;
1.2044 }
1.2045 }
1.2046 }
1.2047 @@ -2387,25 +2467,28 @@
1.2048 */
1.2049 void check_for_html_entity(const char *aline)
1.2050 {
1.2051 - int i;
1.2052 const char *s,*amp,*scolon;
1.2053 - amp=strstr(aline,"&");
1.2054 + gchar *entity;
1.2055 + amp=strchr(aline,'&');
1.2056 if (amp)
1.2057 {
1.2058 - scolon=strstr(aline,";");
1.2059 + scolon=strchr(amp,';');
1.2060 if (scolon)
1.2061 {
1.2062 - i=(int)(scolon-amp+1);
1.2063 - for (s=amp;s<scolon;s++)
1.2064 - if (*s==CHAR_SPACE)
1.2065 - i=0; /* Don't report "Jones & Son;" */
1.2066 - if (i>0)
1.2067 + for (s=amp;s<scolon;s=g_utf8_next_char(s))
1.2068 + if (g_utf8_get_char(s)==CHAR_SPACE)
1.2069 + break; /* Don't report "Jones & Son;" */
1.2070 + if (s>=scolon)
1.2071 {
1.2072 if (pswit[ECHO_SWITCH])
1.2073 - printf("\n%s\n",aline);
1.2074 + g_print("\n%s\n",aline);
1.2075 if (!pswit[OVERVIEW_SWITCH])
1.2076 - printf(" Line %ld column %d - HTML symbol? %*.*s \n",
1.2077 - linecnt,(int)(amp-aline)+1,i,i,amp);
1.2078 + {
1.2079 + entity=g_strndup(amp,scolon-amp+1);
1.2080 + g_print(" Line %ld column %ld - HTML symbol? %s \n",
1.2081 + linecnt,g_utf8_pointer_to_offset(aline,amp)+1,entity);
1.2082 + g_free(entity);
1.2083 + }
1.2084 else
1.2085 cnt_html++;
1.2086 }
1.2087 @@ -2425,18 +2508,20 @@
1.2088 struct pending *pending)
1.2089 {
1.2090 const char *s;
1.2091 + gunichar c;
1.2092 s=aline;
1.2093 while (*s==' ')
1.2094 s++;
1.2095 + c=g_utf8_get_char(s);
1.2096 if (pending->dquote)
1.2097 {
1.2098 - if (*s!=CHAR_DQUOTE || pswit[QPARA_SWITCH])
1.2099 + if (c!=CHAR_DQUOTE || pswit[QPARA_SWITCH])
1.2100 {
1.2101 if (!pswit[OVERVIEW_SWITCH])
1.2102 {
1.2103 if (pswit[ECHO_SWITCH])
1.2104 - printf("\n%s\n",parastart);
1.2105 - puts(pending->dquote);
1.2106 + g_print("\n%s\n",parastart);
1.2107 + g_print("%s\n",pending->dquote);
1.2108 }
1.2109 else
1.2110 cnt_dquot++;
1.2111 @@ -2446,14 +2531,14 @@
1.2112 }
1.2113 if (pending->squote)
1.2114 {
1.2115 - if (*s!=CHAR_SQUOTE && *s!=CHAR_OPEN_SQUOTE || pswit[QPARA_SWITCH] ||
1.2116 + if (c!=CHAR_SQUOTE && c!=CHAR_OPEN_SQUOTE || pswit[QPARA_SWITCH] ||
1.2117 pending->squot)
1.2118 {
1.2119 if (!pswit[OVERVIEW_SWITCH])
1.2120 {
1.2121 if (pswit[ECHO_SWITCH])
1.2122 - printf("\n%s\n",parastart);
1.2123 - puts(pending->squote);
1.2124 + g_print("\n%s\n",parastart);
1.2125 + g_print("%s\n",pending->squote);
1.2126 }
1.2127 else
1.2128 cnt_squot++;
1.2129 @@ -2466,8 +2551,8 @@
1.2130 if (!pswit[OVERVIEW_SWITCH])
1.2131 {
1.2132 if (pswit[ECHO_SWITCH])
1.2133 - printf("\n%s\n",parastart);
1.2134 - puts(pending->rbrack);
1.2135 + g_print("\n%s\n",parastart);
1.2136 + g_print("%s\n",pending->rbrack);
1.2137 }
1.2138 else
1.2139 cnt_brack++;
1.2140 @@ -2479,8 +2564,8 @@
1.2141 if (!pswit[OVERVIEW_SWITCH])
1.2142 {
1.2143 if (pswit[ECHO_SWITCH])
1.2144 - printf("\n%s\n",parastart);
1.2145 - puts(pending->sbrack);
1.2146 + g_print("\n%s\n",parastart);
1.2147 + g_print("%s\n",pending->sbrack);
1.2148 }
1.2149 else
1.2150 cnt_brack++;
1.2151 @@ -2492,8 +2577,8 @@
1.2152 if (!pswit[OVERVIEW_SWITCH])
1.2153 {
1.2154 if (pswit[ECHO_SWITCH])
1.2155 - printf("\n%s\n",parastart);
1.2156 - puts(pending->cbrack);
1.2157 + g_print("\n%s\n",parastart);
1.2158 + g_print("%s\n",pending->cbrack);
1.2159 }
1.2160 else
1.2161 cnt_brack++;
1.2162 @@ -2505,8 +2590,8 @@
1.2163 if (!pswit[OVERVIEW_SWITCH])
1.2164 {
1.2165 if (pswit[ECHO_SWITCH])
1.2166 - printf("\n%s\n",parastart);
1.2167 - puts(pending->unders);
1.2168 + g_print("\n%s\n",parastart);
1.2169 + g_print("%s\n",pending->unders);
1.2170 }
1.2171 else
1.2172 cnt_brack++;
1.2173 @@ -2577,12 +2662,14 @@
1.2174 void check_for_omitted_punctuation(const char *prevline,
1.2175 struct line_properties *last,int start_para_line)
1.2176 {
1.2177 - int i;
1.2178 + gboolean letter_on_line=FALSE;
1.2179 const char *s;
1.2180 - for (s=prevline,i=0;*s && !i;s++)
1.2181 - if (gcisletter(*s))
1.2182 - /* use i to indicate the presence of a letter on the line */
1.2183 - i=1;
1.2184 + for (s=prevline;*s;s=g_utf8_next_char(s))
1.2185 + if (g_unichar_isalpha(g_utf8_get_char(s)))
1.2186 + {
1.2187 + letter_on_line=TRUE;
1.2188 + break;
1.2189 + }
1.2190 /*
1.2191 * This next "if" is a problem.
1.2192 * If we say "start_para_line <= linecnt - 1", that includes
1.2193 @@ -2590,28 +2677,30 @@
1.2194 * If we say "start_para_line < linecnt - 1" it doesn't, but then it
1.2195 * misses genuine one-line paragraphs.
1.2196 */
1.2197 - if (i && last->blen>2 && start_para_line<linecnt-1 && *prevline>CHAR_SPACE)
1.2198 + if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
1.2199 + g_utf8_get_char(prevline)>CHAR_SPACE)
1.2200 {
1.2201 - for (i=strlen(prevline)-1;
1.2202 - (prevline[i]==CHAR_DQUOTE || prevline[i]==CHAR_SQUOTE) &&
1.2203 - prevline[i]>CHAR_SPACE && i>0;
1.2204 - i--)
1.2205 + for (s=g_utf8_prev_char(prevline+strlen(prevline));
1.2206 + (g_utf8_get_char(s)==CHAR_DQUOTE ||
1.2207 + g_utf8_get_char(s)==CHAR_SQUOTE) &&
1.2208 + g_utf8_get_char(s)>CHAR_SPACE && s>prevline;
1.2209 + s=g_utf8_prev_char(s))
1.2210 ;
1.2211 - for (;i>0;i--)
1.2212 + for (;s>prevline;s=g_utf8_prev_char(s))
1.2213 {
1.2214 - if (gcisalpha(prevline[i]))
1.2215 + if (g_unichar_isalpha(g_utf8_get_char(s)))
1.2216 {
1.2217 if (pswit[ECHO_SWITCH])
1.2218 - printf("\n%s\n",prevline);
1.2219 + g_print("\n%s\n",prevline);
1.2220 if (!pswit[OVERVIEW_SWITCH])
1.2221 - printf(" Line %ld column %d - "
1.2222 + g_print(" Line %ld column %ld - "
1.2223 "No punctuation at para end?\n",
1.2224 - linecnt-1,(int)strlen(prevline));
1.2225 + linecnt-1,g_utf8_strlen(prevline,-1));
1.2226 else
1.2227 cnt_punct++;
1.2228 break;
1.2229 }
1.2230 - if (strchr("-.:!([{?}])",prevline[i]))
1.2231 + if (g_utf8_strchr("-.:!([{?}])",-1,g_utf8_get_char(s)))
1.2232 break;
1.2233 }
1.2234 }
1.2235 @@ -2622,11 +2711,38 @@
1.2236 const char *word=key;
1.2237 int *dupcnt=value;
1.2238 if (*dupcnt)
1.2239 - printf("\nNote: Queried word %s was duplicated %d times\n",
1.2240 + g_print("\nNote: Queried word %s was duplicated %d times\n",
1.2241 word,*dupcnt);
1.2242 return FALSE;
1.2243 }
1.2244
1.2245 +void print_as_windows_1252(const char *string)
1.2246 +{
1.2247 + gsize inbytes,outbytes;
1.2248 + gchar *buf,*bp;
1.2249 + static GIConv converter=(GIConv)-1; /* cached between calls */
1.2250 + if (!string) /* NULL means: release the cached converter */
1.2251 + {
1.2252 + if (converter!=(GIConv)-1)
1.2253 + g_iconv_close(converter);
1.2254 + converter=(GIConv)-1;
1.2255 + return;
1.2256 + }
1.2257 + if (converter==(GIConv)-1) /* open lazily on first use */
1.2258 + converter=g_iconv_open("WINDOWS-1252","UTF-8");
1.2259 + if (converter!=(GIConv)-1)
1.2260 + {
1.2261 + inbytes=outbytes=strlen(string);
1.2262 + bp=buf=g_malloc(outbytes+1);
1.2263 + g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
1.2264 + *bp='\0';
1.2265 + fputs(buf,stdout);
1.2266 + g_free(buf);
1.2267 + }
1.2268 + else /* no converter available: emit the UTF-8 bytes unchanged */
1.2269 + fputs(string,stdout);
1.2270 +}
1.2271 +
1.2272 /*
1.2273 * procfile:
1.2274 *
1.2275 @@ -2659,7 +2775,8 @@
1.2276 fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
1.2277 exit(1);
1.2278 }
1.2279 - fprintf(stdout,"\n\nFile: %s\n\n",filename);
1.2280 + g_set_print_handler(print_as_windows_1252);
1.2281 + g_print("\n\nFile: %s\n\n",filename);
1.2282 first_pass_results=first_pass(etext);
1.2283 warnings=report_first_pass(first_pass_results);
1.2284 qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
1.2285 @@ -2674,7 +2791,7 @@
1.2286 linecnt++;
1.2287 if (linecnt==1)
1.2288 isnewpara=TRUE;
1.2289 - if (pswit[DP_SWITCH] && !strncmp(aline,"-----File: ",11))
1.2290 + if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
1.2291 continue; // skip DP page separators completely
1.2292 if (linecnt<first_pass_results->firstline ||
1.2293 (first_pass_results->footerline>0 &&
1.2294 @@ -2682,14 +2799,14 @@
1.2295 {
1.2296 if (pswit[HEADER_SWITCH])
1.2297 {
1.2298 - if (!strncmp(aline,"Title:",6))
1.2299 - printf(" %s\n",aline);
1.2300 - if (!strncmp(aline,"Author:",7))
1.2301 - printf(" %s\n",aline);
1.2302 - if (!strncmp(aline,"Release Date:",13))
1.2303 - printf(" %s\n",aline);
1.2304 - if (!strncmp(aline,"Edition:",8))
1.2305 - printf(" %s\n\n",aline);
1.2306 + if (g_str_has_prefix(aline,"Title:"))
1.2307 + g_print(" %s\n",aline);
1.2308 + if (g_str_has_prefix(aline,"Author:"))
1.2309 + g_print(" %s\n",aline);
1.2310 + if (g_str_has_prefix(aline,"Release Date:"))
1.2311 + g_print(" %s\n",aline);
1.2312 + if (g_str_has_prefix(aline,"Edition:"))
1.2313 + g_print(" %s\n\n",aline);
1.2314 }
1.2315 continue; /* skip through the header */
1.2316 }
1.2317 @@ -2706,36 +2823,38 @@
1.2318 parastart=g_strdup(aline);
1.2319 memset(&parities,0,sizeof(parities)); /* restart the quote count */
1.2320 s=aline;
1.2321 - while (!gcisalpha(*s) && !gcisdigit(*s) && *s)
1.2322 - s++;
1.2323 - if (*s>='a' && *s<='z')
1.2324 + while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
1.2325 + !g_unichar_isdigit(g_utf8_get_char(s)))
1.2326 + s=g_utf8_next_char(s);
1.2327 + if (g_unichar_islower(g_utf8_get_char(s)))
1.2328 {
1.2329 /* and its first letter is lowercase */
1.2330 if (pswit[ECHO_SWITCH])
1.2331 - printf("\n%s\n",aline);
1.2332 + g_print("\n%s\n",aline);
1.2333 if (!pswit[OVERVIEW_SWITCH])
1.2334 - printf(" Line %ld column %d - "
1.2335 + g_print(" Line %ld column %ld - "
1.2336 "Paragraph starts with lower-case\n",
1.2337 - linecnt,(int)(s-aline)+1);
1.2338 + linecnt,g_utf8_pointer_to_offset(aline,s)+1);
1.2339 else
1.2340 cnt_punct++;
1.2341 }
1.2342 isnewpara=FALSE; /* Signal the end of new para processing. */
1.2343 }
1.2344 /* Check for an em-dash broken at line end. */
1.2345 - if (enddash && *aline=='-')
1.2346 + if (enddash && g_utf8_get_char(aline)=='-')
1.2347 {
1.2348 if (pswit[ECHO_SWITCH])
1.2349 - printf("\n%s\n",aline);
1.2350 + g_print("\n%s\n",aline);
1.2351 if (!pswit[OVERVIEW_SWITCH])
1.2352 - printf(" Line %ld column 1 - Broken em-dash?\n",linecnt);
1.2353 + g_print(" Line %ld column 1 - Broken em-dash?\n",linecnt);
1.2354 else
1.2355 cnt_punct++;
1.2356 }
1.2357 enddash=FALSE;
1.2358 - for (s=aline+strlen(aline)-1;*s==' ' && s>aline;s--)
1.2359 + for (s=g_utf8_prev_char(aline+strlen(aline));
1.2360 + g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
1.2361 ;
1.2362 - if (s>=aline && *s=='-')
1.2363 + if (s>=aline && g_utf8_get_char(s)=='-')
1.2364 enddash=TRUE;
1.2365 check_for_control_characters(aline);
1.2366 if (warnings->bin)
1.2367 @@ -2745,8 +2864,8 @@
1.2368 if (warnings->shortline)
1.2369 check_for_short_line(aline,&last);
1.2370 last.blen=last.len;
1.2371 - last.len=strlen(aline);
1.2372 - last.start=aline[0];
1.2373 + last.len=g_utf8_strlen(aline,-1);
1.2374 + last.start=g_utf8_get_char(aline);
1.2375 check_for_starting_punctuation(aline);
1.2376 if (warnings->dash)
1.2377 {
1.2378 @@ -2795,6 +2914,8 @@
1.2379 g_tree_foreach(qword,report_duplicate_queries,NULL);
1.2380 g_tree_unref(qword);
1.2381 g_tree_unref(qperiod);
1.2382 + g_set_print_handler(NULL);
1.2383 + print_as_windows_1252(NULL);
1.2384 }
1.2385
1.2386 /*
1.2387 @@ -2807,14 +2928,15 @@
1.2388 */
1.2389 char *flgets(char **etext,long lcnt)
1.2390 {
1.2391 - char c;
1.2392 - int len;
1.2393 + gunichar c;
1.2394 gboolean isCR=FALSE;
1.2395 char *theline=*etext;
1.2396 - len=0;
1.2397 - for(;;)
1.2398 + char *eos=theline;
1.2399 + gchar *s;
1.2400 + for (;;)
1.2401 {
1.2402 - c=*(*etext)++;
1.2403 + c=g_utf8_get_char(*etext);
1.2404 + *etext=g_utf8_next_char(*etext);
1.2405 if (!c)
1.2406 return NULL;
1.2407 /* either way, it's end of line */
1.2408 @@ -2828,9 +2950,13 @@
1.2409 if (pswit[LINE_END_SWITCH])
1.2410 {
1.2411 if (pswit[ECHO_SWITCH])
1.2412 - printf("\n%*.*s\n",len,len,theline);
1.2413 + {
1.2414 + s=g_strndup(theline,eos-theline);
1.2415 + g_print("\n%s\n",s);
1.2416 + g_free(s);
1.2417 + }
1.2418 if (!pswit[OVERVIEW_SWITCH])
1.2419 - printf(" Line %ld - No CR?\n",lcnt);
1.2420 + g_print(" Line %ld - No CR?\n",lcnt);
1.2421 else
1.2422 cnt_lineend++;
1.2423 }
1.2424 @@ -2845,9 +2971,13 @@
1.2425 if (pswit[LINE_END_SWITCH])
1.2426 {
1.2427 if (pswit[ECHO_SWITCH])
1.2428 - printf("\n%*.*s\n",len,len,theline);
1.2429 + {
1.2430 + s=g_strndup(theline,eos-theline);
1.2431 + g_print("\n%s\n",s);
1.2432 + g_free(s);
1.2433 + }
1.2434 if (!pswit[OVERVIEW_SWITCH])
1.2435 - printf(" Line %ld - Two successive CRs?\n",lcnt);
1.2436 + g_print(" Line %ld - Two successive CRs?\n",lcnt);
1.2437 else
1.2438 cnt_lineend++;
1.2439 }
1.2440 @@ -2859,19 +2989,23 @@
1.2441 if (pswit[LINE_END_SWITCH] && isCR)
1.2442 {
1.2443 if (pswit[ECHO_SWITCH])
1.2444 - printf("\n%*.*s\n",len,len,theline);
1.2445 + {
1.2446 + s=g_strndup(theline,eos-theline);
1.2447 + g_print("\n%s\n",s);
1.2448 + g_free(s);
1.2449 + }
1.2450 if (!pswit[OVERVIEW_SWITCH])
1.2451 - printf(" Line %ld column %d - CR without LF?\n",
1.2452 - lcnt,len+1);
1.2453 + g_print(" Line %ld column %ld - CR without LF?\n",
1.2454 + lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
1.2455 else
1.2456 cnt_lineend++;
1.2457 - theline[len]=' ';
1.2458 + *eos=' ';
1.2459 }
1.2460 isCR=FALSE;
1.2461 - len++;
1.2462 + eos=g_utf8_next_char(eos);
1.2463 }
1.2464 }
1.2465 - theline[len]='\0';
1.2466 + *eos='\0';
1.2467 if (pswit[MARKUP_SWITCH])
1.2468 postprocess_for_HTML(theline);
1.2469 if (pswit[DP_SWITCH])
1.2470 @@ -2886,55 +3020,55 @@
1.2471 * contains a mixture of alpha and digits. Generally, this is an
1.2472 * error, but may not be for cases like 4th or L5 12s. 3d.
1.2473 *
1.2474 - * Returns: 0 if no error found, 1 if error.
1.2475 + * Returns: TRUE iff an error is found.
1.2476 */
1.2477 -int mixdigit(const char *checkword)
1.2478 +gboolean mixdigit(const char *checkword)
1.2479 {
1.2480 - int wehaveadigit,wehavealetter,firstdigits,query,wl;
1.2481 - const char *s;
1.2482 - wehaveadigit=wehavealetter=query=0;
1.2483 - for (s=checkword;*s;s++)
1.2484 - if (gcisalpha(*s))
1.2485 - wehavealetter=1;
1.2486 - else
1.2487 - if (gcisdigit(*s))
1.2488 - wehaveadigit=1;
1.2489 + gboolean wehaveadigit,wehavealetter,query;
1.2490 + const char *s,*nondigit;
1.2491 + wehaveadigit=wehavealetter=query=FALSE;
1.2492 + for (s=checkword;*s;s=g_utf8_next_char(s))
1.2493 + if (g_unichar_isalpha(g_utf8_get_char(s)))
1.2494 + wehavealetter=TRUE;
1.2495 + else if (g_unichar_isdigit(g_utf8_get_char(s)))
1.2496 + wehaveadigit=TRUE;
1.2497 if (wehaveadigit && wehavealetter)
1.2498 {
1.2499 /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
1.2500 - query=1;
1.2501 - wl=strlen(checkword);
1.2502 - for (firstdigits=0;gcisdigit(checkword[firstdigits]);firstdigits++)
1.2503 + query=TRUE;
1.2504 + for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
1.2505 + nondigit=g_utf8_next_char(nondigit))
1.2506 ;
1.2507 /* digits, ending in st, rd, nd, th of either case */
1.2508 - if (firstdigits+2==wl && (!g_ascii_strcasecmp(checkword+wl-2,"st") ||
1.2509 - !g_ascii_strcasecmp(checkword+wl-2,"rd") ||
1.2510 - !g_ascii_strcasecmp(checkword+wl-2,"nd") ||
1.2511 - !g_ascii_strcasecmp(checkword+wl-2,"th")))
1.2512 - query=0;
1.2513 - if (firstdigits+3==wl && (!g_ascii_strcasecmp(checkword+wl-3,"sts") ||
1.2514 - !g_ascii_strcasecmp(checkword+wl-3,"rds") ||
1.2515 - !g_ascii_strcasecmp(checkword+wl-3,"nds") ||
1.2516 - !g_ascii_strcasecmp(checkword+wl-3,"ths")))
1.2517 - query=0;
1.2518 - if (firstdigits+3==wl && (!g_ascii_strcasecmp(checkword+wl-4,"stly") ||
1.2519 - !g_ascii_strcasecmp(checkword+wl-4,"rdly") ||
1.2520 - !g_ascii_strcasecmp(checkword+wl-4,"ndly") ||
1.2521 - !g_ascii_strcasecmp(checkword+wl-4,"thly")))
1.2522 - query=0;
1.2523 + if (!g_ascii_strcasecmp(nondigit,"st") ||
1.2524 + !g_ascii_strcasecmp(nondigit,"rd") ||
1.2525 + !g_ascii_strcasecmp(nondigit,"nd") ||
1.2526 + !g_ascii_strcasecmp(nondigit,"th"))
1.2527 + query=FALSE;
1.2528 + if (!g_ascii_strcasecmp(nondigit,"sts") ||
1.2529 + !g_ascii_strcasecmp(nondigit,"rds") ||
1.2530 + !g_ascii_strcasecmp(nondigit,"nds") ||
1.2531 + !g_ascii_strcasecmp(nondigit,"ths"))
1.2532 + query=FALSE;
1.2533 + if (!g_ascii_strcasecmp(nondigit,"stly") ||
1.2534 + !g_ascii_strcasecmp(nondigit,"rdly") ||
1.2535 + !g_ascii_strcasecmp(nondigit,"ndly") ||
1.2536 + !g_ascii_strcasecmp(nondigit,"thly"))
1.2537 + query=FALSE;
1.2538 /* digits, ending in l, L, s or d */
1.2539 - if (firstdigits+1==wl && (checkword[wl-1]=='l' ||
1.2540 - checkword[wl-1]=='L' || checkword[wl-1]=='s' || checkword[wl-1]=='d'))
1.2541 - query=0;
1.2542 + if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
1.2543 + !strcmp(nondigit,"d"))
1.2544 + query=FALSE;
1.2545 /*
1.2546 * L at the start of a number, representing Britsh pounds, like L500.
1.2547 - * This is cute. We know the current word is mixeddigit. If the first
1.2548 + * This is cute. We know the current word is mixed digit. If the first
1.2549 * letter is L, there must be at least one digit following. If both
1.2550 * digits and letters follow, we have a genuine error, else we have a
1.2551 * capital L followed by digits, and we accept that as a non-error.
1.2552 */
1.2553 - if (checkword[0]=='L' && !mixdigit(checkword+1))
1.2554 - query=0;
1.2555 + if (g_utf8_get_char(checkword)=='L' &&
1.2556 + !mixdigit(g_utf8_next_char(checkword)))
1.2557 + query=FALSE;
1.2558 }
1.2559 return query;
1.2560 }
1.2561 @@ -2951,11 +3085,13 @@
1.2562 */
1.2563 gchar *getaword(const char **ptr)
1.2564 {
1.2565 - int i;
1.2566 - const char *s;
1.2567 + const char *s,*t;
1.2568 GString *word;
1.2569 + gunichar c,pc;
1.2570 word=g_string_new(NULL);
1.2571 - for (;!gcisdigit(**ptr) && !gcisalpha(**ptr) && **ptr;(*ptr)++)
1.2572 + for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
1.2573 + !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
1.2574 + **ptr;*ptr=g_utf8_next_char(*ptr))
1.2575 ;
1.2576 /*
1.2577 * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
1.2578 @@ -2966,23 +3102,27 @@
1.2579 * the results and resume our normal programming.
1.2580 */
1.2581 s=*ptr;
1.2582 - for (;gcisdigit(*s) || gcisalpha(*s) || *s==',' || *s=='.';s++)
1.2583 - g_string_append_c(word,*s);
1.2584 - for (i=1;i+1<word->len;i++)
1.2585 + for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
1.2586 + g_unichar_isalpha(g_utf8_get_char(s)) ||
1.2587 + g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
1.2588 + g_string_append_unichar(word,g_utf8_get_char(s));
1.2589 + for (t=*word->str?g_utf8_next_char(word->str):word->str;
1.2590 + *t && *g_utf8_next_char(t);t=g_utf8_next_char(t))
1.2591 {
1.2592 - if (word->str[i]=='.' || word->str[i]==',')
1.2593 + c=g_utf8_get_char(t);
1.2594 + pc=g_utf8_get_char(g_utf8_prev_char(t));
1.2595 + if ((c=='.' || c==',') && g_unichar_isdigit(pc))
1.2596 {
1.2597 - if (gcisdigit(word->str[i-1]) && gcisdigit(word->str[i-1]))
1.2598 - {
1.2599 - *ptr=s;
1.2600 - return g_string_free(word,FALSE);
1.2601 - }
1.2602 + *ptr=s;
1.2603 + return g_string_free(word,FALSE);
1.2604 }
1.2605 }
1.2606 /* we didn't find a punctuated number - do the regular getword thing */
1.2607 g_string_truncate(word,0);
1.2608 - for (;gcisdigit(**ptr) || gcisalpha(**ptr) || **ptr=='\'';(*ptr)++)
1.2609 - g_string_append_c(word,**ptr);
1.2610 + for (;g_unichar_isdigit(g_utf8_get_char(*ptr)) ||
1.2611 + g_unichar_isalpha(g_utf8_get_char(*ptr)) ||
1.2612 + g_utf8_get_char(*ptr)=='\'';*ptr=g_utf8_next_char(*ptr))
1.2613 + g_string_append_unichar(word,g_utf8_get_char(*ptr));
1.2614 return g_string_free(word,FALSE);
1.2615 }
1.2616
1.2617 @@ -3006,82 +3146,36 @@
1.2618 if (!t || !*t)
1.2619 return FALSE;
1.2620 s=t;
1.2621 - while (*t=='m' && *t)
1.2622 + while (g_utf8_get_char(t)=='m' && *t)
1.2623 t++;
1.2624 - if (*t=='d')
1.2625 + if (g_utf8_get_char(t)=='d')
1.2626 t++;
1.2627 - if (*t=='c' && t[1]=='m')
1.2628 + if (g_str_has_prefix(t,"cm"))
1.2629 t+=2;
1.2630 - if (*t=='c' && t[1]=='d')
1.2631 + if (g_str_has_prefix(t,"cd"))
1.2632 t+=2;
1.2633 - while (*t=='c' && *t)
1.2634 + while (g_utf8_get_char(t)=='c' && *t)
1.2635 t++;
1.2636 - if (*t=='x' && t[1]=='l')
1.2637 + if (g_str_has_prefix(t,"xl"))
1.2638 t+=2;
1.2639 - if (*t=='x' && t[1]=='c')
1.2640 + if (g_str_has_prefix(t,"xc"))
1.2641 t+=2;
1.2642 - if (*t=='l')
1.2643 + if (g_utf8_get_char(t)=='l')
1.2644 t++;
1.2645 - while (*t=='x' && *t)
1.2646 + while (g_utf8_get_char(t)=='x' && *t)
1.2647 t++;
1.2648 - if (*t=='i' && t[1]=='x')
1.2649 + if (g_str_has_prefix(t,"ix"))
1.2650 t+=2;
1.2651 - if (*t=='i' && t[1]=='v')
1.2652 + if (g_str_has_prefix(t,"iv"))
1.2653 t+=2;
1.2654 - if (*t=='v')
1.2655 + if (g_utf8_get_char(t)=='v')
1.2656 t++;
1.2657 - while (*t=='i' && *t)
1.2658 + while (g_utf8_get_char(t)=='i' && *t)
1.2659 t++;
1.2660 return !*t;
1.2661 }
1.2662
1.2663 /*
1.2664 - * gcisalpha:
1.2665 - *
1.2666 - * A version of isalpha() that is somewhat lenient on 8-bit texts.
1.2667 - * If we use the standard function, 8-bit accented characters break
1.2668 - * words, so that tete with accented characters appears to be two words, "t"
1.2669 - * and "t", with 8-bit characters between them. This causes over-reporting of
1.2670 - * errors. gcisalpha() recognizes accented letters from the CP1252 (Windows)
1.2671 - * and ISO-8859-1 character sets, which are the most common PG 8-bit types.
1.2672 - */
1.2673 -gboolean gcisalpha(unsigned char c)
1.2674 -{
1.2675 - if (c>='a' && c<='z')
1.2676 - return TRUE;
1.2677 - if (c>='A' && c<='Z')
1.2678 - return TRUE;
1.2679 - if (c<140)
1.2680 - return FALSE;
1.2681 - if (c>=192 && c!=208 && c!=215 && c!=222 && c!=240 && c!=247 && c!=254)
1.2682 - return TRUE;
1.2683 - if (c==140 || c==142 || c==156 || c==158 || c==159)
1.2684 - return TRUE;
1.2685 - return FALSE;
1.2686 -}
1.2687 -
1.2688 -/*
1.2689 - * gcisdigit:
1.2690 - *
1.2691 - * A version of isdigit() that doesn't get confused in 8-bit texts.
1.2692 - */
1.2693 -gboolean gcisdigit(unsigned char c)
1.2694 -{
1.2695 - return c>='0' && c<='9';
1.2696 -}
1.2697 -
1.2698 -/*
1.2699 - * gcisletter:
1.2700 - *
1.2701 - * A version of isletter() that doesn't get confused in 8-bit texts.
1.2702 - * NB: this is ISO-8891-1-specific.
1.2703 - */
1.2704 -gboolean gcisletter(unsigned char c)
1.2705 -{
1.2706 - return c>='A' && c<='Z' || c>='a' && c<='z' || c>=192;
1.2707 -}
1.2708 -
1.2709 -/*
1.2710 * postprocess_for_DP:
1.2711 *
1.2712 * Invoked with the -d switch from flgets().
1.2713 @@ -3096,21 +3190,11 @@
1.2714 if (!*theline)
1.2715 return;
1.2716 for (i=0;*DPmarkup[i];i++)
1.2717 - {
1.2718 - s=strstr(theline,DPmarkup[i]);
1.2719 - while (s)
1.2720 + while ((s=strstr(theline,DPmarkup[i])))
1.2721 {
1.2722 t=s+strlen(DPmarkup[i]);
1.2723 - while (*t)
1.2724 - {
1.2725 - *s=*t;
1.2726 - t++;
1.2727 - s++;
1.2728 - }
1.2729 - *s=0;
1.2730 - s=strstr(theline,DPmarkup[i]);
1.2731 + memmove(s,t,strlen(t)+1);
1.2732 }
1.2733 - }
1.2734 }
1.2735
1.2736 /*
1.2737 @@ -3124,9 +3208,8 @@
1.2738 */
1.2739 void postprocess_for_HTML(char *theline)
1.2740 {
1.2741 - if (strchr(theline,'<') && strchr(theline,'>'))
1.2742 - while (losemarkup(theline))
1.2743 - ;
1.2744 + while (losemarkup(theline))
1.2745 + ;
1.2746 while (loseentities(theline))
1.2747 ;
1.2748 }
1.2749 @@ -3135,25 +3218,16 @@
1.2750 {
1.2751 char *s,*t;
1.2752 int i;
1.2753 - if (!*theline)
1.2754 - return NULL;
1.2755 - s=strstr(theline,"<");
1.2756 - t=strstr(theline,">");
1.2757 + s=strchr(theline,'<');
1.2758 + t=s?strchr(s,'>'):NULL;
1.2759 if (!s || !t)
1.2760 return NULL;
1.2761 for (i=0;*markup[i];i++)
1.2762 - if (!tagcomp(s+1,markup[i]))
1.2763 + if (tagcomp(g_utf8_next_char(s),markup[i]))
1.2764 {
1.2765 - if (!t[1])
1.2766 - {
1.2767 - *s=0;
1.2768 - return s;
1.2769 - }
1.2770 - else if (t>s)
1.2771 - {
1.2772 - strcpy(s,t+1);
1.2773 - return s;
1.2774 - }
1.2775 + t=g_utf8_next_char(t);
1.2776 + memmove(s,t,strlen(t)+1);
1.2777 + return s;
1.2778 }
1.2779 /* It's an unrecognized <xxx>. */
1.2780 return NULL;
1.2781 @@ -3170,13 +3244,10 @@
1.2782 s=strstr(theline,entities[i].htmlent);
1.2783 if (s)
1.2784 {
1.2785 - t=malloc((size_t)strlen(s));
1.2786 - if (!t)
1.2787 - return NULL;
1.2788 - strcpy(t,s+strlen(entities[i].htmlent));
1.2789 + t=g_strdup(s+strlen(entities[i].htmlent));
1.2790 strcpy(s,entities[i].textent);
1.2791 strcat(s,t);
1.2792 - free(t);
1.2793 + g_free(t);
1.2794 return theline;
1.2795 }
1.2796 }
1.2797 @@ -3185,34 +3256,29 @@
1.2798 s=strstr(theline,entities[i].htmlnum);
1.2799 if (s)
1.2800 {
1.2801 - t=malloc((size_t)strlen(s));
1.2802 - if (!t)
1.2803 - return NULL;
1.2804 - strcpy(t,s+strlen(entities[i].htmlnum));
1.2805 + t=g_strdup(s+strlen(entities[i].htmlnum));
1.2806 strcpy(s,entities[i].textent);
1.2807 strcat(s,t);
1.2808 - free(t);
1.2809 + g_free(t);
1.2810 return theline;
1.2811 }
1.2812 }
1.2813 return NULL;
1.2814 }
1.2815
1.2816 -int tagcomp(const char *strin,const char *basetag)
1.2817 +gboolean tagcomp(const char *strin,const char *basetag)
1.2818 {
1.2819 - const char *s,*t;
1.2820 - s=basetag;
1.2821 - t=strin;
1.2822 - if (*t=='/')
1.2823 - t++; /* ignore a slash */
1.2824 - while (*s && *t)
1.2825 - {
1.2826 - if (tolower(*s)!=tolower(*t))
1.2827 - return 1;
1.2828 - s++;
1.2829 - t++;
1.2830 - }
1.2831 - return 0;
1.2832 + gboolean retval;
1.2833 + gchar *s,*t;
1.2834 + if (g_utf8_get_char(strin)=='/')
1.2835 + t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
1.2836 + else
1.2837 + t=g_utf8_casefold(strin,-1);
1.2838 + s=g_utf8_casefold(basetag,-1);
1.2839 + retval=g_str_has_prefix(t,s);
1.2840 + g_free(s);
1.2841 + g_free(t);
1.2842 + return retval;
1.2843 }
1.2844
1.2845 void proghelp(GOptionContext *context)