Switch to using UTF-8 internally
author ali <ali@juiblex.co.uk>
Thu May 30 07:31:24 2013 +0100 (2013-05-30)
changeset 70 aa916da2e452
parent 69 1016349e619f
child 71 82d3cc398b54
Switch to using UTF-8 internally
bookloupe/bookloupe.c
     1.1 --- a/bookloupe/bookloupe.c	Tue May 28 15:17:19 2013 +0100
     1.2 +++ b/bookloupe/bookloupe.c	Thu May 30 07:31:24 2013 +0100
     1.3 @@ -119,8 +119,6 @@
     1.4      "among", "those", "into", "whom", "having", "thence", ""
     1.5  }; 
     1.6  
     1.7 -char vowels[] = "aeiouàáâãäæèéêëìíîïòóôõöùúûü";
     1.8 -
     1.9  struct {
    1.10      char *htmlent;
    1.11      char *htmlnum;
    1.12 @@ -347,16 +345,13 @@
    1.13  
    1.14  gchar *running_from;
    1.15  
    1.16 -int mixdigit(const char *);
    1.17 +gboolean mixdigit(const char *);
    1.18  gchar *getaword(const char **);
    1.19  char *flgets(char **,long);
    1.20 -gboolean gcisalpha(unsigned char);
    1.21 -gboolean gcisdigit(unsigned char);
    1.22 -gboolean gcisletter(unsigned char);
    1.23  void postprocess_for_HTML(char *);
    1.24  char *linehasmarkup(char *);
    1.25  char *losemarkup(char *);
    1.26 -int tagcomp(const char *,const char *);
    1.27 +gboolean tagcomp(const char *,const char *);
    1.28  char *loseentities(char *);
    1.29  gboolean isroman(const char *);
    1.30  void postprocess_for_DP(char *);
    1.31 @@ -385,7 +380,7 @@
    1.32  
    1.33  struct line_properties {
    1.34      unsigned int len,blen;
    1.35 -    char start;
    1.36 +    gunichar start;
    1.37  };
    1.38  
    1.39  struct parities {
    1.40 @@ -462,8 +457,8 @@
    1.41      gchar *usertypo_file;
    1.42      gboolean okay;
    1.43      int i;
    1.44 -    gsize len;
    1.45 -    gchar *contents,**lines;
    1.46 +    gsize len,nb;
    1.47 +    gchar *contents,*utf8,**lines;
    1.48      usertypo_file=g_strdup("bookloupe.typ");
    1.49      okay=file_get_contents_text(usertypo_file,&contents,&len,&err);
    1.50      if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
    1.51 @@ -490,7 +485,7 @@
    1.52      if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT))
    1.53      {
    1.54  	g_free(usertypo_file);
    1.55 -	printf("   --> I couldn't find bookloupe.typ "
    1.56 +	g_print("   --> I couldn't find bookloupe.typ "
    1.57  	  "-- proceeding without user typos.\n");
    1.58  	return;
    1.59      }
    1.60 @@ -501,7 +496,10 @@
    1.61  	g_clear_error(&err);
    1.62  	exit(1);
    1.63      }
    1.64 -    lines=g_strsplit(contents,"\n",0);
    1.65 +    utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
    1.66 +    g_free(contents);
    1.67 +    lines=g_strsplit_set(utf8,"\r\n",0);
    1.68 +    g_free(utf8);
    1.69      usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL);
    1.70      for (i=0;lines[i];i++)
    1.71  	if (*(unsigned char *)lines[i]>'!')
    1.72 @@ -511,49 +509,6 @@
    1.73      g_free(lines);
    1.74  }
    1.75  
    1.76 -#if 0
    1.77 -/*
    1.78 - * read_etext:
    1.79 - *
    1.80 - * Read an etext returning an array of lines. Lines are normally expected
    1.81 - * to be terminated by CR LF. Solitary LFs delimit lines but are left
    1.82 - * embedded at the end of the line for further processing. Solitary CRs
    1.83 - * do not delimit lines.
    1.84 - */
    1.85 -gchar **read_etext(const char *filename,GError **err)
    1.86 -{
    1.87 -    int i;
    1.88 -    const char *s,*t;
    1.89 -    gchar *contents;
    1.90 -    gchar **raw_lines;
    1.91 -    GPtrArray *lines;
    1.92 -    gsize len;
    1.93 -    if (!g_file_get_contents(filename,&contents,&len,err))
    1.94 -	return NULL;
    1.95 -    raw_lines=g_strsplit(contents,"\r\n",0);
    1.96 -    lines=g_ptr_array_sized_new(g_strv_length(raw_lines)+1);
    1.97 -    for (i=0;raw_lines[i];i++)
    1.98 -    {
    1.99 -	t=strchr(raw_lines[i],'\n');
   1.100 -	if (t)
   1.101 -	{
   1.102 -	    s=raw_lines[i];
   1.103 -	    while ((t=strchr(s,'\n')))
   1.104 -	    {
   1.105 -		g_ptr_array_add(lines,g_strndup(s,t-s+1));
   1.106 -		s=t+1;
   1.107 -	    }
   1.108 -	    g_ptr_array_add(lines,g_strdup(s));
   1.109 -	    g_free(raw_lines[i]);
   1.110 -	}
   1.111 -	else
   1.112 -	    g_ptr_array_add(lines,raw_lines[i]);
   1.113 -    }
   1.114 -    g_free(raw_lines);
   1.115 -    g_ptr_array_add(lines,NULL);
   1.116 -    return (gchar **)g_ptr_array_free(lines,FALSE);
   1.117 -}
   1.118 -#else
   1.119  /*
   1.120   * read_etext:
   1.121   *
   1.122 @@ -562,13 +517,14 @@
   1.123   */
   1.124  gchar *read_etext(const char *filename,GError **err)
   1.125  {
   1.126 -    gchar *contents;
   1.127 -    gsize len;
   1.128 +    gchar *contents,*utf8;
   1.129 +    gsize len,nb;
   1.130      if (!g_file_get_contents(filename,&contents,&len,err))
   1.131  	return NULL;
   1.132 -    return contents;
   1.133 +    utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
   1.134 +    g_free(contents);
   1.135 +    return utf8;
   1.136  }
   1.137 -#endif
   1.138  
   1.139  int main(int argc,char **argv)
   1.140  {
   1.141 @@ -580,35 +536,35 @@
   1.142      procfile(argv[1]);
   1.143      if (pswit[OVERVIEW_SWITCH])
   1.144      {
   1.145 -	printf("    Checked %ld lines of %ld (head+foot = %ld)\n\n",
   1.146 +	g_print("    Checked %ld lines of %ld (head+foot = %ld)\n\n",
   1.147  	  checked_linecnt,linecnt,linecnt-checked_linecnt);
   1.148 -	printf("    --------------- Queries found --------------\n");
   1.149 +	g_print("    --------------- Queries found --------------\n");
   1.150  	if (cnt_long)
   1.151 -	    printf("    Long lines:		    %14ld\n",cnt_long);
   1.152 +	    g_print("    Long lines:		    %14ld\n",cnt_long);
   1.153  	if (cnt_short)
   1.154 -	    printf("    Short lines:		   %14ld\n",cnt_short);
   1.155 +	    g_print("    Short lines:		   %14ld\n",cnt_short);
   1.156  	if (cnt_lineend)
   1.157 -	    printf("    Line-end problems:	     %14ld\n",cnt_lineend);
   1.158 +	    g_print("    Line-end problems:	     %14ld\n",cnt_lineend);
   1.159  	if (cnt_word)
   1.160 -	    printf("    Common typos:		  %14ld\n",cnt_word);
   1.161 +	    g_print("    Common typos:		  %14ld\n",cnt_word);
   1.162  	if (cnt_dquot)
   1.163 -	    printf("    Unmatched quotes:	      %14ld\n",cnt_dquot);
   1.164 +	    g_print("    Unmatched quotes:	      %14ld\n",cnt_dquot);
   1.165  	if (cnt_squot)
   1.166 -	    printf("    Unmatched SingleQuotes:	%14ld\n",cnt_squot);
   1.167 +	    g_print("    Unmatched SingleQuotes:	%14ld\n",cnt_squot);
   1.168  	if (cnt_brack)
   1.169 -	    printf("    Unmatched brackets:	    %14ld\n",cnt_brack);
   1.170 +	    g_print("    Unmatched brackets:	    %14ld\n",cnt_brack);
   1.171  	if (cnt_bin)
   1.172 -	    printf("    Non-ASCII characters:	  %14ld\n",cnt_bin);
   1.173 +	    g_print("    Non-ASCII characters:	  %14ld\n",cnt_bin);
   1.174  	if (cnt_odd)
   1.175 -	    printf("    Proofing characters:	   %14ld\n",cnt_odd);
   1.176 +	    g_print("    Proofing characters:	   %14ld\n",cnt_odd);
   1.177  	if (cnt_punct)
   1.178 -	    printf("    Punctuation & spacing queries: %14ld\n",cnt_punct);
   1.179 +	    g_print("    Punctuation & spacing queries: %14ld\n",cnt_punct);
   1.180  	if (cnt_dash)
   1.181 -	    printf("    Non-standard dashes:	   %14ld\n",cnt_dash);
   1.182 +	    g_print("    Non-standard dashes:	   %14ld\n",cnt_dash);
   1.183  	if (cnt_html)
   1.184 -	    printf("    Possible HTML tags:	    %14ld\n",cnt_html);
   1.185 -	printf("\n");
   1.186 -	printf("    TOTAL QUERIES		  %14ld\n",
   1.187 +	    g_print("    Possible HTML tags:	    %14ld\n",cnt_html);
   1.188 +	g_print("\n");
   1.189 +	g_print("    TOTAL QUERIES		  %14ld\n",
   1.190  	  cnt_dquot+cnt_squot+cnt_brack+cnt_bin+cnt_odd+cnt_long+
   1.191  	  cnt_short+cnt_punct+cnt_dash+cnt_word+cnt_html+cnt_lineend);
   1.192      }
   1.193 @@ -628,10 +584,10 @@
   1.194   */
   1.195  struct first_pass_results *first_pass(const char *etext)
   1.196  {
   1.197 -    char laststart=CHAR_SPACE;
   1.198 +    gunichar laststart=CHAR_SPACE;
   1.199      const char *s;
   1.200      gchar *lc_line;
   1.201 -    int i,j,llen;
   1.202 +    int i,j,lbytes,llen;
   1.203      gchar **lines;
   1.204      unsigned int lastlen=0,lastblen=0;
   1.205      long spline=0,nspline=0;
   1.206 @@ -640,27 +596,28 @@
   1.207      lines=g_strsplit(etext,"\n",0);
   1.208      for (j=0;lines[j];j++)
   1.209      {
   1.210 -	llen=strlen(lines[j]);
   1.211 -	while(lines[j][llen-1]=='\r')
   1.212 -	    lines[j][llen--]='\0';
   1.213 +	lbytes=strlen(lines[j]);
   1.214 +	while (lines[j][lbytes-1]=='\r')
   1.215 +	    lines[j][--lbytes]='\0';
   1.216 +	llen=g_utf8_strlen(lines[j],lbytes);
   1.217  	linecnt++;
   1.218  	if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
   1.219  	  (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT")))
   1.220  	{
   1.221  	    if (spline)
   1.222 -		printf("   --> Duplicate header?\n");
   1.223 +		g_print("   --> Duplicate header?\n");
   1.224  	    spline=linecnt+1;   /* first line of non-header text, that is */
   1.225  	}
   1.226  	if (!strncmp(lines[j],"*** START",9) &&
   1.227  	  strstr(lines[j],"PROJECT GUTENBERG"))
   1.228  	{
   1.229  	    if (nspline)
   1.230 -		printf("   --> Duplicate header?\n");
   1.231 +		g_print("   --> Duplicate header?\n");
   1.232  	    nspline=linecnt+1;   /* first line of non-header text, that is */
   1.233  	}
   1.234  	if (spline || nspline)
   1.235  	{
   1.236 -	    lc_line=g_ascii_strdown(lines[j],llen);
   1.237 +	    lc_line=g_utf8_strdown(lines[j],lbytes);
   1.238  	    if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg"))
   1.239  	    {
   1.240  		if (strstr(lc_line,"end")<strstr(lc_line,"project gutenberg"))
   1.241 @@ -669,7 +626,7 @@
   1.242  		    {
   1.243  			/* it's an old-form header - we can detect duplicates */
   1.244  			if (!nspline)
   1.245 -			    printf("   --> Duplicate footer?\n");
   1.246 +			    g_print("   --> Duplicate footer?\n");
   1.247  		    }
   1.248  		    else
   1.249  			results.footerline=linecnt;
   1.250 @@ -684,19 +641,21 @@
   1.251  	if (results.footerline)
   1.252  	    continue;    /* don't count the boilerplate in the footer */
   1.253  	results.totlen+=llen;
   1.254 -	for (i=0;i<llen;i++)
   1.255 +	for (s=lines[j];*s;s=g_utf8_next_char(s))
   1.256  	{
   1.257 -	    if ((unsigned char)lines[j][i]>127)
   1.258 +	    if (g_utf8_get_char(s)>127)
   1.259  		results.binlen++;
   1.260 -	    if (gcisalpha(lines[j][i]))
   1.261 +	    if (g_unichar_isalpha(g_utf8_get_char(s)))
   1.262  		results.alphalen++;
   1.263 -	    if (i>0 && lines[j][i]==CHAR_DQUOTE && isalpha(lines[j][i-1]))
   1.264 +	    if (s>lines[j] && g_utf8_get_char(s)==CHAR_DQUOTE &&
   1.265 +	      isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
   1.266  		results.endquote_count++;
   1.267  	}
   1.268  	if (llen>2 && lastlen>2 && lastlen<SHORTEST_PG_LINE && lastblen>2 &&
   1.269  	  lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE)
   1.270  	    results.shortline++;
   1.271 -	if (llen>0 && (unsigned char)lines[j][llen-1]<=CHAR_SPACE)
   1.272 +	if (lbytes>0 &&
   1.273 +	  g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE)
   1.274  	    cnt_spacend++;
   1.275  	if (strstr(lines[j],".,"))
   1.276  	    results.dotcomma++;
   1.277 @@ -704,17 +663,19 @@
   1.278  	/* locase text on the line */
   1.279  	if (strchr(lines[j],'*'))
   1.280  	{
   1.281 -	    for (s=lines[j];*s;s++)
   1.282 -		if (*s>='a' && *s<='z')
   1.283 +	    for (s=lines[j];*s;s=g_utf8_next_char(s))
   1.284 +		if (g_unichar_islower(g_utf8_get_char(s)))
   1.285  		    break;
   1.286 -	     if (*s)
   1.287 +	    if (*s)
   1.288  		results.astline++;
   1.289  	}
   1.290  	if (strchr(lines[j],'/'))
   1.291  	    results.fslashline++;
   1.292 -	for (i=llen-1;i>0 && (unsigned char)lines[j][i]<=CHAR_SPACE;i--)
   1.293 +	for (s=g_utf8_prev_char(lines[j]+lbytes);
   1.294 +	  s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
   1.295  	    ;
   1.296 -	if (i>1 && lines[j][i]=='-' && lines[j][i-1]!='-')
   1.297 +	if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' &&
   1.298 +	  g_utf8_get_char(g_utf8_prev_char(s))!='-')
   1.299  	    results.hyphens++;
   1.300  	if (llen>LONGEST_PG_LINE)
   1.301  	    results.longline++;
   1.302 @@ -729,15 +690,15 @@
   1.303  		results.htmcount+=4; /* bonus marks! */
   1.304  	}
   1.305  	/* Check for spaced em-dashes */
   1.306 -	if (lines[j][0] && (s=strstr(lines[j]+1,"--")))
   1.307 +	if (lines[j][0] && (s=strstr(g_utf8_next_char(lines[j]),"--")))
   1.308  	{
   1.309  	    results.emdash++;
   1.310 -	    if (s[-1]==CHAR_SPACE || (s[2]==CHAR_SPACE))
   1.311 +	    if (s[-1]==CHAR_SPACE || s[2]==CHAR_SPACE)
   1.312  		results.space_emdash++;
   1.313 -	    if (s[-1]==CHAR_SPACE && (s[2]==CHAR_SPACE))
   1.314 +	    if (s[-1]==CHAR_SPACE && s[2]==CHAR_SPACE)
   1.315  		/* count of em-dashes with spaces both sides */
   1.316  		results.non_PG_space_emdash++;
   1.317 -	    if (s[-1]!=CHAR_SPACE && (s[2]!=CHAR_SPACE))
   1.318 +	    if (s[-1]!=CHAR_SPACE && s[2]!=CHAR_SPACE)
   1.319  		/* count of PG-type em-dashes with no spaces */
   1.320  		results.PG_space_emdash++;
   1.321  	}
   1.322 @@ -772,13 +733,13 @@
   1.323  {
   1.324      static struct warnings warnings={0};
   1.325      if (cnt_spacend>0)
   1.326 -	printf("   --> %ld lines in this file have white space at end\n",
   1.327 +	g_print("   --> %ld lines in this file have white space at end\n",
   1.328  	  cnt_spacend);
   1.329      warnings.dotcomma=1;
   1.330      if (results->dotcomma>5)
   1.331      {
   1.332  	warnings.dotcomma=0;
   1.333 -	printf("   --> %ld lines in this file contain '.,'. "
   1.334 +	g_print("   --> %ld lines in this file contain '.,'. "
   1.335  	  "Not reporting them.\n",results->dotcomma);
   1.336      }
   1.337      /*
   1.338 @@ -789,7 +750,7 @@
   1.339      if (results->shortline>50 || results->shortline*10>linecnt)
   1.340      {
   1.341  	warnings.shortline=0;
   1.342 -	printf("   --> %ld lines in this file are short. "
   1.343 +	g_print("   --> %ld lines in this file are short. "
   1.344  	  "Not reporting short lines.\n",results->shortline);
   1.345      }
   1.346      /*
   1.347 @@ -800,7 +761,7 @@
   1.348      if (results->longline>50 || results->longline*10>linecnt)
   1.349      {
   1.350  	warnings.longline=0;
   1.351 -	printf("   --> %ld lines in this file are long. "
   1.352 +	g_print("   --> %ld lines in this file are long. "
   1.353  	  "Not reporting long lines.\n",results->longline);
   1.354      }
   1.355      /* If more than 10 lines contain asterisks, don't bother reporting them. */
   1.356 @@ -808,7 +769,7 @@
   1.357      if (results->astline>10)
   1.358      {
   1.359  	warnings.ast=0;
   1.360 -	printf("   --> %ld lines in this file contain asterisks. "
   1.361 +	g_print("   --> %ld lines in this file contain asterisks. "
   1.362  	  "Not reporting them.\n",results->astline);
   1.363      }
   1.364      /*
   1.365 @@ -819,7 +780,7 @@
   1.366      if (results->fslashline>10)
   1.367      {
   1.368  	warnings.fslash=0;
   1.369 -	printf("   --> %ld lines in this file contain forward slashes. "
   1.370 +	g_print("   --> %ld lines in this file contain forward slashes. "
   1.371  	  "Not reporting them.\n",results->fslashline);
   1.372      }
   1.373      /*
   1.374 @@ -830,7 +791,7 @@
   1.375      if (results->endquote_count>20)
   1.376      {
   1.377  	warnings.endquote=0;
   1.378 -	printf("   --> %ld lines in this file contain unpunctuated endquotes. "
   1.379 +	g_print("   --> %ld lines in this file contain unpunctuated endquotes. "
   1.380  	  "Not reporting them.\n",results->endquote_count);
   1.381      }
   1.382      /*
   1.383 @@ -841,7 +802,7 @@
   1.384      if (results->standalone_digit>10)
   1.385      {
   1.386  	warnings.digit=0;
   1.387 -	printf("   --> %ld lines in this file contain standalone 0s and 1s. "
   1.388 +	g_print("   --> %ld lines in this file contain standalone 0s and 1s. "
   1.389  	  "Not reporting them.\n",results->standalone_digit);
   1.390      }
   1.391      /*
   1.392 @@ -852,16 +813,16 @@
   1.393      if (results->hyphens>20)
   1.394      {
   1.395  	warnings.hyphen=0;
   1.396 -	printf("   --> %ld lines in this file have hyphens at end. "
   1.397 +	g_print("   --> %ld lines in this file have hyphens at end. "
   1.398  	  "Not reporting them.\n",results->hyphens);
   1.399      }
   1.400      if (results->htmcount>20 && !pswit[MARKUP_SWITCH])
   1.401      {
   1.402 -	printf("   --> Looks like this is HTML. Switching HTML mode ON.\n");
   1.403 +	g_print("   --> Looks like this is HTML. Switching HTML mode ON.\n");
   1.404  	pswit[MARKUP_SWITCH]=1;
   1.405      }
   1.406      if (results->verylongline>0)
   1.407 -	printf("   --> %ld lines in this file are VERY long!\n",
   1.408 +	g_print("   --> %ld lines in this file are VERY long!\n",
   1.409  	  results->verylongline);
   1.410      /*
   1.411       * If there are more non-PG spaced dashes than PG em-dashes,
   1.412 @@ -874,7 +835,7 @@
   1.413        results->PG_space_emdash)
   1.414      {
   1.415  	warnings.dash=0;
   1.416 -	printf("   --> There are %ld spaced dashes and em-dashes. "
   1.417 +	g_print("   --> There are %ld spaced dashes and em-dashes. "
   1.418  	  "Not reporting them.\n",
   1.419  	  results->spacedash+results->non_PG_space_emdash);
   1.420      }
   1.421 @@ -882,19 +843,19 @@
   1.422      warnings.bin=1;
   1.423      if (results->binlen*4>results->totlen)
   1.424      {
   1.425 -	printf("   --> This file does not appear to be ASCII. "
   1.426 +	g_print("   --> This file does not appear to be ASCII. "
   1.427  	  "Terminating. Best of luck with it!\n");
   1.428  	exit(1);
   1.429      }
   1.430      if (results->alphalen*4<results->totlen)
   1.431      {
   1.432 -	printf("   --> This file does not appear to be text. "
   1.433 +	g_print("   --> This file does not appear to be text. "
   1.434  	  "Terminating. Best of luck with it!\n");
   1.435  	exit(1);
   1.436      }
   1.437      if (results->binlen*100>results->totlen || results->binlen>100)
   1.438      {
   1.439 -	printf("   --> There are a lot of foreign letters here. "
   1.440 +	g_print("   --> There are a lot of foreign letters here. "
   1.441  	  "Not reporting them.\n");
   1.442  	warnings.bin=0;
   1.443      }
   1.444 @@ -902,26 +863,26 @@
   1.445      if (results->Dutchcount>50)
   1.446      {
   1.447  	warnings.isDutch=TRUE;
   1.448 -	printf("   --> This looks like Dutch - "
   1.449 +	g_print("   --> This looks like Dutch - "
   1.450  	  "switching off dashes and warnings for 's Middags case.\n");
   1.451      }
   1.452      warnings.isFrench=FALSE;
   1.453      if (results->Frenchcount>50)
   1.454      {
   1.455  	warnings.isFrench=TRUE;
   1.456 -	printf("   --> This looks like French - "
   1.457 +	g_print("   --> This looks like French - "
   1.458  	  "switching off some doublepunct.\n");
   1.459      }
   1.460      if (results->firstline && results->footerline)
   1.461 -	printf("    The PG header and footer appear to be already on.\n");
   1.462 +	g_print("    The PG header and footer appear to be already on.\n");
   1.463      else
   1.464      {
   1.465  	if (results->firstline)
   1.466 -	    printf("    The PG header is on - no footer.\n");
   1.467 +	    g_print("    The PG header is on - no footer.\n");
   1.468  	if (results->footerline)
   1.469 -	    printf("    The PG footer is on - no header.\n");
   1.470 +	    g_print("    The PG footer is on - no header.\n");
   1.471      }
   1.472 -    printf("\n");
   1.473 +    g_print("\n");
   1.474      if (pswit[VERBOSE_SWITCH])
   1.475      {
   1.476  	warnings.bin=1;
   1.477 @@ -934,7 +895,7 @@
   1.478  	warnings.fslash=1;
   1.479  	warnings.hyphen=1;
   1.480  	warnings.endquote=1;
   1.481 -	printf("   *** Verbose output is ON -- you asked for it! ***\n");
   1.482 +	g_print("   *** Verbose output is ON -- you asked for it! ***\n");
   1.483      }
   1.484      if (warnings.isDutch)
   1.485  	warnings.dash=0;
   1.486 @@ -942,9 +903,9 @@
   1.487        results->footerline>results->firstline &&
   1.488        results->footerline-results->firstline<100)
   1.489      {
   1.490 -	printf("   --> I don't really know where this text starts. \n");
   1.491 -	printf("       There are no reference points.\n");
   1.492 -	printf("       I'm going to have to report the header and footer "
   1.493 +	g_print("   --> I don't really know where this text starts. \n");
   1.494 +	g_print("       There are no reference points.\n");
   1.495 +	g_print("       I'm going to have to report the header and footer "
   1.496  	  "as well.\n");
   1.497  	results->firstline=0;
   1.498      }
   1.499 @@ -968,12 +929,16 @@
   1.500      int guessquote=0;
   1.501      /* assume the line is empty until proven otherwise */
   1.502      gboolean isemptyline=TRUE;
   1.503 -    const char *s=aline;
   1.504 +    const char *s=aline,*sprev,*snext;
   1.505 +    gunichar c;
   1.506 +    sprev=NULL;
   1.507      while (*s)
   1.508      {
   1.509 -	if (*s==CHAR_DQUOTE)
   1.510 +	snext=g_utf8_next_char(s);
   1.511 +	c=g_utf8_get_char(s);
   1.512 +	if (c==CHAR_DQUOTE)
   1.513  	    counters->quot++;
   1.514 -	if (*s==CHAR_SQUOTE || *s==CHAR_OPEN_SQUOTE)
   1.515 +	if (c==CHAR_SQUOTE || c==CHAR_OPEN_SQUOTE)
   1.516  	{
   1.517  	    if (s==aline)
   1.518  	    {
   1.519 @@ -981,17 +946,21 @@
   1.520  		 * At start of line, it can only be an openquote.
   1.521  		 * Hardcode a very common exception!
   1.522  		 */
   1.523 -		if (strncmp(s+2,"tis",3) && strncmp(s+2,"Tis",3))
   1.524 +		if (!g_str_has_prefix(snext,"tis") &&
   1.525 +		  !g_str_has_prefix(snext,"Tis"))
   1.526  		    counters->open_single_quote++;
   1.527  	    }
   1.528 -	    else if (gcisalpha(s[-1]) && gcisalpha(s[1]))
   1.529 +	    else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
   1.530 +	      g_unichar_isalpha(g_utf8_get_char(snext)))
   1.531  		/* Do nothing! it's definitely an apostrophe, not a quote */
   1.532  		;
   1.533  	    /* it's outside a word - let's check it out */
   1.534 -	    else if (*s==CHAR_OPEN_SQUOTE || gcisalpha(s[1]))
   1.535 +	    else if (c==CHAR_OPEN_SQUOTE ||
   1.536 +	      g_unichar_isalpha(g_utf8_get_char(snext)))
   1.537  	    {
   1.538  		/* it damwell better BE an openquote */
   1.539 -		if (strncmp(s+1,"tis",3) && strncmp(s+1,"Tis",3))
   1.540 +		if (!g_str_has_prefix(snext,"tis") &&
   1.541 +		  !g_str_has_prefix(snext,"Tis"))
   1.542  		    /* hardcode a very common exception! */
   1.543  		    counters->open_single_quote++;
   1.544  	    }
   1.545 @@ -999,20 +968,22 @@
   1.546  	    {
   1.547  		/* now - is it a closequote? */
   1.548  		guessquote=0;   /* accumulate clues */
   1.549 -		if (gcisalpha(s[-1]))
   1.550 +		if (g_unichar_isalpha(g_utf8_get_char(sprev)))
   1.551  		{
   1.552  		    /* it follows a letter - could be either */
   1.553  		    guessquote++;
   1.554 -		    if (s[-1]=='s')
   1.555 +		    if (g_utf8_get_char(sprev)=='s')
   1.556  		    {
   1.557  			/* looks like a plural apostrophe */
   1.558  			guessquote-=3;
   1.559 -			if (s[1]==CHAR_SPACE)  /* bonus marks! */
   1.560 +			if (g_utf8_get_char(snext)==CHAR_SPACE)
   1.561 +			    /* bonus marks! */
   1.562  			    guessquote-=2;
   1.563  		    }
   1.564  		}
   1.565  		/* it doesn't have a letter either side */
   1.566 -		else if (strchr(".?!,;:",s[-1]) && strchr(".?!,;: ",s[1]))
   1.567 +		else if (strchr(".?!,;:",g_utf8_get_char(sprev)) &&
   1.568 +		  strchr(".?!,;: ",g_utf8_get_char(snext)))
   1.569  		    guessquote+=8; /* looks like a closequote */
   1.570  		else
   1.571  		    guessquote++;
   1.572 @@ -1028,24 +999,25 @@
   1.573  		    counters->close_single_quote++;
   1.574  	    }
   1.575  	}
   1.576 -	if (*s!=CHAR_SPACE && *s!='-' && *s!='.' && *s!=CHAR_ASTERISK &&
   1.577 -	  *s!=13 && *s!=10)
   1.578 +	if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
   1.579 +	  c!='\r' && c!='\n')
   1.580  	    isemptyline=FALSE;  /* ignore lines like  *  *  *  as spacers */
   1.581 -	if (*s==CHAR_UNDERSCORE)
   1.582 +	if (c==CHAR_UNDERSCORE)
   1.583  	    counters->c_unders++;
   1.584 -	if (*s==CHAR_OPEN_CBRACK)
   1.585 +	if (c==CHAR_OPEN_CBRACK)
   1.586  	    counters->c_brack++;
   1.587 -	if (*s==CHAR_CLOSE_CBRACK)
   1.588 +	if (c==CHAR_CLOSE_CBRACK)
   1.589  	    counters->c_brack--;
   1.590 -	if (*s==CHAR_OPEN_RBRACK)
   1.591 +	if (c==CHAR_OPEN_RBRACK)
   1.592  	    counters->r_brack++;
   1.593 -	if (*s==CHAR_CLOSE_RBRACK)
   1.594 +	if (c==CHAR_CLOSE_RBRACK)
   1.595  	    counters->r_brack--;
   1.596 -	if (*s==CHAR_OPEN_SBRACK)
   1.597 +	if (c==CHAR_OPEN_SBRACK)
   1.598  	    counters->s_brack++;
   1.599 -	if (*s==CHAR_CLOSE_SBRACK)
   1.600 +	if (c==CHAR_CLOSE_SBRACK)
   1.601  	    counters->s_brack--;
   1.602 -	s++;
   1.603 +	sprev=s;
   1.604 +	s=snext;
   1.605      }
   1.606      return isemptyline;
   1.607  }
   1.608 @@ -1060,18 +1032,18 @@
   1.609   */
   1.610  void check_for_control_characters(const char *aline)
   1.611  {
   1.612 -    unsigned char c;
   1.613 +    gunichar c;
   1.614      const char *s;
   1.615 -    for (s=aline;*s;s++)
   1.616 +    for (s=aline;*s;s=g_utf8_next_char(s))
   1.617      {
   1.618 -	c=*(unsigned char *)s;
   1.619 +	c=g_utf8_get_char(s);
   1.620  	if (c<CHAR_SPACE && c!=CHAR_LF && c!=CHAR_CR && c!=CHAR_TAB)
   1.621  	{
   1.622  	    if (pswit[ECHO_SWITCH])
   1.623 -		printf("\n%s\n",aline);
   1.624 +		g_print("\n%s\n",aline);
   1.625  	    if (!pswit[OVERVIEW_SWITCH])
   1.626 -		printf("    Line %ld column %d - Control character %d\n",
   1.627 -		  linecnt,(int)(s-aline)+1,c);
   1.628 +		g_print("    Line %ld column %ld - Control character %u\n",
   1.629 +		  linecnt,g_utf8_pointer_to_offset(s,aline)+1,c);
   1.630  	    else
   1.631  		cnt_bin++;
   1.632  	}
   1.633 @@ -1087,90 +1059,93 @@
   1.634    gboolean isemptyline)
   1.635  {
   1.636      /* Don't repeat multiple warnings on one line. */
   1.637 -    int eNon_A=0,eTab=0,eTilde=0,eCarat=0,eFSlash=0,eAst=0;
   1.638 +    gboolean eNon_A=FALSE,eTab=FALSE,eTilde=FALSE;
   1.639 +    gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
   1.640      const char *s;
   1.641 -    unsigned char c;
   1.642 -    for (s=aline;*s;s++)
   1.643 +    gunichar c;
   1.644 +    for (s=aline;*s;s=g_utf8_next_char(s))
   1.645      {
   1.646 -	c=*(unsigned char *)s;
   1.647 -	if (!eNon_A && (*s<CHAR_SPACE && *s!=9 && *s!='\n' || c>127))
   1.648 +	c=g_utf8_get_char(s);
   1.649 +	if (!eNon_A && (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
   1.650  	{
   1.651  	    if (pswit[ECHO_SWITCH])
   1.652 -		printf("\n%s\n",aline);
   1.653 +		g_print("\n%s\n",aline);
   1.654  	    if (!pswit[OVERVIEW_SWITCH])
   1.655 -		if (c>127 && c<160)
   1.656 -		    printf("    Line %ld column %d - "
   1.657 -		      "Non-ISO-8859 character %d\n",linecnt,(int)(s-aline)+1,c);
   1.658 +		if (c>127 && c<160 || c>255)
   1.659 +		    g_print("    Line %ld column %ld - "
   1.660 +		      "Non-ISO-8859 character %u\n",
   1.661 +		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
   1.662  		else
   1.663 -		    printf("    Line %ld column %d - Non-ASCII character %d\n",
   1.664 -		      linecnt,(int)(s-aline)+1,c);
   1.665 +		    g_print("    Line %ld column %ld - "
   1.666 +		      "Non-ASCII character %u\n",
   1.667 +		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
   1.668  	    else
   1.669  		cnt_bin++;
   1.670 -	    eNon_A=1;
   1.671 +	    eNon_A=TRUE;
   1.672  	}
   1.673 -	if (!eTab && *s==CHAR_TAB)
   1.674 +	if (!eTab && c==CHAR_TAB)
   1.675  	{
   1.676  	    if (pswit[ECHO_SWITCH])
   1.677 -		printf("\n%s\n",aline);
   1.678 +		g_print("\n%s\n",aline);
   1.679  	    if (!pswit[OVERVIEW_SWITCH])
   1.680 -		printf("    Line %ld column %d - Tab character?\n",
   1.681 -		  linecnt,(int)(s-aline)+1);
   1.682 +		g_print("    Line %ld column %ld - Tab character?\n",
   1.683 +		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
   1.684  	    else
   1.685  		cnt_odd++;
   1.686 -	    eTab=1;
   1.687 +	    eTab=TRUE;
   1.688  	}
   1.689 -	if (!eTilde && *s==CHAR_TILDE)
   1.690 +	if (!eTilde && c==CHAR_TILDE)
   1.691  	{
   1.692  	    /*
   1.693  	     * Often used by OCR software to indicate an
   1.694  	     * unrecognizable character.
   1.695  	     */
   1.696  	    if (pswit[ECHO_SWITCH])
   1.697 -		printf("\n%s\n",aline);
   1.698 +		g_print("\n%s\n",aline);
   1.699  	    if (!pswit[OVERVIEW_SWITCH])
   1.700 -		printf("    Line %ld column %d - Tilde character?\n",
   1.701 -		  linecnt,(int)(s-aline)+1);
   1.702 +		g_print("    Line %ld column %ld - Tilde character?\n",
   1.703 +		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
   1.704  	    else
   1.705  		cnt_odd++;
   1.706 -	    eTilde=1;
   1.707 +	    eTilde=TRUE;
   1.708  	}
   1.709 -	if (!eCarat && *s==CHAR_CARAT)
   1.710 +	if (!eCarat && c==CHAR_CARAT)
   1.711  	{  
   1.712  	    if (pswit[ECHO_SWITCH])
   1.713 -		printf("\n%s\n",aline);
   1.714 +		g_print("\n%s\n",aline);
   1.715  	    if (!pswit[OVERVIEW_SWITCH])
   1.716 -		printf("    Line %ld column %d - Carat character?\n",
   1.717 -		  linecnt,(int)(s-aline)+1);
   1.718 +		g_print("    Line %ld column %ld - Carat character?\n",
   1.719 +		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
   1.720  	    else
   1.721  		cnt_odd++;
   1.722 -	    eCarat=1;
   1.723 +	    eCarat=TRUE;
   1.724  	}
   1.725 -	if (!eFSlash && *s==CHAR_FORESLASH && warnings->fslash)
   1.726 +	if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash)
   1.727  	{  
   1.728  	    if (pswit[ECHO_SWITCH])
   1.729 -		printf("\n%s\n",aline);
   1.730 +		g_print("\n%s\n",aline);
   1.731  	    if (!pswit[OVERVIEW_SWITCH])
   1.732 -		printf("    Line %ld column %d - Forward slash?\n",
   1.733 -		  linecnt,(int)(s-aline)+1);
   1.734 +		g_print("    Line %ld column %ld - Forward slash?\n",
   1.735 +		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
   1.736  	    else
   1.737  		cnt_odd++;
   1.738 -	    eFSlash=1;
   1.739 +	    eFSlash=TRUE;
   1.740  	}
   1.741  	/*
   1.742  	 * Report asterisks only in paranoid mode,
   1.743  	 * since they're often deliberate.
   1.744  	 */
   1.745  	if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline &&
   1.746 -	  *s==CHAR_ASTERISK)
   1.747 +	  c==CHAR_ASTERISK)
   1.748  	{
   1.749  	    if (pswit[ECHO_SWITCH])
   1.750 -		printf("\n%s\n",aline);
   1.751 +		g_print("\n%s\n",aline);
   1.752  	    if (!pswit[OVERVIEW_SWITCH])
   1.753 -		printf("    Line %ld column %d - Asterisk?\n",
   1.754 -		  linecnt,(int)(s-aline)+1);
   1.755 +		g_print("    Line %ld column %ld - Asterisk?\n",
   1.756 +		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
   1.757  	    else
   1.758  		cnt_odd++;
   1.759 -	    eAst=1;
   1.760 +	    eAst=TRUE;
   1.761  	}
   1.762      }
   1.763  }
   1.764 @@ -1182,13 +1157,13 @@
   1.765   */
   1.766  void check_for_long_line(const char *aline)
   1.767  {
   1.768 -    if (strlen(aline)>LONGEST_PG_LINE)
   1.769 +    if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE)
   1.770      {
   1.771  	if (pswit[ECHO_SWITCH])
   1.772 -	    printf("\n%s\n",aline);
   1.773 +	    g_print("\n%s\n",aline);
   1.774  	if (!pswit[OVERVIEW_SWITCH])
   1.775 -	    printf("    Line %ld column %d - Long line %d\n",
   1.776 -	      linecnt,(int)strlen(aline),(int)strlen(aline));
   1.777 +	    g_print("    Line %ld column %ld - Long line %ld\n",
   1.778 +	      linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1));
   1.779  	else
   1.780  	    cnt_long++;
   1.781      }
   1.782 @@ -1220,14 +1195,15 @@
   1.783   */
   1.784  void check_for_short_line(const char *aline,const struct line_properties *last)
   1.785  {
   1.786 -    if (strlen(aline)>1 && last->len>1 && last->len<SHORTEST_PG_LINE &&
   1.787 -      last->blen>1 && last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
   1.788 +    if (g_utf8_strlen(aline,-1)>1 && last->len>1 &&
   1.789 +      last->len<SHORTEST_PG_LINE && last->blen>1 &&
   1.790 +      last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE)
   1.791      {
   1.792  	if (pswit[ECHO_SWITCH])
   1.793 -	    printf("\n%s\n",prevline);
   1.794 +	    g_print("\n%s\n",prevline);
   1.795  	if (!pswit[OVERVIEW_SWITCH])
   1.796 -	    printf("    Line %ld column %d - Short line %d?\n",
   1.797 -	      linecnt-1,(int)strlen(prevline),(int)strlen(prevline));
   1.798 +	    g_print("    Line %ld column %ld - Short line %ld?\n",
   1.799 +	      linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1));
   1.800  	else
   1.801  	    cnt_short++;
   1.802      }
   1.803 @@ -1240,12 +1216,13 @@
   1.804   */
   1.805  void check_for_starting_punctuation(const char *aline)
   1.806  {
   1.807 -    if (*aline && strchr(".?!,;:",aline[0]) && strncmp(". . .",aline,5))
   1.808 +    if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) &&
   1.809 +      !g_str_has_prefix(aline,". . ."))
   1.810      {
   1.811  	if (pswit[ECHO_SWITCH])
   1.812 -	    printf("\n%s\n",aline);
   1.813 +	    g_print("\n%s\n",aline);
   1.814  	if (!pswit[OVERVIEW_SWITCH])
   1.815 -	    printf("    Line %ld column 1 - Begins with punctuation?\n",
   1.816 +	    g_print("    Line %ld column 1 - Begins with punctuation?\n",
   1.817  	      linecnt);
   1.818  	else
   1.819  	    cnt_punct++;
   1.820 @@ -1263,21 +1240,21 @@
   1.821   */
   1.822  void check_for_spaced_emdash(const char *aline)
   1.823  {
   1.824 -    const char *s,*t;
   1.825 -    s=aline;
   1.826 -    while ((t=strstr(s,"--")))
   1.827 +    const char *s,*t,*next;
   1.828 +    for (s=aline;t=strstr(s,"--");s=next)
   1.829      {
   1.830 -	if (t>aline && t[-1]==CHAR_SPACE || t[2]==CHAR_SPACE)
   1.831 +	next=g_utf8_next_char(g_utf8_next_char(t));
   1.832 +	if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
   1.833 +	  g_utf8_get_char(next)==CHAR_SPACE)
   1.834  	{
   1.835  	    if (pswit[ECHO_SWITCH])
   1.836 -		printf("\n%s\n",aline);
   1.837 +		g_print("\n%s\n",aline);
   1.838  	    if (!pswit[OVERVIEW_SWITCH])
   1.839 -		printf("    Line %ld column %d - Spaced em-dash?\n",
   1.840 -		  linecnt,(int)(t-aline)+1);
   1.841 +		g_print("    Line %ld column %ld - Spaced em-dash?\n",
   1.842 +		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
   1.843  	    else
   1.844  		cnt_dash++;
   1.845  	}
   1.846 -	s=t+2;
   1.847      }
   1.848  }
   1.849  
   1.850 @@ -1291,26 +1268,26 @@
   1.851      const char *s;
   1.852      if ((s=strstr(aline," -")))
   1.853      {
   1.854 -	if (s[2]!='-')
   1.855 +	if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-')
   1.856  	{
   1.857  	    if (pswit[ECHO_SWITCH])
   1.858 -		printf("\n%s\n",aline);
   1.859 +		g_print("\n%s\n",aline);
   1.860  	    if (!pswit[OVERVIEW_SWITCH])
   1.861 -		printf("    Line %ld column %d - Spaced dash?\n",
   1.862 -		  linecnt,(int)(s-aline)+1);
   1.863 +		g_print("    Line %ld column %ld - Spaced dash?\n",
   1.864 +		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
   1.865  	    else
   1.866  		cnt_dash++;
   1.867  	}
   1.868      }
   1.869      else if ((s=strstr(aline,"- ")))
   1.870      {
   1.871 -	if (s==aline || s[-1]!='-')
   1.872 +	if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-')
   1.873  	{
   1.874  	    if (pswit[ECHO_SWITCH])
   1.875 -		printf("\n%s\n",aline);
   1.876 +		g_print("\n%s\n",aline);
   1.877  	    if (!pswit[OVERVIEW_SWITCH])
   1.878 -		printf("    Line %ld column %d - Spaced dash?\n",
   1.879 -		  linecnt,(int)(s-aline)+1);
   1.880 +		g_print("    Line %ld column %ld - Spaced dash?\n",
   1.881 +		  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
   1.882  	    else
   1.883  		cnt_dash++;
   1.884  	}
   1.885 @@ -1335,10 +1312,11 @@
   1.886      if (s)
   1.887      {
   1.888  	if (pswit[ECHO_SWITCH])
   1.889 -	    printf("\n%s\n",aline);
   1.890 +	    g_print("\n%s\n",aline);
   1.891  	if (!pswit[OVERVIEW_SWITCH])
   1.892 -	    printf("    Line %ld column %d - Query missing paragraph break?\n",
   1.893 -	      linecnt,(int)(s-aline)+1);
   1.894 +	    g_print("    Line %ld column %ld - "
   1.895 +	      "Query missing paragraph break?\n",
   1.896 +	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
   1.897  	else
   1.898  	    cnt_punct++;
   1.899      }
   1.900 @@ -1382,10 +1360,10 @@
   1.901      if (s)
   1.902      {
   1.903  	if (pswit[ECHO_SWITCH])
   1.904 -	    printf("\n%s\n",aline);
   1.905 +	    g_print("\n%s\n",aline);
   1.906  	if (!pswit[OVERVIEW_SWITCH])
   1.907 -	    printf("    Line %ld column %d - Query he/be error?\n",
   1.908 -	      linecnt,(int)(s-aline)+1);
   1.909 +	    g_print("    Line %ld column %ld - Query he/be error?\n",
   1.910 +	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
   1.911  	else
   1.912  	    cnt_word++;
   1.913      }
   1.914 @@ -1405,10 +1383,10 @@
   1.915      if (s)
   1.916      {
   1.917  	if (pswit[ECHO_SWITCH])
   1.918 -	    printf("\n%s\n",aline);
   1.919 +	    g_print("\n%s\n",aline);
   1.920  	if (!pswit[OVERVIEW_SWITCH])
   1.921 -	    printf("    Line %ld column %d - Query had/bad error?\n",
   1.922 -	      linecnt,(int)(s-aline)+1);
   1.923 +	    g_print("    Line %ld column %ld - Query had/bad error?\n",
   1.924 +	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
   1.925  	else
   1.926  	    cnt_word++;
   1.927      }
   1.928 @@ -1418,10 +1396,10 @@
   1.929      if (s)
   1.930      {
   1.931  	if (pswit[ECHO_SWITCH])
   1.932 -	    printf("\n%s\n",aline);
   1.933 +	    g_print("\n%s\n",aline);
   1.934  	if (!pswit[OVERVIEW_SWITCH])
   1.935 -	    printf("    Line %ld column %d - Query hut/but error?\n",
   1.936 -	      linecnt,(int)(s-aline)+1);
   1.937 +	    g_print("    Line %ld column %ld - Query hut/but error?\n",
   1.938 +	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
   1.939  	else
   1.940  	    cnt_word++;
   1.941      }
   1.942 @@ -1440,10 +1418,11 @@
   1.943      if (s)
   1.944      {
   1.945  	if (pswit[ECHO_SWITCH])
   1.946 -	    printf("\n%s\n",aline);
   1.947 +	    g_print("\n%s\n",aline);
   1.948  	if (!pswit[OVERVIEW_SWITCH])
   1.949 -	    printf("    Line %ld column %d - Query angled bracket with From\n",
   1.950 -	      linecnt,(int)(s-aline)+1);
   1.951 +	    g_print("    Line %ld column %ld - "
   1.952 +	      "Query angled bracket with From\n",
   1.953 +	      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
   1.954  	else
   1.955  	    cnt_punct++;
   1.956      }
   1.957 @@ -1457,17 +1436,18 @@
   1.958   */
   1.959  void check_for_orphan_character(const char *aline)
   1.960  {
   1.961 -    if (*aline && !aline[1])
   1.962 +    gunichar c;
   1.963 +    c=g_utf8_get_char(aline);
   1.964 +    if (c && !*g_utf8_next_char(aline))
   1.965      {
   1.966 -	if (*aline=='I' || *aline=='V' || *aline=='X' || *aline=='L' ||
   1.967 -	  gcisdigit(*aline))
   1.968 +	if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c))
   1.969  	    ; /* Nothing - ignore numerals alone on a line. */
   1.970  	else
   1.971  	{
   1.972  	    if (pswit[ECHO_SWITCH])
   1.973 -		printf("\n%s\n",aline);
   1.974 +		g_print("\n%s\n",aline);
   1.975  	    if (!pswit[OVERVIEW_SWITCH])
   1.976 -		printf("    Line %ld column 1 - Query single character line\n",
   1.977 +		g_print("    Line %ld column 1 - Query single character line\n",
   1.978  		  linecnt);
   1.979  	    else
   1.980  		cnt_punct++;
   1.981 @@ -1487,10 +1467,10 @@
   1.982      if (s)
   1.983      {
   1.984  	if (pswit[ECHO_SWITCH])
   1.985 -	    printf("\n%s\n",aline);
   1.986 +	    g_print("\n%s\n",aline);
   1.987  	if (!pswit[OVERVIEW_SWITCH])
   1.988 -	    printf("    Line %ld column %ld - Query I=exclamation mark?\n",
   1.989 -	      linecnt,s-aline);
   1.990 +	    g_print("    Line %ld column %ld - Query I=exclamation mark?\n",
   1.991 +	      linecnt,g_utf8_pointer_to_offset(aline,s));
   1.992  	else
   1.993  	    cnt_punct++;
   1.994      }
   1.995 @@ -1506,47 +1486,58 @@
   1.996  {
   1.997      const char *s,*t,*s1;
   1.998      int i;
   1.999 +    gsize len;
  1.1000      gboolean istypo;
  1.1001      gchar *testword;
  1.1002 +    gunichar *decomposition;
  1.1003      if (pswit[PARANOID_SWITCH])
  1.1004      {
  1.1005 -	for (t=aline;strstr(t,". ");)
  1.1006 +	for (t=aline;t=strstr(t,". ");)
  1.1007  	{
  1.1008 -	    t=strstr(t,". ");
  1.1009  	    if (t==aline)
  1.1010  	    {
  1.1011 -		t++;
  1.1012 +		t=g_utf8_next_char(t);
  1.1013  		/* start of line punctuation is handled elsewhere */
  1.1014  		continue;
  1.1015  	    }
  1.1016 -	    if (!gcisalpha(t[-1]))
  1.1017 +	    if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t))))
  1.1018  	    {
  1.1019 -		t++;
  1.1020 +		t=g_utf8_next_char(t);
  1.1021  		continue;
  1.1022  	    }
  1.1023  	    if (warnings->isDutch)
  1.1024  	    {
  1.1025  		/* For Frank & Jeroen -- 's Middags case */
  1.1026 -		if (t[2]==CHAR_SQUOTE && t[3]>='a' && t[3]<='z' &&
  1.1027 -		  t[4]==CHAR_SPACE && t[5]>='A' && t[5]<='Z')
  1.1028 +		gunichar c2,c3,c4,c5;
  1.1029 +		c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2));
  1.1030 +		c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
  1.1031 +		c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
  1.1032 +		c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
  1.1033 +		if (c2==CHAR_SQUOTE && g_unichar_islower(c3) &&
  1.1034 +		  c4==CHAR_SPACE && g_unichar_isupper(c5))
  1.1035  		{
  1.1036 -		    t++;
  1.1037 +		    t=g_utf8_next_char(t);
  1.1038  		    continue;
  1.1039  		}
  1.1040  	    }
  1.1041 -	    s1=t+2;
  1.1042 -	    while (*s1 && !gcisalpha(*s1) && !isdigit(*s1))
  1.1043 -		s1++;
  1.1044 -	    if (*s1>='a' && *s1<='z')
  1.1045 +	    s1=g_utf8_next_char(g_utf8_next_char(t));
  1.1046 +	    while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) &&
  1.1047 +	      !isdigit(g_utf8_get_char(s1)))
  1.1048 +		s1=g_utf8_next_char(s1);
  1.1049 +	    if (g_unichar_islower(g_utf8_get_char(s1)))
  1.1050  	    {
  1.1051  		/* we have something to investigate */
  1.1052  		istypo=TRUE;
  1.1053  		/* so let's go back and find out */
  1.1054 -		for (s1=t-1;s1>=aline &&
  1.1055 -		  (gcisalpha(*s1) || gcisdigit(*s1) || *s1==CHAR_SQUOTE &&
  1.1056 -		  gcisalpha(s1[1]) && gcisalpha(s1[-1]));s1--)
  1.1057 +		for (s1=g_utf8_prev_char(t);s1>=aline &&
  1.1058 +		  (g_unichar_isalpha(g_utf8_get_char(s1)) ||
  1.1059 +		  g_unichar_isdigit(g_utf8_get_char(s1)) ||
  1.1060 +		  g_utf8_get_char(s1)==CHAR_SQUOTE &&
  1.1061 +		  g_unichar_isalpha(g_utf8_get_char(g_utf8_next_char(s1))) &&
  1.1062 +		  g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s1))));
  1.1063 +		  s1=g_utf8_prev_char(s1))
  1.1064  		    ;
  1.1065 -		s1++;
  1.1066 +		s1=g_utf8_next_char(s1);
  1.1067  		s=strchr(s1,'.');
  1.1068  		if (s)
  1.1069  		    testword=g_strndup(s1,s-s1);
  1.1070 @@ -1555,18 +1546,23 @@
  1.1071  		for (i=0;*abbrev[i];i++)
  1.1072  		    if (!strcmp(testword,abbrev[i]))
  1.1073  			istypo=FALSE;
  1.1074 -		if (gcisdigit(*testword))
  1.1075 +		if (g_unichar_isdigit(g_utf8_get_char(testword)))
  1.1076  		    istypo=FALSE;
  1.1077 -		if (!testword[1])
  1.1078 +		if (!*g_utf8_next_char(testword))
  1.1079  		    istypo=FALSE;
  1.1080  		if (isroman(testword))
  1.1081  		    istypo=FALSE;
  1.1082  		if (istypo)
  1.1083  		{
  1.1084  		    istypo=FALSE;
  1.1085 -		    for (i=0;testword[i];i++)
  1.1086 -			if (strchr(vowels,testword[i]))
  1.1087 +		    for (s=testword;*s;s=g_utf8_next_char(s))
  1.1088 +		    {
  1.1089 +			decomposition=g_unicode_canonical_decomposition(
  1.1090 +			  g_utf8_get_char(s),&len);
  1.1091 +			if (g_utf8_strchr("aeiou",-1,decomposition[0]))
  1.1092  			    istypo=TRUE;
  1.1093 +			g_free(decomposition);
  1.1094 +		    }
  1.1095  		}
  1.1096  		if (istypo &&
  1.1097  		  (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword)))
  1.1098 @@ -1574,16 +1570,16 @@
  1.1099  		    g_tree_insert(qperiod,g_strdup(testword),
  1.1100  		      GINT_TO_POINTER(1));
  1.1101  		    if (pswit[ECHO_SWITCH])
  1.1102 -			printf("\n%s\n",aline);
  1.1103 +			g_print("\n%s\n",aline);
  1.1104  		    if (!pswit[OVERVIEW_SWITCH])
  1.1105 -			printf("    Line %ld column %d - Extra period?\n",
  1.1106 -			  linecnt,(int)(t-aline)+1);
  1.1107 +			g_print("    Line %ld column %ld - Extra period?\n",
  1.1108 +			  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  1.1109  		    else
  1.1110  			cnt_punct++;
  1.1111  		}
  1.1112  		g_free(testword);
  1.1113  	    }
  1.1114 -	    t++;
  1.1115 +	    t=g_utf8_next_char(t);
  1.1116  	}
  1.1117      }
  1.1118  }
  1.1119 @@ -1597,6 +1593,7 @@
  1.1120  {
  1.1121      int i;
  1.1122      const char *s,*wordstart;
  1.1123 +    gunichar c;
  1.1124      gchar *inword,*t;
  1.1125      if (pswit[TYPO_SWITCH])
  1.1126      {
  1.1127 @@ -1609,19 +1606,21 @@
  1.1128  		g_free(t);
  1.1129  		continue;
  1.1130  	    }
  1.1131 -	    inword=g_ascii_strdown(t,-1);
  1.1132 +	    inword=g_utf8_strdown(t,-1);
  1.1133  	    g_free(t);
  1.1134  	    for (i=0;*nocomma[i];i++)
  1.1135  		if (!strcmp(inword,nocomma[i]))
  1.1136  		{
  1.1137 -		    if (*s==',' || *s==';' || *s==':')
  1.1138 +		    c=g_utf8_get_char(s);
  1.1139 +		    if (c==',' || c==';' || c==':')
  1.1140  		    {
  1.1141  			if (pswit[ECHO_SWITCH])
  1.1142 -			    printf("\n%s\n",aline);
  1.1143 +			    g_print("\n%s\n",aline);
  1.1144  			if (!pswit[OVERVIEW_SWITCH])
  1.1145 -			    printf("    Line %ld column %d - "
  1.1146 +			    g_print("    Line %ld column %ld - "
  1.1147  			      "Query punctuation after %s?\n",
  1.1148 -			      linecnt,(int)(s-aline)+1,inword);
  1.1149 +			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,
  1.1150 +			      inword);
  1.1151  			else
  1.1152  			    cnt_punct++;
  1.1153  		    }
  1.1154 @@ -1629,14 +1628,16 @@
  1.1155  	    for (i=0;*noperiod[i];i++)
  1.1156  		if (!strcmp(inword,noperiod[i]))
  1.1157  		{
  1.1158 -		    if (*s=='.' || *s=='!')
  1.1159 +		    c=g_utf8_get_char(s);
  1.1160 +		    if (c=='.' || c=='!')
  1.1161  		    {
  1.1162  			if (pswit[ECHO_SWITCH])
  1.1163 -			    printf("\n%s\n",aline);
  1.1164 +			    g_print("\n%s\n",aline);
  1.1165  			if (!pswit[OVERVIEW_SWITCH])
  1.1166 -			    printf("    Line %ld column %d - "
  1.1167 +			    g_print("    Line %ld column %ld - "
  1.1168  			      "Query punctuation after %s?\n",
  1.1169 -			      linecnt,(int)(s-aline)+1,inword);
  1.1170 +			      linecnt,g_utf8_pointer_to_offset(aline,s)+1,
  1.1171 +			      inword);
  1.1172  			else
  1.1173  			    cnt_punct++;
  1.1174  		    }
  1.1175 @@ -1654,10 +1655,15 @@
  1.1176   */
  1.1177  void check_for_typos(const char *aline,struct warnings *warnings)
  1.1178  {
  1.1179 -    const char *s,*wordstart;
  1.1180 -    gchar *inword,*testword;
  1.1181 -    int i,alower,vowel,consonant,*dupcnt;
  1.1182 -    gboolean isdup,istypo;
  1.1183 +    const char *s,*t,*nt,*wordstart;
  1.1184 +    gchar *inword;
  1.1185 +    gunichar *decomposition;
  1.1186 +    gchar *testword;
  1.1187 +    int i,vowel,consonant,*dupcnt;
  1.1188 +    gboolean isdup,istypo,alower;
  1.1189 +    gunichar c;
  1.1190 +    long offset,len;
  1.1191 +    gsize decomposition_len;
  1.1192      for (s=aline;*s;)
  1.1193      {
  1.1194  	wordstart=s;
  1.1195 @@ -1670,10 +1676,10 @@
  1.1196  	if (mixdigit(inword))
  1.1197  	{
  1.1198  	    if (pswit[ECHO_SWITCH])
  1.1199 -		printf("\n%s\n",aline);
  1.1200 +		g_print("\n%s\n",aline);
  1.1201  	    if (!pswit[OVERVIEW_SWITCH])
  1.1202 -		printf("    Line %ld column %d - Query digit in %s\n",
  1.1203 -		  linecnt,(int)(wordstart-aline)+1,inword);
  1.1204 +		g_print("    Line %ld column %ld - Query digit in %s\n",
  1.1205 +		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword);
  1.1206  	    else
  1.1207  		cnt_word++;
  1.1208  	}
  1.1209 @@ -1684,14 +1690,15 @@
  1.1210  	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
  1.1211  	{
  1.1212  	    istypo=FALSE;
  1.1213 -	    testword=g_strdup(inword);
  1.1214 -	    alower=0;
  1.1215 -	    for (i=0;i<(int)strlen(testword);i++)
  1.1216 +	    alower=FALSE;
  1.1217 +	    for (t=inword;*t;t=g_utf8_next_char(t))
  1.1218  	    {
  1.1219 +		c=g_utf8_get_char(t);
  1.1220 +		nt=g_utf8_next_char(t);
  1.1221  		/* lowercase for testing */
  1.1222 -		if (testword[i]>='a' && testword[i]<='z')
  1.1223 -		    alower=1;
  1.1224 -		if (alower && testword[i]>='A' && testword[i]<='Z')
  1.1225 +		if (g_unichar_islower(c))
  1.1226 +		    alower=TRUE;
  1.1227 +		if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c)))
  1.1228  		{
  1.1229  		    /*
  1.1230  		     * We have an uppercase mid-word. However, there are
  1.1231 @@ -1699,15 +1706,18 @@
  1.1232  		     *   Mac and Mc like McGill
  1.1233  		     *   French contractions like l'Abbe
  1.1234  		     */
  1.1235 -		    if (i==2 && testword[0]=='m' && testword[1]=='c' ||
  1.1236 -		      i==3 && testword[0]=='m' && testword[1]=='a' &&
  1.1237 -		      testword[2]=='c' || i>0 && testword[i-1]==CHAR_SQUOTE)
  1.1238 +		    offset=g_utf8_pointer_to_offset(inword,t);
  1.1239 +		    if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
  1.1240 +		      offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
  1.1241 +		      g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
  1.1242 +		      offset>0 &&
  1.1243 +		      g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SQUOTE)
  1.1244  			; /* do nothing! */
  1.1245  		    else
  1.1246  			istypo=TRUE;
  1.1247  		}
  1.1248 -		testword[i]=(char)tolower(testword[i]);
  1.1249  	    }
  1.1250 +	    testword=g_utf8_casefold(inword,-1);
  1.1251  	}
  1.1252  	if (pswit[TYPO_SWITCH])
  1.1253  	{
  1.1254 @@ -1715,13 +1725,14 @@
  1.1255  	     * Check for certain unlikely two-letter combinations at word
  1.1256  	     * start and end.
  1.1257  	     */
  1.1258 -	    if (strlen(testword)>1)
  1.1259 +	    len=g_utf8_strlen(testword,-1);
  1.1260 +	    if (len>1)
  1.1261  	    {
  1.1262  		for (i=0;*nostart[i];i++)
  1.1263 -		    if (!strncmp(testword,nostart[i],2))
  1.1264 +		    if (g_str_has_prefix(testword,nostart[i]))
  1.1265  			istypo=TRUE;
  1.1266  		for (i=0;*noend[i];i++)
  1.1267 -		    if (!strncmp(testword+strlen(testword)-2,noend[i],2))
  1.1268 +		    if (g_str_has_suffix(testword,noend[i]))
  1.1269  			istypo=TRUE;
  1.1270  	    }
  1.1271  	    /* ght is common, gbt never. Like that. */
  1.1272 @@ -1755,21 +1766,25 @@
  1.1273  	     * Check for no vowels or no consonants.
  1.1274  	     * If none, flag a typo.
  1.1275  	     */
  1.1276 -	    if (!istypo && strlen(testword)>1)
  1.1277 +	    if (!istypo && len>1)
  1.1278  	    {
  1.1279  		vowel=consonant=0;
  1.1280 -		for (i=0;testword[i];i++)
  1.1281 +		for (t=testword;*t;t=g_utf8_next_char(t))
  1.1282  		{
  1.1283 -		    if (testword[i]=='y' || gcisdigit(testword[i]))
  1.1284 +		    c=g_utf8_get_char(t);
  1.1285 +		    decomposition=
  1.1286 +		      g_unicode_canonical_decomposition(c,&decomposition_len);
  1.1287 +		    if (c=='y' || g_unichar_isdigit(c))
  1.1288  		    {
  1.1289  			/* Yah, this is loose. */
  1.1290  			vowel++;
  1.1291  			consonant++;
  1.1292  		    }
  1.1293 -		    else if (strchr(vowels,testword[i]))
  1.1294 +		    else if (g_utf8_strchr("aeiou",-1,decomposition[0]))
  1.1295  			vowel++;
  1.1296  		    else
  1.1297  			consonant++;
  1.1298 +		    g_free(decomposition);
  1.1299  		}
  1.1300  		if (!vowel || !consonant)
  1.1301  		    istypo=TRUE;
  1.1302 @@ -1798,7 +1813,8 @@
  1.1303  	     *   "d" for a missing apostrophe - he d
  1.1304  	     *   "n" for "in"
  1.1305  	     */
  1.1306 -	    if (!istypo && strlen(testword)==1 && strchr("slmijdn",*inword))
  1.1307 +	    if (!istypo && len==1 &&
  1.1308 +	      g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword)))
  1.1309  		istypo=TRUE;
  1.1310  	    if (istypo)
  1.1311  	    {
  1.1312 @@ -1817,14 +1833,15 @@
  1.1313  		if (!isdup)
  1.1314  		{
  1.1315  		    if (pswit[ECHO_SWITCH])
  1.1316 -			printf("\n%s\n",aline);
  1.1317 +			g_print("\n%s\n",aline);
  1.1318  		    if (!pswit[OVERVIEW_SWITCH])
  1.1319  		    {
  1.1320 -			printf("    Line %ld column %d - Query word %s",
  1.1321 -			  linecnt,(int)(wordstart-aline)+1,inword);
  1.1322 +			g_print("    Line %ld column %ld - Query word %s",
  1.1323 +			  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,
  1.1324 +			  inword);
  1.1325  			if (!pswit[VERBOSE_SWITCH])
  1.1326 -			    printf(" - not reporting duplicates");
  1.1327 -			printf("\n");
  1.1328 +			    g_print(" - not reporting duplicates");
  1.1329 +			g_print("\n");
  1.1330  		    }
  1.1331  		    else
  1.1332  			cnt_word++;
  1.1333 @@ -1835,10 +1852,10 @@
  1.1334  	if (!istypo && usertypo && g_tree_lookup(usertypo,testword))
  1.1335  	{
  1.1336  	    if (pswit[ECHO_SWITCH])
  1.1337 -		printf("\n%s\n",aline);
  1.1338 +		g_print("\n%s\n",aline);
  1.1339  	    if (!pswit[OVERVIEW_SWITCH])  
  1.1340 -		printf("    Line %ld column %d - Query possible scanno %s\n",
  1.1341 -		  linecnt,(int)(wordstart-aline)+2,inword);
  1.1342 +		g_print("    Line %ld column %ld - Query possible scanno %s\n",
  1.1343 +		  linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword);
  1.1344  	}
  1.1345  	if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH])
  1.1346  	    g_free(testword);
  1.1347 @@ -1848,10 +1865,11 @@
  1.1348  	    if (!strcmp(inword,"0") || !strcmp(inword,"1"))
  1.1349  	    {
  1.1350  		if (pswit[ECHO_SWITCH])
  1.1351 -		    printf("\n%s\n",aline);
  1.1352 +		    g_print("\n%s\n",aline);
  1.1353  		if (!pswit[OVERVIEW_SWITCH])
  1.1354 -		    printf("    Line %ld column %d - Query standalone %s\n",
  1.1355 -		      linecnt,(int)(wordstart-aline)+2,inword);
  1.1356 +		    g_print("    Line %ld column %ld - Query standalone %s\n",
  1.1357 +		      linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,
  1.1358 +		      inword);
  1.1359  		else
  1.1360  		    cnt_word++;
  1.1361  	    }
  1.1362 @@ -1873,63 +1891,73 @@
  1.1363  void check_for_misspaced_punctuation(const char *aline,
  1.1364    struct parities *parities,gboolean isemptyline)
  1.1365  {
  1.1366 -    int i,llen;
  1.1367      gboolean isacro,isellipsis;
  1.1368      const char *s;
  1.1369 -    llen=strlen(aline);
  1.1370 -    for (i=1;i<llen;i++)
  1.1371 +    gunichar c,nc,pc,n2c;
  1.1372 +    c=g_utf8_get_char(aline);
  1.1373 +    nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  1.1374 +    for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  1.1375      {
  1.1376 +	pc=c;
  1.1377 +	c=nc;
  1.1378 +	nc=g_utf8_get_char(g_utf8_next_char(s));
  1.1379  	/* For each character in the line after the first. */
  1.1380 -	if (strchr(".?!,;:_",aline[i]))  /* if it's punctuation */
  1.1381 +	if (g_utf8_strchr(".?!,;:_",-1,c))  /* if it's punctuation */
  1.1382  	{
  1.1383  	    /* we need to suppress warnings for acronyms like M.D. */
  1.1384  	    isacro=FALSE;
  1.1385  	    /* we need to suppress warnings for ellipsis . . . */
  1.1386  	    isellipsis=FALSE;
  1.1387 -	    /* if there are letters on both sides of it or ... */
  1.1388 -	    if (gcisalpha(aline[i-1]) && gcisalpha(aline[i+1]) ||
  1.1389 -	       gcisalpha(aline[i+1]) && strchr("?!,;:",aline[i]))
  1.1390 +	    /*
  1.1391 +	     * If there are letters on both sides of it or
  1.1392 +	     * if it's strict punctuation followed by an alpha.
  1.1393 +	     */
  1.1394 +	    if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
  1.1395 +	      g_utf8_strchr("?!,;:",-1,c)))
  1.1396  	    {
  1.1397 -		/* ...if it's strict punctuation followed by an alpha */
  1.1398 -		if (aline[i]=='.')
  1.1399 +		if (c=='.')
  1.1400  		{
  1.1401 -		    if (i>2 && aline[i-2]=='.')
  1.1402 +		    if (g_utf8_pointer_to_offset(aline,s)>2 &&
  1.1403 +		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
  1.1404  			isacro=TRUE;
  1.1405 -		    if (i+2<llen && aline[i+2]=='.')
  1.1406 +		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
  1.1407 +		    if (nc && n2c=='.')
  1.1408  			isacro=TRUE;
  1.1409  		}
  1.1410  		if (!isacro)
  1.1411  		{
  1.1412  		    if (pswit[ECHO_SWITCH])
  1.1413 -			printf("\n%s\n",aline);
  1.1414 +			g_print("\n%s\n",aline);
  1.1415  		    if (!pswit[OVERVIEW_SWITCH])
  1.1416 -			printf("    Line %ld column %d - Missing space?\n",
  1.1417 -			  linecnt,i+1);
  1.1418 +			g_print("    Line %ld column %ld - Missing space?\n",
  1.1419 +			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1.1420  		    else
  1.1421  			cnt_punct++;
  1.1422  		}
  1.1423  	    }
  1.1424 -	    if (aline[i-1]==CHAR_SPACE &&
  1.1425 -	      (aline[i+1]==CHAR_SPACE || aline[i+1]==0))
  1.1426 +	    if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc))
  1.1427  	    {
  1.1428  		/*
  1.1429  		 * If there are spaces on both sides,
  1.1430  		 * or space before and end of line.
  1.1431  		 */
  1.1432 -		if (aline[i]=='.')
  1.1433 +		if (c=='.')
  1.1434  		{
  1.1435 -		    if (i>2 && aline[i-2]=='.')
  1.1436 +		    if (g_utf8_pointer_to_offset(aline,s)>2 &&
  1.1437 +		      g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.')
  1.1438  			isellipsis=TRUE;
  1.1439 -		    if (i+2<llen && aline[i+2]=='.')
  1.1440 +		    n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)));
  1.1441 +		    if (nc && n2c=='.')
  1.1442  			isellipsis=TRUE;
  1.1443  		}
  1.1444  		if (!isemptyline && !isellipsis)
  1.1445  		{
  1.1446  		    if (pswit[ECHO_SWITCH])
  1.1447 -			printf("\n%s\n",aline);
  1.1448 +			g_print("\n%s\n",aline);
  1.1449  		    if (!pswit[OVERVIEW_SWITCH])
  1.1450 -			printf("    Line %ld column %d - "
  1.1451 -			  "Spaced punctuation?\n",linecnt,i+1);
  1.1452 +			g_print("    Line %ld column %ld - "
  1.1453 +			  "Spaced punctuation?\n",linecnt,
  1.1454 +			  g_utf8_pointer_to_offset(aline,s)+1);
  1.1455  		    else
  1.1456  			cnt_punct++;
  1.1457  		}
  1.1458 @@ -1937,25 +1965,28 @@
  1.1459  	}
  1.1460      }
  1.1461      /* Split out the characters that CANNOT be preceded by space. */
  1.1462 -    llen=strlen(aline);
  1.1463 -    for (i=1;i<llen;i++)
  1.1464 +    c=g_utf8_get_char(aline);
  1.1465 +    nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  1.1466 +    for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  1.1467      {
  1.1468 +	pc=c;
  1.1469 +	c=nc;
  1.1470 +	nc=g_utf8_get_char(g_utf8_next_char(s));
  1.1471  	/* for each character in the line after the first */
  1.1472 -	if (strchr("?!,;:",aline[i]))
  1.1473 +	if (g_utf8_strchr("?!,;:",-1,c))
  1.1474  	{
  1.1475  	    /* if it's punctuation that _cannot_ have a space before it */
  1.1476 -	    if (aline[i-1]==CHAR_SPACE && !isemptyline &&
  1.1477 -	      aline[i+1]!=CHAR_SPACE)
  1.1478 +	    if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE)
  1.1479  	    {
  1.1480  		/*
  1.1481 -		 * If aline[i+1) DOES == space,
  1.1482 +		 * If nc DOES == space,
  1.1483  		 * it was already reported just above.
  1.1484  		 */
  1.1485  		if (pswit[ECHO_SWITCH])
  1.1486 -		    printf("\n%s\n",aline);
  1.1487 +		    g_print("\n%s\n",aline);
  1.1488  		if (!pswit[OVERVIEW_SWITCH])
  1.1489 -		    printf("    Line %ld column %d - Spaced punctuation?\n",
  1.1490 -		      linecnt,i+1);
  1.1491 +		    g_print("    Line %ld column %ld - Spaced punctuation?\n",
  1.1492 +		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1.1493  		else
  1.1494  		    cnt_punct++;
  1.1495  	    }
  1.1496 @@ -1966,64 +1997,77 @@
  1.1497       * This plugs a hole in the acronym code above.
  1.1498       * Inelegant, but maintainable.
  1.1499       */
  1.1500 -    llen=strlen(aline);
  1.1501 -    for (i=1;i<llen;i++)
  1.1502 +    c=g_utf8_get_char(aline);
  1.1503 +    nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  1.1504 +    for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  1.1505      {
  1.1506 +	pc=c;
  1.1507 +	c=nc;
  1.1508 +	nc=g_utf8_get_char(g_utf8_next_char(s));
  1.1509  	/* for each character in the line after the first */
  1.1510 -	if (aline[i]=='.')
  1.1511 +	if (c=='.')
  1.1512  	{
  1.1513  	    /* if it's a period */
  1.1514 -	    if (aline[i-1]==CHAR_SPACE && gcisalpha(aline[i+1]))
  1.1515 +	    if (pc==CHAR_SPACE && g_unichar_isalpha(nc))
  1.1516  	    {
  1.1517  		/*
  1.1518  		 * If the period follows a space and
  1.1519  		 * is followed by a letter.
  1.1520  		 */
  1.1521  		if (pswit[ECHO_SWITCH])
  1.1522 -		    printf("\n%s\n",aline);
  1.1523 +		    g_print("\n%s\n",aline);
  1.1524  		if (!pswit[OVERVIEW_SWITCH])
  1.1525 -		    printf("    Line %ld column %d - Spaced punctuation?\n",
  1.1526 -		      linecnt,i+1);
  1.1527 +		    g_print("    Line %ld column %ld - Spaced punctuation?\n",
  1.1528 +		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1.1529  		else
  1.1530  		    cnt_punct++;
  1.1531  	    }
  1.1532  	}
  1.1533      }
  1.1534 -    for (i=1;i<llen;i++)
  1.1535 +    c=g_utf8_get_char(aline);
  1.1536 +    nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  1.1537 +    for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  1.1538      {
  1.1539 +	pc=c;
  1.1540 +	c=nc;
  1.1541 +	nc=g_utf8_get_char(g_utf8_next_char(s));
  1.1542  	/* for each character in the line after the first */
  1.1543 -	if (aline[i]==CHAR_DQUOTE)
  1.1544 +	if (c==CHAR_DQUOTE)
  1.1545  	{
  1.1546 -	    if (!strchr(" _-.'`,;:!/([{?}])",aline[i-1]) &&
  1.1547 -	      !strchr(" _-.'`,;:!/([{?}])",aline[i+1]) && aline[i+1] ||
  1.1548 -	      !strchr(" _-([{'`",aline[i-1]) && gcisalpha(aline[i+1]))
  1.1549 +	    if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) &&
  1.1550 +	      !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc ||
  1.1551 +	      !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc))
  1.1552  	    {
  1.1553  		if (pswit[ECHO_SWITCH])
  1.1554 -		    printf("\n%s\n",aline);
  1.1555 +		    g_print("\n%s\n",aline);
  1.1556  		if (!pswit[OVERVIEW_SWITCH])
  1.1557 -		    printf("    Line %ld column %d - Unspaced quotes?\n",
  1.1558 -		      linecnt,i+1);
  1.1559 +		    g_print("    Line %ld column %ld - Unspaced quotes?\n",
  1.1560 +		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1.1561  		else
  1.1562  		    cnt_punct++;
  1.1563  	    }
  1.1564  	}
  1.1565      }
  1.1566      /* Check parity of quotes. */
  1.1567 -    for (s=aline;*s;s++)
  1.1568 +    nc=g_utf8_get_char(aline);
  1.1569 +    for (s=aline;*s;s=g_utf8_next_char(s))
  1.1570      {
  1.1571 -	if (*s==CHAR_DQUOTE)
  1.1572 +	c=nc;
  1.1573 +	nc=g_utf8_get_char(g_utf8_next_char(s));
  1.1574 +	if (c==CHAR_DQUOTE)
  1.1575  	{
  1.1576  	    parities->dquote=!parities->dquote;
  1.1577  	    if (!parities->dquote)
  1.1578  	    {
  1.1579  		/* parity even */
  1.1580 -		if (!strchr("_-.'`/,;:!?)]} ",s[1]))
  1.1581 +		if (!g_utf8_strchr("_-.'`/,;:!?)]} ",-1,nc))
  1.1582  		{
  1.1583  		    if (pswit[ECHO_SWITCH])
  1.1584 -			printf("\n%s\n",aline);
  1.1585 +			g_print("\n%s\n",aline);
  1.1586  		    if (!pswit[OVERVIEW_SWITCH])
  1.1587 -			printf("    Line %ld column %d - "
  1.1588 -			  "Wrongspaced quotes?\n",linecnt,(int)(s-aline)+1);
  1.1589 +			g_print("    Line %ld column %ld - "
  1.1590 +			  "Wrongspaced quotes?\n",
  1.1591 +			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1.1592  		    else
  1.1593  			cnt_punct++;
  1.1594  		}
  1.1595 @@ -2031,28 +2075,30 @@
  1.1596  	    else
  1.1597  	    {
  1.1598  		/* parity odd */
  1.1599 -		if (!gcisalpha(s[1]) && !isdigit(s[1]) &&
  1.1600 -		  !strchr("_-/.'`([{$",s[1]) || !s[1])
  1.1601 +		if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
  1.1602 +		  !g_utf8_strchr("_-/.'`([{$",-1,nc) || !nc)
  1.1603  		{
  1.1604  		    if (pswit[ECHO_SWITCH])
  1.1605 -			printf("\n%s\n",aline);
  1.1606 +			g_print("\n%s\n",aline);
  1.1607  		    if (!pswit[OVERVIEW_SWITCH])
  1.1608 -			printf("    Line %ld column %d - "
  1.1609 -			  "Wrongspaced quotes?\n",linecnt,(int)(s-aline)+1);
  1.1610 +			g_print("    Line %ld column %ld - "
  1.1611 +			  "Wrongspaced quotes?\n",
  1.1612 +			  linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1.1613  		    else
  1.1614  			cnt_punct++;
  1.1615  		}
  1.1616  	    }
  1.1617  	}
  1.1618      }
  1.1619 -    if (*aline==CHAR_DQUOTE)
  1.1620 +    if (g_utf8_get_char(aline)==CHAR_DQUOTE)
  1.1621      {
  1.1622 -	if (strchr(",;:!?)]} ",aline[1]))
  1.1623 +	if (g_utf8_strchr(",;:!?)]} ",-1,
  1.1624 +	  g_utf8_get_char(g_utf8_next_char(aline))))
  1.1625  	{
  1.1626  	    if (pswit[ECHO_SWITCH])
  1.1627 -		printf("\n%s\n",aline);
  1.1628 +		g_print("\n%s\n",aline);
  1.1629  	    if (!pswit[OVERVIEW_SWITCH])
  1.1630 -		printf("    Line %ld column 1 - Wrongspaced quotes?\n",
  1.1631 +		g_print("    Line %ld column 1 - Wrongspaced quotes?\n",
  1.1632  		  linecnt);
  1.1633  	    else
  1.1634  		cnt_punct++;
  1.1635 @@ -2060,24 +2106,28 @@
  1.1636      }
  1.1637      if (pswit[SQUOTE_SWITCH])
  1.1638      {
  1.1639 -	for (s=aline;*s;s++)
  1.1640 +	nc=g_utf8_get_char(aline);
  1.1641 +	for (s=aline;*s;s=g_utf8_next_char(s))
  1.1642  	{
  1.1643 -	    if ((*s==CHAR_SQUOTE || *s==CHAR_OPEN_SQUOTE) &&
  1.1644 -	      (s==aline || s>aline && !gcisalpha(s[-1]) ||
  1.1645 -	      !gcisalpha(s[1])))
  1.1646 +	    c=nc;
  1.1647 +	    nc=g_utf8_get_char(g_utf8_next_char(s));
  1.1648 +	    if ((c==CHAR_SQUOTE || c==CHAR_OPEN_SQUOTE) && (s==aline ||
  1.1649 +	      s>aline &&
  1.1650 +	      !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
  1.1651 +	      !g_unichar_isalpha(nc)))
  1.1652  	    {
  1.1653  		parities->squote=!parities->squote;
  1.1654  		if (!parities->squote)
  1.1655  		{
  1.1656  		    /* parity even */
  1.1657 -		    if (!strchr("_-.'`/\",;:!?)]} ",s[1]))
  1.1658 +		    if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc))
  1.1659  		    {
  1.1660  			if (pswit[ECHO_SWITCH])
  1.1661 -			    printf("\n%s\n",aline);
  1.1662 +			    g_print("\n%s\n",aline);
  1.1663  			if (!pswit[OVERVIEW_SWITCH])
  1.1664 -			    printf("    Line %ld column %d - "
  1.1665 +			    g_print("    Line %ld column %ld - "
  1.1666  			      "Wrongspaced singlequotes?\n",
  1.1667 -			      linecnt,(int)(s-aline)+1);
  1.1668 +			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1.1669  			else
  1.1670  			    cnt_punct++;
  1.1671  		    }
  1.1672 @@ -2085,15 +2135,15 @@
  1.1673  		else
  1.1674  		{
  1.1675  		    /* parity odd */
  1.1676 -		    if (!gcisalpha(s[1]) && !isdigit(s[1]) &&
  1.1677 -		      !strchr("_-/\".'`",s[1]) || !s[1])
  1.1678 +		    if (!g_unichar_isalpha(nc) && !isdigit(nc) &&
  1.1679 +		      !g_utf8_strchr("_-/\".'`",-1,nc) || !nc)
  1.1680  		    {
  1.1681  			if (pswit[ECHO_SWITCH])
  1.1682 -			    printf("\n%s\n",aline);
  1.1683 +			    g_print("\n%s\n",aline);
  1.1684  			if (!pswit[OVERVIEW_SWITCH])
  1.1685 -			    printf("    Line %ld column %d - "
  1.1686 +			    g_print("    Line %ld column %ld - "
  1.1687  			      "Wrongspaced singlequotes?\n",
  1.1688 -			      linecnt,(int)(s-aline)+1);
  1.1689 +			      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1.1690  			else
  1.1691  			    cnt_punct++;
  1.1692  		    }
  1.1693 @@ -2117,49 +2167,54 @@
  1.1694   */
  1.1695  void check_for_double_punctuation(const char *aline,struct warnings *warnings)
  1.1696  {
  1.1697 -    int i,llen;
  1.1698 -    llen=strlen(aline);
  1.1699 -    for (i=0;i<llen;i++)
  1.1700 +    const char *s;
  1.1701 +    gunichar c,nc;
  1.1702 +    nc=g_utf8_get_char(aline);
  1.1703 +    for (s=aline;*s;s=g_utf8_next_char(s))
  1.1704      {
  1.1705 +	c=nc;
  1.1706 +	nc=g_utf8_get_char(g_utf8_next_char(s));
  1.1707  	/* for each punctuation character in the line */
  1.1708 -	if (strchr(".?!,;:",aline[i]) && strchr(".?!,;:",aline[i+1]) &&
  1.1709 -	  aline[i] && aline[i+1])
  1.1710 +	if (c && nc && g_utf8_strchr(".?!,;:",-1,c) &&
  1.1711 +	  g_utf8_strchr(".?!,;:",-1,nc))
  1.1712  	{
  1.1713  	    /* followed by punctuation, it's a query, unless . . . */
  1.1714 -	    if (aline[i]==aline[i+1] && (aline[i]=='.' || aline[i]=='?' ||
  1.1715 -	      aline[i]=='!') ||
  1.1716 -	      !warnings->dotcomma && aline[i]=='.' && aline[i+1]==',' ||
  1.1717 -	      warnings->isFrench && !strncmp(aline+i,",...",4) ||
  1.1718 -	      warnings->isFrench && !strncmp(aline+i,"...,",4) ||
  1.1719 -	      warnings->isFrench && !strncmp(aline+i,";...",4) ||
  1.1720 -	      warnings->isFrench && !strncmp(aline+i,"...;",4) ||
  1.1721 -	      warnings->isFrench && !strncmp(aline+i,":...",4) ||
  1.1722 -	      warnings->isFrench && !strncmp(aline+i,"...:",4) ||
  1.1723 -	      warnings->isFrench && !strncmp(aline+i,"!...",4) ||
  1.1724 -	      warnings->isFrench && !strncmp(aline+i,"...!",4) ||
  1.1725 -	      warnings->isFrench && !strncmp(aline+i,"?...",4) ||
  1.1726 -	      warnings->isFrench && !strncmp(aline+i,"...?",4))
  1.1727 +	    if (c==nc && (c=='.' || c=='?' || c=='!') ||
  1.1728 +	      !warnings->dotcomma && c=='.' && nc==',' ||
  1.1729 +	      warnings->isFrench && g_str_has_prefix(s,",...") ||
  1.1730 +	      warnings->isFrench && g_str_has_prefix(s,"...,") ||
  1.1731 +	      warnings->isFrench && g_str_has_prefix(s,";...") ||
  1.1732 +	      warnings->isFrench && g_str_has_prefix(s,"...;") ||
  1.1733 +	      warnings->isFrench && g_str_has_prefix(s,":...") ||
  1.1734 +	      warnings->isFrench && g_str_has_prefix(s,"...:") ||
  1.1735 +	      warnings->isFrench && g_str_has_prefix(s,"!...") ||
  1.1736 +	      warnings->isFrench && g_str_has_prefix(s,"...!") ||
  1.1737 +	      warnings->isFrench && g_str_has_prefix(s,"?...") ||
  1.1738 +	      warnings->isFrench && g_str_has_prefix(s,"...?"))
  1.1739  	    {
  1.1740 -		if (warnings->isFrench && !strncmp(aline+i,",...",4) ||
  1.1741 -		  warnings->isFrench && !strncmp(aline+i,"...,",4) ||
  1.1742 -		  warnings->isFrench && !strncmp(aline+i,";...",4) ||
  1.1743 -		  warnings->isFrench && !strncmp(aline+i,"...;",4) ||
  1.1744 -		  warnings->isFrench && !strncmp(aline+i,":...",4) ||
  1.1745 -		  warnings->isFrench && !strncmp(aline+i,"...:",4) ||
  1.1746 -		  warnings->isFrench && !strncmp(aline+i,"!...",4) ||
  1.1747 -		  warnings->isFrench && !strncmp(aline+i,"...!",4) ||
  1.1748 -		  warnings->isFrench && !strncmp(aline+i,"?...",4) ||
  1.1749 -		  warnings->isFrench && !strncmp(aline+i,"...?",4))
  1.1750 -		    i+=4;
  1.1751 +		if (warnings->isFrench && g_str_has_prefix(s,",...") ||
  1.1752 +		  warnings->isFrench && g_str_has_prefix(s,"...,") ||
  1.1753 +		  warnings->isFrench && g_str_has_prefix(s,";...") ||
  1.1754 +		  warnings->isFrench && g_str_has_prefix(s,"...;") ||
  1.1755 +		  warnings->isFrench && g_str_has_prefix(s,":...") ||
  1.1756 +		  warnings->isFrench && g_str_has_prefix(s,"...:") ||
  1.1757 +		  warnings->isFrench && g_str_has_prefix(s,"!...") ||
  1.1758 +		  warnings->isFrench && g_str_has_prefix(s,"...!") ||
  1.1759 +		  warnings->isFrench && g_str_has_prefix(s,"?...") ||
  1.1760 +		  warnings->isFrench && g_str_has_prefix(s,"...?"))
  1.1761 +		{
  1.1762 +		    s+=4;
  1.1763 +		    nc=g_utf8_get_char(g_utf8_next_char(s));
  1.1764 +		}
  1.1765  		; /* do nothing for .. !! and ?? which can be legit */
  1.1766  	    }
  1.1767  	    else
  1.1768  	    {
  1.1769  		if (pswit[ECHO_SWITCH])
  1.1770 -		    printf("\n%s\n",aline);
  1.1771 +		    g_print("\n%s\n",aline);
  1.1772  		if (!pswit[OVERVIEW_SWITCH])
  1.1773 -		    printf("    Line %ld column %d - Double punctuation?\n",
  1.1774 -		      linecnt,i+1);
  1.1775 +		    g_print("    Line %ld column %ld - Double punctuation?\n",
  1.1776 +		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1.1777  		else
  1.1778  		    cnt_punct++;
  1.1779  	    }
  1.1780 @@ -2177,37 +2232,37 @@
  1.1781      while ((t=strstr(s," \" ")))
  1.1782      {
  1.1783  	if (pswit[ECHO_SWITCH])
  1.1784 -	    printf("\n%s\n",aline);
  1.1785 +	    g_print("\n%s\n",aline);
  1.1786  	if (!pswit[OVERVIEW_SWITCH])
  1.1787 -	    printf("    Line %ld column %d - Spaced doublequote?\n",
  1.1788 -	      linecnt,(int)(t-aline+1));
  1.1789 +	    g_print("    Line %ld column %ld - Spaced doublequote?\n",
  1.1790 +	      linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  1.1791  	else
  1.1792  	    cnt_punct++;
  1.1793 -	s=t+2;
  1.1794 +	s=g_utf8_next_char(g_utf8_next_char(t));
  1.1795      }
  1.1796      s=aline;
  1.1797      while ((t=strstr(s," ' ")))
  1.1798      {
  1.1799  	if (pswit[ECHO_SWITCH])
  1.1800 -	    printf("\n%s\n",aline);
  1.1801 +	    g_print("\n%s\n",aline);
  1.1802  	if (!pswit[OVERVIEW_SWITCH])
  1.1803 -	    printf("    Line %ld column %d - Spaced singlequote?\n",
  1.1804 -	      linecnt,(int)(t-aline+1));
  1.1805 +	    g_print("    Line %ld column %ld - Spaced singlequote?\n",
  1.1806 +	      linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  1.1807  	else
  1.1808  	    cnt_punct++;
  1.1809 -	s=t+2;
  1.1810 +	s=g_utf8_next_char(g_utf8_next_char(t));
  1.1811      }
  1.1812      s=aline;
  1.1813      while ((t=strstr(s," ` ")))
  1.1814      {
  1.1815  	if (pswit[ECHO_SWITCH])
  1.1816 -	    printf("\n%s\n",aline);
  1.1817 +	    g_print("\n%s\n",aline);
  1.1818  	if (!pswit[OVERVIEW_SWITCH])
  1.1819 -	    printf("    Line %ld column %d - Spaced singlequote?\n",
  1.1820 -	      linecnt,(int)(t-aline+1));
  1.1821 +	    g_print("    Line %ld column %ld - Spaced singlequote?\n",
  1.1822 +	      linecnt,g_utf8_pointer_to_offset(aline,t)+1);
  1.1823  	else
  1.1824  	    cnt_punct++;
  1.1825 -	s=t+2;
  1.1826 +	s=g_utf8_next_char(g_utf8_next_char(t));
  1.1827      }
  1.1828  }
  1.1829  
  1.1830 @@ -2219,22 +2274,26 @@
  1.1831  void check_for_miscased_genative(const char *aline)
  1.1832  {
  1.1833      const char *s;
  1.1834 +    gunichar c,nc,pc;
  1.1835      if (!*aline)
  1.1836  	return;
  1.1837 -    s=aline+1;
  1.1838 -    while (*s)
  1.1839 +    c=g_utf8_get_char(aline);
  1.1840 +    nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  1.1841 +    for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  1.1842      {
  1.1843 -	if (*s==CHAR_SQUOTE && s[1]=='S' && s[-1]>='a' && s[-1]<='z')
  1.1844 +	pc=c;
  1.1845 +	c=nc;
  1.1846 +	nc=g_utf8_get_char(g_utf8_next_char(s));
  1.1847 +	if (c==CHAR_SQUOTE && nc=='S' && g_unichar_islower(pc))
  1.1848  	{
  1.1849  	    if (pswit[ECHO_SWITCH])
  1.1850 -		printf("\n%s\n",aline);
  1.1851 +		g_print("\n%s\n",aline);
  1.1852  	    if (!pswit[OVERVIEW_SWITCH])
  1.1853 -		printf("    Line %ld column %d - Capital \"S\"?\n",
  1.1854 -		  linecnt,(int)(s-aline+2));
  1.1855 +		g_print("    Line %ld column %ld - Capital \"S\"?\n",
  1.1856 +		  linecnt,g_utf8_pointer_to_offset(aline,s)+2);
  1.1857  	    else
  1.1858  		cnt_punct++;
  1.1859  	}
  1.1860 -	s++;
  1.1861      }
  1.1862  }
  1.1863  
  1.1864 @@ -2248,29 +2307,34 @@
  1.1865   */
  1.1866  void check_end_of_line(const char *aline,struct warnings *warnings)
  1.1867  {
  1.1868 -    int i,llen;
  1.1869 -    llen=strlen(aline);
  1.1870 -    if (llen>1)
  1.1871 +    int lbytes;
  1.1872 +    const char *s;
  1.1873 +    gunichar c1,c2;
  1.1874 +    lbytes=strlen(aline);
  1.1875 +    if (g_utf8_strlen(aline,lbytes)>1)
  1.1876      {
  1.1877 -	if (aline[llen-1]==CHAR_DQUOTE || aline[llen-1]==CHAR_SQUOTE ||
  1.1878 -	  aline[llen-1]==CHAR_OPEN_SQUOTE)
  1.1879 -	    if (aline[llen-2]==CHAR_SPACE)
  1.1880 -	    {
  1.1881 -		if (pswit[ECHO_SWITCH])
  1.1882 -		    printf("\n%s\n",aline);
  1.1883 -		if (!pswit[OVERVIEW_SWITCH])
  1.1884 -		    printf("    Line %ld column %d - Spaced quote?\n",
  1.1885 -		      linecnt,llen);
  1.1886 -		else
  1.1887 -		    cnt_punct++;
  1.1888 -	    }
  1.1889 -	if ((aline[0]==CHAR_SQUOTE || aline[0]==CHAR_OPEN_SQUOTE) &&
  1.1890 -	  aline[1]==CHAR_SPACE)
  1.1891 +	s=g_utf8_prev_char(aline+lbytes);
  1.1892 +	c1=g_utf8_get_char(s);
  1.1893 +	c2=g_utf8_get_char(g_utf8_prev_char(s));
  1.1894 +	if ((c1==CHAR_DQUOTE || c1==CHAR_SQUOTE || c1==CHAR_OPEN_SQUOTE) &&
  1.1895 +	  c2==CHAR_SPACE)
  1.1896  	{
  1.1897  	    if (pswit[ECHO_SWITCH])
  1.1898 -		printf("\n%s\n",aline);
  1.1899 +		g_print("\n%s\n",aline);
  1.1900  	    if (!pswit[OVERVIEW_SWITCH])
  1.1901 -		printf("    Line %ld column 1 - Spaced quote?\n",linecnt);
  1.1902 +		g_print("    Line %ld column %ld - Spaced quote?\n",linecnt,
  1.1903 +		  g_utf8_strlen(aline,lbytes));
  1.1904 +	    else
  1.1905 +		cnt_punct++;
  1.1906 +	}
  1.1907 +	c1=g_utf8_get_char(aline);
  1.1908 +	c2=g_utf8_get_char(g_utf8_next_char(aline));
  1.1909 +	if ((c1==CHAR_SQUOTE || c1==CHAR_OPEN_SQUOTE) && c2==CHAR_SPACE)
  1.1910 +	{
  1.1911 +	    if (pswit[ECHO_SWITCH])
  1.1912 +		g_print("\n%s\n",aline);
  1.1913 +	    if (!pswit[OVERVIEW_SWITCH])
  1.1914 +		g_print("    Line %ld column 1 - Spaced quote?\n",linecnt);
  1.1915  	    else
  1.1916  		cnt_punct++;
  1.1917  	}
  1.1918 @@ -2280,15 +2344,18 @@
  1.1919  	 */
  1.1920  	if (pswit[PARANOID_SWITCH] && warnings->hyphen)
  1.1921  	{
  1.1922 -	    for (i=llen-1;i>0 && (unsigned char)aline[i]<=CHAR_SPACE;i--)
  1.1923 +	    for (s=g_utf8_prev_char(aline+lbytes);
  1.1924 +	      s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s))
  1.1925  		;
  1.1926 -	    if (aline[i]=='-' && aline[i-1]!='-')
  1.1927 +	    if (g_utf8_get_char(s)=='-' &&
  1.1928 +	      g_utf8_get_char(g_utf8_prev_char(s))!='-')
  1.1929  	    {
  1.1930  		if (pswit[ECHO_SWITCH])
  1.1931 -		    printf("\n%s\n",aline);
  1.1932 +		    g_print("\n%s\n",aline);
  1.1933  		if (!pswit[OVERVIEW_SWITCH])
  1.1934 -		    printf("    Line %ld column %d - Hyphen at end of line?\n",
  1.1935 -		      linecnt,i);
  1.1936 +		    g_print("    Line %ld column %ld - "
  1.1937 +		      "Hyphen at end of line?\n",
  1.1938 +		      linecnt,g_utf8_pointer_to_offset(aline,s));
  1.1939  	    }
  1.1940  	}
  1.1941      }
  1.1942 @@ -2302,19 +2369,26 @@
  1.1943   */
  1.1944  void check_for_unspaced_bracket(const char *aline)
  1.1945  {
  1.1946 -    int i,llen;
  1.1947 -    llen=strlen(aline);
  1.1948 -    for (i=1;i<llen-1;i++)
  1.1949 +    const char *s;
  1.1950 +    gunichar c,nc,pc;
  1.1951 +    c=g_utf8_get_char(aline);
  1.1952 +    nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  1.1953 +    for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  1.1954      {
  1.1955 +	pc=c;
  1.1956 +	c=nc;
  1.1957 +	nc=g_utf8_get_char(g_utf8_next_char(s));
  1.1958 +	if (!nc)
  1.1959 +	    break;
  1.1960  	/* for each bracket character in the line except 1st & last */
  1.1961 -	if (strchr("{[()]}",aline[i]) && gcisalpha(aline[i-1]) &&
  1.1962 -	  gcisalpha(aline[i+1]))
  1.1963 +	if (g_utf8_strchr("{[()]}",-1,c) &&
  1.1964 +	  g_unichar_isalpha(pc) && g_unichar_isalpha(nc))
  1.1965  	{
  1.1966  	    if (pswit[ECHO_SWITCH])
  1.1967 -		printf("\n%s\n",aline);
  1.1968 +		g_print("\n%s\n",aline);
  1.1969  	    if (!pswit[OVERVIEW_SWITCH])
  1.1970 -		printf("    Line %ld column %d - Unspaced bracket?\n",
  1.1971 -		  linecnt,i);
  1.1972 +		g_print("    Line %ld column %ld - Unspaced bracket?\n",
  1.1973 +		  linecnt,g_utf8_pointer_to_offset(aline,s));
  1.1974  	    else
  1.1975  		cnt_punct++;
  1.1976  	}
  1.1977 @@ -2326,18 +2400,24 @@
  1.1978   */
  1.1979  void check_for_unpunctuated_endquote(const char *aline)
  1.1980  {
  1.1981 -    int i,llen;
  1.1982 -    llen=strlen(aline);
  1.1983 -    for (i=1;i<llen;i++)
  1.1984 +    const char *s;
  1.1985 +    gunichar c,nc,pc;
  1.1986 +    c=g_utf8_get_char(aline);
  1.1987 +    nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0;
  1.1988 +    for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s))
  1.1989      {
  1.1990 +	pc=c;
  1.1991 +	c=nc;
  1.1992 +	nc=g_utf8_get_char(g_utf8_next_char(s));
  1.1993  	/* for each character in the line except 1st */
  1.1994 -	if (aline[i]==CHAR_DQUOTE && isalpha(aline[i-1]))
  1.1995 +	if (c==CHAR_DQUOTE && isalpha(pc))
  1.1996  	{
  1.1997  	    if (pswit[ECHO_SWITCH])
  1.1998 -		printf("\n%s\n",aline);
  1.1999 +		g_print("\n%s\n",aline);
  1.2000  	    if (!pswit[OVERVIEW_SWITCH])
  1.2001 -		printf("    Line %ld column %d - "
  1.2002 -		  "endquote missing punctuation?\n",linecnt,i);
  1.2003 +		g_print("    Line %ld column %ld - "
  1.2004 +		  "endquote missing punctuation?\n",
  1.2005 +		  linecnt,g_utf8_pointer_to_offset(aline,s));
  1.2006  	    else
  1.2007  		cnt_punct++;
  1.2008  	}
  1.2009 @@ -2354,25 +2434,25 @@
  1.2010   */
  1.2011  void check_for_html_tag(const char *aline)
  1.2012  {
  1.2013 -    int i;
  1.2014      const char *open,*close;
  1.2015 -    open=strstr(aline,"<");
  1.2016 +    gchar *tag;
  1.2017 +    open=strchr(aline,'<');
  1.2018      if (open)
  1.2019      {
  1.2020 -	close=strstr(aline,">");
  1.2021 +	close=strchr(g_utf8_next_char(open),'>');
  1.2022  	if (close)
  1.2023  	{
  1.2024 -	    i=(int)(close-open+1);
  1.2025 -	    if (i>0)
  1.2026 +	    if (pswit[ECHO_SWITCH])
  1.2027 +		g_print("\n%s\n",aline);
  1.2028 +	    if (!pswit[OVERVIEW_SWITCH])
  1.2029  	    {
  1.2030 -		if (pswit[ECHO_SWITCH])
  1.2031 -		    printf("\n%s\n",aline);
  1.2032 -		if (!pswit[OVERVIEW_SWITCH])
  1.2033 -		    printf("    Line %ld column %d - HTML Tag? %*.*s \n",
  1.2034 -		      linecnt,(int)(open-aline)+1,i,i,open);
  1.2035 -		else
  1.2036 -		    cnt_html++;
  1.2037 +		tag=g_strndup(open,close-open+1);
  1.2038 +		g_print("    Line %ld column %ld - HTML Tag? %s \n",
  1.2039 +		  linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag);
  1.2040 +		g_free(tag);
  1.2041  	    }
  1.2042 +	    else
  1.2043 +		cnt_html++;
  1.2044  	}
  1.2045      }
  1.2046  }
  1.2047 @@ -2387,25 +2467,28 @@
  1.2048   */
  1.2049  void check_for_html_entity(const char *aline)
  1.2050  {
  1.2051 -    int i;
  1.2052      const char *s,*amp,*scolon;
  1.2053 -    amp=strstr(aline,"&");
  1.2054 +    gchar *entity;
  1.2055 +    amp=strchr(aline,'&');
  1.2056      if (amp)
  1.2057      {
  1.2058 -	scolon=strstr(aline,";");
  1.2059 +	scolon=strchr(amp,';');
  1.2060  	if (scolon)
  1.2061  	{
  1.2062 -	    i=(int)(scolon-amp+1);
  1.2063 -	    for (s=amp;s<scolon;s++)   
  1.2064 -		if (*s==CHAR_SPACE)
  1.2065 -		    i=0;		/* Don't report "Jones & Son;" */
  1.2066 -	    if (i>0)
  1.2067 +	    for (s=amp;s<scolon;s=g_utf8_next_char(s))   
  1.2068 +		if (g_utf8_get_char(s)==CHAR_SPACE)
  1.2069 +		    break;		/* Don't report "Jones & Son;" */
  1.2070 +	    if (s>=scolon)
  1.2071  	    {
  1.2072  		if (pswit[ECHO_SWITCH])
  1.2073 -		    printf("\n%s\n",aline);
  1.2074 +		    g_print("\n%s\n",aline);
  1.2075  		if (!pswit[OVERVIEW_SWITCH])
  1.2076 -		    printf("    Line %ld column %d - HTML symbol? %*.*s \n",
  1.2077 -		      linecnt,(int)(amp-aline)+1,i,i,amp);
  1.2078 +		{
  1.2079 +		    entity=g_strndup(amp,scolon-amp+1);
  1.2080 +		    g_print("    Line %ld column %d - HTML symbol? %s \n",
  1.2081 +		      linecnt,(int)(amp-aline)+1,entity);
  1.2082 +		    g_free(entity);
  1.2083 +		}
  1.2084  		else
  1.2085  		    cnt_html++;
  1.2086  	    }
  1.2087 @@ -2425,18 +2508,20 @@
  1.2088    struct pending *pending)
  1.2089  {
  1.2090      const char *s;
  1.2091 +    gunichar c;
  1.2092      s=aline;
  1.2093      while (*s==' ')
  1.2094  	s++;
  1.2095 +    c=g_utf8_get_char(s);
  1.2096      if (pending->dquote)
  1.2097      {
  1.2098 -	if (*s!=CHAR_DQUOTE || pswit[QPARA_SWITCH])
  1.2099 +	if (c!=CHAR_DQUOTE || pswit[QPARA_SWITCH])
  1.2100  	{
  1.2101  	    if (!pswit[OVERVIEW_SWITCH])
  1.2102  	    {
  1.2103  		if (pswit[ECHO_SWITCH])
  1.2104 -		    printf("\n%s\n",parastart);
  1.2105 -		puts(pending->dquote);
  1.2106 +		    g_print("\n%s\n",parastart);
  1.2107 +		g_print("%s\n",pending->dquote);
  1.2108  	    }
  1.2109  	    else
  1.2110  		cnt_dquot++;
  1.2111 @@ -2446,14 +2531,14 @@
  1.2112      }
  1.2113      if (pending->squote)
  1.2114      {
  1.2115 -	if (*s!=CHAR_SQUOTE && *s!=CHAR_OPEN_SQUOTE || pswit[QPARA_SWITCH] ||
  1.2116 +	if (c!=CHAR_SQUOTE && c!=CHAR_OPEN_SQUOTE || pswit[QPARA_SWITCH] ||
  1.2117  	  pending->squot)
  1.2118  	{
  1.2119  	    if (!pswit[OVERVIEW_SWITCH])
  1.2120  	    {
  1.2121  		if (pswit[ECHO_SWITCH])
  1.2122 -		    printf("\n%s\n",parastart);
  1.2123 -		puts(pending->squote);
  1.2124 +		    g_print("\n%s\n",parastart);
  1.2125 +		g_print("%s\n",pending->squote);
  1.2126  	    }
  1.2127  	    else
  1.2128  		cnt_squot++;
  1.2129 @@ -2466,8 +2551,8 @@
  1.2130  	if (!pswit[OVERVIEW_SWITCH])
  1.2131  	{
  1.2132  	    if (pswit[ECHO_SWITCH])
  1.2133 -		printf("\n%s\n",parastart);
  1.2134 -	    puts(pending->rbrack);
  1.2135 +		g_print("\n%s\n",parastart);
  1.2136 +	    g_print("%s\n",pending->rbrack);
  1.2137  	}
  1.2138  	else
  1.2139  	    cnt_brack++;
  1.2140 @@ -2479,8 +2564,8 @@
  1.2141  	if (!pswit[OVERVIEW_SWITCH])
  1.2142  	{
  1.2143  	    if (pswit[ECHO_SWITCH])
  1.2144 -		printf("\n%s\n",parastart);
  1.2145 -	    puts(pending->sbrack);
  1.2146 +		g_print("\n%s\n",parastart);
  1.2147 +	    g_print("%s\n",pending->sbrack);
  1.2148  	}
  1.2149  	else
  1.2150  	    cnt_brack++;
  1.2151 @@ -2492,8 +2577,8 @@
  1.2152  	if (!pswit[OVERVIEW_SWITCH])
  1.2153  	{
  1.2154  	    if (pswit[ECHO_SWITCH])
  1.2155 -		printf("\n%s\n",parastart);
  1.2156 -	    puts(pending->cbrack);
  1.2157 +		g_print("\n%s\n",parastart);
  1.2158 +	    g_print("%s\n",pending->cbrack);
  1.2159  	}
  1.2160  	else
  1.2161  	    cnt_brack++;
  1.2162 @@ -2505,8 +2590,8 @@
  1.2163  	if (!pswit[OVERVIEW_SWITCH])
  1.2164  	{
  1.2165  	    if (pswit[ECHO_SWITCH])
  1.2166 -		printf("\n%s\n",parastart);
  1.2167 -	    puts(pending->unders);
  1.2168 +		g_print("\n%s\n",parastart);
  1.2169 +	    g_print("%s\n",pending->unders);
  1.2170  	}
  1.2171  	else
  1.2172  	    cnt_brack++;
  1.2173 @@ -2577,12 +2662,14 @@
  1.2174  void check_for_omitted_punctuation(const char *prevline,
  1.2175    struct line_properties *last,int start_para_line)
  1.2176  {
  1.2177 -    int i;
  1.2178 +    gboolean letter_on_line=FALSE;
  1.2179      const char *s;
  1.2180 -    for (s=prevline,i=0;*s && !i;s++)
  1.2181 -	if (gcisletter(*s))
  1.2182 -	    /* use i to indicate the presence of a letter on the line */
  1.2183 -	    i=1;
  1.2184 +    for (s=prevline;*s;s=g_utf8_next_char(s))
  1.2185 +	if (g_unichar_isalpha(g_utf8_get_char(s)))
  1.2186 +	{
  1.2187 +	    letter_on_line=TRUE;
  1.2188 +	    break;
  1.2189 +	}
  1.2190      /*
  1.2191       * This next "if" is a problem.
  1.2192       * If we say "start_para_line <= linecnt - 1", that includes
  1.2193 @@ -2590,28 +2677,30 @@
  1.2194       * If we say "start_para_line < linecnt - 1" it doesn't, but then it
  1.2195       * misses genuine one-line paragraphs.
  1.2196       */
  1.2197 -    if (i && last->blen>2 && start_para_line<linecnt-1 && *prevline>CHAR_SPACE)
  1.2198 +    if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
  1.2199 +      g_utf8_get_char(prevline)>CHAR_SPACE)
  1.2200      {
  1.2201 -	for (i=strlen(prevline)-1;
  1.2202 -	  (prevline[i]==CHAR_DQUOTE || prevline[i]==CHAR_SQUOTE) &&
  1.2203 -	  prevline[i]>CHAR_SPACE && i>0;
  1.2204 -	  i--)
  1.2205 +	for (s=g_utf8_prev_char(prevline+strlen(prevline));
  1.2206 +	  (g_utf8_get_char(s)==CHAR_DQUOTE ||
  1.2207 +	  g_utf8_get_char(s)==CHAR_SQUOTE) &&
  1.2208 +	  g_utf8_get_char(s)>CHAR_SPACE && s>prevline;
  1.2209 +	  s=g_utf8_prev_char(s))
  1.2210  	    ;
  1.2211 -	for (;i>0;i--)
  1.2212 +	for (;s>prevline;s=g_utf8_prev_char(s))
  1.2213  	{
  1.2214 -	    if (gcisalpha(prevline[i]))
  1.2215 +	    if (g_unichar_isalpha(g_utf8_get_char(s)))
  1.2216  	    {
  1.2217  		if (pswit[ECHO_SWITCH])
  1.2218 -		    printf("\n%s\n",prevline);
  1.2219 +		    g_print("\n%s\n",prevline);
  1.2220  		if (!pswit[OVERVIEW_SWITCH])
  1.2221 -		    printf("    Line %ld column %d - "
  1.2222 +		    g_print("    Line %ld column %ld - "
  1.2223  		      "No punctuation at para end?\n",
  1.2224 -		      linecnt-1,(int)strlen(prevline));
  1.2225 +		      linecnt-1,g_utf8_strlen(prevline,-1));
  1.2226  		else
  1.2227  		    cnt_punct++;
  1.2228  		break;
  1.2229  	    }
  1.2230 -	    if (strchr("-.:!([{?}])",prevline[i]))
  1.2231 +	    if (g_utf8_strchr("-.:!([{?}])",-1,g_utf8_get_char(s)))
  1.2232  		break;
  1.2233  	}
  1.2234      }
  1.2235 @@ -2622,11 +2711,38 @@
  1.2236      const char *word=key;
  1.2237      int *dupcnt=value;
  1.2238      if (*dupcnt)
  1.2239 -	printf("\nNote: Queried word %s was duplicated %d times\n",
  1.2240 +	g_print("\nNote: Queried word %s was duplicated %d times\n",
  1.2241  	  word,*dupcnt);
  1.2242      return FALSE;
  1.2243  }
  1.2244  
  1.2245 +void print_as_windows_1252(const char *string)
  1.2246 +{
  1.2247 +    gsize inbytes,outbytes;
  1.2248 +    gchar *buf,*bp;
  1.2249 +    GIConv converter=(GIConv)-1;
  1.2250 +    if (!string)
  1.2251 +    {
  1.2252 +	if (converter!=(GIConv)-1)
  1.2253 +	    g_iconv_close(converter);
  1.2254 +	converter=(GIConv)-1;
  1.2255 +	return;
  1.2256 +    }
  1.2257 +    if (converter=(GIConv)-1)
  1.2258 +	converter=g_iconv_open("WINDOWS-1252","UTF-8");
  1.2259 +    if (converter!=(GIConv)-1)
  1.2260 +    {
  1.2261 +	inbytes=outbytes=strlen(string);
  1.2262 +	bp=buf=g_malloc(outbytes+1);
  1.2263 +	g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes);
  1.2264 +	*bp='\0';
  1.2265 +	fputs(buf,stdout);
  1.2266 +	g_free(buf);
  1.2267 +    }
  1.2268 +    else
  1.2269 +	fputs(string,stdout);
  1.2270 +}
  1.2271 +
  1.2272  /*
  1.2273   * procfile:
  1.2274   *
  1.2275 @@ -2659,7 +2775,8 @@
  1.2276  	    fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
  1.2277  	exit(1);
  1.2278      }
  1.2279 -    fprintf(stdout,"\n\nFile: %s\n\n",filename);
  1.2280 +    g_set_print_handler(print_as_windows_1252);
  1.2281 +    g_print("\n\nFile: %s\n\n",filename);
  1.2282      first_pass_results=first_pass(etext);
  1.2283      warnings=report_first_pass(first_pass_results);
  1.2284      qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free);
  1.2285 @@ -2674,7 +2791,7 @@
  1.2286  	linecnt++;
  1.2287  	if (linecnt==1)
  1.2288  	    isnewpara=TRUE;
  1.2289 -	if (pswit[DP_SWITCH] && !strncmp(aline,"-----File: ",11))
  1.2290 +	if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: "))
  1.2291  	    continue;    // skip DP page separators completely
  1.2292  	if (linecnt<first_pass_results->firstline ||
  1.2293  	  (first_pass_results->footerline>0 &&
  1.2294 @@ -2682,14 +2799,14 @@
  1.2295  	{
  1.2296  	    if (pswit[HEADER_SWITCH])
  1.2297  	    {
  1.2298 -		if (!strncmp(aline,"Title:",6))
  1.2299 -		    printf("    %s\n",aline);
  1.2300 -		if (!strncmp(aline,"Author:",7))
  1.2301 -		    printf("    %s\n",aline);
  1.2302 -		if (!strncmp(aline,"Release Date:",13))
  1.2303 -		    printf("    %s\n",aline);
  1.2304 -		if (!strncmp(aline,"Edition:",8))
  1.2305 -		    printf("    %s\n\n",aline);
  1.2306 +		if (g_str_has_prefix(aline,"Title:"))
  1.2307 +		    g_print("    %s\n",aline);
  1.2308 +		if (g_str_has_prefix(aline,"Author:"))
  1.2309 +		    g_print("    %s\n",aline);
  1.2310 +		if (g_str_has_prefix(aline,"Release Date:"))
  1.2311 +		    g_print("    %s\n",aline);
  1.2312 +		if (g_str_has_prefix(aline,"Edition:"))
  1.2313 +		    g_print("    %s\n\n",aline);
  1.2314  	    }
  1.2315  	    continue;		/* skip through the header */
  1.2316  	}
  1.2317 @@ -2706,36 +2823,38 @@
  1.2318  	    parastart=g_strdup(aline);
  1.2319  	    memset(&parities,0,sizeof(parities));  /* restart the quote count */
  1.2320  	    s=aline;
  1.2321 -	    while (!gcisalpha(*s) && !gcisdigit(*s) && *s)
  1.2322 -		s++;
  1.2323 -	    if (*s>='a' && *s<='z')
  1.2324 +	    while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) &&
  1.2325 +	      !g_unichar_isdigit(g_utf8_get_char(s)))
  1.2326 +		s=g_utf8_next_char(s);
  1.2327 +	    if (g_unichar_islower(g_utf8_get_char(s)))
  1.2328  	    {
  1.2329  		/* and its first letter is lowercase */
  1.2330  		if (pswit[ECHO_SWITCH])
  1.2331 -		    printf("\n%s\n",aline);
  1.2332 +		    g_print("\n%s\n",aline);
  1.2333  		if (!pswit[OVERVIEW_SWITCH])
  1.2334 -		    printf("    Line %ld column %d - "
  1.2335 +		    g_print("    Line %ld column %ld - "
  1.2336  		      "Paragraph starts with lower-case\n",
  1.2337 -		      linecnt,(int)(s-aline)+1);
  1.2338 +		      linecnt,g_utf8_pointer_to_offset(aline,s)+1);
  1.2339  		else
  1.2340  		    cnt_punct++;
  1.2341  	    }
  1.2342  	    isnewpara=FALSE; /* Signal the end of new para processing. */
  1.2343  	}
  1.2344  	/* Check for an em-dash broken at line end. */
  1.2345 -	if (enddash && *aline=='-')
  1.2346 +	if (enddash && g_utf8_get_char(aline)=='-')
  1.2347  	{
  1.2348  	    if (pswit[ECHO_SWITCH])
  1.2349 -		printf("\n%s\n",aline);
  1.2350 +		g_print("\n%s\n",aline);
  1.2351  	    if (!pswit[OVERVIEW_SWITCH])
  1.2352 -		printf("    Line %ld column 1 - Broken em-dash?\n",linecnt);
  1.2353 +		g_print("    Line %ld column 1 - Broken em-dash?\n",linecnt);
  1.2354  	    else
  1.2355  		cnt_punct++;
  1.2356  	}
  1.2357  	enddash=FALSE;
  1.2358 -	for (s=aline+strlen(aline)-1;*s==' ' && s>aline;s--)
  1.2359 +	for (s=g_utf8_prev_char(aline+strlen(aline));
  1.2360 +	  g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s))
  1.2361  	    ;
  1.2362 -	if (s>=aline && *s=='-')
  1.2363 +	if (s>=aline && g_utf8_get_char(s)=='-')
  1.2364  	    enddash=TRUE;
  1.2365  	check_for_control_characters(aline);
  1.2366  	if (warnings->bin)
  1.2367 @@ -2745,8 +2864,8 @@
  1.2368  	if (warnings->shortline)
  1.2369  	    check_for_short_line(aline,&last);
  1.2370  	last.blen=last.len;
  1.2371 -	last.len=strlen(aline);
  1.2372 -	last.start=aline[0];
  1.2373 +	last.len=g_utf8_strlen(aline,-1);
  1.2374 +	last.start=g_utf8_get_char(aline);
  1.2375  	check_for_starting_punctuation(aline);
  1.2376  	if (warnings->dash)
  1.2377  	{
  1.2378 @@ -2795,6 +2914,8 @@
  1.2379  	g_tree_foreach(qword,report_duplicate_queries,NULL);
  1.2380      g_tree_unref(qword);
  1.2381      g_tree_unref(qperiod);
  1.2382 +    g_set_print_handler(NULL);
  1.2383 +    print_as_windows_1252(NULL);
  1.2384  }
  1.2385  
  1.2386  /*
  1.2387 @@ -2807,14 +2928,15 @@
  1.2388   */
  1.2389  char *flgets(char **etext,long lcnt)
  1.2390  {
  1.2391 -    char c;
  1.2392 -    int len;
  1.2393 +    gunichar c;
  1.2394      gboolean isCR=FALSE;
  1.2395      char *theline=*etext;
  1.2396 -    len=0;
  1.2397 -    for(;;)
  1.2398 +    char *eos=theline;
  1.2399 +    gchar *s;
  1.2400 +    for (;;)
  1.2401      {
  1.2402 -	c=*(*etext)++;
  1.2403 +	c=g_utf8_get_char(*etext);
  1.2404 +	*etext=g_utf8_next_char(*etext);
  1.2405  	if (!c)
  1.2406  	    return NULL;
  1.2407  	/* either way, it's end of line */
  1.2408 @@ -2828,9 +2950,13 @@
  1.2409  		if (pswit[LINE_END_SWITCH])
  1.2410  		{
  1.2411  		    if (pswit[ECHO_SWITCH])
  1.2412 -			printf("\n%*.*s\n",len,len,theline);
  1.2413 +		    {
  1.2414 +			s=g_strndup(theline,eos-theline);
  1.2415 +			g_print("\n%s\n",s);
  1.2416 +			g_free(s);
  1.2417 +		    }
  1.2418  		    if (!pswit[OVERVIEW_SWITCH])
  1.2419 -			printf("    Line %ld - No CR?\n",lcnt);
  1.2420 +			g_print("    Line %ld - No CR?\n",lcnt);
  1.2421  		    else
  1.2422  			cnt_lineend++;
  1.2423  		}
  1.2424 @@ -2845,9 +2971,13 @@
  1.2425  		if (pswit[LINE_END_SWITCH])
  1.2426  		{
  1.2427  		    if (pswit[ECHO_SWITCH])
  1.2428 -			printf("\n%*.*s\n",len,len,theline);
  1.2429 +		    {
  1.2430 +			s=g_strndup(theline,eos-theline);
  1.2431 +			g_print("\n%s\n",s);
  1.2432 +			g_free(s);
  1.2433 +		    }
  1.2434  		    if (!pswit[OVERVIEW_SWITCH])
  1.2435 -			printf("    Line %ld - Two successive CRs?\n",lcnt);
  1.2436 +			g_print("    Line %ld - Two successive CRs?\n",lcnt);
  1.2437  		    else
  1.2438  			cnt_lineend++;
  1.2439  		}
  1.2440 @@ -2859,19 +2989,23 @@
  1.2441  	    if (pswit[LINE_END_SWITCH] && isCR)
  1.2442  	    {
  1.2443  		if (pswit[ECHO_SWITCH])
  1.2444 -		    printf("\n%*.*s\n",len,len,theline);
  1.2445 +		{
  1.2446 +		    s=g_strndup(theline,eos-theline);
  1.2447 +		    g_print("\n%s\n",s);
  1.2448 +		    g_free(s);
  1.2449 +		}
  1.2450  		if (!pswit[OVERVIEW_SWITCH])
  1.2451 -		    printf("    Line %ld column %d - CR without LF?\n",
  1.2452 -		      lcnt,len+1);
  1.2453 +		    g_print("    Line %ld column %ld - CR without LF?\n",
  1.2454 +		      lcnt,g_utf8_pointer_to_offset(theline,eos)+1);
  1.2455  		else
  1.2456  		    cnt_lineend++;
  1.2457 -		theline[len]=' ';
  1.2458 +		*eos=' ';
  1.2459  	    }
  1.2460  	    isCR=FALSE;
  1.2461 -	    len++;
  1.2462 +	    eos=g_utf8_next_char(eos);
  1.2463  	}
  1.2464      }
  1.2465 -    theline[len]='\0';
  1.2466 +    *eos='\0';
  1.2467      if (pswit[MARKUP_SWITCH])  
  1.2468  	postprocess_for_HTML(theline);
  1.2469      if (pswit[DP_SWITCH])  
  1.2470 @@ -2886,55 +3020,55 @@
  1.2471   * contains a mixture of alpha and digits. Generally, this is an
  1.2472   * error, but may not be for cases like 4th or L5 12s. 3d.
  1.2473   *
  1.2474 - * Returns: 0 if no error found, 1 if error.
  1.2475 + * Returns: TRUE iff an error is found.
  1.2476   */
  1.2477 -int mixdigit(const char *checkword)
  1.2478 +gboolean mixdigit(const char *checkword)
  1.2479  {
  1.2480 -    int wehaveadigit,wehavealetter,firstdigits,query,wl;
  1.2481 -    const char *s;
  1.2482 -    wehaveadigit=wehavealetter=query=0;
  1.2483 -    for (s=checkword;*s;s++)
  1.2484 -	if (gcisalpha(*s))
  1.2485 -	    wehavealetter=1;
  1.2486 -	else
  1.2487 -	    if (gcisdigit(*s))
  1.2488 -		wehaveadigit=1;
  1.2489 +    gboolean wehaveadigit,wehavealetter,query;
  1.2490 +    const char *s,*nondigit;
  1.2491 +    wehaveadigit=wehavealetter=query=FALSE;
  1.2492 +    for (s=checkword;*s;s=g_utf8_next_char(s))
  1.2493 +	if (g_unichar_isalpha(g_utf8_get_char(s)))
  1.2494 +	    wehavealetter=TRUE;
  1.2495 +	else if (g_unichar_isdigit(g_utf8_get_char(s)))
  1.2496 +	    wehaveadigit=TRUE;
  1.2497      if (wehaveadigit && wehavealetter)
  1.2498      {
  1.2499  	/* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
  1.2500 -	query=1;
  1.2501 -	wl=strlen(checkword);
  1.2502 -	for (firstdigits=0;gcisdigit(checkword[firstdigits]);firstdigits++)
  1.2503 +	query=TRUE;
  1.2504 +	for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit));
  1.2505 +	  nondigit=g_utf8_next_char(nondigit))
  1.2506  	    ;
  1.2507  	/* digits, ending in st, rd, nd, th of either case */
  1.2508 -	if (firstdigits+2==wl && (!g_ascii_strcasecmp(checkword+wl-2,"st") ||
  1.2509 -	  !g_ascii_strcasecmp(checkword+wl-2,"rd") ||
  1.2510 -	  !g_ascii_strcasecmp(checkword+wl-2,"nd") ||
  1.2511 -	  !g_ascii_strcasecmp(checkword+wl-2,"th")))
  1.2512 -	    query=0;
  1.2513 -	if (firstdigits+3==wl && (!g_ascii_strcasecmp(checkword+wl-3,"sts") ||
  1.2514 -	  !g_ascii_strcasecmp(checkword+wl-3,"rds") ||
  1.2515 -	  !g_ascii_strcasecmp(checkword+wl-3,"nds") ||
  1.2516 -	  !g_ascii_strcasecmp(checkword+wl-3,"ths")))
  1.2517 -	    query=0;
  1.2518 -	if (firstdigits+3==wl && (!g_ascii_strcasecmp(checkword+wl-4,"stly") ||
  1.2519 -	  !g_ascii_strcasecmp(checkword+wl-4,"rdly") ||
  1.2520 -	  !g_ascii_strcasecmp(checkword+wl-4,"ndly") ||
  1.2521 -	  !g_ascii_strcasecmp(checkword+wl-4,"thly")))
  1.2522 -	    query=0;
  1.2523 +	if (!g_ascii_strcasecmp(nondigit,"st") ||
  1.2524 +	  !g_ascii_strcasecmp(nondigit,"rd") ||
  1.2525 +	  !g_ascii_strcasecmp(nondigit,"nd") ||
  1.2526 +	  !g_ascii_strcasecmp(nondigit,"th"))
  1.2527 +	    query=FALSE;
  1.2528 +	if (!g_ascii_strcasecmp(nondigit,"sts") ||
  1.2529 +	  !g_ascii_strcasecmp(nondigit,"rds") ||
  1.2530 +	  !g_ascii_strcasecmp(nondigit,"nds") ||
  1.2531 +	  !g_ascii_strcasecmp(nondigit,"ths"))
  1.2532 +	    query=FALSE;
  1.2533 +	if (!g_ascii_strcasecmp(nondigit,"stly") ||
  1.2534 +	  !g_ascii_strcasecmp(nondigit,"rdly") ||
  1.2535 +	  !g_ascii_strcasecmp(nondigit,"ndly") ||
  1.2536 +	  !g_ascii_strcasecmp(nondigit,"thly"))
  1.2537 +	    query=FALSE;
  1.2538  	/* digits, ending in l, L, s or d */
  1.2539 -	if (firstdigits+1==wl && (checkword[wl-1]=='l' ||
  1.2540 -	  checkword[wl-1]=='L' || checkword[wl-1]=='s' || checkword[wl-1]=='d'))
  1.2541 -	    query=0;
  1.2542 +	if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") ||
  1.2543 +	  !strcmp(nondigit,"d"))
  1.2544 +	    query=FALSE;
  1.2545  	/*
  1.2546  	 * L at the start of a number, representing British pounds, like L500.
  1.2547 -	 * This is cute. We know the current word is mixeddigit. If the first
  1.2548 +	 * This is cute. We know the current word is mixed digit. If the first
  1.2549  	 * letter is L, there must be at least one digit following. If both
  1.2550  	 * digits and letters follow, we have a genuine error, else we have a
  1.2551  	 * capital L followed by digits, and we accept that as a non-error.
  1.2552  	 */
  1.2553 -	if (checkword[0]=='L' && !mixdigit(checkword+1))
  1.2554 -	    query=0;
  1.2555 +	if (g_utf8_get_char(checkword)=='L' &&
  1.2556 +	  !mixdigit(g_utf8_next_char(checkword)))
  1.2557 +	    query=FALSE;
  1.2558      }
  1.2559      return query;
  1.2560  }
  1.2561 @@ -2951,11 +3085,13 @@
  1.2562   */
  1.2563  gchar *getaword(const char **ptr)
  1.2564  {
  1.2565 -    int i;
  1.2566 -    const char *s;
  1.2567 +    const char *s,*t;
  1.2568      GString *word;
  1.2569 +    gunichar c,pc;
  1.2570      word=g_string_new(NULL);
  1.2571 -    for (;!gcisdigit(**ptr) && !gcisalpha(**ptr) && **ptr;(*ptr)++)
  1.2572 +    for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
  1.2573 +      !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
  1.2574 +      **ptr;*ptr=g_utf8_next_char(*ptr))
  1.2575  	;
  1.2576      /*
  1.2577       * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
  1.2578 @@ -2966,23 +3102,27 @@
  1.2579       * the results and resume our normal programming.
  1.2580       */
  1.2581      s=*ptr;
  1.2582 -    for (;gcisdigit(*s) || gcisalpha(*s) || *s==',' || *s=='.';s++)
  1.2583 -	g_string_append_c(word,*s);
  1.2584 -    for (i=1;i+1<word->len;i++)
  1.2585 +    for (;g_unichar_isdigit(g_utf8_get_char(s)) ||
  1.2586 +      g_unichar_isalpha(g_utf8_get_char(s)) ||
  1.2587 +      g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s))
  1.2588 +	g_string_append_unichar(word,g_utf8_get_char(s));
  1.2589 +    for (t=g_utf8_next_char(word->str);*g_utf8_next_char(t);
  1.2590 +      t=g_utf8_next_char(t))
  1.2591      {
  1.2592 -	if (word->str[i]=='.' || word->str[i]==',')
  1.2593 +	c=g_utf8_get_char(t);
  1.2594 +	pc=g_utf8_get_char(g_utf8_prev_char(t));
  1.2595 +	if ((c=='.' || c==',') && g_unichar_isdigit(pc))
  1.2596  	{
  1.2597 -	    if (gcisdigit(word->str[i-1]) && gcisdigit(word->str[i-1]))
  1.2598 -	    {
  1.2599 -		*ptr=s;
  1.2600 -		return g_string_free(word,FALSE);
  1.2601 -	    }
  1.2602 +	    *ptr=s;
  1.2603 +	    return g_string_free(word,FALSE);
  1.2604  	}
  1.2605      }
  1.2606      /* we didn't find a punctuated number - do the regular getword thing */
  1.2607      g_string_truncate(word,0);
  1.2608 -    for (;gcisdigit(**ptr) || gcisalpha(**ptr) || **ptr=='\'';(*ptr)++)
  1.2609 -	g_string_append_c(word,**ptr);
  1.2610 +    for (;g_unichar_isdigit(g_utf8_get_char(*ptr)) ||
  1.2611 +      g_unichar_isalpha(g_utf8_get_char(*ptr)) ||
  1.2612 +      g_utf8_get_char(*ptr)=='\'';*ptr=g_utf8_next_char(*ptr))
  1.2613 +	g_string_append_unichar(word,g_utf8_get_char(*ptr));
  1.2614      return g_string_free(word,FALSE);
  1.2615  }
  1.2616  
  1.2617 @@ -3006,82 +3146,36 @@
  1.2618      if (!t || !*t)
  1.2619  	return FALSE;
  1.2620      s=t;
  1.2621 -    while (*t=='m' && *t)
  1.2622 +    while (g_utf8_get_char(t)=='m' && *t)
  1.2623  	t++;
  1.2624 -    if (*t=='d')
  1.2625 +    if (g_utf8_get_char(t)=='d')
  1.2626  	t++;
  1.2627 -    if (*t=='c' && t[1]=='m')
  1.2628 +    if (g_str_has_prefix(t,"cm"))
  1.2629  	t+=2;
  1.2630 -    if (*t=='c' && t[1]=='d')
  1.2631 +    if (g_str_has_prefix(t,"cd"))
  1.2632  	t+=2;
  1.2633 -    while (*t=='c' && *t)
  1.2634 +    while (g_utf8_get_char(t)=='c' && *t)
  1.2635  	t++;
  1.2636 -    if (*t=='x' && t[1]=='l')
  1.2637 +    if (g_str_has_prefix(t,"xl"))
  1.2638  	t+=2;
  1.2639 -    if (*t=='x' && t[1]=='c')
  1.2640 +    if (g_str_has_prefix(t,"xc"))
  1.2641  	t+=2;
  1.2642 -    if (*t=='l')
  1.2643 +    if (g_utf8_get_char(t)=='l')
  1.2644  	t++;
  1.2645 -    while (*t=='x' && *t)
  1.2646 +    while (g_utf8_get_char(t)=='x' && *t)
  1.2647  	t++;
  1.2648 -    if (*t=='i' && t[1]=='x')
  1.2649 +    if (g_str_has_prefix(t,"ix"))
  1.2650  	t+=2;
  1.2651 -    if (*t=='i' && t[1]=='v')
  1.2652 +    if (g_str_has_prefix(t,"iv"))
  1.2653  	t+=2;
  1.2654 -    if (*t=='v')
  1.2655 +    if (g_utf8_get_char(t)=='v')
  1.2656  	t++;
  1.2657 -    while (*t=='i' && *t)
  1.2658 +    while (g_utf8_get_char(t)=='i' && *t)
  1.2659  	t++;
  1.2660      return !*t;
  1.2661  }
  1.2662  
  1.2663  /*
  1.2664 - * gcisalpha:
  1.2665 - *
  1.2666 - * A version of isalpha() that is somewhat lenient on 8-bit texts.
  1.2667 - * If we use the standard function, 8-bit accented characters break
  1.2668 - * words, so that tete with accented characters appears to be two words, "t"
  1.2669 - * and "t", with 8-bit characters between them. This causes over-reporting of
  1.2670 - * errors. gcisalpha() recognizes accented letters from the CP1252 (Windows)
  1.2671 - * and ISO-8859-1 character sets, which are the most common PG 8-bit types.
  1.2672 - */
  1.2673 -gboolean gcisalpha(unsigned char c)
  1.2674 -{
  1.2675 -    if (c>='a' && c<='z')
  1.2676 -	return TRUE;
  1.2677 -    if (c>='A' && c<='Z')
  1.2678 -	return TRUE;
  1.2679 -    if (c<140)
  1.2680 -	return FALSE;
  1.2681 -    if (c>=192 && c!=208 && c!=215 && c!=222 && c!=240 && c!=247 && c!=254)
  1.2682 -	return TRUE;
  1.2683 -    if (c==140 || c==142 || c==156 || c==158 || c==159)
  1.2684 -	return TRUE;
  1.2685 -    return FALSE;
  1.2686 -}
  1.2687 -
  1.2688 -/*
  1.2689 - * gcisdigit:
  1.2690 - *
  1.2691 - * A version of isdigit() that doesn't get confused in 8-bit texts.
  1.2692 - */
  1.2693 -gboolean gcisdigit(unsigned char c)
  1.2694 -{   
  1.2695 -    return c>='0' && c<='9';
  1.2696 -}
  1.2697 -
  1.2698 -/*
  1.2699 - * gcisletter:
  1.2700 - *
  1.2701 - * A version of isletter() that doesn't get confused in 8-bit texts.
  1.2702 - * NB: this is ISO-8891-1-specific.
  1.2703 - */
  1.2704 -gboolean gcisletter(unsigned char c)
  1.2705 -{   
  1.2706 -    return c>='A' && c<='Z' || c>='a' && c<='z' || c>=192;
  1.2707 -}
  1.2708 -
  1.2709 -/*
  1.2710   * postprocess_for_DP:
  1.2711   *
  1.2712   * Invoked with the -d switch from flgets().
  1.2713 @@ -3096,21 +3190,11 @@
  1.2714      if (!*theline) 
  1.2715  	return;
  1.2716      for (i=0;*DPmarkup[i];i++)
  1.2717 -    {
  1.2718 -	s=strstr(theline,DPmarkup[i]);
  1.2719 -	while (s)
  1.2720 +	while ((s=strstr(theline,DPmarkup[i])))
  1.2721  	{
  1.2722  	    t=s+strlen(DPmarkup[i]);
  1.2723 -	    while (*t)
  1.2724 -	    {
  1.2725 -		*s=*t;
  1.2726 -		t++;
  1.2727 -		s++;
  1.2728 -	    }
  1.2729 -	    *s=0;
  1.2730 -	    s=strstr(theline,DPmarkup[i]);
  1.2731 +	    memmove(s,t,strlen(t)+1);
  1.2732  	}
  1.2733 -    }
  1.2734  }
  1.2735  
  1.2736  /*
  1.2737 @@ -3124,9 +3208,8 @@
  1.2738   */
  1.2739  void postprocess_for_HTML(char *theline)
  1.2740  {
  1.2741 -    if (strchr(theline,'<') && strchr(theline,'>'))
  1.2742 -	while (losemarkup(theline))
  1.2743 -	    ;
  1.2744 +    while (losemarkup(theline))
  1.2745 +	;
  1.2746      while (loseentities(theline))
  1.2747  	;
  1.2748  }
  1.2749 @@ -3135,25 +3218,16 @@
  1.2750  {
  1.2751      char *s,*t;
  1.2752      int i;
  1.2753 -    if (!*theline) 
  1.2754 -	return NULL;
  1.2755 -    s=strstr(theline,"<");
  1.2756 -    t=strstr(theline,">");
  1.2757 +    s=strchr(theline,'<');
  1.2758 +    t=s?strchr(s,'>'):NULL;
  1.2759      if (!s || !t)
  1.2760  	return NULL;
  1.2761      for (i=0;*markup[i];i++)
  1.2762 -	if (!tagcomp(s+1,markup[i]))
  1.2763 +	if (tagcomp(g_utf8_next_char(s),markup[i]))
  1.2764  	{
  1.2765 -	    if (!t[1])
  1.2766 -	    {
  1.2767 -		*s=0;
  1.2768 -		return s;
  1.2769 -	    }
  1.2770 -	    else if (t>s)
  1.2771 -	    {
  1.2772 -		strcpy(s,t+1);
  1.2773 -		return s;
  1.2774 -	    }
  1.2775 +	    t=g_utf8_next_char(t);
  1.2776 +	    memmove(s,t,strlen(t)+1);
  1.2777 +	    return s;
  1.2778  	}
  1.2779      /* It's an unrecognized <xxx>. */
  1.2780      return NULL;
  1.2781 @@ -3170,13 +3244,10 @@
  1.2782  	s=strstr(theline,entities[i].htmlent);
  1.2783  	if (s)
  1.2784  	{
  1.2785 -	    t=malloc((size_t)strlen(s));
  1.2786 -	    if (!t)
  1.2787 -		return NULL;
  1.2788 -	    strcpy(t,s+strlen(entities[i].htmlent));
  1.2789 +	    t=g_strdup(s+strlen(entities[i].htmlent));
  1.2790  	    strcpy(s,entities[i].textent);
  1.2791  	    strcat(s,t);
  1.2792 -	    free(t);
  1.2793 +	    g_free(t);
  1.2794  	    return theline;
  1.2795  	}
  1.2796      }
  1.2797 @@ -3185,34 +3256,29 @@
  1.2798  	s=strstr(theline,entities[i].htmlnum);
  1.2799  	if (s)
  1.2800  	{
  1.2801 -	    t=malloc((size_t)strlen(s));
  1.2802 -	    if (!t)
  1.2803 -		return NULL;
  1.2804 -	    strcpy(t,s+strlen(entities[i].htmlnum));
  1.2805 +	    t=g_strdup(s+strlen(entities[i].htmlnum));
  1.2806  	    strcpy(s,entities[i].textent);
  1.2807  	    strcat(s,t);
  1.2808 -	    free(t);
  1.2809 +	    g_free(t);
  1.2810  	    return theline;
  1.2811  	}
  1.2812      }
  1.2813      return NULL;
  1.2814  }
  1.2815  
  1.2816 -int tagcomp(const char *strin,const char *basetag)
  1.2817 +gboolean tagcomp(const char *strin,const char *basetag)
  1.2818  {
  1.2819 -    const char *s,*t;
  1.2820 -    s=basetag;
  1.2821 -    t=strin;
  1.2822 -    if (*t=='/')
  1.2823 -	t++; /* ignore a slash */
  1.2824 -    while (*s && *t)
  1.2825 -    {
  1.2826 -	if (tolower(*s)!=tolower(*t))
  1.2827 -	    return 1;
  1.2828 -	s++;
  1.2829 -	t++;
  1.2830 -    }
  1.2831 -    return 0;
  1.2832 +    gboolean retval;
  1.2833 +    gchar *s,*t;
  1.2834 +    if (g_utf8_get_char(strin)=='/')
  1.2835 +	t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
  1.2836 +    else
  1.2837 +	t=g_utf8_casefold(strin,-1);
  1.2838 +    s=g_utf8_casefold(basetag,-1);
  1.2839 +    retval=g_str_has_prefix(t,s);
  1.2840 +    g_free(s);
  1.2841 +    g_free(t);
  1.2842 +    return retval;
  1.2843  }
  1.2844  
  1.2845  void proghelp(GOptionContext *context)