bookloupe/bookloupe.c
changeset 192 1aeda7fe17ca
parent 191 189183b37598
parent 185 a6d93c9932ac
child 193 7fdf168fb748
     1.1 --- a/bookloupe/bookloupe.c	Mon Oct 21 23:36:40 2013 +0100
     1.2 +++ b/bookloupe/bookloupe.c	Mon Oct 21 23:39:54 2013 +0100
     1.3 @@ -32,6 +32,9 @@
     1.4  #include "pending.h"
     1.5  #include "HTMLentities.h"
     1.6  
     1.7 +gchar *charset;		/* Or NULL for auto (ISO_8859-1/ASCII or UNICODE) */
     1.8 +GIConv charset_validator=(GIConv)-1;
     1.9 +
    1.10  gchar *prevline;
    1.11  
    1.12  /* Common typos. */
    1.13 @@ -127,6 +130,7 @@
    1.14  }; 
    1.15  
    1.16  gboolean pswit[SWITNO];  /* program switches */
    1.17 +gchar *opt_charset;
    1.18  
    1.19  gboolean typo_compat,paranoid_compat;
    1.20  
    1.21 @@ -198,6 +202,8 @@
    1.22      { "no-verbose", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
    1.23        G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
    1.24        "Switch off verbose mode", NULL },
    1.25 +    { "charset", 0, 0, G_OPTION_ARG_STRING, &opt_charset,
    1.26 +      "Set of characters valid for this ebook", "NAME" },
    1.27      { NULL }
    1.28  };
    1.29  
    1.30 @@ -245,7 +251,7 @@
    1.31  
    1.32  gboolean mixdigit(const char *);
    1.33  gchar *getaword(const char **);
    1.34 -char *flgets(char **,long,gboolean);
    1.35 +char *flgets(char **,long);
    1.36  void postprocess_for_HTML(char *);
    1.37  char *linehasmarkup(char *);
    1.38  char *losemarkup(char *);
    1.39 @@ -423,6 +429,49 @@
    1.40  	g_free(path);
    1.41  }
    1.42  
    1.43 +/*
    1.44 + * set_charset:
    1.45 + *
    1.46 + * Select the set of characters valid for this ebook. A NULL name or
    1.47 + * "auto" clears charset so that it is auto-detected. Otherwise charset
    1.48 + * is set to name (UTF-8 for any UNICODE alias) and charset_validator to
    1.49 + * a converter from UTF-8 to it ((GIConv)-1 for UTF-8: none is needed).
    1.50 + *
    1.51 + * Returns: FALSE, with err set and charset cleared, if name is unknown.
    1.52 + */
    1.53 +gboolean set_charset(const char *name,GError **err)
    1.54 +{
    1.55 +    /* The various UNICODE encodings all share the same character set. */
    1.56 +    static const char *const unicode_aliases[]={ "UCS-2", "UCS-2BE",
    1.57 +      "UCS-2LE", "UCS-4", "UCS-4BE", "UCS-4LE", "UCS2", "UCS4", "UNICODE",
    1.58 +      "UNICODEBIG", "UNICODELITTLE", "UTF-7", "UTF-8", "UTF-16", "UTF-16BE",
    1.59 +      "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE", "UTF7", "UTF8", "UTF16",
    1.60 +      "UTF16BE", "UTF16LE", "UTF32", "UTF32BE", "UTF32LE" };
    1.61 +    gsize i;
    1.62 +    g_free(charset);
    1.63 +    charset=NULL;
    1.64 +    if (charset_validator!=(GIConv)-1)
    1.65 +	g_iconv_close(charset_validator);
    1.66 +    charset_validator=(GIConv)-1;
    1.67 +    if (!name || !g_ascii_strcasecmp(name,"auto"))
    1.68 +	return TRUE;
    1.69 +    for(i=0;i<G_N_ELEMENTS(unicode_aliases);i++)
    1.70 +	if (!g_ascii_strcasecmp(name,unicode_aliases[i]))
    1.71 +	{
    1.72 +	    charset=g_strdup("UTF-8");
    1.73 +	    return TRUE;
    1.74 +	}
    1.75 +    charset_validator=g_iconv_open(name,"UTF-8");
    1.76 +    if (charset_validator==(GIConv)-1)
    1.77 +    {
    1.78 +	g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_NO_CONVERSION,
    1.79 +	  "Unknown character set \"%s\"",name);
    1.80 +	return FALSE;
    1.81 +    }
    1.82 +    charset=g_strdup(name);
    1.83 +    return TRUE;
    1.84 +}
    1.85 +
    1.86  void parse_options(int *argc,char ***argv)
    1.87  {
    1.88      GError *err=NULL;
    1.89 @@ -475,11 +524,18 @@
    1.90  	pswit[USERTYPO_SWITCH]=FALSE;
    1.91  	pswit[DP_SWITCH]=FALSE;
    1.92      }
    1.93 +    if (opt_charset && !set_charset(opt_charset,&err))
    1.94 +    {
    1.95 +	g_printerr("%s\n",err->message);
    1.96 +	exit(1);
    1.97 +    }
    1.98      if (pswit[DUMP_CONFIG_SWITCH])
    1.99      {
   1.100  	dump_config();
   1.101  	exit(0);
   1.102      }
   1.103 +    g_free(opt_charset);
   1.104 +    opt_charset=NULL;
   1.105      if (pswit[OVERVIEW_SWITCH])
   1.106  	/* just print summary; don't echo */
   1.107  	pswit[ECHO_SWITCH]=FALSE;
   1.108 @@ -542,7 +598,11 @@
   1.109  	exit(1);
   1.110      }
   1.111      if (g_utf8_validate(contents,len,NULL))
   1.112 +    {
   1.113  	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
   1.114 +	if (!charset)
   1.115 +	    (void)set_charset("UNICODE",NULL);
   1.116 +    }
   1.117      else
   1.118  	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
   1.119      g_free(contents);
   1.120 @@ -674,6 +734,7 @@
   1.121      g_free(running_from);
   1.122      if (usertypo)
   1.123  	g_tree_unref(usertypo);
   1.124 +    set_charset(NULL,NULL);
   1.125      if (config)
   1.126  	g_key_file_free(config);
   1.127      return 0;
   1.128 @@ -735,20 +796,11 @@
   1.129      gchar *inword;
   1.130      QuoteClass qc;
   1.131      lines=g_strsplit(etext,"\n",0);
   1.132 -    if (lines[0])
   1.133 -	/* If there's at least one line, we might have UNIX-style terminators */
   1.134 -	results.unix_lineends=TRUE;
   1.135      for (j=0;lines[j];j++)
   1.136      {
   1.137  	lbytes=strlen(lines[j]);
   1.138 -	if (lbytes>0 && lines[j][lbytes-1]=='\r')
   1.139 -	{
   1.140 -	    results.unix_lineends=FALSE;
   1.141 -	    do
   1.142 -	    {
   1.143 -		lines[j][--lbytes]='\0';
   1.144 -	    } while (lbytes>0 && lines[j][lbytes-1]=='\r');
   1.145 -	}
   1.146 +	while (lbytes>0 && lines[j][lbytes-1]=='\r')
   1.147 +	    lines[j][--lbytes]='\0';
   1.148  	llen=g_utf8_strlen(lines[j],lbytes);
   1.149  	linecnt++;
   1.150  	if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
   1.151 @@ -890,13 +942,6 @@
   1.152  struct warnings *report_first_pass(struct first_pass_results *results)
   1.153  {
   1.154      static struct warnings warnings={0};
   1.155 -    warnings.nocr=1;
   1.156 -    if (results->unix_lineends)
   1.157 -    {
   1.158 -	warnings.nocr=0;
   1.159 -	g_print("   --> No lines in this file have a CR. Not reporting them. "
   1.160 -	  "Project Gutenberg requires that all lineends be CR-LF.\n");
   1.161 -    }
   1.162      if (cnt_spacend>0)
   1.163  	g_print("   --> %ld lines in this file have white space at end\n",
   1.164  	  cnt_spacend);
   1.165 @@ -1004,25 +1049,32 @@
   1.166  	  "Not reporting them.\n",
   1.167  	  results->spacedash+results->emdash.non_PG_space);
   1.168      }
   1.169 -    /* If more than a quarter of characters are hi-bit, bug out. */
   1.170 -    warnings.bin=1;
   1.171 -    if (results->binlen*4>results->totlen)
   1.172 +    if (charset)
   1.173 +	warnings.bin=0;
   1.174 +    else
   1.175      {
   1.176 -	g_print("   --> This file does not appear to be ASCII. "
   1.177 -	  "Terminating. Best of luck with it!\n");
   1.178 -	exit(1);
   1.179 -    }
   1.180 -    if (results->alphalen*4<results->totlen)
   1.181 -    {
   1.182 -	g_print("   --> This file does not appear to be text. "
   1.183 -	  "Terminating. Best of luck with it!\n");
   1.184 -	exit(1);
   1.185 -    }
   1.186 -    if (results->binlen*100>results->totlen || results->binlen>100)
   1.187 -    {
   1.188 -	g_print("   --> There are a lot of foreign letters here. "
   1.189 -	  "Not reporting them.\n");
   1.190 -	warnings.bin=0;
   1.191 +	/* Charset ISO_8859-1/ASCII checks for compatibility with gutcheck */
   1.192 +	warnings.bin=1;
   1.193 +	/* If more than a quarter of characters are hi-bit, bug out. */
   1.194 +	if (results->binlen*4>results->totlen)
   1.195 +	{
   1.196 +	    g_print("   --> This file does not appear to be ASCII. "
   1.197 +	      "Terminating. Best of luck with it!\n");
   1.198 +	    exit(1);
   1.199 +	}
   1.200 +	if (results->alphalen*4<results->totlen)
   1.201 +	{
   1.202 +	    g_print("   --> This file does not appear to be text. "
   1.203 +	      "Terminating. Best of luck with it!\n");
   1.204 +	    exit(1);
   1.205 +	}
   1.206 +	if (results->binlen*100>results->totlen || results->binlen>100)
   1.207 +	{
   1.208 +	    g_print("   --> There are a lot of foreign letters here. "
   1.209 +	      "Not reporting them.\n");
   1.210 +	    if (!pswit[VERBOSE_SWITCH])
   1.211 +		warnings.bin=0;
   1.212 +	}
   1.213      }
   1.214      warnings.isDutch=FALSE;
   1.215      if (results->Dutchcount>50)
   1.216 @@ -1050,7 +1102,6 @@
   1.217      g_print("\n");
   1.218      if (pswit[VERBOSE_SWITCH])
   1.219      {
   1.220 -	warnings.bin=1;
   1.221  	warnings.shortline=1;
   1.222  	warnings.dotcomma=1;
   1.223  	warnings.longline=1;
   1.224 @@ -1245,14 +1296,17 @@
   1.225    gboolean isemptyline)
   1.226  {
   1.227      /* Don't repeat multiple warnings on one line. */
   1.228 -    gboolean eNon_A=FALSE,eTab=FALSE,eTilde=FALSE;
   1.229 +    gboolean eInvalidChar=FALSE,eTab=FALSE,eTilde=FALSE;
   1.230      gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
   1.231      const char *s;
   1.232      gunichar c;
   1.233 +    gsize nb;
   1.234 +    gchar *t;
   1.235      for (s=aline;*s;s=g_utf8_next_char(s))
   1.236      {
   1.237  	c=g_utf8_get_char(s);
   1.238 -	if (!eNon_A && (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
   1.239 +	if (warnings->bin && !eInvalidChar &&
   1.240 +	  (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
   1.241  	{
   1.242  	    if (pswit[ECHO_SWITCH])
   1.243  		g_print("\n%s\n",aline);
   1.244 @@ -1267,7 +1321,57 @@
   1.245  		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
   1.246  	    else
   1.247  		cnt_bin++;
   1.248 -	    eNon_A=TRUE;
   1.249 +	    eInvalidChar=TRUE;
   1.250 +	}
   1.251 +	if (!eInvalidChar && charset)
   1.252 +	{
   1.253 +	    if (charset_validator==(GIConv)-1)
   1.254 +	    {
   1.255 +		if (!g_unichar_isdefined(c))
   1.256 +		{
   1.257 +		    if (pswit[ECHO_SWITCH])
   1.258 +			g_print("\n%s\n",aline);
   1.259 +		    if (!pswit[OVERVIEW_SWITCH])
   1.260 +			g_print("    Line %ld column %ld - Unassigned UNICODE "
   1.261 +			  "code point U+%04" G_GINT32_MODIFIER "X\n",
   1.262 +			  linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
   1.263 +		    else
   1.264 +			cnt_bin++;
   1.265 +		    eInvalidChar=TRUE;
   1.266 +		}
   1.267 +		else if (c>=0xE000 && c<=0xF8FF || c>=0xF0000 && c<=0xFFFFD ||
   1.268 +		  c>=0x100000 && c<=0x10FFFD)	/* BMP PUA, planes 15 and 16 */
   1.269 +		{
   1.270 +		    if (pswit[ECHO_SWITCH])
   1.271 +			g_print("\n%s\n",aline);
   1.272 +		    if (!pswit[OVERVIEW_SWITCH])
   1.273 +			g_print("    Line %ld column %ld - Private Use "
   1.274 +			  "character U+%04" G_GINT32_MODIFIER "X\n",
   1.275 +			  linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
   1.276 +		    else
   1.277 +			cnt_bin++;
   1.278 +		    eInvalidChar=TRUE;
   1.279 +		}
   1.280 +	    }
   1.281 +	    else
   1.282 +	    {
   1.283 +		t=g_convert_with_iconv(s,g_utf8_next_char(s)-s,
   1.284 +		  charset_validator,NULL,&nb,NULL);
   1.285 +		if (t)
   1.286 +		    g_free(t);
   1.287 +		else
   1.288 +		{
   1.289 +		    if (pswit[ECHO_SWITCH])
   1.290 +			g_print("\n%s\n",aline);
   1.291 +		    if (!pswit[OVERVIEW_SWITCH])
   1.292 +			g_print("    Line %ld column %ld - Non-%s "
   1.293 +			  "character %u\n",linecnt,
   1.294 +			  g_utf8_pointer_to_offset(aline,s)+1,charset,c);
   1.295 +		    else
   1.296 +			cnt_bin++;
   1.297 +		    eInvalidChar=TRUE;
   1.298 +		}
   1.299 +	    }
   1.300  	}
   1.301  	if (!eTab && c==CHAR_TAB)
   1.302  	{
   1.303 @@ -2885,7 +2989,7 @@
   1.304       */
   1.305      linecnt=0;
   1.306      etext_ptr=etext;
   1.307 -    while ((aline=flgets(&etext_ptr,linecnt+1,warnings->nocr)))
   1.308 +    while ((aline=flgets(&etext_ptr,linecnt+1)))
   1.309      {
   1.310  	linecnt++;
   1.311  	if (linecnt==1)
   1.312 @@ -2955,8 +3059,7 @@
   1.313  	if (s>=aline && g_utf8_get_char(s)=='-')
   1.314  	    enddash=TRUE;
   1.315  	check_for_control_characters(aline);
   1.316 -	if (warnings->bin)
   1.317 -	    check_for_odd_characters(aline,warnings,isemptyline);
   1.318 +	check_for_odd_characters(aline,warnings,isemptyline);
   1.319  	if (warnings->longline)
   1.320  	    check_for_long_line(aline);
   1.321  	if (warnings->shortline)
   1.322 @@ -3031,7 +3134,7 @@
   1.323   *
   1.324   * Returns: a pointer to the line.
   1.325   */
   1.326 -char *flgets(char **etext,long lcnt,gboolean warn_nocr)
   1.327 +char *flgets(char **etext,long lcnt)
   1.328  {
   1.329      gunichar c;
   1.330      gboolean isCR=FALSE;
   1.331 @@ -3070,7 +3173,7 @@
   1.332  	    else
   1.333  	    {
   1.334  		/* Error - a LF without a preceding CR */
   1.335 -		if (pswit[LINE_END_SWITCH] && warn_nocr)
   1.336 +		if (pswit[LINE_END_SWITCH])
   1.337  		{
   1.338  		    if (pswit[ECHO_SWITCH])
   1.339  		    {