1.1 --- a/bookloupe/bookloupe.c Mon Sep 23 21:18:27 2013 +0100
1.2 +++ b/bookloupe/bookloupe.c Sat Sep 28 17:06:27 2013 +0100
1.3 @@ -32,6 +32,9 @@
1.4 #include "pending.h"
1.5 #include "HTMLentities.h"
1.6
1.7 +gchar *charset; /* Or NULL for auto (ISO_8859-1/ASCII or UNICODE) */
1.8 +GIConv charset_validator=(GIConv)-1;
1.9 +
1.10 gchar *prevline;
1.11
1.12 /* Common typos. */
1.13 @@ -127,6 +130,7 @@
1.14 };
1.15
1.16 gboolean pswit[SWITNO]; /* program switches */
1.17 +gchar *opt_charset;
1.18
1.19 static GOptionEntry options[]={
1.20 { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH,
1.21 @@ -157,6 +161,8 @@
1.22 "Defaults for use on www upload", NULL },
1.23 { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
1.24 "Verbose - list everything", NULL },
1.25 + { "charset", 0, 0, G_OPTION_ARG_STRING, &opt_charset,
1.26 + "Set of characters valid for this ebook", "NAME" },
1.27 { NULL }
1.28 };
1.29
1.30 @@ -201,6 +207,49 @@
1.31 UINT saved_cp;
1.32 #endif
1.33
1.34 +/* Select the character set to validate text against. A NULL or "auto" name
1.35 + * resets to auto-detection; every UNICODE encoding name becomes "UTF-8".
1.36 + * Returns FALSE with err set if iconv does not know the name, else TRUE. */
1.37 +gboolean set_charset(const char *name,GError **err)
1.38 +{
1.39 +    /* The various UNICODE encodings all share the same character set. */
1.40 +    static const char *const unicode_aliases[]={ "UCS-2", "UCS-2BE",
1.41 +      "UCS-2LE", "UCS-4", "UCS-4BE", "UCS-4LE", "UCS2", "UCS4", "UNICODE",
1.42 +      "UNICODEBIG", "UNICODELITTLE", "UTF-7", "UTF-8", "UTF-16", "UTF-16BE",
1.43 +      "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE", "UTF7", "UTF8", "UTF16",
1.44 +      "UTF16BE", "UTF16LE", "UTF32", "UTF32BE", "UTF32LE" };
1.45 +    guint i;
1.46 +    /* Release the previous setting; only a real descriptor may be closed. */
1.47 +    g_free(charset);
1.48 +    if (charset_validator!=(GIConv)-1)
1.49 +        g_iconv_close(charset_validator);
1.50 +    charset=NULL;
1.51 +    charset_validator=(GIConv)-1;
1.52 +    if (!name || !g_ascii_strcasecmp(name,"auto"))
1.53 +        return TRUE;
1.54 +    for(i=0;i<G_N_ELEMENTS(unicode_aliases);i++)
1.55 +        if (!g_ascii_strcasecmp(name,unicode_aliases[i]))
1.56 +        {
1.57 +            name="UTF-8";
1.58 +            break;
1.59 +        }
1.60 +    charset=g_strdup(name);
1.61 +    /* UNICODE is checked without iconv, so no validator is opened for it. */
1.62 +    if (!strcmp(charset,"UTF-8"))
1.63 +        return TRUE;
1.64 +    charset_validator=g_iconv_open(charset,"UTF-8");
1.65 +    if (charset_validator==(GIConv)-1)
1.66 +    {
1.67 +        g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_NO_CONVERSION,
1.68 +          "Unknown character set \"%s\"",charset);
1.69 +        /* Fall back to auto rather than keep an unusable charset name. */
1.70 +        g_free(charset);
1.71 +        charset=NULL;
1.72 +        return FALSE;
1.73 +    }
1.74 +    return TRUE;
1.75 +}
1.76 +
1.77 void parse_options(int *argc,char ***argv)
1.78 {
1.79 GError *err=NULL;
1.80 @@ -247,6 +296,13 @@
1.81 pswit[USERTYPO_SWITCH]=FALSE;
1.82 pswit[DP_SWITCH]=FALSE;
1.83 }
1.84 + if (opt_charset && !set_charset(opt_charset,&err))
1.85 + {
1.86 + g_printerr("%s\n",err->message);
1.87 + exit(1);
1.88 + }
1.89 + g_free(opt_charset);
1.90 + opt_charset=NULL;
1.91 if (*argc<2)
1.92 {
1.93 proghelp(context);
1.94 @@ -306,7 +362,11 @@
1.95 exit(1);
1.96 }
1.97 if (g_utf8_validate(contents,len,NULL))
1.98 + {
1.99 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
1.100 + if (!charset)
1.101 + (void)set_charset("UNICODE",NULL);
1.102 + }
1.103 else
1.104 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
1.105 g_free(contents);
1.106 @@ -431,6 +491,7 @@
1.107 g_free(running_from);
1.108 if (usertypo)
1.109 g_tree_unref(usertypo);
1.110 + set_charset(NULL,NULL);
1.111 return 0;
1.112 }
1.113
1.114 @@ -703,25 +764,32 @@
1.115 "Not reporting them.\n",
1.116 results->spacedash+results->non_PG_space_emdash);
1.117 }
1.118 - /* If more than a quarter of characters are hi-bit, bug out. */
1.119 - warnings.bin=1;
1.120 - if (results->binlen*4>results->totlen)
1.121 + if (charset)
1.122 + warnings.bin=0;
1.123 + else
1.124 {
1.125 - g_print(" --> This file does not appear to be ASCII. "
1.126 - "Terminating. Best of luck with it!\n");
1.127 - exit(1);
1.128 - }
1.129 - if (results->alphalen*4<results->totlen)
1.130 - {
1.131 - g_print(" --> This file does not appear to be text. "
1.132 - "Terminating. Best of luck with it!\n");
1.133 - exit(1);
1.134 - }
1.135 - if (results->binlen*100>results->totlen || results->binlen>100)
1.136 - {
1.137 - g_print(" --> There are a lot of foreign letters here. "
1.138 - "Not reporting them.\n");
1.139 - warnings.bin=0;
1.140 + /* Charset ISO_8859-1/ASCII checks for compatibility with gutcheck */
1.141 + warnings.bin=1;
1.142 + /* If more than a quarter of characters are hi-bit, bug out. */
1.143 + if (results->binlen*4>results->totlen)
1.144 + {
1.145 + g_print(" --> This file does not appear to be ASCII. "
1.146 + "Terminating. Best of luck with it!\n");
1.147 + exit(1);
1.148 + }
1.149 + if (results->alphalen*4<results->totlen)
1.150 + {
1.151 + g_print(" --> This file does not appear to be text. "
1.152 + "Terminating. Best of luck with it!\n");
1.153 + exit(1);
1.154 + }
1.155 + if (results->binlen*100>results->totlen || results->binlen>100)
1.156 + {
1.157 + g_print(" --> There are a lot of foreign letters here. "
1.158 + "Not reporting them.\n");
1.159 + if (!pswit[VERBOSE_SWITCH])
1.160 + warnings.bin=0;
1.161 + }
1.162 }
1.163 warnings.isDutch=FALSE;
1.164 if (results->Dutchcount>50)
1.165 @@ -749,7 +817,6 @@
1.166 g_print("\n");
1.167 if (pswit[VERBOSE_SWITCH])
1.168 {
1.169 - warnings.bin=1;
1.170 warnings.shortline=1;
1.171 warnings.dotcomma=1;
1.172 warnings.longline=1;
1.173 @@ -932,14 +999,17 @@
1.174 gboolean isemptyline)
1.175 {
1.176 /* Don't repeat multiple warnings on one line. */
1.177 - gboolean eNon_A=FALSE,eTab=FALSE,eTilde=FALSE;
1.178 + gboolean eInvalidChar=FALSE,eTab=FALSE,eTilde=FALSE;
1.179 gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
1.180 const char *s;
1.181 gunichar c;
1.182 + gsize nb;
1.183 + gchar *t;
1.184 for (s=aline;*s;s=g_utf8_next_char(s))
1.185 {
1.186 c=g_utf8_get_char(s);
1.187 - if (!eNon_A && (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
1.188 + if (warnings->bin && !eInvalidChar &&
1.189 + (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
1.190 {
1.191 if (pswit[ECHO_SWITCH])
1.192 g_print("\n%s\n",aline);
1.193 @@ -954,7 +1024,57 @@
1.194 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1.195 else
1.196 cnt_bin++;
1.197 - eNon_A=TRUE;
1.198 + eInvalidChar=TRUE;
1.199 + }
1.200 + if (!eInvalidChar && charset) /* explicit charset: c must belong to it */
1.201 + {
1.202 +     if (charset_validator==(GIConv)-1) /* UNICODE: no iconv validator */
1.203 +     {
1.204 +         if (!g_unichar_isdefined(c))
1.205 +         {
1.206 +             if (pswit[ECHO_SWITCH])
1.207 +                 g_print("\n%s\n",aline);
1.208 +             if (!pswit[OVERVIEW_SWITCH])
1.209 +                 g_print(" Line %ld column %ld - Unassigned UNICODE "
1.210 +                   "code point U+%04" G_GINT32_MODIFIER "X\n",
1.211 +                   linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1.212 +             else
1.213 +                 cnt_bin++;
1.214 +             eInvalidChar=TRUE;
1.215 +         }
1.216 +         else if (c>=0xE000 && c<=0xF8FF || c>=0xF0000 && c<=0xFFFFD || /* PUA, SPUA-A */
1.217 +           c>=0x100000 && c<=0x10FFFD) /* SPUA-B starts at hex 0x100000 */
1.218 +         {
1.219 +             if (pswit[ECHO_SWITCH])
1.220 +                 g_print("\n%s\n",aline);
1.221 +             if (!pswit[OVERVIEW_SWITCH])
1.222 +                 g_print(" Line %ld column %ld - Private Use "
1.223 +                   "character U+%04" G_GINT32_MODIFIER "X\n",
1.224 +                   linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1.225 +             else
1.226 +                 cnt_bin++;
1.227 +             eInvalidChar=TRUE;
1.228 +         }
1.229 +     }
1.230 +     else /* other charsets: c is accepted only if iconv can convert it */
1.231 +     {
1.232 +         t=g_convert_with_iconv(s,g_utf8_next_char(s)-s,
1.233 +           charset_validator,NULL,&nb,NULL);
1.234 +         if (t)
1.235 +             g_free(t);
1.236 +         else
1.237 +         {
1.238 +             if (pswit[ECHO_SWITCH])
1.239 +                 g_print("\n%s\n",aline);
1.240 +             if (!pswit[OVERVIEW_SWITCH])
1.241 +                 g_print(" Line %ld column %ld - Non-%s "
1.242 +                   "character %u\n",linecnt,
1.243 +                   g_utf8_pointer_to_offset(aline,s)+1,charset,c);
1.244 +             else
1.245 +                 cnt_bin++;
1.246 +             eInvalidChar=TRUE;
1.247 +         }
1.248 +     }
1.249 + }
1.250 if (!eTab && c==CHAR_TAB)
1.251 {
1.252 @@ -2592,8 +2712,7 @@
1.253 if (s>=aline && g_utf8_get_char(s)=='-')
1.254 enddash=TRUE;
1.255 check_for_control_characters(aline);
1.256 - if (warnings->bin)
1.257 - check_for_odd_characters(aline,warnings,isemptyline);
1.258 + check_for_odd_characters(aline,warnings,isemptyline);
1.259 if (warnings->longline)
1.260 check_for_long_line(aline);
1.261 if (warnings->shortline)