1.1 --- a/bookloupe/bookloupe.c Fri Oct 25 11:15:18 2013 +0100
1.2 +++ b/bookloupe/bookloupe.c Sun Oct 27 17:01:47 2013 +0000
1.3 @@ -32,6 +32,9 @@
1.4 #include "pending.h"
1.5 #include "HTMLentities.h"
1.6
1.7 +gchar *charset; /* Or NULL for auto (ISO_8859-1/ASCII or UNICODE) */
1.8 +GIConv charset_validator=(GIConv)-1; /* UTF-8 -> charset; -1 if auto/UTF-8 */
1.9 +
1.10 gchar *prevline;
1.11
1.12 /* Common typos. */
1.13 @@ -127,6 +130,7 @@
1.14 };
1.15
1.16 gboolean pswit[SWITNO]; /* program switches */
1.17 +gchar *opt_charset; /* --charset option / config value; NULL for auto */
1.18
1.19 gboolean typo_compat,paranoid_compat;
1.20
1.21 @@ -198,6 +202,8 @@
1.22 { "no-verbose", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
1.23 G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
1.24 "Switch off verbose mode", NULL },
1.25 + { "charset", 0, 0, G_OPTION_ARG_STRING, &opt_charset,
1.26 + "Set of characters valid for this ebook", "NAME" },
1.27 { NULL }
1.28 };
1.29
1.30 @@ -262,11 +268,55 @@
1.31 UINT saved_cp;
1.32 #endif
1.33
1.34 +/*
1.35 + * Select the character set the ebook may use.  name is an iconv name, or
1.36 + * NULL/"auto" to clear the setting; any UNICODE encoding is stored as
1.37 + * "UTF-8" with no validator.  Returns FALSE (setting err and leaving the
1.38 + * selection cleared) if iconv does not know the name.
1.39 + */
1.40 +gboolean set_charset(const char *name,GError **err)
1.41 +{
1.42 +    /* The various UNICODE encodings all share the same character set. */
1.43 +    static const char *const unicode_aliases[]={ "UCS-2", "UCS-2BE",
1.44 +      "UCS-2LE", "UCS-4", "UCS-4BE", "UCS-4LE", "UCS2", "UCS4", "UNICODE",
1.45 +      "UNICODEBIG", "UNICODELITTLE", "UTF-7", "UTF-8", "UTF-16", "UTF-16BE",
1.46 +      "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE", "UTF7", "UTF8", "UTF16",
1.47 +      "UTF16BE", "UTF16LE", "UTF32", "UTF32BE", "UTF32LE" };
1.48 +    gsize i;
1.49 +    /* Copy before freeing: name may alias the current charset string. */
1.50 +    gchar *wanted=(name && g_ascii_strcasecmp(name,"auto"))?g_strdup(name):NULL;
1.51 +    g_free(charset);
1.52 +    charset=NULL;
1.53 +    if (charset_validator!=(GIConv)-1)
1.54 +        g_iconv_close(charset_validator);
1.55 +    charset_validator=(GIConv)-1;
1.56 +    if (!wanted)
1.57 +        return TRUE;
1.58 +    for(i=0;i<G_N_ELEMENTS(unicode_aliases);i++)
1.59 +        if (!g_ascii_strcasecmp(wanted,unicode_aliases[i]))
1.60 +        {
1.61 +            g_free(wanted);
1.62 +            charset=g_strdup("UTF-8");
1.63 +            return TRUE;
1.64 +        }
1.65 +    charset_validator=g_iconv_open(wanted,"UTF-8");
1.66 +    if (charset_validator==(GIConv)-1)
1.67 +    {
1.68 +        g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_NO_CONVERSION,
1.69 +          "Unknown character set \"%s\"",wanted);
1.70 +        g_free(wanted);
1.71 +        return FALSE;
1.72 +    }
1.73 +    charset=wanted;
1.74 +    return TRUE;
1.75 +}
1.76 +
1.77 GKeyFile *config;
1.78
1.79 void config_file_update(GKeyFile *kf)
1.80 {
1.81 int i;
1.82 + const char *s;
1.83 gboolean sw;
1.84 for(i=0;options[i].long_name;i++)
1.85 {
1.86 @@ -279,6 +329,13 @@
1.87 sw=!sw;
1.88 g_key_file_set_boolean(kf,"options",options[i].long_name,sw);
1.89 }
1.90 + else if (options[i].arg==G_OPTION_ARG_STRING)
1.91 + {
1.92 + s=*(gchar **)options[i].arg_data;
1.93 + if (!s)
1.94 + s="auto";
1.95 + g_key_file_set_string(kf,"options",options[i].long_name,s);
1.96 + }
1.97 else
1.98 g_assert_not_reached();
1.99 }
1.100 @@ -375,7 +432,7 @@
1.101 void parse_config_file(void)
1.102 {
1.103 int i,j;
1.104 - gchar *path;
1.105 + gchar *path,*s;
1.106 gchar **keys;
1.107 gboolean sw;
1.108 GError *err=NULL;
1.109 @@ -404,9 +461,35 @@
1.110 path,keys[i],err->message);
1.111 g_clear_error(&err);
1.112 }
1.113 - if (options[j].flags&G_OPTION_FLAG_REVERSE)
1.114 - sw=!sw;
1.115 - *(gboolean *)options[j].arg_data=sw;
1.116 + else
1.117 + {
1.118 + if (options[j].flags&G_OPTION_FLAG_REVERSE)
1.119 + sw=!sw;
1.120 + *(gboolean *)options[j].arg_data=sw;
1.121 + }
1.122 + break;
1.123 + }
1.124 + else if (options[j].arg==G_OPTION_ARG_STRING)
1.125 + {
1.126 + s=g_key_file_get_string(config,"options",keys[i],
1.127 + &err);
1.128 + if (err)
1.129 + {
1.130 + g_printerr("Bookloupe: %s: options.%s: %s\n",
1.131 + path,keys[i],err->message);
1.132 + g_clear_error(&err);
1.133 + }
1.134 + else
1.135 + {
1.136 + g_free(*(gchar **)options[j].arg_data);
1.137 + if (!g_strcmp0(s,"auto"))
1.138 + {
1.139 + *(gchar **)options[j].arg_data=NULL;
1.140 + g_free(s);
1.141 + }
1.142 + else
1.143 + *(gchar **)options[j].arg_data=s;
1.144 + }
1.145 break;
1.146 }
1.147 else
1.148 @@ -475,11 +558,18 @@
1.149 pswit[USERTYPO_SWITCH]=FALSE;
1.150 pswit[DP_SWITCH]=FALSE;
1.151 }
1.152 + if (opt_charset && !set_charset(opt_charset,&err))
1.153 + {
1.154 + g_printerr("%s\n",err->message);
1.155 + exit(1);
1.156 + }
1.157 if (pswit[DUMP_CONFIG_SWITCH])
1.158 {
1.159 dump_config();
1.160 exit(0);
1.161 }
1.162 + g_free(opt_charset);
1.163 + opt_charset=NULL;
1.164 if (pswit[OVERVIEW_SWITCH])
1.165 /* just print summary; don't echo */
1.166 pswit[ECHO_SWITCH]=FALSE;
1.167 @@ -542,7 +632,11 @@
1.168 exit(1);
1.169 }
1.170 if (g_utf8_validate(contents,len,NULL))
1.171 + {
1.172 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
1.173 + if (!charset)
1.174 + (void)set_charset("UNICODE",NULL);
1.175 + }
1.176 else
1.177 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
1.178 g_free(contents);
1.179 @@ -674,6 +768,7 @@
1.180 g_free(running_from);
1.181 if (usertypo)
1.182 g_tree_unref(usertypo);
1.183 + set_charset(NULL,NULL);
1.184 if (config)
1.185 g_key_file_free(config);
1.186 return 0;
1.187 @@ -1024,25 +1119,32 @@
1.188 "Not reporting them.\n",
1.189 results->spacedash+results->emdash.non_PG_space);
1.190 }
1.191 - /* If more than a quarter of characters are hi-bit, bug out. */
1.192 - warnings.bin=1;
1.193 - if (results->binlen*4>results->totlen)
1.194 + if (charset)
1.195 + warnings.bin=0;
1.196 + else
1.197 {
1.198 - g_print(" --> This file does not appear to be ASCII. "
1.199 - "Terminating. Best of luck with it!\n");
1.200 - exit(1);
1.201 - }
1.202 - if (results->alphalen*4<results->totlen)
1.203 - {
1.204 - g_print(" --> This file does not appear to be text. "
1.205 - "Terminating. Best of luck with it!\n");
1.206 - exit(1);
1.207 - }
1.208 - if (results->binlen*100>results->totlen || results->binlen>100)
1.209 - {
1.210 - g_print(" --> There are a lot of foreign letters here. "
1.211 - "Not reporting them.\n");
1.212 - warnings.bin=0;
1.213 + /* Charset ISO_8859-1/ASCII checks for compatibility with gutcheck */
1.214 + warnings.bin=1;
1.215 + /* If more than a quarter of characters are hi-bit, bug out. */
1.216 + if (results->binlen*4>results->totlen)
1.217 + {
1.218 + g_print(" --> This file does not appear to be ASCII. "
1.219 + "Terminating. Best of luck with it!\n");
1.220 + exit(1);
1.221 + }
1.222 + if (results->alphalen*4<results->totlen)
1.223 + {
1.224 + g_print(" --> This file does not appear to be text. "
1.225 + "Terminating. Best of luck with it!\n");
1.226 + exit(1);
1.227 + }
1.228 + if (results->binlen*100>results->totlen || results->binlen>100)
1.229 + {
1.230 + g_print(" --> There are a lot of foreign letters here. "
1.231 + "Not reporting them.\n");
1.232 + if (!pswit[VERBOSE_SWITCH])
1.233 + warnings.bin=0;
1.234 + }
1.235 }
1.236 warnings.isDutch=FALSE;
1.237 if (results->Dutchcount>50)
1.238 @@ -1070,7 +1172,6 @@
1.239 g_print("\n");
1.240 if (pswit[VERBOSE_SWITCH])
1.241 {
1.242 - warnings.bin=1;
1.243 warnings.shortline=1;
1.244 warnings.dotcomma=1;
1.245 warnings.longline=1;
1.246 @@ -1265,14 +1366,17 @@
1.247 gboolean isemptyline)
1.248 {
1.249 /* Don't repeat multiple warnings on one line. */
1.250 - gboolean eNon_A=FALSE,eTab=FALSE,eTilde=FALSE;
1.251 + gboolean eInvalidChar=FALSE,eTab=FALSE,eTilde=FALSE;
1.252 gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
1.253 const char *s;
1.254 gunichar c;
1.255 + gsize nb;
1.256 + gchar *t;
1.257 for (s=aline;*s;s=g_utf8_next_char(s))
1.258 {
1.259 c=g_utf8_get_char(s);
1.260 - if (!eNon_A && (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
1.261 + if (warnings->bin && !eInvalidChar &&
1.262 + (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
1.263 {
1.264 if (pswit[ECHO_SWITCH])
1.265 g_print("\n%s\n",aline);
1.266 @@ -1287,7 +1391,57 @@
1.267 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1.268 else
1.269 cnt_bin++;
1.270 - eNon_A=TRUE;
1.271 + eInvalidChar=TRUE;
1.272 + }
1.273 + if (!eInvalidChar && charset)
1.274 + {
1.275 + if (charset_validator==(GIConv)-1)
1.276 + {
1.277 + if (!g_unichar_isdefined(c))
1.278 + {
1.279 + if (pswit[ECHO_SWITCH])
1.280 + g_print("\n%s\n",aline);
1.281 + if (!pswit[OVERVIEW_SWITCH])
1.282 + g_print(" Line %ld column %ld - Unassigned UNICODE "
1.283 + "code point U+%04" G_GINT32_MODIFIER "X\n",
1.284 + linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1.285 + else
1.286 + cnt_bin++;
1.287 + eInvalidChar=TRUE;
1.288 + }
1.289 + else if (c>=0xE000 && c<=0xF8FF || c>=0xF0000 && c<=0xFFFFD ||
1.290 + c>=0x100000 && c<=0x10FFFD) /* BMP PUA, plane 15, plane 16 */
1.291 + {
1.292 + if (pswit[ECHO_SWITCH])
1.293 + g_print("\n%s\n",aline);
1.294 + if (!pswit[OVERVIEW_SWITCH])
1.295 + g_print(" Line %ld column %ld - Private Use "
1.296 + "character U+%04" G_GINT32_MODIFIER "X\n",
1.297 + linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1.298 + else
1.299 + cnt_bin++;
1.300 + eInvalidChar=TRUE;
1.301 + }
1.302 + }
1.303 + else
1.304 + {
1.305 + t=g_convert_with_iconv(s,g_utf8_next_char(s)-s,
1.306 + charset_validator,NULL,&nb,NULL);
1.307 + if (t)
1.308 + g_free(t);
1.309 + else
1.310 + {
1.311 + if (pswit[ECHO_SWITCH])
1.312 + g_print("\n%s\n",aline);
1.313 + if (!pswit[OVERVIEW_SWITCH])
1.314 + g_print(" Line %ld column %ld - Non-%s "
1.315 + "character %u\n",linecnt,
1.316 + g_utf8_pointer_to_offset(aline,s)+1,charset,c);
1.317 + else
1.318 + cnt_bin++;
1.319 + eInvalidChar=TRUE;
1.320 + }
1.321 + }
1.322 }
1.323 if (!eTab && c==CHAR_TAB)
1.324 {
1.325 @@ -2975,8 +3129,7 @@
1.326 if (s>=aline && g_utf8_get_char(s)=='-')
1.327 enddash=TRUE;
1.328 check_for_control_characters(aline);
1.329 - if (warnings->bin)
1.330 - check_for_odd_characters(aline,warnings,isemptyline);
1.331 + check_for_odd_characters(aline,warnings,isemptyline);
1.332 if (warnings->longline)
1.333 check_for_long_line(aline);
1.334 if (warnings->shortline)
2.1 --- a/sample.ini Fri Oct 25 11:15:18 2013 +0100
2.2 +++ b/sample.ini Sun Oct 27 17:01:47 2013 +0000
2.3 @@ -29,3 +29,5 @@
2.4 web=false
2.5 # Verbose - list everything
2.6 verbose=false
2.7 +# Set of characters valid for this ebook
2.8 +charset=auto
3.1 --- a/test/bookloupe/Makefile.am Fri Oct 25 11:15:18 2013 +0100
3.2 +++ b/test/bookloupe/Makefile.am Sun Oct 27 17:01:47 2013 +0000
3.3 @@ -2,7 +2,7 @@
3.4 TESTS=non-ascii.tst long-line.tst curved-single-quotes.tst curved-quotes.tst \
3.5 runfox-quotes.tst curved-genitives.tst multi-line-illustration.tst \
3.6 emdash.tst config-internal.tst config-default.tst config-user.tst \
3.7 - config-override.tst footnote-marker.tst unix-lineends.tst \
3.8 - os9-lineends.tst
3.9 + config-override.tst charset-cp1252.tst charset-latin1.tst \
3.10 + footnote-marker.tst unix-lineends.tst os9-lineends.tst
3.11
3.12 dist_pkgdata_DATA=$(TESTS)
4.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
4.2 +++ b/test/bookloupe/charset-cp1252.tst Sun Oct 27 17:01:47 2013 +0000
4.3 @@ -0,0 +1,16 @@
4.4 +**************** OPTIONS ****************
4.5 +--charset=WINDOWS-1252
4.6 +**************** ENCODING ****************
4.7 +WINDOWS-1252
4.8 +**************** INPUT ****************
4.9 +Unless binary mode is engaged, gutcheck will warn about a number of
4.10 +characters defined in Windows-1252. Bookloupe provides support for
4.11 +disabling such checks without concern as to the file size and how
4.12 +many characters with the eighth bit set it may contain by allowing a
4.13 +character set to be declared. With the character set declared as
4.14 +WINDOWS-1252, all characters defined in Windows-1252 should be acceptable
4.15 +and no warnings should be issued.
4.16 +
4.17 +We test for this by including just one such character—the em dash.
4.18 +
4.19 +**************** EXPECTED ****************
5.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
5.2 +++ b/test/bookloupe/charset-latin1.tst Sun Oct 27 17:01:47 2013 +0000
5.3 @@ -0,0 +1,58 @@
5.4 +**************** OPTIONS ****************
5.5 +--charset=ISO-8859-1
5.6 +**************** ENCODING ****************
5.7 +WINDOWS-1252
5.8 +**************** INPUT ****************
5.9 +Where the character set declared is narrower than the character set
5.10 +implied by the encoding as in this case (Windows-1252 is a superset
5.11 +of the first latin alphabet defined in ECMA 94), then bookloupe should
5.12 +warn about characters that are not in the declared character set but
5.13 +should still recognise them and otherwise handle them as it would
5.14 +normally do. We use the curved apostrophe as a test for this since
5.15 +if bookloupe didn't recognise it then it would query the orphaned
5.16 +letters from the genitives and abbreviations.
5.17 +
5.18 +John Hendricks was bear-leading at the time. He had originally studied
5.19 +for Holy Orders, but had abandoned the Church later for private reasons
5.20 +connected with his faith, and had taken to teaching and tutoring
5.21 +instead. He was an honest, upstanding fellow of five-and-thirty,
5.22 +incorruptible, intelligent in a simple, straightforward way. He played
5.23 +games with his head, more than most Englishmen do, but he went through
5.24 +life without much calculation. He had qualities that made boys like
5.25 +and respect him; he won their confidence. Poor, proud, ambitious,
5.26 +he realised that fate offered him a chance when the Secretary of
5.27 +State for Scotland asked him if he would give up his other pupils
5.28 +for a year and take his son, Lord Ernie, round the world upon an
5.29 +educational trip that might make a man of him. For Lord Ernie was the
5.30 +only son, and the Marquess’s influence was naturally great. To have
5.31 +deposited a regenerated Lord Ernie at the castle gates might have
5.32 +guaranteed Hendricks’ future. After leaving Eton prematurely the lad
5.33 +had come under Hendricks’ charge for a time, and with such excellent
5.34 +results--‘I’d simply swear by that chap, you know,’ the boy used
5.35 +to say--that his father, considerably impressed, and rather as a
5.36 +last resort, had made this proposition. And Hendricks, without much
5.37 +calculation, had accepted it. He liked ‘Bindy’ for himself. It was
5.38 +in his heart to ‘make a man of him,’ if possible. They had now been
5.39 +round the world together and had come up from Brindisi to the Italian
5.40 +Lakes, and so into Switzerland. It was middle October. With a week or
5.41 +two to spare they were making leisurely for the ancestral halls in
5.42 +Aberdeenshire.
5.43 +**************** EXPECTED ****************
5.44 +
5.45 +only son, and the Marquess’s influence was naturally great. To have
5.46 + Line 22 column 27 - Non-ISO-8859-1 character 8217
5.47 +
5.48 +guaranteed Hendricks’ future. After leaving Eton prematurely the lad
5.49 + Line 24 column 21 - Non-ISO-8859-1 character 8217
5.50 +
5.51 +had come under Hendricks’ charge for a time, and with such excellent
5.52 + Line 25 column 25 - Non-ISO-8859-1 character 8217
5.53 +
5.54 +results--‘I’d simply swear by that chap, you know,’ the boy used
5.55 + Line 26 column 10 - Non-ISO-8859-1 character 8216
5.56 +
5.57 +calculation, had accepted it. He liked ‘Bindy’ for himself. It was
5.58 + Line 29 column 40 - Non-ISO-8859-1 character 8216
5.59 +
5.60 +in his heart to ‘make a man of him,’ if possible. They had now been
5.61 + Line 30 column 17 - Non-ISO-8859-1 character 8216
6.1 --- a/test/bookloupe/config-default.tst Fri Oct 25 11:15:18 2013 +0100
6.2 +++ b/test/bookloupe/config-default.tst Sun Oct 27 17:01:47 2013 +0000
6.3 @@ -30,6 +30,8 @@
6.4 usertypo=false
6.5 # Verbose - list everything
6.6 verbose=false
6.7 +# Set of characters valid for this ebook
6.8 +charset=auto
6.9 **************** EXPECTED(stdout) ****************
6.10 # Default configuration for bookloupe
6.11
6.12 @@ -60,3 +62,5 @@
6.13 usertypo=false
6.14 # Verbose - list everything
6.15 verbose=false
6.16 +# Set of characters valid for this ebook
6.17 +charset=auto
7.1 --- a/test/bookloupe/config-internal.tst Fri Oct 25 11:15:18 2013 +0100
7.2 +++ b/test/bookloupe/config-internal.tst Sun Oct 27 17:01:47 2013 +0000
7.3 @@ -30,3 +30,5 @@
7.4 usertypo=false
7.5 # Verbose - list everything
7.6 verbose=false
7.7 +# Set of characters valid for this ebook
7.8 +charset=auto
8.1 --- a/test/bookloupe/config-override.tst Fri Oct 25 11:15:18 2013 +0100
8.2 +++ b/test/bookloupe/config-override.tst Sun Oct 27 17:01:47 2013 +0000
8.3 @@ -1,5 +1,6 @@
8.4 **************** OPTIONS ****************
8.5 --usertypo
8.6 +--charset=auto
8.7 --dump-config
8.8 **************** INPUT(bookloupe.ini) ****************
8.9 # Relaxed configuration for bookloupe
8.10 @@ -31,6 +32,8 @@
8.11 usertypo=false
8.12 # Verbose - list everything
8.13 verbose=false
8.14 +# Set of characters valid for this ebook
8.15 +charset=UNICODE
8.16 **************** EXPECTED(stdout) ****************
8.17 # Relaxed configuration for bookloupe
8.18
8.19 @@ -61,3 +64,5 @@
8.20 usertypo=true
8.21 # Verbose - list everything
8.22 verbose=false
8.23 +# Set of characters valid for this ebook
8.24 +charset=auto
9.1 --- a/test/bookloupe/config-user.tst Fri Oct 25 11:15:18 2013 +0100
9.2 +++ b/test/bookloupe/config-user.tst Sun Oct 27 17:01:47 2013 +0000
9.3 @@ -35,6 +35,8 @@
9.4 usertypo=true
9.5 # Verbose - list everything - Contrary by name...
9.6 verbose=true
9.7 +# Set of characters valid for this ebook - Let's stick with Latin1
9.8 +charset=ISO-8859-1
9.9 **************** EXPECTED(stdout) ****************
9.10 # Mary Contrary's configuration for bookloupe
9.11
9.12 @@ -70,3 +72,5 @@
9.13 usertypo=true
9.14 # Verbose - list everything - Contrary by name...
9.15 verbose=true
9.16 +# Set of characters valid for this ebook - Let's stick with Latin1
9.17 +charset=ISO-8859-1