# HG changeset patch # User ali # Date 1382893307 0 # Node ID d22d8cd4f6285d16494b49a825d6c951e0d132c0 # Parent ff0aa9b1397accc693bba14931b32178d871bcdb Fix bug #13: Character sets diff -r ff0aa9b1397a -r d22d8cd4f628 bookloupe/bookloupe.c --- a/bookloupe/bookloupe.c Fri Oct 25 11:15:18 2013 +0100 +++ b/bookloupe/bookloupe.c Sun Oct 27 17:01:47 2013 +0000 @@ -32,6 +32,9 @@ #include "pending.h" #include "HTMLentities.h" +gchar *charset; /* Or NULL for auto (ISO_8859-1/ASCII or UNICODE) */ +GIConv charset_validator=(GIConv)-1; + gchar *prevline; /* Common typos. */ @@ -127,6 +130,7 @@ }; gboolean pswit[SWITNO]; /* program switches */ +gchar *opt_charset; gboolean typo_compat,paranoid_compat; @@ -198,6 +202,8 @@ { "no-verbose", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH, "Switch off verbose mode", NULL }, + { "charset", 0, 0, G_OPTION_ARG_STRING, &opt_charset, + "Set of characters valid for this ebook", "NAME" }, { NULL } }; @@ -262,11 +268,55 @@ UINT saved_cp; #endif +gboolean set_charset(const char *name,GError **err) +{ + /* The various UNICODE encodings all share the same character set. 
*/ + const char *unicode_aliases[]={ "UCS-2", "UCS-2BE", "UCS-2LE", "UCS-4", + "UCS-4BE", "UCS-4LE", "UCS2", "UCS4", "UNICODE", "UNICODEBIG", + "UNICODELITTLE", "UTF-7", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", + "UTF-32", "UTF-32BE", "UTF-32LE", "UTF7", "UTF8", "UTF16", "UTF16BE", + "UTF16LE", "UTF32", "UTF32BE", "UTF32LE" }; + int i; + if (charset) + g_free(charset); + if (charset_validator!=(GIConv)-1) + g_iconv_close(charset_validator); + if (!name || !g_strcasecmp(name,"auto")) + { + charset=NULL; + charset_validator=(GIConv)-1; + return TRUE; + } + else + charset=g_strdup(name); + for(i=0;imessage); g_clear_error(&err); } - if (options[j].flags&G_OPTION_FLAG_REVERSE) - sw=!sw; - *(gboolean *)options[j].arg_data=sw; + else + { + if (options[j].flags&G_OPTION_FLAG_REVERSE) + sw=!sw; + *(gboolean *)options[j].arg_data=sw; + } + break; + } + else if (options[j].arg==G_OPTION_ARG_STRING) + { + s=g_key_file_get_string(config,"options",keys[i], + &err); + if (err) + { + g_printerr("Bookloupe: %s: options.%s: %s\n", + path,keys[i],err->message); + g_clear_error(&err); + } + else + { + g_free(*(gchar **)options[j].arg_data); + if (!g_strcmp0(s,"auto")) + { + *(gchar **)options[j].arg_data=NULL; + g_free(s); + } + else + *(gchar **)options[j].arg_data=s; + } break; } else @@ -475,11 +558,18 @@ pswit[USERTYPO_SWITCH]=FALSE; pswit[DP_SWITCH]=FALSE; } + if (opt_charset && !set_charset(opt_charset,&err)) + { + g_printerr("%s\n",err->message); + exit(1); + } if (pswit[DUMP_CONFIG_SWITCH]) { dump_config(); exit(0); } + g_free(opt_charset); + opt_charset=NULL; if (pswit[OVERVIEW_SWITCH]) /* just print summary; don't echo */ pswit[ECHO_SWITCH]=FALSE; @@ -542,7 +632,11 @@ exit(1); } if (g_utf8_validate(contents,len,NULL)) + { utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE); + if (!charset) + (void)set_charset("UNICODE",NULL); + } else utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL); g_free(contents); @@ -674,6 +768,7 @@ g_free(running_from); 
if (usertypo) g_tree_unref(usertypo); + set_charset(NULL,NULL); if (config) g_key_file_free(config); return 0; @@ -1024,25 +1119,32 @@ "Not reporting them.\n", results->spacedash+results->emdash.non_PG_space); } - /* If more than a quarter of characters are hi-bit, bug out. */ - warnings.bin=1; - if (results->binlen*4>results->totlen) + if (charset) + warnings.bin=0; + else { - g_print(" --> This file does not appear to be ASCII. " - "Terminating. Best of luck with it!\n"); - exit(1); - } - if (results->alphalen*4totlen) - { - g_print(" --> This file does not appear to be text. " - "Terminating. Best of luck with it!\n"); - exit(1); - } - if (results->binlen*100>results->totlen || results->binlen>100) - { - g_print(" --> There are a lot of foreign letters here. " - "Not reporting them.\n"); - warnings.bin=0; + /* Charset ISO_8859-1/ASCII checks for compatibility with gutcheck */ + warnings.bin=1; + /* If more than a quarter of characters are hi-bit, bug out. */ + if (results->binlen*4>results->totlen) + { + g_print(" --> This file does not appear to be ASCII. " + "Terminating. Best of luck with it!\n"); + exit(1); + } + if (results->alphalen*4totlen) + { + g_print(" --> This file does not appear to be text. " + "Terminating. Best of luck with it!\n"); + exit(1); + } + if (results->binlen*100>results->totlen || results->binlen>100) + { + g_print(" --> There are a lot of foreign letters here. " + "Not reporting them.\n"); + if (!pswit[VERBOSE_SWITCH]) + warnings.bin=0; + } } warnings.isDutch=FALSE; if (results->Dutchcount>50) @@ -1070,7 +1172,6 @@ g_print("\n"); if (pswit[VERBOSE_SWITCH]) { - warnings.bin=1; warnings.shortline=1; warnings.dotcomma=1; warnings.longline=1; @@ -1265,14 +1366,17 @@ gboolean isemptyline) { /* Don't repeat multiple warnings on one line. 
*/ - gboolean eNon_A=FALSE,eTab=FALSE,eTilde=FALSE; + gboolean eInvalidChar=FALSE,eTab=FALSE,eTilde=FALSE; gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE; const char *s; gunichar c; + gsize nb; + gchar *t; for (s=aline;*s;s=g_utf8_next_char(s)) { c=g_utf8_get_char(s); - if (!eNon_A && (c127)) + if (warnings->bin && !eInvalidChar && + (c127)) { if (pswit[ECHO_SWITCH]) g_print("\n%s\n",aline); @@ -1287,7 +1391,57 @@ linecnt,g_utf8_pointer_to_offset(aline,s)+1,c); else cnt_bin++; - eNon_A=TRUE; + eInvalidChar=TRUE; + } + if (!eInvalidChar && charset) + { + if (charset_validator==(GIConv)-1) + { + if (!g_unichar_isdefined(c)) + { + if (pswit[ECHO_SWITCH]) + g_print("\n%s\n",aline); + if (!pswit[OVERVIEW_SWITCH]) + g_print(" Line %ld column %ld - Unassigned UNICODE " + "code point U+%04" G_GINT32_MODIFIER "X\n", + linecnt,g_utf8_pointer_to_offset(aline,s)+1,c); + else + cnt_bin++; + eInvalidChar=TRUE; + } + else if (c>=0xE000 && c<=0xF8FF || c>=0xF0000 && c<=0xFFFFD || + c>=0x100000 && c<=0x10FFFD) /* Private Use Areas: BMP, plane 15, plane 16 (hex bounds) */ + { + if (pswit[ECHO_SWITCH]) + g_print("\n%s\n",aline); + if (!pswit[OVERVIEW_SWITCH]) + g_print(" Line %ld column %ld - Private Use " + "character U+%04" G_GINT32_MODIFIER "X\n", + linecnt,g_utf8_pointer_to_offset(aline,s)+1,c); + else + cnt_bin++; + eInvalidChar=TRUE; + } + } + else + { + t=g_convert_with_iconv(s,g_utf8_next_char(s)-s, + charset_validator,NULL,&nb,NULL); + if (t) + g_free(t); + else + { + if (pswit[ECHO_SWITCH]) + g_print("\n%s\n",aline); + if (!pswit[OVERVIEW_SWITCH]) + g_print(" Line %ld column %ld - Non-%s " + "character %u\n",linecnt, + g_utf8_pointer_to_offset(aline,s)+1,charset,c); + else + cnt_bin++; + eInvalidChar=TRUE; + } + } } if (!eTab && c==CHAR_TAB) { @@ -2975,8 +3129,7 @@ if (s>=aline && g_utf8_get_char(s)=='-') enddash=TRUE; check_for_control_characters(aline); - if (warnings->bin) - check_for_odd_characters(aline,warnings,isemptyline); + check_for_odd_characters(aline,warnings,isemptyline); if (warnings->longline) check_for_long_line(aline); 
if (warnings->shortline) diff -r ff0aa9b1397a -r d22d8cd4f628 sample.ini --- a/sample.ini Fri Oct 25 11:15:18 2013 +0100 +++ b/sample.ini Sun Oct 27 17:01:47 2013 +0000 @@ -29,3 +29,5 @@ web=false # Verbose - list everything verbose=false +# Set of characters valid for this ebook +charset=auto diff -r ff0aa9b1397a -r d22d8cd4f628 test/bookloupe/Makefile.am --- a/test/bookloupe/Makefile.am Fri Oct 25 11:15:18 2013 +0100 +++ b/test/bookloupe/Makefile.am Sun Oct 27 17:01:47 2013 +0000 @@ -2,7 +2,7 @@ TESTS=non-ascii.tst long-line.tst curved-single-quotes.tst curved-quotes.tst \ runfox-quotes.tst curved-genitives.tst multi-line-illustration.tst \ emdash.tst config-internal.tst config-default.tst config-user.tst \ - config-override.tst footnote-marker.tst unix-lineends.tst \ - os9-lineends.tst + config-override.tst charset-cp1252.tst charset-latin1.tst \ + footnote-marker.tst unix-lineends.tst os9-lineends.tst dist_pkgdata_DATA=$(TESTS) diff -r ff0aa9b1397a -r d22d8cd4f628 test/bookloupe/charset-cp1252.tst --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/bookloupe/charset-cp1252.tst Sun Oct 27 17:01:47 2013 +0000 @@ -0,0 +1,16 @@ +**************** OPTIONS **************** +--charset=WINDOWS-1252 +**************** ENCODING **************** +WINDOWS-1252 +**************** INPUT **************** +Unless binary mode is engaged, gutcheck will warn about a number of +characters defined in Windows-1252. Bookloupe provides support for +disabling such checks without concern as to the file size and how +many characters with the eighth bit set it may contain by allowing a +character set to be declared. With the character set declared as +WINDOWS-1252, all characters defined in Windows-1252 should be acceptable +and no warnings should be issued. + +We test for this by including just one such character—the em dash. 
+ +**************** EXPECTED **************** diff -r ff0aa9b1397a -r d22d8cd4f628 test/bookloupe/charset-latin1.tst --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/bookloupe/charset-latin1.tst Sun Oct 27 17:01:47 2013 +0000 @@ -0,0 +1,58 @@ +**************** OPTIONS **************** +--charset=ISO-8859-1 +**************** ENCODING **************** +WINDOWS-1252 +**************** INPUT **************** +Where the character set declared is narrower than the character set +implied by the encoding as in this case (Windows-1252 is a superset +of the first latin alphabet defined in ECMA 94), then bookloupe should +warn about characters that are not in the declared character set but +should still recognise them and otherwise handle them as it would +normally do. We use the curved apostrophe as a test for this since +if bookloupe didn't recognise it then it would query the orphaned +letters from the genitives and abbreviations. + +John Hendricks was bear-leading at the time. He had originally studied +for Holy Orders, but had abandoned the Church later for private reasons +connected with his faith, and had taken to teaching and tutoring +instead. He was an honest, upstanding fellow of five-and-thirty, +incorruptible, intelligent in a simple, straightforward way. He played +games with his head, more than most Englishmen do, but he went through +life without much calculation. He had qualities that made boys like +and respect him; he won their confidence. Poor, proud, ambitious, +he realised that fate offered him a chance when the Secretary of +State for Scotland asked him if he would give up his other pupils +for a year and take his son, Lord Ernie, round the world upon an +educational trip that might make a man of him. For Lord Ernie was the +only son, and the Marquess’s influence was naturally great. To have +deposited a regenerated Lord Ernie at the castle gates might have +guaranteed Hendricks’ future. 
After leaving Eton prematurely the lad +had come under Hendricks’ charge for a time, and with such excellent +results--‘I’d simply swear by that chap, you know,’ the boy used +to say--that his father, considerably impressed, and rather as a +last resort, had made this proposition. And Hendricks, without much +calculation, had accepted it. He liked ‘Bindy’ for himself. It was +in his heart to ‘make a man of him,’ if possible. They had now been +round the world together and had come up from Brindisi to the Italian +Lakes, and so into Switzerland. It was middle October. With a week or +two to spare they were making leisurely for the ancestral halls in +Aberdeenshire. +**************** EXPECTED **************** + +only son, and the Marquess’s influence was naturally great. To have + Line 22 column 27 - Non-ISO-8859-1 character 8217 + +guaranteed Hendricks’ future. After leaving Eton prematurely the lad + Line 24 column 21 - Non-ISO-8859-1 character 8217 + +had come under Hendricks’ charge for a time, and with such excellent + Line 25 column 25 - Non-ISO-8859-1 character 8217 + +results--‘I’d simply swear by that chap, you know,’ the boy used + Line 26 column 10 - Non-ISO-8859-1 character 8216 + +calculation, had accepted it. He liked ‘Bindy’ for himself. It was + Line 29 column 40 - Non-ISO-8859-1 character 8216 + +in his heart to ‘make a man of him,’ if possible. 
They had now been + Line 30 column 17 - Non-ISO-8859-1 character 8216 diff -r ff0aa9b1397a -r d22d8cd4f628 test/bookloupe/config-default.tst --- a/test/bookloupe/config-default.tst Fri Oct 25 11:15:18 2013 +0100 +++ b/test/bookloupe/config-default.tst Sun Oct 27 17:01:47 2013 +0000 @@ -30,6 +30,8 @@ usertypo=false # Verbose - list everything verbose=false +# Set of characters valid for this ebook +charset=auto **************** EXPECTED(stdout) **************** # Default configuration for bookloupe @@ -60,3 +62,5 @@ usertypo=false # Verbose - list everything verbose=false +# Set of characters valid for this ebook +charset=auto diff -r ff0aa9b1397a -r d22d8cd4f628 test/bookloupe/config-internal.tst --- a/test/bookloupe/config-internal.tst Fri Oct 25 11:15:18 2013 +0100 +++ b/test/bookloupe/config-internal.tst Sun Oct 27 17:01:47 2013 +0000 @@ -30,3 +30,5 @@ usertypo=false # Verbose - list everything verbose=false +# Set of characters valid for this ebook +charset=auto diff -r ff0aa9b1397a -r d22d8cd4f628 test/bookloupe/config-override.tst --- a/test/bookloupe/config-override.tst Fri Oct 25 11:15:18 2013 +0100 +++ b/test/bookloupe/config-override.tst Sun Oct 27 17:01:47 2013 +0000 @@ -1,5 +1,6 @@ **************** OPTIONS **************** --usertypo +--charset=auto --dump-config **************** INPUT(bookloupe.ini) **************** # Relaxed configuration for bookloupe @@ -31,6 +32,8 @@ usertypo=false # Verbose - list everything verbose=false +# Set of characters valid for this ebook +charset=UNICODE **************** EXPECTED(stdout) **************** # Relaxed configuration for bookloupe @@ -61,3 +64,5 @@ usertypo=true # Verbose - list everything verbose=false +# Set of characters valid for this ebook +charset=auto diff -r ff0aa9b1397a -r d22d8cd4f628 test/bookloupe/config-user.tst --- a/test/bookloupe/config-user.tst Fri Oct 25 11:15:18 2013 +0100 +++ b/test/bookloupe/config-user.tst Sun Oct 27 17:01:47 2013 +0000 @@ -35,6 +35,8 @@ usertypo=true # Verbose - list 
everything - Contrary by name... verbose=true +# Set of characters valid for this ebook - Let's stick with Latin1 +charset=ISO-8859-1 **************** EXPECTED(stdout) **************** # Mary Contrary's configuration for bookloupe @@ -70,3 +72,5 @@ usertypo=true # Verbose - list everything - Contrary by name... verbose=true +# Set of characters valid for this ebook - Let's stick with Latin1 +charset=ISO-8859-1