diff -r adc06e9e8470 -r 2ff298db529e bookloupe/bookloupe.c --- a/bookloupe/bookloupe.c Mon Sep 23 21:18:27 2013 +0100 +++ b/bookloupe/bookloupe.c Sun Sep 29 09:19:46 2013 +0100 @@ -32,6 +32,9 @@ #include "pending.h" #include "HTMLentities.h" +gchar *charset; /* Or NULL for auto (ISO_8859-1/ASCII or UNICODE) */ +GIConv charset_validator=(GIConv)-1; + gchar *prevline; /* Common typos. */ @@ -127,6 +130,7 @@ }; gboolean pswit[SWITNO]; /* program switches */ +gchar *opt_charset; static GOptionEntry options[]={ { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH, @@ -157,6 +161,8 @@ "Defaults for use on www upload", NULL }, { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH, "Verbose - list everything", NULL }, + { "charset", 0, 0, G_OPTION_ARG_STRING, &opt_charset, + "Set of characters valid for this ebook", "NAME" }, { NULL } }; @@ -201,6 +207,49 @@ UINT saved_cp; #endif +gboolean set_charset(const char *name,GError **err) +{ + /* The various UNICODE encodings all share the same character set. 
*/ + const char *unicode_aliases[]={ "UCS-2", "UCS-2BE", "UCS-2LE", "UCS-4", + "UCS-4BE", "UCS-4LE", "UCS2", "UCS4", "UNICODE", "UNICODEBIG", + "UNICODELITTLE", "UTF-7", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", + "UTF-32", "UTF-32BE", "UTF-32LE", "UTF7", "UTF8", "UTF16", "UTF16BE", + "UTF16LE", "UTF32", "UTF32BE", "UTF32LE" }; + int i; + if (charset) + g_free(charset); + if (charset_validator!=(GIConv)-1) + g_iconv_close(charset_validator); + if (!name || !g_ascii_strcasecmp(name,"auto")) + { /* NULL or "auto": no explicit charset and no validator */ + charset=NULL; + charset_validator=(GIConv)-1; + return TRUE; + } + else + charset=g_strdup(name); + for(i=0;imessage); + exit(1); + } + g_free(opt_charset); + opt_charset=NULL; if (*argc<2) { proghelp(context); @@ -306,7 +362,11 @@ exit(1); } if (g_utf8_validate(contents,len,NULL)) + { utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE); + if (!charset) + (void)set_charset("UNICODE",NULL); + } else utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL); g_free(contents); @@ -431,6 +491,7 @@ g_free(running_from); if (usertypo) g_tree_unref(usertypo); + set_charset(NULL,NULL); return 0; } @@ -703,25 +764,32 @@ "Not reporting them.\n", results->spacedash+results->non_PG_space_emdash); } - /* If more than a quarter of characters are hi-bit, bug out. */ - warnings.bin=1; - if (results->binlen*4>results->totlen) + if (charset) + warnings.bin=0; + else { - g_print(" --> This file does not appear to be ASCII. " - "Terminating. Best of luck with it!\n"); - exit(1); - } - if (results->alphalen*4<results->totlen) - { - g_print(" --> This file does not appear to be text. " - "Terminating. Best of luck with it!\n"); - exit(1); - } - if (results->binlen*100>results->totlen || results->binlen>100) - { - g_print(" --> There are a lot of foreign letters here. " - "Not reporting them.\n"); - warnings.bin=0; + /* Charset ISO_8859-1/ASCII checks for compatibility with gutcheck */ + warnings.bin=1; + /* If more than a quarter of characters are hi-bit, bug out. 
*/ + if (results->binlen*4>results->totlen) + { + g_print(" --> This file does not appear to be ASCII. " + "Terminating. Best of luck with it!\n"); + exit(1); + } + if (results->alphalen*4<results->totlen) + { + g_print(" --> This file does not appear to be text. " + "Terminating. Best of luck with it!\n"); + exit(1); + } + if (results->binlen*100>results->totlen || results->binlen>100) + { + g_print(" --> There are a lot of foreign letters here. " + "Not reporting them.\n"); + if (!pswit[VERBOSE_SWITCH]) + warnings.bin=0; + } } warnings.isDutch=FALSE; if (results->Dutchcount>50) @@ -749,7 +817,6 @@ g_print("\n"); if (pswit[VERBOSE_SWITCH]) { - warnings.bin=1; warnings.shortline=1; warnings.dotcomma=1; warnings.longline=1; @@ -932,14 +999,17 @@ gboolean isemptyline) { /* Don't repeat multiple warnings on one line. */ - gboolean eNon_A=FALSE,eTab=FALSE,eTilde=FALSE; + gboolean eInvalidChar=FALSE,eTab=FALSE,eTilde=FALSE; gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE; const char *s; gunichar c; + gsize nb; + gchar *t; for (s=aline;*s;s=g_utf8_next_char(s)) { c=g_utf8_get_char(s); - if (!eNon_A && (c127)) + if (warnings->bin && !eInvalidChar && + (c127)) { if (pswit[ECHO_SWITCH]) g_print("\n%s\n",aline); @@ -954,7 +1024,57 @@ linecnt,g_utf8_pointer_to_offset(aline,s)+1,c); else cnt_bin++; - eNon_A=TRUE; + eInvalidChar=TRUE; + } + if (!eInvalidChar && charset) + { + if (charset_validator==(GIConv)-1) + { + if (!g_unichar_isdefined(c)) + { + if (pswit[ECHO_SWITCH]) + g_print("\n%s\n",aline); + if (!pswit[OVERVIEW_SWITCH]) + g_print(" Line %ld column %ld - Unassigned UNICODE " + "code point U+%04" G_GINT32_MODIFIER "X\n", + linecnt,g_utf8_pointer_to_offset(aline,s)+1,c); + else + cnt_bin++; + eInvalidChar=TRUE; + } + else if ((c>=0xE000 && c<=0xF8FF) || (c>=0xF0000 && c<=0xFFFFD) || + (c>=0x100000 && c<=0x10FFFD)) /* Private Use: BMP area, plane 15, plane 16 */ + { + if (pswit[ECHO_SWITCH]) + g_print("\n%s\n",aline); + if (!pswit[OVERVIEW_SWITCH]) + g_print(" Line %ld column %ld - Private Use " + "character U+%04" G_GINT32_MODIFIER 
"X\n", + linecnt,g_utf8_pointer_to_offset(aline,s)+1,c); + else + cnt_bin++; + eInvalidChar=TRUE; + } + } + else + { + t=g_convert_with_iconv(s,g_utf8_next_char(s)-s, + charset_validator,NULL,&nb,NULL); + if (t) + g_free(t); + else + { + if (pswit[ECHO_SWITCH]) + g_print("\n%s\n",aline); + if (!pswit[OVERVIEW_SWITCH]) + g_print(" Line %ld column %ld - Non-%s " + "character %u\n",linecnt, + g_utf8_pointer_to_offset(aline,s)+1,charset,c); + else + cnt_bin++; + eInvalidChar=TRUE; + } + } } if (!eTab && c==CHAR_TAB) { @@ -2592,8 +2712,7 @@ if (s>=aline && g_utf8_get_char(s)=='-') enddash=TRUE; check_for_control_characters(aline); - if (warnings->bin) - check_for_odd_characters(aline,warnings,isemptyline); + check_for_odd_characters(aline,warnings,isemptyline); if (warnings->longline) check_for_long_line(aline); if (warnings->shortline)