diff -r 189183b37598 -r 1aeda7fe17ca bookloupe/bookloupe.c --- a/bookloupe/bookloupe.c Mon Oct 21 23:36:40 2013 +0100 +++ b/bookloupe/bookloupe.c Mon Oct 21 23:39:54 2013 +0100 @@ -32,6 +32,9 @@ #include "pending.h" #include "HTMLentities.h" +gchar *charset; /* Or NULL for auto (ISO_8859-1/ASCII or UNICODE) */ +GIConv charset_validator=(GIConv)-1; + gchar *prevline; /* Common typos. */ @@ -127,6 +130,7 @@ }; gboolean pswit[SWITNO]; /* program switches */ +gchar *opt_charset; gboolean typo_compat,paranoid_compat; @@ -198,6 +202,8 @@ { "no-verbose", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH, "Switch off verbose mode", NULL }, + { "charset", 0, 0, G_OPTION_ARG_STRING, &opt_charset, + "Set of characters valid for this ebook", "NAME" }, { NULL } }; @@ -245,7 +251,7 @@ gboolean mixdigit(const char *); gchar *getaword(const char **); -char *flgets(char **,long,gboolean); +char *flgets(char **,long); void postprocess_for_HTML(char *); char *linehasmarkup(char *); char *losemarkup(char *); @@ -423,6 +429,49 @@ g_free(path); } +gboolean set_charset(const char *name,GError **err) +{ + /* The various UNICODE encodings all share the same character set. 
*/ + const char *unicode_aliases[]={ "UCS-2", "UCS-2BE", "UCS-2LE", "UCS-4", + "UCS-4BE", "UCS-4LE", "UCS2", "UCS4", "UNICODE", "UNICODEBIG", + "UNICODELITTLE", "UTF-7", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", + "UTF-32", "UTF-32BE", "UTF-32LE", "UTF7", "UTF8", "UTF16", "UTF16BE", + "UTF16LE", "UTF32", "UTF32BE", "UTF32LE" }; + int i; + if (charset) + g_free(charset); + if (charset_validator!=(GIConv)-1) + g_iconv_close(charset_validator); + if (!name || !g_ascii_strcasecmp(name,"auto")) + { + charset=NULL; + charset_validator=(GIConv)-1; + return TRUE; + } + else + charset=g_strdup(name); + for(i=0;imessage); + exit(1); + } if (pswit[DUMP_CONFIG_SWITCH]) { dump_config(); exit(0); } + g_free(opt_charset); + opt_charset=NULL; if (pswit[OVERVIEW_SWITCH]) /* just print summary; don't echo */ pswit[ECHO_SWITCH]=FALSE; @@ -542,7 +598,11 @@ exit(1); } if (g_utf8_validate(contents,len,NULL)) + { utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE); + if (!charset) + (void)set_charset("UNICODE",NULL); + } else utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL); g_free(contents); @@ -674,6 +734,7 @@ g_free(running_from); if (usertypo) g_tree_unref(usertypo); + set_charset(NULL,NULL); if (config) g_key_file_free(config); return 0; @@ -735,20 +796,11 @@ gchar *inword; QuoteClass qc; lines=g_strsplit(etext,"\n",0); - if (lines[0]) - /* If there's at least one line, we might have UNIX-style terminators */ - results.unix_lineends=TRUE; for (j=0;lines[j];j++) { lbytes=strlen(lines[j]); - if (lbytes>0 && lines[j][lbytes-1]=='\r') - { - results.unix_lineends=FALSE; - do - { - lines[j][--lbytes]='\0'; - } while (lbytes>0 && lines[j][lbytes-1]=='\r'); - } + while (lbytes>0 && lines[j][lbytes-1]=='\r') + lines[j][--lbytes]='\0'; llen=g_utf8_strlen(lines[j],lbytes); linecnt++; if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") && @@ -890,13 +942,6 @@ struct warnings *report_first_pass(struct first_pass_results *results) { static struct warnings 
warnings={0}; - warnings.nocr=1; - if (results->unix_lineends) - { - warnings.nocr=0; - g_print(" --> No lines in this file have a CR. Not reporting them. " - "Project Gutenberg requires that all lineends be CR-LF.\n"); - } if (cnt_spacend>0) g_print(" --> %ld lines in this file have white space at end\n", cnt_spacend); @@ -1004,25 +1049,32 @@ "Not reporting them.\n", results->spacedash+results->emdash.non_PG_space); } - /* If more than a quarter of characters are hi-bit, bug out. */ - warnings.bin=1; - if (results->binlen*4>results->totlen) + if (charset) + warnings.bin=0; + else { - g_print(" --> This file does not appear to be ASCII. " - "Terminating. Best of luck with it!\n"); - exit(1); - } - if (results->alphalen*4<results->totlen) - { - g_print(" --> This file does not appear to be text. " - "Terminating. Best of luck with it!\n"); - exit(1); - } - if (results->binlen*100>results->totlen || results->binlen>100) - { - g_print(" --> There are a lot of foreign letters here. " - "Not reporting them.\n"); - warnings.bin=0; + /* Charset ISO_8859-1/ASCII checks for compatibility with gutcheck */ + warnings.bin=1; + /* If more than a quarter of characters are hi-bit, bug out. */ + if (results->binlen*4>results->totlen) + { + g_print(" --> This file does not appear to be ASCII. " + "Terminating. Best of luck with it!\n"); + exit(1); + } + if (results->alphalen*4<results->totlen) + { + g_print(" --> This file does not appear to be text. " + "Terminating. Best of luck with it!\n"); + exit(1); + } + if (results->binlen*100>results->totlen || results->binlen>100) + { + g_print(" --> There are a lot of foreign letters here. 
" + "Not reporting them.\n"); + if (!pswit[VERBOSE_SWITCH]) + warnings.bin=0; + } } warnings.isDutch=FALSE; if (results->Dutchcount>50) @@ -1050,7 +1102,6 @@ g_print("\n"); if (pswit[VERBOSE_SWITCH]) { - warnings.bin=1; warnings.shortline=1; warnings.dotcomma=1; warnings.longline=1; @@ -1245,14 +1296,17 @@ gboolean isemptyline) { /* Don't repeat multiple warnings on one line. */ - gboolean eNon_A=FALSE,eTab=FALSE,eTilde=FALSE; + gboolean eInvalidChar=FALSE,eTab=FALSE,eTilde=FALSE; gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE; const char *s; gunichar c; + gsize nb; + gchar *t; for (s=aline;*s;s=g_utf8_next_char(s)) { c=g_utf8_get_char(s); - if (!eNon_A && (c127)) + if (warnings->bin && !eInvalidChar && + (c127)) { if (pswit[ECHO_SWITCH]) g_print("\n%s\n",aline); @@ -1267,7 +1321,57 @@ linecnt,g_utf8_pointer_to_offset(aline,s)+1,c); else cnt_bin++; - eNon_A=TRUE; + eInvalidChar=TRUE; + } + if (!eInvalidChar && charset) + { + if (charset_validator==(GIConv)-1) + { + if (!g_unichar_isdefined(c)) + { + if (pswit[ECHO_SWITCH]) + g_print("\n%s\n",aline); + if (!pswit[OVERVIEW_SWITCH]) + g_print(" Line %ld column %ld - Unassigned UNICODE " + "code point U+%04" G_GINT32_MODIFIER "X\n", + linecnt,g_utf8_pointer_to_offset(aline,s)+1,c); + else + cnt_bin++; + eInvalidChar=TRUE; + } + else if ((c>=0xE000 && c<=0xF8FF) || (c>=0xF0000 && c<=0xFFFFD) || + (c>=0x100000 && c<=0x10FFFD)) /* Private Use Areas: BMP, planes 15 and 16 */ + { + if (pswit[ECHO_SWITCH]) + g_print("\n%s\n",aline); + if (!pswit[OVERVIEW_SWITCH]) + g_print(" Line %ld column %ld - Private Use " + "character U+%04" G_GINT32_MODIFIER "X\n", + linecnt,g_utf8_pointer_to_offset(aline,s)+1,c); + else + cnt_bin++; + eInvalidChar=TRUE; + } + } + else + { + t=g_convert_with_iconv(s,g_utf8_next_char(s)-s, + charset_validator,NULL,&nb,NULL); + if (t) + g_free(t); + else + { + if (pswit[ECHO_SWITCH]) + g_print("\n%s\n",aline); + if (!pswit[OVERVIEW_SWITCH]) + g_print(" Line %ld column %ld - Non-%s " + "character %u\n",linecnt, 
g_utf8_pointer_to_offset(aline,s)+1,charset,c); + else + cnt_bin++; + eInvalidChar=TRUE; + } + } } if (!eTab && c==CHAR_TAB) { @@ -2885,7 +2989,7 @@ */ linecnt=0; etext_ptr=etext; - while ((aline=flgets(&etext_ptr,linecnt+1,warnings->nocr))) + while ((aline=flgets(&etext_ptr,linecnt+1))) { linecnt++; if (linecnt==1) @@ -2955,8 +3059,7 @@ if (s>=aline && g_utf8_get_char(s)=='-') enddash=TRUE; check_for_control_characters(aline); - if (warnings->bin) - check_for_odd_characters(aline,warnings,isemptyline); + check_for_odd_characters(aline,warnings,isemptyline); if (warnings->longline) check_for_long_line(aline); if (warnings->shortline) @@ -3031,7 +3134,7 @@ * * Returns: a pointer to the line. */ -char *flgets(char **etext,long lcnt,gboolean warn_nocr) +char *flgets(char **etext,long lcnt) { gunichar c; gboolean isCR=FALSE; @@ -3070,7 +3173,7 @@ else { /* Error - a LF without a preceding CR */ - if (pswit[LINE_END_SWITCH] && warn_nocr) + if (pswit[LINE_END_SWITCH]) { if (pswit[ECHO_SWITCH]) {