1.1 --- a/bookloupe/bookloupe.c Mon Oct 21 23:36:40 2013 +0100
1.2 +++ b/bookloupe/bookloupe.c Mon Oct 21 23:39:54 2013 +0100
1.3 @@ -32,6 +32,9 @@
1.4 #include "pending.h"
1.5 #include "HTMLentities.h"
1.6
1.7 +gchar *charset; /* Or NULL for auto (ISO_8859-1/ASCII or UNICODE) */
1.8 +GIConv charset_validator=(GIConv)-1;
1.9 +
1.10 gchar *prevline;
1.11
1.12 /* Common typos. */
1.13 @@ -127,6 +130,7 @@
1.14 };
1.15
1.16 gboolean pswit[SWITNO]; /* program switches */
1.17 +gchar *opt_charset;
1.18
1.19 gboolean typo_compat,paranoid_compat;
1.20
1.21 @@ -198,6 +202,8 @@
1.22 { "no-verbose", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
1.23 G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
1.24 "Switch off verbose mode", NULL },
1.25 + { "charset", 0, 0, G_OPTION_ARG_STRING, &opt_charset,
1.26 + "Set of characters valid for this ebook", "NAME" },
1.27 { NULL }
1.28 };
1.29
1.30 @@ -245,7 +251,7 @@
1.31
1.32 gboolean mixdigit(const char *);
1.33 gchar *getaword(const char **);
1.34 -char *flgets(char **,long,gboolean);
1.35 +char *flgets(char **,long);
1.36 void postprocess_for_HTML(char *);
1.37 char *linehasmarkup(char *);
1.38 char *losemarkup(char *);
1.39 @@ -423,6 +429,49 @@
1.40 g_free(path);
1.41 }
1.42
1.43 +/*
1.44 + * set_charset:
1.45 + *
1.46 + * Declare the ebook's character set; NULL or "auto" selects auto mode.
1.47 + * Returns: FALSE (setting err and reverting to auto) if name is unknown.
1.48 + */
1.49 +gboolean set_charset(const char *name,GError **err)
1.50 +{
1.51 +    /* The various UNICODE encodings all share the same character set. */
1.52 +    static const char *const unicode_aliases[]={ "UCS-2", "UCS-2BE", "UCS-2LE",
1.53 +      "UCS-4", "UCS-4BE", "UCS-4LE", "UCS2", "UCS4", "UNICODE", "UNICODEBIG",
1.54 +      "UNICODELITTLE", "UTF-7", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE",
1.55 +      "UTF-32", "UTF-32BE", "UTF-32LE", "UTF7", "UTF8", "UTF16", "UTF16BE",
1.56 +      "UTF16LE", "UTF32", "UTF32BE", "UTF32LE" };
1.57 +    gsize i;
1.58 +    /* Drop any previous selection; g_free() accepts NULL. */
1.59 +    g_free(charset);
1.60 +    charset=NULL;
1.61 +    if (charset_validator!=(GIConv)-1)
1.62 +        g_iconv_close(charset_validator);
1.63 +    charset_validator=(GIConv)-1;
1.64 +    if (!name || !g_ascii_strcasecmp(name,"auto"))
1.65 +        return TRUE;
1.66 +    /* Charset names are ASCII, so compare independently of the locale. */
1.67 +    for(i=0;i<G_N_ELEMENTS(unicode_aliases);i++)
1.68 +        if (!g_ascii_strcasecmp(name,unicode_aliases[i]))
1.69 +        {
1.70 +            /* UNICODE is stored as "UTF-8" and needs no validator. */
1.71 +            charset=g_strdup("UTF-8");
1.72 +            return TRUE;
1.73 +        }
1.74 +    charset_validator=g_iconv_open(name,"UTF-8");
1.75 +    if (charset_validator==(GIConv)-1)
1.76 +    {
1.77 +        /* charset stays NULL so a failed call cannot select UNICODE. */
1.78 +        g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_NO_CONVERSION,
1.79 +          "Unknown character set \"%s\"",name);
1.80 +        return FALSE;
1.81 +    }
1.82 +    charset=g_strdup(name);
1.83 +    return TRUE;
1.84 +}
1.85 +
1.86 void parse_options(int *argc,char ***argv)
1.87 {
1.88 GError *err=NULL;
1.89 @@ -475,11 +524,18 @@
1.90 pswit[USERTYPO_SWITCH]=FALSE;
1.91 pswit[DP_SWITCH]=FALSE;
1.92 }
1.93 + if (opt_charset && !set_charset(opt_charset,&err))
1.94 + {
1.95 + g_printerr("%s\n",err->message);
1.96 + exit(1);
1.97 + }
1.98 if (pswit[DUMP_CONFIG_SWITCH])
1.99 {
1.100 dump_config();
1.101 exit(0);
1.102 }
1.103 + g_free(opt_charset);
1.104 + opt_charset=NULL;
1.105 if (pswit[OVERVIEW_SWITCH])
1.106 /* just print summary; don't echo */
1.107 pswit[ECHO_SWITCH]=FALSE;
1.108 @@ -542,7 +598,11 @@
1.109 exit(1);
1.110 }
1.111 if (g_utf8_validate(contents,len,NULL))
1.112 + {
1.113 utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
1.114 + if (!charset)
1.115 + (void)set_charset("UNICODE",NULL);
1.116 + }
1.117 else
1.118 utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
1.119 g_free(contents);
1.120 @@ -674,6 +734,7 @@
1.121 g_free(running_from);
1.122 if (usertypo)
1.123 g_tree_unref(usertypo);
1.124 + set_charset(NULL,NULL);
1.125 if (config)
1.126 g_key_file_free(config);
1.127 return 0;
1.128 @@ -735,20 +796,11 @@
1.129 gchar *inword;
1.130 QuoteClass qc;
1.131 lines=g_strsplit(etext,"\n",0);
1.132 - if (lines[0])
1.133 - /* If there's at least one line, we might have UNIX-style terminators */
1.134 - results.unix_lineends=TRUE;
1.135 for (j=0;lines[j];j++)
1.136 {
1.137 lbytes=strlen(lines[j]);
1.138 - if (lbytes>0 && lines[j][lbytes-1]=='\r')
1.139 - {
1.140 - results.unix_lineends=FALSE;
1.141 - do
1.142 - {
1.143 - lines[j][--lbytes]='\0';
1.144 - } while (lbytes>0 && lines[j][lbytes-1]=='\r');
1.145 - }
1.146 + while (lbytes>0 && lines[j][lbytes-1]=='\r')
1.147 + lines[j][--lbytes]='\0';
1.148 llen=g_utf8_strlen(lines[j],lbytes);
1.149 linecnt++;
1.150 if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
1.151 @@ -890,13 +942,6 @@
1.152 struct warnings *report_first_pass(struct first_pass_results *results)
1.153 {
1.154 static struct warnings warnings={0};
1.155 - warnings.nocr=1;
1.156 - if (results->unix_lineends)
1.157 - {
1.158 - warnings.nocr=0;
1.159 - g_print(" --> No lines in this file have a CR. Not reporting them. "
1.160 - "Project Gutenberg requires that all lineends be CR-LF.\n");
1.161 - }
1.162 if (cnt_spacend>0)
1.163 g_print(" --> %ld lines in this file have white space at end\n",
1.164 cnt_spacend);
1.165 @@ -1004,25 +1049,32 @@
1.166 "Not reporting them.\n",
1.167 results->spacedash+results->emdash.non_PG_space);
1.168 }
1.169 - /* If more than a quarter of characters are hi-bit, bug out. */
1.170 - warnings.bin=1;
1.171 - if (results->binlen*4>results->totlen)
1.172 + if (charset)
1.173 + warnings.bin=0;
1.174 + else
1.175 {
1.176 - g_print(" --> This file does not appear to be ASCII. "
1.177 - "Terminating. Best of luck with it!\n");
1.178 - exit(1);
1.179 - }
1.180 - if (results->alphalen*4<results->totlen)
1.181 - {
1.182 - g_print(" --> This file does not appear to be text. "
1.183 - "Terminating. Best of luck with it!\n");
1.184 - exit(1);
1.185 - }
1.186 - if (results->binlen*100>results->totlen || results->binlen>100)
1.187 - {
1.188 - g_print(" --> There are a lot of foreign letters here. "
1.189 - "Not reporting them.\n");
1.190 - warnings.bin=0;
1.191 + /* Charset ISO_8859-1/ASCII checks for compatibility with gutcheck */
1.192 + warnings.bin=1;
1.193 + /* If more than a quarter of characters are hi-bit, bug out. */
1.194 + if (results->binlen*4>results->totlen)
1.195 + {
1.196 + g_print(" --> This file does not appear to be ASCII. "
1.197 + "Terminating. Best of luck with it!\n");
1.198 + exit(1);
1.199 + }
1.200 + if (results->alphalen*4<results->totlen)
1.201 + {
1.202 + g_print(" --> This file does not appear to be text. "
1.203 + "Terminating. Best of luck with it!\n");
1.204 + exit(1);
1.205 + }
1.206 + if (results->binlen*100>results->totlen || results->binlen>100)
1.207 + {
1.208 + g_print(" --> There are a lot of foreign letters here. "
1.209 + "Not reporting them.\n");
1.210 + if (!pswit[VERBOSE_SWITCH])
1.211 + warnings.bin=0;
1.212 + }
1.213 }
1.214 warnings.isDutch=FALSE;
1.215 if (results->Dutchcount>50)
1.216 @@ -1050,7 +1102,6 @@
1.217 g_print("\n");
1.218 if (pswit[VERBOSE_SWITCH])
1.219 {
1.220 - warnings.bin=1;
1.221 warnings.shortline=1;
1.222 warnings.dotcomma=1;
1.223 warnings.longline=1;
1.224 @@ -1245,14 +1296,17 @@
1.225 gboolean isemptyline)
1.226 {
1.227 /* Don't repeat multiple warnings on one line. */
1.228 - gboolean eNon_A=FALSE,eTab=FALSE,eTilde=FALSE;
1.229 + gboolean eInvalidChar=FALSE,eTab=FALSE,eTilde=FALSE;
1.230 gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
1.231 const char *s;
1.232 gunichar c;
1.233 + gsize nb;
1.234 + gchar *t;
1.235 for (s=aline;*s;s=g_utf8_next_char(s))
1.236 {
1.237 c=g_utf8_get_char(s);
1.238 - if (!eNon_A && (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
1.239 + if (warnings->bin && !eInvalidChar &&
1.240 + (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
1.241 {
1.242 if (pswit[ECHO_SWITCH])
1.243 g_print("\n%s\n",aline);
1.244 @@ -1267,7 +1321,57 @@
1.245 linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1.246 else
1.247 cnt_bin++;
1.248 - eNon_A=TRUE;
1.249 + eInvalidChar=TRUE;
1.250 + }
1.251 +        if (!eInvalidChar && charset)
1.252 +        {
1.253 +            if (charset_validator==(GIConv)-1) /* UNICODE: no converter */
1.254 +            {
1.255 +                if (!g_unichar_isdefined(c))
1.256 +                {
1.257 +                    if (pswit[ECHO_SWITCH])
1.258 +                        g_print("\n%s\n",aline);
1.259 +                    if (!pswit[OVERVIEW_SWITCH])
1.260 +                        g_print(" Line %ld column %ld - Unassigned UNICODE "
1.261 +                          "code point U+%04" G_GINT32_MODIFIER "X\n",
1.262 +                          linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1.263 +                    else
1.264 +                        cnt_bin++;
1.265 +                    eInvalidChar=TRUE;
1.266 +                }
1.267 +                else if (c>=0xE000 && c<=0xF8FF || c>=0xF0000 && c<=0xFFFFD ||
1.268 +                  c>=0x100000 && c<=0x10FFFD) /* BMP, plane 15 and 16 PUAs */
1.269 +                {
1.270 +                    if (pswit[ECHO_SWITCH])
1.271 +                        g_print("\n%s\n",aline);
1.272 +                    if (!pswit[OVERVIEW_SWITCH])
1.273 +                        g_print(" Line %ld column %ld - Private Use "
1.274 +                          "character U+%04" G_GINT32_MODIFIER "X\n",
1.275 +                          linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
1.276 +                    else
1.277 +                        cnt_bin++;
1.278 +                    eInvalidChar=TRUE;
1.279 +                }
1.280 +            }
1.281 +            else /* c must convert from UTF-8 to the declared charset */
1.282 +            {
1.283 +                t=g_convert_with_iconv(s,g_utf8_next_char(s)-s,
1.284 +                  charset_validator,NULL,&nb,NULL);
1.285 +                if (t)
1.286 +                    g_free(t);
1.287 +                else
1.288 +                {
1.289 +                    if (pswit[ECHO_SWITCH])
1.290 +                        g_print("\n%s\n",aline);
1.291 +                    if (!pswit[OVERVIEW_SWITCH])
1.292 +                        g_print(" Line %ld column %ld - Non-%s "
1.293 +                          "character %u\n",linecnt,
1.294 +                          g_utf8_pointer_to_offset(aline,s)+1,charset,c);
1.295 +                    else
1.296 +                        cnt_bin++;
1.297 +                    eInvalidChar=TRUE;
1.298 +                }
1.299 +            }
1.300 +        }
1.301 if (!eTab && c==CHAR_TAB)
1.302 {
1.303 @@ -2885,7 +2989,7 @@
1.304 */
1.305 linecnt=0;
1.306 etext_ptr=etext;
1.307 - while ((aline=flgets(&etext_ptr,linecnt+1,warnings->nocr)))
1.308 + while ((aline=flgets(&etext_ptr,linecnt+1)))
1.309 {
1.310 linecnt++;
1.311 if (linecnt==1)
1.312 @@ -2955,8 +3059,7 @@
1.313 if (s>=aline && g_utf8_get_char(s)=='-')
1.314 enddash=TRUE;
1.315 check_for_control_characters(aline);
1.316 - if (warnings->bin)
1.317 - check_for_odd_characters(aline,warnings,isemptyline);
1.318 + check_for_odd_characters(aline,warnings,isemptyline);
1.319 if (warnings->longline)
1.320 check_for_long_line(aline);
1.321 if (warnings->shortline)
1.322 @@ -3031,7 +3134,7 @@
1.323 *
1.324 * Returns: a pointer to the line.
1.325 */
1.326 -char *flgets(char **etext,long lcnt,gboolean warn_nocr)
1.327 +char *flgets(char **etext,long lcnt)
1.328 {
1.329 gunichar c;
1.330 gboolean isCR=FALSE;
1.331 @@ -3070,7 +3173,7 @@
1.332 else
1.333 {
1.334 /* Error - a LF without a preceding CR */
1.335 - if (pswit[LINE_END_SWITCH] && warn_nocr)
1.336 + if (pswit[LINE_END_SWITCH])
1.337 {
1.338 if (pswit[ECHO_SWITCH])
1.339 {
2.1 --- a/test/bookloupe/Makefile.am Mon Oct 21 23:36:40 2013 +0100
2.2 +++ b/test/bookloupe/Makefile.am Mon Oct 21 23:39:54 2013 +0100
2.3 @@ -2,6 +2,7 @@
2.4 TESTS=non-ascii.tst long-line.tst curved-single-quotes.tst curved-quotes.tst \
2.5 runfox-quotes.tst curved-genitives.tst multi-line-illustration.tst \
2.6 emdash.tst config-internal.tst config-default.tst config-user.tst \
2.7 - config-override.tst footnote-marker.tst unix-lineends.tst
2.8 + config-override.tst charset-cp1252.tst charset-latin1.tst \
2.9 + footnote-marker.tst unix-lineends.tst
2.10
2.11 dist_pkgdata_DATA=$(TESTS)
3.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
3.2 +++ b/test/bookloupe/charset-cp1252.tst Mon Oct 21 23:39:54 2013 +0100
3.3 @@ -0,0 +1,16 @@
3.4 +**************** OPTIONS ****************
3.5 +--charset=WINDOWS-1252
3.6 +**************** ENCODING ****************
3.7 +WINDOWS-1252
3.8 +**************** INPUT ****************
3.9 +Unless binary mode is engaged, gutcheck will warn about a number of
3.10 +characters defined in Windows-1252. Bookloupe provides support for
3.11 +disabling such checks without concern as to the file size and how
3.12 +many characters with the eighth bit set it may contain by allowing a
3.13 +character set to be declared. With the character set declared as
3.14 +WINDOWS-1252, all characters defined in Windows-1252 should be acceptable
3.15 +and no warnings should be issued.
3.16 +
3.17 +We test for this by including just one such character—the em dash.
3.18 +
3.19 +**************** EXPECTED ****************
4.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
4.2 +++ b/test/bookloupe/charset-latin1.tst Mon Oct 21 23:39:54 2013 +0100
4.3 @@ -0,0 +1,58 @@
4.4 +**************** OPTIONS ****************
4.5 +--charset=ISO-8859-1
4.6 +**************** ENCODING ****************
4.7 +WINDOWS-1252
4.8 +**************** INPUT ****************
4.9 +Where the character set declared is narrower than the character set
4.10 +implied by the encoding as in this case (Windows-1252 is a superset
4.11 +of the first latin alphabet defined in ECMA 94), then bookloupe should
4.12 +warn about characters that are not in the declared character set but
4.13 +should still recognise them and otherwise handle them as it would
4.14 +normally do. We use the curved apostrophe as a test for this since
4.15 +if bookloupe didn't recognise it then it would query the orphaned
4.16 +letters from the genitives and abbreviations.
4.17 +
4.18 +John Hendricks was bear-leading at the time. He had originally studied
4.19 +for Holy Orders, but had abandoned the Church later for private reasons
4.20 +connected with his faith, and had taken to teaching and tutoring
4.21 +instead. He was an honest, upstanding fellow of five-and-thirty,
4.22 +incorruptible, intelligent in a simple, straightforward way. He played
4.23 +games with his head, more than most Englishmen do, but he went through
4.24 +life without much calculation. He had qualities that made boys like
4.25 +and respect him; he won their confidence. Poor, proud, ambitious,
4.26 +he realised that fate offered him a chance when the Secretary of
4.27 +State for Scotland asked him if he would give up his other pupils
4.28 +for a year and take his son, Lord Ernie, round the world upon an
4.29 +educational trip that might make a man of him. For Lord Ernie was the
4.30 +only son, and the Marquess’s influence was naturally great. To have
4.31 +deposited a regenerated Lord Ernie at the castle gates might have
4.32 +guaranteed Hendricks’ future. After leaving Eton prematurely the lad
4.33 +had come under Hendricks’ charge for a time, and with such excellent
4.34 +results--‘I’d simply swear by that chap, you know,’ the boy used
4.35 +to say--that his father, considerably impressed, and rather as a
4.36 +last resort, had made this proposition. And Hendricks, without much
4.37 +calculation, had accepted it. He liked ‘Bindy’ for himself. It was
4.38 +in his heart to ‘make a man of him,’ if possible. They had now been
4.39 +round the world together and had come up from Brindisi to the Italian
4.40 +Lakes, and so into Switzerland. It was middle October. With a week or
4.41 +two to spare they were making leisurely for the ancestral halls in
4.42 +Aberdeenshire.
4.43 +**************** EXPECTED ****************
4.44 +
4.45 +only son, and the Marquess’s influence was naturally great. To have
4.46 + Line 22 column 27 - Non-ISO-8859-1 character 8217
4.47 +
4.48 +guaranteed Hendricks’ future. After leaving Eton prematurely the lad
4.49 + Line 24 column 21 - Non-ISO-8859-1 character 8217
4.50 +
4.51 +had come under Hendricks’ charge for a time, and with such excellent
4.52 + Line 25 column 25 - Non-ISO-8859-1 character 8217
4.53 +
4.54 +results--‘I’d simply swear by that chap, you know,’ the boy used
4.55 + Line 26 column 10 - Non-ISO-8859-1 character 8216
4.56 +
4.57 +calculation, had accepted it. He liked ‘Bindy’ for himself. It was
4.58 + Line 29 column 40 - Non-ISO-8859-1 character 8216
4.59 +
4.60 +in his heart to ‘make a man of him,’ if possible. They had now been
4.61 + Line 30 column 17 - Non-ISO-8859-1 character 8216