diff -r 189183b37598 -r 1aeda7fe17ca bookloupe/bookloupe.c
--- a/bookloupe/bookloupe.c	Mon Oct 21 23:36:40 2013 +0100
+++ b/bookloupe/bookloupe.c	Mon Oct 21 23:39:54 2013 +0100
@@ -32,6 +32,9 @@
 #include "pending.h"
 #include "HTMLentities.h"
 
+gchar *charset;		/* Or NULL for auto (ISO_8859-1/ASCII or UNICODE) */
+GIConv charset_validator=(GIConv)-1;
+
 gchar *prevline;
 
 /* Common typos. */
@@ -127,6 +130,7 @@
 }; 
 
 gboolean pswit[SWITNO];  /* program switches */
+gchar *opt_charset;
 
 gboolean typo_compat,paranoid_compat;
 
@@ -198,6 +202,8 @@
     { "no-verbose", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE,
       G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH,
       "Switch off verbose mode", NULL },
+    { "charset", 0, 0, G_OPTION_ARG_STRING, &opt_charset,
+      "Set of characters valid for this ebook", "NAME" },
     { NULL }
 };
 
@@ -245,7 +251,7 @@
 
 gboolean mixdigit(const char *);
 gchar *getaword(const char **);
-char *flgets(char **,long,gboolean);
+char *flgets(char **,long);
 void postprocess_for_HTML(char *);
 char *linehasmarkup(char *);
 char *losemarkup(char *);
@@ -423,6 +429,49 @@
 	g_free(path);
 }
 
+/* Select the character set that the ebook's text is validated against.
+ * name: an iconv charset name, or NULL/"auto" to return to auto mode.
+ * Returns FALSE, sets err and leaves auto mode selected if name is unknown. */
+gboolean set_charset(const char *name,GError **err)
+{
+    /* The various UNICODE encodings all share the same character set. */
+    static const char *const unicode_aliases[]={ "UCS-2", "UCS-2BE", "UCS-2LE",
+      "UCS-4", "UCS-4BE", "UCS-4LE", "UCS2", "UCS4", "UNICODE", "UNICODEBIG",
+      "UNICODELITTLE", "UTF-7", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE",
+      "UTF-32", "UTF-32BE", "UTF-32LE", "UTF7", "UTF8", "UTF16", "UTF16BE",
+      "UTF16LE", "UTF32", "UTF32BE", "UTF32LE" };
+    gsize i;
+    gchar *requested=NULL;
+    if (name && g_ascii_strcasecmp(name,"auto"))
+	requested=g_strdup(name);	/* copy first in case name aliases charset */
+    g_free(charset);
+    charset=NULL;
+    if (charset_validator!=(GIConv)-1)
+	g_iconv_close(charset_validator);
+    charset_validator=(GIConv)-1;
+    if (!requested)
+	return TRUE;
+    for(i=0;i<G_N_ELEMENTS(unicode_aliases);i++)
+	if (!g_ascii_strcasecmp(requested,unicode_aliases[i]))
+	{
+	    /* With charset set, a validator of -1 selects the UNICODE checks. */
+	    g_free(requested);
+	    charset=g_strdup("UTF-8");
+	    return TRUE;
+	}
+    charset_validator=g_iconv_open(requested,"UTF-8");
+    if (charset_validator==(GIConv)-1)
+    {
+	/* Don't keep an unusable name: it would be mistaken for UNICODE. */
+	g_set_error(err,G_CONVERT_ERROR,G_CONVERT_ERROR_NO_CONVERSION,
+	  "Unknown character set \"%s\"",requested);
+	g_free(requested);
+	return FALSE;
+    }
+    charset=requested;
+    return TRUE;
+}
+
 void parse_options(int *argc,char ***argv)
 {
     GError *err=NULL;
@@ -475,11 +524,18 @@
 	pswit[USERTYPO_SWITCH]=FALSE;
 	pswit[DP_SWITCH]=FALSE;
     }
+    if (opt_charset && !set_charset(opt_charset,&err))
+    {
+	g_printerr("%s\n",err->message);
+	exit(1);
+    }
     if (pswit[DUMP_CONFIG_SWITCH])
     {
 	dump_config();
 	exit(0);
     }
+    g_free(opt_charset);
+    opt_charset=NULL;
     if (pswit[OVERVIEW_SWITCH])
 	/* just print summary; don't echo */
 	pswit[ECHO_SWITCH]=FALSE;
@@ -542,7 +598,11 @@
 	exit(1);
     }
     if (g_utf8_validate(contents,len,NULL))
+    {
 	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
+	if (!charset)
+	    (void)set_charset("UNICODE",NULL);
+    }
     else
 	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
     g_free(contents);
@@ -674,6 +734,7 @@
     g_free(running_from);
     if (usertypo)
 	g_tree_unref(usertypo);
+    set_charset(NULL,NULL);
     if (config)
 	g_key_file_free(config);
     return 0;
@@ -735,20 +796,11 @@
     gchar *inword;
     QuoteClass qc;
     lines=g_strsplit(etext,"\n",0);
-    if (lines[0])
-	/* If there's at least one line, we might have UNIX-style terminators */
-	results.unix_lineends=TRUE;
     for (j=0;lines[j];j++)
     {
 	lbytes=strlen(lines[j]);
-	if (lbytes>0 && lines[j][lbytes-1]=='\r')
-	{
-	    results.unix_lineends=FALSE;
-	    do
-	    {
-		lines[j][--lbytes]='\0';
-	    } while (lbytes>0 && lines[j][lbytes-1]=='\r');
-	}
+	while (lbytes>0 && lines[j][lbytes-1]=='\r')
+	    lines[j][--lbytes]='\0';
 	llen=g_utf8_strlen(lines[j],lbytes);
 	linecnt++;
 	if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") &&
@@ -890,13 +942,6 @@
 struct warnings *report_first_pass(struct first_pass_results *results)
 {
     static struct warnings warnings={0};
-    warnings.nocr=1;
-    if (results->unix_lineends)
-    {
-	warnings.nocr=0;
-	g_print("   --> No lines in this file have a CR. Not reporting them. "
-	  "Project Gutenberg requires that all lineends be CR-LF.\n");
-    }
     if (cnt_spacend>0)
 	g_print("   --> %ld lines in this file have white space at end\n",
 	  cnt_spacend);
@@ -1004,25 +1049,32 @@
 	  "Not reporting them.\n",
 	  results->spacedash+results->emdash.non_PG_space);
     }
-    /* If more than a quarter of characters are hi-bit, bug out. */
-    warnings.bin=1;
-    if (results->binlen*4>results->totlen)
+    if (charset)
+	warnings.bin=0;
+    else
     {
-	g_print("   --> This file does not appear to be ASCII. "
-	  "Terminating. Best of luck with it!\n");
-	exit(1);
-    }
-    if (results->alphalen*4<results->totlen)
-    {
-	g_print("   --> This file does not appear to be text. "
-	  "Terminating. Best of luck with it!\n");
-	exit(1);
-    }
-    if (results->binlen*100>results->totlen || results->binlen>100)
-    {
-	g_print("   --> There are a lot of foreign letters here. "
-	  "Not reporting them.\n");
-	warnings.bin=0;
+	/* Charset ISO_8859-1/ASCII checks for compatibility with gutcheck */
+	warnings.bin=1;
+	/* If more than a quarter of characters are hi-bit, bug out. */
+	if (results->binlen*4>results->totlen)
+	{
+	    g_print("   --> This file does not appear to be ASCII. "
+	      "Terminating. Best of luck with it!\n");
+	    exit(1);
+	}
+	if (results->alphalen*4<results->totlen)
+	{
+	    g_print("   --> This file does not appear to be text. "
+	      "Terminating. Best of luck with it!\n");
+	    exit(1);
+	}
+	if (results->binlen*100>results->totlen || results->binlen>100)
+	{
+	    g_print("   --> There are a lot of foreign letters here. "
+	      "Not reporting them.\n");
+	    if (!pswit[VERBOSE_SWITCH])
+		warnings.bin=0;
+	}
     }
     warnings.isDutch=FALSE;
     if (results->Dutchcount>50)
@@ -1050,7 +1102,6 @@
     g_print("\n");
     if (pswit[VERBOSE_SWITCH])
     {
-	warnings.bin=1;
 	warnings.shortline=1;
 	warnings.dotcomma=1;
 	warnings.longline=1;
@@ -1245,14 +1296,17 @@
   gboolean isemptyline)
 {
     /* Don't repeat multiple warnings on one line. */
-    gboolean eNon_A=FALSE,eTab=FALSE,eTilde=FALSE;
+    gboolean eInvalidChar=FALSE,eTab=FALSE,eTilde=FALSE;
     gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE;
     const char *s;
     gunichar c;
+    gsize nb;
+    gchar *t;
     for (s=aline;*s;s=g_utf8_next_char(s))
     {
 	c=g_utf8_get_char(s);
-	if (!eNon_A && (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
+	if (warnings->bin && !eInvalidChar &&
+	  (c<CHAR_SPACE && c!='\t' && c!='\n' || c>127))
 	{
 	    if (pswit[ECHO_SWITCH])
 		g_print("\n%s\n",aline);
@@ -1267,7 +1321,57 @@
 		      linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
 	    else
 		cnt_bin++;
-	    eNon_A=TRUE;
+	    eInvalidChar=TRUE;
+	}
+	if (!eInvalidChar && charset)	/* charset set: validate c */
+	{
+	    if (charset_validator==(GIConv)-1)	/* UNICODE: no converter */
+	    {
+		if (!g_unichar_isdefined(c))
+		{
+		    if (pswit[ECHO_SWITCH])
+			g_print("\n%s\n",aline);
+		    if (!pswit[OVERVIEW_SWITCH])
+			g_print("    Line %ld column %ld - Unassigned UNICODE "
+			  "code point U+%04" G_GINT32_MODIFIER "X\n",
+			  linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
+		    else
+			cnt_bin++;
+		    eInvalidChar=TRUE;
+		}
+		else if (c>=0xE000 && c<=0xF8FF || c>=0xF0000 && c<=0xFFFFD ||
+		  c>=0x100000 && c<=0x10FFFD)	/* BMP PUA, planes 15 and 16 */
+		{
+		    if (pswit[ECHO_SWITCH])
+			g_print("\n%s\n",aline);
+		    if (!pswit[OVERVIEW_SWITCH])
+			g_print("    Line %ld column %ld - Private Use "
+			  "character U+%04" G_GINT32_MODIFIER "X\n",
+			  linecnt,g_utf8_pointer_to_offset(aline,s)+1,c);
+		    else
+			cnt_bin++;
+		    eInvalidChar=TRUE;
+		}
+	    }
+	    else	/* other charsets: c must convert from UTF-8 */
+	    {
+		t=g_convert_with_iconv(s,g_utf8_next_char(s)-s,
+		  charset_validator,NULL,&nb,NULL);
+		if (t)
+		    g_free(t);
+		else
+		{
+		    if (pswit[ECHO_SWITCH])
+			g_print("\n%s\n",aline);
+		    if (!pswit[OVERVIEW_SWITCH])
+			g_print("    Line %ld column %ld - Non-%s "
+			  "character %u\n",linecnt,
+			  g_utf8_pointer_to_offset(aline,s)+1,charset,c);
+		    else
+			cnt_bin++;
+		    eInvalidChar=TRUE;
+		}
+	    }
+	}
 	if (!eTab && c==CHAR_TAB)
 	{
@@ -2885,7 +2989,7 @@
      */
     linecnt=0;
     etext_ptr=etext;
-    while ((aline=flgets(&etext_ptr,linecnt+1,warnings->nocr)))
+    while ((aline=flgets(&etext_ptr,linecnt+1)))
     {
 	linecnt++;
 	if (linecnt==1)
@@ -2955,8 +3059,7 @@
 	if (s>=aline && g_utf8_get_char(s)=='-')
 	    enddash=TRUE;
 	check_for_control_characters(aline);
-	if (warnings->bin)
-	    check_for_odd_characters(aline,warnings,isemptyline);
+	check_for_odd_characters(aline,warnings,isemptyline);
 	if (warnings->longline)
 	    check_for_long_line(aline);
 	if (warnings->shortline)
@@ -3031,7 +3134,7 @@
  *
  * Returns: a pointer to the line.
  */
-char *flgets(char **etext,long lcnt,gboolean warn_nocr)
+char *flgets(char **etext,long lcnt)
 {
     gunichar c;
     gboolean isCR=FALSE;
@@ -3070,7 +3173,7 @@
 	    else
 	    {
 		/* Error - a LF without a preceding CR */
-		if (pswit[LINE_END_SWITCH] && warn_nocr)
+		if (pswit[LINE_END_SWITCH])
 		{
 		    if (pswit[ECHO_SWITCH])
 		    {