1.1 --- a/bookloupe/bookloupe.c	Thu May 30 07:31:24 2013 +0100
     1.2 +++ b/bookloupe/bookloupe.c	Thu May 30 17:16:37 2013 +0100
     1.3 @@ -24,6 +24,7 @@
     1.4  #include <ctype.h>
     1.5  #include <glib.h>
     1.6  #include <bl/bl.h>
     1.7 +#include "HTMLentities.h"
     1.8  
     1.9  gchar *prevline;
    1.10  
    1.11 @@ -119,132 +120,6 @@
    1.12      "among", "those", "into", "whom", "having", "thence", ""
    1.13  }; 
    1.14  
    1.15 -struct {
    1.16 -    char *htmlent;
    1.17 -    char *htmlnum;
    1.18 -    char *textent;
    1.19 -} entities[] = {
    1.20 -    "&amp;",	"&#38;",     "&", 
    1.21 -    "&lt;",	"&#60;",     "<",
    1.22 -    "&gt;",	"&#62;",     ">",
    1.23 -    "&deg;",	"&#176;",    " degrees",
    1.24 -    "&pound;",	"&#163;",    "L",
    1.25 -    "&quot;",	"&#34;",     "\"", /* quotation mark = APL quote */
    1.26 -    "&OElig;",	"&#338;",    "OE", /* latin capital ligature OE */
    1.27 -    "&oelig;",	"&#339;",    "oe", /* latin small ligature oe */
    1.28 -    "&Scaron;",	"&#352;",    "S", /* latin capital letter S with caron */
    1.29 -    "&scaron;",	"&#353;",    "s", /* latin small letter s with caron */
    1.30 -    "&Yuml;",	"&#376;",    "Y", /* latin capital letter Y with diaeresis */
    1.31 -    "&circ;",	"&#710;",    "",  /* modifier letter circumflex accent */
    1.32 -    "&tilde;",	"&#732;",    "~", /* small tilde, U+02DC ISOdia */
    1.33 -    "&ensp;",	"&#8194;",   " ", /* en space, U+2002 ISOpub */
    1.34 -    "&emsp;",	"&#8195;",   " ", /* em space, U+2003 ISOpub */
    1.35 -    "&thinsp;",	"&#8201;",   " ", /* thin space, U+2009 ISOpub */
    1.36 -    "&ndash;",	"&#8211;",   "-", /* en dash, U+2013 ISOpub */
    1.37 -    "&mdash;",	"&#8212;",   "--", /* em dash, U+2014 ISOpub */
    1.38 -    "&rsquo;",	"&#8217;",   "'", /* right single quotation mark */
    1.39 -    "&sbquo;",	"&#8218;",   "'", /* single low-9 quotation mark */
    1.40 -    "&ldquo;",	"&#8220;",   "\"", /* left double quotation mark */
    1.41 -    "&rdquo;",	"&#8221;",   "\"", /* right double quotation mark */
    1.42 -    "&bdquo;",	"&#8222;",   "\"", /* double low-9 quotation mark */
    1.43 -    "&lsaquo;",	"&#8249;",   "\"", /* single left-pointing angle quotation mark */
    1.44 -    "&rsaquo;",	"&#8250;",   "\"", /* single right-pointing angle quotation mark */
    1.45 -    "&nbsp;",	"&#160;",    " ", /* no-break space = non-breaking space, */
    1.46 -    "&iexcl;",	"&#161;",    "!", /* inverted exclamation mark */
    1.47 -    "&cent;",	"&#162;",    "c", /* cent sign */
    1.48 -    "&pound;",	"&#163;",    "L", /* pound sign */
    1.49 -    "&curren;",	"&#164;",    "$", /* currency sign */
    1.50 -    "&yen;",	"&#165;",    "Y", /* yen sign = yuan sign */
    1.51 -    "&sect;",	"&#167;",    "--", /* section sign */
    1.52 -    "&uml;",	"&#168;",    " ", /* diaeresis = spacing diaeresis */
    1.53 -    "&copy;",	"&#169;",    "(C) ", /* copyright sign */
    1.54 -    "&ordf;",	"&#170;",    " ", /* feminine ordinal indicator */
    1.55 -    "&laquo;",	"&#171;",    "\"", /* left-pointing double angle quotation mark */
    1.56 -    "&shy;",	"&#173;",    "-", /* soft hyphen = discretionary hyphen */
    1.57 -    "&reg;",	"&#174;",    "(R) ", /* registered sign = registered trade mark sign */
    1.58 -    "&macr;",	"&#175;",    " ", /* macron = spacing macron = overline */
    1.59 -    "&deg;",	"&#176;",    " degrees", /* degree sign */
    1.60 -    "&plusmn;",	"&#177;",    "+-", /* plus-minus sign = plus-or-minus sign */
    1.61 -    "&sup2;",	"&#178;",    "2", /* superscript two = superscript digit two */
    1.62 -    "&sup3;",	"&#179;",    "3", /* superscript three = superscript digit three */
    1.63 -    "&acute;",	"&#180;",    " ", /* acute accent = spacing acute */
    1.64 -    "&micro;",	"&#181;",    "m", /* micro sign */
    1.65 -    "&para;",	"&#182;",    "--", /* pilcrow sign = paragraph sign */
    1.66 -    "&cedil;",	"&#184;",    " ", /* cedilla = spacing cedilla */
    1.67 -    "&sup1;",	"&#185;",    "1", /* superscript one = superscript digit one */
    1.68 -    "&ordm;",	"&#186;",    " ", /* masculine ordinal indicator */
    1.69 -    "&raquo;",	"&#187;",    "\"", /* right-pointing double angle quotation mark */
    1.70 -    "&frac14;",	"&#188;",    "1/4", /* vulgar fraction one quarter */
    1.71 -    "&frac12;",	"&#189;",    "1/2", /* vulgar fraction one half */
    1.72 -    "&frac34;",	"&#190;",    "3/4", /* vulgar fraction three quarters */
    1.73 -    "&iquest;",	"&#191;",    "?", /* inverted question mark */
    1.74 -    "&Agrave;",	"&#192;",    "A", /* latin capital letter A with grave */
    1.75 -    "&Aacute;",	"&#193;",    "A", /* latin capital letter A with acute */
    1.76 -    "&Acirc;",	"&#194;",    "A", /* latin capital letter A with circumflex */
    1.77 -    "&Atilde;",	"&#195;",    "A", /* latin capital letter A with tilde */
    1.78 -    "&Auml;",	"&#196;",    "A", /* latin capital letter A with diaeresis */
    1.79 -    "&Aring;",	"&#197;",    "A", /* latin capital letter A with ring above */
    1.80 -    "&AElig;",	"&#198;",    "AE", /* latin capital letter AE */
    1.81 -    "&Ccedil;",	"&#199;",    "C", /* latin capital letter C with cedilla */
    1.82 -    "&Egrave;",	"&#200;",    "E", /* latin capital letter E with grave */
    1.83 -    "&Eacute;",	"&#201;",    "E", /* latin capital letter E with acute */
    1.84 -    "&Ecirc;",	"&#202;",    "E", /* latin capital letter E with circumflex */
    1.85 -    "&Euml;",	"&#203;",    "E", /* latin capital letter E with diaeresis */
    1.86 -    "&Igrave;",	"&#204;",    "I", /* latin capital letter I with grave */
    1.87 -    "&Iacute;",	"&#205;",    "I", /* latin capital letter I with acute */
    1.88 -    "&Icirc;",	"&#206;",    "I", /* latin capital letter I with circumflex */
    1.89 -    "&Iuml;",	"&#207;",    "I", /* latin capital letter I with diaeresis */
    1.90 -    "&ETH;",	"&#208;",    "E", /* latin capital letter ETH */
    1.91 -    "&Ntilde;",	"&#209;",    "N", /* latin capital letter N with tilde */
    1.92 -    "&Ograve;",	"&#210;",    "O", /* latin capital letter O with grave */
    1.93 -    "&Oacute;",	"&#211;",    "O", /* latin capital letter O with acute */
    1.94 -    "&Ocirc;",	"&#212;",    "O", /* latin capital letter O with circumflex */
    1.95 -    "&Otilde;",	"&#213;",    "O", /* latin capital letter O with tilde */
    1.96 -    "&Ouml;",	"&#214;",    "O", /* latin capital letter O with diaeresis */
    1.97 -    "&times;",	"&#215;",    "*", /* multiplication sign */
    1.98 -    "&Oslash;",	"&#216;",    "O", /* latin capital letter O with stroke */
    1.99 -    "&Ugrave;",	"&#217;",    "U", /* latin capital letter U with grave */
   1.100 -    "&Uacute;",	"&#218;",    "U", /* latin capital letter U with acute */
   1.101 -    "&Ucirc;",	"&#219;",    "U", /* latin capital letter U with circumflex */
   1.102 -    "&Uuml;",	"&#220;",    "U", /* latin capital letter U with diaeresis */
   1.103 -    "&Yacute;",	"&#221;",    "Y", /* latin capital letter Y with acute */
   1.104 -    "&THORN;",	"&#222;",    "TH", /* latin capital letter THORN */
   1.105 -    "&szlig;",	"&#223;",    "sz", /* latin small letter sharp s = ess-zed */
   1.106 -    "&agrave;",	"&#224;",    "a", /* latin small letter a with grave */
   1.107 -    "&aacute;",	"&#225;",    "a", /* latin small letter a with acute */
   1.108 -    "&acirc;",	"&#226;",    "a", /* latin small letter a with circumflex */
   1.109 -    "&atilde;",	"&#227;",    "a", /* latin small letter a with tilde */
   1.110 -    "&auml;",	"&#228;",    "a", /* latin small letter a with diaeresis */
   1.111 -    "&aring;",	"&#229;",    "a", /* latin small letter a with ring above */
   1.112 -    "&aelig;",	"&#230;",    "ae", /* latin small letter ae */
   1.113 -    "&ccedil;",	"&#231;",    "c", /* latin small letter c with cedilla */
   1.114 -    "&egrave;",	"&#232;",    "e", /* latin small letter e with grave */
   1.115 -    "&eacute;",	"&#233;",    "e", /* latin small letter e with acute */
   1.116 -    "&ecirc;",	"&#234;",    "e", /* latin small letter e with circumflex */
   1.117 -    "&euml;",	"&#235;",    "e", /* latin small letter e with diaeresis */
   1.118 -    "&igrave;",	"&#236;",    "i", /* latin small letter i with grave */
   1.119 -    "&iacute;",	"&#237;",    "i", /* latin small letter i with acute */
   1.120 -    "&icirc;",	"&#238;",    "i", /* latin small letter i with circumflex */
   1.121 -    "&iuml;",	"&#239;",    "i", /* latin small letter i with diaeresis */
   1.122 -    "&eth;",	"&#240;",    "eth", /* latin small letter eth */
   1.123 -    "&ntilde;",	"&#241;",    "n", /* latin small letter n with tilde */
   1.124 -    "&ograve;",	"&#242;",    "o", /* latin small letter o with grave */
   1.125 -    "&oacute;",	"&#243;",    "o", /* latin small letter o with acute */
   1.126 -    "&ocirc;",	"&#244;",    "o", /* latin small letter o with circumflex */
   1.127 -    "&otilde;",	"&#245;",    "o", /* latin small letter o with tilde */
   1.128 -    "&ouml;",	"&#246;",    "o", /* latin small letter o with diaeresis */
   1.129 -    "&divide;",	"&#247;",    "/", /* division sign */
   1.130 -    "&oslash;",	"&#248;",    "o", /* latin small letter o with stroke */
   1.131 -    "&ugrave;",	"&#249;",    "u", /* latin small letter u with grave */
   1.132 -    "&uacute;",	"&#250;",    "u", /* latin small letter u with acute */
   1.133 -    "&ucirc;",	"&#251;",    "u", /* latin small letter u with circumflex */
   1.134 -    "&uuml;",	"&#252;",    "u", /* latin small letter u with diaeresis */
   1.135 -    "&yacute;",	"&#253;",    "y", /* latin small letter y with acute */
   1.136 -    "&thorn;",	"&#254;",    "th", /* latin small letter thorn */
   1.137 -    "&yuml;",	"&#255;",    "y", /* latin small letter y with diaeresis */
   1.138 -    "", ""
   1.139 -};
   1.140 -
   1.141  /* special characters */
   1.142  #define CHAR_SPACE	  32
   1.143  #define CHAR_TAB	   9
   1.144 @@ -352,7 +227,7 @@
   1.145  char *linehasmarkup(char *);
   1.146  char *losemarkup(char *);
   1.147  gboolean tagcomp(const char *,const char *);
   1.148 -char *loseentities(char *);
   1.149 +void loseentities(char *);
   1.150  gboolean isroman(const char *);
   1.151  void postprocess_for_DP(char *);
   1.152  
   1.153 @@ -2916,6 +2791,8 @@
   1.154      g_tree_unref(qperiod);
   1.155      g_set_print_handler(NULL);
   1.156      print_as_windows_1252(NULL);
   1.157 +    if (pswit[MARKUP_SWITCH])  
   1.158 +	loseentities(NULL);
   1.159  }
   1.160  
   1.161  /*
   1.162 @@ -3210,8 +3087,7 @@
   1.163  {
   1.164      while (losemarkup(theline))
   1.165  	;
   1.166 -    while (loseentities(theline))
   1.167 -	;
   1.168 +    loseentities(theline);
   1.169  }
   1.170  
   1.171  char *losemarkup(char *theline)
   1.172 @@ -3233,37 +3109,119 @@
   1.173      return NULL;
   1.174  }
   1.175  
   1.176 -char *loseentities(char *theline)
   1.177 +/*
   1.178 + * loseentities:
   1.179 + *
   1.180 + * Replace the HTML character references in theline with plain text,
   1.181 + * editing the buffer in place. Named references are looked up in a
   1.182 + * tree built from the HTMLentities table; decimal (&#NNN;) and
   1.183 + * hexadecimal (&#xHHH;) references are decoded numerically.
   1.184 + *
   1.185 + * ASCII characters and U+00C0..U+00FF are written out as UTF-8 as they
   1.186 + * stand; any other character is transliterated to ISO-8859-1 and the
   1.187 + * result converted back to UTF-8. A reference is left untouched when it
   1.188 + * is unknown, malformed, not a valid Unicode character, cannot be
   1.189 + * converted, or would expand to more bytes than it occupies.
   1.190 + *
   1.191 + * The lookup tree and both converters are created on first use and kept
   1.192 + * for later calls; call with theline set to NULL to release them.
   1.193 + */
   1.194 +void loseentities(char *theline)
   1.195  {
   1.196      int i;
   1.197 -    char *s,*t;
   1.198 -    if (!*theline) 
   1.199 -	return NULL;
   1.200 -    for (i=0;*entities[i].htmlent;i++)
   1.201 +    gsize nb;
   1.202 +    gulong n;
   1.203 +    char *amp,*scolon;
   1.204 +    gchar *s,*t;
   1.205 +    gchar utf8[6];
   1.206 +    gunichar c;
   1.207 +    /* static, so that the caches persist from one call to the next */
   1.208 +    static GTree *entities=NULL;
   1.209 +    static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
   1.210 +    if (!theline)
   1.211      {
   1.212 -	s=strstr(theline,entities[i].htmlent);
   1.213 -	if (s)
   1.214 +	if (entities)
   1.215 +	    g_tree_destroy(entities);
   1.216 +	entities=NULL;
   1.217 +	/* (GIConv)-1 marks a converter that is not open */
   1.218 +	if (translit!=(GIConv)-1)
   1.219 +	    g_iconv_close(translit);
   1.220 +	translit=(GIConv)-1;
   1.221 +	if (to_utf8!=(GIConv)-1)
   1.222 +	    g_iconv_close(to_utf8);
   1.223 +	to_utf8=(GIConv)-1;
   1.224 +	return;
   1.225 +    }
   1.226 +    if (!*theline)
   1.227 +	return;
   1.228 +    if (!entities)
   1.229 +    {
   1.230 +	entities=g_tree_new((GCompareFunc)strcmp);
   1.231 +	for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
   1.232 +	    g_tree_insert(entities,HTMLentities[i].name,
   1.233 +	      GUINT_TO_POINTER(HTMLentities[i].c));
   1.234 +    }
   1.235 +    if (translit==(GIConv)-1)
   1.236 +	translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
   1.237 +    if (to_utf8==(GIConv)-1)
   1.238 +	to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
   1.239 +    while((amp=strchr(theline,'&')))
   1.240 +    {
   1.241 +	/* Until a reference is decoded, treat this '&' as plain text */
   1.242 +	c=0;
   1.243 +	scolon=strchr(amp,';');
   1.244 +	if (scolon)
   1.245  	{
   1.246 -	    t=g_strdup(s+strlen(entities[i].htmlent));
   1.247 -	    strcpy(s,entities[i].textent);
   1.248 -	    strcat(s,t);
   1.249 -	    g_free(t);
   1.250 -	    return theline;
   1.251 +	    n=0;
   1.252 +	    if (amp[1]=='#')
   1.253 +	    {
   1.254 +		if (amp+2+strspn(amp+2,"0123456789")==scolon)
   1.255 +		    n=strtoul(amp+2,NULL,10);
   1.256 +		else if (amp[2]=='x' &&
   1.257 +		  amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
   1.258 +		    n=strtoul(amp+3,NULL,16);
   1.259 +	    }
   1.260 +	    else
   1.261 +	    {
   1.262 +		s=g_strndup(amp+1,scolon-(amp+1));
   1.263 +		n=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
   1.264 +		g_free(s);
   1.265 +	    }
   1.266 +	    /* Discard overflowed numbers and surrogate code points */
   1.267 +	    if (n<=0x10FFFF && g_unichar_validate(n))
   1.268 +		c=n;
   1.269  	}
   1.270 +	s=NULL;
   1.271 +	nb=0;
   1.272 +	if (c && (c<128 || (c>=192 && c<=255)))
   1.273 +	{
   1.274 +	    /* ASCII or U+00C0..U+00FF: keep the character itself */
   1.275 +	    s=g_malloc(6);
   1.276 +	    nb=g_unichar_to_utf8(c,s);
   1.277 +	}
   1.278 +	else if (c && translit!=(GIConv)-1 && to_utf8!=(GIConv)-1)
   1.279 +	{
   1.280 +	    /* Anything else: reduce it to ISO-8859-1, then back to UTF-8 */
   1.281 +	    nb=g_unichar_to_utf8(c,utf8);
   1.282 +	    t=g_convert_with_iconv(utf8,nb,translit,NULL,&nb,NULL);
   1.283 +	    if (t)
   1.284 +		s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
   1.285 +	    g_free(t);
   1.286 +	}
   1.287 +	/*
   1.288 +	 * theline cannot be grown from here, so substitute only when the
   1.289 +	 * text fits in the space that the reference occupied.
   1.290 +	 */
   1.291 +	if (s && nb<=(gsize)(scolon+1-amp))
   1.292 +	{
   1.293 +	    memcpy(amp,s,nb);
   1.294 +	    memmove(amp+nb,scolon+1,strlen(scolon+1)+1);
   1.295 +	    theline=amp+nb;
   1.296 +	}
   1.297 +	else
   1.298 +	    theline=amp+1;
   1.299 +	g_free(s);
   1.300      }
   1.301 -    for (i=0;*entities[i].htmlnum;i++)
   1.302 -    {
   1.303 -	s=strstr(theline,entities[i].htmlnum);
   1.304 -	if (s)
   1.305 -	{
   1.306 -	    t=g_strdup(s+strlen(entities[i].htmlnum));
   1.307 -	    strcpy(s,entities[i].textent);
   1.308 -	    strcat(s,t);
   1.309 -	    g_free(t);
   1.310 -	    return theline;
   1.311 -	}
   1.312 -    }
   1.313 -    return NULL;
   1.314  }
   1.315  
   1.316  gboolean tagcomp(const char *strin,const char *basetag)
changeset 71	82d3cc398b54
parent 70	aa916da2e452
child 72	52d4a7f926b4