1.1 --- a/bookloupe/bookloupe.c Thu May 30 07:31:24 2013 +0100
1.2 +++ b/bookloupe/bookloupe.c Thu May 30 17:16:37 2013 +0100
1.3 @@ -24,6 +24,7 @@
1.4 #include <ctype.h>
1.5 #include <glib.h>
1.6 #include <bl/bl.h>
1.7 +#include "HTMLentities.h"
1.8
1.9 gchar *prevline;
1.10
1.11 @@ -119,132 +120,6 @@
1.12 "among", "those", "into", "whom", "having", "thence", ""
1.13 };
1.14
1.15 -struct {
1.16 - char *htmlent;
1.17 - char *htmlnum;
1.18 - char *textent;
1.19 -} entities[] = {
1.20 - "&", "&", "&",
1.21 - "<", "<", "<",
1.22 - ">", ">", ">",
1.23 - "°", "°", " degrees",
1.24 - "£", "£", "L",
1.25 - """, """, "\"", /* quotation mark = APL quote */
1.26 - "Œ", "Œ", "OE", /* latin capital ligature OE */
1.27 - "œ", "œ", "oe", /* latin small ligature oe */
1.28 - "Š", "Š", "S", /* latin capital letter S with caron */
1.29 - "š", "š", "s", /* latin small letter s with caron */
1.30 - "Ÿ", "Ÿ", "Y", /* latin capital letter Y with diaeresis */
1.31 - "ˆ", "ˆ", "", /* modifier letter circumflex accent */
1.32 - "˜", "˜", "~", /* small tilde, U+02DC ISOdia */
1.33 - " ", " ", " ", /* en space, U+2002 ISOpub */
1.34 - " ", " ", " ", /* em space, U+2003 ISOpub */
1.35 - " ", " ", " ", /* thin space, U+2009 ISOpub */
1.36 - "–", "–", "-", /* en dash, U+2013 ISOpub */
1.37 - "—", "—", "--", /* em dash, U+2014 ISOpub */
1.38 - "’", "’", "'", /* right single quotation mark */
1.39 - "‚", "‚", "'", /* single low-9 quotation mark */
1.40 - "“", "“", "\"", /* left double quotation mark */
1.41 - "”", "”", "\"", /* right double quotation mark */
1.42 - "„", "„", "\"", /* double low-9 quotation mark */
1.43 - "‹", "‹", "\"", /* single left-pointing angle quotation mark */
1.44 - "›", "›", "\"", /* single right-pointing angle quotation mark */
1.45 - " ", " ", " ", /* no-break space = non-breaking space, */
1.46 - "¡", "¡", "!", /* inverted exclamation mark */
1.47 - "¢", "¢", "c", /* cent sign */
1.48 - "£", "£", "L", /* pound sign */
1.49 - "¤", "¤", "$", /* currency sign */
1.50 - "¥", "¥", "Y", /* yen sign = yuan sign */
1.51 - "§", "§", "--", /* section sign */
1.52 - "¨", "¨", " ", /* diaeresis = spacing diaeresis */
1.53 - "©", "©", "(C) ", /* copyright sign */
1.54 - "ª", "ª", " ", /* feminine ordinal indicator */
1.55 - "«", "«", "\"", /* left-pointing double angle quotation mark */
1.56 - "­", "­", "-", /* soft hyphen = discretionary hyphen */
1.57 - "®", "®", "(R) ", /* registered sign = registered trade mark sign */
1.58 - "¯", "¯", " ", /* macron = spacing macron = overline */
1.59 - "°", "°", " degrees", /* degree sign */
1.60 - "±", "±", "+-", /* plus-minus sign = plus-or-minus sign */
1.61 - "²", "²", "2", /* superscript two = superscript digit two */
1.62 - "³", "³", "3", /* superscript three = superscript digit three */
1.63 - "´", "´", " ", /* acute accent = spacing acute */
1.64 - "µ", "µ", "m", /* micro sign */
1.65 - "¶", "¶", "--", /* pilcrow sign = paragraph sign */
1.66 - "¸", "¸", " ", /* cedilla = spacing cedilla */
1.67 - "¹", "¹", "1", /* superscript one = superscript digit one */
1.68 - "º", "º", " ", /* masculine ordinal indicator */
1.69 - "»", "»", "\"", /* right-pointing double angle quotation mark */
1.70 - "¼", "¼", "1/4", /* vulgar fraction one quarter */
1.71 - "½", "½", "1/2", /* vulgar fraction one half */
1.72 - "¾", "¾", "3/4", /* vulgar fraction three quarters */
1.73 - "¿", "¿", "?", /* inverted question mark */
1.74 - "À", "À", "A", /* latin capital letter A with grave */
1.75 - "Á", "Á", "A", /* latin capital letter A with acute */
1.76 - "Â", "Â", "A", /* latin capital letter A with circumflex */
1.77 - "Ã", "Ã", "A", /* latin capital letter A with tilde */
1.78 - "Ä", "Ä", "A", /* latin capital letter A with diaeresis */
1.79 - "Å", "Å", "A", /* latin capital letter A with ring above */
1.80 - "Æ", "Æ", "AE", /* latin capital letter AE */
1.81 - "Ç", "Ç", "C", /* latin capital letter C with cedilla */
1.82 - "È", "È", "E", /* latin capital letter E with grave */
1.83 - "É", "É", "E", /* latin capital letter E with acute */
1.84 - "Ê", "Ê", "E", /* latin capital letter E with circumflex */
1.85 - "Ë", "Ë", "E", /* latin capital letter E with diaeresis */
1.86 - "Ì", "Ì", "I", /* latin capital letter I with grave */
1.87 - "Í", "Í", "I", /* latin capital letter I with acute */
1.88 - "Î", "Î", "I", /* latin capital letter I with circumflex */
1.89 - "Ï", "Ï", "I", /* latin capital letter I with diaeresis */
1.90 - "Ð", "Ð", "E", /* latin capital letter ETH */
1.91 - "Ñ", "Ñ", "N", /* latin capital letter N with tilde */
1.92 - "Ò", "Ò", "O", /* latin capital letter O with grave */
1.93 - "Ó", "Ó", "O", /* latin capital letter O with acute */
1.94 - "Ô", "Ô", "O", /* latin capital letter O with circumflex */
1.95 - "Õ", "Õ", "O", /* latin capital letter O with tilde */
1.96 - "Ö", "Ö", "O", /* latin capital letter O with diaeresis */
1.97 - "×", "×", "*", /* multiplication sign */
1.98 - "Ø", "Ø", "O", /* latin capital letter O with stroke */
1.99 - "Ù", "Ù", "U", /* latin capital letter U with grave */
1.100 - "Ú", "Ú", "U", /* latin capital letter U with acute */
1.101 - "Û", "Û", "U", /* latin capital letter U with circumflex */
1.102 - "Ü", "Ü", "U", /* latin capital letter U with diaeresis */
1.103 - "Ý", "Ý", "Y", /* latin capital letter Y with acute */
1.104 - "Þ", "Þ", "TH", /* latin capital letter THORN */
1.105 - "ß", "ß", "sz", /* latin small letter sharp s = ess-zed */
1.106 - "à", "à", "a", /* latin small letter a with grave */
1.107 - "á", "á", "a", /* latin small letter a with acute */
1.108 - "â", "â", "a", /* latin small letter a with circumflex */
1.109 - "ã", "ã", "a", /* latin small letter a with tilde */
1.110 - "ä", "ä", "a", /* latin small letter a with diaeresis */
1.111 - "å", "å", "a", /* latin small letter a with ring above */
1.112 - "æ", "æ", "ae", /* latin small letter ae */
1.113 - "ç", "ç", "c", /* latin small letter c with cedilla */
1.114 - "è", "è", "e", /* latin small letter e with grave */
1.115 - "é", "é", "e", /* latin small letter e with acute */
1.116 - "ê", "ê", "e", /* latin small letter e with circumflex */
1.117 - "ë", "ë", "e", /* latin small letter e with diaeresis */
1.118 - "ì", "ì", "i", /* latin small letter i with grave */
1.119 - "í", "í", "i", /* latin small letter i with acute */
1.120 - "î", "î", "i", /* latin small letter i with circumflex */
1.121 - "ï", "ï", "i", /* latin small letter i with diaeresis */
1.122 - "ð", "ð", "eth", /* latin small letter eth */
1.123 - "ñ", "ñ", "n", /* latin small letter n with tilde */
1.124 - "ò", "ò", "o", /* latin small letter o with grave */
1.125 - "ó", "ó", "o", /* latin small letter o with acute */
1.126 - "ô", "ô", "o", /* latin small letter o with circumflex */
1.127 - "õ", "õ", "o", /* latin small letter o with tilde */
1.128 - "ö", "ö", "o", /* latin small letter o with diaeresis */
1.129 - "÷", "÷", "/", /* division sign */
1.130 - "ø", "ø", "o", /* latin small letter o with stroke */
1.131 - "ù", "ù", "u", /* latin small letter u with grave */
1.132 - "ú", "ú", "u", /* latin small letter u with acute */
1.133 - "û", "û", "u", /* latin small letter u with circumflex */
1.134 - "ü", "ü", "u", /* latin small letter u with diaeresis */
1.135 - "ý", "ý", "y", /* latin small letter y with acute */
1.136 - "þ", "þ", "th", /* latin small letter thorn */
1.137 - "ÿ", "ÿ", "y", /* latin small letter y with diaeresis */
1.138 - "", ""
1.139 -};
1.140 -
1.141 /* special characters */
1.142 #define CHAR_SPACE 32
1.143 #define CHAR_TAB 9
1.144 @@ -352,7 +227,7 @@
1.145 char *linehasmarkup(char *);
1.146 char *losemarkup(char *);
1.147 gboolean tagcomp(const char *,const char *);
1.148 -char *loseentities(char *);
1.149 +void loseentities(char *);
1.150 gboolean isroman(const char *);
1.151 void postprocess_for_DP(char *);
1.152
1.153 @@ -2916,6 +2791,8 @@
1.154 g_tree_unref(qperiod);
1.155 g_set_print_handler(NULL);
1.156 print_as_windows_1252(NULL);
1.157 + if (pswit[MARKUP_SWITCH])
1.158 + loseentities(NULL);
1.159 }
1.160
1.161 /*
1.162 @@ -3210,8 +3087,7 @@
1.163 {
1.164 while (losemarkup(theline))
1.165 ;
1.166 - while (loseentities(theline))
1.167 - ;
1.168 + loseentities(theline);
1.169 }
1.170
1.171 char *losemarkup(char *theline)
1.172 @@ -3233,37 +3109,86 @@
1.173 return NULL;
1.174 }
1.175
1.176 -char *loseentities(char *theline)
1.177 +void loseentities(char *theline) /* rewrites entities in place; NULL releases the cached lookup state */
1.178 {
1.179 int i;
1.180 - char *s,*t;
1.181 - if (!*theline)
1.182 - return NULL;
1.183 - for (i=0;*entities[i].htmlent;i++)
1.184 + gsize nb;
1.185 + char *amp,*scolon;
1.186 + gchar *s,*t;
1.187 + gunichar c;
1.188 + static GTree *entities=NULL; /* name -> code point; must persist so it is built once and can be freed later */
1.189 + static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1; /* (GIConv)-1 means "not open" */
1.190 + if (!theline)
1.191 {
1.192 - s=strstr(theline,entities[i].htmlent);
1.193 - if (s)
1.194 + if (entities)
1.195 + g_tree_destroy(entities);
1.196 + entities=NULL;
1.197 + if (translit!=(GIConv)-1) /* only close a descriptor that was actually opened */
1.198 + g_iconv_close(translit);
1.199 + translit=(GIConv)-1;
1.200 + if (to_utf8!=(GIConv)-1)
1.201 + g_iconv_close(to_utf8);
1.202 + to_utf8=(GIConv)-1;
1.203 + return;
1.204 + }
1.205 + if (!*theline)
1.206 + return;
1.207 + if (!entities)
1.208 + {
1.209 + entities=g_tree_new((GCompareFunc)strcmp);
1.210 + for(i=0;i<G_N_ELEMENTS(HTMLentities);i++)
1.211 + g_tree_insert(entities,HTMLentities[i].name,
1.212 + GUINT_TO_POINTER(HTMLentities[i].c));
1.213 + }
1.214 + if (translit==(GIConv)-1)
1.215 + translit=g_iconv_open("ISO_8859-1//TRANSLIT","UTF-8");
1.216 + if (to_utf8==(GIConv)-1)
1.217 + to_utf8=g_iconv_open("UTF-8","ISO_8859-1");
1.218 + while((amp=strchr(theline,'&')))
1.219 + {
1.220 + /* Start from "no match" so a malformed numeric reference cannot reuse a stale value */
1.221 + c=0;
1.222 + scolon=strchr(amp,';');
1.223 + if (scolon)
1.224 {
1.225 - t=g_strdup(s+strlen(entities[i].htmlent));
1.226 - strcpy(s,entities[i].textent);
1.227 - strcat(s,t);
1.228 - g_free(t);
1.229 - return theline;
1.230 + if (amp[1]=='#')
1.231 + {
1.232 + if (amp+2+strspn(amp+2,"0123456789")==scolon)
1.233 + c=strtol(amp+2,NULL,10);
1.234 + else if (amp[2]=='x' &&
1.235 + amp+3+strspn(amp+3,"0123456789abcdefABCDEF")==scolon)
1.236 + c=strtol(amp+3,NULL,16);
1.237 + }
1.238 + else
1.239 + {
1.240 + s=g_strndup(amp+1,scolon-(amp+1));
1.241 + c=GPOINTER_TO_UINT(g_tree_lookup(entities,s));
1.242 + g_free(s);
1.243 + }
1.244 }
1.245 + if (c)
1.246 + {
1.247 + theline=amp;
1.248 + if (c<128 || c>=192 && c<=255) /* An ISO-8859-1 character */
1.249 + theline+=g_unichar_to_utf8(c,theline);
1.250 + else
1.251 + {
1.252 + s=g_malloc(6);
1.253 + nb=g_unichar_to_utf8(c,s);
1.254 + t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL); /* NOTE(review): a NULL result on conversion failure is not handled */
1.255 + g_free(s);
1.256 + s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
1.257 + g_free(t);
1.258 + memcpy(theline,s,nb);
1.259 + g_free(s);
1.260 + theline+=nb;
1.261 + }
1.262 + memmove(theline,g_utf8_next_char(scolon),
1.263 + strlen(g_utf8_next_char(scolon))+1);
1.264 + }
1.265 + else
1.266 + theline=g_utf8_next_char(amp);
1.267 }
1.268 - for (i=0;*entities[i].htmlnum;i++)
1.269 - {
1.270 - s=strstr(theline,entities[i].htmlnum);
1.271 - if (s)
1.272 - {
1.273 - t=g_strdup(s+strlen(entities[i].htmlnum));
1.274 - strcpy(s,entities[i].textent);
1.275 - strcat(s,t);
1.276 - g_free(t);
1.277 - return theline;
1.278 - }
1.279 - }
1.280 - return NULL;
1.281 }
1.282
1.283 gboolean tagcomp(const char *strin,const char *basetag)