# HG changeset patch # User ali # Date 1369930597 -3600 # Node ID 82d3cc398b54e0737093bb229cacf7d2d13ef2c8 # Parent aa916da2e452f74c1042749418904d659ed89693 Use official HTML 4 character entity definitions diff -r aa916da2e452 -r 82d3cc398b54 bookloupe/HTMLlat1.ent --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bookloupe/HTMLlat1.ent Thu May 30 17:16:37 2013 +0100 @@ -0,0 +1,194 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff -r aa916da2e452 -r 82d3cc398b54 bookloupe/HTMLspecial.ent --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bookloupe/HTMLspecial.ent Thu May 30 17:16:37 2013 +0100 @@ -0,0 +1,77 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff -r aa916da2e452 -r 82d3cc398b54 bookloupe/HTMLsymbol.ent --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bookloupe/HTMLsymbol.ent Thu May 30 17:16:37 2013 +0100 @@ -0,0 +1,241 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff -r aa916da2e452 -r 82d3cc398b54 bookloupe/Makefile.am --- a/bookloupe/Makefile.am Thu May 30 07:31:24 2013 +0100 +++ b/bookloupe/Makefile.am Thu May 30 17:16:37 2013 +0100 @@ -8,5 +8,12 @@ bookloupe.typ: bookloupe.typ.in sed 's/$$/\r/' $< > $@ -EXTRA_DIST=bookloupe.typ.in -CLEANFILES=bookloupe.typ +HTMLentities.h: gen-html-entities.sh HTMLlat1.ent HTMLsymbol.ent HTMLspecial.ent + sh ${srcdir}/gen-html-entities.sh ${srcdir}/HTMLlat1.ent \ + ${srcdir}/HTMLsymbol.ent ${srcdir}/HTMLspecial.ent > $@ + +bookloupe.$(OBJEXT): HTMLentities.h + +EXTRA_DIST=bookloupe.typ.in gen-html-entities.sh HTMLlat1.ent HTMLsymbol.ent \ + HTMLspecial.ent +CLEANFILES=bookloupe.typ HTMLentities.h diff -r aa916da2e452 -r 82d3cc398b54 bookloupe/bookloupe.c --- a/bookloupe/bookloupe.c Thu May 30 07:31:24 2013 +0100 +++ b/bookloupe/bookloupe.c Thu May 30 17:16:37 2013 +0100 @@ -24,6 +24,7 @@ #include #include #include +#include "HTMLentities.h" gchar *prevline; @@ -119,132 +120,6 @@ "among", "those", "into", "whom", "having", "thence", "" }; -struct { - char *htmlent; - char *htmlnum; - char *textent; -} entities[] = { - "&", "&", "&", - "<", "<", "<", - ">", ">", ">", - "°", "°", " degrees", - "£", "£", "L", - """, """, "\"", /* quotation mark = APL quote */ - "Œ", "Œ", "OE", /* latin capital ligature OE */ - "œ", "œ", "oe", /* latin small ligature oe */ - "Š", "Š", "S", /* latin capital letter S with caron */ - "š", "š", "s", /* latin small letter s with caron */ - "Ÿ", "Ÿ", "Y", /* latin capital letter Y with diaeresis */ - "ˆ", "ˆ", "", /* modifier letter circumflex accent */ - "˜", "˜", "~", /* small tilde, U+02DC ISOdia */ - " ", " ", " ", /* en space, U+2002 ISOpub */ - " ", " ", " ", /* em space, U+2003 ISOpub */ - " ", " ", " ", /* thin space, U+2009 ISOpub */ - "–", "–", "-", /* en dash, U+2013 ISOpub */ - "—", "—", "--", /* em dash, U+2014 ISOpub */ - "’", "’", "'", /* right single quotation mark */ - "‚", "‚", "'", /* single low-9 quotation mark */ - "“", "“", "\"", /* left double quotation mark */ - "”", "”", "\"", /* right double quotation mark */ - "„", "„", "\"", /* double low-9 quotation mark */ - "‹", "‹", "\"", /* single left-pointing angle quotation mark */ - "›", "›", "\"", /* single right-pointing angle quotation mark */ - " ", " ", " ", /* no-break space = non-breaking space, */ - "¡", "¡", "!", /* inverted exclamation mark */ - "¢", "¢", "c", /* cent sign */ - "£", "£", "L", /* pound sign */ - "¤", "¤", "$", /* currency sign */ - "¥", "¥", "Y", /* yen sign = yuan sign */ - "§", "§", "--", /* section sign */ - "¨", "¨", " ", /* diaeresis = spacing diaeresis */ - "©", "©", "(C) ", /* copyright sign */ - "ª", "ª", " ", /* feminine ordinal indicator */ - "«", "«", "\"", /* left-pointing double angle quotation mark */ - "­", "­", "-", /* soft hyphen = discretionary hyphen */ - "®", "®", "(R) ", /* registered sign = registered trade mark sign */ - "¯", "¯", " ", /* macron = spacing macron = overline */ - "°", "°", " degrees", /* degree sign */ - "±", "±", "+-", /* plus-minus sign = plus-or-minus sign */ - "²", "²", "2", /* superscript two = superscript digit two */ - "³", "³", "3", /* superscript three = superscript digit three */ - "´", "´", " ", /* acute accent = spacing acute */ - "µ", "µ", "m", /* micro sign */ - "¶", "¶", "--", /* pilcrow sign = paragraph sign */ - "¸", "¸", " ", /* cedilla = spacing cedilla */ - "¹", "¹", "1", /* superscript one = superscript digit one */ - "º", "º", " ", /* masculine ordinal indicator */ - "»", "»", "\"", /* right-pointing double angle quotation mark */ - "¼", "¼", "1/4", /* vulgar fraction one quarter */ - "½", "½", "1/2", /* vulgar fraction one half */ - "¾", "¾", "3/4", /* vulgar fraction three quarters */ - "¿", "¿", "?", /* inverted question mark */ - "À", "À", "A", /* latin capital letter A with grave */ - "Á", "Á", "A", /* latin capital letter A with acute */ - "Â", "Â", "A", /* latin capital letter A with circumflex */ - "Ã", "Ã", "A", /* latin capital letter A with tilde */ - "Ä", "Ä", "A", /* latin capital letter A with diaeresis */ - "Å", "Å", "A", /* latin capital letter A with ring above */ - "Æ", "Æ", "AE", /* latin capital letter AE */ - "Ç", "Ç", "C", /* latin capital letter C with cedilla */ - "È", "È", "E", /* latin capital letter E with grave */ - "É", "É", "E", /* latin capital letter E with acute */ - "Ê", "Ê", "E", /* latin capital letter E with circumflex */ - "Ë", "Ë", "E", /* latin capital letter E with diaeresis */ - "Ì", "Ì", "I", /* latin capital letter I with grave */ - "Í", "Í", "I", /* latin capital letter I with acute */ - "Î", "Î", "I", /* latin capital letter I with circumflex */ - "Ï", "Ï", "I", /* latin capital letter I with diaeresis */ - "Ð", "Ð", "E", /* latin capital letter ETH */ - "Ñ", "Ñ", "N", /* latin capital letter N with tilde */ - "Ò", "Ò", "O", /* latin capital letter O with grave */ - "Ó", "Ó", "O", /* latin capital letter O with acute */ - "Ô", "Ô", "O", /* latin capital letter O with circumflex */ - "Õ", "Õ", "O", /* latin capital letter O with tilde */ - "Ö", "Ö", "O", /* latin capital letter O with diaeresis */ - "×", "×", "*", /* multiplication sign */ - "Ø", "Ø", "O", /* latin capital letter O with stroke */ - "Ù", "Ù", "U", /* latin capital letter U with grave */ - "Ú", "Ú", "U", /* latin capital letter U with acute */ - "Û", "Û", "U", /* latin capital letter U with circumflex */ - "Ü", "Ü", "U", /* latin capital letter U with diaeresis */ - "Ý", "Ý", "Y", /* latin capital letter Y with acute */ - "Þ", "Þ", "TH", /* latin capital letter THORN */ - "ß", "ß", "sz", /* latin small letter sharp s = ess-zed */ - "à", "à", "a", /* latin small letter a with grave */ - "á", "á", "a", /* latin small letter a with acute */ - "â", "â", "a", /* latin small letter a with circumflex */ - "ã", "ã", "a", /* latin small letter a with tilde */ - "ä", "ä", "a", /* latin small letter a with diaeresis */ - "å", "å", "a", /* latin small letter a with ring above */ - "æ", "æ", "ae", /* latin small letter ae */ - "ç", "ç", "c", /* latin small letter c with cedilla */ - "è", "è", "e", /* latin small letter e with grave */ - "é", "é", "e", /* latin small letter e with acute */ - "ê", "ê", "e", /* latin small letter e with circumflex */ - "ë", "ë", "e", /* latin small letter e with diaeresis */ - "ì", "ì", "i", /* latin small letter i with grave */ - "í", "í", "i", /* latin small letter i with acute */ - "î", "î", "i", /* latin small letter i with circumflex */ - "ï", "ï", "i", /* latin small letter i with diaeresis */ - "ð", "ð", "eth", /* latin small letter eth */ - "ñ", "ñ", "n", /* latin small letter n with tilde */ - "ò", "ò", "o", /* latin small letter o with grave */ - "ó", "ó", "o", /* latin small letter o with acute */ - "ô", "ô", "o", /* latin small letter o with circumflex */ - "õ", "õ", "o", /* latin small letter o with tilde */ - "ö", "ö", "o", /* latin small letter o with diaeresis */ - "÷", "÷", "/", /* division sign */ - "ø", "ø", "o", /* latin small letter o with stroke */ - "ù", "ù", "u", /* latin small letter u with grave */ - "ú", "ú", "u", /* latin small letter u with acute */ - "û", "û", "u", /* latin small letter u with circumflex */ - "ü", "ü", "u", /* latin small letter u with diaeresis */ - "ý", "ý", "y", /* latin small letter y with acute */ - "þ", "þ", "th", /* latin small letter thorn */ - "ÿ", "ÿ", "y", /* latin small letter y with diaeresis */ - "", "" -}; - /* special characters */ #define CHAR_SPACE 32 #define CHAR_TAB 9 @@ -352,7 +227,7 @@ char *linehasmarkup(char *); char *losemarkup(char *); gboolean tagcomp(const char *,const char *); -char *loseentities(char *); +void loseentities(char *); gboolean isroman(const char *); void postprocess_for_DP(char *); @@ -2916,6 +2791,8 @@ g_tree_unref(qperiod); g_set_print_handler(NULL); print_as_windows_1252(NULL); + if (pswit[MARKUP_SWITCH]) + loseentities(NULL); } /* @@ -3210,8 +3087,7 @@ { while (losemarkup(theline)) ; - while (loseentities(theline)) - ; + loseentities(theline); } char *losemarkup(char *theline) @@ -3233,37 +3109,86 @@ return NULL; } -char *loseentities(char *theline) +void loseentities(char *theline) { int i; - char *s,*t; - if (!*theline) - return NULL; - for (i=0;*entities[i].htmlent;i++) + gsize nb; + char *amp,*scolon; + gchar *s,*t; + gunichar c; + GTree *entities=NULL; + GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1; + if (!theline) { - s=strstr(theline,entities[i].htmlent); - if (s) + if (entities) + g_tree_destroy(entities); + entities=NULL; + if (translit==(GIConv)-1) + g_iconv_close(translit); + translit=(GIConv)-1; + if (to_utf8==(GIConv)-1) + g_iconv_close(to_utf8); + to_utf8=(GIConv)-1; + return; + } + if (!*theline) + return; + if (!entities) + { + entities=g_tree_new((GCompareFunc)strcmp); + for(i=0;i=192 && c<=255) /* An ISO-8859-1 character */ + theline+=g_unichar_to_utf8(c,theline); + else + { + s=g_malloc(6); + nb=g_unichar_to_utf8(c,s); + t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL); + g_free(s); + s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL); + g_free(t); + memcpy(theline,s,nb); + g_free(s); + theline+=nb; + } + memmove(theline,g_utf8_next_char(scolon), + strlen(g_utf8_next_char(scolon))+1); + } + else + theline=g_utf8_next_char(amp); } - for (i=0;*entities[i].htmlnum;i++) - { - s=strstr(theline,entities[i].htmlnum); - if (s) - { - t=g_strdup(s+strlen(entities[i].htmlnum)); - strcpy(s,entities[i].textent); - strcat(s,t); - g_free(t); - return theline; - } - } - return NULL; } gboolean tagcomp(const char *strin,const char *basetag) diff -r aa916da2e452 -r 82d3cc398b54 bookloupe/gen-html-entities.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bookloupe/gen-html-entities.sh Thu May 30 17:16:37 2013 +0100 @@ -0,0 +1,36 @@ +#!/bin/sh + +header() +{ +cat << EOF +/* + * Automatically generated by gen-html-entities. Do not edit by hand. + */ + +struct { + char *name; + gunichar c; +} HTMLentities[] = { +EOF +} + +parse_ent_file() +{ + awk '/