# HG changeset patch # User ali # Date 1380812979 -3600 # Node ID c45fa3843618a62325745fd3661fd667cff3803d # Parent 8c2d6a0cf7172aba353cc2bb1bd7fd37c966b663 Fix bug #20: Accept Unicode emdash as valid end-of-paragraph punctuation diff -r 8c2d6a0cf717 -r c45fa3843618 bookloupe/bookloupe.c --- a/bookloupe/bookloupe.c Tue Sep 24 07:18:50 2013 +0100 +++ b/bookloupe/bookloupe.c Thu Oct 03 16:09:39 2013 +0100 @@ -431,6 +431,40 @@ return 0; } +void count_dashes(const char *line,const char *dash, + struct dash_results *results) +{ + int i; + gchar **tokens; + gunichar pc,nc; + gboolean spaced=FALSE,unspaced=FALSE,spaced2=FALSE; + if (!*line) + return; + tokens=g_strsplit(line,dash,0); + if (tokens[1]) + results->base++; + for(i=1;tokens[i];i++) + { + pc=g_utf8_get_char(g_utf8_prev_char(tokens[i-1]+strlen(tokens[i-1]))); + nc=g_utf8_get_char(tokens[i]); + if (g_unichar_isspace(pc) || g_unichar_isspace(nc)) + spaced=TRUE; + if (g_unichar_isspace(pc) && g_unichar_isspace(nc)) + spaced2=TRUE; + else if (!g_unichar_isspace(pc) && !g_unichar_isspace(nc)) + unspaced=TRUE; + } + if (spaced) + results->space++; + if (spaced2) + /* count of lines with em-dashes with spaces both sides */ + results->non_PG_space++; + if (unspaced) + /* count of lines with PG-type em-dashes with no spaces */ + results->PG_space++; + g_strfreev(tokens); +} + /* * first_pass: * @@ -449,6 +483,7 @@ unsigned int lastlen=0,lastblen=0; long spline=0,nspline=0; static struct first_pass_results results={0}; + struct dash_results tmp_dash_results; gchar *inword; QuoteClass qc; lines=g_strsplit(etext,"\n",0); @@ -512,7 +547,7 @@ else qc=INVALID_QUOTE; if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) && - isalpha(g_utf8_get_char(g_utf8_prev_char(s)))) + g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s)))) results.endquote_count++; } } @@ -559,18 +594,15 @@ results.htmcount+=4; /* bonus marks! 
*/ } /* Check for spaced em-dashes */ - if (lines[j][0] && (s=strstr(g_utf8_next_char(lines[j]),"--"))) - { - results.emdash++; - if (s[-1]==CHAR_SPACE || s[2]==CHAR_SPACE) - results.space_emdash++; - if (s[-1]==CHAR_SPACE && s[2]==CHAR_SPACE) - /* count of em-dashes with spaces both sides */ - results.non_PG_space_emdash++; - if (s[-1]!=CHAR_SPACE && s[2]!=CHAR_SPACE) - /* count of PG-type em-dashes with no spaces */ - results.PG_space_emdash++; - } + memset(&tmp_dash_results,0,sizeof(tmp_dash_results)); + count_dashes(lines[j],"--",&tmp_dash_results); + count_dashes(lines[j],"—",&tmp_dash_results); + if (tmp_dash_results.base) + results.emdash.base++; + if (tmp_dash_results.non_PG_space) + results.emdash.non_PG_space++; + if (tmp_dash_results.PG_space) + results.emdash.PG_space++; for (s=lines[j];*s;) { inword=getaword(&s); @@ -700,13 +732,13 @@ * and some people insist on them whatever the guidelines say. */ warnings.dash=1; - if (results->spacedash+results->non_PG_space_emdash> - results->PG_space_emdash) + if (results->spacedash+results->emdash.non_PG_space> + results->emdash.PG_space) { warnings.dash=0; g_print(" --> There are %ld spaced dashes and em-dashes. " "Not reporting them.\n", - results->spacedash+results->non_PG_space_emdash); + results->spacedash+results->emdash.non_PG_space); } /* If more than a quarter of characters are hi-bit, bug out. */ warnings.bin=1; @@ -1120,20 +1152,53 @@ } /* + * str_emdash: + * + * Find the first em-dash, return a pointer to it and set *next to the + * character following the dash. 
+ */ +char *str_emdash(const char *s,const char **next) +{ + const char *s1,*s2; + s1=strstr(s,"--"); + s2=strstr(s,"—"); + if (!s1) + { + if (s2) + *next=g_utf8_next_char(s2); + return (char *)s2; + } + else if (!s2) + { + *next=g_utf8_next_char(g_utf8_next_char(s1)); + return (char *)s1; + } + else if (s1aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE || g_utf8_get_char(next)==CHAR_SPACE) { @@ -2322,7 +2387,7 @@ qc=CHAR_IS_DQUOTE(c)?QUOTE_CLASS(c):INVALID_QUOTE; nc=g_utf8_get_char(g_utf8_next_char(s)); /* for each character in the line except 1st */ - if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) && isalpha(pc)) + if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) && g_unichar_isalpha(pc)) { if (pswit[ECHO_SWITCH]) g_print("\n%s\n",aline); @@ -2466,7 +2531,7 @@ cnt_punct++; break; } - if (g_utf8_strchr("-.:!([{?}])",-1,g_utf8_get_char(s))) + if (g_utf8_strchr("—-.:!([{?}])",-1,g_utf8_get_char(s))) break; } } diff -r 8c2d6a0cf717 -r c45fa3843618 bookloupe/bookloupe.h --- a/bookloupe/bookloupe.h Tue Sep 24 07:18:50 2013 +0100 +++ b/bookloupe/bookloupe.h Thu Oct 03 16:09:39 2013 +0100 @@ -58,11 +58,16 @@ SWITNO }; +struct dash_results { + long base,space,non_PG_space,PG_space; +}; + struct first_pass_results { long firstline,astline; long footerline,totlen,binlen,alphalen,endquote_count,shortline,dotcomma; long fslashline,hyphens,longline,verylongline,htmcount,standalone_digit; - long spacedash,emdash,space_emdash,non_PG_space_emdash,PG_space_emdash; + long spacedash; + struct dash_results emdash; int Dutchcount,Frenchcount; }; diff -r 8c2d6a0cf717 -r c45fa3843618 test/bookloupe/Makefile.am --- a/test/bookloupe/Makefile.am Tue Sep 24 07:18:50 2013 +0100 +++ b/test/bookloupe/Makefile.am Thu Oct 03 16:09:39 2013 +0100 @@ -1,5 +1,6 @@ TESTS_ENVIRONMENT=BOOKLOUPE=../../bookloupe/bookloupe ../harness/loupe-test TESTS=non-ascii.tst long-line.tst curved-single-quotes.tst curved-quotes.tst \ - runfox-quotes.tst curved-genitives.tst multi-line-illustration.tst + 
runfox-quotes.tst curved-genitives.tst multi-line-illustration.tst \ + emdash.tst dist_pkgdata_DATA=$(TESTS) diff -r 8c2d6a0cf717 -r c45fa3843618 test/bookloupe/emdash.tst --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/bookloupe/emdash.tst Thu Oct 03 16:09:39 2013 +0100 @@ -0,0 +1,14 @@ +**************** INPUT **************** +“Those are good words,” Spotted Deer declared, admiringly. “You will +soon find Gokhos, the great white Medicine Owl, and then we will be able +to do some great things. I am not thinking about the Shawnees——” + +“Stop!” Running Fox interrupted, excitedly. “I see smoke rising behind +that ridge — I believe we have found the Mohawk camp.” +**************** WARNINGS **************** + + + + Spaced em-dash? + +