1.1 --- a/bookloupe/bookloupe.c Tue Sep 24 07:18:50 2013 +0100
1.2 +++ b/bookloupe/bookloupe.c Thu Oct 03 16:09:39 2013 +0100
1.3 @@ -431,6 +431,40 @@
1.4 return 0;
1.5 }
1.6
1.7 +/*
1.8 + * count_dashes:
1.9 + *
1.10 + * Record in <results> whether <line> contains <dash> and how it is spaced.
1.11 + * Each counter is incremented at most once per call.
1.12 + */
1.13 +void count_dashes(const char *line,const char *dash,
1.14 +  struct dash_results *results)
1.15 +{
1.16 +    int i;
1.17 +    gchar **tokens;
1.18 +    gboolean before,after,spaced=FALSE,unspaced=FALSE,spaced2=FALSE;
1.19 +    if (!*line)     /* g_strsplit("") yields no tokens at all */
1.20 +        return;
1.21 +    tokens=g_strsplit(line,dash,0);
1.22 +    if (tokens[1])
1.23 +        results->base++;
1.24 +    for(i=1;tokens[i];i++)
1.25 +    {
1.26 +        /* Empty token (dash at line start or after a dash): no space before */
1.27 +        before=*tokens[i-1] && g_unichar_isspace(g_utf8_get_char(
1.28 +          g_utf8_prev_char(tokens[i-1]+strlen(tokens[i-1]))));
1.29 +        after=g_unichar_isspace(g_utf8_get_char(tokens[i]));
1.30 +        spaced|=before || after;
1.31 +        spaced2|=before && after;
1.32 +        unspaced|=!before && !after;
1.33 +    }
1.34 +    /* The flags are 0 or 1, so each total counts lines rather than dashes */
1.35 +    results->space+=spaced;
1.36 +    results->non_PG_space+=spaced2; /* em-dashes with spaces both sides */
1.37 +    results->PG_space+=unspaced; /* PG-type em-dashes with no spaces */
1.38 +    g_strfreev(tokens);
1.39 +}
1.40 +
1.41 /*
1.42 * first_pass:
1.43 *
1.44 @@ -449,6 +483,7 @@
1.45 unsigned int lastlen=0,lastblen=0;
1.46 long spline=0,nspline=0;
1.47 static struct first_pass_results results={0};
1.48 + struct dash_results tmp_dash_results;
1.49 gchar *inword;
1.50 QuoteClass qc;
1.51 lines=g_strsplit(etext,"\n",0);
1.52 @@ -512,7 +547,7 @@
1.53 else
1.54 qc=INVALID_QUOTE;
1.55 if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) &&
1.56 - isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
1.57 + g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
1.58 results.endquote_count++;
1.59 }
1.60 }
1.61 @@ -559,18 +594,15 @@
1.62 results.htmcount+=4; /* bonus marks! */
1.63 }
1.64 /* Check for spaced em-dashes */
1.65 - if (lines[j][0] && (s=strstr(g_utf8_next_char(lines[j]),"--")))
1.66 - {
1.67 - results.emdash++;
1.68 - if (s[-1]==CHAR_SPACE || s[2]==CHAR_SPACE)
1.69 - results.space_emdash++;
1.70 - if (s[-1]==CHAR_SPACE && s[2]==CHAR_SPACE)
1.71 - /* count of em-dashes with spaces both sides */
1.72 - results.non_PG_space_emdash++;
1.73 - if (s[-1]!=CHAR_SPACE && s[2]!=CHAR_SPACE)
1.74 - /* count of PG-type em-dashes with no spaces */
1.75 - results.PG_space_emdash++;
1.76 - }
1.77 + memset(&tmp_dash_results,0,sizeof(tmp_dash_results));
1.78 + count_dashes(lines[j],"--",&tmp_dash_results);
1.79 + count_dashes(lines[j],"—",&tmp_dash_results);
1.80 + if (tmp_dash_results.base)
1.81 + results.emdash.base++;
1.82 + if (tmp_dash_results.non_PG_space)
1.83 + results.emdash.non_PG_space++;
1.84 + if (tmp_dash_results.PG_space)
1.85 + results.emdash.PG_space++;
1.86 for (s=lines[j];*s;)
1.87 {
1.88 inword=getaword(&s);
1.89 @@ -700,13 +732,13 @@
1.90 * and some people insist on them whatever the guidelines say.
1.91 */
1.92 warnings.dash=1;
1.93 - if (results->spacedash+results->non_PG_space_emdash>
1.94 - results->PG_space_emdash)
1.95 + if (results->spacedash+results->emdash.non_PG_space>
1.96 + results->emdash.PG_space)
1.97 {
1.98 warnings.dash=0;
1.99 g_print(" --> There are %ld spaced dashes and em-dashes. "
1.100 "Not reporting them.\n",
1.101 - results->spacedash+results->non_PG_space_emdash);
1.102 + results->spacedash+results->emdash.non_PG_space);
1.103 }
1.104 /* If more than a quarter of characters are hi-bit, bug out. */
1.105 warnings.bin=1;
1.106 @@ -1120,20 +1152,53 @@
1.107 }
1.108
1.109 /*
1.110 + * str_emdash:
1.111 + *
1.112 + * Locate the earliest em-dash in <s>, written either as "--" or as "—".
1.113 + * Returns a pointer to it and stores the position just past it in <next>.
1.114 + * If <s> holds neither form, returns NULL and leaves <next> untouched.
1.115 + */
1.116 +char *str_emdash(const char *s,const char **next)
1.117 +{
1.118 +    const char *ascii,*unicode,*found;
1.119 +    gboolean ascii_first;
1.120 +    ascii=strstr(s,"--");
1.121 +    unicode=strstr(s,"—");
1.122 +    if (!ascii && !unicode)
1.123 +        return NULL;
1.124 +    /*
1.125 +     * The double hyphen is chosen when it is the only form present or
1.126 +     * starts before the single-character form.
1.127 +     */
1.128 +    ascii_first=ascii && (!unicode || ascii<unicode);
1.129 +    if (ascii_first)
1.130 +    {
1.131 +        /* "--" is two characters long */
1.132 +        found=ascii;
1.133 +        *next=g_utf8_next_char(g_utf8_next_char(found));
1.134 +    }
1.135 +    else
1.136 +    {
1.137 +        found=unicode;
1.138 +        *next=g_utf8_next_char(found);
1.139 +    }
1.140 +    return (char *)found;
1.141 +}
1.142 +
1.143 +/*
1.144 * check_for_spaced_emdash:
1.145 *
1.146 * Check for spaced em-dashes.
1.147 *
1.148 - * We must check _all_ occurrences of "--" on the line
1.149 - * hence the loop - even if the first double-dash is OK
1.150 + * We must check _all_ occurrences of em-dashes on the line
1.151 + * hence the loop - even if the first dash is OK
1.152 * there may be another that's wrong later on.
1.153 */
1.154 void check_for_spaced_emdash(const char *aline)
1.155 {
1.156 const char *s,*t,*next;
1.157 - for (s=aline;t=strstr(s,"--");s=next)
1.158 + for (s=aline;t=str_emdash(s,&next);s=next)
1.159 {
1.160 - next=g_utf8_next_char(g_utf8_next_char(t));
1.161 if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
1.162 g_utf8_get_char(next)==CHAR_SPACE)
1.163 {
1.164 @@ -2322,7 +2387,7 @@
1.165 qc=CHAR_IS_DQUOTE(c)?QUOTE_CLASS(c):INVALID_QUOTE;
1.166 nc=g_utf8_get_char(g_utf8_next_char(s));
1.167 /* for each character in the line except 1st */
1.168 - if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) && isalpha(pc))
1.169 + if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) && g_unichar_isalpha(pc))
1.170 {
1.171 if (pswit[ECHO_SWITCH])
1.172 g_print("\n%s\n",aline);
1.173 @@ -2466,7 +2531,7 @@
1.174 cnt_punct++;
1.175 break;
1.176 }
1.177 - if (g_utf8_strchr("-.:!([{?}])",-1,g_utf8_get_char(s)))
1.178 + if (g_utf8_strchr("—-.:!([{?}])",-1,g_utf8_get_char(s)))
1.179 break;
1.180 }
1.181 }
2.1 --- a/bookloupe/bookloupe.h Tue Sep 24 07:18:50 2013 +0100
2.2 +++ b/bookloupe/bookloupe.h Thu Oct 03 16:09:39 2013 +0100
2.3 @@ -58,11 +58,16 @@
2.4 SWITNO
2.5 };
2.6
2.7 +struct dash_results { /* Dash tallies; count_dashes() bumps each at most once per call: */
2.8 + long base,space,non_PG_space,PG_space; /* any dash; a space on either side; */
2.9 +}; /* spaces on both sides; no space on either side */
2.10 +
2.11 struct first_pass_results {
2.12 long firstline,astline;
2.13 long footerline,totlen,binlen,alphalen,endquote_count,shortline,dotcomma;
2.14 long fslashline,hyphens,longline,verylongline,htmcount,standalone_digit;
2.15 - long spacedash,emdash,space_emdash,non_PG_space_emdash,PG_space_emdash;
2.16 + long spacedash;
2.17 + struct dash_results emdash;
2.18 int Dutchcount,Frenchcount;
2.19 };
2.20
3.1 --- a/test/bookloupe/Makefile.am Tue Sep 24 07:18:50 2013 +0100
3.2 +++ b/test/bookloupe/Makefile.am Thu Oct 03 16:09:39 2013 +0100
3.3 @@ -1,5 +1,6 @@
3.4 TESTS_ENVIRONMENT=BOOKLOUPE=../../bookloupe/bookloupe ../harness/loupe-test
3.5 TESTS=non-ascii.tst long-line.tst curved-single-quotes.tst curved-quotes.tst \
3.6 - runfox-quotes.tst curved-genitives.tst multi-line-illustration.tst
3.7 + runfox-quotes.tst curved-genitives.tst multi-line-illustration.tst \
3.8 + emdash.tst
3.9
3.10 dist_pkgdata_DATA=$(TESTS)
4.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
4.2 +++ b/test/bookloupe/emdash.tst Thu Oct 03 16:09:39 2013 +0100
4.3 @@ -0,0 +1,14 @@
4.4 +**************** INPUT ****************
4.5 +“Those are good words,” Spotted Deer declared, admiringly. “You will
4.6 +soon find Gokhos, the great white Medicine Owl, and then we will be able
4.7 +to do some great things. I am not thinking about the Shawnees——”
4.8 +
4.9 +“Stop!” Running Fox interrupted, excitedly. “I see smoke rising behind
4.10 +that ridge — I believe we have found the Mohawk camp.”
4.11 +**************** WARNINGS ****************
4.12 +<expected>
4.13 + <error>
4.14 + <at line="6" column="12"/>
4.15 + <text>Spaced em-dash?</text>
4.16 + </error>
4.17 +</expected>