Fix bug #20: Accept Unicode emdash as valid end-of-paragraph punctuation
author ali <ali@juiblex.co.uk>
Thu Oct 03 16:09:39 2013 +0100 (2013-10-03)
changeset 97 c45fa3843618
parent 96 8c2d6a0cf717
child 98 37da646396b9
Fix bug #20: Accept Unicode emdash as valid end-of-paragraph punctuation
bookloupe/bookloupe.c
bookloupe/bookloupe.h
test/bookloupe/Makefile.am
test/bookloupe/emdash.tst
     1.1 --- a/bookloupe/bookloupe.c	Tue Sep 24 07:18:50 2013 +0100
     1.2 +++ b/bookloupe/bookloupe.c	Thu Oct 03 16:09:39 2013 +0100
     1.3 @@ -431,6 +431,40 @@
     1.4      return 0;
     1.5  }
     1.6  
     1.7 +void count_dashes(const char *line,const char *dash,
     1.8 +  struct dash_results *results)
     1.9 +{
    1.10 +    int i;
    1.11 +    gchar **tokens;
    1.12 +    gunichar pc,nc;
    1.13 +    gboolean spaced=FALSE,unspaced=FALSE,spaced2=FALSE;
    1.14 +    if (!*line)
    1.15 +	return;
    1.16 +    tokens=g_strsplit(line,dash,0);
    1.17 +    if (tokens[1])
    1.18 +	results->base++;
    1.19 +    for(i=1;tokens[i];i++)
    1.20 +    {
    1.21 +	pc=g_utf8_get_char(g_utf8_prev_char(tokens[i-1]+strlen(tokens[i-1])));
    1.22 +	nc=g_utf8_get_char(tokens[i]);
    1.23 +	if (g_unichar_isspace(pc) || g_unichar_isspace(nc))
    1.24 +	    spaced=TRUE;
    1.25 +	if (g_unichar_isspace(pc) && g_unichar_isspace(nc))
    1.26 +	    spaced2=TRUE;
    1.27 +	else if (!g_unichar_isspace(pc) && !g_unichar_isspace(nc))
    1.28 +	    unspaced=TRUE;
    1.29 +    }
    1.30 +    if (spaced)
    1.31 +	results->space++;
    1.32 +    if (spaced2)
    1.33 +	/* count of lines with em-dashes with spaces both sides */
    1.34 +	results->non_PG_space++;
    1.35 +    if (unspaced)
    1.36 +	/* count of lines with PG-type em-dashes with no spaces */
    1.37 +	results->PG_space++;
    1.38 +    g_strfreev(tokens);
    1.39 +}
    1.40 +
    1.41  /*
    1.42   * first_pass:
    1.43   *
    1.44 @@ -449,6 +483,7 @@
    1.45      unsigned int lastlen=0,lastblen=0;
    1.46      long spline=0,nspline=0;
    1.47      static struct first_pass_results results={0};
    1.48 +    struct dash_results tmp_dash_results;
    1.49      gchar *inword;
    1.50      QuoteClass qc;
    1.51      lines=g_strsplit(etext,"\n",0);
    1.52 @@ -512,7 +547,7 @@
    1.53  		else
    1.54  		    qc=INVALID_QUOTE;
    1.55  		if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) &&
    1.56 -		  isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
    1.57 +		  g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))))
    1.58  		    results.endquote_count++;
    1.59  	    }
    1.60  	}
    1.61 @@ -559,18 +594,15 @@
    1.62  		results.htmcount+=4; /* bonus marks! */
    1.63  	}
    1.64  	/* Check for spaced em-dashes */
    1.65 -	if (lines[j][0] && (s=strstr(g_utf8_next_char(lines[j]),"--")))
    1.66 -	{
    1.67 -	    results.emdash++;
    1.68 -	    if (s[-1]==CHAR_SPACE || s[2]==CHAR_SPACE)
    1.69 -		results.space_emdash++;
    1.70 -	    if (s[-1]==CHAR_SPACE && s[2]==CHAR_SPACE)
    1.71 -		/* count of em-dashes with spaces both sides */
    1.72 -		results.non_PG_space_emdash++;
    1.73 -	    if (s[-1]!=CHAR_SPACE && s[2]!=CHAR_SPACE)
    1.74 -		/* count of PG-type em-dashes with no spaces */
    1.75 -		results.PG_space_emdash++;
    1.76 -	}
    1.77 +	memset(&tmp_dash_results,0,sizeof(tmp_dash_results));
    1.78 +	count_dashes(lines[j],"--",&tmp_dash_results);
    1.79 +	count_dashes(lines[j],"—",&tmp_dash_results);
    1.80 +	if (tmp_dash_results.base)
    1.81 +	    results.emdash.base++;
    1.82 +	if (tmp_dash_results.non_PG_space)
    1.83 +	    results.emdash.non_PG_space++;
    1.84 +	if (tmp_dash_results.PG_space)
    1.85 +	    results.emdash.PG_space++;
    1.86  	for (s=lines[j];*s;)
    1.87  	{
    1.88  	    inword=getaword(&s);
    1.89 @@ -700,13 +732,13 @@
    1.90       * and some people insist on them whatever the guidelines say.
    1.91       */
    1.92      warnings.dash=1;
    1.93 -    if (results->spacedash+results->non_PG_space_emdash>
    1.94 -      results->PG_space_emdash)
    1.95 +    if (results->spacedash+results->emdash.non_PG_space>
    1.96 +      results->emdash.PG_space)
    1.97      {
    1.98  	warnings.dash=0;
    1.99  	g_print("   --> There are %ld spaced dashes and em-dashes. "
   1.100  	  "Not reporting them.\n",
   1.101 -	  results->spacedash+results->non_PG_space_emdash);
   1.102 +	  results->spacedash+results->emdash.non_PG_space);
   1.103      }
   1.104      /* If more than a quarter of characters are hi-bit, bug out. */
   1.105      warnings.bin=1;
   1.106 @@ -1120,20 +1152,53 @@
   1.107  }
   1.108  
   1.109  /*
   1.110 + * str_emdash:
   1.111 + *
   1.112 + * Find the first em-dash, return a pointer to it and set <next> to the
   1.113 + * character following the dash.
   1.114 + */
   1.115 +char *str_emdash(const char *s,const char **next)
   1.116 +{
   1.117 +    const char *s1,*s2;
   1.118 +    s1=strstr(s,"--");
   1.119 +    s2=strstr(s,"—");
   1.120 +    if (!s1)
   1.121 +    {
   1.122 +	if (s2)
   1.123 +	    *next=g_utf8_next_char(s2);
   1.124 +	return (char *)s2;
   1.125 +    }
   1.126 +    else if (!s2)
   1.127 +    {
   1.128 +	*next=g_utf8_next_char(g_utf8_next_char(s1));
   1.129 +	return (char *)s1;
   1.130 +    }
   1.131 +    else if (s1<s2)
   1.132 +    {
   1.133 +	*next=g_utf8_next_char(g_utf8_next_char(s1));
   1.134 +	return (char *)s1;
   1.135 +    }
   1.136 +    else
   1.137 +    {
   1.138 +	*next=g_utf8_next_char(s2);
   1.139 +	return (char *)s2;
   1.140 +    }
   1.141 +}
   1.142 +
   1.143 +/*
   1.144   * check_for_spaced_emdash:
   1.145   *
   1.146   * Check for spaced em-dashes.
   1.147   *
   1.148 - * We must check _all_ occurrences of "--" on the line
   1.149 - * hence the loop - even if the first double-dash is OK
   1.150 + * We must check _all_ occurrences of em-dashes on the line
   1.151 + * hence the loop - even if the first dash is OK
   1.152   * there may be another that's wrong later on.
   1.153   */
   1.154  void check_for_spaced_emdash(const char *aline)
   1.155  {
   1.156      const char *s,*t,*next;
   1.157 -    for (s=aline;t=strstr(s,"--");s=next)
   1.158 +    for (s=aline;t=str_emdash(s,&next);s=next)
   1.159      {
   1.160 -	next=g_utf8_next_char(g_utf8_next_char(t));
   1.161  	if (t>aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE ||
   1.162  	  g_utf8_get_char(next)==CHAR_SPACE)
   1.163  	{
   1.164 @@ -2322,7 +2387,7 @@
   1.165  	qc=CHAR_IS_DQUOTE(c)?QUOTE_CLASS(c):INVALID_QUOTE;
   1.166  	nc=g_utf8_get_char(g_utf8_next_char(s));
   1.167  	/* for each character in the line except 1st */
   1.168 -	if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) && isalpha(pc))
   1.169 +	if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) && g_unichar_isalpha(pc))
   1.170  	{
   1.171  	    if (pswit[ECHO_SWITCH])
   1.172  		g_print("\n%s\n",aline);
   1.173 @@ -2466,7 +2531,7 @@
   1.174  		    cnt_punct++;
   1.175  		break;
   1.176  	    }
   1.177 -	    if (g_utf8_strchr("-.:!([{?}])",-1,g_utf8_get_char(s)))
   1.178 +	    if (g_utf8_strchr("—-.:!([{?}])",-1,g_utf8_get_char(s)))
   1.179  		break;
   1.180  	}
   1.181      }
     2.1 --- a/bookloupe/bookloupe.h	Tue Sep 24 07:18:50 2013 +0100
     2.2 +++ b/bookloupe/bookloupe.h	Thu Oct 03 16:09:39 2013 +0100
     2.3 @@ -58,11 +58,16 @@
     2.4      SWITNO
     2.5  };
     2.6  
     2.7 +struct dash_results {
     2.8 +    long base,space,non_PG_space,PG_space;
     2.9 +};
    2.10 +
    2.11  struct first_pass_results {
    2.12      long firstline,astline;
    2.13      long footerline,totlen,binlen,alphalen,endquote_count,shortline,dotcomma;
    2.14      long fslashline,hyphens,longline,verylongline,htmcount,standalone_digit;
    2.15 -    long spacedash,emdash,space_emdash,non_PG_space_emdash,PG_space_emdash;
    2.16 +    long spacedash;
    2.17 +    struct dash_results emdash;
    2.18      int Dutchcount,Frenchcount;
    2.19  };
    2.20  
     3.1 --- a/test/bookloupe/Makefile.am	Tue Sep 24 07:18:50 2013 +0100
     3.2 +++ b/test/bookloupe/Makefile.am	Thu Oct 03 16:09:39 2013 +0100
     3.3 @@ -1,5 +1,6 @@
     3.4  TESTS_ENVIRONMENT=BOOKLOUPE=../../bookloupe/bookloupe ../harness/loupe-test
     3.5  TESTS=non-ascii.tst long-line.tst curved-single-quotes.tst curved-quotes.tst \
     3.6 -	runfox-quotes.tst curved-genitives.tst multi-line-illustration.tst
     3.7 +	runfox-quotes.tst curved-genitives.tst multi-line-illustration.tst \
     3.8 +	emdash.tst
     3.9  
    3.10  dist_pkgdata_DATA=$(TESTS)
     4.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     4.2 +++ b/test/bookloupe/emdash.tst	Thu Oct 03 16:09:39 2013 +0100
     4.3 @@ -0,0 +1,14 @@
     4.4 +**************** INPUT ****************
     4.5 +“Those are good words,” Spotted Deer declared, admiringly. “You will
     4.6 +soon find Gokhos, the great white Medicine Owl, and then we will be able
     4.7 +to do some great things. I am not thinking about the Shawnees——”
     4.8 +
     4.9 +“Stop!” Running Fox interrupted, excitedly. “I see smoke rising behind
    4.10 +that ridge — I believe we have found the Mohawk camp.”
    4.11 +**************** WARNINGS ****************
    4.12 +<expected>
    4.13 +  <error>
    4.14 +    <at line="6" column="12"/>
    4.15 +    <text>Spaced em-dash?</text>
    4.16 +  </error>
    4.17 +</expected>