Fix bug #6: BL treats a slanted apostrophe ’ as a word separator, not as a contraction or possessive
author ali <ali@juiblex.co.uk>
Sat Sep 21 23:40:18 2013 +0100 (2013-09-21)
changeset 92 7a62c77a0dbe
parent 91 9a5f6d17e86e
child 93 9fb13a5dde3b
Fix bug #6: BL treats a slanted apostrophe ’ as a word separator, not as a contraction or possessive
bookloupe/Makefile.am
bookloupe/bookloupe.c
bookloupe/bookloupe.h
bookloupe/counters.c
bookloupe/counters.h
doc/bookloupe.txt
test/bookloupe/Makefile.am
test/bookloupe/curved-genitives.tst
test/bookloupe/curved-single-quotes.tst
test/compatibility/Makefile.am
test/compatibility/genatives.tst
test/compatibility/genitives.tst
     1.1 --- a/bookloupe/Makefile.am	Tue Sep 17 20:55:57 2013 +0100
     1.2 +++ b/bookloupe/Makefile.am	Sat Sep 21 23:40:18 2013 +0100
     1.3 @@ -1,5 +1,6 @@
     1.4  INCLUDES=-I$(top_srcdir)
     1.5  bin_PROGRAMS=bookloupe
     1.6 +bookloupe_SOURCES=bookloupe.c bookloupe.h counters.c counters.h
     1.7  pkgdata_DATA=bookloupe.typ
     1.8  AM_CFLAGS=$(GLIB_CFLAGS)
     1.9  LIBS=$(GLIB_LIBS)
     2.1 --- a/bookloupe/bookloupe.c	Tue Sep 17 20:55:57 2013 +0100
     2.2 +++ b/bookloupe/bookloupe.c	Sat Sep 21 23:40:18 2013 +0100
     2.3 @@ -27,6 +27,8 @@
     2.4  #endif
     2.5  #include <glib.h>
     2.6  #include <bl/bl.h>
     2.7 +#include "bookloupe.h"
     2.8 +#include "counters.h"
     2.9  #include "HTMLentities.h"
    2.10  
    2.11  gchar *prevline;
    2.12 @@ -123,50 +125,6 @@
    2.13      "among", "those", "into", "whom", "having", "thence", ""
    2.14  }; 
    2.15  
    2.16 -/* special characters */
    2.17 -#define CHAR_SPACE	  32
    2.18 -#define CHAR_TAB	   9
    2.19 -#define CHAR_LF		  10
    2.20 -#define CHAR_CR		  13
    2.21 -#define CHAR_DQUOTE	  34
    2.22 -#define CHAR_SQUOTE	  39
    2.23 -#define CHAR_OPEN_SQUOTE  96
    2.24 -#define CHAR_TILDE	 126
    2.25 -#define CHAR_ASTERISK	  42
    2.26 -#define CHAR_FORESLASH	  47
    2.27 -#define CHAR_CARAT	  94
    2.28 -
    2.29 -#define CHAR_UNDERSCORE    '_'
    2.30 -#define CHAR_OPEN_CBRACK   '{'
    2.31 -#define CHAR_CLOSE_CBRACK  '}'
    2.32 -#define CHAR_OPEN_RBRACK   '('
    2.33 -#define CHAR_CLOSE_RBRACK  ')'
    2.34 -#define CHAR_OPEN_SBRACK   '['
    2.35 -#define CHAR_CLOSE_SBRACK  ']'
    2.36 -
    2.37 -/* longest and shortest normal PG line lengths */
    2.38 -#define LONGEST_PG_LINE   75
    2.39 -#define WAY_TOO_LONG      80
    2.40 -#define SHORTEST_PG_LINE  55
    2.41 -
    2.42 -enum {
    2.43 -    ECHO_SWITCH,
    2.44 -    SQUOTE_SWITCH,
    2.45 -    TYPO_SWITCH,
    2.46 -    QPARA_SWITCH,
    2.47 -    PARANOID_SWITCH,
    2.48 -    LINE_END_SWITCH,
    2.49 -    OVERVIEW_SWITCH,
    2.50 -    STDOUT_SWITCH,
    2.51 -    HEADER_SWITCH,
    2.52 -    WEB_SWITCH,
    2.53 -    VERBOSE_SWITCH,
    2.54 -    MARKUP_SWITCH,
    2.55 -    USERTYPO_SWITCH,
    2.56 -    DP_SWITCH,
    2.57 -    SWITNO
    2.58 -};
    2.59 -
    2.60  gboolean pswit[SWITNO];  /* program switches */
    2.61  
    2.62  static GOptionEntry options[]={
    2.63 @@ -242,40 +200,6 @@
    2.64  UINT saved_cp;
    2.65  #endif
    2.66  
    2.67 -struct first_pass_results {
    2.68 -    long firstline,astline;
    2.69 -    long footerline,totlen,binlen,alphalen,endquote_count,shortline,dotcomma;
    2.70 -    long fslashline,hyphens,longline,verylongline,htmcount,standalone_digit;
    2.71 -    long spacedash,emdash,space_emdash,non_PG_space_emdash,PG_space_emdash;
    2.72 -    int Dutchcount,Frenchcount;
    2.73 -};
    2.74 -
    2.75 -struct warnings {
    2.76 -    int shortline,longline,bin,dash,dotcomma,ast,fslash,digit,hyphen;
    2.77 -    int endquote;
    2.78 -    gboolean isDutch,isFrench;
    2.79 -};
    2.80 -
    2.81 -struct counters {
    2.82 -    long quot;
    2.83 -    int c_unders,c_brack,s_brack,r_brack;
    2.84 -    int open_single_quote,close_single_quote;
    2.85 -};
    2.86 -
    2.87 -struct line_properties {
    2.88 -    unsigned int len,blen;
    2.89 -    gunichar start;
    2.90 -};
    2.91 -
    2.92 -struct parities {
    2.93 -    int dquote,squote;
    2.94 -};
    2.95 -
    2.96 -struct pending {
    2.97 -    char *dquote,*squote,*rbrack,*sbrack,*cbrack,*unders;
    2.98 -    long squot;
    2.99 -};
   2.100 -
   2.101  void parse_options(int *argc,char ***argv)
   2.102  {
   2.103      GError *err=NULL;
   2.104 @@ -877,7 +801,7 @@
   2.105  	c=g_utf8_get_char(s);
   2.106  	if (c==CHAR_DQUOTE)
   2.107  	    counters->quot++;
   2.108 -	if (c==CHAR_SQUOTE || c==CHAR_OPEN_SQUOTE)
   2.109 +	if (CHAR_IS_SQUOTE(c))
   2.110  	{
   2.111  	    if (s==aline)
   2.112  	    {
   2.113 @@ -887,21 +811,21 @@
   2.114  		 */
   2.115  		if (!g_str_has_prefix(snext,"tis") &&
   2.116  		  !g_str_has_prefix(snext,"Tis"))
   2.117 -		    counters->open_single_quote++;
   2.118 +		    increment_matching(counters,c,TRUE);
   2.119  	    }
   2.120  	    else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
   2.121  	      g_unichar_isalpha(g_utf8_get_char(snext)))
   2.122  		/* Do nothing! it's definitely an apostrophe, not a quote */
   2.123  		;
   2.124  	    /* it's outside a word - let's check it out */
   2.125 -	    else if (c==CHAR_OPEN_SQUOTE ||
   2.126 +	    else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||
   2.127  	      g_unichar_isalpha(g_utf8_get_char(snext)))
   2.128  	    {
   2.129  		/* it damwell better BE an openquote */
   2.130  		if (!g_str_has_prefix(snext,"tis") &&
   2.131  		  !g_str_has_prefix(snext,"Tis"))
   2.132  		    /* hardcode a very common exception! */
   2.133 -		    counters->open_single_quote++;
   2.134 +		    increment_matching(counters,c,TRUE);
   2.135  	    }
   2.136  	    else
   2.137  	    {
   2.138 @@ -926,7 +850,7 @@
   2.139  		    guessquote+=8; /* looks like a closequote */
   2.140  		else
   2.141  		    guessquote++;
   2.142 -		if (counters->open_single_quote>counters->close_single_quote)
   2.143 +		if (matching_difference(counters,CHAR_SQUOTE)>0)
   2.144  		    /*
   2.145  		     * Give it the benefit of some doubt,
   2.146  		     * if a squote is already open.
   2.147 @@ -935,7 +859,7 @@
   2.148  		else
   2.149  		    guessquote--;
   2.150  		if (guessquote>=0)
   2.151 -		    counters->close_single_quote++;
   2.152 +		    increment_matching(counters,c,FALSE);
   2.153  	    }
   2.154  	}
   2.155  	if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
   2.156 @@ -943,18 +867,11 @@
   2.157  	    isemptyline=FALSE;  /* ignore lines like  *  *  *  as spacers */
   2.158  	if (c==CHAR_UNDERSCORE)
   2.159  	    counters->c_unders++;
   2.160 -	if (c==CHAR_OPEN_CBRACK)
   2.161 -	    counters->c_brack++;
   2.162 -	if (c==CHAR_CLOSE_CBRACK)
   2.163 -	    counters->c_brack--;
   2.164 -	if (c==CHAR_OPEN_RBRACK)
   2.165 -	    counters->r_brack++;
   2.166 -	if (c==CHAR_CLOSE_RBRACK)
   2.167 -	    counters->r_brack--;
   2.168 -	if (c==CHAR_OPEN_SBRACK)
   2.169 -	    counters->s_brack++;
   2.170 -	if (c==CHAR_CLOSE_SBRACK)
   2.171 -	    counters->s_brack--;
   2.172 +	if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK || c==CHAR_OPEN_SBRACK)
   2.173 +	    increment_matching(counters,c,TRUE);
   2.174 +	if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK ||
   2.175 +	  c==CHAR_CLOSE_SBRACK)
   2.176 +	    increment_matching(counters,c,FALSE);
   2.177  	sprev=s;
   2.178  	s=snext;
   2.179      }
   2.180 @@ -1423,12 +1340,12 @@
   2.181   */
   2.182  void check_for_extra_period(const char *aline,const struct warnings *warnings)
   2.183  {
   2.184 -    const char *s,*t,*s1;
   2.185 +    const char *s,*t,*s1,*sprev;
   2.186      int i;
   2.187      gsize len;
   2.188      gboolean istypo;
   2.189      gchar *testword;
   2.190 -    gunichar *decomposition;
   2.191 +    gunichar c,nc,pc,*decomposition;
   2.192      if (pswit[PARANOID_SWITCH])
   2.193      {
   2.194  	for (t=aline;t=strstr(t,". ");)
   2.195 @@ -1452,8 +1369,9 @@
   2.196  		c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
   2.197  		c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
   2.198  		c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
   2.199 -		if (c2==CHAR_SQUOTE && g_unichar_islower(c3) &&
   2.200 -		  c4==CHAR_SPACE && g_unichar_isupper(c5))
   2.201 +		if (CHAR_IS_APOSTROPHE(c2) &&
   2.202 +		  g_unichar_islower(c3) && c4==CHAR_SPACE &&
   2.203 +		  g_unichar_isupper(c5))
   2.204  		{
   2.205  		    t=g_utf8_next_char(t);
   2.206  		    continue;
   2.207 @@ -1468,14 +1386,22 @@
   2.208  		/* we have something to investigate */
   2.209  		istypo=TRUE;
   2.210  		/* so let's go back and find out */
   2.211 -		for (s1=g_utf8_prev_char(t);s1>=aline &&
   2.212 -		  (g_unichar_isalpha(g_utf8_get_char(s1)) ||
   2.213 -		  g_unichar_isdigit(g_utf8_get_char(s1)) ||
   2.214 -		  g_utf8_get_char(s1)==CHAR_SQUOTE &&
   2.215 -		  g_unichar_isalpha(g_utf8_get_char(g_utf8_next_char(s1))) &&
   2.216 -		  g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s1))));
   2.217 -		  s1=g_utf8_prev_char(s1))
   2.218 -		    ;
   2.219 +		nc=g_utf8_get_char(t);
   2.220 +		s1=g_utf8_prev_char(t);
   2.221 +		c=g_utf8_get_char(s1);
   2.222 +		sprev=g_utf8_prev_char(s1);
   2.223 +		pc=g_utf8_get_char(sprev);
   2.224 +		while (s1>=aline &&
   2.225 +		  (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||
   2.226 +		  g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&
   2.227 +		  g_unichar_isalpha(nc)))
   2.228 +		{
   2.229 +		    nc=c;
   2.230 +		    s1=sprev;
   2.231 +		    c=pc;
   2.232 +		    sprev=g_utf8_prev_char(s1);
   2.233 +		    pc=g_utf8_get_char(sprev);
   2.234 +		}
   2.235  		s1=g_utf8_next_char(s1);
   2.236  		s=strchr(s1,'.');
   2.237  		if (s)
   2.238 @@ -1600,7 +1526,7 @@
   2.239      gchar *testword;
   2.240      int i,vowel,consonant,*dupcnt;
   2.241      gboolean isdup,istypo,alower;
   2.242 -    gunichar c;
   2.243 +    gunichar c,pc;
   2.244      long offset,len;
   2.245      gsize decomposition_len;
   2.246      for (s=aline;*s;)
   2.247 @@ -1646,11 +1572,14 @@
   2.248  		     *   French contractions like l'Abbe
   2.249  		     */
   2.250  		    offset=g_utf8_pointer_to_offset(inword,t);
   2.251 +		    if (offset>0)
   2.252 +			pc=g_utf8_get_char(g_utf8_prev_char(t));
   2.253 +		    else
   2.254 +			pc='\0';
   2.255  		    if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
   2.256  		      offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
   2.257  		      g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
   2.258 -		      offset>0 &&
   2.259 -		      g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SQUOTE)
   2.260 +		      CHAR_IS_APOSTROPHE(pc))
   2.261  			; /* do nothing! */
   2.262  		    else
   2.263  			istypo=TRUE;
   2.264 @@ -2050,8 +1979,7 @@
   2.265  	{
   2.266  	    c=nc;
   2.267  	    nc=g_utf8_get_char(g_utf8_next_char(s));
   2.268 -	    if ((c==CHAR_SQUOTE || c==CHAR_OPEN_SQUOTE) && (s==aline ||
   2.269 -	      s>aline &&
   2.270 +	    if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&
   2.271  	      !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
   2.272  	      !g_unichar_isalpha(nc)))
   2.273  	    {
   2.274 @@ -2166,7 +2094,11 @@
   2.275   */
   2.276  void check_for_spaced_quotes(const char *aline)
   2.277  {
   2.278 +    int i;
   2.279      const char *s,*t;
   2.280 +    const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,
   2.281 +      CHAR_RS_QUOTE};
   2.282 +    GString *pattern;
   2.283      s=aline;
   2.284      while ((t=strstr(s," \" ")))
   2.285      {
   2.286 @@ -2179,30 +2111,26 @@
   2.287  	    cnt_punct++;
   2.288  	s=g_utf8_next_char(g_utf8_next_char(t));
   2.289      }
   2.290 -    s=aline;
   2.291 -    while ((t=strstr(s," ' ")))
   2.292 +    pattern=g_string_new(NULL);
   2.293 +    for(i=0;i<G_N_ELEMENTS(single_quotes);i++)
   2.294      {
   2.295 -	if (pswit[ECHO_SWITCH])
   2.296 -	    g_print("\n%s\n",aline);
   2.297 -	if (!pswit[OVERVIEW_SWITCH])
   2.298 -	    g_print("    Line %ld column %ld - Spaced singlequote?\n",
   2.299 -	      linecnt,g_utf8_pointer_to_offset(aline,t)+1);
   2.300 -	else
   2.301 -	    cnt_punct++;
   2.302 -	s=g_utf8_next_char(g_utf8_next_char(t));
   2.303 +	g_string_assign(pattern," ");
   2.304 +	g_string_append_unichar(pattern,single_quotes[i]);
   2.305 +	g_string_append_c(pattern,' ');
   2.306 +	s=aline;
   2.307 +	while ((t=strstr(s,pattern->str)))
   2.308 +	{
   2.309 +	    if (pswit[ECHO_SWITCH])
   2.310 +		g_print("\n%s\n",aline);
   2.311 +	    if (!pswit[OVERVIEW_SWITCH])
   2.312 +		g_print("    Line %ld column %ld - Spaced singlequote?\n",
   2.313 +		  linecnt,g_utf8_pointer_to_offset(aline,t)+1);
   2.314 +	    else
   2.315 +		cnt_punct++;
   2.316 +	    s=g_utf8_next_char(g_utf8_next_char(t));
   2.317 +	}
   2.318      }
   2.319 -    s=aline;
   2.320 -    while ((t=strstr(s," ` ")))
   2.321 -    {
   2.322 -	if (pswit[ECHO_SWITCH])
   2.323 -	    g_print("\n%s\n",aline);
   2.324 -	if (!pswit[OVERVIEW_SWITCH])
   2.325 -	    g_print("    Line %ld column %ld - Spaced singlequote?\n",
   2.326 -	      linecnt,g_utf8_pointer_to_offset(aline,t)+1);
   2.327 -	else
   2.328 -	    cnt_punct++;
   2.329 -	s=g_utf8_next_char(g_utf8_next_char(t));
   2.330 -    }
   2.331 +    g_string_free(pattern,TRUE);
   2.332  }
   2.333  
   2.334  /*
   2.335 @@ -2223,7 +2151,7 @@
   2.336  	pc=c;
   2.337  	c=nc;
   2.338  	nc=g_utf8_get_char(g_utf8_next_char(s));
   2.339 -	if (c==CHAR_SQUOTE && nc=='S' && g_unichar_islower(pc))
   2.340 +	if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))
   2.341  	{
   2.342  	    if (pswit[ECHO_SWITCH])
   2.343  		g_print("\n%s\n",aline);
   2.344 @@ -2255,8 +2183,7 @@
   2.345  	s=g_utf8_prev_char(aline+lbytes);
   2.346  	c1=g_utf8_get_char(s);
   2.347  	c2=g_utf8_get_char(g_utf8_prev_char(s));
   2.348 -	if ((c1==CHAR_DQUOTE || c1==CHAR_SQUOTE || c1==CHAR_OPEN_SQUOTE) &&
   2.349 -	  c2==CHAR_SPACE)
   2.350 +	if ((c1==CHAR_DQUOTE || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)
   2.351  	{
   2.352  	    if (pswit[ECHO_SWITCH])
   2.353  		g_print("\n%s\n",aline);
   2.354 @@ -2268,7 +2195,7 @@
   2.355  	}
   2.356  	c1=g_utf8_get_char(aline);
   2.357  	c2=g_utf8_get_char(g_utf8_next_char(aline));
   2.358 -	if ((c1==CHAR_SQUOTE || c1==CHAR_OPEN_SQUOTE) && c2==CHAR_SPACE)
   2.359 +	if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)
   2.360  	{
   2.361  	    if (pswit[ECHO_SWITCH])
   2.362  		g_print("\n%s\n",aline);
   2.363 @@ -2470,8 +2397,7 @@
   2.364      }
   2.365      if (pending->squote)
   2.366      {
   2.367 -	if (c!=CHAR_SQUOTE && c!=CHAR_OPEN_SQUOTE || pswit[QPARA_SWITCH] ||
   2.368 -	  pending->squot)
   2.369 +	if (!CHAR_IS_SQUOTE(c) || pswit[QPARA_SWITCH] || pending->squot)
   2.370  	{
   2.371  	    if (!pswit[OVERVIEW_SWITCH])
   2.372  	    {
   2.373 @@ -2558,28 +2484,39 @@
   2.374  void check_for_mismatched_quotes(const struct counters *counters,
   2.375    struct pending *pending)
   2.376  {
   2.377 +    int squote_straight,squote_curved;
   2.378      if (counters->quot%2)
   2.379  	pending->dquote=
   2.380  	  g_strdup_printf("    Line %ld - Mismatched quotes",linecnt);
   2.381 -    if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&
   2.382 -      counters->open_single_quote!=counters->close_single_quote)
   2.383 -	pending->squote=
   2.384 -	  g_strdup_printf("    Line %ld - Mismatched singlequotes?",linecnt);
   2.385 -    if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&
   2.386 -      counters->open_single_quote!=counters->close_single_quote &&
   2.387 -      counters->open_single_quote!=counters->close_single_quote+1)
   2.388 -	/*
   2.389 -	 * Flag it to be noted regardless of the
   2.390 -	 * first char of the next para.
   2.391 -	 */
   2.392 -	pending->squot=1;
   2.393 -    if (counters->r_brack)
   2.394 +    if (pswit[SQUOTE_SWITCH])
   2.395 +    {
   2.396 +	if (matching_count(counters,CHAR_SQUOTE,TRUE))
   2.397 +	    squote_straight=matching_difference(counters,CHAR_SQUOTE);
   2.398 +	else
   2.399 +	    squote_straight=0;
   2.400 +	if (matching_count(counters,CHAR_LS_QUOTE,TRUE))
   2.401 +	    squote_curved=matching_difference(counters,CHAR_LS_QUOTE);
   2.402 +	else
   2.403 +	    squote_curved=0;
   2.404 +	if (squote_straight || squote_curved)
   2.405 +	    pending->squote=
   2.406 +	      g_strdup_printf("    Line %ld - Mismatched singlequotes?",
   2.407 +	      linecnt);
   2.408 +	if (squote_straight && squote_straight!=1 ||
   2.409 +	  squote_curved && squote_curved!=1)
   2.410 +	    /*
   2.411 +	     * Flag it to be noted regardless of the
   2.412 +	     * first char of the next para.
   2.413 +	     */
   2.414 +	    pending->squot=1;
   2.415 +    }
   2.416 +    if (matching_difference(counters,CHAR_OPEN_RBRACK))
   2.417  	pending->rbrack=
   2.418  	  g_strdup_printf("    Line %ld - Mismatched round brackets?",linecnt);
   2.419 -    if (counters->s_brack)
   2.420 +    if (matching_difference(counters,CHAR_OPEN_SBRACK))
   2.421  	pending->sbrack=
   2.422  	  g_strdup_printf("    Line %ld - Mismatched square brackets?",linecnt);
   2.423 -    if (counters->c_brack)
   2.424 +    if (matching_difference(counters,CHAR_OPEN_CBRACK))
   2.425  	pending->cbrack=
   2.426  	  g_strdup_printf("    Line %ld - Mismatched curly brackets?",linecnt);
   2.427      if (counters->c_unders%2)
   2.428 @@ -2603,6 +2540,7 @@
   2.429  {
   2.430      gboolean letter_on_line=FALSE;
   2.431      const char *s;
   2.432 +    gunichar c;
   2.433      for (s=prevline;*s;s=g_utf8_next_char(s))
   2.434  	if (g_unichar_isalpha(g_utf8_get_char(s)))
   2.435  	{
   2.436 @@ -2619,12 +2557,12 @@
   2.437      if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
   2.438        g_utf8_get_char(prevline)>CHAR_SPACE)
   2.439      {
   2.440 -	for (s=g_utf8_prev_char(prevline+strlen(prevline));
   2.441 -	  (g_utf8_get_char(s)==CHAR_DQUOTE ||
   2.442 -	  g_utf8_get_char(s)==CHAR_SQUOTE) &&
   2.443 -	  g_utf8_get_char(s)>CHAR_SPACE && s>prevline;
   2.444 -	  s=g_utf8_prev_char(s))
   2.445 -	    ;
   2.446 +	s=prevline+strlen(prevline);
   2.447 +	do
   2.448 +	{
   2.449 +	    s=g_utf8_prev_char(s);
   2.450 +	    c=g_utf8_get_char(s);
   2.451 +	} while (CHAR_IS_CLOSING_QUOTE(c) && c>CHAR_SPACE && s>prevline);
   2.452  	for (;s>prevline;s=g_utf8_prev_char(s))
   2.453  	{
   2.454  	    if (g_unichar_isalpha(g_utf8_get_char(s)))
   2.455 @@ -2857,6 +2795,7 @@
   2.456  	g_tree_foreach(qword,report_duplicate_queries,NULL);
   2.457      g_tree_unref(qword);
   2.458      g_tree_unref(qperiod);
   2.459 +    counters_destroy(&counters);
   2.460      g_set_print_handler(NULL);
   2.461      print_as_windows_1252(NULL);
   2.462      if (pswit[MARKUP_SWITCH])  
   2.463 @@ -3066,10 +3005,10 @@
   2.464      }
   2.465      /* we didn't find a punctuated number - do the regular getword thing */
   2.466      g_string_truncate(word,0);
   2.467 -    for (;g_unichar_isdigit(g_utf8_get_char(*ptr)) ||
   2.468 -      g_unichar_isalpha(g_utf8_get_char(*ptr)) ||
   2.469 -      g_utf8_get_char(*ptr)=='\'';*ptr=g_utf8_next_char(*ptr))
   2.470 -	g_string_append_unichar(word,g_utf8_get_char(*ptr));
   2.471 +    c=g_utf8_get_char(*ptr);
   2.472 +    for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);
   2.473 +      *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))
   2.474 +	g_string_append_unichar(word,c);
   2.475      return g_string_free(word,FALSE);
   2.476  }
   2.477  
     3.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     3.2 +++ b/bookloupe/bookloupe.h	Sat Sep 21 23:40:18 2013 +0100
     3.3 @@ -0,0 +1,87 @@
     3.4 +#ifndef BOOKLOUPE_H
     3.5 +#define BOOKLOUPE_H
     3.6 +
     3.7 +/* special characters */
     3.8 +#define CHAR_SPACE	  32
     3.9 +#define CHAR_TAB	   9
    3.10 +#define CHAR_LF		  10
    3.11 +#define CHAR_CR		  13
    3.12 +#define CHAR_DQUOTE	  34
    3.13 +#define CHAR_SQUOTE	  39
    3.14 +#define CHAR_OPEN_SQUOTE  96
    3.15 +#define CHAR_TILDE	 126
    3.16 +#define CHAR_ASTERISK	  42
    3.17 +#define CHAR_FORESLASH	  47
    3.18 +#define CHAR_CARAT	  94
    3.19 +
    3.20 +#define CHAR_UNDERSCORE    '_'
    3.21 +#define CHAR_OPEN_CBRACK   '{'
    3.22 +#define CHAR_CLOSE_CBRACK  '}'
    3.23 +#define CHAR_OPEN_RBRACK   '('
    3.24 +#define CHAR_CLOSE_RBRACK  ')'
    3.25 +#define CHAR_OPEN_SBRACK   '['
    3.26 +#define CHAR_CLOSE_SBRACK  ']'
    3.27 +
    3.28 +#define CHAR_LS_QUOTE	  0x2018
    3.29 +#define CHAR_RS_QUOTE	  0x2019
    3.30 +
    3.31 +#define CHAR_IS_SQUOTE(c)	((c)==CHAR_SQUOTE || (c)==CHAR_OPEN_SQUOTE || \
    3.32 +				 (c)==CHAR_LS_QUOTE || (c)==CHAR_RS_QUOTE)
    3.33 +
    3.34 +#define CHAR_IS_APOSTROPHE(c)	((c)==CHAR_SQUOTE || (c)==CHAR_RS_QUOTE)
    3.35 +
    3.36 +#define CHAR_IS_CLOSING_QUOTE(c) \
    3.37 +    ((c)==CHAR_DQUOTE || (c)==CHAR_SQUOTE || (c)==CHAR_RS_QUOTE)
    3.38 +
    3.39 +/* longest and shortest normal PG line lengths */
    3.40 +#define LONGEST_PG_LINE   75
    3.41 +#define WAY_TOO_LONG      80
    3.42 +#define SHORTEST_PG_LINE  55
    3.43 +
    3.44 +enum {
    3.45 +    ECHO_SWITCH,
    3.46 +    SQUOTE_SWITCH,
    3.47 +    TYPO_SWITCH,
    3.48 +    QPARA_SWITCH,
    3.49 +    PARANOID_SWITCH,
    3.50 +    LINE_END_SWITCH,
    3.51 +    OVERVIEW_SWITCH,
    3.52 +    STDOUT_SWITCH,
    3.53 +    HEADER_SWITCH,
    3.54 +    WEB_SWITCH,
    3.55 +    VERBOSE_SWITCH,
    3.56 +    MARKUP_SWITCH,
    3.57 +    USERTYPO_SWITCH,
    3.58 +    DP_SWITCH,
    3.59 +    SWITNO
    3.60 +};
    3.61 +
    3.62 +struct first_pass_results {
    3.63 +    long firstline,astline;
    3.64 +    long footerline,totlen,binlen,alphalen,endquote_count,shortline,dotcomma;
    3.65 +    long fslashline,hyphens,longline,verylongline,htmcount,standalone_digit;
    3.66 +    long spacedash,emdash,space_emdash,non_PG_space_emdash,PG_space_emdash;
    3.67 +    int Dutchcount,Frenchcount;
    3.68 +};
    3.69 +
    3.70 +struct warnings {
    3.71 +    int shortline,longline,bin,dash,dotcomma,ast,fslash,digit,hyphen;
    3.72 +    int endquote;
    3.73 +    gboolean isDutch,isFrench;
    3.74 +};
    3.75 +
    3.76 +struct line_properties {
    3.77 +    unsigned int len,blen;
    3.78 +    gunichar start;
    3.79 +};
    3.80 +
    3.81 +struct parities {
    3.82 +    int dquote,squote;
    3.83 +};
    3.84 +
    3.85 +struct pending {
    3.86 +    char *dquote,*squote,*rbrack,*sbrack,*cbrack,*unders;
    3.87 +    long squot;
    3.88 +};
    3.89 +
    3.90 +#endif /* BOOKLOUPE_H */
     4.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     4.2 +++ b/bookloupe/counters.c	Sat Sep 21 23:40:18 2013 +0100
     4.3 @@ -0,0 +1,106 @@
     4.4 +#include <stdlib.h>
     4.5 +#include <glib.h>
     4.6 +#include "bookloupe.h"
     4.7 +#include "counters.h"
     4.8 +
     4.9 +struct matching_counter {
    4.10 +    int open,close;
    4.11 +};
    4.12 +
    4.13 +static struct matching_counter *matching_counter_new(void)
    4.14 +{
    4.15 +    return g_slice_new0(struct matching_counter);
    4.16 +}
    4.17 +
    4.18 +static void matching_counter_free(struct matching_counter *counter)
    4.19 +{
    4.20 +    g_slice_free(struct matching_counter,counter);
    4.21 +}
    4.22 +
    4.23 +static gint compar_unichars(gconstpointer a,gconstpointer b,gpointer unused)
    4.24 +{
    4.25 +    /*
    4.26 +     * Unicode code points only go up to 0x10FFFF and thus this cannot overflow.
    4.27 +     */
    4.28 +    return GPOINTER_TO_INT(a)-GPOINTER_TO_INT(b);
    4.29 +}
    4.30 +
    4.31 +/*
    4.32 + * Count opens and closes per matching pair, e.g. a [ counts as an open and
    4.33 + * a ] as a close of the same pair. matching_key() selects an arbitrary base
    4.34 + * character to represent a pair: the lower code point of a mirrored pair,
    4.35 + * CHAR_SQUOTE for ' and `, CHAR_LS_QUOTE for curved single quotes. Any other
    4.36 + * character is its own key (after a warning printing its code point in hex).
    4.37 + */
    4.38 +static gpointer matching_key(gunichar ch)
    4.39 +{
    4.40 +    gunichar mirrored;
    4.41 +    if (g_unichar_get_mirror_char(ch,&mirrored))
    4.42 +	if (ch<mirrored)
    4.43 +	    return GINT_TO_POINTER((gint)ch);
    4.44 +	else
    4.45 +	    return GINT_TO_POINTER((gint)mirrored);
    4.46 +    else if (ch==CHAR_SQUOTE || ch==CHAR_OPEN_SQUOTE)
    4.47 +	return GINT_TO_POINTER((gint)CHAR_SQUOTE);
    4.48 +    else if (ch==CHAR_LS_QUOTE || ch==CHAR_RS_QUOTE)
    4.49 +	return GINT_TO_POINTER((gint)CHAR_LS_QUOTE);
    4.50 +    else
    4.51 +    {
    4.52 +	g_warning("Matching pair not found for U+%04"G_GINT32_MODIFIER"X",ch);
    4.53 +	return GINT_TO_POINTER((gint)ch);
    4.54 +    }
    4.55 +}
    4.56 +
    4.57 +void increment_matching(struct counters *counters,gunichar ch,gboolean open)
    4.58 +{
    4.59 +    gpointer key,orig_key;
    4.60 +    struct matching_counter *value;
    4.61 +    if (!counters->matching)
    4.62 +	counters->matching=g_tree_new_full(compar_unichars,NULL,NULL,
    4.63 +	  (GDestroyNotify)matching_counter_free);
    4.64 +    key=matching_key(ch);
    4.65 +    if (!g_tree_lookup_extended(counters->matching,key,&orig_key,
    4.66 +      (gpointer *)&value))
    4.67 +    {
    4.68 +	value=matching_counter_new();
    4.69 +	g_tree_insert(counters->matching,key,value);
    4.70 +    }
    4.71 +    if (open)
    4.72 +	value->open++;
    4.73 +    else
    4.74 +	value->close++;
    4.75 +}
    4.76 +
    4.77 +int matching_count(const struct counters *counters,gunichar ch,gboolean open)
    4.78 +{
    4.79 +    struct matching_counter *value;
    4.80 +    if (!counters->matching)
    4.81 +	return 0;
    4.82 +    value=g_tree_lookup(counters->matching,matching_key(ch));
    4.83 +    if (!value)
    4.84 +	return 0;
    4.85 +    return open?value->open:value->close;
    4.86 +}
    4.87 +
    4.88 +/*
    4.89 + * Return open count - closed count
    4.90 + */
    4.91 +int matching_difference(const struct counters *counters,gunichar ch)
    4.92 +{
    4.93 +    struct matching_counter *value;
    4.94 +    if (!counters->matching)
    4.95 +	return 0;
    4.96 +    value=g_tree_lookup(counters->matching,matching_key(ch));
    4.97 +    if (!value)
    4.98 +	return 0;
    4.99 +    return value->open-value->close;
   4.100 +}
   4.101 +
   4.102 +void counters_destroy(struct counters *counters)
   4.103 +{
   4.104 +    if (counters->matching)
   4.105 +    {
   4.106 +	g_tree_destroy(counters->matching);
   4.107 +	counters->matching=NULL;
   4.108 +    }
   4.109 +}
     5.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     5.2 +++ b/bookloupe/counters.h	Sat Sep 21 23:40:18 2013 +0100
     5.3 @@ -0,0 +1,17 @@
     5.4 +#ifndef COUNTERS_H
     5.5 +#define COUNTERS_H
     5.6 +
     5.7 +#include <glib.h>
     5.8 +
     5.9 +struct counters {
    5.10 +    GTree *matching;
    5.11 +    long quot;
    5.12 +    int c_unders;
    5.13 +};
    5.14 +
    5.15 +void increment_matching(struct counters *counters,gunichar ch,gboolean open);
    5.16 +int matching_count(const struct counters *counters,gunichar ch,gboolean open);
    5.17 +int matching_difference(const struct counters *counters,gunichar ch);
    5.18 +void counters_destroy(struct counters *counters);
    5.19 +
    5.20 +#endif /* COUNTERS_H */
     6.1 --- a/doc/bookloupe.txt	Tue Sep 17 20:55:57 2013 +0100
     6.2 +++ b/doc/bookloupe.txt	Sat Sep 21 23:40:18 2013 +0100
     6.3 @@ -77,8 +77,8 @@
     6.4        to see all unclosed quotes, even where the next paragraph
     6.5        begins with a quote, you should use the -p switch.
     6.6  
     6.7 -      Singlequotes (') are a problem, since the same character
     6.8 -      is used for an apostrophe. I'm not sure that it is 
     6.9 +      Singlequotes (' and ’) are a problem, since the same
    6.10 +      character is used for an apostrophe. I'm not sure that it is
    6.11        possible to get 100% accuracy on singlequotes checking,
    6.12        particularly since dialect, quite common in PG texts,
    6.13        upsets the normal rules so badly. Consider the sentence:
     7.1 --- a/test/bookloupe/Makefile.am	Tue Sep 17 20:55:57 2013 +0100
     7.2 +++ b/test/bookloupe/Makefile.am	Sat Sep 21 23:40:18 2013 +0100
     7.3 @@ -1,4 +1,5 @@
     7.4  TESTS_ENVIRONMENT=BOOKLOUPE=../../bookloupe/bookloupe ../harness/loupe-test
     7.5 -TESTS=non-ascii.tst long-line.tst
     7.6 +TESTS=non-ascii.tst long-line.tst curved-single-quotes.tst \
     7.7 +	curved-genitives.tst
     7.8  
     7.9  dist_pkgdata_DATA=$(TESTS)
     8.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     8.2 +++ b/test/bookloupe/curved-genitives.tst	Sat Sep 21 23:40:18 2013 +0100
     8.3 @@ -0,0 +1,12 @@
     8.4 +**************** INPUT ****************
     8.5 +The genitive case of single nouns is normally formed like this:
     8.6 +
     8.7 +The fireworks known as Serpent’s Eggs, or PHARAOH’S SERPENTS.
     8.8 +
     8.9 +What should never happen is something like this:
    8.10 +
    8.11 +At this suggestion Nellie’S face grew crimson.
    8.12 +**************** EXPECTED ****************
    8.13 +
    8.14 +At this suggestion Nellie’S face grew crimson.
    8.15 +    Line 7 column 27 - Capital "S"?
     9.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     9.2 +++ b/test/bookloupe/curved-single-quotes.tst	Sat Sep 21 23:40:18 2013 +0100
     9.3 @@ -0,0 +1,55 @@
     9.4 +**************** OPTIONS ****************
     9.5 +-s
     9.6 +**************** INPUT ****************
     9.7 +‘Now you should start for school, ’ Margaret said.
     9.8 +
     9.9 +‘In a moment,’ Peter replied,‘ I'm just coming.’
    9.10 +
    9.11 +‘Come on,’shouted Jane.
    9.12 +
    9.13 +‘Alright,’ said Peter, ‘Keep your hair on.
    9.14 +’ He looked down as he came round the corner.
    9.15 +‘Where's my coat? ’
    9.16 +
    9.17 +`Underneath the girls’ scarves.’ said his mother.
    9.18 +
    9.19 +Grabbing it, he joined the others as they set out.
    9.20 +**************** WARNINGS ****************
    9.21 +<expected>
    9.22 +  <error>
    9.23 +    <at line="1" column="34"/>
    9.24 +    <text>Spaced singlequote?</text>
    9.25 +  </error>
    9.26 +  <error>
    9.27 +    <at line="3" column="30"/>
    9.28 +    <text>Wrongspaced singlequotes?</text>
    9.29 +  </error>
    9.30 +  <error>
    9.31 +    <at line="5" column="10"/>
    9.32 +    <text>Wrongspaced singlequotes?</text>
    9.33 +  </error>
    9.34 +  <false-positive>
    9.35 +    <at line="6"/>
    9.36 +    <text>Mismatched singlequotes?</text>
    9.37 +  </false-positive>
    9.38 +  <error>
    9.39 +    <at line="8" column="1"/>
    9.40 +    <text>Spaced quote?</text>
    9.41 +  </error>
    9.42 +  <error>
    9.43 +    <at line="9" column="19"/>
    9.44 +    <text>Spaced quote?</text>
    9.45 +  </error>
    9.46 +  <false-positive>
    9.47 +    <at line="10"/>
    9.48 +    <text>Mismatched singlequotes?</text>
    9.49 +  </false-positive>
    9.50 +  <false-positive>
    9.51 +    <at line="11" column="32"/>
    9.52 +    <text>Wrongspaced singlequotes?</text>
    9.53 +  </false-positive>
    9.54 +  <error>
    9.55 +    <at line="12"/>
    9.56 +    <text>Mismatched singlequotes?</text>
    9.57 +  </error>
    9.58 +</expected>
    10.1 --- a/test/compatibility/Makefile.am	Tue Sep 17 20:55:57 2013 +0100
    10.2 +++ b/test/compatibility/Makefile.am	Sat Sep 21 23:40:18 2013 +0100
    10.3 @@ -7,6 +7,6 @@
    10.4  	dashes.tst control-characters.tst unusual-characters.tst \
    10.5  	windows-1252.tst periods.tst long-line.tst unmarked-paragraph.tst \
    10.6  	hebe-jeebies.tst mail-from.tst scannos.tst before-comma.tst \
    10.7 -	before-period.tst double-punctuation.tst genatives.tst embedded-cr.tst
    10.8 +	before-period.tst double-punctuation.tst genitives.tst embedded-cr.tst
    10.9  
   10.10  dist_pkgdata_DATA=$(TESTS)
    11.1 --- a/test/compatibility/genatives.tst	Tue Sep 17 20:55:57 2013 +0100
    11.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    11.3 @@ -1,12 +0,0 @@
    11.4 -**************** INPUT ****************
    11.5 -The genative case of single nouns is normally formed like this:
    11.6 -
    11.7 -The fireworks known as Serpent's Eggs, or PHARAOH'S SERPENTS.
    11.8 -
    11.9 -What should never happen is something like this:
   11.10 -
   11.11 -At this suggestion Nellie'S face grew crimson.
   11.12 -**************** EXPECTED ****************
   11.13 -
   11.14 -At this suggestion Nellie'S face grew crimson.
   11.15 -    Line 7 column 27 - Capital "S"?
    12.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    12.2 +++ b/test/compatibility/genitives.tst	Sat Sep 21 23:40:18 2013 +0100
    12.3 @@ -0,0 +1,12 @@
    12.4 +**************** INPUT ****************
    12.5 +The genitive case of single nouns is normally formed like this:
    12.6 +
    12.7 +The fireworks known as Serpent's Eggs, or PHARAOH'S SERPENTS.
    12.8 +
    12.9 +What should never happen is something like this:
   12.10 +
   12.11 +At this suggestion Nellie'S face grew crimson.
   12.12 +**************** EXPECTED ****************
   12.13 +
   12.14 +At this suggestion Nellie'S face grew crimson.
   12.15 +    Line 7 column 27 - Capital "S"?