Fix bug #11: Test for balanced "slanted" UTF-8 quotation marks 8220/8221
author ali <ali@juiblex.co.uk>
Tue Sep 24 22:28:02 2013 +0100 (2013-09-24)
changeset 111 f805130deb6f
parent 106 9fb13a5dde3b
child 112 12fdc390e6aa
Fix bug #11: Test for balanced "slanted" UTF-8 quotation marks 8220/8221
bl/Makefile.am
bl/bl.h
bl/utf8.c
bl/utf8.h
bookloupe/bookloupe.c
bookloupe/bookloupe.h
bookloupe/counters.c
bookloupe/counters.h
bookloupe/pending.c
bookloupe/pending.h
test/bookloupe/Makefile.am
test/bookloupe/curved-quotes.tst
test/compatibility/Makefile.am
test/compatibility/continuing-quotes.tst
     1.1 --- a/bl/Makefile.am	Mon Sep 23 21:18:27 2013 +0100
     1.2 +++ b/bl/Makefile.am	Tue Sep 24 22:28:02 2013 +0100
     1.3 @@ -4,4 +4,4 @@
     1.4  
     1.5  noinst_LTLIBRARIES=libbl.la
     1.6  libbl_la_SOURCES=bl.h textfileutils.c textfileutils.h spawn.c spawn.h \
     1.7 -	path.c path.h mkdtemp.c mkdtemp.h print.c print.h
     1.8 +	path.c path.h mkdtemp.c mkdtemp.h print.c print.h utf8.c utf8.h
     2.1 --- a/bl/bl.h	Mon Sep 23 21:18:27 2013 +0100
     2.2 +++ b/bl/bl.h	Tue Sep 24 22:28:02 2013 +0100
     2.3 @@ -3,3 +3,4 @@
     2.4  #include <bl/path.h>
     2.5  #include <bl/mkdtemp.h>
     2.6  #include <bl/print.h>
     2.7 +#include <bl/utf8.h>
     3.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     3.2 +++ b/bl/utf8.c	Tue Sep 24 22:28:02 2013 +0100
     3.3 @@ -0,0 +1,24 @@
     3.4 +#include <stdlib.h>
     3.5 +#include <string.h>
     3.6 +#include <glib.h>
     3.7 +#include <bl/bl.h>
     3.8 +
     3.9 +/*
    3.10 + * Creates a new string of length characters (not bytes), each fill_char.
    3.11 + * The returned string should be freed with g_free() when no longer needed.
    3.12 + */
    3.13 +gchar *utf8_strnfill(gsize length,gunichar fill_char)
    3.14 +{
    3.15 +    int n,i;
    3.16 +    gchar *s;
    3.17 +    char utf8[6];
    3.18 +    n=g_unichar_to_utf8(fill_char,utf8);
    3.19 +    s=g_new(gchar,length*n+1);
    3.20 +    if (n==1)
    3.21 +	memset(s,utf8[0],length);
    3.22 +    else
    3.23 +	for(i=0;i<length;i++)
    3.24 +	    memcpy(s+i*n,utf8,n);
    3.25 +    s[length*n]='\0';
    3.26 +    return s;
    3.27 +}
     4.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     4.2 +++ b/bl/utf8.h	Tue Sep 24 22:28:02 2013 +0100
     4.3 @@ -0,0 +1,6 @@
     4.4 +#ifndef BL_UTF8_H
     4.5 +#define BL_UTF8_H
     4.6 +
     4.7 +gchar *utf8_strnfill(gsize length,gunichar fill_char);
     4.8 +
     4.9 +#endif	/* BL_UTF8_H */
     5.1 --- a/bookloupe/bookloupe.c	Mon Sep 23 21:18:27 2013 +0100
     5.2 +++ b/bookloupe/bookloupe.c	Tue Sep 24 22:28:02 2013 +0100
     5.3 @@ -801,8 +801,10 @@
     5.4  	snext=g_utf8_next_char(s);
     5.5  	c=g_utf8_get_char(s);
     5.6  	if (c==CHAR_DQUOTE)
     5.7 -	    counters->quot++;
     5.8 -	if (CHAR_IS_SQUOTE(c))
     5.9 +	    increment_matching(counters,c,!matching_difference(counters,c));
    5.10 +	else if (CHAR_IS_DQUOTE(c))
    5.11 +	    increment_matching(counters,c,!CHAR_IS_CLOSING_QUOTE(c));
    5.12 +	else if (CHAR_IS_SQUOTE(c))
    5.13  	{
    5.14  	    if (s==aline)
    5.15  	    {
     6.1 --- a/bookloupe/bookloupe.h	Mon Sep 23 21:18:27 2013 +0100
     6.2 +++ b/bookloupe/bookloupe.h	Tue Sep 24 22:28:02 2013 +0100
     6.3 @@ -24,14 +24,20 @@
     6.4  
     6.5  #define CHAR_LS_QUOTE	  0x2018
     6.6  #define CHAR_RS_QUOTE	  0x2019
     6.7 +#define CHAR_LD_QUOTE	  0x201C
     6.8 +#define CHAR_RD_QUOTE	  0x201D
     6.9  
    6.10  #define CHAR_IS_SQUOTE(c)	((c)==CHAR_SQUOTE || (c)==CHAR_OPEN_SQUOTE || \
    6.11  				 (c)==CHAR_LS_QUOTE || (c)==CHAR_RS_QUOTE)
    6.12  
    6.13 +#define CHAR_IS_DQUOTE(c)	((c)==CHAR_DQUOTE || (c)==CHAR_LD_QUOTE || \
    6.14 +				 (c)==CHAR_RD_QUOTE)
    6.15 +
    6.16  #define CHAR_IS_APOSTROPHE(c)	((c)==CHAR_SQUOTE || (c)==CHAR_RS_QUOTE)
    6.17  
    6.18  #define CHAR_IS_CLOSING_QUOTE(c) \
    6.19 -    ((c)==CHAR_DQUOTE || (c)==CHAR_SQUOTE || (c)==CHAR_RS_QUOTE)
    6.20 +    ((c)==CHAR_DQUOTE || (c)==CHAR_RD_QUOTE || (c)==CHAR_SQUOTE || \
    6.21 +     (c)==CHAR_RS_QUOTE)
    6.22  
    6.23  /* longest and shortest normal PG line lengths */
    6.24  #define LONGEST_PG_LINE   75
     7.1 --- a/bookloupe/counters.c	Mon Sep 23 21:18:27 2013 +0100
     7.2 +++ b/bookloupe/counters.c	Tue Sep 24 22:28:02 2013 +0100
     7.3 @@ -45,11 +45,13 @@
     7.4  	return GINT_TO_POINTER((gint)CHAR_SQUOTE);
     7.5      else if (ch==CHAR_LS_QUOTE || ch==CHAR_RS_QUOTE)
     7.6  	return GINT_TO_POINTER((gint)CHAR_LS_QUOTE);
     7.7 +    else if (ch==CHAR_LD_QUOTE || ch==CHAR_RD_QUOTE)
     7.8 +	return GINT_TO_POINTER((gint)CHAR_LD_QUOTE);
     7.9 +    else if (ch==CHAR_DQUOTE)
    7.10 +	return GINT_TO_POINTER((gint)ch);
    7.11      else if (ch<0x4000 || ch-0x4000>=NO_SPECIAL_COUNTERS)
    7.12 -    {
    7.13  	g_warning("Matching pair not found for U+%04" G_GINT32_MODIFIER "X",ch);
    7.14 -	return GINT_TO_POINTER((gint)ch);
    7.15 -    }
    7.16 +    return GINT_TO_POINTER((gint)ch);
    7.17  }
    7.18  
    7.19  void increment_matching(struct counters *counters,gunichar ch,gboolean open)
     8.1 --- a/bookloupe/counters.h	Mon Sep 23 21:18:27 2013 +0100
     8.2 +++ b/bookloupe/counters.h	Tue Sep 24 22:28:02 2013 +0100
     8.3 @@ -11,7 +11,6 @@
     8.4  
     8.5  struct counters {
     8.6      GTree *matching;
     8.7 -    long quot;
     8.8      int c_unders;
     8.9  };
    8.10  
     9.1 --- a/bookloupe/pending.c	Mon Sep 23 21:18:27 2013 +0100
     9.2 +++ b/bookloupe/pending.c	Tue Sep 24 22:28:02 2013 +0100
     9.3 @@ -1,6 +1,7 @@
     9.4  #include <stdlib.h>
     9.5  #include <string.h>
     9.6  #include <glib.h>
     9.7 +#include <bl/bl.h>
     9.8  #include "bookloupe.h"
     9.9  #include "pending.h"
    9.10  
    9.11 @@ -15,20 +16,9 @@
    9.12  void print_pending(const char *aline,const char *parastart,
    9.13    struct pending *pending)
    9.14  {
    9.15 -    const char *s;
    9.16 -    gunichar c;
    9.17      if (aline)
    9.18 -    {
    9.19 -	s=aline;
    9.20 -	while (*s==' ')
    9.21 -	    s++;
    9.22 -	c=g_utf8_get_char(s);
    9.23 -    }
    9.24 -    else
    9.25 -    {
    9.26 -	s=NULL;
    9.27 -	c='\0';
    9.28 -    }
    9.29 +	while (g_unichar_isspace(g_utf8_get_char(aline)))
    9.30 +	    aline=g_utf8_next_char(aline);
    9.31      if (pending->illustration.warning_text)
    9.32      {
    9.33  	if (aline)
    9.34 @@ -54,7 +44,8 @@
    9.35      }
    9.36      if (pending->dquote)
    9.37      {
    9.38 -	if (c!=CHAR_DQUOTE || pswit[QPARA_SWITCH])
    9.39 +	if (!pending->continuing_quote || !aline ||
    9.40 +	  !g_str_has_prefix(aline,pending->continuing_quote))
    9.41  	{
    9.42  	    if (!pswit[OVERVIEW_SWITCH])
    9.43  	    {
    9.44 @@ -70,7 +61,8 @@
    9.45      }
    9.46      if (pending->squote)
    9.47      {
    9.48 -	if (!CHAR_IS_SQUOTE(c) || pswit[QPARA_SWITCH] || pending->squot)
    9.49 +	if (!pending->continuing_quote ||
    9.50 +	  !g_str_has_prefix(aline,pending->continuing_quote))
    9.51  	{
    9.52  	    if (!pswit[OVERVIEW_SWITCH])
    9.53  	    {
    9.54 @@ -84,6 +76,8 @@
    9.55  	g_free(pending->squote);
    9.56  	pending->squote=NULL;
    9.57      }
    9.58 +    g_free(pending->continuing_quote);
    9.59 +    pending->continuing_quote=NULL;
    9.60      if (pending->rbrack)
    9.61      {
    9.62  	if (!pswit[OVERVIEW_SWITCH])
    9.63 @@ -159,34 +153,58 @@
    9.64   * quotes on _every_ paragraph, whether the next begins with a
    9.65   * quote or not.
    9.66   */
    9.67 -void check_for_mismatched_quotes(const struct counters *counters,
    9.68 +void check_for_mismatched_quotes(struct counters *counters,
    9.69    struct pending *pending)
    9.70  {
    9.71 -    int squote_straight,squote_curved,difference;
    9.72 -    if (counters->quot%2)
    9.73 +    int quote_straight,quote_curved,difference;
    9.74 +    quote_straight=matching_difference(counters,CHAR_DQUOTE);
    9.75 +    quote_curved=matching_difference(counters,CHAR_LD_QUOTE);
    9.76 +    if (quote_straight || quote_curved)
    9.77 +    {
    9.78  	pending->dquote=
    9.79  	  g_strdup_printf("    Line %ld - Mismatched quotes",linecnt);
    9.80 +	if (pswit[QPARA_SWITCH] || quote_curved && quote_curved!=1 ||
    9.81 +	  quote_straight && quote_curved)
    9.82 +	    /*
    9.83 +	     * Flag it to be noted regardless of the
    9.84 +	     * first line of the next para.
    9.85 +	     */
    9.86 +	    pending->continuing_quote=NULL;
    9.87 +	else if (quote_straight)
    9.88 +	    pending->continuing_quote=utf8_strnfill(quote_straight,CHAR_DQUOTE);
    9.89 +	else
    9.90 +	    pending->continuing_quote=utf8_strnfill(quote_curved,CHAR_LD_QUOTE);
    9.91 +    }
    9.92      if (pswit[SQUOTE_SWITCH])
    9.93      {
    9.94  	if (matching_count(counters,CHAR_SQUOTE,TRUE))
    9.95 -	    squote_straight=matching_difference(counters,CHAR_SQUOTE);
    9.96 +	    quote_straight=matching_difference(counters,CHAR_SQUOTE);
    9.97  	else
    9.98 -	    squote_straight=0;
    9.99 +	    quote_straight=0;
   9.100  	if (matching_count(counters,CHAR_LS_QUOTE,TRUE))
   9.101 -	    squote_curved=matching_difference(counters,CHAR_LS_QUOTE);
   9.102 +	    quote_curved=matching_difference(counters,CHAR_LS_QUOTE);
   9.103  	else
   9.104 -	    squote_curved=0;
   9.105 -	if (squote_straight || squote_curved)
   9.106 +	    quote_curved=0;
   9.107 +	if (quote_straight || quote_curved)
   9.108  	    pending->squote=
   9.109  	      g_strdup_printf("    Line %ld - Mismatched singlequotes?",
   9.110  	      linecnt);
   9.111 -	if (squote_straight && squote_straight!=1 ||
   9.112 -	  squote_curved && squote_curved!=1)
   9.113 +	if (pending->continuing_quote)
   9.114 +	{
   9.115  	    /*
   9.116  	     * Flag it to be noted regardless of the
   9.117 -	     * first char of the next para.
   9.118 +	     * first line of the next para.
   9.119  	     */
   9.120 -	    pending->squot=1;
   9.121 +	    g_free(pending->continuing_quote);
   9.122 +	    pending->continuing_quote=NULL;
   9.123 +	}
   9.124 +	if (pswit[QPARA_SWITCH] || quote_straight && quote_straight!=1 ||
   9.125 +	  quote_curved && quote_curved!=1 || quote_straight && quote_curved)
   9.126 +	    pending->continuing_quote=NULL;
   9.127 +	else if (quote_straight)
   9.128 +	    pending->continuing_quote=utf8_strnfill(quote_straight,CHAR_SQUOTE);
   9.129 +	else
   9.130 +	    pending->continuing_quote=utf8_strnfill(quote_curved,CHAR_LS_QUOTE);
   9.131      }
   9.132      difference=matching_difference(counters,COUNTER_ILLUSTRATION);
   9.133      if (difference)
    10.1 --- a/bookloupe/pending.h	Mon Sep 23 21:18:27 2013 +0100
    10.2 +++ b/bookloupe/pending.h	Tue Sep 24 22:28:02 2013 +0100
    10.3 @@ -10,14 +10,14 @@
    10.4  
    10.5  struct pending {
    10.6      char *dquote,*squote,*rbrack,*sbrack,*cbrack,*unders;
    10.7 -    long squot;
    10.8 +    char *continuing_quote;
    10.9      struct pending_warning illustration;
   10.10  };
   10.11  
   10.12  void print_pending(const char *aline,const char *parastart,
   10.13    struct pending *pending);
   10.14  void reset_pending(struct pending *pending);
   10.15 -void check_for_mismatched_quotes(const struct counters *counters,
   10.16 +void check_for_mismatched_quotes(struct counters *counters,
   10.17    struct pending *pending);
   10.18  
   10.19  #endif /* PENDING_H */
    11.1 --- a/test/bookloupe/Makefile.am	Mon Sep 23 21:18:27 2013 +0100
    11.2 +++ b/test/bookloupe/Makefile.am	Tue Sep 24 22:28:02 2013 +0100
    11.3 @@ -1,5 +1,5 @@
    11.4  TESTS_ENVIRONMENT=BOOKLOUPE=../../bookloupe/bookloupe ../harness/loupe-test
    11.5 -TESTS=non-ascii.tst long-line.tst curved-single-quotes.tst \
    11.6 +TESTS=non-ascii.tst long-line.tst curved-single-quotes.tst curved-quotes.tst \
    11.7  	curved-genitives.tst multi-line-illustration.tst
    11.8  
    11.9  dist_pkgdata_DATA=$(TESTS)
    12.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    12.2 +++ b/test/bookloupe/curved-quotes.tst	Tue Sep 24 22:28:02 2013 +0100
    12.3 @@ -0,0 +1,35 @@
    12.4 +**************** INPUT ****************
    12.5 +When Tom had made fast his motorboat, he went to the rowing craft to
    12.6 +see if it was in good condition. He saw a piece of paper on one of the
    12.7 +seats, held down by a little stone. Picking it up he read:
    12.8 +
    12.9 +     “Many thanks for the use of your boat. I had a fine row, and
   12.10 +     I feel better, though I’m as much up a tree as ever. I hope
   12.11 +     to see you again, sometime. If ever you are near Elmwood Hall,
   12.12 +     look me up.
   12.13 +
   12.14 +     “BRUCE BENNINGTON.”
   12.15 +
   12.16 +That was nice of him,” remarked Will, as Tom showed him the note.
   12.17 +
   12.18 +“And he didn’t damage your boat any," spoke Dick.
   12.19 +
   12.20 +"No, he knows how to handle ’em--he rows on the Elmwood Hall crew,” said
   12.21 +Tom. “Well, so long, fellows. I’m going for a long run to-morrow, if
   12.22 +you’d like to come.”
   12.23 +
   12.24 +“Sure! they chorused.
   12.25 +**************** WARNINGS ****************
   12.26 +<expected>
   12.27 +  <error>
   12.28 +    <at line="13"/>
   12.29 +    <at line="15"/>
   12.30 +    <at line="19"/>
   12.31 +    <at line="21"/>
   12.32 +    <text>Mismatched quotes</text>
   12.33 +  </error>
   12.34 +  <false-positive>
   12.35 +    <at line="14" column="37"/>
   12.36 +    <text>Wrongspaced quotes?</text>
   12.37 +  </false-positive>
   12.38 +</expected>
    13.1 --- a/test/compatibility/Makefile.am	Mon Sep 23 21:18:27 2013 +0100
    13.2 +++ b/test/compatibility/Makefile.am	Tue Sep 24 22:28:02 2013 +0100
    13.3 @@ -7,6 +7,7 @@
    13.4  	dashes.tst control-characters.tst unusual-characters.tst \
    13.5  	windows-1252.tst periods.tst long-line.tst unmarked-paragraph.tst \
    13.6  	hebe-jeebies.tst mail-from.tst scannos.tst before-comma.tst \
    13.7 -	before-period.tst double-punctuation.tst genitives.tst embedded-cr.tst
    13.8 +	before-period.tst double-punctuation.tst genitives.tst embedded-cr.tst \
    13.9 +	continuing-quotes.tst
   13.10  
   13.11  dist_pkgdata_DATA=$(TESTS)
    14.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    14.2 +++ b/test/compatibility/continuing-quotes.tst	Tue Sep 24 22:28:02 2013 +0100
    14.3 @@ -0,0 +1,14 @@
    14.4 +**************** INPUT ****************
    14.5 +When Tom had made fast his motorboat, he went to the rowing craft to
    14.6 +see if it was in good condition. He saw a piece of paper on one of the
    14.7 +seats, held down by a little stone. Picking it up he read:
    14.8 +
    14.9 +     "Many thanks for the use of your boat. I had a fine row, and
   14.10 +     I feel better, though I'm as much up a tree as ever. I hope
   14.11 +     to see you again, sometime. If ever you are near Elmwood Hall,
   14.12 +     look me up.
   14.13 +
   14.14 +     "BRUCE BENNINGTON."
   14.15 +
   14.16 +"That was nice of him," remarked Will, as Tom showed him the note.
   14.17 +**************** EXPECTED ****************