Fix bug #26: Partially emphasized words
author ali <ali@juiblex.co.uk>
Wed Oct 30 17:11:11 2013 +0000 (2013-10-30)
changeset 211 7f0bbee5c8b0
parent 210 2d48e8cdda24
child 212 aece0899b1d3
Fix bug #26: Partially emphasized words
bookloupe/bookloupe.c
test/bookloupe/Makefile.am
test/bookloupe/partial-underlining.tst
test/compatibility/brackets.tst
     1.1 --- a/bookloupe/bookloupe.c	Wed Oct 02 09:14:33 2013 +0100
     1.2 +++ b/bookloupe/bookloupe.c	Wed Oct 30 17:11:11 2013 +0000
     1.3 @@ -250,7 +250,7 @@
     1.4  gchar *running_from;
     1.5  
     1.6  gboolean mixdigit(const char *);
     1.7 -gchar *getaword(const char **);
     1.8 +gchar *getaword(const char *,const char **);
     1.9  char *flgets(char **,long,int);
    1.10  void postprocess_for_HTML(char *);
    1.11  char *linehasmarkup(char *);
    1.12 @@ -977,7 +977,7 @@
    1.13  	    results.emdash.PG_space++;
    1.14  	for (s=lines[j];*s;)
    1.15  	{
    1.16 -	    inword=getaword(&s);
    1.17 +	    inword=getaword(NULL,&s);
    1.18  	    if (!strcmp(inword,"hij") || !strcmp(inword,"niet")) 
    1.19  		results.Dutchcount++;
    1.20  	    if (!strcmp(inword,"dans") || !strcmp(inword,"avec")) 
    1.21 @@ -2002,7 +2002,7 @@
    1.22  	for (s=aline;*s;)
    1.23  	{
    1.24  	    wordstart=s;
    1.25 -	    t=getaword(&s);
    1.26 +	    t=getaword(NULL,&s);
    1.27  	    if (!*t)
    1.28  	    {
    1.29  		g_free(t);
    1.30 @@ -2052,8 +2052,9 @@
    1.31  /*
    1.32   * check_for_typos:
    1.33   *
    1.34 - * Check for commonly mistyped words,
    1.35 - * and digits like 0 for O in a word.
    1.36 + * Check for commonly mistyped words, and digits like 0 for O in a word.
    1.37 + * Note that somewhat confusingly, this is also where we call getaword()
    1.38 + * with a non-NULL line so that it will issue warnings.
    1.39   */
    1.40  void check_for_typos(const char *aline,struct warnings *warnings)
    1.41  {
    1.42 @@ -2069,7 +2070,7 @@
    1.43      for (s=aline;*s;)
    1.44      {
    1.45  	wordstart=s;
    1.46 -	inword=getaword(&s);
    1.47 +	inword=getaword(aline,&s);
    1.48  	if (!*inword)
    1.49  	{
    1.50  	    g_free(inword);
    1.51 @@ -2318,7 +2319,7 @@
    1.52  	     * If there are letters on both sides of it or
    1.53  	     * if it's strict punctuation followed by an alpha.
    1.54  	     */
    1.55 -	    if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
    1.56 +	    if (c!='_' && g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
    1.57  	      g_utf8_strchr("?!,;:",-1,c)))
    1.58  	    {
    1.59  		if (c=='.')
    1.60 @@ -3419,14 +3420,18 @@
    1.61   * A word is defined as one English word unit--or at least that's the aim.
    1.62   * "ptr" is advanced to the position in the line where we will start
    1.63   * looking for the next word.
    1.64 + * If line is non-NULL, then it will be used to derive the column numbers for
    1.65 + * any warnings issued. If line is NULL, then warnings will be suppressed.
    1.66   *
    1.67   * Returns: A newly-allocated string.
    1.68   */
    1.69 -gchar *getaword(const char **ptr)
    1.70 +gchar *getaword(const char *line,const char **ptr)
    1.71  {
    1.72 -    const char *s,*t;
    1.73 +    const char *s,*t,*t2;
    1.74      GString *word;
    1.75      gunichar c,pc;
    1.76 +    int adjust;
    1.77 +    gboolean initial_underlining=FALSE;
    1.78      word=g_string_new(NULL);
    1.79      for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
    1.80        !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
    1.81 @@ -3448,6 +3453,7 @@
    1.82  	    else
    1.83  		g_string_truncate(word,0);
    1.84  	}
    1.85 +	initial_underlining=g_utf8_get_char(*ptr)=='_';
    1.86      }
    1.87      /*
    1.88       * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
    1.89 @@ -3477,10 +3483,81 @@
    1.90      }
    1.91      /* we didn't find a punctuated number - do the regular getword thing */
    1.92      g_string_truncate(word,0);
    1.93 -    c=g_utf8_get_char(*ptr);
    1.94 -    for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);
    1.95 -      *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))
    1.96 +    s=*ptr;
    1.97 +    c=g_utf8_get_char(s);
    1.98 +    for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || c=='_' ||
    1.99 +      CHAR_IS_APOSTROPHE(c); s=g_utf8_next_char(s),c=g_utf8_get_char(s))
   1.100  	g_string_append_unichar(word,c);
   1.101 +    if (initial_underlining && word->str[word->len-1]=='_')
   1.102 +    {
   1.103 +	/* _Simple_ or _Old-school_underlining_ */
   1.104 +	t=strchr(*ptr,'_');
   1.105 +	g_string_truncate(word,t-*ptr);
   1.106 +	*ptr=t;
   1.107 +    }
   1.108 +    else if (initial_underlining || (t=strchr(word->str,'_')))
   1.109 +    {
   1.110 +	/* Part_ial_ underlining */
   1.111 +	adjust=0;
   1.112 +	if (initial_underlining)
   1.113 +	{
   1.114 +	    t2=strchr(word->str,'_');
   1.115 +	    if (t2)
   1.116 +	    {
   1.117 +		g_string_erase(word,t2-word->str,1);
   1.118 +		adjust++;
   1.119 +	    }
   1.120 +	    else
   1.121 +	    {
   1.122 +		if (line)
   1.123 +		{
   1.124 +		    if (pswit[ECHO_SWITCH])
   1.125 +			g_print("\n%s\n",line);
   1.126 +		    if (!pswit[OVERVIEW_SWITCH])
   1.127 +			g_print("    Line %ld column %ld - "
   1.128 +			  "Missing space or underscore?\n",linecnt,
   1.129 +			  g_utf8_pointer_to_offset(line,*ptr));
   1.130 +		    else
   1.131 +			cnt_punct++;
   1.132 +		}
   1.133 +		*ptr=s;
   1.134 +		return g_string_free(word,FALSE);
   1.135 +	    }
   1.136 +	}
   1.137 +	while ((t=strchr(word->str,'_')))
   1.138 +	{
   1.139 +	    t2=strchr(t+1,'_');
   1.140 +	    if (t2)
   1.141 +	    {
   1.142 +		g_string_erase(word,t-word->str,1);
   1.143 +		t2--;
   1.144 +		g_string_erase(word,t2-word->str,1);
   1.145 +		adjust+=2;
   1.146 +	    }
   1.147 +	    else
   1.148 +	    {
   1.149 +		g_string_truncate(word,t-word->str);
   1.150 +		adjust+=g_utf8_pointer_to_offset(word->str,t);
   1.151 +		*ptr=g_utf8_offset_to_pointer(*ptr,adjust);
   1.152 +		if (line)
   1.153 +		{
   1.154 +		    if (pswit[ECHO_SWITCH])
   1.155 +			g_print("\n%s\n",line);
   1.156 +		    if (!pswit[OVERVIEW_SWITCH])
   1.157 +			g_print("    Line %ld column %ld - "
   1.158 +			  "Missing space or underscore?\n",linecnt,
   1.159 +			  g_utf8_pointer_to_offset(line,*ptr)+1);
   1.160 +		    else
   1.161 +			cnt_punct++;
   1.162 +		}
   1.163 +		return g_string_free(word,FALSE);
   1.164 +	    }
   1.165 +	}
   1.166 +	*ptr=s;
   1.167 +    }
   1.168 +    else
   1.169 +	/* No underlining */
   1.170 +	*ptr=s;
   1.171      return g_string_free(word,FALSE);
   1.172  }
   1.173  
     2.1 --- a/test/bookloupe/Makefile.am	Wed Oct 02 09:14:33 2013 +0100
     2.2 +++ b/test/bookloupe/Makefile.am	Wed Oct 30 17:11:11 2013 +0000
     2.3 @@ -3,6 +3,7 @@
     2.4  	runfox-quotes.tst curved-genitives.tst multi-line-illustration.tst \
     2.5  	emdash.tst config-internal.tst config-default.tst config-user.tst \
     2.6  	config-override.tst charset-cp1252.tst charset-latin1.tst \
     2.7 -	footnote-marker.tst unix-lineends.tst os9-lineends.tst dot-comma.tst
     2.8 +	footnote-marker.tst unix-lineends.tst os9-lineends.tst dot-comma.tst \
     2.9 +	partial-underlining.tst
    2.10  
    2.11  dist_pkgdata_DATA=$(TESTS)
     3.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     3.2 +++ b/test/bookloupe/partial-underlining.tst	Wed Oct 30 17:11:11 2013 +0000
     3.3 @@ -0,0 +1,29 @@
     3.4 +**************** INPUT ****************
     3.5 +Bookloupe understands simple underlining, for example, a _bd_ word as
     3.6 +well as old-school underlining, for example, _a_pr_word_.
     3.7 +
     3.8 +It also understands partial underlining, as in l'_Adthima_, Abag_ae_l,
     3.9 +_ph_antasm, and even _ph_antasi_z_e.
    3.10 +
    3.11 +While warnings about missing spaces around underscores are generally
    3.12 +suppressed, partial underlining with_an odd number of un_der_scor_es
    3.13 +will still be warned about.
    3.14 +**************** WARNINGS ****************
    3.15 +<expected>
    3.16 +  <error>
    3.17 +    <at line="1" column="57"/>
    3.18 +    <text>Query word bd - not reporting duplicates</text>
    3.19 +  </error>
    3.20 +  <error>
    3.21 +    <at line="2" column="48"/>
    3.22 +    <text>Query word pr - not reporting duplicates</text>
    3.23 +  </error>
    3.24 +  <error>
    3.25 +    <at line="8" column="37"/>
    3.26 +    <text>Missing space or underscore?</text>
    3.27 +  </error>
    3.28 +  <error>
    3.29 +    <at line="8" column="66"/>
    3.30 +    <text>Missing space or underscore?</text>
    3.31 +  </error>
    3.32 +</expected>
     4.1 --- a/test/compatibility/brackets.tst	Wed Oct 02 09:14:33 2013 +0100
     4.2 +++ b/test/compatibility/brackets.tst	Wed Oct 30 17:11:11 2013 +0000
     4.3 @@ -14,31 +14,46 @@
     4.4  This _very_ important_ paragraph has an odd number of underscores.
     4.5  
     4.6  Unspaced brackets are a[most a]ways _wrong_.
     4.7 -**************** EXPECTED ****************
     4.8 -
     4.9 -This (excellent paragraph has one more {opening} paranthesis than closing.
    4.10 -    Line 2 - Mismatched round brackets?
    4.11 -
    4.12 -On the other hand, this poor) paragraph does it backwards.
    4.13 -    Line 4 - Mismatched round brackets?
    4.14 -
    4.15 -This {slightly odd paragraph has one more [opening] brace than closing.
    4.16 -    Line 6 - Mismatched curly brackets?
    4.17 -
    4.18 -And again, this balmy} paragraph does it backwards.
    4.19 -    Line 8 - Mismatched curly brackets?
    4.20 -
    4.21 -This paragraph[11 has one more (opening) bracket than closing.
    4.22 -    Line 10 - Mismatched square brackets?
    4.23 -
    4.24 -Whereas this one is 12]tupsy turvey.
    4.25 -    Line 12 - Mismatched square brackets?
    4.26 -
    4.27 -This _very_ important_ paragraph has an odd number of underscores.
    4.28 -    Line 14 - Mismatched underscores?
    4.29 -
    4.30 -Unspaced brackets are a[most a]ways _wrong_.
    4.31 -    Line 15 column 23 - Unspaced bracket?
    4.32 -
    4.33 -Unspaced brackets are a[most a]ways _wrong_.
    4.34 -    Line 15 column 30 - Unspaced bracket?
    4.35 +**************** WARNINGS ****************
    4.36 +<expected>
    4.37 +  <error>
    4.38 +    <at line="2"/>
    4.39 +    <text>Mismatched round brackets?</text>
    4.40 +  </error>
    4.41 +  <error>
    4.42 +    <at line="4"/>
    4.43 +    <text>Mismatched round brackets?</text>
    4.44 +  </error>
    4.45 +  <error>
    4.46 +    <at line="6"/>
    4.47 +    <text>Mismatched curly brackets?</text>
    4.48 +  </error>
    4.49 +  <error>
    4.50 +    <at line="8"/>
    4.51 +    <text>Mismatched curly brackets?</text>
    4.52 +  </error>
    4.53 +  <error>
    4.54 +    <at line="10"/>
    4.55 +    <text>Mismatched square brackets?</text>
    4.56 +  </error>
    4.57 +  <error>
    4.58 +    <at line="12"/>
    4.59 +    <text>Mismatched square brackets?</text>
    4.60 +  </error>
    4.61 +  <false-negative>
    4.62 +    <at line="13" column="22"/>
    4.63 +    <text>Missing space or underscore?</text>
    4.64 +  </false-negative>
    4.65 +  <error>
    4.66 +    <at line="14"/>
    4.67 +    <text>Mismatched underscores?</text>
    4.68 +  </error>
    4.69 +  <error>
    4.70 +    <at line="15" column="23"/>
    4.71 +    <text>Unspaced bracket?</text>
    4.72 +  </error>
    4.73 +  <error>
    4.74 +    <at line="15" column="30"/>
    4.75 +    <text>Unspaced bracket?</text>
    4.76 +  </error>
    4.77 +</expected>