1.1 --- a/bookloupe/bookloupe.c Wed Oct 02 09:14:33 2013 +0100
1.2 +++ b/bookloupe/bookloupe.c Wed Oct 30 17:11:11 2013 +0000
1.3 @@ -250,7 +250,7 @@
1.4 gchar *running_from;
1.5
1.6 gboolean mixdigit(const char *);
1.7 -gchar *getaword(const char **);
1.8 +gchar *getaword(const char *,const char **);
1.9 char *flgets(char **,long,int);
1.10 void postprocess_for_HTML(char *);
1.11 char *linehasmarkup(char *);
1.12 @@ -977,7 +977,7 @@
1.13 results.emdash.PG_space++;
1.14 for (s=lines[j];*s;)
1.15 {
1.16 - inword=getaword(&s);
1.17 + inword=getaword(NULL,&s);
1.18 if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))
1.19 results.Dutchcount++;
1.20 if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))
1.21 @@ -2002,7 +2002,7 @@
1.22 for (s=aline;*s;)
1.23 {
1.24 wordstart=s;
1.25 - t=getaword(&s);
1.26 + t=getaword(NULL,&s);
1.27 if (!*t)
1.28 {
1.29 g_free(t);
1.30 @@ -2052,8 +2052,9 @@
1.31 /*
1.32 * check_for_typos:
1.33 *
1.34 - * Check for commonly mistyped words,
1.35 - * and digits like 0 for O in a word.
1.36 + * Check for commonly mistyped words, and digits like 0 for O in a word.
1.37 + * Note that somewhat confusingly, this is also where we call getaword()
1.38 + * with a non-NULL line so that it will issue warnings.
1.39 */
1.40 void check_for_typos(const char *aline,struct warnings *warnings)
1.41 {
1.42 @@ -2069,7 +2070,7 @@
1.43 for (s=aline;*s;)
1.44 {
1.45 wordstart=s;
1.46 - inword=getaword(&s);
1.47 + inword=getaword(aline,&s);
1.48 if (!*inword)
1.49 {
1.50 g_free(inword);
1.51 @@ -2318,7 +2319,7 @@
1.52 * If there are letters on both sides of it or
1.53 * if it's strict punctuation followed by an alpha.
1.54 */
1.55 - if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
1.56 + if (c!='_' && g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
1.57 g_utf8_strchr("?!,;:",-1,c)))
1.58 {
1.59 if (c=='.')
1.60 @@ -3419,14 +3420,18 @@
1.61 * A word is defined as one English word unit--or at least that's the aim.
1.62 * "ptr" is advanced to the position in the line where we will start
1.63 * looking for the next word.
1.64 + * If line is non-NULL, then it will be used to derive the column numbers for
1.65 + * any warnings issued. If line is NULL, then warnings will be suppressed.
1.66 *
1.67 * Returns: A newly-allocated string.
1.68 */
1.69 -gchar *getaword(const char **ptr)
1.70 +gchar *getaword(const char *line,const char **ptr)
1.71 {
1.72 - const char *s,*t;
1.73 + const char *s,*t,*t2;
1.74 GString *word;
1.75 gunichar c,pc;
1.76 + int adjust;
1.77 + gboolean initial_underlining=FALSE;
1.78 word=g_string_new(NULL);
1.79 for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
1.80 !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
1.81 @@ -3448,6 +3453,7 @@
1.82 else
1.83 g_string_truncate(word,0);
1.84 }
1.85 + initial_underlining=g_utf8_get_char(*ptr)=='_';
1.86 }
1.87 /*
1.88 * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
1.89 @@ -3477,10 +3483,81 @@
1.90 }
1.91 /* we didn't find a punctuated number - do the regular getword thing */
1.92 g_string_truncate(word,0);
1.93 - c=g_utf8_get_char(*ptr);
1.94 - for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);
1.95 - *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))
1.96 + s=*ptr;
1.97 + c=g_utf8_get_char(s);
1.98 + for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || c=='_' ||
1.99 + CHAR_IS_APOSTROPHE(c); s=g_utf8_next_char(s),c=g_utf8_get_char(s))
1.100 g_string_append_unichar(word,c);
1.101 + if (initial_underlining && word->str[word->len-1]=='_')
1.102 + {
1.103 + /* _Simple_ or _Old-school_underlining_ */
1.104 + t=strchr(*ptr,'_');
1.105 + g_string_truncate(word,t-*ptr);
1.106 + *ptr=t;
1.107 + }
1.108 + else if (initial_underlining || (t=strchr(word->str,'_')))
1.109 + {
1.110 + /* Part_ial_ underlining */
1.111 + adjust=0;
1.112 + if (initial_underlining)
1.113 + {
1.114 + t2=strchr(word->str,'_');
1.115 + if (t2)
1.116 + {
1.117 + g_string_erase(word,t2-word->str,1);
1.118 + adjust++;
1.119 + }
1.120 + else
1.121 + {
1.122 + if (line)
1.123 + {
1.124 + if (pswit[ECHO_SWITCH])
1.125 + g_print("\n%s\n",line);
1.126 + if (!pswit[OVERVIEW_SWITCH])
1.127 + g_print(" Line %ld column %ld - "
1.128 + "Missing space or underscore?\n",linecnt,
1.129 + g_utf8_pointer_to_offset(line,*ptr));
1.130 + else
1.131 + cnt_punct++;
1.132 + }
1.133 + *ptr=s;
1.134 + return g_string_free(word,FALSE);
1.135 + }
1.136 + }
1.137 + while ((t=strchr(word->str,'_')))
1.138 + {
1.139 + t2=strchr(t+1,'_');
1.140 + if (t2)
1.141 + {
1.142 + g_string_erase(word,t-word->str,1);
1.143 + t2--;
1.144 + g_string_erase(word,t2-word->str,1);
1.145 + adjust+=2;
1.146 + }
1.147 + else
1.148 + {
1.149 + g_string_truncate(word,t-word->str);
1.150 + adjust+=g_utf8_pointer_to_offset(word->str,t);
1.151 + *ptr=g_utf8_offset_to_pointer(*ptr,adjust);
1.152 + if (line)
1.153 + {
1.154 + if (pswit[ECHO_SWITCH])
1.155 + g_print("\n%s\n",line);
1.156 + if (!pswit[OVERVIEW_SWITCH])
1.157 + g_print(" Line %ld column %ld - "
1.158 + "Missing space or underscore?\n",linecnt,
1.159 + g_utf8_pointer_to_offset(line,*ptr)+1);
1.160 + else
1.161 + cnt_punct++;
1.162 + }
1.163 + return g_string_free(word,FALSE);
1.164 + }
1.165 + }
1.166 + *ptr=s;
1.167 + }
1.168 + else
1.169 + /* No underlining */
1.170 + *ptr=s;
1.171 return g_string_free(word,FALSE);
1.172 }
1.173
2.1 --- a/test/bookloupe/Makefile.am Wed Oct 02 09:14:33 2013 +0100
2.2 +++ b/test/bookloupe/Makefile.am Wed Oct 30 17:11:11 2013 +0000
2.3 @@ -3,6 +3,7 @@
2.4 runfox-quotes.tst curved-genitives.tst multi-line-illustration.tst \
2.5 emdash.tst config-internal.tst config-default.tst config-user.tst \
2.6 config-override.tst charset-cp1252.tst charset-latin1.tst \
2.7 - footnote-marker.tst unix-lineends.tst os9-lineends.tst dot-comma.tst
2.8 + footnote-marker.tst unix-lineends.tst os9-lineends.tst dot-comma.tst \
2.9 + partial-underlining.tst
2.10
2.11 dist_pkgdata_DATA=$(TESTS)
3.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
3.2 +++ b/test/bookloupe/partial-underlining.tst Wed Oct 30 17:11:11 2013 +0000
3.3 @@ -0,0 +1,29 @@
3.4 +**************** INPUT ****************
3.5 +Bookloupe understands simple underlining, for example, a _bd_ word as
3.6 +well as old-school underlining, for example, _a_pr_word_.
3.7 +
3.8 +It also understands partial underlining, as in l'_Adthima_, Abag_ae_l,
3.9 +_ph_antasm, and even _ph_antasi_z_e.
3.10 +
3.11 +While warnings about missing spaces around underscores are generally
3.12 +suppressed, partial underlining with_an odd number of un_der_scor_es
3.13 +will still be warned about.
3.14 +**************** WARNINGS ****************
3.15 +<expected>
3.16 + <error>
3.17 + <at line="1" column="57"/>
3.18 + <text>Query word bd - not reporting duplicates</text>
3.19 + </error>
3.20 + <error>
3.21 + <at line="2" column="48"/>
3.22 + <text>Query word pr - not reporting duplicates</text>
3.23 + </error>
3.24 + <error>
3.25 + <at line="8" column="37"/>
3.26 + <text>Missing space or underscore?</text>
3.27 + </error>
3.28 + <error>
3.29 + <at line="8" column="66"/>
3.30 + <text>Missing space or underscore?</text>
3.31 + </error>
3.32 +</expected>
4.1 --- a/test/compatibility/brackets.tst Wed Oct 02 09:14:33 2013 +0100
4.2 +++ b/test/compatibility/brackets.tst Wed Oct 30 17:11:11 2013 +0000
4.3 @@ -14,31 +14,46 @@
4.4 This _very_ important_ paragraph has an odd number of underscores.
4.5
4.6 Unspaced brackets are a[most a]ways _wrong_.
4.7 -**************** EXPECTED ****************
4.8 -
4.9 -This (excellent paragraph has one more {opening} paranthesis than closing.
4.10 - Line 2 - Mismatched round brackets?
4.11 -
4.12 -On the other hand, this poor) paragraph does it backwards.
4.13 - Line 4 - Mismatched round brackets?
4.14 -
4.15 -This {slightly odd paragraph has one more [opening] brace than closing.
4.16 - Line 6 - Mismatched curly brackets?
4.17 -
4.18 -And again, this balmy} paragraph does it backwards.
4.19 - Line 8 - Mismatched curly brackets?
4.20 -
4.21 -This paragraph[11 has one more (opening) bracket than closing.
4.22 - Line 10 - Mismatched square brackets?
4.23 -
4.24 -Whereas this one is 12]tupsy turvey.
4.25 - Line 12 - Mismatched square brackets?
4.26 -
4.27 -This _very_ important_ paragraph has an odd number of underscores.
4.28 - Line 14 - Mismatched underscores?
4.29 -
4.30 -Unspaced brackets are a[most a]ways _wrong_.
4.31 - Line 15 column 23 - Unspaced bracket?
4.32 -
4.33 -Unspaced brackets are a[most a]ways _wrong_.
4.34 - Line 15 column 30 - Unspaced bracket?
4.35 +**************** WARNINGS ****************
4.36 +<expected>
4.37 + <error>
4.38 + <at line="2"/>
4.39 + <text>Mismatched round brackets?</text>
4.40 + </error>
4.41 + <error>
4.42 + <at line="4"/>
4.43 + <text>Mismatched round brackets?</text>
4.44 + </error>
4.45 + <error>
4.46 + <at line="6"/>
4.47 + <text>Mismatched curly brackets?</text>
4.48 + </error>
4.49 + <error>
4.50 + <at line="8"/>
4.51 + <text>Mismatched curly brackets?</text>
4.52 + </error>
4.53 + <error>
4.54 + <at line="10"/>
4.55 + <text>Mismatched square brackets?</text>
4.56 + </error>
4.57 + <error>
4.58 + <at line="12"/>
4.59 + <text>Mismatched square brackets?</text>
4.60 + </error>
4.61 + <false-negative>
4.62 + <at line="13" column="22"/>
4.63 + <text>Missing space or underscore?</text>
4.64 + </false-negative>
4.65 + <error>
4.66 + <at line="14"/>
4.67 + <text>Mismatched underscores?</text>
4.68 + </error>
4.69 + <error>
4.70 + <at line="15" column="23"/>
4.71 + <text>Unspaced bracket?</text>
4.72 + </error>
4.73 + <error>
4.74 + <at line="15" column="30"/>
4.75 + <text>Unspaced bracket?</text>
4.76 + </error>
4.77 +</expected>