# HG changeset patch # User ali # Date 1383383034 0 # Node ID 43c73b36e936f99234ac60d5a2e6998f91437805 # Parent 2d48e8cdda24e2e8b6ddd25515c26d0c6dbc203d Fix bug #26: Partially emphasized words diff -r 2d48e8cdda24 -r 43c73b36e936 bookloupe/bookloupe.c --- a/bookloupe/bookloupe.c Wed Oct 02 09:14:33 2013 +0100 +++ b/bookloupe/bookloupe.c Sat Nov 02 09:03:54 2013 +0000 @@ -250,7 +250,7 @@ gchar *running_from; gboolean mixdigit(const char *); -gchar *getaword(const char **); +gchar *getaword(const char *,const char **); char *flgets(char **,long,int); void postprocess_for_HTML(char *); char *linehasmarkup(char *); @@ -977,7 +977,7 @@ results.emdash.PG_space++; for (s=lines[j];*s;) { - inword=getaword(&s); + inword=getaword(NULL,&s); if (!strcmp(inword,"hij") || !strcmp(inword,"niet")) results.Dutchcount++; if (!strcmp(inword,"dans") || !strcmp(inword,"avec")) @@ -2002,7 +2002,7 @@ for (s=aline;*s;) { wordstart=s; - t=getaword(&s); + t=getaword(NULL,&s); if (!*t) { g_free(t); @@ -2052,8 +2052,9 @@ /* * check_for_typos: * - * Check for commonly mistyped words, - * and digits like 0 for O in a word. + * Check for commonly mistyped words, and digits like 0 for O in a word. + * Note that somewhat confusingly, this is also where we call getaword() + * with a non-NULL line so that it will issue warnings. */ void check_for_typos(const char *aline,struct warnings *warnings) { @@ -2069,7 +2070,7 @@ for (s=aline;*s;) { wordstart=s; - inword=getaword(&s); + inword=getaword(aline,&s); if (!*inword) { g_free(inword); @@ -2318,7 +2319,7 @@ * If there are letters on both sides of it or * if it's strict punctuation followed by an alpha. */ - if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) || + if (c!='_' && g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) || g_utf8_strchr("?!,;:",-1,c))) { if (c=='.') @@ -3419,14 +3420,18 @@ * A word is defined as one English word unit--or at least that's the aim. * "ptr" is advanced to the position in the line where we will start * looking for the next word. + * If line is non-NULL, then it will be used to derive the column numbers for + * any warnings issued. If line is NULL, then warnings will be suppressed. * * Returns: A newly-allocated string. */ -gchar *getaword(const char **ptr) +gchar *getaword(const char *line,const char **ptr) { - const char *s,*t; + const char *s,*t,*t2; GString *word; gunichar c,pc; + int adjust; + gboolean initial_underlining=FALSE; word=g_string_new(NULL); for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) && !g_unichar_isalpha(g_utf8_get_char(*ptr)) && @@ -3448,6 +3453,7 @@ else g_string_truncate(word,0); } + initial_underlining=g_utf8_get_char(*ptr)=='_'; } /* * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35. @@ -3477,10 +3483,84 @@ } /* we didn't find a punctuated number - do the regular getword thing */ g_string_truncate(word,0); - c=g_utf8_get_char(*ptr); - for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c); - *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr)) + s=*ptr; + c=g_utf8_get_char(s); + for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || c=='_' || + CHAR_IS_APOSTROPHE(c); s=g_utf8_next_char(s),c=g_utf8_get_char(s)) g_string_append_unichar(word,c); + if (initial_underlining && word->str[word->len-1]=='_') + { + /* _Simple_ or _Old-school_underlining_ */ + t=strchr(*ptr,'_'); + g_string_truncate(word,t-*ptr); + if (s-t>1) + *ptr=t; /* _Old-school_underlining_ */ + else + *ptr=s; /* _Simple_ */ + } + else if (initial_underlining || (t=strchr(word->str,'_'))) + { + /* Part_ial_ underlining */ + adjust=0; + if (initial_underlining) + { + t2=strchr(word->str,'_'); + if (t2) + { + g_string_erase(word,t2-word->str,1); + adjust++; + } + else + { + if (line) + { + if (pswit[ECHO_SWITCH]) + g_print("\n%s\n",line); + if (!pswit[OVERVIEW_SWITCH]) + g_print(" Line %ld column %ld - " + "Missing space or underscore?\n",linecnt, + g_utf8_pointer_to_offset(line,*ptr)); + else + cnt_punct++; + } + *ptr=s; + return g_string_free(word,FALSE); + } + } + while ((t=strchr(word->str,'_'))) + { + t2=strchr(t+1,'_'); + if (t2) + { + g_string_erase(word,t-word->str,1); + t2--; + g_string_erase(word,t2-word->str,1); + adjust+=2; + } + else + { + g_string_truncate(word,t-word->str); + adjust+=g_utf8_pointer_to_offset(word->str,t); + *ptr=g_utf8_offset_to_pointer(*ptr,adjust); + if (line) + { + if (pswit[ECHO_SWITCH]) + g_print("\n%s\n",line); + if (!pswit[OVERVIEW_SWITCH]) + g_print(" Line %ld column %ld - " + "Missing space or underscore?\n",linecnt, + g_utf8_pointer_to_offset(line,*ptr)+1); + else + cnt_punct++; + } + return g_string_free(word,FALSE); + } + } + *ptr=s; + } + else + /* No underlining */ + *ptr=s; return g_string_free(word,FALSE); } diff -r 2d48e8cdda24 -r 43c73b36e936 test/bookloupe/Makefile.am --- a/test/bookloupe/Makefile.am Wed Oct 02 09:14:33 2013 +0100 +++ b/test/bookloupe/Makefile.am Sat Nov 02 09:03:54 2013 +0000 @@ -3,6 +3,7 @@ runfox-quotes.tst curved-genitives.tst multi-line-illustration.tst \ emdash.tst config-internal.tst config-default.tst config-user.tst \ config-override.tst charset-cp1252.tst charset-latin1.tst \ - footnote-marker.tst unix-lineends.tst os9-lineends.tst dot-comma.tst + footnote-marker.tst unix-lineends.tst os9-lineends.tst dot-comma.tst \ + partial-underlining.tst dist_pkgdata_DATA=$(TESTS) diff -r 2d48e8cdda24 -r 43c73b36e936 test/bookloupe/partial-underlining.tst --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/bookloupe/partial-underlining.tst Sat Nov 02 09:03:54 2013 +0000 @@ -0,0 +1,45 @@ +**************** INPUT **************** +Bookloupe understands simple underlining, for example, a _bd_ word as +well as old-school underlining, for example, _a_pr_word_. + +It also understands partial underlining, as in l'_Adthima_, Abag_ae_l, +_ph_antasm, and even _ph_antasi_z_e. + +While warnings about missing spaces around underscores are generally +suppressed, partial underlining with_an odd number of un_der_scor_es +will still be warned about. + +Then just a couple of special cases we need to check for: first _simple_ +underlining with an underscore at the end of the line and a solitary _ +should both be handled correctly. +**************** WARNINGS **************** + + + + Query word bd - not reporting duplicates + + + + Query word pr - not reporting duplicates + + + + Missing space or underscore? + + + + Missing space or underscore? + + + + Missing space or underscore? + + + + Spaced punctuation? + + + + Mismatched underscores? + + diff -r 2d48e8cdda24 -r 43c73b36e936 test/compatibility/brackets.tst --- a/test/compatibility/brackets.tst Wed Oct 02 09:14:33 2013 +0100 +++ b/test/compatibility/brackets.tst Sat Nov 02 09:03:54 2013 +0000 @@ -14,31 +14,46 @@ This _very_ important_ paragraph has an odd number of underscores. Unspaced brackets are a[most a]ways _wrong_. -**************** EXPECTED **************** - -This (excellent paragraph has one more {opening} paranthesis than closing. - Line 2 - Mismatched round brackets? - -On the other hand, this poor) paragraph does it backwards. - Line 4 - Mismatched round brackets? - -This {slightly odd paragraph has one more [opening] brace than closing. - Line 6 - Mismatched curly brackets? - -And again, this balmy} paragraph does it backwards. - Line 8 - Mismatched curly brackets? - -This paragraph[11 has one more (opening) bracket than closing. - Line 10 - Mismatched square brackets? - -Whereas this one is 12]tupsy turvey. - Line 12 - Mismatched square brackets? - -This _very_ important_ paragraph has an odd number of underscores. - Line 14 - Mismatched underscores? - -Unspaced brackets are a[most a]ways _wrong_. - Line 15 column 23 - Unspaced bracket? - -Unspaced brackets are a[most a]ways _wrong_. - Line 15 column 30 - Unspaced bracket? +**************** WARNINGS **************** + + + + Mismatched round brackets? + + + + Mismatched round brackets? + + + + Mismatched curly brackets? + + + + Mismatched curly brackets? + + + + Mismatched square brackets? + + + + Mismatched square brackets? + + + + Missing space or underscore? + + + + Mismatched underscores? + + + + Unspaced bracket? + + + + Unspaced bracket? + +