# HG changeset patch
# User ali <ali@juiblex.co.uk>
# Date 1383383034 0
# Node ID 43c73b36e936f99234ac60d5a2e6998f91437805
# Parent  2d48e8cdda24e2e8b6ddd25515c26d0c6dbc203d
Fix bug #26: Partially emphasized words

diff -r 2d48e8cdda24 -r 43c73b36e936 bookloupe/bookloupe.c
--- a/bookloupe/bookloupe.c	Wed Oct 02 09:14:33 2013 +0100
+++ b/bookloupe/bookloupe.c	Sat Nov 02 09:03:54 2013 +0000
@@ -250,7 +250,7 @@
 gchar *running_from;
 
 gboolean mixdigit(const char *);
-gchar *getaword(const char **);
+gchar *getaword(const char *,const char **);
 char *flgets(char **,long,int);
 void postprocess_for_HTML(char *);
 char *linehasmarkup(char *);
@@ -977,7 +977,7 @@
 	    results.emdash.PG_space++;
 	for (s=lines[j];*s;)
 	{
-	    inword=getaword(&s);
+	    inword=getaword(NULL,&s);
 	    if (!strcmp(inword,"hij") || !strcmp(inword,"niet")) 
 		results.Dutchcount++;
 	    if (!strcmp(inword,"dans") || !strcmp(inword,"avec")) 
@@ -2002,7 +2002,7 @@
 	for (s=aline;*s;)
 	{
 	    wordstart=s;
-	    t=getaword(&s);
+	    t=getaword(NULL,&s);
 	    if (!*t)
 	    {
 		g_free(t);
@@ -2052,8 +2052,9 @@
 /*
  * check_for_typos:
  *
- * Check for commonly mistyped words,
- * and digits like 0 for O in a word.
+ * Check for commonly mistyped words, and digits like 0 for O in a word.
+ * Note that, somewhat confusingly, this is also where getaword() is called
+ * with a non-NULL line, which is what enables the warnings it can issue.
  */
 void check_for_typos(const char *aline,struct warnings *warnings)
 {
@@ -2069,7 +2070,7 @@
     for (s=aline;*s;)
     {
 	wordstart=s;
-	inword=getaword(&s);
+	inword=getaword(aline,&s);
 	if (!*inword)
 	{
 	    g_free(inword);
@@ -2318,7 +2319,7 @@
 	     * If there are letters on both sides of it or
 	     * if it's strict punctuation followed by an alpha.
 	     */
-	    if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
+	    if (c!='_' && g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
 	      g_utf8_strchr("?!,;:",-1,c)))
 	    {
 		if (c=='.')
@@ -3419,14 +3420,18 @@
  * A word is defined as one English word unit--or at least that's the aim.
  * "ptr" is advanced to the position in the line where we will start
  * looking for the next word.
+ * If line is non-NULL, then it will be used to derive the column numbers for
+ * any warnings issued. If line is NULL, then warnings will be suppressed.
  *
  * Returns: A newly-allocated string.
  */
-gchar *getaword(const char **ptr)
+gchar *getaword(const char *line,const char **ptr)
 {
-    const char *s,*t;
+    const char *s,*t,*t2;
     GString *word;
     gunichar c,pc;
+    int adjust;
+    gboolean initial_underlining=FALSE;
     word=g_string_new(NULL);
     for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
       !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
@@ -3448,6 +3453,7 @@
 	    else
 		g_string_truncate(word,0);
 	}
+	initial_underlining=g_utf8_get_char(*ptr)=='_';
     }
     /*
      * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
@@ -3477,10 +3483,84 @@
     }
     /* we didn't find a punctuated number - do the regular getword thing */
     g_string_truncate(word,0);
-    c=g_utf8_get_char(*ptr);
-    for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);
-      *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))
+    s=*ptr;
+    c=g_utf8_get_char(s);
+    for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || c=='_' ||
+      CHAR_IS_APOSTROPHE(c); s=g_utf8_next_char(s),c=g_utf8_get_char(s))
 	g_string_append_unichar(word,c);
+    if (initial_underlining && word->len && word->str[word->len-1]=='_')
+    {
+	/* _Simple_ or _Old-school_underlining_; len check: word may be empty */
+	t=strchr(*ptr,'_');
+	g_string_truncate(word,t-*ptr);
+	if (s-t>1)
+	    *ptr=t;	/* _Old-school_underlining_ */
+	else
+	    *ptr=s;	/* _Simple_ */
+    }
+    else if (initial_underlining || (t=strchr(word->str,'_')))
+    {
+	/* Part_ial_ underlining */
+	adjust=0;
+	if (initial_underlining)
+	{
+	    t2=strchr(word->str,'_');
+	    if (t2)
+	    {
+		g_string_erase(word,t2-word->str,1);
+		adjust++;
+	    }
+	    else
+	    {
+		if (line)
+		{
+		    if (pswit[ECHO_SWITCH])
+			g_print("\n%s\n",line);
+		    if (!pswit[OVERVIEW_SWITCH])
+			g_print("    Line %ld column %ld - "
+			  "Missing space or underscore?\n",linecnt,
+			  g_utf8_pointer_to_offset(line,*ptr));
+		    else
+			cnt_punct++;
+		}
+		*ptr=s;
+		return g_string_free(word,FALSE);
+	    }
+	}
+	while ((t=strchr(word->str,'_')))
+	{
+	    t2=strchr(t+1,'_');
+	    if (t2)
+	    {
+		g_string_erase(word,t-word->str,1);
+		t2--;
+		g_string_erase(word,t2-word->str,1);
+		adjust+=2;
+	    }
+	    else
+	    {
+		g_string_truncate(word,t-word->str);
+		adjust+=g_utf8_pointer_to_offset(word->str,t);
+		*ptr=g_utf8_offset_to_pointer(*ptr,adjust);
+		if (line)
+		{
+		    if (pswit[ECHO_SWITCH])
+			g_print("\n%s\n",line);
+		    if (!pswit[OVERVIEW_SWITCH])
+			g_print("    Line %ld column %ld - "
+			  "Missing space or underscore?\n",linecnt,
+			  g_utf8_pointer_to_offset(line,*ptr)+1);
+		    else
+			cnt_punct++;
+		}
+		return g_string_free(word,FALSE);
+	    }
+	}
+	*ptr=s;
+    }
+    else
+	/* No underlining */
+	*ptr=s;
     return g_string_free(word,FALSE);
 }
 
diff -r 2d48e8cdda24 -r 43c73b36e936 test/bookloupe/Makefile.am
--- a/test/bookloupe/Makefile.am	Wed Oct 02 09:14:33 2013 +0100
+++ b/test/bookloupe/Makefile.am	Sat Nov 02 09:03:54 2013 +0000
@@ -3,6 +3,7 @@
 	runfox-quotes.tst curved-genitives.tst multi-line-illustration.tst \
 	emdash.tst config-internal.tst config-default.tst config-user.tst \
 	config-override.tst charset-cp1252.tst charset-latin1.tst \
-	footnote-marker.tst unix-lineends.tst os9-lineends.tst dot-comma.tst
+	footnote-marker.tst unix-lineends.tst os9-lineends.tst dot-comma.tst \
+	partial-underlining.tst
 
 dist_pkgdata_DATA=$(TESTS)
diff -r 2d48e8cdda24 -r 43c73b36e936 test/bookloupe/partial-underlining.tst
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/bookloupe/partial-underlining.tst	Sat Nov 02 09:03:54 2013 +0000
@@ -0,0 +1,45 @@
+**************** INPUT ****************
+Bookloupe understands simple underlining, for example, a _bd_ word as
+well as old-school underlining, for example, _a_pr_word_.
+
+It also understands partial underlining, as in l'_Adthima_, Abag_ae_l,
+_ph_antasm, and even _ph_antasi_z_e.
+
+While warnings about missing spaces around underscores are generally
+suppressed, partial underlining with_an odd number of un_der_scor_es
+will still be warned about.
+
+Then just a couple of special cases we need to check for: first _simple_
+underlining with an underscore at the end of the line and a solitary _
+should both be handled correctly.
+**************** WARNINGS ****************
+<expected>
+  <error>
+    <at line="1" column="57"/>
+    <text>Query word bd - not reporting duplicates</text>
+  </error>
+  <error>
+    <at line="2" column="48"/>
+    <text>Query word pr - not reporting duplicates</text>
+  </error>
+  <error>
+    <at line="8" column="37"/>
+    <text>Missing space or underscore?</text>
+  </error>
+  <error>
+    <at line="8" column="66"/>
+    <text>Missing space or underscore?</text>
+  </error>
+  <error>
+    <at line="12" column="70"/>
+    <text>Missing space or underscore?</text>
+  </error>
+  <error>
+    <at line="12" column="70"/>
+    <text>Spaced punctuation?</text>
+  </error>
+  <error>
+    <at line="14"/>
+    <text>Mismatched underscores?</text>
+  </error>
+</expected>
diff -r 2d48e8cdda24 -r 43c73b36e936 test/compatibility/brackets.tst
--- a/test/compatibility/brackets.tst	Wed Oct 02 09:14:33 2013 +0100
+++ b/test/compatibility/brackets.tst	Sat Nov 02 09:03:54 2013 +0000
@@ -14,31 +14,46 @@
 This _very_ important_ paragraph has an odd number of underscores.
 
 Unspaced brackets are a[most a]ways _wrong_.
-**************** EXPECTED ****************
-
-This (excellent paragraph has one more {opening} paranthesis than closing.
-    Line 2 - Mismatched round brackets?
-
-On the other hand, this poor) paragraph does it backwards.
-    Line 4 - Mismatched round brackets?
-
-This {slightly odd paragraph has one more [opening] brace than closing.
-    Line 6 - Mismatched curly brackets?
-
-And again, this balmy} paragraph does it backwards.
-    Line 8 - Mismatched curly brackets?
-
-This paragraph[11 has one more (opening) bracket than closing.
-    Line 10 - Mismatched square brackets?
-
-Whereas this one is 12]tupsy turvey.
-    Line 12 - Mismatched square brackets?
-
-This _very_ important_ paragraph has an odd number of underscores.
-    Line 14 - Mismatched underscores?
-
-Unspaced brackets are a[most a]ways _wrong_.
-    Line 15 column 23 - Unspaced bracket?
-
-Unspaced brackets are a[most a]ways _wrong_.
-    Line 15 column 30 - Unspaced bracket?
+**************** WARNINGS ****************
+<expected>
+  <error>
+    <at line="2"/>
+    <text>Mismatched round brackets?</text>
+  </error>
+  <error>
+    <at line="4"/>
+    <text>Mismatched round brackets?</text>
+  </error>
+  <error>
+    <at line="6"/>
+    <text>Mismatched curly brackets?</text>
+  </error>
+  <error>
+    <at line="8"/>
+    <text>Mismatched curly brackets?</text>
+  </error>
+  <error>
+    <at line="10"/>
+    <text>Mismatched square brackets?</text>
+  </error>
+  <error>
+    <at line="12"/>
+    <text>Mismatched square brackets?</text>
+  </error>
+  <false-negative>
+    <at line="13" column="22"/>
+    <text>Missing space or underscore?</text>
+  </false-negative>
+  <error>
+    <at line="14"/>
+    <text>Mismatched underscores?</text>
+  </error>
+  <error>
+    <at line="15" column="23"/>
+    <text>Unspaced bracket?</text>
+  </error>
+  <error>
+    <at line="15" column="30"/>
+    <text>Unspaced bracket?</text>
+  </error>
+</expected>