diff -r 70cc629ec1e0 -r aece0899b1d3 bookloupe/bookloupe.c --- a/bookloupe/bookloupe.c Wed Oct 16 22:51:29 2013 +0100 +++ b/bookloupe/bookloupe.c Wed Oct 30 17:21:21 2013 +0000 @@ -250,7 +250,7 @@ gchar *running_from; gboolean mixdigit(const char *); -gchar *getaword(const char **); +gchar *getaword(const char *,const char **); char *flgets(char **,long,int); void postprocess_for_HTML(char *); char *linehasmarkup(char *); @@ -977,7 +977,7 @@ results.emdash.PG_space++; for (s=lines[j];*s;) { - inword=getaword(&s); + inword=getaword(NULL,&s); if (!strcmp(inword,"hij") || !strcmp(inword,"niet")) results.Dutchcount++; if (!strcmp(inword,"dans") || !strcmp(inword,"avec")) @@ -2002,7 +2002,7 @@ for (s=aline;*s;) { wordstart=s; - t=getaword(&s); + t=getaword(NULL,&s); if (!*t) { g_free(t); @@ -2052,8 +2052,9 @@ /* * check_for_typos: * - * Check for commonly mistyped words, - * and digits like 0 for O in a word. + * Check for commonly mistyped words, and digits like 0 for O in a word. + * Note that somewhat confusingly, this is also where we call getaword() + * with a non-NULL line so that it will issue warnings. */ void check_for_typos(const char *aline,struct warnings *warnings) { @@ -2069,7 +2070,7 @@ for (s=aline;*s;) { wordstart=s; - inword=getaword(&s); + inword=getaword(aline,&s); if (!*inword) { g_free(inword); @@ -2318,7 +2319,7 @@ * If there are letters on both sides of it or * if it's strict punctuation followed by an alpha. */ - if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) || + if (c!='_' && g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) || g_utf8_strchr("?!,;:",-1,c))) { if (c=='.') @@ -3419,14 +3420,18 @@ * A word is defined as one English word unit--or at least that's the aim. * "ptr" is advanced to the position in the line where we will start * looking for the next word. + * If line is non-NULL, then it will be used to derive the column numbers for + * any warnings issued. If line is NULL, then warnings will be suppressed. 
* * Returns: A newly-allocated string. */ -gchar *getaword(const char **ptr) +gchar *getaword(const char *line,const char **ptr) { - const char *s,*t; + const char *s,*t,*t2; GString *word; gunichar c,pc; + int adjust; + gboolean initial_underlining=FALSE; word=g_string_new(NULL); for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) && !g_unichar_isalpha(g_utf8_get_char(*ptr)) && @@ -3448,6 +3453,7 @@ else g_string_truncate(word,0); } + initial_underlining=g_utf8_get_char(*ptr)=='_'; } /* * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35. @@ -3477,10 +3483,81 @@ } /* we didn't find a punctuated number - do the regular getword thing */ g_string_truncate(word,0); - c=g_utf8_get_char(*ptr); - for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c); - *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr)) + s=*ptr; + c=g_utf8_get_char(s); + for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || c=='_' || + CHAR_IS_APOSTROPHE(c); s=g_utf8_next_char(s),c=g_utf8_get_char(s)) g_string_append_unichar(word,c); + if (initial_underlining && word->str[word->len-1]=='_') + { + /* _Simple_ or _Old-school_underlining_: cut the word at the first underscore after *ptr and leave *ptr on it */ + t=strchr(*ptr,'_'); + g_string_truncate(word,t-*ptr); + *ptr=t; + } + else if (initial_underlining || (t=strchr(word->str,'_'))) + { + /* Part_ial_ underlining: erase paired underscores from the word; an unpaired one is reported when line is non-NULL */ + adjust=0; + if (initial_underlining) + { + t2=strchr(word->str,'_'); + if (t2) + { + g_string_erase(word,t2-word->str,1); + adjust++; + } + else + { + if (line) + { + if (pswit[ECHO_SWITCH]) + g_print("\n%s\n",line); + if (!pswit[OVERVIEW_SWITCH]) + g_print(" Line %ld column %ld - " + "Missing space or underscore?\n",linecnt, + g_utf8_pointer_to_offset(line,*ptr)); + else + cnt_punct++; + } + *ptr=s; + return g_string_free(word,FALSE); + } + } + while ((t=strchr(word->str,'_'))) + { + t2=strchr(t+1,'_'); + if (t2) + { + g_string_erase(word,t-word->str,1); + t2--; + g_string_erase(word,t2-word->str,1); + adjust+=2; + } + else + { + g_string_truncate(word,t-word->str); 
+ adjust+=g_utf8_pointer_to_offset(word->str,t); + *ptr=g_utf8_offset_to_pointer(*ptr,adjust); + if (line) + { + if (pswit[ECHO_SWITCH]) + g_print("\n%s\n",line); + if (!pswit[OVERVIEW_SWITCH]) + g_print(" Line %ld column %ld - " + "Missing space or underscore?\n",linecnt, + g_utf8_pointer_to_offset(line,*ptr)+1); + else + cnt_punct++; + } + return g_string_free(word,FALSE); + } + } + *ptr=s; + } + else + /* No underlining: consume everything scanned into word */ + *ptr=s; return g_string_free(word,FALSE); }