1.1 --- a/bookloupe/bookloupe.c Wed Oct 16 22:51:29 2013 +0100
1.2 +++ b/bookloupe/bookloupe.c Sat Nov 02 09:03:54 2013 +0000
1.3 @@ -250,7 +250,7 @@
1.4 gchar *running_from;
1.5
1.6 gboolean mixdigit(const char *);
1.7 -gchar *getaword(const char **);
1.8 +gchar *getaword(const char *,const char **);
1.9 char *flgets(char **,long,int);
1.10 void postprocess_for_HTML(char *);
1.11 char *linehasmarkup(char *);
1.12 @@ -977,7 +977,7 @@
1.13 results.emdash.PG_space++;
1.14 for (s=lines[j];*s;)
1.15 {
1.16 - inword=getaword(&s);
1.17 + inword=getaword(NULL,&s);
1.18 if (!strcmp(inword,"hij") || !strcmp(inword,"niet"))
1.19 results.Dutchcount++;
1.20 if (!strcmp(inword,"dans") || !strcmp(inword,"avec"))
1.21 @@ -2002,7 +2002,7 @@
1.22 for (s=aline;*s;)
1.23 {
1.24 wordstart=s;
1.25 - t=getaword(&s);
1.26 + t=getaword(NULL,&s);
1.27 if (!*t)
1.28 {
1.29 g_free(t);
1.30 @@ -2052,8 +2052,9 @@
1.31 /*
1.32 * check_for_typos:
1.33 *
1.34 - * Check for commonly mistyped words,
1.35 - * and digits like 0 for O in a word.
1.36 + * Check for commonly mistyped words, and digits like 0 for O in a word.
1.37 + * Note that, somewhat confusingly, this is also where we call getaword()
1.38 + * with a non-NULL line so that it will issue warnings.
1.39 */
1.40 void check_for_typos(const char *aline,struct warnings *warnings)
1.41 {
1.42 @@ -2069,7 +2070,7 @@
1.43 for (s=aline;*s;)
1.44 {
1.45 wordstart=s;
1.46 - inword=getaword(&s);
1.47 + inword=getaword(aline,&s);
1.48 if (!*inword)
1.49 {
1.50 g_free(inword);
1.51 @@ -2318,7 +2319,7 @@
1.52 * If there are letters on both sides of it or
1.53 * if it's strict punctuation followed by an alpha.
1.54 */
1.55 - if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
1.56 + if (c!='_' && g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
1.57 g_utf8_strchr("?!,;:",-1,c)))
1.58 {
1.59 if (c=='.')
1.60 @@ -3419,14 +3420,18 @@
1.61 * A word is defined as one English word unit--or at least that's the aim.
1.62 * "ptr" is advanced to the position in the line where we will start
1.63 * looking for the next word.
1.64 + * If line is non-NULL, then it will be used to derive the column numbers for
1.65 + * any warnings issued. If line is NULL, then warnings will be suppressed.
1.66 *
1.67 * Returns: A newly-allocated string.
1.68 */
1.69 -gchar *getaword(const char **ptr)
1.70 +gchar *getaword(const char *line,const char **ptr)
1.71 {
1.72 - const char *s,*t;
1.73 + const char *s,*t,*t2;
1.74 GString *word;
1.75 gunichar c,pc;
1.76 + int adjust;
1.77 + gboolean initial_underlining=FALSE;
1.78 word=g_string_new(NULL);
1.79 for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
1.80 !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
1.81 @@ -3448,6 +3453,7 @@
1.82 else
1.83 g_string_truncate(word,0);
1.84 }
1.85 + initial_underlining=g_utf8_get_char(*ptr)=='_';
1.86 }
1.87 /*
1.88 * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
1.89 @@ -3477,10 +3483,84 @@
1.90 }
1.91 /* we didn't find a punctuated number - do the regular getword thing */
1.92 g_string_truncate(word,0);
1.93 - c=g_utf8_get_char(*ptr);
1.94 - for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);
1.95 - *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))
1.96 + s=*ptr;
1.97 + c=g_utf8_get_char(s);
1.98 + for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || c=='_' ||
1.99 + CHAR_IS_APOSTROPHE(c); s=g_utf8_next_char(s),c=g_utf8_get_char(s))
1.100 g_string_append_unichar(word,c);
1.101 + if (initial_underlining && word->str[word->len-1]=='_')
1.102 + {
1.103 + /* _Simple_ or _Old-school_underlining_ */
1.104 + t=strchr(*ptr,'_');
1.105 + g_string_truncate(word,t-*ptr);
1.106 + if (s-t>1)
1.107 + *ptr=t; /* _Old-school_underlining_ */
1.108 + else
1.109 + *ptr=s; /* _Simple_ */
1.110 + }
1.111 + else if (initial_underlining || (t=strchr(word->str,'_')))
1.112 + {
1.113 + /* Part_ial_ underlining */
1.114 + adjust=0;
1.115 + if (initial_underlining)
1.116 + {
1.117 + t2=strchr(word->str,'_');
1.118 + if (t2)
1.119 + {
1.120 + g_string_erase(word,t2-word->str,1);
1.121 + adjust++;
1.122 + }
1.123 + else
1.124 + {
1.125 + if (line)
1.126 + {
1.127 + if (pswit[ECHO_SWITCH])
1.128 + g_print("\n%s\n",line);
1.129 + if (!pswit[OVERVIEW_SWITCH])
1.130 + g_print(" Line %ld column %ld - "
1.131 + "Missing space or underscore?\n",linecnt,
1.132 + g_utf8_pointer_to_offset(line,*ptr));
1.133 + else
1.134 + cnt_punct++;
1.135 + }
1.136 + *ptr=s;
1.137 + return g_string_free(word,FALSE);
1.138 + }
1.139 + }
1.140 + while ((t=strchr(word->str,'_')))
1.141 + {
1.142 + t2=strchr(t+1,'_');
1.143 + if (t2)
1.144 + {
1.145 + g_string_erase(word,t-word->str,1);
1.146 + t2--;
1.147 + g_string_erase(word,t2-word->str,1);
1.148 + adjust+=2;
1.149 + }
1.150 + else
1.151 + {
1.152 + g_string_truncate(word,t-word->str);
1.153 + adjust+=g_utf8_pointer_to_offset(word->str,t);
1.154 + *ptr=g_utf8_offset_to_pointer(*ptr,adjust);
1.155 + if (line)
1.156 + {
1.157 + if (pswit[ECHO_SWITCH])
1.158 + g_print("\n%s\n",line);
1.159 + if (!pswit[OVERVIEW_SWITCH])
1.160 + g_print(" Line %ld column %ld - "
1.161 + "Missing space or underscore?\n",linecnt,
1.162 + g_utf8_pointer_to_offset(line,*ptr)+1);
1.163 + else
1.164 + cnt_punct++;
1.165 + }
1.166 + return g_string_free(word,FALSE);
1.167 + }
1.168 + }
1.169 + *ptr=s;
1.170 + }
1.171 + else
1.172 + /* No underlining */
1.173 + *ptr=s;
1.174 return g_string_free(word,FALSE);
1.175 }
1.176