Fix bug #6: BL treats a slanted apostrophe ’ as a word separator, not as a contraction or possessive
1.1 --- a/bookloupe/Makefile.am Tue Sep 17 20:55:57 2013 +0100
1.2 +++ b/bookloupe/Makefile.am Sat Sep 21 23:40:18 2013 +0100
1.3 @@ -1,5 +1,6 @@
1.4 INCLUDES=-I$(top_srcdir)
1.5 bin_PROGRAMS=bookloupe
1.6 +bookloupe_SOURCES=bookloupe.c bookloupe.h counters.c counters.h
1.7 pkgdata_DATA=bookloupe.typ
1.8 AM_CFLAGS=$(GLIB_CFLAGS)
1.9 LIBS=$(GLIB_LIBS)
2.1 --- a/bookloupe/bookloupe.c Tue Sep 17 20:55:57 2013 +0100
2.2 +++ b/bookloupe/bookloupe.c Sat Sep 21 23:40:18 2013 +0100
2.3 @@ -27,6 +27,8 @@
2.4 #endif
2.5 #include <glib.h>
2.6 #include <bl/bl.h>
2.7 +#include "bookloupe.h"
2.8 +#include "counters.h"
2.9 #include "HTMLentities.h"
2.10
2.11 gchar *prevline;
2.12 @@ -123,50 +125,6 @@
2.13 "among", "those", "into", "whom", "having", "thence", ""
2.14 };
2.15
2.16 -/* special characters */
2.17 -#define CHAR_SPACE 32
2.18 -#define CHAR_TAB 9
2.19 -#define CHAR_LF 10
2.20 -#define CHAR_CR 13
2.21 -#define CHAR_DQUOTE 34
2.22 -#define CHAR_SQUOTE 39
2.23 -#define CHAR_OPEN_SQUOTE 96
2.24 -#define CHAR_TILDE 126
2.25 -#define CHAR_ASTERISK 42
2.26 -#define CHAR_FORESLASH 47
2.27 -#define CHAR_CARAT 94
2.28 -
2.29 -#define CHAR_UNDERSCORE '_'
2.30 -#define CHAR_OPEN_CBRACK '{'
2.31 -#define CHAR_CLOSE_CBRACK '}'
2.32 -#define CHAR_OPEN_RBRACK '('
2.33 -#define CHAR_CLOSE_RBRACK ')'
2.34 -#define CHAR_OPEN_SBRACK '['
2.35 -#define CHAR_CLOSE_SBRACK ']'
2.36 -
2.37 -/* longest and shortest normal PG line lengths */
2.38 -#define LONGEST_PG_LINE 75
2.39 -#define WAY_TOO_LONG 80
2.40 -#define SHORTEST_PG_LINE 55
2.41 -
2.42 -enum {
2.43 - ECHO_SWITCH,
2.44 - SQUOTE_SWITCH,
2.45 - TYPO_SWITCH,
2.46 - QPARA_SWITCH,
2.47 - PARANOID_SWITCH,
2.48 - LINE_END_SWITCH,
2.49 - OVERVIEW_SWITCH,
2.50 - STDOUT_SWITCH,
2.51 - HEADER_SWITCH,
2.52 - WEB_SWITCH,
2.53 - VERBOSE_SWITCH,
2.54 - MARKUP_SWITCH,
2.55 - USERTYPO_SWITCH,
2.56 - DP_SWITCH,
2.57 - SWITNO
2.58 -};
2.59 -
2.60 gboolean pswit[SWITNO]; /* program switches */
2.61
2.62 static GOptionEntry options[]={
2.63 @@ -242,40 +200,6 @@
2.64 UINT saved_cp;
2.65 #endif
2.66
2.67 -struct first_pass_results {
2.68 - long firstline,astline;
2.69 - long footerline,totlen,binlen,alphalen,endquote_count,shortline,dotcomma;
2.70 - long fslashline,hyphens,longline,verylongline,htmcount,standalone_digit;
2.71 - long spacedash,emdash,space_emdash,non_PG_space_emdash,PG_space_emdash;
2.72 - int Dutchcount,Frenchcount;
2.73 -};
2.74 -
2.75 -struct warnings {
2.76 - int shortline,longline,bin,dash,dotcomma,ast,fslash,digit,hyphen;
2.77 - int endquote;
2.78 - gboolean isDutch,isFrench;
2.79 -};
2.80 -
2.81 -struct counters {
2.82 - long quot;
2.83 - int c_unders,c_brack,s_brack,r_brack;
2.84 - int open_single_quote,close_single_quote;
2.85 -};
2.86 -
2.87 -struct line_properties {
2.88 - unsigned int len,blen;
2.89 - gunichar start;
2.90 -};
2.91 -
2.92 -struct parities {
2.93 - int dquote,squote;
2.94 -};
2.95 -
2.96 -struct pending {
2.97 - char *dquote,*squote,*rbrack,*sbrack,*cbrack,*unders;
2.98 - long squot;
2.99 -};
2.100 -
2.101 void parse_options(int *argc,char ***argv)
2.102 {
2.103 GError *err=NULL;
2.104 @@ -877,7 +801,7 @@
2.105 c=g_utf8_get_char(s);
2.106 if (c==CHAR_DQUOTE)
2.107 counters->quot++;
2.108 - if (c==CHAR_SQUOTE || c==CHAR_OPEN_SQUOTE)
2.109 + if (CHAR_IS_SQUOTE(c))
2.110 {
2.111 if (s==aline)
2.112 {
2.113 @@ -887,21 +811,21 @@
2.114 */
2.115 if (!g_str_has_prefix(snext,"tis") &&
2.116 !g_str_has_prefix(snext,"Tis"))
2.117 - counters->open_single_quote++;
2.118 + increment_matching(counters,c,TRUE);
2.119 }
2.120 else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
2.121 g_unichar_isalpha(g_utf8_get_char(snext)))
2.122 /* Do nothing! it's definitely an apostrophe, not a quote */
2.123 ;
2.124 /* it's outside a word - let's check it out */
2.125 - else if (c==CHAR_OPEN_SQUOTE ||
2.126 + else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||
2.127 g_unichar_isalpha(g_utf8_get_char(snext)))
2.128 {
2.129 /* it damwell better BE an openquote */
2.130 if (!g_str_has_prefix(snext,"tis") &&
2.131 !g_str_has_prefix(snext,"Tis"))
2.132 /* hardcode a very common exception! */
2.133 - counters->open_single_quote++;
2.134 + increment_matching(counters,c,TRUE);
2.135 }
2.136 else
2.137 {
2.138 @@ -926,7 +850,7 @@
2.139 guessquote+=8; /* looks like a closequote */
2.140 else
2.141 guessquote++;
2.142 - if (counters->open_single_quote>counters->close_single_quote)
2.143 + if (matching_difference(counters,CHAR_SQUOTE)>0)
2.144 /*
2.145 * Give it the benefit of some doubt,
2.146 * if a squote is already open.
2.147 @@ -935,7 +859,7 @@
2.148 else
2.149 guessquote--;
2.150 if (guessquote>=0)
2.151 - counters->close_single_quote++;
2.152 + increment_matching(counters,c,FALSE);
2.153 }
2.154 }
2.155 if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
2.156 @@ -943,18 +867,11 @@
2.157 isemptyline=FALSE; /* ignore lines like * * * as spacers */
2.158 if (c==CHAR_UNDERSCORE)
2.159 counters->c_unders++;
2.160 - if (c==CHAR_OPEN_CBRACK)
2.161 - counters->c_brack++;
2.162 - if (c==CHAR_CLOSE_CBRACK)
2.163 - counters->c_brack--;
2.164 - if (c==CHAR_OPEN_RBRACK)
2.165 - counters->r_brack++;
2.166 - if (c==CHAR_CLOSE_RBRACK)
2.167 - counters->r_brack--;
2.168 - if (c==CHAR_OPEN_SBRACK)
2.169 - counters->s_brack++;
2.170 - if (c==CHAR_CLOSE_SBRACK)
2.171 - counters->s_brack--;
2.172 + if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK || c==CHAR_OPEN_SBRACK)
2.173 + increment_matching(counters,c,TRUE);
2.174 + if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK ||
2.175 + c==CHAR_CLOSE_SBRACK)
2.176 + increment_matching(counters,c,FALSE);
2.177 sprev=s;
2.178 s=snext;
2.179 }
2.180 @@ -1423,12 +1340,12 @@
2.181 */
2.182 void check_for_extra_period(const char *aline,const struct warnings *warnings)
2.183 {
2.184 - const char *s,*t,*s1;
2.185 + const char *s,*t,*s1,*sprev;
2.186 int i;
2.187 gsize len;
2.188 gboolean istypo;
2.189 gchar *testword;
2.190 - gunichar *decomposition;
2.191 + gunichar c,nc,pc,*decomposition;
2.192 if (pswit[PARANOID_SWITCH])
2.193 {
2.194 for (t=aline;t=strstr(t,". ");)
2.195 @@ -1452,8 +1369,9 @@
2.196 c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
2.197 c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
2.198 c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
2.199 - if (c2==CHAR_SQUOTE && g_unichar_islower(c3) &&
2.200 - c4==CHAR_SPACE && g_unichar_isupper(c5))
2.201 + if (CHAR_IS_APOSTROPHE(c2) &&
2.202 + g_unichar_islower(c3) && c4==CHAR_SPACE &&
2.203 + g_unichar_isupper(c5))
2.204 {
2.205 t=g_utf8_next_char(t);
2.206 continue;
2.207 @@ -1468,14 +1386,22 @@
2.208 /* we have something to investigate */
2.209 istypo=TRUE;
2.210 /* so let's go back and find out */
2.211 - for (s1=g_utf8_prev_char(t);s1>=aline &&
2.212 - (g_unichar_isalpha(g_utf8_get_char(s1)) ||
2.213 - g_unichar_isdigit(g_utf8_get_char(s1)) ||
2.214 - g_utf8_get_char(s1)==CHAR_SQUOTE &&
2.215 - g_unichar_isalpha(g_utf8_get_char(g_utf8_next_char(s1))) &&
2.216 - g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s1))));
2.217 - s1=g_utf8_prev_char(s1))
2.218 - ;
2.219 + nc=g_utf8_get_char(t);
2.220 + s1=g_utf8_prev_char(t);
2.221 + c=g_utf8_get_char(s1);
2.222 + sprev=g_utf8_prev_char(s1);
2.223 + pc=g_utf8_get_char(sprev);
2.224 + while (s1>=aline &&
2.225 + (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||
2.226 + g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&
2.227 + g_unichar_isalpha(nc)))
2.228 + {
2.229 + nc=c;
2.230 + s1=sprev;
2.231 + c=pc;
2.232 + sprev=g_utf8_prev_char(s1);
2.233 + pc=g_utf8_get_char(sprev);
2.234 + }
2.235 s1=g_utf8_next_char(s1);
2.236 s=strchr(s1,'.');
2.237 if (s)
2.238 @@ -1600,7 +1526,7 @@
2.239 gchar *testword;
2.240 int i,vowel,consonant,*dupcnt;
2.241 gboolean isdup,istypo,alower;
2.242 - gunichar c;
2.243 + gunichar c,pc;
2.244 long offset,len;
2.245 gsize decomposition_len;
2.246 for (s=aline;*s;)
2.247 @@ -1646,11 +1572,14 @@
2.248 * French contractions like l'Abbe
2.249 */
2.250 offset=g_utf8_pointer_to_offset(inword,t);
2.251 + if (offset>0)
2.252 + pc=g_utf8_get_char(g_utf8_prev_char(t));
2.253 + else
2.254 + pc='\0';
2.255 if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
2.256 offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
2.257 g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
2.258 - offset>0 &&
2.259 - g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SQUOTE)
2.260 + CHAR_IS_APOSTROPHE(pc))
2.261 ; /* do nothing! */
2.262 else
2.263 istypo=TRUE;
2.264 @@ -2050,8 +1979,7 @@
2.265 {
2.266 c=nc;
2.267 nc=g_utf8_get_char(g_utf8_next_char(s));
2.268 - if ((c==CHAR_SQUOTE || c==CHAR_OPEN_SQUOTE) && (s==aline ||
2.269 - s>aline &&
2.270 + if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&
2.271 !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
2.272 !g_unichar_isalpha(nc)))
2.273 {
2.274 @@ -2166,7 +2094,11 @@
2.275 */
2.276 void check_for_spaced_quotes(const char *aline)
2.277 {
2.278 + int i;
2.279 const char *s,*t;
2.280 + const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,
2.281 + CHAR_RS_QUOTE};
2.282 + GString *pattern;
2.283 s=aline;
2.284 while ((t=strstr(s," \" ")))
2.285 {
2.286 @@ -2179,30 +2111,26 @@
2.287 cnt_punct++;
2.288 s=g_utf8_next_char(g_utf8_next_char(t));
2.289 }
2.290 - s=aline;
2.291 - while ((t=strstr(s," ' ")))
2.292 + pattern=g_string_new(NULL);
2.293 + for(i=0;i<G_N_ELEMENTS(single_quotes);i++)
2.294 {
2.295 - if (pswit[ECHO_SWITCH])
2.296 - g_print("\n%s\n",aline);
2.297 - if (!pswit[OVERVIEW_SWITCH])
2.298 - g_print(" Line %ld column %ld - Spaced singlequote?\n",
2.299 - linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2.300 - else
2.301 - cnt_punct++;
2.302 - s=g_utf8_next_char(g_utf8_next_char(t));
2.303 + g_string_assign(pattern," ");
2.304 + g_string_append_unichar(pattern,single_quotes[i]);
2.305 + g_string_append_c(pattern,' ');
2.306 + s=aline;
2.307 + while ((t=strstr(s,pattern->str)))
2.308 + {
2.309 + if (pswit[ECHO_SWITCH])
2.310 + g_print("\n%s\n",aline);
2.311 + if (!pswit[OVERVIEW_SWITCH])
2.312 + g_print(" Line %ld column %ld - Spaced singlequote?\n",
2.313 + linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2.314 + else
2.315 + cnt_punct++;
2.316 + s=g_utf8_next_char(g_utf8_next_char(t));
2.317 + }
2.318 }
2.319 - s=aline;
2.320 - while ((t=strstr(s," ` ")))
2.321 - {
2.322 - if (pswit[ECHO_SWITCH])
2.323 - g_print("\n%s\n",aline);
2.324 - if (!pswit[OVERVIEW_SWITCH])
2.325 - g_print(" Line %ld column %ld - Spaced singlequote?\n",
2.326 - linecnt,g_utf8_pointer_to_offset(aline,t)+1);
2.327 - else
2.328 - cnt_punct++;
2.329 - s=g_utf8_next_char(g_utf8_next_char(t));
2.330 - }
2.331 + g_string_free(pattern,TRUE);
2.332 }
2.333
2.334 /*
2.335 @@ -2223,7 +2151,7 @@
2.336 pc=c;
2.337 c=nc;
2.338 nc=g_utf8_get_char(g_utf8_next_char(s));
2.339 - if (c==CHAR_SQUOTE && nc=='S' && g_unichar_islower(pc))
2.340 + if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))
2.341 {
2.342 if (pswit[ECHO_SWITCH])
2.343 g_print("\n%s\n",aline);
2.344 @@ -2255,8 +2183,7 @@
2.345 s=g_utf8_prev_char(aline+lbytes);
2.346 c1=g_utf8_get_char(s);
2.347 c2=g_utf8_get_char(g_utf8_prev_char(s));
2.348 - if ((c1==CHAR_DQUOTE || c1==CHAR_SQUOTE || c1==CHAR_OPEN_SQUOTE) &&
2.349 - c2==CHAR_SPACE)
2.350 + if ((c1==CHAR_DQUOTE || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)
2.351 {
2.352 if (pswit[ECHO_SWITCH])
2.353 g_print("\n%s\n",aline);
2.354 @@ -2268,7 +2195,7 @@
2.355 }
2.356 c1=g_utf8_get_char(aline);
2.357 c2=g_utf8_get_char(g_utf8_next_char(aline));
2.358 - if ((c1==CHAR_SQUOTE || c1==CHAR_OPEN_SQUOTE) && c2==CHAR_SPACE)
2.359 + if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)
2.360 {
2.361 if (pswit[ECHO_SWITCH])
2.362 g_print("\n%s\n",aline);
2.363 @@ -2470,8 +2397,7 @@
2.364 }
2.365 if (pending->squote)
2.366 {
2.367 - if (c!=CHAR_SQUOTE && c!=CHAR_OPEN_SQUOTE || pswit[QPARA_SWITCH] ||
2.368 - pending->squot)
2.369 + if (!CHAR_IS_SQUOTE(c) || pswit[QPARA_SWITCH] || pending->squot)
2.370 {
2.371 if (!pswit[OVERVIEW_SWITCH])
2.372 {
2.373 @@ -2558,28 +2484,39 @@
2.374 void check_for_mismatched_quotes(const struct counters *counters,
2.375 struct pending *pending)
2.376 {
2.377 + int squote_straight,squote_curved;
2.378 if (counters->quot%2)
2.379 pending->dquote=
2.380 g_strdup_printf(" Line %ld - Mismatched quotes",linecnt);
2.381 - if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&
2.382 - counters->open_single_quote!=counters->close_single_quote)
2.383 - pending->squote=
2.384 - g_strdup_printf(" Line %ld - Mismatched singlequotes?",linecnt);
2.385 - if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&
2.386 - counters->open_single_quote!=counters->close_single_quote &&
2.387 - counters->open_single_quote!=counters->close_single_quote+1)
2.388 - /*
2.389 - * Flag it to be noted regardless of the
2.390 - * first char of the next para.
2.391 - */
2.392 - pending->squot=1;
2.393 - if (counters->r_brack)
2.394 + if (pswit[SQUOTE_SWITCH])
2.395 + {
2.396 + if (matching_count(counters,CHAR_SQUOTE,TRUE))
2.397 + squote_straight=matching_difference(counters,CHAR_SQUOTE);
2.398 + else
2.399 + squote_straight=0;
2.400 + if (matching_count(counters,CHAR_LS_QUOTE,TRUE))
2.401 + squote_curved=matching_difference(counters,CHAR_LS_QUOTE);
2.402 + else
2.403 + squote_curved=0;
2.404 + if (squote_straight || squote_curved)
2.405 + pending->squote=
2.406 + g_strdup_printf(" Line %ld - Mismatched singlequotes?",
2.407 + linecnt);
2.408 + if (squote_straight && squote_straight!=1 ||
2.409 + squote_curved && squote_curved!=1)
2.410 + /*
2.411 + * Flag it to be noted regardless of the
2.412 + * first char of the next para.
2.413 + */
2.414 + pending->squot=1;
2.415 + }
2.416 + if (matching_difference(counters,CHAR_OPEN_RBRACK))
2.417 pending->rbrack=
2.418 g_strdup_printf(" Line %ld - Mismatched round brackets?",linecnt);
2.419 - if (counters->s_brack)
2.420 + if (matching_difference(counters,CHAR_OPEN_SBRACK))
2.421 pending->sbrack=
2.422 g_strdup_printf(" Line %ld - Mismatched square brackets?",linecnt);
2.423 - if (counters->c_brack)
2.424 + if (matching_difference(counters,CHAR_OPEN_CBRACK))
2.425 pending->cbrack=
2.426 g_strdup_printf(" Line %ld - Mismatched curly brackets?",linecnt);
2.427 if (counters->c_unders%2)
2.428 @@ -2603,6 +2540,7 @@
2.429 {
2.430 gboolean letter_on_line=FALSE;
2.431 const char *s;
2.432 + gunichar c;
2.433 for (s=prevline;*s;s=g_utf8_next_char(s))
2.434 if (g_unichar_isalpha(g_utf8_get_char(s)))
2.435 {
2.436 @@ -2619,12 +2557,12 @@
2.437 if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
2.438 g_utf8_get_char(prevline)>CHAR_SPACE)
2.439 {
2.440 - for (s=g_utf8_prev_char(prevline+strlen(prevline));
2.441 - (g_utf8_get_char(s)==CHAR_DQUOTE ||
2.442 - g_utf8_get_char(s)==CHAR_SQUOTE) &&
2.443 - g_utf8_get_char(s)>CHAR_SPACE && s>prevline;
2.444 - s=g_utf8_prev_char(s))
2.445 - ;
2.446 + s=prevline+strlen(prevline);
2.447 + do
2.448 + {
2.449 + s=g_utf8_prev_char(s);
2.450 + c=g_utf8_get_char(s);
2.451 + } while (CHAR_IS_CLOSING_QUOTE(c) && c>CHAR_SPACE && s>prevline);
2.452 for (;s>prevline;s=g_utf8_prev_char(s))
2.453 {
2.454 if (g_unichar_isalpha(g_utf8_get_char(s)))
2.455 @@ -2857,6 +2795,7 @@
2.456 g_tree_foreach(qword,report_duplicate_queries,NULL);
2.457 g_tree_unref(qword);
2.458 g_tree_unref(qperiod);
2.459 + counters_destroy(&counters);
2.460 g_set_print_handler(NULL);
2.461 print_as_windows_1252(NULL);
2.462 if (pswit[MARKUP_SWITCH])
2.463 @@ -3066,10 +3005,10 @@
2.464 }
2.465 /* we didn't find a punctuated number - do the regular getword thing */
2.466 g_string_truncate(word,0);
2.467 - for (;g_unichar_isdigit(g_utf8_get_char(*ptr)) ||
2.468 - g_unichar_isalpha(g_utf8_get_char(*ptr)) ||
2.469 - g_utf8_get_char(*ptr)=='\'';*ptr=g_utf8_next_char(*ptr))
2.470 - g_string_append_unichar(word,g_utf8_get_char(*ptr));
2.471 + c=g_utf8_get_char(*ptr);
2.472 + for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);
2.473 + *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))
2.474 + g_string_append_unichar(word,c);
2.475 return g_string_free(word,FALSE);
2.476 }
2.477
3.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
3.2 +++ b/bookloupe/bookloupe.h Sat Sep 21 23:40:18 2013 +0100
3.3 @@ -0,0 +1,87 @@
3.4 +#ifndef BOOKLOUPE_H
3.5 +#define BOOKLOUPE_H
3.6 +
3.7 +/* special characters */
3.8 +#define CHAR_SPACE 32
3.9 +#define CHAR_TAB 9
3.10 +#define CHAR_LF 10
3.11 +#define CHAR_CR 13
3.12 +#define CHAR_DQUOTE 34
3.13 +#define CHAR_SQUOTE 39
3.14 +#define CHAR_OPEN_SQUOTE 96
3.15 +#define CHAR_TILDE 126
3.16 +#define CHAR_ASTERISK 42
3.17 +#define CHAR_FORESLASH 47
3.18 +#define CHAR_CARAT 94
3.19 +
3.20 +#define CHAR_UNDERSCORE '_'
3.21 +#define CHAR_OPEN_CBRACK '{'
3.22 +#define CHAR_CLOSE_CBRACK '}'
3.23 +#define CHAR_OPEN_RBRACK '('
3.24 +#define CHAR_CLOSE_RBRACK ')'
3.25 +#define CHAR_OPEN_SBRACK '['
3.26 +#define CHAR_CLOSE_SBRACK ']'
3.27 +
3.28 +#define CHAR_LS_QUOTE 0x2018
3.29 +#define CHAR_RS_QUOTE 0x2019
3.30 +
3.31 +#define CHAR_IS_SQUOTE(c) ((c)==CHAR_SQUOTE || (c)==CHAR_OPEN_SQUOTE || \
3.32 + (c)==CHAR_LS_QUOTE || (c)==CHAR_RS_QUOTE)
3.33 +
3.34 +#define CHAR_IS_APOSTROPHE(c) ((c)==CHAR_SQUOTE || (c)==CHAR_RS_QUOTE)
3.35 +
3.36 +#define CHAR_IS_CLOSING_QUOTE(c) \
3.37 + ((c)==CHAR_DQUOTE || (c)==CHAR_SQUOTE || (c)==CHAR_RS_QUOTE)
3.38 +
3.39 +/* longest and shortest normal PG line lengths */
3.40 +#define LONGEST_PG_LINE 75
3.41 +#define WAY_TOO_LONG 80
3.42 +#define SHORTEST_PG_LINE 55
3.43 +
3.44 +enum {
3.45 + ECHO_SWITCH,
3.46 + SQUOTE_SWITCH,
3.47 + TYPO_SWITCH,
3.48 + QPARA_SWITCH,
3.49 + PARANOID_SWITCH,
3.50 + LINE_END_SWITCH,
3.51 + OVERVIEW_SWITCH,
3.52 + STDOUT_SWITCH,
3.53 + HEADER_SWITCH,
3.54 + WEB_SWITCH,
3.55 + VERBOSE_SWITCH,
3.56 + MARKUP_SWITCH,
3.57 + USERTYPO_SWITCH,
3.58 + DP_SWITCH,
3.59 + SWITNO
3.60 +};
3.61 +
3.62 +struct first_pass_results {
3.63 + long firstline,astline;
3.64 + long footerline,totlen,binlen,alphalen,endquote_count,shortline,dotcomma;
3.65 + long fslashline,hyphens,longline,verylongline,htmcount,standalone_digit;
3.66 + long spacedash,emdash,space_emdash,non_PG_space_emdash,PG_space_emdash;
3.67 + int Dutchcount,Frenchcount;
3.68 +};
3.69 +
3.70 +struct warnings {
3.71 + int shortline,longline,bin,dash,dotcomma,ast,fslash,digit,hyphen;
3.72 + int endquote;
3.73 + gboolean isDutch,isFrench;
3.74 +};
3.75 +
3.76 +struct line_properties {
3.77 + unsigned int len,blen;
3.78 + gunichar start;
3.79 +};
3.80 +
3.81 +struct parities {
3.82 + int dquote,squote;
3.83 +};
3.84 +
3.85 +struct pending {
3.86 + char *dquote,*squote,*rbrack,*sbrack,*cbrack,*unders;
3.87 + long squot;
3.88 +};
3.89 +
3.90 +#endif /* BOOKLOUPE_H */
4.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
4.2 +++ b/bookloupe/counters.c Sat Sep 21 23:40:18 2013 +0100
4.3 @@ -0,0 +1,106 @@
4.4 +#include <stdlib.h>
4.5 +#include <glib.h>
4.6 +#include "bookloupe.h"
4.7 +#include "counters.h"
4.8 +
4.9 +struct matching_counter {
4.10 + int open,close;
4.11 +};
4.12 +
4.13 +static struct matching_counter *matching_counter_new(void)
4.14 +{
4.15 + return g_slice_new0(struct matching_counter);
4.16 +}
4.17 +
4.18 +static void matching_counter_free(struct matching_counter *counter)
4.19 +{
4.20 + g_slice_free(struct matching_counter,counter);
4.21 +}
4.22 +
4.23 +static gint compar_unichars(gconstpointer a,gconstpointer b,gpointer unused)
4.24 +{
4.25 + /*
4.26 + * Unicode code points only go up to 0x10FFFF and thus this cannot overflow.
4.27 + */
4.28 + return GPOINTER_TO_INT(a)-GPOINTER_TO_INT(b);
4.29 +}
4.30 +
4.31 +/*
4.32 + * For matching characters, we maintain a count of the opens and closes.
4.33 + * In the simplest case, we are dealing with a matching pair such as [ and ]
4.34 + * where each instance of [ counts as an open and each instance of ] counts
4.35 + * as a close. matching_key() is responsible for selecting an arbitrary
4.36 + * base character of a matching pair, which is then used as the GTree key.
4.37 + */
4.38 +static gpointer matching_key(gunichar ch)
4.39 +{
4.40 + gunichar mirrored;
4.41 + if (g_unichar_get_mirror_char(ch,&mirrored)) /* e.g. brackets */
4.42 + if (ch<mirrored) /* key on the lower code point of the pair */
4.43 + return GINT_TO_POINTER((gint)ch);
4.44 + else
4.45 + return GINT_TO_POINTER((gint)mirrored);
4.46 + else if (ch==CHAR_SQUOTE || ch==CHAR_OPEN_SQUOTE) /* straight quotes */
4.47 + return GINT_TO_POINTER((gint)CHAR_SQUOTE);
4.48 + else if (ch==CHAR_LS_QUOTE || ch==CHAR_RS_QUOTE) /* curved quotes */
4.49 + return GINT_TO_POINTER((gint)CHAR_LS_QUOTE);
4.50 + else
4.51 + { /* G_GINT32_MODIFIER, not _FORMAT: we want %04X rather than %04iX */
4.52 + g_warning("Matching pair not found for U+%04"G_GINT32_MODIFIER"X",ch);
4.53 + return GINT_TO_POINTER((gint)ch);
4.54 + }
4.55 +}
4.56 +
4.57 +void increment_matching(struct counters *counters,gunichar ch,gboolean open)
4.58 +{
4.59 + gpointer key,orig_key;
4.60 + struct matching_counter *value;
4.61 + if (!counters->matching)
4.62 + counters->matching=g_tree_new_full(compar_unichars,NULL,NULL,
4.63 + (GDestroyNotify)matching_counter_free);
4.64 + key=matching_key(ch);
4.65 + if (!g_tree_lookup_extended(counters->matching,key,&orig_key,
4.66 + (gpointer *)&value))
4.67 + {
4.68 + value=matching_counter_new();
4.69 + g_tree_insert(counters->matching,key,value);
4.70 + }
4.71 + if (open)
4.72 + value->open++;
4.73 + else
4.74 + value->close++;
4.75 +}
4.76 +
4.77 +int matching_count(const struct counters *counters,gunichar ch,gboolean open)
4.78 +{
4.79 + struct matching_counter *value;
4.80 + if (!counters->matching)
4.81 + return 0;
4.82 + value=g_tree_lookup(counters->matching,matching_key(ch));
4.83 + if (!value)
4.84 + return 0;
4.85 + return open?value->open:value->close;
4.86 +}
4.87 +
4.88 +/*
4.89 + * Return open count - closed count
4.90 + */
4.91 +int matching_difference(const struct counters *counters,gunichar ch)
4.92 +{
4.93 + struct matching_counter *value;
4.94 + if (!counters->matching)
4.95 + return 0;
4.96 + value=g_tree_lookup(counters->matching,matching_key(ch));
4.97 + if (!value)
4.98 + return 0;
4.99 + return value->open-value->close;
4.100 +}
4.101 +
4.102 +void counters_destroy(struct counters *counters)
4.103 +{
4.104 + if (counters->matching)
4.105 + {
4.106 + g_tree_destroy(counters->matching);
4.107 + counters->matching=NULL;
4.108 + }
4.109 +}
5.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
5.2 +++ b/bookloupe/counters.h Sat Sep 21 23:40:18 2013 +0100
5.3 @@ -0,0 +1,17 @@
5.4 +#ifndef COUNTERS_H
5.5 +#define COUNTERS_H
5.6 +
5.7 +#include <glib.h>
5.8 +
5.9 +struct counters {
5.10 + GTree *matching;
5.11 + long quot;
5.12 + int c_unders;
5.13 +};
5.14 +
5.15 +void increment_matching(struct counters *counters,gunichar ch,gboolean open);
5.16 +int matching_count(const struct counters *counters,gunichar ch,gboolean open);
5.17 +int matching_difference(const struct counters *counters,gunichar ch);
5.18 +void counters_destroy(struct counters *counters);
5.19 +
5.20 +#endif /* COUNTERS_H */
6.1 --- a/doc/bookloupe.txt Tue Sep 17 20:55:57 2013 +0100
6.2 +++ b/doc/bookloupe.txt Sat Sep 21 23:40:18 2013 +0100
6.3 @@ -77,8 +77,8 @@
6.4 to see all unclosed quotes, even where the next paragraph
6.5 begins with a quote, you should use the -p switch.
6.6
6.7 - Singlequotes (') are a problem, since the same character
6.8 - is used for an apostrophe. I'm not sure that it is
6.9 + Singlequotes (' and ’) are a problem, since the same
6.10 + character is used for an apostrophe. I'm not sure that it is
6.11 possible to get 100% accuracy on singlequotes checking,
6.12 particularly since dialect, quite common in PG texts,
6.13 upsets the normal rules so badly. Consider the sentence:
7.1 --- a/test/bookloupe/Makefile.am Tue Sep 17 20:55:57 2013 +0100
7.2 +++ b/test/bookloupe/Makefile.am Sat Sep 21 23:40:18 2013 +0100
7.3 @@ -1,4 +1,5 @@
7.4 TESTS_ENVIRONMENT=BOOKLOUPE=../../bookloupe/bookloupe ../harness/loupe-test
7.5 -TESTS=non-ascii.tst long-line.tst
7.6 +TESTS=non-ascii.tst long-line.tst curved-single-quotes.tst \
7.7 + curved-genitives.tst
7.8
7.9 dist_pkgdata_DATA=$(TESTS)
8.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
8.2 +++ b/test/bookloupe/curved-genitives.tst Sat Sep 21 23:40:18 2013 +0100
8.3 @@ -0,0 +1,12 @@
8.4 +**************** INPUT ****************
8.5 +The genitive case of single nouns is normally formed like this:
8.6 +
8.7 +The fireworks known as Serpent’s Eggs, or PHARAOH’S SERPENTS.
8.8 +
8.9 +What should never happen is something like this:
8.10 +
8.11 +At this suggestion Nellie’S face grew crimson.
8.12 +**************** EXPECTED ****************
8.13 +
8.14 +At this suggestion Nellie’S face grew crimson.
8.15 + Line 7 column 27 - Capital "S"?
9.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
9.2 +++ b/test/bookloupe/curved-single-quotes.tst Sat Sep 21 23:40:18 2013 +0100
9.3 @@ -0,0 +1,55 @@
9.4 +**************** OPTIONS ****************
9.5 +-s
9.6 +**************** INPUT ****************
9.7 +‘Now you should start for school, ’ Margaret said.
9.8 +
9.9 +‘In a moment,’ Peter replied,‘ I'm just coming.’
9.10 +
9.11 +‘Come on,’shouted Jane.
9.12 +
9.13 +‘Alright,’ said Peter, ‘Keep your hair on.
9.14 +’ He looked down as he came round the corner.
9.15 +‘Where's my coat? ’
9.16 +
9.17 +`Underneath the girls’ scarves.’ said his mother.
9.18 +
9.19 +Grabbing it, he joined the others as they set out.
9.20 +**************** WARNINGS ****************
9.21 +<expected>
9.22 + <error>
9.23 + <at line="1" column="34"/>
9.24 + <text>Spaced singlequote?</text>
9.25 + </error>
9.26 + <error>
9.27 + <at line="3" column="30"/>
9.28 + <text>Wrongspaced singlequotes?</text>
9.29 + </error>
9.30 + <error>
9.31 + <at line="5" column="10"/>
9.32 + <text>Wrongspaced singlequotes?</text>
9.33 + </error>
9.34 + <false-positive>
9.35 + <at line="6"/>
9.36 + <text>Mismatched singlequotes?</text>
9.37 + </false-positive>
9.38 + <error>
9.39 + <at line="8" column="1"/>
9.40 + <text>Spaced quote?</text>
9.41 + </error>
9.42 + <error>
9.43 + <at line="9" column="19"/>
9.44 + <text>Spaced quote?</text>
9.45 + </error>
9.46 + <false-positive>
9.47 + <at line="10"/>
9.48 + <text>Mismatched singlequotes?</text>
9.49 + </false-positive>
9.50 + <false-positive>
9.51 + <at line="11" column="32"/>
9.52 + <text>Wrongspaced singlequotes?</text>
9.53 + </false-positive>
9.54 + <error>
9.55 + <at line="12"/>
9.56 + <text>Mismatched singlequotes?</text>
9.57 + </error>
9.58 +</expected>
10.1 --- a/test/compatibility/Makefile.am Tue Sep 17 20:55:57 2013 +0100
10.2 +++ b/test/compatibility/Makefile.am Sat Sep 21 23:40:18 2013 +0100
10.3 @@ -7,6 +7,6 @@
10.4 dashes.tst control-characters.tst unusual-characters.tst \
10.5 windows-1252.tst periods.tst long-line.tst unmarked-paragraph.tst \
10.6 hebe-jeebies.tst mail-from.tst scannos.tst before-comma.tst \
10.7 - before-period.tst double-punctuation.tst genatives.tst embedded-cr.tst
10.8 + before-period.tst double-punctuation.tst genitives.tst embedded-cr.tst
10.9
10.10 dist_pkgdata_DATA=$(TESTS)
11.1 --- a/test/compatibility/genatives.tst Tue Sep 17 20:55:57 2013 +0100
11.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
11.3 @@ -1,12 +0,0 @@
11.4 -**************** INPUT ****************
11.5 -The genative case of single nouns is normally formed like this:
11.6 -
11.7 -The fireworks known as Serpent's Eggs, or PHARAOH'S SERPENTS.
11.8 -
11.9 -What should never happen is something like this:
11.10 -
11.11 -At this suggestion Nellie'S face grew crimson.
11.12 -**************** EXPECTED ****************
11.13 -
11.14 -At this suggestion Nellie'S face grew crimson.
11.15 - Line 7 column 27 - Capital "S"?
12.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
12.2 +++ b/test/compatibility/genitives.tst Sat Sep 21 23:40:18 2013 +0100
12.3 @@ -0,0 +1,12 @@
12.4 +**************** INPUT ****************
12.5 +The genitive case of single nouns is normally formed like this:
12.6 +
12.7 +The fireworks known as Serpent's Eggs, or PHARAOH'S SERPENTS.
12.8 +
12.9 +What should never happen is something like this:
12.10 +
12.11 +At this suggestion Nellie'S face grew crimson.
12.12 +**************** EXPECTED ****************
12.13 +
12.14 +At this suggestion Nellie'S face grew crimson.
12.15 + Line 7 column 27 - Capital "S"?