1.1 --- a/bookloupe/bookloupe.c Sat Sep 07 08:38:13 2013 +0100
1.2 +++ b/bookloupe/bookloupe.c Sat Sep 21 23:40:18 2013 +0100
1.3 @@ -27,6 +27,8 @@
1.4 #endif
1.5 #include <glib.h>
1.6 #include <bl/bl.h>
1.7 +#include "bookloupe.h"
1.8 +#include "counters.h"
1.9 #include "HTMLentities.h"
1.10
1.11 gchar *prevline;
1.12 @@ -123,50 +125,6 @@
1.13 "among", "those", "into", "whom", "having", "thence", ""
1.14 };
1.15
1.16 -/* special characters */
1.17 -#define CHAR_SPACE 32
1.18 -#define CHAR_TAB 9
1.19 -#define CHAR_LF 10
1.20 -#define CHAR_CR 13
1.21 -#define CHAR_DQUOTE 34
1.22 -#define CHAR_SQUOTE 39
1.23 -#define CHAR_OPEN_SQUOTE 96
1.24 -#define CHAR_TILDE 126
1.25 -#define CHAR_ASTERISK 42
1.26 -#define CHAR_FORESLASH 47
1.27 -#define CHAR_CARAT 94
1.28 -
1.29 -#define CHAR_UNDERSCORE '_'
1.30 -#define CHAR_OPEN_CBRACK '{'
1.31 -#define CHAR_CLOSE_CBRACK '}'
1.32 -#define CHAR_OPEN_RBRACK '('
1.33 -#define CHAR_CLOSE_RBRACK ')'
1.34 -#define CHAR_OPEN_SBRACK '['
1.35 -#define CHAR_CLOSE_SBRACK ']'
1.36 -
1.37 -/* longest and shortest normal PG line lengths */
1.38 -#define LONGEST_PG_LINE 75
1.39 -#define WAY_TOO_LONG 80
1.40 -#define SHORTEST_PG_LINE 55
1.41 -
1.42 -enum {
1.43 - ECHO_SWITCH,
1.44 - SQUOTE_SWITCH,
1.45 - TYPO_SWITCH,
1.46 - QPARA_SWITCH,
1.47 - PARANOID_SWITCH,
1.48 - LINE_END_SWITCH,
1.49 - OVERVIEW_SWITCH,
1.50 - STDOUT_SWITCH,
1.51 - HEADER_SWITCH,
1.52 - WEB_SWITCH,
1.53 - VERBOSE_SWITCH,
1.54 - MARKUP_SWITCH,
1.55 - USERTYPO_SWITCH,
1.56 - DP_SWITCH,
1.57 - SWITNO
1.58 -};
1.59 -
1.60 gboolean pswit[SWITNO]; /* program switches */
1.61
1.62 static GOptionEntry options[]={
1.63 @@ -242,40 +200,6 @@
1.64 UINT saved_cp;
1.65 #endif
1.66
1.67 -struct first_pass_results {
1.68 - long firstline,astline;
1.69 - long footerline,totlen,binlen,alphalen,endquote_count,shortline,dotcomma;
1.70 - long fslashline,hyphens,longline,verylongline,htmcount,standalone_digit;
1.71 - long spacedash,emdash,space_emdash,non_PG_space_emdash,PG_space_emdash;
1.72 - int Dutchcount,Frenchcount;
1.73 -};
1.74 -
1.75 -struct warnings {
1.76 - int shortline,longline,bin,dash,dotcomma,ast,fslash,digit,hyphen;
1.77 - int endquote;
1.78 - gboolean isDutch,isFrench;
1.79 -};
1.80 -
1.81 -struct counters {
1.82 - long quot;
1.83 - int c_unders,c_brack,s_brack,r_brack;
1.84 - int open_single_quote,close_single_quote;
1.85 -};
1.86 -
1.87 -struct line_properties {
1.88 - unsigned int len,blen;
1.89 - gunichar start;
1.90 -};
1.91 -
1.92 -struct parities {
1.93 - int dquote,squote;
1.94 -};
1.95 -
1.96 -struct pending {
1.97 - char *dquote,*squote,*rbrack,*sbrack,*cbrack,*unders;
1.98 - long squot;
1.99 -};
1.100 -
1.101 void parse_options(int *argc,char ***argv)
1.102 {
1.103 GError *err=NULL;
1.104 @@ -877,7 +801,7 @@
1.105 c=g_utf8_get_char(s);
1.106 if (c==CHAR_DQUOTE)
1.107 counters->quot++;
1.108 - if (c==CHAR_SQUOTE || c==CHAR_OPEN_SQUOTE)
1.109 + if (CHAR_IS_SQUOTE(c))
1.110 {
1.111 if (s==aline)
1.112 {
1.113 @@ -887,21 +811,21 @@
1.114 */
1.115 if (!g_str_has_prefix(snext,"tis") &&
1.116 !g_str_has_prefix(snext,"Tis"))
1.117 - counters->open_single_quote++;
1.118 + increment_matching(counters,c,TRUE);
1.119 }
1.120 else if (g_unichar_isalpha(g_utf8_get_char(sprev)) &&
1.121 g_unichar_isalpha(g_utf8_get_char(snext)))
1.122 /* Do nothing! it's definitely an apostrophe, not a quote */
1.123 ;
1.124 /* it's outside a word - let's check it out */
1.125 - else if (c==CHAR_OPEN_SQUOTE ||
1.126 + else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE ||
1.127 g_unichar_isalpha(g_utf8_get_char(snext)))
1.128 {
1.129 /* it damwell better BE an openquote */
1.130 if (!g_str_has_prefix(snext,"tis") &&
1.131 !g_str_has_prefix(snext,"Tis"))
1.132 /* hardcode a very common exception! */
1.133 - counters->open_single_quote++;
1.134 + increment_matching(counters,c,TRUE);
1.135 }
1.136 else
1.137 {
1.138 @@ -926,7 +850,7 @@
1.139 guessquote+=8; /* looks like a closequote */
1.140 else
1.141 guessquote++;
1.142 - if (counters->open_single_quote>counters->close_single_quote)
1.143 + if (matching_difference(counters,CHAR_SQUOTE)>0)
1.144 /*
1.145 * Give it the benefit of some doubt,
1.146 * if a squote is already open.
1.147 @@ -935,7 +859,7 @@
1.148 else
1.149 guessquote--;
1.150 if (guessquote>=0)
1.151 - counters->close_single_quote++;
1.152 + increment_matching(counters,c,FALSE);
1.153 }
1.154 }
1.155 if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK &&
1.156 @@ -943,18 +867,11 @@
1.157 isemptyline=FALSE; /* ignore lines like * * * as spacers */
1.158 if (c==CHAR_UNDERSCORE)
1.159 counters->c_unders++;
1.160 - if (c==CHAR_OPEN_CBRACK)
1.161 - counters->c_brack++;
1.162 - if (c==CHAR_CLOSE_CBRACK)
1.163 - counters->c_brack--;
1.164 - if (c==CHAR_OPEN_RBRACK)
1.165 - counters->r_brack++;
1.166 - if (c==CHAR_CLOSE_RBRACK)
1.167 - counters->r_brack--;
1.168 - if (c==CHAR_OPEN_SBRACK)
1.169 - counters->s_brack++;
1.170 - if (c==CHAR_CLOSE_SBRACK)
1.171 - counters->s_brack--;
1.172 + if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK || c==CHAR_OPEN_SBRACK)
1.173 + increment_matching(counters,c,TRUE);
1.174 + if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK ||
1.175 + c==CHAR_CLOSE_SBRACK)
1.176 + increment_matching(counters,c,FALSE);
1.177 sprev=s;
1.178 s=snext;
1.179 }
1.180 @@ -1423,12 +1340,12 @@
1.181 */
1.182 void check_for_extra_period(const char *aline,const struct warnings *warnings)
1.183 {
1.184 - const char *s,*t,*s1;
1.185 + const char *s,*t,*s1,*sprev;
1.186 int i;
1.187 gsize len;
1.188 gboolean istypo;
1.189 gchar *testword;
1.190 - gunichar *decomposition;
1.191 + gunichar c,nc,pc,*decomposition;
1.192 if (pswit[PARANOID_SWITCH])
1.193 {
1.194 for (t=aline;t=strstr(t,". ");)
1.195 @@ -1452,8 +1369,9 @@
1.196 c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3));
1.197 c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4));
1.198 c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5));
1.199 - if (c2==CHAR_SQUOTE && g_unichar_islower(c3) &&
1.200 - c4==CHAR_SPACE && g_unichar_isupper(c5))
1.201 + if (CHAR_IS_APOSTROPHE(c2) &&
1.202 + g_unichar_islower(c3) && c4==CHAR_SPACE &&
1.203 + g_unichar_isupper(c5))
1.204 {
1.205 t=g_utf8_next_char(t);
1.206 continue;
1.207 @@ -1468,14 +1386,22 @@
1.208 /* we have something to investigate */
1.209 istypo=TRUE;
1.210 /* so let's go back and find out */
1.211 - for (s1=g_utf8_prev_char(t);s1>=aline &&
1.212 - (g_unichar_isalpha(g_utf8_get_char(s1)) ||
1.213 - g_unichar_isdigit(g_utf8_get_char(s1)) ||
1.214 - g_utf8_get_char(s1)==CHAR_SQUOTE &&
1.215 - g_unichar_isalpha(g_utf8_get_char(g_utf8_next_char(s1))) &&
1.216 - g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s1))));
1.217 - s1=g_utf8_prev_char(s1))
1.218 - ;
1.219 + nc=g_utf8_get_char(t);
1.220 + s1=g_utf8_prev_char(t);
1.221 + c=g_utf8_get_char(s1);
1.222 + sprev=g_utf8_prev_char(s1);
1.223 + pc=g_utf8_get_char(sprev);
1.224 + while (s1>=aline &&
1.225 + (g_unichar_isalpha(c) || g_unichar_isdigit(c) ||
1.226 + g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) &&
1.227 + g_unichar_isalpha(nc)))
1.228 + {
1.229 + nc=c;
1.230 + s1=sprev;
1.231 + c=pc;
1.232 + sprev=g_utf8_prev_char(s1);
1.233 + pc=g_utf8_get_char(sprev);
1.234 + }
1.235 s1=g_utf8_next_char(s1);
1.236 s=strchr(s1,'.');
1.237 if (s)
1.238 @@ -1600,7 +1526,7 @@
1.239 gchar *testword;
1.240 int i,vowel,consonant,*dupcnt;
1.241 gboolean isdup,istypo,alower;
1.242 - gunichar c;
1.243 + gunichar c,pc;
1.244 long offset,len;
1.245 gsize decomposition_len;
1.246 for (s=aline;*s;)
1.247 @@ -1646,11 +1572,14 @@
1.248 * French contractions like l'Abbe
1.249 */
1.250 offset=g_utf8_pointer_to_offset(inword,t);
1.251 + if (offset>0)
1.252 + pc=g_utf8_get_char(g_utf8_prev_char(t));
1.253 + else
1.254 + pc='\0';
1.255 if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' ||
1.256 offset==3 && c=='m' && g_utf8_get_char(nt)=='a' &&
1.257 g_utf8_get_char(g_utf8_next_char(nt))=='c' ||
1.258 - offset>0 &&
1.259 - g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SQUOTE)
1.260 + CHAR_IS_APOSTROPHE(pc))
1.261 ; /* do nothing! */
1.262 else
1.263 istypo=TRUE;
1.264 @@ -2050,8 +1979,7 @@
1.265 {
1.266 c=nc;
1.267 nc=g_utf8_get_char(g_utf8_next_char(s));
1.268 - if ((c==CHAR_SQUOTE || c==CHAR_OPEN_SQUOTE) && (s==aline ||
1.269 - s>aline &&
1.270 + if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline &&
1.271 !g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) ||
1.272 !g_unichar_isalpha(nc)))
1.273 {
1.274 @@ -2166,7 +2094,11 @@
1.275 */
1.276 void check_for_spaced_quotes(const char *aline)
1.277 {
1.278 + int i;
1.279 const char *s,*t;
1.280 + const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE,
1.281 + CHAR_RS_QUOTE};
1.282 + GString *pattern;
1.283 s=aline;
1.284 while ((t=strstr(s," \" ")))
1.285 {
1.286 @@ -2179,30 +2111,26 @@
1.287 cnt_punct++;
1.288 s=g_utf8_next_char(g_utf8_next_char(t));
1.289 }
1.290 - s=aline;
1.291 - while ((t=strstr(s," ' ")))
1.292 + pattern=g_string_new(NULL);
1.293 + for(i=0;i<G_N_ELEMENTS(single_quotes);i++)
1.294 {
1.295 - if (pswit[ECHO_SWITCH])
1.296 - g_print("\n%s\n",aline);
1.297 - if (!pswit[OVERVIEW_SWITCH])
1.298 - g_print(" Line %ld column %ld - Spaced singlequote?\n",
1.299 - linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1.300 - else
1.301 - cnt_punct++;
1.302 - s=g_utf8_next_char(g_utf8_next_char(t));
1.303 + g_string_assign(pattern," ");
1.304 + g_string_append_unichar(pattern,single_quotes[i]);
1.305 + g_string_append_c(pattern,' ');
1.306 + s=aline;
1.307 + while ((t=strstr(s,pattern->str)))
1.308 + {
1.309 + if (pswit[ECHO_SWITCH])
1.310 + g_print("\n%s\n",aline);
1.311 + if (!pswit[OVERVIEW_SWITCH])
1.312 + g_print(" Line %ld column %ld - Spaced singlequote?\n",
1.313 + linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1.314 + else
1.315 + cnt_punct++;
1.316 + s=g_utf8_next_char(g_utf8_next_char(t));
1.317 + }
1.318 }
1.319 - s=aline;
1.320 - while ((t=strstr(s," ` ")))
1.321 - {
1.322 - if (pswit[ECHO_SWITCH])
1.323 - g_print("\n%s\n",aline);
1.324 - if (!pswit[OVERVIEW_SWITCH])
1.325 - g_print(" Line %ld column %ld - Spaced singlequote?\n",
1.326 - linecnt,g_utf8_pointer_to_offset(aline,t)+1);
1.327 - else
1.328 - cnt_punct++;
1.329 - s=g_utf8_next_char(g_utf8_next_char(t));
1.330 - }
1.331 + g_string_free(pattern,TRUE);
1.332 }
1.333
1.334 /*
1.335 @@ -2223,7 +2151,7 @@
1.336 pc=c;
1.337 c=nc;
1.338 nc=g_utf8_get_char(g_utf8_next_char(s));
1.339 - if (c==CHAR_SQUOTE && nc=='S' && g_unichar_islower(pc))
1.340 + if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc))
1.341 {
1.342 if (pswit[ECHO_SWITCH])
1.343 g_print("\n%s\n",aline);
1.344 @@ -2255,8 +2183,7 @@
1.345 s=g_utf8_prev_char(aline+lbytes);
1.346 c1=g_utf8_get_char(s);
1.347 c2=g_utf8_get_char(g_utf8_prev_char(s));
1.348 - if ((c1==CHAR_DQUOTE || c1==CHAR_SQUOTE || c1==CHAR_OPEN_SQUOTE) &&
1.349 - c2==CHAR_SPACE)
1.350 + if ((c1==CHAR_DQUOTE || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE)
1.351 {
1.352 if (pswit[ECHO_SWITCH])
1.353 g_print("\n%s\n",aline);
1.354 @@ -2268,7 +2195,7 @@
1.355 }
1.356 c1=g_utf8_get_char(aline);
1.357 c2=g_utf8_get_char(g_utf8_next_char(aline));
1.358 - if ((c1==CHAR_SQUOTE || c1==CHAR_OPEN_SQUOTE) && c2==CHAR_SPACE)
1.359 + if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE)
1.360 {
1.361 if (pswit[ECHO_SWITCH])
1.362 g_print("\n%s\n",aline);
1.363 @@ -2470,8 +2397,7 @@
1.364 }
1.365 if (pending->squote)
1.366 {
1.367 - if (c!=CHAR_SQUOTE && c!=CHAR_OPEN_SQUOTE || pswit[QPARA_SWITCH] ||
1.368 - pending->squot)
1.369 + if (!CHAR_IS_SQUOTE(c) || pswit[QPARA_SWITCH] || pending->squot)
1.370 {
1.371 if (!pswit[OVERVIEW_SWITCH])
1.372 {
1.373 @@ -2558,28 +2484,39 @@
1.374 void check_for_mismatched_quotes(const struct counters *counters,
1.375 struct pending *pending)
1.376 {
1.377 + int squote_straight,squote_curved;
1.378 if (counters->quot%2)
1.379 pending->dquote=
1.380 g_strdup_printf(" Line %ld - Mismatched quotes",linecnt);
1.381 - if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&
1.382 - counters->open_single_quote!=counters->close_single_quote)
1.383 - pending->squote=
1.384 - g_strdup_printf(" Line %ld - Mismatched singlequotes?",linecnt);
1.385 - if (pswit[SQUOTE_SWITCH] && counters->open_single_quote &&
1.386 - counters->open_single_quote!=counters->close_single_quote &&
1.387 - counters->open_single_quote!=counters->close_single_quote+1)
1.388 - /*
1.389 - * Flag it to be noted regardless of the
1.390 - * first char of the next para.
1.391 - */
1.392 - pending->squot=1;
1.393 - if (counters->r_brack)
1.394 + if (pswit[SQUOTE_SWITCH])
1.395 + {
1.396 + if (matching_count(counters,CHAR_SQUOTE,TRUE))
1.397 + squote_straight=matching_difference(counters,CHAR_SQUOTE);
1.398 + else
1.399 + squote_straight=0;
1.400 + if (matching_count(counters,CHAR_LS_QUOTE,TRUE))
1.401 + squote_curved=matching_difference(counters,CHAR_LS_QUOTE);
1.402 + else
1.403 + squote_curved=0;
1.404 + if (squote_straight || squote_curved)
1.405 + pending->squote=
1.406 + g_strdup_printf(" Line %ld - Mismatched singlequotes?",
1.407 + linecnt);
1.408 + if (squote_straight && squote_straight!=1 ||
1.409 + squote_curved && squote_curved!=1)
1.410 + /*
1.411 + * Flag it to be noted regardless of the
1.412 + * first char of the next para.
1.413 + */
1.414 + pending->squot=1;
1.415 + }
1.416 + if (matching_difference(counters,CHAR_OPEN_RBRACK))
1.417 pending->rbrack=
1.418 g_strdup_printf(" Line %ld - Mismatched round brackets?",linecnt);
1.419 - if (counters->s_brack)
1.420 + if (matching_difference(counters,CHAR_OPEN_SBRACK))
1.421 pending->sbrack=
1.422 g_strdup_printf(" Line %ld - Mismatched square brackets?",linecnt);
1.423 - if (counters->c_brack)
1.424 + if (matching_difference(counters,CHAR_OPEN_CBRACK))
1.425 pending->cbrack=
1.426 g_strdup_printf(" Line %ld - Mismatched curly brackets?",linecnt);
1.427 if (counters->c_unders%2)
1.428 @@ -2603,6 +2540,7 @@
1.429 {
1.430 gboolean letter_on_line=FALSE;
1.431 const char *s;
1.432 + gunichar c;
1.433 for (s=prevline;*s;s=g_utf8_next_char(s))
1.434 if (g_unichar_isalpha(g_utf8_get_char(s)))
1.435 {
1.436 @@ -2619,12 +2557,12 @@
1.437 if (letter_on_line && last->blen>2 && start_para_line<linecnt-1 &&
1.438 g_utf8_get_char(prevline)>CHAR_SPACE)
1.439 {
1.440 - for (s=g_utf8_prev_char(prevline+strlen(prevline));
1.441 - (g_utf8_get_char(s)==CHAR_DQUOTE ||
1.442 - g_utf8_get_char(s)==CHAR_SQUOTE) &&
1.443 - g_utf8_get_char(s)>CHAR_SPACE && s>prevline;
1.444 - s=g_utf8_prev_char(s))
1.445 - ;
1.446 + s=prevline+strlen(prevline);
1.447 + do
1.448 + {
1.449 + s=g_utf8_prev_char(s);
1.450 + c=g_utf8_get_char(s);
1.451 + } while (CHAR_IS_CLOSING_QUOTE(c) && c>CHAR_SPACE && s>prevline);
1.452 for (;s>prevline;s=g_utf8_prev_char(s))
1.453 {
1.454 if (g_unichar_isalpha(g_utf8_get_char(s)))
1.455 @@ -2857,6 +2795,7 @@
1.456 g_tree_foreach(qword,report_duplicate_queries,NULL);
1.457 g_tree_unref(qword);
1.458 g_tree_unref(qperiod);
1.459 + counters_destroy(&counters);
1.460 g_set_print_handler(NULL);
1.461 print_as_windows_1252(NULL);
1.462 if (pswit[MARKUP_SWITCH])
1.463 @@ -3066,10 +3005,10 @@
1.464 }
1.465 /* we didn't find a punctuated number - do the regular getword thing */
1.466 g_string_truncate(word,0);
1.467 - for (;g_unichar_isdigit(g_utf8_get_char(*ptr)) ||
1.468 - g_unichar_isalpha(g_utf8_get_char(*ptr)) ||
1.469 - g_utf8_get_char(*ptr)=='\'';*ptr=g_utf8_next_char(*ptr))
1.470 - g_string_append_unichar(word,g_utf8_get_char(*ptr));
1.471 + c=g_utf8_get_char(*ptr);
1.472 + for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);
1.473 + *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))
1.474 + g_string_append_unichar(word,c);
1.475 return g_string_free(word,FALSE);
1.476 }
1.477