diff -r 466f43a12118 -r bb31577536d1 bookloupe/bookloupe.c --- a/bookloupe/bookloupe.c Wed Oct 02 23:51:18 2013 +0100 +++ b/bookloupe/bookloupe.c Thu Oct 03 16:09:39 2013 +0100 @@ -431,6 +431,40 @@ return 0; } +void count_dashes(const char *line,const char *dash, + struct dash_results *results) +{ + int i; + gchar **tokens; + gunichar pc,nc; + gboolean spaced=FALSE,unspaced=FALSE,spaced2=FALSE; + if (!*line) + return; + tokens=g_strsplit(line,dash,0); + if (tokens[1]) + results->base++; + for(i=1;tokens[i];i++) + { + pc=g_utf8_get_char(g_utf8_prev_char(tokens[i-1]+strlen(tokens[i-1]))); + nc=g_utf8_get_char(tokens[i]); + if (g_unichar_isspace(pc) || g_unichar_isspace(nc)) + spaced=TRUE; + if (g_unichar_isspace(pc) && g_unichar_isspace(nc)) + spaced2=TRUE; + else if (!g_unichar_isspace(pc) && !g_unichar_isspace(nc)) + unspaced=TRUE; + } + if (spaced) + results->space++; + if (spaced2) + /* count of lines with em-dashes with spaces both sides */ + results->non_PG_space++; + if (unspaced) + /* count of lines with PG-type em-dashes with no spaces */ + results->PG_space++; + g_strfreev(tokens); +} + /* * first_pass: * @@ -449,6 +483,7 @@ unsigned int lastlen=0,lastblen=0; long spline=0,nspline=0; static struct first_pass_results results={0}; + struct dash_results tmp_dash_results; gchar *inword; QuoteClass qc; lines=g_strsplit(etext,"\n",0); @@ -512,7 +547,7 @@ else qc=INVALID_QUOTE; if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) && - isalpha(g_utf8_get_char(g_utf8_prev_char(s)))) + g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s)))) results.endquote_count++; } } @@ -559,18 +594,15 @@ results.htmcount+=4; /* bonus marks! */ } /* Check for spaced em-dashes */ - if (lines[j][0] && (s=strstr(g_utf8_next_char(lines[j]),"--"))) - { - results.emdash++; - if (s[-1]==CHAR_SPACE || s[2]==CHAR_SPACE) - results.space_emdash++; - if (s[-1]==CHAR_SPACE && s[2]==CHAR_SPACE) - /* count of em-dashes with spaces both sides */ - results.non_PG_space_emdash++; - if (s[-1]!=CHAR_SPACE && s[2]!=CHAR_SPACE) - /* count of PG-type em-dashes with no spaces */ - results.PG_space_emdash++; - } + memset(&tmp_dash_results,0,sizeof(tmp_dash_results)); + count_dashes(lines[j],"--",&tmp_dash_results); + count_dashes(lines[j],"—",&tmp_dash_results); + if (tmp_dash_results.base) + results.emdash.base++; + if (tmp_dash_results.non_PG_space) + results.emdash.non_PG_space++; + if (tmp_dash_results.PG_space) + results.emdash.PG_space++; for (s=lines[j];*s;) { inword=getaword(&s); @@ -700,13 +732,13 @@ * and some people insist on them whatever the guidelines say. */ warnings.dash=1; - if (results->spacedash+results->non_PG_space_emdash> - results->PG_space_emdash) + if (results->spacedash+results->emdash.non_PG_space> + results->emdash.PG_space) { warnings.dash=0; g_print(" --> There are %ld spaced dashes and em-dashes. " "Not reporting them.\n", - results->spacedash+results->non_PG_space_emdash); + results->spacedash+results->emdash.non_PG_space); } /* If more than a quarter of characters are hi-bit, bug out. */ warnings.bin=1; @@ -1120,20 +1152,53 @@ } /* + * str_emdash: + * + * Find the first em-dash, return a pointer to it and set to the + * character following the dash. + */ +char *str_emdash(const char *s,const char **next) +{ + const char *s1,*s2; + s1=strstr(s,"--"); + s2=strstr(s,"—"); + if (!s1) + { + if (s2) + *next=g_utf8_next_char(s2); + return (char *)s2; + } + else if (!s2) + { + *next=g_utf8_next_char(g_utf8_next_char(s1)); + return (char *)s1; + } + else if (s1aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE || g_utf8_get_char(next)==CHAR_SPACE) { @@ -2322,7 +2387,7 @@ qc=CHAR_IS_DQUOTE(c)?QUOTE_CLASS(c):INVALID_QUOTE; nc=g_utf8_get_char(g_utf8_next_char(s)); /* for each character in the line except 1st */ - if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) && isalpha(pc)) + if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) && g_unichar_isalpha(pc)) { if (pswit[ECHO_SWITCH]) g_print("\n%s\n",aline); @@ -2466,7 +2531,7 @@ cnt_punct++; break; } - if (g_utf8_strchr("-.:!([{?}])",-1,g_utf8_get_char(s))) + if (g_utf8_strchr("—-.:!([{?}])",-1,g_utf8_get_char(s))) break; } }