bookloupe/bookloupe.c
changeset 214 43c73b36e936
parent 209 70cc629ec1e0
     1.1 --- a/bookloupe/bookloupe.c	Wed Oct 16 22:51:29 2013 +0100
     1.2 +++ b/bookloupe/bookloupe.c	Sat Nov 02 09:03:54 2013 +0000
     1.3 @@ -250,7 +250,7 @@
     1.4  gchar *running_from;
     1.5  
     1.6  gboolean mixdigit(const char *);
     1.7 -gchar *getaword(const char **);
     1.8 +gchar *getaword(const char *,const char **);
     1.9  char *flgets(char **,long,int);
    1.10  void postprocess_for_HTML(char *);
    1.11  char *linehasmarkup(char *);
    1.12 @@ -977,7 +977,7 @@
    1.13  	    results.emdash.PG_space++;
    1.14  	for (s=lines[j];*s;)
    1.15  	{
    1.16 -	    inword=getaword(&s);
    1.17 +	    inword=getaword(NULL,&s);
    1.18  	    if (!strcmp(inword,"hij") || !strcmp(inword,"niet")) 
    1.19  		results.Dutchcount++;
    1.20  	    if (!strcmp(inword,"dans") || !strcmp(inword,"avec")) 
    1.21 @@ -2002,7 +2002,7 @@
    1.22  	for (s=aline;*s;)
    1.23  	{
    1.24  	    wordstart=s;
    1.25 -	    t=getaword(&s);
    1.26 +	    t=getaword(NULL,&s);
    1.27  	    if (!*t)
    1.28  	    {
    1.29  		g_free(t);
    1.30 @@ -2052,8 +2052,9 @@
    1.31  /*
    1.32   * check_for_typos:
    1.33   *
    1.34 - * Check for commonly mistyped words,
    1.35 - * and digits like 0 for O in a word.
    1.36 + * Check for commonly mistyped words, and digits like 0 for O in a word.
    1.37 + * Note that somewhat confusingly, this is also where we call getaword()
    1.38 + * with a non-NULL line so that it will issue warnings.
    1.39   */
    1.40  void check_for_typos(const char *aline,struct warnings *warnings)
    1.41  {
    1.42 @@ -2069,7 +2070,7 @@
    1.43      for (s=aline;*s;)
    1.44      {
    1.45  	wordstart=s;
    1.46 -	inword=getaword(&s);
    1.47 +	inword=getaword(aline,&s);
    1.48  	if (!*inword)
    1.49  	{
    1.50  	    g_free(inword);
    1.51 @@ -2318,7 +2319,7 @@
    1.52  	     * If there are letters on both sides of it or
    1.53  	     * if it's strict punctuation followed by an alpha.
    1.54  	     */
    1.55 -	    if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
    1.56 +	    if (c!='_' && g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) ||
    1.57  	      g_utf8_strchr("?!,;:",-1,c)))
    1.58  	    {
    1.59  		if (c=='.')
    1.60 @@ -3419,14 +3420,18 @@
    1.61   * A word is defined as one English word unit--or at least that's the aim.
    1.62   * "ptr" is advanced to the position in the line where we will start
    1.63   * looking for the next word.
    1.64 + * If line is non-NULL, then it will be used to derive the column numbers for
    1.65 + * any warnings issued. If line is NULL, then warnings will be suppressed.
    1.66   *
    1.67   * Returns: A newly-allocated string.
    1.68   */
    1.69 -gchar *getaword(const char **ptr)
    1.70 +gchar *getaword(const char *line,const char **ptr)
    1.71  {
    1.72 -    const char *s,*t;
    1.73 +    const char *s,*t,*t2;
    1.74      GString *word;
    1.75      gunichar c,pc;
    1.76 +    int adjust;
    1.77 +    gboolean initial_underlining=FALSE;
    1.78      word=g_string_new(NULL);
    1.79      for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) &&
    1.80        !g_unichar_isalpha(g_utf8_get_char(*ptr)) &&
    1.81 @@ -3448,6 +3453,7 @@
    1.82  	    else
    1.83  		g_string_truncate(word,0);
    1.84  	}
    1.85 +	initial_underlining=g_utf8_get_char(*ptr)=='_';
    1.86      }
    1.87      /*
    1.88       * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35.
    1.89 @@ -3477,10 +3483,84 @@
    1.90      }
    1.91      /* we didn't find a punctuated number - do the regular getword thing */
    1.92      g_string_truncate(word,0);
    1.93 -    c=g_utf8_get_char(*ptr);
    1.94 -    for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c);
    1.95 -      *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr))
    1.96 +    s=*ptr;
    1.97 +    c=g_utf8_get_char(s);
    1.98 +    for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || c=='_' ||
    1.99 +      CHAR_IS_APOSTROPHE(c); s=g_utf8_next_char(s),c=g_utf8_get_char(s))
   1.100  	g_string_append_unichar(word,c);
   1.101 +    if (initial_underlining && word->str[word->len-1]=='_')
   1.102 +    {
   1.103 +	/* _Simple_ or _Old-school_underlining_ */
   1.104 +	t=strchr(*ptr,'_');
   1.105 +	g_string_truncate(word,t-*ptr);
   1.106 +	if (s-t>1)
   1.107 +	    *ptr=t;	/* _Old-school_underlining_ */
   1.108 +	else
   1.109 +	    *ptr=s;	/* _Simple_ */
   1.110 +    }
   1.111 +    else if (initial_underlining || (t=strchr(word->str,'_')))
   1.112 +    {
   1.113 +	/* Part_ial_ underlining */
   1.114 +	adjust=0;
   1.115 +	if (initial_underlining)
   1.116 +	{
   1.117 +	    t2=strchr(word->str,'_');
   1.118 +	    if (t2)
   1.119 +	    {
   1.120 +		g_string_erase(word,t2-word->str,1);
   1.121 +		adjust++;
   1.122 +	    }
   1.123 +	    else
   1.124 +	    {
   1.125 +		if (line)
   1.126 +		{
   1.127 +		    if (pswit[ECHO_SWITCH])
   1.128 +			g_print("\n%s\n",line);
   1.129 +		    if (!pswit[OVERVIEW_SWITCH])
   1.130 +			g_print("    Line %ld column %ld - "
   1.131 +			  "Missing space or underscore?\n",linecnt,
   1.132 +			  g_utf8_pointer_to_offset(line,*ptr));
   1.133 +		    else
   1.134 +			cnt_punct++;
   1.135 +		}
   1.136 +		*ptr=s;
   1.137 +		return g_string_free(word,FALSE);
   1.138 +	    }
   1.139 +	}
   1.140 +	while ((t=strchr(word->str,'_')))
   1.141 +	{
   1.142 +	    t2=strchr(t+1,'_');
   1.143 +	    if (t2)
   1.144 +	    {
   1.145 +		g_string_erase(word,t-word->str,1);
   1.146 +		t2--;
   1.147 +		g_string_erase(word,t2-word->str,1);
   1.148 +		adjust+=2;
   1.149 +	    }
   1.150 +	    else
   1.151 +	    {
   1.152 +		g_string_truncate(word,t-word->str);
   1.153 +		adjust+=g_utf8_pointer_to_offset(word->str,t);
   1.154 +		*ptr=g_utf8_offset_to_pointer(*ptr,adjust);
   1.155 +		if (line)
   1.156 +		{
   1.157 +		    if (pswit[ECHO_SWITCH])
   1.158 +			g_print("\n%s\n",line);
   1.159 +		    if (!pswit[OVERVIEW_SWITCH])
   1.160 +			g_print("    Line %ld column %ld - "
   1.161 +			  "Missing space or underscore?\n",linecnt,
   1.162 +			  g_utf8_pointer_to_offset(line,*ptr)+1);
   1.163 +		    else
   1.164 +			cnt_punct++;
   1.165 +		}
   1.166 +		return g_string_free(word,FALSE);
   1.167 +	    }
   1.168 +	}
   1.169 +	*ptr=s;
   1.170 +    }
   1.171 +    else
   1.172 +	/* No underlining */
   1.173 +	*ptr=s;
   1.174      return g_string_free(word,FALSE);
   1.175  }
   1.176