Support WINDOWS-1252 characters encoded as UTF-8
author ali <ali@juiblex.co.uk>
Thu May 30 18:33:44 2013 +0100 (2013-05-30)
changeset 72 52d4a7f926b4
parent 71 82d3cc398b54
child 73 cffa80824f8c
Support WINDOWS-1252 characters encoded as UTF-8
bookloupe/bookloupe.c
configure.ac
test/Makefile.am
test/bookloupe/Makefile.am
test/bookloupe/long-line.tst
test/bookloupe/non-ascii.tst
test/compatibility/unmarked-paragraph.tst
     1.1 --- a/bookloupe/bookloupe.c	Thu May 30 17:16:37 2013 +0100
     1.2 +++ b/bookloupe/bookloupe.c	Thu May 30 18:33:44 2013 +0100
     1.3 @@ -230,6 +230,8 @@
     1.4  void loseentities(char *);
     1.5  gboolean isroman(const char *);
     1.6  void postprocess_for_DP(char *);
     1.7 +void print_as_windows_1252(const char *string);
     1.8 +void print_as_utf_8(const char *string);
     1.9  
    1.10  GTree *qword,*qperiod;
    1.11  
    1.12 @@ -371,7 +373,10 @@
    1.13  	g_clear_error(&err);
    1.14  	exit(1);
    1.15      }
    1.16 -    utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
    1.17 +    if (g_utf8_validate(contents,len,NULL))
    1.18 +	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
    1.19 +    else
    1.20 +	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
    1.21      g_free(contents);
    1.22      lines=g_strsplit_set(utf8,"\r\n",0);
    1.23      g_free(utf8);
    1.24 @@ -396,7 +401,16 @@
    1.25      gsize len,nb;
    1.26      if (!g_file_get_contents(filename,&contents,&len,err))
    1.27  	return NULL;
    1.28 -    utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
    1.29 +    if (g_utf8_validate(contents,len,NULL))
    1.30 +    {
    1.31 +	utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
    1.32 +	g_set_print_handler(print_as_utf_8);
    1.33 +    }
    1.34 +    else
    1.35 +    {
    1.36 +	utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
    1.37 +	g_set_print_handler(print_as_windows_1252);
    1.38 +    }
    1.39      g_free(contents);
    1.40      return utf8;
    1.41  }
    1.42 @@ -2618,6 +2632,11 @@
    1.43  	fputs(string,stdout);
    1.44  }
    1.45  
    1.46 +void print_as_utf_8(const char *string)
    1.47 +{
    1.48 +    fputs(string,stdout);
    1.49 +}
    1.50 +
    1.51  /*
    1.52   * procfile:
    1.53   *
    1.54 @@ -2650,7 +2669,6 @@
    1.55  	    fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
    1.56  	exit(1);
    1.57      }
    1.58 -    g_set_print_handler(print_as_windows_1252);
    1.59      g_print("\n\nFile: %s\n\n",filename);
    1.60      first_pass_results=first_pass(etext);
    1.61      warnings=report_first_pass(first_pass_results);
     2.1 --- a/configure.ac	Thu May 30 17:16:37 2013 +0100
     2.2 +++ b/configure.ac	Thu May 30 18:33:44 2013 +0100
     2.3 @@ -11,6 +11,7 @@
     2.4  test/Makefile
     2.5  test/harness/Makefile
     2.6  test/compatibility/Makefile
     2.7 +test/bookloupe/Makefile
     2.8  doc/Makefile
     2.9  ])
    2.10  AM_INIT_AUTOMAKE([no-define 1.11])
     3.1 --- a/test/Makefile.am	Thu May 30 17:16:37 2013 +0100
     3.2 +++ b/test/Makefile.am	Thu May 30 18:33:44 2013 +0100
     3.3 @@ -1,1 +1,1 @@
     3.4 -SUBDIRS=harness compatibility .
     3.5 +SUBDIRS=harness compatibility bookloupe .
     4.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     4.2 +++ b/test/bookloupe/Makefile.am	Thu May 30 18:33:44 2013 +0100
     4.3 @@ -0,0 +1,4 @@
     4.4 +TESTS_ENVIRONMENT=BOOKLOUPE=../../bookloupe/bookloupe ../harness/loupe-test
     4.5 +TESTS=non-ascii.tst long-line.tst
     4.6 +
     4.7 +dist_pkgdata_DATA=$(TESTS)
     5.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     5.2 +++ b/test/bookloupe/long-line.tst	Thu May 30 18:33:44 2013 +0100
     5.3 @@ -0,0 +1,37 @@
     5.4 +**************** INPUT ****************
     5.5 +Lines up to seventy five columns should be acceptable and shouldn't trigger
     5.6 +any kind of warning. At seventy six columns, however, one warning is issued.
     5.7 +
     5.8 +Les élèves ont mangés leur petit déjeuner avant le commencement de l'école.
     5.9 +Les pains au chocolat et les petit brioches sont le choix le plus délicieux.
    5.10 +
    5.11 +Unfortunately, with two long lines, we need to drivel on for at least
    5.12 +twenty lines so that more than ninety per cent of the text consists of
    5.13 +non-long lines so that the warnings are not switched off in a misguided
    5.14 +attempt at being helpful.
    5.15 +
    5.16 +“I love to sail the briny deep!
    5.17 +  The briny deep for me!
    5.18 +I love to watch the sunlit waves
    5.19 +  That brighten up the sea!
    5.20 +I love to listen to the wind
    5.21 +  That fills the snowy sails!
    5.22 +I love to roam around the deck----”
    5.23 +
    5.24 +  “And eat the fishes’ tails!”
    5.25 +**************** WARNINGS ****************
    5.26 +<expected>
    5.27 +  <error>
    5.28 +    <at line="2" column="76"/>
    5.29 +    <text>Long line 76</text>
    5.30 +  </error>
    5.31 +  <error>
    5.32 +    <at line="5" column="76"/>
    5.33 +    <text>Long line 76</text>
    5.34 +  </error>
    5.35 +  <false-positive>
    5.36 +    <at line="5" column="9"/>
    5.37 +    <at line="5" column="10"/>
    5.38 +    <text>Query word au - not reporting duplicates</text>
    5.39 +  </false-positive>
    5.40 +</expected>
     6.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     6.2 +++ b/test/bookloupe/non-ascii.tst	Thu May 30 18:33:44 2013 +0100
     6.3 @@ -0,0 +1,6 @@
     6.4 +**************** INPUT ****************
     6.5 +"Hello," he said, "I wanted to bave a tête-à-tête with you."
     6.6 +**************** EXPECTED ****************
     6.7 +
     6.8 +"Hello," he said, "I wanted to bave a tête-à-tête with you."
     6.9 +    Line 1 column 31 - Query word bave - not reporting duplicates
     7.1 --- a/test/compatibility/unmarked-paragraph.tst	Thu May 30 17:16:37 2013 +0100
     7.2 +++ b/test/compatibility/unmarked-paragraph.tst	Thu May 30 18:33:44 2013 +0100
     7.3 @@ -2,7 +2,7 @@
     7.4  "Hurrah! that's the way to do it!" "Now, then, Tom, see if you can't bring
     7.5  Dick home!"
     7.6  
     7.7 -"Give him a swift one, Frank! Don’t let him hit it!" cried Sam Rover,
     7.8 +"Give him a swift one, Frank! Don't let him hit it!" cried Sam Rover,
     7.9  merrily.
    7.10  **************** WARNINGS ****************
    7.11  <expected>