Detect UTF-8 input instead of always assuming WINDOWS-1252.

File contents that pass g_utf8_validate() are normalized with
G_NORMALIZE_DEFAULT_COMPOSE; anything else is still converted from
WINDOWS-1252 to UTF-8 with g_convert().  The loader that returns the
converted text also installs the matching print handler
(print_as_utf_8 or print_as_windows_1252), which replaces the
unconditional g_set_print_handler(print_as_windows_1252) call that used
to run just before the "File:" banner is printed.

NOTE(review): this patch was recovered from a copy in which all line
breaks had been removed.  The hunk line counts agree with the @@ headers,
but leading whitespace and the position of blank context lines were
reconstructed -- apply with "patch -l" or confirm against the repository.

diff -r 82d3cc398b54 -r 52d4a7f926b4 bookloupe/bookloupe.c
--- a/bookloupe/bookloupe.c	Thu May 30 17:16:37 2013 +0100
+++ b/bookloupe/bookloupe.c	Thu May 30 18:33:44 2013 +0100
@@ -230,6 +230,8 @@
 void loseentities(char *);
 gboolean isroman(const char *);
 void postprocess_for_DP(char *);
+void print_as_windows_1252(const char *string);
+void print_as_utf_8(const char *string);
 
 GTree *qword,*qperiod;
 
@@ -371,7 +373,10 @@
         g_clear_error(&err);
         exit(1);
     }
-    utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
+    if (g_utf8_validate(contents,len,NULL))
+        utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
+    else
+        utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
     g_free(contents);
     lines=g_strsplit_set(utf8,"\r\n",0);
     g_free(utf8);
@@ -396,7 +401,16 @@
     gsize len,nb;
     if (!g_file_get_contents(filename,&contents,&len,err))
         return NULL;
-    utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
+    if (g_utf8_validate(contents,len,NULL))
+    {
+        utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE);
+        g_set_print_handler(print_as_utf_8);
+    }
+    else
+    {
+        utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL);
+        g_set_print_handler(print_as_windows_1252);
+    }
     g_free(contents);
     return utf8;
 }
@@ -2618,6 +2632,11 @@
     fputs(string,stdout);
 }
 
+void print_as_utf_8(const char *string)
+{
+    fputs(string,stdout);
+}
+
 /*
  * procfile:
  *
@@ -2650,7 +2669,6 @@
         fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message);
         exit(1);
     }
-    g_set_print_handler(print_as_windows_1252);
     g_print("\n\nFile: %s\n\n",filename);
     first_pass_results=first_pass(etext);
     warnings=report_first_pass(first_pass_results);