# HG changeset patch # User ali # Date 1380754718 -3600 # Node ID d7a97f077f9eafc70912d4cd20f4e66d716ccebd # Parent 5926b21bcb38d392161a0ae5d8c0c02771a5c974# Parent 5e27fa988c5cd69407ce14d9338a9a393070f0b1 Merge bugs #13+14: Character sets / Add a configuration file diff -r 5926b21bcb38 -r d7a97f077f9e Makefile.am --- a/Makefile.am Wed Oct 02 23:53:32 2013 +0100 +++ b/Makefile.am Wed Oct 02 23:58:38 2013 +0100 @@ -1,1 +1,3 @@ SUBDIRS=bl bookloupe test doc + +dist_pkgdata_DATA=sample.ini diff -r 5926b21bcb38 -r d7a97f077f9e bookloupe/bookloupe.c --- a/bookloupe/bookloupe.c Wed Oct 02 23:53:32 2013 +0100 +++ b/bookloupe/bookloupe.c Wed Oct 02 23:58:38 2013 +0100 @@ -32,6 +32,9 @@ #include "pending.h" #include "HTMLentities.h" +gchar *charset; /* Or NULL for auto (ISO_8859-1/ASCII or UNICODE) */ +GIConv charset_validator=(GIConv)-1; + gchar *prevline; /* Common typos. */ @@ -127,36 +130,101 @@ }; gboolean pswit[SWITNO]; /* program switches */ +gchar *opt_charset; + +gboolean typo_compat,paranoid_compat; static GOptionEntry options[]={ { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH, "Ignore DP-specific markup", NULL }, - { "noecho", 'e', 0, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH, + { "no-dp", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE, + G_OPTION_ARG_NONE, pswit+DP_SWITCH, + "Don't ignore DP-specific markup", NULL }, + { "echo", 0, G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH, + "Echo queried line", NULL }, + { "no-echo", 'e', G_OPTION_FLAG_REVERSE, + G_OPTION_ARG_NONE, pswit+ECHO_SWITCH, "Don't echo queried line", NULL }, { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH, "Check single quotes", NULL }, - { "typo", 't', 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH, + { "no-squote", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE, + G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH, + "Don't check single quotes", NULL }, + { "typo", 0, 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH, "Check common typos", NULL }, + { "no-typo", 0, 
G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE, + G_OPTION_ARG_NONE, pswit+TYPO_SWITCH, + "Don't check common typos", NULL }, { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH, "Require closure of quotes on every paragraph", NULL }, - { "relaxed", 'x', 0, G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH, + { "no-qpara", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE, + G_OPTION_ARG_NONE, pswit+QPARA_SWITCH, + "Don't require closure of quotes on every paragraph", NULL }, + { "paranoid", 0, G_OPTION_FLAG_HIDDEN, + G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH, + "Enable paranoid querying of everything", NULL }, + { "no-paranoid", 0, G_OPTION_FLAG_REVERSE, + G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH, "Disable paranoid querying of everything", NULL }, - { "line-end", 'l', 0, G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH, - "Disable line end checking", NULL }, + { "line-end", 0, G_OPTION_FLAG_HIDDEN, + G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH, + "Enable line end checking", NULL }, + { "no-line-end", 'l', G_OPTION_FLAG_REVERSE, + G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH, + "Diable line end checking", NULL }, { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH, "Overview: just show counts", NULL }, + { "no-overview", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE, + G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH, + "Show individual warnings", NULL }, { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH, "Output errors to stdout instead of stderr", NULL }, + { "no-stdout", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE, + G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH, + "Output errors to stderr instead of stdout", NULL }, { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH, "Echo header fields", NULL }, + { "no-header", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE, + G_OPTION_ARG_NONE, pswit+HEADER_SWITCH, + "Don't echo header fields", NULL }, { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH, "Ignore markup in < >", NULL }, + { "no-markup", 0, 
G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE, + G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH, + "No special handling for markup in < >", NULL }, { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH, "Use file of user-defined typos", NULL }, + { "no-usertypo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE, + G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH, + "Ignore file of user-defined typos", NULL }, + { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH, + "Verbose - list everything", NULL }, + { "no-verbose", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE, + G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH, + "Switch off verbose mode", NULL }, + { "charset", 0, 0, G_OPTION_ARG_STRING, &opt_charset, + "Set of characters valid for this ebook", "NAME" }, + { NULL } +}; + +/* + * Options relating to configuration which make no sense from inside + * a configuration file. + */ + +static GOptionEntry config_options[]={ { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH, "Defaults for use on www upload", NULL }, - { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH, - "Verbose - list everything", NULL }, + { "dump-config", 0, 0, G_OPTION_ARG_NONE, pswit+DUMP_CONFIG_SWITCH, + "Dump current config settings", NULL }, + { NULL } +}; + +static GOptionEntry compatibility_options[]={ + { "toggle-typo", 't', 0, G_OPTION_ARG_NONE, &typo_compat, + "Toggle checking for common typos", NULL }, + { "toggle-relaxed", 'x', 0, G_OPTION_ARG_NONE, &paranoid_compat, + "Toggle both paranoid mode and common typos", NULL }, { NULL } }; @@ -200,31 +268,275 @@ UINT saved_cp; #endif +gboolean set_charset(const char *name,GError **err) +{ + /* The various UNICODE encodings all share the same character set. 
*/ + const char *unicode_aliases[]={ "UCS-2", "UCS-2BE", "UCS-2LE", "UCS-4", + "UCS-4BE", "UCS-4LE", "UCS2", "UCS4", "UNICODE", "UNICODEBIG", + "UNICODELITTLE", "UTF-7", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", + "UTF-32", "UTF-32BE", "UTF-32LE", "UTF7", "UTF8", "UTF16", "UTF16BE", + "UTF16LE", "UTF32", "UTF32BE", "UTF32LE" }; + int i; + if (charset) + g_free(charset); + if (charset_validator!=(GIConv)-1) + g_iconv_close(charset_validator); + if (!name || !g_strcasecmp(name,"auto")) + { + charset=NULL; + charset_validator=(GIConv)-1; + return TRUE; + } + else + charset=g_strdup(name); + for(i=0;imessage); + exit(1); + } + g_clear_error(&err); + g_free(path); + path=NULL; + } + if (!search_dirs[i]) + { + g_key_file_free(kf); + kf=NULL; + } + g_strfreev(search_dirs); + if (full_path && kf) + *full_path=path; + else + g_free(path); + return kf; +} + +void parse_config_file(void) +{ + int i,j; + gchar *path,*s; + gchar **keys; + gboolean sw; + GError *err=NULL; + config=read_config_file(&path); + if (config) + keys=g_key_file_get_keys(config,"options",NULL,NULL); + else + keys=NULL; + if (keys) + { + for(i=0;keys[i];i++) + { + for(j=0;options[j].long_name;j++) + { + if (g_str_has_prefix(options[j].long_name,"no-")) + continue; + else if (!strcmp(keys[i],options[j].long_name)) + { + if (options[j].arg==G_OPTION_ARG_NONE) + { + sw=g_key_file_get_boolean(config,"options",keys[i], + &err); + if (err) + { + g_printerr("Bookloupe: %s: options.%s: %s\n", + path,keys[i],err->message); + g_clear_error(&err); + } + else + { + if (options[j].flags&G_OPTION_FLAG_REVERSE) + sw=!sw; + *(gboolean *)options[j].arg_data=sw; + } + break; + } + else if (options[j].arg==G_OPTION_ARG_STRING) + { + s=g_key_file_get_string(config,"options",keys[i], + &err); + if (err) + { + g_printerr("Bookloupe: %s: options.%s: %s\n", + path,keys[i],err->message); + g_clear_error(&err); + } + else + { + g_free(*(gchar **)options[j].arg_data); + if (!g_strcmp0(s,"auto")) + { + *(gchar 
**)options[j].arg_data=NULL; + g_free(s); + } + else + *(gchar **)options[j].arg_data=s; + } + break; + } + else + g_assert_not_reached(); + } + } + if (!options[j].long_name) + g_printerr("Bookloupe: %s: Unknown option \"%s\" ignored\n", + path,keys[i]); + } + g_strfreev(keys); + } + if (config) + g_free(path); +} + void parse_options(int *argc,char ***argv) { GError *err=NULL; GOptionContext *context; + GOptionGroup *compatibility; context=g_option_context_new( - "file - looks for errors in Project Gutenberg(TM) etexts"); + "file - look for errors in Project Gutenberg(TM) etexts"); g_option_context_add_main_entries(context,options,NULL); + g_option_context_add_main_entries(context,config_options,NULL); + compatibility=g_option_group_new("compatibility", + "Options for Compatibility with Gutcheck:", + "Show compatibility options",NULL,NULL); + g_option_group_add_entries(compatibility,compatibility_options); + g_option_context_add_group(context,compatibility); + g_option_context_set_description(context, + "For simplicity, only the switch options which reverse the\n" + "default configuration are listed. 
In most cases, both vanilla\n" + "and \"no-\" prefixed versions are available for use."); if (!g_option_context_parse(context,argc,argv,&err)) { g_printerr("Bookloupe: %s\n",err->message); g_printerr("Use \"%s --help\" for help\n",(*argv)[0]); exit(1); } - /* Paranoid checking is turned OFF, not on, by its switch */ - pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH]; - if (pswit[PARANOID_SWITCH]) - /* if running in paranoid mode, typo checks default to enabled */ + if (typo_compat) pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH]; - /* Line-end checking is turned OFF, not on, by its switch */ - pswit[LINE_END_SWITCH]=!pswit[LINE_END_SWITCH]; - /* Echoing is turned OFF, not on, by its switch */ - pswit[ECHO_SWITCH]=!pswit[ECHO_SWITCH]; - if (pswit[OVERVIEW_SWITCH]) - /* just print summary; don't echo */ - pswit[ECHO_SWITCH]=FALSE; + if (paranoid_compat) + { + pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH]; + pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH]; + } /* * Web uploads - for the moment, this is really just a placeholder * until we decide what processing we really want to do on web uploads @@ -246,6 +558,21 @@ pswit[USERTYPO_SWITCH]=FALSE; pswit[DP_SWITCH]=FALSE; } + if (opt_charset && !set_charset(opt_charset,&err)) + { + g_printerr("%s\n",err->message); + exit(1); + } + if (pswit[DUMP_CONFIG_SWITCH]) + { + dump_config(); + exit(0); + } + g_free(opt_charset); + opt_charset=NULL; + if (pswit[OVERVIEW_SWITCH]) + /* just print summary; don't echo */ + pswit[ECHO_SWITCH]=FALSE; if (*argc<2) { proghelp(context); @@ -305,7 +632,11 @@ exit(1); } if (g_utf8_validate(contents,len,NULL)) + { utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE); + if (!charset) + (void)set_charset("UNICODE",NULL); + } else utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL); g_free(contents); @@ -388,6 +719,15 @@ saved_cp=GetConsoleOutputCP(); #endif running_from=g_path_get_dirname(argv[0]); + /* Paranoid checking is turned OFF, not on, by its switch */ + 
pswit[PARANOID_SWITCH]=TRUE; + /* if running in paranoid mode, typo checks default to enabled */ + pswit[TYPO_SWITCH]=TRUE; + /* Line-end checking is turned OFF, not on, by its switch */ + pswit[LINE_END_SWITCH]=TRUE; + /* Echoing is turned OFF, not on, by its switch */ + pswit[ECHO_SWITCH]=TRUE; + parse_config_file(); parse_options(&argc,&argv); if (pswit[USERTYPO_SWITCH]) read_user_scannos(); @@ -428,6 +768,9 @@ g_free(running_from); if (usertypo) g_tree_unref(usertypo); + set_charset(NULL,NULL); + if (config) + g_key_file_free(config); return 0; } @@ -708,25 +1051,32 @@ "Not reporting them.\n", results->spacedash+results->non_PG_space_emdash); } - /* If more than a quarter of characters are hi-bit, bug out. */ - warnings.bin=1; - if (results->binlen*4>results->totlen) + if (charset) + warnings.bin=0; + else { - g_print(" --> This file does not appear to be ASCII. " - "Terminating. Best of luck with it!\n"); - exit(1); - } - if (results->alphalen*4<results->totlen) - { - g_print(" --> This file does not appear to be text. " - "Terminating. Best of luck with it!\n"); - exit(1); - } - if (results->binlen*100>results->totlen || results->binlen>100) - { - g_print(" --> There are a lot of foreign letters here. " - "Not reporting them.\n"); - warnings.bin=0; + /* Charset ISO_8859-1/ASCII checks for compatibility with gutcheck */ + warnings.bin=1; + /* If more than a quarter of characters are hi-bit, bug out. */ + if (results->binlen*4>results->totlen) + { + g_print(" --> This file does not appear to be ASCII. " + "Terminating. Best of luck with it!\n"); + exit(1); + } + if (results->alphalen*4<results->totlen) + { + g_print(" --> This file does not appear to be text. " + "Terminating. Best of luck with it!\n"); + exit(1); + } + if (results->binlen*100>results->totlen || results->binlen>100) + { + g_print(" --> There are a lot of foreign letters here. 
" + "Not reporting them.\n"); + if (!pswit[VERBOSE_SWITCH]) + warnings.bin=0; + } } warnings.isDutch=FALSE; if (results->Dutchcount>50) @@ -754,7 +1104,6 @@ g_print("\n"); if (pswit[VERBOSE_SWITCH]) { - warnings.bin=1; warnings.shortline=1; warnings.dotcomma=1; warnings.longline=1; @@ -949,14 +1298,17 @@ gboolean isemptyline) { /* Don't repeat multiple warnings on one line. */ - gboolean eNon_A=FALSE,eTab=FALSE,eTilde=FALSE; + gboolean eInvalidChar=FALSE,eTab=FALSE,eTilde=FALSE; gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE; const char *s; gunichar c; + gsize nb; + gchar *t; for (s=aline;*s;s=g_utf8_next_char(s)) { c=g_utf8_get_char(s); - if (!eNon_A && (c127)) + if (warnings->bin && !eInvalidChar && + (c127)) { if (pswit[ECHO_SWITCH]) g_print("\n%s\n",aline); @@ -971,7 +1323,57 @@ linecnt,g_utf8_pointer_to_offset(aline,s)+1,c); else cnt_bin++; - eNon_A=TRUE; + eInvalidChar=TRUE; + } + if (!eInvalidChar && charset) + { + if (charset_validator==(GIConv)-1) + { + if (!g_unichar_isdefined(c)) + { + if (pswit[ECHO_SWITCH]) + g_print("\n%s\n",aline); + if (!pswit[OVERVIEW_SWITCH]) + g_print(" Line %ld column %ld - Unassigned UNICODE " + "code point U+%04" G_GINT32_MODIFIER "X\n", + linecnt,g_utf8_pointer_to_offset(aline,s)+1,c); + else + cnt_bin++; + eInvalidChar=TRUE; + } + else if (c>=0xE000 && c<=0xF8FF || c>=0xF0000 && c<=0xFFFFD || + c>=100000 && c<=0x10FFFD) + { + if (pswit[ECHO_SWITCH]) + g_print("\n%s\n",aline); + if (!pswit[OVERVIEW_SWITCH]) + g_print(" Line %ld column %ld - Private Use " + "character U+%04" G_GINT32_MODIFIER "X\n", + linecnt,g_utf8_pointer_to_offset(aline,s)+1,c); + else + cnt_bin++; + eInvalidChar=TRUE; + } + } + else + { + t=g_convert_with_iconv(s,g_utf8_next_char(s)-s, + charset_validator,NULL,&nb,NULL); + if (t) + g_free(t); + else + { + if (pswit[ECHO_SWITCH]) + g_print("\n%s\n",aline); + if (!pswit[OVERVIEW_SWITCH]) + g_print(" Line %ld column %ld - Non-%s " + "character %u\n",linecnt, + g_utf8_pointer_to_offset(aline,s)+1,charset,c); + 
else + cnt_bin++; + eInvalidChar=TRUE; + } + } } if (!eTab && c==CHAR_TAB) { @@ -2626,8 +3028,7 @@ if (s>=aline && g_utf8_get_char(s)=='-') enddash=TRUE; check_for_control_characters(aline); - if (warnings->bin) - check_for_odd_characters(aline,warnings,isemptyline); + check_for_odd_characters(aline,warnings,isemptyline); if (warnings->longline) check_for_long_line(aline); if (warnings->shortline) diff -r 5926b21bcb38 -r d7a97f077f9e bookloupe/bookloupe.h --- a/bookloupe/bookloupe.h Wed Oct 02 23:53:32 2013 +0100 +++ b/bookloupe/bookloupe.h Wed Oct 02 23:58:38 2013 +0100 @@ -55,6 +55,7 @@ MARKUP_SWITCH, USERTYPO_SWITCH, DP_SWITCH, + DUMP_CONFIG_SWITCH, SWITNO }; diff -r 5926b21bcb38 -r d7a97f077f9e sample.ini --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sample.ini Wed Oct 02 23:58:38 2013 +0100 @@ -0,0 +1,33 @@ +# Default configuration for bookloupe + +[options] +# Ignore DP-specific markup +dp=false +# Echo queried line +echo=true +# Check single quotes +squote=false +# Check common typos +typo=true +# Require closure of quotes on every paragraph +qpara=false +# Enable paranoid querying of everything +paranoid=true +# Enable line end checking +line-end=true +# Overview: just show counts +overview=false +# Output errors to stdout instead of stderr +stdout=false +# Echo header fields +header=false +# Ignore markup in < > +markup=false +# Use file of user-defined typos +usertypo=false +# Defaults for use on www upload +web=false +# Verbose - list everything +verbose=false +# Set of characters valid for this ebook +charset=auto diff -r 5926b21bcb38 -r d7a97f077f9e test/bookloupe/Makefile.am --- a/test/bookloupe/Makefile.am Wed Oct 02 23:53:32 2013 +0100 +++ b/test/bookloupe/Makefile.am Wed Oct 02 23:58:38 2013 +0100 @@ -1,5 +1,7 @@ TESTS_ENVIRONMENT=BOOKLOUPE=../../bookloupe/bookloupe ../harness/loupe-test TESTS=non-ascii.tst long-line.tst curved-single-quotes.tst curved-quotes.tst \ - runfox-quotes.tst curved-genitives.tst multi-line-illustration.tst + 
runfox-quotes.tst curved-genitives.tst multi-line-illustration.tst \ + config-internal.tst config-default.tst config-user.tst \ + config-override.tst charset-cp1252.tst charset-latin1.tst dist_pkgdata_DATA=$(TESTS) diff -r 5926b21bcb38 -r d7a97f077f9e test/bookloupe/charset-cp1252.tst --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/bookloupe/charset-cp1252.tst Wed Oct 02 23:58:38 2013 +0100 @@ -0,0 +1,16 @@ +**************** OPTIONS **************** +--charset=WINDOWS-1252 +**************** ENCODING **************** +WINDOWS-1252 +**************** INPUT **************** +Unless binary mode is engaged, gutcheck will warn about a number of +characters defined in Windows-1252. Bookloupe provides support for +disabling such checks without concern as to the file size and how +many characters with the eighth bit set it may contain by allowing a +character set to be declared. With the character set declared as +WINDOWS-1252, all characters defined in Windows-1252 shoud be acceptable +and no warnings should be issued. + +We test for this by including just one such character—the em dash. + +**************** EXPECTED **************** diff -r 5926b21bcb38 -r d7a97f077f9e test/bookloupe/charset-latin1.tst --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/bookloupe/charset-latin1.tst Wed Oct 02 23:58:38 2013 +0100 @@ -0,0 +1,58 @@ +**************** OPTIONS **************** +--charset=ISO-8859-1 +**************** ENCODING **************** +WINDOWS-1252 +**************** INPUT **************** +Where the character set declared is narrower than the character set +implied by the encoding as in this case (Windows-1252 is a superset +of the first latin alphabet defined in ECMA 94), then bookloupe should +warn about characters that are not in the declared character set but +should still recognise them and otherwise handle them as it would +normally do. 
We use the curved apostrophe as a test for this since +if bookloupe didn't recognise it then it would query the orphaned +letters from the genitives and abbreviations. + +John Hendricks was bear-leading at the time. He had originally studied +for Holy Orders, but had abandoned the Church later for private reasons +connected with his faith, and had taken to teaching and tutoring +instead. He was an honest, upstanding fellow of five-and-thirty, +incorruptible, intelligent in a simple, straightforward way. He played +games with his head, more than most Englishmen do, but he went through +life without much calculation. He had qualities that made boys like +and respect him; he won their confidence. Poor, proud, ambitious, +he realised that fate offered him a chance when the Secretary of +State for Scotland asked him if he would give up his other pupils +for a year and take his son, Lord Ernie, round the world upon an +educational trip that might make a man of him. For Lord Ernie was the +only son, and the Marquess’s influence was naturally great. To have +deposited a regenerated Lord Ernie at the castle gates might have +guaranteed Hendricks’ future. After leaving Eton prematurely the lad +had come under Hendricks’ charge for a time, and with such excellent +results--‘I’d simply swear by that chap, you know,’ the boy used +to say--that his father, considerably impressed, and rather as a +last resort, had made this proposition. And Hendricks, without much +calculation, had accepted it. He liked ‘Bindy’ for himself. It was +in his heart to ‘make a man of him,’ if possible. They had now been +round the world together and had come up from Brindisi to the Italian +Lakes, and so into Switzerland. It was middle October. With a week or +two to spare they were making leisurely for the ancestral halls in +Aberdeenshire. +**************** EXPECTED **************** + +only son, and the Marquess’s influence was naturally great. 
To have + Line 22 column 27 - Non-ISO-8859-1 character 8217 + +guaranteed Hendricks’ future. After leaving Eton prematurely the lad + Line 24 column 21 - Non-ISO-8859-1 character 8217 + +had come under Hendricks’ charge for a time, and with such excellent + Line 25 column 25 - Non-ISO-8859-1 character 8217 + +results--‘I’d simply swear by that chap, you know,’ the boy used + Line 26 column 10 - Non-ISO-8859-1 character 8216 + +calculation, had accepted it. He liked ‘Bindy’ for himself. It was + Line 29 column 40 - Non-ISO-8859-1 character 8216 + +in his heart to ‘make a man of him,’ if possible. They had now been + Line 30 column 17 - Non-ISO-8859-1 character 8216 diff -r 5926b21bcb38 -r d7a97f077f9e test/bookloupe/config-default.tst --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/bookloupe/config-default.tst Wed Oct 02 23:58:38 2013 +0100 @@ -0,0 +1,66 @@ +**************** OPTIONS **************** +--dump-config +**************** INPUT(bookloupe.ini) **************** +# Default configuration for bookloupe + +[options] +# Ignore DP-specific markup +dp=false +# Echo queried line +echo=true +# Check single quotes +squote=false +# Check common typos +typo=true +# Require closure of quotes on every paragraph +qpara=false +# Enable paranoid querying of everything +paranoid=true +# Enable line end checking +line-end=true +# Overview: just show counts +overview=false +# Output errors to stdout instead of stderr +stdout=false +# Echo header fields +header=false +# Ignore markup in < > +markup=false +# Use file of user-defined typos +usertypo=false +# Verbose - list everything +verbose=false +# Set of characters valid for this ebook +charset=auto +**************** EXPECTED(stdout) **************** +# Default configuration for bookloupe + +[options] +# Ignore DP-specific markup +dp=false +# Echo queried line +echo=true +# Check single quotes +squote=false +# Check common typos +typo=true +# Require closure of quotes on every paragraph +qpara=false +# Enable paranoid 
querying of everything +paranoid=true +# Enable line end checking +line-end=true +# Overview: just show counts +overview=false +# Output errors to stdout instead of stderr +stdout=false +# Echo header fields +header=false +# Ignore markup in < > +markup=false +# Use file of user-defined typos +usertypo=false +# Verbose - list everything +verbose=false +# Set of characters valid for this ebook +charset=auto diff -r 5926b21bcb38 -r d7a97f077f9e test/bookloupe/config-internal.tst --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/bookloupe/config-internal.tst Wed Oct 02 23:58:38 2013 +0100 @@ -0,0 +1,34 @@ +**************** OPTIONS **************** +--dump-config +**************** EXPECTED(stdout) **************** +# Default configuration for bookloupe + +[options] +# Ignore DP-specific markup +dp=false +# Echo queried line +echo=true +# Check single quotes +squote=false +# Check common typos +typo=true +# Require closure of quotes on every paragraph +qpara=false +# Enable paranoid querying of everything +paranoid=true +# Enable line end checking +line-end=true +# Overview: just show counts +overview=false +# Output errors to stdout instead of stderr +stdout=false +# Echo header fields +header=false +# Ignore markup in < > +markup=false +# Use file of user-defined typos +usertypo=false +# Verbose - list everything +verbose=false +# Set of characters valid for this ebook +charset=auto diff -r 5926b21bcb38 -r d7a97f077f9e test/bookloupe/config-override.tst --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/bookloupe/config-override.tst Wed Oct 02 23:58:38 2013 +0100 @@ -0,0 +1,68 @@ +**************** OPTIONS **************** +--usertypo +--charset=auto +--dump-config +**************** INPUT(bookloupe.ini) **************** +# Relaxed configuration for bookloupe + +[options] +# Ignore DP-specific markup +dp=false +# Echo queried line +echo=true +# Check single quotes +squote=false +# Check common typos +typo=true +# Require closure of quotes on every paragraph 
+qpara=false +# Enable paranoid querying of everything +paranoid=false +# Enable line end checking +line-end=true +# Overview: just show counts +overview=false +# Output errors to stdout instead of stderr +stdout=false +# Echo header fields +header=false +# Ignore markup in < > +markup=false +# Use file of user-defined typos +usertypo=false +# Verbose - list everything +verbose=false +# Set of characters valid for this ebook +charset=UNICODE +**************** EXPECTED(stdout) **************** +# Relaxed configuration for bookloupe + +[options] +# Ignore DP-specific markup +dp=false +# Echo queried line +echo=true +# Check single quotes +squote=false +# Check common typos +typo=true +# Require closure of quotes on every paragraph +qpara=false +# Enable paranoid querying of everything +paranoid=false +# Enable line end checking +line-end=true +# Overview: just show counts +overview=false +# Output errors to stdout instead of stderr +stdout=false +# Echo header fields +header=false +# Ignore markup in < > +markup=false +# Use file of user-defined typos +usertypo=true +# Verbose - list everything +verbose=false +# Set of characters valid for this ebook +charset=auto diff -r 5926b21bcb38 -r d7a97f077f9e test/bookloupe/config-user.tst --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/bookloupe/config-user.tst Wed Oct 02 23:58:38 2013 +0100 @@ -0,0 +1,76 @@ +**************** OPTIONS **************** +--dump-config +**************** INPUT(bookloupe.ini) **************** +# Mary Contrary's configuration for bookloupe + +# Bookloupe will ignore this group, but it's nice to have. +[other] +# Look at me! +name="Mary Contrary" + +[options] +# Ignore DP-specific markup - sounds useful +dp=true +# Echo queried line - what's the point of that? +echo=false +# Check single quotes - yup +squote=true +# Check common typos - waste of time +typo=false +# Require closure of quotes on every paragraph - okay +qpara=true +# Enable paranoid querying of everything - Huh? 
+paranoid=false +# Enable line end checking - pointless +line-end=false +# Overview: just show counts - Brief is good +overview=true +# Output errors to stdout instead of stderr - keeps things together +stdout=true +# Echo header fields - I'd rather see it +header=true +# Ignore markup in < > - Need this +markup=true +# Use file of user-defined typos - And this +usertypo=true +# Verbose - list everything - Contrary by name... +verbose=true +# Set of characters valid for this ebook - Let's stick with Latin1 +charset=ISO-8859-1 +**************** EXPECTED(stdout) **************** +# Mary Contrary's configuration for bookloupe + +# Bookloupe will ignore this group, but it's nice to have. +[other] +# Look at me! +name="Mary Contrary" + +[options] +# Ignore DP-specific markup - sounds useful +dp=true +# Echo queried line - what's the point of that? +echo=false +# Check single quotes - yup +squote=true +# Check common typos - waste of time +typo=false +# Require closure of quotes on every paragraph - okay +qpara=true +# Enable paranoid querying of everything - Huh? +paranoid=false +# Enable line end checking - pointless +line-end=false +# Overview: just show counts - Brief is good +overview=true +# Output errors to stdout instead of stderr - keeps things together +stdout=true +# Echo header fields - I'd rather see it +header=true +# Ignore markup in < > - Need this +markup=true +# Use file of user-defined typos - And this +usertypo=true +# Verbose - list everything - Contrary by name... 
+verbose=true +# Set of characters valid for this ebook - Let's stick with Latin1 +charset=ISO-8859-1 diff -r 5926b21bcb38 -r d7a97f077f9e test/compatibility/Makefile.am --- a/test/compatibility/Makefile.am Wed Oct 02 23:53:32 2013 +0100 +++ b/test/compatibility/Makefile.am Wed Oct 02 23:58:38 2013 +0100 @@ -6,6 +6,7 @@ user-defined-typo.tst brackets.tst single-quotes.tst grave-quotes.tst \ dashes.tst control-characters.tst unusual-characters.tst \ windows-1252.tst periods.tst long-line.tst unmarked-paragraph.tst \ + paranoid.tst paranoid-typos.tst no-paranoid.tst no-paranoid-typos.tst \ hebe-jeebies.tst mail-from.tst scannos.tst before-comma.tst \ before-period.tst double-punctuation.tst genitives.tst embedded-cr.tst \ continuing-quotes.tst diff -r 5926b21bcb38 -r d7a97f077f9e test/compatibility/no-paranoid-typos.tst --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/compatibility/no-paranoid-typos.tst Wed Oct 02 23:58:38 2013 +0100 @@ -0,0 +1,12 @@ +**************** OPTIONS **************** +-x +-t +**************** INPUT **************** +In paranoid mode we check for a standalone digits. 1 think this is a useful +feature. When checking for typos every, strangly placed comma is reported. + +If paranoid mode is switched off, we can still check for typos. +**************** EXPECTED **************** + +feature. When checking for typos every, strangly placed comma is reported. + Line 2 column 39 - Query punctuation after every? diff -r 5926b21bcb38 -r d7a97f077f9e test/compatibility/no-paranoid.tst --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/compatibility/no-paranoid.tst Wed Oct 02 23:58:38 2013 +0100 @@ -0,0 +1,8 @@ +**************** OPTIONS **************** +-x +**************** INPUT **************** +In paranoid mode we check for a standalone digits. 1 think this is a useful +feature. When checking for typos every, strangly placed comma is reported. + +If paranoid mode is switched off, checking for typos defaults to off too. 
+**************** EXPECTED **************** diff -r 5926b21bcb38 -r d7a97f077f9e test/compatibility/paranoid-typos.tst --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/compatibility/paranoid-typos.tst Wed Oct 02 23:58:38 2013 +0100 @@ -0,0 +1,12 @@ +**************** OPTIONS **************** +-t +**************** INPUT **************** +In paranoid mode we check for a standalone digits. 1 think this is a useful +feature. When checking for typos every, strangly placed comma is reported. + +In paranoid mode (the default), typo checking is switched off with its +short option. +**************** EXPECTED **************** + +In paranoid mode we check for a standalone digits. 1 think this is a useful + Line 1 column 51 - Query standalone 1 diff -r 5926b21bcb38 -r d7a97f077f9e test/compatibility/paranoid.tst --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/compatibility/paranoid.tst Wed Oct 02 23:58:38 2013 +0100 @@ -0,0 +1,12 @@ +**************** INPUT **************** +In paranoid mode we check for a standalone digits. 1 think this is a useful +feature. When checking for typos every, strangly placed comma is reported. + +By default, both paranoid mode and checking for typos should be on. +**************** EXPECTED **************** + +In paranoid mode we check for a standalone digits. 1 think this is a useful + Line 1 column 51 - Query standalone 1 + +feature. When checking for typos every, strangly placed comma is reported. + Line 2 column 39 - Query punctuation after every? 
diff -r 5926b21bcb38 -r d7a97f077f9e test/harness/Makefile.am --- a/test/harness/Makefile.am Wed Oct 02 23:53:32 2013 +0100 +++ b/test/harness/Makefile.am Wed Oct 02 23:58:38 2013 +0100 @@ -5,5 +5,6 @@ loupe_test_SOURCES=loupe-test.c testcase.c testcase.h testcaseio.c \ testcaseio.h testcaseparser.c testcaseparser.h testcaseinput.c \ - testcaseinput.h warningsparser.c warningsparser.h + testcaseinput.h testcaseoutput.c testcaseoutput.h warningsparser.c \ + warningsparser.h loupe_test_LDADD=../../bl/libbl.la diff -r 5926b21bcb38 -r d7a97f077f9e test/harness/loupe-test.c --- a/test/harness/loupe-test.c Wed Oct 02 23:53:32 2013 +0100 +++ b/test/harness/loupe-test.c Wed Oct 02 23:58:38 2013 +0100 @@ -48,6 +48,7 @@ exit(1); } bl_set_print_handlers(); + g_setenv("BOOKLOUPE_CONFIG_PATH",".",TRUE); for(i=1;i #include "testcase.h" #include "testcaseinput.h" +#include "testcaseoutput.h" GQuark testcase_error_quark(void) { @@ -171,6 +172,64 @@ return g_string_free(filename,FALSE); } +/* + * Verify that all the output files specified by a testcase are present + * with the expected contents. 
+ */ +gboolean testcase_verify_output_files(Testcase *testcase) +{ + GSList *link; + GError *tmp_err=NULL; + gboolean retval=TRUE; + ssize_t offset; + gchar *contents; + TestcaseOutput *output; + /* Check every output in turn, stopping at the first one that fails. */ + for(link=testcase->outputs;link && retval;link=link->next) + { + output=link->data; + if (!testcase_output_read(testcase,output,&contents,NULL,&tmp_err)) + { + g_print("%s: FAIL\n",testcase->basename); + g_print("%s\n",tmp_err->message); + g_clear_error(&tmp_err); + retval=FALSE; + } + else + { + if (strcmp(contents,output->contents)) + { + g_print("%s: FAIL\n",testcase->basename); + offset=common_prefix_length(contents,output->contents); + if (!offset && !contents[offset]) + g_print("%s: Unexpected empty output from bookloupe.\n", + output->name); + else + { + g_print("%s: Unexpected output from bookloupe:\n", + output->name); + print_unexpected(contents,offset); + } + retval=FALSE; + } + g_free(contents); + } + } + /* Remove every output file, even when verification has already failed. */ + for(link=testcase->outputs;link;link=link->next) + if (!testcase_output_remove(testcase,link->data,&tmp_err)) + { + if (retval) + { + g_print("%s: FAIL\n",testcase->basename); + g_print("%s\n",tmp_err->message); + retval=FALSE; + } + g_clear_error(&tmp_err); + } + return retval; +} + gboolean testcase_spawn_bookloupe(Testcase *testcase,char **standard_output, GError **error) { @@ -460,7 +519,7 @@ gboolean r; size_t pos,offset; GString *header; - char *output,*filename,*s,*xfail=NULL; + char *filename,*s,*xfail=NULL; GError *error=NULL; if (!testcase_create_input_files(testcase,&error)) { @@ -469,7 +528,7 @@ g_error_free(error); return FALSE; } - r=testcase_spawn_bookloupe(testcase,&output,&error); + r=testcase_spawn_bookloupe(testcase,&testcase->test_output,&error); if (!r) { g_print("%s: FAIL\n",testcase->basename); @@ -486,35 +545,40 @@ g_error_free(error); return FALSE; } - header=g_string_new("\n\nFile: "); - g_string_append(header,filename); - g_string_append(header,"\n"); - if (!g_str_has_prefix(output,header->str)) + if (testcase->expected || 
testcase->warnings) { - g_print("%s: FAIL\n",testcase->basename); - g_print("Unexpected header from bookloupe:\n"); - offset=common_prefix_length(output,header->str); - print_unexpected(output,offset); - r=FALSE; - } - pos=header->len; - if (r) - { - /* Skip the summary */ - s=strstr(output+pos,"\n\n"); - if (s) - pos=s-output+2; - else + header=g_string_new("\n\nFile: "); + g_string_append(header,filename); + g_string_append(header,"\n"); + if (!g_str_has_prefix(testcase->test_output,header->str)) { g_print("%s: FAIL\n",testcase->basename); - g_print("Unterminated summary from bookloupe:\n%s\n",output+pos); + g_print("Unexpected header from bookloupe:\n"); + offset=common_prefix_length(testcase->test_output,header->str); + print_unexpected(testcase->test_output,offset); r=FALSE; } + pos=header->len; + if (r) + { + /* Skip the summary */ + s=strstr(testcase->test_output+pos,"\n\n"); + if (s) + pos=s-testcase->test_output+2; + else + { + g_print("%s: FAIL\n",testcase->basename); + g_print("Unterminated summary from bookloupe:\n%s\n", + testcase->test_output+pos); + r=FALSE; + } + } + g_string_free(header,TRUE); + r=testcase_check_warnings(testcase,testcase->test_output+pos,&xfail); } - g_string_free(header,TRUE); - r=testcase_check_warnings(testcase,output+pos,&xfail); + if (!testcase_verify_output_files(testcase)) + r=FALSE; g_free(filename); - g_free(output); if (r) { if (xfail) @@ -575,5 +639,6 @@ g_slist_free(testcase->warnings); g_free(testcase->encoding); g_strfreev(testcase->options); + g_free(testcase->test_output); g_free(testcase); } diff -r 5926b21bcb38 -r d7a97f077f9e test/harness/testcase.h --- a/test/harness/testcase.h Wed Oct 02 23:53:32 2013 +0100 +++ b/test/harness/testcase.h Wed Oct 02 23:58:38 2013 +0100 @@ -37,10 +37,12 @@ char *basename; char *tmpdir; GSList *inputs; + GSList *outputs; char *expected; GSList *warnings; char *encoding; /* The character encoding to talk to BOOKLOUPE in */ char **options; + char *test_output; enum { 
TESTCASE_XFAIL=1<<0, TESTCASE_TMP_DIR=1<<1, diff -r 5926b21bcb38 -r d7a97f077f9e test/harness/testcaseio.c --- a/test/harness/testcaseio.c Wed Oct 02 23:53:32 2013 +0100 +++ b/test/harness/testcaseio.c Wed Oct 02 23:58:38 2013 +0100 @@ -5,6 +5,7 @@ #include #include "testcaseparser.h" #include "testcaseinput.h" +#include "testcaseoutput.h" #include "testcaseio.h" #include "warningsparser.h" @@ -70,6 +71,25 @@ else if (!testcase->expected && !testcase->warnings && !strcmp(tag,"EXPECTED")) testcase->expected=g_strdup(text); + else if (g_str_has_prefix(tag,"EXPECTED(") && tag[strlen(tag)-1]==')') + { + arg=g_strndup(tag+9,strlen(tag)-10); + s=g_path_get_dirname(arg); + if (strcmp(s,".")) + { + g_printerr("%s: Expected files may not have a " + "directory component\n",arg); + g_free(s); + g_free(arg); + testcase_free(testcase); + testcase_parser_free(parser); + return NULL; + } + g_free(s); + testcase->outputs=g_slist_prepend(testcase->outputs, + testcase_output_new(arg,text)); + g_free(arg); + } else if (!testcase->expected && !testcase->warnings && !strcmp(tag,"WARNINGS")) { @@ -88,11 +108,14 @@ } else if (!testcase->encoding && !strcmp(tag,"ENCODING")) testcase->encoding=g_strchomp(g_strdup(text)); - else if (!testcase->encoding && !strcmp(tag,"OPTIONS")) + else if (!testcase->options && !strcmp(tag,"OPTIONS")) { testcase->options=g_strsplit(text,"\n",0); - g_free(testcase->options[g_strv_length(testcase->options)-1]); - testcase->options[g_strv_length(testcase->options)-1]=NULL; + if (testcase->options && g_strv_length(testcase->options)>0) + { + g_free(testcase->options[g_strv_length(testcase->options)-1]); + testcase->options[g_strv_length(testcase->options)-1]=NULL; + } } else { diff -r 5926b21bcb38 -r d7a97f077f9e test/harness/testcaseoutput.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/harness/testcaseoutput.c Wed Oct 02 23:58:38 2013 +0100 @@ -0,0 +1,140 @@ +#include +#include +#include +#include +#include +#include "testcase.h" +#include 
"testcaseoutput.h" + +/* + * Replace \r\n with \n, \n with U+240A (visible symbol for LF) + * and \r with U+240D (visible symbol for CR). + */ +static char *dos2unix(const char *text) +{ + gunichar c; + gboolean cr=FALSE; + const gunichar visible_lf=0x240A; + const gunichar visible_cr=0x240D; + GString *string; + string=g_string_new(NULL); + while(*text) + { + c=g_utf8_get_char(text); + text=g_utf8_next_char(text); + if (cr) + { + cr=FALSE; + if (c=='\n') + { + g_string_append_c(string,'\n'); + continue; + } + else + g_string_append_unichar(string,visible_cr); + } + if (c=='\r') + cr=TRUE; + else if (c=='\n') + g_string_append_unichar(string,visible_lf); + else + g_string_append_unichar(string,c); + } + if (cr) + g_string_append_unichar(string,visible_cr); + return g_string_free(string,FALSE); +} + +/* + * Read an output file needed for a testcase (as named by an EXPECTED(name) + * section), or bookloupe's captured output when the name is "stdout". Files + * are converted from the testcase's encoding to UTF-8, then through dos2unix(). + */ +gboolean testcase_output_read(Testcase *testcase,TestcaseOutput *output, + gchar **contents,gsize *length,GError **error) +{ + char *filename,*s,*t; + gsize len; + GError *tmp_err=NULL; + if (!strcmp(output->name,"stdout")) + { + *contents=g_strdup(testcase->test_output); + if (length) + *length=strlen(testcase->test_output); + } + else + { + if (testcase->tmpdir) + filename=g_build_filename(testcase->tmpdir,output->name,NULL); + else + filename=g_strdup(output->name); + if (!g_file_get_contents(filename,&s,&len,error)) + { + g_free(filename); + return FALSE; + } + g_free(filename); + if (testcase->encoding) + { + /* + * Convert to UTF-8 first: dos2unix() walks its input as UTF-8 + * and must never be handed text in any other encoding. + */ + t=g_convert(s,len,"UTF-8",testcase->encoding,NULL,NULL,&tmp_err); + g_free(s); + if (!t) + { + g_propagate_prefixed_error(error,tmp_err, + "Conversion from %s failed: ",testcase->encoding); + return FALSE; + } + s=t; + } + *contents=dos2unix(s); + g_free(s); + if (length) + *length=strlen(*contents); + } + return TRUE; +} + +/* + * Remove an output file 
created by program under test. + */ +gboolean testcase_output_remove(Testcase *testcase,TestcaseOutput *output, + GError **error) +{ + char *filename; + gboolean retval; + /* "stdout" is captured in memory, so there is no file to remove. */ + if (!strcmp(output->name,"stdout")) + return TRUE; + if (testcase->tmpdir) + filename=g_build_filename(testcase->tmpdir,output->name,NULL); + else + filename=g_strdup(output->name); + retval=g_unlink(filename)>=0; + if (!retval) + g_set_error(error,G_FILE_ERROR,g_file_error_from_errno(errno), + "%s: %s",filename,g_strerror(errno)); + g_free(filename); + return retval; +} + +/* Create a new description of an output file expected by a testcase */ +TestcaseOutput *testcase_output_new(const char *name,const char *contents) +{ + TestcaseOutput *output; + output=g_new0(TestcaseOutput,1); + output->name=g_strdup(name); + output->contents=g_strdup(contents); + return output; +} + +/* Free the description of a testcase output file */ +void testcase_output_free(TestcaseOutput *output) +{ + g_free(output->name); + g_free(output->contents); + g_free(output); +} diff -r 5926b21bcb38 -r d7a97f077f9e test/harness/testcaseoutput.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/harness/testcaseoutput.h Wed Oct 02 23:58:38 2013 +0100 @@ -0,0 +1,19 @@ +#ifndef TESTCASE_OUTPUT_H +#define TESTCASE_OUTPUT_H + +#include <glib.h> +#include "testcase.h" + +typedef struct { + char *name; + char *contents; +} TestcaseOutput; + +gboolean testcase_output_read(Testcase *testcase,TestcaseOutput *output, + gchar **contents,gsize *length,GError **error); +gboolean testcase_output_remove(Testcase *testcase,TestcaseOutput *output, + GError **error); +TestcaseOutput *testcase_output_new(const char *name,const char *contents); +void testcase_output_free(TestcaseOutput *output); + +#endif /* TESTCASE_OUTPUT_H */