diff -r 466f43a12118 -r d7a97f077f9e bookloupe/bookloupe.c --- a/bookloupe/bookloupe.c Wed Oct 02 23:51:18 2013 +0100 +++ b/bookloupe/bookloupe.c Wed Oct 02 23:58:38 2013 +0100 @@ -32,6 +32,9 @@ #include "pending.h" #include "HTMLentities.h" +gchar *charset; /* Or NULL for auto (ISO_8859-1/ASCII or UNICODE) */ +GIConv charset_validator=(GIConv)-1; + gchar *prevline; /* Common typos. */ @@ -127,36 +130,101 @@ }; gboolean pswit[SWITNO]; /* program switches */ +gchar *opt_charset; + +gboolean typo_compat,paranoid_compat; static GOptionEntry options[]={ { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH, "Ignore DP-specific markup", NULL }, - { "noecho", 'e', 0, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH, + { "no-dp", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE, + G_OPTION_ARG_NONE, pswit+DP_SWITCH, + "Don't ignore DP-specific markup", NULL }, + { "echo", 0, G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH, + "Echo queried line", NULL }, + { "no-echo", 'e', G_OPTION_FLAG_REVERSE, + G_OPTION_ARG_NONE, pswit+ECHO_SWITCH, "Don't echo queried line", NULL }, { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH, "Check single quotes", NULL }, - { "typo", 't', 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH, + { "no-squote", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE, + G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH, + "Don't check single quotes", NULL }, + { "typo", 0, 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH, "Check common typos", NULL }, + { "no-typo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE, + G_OPTION_ARG_NONE, pswit+TYPO_SWITCH, + "Don't check common typos", NULL }, { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH, "Require closure of quotes on every paragraph", NULL }, - { "relaxed", 'x', 0, G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH, + { "no-qpara", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE, + G_OPTION_ARG_NONE, pswit+QPARA_SWITCH, + "Don't require closure of quotes on every paragraph", NULL }, + { "paranoid", 0, G_OPTION_FLAG_HIDDEN, + 
G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH, + "Enable paranoid querying of everything", NULL }, + { "no-paranoid", 0, G_OPTION_FLAG_REVERSE, + G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH, "Disable paranoid querying of everything", NULL }, - { "line-end", 'l', 0, G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH, - "Disable line end checking", NULL }, + { "line-end", 0, G_OPTION_FLAG_HIDDEN, + G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH, + "Enable line end checking", NULL }, + { "no-line-end", 'l', G_OPTION_FLAG_REVERSE, + G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH, + "Diable line end checking", NULL }, { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH, "Overview: just show counts", NULL }, + { "no-overview", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE, + G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH, + "Show individual warnings", NULL }, { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH, "Output errors to stdout instead of stderr", NULL }, + { "no-stdout", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE, + G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH, + "Output errors to stderr instead of stdout", NULL }, { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH, "Echo header fields", NULL }, + { "no-header", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE, + G_OPTION_ARG_NONE, pswit+HEADER_SWITCH, + "Don't echo header fields", NULL }, { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH, "Ignore markup in < >", NULL }, + { "no-markup", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE, + G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH, + "No special handling for markup in < >", NULL }, { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH, "Use file of user-defined typos", NULL }, + { "no-usertypo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE, + G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH, + "Ignore file of user-defined typos", NULL }, + { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH, + "Verbose - list everything", NULL }, + { "no-verbose", 0, 
G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE, + G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH, + "Switch off verbose mode", NULL }, + { "charset", 0, 0, G_OPTION_ARG_STRING, &opt_charset, + "Set of characters valid for this ebook", "NAME" }, + { NULL } +}; + +/* + * Options relating to configuration which make no sense from inside + * a configuration file. + */ + +static GOptionEntry config_options[]={ { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH, "Defaults for use on www upload", NULL }, - { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH, - "Verbose - list everything", NULL }, + { "dump-config", 0, 0, G_OPTION_ARG_NONE, pswit+DUMP_CONFIG_SWITCH, + "Dump current config settings", NULL }, + { NULL } +}; + +static GOptionEntry compatibility_options[]={ + { "toggle-typo", 't', 0, G_OPTION_ARG_NONE, &typo_compat, + "Toggle checking for common typos", NULL }, + { "toggle-relaxed", 'x', 0, G_OPTION_ARG_NONE, &paranoid_compat, + "Toggle both paranoid mode and common typos", NULL }, { NULL } }; @@ -200,31 +268,275 @@ UINT saved_cp; #endif +gboolean set_charset(const char *name,GError **err) +{ + /* The various UNICODE encodings all share the same character set. 
*/ + const char *unicode_aliases[]={ "UCS-2", "UCS-2BE", "UCS-2LE", "UCS-4", + "UCS-4BE", "UCS-4LE", "UCS2", "UCS4", "UNICODE", "UNICODEBIG", + "UNICODELITTLE", "UTF-7", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", + "UTF-32", "UTF-32BE", "UTF-32LE", "UTF7", "UTF8", "UTF16", "UTF16BE", + "UTF16LE", "UTF32", "UTF32BE", "UTF32LE" }; + int i; + if (charset) + g_free(charset); + if (charset_validator!=(GIConv)-1) + g_iconv_close(charset_validator); + if (!name || !g_strcasecmp(name,"auto")) + { + charset=NULL; + charset_validator=(GIConv)-1; + return TRUE; + } + else + charset=g_strdup(name); + for(i=0;imessage); + exit(1); + } + g_clear_error(&err); + g_free(path); + path=NULL; + } + if (!search_dirs[i]) + { + g_key_file_free(kf); + kf=NULL; + } + g_strfreev(search_dirs); + if (full_path && kf) + *full_path=path; + else + g_free(path); + return kf; +} + +void parse_config_file(void) +{ + int i,j; + gchar *path,*s; + gchar **keys; + gboolean sw; + GError *err=NULL; + config=read_config_file(&path); + if (config) + keys=g_key_file_get_keys(config,"options",NULL,NULL); + else + keys=NULL; + if (keys) + { + for(i=0;keys[i];i++) + { + for(j=0;options[j].long_name;j++) + { + if (g_str_has_prefix(options[j].long_name,"no-")) + continue; + else if (!strcmp(keys[i],options[j].long_name)) + { + if (options[j].arg==G_OPTION_ARG_NONE) + { + sw=g_key_file_get_boolean(config,"options",keys[i], + &err); + if (err) + { + g_printerr("Bookloupe: %s: options.%s: %s\n", + path,keys[i],err->message); + g_clear_error(&err); + } + else + { + if (options[j].flags&G_OPTION_FLAG_REVERSE) + sw=!sw; + *(gboolean *)options[j].arg_data=sw; + } + break; + } + else if (options[j].arg==G_OPTION_ARG_STRING) + { + s=g_key_file_get_string(config,"options",keys[i], + &err); + if (err) + { + g_printerr("Bookloupe: %s: options.%s: %s\n", + path,keys[i],err->message); + g_clear_error(&err); + } + else + { + g_free(*(gchar **)options[j].arg_data); + if (!g_strcmp0(s,"auto")) + { + *(gchar 
**)options[j].arg_data=NULL; + g_free(s); + } + else + *(gchar **)options[j].arg_data=s; + } + break; + } + else + g_assert_not_reached(); + } + } + if (!options[j].long_name) + g_printerr("Bookloupe: %s: Unknown option \"%s\" ignored\n", + path,keys[i]); + } + g_strfreev(keys); + } + if (config) + g_free(path); +} + void parse_options(int *argc,char ***argv) { GError *err=NULL; GOptionContext *context; + GOptionGroup *compatibility; context=g_option_context_new( - "file - looks for errors in Project Gutenberg(TM) etexts"); + "file - look for errors in Project Gutenberg(TM) etexts"); g_option_context_add_main_entries(context,options,NULL); + g_option_context_add_main_entries(context,config_options,NULL); + compatibility=g_option_group_new("compatibility", + "Options for Compatibility with Gutcheck:", + "Show compatibility options",NULL,NULL); + g_option_group_add_entries(compatibility,compatibility_options); + g_option_context_add_group(context,compatibility); + g_option_context_set_description(context, + "For simplicity, only the switch options which reverse the\n" + "default configuration are listed. 
In most cases, both vanilla\n" + "and \"no-\" prefixed versions are available for use."); if (!g_option_context_parse(context,argc,argv,&err)) { g_printerr("Bookloupe: %s\n",err->message); g_printerr("Use \"%s --help\" for help\n",(*argv)[0]); exit(1); } - /* Paranoid checking is turned OFF, not on, by its switch */ - pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH]; - if (pswit[PARANOID_SWITCH]) - /* if running in paranoid mode, typo checks default to enabled */ + if (typo_compat) pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH]; - /* Line-end checking is turned OFF, not on, by its switch */ - pswit[LINE_END_SWITCH]=!pswit[LINE_END_SWITCH]; - /* Echoing is turned OFF, not on, by its switch */ - pswit[ECHO_SWITCH]=!pswit[ECHO_SWITCH]; - if (pswit[OVERVIEW_SWITCH]) - /* just print summary; don't echo */ - pswit[ECHO_SWITCH]=FALSE; + if (paranoid_compat) + { + pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH]; + pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH]; + } /* * Web uploads - for the moment, this is really just a placeholder * until we decide what processing we really want to do on web uploads @@ -246,6 +558,21 @@ pswit[USERTYPO_SWITCH]=FALSE; pswit[DP_SWITCH]=FALSE; } + if (opt_charset && !set_charset(opt_charset,&err)) + { + g_printerr("%s\n",err->message); + exit(1); + } + if (pswit[DUMP_CONFIG_SWITCH]) + { + dump_config(); + exit(0); + } + g_free(opt_charset); + opt_charset=NULL; + if (pswit[OVERVIEW_SWITCH]) + /* just print summary; don't echo */ + pswit[ECHO_SWITCH]=FALSE; if (*argc<2) { proghelp(context); @@ -305,7 +632,11 @@ exit(1); } if (g_utf8_validate(contents,len,NULL)) + { utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE); + if (!charset) + (void)set_charset("UNICODE",NULL); + } else utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL); g_free(contents); @@ -388,6 +719,15 @@ saved_cp=GetConsoleOutputCP(); #endif running_from=g_path_get_dirname(argv[0]); + /* Paranoid checking is turned OFF, not on, by its switch */ + 
pswit[PARANOID_SWITCH]=TRUE; + /* if running in paranoid mode, typo checks default to enabled */ + pswit[TYPO_SWITCH]=TRUE; + /* Line-end checking is turned OFF, not on, by its switch */ + pswit[LINE_END_SWITCH]=TRUE; + /* Echoing is turned OFF, not on, by its switch */ + pswit[ECHO_SWITCH]=TRUE; + parse_config_file(); parse_options(&argc,&argv); if (pswit[USERTYPO_SWITCH]) read_user_scannos(); @@ -428,6 +768,9 @@ g_free(running_from); if (usertypo) g_tree_unref(usertypo); + set_charset(NULL,NULL); + if (config) + g_key_file_free(config); return 0; } @@ -708,25 +1051,32 @@ "Not reporting them.\n", results->spacedash+results->non_PG_space_emdash); } - /* If more than a quarter of characters are hi-bit, bug out. */ - warnings.bin=1; - if (results->binlen*4>results->totlen) + if (charset) + warnings.bin=0; + else { - g_print(" --> This file does not appear to be ASCII. " - "Terminating. Best of luck with it!\n"); - exit(1); - } - if (results->alphalen*4<results->totlen) - { - g_print(" --> This file does not appear to be text. " - "Terminating. Best of luck with it!\n"); - exit(1); - } - if (results->binlen*100>results->totlen || results->binlen>100) - { - g_print(" --> There are a lot of foreign letters here. " - "Not reporting them.\n"); - warnings.bin=0; + /* Charset ISO_8859-1/ASCII checks for compatibility with gutcheck */ + warnings.bin=1; + /* If more than a quarter of characters are hi-bit, bug out. */ + if (results->binlen*4>results->totlen) + { + g_print(" --> This file does not appear to be ASCII. " + "Terminating. Best of luck with it!\n"); + exit(1); + } + if (results->alphalen*4<results->totlen) + { + g_print(" --> This file does not appear to be text. " + "Terminating. Best of luck with it!\n"); + exit(1); + } + if (results->binlen*100>results->totlen || results->binlen>100) + { + g_print(" --> There are a lot of foreign letters here. 
" + "Not reporting them.\n"); + if (!pswit[VERBOSE_SWITCH]) + warnings.bin=0; + } } warnings.isDutch=FALSE; if (results->Dutchcount>50) @@ -754,7 +1104,6 @@ g_print("\n"); if (pswit[VERBOSE_SWITCH]) { - warnings.bin=1; warnings.shortline=1; warnings.dotcomma=1; warnings.longline=1; @@ -949,14 +1298,17 @@ gboolean isemptyline) { /* Don't repeat multiple warnings on one line. */ - gboolean eNon_A=FALSE,eTab=FALSE,eTilde=FALSE; + gboolean eInvalidChar=FALSE,eTab=FALSE,eTilde=FALSE; gboolean eCarat=FALSE,eFSlash=FALSE,eAst=FALSE; const char *s; gunichar c; + gsize nb; + gchar *t; for (s=aline;*s;s=g_utf8_next_char(s)) { c=g_utf8_get_char(s); - if (!eNon_A && (c127)) + if (warnings->bin && !eInvalidChar && + (c127)) { if (pswit[ECHO_SWITCH]) g_print("\n%s\n",aline); @@ -971,7 +1323,57 @@ linecnt,g_utf8_pointer_to_offset(aline,s)+1,c); else cnt_bin++; - eNon_A=TRUE; + eInvalidChar=TRUE; + } + if (!eInvalidChar && charset) + { + if (charset_validator==(GIConv)-1) + { + if (!g_unichar_isdefined(c)) + { + if (pswit[ECHO_SWITCH]) + g_print("\n%s\n",aline); + if (!pswit[OVERVIEW_SWITCH]) + g_print(" Line %ld column %ld - Unassigned UNICODE " + "code point U+%04" G_GINT32_MODIFIER "X\n", + linecnt,g_utf8_pointer_to_offset(aline,s)+1,c); + else + cnt_bin++; + eInvalidChar=TRUE; + } + else if (c>=0xE000 && c<=0xF8FF || c>=0xF0000 && c<=0xFFFFD || + c>=100000 && c<=0x10FFFD) + { + if (pswit[ECHO_SWITCH]) + g_print("\n%s\n",aline); + if (!pswit[OVERVIEW_SWITCH]) + g_print(" Line %ld column %ld - Private Use " + "character U+%04" G_GINT32_MODIFIER "X\n", + linecnt,g_utf8_pointer_to_offset(aline,s)+1,c); + else + cnt_bin++; + eInvalidChar=TRUE; + } + } + else + { + t=g_convert_with_iconv(s,g_utf8_next_char(s)-s, + charset_validator,NULL,&nb,NULL); + if (t) + g_free(t); + else + { + if (pswit[ECHO_SWITCH]) + g_print("\n%s\n",aline); + if (!pswit[OVERVIEW_SWITCH]) + g_print(" Line %ld column %ld - Non-%s " + "character %u\n",linecnt, + g_utf8_pointer_to_offset(aline,s)+1,charset,c); + 
else + cnt_bin++; + eInvalidChar=TRUE; + } + } } if (!eTab && c==CHAR_TAB) { @@ -2626,8 +3028,7 @@ if (s>=aline && g_utf8_get_char(s)=='-') enddash=TRUE; check_for_control_characters(aline); - if (warnings->bin) - check_for_odd_characters(aline,warnings,isemptyline); + check_for_odd_characters(aline,warnings,isemptyline); if (warnings->longline) check_for_long_line(aline); if (warnings->shortline)