ali@0: /*************************************************************************/ ali@40: /* bookloupe--check for assorted weirdnesses in a PG candidate text file */ ali@68: /* */ ali@68: /* Copyright 2000-2005 Jim Tinsley */ ali@68: /* Copyright 2012- J. Ali Harlow */ ali@68: /* */ ali@0: /* This program is free software; you can redistribute it and/or modify */ ali@0: /* it under the terms of the GNU General Public License as published by */ ali@0: /* the Free Software Foundation; either version 2 of the License, or */ ali@68: /* (at your option) any later version. */ ali@68: /* */ ali@0: /* This program is distributed in the hope that it will be useful, */ ali@68: /* but WITHOUT ANY WARRANTY; without even the implied warranty of */ ali@68: /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ ali@68: /* GNU General Public License for more details. */ ali@68: /* */ ali@68: /* You should have received a copy of the GNU General Public License */ ali@68: /* along with this program. If not, see . */ ali@0: /*************************************************************************/ ali@0: ali@0: #include ali@0: #include ali@0: #include ali@0: #include ali@73: #ifdef __WIN32__ ali@73: #include ali@73: #endif ali@69: #include ali@69: #include ali@99: #include "bookloupe.h" ali@99: #include "counters.h" ali@103: #include "pending.h" ali@71: #include "HTMLentities.h" ali@0: ali@185: gchar *charset; /* Or NULL for auto (ISO_8859-1/ASCII or UNICODE) */ ali@185: GIConv charset_validator=(GIConv)-1; ali@185: ali@69: gchar *prevline; ali@0: ali@40: /* Common typos. 
*/ ali@40: char *typo[] = { ali@40: "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane", ali@40: "nad", "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa", ali@40: "bakc", "om", "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt", ali@40: "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr", "hmi", "hse", ali@40: "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd", ali@40: "gerat", "goign", "gruop", "haev", "hda", "hearign", "seeign", "sayign", ali@40: "herat", "hge", "hsa", "hsi", "hte", "htere", "htese", "htey", "htis", ali@40: "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut", ali@40: "loev", "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter", ali@40: "omre", "onyl", "otehr", "otu", "owrk", "owuld", "peice", "peices", ali@40: "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem", ali@40: "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe", ali@40: "sohw", "stnad", "stopry", "stoyr", "stpo", "tahn", "taht", "tath", ali@40: "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge", "thier", ali@40: "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne", ali@40: "tirne", "tkae", "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey", ali@40: "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih", "whihc", "whta", ali@40: "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats", ali@40: "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking", ali@40: "wtih", "wuould", "wya", "yera", "yeras", "yersa", "yoiu", "youve", ali@40: "ytou", "yuor", "abead", "ahle", "ahout", "ahove", "altbough", "balf", ali@40: "bardly", "bas", "bave", "baving", "bebind", "beld", "belp", "belped", ali@40: "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge", ali@40: "dehates", "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan", ali@40: "hegin", "heing", "helieve", "henefit", "hetter", "hetween", 
"heyond", ali@40: "hig", "higber", "huild", "huy", "hy", "jobn", "joh", "meanwbile", ali@40: "memher", "memhers", "numher", "numhers", "perbaps", "prohlem", "puhlic", ali@40: "witbout", "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud", ali@40: "prornise", "prornised", "modem", "bo", "heside", "chapteb", "chaptee", ali@40: "se", "" ali@40: }; ali@0: ali@69: GTree *usertypo; ali@0: ali@40: /* Common abbreviations and other OK words not to query as typos. */ ali@40: char *okword[] = { ali@40: "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm", ali@40: "rd", "sh", "br", "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", ali@40: "pompeii","hawaii","hawaiian", "hotbed", "heartbeat", "heartbeats", ali@40: "outbid", "outbids", "frostbite", "frostbitten", "" ali@40: }; ali@0: ali@40: /* Common abbreviations that cause otherwise unexplained periods. */ ali@40: char *abbrev[] = { ali@40: "cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op", ali@40: "cit", "deg", "min", "chap", "oz", "mme", "mlle", "mssrs", "" ali@40: }; ali@0: ali@40: /* ali@40: * Two-Letter combinations that rarely if ever start words, ali@40: * but are common scannos or otherwise common letter combinations. ali@40: */ ali@40: char *nostart[] = { ali@40: "hr", "hl", "cb", "sb", "tb", "wb", "tl", "tn", "rn", "lt", "tj", "" ali@40: }; ali@0: ali@40: /* ali@40: * Two-Letter combinations that rarely if ever end words, ali@40: * but are common scannos or otherwise common letter combinations. 
ali@40: */ ali@40: char *noend[] = { ali@40: "cb", "gb", "pb", "sb", "tb", "wh", "fr", "br", "qu", "tw", "gl", "fl", ali@40: "sw", "gr", "sl", "cl", "iy", "" ali@40: }; ali@0: ali@40: char *markup[] = { ali@40: "a", "b", "big", "blockquote", "body", "br", "center", "col", "div", "em", ali@40: "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i", ali@40: "img", "li", "meta", "ol", "p", "pre", "small", "span", "strong", "sub", ali@40: "sup", "table", "td", "tfoot", "thead", "title", "tr", "tt", "u", "ul", "" ali@40: }; ali@0: ali@40: char *DPmarkup[] = { ali@40: "", "", "/*", "*/", "/#", "#/", "/$", "$/", "", "" ali@40: }; ali@0: ali@40: char *nocomma[] = { ali@40: "the", "it's", "their", "an", "mrs", "a", "our", "that's", "its", "whose", ali@40: "every", "i'll", "your", "my", "mr", "mrs", "mss", "mssrs", "ft", "pm", ali@40: "st", "dr", "rd", "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "i'm", ali@40: "during", "let", "toward", "among", "" ali@40: }; ali@0: ali@40: char *noperiod[] = { ali@40: "every", "i'm", "during", "that's", "their", "your", "our", "my", "or", ali@40: "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether", ali@40: "i'll", "whose", "who", "because", "when", "let", "till", "very", "an", ali@40: "among", "those", "into", "whom", "having", "thence", "" ali@40: }; ali@0: ali@69: gboolean pswit[SWITNO]; /* program switches */ ali@185: gchar *opt_charset; ali@0: ali@186: gboolean typo_compat,paranoid_compat; ali@186: ali@69: static GOptionEntry options[]={ ali@69: { "dp", 'd', 0, G_OPTION_ARG_NONE, pswit+DP_SWITCH, ali@69: "Ignore DP-specific markup", NULL }, ali@186: { "no-dp", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE, ali@186: G_OPTION_ARG_NONE, pswit+DP_SWITCH, ali@186: "Don't ignore DP-specific markup", NULL }, ali@186: { "echo", 0, G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_NONE, pswit+ECHO_SWITCH, ali@186: "Echo queried line", NULL }, ali@186: { "no-echo", 'e', G_OPTION_FLAG_REVERSE, ali@186: 
G_OPTION_ARG_NONE, pswit+ECHO_SWITCH, ali@69: "Don't echo queried line", NULL }, ali@69: { "squote", 's', 0, G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH, ali@69: "Check single quotes", NULL }, ali@186: { "no-squote", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE, ali@186: G_OPTION_ARG_NONE, pswit+SQUOTE_SWITCH, ali@186: "Don't check single quotes", NULL }, ali@186: { "typo", 0, 0, G_OPTION_ARG_NONE, pswit+TYPO_SWITCH, ali@69: "Check common typos", NULL }, ali@186: { "no-typo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE, ali@186: G_OPTION_ARG_NONE, pswit+TYPO_SWITCH, ali@186: "Don't check common typos", NULL }, ali@69: { "qpara", 'p', 0, G_OPTION_ARG_NONE, pswit+QPARA_SWITCH, ali@69: "Require closure of quotes on every paragraph", NULL }, ali@186: { "no-qpara", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE, ali@186: G_OPTION_ARG_NONE, pswit+QPARA_SWITCH, ali@186: "Don't require closure of quotes on every paragraph", NULL }, ali@186: { "paranoid", 0, G_OPTION_FLAG_HIDDEN, ali@186: G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH, ali@186: "Enable paranoid querying of everything", NULL }, ali@186: { "no-paranoid", 0, G_OPTION_FLAG_REVERSE, ali@186: G_OPTION_ARG_NONE, pswit+PARANOID_SWITCH, ali@69: "Disable paranoid querying of everything", NULL }, ali@186: { "line-end", 0, G_OPTION_FLAG_HIDDEN, ali@186: G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH, ali@186: "Enable line end checking", NULL }, ali@186: { "no-line-end", 'l', G_OPTION_FLAG_REVERSE, ali@186: G_OPTION_ARG_NONE, pswit+LINE_END_SWITCH, ali@186: "Diable line end checking", NULL }, ali@69: { "overview", 'o', 0, G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH, ali@69: "Overview: just show counts", NULL }, ali@186: { "no-overview", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE, ali@186: G_OPTION_ARG_NONE, pswit+OVERVIEW_SWITCH, ali@186: "Show individual warnings", NULL }, ali@69: { "stdout", 'y', 0, G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH, ali@69: "Output errors to stdout instead of stderr", NULL }, ali@186: { "no-stdout", 0, 
G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE, ali@186: G_OPTION_ARG_NONE, pswit+STDOUT_SWITCH, ali@186: "Output errors to stderr instead of stdout", NULL }, ali@69: { "header", 'h', 0, G_OPTION_ARG_NONE, pswit+HEADER_SWITCH, ali@69: "Echo header fields", NULL }, ali@186: { "no-header", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE, ali@186: G_OPTION_ARG_NONE, pswit+HEADER_SWITCH, ali@186: "Don't echo header fields", NULL }, ali@69: { "markup", 'm', 0, G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH, ali@69: "Ignore markup in < >", NULL }, ali@186: { "no-markup", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE, ali@186: G_OPTION_ARG_NONE, pswit+MARKUP_SWITCH, ali@186: "No special handling for markup in < >", NULL }, ali@69: { "usertypo", 'u', 0, G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH, ali@69: "Use file of user-defined typos", NULL }, ali@186: { "no-usertypo", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE, ali@186: G_OPTION_ARG_NONE, pswit+USERTYPO_SWITCH, ali@186: "Ignore file of user-defined typos", NULL }, ali@186: { "verbose", 'v', 0, G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH, ali@186: "Verbose - list everything", NULL }, ali@186: { "no-verbose", 0, G_OPTION_FLAG_HIDDEN|G_OPTION_FLAG_REVERSE, ali@186: G_OPTION_ARG_NONE, pswit+VERBOSE_SWITCH, ali@186: "Switch off verbose mode", NULL }, ali@187: { "charset", 0, 0, G_OPTION_ARG_STRING, &opt_charset, ali@187: "Set of characters valid for this ebook", "NAME" }, ali@186: { NULL } ali@186: }; ali@186: ali@186: /* ali@186: * Options relating to configuration which make no sense from inside ali@186: * a configuration file. 
ali@186: */ ali@186: ali@186: static GOptionEntry config_options[]={ ali@69: { "web", 'w', 0, G_OPTION_ARG_NONE, pswit+WEB_SWITCH, ali@69: "Defaults for use on www upload", NULL }, ali@186: { "dump-config", 0, 0, G_OPTION_ARG_NONE, pswit+DUMP_CONFIG_SWITCH, ali@186: "Dump current config settings", NULL }, ali@186: { NULL } ali@186: }; ali@186: ali@186: static GOptionEntry compatibility_options[]={ ali@186: { "toggle-typo", 't', 0, G_OPTION_ARG_NONE, &typo_compat, ali@186: "Toggle checking for common typos", NULL }, ali@186: { "toggle-relaxed", 'x', 0, G_OPTION_ARG_NONE, ¶noid_compat, ali@186: "Toggle both paranoid mode and common typos", NULL }, ali@69: { NULL } ali@69: }; ali@0: ali@142: long cnt_quote; /* for overview mode, count of quote queries */ ali@68: long cnt_brack; /* for overview mode, count of brackets queries */ ali@68: long cnt_bin; /* for overview mode, count of non-ASCII queries */ ali@68: long cnt_odd; /* for overview mode, count of odd character queries */ ali@68: long cnt_long; /* for overview mode, count of long line errors */ ali@68: long cnt_short; /* for overview mode, count of short line queries */ ali@68: long cnt_punct; /* for overview mode, ali@68: count of punctuation and spacing queries */ ali@68: long cnt_dash; /* for overview mode, count of dash-related queries */ ali@68: long cnt_word; /* for overview mode, count of word queries */ ali@68: long cnt_html; /* for overview mode, count of html queries */ ali@68: long cnt_lineend; /* for overview mode, count of line-end queries */ ali@68: long cnt_spacend; /* count of lines with space at end */ ali@68: long linecnt; /* count of total lines in the file */ ali@68: long checked_linecnt; /* count of lines actually checked */ ali@0: ali@69: void proghelp(GOptionContext *context); ali@69: void procfile(const char *); ali@0: ali@69: gchar *running_from; ali@0: ali@70: gboolean mixdigit(const char *); ali@69: gchar *getaword(const char **); ali@69: char *flgets(char **,long); ali@0: void 
postprocess_for_HTML(char *); ali@0: char *linehasmarkup(char *); ali@0: char *losemarkup(char *); ali@70: gboolean tagcomp(const char *,const char *); ali@71: void loseentities(char *); ali@69: gboolean isroman(const char *); ali@0: void postprocess_for_DP(char *); ali@72: void print_as_windows_1252(const char *string); ali@72: void print_as_utf_8(const char *string); ali@0: ali@69: GTree *qword,*qperiod; ali@68: ali@73: #ifdef __WIN32__ ali@73: UINT saved_cp; ali@73: #endif ali@73: ali@186: GKeyFile *config; ali@186: ali@186: void config_file_update(GKeyFile *kf) ali@186: { ali@186: int i; ali@186: gboolean sw; ali@186: for(i=0;options[i].long_name;i++) ali@186: { ali@186: if (g_str_has_prefix(options[i].long_name,"no-")) ali@186: continue; ali@186: if (options[i].arg==G_OPTION_ARG_NONE) ali@186: { ali@186: sw=*(gboolean *)options[i].arg_data; ali@186: if (options[i].flags&G_OPTION_FLAG_REVERSE) ali@186: sw=!sw; ali@186: g_key_file_set_boolean(kf,"options",options[i].long_name,sw); ali@186: } ali@186: else ali@186: g_assert_not_reached(); ali@186: } ali@186: } ali@186: ali@186: void config_file_add_comments(GKeyFile *kf) ali@186: { ali@186: int i; ali@186: gchar *comment; ali@186: g_key_file_set_comment(kf,NULL,NULL," Default configuration for bookloupe", ali@186: NULL); ali@186: for(i=0;options[i].long_name;i++) ali@186: { ali@186: if (g_str_has_prefix(options[i].long_name,"no-")) ali@186: continue; ali@186: comment=g_strconcat(" ",options[i].description,NULL); ali@186: g_key_file_set_comment(kf,"options",options[i].long_name,comment,NULL); ali@186: g_free(comment); ali@186: } ali@186: } ali@186: ali@186: void dump_config(void) ali@186: { ali@186: gchar *s; ali@186: if (config) ali@186: config_file_update(config); ali@186: else ali@186: { ali@186: config=g_key_file_new(); ali@186: config_file_update(config); ali@186: config_file_add_comments(config); ali@186: } ali@186: s=g_key_file_to_data(config,NULL,NULL); ali@186: if (s) ali@186: g_print("%s",s); ali@186: 
g_free(s); ali@186: } ali@186: ali@186: GKeyFile *read_config_file(gchar **full_path) ali@186: { ali@186: int i; ali@186: GError *err=NULL; ali@186: gchar **search_dirs; ali@186: gchar *path; ali@186: const char *search_path; ali@186: GKeyFile *kf; ali@186: kf=g_key_file_new(); ali@186: search_path=g_getenv("BOOKLOUPE_CONFIG_PATH"); ali@186: if (search_path) ali@186: { ali@186: #ifdef __WIN32__ ali@186: search_dirs=g_strsplit(search_path,";",0); ali@186: #else ali@186: search_dirs=g_strsplit(search_path,":",0); ali@186: #endif ali@186: } ali@186: else ali@186: { ali@186: search_dirs=g_new(gchar *,4); ali@186: search_dirs[0]=g_get_current_dir(); ali@186: search_dirs[1]=g_strdup(running_from); ali@186: search_dirs[2]=g_strdup(g_get_user_config_dir()); ali@186: search_dirs[3]=NULL; ali@186: } ali@186: for(i=0;search_dirs[i];i++) ali@186: { ali@186: path=g_build_filename(search_dirs[i],"bookloupe.ini",NULL); ali@186: if (g_key_file_load_from_file(kf,path, ali@186: G_KEY_FILE_KEEP_COMMENTS|G_KEY_FILE_KEEP_TRANSLATIONS,&err)) ali@186: break; ali@186: if (!g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT)) ali@186: { ali@186: g_printerr("Bookloupe: Error reading %s\n",path); ali@186: g_printerr("%s\n",err->message); ali@186: exit(1); ali@186: } ali@186: g_clear_error(&err); ali@186: g_free(path); ali@186: path=NULL; ali@186: } ali@186: if (!search_dirs[i]) ali@186: { ali@186: g_key_file_free(kf); ali@186: kf=NULL; ali@186: } ali@186: g_strfreev(search_dirs); ali@186: if (full_path && kf) ali@186: *full_path=path; ali@186: else ali@186: g_free(path); ali@186: return kf; ali@186: } ali@186: ali@186: void parse_config_file(void) ali@186: { ali@186: int i,j; ali@186: gchar *path; ali@186: gchar **keys; ali@186: gboolean sw; ali@186: GError *err=NULL; ali@186: config=read_config_file(&path); ali@186: if (config) ali@186: keys=g_key_file_get_keys(config,"options",NULL,NULL); ali@186: else ali@186: keys=NULL; ali@186: if (keys) ali@186: { ali@186: for(i=0;keys[i];i++) 
ali@186: { ali@186: for(j=0;options[j].long_name;j++) ali@186: { ali@186: if (g_str_has_prefix(options[j].long_name,"no-")) ali@186: continue; ali@186: else if (!strcmp(keys[i],options[j].long_name)) ali@186: { ali@186: if (options[j].arg==G_OPTION_ARG_NONE) ali@186: { ali@186: sw=g_key_file_get_boolean(config,"options",keys[i], ali@186: &err); ali@186: if (err) ali@186: { ali@186: g_printerr("Bookloupe: %s: options.%s: %s\n", ali@186: path,keys[i],err->message); ali@186: g_clear_error(&err); ali@186: } ali@186: if (options[j].flags&G_OPTION_FLAG_REVERSE) ali@186: sw=!sw; ali@186: *(gboolean *)options[j].arg_data=sw; ali@186: break; ali@186: } ali@186: else ali@186: g_assert_not_reached(); ali@186: } ali@186: } ali@186: if (!options[j].long_name) ali@186: g_printerr("Bookloupe: %s: Unknown option \"%s\" ignored\n", ali@186: path,keys[i]); ali@186: } ali@186: g_strfreev(keys); ali@186: } ali@186: if (config) ali@186: g_free(path); ali@186: } ali@186: ali@185: gboolean set_charset(const char *name,GError **err) ali@185: { ali@185: /* The various UNICODE encodings all share the same character set. 
*/ ali@185: const char *unicode_aliases[]={ "UCS-2", "UCS-2BE", "UCS-2LE", "UCS-4", ali@185: "UCS-4BE", "UCS-4LE", "UCS2", "UCS4", "UNICODE", "UNICODEBIG", ali@185: "UNICODELITTLE", "UTF-7", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", ali@185: "UTF-32", "UTF-32BE", "UTF-32LE", "UTF7", "UTF8", "UTF16", "UTF16BE", ali@185: "UTF16LE", "UTF32", "UTF32BE", "UTF32LE" }; ali@185: int i; ali@185: if (charset) ali@185: g_free(charset); ali@185: if (charset_validator!=(GIConv)-1) ali@185: g_iconv_close(charset_validator); ali@185: if (!name || !g_strcasecmp(name,"auto")) ali@185: { ali@185: charset=NULL; ali@185: charset_validator=(GIConv)-1; ali@185: return TRUE; ali@185: } ali@185: else ali@185: charset=g_strdup(name); ali@185: for(i=0;imessage); ali@69: g_printerr("Use \"%s --help\" for help\n",(*argv)[0]); ali@69: exit(1); ali@69: } ali@186: if (typo_compat) ali@69: pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH]; ali@186: if (paranoid_compat) ali@186: { ali@186: pswit[PARANOID_SWITCH]=!pswit[PARANOID_SWITCH]; ali@186: pswit[TYPO_SWITCH]=!pswit[TYPO_SWITCH]; ali@186: } ali@40: /* ali@40: * Web uploads - for the moment, this is really just a placeholder ali@40: * until we decide what processing we really want to do on web uploads ali@40: */ ali@40: if (pswit[WEB_SWITCH]) ali@40: { ali@40: /* specific override for web uploads */ ali@69: pswit[ECHO_SWITCH]=TRUE; ali@69: pswit[SQUOTE_SWITCH]=FALSE; ali@69: pswit[TYPO_SWITCH]=TRUE; ali@69: pswit[QPARA_SWITCH]=FALSE; ali@69: pswit[PARANOID_SWITCH]=TRUE; ali@69: pswit[LINE_END_SWITCH]=FALSE; ali@69: pswit[OVERVIEW_SWITCH]=FALSE; ali@69: pswit[STDOUT_SWITCH]=FALSE; ali@69: pswit[HEADER_SWITCH]=TRUE; ali@69: pswit[VERBOSE_SWITCH]=FALSE; ali@69: pswit[MARKUP_SWITCH]=FALSE; ali@69: pswit[USERTYPO_SWITCH]=FALSE; ali@69: pswit[DP_SWITCH]=FALSE; ali@40: } ali@185: if (opt_charset && !set_charset(opt_charset,&err)) ali@185: { ali@185: g_printerr("%s\n",err->message); ali@185: exit(1); ali@185: } ali@186: if (pswit[DUMP_CONFIG_SWITCH]) ali@186: { 
ali@186: dump_config(); ali@186: exit(0); ali@186: } ali@185: g_free(opt_charset); ali@185: opt_charset=NULL; ali@186: if (pswit[OVERVIEW_SWITCH]) ali@186: /* just print summary; don't echo */ ali@186: pswit[ECHO_SWITCH]=FALSE; ali@69: if (*argc<2) ali@40: { ali@69: proghelp(context); ali@69: exit(1); ali@40: } ali@69: g_option_context_free(context); ali@69: } ali@69: ali@69: /* ali@69: * read_user_scannos: ali@69: * ali@69: * Read in the user-defined stealth scanno list. ali@69: */ ali@69: void read_user_scannos(void) ali@69: { ali@69: GError *err=NULL; ali@69: gchar *usertypo_file; ali@69: gboolean okay; ali@69: int i; ali@70: gsize len,nb; ali@70: gchar *contents,*utf8,**lines; ali@69: usertypo_file=g_strdup("bookloupe.typ"); ali@69: okay=file_get_contents_text(usertypo_file,&contents,&len,&err); ali@69: if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT)) ali@69: { ali@69: g_clear_error(&err); ali@69: g_free(usertypo_file); ali@69: usertypo_file=g_build_filename(running_from,"bookloupe.typ",NULL); ali@69: okay=file_get_contents_text(usertypo_file,&contents,&len,&err); ali@69: } ali@69: if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT)) ali@69: { ali@69: g_clear_error(&err); ali@69: g_free(usertypo_file); ali@69: usertypo_file=g_strdup("gutcheck.typ"); ali@69: okay=file_get_contents_text(usertypo_file,&contents,&len,&err); ali@69: } ali@69: if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT)) ali@69: { ali@69: g_clear_error(&err); ali@69: g_free(usertypo_file); ali@69: usertypo_file=g_build_filename(running_from,"gutcheck.typ",NULL); ali@69: okay=file_get_contents_text(usertypo_file,&contents,&len,&err); ali@69: } ali@69: if (g_error_matches(err,G_FILE_ERROR,G_FILE_ERROR_NOENT)) ali@69: { ali@69: g_free(usertypo_file); ali@70: g_print(" --> I couldn't find bookloupe.typ " ali@69: "-- proceeding without user typos.\n"); ali@69: return; ali@69: } ali@69: else if (!okay) ali@69: { ali@69: fprintf(stderr,"%s: %s\n",usertypo_file,err->message); 
ali@69: g_free(usertypo_file); ali@69: g_clear_error(&err); ali@69: exit(1); ali@69: } ali@72: if (g_utf8_validate(contents,len,NULL)) ali@185: { ali@72: utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE); ali@185: if (!charset) ali@185: (void)set_charset("UNICODE",NULL); ali@185: } ali@72: else ali@72: utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",NULL,&nb,NULL); ali@70: g_free(contents); ali@70: lines=g_strsplit_set(utf8,"\r\n",0); ali@70: g_free(utf8); ali@69: usertypo=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL); ali@69: for (i=0;lines[i];i++) ali@69: if (*(unsigned char *)lines[i]>'!') ali@69: g_tree_insert(usertypo,lines[i],GINT_TO_POINTER(1)); ali@69: else ali@69: g_free(lines[i]); ali@69: g_free(lines); ali@69: } ali@69: ali@69: /* ali@69: * read_etext: ali@69: * ali@69: * Read an etext returning a newly allocated string containing the file ali@69: * contents or NULL on error. ali@69: */ ali@69: gchar *read_etext(const char *filename,GError **err) ali@69: { ali@76: GError *tmp_err=NULL; ali@70: gchar *contents,*utf8; ali@76: gsize len,bytes_read,bytes_written; ali@76: int i,line,col; ali@69: if (!g_file_get_contents(filename,&contents,&len,err)) ali@69: return NULL; ali@72: if (g_utf8_validate(contents,len,NULL)) ali@72: { ali@72: utf8=g_utf8_normalize(contents,len,G_NORMALIZE_DEFAULT_COMPOSE); ali@72: g_set_print_handler(print_as_utf_8); ali@73: #ifdef __WIN32__ ali@73: SetConsoleOutputCP(CP_UTF8); ali@73: #endif ali@72: } ali@72: else ali@72: { ali@76: utf8=g_convert(contents,len,"UTF-8","WINDOWS-1252",&bytes_read, ali@76: &bytes_written,&tmp_err); ali@76: if (g_error_matches(tmp_err,G_CONVERT_ERROR, ali@76: G_CONVERT_ERROR_ILLEGAL_SEQUENCE)) ali@76: { ali@76: line=col=1; ali@76: for(i=0;ibase++; ali@147: for(i=1;tokens[i];i++) ali@147: { ali@147: pc=g_utf8_get_char(g_utf8_prev_char(tokens[i-1]+strlen(tokens[i-1]))); ali@147: nc=g_utf8_get_char(tokens[i]); ali@147: if (g_unichar_isspace(pc) || g_unichar_isspace(nc)) 
ali@147: spaced=TRUE; ali@147: if (g_unichar_isspace(pc) && g_unichar_isspace(nc)) ali@147: spaced2=TRUE; ali@147: else if (!g_unichar_isspace(pc) && !g_unichar_isspace(nc)) ali@147: unspaced=TRUE; ali@147: } ali@147: if (spaced) ali@147: results->space++; ali@147: if (spaced2) ali@147: /* count of lines with em-dashes with spaces both sides */ ali@147: results->non_PG_space++; ali@147: if (unspaced) ali@147: /* count of lines with PG-type em-dashes with no spaces */ ali@147: results->PG_space++; ali@147: g_strfreev(tokens); ali@147: } ali@147: ali@40: /* ali@41: * first_pass: ali@40: * ali@41: * Run a first pass - verify that it's a valid PG ali@41: * file, decide whether to report some things that ali@41: * occur many times in the text like long or short ali@41: * lines, non-standard dashes, etc. ali@40: */ ali@69: struct first_pass_results *first_pass(const char *etext) ali@0: { ali@70: gunichar laststart=CHAR_SPACE; ali@54: const char *s; ali@69: gchar *lc_line; ali@70: int i,j,lbytes,llen; ali@69: gchar **lines; ali@41: unsigned int lastlen=0,lastblen=0; ali@41: long spline=0,nspline=0; ali@41: static struct first_pass_results results={0}; ali@147: struct dash_results tmp_dash_results; ali@69: gchar *inword; ali@142: QuoteClass qc; ali@69: lines=g_strsplit(etext,"\n",0); ali@69: for (j=0;lines[j];j++) ali@40: { ali@70: lbytes=strlen(lines[j]); ali@82: while (lbytes>0 && lines[j][lbytes-1]=='\r') ali@70: lines[j][--lbytes]='\0'; ali@70: llen=g_utf8_strlen(lines[j],lbytes); ali@68: linecnt++; ali@69: if (strstr(lines[j],"*END") && strstr(lines[j],"SMALL PRINT") && ali@69: (strstr(lines[j],"PUBLIC DOMAIN") || strstr(lines[j],"COPYRIGHT"))) ali@40: { ali@68: if (spline) ali@70: g_print(" --> Duplicate header?\n"); ali@68: spline=linecnt+1; /* first line of non-header text, that is */ ali@40: } ali@69: if (!strncmp(lines[j],"*** START",9) && ali@69: strstr(lines[j],"PROJECT GUTENBERG")) ali@40: { ali@68: if (nspline) ali@70: g_print(" --> Duplicate header?\n"); 
ali@68: nspline=linecnt+1; /* first line of non-header text, that is */ ali@40: } ali@68: if (spline || nspline) ali@40: { ali@70: lc_line=g_utf8_strdown(lines[j],lbytes); ali@69: if (strstr(lc_line,"end") && strstr(lc_line,"project gutenberg")) ali@40: { ali@69: if (strstr(lc_line,"end") Duplicate footer?\n"); ali@40: } ali@68: else ali@68: results.footerline=linecnt; ali@40: } ali@40: } ali@69: g_free(lc_line); ali@40: } ali@68: if (spline) ali@41: results.firstline=spline; ali@68: if (nspline) ali@41: results.firstline=nspline; /* override with new */ ali@68: if (results.footerline) ali@40: continue; /* don't count the boilerplate in the footer */ ali@68: results.totlen+=llen; ali@70: for (s=lines[j];*s;s=g_utf8_next_char(s)) ali@40: { ali@70: if (g_utf8_get_char(s)>127) ali@41: results.binlen++; ali@70: if (g_unichar_isalpha(g_utf8_get_char(s))) ali@41: results.alphalen++; ali@142: if (s>lines[j]) ali@142: { ali@142: if (CHAR_IS_DQUOTE(g_utf8_get_char(s))) ali@142: qc=QUOTE_CLASS(g_utf8_get_char(s)); ali@142: else ali@142: qc=INVALID_QUOTE; ali@142: if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) && ali@147: g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s)))) ali@142: results.endquote_count++; ali@142: } ali@40: } ali@69: if (llen>2 && lastlen>2 && lastlen2 && ali@69: lastblen>SHORTEST_PG_LINE && laststart!=CHAR_SPACE) ali@41: results.shortline++; ali@70: if (lbytes>0 && ali@70: g_utf8_get_char(g_utf8_prev_char(lines[j]+lbytes))<=CHAR_SPACE) ali@40: cnt_spacend++; ali@69: if (strstr(lines[j],".,")) ali@41: results.dotcomma++; ali@68: /* only count ast lines for ignoring purposes where there is */ ali@68: /* locase text on the line */ ali@69: if (strchr(lines[j],'*')) ali@40: { ali@70: for (s=lines[j];*s;s=g_utf8_next_char(s)) ali@70: if (g_unichar_islower(g_utf8_get_char(s))) ali@68: break; ali@70: if (*s) ali@41: results.astline++; ali@40: } ali@69: if (strchr(lines[j],'/')) ali@68: results.fslashline++; ali@82: if (lbytes>0) ali@82: { ali@82: for 
(s=g_utf8_prev_char(lines[j]+lbytes); ali@82: s>lines[j] && g_utf8_get_char(s)<=CHAR_SPACE; ali@82: s=g_utf8_prev_char(s)) ali@82: ; ali@82: if (s>g_utf8_next_char(lines[j]) && g_utf8_get_char(s)=='-' && ali@82: g_utf8_get_char(g_utf8_prev_char(s))!='-') ali@82: results.hyphens++; ali@82: } ali@68: if (llen>LONGEST_PG_LINE) ali@41: results.longline++; ali@68: if (llen>WAY_TOO_LONG) ali@41: results.verylongline++; ali@69: if (strchr(lines[j],'<') && strchr(lines[j],'>')) ali@40: { ali@69: i=(int)(strchr(lines[j],'>')-strchr(lines[j],'<')+1); ali@68: if (i>0) ali@68: results.htmcount++; ali@69: if (strstr(lines[j],"")) ali@41: results.htmcount+=4; /* bonus marks! */ ali@40: } ali@68: /* Check for spaced em-dashes */ ali@147: memset(&tmp_dash_results,0,sizeof(tmp_dash_results)); ali@147: count_dashes(lines[j],"--",&tmp_dash_results); ali@147: count_dashes(lines[j],"—",&tmp_dash_results); ali@147: if (tmp_dash_results.base) ali@147: results.emdash.base++; ali@147: if (tmp_dash_results.non_PG_space) ali@147: results.emdash.non_PG_space++; ali@147: if (tmp_dash_results.PG_space) ali@147: results.emdash.PG_space++; ali@69: for (s=lines[j];*s;) ali@40: { ali@69: inword=getaword(&s); ali@68: if (!strcmp(inword,"hij") || !strcmp(inword,"niet")) ali@68: results.Dutchcount++; ali@68: if (!strcmp(inword,"dans") || !strcmp(inword,"avec")) ali@68: results.Frenchcount++; ali@68: if (!strcmp(inword,"0") || !strcmp(inword,"1")) ali@68: results.standalone_digit++; ali@69: g_free(inword); ali@40: } ali@68: /* Check for spaced dashes */ ali@69: if (strstr(lines[j]," -") && *(strstr(lines[j]," -")+2)!='-') ali@41: results.spacedash++; ali@68: lastblen=lastlen; ali@69: lastlen=llen; ali@69: laststart=lines[j][0]; ali@40: } ali@69: g_strfreev(lines); ali@41: return &results; ali@41: } ali@41: ali@42: /* ali@42: * report_first_pass: ali@42: * ali@42: * Make some snap decisions based on the first pass results. 
ali@42: */ ali@42: struct warnings *report_first_pass(struct first_pass_results *results) ali@42: { ali@42: static struct warnings warnings={0}; ali@42: if (cnt_spacend>0) ali@70: g_print(" --> %ld lines in this file have white space at end\n", ali@42: cnt_spacend); ali@42: warnings.dotcomma=1; ali@42: if (results->dotcomma>5) ali@42: { ali@68: warnings.dotcomma=0; ali@70: g_print(" --> %ld lines in this file contain '.,'. " ali@42: "Not reporting them.\n",results->dotcomma); ali@42: } ali@42: /* ali@42: * If more than 50 lines, or one-tenth, are short, ali@42: * don't bother reporting them. ali@42: */ ali@42: warnings.shortline=1; ali@42: if (results->shortline>50 || results->shortline*10>linecnt) ali@42: { ali@68: warnings.shortline=0; ali@70: g_print(" --> %ld lines in this file are short. " ali@42: "Not reporting short lines.\n",results->shortline); ali@42: } ali@42: /* ali@42: * If more than 50 lines, or one-tenth, are long, ali@42: * don't bother reporting them. ali@42: */ ali@42: warnings.longline=1; ali@42: if (results->longline>50 || results->longline*10>linecnt) ali@42: { ali@68: warnings.longline=0; ali@70: g_print(" --> %ld lines in this file are long. " ali@42: "Not reporting long lines.\n",results->longline); ali@42: } ali@42: /* If more than 10 lines contain asterisks, don't bother reporting them. */ ali@42: warnings.ast=1; ali@42: if (results->astline>10) ali@42: { ali@68: warnings.ast=0; ali@70: g_print(" --> %ld lines in this file contain asterisks. " ali@42: "Not reporting them.\n",results->astline); ali@42: } ali@42: /* ali@42: * If more than 10 lines contain forward slashes, ali@42: * don't bother reporting them. ali@42: */ ali@42: warnings.fslash=1; ali@42: if (results->fslashline>10) ali@42: { ali@68: warnings.fslash=0; ali@70: g_print(" --> %ld lines in this file contain forward slashes. 
" ali@42: "Not reporting them.\n",results->fslashline); ali@42: } ali@42: /* ali@42: * If more than 20 lines contain unpunctuated endquotes, ali@42: * don't bother reporting them. ali@42: */ ali@42: warnings.endquote=1; ali@42: if (results->endquote_count>20) ali@42: { ali@68: warnings.endquote=0; ali@70: g_print(" --> %ld lines in this file contain unpunctuated endquotes. " ali@42: "Not reporting them.\n",results->endquote_count); ali@42: } ali@42: /* ali@42: * If more than 15 lines contain standalone digits, ali@42: * don't bother reporting them. ali@42: */ ali@42: warnings.digit=1; ali@42: if (results->standalone_digit>10) ali@42: { ali@68: warnings.digit=0; ali@70: g_print(" --> %ld lines in this file contain standalone 0s and 1s. " ali@42: "Not reporting them.\n",results->standalone_digit); ali@42: } ali@42: /* ali@42: * If more than 20 lines contain hyphens at end, ali@42: * don't bother reporting them. ali@42: */ ali@42: warnings.hyphen=1; ali@42: if (results->hyphens>20) ali@42: { ali@68: warnings.hyphen=0; ali@70: g_print(" --> %ld lines in this file have hyphens at end. " ali@42: "Not reporting them.\n",results->hyphens); ali@42: } ali@42: if (results->htmcount>20 && !pswit[MARKUP_SWITCH]) ali@42: { ali@70: g_print(" --> Looks like this is HTML. Switching HTML mode ON.\n"); ali@68: pswit[MARKUP_SWITCH]=1; ali@42: } ali@42: if (results->verylongline>0) ali@70: g_print(" --> %ld lines in this file are VERY long!\n", ali@42: results->verylongline); ali@42: /* ali@42: * If there are more non-PG spaced dashes than PG em-dashes, ali@42: * assume it's deliberate. ali@42: * Current PG guidelines say don't use them, but older texts do, ali@42: * and some people insist on them whatever the guidelines say. ali@42: */ ali@42: warnings.dash=1; ali@147: if (results->spacedash+results->emdash.non_PG_space> ali@147: results->emdash.PG_space) ali@42: { ali@68: warnings.dash=0; ali@70: g_print(" --> There are %ld spaced dashes and em-dashes. 
" ali@42: "Not reporting them.\n", ali@147: results->spacedash+results->emdash.non_PG_space); ali@42: } ali@185: if (charset) ali@185: warnings.bin=0; ali@185: else ali@42: { ali@185: /* Charset ISO_8859-1/ASCII checks for compatibility with gutcheck */ ali@185: warnings.bin=1; ali@185: /* If more than a quarter of characters are hi-bit, bug out. */ ali@185: if (results->binlen*4>results->totlen) ali@185: { ali@185: g_print(" --> This file does not appear to be ASCII. " ali@185: "Terminating. Best of luck with it!\n"); ali@185: exit(1); ali@185: } ali@185: if (results->alphalen*4totlen) ali@185: { ali@185: g_print(" --> This file does not appear to be text. " ali@185: "Terminating. Best of luck with it!\n"); ali@185: exit(1); ali@185: } ali@185: if (results->binlen*100>results->totlen || results->binlen>100) ali@185: { ali@185: g_print(" --> There are a lot of foreign letters here. " ali@185: "Not reporting them.\n"); ali@185: if (!pswit[VERBOSE_SWITCH]) ali@185: warnings.bin=0; ali@185: } ali@42: } ali@69: warnings.isDutch=FALSE; ali@42: if (results->Dutchcount>50) ali@42: { ali@69: warnings.isDutch=TRUE; ali@70: g_print(" --> This looks like Dutch - " ali@42: "switching off dashes and warnings for 's Middags case.\n"); ali@42: } ali@69: warnings.isFrench=FALSE; ali@42: if (results->Frenchcount>50) ali@42: { ali@69: warnings.isFrench=TRUE; ali@70: g_print(" --> This looks like French - " ali@42: "switching off some doublepunct.\n"); ali@42: } ali@42: if (results->firstline && results->footerline) ali@70: g_print(" The PG header and footer appear to be already on.\n"); ali@42: else ali@42: { ali@68: if (results->firstline) ali@70: g_print(" The PG header is on - no footer.\n"); ali@68: if (results->footerline) ali@70: g_print(" The PG footer is on - no header.\n"); ali@42: } ali@70: g_print("\n"); ali@42: if (pswit[VERBOSE_SWITCH]) ali@42: { ali@68: warnings.shortline=1; ali@68: warnings.dotcomma=1; ali@68: warnings.longline=1; ali@68: warnings.dash=1; ali@68: 
warnings.digit=1; ali@68: warnings.ast=1; ali@68: warnings.fslash=1; ali@68: warnings.hyphen=1; ali@68: warnings.endquote=1; ali@70: g_print(" *** Verbose output is ON -- you asked for it! ***\n"); ali@42: } ali@42: if (warnings.isDutch) ali@68: warnings.dash=0; ali@42: if (results->footerline>0 && results->firstline>0 && ali@42: results->footerline>results->firstline && ali@42: results->footerline-results->firstline<100) ali@42: { ali@70: g_print(" --> I don't really know where this text starts. \n"); ali@70: g_print(" There are no reference points.\n"); ali@70: g_print(" I'm going to have to report the header and footer " ali@42: "as well.\n"); ali@68: results->firstline=0; ali@42: } ali@42: return &warnings; ali@42: } ali@42: ali@43: /* ali@43: * analyse_quotes: ali@43: * ali@43: * Look along the line, accumulate the count of quotes, and see ali@43: * if this is an empty line - i.e. a line with nothing on it ali@43: * but spaces. ali@43: * If line has just spaces, period, * and/or - on it, don't ali@43: * count it, since empty lines with asterisks or dashes to ali@43: * separate sections are common. ali@43: * ali@69: * Returns: TRUE if the line is empty. ali@43: */ ali@164: gboolean analyse_quotes(const char *aline,struct counters *counters) ali@43: { ali@68: int guessquote=0; ali@69: /* assume the line is empty until proven otherwise */ ali@69: gboolean isemptyline=TRUE; ali@70: const char *s=aline,*sprev,*snext; ali@70: gunichar c; ali@70: sprev=NULL; ali@142: GError *tmp_err=NULL; ali@43: while (*s) ali@43: { ali@70: snext=g_utf8_next_char(s); ali@70: c=g_utf8_get_char(s); ali@142: if (CHAR_IS_DQUOTE(c)) ali@142: (void)count_quote(counters,c,QUOTE_CLASS(c),&tmp_err); ali@142: else if (CHAR_IS_SQUOTE(c) && pswit[SQUOTE_SWITCH]) ali@43: { ali@43: if (s==aline) ali@43: { ali@43: /* ali@142: * At start of line, it can only be a quotation mark. ali@43: * Hardcode a very common exception! 
ali@43: */ ali@70: if (!g_str_has_prefix(snext,"tis") && ali@70: !g_str_has_prefix(snext,"Tis")) ali@142: (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err); ali@43: } ali@70: else if (g_unichar_isalpha(g_utf8_get_char(sprev)) && ali@70: g_unichar_isalpha(g_utf8_get_char(snext))) ali@43: /* Do nothing! it's definitely an apostrophe, not a quote */ ali@43: ; ali@43: /* it's outside a word - let's check it out */ ali@99: else if (c==CHAR_OPEN_SQUOTE || c==CHAR_LS_QUOTE || ali@70: g_unichar_isalpha(g_utf8_get_char(snext))) ali@43: { ali@142: /* certainly looks like a quotation mark */ ali@70: if (!g_str_has_prefix(snext,"tis") && ali@70: !g_str_has_prefix(snext,"Tis")) ali@43: /* hardcode a very common exception! */ ali@142: { ali@142: if (strchr(".?!,;:",g_utf8_get_char(sprev))) ali@142: (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err); ali@142: else ali@142: (void)count_quote(counters,c,OPENING_QUOTE,&tmp_err); ali@142: } ali@43: } ali@43: else ali@43: { ali@142: /* now - is it a quotation mark? */ ali@43: guessquote=0; /* accumulate clues */ ali@70: if (g_unichar_isalpha(g_utf8_get_char(sprev))) ali@43: { ali@43: /* it follows a letter - could be either */ ali@43: guessquote++; ali@70: if (g_utf8_get_char(sprev)=='s') ali@43: { ali@43: /* looks like a plural apostrophe */ ali@43: guessquote-=3; ali@70: if (g_utf8_get_char(snext)==CHAR_SPACE) ali@70: /* bonus marks! */ ali@43: guessquote-=2; ali@43: } ali@142: if (innermost_quote_matches(counters,c)) ali@142: /* ali@142: * Give it the benefit of some doubt, ali@142: * if a squote is already open. 
ali@142: */ ali@142: guessquote++; ali@142: else ali@142: guessquote--; ali@142: if (guessquote>=0) ali@142: (void)count_quote(counters,c,CLOSING_QUOTE,&tmp_err); ali@43: } ali@43: else ali@142: /* no adjacent letter - it must be a quote of some kind */ ali@142: (void)count_quote(counters,c,NEUTRAL_QUOTE,&tmp_err); ali@43: } ali@43: } ali@142: if (tmp_err) ali@142: { ali@142: if (pswit[ECHO_SWITCH]) ali@142: g_print("\n%s\n",aline); ali@142: if (!pswit[OVERVIEW_SWITCH]) ali@142: g_print(" Line %ld column %ld - %s\n", ali@142: linecnt,g_utf8_pointer_to_offset(aline,s)+1,tmp_err->message); ali@142: g_clear_error(&tmp_err); ali@142: } ali@70: if (c!=CHAR_SPACE && c!='-' && c!='.' && c!=CHAR_ASTERISK && ali@70: c!='\r' && c!='\n') ali@69: isemptyline=FALSE; /* ignore lines like * * * as spacers */ ali@70: if (c==CHAR_UNDERSCORE) ali@43: counters->c_unders++; ali@103: if (c==CHAR_OPEN_SBRACK) ali@103: { ali@103: if (!matching_difference(counters,COUNTER_ILLUSTRATION) && ali@103: !matching_difference(counters,c) && s==aline && ali@103: g_str_has_prefix(s,"[Illustration:")) ali@103: increment_matching(counters,COUNTER_ILLUSTRATION,TRUE); ali@103: else ali@103: increment_matching(counters,c,TRUE); ali@103: } ali@103: else if (c==CHAR_OPEN_CBRACK || c==CHAR_OPEN_RBRACK) ali@99: increment_matching(counters,c,TRUE); ali@103: if (c==CHAR_CLOSE_SBRACK) ali@103: { ali@103: if (!matching_count(counters,COUNTER_ILLUSTRATION,FALSE) && ali@103: !matching_difference(counters,c) && !*snext) ali@103: increment_matching(counters,COUNTER_ILLUSTRATION,FALSE); ali@103: else ali@103: increment_matching(counters,c,FALSE); ali@103: } ali@103: else if (c==CHAR_CLOSE_CBRACK || c==CHAR_CLOSE_RBRACK) ali@99: increment_matching(counters,c,FALSE); ali@70: sprev=s; ali@70: s=snext; ali@43: } ali@43: return isemptyline; ali@43: } ali@43: ali@41: /* ali@67: * check_for_control_characters: ali@67: * ali@67: * Check for invalid or questionable characters in the line ali@67: * Anything above 127 is 
invalid for plain ASCII, and ali@67: * non-printable control characters should also be flagged. ali@67: * Tabs should generally not be there. ali@67: */ ali@67: void check_for_control_characters(const char *aline) ali@67: { ali@70: gunichar c; ali@67: const char *s; ali@70: for (s=aline;*s;s=g_utf8_next_char(s)) ali@67: { ali@70: c=g_utf8_get_char(s); ali@67: if (cbin && !eInvalidChar && ali@185: (c127)) ali@44: { ali@44: if (pswit[ECHO_SWITCH]) ali@70: g_print("\n%s\n",aline); ali@44: if (!pswit[OVERVIEW_SWITCH]) ali@70: if (c>127 && c<160 || c>255) ali@70: g_print(" Line %ld column %ld - " ali@70: "Non-ISO-8859 character %u\n", ali@70: linecnt,g_utf8_pointer_to_offset(aline,s)+1,c); ali@44: else ali@70: g_print(" Line %ld column %ld - " ali@70: "Non-ASCII character %u\n", ali@70: linecnt,g_utf8_pointer_to_offset(aline,s)+1,c); ali@44: else ali@44: cnt_bin++; ali@185: eInvalidChar=TRUE; ali@185: } ali@185: if (!eInvalidChar && charset) ali@185: { ali@185: if (charset_validator==(GIConv)-1) ali@185: { ali@185: if (!g_unichar_isdefined(c)) ali@185: { ali@185: if (pswit[ECHO_SWITCH]) ali@185: g_print("\n%s\n",aline); ali@185: if (!pswit[OVERVIEW_SWITCH]) ali@185: g_print(" Line %ld column %ld - Unassigned UNICODE " ali@185: "code point U+%04" G_GINT32_MODIFIER "X\n", ali@185: linecnt,g_utf8_pointer_to_offset(aline,s)+1,c); ali@185: else ali@185: cnt_bin++; ali@185: eInvalidChar=TRUE; ali@185: } ali@185: else if (c>=0xE000 && c<=0xF8FF || c>=0xF0000 && c<=0xFFFFD || ali@185: c>=100000 && c<=0x10FFFD) ali@185: { ali@185: if (pswit[ECHO_SWITCH]) ali@185: g_print("\n%s\n",aline); ali@185: if (!pswit[OVERVIEW_SWITCH]) ali@185: g_print(" Line %ld column %ld - Private Use " ali@185: "character U+%04" G_GINT32_MODIFIER "X\n", ali@185: linecnt,g_utf8_pointer_to_offset(aline,s)+1,c); ali@185: else ali@185: cnt_bin++; ali@185: eInvalidChar=TRUE; ali@185: } ali@185: } ali@185: else ali@185: { ali@185: t=g_convert_with_iconv(s,g_utf8_next_char(s)-s, ali@185: 
charset_validator,NULL,&nb,NULL); ali@185: if (t) ali@185: g_free(t); ali@185: else ali@185: { ali@185: if (pswit[ECHO_SWITCH]) ali@185: g_print("\n%s\n",aline); ali@185: if (!pswit[OVERVIEW_SWITCH]) ali@185: g_print(" Line %ld column %ld - Non-%s " ali@185: "character %u\n",linecnt, ali@185: g_utf8_pointer_to_offset(aline,s)+1,charset,c); ali@185: else ali@185: cnt_bin++; ali@185: eInvalidChar=TRUE; ali@185: } ali@185: } ali@44: } ali@70: if (!eTab && c==CHAR_TAB) ali@44: { ali@44: if (pswit[ECHO_SWITCH]) ali@70: g_print("\n%s\n",aline); ali@44: if (!pswit[OVERVIEW_SWITCH]) ali@70: g_print(" Line %ld column %ld - Tab character?\n", ali@70: linecnt,g_utf8_pointer_to_offset(aline,s)+1); ali@44: else ali@44: cnt_odd++; ali@70: eTab=TRUE; ali@44: } ali@70: if (!eTilde && c==CHAR_TILDE) ali@44: { ali@44: /* ali@44: * Often used by OCR software to indicate an ali@44: * unrecognizable character. ali@44: */ ali@44: if (pswit[ECHO_SWITCH]) ali@70: g_print("\n%s\n",aline); ali@44: if (!pswit[OVERVIEW_SWITCH]) ali@70: g_print(" Line %ld column %ld - Tilde character?\n", ali@70: linecnt,g_utf8_pointer_to_offset(aline,s)+1); ali@44: else ali@44: cnt_odd++; ali@70: eTilde=TRUE; ali@44: } ali@70: if (!eCarat && c==CHAR_CARAT) ali@44: { ali@44: if (pswit[ECHO_SWITCH]) ali@70: g_print("\n%s\n",aline); ali@44: if (!pswit[OVERVIEW_SWITCH]) ali@70: g_print(" Line %ld column %ld - Carat character?\n", ali@70: linecnt,g_utf8_pointer_to_offset(aline,s)+1); ali@44: else ali@44: cnt_odd++; ali@70: eCarat=TRUE; ali@44: } ali@70: if (!eFSlash && c==CHAR_FORESLASH && warnings->fslash) ali@44: { ali@44: if (pswit[ECHO_SWITCH]) ali@70: g_print("\n%s\n",aline); ali@44: if (!pswit[OVERVIEW_SWITCH]) ali@70: g_print(" Line %ld column %ld - Forward slash?\n", ali@70: linecnt,g_utf8_pointer_to_offset(aline,s)+1); ali@44: else ali@44: cnt_odd++; ali@70: eFSlash=TRUE; ali@44: } ali@44: /* ali@44: * Report asterisks only in paranoid mode, ali@44: * since they're often deliberate. 
ali@44: */ ali@44: if (!eAst && pswit[PARANOID_SWITCH] && warnings->ast && !isemptyline && ali@70: c==CHAR_ASTERISK) ali@44: { ali@44: if (pswit[ECHO_SWITCH]) ali@70: g_print("\n%s\n",aline); ali@44: if (!pswit[OVERVIEW_SWITCH]) ali@70: g_print(" Line %ld column %ld - Asterisk?\n", ali@70: linecnt,g_utf8_pointer_to_offset(aline,s)+1); ali@44: else ali@44: cnt_odd++; ali@70: eAst=TRUE; ali@44: } ali@44: } ali@44: } ali@44: ali@44: /* ali@45: * check_for_long_line: ali@45: * ali@45: * Check for line too long. ali@45: */ ali@45: void check_for_long_line(const char *aline) ali@45: { ali@70: if (g_utf8_strlen(aline,-1)>LONGEST_PG_LINE) ali@45: { ali@45: if (pswit[ECHO_SWITCH]) ali@70: g_print("\n%s\n",aline); ali@45: if (!pswit[OVERVIEW_SWITCH]) ali@70: g_print(" Line %ld column %ld - Long line %ld\n", ali@70: linecnt,g_utf8_strlen(aline,-1),g_utf8_strlen(aline,-1)); ali@45: else ali@45: cnt_long++; ali@45: } ali@45: } ali@45: ali@45: /* ali@45: * check_for_short_line: ali@45: * ali@45: * Check for line too short. ali@45: * ali@45: * This one is a bit trickier to implement: we don't want to ali@45: * flag the last line of a paragraph for being short, so we ali@45: * have to wait until we know that our current line is a ali@45: * "normal" line, then report the _previous_ line if it was too ali@45: * short. We also don't want to report indented lines like ali@45: * chapter heads or formatted quotations. We therefore keep ali@45: * last->len as the length of the last line examined, and ali@45: * last->blen as the length of the last but one, and try to ali@45: * suppress unnecessary warnings by checking that both were of ali@45: * "normal" length. We keep the first character of the last ali@45: * line in last->start, and if it was a space, we assume that ali@45: * the formatting is deliberate. 
I can't figure out a way to ali@45: * distinguish something like a quoted verse left-aligned or ali@45: * the header or footer of a letter from a paragraph of short ali@45: * lines - maybe if I examined the whole paragraph, and if the ali@45: * para has less than, say, 8 lines and if all lines are short, ali@45: * then just assume it's OK? Need to look at some texts to see ali@45: * how often a formula like this would get the right result. ali@45: */ ali@45: void check_for_short_line(const char *aline,const struct line_properties *last) ali@45: { ali@70: if (g_utf8_strlen(aline,-1)>1 && last->len>1 && ali@70: last->lenblen>1 && ali@70: last->blen>SHORTEST_PG_LINE && last->start!=CHAR_SPACE) ali@45: { ali@45: if (pswit[ECHO_SWITCH]) ali@70: g_print("\n%s\n",prevline); ali@45: if (!pswit[OVERVIEW_SWITCH]) ali@70: g_print(" Line %ld column %ld - Short line %ld?\n", ali@70: linecnt-1,g_utf8_strlen(prevline,-1),g_utf8_strlen(prevline,-1)); ali@45: else ali@45: cnt_short++; ali@45: } ali@45: } ali@45: ali@45: /* ali@46: * check_for_starting_punctuation: ali@46: * ali@46: * Look for punctuation other than full ellipses at start of line. ali@46: */ ali@46: void check_for_starting_punctuation(const char *aline) ali@46: { ali@70: if (*aline && g_utf8_strchr(".?!,;:",-1,g_utf8_get_char(aline)) && ali@70: !g_str_has_prefix(aline,". . .")) ali@46: { ali@46: if (pswit[ECHO_SWITCH]) ali@70: g_print("\n%s\n",aline); ali@46: if (!pswit[OVERVIEW_SWITCH]) ali@70: g_print(" Line %ld column 1 - Begins with punctuation?\n", ali@46: linecnt); ali@46: else ali@46: cnt_punct++; ali@46: } ali@46: } ali@46: ali@46: /* ali@147: * str_emdash: ali@147: * ali@147: * Find the first em-dash, return a pointer to it and set to the ali@147: * character following the dash. 
ali@147: */ ali@147: char *str_emdash(const char *s,const char **next) ali@147: { ali@147: const char *s1,*s2; ali@147: s1=strstr(s,"--"); ali@147: s2=strstr(s,"—"); ali@147: if (!s1) ali@147: { ali@147: if (s2) ali@147: *next=g_utf8_next_char(s2); ali@147: return (char *)s2; ali@147: } ali@147: else if (!s2) ali@147: { ali@147: *next=g_utf8_next_char(g_utf8_next_char(s1)); ali@147: return (char *)s1; ali@147: } ali@147: else if (s1aline && g_utf8_get_char(g_utf8_prev_char(t))==CHAR_SPACE || ali@70: g_utf8_get_char(next)==CHAR_SPACE) ali@47: { ali@47: if (pswit[ECHO_SWITCH]) ali@70: g_print("\n%s\n",aline); ali@47: if (!pswit[OVERVIEW_SWITCH]) ali@70: g_print(" Line %ld column %ld - Spaced em-dash?\n", ali@70: linecnt,g_utf8_pointer_to_offset(aline,t)+1); ali@47: else ali@47: cnt_dash++; ali@47: } ali@47: } ali@47: } ali@47: ali@47: /* ali@47: * check_for_spaced_dash: ali@47: * ali@47: * Check for spaced dashes. ali@47: */ ali@47: void check_for_spaced_dash(const char *aline) ali@47: { ali@47: const char *s; ali@47: if ((s=strstr(aline," -"))) ali@47: { ali@70: if (g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s)))!='-') ali@47: { ali@47: if (pswit[ECHO_SWITCH]) ali@70: g_print("\n%s\n",aline); ali@47: if (!pswit[OVERVIEW_SWITCH]) ali@70: g_print(" Line %ld column %ld - Spaced dash?\n", ali@70: linecnt,g_utf8_pointer_to_offset(aline,s)+1); ali@47: else ali@47: cnt_dash++; ali@47: } ali@47: } ali@47: else if ((s=strstr(aline,"- "))) ali@47: { ali@70: if (s==aline || g_utf8_get_char(g_utf8_prev_char(s))!='-') ali@47: { ali@47: if (pswit[ECHO_SWITCH]) ali@70: g_print("\n%s\n",aline); ali@47: if (!pswit[OVERVIEW_SWITCH]) ali@70: g_print(" Line %ld column %ld - Spaced dash?\n", ali@70: linecnt,g_utf8_pointer_to_offset(aline,s)+1); ali@47: else ali@47: cnt_dash++; ali@47: } ali@47: } ali@47: } ali@47: ali@47: /* ali@48: * check_for_unmarked_paragraphs: ali@48: * ali@48: * Check for unmarked paragraphs indicated by separate speakers. 
ali@48: * ali@48: * May well be false positive: ali@48: * "Bravo!" "Wonderful!" called the crowd. ali@48: * but useful all the same. ali@48: */ ali@48: void check_for_unmarked_paragraphs(const char *aline) ali@48: { ali@48: const char *s; ali@48: s=strstr(aline,"\" \""); ali@48: if (!s) ali@48: s=strstr(aline,"\" \""); ali@48: if (s) ali@48: { ali@48: if (pswit[ECHO_SWITCH]) ali@70: g_print("\n%s\n",aline); ali@48: if (!pswit[OVERVIEW_SWITCH]) ali@70: g_print(" Line %ld column %ld - " ali@70: "Query missing paragraph break?\n", ali@70: linecnt,g_utf8_pointer_to_offset(aline,s)+1); ali@48: else ali@48: cnt_punct++; ali@48: } ali@48: } ali@48: ali@48: /* ali@49: * check_for_jeebies: ali@49: * ali@49: * Check for "to he" and other easy h/b errors. ali@49: * ali@49: * This is a very inadequate effort on the h/b problem, ali@49: * but the phrase "to he" is always an error, whereas "to ali@49: * be" is quite common. ali@49: * Similarly, '"Quiet!", be said.' is a non-be error ali@49: * "to he" is _not_ always an error!: ali@49: * "Where they went to he couldn't say." ali@49: * Another false positive: ali@49: * What would "Cinderella" be without the . . . ali@49: * and another: "If he wants to he can see for himself." 
ali@49: */ ali@49: void check_for_jeebies(const char *aline) ali@49: { ali@49: const char *s; ali@49: s=strstr(aline," be could "); ali@49: if (!s) ali@49: s=strstr(aline," be would "); ali@49: if (!s) ali@49: s=strstr(aline," was be "); ali@49: if (!s) ali@49: s=strstr(aline," be is "); ali@49: if (!s) ali@49: s=strstr(aline," is be "); ali@49: if (!s) ali@49: s=strstr(aline,"\", be "); ali@49: if (!s) ali@49: s=strstr(aline,"\" be "); ali@49: if (!s) ali@49: s=strstr(aline,"\" be "); ali@49: if (!s) ali@49: s=strstr(aline," to he "); ali@49: if (s) ali@49: { ali@49: if (pswit[ECHO_SWITCH]) ali@70: g_print("\n%s\n",aline); ali@49: if (!pswit[OVERVIEW_SWITCH]) ali@70: g_print(" Line %ld column %ld - Query he/be error?\n", ali@70: linecnt,g_utf8_pointer_to_offset(aline,s)+1); ali@49: else ali@49: cnt_word++; ali@49: } ali@49: s=strstr(aline," the had "); ali@49: if (!s) ali@49: s=strstr(aline," a had "); ali@49: if (!s) ali@49: s=strstr(aline," they bad "); ali@49: if (!s) ali@49: s=strstr(aline," she bad "); ali@49: if (!s) ali@49: s=strstr(aline," he bad "); ali@49: if (!s) ali@49: s=strstr(aline," you bad "); ali@49: if (!s) ali@49: s=strstr(aline," i bad "); ali@49: if (s) ali@49: { ali@49: if (pswit[ECHO_SWITCH]) ali@70: g_print("\n%s\n",aline); ali@49: if (!pswit[OVERVIEW_SWITCH]) ali@70: g_print(" Line %ld column %ld - Query had/bad error?\n", ali@70: linecnt,g_utf8_pointer_to_offset(aline,s)+1); ali@49: else ali@49: cnt_word++; ali@49: } ali@49: s=strstr(aline,"; hut "); ali@49: if (!s) ali@49: s=strstr(aline,", hut "); ali@49: if (s) ali@49: { ali@49: if (pswit[ECHO_SWITCH]) ali@70: g_print("\n%s\n",aline); ali@49: if (!pswit[OVERVIEW_SWITCH]) ali@70: g_print(" Line %ld column %ld - Query hut/but error?\n", ali@70: linecnt,g_utf8_pointer_to_offset(aline,s)+1); ali@49: else ali@49: cnt_word++; ali@49: } ali@49: } ali@49: ali@49: /* ali@50: * check_for_mta_from: ali@50: * ali@50: * Special case - angled bracket in front of "From" placed there by an ali@50: * 
MTA when sending an e-mail. ali@50: */ ali@50: void check_for_mta_from(const char *aline) ali@50: { ali@50: const char *s; ali@50: s=strstr(aline,">From"); ali@50: if (s) ali@50: { ali@50: if (pswit[ECHO_SWITCH]) ali@70: g_print("\n%s\n",aline); ali@50: if (!pswit[OVERVIEW_SWITCH]) ali@70: g_print(" Line %ld column %ld - " ali@70: "Query angled bracket with From\n", ali@70: linecnt,g_utf8_pointer_to_offset(aline,s)+1); ali@50: else ali@50: cnt_punct++; ali@50: } ali@50: } ali@50: ali@50: /* ali@51: * check_for_orphan_character: ali@51: * ali@51: * Check for a single character line - ali@51: * often an overflow from bad wrapping. ali@51: */ ali@51: void check_for_orphan_character(const char *aline) ali@51: { ali@70: gunichar c; ali@70: c=g_utf8_get_char(aline); ali@70: if (c && !*g_utf8_next_char(aline)) ali@51: { ali@70: if (c=='I' || c=='V' || c=='X' || c=='L' || g_unichar_isdigit(c)) ali@51: ; /* Nothing - ignore numerals alone on a line. */ ali@51: else ali@51: { ali@51: if (pswit[ECHO_SWITCH]) ali@70: g_print("\n%s\n",aline); ali@51: if (!pswit[OVERVIEW_SWITCH]) ali@70: g_print(" Line %ld column 1 - Query single character line\n", ali@51: linecnt); ali@51: else ali@51: cnt_punct++; ali@51: } ali@51: } ali@51: } ali@51: ali@51: /* ali@52: * check_for_pling_scanno: ali@52: * ali@52: * Check for I" - often should be ! ali@52: */ ali@52: void check_for_pling_scanno(const char *aline) ali@52: { ali@52: const char *s; ali@52: s=strstr(aline," I\""); ali@52: if (s) ali@52: { ali@52: if (pswit[ECHO_SWITCH]) ali@70: g_print("\n%s\n",aline); ali@52: if (!pswit[OVERVIEW_SWITCH]) ali@70: g_print(" Line %ld column %ld - Query I=exclamation mark?\n", ali@70: linecnt,g_utf8_pointer_to_offset(aline,s)); ali@52: else ali@52: cnt_punct++; ali@52: } ali@52: } ali@52: ali@52: /* ali@53: * check_for_extra_period: ali@53: * ali@53: * Check for period without a capital letter. Cut-down from gutspell. ali@53: * Only works when it happens on a single line. 
ali@53: */ ali@53: void check_for_extra_period(const char *aline,const struct warnings *warnings) ali@53: { ali@99: const char *s,*t,*s1,*sprev; ali@69: int i; ali@70: gsize len; ali@69: gboolean istypo; ali@69: gchar *testword; ali@99: gunichar c,nc,pc,*decomposition; ali@53: if (pswit[PARANOID_SWITCH]) ali@53: { ali@70: for (t=aline;t=strstr(t,". ");) ali@53: { ali@69: if (t==aline) ali@53: { ali@70: t=g_utf8_next_char(t); ali@53: /* start of line punctuation is handled elsewhere */ ali@53: continue; ali@53: } ali@70: if (!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(t)))) ali@53: { ali@70: t=g_utf8_next_char(t); ali@53: continue; ali@53: } ali@53: if (warnings->isDutch) ali@53: { ali@53: /* For Frank & Jeroen -- 's Middags case */ ali@70: gunichar c2,c3,c4,c5; ali@70: c2=g_utf8_get_char(g_utf8_offset_to_pointer(t,2)); ali@70: c3=g_utf8_get_char(g_utf8_offset_to_pointer(t,3)); ali@70: c4=g_utf8_get_char(g_utf8_offset_to_pointer(t,4)); ali@70: c5=g_utf8_get_char(g_utf8_offset_to_pointer(t,5)); ali@99: if (CHAR_IS_APOSTROPHE(c2) && ali@99: g_unichar_islower(c3) && c4==CHAR_SPACE && ali@99: g_unichar_isupper(c5)) ali@53: { ali@70: t=g_utf8_next_char(t); ali@53: continue; ali@53: } ali@53: } ali@70: s1=g_utf8_next_char(g_utf8_next_char(t)); ali@70: while (*s1 && !g_unichar_isalpha(g_utf8_get_char(s1)) && ali@173: !g_unichar_isdigit(g_utf8_get_char(s1))) ali@70: s1=g_utf8_next_char(s1); ali@70: if (g_unichar_islower(g_utf8_get_char(s1))) ali@53: { ali@53: /* we have something to investigate */ ali@69: istypo=TRUE; ali@53: /* so let's go back and find out */ ali@99: nc=g_utf8_get_char(t); ali@99: s1=g_utf8_prev_char(t); ali@99: c=g_utf8_get_char(s1); ali@99: sprev=g_utf8_prev_char(s1); ali@99: pc=g_utf8_get_char(sprev); ali@99: while (s1>=aline && ali@99: (g_unichar_isalpha(c) || g_unichar_isdigit(c) || ali@99: g_unichar_isalpha(pc) && CHAR_IS_APOSTROPHE(c) && ali@99: g_unichar_isalpha(nc))) ali@99: { ali@99: nc=c; ali@99: s1=sprev; ali@99: c=pc; ali@99: 
sprev=g_utf8_prev_char(s1); ali@99: pc=g_utf8_get_char(sprev); ali@99: } ali@70: s1=g_utf8_next_char(s1); ali@69: s=strchr(s1,'.'); ali@69: if (s) ali@69: testword=g_strndup(s1,s-s1); ali@69: else ali@69: testword=g_strdup(s1); ali@53: for (i=0;*abbrev[i];i++) ali@53: if (!strcmp(testword,abbrev[i])) ali@69: istypo=FALSE; ali@70: if (g_unichar_isdigit(g_utf8_get_char(testword))) ali@69: istypo=FALSE; ali@70: if (!*g_utf8_next_char(testword)) ali@69: istypo=FALSE; ali@53: if (isroman(testword)) ali@69: istypo=FALSE; ali@53: if (istypo) ali@53: { ali@69: istypo=FALSE; ali@70: for (s=testword;*s;s=g_utf8_next_char(s)) ali@70: { ali@70: decomposition=g_unicode_canonical_decomposition( ali@70: g_utf8_get_char(s),&len); ali@70: if (g_utf8_strchr("aeiou",-1,decomposition[0])) ali@69: istypo=TRUE; ali@70: g_free(decomposition); ali@70: } ali@53: } ali@69: if (istypo && ali@69: (pswit[VERBOSE_SWITCH] || !g_tree_lookup(qperiod,testword))) ali@53: { ali@69: g_tree_insert(qperiod,g_strdup(testword), ali@69: GINT_TO_POINTER(1)); ali@69: if (pswit[ECHO_SWITCH]) ali@70: g_print("\n%s\n",aline); ali@69: if (!pswit[OVERVIEW_SWITCH]) ali@70: g_print(" Line %ld column %ld - Extra period?\n", ali@70: linecnt,g_utf8_pointer_to_offset(aline,t)+1); ali@69: else ali@69: cnt_punct++; ali@53: } ali@69: g_free(testword); ali@53: } ali@70: t=g_utf8_next_char(t); ali@53: } ali@53: } ali@53: } ali@53: ali@53: /* ali@54: * check_for_following_punctuation: ali@54: * ali@54: * Check for words usually not followed by punctuation. 
ali@54: */ ali@54: void check_for_following_punctuation(const char *aline) ali@54: { ali@54: int i; ali@54: const char *s,*wordstart; ali@70: gunichar c; ali@69: gchar *inword,*t; ali@54: if (pswit[TYPO_SWITCH]) ali@54: { ali@54: for (s=aline;*s;) ali@54: { ali@54: wordstart=s; ali@69: t=getaword(&s); ali@69: if (!*t) ali@69: { ali@69: g_free(t); ali@54: continue; ali@69: } ali@70: inword=g_utf8_strdown(t,-1); ali@69: g_free(t); ali@54: for (i=0;*nocomma[i];i++) ali@54: if (!strcmp(inword,nocomma[i])) ali@54: { ali@70: c=g_utf8_get_char(s); ali@70: if (c==',' || c==';' || c==':') ali@54: { ali@54: if (pswit[ECHO_SWITCH]) ali@70: g_print("\n%s\n",aline); ali@54: if (!pswit[OVERVIEW_SWITCH]) ali@70: g_print(" Line %ld column %ld - " ali@54: "Query punctuation after %s?\n", ali@70: linecnt,g_utf8_pointer_to_offset(aline,s)+1, ali@70: inword); ali@54: else ali@54: cnt_punct++; ali@54: } ali@54: } ali@54: for (i=0;*noperiod[i];i++) ali@54: if (!strcmp(inword,noperiod[i])) ali@54: { ali@70: c=g_utf8_get_char(s); ali@70: if (c=='.' || c=='!') ali@54: { ali@54: if (pswit[ECHO_SWITCH]) ali@70: g_print("\n%s\n",aline); ali@54: if (!pswit[OVERVIEW_SWITCH]) ali@70: g_print(" Line %ld column %ld - " ali@54: "Query punctuation after %s?\n", ali@70: linecnt,g_utf8_pointer_to_offset(aline,s)+1, ali@70: inword); ali@54: else ali@54: cnt_punct++; ali@54: } ali@54: } ali@69: g_free(inword); ali@54: } ali@54: } ali@54: } ali@54: ali@54: /* ali@55: * check_for_typos: ali@55: * ali@55: * Check for commonly mistyped words, ali@55: * and digits like 0 for O in a word. 
ali@55: */ ali@55: void check_for_typos(const char *aline,struct warnings *warnings) ali@55: { ali@70: const char *s,*t,*nt,*wordstart; ali@70: gchar *inword; ali@70: gunichar *decomposition; ali@70: gchar *testword; ali@70: int i,vowel,consonant,*dupcnt; ali@70: gboolean isdup,istypo,alower; ali@99: gunichar c,pc; ali@70: long offset,len; ali@70: gsize decomposition_len; ali@55: for (s=aline;*s;) ali@55: { ali@55: wordstart=s; ali@69: inword=getaword(&s); ali@55: if (!*inword) ali@69: { ali@69: g_free(inword); ali@55: continue; /* don't bother with empty lines */ ali@69: } ali@55: if (mixdigit(inword)) ali@55: { ali@55: if (pswit[ECHO_SWITCH]) ali@70: g_print("\n%s\n",aline); ali@55: if (!pswit[OVERVIEW_SWITCH]) ali@70: g_print(" Line %ld column %ld - Query digit in %s\n", ali@70: linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1,inword); ali@55: else ali@55: cnt_word++; ali@55: } ali@55: /* ali@55: * Put the word through a series of tests for likely typos and OCR ali@55: * errors. ali@55: */ ali@69: if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH]) ali@55: { ali@69: istypo=FALSE; ali@70: alower=FALSE; ali@70: for (t=inword;*t;t=g_utf8_next_char(t)) ali@55: { ali@70: c=g_utf8_get_char(t); ali@70: nt=g_utf8_next_char(t); ali@55: /* lowercase for testing */ ali@70: if (g_unichar_islower(c)) ali@70: alower=TRUE; ali@70: if (alower && (g_unichar_isupper(c) || g_unichar_istitle(c))) ali@55: { ali@55: /* ali@55: * We have an uppercase mid-word. However, there are ali@55: * common cases: ali@55: * Mac and Mc like McGill ali@55: * French contractions like l'Abbe ali@55: */ ali@70: offset=g_utf8_pointer_to_offset(inword,t); ali@99: if (offset>0) ali@99: pc=g_utf8_get_char(g_utf8_prev_char(t)); ali@99: else ali@99: pc='\0'; ali@70: if (offset==2 && c=='m' && g_utf8_get_char(nt)=='c' || ali@70: offset==3 && c=='m' && g_utf8_get_char(nt)=='a' && ali@70: g_utf8_get_char(g_utf8_next_char(nt))=='c' || ali@99: CHAR_IS_APOSTROPHE(pc)) ali@55: ; /* do nothing! 
*/ ali@55: else ali@69: istypo=TRUE; ali@55: } ali@55: } ali@70: testword=g_utf8_casefold(inword,-1); ali@69: } ali@69: if (pswit[TYPO_SWITCH]) ali@69: { ali@55: /* ali@55: * Check for certain unlikely two-letter combinations at word ali@55: * start and end. ali@55: */ ali@70: len=g_utf8_strlen(testword,-1); ali@70: if (len>1) ali@55: { ali@55: for (i=0;*nostart[i];i++) ali@70: if (g_str_has_prefix(testword,nostart[i])) ali@69: istypo=TRUE; ali@55: for (i=0;*noend[i];i++) ali@70: if (g_str_has_suffix(testword,noend[i])) ali@69: istypo=TRUE; ali@55: } ali@55: /* ght is common, gbt never. Like that. */ ali@55: if (strstr(testword,"cb")) ali@69: istypo=TRUE; ali@55: if (strstr(testword,"gbt")) ali@69: istypo=TRUE; ali@55: if (strstr(testword,"pbt")) ali@69: istypo=TRUE; ali@55: if (strstr(testword,"tbs")) ali@69: istypo=TRUE; ali@55: if (strstr(testword,"mrn")) ali@69: istypo=TRUE; ali@55: if (strstr(testword,"ahle")) ali@69: istypo=TRUE; ali@55: if (strstr(testword,"ihle")) ali@69: istypo=TRUE; ali@55: /* ali@55: * "TBE" does happen - like HEARTBEAT - but uncommon. ali@55: * Also "TBI" - frostbite, outbid - but uncommon. ali@55: * Similarly "ii" like Hawaii, or Pompeii, and in Roman ali@55: * numerals, but "ii" is a common scanno. ali@55: */ ali@55: if (strstr(testword,"tbi")) ali@69: istypo=TRUE; ali@55: if (strstr(testword,"tbe")) ali@69: istypo=TRUE; ali@55: if (strstr(testword,"ii")) ali@69: istypo=TRUE; ali@55: /* ali@55: * Check for no vowels or no consonants. ali@55: * If none, flag a typo. ali@55: */ ali@70: if (!istypo && len>1) ali@55: { ali@55: vowel=consonant=0; ali@70: for (t=testword;*t;t=g_utf8_next_char(t)) ali@55: { ali@70: c=g_utf8_get_char(t); ali@70: decomposition= ali@70: g_unicode_canonical_decomposition(c,&decomposition_len); ali@70: if (c=='y' || g_unichar_isdigit(c)) ali@55: { ali@55: /* Yah, this is loose. 
*/ ali@55: vowel++; ali@55: consonant++; ali@55: } ali@70: else if (g_utf8_strchr("aeiou",-1,decomposition[0])) ali@55: vowel++; ali@55: else ali@55: consonant++; ali@70: g_free(decomposition); ali@55: } ali@55: if (!vowel || !consonant) ali@69: istypo=TRUE; ali@55: } ali@55: /* ali@55: * Now exclude the word from being reported if it's in ali@55: * the okword list. ali@55: */ ali@55: for (i=0;*okword[i];i++) ali@55: if (!strcmp(testword,okword[i])) ali@69: istypo=FALSE; ali@55: /* ali@55: * What looks like a typo may be a Roman numeral. ali@55: * Exclude these. ali@55: */ ali@55: if (istypo && isroman(testword)) ali@69: istypo=FALSE; ali@55: /* Check the manual list of typos. */ ali@55: if (!istypo) ali@55: for (i=0;*typo[i];i++) ali@55: if (!strcmp(testword,typo[i])) ali@69: istypo=TRUE; ali@55: /* ali@55: * Check lowercase s, l, i and m - special cases. ali@55: * "j" - often a semi-colon gone wrong. ali@55: * "d" for a missing apostrophe - he d ali@55: * "n" for "in" ali@55: */ ali@70: if (!istypo && len==1 && ali@70: g_utf8_strchr("slmijdn",-1,g_utf8_get_char(inword))) ali@69: istypo=TRUE; ali@55: if (istypo) ali@55: { ali@69: dupcnt=g_tree_lookup(qword,testword); ali@69: if (dupcnt) ali@69: { ali@69: (*dupcnt)++; ali@69: isdup=!pswit[VERBOSE_SWITCH]; ali@69: } ali@69: else ali@69: { ali@69: dupcnt=g_new0(int,1); ali@69: g_tree_insert(qword,g_strdup(testword),dupcnt); ali@69: isdup=FALSE; ali@69: } ali@55: if (!isdup) ali@55: { ali@55: if (pswit[ECHO_SWITCH]) ali@70: g_print("\n%s\n",aline); ali@55: if (!pswit[OVERVIEW_SWITCH]) ali@55: { ali@70: g_print(" Line %ld column %ld - Query word %s", ali@70: linecnt,g_utf8_pointer_to_offset(aline,wordstart)+1, ali@70: inword); ali@69: if (!pswit[VERBOSE_SWITCH]) ali@70: g_print(" - not reporting duplicates"); ali@70: g_print("\n"); ali@55: } ali@55: else ali@55: cnt_word++; ali@55: } ali@55: } ali@55: } ali@55: /* check the user's list of typos */ ali@69: if (!istypo && usertypo && g_tree_lookup(usertypo,testword)) 
ali@69: { ali@69: if (pswit[ECHO_SWITCH]) ali@70: g_print("\n%s\n",aline); ali@69: if (!pswit[OVERVIEW_SWITCH]) ali@70: g_print(" Line %ld column %ld - Query possible scanno %s\n", ali@70: linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2,inword); ali@69: } ali@69: if (pswit[TYPO_SWITCH] || pswit[USERTYPO_SWITCH]) ali@69: g_free(testword); ali@55: if (pswit[PARANOID_SWITCH] && warnings->digit) ali@55: { ali@55: /* In paranoid mode, query all 0 and 1 standing alone. */ ali@55: if (!strcmp(inword,"0") || !strcmp(inword,"1")) ali@55: { ali@55: if (pswit[ECHO_SWITCH]) ali@70: g_print("\n%s\n",aline); ali@55: if (!pswit[OVERVIEW_SWITCH]) ali@70: g_print(" Line %ld column %ld - Query standalone %s\n", ali@70: linecnt,g_utf8_pointer_to_offset(aline,wordstart)+2, ali@70: inword); ali@55: else ali@55: cnt_word++; ali@55: } ali@55: } ali@69: g_free(inword); ali@55: } ali@55: } ali@55: ali@56: /* ali@56: * check_for_misspaced_punctuation: ali@56: * ali@56: * Look for added or missing spaces around punctuation and quotes. ali@56: * If there is a punctuation character like ! with no space on ali@56: * either side, suspect a missing!space. If there are spaces on ali@56: * both sides , assume a typo. If we see a double quote with no ali@56: * space or punctuation on either side of it, assume unspaced ali@56: * quotes "like"this. ali@56: */ ali@56: void check_for_misspaced_punctuation(const char *aline, ali@69: struct parities *parities,gboolean isemptyline) ali@56: { ali@69: gboolean isacro,isellipsis; ali@56: const char *s; ali@70: gunichar c,nc,pc,n2c; ali@142: int parity; ali@70: c=g_utf8_get_char(aline); ali@70: nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0; ali@70: for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s)) ali@56: { ali@70: pc=c; ali@70: c=nc; ali@70: nc=g_utf8_get_char(g_utf8_next_char(s)); ali@56: /* For each character in the line after the first. 
*/ ali@70: if (g_utf8_strchr(".?!,;:_",-1,c)) /* if it's punctuation */ ali@56: { ali@56: /* we need to suppress warnings for acronyms like M.D. */ ali@69: isacro=FALSE; ali@56: /* we need to suppress warnings for ellipsis . . . */ ali@69: isellipsis=FALSE; ali@70: /* ali@70: * If there are letters on both sides of it or ali@70: * if it's strict punctuation followed by an alpha. ali@70: */ ali@70: if (g_unichar_isalpha(nc) && (g_unichar_isalpha(pc) || ali@70: g_utf8_strchr("?!,;:",-1,c))) ali@56: { ali@70: if (c=='.') ali@56: { ali@70: if (g_utf8_pointer_to_offset(aline,s)>2 && ali@70: g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.') ali@69: isacro=TRUE; ali@70: n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s))); ali@70: if (nc && n2c=='.') ali@69: isacro=TRUE; ali@56: } ali@56: if (!isacro) ali@56: { ali@56: if (pswit[ECHO_SWITCH]) ali@70: g_print("\n%s\n",aline); ali@56: if (!pswit[OVERVIEW_SWITCH]) ali@70: g_print(" Line %ld column %ld - Missing space?\n", ali@70: linecnt,g_utf8_pointer_to_offset(aline,s)+1); ali@56: else ali@56: cnt_punct++; ali@56: } ali@56: } ali@70: if (pc==CHAR_SPACE && (nc==CHAR_SPACE || !nc)) ali@56: { ali@56: /* ali@56: * If there are spaces on both sides, ali@56: * or space before and end of line. ali@56: */ ali@70: if (c=='.') ali@56: { ali@70: if (g_utf8_pointer_to_offset(aline,s)>2 && ali@70: g_utf8_get_char(g_utf8_offset_to_pointer(s,-2))=='.') ali@69: isellipsis=TRUE; ali@70: n2c=g_utf8_get_char(g_utf8_next_char(g_utf8_next_char(s))); ali@70: if (nc && n2c=='.') ali@69: isellipsis=TRUE; ali@56: } ali@56: if (!isemptyline && !isellipsis) ali@56: { ali@56: if (pswit[ECHO_SWITCH]) ali@70: g_print("\n%s\n",aline); ali@56: if (!pswit[OVERVIEW_SWITCH]) ali@70: g_print(" Line %ld column %ld - " ali@70: "Spaced punctuation?\n",linecnt, ali@70: g_utf8_pointer_to_offset(aline,s)+1); ali@56: else ali@56: cnt_punct++; ali@56: } ali@56: } ali@56: } ali@56: } ali@56: /* Split out the characters that CANNOT be preceded by space. 
*/ ali@70: c=g_utf8_get_char(aline); ali@70: nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0; ali@70: for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s)) ali@56: { ali@70: pc=c; ali@70: c=nc; ali@70: nc=g_utf8_get_char(g_utf8_next_char(s)); ali@56: /* for each character in the line after the first */ ali@70: if (g_utf8_strchr("?!,;:",-1,c)) ali@56: { ali@56: /* if it's punctuation that _cannot_ have a space before it */ ali@70: if (pc==CHAR_SPACE && !isemptyline && nc!=CHAR_SPACE) ali@56: { ali@56: /* ali@70: * If nc DOES == space, ali@56: * it was already reported just above. ali@56: */ ali@56: if (pswit[ECHO_SWITCH]) ali@70: g_print("\n%s\n",aline); ali@56: if (!pswit[OVERVIEW_SWITCH]) ali@70: g_print(" Line %ld column %ld - Spaced punctuation?\n", ali@70: linecnt,g_utf8_pointer_to_offset(aline,s)+1); ali@56: else ali@56: cnt_punct++; ali@56: } ali@56: } ali@56: } ali@56: /* ali@56: * Special case " .X" where X is any alpha. ali@56: * This plugs a hole in the acronym code above. ali@56: * Inelegant, but maintainable. ali@56: */ ali@70: c=g_utf8_get_char(aline); ali@70: nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0; ali@70: for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s)) ali@56: { ali@70: pc=c; ali@70: c=nc; ali@70: nc=g_utf8_get_char(g_utf8_next_char(s)); ali@56: /* for each character in the line after the first */ ali@70: if (c=='.') ali@56: { ali@56: /* if it's a period */ ali@70: if (pc==CHAR_SPACE && g_unichar_isalpha(nc)) ali@56: { ali@56: /* ali@56: * If the period follows a space and ali@56: * is followed by a letter. 
ali@56: */ ali@56: if (pswit[ECHO_SWITCH]) ali@70: g_print("\n%s\n",aline); ali@56: if (!pswit[OVERVIEW_SWITCH]) ali@70: g_print(" Line %ld column %ld - Spaced punctuation?\n", ali@70: linecnt,g_utf8_pointer_to_offset(aline,s)+1); ali@56: else ali@56: cnt_punct++; ali@56: } ali@56: } ali@56: } ali@70: c=g_utf8_get_char(aline); ali@70: nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0; ali@70: for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s)) ali@56: { ali@70: pc=c; ali@70: c=nc; ali@70: nc=g_utf8_get_char(g_utf8_next_char(s)); ali@56: /* for each character in the line after the first */ ali@142: if (CHAR_IS_DQUOTE(c)) ali@56: { ali@70: if (!g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,pc) && ali@70: !g_utf8_strchr(" _-.'`,;:!/([{?}])",-1,nc) && nc || ali@70: !g_utf8_strchr(" _-([{'`",-1,pc) && g_unichar_isalpha(nc)) ali@56: { ali@56: if (pswit[ECHO_SWITCH]) ali@70: g_print("\n%s\n",aline); ali@56: if (!pswit[OVERVIEW_SWITCH]) ali@70: g_print(" Line %ld column %ld - Unspaced quotes?\n", ali@70: linecnt,g_utf8_pointer_to_offset(aline,s)+1); ali@56: else ali@56: cnt_punct++; ali@56: } ali@56: } ali@56: } ali@56: /* Check parity of quotes. 
*/ ali@70: nc=g_utf8_get_char(aline); ali@70: for (s=aline;*s;s=g_utf8_next_char(s)) ali@56: { ali@70: c=nc; ali@70: nc=g_utf8_get_char(g_utf8_next_char(s)); ali@142: if (CHAR_IS_DQUOTE(c)) ali@56: { ali@142: if (c==CHAR_DQUOTE) ali@142: { ali@142: parities->dquote=!parities->dquote; ali@142: parity=parities->dquote; ali@142: } ali@142: else if (c==CHAR_LD_QUOTE) ali@142: parity=1; ali@142: else ali@142: parity=0; ali@142: if (!parity) ali@56: { ali@56: /* parity even */ ali@173: if (!g_utf8_strchr("_-.'`‘’/,;:!?)]} ",-1,nc)) ali@56: { ali@56: if (pswit[ECHO_SWITCH]) ali@70: g_print("\n%s\n",aline); ali@56: if (!pswit[OVERVIEW_SWITCH]) ali@70: g_print(" Line %ld column %ld - " ali@70: "Wrongspaced quotes?\n", ali@70: linecnt,g_utf8_pointer_to_offset(aline,s)+1); ali@56: else ali@56: cnt_punct++; ali@56: } ali@56: } ali@56: else ali@56: { ali@56: /* parity odd */ ali@173: if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) && ali@173: !g_utf8_strchr("_-/.'`‘’([{$",-1,nc) || !nc) ali@56: { ali@56: if (pswit[ECHO_SWITCH]) ali@70: g_print("\n%s\n",aline); ali@56: if (!pswit[OVERVIEW_SWITCH]) ali@70: g_print(" Line %ld column %ld - " ali@70: "Wrongspaced quotes?\n", ali@70: linecnt,g_utf8_pointer_to_offset(aline,s)+1); ali@56: else ali@56: cnt_punct++; ali@56: } ali@56: } ali@56: } ali@56: } ali@142: c=g_utf8_get_char(aline); ali@142: if (CHAR_IS_DQUOTE(c)) ali@56: { ali@70: if (g_utf8_strchr(",;:!?)]} ",-1, ali@70: g_utf8_get_char(g_utf8_next_char(aline)))) ali@56: { ali@56: if (pswit[ECHO_SWITCH]) ali@70: g_print("\n%s\n",aline); ali@56: if (!pswit[OVERVIEW_SWITCH]) ali@70: g_print(" Line %ld column 1 - Wrongspaced quotes?\n", ali@56: linecnt); ali@56: else ali@56: cnt_punct++; ali@56: } ali@56: } ali@56: if (pswit[SQUOTE_SWITCH]) ali@56: { ali@70: nc=g_utf8_get_char(aline); ali@70: for (s=aline;*s;s=g_utf8_next_char(s)) ali@56: { ali@70: c=nc; ali@70: nc=g_utf8_get_char(g_utf8_next_char(s)); ali@99: if (CHAR_IS_SQUOTE(c) && (s==aline || s>aline && ali@70: 
!g_unichar_isalpha(g_utf8_get_char(g_utf8_prev_char(s))) || ali@70: !g_unichar_isalpha(nc))) ali@56: { ali@56: parities->squote=!parities->squote; ali@56: if (!parities->squote) ali@56: { ali@56: /* parity even */ ali@70: if (!g_utf8_strchr("_-.'`/\",;:!?)]} ",-1,nc)) ali@56: { ali@56: if (pswit[ECHO_SWITCH]) ali@70: g_print("\n%s\n",aline); ali@56: if (!pswit[OVERVIEW_SWITCH]) ali@70: g_print(" Line %ld column %ld - " ali@56: "Wrongspaced singlequotes?\n", ali@70: linecnt,g_utf8_pointer_to_offset(aline,s)+1); ali@56: else ali@56: cnt_punct++; ali@56: } ali@56: } ali@56: else ali@56: { ali@56: /* parity odd */ ali@173: if (!g_unichar_isalpha(nc) && !g_unichar_isdigit(nc) && ali@70: !g_utf8_strchr("_-/\".'`",-1,nc) || !nc) ali@56: { ali@56: if (pswit[ECHO_SWITCH]) ali@70: g_print("\n%s\n",aline); ali@56: if (!pswit[OVERVIEW_SWITCH]) ali@70: g_print(" Line %ld column %ld - " ali@56: "Wrongspaced singlequotes?\n", ali@70: linecnt,g_utf8_pointer_to_offset(aline,s)+1); ali@56: else ali@56: cnt_punct++; ali@56: } ali@56: } ali@56: } ali@56: } ali@56: } ali@56: } ali@56: ali@55: /* ali@57: * check_for_double_punctuation: ali@57: * ali@57: * Look for double punctuation like ,. or ,, ali@57: * Thanks to DW for the suggestion! ali@57: * In books with references, ".," and ".;" are common ali@57: * e.g. "etc., etc.," and vol. 1.; vol 3.; ali@57: * OTOH, from my initial tests, there are also fairly ali@57: * common errors. What to do? Make these cases paranoid? ali@57: * ".," is the most common, so warnings->dotcomma is used ali@57: * to suppress detailed reporting if it occurs often. 
ali@57: */ ali@57: void check_for_double_punctuation(const char *aline,struct warnings *warnings) ali@57: { ali@70: const char *s; ali@70: gunichar c,nc; ali@70: nc=g_utf8_get_char(aline); ali@70: for (s=aline;*s;s=g_utf8_next_char(s)) ali@57: { ali@70: c=nc; ali@70: nc=g_utf8_get_char(g_utf8_next_char(s)); ali@57: /* for each punctuation character in the line */ ali@70: if (c && nc && g_utf8_strchr(".?!,;:",-1,c) && ali@70: g_utf8_strchr(".?!,;:",-1,nc)) ali@57: { ali@57: /* followed by punctuation, it's a query, unless . . . */ ali@70: if (c==nc && (c=='.' || c=='?' || c=='!') || ali@70: !warnings->dotcomma && c=='.' && nc==',' || ali@70: warnings->isFrench && g_str_has_prefix(s,",...") || ali@70: warnings->isFrench && g_str_has_prefix(s,"...,") || ali@70: warnings->isFrench && g_str_has_prefix(s,";...") || ali@70: warnings->isFrench && g_str_has_prefix(s,"...;") || ali@70: warnings->isFrench && g_str_has_prefix(s,":...") || ali@70: warnings->isFrench && g_str_has_prefix(s,"...:") || ali@70: warnings->isFrench && g_str_has_prefix(s,"!...") || ali@70: warnings->isFrench && g_str_has_prefix(s,"...!") || ali@70: warnings->isFrench && g_str_has_prefix(s,"?...") || ali@70: warnings->isFrench && g_str_has_prefix(s,"...?")) ali@57: { ali@70: if (warnings->isFrench && g_str_has_prefix(s,",...") || ali@70: warnings->isFrench && g_str_has_prefix(s,"...,") || ali@70: warnings->isFrench && g_str_has_prefix(s,";...") || ali@70: warnings->isFrench && g_str_has_prefix(s,"...;") || ali@70: warnings->isFrench && g_str_has_prefix(s,":...") || ali@70: warnings->isFrench && g_str_has_prefix(s,"...:") || ali@70: warnings->isFrench && g_str_has_prefix(s,"!...") || ali@70: warnings->isFrench && g_str_has_prefix(s,"...!") || ali@70: warnings->isFrench && g_str_has_prefix(s,"?...") || ali@70: warnings->isFrench && g_str_has_prefix(s,"...?")) ali@70: { ali@70: s+=4; ali@70: nc=g_utf8_get_char(g_utf8_next_char(s)); ali@70: } ali@57: ; /* do nothing for .. !! and ?? 
which can be legit */ ali@57: } ali@57: else ali@57: { ali@57: if (pswit[ECHO_SWITCH]) ali@70: g_print("\n%s\n",aline); ali@57: if (!pswit[OVERVIEW_SWITCH]) ali@70: g_print(" Line %ld column %ld - Double punctuation?\n", ali@70: linecnt,g_utf8_pointer_to_offset(aline,s)+1); ali@57: else ali@57: cnt_punct++; ali@57: } ali@57: } ali@57: } ali@57: } ali@57: ali@57: /* ali@58: * check_for_spaced_quotes: ali@58: */ ali@58: void check_for_spaced_quotes(const char *aline) ali@58: { ali@99: int i; ali@58: const char *s,*t; ali@99: const gunichar single_quotes[]={CHAR_SQUOTE,CHAR_OPEN_SQUOTE,CHAR_LS_QUOTE, ali@99: CHAR_RS_QUOTE}; ali@99: GString *pattern; ali@58: s=aline; ali@58: while ((t=strstr(s," \" "))) ali@58: { ali@58: if (pswit[ECHO_SWITCH]) ali@70: g_print("\n%s\n",aline); ali@58: if (!pswit[OVERVIEW_SWITCH]) ali@70: g_print(" Line %ld column %ld - Spaced doublequote?\n", ali@70: linecnt,g_utf8_pointer_to_offset(aline,t)+1); ali@58: else ali@58: cnt_punct++; ali@70: s=g_utf8_next_char(g_utf8_next_char(t)); ali@58: } ali@99: pattern=g_string_new(NULL); ali@99: for(i=0;istr))) ali@99: { ali@99: if (pswit[ECHO_SWITCH]) ali@99: g_print("\n%s\n",aline); ali@99: if (!pswit[OVERVIEW_SWITCH]) ali@99: g_print(" Line %ld column %ld - Spaced singlequote?\n", ali@99: linecnt,g_utf8_pointer_to_offset(aline,t)+1); ali@99: else ali@99: cnt_punct++; ali@99: s=g_utf8_next_char(g_utf8_next_char(t)); ali@99: } ali@58: } ali@99: g_string_free(pattern,TRUE); ali@58: } ali@58: ali@58: /* ali@59: * check_for_miscased_genative: ali@59: * ali@59: * Check special case of 'S instead of 's at end of word. 
ali@59: */ ali@59: void check_for_miscased_genative(const char *aline) ali@59: { ali@59: const char *s; ali@70: gunichar c,nc,pc; ali@69: if (!*aline) ali@69: return; ali@70: c=g_utf8_get_char(aline); ali@70: nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0; ali@70: for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s)) ali@59: { ali@70: pc=c; ali@70: c=nc; ali@70: nc=g_utf8_get_char(g_utf8_next_char(s)); ali@99: if (CHAR_IS_APOSTROPHE(c) && nc=='S' && g_unichar_islower(pc)) ali@59: { ali@59: if (pswit[ECHO_SWITCH]) ali@70: g_print("\n%s\n",aline); ali@59: if (!pswit[OVERVIEW_SWITCH]) ali@70: g_print(" Line %ld column %ld - Capital \"S\"?\n", ali@70: linecnt,g_utf8_pointer_to_offset(aline,s)+2); ali@59: else ali@59: cnt_punct++; ali@59: } ali@59: } ali@59: } ali@59: ali@59: /* ali@60: * check_end_of_line: ali@60: * ali@60: * Now check special cases - start and end of line - ali@60: * for single and double quotes. Start is sometimes [sic] ali@60: * but better to query it anyway. ali@60: * While we're here, check for dash at end of line. 
ali@60: */ ali@60: void check_end_of_line(const char *aline,struct warnings *warnings) ali@60: { ali@70: int lbytes; ali@70: const char *s; ali@70: gunichar c1,c2; ali@70: lbytes=strlen(aline); ali@70: if (g_utf8_strlen(aline,lbytes)>1) ali@60: { ali@70: s=g_utf8_prev_char(aline+lbytes); ali@70: c1=g_utf8_get_char(s); ali@70: c2=g_utf8_get_char(g_utf8_prev_char(s)); ali@142: if ((CHAR_IS_DQUOTE(c1) || CHAR_IS_SQUOTE(c1)) && c2==CHAR_SPACE) ali@60: { ali@60: if (pswit[ECHO_SWITCH]) ali@70: g_print("\n%s\n",aline); ali@60: if (!pswit[OVERVIEW_SWITCH]) ali@70: g_print(" Line %ld column %ld - Spaced quote?\n",linecnt, ali@70: g_utf8_strlen(aline,lbytes)); ali@70: else ali@70: cnt_punct++; ali@70: } ali@70: c1=g_utf8_get_char(aline); ali@70: c2=g_utf8_get_char(g_utf8_next_char(aline)); ali@99: if (CHAR_IS_SQUOTE(c1) && c2==CHAR_SPACE) ali@70: { ali@70: if (pswit[ECHO_SWITCH]) ali@70: g_print("\n%s\n",aline); ali@70: if (!pswit[OVERVIEW_SWITCH]) ali@70: g_print(" Line %ld column 1 - Spaced quote?\n",linecnt); ali@60: else ali@60: cnt_punct++; ali@60: } ali@60: /* ali@60: * Dash at end of line may well be legit - paranoid mode only ali@60: * and don't report em-dash at line-end. ali@60: */ ali@60: if (pswit[PARANOID_SWITCH] && warnings->hyphen) ali@60: { ali@70: for (s=g_utf8_prev_char(aline+lbytes); ali@70: s>aline && g_utf8_get_char(s)<=CHAR_SPACE;s=g_utf8_prev_char(s)) ali@60: ; ali@70: if (g_utf8_get_char(s)=='-' && ali@70: g_utf8_get_char(g_utf8_prev_char(s))!='-') ali@60: { ali@60: if (pswit[ECHO_SWITCH]) ali@70: g_print("\n%s\n",aline); ali@60: if (!pswit[OVERVIEW_SWITCH]) ali@70: g_print(" Line %ld column %ld - " ali@70: "Hyphen at end of line?\n", ali@70: linecnt,g_utf8_pointer_to_offset(aline,s)); ali@60: } ali@60: } ali@60: } ali@60: } ali@60: ali@60: /* ali@61: * check_for_unspaced_bracket: ali@61: * ali@61: * Brackets are often unspaced, but shouldn't be surrounded by alpha. ali@61: * If so, suspect a scanno like "a]most". 
ali@61: */ ali@61: void check_for_unspaced_bracket(const char *aline) ali@61: { ali@70: const char *s; ali@70: gunichar c,nc,pc; ali@70: c=g_utf8_get_char(aline); ali@70: nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0; ali@70: for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s)) ali@61: { ali@70: pc=c; ali@70: c=nc; ali@70: nc=g_utf8_get_char(g_utf8_next_char(s)); ali@70: if (!nc) ali@70: break; ali@61: /* for each bracket character in the line except 1st & last */ ali@70: if (g_utf8_strchr("{[()]}",-1,c) && ali@70: g_unichar_isalpha(pc) && g_unichar_isalpha(nc)) ali@61: { ali@61: if (pswit[ECHO_SWITCH]) ali@70: g_print("\n%s\n",aline); ali@61: if (!pswit[OVERVIEW_SWITCH]) ali@70: g_print(" Line %ld column %ld - Unspaced bracket?\n", ali@70: linecnt,g_utf8_pointer_to_offset(aline,s)); ali@61: else ali@61: cnt_punct++; ali@61: } ali@61: } ali@61: } ali@61: ali@61: /* ali@62: * check_for_unpunctuated_endquote: ali@62: */ ali@62: void check_for_unpunctuated_endquote(const char *aline) ali@62: { ali@70: const char *s; ali@70: gunichar c,nc,pc; ali@142: QuoteClass qc; ali@70: c=g_utf8_get_char(aline); ali@70: nc=c?g_utf8_get_char(g_utf8_next_char(aline)):0; ali@70: for (s=g_utf8_next_char(aline);nc;s=g_utf8_next_char(s)) ali@62: { ali@70: pc=c; ali@70: c=nc; ali@142: qc=CHAR_IS_DQUOTE(c)?QUOTE_CLASS(c):INVALID_QUOTE; ali@70: nc=g_utf8_get_char(g_utf8_next_char(s)); ali@62: /* for each character in the line except 1st */ ali@147: if ((qc==CLOSING_QUOTE || qc==NEUTRAL_QUOTE) && g_unichar_isalpha(pc)) ali@62: { ali@62: if (pswit[ECHO_SWITCH]) ali@70: g_print("\n%s\n",aline); ali@62: if (!pswit[OVERVIEW_SWITCH]) ali@70: g_print(" Line %ld column %ld - " ali@70: "endquote missing punctuation?\n", ali@70: linecnt,g_utf8_pointer_to_offset(aline,s)); ali@62: else ali@62: cnt_punct++; ali@62: } ali@62: } ali@62: } ali@62: ali@62: /* ali@63: * check_for_html_tag: ali@63: * ali@63: * Check for . 
ali@63: * ali@63: * If there is a < in the line, followed at some point ali@63: * by a > then we suspect HTML. ali@63: */ ali@63: void check_for_html_tag(const char *aline) ali@63: { ali@63: const char *open,*close; ali@70: gchar *tag; ali@70: open=strchr(aline,'<'); ali@63: if (open) ali@63: { ali@70: close=strchr(g_utf8_next_char(open),'>'); ali@63: if (close) ali@63: { ali@70: if (pswit[ECHO_SWITCH]) ali@70: g_print("\n%s\n",aline); ali@70: if (!pswit[OVERVIEW_SWITCH]) ali@63: { ali@70: tag=g_strndup(open,close-open+1); ali@70: g_print(" Line %ld column %ld - HTML Tag? %s \n", ali@70: linecnt,g_utf8_pointer_to_offset(aline,open)+1,tag); ali@70: g_free(tag); ali@63: } ali@70: else ali@70: cnt_html++; ali@63: } ali@63: } ali@63: } ali@63: ali@63: /* ali@64: * check_for_html_entity: ali@64: * ali@64: * Check for &symbol; HTML. ali@64: * ali@64: * If there is a & in the line, followed at ali@64: * some point by a ; then we suspect HTML. ali@64: */ ali@64: void check_for_html_entity(const char *aline) ali@64: { ali@64: const char *s,*amp,*scolon; ali@70: gchar *entity; ali@70: amp=strchr(aline,'&'); ali@64: if (amp) ali@64: { ali@70: scolon=strchr(amp,';'); ali@64: if (scolon) ali@64: { ali@70: for (s=amp;s=scolon) ali@64: { ali@64: if (pswit[ECHO_SWITCH]) ali@70: g_print("\n%s\n",aline); ali@64: if (!pswit[OVERVIEW_SWITCH]) ali@70: { ali@70: entity=g_strndup(amp,scolon-amp+1); ali@70: g_print(" Line %ld column %d - HTML symbol? %s \n", ali@70: linecnt,(int)(amp-aline)+1,entity); ali@70: g_free(entity); ali@70: } ali@64: else ali@64: cnt_html++; ali@64: } ali@64: } ali@64: } ali@64: } ali@64: ali@65: /* ali@66: * check_for_omitted_punctuation: ali@66: * ali@66: * Check for omitted punctuation at end of paragraph by working back ali@66: * through prevline. DW. ali@66: * Need to check this only for "normal" paras. ali@66: * So what is a "normal" para? ali@66: * Not normal if one-liner (chapter headings, etc.) 
ali@66: * Not normal if doesn't contain at least one locase letter ali@66: * Not normal if starts with space ali@66: */ ali@66: void check_for_omitted_punctuation(const char *prevline, ali@66: struct line_properties *last,int start_para_line) ali@66: { ali@70: gboolean letter_on_line=FALSE; ali@66: const char *s; ali@99: gunichar c; ali@142: gboolean closing_quote; ali@70: for (s=prevline;*s;s=g_utf8_next_char(s)) ali@70: if (g_unichar_isalpha(g_utf8_get_char(s))) ali@70: { ali@70: letter_on_line=TRUE; ali@70: break; ali@70: } ali@66: /* ali@66: * This next "if" is a problem. ali@66: * If we say "start_para_line <= linecnt - 1", that includes ali@66: * one-line "paragraphs" like chapter heads. Lotsa false positives. ali@66: * If we say "start_para_line < linecnt - 1" it doesn't, but then it ali@66: * misses genuine one-line paragraphs. ali@66: */ ali@70: if (letter_on_line && last->blen>2 && start_para_lineCHAR_SPACE) ali@66: { ali@99: s=prevline+strlen(prevline); ali@99: do ali@99: { ali@99: s=g_utf8_prev_char(s); ali@99: c=g_utf8_get_char(s); ali@142: if (QUOTE_CLASS(c)==CLOSING_QUOTE || QUOTE_CLASS(c)==NEUTRAL_QUOTE) ali@142: closing_quote=TRUE; ali@142: else ali@142: closing_quote=FALSE; ali@142: } while (closing_quote && s>prevline); ali@70: for (;s>prevline;s=g_utf8_prev_char(s)) ali@66: { ali@70: if (g_unichar_isalpha(g_utf8_get_char(s))) ali@66: { ali@66: if (pswit[ECHO_SWITCH]) ali@70: g_print("\n%s\n",prevline); ali@66: if (!pswit[OVERVIEW_SWITCH]) ali@70: g_print(" Line %ld column %ld - " ali@66: "No punctuation at para end?\n", ali@70: linecnt-1,g_utf8_strlen(prevline,-1)); ali@66: else ali@66: cnt_punct++; ali@66: break; ali@66: } ali@147: if (g_utf8_strchr("—-.:!([{?}])",-1,g_utf8_get_char(s))) ali@66: break; ali@66: } ali@66: } ali@66: } ali@66: ali@69: gboolean report_duplicate_queries(gpointer key,gpointer value,gpointer data) ali@69: { ali@69: const char *word=key; ali@69: int *dupcnt=value; ali@69: if (*dupcnt) ali@70: g_print("\nNote: Queried 
word %s was duplicated %d times\n", ali@69: word,*dupcnt); ali@69: return FALSE; ali@69: } ali@69: ali@70: void print_as_windows_1252(const char *string) ali@70: { ali@70: gsize inbytes,outbytes; ali@70: gchar *buf,*bp; ali@86: static GIConv converter=(GIConv)-1; ali@70: if (!string) ali@70: { ali@70: if (converter!=(GIConv)-1) ali@70: g_iconv_close(converter); ali@70: converter=(GIConv)-1; ali@70: return; ali@70: } ali@86: if (converter==(GIConv)-1) ali@70: converter=g_iconv_open("WINDOWS-1252","UTF-8"); ali@70: if (converter!=(GIConv)-1) ali@70: { ali@70: inbytes=outbytes=strlen(string); ali@70: bp=buf=g_malloc(outbytes+1); ali@70: g_iconv(converter,(char **)&string,&inbytes,&bp,&outbytes); ali@70: *bp='\0'; ali@70: fputs(buf,stdout); ali@70: g_free(buf); ali@70: } ali@70: else ali@70: fputs(string,stdout); ali@70: } ali@70: ali@72: void print_as_utf_8(const char *string) ali@72: { ali@72: fputs(string,stdout); ali@72: } ali@72: ali@66: /* ali@41: * procfile: ali@41: * ali@41: * Process one file. 
ali@41: */ ali@69: void procfile(const char *filename) ali@41: { ali@65: const char *s; ali@69: gchar *parastart=NULL; /* first line of current para */ ali@69: gchar *etext,*aline; ali@69: gchar *etext_ptr; ali@69: GError *err=NULL; ali@41: struct first_pass_results *first_pass_results; ali@42: struct warnings *warnings; ali@43: struct counters counters={0}; ali@45: struct line_properties last={0}; ali@56: struct parities parities={0}; ali@69: struct pending pending={0}; ali@69: gboolean isemptyline; ali@68: long start_para_line=0; ali@69: gboolean isnewpara=FALSE,enddash=FALSE; ali@45: last.start=CHAR_SPACE; ali@68: linecnt=checked_linecnt=0; ali@69: etext=read_etext(filename,&err); ali@69: if (!etext) ali@41: { ali@68: if (pswit[STDOUT_SWITCH]) ali@69: fprintf(stdout,"bookloupe: %s: %s\n",filename,err->message); ali@68: else ali@69: fprintf(stderr,"bookloupe: %s: %s\n",filename,err->message); ali@41: exit(1); ali@41: } ali@70: g_print("\n\nFile: %s\n\n",filename); ali@69: first_pass_results=first_pass(etext); ali@42: warnings=report_first_pass(first_pass_results); ali@69: qword=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,g_free); ali@69: qperiod=g_tree_new_full((GCompareDataFunc)strcmp,NULL,g_free,NULL); ali@40: /* ali@40: * Here we go with the main pass. Hold onto yer hat! 
ali@40: */ ali@65: linecnt=0; ali@69: etext_ptr=etext; ali@69: while ((aline=flgets(&etext_ptr,linecnt+1))) ali@40: { ali@68: linecnt++; ali@68: if (linecnt==1) ali@69: isnewpara=TRUE; ali@70: if (pswit[DP_SWITCH] && g_str_has_prefix(aline,"-----File: ")) ali@40: continue; // skip DP page separators completely ali@68: if (linecntfirstline || ali@41: (first_pass_results->footerline>0 && ali@41: linecnt>first_pass_results->footerline)) ali@40: { ali@68: if (pswit[HEADER_SWITCH]) ali@40: { ali@70: if (g_str_has_prefix(aline,"Title:")) ali@70: g_print(" %s\n",aline); ali@70: if (g_str_has_prefix(aline,"Author:")) ali@70: g_print(" %s\n",aline); ali@70: if (g_str_has_prefix(aline,"Release Date:")) ali@70: g_print(" %s\n",aline); ali@70: if (g_str_has_prefix(aline,"Edition:")) ali@70: g_print(" %s\n\n",aline); ali@40: } ali@68: continue; /* skip through the header */ ali@40: } ali@68: checked_linecnt++; ali@65: print_pending(aline,parastart,&pending); ali@164: isemptyline=analyse_quotes(aline,&counters); ali@68: if (isnewpara && !isemptyline) ali@40: { ali@40: /* This line is the start of a new paragraph. */ ali@68: start_para_line=linecnt; ali@40: /* Capture its first line in case we want to report it later. */ ali@69: g_free(parastart); ali@69: parastart=g_strdup(aline); ali@56: memset(&parities,0,sizeof(parities)); /* restart the quote count */ ali@68: s=aline; ali@70: while (*s && !g_unichar_isalpha(g_utf8_get_char(s)) && ali@70: !g_unichar_isdigit(g_utf8_get_char(s))) ali@70: s=g_utf8_next_char(s); ali@70: if (g_unichar_islower(g_utf8_get_char(s))) ali@40: { ali@40: /* and its first letter is lowercase */ ali@68: if (pswit[ECHO_SWITCH]) ali@70: g_print("\n%s\n",aline); ali@68: if (!pswit[OVERVIEW_SWITCH]) ali@70: g_print(" Line %ld column %ld - " ali@40: "Paragraph starts with lower-case\n", ali@70: linecnt,g_utf8_pointer_to_offset(aline,s)+1); ali@68: else ali@68: cnt_punct++; ali@40: } ali@69: isnewpara=FALSE; /* Signal the end of new para processing. 
*/ ali@40: } ali@68: /* Check for an em-dash broken at line end. */ ali@70: if (enddash && g_utf8_get_char(aline)=='-') ali@40: { ali@68: if (pswit[ECHO_SWITCH]) ali@70: g_print("\n%s\n",aline); ali@68: if (!pswit[OVERVIEW_SWITCH]) ali@70: g_print(" Line %ld column 1 - Broken em-dash?\n",linecnt); ali@68: else ali@68: cnt_punct++; ali@40: } ali@69: enddash=FALSE; ali@70: for (s=g_utf8_prev_char(aline+strlen(aline)); ali@70: g_utf8_get_char(s)==' ' && s>aline;s=g_utf8_prev_char(s)) ali@40: ; ali@70: if (s>=aline && g_utf8_get_char(s)=='-') ali@69: enddash=TRUE; ali@67: check_for_control_characters(aline); ali@185: check_for_odd_characters(aline,warnings,isemptyline); ali@68: if (warnings->longline) ali@45: check_for_long_line(aline); ali@68: if (warnings->shortline) ali@45: check_for_short_line(aline,&last); ali@68: last.blen=last.len; ali@70: last.len=g_utf8_strlen(aline,-1); ali@70: last.start=g_utf8_get_char(aline); ali@46: check_for_starting_punctuation(aline); ali@68: if (warnings->dash) ali@40: { ali@47: check_for_spaced_emdash(aline); ali@47: check_for_spaced_dash(aline); ali@40: } ali@48: check_for_unmarked_paragraphs(aline); ali@49: check_for_jeebies(aline); ali@50: check_for_mta_from(aline); ali@51: check_for_orphan_character(aline); ali@52: check_for_pling_scanno(aline); ali@53: check_for_extra_period(aline,warnings); ali@54: check_for_following_punctuation(aline); ali@55: check_for_typos(aline,warnings); ali@56: check_for_misspaced_punctuation(aline,&parities,isemptyline); ali@57: check_for_double_punctuation(aline,warnings); ali@58: check_for_spaced_quotes(aline); ali@59: check_for_miscased_genative(aline); ali@60: check_end_of_line(aline,warnings); ali@61: check_for_unspaced_bracket(aline); ali@68: if (warnings->endquote) ali@62: check_for_unpunctuated_endquote(aline); ali@63: check_for_html_tag(aline); ali@64: check_for_html_entity(aline); ali@68: if (isemptyline) ali@40: { ali@65: check_for_mismatched_quotes(&counters,&pending); ali@103: 
counters_reset(&counters); ali@40: /* let the next iteration know that it's starting a new para */ ali@69: isnewpara=TRUE; ali@69: if (prevline) ali@69: check_for_omitted_punctuation(prevline,&last,start_para_line); ali@40: } ali@69: g_free(prevline); ali@69: prevline=g_strdup(aline); ali@0: } ali@103: linecnt++; ali@103: check_for_mismatched_quotes(&counters,&pending); ali@103: print_pending(NULL,parastart,&pending); ali@103: reset_pending(&pending); ali@69: if (prevline) ali@69: { ali@69: g_free(prevline); ali@69: prevline=NULL; ali@69: } ali@69: g_free(parastart); ali@69: g_free(prevline); ali@69: g_free(etext); ali@79: if (!pswit[OVERVIEW_SWITCH] && !pswit[VERBOSE_SWITCH]) ali@69: g_tree_foreach(qword,report_duplicate_queries,NULL); ali@69: g_tree_unref(qword); ali@69: g_tree_unref(qperiod); ali@99: counters_destroy(&counters); ali@70: g_set_print_handler(NULL); ali@70: print_as_windows_1252(NULL); ali@71: if (pswit[MARKUP_SWITCH]) ali@71: loseentities(NULL); ali@0: } ali@0: ali@40: /* ali@40: * flgets: ali@40: * ali@69: * Get one line from the input text, checking for ali@40: * the existence of exactly one CR/LF line-end per line. ali@40: * ali@40: * Returns: a pointer to the line. 
ali@40: */ ali@69: char *flgets(char **etext,long lcnt) ali@0: { ali@70: gunichar c; ali@69: gboolean isCR=FALSE; ali@69: char *theline=*etext; ali@70: char *eos=theline; ali@70: gchar *s; ali@70: for (;;) ali@40: { ali@70: c=g_utf8_get_char(*etext); ali@173: if (!c) ali@173: { ali@173: if (*etext==theline) ali@173: return NULL; ali@173: else if (pswit[LINE_END_SWITCH]) ali@173: { ali@173: if (pswit[ECHO_SWITCH]) ali@173: { ali@173: s=g_strndup(theline,eos-theline); ali@173: g_print("\n%s\n",s); ali@173: g_free(s); ali@173: } ali@173: if (!pswit[OVERVIEW_SWITCH]) ali@173: /* There may, or may not, have been a CR */ ali@173: g_print(" Line %ld - No LF?\n",lcnt); ali@173: else ali@173: cnt_lineend++; ali@173: } ali@173: break; ali@173: } ali@70: *etext=g_utf8_next_char(*etext); ali@40: /* either way, it's end of line */ ali@69: if (c=='\n') ali@40: { ali@68: if (isCR) ali@68: break; ali@68: else ali@40: { ali@40: /* Error - a LF without a preceding CR */ ali@68: if (pswit[LINE_END_SWITCH]) ali@40: { ali@68: if (pswit[ECHO_SWITCH]) ali@70: { ali@70: s=g_strndup(theline,eos-theline); ali@70: g_print("\n%s\n",s); ali@70: g_free(s); ali@70: } ali@68: if (!pswit[OVERVIEW_SWITCH]) ali@70: g_print(" Line %ld - No CR?\n",lcnt); ali@68: else ali@68: cnt_lineend++; ali@40: } ali@68: break; ali@40: } ali@40: } ali@69: if (c=='\r') ali@40: { ali@68: if (isCR) ali@40: { ali@40: /* Error - two successive CRs */ ali@68: if (pswit[LINE_END_SWITCH]) ali@40: { ali@68: if (pswit[ECHO_SWITCH]) ali@70: { ali@70: s=g_strndup(theline,eos-theline); ali@70: g_print("\n%s\n",s); ali@70: g_free(s); ali@70: } ali@68: if (!pswit[OVERVIEW_SWITCH]) ali@70: g_print(" Line %ld - Two successive CRs?\n",lcnt); ali@68: else ali@68: cnt_lineend++; ali@40: } ali@40: } ali@69: isCR=TRUE; ali@40: } ali@68: else ali@40: { ali@68: if (pswit[LINE_END_SWITCH] && isCR) ali@40: { ali@68: if (pswit[ECHO_SWITCH]) ali@70: { ali@70: s=g_strndup(theline,eos-theline); ali@70: g_print("\n%s\n",s); ali@70: g_free(s); 
ali@70: } ali@68: if (!pswit[OVERVIEW_SWITCH]) ali@70: g_print(" Line %ld column %ld - CR without LF?\n", ali@70: lcnt,g_utf8_pointer_to_offset(theline,eos)+1); ali@68: else ali@68: cnt_lineend++; ali@70: *eos=' '; ali@40: } ali@69: isCR=FALSE; ali@70: eos=g_utf8_next_char(eos); ali@40: } ali@69: } ali@70: *eos='\0'; ali@0: if (pswit[MARKUP_SWITCH]) ali@68: postprocess_for_HTML(theline); ali@0: if (pswit[DP_SWITCH]) ali@68: postprocess_for_DP(theline); ali@40: return theline; ali@0: } ali@0: ali@40: /* ali@40: * mixdigit: ali@40: * ali@40: * Takes a "word" as a parameter, and checks whether it ali@40: * contains a mixture of alpha and digits. Generally, this is an ali@40: * error, but may not be for cases like 4th or L5 12s. 3d. ali@40: * ali@70: * Returns: TRUE iff an is error found. ali@40: */ ali@70: gboolean mixdigit(const char *checkword) ali@0: { ali@70: gboolean wehaveadigit,wehavealetter,query; ali@70: const char *s,*nondigit; ali@70: wehaveadigit=wehavealetter=query=FALSE; ali@70: for (s=checkword;*s;s=g_utf8_next_char(s)) ali@70: if (g_unichar_isalpha(g_utf8_get_char(s))) ali@70: wehavealetter=TRUE; ali@70: else if (g_unichar_isdigit(g_utf8_get_char(s))) ali@70: wehaveadigit=TRUE; ali@40: if (wehaveadigit && wehavealetter) ali@40: { ali@40: /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." 
*/ ali@70: query=TRUE; ali@70: for (nondigit=checkword;g_unichar_isdigit(g_utf8_get_char(nondigit)); ali@70: nondigit=g_utf8_next_char(nondigit)) ali@68: ; ali@68: /* digits, ending in st, rd, nd, th of either case */ ali@70: if (!g_ascii_strcasecmp(nondigit,"st") || ali@70: !g_ascii_strcasecmp(nondigit,"rd") || ali@70: !g_ascii_strcasecmp(nondigit,"nd") || ali@70: !g_ascii_strcasecmp(nondigit,"th")) ali@70: query=FALSE; ali@70: if (!g_ascii_strcasecmp(nondigit,"sts") || ali@70: !g_ascii_strcasecmp(nondigit,"rds") || ali@70: !g_ascii_strcasecmp(nondigit,"nds") || ali@70: !g_ascii_strcasecmp(nondigit,"ths")) ali@70: query=FALSE; ali@70: if (!g_ascii_strcasecmp(nondigit,"stly") || ali@70: !g_ascii_strcasecmp(nondigit,"rdly") || ali@70: !g_ascii_strcasecmp(nondigit,"ndly") || ali@70: !g_ascii_strcasecmp(nondigit,"thly")) ali@70: query=FALSE; ali@68: /* digits, ending in l, L, s or d */ ali@70: if (!g_ascii_strcasecmp(nondigit,"l") || !strcmp(nondigit,"s") || ali@70: !strcmp(nondigit,"d")) ali@70: query=FALSE; ali@68: /* ali@40: * L at the start of a number, representing Britsh pounds, like L500. ali@70: * This is cute. We know the current word is mixed digit. If the first ali@68: * letter is L, there must be at least one digit following. If both ali@68: * digits and letters follow, we have a genuine error, else we have a ali@68: * capital L followed by digits, and we accept that as a non-error. ali@40: */ ali@70: if (g_utf8_get_char(checkword)=='L' && ali@70: !mixdigit(g_utf8_next_char(checkword))) ali@70: query=FALSE; ali@40: } ali@40: return query; ali@0: } ali@0: ali@40: /* ali@40: * getaword: ali@40: * ali@69: * Extracts the first/next "word" from the line, and returns it. ali@69: * A word is defined as one English word unit--or at least that's the aim. ali@69: * "ptr" is advanced to the position in the line where we will start ali@69: * looking for the next word. ali@40: * ali@69: * Returns: A newly-allocated string. 
ali@40: */ ali@69: gchar *getaword(const char **ptr) ali@0: { ali@70: const char *s,*t; ali@69: GString *word; ali@70: gunichar c,pc; ali@69: word=g_string_new(NULL); ali@70: for (;!g_unichar_isdigit(g_utf8_get_char(*ptr)) && ali@70: !g_unichar_isalpha(g_utf8_get_char(*ptr)) && ali@70: **ptr;*ptr=g_utf8_next_char(*ptr)) ali@174: { ali@174: /* Handle exceptions for footnote markers like [1] */ ali@174: if (g_utf8_get_char(*ptr)=='[') ali@174: { ali@174: g_string_append_c(word,'['); ali@174: s=g_utf8_next_char(*ptr); ali@174: for (;g_unichar_isdigit(g_utf8_get_char(s));s=g_utf8_next_char(s)) ali@174: g_string_append_unichar(word,g_utf8_get_char(s)); ali@174: if (g_utf8_get_char(s)==']') ali@174: { ali@174: g_string_append_c(word,']'); ali@174: *ptr=g_utf8_next_char(s); ali@174: return g_string_free(word,FALSE); ali@174: } ali@174: else ali@174: g_string_truncate(word,0); ali@174: } ali@174: } ali@40: /* ali@40: * Use a look-ahead to handle exceptions for numbers like 1,000 and 1.35. ali@40: * Especially yucky is the case of L1,000 ali@40: * This section looks for a pattern of characters including a digit ali@40: * followed by a comma or period followed by one or more digits. ali@40: * If found, it returns this whole pattern as a word; otherwise we discard ali@40: * the results and resume our normal programming. ali@40: */ ali@69: s=*ptr; ali@70: for (;g_unichar_isdigit(g_utf8_get_char(s)) || ali@70: g_unichar_isalpha(g_utf8_get_char(s)) || ali@70: g_utf8_get_char(s)==',' || g_utf8_get_char(s)=='.';s=g_utf8_next_char(s)) ali@70: g_string_append_unichar(word,g_utf8_get_char(s)); ali@82: if (word->len) ali@40: { ali@82: for (t=g_utf8_next_char(word->str);*t;t=g_utf8_next_char(t)) ali@40: { ali@82: c=g_utf8_get_char(t); ali@82: pc=g_utf8_get_char(g_utf8_prev_char(t)); ali@82: if ((c=='.' 
|| c==',') && g_unichar_isdigit(pc)) ali@82: { ali@82: *ptr=s; ali@82: return g_string_free(word,FALSE); ali@82: } ali@40: } ali@40: } ali@0: /* we didn't find a punctuated number - do the regular getword thing */ ali@69: g_string_truncate(word,0); ali@99: c=g_utf8_get_char(*ptr); ali@99: for (;g_unichar_isdigit(c) || g_unichar_isalpha(c) || CHAR_IS_APOSTROPHE(c); ali@99: *ptr=g_utf8_next_char(*ptr),c=g_utf8_get_char(*ptr)) ali@99: g_string_append_unichar(word,c); ali@69: return g_string_free(word,FALSE); ali@0: } ali@0: ali@40: /* ali@40: * isroman: ali@40: * ali@40: * Is this word a Roman Numeral? ali@40: * ali@40: * It doesn't actually validate that the number is a valid Roman Numeral--for ali@40: * example it will pass MXXXXXXXXXX as a valid Roman Numeral, but that's not ali@40: * what we're here to do. If it passes this, it LOOKS like a Roman numeral. ali@40: * Anyway, the actual Romans were pretty tolerant of bad arithmetic, or ali@40: * expressions thereof, except when it came to taxes. Allow any number of M, ali@40: * an optional D, an optional CM or CD, any number of optional Cs, an optional ali@40: * XL or an optional XC, an optional IX or IV, an optional V and any number ali@40: * of optional Is. 
ali@40: */ ali@69: gboolean isroman(const char *t) ali@0: { ali@69: const char *s; ali@40: if (!t || !*t) ali@69: return FALSE; ali@40: s=t; ali@70: while (g_utf8_get_char(t)=='m' && *t) ali@40: t++; ali@70: if (g_utf8_get_char(t)=='d') ali@40: t++; ali@70: if (g_str_has_prefix(t,"cm")) ali@40: t+=2; ali@70: if (g_str_has_prefix(t,"cd")) ali@40: t+=2; ali@70: while (g_utf8_get_char(t)=='c' && *t) ali@40: t++; ali@70: if (g_str_has_prefix(t,"xl")) ali@40: t+=2; ali@70: if (g_str_has_prefix(t,"xc")) ali@40: t+=2; ali@70: if (g_utf8_get_char(t)=='l') ali@40: t++; ali@70: while (g_utf8_get_char(t)=='x' && *t) ali@40: t++; ali@70: if (g_str_has_prefix(t,"ix")) ali@40: t+=2; ali@70: if (g_str_has_prefix(t,"iv")) ali@40: t+=2; ali@70: if (g_utf8_get_char(t)=='v') ali@40: t++; ali@70: while (g_utf8_get_char(t)=='i' && *t) ali@40: t++; ali@40: return !*t; ali@0: } ali@0: ali@40: /* ali@40: * postprocess_for_DP: ali@40: * ali@40: * Invoked with the -d switch from flgets(). ali@40: * It simply "removes" from the line a hard-coded set of common ali@40: * DP-specific tags, so that the line passed to the main routine has ali@40: * been pre-cleaned of DP markup. ali@40: */ ali@0: void postprocess_for_DP(char *theline) ali@0: { ali@40: char *s,*t; ali@0: int i; ali@0: if (!*theline) ali@68: return; ali@40: for (i=0;*DPmarkup[i];i++) ali@70: while ((s=strstr(theline,DPmarkup[i]))) ali@40: { ali@68: t=s+strlen(DPmarkup[i]); ali@70: memmove(s,t,strlen(t)+1); ali@40: } ali@0: } ali@0: ali@40: /* ali@40: * postprocess_for_HTML: ali@40: * ali@40: * Invoked with the -m switch from flgets(). ali@40: * It simply "removes" from the line a hard-coded set of common ali@40: * HTML tags and "replaces" a hard-coded set of common HTML ali@40: * entities, so that the line passed to the main routine has ali@40: * been pre-cleaned of HTML. 
 ali@40: */
/* Strip tags for as long as losemarkup() finds one to remove, then
 * replace the entities. */
ali@0: void postprocess_for_HTML(char *theline)
ali@0: {
ali@70:     while (losemarkup(theline))
ali@70:         ;
ali@71:     loseentities(theline);
ali@0: }
ali@0:
/*
 * losemarkup:
 *
 * Looks at the first '<' in the line and the first '>' after it. If the
 * text following the '<' matches (see tagcomp) one of the entries of
 * markup[], which is walked until its empty-string terminator, everything
 * from the '<' through the '>' is cut out of the line in place.
 *
 * Returns: a pointer to the place where the tag was removed, or NULL if
 * nothing was removed.
 *
 * NOTE(review): only the first '<' is ever examined, so an unrecognized
 * tag or a stray '<' earlier in the line stops any later markup from being
 * removed -- confirm whether that is intended.
 */
ali@0: char *losemarkup(char *theline)
ali@0: {
ali@40:     char *s,*t;
ali@0:     int i;
ali@70:     s=strchr(theline,'<');
ali@70:     t=s?strchr(s,'>'):NULL;
ali@40:     if (!s || !t)
ali@40:         return NULL;
ali@40:     for (i=0;*markup[i];i++)
ali@70:         if (tagcomp(g_utf8_next_char(s),markup[i]))
ali@40:         {
ali@70:             t=g_utf8_next_char(t);
ali@70:             memmove(s,t,strlen(t)+1);
ali@70:             return s;
ali@68:         }
ali@40:     /* No entry of markup[] matched: it's an unrecognized tag, so leave
           the line alone. */
ali@40:     return NULL;
ali@0: }
ali@0:
/*
 * loseentities:
 *
 * Replaces HTML entities in the line, in place. Called with a NULL line it
 * instead releases what it holds (the entity tree, if set, and the two
 * iconv converters, if open) and returns. An empty line is left untouched.
 *
 * NOTE(review): "entities" is declared without "static", unlike the two
 * converters, so it is NULL on entry to every call. As written, the
 * g_tree_destroy() in the NULL branch can never run and a new tree is
 * built on every call with a non-empty line; nothing visible here frees
 * it. It looks as if "static" was intended -- confirm.
 */
ali@71: void loseentities(char *theline)
ali@0: {
ali@0:     int i;
ali@71:     gsize nb;
ali@71:     char *amp,*scolon;
ali@71:     gchar *s,*t;
ali@71:     gunichar c;
ali@71:     GTree *entities=NULL;
ali@86:     static GIConv translit=(GIConv)-1,to_utf8=(GIConv)-1;
ali@71:     if (!theline)
ali@40:     {
ali@71:         if (entities)
ali@71:             g_tree_destroy(entities);
ali@71:         entities=NULL;
ali@86:         if (translit!=(GIConv)-1)
ali@71:             g_iconv_close(translit);
ali@71:         translit=(GIConv)-1;
ali@86:         if (to_utf8!=(GIConv)-1)
ali@71:             g_iconv_close(to_utf8);
ali@71:         to_utf8=(GIConv)-1;
ali@71:         return;
ali@71:     }
ali@71:     if (!*theline)
ali@71:         return;
ali@71:     if (!entities)
ali@71:     {
ali@71:         entities=g_tree_new((GCompareFunc)strcmp);
/*
 * NOTE(review): the listing is damaged at this point. Everything between
 * the '<' of this loop's condition and a later '>' is missing. That takes
 * in the rest of the loop, whatever opens the two converters, and the code
 * that sets "amp", "scolon" and "c". Recover it from the repository before
 * relying on, or editing, this function.
 */
ali@71:         for(i=0;i=192 && c<=255) /* An ISO-8859-1 character: write its
                                             UTF-8 form straight into the
                                             line */
ali@71:                 theline+=g_unichar_to_utf8(c,theline);
ali@71:             else
ali@71:             {
/* Otherwise encode c as UTF-8, push it through the "translit" converter
 * and then back through "to_utf8", and copy the result into the line. */
ali@71:                 s=g_malloc(6);
ali@71:                 nb=g_unichar_to_utf8(c,s);
ali@71:                 t=g_convert_with_iconv(s,nb,translit,NULL,&nb,NULL);
ali@71:                 g_free(s);
ali@71:                 s=g_convert_with_iconv(t,nb,to_utf8,NULL,&nb,NULL);
ali@71:                 g_free(t);
ali@71:                 memcpy(theline,s,nb);
ali@71:                 g_free(s);
ali@71:                 theline+=nb;
ali@71:             }
/* Close the gap: move the text that follows "scolon" up to the end of
 * the replacement. */
ali@71:             memmove(theline,g_utf8_next_char(scolon),
ali@71:               strlen(g_utf8_next_char(scolon))+1);
ali@71:         }
ali@71:         else
ali@71:             theline=g_utf8_next_char(amp);
ali@40:     }
ali@0: }
ali@0:
/*
 * tagcomp:
 *
 * Case-insensitive test of whether strin begins with basetag. Both strings
 * are case-folded first, and one leading '/' on strin is skipped, so a
 * closing tag matches like its opening tag.
 *
 * Returns: TRUE if the folded strin has the folded basetag as a prefix.
 */
ali@70: gboolean tagcomp(const char *strin,const char *basetag)
ali@0: {
ali@70:     gboolean retval;
ali@70:     gchar *s,*t;
ali@70:     if (g_utf8_get_char(strin)=='/')
ali@70:         t=g_utf8_casefold(g_utf8_next_char(strin),-1); /* ignore a slash */
ali@70:     else
ali@70:         t=g_utf8_casefold(strin,-1);
ali@70:     s=g_utf8_casefold(basetag,-1);
ali@70:     retval=g_str_has_prefix(t,s);
ali@70:     g_free(s);
ali@70:     g_free(t);
ali@70:     return retval;
ali@0: }
ali@0:
/*
 * proghelp:
 *
 * Writes the version banner, the copyright and licence notices, the option
 * help generated from "context", and a short description of the program to
 * stderr.
 *
 * NOTE(review): "wih" in the warranty notice below looks like a typo for
 * "with". It is a runtime string, so it is left untouched here.
 */
ali@69: void proghelp(GOptionContext *context)
ali@0: {
ali@69:     gchar *help;
ali@40:     fputs("Bookloupe version " PACKAGE_VERSION ".\n",stderr);
ali@40:     fputs("Copyright 2000-2005 Jim Tinsley .\n",stderr);
ali@40:     fputs("Copyright 2012- J. Ali Harlow .\n",stderr);
ali@40:     fputs("Bookloupe comes wih ABSOLUTELY NO WARRANTY. "
ali@40:       "For details, read the file COPYING.\n",stderr);
ali@40:     fputs("This is Free Software; "
ali@40:       "you may redistribute it under certain conditions (GPL);\n",stderr);
ali@40:     fputs("read the file COPYING for details.\n\n",stderr);
ali@69:     help=g_option_context_get_help(context,TRUE,NULL);
ali@69:     fputs(help,stderr);
ali@69:     g_free(help);
ali@69:     fputs("Sample usage: bookloupe warpeace.txt\n\n",stderr);
ali@40:     fputs("Bookloupe queries anything it thinks shouldn't be in a PG text; "
ali@40:       "non-ASCII\n",stderr);
ali@40:     fputs("characters like accented letters, "
ali@40:       "lines longer than 75 or shorter than 55,\n",stderr);
ali@40:     fputs("unbalanced quotes or brackets, "
ali@40:       "a variety of badly formatted punctuation, \n",stderr);
ali@40:     fputs("HTML tags, some likely typos. "
ali@40:       "It is NOT a substitute for human judgement.\n",stderr);
ali@0:     fputs("\n",stderr);
ali@0: }