1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/gutcheck/gutcheck.c Tue Jan 24 23:57:11 2012 +0000
1.3 @@ -0,0 +1,2982 @@
1.4 +/*************************************************************************/
1.5 +/* gutcheck - check for assorted weirdnesses in a PG candidate text file */
1.6 +/* */
1.7 +/* Version 0.991 */
1.8 +/* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com> */
1.9 +/* */
1.10 +/* This program is free software; you can redistribute it and/or modify */
1.11 +/* it under the terms of the GNU General Public License as published by */
1.12 +/* the Free Software Foundation; either version 2 of the License, or */
1.13 +/* (at your option) any later version. */
1.14 +/* */
1.15 +/* This program is distributed in the hope that it will be useful, */
1.16 +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */
1.17 +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
1.18 +/* GNU General Public License for more details. */
1.19 +/* */
1.20 +/* You should have received a copy of the GNU General Public License */
1.21 +/* along with this program; if not, write to the */
1.22 +/* Free Software Foundation, Inc., */
1.23 +/* 59 Temple Place, */
1.24 +/* Suite 330, */
1.25 +/* Boston, MA 02111-1307 USA */
1.26 +/* */
1.27 +/* */
1.28 +/* */
1.29 +/* Overview comments: */
1.30 +/* */
1.31 +/* If you're reading this, you're either interested in how to detect */
1.32 +/* formatting errors, or very very bored. */
1.33 +/* */
1.34 +/* Gutcheck is a homebrew formatting checker specifically for */
1.35 +/* spotting common formatting problems in a PG e-text. I typically */
1.36 +/* run it once or twice on a file I'm about to submit; it usually */
1.37 +/* finds a few formatting problems. It also usually finds lots of */
1.38 +/* queries that aren't problems at all; it _really_ doesn't like */
1.39 +/* the standard PG header, for example. It's optimized for straight */
1.40 +/* prose; poetry and non-fiction involving tables tend to trigger */
1.41 +/* false alarms. */
1.42 +/* */
1.43 +/* The code of gutcheck is not very interesting, but the experience */
1.44 +/* of what constitutes a possible error may be, and the best way to */
1.45 +/* illustrate that is by example. */
1.46 +/* */
1.47 +/* */
1.48 +/* Here are some common typos found in PG texts that gutcheck */
1.49 +/* will flag as errors: */
1.50 +/* */
1.51 +/* "Look!John , over there!" */
1.52 +/* <this is a HTML tag> */
1.53 +/* &so is this; */
1.54 +/* Margaret said: " Now you should start for school." */
1.55 +/* Margaret said: "Now you should start for school. (if end of para) */
1.56 +/* The horse is said to he worth a lot. */
1.57 +/* 0K - this'11 make you look close1y. */
1.58 +/* "If you do. you'll regret it!" */
1.59 +/* */
1.60 +/* There are some complications . The extra space left around that */
1.61 +/* period was an error . . . but that ellipsis wasn't. */
1.62 +/* */
1.63 +/* The last line of a paragraph */
1.64 +/* is usually short. */
1.65 +/* */
1.66 +/* This period is an error.But the periods in a.m. aren't. */
1.67 +/* */
1.68 +/* Checks that are do-able but not (well) implemented are: */
1.69 +/* Single-quote checking. */
1.70 +/* Despite 3 attempts at it, singlequote checking is still */
1.71 +/* crap in gutcheck. It may not be possible without analysis */
1.72 +/* of the whole paragraph. */
1.73 +/* */
1.74 +/*************************************************************************/
1.75 +
1.76 +
1.77 +#include <stdio.h>
1.78 +#include <stdlib.h>
1.79 +#include <string.h>
1.80 +#include <ctype.h>
1.81 +
1.82 +#define MAXWORDLEN 80 /* max length of one word */
1.83 +#define LINEBUFSIZE 2048 /* buffer size for an input line */
1.84 +
1.85 +#define MAX_USER_TYPOS 1000
1.86 +#define USERTYPO_FILE "gutcheck.typ"
1.87 +
1.88 +#ifndef MAX_PATH
1.89 +#define MAX_PATH 16384
1.90 +#endif
1.91 +
1.92 +char aline[LINEBUFSIZE];
1.93 +char prevline[LINEBUFSIZE];
1.94 +
1.95 + /* Common typos. */
1.96 +char *typo[] = { "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane", "nad",
1.97 + "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa", "bakc", "om",
1.98 + "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt", "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr",
1.99 + "hmi", "hse", "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd", "gerat", "goign",
1.100 + "gruop", "haev", "hda", "hearign", "seeign", "sayign", "herat", "hge", "hsa", "hsi", "hte", "htere",
1.101 + "htese", "htey", "htis", "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut", "loev",
1.102 + "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter", "omre", "onyl", "otehr", "otu", "owrk",
1.103 + "owuld", "peice", "peices", "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
1.104 + "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe", "sohw", "stnad", "stopry",
1.105 + "stoyr", "stpo", "tahn", "taht", "tath", "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge",
1.106 + "thier", "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne", "tirne", "tkae",
1.107 + "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey", "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih",
1.108 + "whihc", "whta", "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
1.109 + "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking", "wtih", "wuould", "wya", "yera",
1.110 + "yeras", "yersa", "yoiu", "youve", "ytou", "yuor",
1.111 + /* added h/b words for version 12 - removed a few with "tbe" v.25 */
1.112 + "abead", "ahle", "ahout", "ahove", "altbough", "balf", "bardly", "bas", "bave", "baving", "bebind",
1.113 + "beld", "belp", "belped", "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge", "dehates",
1.114 + "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan", "hegin", "heing",
1.115 + "helieve", "henefit", "hetter", "hetween", "heyond", "hig", "higber", "huild", "huy", "hy", "jobn", "joh",
1.116 + "meanwbile", "memher", "memhers", "numher", "numhers",
1.117 + "perbaps", "prohlem", "puhlic", "witbout",
1.118 + /* and a few more for .18 */
1.119 + "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud", "prornise", "prornised", "modem", "bo",
1.120 + "heside", "chapteb", "chaptee", "se",
1.121 + ""};
1.122 +
1.123 +char *usertypo[MAX_USER_TYPOS];
1.124 +
1.125 + /* Common abbreviations and other OK words not to query as typos. */
1.126 + /* 0.99 last-minute - removed "ms" */
1.127 +char *okword[] = {"mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm", "rd", "sh", "br",
1.128 + "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "pompeii","hawaii","hawaiian",
1.129 + "hotbed", "heartbeat", "heartbeats", "outbid", "outbids", "frostbite", "frostbitten",
1.130 + ""};
1.131 +
1.132 + /* Common abbreviations that cause otherwise unexplained periods. */
1.133 +char *abbrev[] = {"cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op", "cit",
1.134 + "deg", "min", "chap", "oz", "mme", "mlle", "mssrs",
1.135 + ""};
1.136 + /* Two-Letter combinations that rarely if ever start words, */
1.137 + /* but are common scannos or otherwise common letter */
1.138 + /* combinations. */
1.139 +char *nostart[] = { "hr", "hl", "cb", "sb", "tb", "wb", "tl",
1.140 + "tn", "rn", "lt", "tj",
1.141 + "" };
1.142 +
1.143 + /* Two-Letter combinations that rarely if ever end words */
1.144 + /* but are common scannos or otherwise common letter */
1.145 + /* combinations */
1.146 +char *noend[] = { "cb", "gb", "pb", "sb", "tb",
1.147 + "wh","fr","br","qu","tw","gl","fl","sw","gr","sl","cl",
1.148 + "iy",
1.149 + ""};
1.150 +
1.151 +char *markup[] = { "a", "b", "big", "blockquote", "body", "br", "center",
1.152 + "col", "div", "em", "font", "h1", "h2", "h3", "h4",
1.153 + "h5", "h6", "head", "hr", "html", "i", "img", "li",
1.154 + "meta", "ol", "p", "pre", "small", "span", "strong",
1.155 + "sub", "sup", "table", "td", "tfoot", "thead", "title",
1.156 + "tr", "tt", "u", "ul",
1.157 + ""};
1.158 +
1.159 +char *DPmarkup[] = { "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>",
1.160 + ""}; /* <tb> added .991 */
1.161 +
1.162 +char *nocomma[] = { "the", "it's", "their", "an", "mrs", "a", "our", "that's",
1.163 + "its", "whose", "every", "i'll", "your", "my",
1.164 + "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "rd",
1.165 + "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
1.166 + "i'm", "during", "let", "toward", "among",
1.167 + ""};
1.168 +
1.169 +
1.170 +char *noperiod[] = { "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
1.171 + "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
1.172 + "i'll", "whose", "who", "because", "when", "let", "till", "very",
1.173 + "an", "among", "those", "into", "whom", "having", "thence",
1.174 + ""};
1.175 +
1.176 +
1.177 +char vowels[] = "aeiouàáâãäæèéêëìíîïòóôõöùúûü"; /* Carlo's old suggestion, updated .991 */
1.178 +/* HTML entities: named form, numeric form, plain-text equivalent. The list ends with empty strings. */
1.179 +struct {
1.180 +    char *htmlent;
1.181 +    char *htmlnum;
1.182 +    char *textent;
1.183 +    } entities[] = { "&amp;", "&#38;", "&",
1.184 +    "&lt;", "&#60;", "<",
1.185 +    "&gt;", "&#62;", ">",
1.186 +    "&deg;", "&#176;", " degrees",
1.187 +    "&pound;", "&#163;", "L",
1.188 +    "&quot;", "&#34;", "\"", /* -- quotation mark = APL quote, */
1.189 +    "&OElig;", "&#338;", "OE", /* -- latin capital ligature OE, */
1.190 +    "&oelig;", "&#339;", "oe", /* -- latin small ligature oe, U+0153 ISOlat2 --> */
1.191 +    "&Scaron;", "&#352;", "S", /* -- latin capital letter S with caron, */
1.192 +    "&scaron;", "&#353;", "s", /* -- latin small letter s with caron, */
1.193 +    "&Yuml;", "&#376;", "Y", /* -- latin capital letter Y with diaeresis, */
1.194 +    "&circ;", "&#710;", "", /* -- modifier letter circumflex accent, */
1.195 +    "&tilde;", "&#732;", "~", /* -- small tilde, U+02DC ISOdia --> */
1.196 +    "&ensp;", "&#8194;", " ", /* -- en space, U+2002 ISOpub --> */
1.197 +    "&emsp;", "&#8195;", " ", /* -- em space, U+2003 ISOpub --> */
1.198 +    "&thinsp;", "&#8201;", " ", /* -- thin space, U+2009 ISOpub --> */
1.199 +    "&ndash;", "&#8211;", "-", /* -- en dash, U+2013 ISOpub --> */
1.200 +    "&mdash;", "&#8212;", "--", /* -- em dash, U+2014 ISOpub --> */
1.201 +    "&lsquo;", "&#8216;", "'", /* -- left single quotation mark, */
1.202 +    "&rsquo;", "&#8217;", "'", /* -- right single quotation mark, */
1.203 +    "&sbquo;", "&#8218;", "'", /* -- single low-9 quotation mark, U+201A NEW --> */
1.204 +    "&ldquo;", "&#8220;", "\"", /* -- left double quotation mark, */
1.205 +    "&rdquo;", "&#8221;", "\"", /* -- right double quotation mark, */
1.206 +    "&bdquo;", "&#8222;", "\"", /* -- double low-9 quotation mark, U+201E NEW --> */
1.207 +    "&lsaquo;", "&#8249;", "\"", /* -- single left-pointing angle quotation mark, */
1.208 +    "&rsaquo;", "&#8250;", "\"", /* -- single right-pointing angle quotation mark, */
1.209 +    "&nbsp;", "&#160;", " ", /* -- no-break space = non-breaking space, */
1.210 +    "&iexcl;", "&#161;", "!", /* -- inverted exclamation mark, U+00A1 ISOnum --> */
1.211 +    "&cent;", "&#162;", "c", /* -- cent sign, U+00A2 ISOnum --> */
1.212 +    "&pound;", "&#163;", "L", /* -- pound sign, U+00A3 ISOnum --> */
1.213 +    "&curren;", "&#164;", "$", /* -- currency sign, U+00A4 ISOnum --> */
1.214 +    "&yen;", "&#165;", "Y", /* -- yen sign = yuan sign, U+00A5 ISOnum --> */
1.215 +    "&sect;", "&#167;", "--", /* -- section sign, U+00A7 ISOnum --> */
1.216 +    "&uml;", "&#168;", " ", /* -- diaeresis = spacing diaeresis, */
1.217 +    "&copy;", "&#169;", "(C) ", /* -- copyright sign, U+00A9 ISOnum --> */
1.218 +    "&ordf;", "&#170;", " ", /* -- feminine ordinal indicator, U+00AA ISOnum --> */
1.219 +    "&laquo;", "&#171;", "\"", /* -- left-pointing double angle quotation mark */
1.220 +    "&shy;", "&#173;", "-", /* -- soft hyphen = discretionary hyphen, */
1.221 +    "&reg;", "&#174;", "(R) ", /* -- registered sign = registered trade mark sign, */
1.222 +    "&macr;", "&#175;", " ", /* -- macron = spacing macron = overline */
1.223 +    "&deg;", "&#176;", " degrees", /* -- degree sign, U+00B0 ISOnum --> */
1.224 +    "&plusmn;", "&#177;", "+-", /* -- plus-minus sign = plus-or-minus sign, */
1.225 +    "&sup2;", "&#178;", "2", /* -- superscript two = superscript digit two */
1.226 +    "&sup3;", "&#179;", "3", /* -- superscript three = superscript digit three */
1.227 +    "&acute;", "&#180;", " ", /* -- acute accent = spacing acute, */
1.228 +    "&micro;", "&#181;", "m", /* -- micro sign, U+00B5 ISOnum --> */
1.229 +    "&para;", "&#182;", "--", /* -- pilcrow sign = paragraph sign, */
1.230 +    "&cedil;", "&#184;", " ", /* -- cedilla = spacing cedilla, U+00B8 ISOdia --> */
1.231 +    "&sup1;", "&#185;", "1", /* -- superscript one = superscript digit one, */
1.232 +    "&ordm;", "&#186;", " ", /* -- masculine ordinal indicator, */
1.233 +    "&raquo;", "&#187;", "\"", /* -- right-pointing double angle quotation mark */
1.234 +    "&frac14;", "&#188;", "1/4", /* -- vulgar fraction one quarter */
1.235 +    "&frac12;", "&#189;", "1/2", /* -- vulgar fraction one half */
1.236 +    "&frac34;", "&#190;", "3/4", /* -- vulgar fraction three quarters */
1.237 +    "&iquest;", "&#191;", "?", /* -- inverted question mark */
1.238 +    "&Agrave;", "&#192;", "A", /* -- latin capital letter A with grave */
1.239 +    "&Aacute;", "&#193;", "A", /* -- latin capital letter A with acute, */
1.240 +    "&Acirc;", "&#194;", "A", /* -- latin capital letter A with circumflex, */
1.241 +    "&Atilde;", "&#195;", "A", /* -- latin capital letter A with tilde, */
1.242 +    "&Auml;", "&#196;", "A", /* -- latin capital letter A with diaeresis, */
1.243 +    "&Aring;", "&#197;", "A", /* -- latin capital letter A with ring above */
1.244 +    "&AElig;", "&#198;", "AE", /* -- latin capital letter AE */
1.245 +    "&Ccedil;", "&#199;", "C", /* -- latin capital letter C with cedilla, */
1.246 +    "&Egrave;", "&#200;", "E", /* -- latin capital letter E with grave, */
1.247 +    "&Eacute;", "&#201;", "E", /* -- latin capital letter E with acute, */
1.248 +    "&Ecirc;", "&#202;", "E", /* -- latin capital letter E with circumflex, */
1.249 +    "&Euml;", "&#203;", "E", /* -- latin capital letter E with diaeresis, */
1.250 +    "&Igrave;", "&#204;", "I", /* -- latin capital letter I with grave, */
1.251 +    "&Iacute;", "&#205;", "I", /* -- latin capital letter I with acute, */
1.252 +    "&Icirc;", "&#206;", "I", /* -- latin capital letter I with circumflex, */
1.253 +    "&Iuml;", "&#207;", "I", /* -- latin capital letter I with diaeresis, */
1.254 +    "&ETH;", "&#208;", "E", /* -- latin capital letter ETH, U+00D0 ISOlat1 --> */
1.255 +    "&Ntilde;", "&#209;", "N", /* -- latin capital letter N with tilde, */
1.256 +    "&Ograve;", "&#210;", "O", /* -- latin capital letter O with grave, */
1.257 +    "&Oacute;", "&#211;", "O", /* -- latin capital letter O with acute, */
1.258 +    "&Ocirc;", "&#212;", "O", /* -- latin capital letter O with circumflex, */
1.259 +    "&Otilde;", "&#213;", "O", /* -- latin capital letter O with tilde, */
1.260 +    "&Ouml;", "&#214;", "O", /* -- latin capital letter O with diaeresis, */
1.261 +    "&times;", "&#215;", "*", /* -- multiplication sign, U+00D7 ISOnum --> */
1.262 +    "&Oslash;", "&#216;", "O", /* -- latin capital letter O with stroke */
1.263 +    "&Ugrave;", "&#217;", "U", /* -- latin capital letter U with grave, */
1.264 +    "&Uacute;", "&#218;", "U", /* -- latin capital letter U with acute, */
1.265 +    "&Ucirc;", "&#219;", "U", /* -- latin capital letter U with circumflex, */
1.266 +    "&Uuml;", "&#220;", "U", /* -- latin capital letter U with diaeresis, */
1.267 +    "&Yacute;", "&#221;", "Y", /* -- latin capital letter Y with acute, */
1.268 +    "&THORN;", "&#222;", "TH", /* -- latin capital letter THORN, */
1.269 +    "&szlig;", "&#223;", "sz", /* -- latin small letter sharp s = ess-zed, */
1.270 +    "&agrave;", "&#224;", "a", /* -- latin small letter a with grave */
1.271 +    "&aacute;", "&#225;", "a", /* -- latin small letter a with acute, */
1.272 +    "&acirc;", "&#226;", "a", /* -- latin small letter a with circumflex, */
1.273 +    "&atilde;", "&#227;", "a", /* -- latin small letter a with tilde, */
1.274 +    "&auml;", "&#228;", "a", /* -- latin small letter a with diaeresis, */
1.275 +    "&aring;", "&#229;", "a", /* -- latin small letter a with ring above */
1.276 +    "&aelig;", "&#230;", "ae", /* -- latin small letter ae */
1.277 +    "&ccedil;", "&#231;", "c", /* -- latin small letter c with cedilla, */
1.278 +    "&egrave;", "&#232;", "e", /* -- latin small letter e with grave, */
1.279 +    "&eacute;", "&#233;", "e", /* -- latin small letter e with acute, */
1.280 +    "&ecirc;", "&#234;", "e", /* -- latin small letter e with circumflex, */
1.281 +    "&euml;", "&#235;", "e", /* -- latin small letter e with diaeresis, */
1.282 +    "&igrave;", "&#236;", "i", /* -- latin small letter i with grave, */
1.283 +    "&iacute;", "&#237;", "i", /* -- latin small letter i with acute, */
1.284 +    "&icirc;", "&#238;", "i", /* -- latin small letter i with circumflex, */
1.285 +    "&iuml;", "&#239;", "i", /* -- latin small letter i with diaeresis, */
1.286 +    "&eth;", "&#240;", "eth", /* -- latin small letter eth, U+00F0 ISOlat1 --> */
1.287 +    "&ntilde;", "&#241;", "n", /* -- latin small letter n with tilde, */
1.288 +    "&ograve;", "&#242;", "o", /* -- latin small letter o with grave, */
1.289 +    "&oacute;", "&#243;", "o", /* -- latin small letter o with acute, */
1.290 +    "&ocirc;", "&#244;", "o", /* -- latin small letter o with circumflex, */
1.291 +    "&otilde;", "&#245;", "o", /* -- latin small letter o with tilde, */
1.292 +    "&ouml;", "&#246;", "o", /* -- latin small letter o with diaeresis, */
1.293 +    "&divide;", "&#247;", "/", /* -- division sign, U+00F7 ISOnum --> */
1.294 +    "&oslash;", "&#248;", "o", /* -- latin small letter o with stroke, */
1.295 +    "&ugrave;", "&#249;", "u", /* -- latin small letter u with grave, */
1.296 +    "&uacute;", "&#250;", "u", /* -- latin small letter u with acute, */
1.297 +    "&ucirc;", "&#251;", "u", /* -- latin small letter u with circumflex, */
1.298 +    "&uuml;", "&#252;", "u", /* -- latin small letter u with diaeresis, */
1.299 +    "&yacute;", "&#253;", "y", /* -- latin small letter y with acute, */
1.300 +    "&thorn;", "&#254;", "th", /* -- latin small letter thorn, */
1.301 +    "&yuml;", "&#255;", "y", /* -- latin small letter y with diaeresis, */
1.302 +    "", "" };
1.303 +
1.304 +/* ---- list of special characters ---- */
1.305 +#define CHAR_SPACE 32
1.306 +#define CHAR_TAB 9
1.307 +#define CHAR_LF 10
1.308 +#define CHAR_CR 13
1.309 +#define CHAR_DQUOTE 34
1.310 +#define CHAR_SQUOTE 39
1.311 +#define CHAR_OPEN_SQUOTE 96
1.312 +#define CHAR_TILDE 126
1.313 +#define CHAR_ASTERISK 42
1.314 +#define CHAR_FORESLASH 47
1.315 +#define CHAR_CARAT 94
1.316 +
1.317 +#define CHAR_UNDERSCORE '_'
1.318 +#define CHAR_OPEN_CBRACK '{'
1.319 +#define CHAR_CLOSE_CBRACK '}'
1.320 +#define CHAR_OPEN_RBRACK '('
1.321 +#define CHAR_CLOSE_RBRACK ')'
1.322 +#define CHAR_OPEN_SBRACK '['
1.323 +#define CHAR_CLOSE_SBRACK ']'
1.324 +
1.325 +
1.326 +
1.327 +
1.328 +
1.329 +/* ---- longest and shortest normal PG line lengths ----*/
1.330 +#define LONGEST_PG_LINE 75
1.331 +#define WAY_TOO_LONG 80
1.332 +#define SHORTEST_PG_LINE 55
1.333 +
1.334 +#define SWITCHES "ESTPXLOYHWVMUD" /* switches:- */
1.335 + /* D - ignore DP-specific markup */
1.336 + /* E - echo queried line */
1.337 + /* S - check single quotes */
1.338 + /* T - check common typos */
1.339 + /* P - require closure of quotes on */
1.340 + /* every paragraph */
1.341 + /* X - "Trust no one" :-) Paranoid! */
1.342 + /* Queries everything */
1.343 + /* L - line end checking defaults on */
1.344 + /* -L turns it off */
1.345 + /* O - overview. Just shows counts. */
1.346 + /* Y - puts errors to stdout */
1.347 + /* instead of stderr */
1.348 + /* H - Echoes header fields */
1.349 + /* M - Ignore markup in < > */
1.350 + /* U - Use file of User-defined Typos*/
1.351 + /* W - Defaults for use on Web upload*/
1.352 + /* V - Verbose - list EVERYTHING! */
1.353 +#define SWITNO 14 /* max number of switch parms */
1.354 + /* - used for defining array-size */
1.355 +#define MINARGS 1 /* minimum no of args excl switches */
1.356 +#define MAXARGS 1 /* maximum no of args excl switches */
1.357 +
1.358 +int pswit[SWITNO]; /* program switches set by SWITCHES */
1.359 +/* Indices into pswit[]: each value must equal its letter's position in SWITCHES, because main() sets pswit[i] when SWITCHES[i] matches. */
1.360 +#define ECHO_SWITCH 0
1.361 +#define SQUOTE_SWITCH 1
1.362 +#define TYPO_SWITCH 2
1.363 +#define QPARA_SWITCH 3
1.364 +#define PARANOID_SWITCH 4
1.365 +#define LINE_END_SWITCH 5
1.366 +#define OVERVIEW_SWITCH 6
1.367 +#define STDOUT_SWITCH 7
1.368 +#define HEADER_SWITCH 8
1.369 +#define WEB_SWITCH 9
1.370 +#define VERBOSE_SWITCH 10
1.371 +#define MARKUP_SWITCH 11
1.372 +#define USERTYPO_SWITCH 12
1.373 +#define DP_SWITCH 13
1.374 +
1.375 +
1.376 +
1.377 +long cnt_dquot; /* for overview mode, count of doublequote queries */
1.378 +long cnt_squot; /* for overview mode, count of singlequote queries */
1.379 +long cnt_brack; /* for overview mode, count of brackets queries */
1.380 +long cnt_bin; /* for overview mode, count of non-ASCII queries */
1.381 +long cnt_odd; /* for overview mode, count of odd character queries */
1.382 +long cnt_long; /* for overview mode, count of long line errors */
1.383 +long cnt_short; /* for overview mode, count of short line queries */
1.384 +long cnt_punct; /* for overview mode, count of punctuation and spacing queries */
1.385 +long cnt_dash; /* for overview mode, count of dash-related queries */
1.386 +long cnt_word; /* for overview mode, count of word queries */
1.387 +long cnt_html; /* for overview mode, count of html queries */
1.388 +long cnt_lineend; /* for overview mode, count of line-end queries */
1.389 +long cnt_spacend; /* count of lines with space at end V .21 */
1.390 +long linecnt; /* count of total lines in the file */
1.391 +long checked_linecnt; /* count of lines actually gutchecked V .26 */
1.392 +
1.393 +void proghelp(void);
1.394 +void procfile(char *);
1.395 +
1.396 +#define LOW_THRESHOLD 0
1.397 +#define HIGH_THRESHOLD 1
1.398 +
1.399 +#define START 0
1.400 +#define END 1
1.401 +#define PREV 0
1.402 +#define NEXT 1
1.403 +#define FIRST_OF_PAIR 0
1.404 +#define SECOND_OF_PAIR 1
1.405 +
1.406 +#define MAX_WORDPAIR 1000
1.407 +
1.408 +char running_from[MAX_PATH];
1.409 +
1.410 +int mixdigit(char *);
1.411 +char *getaword(char *, char *);
1.412 +int matchword(char *, char *);
1.413 +char *flgets(char *, int, FILE *, long);
1.414 +void lowerit(char *);
1.415 +int gcisalpha(unsigned char);
1.416 +int gcisdigit(unsigned char);
1.417 +int gcisletter(unsigned char);
1.418 +char *gcstrchr(char *s, char c);
1.419 +void postprocess_for_HTML(char *);
1.420 +char *linehasmarkup(char *);
1.421 +char *losemarkup(char *);
1.422 +int tagcomp(char *, char *);
1.423 +char *loseentities(char *);
1.424 +int isroman(char *);
1.425 +int usertypo_count;
1.426 +void postprocess_for_DP(char *);
1.427 +
1.428 +char wrk[LINEBUFSIZE];
1.429 +
1.430 +/* This is disgustingly lazy, predefining max words & lengths, */
1.431 +/* but now I'm out of 16-bit restrictions, what's a couple of K? */
1.432 +#define MAX_QWORD 50
1.433 +#define MAX_QWORD_LENGTH 40
1.434 +char qword[MAX_QWORD][MAX_QWORD_LENGTH];
1.435 +char qperiod[MAX_QWORD][MAX_QWORD_LENGTH];
1.436 +signed int dupcnt[MAX_QWORD];
1.437 +
1.438 +/* main - parse the switches, optionally load the user typo list, check */
1.439 +/* the one named file and, in overview mode, print the query counts. */
1.440 +/* Returns 1 (after printing help) on a bad argument count, else 0. */
1.441 +int main(int argc, char **argv)
1.442 +{
1.443 +    char *argsw, *s;
1.444 +    int i, switno, invarg;
1.445 +    char usertypo_file[MAX_PATH];
1.446 +    FILE *usertypofile;
1.447 +
1.448 +    /* save the path to the executable gutcheck, if it fits */
1.449 +    if (strlen(argv[0]) < sizeof(running_from))
1.450 +        strcpy(running_from, argv[0]);
1.451 +
1.452 +    /* Find out what directory we're running from: strip the program */
1.453 +    /* name but keep the trailing separator. The bound is tested */
1.454 +    /* first so we never read or step before the start of the buffer. */
1.455 +    s = running_from + strlen(running_from);
1.456 +    while (s > running_from && s[-1] != '/' && s[-1] != '\\')
1.457 +        *--s = 0;
1.458 +
1.459 +    switno = strlen(SWITCHES);
1.460 +    for (i = switno; --i >= 0; )
1.461 +        pswit[i] = 0; /* initialise switches, element 0 included */
1.462 +
1.463 +    /* Standard loop to extract switches. */
1.464 +    /* When we come out of this loop, the arguments will be */
1.465 +    /* in argv[0] upwards and the switches used will be */
1.466 +    /* represented by their equivalent elements in pswit[] */
1.467 +    /* Letters that are not in SWITCHES are silently ignored. */
1.468 +    while (--argc > 0 && **++argv == '-')
1.469 +        for (argsw = argv[0]+1; *argsw != '\0'; argsw++)
1.470 +            for (i = switno, invarg = 1; (--i >= 0) && invarg == 1; )
1.471 +                if (toupper((unsigned char)*argsw) == SWITCHES[i]) { /* cast: toupper() needs an unsigned char value */
1.472 +                    invarg = 0;
1.473 +                    pswit[i] = 1;
1.474 +                }
1.475 +
1.476 +    pswit[PARANOID_SWITCH] ^= 1; /* Paranoid checking is turned OFF, not on, by its switch */
1.477 +
1.478 +    if (pswit[PARANOID_SWITCH]) { /* if running in paranoid mode */
1.479 +        pswit[TYPO_SWITCH] = pswit[TYPO_SWITCH] ^ 1; /* invert: typo checks are on unless -t was given */
1.480 +    } /* v.20 removed s and p switches from paranoid mode */
1.481 +
1.482 +    pswit[LINE_END_SWITCH] ^= 1; /* Line-end checking is turned OFF, not on, by its switch */
1.483 +    pswit[ECHO_SWITCH] ^= 1; /* V.21 Echoing is turned OFF, not on, by its switch */
1.484 +
1.485 +    if (pswit[OVERVIEW_SWITCH]) /* just print summary; don't echo */
1.486 +        pswit[ECHO_SWITCH] = 0;
1.487 +
1.488 +    /* Web uploads - for the moment, this is really just a placeholder */
1.489 +    /* until we decide what processing we really want to do on web uploads */
1.490 +    if (pswit[WEB_SWITCH]) { /* specific override for web uploads */
1.491 +        pswit[ECHO_SWITCH] = 1;
1.492 +        pswit[SQUOTE_SWITCH] = 0;
1.493 +        pswit[TYPO_SWITCH] = 1;
1.494 +        pswit[QPARA_SWITCH] = 0;
1.495 +        pswit[PARANOID_SWITCH] = 1;
1.496 +        pswit[LINE_END_SWITCH] = 0;
1.497 +        pswit[OVERVIEW_SWITCH] = 0;
1.498 +        pswit[STDOUT_SWITCH] = 0;
1.499 +        pswit[HEADER_SWITCH] = 1;
1.500 +        pswit[VERBOSE_SWITCH] = 0;
1.501 +        pswit[MARKUP_SWITCH] = 0;
1.502 +        pswit[USERTYPO_SWITCH] = 0;
1.503 +        pswit[DP_SWITCH] = 0;
1.504 +    }
1.505 +
1.506 +    if (argc < MINARGS || argc > MAXARGS) { /* check number of args */
1.507 +        proghelp();
1.508 +        return(1); /* exit */
1.509 +    }
1.510 +
1.511 +    /* read in the user-defined stealth scanno list */
1.512 +    if (pswit[USERTYPO_SWITCH]) { /* ... we were told we had one! */
1.513 +        if ((usertypofile = fopen(USERTYPO_FILE, "rb")) == NULL) { /* not in cwd. try gutcheck directory. */
1.514 +            if (strlen(running_from) + strlen(USERTYPO_FILE) < sizeof(usertypo_file)) { /* never overrun the path buffer */
1.515 +                strcpy(usertypo_file, running_from);
1.516 +                strcat(usertypo_file, USERTYPO_FILE);
1.517 +                usertypofile = fopen(usertypo_file, "rb");
1.518 +            }
1.519 +            if (usertypofile == NULL) { /* we ain't got no user typo file! */
1.520 +                printf(" --> I couldn't find gutcheck.typ -- proceeding without user typos.\n");
1.521 +            }
1.522 +        }
1.523 +
1.524 +        usertypo_count = 0;
1.525 +        if (usertypofile) { /* we managed to open a User Typo File! */
1.526 +            while (flgets(aline, LINEBUFSIZE-1, usertypofile, (long)usertypo_count)) {
1.527 +                if (strlen(aline) > 1) {
1.528 +                    if ((int)*aline > 33) { /* skip lines whose first byte is 33 ('!') or below */
1.529 +                        s = malloc(strlen(aline)+1);
1.530 +                        if (!s) {
1.531 +                            fprintf(stderr, "gutcheck: cannot get enough memory for user typo file!!\n");
1.532 +                            exit(1);
1.533 +                        }
1.534 +                        strcpy(s, aline);
1.535 +                        usertypo[usertypo_count] = s;
1.536 +                        usertypo_count++;
1.537 +                        if (usertypo_count >= MAX_USER_TYPOS) {
1.538 +                            printf(" --> Only %d user-defined typos allowed: ignoring the rest\n", MAX_USER_TYPOS);
1.539 +                            break;
1.540 +                        }
1.541 +                    }
1.542 +                }
1.543 +            }
1.544 +            fclose(usertypofile);
1.545 +        }
1.546 +    }
1.547 +
1.548 +
1.549 +
1.550 +    fprintf(stderr, "gutcheck: Check and report on an e-text\n");
1.551 +
1.552 +    cnt_dquot = cnt_squot = cnt_brack = cnt_bin = cnt_odd = cnt_long =
1.553 +    cnt_short = cnt_punct = cnt_dash = cnt_word = cnt_html = cnt_lineend =
1.554 +    cnt_spacend = 0;
1.555 +
1.556 +    procfile(argv[0]);
1.557 +
1.558 +    if (pswit[OVERVIEW_SWITCH]) {
1.559 +        printf(" Checked %ld lines of %ld (head+foot = %ld)\n\n",
1.560 +            checked_linecnt, linecnt, linecnt - checked_linecnt);
1.561 +        printf(" --------------- Queries found --------------\n");
1.562 +        if (cnt_long) printf(" Long lines: %5ld\n",cnt_long);
1.563 +        if (cnt_short) printf(" Short lines: %5ld\n",cnt_short);
1.564 +        if (cnt_lineend) printf(" Line-end problems: %5ld\n",cnt_lineend);
1.565 +        if (cnt_word) printf(" Common typos: %5ld\n",cnt_word);
1.566 +        if (cnt_dquot) printf(" Unmatched quotes: %5ld\n",cnt_dquot);
1.567 +        if (cnt_squot) printf(" Unmatched SingleQuotes: %5ld\n",cnt_squot);
1.568 +        if (cnt_brack) printf(" Unmatched brackets: %5ld\n",cnt_brack);
1.569 +        if (cnt_bin) printf(" Non-ASCII characters: %5ld\n",cnt_bin);
1.570 +        if (cnt_odd) printf(" Proofing characters: %5ld\n",cnt_odd);
1.571 +        if (cnt_punct) printf(" Punctuation & spacing queries: %5ld\n",cnt_punct);
1.572 +        if (cnt_dash) printf(" Non-standard dashes: %5ld\n",cnt_dash);
1.573 +        if (cnt_html) printf(" Possible HTML tags: %5ld\n",cnt_html);
1.574 +        printf("\n");
1.575 +        printf(" TOTAL QUERIES %5ld\n",
1.576 +            cnt_dquot + cnt_squot + cnt_brack + cnt_bin + cnt_odd + cnt_long +
1.577 +            cnt_short + cnt_punct + cnt_dash + cnt_word + cnt_html + cnt_lineend);
1.578 +    }
1.579 +
1.580 +    return(0);
1.581 +}
1.582 +
1.583 +
1.584 +
1.585 +/* procfile - process one file */
1.586 +
1.587 +void procfile(char *filename)
1.588 +{
1.589 +
1.590 + char *s, *t, *s1, laststart, *wordstart;
1.591 + char inword[MAXWORDLEN], testword[MAXWORDLEN];
1.592 + char parastart[81]; /* first line of current para */
1.593 + FILE *infile;
1.594 + long quot, squot, firstline, alphalen, totlen, binlen,
1.595 + shortline, longline, verylongline, spacedash, emdash,
1.596 + space_emdash, non_PG_space_emdash, PG_space_emdash,
1.597 + footerline, dotcomma, start_para_line, astline, fslashline,
1.598 + standalone_digit, hyphens, htmcount, endquote_count;
1.599 + long spline, nspline;
1.600 + signed int i, j, llen, isemptyline, isacro, isellipsis, istypo, alower,
1.601 + eNon_A, eTab, eTilde, eAst, eFSlash, eCarat;
1.602 + signed int warn_short, warn_long, warn_bin, warn_dash, warn_dotcomma,
1.603 + warn_ast, warn_fslash, warn_digit, warn_hyphen, warn_endquote;
1.604 + unsigned int lastlen, lastblen;
1.605 + signed int s_brack, c_brack, r_brack, c_unders;
1.606 + signed int open_single_quote, close_single_quote, guessquote, dquotepar, squotepar;
1.607 + signed int isnewpara, vowel, consonant;
1.608 + char dquote_err[80], squote_err[80], rbrack_err[80], sbrack_err[80], cbrack_err[80],
1.609 + unders_err[80];
1.610 + signed int qword_index, qperiod_index, isdup;
1.611 + signed int enddash;
1.612 + signed int Dutchcount, isDutch, Frenchcount, isFrench;
1.613 +
1.614 +
1.615 +
1.616 +
1.617 +
1.618 + laststart = CHAR_SPACE;
1.619 + lastlen = lastblen = 0;
1.620 + *dquote_err = *squote_err = *rbrack_err = *cbrack_err = *sbrack_err =
1.621 + *unders_err = *prevline = 0;
1.622 + linecnt = firstline = alphalen = totlen = binlen =
1.623 + shortline = longline = spacedash = emdash = checked_linecnt =
1.624 + space_emdash = non_PG_space_emdash = PG_space_emdash =
1.625 + footerline = dotcomma = start_para_line = astline = fslashline =
1.626 + standalone_digit = hyphens = htmcount = endquote_count = 0;
1.627 + quot = squot = s_brack = c_brack = r_brack = c_unders = 0;
1.628 + i = llen = isemptyline = isacro = isellipsis = istypo = 0;
1.629 + warn_short = warn_long = warn_bin = warn_dash = warn_dotcomma =
1.630 + warn_ast = warn_fslash = warn_digit = warn_endquote = 0;
1.631 + isnewpara = vowel = consonant = enddash = 0;
1.632 + spline = nspline = 0;
1.633 + qword_index = qperiod_index = isdup = 0;
1.634 + *inword = *testword = 0;
1.635 + open_single_quote = close_single_quote = guessquote = dquotepar = squotepar = 0;
1.636 + Dutchcount = isDutch = Frenchcount = isFrench = 0;
1.637 +
1.638 +
1.639 + for (j = 0; j < MAX_QWORD; j++) {
1.640 + dupcnt[j] = 0;
1.641 + for (i = 0; i < MAX_QWORD_LENGTH; i++)
1.642 + qword[i][j] = 0;
1.643 + qperiod[i][j] = 0;
1.644 + }
1.645 +
1.646 +
1.647 + if ((infile = fopen(filename, "rb")) == NULL) {
1.648 + if (pswit[STDOUT_SWITCH])
1.649 + fprintf(stdout, "gutcheck: cannot open %s\n", filename);
1.650 + else
1.651 + fprintf(stderr, "gutcheck: cannot open %s\n", filename);
1.652 + exit(1);
1.653 + }
1.654 +
1.655 + fprintf(stdout, "\n\nFile: %s\n\n", filename);
1.656 + firstline = shortline = longline = verylongline = 0;
1.657 +
1.658 +
1.659 + /*****************************************************/
1.660 + /* */
1.661 + /* Run a first pass - verify that it's a valid PG */
1.662 + /* file, decide whether to report some things that */
1.663 + /* occur many times in the text like long or short */
1.664 + /* lines, non-standard dashes, and other good stuff */
1.665 + /* I'll doubtless think of later. */
1.666 + /* */
1.667 + /*****************************************************/
1.668 +
1.669 + /*****************************************************/
1.670 + /* V.24 Sigh. Yet Another Header Change */
1.671 + /*****************************************************/
1.672 +
1.673 + while (fgets(aline, LINEBUFSIZE-1, infile)) {
1.674 + while (aline[strlen(aline)-1] == 10 || aline[strlen(aline)-1] == 13 ) aline[strlen(aline)-1] = 0;
1.675 + linecnt++;
1.676 + if (strstr(aline, "*END") && strstr(aline, "SMALL PRINT") && (strstr(aline, "PUBLIC DOMAIN") || strstr(aline, "COPYRIGHT"))) {
1.677 + if (spline)
1.678 + printf(" --> Duplicate header?\n");
1.679 + spline = linecnt + 1; /* first line of non-header text, that is */
1.680 + }
1.681 + if (!strncmp(aline, "*** START", 9) && strstr(aline, "PROJECT GUTENBERG")) {
1.682 + if (nspline)
1.683 + printf(" --> Duplicate header?\n");
1.684 + nspline = linecnt + 1; /* first line of non-header text, that is */
1.685 + }
1.686 + if (spline || nspline) {
1.687 + lowerit(aline);
1.688 + if (strstr(aline, "end") && strstr(aline, "project gutenberg")) {
1.689 + if (strstr(aline, "end") < strstr(aline, "project gutenberg")) {
1.690 + if (footerline) {
1.691 + if (!nspline) /* it's an old-form header - we can detect duplicates */
1.692 + printf(" --> Duplicate footer?\n");
1.693 + else
1.694 + ;
1.695 + }
1.696 + else {
1.697 + footerline = linecnt;
1.698 + }
1.699 + }
1.700 + }
1.701 + }
1.702 + if (spline) firstline = spline;
1.703 + if (nspline) firstline = nspline; /* override with new */
1.704 +
1.705 + if (footerline) continue; /* 0.99+ don't count the boilerplate in the footer */
1.706 +
1.707 + llen = strlen(aline);
1.708 + totlen += llen;
1.709 + for (i = 0; i < llen; i++) {
1.710 + if ((unsigned char)aline[i] > 127) binlen++;
1.711 + if (gcisalpha(aline[i])) alphalen++;
1.712 + if (i > 0)
1.713 + if (aline[i] == CHAR_DQUOTE && isalpha(aline[i-1]))
1.714 + endquote_count++;
1.715 + }
1.716 + if (strlen(aline) > 2
1.717 + && lastlen > 2 && lastlen < SHORTEST_PG_LINE
1.718 + && lastblen > 2 && lastblen > SHORTEST_PG_LINE
1.719 + && laststart != CHAR_SPACE)
1.720 + shortline++;
1.721 +
1.722 + if (*aline) /* fixed line below for 0.96 */
1.723 + if ((unsigned char)aline[strlen(aline)-1] <= CHAR_SPACE) cnt_spacend++;
1.724 +
1.725 + if (strstr(aline, ".,")) dotcomma++;
1.726 + /* 0.98 only count asterisk lines for ignoring purposes where there is */
1.727 + /* lower-case text on the line */
1.728 + if (strstr(aline, "*")) {
1.729 + for (s = aline; *s; s++)
1.730 + if (*s >='a' && *s <= 'z')
1.731 + break;
1.732 + if (*s) astline++;
1.733 + }
1.734 + if (strstr(aline, "/"))
1.735 + fslashline++;
1.736 + for (i = llen-1; i > 0 && (unsigned char)aline[i] <= CHAR_SPACE; i--);
1.737 + if (aline[i] == '-' && aline[i-1] != '-') hyphens++;
1.738 +
1.739 + if (llen > LONGEST_PG_LINE) longline++;
1.740 + if (llen > WAY_TOO_LONG) verylongline++;
1.741 +
1.742 + if (strstr(aline, "<") && strstr(aline, ">")) {
1.743 + i = (signed int) (strstr(aline, ">") - strstr(aline, "<") + 1);
1.744 + if (i > 0)
1.745 + htmcount++;
1.746 + if (strstr(aline, "<i>")) htmcount +=4; /* bonus marks! */
1.747 + }
1.748 +
1.749 + /* Check for spaced em-dashes */
1.750 + if (strstr(aline,"--")) {
1.751 + emdash++;
1.752 + if (*(strstr(aline, "--")-1) == CHAR_SPACE ||
1.753 + (*(strstr(aline, "--")+2) == CHAR_SPACE))
1.754 + space_emdash++;
1.755 + if (*(strstr(aline, "--")-1) == CHAR_SPACE &&
1.756 + (*(strstr(aline, "--")+2) == CHAR_SPACE))
1.757 + non_PG_space_emdash++; /* count of em-dashes with spaces both sides */
1.758 + if (*(strstr(aline, "--")-1) != CHAR_SPACE &&
1.759 + (*(strstr(aline, "--")+2) != CHAR_SPACE))
1.760 + PG_space_emdash++; /* count of PG-type em-dashes with no spaces */
1.761 + }
1.762 +
1.763 + for (s = aline; *s;) {
1.764 + s = getaword(s, inword);
1.765 + if (!strcmp(inword, "hij") || !strcmp(inword, "niet"))
1.766 + Dutchcount++;
1.767 + if (!strcmp(inword, "dans") || !strcmp(inword, "avec"))
1.768 + Frenchcount++;
1.769 + if (!strcmp(inword, "0") || !strcmp(inword, "1"))
1.770 + standalone_digit++;
1.771 + }
1.772 +
1.773 + /* Check for spaced dashes */
1.774 + if (strstr(aline," -"))
1.775 + if (*(strstr(aline, " -")+2) != '-')
1.776 + spacedash++;
1.777 + lastblen = lastlen;
1.778 + lastlen = strlen(aline);
1.779 + laststart = aline[0];
1.780 +
1.781 + }
1.782 + fclose(infile);
1.783 +
1.784 +
1.785 + /* now, based on this quick view, make some snap decisions */
1.786 + if (cnt_spacend > 0) {
1.787 + printf(" --> %ld lines in this file have white space at end\n", cnt_spacend);
1.788 + }
1.789 +
1.790 + warn_dotcomma = 1;
1.791 + if (dotcomma > 5) {
1.792 + warn_dotcomma = 0;
1.793 + printf(" --> %ld lines in this file contain '.,'. Not reporting them.\n", dotcomma);
1.794 + }
1.795 +
1.796 + /* if more than 50 lines, or one-tenth, are short, don't bother reporting them */
1.797 + warn_short = 1;
1.798 + if (shortline > 50 || shortline * 10 > linecnt) {
1.799 + warn_short = 0;
1.800 + printf(" --> %ld lines in this file are short. Not reporting short lines.\n", shortline);
1.801 + }
1.802 +
1.803 + /* if more than 50 lines, or one-tenth, are long, don't bother reporting them */
1.804 + warn_long = 1;
1.805 + if (longline > 50 || longline * 10 > linecnt) {
1.806 + warn_long = 0;
1.807 + printf(" --> %ld lines in this file are long. Not reporting long lines.\n", longline);
1.808 + }
1.809 +
1.810 + /* if more than 10 lines contain asterisks, don't bother reporting them V.0.97 */
1.811 + warn_ast = 1;
1.812 + if (astline > 10 ) {
1.813 + warn_ast = 0;
1.814 + printf(" --> %ld lines in this file contain asterisks. Not reporting them.\n", astline);
1.815 + }
1.816 +
1.817 + /* if more than 10 lines contain forward slashes, don't bother reporting them V.0.99 */
1.818 + warn_fslash = 1;
1.819 + if (fslashline > 10 ) {
1.820 + warn_fslash = 0;
1.821 + printf(" --> %ld lines in this file contain forward slashes. Not reporting them.\n", fslashline);
1.822 + }
1.823 +
1.824 + /* if more than 20 lines contain unpunctuated endquotes, don't bother reporting them V.0.99 */
1.825 + warn_endquote = 1;
1.826 + if (endquote_count > 20 ) {
1.827 + warn_endquote = 0;
1.828 + printf(" --> %ld lines in this file contain unpunctuated endquotes. Not reporting them.\n", endquote_count);
1.829 + }
1.830 +
1.831 + /* if more than 10 lines contain standalone digits, don't bother reporting them V.0.97 */
1.832 + warn_digit = 1;
1.833 + if (standalone_digit > 10 ) {
1.834 + warn_digit = 0;
1.835 + printf(" --> %ld lines in this file contain standalone 0s and 1s. Not reporting them.\n", standalone_digit);
1.836 + }
1.837 +
1.838 + /* if more than 20 lines contain hyphens at end, don't bother reporting them V.0.98 */
1.839 + warn_hyphen = 1;
1.840 + if (hyphens > 20 ) {
1.841 + warn_hyphen = 0;
1.842 + printf(" --> %ld lines in this file have hyphens at end. Not reporting them.\n", hyphens);
1.843 + }
1.844 +
1.845 + if (htmcount > 20 && !pswit[MARKUP_SWITCH]) {
1.846 + printf(" --> Looks like this is HTML. Switching HTML mode ON.\n");
1.847 + pswit[MARKUP_SWITCH] = 1;
1.848 + }
1.849 +
1.850 + if (verylongline > 0) {
1.851 + printf(" --> %ld lines in this file are VERY long!\n", verylongline);
1.852 + }
1.853 +
1.854 + /* If there are more non-PG spaced dashes than PG em-dashes, */
1.855 + /* assume it's deliberate */
1.856 + /* Current PG guidelines say don't use them, but older texts do,*/
1.857 + /* and some people insist on them whatever the guidelines say. */
1.858 + /* V.20 removed requirement that PG_space_emdash be greater than*/
1.859 + /* ten before turning off warnings about spaced dashes. */
1.860 + warn_dash = 1;
1.861 + if (spacedash + non_PG_space_emdash > PG_space_emdash) {
1.862 + warn_dash = 0;
1.863 + printf(" --> There are %ld spaced dashes and em-dashes. Not reporting them.\n", spacedash + non_PG_space_emdash);
1.864 + }
1.865 +
1.866 + /* if more than a quarter of characters are hi-bit, bug out */
1.867 + warn_bin = 1;
1.868 + if (binlen * 4 > totlen) {
1.869 + printf(" --> This file does not appear to be ASCII. Terminating. Best of luck with it!\n");
1.870 + exit(1);
1.871 + }
1.872 + if (alphalen * 4 < totlen) {
1.873 + printf(" --> This file does not appear to be text. Terminating. Best of luck with it!\n");
1.874 + exit(1);
1.875 + }
1.876 + if ((binlen * 100 > totlen) || (binlen > 100)) {
1.877 + printf(" --> There are a lot of foreign letters here. Not reporting them.\n");
1.878 + warn_bin = 0;
1.879 + }
1.880 +
1.881 + /* isDutch and isFrench added .991 Feb 06 for Frank, Jeroen, Renald */
1.882 + isDutch = 0;
1.883 + if (Dutchcount > 50) {
1.884 + isDutch = 1;
1.885 + printf(" --> This looks like Dutch - switching off dashes and warnings for 's Middags case.\n");
1.886 + }
1.887 +
1.888 + isFrench = 0;
1.889 + if (Frenchcount > 50) {
1.890 + isFrench = 1;
1.891 + printf(" --> This looks like French - switching off some doublepunct.\n");
1.892 + }
1.893 +
1.894 + if (firstline && footerline)
1.895 + printf(" The PG header and footer appear to be already on.\n");
1.896 + else {
1.897 + if (firstline)
1.898 + printf(" The PG header is on - no footer.\n");
1.899 + if (footerline)
1.900 + printf(" The PG footer is on - no header.\n");
1.901 + }
1.902 + printf("\n");
1.903 +
1.904 + /* V.22 George Davis asked for an override switch to force it to list everything */
1.905 + if (pswit[VERBOSE_SWITCH]) {
1.906 + warn_bin = 1;
1.907 + warn_short = 1;
1.908 + warn_dotcomma = 1;
1.909 + warn_long = 1;
1.910 + warn_dash = 1;
1.911 + warn_digit = 1;
1.912 + warn_ast = 1;
1.913 + warn_fslash = 1;
1.914 + warn_hyphen = 1;
1.915 + warn_endquote = 1;
1.916 + printf(" *** Verbose output is ON -- you asked for it! ***\n");
1.917 + }
1.918 +
1.919 + if (isDutch)
1.920 + warn_dash = 0; /* Frank suggested turning it REALLY off for Dutch */
1.921 +
1.922 + if ((infile = fopen(filename, "rb")) == NULL) {
1.923 + if (pswit[STDOUT_SWITCH])
1.924 + fprintf(stdout, "gutcheck: cannot open %s\n", filename);
1.925 + else
1.926 + fprintf(stderr, "gutcheck: cannot open %s\n", filename);
1.927 + exit(1);
1.928 + }
1.929 +
1.930 + if (footerline > 0 && firstline > 0 && footerline > firstline && footerline - firstline < 100) { /* ugh */
1.931 + printf(" --> I don't really know where this text starts. \n");
1.932 + printf(" There are no reference points.\n");
1.933 + printf(" I'm going to have to report the header and footer as well.\n");
1.934 + firstline=0;
1.935 + }
1.936 +
1.937 +
1.938 +
1.939 + /*****************************************************/
1.940 + /* */
1.941 + /* Here we go with the main pass. Hold onto yer hat! */
1.942 + /* */
1.943 + /*****************************************************/
1.944 +
1.945 + /* Re-init some variables we've dirtied */
1.946 + quot = squot = linecnt = 0;
1.947 + laststart = CHAR_SPACE;
1.948 + lastlen = lastblen = 0;
1.949 +
1.950 + while (flgets(aline, LINEBUFSIZE-1, infile, linecnt+1)) {
1.951 + linecnt++;
1.952 + if (linecnt == 1) isnewpara = 1;
1.953 + if (pswit[DP_SWITCH])
1.954 + if (!strncmp(aline, "-----File: ", 11))
1.955 + continue; // skip DP page separators completely
1.956 + if (linecnt < firstline || (footerline > 0 && linecnt > footerline)) {
1.957 + if (pswit[HEADER_SWITCH]) {
1.958 + if (!strncmp(aline, "Title:", 6))
1.959 + printf(" %s\n", aline);
1.960 + if (!strncmp (aline, "Author:", 7))
1.961 + printf(" %s\n", aline);
1.962 + if (!strncmp(aline, "Release Date:", 13))
1.963 + printf(" %s\n", aline);
1.964 + if (!strncmp(aline, "Edition:", 8))
1.965 + printf(" %s\n\n", aline);
1.966 + }
1.967 + continue; /* skip through the header */
1.968 + }
1.969 + checked_linecnt++;
1.970 + s = aline;
1.971 + isemptyline = 1; /* assume the line is empty until proven otherwise */
1.972 +
1.973 + /* If we are in a state of unbalanced quotes, and this line */
1.974 + /* doesn't begin with a quote, output the stored error message */
1.975 + /* If the -P switch was used, print the warning even if the */
1.976 + /* new para starts with quotes */
1.977 + /* Version .20 - if the new paragraph does start with a quote, */
1.978 + /* but is indented, I was giving a spurious error. Need to */
1.979 + /* check the first _non-space_ character on the line rather */
1.980 + /* than the first character when deciding whether the para */
1.981 + /* starts with a quote. Using *t for this. */
1.982 + t = s;
1.983 + while (*t == ' ') t++;
1.984 + if (*dquote_err)
1.985 + if (*t != CHAR_DQUOTE || pswit[QPARA_SWITCH]) {
1.986 + if (!pswit[OVERVIEW_SWITCH]) {
1.987 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart);
1.988 + printf(dquote_err);
1.989 + }
1.990 + else
1.991 + cnt_dquot++;
1.992 + }
1.993 + if (*squote_err) {
1.994 + if (*t != CHAR_SQUOTE && *t != CHAR_OPEN_SQUOTE || pswit[QPARA_SWITCH] || squot) {
1.995 + if (!pswit[OVERVIEW_SWITCH]) {
1.996 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart);
1.997 + printf(squote_err);
1.998 + }
1.999 + else
1.1000 + cnt_squot++;
1.1001 + }
1.1002 + squot = 0;
1.1003 + }
1.1004 + if (*rbrack_err) {
1.1005 + if (!pswit[OVERVIEW_SWITCH]) {
1.1006 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart);
1.1007 + printf(rbrack_err);
1.1008 + }
1.1009 + else
1.1010 + cnt_brack++;
1.1011 + }
1.1012 + if (*sbrack_err) {
1.1013 + if (!pswit[OVERVIEW_SWITCH]) {
1.1014 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart);
1.1015 + printf(sbrack_err);
1.1016 + }
1.1017 + else
1.1018 + cnt_brack++;
1.1019 + }
1.1020 + if (*cbrack_err) {
1.1021 + if (!pswit[OVERVIEW_SWITCH]) {
1.1022 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart);
1.1023 + printf(cbrack_err);
1.1024 + }
1.1025 + else
1.1026 + cnt_brack++;
1.1027 + }
1.1028 + if (*unders_err) {
1.1029 + if (!pswit[OVERVIEW_SWITCH]) {
1.1030 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart);
1.1031 + printf(unders_err);
1.1032 + }
1.1033 + else
1.1034 + cnt_brack++;
1.1035 + }
1.1036 +
1.1037 + *dquote_err = *squote_err = *rbrack_err = *cbrack_err =
1.1038 + *sbrack_err = *unders_err = 0;
1.1039 +
1.1040 +
1.1041 + /* look along the line, accumulate the count of quotes, and see */
1.1042 + /* if this is an empty line - i.e. a line with nothing on it */
1.1043 + /* but spaces. */
1.1044 + /* V .12 also if line has just spaces, * and/or - on it, don't */
1.1045 + /* count it, since empty lines with asterisks or dashes to */
1.1046 + /* separate sections are common. */
1.1047 + /* V .15 new single-quote checking - has to be better than the */
1.1048 + /* previous version, but how much better? fingers crossed! */
1.1049 + /* V .20 add period to * and - as characters on a separator line*/
1.1050 + s = aline;
1.1051 + while (*s) {
1.1052 + if (*s == CHAR_DQUOTE) quot++;
1.1053 + if (*s == CHAR_SQUOTE || *s == CHAR_OPEN_SQUOTE)
1.1054 + if (s == aline) { /* at start of line, it can only be an openquote */
1.1055 + if (strncmp(s+2, "tis", 3) && strncmp(s+2, "Tis", 3)) /* hardcode a very common exception! */
1.1056 + open_single_quote++;
1.1057 + }
1.1058 + else
1.1059 + if (gcisalpha(*(s-1)) && gcisalpha(*(s+1)))
1.1060 + ; /* do nothing! - it's definitely an apostrophe, not a quote */
1.1061 + else /* it's outside a word - let's check it out */
1.1062 + if (*s == CHAR_OPEN_SQUOTE || gcisalpha(*(s+1))) { /* it damwell better BE an openquote */
1.1063 + if (strncmp(s+1, "tis", 3) && strncmp(s+1, "Tis", 3)) /* hardcode a very common exception! */
1.1064 + open_single_quote++;
1.1065 + }
1.1066 + else { /* now - is it a closequote? */
1.1067 + guessquote = 0; /* accumulate clues */
1.1068 + if (gcisalpha(*(s-1))) { /* it follows a letter - could be either */
1.1069 + guessquote += 1;
1.1070 + if (*(s-1) == 's') { /* looks like a plural apostrophe */
1.1071 + guessquote -= 3;
1.1072 + if (*(s+1) == CHAR_SPACE) /* bonus marks! */
1.1073 + guessquote -= 2;
1.1074 + }
1.1075 + }
1.1076 + else /* it doesn't have a letter either side */
1.1077 + if (strchr(".?!,;:", *(s-1)) && (strchr(".?!,;: ", *(s+1))))
1.1078 + guessquote += 8; /* looks like a closequote */
1.1079 + else
1.1080 + guessquote += 1;
1.1081 + if (open_single_quote > close_single_quote)
1.1082 + guessquote += 1; /* give it the benefit of some doubt - if a squote is already open */
1.1083 + else
1.1084 + guessquote -= 1;
1.1085 + if (guessquote >= 0)
1.1086 + close_single_quote++;
1.1087 + }
1.1088 +
1.1089 + if (*s != CHAR_SPACE
1.1090 + && *s != '-'
1.1091 + && *s != '.'
1.1092 + && *s != CHAR_ASTERISK
1.1093 + && *s != 13
1.1094 + && *s != 10) isemptyline = 0; /* ignore lines like * * * as spacers */
1.1095 + if (*s == CHAR_UNDERSCORE) c_unders++;
1.1096 + if (*s == CHAR_OPEN_CBRACK) c_brack++;
1.1097 + if (*s == CHAR_CLOSE_CBRACK) c_brack--;
1.1098 + if (*s == CHAR_OPEN_RBRACK) r_brack++;
1.1099 + if (*s == CHAR_CLOSE_RBRACK) r_brack--;
1.1100 + if (*s == CHAR_OPEN_SBRACK) s_brack++;
1.1101 + if (*s == CHAR_CLOSE_SBRACK) s_brack--;
1.1102 + s++;
1.1103 + }
1.1104 +
1.1105 + if (isnewpara && !isemptyline) { /* This line is the start of a new paragraph */
1.1106 + start_para_line = linecnt;
1.1107 + strncpy(parastart, aline, 80); /* Capture its first line in case we want to report it later */
1.1108 + parastart[79] = 0;
1.1109 + dquotepar = squotepar = 0; /* restart the quote count 0.98 */
1.1110 + s = aline;
1.1111 + while (!gcisalpha(*s) && !gcisdigit(*s) && *s) s++; /* V.97 fixed bug - overran line and gave false warning - rare */
1.1112 + if (*s >= 'a' && *s <='z') { /* and its first letter is lowercase */
1.1113 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1.1114 + if (!pswit[OVERVIEW_SWITCH])
1.1115 + printf(" Line %ld column %d - Paragraph starts with lower-case\n", linecnt, (int)(s - aline) +1);
1.1116 + else
1.1117 + cnt_punct++;
1.1118 + }
1.1119 + isnewpara = 0; /* Signal the end of new para processing */
1.1120 + }
1.1121 +
1.1122 + /* Check for an em-dash broken at line end */
1.1123 + if (enddash && *aline == '-') {
1.1124 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1.1125 + if (!pswit[OVERVIEW_SWITCH])
1.1126 + printf(" Line %ld column 1 - Broken em-dash?\n", linecnt);
1.1127 + else
1.1128 + cnt_punct++;
1.1129 + }
1.1130 + enddash = 0;
1.1131 + for (s = aline + strlen(aline) - 1; *s == ' ' && s > aline; s--);
1.1132 + if (s >= aline && *s == '-')
1.1133 + enddash = 1;
1.1134 +
1.1135 +
1.1136 + /* Check for invalid or questionable characters in the line */
1.1137 + /* Anything above 127 is invalid for plain ASCII, and */
1.1138 + /* non-printable control characters should also be flagged. */
1.1139 + /* Tabs should generally not be there. */
1.1140 + /* Jan 06, in 0.99: Hm. For some strange reason, I either */
1.1141 + /* never created or deleted the check for unprintable */
1.1142 + /* control characters. They should be reported even if */
1.1143 + /* warn_bin is off, I think, and in full. */
1.1144 +
1.1145 + for (s = aline; *s; s++) {
1.1146 + i = (unsigned char) *s;
1.1147 + if (i < CHAR_SPACE && i != CHAR_LF && i != CHAR_CR && i != CHAR_TAB) {
1.1148 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1.1149 + if (!pswit[OVERVIEW_SWITCH])
1.1150 + printf(" Line %ld column %d - Control character %d\n", linecnt, (int) (s - aline) + 1, i);
1.1151 + else
1.1152 + cnt_bin++;
1.1153 + }
1.1154 + }
1.1155 +
1.1156 + if (warn_bin) {
1.1157 + eNon_A = eTab = eTilde = eCarat = eFSlash = eAst = 0; /* don't repeat multiple warnings on one line */
1.1158 + for (s = aline; *s; s++) {
1.1159 + if (!eNon_A && ((*s < CHAR_SPACE && *s != 9 && *s != '\n') || (unsigned char)*s > 127)) {
1.1160 + i = *s; /* annoying kludge for signed chars */
1.1161 + if (i < 0) i += 256;
1.1162 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1.1163 + if (!pswit[OVERVIEW_SWITCH])
1.1164 + if (i > 127 && i < 160)
1.1165 + printf(" Line %ld column %d - Non-ISO-8859 character %d\n", linecnt, (int) (s - aline) + 1, i);
1.1166 + else
1.1167 + printf(" Line %ld column %d - Non-ASCII character %d\n", linecnt, (int) (s - aline) + 1, i);
1.1168 + else
1.1169 + cnt_bin++;
1.1170 + eNon_A = 1;
1.1171 + }
1.1172 + if (!eTab && *s == CHAR_TAB) {
1.1173 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1.1174 + if (!pswit[OVERVIEW_SWITCH])
1.1175 + printf(" Line %ld column %d - Tab character?\n", linecnt, (int) (s - aline) + 1);
1.1176 + else
1.1177 + cnt_odd++;
1.1178 + eTab = 1;
1.1179 + }
1.1180 + if (!eTilde && *s == CHAR_TILDE) { /* often used by OCR software to indicate an unrecognizable character */
1.1181 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1.1182 + if (!pswit[OVERVIEW_SWITCH])
1.1183 + printf(" Line %ld column %d - Tilde character?\n", linecnt, (int) (s - aline) + 1);
1.1184 + else
1.1185 + cnt_odd++;
1.1186 + eTilde = 1;
1.1187 + }
1.1188 + if (!eCarat && *s == CHAR_CARAT) {
1.1189 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1.1190 + if (!pswit[OVERVIEW_SWITCH])
1.1191 + printf(" Line %ld column %d - Carat character?\n", linecnt, (int) (s - aline) + 1);
1.1192 + else
1.1193 + cnt_odd++;
1.1194 + eCarat = 1;
1.1195 + }
1.1196 + if (!eFSlash && *s == CHAR_FORESLASH && warn_fslash) {
1.1197 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1.1198 + if (!pswit[OVERVIEW_SWITCH])
1.1199 + printf(" Line %ld column %d - Forward slash?\n", linecnt, (int) (s - aline) + 1);
1.1200 + else
1.1201 + cnt_odd++;
1.1202 + eFSlash = 1;
1.1203 + }
1.1204 + /* report asterisks only in paranoid mode, since they're often deliberate */
1.1205 + if (!eAst && pswit[PARANOID_SWITCH] && warn_ast && !isemptyline && *s == CHAR_ASTERISK) {
1.1206 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1.1207 + if (!pswit[OVERVIEW_SWITCH])
1.1208 + printf(" Line %ld column %d - Asterisk?\n", linecnt, (int) (s - aline) + 1);
1.1209 + else
1.1210 + cnt_odd++;
1.1211 + eAst = 1;
1.1212 + }
1.1213 + }
1.1214 + }
1.1215 +
1.1216 + /* Check for line too long */
1.1217 + if (warn_long) {
1.1218 + if (strlen(aline) > LONGEST_PG_LINE) {
1.1219 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1.1220 + if (!pswit[OVERVIEW_SWITCH])
1.1221 + printf(" Line %ld column %d - Long line %d\n", linecnt, strlen(aline), strlen(aline));
1.1222 + else
1.1223 + cnt_long++;
1.1224 + }
1.1225 + }
1.1226 +
1.1227 + /* Check for line too short. */
1.1228 + /* This one is a bit trickier to implement: we don't want to */
1.1229 + /* flag the last line of a paragraph for being short, so we */
1.1230 + /* have to wait until we know that our current line is a */
1.1231 + /* "normal" line, then report the _previous_ line if it was too */
1.1232 + /* short. We also don't want to report indented lines like */
1.1233 + /* chapter heads or formatted quotations. We therefore keep */
1.1234 + /* lastlen as the length of the last line examined, and */
1.1235 + /* lastblen as the length of the last but one, and try to */
1.1236 + /* suppress unnecessary warnings by checking that both were of */
1.1237 + /* "normal" length. We keep the first character of the last */
1.1238 + /* line in laststart, and if it was a space, we assume that the */
1.1239 + /* formatting is deliberate. I can't figure out a way to */
1.1240 + /* distinguish something like a quoted verse left-aligned or */
1.1241 + /* the header or footer of a letter from a paragraph of short */
1.1242 + /* lines - maybe if I examined the whole paragraph, and if the */
1.1243 + /* para has less than, say, 8 lines and if all lines are short, */
1.1244 + /* then just assume it's OK? Need to look at some texts to see */
1.1245 + /* how often a formula like this would get the right result. */
1.1246 + /* V0.99 changed the tolerance for length to ignore from 2 to 1 */
1.1247 + if (warn_short) {
1.1248 + if (strlen(aline) > 1
1.1249 + && lastlen > 1 && lastlen < SHORTEST_PG_LINE
1.1250 + && lastblen > 1 && lastblen > SHORTEST_PG_LINE
1.1251 + && laststart != CHAR_SPACE) {
1.1252 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", prevline);
1.1253 + if (!pswit[OVERVIEW_SWITCH])
1.1254 + printf(" Line %ld column %d - Short line %d?\n", linecnt-1, strlen(prevline), strlen(prevline));
1.1255 + else
1.1256 + cnt_short++;
1.1257 + }
1.1258 + }
1.1259 + lastblen = lastlen;
1.1260 + lastlen = strlen(aline);
1.1261 + laststart = aline[0];
1.1262 +
1.1263 + /* look for punctuation at start of line */
1.1264 + if (*aline && strchr(".?!,;:", aline[0])) { /* if it's punctuation */
1.1265 + if (strncmp(". . .", aline, 5)) { /* exception for ellipsis: V.98 tightened up to except only a full ellipsis */
1.1266 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1.1267 + if (!pswit[OVERVIEW_SWITCH])
1.1268 + printf(" Line %ld column 1 - Begins with punctuation?\n", linecnt);
1.1269 + else
1.1270 + cnt_punct++;
1.1271 + }
1.1272 + }
1.1273 +
1.1274 + /* Check for spaced em-dashes */
1.1275 + /* V.20 must check _all_ occurrences of "--" on the line */
1.1276 + /* hence the loop - even if the first double-dash is OK */
1.1277 + /* there may be another that's wrong later on. */
1.1278 + if (warn_dash) {
1.1279 + s = aline;
1.1280 + while (strstr(s,"--")) {
1.1281 + if (*(strstr(s, "--")-1) == CHAR_SPACE ||
1.1282 + (*(strstr(s, "--")+2) == CHAR_SPACE)) {
1.1283 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1.1284 + if (!pswit[OVERVIEW_SWITCH])
1.1285 + printf(" Line %ld column %d - Spaced em-dash?\n", linecnt, (int) (strstr(s,"--") - aline) + 1);
1.1286 + else
1.1287 + cnt_dash++;
1.1288 + }
1.1289 + s = strstr(s,"--") + 2;
1.1290 + }
1.1291 + }
1.1292 +
1.1293 + /* Check for spaced dashes */
1.1294 + if (warn_dash)
1.1295 + if (strstr(aline," -")) {
1.1296 + if (*(strstr(aline, " -")+2) != '-') {
1.1297 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1.1298 + if (!pswit[OVERVIEW_SWITCH])
1.1299 + printf(" Line %ld column %d - Spaced dash?\n", linecnt, (int) (strstr(aline," -") - aline) + 1);
1.1300 + else
1.1301 + cnt_dash++;
1.1302 + }
1.1303 + }
1.1304 + else
1.1305 + if (strstr(aline,"- ")) {
1.1306 + if (*(strstr(aline, "- ")-1) != '-') {
1.1307 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1.1308 + if (!pswit[OVERVIEW_SWITCH])
1.1309 + printf(" Line %ld column %d - Spaced dash?\n", linecnt, (int) (strstr(aline,"- ") - aline) + 1);
1.1310 + else
1.1311 + cnt_dash++;
1.1312 + }
1.1313 + }
1.1314 +
1.1315 + /* v 0.99 */
1.1316 + /* Check for unmarked paragraphs indicated by separate speakers */
1.1317 + /* May well be false positive: */
1.1318 + /* "Bravo!" "Wonderful!" called the crowd. */
1.1319 + /* but useful all the same. */
1.1320 + s = wrk;
1.1321 + *s = 0;
1.1322 + if (strstr(aline, "\" \"")) s = strstr(aline, "\" \"");
1.1323 + if (strstr(aline, "\" \"")) s = strstr(aline, "\" \"");
1.1324 + if (*s) {
1.1325 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1.1326 + if (!pswit[OVERVIEW_SWITCH])
1.1327 + printf(" Line %ld column %d - Query missing paragraph break?\n", linecnt, (int)(s - aline) +1);
1.1328 + else
1.1329 + cnt_punct++;
1.1330 + }
1.1331 +
1.1332 +
1.1333 +
1.1334 + /* Check for "to he" and other easy he/be errors */
1.1335 + /* This is a very inadequate effort on the he/be problem, */
1.1336 + /* but the phrase "to he" is always an error, whereas "to */
1.1337 + /* be" is quite common. I chuckle when it does catch one! */
1.1338 + /* Similarly, '"Quiet!", be said.' is a non-be error */
1.1339 + /* V .18 - "to he" is _not_ always an error!: */
1.1340 + /* "Where they went to he couldn't say." */
1.1341 + /* but I'm leaving it in anyway. */
1.1342 + /* V .20 Another false positive: */
1.1343 + /* What would "Cinderella" be without the . . . */
1.1344 + /* and another "If he wants to he can see for himself." */
1.1345 + /* V .21 Added " is be " and " be is " and " was be " */
1.1346 + /* V .99 Added jeebies code -- removed again. */
1.1347 + /* Is jeebies code worth adding? Rare to see he/be */
1.1348 + /* errors with modern OCR. Separate program? Yes! */
1.1349 + /* jeebies does the job without cluttering up this. */
1.1350 + /* We do get a few more queryable pairs from the */
1.1351 + /* project though -- they're cheap to implement. */
1.1352 + /* Also added a column number for guiguts. */
1.1353 +
1.1354 + s = wrk;
1.1355 + *s = 0;
1.1356 + if (strstr(aline," to he ")) s = strstr(aline," to he ");
1.1357 + if (strstr(aline,"\" be ")) s = strstr(aline,"\" be ");
1.1358 + if (strstr(aline,"\", be ")) s = strstr(aline,"\", be ");
1.1359 + if (strstr(aline," is be ")) s = strstr(aline," is be ");
1.1360 + if (strstr(aline," be is ")) s = strstr(aline," be is ");
1.1361 + if (strstr(aline," was be ")) s = strstr(aline," was be ");
1.1362 + if (strstr(aline," be would ")) s = strstr(aline," be would ");
1.1363 + if (strstr(aline," be could ")) s = strstr(aline," be could ");
1.1364 + if (*s) {
1.1365 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1.1366 + if (!pswit[OVERVIEW_SWITCH])
1.1367 + printf(" Line %ld column %d - Query he/be error?\n", linecnt, (int)(s - aline) +1);
1.1368 + else
1.1369 + cnt_word++;
1.1370 + }
1.1371 +
1.1372 + s = wrk;
1.1373 + *s = 0;
1.1374 + if (strstr(aline," i bad ")) s = strstr(aline," i bad ");
1.1375 + if (strstr(aline," you bad ")) s = strstr(aline," you bad ");
1.1376 + if (strstr(aline," he bad ")) s = strstr(aline," he bad ");
1.1377 + if (strstr(aline," she bad ")) s = strstr(aline," she bad ");
1.1378 + if (strstr(aline," they bad ")) s = strstr(aline," they bad ");
1.1379 + if (strstr(aline," a had ")) s = strstr(aline," a had ");
1.1380 + if (strstr(aline," the had ")) s = strstr(aline," the had ");
1.1381 + if (*s) {
1.1382 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1.1383 + if (!pswit[OVERVIEW_SWITCH])
1.1384 + printf(" Line %ld column %d - Query had/bad error?\n", linecnt, (int)(s - aline) +1);
1.1385 + else
1.1386 + cnt_word++;
1.1387 + }
1.1388 +
1.1389 +
1.1390 + /* V .97 Added ", hut " Not too common, hut pretty certain */
1.1391 + /* V.99 changed to add a column number for guiguts */
1.1392 + s = wrk;
1.1393 + *s = 0;
1.1394 + if (strstr(aline,", hut ")) s = strstr(aline,", hut ");
1.1395 + if (strstr(aline,"; hut ")) s = strstr(aline,"; hut ");
1.1396 + if (*s) {
1.1397 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1.1398 + if (!pswit[OVERVIEW_SWITCH])
1.1399 + printf(" Line %ld column %d - Query hut/but error?\n", linecnt, (int)(s - aline) +1);
1.1400 + else
1.1401 + cnt_word++;
1.1402 + }
1.1403 +
1.1404 + /* Special case - angled bracket in front of "From" placed there by an MTA */
1.1405 + /* when sending an e-mail. V .21 */
1.1406 + if (strstr(aline, ">From")) {
1.1407 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1.1408 + if (!pswit[OVERVIEW_SWITCH])
1.1409 + printf(" Line %ld column %d - Query angled bracket with From\n", linecnt, (int)(strstr(aline, ">From") - aline) +1);
1.1410 + else
1.1411 + cnt_punct++;
1.1412 + }
1.1413 +
1.1414 + /* V 0.98 Check for a single character line - often an overflow from bad wrapping. */
1.1415 + if (*aline && !*(aline+1)) {
1.1416 + if (*aline == 'I' || *aline == 'V' || *aline == 'X' || *aline == 'L' || gcisdigit(*aline))
1.1417 + ; /* nothing - ignore numerals alone on a line. */
1.1418 + else {
1.1419 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1.1420 + if (!pswit[OVERVIEW_SWITCH])
1.1421 + printf(" Line %ld column 1 - Query single character line\n", linecnt);
1.1422 + else
1.1423 + cnt_punct++;
1.1424 + }
1.1425 + }
1.1426 +
1.1427 + /* V 0.98 Check for I" - often should be ! */
1.1428 + if (strstr(aline, " I\"")) {
1.1429 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1.1430 + if (!pswit[OVERVIEW_SWITCH])
1.1431 + printf(" Line %ld column %ld - Query I=exclamation mark?\n", linecnt, strstr(aline, " I\"") - aline);
1.1432 + else
1.1433 + cnt_punct++;
1.1434 + }
1.1435 +
1.1436 + /* V 0.98 Check for period without a capital letter. Cut-down from gutspell */
1.1437 + /* Only works when it happens on a single line. */
1.1438 +
1.1439 + if (pswit[PARANOID_SWITCH])
1.1440 + for (t = s = aline; strstr(t,". ");) {
1.1441 + t = strstr(t, ". ");
1.1442 + if (t == s) {
1.1443 + t++;
1.1444 + continue; /* start of line punctuation is handled elsewhere */
1.1445 + }
1.1446 + if (!gcisalpha(*(t-1))) {
1.1447 + t++;
1.1448 + continue;
1.1449 + }
1.1450 + if (isDutch) { /* For Frank & Jeroen -- 's Middags case */
1.1451 + if (*(t+2) == CHAR_SQUOTE &&
1.1452 + *(t+3)>='a' && *(t+3)<='z' &&
1.1453 + *(t+4) == CHAR_SPACE &&
1.1454 + *(t+5)>='A' && *(t+5)<='Z') {
1.1455 + t++;
1.1456 + continue;
1.1457 + }
1.1458 + }
1.1459 + s1 = t+2;
1.1460 + while (*s1 && !gcisalpha(*s1) && !isdigit(*s1))
1.1461 + s1++;
1.1462 + if (*s1 >= 'a' && *s1 <= 'z') { /* we have something to investigate */
1.1463 + istypo = 1;
1.1464 + for (s1 = t - 1; s1 >= s &&
1.1465 + (gcisalpha(*s1) || gcisdigit(*s1) ||
1.1466 + (*s1 == CHAR_SQUOTE && gcisalpha(*(s1+1)) && gcisalpha(*(s1-1)))); s1--); /* so let's go back and find out */
1.1467 + s1++;
1.1468 + for (i = 0; *s1 && *s1 != '.'; s1++, i++)
1.1469 + testword[i] = *s1;
1.1470 + testword[i] = 0;
1.1471 + for (i = 0; *abbrev[i]; i++)
1.1472 + if (!strcmp(testword, abbrev[i]))
1.1473 + istypo = 0;
1.1474 +// if (*testword >= 'A' && *testword <= 'Z')
1.1475 +// istypo = 0;
1.1476 + if (gcisdigit(*testword)) istypo = 0;
1.1477 + if (!*(testword+1)) istypo = 0;
1.1478 + if (isroman(testword)) istypo = 0;
1.1479 + if (istypo) {
1.1480 + istypo = 0;
1.1481 + for (i = 0; testword[i]; i++)
1.1482 + if (strchr(vowels, testword[i]))
1.1483 + istypo = 1;
1.1484 + }
1.1485 + if (istypo) {
1.1486 + isdup = 0;
1.1487 + if (strlen(testword) < MAX_QWORD_LENGTH && !pswit[VERBOSE_SWITCH])
1.1488 + for (i = 0; i < qperiod_index; i++)
1.1489 + if (!strcmp(testword, qperiod[i])) {
1.1490 + isdup = 1;
1.1491 + }
1.1492 + if (!isdup) {
1.1493 + if (qperiod_index < MAX_QWORD && strlen(testword) < MAX_QWORD_LENGTH) {
1.1494 + strcpy(qperiod[qperiod_index], testword);
1.1495 + qperiod_index++;
1.1496 + }
1.1497 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1.1498 + if (!pswit[OVERVIEW_SWITCH])
1.1499 + printf(" Line %ld column %d - Extra period?\n", linecnt, (int)(t - aline)+1);
1.1500 + else
1.1501 + cnt_punct++;
1.1502 + }
1.1503 + }
1.1504 + }
1.1505 + t++;
1.1506 + }
1.1507 +
1.1508 +
1.1509 + if (pswit[TYPO_SWITCH]) { /* Should have put this condition in at the start of 0.99. Duh! */
1.1510 + /* Check for words usually not followed by punctuation 0.99 */
1.1511 + for (s = aline; *s;) {
1.1512 + wordstart = s;
1.1513 + s = getaword(s, inword);
1.1514 + if (!*inword) continue;
1.1515 + lowerit(inword);
1.1516 + for (i = 0; *nocomma[i]; i++)
1.1517 + if (!strcmp(inword, nocomma[i])) {
1.1518 + if (*s == ',' || *s == ';' || *s == ':') {
1.1519 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1.1520 + if (!pswit[OVERVIEW_SWITCH])
1.1521 + printf(" Line %ld column %d - Query punctuation after %s?\n", linecnt, (int)(s - aline)+1, inword);
1.1522 + else
1.1523 + cnt_punct++;
1.1524 + }
1.1525 + }
1.1526 + for (i = 0; *noperiod[i]; i++)
1.1527 + if (!strcmp(inword, noperiod[i])) {
1.1528 + if (*s == '.' || *s == '!') {
1.1529 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1.1530 + if (!pswit[OVERVIEW_SWITCH])
1.1531 + printf(" Line %ld column %d - Query punctuation after %s?\n", linecnt, (int)(s - aline)+1, inword);
1.1532 + else
1.1533 + cnt_punct++;
1.1534 + }
1.1535 + }
1.1536 + }
1.1537 + }
1.1538 +
1.1539 +
1.1540 +
1.1541 + /* Check for commonly mistyped words, and digits like 0 for O in a word */
1.1542 + for (s = aline; *s;) {
1.1543 + wordstart = s;
1.1544 + s = getaword(s, inword);
1.1545 + if (!*inword) continue; /* don't bother with empty lines */
1.1546 + if (mixdigit(inword)) {
1.1547 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1.1548 + if (!pswit[OVERVIEW_SWITCH])
1.1549 + printf(" Line %ld column %ld - Query digit in %s\n", linecnt, (int)(wordstart - aline) + 1, inword);
1.1550 + else
1.1551 + cnt_word++;
1.1552 + }
1.1553 +
1.1554 + /* put the word through a series of tests for likely typos and OCR errors */
1.1555 + /* V.21 I had allowed lots of typo-checking even with the typo switch */
1.1556 + /* turned off, but I really should disallow reporting of them when */
1.1557 + /* the switch is off. Hence the "if" below. */
1.1558 + if (pswit[TYPO_SWITCH]) {
1.1559 + istypo = 0;
1.1560 + strcpy(testword, inword);
1.1561 + alower = 0;
1.1562 + for (i = 0; i < (signed int)strlen(testword); i++) { /* lowercase for testing */
1.1563 + if (testword[i] >= 'a' && testword[i] <= 'z') alower = 1;
1.1564 + if (alower && testword[i] >= 'A' && testword[i] <= 'Z') {
1.1565 + /* we have an uppercase mid-word. However, there are common cases: */
1.1566 + /* Mac and Mc like McGill */
1.1567 + /* French contractions like l'Abbe */
1.1568 + if ((i == 2 && testword[0] == 'm' && testword[1] == 'c') ||
1.1569 + (i == 3 && testword[0] == 'm' && testword[1] == 'a' && testword[2] == 'c') ||
1.1570 + (i > 0 && testword[i-1] == CHAR_SQUOTE))
1.1571 + ; /* do nothing! */
1.1572 +
1.1573 + else { /* V.97 - remove separate case of uppercase within word so that */
1.1574 + /* names like VanAllen fall into qword_index and get reported only once */
1.1575 + istypo = 1;
1.1576 + }
1.1577 + }
1.1578 + testword[i] = (char)tolower(testword[i]);
1.1579 + }
1.1580 +
1.1581 + /* check for certain unlikely two-letter combinations at word start and end */
1.1582 + /* V.0.97 - this replaces individual hardcoded checks in previous versions */
1.1583 + if (strlen(testword) > 1) {
1.1584 + for (i = 0; *nostart[i]; i++)
1.1585 + if (!strncmp(testword, nostart[i], 2))
1.1586 + istypo = 1;
1.1587 + for (i = 0; *noend[i]; i++)
1.1588 + if (!strncmp(testword + strlen(testword) -2, noend[i], 2))
1.1589 + istypo = 1;
1.1590 + }
1.1591 +
1.1592 +
1.1593 + /* ght is common, gbt never. Like that. */
1.1594 + if (strstr(testword, "cb")) istypo = 1;
1.1595 + if (strstr(testword, "gbt")) istypo = 1;
1.1596 + if (strstr(testword, "pbt")) istypo = 1;
1.1597 + if (strstr(testword, "tbs")) istypo = 1;
1.1598 + if (strstr(testword, "mrn")) istypo = 1;
1.1599 + if (strstr(testword, "ahle")) istypo = 1;
1.1600 + if (strstr(testword, "ihle")) istypo = 1;
1.1601 +
1.1602 + /* "TBE" does happen - like HEARTBEAT - but uncommon. */
1.1603 + /* Also "TBI" - frostbite, outbid - but uncommon. */
1.1604 + /* Similarly "ii" like Hawaii, or Pompeii, and in Roman numerals, */
1.1605 + /* but these are covered in V.20. "ii" is a common scanno. */
1.1606 + if (strstr(testword, "tbi")) istypo = 1;
1.1607 + if (strstr(testword, "tbe")) istypo = 1;
1.1608 + if (strstr(testword, "ii")) istypo = 1;
1.1609 +
1.1610 + /* check for no vowels or no consonants. */
1.1611 + /* If none, flag a typo */
1.1612 + if (!istypo && strlen(testword)>1) {
1.1613 + vowel = consonant = 0;
1.1614 + for (i = 0; testword[i]; i++)
1.1615 + if (testword[i] == 'y' || gcisdigit(testword[i])) { /* Yah, this is loose. */
1.1616 + vowel++;
1.1617 + consonant++;
1.1618 + }
1.1619 + else
1.1620 + if (strchr(vowels, testword[i])) vowel++;
1.1621 + else consonant++;
1.1622 + if (!vowel || !consonant) {
1.1623 + istypo = 1;
1.1624 + }
1.1625 + }
1.1626 +
1.1627 + /* now exclude the word from being reported if it's in */
1.1628 + /* the okword list */
1.1629 + for (i = 0; *okword[i]; i++)
1.1630 + if (!strcmp(testword, okword[i]))
1.1631 + istypo = 0;
1.1632 +
1.1633 + /* what looks like a typo may be a Roman numeral. Exclude these */
1.1634 + if (istypo)
1.1635 + if (isroman(testword))
1.1636 + istypo = 0;
1.1637 +
1.1638 + /* check the manual list of typos */
1.1639 + if (!istypo)
1.1640 + for (i = 0; *typo[i]; i++)
1.1641 + if (!strcmp(testword, typo[i]))
1.1642 + istypo = 1;
1.1643 +
1.1644 +
1.1645 + /* V.21 - check lowercase s and l - special cases */
1.1646 + /* V.98 - added "i" and "m" */
1.1647 + /* V.99 - added "j" often a semi-colon gone wrong */
1.1648 + /* - and "d" for a missing apostrophe - he d */
1.1649 + /* - and "n" for "in" */
1.1650 + if (!istypo && strlen(testword) == 1)
1.1651 + if (strchr("slmijdn", *inword))
1.1652 + istypo = 1;
1.1653 +
1.1654 +
1.1655 + if (istypo) {
1.1656 + isdup = 0;
1.1657 + if (strlen(testword) < MAX_QWORD_LENGTH && !pswit[VERBOSE_SWITCH])
1.1658 + for (i = 0; i < qword_index; i++)
1.1659 + if (!strcmp(testword, qword[i])) {
1.1660 + isdup = 1;
1.1661 + ++dupcnt[i];
1.1662 + }
1.1663 + if (!isdup) {
1.1664 + if (qword_index < MAX_QWORD && strlen(testword) < MAX_QWORD_LENGTH) {
1.1665 + strcpy(qword[qword_index], testword);
1.1666 + qword_index++;
1.1667 + }
1.1668 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1.1669 + if (!pswit[OVERVIEW_SWITCH]) {
1.1670 + printf(" Line %ld column %d - Query word %s", linecnt, (int)(wordstart - aline) + 1, inword);
1.1671 + if (strlen(testword) < MAX_QWORD_LENGTH && !pswit[VERBOSE_SWITCH])
1.1672 + printf(" - not reporting duplicates");
1.1673 + printf("\n");
1.1674 + }
1.1675 + else
1.1676 + cnt_word++;
1.1677 + }
1.1678 + }
1.1679 + } /* end of typo-checking */
1.1680 +
1.1681 + /* check the user's list of typos */
1.1682 + if (!istypo)
1.1683 + if (usertypo_count)
1.1684 + for (i = 0; i < usertypo_count; i++)
1.1685 + if (!strcmp(testword, usertypo[i])) {
1.1686 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1.1687 + if (!pswit[OVERVIEW_SWITCH])
1.1688 + printf(" Line %ld column %d - Query possible scanno %s\n", linecnt, (int)(wordstart - aline) + 2, inword);
1.1689 + }
1.1690 +
1.1691 +
1.1692 +
1.1693 + if (pswit[PARANOID_SWITCH] && warn_digit) { /* in paranoid mode, query all 0 and 1 standing alone - added warn_digit V.97*/
1.1694 + if (!strcmp(inword, "0") || !strcmp(inword, "1")) {
1.1695 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1.1696 + if (!pswit[OVERVIEW_SWITCH])
1.1697 + printf(" Line %ld column %d - Query standalone %s\n", linecnt, (int)(wordstart - aline) + 2, inword);
1.1698 + else
1.1699 + cnt_word++;
1.1700 + }
1.1701 + }
1.1702 + }
1.1703 +
1.1704 + /* look for added or missing spaces around punctuation and quotes */
1.1705 + /* If there is a punctuation character like ! with no space on */
1.1706 + /* either side, suspect a missing!space. If there are spaces on */
1.1707 + /* both sides , assume a typo. If we see a double quote with no */
1.1708 + /* space or punctuation on either side of it, assume unspaced */
1.1709 + /* quotes "like"this. */
1.1710 + llen = strlen(aline);
1.1711 + for (i = 1; i < llen; i++) { /* for each character in the line after the first */
1.1712 + if (strchr(".?!,;:_", aline[i])) { /* if it's punctuation */
1.1713 + isacro = 0; /* we need to suppress warnings for acronyms like M.D. */
1.1714 + isellipsis = 0; /* we need to suppress warnings for ellipsis . . . */
1.1715 + if ( (gcisalpha(aline[i-1]) && gcisalpha(aline[i+1])) || /* if there are letters on both sides of it or ... */
1.1716 + (gcisalpha(aline[i+1]) && strchr("?!,;:", aline[i]))) { /* ...if it's strict punctuation followed by an alpha */
1.1717 + if (aline[i] == '.') {
1.1718 + if (i > 2)
1.1719 + if (aline[i-2] == '.') isacro = 1;
1.1720 + if (i + 2 < llen)
1.1721 + if (aline[i+2] == '.') isacro = 1;
1.1722 + }
1.1723 + if (!isacro) {
1.1724 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1.1725 + if (!pswit[OVERVIEW_SWITCH])
1.1726 + printf(" Line %ld column %d - Missing space?\n", linecnt, i+1);
1.1727 + else
1.1728 + cnt_punct++;
1.1729 + }
1.1730 + }
1.1731 + if (aline[i-1] == CHAR_SPACE && (aline[i+1] == CHAR_SPACE || aline[i+1] == 0)) { /* if there are spaces on both sides, or space before and end of line */
1.1732 + if (aline[i] == '.') {
1.1733 + if (i > 2)
1.1734 + if (aline[i-2] == '.') isellipsis = 1;
1.1735 + if (i + 2 < llen)
1.1736 + if (aline[i+2] == '.') isellipsis = 1;
1.1737 + }
1.1738 + if (!isemptyline && !isellipsis) {
1.1739 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1.1740 + if (!pswit[OVERVIEW_SWITCH])
1.1741 + printf(" Line %ld column %d - Spaced punctuation?\n", linecnt, i+1);
1.1742 + else
1.1743 + cnt_punct++;
1.1744 + }
1.1745 + }
1.1746 + }
1.1747 + }
1.1748 +
1.1749 + /* 0.98 -- split out the characters that CANNOT be preceded by space */
1.1750 + llen = strlen(aline);
1.1751 + for (i = 1; i < llen; i++) { /* for each character in the line after the first */
1.1752 + if (strchr("?!,;:", aline[i])) { /* if it's punctuation that _cannot_ have a space before it */
1.1753 + if (aline[i-1] == CHAR_SPACE && !isemptyline && aline[i+1] != CHAR_SPACE) { /* if aline[i+1) DOES == space, it was already reported just above */
1.1754 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1.1755 + if (!pswit[OVERVIEW_SWITCH])
1.1756 + printf(" Line %ld column %d - Spaced punctuation?\n", linecnt, i+1);
1.1757 + else
1.1758 + cnt_punct++;
1.1759 + }
1.1760 + }
1.1761 + }
1.1762 +
1.1763 +
1.1764 + /* 0.99 -- special case " .X" where X is any alpha. */
1.1765 + /* This plugs a hole in the acronym code above. Inelegant, but maintainable. */
1.1766 + llen = strlen(aline);
1.1767 + for (i = 1; i < llen; i++) { /* for each character in the line after the first */
1.1768 + if (aline[i] == '.') { /* if it's a period */
1.1769 + if (aline[i-1] == CHAR_SPACE && gcisalpha(aline[i+1])) { /* if the period follows a space and is followed by a letter */
1.1770 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1.1771 + if (!pswit[OVERVIEW_SWITCH])
1.1772 + printf(" Line %ld column %d - Spaced punctuation?\n", linecnt, i+1);
1.1773 + else
1.1774 + cnt_punct++;
1.1775 + }
1.1776 + }
1.1777 + }
1.1778 +
1.1779 +
1.1780 +
1.1781 +
1.1782 + /* v.21 breaking out the search for unspaced doublequotes */
1.1783 + /* This is not as efficient, but it's more maintainable */
1.1784 + /* V.97 added underscore to the list of characters not to query, */
1.1785 + /* since underscores are commonly used as italics indicators. */
1.1786 + /* V.98 Added slash as well, same reason. */
1.1787 + for (i = 1; i < llen; i++) { /* for each character in the line after the first */
1.1788 + if (aline[i] == CHAR_DQUOTE) {
1.1789 + if ((!strchr(" _-.'`,;:!/([{?}])", aline[i-1]) &&
1.1790 + !strchr(" _-.'`,;:!/([{?}])", aline[i+1]) &&
1.1791 + aline[i+1] != 0
1.1792 + || (!strchr(" _-([{'`", aline[i-1]) && gcisalpha(aline[i+1])))) {
1.1793 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1.1794 + if (!pswit[OVERVIEW_SWITCH])
1.1795 + printf(" Line %ld column %d - Unspaced quotes?\n", linecnt, i+1);
1.1796 + else
1.1797 + cnt_punct++;
1.1798 + }
1.1799 + }
1.1800 + }
1.1801 +
1.1802 +
1.1803 + /* v.98 check parity of quotes */
1.1804 + /* v.99 added !*(s+1) in some tests to catch "I am," he said, but I will not be soon". */
1.1805 + for (s = aline; *s; s++) {
1.1806 + if (*s == CHAR_DQUOTE) {
1.1807 + if (!(dquotepar = !dquotepar)) { /* parity even */
1.1808 + if (!strchr("_-.'`/,;:!?)]} ", *(s+1))) {
1.1809 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1.1810 + if (!pswit[OVERVIEW_SWITCH])
1.1811 + printf(" Line %ld column %d - Wrongspaced quotes?\n", linecnt, (int)(s - aline)+1);
1.1812 + else
1.1813 + cnt_punct++;
1.1814 + }
1.1815 + }
1.1816 + else { /* parity odd */
1.1817 + if (!gcisalpha(*(s+1)) && !isdigit(*(s+1)) && !strchr("_-/.'`([{$", *(s+1)) || !*(s+1)) {
1.1818 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1.1819 + if (!pswit[OVERVIEW_SWITCH])
1.1820 + printf(" Line %ld column %d - Wrongspaced quotes?\n", linecnt, (int)(s - aline)+1);
1.1821 + else
1.1822 + cnt_punct++;
1.1823 + }
1.1824 + }
1.1825 + }
1.1826 + }
1.1827 +
1.1828 + if (*aline == CHAR_DQUOTE) {
1.1829 + if (strchr(",;:!?)]} ", aline[1])) {
1.1830 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1.1831 + if (!pswit[OVERVIEW_SWITCH])
1.1832 + printf(" Line %ld column 1 - Wrongspaced quotes?\n", linecnt, (int)(s - aline)+1);
1.1833 + else
1.1834 + cnt_punct++;
1.1835 + }
1.1836 + }
1.1837 +
1.1838 + if (pswit[SQUOTE_SWITCH])
1.1839 + for (s = aline; *s; s++) {
1.1840 + if ((*s == CHAR_SQUOTE || *s == CHAR_OPEN_SQUOTE)
1.1841 + && ( s == aline || (s > aline && !gcisalpha(*(s-1))) || !gcisalpha(*(s+1)))) {
1.1842 + if (!(squotepar = !squotepar)) { /* parity even */
1.1843 + if (!strchr("_-.'`/\",;:!?)]} ", *(s+1))) {
1.1844 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1.1845 + if (!pswit[OVERVIEW_SWITCH])
1.1846 + printf(" Line %ld column %d - Wrongspaced singlequotes?\n", linecnt, (int)(s - aline)+1);
1.1847 + else
1.1848 + cnt_punct++;
1.1849 + }
1.1850 + }
1.1851 + else { /* parity odd */
1.1852 + if (!gcisalpha(*(s+1)) && !isdigit(*(s+1)) && !strchr("_-/\".'`", *(s+1)) || !*(s+1)) {
1.1853 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1.1854 + if (!pswit[OVERVIEW_SWITCH])
1.1855 + printf(" Line %ld column %d - Wrongspaced singlequotes?\n", linecnt, (int)(s - aline)+1);
1.1856 + else
1.1857 + cnt_punct++;
1.1858 + }
1.1859 + }
1.1860 + }
1.1861 + }
1.1862 +
1.1863 +
1.1864 + /* v.20 also look for double punctuation like ,. or ,, */
1.1865 + /* Thanks to DW for the suggestion! */
1.1866 + /* I'm putting this in a separate loop for clarity */
1.1867 + /* In books with references, ".," and ".;" are common */
1.1868 + /* e.g. "etc., etc.," and vol. 1.; vol 3.; */
1.1869 + /* OTOH, from my initial tests, there are also fairly */
1.1870 + /* common errors. What to do? Make these cases paranoid? */
1.1871 + /* V.21 ".," is the most common, so invented warn_dotcomma */
1.1872 + /* to suppress detailed reporting if it occurs often */
1.1873 + llen = strlen(aline);
1.1874 + for (i = 0; i < llen; i++) /* for each character in the line */
1.1875 + if (strchr(".?!,;:", aline[i]) /* if it's punctuation */
1.1876 + && (strchr(".?!,;:", aline[i+1]))
1.1877 + && aline[i] && aline[i+1]) /* followed by punctuation, it's a query, unless . . . */
1.1878 + if (
1.1879 + (aline[i] == aline[i+1]
1.1880 + && (aline[i] == '.' || aline[i] == '?' || aline[i] == '!'))
1.1881 + || (!warn_dotcomma && aline[i] == '.' && aline[i+1] == ',')
1.1882 + || (isFrench && !strncmp(aline+i, ",...", 4))
1.1883 + || (isFrench && !strncmp(aline+i, "...,", 4))
1.1884 + || (isFrench && !strncmp(aline+i, ";...", 4))
1.1885 + || (isFrench && !strncmp(aline+i, "...;", 4))
1.1886 + || (isFrench && !strncmp(aline+i, ":...", 4))
1.1887 + || (isFrench && !strncmp(aline+i, "...:", 4))
1.1888 + || (isFrench && !strncmp(aline+i, "!...", 4))
1.1889 + || (isFrench && !strncmp(aline+i, "...!", 4))
1.1890 + || (isFrench && !strncmp(aline+i, "?...", 4))
1.1891 + || (isFrench && !strncmp(aline+i, "...?", 4))
1.1892 + ) {
1.1893 + if ((isFrench && !strncmp(aline+i, ",...", 4)) /* could this BE any more awkward? */
1.1894 + || (isFrench && !strncmp(aline+i, "...,", 4))
1.1895 + || (isFrench && !strncmp(aline+i, ";...", 4))
1.1896 + || (isFrench && !strncmp(aline+i, "...;", 4))
1.1897 + || (isFrench && !strncmp(aline+i, ":...", 4))
1.1898 + || (isFrench && !strncmp(aline+i, "...:", 4))
1.1899 + || (isFrench && !strncmp(aline+i, "!...", 4))
1.1900 + || (isFrench && !strncmp(aline+i, "...!", 4))
1.1901 + || (isFrench && !strncmp(aline+i, "?...", 4))
1.1902 + || (isFrench && !strncmp(aline+i, "...?", 4)))
1.1903 + i +=4;
1.1904 + ; /* do nothing for .. !! and ?? which can be legit */
1.1905 + }
1.1906 + else {
1.1907 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1.1908 + if (!pswit[OVERVIEW_SWITCH])
1.1909 + printf(" Line %ld column %d - Double punctuation?\n", linecnt, i+1);
1.1910 + else
1.1911 + cnt_punct++;
1.1912 + }
1.1913 +
1.1914 + /* v.21 breaking out the search for spaced doublequotes */
1.1915 + /* This is not as efficient, but it's more maintainable */
1.1916 + s = aline;
1.1917 + while (strstr(s," \" ")) {
1.1918 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1.1919 + if (!pswit[OVERVIEW_SWITCH])
1.1920 + printf(" Line %ld column %d - Spaced doublequote?\n", linecnt, (int)(strstr(s," \" ")-aline+1));
1.1921 + else
1.1922 + cnt_punct++;
1.1923 + s = strstr(s," \" ") + 2;
1.1924 + }
1.1925 +
1.1926 + /* v.20 also look for spaced singlequotes ' and ` */
1.1927 + s = aline;
1.1928 + while (strstr(s," ' ")) {
1.1929 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1.1930 + if (!pswit[OVERVIEW_SWITCH])
1.1931 + printf(" Line %ld column %d - Spaced singlequote?\n", linecnt, (int)(strstr(s," ' ")-aline+1));
1.1932 + else
1.1933 + cnt_punct++;
1.1934 + s = strstr(s," ' ") + 2;
1.1935 + }
1.1936 +
1.1937 + s = aline;
1.1938 + while (strstr(s," ` ")) {
1.1939 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1.1940 + if (!pswit[OVERVIEW_SWITCH])
1.1941 + printf(" Line %ld column %d - Spaced singlequote?\n", linecnt, (int)(strstr(s," ` ")-aline+1));
1.1942 + else
1.1943 + cnt_punct++;
1.1944 + s = strstr(s," ` ") + 2;
1.1945 + }
1.1946 +
1.1947 + /* v.99 check special case of 'S instead of 's at end of word */
1.1948 + s = aline + 1;
1.1949 + while (*s) {
1.1950 + if (*s == CHAR_SQUOTE && *(s+1) == 'S' && *(s-1)>='a' && *(s-1)<='z') {
1.1951 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1.1952 + if (!pswit[OVERVIEW_SWITCH])
1.1953 + printf(" Line %ld column %d - Capital \"S\"?\n", linecnt, (int)(s-aline+2));
1.1954 + else
1.1955 + cnt_punct++;
1.1956 + }
1.1957 + s++;
1.1958 + }
1.1959 +
1.1960 +
1.1961 + /* v.21 Now check special cases - start and end of line - */
1.1962 + /* for single and double quotes. Start is sometimes [sic] */
1.1963 + /* but better to query it anyway. */
1.1964 + /* While I'm here, check for dash at end of line */
1.1965 + llen = strlen(aline);
1.1966 + if (llen > 1) {
1.1967 + if (aline[llen-1] == CHAR_DQUOTE ||
1.1968 + aline[llen-1] == CHAR_SQUOTE ||
1.1969 + aline[llen-1] == CHAR_OPEN_SQUOTE)
1.1970 + if (aline[llen-2] == CHAR_SPACE) {
1.1971 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1.1972 + if (!pswit[OVERVIEW_SWITCH])
1.1973 + printf(" Line %ld column %d - Spaced quote?\n", linecnt, llen);
1.1974 + else
1.1975 + cnt_punct++;
1.1976 + }
1.1977 +
1.1978 + /* V 0.98 removed aline[0] == CHAR_DQUOTE from the test below, since */
1.1979 + /* Wrongspaced quotes test also catches it for " */
1.1980 + if (aline[0] == CHAR_SQUOTE ||
1.1981 + aline[0] == CHAR_OPEN_SQUOTE)
1.1982 + if (aline[1] == CHAR_SPACE) {
1.1983 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1.1984 + if (!pswit[OVERVIEW_SWITCH])
1.1985 + printf(" Line %ld column 1 - Spaced quote?\n", linecnt);
1.1986 + else
1.1987 + cnt_punct++;
1.1988 + }
1.1989 + /* dash at end of line may well be legit - paranoid mode only */
1.1990 + /* and don't report em-dash at line-end */
1.1991 + if (pswit[PARANOID_SWITCH] && warn_hyphen) {
1.1992 + for (i = llen-1; i > 0 && (unsigned char)aline[i] <= CHAR_SPACE; i--);
1.1993 + if (aline[i] == '-' && aline[i-1] != '-') {
1.1994 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1.1995 + if (!pswit[OVERVIEW_SWITCH])
1.1996 + printf(" Line %ld column %d - Hyphen at end of line?\n", linecnt, i);
1.1997 + }
1.1998 + }
1.1999 + }
1.2000 +
1.2001 + /* v.21 also look for brackets surrounded by alpha */
1.2002 + /* Brackets are often unspaced, but shouldn't be surrounded by alpha. */
1.2003 + /* If so, suspect a scanno like "a]most" */
1.2004 + llen = strlen(aline);
1.2005 + for (i = 1; i < llen-1; i++) { /* for each character in the line except 1st & last*/
1.2006 + if (strchr("{[()]}", aline[i]) /* if it's a bracket */
1.2007 + && gcisalpha(aline[i-1]) && gcisalpha(aline[i+1])) {
1.2008 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1.2009 + if (!pswit[OVERVIEW_SWITCH])
1.2010 + printf(" Line %ld column %d - Unspaced bracket?\n", linecnt, i);
1.2011 + else
1.2012 + cnt_punct++;
1.2013 + }
1.2014 + }
1.2015 + /* The "Cinderella" case, back in again! :-S Give it another shot */
1.2016 + if (warn_endquote) {
1.2017 + llen = strlen(aline);
1.2018 + for (i = 1; i < llen; i++) { /* for each character in the line except 1st */
1.2019 + if (aline[i] == CHAR_DQUOTE)
1.2020 + if (isalpha(aline[i-1])) {
1.2021 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1.2022 + if (!pswit[OVERVIEW_SWITCH])
1.2023 + printf(" Line %ld column %d - endquote missing punctuation?\n", linecnt, i);
1.2024 + else
1.2025 + cnt_punct++;
1.2026 + }
1.2027 + }
1.2028 + }
1.2029 +
1.2030 + llen = strlen(aline);
1.2031 +
1.2032 + /* Check for <HTML TAG> */
1.2033 + /* If there is a < in the line, followed at some point */
1.2034 + /* by a > then we suspect HTML */
1.2035 + if (strstr(aline, "<") && strstr(aline, ">")) {
1.2036 + i = (signed int) (strstr(aline, ">") - strstr(aline, "<") + 1);
1.2037 + if (i > 0) {
1.2038 + strncpy(wrk, strstr(aline, "<"), i);
1.2039 + wrk[i] = 0;
1.2040 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1.2041 + if (!pswit[OVERVIEW_SWITCH])
1.2042 + printf(" Line %ld column %d - HTML Tag? %s \n", linecnt, (int)(strstr(aline, "<") - aline) + 1, wrk);
1.2043 + else
1.2044 + cnt_html++;
1.2045 + }
1.2046 + }
1.2047 +
1.2048 + /* Check for &symbol; HTML */
1.2049 + /* If there is a & in the line, followed at */
1.2050 + /* some point by a ; then we suspect HTML */
1.2051 + if (strstr(aline, "&") && strstr(aline, ";")) {
1.2052 + i = (int)(strstr(aline, ";") - strstr(aline, "&") + 1);
1.2053 + for (s = strstr(aline, "&"); s < strstr(aline, ";"); s++)
1.2054 + if (*s == CHAR_SPACE) i = 0; /* 0.99 don't report "Jones & Son;" */
1.2055 + if (i > 0) {
1.2056 + strncpy(wrk, strstr(aline,"&"), i);
1.2057 + wrk[i] = 0;
1.2058 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
1.2059 + if (!pswit[OVERVIEW_SWITCH])
1.2060 + printf(" Line %ld column %d - HTML symbol? %s \n", linecnt, (int)(strstr(aline, "&") - aline) + 1, wrk);
1.2061 + else
1.2062 + cnt_html++;
1.2063 + }
1.2064 + }
1.2065 +
1.2066 + /* At end of paragraph, check for mismatched quotes. */
1.2067 + /* We don't want to report an error immediately, since it is a */
1.2068 + /* common convention to omit the quotes at end of paragraph if */
1.2069 + /* the next paragraph is a continuation of the same speaker. */
1.2070 + /* Where this is the case, the next para should begin with a */
1.2071 + /* quote, so we store the warning message and only display it */
1.2072 + /* at the top of the next iteration if the new para doesn't */
1.2073 + /* start with a quote. */
1.2074 + /* The -p switch overrides this default, and warns of unclosed */
1.2075 + /* quotes on _every_ paragraph, whether the next begins with a */
1.2076 + /* quote or not. */
1.2077 + /* Version .16 - only report mismatched single quotes if */
1.2078 + /* an open_single_quotes was found. */
1.2079 +
1.2080 + if (isemptyline) { /* end of para - add up the totals */
1.2081 + if (quot % 2)
1.2082 + sprintf(dquote_err, " Line %ld - Mismatched quotes\n", linecnt);
1.2083 + if (pswit[SQUOTE_SWITCH] && open_single_quote && (open_single_quote != close_single_quote) )
1.2084 + sprintf(squote_err," Line %ld - Mismatched singlequotes?\n", linecnt);
1.2085 + if (pswit[SQUOTE_SWITCH] && open_single_quote
1.2086 + && (open_single_quote != close_single_quote)
1.2087 + && (open_single_quote != close_single_quote +1) )
1.2088 + squot = 1; /* flag it to be noted regardless of the first char of the next para */
1.2089 + if (r_brack)
1.2090 + sprintf(rbrack_err, " Line %ld - Mismatched round brackets?\n", linecnt);
1.2091 + if (s_brack)
1.2092 + sprintf(sbrack_err, " Line %ld - Mismatched square brackets?\n", linecnt);
1.2093 + if (c_brack)
1.2094 + sprintf(cbrack_err, " Line %ld - Mismatched curly brackets?\n", linecnt);
1.2095 + if (c_unders % 2)
1.2096 + sprintf(unders_err, " Line %ld - Mismatched underscores?\n", linecnt);
1.2097 + quot = s_brack = c_brack = r_brack = c_unders =
1.2098 + open_single_quote = close_single_quote = 0;
1.2099 + isnewpara = 1; /* let the next iteration know that it's starting a new para */
1.2100 + }
1.2101 +
1.2102 + /* V.21 _ALSO_ at end of paragraph, check for omitted punctuation. */
1.2103 + /* by working back through prevline. DW. */
1.2104 + /* Hmmm. Need to check this only for "normal" paras. */
1.2105 + /* So what is a "normal" para? ouch! */
1.2106 + /* Not normal if one-liner (chapter headings, etc.) */
1.2107 + /* Not normal if doesn't contain at least one locase letter */
1.2108 + /* Not normal if starts with space */
1.2109 +
1.2110 + /* 0.99 tighten up on para end checks. Disallow comma and */
1.2111 + /* semi-colon. Check for legit para end before quotes. */
1.2112 + if (isemptyline) { /* end of para */
1.2113 + for (s = prevline, i = 0; *s && !i; s++)
1.2114 + if (gcisletter(*s))
1.2115 + i = 1; /* use i to indicate the presence of a letter on the line */
1.2116 + /* This next "if" is a problem. */
1.2117 + /* If I say "start_para_line <= linecnt - 1", that includes one-line */
1.2118 + /* "paragraphs" like chapter heads. Lotsa false positives. */
1.2119 + /* If I say "start_para_line < linecnt - 1" it doesn't, but then it */
1.2120 + /* misses genuine one-line paragraphs. */
1.2121 + /* So what do I do? */
1.2122 + if (i
1.2123 + && lastblen > 2
1.2124 + && start_para_line < linecnt - 1
1.2125 + && *prevline > CHAR_SPACE
1.2126 + ) {
1.2127 + for (i = strlen(prevline)-1; (prevline[i] == CHAR_DQUOTE || prevline[i] == CHAR_SQUOTE) && prevline[i] > CHAR_SPACE && i > 0; i--);
1.2128 + for ( ; i > 0; i--) {
1.2129 + if (gcisalpha(prevline[i])) {
1.2130 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", prevline);
1.2131 + if (!pswit[OVERVIEW_SWITCH])
1.2132 + printf(" Line %ld column %d - No punctuation at para end?\n", linecnt-1, strlen(prevline));
1.2133 + else
1.2134 + cnt_punct++;
1.2135 + break;
1.2136 + }
1.2137 + if (strchr("-.:!([{?}])", prevline[i]))
1.2138 + break;
1.2139 + }
1.2140 + }
1.2141 + }
1.2142 + strcpy(prevline, aline);
1.2143 + }
1.2144 + fclose (infile);
1.2145 + if (!pswit[OVERVIEW_SWITCH])
1.2146 + for (i = 0; i < MAX_QWORD; i++)
1.2147 + if (dupcnt[i])
1.2148 + printf("\nNote: Queried word %s was duplicated %d time%s\n", qword[i], dupcnt[i], "s");
1.2149 +}
1.2150 +
1.2151 +
1.2152 +
1.2153 +/* flgets - get one line (at most maxlen chars, CR/LF not stored) from thefile into theline, */
1.2154 +/* checking for exactly one CR/LF line-end per line; lcnt only labels the line-end queries. */
1.2155 +/* Returns a pointer to the line, or NULL once the input is used up. */
1.2156 +
1.2157 +char *flgets(char *theline, int maxlen, FILE *thefile, long lcnt)
1.2158 +{
1.2159 +    char c;
1.2160 +    int len, isCR, cint;
1.2161 +
1.2162 +    *theline = 0;
1.2163 +    len = isCR = 0;
1.2164 +    /* theline must have room for maxlen characters plus the terminating 0 */
1.2165 +    do {
1.2166 +        cint = fgetc(thefile); c = (char)cint; /* read at loop top so no char is consumed and lost when len hits maxlen */
1.2167 +        if (cint == EOF) {
1.2168 +            if (!len) return (NULL);
1.2169 +            break; /* hand back a final line that has no line-end instead of dropping it */
1.2170 +        }
1.2171 +        if (c == 10) { /* either way, it's end of line */
1.2172 +            if (!isCR && pswit[LINE_END_SWITCH]) { /* Error - a LF without a preceding CR */
1.2173 +                if (pswit[ECHO_SWITCH]) printf("\n%s\n", theline);
1.2174 +                if (!pswit[OVERVIEW_SWITCH])
1.2175 +                    printf(" Line %ld - No CR?\n", lcnt);
1.2176 +                else
1.2177 +                    cnt_lineend++;
1.2178 +            }
1.2179 +            break;
1.2180 +        }
1.2181 +        if (c == 13) {
1.2182 +            if (isCR) { /* Error - two successive CRs */
1.2183 +                if (pswit[LINE_END_SWITCH]) {
1.2184 +                    if (pswit[ECHO_SWITCH]) printf("\n%s\n", theline);
1.2185 +                    if (!pswit[OVERVIEW_SWITCH])
1.2186 +                        printf(" Line %ld - Two successive CRs?\n", lcnt);
1.2187 +                    else
1.2188 +                        cnt_lineend++;
1.2189 +                }
1.2190 +            }
1.2191 +            isCR = 1;
1.2192 +        }
1.2193 +        else { /* ordinary character: a pending CR was not followed by LF */
1.2194 +            if (pswit[LINE_END_SWITCH] && isCR) {
1.2195 +                if (pswit[ECHO_SWITCH]) printf("\n%s\n", theline);
1.2196 +                if (!pswit[OVERVIEW_SWITCH])
1.2197 +                    printf(" Line %ld column %d - CR without LF?\n", lcnt, len+1);
1.2198 +                else
1.2199 +                    cnt_lineend++;
1.2200 +            }
1.2201 +            theline[len] = c;
1.2202 +            len++;
1.2203 +            theline[len] = 0; /* keep the buffer terminated; it is echoed in the queries above */
1.2204 +            isCR = 0;
1.2205 +        }
1.2206 +        /* stop at maxlen; the rest of an over-long line is returned by the next call */
1.2207 +    } while(len < maxlen);
1.2208 +    if (pswit[MARKUP_SWITCH])
1.2209 +        postprocess_for_HTML(theline);
1.2210 +    if (pswit[DP_SWITCH])
1.2211 +        postprocess_for_DP(theline);
1.2212 +    return(theline);
1.2213 +}
1.2214 +
1.2215 +
1.2216 +
1.2217 +
1.2218 +/* mixdigit - takes a "word" as a parameter, and checks whether it */
1.2219 +/* contains a mixture of alpha and digits. Generally, this is an */
1.2220 +/* error, but not for cases like 4th, 21sts, 1stly, L5 or 12s. 3d. */
1.2221 +/* Returns 0 if no error found, 1 if the word should be queried. */
1.2222 +
1.2223 +int mixdigit(char *checkword) /* check for digits like 1 or 0 in words */
1.2224 +{
1.2225 +    int wehaveadigit, wehavealetter, firstdigits, query, wl;
1.2226 +    char *s;
1.2227 +
1.2228 +    /* First pass: note whether the word holds any letter and any digit. */
1.2229 +    wehaveadigit = wehavealetter = query = 0;
1.2230 +    for (s = checkword; *s; s++) {
1.2231 +        if (gcisalpha(*s))
1.2232 +            wehavealetter = 1;
1.2233 +        else if (gcisdigit(*s))
1.2234 +            wehaveadigit = 1;
1.2235 +    }
1.2236 +    if (wehaveadigit && wehavealetter) { /* Now exclude common legit cases, like "21st" and "12l. 3s. 11d." */
1.2237 +        query = 1;
1.2238 +        wl = (int)strlen(checkword);
1.2239 +        for (firstdigits = 0; gcisdigit(checkword[firstdigits]); firstdigits++)
1.2240 +            ; /* firstdigits = length of the leading run of digits */
1.2241 +        /* digits, ending in st, rd, nd, th of either case */
1.2242 +        /* 0.99 donovan points out an error below. Turns out */
1.2243 +        /* I was using matchword like strcmp when the */
1.2244 +        /* return values are different! Duh. */
1.2245 +        if (firstdigits + 2 == wl &&
1.2246 +            (matchword(checkword + wl - 2, "st")
1.2247 +            || matchword(checkword + wl - 2, "rd")
1.2248 +            || matchword(checkword + wl - 2, "nd")
1.2249 +            || matchword(checkword + wl - 2, "th"))
1.2250 +            )
1.2251 +            query = 0;
1.2252 +        if (firstdigits + 3 == wl &&
1.2253 +            (matchword(checkword + wl - 3, "sts")
1.2254 +            || matchword(checkword + wl - 3, "rds")
1.2255 +            || matchword(checkword + wl - 3, "nds")
1.2256 +            || matchword(checkword + wl - 3, "ths"))
1.2257 +            )
1.2258 +            query = 0;
1.2259 +        if (firstdigits + 4 == wl && /* 4-letter suffix needs exactly 4 chars after the digits; this also keeps wl - 4 inside the word */
1.2260 +            (matchword(checkword + wl - 4, "stly")
1.2261 +            || matchword(checkword + wl - 4, "rdly")
1.2262 +            || matchword(checkword + wl - 4, "ndly")
1.2263 +            || matchword(checkword + wl - 4, "thly"))
1.2264 +            )
1.2265 +            query = 0;
1.2266 +
1.2267 +        /* digits, ending in l, L, s or d */
1.2268 +        if (firstdigits + 1 == wl &&
1.2269 +            (checkword[wl-1] == 'l'
1.2270 +            || checkword[wl-1] == 'L'
1.2271 +            || checkword[wl-1] == 's'
1.2272 +            || checkword[wl-1] == 'd'))
1.2273 +            query = 0;
1.2274 +        /* L at the start of a number, representing British pounds, like L500 */
1.2275 +        /* This is cute. We know the current word is mixeddigit. If the first */
1.2276 +        /* letter is L, there must be at least one digit following. If both */
1.2277 +        /* digits and letters follow, we have a genuine error, else we have a */
1.2278 +        /* capital L followed by digits, and we accept that as a non-error. */
1.2279 +        if (checkword[0] == 'L')
1.2280 +            if (!mixdigit(checkword+1))
1.2281 +                query = 0;
1.2282 +    }
1.2283 +    return (query);
1.2284 +}
1.2285 +
1.2286 +
1.2287 +
1.2288 +
1.2289 +/* getaword - extracts the first/next "word" from the line, and puts */
1.2290 +/* it into "thisword". A word is defined as one English word unit */
1.2291 +/* -- or at least that's what I'm trying for. */
1.2292 +/* Returns a pointer to the position in the line where we will start */
1.2293 +/* looking for the next word. */
1.2294 +
1.2295 +char *getaword(char *fromline, char *thisword)
1.2296 +{
1.2297 + int i, wordlen;
1.2298 + char *s;
1.2299 +
1.2300 + wordlen = 0;
1.2301 + for ( ; !gcisdigit(*fromline) && !gcisalpha(*fromline) && *fromline ; fromline++ );
1.2302 +
1.2303 + /* V .20 */
1.2304 + /* add a look-ahead to handle exceptions for numbers like 1,000 and 1.35. */
1.2305 + /* Especially yucky is the case of L1,000 */
1.2306 + /* I hate this, and I see other ways, but I don't see that any is _better_.*/
1.2307 + /* This section looks for a pattern of characters including a digit */
1.2308 + /* followed by a comma or period followed by one or more digits. */
1.2309 + /* If found, it returns this whole pattern as a word; otherwise we discard */
1.2310 + /* the results and resume our normal programming. */
1.2311 + s = fromline;
1.2312 + for ( ; (gcisdigit(*s) || gcisalpha(*s) || *s == ',' || *s == '.') && wordlen < MAXWORDLEN ; s++ ) {
1.2313 + thisword[wordlen] = *s;
1.2314 + wordlen++;
1.2315 + }
1.2316 + thisword[wordlen] = 0;
1.2317 + for (i = 1; i < wordlen -1; i++) {
1.2318 + if (thisword[i] == '.' || thisword[i] == ',') {
1.2319 + if (gcisdigit(thisword[i-1]) && gcisdigit(thisword[i-1])) { /* we have one of the damned things */
1.2320 + fromline = s;
1.2321 + return(fromline);
1.2322 + }
1.2323 + }
1.2324 + }
1.2325 +
1.2326 + /* we didn't find a punctuated number - do the regular getword thing */
1.2327 + wordlen = 0;
1.2328 + for ( ; (gcisdigit(*fromline) || gcisalpha(*fromline) || *fromline == '\'') && wordlen < MAXWORDLEN ; fromline++ ) {
1.2329 + thisword[wordlen] = *fromline;
1.2330 + wordlen++;
1.2331 + }
1.2332 + thisword[wordlen] = 0;
1.2333 + return(fromline);
1.2334 +}
1.2335 +
1.2336 +
1.2337 +
1.2338 +
1.2339 +
/* matchword - just a case-insensitive string matcher.               */
/* Returns 1 if the two strings have the same length and agree       */
/* character for character once case is folded, 0 otherwise.         */
/* NB: this is the opposite sense to strcmp(), which returns 0 for   */
/* equal strings.                                                    */

int matchword(char *checkfor, char *thisword)
{
    size_t len, i;

    len = strlen(checkfor);
    if (len != strlen(thisword))
        return (0);

    for (i = 0; i < len; i++) {
        /* toupper() is only defined for unsigned char values (and    */
        /* EOF); 8-bit text held in a signed char would otherwise be  */
        /* passed in as a negative number.                            */
        if (toupper((unsigned char)checkfor[i]) != toupper((unsigned char)thisword[i]))
            return (0);     /* first difference settles it */
    }
    return (1);
}
1.2356 +
1.2357 +
1.2358 +
1.2359 +
1.2360 +
/* lowerit - lowercase the line in place. Only the plain letters     */
/* 'A' to 'Z' are changed, by adding 32 (the distance between the    */
/* upper- and lower-case letters in ASCII); every other character,   */
/* hi-bit ones included, is left exactly as it was. strlwr is not    */
/* available everywhere, and tolower is not trusted with hi-bit      */
/* characters, hence the home-made version.                          */

void lowerit(char *theline)
{
    char *p = theline;

    while (*p != '\0') {
        if ('A' <= *p && *p <= 'Z')
            *p = (char)(*p + 32);
        p++;
    }
}
1.2372 +
1.2373 +
/* isroman - does this word LOOK like a Roman numeral?               */
/* This is a shape check, not a validation: MXXXXXXXXXX passes.      */
/* The word is consumed left to right against a fixed sequence of    */
/* tokens, every one of them optional:                               */
/*     any number of m, d, cm, cd, any number of c,                  */
/*     xl, xc, l, any number of x, ix, iv, v, any number of i.       */
/* Only lower-case letters are recognised.                           */
/* Returns 1 if the whole word was consumed, 0 if anything is left   */
/* over or if the word is NULL or empty.                             */

int isroman(char *t)
{
    static const struct {
        const char *glyph;      /* one- or two-letter token           */
        int repeatable;         /* nonzero: take as many as there are */
    } grammar[] = {
        { "m",  1 }, { "d",  0 }, { "cm", 0 }, { "cd", 0 }, { "c",  1 },
        { "xl", 0 }, { "xc", 0 }, { "l",  0 }, { "x",  1 },
        { "ix", 0 }, { "iv", 0 }, { "v",  0 }, { "i",  1 }
    };
    size_t step, width;

    if (t == NULL || *t == '\0')
        return (0);

    for (step = 0; step < sizeof grammar / sizeof grammar[0]; step++) {
        width = strlen(grammar[step].glyph);
        do {
            /* strncmp stops at the end of the word, so a two-letter  */
            /* token is never compared past the terminating zero.     */
            if (strncmp(t, grammar[step].glyph, width) != 0)
                break;
            t += width;
        } while (grammar[step].repeatable);
    }

    return (*t == '\0');
}
1.2411 +
1.2412 +
1.2413 +
1.2414 +
/* gcisalpha - a lenient isalpha() for 8-bit texts.                   */
/* With the standard isalpha(), accented characters break words, so   */
/* that a word like "tete" written with accents is seen as "t" and    */
/* "t" with something odd in between, and errors get over-reported.   */
/* This version also accepts the accented letters of the CP1252       */
/* (Windows) and ISO-8859-1 character sets, which are the most common */
/* PG 8-bit types.                                                    */
/* Returns 1 for a letter, 0 for anything else.                       */

int gcisalpha(unsigned char c)
{
    if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
        return (1);

    switch (c) {
    /* the few letters CP1252 keeps below 192 */
    case 140: case 142: case 156: case 158: case 159:
        return (1);

    /* codes from 192 up that are deliberately NOT counted as letters */
    /* (215 and 247 are the multiply and divide signs in ISO-8859-1)  */
    case 208: case 215: case 222: case 240: case 247: case 254:
        return (0);

    /* everything else: a letter only if it is 192 or above */
    default:
        return (c >= 192);
    }
}
1.2431 +
/* gcisdigit - an isdigit() that doesn't get confused in 8-bit texts: */
/* only the ten characters '0' to '9' count. Returns 1 or 0.          */
int gcisdigit(unsigned char c)
{
    return ('0' <= c && c <= '9');
}
1.2438 +
/* gcisletter - a letter test that doesn't get confused in 8-bit      */
/* texts. 'A'-'Z', 'a'-'z' and EVERY code from 192 upwards count as   */
/* letters (unlike gcisalpha, no exceptions are carved out).          */
/* Yeah, we're ISO-8859-1-specific. So sue me. Returns 1 or 0.        */
int gcisletter(unsigned char c)
{
    if (c >= 192)
        return (1);
    if ('A' <= c && c <= 'Z')
        return (1);
    if ('a' <= c && c <= 'z')
        return (1);
    return (0);
}
1.2446 +
1.2447 +
1.2448 +
1.2449 +
/* gcstrchr - strchr(), except that looking for the zero character    */
/* finds nothing. Plain strchr() would return a pointer to the        */
/* string's terminator in that case; this wrapper returns NULL.       */

char *gcstrchr(char *s, char c)
{
    return ((c != 0) ? strchr(s, c) : NULL);
}
1.2457 +
1.2458 +/* postprocess_for_DP is derived from postprocess_for_HTML */
1.2459 +/* It is invoked with the -d switch from flgets(). */
1.2460 +/* It simply "removes" from the line a hard-coded set of common */
1.2461 +/* DP-specific tags, so that the line passed to the main routine has*/
1.2462 +/* been pre-cleaned of DP markup. */
1.2463 +
1.2464 +void postprocess_for_DP(char *theline)
1.2465 +{
1.2466 +
1.2467 + char *s, *t;
1.2468 + int i;
1.2469 +
1.2470 + if (!*theline)
1.2471 + return;
1.2472 +
1.2473 + for (i = 0; *DPmarkup[i]; i++) {
1.2474 + s = strstr(theline, DPmarkup[i]);
1.2475 + while (s) {
1.2476 + t = s + strlen(DPmarkup[i]);
1.2477 + while (*t) {
1.2478 + *s = *t;
1.2479 + t++; s++;
1.2480 + }
1.2481 + *s = 0;
1.2482 + s = strstr(theline, DPmarkup[i]);
1.2483 + }
1.2484 + }
1.2485 +
1.2486 +}
1.2487 +
1.2488 +
/* postprocess_for_HTML is, at the moment (0.97), a very nasty        */
/* short-term fix for Charlz. Nasty, nasty, nasty.                    */
/* It is invoked with the -m switch from flgets().                    */
/* It strips markup by calling losemarkup() until that reports        */
/* nothing more to do (tried only when the line holds both a '<' and  */
/* a '>'), then replaces entities by calling loseentities() in the    */
/* same way, so that the line passed to the main routine has been     */
/* pre-cleaned of HTML. This is _so_ not the right way to deal with   */
/* HTML, but what Charlz needs now is not HTML handling proper: just  */
/* ignoring <i> tags and some others.                                 */
/* To be revisited in future releases!                                */

void postprocess_for_HTML(char *theline)
{
    int has_open  = (strchr(theline, '<') != NULL);
    int has_close = (strchr(theline, '>') != NULL);

    if (has_open && has_close) {
        while (losemarkup(theline) != NULL)
            continue;       /* one tag removed per call */
    }

    while (loseentities(theline) != NULL)
        continue;           /* one entity replaced per call */
}
1.2509 +
1.2510 +char *losemarkup(char *theline)
1.2511 +{
1.2512 + char *s, *t;
1.2513 + int i;
1.2514 +
1.2515 + if (!*theline)
1.2516 + return(NULL);
1.2517 +
1.2518 + s = strstr(theline, "<");
1.2519 + t = strstr(theline, ">");
1.2520 + if (!s || !t) return(NULL);
1.2521 + for (i = 0; *markup[i]; i++)
1.2522 + if (!tagcomp(s+1, markup[i])) {
1.2523 + if (!*(t+1)) {
1.2524 + *s = 0;
1.2525 + return(s);
1.2526 + }
1.2527 + else
1.2528 + if (t > s) {
1.2529 + strcpy(s, t+1);
1.2530 + return(s);
1.2531 + }
1.2532 + }
1.2533 + /* it's an unrecognized <xxx> */
1.2534 + return(NULL);
1.2535 +}
1.2536 +
1.2537 +char *loseentities(char *theline)
1.2538 +{
1.2539 + int i;
1.2540 + char *s, *t;
1.2541 +
1.2542 + if (!*theline)
1.2543 + return(NULL);
1.2544 +
1.2545 + for (i = 0; *entities[i].htmlent; i++) {
1.2546 + s = strstr(theline, entities[i].htmlent);
1.2547 + if (s) {
1.2548 + t = malloc((size_t)strlen(s));
1.2549 + if (!t) return(NULL);
1.2550 + strcpy(t, s + strlen(entities[i].htmlent));
1.2551 + strcpy(s, entities[i].textent);
1.2552 + strcat(s, t);
1.2553 + free(t);
1.2554 + return(theline);
1.2555 + }
1.2556 + }
1.2557 +
1.2558 + /* V0.97 Duh. Forgot to check the htmlnum member */
1.2559 + for (i = 0; *entities[i].htmlnum; i++) {
1.2560 + s = strstr(theline, entities[i].htmlnum);
1.2561 + if (s) {
1.2562 + t = malloc((size_t)strlen(s));
1.2563 + if (!t) return(NULL);
1.2564 + strcpy(t, s + strlen(entities[i].htmlnum));
1.2565 + strcpy(s, entities[i].textent);
1.2566 + strcat(s, t);
1.2567 + free(t);
1.2568 + return(theline);
1.2569 + }
1.2570 + }
1.2571 + return(NULL);
1.2572 +}
1.2573 +
1.2574 +
/* tagcomp - does the text following a '<' start with this tag name?  */
/* strin is the text just after the '<'; one leading '/' is ignored   */
/* so that closing tags compare like opening ones. Case is ignored.   */
/* Returns 0 if no difference is found before either string runs      */
/* out, 1 at the first differing character (same sense as strcmp:     */
/* zero means "matches").                                             */
/* Only as many characters as the shorter of the two are compared,    */
/* and nothing checks what follows the tag name, so this is a prefix  */
/* test. The length of an <i> tag is limited, but a <table> could go  */
/* on for miles, so doing better means parsing the tags . . . ugh.    */
/* It isn't what Charlz needs now, so mark it as 'pending'.           */

int tagcomp(char *strin, char *basetag)
{
    const char *tag = basetag;
    const char *in = strin;

    if (*in == '/')
        in++;               /* ignore a slash */

    while (*tag && *in) {
        /* tolower() is only defined for unsigned char values (and    */
        /* EOF); the line may well contain 8-bit characters.          */
        if (tolower((unsigned char)*tag) != tolower((unsigned char)*in))
            return (1);
        tag++;
        in++;
    }

    /* OK, we have < followed by a valid tag start */
    return (0);
}
1.2595 +
/* proghelp - explain program usage. Writes the copyright notice, the */
/* list of command-line switches and a short description of what      */
/* gutcheck does to stderr. Takes no arguments and returns nothing.   */

void proghelp(void)
{
    static const char *const helptext[] = {
        "V. 0.991. Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",
        "Gutcheck comes with ABSOLUTELY NO WARRANTY. For details, read the file COPYING.\n",
        "This is Free Software; you may redistribute it under certain conditions (GPL);\n",
        "read the file COPYING for details.\n\n",
        /* -m and -v are described below, so they belong in this list too */
        "Usage is: gutcheck [-setpxloyhudmv] filename\n",
        " where -s checks single quotes, -e suppresses echoing lines, -t checks typos\n",
        " -x (paranoid) switches OFF -t and extra checks, -l turns OFF line-end checks\n",
        " -o just displays overview without detail, -h echoes header fields\n",
        " -v (verbose) unsuppresses duplicate reporting, -m suppresses markup\n",
        " -d ignores DP-specific markup,\n",
        " -u uses a file gutcheck.typ to query user-defined possible typos\n",
        "Sample usage: gutcheck warpeace.txt \n",
        "\n",
        "Gutcheck looks for errors in Project Gutenberg(TM) etexts.\n",
        "Gutcheck queries anything it thinks shouldn't be in a PG text; non-ASCII\n",
        "characters like accented letters, lines longer than 75 or shorter than 55,\n",
        "unbalanced quotes or brackets, a variety of badly formatted punctuation, \n",
        "HTML tags, some likely typos. It is NOT a substitute for human judgement.\n",
        "\n"
    };
    size_t i;

    for (i = 0; i < sizeof helptext / sizeof helptext[0]; i++)
        fputs(helptext[i], stderr);
}
1.2618 +
1.2619 +
1.2620 +
1.2621 +/*********************************************************************
1.2622 + Revision History:
1.2623 +
1.2624 + 04/22/01 Cleaned up some stuff and released .10
1.2625 +
1.2626 + ---------------
1.2627 +
1.2628 + 05/09/01 Added the typo list, added two extra cases of he/be error,
1.2629 + added -p switch, OPEN_SINGLE QUOTE char as .11
1.2630 +
1.2631 + ---------------
1.2632 +
1.2633 + 05/20/01 Increased the typo list,
1.2634 + added paranoid mode,
1.2635 + ANSIfied the code and added some casts
1.2636 + so the compiler wouldn't keep asking if I knew what I was doing,
1.2637 + fixed bug in l.s.d. condition (thanks, Dave!),
1.2638 + standardized spacing when echoing,
1.2639 + added letter-combo checking code to typo section,
1.2640 + added more h/b words to typo array.
1.2641 + Not too sure about putting letter combos outside of the TYPO conditions -
1.2642 + someone is sure to have a book about the tbaka tribe, or something. Anyway, let's see.
1.2643 + Released as .12
1.2644 +
1.2645 + ---------------
1.2646 +
1.2647 + 06/01/01 Removed duplicate reporting of Tildes, asterisks, etc.
1.2648 + 06/10/01 Added flgets routine to help with platform-independent
1.2649 + detection of invalid line-ends. All PG text files should
1.2650 + have CR/LF (13/10) at end of line, regardless of system.
1.2651 + Gutcheck now validates this by default. (Thanks, Charles!)
1.2652 + Released as .13
1.2653 +
1.2654 + ---------------
1.2655 +
1.2656 + 06/11/01 Added parenthesis match checking. (c_brack, cbrack_err etc.)
1.2657 + Released as .14
1.2658 +
1.2659 + ---------------
1.2660 +
1.2661 + 06/23/01 Fixed: 'No',he said. not being flagged.
1.2662 +
1.2663 + Improved: better single-quotes checking:
1.2664 +
1.2665 + Ignore singlequotes surrounded by alpha, like didn't. (was OK)
1.2666 +
1.2667 + If a singlequote is at the END of a word AND the word ends in "s":
1.2668 + The dogs' tails wagged.
1.2669 + it's probably an apostrophe, but less commonly may be a closequote:
1.2670 + "These 'pack dogs' of yours look more like wolves."
1.2671 +
1.2672 + If it's got punctuation before it and is followed by a space
1.2673 + or punctuation:
1.2674 + . . . was a problem,' he said
1.2675 + . . . was a problem,'"
1.2676 + it is probably (certainly?) a closequote.
1.2677 +
1.2678 + If it's at start of paragraph, it's probably an openquote.
1.2679 + (but watch dialect)
1.2680 +
1.2681 + Words with ' at beginning and end are probably quoted:
1.2682 + "You have the word 'chivalry' frequently on your lips."
1.2683 + (Not specifically implemented)
1.2684 + V.18 I'm glad I didn't implement this, 'cos it jest ain't so
1.2685 + where the convention is to punctuate outside the quotes.
1.2686 + 'Come', he said, 'and join the party'.
1.2687 +
1.2688 + If it is followed by an alpha, and especially a capital:
1.2689 + 'Hello,' called he.
1.2690 + it is either an openquote or dialect.
1.2691 +
1.2692 + Dialect breaks ALL the rules:
1.2693 + A man's a man for a' that.
1.2694 + "Aye, but 'tis all in the pas' now."
1.2695 + "'Tis often the way," he said.
1.2696 + 'Ave a drink on me.
1.2697 +
1.2698 + This version looks to be an improvement, and produces
1.2699 + fewer false positives, but is still not perfect. The
1.2700 + 'pack dogs' case still fools it, and dialect is still
1.2701 + a problem. Oh, well, it's an improvement, and I have
1.2702 + a weighted structure in place for refining guesses at
1.2703 + closequotes. Maybe next time, I'll add a bit of logic
1.2704 + where if there is an open quote and one that was guessed
1.2705 + to be a possessive apostrophe after s, I'll re-guess it
1.2706 + to be a closequote. Let's see how this one flies, first.
1.2707 +
1.2708 + (Afterview: it's still crap. Needs much work, and a deeper insight.)
1.2709 +
1.2710 + Released as .15
1.2711 +
1.2712 + TODO: More he/be checks. Can't be perfect - counterexamples:
1.2713 + I gave my son good advice: be married regardless of the world's opinion.
1.2714 + I gave my son good advice: he married regardless of the world's opinion.
1.2715 +
1.2716 + If by "primitive" be meant "crude", we can understand the sentence.
1.2717 + If by "primitive" he meant "crude", we can understand the sentence.
1.2718 +
1.2719 + No matter what be said, I must go on.
1.2720 + No matter what he said, I must go on.
1.2721 +
1.2722 + No value, however great, can be set upon them.
1.2723 + No value, however great, can he set upon them.
1.2724 +
1.2725 + Real-Life one from a DP International Weekly Miscellany:
1.2726 + He wandered through the forest without fear, sleeping
1.2727 + much, for in sleep be had companionship--the Great
1.2728 + Spirit teaching him what he should know in dreams.
1.2729 + That one found by jeebies, and it turned out to be "he".
1.2730 +
1.2731 +
1.2732 + ---------------
1.2733 +
1.2734 + 07/01/01 Added -O option.
1.2735 + Improved singlequotes by reporting mismatched single quotes
1.2736 + only if an open_single_quotes was found.
1.2737 +
1.2738 + Released as .16
1.2739 +
1.2740 + ---------------
1.2741 +
1.2742 + 08/27/01 Added -Y switch for Robert Rowe to allow his app to
1.2743 + catch the error output.
1.2744 +
1.2745 + Released as .17
1.2746 +
1.2747 + ---------------
1.2748 +
1.2749 + 09/08/01 Added checking Capitals at start of paragraph, but not
1.2750 + checking them at start of sentence.
1.2751 +
1.2752 + TODO: Parse sentences out so can check reliably for start of
1.2753 + sentence. Need a whole different approach for that.
1.2754 + (Can't just rely on periods, since they are also
1.2755 + used for abbreviations, etc.)
1.2756 +
1.2757 + Added checking for all vowels or all consonants in a word.
1.2758 +
1.2759 + While I was in, I added "ii" checking and "tl" at start of word.
1.2760 +
1.2761 + Added echoing of first line of paragraph when reporting
1.2762 + mismatched quotes or brackets (thanks to David Widger for the
1.2763 + suggestion)
1.2764 +
1.2765 + Not querying L at start of a number (used for British pounds).
1.2766 +
1.2767 + The spelling changes are sort of half-done but released anyway
1.2768 + Skipped .18 because I had given out a couple of test versions
1.2769 + with that number.
1.2770 +
1.2771 + 09/25/01 Released as .19
1.2772 +
1.2773 + ---------------
1.2774 +
1.2775 + TODO:
1.2776 + Use the logic from my new version of safewrap to stop querying
1.2777 + short lines like poems and TOCs.
1.2778 + Ignore non-standard ellipses like . . . or ...
1.2779 +
1.2780 +
1.2781 + ---------------
1.2782 + 10/01/01 Made any line over 80 a VERY long line (was 85).
1.2783 + Recognized openquotes on indented paragraphs as continuations
1.2784 + of the same speech.
1.2785 + Added "cf" to the okword list (how did I forget _that_?) and a few others.
1.2786 + Moved abbrev to okword and made it more general.
1.2787 + Removed requirement that PG_space_emdash be greater than
1.2788 + ten before turning off warnings about spaced dashes.
1.2789 + Added period to list of characters that might constitute a separator line.
1.2790 + Now checking for double punctuation (Thanks, David!)
1.2791 + Now if two spaced em-dashes on a line, reports both. (DW)
1.2792 + Bug: Wasn't catching spaced punctuation at line-end since I
1.2793 + added flgets in version .13 - fixed.
1.2794 + Bug: Wasn't catching spaced singlequotes - fixed
1.2795 + Now reads punctuated numbers like 1,000 as a single word.
1.2796 + (Used to give "standalone 1" type queries)
1.2797 + Changed paranoid mode - not including s and p options. -ex is now quite usable.
1.2798 + Bug: was calling `"For it is perfectly impossible," Unspaced Quotes - fixed
1.2799 + Bug: Sometimes gave _next_ line number for queried word at end of line - fixed
1.2800 +
1.2801 + 10/22/01 Released as .20
1.2802 +
1.2803 + ---------------
1.2804 +
1.2805 + Added count of lines with spaces at end. (cnt_spacend) (Thanks, Brett!)
1.2806 + Reduced the number of hi-bit letters needed to stop reporting them
1.2807 + from 1/20 to 1/100 or 200 in total.
1.2808 + Added PG footer check.
1.2809 + Added the -h switch.
1.2810 + Fixed platform-specific CHAR_EOL checking for isemptyline - changed to 13 and 10
1.2811 + Not reporting ".," when there are many of them, such as a book with many references to "Vol 1., p. 23"
1.2812 + Added unspaced brackets check when surrounded by alpha.
1.2813 + Removed all typo reporting unless the typo switch is on.
1.2814 + Added gcisalpha to ease over-reporting of 8-bit queries.
1.2815 + ECHO_SWITCH is now ON by default!
1.2816 + PARANOID_SWITCH is now ON by default!
1.2817 + Checking for ">From" placed there by e-mail MTA (Thanks Andrew & Greg)
1.2818 + Checking for standalone lowercase "l"
1.2819 + Checking for standalone lowercase "s"
1.2820 + Considering "is be" and "be is" "be was" "was be" as he/be errors
1.2821 + Looking at punct at end of para
1.2822 +
1.2823 + 01/20/02 Released as .21
1.2824 +
1.2825 + ---------------
1.2826 +
1.2827 + Added VERBOSE_SWITCH to make it list everything. (George Davis)
1.2828 +
1.2829 + ---------------
1.2830 +
1.2831 + 02/17/02 Added cint in flgets to try fix an EOF failure on a compiler I don't have.
1.2832 + after which
1.2833 + This line caused a coredump on Solaris - fixed.
1.2834 + Da sagte die Figur: " Das ist alles gar schoen, und man mag die Puppe
1.2835 + 03/09/02 Changed header recognition for another header change
1.2836 + Called it .24
1.2837 + 03/29/02 Added qword[][] so I can suppress massive overreporting
1.2838 + of queried "words" like "FN", "Wm.", "th'", people's
1.2839 + initials, chemical formulae and suchlike in some texts.
1.2840 + Called it .25
1.2841 + 04/07/02 The qword summary reports at end shouldn't show in OVERVIEW mode. Fixed.
1.2842 + Added linecounts in overview mode.
1.2843 + Wow! gutcheck gutcheck.exe doesn't report a binary! :-) Need to tighten up. Done.
1.2844 + "m" is a not uncommon scanno for "in", but also appears in "a.m." - Can I get round that?
1.2845 + 07/07/02 Added GPL.
1.2846 + Added checking for broken em-dash at line-end (enddash)
1.2847 + Released as 0.95
1.2848 + 08/17/02 Fixed a bug that treated some hi-bit characters as spaces. Thanks, Carlo.
1.2849 + Released as 0.96
1.2850 + 10/10/02 Suppressing some annoying multiple reports by default:
1.2851 + Standalone Ones, Asterisks, Square Brackets.
1.2852 + Digit 1 occurs often in many scientific texts.
1.2853 + Asterisk occurs often in multi-footnoted texts.
1.2854 + Mismatch Square Brackets occurs often in multi-para footnotes.
1.2855 + Added -m switch for Charlz. Horrible. Nasty. Kludgy. Evil.
1.2856 + . . . but it does more or less work for the main cases.
1.2857 + Removed uppercase within a word as a separate category so
1.2858 + that names like VanAllen get reported only once, like other
1.2859 + suspected typos.
1.2860 + 11/24/02 Fixed - -m switch wasn't looking at htmlnum in
1.2861 + loseentities (Thanks, Brett!)
1.2862 + Fixed bug which occasionally gave false warning of
1.2863 + paragraph starting with lowercase.
1.2864 + Added underscore as character not to query around doublequotes.
1.2865 + Split the "Non-ASCII" message into "Non-ASCII" vs. "Non-ISO-8859"
1.2866 + . . . this is to help detect things like CP1252 characters.
1.2867 + Released as 0.97
1.2868 +
1.2869 + 12/01/02 Hacked a simplified version of the "Wrongspaced quotes" out of gutspell,
1.2870 + for doublequotes only. Replaces "Spaced quote", since it also covers that
1.2871 + case.
1.2872 + Added "warn_hyphen" to ease over-reporting of hyphens.
1.2873 +
1.2874 + 12/20/02 Added "extra period" checks.
1.2875 + Added single character line check
1.2876 + Added I" check - is usually an exclam
1.2877 + Released as 0.98
1.2878 +
1.2879 + 1/5/03 Eeek! Left in a lowerit(argv[0]) at the start before procfile()
1.2880 + from when I was looking at ways to identify markup. Refuses to
1.2881 + open files for *nix users with upcase in the filenames. Removed.
1.2882 + Fixed quickly and released as 0.981
1.2883 +
1.2884 + 1/8/03 Added "arid" to the list of typos, slightly against my better
1.2885 + judgement, but the DP gang are all excited about it. :-)
1.2886 + Added a check for comma followed by capital letter, where
1.2887 + a period has OCRed into a comma. (DW). Not sure about this
1.2888 + either; we'll see.
1.2889 + Compiling for Win32 to allow longfilenames.
1.2890 +
1.2891 + 6/1/04 A messy test release for DW to include the "gutcheck.typ"
1.2892 + process. And the gutcheck.jee trials. Removed "arid" --
1.2893 + it can go in gutcheck.typ
1.2894 +
1.2895 + Added checks for carets ^ and slants / but disabling slant
1.2896 + queries if more than 20 of them, because some people use them
1.2897 + for /italics/. Slants are commonly mistaken italic "I"s.
1.2898 +
1.2899 + Later: removed gutcheck.jee -- wrote jeebies instead.
1.2900 +
1.2901 +Random TODO:
1.2902 + Check brackets more closely, like quotes, so that it becomes
1.2903 + easy to find the error in long paragraphs full of brackets.
1.2904 +
1.2905 +
1.2906 + 11/4/04 Assorted cleanup. Fixed case where text started with an
1.2907 + unbalanced paragraph.
1.2908 +
1.2909 + 1/2/05 Has it really been that long? Added "nocomma", "noperiod" check.
1.2910 + Bits and pieces: improved isroman(). Added isletter().
1.2911 + Other stuff I never noted before this.
1.2912 +
1.2913 + 7/3/05 Stuck in a quick start on DP-markup ignoring
1.2914 + at BillFlis's suggestion.
1.2915 +
1.2916 + 1/23/06 Took out nocomma etc if typos are off. Why did I ever leave that in?
1.2917 + Don't count footer for dotcomma etc.
1.2918 +
1.2919 +
1.2920 +1 I
1.2921 +ail all
1.2922 +arc are
1.2923 +arid and
1.2924 +bad had
1.2925 +ball hall
1.2926 +band hand
1.2927 +bar her
1.2928 +bat but
1.2929 +be he
1.2930 +bead head
1.2931 +beads heads
1.2932 +bear hear
1.2933 +bit hit
1.2934 +bo be
1.2935 +boon been
1.2936 +borne home
1.2937 +bow how
1.2938 +bumbled humbled
1.2939 +car ear
1.2940 +carnage carriage
1.2941 +carne came
1.2942 +cast east
1.2943 +cat cut
1.2944 +cat eat
1.2945 +cheek check
1.2946 +clay day
1.2947 +coining coming
1.2948 +comer corner
1.2949 +die she
1.2950 +docs does
1.2951 +ease case
1.2952 +fail fall
1.2953 +fee he
1.2954 +haying having
1.2955 +ho he
1.2956 +ho who
1.2957 +hut but
1.2958 +is as
1.2959 +lie he
1.2960 +lime time
1.2961 +loth 10th
1.2962 +m in
1.2963 +modem modern
1.2964 +Ms his
1.2965 +ray away
1.2966 +ray my
1.2967 +ringer finger
1.2968 +ringers fingers
1.2969 +rioted noted
1.2970 +tho the
1.2971 +tie he
1.2972 +tie the
1.2973 +tier her
1.2974 +tight right
1.2975 +tile the
1.2976 +tiling thing
1.2977 +tip up
1.2978 +tram train
1.2979 +tune time
1.2980 +u "
1.2981 +wen well
1.2982 +yon you
1.2983 +
1.2984 +*********************************************************************/
1.2985 +